perlmutter - chunhualiao/public-docs GitHub Wiki
How do I submit a batch job script?
cat submit.1node.4gpu.job.sh
#!/bin/bash
#
# Batch script for NERSC Perlmutter: run a 4-GPU test on one GPU node.
#
# Submit with:
#   sbatch submit.1node.4gpu.job.sh
# Do NOT execute it directly (./submit.1node.4gpu.job.sh); outside Slurm it
# fails with:
#   srun: error: Job request does not match any supported policy.
#
# Queue / resource notes:
#   - Jobs using only 1 or 2 GPUs should request the "shared" queue instead.
#   - -C is the same as --constraint; "gpu&hbm80g" selects the 80 GB GPUs
#     instead of the 40 GB variant — use it if 80 GB of GPU memory is needed.
#   - --gpus-per-task: GPUs per MPI process (implicitly sets gpu-bind).
#   - --ntasks-per-node: MPI processes per compute node.
#   - The "regular" QOS is required to raise the walltime limit; otherwise
#     the run may be capped (reportedly to 5 minutes) — TODO confirm.
#
# Interactive-allocation equivalent:
#   salloc --nodes 1 --qos interactive --time 01:00:00 --constraint gpu \
#          --gpus 4 --account=m2959_g
#
# Timestamp format, if needed for output naming:
#   timestamp=$(date +%Y-%m-%d_%H-%M-%S)

#SBATCH -A myaccount_id
#SBATCH -C gpu
#SBATCH -q regular
# TODO: adjust the walltime as needed; 24 hours is the upper limit.
#SBATCH -t 03:00:00
# Number of compute nodes.
#SBATCH -N 1
# 32 CPU cores/threads requested per task.
#SBATCH -c 32
#SBATCH --gpus-per-node=4
# Option syntax reminder: long options use --name=value, short use -name value.
# Limit to one task: by default Slurm may create as many tasks as requested
# CPUs or GPUs when --ntasks is not set explicitly.
#SBATCH --ntasks=1
#SBATCH --job-name=test-using-4gpu
#SBATCH --output=%x_%j_output.out
#SBATCH --error=%x_%j_error.err

# Fail fast on errors, unset variables, and pipeline failures.
# NOTE: this must come AFTER the #SBATCH block — Slurm stops parsing
# directives at the first executable command.
set -euo pipefail

# SECURITY: never commit a real API key here. Prefer exporting it from the
# submitting shell's environment or sourcing it from a permission-protected
# file instead of hardcoding it in the script.
export WANDB_API_KEY="????"

module load conda/Miniforge3-24.7.1-0
module load pytorch/2.3.1
#conda activate openmp-qa

srun python test-4-gpu.py