perlmutter - chunhualiao/public-docs GitHub Wiki
How do I submit a batch job script?
cat submit.1node.4gpu.job.sh
#!/bin/bash
#
# Batch script for NERSC Perlmutter: run a 4-GPU test on one GPU node.
#
# Submit with:
#   sbatch submit.1node.4gpu.job.sh
# Do NOT execute it directly (./submit.1node.4gpu.job.sh); outside Slurm it
# fails with:
#   srun: error: Job request does not match any supported policy.
#
# Queue / resource notes:
#   - Jobs using only 1 or 2 GPUs should request the "shared" queue instead.
#   - -C is the same as --constraint; "gpu&hbm80g" selects the 80 GB GPUs
#     instead of the 40 GB variant — use it if 80 GB of GPU memory is needed.
#   - --gpus-per-task: GPUs per MPI process (implicitly sets gpu-bind).
#   - --ntasks-per-node: MPI processes per compute node.
#   - The "regular" QOS is required to raise the walltime limit; otherwise
#     the run may be capped (reportedly to 5 minutes) — TODO confirm.
#
# Interactive-allocation equivalent:
#   salloc --nodes 1 --qos interactive --time 01:00:00 --constraint gpu \
#          --gpus 4 --account=m2959_g
#
# Timestamp format, if needed for output naming:
#   timestamp=$(date +%Y-%m-%d_%H-%M-%S)

#SBATCH -A myaccount_id
#SBATCH -C gpu
#SBATCH -q regular
# TODO: adjust the walltime as needed; 24 hours is the upper limit.
#SBATCH -t 03:00:00
# Number of compute nodes.
#SBATCH -N 1
# 32 CPU cores/threads requested per task.
#SBATCH -c 32
#SBATCH --gpus-per-node=4
# Option syntax reminder: long options use --name=value, short use -name value.
# Limit to one task: by default Slurm may create as many tasks as requested
# CPUs or GPUs when --ntasks is not set explicitly.
#SBATCH --ntasks=1
#SBATCH --job-name=test-using-4gpu
#SBATCH --output=%x_%j_output.out
#SBATCH --error=%x_%j_error.err

# Fail fast on errors, unset variables, and pipeline failures.
# NOTE: this must come AFTER the #SBATCH block — Slurm stops parsing
# directives at the first executable command.
set -euo pipefail

# SECURITY: never commit a real API key here. Prefer exporting it from the
# submitting shell's environment or sourcing it from a permission-protected
# file instead of hardcoding it in the script.
export WANDB_API_KEY="????"

module load conda/Miniforge3-24.7.1-0
module load pytorch/2.3.1
#conda activate openmp-qa

srun python test-4-gpu.py