Test GPU CUDA compatibility

# 1. For DIND-enabled instances - run a container with the CUDA version to be tested
docker run -it --rm --runtime=nvidia --privileged -e NVIDIA_VISIBLE_DEVICES=all -e NVIDIA_DRIVER_CAPABILITIES="compute,utility" nvidia/cuda:12.4.1-runtime-rockylinux8 bash
docker run -it --rm --runtime=nvidia --privileged -e NVIDIA_VISIBLE_DEVICES=all -e NVIDIA_DRIVER_CAPABILITIES="compute,utility" nvidia/cuda:11.3.1-runtime-rockylinux8 bash
# And others from https://hub.docker.com/r/nvidia/cuda/tags
# Main requirement: the host drivers shall support the selected CUDA version (can be checked via nvidia-smi)
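# E.g. a quick way to print just the driver version and the GPU models
# (the full list of query fields is shown by nvidia-smi --help-query-gpu):
nvidia-smi --query-gpu=driver_version,name --format=csv,noheader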


# 2. Install Python (via a fresh micromamba environment)
yum install -y bzip2 && \
mkdir -p /opt/local/mamba && \
cd /opt/local/mamba && \
curl -Ls https://micro.mamba.pm/api/micromamba/linux-64/latest | tar -xvj bin/micromamba && \
export PATH=$PATH:/opt/local/mamba/bin && \
export MAMBA_ROOT_PREFIX=/opt/local/mamba && \
eval "$(micromamba shell hook --shell bash)" && \
micromamba create -n torch-test && \
micromamba activate torch-test && \
micromamba install -y python==3.9
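
# Sanity check - the activated environment shall now provide Python 3.9:
python --version && which python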

# 3. Install a PyTorch build which matches the CUDA version. Available options: https://pytorch.org/get-started/previous-versions/ (choose the pip approach)
# E.g. for CUDA 11.3:
pip install torch==1.12.1+cu113 torchvision==0.13.1+cu113 torchaudio==0.12.1 --extra-index-url https://download.pytorch.org/whl/cu113
# Or for CUDA 12.4:
pip install torch==2.6.0 torchvision==0.21.0 torchaudio==2.6.0 --index-url https://download.pytorch.org/whl/cu124
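
# To confirm the installed wheel was built against the expected CUDA version
# (torch.version.cuda reports the build-time CUDA version, not the host driver's):
python -c "import torch; print(torch.__version__, torch.version.cuda)"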

# 4. Create a test script which checks whether PyTorch can use the GPUs
cat > t.py <<'EOF'
import torch
print(torch.cuda.is_available())
print(torch.cuda.device_count())
print(torch.cuda.get_device_name(0))
EOF
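
# Optionally extend the check with a real GPU computation - a minimal sketch
# (the script name t2.py is arbitrary) which runs a small matmul on the first GPU,
# confirming that kernels actually execute, not just that the device is visible:
cat > t2.py <<'EOF'
import torch
a = torch.randn(1024, 1024, device="cuda")  # allocate directly on the GPU
b = torch.randn(1024, 1024, device="cuda")
c = a @ b                                   # launches a CUDA matmul kernel
torch.cuda.synchronize()                    # wait for the kernel to finish
print("matmul OK:", c.shape)
EOF
python t2.py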

# 5. Execute the tests
# - shall report the driver version and the maximum supported CUDA version
nvidia-smi
# - shall report three lines: True/False (whether CUDA is available), the number of GPUs, and the first GPU model
python t.py
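
# Optionally, a one-liner variant for scripted checks - exits non-zero if no GPU is usable:
python -c "import torch, sys; sys.exit(0 if torch.cuda.is_available() else 1)" && echo "CUDA OK" || echo "CUDA NOT available"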