nvidia docker - yszheda/wiki GitHub Wiki
- https://github.com/NVIDIA/nvidia-docker
- https://github.com/NVIDIA/libnvidia-container
- https://github.com/NVIDIA/nvidia-container-runtime
References
$ docker inspect -f '{{index .Config.Labels "com.nvidia.volumes.needed"}}' nvidia/cuda
nvidia_driver
$ docker inspect -f '{{index .Config.Labels "com.nvidia.cuda.version"}}' nvidia/cuda
7.5
- nvidia-docker vs nvidia-container-runtime #815
- https://developer.nvidia.com/nvidia-container-runtime
- 深入理解 nvidia-docker2.0
- nvidia-docker2在kubernetes上实践
Install
- Using GPU from a docker container?
- Brainiarc7/nvidia-docker2-deploy-ubuntu-16.04LTS.md
- https://github.com/gw0/docker-debian-cuda/
- 使用nvidia-docker2
nvidia-docker2 for Docker images that are not based on NVIDIA's official CUDA images
Make sure nvidia-cuda-toolkit is installed in the Docker image.
docker run --runtime=nvidia without NVIDIA_VISIBLE_DEVICES set (the device is not whitelisted in the container):
$ nvidia-smi
Failed to initialize NVML: Unknown Error
$ strace nvidia-smi
execve("/usr/bin/nvidia-smi", ["nvidia-smi"], [/* 22 vars */]) = 0
brk(NULL) = 0x1832000
access("/etc/ld.so.nohwcap", F_OK) = -1 ENOENT (No such file or directory)
mmap(NULL, 12288, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0x7f8f4ea5d000
access("/etc/ld.so.preload", R_OK) = -1 ENOENT (No such file or directory)
open("/home/bot/dev/dr_ros/install_isolated/lib/tls/x86_64/libpthread.so.0", O_RDONLY|O_CLOEXEC) = -1 ENOENT (No such file or directory)
stat("/home/bot/dev/dr_ros/install_isolated/lib/tls/x86_64", 0x7ffd678355d0) = -1 ENOENT (No such file or directory)
open("/home/bot/dev/dr_ros/install_isolated/lib/tls/libpthread.so.0", O_RDONLY|O_CLOEXEC) = -1 ENOENT (No such file or directory)
stat("/home/bot/dev/dr_ros/install_isolated/lib/tls", 0x7ffd678355d0) = -1 ENOENT (No such file or directory)
open("/home/bot/dev/dr_ros/install_isolated/lib/x86_64/libpthread.so.0", O_RDONLY|O_CLOEXEC) = -1 ENOENT (No such file or directory)
stat("/home/bot/dev/dr_ros/install_isolated/lib/x86_64", 0x7ffd678355d0) = -1 ENOENT (No such file or directory)
open("/home/bot/dev/dr_ros/install_isolated/lib/libpthread.so.0", O_RDONLY|O_CLOEXEC) = -1 ENOENT (No such file or directory)
stat("/home/bot/dev/dr_ros/install_isolated/lib", {st_mode=S_IFDIR|0755, st_size=20480, ...}) = 0
open("/etc/ld.so.cache", O_RDONLY|O_CLOEXEC) = 3
fstat(3, {st_mode=S_IFREG|0644, st_size=248120, ...}) = 0
mmap(NULL, 248120, PROT_READ, MAP_PRIVATE, 3, 0) = 0x7f8f4ea20000
close(3) = 0
access("/etc/ld.so.nohwcap", F_OK) = -1 ENOENT (No such file or directory)
open("/lib/x86_64-linux-gnu/libpthread.so.0", O_RDONLY|O_CLOEXEC) = 3
read(3, "\177ELF\2\1\1\0\0\0\0\0\0\0\0\0\3\0>\0\1\0\0\0Pa\0\0\0\0\0\0"..., 832) = 832
fstat(3, {st_mode=S_IFREG|0755, st_size=135440, ...}) = 0
mmap(NULL, 2212936, PROT_READ|PROT_EXEC, MAP_PRIVATE|MAP_DENYWRITE, 3, 0) = 0x7f8f4e620000
mprotect(0x7f8f4e638000, 2093056, PROT_NONE) = 0
mmap(0x7f8f4e837000, 8192, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_FIXED|MAP_DENYWRITE, 3, 0x17000) = 0x7f8f4e837000
mmap(0x7f8f4e839000, 13384, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_FIXED|MAP_ANONYMOUS, -1, 0) = 0x7f8f4e839000
close(3) = 0
open("/home/bot/dev/dr_ros/install_isolated/lib/libdl.so.2", O_RDONLY|O_CLOEXEC) = -1 ENOENT (No such file or directory)
access("/etc/ld.so.nohwcap", F_OK) = -1 ENOENT (No such file or directory)
open("/lib/x86_64-linux-gnu/libdl.so.2", O_RDONLY|O_CLOEXEC) = 3
read(3, "\177ELF\2\1\1\0\0\0\0\0\0\0\0\0\3\0>\0\1\0\0\0\200\r\0\0\0\0\0\0"..., 832) = 832
fstat(3, {st_mode=S_IFREG|0644, st_size=14640, ...}) = 0
mmap(NULL, 2109680, PROT_READ|PROT_EXEC, MAP_PRIVATE|MAP_DENYWRITE, 3, 0) = 0x7f8f4e41c000
mprotect(0x7f8f4e41f000, 2093056, PROT_NONE) = 0
mmap(0x7f8f4e61e000, 8192, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_FIXED|MAP_DENYWRITE, 3, 0x2000) = 0x7f8f4e61e000
close(3) = 0
open("/home/bot/dev/dr_ros/install_isolated/lib/libc.so.6", O_RDONLY|O_CLOEXEC) = -1 ENOENT (No such file or directory)
access("/etc/ld.so.nohwcap", F_OK) = -1 ENOENT (No such file or directory)
open("/lib/x86_64-linux-gnu/libc.so.6", O_RDONLY|O_CLOEXEC) = 3
read(3, "\177ELF\2\1\1\3\0\0\0\0\0\0\0\0\3\0>\0\1\0\0\0\0\4\2\0\0\0\0\0"..., 832) = 832
fstat(3, {st_mode=S_IFREG|0755, st_size=1689360, ...}) = 0
mmap(NULL, 3795296, PROT_READ|PROT_EXEC, MAP_PRIVATE|MAP_DENYWRITE, 3, 0) = 0x7f8f4e07d000
mprotect(0x7f8f4e212000, 2097152, PROT_NONE) = 0
mmap(0x7f8f4e412000, 24576, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_FIXED|MAP_DENYWRITE, 3, 0x195000) = 0x7f8f4e412000
mmap(0x7f8f4e418000, 14688, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_FIXED|MAP_ANONYMOUS, -1, 0) = 0x7f8f4e418000
close(3) = 0
open("/home/bot/dev/dr_ros/install_isolated/lib/librt.so.1", O_RDONLY|O_CLOEXEC) = -1 ENOENT (No such file or directory)
access("/etc/ld.so.nohwcap", F_OK) = -1 ENOENT (No such file or directory)
open("/lib/x86_64-linux-gnu/librt.so.1", O_RDONLY|O_CLOEXEC) = 3
read(3, "\177ELF\2\1\1\0\0\0\0\0\0\0\0\0\3\0>\0\1\0\0\0\340 \0\0\0\0\0\0"..., 832) = 832
fstat(3, {st_mode=S_IFREG|0644, st_size=31744, ...}) = 0
mmap(NULL, 2128832, PROT_READ|PROT_EXEC, MAP_PRIVATE|MAP_DENYWRITE, 3, 0) = 0x7f8f4de75000
mprotect(0x7f8f4de7c000, 2093056, PROT_NONE) = 0
mmap(0x7f8f4e07b000, 8192, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_FIXED|MAP_DENYWRITE, 3, 0x6000) = 0x7f8f4e07b000
close(3) = 0
mmap(NULL, 8192, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0x7f8f4ea1e000
arch_prctl(ARCH_SET_FS, 0x7f8f4ea1efc0) = 0
mprotect(0x7f8f4e412000, 16384, PROT_READ) = 0
mprotect(0x7f8f4e837000, 4096, PROT_READ) = 0
mprotect(0x7f8f4e07b000, 4096, PROT_READ) = 0
mprotect(0x7f8f4e61e000, 4096, PROT_READ) = 0
mprotect(0x7f8f4ea60000, 4096, PROT_READ) = 0
munmap(0x7f8f4ea20000, 248120) = 0
set_tid_address(0x7f8f4ea1f290) = 38
set_robust_list(0x7f8f4ea1f2a0, 24) = 0
rt_sigaction(SIGRTMIN, {sa_handler=0x7f8f4e625bd0, sa_mask=[], sa_flags=SA_RESTORER|SA_SIGINFO, sa_restorer=0x7f8f4e6310c0}, NULL, 8) = 0
rt_sigaction(SIGRT_1, {sa_handler=0x7f8f4e625c60, sa_mask=[], sa_flags=SA_RESTORER|SA_RESTART|SA_SIGINFO, sa_restorer=0x7f8f4e6310c0}, NULL, 8) = 0
rt_sigprocmask(SIG_UNBLOCK, [RTMIN RT_1], NULL, 8) = 0
getrlimit(RLIMIT_STACK, {rlim_cur=8192*1024, rlim_max=RLIM64_INFINITY}) = 0
futex(0x7f8f4e61f0a8, FUTEX_WAKE_PRIVATE, 2147483647) = 0
brk(NULL) = 0x1832000
brk(0x1853000) = 0x1853000
open("/home/bot/dev/dr_ros/install_isolated/lib/libnvidia-ml.so.1", O_RDONLY|O_CLOEXEC) = -1 ENOENT (No such file or directory)
open("/etc/ld.so.cache", O_RDONLY|O_CLOEXEC) = 3
fstat(3, {st_mode=S_IFREG|0644, st_size=248120, ...}) = 0
mmap(NULL, 248120, PROT_READ, MAP_PRIVATE, 3, 0) = 0x7f8f4ea20000
close(3) = 0
access("/etc/ld.so.nohwcap", F_OK) = -1 ENOENT (No such file or directory)
open("/usr/lib/x86_64-linux-gnu/libnvidia-ml.so.1", O_RDONLY|O_CLOEXEC) = 3
read(3, "\177ELF\2\1\1\0\0\0\0\0\0\0\0\0\3\0>\0\1\0\0\0p\335\0\0\0\0\0\0"..., 832) = 832
fstat(3, {st_mode=S_IFREG|0644, st_size=1312544, ...}) = 0
mmap(NULL, 6124712, PROT_READ|PROT_EXEC, MAP_PRIVATE|MAP_DENYWRITE, 3, 0) = 0x7f8f4d89d000
mprotect(0x7f8f4d9c9000, 2097152, PROT_NONE) = 0
mmap(0x7f7c8fab8000, 8192, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_FIXED|MAP_DENYWRITE, 3, 0x6000) = 0x7f7c8fab8000
close(3) = 0
mmap(NULL, 8192, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0x7f7c9045b000
arch_prctl(ARCH_SET_FS, 0x7f7c9045bd00) = 0
mprotect(0x7f7c8fe4f000, 16384, PROT_READ) = 0
mprotect(0x7f7c90274000, 4096, PROT_READ) = 0
mprotect(0x7f7c8fab8000, 4096, PROT_READ) = 0
mprotect(0x7f7c9005b000, 4096, PROT_READ) = 0
mprotect(0x7f7c9049d000, 4096, PROT_READ) = 0
munmap(0x7f7c9045d000, 248120) = 0
set_tid_address(0x7f7c9045bfd0) = 45
set_robust_list(0x7f7c9045bfe0, 24) = 0
rt_sigaction(SIGRTMIN, {sa_handler=0x7f7c90062bd0, sa_mask=[], sa_flags=SA_RESTORER|SA_SIGINFO, sa_restorer=0x7f7c9006e0c0}, NULL, 8) = 0
rt_sigaction(SIGRT_1, {sa_handler=0x7f7c90062c60, sa_mask=[], sa_flags=SA_RESTORER|SA_RESTART|SA_SIGINFO, sa_restorer=0x7f7c9006e0c0}, NULL, 8) = 0
rt_sigprocmask(SIG_UNBLOCK, [RTMIN RT_1], NULL, 8) = 0
getrlimit(RLIMIT_STACK, {rlim_cur=8192*1024, rlim_max=RLIM64_INFINITY}) = 0
futex(0x7f7c9005c0a8, FUTEX_WAKE_PRIVATE, 2147483647) = 0
brk(NULL) = 0xee6000
brk(0xf07000) = 0xf07000
open("/etc/ld.so.cache", O_RDONLY|O_CLOEXEC) = 3
fstat(3, {st_mode=S_IFREG|0644, st_size=248120, ...}) = 0
mmap(NULL, 248120, PROT_READ, MAP_PRIVATE, 3, 0) = 0x7f7c9045d000
close(3) = 0
access("/etc/ld.so.nohwcap", F_OK) = -1 ENOENT (No such file or directory)
open("/usr/lib/x86_64-linux-gnu/libnvidia-ml.so.1", O_RDONLY|O_CLOEXEC) = 3
read(3, "\177ELF\2\1\1\0\0\0\0\0\0\0\0\0\3\0>\0\1\0\0\0p\335\0\0\0\0\0\0"..., 832) = 832
fstat(3, {st_mode=S_IFREG|0644, st_size=1312544, ...}) = 0
mmap(NULL, 6124712, PROT_READ|PROT_EXEC, MAP_PRIVATE|MAP_DENYWRITE, 3, 0) = 0x7f7c8f2da000
mprotect(0x7f7c8f406000, 2097152, PROT_NONE) = 0
mmap(0x7f7c8f606000, 86016, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_FIXED|MAP_DENYWRITE, 3, 0x12c000) = 0x7f7c8f606000
mmap(0x7f7c8f61b000, 2712744, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_FIXED|MAP_ANONYMOUS, -1, 0) = 0x7f7c8f61b000
close(3) = 0
munmap(0x7f7c9045d000, 248120) = 0
getpid() = 45
open("/proc/modules", O_RDONLY) = 3
fstat(3, {st_mode=S_IFREG|0444, st_size=0, ...}) = 0
read(3, "nvidia_uvm 663552 0 - Live 0xfff"..., 1024) = 1024
read(3, "f_conntrack 114688 6 nf_nat_masq"..., 1024) = 1024
read(3, " 0 - Live 0xffffffffc0427000\nnvi"..., 1024) = 1024
read(3, "d_hda_codec_realtek,snd_hda_code"..., 1024) = 1024
close(3) = 0
open("/proc/driver/nvidia/params", O_RDONLY) = 3
fstat(3, {st_mode=S_IFREG|0444, st_size=0, ...}) = 0
read(3, "Mobile: 4294967295\nResmanDebugLe"..., 1024) = 491
close(3) = 0
stat("/dev/nvidiactl", {st_mode=S_IFCHR|0666, st_rdev=makedev(195, 255), ...}) = 0
open("/dev/nvidiactl", O_RDWR) = -1 EPERM (Operation not permitted)
open("/dev/nvidiactl", O_RDONLY) = -1 EPERM (Operation not permitted)
fstat(1, {st_mode=S_IFCHR|0620, st_rdev=makedev(136, 0), ...}) = 0
write(1, "Failed to initialize NVML: Unkno"..., 41Failed to initialize NVML: Unknown Error
) = 41
exit_group(255) = ?
+++ exited with 255 +++
docker run --runtime=nvidia -e NVIDIA_VISIBLE_DEVICES=all
docker: Error response from daemon: OCI runtime create failed: container_linux.go:344: starting container process caused "process_linux.go:424: container init caused \"process_linux.go:407: running prestart hook 1 caused \\\"error running hook: exit status 1, stdout: , stderr: exec command: [/usr/bin/nvidia-container-cli --load-kmods --debug=/var/log/nvidia-container-runtime-hook.log configure --ldconfig=@/sbin/ldconfig --device=all --utility --pid=5383 /home/bot/docker/overlay2/c8eddfaa444c2ec572944e7ba3c3eea2a5db6dab69c9bdcdb5d5123124d57412/merged]\\\\nnvidia-container-cli: mount error: file creation failed: /home/bot/docker/overlay2/c8eddfaa444c2ec572944e7ba3c3eea2a5db6dab69c9bdcdb5d5123124d57412/merged/usr/bin/nvidia-smi: file exists\\\\n\\\"\"": unknown.
-- WARNING, the following logs are for debugging purposes only --
I0222 04:19:31.013186 6804 nvc.c:281] initializing library context (version=1.0.1, build=038fb92d00c94f97d61492d4ed1f82e981129b74)
I0222 04:19:31.013281 6804 nvc.c:255] using root /
I0222 04:19:31.013297 6804 nvc.c:256] using ldcache /etc/ld.so.cache
I0222 04:19:31.013310 6804 nvc.c:257] using unprivileged user 65534:65534
I0222 04:19:31.014520 6808 nvc.c:191] loading kernel module nvidia
I0222 04:19:31.014910 6808 nvc.c:203] loading kernel module nvidia_uvm
I0222 04:19:31.015063 6808 nvc.c:211] loading kernel module nvidia_modeset
I0222 04:19:31.015584 6809 driver.c:133] starting driver service
I0222 04:19:31.053406 6804 nvc_container.c:364] configuring container with 'utility supervised'
I0222 04:19:31.053610 6804 nvc_container.c:384] setting pid to 6774
I0222 04:19:31.053621 6804 nvc_container.c:385] setting rootfs to /home/bot/docker/overlay2/e99e2f3211230bd58966a98c6e83ccbdb5672cdb8d292502e240fe5e4182a980/merged
I0222 04:19:31.053629 6804 nvc_container.c:386] setting owner to 0:0
I0222 04:19:31.053636 6804 nvc_container.c:387] setting bins directory to /usr/bin
I0222 04:19:31.053642 6804 nvc_container.c:388] setting libs directory to /usr/lib/x86_64-linux-gnu
I0222 04:19:31.053649 6804 nvc_container.c:389] setting libs32 directory to /usr/lib/i386-linux-gnu
I0222 04:19:31.053656 6804 nvc_container.c:390] setting cudart directory to /usr/local/cuda
I0222 04:19:31.053662 6804 nvc_container.c:391] setting ldconfig to @/sbin/ldconfig (host relative)
I0222 04:19:31.053669 6804 nvc_container.c:392] setting mount namespace to /proc/6774/ns/mnt
I0222 04:19:31.053676 6804 nvc_container.c:394] setting devices cgroup to /sys/fs/cgroup/devices/docker/a31e5b4fb3a5949ecb11c5c013c2cbdb5d91ec9cccc6054717372f552a7d4ded
I0222 04:19:31.053685 6804 nvc_info.c:433] requesting driver information with ''
I0222 04:19:31.054366 6804 nvc_info.c:147] selecting /usr/lib/x86_64-linux-gnu/libnvidia-tls.so.384.130
I0222 04:19:31.054440 6804 nvc_info.c:147] selecting /usr/lib/x86_64-linux-gnu/nvidia/current/libnvidia-ptxjitcompiler.so.384.130
I0222 04:19:31.054504 6804 nvc_info.c:147] selecting /usr/lib/x86_64-linux-gnu/nvidia/current/libnvidia-opencl.so.384.130
I0222 04:19:31.054563 6804 nvc_info.c:147] selecting /usr/lib/x86_64-linux-gnu/nvidia/current/libnvidia-ml.so.384.130
I0222 04:19:31.054600 6804 nvc_info.c:147] selecting /usr/lib/x86_64-linux-gnu/libnvidia-glsi.so.384.130
I0222 04:19:31.054635 6804 nvc_info.c:147] selecting /usr/lib/x86_64-linux-gnu/libnvidia-glcore.so.384.130
I0222 04:19:31.054667 6804 nvc_info.c:147] selecting /usr/lib/x86_64-linux-gnu/libnvidia-fatbinaryloader.so.384.130
I0222 04:19:31.054703 6804 nvc_info.c:147] selecting /usr/lib/x86_64-linux-gnu/libnvidia-eglcore.so.384.130
I0222 04:19:31.054736 6804 nvc_info.c:147] selecting /usr/lib/x86_64-linux-gnu/libnvidia-compiler.so.384.130
I0222 04:19:31.054809 6804 nvc_info.c:147] selecting /usr/lib/x86_64-linux-gnu/nvidia/current/libnvidia-cfg.so.384.130
I0222 04:19:31.054866 6804 nvc_info.c:147] selecting /usr/lib/x86_64-linux-gnu/nvidia/current/libnvcuvid.so.384.130
I0222 04:19:31.055244 6804 nvc_info.c:147] selecting /usr/lib/x86_64-linux-gnu/nvidia/current/libcuda.so.384.130
I0222 04:19:31.055491 6804 nvc_info.c:147] selecting /usr/lib/x86_64-linux-gnu/nvidia/current/libGLX_nvidia.so.384.130
I0222 04:19:31.055553 6804 nvc_info.c:147] selecting /usr/lib/x86_64-linux-gnu/nvidia/current/libGLESv2_nvidia.so.384.130
I0222 04:19:31.055611 6804 nvc_info.c:147] selecting /usr/lib/x86_64-linux-gnu/nvidia/current/libGLESv1_CM_nvidia.so.384.130
I0222 04:19:31.055669 6804 nvc_info.c:147] selecting /usr/lib/x86_64-linux-gnu/nvidia/current/libEGL_nvidia.so.384.130
W0222 04:19:31.055722 6804 nvc_info.c:298] missing library libvdpau_nvidia.so
W0222 04:19:31.055735 6804 nvc_info.c:298] missing library libnvidia-encode.so
W0222 04:19:31.055745 6804 nvc_info.c:298] missing library libnvidia-fbc.so
W0222 04:19:31.055752 6804 nvc_info.c:298] missing library libnvidia-ifr.so
W0222 04:19:31.055758 6804 nvc_info.c:302] missing compat32 library libnvidia-ml.so
W0222 04:19:31.055774 6804 nvc_info.c:302] missing compat32 library libnvidia-cfg.so
W0222 04:19:31.055782 6804 nvc_info.c:302] missing compat32 library libcuda.so
W0222 04:19:31.055791 6804 nvc_info.c:302] missing compat32 library libnvidia-opencl.so
W0222 04:19:31.055800 6804 nvc_info.c:302] missing compat32 library libnvidia-ptxjitcompiler.so
W0222 04:19:31.055809 6804 nvc_info.c:302] missing compat32 library libnvidia-fatbinaryloader.so
W0222 04:19:31.055816 6804 nvc_info.c:302] missing compat32 library libnvidia-compiler.so
W0222 04:19:31.055822 6804 nvc_info.c:302] missing compat32 library libvdpau_nvidia.so
W0222 04:19:31.055829 6804 nvc_info.c:302] missing compat32 library libnvidia-encode.so
W0222 04:19:31.055835 6804 nvc_info.c:302] missing compat32 library libnvcuvid.so
W0222 04:19:31.055842 6804 nvc_info.c:302] missing compat32 library libnvidia-eglcore.so
W0222 04:19:31.055849 6804 nvc_info.c:302] missing compat32 library libnvidia-glcore.so
W0222 04:19:31.055855 6804 nvc_info.c:302] missing compat32 library libnvidia-tls.so
W0222 04:19:31.055862 6804 nvc_info.c:302] missing compat32 library libnvidia-glsi.so
W0222 04:19:31.055868 6804 nvc_info.c:302] missing compat32 library libnvidia-fbc.so
W0222 04:19:31.055875 6804 nvc_info.c:302] missing compat32 library libnvidia-ifr.so
W0222 04:19:31.055882 6804 nvc_info.c:302] missing compat32 library libGLX_nvidia.so
W0222 04:19:31.055888 6804 nvc_info.c:302] missing compat32 library libEGL_nvidia.so
W0222 04:19:31.055895 6804 nvc_info.c:302] missing compat32 library libGLESv2_nvidia.so
W0222 04:19:31.055902 6804 nvc_info.c:302] missing compat32 library libGLESv1_CM_nvidia.so
I0222 04:19:31.056160 6804 nvc_info.c:228] selecting /usr/lib/nvidia/current/nvidia-smi
I0222 04:19:31.056197 6804 nvc_info.c:228] selecting /usr/lib/nvidia/current/nvidia-debugdump
I0222 04:19:31.056215 6804 nvc_info.c:228] selecting /usr/bin/nvidia-persistenced
W0222 04:19:31.056372 6804 nvc_info.c:324] missing binary nvidia-cuda-mps-control
W0222 04:19:31.056379 6804 nvc_info.c:324] missing binary nvidia-cuda-mps-server
I0222 04:19:31.056402 6804 nvc_info.c:365] listing device /dev/nvidiactl
I0222 04:19:31.056409 6804 nvc_info.c:365] listing device /dev/nvidia-uvm
I0222 04:19:31.056416 6804 nvc_info.c:365] listing device /dev/nvidia-uvm-tools
I0222 04:19:31.056422 6804 nvc_info.c:365] listing device /dev/nvidia-modeset
W0222 04:19:31.056445 6804 nvc_info.c:273] missing ipc /var/run/nvidia-persistenced/socket
W0222 04:19:31.056460 6804 nvc_info.c:273] missing ipc /tmp/nvidia-mps
I0222 04:19:31.056467 6804 nvc_info.c:489] requesting device information with ''
I0222 04:19:31.062324 6804 nvc_info.c:519] listing device /dev/nvidia0 (GPU-a11d518d-029c-38a6-dcb9-fbbbe818c226 at 00000000:01:00.0)
I0222 04:19:31.062379 6804 nvc_mount.c:252] mounting tmpfs at /home/bot/docker/overlay2/e99e2f3211230bd58966a98c6e83ccbdb5672cdb8d292502e240fe5e4182a980/merged/proc/driver/nvidia
I0222 04:19:31.078615 6804 nvc.c:318] shutting down library context
I0222 04:19:31.079014 6809 driver.c:192] terminating driver service
I0222 04:19:31.090407 6804 driver.c:233] driver service terminated successfully
docker build
- Request for nvidia-docker support in docker build or "stub" driver symbol libraries in devel images
- building images with nvidia-docker #595
default runtime
In /etc/docker/daemon.json, add:
"default-runtime": "nvidia",
Trouble-shooting
dial unix /var/lib/nvidia-docker/nvidia-docker.sock: connect: no such file or directory
查 /var/log/upstart/nvidia-docker.log,发现是 localhost 没被正确解析:
/usr/bin/nvidia-docker-plugin | 2017/10/10 17:29:40 Loading NVIDIA unified memory
/usr/bin/nvidia-docker-plugin | 2017/10/10 17:29:40 Loading NVIDIA management library
/usr/bin/nvidia-docker-plugin | 2017/10/10 17:29:40 Discovering GPU devices
/usr/bin/nvidia-docker-plugin | 2017/10/10 17:29:44 Provisioning volumes at /var/lib/nvidia-docker/volumes
/usr/bin/nvidia-docker-plugin | 2017/10/10 17:29:44 Serving plugin API at /var/lib/nvidia-docker
/usr/bin/nvidia-docker-plugin | 2017/10/10 17:29:44 Serving remote API at localhost:3476
/usr/bin/nvidia-docker-plugin | 2017/10/10 17:29:45 Error: listen tcp: lookup localhost on 8.8.8.8:53: no such host
- https://github.com/NVIDIA/nvidia-docker/issues/105
- https://stackoverflow.com/questions/25372781/docker-error-var-run-docker-sock-no-such-file-or-directory
nvcc not found
Use the devel variant of the CUDA image (e.g. nvidia/cuda:<version>-devel), which includes nvcc.
Failed to initialize NVML: Unknown Error
Set NVIDIA_VISIBLE_DEVICES when starting the container, e.g. docker run --runtime=nvidia -e NVIDIA_VISIBLE_DEVICES=all
mount error: file creation failed: /var/lib/docker/overlay2/xxx/merged/usr/bin/nvidia-smi: file exists
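Do not install the NVIDIA driver inside the Docker image: the runtime hook mounts the host's driver files (such as /usr/bin/nvidia-smi) into the container and fails if they already exist there.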
- https://github.com/NVIDIA/nvidia-docker/issues/748
- https://github.com/NVIDIA/nvidia-container-runtime/issues/44
- https://github.com/NVIDIA/nvidia-docker/issues/825
- https://github.com/NVIDIA/nvidia-docker/issues/588
No CUDA inside container
Pass -e NVIDIA_DRIVER_CAPABILITIES=compute,utility to docker run.
unknown flag: --gpus
sudo apt install -y --reinstall docker-ce docker-ce-cli nvidia-container-toolkit