Skip to content
This repository was archived by the owner on Mar 20, 2023. It is now read-only.

Commit b59d420

Browse files
committed
Upgrade nvidia-docker to nvidia-docker2
1 parent 67aefaf commit b59d420

7 files changed

Lines changed: 74 additions & 202 deletions

File tree

contrib/packer/ubuntu-16.04-GPU+IB/bootstrap.sh

Lines changed: 7 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -62,39 +62,13 @@ EOF
6262
chmod 755 $nvdriver
6363
$nvdriver -s
6464
# install nvidia-docker
65-
curl -OfSsL https://github.com/NVIDIA/nvidia-docker/releases/download/v1.0.1/nvidia-docker_1.0.1-1_amd64.deb
66-
dpkg -i nvidia-docker_1.0.1-1_amd64.deb
67-
rm nvidia-docker_1.0.1-1_amd64.deb
68-
# do not auto-enable nvidia docker service
69-
systemctl disable nvidia-docker.service
70-
systemctl start nvidia-docker.service
71-
systemctl status nvidia-docker.service
72-
# get driver version
73-
nvdriverver=`cat /proc/driver/nvidia/version | grep "Kernel Module" | cut -d ' ' -f 9`
74-
echo nvidia driver version $nvdriverver detected
75-
# create the docker volume now to avoid volume driver conflicts for
76-
# tasks. run this in a loop as it can fail if triggered too quickly
77-
# after start
78-
NV_START=$(date -u +"%s")
79-
set +e
80-
while :
81-
do
82-
echo "Attempting to create nvidia-docker volume with version $nvdriverver"
83-
docker volume create -d nvidia-docker --name nvidia_driver_$nvdriverver
84-
if [ $? -eq 0 ]; then
85-
break
86-
else
87-
NV_NOW=$(date -u +"%s")
88-
NV_DIFF=$((($NV_NOW-$NV_START)/60))
89-
# fail after 5 minutes of attempts
90-
if [ $NV_DIFF -ge 5 ]; then
91-
echo "could not create nvidia-docker volume"
92-
exit 1
93-
fi
94-
sleep 1
95-
fi
96-
done
97-
set -e
65+
curl -fSsL https://nvidia.github.io/nvidia-docker/gpgkey | apt-key add -
66+
curl -fSsL https://nvidia.github.io/nvidia-docker/ubuntu16.04/amd64/nvidia-docker.list | \
67+
tee /etc/apt/sources.list.d/nvidia-docker.list
68+
apt-get update
69+
apt-get install nvidia-docker2
70+
pkill -SIGHUP dockerd
71+
nvidia-docker version
9872
fi
9973
set -e
10074

contrib/packer/ubuntu-16.04-GPU/bootstrap.sh

Lines changed: 7 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -60,38 +60,12 @@ EOF
6060
chmod 755 $nvdriver
6161
$nvdriver -s
6262
# install nvidia-docker
63-
curl -OfSsL https://github.com/NVIDIA/nvidia-docker/releases/download/v1.0.1/nvidia-docker_1.0.1-1_amd64.deb
64-
dpkg -i nvidia-docker_1.0.1-1_amd64.deb
65-
rm nvidia-docker_1.0.1-1_amd64.deb
66-
# do not auto-enable nvidia docker service
67-
systemctl disable nvidia-docker.service
68-
systemctl start nvidia-docker.service
69-
systemctl status nvidia-docker.service
70-
# get driver version
71-
nvdriverver=`cat /proc/driver/nvidia/version | grep "Kernel Module" | cut -d ' ' -f 9`
72-
echo nvidia driver version $nvdriverver detected
73-
# create the docker volume now to avoid volume driver conflicts for
74-
# tasks. run this in a loop as it can fail if triggered too quickly
75-
# after start
76-
NV_START=$(date -u +"%s")
77-
set +e
78-
while :
79-
do
80-
echo "Attempting to create nvidia-docker volume with version $nvdriverver"
81-
docker volume create -d nvidia-docker --name nvidia_driver_$nvdriverver
82-
if [ $? -eq 0 ]; then
83-
break
84-
else
85-
NV_NOW=$(date -u +"%s")
86-
NV_DIFF=$((($NV_NOW-$NV_START)/60))
87-
# fail after 5 minutes of attempts
88-
if [ $NV_DIFF -ge 5 ]; then
89-
echo "could not create nvidia-docker volume"
90-
exit 1
91-
fi
92-
sleep 1
93-
fi
94-
done
95-
set -e
63+
curl -fSsL https://nvidia.github.io/nvidia-docker/gpgkey | apt-key add -
64+
curl -fSsL https://nvidia.github.io/nvidia-docker/ubuntu16.04/amd64/nvidia-docker.list | \
65+
tee /etc/apt/sources.list.d/nvidia-docker.list
66+
apt-get update
67+
apt-get install nvidia-docker2
68+
pkill -SIGHUP dockerd
69+
nvidia-docker version
9670
fi
9771
set -e

convoy/fleet.py

Lines changed: 5 additions & 61 deletions
Original file line numberDiff line numberDiff line change
@@ -63,30 +63,6 @@
6363
_REQUEST_CHUNK_SIZE = 4194304
6464
_ROOT_PATH = pathlib.Path(__file__).resolve().parent.parent
6565
_RESOURCES_PATH = None
66-
__NVIDIA_DOCKER_RPM = {
67-
'url': (
68-
'https://github.com/NVIDIA/nvidia-docker/releases/download/'
69-
'v1.0.1/nvidia-docker-1.0.1-1.x86_64.rpm'
70-
),
71-
'sha256': (
72-
'f05dfe7fe655ed39c399db0d6362e351b059f2708c3e6da17f590a000237ec3a'
73-
),
74-
'target': 'nvidia-docker.rpm'
75-
}
76-
_NVIDIA_DOCKER = {
77-
'ubuntuserver': {
78-
'url': (
79-
'https://github.com/NVIDIA/nvidia-docker/releases/download/'
80-
'v1.0.1/nvidia-docker_1.0.1-1_amd64.deb'
81-
),
82-
'sha256': (
83-
'9fbfd98f87ef2fd2e2137e3ba59431890dde6caf96f113ea0a1bd15bb3e51afa'
84-
),
85-
'target': 'nvidia-docker.deb'
86-
},
87-
'centos': __NVIDIA_DOCKER_RPM,
88-
'centos-hpc': __NVIDIA_DOCKER_RPM,
89-
}
9066
_NVIDIA_DRIVER = {
9167
'compute': {
9268
'url': (
@@ -304,36 +280,6 @@ def _setup_nvidia_driver_package(blob_client, config, vm_size):
304280
return pkg
305281

306282

307-
def _setup_nvidia_docker_package(blob_client, config):
308-
# type: (azure.storage.blob.BlockBlobService, dict) -> pathlib.Path
309-
"""Set up the nvidia docker package
310-
:param azure.storage.blob.BlockBlobService blob_client: blob client
311-
:param dict config: configuration dict
312-
:rtype: pathlib.Path
313-
:return: package path
314-
"""
315-
offer = settings.pool_offer(config, lower=True)
316-
pkg = _RESOURCES_PATH / _NVIDIA_DOCKER[offer]['target']
317-
# check to see if package is downloaded
318-
if (not pkg.exists() or
319-
util.compute_sha256_for_file(pkg, False) !=
320-
_NVIDIA_DOCKER[offer]['sha256']):
321-
# download package
322-
logger.debug('downloading NVIDIA docker to {}'.format(
323-
_NVIDIA_DOCKER[offer]['target']))
324-
response = requests.get(_NVIDIA_DOCKER[offer]['url'], stream=True)
325-
with pkg.open('wb') as f:
326-
for chunk in response.iter_content(chunk_size=_REQUEST_CHUNK_SIZE):
327-
if chunk:
328-
f.write(chunk)
329-
logger.debug('wrote {} bytes to {}'.format(pkg.stat().st_size, pkg))
330-
# check sha256
331-
if (util.compute_sha256_for_file(pkg, False) !=
332-
_NVIDIA_DOCKER[offer]['sha256']):
333-
raise RuntimeError('sha256 mismatch for {}'.format(pkg))
334-
return pkg
335-
336-
337283
def _generate_azure_mount_script_name(
338284
batch_account_name, pool_id, is_file_share, is_windows):
339285
# type: (str, str, bool, bool) -> pathlib.Path
@@ -1022,12 +968,9 @@ def _construct_pool_object(
1022968
gpu_type = settings.get_gpu_type_from_vm_size(
1023969
pool_settings.vm_size)
1024970
gpu_driver = pathlib.Path(_NVIDIA_DRIVER[gpu_type]['target'])
1025-
gpupkg = _setup_nvidia_docker_package(blob_client, config)
1026-
_rflist.append((gpupkg.name, gpupkg))
1027-
gpu_env = '{}:{}:{}'.format(
971+
gpu_env = '{}:{}'.format(
1028972
settings.is_gpu_visualization_pool(pool_settings.vm_size),
1029-
gpu_driver.name,
1030-
gpupkg.name)
973+
gpu_driver.name)
1031974
else:
1032975
gpu_env = None
1033976
# get container registries
@@ -2006,8 +1949,9 @@ def _adjust_settings_for_pool_creation(config):
20061949
config, vm_size=pool.vm_size)
20071950
if not allowed and util.is_none_or_empty(node_agent):
20081951
raise ValueError(
2009-
('unsupported Docker Host VM Config, publisher={} offer={} '
2010-
'sku={} vm_size={}').format(publisher, offer, sku, pool.vm_size))
1952+
('unsupported Docker (and/or GPU) Host VM Config, publisher={} '
1953+
'offer={} sku={} vm_size={}').format(
1954+
publisher, offer, sku, pool.vm_size))
20111955
# ensure HPC offers are matched with RDMA sizes
20121956
if (not is_windows and (
20131957
(offer == 'centos-hpc' or offer == 'sles-hpc') and

docs/63-batch-shipyard-custom-images.md

Lines changed: 4 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -167,9 +167,10 @@ be invocable as root with default path and permissions. The Docker socket
167167
(`/var/run/docker.sock`) must be available (it is available by default).
168168

169169
**Important Note:** If you have modified the Docker Root directory to
170-
mount on the node local temporary disk, then you must not enable the
170+
mount on the node local temporary disk, then you must not allow the
171171
service to run on boot due to potential races with the disk not being
172-
set up properly before the service starts.
172+
set up before the service starts. Batch Shipyard will take care of properly
173+
starting the service on boot.
173174

174175
#### SSH Server
175176
An SSH server should be installed and operational on port 22. You can
@@ -182,14 +183,9 @@ In order to utilize the GPUs available on compute nodes that have them
182183
(e.g., N-series VMs), the NVIDIA driver must be installed and loaded upon
183184
boot.
184185

185-
Additionally, [nvidia-docker](https://github.com/NVIDIA/nvidia-docker)
186+
Additionally, [nvidia-docker2](https://github.com/NVIDIA/nvidia-docker)
186187
must be installed.
187188

188-
**Important Note:** If you have modified the Docker Root directory to
189-
mount on the node local temporary disk, then you must not enable the
190-
nvidia-docker service to run on boot due to potential races with the disk
191-
not being set up properly before the service starts.
192-
193189
#### Infiniband/RDMA-enabled Compute Nodes
194190
The host VM Infiniband/RDMA stack must be enabled with the proper drivers
195191
and the required user-land software for Infiniband installed. It is best to

0 commit comments

Comments
 (0)