From b301e915ba85ed3ac7ec3418fdc1fe502d3d32f9 Mon Sep 17 00:00:00 2001
From: Adam Novak
Date: Thu, 6 Jul 2023 11:37:41 -0400
Subject: [PATCH] Add a way to forward accelerators to Docker containers
 (#4492)

* Add accelerator forwarding to Docker API

* Satisfy MyPy

* Apply suggestions from code review

* Add missing comma
---
 src/toil/lib/accelerators.py | 36 ++++++++++++++++++++++++++++++++++--
 src/toil/lib/docker.py       | 32 ++++++++++++++++++++++++++++++++
 2 files changed, 66 insertions(+), 2 deletions(-)

diff --git a/src/toil/lib/accelerators.py b/src/toil/lib/accelerators.py
index 3dbb02364e..e56e6f6899 100644
--- a/src/toil/lib/accelerators.py
+++ b/src/toil/lib/accelerators.py
@@ -14,8 +14,9 @@
 """Accelerator (i.e. GPU) utilities for Toil"""
 
+import os
 import subprocess
-from typing import Dict, List, Optional, Set
+from typing import Dict, List, Optional, Set, Union
 from xml.dom import minidom
 
 from toil.job import AcceleratorRequirement
 
@@ -37,6 +38,37 @@ def have_working_nvidia_smi() -> bool:
         return False
     return True
 
+@memoize
+def get_host_accelerator_numbers() -> List[int]:
+    """
+    Work out which accelerator is which.
+
+    For each accelerator visible to us, returns the host-side (for example,
+    outside-of-Slurm-job) number for that accelerator. It is often the same as
+    the apparent number.
+
+    Can be used with Docker's --gpus='"device=#,#,#"' option to forward the
+    right GPUs as seen from a Docker daemon.
+    """
+
+    for number_list_var in ['SLURM_STEP_GPUS', 'SLURM_JOB_GPUS', 'CUDA_VISIBLE_DEVICES', 'NVIDIA_VISIBLE_DEVICES']:
+        # Any of these can have a list of GPU numbers, but the CUDA/NVIDIA ones
+        # also support a system of GPU GUIDs that we don't support.
+        # TODO: If Slurm confinement is set we ignore any attempt to further
+        # limit us with the app-level variables. Does that make sense? Writing
+        # code to translate through would be hard and probably not actually
+        # useful.
+        if number_list_var in os.environ:
+            device_string = os.environ[number_list_var]
+            # Parse all the numbers we have
+            device_numbers = [int(part) for part in device_string.split(',') if part.isnumeric()]
+            if len(device_numbers) > 0:
+                # We found some numbers, so use those
+                return device_numbers
+
+    # If we don't see a set of limits we understand, say we have all nvidia GPUs
+    return list(range(count_nvidia_gpus()))
+
 @memoize
 def have_working_nvidia_docker_runtime() -> bool:
     """
@@ -83,7 +115,7 @@ def get_individual_local_accelerators() -> List[AcceleratorRequirement]:
     # For now we only know about nvidia GPUs
     return [{'kind': 'gpu', 'brand': 'nvidia', 'api': 'cuda', 'count': 1} for _ in range(count_nvidia_gpus())]
 
-def get_restrictive_environment_for_local_accelerators(accelerator_numbers : Set[int]) -> Dict[str, str]:
+def get_restrictive_environment_for_local_accelerators(accelerator_numbers : Union[Set[int], List[int]]) -> Dict[str, str]:
     """
     Get environment variables which can be applied to a process to restrict
     it to using only the given accelerator numbers.
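
[Editor's note: illustrative usage sketch, not part of the patch above. It shows
how the new get_host_accelerator_numbers() helper is meant to translate job-local
accelerator numbers into host-side device numbers for Docker's --gpus option. It
assumes a Toil installation that already includes this change; the SLURM_STEP_GPUS
value below is a made-up example.]

import os

from toil.lib.accelerators import get_host_accelerator_numbers

# Pretend we are inside a Slurm step that was handed host GPUs 2 and 3.
# (Set before the first call, because the helper is memoized.)
os.environ['SLURM_STEP_GPUS'] = '2,3'

mapping = get_host_accelerator_numbers()
print(mapping)  # [2, 3]

# Job-local accelerator 0 is host GPU 2, which is the number a Docker
# daemon running outside the Slurm cgroup needs to see.
print(f"--gpus='\"device={mapping[0]}\"'")
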
diff --git a/src/toil/lib/docker.py b/src/toil/lib/docker.py
index 0a37072eee..ca2f913c17 100644
--- a/src/toil/lib/docker.py
+++ b/src/toil/lib/docker.py
@@ -17,6 +17,7 @@
 import re
 import struct
 from shlex import quote
+from typing import Optional, List
 
 import requests
 
@@ -27,6 +28,8 @@
                            create_api_error_from_http_exception)
 from docker.utils.socket import consume_socket_output, demux_adaptor
 
+from toil.lib.accelerators import get_host_accelerator_numbers
+
 logger = logging.getLogger(__name__)
 
 FORGO = 0
@@ -68,6 +71,7 @@ def apiDockerCall(job,
                   stream=False,
                   demux=False,
                   streamfile=None,
+                  accelerators: Optional[List[int]] = None,
                   timeout=365 * 24 * 60 * 60,
                   **kwargs):
     """
@@ -151,6 +155,11 @@ def toil_job(job):
                         not always able to abort ongoing reads and writes in
                         order to respect the timeout. Defaults to 1 year (i.e.
                         wait essentially indefinitely).
+    :param accelerators: Toil accelerator numbers (usually GPUs) to forward to
+                         the container. These are interpreted in the current
+                         Python process's environment. See
+                         toil.lib.accelerators.get_individual_local_accelerators()
+                         for the menu of available accelerators.
     :param kwargs: Additional keyword arguments supplied to the docker API's
                    run command. The list is 75 keywords total, for examples
                    and full documentation see:
@@ -238,6 +247,27 @@ def toil_job(job):
     if auto_remove is None:
         auto_remove = remove
 
+    device_requests = []
+    if accelerators:
+        # Map accelerator numbers to host numbers
+        host_accelerators = []
+        accelerator_mapping = get_host_accelerator_numbers()
+        for our_number in accelerators:
+            if our_number >= len(accelerator_mapping):
+                raise RuntimeError(
+                    f"Cannot forward accelerator {our_number} because only "
+                    f"{len(accelerator_mapping)} accelerators are available "
+                    f"to this job."
+                )
+            host_accelerators.append(accelerator_mapping[our_number])
+        # TODO: Here we assume that the host accelerators are all GPUs
+        device_requests.append(
+            docker.types.DeviceRequest(
+                device_ids=[','.join(str(n) for n in host_accelerators)],
+                capabilities=[['gpu']]
+            )
+        )
+
     try:
         if detach is False:
             # When detach is False, this returns stdout normally:
@@ -261,6 +291,7 @@ def toil_job(job):
                 log_config=log_config,
                 user=user,
                 environment=environment,
+                device_requests=device_requests,
                 **kwargs)
 
             if demux is False:
@@ -303,6 +334,7 @@ def toil_job(job):
                 log_config=log_config,
                 user=user,
                 environment=environment,
+                device_requests=device_requests,
                 **kwargs)
         if stdout or stderr:
             if streamfile is None:
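
[Editor's note: illustrative usage sketch, not part of the patch. It calls
apiDockerCall() with the new accelerators argument from inside a Toil job
function. The CUDA image tag and the nvidia-smi command are placeholder
choices, the workflow boilerplate follows the usual Toil pattern, and a
working NVIDIA Docker runtime is assumed.]

from toil.common import Toil
from toil.job import Job
from toil.lib.docker import apiDockerCall


def list_gpus(job):
    # Forward the job's first accelerator (job-local number 0) into the
    # container. apiDockerCall() translates it to the host device number via
    # get_host_accelerator_numbers() and builds a docker-py DeviceRequest.
    return apiDockerCall(job,
                         image='nvidia/cuda:11.8.0-base-ubuntu22.04',
                         parameters=['nvidia-smi', '-L'],
                         accelerators=[0])


if __name__ == '__main__':
    options = Job.Runner.getDefaultOptions('./gpu-jobstore')
    options.logLevel = 'INFO'
    with Toil(options) as workflow:
        # In a real workflow the job should also request a GPU through its own
        # accelerator requirements so the batch system actually grants it one.
        print(workflow.start(Job.wrapJobFn(list_gpus)))
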