Add a way to forward accelerators to Docker containers (#4492)
* Add accelerator forwarding to Docker API

* Satisfy MyPy

* Apply suggestions from code review

* Add missing comma
adamnovak committed Jul 6, 2023
1 parent d686dac commit b301e91
Showing 2 changed files with 66 additions and 2 deletions.
36 changes: 34 additions & 2 deletions src/toil/lib/accelerators.py
@@ -14,8 +14,9 @@

"""Accelerator (i.e. GPU) utilities for Toil"""

import os
import subprocess
from typing import Dict, List, Optional, Set
from typing import Dict, List, Optional, Set, Union
from xml.dom import minidom

from toil.job import AcceleratorRequirement
@@ -37,6 +38,37 @@ def have_working_nvidia_smi() -> bool:
return False
return True

@memoize
def get_host_accelerator_numbers() -> List[int]:
"""
Work out which accelerator is which.
For each accelerator visible to us, returns the host-side (for example,
outside-of-Slurm-job) number for that accelerator. It is often the same as
the apparent number.
Can be used with Docker's --gpus='"device=#,#,#"' option to forward the
right GPUs as seen from a Docker daemon.
"""

for number_list_var in ['SLURM_STEP_GPUS', 'SLURM_JOB_GPUS', 'CUDA_VISIBLE_DEVICES', 'NVIDIA_VISIBLE_DEVICES']:
# Any of these can have a list of GPU numbers, but the CUDA/NVIDIA ones
# also support a system of GPU GUIDs that we don't support.
# TODO: If Slurm confinement is set we ignore any attempt to further
# limit us with the app-level variables. Does that make sense? Writing
# code to translate through would be hard and probably not actually
# useful.
if number_list_var in os.environ:
device_string = os.environ[number_list_var]
# Parse all the numbers we have
device_numbers = [int(part) for part in device_string.split(',') if part.isnumeric()]
if len(device_numbers) > 0:
# We found some numbers, so use those
return device_numbers

# If we don't see a set of limits we understand, say we have all nvidia GPUs
return list(range(count_nvidia_gpus()))
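As a point of reference, here is a minimal standalone sketch (not Toil's own code) of the lookup the new function performs, showing how a Slurm GPU assignment maps apparent accelerator numbers back to host numbers; the helper name and the example environment value are hypothetical:

import os
from typing import List

def host_gpu_numbers(total_gpus: int) -> List[int]:
    # Mirror the lookup above: the first recognized variable holding plain
    # numbers wins; otherwise assume every GPU on the host is visible.
    for var in ('SLURM_STEP_GPUS', 'SLURM_JOB_GPUS',
                'CUDA_VISIBLE_DEVICES', 'NVIDIA_VISIBLE_DEVICES'):
        if var in os.environ:
            numbers = [int(p) for p in os.environ[var].split(',') if p.isnumeric()]
            if numbers:
                return numbers
    return list(range(total_gpus))

os.environ['SLURM_JOB_GPUS'] = '2,3'   # hypothetical Slurm assignment
print(host_gpu_numbers(total_gpus=4))  # [2, 3]: apparent GPU 0 is host GPU 2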

@memoize
def have_working_nvidia_docker_runtime() -> bool:
"""
@@ -83,7 +115,7 @@ def get_individual_local_accelerators() -> List[AcceleratorRequirement]:
# For now we only know about nvidia GPUs
return [{'kind': 'gpu', 'brand': 'nvidia', 'api': 'cuda', 'count': 1} for _ in range(count_nvidia_gpus())]

def get_restrictive_environment_for_local_accelerators(accelerator_numbers : Set[int]) -> Dict[str, str]:
def get_restrictive_environment_for_local_accelerators(accelerator_numbers : Union[Set[int], List[int]]) -> Dict[str, str]:
"""
Get environment variables which can be applied to a process to restrict it
to using only the given accelerator numbers.
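Only the signature changes here; the function body lies outside the hunk. For orientation, a restriction of the kind this function returns is conventionally expressed through CUDA/NVIDIA environment variables, roughly as in the sketch below (an illustration of the idea, not necessarily Toil's exact output):

import os
import subprocess

restricted_env = {
    'CUDA_VISIBLE_DEVICES': '2,3',    # hypothetical accelerator numbers
    'NVIDIA_VISIBLE_DEVICES': '2,3',
}
# Applied to a child process, this hides all other GPUs from CUDA code.
subprocess.run(['nvidia-smi'], env={**os.environ, **restricted_env})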
32 changes: 32 additions & 0 deletions src/toil/lib/docker.py
@@ -17,6 +17,7 @@
import re
import struct
from shlex import quote
from typing import Optional, List

import requests

@@ -27,6 +28,8 @@
create_api_error_from_http_exception)
from docker.utils.socket import consume_socket_output, demux_adaptor

from toil.lib.accelerators import get_host_accelerator_numbers

logger = logging.getLogger(__name__)

FORGO = 0
@@ -68,6 +71,7 @@ def apiDockerCall(job,
stream=False,
demux=False,
streamfile=None,
accelerators: Optional[List[int]] = None,
timeout=365 * 24 * 60 * 60,
**kwargs):
"""
@@ -151,6 +155,11 @@ def toil_job(job):
not always able to abort ongoing reads and writes in order
to respect the timeout. Defaults to 1 year (i.e. wait
essentially indefinitely).
:param accelerators: Toil accelerator numbers (usually GPUs) to forward to
the container. These are interpreted in the current
Python process's environment. See
toil.lib.accelerators.get_individual_local_accelerators()
for the menu of available accelerators.
:param kwargs: Additional keyword arguments supplied to the docker API's
run command. The list is 75 keywords total, for examples
and full documentation see:
@@ -238,6 +247,27 @@ def toil_job(job):
if auto_remove is None:
auto_remove = remove

device_requests = []
if accelerators:
# Map accelerator numbers to host numbers
host_accelerators = []
accelerator_mapping = get_host_accelerator_numbers()
for our_number in accelerators:
if our_number >= len(accelerator_mapping):
raise RuntimeError(
f"Cannot forward accelerator {our_number} because only "
f"{len(accelerator_mapping)} accelerators are available "
f"to this job."
)
host_accelerators.append(accelerator_mapping[our_number])
# TODO: Here we assume that the host accelerators are all GPUs
device_requests.append(
docker.types.DeviceRequest(
device_ids=[','.join(str(n) for n in host_accelerators)],
capabilities=[['gpu']]
)
)

try:
if detach is False:
# When detach is False, this returns stdout normally:
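The DeviceRequest built above is the API-level counterpart of the --gpus flag mentioned in accelerators.py. A hedged, standalone sketch of the same request for host GPUs 2 and 3 follows (the image name and command are placeholders, and a working NVIDIA container runtime is assumed):

import docker

client = docker.from_env()
gpu_request = docker.types.DeviceRequest(device_ids=['2,3'],
                                         capabilities=[['gpu']])
# Roughly equivalent to: docker run --gpus '"device=2,3"' <image> nvidia-smi
client.containers.run('nvidia/cuda:12.2.0-base-ubuntu22.04', 'nvidia-smi',
                      device_requests=[gpu_request])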
@@ -261,6 +291,7 @@ def toil_job(job):
log_config=log_config,
user=user,
environment=environment,
device_requests=device_requests,
**kwargs)

if demux is False:
@@ -303,6 +334,7 @@ def toil_job(job):
log_config=log_config,
user=user,
environment=environment,
device_requests=device_requests,
**kwargs)
if stdout or stderr:
if streamfile is None:
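Putting the new parameter to use: a hedged usage sketch of forwarding a job's first assigned GPU into a container via apiDockerCall (the image and command are placeholders, and the job is assumed to have requested at least one GPU accelerator):

from toil.lib.docker import apiDockerCall

def run_nvidia_smi(job):
    # accelerators are Toil-apparent numbers; apiDockerCall translates them
    # to host GPU numbers before building the Docker device request.
    return apiDockerCall(job,
                         image='nvidia/cuda:12.2.0-base-ubuntu22.04',
                         parameters=['nvidia-smi'],
                         accelerators=[0])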
