diff --git a/cwltool/cuda.py b/cwltool/cuda.py
index 719bfd8677..7217434e64 100644
--- a/cwltool/cuda.py
+++ b/cwltool/cuda.py
@@ -8,23 +8,47 @@
from .utils import CWLObjectType
+def cuda_device_count() -> str:
+ """Determine the number of attached CUDA GPUs."""
+ # For the number of GPUs, we can use the following query
+ cmd = ["nvidia-smi", "--query-gpu=count", "--format=csv,noheader"]
+ try:
+ # This is equivalent to subprocess.check_output, but use
+ # subprocess.run so we can use separate MagicMocks in test_cuda.py
+ proc = subprocess.run(cmd, stdout=subprocess.PIPE, check=True) # nosec
+ except Exception as e:
+ _logger.warning("Error checking number of GPUs with nvidia-smi: %s", e)
+ return "0"
+ return proc.stdout.decode("utf-8")
+
+
def cuda_version_and_device_count() -> Tuple[str, int]:
"""Determine the CUDA version and number of attached CUDA GPUs."""
+ count = int(cuda_device_count())
+
+ # Since there is no specific query for the cuda version, we have to use
+ # `nvidia-smi -q -x`
+ # However, apparently nvidia-smi is not safe to call concurrently.
+ # With --parallel, sometimes the returned XML will contain
+ # \xff...\xff
+ # (or other arbitrary bytes) and xml.dom.minidom.parseString will raise
+ # "xml.parsers.expat.ExpatError: not well-formed (invalid token)"
+ # So we either need to use `grep -v process_name` to blacklist that tag,
+ # (and hope that no other tags cause problems in the future)
+ # or better yet use `grep cuda_version` to only grab the tags we will use.
+ cmd = "nvidia-smi -q -x | grep cuda_version"
try:
- out = subprocess.check_output(["nvidia-smi", "-q", "-x"]) # nosec
+ out = subprocess.check_output(cmd, shell=True) # nosec
except Exception as e:
_logger.warning("Error checking CUDA version with nvidia-smi: %s", e)
return ("", 0)
- dm = xml.dom.minidom.parseString(out) # nosec
- ag = dm.getElementsByTagName("attached_gpus")
- if len(ag) < 1 or ag[0].firstChild is None:
- _logger.warning(
- "Error checking CUDA version with nvidia-smi. Missing 'attached_gpus' or it is empty.: %s",
- out,
- )
+ try:
+ dm = xml.dom.minidom.parseString(out) # nosec
+ except xml.parsers.expat.ExpatError as e:
+ _logger.warning("Error parsing XML stdout of nvidia-smi: %s", e)
+ _logger.warning("stdout: %s", out)
return ("", 0)
- ag_element = ag[0].firstChild
cv = dm.getElementsByTagName("cuda_version")
if len(cv) < 1 or cv[0].firstChild is None:
@@ -35,13 +59,10 @@ def cuda_version_and_device_count() -> Tuple[str, int]:
return ("", 0)
cv_element = cv[0].firstChild
- if isinstance(cv_element, xml.dom.minidom.Text) and isinstance(
- ag_element, xml.dom.minidom.Text
- ):
- return (cv_element.data, int(ag_element.data))
+ if isinstance(cv_element, xml.dom.minidom.Text):
+ return (cv_element.data, count)
_logger.warning(
- "Error checking CUDA version with nvidia-smi. "
- "Either 'attached_gpus' or 'cuda_version' was not a text node: %s",
+ "Error checking CUDA version with nvidia-smi. 'cuda_version' was not a text node: %s",
out,
)
return ("", 0)
diff --git a/tests/test_cuda.py b/tests/test_cuda.py
index e8de7cd637..b737538a15 100644
--- a/tests/test_cuda.py
+++ b/tests/test_cuda.py
@@ -90,8 +90,11 @@ def _makebuilder(cudaReq: CWLObjectType) -> Builder:
@mock.patch("subprocess.check_output")
+@mock.patch("cwltool.cuda.cuda_device_count")
@mock.patch("os.makedirs")
-def test_cuda_job_setup_check(makedirs: MagicMock, check_output: MagicMock) -> None:
+def test_cuda_job_setup_check(
+ makedirs: MagicMock, count: MagicMock, check_output: MagicMock
+) -> None:
runtime_context = RuntimeContext({})
cudaReq: CWLObjectType = {
@@ -101,11 +104,9 @@ def test_cuda_job_setup_check(makedirs: MagicMock, check_output: MagicMock) -> N
}
builder = _makebuilder(cudaReq)
+ count.return_value = "1"
check_output.return_value = """
-
-1
1.0
-
"""
jb = CommandLineJob(builder, {}, CommandLineTool.make_path_mapper, [], [], "")
@@ -113,62 +114,33 @@ def test_cuda_job_setup_check(makedirs: MagicMock, check_output: MagicMock) -> N
@mock.patch("subprocess.check_output")
+@mock.patch("cwltool.cuda.cuda_device_count")
@mock.patch("os.makedirs")
-def test_cuda_job_setup_check_err(makedirs: MagicMock, check_output: MagicMock) -> None:
- runtime_context = RuntimeContext({})
-
- cudaReq: CWLObjectType = {
- "class": "http://commonwl.org/cwltool#CUDARequirement",
- "cudaVersionMin": "2.0",
- "cudaComputeCapability": "1.0",
- }
- builder = _makebuilder(cudaReq)
-
- check_output.return_value = """
-
-1
-1.0
-
-"""
- jb = CommandLineJob(builder, {}, CommandLineTool.make_path_mapper, [], [], "")
- with pytest.raises(WorkflowException):
- jb._setup(runtime_context)
-
-
-@mock.patch("subprocess.check_output")
-@mock.patch("os.makedirs")
-def test_cuda_job_setup_check_err_empty_attached_gpus(
- makedirs: MagicMock, check_output: MagicMock, caplog: pytest.LogCaptureFixture
+def test_cuda_job_setup_check_err(
+ makedirs: MagicMock, count: MagicMock, check_output: MagicMock
) -> None:
runtime_context = RuntimeContext({})
cudaReq: CWLObjectType = {
"class": "http://commonwl.org/cwltool#CUDARequirement",
- "cudaVersionMin": "1.0",
+ "cudaVersionMin": "2.0",
"cudaComputeCapability": "1.0",
}
builder = _makebuilder(cudaReq)
+ count.return_value = "1"
check_output.return_value = """
-
-
1.0
-
"""
-
jb = CommandLineJob(builder, {}, CommandLineTool.make_path_mapper, [], [], "")
with pytest.raises(WorkflowException):
jb._setup(runtime_context)
- assert (
- "Error checking CUDA version with nvidia-smi. Missing 'attached_gpus' or it is empty."
- in caplog.text
- )
-@mock.patch("subprocess.check_output")
+@mock.patch("cwltool.cuda.cuda_device_count")
@mock.patch("os.makedirs")
-def test_cuda_job_setup_check_err_empty_missing_attached_gpus(
- makedirs: MagicMock, check_output: MagicMock, caplog: pytest.LogCaptureFixture
+def test_cuda_job_setup_check_err_missing_query_gpu_count(
+ makedirs: MagicMock, count: MagicMock, caplog: pytest.LogCaptureFixture
) -> None:
runtime_context = RuntimeContext({})
@@ -179,25 +151,19 @@ def test_cuda_job_setup_check_err_empty_missing_attached_gpus(
}
builder = _makebuilder(cudaReq)
- check_output.return_value = """
-
-1.0
-
-"""
+ count.return_value = ""
jb = CommandLineJob(builder, {}, CommandLineTool.make_path_mapper, [], [], "")
with pytest.raises(WorkflowException):
jb._setup(runtime_context)
- assert (
- "Error checking CUDA version with nvidia-smi. Missing 'attached_gpus' or it is empty."
- in caplog.text
- )
+ assert "invalid literal for int() with base 10" in caplog.text
@mock.patch("subprocess.check_output")
+@mock.patch("cwltool.cuda.cuda_device_count")
@mock.patch("os.makedirs")
def test_cuda_job_setup_check_err_empty_cuda_version(
- makedirs: MagicMock, check_output: MagicMock, caplog: pytest.LogCaptureFixture
+ makedirs: MagicMock, count: MagicMock, check_output: MagicMock, caplog: pytest.LogCaptureFixture
) -> None:
runtime_context = RuntimeContext({})
@@ -208,11 +174,9 @@ def test_cuda_job_setup_check_err_empty_cuda_version(
}
builder = _makebuilder(cudaReq)
+ count.return_value = "1"
check_output.return_value = """
-
-1
-
"""
jb = CommandLineJob(builder, {}, CommandLineTool.make_path_mapper, [], [], "")
@@ -225,9 +189,10 @@ def test_cuda_job_setup_check_err_empty_cuda_version(
@mock.patch("subprocess.check_output")
+@mock.patch("cwltool.cuda.cuda_device_count")
@mock.patch("os.makedirs")
def test_cuda_job_setup_check_err_missing_cuda_version(
- makedirs: MagicMock, check_output: MagicMock, caplog: pytest.LogCaptureFixture
+ makedirs: MagicMock, count: MagicMock, check_output: MagicMock, caplog: pytest.LogCaptureFixture
) -> None:
runtime_context = RuntimeContext({})
@@ -238,25 +203,20 @@ def test_cuda_job_setup_check_err_missing_cuda_version(
}
builder = _makebuilder(cudaReq)
- check_output.return_value = """
-
-1
-
-"""
+ count.return_value = "1"
+ check_output.return_value = ""
jb = CommandLineJob(builder, {}, CommandLineTool.make_path_mapper, [], [], "")
with pytest.raises(WorkflowException):
jb._setup(runtime_context)
- assert (
- "Error checking CUDA version with nvidia-smi. Missing 'cuda_version' or it is empty."
- in caplog.text
- )
+ assert "Error parsing XML stdout of nvidia-smi" in caplog.text
@mock.patch("subprocess.check_output")
+@mock.patch("cwltool.cuda.cuda_device_count")
@mock.patch("os.makedirs")
def test_cuda_job_setup_check_err_wrong_type_cuda_version(
- makedirs: MagicMock, check_output: MagicMock, caplog: pytest.LogCaptureFixture
+ makedirs: MagicMock, count: MagicMock, check_output: MagicMock, caplog: pytest.LogCaptureFixture
) -> None:
runtime_context = RuntimeContext({})
@@ -267,19 +227,17 @@ def test_cuda_job_setup_check_err_wrong_type_cuda_version(
}
builder = _makebuilder(cudaReq)
+ count.return_value = "1"
check_output.return_value = """
-
-1
-
"""
jb = CommandLineJob(builder, {}, CommandLineTool.make_path_mapper, [], [], "")
with pytest.raises(WorkflowException):
jb._setup(runtime_context)
assert (
- "Error checking CUDA version with nvidia-smi. "
- "Either 'attached_gpus' or 'cuda_version' was not a text node" in caplog.text
+ "Error checking CUDA version with nvidia-smi. 'cuda_version' was not a text node"
+ in caplog.text
)
diff --git a/tests/wf/nvidia-smi-container.cwl b/tests/wf/nvidia-smi-container.cwl
index 84fd72d832..40d6fba987 100644
--- a/tests/wf/nvidia-smi-container.cwl
+++ b/tests/wf/nvidia-smi-container.cwl
@@ -7,7 +7,7 @@ requirements:
cudaVersionMin: "1.0"
cudaComputeCapability: "1.0"
DockerRequirement:
- dockerPull: "nvidia/cuda:11.4.2-runtime-ubuntu20.04"
+ dockerPull: "nvidia/cuda:11.4.3-runtime-ubuntu20.04"
inputs: []
outputs: []
# Assume this will exit non-zero (resulting in a failing test case) if