From 263582bb848da05d8f26037d7e67c1ee66f3834d Mon Sep 17 00:00:00 2001
From: Matthias Diener
Date: Tue, 27 Aug 2024 23:19:23 -0500
Subject: [PATCH] DeviceMemoryUsage: add AMD support, use ProcessLogger for
 version.sh (#1060)

* DeviceMemoryUsage: only use on Nvidia, use ProcessLogger for version.sh

* don't force loading mpi4py for config checks

* fix Numpy actx spelling

* add DeviceMemoryUsageAMD

* clean up MPI actx handling
---
 examples/run_examples.sh       |  2 +-
 mirgecom/array_context.py      | 27 +++++++++++++++----
 mirgecom/logging_quantities.py | 47 +++++++++++++++++++++++++++++-----
 3 files changed, 64 insertions(+), 12 deletions(-)

diff --git a/examples/run_examples.sh b/examples/run_examples.sh
index cedd7f071..60454b00a 100755
--- a/examples/run_examples.sh
+++ b/examples/run_examples.sh
@@ -16,7 +16,7 @@ function endgroup {
 
 # }}}
 
-python -c "from grudge.array_context import MPINumpyArrayConext" && numpy_actx_available=numpy || numpy_actx_available=
+python -c "from grudge.array_context import MPINumpyArrayContext" && numpy_actx_available=numpy || numpy_actx_available=
 
 echo "Numpy array context available: $numpy_actx_available"
 
diff --git a/mirgecom/array_context.py b/mirgecom/array_context.py
index abc9ede3f..0182b693b 100644
--- a/mirgecom/array_context.py
+++ b/mirgecom/array_context.py
@@ -5,6 +5,7 @@
 .. autofunction:: actx_class_is_eager
 .. autofunction:: actx_class_is_profiling
 .. autofunction:: actx_class_is_numpy
+.. autofunction:: actx_class_is_distributed
 .. autofunction:: initialize_actx
 """
 
@@ -110,6 +111,12 @@ def actx_class_is_numpy(actx_class: Type[ArrayContext]) -> bool:
     return False
 
 
+def actx_class_is_distributed(actx_class: Type[ArrayContext]) -> bool:
+    """Return True if *actx_class* is distributed."""
+    from grudge.array_context import MPIBasedArrayContext
+    return issubclass(actx_class, MPIBasedArrayContext)
+
+
 def actx_class_has_fallback_args(actx_class: Type[ArrayContext]) -> bool:
     """Return True if *actx_class* has fallback arguments."""
     import inspect
@@ -117,8 +124,11 @@ def actx_class_has_fallback_args(actx_class: Type[ArrayContext]) -> bool:
     return "use_axis_tag_inference_fallback" in spec.args
 
 
-def _check_cache_dirs_node() -> None:
+def _check_cache_dirs_node(actx: ArrayContext) -> None:
     """Check whether multiple ranks share cache directories on the same node."""
+    if not actx_class_is_distributed(type(actx)):
+        return
+
     from mpi4py import MPI
 
     size = MPI.COMM_WORLD.Get_size()
@@ -176,6 +186,9 @@ def _check_gpu_oversubscription(actx: ArrayContext) -> None:
     Only works with CUDA devices currently due to the use of the
     PCI_DOMAIN_ID_NV extension.
     """
+    if not actx_class_is_distributed(type(actx)):
+        return
+
     from mpi4py import MPI
     import pyopencl as cl
 
@@ -227,11 +240,15 @@ def _check_gpu_oversubscription(actx: ArrayContext) -> None:
 
 
 def log_disk_cache_config(actx: ArrayContext) -> None:
     """Log the disk cache configuration."""
-    from mpi4py import MPI
-    assert isinstance(actx, (PyOpenCLArrayContext, PytatoPyOpenCLArrayContext))
 
-    rank = MPI.COMM_WORLD.Get_rank()
+    if actx_class_is_distributed(type(actx)):
+        from grudge.array_context import MPIBasedArrayContext
+        assert isinstance(actx, MPIBasedArrayContext)
+        rank = actx.mpi_communicator.Get_rank()
+    else:
+        rank = 0
+
     res = f"Rank {rank} disk cache config: "
 
     from pyopencl.characterize import nv_compute_capability, get_pocl_version
@@ -336,7 +353,7 @@ def initialize_actx(
     # or pocl, and therefore we don't need to examine their caching).
     if actx_class_is_pyopencl(actx_class):
         _check_gpu_oversubscription(actx)
-        _check_cache_dirs_node()
+        _check_cache_dirs_node(actx)
         log_disk_cache_config(actx)
 
     return actx
diff --git a/mirgecom/logging_quantities.py b/mirgecom/logging_quantities.py
index 96ae05d82..558aab374 100644
--- a/mirgecom/logging_quantities.py
+++ b/mirgecom/logging_quantities.py
@@ -29,7 +29,8 @@
 .. autoclass:: DiscretizationBasedQuantity
 .. autoclass:: KernelProfile
 .. autoclass:: PythonMemoryUsage
-.. autoclass:: DeviceMemoryUsage
+.. autoclass:: DeviceMemoryUsageCUDA
+.. autoclass:: DeviceMemoryUsageAMD
 .. autofunction:: initialize_logmgr
 .. autofunction:: logmgr_add_cl_device_info
 .. autofunction:: logmgr_add_device_memory_usage
@@ -40,6 +41,8 @@
 .. autofunction:: logmgr_set_time
 """
 
+import logging
+
 from logpyle import (LogQuantity, PostLogQuantity, LogManager,
     MultiPostLogQuantity, add_run_info, add_general_quantities,
     add_simulation_quantities)
@@ -55,6 +58,9 @@
 import grudge.op as oper
 
 from typing import List
 
+
+logger = logging.getLogger(__name__)
+
 MemPoolType = Union[cl.tools.MemoryPool, cl.tools.SVMPool]
 
@@ -114,9 +120,13 @@ def logmgr_add_device_name(logmgr: LogManager, queue: cl.CommandQueue): # noqa:
 def logmgr_add_device_memory_usage(logmgr: LogManager, queue: cl.CommandQueue) \
         -> None:
     """Add the OpenCL device memory usage to the log."""
-    if not queue or not (queue.device.type & cl.device_type.GPU):
+    if not queue:
         return
-    logmgr.add_quantity(DeviceMemoryUsage())
+
+    if queue.device.vendor.lower().startswith("nvidia"):
+        logmgr.add_quantity(DeviceMemoryUsageCUDA())
+    elif queue.device.vendor.lower().startswith("advanced micro devices"):
+        logmgr.add_quantity(DeviceMemoryUsageAMD(queue.device))
 
 
 def logmgr_add_mempool_usage(logmgr: LogManager, pool: MemPoolType) -> None:
@@ -192,8 +202,10 @@ def add_package_versions(mgr: LogManager, path_to_version_sh: Optional[str] = No
             warn("Could not find emirge's version.sh.")
 
     else:
+        from pytools import ProcessLogger
         try:
-            output = subprocess.check_output(path_to_version_sh)
+            with ProcessLogger(logger, "emirge's version.sh"):
+                output = subprocess.check_output(path_to_version_sh)
         except OSError as e:
             warn("Could not record emirge's package versions: " + str(e))
 
@@ -397,8 +409,8 @@ def __call__(self) -> float:
         return self.process.memory_info()[0] / 1024 / 1024
 
 
-class DeviceMemoryUsage(PostLogQuantity):
-    """Logging support for GPU memory usage (Nvidia only currently)."""
+class DeviceMemoryUsageCUDA(PostLogQuantity):
+    """Logging support for Nvidia CUDA GPU memory usage."""
 
     def __init__(self, name: Optional[str] = None) -> None:
 
@@ -442,6 +454,29 @@ def __call__(self) -> Optional[float]:
 
         return (total.value - free.value) / 1024 / 1024
 
 
+class DeviceMemoryUsageAMD(PostLogQuantity):
+    """Logging support for AMD GPU memory usage."""
+
+    def __init__(self, dev: cl.Device, name: Optional[str] = None) -> None:
+
+        if name is None:
+            name = "memory_usage_gpu"
+
+        super().__init__(name, "MByte", description="Memory usage (GPU)")
+
+        self.dev = dev
+        self.global_mem_size_mbyte = dev.global_mem_size / 1024 / 1024
+
+    def __call__(self) -> Optional[float]:
+        """Return the memory usage in MByte."""
+        # NB: dev.global_mem_size is in Bytes,
+        # dev.global_free_memory_amd is in KByte,
+        # the actual granularity of the returned values appears to be MByte
+        # (like in CUDA)
+
+        return self.global_mem_size_mbyte - self.dev.global_free_memory_amd[0] / 1024
+
+
 class MempoolMemoryUsage(MultiPostLogQuantity):
     """Logging support for memory pool usage."""
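
A brief usage sketch, not part of the patch above, showing how the vendor-based dispatch added to logmgr_add_device_memory_usage would be driven from a driver script. It assumes a working pyopencl/logpyle environment; the file name "memlog.sqlite" and the overall driver flow are illustrative only.

# Minimal sketch (assumptions noted above): on an Nvidia device this adds
# DeviceMemoryUsageCUDA, on an AMD device DeviceMemoryUsageAMD (which reads
# dev.global_free_memory_amd from the cl_amd_device_attribute_query
# extension); on any other device no memory quantity is added.
import pyopencl as cl
from logpyle import LogManager

from mirgecom.logging_quantities import logmgr_add_device_memory_usage

ctx = cl.create_some_context()
queue = cl.CommandQueue(ctx)

# "memlog.sqlite" is an arbitrary example log file name.
logmgr = LogManager("memlog.sqlite", "wo")

# Vendor-based dispatch introduced by this patch.
logmgr_add_device_memory_usage(logmgr, queue)

logmgr.tick_before()
# ... a simulation step would run here ...
logmgr.tick_after()
logmgr.close()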