From 263582bb848da05d8f26037d7e67c1ee66f3834d Mon Sep 17 00:00:00 2001
From: Matthias Diener
Date: Tue, 27 Aug 2024 23:19:23 -0500
Subject: [PATCH] DeviceMemoryUsage: add AMD support, use ProcessLogger for
 version.sh (#1060)

* DeviceMemoryUsage: only use on Nvidia, use ProcessLogger for version.sh

* don't force loading mpi4py for config checks

* fix Numpy actx spelling

* add DeviceMemoryUsageAMD

* clean up MPI actx handling
---
 examples/run_examples.sh       |  2 +-
 mirgecom/array_context.py      | 27 +++++++++++++++----
 mirgecom/logging_quantities.py | 47 +++++++++++++++++++++++++++++-----
 3 files changed, 64 insertions(+), 12 deletions(-)

diff --git a/examples/run_examples.sh b/examples/run_examples.sh
index cedd7f071..60454b00a 100755
--- a/examples/run_examples.sh
+++ b/examples/run_examples.sh
@@ -16,7 +16,7 @@ function endgroup {
 
 # }}}
 
-python -c "from grudge.array_context import MPINumpyArrayConext" && numpy_actx_available=numpy || numpy_actx_available=
+python -c "from grudge.array_context import MPINumpyArrayContext" && numpy_actx_available=numpy || numpy_actx_available=
 
 echo "Numpy array context available: $numpy_actx_available"
 
diff --git a/mirgecom/array_context.py b/mirgecom/array_context.py
index abc9ede3f..0182b693b 100644
--- a/mirgecom/array_context.py
+++ b/mirgecom/array_context.py
@@ -5,6 +5,7 @@
 .. autofunction:: actx_class_is_eager
 .. autofunction:: actx_class_is_profiling
 .. autofunction:: actx_class_is_numpy
+.. autofunction:: actx_class_is_distributed
 .. autofunction:: initialize_actx
 """
 
@@ -110,6 +111,12 @@ def actx_class_is_numpy(actx_class: Type[ArrayContext]) -> bool:
     return False
 
 
+def actx_class_is_distributed(actx_class: Type[ArrayContext]) -> bool:
+    """Return True if *actx_class* is distributed."""
+    from grudge.array_context import MPIBasedArrayContext
+    return issubclass(actx_class, MPIBasedArrayContext)
+
+
 def actx_class_has_fallback_args(actx_class: Type[ArrayContext]) -> bool:
     """Return True if *actx_class* has fallback arguments."""
     import inspect
@@ -117,8 +124,11 @@ def actx_class_has_fallback_args(actx_class: Type[ArrayContext]) -> bool:
     return "use_axis_tag_inference_fallback" in spec.args
 
 
-def _check_cache_dirs_node() -> None:
+def _check_cache_dirs_node(actx: ArrayContext) -> None:
     """Check whether multiple ranks share cache directories on the same node."""
+    if not actx_class_is_distributed(type(actx)):
+        return
+
     from mpi4py import MPI
 
     size = MPI.COMM_WORLD.Get_size()
@@ -176,6 +186,9 @@ def _check_gpu_oversubscription(actx: ArrayContext) -> None:
     Only works with CUDA devices currently due to the use of the
     PCI_DOMAIN_ID_NV extension.
     """
+    if not actx_class_is_distributed(type(actx)):
+        return
+
     from mpi4py import MPI
     import pyopencl as cl
 
@@ -227,11 +240,15 @@ def _check_gpu_oversubscription(actx: ArrayContext) -> None:
 
 
 def log_disk_cache_config(actx: ArrayContext) -> None:
     """Log the disk cache configuration."""
-    from mpi4py import MPI
-    assert isinstance(actx, (PyOpenCLArrayContext, PytatoPyOpenCLArrayContext))
 
-    rank = MPI.COMM_WORLD.Get_rank()
+    if actx_class_is_distributed(type(actx)):
+        from grudge.array_context import MPIBasedArrayContext
+        assert isinstance(actx, MPIBasedArrayContext)
+        rank = actx.mpi_communicator.Get_rank()
+    else:
+        rank = 0
+
     res = f"Rank {rank} disk cache config: "
 
     from pyopencl.characterize import nv_compute_capability, get_pocl_version
@@ -336,7 +353,7 @@ def initialize_actx(
     # or pocl, and therefore we don't need to examine their caching).
     if actx_class_is_pyopencl(actx_class):
         _check_gpu_oversubscription(actx)
-        _check_cache_dirs_node()
+        _check_cache_dirs_node(actx)
         log_disk_cache_config(actx)
 
     return actx
diff --git a/mirgecom/logging_quantities.py b/mirgecom/logging_quantities.py
index 96ae05d82..558aab374 100644
--- a/mirgecom/logging_quantities.py
+++ b/mirgecom/logging_quantities.py
@@ -29,7 +29,8 @@
 .. autoclass:: DiscretizationBasedQuantity
 .. autoclass:: KernelProfile
 .. autoclass:: PythonMemoryUsage
-.. autoclass:: DeviceMemoryUsage
+.. autoclass:: DeviceMemoryUsageCUDA
+.. autoclass:: DeviceMemoryUsageAMD
 .. autofunction:: initialize_logmgr
 .. autofunction:: logmgr_add_cl_device_info
 .. autofunction:: logmgr_add_device_memory_usage
@@ -40,6 +41,8 @@
 .. autofunction:: logmgr_set_time
 """
 
+import logging
+
 from logpyle import (LogQuantity, PostLogQuantity, LogManager,
     MultiPostLogQuantity, add_run_info, add_general_quantities,
     add_simulation_quantities)
@@ -55,6 +58,9 @@
 import grudge.op as oper
 
 from typing import List
 
+
+logger = logging.getLogger(__name__)
+
 MemPoolType = Union[cl.tools.MemoryPool, cl.tools.SVMPool]
 
@@ -114,9 +120,13 @@ def logmgr_add_device_name(logmgr: LogManager, queue: cl.CommandQueue): # noqa:
 def logmgr_add_device_memory_usage(logmgr: LogManager, queue: cl.CommandQueue) \
         -> None:
     """Add the OpenCL device memory usage to the log."""
-    if not queue or not (queue.device.type & cl.device_type.GPU):
+    if not queue:
         return
-    logmgr.add_quantity(DeviceMemoryUsage())
+
+    if queue.device.vendor.lower().startswith("nvidia"):
+        logmgr.add_quantity(DeviceMemoryUsageCUDA())
+    elif queue.device.vendor.lower().startswith("advanced micro devices"):
+        logmgr.add_quantity(DeviceMemoryUsageAMD(queue.device))
 
 
 def logmgr_add_mempool_usage(logmgr: LogManager, pool: MemPoolType) -> None:
@@ -192,8 +202,10 @@ def add_package_versions(mgr: LogManager, path_to_version_sh: Optional[str] = No
             warn("Could not find emirge's version.sh.")
 
     else:
+        from pytools import ProcessLogger
         try:
-            output = subprocess.check_output(path_to_version_sh)
+            with ProcessLogger(logger, "emirge's version.sh"):
+                output = subprocess.check_output(path_to_version_sh)
         except OSError as e:
             warn("Could not record emirge's package versions: " + str(e))
 
@@ -397,8 +409,8 @@ def __call__(self) -> float:
         return self.process.memory_info()[0] / 1024 / 1024
 
 
-class DeviceMemoryUsage(PostLogQuantity):
-    """Logging support for GPU memory usage (Nvidia only currently)."""
+class DeviceMemoryUsageCUDA(PostLogQuantity):
+    """Logging support for Nvidia CUDA GPU memory usage."""
 
     def __init__(self, name: Optional[str] = None) -> None:
 
@@ -442,6 +454,29 @@ def __call__(self) -> Optional[float]:
 
         return (total.value - free.value) / 1024 / 1024
 
 
+class DeviceMemoryUsageAMD(PostLogQuantity):
+    """Logging support for AMD GPU memory usage."""
+
+    def __init__(self, dev: cl.Device, name: Optional[str] = None) -> None:
+
+        if name is None:
+            name = "memory_usage_gpu"
+
+        super().__init__(name, "MByte", description="Memory usage (GPU)")
+
+        self.dev = dev
+        self.global_mem_size_mbyte = dev.global_mem_size / 1024 / 1024
+
+    def __call__(self) -> Optional[float]:
+        """Return the memory usage in MByte."""
+        # NB: dev.global_mem_size is in Bytes,
+        # dev.global_free_memory_amd is in KByte,
+        # the actual granularity of the returned values appears to be MByte
+        # (like in CUDA)
+
+        return self.global_mem_size_mbyte - self.dev.global_free_memory_amd[0] / 1024
+
+
 class MempoolMemoryUsage(MultiPostLogQuantity):
     """Logging support for memory pool usage."""
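
A brief usage sketch, not part of the patch above, showing how the vendor-based dispatch added to logmgr_add_device_memory_usage would be driven from a driver script. It assumes a working pyopencl/logpyle environment; the file name "memlog.sqlite" and the overall driver flow are illustrative only.

# Minimal sketch (assumptions noted above): on an Nvidia device this adds
# DeviceMemoryUsageCUDA, on an AMD device DeviceMemoryUsageAMD (which reads
# dev.global_free_memory_amd from the cl_amd_device_attribute_query
# extension); on any other device no memory quantity is added.
import pyopencl as cl
from logpyle import LogManager

from mirgecom.logging_quantities import logmgr_add_device_memory_usage

ctx = cl.create_some_context()
queue = cl.CommandQueue(ctx)

# "memlog.sqlite" is an arbitrary example log file name.
logmgr = LogManager("memlog.sqlite", "wo")

# Vendor-based dispatch introduced by this patch.
logmgr_add_device_memory_usage(logmgr, queue)

logmgr.tick_before()
# ... a simulation step would run here ...
logmgr.tick_after()
logmgr.close()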