From 51e18058b974fe928eff5deeb133db0200a498e7 Mon Sep 17 00:00:00 2001
From: Matthias Diener
Date: Fri, 24 May 2024 14:41:21 -0500
Subject: [PATCH] Unify caches

---
 doc/faq/errors.rst                 |  6 ----
 doc/running/large-systems.rst      | 33 -------------------
 mirgecom/array_context.py          | 53 ------------------------------
 scripts/delta-parallel-spawner.sh  |  7 ++--
 scripts/delta.sbatch.sh            |  7 ++--
 scripts/lassen-parallel-spawner.sh | 11 ++-----
 scripts/lassen.bsub.sh             |  6 ++--
 scripts/run-gpus-generic.sh        | 11 ++-----
 scripts/tioga-parallel-spawner.sh  |  4 +--
 scripts/tioga.flux.sh              |  4 +--
 10 files changed, 15 insertions(+), 127 deletions(-)

diff --git a/doc/faq/errors.rst b/doc/faq/errors.rst
index 162878313..5ed825fb3 100644
--- a/doc/faq/errors.rst
+++ b/doc/faq/errors.rst
@@ -68,9 +68,3 @@ to run the simulation::
 
 
 Try running on more nodes and/or devices.
-
-
-What can I do about ``CUDA_ERROR_FILE_NOT_FOUND: file not found`` errors?
--------------------------------------------------------------------------
-
-Please see :ref:`caching-errors` for a workaround.
diff --git a/doc/running/large-systems.rst b/doc/running/large-systems.rst
index ad430dbc4..55bade3ac 100644
--- a/doc/running/large-systems.rst
+++ b/doc/running/large-systems.rst
@@ -21,36 +21,3 @@ to speed up the startup process.
 `Emirge create such a zip file. This can be used by specifying the
 ``--modules`` parameter to ``install.sh`` when installing emirge, or by running
 ``makezip.sh`` after installation.
-
-
-.. _caching-errors:
-
-Avoiding errors and overheads due to caching of kernels
--------------------------------------------------------
-
-Several packages used in MirgeCOM cache generated files on the hard
-disk in order to speed up multiple executions of the same kernel. This can lead
-to errors and slowdowns when executing on multiple ranks due to concurrent
-hard disk accesses. Indicators of file system concurrency issues include::
-
-    .conda/envs/dgfem/lib/python3.8/site-packages/pyopencl/cache.py:101: UserWarning:
-    could not obtain cache lock--delete '.cache/pyopencl/pyopencl-compiler-cache-v2-py3.8.3.final.0/lock' if necessary
-
-and::
-
-    pocl-cuda: failed to generate PTX
-    CUDA_ERROR_FILE_NOT_FOUND: file not found
-
-In order to avoid these issues, users should direct the packages to create
-cache files in directories that are private to each rank by using the ``XDG_CACHE_HOME`` and ``POCL_CACHE_DIR``
-environment variables, such as in the following example::
-
-    $ export XDG_CACHE_ROOT="/tmp/$USER/xdg-cache"
-    $ export POCL_CACHE_ROOT="/tmp/$USER/pocl-cache"
-    $ srun -n 512 bash -c 'POCL_CACHE_DIR=$POCL_CACHE_ROOT/$$ XDG_CACHE_HOME=$XDG_CACHE_ROOT/$$ python -m mpi4py examples/wave.py'
-
-
-There is also on-disk caching of compiled kernels done by CUDA itself.
-As of 01/2023, we have not observed issues specific to this caching.
-The CUDA caching behavior can also be controlled via
-`environment variables `__.
diff --git a/mirgecom/array_context.py b/mirgecom/array_context.py
index f6c066565..0d04e72dc 100644
--- a/mirgecom/array_context.py
+++ b/mirgecom/array_context.py
@@ -108,58 +108,6 @@ def actx_class_is_numpy(actx_class: Type[ArrayContext]) -> bool:
     return False
 
 
-def _check_cache_dirs_node() -> None:
-    """Check whether multiple ranks share cache directories on the same node."""
-    from mpi4py import MPI
-
-    size = MPI.COMM_WORLD.Get_size()
-
-    if size <= 1:
-        return
-
-    from mirgecom.mpi import shared_split_comm_world
-
-    with shared_split_comm_world() as node_comm:
-        node_rank = node_comm.Get_rank()
-
-        def _check_var(var: str) -> None:
-            from warnings import warn
-
-            try:
-                my_path = os.environ[var]
-            except KeyError:
-                warn(f"Please set the '{var}' variable in your job script to "
-                     "avoid file system overheads when running on large numbers of "
-                     "ranks. See https://mirgecom.readthedocs.io/en/latest/running/large-systems.html "  # noqa: E501
-                     "for more information.")
-                # Create a fake path so there will not be a second warning below.
-                my_path = f"no/such/path/rank{node_rank}"
-
-            all_paths = node_comm.gather(my_path, root=0)
-
-            if node_rank == 0:
-                assert all_paths
-                if len(all_paths) != len(set(all_paths)):
-                    hostname = MPI.Get_processor_name()
-                    dup = [path for path in set(all_paths)
-                           if all_paths.count(path) > 1]
-
-                    from warnings import warn
-                    warn(f"Multiple ranks are sharing '{var}' on node '{hostname}'. "
-                         f"Duplicate '{var}'s: {dup}.")
-
-        _check_var("XDG_CACHE_HOME")
-
-        if os.environ.get("XDG_CACHE_HOME") is None:
-            # When XDG_CACHE_HOME is set but POCL_CACHE_DIR is not, pocl
-            # will use XDG_CACHE_HOME as the cache directory.
-            _check_var("POCL_CACHE_DIR")
-
-        # We haven't observed an issue yet that 'CUDA_CACHE_PATH' fixes,
-        # so disable this check for now.
-        # _check_var("CUDA_CACHE_PATH")
-
-
 def _check_gpu_oversubscription(actx: ArrayContext) -> None:
     """
     Check whether multiple ranks are running on the same GPU on each node.
@@ -323,7 +271,6 @@ def initialize_actx(
     # or pocl, and therefore we don't need to examine their caching).
     if actx_class_is_pyopencl(actx_class):
         _check_gpu_oversubscription(actx)
-        _check_cache_dirs_node()
         log_disk_cache_config(actx)
 
     return actx
diff --git a/scripts/delta-parallel-spawner.sh b/scripts/delta-parallel-spawner.sh
index b1501b1bf..8dfc06891 100755
--- a/scripts/delta-parallel-spawner.sh
+++ b/scripts/delta-parallel-spawner.sh
@@ -5,11 +5,8 @@
 export CUDA_CACHE_DISABLE=0
 
 MIRGE_CACHE_ROOT=${MIRGE_CACHE_ROOT:-"$(pwd)/.mirge-cache"}
-XDG_CACHE_ROOT=${XDG_CACHE_ROOT:-"${MIRGE_CACHE_ROOT}/xdg-cache"}
-CUDA_CACHE_ROOT=${CUDA_CACHE_ROOT:-"${MIRGE_CACHE_ROOT}/cuda-cache"}
-
-XDG_CACHE_HOME=${XDG_CACHE_HOME:-"${XDG_CACHE_ROOT}/rank$SLURM_PROCID"}
-CUDA_CACHE_PATH=${CUDA_CACHE_DIR:-"${CUDA_CACHE_DIR}/rank$SLURM_PROCID"}
+XDG_CACHE_HOME=${XDG_CACHE_HOME:-"${MIRGE_CACHE_ROOT}/xdg-cache"}
+CUDA_CACHE_PATH=${CUDA_CACHE_PATH:-"${MIRGE_CACHE_ROOT}/cuda-cache"}
 
 export XDG_CACHE_HOME
 export CUDA_CACHE_PATH
diff --git a/scripts/delta.sbatch.sh b/scripts/delta.sbatch.sh
index 4c8ceee25..4ba6cde62 100644
--- a/scripts/delta.sbatch.sh
+++ b/scripts/delta.sbatch.sh
@@ -30,11 +30,8 @@ echo nnodes=$nnodes nproc=$nproc
 
 srun_cmd="srun -N $nnodes -n $nproc"
 
-# See
-# https://mirgecom.readthedocs.io/en/latest/running.html#avoiding-overheads-due-to-caching-of-kernels
-# on why this is important
 MIRGE_CACHE_ROOT=${MIRGE_CACHE_ROOT:-"$(pwd)/.mirge-cache/"}
-export XDG_CACHE_HOME_ROOT="${MIRGE_CACHE_ROOT}/xdg-cache/rank"
+export XDG_CACHE_HOME="${MIRGE_CACHE_ROOT}/xdg-cache"
 
 # Run application
-$srun_cmd bash -c 'XDG_CACHE_HOME=$XDG_CACHE_HOME_ROOT$SLURM_PROCID python -u -O -m mpi4py ./pulse.py'
+$srun_cmd python -u -m mpi4py ./pulse.py
diff --git a/scripts/lassen-parallel-spawner.sh b/scripts/lassen-parallel-spawner.sh
index 6479fac7e..d8b761bb3 100755
--- a/scripts/lassen-parallel-spawner.sh
+++ b/scripts/lassen-parallel-spawner.sh
@@ -7,15 +7,8 @@ export CUDA_CACHE_DISABLE=0
 
 # MIRGE env vars used to setup cache locations
 MIRGE_CACHE_ROOT=${MIRGE_CACHE_ROOT:-"$(pwd)/.mirge-cache/"}
-XDG_CACHE_ROOT=${XDG_CACHE_ROOT:-"${MIRGE_CACHE_ROOT}/xdg-cache"}
-CUDA_CACHE_ROOT=${CUDA_CACHE_ROOT:-"${MIRGE_CACHE_ROOT}/cuda-cache"}
-
-# These vars are used by pocl, pyopencl, loopy, and cuda for cache location
-XDG_CACHE_HOME=${XDG_CACHE_HOME:-"${XDG_CACHE_ROOT}/rank$OMPI_COMM_WORLD_RANK"}
-CUDA_CACHE_PATH=${CUDA_CACHE_DIR:-"${CUDA_CACHE_ROOT}/rank$OMPI_COMM_WORLD_RANK"}
-# The system sets a default CUDA_CACHE_PATH which is node-local :(
-# User still has full path control, but we discard the system default
-# CUDA_CACHE_PATH=${CUDA_CACHE_PATH:-"${CUDA_CACHE_ROOT}/rank$OMPI_COMM_WORLD_RANK"}
+XDG_CACHE_HOME=${XDG_CACHE_HOME:-"${MIRGE_CACHE_ROOT}/xdg-cache"}
+CUDA_CACHE_PATH=${CUDA_CACHE_PATH:-"${MIRGE_CACHE_ROOT}/cuda-cache"}
 
 export XDG_CACHE_HOME
 export CUDA_CACHE_PATH
diff --git a/scripts/lassen.bsub.sh b/scripts/lassen.bsub.sh
index 2c6c72687..f22ff5da8 100644
--- a/scripts/lassen.bsub.sh
+++ b/scripts/lassen.bsub.sh
@@ -27,11 +27,11 @@ jsrun_cmd="jsrun -g 1 -a 1 -n $nproc"
 # https://mirgecom.readthedocs.io/en/latest/running.html#avoiding-overheads-due-to-caching-of-kernels
 # on why this is important
 MIRGE_CACHE_ROOT=${MIRGE_CACHE_ROOT:-"$(pwd)/.mirge-cache/"}
-export XDG_CACHE_HOME_ROOT="${MIRGE_CACHE_ROOT}/xdg-cache/rank"
+export XDG_CACHE_HOME="${MIRGE_CACHE_ROOT}/xdg-cache"
 
 # Reenable CUDA cache
 export CUDA_CACHE_DISABLE=0
-export CUDA_CACHE_PATH_ROOT="${MIRGE_CACHE_ROOT}/cuda-cache/rank"
+export CUDA_CACHE_PATH="${MIRGE_CACHE_ROOT}/cuda-cache"
 
 # Print task allocation
 $jsrun_cmd js_task_info
@@ -39,4 +39,4 @@ $jsrun_cmd js_task_info
 echo "----------------------------"
 
 # Run application
-$jsrun_cmd bash -c 'CUDA_CACHE_PATH=$CUDA_CACHE_PATH_ROOT$OMPI_COMM_WORLD_RANK XDG_CACHE_HOME=$XDG_CACHE_HOME_ROOT$OMPI_COMM_WORLD_RANK python -m mpi4py ../examples/pulse.py --lazy'
+$jsrun_cmd python -m mpi4py ../examples/pulse.py --lazy
diff --git a/scripts/run-gpus-generic.sh b/scripts/run-gpus-generic.sh
index b81917430..e01a7a116 100755
--- a/scripts/run-gpus-generic.sh
+++ b/scripts/run-gpus-generic.sh
@@ -11,25 +11,20 @@
 #
 # Run it like this:
 # mpiexec -n 2 bash run-gpus-generic.sh python -m mpi4py pulse.py --lazy
-# unset CUDA_CACHE_DISABLE
+
 export CUDA_CACHE_DISABLE=0
 
 MIRGE_CACHE_ROOT=${MIRGE_CACHE_ROOT:-"$(pwd)/.mirge-cache/"}
-XDG_CACHE_ROOT=${XDG_CACHE_ROOT:-"${MIRGE_CACHE_ROOT}/xdg-cache"}
-CUDA_CACHE_ROOT=${CUDA_CACHE_ROOT:-"${MIRGE_CACHE_ROOT}/cuda-cache"}
+XDG_CACHE_HOME=${XDG_CACHE_HOME:-"${MIRGE_CACHE_ROOT}/xdg-cache"}
+CUDA_CACHE_PATH=${CUDA_CACHE_PATH:-"${MIRGE_CACHE_ROOT}/cuda-cache"}
 
 if [[ -n "$OMPI_COMM_WORLD_NODE_RANK" ]]; then
     # Open MPI
     export CUDA_VISIBLE_DEVICES=$OMPI_COMM_WORLD_LOCAL_RANK
-    RANK_ID="rank${OMPI_COMM_WORLD_RANK}"
 elif [[ -n "$MPI_LOCALRANKID" ]]; then
     # mpich/mvapich
     export CUDA_VISIBLE_DEVICES=$MPI_LOCALRANKID
-    RANK_ID="rank${MPI_LOCALRANKID}"
 fi
 
-XDG_CACHE_HOME=${XDG_CACHE_HOME:-"${XDG_CACHE_ROOT}/${RANK_ID}"}
-CUDA_CACHE_PATH=${CUDA_CACHE_DIR:-"${CUDA_CACHE_ROOT}/${RANK_ID}"}
-
 export XDG_CACHE_HOME
 export CUDA_CACHE_PATH
diff --git a/scripts/tioga-parallel-spawner.sh b/scripts/tioga-parallel-spawner.sh
index b91f56477..ce3d9df7d 100644
--- a/scripts/tioga-parallel-spawner.sh
+++ b/scripts/tioga-parallel-spawner.sh
@@ -3,9 +3,7 @@
 # Used to wrap the spawning of parallel mirgecom drivers on Tioga.
 
 MIRGE_CACHE_ROOT=${MIRGE_CACHE_ROOT:-"$(pwd)/.mirge-cache/"}
-XDG_CACHE_ROOT=${XDG_CACHE_ROOT:-"${MIRGE_CACHE_ROOT}/xdg-cache"}
-
-XDG_CACHE_HOME=${XDG_CACHE_HOME:-"${XDG_CACHE_ROOT}/rank$FLUX_TASK_RANK"}
+XDG_CACHE_HOME=${XDG_CACHE_HOME:-"${MIRGE_CACHE_ROOT}/xdg-cache"}
 
 export XDG_CACHE_HOME
 
diff --git a/scripts/tioga.flux.sh b/scripts/tioga.flux.sh
index 28f9bac93..6d42a7807 100644
--- a/scripts/tioga.flux.sh
+++ b/scripts/tioga.flux.sh
@@ -21,6 +21,6 @@ export PYOPENCL_CTX="AMD:0"
 run_cmd="flux run -N $nnodes -n $nproc --exclusive"
 
 MIRGE_CACHE_ROOT=${MIRGE_CACHE_ROOT:-"$(pwd)/.mirge-cache/"}
-export XDG_CACHE_ROOT=${XDG_CACHE_ROOT:-"${MIRGE_CACHE_ROOT}/xdg-cache"}
+export XDG_CACHE_HOME=${XDG_CACHE_HOME:-"${MIRGE_CACHE_ROOT}/xdg-cache"}
 
-$run_cmd bash -c 'XDG_CACHE_HOME=$XDG_CACHE_ROOT/$FLUX_TASK_RANK ROCR_VISIBLE_DEVICES=$FLUX_TASK_LOCAL_ID python -m mpi4py examples/pulse.py --lazy '
+$run_cmd bash -c 'ROCR_VISIBLE_DEVICES=$FLUX_TASK_LOCAL_ID python -m mpi4py examples/pulse.py --lazy '
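
Usage sketch (not part of the patch): after this change, a job script points the cache variables at one root shared by all ranks instead of building per-rank directories. The snippet below only mirrors the defaults set in the spawner scripts above; the launcher (srun) and driver name (pulse.py) follow scripts/delta.sbatch.sh and stand in for whatever the job actually runs.

    # One cache root, shared by all ranks (same default as the spawner scripts)
    export MIRGE_CACHE_ROOT="$(pwd)/.mirge-cache"

    # pocl/pyopencl/loopy cache under XDG_CACHE_HOME; CUDA caches under CUDA_CACHE_PATH
    export XDG_CACHE_HOME="${MIRGE_CACHE_ROOT}/xdg-cache"
    export CUDA_CACHE_PATH="${MIRGE_CACHE_ROOT}/cuda-cache"
    export CUDA_CACHE_DISABLE=0   # keep the CUDA JIT cache enabled

    # No per-rank 'bash -c' wrapper is needed any more; launch the driver directly
    srun -N "$nnodes" -n "$nproc" python -u -m mpi4py ./pulse.py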