From 51e18058b974fe928eff5deeb133db0200a498e7 Mon Sep 17 00:00:00 2001
From: Matthias Diener
Date: Fri, 24 May 2024 14:41:21 -0500
Subject: [PATCH] Unify caches

---
 doc/faq/errors.rst                 |  6 ----
 doc/running/large-systems.rst      | 33 -------------------
 mirgecom/array_context.py          | 53 ------------------------------
 scripts/delta-parallel-spawner.sh  |  7 ++--
 scripts/delta.sbatch.sh            |  7 ++--
 scripts/lassen-parallel-spawner.sh | 11 ++-----
 scripts/lassen.bsub.sh             |  6 ++--
 scripts/run-gpus-generic.sh        | 11 ++-----
 scripts/tioga-parallel-spawner.sh  |  4 +--
 scripts/tioga.flux.sh              |  4 +--
 10 files changed, 15 insertions(+), 127 deletions(-)

diff --git a/doc/faq/errors.rst b/doc/faq/errors.rst
index 162878313..5ed825fb3 100644
--- a/doc/faq/errors.rst
+++ b/doc/faq/errors.rst
@@ -68,9 +68,3 @@ to run the simulation::
 
 
 Try running on more nodes and/or devices.
-
-
-What can I do about ``CUDA_ERROR_FILE_NOT_FOUND: file not found`` errors?
--------------------------------------------------------------------------
-
-Please see :ref:`caching-errors` for a workaround.
diff --git a/doc/running/large-systems.rst b/doc/running/large-systems.rst
index ad430dbc4..55bade3ac 100644
--- a/doc/running/large-systems.rst
+++ b/doc/running/large-systems.rst
@@ -21,36 +21,3 @@ to speed up the startup process.
 `Emirge create such a zip file. This can be used by specifying the
 ``--modules`` parameter to ``install.sh`` when installing emirge, or by running
 ``makezip.sh`` after installation.
-
-
-.. _caching-errors:
-
-Avoiding errors and overheads due to caching of kernels
--------------------------------------------------------
-
-Several packages used in MirgeCOM cache generated files on the hard
-disk in order to speed up multiple executions of the same kernel. This can lead
-to errors and slowdowns when executing on multiple ranks due to concurrent
-hard disk accesses. Indicators of file system concurrency issues include::
-
-    .conda/envs/dgfem/lib/python3.8/site-packages/pyopencl/cache.py:101: UserWarning:
-    could not obtain cache lock--delete '.cache/pyopencl/pyopencl-compiler-cache-v2-py3.8.3.final.0/lock' if necessary
-
-and::
-
-    pocl-cuda: failed to generate PTX
-    CUDA_ERROR_FILE_NOT_FOUND: file not found
-
-In order to avoid these issues, users should direct the packages to create
-cache files in directories that are private to each rank by using the ``XDG_CACHE_HOME`` and ``POCL_CACHE_DIR``
-environment variables, such as in the following example::
-
-    $ export XDG_CACHE_ROOT="/tmp/$USER/xdg-cache"
-    $ export POCL_CACHE_ROOT="/tmp/$USER/pocl-cache"
-    $ srun -n 512 bash -c 'POCL_CACHE_DIR=$POCL_CACHE_ROOT/$$ XDG_CACHE_HOME=$XDG_CACHE_ROOT/$$ python -m mpi4py examples/wave.py'
-
-
-There is also on-disk caching of compiled kernels done by CUDA itself.
-As of 01/2023, we have not observed issues specific to this caching.
-The CUDA caching behavior can also be controlled via
-`environment variables `__.
diff --git a/mirgecom/array_context.py b/mirgecom/array_context.py
index f6c066565..0d04e72dc 100644
--- a/mirgecom/array_context.py
+++ b/mirgecom/array_context.py
@@ -108,58 +108,6 @@ def actx_class_is_numpy(actx_class: Type[ArrayContext]) -> bool:
     return False
 
 
-def _check_cache_dirs_node() -> None:
-    """Check whether multiple ranks share cache directories on the same node."""
-    from mpi4py import MPI
-
-    size = MPI.COMM_WORLD.Get_size()
-
-    if size <= 1:
-        return
-
-    from mirgecom.mpi import shared_split_comm_world
-
-    with shared_split_comm_world() as node_comm:
-        node_rank = node_comm.Get_rank()
-
-        def _check_var(var: str) -> None:
-            from warnings import warn
-
-            try:
-                my_path = os.environ[var]
-            except KeyError:
-                warn(f"Please set the '{var}' variable in your job script to "
-                     "avoid file system overheads when running on large numbers of "
-                     "ranks. See https://mirgecom.readthedocs.io/en/latest/running/large-systems.html "  # noqa: E501
-                     "for more information.")
-                # Create a fake path so there will not be a second warning below.
-                my_path = f"no/such/path/rank{node_rank}"
-
-            all_paths = node_comm.gather(my_path, root=0)
-
-            if node_rank == 0:
-                assert all_paths
-                if len(all_paths) != len(set(all_paths)):
-                    hostname = MPI.Get_processor_name()
-                    dup = [path for path in set(all_paths)
-                           if all_paths.count(path) > 1]
-
-                    from warnings import warn
-                    warn(f"Multiple ranks are sharing '{var}' on node '{hostname}'. "
-                         f"Duplicate '{var}'s: {dup}.")
-
-        _check_var("XDG_CACHE_HOME")
-
-        if os.environ.get("XDG_CACHE_HOME") is None:
-            # When XDG_CACHE_HOME is set but POCL_CACHE_DIR is not, pocl
-            # will use XDG_CACHE_HOME as the cache directory.
-            _check_var("POCL_CACHE_DIR")
-
-        # We haven't observed an issue yet that 'CUDA_CACHE_PATH' fixes,
-        # so disable this check for now.
-        # _check_var("CUDA_CACHE_PATH")
-
-
 def _check_gpu_oversubscription(actx: ArrayContext) -> None:
     """
     Check whether multiple ranks are running on the same GPU on each node.
@@ -323,7 +271,6 @@ def initialize_actx(
     # or pocl, and therefore we don't need to examine their caching).
     if actx_class_is_pyopencl(actx_class):
         _check_gpu_oversubscription(actx)
-        _check_cache_dirs_node()
         log_disk_cache_config(actx)
 
     return actx
diff --git a/scripts/delta-parallel-spawner.sh b/scripts/delta-parallel-spawner.sh
index b1501b1bf..8dfc06891 100755
--- a/scripts/delta-parallel-spawner.sh
+++ b/scripts/delta-parallel-spawner.sh
@@ -5,11 +5,8 @@
 export CUDA_CACHE_DISABLE=0
 
 MIRGE_CACHE_ROOT=${MIRGE_CACHE_ROOT:-"$(pwd)/.mirge-cache"}
-XDG_CACHE_ROOT=${XDG_CACHE_ROOT:-"${MIRGE_CACHE_ROOT}/xdg-cache"}
-CUDA_CACHE_ROOT=${CUDA_CACHE_ROOT:-"${MIRGE_CACHE_ROOT}/cuda-cache"}
-
-XDG_CACHE_HOME=${XDG_CACHE_HOME:-"${XDG_CACHE_ROOT}/rank$SLURM_PROCID"}
-CUDA_CACHE_PATH=${CUDA_CACHE_DIR:-"${CUDA_CACHE_DIR}/rank$SLURM_PROCID"}
+XDG_CACHE_HOME=${XDG_CACHE_HOME:-"${MIRGE_CACHE_ROOT}/xdg-cache"}
+CUDA_CACHE_PATH=${CUDA_CACHE_PATH:-"${MIRGE_CACHE_ROOT}/cuda-cache"}
 
 export XDG_CACHE_HOME
 export CUDA_CACHE_PATH
diff --git a/scripts/delta.sbatch.sh b/scripts/delta.sbatch.sh
index 4c8ceee25..4ba6cde62 100644
--- a/scripts/delta.sbatch.sh
+++ b/scripts/delta.sbatch.sh
@@ -30,11 +30,8 @@ echo nnodes=$nnodes nproc=$nproc
 
 srun_cmd="srun -N $nnodes -n $nproc"
 
-# See
-# https://mirgecom.readthedocs.io/en/latest/running.html#avoiding-overheads-due-to-caching-of-kernels
-# on why this is important
 MIRGE_CACHE_ROOT=${MIRGE_CACHE_ROOT:-"$(pwd)/.mirge-cache/"}
-export XDG_CACHE_HOME_ROOT="${MIRGE_CACHE_ROOT}/xdg-cache/rank"
+export XDG_CACHE_HOME="${MIRGE_CACHE_ROOT}/xdg-cache"
 
 # Run application
-$srun_cmd bash -c 'XDG_CACHE_HOME=$XDG_CACHE_HOME_ROOT$SLURM_PROCID python -u -O -m mpi4py ./pulse.py'
+$srun_cmd python -u -m mpi4py ./pulse.py
diff --git a/scripts/lassen-parallel-spawner.sh b/scripts/lassen-parallel-spawner.sh
index 6479fac7e..d8b761bb3 100755
--- a/scripts/lassen-parallel-spawner.sh
+++ b/scripts/lassen-parallel-spawner.sh
@@ -7,15 +7,8 @@ export CUDA_CACHE_DISABLE=0
 
 # MIRGE env vars used to setup cache locations
 MIRGE_CACHE_ROOT=${MIRGE_CACHE_ROOT:-"$(pwd)/.mirge-cache/"}
-XDG_CACHE_ROOT=${XDG_CACHE_ROOT:-"${MIRGE_CACHE_ROOT}/xdg-cache"}
-CUDA_CACHE_ROOT=${CUDA_CACHE_ROOT:-"${MIRGE_CACHE_ROOT}/cuda-cache"}
-
-# These vars are used by pocl, pyopencl, loopy, and cuda for cache location
-XDG_CACHE_HOME=${XDG_CACHE_HOME:-"${XDG_CACHE_ROOT}/rank$OMPI_COMM_WORLD_RANK"}
-CUDA_CACHE_PATH=${CUDA_CACHE_DIR:-"${CUDA_CACHE_ROOT}/rank$OMPI_COMM_WORLD_RANK"}
-# The system sets a default CUDA_CACHE_PATH which is node-local :(
-# User still has full path control, but we discard the system default
-# CUDA_CACHE_PATH=${CUDA_CACHE_PATH:-"${CUDA_CACHE_ROOT}/rank$OMPI_COMM_WORLD_RANK"}
+XDG_CACHE_HOME=${XDG_CACHE_HOME:-"${MIRGE_CACHE_ROOT}/xdg-cache"}
+CUDA_CACHE_PATH=${CUDA_CACHE_PATH:-"${MIRGE_CACHE_ROOT}/cuda-cache"}
 
 export XDG_CACHE_HOME
 export CUDA_CACHE_PATH
diff --git a/scripts/lassen.bsub.sh b/scripts/lassen.bsub.sh
index 2c6c72687..f22ff5da8 100644
--- a/scripts/lassen.bsub.sh
+++ b/scripts/lassen.bsub.sh
@@ -27,11 +27,11 @@ jsrun_cmd="jsrun -g 1 -a 1 -n $nproc"
 # https://mirgecom.readthedocs.io/en/latest/running.html#avoiding-overheads-due-to-caching-of-kernels
 # on why this is important
 MIRGE_CACHE_ROOT=${MIRGE_CACHE_ROOT:-"$(pwd)/.mirge-cache/"}
-export XDG_CACHE_HOME_ROOT="${MIRGE_CACHE_ROOT}/xdg-cache/rank"
+export XDG_CACHE_HOME="${MIRGE_CACHE_ROOT}/xdg-cache"
 
 # Reenable CUDA cache
 export CUDA_CACHE_DISABLE=0
-export CUDA_CACHE_PATH_ROOT="${MIRGE_CACHE_ROOT}/cuda-cache/rank"
+export CUDA_CACHE_PATH="${MIRGE_CACHE_ROOT}/cuda-cache"
 
 # Print task allocation
 $jsrun_cmd js_task_info
@@ -39,4 +39,4 @@ $jsrun_cmd js_task_info
 echo "----------------------------"
 
 # Run application
-$jsrun_cmd bash -c 'CUDA_CACHE_PATH=$CUDA_CACHE_PATH_ROOT$OMPI_COMM_WORLD_RANK XDG_CACHE_HOME=$XDG_CACHE_HOME_ROOT$OMPI_COMM_WORLD_RANK python -m mpi4py ../examples/pulse.py --lazy'
+$jsrun_cmd python -m mpi4py ../examples/pulse.py --lazy
diff --git a/scripts/run-gpus-generic.sh b/scripts/run-gpus-generic.sh
index b81917430..e01a7a116 100755
--- a/scripts/run-gpus-generic.sh
+++ b/scripts/run-gpus-generic.sh
@@ -11,25 +11,20 @@
 #
 # Run it like this:
 # mpiexec -n 2 bash run-gpus-generic.sh python -m mpi4py pulse.py --lazy
-# unset CUDA_CACHE_DISABLE
+
 export CUDA_CACHE_DISABLE=0
 
 MIRGE_CACHE_ROOT=${MIRGE_CACHE_ROOT:-"$(pwd)/.mirge-cache/"}
-XDG_CACHE_ROOT=${XDG_CACHE_ROOT:-"${MIRGE_CACHE_ROOT}/xdg-cache"}
-CUDA_CACHE_ROOT=${CUDA_CACHE_ROOT:-"${MIRGE_CACHE_ROOT}/cuda-cache"}
+XDG_CACHE_HOME=${XDG_CACHE_HOME:-"${MIRGE_CACHE_ROOT}/xdg-cache"}
+CUDA_CACHE_PATH=${CUDA_CACHE_PATH:-"${MIRGE_CACHE_ROOT}/cuda-cache"}
 
 if [[ -n "$OMPI_COMM_WORLD_NODE_RANK" ]]; then
     # Open MPI
     export CUDA_VISIBLE_DEVICES=$OMPI_COMM_WORLD_LOCAL_RANK
-    RANK_ID="rank${OMPI_COMM_WORLD_RANK}"
 elif [[ -n "$MPI_LOCALRANKID" ]]; then
     # mpich/mvapich
     export CUDA_VISIBLE_DEVICES=$MPI_LOCALRANKID
-    RANK_ID="rank${MPI_LOCALRANKID}"
 fi
 
-XDG_CACHE_HOME=${XDG_CACHE_HOME:-"${XDG_CACHE_ROOT}/${RANK_ID}"}
-CUDA_CACHE_PATH=${CUDA_CACHE_DIR:-"${CUDA_CACHE_ROOT}/${RANK_ID}"}
-
 export XDG_CACHE_HOME
 export CUDA_CACHE_PATH
diff --git a/scripts/tioga-parallel-spawner.sh b/scripts/tioga-parallel-spawner.sh
index b91f56477..ce3d9df7d 100644
--- a/scripts/tioga-parallel-spawner.sh
+++ b/scripts/tioga-parallel-spawner.sh
@@ -3,9 +3,7 @@
 # Used to wrap the spawning of parallel mirgecom drivers on Tioga.
 
 MIRGE_CACHE_ROOT=${MIRGE_CACHE_ROOT:-"$(pwd)/.mirge-cache/"}
-XDG_CACHE_ROOT=${XDG_CACHE_ROOT:-"${MIRGE_CACHE_ROOT}/xdg-cache"}
-
-XDG_CACHE_HOME=${XDG_CACHE_HOME:-"${XDG_CACHE_ROOT}/rank$FLUX_TASK_RANK"}
+XDG_CACHE_HOME=${XDG_CACHE_HOME:-"${MIRGE_CACHE_ROOT}/xdg-cache"}
 
 export XDG_CACHE_HOME
 
diff --git a/scripts/tioga.flux.sh b/scripts/tioga.flux.sh
index 28f9bac93..6d42a7807 100644
--- a/scripts/tioga.flux.sh
+++ b/scripts/tioga.flux.sh
@@ -21,6 +21,6 @@ export PYOPENCL_CTX="AMD:0"
 run_cmd="flux run -N $nnodes -n $nproc --exclusive"
 
 MIRGE_CACHE_ROOT=${MIRGE_CACHE_ROOT:-"$(pwd)/.mirge-cache/"}
-export XDG_CACHE_ROOT=${XDG_CACHE_ROOT:-"${MIRGE_CACHE_ROOT}/xdg-cache"}
+export XDG_CACHE_HOME=${XDG_CACHE_HOME:-"${MIRGE_CACHE_ROOT}/xdg-cache"}
 
-$run_cmd bash -c 'XDG_CACHE_HOME=$XDG_CACHE_ROOT/$FLUX_TASK_RANK ROCR_VISIBLE_DEVICES=$FLUX_TASK_LOCAL_ID python -m mpi4py examples/pulse.py --lazy '
+$run_cmd bash -c 'ROCR_VISIBLE_DEVICES=$FLUX_TASK_LOCAL_ID python -m mpi4py examples/pulse.py --lazy '
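
Usage sketch (not part of the patch): after this change, a job script points the cache variables at one root shared by all ranks instead of building per-rank directories. The snippet below only mirrors the defaults set in the spawner scripts above; the launcher (srun) and driver name (pulse.py) follow scripts/delta.sbatch.sh and stand in for whatever the job actually runs.

    # One cache root, shared by all ranks (same default as the spawner scripts)
    export MIRGE_CACHE_ROOT="$(pwd)/.mirge-cache"

    # pocl/pyopencl/loopy cache under XDG_CACHE_HOME; CUDA caches under CUDA_CACHE_PATH
    export XDG_CACHE_HOME="${MIRGE_CACHE_ROOT}/xdg-cache"
    export CUDA_CACHE_PATH="${MIRGE_CACHE_ROOT}/cuda-cache"
    export CUDA_CACHE_DISABLE=0   # keep the CUDA JIT cache enabled

    # No per-rank 'bash -c' wrapper is needed any more; launch the driver directly
    srun -N "$nnodes" -n "$nproc" python -u -m mpi4py ./pulse.py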