From a7afcba3cffd86acc748edf39a8a2f33c973bf5d Mon Sep 17 00:00:00 2001 From: Axel Huebl Date: Thu, 2 Nov 2023 08:45:19 -0700 Subject: [PATCH] `amrex.omp_threads`: Can Avoid SMT (#3607) ## Summary In all our applications in BLAST, the OpenMP default to use all [logical cores on modern CPUs](https://en.wikipedia.org/wiki/Simultaneous_multithreading) results in significantly slower performance than just using the physical cores with AMReX. Thus, we introduce a new option `amrex.omp_threads` that enables control over the OpenMP threads at startup and has - for most popular systems - an implementation to find out the actual number of physical threads and default to it. For codes, users that change the default to `amrex.omp_threads = nosmt`, the `OMP_NUM_THREADS` variable will still take precedence. This is a bit unusual (because CLI options usually have higher precedence than env vars - and they do if the user provides a number here), but done intentionally: this way, codes like WarpX can set the `nosmt` default and HPC job scripts will set the exact, preferably benchmarked number of threads as usual without surprises. - [x] document ## Tests Performed for AMReX OMP Backend Tests were performed with very small examples, WarpX 3D LWFA test as checked in or AMReX AMRCore 3d test. - [x] Ubuntu 22.04 Laptop w/ 12th Gen Intel i9-12900H: @ax3l - 20 logical cores; the first 12 logical cores use 2x SMT/HT - 20 virtual (default) -> 14 physical (`amrex.omp_threads = nosmt`) - faster runtime! - [x] Perlmutter (SUSE Linux Enterprise 15.4, kernel 5.14.21) - [CPU node](https://docs.nersc.gov/systems/perlmutter/architecture/) with 2x [AMD EPYC 7763](https://www.amd.com/en/products/cpu/amd-epyc-7763) - 2x SMT - 256 default, 128 with `amrex.omp_threads = nosmt` - faster runtime! - [x] Frontier (SUSE Linux Enterprise 15.4, kernel 5.14.21) - 1x AMD EPYC 7763 64-Core Processor (w/ 2x SMT enabled) - 2x SMT - 128 default - 64 with `amrex.omp_threads = nosmt` - faster runtime! 
- The ideal result might also be lower, due to first cores used by OS and [low-noise cores](https://docs.olcf.ornl.gov/systems/frontier_user_guide.html#low-noise-mode-layout) after that. But that is an orthogonal question and should be set in job scripts: `#SBATCH --ntasks-per-node=8` `#SBATCH --cpus-per-task=7` `#SBATCH --gpus-per-task=1` - [x] Summit (RHEL 8.2, kernel 4.18.0) - 2x IBM Power9 (each 22 physical cores each, each 6 disabled/hidden for OS?, 4x SMT enabled; cpuinfo says 128 total) - 4x SMT - 128 default, 32 with `amrex.omp_threads = nosmt` - faster runtime! - [x] [Lassen](https://hpc.llnl.gov/hardware/compute-platforms/lassen) (RHEL 7.9, kernel 4.14.0) - 2x IBM Power9 (each 22 physical cores, each 2 reserved for OS?, 4x SMT enabled) - 4x SMT - 160 default, 44 with `amrex.omp_threads = nosmt` - faster runtime! - The ideal result might be even down to 40, but that is an orthogonal question and should be set in job scripts. - [x] macOS M1 (arm64/aarch64) mini: - no SMT/HT - 8 default, 8 with `amrex.omp_threads = nosmt` - [x] macOS (OSX Ventura 13.5.2, 2.8 GHz Quad-Core Intel Core i7-8569U) Intel x86_64 @n01r - 2x SMT - 8 default, 4 with `amrex.omp_threads = nosmt` - faster runtime! - [x] macOS (OSX Ventura 13.5.2) M1 Max on mac studio @RTSandberg - no SMT/HT - 10 default, 10 with `amrex.omp_threads = nosmt` - [ ] some BSD/FreeBSD system? - no user requests - low priority, we just keep the default for now - [ ] Windows... 
looking for a system ## Additional background ## Checklist The proposed changes: - [ ] fix a bug or incorrect behavior in AMReX - [x] add new capabilities to AMReX - [ ] changes answers in the test suite to more than roundoff level - [ ] are likely to significantly affect the results of downstream AMReX users - [ ] include documentation in the code and/or rst files, if appropriate --------- Co-authored-by: Weiqun Zhang --- .../source/InputsComputeBackends.rst | 21 +++ .../source/Inputs_Chapter.rst | 1 + Src/Base/AMReX.cpp | 15 +- Src/Base/AMReX_OpenMP.H | 15 +- Src/Base/AMReX_OpenMP.cpp | 177 ++++++++++++++++++ Src/Base/CMakeLists.txt | 1 + Src/Base/Make.package | 1 + 7 files changed, 224 insertions(+), 7 deletions(-) create mode 100644 Docs/sphinx_documentation/source/InputsComputeBackends.rst create mode 100644 Src/Base/AMReX_OpenMP.cpp diff --git a/Docs/sphinx_documentation/source/InputsComputeBackends.rst b/Docs/sphinx_documentation/source/InputsComputeBackends.rst new file mode 100644 index 00000000000..26e5d527508 --- /dev/null +++ b/Docs/sphinx_documentation/source/InputsComputeBackends.rst @@ -0,0 +1,21 @@ +.. _Chap:InputsComputeBackends: + +Compute Backends +================ + +The following inputs must be preceded by ``amrex.`` and determine runtime options of CPU or GPU compute implementations. + ++------------------------+-----------------------------------------------------------------------+-------------+------------+ +| Parameter | Description | Type | Default | ++========================+=======================================================================+=============+============+ +| ``omp_threads`` | If OpenMP is enabled, this can be used to set the default number of | String | ``system`` | +| | threads. 
The special value ``nosmt`` can be used to avoid using | or Int | | +| | threads for virtual cores (aka Hyperthreading or SMT), as is default | | | +| | in OpenMP, and instead only spawns threads equal to the number of | | | +| | physical cores in the system. | | | +| | For the values ``system`` and ``nosmt``, the environment variable | | | +| | ``OMP_NUM_THREADS`` takes precedence. For Integer values, | | | +| | ``OMP_NUM_THREADS`` is ignored. | | | ++------------------------+-----------------------------------------------------------------------+-------------+------------+ + +For GPU-specific parameters, see also the :ref:`GPU chapter `. diff --git a/Docs/sphinx_documentation/source/Inputs_Chapter.rst b/Docs/sphinx_documentation/source/Inputs_Chapter.rst index 0a64aeb492c..43ead40b3c6 100644 --- a/Docs/sphinx_documentation/source/Inputs_Chapter.rst +++ b/Docs/sphinx_documentation/source/Inputs_Chapter.rst @@ -9,6 +9,7 @@ Run-time Inputs InputsProblemDefinition InputsTimeStepping InputsLoadBalancing + InputsComputeBackends InputsPlotFiles InputsCheckpoint diff --git a/Src/Base/AMReX.cpp b/Src/Base/AMReX.cpp index 147f8275c57..4449dab1955 100644 --- a/Src/Base/AMReX.cpp +++ b/Src/Base/AMReX.cpp @@ -52,6 +52,7 @@ #endif #ifdef AMREX_USE_OMP +#include #include #endif @@ -72,7 +73,9 @@ #include #include #include +#include #include +#include #include #include #include @@ -459,15 +462,17 @@ amrex::Initialize (int& argc, char**& argv, bool build_parm_parse, #endif #ifdef AMREX_USE_OMP + amrex::OpenMP::init_threads(); + + // status output if (system::verbose > 0) { // static_assert(_OPENMP >= 201107, "OpenMP >= 3.1 is required."); amrex::Print() << "OMP initialized with " << omp_get_max_threads() << " OMP threads\n"; } -#endif -#if defined(AMREX_USE_MPI) && defined(AMREX_USE_OMP) + // warn if over-subscription is detected if (system::verbose > 0) { auto ncores = int(std::thread::hardware_concurrency()); if (ncores != 0 && // It might be zero according to the C++ standard. 
@@ -476,8 +481,10 @@ amrex::Initialize (int& argc, char**& argv, bool build_parm_parse, amrex::Print(amrex::ErrorStream()) << "AMReX Warning: You might be oversubscribing CPU cores with OMP threads.\n" << " There are " << ncores << " cores per node.\n" - << " There are " << ParallelDescriptor::NProcsPerNode() << " MPI ranks per node.\n" - << " But OMP is initialized with " << omp_get_max_threads() << " threads per rank.\n" +#if defined(AMREX_USE_MPI) + << " There are " << ParallelDescriptor::NProcsPerNode() << " MPI ranks (processes) per node.\n" +#endif + << " But OMP is initialized with " << omp_get_max_threads() << " threads per process.\n" << " You should consider setting OMP_NUM_THREADS=" << ncores/ParallelDescriptor::NProcsPerNode() << " or less in the environment.\n"; } diff --git a/Src/Base/AMReX_OpenMP.H b/Src/Base/AMReX_OpenMP.H index 8eb8ada4513..ce267b9be73 100644 --- a/Src/Base/AMReX_OpenMP.H +++ b/Src/Base/AMReX_OpenMP.H @@ -11,10 +11,12 @@ namespace amrex::OpenMP { inline int get_max_threads () { return omp_get_max_threads(); } inline int get_thread_num () { return omp_get_thread_num(); } inline int in_parallel () { return omp_in_parallel(); } + inline void set_num_threads (int num) { omp_set_num_threads(num); } + void init_threads (); } -#else +#else // AMREX_USE_OMP namespace amrex::OpenMP { @@ -22,9 +24,16 @@ namespace amrex::OpenMP { constexpr int get_max_threads () { return 1; } constexpr int get_thread_num () { return 0; } constexpr int in_parallel () { return false; } - + constexpr void set_num_threads (int) { /* nothing */ } + constexpr void init_threads () { /* nothing */ } } -#endif +#endif // AMREX_USE_OMP + +namespace amrex { + /** ... 
*/ + int + numUniquePhysicalCores(); +} #endif
diff --git a/Src/Base/AMReX_OpenMP.cpp b/Src/Base/AMReX_OpenMP.cpp
new file mode 100644
index 00000000000..5ddd9944411
--- /dev/null
+++ b/Src/Base/AMReX_OpenMP.cpp
@@ -0,0 +1,221 @@
+#include <AMReX.H>
+#include <AMReX_OpenMP.H>
+#include <AMReX_ParmParse.H>
+#include <AMReX_Print.H>
+
+#if defined(__APPLE__)
+#include <sys/types.h>
+#include <sys/sysctl.h>
+#endif
+
+#if defined(_WIN32)
+#include <windows.h>
+#endif
+
+#include <cstddef>
+#include <cstdlib>
+#include <exception>
+#include <fstream>
+#include <optional>
+#include <set>
+#include <sstream>
+#include <string>
+#include <thread>
+#include <vector>
+
+
+namespace amrex
+{
+    /** Number of unique physical CPU cores visible to this process.
+     *
+     * Logical cores that are SMT/hyperthreading siblings of one another are
+     * counted once. Whenever the platform-specific query fails or is not
+     * implemented, the number of logical cores reported by
+     * std::thread::hardware_concurrency() is used instead.
+     *
+     * @return the number of physical cores; always at least 1, so the result
+     *         can be passed to omp_set_num_threads() directly
+     */
+    int
+    numUniquePhysicalCores ()
+    {
+        int ncores = 0;
+
+#if defined(__APPLE__)
+        size_t len = sizeof(ncores);
+        // See hw.physicalcpu and hw.physicalcpu_max
+        //   https://developer.apple.com/documentation/kernel/1387446-sysctlbyname/determining_system_capabilities/
+        //   https://developer.apple.com/documentation/kernel/1387446-sysctlbyname
+        if (sysctlbyname("hw.physicalcpu", &ncores, &len, nullptr, 0) == -1) {
+            if (system::verbose > 0) {
+                amrex::Print() << "numUniquePhysicalCores(): Error receiving hw.physicalcpu! "
+                               << "Defaulting to visible cores.\n";
+            }
+            ncores = int(std::thread::hardware_concurrency());
+        }
+#elif defined(__linux__)
+        // every physical core shows up as one distinct set of sibling threads
+        std::set<std::vector<int>> uniqueThreadSets;
+        int cpuIndex = 0;
+        bool parseError = false;
+
+        while (!parseError) {
+            // for each logical CPU in cpuIndex from 0...N-1
+            std::string path = "/sys/devices/system/cpu/cpu" + std::to_string(cpuIndex) + "/topology/thread_siblings_list";
+            std::ifstream file(path);
+            if (!file.is_open()) {
+                break; // no further CPUs to check
+            }
+
+            // find its siblings
+            std::vector<int> siblings;
+            std::string line;
+            if (std::getline(file, line)) {
+                std::stringstream ss(line);
+                std::string token;
+
+                // Possible syntax: 0-3, 8-11, 14,17
+                //   https://github.com/torvalds/linux/blob/v6.5/Documentation/ABI/stable/sysfs-devices-system-cpu#L68-L72
+                try {
+                    while (std::getline(ss, token, ',')) {
+                        size_t dashPos = token.find('-');
+                        if (dashPos != std::string::npos) {
+                            // Range detected
+                            int start = std::stoi(token.substr(0, dashPos));
+                            int end = std::stoi(token.substr(dashPos + 1));
+                            for (int i = start; i <= end; ++i) {
+                                siblings.push_back(i);
+                            }
+                        } else {
+                            siblings.push_back(std::stoi(token));
+                        }
+                    }
+                } catch (std::exception const&) {
+                    // std::stoi throws on malformed entries: do not abort
+                    // the startup, use the fallback below instead
+                    parseError = true;
+                }
+            }
+
+            // and record the siblings group
+            // (assumes: ascending and unique sets per cpuIndex)
+            uniqueThreadSets.insert(siblings);
+            cpuIndex++;
+        }
+
+        if (cpuIndex == 0 || parseError) {
+            if (system::verbose > 0) {
+                amrex::Print() << "numUniquePhysicalCores(): Error reading CPU info.\n";
+            }
+            ncores = int(std::thread::hardware_concurrency());
+        } else {
+            ncores = int(uniqueThreadSets.size());
+        }
+#elif defined(_WIN32)
+        using ProcInfo = SYSTEM_LOGICAL_PROCESSOR_INFORMATION;
+
+        // Size query: with an empty buffer, the call is documented to fail
+        // with ERROR_INSUFFICIENT_BUFFER and to store the required number of
+        // bytes in length. Only then is it worth asking for the data.
+        DWORD length = 0;
+        bool const sizeKnown = !GetLogicalProcessorInformation(nullptr, &length)
+                               && GetLastError() == ERROR_INSUFFICIENT_BUFFER
+                               && length >= sizeof(ProcInfo);
+
+        std::vector<ProcInfo> buffer(sizeKnown ? length / sizeof(ProcInfo) : 0);
+        if (!sizeKnown || !GetLogicalProcessorInformation(buffer.data(), &length)) {
+            if (system::verbose > 0) {
+                amrex::Print() << "numUniquePhysicalCores(): Failed to get logical processor information! "
+                               << "Defaulting to visible cores.\n";
+            }
+            ncores = int(std::thread::hardware_concurrency());
+        } else {
+            // only look at the entries that were actually written
+            std::size_t const nEntries = length / sizeof(ProcInfo);
+            ncores = 0;
+            for (std::size_t i = 0; i < nEntries && i < buffer.size(); ++i) {
+                if (buffer[i].Relationship == RelationProcessorCore) {
+                    ncores++;
+                }
+            }
+        }
+#else
+        // TODO:
+        //   BSD
+        if (system::verbose > 0) {
+            amrex::Print() << "numUniquePhysicalCores(): Unknown system. Defaulting to visible cores.\n";
+        }
+        ncores = int(std::thread::hardware_concurrency());
+#endif
+
+        // hardware_concurrency() is allowed to return 0; never report fewer than one core
+        return (ncores > 0) ? ncores : 1;
+    }
+} // namespace amrex
+
+#ifdef AMREX_USE_OMP
+namespace amrex::OpenMP
+{
+    /** Apply the runtime parameter amrex.omp_threads to the OpenMP runtime.
+     *
+     * Accepted values:
+     * - system (default): keep whatever the OpenMP runtime chose, including
+     *   a value taken from the environment variable OMP_NUM_THREADS
+     * - nosmt: one thread per physical core, unless OMP_NUM_THREADS is set,
+     *   in which case the environment variable wins
+     * - a positive integer: use exactly this many threads
+     *
+     * Any other value is reported (if amrex::system::verbose > 0) and ignored.
+     */
+    void init_threads ()
+    {
+        amrex::ParmParse pp("amrex");
+        std::string omp_threads = "system";
+        pp.queryAdd("omp_threads", omp_threads);
+
+        // strict conversion: the whole string must be a positive integer,
+        // i.e., inputs such as "4x", "0" or "-2" are rejected
+        auto to_int = [](std::string const & str_omp_threads) {
+            std::optional<int> num;
+            try {
+                std::size_t pos = 0;
+                int const value = std::stoi(str_omp_threads, &pos);
+                if (pos == str_omp_threads.size() && value > 0) {
+                    num = value;
+                }
+            }
+            catch (std::exception const&) { /* not a number: num stays empty */ }
+            return num;
+        };
+
+        if (omp_threads == "system") {
+            // default or OMP_NUM_THREADS environment variable
+        } else if (omp_threads == "nosmt") {
+            char const *env_omp_num_threads = std::getenv("OMP_NUM_THREADS");
+            if (env_omp_num_threads != nullptr) {
+                // OMP_NUM_THREADS always takes precedence over nosmt,
+                // the verbosity only controls whether this is reported
+                if (amrex::system::verbose > 1) {
+                    amrex::Print() << "amrex.omp_threads was set to nosmt, "
+                                   << "but OMP_NUM_THREADS was set. Will keep "
+                                   << "OMP_NUM_THREADS=" << env_omp_num_threads << ".\n";
+                }
+            } else {
+                omp_set_num_threads(numUniquePhysicalCores());
+            }
+        } else {
+            std::optional<int> num_omp_threads = to_int(omp_threads);
+            if (num_omp_threads.has_value()) {
+                omp_set_num_threads(num_omp_threads.value());
+            }
+            else {
+                if (amrex::system::verbose > 0) {
+                    amrex::Print() << "amrex.omp_threads has an unknown value: "
+                                   << omp_threads
+                                   << " (try system, nosmt, or a positive integer)\n";
+                }
+            }
+        }
+    }
+} // namespace amrex::OpenMP
+#endif // AMREX_USE_OMP
diff --git a/Src/Base/CMakeLists.txt b/Src/Base/CMakeLists.txt index 544de3aed8c..459ec3bd7c4 100644 --- a/Src/Base/CMakeLists.txt +++ b/Src/Base/CMakeLists.txt @@ -53,6 +53,7 @@ foreach(D IN LISTS AMReX_SPACEDIM) AMReX_ParallelDescriptor.H AMReX_ParallelDescriptor.cpp AMReX_OpenMP.H + AMReX_OpenMP.cpp AMReX_ParallelReduce.H AMReX_ForkJoin.H AMReX_ForkJoin.cpp diff --git a/Src/Base/Make.package b/Src/Base/Make.package index 29b4c25dc84..276887ebd79 100644 --- a/Src/Base/Make.package +++ b/Src/Base/Make.package @@ -38,6 +38,7 @@ C$(AMREX_BASE)_headers += AMReX_REAL.H AMReX_INT.H AMReX_CONSTANTS.H AMReX_SPACE C$(AMREX_BASE)_sources += AMReX_DistributionMapping.cpp AMReX_ParallelDescriptor.cpp C$(AMREX_BASE)_headers += AMReX_DistributionMapping.H AMReX_ParallelDescriptor.H C$(AMREX_BASE)_headers += AMReX_OpenMP.H +C$(AMREX_BASE)_sources += AMReX_OpenMP.cpp C$(AMREX_BASE)_headers += AMReX_ParallelReduce.H