From d3a9afe0c5ff6476b97ac2b717090fa6c3ee61fa Mon Sep 17 00:00:00 2001
From: bennibolm <benjamin.bolm@gmx.de>
Date: Mon, 25 Mar 2024 13:21:27 +0100
Subject: [PATCH 1/4] Use @batch reduction functionality for bounds check

---
 docs/src/performance.md                       | 11 ----
 .../subcell_bounds_check_2d.jl                | 56 ++++++-------------
 src/solvers/dgsem_tree/subcell_limiters_2d.jl | 11 +---
 3 files changed, 19 insertions(+), 59 deletions(-)

diff --git a/docs/src/performance.md b/docs/src/performance.md
index 40970e58c5..9f81d3c3d8 100644
--- a/docs/src/performance.md
+++ b/docs/src/performance.md
@@ -282,14 +282,3 @@ requires. It can thus be seen as a proxy for "energy used" and, as an extension,
     timing result, you need to set the analysis interval such that the
     `AnalysisCallback` is invoked at least once during the course of the simulation and
     discard the first PID value.
-
-## Performance issues with multi-threaded reductions
-[False sharing](https://en.wikipedia.org/wiki/False_sharing) is a known performance issue
-for systems with distributed caches. It also occurred for the implementation of a thread
-parallel bounds checking routine for the subcell IDP limiting
-in [PR #1736](https://github.com/trixi-framework/Trixi.jl/pull/1736).
-After some [testing and discussion](https://github.com/trixi-framework/Trixi.jl/pull/1736#discussion_r1423881895),
-it turned out that initializing a vector of length `n * Threads.nthreads()` and only using every
-n-th entry instead of a vector of length `Threads.nthreads()` fixes the problem.
-Since there are no processors with caches over 128B, we use `n = 128B / size(uEltype)`.
-Now, the bounds checking routine of the IDP limiting scales as hoped.
diff --git a/src/callbacks_stage/subcell_bounds_check_2d.jl b/src/callbacks_stage/subcell_bounds_check_2d.jl
index 19d73968c9..70f37089ec 100644
--- a/src/callbacks_stage/subcell_bounds_check_2d.jl
+++ b/src/callbacks_stage/subcell_bounds_check_2d.jl
@@ -12,25 +12,15 @@
     (; variable_bounds) = limiter.cache.subcell_limiter_coefficients
     (; idp_bounds_delta_local, idp_bounds_delta_global) = limiter.cache
 
-    # Note: Accessing the threaded memory vector `idp_bounds_delta_local` with
-    # `deviation = idp_bounds_delta_local[key][Threads.threadid()]` causes critical performance
-    # issues due to False Sharing.
-    # Initializing a vector with n times the length and using every n-th entry fixes this
-    # problem and allows proper scaling:
-    # `deviation = idp_bounds_delta_local[key][n * Threads.threadid()]`
-    # Since there are no processors with caches over 128B, we use `n = 128B / size(uEltype)`
-    stride_size = div(128, sizeof(eltype(u))) # = n
-
     if local_minmax
         for v in limiter.local_minmax_variables_cons
             v_string = string(v)
             key_min = Symbol(v_string, "_min")
             key_max = Symbol(v_string, "_max")
-            deviation_min_threaded = idp_bounds_delta_local[key_min]
-            deviation_max_threaded = idp_bounds_delta_local[key_max]
-            @threaded for element in eachelement(solver, cache)
-                deviation_min = deviation_min_threaded[stride_size * Threads.threadid()]
-                deviation_max = deviation_max_threaded[stride_size * Threads.threadid()]
+            deviation_min = idp_bounds_delta_local[key_min]
+            deviation_max = idp_bounds_delta_local[key_max]
+            @batch reduction=((max, deviation_min), (max, deviation_max)) for element in eachelement(solver,
+                                                                                                     cache)
                 for j in eachnode(solver), i in eachnode(solver)
                     var = u[v, i, j, element]
                     deviation_min = max(deviation_min,
@@ -38,9 +28,9 @@
                     deviation_max = max(deviation_max,
                                         var - variable_bounds[key_max][i, j, element])
                 end
-                deviation_min_threaded[stride_size * Threads.threadid()] = deviation_min
-                deviation_max_threaded[stride_size * Threads.threadid()] = deviation_max
             end
+            idp_bounds_delta_local[key_min] = deviation_min
+            idp_bounds_delta_local[key_max] = deviation_max
         end
     end
     if positivity
@@ -49,40 +39,35 @@
                 continue
             end
             key = Symbol(string(v), "_min")
-            deviation_threaded = idp_bounds_delta_local[key]
-            @threaded for element in eachelement(solver, cache)
-                deviation = deviation_threaded[stride_size * Threads.threadid()]
+            deviation = idp_bounds_delta_local[key]
+            @batch reduction=(max, deviation) for element in eachelement(solver, cache)
                 for j in eachnode(solver), i in eachnode(solver)
                     var = u[v, i, j, element]
                     deviation = max(deviation,
                                     variable_bounds[key][i, j, element] - var)
                 end
-                deviation_threaded[stride_size * Threads.threadid()] = deviation
             end
+            idp_bounds_delta_local[key] = deviation
         end
         for variable in limiter.positivity_variables_nonlinear
             key = Symbol(string(variable), "_min")
-            deviation_threaded = idp_bounds_delta_local[key]
-            @threaded for element in eachelement(solver, cache)
-                deviation = deviation_threaded[stride_size * Threads.threadid()]
+            deviation = idp_bounds_delta_local[key]
+            @batch reduction=(max, deviation) for element in eachelement(solver, cache)
                 for j in eachnode(solver), i in eachnode(solver)
                     var = variable(get_node_vars(u, equations, solver, i, j, element),
                                    equations)
                     deviation = max(deviation,
                                     variable_bounds[key][i, j, element] - var)
                 end
-                deviation_threaded[stride_size * Threads.threadid()] = deviation
             end
+            idp_bounds_delta_local[key] = deviation
         end
     end
 
     for (key, _) in idp_bounds_delta_local
-        # Calculate maximum deviations of all threads
-        idp_bounds_delta_local[key][stride_size] = maximum(idp_bounds_delta_local[key][stride_size * i]
-                                                           for i in 1:Threads.nthreads())
         # Update global maximum deviations
         idp_bounds_delta_global[key] = max(idp_bounds_delta_global[key],
-                                           idp_bounds_delta_local[key][stride_size])
+                                           idp_bounds_delta_local[key])
     end
 
     if save_errors
@@ -92,10 +77,7 @@
             if local_minmax
                 for v in limiter.local_minmax_variables_cons
                     v_string = string(v)
-                    print(f, ", ",
-                          idp_bounds_delta_local[Symbol(v_string, "_min")][stride_size],
-                          ", ",
-                          idp_bounds_delta_local[Symbol(v_string, "_max")][stride_size])
+                    print(f, ", ", idp_bounds_delta_local[Symbol(v_string, "_min")], ", ", idp_bounds_delta_local[Symbol(v_string, "_max")])
                 end
             end
             if positivity
@@ -103,21 +85,17 @@
                     if v in limiter.local_minmax_variables_cons
                         continue
                     end
-                    print(f, ", ",
-                          idp_bounds_delta_local[Symbol(string(v), "_min")][stride_size])
+                    print(f, ", ", idp_bounds_delta_local[Symbol(string(v), "_min")])
                 end
                 for variable in limiter.positivity_variables_nonlinear
-                    print(f, ", ",
-                          idp_bounds_delta_local[Symbol(string(variable), "_min")][stride_size])
+                    print(f, ", ", idp_bounds_delta_local[Symbol(string(variable), "_min")])
                 end
             end
             println(f)
         end
         # Reset local maximum deviations
         for (key, _) in idp_bounds_delta_local
-            for i in 1:Threads.nthreads()
-                idp_bounds_delta_local[key][stride_size * i] = zero(eltype(idp_bounds_delta_local[key][stride_size]))
-            end
+            idp_bounds_delta_local[key] = zero(eltype(idp_bounds_delta_local[key]))
         end
     end
 
diff --git a/src/solvers/dgsem_tree/subcell_limiters_2d.jl b/src/solvers/dgsem_tree/subcell_limiters_2d.jl
index 3f7954c895..9343cee439 100644
--- a/src/solvers/dgsem_tree/subcell_limiters_2d.jl
+++ b/src/solvers/dgsem_tree/subcell_limiters_2d.jl
@@ -18,18 +18,11 @@ function create_cache(limiter::Type{SubcellLimiterIDP}, equations::AbstractEquat
 
     # Memory for bounds checking routine with `BoundsCheckCallback`.
     # Local variable contains the maximum deviation since the last export.
-    # Using a threaded vector to parallelize bounds check.
-    idp_bounds_delta_local = Dict{Symbol, Vector{real(basis)}}()
+    idp_bounds_delta_local = Dict{Symbol, real(basis)}()
     # Global variable contains the total maximum deviation.
     idp_bounds_delta_global = Dict{Symbol, real(basis)}()
-    # Note: False sharing causes critical performance issues on multiple threads when using a vector
-    # of length `Threads.nthreads()`. Initializing a vector of length `n * Threads.nthreads()`
-    # and then only using every n-th entry, fixes the problem and allows proper scaling.
-    # Since there are no processors with caches over 128B, we use `n = 128B / size(uEltype)`
-    stride_size = div(128, sizeof(eltype(basis.nodes))) # = n
     for key in bound_keys
-        idp_bounds_delta_local[key] = [zero(real(basis))
-                                       for _ in 1:(stride_size * Threads.nthreads())]
+        idp_bounds_delta_local[key] = zero(real(basis))
         idp_bounds_delta_global[key] = zero(real(basis))
     end
 

From 48586a121efe2c1039bcd5fb82b05fe32080dbf9 Mon Sep 17 00:00:00 2001
From: bennibolm <benjamin.bolm@gmx.de>
Date: Mon, 25 Mar 2024 13:26:06 +0100
Subject: [PATCH 2/4] fmt

---
 src/callbacks_stage/subcell_bounds_check_2d.jl | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/src/callbacks_stage/subcell_bounds_check_2d.jl b/src/callbacks_stage/subcell_bounds_check_2d.jl
index 70f37089ec..638bbf8003 100644
--- a/src/callbacks_stage/subcell_bounds_check_2d.jl
+++ b/src/callbacks_stage/subcell_bounds_check_2d.jl
@@ -77,7 +77,8 @@
             if local_minmax
                 for v in limiter.local_minmax_variables_cons
                     v_string = string(v)
-                    print(f, ", ", idp_bounds_delta_local[Symbol(v_string, "_min")], ", ", idp_bounds_delta_local[Symbol(v_string, "_max")])
+                    print(f, ", ", idp_bounds_delta_local[Symbol(v_string, "_min")],
+                          ", ", idp_bounds_delta_local[Symbol(v_string, "_max")])
                 end
             end
             if positivity
@@ -88,7 +89,8 @@
                     print(f, ", ", idp_bounds_delta_local[Symbol(string(v), "_min")])
                 end
                 for variable in limiter.positivity_variables_nonlinear
-                    print(f, ", ", idp_bounds_delta_local[Symbol(string(variable), "_min")])
+                    print(f, ", ",
+                          idp_bounds_delta_local[Symbol(string(variable), "_min")])
                 end
             end
             println(f)

From 8a38052966a88de9f1826528cb894c8924c9c18c Mon Sep 17 00:00:00 2001
From: bennibolm <benjamin.bolm@gmx.de>
Date: Mon, 25 Mar 2024 16:38:38 +0100
Subject: [PATCH 3/4] Adapt compat bound for Polyester.jl

---
 Project.toml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Project.toml b/Project.toml
index 27df49ed4f..3db90c9928 100644
--- a/Project.toml
+++ b/Project.toml
@@ -75,7 +75,7 @@ MuladdMacro = "0.2.2"
 Octavian = "0.3.21"
 OffsetArrays = "1.12"
 P4est = "0.4.9"
-Polyester = "0.7.5"
+Polyester = "0.7.10"
 PrecompileTools = "1.1"
 Preferences = "1.3"
 Printf = "1"

From 51ad911282dcc1f4704ee7b7a973fb98bcf7beba Mon Sep 17 00:00:00 2001
From: bennibolm <benjamin.bolm@gmx.de>
Date: Mon, 25 Mar 2024 20:54:07 +0100
Subject: [PATCH 4/4] Add comments

---
 src/callbacks_stage/subcell_bounds_check_2d.jl | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/src/callbacks_stage/subcell_bounds_check_2d.jl b/src/callbacks_stage/subcell_bounds_check_2d.jl
index 638bbf8003..3a56ea71f6 100644
--- a/src/callbacks_stage/subcell_bounds_check_2d.jl
+++ b/src/callbacks_stage/subcell_bounds_check_2d.jl
@@ -12,6 +12,14 @@
     (; variable_bounds) = limiter.cache.subcell_limiter_coefficients
     (; idp_bounds_delta_local, idp_bounds_delta_global) = limiter.cache
 
+    # Note: In order to get the maximum deviation from the target bounds, this bounds check
+    # requires a reduction in every RK stage and for every enabled limiting option. To make
+    # this thread-parallel, we use Polyester.jl's (v0.7.10 or newer) `@batch reduction`
+    # functionality.
+    # Although `@threaded` and `@batch` are currently used equivalently in Trixi.jl, we use
+    # `@batch` here so that a possible redefinition of `@threaded` does not break this code.
+    # See also https://github.com/trixi-framework/Trixi.jl/pull/1888#discussion_r1537785293.
+
     if local_minmax
         for v in limiter.local_minmax_variables_cons
             v_string = string(v)
@@ -23,6 +31,10 @@
                                                                                                      cache)
                 for j in eachnode(solver), i in eachnode(solver)
                     var = u[v, i, j, element]
+                    # Note: Deviations are stored as non-negative values, so `max` is used
+                    # for both bounds. The opposite directions of the two bounds are handled
+                    # by the sign of the difference: `bound - var` for the lower bound and
+                    # `var - bound` for the upper bound.
                     deviation_min = max(deviation_min,
                                         variable_bounds[key_min][i, j, element] - var)
                     deviation_max = max(deviation_max,