From d3a9afe0c5ff6476b97ac2b717090fa6c3ee61fa Mon Sep 17 00:00:00 2001 From: bennibolm Date: Mon, 25 Mar 2024 13:21:27 +0100 Subject: [PATCH 1/4] Use @batch reduction functionality for bounds check --- docs/src/performance.md | 11 ---- .../subcell_bounds_check_2d.jl | 56 ++++++------------- src/solvers/dgsem_tree/subcell_limiters_2d.jl | 11 +--- 3 files changed, 19 insertions(+), 59 deletions(-) diff --git a/docs/src/performance.md b/docs/src/performance.md index 40970e58c5..9f81d3c3d8 100644 --- a/docs/src/performance.md +++ b/docs/src/performance.md @@ -282,14 +282,3 @@ requires. It can thus be seen as a proxy for "energy used" and, as an extension, timing result, you need to set the analysis interval such that the `AnalysisCallback` is invoked at least once during the course of the simulation and discard the first PID value. - -## Performance issues with multi-threaded reductions -[False sharing](https://en.wikipedia.org/wiki/False_sharing) is a known performance issue -for systems with distributed caches. It also occurred for the implementation of a thread -parallel bounds checking routine for the subcell IDP limiting -in [PR #1736](https://github.com/trixi-framework/Trixi.jl/pull/1736). -After some [testing and discussion](https://github.com/trixi-framework/Trixi.jl/pull/1736#discussion_r1423881895), -it turned out that initializing a vector of length `n * Threads.nthreads()` and only using every -n-th entry instead of a vector of length `Threads.nthreads()` fixes the problem. -Since there are no processors with caches over 128B, we use `n = 128B / size(uEltype)`. -Now, the bounds checking routine of the IDP limiting scales as hoped. diff --git a/src/callbacks_stage/subcell_bounds_check_2d.jl b/src/callbacks_stage/subcell_bounds_check_2d.jl index 19d73968c9..70f37089ec 100644 --- a/src/callbacks_stage/subcell_bounds_check_2d.jl +++ b/src/callbacks_stage/subcell_bounds_check_2d.jl @@ -12,25 +12,15 @@ (; variable_bounds) = limiter.cache.subcell_limiter_coefficients (; idp_bounds_delta_local, idp_bounds_delta_global) = limiter.cache - # Note: Accessing the threaded memory vector `idp_bounds_delta_local` with - # `deviation = idp_bounds_delta_local[key][Threads.threadid()]` causes critical performance - # issues due to False Sharing. - # Initializing a vector with n times the length and using every n-th entry fixes this - # problem and allows proper scaling: - # `deviation = idp_bounds_delta_local[key][n * Threads.threadid()]` - # Since there are no processors with caches over 128B, we use `n = 128B / size(uEltype)` - stride_size = div(128, sizeof(eltype(u))) # = n - if local_minmax for v in limiter.local_minmax_variables_cons v_string = string(v) key_min = Symbol(v_string, "_min") key_max = Symbol(v_string, "_max") - deviation_min_threaded = idp_bounds_delta_local[key_min] - deviation_max_threaded = idp_bounds_delta_local[key_max] - @threaded for element in eachelement(solver, cache) - deviation_min = deviation_min_threaded[stride_size * Threads.threadid()] - deviation_max = deviation_max_threaded[stride_size * Threads.threadid()] + deviation_min = idp_bounds_delta_local[key_min] + deviation_max = idp_bounds_delta_local[key_max] + @batch reduction=((max, deviation_min), (max, deviation_max)) for element in eachelement(solver, + cache) for j in eachnode(solver), i in eachnode(solver) var = u[v, i, j, element] deviation_min = max(deviation_min, @@ -38,9 +28,9 @@ deviation_max = max(deviation_max, var - variable_bounds[key_max][i, j, element]) end - deviation_min_threaded[stride_size * Threads.threadid()] = deviation_min - deviation_max_threaded[stride_size * Threads.threadid()] = deviation_max end + idp_bounds_delta_local[key_min] = deviation_min + idp_bounds_delta_local[key_max] = deviation_max end end if positivity @@ -49,40 +39,35 @@ continue end key = Symbol(string(v), "_min") - deviation_threaded = idp_bounds_delta_local[key] - @threaded for element in eachelement(solver, cache) - deviation = deviation_threaded[stride_size * Threads.threadid()] + deviation = idp_bounds_delta_local[key] + @batch reduction=(max, deviation) for element in eachelement(solver, cache) for j in eachnode(solver), i in eachnode(solver) var = u[v, i, j, element] deviation = max(deviation, variable_bounds[key][i, j, element] - var) end - deviation_threaded[stride_size * Threads.threadid()] = deviation end + idp_bounds_delta_local[key] = deviation end for variable in limiter.positivity_variables_nonlinear key = Symbol(string(variable), "_min") - deviation_threaded = idp_bounds_delta_local[key] - @threaded for element in eachelement(solver, cache) - deviation = deviation_threaded[stride_size * Threads.threadid()] + deviation = idp_bounds_delta_local[key] + @batch reduction=(max, deviation) for element in eachelement(solver, cache) for j in eachnode(solver), i in eachnode(solver) var = variable(get_node_vars(u, equations, solver, i, j, element), equations) deviation = max(deviation, variable_bounds[key][i, j, element] - var) end - deviation_threaded[stride_size * Threads.threadid()] = deviation end + idp_bounds_delta_local[key] = deviation end end for (key, _) in idp_bounds_delta_local - # Calculate maximum deviations of all threads - idp_bounds_delta_local[key][stride_size] = maximum(idp_bounds_delta_local[key][stride_size * i] - for i in 1:Threads.nthreads()) # Update global maximum deviations idp_bounds_delta_global[key] = max(idp_bounds_delta_global[key], - idp_bounds_delta_local[key][stride_size]) + idp_bounds_delta_local[key]) end if save_errors @@ -92,10 +77,7 @@ if local_minmax for v in limiter.local_minmax_variables_cons v_string = string(v) - print(f, ", ", - idp_bounds_delta_local[Symbol(v_string, "_min")][stride_size], - ", ", - idp_bounds_delta_local[Symbol(v_string, "_max")][stride_size]) + print(f, ", ", idp_bounds_delta_local[Symbol(v_string, "_min")], ", ", idp_bounds_delta_local[Symbol(v_string, "_max")]) end end if positivity @@ -103,21 +85,17 @@ if v in limiter.local_minmax_variables_cons continue end - print(f, ", ", - idp_bounds_delta_local[Symbol(string(v), "_min")][stride_size]) + print(f, ", ", idp_bounds_delta_local[Symbol(string(v), "_min")]) end for variable in limiter.positivity_variables_nonlinear - print(f, ", ", - idp_bounds_delta_local[Symbol(string(variable), "_min")][stride_size]) + print(f, ", ", idp_bounds_delta_local[Symbol(string(variable), "_min")]) end end println(f) end # Reset local maximum deviations for (key, _) in idp_bounds_delta_local - for i in 1:Threads.nthreads() - idp_bounds_delta_local[key][stride_size * i] = zero(eltype(idp_bounds_delta_local[key][stride_size])) - end + idp_bounds_delta_local[key] = zero(eltype(idp_bounds_delta_local[key])) end end diff --git a/src/solvers/dgsem_tree/subcell_limiters_2d.jl b/src/solvers/dgsem_tree/subcell_limiters_2d.jl index 3f7954c895..9343cee439 100644 --- a/src/solvers/dgsem_tree/subcell_limiters_2d.jl +++ b/src/solvers/dgsem_tree/subcell_limiters_2d.jl @@ -18,18 +18,11 @@ function create_cache(limiter::Type{SubcellLimiterIDP}, equations::AbstractEquat # Memory for bounds checking routine with `BoundsCheckCallback`. # Local variable contains the maximum deviation since the last export. - # Using a threaded vector to parallelize bounds check. - idp_bounds_delta_local = Dict{Symbol, Vector{real(basis)}}() + idp_bounds_delta_local = Dict{Symbol, real(basis)}() # Global variable contains the total maximum deviation. idp_bounds_delta_global = Dict{Symbol, real(basis)}() - # Note: False sharing causes critical performance issues on multiple threads when using a vector - # of length `Threads.nthreads()`. Initializing a vector of length `n * Threads.nthreads()` - # and then only using every n-th entry, fixes the problem and allows proper scaling. - # Since there are no processors with caches over 128B, we use `n = 128B / size(uEltype)` - stride_size = div(128, sizeof(eltype(basis.nodes))) # = n for key in bound_keys - idp_bounds_delta_local[key] = [zero(real(basis)) - for _ in 1:(stride_size * Threads.nthreads())] + idp_bounds_delta_local[key] = zero(real(basis)) idp_bounds_delta_global[key] = zero(real(basis)) end From 48586a121efe2c1039bcd5fb82b05fe32080dbf9 Mon Sep 17 00:00:00 2001 From: bennibolm Date: Mon, 25 Mar 2024 13:26:06 +0100 Subject: [PATCH 2/4] fmt --- src/callbacks_stage/subcell_bounds_check_2d.jl | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/callbacks_stage/subcell_bounds_check_2d.jl b/src/callbacks_stage/subcell_bounds_check_2d.jl index 70f37089ec..638bbf8003 100644 --- a/src/callbacks_stage/subcell_bounds_check_2d.jl +++ b/src/callbacks_stage/subcell_bounds_check_2d.jl @@ -77,7 +77,8 @@ if local_minmax for v in limiter.local_minmax_variables_cons v_string = string(v) - print(f, ", ", idp_bounds_delta_local[Symbol(v_string, "_min")], ", ", idp_bounds_delta_local[Symbol(v_string, "_max")]) + print(f, ", ", idp_bounds_delta_local[Symbol(v_string, "_min")], + ", ", idp_bounds_delta_local[Symbol(v_string, "_max")]) end end if positivity @@ -88,7 +89,8 @@ print(f, ", ", idp_bounds_delta_local[Symbol(string(v), "_min")]) end for variable in limiter.positivity_variables_nonlinear - print(f, ", ", idp_bounds_delta_local[Symbol(string(variable), "_min")]) + print(f, ", ", + idp_bounds_delta_local[Symbol(string(variable), "_min")]) end end println(f) From 8a38052966a88de9f1826528cb894c8924c9c18c Mon Sep 17 00:00:00 2001 From: bennibolm Date: Mon, 25 Mar 2024 16:38:38 +0100 Subject: [PATCH 3/4] Adapt compat bound for Polyester.jl --- Project.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Project.toml b/Project.toml index 27df49ed4f..3db90c9928 100644 --- a/Project.toml +++ b/Project.toml @@ -75,7 +75,7 @@ MuladdMacro = "0.2.2" Octavian = "0.3.21" OffsetArrays = "1.12" P4est = "0.4.9" -Polyester = "0.7.5" +Polyester = "0.7.10" PrecompileTools = "1.1" Preferences = "1.3" Printf = "1" From 51ad911282dcc1f4704ee7b7a973fb98bcf7beba Mon Sep 17 00:00:00 2001 From: bennibolm Date: Mon, 25 Mar 2024 20:54:07 +0100 Subject: [PATCH 4/4] Add comments --- src/callbacks_stage/subcell_bounds_check_2d.jl | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/src/callbacks_stage/subcell_bounds_check_2d.jl b/src/callbacks_stage/subcell_bounds_check_2d.jl index 638bbf8003..3a56ea71f6 100644 --- a/src/callbacks_stage/subcell_bounds_check_2d.jl +++ b/src/callbacks_stage/subcell_bounds_check_2d.jl @@ -12,6 +12,14 @@ (; variable_bounds) = limiter.cache.subcell_limiter_coefficients (; idp_bounds_delta_local, idp_bounds_delta_global) = limiter.cache + # Note: In order to get the maximum deviation from the target bounds, this bounds check + # requires a reduction in every RK stage and for every enabled limiting option. To make + # this Thread-parallel we are using Polyester.jl's (at least v0.7.10) `@batch reduction` + # functionality. + # Although `@threaded` and `@batch` are currently used equivalently in Trixi.jl, we use + # `@batch` here to allow a possible redefinition of `@threaded` without creating errors here. + # See also https://github.com/trixi-framework/Trixi.jl/pull/1888#discussion_r1537785293. + if local_minmax for v in limiter.local_minmax_variables_cons v_string = string(v) @@ -23,6 +31,10 @@ cache) for j in eachnode(solver), i in eachnode(solver) var = u[v, i, j, element] + # Note: We always save the absolute deviations >= 0 and therefore use the + # `max` operator for the lower and upper bound. The different directions of + # upper and lower bound are considered in their calculations with a + # different sign. deviation_min = max(deviation_min, variable_bounds[key_min][i, j, element] - var) deviation_max = max(deviation_max,