From df197b8f8a2fe32e12b62d4a0048d0c7a8e45168 Mon Sep 17 00:00:00 2001 From: bennibolm Date: Tue, 14 Nov 2023 11:43:45 +0100 Subject: [PATCH 1/8] Implement parallel bounds check for IDP limiting --- src/callbacks_stage/subcell_bounds_check.jl | 10 ++- .../subcell_bounds_check_2d.jl | 77 ++++++++++++------- src/solvers/dgsem_tree/subcell_limiters_2d.jl | 13 ++-- 3 files changed, 63 insertions(+), 37 deletions(-) diff --git a/src/callbacks_stage/subcell_bounds_check.jl b/src/callbacks_stage/subcell_bounds_check.jl index d7e30ab162..d77998e810 100644 --- a/src/callbacks_stage/subcell_bounds_check.jl +++ b/src/callbacks_stage/subcell_bounds_check.jl @@ -118,7 +118,7 @@ end @inline function finalize_callback(callback::BoundsCheckCallback, semi, limiter::SubcellLimiterIDP) (; local_minmax, positivity) = limiter - (; idp_bounds_delta) = limiter.cache + (; idp_bounds_delta_threaded) = limiter.cache variables = varnames(cons2cons, semi.equations) println("─"^100) @@ -128,8 +128,10 @@ end for v in limiter.local_minmax_variables_cons v_string = string(v) println("$(variables[v]):") - println("-lower bound: ", idp_bounds_delta[Symbol(v_string, "_min")][2]) - println("-upper bound: ", idp_bounds_delta[Symbol(v_string, "_max")][2]) + println("- lower bound: ", + idp_bounds_delta_threaded[Symbol(v_string, "_min")][1][2]) + println("- upper bound: ", + idp_bounds_delta_threaded[Symbol(v_string, "_max")][1][2]) end end if positivity @@ -138,7 +140,7 @@ end continue end println(string(variables[v]) * ":\n- positivity: ", - idp_bounds_delta[Symbol(string(v), "_min")][2]) + idp_bounds_delta_threaded[Symbol(string(v), "_min")][1][2]) end end println("─"^100 * "\n") diff --git a/src/callbacks_stage/subcell_bounds_check_2d.jl b/src/callbacks_stage/subcell_bounds_check_2d.jl index d52eb6edb9..0cdf3df5f1 100644 --- a/src/callbacks_stage/subcell_bounds_check_2d.jl +++ b/src/callbacks_stage/subcell_bounds_check_2d.jl @@ -10,26 +10,38 @@ time, iter, output_directory, save_errors) (; local_minmax, positivity) = solver.volume_integral.limiter (; variable_bounds) = limiter.cache.subcell_limiter_coefficients - (; idp_bounds_delta) = limiter.cache + (; idp_bounds_delta_threaded) = limiter.cache if local_minmax for v in limiter.local_minmax_variables_cons v_string = string(v) key_min = Symbol(v_string, "_min") key_max = Symbol(v_string, "_max") - deviation_min = idp_bounds_delta[key_min] - deviation_max = idp_bounds_delta[key_max] - for element in eachelement(solver, cache), j in eachnode(solver), - i in eachnode(solver) - - var = u[v, i, j, element] - deviation_min[1] = max(deviation_min[1], - variable_bounds[key_min][i, j, element] - var) - deviation_max[1] = max(deviation_max[1], - var - variable_bounds[key_max][i, j, element]) + deviation_min_threaded = idp_bounds_delta_threaded[key_min] + deviation_max_threaded = idp_bounds_delta_threaded[key_max] + @threaded for element in eachelement(solver, cache) + deviation_min = deviation_min_threaded[Threads.threadid()] + deviation_max = deviation_max_threaded[Threads.threadid()] + for j in eachnode(solver), i in eachnode(solver) + var = u[v, i, j, element] + deviation_min[1] = max(deviation_min[1], + variable_bounds[key_min][i, j, element] - + var) + deviation_max[1] = max(deviation_max[1], + var - + variable_bounds[key_max][i, j, element]) + end end - deviation_min[2] = max(deviation_min[2], deviation_min[1]) - deviation_max[2] = max(deviation_max[2], deviation_max[1]) + for i in 2:Threads.nthreads() + deviation_min_threaded[1][1] = max(deviation_min_threaded[1][1], + deviation_min_threaded[i][1]) + deviation_max_threaded[1][1] = max(deviation_max_threaded[1][1], + deviation_max_threaded[i][1]) + end + deviation_min_threaded[1][2] = max(deviation_min_threaded[1][2], + deviation_min_threaded[1][1]) + deviation_max_threaded[1][2] = max(deviation_max_threaded[1][2], + deviation_max_threaded[1][1]) end end if positivity @@ -38,15 +50,21 @@ continue end key = Symbol(string(v), "_min") - deviation = idp_bounds_delta[key] - for element in eachelement(solver, cache), j in eachnode(solver), - i in eachnode(solver) - - var = u[v, i, j, element] - deviation[1] = max(deviation[1], - variable_bounds[key][i, j, element] - var) + deviation_threaded = idp_bounds_delta_threaded[key] + @threaded for element in eachelement(solver, cache) + deviation = deviation_threaded[Threads.threadid()] + for j in eachnode(solver), i in eachnode(solver) + var = u[v, i, j, element] + deviation[1] = max(deviation[1], + variable_bounds[key][i, j, element] - var) + end + end + for i in 2:Threads.nthreads() + deviation_threaded[1][1] = max(deviation_threaded[1][1], + deviation_threaded[i][1]) end - deviation[2] = max(deviation[2], deviation[1]) + deviation_threaded[1][2] = max(deviation_threaded[1][2], + deviation_threaded[1][1]) end end if save_errors @@ -56,8 +74,10 @@ if local_minmax for v in limiter.local_minmax_variables_cons v_string = string(v) - print(f, ", ", idp_bounds_delta[Symbol(v_string, "_min")][1], ", ", - idp_bounds_delta[Symbol(v_string, "_max")][1]) + print(f, ", ", + idp_bounds_delta_threaded[Symbol(v_string, "_min")][1][1], + ", ", + idp_bounds_delta_threaded[Symbol(v_string, "_max")][1][1]) end end if positivity @@ -65,14 +85,17 @@ if v in limiter.local_minmax_variables_cons continue end - print(f, ", ", idp_bounds_delta[Symbol(string(v), "_min")][1]) + print(f, ", ", + idp_bounds_delta_threaded[Symbol(string(v), "_min")][1][1]) end end println(f) end - # Reset first entries of idp_bounds_delta - for (key, _) in idp_bounds_delta - idp_bounds_delta[key][1] = zero(eltype(idp_bounds_delta[key][1])) + # Reset first entries of idp_bounds_delta_threaded + for (key, _) in idp_bounds_delta_threaded + for i in 1:Threads.nthreads() + idp_bounds_delta_threaded[key][i][1] = zero(eltype(idp_bounds_delta_threaded[key][i][1])) + end end end diff --git a/src/solvers/dgsem_tree/subcell_limiters_2d.jl b/src/solvers/dgsem_tree/subcell_limiters_2d.jl index bc69e55f26..025f4ec6e1 100644 --- a/src/solvers/dgsem_tree/subcell_limiters_2d.jl +++ b/src/solvers/dgsem_tree/subcell_limiters_2d.jl @@ -13,22 +13,23 @@ function create_cache(limiter::Type{SubcellLimiterIDP}, equations::AbstractEquat nnodes(basis), bound_keys) - # Memory for bounds checking routine with `BoundsCheckCallback`. + # Threaded memory for bounds checking routine with `BoundsCheckCallback`. # The first entry of each vector contains the maximum deviation since the last export. - # The second one contains the total maximum deviation. - idp_bounds_delta = Dict{Symbol, Vector{real(basis)}}() + # In the second entry, the total maximum deviation is saved. + idp_bounds_delta_threaded = Dict{Symbol, Vector{Vector{real(basis)}}}() for key in bound_keys - idp_bounds_delta[key] = zeros(real(basis), 2) + idp_bounds_delta_threaded[key] = [zeros(real(basis), 2) + for _ in 1:Threads.nthreads()] end - return (; subcell_limiter_coefficients, idp_bounds_delta) + return (; subcell_limiter_coefficients, idp_bounds_delta_threaded) end function (limiter::SubcellLimiterIDP)(u::AbstractArray{<:Any, 4}, semi, dg::DGSEM, t, dt; kwargs...) @unpack alpha = limiter.cache.subcell_limiter_coefficients - alpha .= zero(eltype(alpha)) + @trixi_timeit timer() "reset alpha" reset_du!(alpha, dg, semi.cache) if limiter.local_minmax @trixi_timeit timer() "local min/max limiting" idp_local_minmax!(alpha, limiter, From e1bd68da84c2974bfe07eb8c1dfaa3321fee778f Mon Sep 17 00:00:00 2001 From: bennibolm Date: Tue, 14 Nov 2023 14:39:46 +0100 Subject: [PATCH 2/8] Add missing warning as "experimental" from last PR --- src/solvers/dgsem_tree/dg_2d_subcell_limiters.jl | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/solvers/dgsem_tree/dg_2d_subcell_limiters.jl b/src/solvers/dgsem_tree/dg_2d_subcell_limiters.jl index 2fc62f548d..9af8b65b4c 100644 --- a/src/solvers/dgsem_tree/dg_2d_subcell_limiters.jl +++ b/src/solvers/dgsem_tree/dg_2d_subcell_limiters.jl @@ -470,6 +470,9 @@ end For subcell limiting, the calculation of local bounds for non-periodic domains require the boundary outer state. This function returns the boundary value at time `t` and for node with spatial indices `indices`. + +!!! warning "Experimental implementation" + This is an experimental feature and may change in future releases. """ @inline function get_boundary_outer_state(boundary_condition::BoundaryConditionDirichlet, cache, t, equations, dg, indices...) From f241139d53d23ba76b843047b3f1fa24014ecc89 Mon Sep 17 00:00:00 2001 From: bennibolm Date: Mon, 20 Nov 2023 15:32:29 +0100 Subject: [PATCH 3/8] Updating `idp_bounds_delta_threaded` for all bounds at once --- .../subcell_bounds_check_2d.jl | 28 ++++++++----------- 1 file changed, 12 insertions(+), 16 deletions(-) diff --git a/src/callbacks_stage/subcell_bounds_check_2d.jl b/src/callbacks_stage/subcell_bounds_check_2d.jl index 0cdf3df5f1..a23bbf4282 100644 --- a/src/callbacks_stage/subcell_bounds_check_2d.jl +++ b/src/callbacks_stage/subcell_bounds_check_2d.jl @@ -32,16 +32,6 @@ variable_bounds[key_max][i, j, element]) end end - for i in 2:Threads.nthreads() - deviation_min_threaded[1][1] = max(deviation_min_threaded[1][1], - deviation_min_threaded[i][1]) - deviation_max_threaded[1][1] = max(deviation_max_threaded[1][1], - deviation_max_threaded[i][1]) - end - deviation_min_threaded[1][2] = max(deviation_min_threaded[1][2], - deviation_min_threaded[1][1]) - deviation_max_threaded[1][2] = max(deviation_max_threaded[1][2], - deviation_max_threaded[1][1]) end end if positivity @@ -59,14 +49,20 @@ variable_bounds[key][i, j, element] - var) end end - for i in 2:Threads.nthreads() - deviation_threaded[1][1] = max(deviation_threaded[1][1], - deviation_threaded[i][1]) - end - deviation_threaded[1][2] = max(deviation_threaded[1][2], - deviation_threaded[1][1]) end end + + for (key, _) in idp_bounds_delta_threaded + # Calculate maximum deviations of all threads + for i in 2:Threads.nthreads() + idp_bounds_delta_threaded[key][1][1] = max(idp_bounds_delta_threaded[key][1][1], + idp_bounds_delta_threaded[key][i][1]) + end + # Update global maximum deviations + idp_bounds_delta_threaded[key][1][2] = max(idp_bounds_delta_threaded[key][1][2], + idp_bounds_delta_threaded[key][1][1]) + end + if save_errors # Print to output file open("$output_directory/deviations.txt", "a") do f From 78957b71ee1fc54f23361598b95c7a5e4536ed93 Mon Sep 17 00:00:00 2001 From: bennibolm Date: Wed, 6 Dec 2023 15:08:01 +0100 Subject: [PATCH 4/8] Revise parallel memory structure --- src/callbacks_stage/subcell_bounds_check.jl | 8 +-- .../subcell_bounds_check_2d.jl | 54 +++++++++---------- src/solvers/dgsem_tree/subcell_limiters_2d.jl | 17 +++--- 3 files changed, 38 insertions(+), 41 deletions(-) diff --git a/src/callbacks_stage/subcell_bounds_check.jl b/src/callbacks_stage/subcell_bounds_check.jl index d77998e810..9f34a6b3b4 100644 --- a/src/callbacks_stage/subcell_bounds_check.jl +++ b/src/callbacks_stage/subcell_bounds_check.jl @@ -118,7 +118,7 @@ end @inline function finalize_callback(callback::BoundsCheckCallback, semi, limiter::SubcellLimiterIDP) (; local_minmax, positivity) = limiter - (; idp_bounds_delta_threaded) = limiter.cache + (; idp_bounds_delta_global) = limiter.cache variables = varnames(cons2cons, semi.equations) println("─"^100) @@ -129,9 +129,9 @@ end v_string = string(v) println("$(variables[v]):") println("- lower bound: ", - idp_bounds_delta_threaded[Symbol(v_string, "_min")][1][2]) + idp_bounds_delta_global[Symbol(v_string, "_min")]) println("- upper bound: ", - idp_bounds_delta_threaded[Symbol(v_string, "_max")][1][2]) + idp_bounds_delta_global[Symbol(v_string, "_max")]) end end if positivity @@ -140,7 +140,7 @@ end continue end println(string(variables[v]) * ":\n- positivity: ", - idp_bounds_delta_threaded[Symbol(string(v), "_min")][1][2]) + idp_bounds_delta_global[Symbol(string(v), "_min")]) end end println("─"^100 * "\n") diff --git a/src/callbacks_stage/subcell_bounds_check_2d.jl b/src/callbacks_stage/subcell_bounds_check_2d.jl index a23bbf4282..036107d8aa 100644 --- a/src/callbacks_stage/subcell_bounds_check_2d.jl +++ b/src/callbacks_stage/subcell_bounds_check_2d.jl @@ -10,27 +10,27 @@ time, iter, output_directory, save_errors) (; local_minmax, positivity) = solver.volume_integral.limiter (; variable_bounds) = limiter.cache.subcell_limiter_coefficients - (; idp_bounds_delta_threaded) = limiter.cache + (; idp_bounds_delta_local, idp_bounds_delta_global) = limiter.cache if local_minmax for v in limiter.local_minmax_variables_cons v_string = string(v) key_min = Symbol(v_string, "_min") key_max = Symbol(v_string, "_max") - deviation_min_threaded = idp_bounds_delta_threaded[key_min] - deviation_max_threaded = idp_bounds_delta_threaded[key_max] + deviation_min_threaded = idp_bounds_delta_local[key_min] + deviation_max_threaded = idp_bounds_delta_local[key_max] @threaded for element in eachelement(solver, cache) deviation_min = deviation_min_threaded[Threads.threadid()] deviation_max = deviation_max_threaded[Threads.threadid()] for j in eachnode(solver), i in eachnode(solver) var = u[v, i, j, element] - deviation_min[1] = max(deviation_min[1], - variable_bounds[key_min][i, j, element] - - var) - deviation_max[1] = max(deviation_max[1], - var - - variable_bounds[key_max][i, j, element]) + deviation_min = max(deviation_min, + variable_bounds[key_min][i, j, element] - var) + deviation_max = max(deviation_max, + var - variable_bounds[key_max][i, j, element]) end + deviation_min_threaded[Threads.threadid()] = deviation_min + deviation_max_threaded[Threads.threadid()] = deviation_max end end end @@ -40,27 +40,25 @@ continue end key = Symbol(string(v), "_min") - deviation_threaded = idp_bounds_delta_threaded[key] + deviation_threaded = idp_bounds_delta_local[key] @threaded for element in eachelement(solver, cache) deviation = deviation_threaded[Threads.threadid()] for j in eachnode(solver), i in eachnode(solver) var = u[v, i, j, element] - deviation[1] = max(deviation[1], - variable_bounds[key][i, j, element] - var) + deviation = max(deviation, + variable_bounds[key][i, j, element] - var) end + deviation_threaded[Threads.threadid()] = deviation end end end - for (key, _) in idp_bounds_delta_threaded - # Calculate maximum deviations of all threads - for i in 2:Threads.nthreads() - idp_bounds_delta_threaded[key][1][1] = max(idp_bounds_delta_threaded[key][1][1], - idp_bounds_delta_threaded[key][i][1]) - end + for (key, _) in idp_bounds_delta_local + # Reduce threaded local maximum deviations. Save in first entry. + idp_bounds_delta_local[key][1] = reduce(max, idp_bounds_delta_local[key]) # Update global maximum deviations - idp_bounds_delta_threaded[key][1][2] = max(idp_bounds_delta_threaded[key][1][2], - idp_bounds_delta_threaded[key][1][1]) + idp_bounds_delta_global[key] = max(idp_bounds_delta_global[key], + idp_bounds_delta_local[key][1]) end if save_errors @@ -71,9 +69,8 @@ for v in limiter.local_minmax_variables_cons v_string = string(v) print(f, ", ", - idp_bounds_delta_threaded[Symbol(v_string, "_min")][1][1], - ", ", - idp_bounds_delta_threaded[Symbol(v_string, "_max")][1][1]) + idp_bounds_delta_local[Symbol(v_string, "_min")][1], ", ", + idp_bounds_delta_local[Symbol(v_string, "_max")][1]) end end if positivity @@ -81,17 +78,14 @@ if v in limiter.local_minmax_variables_cons continue end - print(f, ", ", - idp_bounds_delta_threaded[Symbol(string(v), "_min")][1][1]) + print(f, ", ", idp_bounds_delta_local[Symbol(string(v), "_min")][1]) end end println(f) end - # Reset first entries of idp_bounds_delta_threaded - for (key, _) in idp_bounds_delta_threaded - for i in 1:Threads.nthreads() - idp_bounds_delta_threaded[key][i][1] = zero(eltype(idp_bounds_delta_threaded[key][i][1])) - end + # Reset local maximum deviations + for (key, _) in idp_bounds_delta_local + idp_bounds_delta_local[key] .= zero(eltype(idp_bounds_delta_local[key][1])) end end diff --git a/src/solvers/dgsem_tree/subcell_limiters_2d.jl b/src/solvers/dgsem_tree/subcell_limiters_2d.jl index 025f4ec6e1..e41fffd502 100644 --- a/src/solvers/dgsem_tree/subcell_limiters_2d.jl +++ b/src/solvers/dgsem_tree/subcell_limiters_2d.jl @@ -13,16 +13,19 @@ function create_cache(limiter::Type{SubcellLimiterIDP}, equations::AbstractEquat nnodes(basis), bound_keys) - # Threaded memory for bounds checking routine with `BoundsCheckCallback`. - # The first entry of each vector contains the maximum deviation since the last export. - # In the second entry, the total maximum deviation is saved. - idp_bounds_delta_threaded = Dict{Symbol, Vector{Vector{real(basis)}}}() + # Memory for bounds checking routine with `BoundsCheckCallback`. + # Local variable contains the maximum deviation since the last export. + # Using threaded variable to parallelize bounds check. + idp_bounds_delta_local = Dict{Symbol, Vector{real(basis)}}() + # Global variable contains the total maximum deviation. + idp_bounds_delta_global = Dict{Symbol, real(basis)}() for key in bound_keys - idp_bounds_delta_threaded[key] = [zeros(real(basis), 2) - for _ in 1:Threads.nthreads()] + idp_bounds_delta_local[key] = [zero(real(basis)) for _ in 1:Threads.nthreads()] + idp_bounds_delta_global[key] = zero(real(basis)) end - return (; subcell_limiter_coefficients, idp_bounds_delta_threaded) + return (; subcell_limiter_coefficients, idp_bounds_delta_local, + idp_bounds_delta_global) end function (limiter::SubcellLimiterIDP)(u::AbstractArray{<:Any, 4}, semi, dg::DGSEM, t, From 19151b2c9711cf65741d940d2b2c20fa75f370ea Mon Sep 17 00:00:00 2001 From: bennibolm Date: Thu, 7 Dec 2023 15:43:21 +0100 Subject: [PATCH 5/8] Using maximum instead of reduce --- src/callbacks_stage/subcell_bounds_check_2d.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/callbacks_stage/subcell_bounds_check_2d.jl b/src/callbacks_stage/subcell_bounds_check_2d.jl index 036107d8aa..34ff789e3b 100644 --- a/src/callbacks_stage/subcell_bounds_check_2d.jl +++ b/src/callbacks_stage/subcell_bounds_check_2d.jl @@ -55,7 +55,7 @@ for (key, _) in idp_bounds_delta_local # Reduce threaded local maximum deviations. Save in first entry. - idp_bounds_delta_local[key][1] = reduce(max, idp_bounds_delta_local[key]) + idp_bounds_delta_local[key][1] = maximum(idp_bounds_delta_local[key]) # Update global maximum deviations idp_bounds_delta_global[key] = max(idp_bounds_delta_global[key], idp_bounds_delta_local[key][1]) From 7d1648a738c77fee3aecb9c7349998e1e6bb6ed7 Mon Sep 17 00:00:00 2001 From: bennibolm Date: Wed, 20 Dec 2023 14:37:46 +0100 Subject: [PATCH 6/8] Expand vector length to fix False Sharing problem --- .../subcell_bounds_check_2d.jl | 38 ++++++++++++------- src/solvers/dgsem_tree/subcell_limiters_2d.jl | 8 +++- 2 files changed, 31 insertions(+), 15 deletions(-) diff --git a/src/callbacks_stage/subcell_bounds_check_2d.jl b/src/callbacks_stage/subcell_bounds_check_2d.jl index 34ff789e3b..f23e9c3d3c 100644 --- a/src/callbacks_stage/subcell_bounds_check_2d.jl +++ b/src/callbacks_stage/subcell_bounds_check_2d.jl @@ -12,6 +12,13 @@ (; variable_bounds) = limiter.cache.subcell_limiter_coefficients (; idp_bounds_delta_local, idp_bounds_delta_global) = limiter.cache + # Note: Accessing the threaded memory vector `idp_bounds_delta_local` with + # `deviation = idp_bounds_delta_local[key][Threads.threadid()]` causes critical performance + # issues due to False Sharing. + # Initializing a vector with eight times the length and using every 8th entry fixes this + # problem and allows proper scaling: + # `deviation = idp_bounds_delta_local[key][8 * Threads.threadid()]` + if local_minmax for v in limiter.local_minmax_variables_cons v_string = string(v) @@ -20,8 +27,8 @@ deviation_min_threaded = idp_bounds_delta_local[key_min] deviation_max_threaded = idp_bounds_delta_local[key_max] @threaded for element in eachelement(solver, cache) - deviation_min = deviation_min_threaded[Threads.threadid()] - deviation_max = deviation_max_threaded[Threads.threadid()] + deviation_min = deviation_min_threaded[8 * Threads.threadid()] + deviation_max = deviation_max_threaded[8 * Threads.threadid()] for j in eachnode(solver), i in eachnode(solver) var = u[v, i, j, element] deviation_min = max(deviation_min, @@ -29,8 +36,8 @@ deviation_max = max(deviation_max, var - variable_bounds[key_max][i, j, element]) end - deviation_min_threaded[Threads.threadid()] = deviation_min - deviation_max_threaded[Threads.threadid()] = deviation_max + deviation_min_threaded[8 * Threads.threadid()] = deviation_min + deviation_max_threaded[8 * Threads.threadid()] = deviation_max end end end @@ -42,23 +49,24 @@ key = Symbol(string(v), "_min") deviation_threaded = idp_bounds_delta_local[key] @threaded for element in eachelement(solver, cache) - deviation = deviation_threaded[Threads.threadid()] + deviation = deviation_threaded[8 * Threads.threadid()] for j in eachnode(solver), i in eachnode(solver) var = u[v, i, j, element] deviation = max(deviation, variable_bounds[key][i, j, element] - var) end - deviation_threaded[Threads.threadid()] = deviation + deviation_threaded[8 * Threads.threadid()] = deviation end end end for (key, _) in idp_bounds_delta_local - # Reduce threaded local maximum deviations. Save in first entry. - idp_bounds_delta_local[key][1] = maximum(idp_bounds_delta_local[key]) + # Calculate maximum deviations of all threads + idp_bounds_delta_local[key][8] = maximum(idp_bounds_delta_local[key][8 * i] + for i in 1:Threads.nthreads()) # Update global maximum deviations idp_bounds_delta_global[key] = max(idp_bounds_delta_global[key], - idp_bounds_delta_local[key][1]) + idp_bounds_delta_local[key][8]) end if save_errors @@ -69,8 +77,9 @@ for v in limiter.local_minmax_variables_cons v_string = string(v) print(f, ", ", - idp_bounds_delta_local[Symbol(v_string, "_min")][1], ", ", - idp_bounds_delta_local[Symbol(v_string, "_max")][1]) + idp_bounds_delta_local[Symbol(v_string, "_min")][8], + ", ", + idp_bounds_delta_local[Symbol(v_string, "_max")][8]) end end if positivity @@ -78,14 +87,17 @@ if v in limiter.local_minmax_variables_cons continue end - print(f, ", ", idp_bounds_delta_local[Symbol(string(v), "_min")][1]) + print(f, ", ", + idp_bounds_delta_local[Symbol(string(v), "_min")][8]) end end println(f) end # Reset local maximum deviations for (key, _) in idp_bounds_delta_local - idp_bounds_delta_local[key] .= zero(eltype(idp_bounds_delta_local[key][1])) + for i in 1:Threads.nthreads() + idp_bounds_delta_local[key][8 * i] = zero(eltype(idp_bounds_delta_local[key][8])) + end end end diff --git a/src/solvers/dgsem_tree/subcell_limiters_2d.jl b/src/solvers/dgsem_tree/subcell_limiters_2d.jl index a65a3d4c80..50f6ad0e22 100644 --- a/src/solvers/dgsem_tree/subcell_limiters_2d.jl +++ b/src/solvers/dgsem_tree/subcell_limiters_2d.jl @@ -14,12 +14,16 @@ function create_cache(limiter::Type{SubcellLimiterIDP}, equations::AbstractEquat # Memory for bounds checking routine with `BoundsCheckCallback`. # Local variable contains the maximum deviation since the last export. - # Using threaded variable to parallelize bounds check. + # Using a threaded vector to parallelize bounds check. idp_bounds_delta_local = Dict{Symbol, Vector{real(basis)}}() # Global variable contains the total maximum deviation. idp_bounds_delta_global = Dict{Symbol, real(basis)}() for key in bound_keys - idp_bounds_delta_local[key] = [zero(real(basis)) for _ in 1:Threads.nthreads()] + # False sharing causes critical performance issues on multiple threads when using a vector + # of length Threads.nthreads(). Initializing a vector of length 8*Threads.nthreads() + # and then using every 8th entry, fixes the problem and allows proper scaling. + idp_bounds_delta_local[key] = [zero(real(basis)) + for _ in 1:(8 * Threads.nthreads())] idp_bounds_delta_global[key] = zero(real(basis)) end From b63e787a1786ecf464717b77a0d2b3dc679dd7f1 Mon Sep 17 00:00:00 2001 From: bennibolm Date: Wed, 20 Dec 2023 16:16:43 +0100 Subject: [PATCH 7/8] Generalize stride size in vector --- .../subcell_bounds_check_2d.jl | 32 ++++++++++--------- src/solvers/dgsem_tree/subcell_limiters_2d.jl | 10 +++--- 2 files changed, 23 insertions(+), 19 deletions(-) diff --git a/src/callbacks_stage/subcell_bounds_check_2d.jl b/src/callbacks_stage/subcell_bounds_check_2d.jl index f23e9c3d3c..545d19b513 100644 --- a/src/callbacks_stage/subcell_bounds_check_2d.jl +++ b/src/callbacks_stage/subcell_bounds_check_2d.jl @@ -15,9 +15,11 @@ # Note: Accessing the threaded memory vector `idp_bounds_delta_local` with # `deviation = idp_bounds_delta_local[key][Threads.threadid()]` causes critical performance # issues due to False Sharing. - # Initializing a vector with eight times the length and using every 8th entry fixes this + # Initializing a vector with n times the length and using every n-th entry fixes this # problem and allows proper scaling: - # `deviation = idp_bounds_delta_local[key][8 * Threads.threadid()]` + # `deviation = idp_bounds_delta_local[key][n * Threads.threadid()]` + # Since there are no processors with caches over 128B, we use `n = 128B / size(uEltype)` + stride_size = div(128, sizeof(eltype(u))) # = n if local_minmax for v in limiter.local_minmax_variables_cons @@ -27,8 +29,8 @@ deviation_min_threaded = idp_bounds_delta_local[key_min] deviation_max_threaded = idp_bounds_delta_local[key_max] @threaded for element in eachelement(solver, cache) - deviation_min = deviation_min_threaded[8 * Threads.threadid()] - deviation_max = deviation_max_threaded[8 * Threads.threadid()] + deviation_min = deviation_min_threaded[stride_size * Threads.threadid()] + deviation_max = deviation_max_threaded[stride_size * Threads.threadid()] for j in eachnode(solver), i in eachnode(solver) var = u[v, i, j, element] deviation_min = max(deviation_min, @@ -36,8 +38,8 @@ deviation_max = max(deviation_max, var - variable_bounds[key_max][i, j, element]) end - deviation_min_threaded[8 * Threads.threadid()] = deviation_min - deviation_max_threaded[8 * Threads.threadid()] = deviation_max + deviation_min_threaded[stride_size * Threads.threadid()] = deviation_min + deviation_max_threaded[stride_size * Threads.threadid()] = deviation_max end end end @@ -49,24 +51,24 @@ key = Symbol(string(v), "_min") deviation_threaded = idp_bounds_delta_local[key] @threaded for element in eachelement(solver, cache) - deviation = deviation_threaded[8 * Threads.threadid()] + deviation = deviation_threaded[stride_size * Threads.threadid()] for j in eachnode(solver), i in eachnode(solver) var = u[v, i, j, element] deviation = max(deviation, variable_bounds[key][i, j, element] - var) end - deviation_threaded[8 * Threads.threadid()] = deviation + deviation_threaded[stride_size * Threads.threadid()] = deviation end end end for (key, _) in idp_bounds_delta_local # Calculate maximum deviations of all threads - idp_bounds_delta_local[key][8] = maximum(idp_bounds_delta_local[key][8 * i] - for i in 1:Threads.nthreads()) + idp_bounds_delta_local[key][stride_size] = maximum(idp_bounds_delta_local[key][stride_size * i] + for i in 1:Threads.nthreads()) # Update global maximum deviations idp_bounds_delta_global[key] = max(idp_bounds_delta_global[key], - idp_bounds_delta_local[key][8]) + idp_bounds_delta_local[key][stride_size]) end if save_errors @@ -77,9 +79,9 @@ for v in limiter.local_minmax_variables_cons v_string = string(v) print(f, ", ", - idp_bounds_delta_local[Symbol(v_string, "_min")][8], + idp_bounds_delta_local[Symbol(v_string, "_min")][stride_size], ", ", - idp_bounds_delta_local[Symbol(v_string, "_max")][8]) + idp_bounds_delta_local[Symbol(v_string, "_max")][stride_size]) end end if positivity @@ -88,7 +90,7 @@ continue end print(f, ", ", - idp_bounds_delta_local[Symbol(string(v), "_min")][8]) + idp_bounds_delta_local[Symbol(string(v), "_min")][stride_size]) end end println(f) @@ -96,7 +98,7 @@ # Reset local maximum deviations for (key, _) in idp_bounds_delta_local for i in 1:Threads.nthreads() - idp_bounds_delta_local[key][8 * i] = zero(eltype(idp_bounds_delta_local[key][8])) + idp_bounds_delta_local[key][stride_size * i] = zero(eltype(idp_bounds_delta_local[key][stride_size])) end end end diff --git a/src/solvers/dgsem_tree/subcell_limiters_2d.jl b/src/solvers/dgsem_tree/subcell_limiters_2d.jl index 50f6ad0e22..e1d45f8608 100644 --- a/src/solvers/dgsem_tree/subcell_limiters_2d.jl +++ b/src/solvers/dgsem_tree/subcell_limiters_2d.jl @@ -18,12 +18,14 @@ function create_cache(limiter::Type{SubcellLimiterIDP}, equations::AbstractEquat idp_bounds_delta_local = Dict{Symbol, Vector{real(basis)}}() # Global variable contains the total maximum deviation. idp_bounds_delta_global = Dict{Symbol, real(basis)}() + # Note: False sharing causes critical performance issues on multiple threads when using a vector + # of length `Threads.nthreads()`. Initializing a vector of length `n * Threads.nthreads()` + # and then only using every n-th entry, fixes the problem and allows proper scaling. + # Since there are no processors with caches over 128B, we use `n = 128B / size(uEltype)` + stride_size = div(128, sizeof(eltype(basis.nodes))) # = n for key in bound_keys - # False sharing causes critical performance issues on multiple threads when using a vector - # of length Threads.nthreads(). Initializing a vector of length 8*Threads.nthreads() - # and then using every 8th entry, fixes the problem and allows proper scaling. idp_bounds_delta_local[key] = [zero(real(basis)) - for _ in 1:(8 * Threads.nthreads())] + for _ in 1:(stride_size * Threads.nthreads())] idp_bounds_delta_global[key] = zero(real(basis)) end From 7d8313e302255759553534c1f2bebc7fc16a103e Mon Sep 17 00:00:00 2001 From: bennibolm Date: Tue, 30 Jan 2024 10:04:36 +0100 Subject: [PATCH 8/8] Add suggested comment --- src/solvers/dgsem_tree/subcell_limiters_2d.jl | 1 + 1 file changed, 1 insertion(+) diff --git a/src/solvers/dgsem_tree/subcell_limiters_2d.jl b/src/solvers/dgsem_tree/subcell_limiters_2d.jl index e1d45f8608..3d272359fe 100644 --- a/src/solvers/dgsem_tree/subcell_limiters_2d.jl +++ b/src/solvers/dgsem_tree/subcell_limiters_2d.jl @@ -37,6 +37,7 @@ function (limiter::SubcellLimiterIDP)(u::AbstractArray{<:Any, 4}, semi, dg::DGSE dt; kwargs...) @unpack alpha = limiter.cache.subcell_limiter_coefficients + # TODO: Do not abuse `reset_du!` but maybe implement a generic `set_zero!` @trixi_timeit timer() "reset alpha" reset_du!(alpha, dg, semi.cache) if limiter.local_minmax