Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

(Thread-)Parallelize bounds check routine for subcell IDP limiting #1736

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
17 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 6 additions & 4 deletions src/callbacks_stage/subcell_bounds_check.jl
Original file line number Diff line number Diff line change
Expand Up @@ -118,7 +118,7 @@ end
@inline function finalize_callback(callback::BoundsCheckCallback, semi,
limiter::SubcellLimiterIDP)
(; local_minmax, positivity) = limiter
(; idp_bounds_delta) = limiter.cache
(; idp_bounds_delta_global) = limiter.cache
variables = varnames(cons2cons, semi.equations)

println("─"^100)
Expand All @@ -128,8 +128,10 @@ end
for v in limiter.local_minmax_variables_cons
v_string = string(v)
println("$(variables[v]):")
println("-lower bound: ", idp_bounds_delta[Symbol(v_string, "_min")][2])
println("-upper bound: ", idp_bounds_delta[Symbol(v_string, "_max")][2])
println("- lower bound: ",
idp_bounds_delta_global[Symbol(v_string, "_min")])
println("- upper bound: ",
idp_bounds_delta_global[Symbol(v_string, "_max")])
end
end
if positivity
Expand All @@ -138,7 +140,7 @@ end
continue
end
println(string(variables[v]) * ":\n- positivity: ",
idp_bounds_delta[Symbol(string(v), "_min")][2])
idp_bounds_delta_global[Symbol(string(v), "_min")])
end
end
println("─"^100 * "\n")
Expand Down
81 changes: 54 additions & 27 deletions src/callbacks_stage/subcell_bounds_check_2d.jl
Original file line number Diff line number Diff line change
Expand Up @@ -10,26 +10,37 @@
time, iter, output_directory, save_errors)
(; local_minmax, positivity) = solver.volume_integral.limiter
(; variable_bounds) = limiter.cache.subcell_limiter_coefficients
(; idp_bounds_delta) = limiter.cache
(; idp_bounds_delta_local, idp_bounds_delta_global) = limiter.cache

# Note: Accessing the threaded memory vector `idp_bounds_delta_local` with
# `deviation = idp_bounds_delta_local[key][Threads.threadid()]` causes critical performance
# issues due to False Sharing.
# Initializing a vector with n times the length and using every n-th entry fixes this
# problem and allows proper scaling:
# `deviation = idp_bounds_delta_local[key][n * Threads.threadid()]`
# Since there are no processors with caches over 128B, we use `n = 128B / size(uEltype)`
stride_size = div(128, sizeof(eltype(u))) # = n

if local_minmax
for v in limiter.local_minmax_variables_cons
v_string = string(v)
key_min = Symbol(v_string, "_min")
key_max = Symbol(v_string, "_max")
deviation_min = idp_bounds_delta[key_min]
deviation_max = idp_bounds_delta[key_max]
for element in eachelement(solver, cache), j in eachnode(solver),
i in eachnode(solver)

var = u[v, i, j, element]
deviation_min[1] = max(deviation_min[1],
variable_bounds[key_min][i, j, element] - var)
deviation_max[1] = max(deviation_max[1],
var - variable_bounds[key_max][i, j, element])
deviation_min_threaded = idp_bounds_delta_local[key_min]
deviation_max_threaded = idp_bounds_delta_local[key_max]
@threaded for element in eachelement(solver, cache)
deviation_min = deviation_min_threaded[stride_size * Threads.threadid()]
deviation_max = deviation_max_threaded[stride_size * Threads.threadid()]
for j in eachnode(solver), i in eachnode(solver)
var = u[v, i, j, element]
deviation_min = max(deviation_min,
variable_bounds[key_min][i, j, element] - var)
deviation_max = max(deviation_max,
var - variable_bounds[key_max][i, j, element])
end
deviation_min_threaded[stride_size * Threads.threadid()] = deviation_min
deviation_max_threaded[stride_size * Threads.threadid()] = deviation_max
end
deviation_min[2] = max(deviation_min[2], deviation_min[1])
deviation_max[2] = max(deviation_max[2], deviation_max[1])
end
end
if positivity
Expand All @@ -38,41 +49,57 @@
continue
end
key = Symbol(string(v), "_min")
deviation = idp_bounds_delta[key]
for element in eachelement(solver, cache), j in eachnode(solver),
i in eachnode(solver)

var = u[v, i, j, element]
deviation[1] = max(deviation[1],
variable_bounds[key][i, j, element] - var)
deviation_threaded = idp_bounds_delta_local[key]
@threaded for element in eachelement(solver, cache)
deviation = deviation_threaded[stride_size * Threads.threadid()]
for j in eachnode(solver), i in eachnode(solver)
var = u[v, i, j, element]
deviation = max(deviation,
variable_bounds[key][i, j, element] - var)
end
deviation_threaded[stride_size * Threads.threadid()] = deviation
end
deviation[2] = max(deviation[2], deviation[1])
end
end

for (key, _) in idp_bounds_delta_local
# Calculate maximum deviations of all threads
idp_bounds_delta_local[key][stride_size] = maximum(idp_bounds_delta_local[key][stride_size * i]
for i in 1:Threads.nthreads())
# Update global maximum deviations
idp_bounds_delta_global[key] = max(idp_bounds_delta_global[key],
idp_bounds_delta_local[key][stride_size])
end

if save_errors
# Print to output file
open("$output_directory/deviations.txt", "a") do f
print(f, iter, ", ", time)
if local_minmax
for v in limiter.local_minmax_variables_cons
v_string = string(v)
print(f, ", ", idp_bounds_delta[Symbol(v_string, "_min")][1], ", ",
idp_bounds_delta[Symbol(v_string, "_max")][1])
print(f, ", ",
idp_bounds_delta_local[Symbol(v_string, "_min")][stride_size],
", ",
idp_bounds_delta_local[Symbol(v_string, "_max")][stride_size])
end
end
if positivity
for v in limiter.positivity_variables_cons
if v in limiter.local_minmax_variables_cons
continue
end
print(f, ", ", idp_bounds_delta[Symbol(string(v), "_min")][1])
print(f, ", ",
idp_bounds_delta_local[Symbol(string(v), "_min")][stride_size])
end
end
println(f)
end
# Reset first entries of idp_bounds_delta
for (key, _) in idp_bounds_delta
idp_bounds_delta[key][1] = zero(eltype(idp_bounds_delta[key][1]))
# Reset local maximum deviations
for (key, _) in idp_bounds_delta_local
for i in 1:Threads.nthreads()
idp_bounds_delta_local[key][stride_size * i] = zero(eltype(idp_bounds_delta_local[key][stride_size]))
end
end
end

Expand Down
3 changes: 3 additions & 0 deletions src/solvers/dgsem_tree/dg_2d_subcell_limiters.jl
Original file line number Diff line number Diff line change
Expand Up @@ -470,6 +470,9 @@ end
For subcell limiting, the calculation of local bounds for non-periodic domains require the boundary
outer state. This function returns the boundary value at time `t` and for node with spatial
indices `indices`.

!!! warning "Experimental implementation"
This is an experimental feature and may change in future releases.
sloede marked this conversation as resolved.
Show resolved Hide resolved
"""
@inline function get_boundary_outer_state(boundary_condition::BoundaryConditionDirichlet,
cache, t, equations, dg, indices...)
Expand Down
23 changes: 17 additions & 6 deletions src/solvers/dgsem_tree/subcell_limiters_2d.jl
Original file line number Diff line number Diff line change
Expand Up @@ -13,21 +13,32 @@ function create_cache(limiter::Type{SubcellLimiterIDP}, equations::AbstractEquat
bound_keys)

# Memory for bounds checking routine with `BoundsCheckCallback`.
# The first entry of each vector contains the maximum deviation since the last export.
# The second one contains the total maximum deviation.
idp_bounds_delta = Dict{Symbol, Vector{real(basis)}}()
# Local variable contains the maximum deviation since the last export.
# Using a threaded vector to parallelize bounds check.
idp_bounds_delta_local = Dict{Symbol, Vector{real(basis)}}()
# Global variable contains the total maximum deviation.
idp_bounds_delta_global = Dict{Symbol, real(basis)}()
# Note: False sharing causes critical performance issues on multiple threads when using a vector
# of length `Threads.nthreads()`. Initializing a vector of length `n * Threads.nthreads()`
# and then only using every n-th entry, fixes the problem and allows proper scaling.
# Since there are no processors with caches over 128B, we use `n = 128B / size(uEltype)`
stride_size = div(128, sizeof(eltype(basis.nodes))) # = n
sloede marked this conversation as resolved.
Show resolved Hide resolved
for key in bound_keys
idp_bounds_delta[key] = zeros(real(basis), 2)
idp_bounds_delta_local[key] = [zero(real(basis))
for _ in 1:(stride_size * Threads.nthreads())]
idp_bounds_delta_global[key] = zero(real(basis))
end

return (; subcell_limiter_coefficients, idp_bounds_delta)
return (; subcell_limiter_coefficients, idp_bounds_delta_local,
idp_bounds_delta_global)
end

function (limiter::SubcellLimiterIDP)(u::AbstractArray{<:Any, 4}, semi, dg::DGSEM, t,
dt;
kwargs...)
@unpack alpha = limiter.cache.subcell_limiter_coefficients
alpha .= zero(eltype(alpha))
# TODO: Do not abuse `reset_du!` but maybe implement a generic `set_zero!`
@trixi_timeit timer() "reset alpha" reset_du!(alpha, dg, semi.cache)
bennibolm marked this conversation as resolved.
Show resolved Hide resolved

if limiter.local_minmax
@trixi_timeit timer() "local min/max limiting" idp_local_minmax!(alpha, limiter,
Expand Down
Loading