diff --git a/src/fmm.jl b/src/fmm.jl index 1cc460a..7b24b20 100644 --- a/src/fmm.jl +++ b/src/fmm.jl @@ -143,63 +143,13 @@ function upward_pass_singlethread!(branches::AbstractVector{<:Branch{TF}}, syste end end -function body_2_multipole_multithread!(branches, systems::Tuple, expansion_order::Val{P}, leaf_index, n_threads) where P - ## load balance - leaf_assignments = fill(1:0, length(systems), n_threads) - for (i_system,system) in enumerate(systems) - - # total number of bodies - n_bodies = 0 - for i_leaf in leaf_index - n_bodies += length(branches[i_leaf].bodies_index[i_system]) - end - - # number of bodies per thread - n_per_thread, rem = divrem(n_bodies, n_threads) - rem > 0 && (n_per_thread += 1) - - # if there are too many threads, we'll actually hurt performance - n_per_thread < MIN_NPT_B2M && (n_per_thread = MIN_NPT_B2M) - - # create chunks - i_start = 1 - i_thread = 1 - n_bodies = 0 - for (i_end,i_leaf) in enumerate(leaf_index) - n_bodies += length(branches[i_leaf].bodies_index[i_system]) - if n_bodies >= n_per_thread - leaf_assignments[i_system,i_thread] = i_start:i_end - i_start = i_end+1 - i_thread += 1 - n_bodies = 0 - end - end - i_thread <= n_threads && (leaf_assignments[i_system,i_thread] = i_start:length(leaf_index)) - end - - ## compute multipole expansion coefficients - Threads.@threads for i_thread in 1:n_threads - for (i_system,system) in enumerate(systems) - leaf_assignment = leaf_assignments[i_system,i_thread] - for i_branch in view(leaf_index, leaf_assignment) - branch = branches[i_branch] - Threads.lock(branch.lock) do - body_to_multipole!(system, branch, branch.bodies_index[i_system], branch.harmonics, expansion_order) - end - end - end - end -end +function body_to_multipole_multithread!(branches, systems, expansion_order::Val{P}, leaf_index, n_threads) where P -function body_2_multipole_multithread!(branches, system, expansion_order::Val{P}, leaf_index, n_threads) where P ## load balance leaf_assignments = fill(1:0, n_threads) # total number of bodies - n_bodies = 0 - for i_leaf in leaf_index - n_bodies += length(branches[i_leaf].bodies_index) - end + n_bodies = get_n_bodies(systems) # number of bodies per thread n_per_thread, rem = divrem(n_bodies, n_threads) @@ -213,7 +163,7 @@ function body_2_multipole_multithread!(branches, system, expansion_order::Val{P} i_thread = 1 n_bodies = 0 for (i_end,i_leaf) in enumerate(leaf_index) - n_bodies += length(branches[i_leaf].bodies_index) + n_bodies += get_n_bodies(branches[i_leaf]) if n_bodies >= n_per_thread leaf_assignments[i_thread] = i_start:i_end i_start = i_end+1 @@ -224,15 +174,21 @@ function body_2_multipole_multithread!(branches, system, expansion_order::Val{P} i_thread <= n_threads && (leaf_assignments[i_thread] = i_start:length(leaf_index)) ## compute multipole expansion coefficients - Threads.@threads for i_thread in 1:n_threads - for i_branch in view(leaf_index, leaf_assignments[i_thread]) + Threads.@threads for assignment in leaf_assignments + for i_task in assignment + i_branch = leaf_index[i_task] branch = branches[i_branch] - body_to_multipole!(system, branch, branch.bodies_index, branch.harmonics, expansion_order) + body_to_multipole!(branch, systems, branch.harmonics, expansion_order) end end end -function translate_multipoles_multithread!(branches, expansion_order::Val{P}, levels_index, n_threads) where P +function translate_multipoles_multithread!(branches::AbstractVector{<:Branch{TF}}, expansion_order::Val{P}, lamb_helmholtz, levels_index, n_threads) where {TF,P} + # try preallocating one container per task to be reused + Ts = [zeros(length_Ts(P)) for _ in 1:n_threads] + eimϕs = [zeros(2, P+1) for _ in 1:n_threads] + weights_tmp_1 = [initialize_expansion(P, TF) for _ in 1:n_threads] + weights_tmp_2 = [initialize_expansion(P, TF) for _ in 1:n_threads] # iterate over levels for level_index in view(levels_index,length(levels_index):-1:2) @@ -246,7 +202,10 @@ function translate_multipoles_multithread!(branches, expansion_order::Val{P}, le n_per_thread < MIN_NPT_M2M && (n_per_thread = MIN_NPT_M2M) # assign thread start branches - Threads.@threads for i_start in 1:n_per_thread:n_branches + i_starts = 1:n_per_thread:n_branches + Threads.@threads for i_task in 1:length(i_starts) + i_start = i_starts[i_task] + # get final branch i_end = min(n_branches, i_start+n_per_thread-1) @@ -255,19 +214,19 @@ function translate_multipoles_multithread!(branches, expansion_order::Val{P}, le child_branch = branches[i_branch] parent_branch = branches[child_branch.i_parent] Threads.lock(parent_branch.lock) do - M2M!(parent_branch, child_branch, child_branch.harmonics, child_branch.ML, expansion_order) + multipole_to_multipole!(parent_branch, child_branch, weights_tmp_1[i_task], weights_tmp_2[i_task], Ts[i_task], eimϕs[i_task], ζs_mag, Hs_π2, expansion_order, lamb_helmholtz) end end end end end -function upward_pass_multithread!(branches, systems, expansion_order, levels_index, leaf_index, n_threads) +function upward_pass_multithread!(branches, systems, expansion_order, lamb_helmholtz, levels_index, leaf_index, n_threads) # create multipole expansions - body_2_multipole_multithread!(branches, systems, expansion_order, leaf_index, n_threads) + body_to_multipole_multithread!(branches, systems, expansion_order, leaf_index, n_threads) # m2m translation - translate_multipoles_multithread!(branches, expansion_order, levels_index, n_threads) + translate_multipoles_multithread!(branches, expansion_order, lamb_helmholtz, levels_index, n_threads) end #------- direct interaction matrix -------# @@ -293,48 +252,28 @@ function horizontal_pass_singlethread!(target_branches::Vector{<:Branch{TF}}, so end -# function horizontal_pass_singlethread!(target_branches, source_branches, m2l_list, expansion_order, harmonics, L) -# for (i_target, j_source) in m2l_list -# @lock target_branches[i_target].lock M2L!(target_branches[i_target], source_branches[j_source], harmonics, L, expansion_order) -# end -# end - -# @inline function preallocate_horizontal_pass(expansion_type, expansion_order) -# harmonics = zeros(expansion_type, (expansion_order<<1 + 1)*(expansion_order<<1 + 1)) -# L = zeros(expansion_type, 4) -# return harmonics, L -# end +function horizontal_pass_multithread!(target_branches, source_branches::Vector{<:Branch{TF}}, m2l_list, expansion_order::Val{P}, lamb_helmholtz, n_threads) where {TF,P} + # try preallocating one container per task to be reused + Ts = [zeros(length_Ts(P)) for _ in 1:n_threads] + eimϕs = [zeros(2, P+1) for _ in 1:n_threads] + weights_tmp_1 = [initialize_expansion(P, TF) for _ in 1:n_threads] + weights_tmp_2 = [initialize_expansion(P, TF) for _ in 1:n_threads] -# @inline function preallocate_horizontal_pass(expansion_type, expansion_order, n) -# containers = [preallocate_horizontal_pass(expansion_type, expansion_order) for _ in 1:n] -# end - -function horizontal_pass_multithread!(target_branches, source_branches::Vector{<:Branch{TF}}, m2l_list, expansion_order::Val{P}, n_threads) where {TF,P} # number of translations per thread n_per_thread, rem = divrem(length(m2l_list),n_threads) rem > 0 && (n_per_thread += 1) assignments = 1:n_per_thread:length(m2l_list) - # preallocate memory - # harmonics_preallocated = [initialize_harmonics(P,TF) for _ in 1:length(assignments)] - # ML_preallocated = [initialize_ML(P,TF) for _ in 1:length(assignments)] - # execute tasks - Threads.@threads for i_thread in 1:length(assignments) - # Threads.@threads for i_start in 1:n_per_thread:length(m2l_list) - i_start = assignments[i_thread] + Threads.@threads for i_task in 1:length(assignments) + i_start = assignments[i_task] i_stop = min(i_start+n_per_thread-1, length(m2l_list)) - # harmonics = harmonics_preallocated[i_thread] - # ML = ML_preallocated[i_thread] - # harmonics = initialize_harmonics(P,TF) - # ML = initialize_ML(P,TF) for (i_target, j_source) in m2l_list[i_start:i_stop] - Threads.lock(target_branches[i_target].lock) do - M2L!(target_branches[i_target], source_branches[j_source], expansion_order) - # M2L!(target_branches[i_target], source_branches[j_source], harmonics, ML, expansion_order) + target_branch = target_branches[i_target] + source_branch = source_branches[j_source] + Threads.lock(target_branch.lock) do + multipole_to_local!(target_branch, source_branch, weights_tmp_1[i_task], weights_tmp_2[i_task], Ts[i_task], eimϕs[i_task], ζs_mag, ηs_mag, Hs_π2, expansion_order, lamb_helmholtz) end - # target_branch = target_branches[i_target] - # Threads.@lock target_branch.lock M2L!(target_branch, source_branches[j_source], expansion_order) end end @@ -368,7 +307,13 @@ function downward_pass_singlethread!(branches::AbstractVector{<:Branch{TF}}, sys end end -function translate_locals_multithread!(branches, expansion_order::Val{P}, levels_index, n_threads) where P +function translate_locals_multithread!(branches::AbstractVector{<:Branch{TF}}, expansion_order::Val{P}, lamb_helmholtz, levels_index, n_threads) where {TF,P} + + # try preallocating one container per task to be reused + Ts = [zeros(length_Ts(P)) for _ in 1:n_threads] + eimϕs = [zeros(2, P+1) for _ in 1:n_threads] + weights_tmp_1 = [initialize_expansion(P, TF) for _ in 1:n_threads] + weights_tmp_2 = [initialize_expansion(P, TF) for _ in 1:n_threads] # iterate over levels for level_index in view(levels_index,2:length(levels_index)) @@ -381,19 +326,25 @@ function translate_locals_multithread!(branches, expansion_order::Val{P}, levels n_per_thread < MIN_NPT_L2L && (n_per_thread = MIN_NPT_L2L) # loop over branches - Threads.@threads for i_start in 1:n_per_thread:length(level_index) + i_starts = 1:n_per_thread:length(level_index) + #Threads.@threads for (i_task,i_start) in enumerate(1:n_per_thread:length(level_index)) + Threads.@threads for i_task in 1:length(i_starts) + i_start = i_starts[i_task] i_stop = min(i_start+n_per_thread-1,length(level_index)) # loop over branches for child_branch in view(branches,view(level_index,i_start:i_stop)) - L2L!(branches[child_branch.i_parent], child_branch, child_branch.harmonics, child_branch.ML, expansion_order) + local_to_local!(branches[child_branch.i_parent], child_branch, weights_tmp_1[i_task], weights_tmp_2[i_task], Ts[i_task], eimϕs[i_task], ηs_mag, Hs_π2, expansion_order, lamb_helmholtz) end end end return nothing end -function local_2_body_multithread!(branches, systems, derivatives_switches, expansion_order, leaf_index, n_threads) +function local_to_body_multithread!(branches::AbstractVector{<:Branch{TF}}, systems, derivatives_switches, expansion_order::Val{P}, lamb_helmholtz, leaf_index, n_threads) where {TF,P} + # preallocate containers + velocity_n_m = [initialize_velocity_n_m(P,TF) for _ in 1:n_threads] + # create assignments n_bodies = 0 for i_leaf in leaf_index @@ -423,21 +374,21 @@ function local_2_body_multithread!(branches, systems, derivatives_switches, expa resize!(assignments, i_thread) # spread remainder across rem chunks - Threads.@threads for i_thread in eachindex(assignments) - assignment = assignments[i_thread] + Threads.@threads for i_task in eachindex(assignments) + assignment = assignments[i_task] for i_leaf in view(leaf_index,assignment) leaf = branches[i_leaf] - L2B!(systems, leaf, derivatives_switches, expansion_order) + evaluate_local!(systems, leaf, leaf.harmonics, velocity_n_m[i_task], expansion_order, lamb_helmholtz, derivatives_switches) end end end -function downward_pass_multithread!(branches, systems, derivatives_switch, expansion_order, levels_index, leaf_index, n_threads) - # m2m translation - translate_locals_multithread!(branches, expansion_order, levels_index, n_threads) +function downward_pass_multithread!(branches, systems, derivatives_switch, expansion_order, lamb_helmholtz, levels_index, leaf_index, n_threads) + # local to local translation + translate_locals_multithread!(branches, expansion_order, lamb_helmholtz, levels_index, n_threads) # local to body interaction - local_2_body_multithread!(branches, systems, derivatives_switch, expansion_order, leaf_index, n_threads) + local_to_body_multithread!(branches, systems, derivatives_switch, expansion_order, lamb_helmholtz, leaf_index, n_threads) end ##### @@ -946,6 +897,10 @@ function fmm!(target_tree::Tree, target_systems, source_tree::Tree, source_syste n_sources = get_n_bodies(source_systems) n_targets = get_n_bodies(target_systems) + # wrap lamb_helmholtz in Val + lamb_helmholtz = Val(lamb_helmholtz) + + if n_sources > 0 && n_targets > 0 # precompute y-axis rotation by π/2 matrices (if not already done) @@ -968,11 +923,11 @@ function fmm!(target_tree::Tree, target_systems, source_tree::Tree, source_syste @sync begin Threads.@spawn nearfield_singlethread!(target_systems, direct_target_bodies, derivatives_switches, source_systems, direct_source_bodies, Val(gpu)) - upward_pass && upward_pass_multithread!(source_tree.branches, source_systems, source_tree.expansion_order, source_tree.levels_index, source_tree.leaf_index, n_threads-1) - horizontal_pass && length(m2l_list) > 0 && horizontal_pass_multithread!(target_tree.branches, source_tree.branches, m2l_list, source_tree.expansion_order, n_threads-1) + upward_pass && upward_pass_multithread!(source_tree.branches, source_systems, source_tree.expansion_order, lamb_helmholtz, source_tree.levels_index, source_tree.leaf_index, n_threads-1) + horizontal_pass && length(m2l_list) > 0 && horizontal_pass_multithread!(target_tree.branches, source_tree.branches, m2l_list, source_tree.expansion_order, lamb_helmholtz, n_threads-1) end - downward_pass && downward_pass_multithread!(target_tree.branches, target_systems, derivatives_switches, target_tree.expansion_order, target_tree.levels_index, target_tree.leaf_index, n_threads) + downward_pass && downward_pass_multithread!(target_tree.branches, target_systems, derivatives_switches, target_tree.expansion_order, lamb_helmholtz, target_tree.levels_index, target_tree.leaf_index, n_threads) else @@ -980,16 +935,16 @@ function fmm!(target_tree::Tree, target_systems, source_tree::Tree, source_syste if n_threads == 1 # && !gpu || gpu nearfield_singlethread!(target_systems, direct_target_bodies, derivatives_switches, source_systems, direct_source_bodies, Val(gpu)) - upward_pass && upward_pass_singlethread!(source_tree.branches, source_systems, source_tree.expansion_order, Val(lamb_helmholtz)) - horizontal_pass && length(m2l_list) > 0 && horizontal_pass_singlethread!(target_tree.branches, source_tree.branches, m2l_list, source_tree.expansion_order, Val(lamb_helmholtz)) - downward_pass && downward_pass_singlethread!(target_tree.branches, target_systems, target_tree.expansion_order, Val(lamb_helmholtz), derivatives_switches) + upward_pass && upward_pass_singlethread!(source_tree.branches, source_systems, source_tree.expansion_order, lamb_helmholtz) + horizontal_pass && length(m2l_list) > 0 && horizontal_pass_singlethread!(target_tree.branches, source_tree.branches, m2l_list, source_tree.expansion_order, lamb_helmholtz) + downward_pass && downward_pass_singlethread!(target_tree.branches, target_systems, target_tree.expansion_order, lamb_helmholtz, derivatives_switches) else # n_threads > 1 && !gpu nearfield_multithread!(target_systems, direct_target_bodies, derivatives_switches, source_systems, direct_source_bodies, n_threads) - upward_pass && upward_pass_multithread!(source_tree.branches, source_systems, source_tree.expansion_order, source_tree.levels_index, source_tree.leaf_index, n_threads) - horizontal_pass && length(m2l_list) > 0 && horizontal_pass_multithread!(target_tree.branches, source_tree.branches, m2l_list, source_tree.expansion_order, n_threads) - downward_pass && downward_pass_multithread!(target_tree.branches, target_systems, derivatives_switches, target_tree.expansion_order, target_tree.levels_index, target_tree.leaf_index, n_threads) + upward_pass && upward_pass_multithread!(source_tree.branches, source_systems, source_tree.expansion_order, lamb_helmholtz, source_tree.levels_index, source_tree.leaf_index, n_threads) + horizontal_pass && length(m2l_list) > 0 && horizontal_pass_multithread!(target_tree.branches, source_tree.branches, m2l_list, source_tree.expansion_order, lamb_helmholtz, n_threads) + downward_pass && downward_pass_multithread!(target_tree.branches, target_systems, derivatives_switches, target_tree.expansion_order, lamb_helmholtz, target_tree.levels_index, target_tree.leaf_index, n_threads) end diff --git a/src/tree.jl b/src/tree.jl index 22f0a3d..af958ff 100644 --- a/src/tree.jl +++ b/src/tree.jl @@ -791,6 +791,12 @@ function initialize_expansion(expansion_order, type=Float64) return zeros(type, 2, 2, ((expansion_order+1) * (expansion_order+2)) >> 1) end +function initialize_velocity_n_m(expansion_order, type=Float64) + p = expansion_order + n_harmonics = harmonic_index(p,p) + return zeros(type, 2, 3, n_harmonics) +end + function initialize_harmonics(expansion_order, type=Float64) p = expansion_order n_harmonics = harmonic_index(p,p)