From 87809e4cf8a840e390331fdbe7226a47595dee76 Mon Sep 17 00:00:00 2001 From: James Cook Date: Fri, 17 Jan 2025 12:53:43 +0000 Subject: [PATCH 01/18] this is running with my dev'd MPIQR with 3arg ldiv! --- moment_kinetics/Project.toml | 4 + .../src/electron_kinetic_equation.jl | 88 +++++++++++++------ moment_kinetics/src/nonlinear_solvers.jl | 17 ++-- moment_kinetics/src/time_advance.jl | 2 +- moment_kinetics/src/timer_utils.jl | 2 +- 5 files changed, 78 insertions(+), 35 deletions(-) diff --git a/moment_kinetics/Project.toml b/moment_kinetics/Project.toml index fe15a7856..9f8296ef4 100644 --- a/moment_kinetics/Project.toml +++ b/moment_kinetics/Project.toml @@ -19,12 +19,14 @@ LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" LsqFit = "2fda8390-95c7-5789-9bda-21331edee243" MPI = "da04e1cc-30fd-572f-bb4f-1f8673147195" MPIPreferences = "3da0fdf6-3ccc-4f1b-acd9-58baa6c99267" +MPIQR = "4acd9526-7714-4958-a7d5-1632fb49fc8a" Measures = "442fdcdd-2543-5da2-b0f3-8c86c306513e" OrderedCollections = "bac558e1-5e72-5ebc-8fee-abe8a469f55d" Pkg = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f" Preferences = "21216c6a-2e73-6563-6e65-726566657250" Primes = "27ebfcd6-29c5-5fa9-bf4b-fb8fc14df3ae" Printf = "de0858da-6303-5e67-8744-51eddeeeb8d7" +ProgressMeter = "92933f4c-e287-5a05-a399-4b506db050ca" Quadmath = "be4d8f0f-7fa4-5f49-b795-2f01399ab2dd" Revise = "295af30f-e4ad-537b-8983-00126c2a3abe" Roots = "f2b01f46-fcfa-551c-844a-d8ac1e96c665" @@ -50,6 +52,8 @@ file_io_netcdf = "NCDatasets" manufactured_solns_ext = ["Symbolics", "IfElse"] [compat] +MPIQR = "0.1.0" +ProgressMeter = "1.10.2" julia = "1.9.0" [extras] diff --git a/moment_kinetics/src/electron_kinetic_equation.jl b/moment_kinetics/src/electron_kinetic_equation.jl index 556490a6d..c179557f1 100644 --- a/moment_kinetics/src/electron_kinetic_equation.jl +++ b/moment_kinetics/src/electron_kinetic_equation.jl @@ -3,6 +3,8 @@ module electron_kinetic_equation using LinearAlgebra using MPI using SparseArrays +using MPIQR +using ProgressMeter export get_electron_critical_velocities @@ -70,7 +72,6 @@ using ..velocity_moments: integrate_over_vspace, calculate_electron_moment_deriv import ..runge_kutta const integral_correction_sharpness = 4.0 - """ update_electron_pdf is a function that uses the electron kinetic equation to solve for the updated electron pdf @@ -1217,7 +1218,7 @@ pressure \$p_{e∥}\$. end if nl_solver_params.solves_since_precon_update[] ≥ nl_solver_params.preconditioner_update_interval -global_rank[] == 0 && println("recalculating precon") +global_rank[] == 0 && println("recalculating precon 1") nl_solver_params.solves_since_precon_update[] = 0 nl_solver_params.precon_dt[] = t_params.dt[] @@ -1231,27 +1232,45 @@ global_rank[] == 0 && println("recalculating precon") external_source_settings, num_diss_params, t_params, ion_dt, ir, evolve_ppar) - begin_serial_region() - if block_rank[] == 0 - if size(orig_lu) == (1, 1) + #begin_serial_region() + qrblocksize = 4 + localcols = MPIQR.localcolumns(block_rank[], size(precon_matrix, 2), + qrblocksize, block_size[])#1:size(precon_matrix, 2)# + if block_rank[] >= 0 # == 0 + if size(orig_lu, 1) == 1 # Have not properly created the LU decomposition before, so # cannot reuse it. 
- @timeit_debug global_timer "lu" nl_solver_params.preconditioners[ir] = - (lu(sparse(precon_matrix)), precon_matrix, input_buffer, - output_buffer) + #@timeit_debug global_timer "lu" nl_solver_params.preconditioners[ir] = + # (lu(sparse(precon_matrix)), precon_matrix, input_buffer, + # output_buffer) + @timeit_debug global_timer "lu" begin + mpiqrmatrix = MPIQR.MPIQRMatrix((precon_matrix[:, localcols]), + size(precon_matrix); blocksize=qrblocksize) + progress = Progress(mpiqrmatrix, dt=iszero(block_rank[]) ? 1 : 2^31; + showspeed=true) + mpiqrstruct = qr!(mpiqrmatrix; progress=progress) + nl_solver_params.preconditioners[ir] = + (mpiqrstruct, precon_matrix, input_buffer, output_buffer) + end else # LU decomposition was previously created. The Jacobian always # has the same sparsity pattern, so by using `lu!()` we can # reuse some setup. try - @timeit_debug global_timer "lu!" lu!(orig_lu, sparse(precon_matrix); check=false) + #@timeit_debug global_timer "lu!" lu!(orig_lu, sparse(precon_matrix); check=false) + @timeit_debug global_timer "lu!" begin + orig_lu[:, localcols] = precon_matrix[:, localcols] + qr!(orig_lu) + end catch e if !isa(e, ArgumentError) rethrow(e) end println("Sparsity pattern of matrix changed, rebuilding " * " LU from scratch") - @timeit_debug global_timer "lu" orig_lu = lu(sparse(precon_matrix)) + mpiqrmatrix = MPIQR.MPIQRMatrix((precon_matrix[:, localcols]), + size(precon_matrix); blocksize=qrblocksize) + @timeit_debug global_timer "lu" orig_lu = qr!(mpiqrmatrix) end nl_solver_params.preconditioners[ir] = (orig_lu, precon_matrix, input_buffer, output_buffer) @@ -1280,10 +1299,10 @@ global_rank[] == 0 && println("recalculating precon") counter += 1 end - begin_serial_region() - @serial_region begin - @timeit_debug global_timer "ldiv!" ldiv!(this_output_buffer, precon_lu, this_input_buffer) - end + #begin_serial_region() + #@serial_region begin + @timeit_debug global_timer "ldiv!" MPIQR.ldiv!(this_output_buffer, precon_lu, this_input_buffer) + #end begin_serial_region() counter = 1 @@ -1347,7 +1366,7 @@ global_rank[] == 0 && println("recalculating precon") end if nl_solver_params.solves_since_precon_update[] ≥ nl_solver_params.preconditioner_update_interval -global_rank[] == 0 && println("recalculating precon") +global_rank[] == 0 && println("recalculating precon 2") nl_solver_params.solves_since_precon_update[] = 0 nl_solver_params.precon_dt[] = t_params.dt[] @@ -1931,7 +1950,7 @@ to allow the outer r-loop to be parallelised. function recalculate_preconditioner!() if nl_solver_params.preconditioner_type === Val(:electron_lu) -global_rank[] == 0 && println("recalculating precon") +global_rank[] == 0 && println("recalculating precon 3") nl_solver_params.solves_since_precon_update[] = 0 nl_solver_params.precon_dt[] = ion_dt @@ -1944,20 +1963,35 @@ global_rank[] == 0 && println("recalculating precon") z_advect, vpa_advect, scratch_dummy, external_source_settings, num_diss_params, t_params, ion_dt, ir, true, :all, true, false) - begin_serial_region() - if block_rank[] == 0 - if size(orig_lu) == (1, 1) + #begin_serial_region() + qrblocksize = 4 + localcols = MPIQR.localcols(block_rank[], size(precon_matrix, 2), + qrblocksize, block_size[]) + + if block_rank[] == 0 # >= 0 + if size(orig_lu, 1) == 1 # Have not properly created the LU decomposition before, so # cannot reuse it. 
- @timeit_debug global_timer "lu" nl_solver_params.preconditioners[ir] = - (lu(sparse(precon_matrix)), precon_matrix, input_buffer, - output_buffer) + #@timeit_debug global_timer "lu" nl_solver_params.preconditioners[ir] = + # (lu(sparse(precon_matrix)), precon_matrix, input_buffer, + # output_buffer) + @timeit_debug global_timer "lu" begin + mpiqrmatrix = MPIQR.MPIQRMatrix((precon_matrix[:, localcols]), size(precon_matrix); + blocksize=qrblocksize) + mpiqrstruct = qr!(mpiqrmatrix) + nl_solver_params.preconditioners[ir] = + (mpiqrstruct, precon_matrix, input_buffer, output_buffer) + end else # LU decomposition was previously created. The Jacobian always # has the same sparsity pattern, so by using `lu!()` we can # reuse some setup. try - @timeit_debug global_timer "lu!" lu!(orig_lu, sparse(precon_matrix); check=false) + #@timeit_debug global_timer "lu!" lu!(orig_lu, sparse(precon_matrix); check=false) + @timeit_debug global_timer "lu!" begin + orig_lu[:, localcols] = precon_matrix[:, localcols] + qr!(orig_lu) + end catch e if !isa(e, ArgumentError) rethrow(e) @@ -2007,10 +2041,10 @@ global_rank[] == 0 && println("recalculating precon") counter += 1 end - begin_serial_region() - @serial_region begin - @timeit_debug global_timer "ldiv!" ldiv!(this_output_buffer, precon_lu, this_input_buffer) - end + #begin_serial_region() + #@serial_region begin + @timeit_debug global_timer "ldiv!" MPIQR.ldiv!(this_output_buffer, precon_lu, this_input_buffer) + #end begin_serial_region() counter = 1 diff --git a/moment_kinetics/src/nonlinear_solvers.jl b/moment_kinetics/src/nonlinear_solvers.jl index 361547970..f13d5ddc4 100644 --- a/moment_kinetics/src/nonlinear_solvers.jl +++ b/moment_kinetics/src/nonlinear_solvers.jl @@ -41,6 +41,7 @@ using LinearAlgebra using MPI using SparseArrays using StatsBase: mean +using MPIQR struct nl_solver_info{TH,TV,Tcsg,Tlig,Tprecon,Tpretype} rtol::mk_float @@ -166,23 +167,25 @@ function setup_nonlinear_solve(active, input_dict, coords, outer_coords=(); defa V .= 0.0 end end - + mpiqr = MPIQR.MPIQRMatrix(zeros(Float64, 1, 1), (1, block_size[]); + blocksize=1, comm=comm_block[]) + mpiqrstruct = MPIQR.MPIQRStruct(mpiqr) if preconditioner_type === Val(:lu) # Create dummy LU solver objects so we can create an array for preconditioners. # These will be calculated properly within the time loop. - preconditioners = fill(lu(sparse(1.0*I, total_size_coords, total_size_coords)), + preconditioners = fill(mpiqrstruct, reverse(outer_coord_sizes)) elseif preconditioner_type === Val(:electron_split_lu) - preconditioners = (z=fill(lu(sparse(1.0*I, coords.z.n, coords.z.n)), + preconditioners = (z=fill(mpiqrstruct, tuple(coords.vpa.n, reverse(outer_coord_sizes)...)), - vpa=fill(lu(sparse(1.0*I, coords.vpa.n, coords.vpa.n)), + vpa=fill(mpqirstruct, tuple(coords.z.n, reverse(outer_coord_sizes)...)), - ppar=fill(lu(sparse(1.0*I, coords.z.n, coords.z.n)), + ppar=fill(mpiqrstruct, reverse(outer_coord_sizes)), ) elseif preconditioner_type === Val(:electron_lu) pdf_plus_ppar_size = total_size_coords + coords.z.n - preconditioners = fill((lu(sparse(1.0*I, 1, 1)), + preconditioners = fill((mpiqrstruct, allocate_shared_float(pdf_plus_ppar_size, pdf_plus_ppar_size), allocate_shared_float(pdf_plus_ppar_size), allocate_shared_float(pdf_plus_ppar_size), @@ -202,6 +205,7 @@ function setup_nonlinear_solve(active, input_dict, coords, outer_coords=(); defa # Plus one for the one point of ppar that is included in the 'v solve'. 
v_solve_n = nvperp * nvpa + 1 v_solve_implicit_lus = Vector{SparseArrays.UMFPACK.UmfpackLU{mk_float, mk_int}}(undef, v_solve_nsolve) + #v_solve_implicit_lus = Vector{Any}(undef, v_solve_nsolve) v_solve_explicit_matrices = Vector{SparseMatrixCSC{mk_float, mk_int}}(undef, v_solve_nsolve) # This buffer is not shared-memory, because it will be used for a serial LU solve. v_solve_buffer = allocate_float(v_solve_n) @@ -221,6 +225,7 @@ function setup_nonlinear_solve(active, input_dict, coords, outer_coords=(); defa end z_solve_n = nz z_solve_implicit_lus = Vector{SparseArrays.UMFPACK.UmfpackLU{mk_float, mk_int}}(undef, z_solve_nsolve) + #z_solve_implicit_lus = Vector{Any}(undef, z_solve_nsolve) z_solve_explicit_matrices = Vector{SparseMatrixCSC{mk_float, mk_int}}(undef, z_solve_nsolve) # This buffer is not shared-memory, because it will be used for a serial LU solve. z_solve_buffer = allocate_float(z_solve_n) diff --git a/moment_kinetics/src/time_advance.jl b/moment_kinetics/src/time_advance.jl index 777f9b4ec..129d4944d 100644 --- a/moment_kinetics/src/time_advance.jl +++ b/moment_kinetics/src/time_advance.jl @@ -493,7 +493,7 @@ function setup_time_info(t_input, n_variables, code_time, dt_reload, electron_preconditioner_type = Val(:electron_lu) else # Want to parallelise preconditioner, so use ADI method. - electron_preconditioner_type = Val(:electron_adi) + electron_preconditioner_type = Val(:electron_lu) # now actually MPIQR end else if t_input["implicit_electron_time_evolving"] ∈ keys(electron_precon_types) diff --git a/moment_kinetics/src/timer_utils.jl b/moment_kinetics/src/timer_utils.jl index 3ff8f02ff..307558612 100644 --- a/moment_kinetics/src/timer_utils.jl +++ b/moment_kinetics/src/timer_utils.jl @@ -24,7 +24,7 @@ To control the debug timers in `moment_kinetics` we define this function once, i To activate debug timers, edit this function so that it returns `true`. """ -timeit_debug_enabled() = false +timeit_debug_enabled() = true """ Global object used to collect timings of various parts of the code From 57588a5875ea632c1751030677e0bdf875b5d481 Mon Sep 17 00:00:00 2001 From: James Cook Date: Fri, 17 Jan 2025 13:44:09 +0000 Subject: [PATCH 02/18] block_rank[] >= 0 plus others --- .../src/electron_kinetic_equation.jl | 33 +++++++++++-------- 1 file changed, 19 insertions(+), 14 deletions(-) diff --git a/moment_kinetics/src/electron_kinetic_equation.jl b/moment_kinetics/src/electron_kinetic_equation.jl index c179557f1..070d3cf93 100644 --- a/moment_kinetics/src/electron_kinetic_equation.jl +++ b/moment_kinetics/src/electron_kinetic_equation.jl @@ -71,6 +71,8 @@ using ..velocity_moments: integrate_over_vspace, calculate_electron_moment_deriv # Only needed so we can reference it in a docstring import ..runge_kutta +const MPIQRBLOCKSIZE = 4 # 1 will be slow, 4 is about optimum, 8 may be faster or slower, 16 will be slower. + const integral_correction_sharpness = 4.0 """ update_electron_pdf is a function that uses the electron kinetic equation @@ -1233,10 +1235,10 @@ global_rank[] == 0 && println("recalculating precon 1") ir, evolve_ppar) #begin_serial_region() - qrblocksize = 4 - localcols = MPIQR.localcolumns(block_rank[], size(precon_matrix, 2), - qrblocksize, block_size[])#1:size(precon_matrix, 2)# + _block_synchronize() if block_rank[] >= 0 # == 0 + localcols = MPIQR.localcolumns(block_rank[], size(precon_matrix, 2), + MPIQRBLOCKSIZE, block_size[])#1:size(precon_matrix, 2)# if size(orig_lu, 1) == 1 # Have not properly created the LU decomposition before, so # cannot reuse it. 
@@ -1244,8 +1246,8 @@ global_rank[] == 0 && println("recalculating precon 1") # (lu(sparse(precon_matrix)), precon_matrix, input_buffer, # output_buffer) @timeit_debug global_timer "lu" begin - mpiqrmatrix = MPIQR.MPIQRMatrix((precon_matrix[:, localcols]), - size(precon_matrix); blocksize=qrblocksize) + mpiqrmatrix = MPIQR.MPIQRMatrix(precon_matrix[:, localcols], + size(precon_matrix); blocksize=MPIQRBLOCKSIZE) progress = Progress(mpiqrmatrix, dt=iszero(block_rank[]) ? 1 : 2^31; showspeed=true) mpiqrstruct = qr!(mpiqrmatrix; progress=progress) @@ -1268,8 +1270,8 @@ global_rank[] == 0 && println("recalculating precon 1") end println("Sparsity pattern of matrix changed, rebuilding " * " LU from scratch") - mpiqrmatrix = MPIQR.MPIQRMatrix((precon_matrix[:, localcols]), - size(precon_matrix); blocksize=qrblocksize) + mpiqrmatrix = MPIQR.MPIQRMatrix(precon_matrix[:, localcols], + size(precon_matrix); blocksize=MPIQRBLOCKSIZE) @timeit_debug global_timer "lu" orig_lu = qr!(mpiqrmatrix) end nl_solver_params.preconditioners[ir] = @@ -1279,6 +1281,7 @@ global_rank[] == 0 && println("recalculating precon 1") nl_solver_params.preconditioners[ir] = (orig_lu, precon_matrix, input_buffer, output_buffer) end + _block_synchronize() end @@ -1964,11 +1967,10 @@ global_rank[] == 0 && println("recalculating precon 3") num_diss_params, t_params, ion_dt, ir, true, :all, true, false) #begin_serial_region() - qrblocksize = 4 - localcols = MPIQR.localcols(block_rank[], size(precon_matrix, 2), - qrblocksize, block_size[]) - - if block_rank[] == 0 # >= 0 + _block_synchronize() + if block_rank[] >= 0 # >= 0 + localcols = MPIQR.localcols(block_rank[], size(precon_matrix, 2), + MPIQRBLOCKSIZE, block_size[]) if size(orig_lu, 1) == 1 # Have not properly created the LU decomposition before, so # cannot reuse it. 
@@ -1977,7 +1979,7 @@ global_rank[] == 0 && println("recalculating precon 3") # output_buffer) @timeit_debug global_timer "lu" begin mpiqrmatrix = MPIQR.MPIQRMatrix((precon_matrix[:, localcols]), size(precon_matrix); - blocksize=qrblocksize) + blocksize=MPIQRBLOCKSIZE) mpiqrstruct = qr!(mpiqrmatrix) nl_solver_params.preconditioners[ir] = (mpiqrstruct, precon_matrix, input_buffer, output_buffer) @@ -1998,7 +2000,9 @@ global_rank[] == 0 && println("recalculating precon 3") end println("Sparsity pattern of matrix changed, rebuilding " * " LU from scratch") - @timeit_debug global_timer "lu" orig_lu = lu(sparse(precon_matrix)) + mpiqrmatrix = MPIQR.MPIQRMatrix(precon_matrix[:, localcols], + size(precon_matrix); blocksize=MPIQRBLOCKSIZE) + @timeit_debug global_timer "lu" orig_lu = qr!(mpiqrmatrix) end nl_solver_params.preconditioners[ir] = (orig_lu, precon_matrix, input_buffer, output_buffer) @@ -2007,6 +2011,7 @@ global_rank[] == 0 && println("recalculating precon 3") nl_solver_params.preconditioners[ir] = (orig_lu, precon_matrix, input_buffer, output_buffer) end + _block_synchronize() return nothing end From a839dc84dbcf374df9f7ed698bdd303c6304db5f Mon Sep 17 00:00:00 2001 From: James Cook Date: Mon, 20 Jan 2025 17:21:22 +0000 Subject: [PATCH 03/18] this works with latest MPIQR.jl --- moment_kinetics/Project.toml | 4 +- .../src/electron_kinetic_equation.jl | 46 ++++++------------- 2 files changed, 14 insertions(+), 36 deletions(-) diff --git a/moment_kinetics/Project.toml b/moment_kinetics/Project.toml index 9f8296ef4..e8a430cc7 100644 --- a/moment_kinetics/Project.toml +++ b/moment_kinetics/Project.toml @@ -26,7 +26,6 @@ Pkg = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f" Preferences = "21216c6a-2e73-6563-6e65-726566657250" Primes = "27ebfcd6-29c5-5fa9-bf4b-fb8fc14df3ae" Printf = "de0858da-6303-5e67-8744-51eddeeeb8d7" -ProgressMeter = "92933f4c-e287-5a05-a399-4b506db050ca" Quadmath = "be4d8f0f-7fa4-5f49-b795-2f01399ab2dd" Revise = "295af30f-e4ad-537b-8983-00126c2a3abe" Roots = "f2b01f46-fcfa-551c-844a-d8ac1e96c665" @@ -53,7 +52,6 @@ manufactured_solns_ext = ["Symbolics", "IfElse"] [compat] MPIQR = "0.1.0" -ProgressMeter = "1.10.2" julia = "1.9.0" [extras] @@ -61,4 +59,4 @@ Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" [targets] -test = ["Random", "Test", "NCDatasets"] +test = ["Random", "Test"] diff --git a/moment_kinetics/src/electron_kinetic_equation.jl b/moment_kinetics/src/electron_kinetic_equation.jl index 070d3cf93..750b4e904 100644 --- a/moment_kinetics/src/electron_kinetic_equation.jl +++ b/moment_kinetics/src/electron_kinetic_equation.jl @@ -4,7 +4,6 @@ using LinearAlgebra using MPI using SparseArrays using MPIQR -using ProgressMeter export get_electron_critical_velocities @@ -1220,7 +1219,6 @@ pressure \$p_{e∥}\$. end if nl_solver_params.solves_since_precon_update[] ≥ nl_solver_params.preconditioner_update_interval -global_rank[] == 0 && println("recalculating precon 1") nl_solver_params.solves_since_precon_update[] = 0 nl_solver_params.precon_dt[] = t_params.dt[] @@ -1234,7 +1232,6 @@ global_rank[] == 0 && println("recalculating precon 1") external_source_settings, num_diss_params, t_params, ion_dt, ir, evolve_ppar) - #begin_serial_region() _block_synchronize() if block_rank[] >= 0 # == 0 localcols = MPIQR.localcolumns(block_rank[], size(precon_matrix, 2), @@ -1242,26 +1239,20 @@ global_rank[] == 0 && println("recalculating precon 1") if size(orig_lu, 1) == 1 # Have not properly created the LU decomposition before, so # cannot reuse it. 
- #@timeit_debug global_timer "lu" nl_solver_params.preconditioners[ir] = - # (lu(sparse(precon_matrix)), precon_matrix, input_buffer, - # output_buffer) @timeit_debug global_timer "lu" begin mpiqrmatrix = MPIQR.MPIQRMatrix(precon_matrix[:, localcols], - size(precon_matrix); blocksize=MPIQRBLOCKSIZE) - progress = Progress(mpiqrmatrix, dt=iszero(block_rank[]) ? 1 : 2^31; - showspeed=true) - mpiqrstruct = qr!(mpiqrmatrix; progress=progress) + size(precon_matrix); blocksize=MPIQRBLOCKSIZE, comm=comm_block[]) + mpiqrstruct = qr!(mpiqrmatrix) nl_solver_params.preconditioners[ir] = - (mpiqrstruct, precon_matrix, input_buffer, output_buffer) + (mpiqrstruct, precon_matrix, input_buffer, output_buffer) end else # LU decomposition was previously created. The Jacobian always # has the same sparsity pattern, so by using `lu!()` we can # reuse some setup. try - #@timeit_debug global_timer "lu!" lu!(orig_lu, sparse(precon_matrix); check=false) @timeit_debug global_timer "lu!" begin - orig_lu[:, localcols] = precon_matrix[:, localcols] + orig_lu[:, localcols] .= precon_matrix[:, localcols] qr!(orig_lu) end catch e @@ -1271,7 +1262,7 @@ global_rank[] == 0 && println("recalculating precon 1") println("Sparsity pattern of matrix changed, rebuilding " * " LU from scratch") mpiqrmatrix = MPIQR.MPIQRMatrix(precon_matrix[:, localcols], - size(precon_matrix); blocksize=MPIQRBLOCKSIZE) + size(precon_matrix); blocksize=MPIQRBLOCKSIZE, comm=comm_block[]) @timeit_debug global_timer "lu" orig_lu = qr!(mpiqrmatrix) end nl_solver_params.preconditioners[ir] = @@ -1302,10 +1293,8 @@ global_rank[] == 0 && println("recalculating precon 1") counter += 1 end - #begin_serial_region() - #@serial_region begin - @timeit_debug global_timer "ldiv!" MPIQR.ldiv!(this_output_buffer, precon_lu, this_input_buffer) - #end + _block_synchronize() + @timeit_debug global_timer "ldiv!" MPIQR.ldiv!(this_output_buffer, precon_lu, this_input_buffer) begin_serial_region() counter = 1 @@ -1369,7 +1358,6 @@ global_rank[] == 0 && println("recalculating precon 1") end if nl_solver_params.solves_since_precon_update[] ≥ nl_solver_params.preconditioner_update_interval -global_rank[] == 0 && println("recalculating precon 2") nl_solver_params.solves_since_precon_update[] = 0 nl_solver_params.precon_dt[] = t_params.dt[] @@ -1953,7 +1941,6 @@ to allow the outer r-loop to be parallelised. function recalculate_preconditioner!() if nl_solver_params.preconditioner_type === Val(:electron_lu) -global_rank[] == 0 && println("recalculating precon 3") nl_solver_params.solves_since_precon_update[] = 0 nl_solver_params.precon_dt[] = ion_dt @@ -1966,7 +1953,6 @@ global_rank[] == 0 && println("recalculating precon 3") z_advect, vpa_advect, scratch_dummy, external_source_settings, num_diss_params, t_params, ion_dt, ir, true, :all, true, false) - #begin_serial_region() _block_synchronize() if block_rank[] >= 0 # >= 0 localcols = MPIQR.localcols(block_rank[], size(precon_matrix, 2), @@ -1974,12 +1960,9 @@ global_rank[] == 0 && println("recalculating precon 3") if size(orig_lu, 1) == 1 # Have not properly created the LU decomposition before, so # cannot reuse it. 
- #@timeit_debug global_timer "lu" nl_solver_params.preconditioners[ir] = - # (lu(sparse(precon_matrix)), precon_matrix, input_buffer, - # output_buffer) @timeit_debug global_timer "lu" begin mpiqrmatrix = MPIQR.MPIQRMatrix((precon_matrix[:, localcols]), size(precon_matrix); - blocksize=MPIQRBLOCKSIZE) + blocksize=MPIQRBLOCKSIZE, comm=comm_block[]) mpiqrstruct = qr!(mpiqrmatrix) nl_solver_params.preconditioners[ir] = (mpiqrstruct, precon_matrix, input_buffer, output_buffer) @@ -1989,9 +1972,8 @@ global_rank[] == 0 && println("recalculating precon 3") # has the same sparsity pattern, so by using `lu!()` we can # reuse some setup. try - #@timeit_debug global_timer "lu!" lu!(orig_lu, sparse(precon_matrix); check=false) @timeit_debug global_timer "lu!" begin - orig_lu[:, localcols] = precon_matrix[:, localcols] + orig_lu[:, localcols] .= precon_matrix[:, localcols] qr!(orig_lu) end catch e @@ -2000,8 +1982,8 @@ global_rank[] == 0 && println("recalculating precon 3") end println("Sparsity pattern of matrix changed, rebuilding " * " LU from scratch") - mpiqrmatrix = MPIQR.MPIQRMatrix(precon_matrix[:, localcols], - size(precon_matrix); blocksize=MPIQRBLOCKSIZE) + mpiqrmatrix = MPIQR.MPIQRMatrix(precon_matrix[:, localcols], size(precon_matrix); + blocksize=MPIQRBLOCKSIZE, comm=comm_block[]) @timeit_debug global_timer "lu" orig_lu = qr!(mpiqrmatrix) end nl_solver_params.preconditioners[ir] = @@ -2046,10 +2028,8 @@ global_rank[] == 0 && println("recalculating precon 3") counter += 1 end - #begin_serial_region() - #@serial_region begin - @timeit_debug global_timer "ldiv!" MPIQR.ldiv!(this_output_buffer, precon_lu, this_input_buffer) - #end + _block_synchronize() + @timeit_debug global_timer "ldiv!" MPIQR.ldiv!(this_output_buffer, precon_lu, this_input_buffer) begin_serial_region() counter = 1 From 7b80960c4b5ebcd0fb9a88abc37f560864449870 Mon Sep 17 00:00:00 2001 From: James Cook Date: Tue, 21 Jan 2025 09:43:25 +0000 Subject: [PATCH 04/18] put println statements back in --- moment_kinetics/src/electron_kinetic_equation.jl | 3 +++ 1 file changed, 3 insertions(+) diff --git a/moment_kinetics/src/electron_kinetic_equation.jl b/moment_kinetics/src/electron_kinetic_equation.jl index 750b4e904..5adb40ceb 100644 --- a/moment_kinetics/src/electron_kinetic_equation.jl +++ b/moment_kinetics/src/electron_kinetic_equation.jl @@ -1219,6 +1219,7 @@ pressure \$p_{e∥}\$. end if nl_solver_params.solves_since_precon_update[] ≥ nl_solver_params.preconditioner_update_interval + global_rank[] == 0 && println("recalculating precon :electron_lu 1") nl_solver_params.solves_since_precon_update[] = 0 nl_solver_params.precon_dt[] = t_params.dt[] @@ -1349,6 +1350,7 @@ pressure \$p_{e∥}\$. left_preconditioner = identity right_preconditioner = lu_precon! elseif nl_solver_params.preconditioner_type === Val(:electron_adi) + global_rank[] == 0 && println("recalculating precon :electron_adi") if t_params.dt[] > 1.5 * nl_solver_params.precon_dt[] || t_params.dt[] < 2.0/3.0 * nl_solver_params.precon_dt[] @@ -1941,6 +1943,7 @@ to allow the outer r-loop to be parallelised. 
function recalculate_preconditioner!() if nl_solver_params.preconditioner_type === Val(:electron_lu) + global_rank[] == 0 && println("recalculating precon :electron_lu 2") nl_solver_params.solves_since_precon_update[] = 0 nl_solver_params.precon_dt[] = ion_dt From bc7f5f4e4ed781ec72b63145d23f4fb43512e9f1 Mon Sep 17 00:00:00 2001 From: James Cook Date: Tue, 21 Jan 2025 09:44:28 +0000 Subject: [PATCH 05/18] delete dead line --- moment_kinetics/src/nonlinear_solvers.jl | 1 - 1 file changed, 1 deletion(-) diff --git a/moment_kinetics/src/nonlinear_solvers.jl b/moment_kinetics/src/nonlinear_solvers.jl index f13d5ddc4..ef7828005 100644 --- a/moment_kinetics/src/nonlinear_solvers.jl +++ b/moment_kinetics/src/nonlinear_solvers.jl @@ -205,7 +205,6 @@ function setup_nonlinear_solve(active, input_dict, coords, outer_coords=(); defa # Plus one for the one point of ppar that is included in the 'v solve'. v_solve_n = nvperp * nvpa + 1 v_solve_implicit_lus = Vector{SparseArrays.UMFPACK.UmfpackLU{mk_float, mk_int}}(undef, v_solve_nsolve) - #v_solve_implicit_lus = Vector{Any}(undef, v_solve_nsolve) v_solve_explicit_matrices = Vector{SparseMatrixCSC{mk_float, mk_int}}(undef, v_solve_nsolve) # This buffer is not shared-memory, because it will be used for a serial LU solve. v_solve_buffer = allocate_float(v_solve_n) From f7468a628b3663cccb73aa478532df8cbe8a72a5 Mon Sep 17 00:00:00 2001 From: James Cook Date: Tue, 21 Jan 2025 09:45:17 +0000 Subject: [PATCH 06/18] timeit_debug_enabled set back to false --- moment_kinetics/src/timer_utils.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/moment_kinetics/src/timer_utils.jl b/moment_kinetics/src/timer_utils.jl index 307558612..3ff8f02ff 100644 --- a/moment_kinetics/src/timer_utils.jl +++ b/moment_kinetics/src/timer_utils.jl @@ -24,7 +24,7 @@ To control the debug timers in `moment_kinetics` we define this function once, i To activate debug timers, edit this function so that it returns `true`. """ -timeit_debug_enabled() = true +timeit_debug_enabled() = false """ Global object used to collect timings of various parts of the code From 43f2414e81a5a3a0bd5ad708551fddcd7b483e9c Mon Sep 17 00:00:00 2001 From: James Cook Date: Tue, 21 Jan 2025 09:45:44 +0000 Subject: [PATCH 07/18] delete dead line --- moment_kinetics/src/nonlinear_solvers.jl | 1 - 1 file changed, 1 deletion(-) diff --git a/moment_kinetics/src/nonlinear_solvers.jl b/moment_kinetics/src/nonlinear_solvers.jl index ef7828005..2f786431e 100644 --- a/moment_kinetics/src/nonlinear_solvers.jl +++ b/moment_kinetics/src/nonlinear_solvers.jl @@ -224,7 +224,6 @@ function setup_nonlinear_solve(active, input_dict, coords, outer_coords=(); defa end z_solve_n = nz z_solve_implicit_lus = Vector{SparseArrays.UMFPACK.UmfpackLU{mk_float, mk_int}}(undef, z_solve_nsolve) - #z_solve_implicit_lus = Vector{Any}(undef, z_solve_nsolve) z_solve_explicit_matrices = Vector{SparseMatrixCSC{mk_float, mk_int}}(undef, z_solve_nsolve) # This buffer is not shared-memory, because it will be used for a serial LU solve. 
z_solve_buffer = allocate_float(z_solve_n) From 676a9be0f5c58e2ce40622c8e749cff3bf34890e Mon Sep 17 00:00:00 2001 From: James Cook Date: Tue, 21 Jan 2025 09:47:04 +0000 Subject: [PATCH 08/18] time_advance was forced to use MPIQR rather than ADI - revert this to how it was before --- moment_kinetics/src/time_advance.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/moment_kinetics/src/time_advance.jl b/moment_kinetics/src/time_advance.jl index 129d4944d..c1d4d8f53 100644 --- a/moment_kinetics/src/time_advance.jl +++ b/moment_kinetics/src/time_advance.jl @@ -493,7 +493,7 @@ function setup_time_info(t_input, n_variables, code_time, dt_reload, electron_preconditioner_type = Val(:electron_lu) else # Want to parallelise preconditioner, so use ADI method. - electron_preconditioner_type = Val(:electron_lu) # now actually MPIQR + electron_preconditioner_type = Val(:electron_adi) # now actually MPIQR end else if t_input["implicit_electron_time_evolving"] ∈ keys(electron_precon_types) From 35ba4b11b6ed1d6c61121d8d77e9995243fdaaa4 Mon Sep 17 00:00:00 2001 From: James Cook Date: Tue, 21 Jan 2025 09:49:47 +0000 Subject: [PATCH 09/18] tidy up --- moment_kinetics/src/electron_kinetic_equation.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/moment_kinetics/src/electron_kinetic_equation.jl b/moment_kinetics/src/electron_kinetic_equation.jl index 5adb40ceb..01f25b781 100644 --- a/moment_kinetics/src/electron_kinetic_equation.jl +++ b/moment_kinetics/src/electron_kinetic_equation.jl @@ -1236,7 +1236,7 @@ pressure \$p_{e∥}\$. _block_synchronize() if block_rank[] >= 0 # == 0 localcols = MPIQR.localcolumns(block_rank[], size(precon_matrix, 2), - MPIQRBLOCKSIZE, block_size[])#1:size(precon_matrix, 2)# + MPIQRBLOCKSIZE, block_size[]) if size(orig_lu, 1) == 1 # Have not properly created the LU decomposition before, so # cannot reuse it. From 933d749599eccf6870b79a06f25879e9f9e143b2 Mon Sep 17 00:00:00 2001 From: James Cook Date: Tue, 21 Jan 2025 13:53:32 +0000 Subject: [PATCH 10/18] increase MPIQRBLOCKSIZE to 8 --- moment_kinetics/src/electron_kinetic_equation.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/moment_kinetics/src/electron_kinetic_equation.jl b/moment_kinetics/src/electron_kinetic_equation.jl index 01f25b781..81580fcf8 100644 --- a/moment_kinetics/src/electron_kinetic_equation.jl +++ b/moment_kinetics/src/electron_kinetic_equation.jl @@ -70,7 +70,7 @@ using ..velocity_moments: integrate_over_vspace, calculate_electron_moment_deriv # Only needed so we can reference it in a docstring import ..runge_kutta -const MPIQRBLOCKSIZE = 4 # 1 will be slow, 4 is about optimum, 8 may be faster or slower, 16 will be slower. +const MPIQRBLOCKSIZE = 8 # 1 will be slow, 4 is good for smaller matrices, 8 is good for larger matrices, 16 - who knows?! const integral_correction_sharpness = 4.0 """ From 52f939e8bd28b56a1fad2c637ab05deb5a6429cd Mon Sep 17 00:00:00 2001 From: James Cook Date: Tue, 21 Jan 2025 14:54:31 +0000 Subject: [PATCH 11/18] delete obsolete comment --- moment_kinetics/src/time_advance.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/moment_kinetics/src/time_advance.jl b/moment_kinetics/src/time_advance.jl index c1d4d8f53..777f9b4ec 100644 --- a/moment_kinetics/src/time_advance.jl +++ b/moment_kinetics/src/time_advance.jl @@ -493,7 +493,7 @@ function setup_time_info(t_input, n_variables, code_time, dt_reload, electron_preconditioner_type = Val(:electron_lu) else # Want to parallelise preconditioner, so use ADI method. 
- electron_preconditioner_type = Val(:electron_adi) # now actually MPIQR + electron_preconditioner_type = Val(:electron_adi) end else if t_input["implicit_electron_time_evolving"] ∈ keys(electron_precon_types) From 2ed2b17d8324ce0d2e8f7d26f91715bf44c097ea Mon Sep 17 00:00:00 2001 From: James Cook Date: Tue, 21 Jan 2025 14:55:25 +0000 Subject: [PATCH 12/18] make using statements alphabetical --- moment_kinetics/src/nonlinear_solvers.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/moment_kinetics/src/nonlinear_solvers.jl b/moment_kinetics/src/nonlinear_solvers.jl index 2f786431e..568dba47f 100644 --- a/moment_kinetics/src/nonlinear_solvers.jl +++ b/moment_kinetics/src/nonlinear_solvers.jl @@ -39,9 +39,9 @@ using ..type_definitions: mk_float, mk_int using LinearAlgebra using MPI +using MPIQR using SparseArrays using StatsBase: mean -using MPIQR struct nl_solver_info{TH,TV,Tcsg,Tlig,Tprecon,Tpretype} rtol::mk_float From 2f932ee65db53ae05af71a0f16b9c229c91eefea Mon Sep 17 00:00:00 2001 From: James Cook Date: Tue, 21 Jan 2025 14:56:02 +0000 Subject: [PATCH 13/18] make using statements alphabetical --- moment_kinetics/src/electron_kinetic_equation.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/moment_kinetics/src/electron_kinetic_equation.jl b/moment_kinetics/src/electron_kinetic_equation.jl index 81580fcf8..016e9a997 100644 --- a/moment_kinetics/src/electron_kinetic_equation.jl +++ b/moment_kinetics/src/electron_kinetic_equation.jl @@ -2,8 +2,8 @@ module electron_kinetic_equation using LinearAlgebra using MPI -using SparseArrays using MPIQR +using SparseArrays export get_electron_critical_velocities From 733e9196fa7ede5b17b0bf412af1bd8b2d188a22 Mon Sep 17 00:00:00 2001 From: James Cook Date: Tue, 21 Jan 2025 14:58:59 +0000 Subject: [PATCH 14/18] make all caps const MPIQRBLOCKSIZE mpiqr_blocksize as inkeeping with style --- .../src/electron_kinetic_equation.jl | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/moment_kinetics/src/electron_kinetic_equation.jl b/moment_kinetics/src/electron_kinetic_equation.jl index 016e9a997..4f1adfe15 100644 --- a/moment_kinetics/src/electron_kinetic_equation.jl +++ b/moment_kinetics/src/electron_kinetic_equation.jl @@ -70,9 +70,10 @@ using ..velocity_moments: integrate_over_vspace, calculate_electron_moment_deriv # Only needed so we can reference it in a docstring import ..runge_kutta -const MPIQRBLOCKSIZE = 8 # 1 will be slow, 4 is good for smaller matrices, 8 is good for larger matrices, 16 - who knows?! - const integral_correction_sharpness = 4.0 + +const mpiqr_blocksize = 8 # 1 will be slow, 4 is good for smaller matrices, 8 is good for larger matrices, 16 - who knows?! + """ update_electron_pdf is a function that uses the electron kinetic equation to solve for the updated electron pdf @@ -1236,13 +1237,13 @@ pressure \$p_{e∥}\$. _block_synchronize() if block_rank[] >= 0 # == 0 localcols = MPIQR.localcolumns(block_rank[], size(precon_matrix, 2), - MPIQRBLOCKSIZE, block_size[]) + mpiqr_blocksize, block_size[]) if size(orig_lu, 1) == 1 # Have not properly created the LU decomposition before, so # cannot reuse it. 
@timeit_debug global_timer "lu" begin mpiqrmatrix = MPIQR.MPIQRMatrix(precon_matrix[:, localcols], - size(precon_matrix); blocksize=MPIQRBLOCKSIZE, comm=comm_block[]) + size(precon_matrix); blocksize=mpiqr_blocksize, comm=comm_block[]) mpiqrstruct = qr!(mpiqrmatrix) nl_solver_params.preconditioners[ir] = (mpiqrstruct, precon_matrix, input_buffer, output_buffer) @@ -1263,7 +1264,7 @@ pressure \$p_{e∥}\$. println("Sparsity pattern of matrix changed, rebuilding " * " LU from scratch") mpiqrmatrix = MPIQR.MPIQRMatrix(precon_matrix[:, localcols], - size(precon_matrix); blocksize=MPIQRBLOCKSIZE, comm=comm_block[]) + size(precon_matrix); blocksize=mpiqr_blocksize, comm=comm_block[]) @timeit_debug global_timer "lu" orig_lu = qr!(mpiqrmatrix) end nl_solver_params.preconditioners[ir] = @@ -1959,13 +1960,13 @@ to allow the outer r-loop to be parallelised. _block_synchronize() if block_rank[] >= 0 # >= 0 localcols = MPIQR.localcols(block_rank[], size(precon_matrix, 2), - MPIQRBLOCKSIZE, block_size[]) + mpiqr_blocksize, block_size[]) if size(orig_lu, 1) == 1 # Have not properly created the LU decomposition before, so # cannot reuse it. @timeit_debug global_timer "lu" begin mpiqrmatrix = MPIQR.MPIQRMatrix((precon_matrix[:, localcols]), size(precon_matrix); - blocksize=MPIQRBLOCKSIZE, comm=comm_block[]) + blocksize=mpiqr_blocksize, comm=comm_block[]) mpiqrstruct = qr!(mpiqrmatrix) nl_solver_params.preconditioners[ir] = (mpiqrstruct, precon_matrix, input_buffer, output_buffer) @@ -1986,7 +1987,7 @@ to allow the outer r-loop to be parallelised. println("Sparsity pattern of matrix changed, rebuilding " * " LU from scratch") mpiqrmatrix = MPIQR.MPIQRMatrix(precon_matrix[:, localcols], size(precon_matrix); - blocksize=MPIQRBLOCKSIZE, comm=comm_block[]) + blocksize=mpiqr_blocksize, comm=comm_block[]) @timeit_debug global_timer "lu" orig_lu = qr!(mpiqrmatrix) end nl_solver_params.preconditioners[ir] = From 207ba22080f1c0718c5ca4ce93a4344438033614 Mon Sep 17 00:00:00 2001 From: James Cook Date: Tue, 21 Jan 2025 16:09:08 +0000 Subject: [PATCH 15/18] remove unnecessary defensive _block_synchonize() calls --- moment_kinetics/src/electron_kinetic_equation.jl | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/moment_kinetics/src/electron_kinetic_equation.jl b/moment_kinetics/src/electron_kinetic_equation.jl index 4f1adfe15..2f49b3574 100644 --- a/moment_kinetics/src/electron_kinetic_equation.jl +++ b/moment_kinetics/src/electron_kinetic_equation.jl @@ -1234,8 +1234,7 @@ pressure \$p_{e∥}\$. external_source_settings, num_diss_params, t_params, ion_dt, ir, evolve_ppar) - _block_synchronize() - if block_rank[] >= 0 # == 0 + if block_rank[] >= 0 localcols = MPIQR.localcolumns(block_rank[], size(precon_matrix, 2), mpiqr_blocksize, block_size[]) if size(orig_lu, 1) == 1 @@ -1274,7 +1273,6 @@ pressure \$p_{e∥}\$. nl_solver_params.preconditioners[ir] = (orig_lu, precon_matrix, input_buffer, output_buffer) end - _block_synchronize() end @@ -1295,7 +1293,6 @@ pressure \$p_{e∥}\$. counter += 1 end - _block_synchronize() @timeit_debug global_timer "ldiv!" MPIQR.ldiv!(this_output_buffer, precon_lu, this_input_buffer) begin_serial_region() @@ -1957,8 +1954,7 @@ to allow the outer r-loop to be parallelised. 
z_advect, vpa_advect, scratch_dummy, external_source_settings, num_diss_params, t_params, ion_dt, ir, true, :all, true, false) - _block_synchronize() - if block_rank[] >= 0 # >= 0 + if block_rank[] >= 0 localcols = MPIQR.localcols(block_rank[], size(precon_matrix, 2), mpiqr_blocksize, block_size[]) if size(orig_lu, 1) == 1 @@ -1997,7 +1993,6 @@ to allow the outer r-loop to be parallelised. nl_solver_params.preconditioners[ir] = (orig_lu, precon_matrix, input_buffer, output_buffer) end - _block_synchronize() return nothing end @@ -2032,7 +2027,6 @@ to allow the outer r-loop to be parallelised. counter += 1 end - _block_synchronize() @timeit_debug global_timer "ldiv!" MPIQR.ldiv!(this_output_buffer, precon_lu, this_input_buffer) begin_serial_region() From 792de1d22cfb809f3f3edbab40b89527828ad606 Mon Sep 17 00:00:00 2001 From: James Cook Date: Wed, 22 Jan 2025 11:45:22 +0000 Subject: [PATCH 16/18] use in-place 2-arg ldiv! --- moment_kinetics/src/electron_kinetic_equation.jl | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/moment_kinetics/src/electron_kinetic_equation.jl b/moment_kinetics/src/electron_kinetic_equation.jl index 2f49b3574..d33ad29e6 100644 --- a/moment_kinetics/src/electron_kinetic_equation.jl +++ b/moment_kinetics/src/electron_kinetic_equation.jl @@ -1293,7 +1293,8 @@ pressure \$p_{e∥}\$. counter += 1 end - @timeit_debug global_timer "ldiv!" MPIQR.ldiv!(this_output_buffer, precon_lu, this_input_buffer) + this_output_buffer .= this_input_buffer + @timeit_debug global_timer "ldiv!" MPIQR.ldiv!(precon_lu, this_output_buffer) begin_serial_region() counter = 1 @@ -2027,7 +2028,8 @@ to allow the outer r-loop to be parallelised. counter += 1 end - @timeit_debug global_timer "ldiv!" MPIQR.ldiv!(this_output_buffer, precon_lu, this_input_buffer) + this_output_buffer .= this_input_buffer + @timeit_debug global_timer "ldiv!" 
MPIQR.ldiv!(precon_lu, this_output_buffer) begin_serial_region() counter = 1 From 5f2a96b88b51287d0c91e7f53a661c78428fa82f Mon Sep 17 00:00:00 2001 From: James Cook Date: Thu, 23 Jan 2025 14:07:31 +0000 Subject: [PATCH 17/18] runs fast - not checked if correct though --- moment_kinetics/Project.toml | 4 +- .../src/electron_kinetic_equation.jl | 84 ++++++++++++------- moment_kinetics/src/nonlinear_solvers.jl | 16 ++-- 3 files changed, 62 insertions(+), 42 deletions(-) diff --git a/moment_kinetics/Project.toml b/moment_kinetics/Project.toml index e8a430cc7..edcd6deba 100644 --- a/moment_kinetics/Project.toml +++ b/moment_kinetics/Project.toml @@ -19,7 +19,7 @@ LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" LsqFit = "2fda8390-95c7-5789-9bda-21331edee243" MPI = "da04e1cc-30fd-572f-bb4f-1f8673147195" MPIPreferences = "3da0fdf6-3ccc-4f1b-acd9-58baa6c99267" -MPIQR = "4acd9526-7714-4958-a7d5-1632fb49fc8a" +MUMPS = "55d2b088-9f4e-11e9-26c0-150b02ea6a46" Measures = "442fdcdd-2543-5da2-b0f3-8c86c306513e" OrderedCollections = "bac558e1-5e72-5ebc-8fee-abe8a469f55d" Pkg = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f" @@ -51,7 +51,7 @@ file_io_netcdf = "NCDatasets" manufactured_solns_ext = ["Symbolics", "IfElse"] [compat] -MPIQR = "0.1.0" +MUMPS = "1.5.0" julia = "1.9.0" [extras] diff --git a/moment_kinetics/src/electron_kinetic_equation.jl b/moment_kinetics/src/electron_kinetic_equation.jl index d33ad29e6..33785aa2a 100644 --- a/moment_kinetics/src/electron_kinetic_equation.jl +++ b/moment_kinetics/src/electron_kinetic_equation.jl @@ -2,7 +2,7 @@ module electron_kinetic_equation using LinearAlgebra using MPI -using MPIQR +using MUMPS using SparseArrays export get_electron_critical_velocities @@ -72,8 +72,6 @@ import ..runge_kutta const integral_correction_sharpness = 4.0 -const mpiqr_blocksize = 8 # 1 will be slow, 4 is good for smaller matrices, 8 is good for larger matrices, 16 - who knows?! - """ update_electron_pdf is a function that uses the electron kinetic equation to solve for the updated electron pdf @@ -1233,19 +1231,19 @@ pressure \$p_{e∥}\$. vperp_spectral, vpa_spectral, z_advect, vpa_advect, scratch_dummy, external_source_settings, num_diss_params, t_params, ion_dt, ir, evolve_ppar) - + mumps = Mumps{mk_float}(mumps_unsymmetric, default_icntl, default_cntl64; + comm=MPI.API.MPI_Comm_f2c(comm_block[].val)) if block_rank[] >= 0 - localcols = MPIQR.localcolumns(block_rank[], size(precon_matrix, 2), - mpiqr_blocksize, block_size[]) - if size(orig_lu, 1) == 1 + if orig_lu.n == 0#size(orig_lu, 1) == 1 # Have not properly created the LU decomposition before, so # cannot reuse it. @timeit_debug global_timer "lu" begin - mpiqrmatrix = MPIQR.MPIQRMatrix(precon_matrix[:, localcols], - size(precon_matrix); blocksize=mpiqr_blocksize, comm=comm_block[]) - mpiqrstruct = qr!(mpiqrmatrix) + if block_rank[] == 0 + MUMPS.associate_matrix!(mumps, sparse(precon_matrix)) + end + MUMPS.factorize!(mumps) nl_solver_params.preconditioners[ir] = - (mpiqrstruct, precon_matrix, input_buffer, output_buffer) + (mumps, precon_matrix, input_buffer, output_buffer) end else # LU decomposition was previously created. The Jacobian always @@ -1253,8 +1251,10 @@ pressure \$p_{e∥}\$. # reuse some setup. try @timeit_debug global_timer "lu!" begin - orig_lu[:, localcols] .= precon_matrix[:, localcols] - qr!(orig_lu) + if block_rank[] == 0 + MUMPS.associate_matrix!(orig_lu, sparse(precon_matrix)) + end + MUMPS.factorize!(orig_lu) end catch e if !isa(e, ArgumentError) @@ -1262,9 +1262,15 @@ pressure \$p_{e∥}\$. 
end println("Sparsity pattern of matrix changed, rebuilding " * " LU from scratch") - mpiqrmatrix = MPIQR.MPIQRMatrix(precon_matrix[:, localcols], - size(precon_matrix); blocksize=mpiqr_blocksize, comm=comm_block[]) - @timeit_debug global_timer "lu" orig_lu = qr!(mpiqrmatrix) + @timeit_debug global_timer "lu" begin + if block_rank[] == 0 + MUMPS.associate_matrix!(mumps, sparse(precon_matrix)) + end + MUMPS.factorize!(mumps) + nl_solver_params.preconditioners[ir] = + (mumps, precon_matrix, input_buffer, output_buffer) + end + end nl_solver_params.preconditioners[ir] = (orig_lu, precon_matrix, input_buffer, output_buffer) @@ -1294,7 +1300,10 @@ pressure \$p_{e∥}\$. end this_output_buffer .= this_input_buffer - @timeit_debug global_timer "ldiv!" MPIQR.ldiv!(precon_lu, this_output_buffer) + if block_rank[] == 0 + MUMPS.associate_rhs!(precon_lu, this_output_buffer) + end + @timeit_debug global_timer "ldiv!" MUMPS.solve!(precon_lu) begin_serial_region() counter = 1 @@ -1955,18 +1964,19 @@ to allow the outer r-loop to be parallelised. z_advect, vpa_advect, scratch_dummy, external_source_settings, num_diss_params, t_params, ion_dt, ir, true, :all, true, false) + mumps = Mumps{mk_float}(mumps_unsymmetric, default_icntl, default_cntl64; + comm=MPI.API.MPI_Comm_f2c(comm_block[].val)) if block_rank[] >= 0 - localcols = MPIQR.localcols(block_rank[], size(precon_matrix, 2), - mpiqr_blocksize, block_size[]) - if size(orig_lu, 1) == 1 + if orig_lu.n == 0#size(orig_lu, 1) == 1 # Have not properly created the LU decomposition before, so # cannot reuse it. @timeit_debug global_timer "lu" begin - mpiqrmatrix = MPIQR.MPIQRMatrix((precon_matrix[:, localcols]), size(precon_matrix); - blocksize=mpiqr_blocksize, comm=comm_block[]) - mpiqrstruct = qr!(mpiqrmatrix) - nl_solver_params.preconditioners[ir] = - (mpiqrstruct, precon_matrix, input_buffer, output_buffer) + if block_rank[] == 0 + MUMPS.associate_matrix!(mumps, sparse(precon_matrix)) + end + MUMPS.factorize!(mumps) + nl_solver_params.preconditioners[ir] = + (mumps, precon_matrix, input_buffer, output_buffer) end else # LU decomposition was previously created. The Jacobian always @@ -1974,8 +1984,12 @@ to allow the outer r-loop to be parallelised. # reuse some setup. try @timeit_debug global_timer "lu!" begin - orig_lu[:, localcols] .= precon_matrix[:, localcols] - qr!(orig_lu) + if block_rank[] == 0 + MUMPS.associate_matrix!(orig_lu, sparse(precon_matrix)) + end + MUMPS.factorize!(orig_lu) + nl_solver_params.preconditioners[ir] = + (orig_lu, precon_matrix, input_buffer, output_buffer) end catch e if !isa(e, ArgumentError) @@ -1983,9 +1997,12 @@ to allow the outer r-loop to be parallelised. end println("Sparsity pattern of matrix changed, rebuilding " * " LU from scratch") - mpiqrmatrix = MPIQR.MPIQRMatrix(precon_matrix[:, localcols], size(precon_matrix); - blocksize=mpiqr_blocksize, comm=comm_block[]) - @timeit_debug global_timer "lu" orig_lu = qr!(mpiqrmatrix) + if block_rank[] == 0 + MUMPS.associate_matrix!(orig_lu, sparse(precon_matrix)) + end + @timeit_debug global_timer "lu" MUMPS.factorize!(orig_lu) + nl_solver_params.preconditioners[ir] = + (orig_lu, precon_matrix, input_buffer, output_buffer) end nl_solver_params.preconditioners[ir] = (orig_lu, precon_matrix, input_buffer, output_buffer) @@ -2028,8 +2045,11 @@ to allow the outer r-loop to be parallelised. counter += 1 end - this_output_buffer .= this_input_buffer - @timeit_debug global_timer "ldiv!" 
MPIQR.ldiv!(precon_lu, this_output_buffer) + this_output_buffer .= this_input_buffer + if block_rank[] == 0 + MUMPS.associate_rhs!(precon_lu, this_output_buffer) + end + @timeit_debug global_timer "ldiv!" MUMPS.solve!(precon_lu) begin_serial_region() counter = 1 diff --git a/moment_kinetics/src/nonlinear_solvers.jl b/moment_kinetics/src/nonlinear_solvers.jl index 568dba47f..f94d2385c 100644 --- a/moment_kinetics/src/nonlinear_solvers.jl +++ b/moment_kinetics/src/nonlinear_solvers.jl @@ -39,7 +39,7 @@ using ..type_definitions: mk_float, mk_int using LinearAlgebra using MPI -using MPIQR +using MUMPS using SparseArrays using StatsBase: mean @@ -167,25 +167,25 @@ function setup_nonlinear_solve(active, input_dict, coords, outer_coords=(); defa V .= 0.0 end end - mpiqr = MPIQR.MPIQRMatrix(zeros(Float64, 1, 1), (1, block_size[]); - blocksize=1, comm=comm_block[]) - mpiqrstruct = MPIQR.MPIQRStruct(mpiqr) + + mumps = Mumps{mk_float}(mumps_unsymmetric, default_icntl, default_cntl64; + comm=MPI.API.MPI_Comm_f2c(comm_block[].val)) if preconditioner_type === Val(:lu) # Create dummy LU solver objects so we can create an array for preconditioners. # These will be calculated properly within the time loop. - preconditioners = fill(mpiqrstruct, + preconditioners = fill(mumps, reverse(outer_coord_sizes)) elseif preconditioner_type === Val(:electron_split_lu) - preconditioners = (z=fill(mpiqrstruct, + preconditioners = (z=fill(mumps, tuple(coords.vpa.n, reverse(outer_coord_sizes)...)), vpa=fill(mpqirstruct, tuple(coords.z.n, reverse(outer_coord_sizes)...)), - ppar=fill(mpiqrstruct, + ppar=fill(mumps, reverse(outer_coord_sizes)), ) elseif preconditioner_type === Val(:electron_lu) pdf_plus_ppar_size = total_size_coords + coords.z.n - preconditioners = fill((mpiqrstruct, + preconditioners = fill((mumps, allocate_shared_float(pdf_plus_ppar_size, pdf_plus_ppar_size), allocate_shared_float(pdf_plus_ppar_size), allocate_shared_float(pdf_plus_ppar_size), From 08b44aee9d17950b1020f9046a6e0093461211f7 Mon Sep 17 00:00:00 2001 From: James Cook Date: Fri, 24 Jan 2025 11:40:43 +0000 Subject: [PATCH 18/18] add _block_synchnize() calls for safety - can remove late if needed --- moment_kinetics/src/electron_kinetic_equation.jl | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/moment_kinetics/src/electron_kinetic_equation.jl b/moment_kinetics/src/electron_kinetic_equation.jl index 33785aa2a..96d39e73e 100644 --- a/moment_kinetics/src/electron_kinetic_equation.jl +++ b/moment_kinetics/src/electron_kinetic_equation.jl @@ -1231,8 +1231,10 @@ pressure \$p_{e∥}\$. vperp_spectral, vpa_spectral, z_advect, vpa_advect, scratch_dummy, external_source_settings, num_diss_params, t_params, ion_dt, ir, evolve_ppar) - mumps = Mumps{mk_float}(mumps_unsymmetric, default_icntl, default_cntl64; + mumps = Mumps{mk_float}(mumps_unsymmetric, get_icntl(), default_cntl64; comm=MPI.API.MPI_Comm_f2c(comm_block[].val)) + begin_serial_region() + _block_synchronize() if block_rank[] >= 0 if orig_lu.n == 0#size(orig_lu, 1) == 1 # Have not properly created the LU decomposition before, so @@ -1299,11 +1301,15 @@ pressure \$p_{e∥}\$. counter += 1 end + begin_serial_region() this_output_buffer .= this_input_buffer if block_rank[] == 0 MUMPS.associate_rhs!(precon_lu, this_output_buffer) end + + _block_synchronize() @timeit_debug global_timer "ldiv!" MUMPS.solve!(precon_lu) + _block_synchronize() begin_serial_region() counter = 1 @@ -1964,8 +1970,10 @@ to allow the outer r-loop to be parallelised. 
z_advect, vpa_advect, scratch_dummy, external_source_settings, num_diss_params, t_params, ion_dt, ir, true, :all, true, false) - mumps = Mumps{mk_float}(mumps_unsymmetric, default_icntl, default_cntl64; + mumps = Mumps{mk_float}(mumps_unsymmetric, get_icntl(), default_cntl64; comm=MPI.API.MPI_Comm_f2c(comm_block[].val)) + begin_serial_region() + _block_synchronize() if block_rank[] >= 0 if orig_lu.n == 0#size(orig_lu, 1) == 1 # Have not properly created the LU decomposition before, so @@ -2045,11 +2053,14 @@ to allow the outer r-loop to be parallelised. counter += 1 end + begin_serial_region() this_output_buffer .= this_input_buffer if block_rank[] == 0 MUMPS.associate_rhs!(precon_lu, this_output_buffer) end + _block_synchronize() @timeit_debug global_timer "ldiv!" MUMPS.solve!(precon_lu) + _block_synchronize() begin_serial_region() counter = 1
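
Patches 17–18 above replace the distributed MPIQR factorisation with a MUMPS sparse direct solve while keeping the same associate-matrix / factorize / associate-rhs / solve call pattern inside the preconditioner. For reference, below is a minimal standalone sketch of that MUMPS.jl pattern, not code from moment_kinetics: the matrix `A` and right-hand side `b` are illustrative stand-ins, the default communicator is used instead of the `comm_block[]` handle passed in the patches, and `get_solution`/`finalize` (used here to read back the result) do not appear in the patches themselves.

```julia
# Minimal sketch (illustrative, not moment_kinetics code): factorise a sparse
# matrix once with MUMPS and reuse the factorisation for a solve, mirroring the
# "lu"/"lu!" and "ldiv!" steps in the patches above. Assumes MUMPS.jl with an
# MPI-enabled MUMPS build; runs on a single process as well as under mpirun.
using MPI, MUMPS, SparseArrays, LinearAlgebra

MPI.Init()

# Illustrative stand-in for the assembled preconditioner matrix and input buffer.
n = 100
A = sprand(n, n, 0.05) + 10.0I
b = rand(n)

# Same solver settings the patches pass when constructing the Mumps object
# (the patches additionally pass the block communicator via the `comm` keyword).
mumps = Mumps{Float64}(mumps_unsymmetric, default_icntl, default_cntl64)

associate_matrix!(mumps, A)      # analogous to the "lu"/"lu!" timed blocks
factorize!(mumps)

associate_rhs!(mumps, copy(b))   # analogous to filling this_output_buffer
solve!(mumps)                    # analogous to the "ldiv!" timed block
x = get_solution(mumps)

# Check the solve against the original system.
@show norm(A * x - b) / norm(b)

finalize(mumps)
MPI.Finalize()
```

As in the patches, the factorisation object can be kept and refilled with a new matrix of the same size before calling `factorize!` again, which is what the `orig_lu` reuse branch does in place of the previous `lu!(…)` call.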