From 87809e4cf8a840e390331fdbe7226a47595dee76 Mon Sep 17 00:00:00 2001 From: James Cook Date: Fri, 17 Jan 2025 12:53:43 +0000 Subject: [PATCH 01/18] this is running with my dev'd MPIQR with 3arg ldiv! --- moment_kinetics/Project.toml | 4 + .../src/electron_kinetic_equation.jl | 88 +++++++++++++------ moment_kinetics/src/nonlinear_solvers.jl | 17 ++-- moment_kinetics/src/time_advance.jl | 2 +- moment_kinetics/src/timer_utils.jl | 2 +- 5 files changed, 78 insertions(+), 35 deletions(-) diff --git a/moment_kinetics/Project.toml b/moment_kinetics/Project.toml index fe15a7856..9f8296ef4 100644 --- a/moment_kinetics/Project.toml +++ b/moment_kinetics/Project.toml @@ -19,12 +19,14 @@ LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" LsqFit = "2fda8390-95c7-5789-9bda-21331edee243" MPI = "da04e1cc-30fd-572f-bb4f-1f8673147195" MPIPreferences = "3da0fdf6-3ccc-4f1b-acd9-58baa6c99267" +MPIQR = "4acd9526-7714-4958-a7d5-1632fb49fc8a" Measures = "442fdcdd-2543-5da2-b0f3-8c86c306513e" OrderedCollections = "bac558e1-5e72-5ebc-8fee-abe8a469f55d" Pkg = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f" Preferences = "21216c6a-2e73-6563-6e65-726566657250" Primes = "27ebfcd6-29c5-5fa9-bf4b-fb8fc14df3ae" Printf = "de0858da-6303-5e67-8744-51eddeeeb8d7" +ProgressMeter = "92933f4c-e287-5a05-a399-4b506db050ca" Quadmath = "be4d8f0f-7fa4-5f49-b795-2f01399ab2dd" Revise = "295af30f-e4ad-537b-8983-00126c2a3abe" Roots = "f2b01f46-fcfa-551c-844a-d8ac1e96c665" @@ -50,6 +52,8 @@ file_io_netcdf = "NCDatasets" manufactured_solns_ext = ["Symbolics", "IfElse"] [compat] +MPIQR = "0.1.0" +ProgressMeter = "1.10.2" julia = "1.9.0" [extras] diff --git a/moment_kinetics/src/electron_kinetic_equation.jl b/moment_kinetics/src/electron_kinetic_equation.jl index 556490a6d..c179557f1 100644 --- a/moment_kinetics/src/electron_kinetic_equation.jl +++ b/moment_kinetics/src/electron_kinetic_equation.jl @@ -3,6 +3,8 @@ module electron_kinetic_equation using LinearAlgebra using MPI using SparseArrays +using MPIQR +using ProgressMeter export get_electron_critical_velocities @@ -70,7 +72,6 @@ using ..velocity_moments: integrate_over_vspace, calculate_electron_moment_deriv import ..runge_kutta const integral_correction_sharpness = 4.0 - """ update_electron_pdf is a function that uses the electron kinetic equation to solve for the updated electron pdf @@ -1217,7 +1218,7 @@ pressure \$p_{e∥}\$. end if nl_solver_params.solves_since_precon_update[] ≥ nl_solver_params.preconditioner_update_interval -global_rank[] == 0 && println("recalculating precon") +global_rank[] == 0 && println("recalculating precon 1") nl_solver_params.solves_since_precon_update[] = 0 nl_solver_params.precon_dt[] = t_params.dt[] @@ -1231,27 +1232,45 @@ global_rank[] == 0 && println("recalculating precon") external_source_settings, num_diss_params, t_params, ion_dt, ir, evolve_ppar) - begin_serial_region() - if block_rank[] == 0 - if size(orig_lu) == (1, 1) + #begin_serial_region() + qrblocksize = 4 + localcols = MPIQR.localcolumns(block_rank[], size(precon_matrix, 2), + qrblocksize, block_size[])#1:size(precon_matrix, 2)# + if block_rank[] >= 0 # == 0 + if size(orig_lu, 1) == 1 # Have not properly created the LU decomposition before, so # cannot reuse it. 
- @timeit_debug global_timer "lu" nl_solver_params.preconditioners[ir] = - (lu(sparse(precon_matrix)), precon_matrix, input_buffer, - output_buffer) + #@timeit_debug global_timer "lu" nl_solver_params.preconditioners[ir] = + # (lu(sparse(precon_matrix)), precon_matrix, input_buffer, + # output_buffer) + @timeit_debug global_timer "lu" begin + mpiqrmatrix = MPIQR.MPIQRMatrix((precon_matrix[:, localcols]), + size(precon_matrix); blocksize=qrblocksize) + progress = Progress(mpiqrmatrix, dt=iszero(block_rank[]) ? 1 : 2^31; + showspeed=true) + mpiqrstruct = qr!(mpiqrmatrix; progress=progress) + nl_solver_params.preconditioners[ir] = + (mpiqrstruct, precon_matrix, input_buffer, output_buffer) + end else # LU decomposition was previously created. The Jacobian always # has the same sparsity pattern, so by using `lu!()` we can # reuse some setup. try - @timeit_debug global_timer "lu!" lu!(orig_lu, sparse(precon_matrix); check=false) + #@timeit_debug global_timer "lu!" lu!(orig_lu, sparse(precon_matrix); check=false) + @timeit_debug global_timer "lu!" begin + orig_lu[:, localcols] = precon_matrix[:, localcols] + qr!(orig_lu) + end catch e if !isa(e, ArgumentError) rethrow(e) end println("Sparsity pattern of matrix changed, rebuilding " * " LU from scratch") - @timeit_debug global_timer "lu" orig_lu = lu(sparse(precon_matrix)) + mpiqrmatrix = MPIQR.MPIQRMatrix((precon_matrix[:, localcols]), + size(precon_matrix); blocksize=qrblocksize) + @timeit_debug global_timer "lu" orig_lu = qr!(mpiqrmatrix) end nl_solver_params.preconditioners[ir] = (orig_lu, precon_matrix, input_buffer, output_buffer) @@ -1280,10 +1299,10 @@ global_rank[] == 0 && println("recalculating precon") counter += 1 end - begin_serial_region() - @serial_region begin - @timeit_debug global_timer "ldiv!" ldiv!(this_output_buffer, precon_lu, this_input_buffer) - end + #begin_serial_region() + #@serial_region begin + @timeit_debug global_timer "ldiv!" MPIQR.ldiv!(this_output_buffer, precon_lu, this_input_buffer) + #end begin_serial_region() counter = 1 @@ -1347,7 +1366,7 @@ global_rank[] == 0 && println("recalculating precon") end if nl_solver_params.solves_since_precon_update[] ≥ nl_solver_params.preconditioner_update_interval -global_rank[] == 0 && println("recalculating precon") +global_rank[] == 0 && println("recalculating precon 2") nl_solver_params.solves_since_precon_update[] = 0 nl_solver_params.precon_dt[] = t_params.dt[] @@ -1931,7 +1950,7 @@ to allow the outer r-loop to be parallelised. function recalculate_preconditioner!() if nl_solver_params.preconditioner_type === Val(:electron_lu) -global_rank[] == 0 && println("recalculating precon") +global_rank[] == 0 && println("recalculating precon 3") nl_solver_params.solves_since_precon_update[] = 0 nl_solver_params.precon_dt[] = ion_dt @@ -1944,20 +1963,35 @@ global_rank[] == 0 && println("recalculating precon") z_advect, vpa_advect, scratch_dummy, external_source_settings, num_diss_params, t_params, ion_dt, ir, true, :all, true, false) - begin_serial_region() - if block_rank[] == 0 - if size(orig_lu) == (1, 1) + #begin_serial_region() + qrblocksize = 4 + localcols = MPIQR.localcols(block_rank[], size(precon_matrix, 2), + qrblocksize, block_size[]) + + if block_rank[] == 0 # >= 0 + if size(orig_lu, 1) == 1 # Have not properly created the LU decomposition before, so # cannot reuse it. 
- @timeit_debug global_timer "lu" nl_solver_params.preconditioners[ir] = - (lu(sparse(precon_matrix)), precon_matrix, input_buffer, - output_buffer) + #@timeit_debug global_timer "lu" nl_solver_params.preconditioners[ir] = + # (lu(sparse(precon_matrix)), precon_matrix, input_buffer, + # output_buffer) + @timeit_debug global_timer "lu" begin + mpiqrmatrix = MPIQR.MPIQRMatrix((precon_matrix[:, localcols]), size(precon_matrix); + blocksize=qrblocksize) + mpiqrstruct = qr!(mpiqrmatrix) + nl_solver_params.preconditioners[ir] = + (mpiqrstruct, precon_matrix, input_buffer, output_buffer) + end else # LU decomposition was previously created. The Jacobian always # has the same sparsity pattern, so by using `lu!()` we can # reuse some setup. try - @timeit_debug global_timer "lu!" lu!(orig_lu, sparse(precon_matrix); check=false) + #@timeit_debug global_timer "lu!" lu!(orig_lu, sparse(precon_matrix); check=false) + @timeit_debug global_timer "lu!" begin + orig_lu[:, localcols] = precon_matrix[:, localcols] + qr!(orig_lu) + end catch e if !isa(e, ArgumentError) rethrow(e) @@ -2007,10 +2041,10 @@ global_rank[] == 0 && println("recalculating precon") counter += 1 end - begin_serial_region() - @serial_region begin - @timeit_debug global_timer "ldiv!" ldiv!(this_output_buffer, precon_lu, this_input_buffer) - end + #begin_serial_region() + #@serial_region begin + @timeit_debug global_timer "ldiv!" MPIQR.ldiv!(this_output_buffer, precon_lu, this_input_buffer) + #end begin_serial_region() counter = 1 diff --git a/moment_kinetics/src/nonlinear_solvers.jl b/moment_kinetics/src/nonlinear_solvers.jl index 361547970..f13d5ddc4 100644 --- a/moment_kinetics/src/nonlinear_solvers.jl +++ b/moment_kinetics/src/nonlinear_solvers.jl @@ -41,6 +41,7 @@ using LinearAlgebra using MPI using SparseArrays using StatsBase: mean +using MPIQR struct nl_solver_info{TH,TV,Tcsg,Tlig,Tprecon,Tpretype} rtol::mk_float @@ -166,23 +167,25 @@ function setup_nonlinear_solve(active, input_dict, coords, outer_coords=(); defa V .= 0.0 end end - + mpiqr = MPIQR.MPIQRMatrix(zeros(Float64, 1, 1), (1, block_size[]); + blocksize=1, comm=comm_block[]) + mpiqrstruct = MPIQR.MPIQRStruct(mpiqr) if preconditioner_type === Val(:lu) # Create dummy LU solver objects so we can create an array for preconditioners. # These will be calculated properly within the time loop. - preconditioners = fill(lu(sparse(1.0*I, total_size_coords, total_size_coords)), + preconditioners = fill(mpiqrstruct, reverse(outer_coord_sizes)) elseif preconditioner_type === Val(:electron_split_lu) - preconditioners = (z=fill(lu(sparse(1.0*I, coords.z.n, coords.z.n)), + preconditioners = (z=fill(mpiqrstruct, tuple(coords.vpa.n, reverse(outer_coord_sizes)...)), - vpa=fill(lu(sparse(1.0*I, coords.vpa.n, coords.vpa.n)), + vpa=fill(mpqirstruct, tuple(coords.z.n, reverse(outer_coord_sizes)...)), - ppar=fill(lu(sparse(1.0*I, coords.z.n, coords.z.n)), + ppar=fill(mpiqrstruct, reverse(outer_coord_sizes)), ) elseif preconditioner_type === Val(:electron_lu) pdf_plus_ppar_size = total_size_coords + coords.z.n - preconditioners = fill((lu(sparse(1.0*I, 1, 1)), + preconditioners = fill((mpiqrstruct, allocate_shared_float(pdf_plus_ppar_size, pdf_plus_ppar_size), allocate_shared_float(pdf_plus_ppar_size), allocate_shared_float(pdf_plus_ppar_size), @@ -202,6 +205,7 @@ function setup_nonlinear_solve(active, input_dict, coords, outer_coords=(); defa # Plus one for the one point of ppar that is included in the 'v solve'. 
v_solve_n = nvperp * nvpa + 1 v_solve_implicit_lus = Vector{SparseArrays.UMFPACK.UmfpackLU{mk_float, mk_int}}(undef, v_solve_nsolve) + #v_solve_implicit_lus = Vector{Any}(undef, v_solve_nsolve) v_solve_explicit_matrices = Vector{SparseMatrixCSC{mk_float, mk_int}}(undef, v_solve_nsolve) # This buffer is not shared-memory, because it will be used for a serial LU solve. v_solve_buffer = allocate_float(v_solve_n) @@ -221,6 +225,7 @@ function setup_nonlinear_solve(active, input_dict, coords, outer_coords=(); defa end z_solve_n = nz z_solve_implicit_lus = Vector{SparseArrays.UMFPACK.UmfpackLU{mk_float, mk_int}}(undef, z_solve_nsolve) + #z_solve_implicit_lus = Vector{Any}(undef, z_solve_nsolve) z_solve_explicit_matrices = Vector{SparseMatrixCSC{mk_float, mk_int}}(undef, z_solve_nsolve) # This buffer is not shared-memory, because it will be used for a serial LU solve. z_solve_buffer = allocate_float(z_solve_n) diff --git a/moment_kinetics/src/time_advance.jl b/moment_kinetics/src/time_advance.jl index 777f9b4ec..129d4944d 100644 --- a/moment_kinetics/src/time_advance.jl +++ b/moment_kinetics/src/time_advance.jl @@ -493,7 +493,7 @@ function setup_time_info(t_input, n_variables, code_time, dt_reload, electron_preconditioner_type = Val(:electron_lu) else # Want to parallelise preconditioner, so use ADI method. - electron_preconditioner_type = Val(:electron_adi) + electron_preconditioner_type = Val(:electron_lu) # now actually MPIQR end else if t_input["implicit_electron_time_evolving"] ∈ keys(electron_precon_types) diff --git a/moment_kinetics/src/timer_utils.jl b/moment_kinetics/src/timer_utils.jl index 3ff8f02ff..307558612 100644 --- a/moment_kinetics/src/timer_utils.jl +++ b/moment_kinetics/src/timer_utils.jl @@ -24,7 +24,7 @@ To control the debug timers in `moment_kinetics` we define this function once, i To activate debug timers, edit this function so that it returns `true`. """ -timeit_debug_enabled() = false +timeit_debug_enabled() = true """ Global object used to collect timings of various parts of the code From 57588a5875ea632c1751030677e0bdf875b5d481 Mon Sep 17 00:00:00 2001 From: James Cook Date: Fri, 17 Jan 2025 13:44:09 +0000 Subject: [PATCH 02/18] block_rank[] >= 0 plus others --- .../src/electron_kinetic_equation.jl | 33 +++++++++++-------- 1 file changed, 19 insertions(+), 14 deletions(-) diff --git a/moment_kinetics/src/electron_kinetic_equation.jl b/moment_kinetics/src/electron_kinetic_equation.jl index c179557f1..070d3cf93 100644 --- a/moment_kinetics/src/electron_kinetic_equation.jl +++ b/moment_kinetics/src/electron_kinetic_equation.jl @@ -71,6 +71,8 @@ using ..velocity_moments: integrate_over_vspace, calculate_electron_moment_deriv # Only needed so we can reference it in a docstring import ..runge_kutta +const MPIQRBLOCKSIZE = 4 # 1 will be slow, 4 is about optimum, 8 may be faster or slower, 16 will be slower. + const integral_correction_sharpness = 4.0 """ update_electron_pdf is a function that uses the electron kinetic equation @@ -1233,10 +1235,10 @@ global_rank[] == 0 && println("recalculating precon 1") ir, evolve_ppar) #begin_serial_region() - qrblocksize = 4 - localcols = MPIQR.localcolumns(block_rank[], size(precon_matrix, 2), - qrblocksize, block_size[])#1:size(precon_matrix, 2)# + _block_synchronize() if block_rank[] >= 0 # == 0 + localcols = MPIQR.localcolumns(block_rank[], size(precon_matrix, 2), + MPIQRBLOCKSIZE, block_size[])#1:size(precon_matrix, 2)# if size(orig_lu, 1) == 1 # Have not properly created the LU decomposition before, so # cannot reuse it. 
@@ -1244,8 +1246,8 @@ global_rank[] == 0 && println("recalculating precon 1") # (lu(sparse(precon_matrix)), precon_matrix, input_buffer, # output_buffer) @timeit_debug global_timer "lu" begin - mpiqrmatrix = MPIQR.MPIQRMatrix((precon_matrix[:, localcols]), - size(precon_matrix); blocksize=qrblocksize) + mpiqrmatrix = MPIQR.MPIQRMatrix(precon_matrix[:, localcols], + size(precon_matrix); blocksize=MPIQRBLOCKSIZE) progress = Progress(mpiqrmatrix, dt=iszero(block_rank[]) ? 1 : 2^31; showspeed=true) mpiqrstruct = qr!(mpiqrmatrix; progress=progress) @@ -1268,8 +1270,8 @@ global_rank[] == 0 && println("recalculating precon 1") end println("Sparsity pattern of matrix changed, rebuilding " * " LU from scratch") - mpiqrmatrix = MPIQR.MPIQRMatrix((precon_matrix[:, localcols]), - size(precon_matrix); blocksize=qrblocksize) + mpiqrmatrix = MPIQR.MPIQRMatrix(precon_matrix[:, localcols], + size(precon_matrix); blocksize=MPIQRBLOCKSIZE) @timeit_debug global_timer "lu" orig_lu = qr!(mpiqrmatrix) end nl_solver_params.preconditioners[ir] = @@ -1279,6 +1281,7 @@ global_rank[] == 0 && println("recalculating precon 1") nl_solver_params.preconditioners[ir] = (orig_lu, precon_matrix, input_buffer, output_buffer) end + _block_synchronize() end @@ -1964,11 +1967,10 @@ global_rank[] == 0 && println("recalculating precon 3") num_diss_params, t_params, ion_dt, ir, true, :all, true, false) #begin_serial_region() - qrblocksize = 4 - localcols = MPIQR.localcols(block_rank[], size(precon_matrix, 2), - qrblocksize, block_size[]) - - if block_rank[] == 0 # >= 0 + _block_synchronize() + if block_rank[] >= 0 # >= 0 + localcols = MPIQR.localcols(block_rank[], size(precon_matrix, 2), + MPIQRBLOCKSIZE, block_size[]) if size(orig_lu, 1) == 1 # Have not properly created the LU decomposition before, so # cannot reuse it. 
@@ -1977,7 +1979,7 @@ global_rank[] == 0 && println("recalculating precon 3") # output_buffer) @timeit_debug global_timer "lu" begin mpiqrmatrix = MPIQR.MPIQRMatrix((precon_matrix[:, localcols]), size(precon_matrix); - blocksize=qrblocksize) + blocksize=MPIQRBLOCKSIZE) mpiqrstruct = qr!(mpiqrmatrix) nl_solver_params.preconditioners[ir] = (mpiqrstruct, precon_matrix, input_buffer, output_buffer) @@ -1998,7 +2000,9 @@ global_rank[] == 0 && println("recalculating precon 3") end println("Sparsity pattern of matrix changed, rebuilding " * " LU from scratch") - @timeit_debug global_timer "lu" orig_lu = lu(sparse(precon_matrix)) + mpiqrmatrix = MPIQR.MPIQRMatrix(precon_matrix[:, localcols], + size(precon_matrix); blocksize=MPIQRBLOCKSIZE) + @timeit_debug global_timer "lu" orig_lu = qr!(mpiqrmatrix) end nl_solver_params.preconditioners[ir] = (orig_lu, precon_matrix, input_buffer, output_buffer) @@ -2007,6 +2011,7 @@ global_rank[] == 0 && println("recalculating precon 3") nl_solver_params.preconditioners[ir] = (orig_lu, precon_matrix, input_buffer, output_buffer) end + _block_synchronize() return nothing end From a839dc84dbcf374df9f7ed698bdd303c6304db5f Mon Sep 17 00:00:00 2001 From: James Cook Date: Mon, 20 Jan 2025 17:21:22 +0000 Subject: [PATCH 03/18] this works with latest MPIQR.jl --- moment_kinetics/Project.toml | 4 +- .../src/electron_kinetic_equation.jl | 46 ++++++------------- 2 files changed, 14 insertions(+), 36 deletions(-) diff --git a/moment_kinetics/Project.toml b/moment_kinetics/Project.toml index 9f8296ef4..e8a430cc7 100644 --- a/moment_kinetics/Project.toml +++ b/moment_kinetics/Project.toml @@ -26,7 +26,6 @@ Pkg = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f" Preferences = "21216c6a-2e73-6563-6e65-726566657250" Primes = "27ebfcd6-29c5-5fa9-bf4b-fb8fc14df3ae" Printf = "de0858da-6303-5e67-8744-51eddeeeb8d7" -ProgressMeter = "92933f4c-e287-5a05-a399-4b506db050ca" Quadmath = "be4d8f0f-7fa4-5f49-b795-2f01399ab2dd" Revise = "295af30f-e4ad-537b-8983-00126c2a3abe" Roots = "f2b01f46-fcfa-551c-844a-d8ac1e96c665" @@ -53,7 +52,6 @@ manufactured_solns_ext = ["Symbolics", "IfElse"] [compat] MPIQR = "0.1.0" -ProgressMeter = "1.10.2" julia = "1.9.0" [extras] @@ -61,4 +59,4 @@ Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" [targets] -test = ["Random", "Test", "NCDatasets"] +test = ["Random", "Test"] diff --git a/moment_kinetics/src/electron_kinetic_equation.jl b/moment_kinetics/src/electron_kinetic_equation.jl index 070d3cf93..750b4e904 100644 --- a/moment_kinetics/src/electron_kinetic_equation.jl +++ b/moment_kinetics/src/electron_kinetic_equation.jl @@ -4,7 +4,6 @@ using LinearAlgebra using MPI using SparseArrays using MPIQR -using ProgressMeter export get_electron_critical_velocities @@ -1220,7 +1219,6 @@ pressure \$p_{e∥}\$. end if nl_solver_params.solves_since_precon_update[] ≥ nl_solver_params.preconditioner_update_interval -global_rank[] == 0 && println("recalculating precon 1") nl_solver_params.solves_since_precon_update[] = 0 nl_solver_params.precon_dt[] = t_params.dt[] @@ -1234,7 +1232,6 @@ global_rank[] == 0 && println("recalculating precon 1") external_source_settings, num_diss_params, t_params, ion_dt, ir, evolve_ppar) - #begin_serial_region() _block_synchronize() if block_rank[] >= 0 # == 0 localcols = MPIQR.localcolumns(block_rank[], size(precon_matrix, 2), @@ -1242,26 +1239,20 @@ global_rank[] == 0 && println("recalculating precon 1") if size(orig_lu, 1) == 1 # Have not properly created the LU decomposition before, so # cannot reuse it. 
- #@timeit_debug global_timer "lu" nl_solver_params.preconditioners[ir] = - # (lu(sparse(precon_matrix)), precon_matrix, input_buffer, - # output_buffer) @timeit_debug global_timer "lu" begin mpiqrmatrix = MPIQR.MPIQRMatrix(precon_matrix[:, localcols], - size(precon_matrix); blocksize=MPIQRBLOCKSIZE) - progress = Progress(mpiqrmatrix, dt=iszero(block_rank[]) ? 1 : 2^31; - showspeed=true) - mpiqrstruct = qr!(mpiqrmatrix; progress=progress) + size(precon_matrix); blocksize=MPIQRBLOCKSIZE, comm=comm_block[]) + mpiqrstruct = qr!(mpiqrmatrix) nl_solver_params.preconditioners[ir] = - (mpiqrstruct, precon_matrix, input_buffer, output_buffer) + (mpiqrstruct, precon_matrix, input_buffer, output_buffer) end else # LU decomposition was previously created. The Jacobian always # has the same sparsity pattern, so by using `lu!()` we can # reuse some setup. try - #@timeit_debug global_timer "lu!" lu!(orig_lu, sparse(precon_matrix); check=false) @timeit_debug global_timer "lu!" begin - orig_lu[:, localcols] = precon_matrix[:, localcols] + orig_lu[:, localcols] .= precon_matrix[:, localcols] qr!(orig_lu) end catch e @@ -1271,7 +1262,7 @@ global_rank[] == 0 && println("recalculating precon 1") println("Sparsity pattern of matrix changed, rebuilding " * " LU from scratch") mpiqrmatrix = MPIQR.MPIQRMatrix(precon_matrix[:, localcols], - size(precon_matrix); blocksize=MPIQRBLOCKSIZE) + size(precon_matrix); blocksize=MPIQRBLOCKSIZE, comm=comm_block[]) @timeit_debug global_timer "lu" orig_lu = qr!(mpiqrmatrix) end nl_solver_params.preconditioners[ir] = @@ -1302,10 +1293,8 @@ global_rank[] == 0 && println("recalculating precon 1") counter += 1 end - #begin_serial_region() - #@serial_region begin - @timeit_debug global_timer "ldiv!" MPIQR.ldiv!(this_output_buffer, precon_lu, this_input_buffer) - #end + _block_synchronize() + @timeit_debug global_timer "ldiv!" MPIQR.ldiv!(this_output_buffer, precon_lu, this_input_buffer) begin_serial_region() counter = 1 @@ -1369,7 +1358,6 @@ global_rank[] == 0 && println("recalculating precon 1") end if nl_solver_params.solves_since_precon_update[] ≥ nl_solver_params.preconditioner_update_interval -global_rank[] == 0 && println("recalculating precon 2") nl_solver_params.solves_since_precon_update[] = 0 nl_solver_params.precon_dt[] = t_params.dt[] @@ -1953,7 +1941,6 @@ to allow the outer r-loop to be parallelised. function recalculate_preconditioner!() if nl_solver_params.preconditioner_type === Val(:electron_lu) -global_rank[] == 0 && println("recalculating precon 3") nl_solver_params.solves_since_precon_update[] = 0 nl_solver_params.precon_dt[] = ion_dt @@ -1966,7 +1953,6 @@ global_rank[] == 0 && println("recalculating precon 3") z_advect, vpa_advect, scratch_dummy, external_source_settings, num_diss_params, t_params, ion_dt, ir, true, :all, true, false) - #begin_serial_region() _block_synchronize() if block_rank[] >= 0 # >= 0 localcols = MPIQR.localcols(block_rank[], size(precon_matrix, 2), @@ -1974,12 +1960,9 @@ global_rank[] == 0 && println("recalculating precon 3") if size(orig_lu, 1) == 1 # Have not properly created the LU decomposition before, so # cannot reuse it. 
- #@timeit_debug global_timer "lu" nl_solver_params.preconditioners[ir] = - # (lu(sparse(precon_matrix)), precon_matrix, input_buffer, - # output_buffer) @timeit_debug global_timer "lu" begin mpiqrmatrix = MPIQR.MPIQRMatrix((precon_matrix[:, localcols]), size(precon_matrix); - blocksize=MPIQRBLOCKSIZE) + blocksize=MPIQRBLOCKSIZE, comm=comm_block[]) mpiqrstruct = qr!(mpiqrmatrix) nl_solver_params.preconditioners[ir] = (mpiqrstruct, precon_matrix, input_buffer, output_buffer) @@ -1989,9 +1972,8 @@ global_rank[] == 0 && println("recalculating precon 3") # has the same sparsity pattern, so by using `lu!()` we can # reuse some setup. try - #@timeit_debug global_timer "lu!" lu!(orig_lu, sparse(precon_matrix); check=false) @timeit_debug global_timer "lu!" begin - orig_lu[:, localcols] = precon_matrix[:, localcols] + orig_lu[:, localcols] .= precon_matrix[:, localcols] qr!(orig_lu) end catch e @@ -2000,8 +1982,8 @@ global_rank[] == 0 && println("recalculating precon 3") end println("Sparsity pattern of matrix changed, rebuilding " * " LU from scratch") - mpiqrmatrix = MPIQR.MPIQRMatrix(precon_matrix[:, localcols], - size(precon_matrix); blocksize=MPIQRBLOCKSIZE) + mpiqrmatrix = MPIQR.MPIQRMatrix(precon_matrix[:, localcols], size(precon_matrix); + blocksize=MPIQRBLOCKSIZE, comm=comm_block[]) @timeit_debug global_timer "lu" orig_lu = qr!(mpiqrmatrix) end nl_solver_params.preconditioners[ir] = @@ -2046,10 +2028,8 @@ global_rank[] == 0 && println("recalculating precon 3") counter += 1 end - #begin_serial_region() - #@serial_region begin - @timeit_debug global_timer "ldiv!" MPIQR.ldiv!(this_output_buffer, precon_lu, this_input_buffer) - #end + _block_synchronize() + @timeit_debug global_timer "ldiv!" MPIQR.ldiv!(this_output_buffer, precon_lu, this_input_buffer) begin_serial_region() counter = 1 From 7b80960c4b5ebcd0fb9a88abc37f560864449870 Mon Sep 17 00:00:00 2001 From: James Cook Date: Tue, 21 Jan 2025 09:43:25 +0000 Subject: [PATCH 04/18] put println statements back in --- moment_kinetics/src/electron_kinetic_equation.jl | 3 +++ 1 file changed, 3 insertions(+) diff --git a/moment_kinetics/src/electron_kinetic_equation.jl b/moment_kinetics/src/electron_kinetic_equation.jl index 750b4e904..5adb40ceb 100644 --- a/moment_kinetics/src/electron_kinetic_equation.jl +++ b/moment_kinetics/src/electron_kinetic_equation.jl @@ -1219,6 +1219,7 @@ pressure \$p_{e∥}\$. end if nl_solver_params.solves_since_precon_update[] ≥ nl_solver_params.preconditioner_update_interval + global_rank[] == 0 && println("recalculating precon :electron_lu 1") nl_solver_params.solves_since_precon_update[] = 0 nl_solver_params.precon_dt[] = t_params.dt[] @@ -1349,6 +1350,7 @@ pressure \$p_{e∥}\$. left_preconditioner = identity right_preconditioner = lu_precon! elseif nl_solver_params.preconditioner_type === Val(:electron_adi) + global_rank[] == 0 && println("recalculating precon :electron_adi") if t_params.dt[] > 1.5 * nl_solver_params.precon_dt[] || t_params.dt[] < 2.0/3.0 * nl_solver_params.precon_dt[] @@ -1941,6 +1943,7 @@ to allow the outer r-loop to be parallelised. 
function recalculate_preconditioner!() if nl_solver_params.preconditioner_type === Val(:electron_lu) + global_rank[] == 0 && println("recalculating precon :electron_lu 2") nl_solver_params.solves_since_precon_update[] = 0 nl_solver_params.precon_dt[] = ion_dt From bc7f5f4e4ed781ec72b63145d23f4fb43512e9f1 Mon Sep 17 00:00:00 2001 From: James Cook Date: Tue, 21 Jan 2025 09:44:28 +0000 Subject: [PATCH 05/18] delete dead line --- moment_kinetics/src/nonlinear_solvers.jl | 1 - 1 file changed, 1 deletion(-) diff --git a/moment_kinetics/src/nonlinear_solvers.jl b/moment_kinetics/src/nonlinear_solvers.jl index f13d5ddc4..ef7828005 100644 --- a/moment_kinetics/src/nonlinear_solvers.jl +++ b/moment_kinetics/src/nonlinear_solvers.jl @@ -205,7 +205,6 @@ function setup_nonlinear_solve(active, input_dict, coords, outer_coords=(); defa # Plus one for the one point of ppar that is included in the 'v solve'. v_solve_n = nvperp * nvpa + 1 v_solve_implicit_lus = Vector{SparseArrays.UMFPACK.UmfpackLU{mk_float, mk_int}}(undef, v_solve_nsolve) - #v_solve_implicit_lus = Vector{Any}(undef, v_solve_nsolve) v_solve_explicit_matrices = Vector{SparseMatrixCSC{mk_float, mk_int}}(undef, v_solve_nsolve) # This buffer is not shared-memory, because it will be used for a serial LU solve. v_solve_buffer = allocate_float(v_solve_n) From f7468a628b3663cccb73aa478532df8cbe8a72a5 Mon Sep 17 00:00:00 2001 From: James Cook Date: Tue, 21 Jan 2025 09:45:17 +0000 Subject: [PATCH 06/18] timeit_debug_enabled set back to false --- moment_kinetics/src/timer_utils.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/moment_kinetics/src/timer_utils.jl b/moment_kinetics/src/timer_utils.jl index 307558612..3ff8f02ff 100644 --- a/moment_kinetics/src/timer_utils.jl +++ b/moment_kinetics/src/timer_utils.jl @@ -24,7 +24,7 @@ To control the debug timers in `moment_kinetics` we define this function once, i To activate debug timers, edit this function so that it returns `true`. """ -timeit_debug_enabled() = true +timeit_debug_enabled() = false """ Global object used to collect timings of various parts of the code From 43f2414e81a5a3a0bd5ad708551fddcd7b483e9c Mon Sep 17 00:00:00 2001 From: James Cook Date: Tue, 21 Jan 2025 09:45:44 +0000 Subject: [PATCH 07/18] delete dead line --- moment_kinetics/src/nonlinear_solvers.jl | 1 - 1 file changed, 1 deletion(-) diff --git a/moment_kinetics/src/nonlinear_solvers.jl b/moment_kinetics/src/nonlinear_solvers.jl index ef7828005..2f786431e 100644 --- a/moment_kinetics/src/nonlinear_solvers.jl +++ b/moment_kinetics/src/nonlinear_solvers.jl @@ -224,7 +224,6 @@ function setup_nonlinear_solve(active, input_dict, coords, outer_coords=(); defa end z_solve_n = nz z_solve_implicit_lus = Vector{SparseArrays.UMFPACK.UmfpackLU{mk_float, mk_int}}(undef, z_solve_nsolve) - #z_solve_implicit_lus = Vector{Any}(undef, z_solve_nsolve) z_solve_explicit_matrices = Vector{SparseMatrixCSC{mk_float, mk_int}}(undef, z_solve_nsolve) # This buffer is not shared-memory, because it will be used for a serial LU solve. 
z_solve_buffer = allocate_float(z_solve_n) From 676a9be0f5c58e2ce40622c8e749cff3bf34890e Mon Sep 17 00:00:00 2001 From: James Cook Date: Tue, 21 Jan 2025 09:47:04 +0000 Subject: [PATCH 08/18] time_advance was forced to use MPIQR rather than ADI - revert this to how it was before --- moment_kinetics/src/time_advance.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/moment_kinetics/src/time_advance.jl b/moment_kinetics/src/time_advance.jl index 129d4944d..c1d4d8f53 100644 --- a/moment_kinetics/src/time_advance.jl +++ b/moment_kinetics/src/time_advance.jl @@ -493,7 +493,7 @@ function setup_time_info(t_input, n_variables, code_time, dt_reload, electron_preconditioner_type = Val(:electron_lu) else # Want to parallelise preconditioner, so use ADI method. - electron_preconditioner_type = Val(:electron_lu) # now actually MPIQR + electron_preconditioner_type = Val(:electron_adi) # now actually MPIQR end else if t_input["implicit_electron_time_evolving"] ∈ keys(electron_precon_types) From 35ba4b11b6ed1d6c61121d8d77e9995243fdaaa4 Mon Sep 17 00:00:00 2001 From: James Cook Date: Tue, 21 Jan 2025 09:49:47 +0000 Subject: [PATCH 09/18] tidy up --- moment_kinetics/src/electron_kinetic_equation.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/moment_kinetics/src/electron_kinetic_equation.jl b/moment_kinetics/src/electron_kinetic_equation.jl index 5adb40ceb..01f25b781 100644 --- a/moment_kinetics/src/electron_kinetic_equation.jl +++ b/moment_kinetics/src/electron_kinetic_equation.jl @@ -1236,7 +1236,7 @@ pressure \$p_{e∥}\$. _block_synchronize() if block_rank[] >= 0 # == 0 localcols = MPIQR.localcolumns(block_rank[], size(precon_matrix, 2), - MPIQRBLOCKSIZE, block_size[])#1:size(precon_matrix, 2)# + MPIQRBLOCKSIZE, block_size[]) if size(orig_lu, 1) == 1 # Have not properly created the LU decomposition before, so # cannot reuse it. From 933d749599eccf6870b79a06f25879e9f9e143b2 Mon Sep 17 00:00:00 2001 From: James Cook Date: Tue, 21 Jan 2025 13:53:32 +0000 Subject: [PATCH 10/18] increase MPIQRBLOCKSIZE to 8 --- moment_kinetics/src/electron_kinetic_equation.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/moment_kinetics/src/electron_kinetic_equation.jl b/moment_kinetics/src/electron_kinetic_equation.jl index 01f25b781..81580fcf8 100644 --- a/moment_kinetics/src/electron_kinetic_equation.jl +++ b/moment_kinetics/src/electron_kinetic_equation.jl @@ -70,7 +70,7 @@ using ..velocity_moments: integrate_over_vspace, calculate_electron_moment_deriv # Only needed so we can reference it in a docstring import ..runge_kutta -const MPIQRBLOCKSIZE = 4 # 1 will be slow, 4 is about optimum, 8 may be faster or slower, 16 will be slower. +const MPIQRBLOCKSIZE = 8 # 1 will be slow, 4 is good for smaller matrices, 8 is good for larger matrices, 16 - who knows?! const integral_correction_sharpness = 4.0 """ From 52f939e8bd28b56a1fad2c637ab05deb5a6429cd Mon Sep 17 00:00:00 2001 From: James Cook Date: Tue, 21 Jan 2025 14:54:31 +0000 Subject: [PATCH 11/18] delete obsolete comment --- moment_kinetics/src/time_advance.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/moment_kinetics/src/time_advance.jl b/moment_kinetics/src/time_advance.jl index c1d4d8f53..777f9b4ec 100644 --- a/moment_kinetics/src/time_advance.jl +++ b/moment_kinetics/src/time_advance.jl @@ -493,7 +493,7 @@ function setup_time_info(t_input, n_variables, code_time, dt_reload, electron_preconditioner_type = Val(:electron_lu) else # Want to parallelise preconditioner, so use ADI method. 
- electron_preconditioner_type = Val(:electron_adi) # now actually MPIQR + electron_preconditioner_type = Val(:electron_adi) end else if t_input["implicit_electron_time_evolving"] ∈ keys(electron_precon_types) From 2ed2b17d8324ce0d2e8f7d26f91715bf44c097ea Mon Sep 17 00:00:00 2001 From: James Cook Date: Tue, 21 Jan 2025 14:55:25 +0000 Subject: [PATCH 12/18] make using statements alphabetical --- moment_kinetics/src/nonlinear_solvers.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/moment_kinetics/src/nonlinear_solvers.jl b/moment_kinetics/src/nonlinear_solvers.jl index 2f786431e..568dba47f 100644 --- a/moment_kinetics/src/nonlinear_solvers.jl +++ b/moment_kinetics/src/nonlinear_solvers.jl @@ -39,9 +39,9 @@ using ..type_definitions: mk_float, mk_int using LinearAlgebra using MPI +using MPIQR using SparseArrays using StatsBase: mean -using MPIQR struct nl_solver_info{TH,TV,Tcsg,Tlig,Tprecon,Tpretype} rtol::mk_float From 2f932ee65db53ae05af71a0f16b9c229c91eefea Mon Sep 17 00:00:00 2001 From: James Cook Date: Tue, 21 Jan 2025 14:56:02 +0000 Subject: [PATCH 13/18] make using statements alphabetical --- moment_kinetics/src/electron_kinetic_equation.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/moment_kinetics/src/electron_kinetic_equation.jl b/moment_kinetics/src/electron_kinetic_equation.jl index 81580fcf8..016e9a997 100644 --- a/moment_kinetics/src/electron_kinetic_equation.jl +++ b/moment_kinetics/src/electron_kinetic_equation.jl @@ -2,8 +2,8 @@ module electron_kinetic_equation using LinearAlgebra using MPI -using SparseArrays using MPIQR +using SparseArrays export get_electron_critical_velocities From 733e9196fa7ede5b17b0bf412af1bd8b2d188a22 Mon Sep 17 00:00:00 2001 From: James Cook Date: Tue, 21 Jan 2025 14:58:59 +0000 Subject: [PATCH 14/18] make all caps const MPIQRBLOCKSIZE mpiqr_blocksize as inkeeping with style --- .../src/electron_kinetic_equation.jl | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/moment_kinetics/src/electron_kinetic_equation.jl b/moment_kinetics/src/electron_kinetic_equation.jl index 016e9a997..4f1adfe15 100644 --- a/moment_kinetics/src/electron_kinetic_equation.jl +++ b/moment_kinetics/src/electron_kinetic_equation.jl @@ -70,9 +70,10 @@ using ..velocity_moments: integrate_over_vspace, calculate_electron_moment_deriv # Only needed so we can reference it in a docstring import ..runge_kutta -const MPIQRBLOCKSIZE = 8 # 1 will be slow, 4 is good for smaller matrices, 8 is good for larger matrices, 16 - who knows?! - const integral_correction_sharpness = 4.0 + +const mpiqr_blocksize = 8 # 1 will be slow, 4 is good for smaller matrices, 8 is good for larger matrices, 16 - who knows?! + """ update_electron_pdf is a function that uses the electron kinetic equation to solve for the updated electron pdf @@ -1236,13 +1237,13 @@ pressure \$p_{e∥}\$. _block_synchronize() if block_rank[] >= 0 # == 0 localcols = MPIQR.localcolumns(block_rank[], size(precon_matrix, 2), - MPIQRBLOCKSIZE, block_size[]) + mpiqr_blocksize, block_size[]) if size(orig_lu, 1) == 1 # Have not properly created the LU decomposition before, so # cannot reuse it. 
@timeit_debug global_timer "lu" begin mpiqrmatrix = MPIQR.MPIQRMatrix(precon_matrix[:, localcols], - size(precon_matrix); blocksize=MPIQRBLOCKSIZE, comm=comm_block[]) + size(precon_matrix); blocksize=mpiqr_blocksize, comm=comm_block[]) mpiqrstruct = qr!(mpiqrmatrix) nl_solver_params.preconditioners[ir] = (mpiqrstruct, precon_matrix, input_buffer, output_buffer) @@ -1263,7 +1264,7 @@ pressure \$p_{e∥}\$. println("Sparsity pattern of matrix changed, rebuilding " * " LU from scratch") mpiqrmatrix = MPIQR.MPIQRMatrix(precon_matrix[:, localcols], - size(precon_matrix); blocksize=MPIQRBLOCKSIZE, comm=comm_block[]) + size(precon_matrix); blocksize=mpiqr_blocksize, comm=comm_block[]) @timeit_debug global_timer "lu" orig_lu = qr!(mpiqrmatrix) end nl_solver_params.preconditioners[ir] = @@ -1959,13 +1960,13 @@ to allow the outer r-loop to be parallelised. _block_synchronize() if block_rank[] >= 0 # >= 0 localcols = MPIQR.localcols(block_rank[], size(precon_matrix, 2), - MPIQRBLOCKSIZE, block_size[]) + mpiqr_blocksize, block_size[]) if size(orig_lu, 1) == 1 # Have not properly created the LU decomposition before, so # cannot reuse it. @timeit_debug global_timer "lu" begin mpiqrmatrix = MPIQR.MPIQRMatrix((precon_matrix[:, localcols]), size(precon_matrix); - blocksize=MPIQRBLOCKSIZE, comm=comm_block[]) + blocksize=mpiqr_blocksize, comm=comm_block[]) mpiqrstruct = qr!(mpiqrmatrix) nl_solver_params.preconditioners[ir] = (mpiqrstruct, precon_matrix, input_buffer, output_buffer) @@ -1986,7 +1987,7 @@ to allow the outer r-loop to be parallelised. println("Sparsity pattern of matrix changed, rebuilding " * " LU from scratch") mpiqrmatrix = MPIQR.MPIQRMatrix(precon_matrix[:, localcols], size(precon_matrix); - blocksize=MPIQRBLOCKSIZE, comm=comm_block[]) + blocksize=mpiqr_blocksize, comm=comm_block[]) @timeit_debug global_timer "lu" orig_lu = qr!(mpiqrmatrix) end nl_solver_params.preconditioners[ir] = From 207ba22080f1c0718c5ca4ce93a4344438033614 Mon Sep 17 00:00:00 2001 From: James Cook Date: Tue, 21 Jan 2025 16:09:08 +0000 Subject: [PATCH 15/18] remove unnecessary defensive _block_synchonize() calls --- moment_kinetics/src/electron_kinetic_equation.jl | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/moment_kinetics/src/electron_kinetic_equation.jl b/moment_kinetics/src/electron_kinetic_equation.jl index 4f1adfe15..2f49b3574 100644 --- a/moment_kinetics/src/electron_kinetic_equation.jl +++ b/moment_kinetics/src/electron_kinetic_equation.jl @@ -1234,8 +1234,7 @@ pressure \$p_{e∥}\$. external_source_settings, num_diss_params, t_params, ion_dt, ir, evolve_ppar) - _block_synchronize() - if block_rank[] >= 0 # == 0 + if block_rank[] >= 0 localcols = MPIQR.localcolumns(block_rank[], size(precon_matrix, 2), mpiqr_blocksize, block_size[]) if size(orig_lu, 1) == 1 @@ -1274,7 +1273,6 @@ pressure \$p_{e∥}\$. nl_solver_params.preconditioners[ir] = (orig_lu, precon_matrix, input_buffer, output_buffer) end - _block_synchronize() end @@ -1295,7 +1293,6 @@ pressure \$p_{e∥}\$. counter += 1 end - _block_synchronize() @timeit_debug global_timer "ldiv!" MPIQR.ldiv!(this_output_buffer, precon_lu, this_input_buffer) begin_serial_region() @@ -1957,8 +1954,7 @@ to allow the outer r-loop to be parallelised. 
z_advect, vpa_advect, scratch_dummy, external_source_settings, num_diss_params, t_params, ion_dt, ir, true, :all, true, false) - _block_synchronize() - if block_rank[] >= 0 # >= 0 + if block_rank[] >= 0 localcols = MPIQR.localcols(block_rank[], size(precon_matrix, 2), mpiqr_blocksize, block_size[]) if size(orig_lu, 1) == 1 @@ -1997,7 +1993,6 @@ to allow the outer r-loop to be parallelised. nl_solver_params.preconditioners[ir] = (orig_lu, precon_matrix, input_buffer, output_buffer) end - _block_synchronize() return nothing end @@ -2032,7 +2027,6 @@ to allow the outer r-loop to be parallelised. counter += 1 end - _block_synchronize() @timeit_debug global_timer "ldiv!" MPIQR.ldiv!(this_output_buffer, precon_lu, this_input_buffer) begin_serial_region() From 792de1d22cfb809f3f3edbab40b89527828ad606 Mon Sep 17 00:00:00 2001 From: James Cook Date: Wed, 22 Jan 2025 11:45:22 +0000 Subject: [PATCH 16/18] use in-place 2-arg ldiv! --- moment_kinetics/src/electron_kinetic_equation.jl | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/moment_kinetics/src/electron_kinetic_equation.jl b/moment_kinetics/src/electron_kinetic_equation.jl index 2f49b3574..d33ad29e6 100644 --- a/moment_kinetics/src/electron_kinetic_equation.jl +++ b/moment_kinetics/src/electron_kinetic_equation.jl @@ -1293,7 +1293,8 @@ pressure \$p_{e∥}\$. counter += 1 end - @timeit_debug global_timer "ldiv!" MPIQR.ldiv!(this_output_buffer, precon_lu, this_input_buffer) + this_output_buffer .= this_input_buffer + @timeit_debug global_timer "ldiv!" MPIQR.ldiv!(precon_lu, this_output_buffer) begin_serial_region() counter = 1 @@ -2027,7 +2028,8 @@ to allow the outer r-loop to be parallelised. counter += 1 end - @timeit_debug global_timer "ldiv!" MPIQR.ldiv!(this_output_buffer, precon_lu, this_input_buffer) + this_output_buffer .= this_input_buffer + @timeit_debug global_timer "ldiv!" 
MPIQR.ldiv!(precon_lu, this_output_buffer) begin_serial_region() counter = 1 From 5f2a96b88b51287d0c91e7f53a661c78428fa82f Mon Sep 17 00:00:00 2001 From: James Cook Date: Thu, 23 Jan 2025 14:07:31 +0000 Subject: [PATCH 17/18] runs fast - not checked if correct though --- moment_kinetics/Project.toml | 4 +- .../src/electron_kinetic_equation.jl | 84 ++++++++++++------- moment_kinetics/src/nonlinear_solvers.jl | 16 ++-- 3 files changed, 62 insertions(+), 42 deletions(-) diff --git a/moment_kinetics/Project.toml b/moment_kinetics/Project.toml index e8a430cc7..edcd6deba 100644 --- a/moment_kinetics/Project.toml +++ b/moment_kinetics/Project.toml @@ -19,7 +19,7 @@ LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" LsqFit = "2fda8390-95c7-5789-9bda-21331edee243" MPI = "da04e1cc-30fd-572f-bb4f-1f8673147195" MPIPreferences = "3da0fdf6-3ccc-4f1b-acd9-58baa6c99267" -MPIQR = "4acd9526-7714-4958-a7d5-1632fb49fc8a" +MUMPS = "55d2b088-9f4e-11e9-26c0-150b02ea6a46" Measures = "442fdcdd-2543-5da2-b0f3-8c86c306513e" OrderedCollections = "bac558e1-5e72-5ebc-8fee-abe8a469f55d" Pkg = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f" @@ -51,7 +51,7 @@ file_io_netcdf = "NCDatasets" manufactured_solns_ext = ["Symbolics", "IfElse"] [compat] -MPIQR = "0.1.0" +MUMPS = "1.5.0" julia = "1.9.0" [extras] diff --git a/moment_kinetics/src/electron_kinetic_equation.jl b/moment_kinetics/src/electron_kinetic_equation.jl index d33ad29e6..33785aa2a 100644 --- a/moment_kinetics/src/electron_kinetic_equation.jl +++ b/moment_kinetics/src/electron_kinetic_equation.jl @@ -2,7 +2,7 @@ module electron_kinetic_equation using LinearAlgebra using MPI -using MPIQR +using MUMPS using SparseArrays export get_electron_critical_velocities @@ -72,8 +72,6 @@ import ..runge_kutta const integral_correction_sharpness = 4.0 -const mpiqr_blocksize = 8 # 1 will be slow, 4 is good for smaller matrices, 8 is good for larger matrices, 16 - who knows?! - """ update_electron_pdf is a function that uses the electron kinetic equation to solve for the updated electron pdf @@ -1233,19 +1231,19 @@ pressure \$p_{e∥}\$. vperp_spectral, vpa_spectral, z_advect, vpa_advect, scratch_dummy, external_source_settings, num_diss_params, t_params, ion_dt, ir, evolve_ppar) - + mumps = Mumps{mk_float}(mumps_unsymmetric, default_icntl, default_cntl64; + comm=MPI.API.MPI_Comm_f2c(comm_block[].val)) if block_rank[] >= 0 - localcols = MPIQR.localcolumns(block_rank[], size(precon_matrix, 2), - mpiqr_blocksize, block_size[]) - if size(orig_lu, 1) == 1 + if orig_lu.n == 0#size(orig_lu, 1) == 1 # Have not properly created the LU decomposition before, so # cannot reuse it. @timeit_debug global_timer "lu" begin - mpiqrmatrix = MPIQR.MPIQRMatrix(precon_matrix[:, localcols], - size(precon_matrix); blocksize=mpiqr_blocksize, comm=comm_block[]) - mpiqrstruct = qr!(mpiqrmatrix) + if block_rank[] == 0 + MUMPS.associate_matrix!(mumps, sparse(precon_matrix)) + end + MUMPS.factorize!(mumps) nl_solver_params.preconditioners[ir] = - (mpiqrstruct, precon_matrix, input_buffer, output_buffer) + (mumps, precon_matrix, input_buffer, output_buffer) end else # LU decomposition was previously created. The Jacobian always @@ -1253,8 +1251,10 @@ pressure \$p_{e∥}\$. # reuse some setup. try @timeit_debug global_timer "lu!" begin - orig_lu[:, localcols] .= precon_matrix[:, localcols] - qr!(orig_lu) + if block_rank[] == 0 + MUMPS.associate_matrix!(orig_lu, sparse(precon_matrix)) + end + MUMPS.factorize!(orig_lu) end catch e if !isa(e, ArgumentError) @@ -1262,9 +1262,15 @@ pressure \$p_{e∥}\$. 
end println("Sparsity pattern of matrix changed, rebuilding " * " LU from scratch") - mpiqrmatrix = MPIQR.MPIQRMatrix(precon_matrix[:, localcols], - size(precon_matrix); blocksize=mpiqr_blocksize, comm=comm_block[]) - @timeit_debug global_timer "lu" orig_lu = qr!(mpiqrmatrix) + @timeit_debug global_timer "lu" begin + if block_rank[] == 0 + MUMPS.associate_matrix!(mumps, sparse(precon_matrix)) + end + MUMPS.factorize!(mumps) + nl_solver_params.preconditioners[ir] = + (mumps, precon_matrix, input_buffer, output_buffer) + end + end nl_solver_params.preconditioners[ir] = (orig_lu, precon_matrix, input_buffer, output_buffer) @@ -1294,7 +1300,10 @@ pressure \$p_{e∥}\$. end this_output_buffer .= this_input_buffer - @timeit_debug global_timer "ldiv!" MPIQR.ldiv!(precon_lu, this_output_buffer) + if block_rank[] == 0 + MUMPS.associate_rhs!(precon_lu, this_output_buffer) + end + @timeit_debug global_timer "ldiv!" MUMPS.solve!(precon_lu) begin_serial_region() counter = 1 @@ -1955,18 +1964,19 @@ to allow the outer r-loop to be parallelised. z_advect, vpa_advect, scratch_dummy, external_source_settings, num_diss_params, t_params, ion_dt, ir, true, :all, true, false) + mumps = Mumps{mk_float}(mumps_unsymmetric, default_icntl, default_cntl64; + comm=MPI.API.MPI_Comm_f2c(comm_block[].val)) if block_rank[] >= 0 - localcols = MPIQR.localcols(block_rank[], size(precon_matrix, 2), - mpiqr_blocksize, block_size[]) - if size(orig_lu, 1) == 1 + if orig_lu.n == 0#size(orig_lu, 1) == 1 # Have not properly created the LU decomposition before, so # cannot reuse it. @timeit_debug global_timer "lu" begin - mpiqrmatrix = MPIQR.MPIQRMatrix((precon_matrix[:, localcols]), size(precon_matrix); - blocksize=mpiqr_blocksize, comm=comm_block[]) - mpiqrstruct = qr!(mpiqrmatrix) - nl_solver_params.preconditioners[ir] = - (mpiqrstruct, precon_matrix, input_buffer, output_buffer) + if block_rank[] == 0 + MUMPS.associate_matrix!(mumps, sparse(precon_matrix)) + end + MUMPS.factorize!(mumps) + nl_solver_params.preconditioners[ir] = + (mumps, precon_matrix, input_buffer, output_buffer) end else # LU decomposition was previously created. The Jacobian always @@ -1974,8 +1984,12 @@ to allow the outer r-loop to be parallelised. # reuse some setup. try @timeit_debug global_timer "lu!" begin - orig_lu[:, localcols] .= precon_matrix[:, localcols] - qr!(orig_lu) + if block_rank[] == 0 + MUMPS.associate_matrix!(orig_lu, sparse(precon_matrix)) + end + MUMPS.factorize!(orig_lu) + nl_solver_params.preconditioners[ir] = + (orig_lu, precon_matrix, input_buffer, output_buffer) end catch e if !isa(e, ArgumentError) @@ -1983,9 +1997,12 @@ to allow the outer r-loop to be parallelised. end println("Sparsity pattern of matrix changed, rebuilding " * " LU from scratch") - mpiqrmatrix = MPIQR.MPIQRMatrix(precon_matrix[:, localcols], size(precon_matrix); - blocksize=mpiqr_blocksize, comm=comm_block[]) - @timeit_debug global_timer "lu" orig_lu = qr!(mpiqrmatrix) + if block_rank[] == 0 + MUMPS.associate_matrix!(orig_lu, sparse(precon_matrix)) + end + @timeit_debug global_timer "lu" MUMPS.factorize!(orig_lu) + nl_solver_params.preconditioners[ir] = + (orig_lu, precon_matrix, input_buffer, output_buffer) end nl_solver_params.preconditioners[ir] = (orig_lu, precon_matrix, input_buffer, output_buffer) @@ -2028,8 +2045,11 @@ to allow the outer r-loop to be parallelised. counter += 1 end - this_output_buffer .= this_input_buffer - @timeit_debug global_timer "ldiv!" 
MPIQR.ldiv!(precon_lu, this_output_buffer) + this_output_buffer .= this_input_buffer + if block_rank[] == 0 + MUMPS.associate_rhs!(precon_lu, this_output_buffer) + end + @timeit_debug global_timer "ldiv!" MUMPS.solve!(precon_lu) begin_serial_region() counter = 1 diff --git a/moment_kinetics/src/nonlinear_solvers.jl b/moment_kinetics/src/nonlinear_solvers.jl index 568dba47f..f94d2385c 100644 --- a/moment_kinetics/src/nonlinear_solvers.jl +++ b/moment_kinetics/src/nonlinear_solvers.jl @@ -39,7 +39,7 @@ using ..type_definitions: mk_float, mk_int using LinearAlgebra using MPI -using MPIQR +using MUMPS using SparseArrays using StatsBase: mean @@ -167,25 +167,25 @@ function setup_nonlinear_solve(active, input_dict, coords, outer_coords=(); defa V .= 0.0 end end - mpiqr = MPIQR.MPIQRMatrix(zeros(Float64, 1, 1), (1, block_size[]); - blocksize=1, comm=comm_block[]) - mpiqrstruct = MPIQR.MPIQRStruct(mpiqr) + + mumps = Mumps{mk_float}(mumps_unsymmetric, default_icntl, default_cntl64; + comm=MPI.API.MPI_Comm_f2c(comm_block[].val)) if preconditioner_type === Val(:lu) # Create dummy LU solver objects so we can create an array for preconditioners. # These will be calculated properly within the time loop. - preconditioners = fill(mpiqrstruct, + preconditioners = fill(mumps, reverse(outer_coord_sizes)) elseif preconditioner_type === Val(:electron_split_lu) - preconditioners = (z=fill(mpiqrstruct, + preconditioners = (z=fill(mumps, tuple(coords.vpa.n, reverse(outer_coord_sizes)...)), vpa=fill(mpqirstruct, tuple(coords.z.n, reverse(outer_coord_sizes)...)), - ppar=fill(mpiqrstruct, + ppar=fill(mumps, reverse(outer_coord_sizes)), ) elseif preconditioner_type === Val(:electron_lu) pdf_plus_ppar_size = total_size_coords + coords.z.n - preconditioners = fill((mpiqrstruct, + preconditioners = fill((mumps, allocate_shared_float(pdf_plus_ppar_size, pdf_plus_ppar_size), allocate_shared_float(pdf_plus_ppar_size), allocate_shared_float(pdf_plus_ppar_size), From 08b44aee9d17950b1020f9046a6e0093461211f7 Mon Sep 17 00:00:00 2001 From: James Cook Date: Fri, 24 Jan 2025 11:40:43 +0000 Subject: [PATCH 18/18] add _block_synchnize() calls for safety - can remove late if needed --- moment_kinetics/src/electron_kinetic_equation.jl | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/moment_kinetics/src/electron_kinetic_equation.jl b/moment_kinetics/src/electron_kinetic_equation.jl index 33785aa2a..96d39e73e 100644 --- a/moment_kinetics/src/electron_kinetic_equation.jl +++ b/moment_kinetics/src/electron_kinetic_equation.jl @@ -1231,8 +1231,10 @@ pressure \$p_{e∥}\$. vperp_spectral, vpa_spectral, z_advect, vpa_advect, scratch_dummy, external_source_settings, num_diss_params, t_params, ion_dt, ir, evolve_ppar) - mumps = Mumps{mk_float}(mumps_unsymmetric, default_icntl, default_cntl64; + mumps = Mumps{mk_float}(mumps_unsymmetric, get_icntl(), default_cntl64; comm=MPI.API.MPI_Comm_f2c(comm_block[].val)) + begin_serial_region() + _block_synchronize() if block_rank[] >= 0 if orig_lu.n == 0#size(orig_lu, 1) == 1 # Have not properly created the LU decomposition before, so @@ -1299,11 +1301,15 @@ pressure \$p_{e∥}\$. counter += 1 end + begin_serial_region() this_output_buffer .= this_input_buffer if block_rank[] == 0 MUMPS.associate_rhs!(precon_lu, this_output_buffer) end + + _block_synchronize() @timeit_debug global_timer "ldiv!" MUMPS.solve!(precon_lu) + _block_synchronize() begin_serial_region() counter = 1 @@ -1964,8 +1970,10 @@ to allow the outer r-loop to be parallelised. 
z_advect, vpa_advect, scratch_dummy, external_source_settings, num_diss_params, t_params, ion_dt, ir, true, :all, true, false) - mumps = Mumps{mk_float}(mumps_unsymmetric, default_icntl, default_cntl64; + mumps = Mumps{mk_float}(mumps_unsymmetric, get_icntl(), default_cntl64; comm=MPI.API.MPI_Comm_f2c(comm_block[].val)) + begin_serial_region() + _block_synchronize() if block_rank[] >= 0 if orig_lu.n == 0#size(orig_lu, 1) == 1 # Have not properly created the LU decomposition before, so @@ -2045,11 +2053,14 @@ to allow the outer r-loop to be parallelised. counter += 1 end + begin_serial_region() this_output_buffer .= this_input_buffer if block_rank[] == 0 MUMPS.associate_rhs!(precon_lu, this_output_buffer) end + _block_synchronize() @timeit_debug global_timer "ldiv!" MUMPS.solve!(precon_lu) + _block_synchronize() begin_serial_region() counter = 1
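
Patches 17–18 above replace the distributed MPIQR factorisation with a MUMPS sparse direct solve while keeping the same associate-matrix / factorize / associate-rhs / solve call pattern inside the preconditioner. For reference, below is a minimal standalone sketch of that MUMPS.jl pattern, not code from moment_kinetics: the matrix `A` and right-hand side `b` are illustrative stand-ins, the default communicator is used instead of the `comm_block[]` handle passed in the patches, and `get_solution`/`finalize` (used here to read back the result) do not appear in the patches themselves.

```julia
# Minimal sketch (illustrative, not moment_kinetics code): factorise a sparse
# matrix once with MUMPS and reuse the factorisation for a solve, mirroring the
# "lu"/"lu!" and "ldiv!" steps in the patches above. Assumes MUMPS.jl with an
# MPI-enabled MUMPS build; runs on a single process as well as under mpirun.
using MPI, MUMPS, SparseArrays, LinearAlgebra

MPI.Init()

# Illustrative stand-in for the assembled preconditioner matrix and input buffer.
n = 100
A = sprand(n, n, 0.05) + 10.0I
b = rand(n)

# Same solver settings the patches pass when constructing the Mumps object
# (the patches additionally pass the block communicator via the `comm` keyword).
mumps = Mumps{Float64}(mumps_unsymmetric, default_icntl, default_cntl64)

associate_matrix!(mumps, A)      # analogous to the "lu"/"lu!" timed blocks
factorize!(mumps)

associate_rhs!(mumps, copy(b))   # analogous to filling this_output_buffer
solve!(mumps)                    # analogous to the "ldiv!" timed block
x = get_solution(mumps)

# Check the solve against the original system.
@show norm(A * x - b) / norm(b)

finalize(mumps)
MPI.Finalize()
```

As in the patches, the factorisation object can be kept and refilled with a new matrix of the same size before calling `factorize!` again, which is what the `orig_lu` reuse branch does in place of the previous `lu!(…)` call.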