diff --git a/.ci_fedora.sh b/.ci_fedora.sh
index 452afb4b7e..b8805abb15 100755
--- a/.ci_fedora.sh
+++ b/.ci_fedora.sh
@@ -50,7 +50,7 @@ then
     cp -a /tmp/BOUT-dev /home/test/
     chown -R test /home/test
     chmod u+rwX /home/test -R
-    sudo -u test ${0/\/tmp/\/home\/test} $mpi
+    su - test -c "${0/\/tmp/\/home\/test} $mpi"
 ## If we are called as normal user, run test
 else
     . /etc/profile.d/modules.sh
diff --git a/.clang-format b/.clang-format
index f51c5bde87..a80c59bddd 100644
--- a/.clang-format
+++ b/.clang-format
@@ -109,6 +109,8 @@ SpacesInParentheses: false
 SpacesInSquareBrackets: false
 StatementMacros:
   - BOUT_OMP
+  - BOUT_OMP_PERF
+  - BOUT_OMP_SAFE
 Standard:        c++14
 TabWidth:        8
 UseTab:          Never
diff --git a/.clang-tidy b/.clang-tidy
index 3be0af4917..0117c20e42 100644
--- a/.clang-tidy
+++ b/.clang-tidy
@@ -2,7 +2,6 @@
 Checks:          'clang-diagnostic-*,clang-analyzer-*,performance-*,readability-*,bugprone-*,clang-analyzer-*,cppcoreguidelines-*,mpi-*,misc-*,-readability-magic-numbers,-cppcoreguidelines-avoid-magic-numbers,-misc-non-private-member-variables-in-classes,-clang-analyzer-optin.mpi*,-bugprone-exception-escape,-cppcoreguidelines-pro-bounds-pointer-arithmetic,-readability-function-cognitive-complexity,-misc-no-recursion,-bugprone-easily-swappable-parameters'
 WarningsAsErrors: ''
 HeaderFilterRegex: ''
-AnalyzeTemporaryDtors: false
 FormatStyle:     file
 CheckOptions:
 
@@ -10,7 +9,7 @@ CheckOptions:
   - key:             readability-identifier-length.IgnoredVariableNames
     value:           '^[dn]?[xyz]$'
   - key:             readability-identifier-length.IgnoredParameterNames
-    value:           '^[fijkxyz][01xyz]?$'
+    value:           '^[dfijknxyz][01xyz]?$'
   - key:             readability-identifier-length.IgnoredLoopCounterNames
     value:           '^[ijkxyz_]$'
 
diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
index 42965e75e8..bdaeb3dc4f 100644
--- a/.github/workflows/tests.yml
+++ b/.github/workflows/tests.yml
@@ -39,7 +39,7 @@ jobs:
         is_cron:
           - ${{ github.event_name == 'cron' }}
         config:
-          - name: "CMake, PETSc unreleased, ADIOS"
+          - name: "CMake, PETSc unreleased, ADIOS2"
             os: ubuntu-20.04
             cmake_options: "-DBUILD_SHARED_LIBS=ON
                             -DBOUT_ENABLE_METRIC_3D=ON
diff --git a/.gitignore b/.gitignore
index 7ddf9526ab..934da1c0de 100644
--- a/.gitignore
+++ b/.gitignore
@@ -85,3 +85,5 @@ coverage/
 /_version.txt
 /BOUT++-v*.tar.gz
 /BOUT++-v*.tar.xz
+/CMakeCache.txt
+/CMakeFiles/cmake.check_cache
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 483672fb67..f57a78a14a 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -366,7 +366,7 @@ else()
   set(BOUT_GENERATE_FIELDOPS_DEFAULT OFF)
 endif()
 
-execute_process(COMMAND ${Python3_EXECUTABLE} -c "import zoidberg"
+execute_process(COMMAND ${Python3_EXECUTABLE} -c "import importlib.util ; import sys; sys.exit(importlib.util.find_spec(\"zoidberg\") is None)"
   RESULT_VARIABLE zoidberg_FOUND)
 if (zoidberg_FOUND EQUAL 0)
   set(zoidberg_FOUND ON)
@@ -774,7 +774,7 @@ set(BOUT_HAS_PNETCDF OFF)
 # while for static builds we need the dependencies too
 if (BUILD_SHARED_LIBS)
   # Include rpath linker flag so user doesn't need to set LD_LIBRARY_PATH
-  set(CONFIG_LDFLAGS "${CMAKE_SHARED_LIBRARY_RUNTIME_CXX_FLAG}\$BOUT_LIB_PATH -L\$BOUT_LIB_PATH -lbout++ -lfmt")
+  set(CONFIG_LDFLAGS "${CMAKE_SHARED_LIBRARY_RUNTIME_CXX_FLAG}\$BOUT_LIB_PATH -L\$BOUT_LIB_PATH -lbout++ -lfmt ${CONFIG_LDFLAGS_SHARED}")
 else()
   set(CONFIG_LDFLAGS "${CONFIG_LDFLAGS}")
 endif()
@@ -935,7 +935,7 @@ message("
    SUNDIALS support         : ${BOUT_HAS_SUNDIALS}
    HYPRE support            : ${BOUT_HAS_HYPRE}
    NetCDF support           : ${BOUT_HAS_NETCDF}
-   ADIOS support            : ${BOUT_HAS_ADIOS}
+   ADIOS2 support           : ${BOUT_HAS_ADIOS2}
    FFTW support             : ${BOUT_HAS_FFTW}
    LAPACK support           : ${BOUT_HAS_LAPACK}
    OpenMP support           : ${BOUT_USE_OPENMP}
diff --git a/bin/bout-build-deps.sh b/bin/bout-build-deps.sh
index 19e3b2a0d3..d96d500dc9 100755
--- a/bin/bout-build-deps.sh
+++ b/bin/bout-build-deps.sh
@@ -98,7 +98,7 @@ netcdf() {
 
 nccxx() {
     cd $BUILD
-    wget -c ftp://ftp.unidata.ucar.edu/pub/netcdf/netcdf-cxx4-$NCCXXVER.tar.gz || :
+    wget -c https://downloads.unidata.ucar.edu/netcdf-cxx/$NCCXXVER/netcdf-cxx4-$NCCXXVER.tar.gz || :
     tar -xf netcdf-cxx4-$NCCXXVER.tar.gz
     cd netcdf-cxx4-$NCCXXVER
     CPPFLAGS="-I$PREFIX/include" LDFLAGS="-L$PREFIX/lib/" ./configure --prefix=$PREFIX $NCCXXFLAGS
@@ -286,17 +286,17 @@ set -x
 ## Setup folders and links
 setup
 ## Build and install hdf5
-hdf5
+test -n "${NO_HDF5:-}" || hdf5  # skipped when NO_HDF5 is set to a non-empty value
 ## Build and install netcdf
-netcdf
+test -n "${NO_NETCDF:-}" || netcdf  # skipped when NO_NETCDF is non-empty
 ## Build and install C++ interface for netcdf
-nccxx
+test -n "${NO_NCXX:-}" || nccxx  # skipped when NO_NCXX is non-empty
 ## Build and install FFTW
-fftw
+test -n "${NO_FFTW:-}" || fftw  # skipped when NO_FFTW is non-empty
 ## Build and install Sundials
-sundials
+test -n "${NO_SUNDIALS:-}" || sundials  # skipped when NO_SUNDIALS is non-empty
 ## Build and install PETSc
-petsc
+test -n "${NO_PETSC:-}" || petsc  # skipped when NO_PETSC is non-empty
 ## Download BOUT++ submodules
 submod
 # Install python packages
diff --git a/bin/bout-config.in b/bin/bout-config.in
index a9045fff39..b5a62a42eb 100755
--- a/bin/bout-config.in
+++ b/bin/bout-config.in
@@ -29,7 +29,7 @@ idlpath="@IDLCONFIGPATH@"
 pythonpath="@PYTHONCONFIGPATH@"
 
 has_netcdf="@BOUT_HAS_NETCDF@"
-has_adios="@BOUT_HAS_ADIOS@"
+has_adios2="@BOUT_HAS_ADIOS2@"
 has_legacy_netcdf="@BOUT_HAS_LEGACY_NETCDF@"
 has_pnetcdf="@BOUT_HAS_PNETCDF@"
 has_pvode="@BOUT_HAS_PVODE@"
@@ -71,18 +71,18 @@ Available values for OPTION include:
   --idl          IDL path
   --python       Python path
 
-  --has-netcdf  NetCDF file support
-  --has-adios   ADIOS file support
+  --has-netcdf   NetCDF file support
+  --has-adios2   ADIOS2 file support
   --has-legacy-netcdf  Legacy NetCDF file support
-  --has-pnetcdf Parallel NetCDF file support
-  --has-pvode   PVODE solver support
-  --has-cvode   SUNDIALS CVODE solver support
-  --has-ida     SUNDIALS IDA solver support
-  --has-lapack  LAPACK support
-  --has-petsc   PETSc support
-  --has-hypre   Hypre support
-  --has-slepc   SLEPc support
-  --has-nls     Natural Language Support
+  --has-pnetcdf  Parallel NetCDF file support
+  --has-pvode    PVODE solver support
+  --has-cvode    SUNDIALS CVODE solver support
+  --has-ida      SUNDIALS IDA solver support
+  --has-lapack   LAPACK support
+  --has-petsc    PETSc support
+  --has-hypre    Hypre support
+  --has-slepc    SLEPc support
+  --has-nls      Natural Language Support
 
   --petsc-has-sundials 
 
@@ -123,6 +123,7 @@ all()
         echo "  --has-slepc   -> $has_slepc"
         echo "  --has-arkode  -> $has_arkode"
         echo "  --has-nls     -> $has_nls"
+        echo "  --has-openmp  -> $has_openmp"
         echo
         echo "  --petsc-has-sundials -> $petsc_has_sundials"
         echo
diff --git a/bout++Config.cmake.in b/bout++Config.cmake.in
index 3d824e455f..5af0dc43ea 100644
--- a/bout++Config.cmake.in
+++ b/bout++Config.cmake.in
@@ -15,7 +15,7 @@ set(BOUT_USE_METRIC_3D @BOUT_USE_METRIC_3D@)
 
 set(BOUT_HAS_PVODE @BOUT_HAS_PVODE@)
 set(BOUT_HAS_NETCDF @BOUT_HAS_NETCDF@)
-set(BOUT_HAS_ADIOS @BOUT_HAS_ADIOS@)
+set(BOUT_HAS_ADIOS2 @BOUT_HAS_ADIOS2@)
 set(BOUT_HAS_FFTW @BOUT_HAS_FFTW@)
 set(BOUT_HAS_LAPACK @BOUT_HAS_LAPACK@)
 set(BOUT_HAS_PETSC @BOUT_HAS_PETSC@)
diff --git a/cmake/FindCython.cmake b/cmake/FindCython.cmake
index 76f43480d9..3b98cde89e 100644
--- a/cmake/FindCython.cmake
+++ b/cmake/FindCython.cmake
@@ -10,7 +10,7 @@
 #   CYTHON_FOUND - true if Cython was found
 #   CYTHON_VERSION - Cython version
 
-execute_process(COMMAND ${Python_EXECUTABLE} -c "import cython ; print(cython.__version__)"
+execute_process(COMMAND ${Python3_EXECUTABLE} -c "import cython ; print(cython.__version__)"
   RESULT_VARIABLE _cython_runs
   OUTPUT_VARIABLE CYTHON_VERSION
   OUTPUT_STRIP_TRAILING_WHITESPACE
diff --git a/cmake/FindNumpy.cmake b/cmake/FindNumpy.cmake
index 201bc19221..b6de6e3e35 100644
--- a/cmake/FindNumpy.cmake
+++ b/cmake/FindNumpy.cmake
@@ -12,32 +12,32 @@
 #   Numpy_INCLUDE_DIR
 
 
-find_package(Python 3.6 COMPONENTS Interpreter Development)
+find_package(Python3 3.6 COMPONENTS Interpreter Development)
 
-if (NOT Python_FOUND)
+if (NOT Python3_FOUND)
   message(STATUS "Could not find numpy as python was not found. Maybe the developement package is missing?")
-  set(Numpy_FOUND ${Python_FOUND})
+  set(Numpy_FOUND ${Python3_FOUND})
   return()
 endif()
 
 if (NOT Numpy_FOUND)
-  execute_process(COMMAND ${Python_EXECUTABLE} -c "import numpy ; print(numpy.__version__)"
+  execute_process(COMMAND ${Python3_EXECUTABLE} -c "import numpy ; print(numpy.__version__)"
     OUTPUT_STRIP_TRAILING_WHITESPACE
     OUTPUT_VARIABLE Numpy_VERSION
     )
-  execute_process(COMMAND ${Python_EXECUTABLE} -c "import numpy ; print(numpy.get_include())"
+  execute_process(COMMAND ${Python3_EXECUTABLE} -c "import numpy ; print(numpy.get_include())"
     OUTPUT_STRIP_TRAILING_WHITESPACE
     OUTPUT_VARIABLE _numpy_include_dirs
     )
 endif()
 
 if (Numpy_DEBUG)
-  message(STATUS "Looking for numpy headers in: ${_numpy_include_dirs} ${PYTHON_INCLUDE_DIR}")
+  message(STATUS "Looking for numpy headers in: ${_numpy_include_dirs} ${Python3_INCLUDE_DIRS}")
 endif()
 
 find_path(Numpy_INCLUDE_DIR
   numpy/arrayobject.h
-  PATHS "${_numpy_include_dirs}" "${PYTHON_INCLUDE_DIR}"
+  PATHS "${_numpy_include_dirs}" "${Python3_INCLUDE_DIRS}"
   PATH_SUFFIXES numpy/core/include
   )
 
diff --git a/cmake/FindSUNDIALS.cmake b/cmake/FindSUNDIALS.cmake
index 1ecb5db429..15b266d06a 100644
--- a/cmake/FindSUNDIALS.cmake
+++ b/cmake/FindSUNDIALS.cmake
@@ -104,16 +104,8 @@ endforeach()
 
 if (SUNDIALS_INCLUDE_DIR)
   file(READ "${SUNDIALS_INCLUDE_DIR}/sundials_config.h" SUNDIALS_CONFIG_FILE)
-  string(FIND "${SUNDIALS_CONFIG_FILE}" "SUNDIALS_PACKAGE_VERSION" index)
-  if("${index}" LESS 0)
-    # Version >3
-    set(SUNDIALS_VERSION_REGEX_PATTERN
-      ".*#define SUNDIALS_VERSION \"([0-9]+)\\.([0-9]+)\\.([0-9]+)\".*")
-  else()
-    # Version <3
-    set(SUNDIALS_VERSION_REGEX_PATTERN
-      ".*#define SUNDIALS_PACKAGE_VERSION \"([0-9]+)\\.([0-9]+)\\.([0-9]+)\".*")
-  endif()
+  set(SUNDIALS_VERSION_REGEX_PATTERN
+    ".*#define SUNDIALS_VERSION \"([0-9]+)\\.([0-9]+)\\.([0-9]+)\".*")
   string(REGEX MATCH ${SUNDIALS_VERSION_REGEX_PATTERN} _ "${SUNDIALS_CONFIG_FILE}")
   set(SUNDIALS_VERSION_MAJOR ${CMAKE_MATCH_1} CACHE STRING "")
   set(SUNDIALS_VERSION_MINOR ${CMAKE_MATCH_2} CACHE STRING "")
diff --git a/cmake/SetupBOUTThirdParty.cmake b/cmake/SetupBOUTThirdParty.cmake
index ef0fd438d4..9c49fe6fdc 100644
--- a/cmake/SetupBOUTThirdParty.cmake
+++ b/cmake/SetupBOUTThirdParty.cmake
@@ -8,6 +8,9 @@ endif ()
 # determined in SetupCompilers.cmake
 if (BOUT_USE_OPENMP)
   target_link_libraries(bout++ PUBLIC OpenMP::OpenMP_CXX)
+  set(CONFIG_LDFLAGS "${CONFIG_LDFLAGS} -fopenmp")
+  set(CONFIG_LDFLAGS_SHARED "${CONFIG_LDFLAGS_SHARED} -fopenmp")
+  set(CONFIG_CFLAGS "${CONFIG_CFLAGS} -fopenmp")
 endif()
 
 # determined in SetupCompilers.cmake
@@ -187,10 +190,10 @@ endif()
 message(STATUS "NetCDF support: ${BOUT_USE_NETCDF}")
 set(BOUT_HAS_NETCDF ${BOUT_USE_NETCDF})
 
-option(BOUT_USE_ADIOS "Enable support for ADIOS output" ON)
-option(BOUT_DOWNLOAD_ADIOS "Download and build ADIOS2" OFF)
-if (BOUT_USE_ADIOS)
-  if (BOUT_DOWNLOAD_ADIOS)
+option(BOUT_USE_ADIOS2 "Enable support for ADIOS2 output" ON)
+option(BOUT_DOWNLOAD_ADIOS2 "Download and build ADIOS2" OFF)
+if (BOUT_USE_ADIOS2)
+  if (BOUT_DOWNLOAD_ADIOS2)
     message(STATUS "Downloading and configuring ADIOS2")
     include(FetchContent)
     FetchContent_Declare(
@@ -217,12 +220,12 @@ if (BOUT_USE_ADIOS)
       find_package(MPI REQUIRED COMPONENTS C)
       target_link_libraries(bout++ PUBLIC adios2::cxx11_mpi MPI::MPI_C)
     else()
-      set(BOUT_USE_ADIOS OFF)
+      set(BOUT_USE_ADIOS2 OFF)
     endif()
   endif()
 endif()
-message(STATUS "ADIOS support: ${BOUT_USE_ADIOS}")
-set(BOUT_HAS_ADIOS ${BOUT_USE_ADIOS})
+message(STATUS "ADIOS2 support: ${BOUT_USE_ADIOS2}")
+set(BOUT_HAS_ADIOS2 ${BOUT_USE_ADIOS2})
 
 
 option(BOUT_USE_FFTW "Enable support for FFTW" ON)
@@ -278,8 +281,8 @@ if (BOUT_USE_SUNDIALS)
     include(FetchContent)
     FetchContent_Declare(
       sundials
-      GIT_REPOSITORY https://github.com/ZedThree/sundials
-      GIT_TAG        cmake-export-fixes
+      GIT_REPOSITORY https://github.com/LLNL/sundials
+      GIT_TAG        v7.0.0
       )
     # Note: These are settings for building SUNDIALS
     set(EXAMPLES_ENABLE_C OFF CACHE BOOL "" FORCE)
@@ -294,7 +297,11 @@ if (BOUT_USE_SUNDIALS)
     FetchContent_MakeAvailable(sundials)
     message(STATUS "SUNDIALS done configuring")
   else()
+    enable_language(C)
     find_package(SUNDIALS REQUIRED)
+    if (SUNDIALS_VERSION VERSION_LESS 4.0.0)
+      message(FATAL_ERROR "SUNDIALS_VERSION 4.0.0 or newer is required. Found version ${SUNDIALS_VERSION}.")
+    endif()
   endif()
   target_link_libraries(bout++ PUBLIC SUNDIALS::nvecparallel)
   target_link_libraries(bout++ PUBLIC SUNDIALS::cvode)
diff --git a/cmake_build_defines.hxx.in b/cmake_build_defines.hxx.in
index ed6e8685f6..4d63a01b7d 100644
--- a/cmake_build_defines.hxx.in
+++ b/cmake_build_defines.hxx.in
@@ -13,7 +13,7 @@
 #cmakedefine01 BOUT_HAS_IDA
 #cmakedefine01 BOUT_HAS_LAPACK
 #cmakedefine01 BOUT_HAS_NETCDF
-#cmakedefine01 BOUT_HAS_ADIOS
+#cmakedefine01 BOUT_HAS_ADIOS2
 #cmakedefine01 BOUT_HAS_PETSC
 #cmakedefine01 BOUT_HAS_PRETTY_FUNCTION
 #cmakedefine01 BOUT_HAS_PVODE
diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt
index 3849d34852..022b16e248 100644
--- a/examples/CMakeLists.txt
+++ b/examples/CMakeLists.txt
@@ -11,6 +11,7 @@ add_subdirectory(backtrace)
 add_subdirectory(blob2d)
 add_subdirectory(blob2d-outerloop)
 add_subdirectory(blob2d-laplacexz)
+add_subdirectory(boutpp)
 add_subdirectory(boundary-conditions/advection)
 add_subdirectory(conducting-wall-mode)
 add_subdirectory(conduction)
diff --git a/examples/blob2d/blob2d.cxx b/examples/blob2d/blob2d.cxx
index f41f857d46..7007bbeb77 100644
--- a/examples/blob2d/blob2d.cxx
+++ b/examples/blob2d/blob2d.cxx
@@ -25,7 +25,6 @@ class Blob2D : public PhysicsModel {
   BoutReal rho_s;   ///< Bohm gyro radius
   BoutReal Omega_i; ///< Ion cyclotron frequency
   BoutReal c_s;     ///< Bohm sound speed
-  BoutReal n0;      ///< Reference density
 
   // Constants to calculate the parameters
   BoutReal Te0; ///< Isothermal temperature [eV]
@@ -61,7 +60,6 @@ class Blob2D : public PhysicsModel {
     m_i = options["m_i"].withDefault(2 * 1.667e-27);
     m_e = options["m_e"].withDefault(9.11e-31);
 
-    n0 = options["n0"].doc("Background density in cubic m").withDefault(1e19);
     D_vort = options["D_vort"].doc("Viscous diffusion coefficient").withDefault(0.0);
     D_n = options["D_n"].doc("Density diffusion coefficient").withDefault(0.0);
 
diff --git a/examples/blob2d/delta_0.25/BOUT.inp b/examples/blob2d/delta_0.25/BOUT.inp
index 58d1e36741..841fcaf235 100644
--- a/examples/blob2d/delta_0.25/BOUT.inp
+++ b/examples/blob2d/delta_0.25/BOUT.inp
@@ -87,8 +87,6 @@ flags = 49152  # set_rhs i.e. identity matrix in boundaries
 
 Te0 = 5    # Electron Temperature (eV)
 
-n0 = 2e+18  # Background plasma density (m^-3)
-
 compressible = false  # Compressibility?
 
 boussinesq = true  # Boussinesq approximation (no perturbed n in vorticity)
diff --git a/examples/blob2d/delta_1/BOUT.inp b/examples/blob2d/delta_1/BOUT.inp
index 417911271d..39213ddd36 100644
--- a/examples/blob2d/delta_1/BOUT.inp
+++ b/examples/blob2d/delta_1/BOUT.inp
@@ -87,8 +87,6 @@ flags = 49152  # set_rhs i.e. identity matrix in boundaries
 
 Te0 = 5    # Electron Temperature (eV)
 
-n0 = 2e+18  # Background plasma density (m^-3)
-
 compressible = false  # Compressibility?
 
 boussinesq = true  # Boussinesq approximation (no perturbed n in vorticity)
diff --git a/examples/blob2d/delta_10/BOUT.inp b/examples/blob2d/delta_10/BOUT.inp
index 353c28c3b2..f4507b871b 100644
--- a/examples/blob2d/delta_10/BOUT.inp
+++ b/examples/blob2d/delta_10/BOUT.inp
@@ -87,8 +87,6 @@ flags = 49152  # set_rhs i.e. identity matrix in boundaries
 
 Te0 = 5    # Electron Temperature (eV)
 
-n0 = 2e+18  # Background plasma density (m^-3)
-
 compressible = false  # Compressibility?
 
 boussinesq = true  # Boussinesq approximation (no perturbed n in vorticity)
diff --git a/examples/boutpp/CMakeLists.txt b/examples/boutpp/CMakeLists.txt
new file mode 100644
index 0000000000..e46a7ae990
--- /dev/null
+++ b/examples/boutpp/CMakeLists.txt
@@ -0,0 +1,10 @@
+cmake_minimum_required(VERSION 3.13)
+
+if (NOT TARGET bout++::bout++)  # no bout++::bout++ target defined yet (presumably a standalone configure)
+  find_package(bout++ REQUIRED)  # look up a BOUT++ package instead; configuration fails if none is found
+endif()
+
+bout_copy_file(runexample)  # bout_copy_file is defined elsewhere; presumably copies the file to the build tree -- confirm
+bout_copy_file(blob2d.py)
+bout_copy_file(simulation.py)
+bout_copy_file(data/BOUT.inp)
diff --git a/examples/boutpp/blob2d.py b/examples/boutpp/blob2d.py
index d5f370a454..4dc8ea60ac 100755
--- a/examples/boutpp/blob2d.py
+++ b/examples/boutpp/blob2d.py
@@ -24,7 +24,7 @@ def init(self, restart):
 
         self.phiSolver = bc.Laplacian()
 
-        options = bc.Options("model")
+        options = bc.Options.root("model")
         # Temperature in eV
         Te0 = options.get("Te0", 30)
         e = options.get("e", 1.602e-19)
@@ -70,12 +70,20 @@ def init(self, restart):
 
         # /************ Create a solver for potential ********/
 
+        opts_boussinesq = bc.Options.root("phiBoussinesq")
+        opts_non_boussinesq = bc.Options.root("phiSolver")
+
         if self.boussinesq:
             # BOUT.inp section "phiBoussinesq"
-            self.phiSolver = bc.Laplacian(bc.Options("phiBoussinesq"))
+            opts_used = opts_boussinesq
+            opts_unused = opts_non_boussinesq
         else:
             # BOUT.inp section "phiSolver"
-            self.phiSolver = bc.Laplacian(bc.Options("phiSolver"))
+            opts_used = opts_non_boussinesq
+            opts_unused = opts_boussinesq
+
+        self.phiSolver = bc.Laplacian(opts_used)
+        opts_unused.setConditionallyUsed()
 
         # Starting guess for first solve (if iterative)
         self.phi = bc.create3D("0")
@@ -165,8 +173,8 @@ def ensure_blob():
 
 # settings used by the core code
 
-NOUT = 50      # number of time-steps
-TIMESTEP = 50  # time between outputs [1/wci]
+nout = 50      # number of time-steps
+timestep = 50  # time between outputs [1/wci]
 
 
 MXG = 2      # Number of X guard cells
@@ -198,8 +206,8 @@ def ensure_blob():
 
 [mesh:ddz]
 
-first = FFT
-second = FFT
+first = C2
+second = C2
 upwind = W3
 
 ###################################################
@@ -207,8 +215,8 @@ def ensure_blob():
 
 [solver]
 
-ATOL = 1.0e-10  # absolute tolerance
-RTOL = 1.0e-5   # relative tolerance
+atol = 1e-10  # absolute tolerance
+rtol = 1e-05   # relative tolerance
 mxstep = 10000  # Maximum internal steps per output
 
 ###################################################
@@ -221,22 +229,20 @@ def ensure_blob():
 
 fourth_order = true  # 4th order or 2nd order
 
-flags = 0  # inversion flags for phi
-             # 0  = Zero value
-             # 10 = Zero gradient AC inner & outer
-             # 15 = Zero gradient AC and DC
-             # 768 = Zero laplace inner & outer
+# 0  = Zero value
+# 10 = Zero gradient AC inner & outer
+# 15 = Zero gradient AC and DC
+# 768 = Zero laplace inner & outer
 
 [phiSolver:precon]  # Preconditioner (if pctype=user)
-filter     = 0.     # Must not filter solution
-flags      = 49152  # set_rhs i.e. identity matrix in boundaries
+filter = 0.0     # Must not filter solution
+flags = 49152  # set_rhs i.e. identity matrix in boundaries
 
 ###################################################
 # Electrostatic potential solver (Boussinesq)
 
 [phiBoussinesq]
 # By default type is tri (serial) or spt (parallel)
-flags = 0
 
 ##################################################
 # general settings for the model
@@ -245,14 +251,12 @@ def ensure_blob():
 
 Te0 = 5    # Electron Temperature (eV)
 
-n0 = 2e18  # Background plasma density (m^-3)
-
 compressible = false  # Compressibility?
 
 boussinesq = true  # Boussinesq approximation (no perturbed n in vorticity)
 
-D_vort = 1e-6  # Viscosity
-D_n = 1e-6    # Diffusion
+D_vort = 1e-06  # Viscosity
+D_n = 1e-06    # Diffusion
 
 R_c = 1.5  # Radius of curvature (m)
 
@@ -261,7 +265,7 @@ def ensure_blob():
 # These can be overridden for individual variables in
 # a section of that name.
 
-[All]
+[all]
 scale = 0.0 # default size of initial perturbations
 
 bndry_all = neumann # Zero-gradient on all boundaries
@@ -278,9 +282,8 @@ def ensure_blob():
 
 
 if __name__ == "__main__":
-    if "--create" in sys.argv:
-        sys.argv.remove("--create")
-        ensure_blob()
+    ensure_blob()
+
     bc.init("-d blob".split(" ") + sys.argv[1:])
 
     # Create an instance
diff --git a/examples/boutpp/data/BOUT.inp b/examples/boutpp/data/BOUT.inp
new file mode 100644
index 0000000000..d91707ec1b
--- /dev/null
+++ b/examples/boutpp/data/BOUT.inp
@@ -0,0 +1,9 @@
+nout=10
+timestep=10
+
+[mesh]
+nx=160
+ny=1
+nz=n/n
+
+MYG=0
diff --git a/examples/elm-pb/elm_pb.cxx b/examples/elm-pb/elm_pb.cxx
index e81742747a..f108e58e2f 100644
--- a/examples/elm-pb/elm_pb.cxx
+++ b/examples/elm-pb/elm_pb.cxx
@@ -1427,23 +1427,30 @@ class ELMpb : public PhysicsModel {
 
     if (sheath_boundaries) {
 
+      // Need to shift into field-aligned coordinates before applying
+      // parallel boundary conditions
+
+      auto phi_fa = toFieldAligned(phi);
+      auto P_fa = toFieldAligned(P);
+      auto Jpar_fa = toFieldAligned(Jpar);
+
       // At y = ystart (lower boundary)
 
       for (RangeIterator r = mesh->iterateBndryLowerY(); !r.isDone(); r++) {
         for (int jz = 0; jz < mesh->LocalNz; jz++) {
 
           // Zero-gradient potential
-          BoutReal phisheath = phi(r.ind, mesh->ystart, jz);
+          BoutReal const phisheath = phi_fa(r.ind, mesh->ystart, jz);
 
           BoutReal jsheath = -(sqrt(mi_me) / (2. * sqrt(PI))) * phisheath;
 
           // Apply boundary condition half-way between cells
           for (int jy = mesh->ystart - 1; jy >= 0; jy--) {
             // Neumann conditions
-            P(r.ind, jy, jz) = P(r.ind, mesh->ystart, jz);
-            phi(r.ind, jy, jz) = phisheath;
+            P_fa(r.ind, jy, jz) = P_fa(r.ind, mesh->ystart, jz);
+            phi_fa(r.ind, jy, jz) = phisheath;
             // Dirichlet condition on Jpar
-            Jpar(r.ind, jy, jz) = 2. * jsheath - Jpar(r.ind, mesh->ystart, jz);
+            Jpar_fa(r.ind, jy, jz) = 2. * jsheath - Jpar_fa(r.ind, mesh->ystart, jz);
           }
         }
       }
@@ -1454,22 +1461,27 @@ class ELMpb : public PhysicsModel {
         for (int jz = 0; jz < mesh->LocalNz; jz++) {
 
           // Zero-gradient potential
-          BoutReal phisheath = phi(r.ind, mesh->yend, jz);
+          BoutReal const phisheath = phi_fa(r.ind, mesh->yend, jz);
 
           BoutReal jsheath = (sqrt(mi_me) / (2. * sqrt(PI))) * phisheath;
 
           // Apply boundary condition half-way between cells
           for (int jy = mesh->yend + 1; jy < mesh->LocalNy; jy++) {
             // Neumann conditions
-            P(r.ind, jy, jz) = P(r.ind, mesh->yend, jz);
-            phi(r.ind, jy, jz) = phisheath;
+            P_fa(r.ind, jy, jz) = P_fa(r.ind, mesh->yend, jz);
+            phi_fa(r.ind, jy, jz) = phisheath;
             // Dirichlet condition on Jpar
             // WARNING: this is not correct if staggered grids are used
             ASSERT3(not mesh->StaggerGrids);
-            Jpar(r.ind, jy, jz) = 2. * jsheath - Jpar(r.ind, mesh->yend, jz);
+            Jpar_fa(r.ind, jy, jz) = 2. * jsheath - Jpar_fa(r.ind, mesh->yend, jz);
           }
         }
       }
+
+      // Shift back from field aligned coordinates
+      phi = fromFieldAligned(phi_fa);
+      P = fromFieldAligned(P_fa);
+      Jpar = fromFieldAligned(Jpar_fa);
     }
 
     ////////////////////////////////////////////////////
diff --git a/examples/fci-wave-logn/boundary/BOUT.inp b/examples/fci-wave-logn/boundary/BOUT.inp
index 11e57ec47d..0632aa949b 100644
--- a/examples/fci-wave-logn/boundary/BOUT.inp
+++ b/examples/fci-wave-logn/boundary/BOUT.inp
@@ -20,7 +20,7 @@ expand_divergence = false
 background = 1e-06   # Background density
 
 [all]
-bndry_par_all = parallel_neumann
+bndry_par_all = parallel_neumann_o2
 bndry_all = neumann
 
 [n]
@@ -28,15 +28,15 @@ bndry_all = neumann
 zl = z / (2*pi)
 function = fciwave:background + 1e-3*exp(-((x-0.7)/0.1)^2 - ((zl-0.3)/0.1)^2)
 
-bndry_par_yup = parallel_neumann
-bndry_par_ydown = parallel_neumann
+bndry_par_yup = parallel_neumann_o2
+bndry_par_ydown = parallel_neumann_o2
 
 [logn]
 
 function = log(n:function)
 
-bndry_par_yup = parallel_neumann
-bndry_par_ydown = parallel_neumann
+bndry_par_yup = parallel_neumann_o2
+bndry_par_ydown = parallel_neumann_o2
 
 [v]
 
diff --git a/examples/fci-wave-logn/div-integrate/BOUT.inp b/examples/fci-wave-logn/div-integrate/BOUT.inp
index a37bf3e2a5..66bdbce5f2 100644
--- a/examples/fci-wave-logn/div-integrate/BOUT.inp
+++ b/examples/fci-wave-logn/div-integrate/BOUT.inp
@@ -20,7 +20,7 @@ expand_divergence = false
 background = 1e-06   # Background density
 
 [all]
-bndry_par_all = parallel_neumann
+bndry_par_all = parallel_neumann_o2
 bndry_all = neumann
 
 [n]
@@ -28,15 +28,15 @@ bndry_all = neumann
 zl = z / (2*pi)
 function = fciwave:background + 1e-3*exp(-((x-0.7)/0.1)^2 - ((zl-0.3)/0.1)^2)
 
-bndry_par_yup = parallel_neumann
-bndry_par_ydown = parallel_neumann
+bndry_par_yup = parallel_neumann_o2
+bndry_par_ydown = parallel_neumann_o2
 
 [logn]
 
 function = log(n:function)
 
-bndry_par_yup = parallel_neumann
-bndry_par_ydown = parallel_neumann
+bndry_par_yup = parallel_neumann_o2
+bndry_par_ydown = parallel_neumann_o2
 
 [v]
 
diff --git a/examples/fci-wave-logn/expanded/BOUT.inp b/examples/fci-wave-logn/expanded/BOUT.inp
index 3a2935c6e8..e084511d24 100644
--- a/examples/fci-wave-logn/expanded/BOUT.inp
+++ b/examples/fci-wave-logn/expanded/BOUT.inp
@@ -20,7 +20,7 @@ expand_divergence = true
 background = 1e-06   # Background density
 
 [all]
-bndry_par_all = parallel_neumann
+bndry_par_all = parallel_neumann_o2
 bndry_all = neumann
 
 [n]
@@ -28,15 +28,15 @@ bndry_all = neumann
 zl = z / (2*pi)
 function = fciwave:background + 1e-3*exp(-((x-0.7)/0.1)^2 - ((zl-0.3)/0.1)^2)
 
-bndry_par_yup = parallel_neumann
-bndry_par_ydown = parallel_neumann
+bndry_par_yup = parallel_neumann_o2
+bndry_par_ydown = parallel_neumann_o2
 
 [logn]
 
 function = log(n:function)
 
-bndry_par_yup = parallel_neumann
-bndry_par_ydown = parallel_neumann
+bndry_par_yup = parallel_neumann_o2
+bndry_par_ydown = parallel_neumann_o2
 
 [v]
 
diff --git a/examples/fci-wave-logn/fci-wave.cxx b/examples/fci-wave-logn/fci-wave.cxx
index 731897ad4e..2ea9048421 100644
--- a/examples/fci-wave-logn/fci-wave.cxx
+++ b/examples/fci-wave-logn/fci-wave.cxx
@@ -62,7 +62,7 @@ class FCIwave : public PhysicsModel {
 
     // Neumann boundaries simplifies parallel derivatives
     Bxyz.applyBoundary("neumann");
-    Bxyz.applyParallelBoundary("parallel_neumann");
+    Bxyz.applyParallelBoundary("parallel_neumann_o2");
     SAVE_ONCE(Bxyz);
 
     Options::getRoot()->getSection("fciwave")->get("expand_divergence", expand_divergence,
diff --git a/examples/fci-wave/div-integrate/BOUT.inp b/examples/fci-wave/div-integrate/BOUT.inp
index eb41d5f228..68f2326f52 100644
--- a/examples/fci-wave/div-integrate/BOUT.inp
+++ b/examples/fci-wave/div-integrate/BOUT.inp
@@ -21,7 +21,7 @@ log_density = false  # Evolve log(n)?
 background = 1e-06   # Background density
 
 [all]
-bndry_par_all = parallel_neumann
+bndry_par_all = parallel_neumann_o2
 bndry_all = neumann
 
 [n]
@@ -29,15 +29,15 @@ bndry_all = neumann
 zl = z / (2*pi)
 function = fciwave:background + 1e-3*exp(-((x-0.7)/0.1)^2 - ((zl-0.3)/0.1)^2)
 
-bndry_par_yup = parallel_neumann
-bndry_par_ydown = parallel_neumann
+bndry_par_yup = parallel_neumann_o2
+bndry_par_ydown = parallel_neumann_o2
 
 [logn]
 
 function = log(n:function)
 
-bndry_par_yup = parallel_neumann
-bndry_par_ydown = parallel_neumann
+bndry_par_yup = parallel_neumann_o2
+bndry_par_ydown = parallel_neumann_o2
 
 [v]
 
diff --git a/examples/fci-wave/div/BOUT.inp b/examples/fci-wave/div/BOUT.inp
index 70b60757eb..3f497df6c7 100644
--- a/examples/fci-wave/div/BOUT.inp
+++ b/examples/fci-wave/div/BOUT.inp
@@ -21,7 +21,7 @@ log_density = false  # Evolve log(n)?
 background = 1e-06   # Background density
 
 [all]
-bndry_par_all = parallel_neumann
+bndry_par_all = parallel_neumann_o2
 bndry_all = neumann
 
 [n]
@@ -29,15 +29,15 @@ bndry_all = neumann
 zl = z / (2*pi)
 function = fciwave:background + 1e-3*exp(-((x-0.7)/0.1)^2 - ((zl-0.3)/0.1)^2)
 
-bndry_par_yup = parallel_neumann
-bndry_par_ydown = parallel_neumann
+bndry_par_yup = parallel_neumann_o2
+bndry_par_ydown = parallel_neumann_o2
 
 [logn]
 
 function = log(n:function)
 
-bndry_par_yup = parallel_neumann
-bndry_par_ydown = parallel_neumann
+bndry_par_yup = parallel_neumann_o2
+bndry_par_ydown = parallel_neumann_o2
 
 [v]
 
diff --git a/examples/fci-wave/fci-wave.cxx b/examples/fci-wave/fci-wave.cxx
index 226b52c808..2fd383ed3f 100644
--- a/examples/fci-wave/fci-wave.cxx
+++ b/examples/fci-wave/fci-wave.cxx
@@ -69,7 +69,7 @@ class FCIwave : public PhysicsModel {
 
     // Neumann boundaries simplifies parallel derivatives
     Bxyz.applyBoundary("neumann");
-    Bxyz.applyParallelBoundary("parallel_neumann");
+    Bxyz.applyParallelBoundary("parallel_neumann_o2");
     SAVE_ONCE(Bxyz);
 
     SOLVE_FOR(nv);
diff --git a/examples/fci-wave/logn/BOUT.inp b/examples/fci-wave/logn/BOUT.inp
index f97d8cc891..26f8a99d63 100644
--- a/examples/fci-wave/logn/BOUT.inp
+++ b/examples/fci-wave/logn/BOUT.inp
@@ -21,7 +21,7 @@ log_density = true  # Evolve log(n)?
 background = 1e-06   # Background density
 
 [all]
-bndry_par_all = parallel_neumann
+bndry_par_all = parallel_neumann_o2
 bndry_all = neumann
 
 [n]
@@ -29,15 +29,15 @@ bndry_all = neumann
 zl = z / (2*pi)
 function = fciwave:background + 1e-3*exp(-((x-0.7)/0.1)^2 - ((zl-0.3)/0.1)^2)
 
-bndry_par_yup = parallel_neumann
-bndry_par_ydown = parallel_neumann
+bndry_par_yup = parallel_neumann_o2
+bndry_par_ydown = parallel_neumann_o2
 
 [logn]
 
 function = log(n:function)
 
-bndry_par_yup = parallel_neumann
-bndry_par_ydown = parallel_neumann
+bndry_par_yup = parallel_neumann_o2
+bndry_par_ydown = parallel_neumann_o2
 
 [nv]
 
diff --git a/examples/laplace-petsc3d/data/BOUT.inp b/examples/laplace-petsc3d/data/BOUT.inp
index 86a52c69f2..7e81d992a2 100644
--- a/examples/laplace-petsc3d/data/BOUT.inp
+++ b/examples/laplace-petsc3d/data/BOUT.inp
@@ -6,7 +6,7 @@ mz = 128
 function = mixmode(x, 1.)*mixmode(y, 2.)*mixmode(z, 3.)
 bndry_xin = none
 bndry_xout = none
-bndry_par_all = parallel_neumann
+bndry_par_all = parallel_neumann_o2
 
 [rhs]
 function = mixmode(x, 4.)*mixmode(y, 5.)*mixmode(z, 6.)
@@ -22,7 +22,7 @@ function = 1. + .1*mixmode(x, 10.)*mixmode(y, 11.)*mixmode(z, 12.)
 [C2]
 #function = 0.
 function = .1*mixmode(x, 13.)*mixmode(y, 14.)*mixmode(z, 15.)
-bndry_par_all = parallel_neumann
+bndry_par_all = parallel_neumann_o2
 
 [A]
 function = 0.0
@@ -46,7 +46,7 @@ transform_from_field_aligned = false
 [initial]
 bndry_xin = neumann
 bndry_xout = neumann
-bndry_par_all = parallel_neumann
+bndry_par_all = parallel_neumann_o2
 
 [input1]
 function = mixmode(x, 1.)*mixmode(z, 2.)
diff --git a/examples/performance/iterator-offsets/iterator-offsets.cxx b/examples/performance/iterator-offsets/iterator-offsets.cxx
index 08149f855e..2376b63578 100644
--- a/examples/performance/iterator-offsets/iterator-offsets.cxx
+++ b/examples/performance/iterator-offsets/iterator-offsets.cxx
@@ -73,7 +73,7 @@ int main(int argc, char** argv) {
 #if BOUT_USE_OPENMP
   ITERATOR_TEST_BLOCK(
     "Nested loop (omp)",
-    BOUT_OMP(parallel for)
+    BOUT_OMP_PERF(parallel for)
     for(int i=0;i<mesh->LocalNx;++i) {
     for (int j = mesh->ystart; j < mesh->yend; ++j) {
       for (int k = 0; k < mesh->LocalNz; ++k) {
@@ -98,7 +98,7 @@ int main(int argc, char** argv) {
                       deriv(a, result, "RGN_NOY"););
 
   ITERATOR_TEST_BLOCK(
-      "Region with stencil", BOUT_OMP(parallel) {
+      "Region with stencil", BOUT_OMP_PERF(parallel) {
         stencil s;
         BOUT_FOR_INNER(i, mesh->getRegion3D("RGN_NOY")) {
           s.m = a[i.ym()];
@@ -110,7 +110,7 @@ int main(int argc, char** argv) {
       });
 
   ITERATOR_TEST_BLOCK(
-      "Region with stencil and function pointer", BOUT_OMP(parallel) {
+      "Region with stencil and function pointer", BOUT_OMP_PERF(parallel) {
         stencil s;
         BOUT_FOR_INNER(i, mesh->getRegion3D("RGN_NOY")) {
           s.m = a[i.ym()];
diff --git a/examples/performance/iterator/iterator.cxx b/examples/performance/iterator/iterator.cxx
index 7f9eb7ce1f..af1163d927 100644
--- a/examples/performance/iterator/iterator.cxx
+++ b/examples/performance/iterator/iterator.cxx
@@ -66,7 +66,7 @@ int main(int argc, char** argv) {
       "C loop", for (int j = 0; j < len; ++j) { rd[j] = ad[j] + bd[j]; };);
 #if BOUT_USE_OPENMP
   ITERATOR_TEST_BLOCK("C loop (omp)",
-		      BOUT_OMP(parallel for)
+		      BOUT_OMP_PERF(parallel for)
 		      for(int j=0;j<len;++j) {
     rd[j] = ad[j] + bd[j];
 		      };
@@ -85,7 +85,7 @@ int main(int argc, char** argv) {
 
 #if BOUT_USE_OPENMP
   ITERATOR_TEST_BLOCK("Nested loop (omp)",
-		      BOUT_OMP(parallel for)
+		      BOUT_OMP_PERF(parallel for)
 		      for(int i=0;i<mesh->LocalNx;++i) {
     for (int j = 0; j < mesh->LocalNy; ++j) {
       for (int k = 0; k < mesh->LocalNz; ++k) {
diff --git a/include/bout/adios_object.hxx b/include/bout/adios_object.hxx
index 9d2f545b46..4750930373 100755
--- a/include/bout/adios_object.hxx
+++ b/include/bout/adios_object.hxx
@@ -14,7 +14,7 @@
 
 #include "bout/build_config.hxx"
 
-#if BOUT_HAS_ADIOS
+#if BOUT_HAS_ADIOS2
 
 #include <adios2.h>
 #include <memory>
@@ -79,5 +79,5 @@ void ADIOSSetParameters(const std::string& input, const char delimKeyValue,
 
 } // namespace bout
 
-#endif //BOUT_HAS_ADIOS
+#endif //BOUT_HAS_ADIOS2
 #endif //ADIOS_OBJECT_HXX
diff --git a/include/bout/array.hxx b/include/bout/array.hxx
index 060b4900a1..0caaed9c86 100644
--- a/include/bout/array.hxx
+++ b/include/bout/array.hxx
@@ -23,15 +23,15 @@
  *     o Added Umpire support, in multiple iterations/variations
  */
 
-#ifndef __ARRAY_H__
-#define __ARRAY_H__
+#ifndef BOUT_ARRAY_H
+#define BOUT_ARRAY_H
 
 #include <algorithm>
 #include <map>
 #include <memory>
 #include <vector>
 
-#ifdef _OPENMP
+#if BOUT_USE_OPENMP
 #include <omp.h>
 #endif
 
@@ -375,22 +375,14 @@ private:
    * @param[in] cleanup   If set to true, deletes all dataBlock and clears the store
    */
   static storeType& store(bool cleanup = false) {
-#ifdef _OPENMP
     static arenaType arena(omp_get_max_threads());
-#else
-    static arenaType arena(1);
-#endif
     if (!cleanup) {
-#ifdef _OPENMP
       return arena[omp_get_thread_num()];
-#else
-      return arena[0];
-#endif
     }
 
     // Clean by deleting all data -- possible that just stores.clear() is
     // sufficient rather than looping over each entry.
-    BOUT_OMP(single)
+    BOUT_OMP_SAFE(single)
     {
       for (auto& stores : arena) {
         for (auto& p : stores) {
@@ -486,4 +478,4 @@ bool operator==(const Array<T, B1>& lhs, const Array<T, B2>& rhs) {
   return std::equal(lhs.begin(), lhs.end(), rhs.begin());
 }
 
-#endif // __ARRAY_H__
+#endif // BOUT_ARRAY_H
diff --git a/include/bout/assert.hxx b/include/bout/assert.hxx
index 233641966b..653c44ed42 100644
--- a/include/bout/assert.hxx
+++ b/include/bout/assert.hxx
@@ -14,8 +14,8 @@
  * 
  */
 
-#ifndef __BOUT_ASSERT_H__
-#define __BOUT_ASSERT_H__
+#ifndef BOUT_ASSERT_H
+#define BOUT_ASSERT_H
 
 #include "bout/boutexception.hxx"
 
@@ -65,4 +65,4 @@
 #define ASSERT3(condition)
 #endif
 
-#endif // __BOUT_ASSERT_H__
+#endif // BOUT_ASSERT_H
diff --git a/include/bout/boundary_factory.hxx b/include/bout/boundary_factory.hxx
index 208b7cdb61..5f1f6e06a6 100644
--- a/include/bout/boundary_factory.hxx
+++ b/include/bout/boundary_factory.hxx
@@ -1,13 +1,16 @@
 
 class BoundaryFactory;
 
-#ifndef __BNDRY_FACTORY_H__
-#define __BNDRY_FACTORY_H__
+#ifndef BOUT_BNDRY_FACTORY_H
+#define BOUT_BNDRY_FACTORY_H
 
-#include "bout/boundary_op.hxx"
-#include "bout/boundary_region.hxx"
-#include "bout/parallel_boundary_op.hxx"
-#include "bout/parallel_boundary_region.hxx"
+class BoundaryOpBase;
+class BoundaryOpPar;
+class BoundaryOp;
+class BoundaryRegionBase;
+class BoundaryRegionPar;
+class BoundaryRegion;
+class BoundaryModifier;
 
 #include <map>
 #include <string>
@@ -126,4 +129,4 @@ private:
   // BoundaryModifier* findBoundaryMod(const string &s);
 };
 
-#endif // __BNDRY_FACTORY_H__
+#endif // BOUT_BNDRY_FACTORY_H
diff --git a/include/bout/boundary_region.hxx b/include/bout/boundary_region.hxx
index 542460580c..58de12045e 100644
--- a/include/bout/boundary_region.hxx
+++ b/include/bout/boundary_region.hxx
@@ -1,8 +1,8 @@
 
 class BoundaryRegion;
 
-#ifndef __BNDRY_REGION_H__
-#define __BNDRY_REGION_H__
+#ifndef BOUT_BNDRY_REGION_H
+#define BOUT_BNDRY_REGION_H
 
 #include <string>
 #include <utility>
@@ -142,4 +142,4 @@ private:
   int xs, xe;
 };
 
-#endif // __BNDRY_REGION_H__
+#endif // BOUT_BNDRY_REGION_H
diff --git a/include/bout/boundary_standard.hxx b/include/bout/boundary_standard.hxx
index 96d43de24d..b1116e159f 100644
--- a/include/bout/boundary_standard.hxx
+++ b/include/bout/boundary_standard.hxx
@@ -1,7 +1,7 @@
 /// Some standard boundary conditions
 
-#ifndef __BNDRY_STD_H__
-#define __BNDRY_STD_H__
+#ifndef BOUT_BNDRY_STD_H
+#define BOUT_BNDRY_STD_H
 
 #include "bout/boundary_op.hxx"
 #include "bout/bout_types.hxx"
@@ -516,4 +516,4 @@ public:
 private:
 };
 
-#endif // __BNDRY_STD_H__
+#endif // BOUT_BNDRY_STD_H
diff --git a/include/bout/bout.hxx b/include/bout/bout.hxx
index d929a19c2f..09433bcc3b 100644
--- a/include/bout/bout.hxx
+++ b/include/bout/bout.hxx
@@ -34,6 +34,7 @@
 #ifndef BOUT_H
 #define BOUT_H
 
+// IWYU pragma: begin_keep, begin_export
 #include "bout/build_config.hxx"
 
 #include "bout/boutcomm.hxx"
@@ -53,6 +54,7 @@
 #include "bout/vector3d.hxx"
 #include "bout/version.hxx"
 #include "bout/where.hxx"
+// IWYU pragma: end_keep, end_export
 
 // BOUT++ main functions
 
diff --git a/include/bout/bout_enum_class.hxx b/include/bout/bout_enum_class.hxx
index ef251b4c2f..f8c9e364c5 100644
--- a/include/bout/bout_enum_class.hxx
+++ b/include/bout/bout_enum_class.hxx
@@ -19,8 +19,8 @@
  * along with BOUT++.  If not, see <http://www.gnu.org/licenses/>.
  **************************************************************************/
 
-#ifndef __BOUT_ENUM_CLASS_H__
-#define __BOUT_ENUM_CLASS_H__
+#ifndef BOUT_ENUM_CLASS_H
+#define BOUT_ENUM_CLASS_H
 
 #include "bout/boutexception.hxx"
 #include "bout/macro_for_each.hxx"
@@ -100,4 +100,4 @@
     return out << toString(e);                                                 \
   }
 
-#endif // __BOUT_ENUM_CLASS_H__
+#endif // BOUT_ENUM_CLASS_H
diff --git a/include/bout/bout_types.hxx b/include/bout/bout_types.hxx
index 5a00b5144b..c1f06fca7c 100644
--- a/include/bout/bout_types.hxx
+++ b/include/bout/bout_types.hxx
@@ -19,8 +19,8 @@
  * along with BOUT++.  If not, see <http://www.gnu.org/licenses/>.
  **************************************************************************/
 
-#ifndef __BOUT_TYPES_H__
-#define __BOUT_TYPES_H__
+#ifndef BOUT_TYPES_H
+#define BOUT_TYPES_H
 
 #include <limits>
 #include <string>
@@ -140,4 +140,4 @@ struct enumWrapper {
 /// Boundary condition function
 using FuncPtr = BoutReal (*)(BoutReal t, BoutReal x, BoutReal y, BoutReal z);
 
-#endif // __BOUT_TYPES_H__
+#endif // BOUT_TYPES_H
diff --git a/include/bout/boutcomm.hxx b/include/bout/boutcomm.hxx
index fea401af02..9342d29741 100644
--- a/include/bout/boutcomm.hxx
+++ b/include/bout/boutcomm.hxx
@@ -27,8 +27,8 @@
 
 class BoutComm;
 
-#ifndef __BOUTCOMM_H__
-#define __BOUTCOMM_H__
+#ifndef BOUT_BOUTCOMM_H
+#define BOUT_BOUTCOMM_H
 
 #include <mpi.h>
 
@@ -68,4 +68,4 @@ private:
   static BoutComm* instance; ///< The only instance of this class (Singleton)
 };
 
-#endif // __BOUTCOMM_H__
+#endif // BOUT_BOUTCOMM_H
diff --git a/include/bout/build_config.hxx b/include/bout/build_config.hxx
index c97962f7cf..08158d00e9 100644
--- a/include/bout/build_config.hxx
+++ b/include/bout/build_config.hxx
@@ -17,7 +17,7 @@ constexpr auto has_gettext = static_cast<bool>(BOUT_HAS_GETTEXT);
 constexpr auto has_lapack = static_cast<bool>(BOUT_HAS_LAPACK);
 constexpr auto has_legacy_netcdf = static_cast<bool>(BOUT_HAS_LEGACY_NETCDF);
 constexpr auto has_netcdf = static_cast<bool>(BOUT_HAS_NETCDF);
-constexpr auto has_adios = static_cast<bool>(BOUT_HAS_ADIOS);
+constexpr auto has_adios2 = static_cast<bool>(BOUT_HAS_ADIOS2);
 constexpr auto has_petsc = static_cast<bool>(BOUT_HAS_PETSC);
 constexpr auto has_hypre = static_cast<bool>(BOUT_HAS_HYPRE);
 constexpr auto has_umpire = static_cast<bool>(BOUT_HAS_UMPIRE);
diff --git a/include/bout/constants.hxx b/include/bout/constants.hxx
index c811799aef..273ab2270e 100644
--- a/include/bout/constants.hxx
+++ b/include/bout/constants.hxx
@@ -3,8 +3,8 @@
  * 
  **************************************************************************/
 
-#ifndef __CONSTANTS_H__
-#define __CONSTANTS_H__
+#ifndef BOUT_CONSTANTS_H
+#define BOUT_CONSTANTS_H
 
 #include <bout/bout_types.hxx>
 
@@ -28,4 +28,4 @@ constexpr BoutReal M_Deuterium = 2.01410178 * amu; ///< Mass of a Deuterium atom
 constexpr BoutReal M_Tritium = 3.0160492 * amu;    ///< Mass of a Tritium atom
 } // namespace SI
 
-#endif // __CONSTANTS_H__
+#endif // BOUT_CONSTANTS_H
diff --git a/include/bout/coordinates.hxx b/include/bout/coordinates.hxx
index 42efcad84c..49feffa0a7 100644
--- a/include/bout/coordinates.hxx
+++ b/include/bout/coordinates.hxx
@@ -30,8 +30,8 @@
  *
  **************************************************************************/
 
-#ifndef __COORDINATES_H__
-#define __COORDINATES_H__
+#ifndef BOUT_COORDINATES_H
+#define BOUT_COORDINATES_H
 
 #include "bout/field2d.hxx"
 #include "bout/field3d.hxx"
@@ -262,4 +262,4 @@ private:
 };
 */
 
-#endif // __COORDINATES_H__
+#endif // BOUT_COORDINATES_H
diff --git a/include/bout/cyclic_reduction.hxx b/include/bout/cyclic_reduction.hxx
index d4ef958e93..d4c0920910 100644
--- a/include/bout/cyclic_reduction.hxx
+++ b/include/bout/cyclic_reduction.hxx
@@ -38,8 +38,8 @@
  *
  ************************************************************************/
 
-#ifndef __CYCLIC_REDUCE_H__
-#define __CYCLIC_REDUCE_H__
+#ifndef BOUT_CYCLIC_REDUCE_H
+#define BOUT_CYCLIC_REDUCE_H
 
 #ifdef DIAGNOSE
 #undef DIAGNOSE
@@ -101,7 +101,7 @@ public:
     Matrix<T> bMatrix(1, N);
     Matrix<T> cMatrix(1, N);
 
-    BOUT_OMP(parallel for)
+    BOUT_OMP_PERF(parallel for)
     for (int i = 0; i < N; ++i) {
       aMatrix(0, i) = a[i];
       bMatrix(0, i) = b[i];
@@ -126,7 +126,7 @@ public:
     allocMemory(nprocs, nsys, N);
 
     // Fill coefficient array
-    BOUT_OMP(parallel for)
+    BOUT_OMP_PERF(parallel for)
     for (int j = 0; j < Nsys; j++) {
       for (int i = 0; i < N; i++) {
         coefs(j, 4 * i) = a(j, i);
@@ -149,7 +149,7 @@ public:
     Matrix<T> xMatrix(1, N);
 
     // Copy input data into matrix
-    BOUT_OMP(parallel for)
+    BOUT_OMP_PERF(parallel for)
     for (int i = 0; i < N; ++i) {
       rhsMatrix(0, i) = rhs[i];
     }
@@ -158,7 +158,7 @@ public:
     solve(rhsMatrix, xMatrix);
 
     // Copy result back into argument
-    BOUT_OMP(parallel for)
+    BOUT_OMP_PERF(parallel for)
     for (int i = 0; i < N; ++i) {
       x[i] = xMatrix(0, i);
     }
@@ -184,7 +184,7 @@ public:
 
     // Insert RHS into coefs array. Ordered to allow efficient partitioning
     // for MPI send/receives
-    BOUT_OMP(parallel for)
+    BOUT_OMP_PERF(parallel for)
     for (int j = 0; j < Nsys; j++) {
       for (int i = 0; i < N; i++) {
         coefs(j, 4 * i + 3) = rhs(j, i);
@@ -230,7 +230,7 @@ public:
 
         if (p == myproc) {
           // Just copy the data
-          BOUT_OMP(parallel for)
+          BOUT_OMP_PERF(parallel for)
           for (int i = 0; i < myns; i++) {
             for (int j = 0; j < 8; j++) {
               ifcs(i, 8 * p + j) = myif(sys0 + i, j);
@@ -285,7 +285,7 @@ public:
 #ifdef DIAGNOSE
           output << "Copying received data from " << p << endl;
 #endif
-          BOUT_OMP(parallel for)
+          BOUT_OMP_PERF(parallel for)
           for (int i = 0; i < myns; i++) {
             for (int j = 0; j < 8; j++) {
 #ifdef DIAGNOSE
@@ -317,7 +317,7 @@ public:
       x1.ensureUnique();
       xn.ensureUnique();
 
-      BOUT_OMP(parallel for)
+      BOUT_OMP_PERF(parallel for)
       for (int i = 0; i < myns; ++i) {
         //  (a  b) (x1) = (b1)
         //  (c  d) (xn)   (bn)
@@ -364,7 +364,7 @@ public:
 
         if (p == myproc) {
           // Just copy the data
-          BOUT_OMP(parallel for)
+          BOUT_OMP_PERF(parallel for)
           for (int i = 0; i < myns; i++) {
             x1[sys0 + i] = ifx(i, 2 * p);
             xn[sys0 + i] = ifx(i, 2 * p + 1);
@@ -389,7 +389,7 @@ public:
         // Send data
         for (int p = 0; p < nprocs; p++) { // Loop over processor
           if (p != myproc) {
-            BOUT_OMP(parallel for)
+            BOUT_OMP_PERF(parallel for)
             for (int i = 0; i < myns; i++) {
               ifp[2 * i] = ifx(i, 2 * p);
               ifp[2 * i + 1] = ifx(i, 2 * p + 1);
@@ -427,7 +427,7 @@ public:
             nsp++;
           }
 
-          BOUT_OMP(parallel for)
+          BOUT_OMP_PERF(parallel for)
           for (int i = 0; i < nsp; i++) {
             x1[s0 + i] = recvbuffer(fromproc, 2 * i);
             xn[s0 + i] = recvbuffer(fromproc, 2 * i + 1);
@@ -540,7 +540,7 @@ private:
     }
 #endif
 
-    BOUT_OMP(parallel for)
+    BOUT_OMP_PERF(parallel for)
     for (int j = 0; j < ns; j++) {
       // Calculate upper interface equation
 
@@ -619,7 +619,7 @@ private:
     // Tridiagonal system, solve using serial Thomas algorithm
     // xa -- Result for each system
     // co -- Coefficients & rhs for each system
-    BOUT_OMP(parallel for)
+    BOUT_OMP_PERF(parallel for)
     for (int i = 0; i < ns; i++) { // Loop over systems
       Array<T> gam(nloc);          // Thread-local array
       T bet = 1.0;
@@ -640,4 +640,4 @@ private:
   }
 };
 
-#endif // __CYCLIC_REDUCE_H__
+#endif // BOUT_CYCLIC_REDUCE_H
diff --git a/include/bout/dcomplex.hxx b/include/bout/dcomplex.hxx
index 569b5f2c13..75bc9d26ff 100644
--- a/include/bout/dcomplex.hxx
+++ b/include/bout/dcomplex.hxx
@@ -29,8 +29,8 @@
  * along with BOUT++.  If not, see <http://www.gnu.org/licenses/>.
  * 
  */
-#ifndef __DCOMPLEX_H__
-#define __DCOMPLEX_H__
+#ifndef BOUT_DCOMPLEX_H
+#define BOUT_DCOMPLEX_H
 
 #include "bout/bout_types.hxx"
 #include <complex>
@@ -44,4 +44,4 @@ struct fcmplx {
   BoutReal r, i;
 };
 
-#endif // __DCOMPLEX_H__
+#endif // BOUT_DCOMPLEX_H
diff --git a/include/bout/derivs.hxx b/include/bout/derivs.hxx
index c01e1562fc..1c360bb9cd 100644
--- a/include/bout/derivs.hxx
+++ b/include/bout/derivs.hxx
@@ -26,8 +26,8 @@
  *
  **************************************************************************/
 
-#ifndef __DERIVS_H__
-#define __DERIVS_H__
+#ifndef BOUT_DERIVS_H
+#define BOUT_DERIVS_H
 
 #include "bout/field2d.hxx"
 #include "bout/field3d.hxx"
@@ -701,4 +701,4 @@ Coordinates::FieldMetric D2DYDZ(const Field2D& f, CELL_LOC outloc = CELL_DEFAULT
                                 const std::string& method = "DEFAULT",
                                 const std::string& region = "RGN_NOBNDRY");
 
-#endif // __DERIVS_H__
+#endif // BOUT_DERIVS_H
diff --git a/include/bout/difops.hxx b/include/bout/difops.hxx
index 2b5c6746fd..71053d454a 100644
--- a/include/bout/difops.hxx
+++ b/include/bout/difops.hxx
@@ -33,8 +33,8 @@
  * 
  *******************************************************************************/
 
-#ifndef __DIFOPS_H__
-#define __DIFOPS_H__
+#ifndef BOUT_DIFOPS_H
+#define BOUT_DIFOPS_H
 
 #include "bout/field2d.hxx"
 #include "bout/field3d.hxx"
@@ -310,4 +310,4 @@ Field3D bracket(const Field3D& f, const Field2D& g, BRACKET_METHOD method = BRAC
 Field3D bracket(const Field3D& f, const Field3D& g, BRACKET_METHOD method = BRACKET_STD,
                 CELL_LOC outloc = CELL_DEFAULT, Solver* solver = nullptr);
 
-#endif /* __DIFOPS_H__ */
+#endif /* BOUT_DIFOPS_H */
diff --git a/include/bout/expr.hxx b/include/bout/expr.hxx
index e03c07aa49..267af202ed 100644
--- a/include/bout/expr.hxx
+++ b/include/bout/expr.hxx
@@ -9,8 +9,8 @@
  *
  **************************************************************************/
 
-#ifndef __EXPR_H__
-#define __EXPR_H__
+#ifndef BOUT_EXPR_H
+#define BOUT_EXPR_H
 
 #warning expr.hxx is deprecated. Do not use!
 
@@ -205,4 +205,4 @@ const Field3D eval3D(Expr e) {
   return result;
 }
 
-#endif // __EXPR_H__
+#endif // BOUT_EXPR_H
diff --git a/include/bout/fft.hxx b/include/bout/fft.hxx
index 8e74321f2a..fdec8b7bec 100644
--- a/include/bout/fft.hxx
+++ b/include/bout/fft.hxx
@@ -25,8 +25,8 @@
  * 
  *******************************************************************************/
 
-#ifndef __FFT_H__
-#define __FFT_H__
+#ifndef BOUT_FFT_H
+#define BOUT_FFT_H
 
 #include "bout/dcomplex.hxx"
 #include <bout/array.hxx>
@@ -132,4 +132,4 @@ inline void DST_rev(dcomplex* in, int length, BoutReal* out) {
   return bout::fft::DST_rev(in, length, out);
 }
 
-#endif // __FFT_H__
+#endif // BOUT_FFT_H
diff --git a/include/bout/field2d.hxx b/include/bout/field2d.hxx
index 5bac67beb2..10b801ef8d 100644
--- a/include/bout/field2d.hxx
+++ b/include/bout/field2d.hxx
@@ -27,8 +27,8 @@
 class Field2D;
 
 #pragma once
-#ifndef __FIELD2D_H__
-#define __FIELD2D_H__
+#ifndef BOUT_FIELD2D_H
+#define BOUT_FIELD2D_H
 
 class Mesh;
 #include "bout/field.hxx"
@@ -374,4 +374,4 @@ bool operator==(const Field2D& a, const Field2D& b);
 
 std::ostream& operator<<(std::ostream& out, const Field2D& value);
 
-#endif /* __FIELD2D_H__ */
+#endif /* BOUT_FIELD2D_H */
diff --git a/include/bout/field3d.hxx b/include/bout/field3d.hxx
index 9f5326253d..ba8c8e879e 100644
--- a/include/bout/field3d.hxx
+++ b/include/bout/field3d.hxx
@@ -23,8 +23,8 @@
 class Field3D;
 
 #pragma once
-#ifndef __FIELD3D_H__
-#define __FIELD3D_H__
+#ifndef BOUT_FIELD3D_H
+#define BOUT_FIELD3D_H
 
 class Mesh; // #include "bout/mesh.hxx"
 #include "bout/bout_types.hxx"
@@ -656,4 +656,4 @@ bool operator==(const Field3D& a, const Field3D& b);
 /// Output a string describing a Field3D to a stream
 std::ostream& operator<<(std::ostream& out, const Field3D& value);
 
-#endif /* __FIELD3D_H__ */
+#endif /* BOUT_FIELD3D_H */
diff --git a/include/bout/field_data.hxx b/include/bout/field_data.hxx
index 03b9d6759b..185dcabf2d 100644
--- a/include/bout/field_data.hxx
+++ b/include/bout/field_data.hxx
@@ -44,7 +44,8 @@ class Coordinates;
 class Mesh;
 
 #include "bout/boundary_region.hxx"
-#include "bout/parallel_boundary_region.hxx"
+class BoundaryRegionPar;
+enum class BndryLoc;
 
 #include "bout/sys/expressionparser.hxx"
 
diff --git a/include/bout/field_factory.hxx b/include/bout/field_factory.hxx
index ee228d836c..2a20226b2e 100644
--- a/include/bout/field_factory.hxx
+++ b/include/bout/field_factory.hxx
@@ -26,8 +26,8 @@
 
 class FieldFactory;
 
-#ifndef __FIELD_FACTORY_H__
-#define __FIELD_FACTORY_H__
+#ifndef BOUT_FIELD_FACTORY_H
+#define BOUT_FIELD_FACTORY_H
 
 #include "bout/mesh.hxx"
 
@@ -165,4 +165,4 @@ public:
   }
 };
 
-#endif // __FIELD_FACTORY_H__
+#endif // BOUT_FIELD_FACTORY_H
diff --git a/include/bout/fieldgroup.hxx b/include/bout/fieldgroup.hxx
index c33bd63e16..184766c6b8 100644
--- a/include/bout/fieldgroup.hxx
+++ b/include/bout/fieldgroup.hxx
@@ -1,5 +1,5 @@
-#ifndef __FIELDGROUP_H__
-#define __FIELDGROUP_H__
+#ifndef BOUT_FIELDGROUP_H
+#define BOUT_FIELDGROUP_H
 
 #include "bout/field_data.hxx"
 #include <bout/field3d.hxx>
@@ -190,4 +190,4 @@ private:
 /// Combine two FieldGroups
 FieldGroup operator+(const FieldGroup& lhs, const FieldGroup& rhs);
 
-#endif // __FIELDGROUP_H__
+#endif // BOUT_FIELDGROUP_H
diff --git a/include/bout/fieldperp.hxx b/include/bout/fieldperp.hxx
index 3b8ed45db6..6995308dbe 100644
--- a/include/bout/fieldperp.hxx
+++ b/include/bout/fieldperp.hxx
@@ -25,8 +25,8 @@
 
 class FieldPerp;
 
-#ifndef __FIELDPERP_H__
-#define __FIELDPERP_H__
+#ifndef BOUT_FIELDPERP_H
+#define BOUT_FIELDPERP_H
 
 #include "bout/field.hxx"
 
diff --git a/include/bout/fv_ops.hxx b/include/bout/fv_ops.hxx
index 5f1e688bd8..94007a57a2 100644
--- a/include/bout/fv_ops.hxx
+++ b/include/bout/fv_ops.hxx
@@ -2,8 +2,8 @@
   Finite-volume discretisation methods. Flux-conservative form
  */
 
-#ifndef __FV_OPS_H__
-#define __FV_OPS_H__
+#ifndef BOUT_FV_OPS_H
+#define BOUT_FV_OPS_H
 
 #include "bout/field3d.hxx"
 #include "bout/globals.hxx"
@@ -525,4 +525,4 @@ const Field3D Div_f_v(const Field3D& n_in, const Vector3D& v, bool bndry_flux) {
    */
 Field3D Div_Perp_Lap(const Field3D& a, const Field3D& f, CELL_LOC outloc = CELL_DEFAULT);
 } // namespace FV
-#endif // __FV_OPS_H__
+#endif // BOUT_FV_OPS_H
diff --git a/include/bout/globalfield.hxx b/include/bout/globalfield.hxx
index 85252f4962..038a0875bf 100644
--- a/include/bout/globalfield.hxx
+++ b/include/bout/globalfield.hxx
@@ -6,8 +6,8 @@
 class GlobalField;
 class GlobalField2D;
 
-#ifndef __GLOBALFIELD_H__
-#define __GLOBALFIELD_H__
+#ifndef BOUT_GLOBALFIELD_H
+#define BOUT_GLOBALFIELD_H
 
 #include "mesh.hxx"
 
@@ -257,4 +257,4 @@ private:
   bool data_valid;
 };
 
-#endif // __GLOBALFIELD_H__
+#endif // BOUT_GLOBALFIELD_H
diff --git a/include/bout/globals.hxx b/include/bout/globals.hxx
index ae7edff298..64b3a09ee3 100644
--- a/include/bout/globals.hxx
+++ b/include/bout/globals.hxx
@@ -24,8 +24,8 @@
  *
  **************************************************************************/
 
-#ifndef __GLOBALS_H__
-#define __GLOBALS_H__
+#ifndef BOUT_GLOBALS_H
+#define BOUT_GLOBALS_H
 
 #include "bout/macro_for_each.hxx"
 
@@ -97,4 +97,4 @@ SETTING(MpiWrapper* mpi, nullptr); ///< The MPI wrapper object
 } // namespace globals
 } // namespace bout
 
-#endif // __GLOBALS_H__
+#endif // BOUT_GLOBALS_H
diff --git a/include/bout/griddata.hxx b/include/bout/griddata.hxx
index 875cb07d7a..29a32e5779 100644
--- a/include/bout/griddata.hxx
+++ b/include/bout/griddata.hxx
@@ -25,8 +25,8 @@
 
 class GridDataSource;
 
-#ifndef __GRIDDATA_H__
-#define __GRIDDATA_H__
+#ifndef BOUT_GRIDDATA_H
+#define BOUT_GRIDDATA_H
 
 #include "mesh.hxx"
 #include "bout/bout_types.hxx"
@@ -299,4 +299,4 @@ private:
   Options* options;
 };
 
-#endif // __GRIDDATA_H__
+#endif // BOUT_GRIDDATA_H
diff --git a/include/bout/gyro_average.hxx b/include/bout/gyro_average.hxx
index 0f9f2a13f7..63ef13279b 100644
--- a/include/bout/gyro_average.hxx
+++ b/include/bout/gyro_average.hxx
@@ -29,8 +29,8 @@
  *
  **************************************************************/
 
-#ifndef __GYRO_AVERAGE_H__
-#define __GYRO_AVERAGE_H__
+#ifndef BOUT_GYRO_AVERAGE_H
+#define BOUT_GYRO_AVERAGE_H
 
 #include "bout/field3d.hxx"
 #include "bout/invert_laplace.hxx"
@@ -115,4 +115,4 @@ Field3D gyroPade2(const Field3D& f, const Field2D& rho,
 Field3D gyroPade2(const Field3D& f, BoutReal rho, int inner_boundary_flags = GYRO_FLAGS,
                   int outer_boundary_flags = GYRO_FLAGS);
 
-#endif // __GYRO_AVERAGE_H__
+#endif // BOUT_GYRO_AVERAGE_H
diff --git a/include/bout/hypre_interface.hxx b/include/bout/hypre_interface.hxx
index c26548e95e..cd3af7d39c 100644
--- a/include/bout/hypre_interface.hxx
+++ b/include/bout/hypre_interface.hxx
@@ -480,7 +480,7 @@ public:
           weights.begin(), weights.end(), std::back_inserter(values),
           [&value_](BoutReal weight) -> HYPRE_Complex { return weight * value_; });
       const HYPRE_BigInt ncolumns = static_cast<HYPRE_BigInt>(positions.size());
-      // BOUT_OMP(critical)
+      // BOUT_OMP_SAFE(critical)
       for (HYPRE_BigInt i = 0; i < ncolumns; ++i) {
         matrix->setVal(row, positions[i], values[i]);
       }
@@ -495,7 +495,7 @@ public:
           weights.begin(), weights.end(), std::back_inserter(values),
           [&value_](BoutReal weight) -> HYPRE_Complex { return weight * value_; });
       const HYPRE_BigInt ncolumns = static_cast<HYPRE_BigInt>(positions.size());
-      // BOUT_OMP(critical)
+      // BOUT_OMP_SAFE(critical)
       for (HYPRE_BigInt i = 0; i < ncolumns; ++i) {
         matrix->addVal(row, positions[i], values[i]);
       }
diff --git a/include/bout/initialprofiles.hxx b/include/bout/initialprofiles.hxx
index 71cab22431..a2fc050b15 100644
--- a/include/bout/initialprofiles.hxx
+++ b/include/bout/initialprofiles.hxx
@@ -23,8 +23,8 @@
  *
  **************************************************************************/
 
-#ifndef __INITIALPROF_H__
-#define __INITIALPROF_H__
+#ifndef BOUT_INITIALPROF_H
+#define BOUT_INITIALPROF_H
 
 #include <string>
 
@@ -113,4 +113,4 @@ void initial_profile(const std::string& name, Vector2D& var);
  */
 void initial_profile(const std::string& name, Vector3D& var);
 
-#endif // __INITIALPROF_H__
+#endif // BOUT_INITIALPROF_H
diff --git a/include/bout/interpolation.hxx b/include/bout/interpolation.hxx
index aab3f61281..1f4b0a51b5 100644
--- a/include/bout/interpolation.hxx
+++ b/include/bout/interpolation.hxx
@@ -23,8 +23,8 @@
  *
  **************************************************************************/
 
-#ifndef __INTERP_H__
-#define __INTERP_H__
+#ifndef BOUT_INTERP_H
+#define BOUT_INTERP_H
 
 #include "bout/mesh.hxx"
 
@@ -202,4 +202,4 @@ const T interp_to(const T& var, CELL_LOC loc, const std::string region = "RGN_AL
   return result;
 }
 
-#endif // __INTERP_H__
+#endif // BOUT_INTERP_H
diff --git a/include/bout/interpolation_xz.hxx b/include/bout/interpolation_xz.hxx
index 3f8e37d3fd..52dc38f174 100644
--- a/include/bout/interpolation_xz.hxx
+++ b/include/bout/interpolation_xz.hxx
@@ -21,8 +21,8 @@
  *
  **************************************************************************/
 
-#ifndef __INTERP_XZ_H__
-#define __INTERP_XZ_H__
+#ifndef BOUT_INTERP_XZ_H
+#define BOUT_INTERP_XZ_H
 
 #include "bout/mask.hxx"
 
@@ -286,4 +286,4 @@ public:
 template <class DerivedType>
 using RegisterXZInterpolation = XZInterpolationFactory::RegisterInFactory<DerivedType>;
 
-#endif // __INTERP_XZ_H__
+#endif // BOUT_INTERP_XZ_H
diff --git a/include/bout/interpolation_z.hxx b/include/bout/interpolation_z.hxx
index b11d7ff5b6..68cf5b0b06 100644
--- a/include/bout/interpolation_z.hxx
+++ b/include/bout/interpolation_z.hxx
@@ -20,8 +20,8 @@
  *
  **************************************************************************/
 
-#ifndef __INTERP_Z_H__
-#define __INTERP_Z_H__
+#ifndef BOUT_INTERP_Z_H
+#define BOUT_INTERP_Z_H
 
 #include "bout/generic_factory.hxx"
 #include "bout/paralleltransform.hxx"
@@ -125,4 +125,4 @@ private:
   Field3D h11;
 };
 
-#endif // __INTERP_Z_H__
+#endif // BOUT_INTERP_Z_H
diff --git a/include/bout/invert/laplacexy.hxx b/include/bout/invert/laplacexy.hxx
index c07db58478..19da48dd4d 100644
--- a/include/bout/invert/laplacexy.hxx
+++ b/include/bout/invert/laplacexy.hxx
@@ -30,8 +30,8 @@
  *
  **************************************************************************/
 
-#ifndef __LAPLACE_XY_H__
-#define __LAPLACE_XY_H__
+#ifndef BOUT_LAPLACE_XY_H
+#define BOUT_LAPLACE_XY_H
 
 #include "bout/build_config.hxx"
 
@@ -222,4 +222,4 @@ private:
 };
 
 #endif // BOUT_HAS_PETSC
-#endif // __LAPLACE_XY_H__
+#endif // BOUT_LAPLACE_XY_H
diff --git a/include/bout/invert/laplacexy2.hxx b/include/bout/invert/laplacexy2.hxx
index 6945de7b99..51f75f467d 100644
--- a/include/bout/invert/laplacexy2.hxx
+++ b/include/bout/invert/laplacexy2.hxx
@@ -30,8 +30,8 @@
  *
  **************************************************************************/
 
-#ifndef __LAPLACE_XY2_H__
-#define __LAPLACE_XY2_H__
+#ifndef BOUT_LAPLACE_XY2_H
+#define BOUT_LAPLACE_XY2_H
 
 #include "bout/build_defines.hxx"
 
@@ -141,4 +141,4 @@ private:
 };
 
 #endif // BOUT_HAS_PETSC
-#endif // __LAPLACE_XY_H__
+#endif // BOUT_LAPLACE_XY2_H
diff --git a/include/bout/invert/laplacexz.hxx b/include/bout/invert/laplacexz.hxx
index 1b1ebef832..11f1c69330 100644
--- a/include/bout/invert/laplacexz.hxx
+++ b/include/bout/invert/laplacexz.hxx
@@ -28,8 +28,8 @@
  *
  **************************************************************************/
 
-#ifndef __LAPLACEXZ_H__
-#define __LAPLACEXZ_H__
+#ifndef BOUT_LAPLACEXZ_H
+#define BOUT_LAPLACEXZ_H
 
 #include <bout/field3d.hxx>
 #include <bout/generic_factory.hxx>
@@ -91,4 +91,4 @@ protected:
 private:
 };
 
-#endif // __LAPLACEXZ_H__
+#endif // BOUT_LAPLACEXZ_H
diff --git a/include/bout/invert_laplace.hxx b/include/bout/invert_laplace.hxx
index 78417b9fce..0b416d4aab 100644
--- a/include/bout/invert_laplace.hxx
+++ b/include/bout/invert_laplace.hxx
@@ -31,8 +31,8 @@
 
 class Laplacian;
 
-#ifndef __LAPLACE_H__
-#define __LAPLACE_H__
+#ifndef BOUT_LAPLACE_H
+#define BOUT_LAPLACE_H
 
 #include "bout/build_config.hxx"
 
@@ -238,6 +238,10 @@ public:
   virtual void setInnerBoundaryFlags(int f) { inner_boundary_flags = f; }
   virtual void setOuterBoundaryFlags(int f) { outer_boundary_flags = f; }
 
+  virtual int getGlobalFlags() const { return global_flags; }
+  virtual int getInnerBoundaryFlags() const { return inner_boundary_flags; }
+  virtual int getOuterBoundaryFlags() const { return outer_boundary_flags; }
+
   /// Does this solver use Field3D coefficients (true) or only their DC component (false)
   virtual bool uses3DCoefs() const { return false; }
 
@@ -308,9 +312,23 @@ protected:
   int extra_yguards_lower; ///< exclude some number of points at the lower boundary, useful for staggered grids or when boundary conditions make inversion redundant
   int extra_yguards_upper; ///< exclude some number of points at the upper boundary, useful for staggered grids or when boundary conditions make inversion redundant
 
-  int global_flags;         ///< Default flags
-  int inner_boundary_flags; ///< Flags to set inner boundary condition
-  int outer_boundary_flags; ///< Flags to set outer boundary condition
+  /// Return true if global/default \p flag is set
+  bool isGlobalFlagSet(int flag) const { return (global_flags & flag) != 0; }
+  /// Return true if \p flag is set for the inner boundary condition
+  bool isInnerBoundaryFlagSet(int flag) const {
+    return (inner_boundary_flags & flag) != 0;
+  }
+  /// Return true if \p flag is set for the outer boundary condition
+  bool isOuterBoundaryFlagSet(int flag) const {
+    return (outer_boundary_flags & flag) != 0;
+  }
+
+  /// Return true if \p flag is set for the inner boundary condition
+  /// and this is the first proc in X direction
+  bool isInnerBoundaryFlagSetOnFirstX(int flag) const;
+  /// Return true if \p flag is set for the outer boundary condition
+  /// and this is the last proc in X direction
+  bool isOuterBoundaryFlagSetOnLastX(int flag) const;
 
   void tridagCoefs(int jx, int jy, BoutReal kwave, dcomplex& a, dcomplex& b, dcomplex& c,
                    const Field2D* ccoef = nullptr, const Field2D* d = nullptr,
@@ -322,15 +340,13 @@ protected:
                    CELL_LOC loc = CELL_DEFAULT);
 
   void tridagMatrix(dcomplex* avec, dcomplex* bvec, dcomplex* cvec, dcomplex* bk, int jy,
-                    int kz, BoutReal kwave, int flags, int inner_boundary_flags,
-                    int outer_boundary_flags, const Field2D* a, const Field2D* ccoef,
+                    int kz, BoutReal kwave, const Field2D* a, const Field2D* ccoef,
                     const Field2D* d, bool includeguards = true, bool zperiodic = true) {
-    tridagMatrix(avec, bvec, cvec, bk, jy, kz, kwave, flags, inner_boundary_flags,
-                 outer_boundary_flags, a, ccoef, ccoef, d, includeguards, zperiodic);
+    tridagMatrix(avec, bvec, cvec, bk, jy, kz, kwave, a, ccoef, ccoef, d, includeguards,
+                 zperiodic);
   }
   void tridagMatrix(dcomplex* avec, dcomplex* bvec, dcomplex* cvec, dcomplex* bk, int jy,
-                    int kz, BoutReal kwave, int flags, int inner_boundary_flags,
-                    int outer_boundary_flags, const Field2D* a, const Field2D* c1coef,
+                    int kz, BoutReal kwave, const Field2D* a, const Field2D* c1coef,
                     const Field2D* c2coef, const Field2D* d, bool includeguards = true,
                     bool zperiodic = true);
   CELL_LOC location;   ///< staggered grid location of this solver
@@ -339,6 +355,10 @@ protected:
                        ///  localmesh->getCoordinates(location) once
 
 private:
+  int global_flags;         ///< Default flags
+  int inner_boundary_flags; ///< Flags to set inner boundary condition
+  int outer_boundary_flags; ///< Flags to set outer boundary condition
+
   /// Singleton instance
   static std::unique_ptr<Laplacian> instance;
   /// Name for writing performance infomation; default taken from
@@ -374,4 +394,4 @@ void laplace_tridag_coefs(int jx, int jy, int jz, dcomplex& a, dcomplex& b, dcom
                           const Field2D* ccoef = nullptr, const Field2D* d = nullptr,
                           CELL_LOC loc = CELL_DEFAULT);
 
-#endif // __LAPLACE_H__
+#endif // BOUT_LAPLACE_H
diff --git a/include/bout/invert_parderiv.hxx b/include/bout/invert_parderiv.hxx
index 5a83a7f4e8..e9623e0f9f 100644
--- a/include/bout/invert_parderiv.hxx
+++ b/include/bout/invert_parderiv.hxx
@@ -28,8 +28,8 @@
  *
  ************************************************************************/
 
-#ifndef __INV_PAR_H__
-#define __INV_PAR_H__
+#ifndef BOUT_INV_PAR_H
+#define BOUT_INV_PAR_H
 
 #include "bout/field2d.hxx"
 #include "bout/field3d.hxx"
@@ -189,4 +189,4 @@ protected:
 private:
 };
 
-#endif // __INV_PAR_H__
+#endif // BOUT_INV_PAR_H
diff --git a/include/bout/invert_pardiv.hxx b/include/bout/invert_pardiv.hxx
index 23ea59e943..0153cc1987 100644
--- a/include/bout/invert_pardiv.hxx
+++ b/include/bout/invert_pardiv.hxx
@@ -31,11 +31,11 @@
 #ifndef INV_PARDIV_H
 #define INV_PARDIV_H
 
-#include "field2d.hxx"
-#include "field3d.hxx"
-#include "options.hxx"
-#include "unused.hxx"
+#include "bout/field2d.hxx"
+#include "bout/field3d.hxx"
 #include "bout/generic_factory.hxx"
+#include "bout/options.hxx"
+#include "bout/unused.hxx"
 
 // Pardivergence implementations
 constexpr auto PARDIVCYCLIC = "cyclic";
diff --git a/include/bout/invertable_operator.hxx b/include/bout/invertable_operator.hxx
index 1940177bca..a45fc3565f 100644
--- a/include/bout/invertable_operator.hxx
+++ b/include/bout/invertable_operator.hxx
@@ -30,8 +30,8 @@ class InvertableOperator;
 };
 }; // namespace bout
 
-#ifndef __INVERTABLE_OPERATOR_H__
-#define __INVERTABLE_OPERATOR_H__
+#ifndef BOUT_INVERTABLE_OPERATOR_H
+#define BOUT_INVERTABLE_OPERATOR_H
 
 #include "bout/build_config.hxx"
 
diff --git a/include/bout/lapack_routines.hxx b/include/bout/lapack_routines.hxx
index 70a3128f81..d81c0b422d 100644
--- a/include/bout/lapack_routines.hxx
+++ b/include/bout/lapack_routines.hxx
@@ -20,8 +20,8 @@
  *
  **************************************************************************/
 
-#ifndef __LAPACK_ROUTINES_H__
-#define __LAPACK_ROUTINES_H__
+#ifndef BOUT_LAPACK_ROUTINES_H
+#define BOUT_LAPACK_ROUTINES_H
 
 #include <bout/utils.hxx>
 
@@ -56,4 +56,4 @@ void cyclic_tridag(dcomplex* a, dcomplex* b, dcomplex* c, dcomplex* r, dcomplex*
 /// Complex band matrix solver
 void cband_solve(Matrix<dcomplex>& a, int n, int m1, int m2, Array<dcomplex>& b);
 
-#endif // __LAPACK_ROUTINES_H__
+#endif // BOUT_LAPACK_ROUTINES_H
diff --git a/include/bout/macro_for_each.hxx b/include/bout/macro_for_each.hxx
index 10cbd21818..1cfe373c3f 100644
--- a/include/bout/macro_for_each.hxx
+++ b/include/bout/macro_for_each.hxx
@@ -1,6 +1,6 @@
 
-#ifndef __MACRO_FOR_EACH_H__
-#define __MACRO_FOR_EACH_H__
+#ifndef BOUT_MACRO_FOR_EACH_H
+#define BOUT_MACRO_FOR_EACH_H
 
 // Provides a macro MACRO_FOR_EACH which applies a
 // macro to each argument in a VA_ARGS list
diff --git a/include/bout/mask.hxx b/include/bout/mask.hxx
index 89197ddcf2..4250d21105 100644
--- a/include/bout/mask.hxx
+++ b/include/bout/mask.hxx
@@ -19,8 +19,8 @@
  * along with BOUT++.  If not, see <http://www.gnu.org/licenses/>.
  **************************************************************************/
 
-#ifndef __MASK_H__
-#define __MASK_H__
+#ifndef BOUT_MASK_H
+#define BOUT_MASK_H
 
 #include <vector>
 
@@ -79,4 +79,4 @@ inline std::unique_ptr<Region<Ind3D>> regionFromMask(const BoutMask& mask,
   }
   return std::make_unique<Region<Ind3D>>(indices);
 }
-#endif //__MASK_H__
+#endif //BOUT_MASK_H
diff --git a/include/bout/mesh.hxx b/include/bout/mesh.hxx
index 8f73552ea5..c80716fc12 100644
--- a/include/bout/mesh.hxx
+++ b/include/bout/mesh.hxx
@@ -40,8 +40,8 @@
 
 class Mesh;
 
-#ifndef __MESH_H__
-#define __MESH_H__
+#ifndef BOUT_MESH_H
+#define BOUT_MESH_H
 
 #include "mpi.h"
 
@@ -55,22 +55,24 @@ class Mesh;
 #include "bout/field_data.hxx"
 #include "bout/options.hxx"
 
-#include "fieldgroup.hxx"
+#include "bout/fieldgroup.hxx"
 
-#include "bout/boundary_region.hxx"
-#include "bout/parallel_boundary_region.hxx"
+class BoundaryRegion;
+class BoundaryRegionPar;
 
-#include "sys/range.hxx" // RangeIterator
+#include "bout/sys/range.hxx" // RangeIterator
 
 #include <bout/griddata.hxx>
 
-#include "coordinates.hxx" // Coordinates class
+#include "bout/coordinates.hxx" // Coordinates class
 
 #include "bout/unused.hxx"
 
 #include "bout/generic_factory.hxx"
 #include <bout/region.hxx>
 
+#include <bout/bout_enum_class.hxx>
+
 #include <list>
 #include <map>
 #include <memory>
@@ -90,6 +92,9 @@ public:
   ReturnType create(Options* options = nullptr, GridDataSource* source = nullptr) const;
 };
 
+BOUT_ENUM_CLASS(BoundaryParType, all, xin, xout, fwd, bwd, xin_fwd, xout_fwd, xin_bwd,
+                xout_bwd, SIZE);
+
 template <class DerivedType>
 using RegisterMesh = MeshFactory::RegisterInFactory<DerivedType>;
 
@@ -485,11 +490,20 @@ public:
   /// Add a boundary region to this processor
   virtual void addBoundary(BoundaryRegion* UNUSED(bndry)) {}
 
-  /// Get all the parallel (Y) boundaries on this processor
-  virtual std::vector<BoundaryRegionPar*> getBoundariesPar() = 0;
+  /// Get the list of parallel boundary regions. The option specifies which
+  /// region to get. Default is to get all regions. All possible options are
+  /// listed at the top of this file, see BoundaryParType.
+  /// For example:
+  /// get all regions:
+  /// mesh->getBoundariesPar(BoundaryParType::all)
+  /// get only xout:
+  /// mesh->getBoundariesPar(BoundaryParType::xout)
+  virtual std::vector<std::shared_ptr<BoundaryRegionPar>>
+  getBoundariesPar(BoundaryParType type = BoundaryParType::all) = 0;
 
   /// Add a parallel(Y) boundary to this processor
-  virtual void addBoundaryPar(BoundaryRegionPar* UNUSED(bndry)) {}
+  virtual void addBoundaryPar(std::shared_ptr<BoundaryRegionPar> UNUSED(bndry),
+                              BoundaryParType UNUSED(type)) {}
 
   /// Branch-cut special handling (experimental)
   virtual Field3D smoothSeparatrix(const Field3D& f) { return f; }
@@ -853,4 +867,4 @@ Mesh::getRegion<FieldPerp>(const std::string& region_name) const {
   return getRegionPerp(region_name);
 }
 
-#endif // __MESH_H__
+#endif // BOUT_MESH_H
diff --git a/include/bout/monitor.hxx b/include/bout/monitor.hxx
index 5bc4fc7e12..359096e74f 100644
--- a/include/bout/monitor.hxx
+++ b/include/bout/monitor.hxx
@@ -1,5 +1,5 @@
-#ifndef __MONITOR_H__
-#define __MONITOR_H__
+#ifndef BOUT_MONITOR_H
+#define BOUT_MONITOR_H
 
 #include "bout/assert.hxx"
 #include "bout/bout_types.hxx"
@@ -125,4 +125,4 @@ public:
   void writeProgress(BoutReal simtime, bool output_split);
 };
 
-#endif // __MONITOR_H__
+#endif // BOUT_MONITOR_H
diff --git a/include/bout/mpi_wrapper.hxx b/include/bout/mpi_wrapper.hxx
index 65b14cf84f..826405d8da 100644
--- a/include/bout/mpi_wrapper.hxx
+++ b/include/bout/mpi_wrapper.hxx
@@ -27,8 +27,8 @@
 
 class MpiWrapper;
 
-#ifndef __MPIWRAPPER_H__
-#define __MPIWRAPPER_H__
+#ifndef BOUT_MPIWRAPPER_H
+#define BOUT_MPIWRAPPER_H
 
 #include <mpi.h>
 
@@ -153,4 +153,4 @@ public:
   virtual double MPI_Wtime() { return ::MPI_Wtime(); }
 };
 
-#endif // __MPIWRAPPER_H__
+#endif // BOUT_MPIWRAPPER_H
diff --git a/include/bout/msg_stack.hxx b/include/bout/msg_stack.hxx
index e8158c3200..adbf1bbbcb 100644
--- a/include/bout/msg_stack.hxx
+++ b/include/bout/msg_stack.hxx
@@ -26,8 +26,8 @@
 
 class MsgStack;
 
-#ifndef __MSG_STACK_H__
-#define __MSG_STACK_H__
+#ifndef BOUT_MSG_STACK_H
+#define BOUT_MSG_STACK_H
 
 #include "bout/build_config.hxx"
 
@@ -212,4 +212,4 @@ private:
  */
 #define AUTO_TRACE() TRACE(__thefunc__) // NOLINT
 
-#endif // __MSG_STACK_H__
+#endif // BOUT_MSG_STACK_H
diff --git a/include/bout/multiostream.hxx b/include/bout/multiostream.hxx
index b90ccf9419..ca3cc2d0c7 100644
--- a/include/bout/multiostream.hxx
+++ b/include/bout/multiostream.hxx
@@ -1,5 +1,5 @@
-#ifndef __MULTIOSTREAM_H__
-#define __MULTIOSTREAM_H__
+#ifndef BOUT_MULTIOSTREAM_H
+#define BOUT_MULTIOSTREAM_H
 
 #include <algorithm>
 #include <streambuf>
@@ -89,4 +89,4 @@ public:
 using cmultiostream = multiostream<char>;
 using wmultiostream = multiostream<wchar_t>;
 
-#endif // __MULTIOSTREAM_H__
+#endif // BOUT_MULTIOSTREAM_H
diff --git a/include/bout/openmpwrap.hxx b/include/bout/openmpwrap.hxx
index 032705e61a..582df7b86c 100644
--- a/include/bout/openmpwrap.hxx
+++ b/include/bout/openmpwrap.hxx
@@ -24,9 +24,16 @@
  *
  **************************************************************************/
 
-#ifndef __OPENMPWRAP_H__
-#define __OPENMPWRAP_H__
+#ifndef BOUT_OPENMPWRAP_H
+#define BOUT_OPENMPWRAP_H
 
+#include "bout/build_defines.hxx"
+
+#if BOUT_USE_OPENMP || defined(_OPENMP)
+#include "omp.h"
+#endif
+
+#ifdef _OPENMP
 //Some helpers for indirection -- required so that the _Pragma gets "omp <x>"
 //where <x> is any number of valid omp options/environments (e.g. atomic, critical etc.)
 #define INDIRECT0(a) #a
@@ -35,12 +42,30 @@
 
 //Define a macro wrapper to the use of `#pragma omp` to avoid unknown pragma
 //warnings when compiling without openmp support.
-#if BOUT_USE_OPENMP
+#define BOUT_OMP_SAFE(...) _Pragma(INDIRECT2(__VA_ARGS__))
 #define BOUT_OMP(...) _Pragma(INDIRECT2(__VA_ARGS__))
 #else
+#define BOUT_OMP_SAFE(...)
 #define BOUT_OMP(...)
 #endif
 
+#if BOUT_USE_OPENMP
+
+#ifndef INDIRECT2
+#error expected macro INDIRECT2 to be available
+#endif
+
+#define BOUT_OMP_PERF(...) _Pragma(INDIRECT2(__VA_ARGS__))
+#else
+#define BOUT_OMP_PERF(...)
+#endif
+
+#ifndef _OPENMP
+inline int constexpr omp_get_max_threads() { return 1; }
+inline int constexpr omp_get_num_threads() { return 1; }
+inline int constexpr omp_get_thread_num() { return 0; }
+#endif
+
 //Perhaps want to cleanup local helpers with below, but DON'T!
 //This would cause uses of BOUT_OMP to break
 // #undef INDIRECT0
diff --git a/include/bout/operatorstencil.hxx b/include/bout/operatorstencil.hxx
index 9a60f94ca7..118dc7a068 100644
--- a/include/bout/operatorstencil.hxx
+++ b/include/bout/operatorstencil.hxx
@@ -27,8 +27,8 @@
  *
  **************************************************************************/
 
-#ifndef __OPERATORSTENCIL_H__
-#define __OPERATORSTENCIL_H__
+#ifndef BOUT_OPERATORSTENCIL_H
+#define BOUT_OPERATORSTENCIL_H
 
 #include <algorithm>
 #include <functional>
@@ -322,4 +322,4 @@ OperatorStencil<T> starStencil(Mesh* localmesh) {
   return stencil;
 }
 
-#endif // __OPERATORSTENCIL_H__
+#endif // BOUT_OPERATORSTENCIL_H
diff --git a/include/bout/options.hxx b/include/bout/options.hxx
index aa12442451..d6bcfd5f68 100644
--- a/include/bout/options.hxx
+++ b/include/bout/options.hxx
@@ -241,7 +241,8 @@ public:
   ///
   ///     Option option2 = option1.copy();
   ///
-  Options(const Options& other) = delete; // Use a reference or .copy() method
+  [[deprecated("Please use a reference or .copy() instead")]] Options(
+      const Options& other);
 
   /// Copy assignment must be explicit
   ///
@@ -251,7 +252,8 @@ public:
   ///
   ///     option2.value = option1.value;
   ///
-  Options& operator=(const Options& other) = delete; // Use a reference or .copy() method
+  [[deprecated("Please use a reference or .copy() instead")]] Options&
+  operator=(const Options& other); // Use a reference or .copy() method
 
   /// Make a deep copy of this Options,
   /// recursively copying children.
@@ -364,7 +366,8 @@ public:
   ///         {"long_name", "some velocity"}
   ///       });
   Options& setAttributes(
-      std::initializer_list<std::pair<std::string, Options::AttributeType>> attrs) {
+      const std::initializer_list<std::pair<std::string, Options::AttributeType>>&
+          attrs) {
     for (const auto& attr : attrs) {
       attributes[attr.first] = attr.second;
     }
diff --git a/include/bout/options_io.hxx b/include/bout/options_io.hxx
index 4c70159514..57be8bbaae 100644
--- a/include/bout/options_io.hxx
+++ b/include/bout/options_io.hxx
@@ -111,7 +111,7 @@ public:
   static constexpr auto default_type =
 #if BOUT_HAS_NETCDF
       "netcdf";
-#elif BOUT_HAS_ADIOS
+#elif BOUT_HAS_ADIOS2
       "adios";
 #else
       "invalid";
diff --git a/include/bout/optionsreader.hxx b/include/bout/optionsreader.hxx
index 32c302a3f7..de3d40514d 100644
--- a/include/bout/optionsreader.hxx
+++ b/include/bout/optionsreader.hxx
@@ -31,8 +31,8 @@
 
 class OptionsReader;
 
-#ifndef __OPTIONSREADER_H__
-#define __OPTIONSREADER_H__
+#ifndef BOUT_OPTIONSREADER_H
+#define BOUT_OPTIONSREADER_H
 
 #include "bout/options.hxx"
 
@@ -108,4 +108,4 @@ private:
   static OptionsReader* instance;
 };
 
-#endif // __OPTIONSREADER_H__
+#endif // BOUT_OPTIONSREADER_H
diff --git a/include/bout/output.hxx b/include/bout/output.hxx
index a44e987197..2862899067 100644
--- a/include/bout/output.hxx
+++ b/include/bout/output.hxx
@@ -26,8 +26,8 @@
 class Output;
 
 #pragma once
-#ifndef __OUTPUT_H__
-#define __OUTPUT_H__
+#ifndef BOUT_OUTPUT_H
+#define BOUT_OUTPUT_H
 
 #include "bout/multiostream.hxx"
 #include <fstream>
@@ -304,4 +304,4 @@ extern ConditionalOutput output_verbose;  ///< less interesting messages
 /// Generic output, given the same level as output_progress
 extern ConditionalOutput output;
 
-#endif // __OUTPUT_H__
+#endif // BOUT_OUTPUT_H
diff --git a/include/bout/parallel_boundary_op.hxx b/include/bout/parallel_boundary_op.hxx
index d17aa8e48a..d8620e892b 100644
--- a/include/bout/parallel_boundary_op.hxx
+++ b/include/bout/parallel_boundary_op.hxx
@@ -1,5 +1,5 @@
-#ifndef __PAR_BNDRY_OP_H__
-#define __PAR_BNDRY_OP_H__
+#ifndef BOUT_PAR_BNDRY_OP_H
+#define BOUT_PAR_BNDRY_OP_H
 
 #include "bout/boundary_op.hxx"
 #include "bout/bout_types.hxx"
@@ -52,7 +52,7 @@ protected:
   BoutReal getValue(const BoundaryRegionPar& bndry, BoutReal t);
 };
 
-template <class T>
+template <class T, bool isNeumann = false>
 class BoundaryOpParTemp : public BoundaryOpPar {
 public:
   using BoundaryOpPar::BoundaryOpPar;
@@ -89,51 +89,74 @@ public:
     throw BoutException("Can't apply parallel boundary conditions to Field2D!");
   }
   void apply(Field3D& f) override { return apply(f, 0); }
+
+  void apply(Field3D& f, BoutReal t) override {
+    f.ynext(bndry->dir).allocate(); // Ensure unique before modifying
+
+    auto dy = f.getCoordinates()->dy;
+
+    for (bndry->first(); !bndry->isDone(); bndry->next()) {
+      BoutReal value = getValue(*bndry, t);
+      if (isNeumann) {
+        value *= dy[bndry->ind()];
+      }
+      static_cast<T*>(this)->apply_stencil(f, bndry, value);
+    }
+  }
 };
 
 //////////////////////////////////////////////////
 // Implementations
 
-class BoundaryOpPar_dirichlet : public BoundaryOpParTemp<BoundaryOpPar_dirichlet> {
+class BoundaryOpPar_dirichlet_o1 : public BoundaryOpParTemp<BoundaryOpPar_dirichlet_o1> {
 public:
   using BoundaryOpParTemp::BoundaryOpParTemp;
-
-  using BoundaryOpParTemp::apply;
-  void apply(Field3D& f, BoutReal t) override;
+  static void apply_stencil(Field3D& f, const BoundaryRegionPar* bndry, BoutReal value) {
+    bndry->dirichlet_o1(f, value);
+  }
 };
 
-class BoundaryOpPar_dirichlet_O3 : public BoundaryOpParTemp<BoundaryOpPar_dirichlet_O3> {
+class BoundaryOpPar_dirichlet_o2 : public BoundaryOpParTemp<BoundaryOpPar_dirichlet_o2> {
 public:
   using BoundaryOpParTemp::BoundaryOpParTemp;
-
-  using BoundaryOpParTemp::apply;
-  void apply(Field3D& f, BoutReal t) override;
+  static void apply_stencil(Field3D& f, const BoundaryRegionPar* bndry, BoutReal value) {
+    bndry->dirichlet_o2(f, value);
+  }
 };
 
-class BoundaryOpPar_dirichlet_interp
-    : public BoundaryOpParTemp<BoundaryOpPar_dirichlet_interp> {
+class BoundaryOpPar_dirichlet_o3 : public BoundaryOpParTemp<BoundaryOpPar_dirichlet_o3> {
 public:
   using BoundaryOpParTemp::BoundaryOpParTemp;
-
-  using BoundaryOpParTemp::apply;
-  void apply(Field3D& f, BoutReal t) override;
+  static void apply_stencil(Field3D& f, const BoundaryRegionPar* bndry, BoutReal value) {
+    bndry->dirichlet_o3(f, value);
+  }
 };
 
-class BoundaryOpPar_neumann : public BoundaryOpParTemp<BoundaryOpPar_neumann> {
+class BoundaryOpPar_neumann_o1
+    : public BoundaryOpParTemp<BoundaryOpPar_neumann_o1, true> {
 public:
   using BoundaryOpParTemp::BoundaryOpParTemp;
-
-  using BoundaryOpParTemp::apply;
-  void apply(Field3D& f, BoutReal t) override;
+  static void apply_stencil(Field3D& f, const BoundaryRegionPar* bndry, BoutReal value) {
+    bndry->neumann_o1(f, value);
+  }
 };
 
-class BoundaryOpPar_neumann_c2_simple
-    : public BoundaryOpParTemp<BoundaryOpPar_neumann_c2_simple> {
+class BoundaryOpPar_neumann_o2
+    : public BoundaryOpParTemp<BoundaryOpPar_neumann_o2, true> {
 public:
   using BoundaryOpParTemp::BoundaryOpParTemp;
+  static void apply_stencil(Field3D& f, const BoundaryRegionPar* bndry, BoutReal value) {
+    bndry->neumann_o2(f, value);
+  }
+};
 
-  using BoundaryOpParTemp::apply;
-  void apply(Field3D& f, BoutReal t) override;
+class BoundaryOpPar_neumann_o3
+    : public BoundaryOpParTemp<BoundaryOpPar_neumann_o3, true> {
+public:
+  using BoundaryOpParTemp::BoundaryOpParTemp;
+  static void apply_stencil(Field3D& f, const BoundaryRegionPar* bndry, BoutReal value) {
+    bndry->neumann_o3(f, value);
+  }
 };
 
-#endif // __PAR_BNDRY_OP_H__
+#endif // BOUT_PAR_BNDRY_OP_H
diff --git a/include/bout/parallel_boundary_region.hxx b/include/bout/parallel_boundary_region.hxx
index 3d5525a303..308b5ac5d7 100644
--- a/include/bout/parallel_boundary_region.hxx
+++ b/include/bout/parallel_boundary_region.hxx
@@ -1,22 +1,58 @@
-#ifndef __PAR_BNDRY_H__
-#define __PAR_BNDRY_H__
+#ifndef BOUT_PAR_BNDRY_H
+#define BOUT_PAR_BNDRY_H
 
 #include "bout/boundary_region.hxx"
 #include "bout/bout_types.hxx"
 #include <vector>
 
+#include <bout/field3d.hxx>
+#include <bout/mesh.hxx>
+
 /**
  * Boundary region for parallel direction. This contains a vector of points that are
  * inside the boundary.
  *
  */
-class BoundaryRegionPar : public BoundaryRegionBase {
 
-  struct IndexPoint {
-    int jx;
-    int jy;
-    int jz;
-  };
+namespace parallel_stencil {
+// generated by src/mesh/parallel_boundary_stencil.cxx.py
+inline BoutReal pow(BoutReal val, int exp) {
+  // constexpr int expval = exp;
+  // static_assert(expval == 2 or expval == 3, "This pow is only for exponent 2 or 3");
+  if (exp == 2) {
+    return val * val;
+  }
+  ASSERT3(exp == 3);
+  return val * val * val;
+}
+inline BoutReal dirichlet_o1(BoutReal UNUSED(spacing0), BoutReal value0) {
+  return value0;
+}
+inline BoutReal dirichlet_o2(BoutReal spacing0, BoutReal value0, BoutReal spacing1,
+                             BoutReal value1) {
+  return (spacing0 * value1 - spacing1 * value0) / (spacing0 - spacing1);
+}
+inline BoutReal neumann_o2(BoutReal UNUSED(spacing0), BoutReal value0, BoutReal spacing1,
+                           BoutReal value1) {
+  return -spacing1 * value0 + value1;
+}
+inline BoutReal dirichlet_o3(BoutReal spacing0, BoutReal value0, BoutReal spacing1,
+                             BoutReal value1, BoutReal spacing2, BoutReal value2) {
+  return (pow(spacing0, 2) * spacing1 * value2 - pow(spacing0, 2) * spacing2 * value1
+          - spacing0 * pow(spacing1, 2) * value2 + spacing0 * pow(spacing2, 2) * value1
+          + pow(spacing1, 2) * spacing2 * value0 - spacing1 * pow(spacing2, 2) * value0)
+         / ((spacing0 - spacing1) * (spacing0 - spacing2) * (spacing1 - spacing2));
+}
+inline BoutReal neumann_o3(BoutReal spacing0, BoutReal value0, BoutReal spacing1,
+                           BoutReal value1, BoutReal spacing2, BoutReal value2) {
+  return (2 * spacing0 * spacing1 * value2 - 2 * spacing0 * spacing2 * value1
+          + pow(spacing1, 2) * spacing2 * value0 - pow(spacing1, 2) * value2
+          - spacing1 * pow(spacing2, 2) * value0 + pow(spacing2, 2) * value1)
+         / ((spacing1 - spacing2) * (2 * spacing0 - spacing1 - spacing2));
+}
+} // namespace parallel_stencil
+
+class BoundaryRegionPar : public BoundaryRegionBase {
 
   struct RealPoint {
     BoutReal s_x;
@@ -26,13 +62,15 @@ class BoundaryRegionPar : public BoundaryRegionBase {
 
   struct Indices {
     // Indices of the boundary point
-    IndexPoint index;
+    Ind3D index;
     // Intersection with boundary in index space
     RealPoint intersection;
     // Distance to intersection
     BoutReal length;
     // Angle between field line and boundary
-    BoutReal angle;
+    // BoutReal angle;
+    // How many points we can go in the opposite direction
+    signed char valid;
   };
 
   using IndicesVec = std::vector<Indices>;
@@ -46,28 +84,122 @@ class BoundaryRegionPar : public BoundaryRegionBase {
 public:
   BoundaryRegionPar(const std::string& name, int dir, Mesh* passmesh)
       : BoundaryRegionBase(name, passmesh), dir(dir) {
+    ASSERT0(std::abs(dir) == 1);
     BoundaryRegionBase::isParallel = true;
   }
   BoundaryRegionPar(const std::string& name, BndryLoc loc, int dir, Mesh* passmesh)
       : BoundaryRegionBase(name, loc, passmesh), dir(dir) {
     BoundaryRegionBase::isParallel = true;
+    ASSERT0(std::abs(dir) == 1);
   }
 
   /// Add a point to the boundary
-  void add_point(int jx, int jy, int jz, BoutReal x, BoutReal y, BoutReal z,
-                 BoutReal length, BoutReal angle);
+  void add_point(Ind3D ind, BoutReal x, BoutReal y, BoutReal z, BoutReal length,
+                 signed char valid) {
+    bndry_points.push_back({ind, {x, y, z}, length, valid});
+  }
+  void add_point(int ix, int iy, int iz, BoutReal x, BoutReal y, BoutReal z,
+                 BoutReal length, signed char valid) {
+    bndry_points.push_back({xyz2ind(ix, iy, iz, localmesh), {x, y, z}, length, valid});
+  }
+
+  // final, so they can be inlined
+  void first() final { bndry_position = begin(bndry_points); }
+  void next() final { ++bndry_position; }
+  bool isDone() final { return (bndry_position == end(bndry_points)); }
 
-  void first() override;
-  void next() override;
-  bool isDone() override;
+  // getters for the current point; valid() matches the signed char storage type
+  Ind3D ind() const { return bndry_position->index; }
+  BoutReal s_x() const { return bndry_position->intersection.s_x; }
+  BoutReal s_y() const { return bndry_position->intersection.s_y; }
+  BoutReal s_z() const { return bndry_position->intersection.s_z; }
+  BoutReal length() const { return bndry_position->length; }
+  signed char valid() const { return bndry_position->valid; }
 
-  /// Index of the point in the boundary
-  int x, y, z;
-  BoutReal s_x, s_y, s_z;
-  BoutReal length;
-  BoutReal angle;
+  // setter for the current point (signed char, same type as Indices::valid)
+  void setValid(signed char val) { bndry_position->valid = val; }
+
+  bool contains(const BoundaryRegionPar& bndry) const {
+    return std::binary_search(
+        begin(bndry_points), end(bndry_points), *bndry.bndry_position,
+        [](const Indices& i1, const Indices& i2) { return i1.index < i2.index; });
+  }
+
+  // extrapolate a given point to the boundary
+  BoutReal extrapolate_o1(const Field3D& f) const { return f[ind()]; }
+  BoutReal extrapolate_o2(const Field3D& f) const {
+    ASSERT3(valid() >= 0);
+    if (valid() < 1) {
+      return extrapolate_o1(f);
+    }
+    return f[ind()] * (1 + length()) - f.ynext(-dir)[ind().yp(-dir)] * length();
+  }
+
+  // dirichlet boundary code
+  void dirichlet_o1(Field3D& f, BoutReal value) const {
+    f.ynext(dir)[ind().yp(dir)] = value;
+  }
+
+  void dirichlet_o2(Field3D& f, BoutReal value) const {
+    if (length() < small_value) {
+      return dirichlet_o1(f, value);
+    }
+    ynext(f) = parallel_stencil::dirichlet_o2(1, f[ind()], 1 - length(), value);
+    // ynext(f) = f[ind()] * (1 + 1/length()) + value / length();
+  }
+
+  void dirichlet_o3(Field3D& f, BoutReal value) const {
+    ASSERT3(valid() >= 0);
+    if (valid() < 1) {
+      return dirichlet_o2(f, value);
+    }
+    if (length() < small_value) {
+      ynext(f) = parallel_stencil::dirichlet_o2(2, yprev(f), 1 - length(), value);
+    } else {
+      ynext(f) =
+          parallel_stencil::dirichlet_o3(2, yprev(f), 1, f[ind()], 1 - length(), value);
+    }
+  }
+
+  // NB: value needs to be scaled by dy
+  // neumann_o1 would actually be o2 if we used an appropriate one-sided stencil.
+  // But in general we do not, and thus for normal C2 stencils, this is 1st order.
+  void neumann_o1(Field3D& f, BoutReal value) const { ynext(f) = f[ind()] + value; }
+
+  // NB: value needs to be scaled by dy
+  void neumann_o2(Field3D& f, BoutReal value) const {
+    ASSERT3(valid() >= 0);
+    if (valid() < 1) {
+      return neumann_o1(f, value);
+    }
+    ynext(f) = yprev(f) + 2 * value;
+  }
+
+  // NB: value needs to be scaled by dy
+  void neumann_o3(Field3D& f, BoutReal value) const {
+    ASSERT3(valid() >= 0);
+    if (valid() < 1) {
+      return neumann_o1(f, value);
+    }
+    ynext(f) =
+        parallel_stencil::neumann_o3(1 - length(), value, 1, f[ind()], 2, yprev(f));
+  }
 
   const int dir;
+
+private:
+  constexpr static BoutReal small_value = 1e-2;
+
+  // BoutReal get(const Field3D& f, int off)
+  const BoutReal& ynext(const Field3D& f) const { return f.ynext(dir)[ind().yp(dir)]; }
+  BoutReal& ynext(Field3D& f) const { return f.ynext(dir)[ind().yp(dir)]; }
+  const BoutReal& yprev(const Field3D& f) const { return f.ynext(-dir)[ind().yp(-dir)]; }
+  BoutReal& yprev(Field3D& f) const { return f.ynext(-dir)[ind().yp(-dir)]; }
+  static Ind3D xyz2ind(int x, int y, int z, Mesh* mesh) {
+    const int ny = mesh->LocalNy;
+    const int nz = mesh->LocalNz;
+    return Ind3D{(x * ny + y) * nz + z, ny, nz};
+  }
 };
 
-#endif //  __PAR_BNDRY_H__
+#endif //  BOUT_PAR_BNDRY_H
diff --git a/include/bout/paralleltransform.hxx b/include/bout/paralleltransform.hxx
index 4a7e4989c8..0aafa04303 100644
--- a/include/bout/paralleltransform.hxx
+++ b/include/bout/paralleltransform.hxx
@@ -3,8 +3,8 @@
  * values along Y
  */
 
-#ifndef __PARALLELTRANSFORM_H__
-#define __PARALLELTRANSFORM_H__
+#ifndef BOUT_PARALLELTRANSFORM_H
+#define BOUT_PARALLELTRANSFORM_H
 
 #include "bout/bout_types.hxx"
 #include "bout/field3d.hxx"
@@ -317,4 +317,4 @@ private:
                               const std::vector<ParallelSlicePhase>& phases) const;
 };
 
-#endif // __PARALLELTRANSFORM_H__
+#endif // BOUT_PARALLELTRANSFORM_H
diff --git a/include/bout/petsc_interface.hxx b/include/bout/petsc_interface.hxx
index 0afcc8a30a..407e5ac18e 100644
--- a/include/bout/petsc_interface.hxx
+++ b/include/bout/petsc_interface.hxx
@@ -175,7 +175,7 @@ public:
 #endif
     BoutReal value = BoutNaN;
     int status = 0;
-    BOUT_OMP(critical)
+    BOUT_OMP_SAFE(critical)
     status = VecGetValues(*get(), 1, &global, &value);
     if (status != 0) {
       throw BoutException("Error when getting element of a PETSc vector.");
@@ -355,7 +355,7 @@ public:
       PetscBool assembled = PETSC_FALSE;
       MatAssembled(*petscMatrix, &assembled);
       if (assembled == PETSC_TRUE) {
-        BOUT_OMP(critical)
+        BOUT_OMP_SAFE(critical)
         MatGetValues(*petscMatrix, 1, &petscRow, 1, &petscCol, &value);
       } else {
         value = 0.;
@@ -400,7 +400,7 @@ public:
                      [&val](BoutReal weight) -> PetscScalar { return weight * val; });
 
       int status = 0;
-      BOUT_OMP(critical)
+      BOUT_OMP_SAFE(critical)
       status = MatSetValues(*petscMatrix, 1, &petscRow, positions.size(),
                             positions.data(), values.data(), mode);
       if (status != 0) {
@@ -467,7 +467,7 @@ public:
 #endif
     BoutReal value = BoutNaN;
     int status = 0;
-    BOUT_OMP(critical)
+    BOUT_OMP_SAFE(critical)
     status = MatGetValues(*get(), 1, &global1, 1, &global2, &value);
     if (status != 0) {
       throw BoutException("Error when getting elements of a PETSc matrix.");
diff --git a/include/bout/petsclib.hxx b/include/bout/petsclib.hxx
index 35334ce773..2008671286 100644
--- a/include/bout/petsclib.hxx
+++ b/include/bout/petsclib.hxx
@@ -59,7 +59,7 @@ class Options;
 // means we _must_ `#include` this header _before_ any PETSc header!
 #define PETSC_HAVE_BROKEN_RECURSIVE_MACRO
 
-#include <petsc.h>
+#include <petsc.h> // IWYU pragma: export
 #include <petscversion.h>
 
 #include "bout/boutexception.hxx"
diff --git a/include/bout/physicsmodel.hxx b/include/bout/physicsmodel.hxx
index ada97fc6fc..9fa25d8b0f 100644
--- a/include/bout/physicsmodel.hxx
+++ b/include/bout/physicsmodel.hxx
@@ -34,8 +34,8 @@
 
 class PhysicsModel;
 
-#ifndef __PHYSICS_MODEL_H__
-#define __PHYSICS_MODEL_H__
+#ifndef BOUT_PHYSICS_MODEL_H
+#define BOUT_PHYSICS_MODEL_H
 
 #include "solver.hxx"
 #include "bout/bout.hxx"
@@ -566,4 +566,4 @@ private:
 #define SAVE_REPEAT(...) \
   { MACRO_FOR_EACH(SAVE_REPEAT1, __VA_ARGS__) }
 
-#endif // __PHYSICS_MODEL_H__
+#endif // BOUT_PHYSICS_MODEL_H
diff --git a/include/bout/region.hxx b/include/bout/region.hxx
index cbaf0d0c31..4649b680eb 100644
--- a/include/bout/region.hxx
+++ b/include/bout/region.hxx
@@ -39,20 +39,26 @@
 /// because an Ind2D essentially doesn't keep track of the
 /// z-dimension.
 
-#ifndef __REGION_H__
-#define __REGION_H__
+#ifndef BOUT_REGION_H
+#define BOUT_REGION_H
 
 #include <algorithm>
 #include <ostream>
+#include <string>
 #include <type_traits>
 #include <utility>
 #include <vector>
 
 #include "bout/assert.hxx"
 #include "bout/bout_types.hxx"
-#include "bout/openmpwrap.hxx"
+#include "bout/boutexception.hxx"
+#include "bout/build_defines.hxx"
+#include "bout/openmpwrap.hxx" // IWYU pragma: keep
+
 class BoutMask;
 
+// NOLINTBEGIN(cppcoreguidelines-macro-usage,bugprone-macro-parentheses)
+
 /// The MAXREGIONBLOCKSIZE value can be tuned to try to optimise
 /// performance on specific hardware. It determines what the largest
 /// contiguous block size can be. As we hope the compiler will vectorise
@@ -110,16 +116,16 @@ class BoutMask;
 ///     }
 //
 
-#define BOUT_FOR_SERIAL(index, region)                                            \
-  for (auto block = region.getBlocks().cbegin(), end = region.getBlocks().cend(); \
-       block < end; ++block)                                                      \
+#define BOUT_FOR_SERIAL(index, region)                                                \
+  for (auto block = (region).getBlocks().cbegin(), end = (region).getBlocks().cend(); \
+       block < end; ++block)                                                          \
     for (auto index = block->first; index < block->second; ++index)
 
 #if BOUT_USE_OPENMP
-#define BOUT_FOR_OMP(index, region, omp_pragmas)                                    \
-  BOUT_OMP(omp_pragmas)                                                             \
-  for (auto block = region.getBlocks().cbegin(); block < region.getBlocks().cend(); \
-       ++block)                                                                     \
+#define BOUT_FOR_OMP(index, region, omp_pragmas)                                        \
+  BOUT_OMP_PERF(omp_pragmas)                                                            \
+  for (auto block = (region).getBlocks().cbegin(); block < (region).getBlocks().cend(); \
+       ++block)                                                                         \
     for (auto index = block->first; index < block->second; ++index)
 #else
 // No OpenMP, so fall back to slightly more efficient serial form
@@ -127,10 +133,11 @@ class BoutMask;
 #endif
 
 #define BOUT_FOR(index, region) \
-  BOUT_FOR_OMP(index, region, parallel for schedule(BOUT_OPENMP_SCHEDULE))
+  BOUT_FOR_OMP(index, (region), parallel for schedule(BOUT_OPENMP_SCHEDULE))
 
 #define BOUT_FOR_INNER(index, region) \
-  BOUT_FOR_OMP(index, region, for schedule(BOUT_OPENMP_SCHEDULE) nowait)
+  BOUT_FOR_OMP(index, (region), for schedule(BOUT_OPENMP_SCHEDULE) nowait)
+// NOLINTEND(cppcoreguidelines-macro-usage,bugprone-macro-parentheses)
 
 enum class IND_TYPE { IND_3D = 0, IND_2D = 1, IND_PERP = 2 };
 
@@ -232,7 +239,7 @@ struct SpecificInd {
   /// and is determined by the `dir` template argument. The offset corresponds
   /// to the `dd` template argument.
   template <int dd, DIRECTION dir>
-  const inline SpecificInd plus() const {
+  inline SpecificInd plus() const {
     static_assert(dir == DIRECTION::X || dir == DIRECTION::Y || dir == DIRECTION::Z
                       || dir == DIRECTION::YAligned || dir == DIRECTION::YOrthogonal,
                   "Unhandled DIRECTION in SpecificInd::plus");
@@ -252,7 +259,7 @@ struct SpecificInd {
   /// and is determined by the `dir` template argument. The offset corresponds
   /// to the `dd` template argument.
   template <int dd, DIRECTION dir>
-  const inline SpecificInd minus() const {
+  inline SpecificInd minus() const {
     static_assert(dir == DIRECTION::X || dir == DIRECTION::Y || dir == DIRECTION::Z
                       || dir == DIRECTION::YAligned || dir == DIRECTION::YOrthogonal,
                   "Unhandled DIRECTION in SpecificInd::minus");
@@ -268,11 +275,11 @@ struct SpecificInd {
     }
   }
 
-  const inline SpecificInd xp(int dx = 1) const { return {ind + (dx * ny * nz), ny, nz}; }
+  inline SpecificInd xp(int dx = 1) const { return {ind + (dx * ny * nz), ny, nz}; }
   /// The index one point -1 in x
-  const inline SpecificInd xm(int dx = 1) const { return xp(-dx); }
+  inline SpecificInd xm(int dx = 1) const { return xp(-dx); }
   /// The index one point +1 in y
-  const inline SpecificInd yp(int dy = 1) const {
+  inline SpecificInd yp(int dy = 1) const {
 #if CHECK >= 4
     if (y() + dy < 0 or y() + dy >= ny) {
       throw BoutException("Offset in y ({:d}) would go out of bounds at {:d}", dy, ind);
@@ -282,12 +289,12 @@ struct SpecificInd {
     return {ind + (dy * nz), ny, nz};
   }
   /// The index one point -1 in y
-  const inline SpecificInd ym(int dy = 1) const { return yp(-dy); }
+  inline SpecificInd ym(int dy = 1) const { return yp(-dy); }
   /// The index one point +1 in z. Wraps around zend to zstart
   /// An alternative, non-branching calculation is :
   /// ind + dz - nz * ((ind + dz) / nz  - ind / nz)
   /// but this appears no faster (and perhaps slower).
-  const inline SpecificInd zp(int dz = 1) const {
+  inline SpecificInd zp(int dz = 1) const {
     ASSERT3(dz >= 0);
     dz = dz <= nz ? dz : dz % nz; //Fix in case dz > nz, if not force it to be in range
     return {(ind + dz) % nz < dz ? ind - nz + dz : ind + dz, ny, nz};
@@ -296,22 +303,22 @@ struct SpecificInd {
   /// An alternative, non-branching calculation is :
   /// ind - dz + nz * ( (nz + ind) / nz - (nz + ind - dz) / nz)
   /// but this appears no faster (and perhaps slower).
-  const inline SpecificInd zm(int dz = 1) const {
+  inline SpecificInd zm(int dz = 1) const {
     dz = dz <= nz ? dz : dz % nz; //Fix in case dz > nz, if not force it to be in range
     ASSERT3(dz >= 0);
     return {(ind) % nz < dz ? ind + nz - dz : ind - dz, ny, nz};
   }
 
   // and for 2 cells
-  const inline SpecificInd xpp() const { return xp(2); }
-  const inline SpecificInd xmm() const { return xm(2); }
-  const inline SpecificInd ypp() const { return yp(2); }
-  const inline SpecificInd ymm() const { return ym(2); }
-  const inline SpecificInd zpp() const { return zp(2); }
-  const inline SpecificInd zmm() const { return zm(2); }
+  inline SpecificInd xpp() const { return xp(2); }
+  inline SpecificInd xmm() const { return xm(2); }
+  inline SpecificInd ypp() const { return yp(2); }
+  inline SpecificInd ymm() const { return ym(2); }
+  inline SpecificInd zpp() const { return zp(2); }
+  inline SpecificInd zmm() const { return zm(2); }
 
   /// Generic offset of \p index in multiple directions simultaneously
-  const inline SpecificInd offset(int dx, int dy, int dz) const {
+  inline SpecificInd offset(int dx, int dy, int dz) const {
     auto temp = (dz > 0) ? zp(dz) : zm(-dz);
     return temp.yp(dy).xp(dx);
   }
@@ -380,16 +387,16 @@ using Ind2D = SpecificInd<IND_TYPE::IND_2D>;
 using IndPerp = SpecificInd<IND_TYPE::IND_PERP>;
 
 /// Get string representation of Ind3D
-inline const std::string toString(const Ind3D& i) {
+inline std::string toString(const Ind3D& i) {
   return "(" + std::to_string(i.x()) + ", " + std::to_string(i.y()) + ", "
          + std::to_string(i.z()) + ")";
 }
 /// Get string representation of Ind2D
-inline const std::string toString(const Ind2D& i) {
+inline std::string toString(const Ind2D& i) {
   return "(" + std::to_string(i.x()) + ", " + std::to_string(i.y()) + ")";
 }
 /// Get string representation of IndPerp
-inline const std::string toString(const IndPerp& i) {
+inline std::string toString(const IndPerp& i) {
   return "(" + std::to_string(i.x()) + ", " + std::to_string(i.z()) + ")";
 }
 
@@ -516,10 +523,10 @@ public:
 
   // Want to make this private to disable but think it may be needed as we put Regions
   // into maps which seems to need to be able to make "empty" objects.
-  Region<T>() = default;
+  Region() = default;
 
-  Region<T>(int xstart, int xend, int ystart, int yend, int zstart, int zend, int ny,
-            int nz, int maxregionblocksize = MAXREGIONBLOCKSIZE)
+  Region(int xstart, int xend, int ystart, int yend, int zstart, int zend, int ny, int nz,
+         int maxregionblocksize = MAXREGIONBLOCKSIZE)
       : ny(ny), nz(nz) {
 #if CHECK > 1
     if constexpr (std::is_base_of_v<Ind2D, T>) {
@@ -560,20 +567,18 @@ public:
     blocks = getContiguousBlocks(maxregionblocksize);
   };
 
-  Region<T>(RegionIndices& indices, int maxregionblocksize = MAXREGIONBLOCKSIZE)
-      : indices(indices) {
-    blocks = getContiguousBlocks(maxregionblocksize);
-  };
+  Region(RegionIndices& indices, int maxregionblocksize = MAXREGIONBLOCKSIZE)
+      : indices(indices), blocks(getContiguousBlocks(maxregionblocksize)){};
 
-  Region<T>(ContiguousBlocks& blocks) : blocks(blocks) { indices = getRegionIndices(); };
+  // We need to first set the blocks, and only after that call getRegionIndices.
+  // Do not move this into the member initialiser list
+  // NOLINTNEXTLINE(cppcoreguidelines-prefer-member-initializer)
+  Region(ContiguousBlocks& blocks) : blocks(blocks) { indices = getRegionIndices(); };
 
   bool operator==(const Region<T>& other) const {
     return std::equal(this->begin(), this->end(), other.begin(), other.end());
   }
 
-  /// Destructor
-  ~Region() = default;
-
   /// Expose the iterator over indices for use in range-based
   /// for-loops or with STL algorithms, etc.
   ///
@@ -760,8 +765,8 @@ public:
     //   globalPos = (index/period) * period; // Find which period block we're in
     //   newIndex = globalPos + localPos;
     for (unsigned int i = 0; i < newInd.size(); i++) {
-      int index = newInd[i].ind;
-      int whichBlock = index / period;
+      const int index = newInd[i].ind;
+      const int whichBlock = index / period;
       newInd[i].ind = ((index + shift) % period) + period * whichBlock;
     };
 
@@ -785,20 +790,21 @@ public:
     std::vector<int> blockSizes(result.numBlocks);
 
     // Get the size of each block using lambda to calculate size
-    std::transform(std::begin(blocks), std::end(blocks), std::begin(blockSizes),
-                   [](const ContiguousBlock& a) { return a.second.ind - a.first.ind; });
+    std::transform(
+        std::begin(blocks), std::end(blocks), std::begin(blockSizes),
+        [](const ContiguousBlock& block) { return block.second.ind - block.first.ind; });
 
     auto minMaxSize = std::minmax_element(std::begin(blockSizes), std::end(blockSizes));
 
-    result.minBlockSize =
-        *(minMaxSize.first); //Note have to derefence to get actual value
-    result.numMinBlocks =
-        std::count(std::begin(blockSizes), std::end(blockSizes), result.minBlockSize);
+    // Note: have to dereference to get the actual value
+    result.minBlockSize = *(minMaxSize.first);
+    result.numMinBlocks = static_cast<int>(
+        std::count(std::begin(blockSizes), std::end(blockSizes), result.minBlockSize));
 
-    result.maxBlockSize =
-        *(minMaxSize.second); //Note have to derefence to get actual value
-    result.numMaxBlocks =
-        std::count(std::begin(blockSizes), std::end(blockSizes), result.maxBlockSize);
+    // Note: have to dereference to get the actual value
+    result.maxBlockSize = *(minMaxSize.second);
+    result.numMaxBlocks = static_cast<int>(
+        std::count(std::begin(blockSizes), std::end(blockSizes), result.maxBlockSize));
 
     result.maxImbalance = static_cast<BoutReal>(result.maxBlockSize)
                           / static_cast<BoutReal>(result.minBlockSize);
@@ -853,10 +859,10 @@ private:
     int z = zstart;
 
     bool done = false;
-    int j = -1;
+    int ind = -1;
     while (!done) {
-      j++;
-      region[j].ind = (x * ny + y) * nz + z;
+      ind++;
+      region[ind].ind = (x * ny + y) * nz + z;
       if (x == xend && y == yend && z == zend) {
         done = true;
       }
@@ -979,4 +985,4 @@ unsigned int size(const Region<T>& region) {
   return region.size();
 }
 
-#endif /* __REGION_H__ */
+#endif /* BOUT_REGION_H */
diff --git a/include/bout/rkscheme.hxx b/include/bout/rkscheme.hxx
index f4e5959aff..ba818c04fe 100644
--- a/include/bout/rkscheme.hxx
+++ b/include/bout/rkscheme.hxx
@@ -32,8 +32,8 @@
 
 class RKScheme;
 
-#ifndef __RKSCHEME_H__
-#define __RKSCHEME_H__
+#ifndef BOUT_RKSCHEME_H
+#define BOUT_RKSCHEME_H
 
 #include "bout/generic_factory.hxx"
 #include <bout/bout_types.hxx>
@@ -140,4 +140,4 @@ private:
   void zeroSteps();
 };
 
-#endif // __RKSCHEME_H__
+#endif // BOUT_RKSCHEME_H
diff --git a/include/bout/rvec.hxx b/include/bout/rvec.hxx
index 0b611d64bf..492228b9ea 100644
--- a/include/bout/rvec.hxx
+++ b/include/bout/rvec.hxx
@@ -1,11 +1,11 @@
 
 #pragma once
-#ifndef __RVEC_H__
-#define __RVEC_H__
+#ifndef BOUT_RVEC_H
+#define BOUT_RVEC_H
 
 #include <bout/bout_types.hxx>
 
 #include <vector>
 using rvec = std::vector<BoutReal>;
 
-#endif // __RVEC_H__
+#endif // BOUT_RVEC_H
diff --git a/include/bout/scorepwrapper.hxx b/include/bout/scorepwrapper.hxx
index 210d48e49f..2eb67cda30 100644
--- a/include/bout/scorepwrapper.hxx
+++ b/include/bout/scorepwrapper.hxx
@@ -1,5 +1,5 @@
-#ifndef __BOUT_SCOREP_H__
-#define __BOUT_SCOREP_H__
+#ifndef BOUT_SCOREP_H
+#define BOUT_SCOREP_H
 
 #include "bout/build_config.hxx"
 
diff --git a/include/bout/single_index_ops.hxx b/include/bout/single_index_ops.hxx
index 6a9089510b..60bd78bc36 100644
--- a/include/bout/single_index_ops.hxx
+++ b/include/bout/single_index_ops.hxx
@@ -16,7 +16,7 @@ using EXEC_POL = RAJA::cuda_exec<CUDA_BLOCK_SIZE>;
 using EXEC_POL = RAJA::loop_exec;
 #endif // end BOUT_USE_CUDA
 ////-----------CUDA settings------------------------------------------------------end
-#endif
+#endif // end BOUT_HAS_RAJA
 
 // Ind3D: i.zp():
 BOUT_HOST_DEVICE inline int i_zp(const int id, const int nz) {
diff --git a/include/bout/slepclib.hxx b/include/bout/slepclib.hxx
index f6df9ce98c..e59a9c2913 100644
--- a/include/bout/slepclib.hxx
+++ b/include/bout/slepclib.hxx
@@ -42,8 +42,8 @@
 
 class SlepcLib;
 
-#ifndef __SLEPCLIB_H__
-#define __SLEPCLIB_H__
+#ifndef BOUT_SLEPCLIB_H
+#define BOUT_SLEPCLIB_H
 
 #include "bout/build_config.hxx"
 
@@ -89,4 +89,4 @@ public:
 
 #endif // BOUT_HAS_SLEPC
 
-#endif //  __SLEPCLIB_H__
+#endif //  BOUT_SLEPCLIB_H
diff --git a/include/bout/smoothing.hxx b/include/bout/smoothing.hxx
index 8a0d6e81b8..9485602053 100644
--- a/include/bout/smoothing.hxx
+++ b/include/bout/smoothing.hxx
@@ -25,8 +25,8 @@
  *
  **************************************************************/
 
-#ifndef __SMOOTHING_H__
-#define __SMOOTHING_H__
+#ifndef BOUT_SMOOTHING_H
+#define BOUT_SMOOTHING_H
 
 #include "bout/field3d.hxx"
 
@@ -135,4 +135,4 @@ const Field3D nl_filter_z(const Field3D& f, BoutReal w = 1.0);
  */
 const Field3D nl_filter(const Field3D& f, BoutReal w = 1.0);
 
-#endif // __SMOOTHING_H__
+#endif // BOUT_SMOOTHING_H
diff --git a/include/bout/solverfactory.hxx b/include/bout/solverfactory.hxx
index a628aed0c1..a0ecd646b8 100644
--- a/include/bout/solverfactory.hxx
+++ b/include/bout/solverfactory.hxx
@@ -1,5 +1,5 @@
-#ifndef __SOLVER_FACTORY_H__
-#define __SOLVER_FACTORY_H__
+#ifndef BOUT_SOLVER_FACTORY_H
+#define BOUT_SOLVER_FACTORY_H
 
 #ifndef _MSC_VER
 #warning("Deprecated header: use #include <bout/solver.hxx> instead")
@@ -9,4 +9,4 @@
 
 #include <bout/solver.hxx>
 
-#endif // __SOLVER_FACTORY_H__
+#endif // BOUT_SOLVER_FACTORY_H
diff --git a/include/bout/sourcex.hxx b/include/bout/sourcex.hxx
index 6727c8bcc9..e01c469af6 100644
--- a/include/bout/sourcex.hxx
+++ b/include/bout/sourcex.hxx
@@ -2,8 +2,8 @@
  * Radial mask operators
  **************************************************************/
 
-#ifndef __MASKX_H__
-#define __MASKX_H__
+#ifndef BOUT_MASKX_H
+#define BOUT_MASKX_H
 
 #include "bout/field3d.hxx"
 
@@ -21,4 +21,4 @@ const Field3D sink_tanhxr(const Field2D& f0, const Field3D& f, BoutReal swidth,
 
 const Field3D buff_x(const Field3D& f, bool BoutRealspace = true);
 
-#endif // __MASKX_H__
+#endif // BOUT_MASKX_H
diff --git a/include/bout/stencils.hxx b/include/bout/stencils.hxx
index fa55e7dd2d..2466047297 100644
--- a/include/bout/stencils.hxx
+++ b/include/bout/stencils.hxx
@@ -25,8 +25,8 @@
  *
  **************************************************************************/
 
-#ifndef __STENCILS_H__
-#define __STENCILS_H__
+#ifndef BOUT_STENCILS_H
+#define BOUT_STENCILS_H
 
 #include "bout/bout_types.hxx"
 
@@ -125,4 +125,4 @@ stencil inline populateStencil(const FieldType& f, const typename FieldType::ind
   populateStencil<direction, stagger, nGuard, FieldType>(s, f, i);
   return s;
 }
-#endif /* __STENCILS_H__ */
+#endif /* BOUT_STENCILS_H */
diff --git a/include/bout/sundials_backports.hxx b/include/bout/sundials_backports.hxx
index c4f4aa59ef..4ec334f4d4 100644
--- a/include/bout/sundials_backports.hxx
+++ b/include/bout/sundials_backports.hxx
@@ -1,81 +1,74 @@
-// Backports for SUNDIALS compatibility between versions 3-6
+// Backports for SUNDIALS compatibility between versions 4-7
 //
 // These are common backports shared between the CVode, ARKode, and IDA solvers
 //
 // Copyright 2022 Peter Hill, BOUT++ Team
-// SPDX-License-Identifier: LGPLv3
+// SPDX-License-Identifier: LGPL-3.0-or-later
 
 #ifndef BOUT_SUNDIALS_BACKPORTS_H
 #define BOUT_SUNDIALS_BACKPORTS_H
 
+#include "bout/bout_types.hxx"
+
+#include <type_traits>
+
 #include <nvector/nvector_parallel.h>
 #include <sundials/sundials_config.h>
 #include <sundials/sundials_iterative.h>
+#include <sundials/sundials_nonlinearsolver.h>
 #include <sundials/sundials_types.h>
-
-#if SUNDIALS_VERSION_MAJOR >= 3
 #include <sunlinsol/sunlinsol_spgmr.h>
-#endif
-
-#if SUNDIALS_VERSION_MAJOR >= 4
-#include <sundials/sundials_nonlinearsolver.h>
 #include <sunnonlinsol/sunnonlinsol_fixedpoint.h>
 #include <sunnonlinsol/sunnonlinsol_newton.h>
-#endif
-
-#include "bout/unused.hxx"
 
-#if SUNDIALS_VERSION_MAJOR < 3
-using SUNLinearSolver = int*;
-inline void SUNLinSolFree([[maybe_unused]] SUNLinearSolver solver) {}
-using sunindextype = long int;
+#if SUNDIALS_VERSION_MAJOR >= 6
+#include <sundials/sundials_context.hpp>
 #endif
 
-#if SUNDIALS_VERSION_MAJOR < 4
-using SUNNonlinearSolver = int*;
-inline void SUNNonlinSolFree([[maybe_unused]] SUNNonlinearSolver solver) {}
+#if SUNDIALS_VERSION_MAJOR < 6
+using sundials_real_type = realtype;
+#else
+using sundials_real_type = sunrealtype;
 #endif
 
-#if SUNDIALS_VERSION_MAJOR < 6
-namespace sundials {
-struct Context {
-  Context(void* comm [[maybe_unused]]) {}
-};
-} // namespace sundials
+static_assert(std::is_same_v<BoutReal, sundials_real_type>,
+              "BOUT++ and SUNDIALS real types do not match");
 
-using SUNContext = sundials::Context;
+#define SUNDIALS_CONTROLLER_SUPPORT \
+  (SUNDIALS_VERSION_MAJOR > 6       \
+   || SUNDIALS_VERSION_MAJOR == 6 && SUNDIALS_VERSION_MINOR >= 7)
+#define SUNDIALS_TABLE_BY_NAME_SUPPORT \
+  (SUNDIALS_VERSION_MAJOR > 6          \
+   || SUNDIALS_VERSION_MAJOR == 6 && SUNDIALS_VERSION_MINOR >= 4)
 
+#if SUNDIALS_VERSION_MAJOR < 6
 constexpr auto SUN_PREC_RIGHT = PREC_RIGHT;
 constexpr auto SUN_PREC_LEFT = PREC_LEFT;
 constexpr auto SUN_PREC_NONE = PREC_NONE;
 
-inline N_Vector N_VNew_Parallel(MPI_Comm comm, sunindextype local_length,
-                                sunindextype global_length,
-                                [[maybe_unused]] SUNContext sunctx) {
-  return N_VNew_Parallel(comm, local_length, global_length);
-}
+namespace sundials {
+using Context = std::nullptr_t;
+} // namespace sundials
+#endif
 
-#if SUNDIALS_VERSION_MAJOR >= 3
-inline SUNLinearSolver SUNLinSol_SPGMR(N_Vector y, int pretype, int maxl,
-                                       [[maybe_unused]] SUNContext sunctx) {
-#if SUNDIALS_VERSION_MAJOR == 3
-  return SUNSPGMR(y, pretype, maxl);
+inline sundials::Context createSUNContext([[maybe_unused]] MPI_Comm& comm) {
+#if SUNDIALS_VERSION_MAJOR < 6
+  return nullptr;
+#elif SUNDIALS_VERSION_MAJOR < 7
+  return sundials::Context(static_cast<void*>(&comm));
 #else
-  return SUNLinSol_SPGMR(y, pretype, maxl);
+  return sundials::Context(comm);
 #endif
 }
-#if SUNDIALS_VERSION_MAJOR >= 4
-inline SUNNonlinearSolver SUNNonlinSol_FixedPoint(N_Vector y, int m,
-                                                  [[maybe_unused]] SUNContext sunctx) {
-  return SUNNonlinSol_FixedPoint(y, m);
-}
 
-inline SUNNonlinearSolver SUNNonlinSol_Newton(N_Vector y,
-                                              [[maybe_unused]] SUNContext sunctx) {
-  return SUNNonlinSol_Newton(y);
+template <typename Func, typename... Args>
+inline decltype(auto) callWithSUNContext(Func f, [[maybe_unused]] sundials::Context& ctx,
+                                         Args&&... args) {
+#if SUNDIALS_VERSION_MAJOR < 6
+  return f(std::forward<Args>(args)...);
+#else
+  return f(std::forward<Args>(args)..., ctx);
+#endif
 }
-#endif // SUNDIALS_VERSION_MAJOR >= 4
-#endif // SUNDIALS_VERSION_MAJOR >= 3
-#endif // SUNDIALS_VERSION_MAJOR < 6
 
 #endif // BOUT_SUNDIALS_BACKPORTS_H
diff --git a/include/bout/surfaceiter.hxx b/include/bout/surfaceiter.hxx
index ebe33b9864..a031b30ba6 100644
--- a/include/bout/surfaceiter.hxx
+++ b/include/bout/surfaceiter.hxx
@@ -4,8 +4,8 @@
 
 class SurfaceIter;
 
-#ifndef __SURFACEITER_H__
-#define __SURFACEITER_H__
+#ifndef BOUT_SURFACEITER_H
+#define BOUT_SURFACEITER_H
 
 #include "mesh.hxx"
 
@@ -63,4 +63,4 @@ private:
   const int lastpos;
 };
 
-#endif // __SURFACEITER_H__
+#endif // BOUT_SURFACEITER_H
diff --git a/include/bout/sys/gettext.hxx b/include/bout/sys/gettext.hxx
index 2ada87ab63..a17412118c 100644
--- a/include/bout/sys/gettext.hxx
+++ b/include/bout/sys/gettext.hxx
@@ -1,7 +1,7 @@
 /// Support for i18n using GNU gettext
 
-#ifndef __BOUT_GETTEXT_H__
-#define __BOUT_GETTEXT_H__
+#ifndef BOUT_GETTEXT_H
+#define BOUT_GETTEXT_H
 
 #include "bout/build_config.hxx"
 
@@ -19,4 +19,4 @@
 #define _(string) string
 
 #endif // BOUT_HAS_GETTEXT
-#endif // __BOUT_GETTEXT_H__
+#endif // BOUT_GETTEXT_H
diff --git a/include/bout/sys/range.hxx b/include/bout/sys/range.hxx
index a210983f25..9d8aa96cd7 100644
--- a/include/bout/sys/range.hxx
+++ b/include/bout/sys/range.hxx
@@ -21,8 +21,8 @@
 
 */
 
-#ifndef __RANGE_H__
-#define __RANGE_H__
+#ifndef BOUT_RANGE_H
+#define BOUT_RANGE_H
 
 class RangeIterator {
 public:
@@ -74,4 +74,4 @@ private:
   bool delete_next = false;    // Flag to delete this->n if we created it
 };
 
-#endif // __RANGE_H__
+#endif // BOUT_RANGE_H
diff --git a/include/bout/sys/timer.hxx b/include/bout/sys/timer.hxx
index 6f04630c9d..f3beba27b1 100644
--- a/include/bout/sys/timer.hxx
+++ b/include/bout/sys/timer.hxx
@@ -1,5 +1,5 @@
-#ifndef __TIMER_H__
-#define __TIMER_H__
+#ifndef BOUT_TIMER_H
+#define BOUT_TIMER_H
 
 #include <chrono>
 #include <map>
@@ -134,4 +134,4 @@ public:
 };
 
 #define AUTO_TIME() Timer CONCATENATE(time_, __LINE__)(__thefunc__)
-#endif // __TIMER_H__
+#endif // BOUT_TIMER_H
diff --git a/include/bout/sys/uncopyable.hxx b/include/bout/sys/uncopyable.hxx
index 76606620ed..35418cb7f6 100644
--- a/include/bout/sys/uncopyable.hxx
+++ b/include/bout/sys/uncopyable.hxx
@@ -1,7 +1,7 @@
 // From Scott Meyers' "Effective C++, third edition"
 
-#ifndef __UNCOPYABLE_H__
-#define __UNCOPYABLE_H__
+#ifndef BOUT_UNCOPYABLE_H
+#define BOUT_UNCOPYABLE_H
 
 /// Inherit from this class (private) to prevent copying
 class Uncopyable {
@@ -14,4 +14,4 @@ public:
   Uncopyable& operator=(const Uncopyable&) = delete;
 };
 
-#endif // __UNCOPYABLE_H__
+#endif // BOUT_UNCOPYABLE_H
diff --git a/include/bout/template_combinations.hxx b/include/bout/template_combinations.hxx
index 81848cf252..49a42e6bca 100644
--- a/include/bout/template_combinations.hxx
+++ b/include/bout/template_combinations.hxx
@@ -27,8 +27,8 @@
  *
  **************************************************************************/
 
-#ifndef __TEMPLATE_COMBINATIONS_H__
-#define __TEMPLATE_COMBINATIONS_H__
+#ifndef BOUT_TEMPLATE_COMBINATIONS_H
+#define BOUT_TEMPLATE_COMBINATIONS_H
 
 #include <bout/unused.hxx>
 
diff --git a/include/bout/unused.hxx b/include/bout/unused.hxx
index 74fd3c2f98..7ef67cfe84 100644
--- a/include/bout/unused.hxx
+++ b/include/bout/unused.hxx
@@ -1,5 +1,5 @@
-#ifndef __UNUSED_H__
-#define __UNUSED_H__
+#ifndef BOUT_UNUSED_H
+#define BOUT_UNUSED_H
 
 /// Mark a function parameter as unused in the function body
 ///
@@ -37,4 +37,4 @@
 #define UNUSED(x) x
 #endif
 
-#endif //__UNUSED_H__
+#endif //BOUT_UNUSED_H
diff --git a/include/bout/utils.hxx b/include/bout/utils.hxx
index 0ec87fd4d7..19fc8bed39 100644
--- a/include/bout/utils.hxx
+++ b/include/bout/utils.hxx
@@ -26,8 +26,8 @@
  *
  **************************************************************************/
 
-#ifndef __UTILS_H__
-#define __UTILS_H__
+#ifndef BOUT_UTILS_H
+#define BOUT_UTILS_H
 
 #include "bout/bout_types.hxx"
 #include "bout/boutexception.hxx"
@@ -362,6 +362,14 @@ public:
     return data[i.ind];
   }
 
+  T& operator[](Ind3D i) {
+    // ny and nz are private :-(
+    // ASSERT2(i.nz == n3);
+    // ASSERT2(i.ny == n2);
+    ASSERT2(0 <= i.ind && i.ind < n1 * n2 * n3);
+    return data[i.ind];
+  }
+
   Tensor& operator=(const T& val) {
     for (auto& i : data) {
       i = val;
@@ -712,4 +720,4 @@ inline bool flagSet(int bitset, int flag) { return (bitset & flag) != 0; }
 } // namespace utils
 } // namespace bout
 
-#endif // __UTILS_H__
+#endif // BOUT_UTILS_H
diff --git a/include/bout/vecops.hxx b/include/bout/vecops.hxx
index 4a03d06b5e..9166503855 100644
--- a/include/bout/vecops.hxx
+++ b/include/bout/vecops.hxx
@@ -26,8 +26,8 @@
  *
  **************************************************************************/
 
-#ifndef __VECOPS_H__
-#define __VECOPS_H__
+#ifndef BOUT_VECOPS_H
+#define BOUT_VECOPS_H
 
 #include "bout/bout_types.hxx"
 #include "bout/coordinates.hxx"
@@ -129,4 +129,4 @@ Vector3D V_dot_Grad(const Vector2D& v, const Vector3D& a);
 Vector3D V_dot_Grad(const Vector3D& v, const Vector2D& a);
 Vector3D V_dot_Grad(const Vector3D& v, const Vector3D& a);
 
-#endif // __VECOPS_H__
+#endif // BOUT_VECOPS_H
diff --git a/include/bout/vector2d.hxx b/include/bout/vector2d.hxx
index 974c5f81db..bdc375e698 100644
--- a/include/bout/vector2d.hxx
+++ b/include/bout/vector2d.hxx
@@ -34,8 +34,8 @@
 class Vector2D;
 
 #pragma once
-#ifndef __VECTOR2D_H__
-#define __VECTOR2D_H__
+#ifndef BOUT_VECTOR2D_H
+#define BOUT_VECTOR2D_H
 
 class Field2D;
 class Field3D;
@@ -217,4 +217,4 @@ inline Vector2D zeroFrom(const Vector2D& v) {
  */
 inline Vector2D& ddt(Vector2D& f) { return *(f.timeDeriv()); }
 
-#endif // __VECTOR2D_H__
+#endif // BOUT_VECTOR2D_H
diff --git a/include/bout/vector3d.hxx b/include/bout/vector3d.hxx
index 93ee798663..0c71dcffa5 100644
--- a/include/bout/vector3d.hxx
+++ b/include/bout/vector3d.hxx
@@ -30,8 +30,8 @@
 class Vector3D;
 
 #pragma once
-#ifndef __VECTOR3D_H__
-#define __VECTOR3D_H__
+#ifndef BOUT_VECTOR3D_H
+#define BOUT_VECTOR3D_H
 
 class Field2D;
 class Vector2D;
@@ -237,4 +237,4 @@ inline Vector3D zeroFrom(const Vector3D& v) {
  */
 inline Vector3D& ddt(Vector3D& f) { return *(f.timeDeriv()); }
 
-#endif // __VECTOR3D_H__
+#endif // BOUT_VECTOR3D_H
diff --git a/include/bout/where.hxx b/include/bout/where.hxx
index 504dc028b1..c798d75de8 100644
--- a/include/bout/where.hxx
+++ b/include/bout/where.hxx
@@ -25,8 +25,8 @@
  *
  **************************************************************************/
 
-#ifndef __WHERE_H__
-#define __WHERE_H__
+#ifndef BOUT_WHERE_H
+#define BOUT_WHERE_H
 
 #include "bout/field.hxx"
 #include "bout/field2d.hxx"
@@ -85,4 +85,4 @@ auto where(const T& test, BoutReal gt0, BoutReal le0) -> ResultType {
   return result;
 }
 
-#endif // __WHERE_H__
+#endif // BOUT_WHERE_H
diff --git a/manual/sphinx/conf.py b/manual/sphinx/conf.py
index 29c0985841..d27e8ab1fd 100755
--- a/manual/sphinx/conf.py
+++ b/manual/sphinx/conf.py
@@ -88,7 +88,7 @@ def __getattr__(cls, name):
         + " -DBOUT_UPDATE_GIT_SUBMODULE=OFF"
         + " -DBOUT_TESTS=OFF"
         + " -DBOUT_ALLOW_INSOURCE_BUILD=ON"
-        + f" -DPython_ROOT_DIR={pydir}"
+        + f" -DPython3_ROOT_DIR={pydir}"
         + f" -Dmpark_variant_DIR={pwd}/externalpackages/mpark.variant/"
         + f" -Dfmt_DIR={pwd}/externalpackages/fmt/"
     )
diff --git a/manual/sphinx/developer_docs/data_types.rst b/manual/sphinx/developer_docs/data_types.rst
index 2e303381f9..fa8e9e6ea6 100644
--- a/manual/sphinx/developer_docs/data_types.rst
+++ b/manual/sphinx/developer_docs/data_types.rst
@@ -300,7 +300,7 @@ verion of the macro::
 For loops inside parallel regions, there is ``BOUT_FOR_INNER``::
 
     Field3D f(0.0);
-    BOUT_OMP(parallel) {
+    BOUT_OMP_PERF(parallel) {
       BOUT_FOR_INNER(i, f.getMesh()->getRegion3D("RGN_ALL")) {
          f[i] = a[i] + b[i];
       }
@@ -357,7 +357,7 @@ Tuning BOUT_FOR loops
 The ``BOUT_FOR`` macros use two nested loops: The outer loop is OpenMP
 parallelised, and iterates over contiguous blocks::
 
-  BOUT_OMP(parallel for schedule(guided))
+  BOUT_OMP_PERF(parallel for schedule(guided))
   for (auto block = region.getBlocks().cbegin();
        block < region.getBlocks().cend();
        ++block)
diff --git a/manual/sphinx/user_docs/adios2.rst b/manual/sphinx/user_docs/adios2.rst
index 8a6228cd3a..d8e0135c0d 100644
--- a/manual/sphinx/user_docs/adios2.rst
+++ b/manual/sphinx/user_docs/adios2.rst
@@ -11,14 +11,14 @@ Installation
 The easiest way to configure BOUT++ with ADIOS2 is to tell CMake to download and build it
 with this flag::
 
-  -DBOUT_DOWNLOAD_ADIOS=ON
+  -DBOUT_DOWNLOAD_ADIOS2=ON
 
 The ``master`` branch will be downloaded from `Github <https://github.com/ornladios/ADIOS2>`_,
 configured and built with BOUT++.
 
-Alternatively, if ADIOS is already installed then the following flags can be used::
+Alternatively, if ADIOS2 is already installed then the following flags can be used::
 
-  -DBOUT_USE_ADIOS=ON -DADIOS2_ROOT=/path/to/adios2
+  -DBOUT_USE_ADIOS2=ON -DADIOS2_ROOT=/path/to/adios2
 
 Output files
 ------------
diff --git a/manual/sphinx/user_docs/advanced_install.rst b/manual/sphinx/user_docs/advanced_install.rst
index e25be12b4b..048a26a6e3 100644
--- a/manual/sphinx/user_docs/advanced_install.rst
+++ b/manual/sphinx/user_docs/advanced_install.rst
@@ -145,13 +145,13 @@ where ``<build-directory>`` is the path to the build directory
 
 MPCDF HPC Systems
 ~~~~~~~~~~~~~~~~~
+After cloning BOUT-dev and checking out the branch you want (e.g. db-outer), run:
+
 .. code-block:: bash
 
-    module purge # or at least onload intel and impi and mkl
-    module load gcc/10 cmake/3.18 openmpi/4
-    # ensure python3 is >= python3.6 - skip if you have a newer python3 loaded
-    mkdir -p $HOME/bin ; test -e $HOME/bin/python3 || ln -s $(which python3.6) $HOME/bin/python3
-    BUILD=/ptmp/$USER/bout-deps bin/bout-build-deps.sh
+    module purge # or at least unload intel
+    module load gcc/13 anaconda/3/2021.11 impi/2021.9 hdf5-serial/1.12.2 mkl/2022.0 netcdf-serial/4.8.1 fftw-mpi/3.3.10
+    BUILD=/ptmp/$USER/bout-deps NO_HDF5=1 NO_NETCDF=1 NO_FFTW=1 bin/bout-build-deps.sh
 
 and follow the instructions for configuring BOUT++. To enable openMP
 for a production run use:
@@ -159,11 +158,11 @@ for a production run use:
 .. code-block:: bash
 
     module load bout-dep
-    cmake .. -DBOUT_USE_NETCDF=ON -DnetCDF_ROOT=$BOUT_DEP -DnetCDFCxx_ROOT=$BOUT_DEP \
+    cmake .. -DBOUT_USE_NETCDF=ON -DnetCDFCxx_ROOT=$BOUT_DEP \
       -DBOUT_USE_PETSC=ON -DPETSC_DIR=$BOUT_DEP \
-      -DBOUT_USE_FFTW=ON -DFFTW_ROOT=$BOUT_DEP \
+      -DBOUT_USE_FFTW=ON \
       -DBOUT_USE_SUNDIALS=ON -DSUNDIALS_ROOT=$BOUT_DEP \
-      -DBOUT_ENABLE_OPENMP=ON \
+      -DBOUT_ENABLE_OPENMP=OFF \
       -DCMAKE_BUILD_TYPE=Release
 
 
@@ -306,9 +305,10 @@ solver. Currently, BOUT++ also supports the SUNDIALS solvers CVODE, IDA
 and ARKODE which are available from
 https://computation.llnl.gov/casc/sundials/main.html.
 
-.. note:: BOUT++ currently supports SUNDIALS > 2.6, up to 5.4.0 as of
-          September 2020. It is advisable to use the highest possible
-          version
+.. note:: BOUT++ currently supports SUNDIALS > 2.6, up to 6.7.0 as of
+          January 2024. It is advisable to use the highest possible
+          version. Support for SUNDIALS versions < 4 will be removed
+          in the next release.
 
 The full installation guide is found in the downloaded ``.tar.gz``,
 but we will provide a step-by-step guide to install it and make it
diff --git a/manual/sphinx/user_docs/installing.rst b/manual/sphinx/user_docs/installing.rst
index eb155909bf..10f5d9b9f1 100644
--- a/manual/sphinx/user_docs/installing.rst
+++ b/manual/sphinx/user_docs/installing.rst
@@ -373,7 +373,7 @@ For SUNDIALS, use ``-DBOUT_DOWNLOAD_SUNDIALS=ON``. If using ``ccmake`` this opti
 may not appear initially. This automatically sets ``BOUT_USE_SUNDIALS=ON``, and
 configures SUNDIALS to use MPI.
 
-For ADIOS2, use ``-DBOUT_DOWNLOAD_ADIOS=ON``. This will download and
+For ADIOS2, use ``-DBOUT_DOWNLOAD_ADIOS2=ON``. This will download and
 configure `ADIOS2 <https://adios2.readthedocs.io/>`_, enabling BOUT++
 to read and write this high-performance parallel file format.
 
diff --git a/requirements.txt b/requirements.txt
index 75358b10db..dcbe5cef5c 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,8 +1,8 @@
-Jinja2>=2.11.3
-numpy>=1.14.1
-scipy>=1.0.0
-netcdf4~=1.6.0
-matplotlib>=2.0.0
+Jinja2~=3.1.0
+numpy~=2.0.0
+scipy>=1.11.0
+netcdf4~=1.7.1
+matplotlib>=3.7.0
 Cython~=3.0.0
 boututils~=0.2.1
 boutdata~=0.2.1
diff --git a/src/bout++.cxx b/src/bout++.cxx
index 481a928bec..ff25b1163e 100644
--- a/src/bout++.cxx
+++ b/src/bout++.cxx
@@ -59,7 +59,7 @@ const char DEFAULT_DIR[] = "data";
 #include "bout/bout.hxx"
 #undef BOUT_NO_USING_NAMESPACE_BOUTGLOBALS
 
-#if BOUT_HAS_ADIOS
+#if BOUT_HAS_ADIOS2
 #include "bout/adios_object.hxx"
 #endif
 
@@ -165,7 +165,7 @@ int BoutInitialise(int& argc, char**& argv) {
 
     savePIDtoFile(args.data_dir, MYPE);
 
-#if BOUT_HAS_ADIOS
+#if BOUT_HAS_ADIOS2
     bout::ADIOSInit(BoutComm::get());
 #endif
 
@@ -572,7 +572,7 @@ void printCompileTimeOptions() {
   constexpr auto netcdf_flavour =
       has_netcdf ? (has_legacy_netcdf ? " (Legacy)" : " (NetCDF4)") : "";
   output_info.write(_("\tNetCDF support {}{}\n"), is_enabled(has_netcdf), netcdf_flavour);
-  output_info.write(_("\tADIOS support {}\n"), is_enabled(has_adios));
+  output_info.write(_("\tADIOS2 support {}\n"), is_enabled(has_adios2));
   output_info.write(_("\tPETSc support {}\n"), is_enabled(has_petsc));
   output_info.write(_("\tPretty function name support {}\n"),
                     is_enabled(has_pretty_function));
@@ -582,11 +582,8 @@ void printCompileTimeOptions() {
   output_info.write(_("\tSUNDIALS support {}\n"), is_enabled(has_sundials));
   output_info.write(_("\tBacktrace in exceptions {}\n"), is_enabled(use_backtrace));
   output_info.write(_("\tColour in logs {}\n"), is_enabled(use_color));
-  output_info.write(_("\tOpenMP parallelisation {}"), is_enabled(use_openmp));
-#ifdef _OPENMP
-  output_info.write(_(", using {} threads"), omp_get_max_threads());
-#endif
-  output_info.write("\n");
+  output_info.write(_("\tOpenMP parallelisation {}, using {} threads\n"),
+                    is_enabled(use_openmp), omp_get_max_threads());
   output_info.write(_("\tExtra debug output {}\n"), is_enabled(use_output_debug));
   output_info.write(_("\tFloating-point exceptions {}\n"), is_enabled(use_sigfpe));
   output_info.write(_("\tSignal handling support {}\n"), is_enabled(use_signal));
@@ -701,7 +698,7 @@ void addBuildFlagsToOptions(Options& options) {
   options["has_gettext"].force(bout::build::has_gettext);
   options["has_lapack"].force(bout::build::has_lapack);
   options["has_netcdf"].force(bout::build::has_netcdf);
-  options["has_adios"].force(bout::build::has_adios);
+  options["has_adios2"].force(bout::build::has_adios2);
   options["has_petsc"].force(bout::build::has_petsc);
   options["has_hypre"].force(bout::build::has_hypre);
   options["has_umpire"].force(bout::build::has_umpire);
@@ -715,6 +712,7 @@ void addBuildFlagsToOptions(Options& options) {
   options["use_backtrace"].force(bout::build::use_backtrace);
   options["use_color"].force(bout::build::use_color);
   options["use_openmp"].force(bout::build::use_openmp);
+  options["openmp_threads"].force(omp_get_max_threads());
   options["use_output_debug"].force(bout::build::use_output_debug);
   options["use_sigfpe"].force(bout::build::use_sigfpe);
   options["use_signal"].force(bout::build::use_signal);
@@ -797,7 +795,7 @@ int BoutFinalise(bool write_settings) {
   // Call HYPER_Finalize if not already called
   bout::HypreLib::cleanup();
 
-#if BOUT_HAS_ADIOS
+#if BOUT_HAS_ADIOS2
   bout::ADIOSFinalize();
 #endif
 
diff --git a/src/field/field3d.cxx b/src/field/field3d.cxx
index b4bb0d394f..4ed9641f44 100644
--- a/src/field/field3d.cxx
+++ b/src/field/field3d.cxx
@@ -32,6 +32,8 @@
 
 #include <cmath>
 
+#include "bout/parallel_boundary_op.hxx"
+#include "bout/parallel_boundary_region.hxx"
 #include <bout/assert.hxx>
 #include <bout/boundary_factory.hxx>
 #include <bout/boundary_op.hxx>
@@ -504,7 +506,7 @@ void Field3D::applyParallelBoundary(const std::string& condition) {
   /// Loop over the mesh boundary regions
   for (const auto& reg : fieldmesh->getBoundariesPar()) {
     auto op = std::unique_ptr<BoundaryOpPar>{
-        dynamic_cast<BoundaryOpPar*>(bfact->create(condition, reg))};
+        dynamic_cast<BoundaryOpPar*>(bfact->create(condition, reg.get()))};
     op->apply(*this);
   }
 }
@@ -524,7 +526,7 @@ void Field3D::applyParallelBoundary(const std::string& region,
   for (const auto& reg : fieldmesh->getBoundariesPar()) {
     if (reg->label == region) {
       auto op = std::unique_ptr<BoundaryOpPar>{
-          dynamic_cast<BoundaryOpPar*>(bfact->create(condition, reg))};
+          dynamic_cast<BoundaryOpPar*>(bfact->create(condition, reg.get()))};
       op->apply(*this);
       break;
     }
@@ -548,9 +550,9 @@ void Field3D::applyParallelBoundary(const std::string& region,
       // BoundaryFactory can't create boundaries using Field3Ds, so get temporary
       // boundary of the right type
       auto tmp = std::unique_ptr<BoundaryOpPar>{
-          dynamic_cast<BoundaryOpPar*>(bfact->create(condition, reg))};
+          dynamic_cast<BoundaryOpPar*>(bfact->create(condition, reg.get()))};
       // then clone that with the actual argument
-      auto op = std::unique_ptr<BoundaryOpPar>{tmp->clone(reg, f)};
+      auto op = std::unique_ptr<BoundaryOpPar>{tmp->clone(reg.get(), f)};
       op->apply(*this);
       break;
     }
@@ -618,7 +620,7 @@ Field3D filter(const Field3D& var, int N0, const std::string& rgn) {
 
   const Region<Ind2D>& region = var.getRegion2D(region_str);
 
-  BOUT_OMP(parallel)
+  BOUT_OMP_PERF(parallel)
   {
     Array<dcomplex> f(ncz / 2 + 1);
 
@@ -668,7 +670,7 @@ Field3D lowPass(const Field3D& var, int zmax, bool keep_zonal, const std::string
 
   const Region<Ind2D>& region = var.getRegion2D(region_str);
 
-  BOUT_OMP(parallel)
+  BOUT_OMP_PERF(parallel)
   {
     Array<dcomplex> f(ncz / 2 + 1);
 
diff --git a/src/field/field_data.cxx b/src/field/field_data.cxx
index ee8bd97b30..529f595316 100644
--- a/src/field/field_data.cxx
+++ b/src/field/field_data.cxx
@@ -1,4 +1,6 @@
 
+#include "bout/parallel_boundary_op.hxx"
+#include "bout/parallel_boundary_region.hxx"
 #include "bout/unused.hxx"
 #include <bout/boundary_factory.hxx>
 #include <bout/field_data.hxx>
@@ -151,10 +153,9 @@ void FieldData::setBoundary(const std::string& name) {
   }
 
   /// Get the mesh boundary regions
-  std::vector<BoundaryRegionPar*> par_reg = mesh->getBoundariesPar();
   /// Loop over the mesh parallel boundary regions
   for (const auto& reg : mesh->getBoundariesPar()) {
-    auto* op = dynamic_cast<BoundaryOpPar*>(bfact->createFromOptions(name, reg));
+    auto* op = dynamic_cast<BoundaryOpPar*>(bfact->createFromOptions(name, reg.get()));
     if (op != nullptr) {
       bndry_op_par.push_back(op);
     }
diff --git a/src/field/fieldgenerators.hxx b/src/field/fieldgenerators.hxx
index 66ef11a855..2485b4b82d 100644
--- a/src/field/fieldgenerators.hxx
+++ b/src/field/fieldgenerators.hxx
@@ -4,8 +4,8 @@
  * These classes are used by FieldFactory
  */
 
-#ifndef __FIELDGENERATORS_H__
-#define __FIELDGENERATORS_H__
+#ifndef BOUT_FIELDGENERATORS_H
+#define BOUT_FIELDGENERATORS_H
 
 #include <bout/boutexception.hxx>
 #include <bout/field_factory.hxx>
@@ -352,4 +352,4 @@ private:
   FieldGeneratorPtr test, gt0, lt0;
 };
 
-#endif // __FIELDGENERATORS_H__
+#endif // BOUT_FIELDGENERATORS_H
diff --git a/src/invert/fft_fftw.cxx b/src/invert/fft_fftw.cxx
index 514396c828..d66f35beee 100644
--- a/src/invert/fft_fftw.cxx
+++ b/src/invert/fft_fftw.cxx
@@ -258,7 +258,7 @@ void rfft([[maybe_unused]] const BoutReal* in, [[maybe_unused]] int length,
     // use a `single` block here as that requires all threads to reach the
     // block (implicit barrier) which may not be true in all cases (e.g.
     // if there are 8 threads but only 4 call the fft routine).
-    BOUT_OMP(critical(rfft))
+    BOUT_OMP_SAFE(critical(rfft))
     if ((size != length) || (nthreads < n_th)) {
       if (size > 0) {
         // Free all memory
@@ -335,7 +335,7 @@ void irfft([[maybe_unused]] const dcomplex* in, [[maybe_unused]] int length,
     // use a `single` block here as that requires all threads to reach the
     // block (implicit barrier) which may not be true in all cases (e.g.
     // if there are 8 threads but only 4 call the fft routine).
-    BOUT_OMP(critical(irfft))
+    BOUT_OMP_SAFE(critical(irfft))
     if ((size != length) || (nthreads < n_th)) {
       if (size > 0) {
         // Free all memory
diff --git a/src/invert/laplace/impls/cyclic/cyclic_laplace.cxx b/src/invert/laplace/impls/cyclic/cyclic_laplace.cxx
index 2687bf7187..5ce4e540b7 100644
--- a/src/invert/laplace/impls/cyclic/cyclic_laplace.cxx
+++ b/src/invert/laplace/impls/cyclic/cyclic_laplace.cxx
@@ -33,11 +33,13 @@
  *
  */
 
-#include "cyclic_laplace.hxx"
-#include "bout/build_config.hxx"
+#include "bout/build_defines.hxx"
 
 #if not BOUT_USE_METRIC_3D
 
+#include "cyclic_laplace.hxx"
+#include "bout/assert.hxx"
+#include "bout/bout_types.hxx"
 #include <bout/boutexception.hxx>
 #include <bout/constants.hxx>
 #include <bout/fft.hxx>
@@ -47,7 +49,7 @@
 #include <bout/sys/timer.hxx>
 #include <bout/utils.hxx>
 
-#include "cyclic_laplace.hxx"
+#include <vector>
 
 LaplaceCyclic::LaplaceCyclic(Options* opt, const CELL_LOC loc, Mesh* mesh_in,
                              Solver* UNUSED(solver))
@@ -120,18 +122,18 @@ FieldPerp LaplaceCyclic::solve(const FieldPerp& rhs, const FieldPerp& x0) {
 
   // If the flags to assign that only one guard cell should be used is set
   int inbndry = localmesh->xstart, outbndry = localmesh->xstart;
-  if (((global_flags & INVERT_BOTH_BNDRY_ONE) != 0) || (localmesh->xstart < 2)) {
+  if (isGlobalFlagSet(INVERT_BOTH_BNDRY_ONE) || (localmesh->xstart < 2)) {
     inbndry = outbndry = 1;
   }
-  if ((inner_boundary_flags & INVERT_BNDRY_ONE) != 0) {
+  if (isInnerBoundaryFlagSet(INVERT_BNDRY_ONE)) {
     inbndry = 1;
   }
-  if ((outer_boundary_flags & INVERT_BNDRY_ONE) != 0) {
+  if (isOuterBoundaryFlagSet(INVERT_BNDRY_ONE)) {
     outbndry = 1;
   }
 
   if (dst) {
-    BOUT_OMP(parallel)
+    BOUT_OMP_PERF(parallel)
     {
       /// Create a local thread-scope working array
       auto k1d = Array<dcomplex>(
@@ -139,13 +141,13 @@ FieldPerp LaplaceCyclic::solve(const FieldPerp& rhs, const FieldPerp& x0) {
 
       // Loop over X indices, including boundaries but not guard cells. (unless periodic
       // in x)
-      BOUT_OMP(for)
+      BOUT_OMP_PERF(for)
       for (int ix = xs; ix <= xe; ix++) {
         // Take DST in Z direction and put result in k1d
 
-        if (((ix < inbndry) && (inner_boundary_flags & INVERT_SET) && localmesh->firstX())
+        if (((ix < inbndry) && isInnerBoundaryFlagSetOnFirstX(INVERT_SET))
             || ((localmesh->LocalNx - ix - 1 < outbndry)
-                && (outer_boundary_flags & INVERT_SET) && localmesh->lastX())) {
+                && isOuterBoundaryFlagSetOnLastX(INVERT_SET))) {
           // Use the values in x0 in the boundary
           DST(x0[ix] + 1, localmesh->LocalNz - 2, std::begin(k1d));
         } else {
@@ -161,7 +163,7 @@ FieldPerp LaplaceCyclic::solve(const FieldPerp& rhs, const FieldPerp& x0) {
       // Get elements of the tridiagonal matrix
       // including boundary conditions
       BoutReal zlen = getUniform(coords->dz) * (localmesh->LocalNz - 3);
-      BOUT_OMP(for nowait)
+      BOUT_OMP_PERF(for nowait)
       for (int kz = 0; kz < nmode; kz++) {
         // wave number is 1/[rad]; DST has extra 2.
         BoutReal kwave = kz * 2.0 * PI / (2. * zlen);
@@ -169,8 +171,7 @@ FieldPerp LaplaceCyclic::solve(const FieldPerp& rhs, const FieldPerp& x0) {
         tridagMatrix(&a(kz, 0), &b(kz, 0), &c(kz, 0), &bcmplx(kz, 0), jy,
                      kz,    // wave number index
                      kwave, // kwave (inverse wave length)
-                     global_flags, inner_boundary_flags, outer_boundary_flags, &Acoef,
-                     &C1coef, &C2coef, &Dcoef,
+                     &Acoef, &C1coef, &C2coef, &Dcoef,
                      false,  // Don't include guard cells in arrays
                      false); // Z domain not periodic
       }
@@ -181,14 +182,14 @@ FieldPerp LaplaceCyclic::solve(const FieldPerp& rhs, const FieldPerp& x0) {
     cr->solve(bcmplx, xcmplx);
 
     // FFT back to real space
-    BOUT_OMP(parallel)
+    BOUT_OMP_PERF(parallel)
     {
       /// Create a local thread-scope working array
 
       // ZFFT routine expects input of this length
       auto k1d = Array<dcomplex>(localmesh->LocalNz);
 
-      BOUT_OMP(for nowait)
+      BOUT_OMP_PERF(for nowait)
       for (int ix = xs; ix <= xe; ix++) {
         for (int kz = 0; kz < nmode; kz++) {
           k1d[kz] = xcmplx(kz, ix - xs);
@@ -206,7 +207,7 @@ FieldPerp LaplaceCyclic::solve(const FieldPerp& rhs, const FieldPerp& x0) {
     }
   } else {
     const BoutReal zlength = getUniform(coords->zlength());
-    BOUT_OMP(parallel)
+    BOUT_OMP_PERF(parallel)
     {
       /// Create a local thread-scope working array
       // ZFFT routine expects input of this length
@@ -214,13 +215,13 @@ FieldPerp LaplaceCyclic::solve(const FieldPerp& rhs, const FieldPerp& x0) {
 
       // Loop over X indices, including boundaries but not guard
       // cells (unless periodic in x)
-      BOUT_OMP(for)
+      BOUT_OMP_PERF(for)
       for (int ix = xs; ix <= xe; ix++) {
         // Take FFT in Z direction, apply shift, and put result in k1d
 
-        if (((ix < inbndry) && (inner_boundary_flags & INVERT_SET) && localmesh->firstX())
+        if (((ix < inbndry) && isInnerBoundaryFlagSetOnFirstX(INVERT_SET))
             || ((localmesh->LocalNx - ix - 1 < outbndry)
-                && (outer_boundary_flags & INVERT_SET) && localmesh->lastX())) {
+                && isOuterBoundaryFlagSetOnLastX(INVERT_SET))) {
           // Use the values in x0 in the boundary
           rfft(x0[ix], localmesh->LocalNz, std::begin(k1d));
         } else {
@@ -235,14 +236,13 @@ FieldPerp LaplaceCyclic::solve(const FieldPerp& rhs, const FieldPerp& x0) {
 
       // Get elements of the tridiagonal matrix
       // including boundary conditions
-      BOUT_OMP(for nowait)
+      BOUT_OMP_PERF(for nowait)
       for (int kz = 0; kz < nmode; kz++) {
         BoutReal kwave = kz * 2.0 * PI / zlength; // wave number is 1/[rad]
         tridagMatrix(&a(kz, 0), &b(kz, 0), &c(kz, 0), &bcmplx(kz, 0), jy,
                      kz,    // True for the component constant (DC) in Z
                      kwave, // Z wave number
-                     global_flags, inner_boundary_flags, outer_boundary_flags, &Acoef,
-                     &C1coef, &C2coef, &Dcoef,
+                     &Acoef, &C1coef, &C2coef, &Dcoef,
                      false); // Don't include guard cells in arrays
       }
     }
@@ -269,15 +269,15 @@ FieldPerp LaplaceCyclic::solve(const FieldPerp& rhs, const FieldPerp& x0) {
     }
 
     // FFT back to real space
-    BOUT_OMP(parallel)
+    BOUT_OMP_PERF(parallel)
     {
       /// Create a local thread-scope working array
       // ZFFT routine expects input of this length
       auto k1d = Array<dcomplex>((localmesh->LocalNz) / 2 + 1);
 
-      const bool zero_DC = (global_flags & INVERT_ZERO_DC) != 0;
+      const bool zero_DC = isGlobalFlagSet(INVERT_ZERO_DC);
 
-      BOUT_OMP(for nowait)
+      BOUT_OMP_PERF(for nowait)
       for (int ix = xs; ix <= xe; ix++) {
         if (zero_DC) {
           k1d[0] = 0.;
@@ -316,13 +316,13 @@ Field3D LaplaceCyclic::solve(const Field3D& rhs, const Field3D& x0) {
 
   // If the flags to assign that only one guard cell should be used is set
   int inbndry = localmesh->xstart, outbndry = localmesh->xstart;
-  if (((global_flags & INVERT_BOTH_BNDRY_ONE) != 0) || (localmesh->xstart < 2)) {
+  if (isGlobalFlagSet(INVERT_BOTH_BNDRY_ONE) || (localmesh->xstart < 2)) {
     inbndry = outbndry = 1;
   }
-  if ((inner_boundary_flags & INVERT_BNDRY_ONE) != 0) {
+  if (isInnerBoundaryFlagSet(INVERT_BNDRY_ONE)) {
     inbndry = 1;
   }
-  if ((outer_boundary_flags & INVERT_BNDRY_ONE) != 0) {
+  if (isOuterBoundaryFlagSet(INVERT_BNDRY_ONE)) {
     outbndry = 1;
   }
 
@@ -350,6 +350,9 @@ Field3D LaplaceCyclic::solve(const Field3D& rhs, const Field3D& x0) {
   const int nsys = nmode * ny;  // Number of systems of equations to solve
   const int nxny = nx * ny;     // Number of points in X-Y
 
+  // ny is a divisor below (ind / ny); this check is just to silence static analysis
+  ASSERT0(ny > 0);
+
   auto a3D = Matrix<dcomplex>(nsys, nx);
   auto b3D = Matrix<dcomplex>(nsys, nx);
   auto c3D = Matrix<dcomplex>(nsys, nx);
@@ -358,7 +361,7 @@ Field3D LaplaceCyclic::solve(const Field3D& rhs, const Field3D& x0) {
   auto bcmplx3D = Matrix<dcomplex>(nsys, nx);
 
   if (dst) {
-    BOUT_OMP(parallel)
+    BOUT_OMP_PERF(parallel)
     {
       /// Create a local thread-scope working array
       // ZFFT routine expects input of this length
@@ -366,7 +369,7 @@ Field3D LaplaceCyclic::solve(const Field3D& rhs, const Field3D& x0) {
 
       // Loop over X and Y indices, including boundaries but not guard cells.
       // (unless periodic in x)
-      BOUT_OMP(for)
+      BOUT_OMP_PERF(for)
       for (int ind = 0; ind < nxny; ++ind) {
         // ind = (ix - xs)*(ye - ys + 1) + (iy - ys)
         int ix = xs + ind / ny;
@@ -374,10 +377,9 @@ Field3D LaplaceCyclic::solve(const Field3D& rhs, const Field3D& x0) {
 
         // Take DST in Z direction and put result in k1d
 
-        if (((ix < inbndry) && ((inner_boundary_flags & INVERT_SET) != 0)
-             && localmesh->firstX())
+        if (((ix < inbndry) && isInnerBoundaryFlagSetOnFirstX(INVERT_SET))
             || ((localmesh->LocalNx - ix - 1 < outbndry)
-                && ((outer_boundary_flags & INVERT_SET) != 0) && localmesh->lastX())) {
+                && isOuterBoundaryFlagSetOnLastX(INVERT_SET))) {
           // Use the values in x0 in the boundary
           DST(x0(ix, iy) + 1, localmesh->LocalNz - 2, std::begin(k1d));
         } else {
@@ -393,7 +395,7 @@ Field3D LaplaceCyclic::solve(const Field3D& rhs, const Field3D& x0) {
       // Get elements of the tridiagonal matrix
       // including boundary conditions
       const BoutReal zlen = getUniform(coords->dz) * (localmesh->LocalNz - 3);
-      BOUT_OMP(for nowait)
+      BOUT_OMP_PERF(for nowait)
       for (int ind = 0; ind < nsys; ind++) {
         // ind = (iy - ys) * nmode + kz
         int iy = ys + ind / nmode;
@@ -405,8 +407,7 @@ Field3D LaplaceCyclic::solve(const Field3D& rhs, const Field3D& x0) {
         tridagMatrix(&a3D(ind, 0), &b3D(ind, 0), &c3D(ind, 0), &bcmplx3D(ind, 0), iy,
                      kz,    // wave number index
                      kwave, // kwave (inverse wave length)
-                     global_flags, inner_boundary_flags, outer_boundary_flags, &Acoef,
-                     &C1coef, &C2coef, &Dcoef,
+                     &Acoef, &C1coef, &C2coef, &Dcoef,
                      false,  // Don't include guard cells in arrays
                      false); // Z domain not periodic
       }
@@ -417,13 +418,13 @@ Field3D LaplaceCyclic::solve(const Field3D& rhs, const Field3D& x0) {
     cr->solve(bcmplx3D, xcmplx3D);
 
     // FFT back to real space
-    BOUT_OMP(parallel)
+    BOUT_OMP_PERF(parallel)
     {
       /// Create a local thread-scope working array
       // ZFFT routine expects input of length LocalNz
       auto k1d = Array<dcomplex>(localmesh->LocalNz);
 
-      BOUT_OMP(for nowait)
+      BOUT_OMP_PERF(for nowait)
       for (int ind = 0; ind < nxny; ++ind) { // Loop over X and Y
         // ind = (ix - xs)*(ye - ys + 1) + (iy - ys)
         int ix = xs + ind / ny;
@@ -445,7 +446,7 @@ Field3D LaplaceCyclic::solve(const Field3D& rhs, const Field3D& x0) {
     }
   } else {
     const BoutReal zlength = getUniform(coords->zlength());
-    BOUT_OMP(parallel)
+    BOUT_OMP_PERF(parallel)
     {
       /// Create a local thread-scope working array
       // ZFFT routine expects input of this length
@@ -454,7 +455,7 @@ Field3D LaplaceCyclic::solve(const Field3D& rhs, const Field3D& x0) {
       // Loop over X and Y indices, including boundaries but not guard cells
       // (unless periodic in x)
 
-      BOUT_OMP(for)
+      BOUT_OMP_PERF(for)
       for (int ind = 0; ind < nxny; ++ind) {
         // ind = (ix - xs)*(ye - ys + 1) + (iy - ys)
         int ix = xs + ind / ny;
@@ -462,10 +463,9 @@ Field3D LaplaceCyclic::solve(const Field3D& rhs, const Field3D& x0) {
 
         // Take FFT in Z direction, apply shift, and put result in k1d
 
-        if (((ix < inbndry) && ((inner_boundary_flags & INVERT_SET) != 0)
-             && localmesh->firstX())
+        if (((ix < inbndry) && isInnerBoundaryFlagSetOnFirstX(INVERT_SET))
             || ((localmesh->LocalNx - ix - 1 < outbndry)
-                && ((outer_boundary_flags & INVERT_SET) != 0) && localmesh->lastX())) {
+                && isOuterBoundaryFlagSetOnLastX(INVERT_SET))) {
           // Use the values in x0 in the boundary
           rfft(x0(ix, iy), localmesh->LocalNz, std::begin(k1d));
         } else {
@@ -480,7 +480,7 @@ Field3D LaplaceCyclic::solve(const Field3D& rhs, const Field3D& x0) {
 
       // Get elements of the tridiagonal matrix
       // including boundary conditions
-      BOUT_OMP(for nowait)
+      BOUT_OMP_PERF(for nowait)
       for (int ind = 0; ind < nsys; ind++) {
         // ind = (iy - ys) * nmode + kz
         int iy = ys + ind / nmode;
@@ -490,8 +490,7 @@ Field3D LaplaceCyclic::solve(const Field3D& rhs, const Field3D& x0) {
         tridagMatrix(&a3D(ind, 0), &b3D(ind, 0), &c3D(ind, 0), &bcmplx3D(ind, 0), iy,
                      kz,    // True for the component constant (DC) in Z
                      kwave, // Z wave number
-                     global_flags, inner_boundary_flags, outer_boundary_flags, &Acoef,
-                     &C1coef, &C2coef, &Dcoef,
+                     &Acoef, &C1coef, &C2coef, &Dcoef,
                      false); // Don't include guard cells in arrays
       }
     }
@@ -502,9 +501,8 @@ Field3D LaplaceCyclic::solve(const Field3D& rhs, const Field3D& x0) {
 
     if (localmesh->periodicX) {
       // Subtract X average of kz=0 mode
-      BoutReal local[ny + 1];
+      std::vector<BoutReal> local(ny + 1, 0.0);
       for (int y = 0; y < ny; y++) {
-        local[y] = 0.0;
         for (int ix = xs; ix <= xe; ix++) {
           local[y] += xcmplx3D(y * nmode, ix - xs).real();
         }
@@ -512,8 +510,9 @@ Field3D LaplaceCyclic::solve(const Field3D& rhs, const Field3D& x0) {
       local[ny] = static_cast<BoutReal>(xe - xs + 1);
 
       // Global reduce
-      BoutReal global[ny + 1];
-      MPI_Allreduce(local, global, ny + 1, MPI_DOUBLE, MPI_SUM, localmesh->getXcomm());
+      std::vector<BoutReal> global(ny + 1, 0.0);
+      MPI_Allreduce(local.data(), global.data(), ny + 1, MPI_DOUBLE, MPI_SUM,
+                    localmesh->getXcomm());
       // Subtract average from kz=0 modes
       for (int y = 0; y < ny; y++) {
         BoutReal avg = global[y] / global[ny];
@@ -524,15 +523,15 @@ Field3D LaplaceCyclic::solve(const Field3D& rhs, const Field3D& x0) {
     }
 
     // FFT back to real space
-    BOUT_OMP(parallel)
+    BOUT_OMP_PERF(parallel)
     {
       /// Create a local thread-scope working array
       auto k1d = Array<dcomplex>((localmesh->LocalNz) / 2
                                  + 1); // ZFFT routine expects input of this length
 
-      const bool zero_DC = (global_flags & INVERT_ZERO_DC) != 0;
+      const bool zero_DC = isGlobalFlagSet(INVERT_ZERO_DC);
 
-      BOUT_OMP(for nowait)
+      BOUT_OMP_PERF(for nowait)
       for (int ind = 0; ind < nxny; ++ind) { // Loop over X and Y
         // ind = (ix - xs)*(ye - ys + 1) + (iy - ys)
         int ix = xs + ind / ny;
diff --git a/src/invert/laplace/impls/cyclic/cyclic_laplace.hxx b/src/invert/laplace/impls/cyclic/cyclic_laplace.hxx
index 841f0a4e05..febffa7d18 100644
--- a/src/invert/laplace/impls/cyclic/cyclic_laplace.hxx
+++ b/src/invert/laplace/impls/cyclic/cyclic_laplace.hxx
@@ -28,8 +28,8 @@
 
 class LaplaceCyclic;
 
-#ifndef __LAP_CYCLIC_H__
-#define __LAP_CYCLIC_H__
+#ifndef BOUT_LAP_CYCLIC_H
+#define BOUT_LAP_CYCLIC_H
 
 #include "bout/build_config.hxx"
 #include "bout/invert_laplace.hxx"
@@ -125,4 +125,4 @@ private:
 
 #endif // BOUT_USE_METRIC_3D
 
-#endif // __SPT_H__
+#endif // BOUT_LAP_CYCLIC_H
diff --git a/src/invert/laplace/impls/hypre3d/hypre3d_laplace.cxx b/src/invert/laplace/impls/hypre3d/hypre3d_laplace.cxx
index c74e184be3..d789e5e408 100644
--- a/src/invert/laplace/impls/hypre3d/hypre3d_laplace.cxx
+++ b/src/invert/laplace/impls/hypre3d/hypre3d_laplace.cxx
@@ -99,7 +99,7 @@ LaplaceHypre3d::LaplaceHypre3d(Options* opt, const CELL_LOC loc, Mesh* mesh_in,
 
   // Set up boundary conditions in operator
   BOUT_FOR_SERIAL(i, indexer->getRegionInnerX()) {
-    if (inner_boundary_flags & INVERT_AC_GRAD) {
+    if (isInnerBoundaryFlagSet(INVERT_AC_GRAD)) {
       // Neumann on inner X boundary
       operator3D(i, i) = -1. / coords->dx[i] / sqrt(coords->g_11[i]);
       operator3D(i, i.xp()) = 1. / coords->dx[i] / sqrt(coords->g_11[i]);
@@ -111,7 +111,7 @@ LaplaceHypre3d::LaplaceHypre3d(Options* opt, const CELL_LOC loc, Mesh* mesh_in,
   }
 
   BOUT_FOR_SERIAL(i, indexer->getRegionOuterX()) {
-    if (outer_boundary_flags & INVERT_AC_GRAD) {
+    if (isOuterBoundaryFlagSet(INVERT_AC_GRAD)) {
       // Neumann on outer X boundary
       operator3D(i, i) = 1. / coords->dx[i] / sqrt(coords->g_11[i]);
       operator3D(i, i.xm()) = -1. / coords->dx[i] / sqrt(coords->g_11[i]);
@@ -180,9 +180,9 @@ Field3D LaplaceHypre3d::solve(const Field3D& b_in, const Field3D& x0) {
   // Adjust vectors to represent boundary conditions and check that
   // boundary cells are finite
   BOUT_FOR_SERIAL(i, indexer->getRegionInnerX()) {
-    const BoutReal val = (inner_boundary_flags & INVERT_SET) ? x0[i] : 0.;
+    const BoutReal val = isInnerBoundaryFlagSet(INVERT_SET) ? x0[i] : 0.;
     ASSERT1(std::isfinite(val));
-    if (!(inner_boundary_flags & INVERT_RHS)) {
+    if (!(isInnerBoundaryFlagSet(INVERT_RHS))) {
       b[i] = val;
     } else {
       ASSERT1(std::isfinite(b[i]));
@@ -190,9 +190,9 @@ Field3D LaplaceHypre3d::solve(const Field3D& b_in, const Field3D& x0) {
   }
 
   BOUT_FOR_SERIAL(i, indexer->getRegionOuterX()) {
-    const BoutReal val = (outer_boundary_flags & INVERT_SET) ? x0[i] : 0.;
+    const BoutReal val = (isOuterBoundaryFlagSet(INVERT_SET)) ? x0[i] : 0.;
     ASSERT1(std::isfinite(val));
-    if (!(outer_boundary_flags & INVERT_RHS)) {
+    if (!(isOuterBoundaryFlagSet(INVERT_RHS))) {
       b[i] = val;
     } else {
       ASSERT1(std::isfinite(b[i]));
diff --git a/src/invert/laplace/impls/hypre3d/hypre3d_laplace.hxx b/src/invert/laplace/impls/hypre3d/hypre3d_laplace.hxx
index c9c44ac19e..05a0604c4f 100644
--- a/src/invert/laplace/impls/hypre3d/hypre3d_laplace.hxx
+++ b/src/invert/laplace/impls/hypre3d/hypre3d_laplace.hxx
@@ -30,8 +30,8 @@ class LaplaceHypre3d;
 
 #include "bout/build_config.hxx"
 
-#ifndef __LAPLACE_HYPRE3D_H__
-#define __LAPLACE_HYPRE3D_H__
+#ifndef BOUT_LAPLACE_HYPRE3D_H
+#define BOUT_LAPLACE_HYPRE3D_H
 
 #if BOUT_HAS_HYPRE
 
@@ -227,4 +227,4 @@ public:
 
 #endif // BOUT_HAS_HYPRE
 
-#endif //__LAPLACE_HYPRE3D_H__
+#endif // BOUT_LAPLACE_HYPRE3D_H
diff --git a/src/invert/laplace/impls/iterative_parallel_tri/iterative_parallel_tri.cxx b/src/invert/laplace/impls/iterative_parallel_tri/iterative_parallel_tri.cxx
index 2457ff3b8e..f79463769a 100644
--- a/src/invert/laplace/impls/iterative_parallel_tri/iterative_parallel_tri.cxx
+++ b/src/invert/laplace/impls/iterative_parallel_tri/iterative_parallel_tri.cxx
@@ -293,12 +293,10 @@ FieldPerp LaplaceIPT::solve(const FieldPerp& b, const FieldPerp& x0) {
    */
   auto bcmplx = Matrix<dcomplex>(nmode, ncx);
 
-  const bool invert_inner_boundary =
-      isInnerBoundaryFlagSet(INVERT_SET) and localmesh->firstX();
-  const bool invert_outer_boundary =
-      isOuterBoundaryFlagSet(INVERT_SET) and localmesh->lastX();
+  const bool invert_inner_boundary = isInnerBoundaryFlagSetOnFirstX(INVERT_SET);
+  const bool invert_outer_boundary = isOuterBoundaryFlagSetOnLastX(INVERT_SET);
 
-  BOUT_OMP(parallel for)
+  BOUT_OMP_PERF(parallel for)
   for (int ix = 0; ix < ncx; ix++) {
     /* This for loop will set the bk (initialized by the constructor)
      * bk is the z fourier modes of b in z
@@ -345,8 +343,7 @@ FieldPerp LaplaceIPT::solve(const FieldPerp& b, const FieldPerp& x0) {
                  kz,
                  // wave number (different from kz only if we are taking a part
                  // of the z-domain [and not from 0 to 2*pi])
-                 kz * kwaveFactor, global_flags, inner_boundary_flags,
-                 outer_boundary_flags, &A, &C, &D);
+                 kz * kwaveFactor, &A, &C, &D);
 
     // Patch up internal boundaries
     if (not localmesh->lastX()) {
diff --git a/src/invert/laplace/impls/iterative_parallel_tri/iterative_parallel_tri.hxx b/src/invert/laplace/impls/iterative_parallel_tri/iterative_parallel_tri.hxx
index 563ae7e61f..02e3eca06c 100644
--- a/src/invert/laplace/impls/iterative_parallel_tri/iterative_parallel_tri.hxx
+++ b/src/invert/laplace/impls/iterative_parallel_tri/iterative_parallel_tri.hxx
@@ -26,8 +26,8 @@
 
 class LaplaceIPT;
 
-#ifndef __IPT_H__
-#define __IPT_H__
+#ifndef BOUT_IPT_H
+#define BOUT_IPT_H
 
 #include "bout/build_config.hxx"
 #include "bout/invert_laplace.hxx"
@@ -234,16 +234,8 @@ private:
 
   /// First and last interior points xstart, xend
   int xs, xe;
-
-  bool isGlobalFlagSet(int flag) const { return (global_flags & flag) != 0; }
-  bool isInnerBoundaryFlagSet(int flag) const {
-    return (inner_boundary_flags & flag) != 0;
-  }
-  bool isOuterBoundaryFlagSet(int flag) const {
-    return (outer_boundary_flags & flag) != 0;
-  }
 };
 
 #endif // BOUT_USE_METRIC_3D
 
-#endif // __IPT_H__
+#endif // BOUT_IPT_H
diff --git a/src/invert/laplace/impls/multigrid/multigrid_alg.cxx b/src/invert/laplace/impls/multigrid/multigrid_alg.cxx
index 88556e02ad..fa97a43116 100644
--- a/src/invert/laplace/impls/multigrid/multigrid_alg.cxx
+++ b/src/invert/laplace/impls/multigrid/multigrid_alg.cxx
@@ -104,14 +104,14 @@ void MultigridAlg::getSolution(BoutReal* x, BoutReal* b, int flag) {
       Array<BoutReal> r(ldim);
       for (int n = 1; n < flag; n++) {
         residualVec(level, x, b, std::begin(r));
-        BOUT_OMP(parallel default(shared))
-        BOUT_OMP(for)
+        BOUT_OMP_PERF(parallel default(shared))
+        BOUT_OMP_PERF(for)
         for (int i = 0; i < ldim; i++) {
           y[i] = 0.0;
         }
         cycleMG(level, std::begin(y), std::begin(r));
-        BOUT_OMP(parallel default(shared))
-        BOUT_OMP(for)
+        BOUT_OMP_PERF(parallel default(shared))
+        BOUT_OMP_PERF(for)
         for (int i = 0; i < ldim; i++) {
           x[i] = x[i] + y[i];
         }
@@ -135,8 +135,8 @@ void MultigridAlg::cycleMG(int level, BoutReal* sol, BoutReal* rhs) {
 
     projection(level, std::begin(r), std::begin(pr));
 
-    BOUT_OMP(parallel default(shared))
-    BOUT_OMP(for)
+    BOUT_OMP_PERF(parallel default(shared))
+    BOUT_OMP_PERF(for)
     for (int i = 0; i < (lnx[level - 1] + 2) * (lnz[level - 1] + 2); i++) {
       y[i] = 0.0;
     }
@@ -144,8 +144,8 @@ void MultigridAlg::cycleMG(int level, BoutReal* sol, BoutReal* rhs) {
     cycleMG(level - 1, std::begin(y), std::begin(pr));
 
     prolongation(level - 1, std::begin(y), std::begin(iy));
-    BOUT_OMP(parallel default(shared))
-    BOUT_OMP(for)
+    BOUT_OMP_PERF(parallel default(shared))
+    BOUT_OMP_PERF(for)
     for (int i = 0; i < (lnx[level] + 2) * (lnz[level] + 2); i++) {
       sol[i] += iy[i];
     }
@@ -156,15 +156,15 @@ void MultigridAlg::cycleMG(int level, BoutReal* sol, BoutReal* rhs) {
 
 void MultigridAlg::projection(int level, BoutReal* r, BoutReal* pr) {
 
-  BOUT_OMP(parallel default(shared))
+  BOUT_OMP_PERF(parallel default(shared))
   {
-    BOUT_OMP(for)
+    BOUT_OMP_PERF(for)
     for (int i = 0; i < (lnx[level - 1] + 2) * (lnz[level - 1] + 2); i++) {
       pr[i] = 0.;
     }
     int xend = lnx[level - 1] + 1;
     int zend = lnz[level - 1] + 1;
-    BOUT_OMP(for collapse(2))
+    BOUT_OMP_PERF(for collapse(2))
     for (int i = 1; i < xend; i++) {
       for (int k = 1; k < zend; k++) {
         int i2 = 2 * i - 1;
@@ -183,16 +183,16 @@ void MultigridAlg::projection(int level, BoutReal* r, BoutReal* pr) {
 
 void MultigridAlg::prolongation(int level, BoutReal* x, BoutReal* ix) {
 
-  BOUT_OMP(parallel default(shared))
+  BOUT_OMP_PERF(parallel default(shared))
   {
-    BOUT_OMP(for)
+    BOUT_OMP_PERF(for)
     for (int i = 0; i < (lnx[level + 1] + 2) * (lnz[level + 1] + 2); i++) {
       ix[i] = 0.;
     }
 
     int xend = lnx[level] + 1;
     int zend = lnz[level] + 1;
-    BOUT_OMP(for collapse(2))
+    BOUT_OMP_PERF(for collapse(2))
     for (int i = 1; i < xend; i++) {
       for (int k = 1; k < zend; k++) {
         int i2 = 2 * i - 1;
@@ -219,16 +219,16 @@ void MultigridAlg::smoothings(int level, BoutReal* x, BoutReal* b) {
   dim = mm * (lnx[level] + 2);
   if (mgsm == 0) {
     Array<BoutReal> x0(dim);
-    BOUT_OMP(parallel default(shared))
+    BOUT_OMP_PERF(parallel default(shared))
     for (int num = 0; num < 2; num++) {
-      BOUT_OMP(for)
+      BOUT_OMP_PERF(for)
       for (int i = 0; i < dim; i++) {
         x0[i] = x[i];
       }
 
       int xend = lnx[level] + 1;
       int zend = lnz[level] + 1;
-      BOUT_OMP(for collapse(2))
+      BOUT_OMP_PERF(for collapse(2))
       for (int i = 1; i < xend; i++) {
         for (int k = 1; k < zend; k++) {
           int nn = i * mm + k;
@@ -313,8 +313,8 @@ void MultigridAlg::pGMRES(BoutReal* sol, BoutReal* rhs, int level, int iplag) {
   Array<BoutReal> q(ldim);
   Array<BoutReal> r(ldim);
 
-  BOUT_OMP(parallel default(shared))
-  BOUT_OMP(for)
+  BOUT_OMP_PERF(parallel default(shared))
+  BOUT_OMP_PERF(for)
   for (int i = 0; i < ldim; i++) {
     sol[i] = 0.0;
   }
@@ -335,8 +335,8 @@ void MultigridAlg::pGMRES(BoutReal* sol, BoutReal* rhs, int level, int iplag) {
     delete[] v;
     return;
   }
-  BOUT_OMP(parallel default(shared))
-  BOUT_OMP(for)
+  BOUT_OMP_PERF(parallel default(shared))
+  BOUT_OMP_PERF(for)
   for (int i = 0; i < ldim; i++) {
     r[i] = 0.0;
   }
@@ -345,8 +345,8 @@ void MultigridAlg::pGMRES(BoutReal* sol, BoutReal* rhs, int level, int iplag) {
   } else {
     cycleMG(level, std::begin(r), rhs);
   }
-  BOUT_OMP(parallel default(shared))
-  BOUT_OMP(for)
+  BOUT_OMP_PERF(parallel default(shared))
+  BOUT_OMP_PERF(for)
   for (int i = 0; i < ldim; i++) {
     v[0][i] = r[i];
   }
@@ -360,21 +360,21 @@ void MultigridAlg::pGMRES(BoutReal* sol, BoutReal* rhs, int level, int iplag) {
     }
     a0 = 1.0 / a1;
     g[0] = a1;
-    BOUT_OMP(parallel default(shared))
+    BOUT_OMP_PERF(parallel default(shared))
     {
-      BOUT_OMP(for)
+      BOUT_OMP_PERF(for)
       for (int i = 0; i < ldim; i++) {
         v[0][i] *= a0;
       }
-      BOUT_OMP(for)
+      BOUT_OMP_PERF(for)
       for (int i = 1; i < MAXGM + 1; i++) {
         g[i] = 0.0;
       }
     }
     for (it = 0; it < MAXGM; it++) {
       multiAVec(level, v[it], std::begin(q));
-      BOUT_OMP(parallel default(shared))
-      BOUT_OMP(for)
+      BOUT_OMP_PERF(parallel default(shared))
+      BOUT_OMP_PERF(for)
       for (int i = 0; i < ldim; i++) {
         v[it + 1][i] = 0.0;
       }
@@ -407,8 +407,8 @@ void MultigridAlg::pGMRES(BoutReal* sol, BoutReal* rhs, int level, int iplag) {
       }
       a0 = 1.0 / a1;
       h[it + 1][it] = a1;
-      BOUT_OMP(parallel default(shared))
-      BOUT_OMP(for)
+      BOUT_OMP_PERF(parallel default(shared))
+      BOUT_OMP_PERF(for)
       for (int i = 0; i < ldim; i++) {
         v[it + 1][i] *= a0;
       }
@@ -444,13 +444,13 @@ void MultigridAlg::pGMRES(BoutReal* sol, BoutReal* rhs, int level, int iplag) {
         }
         y[i] = y[i] / h[i][i];
       }
-      BOUT_OMP(parallel default(shared))
+      BOUT_OMP_PERF(parallel default(shared))
       {
-        BOUT_OMP(for)
+        BOUT_OMP_PERF(for)
         for (int i = 0; i < ldim; i++) {
           p[i] = sol[i];
         }
-        BOUT_OMP(for)
+        BOUT_OMP_PERF(for)
         for (int k = 0; k < ldim; k++) {
           for (int i = 0; i <= it; i++) {
             p[k] += y[i] * v[i][k];
@@ -492,8 +492,8 @@ void MultigridAlg::pGMRES(BoutReal* sol, BoutReal* rhs, int level, int iplag) {
       perror = error;
     }
     /* Restart with new initial */
-    BOUT_OMP(parallel default(shared))
-    BOUT_OMP(for)
+    BOUT_OMP_PERF(parallel default(shared))
+    BOUT_OMP_PERF(for)
     for (int i = 0; i < ldim; i++) {
       v[0][i] = 0.0;
     }
@@ -503,8 +503,8 @@ void MultigridAlg::pGMRES(BoutReal* sol, BoutReal* rhs, int level, int iplag) {
       cycleMG(level, v[0], std::begin(r));
     }
 
-    BOUT_OMP(parallel default(shared))
-    BOUT_OMP(for)
+    BOUT_OMP_PERF(parallel default(shared))
+    BOUT_OMP_PERF(for)
     for (int i = 0; i < ldim; i++) {
       sol[i] = p[i];
     }
@@ -559,11 +559,11 @@ BoutReal MultigridAlg::vectorProd(int level, BoutReal* x, BoutReal* y) {
 
   BoutReal val;
   BoutReal ini_e = 0.0;
-  BOUT_OMP(parallel default(shared))
+  BOUT_OMP_PERF(parallel default(shared))
   {
     int xend = lnx[level] + 1;
     int zend = lnz[level] + 1;
-    BOUT_OMP(for reduction(+:ini_e) collapse(2))
+    BOUT_OMP_PERF(for reduction(+:ini_e) collapse(2))
     for (int i = 1; i < xend; i++) {
       for (int k = 1; k < zend; k++) {
         int ii = i * (lnz[level] + 2) + k;
@@ -583,16 +583,16 @@ BoutReal MultigridAlg::vectorProd(int level, BoutReal* x, BoutReal* y) {
 void MultigridAlg::multiAVec(int level, BoutReal* x, BoutReal* b) {
 
   int mm = lnz[level] + 2;
-  BOUT_OMP(parallel default(shared))
+  BOUT_OMP_PERF(parallel default(shared))
   {
-    BOUT_OMP(for)
+    BOUT_OMP_PERF(for)
     for (int i = 0; i < mm * (lnx[level] + 2); i++) {
       b[i] = 0.0;
     }
 
     int xend = lnx[level] + 1;
     int zend = lnz[level] + 1;
-    BOUT_OMP(for collapse(2))
+    BOUT_OMP_PERF(for collapse(2))
     for (int i = 1; i < xend; i++) {
       for (int k = 1; k < zend; k++) {
         int nn = i * mm + k;
@@ -614,16 +614,16 @@ void MultigridAlg::residualVec(int level, BoutReal* x, BoutReal* b, BoutReal* r)
 
   int mm;
   mm = lnz[level] + 2;
-  BOUT_OMP(parallel default(shared))
+  BOUT_OMP_PERF(parallel default(shared))
   {
-    BOUT_OMP(for)
+    BOUT_OMP_PERF(for)
     for (int i = 0; i < mm * (lnx[level] + 2); i++) {
       r[i] = 0.0;
     }
 
     int xend = lnx[level] + 1;
     int zend = lnz[level] + 1;
-    BOUT_OMP(for collapse(2))
+    BOUT_OMP_PERF(for collapse(2))
     for (int i = 1; i < xend; i++) {
       for (int k = 1; k < zend; k++) {
         int nn = i * mm + k;
@@ -646,16 +646,16 @@ void MultigridAlg::setMatrixC(int level) {
 
   BoutReal ratio = 8.0;
 
-  BOUT_OMP(parallel default(shared))
+  BOUT_OMP_PERF(parallel default(shared))
   {
-    BOUT_OMP(for)
+    BOUT_OMP_PERF(for)
     for (int i = 0; i < (lnx[level - 1] + 2) * (lnz[level - 1] + 2) * 9; i++) {
       matmg[level - 1][i] = 0.0;
     }
 
     int xend = lnx[level - 1] + 1;
     int zend = lnz[level - 1] + 1;
-    BOUT_OMP(for collapse(2))
+    BOUT_OMP_PERF(for collapse(2))
     for (int i = 1; i < xend; i++) {
       for (int k = 1; k < zend; k++) {
         int i2 = 2 * i - 1;
@@ -809,8 +809,8 @@ void MultigridAlg::solveMG(BoutReal* sol, BoutReal* rhs, int level) {
   BoutReal ini_e, perror, error, rederr;
   int ldim = (lnx[level] + 2) * (lnz[level] + 2);
 
-  BOUT_OMP(parallel default(shared))
-  BOUT_OMP(for)
+  BOUT_OMP_PERF(parallel default(shared))
+  BOUT_OMP_PERF(for)
   for (int i = 0; i < ldim; i++) {
     sol[i] = 0.0;
   }
@@ -825,22 +825,22 @@ void MultigridAlg::solveMG(BoutReal* sol, BoutReal* rhs, int level) {
   }
   Array<BoutReal> y(ldim);
   Array<BoutReal> r(ldim);
-  BOUT_OMP(parallel default(shared))
-   BOUT_OMP(for)
+  BOUT_OMP_PERF(parallel default(shared))
+   BOUT_OMP_PERF(for)
    for (int i = 0; i < ldim; i++) {
     r[i] = rhs[i];
    }
 
    perror = ini_e;
    for (m = 0; m < MAXIT; m++) {
-    BOUT_OMP(parallel default(shared))
-    BOUT_OMP(for)
+    BOUT_OMP_PERF(parallel default(shared))
+    BOUT_OMP_PERF(for)
     for (int i = 0; i < ldim; i++) {
       y[i] = 0.0;
     }
     cycleMG(level, std::begin(y), std::begin(r));
-    BOUT_OMP(parallel default(shared))
-    BOUT_OMP(for)
+    BOUT_OMP_PERF(parallel default(shared))
+    BOUT_OMP_PERF(for)
     for (int i = 0; i < ldim; i++) {
       sol[i] = sol[i] + y[i];
     }
diff --git a/src/invert/laplace/impls/multigrid/multigrid_laplace.cxx b/src/invert/laplace/impls/multigrid/multigrid_laplace.cxx
index beb9262ed8..c5076cd499 100644
--- a/src/invert/laplace/impls/multigrid/multigrid_laplace.cxx
+++ b/src/invert/laplace/impls/multigrid/multigrid_laplace.cxx
@@ -67,7 +67,6 @@ LaplaceMultigrid::LaplaceMultigrid(Options* opt, const CELL_LOC loc, Mesh* mesh_
   opts->get("atol", atol, pow(10.0, -20), true);
   opts->get("dtol", dtol, pow(10.0, 5), true);
   opts->get("smtype", mgsm, 1, true);
-#if BOUT_USE_OPENMP
   if (mgsm != 0 && omp_get_max_threads() > 1) {
     output_warn << "WARNING: in multigrid Laplace solver, for smtype!=0 the smoothing "
                    "cannot be parallelised with OpenMP threads."
@@ -75,7 +74,6 @@ LaplaceMultigrid::LaplaceMultigrid(Options* opt, const CELL_LOC loc, Mesh* mesh_
                 << "         Consider using smtype=0 instead when using OpenMP threads."
                 << endl;
   }
-#endif
   opts->get("jacomega", omega, 0.8, true);
   opts->get("solvertype", mgplag, 1, true);
   opts->get("cftype", cftype, 0, true);
@@ -86,19 +84,18 @@ LaplaceMultigrid::LaplaceMultigrid(Options* opt, const CELL_LOC loc, Mesh* mesh_
   // Initialize, allocate memory, etc.
   comms_tagbase = 385; // Some random number
 
-  int implemented_global_flags = INVERT_START_NEW;
-  if (global_flags & ~implemented_global_flags) {
+  constexpr int implemented_global_flags = INVERT_START_NEW;
+  if (isGlobalFlagSet(~implemented_global_flags)) {
     throw BoutException("Attempted to set Laplacian inversion flag that is not "
                         "implemented in LaplaceMultigrid.");
   }
-  int implemented_boundary_flags =
-      INVERT_AC_GRAD + INVERT_SET
-      + INVERT_DC_GRAD; // INVERT_DC_GRAD does not actually do anything, but harmless to set while comparing to Fourier solver with Neumann boundary conditions
-  if (inner_boundary_flags & ~implemented_boundary_flags) {
+  // INVERT_DC_GRAD does not actually do anything, but harmless to set while comparing to Fourier solver with Neumann boundary conditions
+  constexpr int implemented_boundary_flags = INVERT_AC_GRAD + INVERT_SET + INVERT_DC_GRAD;
+  if (isInnerBoundaryFlagSet(~implemented_boundary_flags)) {
     throw BoutException("Attempted to set Laplacian inner boundary inversion flag that "
                         "is not implemented in LaplaceMultigrid.");
   }
-  if (outer_boundary_flags & ~implemented_boundary_flags) {
+  if (isOuterBoundaryFlagSet(~implemented_boundary_flags)) {
     throw BoutException("Attempted to set Laplacian outer boundary inversion flag that "
                         "is not implemented in LaplaceMultigrid.");
   }
@@ -218,11 +215,9 @@ LaplaceMultigrid::LaplaceMultigrid(Options* opt, const CELL_LOC loc, Mesh* mesh_
     } else {
       output << "Multigrid solver with merging " << mgmpi << endl;
     }
-#if BOUT_USE_OPENMP
-    BOUT_OMP(parallel)
-    BOUT_OMP(master)
+    BOUT_OMP_SAFE(parallel)
+    BOUT_OMP_SAFE(master)
     { output << "Num threads = " << omp_get_num_threads() << endl; }
-#endif
   }
 }
 
@@ -246,10 +241,10 @@ FieldPerp LaplaceMultigrid::solve(const FieldPerp& b_in, const FieldPerp& x0) {
   int lz2 = lzz + 2;
   int lxx = kMG->lnx[level];
 
-  if (global_flags & INVERT_START_NEW) {
+  if (isGlobalFlagSet(INVERT_START_NEW)) {
     // set initial guess to zero
-    BOUT_OMP(parallel default(shared))
-    BOUT_OMP(for collapse(2))
+    BOUT_OMP_PERF(parallel default(shared))
+    BOUT_OMP_PERF(for collapse(2))
     for (int i = 1; i < lxx + 1; i++) {
       for (int k = 1; k < lzz + 1; k++) {
         x[i * lz2 + k] = 0.;
@@ -257,8 +252,8 @@ FieldPerp LaplaceMultigrid::solve(const FieldPerp& b_in, const FieldPerp& x0) {
     }
   } else {
     // Read initial guess into local array, ignoring guard cells
-    BOUT_OMP(parallel default(shared))
-    BOUT_OMP(for collapse(2))
+    BOUT_OMP_PERF(parallel default(shared))
+    BOUT_OMP_PERF(for collapse(2))
     for (int i = 1; i < lxx + 1; i++) {
       for (int k = 1; k < lzz + 1; k++) {
         int i2 = i - 1 + localmesh->xstart;
@@ -269,8 +264,8 @@ FieldPerp LaplaceMultigrid::solve(const FieldPerp& b_in, const FieldPerp& x0) {
   }
 
   // Read RHS into local array
-  BOUT_OMP(parallel default(shared))
-  BOUT_OMP(for collapse(2))
+  BOUT_OMP_PERF(parallel default(shared))
+  BOUT_OMP_PERF(for collapse(2))
   for (int i = 1; i < lxx + 1; i++) {
     for (int k = 1; k < lzz + 1; k++) {
       int i2 = i - 1 + localmesh->xstart;
@@ -280,12 +275,12 @@ FieldPerp LaplaceMultigrid::solve(const FieldPerp& b_in, const FieldPerp& x0) {
   }
 
   if (localmesh->firstX()) {
-    if (inner_boundary_flags & INVERT_AC_GRAD) {
+    if (isInnerBoundaryFlagSet(INVERT_AC_GRAD)) {
       // Neumann boundary condition
-      if (inner_boundary_flags & INVERT_SET) {
+      if (isInnerBoundaryFlagSet(INVERT_SET)) {
         // guard cells of x0 specify gradient to set at inner boundary
-        BOUT_OMP(parallel default(shared))
-        BOUT_OMP(for)
+        BOUT_OMP_PERF(parallel default(shared))
+        BOUT_OMP_PERF(for)
         for (int k = 1; k < lzz + 1; k++) {
           int k2 = k - 1;
           x[k] = -x0(localmesh->xstart - 1, k2)
@@ -294,8 +289,8 @@ FieldPerp LaplaceMultigrid::solve(const FieldPerp& b_in, const FieldPerp& x0) {
         }
       } else {
         // zero gradient inner boundary condition
-        BOUT_OMP(parallel default(shared))
-        BOUT_OMP(for)
+        BOUT_OMP_PERF(parallel default(shared))
+        BOUT_OMP_PERF(for)
         for (int k = 1; k < lzz + 1; k++) {
           // set inner guard cells
           x[k] = 0.0;
@@ -303,10 +298,10 @@ FieldPerp LaplaceMultigrid::solve(const FieldPerp& b_in, const FieldPerp& x0) {
       }
     } else {
       // Dirichlet boundary condition
-      if (inner_boundary_flags & INVERT_SET) {
+      if (isInnerBoundaryFlagSet(INVERT_SET)) {
         // guard cells of x0 specify value to set at inner boundary
-        BOUT_OMP(parallel default(shared))
-        BOUT_OMP(for)
+        BOUT_OMP_PERF(parallel default(shared))
+        BOUT_OMP_PERF(for)
         for (int k = 1; k < lzz + 1; k++) {
           int k2 = k - 1;
           x[k] = 2. * x0(localmesh->xstart - 1, k2);
@@ -314,8 +309,8 @@ FieldPerp LaplaceMultigrid::solve(const FieldPerp& b_in, const FieldPerp& x0) {
         }
       } else {
         // zero value inner boundary condition
-        BOUT_OMP(parallel default(shared))
-        BOUT_OMP(for)
+        BOUT_OMP_PERF(parallel default(shared))
+        BOUT_OMP_PERF(for)
         for (int k = 1; k < lzz + 1; k++) {
           // set inner guard cells
           x[k] = 0.;
@@ -324,12 +319,12 @@ FieldPerp LaplaceMultigrid::solve(const FieldPerp& b_in, const FieldPerp& x0) {
     }
   }
   if (localmesh->lastX()) {
-    if (outer_boundary_flags & INVERT_AC_GRAD) {
+    if (isOuterBoundaryFlagSet(INVERT_AC_GRAD)) {
       // Neumann boundary condition
-      if (inner_boundary_flags & INVERT_SET) {
+      if (isOuterBoundaryFlagSet(INVERT_SET)) {
         // guard cells of x0 specify gradient to set at outer boundary
-        BOUT_OMP(parallel default(shared))
-        BOUT_OMP(for)
+        BOUT_OMP_PERF(parallel default(shared))
+        BOUT_OMP_PERF(for)
         for (int k = 1; k < lzz + 1; k++) {
           int k2 = k - 1;
           x[(lxx + 1) * lz2 + k] = x0(localmesh->xend + 1, k2)
@@ -339,8 +334,8 @@ FieldPerp LaplaceMultigrid::solve(const FieldPerp& b_in, const FieldPerp& x0) {
         }
       } else {
         // zero gradient outer boundary condition
-        BOUT_OMP(parallel default(shared))
-        BOUT_OMP(for)
+        BOUT_OMP_PERF(parallel default(shared))
+        BOUT_OMP_PERF(for)
         for (int k = 1; k < lzz + 1; k++) {
           // set outer guard cells
           x[(lxx + 1) * lz2 + k] = 0.;
@@ -348,10 +343,10 @@ FieldPerp LaplaceMultigrid::solve(const FieldPerp& b_in, const FieldPerp& x0) {
       }
     } else {
       // Dirichlet boundary condition
-      if (outer_boundary_flags & INVERT_SET) {
+      if (isOuterBoundaryFlagSet(INVERT_SET)) {
         // guard cells of x0 specify value to set at outer boundary
-        BOUT_OMP(parallel default(shared))
-        BOUT_OMP(for)
+        BOUT_OMP_PERF(parallel default(shared))
+        BOUT_OMP_PERF(for)
         for (int k = 1; k < lzz + 1; k++) {
           int k2 = k - 1;
           x[(lxx + 1) * lz2 + k] = 2. * x0(localmesh->xend + 1, k2);
@@ -359,8 +354,8 @@ FieldPerp LaplaceMultigrid::solve(const FieldPerp& b_in, const FieldPerp& x0) {
         }
       } else {
         // zero value inner boundary condition
-        BOUT_OMP(parallel default(shared))
-        BOUT_OMP(for)
+        BOUT_OMP_PERF(parallel default(shared))
+        BOUT_OMP_PERF(for)
         for (int k = 1; k < lzz + 1; k++) {
           // set outer guard cells
           x[(lxx + 1) * lz2 + k] = 0.;
@@ -370,8 +365,8 @@ FieldPerp LaplaceMultigrid::solve(const FieldPerp& b_in, const FieldPerp& x0) {
   }
 
   // Exchange ghost cells of initial guess
-  BOUT_OMP(parallel default(shared))
-  BOUT_OMP(for)
+  BOUT_OMP_PERF(parallel default(shared))
+  BOUT_OMP_PERF(for)
   for (int i = 0; i < lxx + 2; i++) {
     x[i * lz2] = x[(i + 1) * lz2 - 2];
     x[(i + 1) * lz2 - 1] = x[i * lz2 + 1];
@@ -471,8 +466,8 @@ FieldPerp LaplaceMultigrid::solve(const FieldPerp& b_in, const FieldPerp& x0) {
 #endif
 
   // Copy solution into a FieldPerp to return
-  BOUT_OMP(parallel default(shared))
-  BOUT_OMP(for collapse(2))
+  BOUT_OMP_PERF(parallel default(shared))
+  BOUT_OMP_PERF(for collapse(2))
   for (int i = 1; i < lxx + 1; i++) {
     for (int k = 1; k < lzz + 1; k++) {
       int i2 = i - 1 + localmesh->xstart;
@@ -481,13 +476,13 @@ FieldPerp LaplaceMultigrid::solve(const FieldPerp& b_in, const FieldPerp& x0) {
     }
   }
   if (localmesh->firstX()) {
-    if (inner_boundary_flags & INVERT_AC_GRAD) {
+    if (isInnerBoundaryFlagSet(INVERT_AC_GRAD)) {
       // Neumann boundary condition
-      if (inner_boundary_flags & INVERT_SET) {
+      if (isInnerBoundaryFlagSet(INVERT_SET)) {
         // guard cells of x0 specify gradient to set at inner boundary
         int i2 = -1 + localmesh->xstart;
-        BOUT_OMP(parallel default(shared))
-        BOUT_OMP(for)
+        BOUT_OMP_PERF(parallel default(shared))
+        BOUT_OMP_PERF(for)
         for (int k = 1; k < lzz + 1; k++) {
           int k2 = k - 1;
           result(i2, k2) = x[lz2 + k]
@@ -498,8 +493,8 @@ FieldPerp LaplaceMultigrid::solve(const FieldPerp& b_in, const FieldPerp& x0) {
       } else {
         // zero gradient inner boundary condition
         int i2 = -1 + localmesh->xstart;
-        BOUT_OMP(parallel default(shared))
-        BOUT_OMP(for)
+        BOUT_OMP_PERF(parallel default(shared))
+        BOUT_OMP_PERF(for)
         for (int k = 1; k < lzz + 1; k++) {
           int k2 = k - 1;
           result(i2, k2) = x[lz2 + k];
@@ -507,11 +502,11 @@ FieldPerp LaplaceMultigrid::solve(const FieldPerp& b_in, const FieldPerp& x0) {
       }
     } else {
       // Dirichlet boundary condition
-      if (inner_boundary_flags & INVERT_SET) {
+      if (isInnerBoundaryFlagSet(INVERT_SET)) {
         // guard cells of x0 specify value to set at inner boundary
         int i2 = -1 + localmesh->xstart;
-        BOUT_OMP(parallel default(shared))
-        BOUT_OMP(for)
+        BOUT_OMP_PERF(parallel default(shared))
+        BOUT_OMP_PERF(for)
         for (int k = 1; k < lzz + 1; k++) {
           int k2 = k - 1;
           result(i2, k2) = 2. * x0(localmesh->xstart - 1, k2) - x[lz2 + k];
@@ -519,8 +514,8 @@ FieldPerp LaplaceMultigrid::solve(const FieldPerp& b_in, const FieldPerp& x0) {
       } else {
         // zero value inner boundary condition
         int i2 = -1 + localmesh->xstart;
-        BOUT_OMP(parallel default(shared))
-        BOUT_OMP(for)
+        BOUT_OMP_PERF(parallel default(shared))
+        BOUT_OMP_PERF(for)
         for (int k = 1; k < lzz + 1; k++) {
           int k2 = k - 1;
           result(i2, k2) = -x[lz2 + k];
@@ -529,13 +524,13 @@ FieldPerp LaplaceMultigrid::solve(const FieldPerp& b_in, const FieldPerp& x0) {
     }
   }
   if (localmesh->lastX()) {
-    if (outer_boundary_flags & INVERT_AC_GRAD) {
+    if (isOuterBoundaryFlagSet(INVERT_AC_GRAD)) {
       // Neumann boundary condition
-      if (inner_boundary_flags & INVERT_SET) {
+      if (isOuterBoundaryFlagSet(INVERT_SET)) {
         // guard cells of x0 specify gradient to set at outer boundary
         int i2 = lxx + localmesh->xstart;
-        BOUT_OMP(parallel default(shared))
-        BOUT_OMP(for)
+        BOUT_OMP_PERF(parallel default(shared))
+        BOUT_OMP_PERF(for)
         for (int k = 1; k < lzz + 1; k++) {
           int k2 = k - 1;
           result(i2, k2) = x[lxx * lz2 + k]
@@ -546,8 +541,8 @@ FieldPerp LaplaceMultigrid::solve(const FieldPerp& b_in, const FieldPerp& x0) {
       } else {
         // zero gradient outer boundary condition
         int i2 = lxx + localmesh->xstart;
-        BOUT_OMP(parallel default(shared))
-        BOUT_OMP(for)
+        BOUT_OMP_PERF(parallel default(shared))
+        BOUT_OMP_PERF(for)
         for (int k = 1; k < lzz + 1; k++) {
           int k2 = k - 1;
           result(i2, k2) = x[lxx * lz2 + k];
@@ -555,11 +550,11 @@ FieldPerp LaplaceMultigrid::solve(const FieldPerp& b_in, const FieldPerp& x0) {
       }
     } else {
       // Dirichlet boundary condition
-      if (outer_boundary_flags & INVERT_SET) {
+      if (isOuterBoundaryFlagSet(INVERT_SET)) {
         // guard cells of x0 specify value to set at outer boundary
         int i2 = lxx + localmesh->xstart;
-        BOUT_OMP(parallel default(shared))
-        BOUT_OMP(for)
+        BOUT_OMP_PERF(parallel default(shared))
+        BOUT_OMP_PERF(for)
         for (int k = 1; k < lzz + 1; k++) {
           int k2 = k - 1;
           result(i2, k2) = 2. * x0(localmesh->xend + 1, k2) - x[lxx * lz2 + k];
@@ -567,8 +562,8 @@ FieldPerp LaplaceMultigrid::solve(const FieldPerp& b_in, const FieldPerp& x0) {
       } else {
         // zero value inner boundary condition
         int i2 = lxx + localmesh->xstart;
-        BOUT_OMP(parallel default(shared))
-        BOUT_OMP(for)
+        BOUT_OMP_PERF(parallel default(shared))
+        BOUT_OMP_PERF(for)
         for (int k = 1; k < lzz + 1; k++) {
           int k2 = k - 1;
           result(i2, k2) = -x[lxx * lz2 + k];
@@ -592,8 +587,8 @@ void LaplaceMultigrid::generateMatrixF(int level) {
   int llx = kMG->lnx[level];
   int llz = kMG->lnz[level];
 
-  BOUT_OMP(parallel default(shared))
-  BOUT_OMP(for collapse(2))
+  BOUT_OMP_PERF(parallel default(shared))
+  BOUT_OMP_PERF(for collapse(2))
   for (int i = 1; i < llx + 1; i++) {
     for (int k = 1; k < llz + 1; k++) {
       int i2 = i - 1 + localmesh->xstart;
@@ -655,10 +650,10 @@ void LaplaceMultigrid::generateMatrixF(int level) {
   // Here put boundary conditions
 
   if (kMG->rProcI == 0) {
-    if (inner_boundary_flags & INVERT_AC_GRAD) {
+    if (isInnerBoundaryFlagSet(INVERT_AC_GRAD)) {
       // Neumann boundary condition
-      BOUT_OMP(parallel default(shared))
-      BOUT_OMP(for)
+      BOUT_OMP_PERF(parallel default(shared))
+      BOUT_OMP_PERF(for)
       for (int k = 1; k < llz + 1; k++) {
         int ic = llz + 2 + k;
         mat[ic * 9 + 3] += mat[ic * 9];
@@ -673,8 +668,8 @@ void LaplaceMultigrid::generateMatrixF(int level) {
       }
     } else {
       // Dirichlet boundary condition
-      BOUT_OMP(parallel default(shared))
-      BOUT_OMP(for)
+      BOUT_OMP_PERF(parallel default(shared))
+      BOUT_OMP_PERF(for)
       for (int k = 1; k < llz + 1; k++) {
         int ic = llz + 2 + k;
         mat[ic * 9 + 3] -= mat[ic * 9];
@@ -690,10 +685,10 @@ void LaplaceMultigrid::generateMatrixF(int level) {
     }
   }
   if (kMG->rProcI == kMG->xNP - 1) {
-    if (outer_boundary_flags & INVERT_AC_GRAD) {
+    if (isOuterBoundaryFlagSet(INVERT_AC_GRAD)) {
       // Neumann boundary condition
-      BOUT_OMP(parallel default(shared))
-      BOUT_OMP(for)
+      BOUT_OMP_PERF(parallel default(shared))
+      BOUT_OMP_PERF(for)
       for (int k = 1; k < llz + 1; k++) {
         int ic = llx * (llz + 2) + k;
         mat[ic * 9 + 3] += mat[ic * 9 + 6];
@@ -708,8 +703,8 @@ void LaplaceMultigrid::generateMatrixF(int level) {
       }
     } else {
       // Dirichlet boundary condition
-      BOUT_OMP(parallel default(shared))
-      BOUT_OMP(for)
+      BOUT_OMP_PERF(parallel default(shared))
+      BOUT_OMP_PERF(for)
       for (int k = 1; k < llz + 1; k++) {
         int ic = llx * (llz + 2) + k;
         mat[ic * 9 + 3] -= mat[ic * 9 + 6];
diff --git a/src/invert/laplace/impls/multigrid/multigrid_laplace.hxx b/src/invert/laplace/impls/multigrid/multigrid_laplace.hxx
index 4186147874..f0b3cfc5c1 100644
--- a/src/invert/laplace/impls/multigrid/multigrid_laplace.hxx
+++ b/src/invert/laplace/impls/multigrid/multigrid_laplace.hxx
@@ -28,8 +28,8 @@
  *
  **************************************************************************/
 
-#ifndef __MULTIGRID_LAPLACE_H__
-#define __MULTIGRID_LAPLACE_H__
+#ifndef BOUT_MULTIGRID_LAPLACE_H
+#define BOUT_MULTIGRID_LAPLACE_H
 
 #include "bout/build_config.hxx"
 #include "bout/invert_laplace.hxx"
@@ -246,4 +246,4 @@ RegisterLaplace<LaplaceMultigrid> registerlaplacemultigrid(LAPLACE_MULTIGRID);
 
 #endif // BOUT_USE_METRIC_3D
 
-#endif // __MULTIGRID_LAPLACE_H__
+#endif // BOUT_MULTIGRID_LAPLACE_H
diff --git a/src/invert/laplace/impls/multigrid/multigrid_solver.cxx b/src/invert/laplace/impls/multigrid/multigrid_solver.cxx
index 6d448e4db7..0c5ad82d6c 100644
--- a/src/invert/laplace/impls/multigrid/multigrid_solver.cxx
+++ b/src/invert/laplace/impls/multigrid/multigrid_solver.cxx
@@ -290,15 +290,15 @@ void Multigrid1DP::lowestSolver(BoutReal* x, BoutReal* b, int UNUSED(plag)) {
 
     int nx = (xProcI % rMG->zNP) * lnx[0];
 
-    BOUT_OMP(parallel default(shared))
+    BOUT_OMP_PERF(parallel default(shared))
     {
-      BOUT_OMP(for)
+      BOUT_OMP_PERF(for)
       for (int i = 0; i < dim; i++) {
         y[i] = 0.0;
         r[i] = 0.0;
       }
 
-      BOUT_OMP(for)
+      BOUT_OMP_PERF(for)
       for (int i = 0; i < dimg; i++) {
         yl[i] = 0.0;
         yg[i] = 0.0;
@@ -306,7 +306,7 @@ void Multigrid1DP::lowestSolver(BoutReal* x, BoutReal* b, int UNUSED(plag)) {
 
       int xend = lnx[0] + 1;
       int zend = lnz[0] + 1;
-      BOUT_OMP(for collapse(2))
+      BOUT_OMP_PERF(for collapse(2))
       for (int ix = 1; ix < xend; ix++) {
         for (int iz = 1; iz < zend; iz++) {
           int nn = (nx + ix) * (lnz[0] + 2) + iz;
@@ -319,11 +319,11 @@ void Multigrid1DP::lowestSolver(BoutReal* x, BoutReal* b, int UNUSED(plag)) {
                                       MPI_SUM, comm2D);
 
     int nz = (xProcI % rMG->zNP) * (rMG->lnz[level]);
-    BOUT_OMP(parallel default(shared))
+    BOUT_OMP_PERF(parallel default(shared))
     {
       int xend = rMG->lnx[level] + 1;
       int zend = rMG->lnz[level] + 1;
-      BOUT_OMP(for collapse(2))
+      BOUT_OMP_PERF(for collapse(2))
       for (int ix = 1; ix < xend; ix++) {
         for (int iz = 1; iz < zend; iz++) {
           int nn = ix * (lnz[0] + 2) + nz + iz;
@@ -335,9 +335,9 @@ void Multigrid1DP::lowestSolver(BoutReal* x, BoutReal* b, int UNUSED(plag)) {
 
     rMG->getSolution(std::begin(y), std::begin(r), 1);
 
-    BOUT_OMP(parallel default(shared))
+    BOUT_OMP_PERF(parallel default(shared))
     {
-    BOUT_OMP(for)
+    BOUT_OMP_PERF(for)
     for (int i = 0; i < dimg; i++) {
       yl[i] = 0.0;
       yg[i] = 0.0;
@@ -345,7 +345,7 @@ void Multigrid1DP::lowestSolver(BoutReal* x, BoutReal* b, int UNUSED(plag)) {
 
     int xend = rMG->lnx[level] + 1;
     int zend = rMG->lnz[level] + 1;
-      BOUT_OMP(for collapse(2))
+      BOUT_OMP_PERF(for collapse(2))
       for (int ix = 1; ix < xend; ix++) {
         for (int iz = 1; iz < zend; iz++) {
           int nn = ix * (lnz[0] + 2) + nz + iz;
@@ -357,11 +357,11 @@ void Multigrid1DP::lowestSolver(BoutReal* x, BoutReal* b, int UNUSED(plag)) {
     bout::globals::mpi->MPI_Allreduce(std::begin(yl), std::begin(yg), dimg, MPI_DOUBLE,
                                       MPI_SUM, comm2D);
 
-    BOUT_OMP(parallel default(shared))
+    BOUT_OMP_PERF(parallel default(shared))
     {
       int xend = lnx[0] + 1;
       int zend = lnz[0] + 1;
-      BOUT_OMP(for collapse(2))
+      BOUT_OMP_PERF(for collapse(2))
       for (int ix = 1; ix < xend; ix++) {
         for (int iz = 1; iz < zend; iz++) {
           int nn = (nx + ix) * (lnz[0] + 2) + iz;
@@ -377,16 +377,16 @@ void Multigrid1DP::lowestSolver(BoutReal* x, BoutReal* b, int UNUSED(plag)) {
     Array<BoutReal> y(dim);
     Array<BoutReal> r(dim);
     int nx = xProcI * lnx[0];
-    BOUT_OMP(parallel default(shared))
+    BOUT_OMP_PERF(parallel default(shared))
     {
-      BOUT_OMP(for)
+      BOUT_OMP_PERF(for)
       for (int i = 0; i < dim; i++) {
         y[i] = 0.0;
         r[i] = 0.0;
       }
       int xend = lnx[0] + 1;
       int zend = lnz[0] + 1;
-      BOUT_OMP(for collapse(2))
+      BOUT_OMP_PERF(for collapse(2))
       for (int ix = 1; ix < xend; ix++) {
         for (int iz = 1; iz < zend; iz++) {
           int nn = (nx + ix) * (lnz[0] + 2) + iz;
@@ -397,18 +397,18 @@ void Multigrid1DP::lowestSolver(BoutReal* x, BoutReal* b, int UNUSED(plag)) {
     }
     bout::globals::mpi->MPI_Allreduce(std::begin(y), std::begin(r), dim, MPI_DOUBLE,
                                       MPI_SUM, commMG);
-    BOUT_OMP(parallel default(shared))
-    BOUT_OMP(for)
+    BOUT_OMP_PERF(parallel default(shared))
+    BOUT_OMP_PERF(for)
     for (int i = 0; i < dim; i++) {
       y[i] = 0.0;
     }
     sMG->getSolution(std::begin(y), std::begin(r), 1);
 
-    BOUT_OMP(parallel default(shared))
+    BOUT_OMP_PERF(parallel default(shared))
     {
       int xend = lnx[0] + 1;
       int zend = lnz[0] + 1;
-      BOUT_OMP(for collapse(2))
+      BOUT_OMP_PERF(for collapse(2))
       for (int ix = 1; ix < xend; ix++) {
         for (int iz = 1; iz < zend; iz++) {
           int nn = (nx + ix) * (lnz[0] + 2) + iz;
@@ -430,21 +430,21 @@ void Multigrid1DP::convertMatrixF2D(int level) {
   Array<BoutReal> yl(dim * 9);
   Array<BoutReal> yg(dim * 9);
   int nx = (xProcI % rMG->zNP) * lnx[0];
-  BOUT_OMP(parallel default(shared))
+  BOUT_OMP_PERF(parallel default(shared))
   {
-    BOUT_OMP(for)
+    BOUT_OMP_PERF(for)
     for (int i = 0; i < dim * 9; i++) {
       yl[i] = 0.0;
       yg[i] = 0.0;
     }
-    BOUT_OMP(for)
+    BOUT_OMP_PERF(for)
     for (int i = 0; i < (rMG->lnx[level] + 2) * (rMG->lnz[level] + 2) * 9; i++) {
       rMG->matmg[level][i] = 0.0;
     }
 
     int xend = lnx[0] + 1;
     int zend = lnz[0] + 1;
-    BOUT_OMP(for collapse(2))
+    BOUT_OMP_PERF(for collapse(2))
     for (int ix = 1; ix < xend; ix++) {
       for (int iz = 1; iz < zend; iz++) {
         int nn = (nx + ix) * (lnz[0] + 2) + iz;
@@ -494,11 +494,11 @@ void Multigrid1DP::convertMatrixF2D(int level) {
   }
   int nz = (xProcI % rMG->zNP) * (rMG->lnz[level]);
 
-  BOUT_OMP(parallel default(shared))
+  BOUT_OMP_PERF(parallel default(shared))
   {
     int xend = rMG->lnx[level] + 1;
     int zend = rMG->lnz[level] + 1;
-    BOUT_OMP(for collapse(2))
+    BOUT_OMP_PERF(for collapse(2))
     for (int ix = 1; ix < xend; ix++) {
       for (int iz = 1; iz < zend; iz++) {
         int nn = ix * (lnz[0] + 2) + nz + iz;
@@ -517,16 +517,16 @@ void Multigrid1DP::convertMatrixFS(int level) {
   Array<BoutReal> yl(dim * 9);
   BoutReal* yg = sMG->matmg[level];
   int nx = xProcI * lnx[0];
-  BOUT_OMP(parallel default(shared))
+  BOUT_OMP_PERF(parallel default(shared))
   {
-    BOUT_OMP(for)
+    BOUT_OMP_PERF(for)
     for (int i = 0; i < dim * 9; i++) {
       yl[i] = 0.0;
       yg[i] = 0.0;
     }
     int xend = lnx[0] + 1;
     int zend = lnz[0] + 1;
-    BOUT_OMP(for collapse(2))
+    BOUT_OMP_PERF(for collapse(2))
     for (int ix = 1; ix < xend; ix++) {
       for (int iz = 1; iz < zend; iz++) {
         int nn = (nx + ix) * (lnz[0] + 2) + iz;
@@ -675,9 +675,9 @@ void Multigrid2DPf1D::lowestSolver(BoutReal* x, BoutReal* b, int UNUSED(plag)) {
     Array<BoutReal> r(dim);
     int nx = xProcI * lnx[0];
     int nz = zProcI * lnz[0];
-    BOUT_OMP(parallel default(shared))
+    BOUT_OMP_PERF(parallel default(shared))
     {
-      BOUT_OMP(for)
+      BOUT_OMP_PERF(for)
       for (int i = 0; i < dim; i++) {
         y[i] = 0.0;
         r[i] = 0.0;
@@ -685,7 +685,7 @@ void Multigrid2DPf1D::lowestSolver(BoutReal* x, BoutReal* b, int UNUSED(plag)) {
 
       int xend = lnx[0] + 1;
       int zend = lnz[0] + 1;
-      BOUT_OMP(for collapse(2))
+      BOUT_OMP_PERF(for collapse(2))
       for (int ix = 1; ix < xend; ix++) {
         for (int iz = 1; iz < zend; iz++) {
           int nn = (nx + ix) * (gnz[0] + 2) + nz + iz;
@@ -696,17 +696,17 @@ void Multigrid2DPf1D::lowestSolver(BoutReal* x, BoutReal* b, int UNUSED(plag)) {
     }
     bout::globals::mpi->MPI_Allreduce(std::begin(y), std::begin(r), dim, MPI_DOUBLE,
                                       MPI_SUM, commMG);
-    BOUT_OMP(parallel default(shared))
-    BOUT_OMP(for)
+    BOUT_OMP_PERF(parallel default(shared))
+    BOUT_OMP_PERF(for)
     for (int i = 0; i < dim; i++) {
       y[i] = 0.0;
     }
     sMG->getSolution(std::begin(y), std::begin(r), 1);
-    BOUT_OMP(parallel default(shared))
+    BOUT_OMP_PERF(parallel default(shared))
     {
       int xend = lnx[0] + 1;
       int zend = lnz[0] + 1;
-      BOUT_OMP(for collapse(2))
+      BOUT_OMP_PERF(for collapse(2))
       for (int ix = 1; ix < xend; ix++) {
         for (int iz = 1; iz < zend; iz++) {
           int nn = (nx + ix) * (gnz[0] + 2) + nz + iz;
@@ -728,16 +728,16 @@ void Multigrid2DPf1D::convertMatrixFS(int level) {
   BoutReal* yg = sMG->matmg[level];
   int nx = xProcI * lnx[0];
   int nz = zProcI * lnz[0];
-  BOUT_OMP(parallel default(shared))
+  BOUT_OMP_PERF(parallel default(shared))
   {
-    BOUT_OMP(for)
+    BOUT_OMP_PERF(for)
     for (int i = 0; i < dim * 9; i++) {
       yl[i] = 0.0;
       yg[i] = 0.0;
     }
     int xend = lnx[0] + 1;
     int zend = lnz[0] + 1;
-    BOUT_OMP(for collapse(2))
+    BOUT_OMP_PERF(for collapse(2))
     for (int ix = 1; ix < xend; ix++) {
       for (int iz = 1; iz < zend; iz++) {
         int nn = (nx + ix) * (gnz[0] + 2) + nz + iz;
diff --git a/src/invert/laplace/impls/naulin/naulin_laplace.cxx b/src/invert/laplace/impls/naulin/naulin_laplace.cxx
index d82f874cbb..e315d3c771 100644
--- a/src/invert/laplace/impls/naulin/naulin_laplace.cxx
+++ b/src/invert/laplace/impls/naulin/naulin_laplace.cxx
@@ -174,9 +174,9 @@ LaplaceNaulin::LaplaceNaulin(Options* opt, const CELL_LOC loc, Mesh* mesh_in,
   // invert Delp2 and we will not converge
   ASSERT0(delp2type == "cyclic" || delp2type == "spt" || delp2type == "tri");
   // Use same flags for FFT solver as for NaulinSolver
-  delp2solver->setGlobalFlags(global_flags);
-  delp2solver->setInnerBoundaryFlags(inner_boundary_flags);
-  delp2solver->setOuterBoundaryFlags(outer_boundary_flags);
+  delp2solver->setGlobalFlags(getGlobalFlags());
+  delp2solver->setInnerBoundaryFlags(getInnerBoundaryFlags());
+  delp2solver->setOuterBoundaryFlags(getOuterBoundaryFlags());
 
   static int naulinsolver_count = 1;
   setPerformanceName(fmt::format("{}{}", "naulinsolver", ++naulinsolver_count));
@@ -258,7 +258,7 @@ Field3D LaplaceNaulin::solve(const Field3D& rhs, const Field3D& x0) {
     // Note take a copy of the 'b' argument, because we want to return a copy of it in the
     // result
 
-    if ((inner_boundary_flags & INVERT_SET) || (outer_boundary_flags & INVERT_SET)) {
+    if (isInnerBoundaryFlagSet(INVERT_SET) || isOuterBoundaryFlagSet(INVERT_SET)) {
       // This passes in the boundary conditions from x0's guard cells
       copy_x_boundaries(x_guess, x0, localmesh);
     }
diff --git a/src/invert/laplace/impls/naulin/naulin_laplace.hxx b/src/invert/laplace/impls/naulin/naulin_laplace.hxx
index f544e74336..e464ef18e7 100644
--- a/src/invert/laplace/impls/naulin/naulin_laplace.hxx
+++ b/src/invert/laplace/impls/naulin/naulin_laplace.hxx
@@ -25,8 +25,8 @@
 
 class LaplaceNaulin;
 
-#ifndef __LAP_NAULIN_H__
-#define __LAP_NAULIN_H__
+#ifndef BOUT_LAP_NAULIN_H
+#define BOUT_LAP_NAULIN_H
 
 #include <bout/invert_laplace.hxx>
 #include <bout/options.hxx>
@@ -179,4 +179,4 @@ private:
   void copy_x_boundaries(Field3D& x, const Field3D& x0, Mesh* mesh);
 };
 
-#endif // __LAP_NAULIN_H__
+#endif // BOUT_LAP_NAULIN_H
diff --git a/src/invert/laplace/impls/pcr/pcr.cxx b/src/invert/laplace/impls/pcr/pcr.cxx
index 9402ba9f1b..48bbdbac4b 100644
--- a/src/invert/laplace/impls/pcr/pcr.cxx
+++ b/src/invert/laplace/impls/pcr/pcr.cxx
@@ -149,19 +149,19 @@ FieldPerp LaplacePCR::solve(const FieldPerp& rhs, const FieldPerp& x0) {
   // If the flags to assign that only one guard cell should be used is set
   inbndry = localmesh->xstart;
   outbndry = localmesh->xstart;
-  if (((global_flags & INVERT_BOTH_BNDRY_ONE) != 0) || (localmesh->xstart < 2)) {
+  if (isGlobalFlagSet(INVERT_BOTH_BNDRY_ONE) || (localmesh->xstart < 2)) {
     inbndry = outbndry = 1;
   }
-  if ((inner_boundary_flags & INVERT_BNDRY_ONE) != 0) {
+  if (isInnerBoundaryFlagSet(INVERT_BNDRY_ONE)) {
     inbndry = 1;
   }
-  if ((outer_boundary_flags & INVERT_BNDRY_ONE) != 0) {
+  if (isOuterBoundaryFlagSet(INVERT_BNDRY_ONE)) {
     outbndry = 1;
   }
 
   if (dst) {
     const BoutReal zlen = getUniform(coords->dz) * (localmesh->LocalNz - 3);
-    BOUT_OMP(parallel)
+    BOUT_OMP_PERF(parallel)
     {
       /// Create a local thread-scope working array
       auto k1d = Array<dcomplex>(
@@ -169,14 +169,13 @@ FieldPerp LaplacePCR::solve(const FieldPerp& rhs, const FieldPerp& x0) {
 
       // Loop over X indices, including boundaries but not guard cells. (unless periodic
       // in x)
-      BOUT_OMP(for)
+      BOUT_OMP_PERF(for)
       for (int ix = xs; ix <= xe; ix++) {
         // Take DST in Z direction and put result in k1d
 
-        if (((ix < inbndry) && ((inner_boundary_flags & INVERT_SET) != 0)
-             && localmesh->firstX())
+        if (((ix < inbndry) && isInnerBoundaryFlagSetOnFirstX(INVERT_SET))
             || ((localmesh->LocalNx - ix - 1 < outbndry)
-                && ((outer_boundary_flags & INVERT_SET) != 0) && localmesh->lastX())) {
+                && isOuterBoundaryFlagSetOnLastX(INVERT_SET))) {
           // Use the values in x0 in the boundary
           DST(x0[ix] + 1, localmesh->LocalNz - 2, std::begin(k1d));
         } else {
@@ -191,7 +190,7 @@ FieldPerp LaplacePCR::solve(const FieldPerp& rhs, const FieldPerp& x0) {
 
       // Get elements of the tridiagonal matrix
       // including boundary conditions
-      BOUT_OMP(for nowait)
+      BOUT_OMP_PERF(for nowait)
       for (int kz = 0; kz < nmode; kz++) {
         BoutReal kwave =
             kz * 2.0 * PI / (2. * zlen); // wave number is 1/[rad]; DST has extra 2.
@@ -199,23 +198,22 @@ FieldPerp LaplacePCR::solve(const FieldPerp& rhs, const FieldPerp& x0) {
         tridagMatrix(&a(kz, 0), &b(kz, 0), &c(kz, 0), &bcmplx(kz, 0), jy,
                      kz,    // wave number index
                      kwave, // kwave (inverse wave length)
-                     global_flags, inner_boundary_flags, outer_boundary_flags, &Acoef,
-                     &C1coef, &C2coef, &Dcoef,
+                     &Acoef, &C1coef, &C2coef, &Dcoef,
                      false); // Don't include guard cells in arrays
       }
-    } // BOUT_OMP(parallel)
+    } // BOUT_OMP_PERF(parallel)
 
     // Solve tridiagonal systems
     cr_pcr_solver(a, b, c, bcmplx, xcmplx);
 
     // FFT back to real space
-    BOUT_OMP(parallel)
+    BOUT_OMP_PERF(parallel)
     {
       /// Create a local thread-scope working array
       auto k1d = Array<dcomplex>(
           localmesh->LocalNz); // ZFFT routine expects input of this length
 
-      BOUT_OMP(for nowait)
+      BOUT_OMP_PERF(for nowait)
       for (int ix = xs; ix <= xe; ix++) {
         for (int kz = 0; kz < nmode; kz++) {
           k1d[kz] = xcmplx(kz, ix - xs);
@@ -233,7 +231,7 @@ FieldPerp LaplacePCR::solve(const FieldPerp& rhs, const FieldPerp& x0) {
     }
   } else {
     const BoutReal zlength = getUniform(coords->zlength());
-    BOUT_OMP(parallel)
+    BOUT_OMP_PERF(parallel)
     {
       /// Create a local thread-scope working array
       auto k1d = Array<dcomplex>((localmesh->LocalNz) / 2
@@ -241,14 +239,13 @@ FieldPerp LaplacePCR::solve(const FieldPerp& rhs, const FieldPerp& x0) {
 
       // Loop over X indices, including boundaries but not guard cells (unless periodic in
       // x)
-      BOUT_OMP(for)
+      BOUT_OMP_PERF(for)
       for (int ix = xs; ix <= xe; ix++) {
         // Take FFT in Z direction, apply shift, and put result in k1d
 
-        if (((ix < inbndry) && ((inner_boundary_flags & INVERT_SET) != 0)
-             && localmesh->firstX())
+        if (((ix < inbndry) && isInnerBoundaryFlagSetOnFirstX(INVERT_SET))
             || ((localmesh->LocalNx - ix - 1 < outbndry)
-                && ((outer_boundary_flags & INVERT_SET) != 0) && localmesh->lastX())) {
+                && isOuterBoundaryFlagSetOnLastX(INVERT_SET))) {
           // Use the values in x0 in the boundary
           rfft(x0[ix], localmesh->LocalNz, std::begin(k1d));
         } else {
@@ -263,31 +260,30 @@ FieldPerp LaplacePCR::solve(const FieldPerp& rhs, const FieldPerp& x0) {
 
       // Get elements of the tridiagonal matrix
       // including boundary conditions
-      BOUT_OMP(for nowait)
+      BOUT_OMP_PERF(for nowait)
       for (int kz = 0; kz < nmode; kz++) {
         BoutReal kwave = kz * 2.0 * PI / zlength; // wave number is 1/[rad]
         tridagMatrix(&a(kz, 0), &b(kz, 0), &c(kz, 0), &bcmplx(kz, 0), jy,
                      kz,    // True for the component constant (DC) in Z
                      kwave, // Z wave number
-                     global_flags, inner_boundary_flags, outer_boundary_flags, &Acoef,
-                     &C1coef, &C2coef, &Dcoef,
+                     &Acoef, &C1coef, &C2coef, &Dcoef,
                      false); // Don't include guard cells in arrays
       }
-    } // BOUT_OMP(parallel)
+    } // BOUT_OMP_PERF(parallel)
 
     // Solve tridiagonal systems
     cr_pcr_solver(a, b, c, bcmplx, xcmplx);
 
     // FFT back to real space
-    BOUT_OMP(parallel)
+    BOUT_OMP_PERF(parallel)
     {
       /// Create a local thread-scope working array
       auto k1d = Array<dcomplex>((localmesh->LocalNz) / 2
                                  + 1); // ZFFT routine expects input of this length
 
-      const bool zero_DC = (global_flags & INVERT_ZERO_DC) != 0;
+      const bool zero_DC = isGlobalFlagSet(INVERT_ZERO_DC);
 
-      BOUT_OMP(for nowait)
+      BOUT_OMP_PERF(for nowait)
       for (int ix = xs; ix <= xe; ix++) {
         if (zero_DC) {
           k1d[0] = 0.;
@@ -327,13 +323,13 @@ Field3D LaplacePCR::solve(const Field3D& rhs, const Field3D& x0) {
   // If the flags to assign that only one guard cell should be used is set
   inbndry = localmesh->xstart;
   outbndry = localmesh->xstart;
-  if (((global_flags & INVERT_BOTH_BNDRY_ONE) != 0) || (localmesh->xstart < 2)) {
+  if (isGlobalFlagSet(INVERT_BOTH_BNDRY_ONE) || (localmesh->xstart < 2)) {
     inbndry = outbndry = 1;
   }
-  if ((inner_boundary_flags & INVERT_BNDRY_ONE) != 0) {
+  if (isInnerBoundaryFlagSet(INVERT_BNDRY_ONE)) {
     inbndry = 1;
   }
-  if ((outer_boundary_flags & INVERT_BNDRY_ONE) != 0) {
+  if (isOuterBoundaryFlagSet(INVERT_BNDRY_ONE)) {
     outbndry = 1;
   }
 
@@ -371,7 +367,7 @@ Field3D LaplacePCR::solve(const Field3D& rhs, const Field3D& x0) {
 
   if (dst) {
     const BoutReal zlen = getUniform(coords->dz) * (localmesh->LocalNz - 3);
-    BOUT_OMP(parallel)
+    BOUT_OMP_PERF(parallel)
     {
       /// Create a local thread-scope working array
       auto k1d = Array<dcomplex>(
@@ -379,7 +375,7 @@ Field3D LaplacePCR::solve(const Field3D& rhs, const Field3D& x0) {
 
       // Loop over X and Y indices, including boundaries but not guard cells.
       // (unless periodic in x)
-      BOUT_OMP(for)
+      BOUT_OMP_PERF(for)
       for (int ind = 0; ind < nxny; ++ind) {
         // ind = (ix - xs)*(ye - ys + 1) + (iy - ys)
         int ix = xs + ind / ny;
@@ -387,10 +383,9 @@ Field3D LaplacePCR::solve(const Field3D& rhs, const Field3D& x0) {
 
         // Take DST in Z direction and put result in k1d
 
-        if (((ix < inbndry) && ((inner_boundary_flags & INVERT_SET) != 0)
-             && localmesh->firstX())
+        if (((ix < inbndry) && isInnerBoundaryFlagSetOnFirstX(INVERT_SET))
             || ((localmesh->LocalNx - ix - 1 < outbndry)
-                && ((outer_boundary_flags & INVERT_SET) != 0) && localmesh->lastX())) {
+                && isOuterBoundaryFlagSetOnLastX(INVERT_SET))) {
           // Use the values in x0 in the boundary
           DST(x0(ix, iy) + 1, localmesh->LocalNz - 2, std::begin(k1d));
         } else {
@@ -405,7 +400,7 @@ Field3D LaplacePCR::solve(const Field3D& rhs, const Field3D& x0) {
 
       // Get elements of the tridiagonal matrix
       // including boundary conditions
-      BOUT_OMP(for nowait)
+      BOUT_OMP_PERF(for nowait)
       for (int ind = 0; ind < nsys; ind++) {
         // ind = (iy - ys) * nmode + kz
         int iy = ys + ind / nmode;
@@ -417,23 +412,22 @@ Field3D LaplacePCR::solve(const Field3D& rhs, const Field3D& x0) {
         tridagMatrix(&a3D(ind, 0), &b3D(ind, 0), &c3D(ind, 0), &bcmplx3D(ind, 0), iy,
                      kz,    // wave number index
                      kwave, // kwave (inverse wave length)
-                     global_flags, inner_boundary_flags, outer_boundary_flags, &Acoef,
-                     &C1coef, &C2coef, &Dcoef,
+                     &Acoef, &C1coef, &C2coef, &Dcoef,
                      false); // Don't include guard cells in arrays
       }
-    } // BOUT_OMP(parallel)
+    } // BOUT_OMP_PERF(parallel)
 
     // Solve tridiagonal systems
     cr_pcr_solver(a3D, b3D, c3D, bcmplx3D, xcmplx3D);
 
     // FFT back to real space
-    BOUT_OMP(parallel)
+    BOUT_OMP_PERF(parallel)
     {
       /// Create a local thread-scope working array
       auto k1d = Array<dcomplex>(
           localmesh->LocalNz); // ZFFT routine expects input of this length
 
-      BOUT_OMP(for nowait)
+      BOUT_OMP_PERF(for nowait)
       for (int ind = 0; ind < nxny; ++ind) { // Loop over X and Y
         // ind = (ix - xs)*(ye - ys + 1) + (iy - ys)
         int ix = xs + ind / ny;
@@ -455,7 +449,7 @@ Field3D LaplacePCR::solve(const Field3D& rhs, const Field3D& x0) {
     }
   } else {
     const BoutReal zlength = getUniform(coords->zlength());
-    BOUT_OMP(parallel)
+    BOUT_OMP_PERF(parallel)
     {
       /// Create a local thread-scope working array
       auto k1d = Array<dcomplex>(localmesh->LocalNz / 2
@@ -464,7 +458,7 @@ Field3D LaplacePCR::solve(const Field3D& rhs, const Field3D& x0) {
       // Loop over X and Y indices, including boundaries but not guard cells
       // (unless periodic in x)
 
-      BOUT_OMP(for)
+      BOUT_OMP_PERF(for)
       for (int ind = 0; ind < nxny; ++ind) {
         // ind = (ix - xs)*(ye - ys + 1) + (iy - ys)
         int ix = xs + ind / ny;
@@ -472,10 +466,9 @@ Field3D LaplacePCR::solve(const Field3D& rhs, const Field3D& x0) {
 
         // Take FFT in Z direction, apply shift, and put result in k1d
 
-        if (((ix < inbndry) && ((inner_boundary_flags & INVERT_SET) != 0)
-             && localmesh->firstX())
+        if (((ix < inbndry) && isInnerBoundaryFlagSetOnFirstX(INVERT_SET))
             || ((localmesh->LocalNx - ix - 1 < outbndry)
-                && ((outer_boundary_flags & INVERT_SET) != 0) && localmesh->lastX())) {
+                && isOuterBoundaryFlagSetOnLastX(INVERT_SET))) {
           // Use the values in x0 in the boundary
           rfft(x0(ix, iy), localmesh->LocalNz, std::begin(k1d));
         } else {
@@ -490,7 +483,7 @@ Field3D LaplacePCR::solve(const Field3D& rhs, const Field3D& x0) {
 
       // Get elements of the tridiagonal matrix
       // including boundary conditions
-      BOUT_OMP(for nowait)
+      BOUT_OMP_PERF(for nowait)
       for (int ind = 0; ind < nsys; ind++) {
         // ind = (iy - ys) * nmode + kz
         int iy = ys + ind / nmode;
@@ -500,25 +493,24 @@ Field3D LaplacePCR::solve(const Field3D& rhs, const Field3D& x0) {
         tridagMatrix(&a3D(ind, 0), &b3D(ind, 0), &c3D(ind, 0), &bcmplx3D(ind, 0), iy,
                      kz,    // True for the component constant (DC) in Z
                      kwave, // Z wave number
-                     global_flags, inner_boundary_flags, outer_boundary_flags, &Acoef,
-                     &C1coef, &C2coef, &Dcoef,
+                     &Acoef, &C1coef, &C2coef, &Dcoef,
                      false); // Don't include guard cells in arrays
       }
-    } // BOUT_OMP(parallel)
+    } // BOUT_OMP_PERF(parallel)
 
     // Solve tridiagonal systems
     cr_pcr_solver(a3D, b3D, c3D, bcmplx3D, xcmplx3D);
 
     // FFT back to real space
-    BOUT_OMP(parallel)
+    BOUT_OMP_PERF(parallel)
     {
       /// Create a local thread-scope working array
       auto k1d = Array<dcomplex>((localmesh->LocalNz) / 2
                                  + 1); // ZFFT routine expects input of this length
 
-      const bool zero_DC = (global_flags & INVERT_ZERO_DC) != 0;
+      const bool zero_DC = isGlobalFlagSet(INVERT_ZERO_DC);
 
-      BOUT_OMP(for nowait)
+      BOUT_OMP_PERF(for nowait)
       for (int ind = 0; ind < nxny; ++ind) { // Loop over X and Y
         int ix = xs + ind / ny;
         int iy = ys + ind % ny;
diff --git a/src/invert/laplace/impls/pcr/pcr.hxx b/src/invert/laplace/impls/pcr/pcr.hxx
index 38b7c356d3..ec4637f56c 100644
--- a/src/invert/laplace/impls/pcr/pcr.hxx
+++ b/src/invert/laplace/impls/pcr/pcr.hxx
@@ -172,14 +172,6 @@ private:
   /// First and last interior points xstart, xend
   int xs, xe;
 
-  bool isGlobalFlagSet(int flag) const { return (global_flags & flag) != 0; }
-  bool isInnerBoundaryFlagSet(int flag) const {
-    return (inner_boundary_flags & flag) != 0;
-  }
-  bool isOuterBoundaryFlagSet(int flag) const {
-    return (outer_boundary_flags & flag) != 0;
-  }
-
   bool dst{false};
 };
 
diff --git a/src/invert/laplace/impls/pcr_thomas/pcr_thomas.cxx b/src/invert/laplace/impls/pcr_thomas/pcr_thomas.cxx
index 925fb842ce..61c8f58694 100644
--- a/src/invert/laplace/impls/pcr_thomas/pcr_thomas.cxx
+++ b/src/invert/laplace/impls/pcr_thomas/pcr_thomas.cxx
@@ -145,19 +145,19 @@ FieldPerp LaplacePCR_THOMAS::solve(const FieldPerp& rhs, const FieldPerp& x0) {
   // If the flags to assign that only one guard cell should be used is set
   int inbndry = localmesh->xstart;
   int outbndry = localmesh->xstart;
-  if (((global_flags & INVERT_BOTH_BNDRY_ONE) != 0) || (localmesh->xstart < 2)) {
+  if (isGlobalFlagSet(INVERT_BOTH_BNDRY_ONE) || (localmesh->xstart < 2)) {
     inbndry = outbndry = 1;
   }
-  if ((inner_boundary_flags & INVERT_BNDRY_ONE) != 0) {
+  if (isInnerBoundaryFlagSet(INVERT_BNDRY_ONE)) {
     inbndry = 1;
   }
-  if ((outer_boundary_flags & INVERT_BNDRY_ONE) != 0) {
+  if (isOuterBoundaryFlagSet(INVERT_BNDRY_ONE)) {
     outbndry = 1;
   }
 
   if (dst) {
     const BoutReal zlength = getUniform(coords->dz) * (localmesh->LocalNz - 3);
-    BOUT_OMP(parallel)
+    BOUT_OMP_PERF(parallel)
     {
       /// Create a local thread-scope working array
       auto k1d = Array<dcomplex>(
@@ -165,14 +165,13 @@ FieldPerp LaplacePCR_THOMAS::solve(const FieldPerp& rhs, const FieldPerp& x0) {
 
       // Loop over X indices, including boundaries but not guard cells. (unless periodic
       // in x)
-      BOUT_OMP(for)
+      BOUT_OMP_PERF(for)
       for (int ix = xs; ix <= xe; ix++) {
         // Take DST in Z direction and put result in k1d
 
-        if (((ix < inbndry) && ((inner_boundary_flags & INVERT_SET) != 0)
-             && localmesh->firstX())
+        if (((ix < inbndry) && isInnerBoundaryFlagSetOnFirstX(INVERT_SET))
             || ((localmesh->LocalNx - ix - 1 < outbndry)
-                && ((outer_boundary_flags & INVERT_SET) != 0) && localmesh->lastX())) {
+                && isOuterBoundaryFlagSetOnLastX(INVERT_SET))) {
           // Use the values in x0 in the boundary
           DST(x0[ix] + 1, localmesh->LocalNz - 2, std::begin(k1d));
         } else {
@@ -187,7 +186,7 @@ FieldPerp LaplacePCR_THOMAS::solve(const FieldPerp& rhs, const FieldPerp& x0) {
 
       // Get elements of the tridiagonal matrix
       // including boundary conditions
-      BOUT_OMP(for nowait)
+      BOUT_OMP_PERF(for nowait)
       for (int kz = 0; kz < nmode; kz++) {
         // wave number is 1/[rad]; DST has extra 2.
         const BoutReal kwave = kz * 2.0 * PI / (2. * zlength);
@@ -195,8 +194,7 @@ FieldPerp LaplacePCR_THOMAS::solve(const FieldPerp& rhs, const FieldPerp& x0) {
         tridagMatrix(&a(kz, 0), &b(kz, 0), &c(kz, 0), &bcmplx(kz, 0), jy,
                      kz,    // wave number index
                      kwave, // kwave (inverse wave length)
-                     global_flags, inner_boundary_flags, outer_boundary_flags, &Acoef,
-                     &C1coef, &C2coef, &Dcoef,
+                     &Acoef, &C1coef, &C2coef, &Dcoef,
                      false); // Don't include guard cells in arrays
       }
     }
@@ -205,13 +203,13 @@ FieldPerp LaplacePCR_THOMAS::solve(const FieldPerp& rhs, const FieldPerp& x0) {
     pcr_thomas_solver(a, b, c, bcmplx, xcmplx);
 
     // FFT back to real space
-    BOUT_OMP(parallel)
+    BOUT_OMP_PERF(parallel)
     {
       /// Create a local thread-scope working array
       auto k1d = Array<dcomplex>(
           localmesh->LocalNz); // ZFFT routine expects input of this length
 
-      BOUT_OMP(for nowait)
+      BOUT_OMP_PERF(for nowait)
       for (int ix = xs; ix <= xe; ix++) {
         for (int kz = 0; kz < nmode; kz++) {
           k1d[kz] = xcmplx(kz, ix - xs);
@@ -229,7 +227,7 @@ FieldPerp LaplacePCR_THOMAS::solve(const FieldPerp& rhs, const FieldPerp& x0) {
     }
   } else {
     const BoutReal zlength = getUniform(coords->zlength());
-    BOUT_OMP(parallel)
+    BOUT_OMP_PERF(parallel)
     {
       /// Create a local thread-scope working array
       auto k1d = Array<dcomplex>((localmesh->LocalNz) / 2
@@ -237,14 +235,13 @@ FieldPerp LaplacePCR_THOMAS::solve(const FieldPerp& rhs, const FieldPerp& x0) {
 
       // Loop over X indices, including boundaries but not guard cells (unless periodic in
       // x)
-      BOUT_OMP(for)
+      BOUT_OMP_PERF(for)
       for (int ix = xs; ix <= xe; ix++) {
         // Take FFT in Z direction, apply shift, and put result in k1d
 
-        if (((ix < inbndry) && ((inner_boundary_flags & INVERT_SET) != 0)
-             && localmesh->firstX())
+        if (((ix < inbndry) && isInnerBoundaryFlagSetOnFirstX(INVERT_SET))
             || ((localmesh->LocalNx - ix - 1 < outbndry)
-                && ((outer_boundary_flags & INVERT_SET) != 0) && localmesh->lastX())) {
+                && isOuterBoundaryFlagSetOnLastX(INVERT_SET))) {
           // Use the values in x0 in the boundary
           rfft(x0[ix], localmesh->LocalNz, std::begin(k1d));
         } else {
@@ -259,14 +256,13 @@ FieldPerp LaplacePCR_THOMAS::solve(const FieldPerp& rhs, const FieldPerp& x0) {
 
       // Get elements of the tridiagonal matrix
       // including boundary conditions
-      BOUT_OMP(for nowait)
+      BOUT_OMP_PERF(for nowait)
       for (int kz = 0; kz < nmode; kz++) {
         const BoutReal kwave = kz * 2.0 * PI / zlength; // wave number is 1/[rad]
         tridagMatrix(&a(kz, 0), &b(kz, 0), &c(kz, 0), &bcmplx(kz, 0), jy,
                      kz,    // True for the component constant (DC) in Z
                      kwave, // Z wave number
-                     global_flags, inner_boundary_flags, outer_boundary_flags, &Acoef,
-                     &C1coef, &C2coef, &Dcoef,
+                     &Acoef, &C1coef, &C2coef, &Dcoef,
                      false); // Don't include guard cells in arrays
       }
     }
@@ -275,15 +271,15 @@ FieldPerp LaplacePCR_THOMAS::solve(const FieldPerp& rhs, const FieldPerp& x0) {
     pcr_thomas_solver(a, b, c, bcmplx, xcmplx);
 
     // FFT back to real space
-    BOUT_OMP(parallel)
+    BOUT_OMP_PERF(parallel)
     {
       /// Create a local thread-scope working array
       auto k1d = Array<dcomplex>((localmesh->LocalNz) / 2
                                  + 1); // ZFFT routine expects input of this length
 
-      const bool zero_DC = (global_flags & INVERT_ZERO_DC) != 0;
+      const bool zero_DC = isGlobalFlagSet(INVERT_ZERO_DC);
 
-      BOUT_OMP(for nowait)
+      BOUT_OMP_PERF(for nowait)
       for (int ix = xs; ix <= xe; ix++) {
         if (zero_DC) {
           k1d[0] = 0.;
@@ -323,13 +319,13 @@ Field3D LaplacePCR_THOMAS::solve(const Field3D& rhs, const Field3D& x0) {
   // If the flags to assign that only one guard cell should be used is set
   int inbndry = localmesh->xstart;
   int outbndry = localmesh->xstart;
-  if (((global_flags & INVERT_BOTH_BNDRY_ONE) != 0) || (localmesh->xstart < 2)) {
+  if (isGlobalFlagSet(INVERT_BOTH_BNDRY_ONE) || (localmesh->xstart < 2)) {
     inbndry = outbndry = 1;
   }
-  if ((inner_boundary_flags & INVERT_BNDRY_ONE) != 0) {
+  if (isInnerBoundaryFlagSet(INVERT_BNDRY_ONE)) {
     inbndry = 1;
   }
-  if ((outer_boundary_flags & INVERT_BNDRY_ONE) != 0) {
+  if (isOuterBoundaryFlagSet(INVERT_BNDRY_ONE)) {
     outbndry = 1;
   }
 
@@ -367,7 +363,7 @@ Field3D LaplacePCR_THOMAS::solve(const Field3D& rhs, const Field3D& x0) {
 
   if (dst) {
     const BoutReal zlength = getUniform(coords->dz) * (localmesh->LocalNz - 3);
-    BOUT_OMP(parallel)
+    BOUT_OMP_PERF(parallel)
     {
       /// Create a local thread-scope working array
       auto k1d = Array<dcomplex>(
@@ -375,7 +371,7 @@ Field3D LaplacePCR_THOMAS::solve(const Field3D& rhs, const Field3D& x0) {
 
       // Loop over X and Y indices, including boundaries but not guard cells.
       // (unless periodic in x)
-      BOUT_OMP(for)
+      BOUT_OMP_PERF(for)
       for (int ind = 0; ind < nxny; ++ind) {
         // ind = (ix - xs)*(ye - ys + 1) + (iy - ys)
         int ix = xs + ind / ny;
@@ -383,10 +379,9 @@ Field3D LaplacePCR_THOMAS::solve(const Field3D& rhs, const Field3D& x0) {
 
         // Take DST in Z direction and put result in k1d
 
-        if (((ix < inbndry) && ((inner_boundary_flags & INVERT_SET) != 0)
-             && localmesh->firstX())
+        if (((ix < inbndry) && isInnerBoundaryFlagSetOnFirstX(INVERT_SET))
             || ((localmesh->LocalNx - ix - 1 < outbndry)
-                && ((outer_boundary_flags & INVERT_SET) != 0) && localmesh->lastX())) {
+                && isOuterBoundaryFlagSetOnLastX(INVERT_SET))) {
           // Use the values in x0 in the boundary
           DST(x0(ix, iy) + 1, localmesh->LocalNz - 2, std::begin(k1d));
         } else {
@@ -401,7 +396,7 @@ Field3D LaplacePCR_THOMAS::solve(const Field3D& rhs, const Field3D& x0) {
 
       // Get elements of the tridiagonal matrix
       // including boundary conditions
-      BOUT_OMP(for nowait)
+      BOUT_OMP_PERF(for nowait)
       for (int ind = 0; ind < nsys; ind++) {
         // ind = (iy - ys) * nmode + kz
         int iy = ys + ind / nmode;
@@ -413,8 +408,7 @@ Field3D LaplacePCR_THOMAS::solve(const Field3D& rhs, const Field3D& x0) {
         tridagMatrix(&a3D(ind, 0), &b3D(ind, 0), &c3D(ind, 0), &bcmplx3D(ind, 0), iy,
                      kz,    // wave number index
                      kwave, // kwave (inverse wave length)
-                     global_flags, inner_boundary_flags, outer_boundary_flags, &Acoef,
-                     &C1coef, &C2coef, &Dcoef,
+                     &Acoef, &C1coef, &C2coef, &Dcoef,
                      false); // Don't include guard cells in arrays
       }
     }
@@ -423,13 +417,13 @@ Field3D LaplacePCR_THOMAS::solve(const Field3D& rhs, const Field3D& x0) {
     pcr_thomas_solver(a3D, b3D, c3D, bcmplx3D, xcmplx3D);
 
     // FFT back to real space
-    BOUT_OMP(parallel)
+    BOUT_OMP_PERF(parallel)
     {
       /// Create a local thread-scope working array
       auto k1d = Array<dcomplex>(
           localmesh->LocalNz); // ZFFT routine expects input of this length
 
-      BOUT_OMP(for nowait)
+      BOUT_OMP_PERF(for nowait)
       for (int ind = 0; ind < nxny; ++ind) { // Loop over X and Y
         // ind = (ix - xs)*(ye - ys + 1) + (iy - ys)
         int ix = xs + ind / ny;
@@ -451,7 +445,7 @@ Field3D LaplacePCR_THOMAS::solve(const Field3D& rhs, const Field3D& x0) {
     }
   } else {
     const BoutReal zlength = getUniform(coords->zlength());
-    BOUT_OMP(parallel)
+    BOUT_OMP_PERF(parallel)
     {
       /// Create a local thread-scope working array
       auto k1d = Array<dcomplex>(localmesh->LocalNz / 2
@@ -460,7 +454,7 @@ Field3D LaplacePCR_THOMAS::solve(const Field3D& rhs, const Field3D& x0) {
       // Loop over X and Y indices, including boundaries but not guard cells
       // (unless periodic in x)
 
-      BOUT_OMP(for)
+      BOUT_OMP_PERF(for)
       for (int ind = 0; ind < nxny; ++ind) {
         // ind = (ix - xs)*(ye - ys + 1) + (iy - ys)
         int ix = xs + ind / ny;
@@ -468,10 +462,9 @@ Field3D LaplacePCR_THOMAS::solve(const Field3D& rhs, const Field3D& x0) {
 
         // Take FFT in Z direction, apply shift, and put result in k1d
 
-        if (((ix < inbndry) && ((inner_boundary_flags & INVERT_SET) != 0)
-             && localmesh->firstX())
+        if (((ix < inbndry) && isInnerBoundaryFlagSetOnFirstX(INVERT_SET))
             || ((localmesh->LocalNx - ix - 1 < outbndry)
-                && ((outer_boundary_flags & INVERT_SET) != 0) && localmesh->lastX())) {
+                && isOuterBoundaryFlagSetOnLastX(INVERT_SET))) {
           // Use the values in x0 in the boundary
           rfft(x0(ix, iy), localmesh->LocalNz, std::begin(k1d));
         } else {
@@ -486,7 +479,7 @@ Field3D LaplacePCR_THOMAS::solve(const Field3D& rhs, const Field3D& x0) {
 
       // Get elements of the tridiagonal matrix
       // including boundary conditions
-      BOUT_OMP(for nowait)
+      BOUT_OMP_PERF(for nowait)
       for (int ind = 0; ind < nsys; ind++) {
         // ind = (iy - ys) * nmode + kz
         int iy = ys + ind / nmode;
@@ -497,8 +490,7 @@ Field3D LaplacePCR_THOMAS::solve(const Field3D& rhs, const Field3D& x0) {
         tridagMatrix(&a3D(ind, 0), &b3D(ind, 0), &c3D(ind, 0), &bcmplx3D(ind, 0), iy,
                      kz,    // True for the component constant (DC) in Z
                      kwave, // Z wave number
-                     global_flags, inner_boundary_flags, outer_boundary_flags, &Acoef,
-                     &C1coef, &C2coef, &Dcoef,
+                     &Acoef, &C1coef, &C2coef, &Dcoef,
                      false); // Don't include guard cells in arrays
       }
     }
@@ -507,15 +499,15 @@ Field3D LaplacePCR_THOMAS::solve(const Field3D& rhs, const Field3D& x0) {
     pcr_thomas_solver(a3D, b3D, c3D, bcmplx3D, xcmplx3D);
 
     // FFT back to real space
-    BOUT_OMP(parallel)
+    BOUT_OMP_PERF(parallel)
     {
       /// Create a local thread-scope working array
       auto k1d = Array<dcomplex>((localmesh->LocalNz) / 2
                                  + 1); // ZFFT routine expects input of this length
 
-      const bool zero_DC = (global_flags & INVERT_ZERO_DC) != 0;
+      const bool zero_DC = isGlobalFlagSet(INVERT_ZERO_DC);
 
-      BOUT_OMP(for nowait)
+      BOUT_OMP_PERF(for nowait)
       for (int ind = 0; ind < nxny; ++ind) { // Loop over X and Y
         int ix = xs + ind / ny;
         int iy = ys + ind % ny;
diff --git a/src/invert/laplace/impls/pcr_thomas/pcr_thomas.hxx b/src/invert/laplace/impls/pcr_thomas/pcr_thomas.hxx
index 009a1def2b..e12a647789 100644
--- a/src/invert/laplace/impls/pcr_thomas/pcr_thomas.hxx
+++ b/src/invert/laplace/impls/pcr_thomas/pcr_thomas.hxx
@@ -175,14 +175,6 @@ private:
   /// First and last interior points xstart, xend
   int xs, xe;
 
-  bool isGlobalFlagSet(int flag) const { return (global_flags & flag) != 0; }
-  bool isInnerBoundaryFlagSet(int flag) const {
-    return (inner_boundary_flags & flag) != 0;
-  }
-  bool isOuterBoundaryFlagSet(int flag) const {
-    return (outer_boundary_flags & flag) != 0;
-  }
-
   bool dst{false};
 };
 
diff --git a/src/invert/laplace/impls/petsc/petsc_laplace.cxx b/src/invert/laplace/impls/petsc/petsc_laplace.cxx
index d125b90694..f06f4c7de6 100644
--- a/src/invert/laplace/impls/petsc/petsc_laplace.cxx
+++ b/src/invert/laplace/impls/petsc/petsc_laplace.cxx
@@ -23,7 +23,8 @@
  * along with BOUT++.  If not, see <http://www.gnu.org/licenses/>.
  *
  **************************************************************************/
-#include "bout/build_config.hxx"
+
+#include "bout/build_defines.hxx"
 
 #if BOUT_HAS_PETSC
 
@@ -32,6 +33,8 @@
 #include <bout/assert.hxx>
 #include <bout/boutcomm.hxx>
 #include <bout/mesh.hxx>
+#include <bout/output.hxx>
+#include <bout/petsclib.hxx>
 #include <bout/sys/timer.hxx>
 #include <bout/utils.hxx>
 
@@ -49,14 +52,13 @@
 #define KSP_PREONLY "preonly"
 
 static PetscErrorCode laplacePCapply(PC pc, Vec x, Vec y) {
-  int ierr;
+  PetscFunctionBegin; // NOLINT
 
-  // Get the context
-  LaplacePetsc* s;
-  ierr = PCShellGetContext(pc, reinterpret_cast<void**>(&s));
+  void* ctx = nullptr; // Shell context of pc; used as a LaplacePetsc* below
+  const PetscErrorCode ierr = PCShellGetContext(pc, &ctx);
   CHKERRQ(ierr);
 
-  PetscFunctionReturn(s->precon(x, y));
+  PetscFunctionReturn(static_cast<LaplacePetsc*>(ctx)->precon(x, y)); // NOLINT
 }
 
 LaplacePetsc::LaplacePetsc(Options* opt, const CELL_LOC loc, Mesh* mesh_in,
@@ -79,28 +81,9 @@ LaplacePetsc::LaplacePetsc(Options* opt, const CELL_LOC loc, Mesh* mesh_in,
   }
 
 #if CHECK > 0
-  // These are the implemented flags
-  implemented_flags = INVERT_START_NEW;
-  implemented_boundary_flags = INVERT_AC_GRAD + INVERT_SET + INVERT_RHS;
   // Checking flags are set to something which is not implemented
-  // This is done binary (which is possible as each flag is a power of 2)
-  if (global_flags & ~implemented_flags) {
-    if (global_flags & INVERT_4TH_ORDER) {
-      output << "For PETSc based Laplacian inverter, use 'fourth_order=true' instead of "
-                "setting INVERT_4TH_ORDER flag"
-             << endl;
-    }
-    throw BoutException("Attempted to set Laplacian inversion flag that is not "
-                        "implemented in petsc_laplace.cxx");
-  }
-  if (inner_boundary_flags & ~implemented_boundary_flags) {
-    throw BoutException("Attempted to set Laplacian inversion boundary flag that is not "
-                        "implemented in petsc_laplace.cxx");
-  }
-  if (outer_boundary_flags & ~implemented_boundary_flags) {
-    throw BoutException("Attempted to set Laplacian inversion boundary flag that is not "
-                        "implemented in petsc_laplace.cxx");
-  }
+  checkFlags();
+
   if (localmesh->periodicX) {
     throw BoutException("LaplacePetsc does not work with periodicity in the x direction "
                         "(localmesh->PeriodicX == true). Change boundary conditions or "
@@ -360,25 +343,7 @@ FieldPerp LaplacePetsc::solve(const FieldPerp& b, const FieldPerp& x0) {
   ASSERT1(x0.getLocation() == location);
 
 #if CHECK > 0
-  // Checking flags are set to something which is not implemented (see
-  // constructor for details)
-  if (global_flags & !implemented_flags) {
-    if (global_flags & INVERT_4TH_ORDER) {
-      output << "For PETSc based Laplacian inverter, use 'fourth_order=true' instead of "
-                "setting INVERT_4TH_ORDER flag"
-             << endl;
-    }
-    throw BoutException("Attempted to set Laplacian inversion flag that is not "
-                        "implemented in petsc_laplace.cxx");
-  }
-  if (inner_boundary_flags & ~implemented_boundary_flags) {
-    throw BoutException("Attempted to set Laplacian inversion boundary flag that is not "
-                        "implemented in petsc_laplace.cxx");
-  }
-  if (outer_boundary_flags & ~implemented_boundary_flags) {
-    throw BoutException("Attempted to set Laplacian inversion boundary flag that is not "
-                        "implemented in petsc_laplace.cxx");
-  }
+  checkFlags();
 #endif
 
   int y = b.getIndex(); // Get the Y index
@@ -415,7 +380,7 @@ FieldPerp LaplacePetsc::solve(const FieldPerp& b, const FieldPerp& x0) {
         for (int z = 0; z < localmesh->LocalNz; z++) {
           PetscScalar val; // Value of element to be set in the matrix
           // If Neumann Boundary Conditions are set.
-          if (inner_boundary_flags & INVERT_AC_GRAD) {
+          if (isInnerBoundaryFlagSet(INVERT_AC_GRAD)) {
             // Set values corresponding to nodes adjacent in x
             if (fourth_order) {
               // Fourth Order Accuracy on Boundary
@@ -472,9 +437,9 @@ FieldPerp LaplacePetsc::solve(const FieldPerp& b, const FieldPerp& x0) {
 
           // Set Components of RHS
           // If the inner boundary value should be set by b or x0
-          if (inner_boundary_flags & INVERT_RHS) {
+          if (isInnerBoundaryFlagSet(INVERT_RHS)) {
             val = b[x][z];
-          } else if (inner_boundary_flags & INVERT_SET) {
+          } else if (isInnerBoundaryFlagSet(INVERT_SET)) {
             val = x0[x][z];
           }
 
@@ -680,7 +645,7 @@ FieldPerp LaplacePetsc::solve(const FieldPerp& b, const FieldPerp& x0) {
           Element(i, x, z, 0, 0, val, MatA);
 
           // If Neumann Boundary Conditions are set.
-          if (outer_boundary_flags & INVERT_AC_GRAD) {
+          if (isOuterBoundaryFlagSet(INVERT_AC_GRAD)) {
             // Set values corresponding to nodes adjacent in x
             if (fourth_order) {
               // Fourth Order Accuracy on Boundary
@@ -733,9 +698,9 @@ FieldPerp LaplacePetsc::solve(const FieldPerp& b, const FieldPerp& x0) {
           // Set Components of RHS
           // If the inner boundary value should be set by b or x0
           val = 0;
-          if (outer_boundary_flags & INVERT_RHS) {
+          if (isOuterBoundaryFlagSet(INVERT_RHS)) {
             val = b[x][z];
-          } else if (outer_boundary_flags & INVERT_SET) {
+          } else if (isOuterBoundaryFlagSet(INVERT_SET)) {
             val = x0[x][z];
           }
 
@@ -812,7 +777,7 @@ FieldPerp LaplacePetsc::solve(const FieldPerp& b, const FieldPerp& x0) {
       KSPSetTolerances(ksp, rtol, atol, dtol, maxits);
 
       // If the initial guess is not set to zero
-      if (!(global_flags & INVERT_START_NEW)) {
+      if (!isGlobalFlagSet(INVERT_START_NEW)) {
         KSPSetInitialGuessNonzero(ksp, static_cast<PetscBool>(true));
       }
 
@@ -1194,4 +1159,24 @@ int LaplacePetsc::precon(Vec x, Vec y) {
   return 0;
 }
 
+/// Throw BoutException if a flag outside the implemented sets is requested
+void LaplacePetsc::checkFlags() {
+  if (isGlobalFlagSet(~implemented_flags)) {
+    if (isGlobalFlagSet(INVERT_4TH_ORDER)) {
+      output_error.write(
+          "For PETSc based Laplacian inverter, use 'fourth_order=true' instead of "
+          "setting INVERT_4TH_ORDER flag\n");
+    }
+    throw BoutException("Attempted to set Laplacian inversion flag that is not "
+                        "implemented in petsc_laplace.cxx");
+  }
+
+  // Inner and outer boundaries are checked against the same set, with the same error
+  if (isInnerBoundaryFlagSet(~implemented_boundary_flags)
+      || isOuterBoundaryFlagSet(~implemented_boundary_flags)) {
+    throw BoutException("Attempted to set Laplacian inversion boundary flag that is not "
+                        "implemented in petsc_laplace.cxx");
+  }
+}
+
 #endif // BOUT_HAS_PETSC_3_3
diff --git a/src/invert/laplace/impls/petsc/petsc_laplace.hxx b/src/invert/laplace/impls/petsc/petsc_laplace.hxx
index 3b1d3bcb49..55482644be 100644
--- a/src/invert/laplace/impls/petsc/petsc_laplace.hxx
+++ b/src/invert/laplace/impls/petsc/petsc_laplace.hxx
@@ -26,8 +26,8 @@
  *
  **************************************************************************/
 
-#ifndef __PETSC_LAPLACE_H__
-#define __PETSC_LAPLACE_H__
+#ifndef BOUT_PETSC_LAPLACE_H
+#define BOUT_PETSC_LAPLACE_H
 
 #include "bout/build_config.hxx"
 #include "bout/invert_laplace.hxx"
@@ -254,12 +254,13 @@ private:
   void vecToField(Vec x, FieldPerp& f);       // Copy a vector into a fieldperp
   void fieldToVec(const FieldPerp& f, Vec x); // Copy a fieldperp into a vector
 
-#if CHECK > 0
-  int implemented_flags;
-  int implemented_boundary_flags;
-#endif
+  static constexpr int implemented_flags = INVERT_START_NEW;
+  static constexpr int implemented_boundary_flags =
+      INVERT_AC_GRAD | INVERT_SET | INVERT_RHS;
+
+  void checkFlags();
 };
 
 #endif //BOUT_HAS_PETSC
 
-#endif //__PETSC_LAPLACE_H__
+#endif //BOUT_PETSC_LAPLACE_H
diff --git a/src/invert/laplace/impls/petsc3damg/petsc3damg.cxx b/src/invert/laplace/impls/petsc3damg/petsc3damg.cxx
index d1e2207725..a7bfd209ee 100644
--- a/src/invert/laplace/impls/petsc3damg/petsc3damg.cxx
+++ b/src/invert/laplace/impls/petsc3damg/petsc3damg.cxx
@@ -84,12 +84,12 @@ LaplacePetsc3dAmg::LaplacePetsc3dAmg(Options* opt, const CELL_LOC loc, Mesh* mes
 #if CHECK > 0
   // Checking flags are set to something which is not implemented
   // This is done binary (which is possible as each flag is a power of 2)
-  if (flagSet(global_flags, INVERT_4TH_ORDER)) {
+  if (isGlobalFlagSet(INVERT_4TH_ORDER)) {
     output.write("For PETSc based Laplacian inverter, use 'fourth_order=true' instead of "
                  "setting INVERT_4TH_ORDER flag\n");
   }
 
-  if (flagSet(global_flags, ~implemented_flags)) {
+  if (isGlobalFlagSet(~implemented_flags)) {
     throw BoutException("Attempted to set global Laplacian inversion flag that is not "
                         "implemented in petsc_laplace.cxx");
   }
@@ -102,8 +102,8 @@ LaplacePetsc3dAmg::LaplacePetsc3dAmg(Options* opt, const CELL_LOC loc, Mesh* mes
                           name);
     }
   };
-  unimplementedBoundaryFlag(inner_boundary_flags, "inner");
-  unimplementedBoundaryFlag(outer_boundary_flags, "outer");
+  unimplementedBoundaryFlag(getInnerBoundaryFlags(), "inner");
+  unimplementedBoundaryFlag(getOuterBoundaryFlags(), "outer");
   unimplementedBoundaryFlag(lower_boundary_flags, "lower");
   unimplementedBoundaryFlag(upper_boundary_flags, "upper");
 
@@ -119,7 +119,7 @@ LaplacePetsc3dAmg::LaplacePetsc3dAmg(Options* opt, const CELL_LOC loc, Mesh* mes
   }
 
   // Set up boundary conditions in operator
-  const bool inner_X_neumann = flagSet(inner_boundary_flags, INVERT_AC_GRAD);
+  const bool inner_X_neumann = isInnerBoundaryFlagSet(INVERT_AC_GRAD);
   const auto inner_X_BC = inner_X_neumann ? -1. / coords->dx / sqrt(coords->g_11) : 0.5;
   const auto inner_X_BC_plus = inner_X_neumann ? -inner_X_BC : 0.5;
 
@@ -128,7 +128,7 @@ LaplacePetsc3dAmg::LaplacePetsc3dAmg(Options* opt, const CELL_LOC loc, Mesh* mes
     operator3D(i, i.xp()) = inner_X_BC_plus[i];
   }
 
-  const bool outer_X_neumann = flagSet(outer_boundary_flags, INVERT_AC_GRAD);
+  const bool outer_X_neumann = isOuterBoundaryFlagSet(INVERT_AC_GRAD);
   const auto outer_X_BC = outer_X_neumann ? 1. / coords->dx / sqrt(coords->g_11) : 0.5;
   const auto outer_X_BC_minus = outer_X_neumann ? -outer_X_BC : 0.5;
 
@@ -191,8 +191,8 @@ Field3D LaplacePetsc3dAmg::solve(const Field3D& b_in, const Field3D& x0) {
 
   // Adjust vectors to represent boundary conditions and check that
   // boundary cells are finite
-  setBC(rhs, b_in, indexer->getRegionInnerX(), inner_boundary_flags, x0);
-  setBC(rhs, b_in, indexer->getRegionOuterX(), outer_boundary_flags, x0);
+  setBC(rhs, b_in, indexer->getRegionInnerX(), getInnerBoundaryFlags(), x0);
+  setBC(rhs, b_in, indexer->getRegionOuterX(), getOuterBoundaryFlags(), x0);
   setBC(rhs, b_in, indexer->getRegionLowerY(), lower_boundary_flags, x0);
   setBC(rhs, b_in, indexer->getRegionUpperY(), upper_boundary_flags, x0);
 
@@ -460,7 +460,7 @@ void LaplacePetsc3dAmg::updateMatrix3D() {
     KSPSetTolerances(ksp, rtol, atol, dtol, maxits);
 
     // If the initial guess is not set to zero
-    if ((global_flags & INVERT_START_NEW) == 0) {
+    if (!isGlobalFlagSet(INVERT_START_NEW)) {
       KSPSetInitialGuessNonzero(ksp, (PetscBool) true);
     }
 
diff --git a/src/invert/laplace/impls/petsc3damg/petsc3damg.hxx b/src/invert/laplace/impls/petsc3damg/petsc3damg.hxx
index 99a04bd2dd..456b85b5e6 100644
--- a/src/invert/laplace/impls/petsc3damg/petsc3damg.hxx
+++ b/src/invert/laplace/impls/petsc3damg/petsc3damg.hxx
@@ -27,8 +27,8 @@
  **************************************************************************/
 class LaplacePetsc3dAmg;
 
-#ifndef __PETSC_LAPLACE_3DAMG_H__
-#define __PETSC_LAPLACE_3DAMG_H__
+#ifndef BOUT_PETSC_LAPLACE_3DAMG_H
+#define BOUT_PETSC_LAPLACE_3DAMG_H
 
 #include "bout/build_config.hxx"
 #include "bout/invert_laplace.hxx"
@@ -228,4 +228,4 @@ private:
 
 #endif //BOUT_HAS_PETSC
 
-#endif //__PETSC_LAPLACE_3DAMG_H__
+#endif //BOUT_PETSC_LAPLACE_3DAMG_H
diff --git a/src/invert/laplace/impls/serial_band/serial_band.cxx b/src/invert/laplace/impls/serial_band/serial_band.cxx
index 955e9a7ed1..4e7bb4c63f 100644
--- a/src/invert/laplace/impls/serial_band/serial_band.cxx
+++ b/src/invert/laplace/impls/serial_band/serial_band.cxx
@@ -99,16 +99,16 @@ FieldPerp LaplaceSerialBand::solve(const FieldPerp& b, const FieldPerp& x0) {
 
   int xbndry = localmesh->xstart; // Width of the x boundary
   // If the flags to assign that only one guard cell should be used is set
-  if ((global_flags & INVERT_BOTH_BNDRY_ONE) || (localmesh->xstart < 2)) {
+  if (isGlobalFlagSet(INVERT_BOTH_BNDRY_ONE) || (localmesh->xstart < 2)) {
     xbndry = 1;
   }
 
-  BOUT_OMP(parallel for)
+  BOUT_OMP_PERF(parallel for)
   for (int ix = 0; ix < localmesh->LocalNx; ix++) {
     // for fixed ix,jy set a complex vector rho(z)
 
-    if (((ix < xbndry) && (inner_boundary_flags & INVERT_SET))
-        || ((ncx - ix < xbndry) && (outer_boundary_flags & INVERT_SET))) {
+    if (((ix < xbndry) && isInnerBoundaryFlagSet(INVERT_SET))
+        || ((ncx - ix < xbndry) && (isOuterBoundaryFlagSet(INVERT_SET)))) {
       // Use the values in x0 in the boundary
       rfft(x0[ix], ncz, &bk(ix, 0));
     } else {
@@ -247,10 +247,10 @@ FieldPerp LaplaceSerialBand::solve(const FieldPerp& b, const FieldPerp& x0) {
     for (int ix = 0; ix < xbndry; ix++) {
       // Set zero-value. Change to zero-gradient if needed
 
-      if (!(inner_boundary_flags & (INVERT_RHS | INVERT_SET))) {
+      if (!isInnerBoundaryFlagSet(INVERT_RHS | INVERT_SET)) {
         bk1d[ix] = 0.0;
       }
-      if (!(outer_boundary_flags & (INVERT_RHS | INVERT_SET))) {
+      if (!isOuterBoundaryFlagSet(INVERT_RHS | INVERT_SET)) {
         bk1d[ncx - ix] = 0.0;
       }
 
@@ -265,8 +265,8 @@ FieldPerp LaplaceSerialBand::solve(const FieldPerp& b, const FieldPerp& x0) {
       // DC
 
       // Inner boundary
-      if (inner_boundary_flags & (INVERT_DC_GRAD + INVERT_SET)
-          || inner_boundary_flags & (INVERT_DC_GRAD + INVERT_RHS)) {
+      if (isInnerBoundaryFlagSet(INVERT_DC_GRAD + INVERT_SET)
+          || isInnerBoundaryFlagSet(INVERT_DC_GRAD + INVERT_RHS)) {
         // Zero gradient at inner boundary. 2nd-order accurate
         // Boundary at midpoint
         for (int ix = 0; ix < xbndry; ix++) {
@@ -277,7 +277,7 @@ FieldPerp LaplaceSerialBand::solve(const FieldPerp& b, const FieldPerp& x0) {
           A(ix, 4) = 0.;
         }
 
-      } else if (inner_boundary_flags & INVERT_DC_GRAD) {
+      } else if (isInnerBoundaryFlagSet(INVERT_DC_GRAD)) {
         // Zero gradient at inner boundary. 2nd-order accurate
         // Boundary at midpoint
         for (int ix = 0; ix < xbndry; ix++) {
@@ -288,7 +288,7 @@ FieldPerp LaplaceSerialBand::solve(const FieldPerp& b, const FieldPerp& x0) {
           A(ix, 4) = 0.;
         }
 
-      } else if (inner_boundary_flags & INVERT_DC_GRADPAR) {
+      } else if (isInnerBoundaryFlagSet(INVERT_DC_GRADPAR)) {
         for (int ix = 0; ix < xbndry; ix++) {
           A(ix, 0) = 0.;
           A(ix, 1) = 0.;
@@ -296,7 +296,7 @@ FieldPerp LaplaceSerialBand::solve(const FieldPerp& b, const FieldPerp& x0) {
           A(ix, 3) = 4. / sqrt(coords->g_22(ix + 1, jy));
           A(ix, 4) = -1. / sqrt(coords->g_22(ix + 2, jy));
         }
-      } else if (inner_boundary_flags & INVERT_DC_GRADPARINV) {
+      } else if (isInnerBoundaryFlagSet(INVERT_DC_GRADPARINV)) {
         for (int ix = 0; ix < xbndry; ix++) {
           A(ix, 0) = 0.;
           A(ix, 1) = 0.;
@@ -304,7 +304,7 @@ FieldPerp LaplaceSerialBand::solve(const FieldPerp& b, const FieldPerp& x0) {
           A(ix, 3) = 4. * sqrt(coords->g_22(ix + 1, jy));
           A(ix, 4) = -sqrt(coords->g_22(ix + 2, jy));
         }
-      } else if (inner_boundary_flags & INVERT_DC_LAP) {
+      } else if (isInnerBoundaryFlagSet(INVERT_DC_LAP)) {
         for (int ix = 0; ix < xbndry; ix++) {
           A(ix, 0) = 0.;
           A(ix, 1) = 0.;
@@ -315,7 +315,7 @@ FieldPerp LaplaceSerialBand::solve(const FieldPerp& b, const FieldPerp& x0) {
       }
 
       // Outer boundary
-      if (outer_boundary_flags & INVERT_DC_GRAD) {
+      if (isOuterBoundaryFlagSet(INVERT_DC_GRAD)) {
         // Zero gradient at outer boundary
         for (int ix = 0; ix < xbndry; ix++) {
           A(ncx - ix, 1) = -1.0;
@@ -326,12 +326,12 @@ FieldPerp LaplaceSerialBand::solve(const FieldPerp& b, const FieldPerp& x0) {
       // AC
 
       // Inner boundarySQ(kwave)*coef2
-      if (inner_boundary_flags & INVERT_AC_GRAD) {
+      if (isInnerBoundaryFlagSet(INVERT_AC_GRAD)) {
         // Zero gradient at inner boundary
         for (int ix = 0; ix < xbndry; ix++) {
           A(ix, 3) = -1.0;
         }
-      } else if (inner_boundary_flags & INVERT_AC_LAP) {
+      } else if (isInnerBoundaryFlagSet(INVERT_AC_LAP)) {
         // Enforce zero laplacian for 2nd and 4th-order
 
         int ix = 1;
@@ -369,12 +369,12 @@ FieldPerp LaplaceSerialBand::solve(const FieldPerp& b, const FieldPerp& x0) {
       }
 
       // Outer boundary
-      if (outer_boundary_flags & INVERT_AC_GRAD) {
+      if (isOuterBoundaryFlagSet(INVERT_AC_GRAD)) {
         // Zero gradient at outer boundary
         for (int ix = 0; ix < xbndry; ix++) {
           A(ncx - ix, 1) = -1.0;
         }
-      } else if (outer_boundary_flags & INVERT_AC_LAP) {
+      } else if (isOuterBoundaryFlagSet(INVERT_AC_LAP)) {
         // Enforce zero laplacian for 2nd and 4th-order
         // NOTE: Currently ignoring XZ term and coef4 assumed zero on boundary
         // FIX THIS IF IT WORKS
@@ -417,7 +417,7 @@ FieldPerp LaplaceSerialBand::solve(const FieldPerp& b, const FieldPerp& x0) {
     // Perform inversion
     cband_solve(A, localmesh->LocalNx, 2, 2, bk1d);
 
-    if ((global_flags & INVERT_KX_ZERO) && (iz == 0)) {
+    if (isGlobalFlagSet(INVERT_KX_ZERO) && (iz == 0)) {
       // Set the Kx = 0, n = 0 component to zero. For now just subtract
       // Should do in the inversion e.g. Sherman-Morrison formula
 
@@ -440,7 +440,7 @@ FieldPerp LaplaceSerialBand::solve(const FieldPerp& b, const FieldPerp& x0) {
   // Done inversion, transform back
 
   for (int ix = 0; ix <= ncx; ix++) {
-    if (global_flags & INVERT_ZERO_DC) {
+    if (isGlobalFlagSet(INVERT_ZERO_DC)) {
       xk(ix, 0) = 0.0;
     }
 
diff --git a/src/invert/laplace/impls/serial_band/serial_band.hxx b/src/invert/laplace/impls/serial_band/serial_band.hxx
index 186e716a95..d1f0fc7c65 100644
--- a/src/invert/laplace/impls/serial_band/serial_band.hxx
+++ b/src/invert/laplace/impls/serial_band/serial_band.hxx
@@ -26,8 +26,8 @@
 
 class LaplaceSerialBand;
 
-#ifndef __SERIAL_BAND_H__
-#define __SERIAL_BAND_H__
+#ifndef BOUT_SERIAL_BAND_H
+#define BOUT_SERIAL_BAND_H
 
 #include "bout/build_config.hxx"
 #include "bout/invert_laplace.hxx"
@@ -95,4 +95,4 @@ private:
 
 #endif // BOUT_USE_METRIC_3D
 
-#endif // __SERIAL_BAND_H__
+#endif // BOUT_SERIAL_BAND_H
diff --git a/src/invert/laplace/impls/serial_tri/serial_tri.cxx b/src/invert/laplace/impls/serial_tri/serial_tri.cxx
index e76650c751..f46a0a46e5 100644
--- a/src/invert/laplace/impls/serial_tri/serial_tri.cxx
+++ b/src/invert/laplace/impls/serial_tri/serial_tri.cxx
@@ -91,13 +91,13 @@ FieldPerp LaplaceSerialTri::solve(const FieldPerp& b, const FieldPerp& x0) {
   int inbndry = localmesh->xstart, outbndry = localmesh->xstart;
 
   // If the flags to assign that only one guard cell should be used is set
-  if ((global_flags & INVERT_BOTH_BNDRY_ONE) || (localmesh->xstart < 2)) {
+  if (isGlobalFlagSet(INVERT_BOTH_BNDRY_ONE) || (localmesh->xstart < 2)) {
     inbndry = outbndry = 1;
   }
-  if (inner_boundary_flags & INVERT_BNDRY_ONE) {
+  if (isInnerBoundaryFlagSet(INVERT_BNDRY_ONE)) {
     inbndry = 1;
   }
-  if (outer_boundary_flags & INVERT_BNDRY_ONE) {
+  if (isOuterBoundaryFlagSet(INVERT_BNDRY_ONE)) {
     outbndry = 1;
   }
 
@@ -133,15 +133,15 @@ FieldPerp LaplaceSerialTri::solve(const FieldPerp& b, const FieldPerp& x0) {
   auto bvec = Array<dcomplex>(ncx);
   auto cvec = Array<dcomplex>(ncx);
 
-  BOUT_OMP(parallel for)
+  BOUT_OMP_PERF(parallel for)
   for (int ix = 0; ix < ncx; ix++) {
     /* This for loop will set the bk (initialized by the constructor)
      * bk is the z fourier modes of b in z
      * If the INVERT_SET flag is set (meaning that x0 will be used to set the
      * bounadry values),
      */
-    if (((ix < inbndry) && (inner_boundary_flags & INVERT_SET))
-        || ((ncx - 1 - ix < outbndry) && (outer_boundary_flags & INVERT_SET))) {
+    if (((ix < inbndry) && isInnerBoundaryFlagSet(INVERT_SET))
+        || ((ncx - 1 - ix < outbndry) && (isOuterBoundaryFlagSet(INVERT_SET)))) {
       // Use the values in x0 in the boundary
 
       // x0 is the input
@@ -185,8 +185,7 @@ FieldPerp LaplaceSerialTri::solve(const FieldPerp& b, const FieldPerp& x0) {
                  kz,
                  // wave number (different from kz only if we are taking a part
                  // of the z-domain [and not from 0 to 2*pi])
-                 kz * kwaveFactor, global_flags, inner_boundary_flags,
-                 outer_boundary_flags, &A, &C, &D);
+                 kz * kwaveFactor, &A, &C, &D);
 
     ///////// PERFORM INVERSION /////////
     if (!localmesh->periodicX) {
@@ -208,7 +207,7 @@ FieldPerp LaplaceSerialTri::solve(const FieldPerp& b, const FieldPerp& x0) {
     }
 
     // If the global flag is set to INVERT_KX_ZERO
-    if ((global_flags & INVERT_KX_ZERO) && (kz == 0)) {
+    if (isGlobalFlagSet(INVERT_KX_ZERO) && (kz == 0)) {
       dcomplex offset(0.0);
       for (int ix = localmesh->xstart; ix <= localmesh->xend; ix++) {
         offset += xk1d[ix];
@@ -228,7 +227,7 @@ FieldPerp LaplaceSerialTri::solve(const FieldPerp& b, const FieldPerp& x0) {
   // Done inversion, transform back
   for (int ix = 0; ix < ncx; ix++) {
 
-    if (global_flags & INVERT_ZERO_DC) {
+    if (isGlobalFlagSet(INVERT_ZERO_DC)) {
       xk(ix, 0) = 0.0;
     }
 
diff --git a/src/invert/laplace/impls/serial_tri/serial_tri.hxx b/src/invert/laplace/impls/serial_tri/serial_tri.hxx
index 05fa375de7..5b0419fa27 100644
--- a/src/invert/laplace/impls/serial_tri/serial_tri.hxx
+++ b/src/invert/laplace/impls/serial_tri/serial_tri.hxx
@@ -26,8 +26,8 @@
 
 class LaplaceSerialTri;
 
-#ifndef __SERIAL_TRI_H__
-#define __SERIAL_TRI_H__
+#ifndef BOUT_SERIAL_TRI_H
+#define BOUT_SERIAL_TRI_H
 
 #include <bout/dcomplex.hxx>
 #include <bout/invert_laplace.hxx>
@@ -80,4 +80,4 @@ private:
   Field2D A, C, D;
 };
 
-#endif // __SERIAL_TRI_H__
+#endif // BOUT_SERIAL_TRI_H
diff --git a/src/invert/laplace/impls/spt/spt.cxx b/src/invert/laplace/impls/spt/spt.cxx
index 92959e1194..2e4c844c94 100644
--- a/src/invert/laplace/impls/spt/spt.cxx
+++ b/src/invert/laplace/impls/spt/spt.cxx
@@ -65,10 +65,9 @@ LaplaceSPT::LaplaceSPT(Options* opt, const CELL_LOC loc, Mesh* mesh_in,
     ye = localmesh->LocalNy - 1; // Contains upper boundary
   }
 
-  alldata = new SPT_data[ye - ys + 1];
-  alldata -= ys; // Re-number indices to start at ys
+  alldata.reallocate(ye - ys + 1);
   for (int jy = ys; jy <= ye; jy++) {
-    alldata[jy].comm_tag = SPT_DATA + jy; // Give each one a different tag
+    alldata[jy - ys].comm_tag = SPT_DATA + jy; // Give each one a different tag
   }
 
   // Temporary array for taking FFTs
@@ -76,11 +75,6 @@ LaplaceSPT::LaplaceSPT(Options* opt, const CELL_LOC loc, Mesh* mesh_in,
   dc1d.reallocate(ncz / 2 + 1);
 }
 
-LaplaceSPT::~LaplaceSPT() {
-  alldata += ys; // Return to index from 0
-  delete[] alldata;
-}
-
 FieldPerp LaplaceSPT::solve(const FieldPerp& b) { return solve(b, b); }
 
 FieldPerp LaplaceSPT::solve(const FieldPerp& b, const FieldPerp& x0) {
@@ -90,15 +84,15 @@ FieldPerp LaplaceSPT::solve(const FieldPerp& b, const FieldPerp& x0) {
 
   FieldPerp x{emptyFrom(b)};
 
-  if ((inner_boundary_flags & INVERT_SET) || (outer_boundary_flags & INVERT_SET)) {
+  if (isInnerBoundaryFlagSet(INVERT_SET) || isOuterBoundaryFlagSet(INVERT_SET)) {
     FieldPerp bs = copy(b);
 
     int xbndry = localmesh->xstart;
     // If the flags to assign that only one guard cell should be used is set
-    if ((global_flags & INVERT_BOTH_BNDRY_ONE) || (localmesh->xstart < 2)) {
+    if (isGlobalFlagSet(INVERT_BOTH_BNDRY_ONE) || (localmesh->xstart < 2)) {
       xbndry = 1;
     }
-    if ((inner_boundary_flags & INVERT_SET) && localmesh->firstX()) {
+    if (isInnerBoundaryFlagSetOnFirstX(INVERT_SET)) {
       // Copy x0 inner boundary into bs
       for (int ix = 0; ix < xbndry; ix++) {
         for (int iz = 0; iz < localmesh->LocalNz; iz++) {
@@ -106,7 +100,7 @@ FieldPerp LaplaceSPT::solve(const FieldPerp& b, const FieldPerp& x0) {
         }
       }
     }
-    if ((outer_boundary_flags & INVERT_SET) && localmesh->lastX()) {
+    if (isOuterBoundaryFlagSetOnLastX(INVERT_SET)) {
       // Copy x0 outer boundary into bs
       for (int ix = localmesh->LocalNx - 1; ix >= localmesh->LocalNx - xbndry; ix--) {
         for (int iz = 0; iz < localmesh->LocalNz; iz++) {
@@ -141,29 +135,29 @@ Field3D LaplaceSPT::solve(const Field3D& b) {
 
   for (int jy = ys; jy <= ye; jy++) {
     // And start another one going
-    start(sliceXZ(b, jy), alldata[jy]);
+    start(sliceXZ(b, jy), alldata[jy - ys]);
 
     // Move each calculation along one processor
     for (int jy2 = ys; jy2 < jy; jy2++) {
-      next(alldata[jy2]);
+      next(alldata[jy2 - ys]);
     }
   }
 
   bool running = true;
-  do {
+  while (running) {
     // Move each calculation along until the last one is finished
-    for (int jy = ys; jy <= ye; jy++) {
-      running = next(alldata[jy]) == 0;
+    for (auto& data : alldata) {
+      running = next(data) == 0;
     }
-  } while (running);
+  }
 
   FieldPerp xperp(localmesh);
   xperp.setLocation(location);
   xperp.allocate();
 
   // All calculations finished. Get result
-  for (int jy = ys; jy <= ye; jy++) {
-    finish(alldata[jy], xperp);
+  for (auto& data : alldata) {
+    finish(data, xperp);
     x = xperp;
   }
 
@@ -173,17 +167,17 @@ Field3D LaplaceSPT::solve(const Field3D& b) {
 Field3D LaplaceSPT::solve(const Field3D& b, const Field3D& x0) {
   ASSERT1(localmesh == b.getMesh() && localmesh == x0.getMesh());
 
-  if (((inner_boundary_flags & INVERT_SET) && localmesh->firstX())
-      || ((outer_boundary_flags & INVERT_SET) && localmesh->lastX())) {
+  if ((isInnerBoundaryFlagSetOnFirstX(INVERT_SET))
+      || isOuterBoundaryFlagSetOnLastX(INVERT_SET)) {
     Field3D bs = copy(b);
 
     int xbndry = localmesh->xstart;
     // If the flags to assign that only one guard cell should be used is set
-    if ((global_flags & INVERT_BOTH_BNDRY_ONE) || (localmesh->xstart < 2)) {
+    if (isGlobalFlagSet(INVERT_BOTH_BNDRY_ONE) || (localmesh->xstart < 2)) {
       xbndry = 1;
     }
 
-    if ((inner_boundary_flags & INVERT_SET) && localmesh->firstX()) {
+    if (isInnerBoundaryFlagSetOnFirstX(INVERT_SET)) {
       // Copy x0 inner boundary into bs
       for (int ix = 0; ix < xbndry; ix++) {
         for (int iy = 0; iy < localmesh->LocalNy; iy++) {
@@ -193,7 +187,7 @@ Field3D LaplaceSPT::solve(const Field3D& b, const Field3D& x0) {
         }
       }
     }
-    if ((outer_boundary_flags & INVERT_SET) && localmesh->lastX()) {
+    if (isOuterBoundaryFlagSetOnLastX(INVERT_SET)) {
       // Copy x0 outer boundary into bs
       for (int ix = localmesh->LocalNx - 1; ix >= localmesh->LocalNx - xbndry; ix--) {
         for (int iy = 0; iy < localmesh->LocalNy; iy++) {
@@ -323,15 +317,14 @@ int LaplaceSPT::start(const FieldPerp& b, SPT_data& data) {
   /// Set matrix elements
   for (int kz = 0; kz <= maxmode; kz++) {
     tridagMatrix(&data.avec(kz, 0), &data.bvec(kz, 0), &data.cvec(kz, 0), &data.bk(kz, 0),
-                 data.jy, kz, kz * kwaveFactor, global_flags, inner_boundary_flags,
-                 outer_boundary_flags, &Acoef, &Ccoef, &Dcoef);
+                 data.jy, kz, kz * kwaveFactor, &Acoef, &Ccoef, &Dcoef);
   }
 
   data.proc = 0; //< Starts at processor 0
   data.dir = 1;
 
   if (localmesh->firstX()) {
-    BOUT_OMP(parallel for)
+    BOUT_OMP_PERF(parallel for)
     for (int kz = 0; kz <= maxmode; kz++) {
       dcomplex bet, u0;
       // Start tridiagonal solve
@@ -382,7 +375,7 @@ int LaplaceSPT::next(SPT_data& data) {
     if (localmesh->lastX()) {
       // Last processor, turn-around
 
-      BOUT_OMP(parallel for)
+      BOUT_OMP_PERF(parallel for)
       for (int kz = 0; kz <= maxmode; kz++) {
         dcomplex bet, u0;
         dcomplex gp, up;
@@ -409,7 +402,7 @@ int LaplaceSPT::next(SPT_data& data) {
     } else if (data.dir > 0) {
       // In the middle of X, forward direction
 
-      BOUT_OMP(parallel for)
+      BOUT_OMP_PERF(parallel for)
       for (int kz = 0; kz <= maxmode; kz++) {
         dcomplex bet, u0;
         bet = dcomplex(data.buffer[4 * kz], data.buffer[4 * kz + 1]);
@@ -429,7 +422,7 @@ int LaplaceSPT::next(SPT_data& data) {
     } else if (localmesh->firstX()) {
       // Back to the start
 
-      BOUT_OMP(parallel for)
+      BOUT_OMP_PERF(parallel for)
       for (int kz = 0; kz <= maxmode; kz++) {
         dcomplex gp, up;
         gp = dcomplex(data.buffer[4 * kz], data.buffer[4 * kz + 1]);
@@ -441,7 +434,7 @@ int LaplaceSPT::next(SPT_data& data) {
     } else {
       // Middle of X, back-substitution stage
 
-      BOUT_OMP(parallel for)
+      BOUT_OMP_PERF(parallel for)
       for (int kz = 0; kz <= maxmode; kz++) {
         dcomplex gp = dcomplex(data.buffer[4 * kz], data.buffer[4 * kz + 1]);
         dcomplex up = dcomplex(data.buffer[4 * kz + 2], data.buffer[4 * kz + 3]);
@@ -516,7 +509,7 @@ void LaplaceSPT::finish(SPT_data& data, FieldPerp& x) {
       dc1d[kz] = 0.0;
     }
 
-    if (global_flags & INVERT_ZERO_DC) {
+    if (isGlobalFlagSet(INVERT_ZERO_DC)) {
       dc1d[0] = 0.0;
     }
 
diff --git a/src/invert/laplace/impls/spt/spt.hxx b/src/invert/laplace/impls/spt/spt.hxx
index 27e9c8100c..a9d5b2583f 100644
--- a/src/invert/laplace/impls/spt/spt.hxx
+++ b/src/invert/laplace/impls/spt/spt.hxx
@@ -38,8 +38,8 @@
 
 class LaplaceSPT;
 
-#ifndef __SPT_H__
-#define __SPT_H__
+#ifndef BOUT_SPT_H
+#define BOUT_SPT_H
 
 #include <bout/dcomplex.hxx>
 #include <bout/invert_laplace.hxx>
@@ -69,7 +69,6 @@ class LaplaceSPT : public Laplacian {
 public:
   LaplaceSPT(Options* opt = nullptr, const CELL_LOC = CELL_CENTRE,
              Mesh* mesh_in = nullptr, Solver* solver = nullptr);
-  ~LaplaceSPT();
 
   using Laplacian::setCoefA;
   void setCoefA(const Field2D& val) override {
@@ -106,17 +105,15 @@ public:
   Field3D solve(const Field3D& b, const Field3D& x0) override;
 
 private:
-  enum { SPT_DATA = 1123 }; ///< 'magic' number for SPT MPI messages
+  constexpr static int SPT_DATA = 1123; ///< 'magic' number for SPT MPI messages
 
   Field2D Acoef, Ccoef, Dcoef;
 
   /// Data structure for SPT algorithm
   struct SPT_data {
-    SPT_data() : comm_tag(SPT_DATA) {}
     void allocate(int mm, int nx); // Allocates memory
-    ~SPT_data(){};                 // Free memory
 
-    int jy; ///< Y index
+    int jy = 0; ///< Y index
 
     Matrix<dcomplex> bk; ///< b vector in Fourier space
     Matrix<dcomplex> xk;
@@ -125,19 +122,19 @@ private:
 
     Matrix<dcomplex> avec, bvec, cvec; ///< Diagonal bands of matrix
 
-    int proc; // Which processor has this reached?
-    int dir;  // Which direction is it going?
+    int proc = 0; // Which processor has this reached?
+    int dir = 1;  // Which direction is it going?
 
-    comm_handle recv_handle; // Handle for receives
+    comm_handle recv_handle = nullptr; // Handle for receives
 
-    int comm_tag; // Tag for communication
+    int comm_tag = SPT_DATA; // Tag for communication
 
     Array<BoutReal> buffer;
   };
 
   int ys, ye;         // Range of Y indices
   SPT_data slicedata; // Used to solve for a single FieldPerp
-  SPT_data* alldata;  // Used to solve a Field3D
+  Array<SPT_data> alldata; // Used to solve a Field3D
 
   Array<dcomplex> dc1d; ///< 1D in Z for taking FFTs
 
@@ -159,4 +156,4 @@ namespace {
 RegisterLaplace<LaplaceSPT> registerlaplacespt(LAPLACE_SPT);
 } // namespace
 
-#endif // __SPT_H__
+#endif // BOUT_SPT_H
diff --git a/src/invert/laplace/invert_laplace.cxx b/src/invert/laplace/invert_laplace.cxx
index 505b04cc4f..4032499781 100644
--- a/src/invert/laplace/invert_laplace.cxx
+++ b/src/invert/laplace/invert_laplace.cxx
@@ -424,20 +424,16 @@ void Laplacian::tridagCoefs(int jx, int jy, BoutReal kwave, dcomplex& a, dcomple
 #if BOUT_USE_METRIC_3D
 void Laplacian::tridagMatrix(dcomplex* /*avec*/, dcomplex* /*bvec*/, dcomplex* /*cvec*/,
                              dcomplex* /*bk*/, int /*jy*/, int /*kz*/, BoutReal /*kwave*/,
-                             int /*global_flags*/, int /*inner_boundary_flags*/,
-                             int /*outer_boundary_flags*/, const Field2D* /*a*/,
-                             const Field2D* /*c1coef*/, const Field2D* /*c2coef*/,
-                             const Field2D* /*d*/, bool /*includeguards*/,
-                             bool /*zperiodic*/) {
+                             const Field2D* /*a*/, const Field2D* /*c1coef*/,
+                             const Field2D* /*c2coef*/, const Field2D* /*d*/,
+                             bool /*includeguards*/, bool /*zperiodic*/) {
   throw BoutException("Error: tridagMatrix does not yet work with 3D metric.");
 }
 #else
 void Laplacian::tridagMatrix(dcomplex* avec, dcomplex* bvec, dcomplex* cvec, dcomplex* bk,
-                             int jy, int kz, BoutReal kwave, int global_flags,
-                             int inner_boundary_flags, int outer_boundary_flags,
-                             const Field2D* a, const Field2D* c1coef,
-                             const Field2D* c2coef, const Field2D* d, bool includeguards,
-                             bool zperiodic) {
+                             int jy, int kz, BoutReal kwave, const Field2D* a,
+                             const Field2D* c1coef, const Field2D* c2coef,
+                             const Field2D* d, bool includeguards, bool zperiodic) {
   ASSERT1(a->getLocation() == location);
   ASSERT1(c1coef->getLocation() == location);
   ASSERT1(c2coef->getLocation() == location);
@@ -469,13 +465,13 @@ void Laplacian::tridagMatrix(dcomplex* avec, dcomplex* bvec, dcomplex* cvec, dco
   int inbndry = localmesh->xstart, outbndry = localmesh->xstart;
 
   // If the flags to assign that only one guard cell should be used is set
-  if ((global_flags & INVERT_BOTH_BNDRY_ONE) || (localmesh->xstart < 2)) {
+  if (isGlobalFlagSet(INVERT_BOTH_BNDRY_ONE) || (localmesh->xstart < 2)) {
     inbndry = outbndry = 1;
   }
-  if (inner_boundary_flags & INVERT_BNDRY_ONE) {
+  if (isInnerBoundaryFlagSet(INVERT_BNDRY_ONE)) {
     inbndry = 1;
   }
-  if (outer_boundary_flags & INVERT_BNDRY_ONE) {
+  if (isOuterBoundaryFlagSet(INVERT_BNDRY_ONE)) {
     outbndry = 1;
   }
 
@@ -497,7 +493,7 @@ void Laplacian::tridagMatrix(dcomplex* avec, dcomplex* bvec, dcomplex* cvec, dco
 
       // If no user specified value is set on inner boundary, set the first
       // element in b (in the equation AX=b) to 0
-      if (!(inner_boundary_flags & (INVERT_RHS | INVERT_SET))) {
+      if (!isInnerBoundaryFlagSet(INVERT_RHS | INVERT_SET)) {
         for (int ix = 0; ix < inbndry; ix++) {
           bk[ix] = 0.;
         }
@@ -506,34 +502,35 @@ void Laplacian::tridagMatrix(dcomplex* avec, dcomplex* bvec, dcomplex* cvec, dco
       // DC i.e. kz = 0 (the offset mode)
       if (kz == 0) {
 
-        if (inner_boundary_flags & INVERT_DC_GRAD
-            && (inner_boundary_flags & INVERT_SET || inner_boundary_flags & INVERT_RHS)) {
+        if (isInnerBoundaryFlagSet(INVERT_DC_GRAD)
+            && (isInnerBoundaryFlagSet(INVERT_SET)
+                || isInnerBoundaryFlagSet(INVERT_RHS))) {
           // Zero gradient at inner boundary
           for (int ix = 0; ix < inbndry; ix++) {
             avec[ix] = 0.;
             bvec[ix] = -1. / sqrt(coords->g_11(ix, jy)) / coords->dx(ix, jy);
             cvec[ix] = 1. / sqrt(coords->g_11(ix, jy)) / coords->dx(ix, jy);
           }
-        } else if (inner_boundary_flags & INVERT_DC_GRAD) {
+        } else if (isInnerBoundaryFlagSet(INVERT_DC_GRAD)) {
           // Zero gradient at inner boundary
           for (int ix = 0; ix < inbndry; ix++) {
             avec[ix] = 0.;
             bvec[ix] = -1.;
             cvec[ix] = 1.;
           }
-        } else if (inner_boundary_flags & INVERT_DC_GRADPAR) {
+        } else if (isInnerBoundaryFlagSet(INVERT_DC_GRADPAR)) {
           for (int ix = 0; ix < inbndry; ix++) {
             avec[ix] = 0.0;
             bvec[ix] = 1.0 / sqrt(coords->g_22(ix, jy));
             cvec[ix] = -1.0 / sqrt(coords->g_22(ix + 1, jy));
           }
-        } else if (inner_boundary_flags & INVERT_DC_GRADPARINV) {
+        } else if (isInnerBoundaryFlagSet(INVERT_DC_GRADPARINV)) {
           for (int ix = 0; ix < inbndry; ix++) {
             avec[ix] = 0.0;
             bvec[ix] = sqrt(coords->g_22(ix, jy));
             cvec[ix] = -sqrt(coords->g_22(ix + 1, jy));
           }
-        } else if (inner_boundary_flags & INVERT_DC_LAP) {
+        } else if (isInnerBoundaryFlagSet(INVERT_DC_LAP)) {
           // Decaying boundary conditions
           BoutReal k = 0.0;
           if (a != nullptr) {
@@ -548,7 +545,7 @@ void Laplacian::tridagMatrix(dcomplex* avec, dcomplex* bvec, dcomplex* cvec, dco
             bvec[ix] = 1.;
             cvec[ix] = -exp(-k * coords->dx(ix, jy) / sqrt(coords->g11(ix, jy)));
           }
-        } else if (inner_boundary_flags & INVERT_IN_CYLINDER) {
+        } else if (isInnerBoundaryFlagSet(INVERT_IN_CYLINDER)) {
           // Condition for inner radial boundary for cylindrical coordinates
           /* Explanation:
            * The discrete fourier transform is defined as
@@ -602,8 +599,9 @@ void Laplacian::tridagMatrix(dcomplex* avec, dcomplex* bvec, dcomplex* cvec, dco
       // AC i.e. kz =/= 0 (all other modes than the offset mode)
       else {
 
-        if (inner_boundary_flags & INVERT_AC_GRAD
-            && (inner_boundary_flags & INVERT_SET || inner_boundary_flags & INVERT_RHS)) {
+        if (isInnerBoundaryFlagSet(INVERT_AC_GRAD)
+            && (isInnerBoundaryFlagSet(INVERT_SET)
+                || isInnerBoundaryFlagSet(INVERT_RHS))) {
           // Zero gradient at inner boundary
           for (int ix = 0; ix < inbndry; ix++) {
             avec[ix] = dcomplex(0., 0.);
@@ -611,14 +609,14 @@ void Laplacian::tridagMatrix(dcomplex* avec, dcomplex* bvec, dcomplex* cvec, dco
                 dcomplex(-1., 0.) / sqrt(coords->g_11(ix, jy)) / coords->dx(ix, jy);
             cvec[ix] = dcomplex(1., 0.) / sqrt(coords->g_11(ix, jy)) / coords->dx(ix, jy);
           }
-        } else if (inner_boundary_flags & INVERT_AC_GRAD) {
+        } else if (isInnerBoundaryFlagSet(INVERT_AC_GRAD)) {
           // Zero gradient at inner boundary
           for (int ix = 0; ix < inbndry; ix++) {
             avec[ix] = dcomplex(0., 0.);
             bvec[ix] = dcomplex(-1., 0.);
             cvec[ix] = dcomplex(1., 0.);
           }
-        } else if (inner_boundary_flags & INVERT_AC_LAP) {
+        } else if (isInnerBoundaryFlagSet(INVERT_AC_LAP)) {
           // Use decaying zero-Laplacian solution in the boundary
           for (int ix = 0; ix < inbndry; ix++) {
             avec[ix] = 0.0;
@@ -626,9 +624,9 @@ void Laplacian::tridagMatrix(dcomplex* avec, dcomplex* bvec, dcomplex* cvec, dco
             cvec[ix] = -exp(-1.0 * sqrt(coords->g33(ix, jy) / coords->g11(ix, jy)) * kwave
                             * coords->dx(ix, jy));
           }
-        } else if (inner_boundary_flags & INVERT_IN_CYLINDER) {
+        } else if (isInnerBoundaryFlagSet(INVERT_IN_CYLINDER)) {
           // Condition for inner radial boundary for cylindrical coordinates
-          // Explanation under "if (inner_boundary_flags & INVERT_IN_CYLINDER)"
+          // Explanation under "if (isInnerBoundaryFlagSet(INVERT_IN_CYLINDER))"
           for (int ix = 0; ix < inbndry; ix++) {
             avec[ix] = 0.;
             bvec[ix] = 1.;
@@ -655,7 +653,7 @@ void Laplacian::tridagMatrix(dcomplex* avec, dcomplex* bvec, dcomplex* cvec, dco
 
       // If no user specified value is set on outer boundary, set the last
       // element in b (in the equation AX=b) to 0
-      if (!(outer_boundary_flags & (INVERT_RHS | INVERT_SET))) {
+      if (!isOuterBoundaryFlagSet(INVERT_RHS | INVERT_SET)) {
         for (int ix = 0; ix < outbndry; ix++) {
           bk[ncx - ix] = 0.;
         }
@@ -664,36 +662,37 @@ void Laplacian::tridagMatrix(dcomplex* avec, dcomplex* bvec, dcomplex* cvec, dco
       // DC i.e. kz = 0 (the offset mode)
       if (kz == 0) {
 
-        if (outer_boundary_flags & INVERT_DC_GRAD
-            && (outer_boundary_flags & INVERT_SET || outer_boundary_flags & INVERT_RHS)) {
+        if (isOuterBoundaryFlagSet(INVERT_DC_GRAD)
+            && (isOuterBoundaryFlagSet(INVERT_SET)
+                || isOuterBoundaryFlagSet(INVERT_RHS))) {
           // Zero gradient at outer boundary
           for (int ix = 0; ix < outbndry; ix++) {
-            avec[ncx - ix] = dcomplex(-1., 0.) / sqrt(coords->g_11(ncx - ix, jy))
-                             / coords->dx(ncx - ix, jy);
-            bvec[ncx - ix] = dcomplex(1., 0.) / sqrt(coords->g_11(ncx - ix, jy))
-                             / coords->dx(ncx - ix, jy);
+            avec[ncx - ix] = dcomplex(-1., 0.) / sqrt(coords->g_11(xe - ix, jy))
+                             / coords->dx(xe - ix, jy);
+            bvec[ncx - ix] = dcomplex(1., 0.) / sqrt(coords->g_11(xe - ix, jy))
+                             / coords->dx(xe - ix, jy);
             cvec[ncx - ix] = dcomplex(0., 0.);
           }
-        } else if (outer_boundary_flags & INVERT_DC_GRAD) {
+        } else if (isOuterBoundaryFlagSet(INVERT_DC_GRAD)) {
           // Zero gradient at outer boundary
           for (int ix = 0; ix < outbndry; ix++) {
             avec[ncx - ix] = dcomplex(1., 0.);
             bvec[ncx - ix] = dcomplex(-1., 0.);
             cvec[ncx - ix] = dcomplex(0., 0.);
           }
-        } else if (inner_boundary_flags & INVERT_DC_GRADPAR) {
-          for (int ix = 0; ix < inbndry; ix++) {
-            avec[ncx - ix] = 1.0 / sqrt(coords->g_22(ncx - ix + 1, jy));
-            bvec[ncx - ix] = -1.0 / sqrt(coords->g_22(ncx - ix, jy));
+        } else if (isOuterBoundaryFlagSet(INVERT_DC_GRADPAR)) {
+          for (int ix = 0; ix < outbndry; ix++) {
+            avec[ncx - ix] = 1.0 / sqrt(coords->g_22(xe - ix - 1, jy));
+            bvec[ncx - ix] = -1.0 / sqrt(coords->g_22(xe - ix, jy));
             cvec[ncx - ix] = 0.0;
           }
-        } else if (inner_boundary_flags & INVERT_DC_GRADPARINV) {
-          for (int ix = 0; ix < inbndry; ix++) {
-            avec[ncx - ix] = sqrt(coords->g_22(ncx - ix - 1, jy));
-            bvec[ncx - ix] = -sqrt(coords->g_22(ncx - ix, jy));
+        } else if (isOuterBoundaryFlagSet(INVERT_DC_GRADPARINV)) {
+          for (int ix = 0; ix < outbndry; ix++) {
+            avec[ncx - ix] = sqrt(coords->g_22(xe - ix - 1, jy));
+            bvec[ncx - ix] = -sqrt(coords->g_22(xe - ix, jy));
             cvec[ncx - ix] = 0.0;
           }
-        } else if (inner_boundary_flags & INVERT_DC_LAP) {
+        } else if (isOuterBoundaryFlagSet(INVERT_DC_LAP)) {
           // Decaying boundary conditions
           BoutReal k = 0.0;
           if (a != nullptr) {
@@ -707,7 +706,7 @@ void Laplacian::tridagMatrix(dcomplex* avec, dcomplex* bvec, dcomplex* cvec, dco
             cvec[ncx - ix] = 0.;
             bvec[ncx - ix] = 1.;
             avec[ncx - ix] =
-                -exp(-k * coords->dx(ncx - ix, jy) / sqrt(coords->g11(ncx - ix, jy)));
+                -exp(-k * coords->dx(xe - ix, jy) / sqrt(coords->g11(xe - ix, jy)));
           }
         } else {
           // Order 2 dirichlet BC (boundary half between points)
@@ -722,24 +721,25 @@ void Laplacian::tridagMatrix(dcomplex* avec, dcomplex* bvec, dcomplex* cvec, dco
       // AC i.e. kz =/= 0 (all other modes than the offset mode)
       else {
 
-        if (outer_boundary_flags & INVERT_AC_GRAD
-            && (outer_boundary_flags & INVERT_SET || outer_boundary_flags & INVERT_RHS)) {
+        if (isOuterBoundaryFlagSet(INVERT_AC_GRAD)
+            && (isOuterBoundaryFlagSet(INVERT_SET)
+                || isOuterBoundaryFlagSet(INVERT_RHS))) {
           // Zero gradient at outer boundary
           for (int ix = 0; ix < outbndry; ix++) {
-            avec[ncx - ix] = dcomplex(-1., 0.) / sqrt(coords->g_11(ncx - ix, jy))
-                             / coords->dx(ncx - ix, jy);
-            bvec[ncx - ix] = dcomplex(1., 0.) / sqrt(coords->g_11(ncx - ix, jy))
-                             / coords->dx(ncx - ix, jy);
+            avec[ncx - ix] = dcomplex(-1., 0.) / sqrt(coords->g_11(xe - ix, jy))
+                             / coords->dx(xe - ix, jy);
+            bvec[ncx - ix] = dcomplex(1., 0.) / sqrt(coords->g_11(xe - ix, jy))
+                             / coords->dx(xe - ix, jy);
             cvec[ncx - ix] = dcomplex(0., 0.);
           }
-        } else if (outer_boundary_flags & INVERT_AC_GRAD) {
+        } else if (isOuterBoundaryFlagSet(INVERT_AC_GRAD)) {
           // Zero gradient at outer boundary
           for (int ix = 0; ix < outbndry; ix++) {
             avec[ncx - ix] = dcomplex(1., 0.);
             bvec[ncx - ix] = dcomplex(-1., 0.);
             cvec[ncx - ix] = dcomplex(0., 0.);
           }
-        } else if (outer_boundary_flags & INVERT_AC_LAP) {
+        } else if (isOuterBoundaryFlagSet(INVERT_AC_LAP)) {
           // Use decaying zero-Laplacian solution in the boundary
           for (int ix = 0; ix < outbndry; ix++) {
             avec[ncx - ix] =
@@ -795,6 +795,13 @@ void Laplacian::LaplacianMonitor::outputVars(Options& output_options,
   laplacian->outputVars(output_options, time_dimension);
 }
 
+bool Laplacian::isInnerBoundaryFlagSetOnFirstX(int flag) const {
+  return isInnerBoundaryFlagSet(flag) and localmesh->firstX();
+}
+bool Laplacian::isOuterBoundaryFlagSetOnLastX(int flag) const {
+  return isOuterBoundaryFlagSet(flag) and localmesh->lastX();
+}
+
 /**********************************************************************************
  *                              LEGACY INTERFACE
  *
diff --git a/src/invert/laplacexz/impls/petsc/laplacexz-petsc.hxx b/src/invert/laplacexz/impls/petsc/laplacexz-petsc.hxx
index 47967390f9..7e15be5a34 100644
--- a/src/invert/laplacexz/impls/petsc/laplacexz-petsc.hxx
+++ b/src/invert/laplacexz/impls/petsc/laplacexz-petsc.hxx
@@ -6,8 +6,8 @@
 
 class LaplaceXZpetsc;
 
-#ifndef __LAPLACEXZ_PETSC_H__
-#define __LAPLACEXZ_PETSC_H__
+#ifndef BOUT_LAPLACEXZ_PETSC_H
+#define BOUT_LAPLACEXZ_PETSC_H
 
 #include "bout/build_config.hxx"
 #include "bout/invert/laplacexz.hxx"
@@ -73,4 +73,4 @@ private:
 };
 
 #endif // BOUT_HAS_PETSC
-#endif // __LAPLACEXZ_PETSC_H__
+#endif // BOUT_LAPLACEXZ_PETSC_H
diff --git a/src/invert/parderiv/impls/cyclic/cyclic.hxx b/src/invert/parderiv/impls/cyclic/cyclic.hxx
index 0c581adc52..6493a3b945 100644
--- a/src/invert/parderiv/impls/cyclic/cyclic.hxx
+++ b/src/invert/parderiv/impls/cyclic/cyclic.hxx
@@ -39,8 +39,8 @@
  *
  ************************************************************************/
 
-#ifndef __INV_PAR_CR_H__
-#define __INV_PAR_CR_H__
+#ifndef BOUT_INV_PAR_CR_H
+#define BOUT_INV_PAR_CR_H
 
 #include "bout/build_config.hxx"
 #include "bout/invert_parderiv.hxx"
@@ -110,4 +110,4 @@ RegisterInvertPar<InvertParCR> registerinvertparcyclic{PARDERIVCYCLIC};
 
 #endif // BOUT_USE_METRIC_3D
 
-#endif // __INV_PAR_CR_H__
+#endif // BOUT_INV_PAR_CR_H
diff --git a/src/mesh/boundary_factory.cxx b/src/mesh/boundary_factory.cxx
index 5f5978f132..00282566a9 100644
--- a/src/mesh/boundary_factory.cxx
+++ b/src/mesh/boundary_factory.cxx
@@ -1,3 +1,5 @@
+#include "bout/parallel_boundary_op.hxx"
+#include "bout/parallel_boundary_region.hxx"
 #include <bout/boundary_factory.hxx>
 #include <bout/boundary_standard.hxx>
 #include <bout/globals.hxx>
@@ -41,10 +43,12 @@ BoundaryFactory::BoundaryFactory() {
   addMod(new BoundaryFromFieldAligned(), "fromFieldAligned");
 
   // Parallel boundaries
-  add(new BoundaryOpPar_dirichlet(), "parallel_dirichlet");
-  add(new BoundaryOpPar_dirichlet_O3(), "parallel_dirichlet_O3");
-  add(new BoundaryOpPar_dirichlet_interp(), "parallel_dirichlet_interp");
-  add(new BoundaryOpPar_neumann(), "parallel_neumann");
+  add(new BoundaryOpPar_dirichlet_o1(), "parallel_dirichlet_o1");
+  add(new BoundaryOpPar_dirichlet_o2(), "parallel_dirichlet_o2");
+  add(new BoundaryOpPar_dirichlet_o3(), "parallel_dirichlet_o3");
+  add(new BoundaryOpPar_neumann_o1(), "parallel_neumann_o1");
+  add(new BoundaryOpPar_neumann_o2(), "parallel_neumann_o2");
+  add(new BoundaryOpPar_neumann_o3(), "parallel_neumann_o3");
 }
 
 BoundaryFactory::~BoundaryFactory() {
diff --git a/src/mesh/coordinates.cxx b/src/mesh/coordinates.cxx
index 5ec0bb79e1..4e515449ca 100644
--- a/src/mesh/coordinates.cxx
+++ b/src/mesh/coordinates.cxx
@@ -925,7 +925,7 @@ void Coordinates::outputVars(Options& output_options) {
 }
 
 const Field2D& Coordinates::zlength() const {
-  BOUT_OMP(critical)
+  BOUT_OMP_SAFE(critical)
   if (not zlength_cache) {
     zlength_cache = std::make_unique<Field2D>(0., localmesh);
 
@@ -1502,7 +1502,7 @@ Field3D Coordinates::DDY(const Field3D& f, CELL_LOC outloc, const std::string& m
   if (!f.hasParallelSlices() and !transform->canToFromFieldAligned()) {
     Field3D f_parallel = f;
     transform->calcParallelSlices(f_parallel);
-    f_parallel.applyParallelBoundary("parallel_neumann");
+    f_parallel.applyParallelBoundary("parallel_neumann_o2");
     return bout::derivatives::index::DDY(f_parallel, outloc, method, region);
   }
 #endif
@@ -1908,7 +1908,7 @@ Coordinates::Grad2_par2_DDY_invSg(CELL_LOC outloc, const std::string& method) co
 
   // Communicate to get parallel slices
   localmesh->communicate(*invSgCache);
-  invSgCache->applyParallelBoundary("parallel_neumann");
+  invSgCache->applyParallelBoundary("parallel_neumann_o2");
 
   // cache
   auto ptr = std::make_unique<FieldMetric>();
diff --git a/src/mesh/difops.cxx b/src/mesh/difops.cxx
index f252abe0ea..2e25dfeedb 100644
--- a/src/mesh/difops.cxx
+++ b/src/mesh/difops.cxx
@@ -774,7 +774,7 @@ Field3D bracket(const Field3D& f, const Field2D& g, BRACKET_METHOD method,
   case BRACKET_ARAKAWA_OLD: {
 #if not(BOUT_USE_METRIC_3D)
     const int ncz = mesh->LocalNz;
-    BOUT_OMP(parallel for)
+    BOUT_OMP_PERF(parallel for)
     for (int jx = mesh->xstart; jx <= mesh->xend; jx++) {
       for (int jy = mesh->ystart; jy <= mesh->yend; jy++) {
         const BoutReal partialFactor = 1.0 / (12 * metric->dz(jx, jy));
@@ -1100,7 +1100,7 @@ Field3D bracket(const Field3D& f, const Field3D& g, BRACKET_METHOD method,
     Field3D f_temp = f;
     Field3D g_temp = g;
 
-    BOUT_OMP(parallel for)
+    BOUT_OMP_PERF(parallel for)
     for (int jx = mesh->xstart; jx <= mesh->xend; jx++) {
       for (int jy = mesh->ystart; jy <= mesh->yend; jy++) {
 #if not(BOUT_USE_METRIC_3D)
diff --git a/src/mesh/fv_ops.cxx b/src/mesh/fv_ops.cxx
index 0a5d5f9624..cd5b924e9e 100644
--- a/src/mesh/fv_ops.cxx
+++ b/src/mesh/fv_ops.cxx
@@ -22,7 +22,7 @@ Slices<T> makeslices(bool use_slices, const T& field) {
 
 namespace FV {
 
-// Div ( a Grad_perp(f) ) -- ∇⊥ ( a ⋅ ∇⊥ f) --  Vorticity
+// Div ( a Grad_perp(f) ) -- ∇ ⋅ ( a ∇⊥ f) --  Vorticity
 Field3D Div_a_Grad_perp(const Field3D& a, const Field3D& f) {
   ASSERT2(a.getLocation() == f.getLocation());
 
diff --git a/src/mesh/impls/bout/boutmesh.cxx b/src/mesh/impls/bout/boutmesh.cxx
index 04824eeb7e..115d9f38a0 100644
--- a/src/mesh/impls/bout/boutmesh.cxx
+++ b/src/mesh/impls/bout/boutmesh.cxx
@@ -35,6 +35,7 @@
 
 #include "boutmesh.hxx"
 
+#include <bout/boundary_region.hxx>
 #include <bout/boutcomm.hxx>
 #include <bout/boutexception.hxx>
 #include <bout/constants.hxx>
@@ -44,6 +45,7 @@
 #include <bout/msg_stack.hxx>
 #include <bout/options.hxx>
 #include <bout/output.hxx>
+#include <bout/parallel_boundary_region.hxx>
 #include <bout/sys/timer.hxx>
 #include <bout/utils.hxx>
 
@@ -80,9 +82,6 @@ BoutMesh::~BoutMesh() {
   for (const auto& bndry : boundary) {
     delete bndry;
   }
-  for (const auto& bndry : par_boundary) {
-    delete bndry;
-  }
 
   if (comm_x != MPI_COMM_NULL) {
     MPI_Comm_free(&comm_x);
@@ -3037,11 +3036,36 @@ RangeIterator BoutMesh::iterateBndryUpperY() const {
 
 std::vector<BoundaryRegion*> BoutMesh::getBoundaries() { return boundary; }
 
-std::vector<BoundaryRegionPar*> BoutMesh::getBoundariesPar() { return par_boundary; }
+std::vector<std::shared_ptr<BoundaryRegionPar>>
+BoutMesh::getBoundariesPar(BoundaryParType type) {
+  return par_boundary[static_cast<int>(type)];
+}
 
-void BoutMesh::addBoundaryPar(BoundaryRegionPar* bndry) {
+void BoutMesh::addBoundaryPar(std::shared_ptr<BoundaryRegionPar> bndry,
+                              BoundaryParType type) {
   output_info << "Adding new parallel boundary: " << bndry->label << endl;
-  par_boundary.push_back(bndry);
+  switch (type) {
+  case BoundaryParType::xin_fwd:
+    par_boundary[static_cast<int>(BoundaryParType::xin)].push_back(bndry);
+    par_boundary[static_cast<int>(BoundaryParType::fwd)].push_back(bndry);
+    break;
+  case BoundaryParType::xin_bwd:
+    par_boundary[static_cast<int>(BoundaryParType::xin)].push_back(bndry);
+    par_boundary[static_cast<int>(BoundaryParType::bwd)].push_back(bndry);
+    break;
+  case BoundaryParType::xout_fwd:
+    par_boundary[static_cast<int>(BoundaryParType::xout)].push_back(bndry);
+    par_boundary[static_cast<int>(BoundaryParType::fwd)].push_back(bndry);
+    break;
+  case BoundaryParType::xout_bwd:
+    par_boundary[static_cast<int>(BoundaryParType::xout)].push_back(bndry);
+    par_boundary[static_cast<int>(BoundaryParType::bwd)].push_back(bndry);
+    break;
+  default:
+    throw BoutException("Unexpected type of boundary {}", toString(type));
+  }
+  par_boundary[static_cast<int>(type)].push_back(bndry);
+  par_boundary[static_cast<int>(BoundaryParType::all)].push_back(bndry);
 }
 
 Field3D BoutMesh::smoothSeparatrix(const Field3D& f) {
diff --git a/src/mesh/impls/bout/boutmesh.hxx b/src/mesh/impls/bout/boutmesh.hxx
index 20bf1d7d46..cc674d401a 100644
--- a/src/mesh/impls/bout/boutmesh.hxx
+++ b/src/mesh/impls/bout/boutmesh.hxx
@@ -1,6 +1,6 @@
 
-#ifndef __BOUTMESH_H__
-#define __BOUTMESH_H__
+#ifndef BOUT_BOUTMESH_H
+#define BOUT_BOUTMESH_H
 
 #include "mpi.h"
 
@@ -158,8 +158,10 @@ public:
 
   // Boundary regions
   std::vector<BoundaryRegion*> getBoundaries() override;
-  std::vector<BoundaryRegionPar*> getBoundariesPar() override;
-  void addBoundaryPar(BoundaryRegionPar* bndry) override;
+  std::vector<std::shared_ptr<BoundaryRegionPar>>
+  getBoundariesPar(BoundaryParType type) override;
+  void addBoundaryPar(std::shared_ptr<BoundaryRegionPar> bndry,
+                      BoundaryParType type) override;
   std::set<std::string> getPossibleBoundaries() const override;
 
   Field3D smoothSeparatrix(const Field3D& f) override;
@@ -393,8 +395,10 @@ protected:
   void addBoundaryRegions();
 
 private:
-  std::vector<BoundaryRegion*> boundary;        // Vector of boundary regions
-  std::vector<BoundaryRegionPar*> par_boundary; // Vector of parallel boundary regions
+  std::vector<BoundaryRegion*> boundary; // Vector of boundary regions
+  std::array<std::vector<std::shared_ptr<BoundaryRegionPar>>,
+             static_cast<int>(BoundaryParType::SIZE)>
+      par_boundary; // Parallel boundary regions: one vector per BoundaryParType value
 
   //////////////////////////////////////////////////
   // Communications
@@ -485,4 +489,4 @@ CheckMeshResult checkBoutMeshYDecomposition(int num_y_processors, int ny,
                                             int ny_inner);
 } // namespace bout
 
-#endif // __BOUTMESH_H__
+#endif // BOUT_BOUTMESH_H
diff --git a/src/mesh/index_derivs.cxx b/src/mesh/index_derivs.cxx
index 9cccd6f7d7..ebecb96700 100644
--- a/src/mesh/index_derivs.cxx
+++ b/src/mesh/index_derivs.cxx
@@ -445,7 +445,7 @@ class FFTDerivativeType {
     }
     const int kmax = ncz / 2 - kfilter; // Up to and including this wavenumber index
 
-    BOUT_OMP(parallel)
+    BOUT_OMP_PERF(parallel)
     {
       Array<dcomplex> cv(ncz / 2 + 1);
       const BoutReal kwaveFac = TWOPI / ncz;
@@ -502,7 +502,7 @@ class FFT2ndDerivativeType {
     const int ncz = theMesh->getNpoints(direction);
     const int kmax = ncz / 2;
 
-    BOUT_OMP(parallel)
+    BOUT_OMP_PERF(parallel)
     {
       Array<dcomplex> cv(ncz / 2 + 1);
       const BoutReal kwaveFac = TWOPI / ncz;
diff --git a/src/mesh/mesh.cxx b/src/mesh/mesh.cxx
index 0f6315a987..870f3413cd 100644
--- a/src/mesh/mesh.cxx
+++ b/src/mesh/mesh.cxx
@@ -801,12 +801,12 @@ std::optional<size_t> Mesh::getCommonRegion(std::optional<size_t> lhs,
    */
   const size_t pos = (high * (high - 1)) / 2 + low;
   if (region3Dintersect.size() <= pos) {
-    BOUT_OMP(critical(mesh_intersection_realloc))
+    BOUT_OMP_SAFE(critical(mesh_intersection_realloc))
     // By default this function does not need the mutex, however, if we are
     // going to allocate global memory, we need to use a mutex.
     // Now that we have the mutex, we need to check again whether a
     // different thread was faster and already allocated.
-    // BOUT_OMP(single) would work in most cases, but it would fail if the
+    // BOUT_OMP_SAFE(single) would work in most cases, but it would fail if the
     // function is called in parallel with different arguments. While BOUT++
     // is not currently doing it, other openmp parallised projects might be
     // calling BOUT++ in this way.
@@ -821,7 +821,7 @@ std::optional<size_t> Mesh::getCommonRegion(std::optional<size_t> lhs,
     return region3Dintersect[pos];
   }
   {
-    BOUT_OMP(critical(mesh_intersection))
+    BOUT_OMP_SAFE(critical(mesh_intersection))
     // See comment above why we need to check again in case of OpenMP
 #if BOUT_USE_OPENMP
     if (!region3Dintersect[pos].has_value())
diff --git a/src/mesh/parallel/fci.cxx b/src/mesh/parallel/fci.cxx
index 23b2b91eab..cb8c19bbd7 100644
--- a/src/mesh/parallel/fci.cxx
+++ b/src/mesh/parallel/fci.cxx
@@ -47,9 +47,9 @@
 
 #include <string>
 
-FCIMap::FCIMap(Mesh& mesh, const Coordinates::FieldMetric& dy, Options& options,
-               int offset_, BoundaryRegionPar* inner_boundary,
-               BoundaryRegionPar* outer_boundary, bool zperiodic)
+FCIMap::FCIMap(Mesh& mesh, const Coordinates::FieldMetric& UNUSED(dy), Options& options,
+               int offset_, const std::shared_ptr<BoundaryRegionPar>& inner_boundary,
+               const std::shared_ptr<BoundaryRegionPar>& outer_boundary, bool zperiodic)
     : map_mesh(mesh), offset(offset_),
       region_no_boundary(map_mesh.getRegion("RGN_NOBNDRY")),
       corner_boundary_mask(map_mesh) {
@@ -222,13 +222,16 @@ FCIMap::FCIMap(Mesh& mesh, const Coordinates::FieldMetric& dy, Options& options,
     const BoutReal dx = (dZ_dz * dR - dR_dz * dZ) / det;
     const BoutReal dz = (dR_dx * dZ - dZ_dx * dR) / det;
 
-    // Negative xt_prime means we've hit the inner boundary, otherwise
-    // the outer boundary
-    auto* boundary = (xt_prime[i] < map_mesh.xstart) ? inner_boundary : outer_boundary;
+    // Negative xt_prime means we've hit the inner boundary, otherwise the
+    // outer boundary. However, if any of the surrounding points are negative,
+    // that also means inner. So to differentiate between inner and outer we
+    // need at least 2 points in the domain.
+    ASSERT2(map_mesh.xend - map_mesh.xstart >= 2);
+    auto boundary = (xt_prime[i] < map_mesh.xstart) ? inner_boundary : outer_boundary;
     boundary->add_point(x, y, z, x + dx, y + 0.5 * offset,
-                        z + dz,      // Intersection point in local index space
-                        0.5 * dy[i], // Distance to intersection
-                        PI           // Right-angle intersection
+                        z + dz, // Intersection point in local index space
+                        0.5,    // Distance to intersection
+                        1       // Assume by default a point exists in the other direction
     );
   }
   region_no_boundary = region_no_boundary.mask(to_remove);
diff --git a/src/mesh/parallel/fci.hxx b/src/mesh/parallel/fci.hxx
index a749c084cc..3ec3321a6a 100644
--- a/src/mesh/parallel/fci.hxx
+++ b/src/mesh/parallel/fci.hxx
@@ -23,8 +23,8 @@
  *
  **************************************************************************/
 
-#ifndef __FCITRANSFORM_H__
-#define __FCITRANSFORM_H__
+#ifndef BOUT_FCITRANSFORM_H
+#define BOUT_FCITRANSFORM_H
 
 #include <bout/interpolation_xz.hxx>
 #include <bout/mask.hxx>
@@ -44,8 +44,8 @@ class FCIMap {
 public:
   FCIMap() = delete;
   FCIMap(Mesh& mesh, const Coordinates::FieldMetric& dy, Options& options, int offset,
-         BoundaryRegionPar* inner_boundary, BoundaryRegionPar* outer_boundary,
-         bool zperiodic);
+         const std::shared_ptr<BoundaryRegionPar>& inner_boundary,
+         const std::shared_ptr<BoundaryRegionPar>& outer_boundary, bool zperiodic);
 
   // The mesh this map was created on
   Mesh& map_mesh;
@@ -79,19 +79,19 @@ public:
     FCITransform::checkInputGrid();
 
     auto forward_boundary_xin =
-        new BoundaryRegionPar("FCI_forward", BNDRY_PAR_FWD_XIN, +1, &mesh);
-    auto backward_boundary_xin =
-        new BoundaryRegionPar("FCI_backward", BNDRY_PAR_BKWD_XIN, -1, &mesh);
+        std::make_shared<BoundaryRegionPar>("FCI_forward", BNDRY_PAR_FWD_XIN, +1, &mesh);
+    auto backward_boundary_xin = std::make_shared<BoundaryRegionPar>(
+        "FCI_backward", BNDRY_PAR_BKWD_XIN, -1, &mesh);
     auto forward_boundary_xout =
-        new BoundaryRegionPar("FCI_forward", BNDRY_PAR_FWD_XOUT, +1, &mesh);
-    auto backward_boundary_xout =
-        new BoundaryRegionPar("FCI_backward", BNDRY_PAR_BKWD_XOUT, -1, &mesh);
+        std::make_shared<BoundaryRegionPar>("FCI_forward", BNDRY_PAR_FWD_XOUT, +1, &mesh);
+    auto backward_boundary_xout = std::make_shared<BoundaryRegionPar>(
+        "FCI_backward", BNDRY_PAR_BKWD_XOUT, -1, &mesh);
 
     // Add the boundary region to the mesh's vector of parallel boundaries
-    mesh.addBoundaryPar(forward_boundary_xin);
-    mesh.addBoundaryPar(backward_boundary_xin);
-    mesh.addBoundaryPar(forward_boundary_xout);
-    mesh.addBoundaryPar(backward_boundary_xout);
+    mesh.addBoundaryPar(forward_boundary_xin, BoundaryParType::xin_fwd);
+    mesh.addBoundaryPar(backward_boundary_xin, BoundaryParType::xin_bwd);
+    mesh.addBoundaryPar(forward_boundary_xout, BoundaryParType::xout_fwd);
+    mesh.addBoundaryPar(backward_boundary_xout, BoundaryParType::xout_bwd);
 
     field_line_maps.reserve(mesh.ystart * 2);
     for (int offset = 1; offset < mesh.ystart + 1; ++offset) {
@@ -100,6 +100,22 @@ public:
       field_line_maps.emplace_back(mesh, dy, options, -offset, backward_boundary_xin,
                                    backward_boundary_xout, zperiodic);
     }
+    ASSERT0(mesh.ystart == 1);
+    std::shared_ptr<BoundaryRegionPar> bndries[]{
+        forward_boundary_xin, forward_boundary_xout, backward_boundary_xin,
+        backward_boundary_xout};
+    for (auto& bndry : bndries) {
+      for (const auto& bndry2 : bndries) {
+        if (bndry->dir == bndry2->dir) {
+          continue;
+        }
+        for (bndry->first(); !bndry->isDone(); bndry->next()) {
+          if (bndry2->contains(*bndry)) {
+            bndry->setValid(0);
+          }
+        }
+      }
+    }
   }
 
   void calcParallelSlices(Field3D& f) override;
@@ -142,4 +158,4 @@ private:
   std::vector<FCIMap> field_line_maps;
 };
 
-#endif // __FCITRANSFORM_H__
+#endif // BOUT_FCITRANSFORM_H
diff --git a/src/mesh/parallel/shiftedmetric.cxx b/src/mesh/parallel/shiftedmetric.cxx
index 84084d9cbb..382052047d 100644
--- a/src/mesh/parallel/shiftedmetric.cxx
+++ b/src/mesh/parallel/shiftedmetric.cxx
@@ -6,7 +6,9 @@
  *
  */
 
+#include "bout/parallel_boundary_region.hxx"
 #include "bout/paralleltransform.hxx"
+#include <bout/boundary_region.hxx>
 #include <bout/constants.hxx>
 #include <bout/fft.hxx>
 #include <bout/mesh.hxx>
diff --git a/src/mesh/parallel/shiftedmetricinterp.cxx b/src/mesh/parallel/shiftedmetricinterp.cxx
index 214f7ded76..7f3637e79c 100644
--- a/src/mesh/parallel/shiftedmetricinterp.cxx
+++ b/src/mesh/parallel/shiftedmetricinterp.cxx
@@ -29,7 +29,7 @@
 
 #include "shiftedmetricinterp.hxx"
 #include "bout/constants.hxx"
-#include "bout/mask.hxx"
+#include "bout/parallel_boundary_region.hxx"
 
 ShiftedMetricInterp::ShiftedMetricInterp(Mesh& mesh, CELL_LOC location_in,
                                          Field2D zShift_in, BoutReal zlength_in,
@@ -114,11 +114,16 @@ ShiftedMetricInterp::ShiftedMetricInterp(Mesh& mesh, CELL_LOC location_in,
 
   interp_from_aligned->calcWeights(zt_prime_from);
 
+  int yvalid = mesh.LocalNy - 2 * mesh.ystart;
+  // avoid overflow - no stencil needs more than 5 points
+  if (yvalid > 20) {
+    yvalid = 20;
+  }
   // Create regions for parallel boundary conditions
   Field2D dy;
   mesh.get(dy, "dy", 1.);
-  auto forward_boundary_xin =
-      new BoundaryRegionPar("parallel_forward_xin", BNDRY_PAR_FWD_XIN, +1, &mesh);
+  auto forward_boundary_xin = std::make_shared<BoundaryRegionPar>(
+      "parallel_forward_xin", BNDRY_PAR_FWD_XIN, +1, &mesh);
   for (auto it = mesh.iterateBndryUpperY(); not it.isDone(); it.next()) {
     for (int z = mesh.zstart; z <= mesh.zend; z++) {
       forward_boundary_xin->add_point(
@@ -128,14 +133,13 @@ ShiftedMetricInterp::ShiftedMetricInterp(Mesh& mesh, CELL_LOC location_in,
           zlength * BoutReal(z) / BoutReal(mesh.GlobalNz) // z
               + 0.5 * (zShift(it.ind, mesh.yend + 1) - zShift(it.ind, mesh.yend)),
           0.25
-              * (dy(it.ind, mesh.yend) // dy/2
-                 + dy(it.ind, mesh.yend + 1)),
-          0. // angle?
-      );
+              * (1                                                     // dy/2
+                 + dy(it.ind, mesh.yend + 1) / dy(it.ind, mesh.yend)), // length
+          yvalid);
     }
   }
-  auto backward_boundary_xin =
-      new BoundaryRegionPar("parallel_backward_xin", BNDRY_PAR_BKWD_XIN, -1, &mesh);
+  auto backward_boundary_xin = std::make_shared<BoundaryRegionPar>(
+      "parallel_backward_xin", BNDRY_PAR_BKWD_XIN, -1, &mesh);
   for (auto it = mesh.iterateBndryLowerY(); not it.isDone(); it.next()) {
     for (int z = mesh.zstart; z <= mesh.zend; z++) {
       backward_boundary_xin->add_point(
@@ -145,15 +149,14 @@ ShiftedMetricInterp::ShiftedMetricInterp(Mesh& mesh, CELL_LOC location_in,
           zlength * BoutReal(z) / BoutReal(mesh.GlobalNz) // z
               + 0.5 * (zShift(it.ind, mesh.ystart) - zShift(it.ind, mesh.ystart - 1)),
           0.25
-              * (dy(it.ind, mesh.ystart - 1) // dy/2
-                 + dy(it.ind, mesh.ystart)),
-          0. // angle?
-      );
+              * (1 // dy/2
+                 + dy(it.ind, mesh.ystart - 1) / dy(it.ind, mesh.ystart)),
+          yvalid);
     }
   }
   // Create regions for parallel boundary conditions
-  auto forward_boundary_xout =
-      new BoundaryRegionPar("parallel_forward_xout", BNDRY_PAR_FWD_XOUT, +1, &mesh);
+  auto forward_boundary_xout = std::make_shared<BoundaryRegionPar>(
+      "parallel_forward_xout", BNDRY_PAR_FWD_XOUT, +1, &mesh);
   for (auto it = mesh.iterateBndryUpperY(); not it.isDone(); it.next()) {
     for (int z = mesh.zstart; z <= mesh.zend; z++) {
       forward_boundary_xout->add_point(
@@ -163,14 +166,13 @@ ShiftedMetricInterp::ShiftedMetricInterp(Mesh& mesh, CELL_LOC location_in,
           zlength * BoutReal(z) / BoutReal(mesh.GlobalNz) // z
               + 0.5 * (zShift(it.ind, mesh.yend + 1) - zShift(it.ind, mesh.yend)),
           0.25
-              * (dy(it.ind, mesh.yend) // dy/2
-                 + dy(it.ind, mesh.yend + 1)),
-          0. // angle?
-      );
+              * (1 // dy/2
+                 + dy(it.ind, mesh.yend + 1) / dy(it.ind, mesh.yend)),
+          yvalid);
     }
   }
-  auto backward_boundary_xout =
-      new BoundaryRegionPar("parallel_backward_xout", BNDRY_PAR_BKWD_XOUT, -1, &mesh);
+  auto backward_boundary_xout = std::make_shared<BoundaryRegionPar>(
+      "parallel_backward_xout", BNDRY_PAR_BKWD_XOUT, -1, &mesh);
   for (auto it = mesh.iterateBndryLowerY(); not it.isDone(); it.next()) {
     for (int z = mesh.zstart; z <= mesh.zend; z++) {
       backward_boundary_xout->add_point(
@@ -180,18 +182,17 @@ ShiftedMetricInterp::ShiftedMetricInterp(Mesh& mesh, CELL_LOC location_in,
           zlength * BoutReal(z) / BoutReal(mesh.GlobalNz) // z
               + 0.5 * (zShift(it.ind, mesh.ystart) - zShift(it.ind, mesh.ystart - 1)),
           0.25
-              * (dy(it.ind, mesh.ystart - 1) // dy/2
-                 + dy(it.ind, mesh.ystart)),
-          0. // angle?
-      );
+              * (dy(it.ind, mesh.ystart - 1) / dy(it.ind, mesh.ystart) // dy/2
+                 + 1),
+          yvalid);
     }
   }
 
   // Add the boundary region to the mesh's vector of parallel boundaries
-  mesh.addBoundaryPar(forward_boundary_xin);
-  mesh.addBoundaryPar(backward_boundary_xin);
-  mesh.addBoundaryPar(forward_boundary_xout);
-  mesh.addBoundaryPar(backward_boundary_xout);
+  mesh.addBoundaryPar(forward_boundary_xin, BoundaryParType::xin_fwd);
+  mesh.addBoundaryPar(backward_boundary_xin, BoundaryParType::xin_bwd);
+  mesh.addBoundaryPar(forward_boundary_xout, BoundaryParType::xout_fwd);
+  mesh.addBoundaryPar(backward_boundary_xout, BoundaryParType::xout_bwd);
 }
 
 void ShiftedMetricInterp::checkInputGrid() {
diff --git a/src/mesh/parallel/shiftedmetricinterp.hxx b/src/mesh/parallel/shiftedmetricinterp.hxx
index 93ea2f07be..6852ea15a9 100644
--- a/src/mesh/parallel/shiftedmetricinterp.hxx
+++ b/src/mesh/parallel/shiftedmetricinterp.hxx
@@ -24,8 +24,8 @@
  *
  **************************************************************************/
 
-#ifndef __SHIFTEDINTERP_H__
-#define __SHIFTEDINTERP_H__
+#ifndef BOUT_SHIFTEDINTERP_H
+#define BOUT_SHIFTEDINTERP_H
 
 #include <bout/interpolation_z.hxx>
 #include <bout/paralleltransform.hxx>
@@ -129,4 +129,4 @@ private:
   const std::size_t ydown_index;
 };
 
-#endif // __SHIFTEDINTERP_H__
+#endif // BOUT_SHIFTEDINTERP_H
diff --git a/src/mesh/parallel_boundary_op.cxx b/src/mesh/parallel_boundary_op.cxx
index 8b2c294a4a..ebd9852791 100644
--- a/src/mesh/parallel_boundary_op.cxx
+++ b/src/mesh/parallel_boundary_op.cxx
@@ -6,18 +6,15 @@
 #include "bout/output.hxx"
 
 BoutReal BoundaryOpPar::getValue(const BoundaryRegionPar& bndry, BoutReal t) {
-
-  Mesh* mesh = bndry.localmesh;
-
   BoutReal value;
 
   switch (value_type) {
   case ValueType::GEN:
-    return gen_values->generate(
-        bout::generator::Context(bndry.s_x, bndry.s_y, bndry.s_z, CELL_CENTRE, mesh, t));
+    return gen_values->generate(bout::generator::Context(
+        bndry.s_x(), bndry.s_y(), bndry.s_z(), CELL_CENTRE, bndry.localmesh, t));
   case ValueType::FIELD:
     // FIXME: Interpolate to s_x, s_y, s_z...
-    value = (*field_values)(bndry.x, bndry.y, bndry.z);
+    value = (*field_values)[bndry.ind()];
     return value;
   case ValueType::REAL:
     return real_value;
@@ -25,123 +22,3 @@ BoutReal BoundaryOpPar::getValue(const BoundaryRegionPar& bndry, BoutReal t) {
     throw BoutException("Invalid value_type encountered in BoundaryOpPar::getValue");
   }
 }
-
-//////////////////////////////////////////
-// Dirichlet boundary
-
-void BoundaryOpPar_dirichlet::apply(Field3D& f, BoutReal t) {
-  Field3D& f_next = f.ynext(bndry->dir);
-
-  Coordinates& coord = *(f.getCoordinates());
-
-  // Loop over grid points If point is in boundary, then fill in
-  // f_next such that the field would be VALUE on the boundary
-  for (bndry->first(); !bndry->isDone(); bndry->next()) {
-    // temp variables for convenience
-    int x = bndry->x;
-    int y = bndry->y;
-    int z = bndry->z;
-
-    // Generate the boundary value
-    BoutReal value = getValue(*bndry, t);
-
-    // Scale the field and normalise to the desired value
-    BoutReal y_prime = bndry->length;
-    BoutReal f2 = (f(x, y, z) - value) * (coord.dy(x, y, z) - y_prime) / y_prime;
-
-    f_next(x, y + bndry->dir, z) = value - f2;
-  }
-}
-
-//////////////////////////////////////////
-// Dirichlet boundary - Third order
-
-void BoundaryOpPar_dirichlet_O3::apply(Field3D& f, BoutReal t) {
-
-  Field3D& f_next = f.ynext(bndry->dir);
-  Field3D& f_prev = f.ynext(-bndry->dir);
-
-  Coordinates& coord = *(f.getCoordinates());
-
-  // Loop over grid points If point is in boundary, then fill in
-  // f_next such that the field would be VALUE on the boundary
-  for (bndry->first(); !bndry->isDone(); bndry->next()) {
-    // temp variables for convenience
-    int x = bndry->x;
-    int y = bndry->y;
-    int z = bndry->z;
-
-    // Generate the boundary value
-    BoutReal fb = getValue(*bndry, t);
-    BoutReal f1 = f_prev(x, y - bndry->dir, z);
-    BoutReal f2 = f(x, y, z);
-    BoutReal l1 = coord.dy(x, y, z);
-    BoutReal l2 = bndry->length;
-    BoutReal l3 = coord.dy(x, y, z) - l2;
-
-    BoutReal denom = (l1 * l1 * l2 + l1 * l2 * l2);
-    BoutReal term1 = (l2 * l2 * l3 + l2 * l3 * l3);
-    BoutReal term2 = l1 * (l1 + l2 + l3) * (l2 + l3);
-    BoutReal term3 = l3 * ((l1 + l2) * l3 + (l1 + l2) * (l1 + l2));
-
-    f_next(x, y + bndry->dir, z) = (term1 * f1 + term2 * fb - term3 * f2) / denom;
-  }
-}
-
-//////////////////////////////////////////
-// Dirichlet with interpolation
-
-void BoundaryOpPar_dirichlet_interp::apply(Field3D& f, BoutReal t) {
-
-  Field3D& f_next = f.ynext(bndry->dir);
-  Field3D& f_prev = f.ynext(-bndry->dir);
-
-  Coordinates& coord = *(f.getCoordinates());
-
-  // Loop over grid points If point is in boundary, then fill in
-  // f_next such that the field would be VALUE on the boundary
-  for (bndry->first(); !bndry->isDone(); bndry->next()) {
-    // temp variables for convenience
-    int x = bndry->x;
-    int y = bndry->y;
-    int z = bndry->z;
-
-    // Generate the boundary value
-    BoutReal fs = getValue(*bndry, t);
-
-    // Scale the field and normalise to the desired value
-    BoutReal dy = coord.dy(x, y, z);
-    BoutReal s = bndry->length * dy;
-
-    f_next(x, y + bndry->dir, z) =
-        f_prev(x, y - bndry->dir, z) * (1. - (2. * s / (dy + s)))
-        + 2. * f(x, y, z) * ((s - dy) / s) + fs * (dy / s - (2. / s + 1.));
-  }
-}
-
-//////////////////////////////////////////
-// Neumann boundary
-
-void BoundaryOpPar_neumann::apply(Field3D& f, BoutReal t) {
-  TRACE("BoundaryOpPar_neumann::apply");
-
-  Field3D& f_next = f.ynext(bndry->dir);
-  f_next.allocate(); // Ensure unique before modifying
-
-  Coordinates& coord = *(f.getCoordinates());
-
-  // If point is in boundary, then fill in f_next such that the derivative
-  // would be VALUE on the boundary
-  for (bndry->first(); !bndry->isDone(); bndry->next()) {
-    // temp variables for convience
-    int x = bndry->x;
-    int y = bndry->y;
-    int z = bndry->z;
-
-    // Generate the boundary value
-    BoutReal value = getValue(*bndry, t);
-    BoutReal dy = coord.dy(x, y, z);
-
-    f_next(x, y + bndry->dir, z) = f(x, y, z) + bndry->dir * value * dy;
-  }
-}
diff --git a/src/mesh/parallel_boundary_region.cxx b/src/mesh/parallel_boundary_region.cxx
index 3f77d96737..e69de29bb2 100644
--- a/src/mesh/parallel_boundary_region.cxx
+++ b/src/mesh/parallel_boundary_region.cxx
@@ -1,37 +0,0 @@
-#include "bout/parallel_boundary_region.hxx"
-
-void BoundaryRegionPar::add_point(const int jx, const int jy, const int jz,
-                                  const BoutReal x, const BoutReal y, const BoutReal z,
-                                  const BoutReal length, const BoutReal angle) {
-  bndry_points.push_back({{jx, jy, jz}, {x, y, z}, length, angle});
-}
-
-void BoundaryRegionPar::first() {
-  bndry_position = begin(bndry_points);
-  if (!isDone()) {
-    x = bndry_position->index.jx;
-    y = bndry_position->index.jy;
-    z = bndry_position->index.jz;
-    s_x = bndry_position->intersection.s_x;
-    s_y = bndry_position->intersection.s_y;
-    s_z = bndry_position->intersection.s_z;
-    length = bndry_position->length;
-    angle = bndry_position->angle;
-  }
-}
-
-void BoundaryRegionPar::next() {
-  ++bndry_position;
-  if (!isDone()) {
-    x = bndry_position->index.jx;
-    y = bndry_position->index.jy;
-    z = bndry_position->index.jz;
-    s_x = bndry_position->intersection.s_x;
-    s_y = bndry_position->intersection.s_y;
-    s_z = bndry_position->intersection.s_z;
-    length = bndry_position->length;
-    angle = bndry_position->angle;
-  }
-}
-
-bool BoundaryRegionPar::isDone() { return (bndry_position == end(bndry_points)); }
diff --git a/src/mesh/parallel_boundary_stencil.cxx.py b/src/mesh/parallel_boundary_stencil.cxx.py
new file mode 100644
index 0000000000..d0988ee099
--- /dev/null
+++ b/src/mesh/parallel_boundary_stencil.cxx.py
@@ -0,0 +1,62 @@
+import os
+from tempfile import NamedTemporaryFile as tmpf
+from stencils_sympy import dirichlet, neumann, simp, Symbol, Matrix, ccode
+
+
+def gen_code(order, matrix_type):
+    x = [Symbol("spacing%d" % i) for i in range(order)]
+    matrix = matrix_type(x)
+    A = Matrix(order, order, matrix)
+
+    try:
+        iA = A.inv()
+    except:
+        import sys
+
+        print(A, matrix, file=sys.stderr)
+        raise
+    return ccode(simp(sum([iA[0, i] * Symbol("value%d" % i) for i in range(order)])))
+
+
+def run(cmd):
+    print(cmd)
+    out = os.system(cmd)
+    assert out == 0
+
+
+if __name__ == "__main__":
+    with tmpf("w", dir=".", delete=False) as f:
+        f.write("namespace {\n")
+        f.write(
+            """
+inline BoutReal pow(BoutReal val, int exp) {
+  //constexpr int expval = exp;
+  //static_assert(expval == 2 or expval == 3, "This pow is only for exponent 2 or 3");
+  if (exp == 2) {
+    return val * val;
+  }
+  ASSERT3(exp == 3);
+  return val * val * val;
+}
+"""
+        )
+
+        for order in range(1, 4):
+            for matrix in dirichlet, neumann:
+                if order == 1 and matrix == neumann:
+                    continue
+                print(f"generating {matrix.name}_o{order}")
+                args = ", ".join(
+                    [
+                        "BoutReal spacing%d, BoutReal value%d" % (i, i)
+                        for i in range(order)
+                    ]
+                )
+                f.write(
+                    f"inline BoutReal stencil_{matrix.name}_o{order}({args}) {{\n  return "
+                )
+                f.write(gen_code(order, matrix))
+                f.write(";\n}\n")
+        f.write("}\n")
+    run("clang-format -i " + f.name)
+    run(f"mv {f.name} {__file__[:-3]}")
diff --git a/src/mesh/stencils.md b/src/mesh/stencils.md
new file mode 100644
index 0000000000..0c7d181481
--- /dev/null
+++ b/src/mesh/stencils.md
@@ -0,0 +1,29 @@
+Notes concerning the generation of stencils
+================
+
+We want to create a Taylor function
+$f(x-x_0)=\sum_{i=0}^n \frac{1}{i!}f_i(x-x_0)^i$ where $n$
+is the order of the function, $x_0$ is the point in the boundary
+where we want to calculate the function. $f_i$ are some coefficients
+that we need to determine. To be precise, only $f_0$ needs to be
+determined.
+We know that the function has at some points certain values. If the
+value at some distance `spacing.f0` is a given value `val` then we
+can build a linear system of equations using the above formula.
+If rather the derivative is given, the above equations need to be
+differentiated once.
+
+stencils_sympy.py calculates the coefficients of the above matrix
+which represents our system of equations. The derivative is simply
+the factor of the next smaller term (or zero if there is no
+smaller one). This is what is calculated by `taylor`, `dirichlet`
+and `neumann`, the respective matrix coefficients.
+
+sympy does all the heavy lifting on analytically inverting the
+matrix.
+
+With the analytic inversion we can put in the numerical offsets
+`spacing.f?` in C++ and get a fast expression for the respective
+coefficients. As mentioned before, we do not need the full inverse,
+just the first row, as we only care about the value, not about its
+derivative.
diff --git a/src/mesh/stencils_sympy.py b/src/mesh/stencils_sympy.py
new file mode 100644
index 0000000000..64677f1985
--- /dev/null
+++ b/src/mesh/stencils_sympy.py
@@ -0,0 +1,77 @@
+#!/usr/bin/env python3
+
+from sympy import Symbol, Eq
+from sympy.matrices import Matrix
+from sympy.printing import ccode
+from sympy.simplify import combsimp as simp
+from sympy.utilities.codegen import codegen
+
+
+def pow(a, b):
+    if b == 0:
+        return "1"
+    if b == 1:
+        return a
+    else:
+        return "%s**%d" % (a, b)
+
+
+def factorial(a):
+    if a == 0 or a == 1:
+        return 1
+    else:
+        assert a > 0
+        return a * factorial(a - 1)
+
+
+def gen_code(order, matrix_type):
+    x = [Symbol("spacing.f%d" % i) for i in range(order)]
+    matrix = matrix_type(x)
+    A = Matrix(order, order, matrix)
+
+    try:
+        iA = A.inv()
+    except:
+        import sys
+
+        print(A, matrix, file=sys.stderr)
+        raise
+    ret = ""
+    for i in range(order):
+        ret += ccode(simp(iA[0, i]), assign_to="facs.f%d" % i)
+        ret += "\n"
+    return ret
+
+
+def taylor(x, i, j):
+    if j >= 0:
+        return x[i] ** j / factorial(j)
+    else:
+        return 0
+
+
+class dirichlet:
+    name = "dirichlet"
+
+    def __init__(self, x):
+        self.x = x
+
+    def __call__(self, i, j):
+        return taylor(self.x, i, j)
+
+
+class neumann:
+    name = "neumann"
+
+    def __init__(self, x):
+        self.x = x
+
+    def __call__(self, i, j):
+        if i == 0:
+            return taylor(self.x, i, j - 1)
+        else:
+            return taylor(self.x, i, j)
+
+
+if __name__ == "__main__":
+    print(gen_code(3, dirichlet))
diff --git a/src/solver/impls/adams_bashforth/adams_bashforth.cxx b/src/solver/impls/adams_bashforth/adams_bashforth.cxx
index bfdea5e126..79161fcdbf 100644
--- a/src/solver/impls/adams_bashforth/adams_bashforth.cxx
+++ b/src/solver/impls/adams_bashforth/adams_bashforth.cxx
@@ -201,7 +201,7 @@ void AB_integrate_update(Array<BoutReal>& update, BoutReal timestep,
 
   for (std::size_t j = 0; j < static_cast<std::size_t>(order); ++j) {
     const BoutReal factor = AB_coefficients[j];
-    BOUT_OMP(parallel for)
+    BOUT_OMP_PERF(parallel for)
     for (std::size_t i = 0; i < static_cast<std::size_t>(update.size()); ++i) {
       update[i] += history[j][i] * factor;
     }
@@ -576,7 +576,7 @@ BoutReal AdamsBashforthSolver::take_step(const BoutReal timeIn, const BoutReal d
   // std::transform(std::begin(current), std::end(current), std::begin(full_update),
   //                std::begin(result), std::plus<BoutReal>{});
   if (not(adaptive and followHighOrder)) {
-    BOUT_OMP(parallel for)
+    BOUT_OMP_PERF(parallel for)
     for (int i = 0; i < nlocal; i++) {
       result[i] = current[i] + full_update[i];
     }
@@ -614,7 +614,7 @@ BoutReal AdamsBashforthSolver::take_step(const BoutReal timeIn, const BoutReal d
     // use this to calculate the derivatives at this point.
     // std::transform(std::begin(current), std::end(current), std::begin(half_update),
     //                std::begin(result2), std::plus<BoutReal>{});
-    BOUT_OMP(parallel for)
+    BOUT_OMP_PERF(parallel for)
     for (int i = 0; i < nlocal; i++) {
       result2[i] = current[i] + half_update[i];
     }
@@ -639,7 +639,7 @@ BoutReal AdamsBashforthSolver::take_step(const BoutReal timeIn, const BoutReal d
   // "full" two half step half_update. Rather than using result2 we just replace
   // result here as we want to use this smaller step result
   if (followHighOrder) {
-    BOUT_OMP(parallel for)
+    BOUT_OMP_PERF(parallel for)
     for (int i = 0; i < nlocal; i++) {
       result[i] = current[i] + half_update[i];
     }
diff --git a/src/solver/impls/adams_bashforth/adams_bashforth.hxx b/src/solver/impls/adams_bashforth/adams_bashforth.hxx
index ad8e77ed1c..60b3b2b05e 100644
--- a/src/solver/impls/adams_bashforth/adams_bashforth.hxx
+++ b/src/solver/impls/adams_bashforth/adams_bashforth.hxx
@@ -25,8 +25,8 @@
 
 class AdamsBashforthSolver;
 
-#ifndef __ADAMSBASHFORTH_SOLVER_H__
-#define __ADAMSBASHFORTH_SOLVER_H__
+#ifndef BOUT_ADAMSBASHFORTH_SOLVER_H
+#define BOUT_ADAMSBASHFORTH_SOLVER_H
 
 #include <bout/bout_types.hxx>
 #include <bout/solver.hxx>
@@ -96,4 +96,4 @@ private:
   int nlocal, neq;   // Number of variables on local processor and in total
 };
 
-#endif // __ADAMSBASHFORTH_SOLVER_H__
+#endif // BOUT_ADAMSBASHFORTH_SOLVER_H
diff --git a/src/solver/impls/arkode/arkode.cxx b/src/solver/impls/arkode/arkode.cxx
index aabe2ae050..bc3be6f80a 100644
--- a/src/solver/impls/arkode/arkode.cxx
+++ b/src/solver/impls/arkode/arkode.cxx
@@ -41,17 +41,7 @@
 #include "bout/unused.hxx"
 #include "bout/utils.hxx"
 
-#if SUNDIALS_VERSION_MAJOR >= 4
 #include <arkode/arkode_arkstep.h>
-#else
-#include <arkode/arkode.h>
-#if SUNDIALS_VERSION_MAJOR >= 3
-#include <arkode/arkode_spils.h>
-#else
-#include <arkode/arkode_spgmr.h>
-#endif
-#endif
-
 #include <arkode/arkode_bbdpre.h>
 #include <sundials/sundials_math.h>
 #include <sundials/sundials_types.h>
@@ -61,110 +51,21 @@
 
 class Field2D;
 
-#define ZERO RCONST(0.)
-#define ONE RCONST(1.0)
-
-#ifndef ARKODEINT
-#if SUNDIALS_VERSION_MAJOR < 3
-using ARKODEINT = bout::utils::function_traits<ARKLocalFn>::arg_t<0>;
-#else
-using ARKODEINT = sunindextype;
-#endif
-#endif
-
-static int arkode_rhs_explicit(BoutReal t, N_Vector u, N_Vector du, void* user_data);
-static int arkode_rhs_implicit(BoutReal t, N_Vector u, N_Vector du, void* user_data);
-static int arkode_rhs(BoutReal t, N_Vector u, N_Vector du, void* user_data);
-
-static int arkode_bbd_rhs(ARKODEINT Nlocal, BoutReal t, N_Vector u, N_Vector du,
-                          void* user_data);
-static int arkode_pre(BoutReal t, N_Vector yy, N_Vector yp, N_Vector rvec, N_Vector zvec,
-                      BoutReal gamma, BoutReal delta, int lr, void* user_data);
-#if SUNDIALS_VERSION_MAJOR < 3
-// Shim for earlier versions
-inline static int arkode_pre_shim(BoutReal t, N_Vector yy, N_Vector yp, N_Vector rvec,
-                                  N_Vector zvec, BoutReal gamma, BoutReal delta, int lr,
-                                  void* user_data, N_Vector UNUSED(tmp)) {
-  return arkode_pre(t, yy, yp, rvec, zvec, gamma, delta, lr, user_data);
-}
-#else
-// Alias for newer versions
-constexpr auto& arkode_pre_shim = arkode_pre;
-#endif
+// NOLINTBEGIN(readability-identifier-length)
+namespace {
+int arkode_rhs_explicit(BoutReal t, N_Vector u, N_Vector du, void* user_data);
+int arkode_rhs_implicit(BoutReal t, N_Vector u, N_Vector du, void* user_data);
+int arkode_rhs(BoutReal t, N_Vector u, N_Vector du, void* user_data);
 
-static int arkode_jac(N_Vector v, N_Vector Jv, realtype t, N_Vector y, N_Vector fy,
-                      void* user_data, N_Vector tmp);
-#if SUNDIALS_VERSION_MAJOR < 4
-// Shim for earlier versions
-inline int ARKStepSetJacTimes(void* arkode_mem, std::nullptr_t,
-                              ARKSpilsJacTimesVecFn jtimes) {
-#if SUNDIALS_VERSION_MAJOR < 3
-  return ARKSpilsSetJacTimesVecFn(arkode_mem, jtimes);
-#else
-  return ARKSpilsSetJacTimes(arkode_mem, nullptr, jtimes);
-#endif
-}
-#endif
+int arkode_bbd_rhs(sunindextype Nlocal, BoutReal t, N_Vector u, N_Vector du,
+                   void* user_data);
+int arkode_pre(BoutReal t, N_Vector yy, N_Vector yp, N_Vector rvec, N_Vector zvec,
+               BoutReal gamma, BoutReal delta, int lr, void* user_data);
 
-#if SUNDIALS_VERSION_MAJOR < 4
-void* ARKStepCreate(ARKRhsFn fe, ARKRhsFn fi, BoutReal t0, N_Vector y0) {
-  auto arkode_mem = ARKodeCreate();
-
-  if (arkode_mem == nullptr) {
-    throw BoutException("ARKodeCreate failed\n");
-  }
-  if (ARKodeInit(arkode_mem, fe, fi, t0, y0) != ARK_SUCCESS) {
-    throw BoutException("ARKodeInit failed\n");
-  }
-  return arkode_mem;
-}
-
-#if SUNDIALS_VERSION_MAJOR == 3
-int ARKStepSetLinearSolver(void* arkode_mem, SUNLinearSolver LS, std::nullptr_t) {
-  return ARKSpilsSetLinearSolver(arkode_mem, LS);
-}
-#endif
-
-// Aliases for older versions
-// In SUNDIALS 4, ARKode has become ARKStep, hence all the renames
-constexpr auto& ARKStepEvolve = ARKode;
-constexpr auto& ARKStepFree = ARKodeFree;
-constexpr auto& ARKStepGetCurrentTime = ARKodeGetCurrentTime;
-constexpr auto& ARKStepGetDky = ARKodeGetDky;
-constexpr auto& ARKStepGetLastStep = ARKodeGetLastStep;
-constexpr auto& ARKStepGetNumLinIters = ARKSpilsGetNumLinIters;
-constexpr auto& ARKStepGetNumNonlinSolvIters = ARKodeGetNumNonlinSolvIters;
-constexpr auto& ARKStepGetNumPrecEvals = ARKSpilsGetNumPrecEvals;
-constexpr auto& ARKStepGetNumRhsEvals = ARKodeGetNumRhsEvals;
-constexpr auto& ARKStepGetNumSteps = ARKodeGetNumSteps;
-constexpr auto& ARKStepReInit = ARKodeReInit;
-constexpr auto& ARKStepSStolerances = ARKodeSStolerances;
-constexpr auto& ARKStepSVtolerances = ARKodeSVtolerances;
-constexpr auto& ARKStepSetAdaptivityMethod = ARKodeSetAdaptivityMethod;
-constexpr auto& ARKStepSetCFLFraction = ARKodeSetCFLFraction;
-constexpr auto& ARKStepSetEpsLin = ARKSpilsSetEpsLin;
-constexpr auto& ARKStepSetExplicit = ARKodeSetExplicit;
-constexpr auto& ARKStepSetFixedPoint = ARKodeSetFixedPoint;
-constexpr auto& ARKStepSetFixedStep = ARKodeSetFixedStep;
-constexpr auto& ARKStepSetImEx = ARKodeSetImEx;
-constexpr auto& ARKStepSetImplicit = ARKodeSetImplicit;
-constexpr auto& ARKStepSetInitStep = ARKodeSetInitStep;
-constexpr auto& ARKStepSetLinear = ARKodeSetLinear;
-constexpr auto& ARKStepSetMaxNumSteps = ARKodeSetMaxNumSteps;
-constexpr auto& ARKStepSetMaxStep = ARKodeSetMaxStep;
-constexpr auto& ARKStepSetMinStep = ARKodeSetMinStep;
-constexpr auto& ARKStepSetOptimalParams = ARKodeSetOptimalParams;
-constexpr auto& ARKStepSetOrder = ARKodeSetOrder;
-constexpr auto& ARKStepSetPreconditioner = ARKSpilsSetPreconditioner;
-constexpr auto& ARKStepSetUserData = ARKodeSetUserData;
-#endif
-
-#if SUNDIALS_VERSION_MAJOR < 6
-void* ARKStepCreate(ARKRhsFn fe, ARKRhsFn fi, BoutReal t0, N_Vector y0,
-                    [[maybe_unused]] SUNContext context) {
-  return ARKStepCreate(fe, fi, t0, y0);
-}
-#endif
+int arkode_jac(N_Vector v, N_Vector Jv, BoutReal t, N_Vector y, N_Vector fy,
+               void* user_data, N_Vector tmp);
+} // namespace
+// NOLINTEND(readability-identifier-length)
 
 ArkodeSolver::ArkodeSolver(Options* opts)
     : Solver(opts), diagnose((*options)["diagnose"]
@@ -187,6 +88,14 @@ ArkodeSolver::ArkodeSolver(Options* opts)
                           "not recommended except for code comparison")
                      .withDefault(false)),
       order((*options)["order"].doc("Order of internal step").withDefault(4)),
+#if SUNDIALS_TABLE_BY_NAME_SUPPORT
+      implicit_table((*options)["implicit_table"]
+                         .doc("Name of the implicit Butcher table")
+                         .withDefault("")),
+      explicit_table((*options)["explicit_table"]
+                         .doc("Name of the explicit Butcher table")
+                         .withDefault("")),
+#endif
       cfl_frac((*options)["cfl_frac"]
                    .doc("Fraction of the estimated explicitly stable step to use")
                    .withDefault(-1.0)),
@@ -226,7 +135,7 @@ ArkodeSolver::ArkodeSolver(Options* opts)
                        .withDefault(false)),
       optimize(
           (*options)["optimize"].doc("Use ARKode optimal parameters").withDefault(false)),
-      suncontext(static_cast<void*>(&BoutComm::get())) {
+      suncontext(createSUNContext(BoutComm::get())) {
   has_constraints = false; // This solver doesn't have constraints
 
   // Add diagnostics to output
@@ -243,10 +152,14 @@ ArkodeSolver::ArkodeSolver(Options* opts)
 }
 
 ArkodeSolver::~ArkodeSolver() {
-  N_VDestroy_Parallel(uvec);
+  N_VDestroy(uvec);
   ARKStepFree(&arkode_mem);
   SUNLinSolFree(sun_solver);
   SUNNonlinSolFree(nonlinear_solver);
+
+#if SUNDIALS_CONTROLLER_SUPPORT
+  SUNAdaptController_Destroy(controller);
+#endif
 }
 
 /**************************************************************************
@@ -274,12 +187,13 @@ int ArkodeSolver::init() {
                n2Dvars(), neq, local_N);
 
   // Allocate memory
-  if ((uvec = N_VNew_Parallel(BoutComm::get(), local_N, neq, suncontext)) == nullptr) {
+  uvec = callWithSUNContext(N_VNew_Parallel, suncontext, BoutComm::get(), local_N, neq);
+  if (uvec == nullptr) {
     throw BoutException("SUNDIALS memory allocation failed\n");
   }
 
   // Put the variables into uvec
-  save_vars(NV_DATA_P(uvec));
+  save_vars(N_VGetArrayPointer(uvec));
 
   ASSERT1(solve_explicit or solve_implicit);
 
@@ -298,8 +212,9 @@ int ArkodeSolver::init() {
     }
   }();
 
-  if ((arkode_mem = ARKStepCreate(explicit_rhs, implicit_rhs, simtime, uvec, suncontext))
-      == nullptr) {
+  arkode_mem = callWithSUNContext(ARKStepCreate, suncontext, explicit_rhs, implicit_rhs,
+                                  simtime, uvec);
+  if (arkode_mem == nullptr) {
     throw BoutException("ARKStepCreate failed\n");
   }
 
@@ -325,11 +240,8 @@ int ArkodeSolver::init() {
     throw BoutException("ARKStepSetUserData failed\n");
   }
 
-  if (set_linear) {
-    output.write("\tSetting ARKStep implicit solver to Linear\n");
-    if (ARKStepSetLinear(arkode_mem, 1) != ARK_SUCCESS) {
-      throw BoutException("ARKStepSetLinear failed\n");
-    }
+  if (set_linear && ARKStepSetLinear(arkode_mem, 1) != ARK_SUCCESS) {
+    throw BoutException("ARKStepSetLinear failed\n");
   }
 
   if (fixed_step) {
@@ -344,13 +256,59 @@ int ArkodeSolver::init() {
     throw BoutException("ARKStepSetOrder failed\n");
   }
 
+#if SUNDIALS_TABLE_BY_NAME_SUPPORT
+  if (!implicit_table.empty() || !explicit_table.empty()) {
+    if (ARKStepSetTableName(
+            arkode_mem,
+            implicit_table.empty() ? "ARKODE_DIRK_NONE" : implicit_table.c_str(),
+            explicit_table.empty() ? "ARKODE_ERK_NONE" : explicit_table.c_str())
+        != ARK_SUCCESS) {
+      throw BoutException("ARKStepSetTableName failed\n");
+    }
+  }
+#endif
+
   if (ARKStepSetCFLFraction(arkode_mem, cfl_frac) != ARK_SUCCESS) {
     throw BoutException("ARKStepSetCFLFraction failed\n");
   }
 
+#if SUNDIALS_CONTROLLER_SUPPORT
+  switch (adap_method) {
+  case 0:
+    controller = SUNAdaptController_PID(suncontext);
+    break;
+  case 1:
+    controller = SUNAdaptController_PI(suncontext);
+    break;
+  case 2:
+    controller = SUNAdaptController_I(suncontext);
+    break;
+  case 3:
+    controller = SUNAdaptController_ExpGus(suncontext);
+    break;
+  case 4:
+    controller = SUNAdaptController_ImpGus(suncontext);
+    break;
+  case 5:
+    controller = SUNAdaptController_ImExGus(suncontext);
+    break;
+
+  default:
+    throw BoutException("Invalid adap_method\n");
+  }
+
+  if (ARKStepSetAdaptController(arkode_mem, controller) != ARK_SUCCESS) {
+    throw BoutException("ARKStepSetAdaptController failed\n");
+  }
+
+  if (ARKStepSetAdaptivityAdjustment(arkode_mem, 0) != ARK_SUCCESS) {
+    throw BoutException("ARKStepSetAdaptivityAdjustment failed\n");
+  }
+#else
   if (ARKStepSetAdaptivityMethod(arkode_mem, adap_method, 1, 1, nullptr) != ARK_SUCCESS) {
     throw BoutException("ARKStepSetAdaptivityMethod failed\n");
   }
+#endif
 
   if (use_vector_abstol) {
     std::vector<BoutReal> f2dtols;
@@ -374,18 +332,18 @@ int ArkodeSolver::init() {
                      return Options::root()[f3.name]["atol"].withDefault(abstol);
                    });
 
-    N_Vector abstolvec = N_VNew_Parallel(BoutComm::get(), local_N, neq, suncontext);
+    N_Vector abstolvec = N_VClone(uvec);
     if (abstolvec == nullptr) {
       throw BoutException("SUNDIALS memory allocation (abstol vector) failed\n");
     }
 
-    set_abstol_values(NV_DATA_P(abstolvec), f2dtols, f3dtols);
+    set_abstol_values(N_VGetArrayPointer(abstolvec), f2dtols, f3dtols);
 
     if (ARKStepSVtolerances(arkode_mem, reltol, abstolvec) != ARK_SUCCESS) {
       throw BoutException("ARKStepSVtolerances failed\n");
     }
 
-    N_VDestroy_Parallel(abstolvec);
+    N_VDestroy(abstolvec);
   } else {
     if (ARKStepSStolerances(arkode_mem, reltol, abstol) != ARK_SUCCESS) {
       throw BoutException("ARKStepSStolerances failed\n");
@@ -414,118 +372,79 @@ int ArkodeSolver::init() {
     }
   }
 
-  // ARKStepSetPredictorMethod(arkode_mem,4);
-
-#if SUNDIALS_VERSION_MAJOR < 4
-  if (fixed_point) {
-    output.write("\tUsing accelerated fixed point solver\n");
-    if (ARKodeSetFixedPoint(arkode_mem, 3.0)) {
-      throw BoutException("ARKodeSetFixedPoint failed\n");
-    }
-  } else {
-    output.write("\tUsing Newton iteration\n");
-    if (ARKodeSetNewton(arkode_mem)) {
-      throw BoutException("ARKodeSetNewton failed\n");
-    }
-  }
-#else
   if (fixed_point) {
     output.write("\tUsing accelerated fixed point solver\n");
-    if ((nonlinear_solver = SUNNonlinSol_FixedPoint(uvec, 3, suncontext)) == nullptr) {
+    nonlinear_solver = callWithSUNContext(SUNNonlinSol_FixedPoint, suncontext, uvec, 3);
+    if (nonlinear_solver == nullptr) {
       throw BoutException("Creating SUNDIALS fixed point nonlinear solver failed\n");
     }
+    if (ARKStepSetNonlinearSolver(arkode_mem, nonlinear_solver) != ARK_SUCCESS) {
+      throw BoutException("ARKStepSetNonlinearSolver failed\n");
+    }
   } else {
     output.write("\tUsing Newton iteration\n");
-    if ((nonlinear_solver = SUNNonlinSol_Newton(uvec, suncontext)) == nullptr) {
-      throw BoutException("Creating SUNDIALS Newton nonlinear solver failed\n");
-    }
-  }
-  if (ARKStepSetNonlinearSolver(arkode_mem, nonlinear_solver) != ARK_SUCCESS) {
-    throw BoutException("ARKStepSetNonlinearSolver failed\n");
-  }
-#endif
-
-  /// Set Preconditioner
-  if (use_precon) {
-    const int prectype = rightprec ? SUN_PREC_RIGHT : SUN_PREC_LEFT;
 
-#if SUNDIALS_VERSION_MAJOR >= 3
-    if ((sun_solver = SUNLinSol_SPGMR(uvec, prectype, maxl, suncontext)) == nullptr) {
+    const auto prectype =
+        use_precon ? (rightprec ? SUN_PREC_RIGHT : SUN_PREC_LEFT) : SUN_PREC_NONE;
+    sun_solver = callWithSUNContext(SUNLinSol_SPGMR, suncontext, uvec, prectype, maxl);
+    if (sun_solver == nullptr) {
       throw BoutException("Creating SUNDIALS linear solver failed\n");
     }
-    if (ARKStepSetLinearSolver(arkode_mem, sun_solver, nullptr) != ARK_SUCCESS) {
+    if (ARKStepSetLinearSolver(arkode_mem, sun_solver, nullptr) != ARKLS_SUCCESS) {
       throw BoutException("ARKStepSetLinearSolver failed\n");
     }
-#else
-    if (ARKSpgmr(arkode_mem, prectype, maxl) != ARKSPILS_SUCCESS) {
-      throw BoutException("ARKSpgmr failed\n");
-    }
-#endif
 
-    if (!hasPreconditioner()) {
-      output.write("\tUsing BBD preconditioner\n");
-
-      /// Get options
-      // Compute band_width_default from actually added fields, to allow for multiple
-      // Mesh objects
-      //
-      // Previous implementation was equivalent to:
-      //   int MXSUB = mesh->xend - mesh->xstart + 1;
-      //   int band_width_default = n3Dvars()*(MXSUB+2);
-      const int band_width_default = std::accumulate(
-          begin(f3d), end(f3d), 0, [](int a, const VarStr<Field3D>& fvar) {
-            Mesh* localmesh = fvar.var->getMesh();
-            return a + localmesh->xend - localmesh->xstart + 3;
-          });
-
-      const auto mudq = (*options)["mudq"]
-                            .doc("Upper half-bandwidth to be used in the difference "
-                                 "quotient Jacobian approximation")
-                            .withDefault(band_width_default);
-      const auto mldq = (*options)["mldq"]
-                            .doc("Lower half-bandwidth to be used in the difference "
-                                 "quotient Jacobian approximation")
-                            .withDefault(band_width_default);
-      const auto mukeep = (*options)["mukeep"]
-                              .doc("Upper half-bandwidth of the retained banded "
-                                   "approximate Jacobian block")
-                              .withDefault(n3Dvars() + n2Dvars());
-      const auto mlkeep = (*options)["mlkeep"]
-                              .doc("Lower half-bandwidth of the retained banded "
-                                   "approximate Jacobian block")
-                              .withDefault(n3Dvars() + n2Dvars());
-
-      if (ARKBBDPrecInit(arkode_mem, local_N, mudq, mldq, mukeep, mlkeep, ZERO,
-                         arkode_bbd_rhs, nullptr)
-          != ARK_SUCCESS) {
-        throw BoutException("ARKBBDPrecInit failed\n");
+    /// Set Preconditioner
+    if (use_precon) {
+      if (hasPreconditioner()) {
+        output.write("\tUsing user-supplied preconditioner\n");
+
+        if (ARKStepSetPreconditioner(arkode_mem, nullptr, arkode_pre) != ARKLS_SUCCESS) {
+          throw BoutException("ARKStepSetPreconditioner failed\n");
+        }
+      } else {
+        output.write("\tUsing BBD preconditioner\n");
+
+        /// Get options
+        // Compute band_width_default from actually added fields, to allow for multiple
+        // Mesh objects
+        //
+        // Previous implementation was equivalent to:
+        //   int MXSUB = mesh->xend - mesh->xstart + 1;
+        //   int band_width_default = n3Dvars()*(MXSUB+2);
+        const int band_width_default = std::accumulate(
+            begin(f3d), end(f3d), 0, [](int acc, const VarStr<Field3D>& fvar) {
+              Mesh* localmesh = fvar.var->getMesh();
+              return acc + localmesh->xend - localmesh->xstart + 3;
+            });
+
+        const auto mudq = (*options)["mudq"]
+                              .doc("Upper half-bandwidth to be used in the difference "
+                                   "quotient Jacobian approximation")
+                              .withDefault(band_width_default);
+        const auto mldq = (*options)["mldq"]
+                              .doc("Lower half-bandwidth to be used in the difference "
+                                   "quotient Jacobian approximation")
+                              .withDefault(band_width_default);
+        const auto mukeep = (*options)["mukeep"]
+                                .doc("Upper half-bandwidth of the retained banded "
+                                     "approximate Jacobian block")
+                                .withDefault(n3Dvars() + n2Dvars());
+        const auto mlkeep = (*options)["mlkeep"]
+                                .doc("Lower half-bandwidth of the retained banded "
+                                     "approximate Jacobian block")
+                                .withDefault(n3Dvars() + n2Dvars());
+
+        if (ARKBBDPrecInit(arkode_mem, local_N, mudq, mldq, mukeep, mlkeep, 0.0,
+                           arkode_bbd_rhs, nullptr)
+            != ARKLS_SUCCESS) {
+          throw BoutException("ARKBBDPrecInit failed\n");
+        }
       }
-
     } else {
-      output.write("\tUsing user-supplied preconditioner\n");
-
-      if (ARKStepSetPreconditioner(arkode_mem, nullptr, arkode_pre_shim) != ARK_SUCCESS) {
-        throw BoutException("ARKStepSetPreconditioner failed\n");
-      }
+      // Not using preconditioning
+      output.write("\tNo preconditioning\n");
     }
-  } else {
-    // Not using preconditioning
-
-    output.write("\tNo preconditioning\n");
-
-#if SUNDIALS_VERSION_MAJOR >= 3
-    if ((sun_solver = SUNLinSol_SPGMR(uvec, SUN_PREC_NONE, maxl, suncontext))
-        == nullptr) {
-      throw BoutException("Creating SUNDIALS linear solver failed\n");
-    }
-    if (ARKStepSetLinearSolver(arkode_mem, sun_solver, nullptr) != ARK_SUCCESS) {
-      throw BoutException("ARKStepSetLinearSolver failed\n");
-    }
-#else
-    if (ARKSpgmr(arkode_mem, SUN_PREC_NONE, maxl) != ARKSPILS_SUCCESS) {
-      throw BoutException("ARKSpgmr failed\n");
-    }
-#endif
   }
 
   /// Set Jacobian-vector multiplication function
@@ -533,8 +452,8 @@ int ArkodeSolver::init() {
   if (use_jacobian and hasJacobian()) {
     output.write("\tUsing user-supplied Jacobian function\n");
 
-    if (ARKStepSetJacTimes(arkode_mem, nullptr, arkode_jac) != ARK_SUCCESS) {
-      throw BoutException("ARKStepSetJacTimesVecFn failed\n");
+    if (ARKStepSetJacTimes(arkode_mem, nullptr, arkode_jac) != ARKLS_SUCCESS) {
+      throw BoutException("ARKStepSetJacTimes failed\n");
     }
   } else {
     output.write("\tUsing difference quotient approximation for Jacobian\n");
@@ -645,7 +564,7 @@ BoutReal ArkodeSolver::run(BoutReal tout) {
   }
 
   // Copy variables
-  load_vars(NV_DATA_P(uvec));
+  load_vars(N_VGetArrayPointer(uvec));
   // Call rhs function to get extra variables at this time
   run_rhs(simtime);
   // run_diffusive(simtime);
@@ -718,8 +637,8 @@ void ArkodeSolver::pre(BoutReal t, BoutReal gamma, BoutReal delta, BoutReal* uda
 
   if (!hasPreconditioner()) {
     // Identity (but should never happen)
-    const int N = NV_LOCLENGTH_P(uvec);
-    std::copy(rvec, rvec + N, zvec);
+    const auto length = N_VGetLocalLength_Parallel(uvec);
+    std::copy(rvec, rvec + length, zvec);
     return;
   }
 
@@ -766,10 +685,12 @@ void ArkodeSolver::jac(BoutReal t, BoutReal* ydata, BoutReal* vdata, BoutReal* J
  * ARKODE explicit RHS functions
  **************************************************************************/
 
-static int arkode_rhs_explicit(BoutReal t, N_Vector u, N_Vector du, void* user_data) {
+// NOLINTBEGIN(readability-identifier-length)
+namespace {
+int arkode_rhs_explicit(BoutReal t, N_Vector u, N_Vector du, void* user_data) {
 
-  BoutReal* udata = NV_DATA_P(u);
-  BoutReal* dudata = NV_DATA_P(du);
+  BoutReal* udata = N_VGetArrayPointer(u);
+  BoutReal* dudata = N_VGetArrayPointer(du);
 
   auto* s = static_cast<ArkodeSolver*>(user_data);
 
@@ -782,10 +703,10 @@ static int arkode_rhs_explicit(BoutReal t, N_Vector u, N_Vector du, void* user_d
   return 0;
 }
 
-static int arkode_rhs_implicit(BoutReal t, N_Vector u, N_Vector du, void* user_data) {
+int arkode_rhs_implicit(BoutReal t, N_Vector u, N_Vector du, void* user_data) {
 
-  BoutReal* udata = NV_DATA_P(u);
-  BoutReal* dudata = NV_DATA_P(du);
+  BoutReal* udata = N_VGetArrayPointer(u);
+  BoutReal* dudata = N_VGetArrayPointer(du);
 
   auto* s = static_cast<ArkodeSolver*>(user_data);
 
@@ -798,10 +719,10 @@ static int arkode_rhs_implicit(BoutReal t, N_Vector u, N_Vector du, void* user_d
   return 0;
 }
 
-static int arkode_rhs(BoutReal t, N_Vector u, N_Vector du, void* user_data) {
+int arkode_rhs(BoutReal t, N_Vector u, N_Vector du, void* user_data) {
 
-  BoutReal* udata = NV_DATA_P(u);
-  BoutReal* dudata = NV_DATA_P(du);
+  BoutReal* udata = N_VGetArrayPointer(u);
+  BoutReal* dudata = N_VGetArrayPointer(du);
 
   auto* s = static_cast<ArkodeSolver*>(user_data);
 
@@ -815,18 +736,17 @@ static int arkode_rhs(BoutReal t, N_Vector u, N_Vector du, void* user_data) {
 }
 
 /// RHS function for BBD preconditioner
-static int arkode_bbd_rhs(ARKODEINT UNUSED(Nlocal), BoutReal t, N_Vector u, N_Vector du,
-                          void* user_data) {
+int arkode_bbd_rhs(sunindextype UNUSED(Nlocal), BoutReal t, N_Vector u, N_Vector du,
+                   void* user_data) {
   return arkode_rhs_implicit(t, u, du, user_data);
 }
 
 /// Preconditioner function
-static int arkode_pre(BoutReal t, N_Vector yy, N_Vector UNUSED(yp), N_Vector rvec,
-                      N_Vector zvec, BoutReal gamma, BoutReal delta, int UNUSED(lr),
-                      void* user_data) {
-  BoutReal* udata = NV_DATA_P(yy);
-  BoutReal* rdata = NV_DATA_P(rvec);
-  BoutReal* zdata = NV_DATA_P(zvec);
+int arkode_pre(BoutReal t, N_Vector yy, N_Vector UNUSED(yp), N_Vector rvec, N_Vector zvec,
+               BoutReal gamma, BoutReal delta, int UNUSED(lr), void* user_data) {
+  BoutReal* udata = N_VGetArrayPointer(yy);
+  BoutReal* rdata = N_VGetArrayPointer(rvec);
+  BoutReal* zdata = N_VGetArrayPointer(zvec);
 
   auto* s = static_cast<ArkodeSolver*>(user_data);
 
@@ -837,11 +757,11 @@ static int arkode_pre(BoutReal t, N_Vector yy, N_Vector UNUSED(yp), N_Vector rve
 }
 
 /// Jacobian-vector multiplication function
-static int arkode_jac(N_Vector v, N_Vector Jv, realtype t, N_Vector y,
-                      N_Vector UNUSED(fy), void* user_data, N_Vector UNUSED(tmp)) {
-  BoutReal* ydata = NV_DATA_P(y);   ///< System state
-  BoutReal* vdata = NV_DATA_P(v);   ///< Input vector
-  BoutReal* Jvdata = NV_DATA_P(Jv); ///< Jacobian*vector output
+int arkode_jac(N_Vector v, N_Vector Jv, BoutReal t, N_Vector y, N_Vector UNUSED(fy),
+               void* user_data, N_Vector UNUSED(tmp)) {
+  BoutReal* ydata = N_VGetArrayPointer(y);   ///< System state
+  BoutReal* vdata = N_VGetArrayPointer(v);   ///< Input vector
+  BoutReal* Jvdata = N_VGetArrayPointer(Jv); ///< Jacobian*vector output
 
   auto* s = static_cast<ArkodeSolver*>(user_data);
 
@@ -849,6 +769,8 @@ static int arkode_jac(N_Vector v, N_Vector Jv, realtype t, N_Vector y,
 
   return 0;
 }
+} // namespace
+// NOLINTEND(readability-identifier-length)
 
 /**************************************************************************
  * vector abstol functions
diff --git a/src/solver/impls/arkode/arkode.hxx b/src/solver/impls/arkode/arkode.hxx
index afdce0b701..08bb3ea729 100644
--- a/src/solver/impls/arkode/arkode.hxx
+++ b/src/solver/impls/arkode/arkode.hxx
@@ -26,8 +26,8 @@
  *
  **************************************************************************/
 
-#ifndef __ARKODE_SOLVER_H__
-#define __ARKODE_SOLVER_H__
+#ifndef BOUT_ARKODE_SOLVER_H
+#define BOUT_ARKODE_SOLVER_H
 
 #include "bout/build_config.hxx"
 #include "bout/solver.hxx"
@@ -47,6 +47,10 @@ RegisterUnavailableSolver
 #include <nvector/nvector_parallel.h>
 #include <sundials/sundials_config.h>
 
+#if SUNDIALS_CONTROLLER_SUPPORT
+#include <sundials/sundials_adaptcontroller.h>
+#endif
+
 #include <vector>
 
 class ArkodeSolver;
@@ -102,6 +106,10 @@ private:
   bool fixed_step;
   /// Order of internal step
   int order;
+  /// Name of the implicit Butcher table
+  std::string implicit_table;
+  /// Name of the explicit Butcher table
+  std::string explicit_table;
   /// Fraction of the estimated explicitly stable step to use
   BoutReal cfl_frac;
   /// Set timestep adaptivity function:
@@ -153,11 +161,15 @@ private:
 
   /// SPGMR solver structure
   SUNLinearSolver sun_solver{nullptr};
-  /// Solver for functional iterations for Adams-Moulton
+  /// Solver for implicit stages
   SUNNonlinearSolver nonlinear_solver{nullptr};
+#if SUNDIALS_CONTROLLER_SUPPORT
+  /// Timestep controller
+  SUNAdaptController controller{nullptr};
+#endif
   /// Context for SUNDIALS memory allocations
   sundials::Context suncontext;
 };
 
 #endif // BOUT_HAS_ARKODE
-#endif // __ARKODE_SOLVER_H__
+#endif // BOUT_ARKODE_SOLVER_H
diff --git a/src/solver/impls/cvode/cvode.cxx b/src/solver/impls/cvode/cvode.cxx
index c17bed420c..22f7f154f7 100644
--- a/src/solver/impls/cvode/cvode.cxx
+++ b/src/solver/impls/cvode/cvode.cxx
@@ -44,16 +44,9 @@
 #include "fmt/core.h"
 
 #include <cvode/cvode.h>
-
-#if SUNDIALS_VERSION_MAJOR >= 3
-#include <cvode/cvode_spils.h>
-#include <sunlinsol/sunlinsol_spgmr.h>
-#else
-#include <cvode/cvode_spgmr.h>
-#endif
-
 #include <cvode/cvode_bbdpre.h>
 #include <sundials/sundials_types.h>
+#include <sunlinsol/sunlinsol_spgmr.h>
 
 #include <algorithm>
 #include <numeric>
@@ -61,68 +54,22 @@
 
 class Field2D;
 
-#define ZERO RCONST(0.)
-#define ONE RCONST(1.0)
-
-#ifndef CVODEINT
-#if SUNDIALS_VERSION_MAJOR < 3
-using CVODEINT = bout::utils::function_traits<CVLocalFn>::arg_t<0>;
-#else
-using CVODEINT = sunindextype;
-#endif
-#endif
-
 BOUT_ENUM_CLASS(positivity_constraint, none, positive, non_negative, negative,
                 non_positive);
 
-static int cvode_rhs(BoutReal t, N_Vector u, N_Vector du, void* user_data);
-static int cvode_bbd_rhs(CVODEINT Nlocal, BoutReal t, N_Vector u, N_Vector du,
-                         void* user_data);
+// NOLINTBEGIN(readability-identifier-length)
+namespace {
+int cvode_rhs(BoutReal t, N_Vector u, N_Vector du, void* user_data);
+int cvode_bbd_rhs(sunindextype Nlocal, BoutReal t, N_Vector u, N_Vector du,
+                  void* user_data);
 
-static int cvode_pre(BoutReal t, N_Vector yy, N_Vector yp, N_Vector rvec, N_Vector zvec,
-                     BoutReal gamma, BoutReal delta, int lr, void* user_data);
+int cvode_pre(BoutReal t, N_Vector yy, N_Vector yp, N_Vector rvec, N_Vector zvec,
+              BoutReal gamma, BoutReal delta, int lr, void* user_data);
 
-#if SUNDIALS_VERSION_MAJOR < 3
-// Shim for earlier versions
-inline static int cvode_pre_shim(BoutReal t, N_Vector yy, N_Vector yp, N_Vector rvec,
-                                 N_Vector zvec, BoutReal gamma, BoutReal delta, int lr,
-                                 void* user_data, N_Vector UNUSED(tmp)) {
-  return cvode_pre(t, yy, yp, rvec, zvec, gamma, delta, lr, user_data);
-}
-#else
-// Alias for newer versions
-constexpr auto& cvode_pre_shim = cvode_pre;
-#endif
-
-static int cvode_jac(N_Vector v, N_Vector Jv, realtype t, N_Vector y, N_Vector fy,
-                     void* user_data, N_Vector tmp);
-
-#if SUNDIALS_VERSION_MAJOR < 3
-// Shim for earlier versions
-inline int CVSpilsSetJacTimes(void* arkode_mem, std::nullptr_t,
-                              CVSpilsJacTimesVecFn jtimes) {
-  return CVSpilsSetJacTimesVecFn(arkode_mem, jtimes);
-}
-#endif
-
-#if SUNDIALS_VERSION_MAJOR >= 4
-// Shim for newer versions
-constexpr auto CV_FUNCTIONAL = 0;
-constexpr auto CV_NEWTON = 0;
-#endif
-
-#if SUNDIALS_VERSION_MAJOR >= 3
-void* CVodeCreate(int lmm, [[maybe_unused]] int iter,
-                  [[maybe_unused]] SUNContext context) {
-#if SUNDIALS_VERSION_MAJOR == 3
-  return CVodeCreate(lmm, iter);
-#elif SUNDIALS_VERSION_MAJOR == 4 || SUNDIALS_VERSION_MAJOR == 5
-  return CVodeCreate(lmm);
-#else
-  return CVodeCreate(lmm, context);
-#endif
-}
-#endif
+int cvode_jac(N_Vector v, N_Vector Jv, BoutReal t, N_Vector y, N_Vector fy,
+              void* user_data, N_Vector tmp);
+} // namespace
+// NOLINTEND(readability-identifier-length)
 
 CvodeSolver::CvodeSolver(Options* opts)
     : Solver(opts), diagnose((*options)["diagnose"]
@@ -136,7 +83,7 @@ CvodeSolver::CvodeSolver(Options* opts)
                     .doc("Use functional iteration instead of Newton")
                     .withDefault(adams_moulton)),
       max_order((*options)["cvode_max_order"]
-                    .doc("Maximum order of method to use. < 0 means no limit.")
+                    .doc("Maximum order of method to use. <= 0 means default limit.")
                     .withDefault(-1)),
       stablimdet((*options)["cvode_stability_limit_detection"].withDefault(false)),
       abstol((*options)["atol"].doc("Absolute tolerance").withDefault(1.0e-12)),
@@ -148,19 +95,18 @@ CvodeSolver::CvodeSolver(Options* opts)
                   .doc("Maximum number of internal steps between outputs.")
                   .withDefault(500)),
       max_timestep(
-          (*options)["max_timestep"].doc("Maximum time step size").withDefault(-1.0)),
+          (*options)["max_timestep"].doc("Maximum time step size").withDefault(0.0)),
       min_timestep(
-          (*options)["min_timestep"].doc("Minimum time step size").withDefault(-1.0)),
+          (*options)["min_timestep"].doc("Minimum time step size").withDefault(0.0)),
       start_timestep((*options)["start_timestep"]
-                         .doc("Starting time step. < 0 then chosen by CVODE.")
-                         .withDefault(-1.0)),
+                         .doc("Starting time step. = 0 then chosen by CVODE.")
+                         .withDefault(0.0)),
       mxorder((*options)["mxorder"].doc("Maximum order").withDefault(-1)),
       max_nonlinear_iterations(
           (*options)["max_nonlinear_iterations"]
               .doc("Maximum number of nonlinear iterations allowed by CVODE before "
-                   "reducing "
-                   "timestep. CVODE default (used if this option is negative) is 3.")
-              .withDefault(-1)),
+                   "reducing timestep.")
+              .withDefault(3)),
       apply_positivity_constraints(
           (*options)["apply_positivity_constraints"]
               .doc("Use CVODE function CVodeSetConstraints to constrain variables - the "
@@ -184,7 +130,7 @@ CvodeSolver::CvodeSolver(Options* opts)
               .doc("Factor by which the Krylov linear solver’s convergence test constant "
                    "is reduced from the nonlinear solver test constant.")
               .withDefault(0.05)),
-      suncontext(static_cast<void*>(&BoutComm::get())) {
+      suncontext(createSUNContext(BoutComm::get())) {
   has_constraints = false; // This solver doesn't have constraints
   canReset = true;
 
@@ -210,7 +156,7 @@ CvodeSolver::CvodeSolver(Options* opts)
 
 CvodeSolver::~CvodeSolver() {
   if (cvode_initialised) {
-    N_VDestroy_Parallel(uvec);
+    N_VDestroy(uvec);
     CVodeFree(&cvode_mem);
     SUNLinSolFree(sun_solver);
     SUNNonlinSolFree(nonlinear_solver);
@@ -242,12 +188,13 @@ int CvodeSolver::init() {
                     n3Dvars(), n2Dvars(), neq, local_N);
 
   // Allocate memory
-  if ((uvec = N_VNew_Parallel(BoutComm::get(), local_N, neq, suncontext)) == nullptr) {
+  uvec = callWithSUNContext(N_VNew_Parallel, suncontext, BoutComm::get(), local_N, neq);
+  if (uvec == nullptr) {
     throw BoutException("SUNDIALS memory allocation failed\n");
   }
 
   // Put the variables into uvec
-  save_vars(NV_DATA_P(uvec));
+  save_vars(N_VGetArrayPointer(uvec));
 
   if (adams_moulton) {
     // By default use functional iteration for Adams-Moulton
@@ -258,31 +205,29 @@ int CvodeSolver::init() {
   }
 
   const auto lmm = adams_moulton ? CV_ADAMS : CV_BDF;
-  const auto iter = func_iter ? CV_FUNCTIONAL : CV_NEWTON;
 
-  if ((cvode_mem = CVodeCreate(lmm, iter, suncontext)) == nullptr) {
+  cvode_mem = callWithSUNContext(CVodeCreate, suncontext, lmm);
+  if (cvode_mem == nullptr) {
     throw BoutException("CVodeCreate failed\n");
   }
 
   // For callbacks, need pointer to solver object
-  if (CVodeSetUserData(cvode_mem, this) < 0) {
+  if (CVodeSetUserData(cvode_mem, this) != CV_SUCCESS) {
     throw BoutException("CVodeSetUserData failed\n");
   }
 
-  if (CVodeInit(cvode_mem, cvode_rhs, simtime, uvec) < 0) {
+  if (CVodeInit(cvode_mem, cvode_rhs, simtime, uvec) != CV_SUCCESS) {
     throw BoutException("CVodeInit failed\n");
   }
 
   if (max_order > 0) {
-    if (CVodeSetMaxOrd(cvode_mem, max_order) < 0) {
+    if (CVodeSetMaxOrd(cvode_mem, max_order) != CV_SUCCESS) {
       throw BoutException("CVodeSetMaxOrder failed\n");
     }
   }
 
-  if (stablimdet) {
-    if (CVodeSetStabLimDet(cvode_mem, stablimdet) < 0) {
-      throw BoutException("CVodeSetStabLimDet failed\n");
-    }
+  if (CVodeSetStabLimDet(cvode_mem, static_cast<int>(stablimdet)) != CV_SUCCESS) {
+    throw BoutException("CVodeSetStabLimDet failed\n");
   }
 
   if (use_vector_abstol) {
@@ -307,94 +252,97 @@ int CvodeSolver::init() {
                      return Options::root()[f3.name]["atol"].withDefault(abstol);
                    });
 
-    N_Vector abstolvec = N_VNew_Parallel(BoutComm::get(), local_N, neq, suncontext);
+    N_Vector abstolvec = N_VClone(uvec);
     if (abstolvec == nullptr) {
       throw BoutException("SUNDIALS memory allocation (abstol vector) failed\n");
     }
 
-    set_vector_option_values(NV_DATA_P(abstolvec), f2dtols, f3dtols);
+    set_vector_option_values(N_VGetArrayPointer(abstolvec), f2dtols, f3dtols);
 
-    if (CVodeSVtolerances(cvode_mem, reltol, abstolvec) < 0) {
+    if (CVodeSVtolerances(cvode_mem, reltol, abstolvec) != CV_SUCCESS) {
       throw BoutException("CVodeSVtolerances failed\n");
     }
 
-    N_VDestroy_Parallel(abstolvec);
+    N_VDestroy(abstolvec);
   } else {
-    if (CVodeSStolerances(cvode_mem, reltol, abstol) < 0) {
+    if (CVodeSStolerances(cvode_mem, reltol, abstol) != CV_SUCCESS) {
       throw BoutException("CVodeSStolerances failed\n");
     }
   }
 
-  CVodeSetMaxNumSteps(cvode_mem, mxsteps);
-
-  if (max_timestep > 0.0) {
-    CVodeSetMaxStep(cvode_mem, max_timestep);
+  if (CVodeSetMaxNumSteps(cvode_mem, mxsteps) != CV_SUCCESS) {
+    throw BoutException("CVodeSetMaxNumSteps failed\n");
   }
 
-  if (min_timestep > 0.0) {
-    CVodeSetMinStep(cvode_mem, min_timestep);
+  if (CVodeSetMaxStep(cvode_mem, max_timestep) != CV_SUCCESS) {
+    throw BoutException("CVodeSetMaxStep failed\n");
   }
 
-  if (start_timestep > 0.0) {
-    CVodeSetInitStep(cvode_mem, start_timestep);
+  if (CVodeSetMinStep(cvode_mem, min_timestep) != CV_SUCCESS) {
+    throw BoutException("CVodeSetMinStep failed\n");
   }
 
-  if (mxorder > 0) {
-    CVodeSetMaxOrd(cvode_mem, mxorder);
+  if (CVodeSetInitStep(cvode_mem, start_timestep) != CV_SUCCESS) {
+    throw BoutException("CVodeSetInitStep failed\n");
   }
 
-  if (max_nonlinear_iterations > 0) {
-    CVodeSetMaxNonlinIters(cvode_mem, max_nonlinear_iterations);
+  if (CVodeSetMaxNonlinIters(cvode_mem, max_nonlinear_iterations) != CV_SUCCESS) {
+    throw BoutException("CVodeSetMaxNonlinIters failed\n");
   }
 
-#if not(SUNDIALS_VERSION_MAJOR >= 3 and SUNDIALS_VERSION_MINOR >= 2)
-  if (apply_positivity_constraints) {
-    throw BoutException("The apply_positivity_constraints option is only available with "
-                        "SUNDIALS>=3.2.0");
-  }
-#else
   if (apply_positivity_constraints) {
     auto f2d_constraints = create_constraints(f2d);
     auto f3d_constraints = create_constraints(f3d);
 
-    N_Vector constraints_vec = N_VNew_Parallel(BoutComm::get(), local_N, neq, suncontext);
+    N_Vector constraints_vec = N_VClone(uvec);
     if (constraints_vec == nullptr) {
       throw BoutException("SUNDIALS memory allocation (positivity constraints vector) "
                           "failed\n");
     }
 
-    set_vector_option_values(NV_DATA_P(constraints_vec), f2d_constraints,
+    set_vector_option_values(N_VGetArrayPointer(constraints_vec), f2d_constraints,
                              f3d_constraints);
 
-    if (CVodeSetConstraints(cvode_mem, constraints_vec) < 0) {
+    if (CVodeSetConstraints(cvode_mem, constraints_vec) != CV_SUCCESS) {
       throw BoutException("CVodeSetConstraints failed\n");
     }
 
-    N_VDestroy_Parallel(constraints_vec);
+    N_VDestroy(constraints_vec);
   }
-#endif
 
   /// Newton method can include Preconditioners and Jacobian function
-  if (!func_iter) {
+  if (func_iter) {
+    output_info.write("\tUsing Functional iteration\n");
+    nonlinear_solver = callWithSUNContext(SUNNonlinSol_FixedPoint, suncontext, uvec, 0);
+    if (nonlinear_solver == nullptr) {
+      throw BoutException("SUNNonlinSol_FixedPoint failed\n");
+    }
+
+    if (CVodeSetNonlinearSolver(cvode_mem, nonlinear_solver) != CV_SUCCESS) {
+      throw BoutException("CVodeSetNonlinearSolver failed\n");
+    }
+  } else {
     output_info.write("\tUsing Newton iteration\n");
     TRACE("Setting preconditioner");
-    if (use_precon) {
-      const int prectype = rightprec ? SUN_PREC_RIGHT : SUN_PREC_LEFT;
 
-#if SUNDIALS_VERSION_MAJOR >= 3
-      if ((sun_solver = SUNLinSol_SPGMR(uvec, prectype, maxl, suncontext)) == nullptr) {
-        throw BoutException("Creating SUNDIALS linear solver failed\n");
-      }
-      if (CVSpilsSetLinearSolver(cvode_mem, sun_solver) != CV_SUCCESS) {
-        throw BoutException("CVSpilsSetLinearSolver failed\n");
-      }
-#else
-      if (CVSpgmr(cvode_mem, prectype, maxl) != CVSPILS_SUCCESS) {
-        throw BoutException("CVSpgmr failed\n");
-      }
-#endif
+    const auto prectype =
+        use_precon ? (rightprec ? SUN_PREC_RIGHT : SUN_PREC_LEFT) : SUN_PREC_NONE;
+    sun_solver = callWithSUNContext(SUNLinSol_SPGMR, suncontext, uvec, prectype, maxl);
+    if (sun_solver == nullptr) {
+      throw BoutException("Creating SUNDIALS linear solver failed\n");
+    }
+    if (CVodeSetLinearSolver(cvode_mem, sun_solver, nullptr) != CVLS_SUCCESS) {
+      throw BoutException("CVodeSetLinearSolver failed\n");
+    }
 
-      if (!hasPreconditioner()) {
+    if (use_precon) {
+      if (hasPreconditioner()) {
+        output_info.write("\tUsing user-supplied preconditioner\n");
+
+        if (CVodeSetPreconditioner(cvode_mem, nullptr, cvode_pre) != CVLS_SUCCESS) {
+          throw BoutException("CVodeSetPreconditioner failed\n");
+        }
+      } else {
         output_info.write("\tUsing BBD preconditioner\n");
 
         /// Get options
@@ -415,62 +363,36 @@ int CvodeSolver::init() {
         const auto mukeep = (*options)["mukeep"].withDefault(n3Dvars() + n2Dvars());
         const auto mlkeep = (*options)["mlkeep"].withDefault(n3Dvars() + n2Dvars());
 
-        if (CVBBDPrecInit(cvode_mem, local_N, mudq, mldq, mukeep, mlkeep, ZERO,
-                          cvode_bbd_rhs, nullptr)) {
+        if (CVBBDPrecInit(cvode_mem, local_N, mudq, mldq, mukeep, mlkeep, 0.0,
+                          cvode_bbd_rhs, nullptr)
+            != CVLS_SUCCESS) {
           throw BoutException("CVBBDPrecInit failed\n");
         }
-
-      } else {
-        output_info.write("\tUsing user-supplied preconditioner\n");
-
-        if (CVSpilsSetPreconditioner(cvode_mem, nullptr, cvode_pre_shim)) {
-          throw BoutException("CVSpilsSetPreconditioner failed\n");
-        }
       }
     } else {
       output_info.write("\tNo preconditioning\n");
-
-#if SUNDIALS_VERSION_MAJOR >= 3
-      if ((sun_solver = SUNLinSol_SPGMR(uvec, SUN_PREC_NONE, maxl, suncontext))
-          == nullptr) {
-        throw BoutException("Creating SUNDIALS linear solver failed\n");
-      }
-      if (CVSpilsSetLinearSolver(cvode_mem, sun_solver) != CV_SUCCESS) {
-        throw BoutException("CVSpilsSetLinearSolver failed\n");
-      }
-#else
-      if (CVSpgmr(cvode_mem, SUN_PREC_NONE, maxl) != CVSPILS_SUCCESS) {
-        throw BoutException("CVSpgmr failed\n");
-      }
-#endif
     }
 
     /// Set Jacobian-vector multiplication function
     if (use_jacobian and hasJacobian()) {
       output_info.write("\tUsing user-supplied Jacobian function\n");
 
-      if (CVSpilsSetJacTimes(cvode_mem, nullptr, cvode_jac) != CV_SUCCESS) {
-        throw BoutException("CVSpilsSetJacTimesVecFn failed\n");
+      if (CVodeSetJacTimes(cvode_mem, nullptr, cvode_jac) != CVLS_SUCCESS) {
+        throw BoutException("CVodeSetJacTimes failed\n");
       }
     } else {
       output_info.write("\tUsing difference quotient approximation for Jacobian\n");
     }
-  } else {
-    output_info.write("\tUsing Functional iteration\n");
-#if SUNDIALS_VERSION_MAJOR >= 4
-    if ((nonlinear_solver = SUNNonlinSol_FixedPoint(uvec, 0, suncontext)) == nullptr) {
-      throw BoutException("SUNNonlinSol_FixedPoint failed\n");
-    }
-
-    if (CVodeSetNonlinearSolver(cvode_mem, nonlinear_solver)) {
-      throw BoutException("CVodeSetNonlinearSolver failed\n");
-    }
-#endif
   }
 
   // Set internal tolerance factors
-  CVodeSetNonlinConvCoef(cvode_mem, cvode_nonlinear_convergence_coef);
-  CVodeSetEpsLin(cvode_mem, cvode_linear_convergence_coef);
+  if (CVodeSetNonlinConvCoef(cvode_mem, cvode_nonlinear_convergence_coef) != CV_SUCCESS) {
+    throw BoutException("CVodeSetNonlinConvCoef failed\n");
+  }
+
+  if (CVodeSetEpsLin(cvode_mem, cvode_linear_convergence_coef) != CV_SUCCESS) {
+    throw BoutException("CVodeSetEpsLin failed\n");
+  }
 
   cvode_initialised = true;
 
@@ -544,9 +466,9 @@ int CvodeSolver::run() {
     nfevals = int(temp_long_int);
     CVodeGetNumNonlinSolvIters(cvode_mem, &temp_long_int);
     nniters = int(temp_long_int);
-    CVSpilsGetNumPrecSolves(cvode_mem, &temp_long_int);
+    CVodeGetNumPrecSolves(cvode_mem, &temp_long_int);
     npevals = int(temp_long_int);
-    CVSpilsGetNumLinIters(cvode_mem, &temp_long_int);
+    CVodeGetNumLinIters(cvode_mem, &temp_long_int);
     nliters = int(temp_long_int);
 
     // Last step size
@@ -634,7 +556,7 @@ BoutReal CvodeSolver::run(BoutReal tout) {
   }
 
   // Copy variables
-  load_vars(NV_DATA_P(uvec));
+  load_vars(N_VGetArrayPointer(uvec));
 
   // Call rhs function to get extra variables at this time
   run_rhs(simtime);
@@ -678,11 +600,11 @@ void CvodeSolver::pre(BoutReal t, BoutReal gamma, BoutReal delta, BoutReal* udat
 
   BoutReal tstart = bout::globals::mpi->MPI_Wtime();
 
-  int N = NV_LOCLENGTH_P(uvec);
+  const auto length = N_VGetLocalLength_Parallel(uvec);
 
   if (!hasPreconditioner()) {
     // Identity (but should never happen)
-    for (int i = 0; i < N; i++) {
+    for (int i = 0; i < length; i++) {
       zvec[i] = rvec[i];
     }
     return;
@@ -731,10 +653,12 @@ void CvodeSolver::jac(BoutReal t, BoutReal* ydata, BoutReal* vdata, BoutReal* Jv
  * CVODE RHS functions
  **************************************************************************/
 
-static int cvode_rhs(BoutReal t, N_Vector u, N_Vector du, void* user_data) {
+// NOLINTBEGIN(readability-identifier-length)
+namespace {
+int cvode_rhs(BoutReal t, N_Vector u, N_Vector du, void* user_data) {
 
-  BoutReal* udata = NV_DATA_P(u);
-  BoutReal* dudata = NV_DATA_P(du);
+  BoutReal* udata = N_VGetArrayPointer(u);
+  BoutReal* dudata = N_VGetArrayPointer(du);
 
   auto* s = static_cast<CvodeSolver*>(user_data);
 
@@ -748,18 +672,17 @@ static int cvode_rhs(BoutReal t, N_Vector u, N_Vector du, void* user_data) {
 }
 
 /// RHS function for BBD preconditioner
-static int cvode_bbd_rhs(CVODEINT UNUSED(Nlocal), BoutReal t, N_Vector u, N_Vector du,
-                         void* user_data) {
+int cvode_bbd_rhs(sunindextype UNUSED(Nlocal), BoutReal t, N_Vector u, N_Vector du,
+                  void* user_data) {
   return cvode_rhs(t, u, du, user_data);
 }
 
 /// Preconditioner function
-static int cvode_pre(BoutReal t, N_Vector yy, N_Vector UNUSED(yp), N_Vector rvec,
-                     N_Vector zvec, BoutReal gamma, BoutReal delta, int UNUSED(lr),
-                     void* user_data) {
-  BoutReal* udata = NV_DATA_P(yy);
-  BoutReal* rdata = NV_DATA_P(rvec);
-  BoutReal* zdata = NV_DATA_P(zvec);
+int cvode_pre(BoutReal t, N_Vector yy, N_Vector UNUSED(yp), N_Vector rvec, N_Vector zvec,
+              BoutReal gamma, BoutReal delta, int UNUSED(lr), void* user_data) {
+  BoutReal* udata = N_VGetArrayPointer(yy);
+  BoutReal* rdata = N_VGetArrayPointer(rvec);
+  BoutReal* zdata = N_VGetArrayPointer(zvec);
 
   auto* s = static_cast<CvodeSolver*>(user_data);
 
@@ -770,11 +693,11 @@ static int cvode_pre(BoutReal t, N_Vector yy, N_Vector UNUSED(yp), N_Vector rvec
 }
 
 /// Jacobian-vector multiplication function
-static int cvode_jac(N_Vector v, N_Vector Jv, realtype t, N_Vector y, N_Vector UNUSED(fy),
-                     void* user_data, N_Vector UNUSED(tmp)) {
-  BoutReal* ydata = NV_DATA_P(y);   ///< System state
-  BoutReal* vdata = NV_DATA_P(v);   ///< Input vector
-  BoutReal* Jvdata = NV_DATA_P(Jv); ///< Jacobian*vector output
+int cvode_jac(N_Vector v, N_Vector Jv, BoutReal t, N_Vector y, N_Vector UNUSED(fy),
+              void* user_data, N_Vector UNUSED(tmp)) {
+  BoutReal* ydata = N_VGetArrayPointer(y);   ///< System state
+  BoutReal* vdata = N_VGetArrayPointer(v);   ///< Input vector
+  BoutReal* Jvdata = N_VGetArrayPointer(Jv); ///< Jacobian*vector output
 
   auto* s = static_cast<CvodeSolver*>(user_data);
 
@@ -782,6 +705,8 @@ static int cvode_jac(N_Vector v, N_Vector Jv, realtype t, N_Vector y, N_Vector U
 
   return 0;
 }
+} // namespace
+// NOLINTEND(readability-identifier-length)
 
 /**************************************************************************
  * CVODE vector option functions
@@ -829,9 +754,9 @@ void CvodeSolver::loop_vector_option_values_op(Ind2D UNUSED(i2d), BoutReal* opti
 
 void CvodeSolver::resetInternalFields() {
   TRACE("CvodeSolver::resetInternalFields");
-  save_vars(NV_DATA_P(uvec));
+  save_vars(N_VGetArrayPointer(uvec));
 
-  if (CVodeReInit(cvode_mem, simtime, uvec) < 0) {
+  if (CVodeReInit(cvode_mem, simtime, uvec) != CV_SUCCESS) {
     throw BoutException("CVodeReInit failed\n");
   }
 }
diff --git a/src/solver/impls/cvode/cvode.hxx b/src/solver/impls/cvode/cvode.hxx
index fa8b972bca..89c3a613a8 100644
--- a/src/solver/impls/cvode/cvode.hxx
+++ b/src/solver/impls/cvode/cvode.hxx
@@ -25,8 +25,8 @@
  *
  **************************************************************************/
 
-#ifndef __SUNDIAL_SOLVER_H__
-#define __SUNDIAL_SOLVER_H__
+#ifndef BOUT_SUNDIAL_SOLVER_H
+#define BOUT_SUNDIAL_SOLVER_H
 
 #include "bout/build_config.hxx"
 #include "bout/solver.hxx"
@@ -157,4 +157,4 @@ private:
 };
 
 #endif // BOUT_HAS_CVODE
-#endif // __SUNDIAL_SOLVER_H__
+#endif // BOUT_SUNDIAL_SOLVER_H
diff --git a/src/solver/impls/euler/euler.cxx b/src/solver/impls/euler/euler.cxx
index 7bffcdeb15..3976f4402c 100644
--- a/src/solver/impls/euler/euler.cxx
+++ b/src/solver/impls/euler/euler.cxx
@@ -144,7 +144,7 @@ void EulerSolver::take_step(BoutReal curtime, BoutReal dt, Array<BoutReal>& star
   run_rhs(curtime);
   save_derivs(std::begin(result));
 
-  BOUT_OMP(parallel for)
+  BOUT_OMP_PERF(parallel for)
   for (int i = 0; i < nlocal; i++) {
     result[i] = start[i] + dt * result[i];
   }
diff --git a/src/solver/impls/euler/euler.hxx b/src/solver/impls/euler/euler.hxx
index bfa0be9bb3..0ee81a3d33 100644
--- a/src/solver/impls/euler/euler.hxx
+++ b/src/solver/impls/euler/euler.hxx
@@ -27,8 +27,8 @@
 
 class EulerSolver;
 
-#ifndef __EULER_SOLVER_H__
-#define __EULER_SOLVER_H__
+#ifndef BOUT_EULER_SOLVER_H
+#define BOUT_EULER_SOLVER_H
 
 #include "mpi.h"
 
@@ -66,4 +66,4 @@ private:
                  Array<BoutReal>& result);
 };
 
-#endif // __KARNIADAKIS_SOLVER_H__
+#endif // BOUT_EULER_SOLVER_H
diff --git a/src/solver/impls/ida/ida.cxx b/src/solver/impls/ida/ida.cxx
index 189a103bbe..cfc978f755 100644
--- a/src/solver/impls/ida/ida.cxx
+++ b/src/solver/impls/ida/ida.cxx
@@ -40,53 +40,23 @@
 #include "bout/unused.hxx"
 
 #include <ida/ida.h>
-
-#if SUNDIALS_VERSION_MAJOR >= 3
-#include <ida/ida_spils.h>
-#include <sunlinsol/sunlinsol_spgmr.h>
-#else
-#include <ida/ida_spgmr.h>
-#endif
-
 #include <ida/ida_bbdpre.h>
 #include <nvector/nvector_parallel.h>
 #include <sundials/sundials_types.h>
+#include <sunlinsol/sunlinsol_spgmr.h>
 
 #include <numeric>
 
-#define ZERO RCONST(0.)
-#define ONE RCONST(1.0)
-
-#ifndef IDAINT
-#if SUNDIALS_VERSION_MAJOR < 3
-using IDAINT = bout::utils::function_traits<IDABBDLocalFn>::arg_t<0>;
-#else
-using IDAINT = sunindextype;
-#endif
-#endif
-
-static int idares(BoutReal t, N_Vector u, N_Vector du, N_Vector rr, void* user_data);
-static int ida_bbd_res(IDAINT Nlocal, BoutReal t, N_Vector u, N_Vector du, N_Vector rr,
-                       void* user_data);
+// NOLINTBEGIN(readability-identifier-length)
+namespace {
+int idares(BoutReal t, N_Vector u, N_Vector du, N_Vector rr, void* user_data);
+int ida_bbd_res(sunindextype Nlocal, BoutReal t, N_Vector u, N_Vector du, N_Vector rr,
+                void* user_data);
 
-static int ida_pre(BoutReal t, N_Vector yy, N_Vector yp, N_Vector rr, N_Vector rvec,
-                   N_Vector zvec, BoutReal cj, BoutReal delta, void* user_data);
-
-#if SUNDIALS_VERSION_MAJOR < 3
-// Shim for earlier versions
-inline static int ida_pre_shim(BoutReal t, N_Vector yy, N_Vector yp, N_Vector rr,
-                               N_Vector rvec, N_Vector zvec, BoutReal cj, BoutReal delta,
-                               void* user_data, N_Vector UNUSED(tmp)) {
-  return ida_pre(t, yy, yp, rr, rvec, zvec, cj, delta, user_data);
-}
-#else
-// Alias for newer versions
-constexpr auto& ida_pre_shim = ida_pre;
-#endif
-
-#if SUNDIALS_VERSION_MAJOR < 6
-void* IDACreate([[maybe_unused]] SUNContext) { return IDACreate(); }
-#endif
+int ida_pre(BoutReal t, N_Vector yy, N_Vector yp, N_Vector rr, N_Vector rvec,
+            N_Vector zvec, BoutReal cj, BoutReal delta, void* user_data);
+} // namespace
+// NOLINTEND(readability-identifier-length)
 
 IdaSolver::IdaSolver(Options* opts)
     : Solver(opts),
@@ -101,15 +71,15 @@ IdaSolver::IdaSolver(Options* opts)
       correct_start((*options)["correct_start"]
                         .doc("Correct the initial values")
                         .withDefault(true)),
-      suncontext(static_cast<void*>(&BoutComm::get())) {
+      suncontext(createSUNContext(BoutComm::get())) {
   has_constraints = true; // This solver has constraints
 }
 
 IdaSolver::~IdaSolver() {
   if (initialised) {
-    N_VDestroy_Parallel(uvec);
-    N_VDestroy_Parallel(duvec);
-    N_VDestroy_Parallel(id);
+    N_VDestroy(uvec);
+    N_VDestroy(duvec);
+    N_VDestroy(id);
     IDAFree(&idamem);
     SUNLinSolFree(sun_solver);
   }
@@ -144,69 +114,75 @@ int IdaSolver::init() {
                neq, local_N);
 
   // Allocate memory
-  if ((uvec = N_VNew_Parallel(BoutComm::get(), local_N, neq, suncontext)) == nullptr) {
+  uvec = callWithSUNContext(N_VNew_Parallel, suncontext, BoutComm::get(), local_N, neq);
+  if (uvec == nullptr) {
     throw BoutException("SUNDIALS memory allocation failed\n");
   }
-  if ((duvec = N_VNew_Parallel(BoutComm::get(), local_N, neq, suncontext)) == nullptr) {
+  duvec = N_VClone(uvec);
+  if (duvec == nullptr) {
     throw BoutException("SUNDIALS memory allocation failed\n");
   }
-  if ((id = N_VNew_Parallel(BoutComm::get(), local_N, neq, suncontext)) == nullptr) {
+  id = N_VClone(uvec);
+  if (id == nullptr) {
     throw BoutException("SUNDIALS memory allocation failed\n");
   }
 
   // Put the variables into uvec
-  save_vars(NV_DATA_P(uvec));
+  save_vars(N_VGetArrayPointer(uvec));
 
   // Get the starting time derivative
   run_rhs(simtime);
 
   // Put the time-derivatives into duvec
-  save_derivs(NV_DATA_P(duvec));
+  save_derivs(N_VGetArrayPointer(duvec));
 
   // Set the equation type in id(Differential or Algebraic. This is optional)
-  set_id(NV_DATA_P(id));
+  set_id(N_VGetArrayPointer(id));
 
   // Call IDACreate to initialise
-  if ((idamem = IDACreate(suncontext)) == nullptr) {
+  idamem = callWithSUNContext(IDACreate, suncontext);
+  if (idamem == nullptr) {
     throw BoutException("IDACreate failed\n");
   }
 
   // For callbacks, need pointer to solver object
-  if (IDASetUserData(idamem, this) < 0) {
+  if (IDASetUserData(idamem, this) != IDA_SUCCESS) {
     throw BoutException("IDASetUserData failed\n");
   }
 
-  if (IDASetId(idamem, id) < 0) {
+  if (IDASetId(idamem, id) != IDA_SUCCESS) {
     throw BoutException("IDASetID failed\n");
   }
 
-  if (IDAInit(idamem, idares, simtime, uvec, duvec) < 0) {
+  if (IDAInit(idamem, idares, simtime, uvec, duvec) != IDA_SUCCESS) {
     throw BoutException("IDAInit failed\n");
   }
 
-  if (IDASStolerances(idamem, reltol, abstol) < 0) {
+  if (IDASStolerances(idamem, reltol, abstol) != IDA_SUCCESS) {
     throw BoutException("IDASStolerances failed\n");
   }
 
-  IDASetMaxNumSteps(idamem, mxsteps);
+  if (IDASetMaxNumSteps(idamem, mxsteps) != IDA_SUCCESS) {
+    throw BoutException("IDASetMaxNumSteps failed\n");
+  }
 
   // Call IDASpgmr to specify the IDA linear solver IDASPGMR
   const auto maxl = (*options)["maxl"].withDefault(6 * n3d);
-#if SUNDIALS_VERSION_MAJOR >= 3
-  if ((sun_solver = SUNLinSol_SPGMR(uvec, SUN_PREC_NONE, maxl, suncontext)) == nullptr) {
+  sun_solver = callWithSUNContext(SUNLinSol_SPGMR, suncontext, uvec, SUN_PREC_NONE, maxl);
+  if (sun_solver == nullptr) {
     throw BoutException("Creating SUNDIALS linear solver failed\n");
   }
-  if (IDASpilsSetLinearSolver(idamem, sun_solver) != IDA_SUCCESS) {
-    throw BoutException("IDASpilsSetLinearSolver failed\n");
+  if (IDASetLinearSolver(idamem, sun_solver, nullptr) != IDALS_SUCCESS) {
+    throw BoutException("IDASetLinearSolver failed\n");
   }
-#else
-  if (IDASpgmr(idamem, maxl)) {
-    throw BoutException("IDASpgmr failed\n");
-  }
-#endif
 
   if (use_precon) {
-    if (!hasPreconditioner()) {
+    if (hasPreconditioner()) {
+      output.write("\tUsing user-supplied preconditioner\n");
+      if (IDASetPreconditioner(idamem, nullptr, ida_pre) != IDALS_SUCCESS) {
+        throw BoutException("IDASetPreconditioner failed\n");
+      }
+    } else {
       output.write("\tUsing BBD preconditioner\n");
       /// Get options
       // Compute band_width_default from actually added fields, to allow for multiple Mesh
@@ -225,21 +201,17 @@ int IdaSolver::init() {
       const auto mldq = (*options)["mldq"].withDefault(band_width_default);
       const auto mukeep = (*options)["mukeep"].withDefault(n3d);
       const auto mlkeep = (*options)["mlkeep"].withDefault(n3d);
-      if (IDABBDPrecInit(idamem, local_N, mudq, mldq, mukeep, mlkeep, ZERO, ida_bbd_res,
-                         nullptr)) {
+      if (IDABBDPrecInit(idamem, local_N, mudq, mldq, mukeep, mlkeep, 0.0, ida_bbd_res,
+                         nullptr)
+          != IDALS_SUCCESS) {
         throw BoutException("IDABBDPrecInit failed\n");
       }
-    } else {
-      output.write("\tUsing user-supplied preconditioner\n");
-      if (IDASpilsSetPreconditioner(idamem, nullptr, ida_pre_shim)) {
-        throw BoutException("IDASpilsSetPreconditioner failed\n");
-      }
     }
   }
 
   // Call IDACalcIC (with default options) to correct the initial values
   if (correct_start) {
-    if (IDACalcIC(idamem, IDA_YA_YDP_INIT, 1e-6)) {
+    if (IDACalcIC(idamem, IDA_YA_YDP_INIT, 1e-6) != IDA_SUCCESS) {
       throw BoutException("IDACalcIC failed\n");
     }
   }
@@ -291,7 +263,7 @@ BoutReal IdaSolver::run(BoutReal tout) {
   const int flag = IDASolve(idamem, tout, &simtime, uvec, duvec, IDA_NORMAL);
 
   // Copy variables
-  load_vars(NV_DATA_P(uvec));
+  load_vars(N_VGetArrayPointer(uvec));
 
   // Call rhs function to get extra variables at this time
   run_rhs(simtime);
@@ -322,9 +294,9 @@ void IdaSolver::res(BoutReal t, BoutReal* udata, BoutReal* dudata, BoutReal* rda
   save_derivs(rdata);
 
   // If a differential equation, subtract dudata
-  const int N = NV_LOCLENGTH_P(id);
-  const BoutReal* idd = NV_DATA_P(id);
-  for (int i = 0; i < N; i++) {
+  const auto length = N_VGetLocalLength_Parallel(id);
+  const BoutReal* idd = N_VGetArrayPointer(id);
+  for (int i = 0; i < length; i++) {
     if (idd[i] > 0.5) { // 1 -> differential, 0 -> algebraic
       rdata[i] -= dudata[i];
     }
@@ -343,8 +315,8 @@ void IdaSolver::pre(BoutReal t, BoutReal cj, BoutReal delta, BoutReal* udata,
 
   if (!hasPreconditioner()) {
     // Identity (but should never happen)
-    const int N = NV_LOCLENGTH_P(id);
-    std::copy(rvec, rvec + N, zvec);
+    const auto length = N_VGetLocalLength_Parallel(id);
+    std::copy(rvec, rvec + length, zvec);
     return;
   }
 
@@ -367,10 +339,12 @@ void IdaSolver::pre(BoutReal t, BoutReal cj, BoutReal delta, BoutReal* udata,
  * IDA res function
  **************************************************************************/
 
-static int idares(BoutReal t, N_Vector u, N_Vector du, N_Vector rr, void* user_data) {
-  BoutReal* udata = NV_DATA_P(u);
-  BoutReal* dudata = NV_DATA_P(du);
-  BoutReal* rdata = NV_DATA_P(rr);
+// NOLINTBEGIN(readability-identifier-length)
+namespace {
+int idares(BoutReal t, N_Vector u, N_Vector du, N_Vector rr, void* user_data) {
+  BoutReal* udata = N_VGetArrayPointer(u);
+  BoutReal* dudata = N_VGetArrayPointer(du);
+  BoutReal* rdata = N_VGetArrayPointer(rr);
 
   auto* s = static_cast<IdaSolver*>(user_data);
 
@@ -381,18 +355,17 @@ static int idares(BoutReal t, N_Vector u, N_Vector du, N_Vector rr, void* user_d
 }
 
 /// Residual function for BBD preconditioner
-static int ida_bbd_res(IDAINT UNUSED(Nlocal), BoutReal t, N_Vector u, N_Vector du,
-                       N_Vector rr, void* user_data) {
+int ida_bbd_res(sunindextype UNUSED(Nlocal), BoutReal t, N_Vector u, N_Vector du,
+                N_Vector rr, void* user_data) {
   return idares(t, u, du, rr, user_data);
 }
 
 // Preconditioner function
-static int ida_pre(BoutReal t, N_Vector yy, N_Vector UNUSED(yp), N_Vector UNUSED(rr),
-                   N_Vector rvec, N_Vector zvec, BoutReal cj, BoutReal delta,
-                   void* user_data) {
-  BoutReal* udata = NV_DATA_P(yy);
-  BoutReal* rdata = NV_DATA_P(rvec);
-  BoutReal* zdata = NV_DATA_P(zvec);
+int ida_pre(BoutReal t, N_Vector yy, N_Vector UNUSED(yp), N_Vector UNUSED(rr),
+            N_Vector rvec, N_Vector zvec, BoutReal cj, BoutReal delta, void* user_data) {
+  BoutReal* udata = N_VGetArrayPointer(yy);
+  BoutReal* rdata = N_VGetArrayPointer(rvec);
+  BoutReal* zdata = N_VGetArrayPointer(zvec);
 
   auto* s = static_cast<IdaSolver*>(user_data);
 
@@ -401,5 +374,7 @@ static int ida_pre(BoutReal t, N_Vector yy, N_Vector UNUSED(yp), N_Vector UNUSED
 
   return 0;
 }
+} // namespace
+// NOLINTEND(readability-identifier-length)
 
 #endif
diff --git a/src/solver/impls/ida/ida.hxx b/src/solver/impls/ida/ida.hxx
index 83ee4d83e6..b00054d157 100644
--- a/src/solver/impls/ida/ida.hxx
+++ b/src/solver/impls/ida/ida.hxx
@@ -27,8 +27,8 @@
  *
  **************************************************************************/
 
-#ifndef __IDA_SOLVER_H__
-#define __IDA_SOLVER_H__
+#ifndef BOUT_IDA_SOLVER_H
+#define BOUT_IDA_SOLVER_H
 
 #include "bout/build_config.hxx"
 #include "bout/solver.hxx"
@@ -97,4 +97,4 @@ private:
 };
 
 #endif // BOUT_HAS_IDA
-#endif // __IDA_SOLVER_H__
+#endif // BOUT_IDA_SOLVER_H
diff --git a/src/solver/impls/imex-bdf2/imex-bdf2.hxx b/src/solver/impls/imex-bdf2/imex-bdf2.hxx
index 4126c48265..f0e1b2faee 100644
--- a/src/solver/impls/imex-bdf2/imex-bdf2.hxx
+++ b/src/solver/impls/imex-bdf2/imex-bdf2.hxx
@@ -32,8 +32,8 @@
  *
  **************************************************************************/
 
-#ifndef __IMEXBDF2_SOLVER_H__
-#define __IMEXBDF2_SOLVER_H__
+#ifndef BOUT_IMEXBDF2_SOLVER_H
+#define BOUT_IMEXBDF2_SOLVER_H
 
 #include "bout/build_config.hxx"
 #include "bout/solver.hxx"
@@ -221,6 +221,6 @@ private:
   void saveDerivs(BoutReal* u);
 };
 
-#endif // __IMEXBDF2_SOLVER_H__
+#endif // BOUT_IMEXBDF2_SOLVER_H
 
 #endif // BOUT_HAS_PETSC
diff --git a/src/solver/impls/petsc/petsc.hxx b/src/solver/impls/petsc/petsc.hxx
index 349f40bad8..7239126abb 100644
--- a/src/solver/impls/petsc/petsc.hxx
+++ b/src/solver/impls/petsc/petsc.hxx
@@ -24,8 +24,8 @@
  *
  **************************************************************************/
 
-#ifndef __PETSC_SOLVER_H__
-#define __PETSC_SOLVER_H__
+#ifndef BOUT_PETSC_SOLVER_H
+#define BOUT_PETSC_SOLVER_H
 
 #include "bout/build_config.hxx"
 #include "bout/solver.hxx"
@@ -149,4 +149,4 @@ private:
 
 #endif // BOUT_HAS_PETSC
 
-#endif // __PETSC_SOLVER_H__
+#endif // BOUT_PETSC_SOLVER_H
diff --git a/src/solver/impls/power/power.hxx b/src/solver/impls/power/power.hxx
index 757befeec5..6f56c20f43 100644
--- a/src/solver/impls/power/power.hxx
+++ b/src/solver/impls/power/power.hxx
@@ -26,8 +26,8 @@
 
 class PowerSolver;
 
-#ifndef __POWER_SOLVER_H__
-#define __POWER_SOLVER_H__
+#ifndef BOUT_POWER_SOLVER_H
+#define BOUT_POWER_SOLVER_H
 
 #include <bout/bout_types.hxx>
 #include <bout/solver.hxx>
@@ -60,4 +60,4 @@ private:
   void divide(Array<BoutReal>& in, BoutReal value);
 };
 
-#endif // __KARNIADAKIS_SOLVER_H__
+#endif // BOUT_POWER_SOLVER_H
diff --git a/src/solver/impls/pvode/pvode.hxx b/src/solver/impls/pvode/pvode.hxx
index 2ff02c22bf..d29135d02e 100644
--- a/src/solver/impls/pvode/pvode.hxx
+++ b/src/solver/impls/pvode/pvode.hxx
@@ -30,8 +30,8 @@
 
 class PvodeSolver;
 
-#ifndef __PVODE_SOLVER_H__
-#define __PVODE_SOLVER_H__
+#ifndef BOUT_PVODE_SOLVER_H
+#define BOUT_PVODE_SOLVER_H
 
 #include <bout/bout_types.hxx>
 #include <bout/solver.hxx>
@@ -81,6 +81,6 @@ private:
   bool pvode_initialised = false;
 };
 
-#endif // __PVODE_SOLVER_H__
+#endif // BOUT_PVODE_SOLVER_H
 
 #endif
diff --git a/src/solver/impls/rk3-ssp/rk3-ssp.cxx b/src/solver/impls/rk3-ssp/rk3-ssp.cxx
index 27979bc435..e13d996c00 100644
--- a/src/solver/impls/rk3-ssp/rk3-ssp.cxx
+++ b/src/solver/impls/rk3-ssp/rk3-ssp.cxx
@@ -108,7 +108,7 @@ void RK3SSP::take_step(BoutReal curtime, BoutReal dt, Array<BoutReal>& start,
   run_rhs(curtime);
   save_derivs(std::begin(L));
 
-  BOUT_OMP(parallel for)
+  BOUT_OMP_PERF(parallel for)
   for (int i = 0; i < nlocal; i++) {
     u1[i] = start[i] + dt * L[i];
   }
@@ -117,7 +117,7 @@ void RK3SSP::take_step(BoutReal curtime, BoutReal dt, Array<BoutReal>& start,
   run_rhs(curtime + dt);
   save_derivs(std::begin(L));
 
-  BOUT_OMP(parallel for )
+  BOUT_OMP_PERF(parallel for )
   for (int i = 0; i < nlocal; i++) {
     u2[i] = 0.75 * start[i] + 0.25 * u1[i] + 0.25 * dt * L[i];
   }
@@ -126,7 +126,7 @@ void RK3SSP::take_step(BoutReal curtime, BoutReal dt, Array<BoutReal>& start,
   run_rhs(curtime + 0.5 * dt);
   save_derivs(std::begin(L));
 
-  BOUT_OMP(parallel for)
+  BOUT_OMP_PERF(parallel for)
   for (int i = 0; i < nlocal; i++) {
     result[i] = (1. / 3) * start[i] + (2. / 3.) * (u2[i] + dt * L[i]);
   }
diff --git a/src/solver/impls/rk3-ssp/rk3-ssp.hxx b/src/solver/impls/rk3-ssp/rk3-ssp.hxx
index 4080b17bb5..3682d5cbde 100644
--- a/src/solver/impls/rk3-ssp/rk3-ssp.hxx
+++ b/src/solver/impls/rk3-ssp/rk3-ssp.hxx
@@ -33,8 +33,8 @@
 
 class RK3SSP;
 
-#ifndef __RK3SSP_SOLVER_H__
-#define __RK3SSP_SOLVER_H__
+#ifndef BOUT_RK3SSP_SOLVER_H
+#define BOUT_RK3SSP_SOLVER_H
 
 #include "mpi.h"
 
@@ -72,4 +72,4 @@ private:
   Array<BoutReal> u1, u2, u3, L; //< Time-stepping arrays
 };
 
-#endif // __RK4_SOLVER_H__
+#endif // BOUT_RK3SSP_SOLVER_H
diff --git a/src/solver/impls/rk4/rk4.cxx b/src/solver/impls/rk4/rk4.cxx
index 47bef38f9c..0e7a942a45 100644
--- a/src/solver/impls/rk4/rk4.cxx
+++ b/src/solver/impls/rk4/rk4.cxx
@@ -105,7 +105,7 @@ int RK4Solver::run() {
 
           // Check accuracy
           BoutReal local_err = 0.;
-          BOUT_OMP(parallel for reduction(+: local_err)   )
+          BOUT_OMP_PERF(parallel for reduction(+: local_err)   )
           for (int i = 0; i < nlocal; i++) {
             local_err += fabs(f2[i] - f1[i]) / (fabs(f1[i]) + fabs(f2[i]) + atol);
           }
@@ -182,7 +182,7 @@ void RK4Solver::take_step(BoutReal curtime, BoutReal dt, Array<BoutReal>& start,
   run_rhs(curtime);
   save_derivs(std::begin(k1));
 
-  BOUT_OMP(parallel for)
+  BOUT_OMP_PERF(parallel for)
   for (int i = 0; i < nlocal; i++) {
     k5[i] = start[i] + 0.5 * dt * k1[i];
   }
@@ -191,7 +191,7 @@ void RK4Solver::take_step(BoutReal curtime, BoutReal dt, Array<BoutReal>& start,
   run_rhs(curtime + 0.5 * dt);
   save_derivs(std::begin(k2));
 
-  BOUT_OMP(parallel for )
+  BOUT_OMP_PERF(parallel for )
   for (int i = 0; i < nlocal; i++) {
     k5[i] = start[i] + 0.5 * dt * k2[i];
   }
@@ -200,7 +200,7 @@ void RK4Solver::take_step(BoutReal curtime, BoutReal dt, Array<BoutReal>& start,
   run_rhs(curtime + 0.5 * dt);
   save_derivs(std::begin(k3));
 
-  BOUT_OMP(parallel for)
+  BOUT_OMP_PERF(parallel for)
   for (int i = 0; i < nlocal; i++) {
     k5[i] = start[i] + dt * k3[i];
   }
@@ -209,7 +209,7 @@ void RK4Solver::take_step(BoutReal curtime, BoutReal dt, Array<BoutReal>& start,
   run_rhs(curtime + dt);
   save_derivs(std::begin(k4));
 
-  BOUT_OMP(parallel for)
+  BOUT_OMP_PERF(parallel for)
   for (int i = 0; i < nlocal; i++) {
     result[i] = start[i] + (1. / 6.) * dt * (k1[i] + 2. * k2[i] + 2. * k3[i] + k4[i]);
   }
diff --git a/src/solver/impls/rk4/rk4.hxx b/src/solver/impls/rk4/rk4.hxx
index 5838b24e8e..7ec7e6dd45 100644
--- a/src/solver/impls/rk4/rk4.hxx
+++ b/src/solver/impls/rk4/rk4.hxx
@@ -27,8 +27,8 @@
 
 class RK4Solver;
 
-#ifndef __RK4_SOLVER_H__
-#define __RK4_SOLVER_H__
+#ifndef BOUT_RK4_SOLVER_H
+#define BOUT_RK4_SOLVER_H
 
 #include "mpi.h"
 
@@ -68,4 +68,4 @@ private:
   Array<BoutReal> k1, k2, k3, k4, k5; //< Time-stepping arrays
 };
 
-#endif // __RK4_SOLVER_H__
+#endif // BOUT_RK4_SOLVER_H
diff --git a/src/solver/impls/rkgeneric/impls/cashkarp/cashkarp.hxx b/src/solver/impls/rkgeneric/impls/cashkarp/cashkarp.hxx
index 32072f1fc7..76042174f9 100644
--- a/src/solver/impls/rkgeneric/impls/cashkarp/cashkarp.hxx
+++ b/src/solver/impls/rkgeneric/impls/cashkarp/cashkarp.hxx
@@ -1,8 +1,8 @@
 
 class CASHKARPScheme;
 
-#ifndef __CASHKARP_SCHEME_H__
-#define __CASHKARP_SCHEME_H__
+#ifndef BOUT_CASHKARP_SCHEME_H
+#define BOUT_CASHKARP_SCHEME_H
 
 #include <bout/rkscheme.hxx>
 #include <bout/utils.hxx>
@@ -16,4 +16,4 @@ namespace {
 RegisterRKScheme<CASHKARPScheme> registerrkschemecashkarp(RKSCHEME_CASHKARP);
 }
 
-#endif // __CASHKARP_SCHEME_H__
+#endif // BOUT_CASHKARP_SCHEME_H
diff --git a/src/solver/impls/rkgeneric/impls/rk4simple/rk4simple.hxx b/src/solver/impls/rkgeneric/impls/rk4simple/rk4simple.hxx
index 126fa0912c..9fc0fc0604 100644
--- a/src/solver/impls/rkgeneric/impls/rk4simple/rk4simple.hxx
+++ b/src/solver/impls/rkgeneric/impls/rk4simple/rk4simple.hxx
@@ -1,8 +1,8 @@
 
 class RK4SIMPLEScheme;
 
-#ifndef __RK4SIMPLE_SCHEME_H__
-#define __RK4SIMPLE_SCHEME_H__
+#ifndef BOUT_RK4SIMPLE_SCHEME_H
+#define BOUT_RK4SIMPLE_SCHEME_H
 
 #include <bout/rkscheme.hxx>
 #include <bout/utils.hxx>
@@ -19,4 +19,4 @@ namespace {
 RegisterRKScheme<RK4SIMPLEScheme> registerrkscheme4simple(RKSCHEME_RK4);
 }
 
-#endif // __RK4SIMPLE_SCHEME_H__
+#endif // BOUT_RK4SIMPLE_SCHEME_H
diff --git a/src/solver/impls/rkgeneric/impls/rkf34/rkf34.hxx b/src/solver/impls/rkgeneric/impls/rkf34/rkf34.hxx
index 9de022b865..6840c4f5b4 100644
--- a/src/solver/impls/rkgeneric/impls/rkf34/rkf34.hxx
+++ b/src/solver/impls/rkgeneric/impls/rkf34/rkf34.hxx
@@ -1,8 +1,8 @@
 
 class RKF34Scheme;
 
-#ifndef __RKF34_SCHEME_H__
-#define __RKF34_SCHEME_H__
+#ifndef BOUT_RKF34_SCHEME_H
+#define BOUT_RKF34_SCHEME_H
 
 #include <bout/rkscheme.hxx>
 #include <bout/utils.hxx>
@@ -16,4 +16,4 @@ namespace {
 RegisterRKScheme<RKF34Scheme> registerrkschemef34(RKSCHEME_RKF34);
 }
 
-#endif // __RKF34_SCHEME_H__
+#endif // BOUT_RKF34_SCHEME_H
diff --git a/src/solver/impls/rkgeneric/impls/rkf45/rkf45.hxx b/src/solver/impls/rkgeneric/impls/rkf45/rkf45.hxx
index ea752877e0..70150a2a40 100644
--- a/src/solver/impls/rkgeneric/impls/rkf45/rkf45.hxx
+++ b/src/solver/impls/rkgeneric/impls/rkf45/rkf45.hxx
@@ -1,8 +1,8 @@
 
 class RKF45Scheme;
 
-#ifndef __RKF45_SCHEME_H__
-#define __RKF45_SCHEME_H__
+#ifndef BOUT_RKF45_SCHEME_H
+#define BOUT_RKF45_SCHEME_H
 
 #include <bout/rkscheme.hxx>
 #include <bout/utils.hxx>
@@ -16,4 +16,4 @@ namespace {
 RegisterRKScheme<RKF45Scheme> registerrkschemef45(RKSCHEME_RKF45);
 }
 
-#endif // __RKF45_SCHEME_H__
+#endif // BOUT_RKF45_SCHEME_H
diff --git a/src/solver/impls/rkgeneric/rkgeneric.cxx b/src/solver/impls/rkgeneric/rkgeneric.cxx
index 8f5e95f0be..1c332d26de 100644
--- a/src/solver/impls/rkgeneric/rkgeneric.cxx
+++ b/src/solver/impls/rkgeneric/rkgeneric.cxx
@@ -75,7 +75,7 @@ int RKGenericSolver::init() {
 
 void RKGenericSolver::resetInternalFields() {
   //Zero out history
-  BOUT_OMP(parallel for)
+  BOUT_OMP_PERF(parallel for)
   for (int i = 0; i < nlocal; i++) {
     tmpState[i] = 0;
     f2[i] = 0;
diff --git a/src/solver/impls/rkgeneric/rkgeneric.hxx b/src/solver/impls/rkgeneric/rkgeneric.hxx
index a18678e724..9df9a4a396 100644
--- a/src/solver/impls/rkgeneric/rkgeneric.hxx
+++ b/src/solver/impls/rkgeneric/rkgeneric.hxx
@@ -25,8 +25,8 @@
 
 class RKGenericSolver;
 
-#ifndef __RKGENERIC_SOLVER_H__
-#define __RKGENERIC_SOLVER_H__
+#ifndef BOUT_RKGENERIC_SOLVER_H
+#define BOUT_RKGENERIC_SOLVER_H
 
 #include "mpi.h"
 
@@ -77,4 +77,4 @@ private:
   std::unique_ptr<RKScheme> scheme{nullptr};
 };
 
-#endif // __RKGENERIC_SOLVER_H__
+#endif // BOUT_RKGENERIC_SOLVER_H
diff --git a/src/solver/impls/rkgeneric/rkscheme.cxx b/src/solver/impls/rkgeneric/rkscheme.cxx
index 25de364533..dd4bd8e7a1 100644
--- a/src/solver/impls/rkgeneric/rkscheme.cxx
+++ b/src/solver/impls/rkgeneric/rkscheme.cxx
@@ -59,7 +59,7 @@ void RKScheme::setCurState(const Array<BoutReal>& start, Array<BoutReal>& out,
                            const int curStage, const BoutReal dt) {
 
   //Set the initial stage
-  BOUT_OMP(parallel for)
+  BOUT_OMP_PERF(parallel for)
   for (int i = 0; i < nlocal; i++) {
     out[i] = start[i];
   }
@@ -76,7 +76,7 @@ void RKScheme::setCurState(const Array<BoutReal>& start, Array<BoutReal>& out,
     }
     BoutReal fac = stageCoeffs(curStage, j) * dt;
 
-    BOUT_OMP(parallel for)
+    BOUT_OMP_PERF(parallel for)
     for (int i = 0; i < nlocal; i++) {
       out[i] = out[i] + fac * steps(j, i);
     }
@@ -147,7 +147,7 @@ BoutReal RKScheme::getErr(Array<BoutReal>& solA, Array<BoutReal>& solB) {
   // we expect slightly different round-off error each time this
   // is called and hence the nrhs may no longer be exactly
   // repeatable with this parallelisation.
-  BOUT_OMP(parallel for reduction(+:local_err))
+  BOUT_OMP_PERF(parallel for reduction(+:local_err))
   for (int i = 0; i < nlocal; i++) {
     local_err +=
         std::abs(solA[i] - solB[i]) / (std::abs(solA[i]) + std::abs(solB[i]) + atol);
@@ -166,7 +166,7 @@ BoutReal RKScheme::getErr(Array<BoutReal>& solA, Array<BoutReal>& solB) {
 void RKScheme::constructOutput(const Array<BoutReal>& start, const BoutReal dt,
                                const int index, Array<BoutReal>& sol) {
   //Initialise the return data
-  BOUT_OMP(parallel for)
+  BOUT_OMP_PERF(parallel for)
   for (int i = 0; i < nlocal; i++) {
     sol[i] = start[i];
   }
@@ -177,7 +177,7 @@ void RKScheme::constructOutput(const Array<BoutReal>& start, const BoutReal dt,
       continue; // Real comparison not great
     }
     BoutReal fac = dt * resultCoeffs(curStage, index);
-    BOUT_OMP(parallel for)
+    BOUT_OMP_PERF(parallel for)
     for (int i = 0; i < nlocal; i++) {
       sol[i] = sol[i] + fac * steps(curStage, i);
     }
@@ -188,7 +188,7 @@ void RKScheme::constructOutputs(const Array<BoutReal>& start, const BoutReal dt,
                                 const int indexFollow, const int indexAlt,
                                 Array<BoutReal>& solFollow, Array<BoutReal>& solAlt) {
   //Initialise the return data
-  BOUT_OMP(parallel for)
+  BOUT_OMP_PERF(parallel for)
   for (int i = 0; i < nlocal; i++) {
     solFollow[i] = start[i];
     solAlt[i] = start[i];
@@ -198,7 +198,7 @@ void RKScheme::constructOutputs(const Array<BoutReal>& start, const BoutReal dt,
   for (int curStage = 0; curStage < getStageCount(); curStage++) {
     BoutReal facFol = dt * resultCoeffs(curStage, indexFollow);
     BoutReal facAlt = dt * resultCoeffs(curStage, indexAlt);
-    BOUT_OMP(parallel for)
+    BOUT_OMP_PERF(parallel for)
     for (int i = 0; i < nlocal; i++) {
       solFollow[i] = solFollow[i] + facFol * steps(curStage, i);
       solAlt[i] = solAlt[i] + facAlt * steps(curStage, i);
diff --git a/src/solver/impls/slepc/slepc.hxx b/src/solver/impls/slepc/slepc.hxx
index 88f35a04f9..619c873132 100644
--- a/src/solver/impls/slepc/slepc.hxx
+++ b/src/solver/impls/slepc/slepc.hxx
@@ -24,8 +24,8 @@
  *
  **************************************************************************/
 
-#ifndef __SLEPC_SOLVER_H__
-#define __SLEPC_SOLVER_H__
+#ifndef BOUT_SLEPC_SOLVER_H
+#define BOUT_SLEPC_SOLVER_H
 
 #include "bout/build_config.hxx"
 #include "bout/solver.hxx"
@@ -234,4 +234,4 @@ private:
 
 #endif // BOUT_HAS_SLEPC
 
-#endif // __SLEPC_SOLVER_H__
+#endif // BOUT_SLEPC_SOLVER_H
diff --git a/src/solver/impls/snes/snes.hxx b/src/solver/impls/snes/snes.hxx
index 2021402cd7..601eaaaa25 100644
--- a/src/solver/impls/snes/snes.hxx
+++ b/src/solver/impls/snes/snes.hxx
@@ -25,8 +25,8 @@
  *
  **************************************************************************/
 
-#ifndef __SNES_SOLVER_H__
-#define __SNES_SOLVER_H__
+#ifndef BOUT_SNES_SOLVER_H
+#define BOUT_SNES_SOLVER_H
 
 #include <bout/build_config.hxx>
 #include <bout/solver.hxx>
@@ -143,4 +143,4 @@ RegisterUnavailableSolver
 
 #endif // BOUT_HAS_PETSC
 
-#endif // __SNES_SOLVER_H__
+#endif // BOUT_SNES_SOLVER_H
diff --git a/src/solver/impls/split-rk/split-rk.cxx b/src/solver/impls/split-rk/split-rk.cxx
index ef53a12f2e..cd6bd1718c 100644
--- a/src/solver/impls/split-rk/split-rk.cxx
+++ b/src/solver/impls/split-rk/split-rk.cxx
@@ -113,7 +113,7 @@ int SplitRK::run() {
 
           // Check accuracy
           BoutReal local_err = 0.;
-          BOUT_OMP(parallel for reduction(+: local_err)   )
+          BOUT_OMP_PERF(parallel for reduction(+: local_err)   )
           for (int i = 0; i < nlocal; i++) {
             local_err +=
                 fabs(state2[i] - state1[i]) / (fabs(state1[i]) + fabs(state2[i]) + atol);
@@ -220,7 +220,7 @@ void SplitRK::take_diffusion_step(BoutReal curtime, BoutReal dt, Array<BoutReal>
   // Stage j = 1
   // y_m2 = y0 + weight/3.0 * f(y0)  -> u2
 
-  BOUT_OMP(parallel for)
+  BOUT_OMP_PERF(parallel for)
   for (int i = 0; i < dydt.size(); i++) {
     u2[i] = start[i] + (weight / 3.0) * dydt[i];
   }
@@ -231,7 +231,7 @@ void SplitRK::take_diffusion_step(BoutReal curtime, BoutReal dt, Array<BoutReal>
   run_diffusive(curtime + (weight / 3.0) * dt);
   save_derivs(std::begin(u3)); // f(y_m2) -> u3
 
-  BOUT_OMP(parallel for)
+  BOUT_OMP_PERF(parallel for)
   for (int i = 0; i < u3.size(); i++) {
     u1[i] = 1.5 * (u2[i] + weight * u3[i]) - 0.5 * start[i] - weight * dydt[i];
   }
@@ -251,7 +251,7 @@ void SplitRK::take_diffusion_step(BoutReal curtime, BoutReal dt, Array<BoutReal>
     run_diffusive(curtime);
     save_derivs(std::begin(u3)); // f(y_m1) -> u3
 
-    BOUT_OMP(parallel for)
+    BOUT_OMP_PERF(parallel for)
     for (int i = 0; i < u3.size(); i++) {
       // Next stage result in u3
       u3[i] = mu * (u1[i] + weight * (u3[i] - a_jm1 * dydt[i])) + nu * u2[i]
@@ -280,7 +280,7 @@ void SplitRK::take_advection_step(BoutReal curtime, BoutReal dt, Array<BoutReal>
   run_convective(curtime);
   save_derivs(std::begin(dydt));
 
-  BOUT_OMP(parallel for)
+  BOUT_OMP_PERF(parallel for)
   for (int i = 0; i < nlocal; i++) {
     u1[i] = start[i] + dt * dydt[i];
   }
@@ -289,7 +289,7 @@ void SplitRK::take_advection_step(BoutReal curtime, BoutReal dt, Array<BoutReal>
   run_convective(curtime + dt);
   save_derivs(std::begin(dydt));
 
-  BOUT_OMP(parallel for )
+  BOUT_OMP_PERF(parallel for )
   for (int i = 0; i < nlocal; i++) {
     u2[i] = 0.75 * start[i] + 0.25 * u1[i] + 0.25 * dt * dydt[i];
   }
@@ -298,7 +298,7 @@ void SplitRK::take_advection_step(BoutReal curtime, BoutReal dt, Array<BoutReal>
   run_convective(curtime + 0.5 * dt);
   save_derivs(std::begin(dydt));
 
-  BOUT_OMP(parallel for)
+  BOUT_OMP_PERF(parallel for)
   for (int i = 0; i < nlocal; i++) {
     result[i] = (1. / 3) * start[i] + (2. / 3.) * (u2[i] + dt * dydt[i]);
   }
diff --git a/src/sys/adios_object.cxx b/src/sys/adios_object.cxx
index c7d6dab9aa..477dae14ef 100644
--- a/src/sys/adios_object.cxx
+++ b/src/sys/adios_object.cxx
@@ -1,6 +1,6 @@
 #include "bout/build_config.hxx"
 
-#if BOUT_HAS_ADIOS
+#if BOUT_HAS_ADIOS2
 
 #include "bout/adios_object.hxx"
 #include "bout/boutexception.hxx"
@@ -95,4 +95,4 @@ void ADIOSSetParameters(const std::string& input, const char delimKeyValue,
 }
 
 } // namespace bout
-#endif //BOUT_HAS_ADIOS
+#endif //BOUT_HAS_ADIOS2
diff --git a/src/sys/hyprelib.cxx b/src/sys/hyprelib.cxx
index 691e53230f..7bdeaa47cf 100644
--- a/src/sys/hyprelib.cxx
+++ b/src/sys/hyprelib.cxx
@@ -27,7 +27,7 @@ static constexpr auto BOUT_HYPRE_MEMORY = HYPRE_MEMORY_HOST;
 #endif
 
 HypreLib::HypreLib() {
-  BOUT_OMP(critical(HypreLib))
+  BOUT_OMP_SAFE(critical(HypreLib))
   {
     if (count == 0) { // Initialise once
       output_progress.write("Initialising Hypre\n");
@@ -40,7 +40,7 @@ HypreLib::HypreLib() {
 }
 
 HypreLib::HypreLib([[maybe_unused]] const HypreLib& other) noexcept {
-  BOUT_OMP(critical(HypreLib))
+  BOUT_OMP_SAFE(critical(HypreLib))
   {
     // No need to initialise Hypre, because it must already be initialised
     count++; // Copying, so increase count
@@ -48,7 +48,7 @@ HypreLib::HypreLib([[maybe_unused]] const HypreLib& other) noexcept {
 }
 
 HypreLib::HypreLib([[maybe_unused]] HypreLib&& other) noexcept {
-  BOUT_OMP(critical(HypreLib))
+  BOUT_OMP_SAFE(critical(HypreLib))
   {
     // No need to initialise Hypre, because it must already be initialised
     count++; // Creating a new Hyprelib object; other will be deleted
@@ -56,7 +56,7 @@ HypreLib::HypreLib([[maybe_unused]] HypreLib&& other) noexcept {
 }
 
 HypreLib::~HypreLib() {
-  BOUT_OMP(critical(HypreLib))
+  BOUT_OMP_SAFE(critical(HypreLib))
   {
     count--;
     if (count == 0) {
@@ -67,7 +67,7 @@ HypreLib::~HypreLib() {
 }
 
 void HypreLib::cleanup() {
-  BOUT_OMP(critical(HypreLib))
+  BOUT_OMP_SAFE(critical(HypreLib))
   {
     if (count > 0) {
       output << "Finalising Hypre. Warning: Instances of HypreLib still exist.\n";
diff --git a/src/sys/msg_stack.cxx b/src/sys/msg_stack.cxx
index 6ea4c15a8b..502836324c 100644
--- a/src/sys/msg_stack.cxx
+++ b/src/sys/msg_stack.cxx
@@ -58,7 +58,7 @@ void MsgStack::pop() {
   if (position <= 0) {
     return;
   }
-  BOUT_OMP(single)
+  BOUT_OMP_SAFE(single)
   { --position; }
 }
 
@@ -78,7 +78,7 @@ void MsgStack::pop(int id) {
 }
 
 void MsgStack::clear() {
-  BOUT_OMP(single)
+  BOUT_OMP_SAFE(single)
   {
     stack.clear();
     position = 0;
@@ -86,7 +86,7 @@ void MsgStack::clear() {
 }
 
 void MsgStack::dump() {
-  BOUT_OMP(single)
+  BOUT_OMP_SAFE(single)
   { output << this->getDump(); }
 }
 
diff --git a/src/sys/options.cxx b/src/sys/options.cxx
index a358d50234..893a92cffc 100644
--- a/src/sys/options.cxx
+++ b/src/sys/options.cxx
@@ -221,6 +221,36 @@ Options::fuzzyFind(const std::string& name, std::string::size_type distance) con
   return matches;
 }
 
+Options::Options(const Options& other) { (*this) = other.copy(); }
+
+Options& Options::operator=(const Options& other) {
+  if (this == &other) {
+    return *this;
+  }
+
+  // Note: copy-and-swap cannot be used here because pointers to parents are stored
+
+  value = other.value;
+
+  // Copy the attributes by clearing and re-inserting, because the plain
+  // assignment `attributes = other.attributes;` fails to compile with
+  // Apple Clang 12
+  attributes.clear();
+  attributes.insert(other.attributes.begin(), other.attributes.end());
+
+  full_name = other.full_name;
+  is_section = other.is_section;
+  children = other.children;
+  value_used = other.value_used;
+
+  // Re-point every copied child at this instance; otherwise each child's
+  // parent_instance would still refer to the original Options instance
+  for (auto& child : children) {
+    child.second.parent_instance = this;
+  }
+  return *this;
+}
+
 Options& Options::operator=(Options&& other) noexcept {
   if (this == &other) {
     return *this;
diff --git a/src/sys/options/optionparser.hxx b/src/sys/options/optionparser.hxx
index ff5bb61a6f..bc61ef7297 100644
--- a/src/sys/options/optionparser.hxx
+++ b/src/sys/options/optionparser.hxx
@@ -39,8 +39,8 @@
 
 class OptionParser;
 
-#ifndef __OPTIONPARSER_H__
-#define __OPTIONPARSER_H__
+#ifndef BOUT_OPTIONPARSER_H
+#define BOUT_OPTIONPARSER_H
 
 #include "bout/bout_types.hxx"
 #include "bout/options.hxx"
@@ -61,4 +61,4 @@ public:
 private:
 };
 
-#endif // __OPTIONPARSER_H__
+#endif // BOUT_OPTIONPARSER_H
diff --git a/src/sys/options/options_adios.cxx b/src/sys/options/options_adios.cxx
index b313d7bc79..88df92df04 100644
--- a/src/sys/options/options_adios.cxx
+++ b/src/sys/options/options_adios.cxx
@@ -1,6 +1,6 @@
 #include "bout/build_config.hxx"
 
-#if BOUT_HAS_ADIOS
+#if BOUT_HAS_ADIOS2
 
 #include "options_adios.hxx"
 #include "bout/adios_object.hxx"
@@ -628,4 +628,4 @@ void OptionsADIOS::write(const Options& options, const std::string& time_dim) {
 
 } // namespace bout
 
-#endif // BOUT_HAS_ADIOS
+#endif // BOUT_HAS_ADIOS2
diff --git a/src/sys/options/options_adios.hxx b/src/sys/options/options_adios.hxx
index eddb3976ff..a942e6fed9 100644
--- a/src/sys/options/options_adios.hxx
+++ b/src/sys/options/options_adios.hxx
@@ -8,7 +8,7 @@
 #include "bout/options.hxx"
 #include "bout/options_io.hxx"
 
-#if !BOUT_HAS_ADIOS
+#if !BOUT_HAS_ADIOS2
 
 namespace {
 bout::RegisterUnavailableOptionsIO
@@ -79,5 +79,5 @@ RegisterOptionsIO<OptionsADIOS> registeroptionsadios("adios");
 
 } // namespace bout
 
-#endif // BOUT_HAS_ADIOS
+#endif // BOUT_HAS_ADIOS2
 #endif // OPTIONS_ADIOS_H
diff --git a/src/sys/options/options_ini.hxx b/src/sys/options/options_ini.hxx
index d06a700f09..092ed9320a 100644
--- a/src/sys/options/options_ini.hxx
+++ b/src/sys/options/options_ini.hxx
@@ -33,8 +33,8 @@
 
 class OptionINI;
 
-#ifndef __OPTIONS_INI_H__
-#define __OPTIONS_INI_H__
+#ifndef BOUT_OPTIONS_INI_H
+#define BOUT_OPTIONS_INI_H
 
 #include "optionparser.hxx"
 
@@ -59,4 +59,4 @@ private:
   std::string getNextLine(std::ifstream& fin);
 };
 
-#endif // __OPTIONS_INI_H__
+#endif // BOUT_OPTIONS_INI_H
diff --git a/src/sys/petsclib.cxx b/src/sys/petsclib.cxx
index bfcd7d6314..f1cf1a9d1b 100644
--- a/src/sys/petsclib.cxx
+++ b/src/sys/petsclib.cxx
@@ -58,7 +58,7 @@ void setPetscOptions(Options& options, const std::string& prefix) {
 } // namespace
 
 PetscLib::PetscLib(Options* opt) {
-  BOUT_OMP(critical(PetscLib))
+  BOUT_OMP_SAFE(critical(PetscLib))
   {
     if (count == 0) {
       // Initialise PETSc
@@ -95,7 +95,7 @@ PetscLib::PetscLib(Options* opt) {
 }
 
 PetscLib::~PetscLib() {
-  BOUT_OMP(critical(PetscLib))
+  BOUT_OMP_SAFE(critical(PetscLib))
   {
     count--;
     if (count == 0) {
@@ -120,7 +120,7 @@ void PetscLib::setOptionsFromInputFile(SNES& snes) {
 }
 
 void PetscLib::cleanup() {
-  BOUT_OMP(critical(PetscLib))
+  BOUT_OMP_SAFE(critical(PetscLib))
   {
     if (count > 0) {
       output << "Finalising PETSc. Warning: Instances of PetscLib still exist.\n";
diff --git a/tests/MMS/GBS/gbs.hxx b/tests/MMS/GBS/gbs.hxx
index e711e3ea83..468a5e579c 100644
--- a/tests/MMS/GBS/gbs.hxx
+++ b/tests/MMS/GBS/gbs.hxx
@@ -1,8 +1,8 @@
 
 class GBS;
 
-#ifndef __GBS_H__
-#define __GBS_H__
+#ifndef BOUT_GBS_H
+#define BOUT_GBS_H
 
 #include <bout/physicsmodel.hxx>
 
@@ -96,4 +96,4 @@ private:
   std::unique_ptr<Laplacian> aparSolver{nullptr};
 };
 
-#endif // __GBS_H__
+#endif // BOUT_GBS_H
diff --git a/tests/MMS/spatial/fci/runtest b/tests/MMS/spatial/fci/runtest
index 712442a795..204a9cc271 100755
--- a/tests/MMS/spatial/fci/runtest
+++ b/tests/MMS/spatial/fci/runtest
@@ -27,7 +27,7 @@ nx = 3  # Not changed for these tests
 nlist = [8, 16, 32, 64, 128]
 
 # Number of parallel slices (in each direction)
-nslices = [1, 2]
+nslices = [1]
 
 directory = "data"
 
diff --git a/tests/integrated/CMakeLists.txt b/tests/integrated/CMakeLists.txt
index 7d3e8e81ce..ef173db7df 100644
--- a/tests/integrated/CMakeLists.txt
+++ b/tests/integrated/CMakeLists.txt
@@ -11,6 +11,7 @@ add_subdirectory(test-datafilefacade)
 add_subdirectory(test-drift-instability)
 add_subdirectory(test-drift-instability-staggered)
 add_subdirectory(test-fieldgroupComm)
+add_subdirectory(test-fci-boundary)
 add_subdirectory(test-griddata)
 add_subdirectory(test-griddata-yboundary-guards)
 add_subdirectory(test-gyro)
diff --git a/tests/integrated/test-fci-boundary/CMakeLists.txt b/tests/integrated/test-fci-boundary/CMakeLists.txt
new file mode 100644
index 0000000000..bf25cd7c57
--- /dev/null
+++ b/tests/integrated/test-fci-boundary/CMakeLists.txt
@@ -0,0 +1,27 @@
+# Test of the FCI parallel boundary regions. It requires zoidberg, which
+# grid.py uses to generate the grid file read by the test.
+bout_add_mms_test(test-fci-boundary
+  SOURCES get_par_bndry.cxx
+  USE_RUNTEST
+  USE_DATA_BOUT_INP
+  REQUIRES zoidberg_FOUND
+  PROCESSORS 1
+  )
+
+if (zoidberg_FOUND)
+  # Generate the FCI grid file in the build tree by running grid.py. The rule
+  # re-runs when grid.py or the configured boutconfig module changes.
+  set(gridfile ${CMAKE_CURRENT_BINARY_DIR}/grid.fci.nc)
+  add_custom_command(OUTPUT ${gridfile}
+    COMMAND ${CMAKE_COMMAND} -E env "PYTHONPATH=${BOUT_PYTHONPATH}:$ENV{PYTHONPATH}" ${Python3_EXECUTABLE} ${CMAKE_CURRENT_SOURCE_DIR}/grid.py ${gridfile}
+    WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}
+    DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/../../../tools/pylib/boutconfig/__init__.py
+    DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/grid.py
+    COMMENT "Creating test-fci-boundary grid file"
+    VERBATIM
+  )
+  # Tie the generated grid file to the test executable's build
+  add_custom_target(test-fci-boundary-grid DEPENDS ${gridfile})
+  add_dependencies(test-fci-boundary
+    test-fci-boundary-grid)
+endif()
diff --git a/tests/integrated/test-fci-boundary/data/BOUT.inp b/tests/integrated/test-fci-boundary/data/BOUT.inp
new file mode 100644
index 0000000000..b631f16295
--- /dev/null
+++ b/tests/integrated/test-fci-boundary/data/BOUT.inp
@@ -0,0 +1,20 @@
+grid = grid.fci.nc
+
+MXG = 1
+NXPE = 1
+MYG = 1
+
+[mesh]
+symmetricglobalx = true
+
+[mesh:ddy]
+first = C2
+second = C2
+
+[mesh:paralleltransform]
+type = fci
+y_periodic = true
+z_periodic = true
+
+[mesh:paralleltransform:xzinterpolation]
+type = lagrange4pt
diff --git a/tests/integrated/test-fci-boundary/get_par_bndry.cxx b/tests/integrated/test-fci-boundary/get_par_bndry.cxx
new file mode 100644
index 0000000000..ac0f5de2a6
--- /dev/null
+++ b/tests/integrated/test-fci-boundary/get_par_bndry.cxx
@@ -0,0 +1,45 @@
+// Count, for every parallel boundary type, how often each cell is visited
+// when iterating over the parallel boundary regions of that type, and write
+// the counts to the default output file as "field_<type name>".
+
+#include "bout/bout.hxx"
+#include "bout/derivs.hxx"
+#include "bout/field_factory.hxx"
+#include "bout/parallel_boundary_region.hxx"
+
+#include <vector>
+
+int main(int argc, char** argv) {
+  BoutInitialise(argc, argv);
+
+  using bout::globals::mesh;
+
+  // SIZE is treated as the number of boundary types; keeping it as an int
+  // avoids comparing the signed loop index with an unsigned container size
+  const int num_types = static_cast<int>(BoundaryParType::SIZE);
+
+  std::vector<Field3D> fields(num_types);
+  Options dump;
+  for (int i = 0; i < num_types; i++) {
+    const auto type = static_cast<BoundaryParType>(i);
+    const auto type_name = toString(type);
+
+    fields[i] = Field3D{0.0};
+    mesh->communicate(fields[i]);
+    for (const auto& bndry_par : mesh->getBoundariesPar(type)) {
+      output.write("{:s} region\n", type_name);
+      for (bndry_par->first(); !bndry_par->isDone(); bndry_par->next()) {
+        // Accumulate, so a cell visited more than once gets a count above one
+        fields[i][bndry_par->ind()] += 1;
+        output.write("{:s} increment\n", type_name);
+      }
+    }
+    output.write("{:s} done\n", type_name);
+
+    dump[fmt::format("field_{:s}", type_name)] = fields[i];
+  }
+
+  bout::writeDefaultOutputFile(dump);
+
+  BoutFinalise();
+}
diff --git a/tests/integrated/test-fci-boundary/grid.py b/tests/integrated/test-fci-boundary/grid.py
new file mode 100644
index 0000000000..d544f0cdf7
--- /dev/null
+++ b/tests/integrated/test-fci-boundary/grid.py
@@ -0,0 +1,88 @@
+import zoidberg as zb
+import numpy as np
+import sys
+import boutconfig as bc
+
+
+def rotating_ellipse(
+    nx=68,
+    ny=16,
+    nz=128,
+    npoints=421,
+    xcentre=5.5,
+    I_coil=0.01,
+    curvilinear=True,
+    rectangular=False,
+    fname="rotating-ellipse.fci.nc",
+    a=0.4,
+    Btor=2.5,
+):
+    """Write an FCI grid file for a zoidberg ``RotatingEllipse`` field.
+
+    Parameters
+    ----------
+    nx, nz : int
+        Resolution arguments passed to the poloidal grid constructor.
+    ny : int
+        Number of y locations, evenly spaced over ``[0, yperiod)``.
+    npoints : int
+        Passed as ``n`` to ``zb.rzline.shaped_line`` for the inner and outer
+        lines of the curvilinear grid.
+    xcentre : float
+        Centre used for the field (``xcentre``) and for the poloidal grid
+        (``Rcentre`` / ``R0``).
+    I_coil, Btor : float
+        Passed unchanged to ``zb.field.RotatingEllipse``.
+    curvilinear, rectangular : bool
+        Select the poloidal grid type; ``rectangular`` takes precedence when
+        both are true.
+    fname : str
+        Path of the grid file to write.
+    a : float
+        The field is created with ``radius=2 * a``; the curvilinear grid is
+        built between shaped lines with ``a / 2`` and ``a``.
+
+    Raises
+    ------
+    ValueError
+        If neither ``rectangular`` nor ``curvilinear`` is true.
+    """
+    yperiod = 2 * np.pi / 5.0
+    field = zb.field.RotatingEllipse(
+        xcentre=xcentre,
+        I_coil=I_coil,
+        radius=2 * a,
+        yperiod=yperiod,
+        Btor=Btor,
+    )
+    # Define the y locations
+    ycoords = np.linspace(0.0, yperiod, ny, endpoint=False)
+
+    if rectangular:
+        print("Making rectangular poloidal grid")
+        poloidal_grid = zb.poloidal_grid.RectangularPoloidalGrid(
+            nx, nz, 1.0, 1.0, Rcentre=xcentre
+        )
+    elif curvilinear:
+        print("Making curvilinear poloidal grid")
+        inner = zb.rzline.shaped_line(
+            R0=xcentre, a=a / 2.0, elong=0, triang=0.0, indent=0, n=npoints
+        )
+        outer = zb.rzline.shaped_line(
+            R0=xcentre, a=a, elong=0, triang=0.0, indent=0, n=npoints
+        )
+
+        print("creating grid...")
+        poloidal_grid = zb.poloidal_grid.grid_elliptic(inner, outer, nx, nz)
+    else:
+        # No grid type selected, so there is nothing to build the 3D grid from
+        raise ValueError("Either rectangular or curvilinear must be True")
+
+    # Create the 3D grid by putting together 2D poloidal grids
+    grid = zb.grid.Grid(poloidal_grid, ycoords, yperiod, yperiodic=True)
+    maps = zb.make_maps(grid, field, quiet=True)
+    zb.write_maps(grid, field, maps, str(fname), metric2d=bc.isMetric2D())
+
+
+if __name__ == "__main__":
+    rotating_ellipse(fname=sys.argv[1])
diff --git a/tests/integrated/test-fci-boundary/runtest b/tests/integrated/test-fci-boundary/runtest
new file mode 100755
index 0000000000..16cb4ee443
--- /dev/null
+++ b/tests/integrated/test-fci-boundary/runtest
@@ -0,0 +1,70 @@
+#!/usr/bin/env python3
+#
+# Run get_par_bndry and check its output fields against the FCI grid file
+#
+
+# Cores: 2
+# only working with cmake
+# requires: False
+from boututils.run_wrapper import launch_safe
+from boututils.datafile import DataFile
+from boutdata.collect import collect as _collect
+
+import numpy as np
+
+
+def collect(var):
+    return _collect(
+        var,
+        info=False,
+        path=directory,
+        xguards=False,
+        yguards=False,
+    )
+
+
+nprocs = [1]  # only a single process is tested; 2 and 4 are disabled
+mthread = 2
+
+directory = "data"
+
+with DataFile("grid.fci.nc") as grid:
+    xfwd = grid.read("forward_xt_prime")[1:-1]  # drop first/last index (presumably x guard cells)
+    xbwd = grid.read("backward_xt_prime")[1:-1]  # same trimming for the backward map
+
+nx = xfwd.shape[0]
+
+regions = {
+    "xin_fwd": xfwd < 1,
+    "xout_fwd": xfwd > nx,
+    "xin_bwd": xbwd < 1,
+    "xout_bwd": xbwd > nx,
+}
+regions = {k: v.astype(int) for k, v in regions.items()}  # boolean masks -> 0/1
+
+# Build the aggregate regions from the four basic masks. The integer masks
+# are added rather than combined with logical_or, so a point that belongs to
+# two of the combined masks has an expected value of 2. The fields written by
+# get_par_bndry are compared against these sums below, hence they are taken
+# to hold per-point counts -- NOTE(review): confirm against get_par_bndry.
+for x in "xout", "xin":
+    regions[x] = regions[f"{x}_fwd"] + regions[f"{x}_bwd"]
+for x in "fwd", "bwd":
+    regions[x] = regions[f"xin_{x}"] + regions[f"xout_{x}"]
+regions["all"] = regions["xin"] + regions["xout"]
+
+for nproc in nprocs:
+    cmd = "./get_par_bndry"
+
+    # Launch using MPI
+    _, out = launch_safe(cmd, nproc=nproc, mthread=mthread, pipe=True)
+
+    for k, v in regions.items():
+        # Compare the field written by get_par_bndry with the expected mask
+        data = collect(f"field_{k}")
+        assert np.allclose(data, v), (
+            k + " does not match",
+            np.sum(data),
+            np.sum(v),
+            np.max(data),
+        )
diff --git a/tests/integrated/test-laplace-hypre3d/data_circular_core-sol/BOUT.inp b/tests/integrated/test-laplace-hypre3d/data_circular_core-sol/BOUT.inp
index 9a6ac24fa1..46d3cb55ba 100644
--- a/tests/integrated/test-laplace-hypre3d/data_circular_core-sol/BOUT.inp
+++ b/tests/integrated/test-laplace-hypre3d/data_circular_core-sol/BOUT.inp
@@ -1,7 +1,7 @@
 [f]
 #function = 0.
 function = mixmode(x, 1.)*mixmode(y, 2.)*mixmode(z, 3.)
-bndry_par_all = parallel_neumann
+bndry_par_all = parallel_neumann_o2
 
 [rhs]
 function = mixmode(x, 4.)*mixmode(y, 5.)*mixmode(z, 6.)
@@ -17,7 +17,7 @@ function = 1. + .1*mixmode(x, 10.)*mixmode(y, 11.)*mixmode(z, 12.)
 [C2]
 #function = 0.
 function = .1*mixmode(x, 13.)*mixmode(y, 14.)*mixmode(z, 15.)
-bndry_par_all = parallel_neumann
+bndry_par_all = parallel_neumann_o2
 
 [A]
 function = 0.
diff --git a/tests/integrated/test-laplace-hypre3d/data_circular_core/BOUT.inp b/tests/integrated/test-laplace-hypre3d/data_circular_core/BOUT.inp
index eb78644f0f..be0c697d80 100644
--- a/tests/integrated/test-laplace-hypre3d/data_circular_core/BOUT.inp
+++ b/tests/integrated/test-laplace-hypre3d/data_circular_core/BOUT.inp
@@ -16,7 +16,7 @@ function = 1. + .1*mixmode(x, 10.)*mixmode(y, 11.)*mixmode(z, 12.)
 [C2]
 #function = 0.
 function = .1*mixmode(x, 13.)*mixmode(y, 14.)*mixmode(z, 15.)
-bndry_par_all = parallel_neumann
+bndry_par_all = parallel_neumann_o2
 
 [A]
 function = 0.
diff --git a/tests/integrated/test-laplace-petsc3d/data_circular_core-sol/BOUT.inp b/tests/integrated/test-laplace-petsc3d/data_circular_core-sol/BOUT.inp
index da1918dcc7..bc3c47eac7 100644
--- a/tests/integrated/test-laplace-petsc3d/data_circular_core-sol/BOUT.inp
+++ b/tests/integrated/test-laplace-petsc3d/data_circular_core-sol/BOUT.inp
@@ -17,7 +17,7 @@ function = 1. + .1*mixmode(x, 10.)*mixmode(y, 11.)*mixmode(z, 12.)
 [C2]
 #function = 0.
 function = .1*mixmode(x, 13.)*mixmode(y, 14.)*mixmode(z, 15.)
-bndry_par_all = parallel_neumann
+bndry_par_all = parallel_neumann_o2
 
 [A]
 function = 0.0
diff --git a/tests/integrated/test-laplace-petsc3d/data_circular_core/BOUT.inp b/tests/integrated/test-laplace-petsc3d/data_circular_core/BOUT.inp
index 6474b2604b..601531de84 100644
--- a/tests/integrated/test-laplace-petsc3d/data_circular_core/BOUT.inp
+++ b/tests/integrated/test-laplace-petsc3d/data_circular_core/BOUT.inp
@@ -17,7 +17,7 @@ function = 1. + .1*mixmode(x, 10.)*mixmode(y, 11.)*mixmode(z, 12.)
 [C2]
 #function = 0.
 function = .1*mixmode(x, 13.)*mixmode(y, 14.)*mixmode(z, 15.)
-bndry_par_all = parallel_neumann
+bndry_par_all = parallel_neumann_o2
 
 [A]
 function = 0.0
diff --git a/tests/integrated/test-laplacexy/loadmetric.hxx b/tests/integrated/test-laplacexy/loadmetric.hxx
index 141269d8b8..25e55fc8e8 100644
--- a/tests/integrated/test-laplacexy/loadmetric.hxx
+++ b/tests/integrated/test-laplacexy/loadmetric.hxx
@@ -1,8 +1,8 @@
-#ifndef __LOADMETRIC_H__
-#define __LOADMETRIC_H__
+#ifndef BOUT_LOADMETRIC_H
+#define BOUT_LOADMETRIC_H
 
 #include <bout/bout_types.hxx>
 
 void LoadMetric(BoutReal Lnorm, BoutReal Bnorm);
 
-#endif // __LOADMETRIC_H__
+#endif // BOUT_LOADMETRIC_H
diff --git a/tests/integrated/test-options-adios/CMakeLists.txt b/tests/integrated/test-options-adios/CMakeLists.txt
index 110773d6fd..cc61fabe57 100644
--- a/tests/integrated/test-options-adios/CMakeLists.txt
+++ b/tests/integrated/test-options-adios/CMakeLists.txt
@@ -2,5 +2,5 @@ bout_add_integrated_test(test-options-adios
   SOURCES test-options-adios.cxx
   USE_RUNTEST
   USE_DATA_BOUT_INP
-  REQUIRES BOUT_HAS_ADIOS
+  REQUIRES BOUT_HAS_ADIOS2
   )
diff --git a/tests/integrated/test-options-adios/runtest b/tests/integrated/test-options-adios/runtest
index 1621c686a3..03a83fc0ba 100755
--- a/tests/integrated/test-options-adios/runtest
+++ b/tests/integrated/test-options-adios/runtest
@@ -34,7 +34,7 @@ assert result["int"] == 42
 assert math.isclose(result["real"], 3.1415)
 assert result["string"] == "hello"
 
-print("Checking saved ADIOS test-out file -- Not implemented")
+print("Checking saved ADIOS2 test-out file -- Not implemented")
 
 # Check the output NetCDF file
 # with DataFile("test-out.nc") as f:
diff --git a/tests/integrated/test-petsc_laplace/test_petsc_laplace.cxx b/tests/integrated/test-petsc_laplace/test_petsc_laplace.cxx
index bfd394194f..1e3cdde310 100644
--- a/tests/integrated/test-petsc_laplace/test_petsc_laplace.cxx
+++ b/tests/integrated/test-petsc_laplace/test_petsc_laplace.cxx
@@ -23,15 +23,102 @@
  *
  **************************************************************************/
 
-#include <bout/bout.hxx>
-#include <bout/constants.hxx>
-// #include <bout/sys/timer.hxx>
-#include <bout/boutexception.hxx>
-#include <bout/invert_laplace.hxx>
-#include <bout/options.hxx>
+#include "bout/bout.hxx" // NOLINT
+#include "bout/bout_types.hxx"
+#include "bout/boutexception.hxx"
+#include "bout/constants.hxx"
+#include "bout/difops.hxx"
+#include "bout/field2d.hxx"
+#include "bout/field3d.hxx"
+#include "bout/invert_laplace.hxx"
+#include "bout/options.hxx"
+#include "bout/options_io.hxx"
+#include "bout/output.hxx"
+#include "bout/traits.hxx"
+
+#include "fmt/core.h"
+#include <mpi.h>
+
 #include <cmath>
+#include <string_view>
 
 BoutReal max_error_at_ystart(const Field3D& error);
+void apply_flat_boundary(Field3D& bcoef);
+
+/// Run a single inversion test and record its inputs and results in \p dump
+///
+/// Sets the boundary flags and the A, C and D coefficients on \p invert,
+/// solves for the X-Z slice of \p bcoef at \p ystart and compares the result
+/// with the expected solution \p field. If a BoutException is thrown while
+/// solving or computing the errors, its message is printed and the reported
+/// maximum error stays at -1.
+///
+/// Everything is stored in \p dump under names ending in \p test_num, for
+/// example "sol1" and "max_error1" for test 1.
+template <class T, class U>
+void check_laplace(int test_num, std::string_view test_name, Laplacian& invert,
+                   int inner_flags, int outer_flags, const T& acoef, const T& ccoef,
+                   const T& dcoef, const U& bcoef, const Field3D& field, int ystart,
+                   Options& dump) {
+  static_assert(bout::utils::is_Field_v<T>, "check_laplace requires Field2D or Field3D");
+  static_assert(bout::utils::is_Field_v<U>, "check_laplace requires Field2D or Field3D");
+
+  invert.setInnerBoundaryFlags(inner_flags);
+  invert.setOuterBoundaryFlags(outer_flags);
+  invert.setCoefA(acoef);
+  invert.setCoefC(ccoef);
+  invert.setCoefD(dcoef);
+
+  checkData(bcoef);
+
+  Field3D sol;
+  Field3D error;
+  Field3D abs_error;
+  // Remains negative when the solve fails, marking the test as not evaluated
+  BoutReal max_error = -1;
+
+  try {
+    sol = invert.solve(sliceXZ(bcoef, ystart));
+    error = (field - sol) / field;
+    abs_error = field - sol;
+    max_error = max_error_at_ystart(abs(abs_error));
+  } catch (BoutException& err) {
+    output.write("BoutException occurred in invert.solve(bcoef): {}\n", err.what());
+  }
+
+  output.write("\nTest {}: {}\n", test_num, test_name);
+  output.write("Magnitude of maximum absolute error is {}\n", max_error);
+
+  dump[fmt::format("a{}", test_num)] = acoef;
+  dump[fmt::format("b{}", test_num)] = bcoef;
+  dump[fmt::format("c{}", test_num)] = ccoef;
+  dump[fmt::format("d{}", test_num)] = dcoef;
+  dump[fmt::format("f{}", test_num)] = field;
+  dump[fmt::format("sol{}", test_num)] = sol;
+  dump[fmt::format("error{}", test_num)] = error;
+  dump[fmt::format("absolute_error{}", test_num)] = abs_error;
+  dump[fmt::format("max_error{}", test_num)] = max_error;
+}
+
+/// Evaluate d*Delp2(f) + Grad_perp(c).Grad_perp(f)/c + a*f for the given
+/// field and coefficients, then pass the result through apply_flat_boundary()
+template <class T>
+Field3D forward_laplace(const Field3D& field, const T& acoef, const T& ccoef,
+                        const T& dcoef) {
+  auto bcoef =
+      dcoef * Delp2(field) + Grad_perp(ccoef) * Grad_perp(field) / ccoef + acoef * field;
+  apply_flat_boundary(bcoef);
+  return bcoef;
+}
+
+Field3D generate_f1(const Mesh& mesh);
+Field3D generate_a1(const Mesh& mesh);
+Field3D generate_c1(const Mesh& mesh);
+Field3D generate_d1(const Mesh& mesh);
+
+Field3D generate_f5(const Mesh& mesh);
+Field3D generate_a5(const Mesh& mesh);
+Field3D generate_c5(const Mesh& mesh);
+Field3D generate_d5(const Mesh& mesh);
 
 int main(int argc, char** argv) {
 
@@ -42,829 +117,553 @@ int main(int argc, char** argv) {
     options = Options::getRoot()->getSection("petsc4th");
     auto invert_4th = Laplacian::create(options);
 
-    // Solving equations of the form d*Delp2(f) + 1/c*Grad_perp(c).Grad_perp(f) + a*f = b for various f, a, c, d
-    Field3D f1, a1, b1, c1, d1, sol1;
-    BoutReal p, q; //Use to set parameters in constructing trial functions
-    Field3D error1,
-        absolute_error1; //Absolute value of relative error: abs( (f1-sol1)/f1 )
-    BoutReal max_error1; //Output of test
+    Options dump;
 
+    // Solving equations of the form d*Delp2(f) + 1/c*Grad_perp(c).Grad_perp(f) + a*f = b for various f, a, c, d
     using bout::globals::mesh;
 
     // Only Neumann x-boundary conditions are implemented so far, so test functions should be Neumann in x and periodic in z.
     // Use Field3D's, but solver only works on FieldPerp slices, so only use 1 y-point
-    BoutReal nx = mesh->GlobalNx - 2 * mesh->xstart - 1;
-    BoutReal nz = mesh->GlobalNz;
 
-    ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+    /////////////////////////////////////////////////////
     // Test 1: Gaussian x-profiles, 2nd order Krylov
-    p = 0.39503274;
-    q = 0.20974396;
-    f1.allocate();
-    for (int jx = mesh->xstart; jx <= mesh->xend; jx++) {
-      for (int jy = 0; jy < mesh->LocalNy; jy++) {
-        for (int jz = 0; jz < mesh->LocalNz; jz++) {
-          BoutReal x = BoutReal(mesh->getGlobalXIndex(jx) - mesh->xstart) / nx;
-          BoutReal z = BoutReal(jz) / nz;
-          f1(jx, jy, jz) =
-              0. + exp(-(100. * pow(x - p, 2) + 1. - cos(2. * PI * (z - q))))
-              - 50.
-                    * (2. * p * exp(-100. * pow(-p, 2)) * x
-                       + (-p * exp(-100. * pow(-p, 2))
-                          - (1 - p) * exp(-100. * pow(1 - p, 2)))
-                             * pow(x, 2))
-                    * exp(-(
-                        1.
-                        - cos(2. * PI
-                              * (z - q)))) //make the gradients zero at both x-boundaries
-              ;
-          ASSERT0(finite(f1(jx, jy, jz)));
-        }
-      }
-    }
-    if (mesh->firstX()) {
-      for (int jx = mesh->xstart - 1; jx >= 0; jx--) {
-        for (int jy = 0; jy < mesh->LocalNy; jy++) {
-          for (int jz = 0; jz < mesh->LocalNz; jz++) {
-            BoutReal x = BoutReal(mesh->getGlobalXIndex(jx) - mesh->xstart) / nx;
-            BoutReal z = BoutReal(jz) / nz;
-            f1(jx, jy, jz) =
-                0. + exp(-(60. * pow(x - p, 2) + 1. - cos(2. * PI * (z - q))))
-                - 50.
-                      * (2. * p * exp(-60. * pow(-p, 2)) * x
-                         + (-p * exp(-60. * pow(-p, 2))
-                            - (1 - p) * exp(-60. * pow(1 - p, 2)))
-                               * pow(x, 2))
-                      * exp(-(
-                          1.
-                          - cos(
-                              2. * PI
-                              * (z - q)))); //make the gradients zero at both x-boundaries
-            ASSERT0(finite(f1(jx, jy, jz)));
-          }
-        }
-      }
-    }
-    if (mesh->lastX()) {
-      for (int jx = mesh->xend + 1; jx < mesh->LocalNx; jx++) {
-        for (int jy = 0; jy < mesh->LocalNy; jy++) {
-          for (int jz = 0; jz < mesh->LocalNz; jz++) {
-            BoutReal x = BoutReal(mesh->getGlobalXIndex(jx) - mesh->xstart) / nx;
-            BoutReal z = BoutReal(jz) / nz;
-            f1(jx, jy, jz) =
-                0. + exp(-(60. * pow(x - p, 2) + 1. - cos(2. * PI * (z - q))))
-                - 50.
-                      * (2. * p * exp(-60. * pow(-p, 2)) * x
-                         + (-p * exp(-60. * pow(-p, 2))
-                            - (1 - p) * exp(-60. * pow(1 - p, 2)))
-                               * pow(x, 2))
-                      * exp(-(
-                          1.
-                          - cos(
-                              2. * PI
-                              * (z - q)))); //make the gradients zero at both x-boundaries
-            ASSERT0(finite(f1(jx, jy, jz)));
-          }
-        }
-      }
-    }
+    Field3D f_1 = generate_f1(*mesh);
+    Field3D a_1 = generate_a1(*mesh);
+    Field3D c_1 = generate_c1(*mesh);
+    Field3D d_1 = generate_d1(*mesh);
 
-    f1.applyBoundary("neumann");
-
-    p = 0.512547;
-    q = 0.30908712;
-    d1.allocate();
-    for (int jx = mesh->xstart; jx <= mesh->xend; jx++) {
-      for (int jy = 0; jy < mesh->LocalNy; jy++) {
-        for (int jz = 0; jz < mesh->LocalNz; jz++) {
-          BoutReal x = BoutReal(mesh->getGlobalXIndex(jx) - mesh->xstart) / nx;
-          BoutReal z = BoutReal(jz) / nz;
-          d1(jx, jy, jz) =
-              1. + 0.2 * exp(-50. * pow(x - p, 2) / 4.) * sin(2. * PI * (z - q) * 3.);
-        }
-      }
-    }
-    if (mesh->firstX()) {
-      for (int jx = mesh->xstart - 1; jx >= 0; jx--) {
-        for (int jy = 0; jy < mesh->LocalNy; jy++) {
-          for (int jz = 0; jz < mesh->LocalNz; jz++) {
-            BoutReal x = BoutReal(mesh->getGlobalXIndex(jx) - mesh->xstart) / nx;
-            BoutReal z = BoutReal(jz) / nz;
-            d1(jx, jy, jz) =
-                1. + 0.2 * exp(-50. * pow(x - p, 2) / 4.) * sin(2. * PI * (z - q) * 3.);
-            // 	  d1(jx, jy, jz) = d1(jx+1, jy, jz);
-          }
-        }
-      }
-    }
-    if (mesh->lastX()) {
-      for (int jx = mesh->xend + 1; jx < mesh->LocalNx; jx++) {
-        for (int jy = 0; jy < mesh->LocalNy; jy++) {
-          for (int jz = 0; jz < mesh->LocalNz; jz++) {
-            BoutReal x = BoutReal(mesh->getGlobalXIndex(jx) - mesh->xstart) / nx;
-            BoutReal z = BoutReal(jz) / nz;
-            d1(jx, jy, jz) =
-                1. + 0.2 * exp(-50. * pow(x - p, 2) / 4.) * sin(2. * PI * (z - q) * 3.);
-            // 	  d1(jx, jy, jz) = d1(jx-1, jy, jz);
-          }
-        }
-      }
-    }
+    mesh->communicate(f_1, a_1, c_1, d_1);
 
-    p = 0.18439023;
-    q = 0.401089473;
-    c1.allocate();
-    for (int jx = mesh->xstart; jx <= mesh->xend; jx++) {
-      for (int jy = 0; jy < mesh->LocalNy; jy++) {
-        for (int jz = 0; jz < mesh->LocalNz; jz++) {
-          BoutReal x = BoutReal(mesh->getGlobalXIndex(jx) - mesh->xstart) / nx;
-          BoutReal z = BoutReal(jz) / nz;
-          c1(jx, jy, jz) =
-              1. + 0.15 * exp(-50. * pow(x - p, 2) * 2.) * sin(2. * PI * (z - q) * 2.);
-        }
-      }
-    }
-    if (mesh->firstX()) {
-      for (int jx = mesh->xstart - 1; jx >= 0; jx--) {
-        for (int jy = 0; jy < mesh->LocalNy; jy++) {
-          for (int jz = 0; jz < mesh->LocalNz; jz++) {
-            BoutReal x = BoutReal(mesh->getGlobalXIndex(jx) - mesh->xstart) / nx;
-            BoutReal z = BoutReal(jz) / nz;
-            c1(jx, jy, jz) =
-                1. + 0.15 * exp(-50. * pow(x - p, 2) * 2.) * sin(2. * PI * (z - q) * 2.);
-            // 	  c1(jx, jy, jz) = c1(jx+1, jy, jz);
-          }
-        }
-      }
-    }
-    if (mesh->lastX()) {
-      for (int jx = mesh->xend + 1; jx < mesh->LocalNx; jx++) {
-        for (int jy = 0; jy < mesh->LocalNy; jy++) {
-          for (int jz = 0; jz < mesh->LocalNz; jz++) {
-            BoutReal x = BoutReal(mesh->getGlobalXIndex(jx) - mesh->xstart) / nx;
-            BoutReal z = BoutReal(jz) / nz;
-            c1(jx, jy, jz) =
-                1. + 0.15 * exp(-50. * pow(x - p, 2) * 2.) * sin(2. * PI * (z - q) * 2.);
-            // 	  c1(jx, jy, jz) = c1(jx-1, jy, jz);
-          }
-        }
+    const Field3D b_1 = forward_laplace(f_1, a_1, c_1, d_1);
+
+    int test_num = 0;
+    check_laplace(++test_num, "PETSc 2nd order", *invert, INVERT_AC_GRAD, INVERT_AC_GRAD,
+                  a_1, c_1, d_1, b_1, f_1, mesh->ystart, dump);
+
+    /////////////////////////////////////////////////
+    // Test 2: Gaussian x-profiles, 4th order Krylov
+
+    check_laplace(++test_num, "PETSc 4th order", *invert_4th, INVERT_AC_GRAD,
+                  INVERT_AC_GRAD, a_1, c_1, d_1, b_1, f_1, mesh->ystart, dump);
+
+    ////////////////////////////////////////////////////////////////////////////////////////
+    // Test 3+4: Gaussian x-profiles, z-independent coefficients and compare with SPT method
+
+    const Field2D a_3 = DC(a_1);
+    const Field2D c_3 = DC(c_1);
+    const Field2D d_3 = DC(d_1);
+    const Field3D b_3 = forward_laplace(f_1, a_3, c_3, d_3);
+
+    check_laplace(++test_num, "with coefficients constant in z, PETSc 2nd order", *invert,
+                  INVERT_AC_GRAD, INVERT_AC_GRAD, a_3, c_3, d_3, b_3, f_1, mesh->ystart,
+                  dump);
+
+    Options* SPT_options = Options::getRoot()->getSection("SPT");
+    auto invert_SPT = Laplacian::create(SPT_options);
+
+    check_laplace(++test_num, "with coefficients constant in z, default solver",
+                  *invert_SPT, INVERT_AC_GRAD, INVERT_AC_GRAD | INVERT_DC_GRAD, a_3, c_3,
+                  d_3, b_3, f_1, mesh->ystart, dump);
+
+    //////////////////////////////////////////////
+    // Test 5: Cosine x-profiles, 2nd order Krylov
+    Field3D f_5 = generate_f5(*mesh);
+    Field3D a_5 = generate_a5(*mesh);
+    Field3D c_5 = generate_c5(*mesh);
+    Field3D d_5 = generate_d5(*mesh);
+
+    mesh->communicate(f_5, a_5, c_5, d_5);
+
+    const Field3D b_5 = forward_laplace(f_5, a_5, c_5, d_5);
+
+    check_laplace(++test_num, "different profiles, PETSc 2nd order", *invert,
+                  INVERT_AC_GRAD, INVERT_AC_GRAD, a_5, c_5, d_5, b_5, f_5, mesh->ystart,
+                  dump);
+
+    //////////////////////////////////////////////
+    // Test 6: Cosine x-profiles, 4th order Krylov
+
+    check_laplace(++test_num, "different profiles, PETSc 4th order", *invert_4th,
+                  INVERT_AC_GRAD, INVERT_AC_GRAD, a_5, c_5, d_5, b_5, f_5, mesh->ystart,
+                  dump);
+
+    //////////////////////////////////////////////////////////////////////////////////////
+    // Test 7+8: Cosine x-profiles, z-independent coefficients and compare with SPT method
+
+    const Field2D a_7 = DC(a_5);
+    const Field2D c_7 = DC(c_5);
+    const Field2D d_7 = DC(d_5);
+    const Field3D b_7 = forward_laplace(f_5, a_7, c_7, d_7);
+
+    check_laplace(++test_num,
+                  "different profiles, with coefficients constant in z, PETSc 2nd order",
+                  *invert, INVERT_AC_GRAD, INVERT_AC_GRAD, a_7, c_7, d_7, b_7, f_5,
+                  mesh->ystart, dump);
+
+    check_laplace(++test_num,
+                  "different profiles, with coefficients constant in z, default solver",
+                  *invert_SPT, INVERT_AC_GRAD, INVERT_AC_GRAD | INVERT_DC_GRAD, a_7, c_7,
+                  d_7, b_7, f_5, mesh->ystart, dump);
+
+    // Write and close the output file
+    bout::writeDefaultOutputFile(dump);
+
+    MPI_Barrier(BoutComm::get()); // Wait for all processors to write data
+  }
+
+  bout::checkForUnusedOptions();
+
+  BoutFinalise();
+  return 0;
+}
+
+BoutReal max_error_at_ystart(const Field3D& error) { // BoutComm-wide max at y = ystart
+  const auto* mesh = error.getMesh();
+  BoutReal local_max_error = error(mesh->xstart, mesh->ystart, 0); // seed value
+  // Local pass: largest entry for jx in [xstart, xend] and every local z, no abs taken
+  for (int jx = mesh->xstart; jx <= mesh->xend; jx++) {
+    for (int jz = 0; jz < mesh->LocalNz; jz++) {
+      if (local_max_error < error(jx, mesh->ystart, jz)) {
+        local_max_error = error(jx, mesh->ystart, jz);
       }
     }
+  }
 
-    p = 0.612547;
-    q = 0.30908712;
-    a1.allocate();
-    for (int jx = mesh->xstart; jx <= mesh->xend; jx++) {
-      for (int jy = 0; jy < mesh->LocalNy; jy++) {
-        for (int jz = 0; jz < mesh->LocalNz; jz++) {
-          BoutReal x = BoutReal(mesh->getGlobalXIndex(jx) - mesh->xstart) / nx;
-          BoutReal z = BoutReal(jz) / nz;
-          a1(jx, jy, jz) =
-              -1. + 0.1 * exp(-50. * pow(x - p, 2) * 2.5) * sin(2. * PI * (z - q) * 7.);
+  BoutReal max_error = BoutNaN;
+  // Reduce the per-process maxima with MPI_MAX; the result lands in max_error.
+  MPI_Allreduce(&local_max_error, &max_error, 1, MPI_DOUBLE, MPI_MAX, BoutComm::get());
+  // NOTE(review): MPI_DOUBLE presumes BoutReal is double - confirm
+  return max_error;
+}
+
+void apply_flat_boundary(Field3D& bcoef) { // copy edge values into x-boundary cells
+  const Mesh& mesh = *bcoef.getMesh();
+  if (mesh.firstX()) { // cells jx < xstart, walking from xstart - 1 down to 0
+    for (int jx = mesh.xstart - 1; jx >= 0; jx--) {
+      for (int jy = 0; jy < mesh.LocalNy; jy++) {
+        for (int jz = 0; jz < mesh.LocalNz; jz++) {
+          bcoef(jx, jy, jz) = bcoef(jx + 1, jy, jz); // take the jx + 1 neighbour
         }
       }
     }
-    if (mesh->firstX()) {
-      for (int jx = mesh->xstart - 1; jx >= 0; jx--) {
-        for (int jy = 0; jy < mesh->LocalNy; jy++) {
-          for (int jz = 0; jz < mesh->LocalNz; jz++) {
-            BoutReal x = BoutReal(mesh->getGlobalXIndex(jx) - mesh->xstart) / nx;
-            BoutReal z = BoutReal(jz) / nz;
-            a1(jx, jy, jz) =
-                -1. + 0.1 * exp(-50. * pow(x - p, 2) * 2.5) * sin(2. * PI * (z - q) * 7.);
-            // 	  a1(jx, jy, jz) = a1(jx+1, jy, jz);
-          }
+  }
+  if (mesh.lastX()) { // cells jx > xend, walking from xend + 1 up to LocalNx - 1
+    for (int jx = mesh.xend + 1; jx < mesh.LocalNx; jx++) {
+      for (int jy = 0; jy < mesh.LocalNy; jy++) {
+        for (int jz = 0; jz < mesh.LocalNz; jz++) {
+          bcoef(jx, jy, jz) = bcoef(jx - 1, jy, jz); // take the jx - 1 neighbour
         }
       }
     }
-    if (mesh->lastX()) {
-      for (int jx = mesh->xend + 1; jx < mesh->LocalNx; jx++) {
-        for (int jy = 0; jy < mesh->LocalNy; jy++) {
-          for (int jz = 0; jz < mesh->LocalNz; jz++) {
-            BoutReal x = BoutReal(mesh->getGlobalXIndex(jx) - mesh->xstart) / nx;
-            BoutReal z = BoutReal(jz) / nz;
-            a1(jx, jy, jz) =
-                -1. + 0.1 * exp(-50. * pow(x - p, 2) * 2.5) * sin(2. * PI * (z - q) * 7.);
-            // 	  a1(jx, jy, jz) = a1(jx-1, jy, jz);
-          }
-        }
+  }
+}
+
+Field3D generate_f1(const Mesh& mesh) { // builds the field f used by tests 1-4 in main
+  const BoutReal nx = mesh.GlobalNx - 2 * mesh.xstart - 1; // divisor for x below
+  const BoutReal nz = mesh.GlobalNz;                       // divisor for z below
+  // p and q enter the formula only as the offsets (x - p) and (z - q)
+  constexpr BoutReal p = 0.39503274; // NOLINT
+  constexpr BoutReal q = 0.20974396; // NOLINT
+
+  Field3D result;
+  result.allocate();
+  for (int jx = mesh.xstart; jx <= mesh.xend; jx++) { // main range [xstart, xend]
+    const BoutReal x = BoutReal(mesh.getGlobalXIndex(jx) - mesh.xstart) / nx;
+    for (int jy = 0; jy < mesh.LocalNy; jy++) {
+      for (int jz = 0; jz < mesh.LocalNz; jz++) {
+        const BoutReal z = BoutReal(jz) / nz;
+        // the subtracted x-polynomial term makes the gradients zero at both x-boundaries
+        result(jx, jy, jz) = 0.
+                             + exp(-(100. * pow(x - p, 2) + 1. - cos(2. * PI * (z - q))))
+                             - 50.
+                                   * (2. * p * exp(-100. * pow(-p, 2)) * x
+                                      + (-p * exp(-100. * pow(-p, 2))
+                                         - (1 - p) * exp(-100. * pow(1 - p, 2)))
+                                            * pow(x, 2))
+                                   * exp(-(1. - cos(2. * PI * (z - q))));
       }
     }
+  }
+  if (mesh.firstX()) { // NOTE(review): jx < xstart uses 60 where [xstart, xend] uses 100
+    for (int jx = mesh.xstart - 1; jx >= 0; jx--) {
+      const BoutReal x = BoutReal(mesh.getGlobalXIndex(jx) - mesh.xstart) / nx;
 
-    checkData(f1);
-    checkData(a1);
-    checkData(c1);
-    checkData(d1);
-
-    mesh->communicate(f1, a1, c1, d1);
-
-    b1 = d1 * Delp2(f1) + Grad_perp(c1) * Grad_perp(f1) / c1 + a1 * f1;
-
-    if (mesh->firstX()) {
-      for (int jx = mesh->xstart - 1; jx >= 0; jx--) {
-        for (int jy = 0; jy < mesh->LocalNy; jy++) {
-          for (int jz = 0; jz < mesh->LocalNz; jz++) {
-            b1(jx, jy, jz) = b1(jx + 1, jy, jz);
-          }
+      for (int jy = 0; jy < mesh.LocalNy; jy++) {
+        for (int jz = 0; jz < mesh.LocalNz; jz++) {
+          const BoutReal z = BoutReal(jz) / nz;
+          //make the gradients zero at both x-boundaries
+          result(jx, jy, jz) = 0.
+                               + exp(-(60. * pow(x - p, 2) + 1. - cos(2. * PI * (z - q))))
+                               - 50.
+                                     * (2. * p * exp(-60. * pow(-p, 2)) * x
+                                        + (-p * exp(-60. * pow(-p, 2))
+                                           - (1 - p) * exp(-60. * pow(1 - p, 2)))
+                                              * pow(x, 2))
+                                     * exp(-(1. - cos(2. * PI * (z - q))));
         }
       }
     }
-    if (mesh->lastX()) {
-      for (int jx = mesh->xend + 1; jx < mesh->LocalNx; jx++) {
-        for (int jy = 0; jy < mesh->LocalNy; jy++) {
-          for (int jz = 0; jz < mesh->LocalNz; jz++) {
-            b1(jx, jy, jz) = b1(jx - 1, jy, jz);
-          }
+  }
+  if (mesh.lastX()) { // NOTE(review): jx > xend also uses 60 instead of 100 - confirm
+    for (int jx = mesh.xend + 1; jx < mesh.LocalNx; jx++) {
+      const BoutReal x = BoutReal(mesh.getGlobalXIndex(jx) - mesh.xstart) / nx;
+      for (int jy = 0; jy < mesh.LocalNy; jy++) {
+        for (int jz = 0; jz < mesh.LocalNz; jz++) {
+          const BoutReal z = BoutReal(jz) / nz;
+          //make the gradients zero at both x-boundaries
+          result(jx, jy, jz) = 0.
+                               + exp(-(60. * pow(x - p, 2) + 1. - cos(2. * PI * (z - q))))
+                               - 50.
+                                     * (2. * p * exp(-60. * pow(-p, 2)) * x
+                                        + (-p * exp(-60. * pow(-p, 2))
+                                           - (1 - p) * exp(-60. * pow(1 - p, 2)))
+                                              * pow(x, 2))
+                                     * exp(-(1. - cos(2. * PI * (z - q))));
         }
       }
     }
+  }
 
-    invert->setInnerBoundaryFlags(INVERT_AC_GRAD);
-    invert->setOuterBoundaryFlags(INVERT_AC_GRAD);
-    invert->setCoefA(a1);
-    invert->setCoefC(c1);
-    invert->setCoefD(d1);
-
-    checkData(b1);
-
-    try {
-      sol1 = invert->solve(sliceXZ(b1, mesh->ystart));
-      error1 = (f1 - sol1) / f1;
-      absolute_error1 = f1 - sol1;
-      //     max_error1 = max_error_at_ystart(abs(error1));
-      max_error1 = max_error_at_ystart(abs(absolute_error1));
-    } catch (BoutException& err) {
-      output << "BoutException occured in invert->solve(b1): " << err.what() << endl;
-      max_error1 = -1;
-    }
+  checkData(result); // validate before the boundary condition is applied
+  result.applyBoundary("neumann"); // NOTE(review): presumably resets x-boundary cells
+  return result;
+}
 
-    output << endl << "Test 1: PETSc 2nd order" << endl;
-    //   output<<"Time to set up is "<<Timer::getTime("petscsetup")<<". Time to solve is "<<Timer::getTime("petscsolve")<<endl;
-    //   output<<"Magnitude of maximum relative error is "<<max_error1<<endl;
-    output << "Magnitude of maximum absolute error is " << max_error1 << endl;
-    //   Timer::resetTime("petscsetup");
-    //   Timer::resetTime("petscsolve");
+Field3D generate_d1(const Mesh& mesh) { // coefficient d: tests 1-2, DC(d) in tests 3-4
+  const BoutReal nx = mesh.GlobalNx - 2 * mesh.xstart - 1; // divisor for x below
+  const BoutReal nz = mesh.GlobalNz;                       // divisor for z below
 
-    Options dump;
-    dump["a1"] = a1;
-    dump["b1"] = b1;
-    dump["c1"] = c1;
-    dump["d1"] = d1;
-    dump["f1"] = f1;
-    dump["sol1"] = sol1;
-    dump["error1"] = error1;
-    dump["absolute_error1"] = absolute_error1;
-    dump["max_error1"] = max_error1;
-
-    ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-    // Test 2: Gaussian x-profiles, 4th order Krylov
-    Field3D sol2;
-    Field3D error2,
-        absolute_error2; //Absolute value of relative error: abs( (f3-sol3)/f3 )
-    BoutReal max_error2; //Output of test
-
-    invert_4th->setInnerBoundaryFlags(INVERT_AC_GRAD);
-    invert_4th->setOuterBoundaryFlags(INVERT_AC_GRAD);
-    invert_4th->setGlobalFlags(INVERT_4TH_ORDER);
-    invert_4th->setCoefA(a1);
-    invert_4th->setCoefC(c1);
-    invert_4th->setCoefD(d1);
-
-    try {
-      sol2 = invert_4th->solve(sliceXZ(b1, mesh->ystart));
-      error2 = (f1 - sol2) / f1;
-      absolute_error2 = f1 - sol2;
-      //     max_error2 = max_error_at_ystart(abs(error2));
-      max_error2 = max_error_at_ystart(abs(absolute_error2));
-    } catch (BoutException& err) {
-      output << "BoutException occured in invert->solve(b1): " << err.what() << endl;
-      max_error2 = -1;
+  constexpr BoutReal p = 0.512547;   // NOLINT
+  constexpr BoutReal q = 0.30908712; // NOLINT
+  Field3D result;
+  result.allocate();
+  for (int jx = mesh.xstart; jx <= mesh.xend; jx++) { // main range [xstart, xend]
+    const BoutReal x = BoutReal(mesh.getGlobalXIndex(jx) - mesh.xstart) / nx;
+    for (int jy = 0; jy < mesh.LocalNy; jy++) {
+      for (int jz = 0; jz < mesh.LocalNz; jz++) {
+        const BoutReal z = BoutReal(jz) / nz;
+        result(jx, jy, jz) = // 1 + 0.2 * Gaussian in (x - p) * sin(3 * 2*pi*(z - q))
+            1. + 0.2 * exp(-50. * pow(x - p, 2) / 4.) * sin(2. * PI * (z - q) * 3.);
+      }
     }
-
-    output << endl << "Test 2: PETSc 4th order" << endl;
-    //   output<<"Time to set up is "<<Timer::getTime("petscsetup")<<". Time to solve is "<<Timer::getTime("petscsolve")<<endl;
-    //   output<<"Magnitude of maximum relative error is "<<max_error2<<endl;
-    output << "Magnitude of maximum absolute error is " << max_error2 << endl;
-    //   Timer::resetTime("petscsetup");
-    //   Timer::resetTime("petscsolve");
-
-    dump["a2"] = a1;
-    dump["b2"] = b1;
-    dump["c2"] = c1;
-    dump["d2"] = d1;
-    dump["f2"] = f1;
-    dump["sol2"] = sol2;
-    dump["error2"] = error2;
-    dump["absolute_error2"] = absolute_error2;
-    dump["max_error2"] = max_error2;
-
-    ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-    // Test 3+4: Gaussian x-profiles, z-independent coefficients and compare with SPT method
-    Field2D a3, c3, d3;
-    Field3D b3;
-    Field3D sol3, sol4;
-    Field3D error3, absolute_error3, error4, absolute_error4;
-    BoutReal max_error3, max_error4;
-
-    a3 = DC(a1);
-    c3 = DC(c1);
-    d3 = DC(d1);
-    b3 = d3 * Delp2(f1) + Grad_perp(c3) * Grad_perp(f1) / c3 + a3 * f1;
-    if (mesh->firstX()) {
-      for (int jx = mesh->xstart - 1; jx >= 0; jx--) {
-        for (int jy = 0; jy < mesh->LocalNy; jy++) {
-          for (int jz = 0; jz < mesh->LocalNz; jz++) {
-            b3(jx, jy, jz) = b3(jx + 1, jy, jz);
-          }
+  }
+  if (mesh.firstX()) { // cells jx < xstart: same formula as the main range
+    for (int jx = mesh.xstart - 1; jx >= 0; jx--) {
+      const BoutReal x = BoutReal(mesh.getGlobalXIndex(jx) - mesh.xstart) / nx;
+      for (int jy = 0; jy < mesh.LocalNy; jy++) {
+        for (int jz = 0; jz < mesh.LocalNz; jz++) {
+          const BoutReal z = BoutReal(jz) / nz;
+          result(jx, jy, jz) =
+              1. + 0.2 * exp(-50. * pow(x - p, 2) / 4.) * sin(2. * PI * (z - q) * 3.);
         }
       }
     }
-    if (mesh->lastX()) {
-      for (int jx = mesh->xend + 1; jx < mesh->LocalNx; jx++) {
-        for (int jy = 0; jy < mesh->LocalNy; jy++) {
-          for (int jz = 0; jz < mesh->LocalNz; jz++) {
-            b3(jx, jy, jz) = b3(jx - 1, jy, jz);
-          }
+  }
+  if (mesh.lastX()) { // cells jx > xend: same formula as the main range
+    for (int jx = mesh.xend + 1; jx < mesh.LocalNx; jx++) {
+      const BoutReal x = BoutReal(mesh.getGlobalXIndex(jx) - mesh.xstart) / nx;
+      for (int jy = 0; jy < mesh.LocalNy; jy++) {
+        for (int jz = 0; jz < mesh.LocalNz; jz++) {
+          const BoutReal z = BoutReal(jz) / nz;
+          result(jx, jy, jz) =
+              1. + 0.2 * exp(-50. * pow(x - p, 2) / 4.) * sin(2. * PI * (z - q) * 3.);
         }
       }
     }
+  }
+  checkData(result); // validate the generated values before returning
+  return result;
+}
 
-    invert->setInnerBoundaryFlags(INVERT_AC_GRAD);
-    invert->setOuterBoundaryFlags(INVERT_AC_GRAD);
-    invert->setCoefA(a3);
-    invert->setCoefC(c3);
-    invert->setCoefD(d3);
-
-    try {
-      sol3 = invert->solve(sliceXZ(b3, mesh->ystart));
-      error3 = (f1 - sol3) / f1;
-      absolute_error3 = f1 - sol3;
-      //     max_error3 = max_error_at_ystart(abs(error3));
-      max_error3 = max_error_at_ystart(abs(absolute_error3));
-    } catch (BoutException& err) {
-      output << "BoutException occured in invert->solve(b3): " << err.what() << endl;
-      max_error3 = -1;
-    }
+Field3D generate_c1(const Mesh& mesh) { // coefficient c: tests 1-2, DC(c) in tests 3-4
+  const BoutReal nx = mesh.GlobalNx - 2 * mesh.xstart - 1; // divisor for x below
+  const BoutReal nz = mesh.GlobalNz;                       // divisor for z below
 
-    output << endl << "Test 3: with coefficients constant in z, PETSc 2nd order" << endl;
-    //   output<<"Time to set up is "<<Timer::getTime("petscsetup")<<". Time to solve is "<<Timer::getTime("petscsolve")<<endl;
-    //   output<<"Magnitude of maximum relative error is "<<max_error3<<endl;
-    output << "Magnitude of maximum absolute error is " << max_error3 << endl;
-    //   Timer::resetTime("petscsetup");
-    //   Timer::resetTime("petscsolve");
-
-    dump["a3"] = a3;
-    dump["b3"] = b3;
-    dump["c3"] = c3;
-    dump["d3"] = d3;
-    dump["f3"] = f1;
-    dump["sol3"] = sol3;
-    dump["error3"] = error3;
-    dump["absolute_error3"] = absolute_error3;
-    dump["max_error3"] = max_error3;
-
-    Options* SPT_options;
-    SPT_options = Options::getRoot()->getSection("SPT");
-    auto invert_SPT = Laplacian::create(SPT_options);
-    invert_SPT->setInnerBoundaryFlags(INVERT_AC_GRAD);
-    invert_SPT->setOuterBoundaryFlags(INVERT_AC_GRAD | INVERT_DC_GRAD);
-    invert_SPT->setCoefA(a3);
-    invert_SPT->setCoefC(c3);
-    invert_SPT->setCoefD(d3);
-
-    sol4 = invert_SPT->solve(sliceXZ(b3, mesh->ystart));
-    error4 = (f1 - sol4) / f1;
-    absolute_error4 = f1 - sol4;
-    //   max_error4 = max_error_at_ystart(abs(error4));
-    max_error4 = max_error_at_ystart(abs(absolute_error4));
-
-    output << endl << "Test 4: with coefficients constant in z, default solver" << endl;
-    //   output<<"Time to set up is "<<Timer::getTime("petscsetup")<<". Time to solve is "<<Timer::getTime("petscsolve")<<endl;
-    //   output<<"Magnitude of maximum relative error is "<<max_error4<<endl;
-    output << "Magnitude of maximum absolute error is " << max_error4 << endl;
-    //   Timer::resetTime("petscsetup");
-    //   Timer::resetTime("petscsolve");
-
-    dump["a4"] = a3;
-    dump["b4"] = b3;
-    dump["c4"] = c3;
-    dump["d4"] = d3;
-    dump["f4"] = f1;
-    dump["sol4"] = sol4;
-    dump["error4"] = error4;
-    dump["absolute_error4"] = absolute_error4;
-    dump["max_error4"] = max_error4;
-
-    ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-    // Test 5: Cosine x-profiles, 2nd order Krylov
-    Field3D f5, a5, b5, c5, d5, sol5;
-    Field3D error5,
-        absolute_error5; //Absolute value of relative error: abs( (f5-sol5)/f5 )
-    BoutReal max_error5; //Output of test
-
-    p = 0.623901;
-    q = 0.01209489;
-    f5.allocate();
-    for (int jx = mesh->xstart; jx <= mesh->xend; jx++) {
-      for (int jy = 0; jy < mesh->LocalNy; jy++) {
-        for (int jz = 0; jz < mesh->LocalNz; jz++) {
-          BoutReal x = BoutReal(mesh->getGlobalXIndex(jx) - mesh->xstart) / nx;
-          BoutReal z = BoutReal(jz) / nz;
-          f5(jx, jy, jz) =
-              0. + exp(-(50. * pow(x - p, 2) + 1. - cos(2. * PI * (z - q))))
-              - 50.
-                    * (2. * p * exp(-50. * pow(-p, 2)) * x
-                       + (-p * exp(-50. * pow(-p, 2))
-                          - (1 - p) * exp(-50. * pow(1 - p, 2)))
-                             * pow(x, 2))
-                    * exp(-(
-                        1.
-                        - cos(2. * PI
-                              * (z - q)))) //make the gradients zero at both x-boundaries
-              ;
-        }
+  constexpr BoutReal p = 0.18439023;  // NOLINT
+  constexpr BoutReal q = 0.401089473; // NOLINT
+  Field3D result;
+  result.allocate();
+  for (int jx = mesh.xstart; jx <= mesh.xend; jx++) { // main range [xstart, xend]
+    const BoutReal x = BoutReal(mesh.getGlobalXIndex(jx) - mesh.xstart) / nx;
+    for (int jy = 0; jy < mesh.LocalNy; jy++) {
+      for (int jz = 0; jz < mesh.LocalNz; jz++) {
+        const BoutReal z = BoutReal(jz) / nz;
+        result(jx, jy, jz) = // 1 + 0.15 * Gaussian in (x - p) * sin(2 * 2*pi*(z - q))
+            1. + 0.15 * exp(-50. * pow(x - p, 2) * 2.) * sin(2. * PI * (z - q) * 2.);
       }
     }
-    if (mesh->firstX()) {
-      for (int jx = mesh->xstart - 1; jx >= 0; jx--) {
-        for (int jy = 0; jy < mesh->LocalNy; jy++) {
-          for (int jz = 0; jz < mesh->LocalNz; jz++) {
-            BoutReal x = BoutReal(mesh->getGlobalXIndex(jx) - mesh->xstart) / nx;
-            BoutReal z = BoutReal(jz) / nz;
-            f5(jx, jy, jz) =
-                0. + exp(-(50. * pow(x - p, 2) + 1. - cos(2. * PI * (z - q))))
-                - 50.
-                      * (2. * p * exp(-50. * pow(-p, 2)) * x
-                         + (-p * exp(-50. * pow(-p, 2))
-                            - (1 - p) * exp(-50. * pow(1 - p, 2)))
-                               * pow(x, 2))
-                      * exp(-(
-                          1.
-                          - cos(
-                              2. * PI
-                              * (z - q)))); //make the gradients zero at both x-boundaries
-          }
+  }
+  if (mesh.firstX()) { // cells jx < xstart: same formula as the main range
+    for (int jx = mesh.xstart - 1; jx >= 0; jx--) {
+      const BoutReal x = BoutReal(mesh.getGlobalXIndex(jx) - mesh.xstart) / nx;
+      for (int jy = 0; jy < mesh.LocalNy; jy++) {
+        for (int jz = 0; jz < mesh.LocalNz; jz++) {
+          const BoutReal z = BoutReal(jz) / nz;
+          result(jx, jy, jz) =
+              1. + 0.15 * exp(-50. * pow(x - p, 2) * 2.) * sin(2. * PI * (z - q) * 2.);
         }
       }
     }
-    if (mesh->lastX()) {
-      for (int jx = mesh->xend + 1; jx < mesh->LocalNx; jx++) {
-        for (int jy = 0; jy < mesh->LocalNy; jy++) {
-          for (int jz = 0; jz < mesh->LocalNz; jz++) {
-            BoutReal x = BoutReal(mesh->getGlobalXIndex(jx) - mesh->xstart) / nx;
-            BoutReal z = BoutReal(jz) / nz;
-            f5(jx, jy, jz) =
-                0. + exp(-(50. * pow(x - p, 2) + 1. - cos(2. * PI * (z - q))))
-                - 50.
-                      * (2. * p * exp(-50. * pow(-p, 2)) * x
-                         + (-p * exp(-50. * pow(-p, 2))
-                            - (1 - p) * exp(-50. * pow(1 - p, 2)))
-                               * pow(x, 2))
-                      * exp(-(
-                          1.
-                          - cos(
-                              2. * PI
-                              * (z - q)))); //make the gradients zero at both x-boundaries
-          }
+  }
+  if (mesh.lastX()) { // cells jx > xend: same formula as the main range
+    for (int jx = mesh.xend + 1; jx < mesh.LocalNx; jx++) {
+      const BoutReal x = BoutReal(mesh.getGlobalXIndex(jx) - mesh.xstart) / nx;
+      for (int jy = 0; jy < mesh.LocalNy; jy++) {
+        for (int jz = 0; jz < mesh.LocalNz; jz++) {
+          const BoutReal z = BoutReal(jz) / nz;
+          result(jx, jy, jz) =
+              1. + 0.15 * exp(-50. * pow(x - p, 2) * 2.) * sin(2. * PI * (z - q) * 2.);
         }
       }
     }
+  }
 
-    p = 0.63298589;
-    q = 0.889237890;
-    d5.allocate();
-    for (int jx = mesh->xstart; jx <= mesh->xend; jx++) {
-      for (int jy = 0; jy < mesh->LocalNy; jy++) {
-        for (int jz = 0; jz < mesh->LocalNz; jz++) {
-          BoutReal x = BoutReal(mesh->getGlobalXIndex(jx) - mesh->xstart) / nx;
-          BoutReal z = BoutReal(jz) / nz;
-          d5(jx, jy, jz) = 1. + p * cos(2. * PI * x) * sin(2. * PI * (z - q) * 3.);
-        }
+  checkData(result); // validate the generated values before returning
+  return result;
+}
+
+Field3D generate_a1(const Mesh& mesh) {
+  const BoutReal nx = mesh.GlobalNx - 2 * mesh.xstart - 1;
+  const BoutReal nz = mesh.GlobalNz;
+
+  constexpr BoutReal p = 0.612547;   // NOLINT
+  constexpr BoutReal q = 0.30908712; // NOLINT
+  Field3D result;
+  result.allocate();
+  for (int jx = mesh.xstart; jx <= mesh.xend; jx++) {
+    const BoutReal x = BoutReal(mesh.getGlobalXIndex(jx) - mesh.xstart) / nx;
+    for (int jy = 0; jy < mesh.LocalNy; jy++) {
+      for (int jz = 0; jz < mesh.LocalNz; jz++) {
+        const BoutReal z = BoutReal(jz) / nz;
+        result(jx, jy, jz) =
+            -1. + 0.1 * exp(-50. * pow(x - p, 2) * 2.5) * sin(2. * PI * (z - q) * 7.);
       }
     }
-    if (mesh->firstX()) {
-      for (int jx = mesh->xstart - 1; jx >= 0; jx--) {
-        for (int jy = 0; jy < mesh->LocalNy; jy++) {
-          for (int jz = 0; jz < mesh->LocalNz; jz++) {
-            BoutReal x = BoutReal(mesh->getGlobalXIndex(jx) - mesh->xstart) / nx;
-            BoutReal z = BoutReal(jz) / nz;
-            d5(jx, jy, jz) = 1. + p * cos(2. * PI * x) * sin(2. * PI * (z - q) * 3.);
-          }
+  }
+  if (mesh.firstX()) {
+    for (int jx = mesh.xstart - 1; jx >= 0; jx--) {
+      const BoutReal x = BoutReal(mesh.getGlobalXIndex(jx) - mesh.xstart) / nx;
+      for (int jy = 0; jy < mesh.LocalNy; jy++) {
+        for (int jz = 0; jz < mesh.LocalNz; jz++) {
+          const BoutReal z = BoutReal(jz) / nz;
+          result(jx, jy, jz) =
+              -1. + 0.1 * exp(-50. * pow(x - p, 2) * 2.5) * sin(2. * PI * (z - q) * 7.);
         }
       }
     }
-    if (mesh->lastX()) {
-      for (int jx = mesh->xend + 1; jx < mesh->LocalNx; jx++) {
-        for (int jy = 0; jy < mesh->LocalNy; jy++) {
-          for (int jz = 0; jz < mesh->LocalNz; jz++) {
-            BoutReal x = BoutReal(mesh->getGlobalXIndex(jx) - mesh->xstart) / nx;
-            BoutReal z = BoutReal(jz) / nz;
-            d5(jx, jy, jz) = 1. + p * cos(2. * PI * x) * sin(2. * PI * (z - q) * 3.);
-          }
+  }
+  if (mesh.lastX()) {
+    for (int jx = mesh.xend + 1; jx < mesh.LocalNx; jx++) {
+      const BoutReal x = BoutReal(mesh.getGlobalXIndex(jx) - mesh.xstart) / nx;
+      for (int jy = 0; jy < mesh.LocalNy; jy++) {
+        for (int jz = 0; jz < mesh.LocalNz; jz++) {
+          const BoutReal z = BoutReal(jz) / nz;
+          result(jx, jy, jz) =
+              -1. + 0.1 * exp(-50. * pow(x - p, 2) * 2.5) * sin(2. * PI * (z - q) * 7.);
         }
       }
     }
+  }
 
-    p = 0.160983834;
-    q = 0.73050121087;
-    c5.allocate();
-    for (int jx = mesh->xstart; jx <= mesh->xend; jx++) {
-      for (int jy = 0; jy < mesh->LocalNy; jy++) {
-        for (int jz = 0; jz < mesh->LocalNz; jz++) {
-          BoutReal x = BoutReal(mesh->getGlobalXIndex(jx) - mesh->xstart) / nx;
-          BoutReal z = BoutReal(jz) / nz;
-          c5(jx, jy, jz) = 1. + p * cos(2. * PI * x * 5) * sin(2. * PI * (z - q) * 2.);
-        }
+  checkData(result);
+  return result;
+}
+
+Field3D generate_f5(const Mesh& mesh) {
+  const BoutReal nx = mesh.GlobalNx - 2 * mesh.xstart - 1;
+  const BoutReal nz = mesh.GlobalNz;
+  constexpr BoutReal p = 0.623901;   // NOLINT
+  constexpr BoutReal q = 0.01209489; // NOLINT
+  Field3D result;
+  result.allocate();
+  for (int jx = mesh.xstart; jx <= mesh.xend; jx++) {
+    const BoutReal x = BoutReal(mesh.getGlobalXIndex(jx) - mesh.xstart) / nx;
+    for (int jy = 0; jy < mesh.LocalNy; jy++) {
+      for (int jz = 0; jz < mesh.LocalNz; jz++) {
+        const BoutReal z = BoutReal(jz) / nz;
+        //make the gradients zero at both x-boundaries
+        result(jx, jy, jz) =
+            0. + exp(-(50. * pow(x - p, 2) + 1. - cos(2. * PI * (z - q))))
+            - 50.
+                  * (2. * p * exp(-50. * pow(-p, 2)) * x
+                     + (-p * exp(-50. * pow(-p, 2)) - (1 - p) * exp(-50. * pow(1 - p, 2)))
+                           * pow(x, 2))
+                  * exp(-(1. - cos(2. * PI * (z - q))));
       }
     }
-    if (mesh->firstX()) {
-      for (int jx = mesh->xstart - 1; jx >= 0; jx--) {
-        for (int jy = 0; jy < mesh->LocalNy; jy++) {
-          for (int jz = 0; jz < mesh->LocalNz; jz++) {
-            BoutReal x = BoutReal(mesh->getGlobalXIndex(jx) - mesh->xstart) / nx;
-            BoutReal z = BoutReal(jz) / nz;
-            c5(jx, jy, jz) = 1. + p * cos(2. * PI * x * 5) * sin(2. * PI * (z - q) * 2.);
-          }
+  }
+  if (mesh.firstX()) {
+    for (int jx = mesh.xstart - 1; jx >= 0; jx--) {
+      const BoutReal x = BoutReal(mesh.getGlobalXIndex(jx) - mesh.xstart) / nx;
+      for (int jy = 0; jy < mesh.LocalNy; jy++) {
+        for (int jz = 0; jz < mesh.LocalNz; jz++) {
+          const BoutReal z = BoutReal(jz) / nz;
+          //make the gradients zero at both x-boundaries
+          result(jx, jy, jz) = 0.
+                               + exp(-(50. * pow(x - p, 2) + 1. - cos(2. * PI * (z - q))))
+                               - 50.
+                                     * (2. * p * exp(-50. * pow(-p, 2)) * x
+                                        + (-p * exp(-50. * pow(-p, 2))
+                                           - (1 - p) * exp(-50. * pow(1 - p, 2)))
+                                              * pow(x, 2))
+                                     * exp(-(1. - cos(2. * PI * (z - q))));
         }
       }
     }
-    if (mesh->lastX()) {
-      for (int jx = mesh->xend + 1; jx < mesh->LocalNx; jx++) {
-        for (int jy = 0; jy < mesh->LocalNy; jy++) {
-          for (int jz = 0; jz < mesh->LocalNz; jz++) {
-            BoutReal x = BoutReal(mesh->getGlobalXIndex(jx) - mesh->xstart) / nx;
-            BoutReal z = BoutReal(jz) / nz;
-            c5(jx, jy, jz) = 1. + p * cos(2. * PI * x * 5) * sin(2. * PI * (z - q) * 2.);
-          }
+  }
+  if (mesh.lastX()) {
+    for (int jx = mesh.xend + 1; jx < mesh.LocalNx; jx++) {
+      const BoutReal x = BoutReal(mesh.getGlobalXIndex(jx) - mesh.xstart) / nx;
+      for (int jy = 0; jy < mesh.LocalNy; jy++) {
+        for (int jz = 0; jz < mesh.LocalNz; jz++) {
+          const BoutReal z = BoutReal(jz) / nz;
+          //make the gradients zero at both x-boundaries
+          result(jx, jy, jz) = 0.
+                               + exp(-(50. * pow(x - p, 2) + 1. - cos(2. * PI * (z - q))))
+                               - 50.
+                                     * (2. * p * exp(-50. * pow(-p, 2)) * x
+                                        + (-p * exp(-50. * pow(-p, 2))
+                                           - (1 - p) * exp(-50. * pow(1 - p, 2)))
+                                              * pow(x, 2))
+                                     * exp(-(1. - cos(2. * PI * (z - q))));
         }
       }
     }
+  }
+  result.applyBoundary("neumann");
+  checkData(result);
+  return result;
+}
 
-    p = 0.5378950;
-    q = 0.2805870;
-    a5.allocate();
-    for (int jx = mesh->xstart; jx <= mesh->xend; jx++) {
-      for (int jy = 0; jy < mesh->LocalNy; jy++) {
-        for (int jz = 0; jz < mesh->LocalNz; jz++) {
-          BoutReal x = BoutReal(mesh->getGlobalXIndex(jx) - mesh->xstart) / nx;
-          BoutReal z = BoutReal(jz) / nz;
-          a5(jx, jy, jz) = -1. + p * cos(2. * PI * x * 2.) * sin(2. * PI * (z - q) * 7.);
-        }
+Field3D generate_d5(const Mesh& mesh) {
+  const BoutReal nx = mesh.GlobalNx - 2 * mesh.xstart - 1;
+  const BoutReal nz = mesh.GlobalNz;
+  constexpr BoutReal p = 0.63298589;  // NOLINT
+  constexpr BoutReal q = 0.889237890; // NOLINT
+  Field3D result;
+  result.allocate();
+  for (int jx = mesh.xstart; jx <= mesh.xend; jx++) {
+    const BoutReal x = BoutReal(mesh.getGlobalXIndex(jx) - mesh.xstart) / nx;
+    for (int jy = 0; jy < mesh.LocalNy; jy++) {
+      for (int jz = 0; jz < mesh.LocalNz; jz++) {
+        const BoutReal z = BoutReal(jz) / nz;
+        result(jx, jy, jz) = 1. + p * cos(2. * PI * x) * sin(2. * PI * (z - q) * 3.);
       }
     }
-    if (mesh->firstX()) {
-      for (int jx = mesh->xstart - 1; jx >= 0; jx--) {
-        for (int jy = 0; jy < mesh->LocalNy; jy++) {
-          for (int jz = 0; jz < mesh->LocalNz; jz++) {
-            BoutReal x = BoutReal(mesh->getGlobalXIndex(jx) - mesh->xstart) / nx;
-            BoutReal z = BoutReal(jz) / nz;
-            a5(jx, jy, jz) =
-                -1. + p * cos(2. * PI * x * 2.) * sin(2. * PI * (z - q) * 7.);
-          }
+  }
+  if (mesh.firstX()) {
+    for (int jx = mesh.xstart - 1; jx >= 0; jx--) {
+      const BoutReal x = BoutReal(mesh.getGlobalXIndex(jx) - mesh.xstart) / nx;
+      for (int jy = 0; jy < mesh.LocalNy; jy++) {
+        for (int jz = 0; jz < mesh.LocalNz; jz++) {
+          const BoutReal z = BoutReal(jz) / nz;
+          result(jx, jy, jz) = 1. + p * cos(2. * PI * x) * sin(2. * PI * (z - q) * 3.);
         }
       }
     }
-    if (mesh->lastX()) {
-      for (int jx = mesh->xend + 1; jx < mesh->LocalNx; jx++) {
-        for (int jy = 0; jy < mesh->LocalNy; jy++) {
-          for (int jz = 0; jz < mesh->LocalNz; jz++) {
-            BoutReal x = BoutReal(mesh->getGlobalXIndex(jx) - mesh->xstart) / nx;
-            BoutReal z = BoutReal(jz) / nz;
-            a5(jx, jy, jz) =
-                -1. + p * cos(2. * PI * x * 2.) * sin(2. * PI * (z - q) * 7.);
-          }
+  }
+  if (mesh.lastX()) {
+    for (int jx = mesh.xend + 1; jx < mesh.LocalNx; jx++) {
+      const BoutReal x = BoutReal(mesh.getGlobalXIndex(jx) - mesh.xstart) / nx;
+      for (int jy = 0; jy < mesh.LocalNy; jy++) {
+        for (int jz = 0; jz < mesh.LocalNz; jz++) {
+          const BoutReal z = BoutReal(jz) / nz;
+          result(jx, jy, jz) = 1. + p * cos(2. * PI * x) * sin(2. * PI * (z - q) * 3.);
         }
       }
     }
+  }
+  checkData(result);
+  return result;
+}
 
-    f5.applyBoundary("neumann");
-    mesh->communicate(f5, a5, c5, d5);
+Field3D generate_c5(const Mesh& mesh) {
+  const BoutReal nx = mesh.GlobalNx - 2 * mesh.xstart - 1;
+  const BoutReal nz = mesh.GlobalNz;
+  constexpr BoutReal p = 0.160983834;   // NOLINT
+  constexpr BoutReal q = 0.73050121087; // NOLINT
 
-    b5 = d5 * Delp2(f5) + Grad_perp(c5) * Grad_perp(f5) / c5 + a5 * f5;
-    if (mesh->firstX()) {
-      for (int jx = mesh->xstart - 1; jx >= 0; jx--) {
-        for (int jy = 0; jy < mesh->LocalNy; jy++) {
-          for (int jz = 0; jz < mesh->LocalNz; jz++) {
-            b5(jx, jy, jz) = b5(jx + 1, jy, jz);
-          }
-        }
+  Field3D result;
+
+  result.allocate();
+  for (int jx = mesh.xstart; jx <= mesh.xend; jx++) {
+    const BoutReal x = BoutReal(mesh.getGlobalXIndex(jx) - mesh.xstart) / nx;
+    for (int jy = 0; jy < mesh.LocalNy; jy++) {
+      for (int jz = 0; jz < mesh.LocalNz; jz++) {
+        const BoutReal z = BoutReal(jz) / nz;
+        result(jx, jy, jz) = 1. + p * cos(2. * PI * x * 5) * sin(2. * PI * (z - q) * 2.);
       }
     }
-    if (mesh->lastX()) {
-      for (int jx = mesh->xend + 1; jx < mesh->LocalNx; jx++) {
-        for (int jy = 0; jy < mesh->LocalNy; jy++) {
-          for (int jz = 0; jz < mesh->LocalNz; jz++) {
-            b5(jx, jy, jz) = b5(jx - 1, jy, jz);
-          }
+  }
+  if (mesh.firstX()) {
+    for (int jx = mesh.xstart - 1; jx >= 0; jx--) {
+      const BoutReal x = BoutReal(mesh.getGlobalXIndex(jx) - mesh.xstart) / nx;
+      for (int jy = 0; jy < mesh.LocalNy; jy++) {
+        for (int jz = 0; jz < mesh.LocalNz; jz++) {
+          const BoutReal z = BoutReal(jz) / nz;
+          result(jx, jy, jz) =
+              1. + p * cos(2. * PI * x * 5) * sin(2. * PI * (z - q) * 2.);
         }
       }
     }
-
-    invert->setInnerBoundaryFlags(INVERT_AC_GRAD);
-    invert->setOuterBoundaryFlags(INVERT_AC_GRAD);
-    invert->setCoefA(a5);
-    invert->setCoefC(c5);
-    invert->setCoefD(d5);
-
-    try {
-      sol5 = invert->solve(sliceXZ(b5, mesh->ystart));
-      error5 = (f5 - sol5) / f5;
-      absolute_error5 = f5 - sol5;
-      //     max_error5 = max_error_at_ystart(abs(error5));
-      max_error5 = max_error_at_ystart(abs(absolute_error5));
-    } catch (BoutException& err) {
-      output << "BoutException occured in invert->solve(b5): " << err.what() << endl;
-      max_error5 = -1;
-    }
-
-    output << endl << "Test 5: different profiles, PETSc 2nd order" << endl;
-    //   output<<"Time to set up is "<<Timer::getTime("petscsetup")<<". Time to solve is "<<Timer::getTime("petscsolve")<<endl;
-    //   output<<"Magnitude of maximum relative error is "<<max_error5<<endl;
-    output << "Magnitude of maximum absolute error is " << max_error5 << endl;
-    //   Timer::resetTime("petscsetup");
-    //   Timer::resetTime("petscsolve");
-
-    dump["a5"] = a5;
-    dump["b5"] = b5;
-    dump["c5"] = c5;
-    dump["d5"] = d5;
-    dump["f5"] = f5;
-    dump["sol5"] = sol5;
-    dump["error5"] = error5;
-    dump["absolute_error5"] = absolute_error5;
-    dump["max_error5"] = max_error5;
-
-    ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-    // Test 6: Cosine x-profiles, 4th order Krylov
-    Field3D sol6;
-    Field3D error6,
-        absolute_error6; //Absolute value of relative error: abs( (f5-sol5)/f5 )
-    BoutReal max_error6; //Output of test
-    invert_4th->setInnerBoundaryFlags(INVERT_AC_GRAD);
-    invert_4th->setOuterBoundaryFlags(INVERT_AC_GRAD);
-    invert_4th->setGlobalFlags(INVERT_4TH_ORDER);
-    invert_4th->setCoefA(a5);
-    invert_4th->setCoefC(c5);
-    invert_4th->setCoefD(d5);
-
-    try {
-      sol6 = invert_4th->solve(sliceXZ(b5, mesh->ystart));
-      error6 = (f5 - sol6) / f5;
-      absolute_error6 = f5 - sol6;
-      //     max_error6 = max_error_at_ystart(abs(error6));
-      max_error6 = max_error_at_ystart(abs(absolute_error6));
-    } catch (BoutException& err) {
-      output
-          << "BoutException occured in invert->solve(b6): Laplacian inversion failed to "
-             "converge (probably)"
-          << endl;
-      max_error6 = -1;
+  }
+  if (mesh.lastX()) {
+    for (int jx = mesh.xend + 1; jx < mesh.LocalNx; jx++) {
+      const BoutReal x = BoutReal(mesh.getGlobalXIndex(jx) - mesh.xstart) / nx;
+      for (int jy = 0; jy < mesh.LocalNy; jy++) {
+        for (int jz = 0; jz < mesh.LocalNz; jz++) {
+          const BoutReal z = BoutReal(jz) / nz;
+          result(jx, jy, jz) =
+              1. + p * cos(2. * PI * x * 5) * sin(2. * PI * (z - q) * 2.);
+        }
+      }
     }
+  }
+  checkData(result);
+  return result;
+}
 
-    output << endl << "Test 6: different profiles, PETSc 4th order" << endl;
-    //   output<<"Time to set up is "<<Timer::getTime("petscsetup")<<". Time to solve is "<<Timer::getTime("petscsolve")<<endl;
-    //   output<<"Magnitude of maximum relative error is "<<max_error6<<endl;
-    output << "Magnitude of maximum absolute error is " << max_error6 << endl;
-    //   Timer::resetTime("petscsetup");
-    //   Timer::resetTime("petscsolve");
-
-    dump["a6"] = a5;
-    dump["b6"] = b5;
-    dump["c6"] = c5;
-    dump["d6"] = d5;
-    dump["f6"] = f5;
-    dump["sol6"] = sol6;
-    dump["error6"] = error6;
-    dump["absolute_error6"] = absolute_error6;
-    dump["max_error6"] = max_error6;
-
-    ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-    // Test 7+8: Cosine x-profiles, z-independent coefficients and compare with SPT method
-    Field2D a7, c7, d7;
-    Field3D b7;
-    Field3D sol7, sol8;
-    Field3D error7, absolute_error7, error8, absolute_error8;
-    BoutReal max_error7, max_error8;
-
-    a7 = DC(a5);
-    c7 = DC(c5);
-    d7 = DC(d5);
-    b7 = d7 * Delp2(f5) + Grad_perp(c7) * Grad_perp(f5) / c7 + a7 * f5;
-    if (mesh->firstX()) {
-      for (int jx = mesh->xstart - 1; jx >= 0; jx--) {
-        for (int jy = 0; jy < mesh->LocalNy; jy++) {
-          for (int jz = 0; jz < mesh->LocalNz; jz++) {
-            b7(jx, jy, jz) = b7(jx + 1, jy, jz);
-          }
-        }
+Field3D generate_a5(const Mesh& mesh) {
+  const BoutReal nx = mesh.GlobalNx - 2 * mesh.xstart - 1;
+  const BoutReal nz = mesh.GlobalNz;
+  constexpr BoutReal p = 0.5378950; // NOLINT
+  constexpr BoutReal q = 0.2805870; // NOLINT
+  Field3D result;
+  result.allocate();
+  for (int jx = mesh.xstart; jx <= mesh.xend; jx++) {
+    const BoutReal x = BoutReal(mesh.getGlobalXIndex(jx) - mesh.xstart) / nx;
+    for (int jy = 0; jy < mesh.LocalNy; jy++) {
+      for (int jz = 0; jz < mesh.LocalNz; jz++) {
+        const BoutReal z = BoutReal(jz) / nz;
+        result(jx, jy, jz) =
+            -1. + p * cos(2. * PI * x * 2.) * sin(2. * PI * (z - q) * 7.);
       }
     }
-    if (mesh->lastX()) {
-      for (int jx = mesh->xend + 1; jx < mesh->LocalNx; jx++) {
-        for (int jy = 0; jy < mesh->LocalNy; jy++) {
-          for (int jz = 0; jz < mesh->LocalNz; jz++) {
-            b7(jx, jy, jz) = b7(jx - 1, jy, jz);
-          }
+  }
+  if (mesh.firstX()) {
+    for (int jx = mesh.xstart - 1; jx >= 0; jx--) {
+      const BoutReal x = BoutReal(mesh.getGlobalXIndex(jx) - mesh.xstart) / nx;
+      for (int jy = 0; jy < mesh.LocalNy; jy++) {
+        for (int jz = 0; jz < mesh.LocalNz; jz++) {
+          const BoutReal z = BoutReal(jz) / nz;
+          result(jx, jy, jz) =
+              -1. + p * cos(2. * PI * x * 2.) * sin(2. * PI * (z - q) * 7.);
         }
       }
     }
-
-    invert->setInnerBoundaryFlags(INVERT_AC_GRAD);
-    invert->setOuterBoundaryFlags(INVERT_AC_GRAD);
-    invert->setCoefA(a7);
-    invert->setCoefC(c7);
-    invert->setCoefD(d7);
-
-    try {
-      sol7 = invert->solve(sliceXZ(b7, mesh->ystart));
-      error7 = (f5 - sol7) / f5;
-      absolute_error7 = f5 - sol7;
-      //     max_error7 = max_error_at_ystart(abs(error7));
-      max_error7 = max_error_at_ystart(abs(absolute_error7));
-    } catch (BoutException& err) {
-      output << "BoutException occured in invert->solve(b7): " << err.what() << endl;
-      max_error7 = -1;
-    }
-
-    output
-        << endl
-        << "Test 7: different profiles, with coefficients constant in z, PETSc 2nd order"
-        << endl;
-    //   output<<"Time to set up is "<<Timer::getTime("petscsetup")<<". Time to solve is "<<Timer::getTime("petscsolve")<<endl;
-    //   output<<"Magnitude of maximum relative error is "<<max_error7<<endl;
-    output << "Magnitude of maximum absolute error is " << max_error7 << endl;
-    //   Timer::resetTime("petscsetup");
-    //   Timer::resetTime("petscsolve");
-
-    dump["a7"] = a7;
-    dump["b7"] = b7;
-    dump["c7"] = c7;
-    dump["d7"] = d7;
-    dump["f7"] = f5;
-    dump["sol7"] = sol7;
-    dump["error7"] = error7;
-    dump["absolute_error7"] = absolute_error7;
-    dump["max_error7"] = max_error7;
-
-    invert_SPT->setInnerBoundaryFlags(INVERT_AC_GRAD);
-    invert_SPT->setOuterBoundaryFlags(INVERT_AC_GRAD | INVERT_DC_GRAD);
-    invert_SPT->setCoefA(a7);
-    invert_SPT->setCoefC(c7);
-    invert_SPT->setCoefD(d7);
-
-    sol8 = invert_SPT->solve(sliceXZ(b7, mesh->ystart));
-    error8 = (f5 - sol8) / f5;
-    absolute_error8 = f5 - sol8;
-    //   max_error8 = max_error_at_ystart(abs(error8));
-    max_error8 = max_error_at_ystart(abs(absolute_error8));
-
-    output
-        << endl
-        << "Test 8: different profiles, with coefficients constant in z, default solver"
-        << endl;
-    //   output<<"Time to set up is "<<Timer::getTime("petscsetup")<<". Time to solve is "<<Timer::getTime("petscsolve")<<endl;
-    //   output<<"Magnitude of maximum relative error is "<<max_error8<<endl;
-    output << "Magnitude of maximum absolute error is " << max_error8 << endl;
-    //   Timer::resetTime("petscsetup");
-    //   Timer::resetTime("petscsolve");
-
-    dump["a8"] = a7;
-    dump["b8"] = b7;
-    dump["c8"] = c7;
-    dump["d8"] = d7;
-    dump["f8"] = f5;
-    dump["sol8"] = sol8;
-    dump["error8"] = error8;
-    dump["absolute_error8"] = absolute_error8;
-    dump["max_error8"] = max_error8;
-
-    // Write and close the output file
-    bout::writeDefaultOutputFile(dump);
-
-    MPI_Barrier(BoutComm::get()); // Wait for all processors to write data
   }
-
-  bout::checkForUnusedOptions();
-
-  BoutFinalise();
-  return 0;
-}
-
-BoutReal max_error_at_ystart(const Field3D& error) {
-  const auto* mesh = error.getMesh();
-  BoutReal local_max_error = error(mesh->xstart, mesh->ystart, 0);
-
-  for (int jx = mesh->xstart; jx <= mesh->xend; jx++) {
-    for (int jz = 0; jz < mesh->LocalNz; jz++) {
-      if (local_max_error < error(jx, mesh->ystart, jz)) {
-        local_max_error = error(jx, mesh->ystart, jz);
+  if (mesh.lastX()) {
+    for (int jx = mesh.xend + 1; jx < mesh.LocalNx; jx++) {
+      const BoutReal x = BoutReal(mesh.getGlobalXIndex(jx) - mesh.xstart) / nx;
+      for (int jy = 0; jy < mesh.LocalNy; jy++) {
+        for (int jz = 0; jz < mesh.LocalNz; jz++) {
+          const BoutReal z = BoutReal(jz) / nz;
+          result(jx, jy, jz) =
+              -1. + p * cos(2. * PI * x * 2.) * sin(2. * PI * (z - q) * 7.);
+        }
       }
     }
   }
-
-  BoutReal max_error;
-
-  MPI_Allreduce(&local_max_error, &max_error, 1, MPI_DOUBLE, MPI_MAX, BoutComm::get());
-
-  return max_error;
+  checkData(result);
+  return result;
 }
diff --git a/tests/integrated/test-squash/runtest b/tests/integrated/test-squash/runtest
index 692d561c59..c79cba0faf 100755
--- a/tests/integrated/test-squash/runtest
+++ b/tests/integrated/test-squash/runtest
@@ -15,7 +15,7 @@ import os.path
 # cores: 4
 
 IGNORED_VARS_PATTERN = re.compile(
-    "(wtime|ncalls|arkode|cvode|run_id|run_restart_from|M.?SUB|N.?PE|iteration|wall_time|has_legacy_netcdf|hist_hi).*"
+    "(wtime|ncalls|arkode|cvode|run_id|run_restart_from|M.?SUB|N.?PE|iteration|wall_time|has_legacy_netcdf|hist_hi|openmp_threads).*"
 )
 
 
diff --git a/tests/unit/fake_parallel_mesh.hxx b/tests/unit/fake_parallel_mesh.hxx
index c648bbab9c..805dcb2a0a 100644
--- a/tests/unit/fake_parallel_mesh.hxx
+++ b/tests/unit/fake_parallel_mesh.hxx
@@ -8,6 +8,8 @@
 #include <memory>
 
 #include "../../src/mesh/impls/bout/boutmesh.hxx"
+#include "bout/boundary_op.hxx"
+#include "bout/boundary_region.hxx"
 #include "bout/boutcomm.hxx"
 #include "bout/coordinates.hxx"
 #include "bout/field2d.hxx"
diff --git a/tests/unit/include/bout/test_hypre_interface.cxx b/tests/unit/include/bout/test_hypre_interface.cxx
index a56f061a6e..e2eefab9a8 100644
--- a/tests/unit/include/bout/test_hypre_interface.cxx
+++ b/tests/unit/include/bout/test_hypre_interface.cxx
@@ -309,7 +309,7 @@ TYPED_TEST(HypreMatrixTest, SetElements) {
       auto j_index = static_cast<HYPRE_BigInt>(this->indexer->getGlobal(j));
       HYPRE_Int ncolumns{1};
       HYPRE_Complex value;
-      BOUT_OMP(critical)
+      BOUT_OMP_SAFE(critical)
       { HYPRE_IJMatrixGetValues(raw_matrix, 1, &ncolumns, &i_index, &j_index, &value); }
       if (i == j) {
         EXPECT_EQ(static_cast<BoutReal>(value),
diff --git a/tests/unit/include/bout/test_petsc_indexer.cxx b/tests/unit/include/bout/test_petsc_indexer.cxx
index 082acafde6..3c20de9989 100644
--- a/tests/unit/include/bout/test_petsc_indexer.cxx
+++ b/tests/unit/include/bout/test_petsc_indexer.cxx
@@ -81,15 +81,15 @@ TYPED_TEST(IndexerTest, TestConvertIndex) {
   BOUT_FOR(i, f.getRegion("RGN_NOBNDRY")) {
     int global = this->globalSquareIndexer.getGlobal(i);
     EXPECT_GE(global, 0);
-    BOUT_OMP(critical)
+    BOUT_OMP_SAFE(critical)
     EXPECT_TRUE(indicesGlobalSquare.insert(global).second);
     global = this->globalStarIndexer.getGlobal(i);
     EXPECT_GE(global, 0);
-    BOUT_OMP(critical)
+    BOUT_OMP_SAFE(critical)
     EXPECT_TRUE(indicesGlobalStar.insert(global).second);
     global = this->globalDefaultIndexer.getGlobal(i);
     EXPECT_GE(global, 0);
-    BOUT_OMP(critical)
+    BOUT_OMP_SAFE(critical)
     EXPECT_TRUE(indicesGlobalDefault.insert(global).second);
   }
 
@@ -97,11 +97,11 @@ TYPED_TEST(IndexerTest, TestConvertIndex) {
   BOUT_FOR(i, f.getRegion("RGN_XGUARDS")) {
     int global = this->globalSquareIndexer.getGlobal(i);
     EXPECT_GE(global, 0);
-    BOUT_OMP(critical)
+    BOUT_OMP_SAFE(critical)
     EXPECT_TRUE(indicesGlobalSquare.insert(global).second);
     global = this->globalStarIndexer.getGlobal(i);
     EXPECT_GE(global, 0);
-    BOUT_OMP(critical)
+    BOUT_OMP_SAFE(critical)
     EXPECT_TRUE(indicesGlobalStar.insert(global).second);
     EXPECT_LT(this->globalDefaultIndexer.getGlobal(i), 0);
   }
@@ -111,11 +111,11 @@ TYPED_TEST(IndexerTest, TestConvertIndex) {
     BOUT_FOR(i, f.getRegion("RGN_YGUARDS")) {
       int global = this->globalSquareIndexer.getGlobal(i);
       EXPECT_GE(global, 0);
-      BOUT_OMP(critical)
+      BOUT_OMP_SAFE(critical)
       EXPECT_TRUE(indicesGlobalSquare.insert(global).second);
       global = this->globalStarIndexer.getGlobal(i);
       EXPECT_GE(global, 0);
-      BOUT_OMP(critical)
+      BOUT_OMP_SAFE(critical)
       EXPECT_TRUE(indicesGlobalStar.insert(global).second);
       EXPECT_LT(this->globalDefaultIndexer.getGlobal(i), 0);
     }
diff --git a/tests/unit/include/bout/test_petsc_matrix.cxx b/tests/unit/include/bout/test_petsc_matrix.cxx
index cc07145d8e..9ba2475096 100644
--- a/tests/unit/include/bout/test_petsc_matrix.cxx
+++ b/tests/unit/include/bout/test_petsc_matrix.cxx
@@ -177,7 +177,7 @@ TYPED_TEST(PetscMatrixTest, TestGetElements) {
       int i_ind = this->indexer->getGlobal(i);
       int j_ind = this->indexer->getGlobal(j);
       PetscScalar matContents;
-      BOUT_OMP(critical)
+      BOUT_OMP_SAFE(critical)
       MatGetValues(*rawmat, 1, &i_ind, 1, &j_ind, &matContents);
       if (i == j) {
         EXPECT_EQ(matContents, static_cast<BoutReal>(i.ind));
diff --git a/tests/unit/include/bout/test_region.cxx b/tests/unit/include/bout/test_region.cxx
index befcc07771..8776dad59a 100644
--- a/tests/unit/include/bout/test_region.cxx
+++ b/tests/unit/include/bout/test_region.cxx
@@ -262,7 +262,7 @@ TEST_F(RegionTest, regionLoopAllSection) {
   const auto& region = mesh->getRegion3D("RGN_ALL");
 
   int count = 0;
-  BOUT_OMP(parallel)
+  BOUT_OMP_PERF(parallel)
   {
     BOUT_FOR_OMP(i, region, for reduction(+:count)) {
       ++count;
@@ -296,7 +296,7 @@ TEST_F(RegionTest, regionLoopNoBndrySection) {
   const auto& region = mesh->getRegion3D("RGN_NOBNDRY");
 
   int count = 0;
-  BOUT_OMP(parallel)
+  BOUT_OMP_PERF(parallel)
   {
     BOUT_FOR_OMP(i, region, for reduction(+:count)) {
       ++count;
@@ -313,7 +313,7 @@ TEST_F(RegionTest, regionLoopAllInner) {
   const auto& region = mesh->getRegion3D("RGN_ALL");
 
   Field3D a{0.};
-  BOUT_OMP(parallel)
+  BOUT_OMP_PERF(parallel)
   {
     BOUT_FOR_INNER(i, region) { a[i] = 1.0; }
   }
@@ -331,7 +331,7 @@ TEST_F(RegionTest, regionLoopNoBndryInner) {
   const auto& region = mesh->getRegion3D("RGN_NOBNDRY");
 
   Field3D a{0.};
-  BOUT_OMP(parallel)
+  BOUT_OMP_PERF(parallel)
   {
     BOUT_FOR_INNER(i, region) { a[i] = 1.0; }
   }
diff --git a/tests/unit/mesh/test_boundary_factory.cxx b/tests/unit/mesh/test_boundary_factory.cxx
index 6637e73711..b552f7629e 100644
--- a/tests/unit/mesh/test_boundary_factory.cxx
+++ b/tests/unit/mesh/test_boundary_factory.cxx
@@ -1,6 +1,7 @@
 #include "gtest/gtest.h"
 
 #include "bout/boundary_factory.hxx"
+#include "bout/boundary_op.hxx"
 #include "bout/boundary_region.hxx"
 
 #include "test_extras.hxx"
diff --git a/tests/unit/test_extras.hxx b/tests/unit/test_extras.hxx
index 6f78e99fd3..700b977ac8 100644
--- a/tests/unit/test_extras.hxx
+++ b/tests/unit/test_extras.hxx
@@ -8,6 +8,7 @@
 #include <numeric>
 #include <vector>
 
+#include "bout/boundary_region.hxx"
 #include "bout/boutcomm.hxx"
 #include "bout/coordinates.hxx"
 #include "bout/field3d.hxx"
@@ -232,8 +233,9 @@ public:
   RangeIterator iterateBndryUpperInnerY() const override { return RangeIterator(); }
   void addBoundary(BoundaryRegion* region) override { boundaries.push_back(region); }
   std::vector<BoundaryRegion*> getBoundaries() override { return boundaries; }
-  std::vector<BoundaryRegionPar*> getBoundariesPar() override {
-    return std::vector<BoundaryRegionPar*>();
+  std::vector<std::shared_ptr<BoundaryRegionPar>>
+  getBoundariesPar(BoundaryParType UNUSED(type)) override {
+    return std::vector<std::shared_ptr<BoundaryRegionPar>>();
   }
   BoutReal GlobalX(int jx) const override { return jx; }
   BoutReal GlobalY(int jy) const override { return jy; }
diff --git a/tools/archiving/sdctools/sdclib/sdclib.c b/tools/archiving/sdctools/sdclib/sdclib.c
index f7db255a47..7294cc0791 100644
--- a/tools/archiving/sdctools/sdclib/sdclib.c
+++ b/tools/archiving/sdctools/sdclib/sdclib.c
@@ -34,8 +34,6 @@
 
 #include "sdclib.h"
 
-//#define DEBUG
-
 #define DEFAULT_IFRAME 10
 #define DEFAULT_ORDER 4
 
diff --git a/tools/pylib/_boutpp_build/CMakeLists.txt b/tools/pylib/_boutpp_build/CMakeLists.txt
index 6b88986a28..3be2a5d2aa 100644
--- a/tools/pylib/_boutpp_build/CMakeLists.txt
+++ b/tools/pylib/_boutpp_build/CMakeLists.txt
@@ -25,7 +25,7 @@ bout_python_maybe_error(${Cython_FOUND} Cython)
 find_package(Bash)
 bout_python_maybe_error(${Bash_FOUND} Bash)
 
-execute_process(COMMAND ${Python_EXECUTABLE} -c "import jinja2"
+execute_process(COMMAND ${Python3_EXECUTABLE} -c "import jinja2"
   RESULT_VARIABLE jinja2_FOUND)
 if (jinja2_FOUND EQUAL 0)
   # We have jinja2 - all good
@@ -33,7 +33,7 @@ else()
   bout_python_maybe_error(OFF jinja2)
 endif()
 
-execute_process(COMMAND ${Python_EXECUTABLE} -c "import sysconfig; print(sysconfig.get_config_var('EXT_SUFFIX')[:-3])"
+execute_process(COMMAND ${Python3_EXECUTABLE} -c "import sysconfig; print(sysconfig.get_config_var('EXT_SUFFIX')[:-3])"
   RESULT_VARIABLE PYTHON_WORKING
   OUTPUT_VARIABLE PYTHON_EXT_SUFFIX
   OUTPUT_STRIP_TRAILING_WHITESPACE
@@ -73,7 +73,7 @@ foreach(file IN LISTS files)
   #message(FATAL_ERROR "${gen} ${src}/${file}.jinja")
   add_custom_command(OUTPUT ${gen}
 	COMMAND ${CMAKE_COMMAND} -E make_directory ${tar}
-	COMMAND ${CMAKE_COMMAND} -E env PYTHONPATH=${tar}/..:\${PYTHONPATH} ${Python_EXECUTABLE} generate.py ${file}.jinja ${gen}
+	COMMAND ${CMAKE_COMMAND} -E env PYTHONPATH=${tar}/..:\${PYTHONPATH} ${Python3_EXECUTABLE} generate.py ${file}.jinja ${gen}
 	DEPENDS ${src}/${file}.jinja
 	DEPENDS ${src}/helper.py
 	DEPENDS ${src}/resolve_enum_inv.pyx.jinja
@@ -93,8 +93,7 @@ endforeach()
 
 add_custom_command(OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/libboutpp.cpp
   COMMAND ${CMAKE_COMMAND} -E copy boutpp.pyx libboutpp.pyx
-  COMMAND ${Python_EXECUTABLE} -m cython libboutpp.pyx --cplus -3  -X binding=True -X embedsignature=True
-  COMMENT "Cythonizing python interface"
+  COMMAND ${Python3_EXECUTABLE} -m cython libboutpp.pyx --cplus -3  -X binding=True -X embedsignature=True
   WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}
   DEPENDS ${boutpp_depends}
   )
@@ -120,5 +119,6 @@ install(FILES ${CMAKE_CURRENT_SOURCE_DIR}/boutpp.py
   DESTINATION ${CMAKE_INSTALL_PYTHON_SITEARCH}/boutpp/
   RENAME __init__.py
   )
+
 target_link_libraries(boutpp${PYTHON_EXT_SUFFIX} bout++)
-target_include_directories(boutpp${PYTHON_EXT_SUFFIX} PRIVATE $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}> ${Numpy_INCLUDE_DIRS} ${Python_INCLUDE_DIRS})
+target_include_directories(boutpp${PYTHON_EXT_SUFFIX} PRIVATE $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}> ${Numpy_INCLUDE_DIRS} ${Python3_INCLUDE_DIRS})
diff --git a/tools/pylib/_boutpp_build/bout_options.pxd b/tools/pylib/_boutpp_build/bout_options.pxd
index be17608cea..365e08bcc7 100644
--- a/tools/pylib/_boutpp_build/bout_options.pxd
+++ b/tools/pylib/_boutpp_build/bout_options.pxd
@@ -43,6 +43,7 @@ cdef extern from "bout/options.hxx":
         void get(string, double&, double)
         void get(string, bool&, bool)
         void cleanCache()
+        void setConditionallyUsed()
 
 
 cdef extern from "bout/optionsreader.hxx":
diff --git a/tools/pylib/_boutpp_build/boutcpp.pxd.jinja b/tools/pylib/_boutpp_build/boutcpp.pxd.jinja
index 12e210a5b5..8f838b864c 100644
--- a/tools/pylib/_boutpp_build/boutcpp.pxd.jinja
+++ b/tools/pylib/_boutpp_build/boutcpp.pxd.jinja
@@ -148,10 +148,10 @@ cdef extern from "bout/physicsmodel.hxx":
 ctypedef void (*Method)(void *param, void *user_data)
 cdef extern from "helper.h":
     cppclass PythonModel(PhysicsModel):
-        int rhs(double t)
+        int rhs(double t) except +raise_bout_py_error
         void pyinit()
         void free()
-        void solve()
+        void solve() except +raise_bout_py_error
         Solver * getSolver()
         void set_rhs_func(PythonModelCallback*)
         void set_init_func(PythonModelCallback*)
diff --git a/tools/pylib/_boutpp_build/boutpp.pyx.jinja b/tools/pylib/_boutpp_build/boutpp.pyx.jinja
index 3aeb1428eb..9aedbb291a 100644
--- a/tools/pylib/_boutpp_build/boutpp.pyx.jinja
+++ b/tools/pylib/_boutpp_build/boutpp.pyx.jinja
@@ -583,9 +583,9 @@ cdef class {{ field.field_type }}:
 
 {% endfor %}
     def __dealloc__(self):
-        self.__boutpp_dealloc()
+        self._boutpp_dealloc()
 
-    def __boutpp_dealloc(self):
+    def _boutpp_dealloc(self):
         if self.isSelfOwned and self.cobj != NULL:
             del self.cobj
             self.cobj = NULL
@@ -645,9 +645,9 @@ cdef class {{ vec }}:
 
 
     def __dealloc__(self):
-        self.__boutpp_dealloc()
+        self._boutpp_dealloc()
 
-    def __boutpp_dealloc(self):
+    def _boutpp_dealloc(self):
         if self.isSelfOwned and self.cobj != NULL:
             del self.cobj
             self.cobj=NULL
@@ -742,9 +742,9 @@ cdef class Mesh:
         return msh
 
     def __dealloc__(self):
-        self.__boutpp_dealloc()
+        self._boutpp_dealloc()
 
-    def __boutpp_dealloc(self):
+    def _boutpp_dealloc(self):
         if self.cobj and self.isSelfOwned:
             del self.cobj
             self.cobj = NULL
@@ -850,9 +850,9 @@ cdef class Coordinates:
 {% endfor %}
 
     def __dealloc__(self):
-        self.__boutpp_dealloc()
+        self._boutpp_dealloc()
 
-    def __boutpp_dealloc(self):
+    def _boutpp_dealloc(self):
         if self.cobj and self.isSelfOwned:
             del self.cobj
             self.cobj = NULL
@@ -931,9 +931,9 @@ cdef class FieldFactory:
         checkInit()
         cobj=< c.FieldFactory*>0
     def __dealloc__(self):
-        self.__boutpp_dealloc()
+        self._boutpp_dealloc()
 
-    def __boutpp_dealloc(self):
+    def _boutpp_dealloc(self):
         if self.cobj != NULL:
             del self.cobj
             self.cobj = NULL
@@ -965,9 +965,9 @@ cdef class PythonModelCallback:
         self.cobj = new c.PythonModelCallback(callback, <void*>method)
 
     def __dealloc__(self):
-        self.__boutpp_dealloc()
+        self._boutpp_dealloc()
 
-    def __boutpp_dealloc(self):
+    def _boutpp_dealloc(self):
         if self.cobj:
             del self.cobj
             self.cobj = NULL
@@ -1037,12 +1037,12 @@ cdef class PhysicsModelBase(object):
         self.cmodel.set_init_func(self.callbackinit)
 
     def __dealloc__(self):
-        if hasattr(self, "__boutpp_dealloc"):
-            self.__boutpp_dealloc()
+        if hasattr(self, "_boutpp_dealloc"):
+            self._boutpp_dealloc()
         else:
-            PhysicsModelBase.__boutpp_dealloc(self)
+            PhysicsModelBase._boutpp_dealloc(self)
 
-    def __boutpp_dealloc(self):
+    def _boutpp_dealloc(self):
         if self.cmodel != <c.PythonModel *> 0:
             self.cmodel.free()
             del self.cmodel
@@ -1123,8 +1123,8 @@ class PhysicsModel(PhysicsModelBase):
     def __dealloc__(self):
         super(PhysicsModel,self).__dealloc__()
 
-    def __boutpp_dealloc(self):
-        super(PhysicsModel,self).__boutpp_dealloc()
+    def _boutpp_dealloc(self):
+        super(PhysicsModel,self)._boutpp_dealloc()
 
 cdef extern from "bout/bout.hxx":
     int BoutInitialise(int&, char **&) except +raise_bout_py_error
@@ -1204,13 +1204,14 @@ def finalise():
                   PythonModelCallback)
     for obj in objects:
         if isinstance(obj, ourClasses):
-            if hasattr(obj, "__boutpp_dealloc"):
-                obj.__boutpp_dealloc()
+            if hasattr(obj, "_boutpp_dealloc"):
+                obj._boutpp_dealloc()
             else:
                 for ourClass in ourClasses:
                     if isinstance(obj, ourClass):
-                         ourClass.__boutpp_dealloc(obj)
-                         break
+                        if hasattr(ourClass, "_boutpp_dealloc"):
+                            ourClass._boutpp_dealloc(obj)
+                            break
     del objects
     # Actually finalise
     if wasInit:
@@ -1715,10 +1716,19 @@ cdef class Options:
             opt.get(key, ret_str, default_)
             return ret_str.decode()
 
+    def setConditionallyUsed(self):
+        """Set the attribute "conditionally used" to be true for this Options object
+        and all its children/sections, causing `Options::getUnused` to
+        assume those options have been used. This is useful to ignore
+        options when checking for typos etc.
+        """
+        cdef c.Options* opt = self.cobj
+        opt.setConditionallyUsed()
+
     def __dealloc__(self):
-        self.__boutpp_dealloc()
+        self._boutpp_dealloc()
 
-    def __boutpp_dealloc(self):
+    def _boutpp_dealloc(self):
         if self.isSelfOwned and self.cobj != NULL:
             del self.cobj
             self.cobj = NULL