From 3888650dccadfa1d0c6749dc777c05f2ba27961f Mon Sep 17 00:00:00 2001
From: "Garth N. Wells" <gnw20@cam.ac.uk>
Date: Sat, 11 Jan 2025 14:42:06 +0000
Subject: [PATCH 1/9] Use block preconditioner for mixed Poisson problem 
 (#3580)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* Switch to iterative solver for mixed-Poisson demo

* Update for complex types

* Update

* Work on block preconditioned mixed Poisson solver

* Text updates

* Small text fixes

* Disable fast fail

* Skip Hypre solver in complex mode

* Revert CI change

* Update python/demo/demo_mixed-poisson.py

Co-authored-by: Jørgen Schartum Dokken <dokken92@gmail.com>

* Update python/demo/demo_mixed-poisson.py

Co-authored-by: Jørgen Schartum Dokken <dokken92@gmail.com>

* Update python/demo/demo_mixed-poisson.py

Co-authored-by: Jørgen Schartum Dokken <dokken92@gmail.com>

* Update python/demo/demo_mixed-poisson.py

Co-authored-by: Jørgen Schartum Dokken <dokken92@gmail.com>

* Update python/demo/demo_mixed-poisson.py

Co-authored-by: Jørgen Schartum Dokken <dokken92@gmail.com>

* Update output

* Updates

* Updates

* Support zero form

* Simplify

* Install blas in CI

* Install lapack

* Fix typo

* Bump macos-15

* Use superlu_dist

* LU solver work-arounds

* Ruff fix

* Fix typo

* Fix typos

---------

Co-authored-by: Jørgen Schartum Dokken <dokken92@gmail.com>
---
 .github/workflows/ccpp.yml                |   6 +-
 .github/workflows/macos.yml               |   2 +-
 cpp/dolfinx/fem/assemble_vector_impl.h    |   2 +
 cpp/dolfinx/fem/discreteoperators.h       |  18 +-
 python/demo/demo_half_loaded_waveguide.py |   2 +-
 python/demo/demo_mixed-poisson.py         | 387 ++++++++++++++++------
 python/demo/demo_poisson.py               |   4 +-
 python/demo/demo_stokes.py                |  12 +-
 python/dolfinx/fem/dofmap.py              |  11 +-
 python/dolfinx/fem/forms.py               |  80 +++--
 python/dolfinx/fem/function.py            |  17 +-
 python/dolfinx/fem/petsc.py               |  70 ++--
 python/dolfinx/graph.py                   |   3 +
 python/dolfinx/mesh.py                    |   2 +-
 python/dolfinx/plot.py                    |   3 +-
 python/test/conftest.py                   |   2 +-
 python/test/unit/fem/test_assembler.py    |  20 +-
 17 files changed, 432 insertions(+), 209 deletions(-)
diff --git a/.github/workflows/ccpp.yml b/.github/workflows/ccpp.yml
index 28ae5bdd989..9cd218d440f 100644
--- a/.github/workflows/ccpp.yml
+++ b/.github/workflows/ccpp.yml
@@ -78,7 +78,7 @@ jobs:
           find . -type f \( -name "*.cmake" -o -name "*.cmake.in" -o -name "CMakeLists.txt" \) | xargs cmake-format --check
 
   build:
-    runs-on: ubuntu-latest
+    runs-on: ubuntu-24.04
     needs: [fenicsx-refs, lint]
     env:
       OMPI_ALLOW_RUN_AS_ROOT: 1
@@ -93,7 +93,9 @@ jobs:
       - name: Install dependencies
         run: |
           sudo apt-get update
-          sudo apt-get install catch2 cmake g++ libboost-dev libhdf5-mpi-dev libparmetis-dev libpugixml-dev libspdlog-dev mpi-default-dev ninja-build pkg-config
+          sudo apt-get install catch2 cmake g++ libblas-dev libboost-dev libhdf5-mpi-dev \
+             liblapack-dev libparmetis-dev libpugixml-dev libspdlog-dev mpi-default-dev \
+             ninja-build pkg-config
       - name: Set up Python
         uses: actions/setup-python@v5
         with:
diff --git a/.github/workflows/macos.yml b/.github/workflows/macos.yml
index 02dd6c144be..f7dff7556c2 100644
--- a/.github/workflows/macos.yml
+++ b/.github/workflows/macos.yml
@@ -20,7 +20,7 @@ jobs:
 
   mac-os-build:
     name: macOS Homebrew install and test
-    runs-on: macos-14
+    runs-on: macos-15
     needs: fenicsx-refs
     env:
       PETSC_ARCH: arch-darwin-c-opt
diff --git a/cpp/dolfinx/fem/assemble_vector_impl.h b/cpp/dolfinx/fem/assemble_vector_impl.h
index 6d9879c39fc..9965abc8917 100644
--- a/cpp/dolfinx/fem/assemble_vector_impl.h
+++ b/cpp/dolfinx/fem/assemble_vector_impl.h
@@ -1295,8 +1295,10 @@ void assemble_vector(
   std::shared_ptr<const mesh::Mesh<U>> mesh = L.mesh();
   assert(mesh);
   if constexpr (std::is_same_v<U, scalar_value_type_t<T>>)
+  {
     assemble_vector(b, L, mesh->geometry().dofmap(), mesh->geometry().x(),
                     constants, coefficients);
+  }
   else
   {
     auto x = mesh->geometry().x();
diff --git a/cpp/dolfinx/fem/discreteoperators.h b/cpp/dolfinx/fem/discreteoperators.h
index cfe37525a9f..87740f60d27 100644
--- a/cpp/dolfinx/fem/discreteoperators.h
+++ b/cpp/dolfinx/fem/discreteoperators.h
@@ -28,12 +28,12 @@ namespace dolfinx::fem
 /// a Lagrange finite element function in \f$V_0 \subset H^1\f$ into a
 /// Nédélec (first kind) space \f$V_1 \subset H({\rm curl})\f$, i.e.
 /// \f$\nabla V_0 \rightarrow V_1\f$. If \f$u_0\f$ is the
-/// degree-of-freedom vector associated with \f$V_0\f$, the hen
+/// degree-of-freedom vector associated with \f$V_0\f$, then
 /// \f$u_1=Au_0\f$ where \f$u_1\f$ is the degrees-of-freedom vector for
 /// interpolating function in the \f$H({\rm curl})\f$ space. An example
 /// of where discrete gradient operators are used is the creation of
-/// algebraic multigrid solvers for \f$H({\rm curl})\f$  and
-/// \f$H({\rm div})\f$ problems.
+/// algebraic multigrid solvers for \f$H({\rm curl})\f$ and \f$H({\rm
+/// div})\f$ problems.
 ///
 /// @note The sparsity pattern for a discrete operator can be
 /// initialised using sparsitybuild::cells. The space `V1` should be
@@ -44,9 +44,9 @@ namespace dolfinx::fem
 ///
 /// @param[in] topology Mesh topology
 /// @param[in] V0 Lagrange element and dofmap for corresponding space to
-/// interpolate the gradient from
-/// @param[in] V1 Nédélec (first kind) element and and dofmap for
-/// corresponding space to interpolate into
+/// interpolate the gradient from.
+/// @param[in] V1 Nédélec (first kind) element and dofmap for
+/// corresponding space to interpolate into.
 /// @param[in] mat_set A functor that sets values in a matrix
 template <dolfinx::scalar T,
           std::floating_point U = dolfinx::scalar_value_type_t<T>>
@@ -148,9 +148,9 @@ void discrete_gradient(mesh::Topology& topology,
 /// initialised using sparsitybuild::cells. The space `V1` should be
 /// used for the rows of the sparsity pattern, `V0` for the columns.
 ///
-/// @param[in] V0 The space to interpolate from
-/// @param[in] V1 The space to interpolate to
-/// @param[in] mat_set A functor that sets values in a matrix
+/// @param[in] V0 Space to interpolate from.
+/// @param[in] V1 Space to interpolate to.
+/// @param[in] mat_set Functor that sets values in a matrix.
 template <dolfinx::scalar T, std::floating_point U>
 void interpolation_matrix(const FunctionSpace<U>& V0,
                           const FunctionSpace<U>& V1, auto&& mat_set)
diff --git a/python/demo/demo_half_loaded_waveguide.py b/python/demo/demo_half_loaded_waveguide.py
index b732d6ba23e..e1446276783 100644
--- a/python/demo/demo_half_loaded_waveguide.py
+++ b/python/demo/demo_half_loaded_waveguide.py
@@ -461,7 +461,7 @@ def Omega_v(x):
         # Verify if kz is consistent with the analytical equations
         assert verify_mode(kz, w, h, d, lmbd0, eps_d, eps_v, threshold=1e-4)
 
-        print(f"eigenvalue: {-kz**2}")
+        print(f"eigenvalue: {-(kz**2)}")
         print(f"kz: {kz}")
         print(f"kz/k0: {kz / k0}")
 
diff --git a/python/demo/demo_mixed-poisson.py b/python/demo/demo_mixed-poisson.py
index 751e5be96d1..51086346b19 100644
--- a/python/demo/demo_mixed-poisson.py
+++ b/python/demo/demo_mixed-poisson.py
@@ -8,18 +8,23 @@
 #       jupytext_version: 1.14.1
 # ---
 
-# # Mixed formulation for the Poisson equation
-
-# This demo illustrates how to solve Poisson equation using a mixed
-# (two-field) formulation. In particular, it illustrates how to
+# # Mixed formulation of the Poisson equation with a block-preconditioner/solver
+#
+# This demo illustrates how to solve the Poisson equation using a mixed
+# (two-field) formulation and a block-preconditioned iterative solver.
+# In particular, it illustrates how to
 #
 # * Use mixed and non-continuous finite element spaces.
-# * Set essential boundary conditions for subspaces and $H(\mathrm{div})$ spaces.
+# * Set essential boundary conditions for subspaces and
+#   $H(\mathrm{div})$ spaces.
+# * Construct a blocked linear system.
+# * Construct a block-preconditioned iterative linear solver using
+#   PETSc/petsc4y.
+# * Construct a Hypre Auxiliary Maxwell Space (AMS) preconditioner for
+#   $H(\mathrm{div})$ problems in two-dimensions.
 #
-
 # ```{admonition} Download sources
 # :class: download
-#
 # * {download}`Python script <./demo_mixed-poisson.py>`
 # * {download}`Jupyter notebook <./demo_mixed-poisson.ipynb>`
 # ```
@@ -44,15 +49,14 @@
 #   \sigma \cdot n = g \quad {\rm on} \ \Gamma_{N}.
 # $$
 #
-# The same equations arise in connection with flow in porous media, and are
-# also referred to as Darcy flow. Here $n$ denotes the outward pointing normal
-# vector on the boundary. Looking at the variational form, we see that the
-# boundary condition for the flux ($\sigma \cdot n = g$) is now an essential
-# boundary condition (which should be enforced in the function space), while
-# the other boundary condition ($u = u_0$) is a natural boundary condition
-# (which should be applied to the variational form). Inserting the boundary
-# conditions, this variational problem can be phrased in the general form: find
-# $(\sigma, u) \in \Sigma_g \times V$ such that
+# where $n$ is the outward unit normal vector on the boundary. Looking
+# at the variational form, we see that the boundary condition for the
+# flux ($\sigma \cdot n = g$) is now an essential boundary condition
+# (which should be enforced in the function space), while the other
+# boundary condition ($u = u_0$) is a natural boundary condition (which
+# should be applied to the variational form). Inserting the boundary
+# conditions, this variational problem can be phrased in the general
+# form: find $(\sigma, u) \in \Sigma_g \times V$ such that
 #
 # $$
 #    a((\sigma, u), (\tau, v)) = L((\tau, v))
@@ -62,37 +66,33 @@
 # where the variational forms $a$ and $L$ are defined as
 #
 # $$
-# \begin{align}
-#   a((\sigma, u), (\tau, v)) &=
+#   a((\sigma, u), (\tau, v)) &:=
 #     \int_{\Omega} \sigma \cdot \tau + \nabla \cdot \tau \ u
 #   + \nabla \cdot \sigma \ v \ {\rm d} x, \\
-#   L((\tau, v)) &= - \int_{\Omega} f v \ {\rm d} x
+#   L((\tau, v)) &:= - \int_{\Omega} f v \ {\rm d} x
 #   + \int_{\Gamma_D} u_0 \tau \cdot n  \ {\rm d} s,
-# \end{align}
 # $$
-# and $\Sigma_g = \{ \tau \in H({\rm div})$ such that $\tau \cdot n|_{\Gamma_N}
-# = g \}$ and $V = L^2(\Omega)$.
-#
-# To discretize the above formulation, two discrete function spaces $\Sigma_h
-# \subset \Sigma$ and $V_h \subset V$ are needed to form a mixed function space
-# $\Sigma_h \times V_h$. A stable choice of finite element spaces is to let
-# $\Sigma_h$ be the Brezzi-Douglas-Marini elements of polynomial order
-# $k$ and let $V_h$ be discontinuous elements of polynomial order $k-1$.
+# and $\Sigma_g := \{ \tau \in H({\rm div})$ such that $\tau \cdot
+# n|_{\Gamma_N} = g \}$ and $V := L^2(\Omega)$.
 #
-# We will use the same definitions of functions and boundaries as in the
-# demo for {doc}`the Poisson equation <demo_poisson>`. These are:
+# To discretize the above formulation, two discrete function spaces
+# $\Sigma_h \subset \Sigma$ and $V_h \subset V$ are needed to form a
+# mixed function space $\Sigma_h \times V_h$. A stable choice of finite
+# element spaces is to let $\Sigma_h$ be the Raviart-Thomas elements of
+# polynomial order $k$ and let $V_h$ be discontinuous Lagrange elements of
+# polynomial order $k-1$.
 #
-# * $\Omega = [0,1] \times [0,1]$ (a unit square)
-# * $\Gamma_{D} = \{(0, y) \cup (1, y) \in \partial \Omega\}$
-# * $\Gamma_{N} = \{(x, 0) \cup (x, 1) \in \partial \Omega\}$
-# * $u_0 = 0$
-# * $g = \sin(5x)$   (flux)
-# * $f = 10\exp(-((x - 0.5)^2 + (y - 0.5)^2) / 0.02)$  (source term)
+# To solve the linear system for the mixed problem, we will use am
+# iterative method with a block-diagonal preconditioner that is based on
+# the Riesz map, see for example this
+# [paper](https://doi.org/10.1002/(SICI)1099-1506(199601/02)3:1%3C1::AID-NLA67%3E3.0.CO;2-E).
+
 #
 # ## Implementation
+#
+# Import the required modules:
 
 # +
-
 try:
     from petsc4py import PETSc
 
@@ -109,88 +109,285 @@
 
 import numpy as np
 
-from basix.ufl import element, mixed_element
-from dolfinx import default_real_type, fem, io, mesh
-from dolfinx.fem.petsc import LinearProblem
-from ufl import Measure, SpatialCoordinate, TestFunctions, TrialFunctions, div, exp, inner
+import dolfinx.fem.petsc
+from basix.ufl import element
+from dolfinx import fem, la, mesh
+from dolfinx.fem.petsc import discrete_gradient, interpolation_matrix
+from dolfinx.mesh import CellType, create_unit_square
+from ufl import (
+    Measure,
+    SpatialCoordinate,
+    TestFunction,
+    TrialFunction,
+    ZeroBaseForm,
+    div,
+    exp,
+    inner,
+)
+
+# Solution scalar (e.g., float32, complex128) and geometry (float32/64)
+# types
+dtype = dolfinx.default_scalar_type
+xdtype = dolfinx.default_real_type
+# -
 
-msh = mesh.create_unit_square(MPI.COMM_WORLD, 32, 32, mesh.CellType.quadrilateral)
+# Create a two-dimensional mesh. The iterative solver constructed
+# later requires special construction that is specific to two
+# dimensions. Application in three-dimensions would require a number of
+# changes to the linear solver.
 
+# +
+msh = create_unit_square(MPI.COMM_WORLD, 96, 96, CellType.triangle, dtype=xdtype)
+# -
+#
+# Here we construct compatible function spaces for the mixed Poisson
+# problem. The `V` Raviart-Thomas ($\mathbb{RT}$) space is a
+# vector-valued $H({\rm div})$ conforming space. The `W` space is a
+# space of discontinuous Lagrange function of degree `k`.
+# ```{note}
+# The $\mathbb{RT}_{k}$ element in DOLFINx/Basix is usually denoted as
+# $\mathbb{RT}_{k-1}$ in the literature.
+# ```
+# In the lowest-order case $k=1$. It can be increased, by the
+# convergence of the iterative solver will degrade.
+
+# +
 k = 1
-Q_el = element("BDMCF", msh.basix_cell(), k, dtype=default_real_type)
-P_el = element("DG", msh.basix_cell(), k - 1, dtype=default_real_type)
-V_el = mixed_element([Q_el, P_el])
-V = fem.functionspace(msh, V_el)
+V = fem.functionspace(msh, element("RT", msh.basix_cell(), k, dtype=xdtype))
+W = fem.functionspace(msh, element("Discontinuous Lagrange", msh.basix_cell(), k - 1, dtype=xdtype))
+# -
+
+# Trial functions for $\sigma$ and $u$ are declared on the space $V$ and
+# $W$, with corresponding test functions $\tau$ and $v$:
+
+# +
+(sigma, u) = TrialFunction(V), TrialFunction(W)
+(tau, v) = TestFunction(V), TestFunction(W)
+# -
 
-(sigma, u) = TrialFunctions(V)
-(tau, v) = TestFunctions(V)
+# The source function is set to be $f = 10\exp(-((x_{0} - 0.5)^2 +
+# (x_{1} - 0.5)^2) / 0.02)$:
 
+# +
 x = SpatialCoordinate(msh)
-f = 10.0 * exp(-((x[0] - 0.5) * (x[0] - 0.5) + (x[1] - 0.5) * (x[1] - 0.5)) / 0.02)
+f = 10 * exp(-((x[0] - 0.5) * (x[0] - 0.5) + (x[1] - 0.5) * (x[1] - 0.5)) / 0.02)
+# -
+
+# We now declare the blocked bilinear and linear forms. The rows of `a`
+# and `L` correspond to the $\tau$ and $v$ test functions, respectively.
+# The columns of `a` correspond to the $\sigma$ and $u$ trial functions,
+# respectively. Note that `a[1][1]` is empty, which is denoted by
+# `None`. This zero block is typical of a saddle-point problem. In the
+# `L[0]` block, the test function $\tau$ is multiplied by a zero
+# `Constant`, which is evaluated at runtime. We do this to preserve
+# knowledge of the test space in the block. *Note that the defined `L`
+# corresponds to $u_{0} = 0$ on $\Gamma_{D}$.*
 
+# +
 dx = Measure("dx", msh)
-a = inner(sigma, tau) * dx + inner(u, div(tau)) * dx + inner(div(sigma), v) * dx
-L = -inner(f, v) * dx
+a = [[inner(sigma, tau) * dx, inner(u, div(tau)) * dx], [inner(div(sigma), v) * dx, None]]
+L = [ZeroBaseForm((tau,)), -inner(f, v) * dx]
+# -
+
+# We now compile the abstract/symbolic forms in `a` and `L` into
+# concrete instanced that can be assembled into matrix operators and
+# vectors, respectively.
+
+# +
+a, L = fem.form(a, dtype=dtype), fem.form(L, dtype=dtype)
+# -
 
-# Get subspace of V
-V0 = V.sub(0)
+# In preparation for Dirichlet boundary conditions, we use the function
+# `locate_entities_boundary` to locate mesh entities (facets) with which
+# degree-of-freedoms to be constrained are associated with, and then use
+# `locate_dofs_topological` to get the  degree-of-freedom indices. Below
+# we identify the degree-of-freedom in `V` on the (i) top ($x_{1} = 1$)
+# and (ii) bottom ($x_{1} = 0$) of the mesh/domain.
 
+# +
 fdim = msh.topology.dim - 1
 facets_top = mesh.locate_entities_boundary(msh, fdim, lambda x: np.isclose(x[1], 1.0))
-Q, _ = V0.collapse()
-dofs_top = fem.locate_dofs_topological((V0, Q), fdim, facets_top)
+facets_bottom = mesh.locate_entities_boundary(msh, fdim, lambda x: np.isclose(x[1], 0.0))
+dofs_top = fem.locate_dofs_topological(V, fdim, facets_top)
+dofs_bottom = fem.locate_dofs_topological(V, fdim, facets_bottom)
+# -
 
+# Now, we create Dirichlet boundary objects for the condition $\sigma
+# \cdot n = \sin(5 x_(0)$ on the top and bottom boundaries:
 
-def f1(x):
-    values = np.zeros((2, x.shape[1]))
-    values[1, :] = np.sin(5 * x[0])
-    return values
+# +
+cells_top_ = mesh.compute_incident_entities(msh.topology, facets_top, fdim, fdim + 1)
+cells_bottom = mesh.compute_incident_entities(msh.topology, facets_bottom, fdim, fdim + 1)
+g = fem.Function(V, dtype=dtype)
+g.interpolate(lambda x: np.vstack((np.zeros_like(x[0]), np.sin(5 * x[0]))), cells0=cells_top_)
+g.interpolate(lambda x: np.vstack((np.zeros_like(x[0]), -np.sin(5 * x[0]))), cells0=cells_bottom)
+bcs = [fem.dirichletbc(g, dofs_top), fem.dirichletbc(g, dofs_bottom)]
+# -
+
+# Assemble the matrix operator `A` into a PETSc 'nested matrix', zero
+# rows and columns associated with a Dirichlet boundary condition and
+# placing 1 on the diagonal for Dirichlet constrained
+# degrees-of-freedom:
 
+# +
+A = fem.petsc.assemble_matrix_nest(a, bcs=bcs)
+A.assemble()
+# -
 
-f_h1 = fem.Function(Q)
-f_h1.interpolate(f1)
-bc_top = fem.dirichletbc(f_h1, dofs_top, V0)
+# Assemble the RHS vector as a 'nested' vector and modify (apply
+# lifting) to account for the effect non-zero Dirichlet boundary
+# conditions. Then set Dirichlet boundary values in the RHS vector `b`:
 
+# +
+b = fem.petsc.assemble_vector_nest(L)
+fem.petsc.apply_lifting_nest(b, a, bcs=bcs)
+for b_sub in b.getNestSubVecs():
+    b_sub.ghostUpdate(addv=PETSc.InsertMode.ADD, mode=PETSc.ScatterMode.REVERSE)
+
+bcs0 = fem.bcs_by_block(fem.extract_function_spaces(L), bcs)
+fem.petsc.set_bc_nest(b, bcs0)
+# -
+
+# Rather than solving the linear system $A x = b$, we will solve the
+# preconditioned problem $P^{-1} A x = P^{-1} b$. Commonly $P = A$, but
+# this does not lead to efficient solvers for saddle point problems.
+#
+# For this problem, we introduce the preconditioner
+# $$
+# a_p((\sigma, u), (\tau, v))
+# = \begin{bmatrix} \int_{\Omega} \sigma \cdot \tau + (\nabla \cdot
+# \sigma) (\nabla \cdot \tau) \ {\rm d} x  & 0 \\ 0 &
+# \int_{\Omega} u \cdot v \ {\rm d} x \end{bmatrix}
+# $$
+# and assemble it into the matrix `P`:
 
-facets_bottom = mesh.locate_entities_boundary(msh, fdim, lambda x: np.isclose(x[1], 0.0))
-dofs_bottom = fem.locate_dofs_topological((V0, Q), fdim, facets_bottom)
+# +
+a_p = fem.form(
+    [[inner(sigma, tau) * dx + inner(div(sigma), div(tau)) * dx, None], [None, inner(u, v) * dx]],
+    dtype=dtype,
+)
+P = fem.petsc.assemble_matrix_nest(a_p, bcs=bcs)
+P.assemble()
+# -
 
+# We now create a PETSc Krylov solver and set the preconditioner (`P`)
+# and operator (`A`) matrices:
 
-def f2(x):
-    values = np.zeros((2, x.shape[1]))
-    values[1, :] = -np.sin(5 * x[0])
-    return values
+# +
+ksp = PETSc.KSP().create(msh.comm)
+ksp.setOperators(A, P)
+ksp.setMonitor(lambda ksp, its, rnorm: print(f"Iteration: {its}, residual: {rnorm}"))
+ksp.setType("gmres")
+ksp.setTolerances(rtol=1e-8)
+ksp.setGMRESRestart(100)
+# -
+
+# To apply different solvers/preconditioners to the blocks of `P`, we
+# set the preconditioner to be a PETSc
+# [`fieldsplit`](https://petsc.org/release/manual/ksp/#sec-block-matrices)
+# (block) type and set the 'splits' between the $\sigma$ and $u$ fields.
 
+# +
+ksp.getPC().setType("fieldsplit")
+ksp.getPC().setFieldSplitType(PETSc.PC.CompositeType.ADDITIVE)
+nested_IS = P.getNestISs()
+ksp.getPC().setFieldSplitIS(("sigma", nested_IS[0][0]), ("u", nested_IS[0][1]))
+ksp_sigma, ksp_u = ksp.getPC().getFieldSplitSubKSP()
+# -
+
+# For the $P_{11}$ block, which is the discontinuous Lagrange mass
+# matrix, we let the preconditioner be the default, which is incomplete
+# LU factorisation and which can solve the block exactly in one
+# iteration. The $P_{00}$ requires careful handling as $H({\rm div})$
+# problems require special preconditioners to be efficient.
+#
+# If PETSc has been configured with Hypre, we use the Hypre `Auxiliary
+# Maxwell Space` (AMS) algebraic multigrid preconditioner. We can use
+# AMS for this $H({\rm div})$-type problem in two-dimensions because
+# $H({\rm div})$ and $H({\rm curl})$ spaces are effectively the same in
+# two-dimensions, just rotated by $\pi/2.
 
-f_h2 = fem.Function(Q)
-f_h2.interpolate(f2)
-bc_bottom = fem.dirichletbc(f_h2, dofs_bottom, V0)
+# +
+pc_sigma = ksp_sigma.getPC()
+if PETSc.Sys().hasExternalPackage("hypre") and not np.issubdtype(dtype, np.complexfloating):
+    pc_sigma.setType("hypre")
+    pc_sigma.setHYPREType("ams")
+
+    opts = PETSc.Options()
+    opts[f"{ksp_sigma.prefix}pc_hypre_ams_cycle_type"] = 7
+    opts[f"{ksp_sigma.prefix}pc_hypre_ams_relax_times"] = 2
+
+    # Construct and set the 'discrete gradient' operator, which maps
+    # grad H1 -> H(curl), i.e. the gradient of a scalar Lagrange space
+    # to a H(curl) space
+    V_H1 = fem.functionspace(msh, element("Lagrange", msh.basix_cell(), k, dtype=xdtype))
+    V_curl = fem.functionspace(msh, element("N1curl", msh.basix_cell(), k, dtype=xdtype))
+    G = discrete_gradient(V_H1, V_curl)
+    G.assemble()
+    pc_sigma.setHYPREDiscreteGradient(G)
+
+    assert k > 0, "Element degree must be at least 1."
+    if k == 1:
+        # For the lowest order base (k=1), we can supply interpolation
+        # of the '1' vectors in the space V. Hypre can then construct
+        # the required operators from G and the '1' vectors.
+        cvec0, cvec1 = fem.Function(V), fem.Function(V)
+        cvec0.interpolate(lambda x: np.vstack((np.ones_like(x[0]), np.zeros_like(x[1]))))
+        cvec1.interpolate(lambda x: np.vstack((np.zeros_like(x[0]), np.ones_like(x[1]))))
+        pc_sigma.setHYPRESetEdgeConstantVectors(cvec0.x.petsc_vec, cvec1.x.petsc_vec, None)
+    else:
+        # For high-order spaces, we must provide the (H1)^d -> H(div)
+        # interpolation operator/matrix
+        V_H1d = fem.functionspace(msh, ("Lagrange", k, (msh.geometry.dim,)))
+        Pi = interpolation_matrix(V_H1d, V)  # (H1)^d -> H(div)
+        Pi.assemble()
+        pc_sigma.setHYPRESetInterpolations(msh.geometry.dim, None, None, Pi, None)
+
+        # High-order elements generally converge less well than the
+        # lowest-order case with algebraic multigrid, so we perform
+        # extra work at the multigrid stage
+        opts[f"{ksp_sigma.prefix}pc_hypre_ams_tol"] = 1e-12
+        opts[f"{ksp_sigma.prefix}pc_hypre_ams_max_iter"] = 3
+
+    ksp_sigma.setFromOptions()
+else:
+    # If Hypre is not available, use LU factorisation on the $P_{00}$
+    # block
+    pc_sigma.setType("lu")
+    use_superlu = PETSc.IntType == np.int64
+    if PETSc.Sys().hasExternalPackage("mumps") and not use_superlu:
+        pc_sigma.setFactorSolverType("mumps")
+    elif PETSc.Sys().hasExternalPackage("superlu_dist"):
+        pc_sigma.setFactorSolverType("superlu_dist")
+# -
+
+# We create finite element functions that will hold the $\sigma$ and $u$
+# solutions:
 
+# +
+sigma, u = fem.Function(V, dtype=dtype), fem.Function(W, dtype=dtype)
+# -
 
-bcs = [bc_top, bc_bottom]
+# Create a PETSc 'nested' vector that holds reference to the `sigma` and
+# `u` solution (degree-of-freedom) vectors and solve.
 
-problem = LinearProblem(
-    a,
-    L,
-    bcs=bcs,
-    petsc_options={
-        "ksp_type": "preonly",
-        "pc_type": "lu",
-        "pc_factor_mat_solver_type": "superlu_dist",
-    },
-)
-try:
-    w_h = problem.solve()
-except PETSc.Error as e:  # type: ignore
-    if e.ierr == 92:
-        print("The required PETSc solver/preconditioner is not available. Exiting.")
-        print(e)
-        exit(0)
-    else:
-        raise e
+# +
+x = PETSc.Vec().createNest([la.create_petsc_vector_wrap(sigma.x), la.create_petsc_vector_wrap(u.x)])
+ksp.solve(b, x)
+reason = ksp.getConvergedReason()
+assert reason > 0, f"Krylov solver has not converged {reason}."
+ksp.view()
+# -
 
-sigma_h, u_h = w_h.split()
+# We save the solution `u` in VTX format:
 
-with io.XDMFFile(msh.comm, "out_mixed_poisson/u.xdmf", "w") as file:
-    file.write_mesh(msh)
-    file.write_function(u_h)
+# +
+try:
+    from dolfinx.io import VTXWriter
+
+    u.name = "u"
+    with VTXWriter(msh.comm, "output_mixed_poisson.bp", u, "bp4") as f:
+        f.write(0.0)
+except ImportError:
+    print("ADIOS2 required for VTX output.")
+# -
diff --git a/python/demo/demo_poisson.py b/python/demo/demo_poisson.py
index 78f3aec3982..38c072211e3 100644
--- a/python/demo/demo_poisson.py
+++ b/python/demo/demo_poisson.py
@@ -190,6 +190,6 @@
     else:
         plotter.show()
 except ModuleNotFoundError:
-    print("'pyvista' is required to visualise the solution")
-    print("Install 'pyvista' with pip: 'python3 -m pip install pyvista'")
+    print("'pyvista' is required to visualise the solution.")
+    print("To install pyvista with pip: 'python3 -m pip install pyvista'.")
 # -
diff --git a/python/demo/demo_stokes.py b/python/demo/demo_stokes.py
index bf89e14fc4f..599063f4fc5 100644
--- a/python/demo/demo_stokes.py
+++ b/python/demo/demo_stokes.py
@@ -116,7 +116,7 @@
 from dolfinx.fem.petsc import assemble_matrix_block, assemble_vector_block
 from dolfinx.io import XDMFFile
 from dolfinx.mesh import CellType, create_rectangle, locate_entities_boundary
-from ufl import div, dx, grad, inner
+from ufl import ZeroBaseForm, div, dx, grad, inner
 
 # We create a {py:class}`Mesh <dolfinx.mesh.Mesh>`, define functions for
 # locating geometrically subsets of the boundary, and define a function
@@ -177,7 +177,7 @@ def lid_velocity_expression(x):
 # -
 
 # The bilinear and linear forms for the Stokes equations are defined
-# using a a blocked structure:
+# using a blocked structure:
 
 # +
 # Define variational problem
@@ -186,7 +186,7 @@ def lid_velocity_expression(x):
 f = Constant(msh, (PETSc.ScalarType(0), PETSc.ScalarType(0)))  # type: ignore
 
 a = form([[inner(grad(u), grad(v)) * dx, inner(p, div(v)) * dx], [inner(div(u), q) * dx, None]])
-L = form([inner(f, v) * dx, inner(Constant(msh, PETSc.ScalarType(0)), q) * dx])  # type: ignore
+L = form([inner(f, v) * dx, ZeroBaseForm((q,))])
 # -
 
 # A block-diagonal preconditioner will be used with the iterative
@@ -440,9 +440,8 @@ def block_direct_solver():
     # handle pressure nullspace
     pc = ksp.getPC()
     pc.setType("lu")
-    sys = PETSc.Sys()  # type: ignore
     use_superlu = PETSc.IntType == np.int64
-    if sys.hasExternalPackage("mumps") and not use_superlu:
+    if PETSc.Sys().hasExternalPackage("mumps") and not use_superlu:
         pc.setFactorSolverType("mumps")
         pc.setFactorSetUpSolverType()
         pc.getFactorMatrix().setMumpsIcntl(icntl=24, ival=1)
@@ -527,9 +526,8 @@ def mixed_direct():
     # Configure MUMPS to handle pressure nullspace
     pc = ksp.getPC()
     pc.setType("lu")
-    sys = PETSc.Sys()  # type: ignore
     use_superlu = PETSc.IntType == np.int64
-    if sys.hasExternalPackage("mumps") and not use_superlu:
+    if PETSc.Sys().hasExternalPackage("mumps") and not use_superlu:
         pc.setFactorSolverType("mumps")
         pc.setFactorSetUpSolverType()
         pc.getFactorMatrix().setMumpsIcntl(icntl=24, ival=1)
diff --git a/python/dolfinx/fem/dofmap.py b/python/dolfinx/fem/dofmap.py
index 0355a2cd860..b8a848170dc 100644
--- a/python/dolfinx/fem/dofmap.py
+++ b/python/dolfinx/fem/dofmap.py
@@ -8,10 +8,10 @@
 
 
 class DofMap:
-    """Degree-of-freedom map
+    """Degree-of-freedom map.
 
-    This class handles the mapping of degrees of freedom. It builds
-    a dof map based on a FiniteElement on a specific mesh.
+    This class handles the mapping of degrees of freedom. It builds a
+    dof map based on a FiniteElement on a specific mesh.
     """
 
     _cpp_object: _cpp.fem.DofMap
@@ -26,13 +26,14 @@ def cell_dofs(self, cell_index: int):
             cell: The cell index.
 
         Returns:
-            Local-global dof map for the cell (using process-local indices).
+            Local-global dof map for the cell (using process-local
+            indices).
         """
         return self._cpp_object.cell_dofs(cell_index)
 
     @property
     def bs(self):
-        """Returns the block size of the dofmap"""
+        """Block size of the dofmap."""
         return self._cpp_object.bs
 
     @property
diff --git a/python/dolfinx/fem/forms.py b/python/dolfinx/fem/forms.py
index bb96eae9261..85ed796d011 100644
--- a/python/dolfinx/fem/forms.py
+++ b/python/dolfinx/fem/forms.py
@@ -1,4 +1,5 @@
-# Copyright (C) 2017-2024 Chris N. Richardson, Garth N. Wells, Michal Habera and Jørgen S. Dokken
+# Copyright (C) 2017-2024 Chris N. Richardson, Garth N. Wells,
+# Michal Habera and Jørgen S. Dokken
 #
 # This file is part of DOLFINx (https://www.fenicsproject.org)
 #
@@ -116,17 +117,17 @@ def get_integration_domains(
 ) -> list[tuple[int, np.ndarray]]:
     """Get integration domains from subdomain data.
 
-    The subdomain data is a meshtags object consisting of markers, or a
-    None object. If it is a None object we do not pack any integration
+    The subdomain data is a MeshTags object consisting of markers, or
+    ``None``. If it is ``None``, we do not pack any integration
     entities. Integration domains are defined as a list of tuples, where
-    each input `subdomain_ids` is mapped to an array of integration
+    each input ``subdomain_ids`` is mapped to an array of integration
     entities, where an integration entity for a cell integral is the
     list of cells. For an exterior facet integral each integration
-    entity is a tuple (cell_index, local_facet_index). For an interior
-    facet integral each integration entity is a uple (cell_index0,
-    local_facet_index0, cell_index1, local_facet_index1). Where the
-    first cell-facet pair is the '+' restriction, the second the '-'
-    restriction.
+    entity is a tuple ``(cell_index, local_facet_index)``. For an
+    interior facet integral each integration entity is a tuple
+    ``(cell_index0, local_facet_index0, cell_index1,
+    local_facet_index1)``. Where the first cell-facet pair is the
+    ``'+'`` restriction, the second the ``'-'`` restriction.
 
     Args:
         integral_type: The type of integral to pack integration
@@ -216,9 +217,9 @@ def form(
             For each key (a mesh, different to the integration domain
             mesh) a map should be provided relating the entities in the
             integration domain mesh to the entities in the key mesh e.g.
-            for a key-value pair (msh, emap) in `entity_maps`, `emap[i]`
-            is the entity in `msh` corresponding to entity `i` in the
-            integration domain mesh.
+            for a key-value pair ``(msh, emap)`` in ``entity_maps``,
+            ``emap[i]`` is the entity in ``msh`` corresponding to entity
+            ``i`` in the integration domain mesh.
 
     Returns:
         Compiled finite element Form.
@@ -246,11 +247,11 @@ def _form(form):
         for data in sd.get(domain).values():
             assert all([d is data[0] for d in data if d is not None])
 
-        mesh = domain.ufl_cargo()
-        if mesh is None:
+        msh = domain.ufl_cargo()
+        if msh is None:
             raise RuntimeError("Expecting to find a Mesh in the form.")
         ufcx_form, module, code = jit.ffcx_jit(
-            mesh.comm, form, form_compiler_options=form_compiler_options, jit_options=jit_options
+            msh.comm, form, form_compiler_options=form_compiler_options, jit_options=jit_options
         )
 
         # For each argument in form extract its function space
@@ -308,10 +309,28 @@ def _form(form):
             constants,
             subdomains,
             _entity_maps,
-            mesh,
+            msh,
         )
         return Form(f, ufcx_form, code, module)
 
+    def _zero_form(form):
+        """Compile a single 'zero' UFL form, i.e. a form with no integrals."""
+        V = [arg.ufl_function_space()._cpp_object for arg in form.arguments()]
+        if entity_maps is None:
+            _entity_maps = dict()
+        else:
+            _entity_maps = {msh._cpp_object: emap for (msh, emap) in entity_maps.items()}
+        f = ftype(
+            spaces=V,
+            integrals={},
+            coefficients=[],
+            constants=[],
+            need_permutation_data=False,
+            entity_maps=_entity_maps,
+            mesh=None,
+        )
+        return Form(f)
+
     def _create_form(form):
         """Recursively convert ufl.Forms to dolfinx.fem.Form.
 
@@ -323,10 +342,9 @@ def _create_form(form):
             A ``dolfinx.fem.Form`` or a list of ``dolfinx.fem.Form``.
         """
         if isinstance(form, ufl.Form):
-            if form.empty():
-                return None
-            else:
-                return _form(form)
+            return _form(form)
+        elif isinstance(form, ufl.ZeroBaseForm):
+            return _zero_form(form)
         elif isinstance(form, collections.abc.Iterable):
             return list(map(lambda sub_form: _create_form(sub_form), form))
         else:
@@ -344,11 +362,11 @@ def extract_function_spaces(
 ) -> typing.Iterable[typing.Union[None, function.FunctionSpace]]:
     """Extract common function spaces from an array of forms.
 
-    If `forms` is a list of linear form, this function returns of list
-    of the corresponding test functions. If `forms` is a 2D array of
-    bilinear forms, for index=0 the list common test function space for
-    each row is returned, and if index=1 the common trial function
-    spaces for each column are returned.
+    If ``forms`` is a list of linear form, this function returns of list
+    of the corresponding test functions. If ``forms`` is a 2D array of
+    bilinear forms, for ``index=0`` the list common test function space
+    for each row is returned, and if ``index=1`` the common trial
+    function spaces for each column are returned.
     """
     _forms = np.array(forms)
     if _forms.ndim == 0:
@@ -456,7 +474,7 @@ def form_cpp_creator(
 def create_form(
     form: CompiledForm,
     function_spaces: list[function.FunctionSpace],
-    mesh: Mesh,
+    msh: Mesh,
     subdomains: dict[IntegralType, list[tuple[int, np.ndarray]]],
     coefficient_map: dict[ufl.Function, function.Function],
     constant_map: dict[ufl.Constant, function.Constant],
@@ -469,7 +487,7 @@ def create_form(
         form: Compiled ufl form
         function_spaces: List of function spaces associated with the
             form. Should match the number of arguments in the form.
-        mesh: Mesh to associate form with
+        msh: Mesh to associate form with
         subdomains: A map from integral type to a list of pairs, where
             each pair corresponds to a subdomain id and the set of of
             integration entities to integrate over. Can be computed with
@@ -477,9 +495,9 @@ def create_form(
         coefficient_map: Map from UFL coefficient to function with data.
         constant_map: Map from UFL constant to constant with data.
         entity_map: A map where each key corresponds to a mesh different
-            to the integration domain `mesh`. The value of the map is an
-            array of integers, where the i-th entry is the entity in the
-            key mesh.
+            to the integration domain ``msh``. The value of the map is
+            an array of integers, where the i-th entry is the entity in
+            the key mesh.
     """
     if entity_maps is None:
         _entity_maps = {}
@@ -506,6 +524,6 @@ def create_form(
         constants,
         _subdomain_data,
         _entity_maps,
-        mesh._cpp_object,
+        msh._cpp_object,
     )
     return Form(f, form.ufcx_form, form.code)
diff --git a/python/dolfinx/fem/function.py b/python/dolfinx/fem/function.py
index 44c1f740bb2..8c8a3277ea3 100644
--- a/python/dolfinx/fem/function.py
+++ b/python/dolfinx/fem/function.py
@@ -316,9 +316,9 @@ def __init__(
             if dtype is None:
                 dtype = default_scalar_type
 
-        assert np.issubdtype(
-            V.element.dtype, np.dtype(dtype).type(0).real.dtype
-        ), "Incompatible FunctionSpace dtype and requested dtype."
+        assert np.issubdtype(V.element.dtype, np.dtype(dtype).type(0).real.dtype), (
+            "Incompatible FunctionSpace dtype and requested dtype."
+        )
 
         # Create cpp Function
         def functiontype(dtype):
@@ -592,9 +592,9 @@ def functionspace(
     element = finiteelement(mesh.topology.cell_type, ufl_e, dtype)
     cpp_dofmap = _cpp.fem.create_dofmap(mesh.comm, mesh.topology._cpp_object, element._cpp_object)
 
-    assert np.issubdtype(
-        mesh.geometry.x.dtype, element.dtype
-    ), "Mesh and element dtype are not compatible."
+    assert np.issubdtype(mesh.geometry.x.dtype, element.dtype), (
+        "Mesh and element dtype are not compatible."
+    )
 
     # Initialize the cpp.FunctionSpace
     try:
@@ -667,11 +667,6 @@ def num_sub_spaces(self) -> int:
         """Number of sub spaces."""
         return self.element.num_sub_elements
 
-    # @property
-    # def value_shape(self) -> tuple[int, ...]:
-    #     """Value shape."""
-    #     return tuple(int(i) for i in self._cpp_object.value_shape)
-
     def sub(self, i: int) -> FunctionSpace:
         """Return the i-th sub space.
 
diff --git a/python/dolfinx/fem/petsc.py b/python/dolfinx/fem/petsc.py
index 9771371c6d8..4305c004ab0 100644
--- a/python/dolfinx/fem/petsc.py
+++ b/python/dolfinx/fem/petsc.py
@@ -143,7 +143,8 @@ def create_vector_block(L: list[Form]) -> PETSc.Vec:
 
 
 def create_vector_nest(L: list[Form]) -> PETSc.Vec:
-    """Create a PETSc nested vector (``VecNest``) that is compatible with a list of linear forms.
+    """Create a PETSc nested vector (``VecNest``) that is compatible
+    with a list of linear forms.
 
     Args:
         L: List of linear forms.
@@ -285,14 +286,16 @@ def _assemble_vector_vec(b: PETSc.Vec, L: Form, constants=None, coeffs=None) ->
 def assemble_vector_nest(L: typing.Any, constants=None, coeffs=None) -> PETSc.Vec:
     """Assemble linear forms into a new nested PETSc (``VecNest``) vector.
 
-    Note:
-        Due to subtle issues in the interaction between petsc4py memory management
-        and the Python garbage collector, it is recommended that the method ``PETSc.Vec.destroy()``
-        is called on the returned object once the object is no longer required. Note that
-        ``PETSc.Vec.destroy()`` is collective over the object's MPI communicator.
-
     The returned vector is not finalised, i.e. ghost values are not
     accumulated on the owning processes.
+
+    Note:
+        Due to subtle issues in the interaction between petsc4py memory
+        management and the Python garbage collector, it is recommended
+        that the method ``PETSc.Vec.destroy()`` is called on the
+        returned object once the object is no longer required. Note that
+        ``PETSc.Vec.destroy()`` is collective over the object's MPI
+        communicator.
     """
     maps = [
         (form.function_spaces[0].dofmap.index_map, form.function_spaces[0].dofmap.index_map_bs)
@@ -337,13 +340,15 @@ def assemble_vector_block(
 ) -> PETSc.Vec:
     """Assemble linear forms into a monolithic vector.
 
-    Note:
-        Due to subtle issues in the interaction between petsc4py memory management
-        and the Python garbage collector, it is recommended that the method ``PETSc.Vec.destroy()``
-        is called on the returned object once the object is no longer required. Note that
-        ``PETSc.Vec.destroy()`` is collective over the object's MPI communicator.
-
     The vector is not finalised, i.e. ghost values are not accumulated.
+
+    Note:
+        Due to subtle issues in the interaction between petsc4py memory
+        management and the Python garbage collector, it is recommended
+        that the method ``PETSc.Vec.destroy()`` is called on the
+        returned object once the object is no longer required. Note that
+        ``PETSc.Vec.destroy()`` is collective over the object's MPI
+        communicator.
     """
     maps = [
         (form.function_spaces[0].dofmap.index_map, form.function_spaces[0].dofmap.index_map_bs)
@@ -460,10 +465,12 @@ def assemble_matrix(
         have not been communicated.
 
     Note:
-        Due to subtle issues in the interaction between petsc4py memory management
-        and the Python garbage collector, it is recommended that the method ``PETSc.Mat.destroy()``
-        is called on the returned object once the object is no longer required. Note that
-        ``PETSc.Mat.destroy()`` is collective over the object's MPI communicator.
+        Due to subtle issues in the interaction between petsc4py memory
+        management and the Python garbage collector, it is recommended
+        that the method ``PETSc.Mat.destroy()`` is called on the
+        returned object once the object is no longer required. Note that
+        ``PETSc.Mat.destroy()`` is collective over the object's MPI
+        communicator.
 
     Args:
         a: Bilinear form to assembled into a matrix.
@@ -520,10 +527,12 @@ def assemble_matrix_nest(
     """Create a nested matrix and assemble bilinear forms into the matrix.
 
     Note:
-        Due to subtle issues in the interaction between petsc4py memory management
-        and the Python garbage collector, it is recommended that the method ``PETSc.Mat.destroy()``
-        is called on the returned object once the object is no longer required. Note that
-        ``PETSc.Mat.destroy()`` is collective over the object's MPI communicator.
+        Due to subtle issues in the interaction between petsc4py memory
+        management and the Python garbage collector, it is recommended
+        that the method ``PETSc.Mat.destroy()`` is called on the
+        returned object once the object is no longer required. Note that
+        ``PETSc.Mat.destroy()`` is collective over the object's MPI
+        communicator.
 
     Args:
         a: Rectangular (list-of-lists) array for bilinear forms.
@@ -615,11 +624,12 @@ def assemble_matrix_block(
     """Assemble bilinear forms into a blocked matrix.
 
     Note:
-        Due to subtle issues in the interaction between petsc4py memory management
-        and the Python garbage collector, it is recommended that the method ``PETSc.Mat.destroy()``
-        is called on the returned object once the object is no longer required. Note that
-        ``PETSc.Mat.destroy()`` is collective over the object's MPI communicator.
-
+        Due to subtle issues in the interaction between petsc4py memory
+        management and the Python garbage collector, it is recommended
+        that the method ``PETSc.Mat.destroy()`` is called on the
+        returned object once the object is no longer required. Note that
+        ``PETSc.Mat.destroy()`` is collective over the object's MPI
+        communicator.
     """
     _a = [[None if form is None else form._cpp_object for form in arow] for arow in a]
     A = _cpp.fem.petsc.create_matrix_block(_a)
@@ -777,7 +787,9 @@ def set_bc_nest(
     x0: typing.Optional[PETSc.Vec] = None,
     alpha: float = 1,
 ) -> None:
-    """Apply the function :func:`dolfinx.fem.set_bc` to each sub-vector of a nested PETSc Vector."""
+    """Apply the function :func:`dolfinx.fem.set_bc` to each sub-vector
+    of a nested PETSc Vector.
+    """
     _b = b.getNestSubVecs()
     x0 = len(_b) * [None] if x0 is None else x0.getNestSubVecs()
     for b_sub, bc, x_sub in zip(_b, bcs, x0):
@@ -962,8 +974,8 @@ def __init__(
                 command line to see all available options.
             jit_options: Options used in CFFI JIT compilation of C
                 code generated by FFCx. See ``python/dolfinx/jit.py``
-                for all available options. Takes priority over all
-                other option values.
+                for all available options. Takes priority over all other
+                option values.
 
         Example::
 
diff --git a/python/dolfinx/graph.py b/python/dolfinx/graph.py
index ebc297ba000..5ad919a1fa2 100644
--- a/python/dolfinx/graph.py
+++ b/python/dolfinx/graph.py
@@ -48,6 +48,9 @@ def __init__(self, cpp_object: Union[_cpp.la.AdjacencyList_int32, _cpp.la.Adjace
         """
         self._cpp_object = cpp_object
 
+    def __repr__(self):
+        return self._cpp_object.__repr__
+
     def links(self, node: Union[np.int32, np.int64]) -> npt.NDArray[Union[np.int32, np.int64]]:
         """Retrieve the links of a node.
 
diff --git a/python/dolfinx/mesh.py b/python/dolfinx/mesh.py
index 9439faa5521..89a1ff69f21 100644
--- a/python/dolfinx/mesh.py
+++ b/python/dolfinx/mesh.py
@@ -366,7 +366,7 @@ def ufl_id(self) -> int:
 
     @property
     def topology(self) -> _cpp.mesh.Topology:
-        """Mesh topology with which the the tags are associated."""
+        """Mesh topology with which the tags are associated."""
         return self._cpp_object.topology
 
     @property
diff --git a/python/dolfinx/plot.py b/python/dolfinx/plot.py
index ef90cb1c2ca..f92499de55f 100644
--- a/python/dolfinx/plot.py
+++ b/python/dolfinx/plot.py
@@ -103,7 +103,8 @@ def _(V: fem.FunctionSpace, entities=None):
         "P",
     ]:
         raise RuntimeError(
-            "Can only create meshes from continuous or discontinuous Lagrange spaces"
+            "Can only create meshes from continuous or discontinuous Lagrange"
+            "spaces, not {V.ufl_element().family_name}."
         )
 
     degree = V.ufl_element().degree
diff --git a/python/test/conftest.py b/python/test/conftest.py
index b55c9a144d9..36a699d0868 100644
--- a/python/test/conftest.py
+++ b/python/test/conftest.py
@@ -192,7 +192,7 @@ def _global_dot(comm, v0, v1):
             p.array[:nr] = beta * p.array[:nr] + r
 
         raise RuntimeError(
-            f"Solver exceeded max iterations ({maxit}). Relative residual={rnorm/rnorm0}."
+            f"Solver exceeded max iterations ({maxit}). Relative residual={rnorm / rnorm0}."
         )
 
     return _cg
diff --git a/python/test/unit/fem/test_assembler.py b/python/test/unit/fem/test_assembler.py
index dc4b9f2efc8..7459af0b54c 100644
--- a/python/test/unit/fem/test_assembler.py
+++ b/python/test/unit/fem/test_assembler.py
@@ -333,9 +333,7 @@ def test_matrix_assembly_block(self, mode):
         # Define variational problem
         u, p, r = ufl.TrialFunction(V0), ufl.TrialFunction(V1), ufl.TrialFunction(V2)
         v, q, s = ufl.TestFunction(V0), ufl.TestFunction(V1), ufl.TestFunction(V2)
-        f = 1.0
         g = -3.0
-        zero = Function(V0)
 
         a00 = inner(u, v) * dx
         a01 = inner(p, v) * dx
@@ -347,7 +345,7 @@ def test_matrix_assembly_block(self, mode):
         a21 = inner(p, s) * dx
         a22 = inner(r, s) * dx
 
-        L0 = zero * inner(f, v) * dx
+        L0 = ufl.ZeroBaseForm((v,))
         L1 = inner(g, q) * dx
         L2 = inner(g, s) * dx
 
@@ -409,7 +407,7 @@ def monolithic():
                 + inner(u2, v0) * dx
                 + inner(u2, v1) * dx
             )
-            L = zero * inner(f, v0) * ufl.dx + inner(g, v1) * dx + inner(g, v2) * dx
+            L = ufl.ZeroBaseForm((v0,)) + inner(g, v1) * dx + inner(g, v2) * dx
             a, L = form(a), form(L)
 
             bdofsW_V1 = locate_dofs_topological(W.sub(1), mesh.topology.dim - 1, bndry_facets)
@@ -476,11 +474,10 @@ def test_assembly_solve_block(self, mode):
         v, q = ufl.TestFunction(V0), ufl.TestFunction(V1)
         f = 1.0
         g = -3.0
-        zero = Function(V0)
 
         a00 = form(inner(u, v) * dx)
-        a01 = form(zero * inner(p, v) * dx)
-        a10 = form(zero * inner(u, q) * dx)
+        a01 = None
+        a10 = None
         a11 = form(inner(p, q) * dx)
         L0 = form(inner(f, v) * dx)
         L1 = form(inner(g, q) * dx)
@@ -654,12 +651,10 @@ def boundary1(x):
         p01, p10 = None, None
         p11 = inner(p, q) * dx
 
-        # FIXME
-        # We need zero function for the 'zero' part of L
-        p_zero = Function(P1)
+        # We need a 'zero' form for the 'zero' part of L
         f = Function(P2)
         L0 = ufl.inner(f, v) * dx
-        L1 = ufl.inner(p_zero, q) * dx
+        L1 = ufl.ZeroBaseForm((q,))
 
         def nested_solve():
             """Nested solver"""
@@ -759,9 +754,8 @@ def monolithic_solve():
             p_form = p00 + p11
 
             f = Function(W.sub(0).collapse()[0])
-            p_zero = Function(W.sub(1).collapse()[0])
             L0 = inner(f, v) * dx
-            L1 = inner(p_zero, q) * dx
+            L1 = ufl.ZeroBaseForm((q,))
             L = L0 + L1
             a, p_form, L = form(a), form(p_form), form(L)
 

From 22a8ec26636e548e9fbe372d015f7c6c96ab79f7 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Paul=20T=2E=20K=C3=BChner?=
 <56360279+schnellerhase@users.noreply.github.com>
Date: Sat, 11 Jan 2025 17:03:35 +0100
Subject: [PATCH 2/9] Fix inconsistent ufl usage in demos (#3532)

* Fix inconsitent ufl import in elasticity example

* Fix inconsistent ufl import in biharmonic example

* Fix inconsistent ufl import in cahn-hilliard example

* Fix inconsistent ufl import in hdg example

* Fix inconsistent ufl import in Helmholtz example

* Fix inconsistent ufl import in Lagrange variants example

* Fix inconsistent ufl import in poisson matrix free example

* Fix inconsistent ufl import in poisson example

* Fix inconsistent ufl import in pyamg example

* Fix inconsistent ufl import in stokes example

* Fix lagrange variants

* Add 'ufl' banned import from

* Switch to consistent 'import ufl' usage in demos

* New ruff version

* Remove comment
---
 python/demo/demo_biharmonic.py          | 17 +++--
 python/demo/demo_cahn-hilliard.py       | 15 ++--
 python/demo/demo_elasticity.py          |  9 ++-
 python/demo/demo_hdg.py                 | 15 ++--
 python/demo/demo_helmholtz.py           | 21 +++---
 python/demo/demo_lagrange_variants.py   |  5 +-
 python/demo/demo_mixed-poisson.py       | 33 ++++-----
 python/demo/demo_navier-stokes.py       | 93 +++++++++++--------------
 python/demo/demo_poisson.py             |  5 +-
 python/demo/demo_poisson_matrix_free.py |  9 ++-
 python/demo/demo_pyamg.py               | 13 ++--
 python/demo/demo_stokes.py              | 19 +++--
 python/demo/demo_tnt-elements.py        | 16 ++---
 python/demo/pyproject.toml              |  5 ++
 14 files changed, 139 insertions(+), 136 deletions(-)
 create mode 100644 python/demo/pyproject.toml

diff --git a/python/demo/demo_biharmonic.py b/python/demo/demo_biharmonic.py
index bf8f0d9c9bc..4ed85951321 100644
--- a/python/demo/demo_biharmonic.py
+++ b/python/demo/demo_biharmonic.py
@@ -131,7 +131,6 @@
 from dolfinx import fem, io, mesh, plot
 from dolfinx.fem.petsc import LinearProblem
 from dolfinx.mesh import CellType, GhostMode
-from ufl import CellDiameter, FacetNormal, avg, div, dS, dx, grad, inner, jump, pi, sin
 
 # -
 
@@ -192,8 +191,8 @@
 # sides of a facet.
 
 alpha = ScalarType(8.0)
-h = CellDiameter(msh)
-n = FacetNormal(msh)
+h = ufl.CellDiameter(msh)
+n = ufl.FacetNormal(msh)
 h_avg = (h("+") + h("-")) / 2.0
 
 # After that, we can define the variational problem consisting of the bilinear
@@ -210,15 +209,15 @@
 u = ufl.TrialFunction(V)
 v = ufl.TestFunction(V)
 x = ufl.SpatialCoordinate(msh)
-f = 4.0 * pi**4 * sin(pi * x[0]) * sin(pi * x[1])
+f = 4.0 * ufl.pi**4 * ufl.sin(ufl.pi * x[0]) * ufl.sin(ufl.pi * x[1])
 
 a = (
-    inner(div(grad(u)), div(grad(v))) * dx
-    - inner(avg(div(grad(u))), jump(grad(v), n)) * dS
-    - inner(jump(grad(u), n), avg(div(grad(v)))) * dS
-    + alpha / h_avg * inner(jump(grad(u), n), jump(grad(v), n)) * dS
+    ufl.inner(ufl.div(ufl.grad(u)), ufl.div(ufl.grad(v))) * ufl.dx
+    - ufl.inner(ufl.avg(ufl.div(ufl.grad(u))), ufl.jump(ufl.grad(v), n)) * ufl.dS
+    - ufl.inner(ufl.jump(ufl.grad(u), n), ufl.avg(ufl.div(ufl.grad(v)))) * ufl.dS
+    + alpha / h_avg * ufl.inner(ufl.jump(ufl.grad(u), n), ufl.jump(ufl.grad(v), n)) * ufl.dS
 )
-L = inner(f, v) * dx
+L = ufl.inner(f, v) * ufl.dx
 # -
 
 # We create a {py:class}`LinearProblem <dolfinx.fem.petsc.LinearProblem>`
diff --git a/python/demo/demo_cahn-hilliard.py b/python/demo/demo_cahn-hilliard.py
index 3cfed7d48e8..85e7aa45e91 100644
--- a/python/demo/demo_cahn-hilliard.py
+++ b/python/demo/demo_cahn-hilliard.py
@@ -145,7 +145,6 @@
 from dolfinx.io import XDMFFile
 from dolfinx.mesh import CellType, create_unit_square
 from dolfinx.nls.petsc import NewtonSolver
-from ufl import dx, grad, inner
 
 try:
     import pyvista as pv
@@ -200,7 +199,7 @@
 c0, mu0 = ufl.split(u0)
 # -
 
-# The line `c, mu = ufl.split(u)` permits direct access to the
+# The line `c, mu = split(u)` permits direct access to the
 # components of a mixed function. Note that `c` and `mu` are references
 # for components of `u`, and not copies.
 #
@@ -249,8 +248,16 @@
 # which is then used in the definition of the variational forms:
 
 # Weak statement of the equations
-F0 = inner(c, q) * dx - inner(c0, q) * dx + dt * inner(grad(mu_mid), grad(q)) * dx
-F1 = inner(mu, v) * dx - inner(dfdc, v) * dx - lmbda * inner(grad(c), grad(v)) * dx
+F0 = (
+    ufl.inner(c, q) * ufl.dx
+    - ufl.inner(c0, q) * ufl.dx
+    + dt * ufl.inner(ufl.grad(mu_mid), ufl.grad(q)) * ufl.dx
+)
+F1 = (
+    ufl.inner(mu, v) * ufl.dx
+    - ufl.inner(dfdc, v) * ufl.dx
+    - lmbda * ufl.inner(ufl.grad(c), ufl.grad(v)) * ufl.dx
+)
 F = F0 + F1
 
 # This is a statement of the time-discrete equations presented as part
diff --git a/python/demo/demo_elasticity.py b/python/demo/demo_elasticity.py
index 8301667b096..fbca3dbdad7 100644
--- a/python/demo/demo_elasticity.py
+++ b/python/demo/demo_elasticity.py
@@ -53,7 +53,6 @@
 from dolfinx.fem.petsc import apply_lifting, assemble_matrix, assemble_vector
 from dolfinx.io import XDMFFile
 from dolfinx.mesh import CellType, GhostMode, create_box, locate_entities_boundary
-from ufl import dx, grad, inner
 
 dtype = PETSc.ScalarType  # type: ignore
 # -
@@ -135,7 +134,7 @@ def build_nullspace(V: FunctionSpace):
 
 def σ(v):
     """Return an expression for the stress σ given a displacement field"""
-    return 2.0 * μ * ufl.sym(grad(v)) + λ * ufl.tr(ufl.sym(grad(v))) * ufl.Identity(len(v))
+    return 2.0 * μ * ufl.sym(ufl.grad(v)) + λ * ufl.tr(ufl.sym(ufl.grad(v))) * ufl.Identity(len(v))
 
 
 # -
@@ -146,8 +145,8 @@ def σ(v):
 
 V = functionspace(msh, ("Lagrange", 1, (msh.geometry.dim,)))
 u, v = ufl.TrialFunction(V), ufl.TestFunction(V)
-a = form(inner(σ(u), grad(v)) * dx)
-L = form(inner(f, v) * dx)
+a = form(ufl.inner(σ(u), ufl.grad(v)) * ufl.dx)
+L = form(ufl.inner(f, v) * ufl.dx)
 
 # A homogeneous (zero) boundary condition is created on $x_0 = 0$ and
 # $x_1 = 1$ by finding all facets on these boundaries, and then creating
@@ -240,7 +239,7 @@ def σ(v):
 
 # +
 sigma_dev = σ(uh) - (1 / 3) * ufl.tr(σ(uh)) * ufl.Identity(len(uh))
-sigma_vm = ufl.sqrt((3 / 2) * inner(sigma_dev, sigma_dev))
+sigma_vm = ufl.sqrt((3 / 2) * ufl.inner(sigma_dev, sigma_dev))
 # -
 
 # Next, the Von Mises stress is interpolated in a piecewise-constant
diff --git a/python/demo/demo_hdg.py b/python/demo/demo_hdg.py
index 8e0e54a6869..c40ef304d3e 100644
--- a/python/demo/demo_hdg.py
+++ b/python/demo/demo_hdg.py
@@ -41,7 +41,6 @@
 import ufl
 from dolfinx import fem, mesh
 from dolfinx.cpp.mesh import cell_num_entities
-from ufl import div, dot, grad, inner
 
 
 def par_print(comm, string):
@@ -147,17 +146,17 @@ def u_e(x):
 x = ufl.SpatialCoordinate(msh)
 c = 1.0 + 0.1 * ufl.sin(ufl.pi * x[0]) * ufl.sin(ufl.pi * x[1])
 a = (
-    inner(c * grad(u), grad(v)) * dx_c
-    - inner(c * (u - ubar), dot(grad(v), n)) * ds_c(cell_boundaries)
-    - inner(dot(grad(u), n), c * (v - vbar)) * ds_c(cell_boundaries)
-    + gamma * inner(c * (u - ubar), v - vbar) * ds_c(cell_boundaries)
+    ufl.inner(c * ufl.grad(u), ufl.grad(v)) * dx_c
+    - ufl.inner(c * (u - ubar), ufl.dot(ufl.grad(v), n)) * ds_c(cell_boundaries)
+    - ufl.inner(ufl.dot(ufl.grad(u), n), c * (v - vbar)) * ds_c(cell_boundaries)
+    + gamma * ufl.inner(c * (u - ubar), v - vbar) * ds_c(cell_boundaries)
 )
 
 # Manufacture a source term
-f = -div(c * grad(u_e(x)))
+f = -ufl.div(c * ufl.grad(u_e(x)))
 
-L = inner(f, v) * dx_c
-L += inner(fem.Constant(facet_mesh, dtype(0.0)), vbar) * dx_f
+L = ufl.inner(f, v) * dx_c
+L += ufl.inner(fem.Constant(facet_mesh, dtype(0.0)), vbar) * dx_f
 
 # Define block structure
 a_blocked = dolfinx.fem.form(ufl.extract_blocks(a), entity_maps=entity_maps)
diff --git a/python/demo/demo_helmholtz.py b/python/demo/demo_helmholtz.py
index 0c9ae774bd2..051ba18b2bd 100644
--- a/python/demo/demo_helmholtz.py
+++ b/python/demo/demo_helmholtz.py
@@ -51,7 +51,6 @@
 from dolfinx.fem.petsc import LinearProblem
 from dolfinx.io import XDMFFile
 from dolfinx.mesh import create_unit_square
-from ufl import FacetNormal, dot, ds, dx, grad, inner
 
 # Wavenumber
 k0 = 4 * np.pi
@@ -79,14 +78,14 @@
 
 # Define variational problem:
 u, v = ufl.TrialFunction(V), ufl.TestFunction(V)
-a = inner(grad(u), grad(v)) * dx - k0**2 * inner(u, v) * dx
+a = ufl.inner(ufl.grad(u), ufl.grad(v)) * ufl.dx - k0**2 * ufl.inner(u, v) * ufl.dx
 
 # solve for plane wave with mixed Dirichlet and Neumann BCs
 theta = np.pi / 4
 u_exact.interpolate(lambda x: A * np.exp(1j * k0 * (np.cos(theta) * x[0] + np.sin(theta) * x[1])))
-n = FacetNormal(msh)
-g = -dot(n, grad(u_exact))
-L = -inner(g, v) * ds
+n = ufl.FacetNormal(msh)
+g = -ufl.dot(n, ufl.grad(u_exact))
+L = -ufl.inner(g, v) * ufl.ds
 
 
 dofs_D = locate_dofs_geometrical(
@@ -122,13 +121,17 @@
 
 # H1 errors
 diff = uh - u_exact
-H1_diff = msh.comm.allreduce(assemble_scalar(form(inner(grad(diff), grad(diff)) * dx)), op=MPI.SUM)
+H1_diff = msh.comm.allreduce(
+    assemble_scalar(form(ufl.inner(ufl.grad(diff), ufl.grad(diff)) * ufl.dx)), op=MPI.SUM
+)
 H1_exact = msh.comm.allreduce(
-    assemble_scalar(form(inner(grad(u_exact), grad(u_exact)) * dx)), op=MPI.SUM
+    assemble_scalar(form(ufl.inner(ufl.grad(u_exact), ufl.grad(u_exact)) * ufl.dx)), op=MPI.SUM
 )
 print("Relative H1 error of FEM solution:", abs(np.sqrt(H1_diff) / np.sqrt(H1_exact)))
 
 # L2 errors
-L2_diff = msh.comm.allreduce(assemble_scalar(form(inner(diff, diff) * dx)), op=MPI.SUM)
-L2_exact = msh.comm.allreduce(assemble_scalar(form(inner(u_exact, u_exact) * dx)), op=MPI.SUM)
+L2_diff = msh.comm.allreduce(assemble_scalar(form(ufl.inner(diff, diff) * ufl.dx)), op=MPI.SUM)
+L2_exact = msh.comm.allreduce(
+    assemble_scalar(form(ufl.inner(u_exact, u_exact) * ufl.dx)), op=MPI.SUM
+)
 print("Relative L2 error of FEM solution:", abs(np.sqrt(L2_diff) / np.sqrt(L2_exact)))
diff --git a/python/demo/demo_lagrange_variants.py b/python/demo/demo_lagrange_variants.py
index 9348aa86d29..ef7153be181 100644
--- a/python/demo/demo_lagrange_variants.py
+++ b/python/demo/demo_lagrange_variants.py
@@ -24,9 +24,8 @@
 
 import basix
 import basix.ufl
-import ufl  # type: ignore
+import ufl
 from dolfinx import default_real_type, fem, mesh
-from ufl import dx
 
 # -
 
@@ -175,7 +174,7 @@ def saw_tooth(x):
     V = fem.functionspace(msh, ufl_element)
     uh = fem.Function(V)
     uh.interpolate(lambda x: saw_tooth(x[0]))
-    M = fem.form((u_exact - uh) ** 2 * dx)
+    M = fem.form((u_exact - uh) ** 2 * ufl.dx)
     error = msh.comm.allreduce(fem.assemble_scalar(M), op=MPI.SUM)
     print(f"Computed L2 interpolation error ({variant.name}):", error**0.5)
 # -
diff --git a/python/demo/demo_mixed-poisson.py b/python/demo/demo_mixed-poisson.py
index 51086346b19..44eb2a91032 100644
--- a/python/demo/demo_mixed-poisson.py
+++ b/python/demo/demo_mixed-poisson.py
@@ -110,20 +110,11 @@
 import numpy as np
 
 import dolfinx.fem.petsc
+import ufl
 from basix.ufl import element
 from dolfinx import fem, la, mesh
 from dolfinx.fem.petsc import discrete_gradient, interpolation_matrix
 from dolfinx.mesh import CellType, create_unit_square
-from ufl import (
-    Measure,
-    SpatialCoordinate,
-    TestFunction,
-    TrialFunction,
-    ZeroBaseForm,
-    div,
-    exp,
-    inner,
-)
 
 # Solution scalar (e.g., float32, complex128) and geometry (float32/64)
 # types
@@ -161,16 +152,16 @@
 # $W$, with corresponding test functions $\tau$ and $v$:
 
 # +
-(sigma, u) = TrialFunction(V), TrialFunction(W)
-(tau, v) = TestFunction(V), TestFunction(W)
+(sigma, u) = ufl.TrialFunction(V), ufl.TrialFunction(W)
+(tau, v) = ufl.TestFunction(V), ufl.TestFunction(W)
 # -
 
 # The source function is set to be $f = 10\exp(-((x_{0} - 0.5)^2 +
 # (x_{1} - 0.5)^2) / 0.02)$:
 
 # +
-x = SpatialCoordinate(msh)
-f = 10 * exp(-((x[0] - 0.5) * (x[0] - 0.5) + (x[1] - 0.5) * (x[1] - 0.5)) / 0.02)
+x = ufl.SpatialCoordinate(msh)
+f = 10 * ufl.exp(-((x[0] - 0.5) * (x[0] - 0.5) + (x[1] - 0.5) * (x[1] - 0.5)) / 0.02)
 # -
 
 # We now declare the blocked bilinear and linear forms. The rows of `a`
@@ -184,9 +175,12 @@
 # corresponds to $u_{0} = 0$ on $\Gamma_{D}$.*
 
 # +
-dx = Measure("dx", msh)
-a = [[inner(sigma, tau) * dx, inner(u, div(tau)) * dx], [inner(div(sigma), v) * dx, None]]
-L = [ZeroBaseForm((tau,)), -inner(f, v) * dx]
+dx = ufl.Measure("dx", msh)
+a = [
+    [ufl.inner(sigma, tau) * dx, ufl.inner(u, ufl.div(tau)) * dx],
+    [ufl.inner(ufl.div(sigma), v) * dx, None],
+]
+L = [ufl.ZeroBaseForm((tau,)), -ufl.inner(f, v) * dx]
 # -
 
 # We now compile the abstract/symbolic forms in `a` and `L` into
@@ -263,7 +257,10 @@
 
 # +
 a_p = fem.form(
-    [[inner(sigma, tau) * dx + inner(div(sigma), div(tau)) * dx, None], [None, inner(u, v) * dx]],
+    [
+        [ufl.inner(sigma, tau) * dx + ufl.inner(ufl.div(sigma), ufl.div(tau)) * dx, None],
+        [None, ufl.inner(u, v) * dx],
+    ],
     dtype=dtype,
 )
 P = fem.petsc.assemble_matrix_nest(a_p, bcs=bcs)
diff --git a/python/demo/demo_navier-stokes.py b/python/demo/demo_navier-stokes.py
index a48b9aa3a77..ec5bd5d3ce7 100644
--- a/python/demo/demo_navier-stokes.py
+++ b/python/demo/demo_navier-stokes.py
@@ -179,27 +179,9 @@
 # +
 import numpy as np
 
+import ufl
 from dolfinx import default_real_type, fem, io, mesh
 from dolfinx.fem.petsc import assemble_matrix_block, assemble_vector_block
-from ufl import (
-    CellDiameter,
-    FacetNormal,
-    MixedFunctionSpace,
-    TestFunctions,
-    TrialFunctions,
-    avg,
-    conditional,
-    div,
-    dot,
-    dS,
-    ds,
-    dx,
-    extract_blocks,
-    grad,
-    gt,
-    inner,
-    outer,
-)
 
 try:
     from petsc4py import PETSc
@@ -225,15 +207,18 @@
 # +
 def norm_L2(comm, v):
     """Compute the L2(Ω)-norm of v"""
-    return np.sqrt(comm.allreduce(fem.assemble_scalar(fem.form(inner(v, v) * dx)), op=MPI.SUM))
+    return np.sqrt(
+        comm.allreduce(fem.assemble_scalar(fem.form(ufl.inner(v, v) * ufl.dx)), op=MPI.SUM)
+    )
 
 
 def domain_average(msh, v):
     """Compute the average of a function over the domain"""
     vol = msh.comm.allreduce(
-        fem.assemble_scalar(fem.form(fem.Constant(msh, default_real_type(1.0)) * dx)), op=MPI.SUM
+        fem.assemble_scalar(fem.form(fem.Constant(msh, default_real_type(1.0)) * ufl.dx)),
+        op=MPI.SUM,
     )
-    return (1 / vol) * msh.comm.allreduce(fem.assemble_scalar(fem.form(v * dx)), op=MPI.SUM)
+    return (1 / vol) * msh.comm.allreduce(fem.assemble_scalar(fem.form(v * ufl.dx)), op=MPI.SUM)
 
 
 def u_e_expr(x):
@@ -282,7 +267,7 @@ def f_expr(x):
 # Function spaces for the velocity and for the pressure
 V = fem.functionspace(msh, ("Raviart-Thomas", k + 1))
 Q = fem.functionspace(msh, ("Discontinuous Lagrange", k))
-VQ = MixedFunctionSpace(V, Q)
+VQ = ufl.MixedFunctionSpace(V, Q)
 
 # Function space for visualising the velocity field
 gdim = msh.geometry.dim
@@ -290,18 +275,18 @@ def f_expr(x):
 
 # Define trial and test functions
 
-u, p = TrialFunctions(VQ)
-v, q = TestFunctions(VQ)
+u, p = ufl.TrialFunctions(VQ)
+v, q = ufl.TestFunctions(VQ)
 
 delta_t = fem.Constant(msh, default_real_type(t_end / num_time_steps))
 alpha = fem.Constant(msh, default_real_type(6.0 * k**2))
 
-h = CellDiameter(msh)
-n = FacetNormal(msh)
+h = ufl.CellDiameter(msh)
+n = ufl.FacetNormal(msh)
 
 
 def jump(phi, n):
-    return outer(phi("+"), n("+")) + outer(phi("-"), n("-"))
+    return ufl.outer(phi("+"), n("+")) + ufl.outer(phi("-"), n("-"))
 
 
 # -
@@ -311,27 +296,28 @@ def jump(phi, n):
 
 # +
 a = (1.0 / Re) * (
-    inner(grad(u), grad(v)) * dx
-    - inner(avg(grad(u)), jump(v, n)) * dS
-    - inner(jump(u, n), avg(grad(v))) * dS
-    + (alpha / avg(h)) * inner(jump(u, n), jump(v, n)) * dS
-    - inner(grad(u), outer(v, n)) * ds
-    - inner(outer(u, n), grad(v)) * ds
-    + (alpha / h) * inner(outer(u, n), outer(v, n)) * ds
+    ufl.inner(ufl.grad(u), ufl.grad(v)) * ufl.dx
+    - ufl.inner(ufl.avg(ufl.grad(u)), jump(v, n)) * ufl.dS
+    - ufl.inner(jump(u, n), ufl.avg(ufl.grad(v))) * ufl.dS
+    + (alpha / ufl.avg(h)) * ufl.inner(jump(u, n), jump(v, n)) * ufl.dS
+    - ufl.inner(ufl.grad(u), ufl.outer(v, n)) * ufl.ds
+    - ufl.inner(ufl.outer(u, n), ufl.grad(v)) * ufl.ds
+    + (alpha / h) * ufl.inner(ufl.outer(u, n), ufl.outer(v, n)) * ufl.ds
 )
-a -= inner(p, div(v)) * dx
-a -= inner(div(u), q) * dx
+a -= ufl.inner(p, ufl.div(v)) * ufl.dx
+a -= ufl.inner(ufl.div(u), q) * ufl.dx
 
-a_blocked = fem.form(extract_blocks(a))
+a_blocked = fem.form(ufl.extract_blocks(a))
 
 f = fem.Function(W)
 u_D = fem.Function(V)
 u_D.interpolate(u_e_expr)
-L = inner(f, v) * dx + (1 / Re) * (
-    -inner(outer(u_D, n), grad(v)) * ds + (alpha / h) * inner(outer(u_D, n), outer(v, n)) * ds
+L = ufl.inner(f, v) * ufl.dx + (1 / Re) * (
+    -ufl.inner(ufl.outer(u_D, n), ufl.grad(v)) * ufl.ds
+    + (alpha / h) * ufl.inner(ufl.outer(u_D, n), ufl.outer(v, n)) * ufl.ds
 )
-L += inner(fem.Constant(msh, default_real_type(0.0)), q) * dx
-L_blocked = fem.form(extract_blocks(L))
+L += ufl.inner(fem.Constant(msh, default_real_type(0.0)), q) * ufl.dx
+L_blocked = fem.form(ufl.extract_blocks(L))
 
 # Boundary conditions
 boundary_facets = mesh.exterior_facet_indices(msh.topology)
@@ -404,19 +390,22 @@ def jump(phi, n):
 # Now we add the time stepping and convective terms
 
 # +
-lmbda = conditional(gt(dot(u_n, n), 0), 1, 0)
+lmbda = ufl.conditional(ufl.gt(ufl.dot(u_n, n), 0), 1, 0)
 u_uw = lmbda("+") * u("+") + lmbda("-") * u("-")
 a += (
-    inner(u / delta_t, v) * dx
-    - inner(u, div(outer(v, u_n))) * dx
-    + inner((dot(u_n, n))("+") * u_uw, v("+")) * dS
-    + inner((dot(u_n, n))("-") * u_uw, v("-")) * dS
-    + inner(dot(u_n, n) * lmbda * u, v) * ds
+    ufl.inner(u / delta_t, v) * ufl.dx
+    - ufl.inner(u, ufl.div(ufl.outer(v, u_n))) * ufl.dx
+    + ufl.inner((ufl.dot(u_n, n))("+") * u_uw, v("+")) * ufl.dS
+    + ufl.inner((ufl.dot(u_n, n))("-") * u_uw, v("-")) * ufl.dS
+    + ufl.inner(ufl.dot(u_n, n) * lmbda * u, v) * ufl.ds
 )
-a_blocked = fem.form(extract_blocks(a))
+a_blocked = fem.form(ufl.extract_blocks(a))
 
-L += inner(u_n / delta_t, v) * dx - inner(dot(u_n, n) * (1 - lmbda) * u_D, v) * ds
-L_blocked = fem.form(extract_blocks(L))
+L += (
+    ufl.inner(u_n / delta_t, v) * ufl.dx
+    - ufl.inner(ufl.dot(u_n, n) * (1 - lmbda) * u_D, v) * ufl.ds
+)
+L_blocked = fem.form(ufl.extract_blocks(L))
 
 # Time stepping loop
 for n in range(num_time_steps):
@@ -473,7 +462,7 @@ def jump(phi, n):
 
 # Compute errors
 e_u = norm_L2(msh.comm, u_h - u_e)
-e_div_u = norm_L2(msh.comm, div(u_h))
+e_div_u = norm_L2(msh.comm, ufl.div(u_h))
 
 # This scheme conserves mass exactly, so check this
 assert np.isclose(e_div_u, 0.0, atol=float(1.0e5 * np.finfo(default_real_type).eps))
diff --git a/python/demo/demo_poisson.py b/python/demo/demo_poisson.py
index 38c072211e3..0ee822c719d 100644
--- a/python/demo/demo_poisson.py
+++ b/python/demo/demo_poisson.py
@@ -85,7 +85,6 @@
 import ufl
 from dolfinx import fem, io, mesh, plot
 from dolfinx.fem.petsc import LinearProblem
-from ufl import ds, dx, grad, inner
 
 # -
 
@@ -146,8 +145,8 @@
 x = ufl.SpatialCoordinate(msh)
 f = 10 * ufl.exp(-((x[0] - 0.5) ** 2 + (x[1] - 0.5) ** 2) / 0.02)
 g = ufl.sin(5 * x[0])
-a = inner(grad(u), grad(v)) * dx
-L = inner(f, v) * dx + inner(g, v) * ds
+a = ufl.inner(ufl.grad(u), ufl.grad(v)) * ufl.dx
+L = ufl.inner(f, v) * ufl.dx + ufl.inner(g, v) * ufl.ds
 # -
 
 # A {py:class}`LinearProblem <dolfinx.fem.petsc.LinearProblem>` object is
diff --git a/python/demo/demo_poisson_matrix_free.py b/python/demo/demo_poisson_matrix_free.py
index b29d9980403..fa5fe103bdd 100644
--- a/python/demo/demo_poisson_matrix_free.py
+++ b/python/demo/demo_poisson_matrix_free.py
@@ -83,7 +83,6 @@
 import dolfinx
 import ufl
 from dolfinx import fem, la
-from ufl import action, dx, grad, inner
 
 # We begin by using {py:func}`create_rectangle
 # <dolfinx.mesh.create_rectangle>` to create a rectangular
@@ -138,8 +137,8 @@
 u = ufl.TrialFunction(V)
 v = ufl.TestFunction(V)
 f = fem.Constant(mesh, dtype(-6.0))
-a = inner(grad(u), grad(v)) * dx
-L = inner(f, v) * dx
+a = ufl.inner(ufl.grad(u), ufl.grad(v)) * ufl.dx
+L = ufl.inner(f, v) * ufl.dx
 L_fem = fem.form(L, dtype=dtype)
 
 # For the matrix-free solvers we also define a second linear form `M` as
@@ -152,7 +151,7 @@
 # $$
 
 ui = fem.Function(V, dtype=dtype)
-M = action(a, ui)
+M = ufl.action(a, ui)
 M_fem = fem.form(M, dtype=dtype)
 
 # ### Matrix-free conjugate gradient solver
@@ -254,7 +253,7 @@ def _global_dot(comm, v0, v1):
 
 
 def L2Norm(u):
-    val = fem.assemble_scalar(fem.form(inner(u, u) * dx, dtype=dtype))
+    val = fem.assemble_scalar(fem.form(ufl.inner(u, u) * ufl.dx, dtype=dtype))
     return np.sqrt(comm.allreduce(val, op=MPI.SUM))
 
 
diff --git a/python/demo/demo_pyamg.py b/python/demo/demo_pyamg.py
index 69de02486fe..dab612f9a1f 100644
--- a/python/demo/demo_pyamg.py
+++ b/python/demo/demo_pyamg.py
@@ -48,7 +48,6 @@
     locate_dofs_topological,
 )
 from dolfinx.mesh import CellType, create_box, locate_entities_boundary
-from ufl import ds, dx, grad, inner
 
 # -
 
@@ -88,8 +87,8 @@ def poisson_problem(dtype: npt.DTypeLike, solver_type: str):
     x = ufl.SpatialCoordinate(mesh)
     f = 10 * ufl.exp(-((x[0] - 0.5) ** 2 + (x[1] - 0.5) ** 2) / 0.02)
     g = ufl.sin(5 * x[0])
-    a = form(inner(grad(u), grad(v)) * dx, dtype=dtype)
-    L = form(inner(f, v) * dx + inner(g, v) * ds, dtype=dtype)
+    a = form(ufl.inner(ufl.grad(u), ufl.grad(v)) * ufl.dx, dtype=dtype)
+    L = form(ufl.inner(f, v) * ufl.dx + ufl.inner(g, v) * ufl.ds, dtype=dtype)
 
     A = assemble_matrix(a, [bc]).to_scipy()
     b = assemble_vector(L)
@@ -193,12 +192,14 @@ def elasticity_problem(dtype):
 
     def σ(v):
         """Return an expression for the stress σ given a displacement field"""
-        return 2.0 * μ * ufl.sym(grad(v)) + λ * ufl.tr(ufl.sym(grad(v))) * ufl.Identity(len(v))
+        return 2.0 * μ * ufl.sym(ufl.grad(v)) + λ * ufl.tr(ufl.sym(ufl.grad(v))) * ufl.Identity(
+            len(v)
+        )
 
     V = functionspace(mesh, ("Lagrange", 1, (mesh.geometry.dim,)))
     u, v = ufl.TrialFunction(V), ufl.TestFunction(V)
-    a = form(inner(σ(u), grad(v)) * dx, dtype=dtype)
-    L = form(inner(f, v) * dx, dtype=dtype)
+    a = form(ufl.inner(σ(u), ufl.grad(v)) * ufl.dx, dtype=dtype)
+    L = form(ufl.inner(f, v) * ufl.dx, dtype=dtype)
 
     tdim = mesh.topology.dim
     dofs = locate_dofs_topological(V=V, entity_dim=tdim - 1, entities=facets)
diff --git a/python/demo/demo_stokes.py b/python/demo/demo_stokes.py
index 599063f4fc5..5b5df67b4c1 100644
--- a/python/demo/demo_stokes.py
+++ b/python/demo/demo_stokes.py
@@ -116,7 +116,6 @@
 from dolfinx.fem.petsc import assemble_matrix_block, assemble_vector_block
 from dolfinx.io import XDMFFile
 from dolfinx.mesh import CellType, create_rectangle, locate_entities_boundary
-from ufl import ZeroBaseForm, div, dx, grad, inner
 
 # We create a {py:class}`Mesh <dolfinx.mesh.Mesh>`, define functions for
 # locating geometrically subsets of the boundary, and define a function
@@ -185,14 +184,19 @@ def lid_velocity_expression(x):
 (v, q) = ufl.TestFunction(V), ufl.TestFunction(Q)
 f = Constant(msh, (PETSc.ScalarType(0), PETSc.ScalarType(0)))  # type: ignore
 
-a = form([[inner(grad(u), grad(v)) * dx, inner(p, div(v)) * dx], [inner(div(u), q) * dx, None]])
-L = form([inner(f, v) * dx, ZeroBaseForm((q,))])
+a = form(
+    [
+        [ufl.inner(ufl.grad(u), ufl.grad(v)) * ufl.dx, ufl.inner(p, ufl.div(v)) * ufl.dx],
+        [ufl.inner(ufl.div(u), q) * ufl.dx, None],
+    ]
+)
+L = form([ufl.inner(f, v) * ufl.dx, ufl.ZeroBaseForm((q,))])
 # -
 
 # A block-diagonal preconditioner will be used with the iterative
 # solvers for this problem:
 
-a_p11 = form(inner(p, q) * dx)
+a_p11 = form(ufl.inner(p, q) * ufl.dx)
 a_p = [[a[0][0], None], [None, a_p11]]
 
 # ### Nested matrix solver
@@ -503,8 +507,11 @@ def mixed_direct():
     (u, p) = ufl.TrialFunctions(W)
     (v, q) = ufl.TestFunctions(W)
     f = Function(Q)
-    a = form((inner(grad(u), grad(v)) + inner(p, div(v)) + inner(div(u), q)) * dx)
-    L = form(inner(f, v) * dx)
+    a = form(
+        (ufl.inner(ufl.grad(u), ufl.grad(v)) + ufl.inner(p, ufl.div(v)) + ufl.inner(ufl.div(u), q))
+        * ufl.dx
+    )
+    L = form(ufl.inner(f, v) * ufl.dx)
 
     # Assemble LHS matrix and RHS vector
     A = fem.petsc.assemble_matrix(a, bcs=bcs)
diff --git a/python/demo/demo_tnt-elements.py b/python/demo/demo_tnt-elements.py
index 06eadf560b6..810d18533a7 100644
--- a/python/demo/demo_tnt-elements.py
+++ b/python/demo/demo_tnt-elements.py
@@ -41,9 +41,9 @@
 
 import basix
 import basix.ufl
+import ufl
 from dolfinx import default_real_type, fem, mesh
 from dolfinx.fem.petsc import LinearProblem
-from ufl import SpatialCoordinate, TestFunction, TrialFunction, cos, div, dx, grad, inner, sin
 
 mpl.use("agg")
 # -
@@ -256,14 +256,14 @@ def create_tnt_quad(degree):
 
 def poisson_error(V: fem.FunctionSpace):
     msh = V.mesh
-    u, v = TrialFunction(V), TestFunction(V)
+    u, v = ufl.TrialFunction(V), ufl.TestFunction(V)
 
-    x = SpatialCoordinate(msh)
-    u_exact = sin(10 * x[1]) * cos(15 * x[0])
-    f = -div(grad(u_exact))
+    x = ufl.SpatialCoordinate(msh)
+    u_exact = ufl.sin(10 * x[1]) * ufl.cos(15 * x[0])
+    f = -ufl.div(ufl.grad(u_exact))
 
-    a = inner(grad(u), grad(v)) * dx
-    L = inner(f, v) * dx
+    a = ufl.inner(ufl.grad(u), ufl.grad(v)) * ufl.dx
+    L = ufl.inner(f, v) * ufl.dx
 
     # Create Dirichlet boundary condition
     u_bc = fem.Function(V)
@@ -278,7 +278,7 @@ def poisson_error(V: fem.FunctionSpace):
     problem = LinearProblem(a, L, bcs=[bc], petsc_options={"ksp_rtol": 1e-12})
     uh = problem.solve()
 
-    M = (u_exact - uh) ** 2 * dx
+    M = (u_exact - uh) ** 2 * ufl.dx
     M = fem.form(M)
     error = msh.comm.allreduce(fem.assemble_scalar(M), op=MPI.SUM)
     return error**0.5
diff --git a/python/demo/pyproject.toml b/python/demo/pyproject.toml
new file mode 100644
index 00000000000..28e7749be23
--- /dev/null
+++ b/python/demo/pyproject.toml
@@ -0,0 +1,5 @@
+[tool.ruff]
+extend = "../pyproject.toml"
+
+[tool.ruff.lint.flake8-import-conventions]
+banned-from = ["ufl"]

From 7dee60cc769342bab35c8ee3c4c8a1df02a285be Mon Sep 17 00:00:00 2001
From: Chris Richardson <chris@bpi.cam.ac.uk>
Date: Sat, 11 Jan 2025 19:40:11 +0000
Subject: [PATCH 3/9] Move SpMV from tests into MatrixCSR (#3591)

* Move SPMV to MatrixCSR from test

* ruff

* Add to python

* Template block size

* Put back c++20 in tests

* Rename as Scalar for doxygen

* Try to get BLAS

* Simplifications

* Small updates

---------

Co-authored-by: Garth N. Wells <gnw20@cam.ac.uk>
---
 cpp/dolfinx/la/MatrixCSR.h                    |  83 ++++++++
 cpp/dolfinx/la/matrix_csr_impl.h              |  64 ++++--
 cpp/test/CMakeLists.txt                       |   4 +-
 cpp/test/matrix.cpp                           |  84 +-------
 python/dolfinx/la.py                          | 183 +++++++++---------
 python/dolfinx/wrappers/la.cpp                |   1 +
 .../test/unit/fem/test_discrete_operators.py  |  11 +-
 python/test/unit/la/test_matrix_vector.py     |  32 +++
 8 files changed, 271 insertions(+), 191 deletions(-)

diff --git a/cpp/dolfinx/la/MatrixCSR.h b/cpp/dolfinx/la/MatrixCSR.h
index 1a6738d676a..c15e7248dd5 100644
--- a/cpp/dolfinx/la/MatrixCSR.h
+++ b/cpp/dolfinx/la/MatrixCSR.h
@@ -7,6 +7,7 @@
 #pragma once
 
 #include "SparsityPattern.h"
+#include "Vector.h"
 #include "matrix_csr_impl.h"
 #include <algorithm>
 #include <dolfinx/common/IndexMap.h>
@@ -322,6 +323,15 @@ class MatrixCSR
   /// @note MPI Collective
   double squared_norm() const;
 
+  /// @brief Compute the product `y += Ax`.
+  ///
+  /// The vectors `x` and `y` must have parallel layouts that are
+  /// compatible with `A`.
+  ///
+  /// @param[in] x Vector to be apply `A` to.
+  /// @param[in,out] y Vector to accumulate the result into.
+  void mult(Vector<value_type>& x, Vector<value_type>& y);
+
   /// @brief Index maps for the row and column space.
   ///
   /// The row IndexMap contains ghost entries for rows which may be
@@ -734,4 +744,77 @@ double MatrixCSR<U, V, W, X>::squared_norm() const
 }
 //-----------------------------------------------------------------------------
 
+// The matrix A is distributed across P  processes by blocks of rows:
+//  A = |   A_0  |
+//      |   A_1  |
+//      |   ...  |
+//      |  A_P-1 |
+//
+// Each submatrix A_i is owned by a single process "i" and can be further
+// decomposed into diagonal (Ai[0]) and off diagonal (Ai[1]) blocks:
+//  Ai = |Ai[0] Ai[1]|
+//
+// If A is square, the diagonal block Ai[0] is also square and contains
+// only owned columns and rows. The block Ai[1] contains ghost columns
+// (unowned dofs).
+
+// Likewise, a local vector x can be decomposed into owned and ghost blocks:
+// xi = |   x[0]  |
+//      |   x[1]  |
+//
+// So the product y = Ax can be computed into two separate steps:
+//  y[0] = |Ai[0] Ai[1]| |   x[0]  | = Ai[0] x[0] + Ai[1] x[1]
+//                       |   x[1]  |
+//
+/// Computes y += A*x for a parallel CSR matrix A and parallel dense vectors x,y
+template <typename Scalar, typename V, typename W, typename X>
+void MatrixCSR<Scalar, V, W, X>::mult(la::Vector<Scalar>& x,
+                                      la::Vector<Scalar>& y)
+{
+  // start communication (update ghosts)
+  x.scatter_fwd_begin();
+
+  const std::int32_t nrowslocal = num_owned_rows();
+  std::span<const std::int64_t> Arow_ptr(row_ptr().data(), nrowslocal + 1);
+  std::span<const std::int32_t> Acols(cols().data(), Arow_ptr[nrowslocal]);
+  std::span<const std::int64_t> Aoff_diag_offset(off_diag_offset().data(),
+                                                 nrowslocal);
+  std::span<const Scalar> Avalues(values().data(), Arow_ptr[nrowslocal]);
+
+  std::span<const Scalar> _x = x.array();
+  std::span<Scalar> _y = y.mutable_array();
+
+  std::span<const std::int64_t> Arow_begin(Arow_ptr.data(), nrowslocal);
+  std::span<const std::int64_t> Arow_end(Arow_ptr.data() + 1, nrowslocal);
+
+  // First stage:  spmv - diagonal
+  // yi[0] += Ai[0] * xi[0]
+  if (_bs[1] == 1)
+  {
+    impl::spmv<Scalar, 1>(Avalues, Arow_begin, Aoff_diag_offset, Acols, _x, _y,
+                          _bs[0], 1);
+  }
+  else
+  {
+    impl::spmv<Scalar, -1>(Avalues, Arow_begin, Aoff_diag_offset, Acols, _x, _y,
+                           _bs[0], _bs[1]);
+  }
+
+  // finalize ghost update
+  x.scatter_fwd_end();
+
+  // Second stage:  spmv - off-diagonal
+  // yi[0] += Ai[1] * xi[1]
+  if (_bs[1] == 1)
+  {
+    impl::spmv<Scalar, 1>(Avalues, Aoff_diag_offset, Arow_end, Acols, _x, _y,
+                          _bs[0], 1);
+  }
+  else
+  {
+    impl::spmv<Scalar, -1>(Avalues, Aoff_diag_offset, Arow_end, Acols, _x, _y,
+                           _bs[0], _bs[1]);
+  }
+}
+
 } // namespace dolfinx::la
diff --git a/cpp/dolfinx/la/matrix_csr_impl.h b/cpp/dolfinx/la/matrix_csr_impl.h
index 4da90cb046f..8a218a130e4 100644
--- a/cpp/dolfinx/la/matrix_csr_impl.h
+++ b/cpp/dolfinx/la/matrix_csr_impl.h
@@ -100,14 +100,12 @@ void insert_nonblocked_csr(U&& data, const V& cols, const W& row_ptr,
                            const X& x, const Y& xrows, const Y& xcols, OP op,
                            typename Y::value_type num_rows, int bs0, int bs1);
 
-} // namespace impl
-
 //-----------------------------------------------------------------------------
 template <int BS0, int BS1, typename OP, typename U, typename V, typename W,
           typename X, typename Y>
-void impl::insert_csr(U&& data, const V& cols, const W& row_ptr, const X& x,
-                      const Y& xrows, const Y& xcols, OP op,
-                      [[maybe_unused]] typename Y::value_type num_rows)
+void insert_csr(U&& data, const V& cols, const W& row_ptr, const X& x,
+                const Y& xrows, const Y& xcols, OP op,
+                [[maybe_unused]] typename Y::value_type num_rows)
 {
   const std::size_t nc = xcols.size();
   assert(x.size() == xrows.size() * xcols.size() * BS0 * BS1);
@@ -151,9 +149,9 @@ void impl::insert_csr(U&& data, const V& cols, const W& row_ptr, const X& x,
 // Insert with block insertion into a regular CSR (block size 1)
 template <int BS0, int BS1, typename OP, typename U, typename V, typename W,
           typename X, typename Y>
-void impl::insert_blocked_csr(U&& data, const V& cols, const W& row_ptr,
-                              const X& x, const Y& xrows, const Y& xcols, OP op,
-                              [[maybe_unused]] typename Y::value_type num_rows)
+void insert_blocked_csr(U&& data, const V& cols, const W& row_ptr, const X& x,
+                        const Y& xrows, const Y& xcols, OP op,
+                        [[maybe_unused]] typename Y::value_type num_rows)
 {
   const std::size_t nc = xcols.size();
   assert(x.size() == xrows.size() * xcols.size() * BS0 * BS1);
@@ -195,12 +193,10 @@ void impl::insert_blocked_csr(U&& data, const V& cols, const W& row_ptr,
 // Add individual entries in block-CSR storage
 template <typename OP, typename U, typename V, typename W, typename X,
           typename Y>
-void impl::insert_nonblocked_csr(U&& data, const V& cols, const W& row_ptr,
-                                 const X& x, const Y& xrows, const Y& xcols,
-                                 OP op,
-                                 [[maybe_unused]]
-                                 typename Y::value_type num_rows,
-                                 int bs0, int bs1)
+void insert_nonblocked_csr(U&& data, const V& cols, const W& row_ptr,
+                           const X& x, const Y& xrows, const Y& xcols, OP op,
+                           [[maybe_unused]] typename Y::value_type num_rows,
+                           int bs0, int bs1)
 {
   const std::size_t nc = xcols.size();
   const int nbs = bs0 * bs1;
@@ -237,4 +233,44 @@ void impl::insert_nonblocked_csr(U&& data, const V& cols, const W& row_ptr,
   }
 }
 //-----------------------------------------------------------------------------
+
+template <typename T, int BS1>
+void spmv(std::span<const T> values, std::span<const std::int64_t> row_begin,
+          std::span<const std::int64_t> row_end,
+          std::span<const std::int32_t> indices, std::span<const T> x,
+          std::span<T> y, int bs0, int bs1)
+{
+  assert(row_begin.size() == row_end.size());
+
+  for (int k0 = 0; k0 < bs0; ++k0)
+  {
+    for (std::size_t i = 0; i < row_begin.size(); i++)
+    {
+      T vi{0};
+      for (std::int32_t j = row_begin[i]; j < row_end[i]; j++)
+      {
+        if constexpr (BS1 == -1)
+        {
+          for (int k1 = 0; k1 < bs1; ++k1)
+          {
+            vi += values[j * bs1 * bs0 + k1 * bs0 + k0]
+                  * x[indices[j] * bs1 + k1];
+          }
+        }
+        else
+        {
+          for (int k1 = 0; k1 < BS1; ++k1)
+          {
+            vi += values[j * BS1 * bs0 + k1 * bs0 + k0]
+                  * x[indices[j] * BS1 + k1];
+          }
+        }
+      }
+
+      y[i * bs0 + k0] += vi;
+    }
+  }
+}
+
+} // namespace impl
 } // namespace dolfinx::la
diff --git a/cpp/test/CMakeLists.txt b/cpp/test/CMakeLists.txt
index ada545792a1..e5b9a8a37ab 100644
--- a/cpp/test/CMakeLists.txt
+++ b/cpp/test/CMakeLists.txt
@@ -51,11 +51,13 @@ add_executable(
   ${CMAKE_CURRENT_BINARY_DIR}/poisson.c
 )
 target_link_libraries(unittests PRIVATE Catch2::Catch2WithMain dolfinx)
+
 # UUID requires bcrypt to be linked on Windows, broken in vcpkg.
 # https://github.com/microsoft/vcpkg/issues/4481
 if(WIN32)
   target_link_libraries(unittests PRIVATE bcrypt)
 endif()
+
 target_include_directories(
   unittests PRIVATE $<BUILD_INTERFACE:${CMAKE_CURRENT_BINARY_DIR}>
 )
@@ -71,7 +73,7 @@ endif()
 target_compile_options(
   unittests
   PRIVATE
-    $<$<AND:$<CONFIG:Developer>,$<COMPILE_LANGUAGE:CXX>>:${DOLFINX_CXX_DEVELOPER_FLAGS}>
+  $<$<AND:$<CONFIG:Developer>,$<COMPILE_LANGUAGE:CXX>>:${DOLFINX_CXX_DEVELOPER_FLAGS}>
 )
 
 # Enable testing
diff --git a/cpp/test/matrix.cpp b/cpp/test/matrix.cpp
index 9741111964c..ce0f3644fff 100644
--- a/cpp/test/matrix.cpp
+++ b/cpp/test/matrix.cpp
@@ -23,88 +23,6 @@ using namespace dolfinx;
 
 namespace
 {
-/// Computes y += A*x for a local CSR matrix A and local dense vectors x,y
-/// @param[in] values Nonzero values of A
-/// @param[in] row_begin First index of each row in the arrays values and
-/// indices.
-/// @param[in] row_end Last index of each row in the arrays values and indices.
-/// @param[in] indices Column indices for each non-zero element of the matrix A
-/// @param[in] x Input vector
-/// @param[in, out] x Output vector
-template <typename T>
-void spmv_impl(std::span<const T> values,
-               std::span<const std::int64_t> row_begin,
-               std::span<const std::int64_t> row_end,
-               std::span<const std::int32_t> indices, std::span<const T> x,
-               std::span<T> y)
-{
-  assert(row_begin.size() == row_end.size());
-  for (std::size_t i = 0; i < row_begin.size(); i++)
-  {
-    double vi{0};
-    for (std::int32_t j = row_begin[i]; j < row_end[i]; j++)
-      vi += values[j] * x[indices[j]];
-    y[i] += vi;
-  }
-}
-
-// The matrix A is distributed across P  processes by blocks of rows:
-//  A = |   A_0  |
-//      |   A_1  |
-//      |   ...  |
-//      |  A_P-1 |
-//
-// Each submatrix A_i is owned by a single process "i" and can be further
-// decomposed into diagonal (Ai[0]) and off diagonal (Ai[1]) blocks:
-//  Ai = |Ai[0] Ai[1]|
-//
-// If A is square, the diagonal block Ai[0] is also square and contains
-// only owned columns and rows. The block Ai[1] contains ghost columns
-// (unowned dofs).
-
-// Likewise, a local vector x can be decomposed into owned and ghost blocks:
-// xi = |   x[0]  |
-//      |   x[1]  |
-//
-// So the product y = Ax can be computed into two separate steps:
-//  y[0] = |Ai[0] Ai[1]| |   x[0]  | = Ai[0] x[0] + Ai[1] x[1]
-//                       |   x[1]  |
-//
-/// Computes y += A*x for a parallel CSR matrix A and parallel dense vectors x,y
-/// @param[in] A Parallel CSR matrix
-/// @param[in] x Input vector
-/// @param[in, out] y Output vector
-template <typename T>
-void spmv(la::MatrixCSR<T>& A, la::Vector<T>& x, la::Vector<T>& y)
-{
-  // start communication (update ghosts)
-  x.scatter_fwd_begin();
-
-  const std::int32_t nrowslocal = A.num_owned_rows();
-  std::span<const std::int64_t> row_ptr(A.row_ptr().data(), nrowslocal + 1);
-  std::span<const std::int32_t> cols(A.cols().data(), row_ptr[nrowslocal]);
-  std::span<const std::int64_t> off_diag_offset(A.off_diag_offset().data(),
-                                                nrowslocal);
-  std::span<const T> values(A.values().data(), row_ptr[nrowslocal]);
-
-  std::span<const T> _x = x.array();
-  std::span<T> _y = y.mutable_array();
-
-  std::span<const std::int64_t> row_begin(row_ptr.data(), nrowslocal);
-  std::span<const std::int64_t> row_end(row_ptr.data() + 1, nrowslocal);
-
-  // First stage:  spmv - diagonal
-  // yi[0] += Ai[0] * xi[0]
-  spmv_impl<T>(values, row_begin, off_diag_offset, cols, _x, _y);
-
-  // finalize ghost update
-  x.scatter_fwd_end();
-
-  // Second stage:  spmv - off-diagonal
-  // yi[0] += Ai[1] * xi[1]
-  spmv_impl<T>(values, off_diag_offset, row_end, cols, _x, _y);
-}
-
 /// @brief Create a matrix operator
 /// @param comm The communicator to builf the matrix on
 /// @return The assembled matrix
@@ -195,7 +113,7 @@ la::MatrixCSR<double> create_operator(MPI_Comm comm)
 
   // Matrix A represents the action of the Laplace operator, so when
   // applied to a constant vector the result should be zero
-  spmv(A, x, y);
+  A.mult(x, y);
 
   std::ranges::for_each(y.array(),
                         [](auto a) { REQUIRE(std::abs(a) < 1e-13); });
diff --git a/python/dolfinx/la.py b/python/dolfinx/la.py
index 49ce24a6a5a..23c5fba198c 100644
--- a/python/dolfinx/la.py
+++ b/python/dolfinx/la.py
@@ -27,6 +27,90 @@
 ]
 
 
+class Vector:
+    _cpp_object: typing.Union[
+        _cpp.la.Vector_float32,
+        _cpp.la.Vector_float64,
+        _cpp.la.Vector_complex64,
+        _cpp.la.Vector_complex128,
+        _cpp.la.Vector_int8,
+        _cpp.la.Vector_int32,
+        _cpp.la.Vector_int64,
+    ]
+
+    def __init__(
+        self,
+        x: typing.Union[
+            _cpp.la.Vector_float32,
+            _cpp.la.Vector_float64,
+            _cpp.la.Vector_complex64,
+            _cpp.la.Vector_complex128,
+            _cpp.la.Vector_int8,
+            _cpp.la.Vector_int32,
+            _cpp.la.Vector_int64,
+        ],
+    ):
+        """A distributed vector object.
+
+        Args:
+            x: C++ Vector object.
+
+        Note:
+            This initialiser is intended for internal library use only.
+            User code should call :func:`vector` to create a vector object.
+        """
+        self._cpp_object = x
+        self._petsc_x = None
+
+    def __del__(self):
+        if self._petsc_x is not None:
+            self._petsc_x.destroy()
+
+    @property
+    def index_map(self) -> IndexMap:
+        """Index map that describes size and parallel distribution."""
+        return self._cpp_object.index_map
+
+    @property
+    def block_size(self) -> int:
+        """Block size for the vector."""
+        return self._cpp_object.bs
+
+    @property
+    def array(self) -> np.ndarray:
+        """Local representation of the vector."""
+        return self._cpp_object.array
+
+    @property
+    def petsc_vec(self):
+        """PETSc vector holding the entries of the vector.
+
+        Upon first call, this function creates a PETSc ``Vec`` object
+        that wraps the degree-of-freedom data. The ``Vec`` object is
+        cached and the cached ``Vec`` is returned upon subsequent calls.
+
+        Note:
+          When the object is destroyed it will destroy the underlying petsc4py
+          vector automatically.
+        """
+        if self._petsc_x is None:
+            self._petsc_x = create_petsc_vector_wrap(self)
+        return self._petsc_x
+
+    def scatter_forward(self) -> None:
+        """Update ghost entries."""
+        self._cpp_object.scatter_forward()
+
+    def scatter_reverse(self, mode: InsertMode) -> None:
+        """Scatter ghost entries to owner.
+
+        Args:
+            mode: Control how scattered values are set/accumulated by
+                owner.
+        """
+        self._cpp_object.scatter_reverse(mode)
+
+
 class MatrixCSR:
     _cpp_object: typing.Union[
         _cpp.la.MatrixCSR_float32,
@@ -63,8 +147,17 @@ def index_map(self, i: int) -> IndexMap:
         """
         return self._cpp_object.index_map(i)
 
+    def mult(self, x: Vector, y: Vector) -> None:
+        """Compute ``y += Ax``.
+
+        Args:
+            x: Input Vector
+            y: Output Vector
+        """
+        self._cpp_object.mult(x._cpp_object, y._cpp_object)
+
     @property
-    def block_size(self):
+    def block_size(self) -> list:
         """Block sizes for the matrix."""
         return self._cpp_object.bs
 
@@ -128,11 +221,10 @@ def to_dense(self) -> npt.NDArray[np.floating]:
 
         Note:
             Typically used for debugging.
-
         """
         return self._cpp_object.to_dense()
 
-    def to_scipy(self, ghosted=False):
+    def to_scipy(self, ghosted: bool = False):
         """Convert to a SciPy CSR/BSR matrix. Data is shared.
 
         Note:
@@ -202,90 +294,6 @@ def matrix_csr(
     return MatrixCSR(ftype(sp, block_mode))
 
 
-class Vector:
-    _cpp_object: typing.Union[
-        _cpp.la.Vector_float32,
-        _cpp.la.Vector_float64,
-        _cpp.la.Vector_complex64,
-        _cpp.la.Vector_complex128,
-        _cpp.la.Vector_int8,
-        _cpp.la.Vector_int32,
-        _cpp.la.Vector_int64,
-    ]
-
-    def __init__(
-        self,
-        x: typing.Union[
-            _cpp.la.Vector_float32,
-            _cpp.la.Vector_float64,
-            _cpp.la.Vector_complex64,
-            _cpp.la.Vector_complex128,
-            _cpp.la.Vector_int8,
-            _cpp.la.Vector_int32,
-            _cpp.la.Vector_int64,
-        ],
-    ):
-        """A distributed vector object.
-
-        Args:
-            x: C++ Vector object.
-
-        Note:
-            This initialiser is intended for internal library use only.
-            User code should call :func:`vector` to create a vector object.
-        """
-        self._cpp_object = x
-        self._petsc_x = None
-
-    def __del__(self):
-        if self._petsc_x is not None:
-            self._petsc_x.destroy()
-
-    @property
-    def index_map(self) -> IndexMap:
-        """Index map that describes size and parallel distribution."""
-        return self._cpp_object.index_map
-
-    @property
-    def block_size(self) -> int:
-        """Block size for the vector."""
-        return self._cpp_object.bs
-
-    @property
-    def array(self) -> np.ndarray:
-        """Local representation of the vector."""
-        return self._cpp_object.array
-
-    @property
-    def petsc_vec(self):
-        """PETSc vector holding the entries of the vector.
-
-        Upon first call, this function creates a PETSc ``Vec`` object
-        that wraps the degree-of-freedom data. The ``Vec`` object is
-        cached and the cached ``Vec`` is returned upon subsequent calls.
-
-        Note:
-          When the object is destroyed it will destroy the underlying petsc4py
-          vector automatically.
-        """
-        if self._petsc_x is None:
-            self._petsc_x = create_petsc_vector_wrap(self)
-        return self._petsc_x
-
-    def scatter_forward(self) -> None:
-        """Update ghost entries."""
-        self._cpp_object.scatter_forward()
-
-    def scatter_reverse(self, mode: InsertMode) -> None:
-        """Scatter ghost entries to owner.
-
-        Args:
-            mode: Control how scattered values are set/accumulated by
-                owner.
-        """
-        self._cpp_object.scatter_reverse(mode)
-
-
 def vector(map, bs=1, dtype: npt.DTypeLike = np.float64) -> Vector:
     """Create a distributed vector.
 
@@ -332,7 +340,6 @@ def create_petsc_vector_wrap(x: Vector):
 
     Returns:
         A PETSc vector that shares data with ``x``.
-
     """
     from petsc4py import PETSc
 
diff --git a/python/dolfinx/wrappers/la.cpp b/python/dolfinx/wrappers/la.cpp
index 35fece54300..b04209c277b 100644
--- a/python/dolfinx/wrappers/la.cpp
+++ b/python/dolfinx/wrappers/la.cpp
@@ -132,6 +132,7 @@ void declare_objects(nb::module_& m, const std::string& type)
                &dolfinx::la::MatrixCSR<T>::set),
            nb::arg("x"))
       .def("scatter_reverse", &dolfinx::la::MatrixCSR<T>::scatter_rev)
+      .def("mult", &dolfinx::la::MatrixCSR<T>::mult)
       .def("to_dense",
            [](const dolfinx::la::MatrixCSR<T>& self)
            {
diff --git a/python/test/unit/fem/test_discrete_operators.py b/python/test/unit/fem/test_discrete_operators.py
index 16f7283f6ce..eab5d82dd26 100644
--- a/python/test/unit/fem/test_discrete_operators.py
+++ b/python/test/unit/fem/test_discrete_operators.py
@@ -233,9 +233,10 @@ def test_interpolation_matrix(cell_type, p, q, from_lagrange):
 
     V = functionspace(mesh, el0)
     W = functionspace(mesh, el1)
-    G = interpolation_matrix(V, W).to_scipy()
+    G = interpolation_matrix(V, W)
 
-    u = Function(V)
+    uvec = dolfinx.la.vector(G.index_map(1), bs=G.block_size[1])
+    u = Function(V, uvec)
 
     def f(x):
         if mesh.geometry.dim == 2:
@@ -249,10 +250,10 @@ def f(x):
 
     # Compute global matrix vector product
     w = Function(W)
-    ux = np.zeros(G.shape[1])
-    ux[: len(u.x.array)] = u.x.array[:]
-    w.x.array[: G.shape[0]] = G @ ux
+    w.x.array[:] = 0
+    G.mult(u.x, w.x)
     w.x.scatter_forward()
 
     atol = 100 * np.finfo(default_real_type).resolution
+
     assert np.allclose(w_vec.x.array, w.x.array, atol=atol)
diff --git a/python/test/unit/la/test_matrix_vector.py b/python/test/unit/la/test_matrix_vector.py
index 7114d3af70a..c917d2a8ffe 100644
--- a/python/test/unit/la/test_matrix_vector.py
+++ b/python/test/unit/la/test_matrix_vector.py
@@ -44,6 +44,38 @@ def test_create_matrix_csr():
     assert np.allclose(dense, np.zeros(dense.shape, dtype=np.complex128))
 
 
+@pytest.mark.parametrize(
+    "dtype",
+    [
+        np.float32,
+        np.float64,
+        np.complex64,
+        np.complex128,
+    ],
+)
+def test_matvec(dtype):
+    mesh = create_unit_square(MPI.COMM_WORLD, 3, 3)
+    imap = mesh.topology.index_map(0)
+    sp = _cpp.la.SparsityPattern(mesh.comm, [imap, imap], [1, 1])
+    rows = np.arange(0, imap.size_local)
+    cols = np.arange(0, imap.size_local)
+    sp.insert(rows, cols)
+    sp.finalize()
+
+    # Identity
+    A = la.matrix_csr(sp, dtype=dtype)
+    for i in range(imap.size_local):
+        A.add([2.0], [i], [i])
+    A.scatter_reverse()
+
+    b = la.vector(imap, dtype=dtype)
+    u = la.vector(imap, dtype=dtype)
+    b.array[:] = 1.0
+    A.mult(b, u)
+    u.scatter_forward()
+    assert np.allclose(u.array[: imap.size_local], 2.0)
+
+
 @pytest.mark.parametrize(
     "dtype",
     [

From 56d549b22a7174c5ad505a6ce5f17f9c94047258 Mon Sep 17 00:00:00 2001
From: "Garth N. Wells" <gnw20@cam.ac.uk>
Date: Sat, 11 Jan 2025 22:11:20 +0000
Subject: [PATCH 4/9] Add discrete curl matrix operator (#3587)

* Start

* Work on discrete curl

* Fix some bugs

* Work in curl interpolation

* Ruff

* Update test

* Doc fix

* Updates

* Skip test in parallel for now

* Test update

* Ruff format

* Bug fix

* Debugging

* Get test working

* Tidy up

* Work on docs

* Tidy up

* Update cpp/dolfinx/fem/discreteoperators.h

Co-authored-by: Nate <34454754+nate-sime@users.noreply.github.com>

* Simplify discrete curl

* Ruff format

* Simplifications

* Fixes

* Update docs

* Extent tests

* Wrapper fixes

* Span updates

* Work on extents

* Fix extents

* Updates

* Extent updates

* Tidy up

* Update docs

---------

Co-authored-by: Nate <34454754+nate-sime@users.noreply.github.com>
---
 cpp/dolfinx/fem/CoordinateElement.h           |   4 +-
 cpp/dolfinx/fem/discreteoperators.h           | 379 ++++++++++++++----
 cpp/dolfinx/fem/interpolate.h                 | 274 ++++++-------
 python/dolfinx/fem/__init__.py                |  47 ++-
 python/dolfinx/fem/petsc.py                   |  15 +
 python/dolfinx/wrappers/assemble.cpp          |  14 +
 python/dolfinx/wrappers/petsc.cpp             |  49 ++-
 .../test/unit/fem/test_discrete_operators.py  | 176 ++++++--
 8 files changed, 680 insertions(+), 278 deletions(-)

diff --git a/cpp/dolfinx/fem/CoordinateElement.h b/cpp/dolfinx/fem/CoordinateElement.h
index 01c6428ee24..fdce4bc3306 100644
--- a/cpp/dolfinx/fem/CoordinateElement.h
+++ b/cpp/dolfinx/fem/CoordinateElement.h
@@ -78,7 +78,9 @@ class CoordinateElement
   /// compute. Use 0 for the basis functions only
   /// @param[in] num_points Number of points at which to evaluate the
   /// basis functions.
-  /// @return Shape of the array to be filled by `tabulate`.
+  /// @return Shape of the array to be filled by `tabulate`, where (0)
+  /// is derivative index, (1) is the point index, (2) is the basis
+  /// function index and (3) is the basis function component.
   std::array<std::size_t, 4> tabulate_shape(std::size_t nd,
                                             std::size_t num_points) const;
 
diff --git a/cpp/dolfinx/fem/discreteoperators.h b/cpp/dolfinx/fem/discreteoperators.h
index 87740f60d27..1161e7c6c71 100644
--- a/cpp/dolfinx/fem/discreteoperators.h
+++ b/cpp/dolfinx/fem/discreteoperators.h
@@ -1,4 +1,4 @@
-// Copyright (C) 2015-2022 Garth N. Wells, Jørgen S. Dokken
+// Copyright (C) 2015-2025 Garth N. Wells, Jørgen S. Dokken
 //
 // This file is part of DOLFINx (https://www.fenicsproject.org)
 //
@@ -14,6 +14,7 @@
 #include <concepts>
 #include <dolfinx/common/IndexMap.h>
 #include <dolfinx/common/math.h>
+#include <dolfinx/la/utils.h>
 #include <dolfinx/mesh/Mesh.h>
 #include <memory>
 #include <span>
@@ -22,6 +23,268 @@
 namespace dolfinx::fem
 {
 
+/// @brief Assemble a discrete curl operator.
+///
+/// For vector-valued finite functions \f$u \in V_{0} \f$ and \f$v \in
+/// V_{1}\f$, consider the interpolation of the curl of \f$u\f$ in the
+/// space \f$V_{1}\f$, i.e. \f$\Pi_{V_{1}}: \nabla \times u \rightarrow
+/// v\f$, where \f$\Pi_{V_{1}}\f$ is the interpolation operator
+/// associated with \f$V_{1}\f$. This interpolation of \f$\nabla \times
+/// u\f$ into \f$V_{1}\f$ is properly posed and exact for specific
+/// choices of function spaces. If \f$V_{0}\f$ is a Nédélec (\f$H({\rm
+/// curl})\f$) space of degree \f$k > 1\f$ and \f$V_{1}\f$ is a
+/// Raviart-Thomas (\f$H({\rm div})\f$) space of degree of at least \f$k
+/// - 1\f$, then the interpolation is exact.
+///
+/// The implementation of this function exploits the result:
+///
+/// \f[
+///   \hat{\nabla} \times \psi_{C}(\boldsymbol{u}) = \psi_{D}(\nabla \times
+///   \boldsymbol{u}),
+/// \f]
+///
+/// where \f$\psi_{C}\f$ is the covariant pull-back (to the reference
+/// cell) and \f$\psi_{D}\f$ is the contravariant pull-back. See Ern and
+/// Guermond (2021), *Finite Elements I*, Springer Nature,
+/// https://doi.org/10.1007/978-3-030-56341-7 [Corollary 9.9 (Commuting
+/// properties)]. Consequently, the spaces `V0` and `V1` must use
+/// covariant and contravariant maps, respectively.
+///
+/// This function builds a matrix \f$C\f$ (the 'discrete curl'), which
+/// when applied to the degrees-of-freedom of \f$u\f$ gives the
+/// degrees-of-freedom of \f$v\f$ such that \f$v = \nabla \times u\f$.
+/// If the finite element degree-of-freedom vectors associated with
+/// \f$u\f$ and \f$v\f$ are \f$a\f$ and \f$b\f$, respectively, then \f$b
+/// = C a\f$, which yields \f$v = \Pi_{V} \nabla \times u\f$. It
+/// essentially maps that curl of a function in a degree \f$k > 1\f$
+/// Nédélec space into a degree \f$k - 1\f$ Raviart-Thomas space.
+///
+/// The discerete curl is typically used in constructing algebraic
+/// multigrid preconditioners for \f$H({\rm div})\f$ problems, e.g. when
+/// using the Hypre Auxiliary-space Divergence Solver (ADS) to solve a
+/// mixed Poisson in three-dimension.
+///
+/// @pre `V0` and `V1` must be vector-valued, in three spatial
+/// dimensions, and use covariant and contravariant maps, respectively.
+///
+/// @tparam T Scalar type of the mesh and elements. @tparam U Scalar
+/// type of the matrix being inserted into. This is usually the same as
+/// `T`, but may differ for matrix backends that support only a specific
+/// type, e.g. PETSc which supports only one scalar type for a build of
+/// PETSc.
+///
+/// @param[in] V0 Space that \f$u\f$ is from. It must be a covariant
+/// Piola mapped element. It is normally an \f$H({\rm
+/// curl})\f$-conforming Nédélec space.
+/// @param[in] V1 Space that \f$v\f$ is from. It must be a contravariant
+/// Piola mapped element. It is normally an \f$H({\rm
+/// div})\f$-conforming Raviart-Thomas space of one degree lower than
+/// `V0`.
+/// @param[in] mat_set A functor that sets (not add) values in a matrix
+/// \f$C\f$.
+template <std::floating_point T, dolfinx::scalar U = T>
+void discrete_curl(const FunctionSpace<T>& V0, const FunctionSpace<T>& V1,
+                   la::MatSet<U> auto&& mat_set)
+{
+  namespace md = MDSPAN_IMPL_STANDARD_NAMESPACE;
+
+  // Get mesh
+  auto mesh = V0.mesh();
+  assert(mesh);
+  assert(V1.mesh());
+  if (mesh != V1.mesh())
+    throw std::runtime_error("Meshes must be the same.");
+
+  if (mesh->geometry().dim() != 3)
+    throw std::runtime_error("Geometric must be equal to 3..");
+  if (mesh->geometry().dim() != mesh->topology()->dim())
+  {
+    throw std::runtime_error(
+        "Geometric and topological dimensions must be equal.");
+  }
+  constexpr int gdim = 3;
+
+  // Get elements
+  std::shared_ptr<const FiniteElement<T>> e0 = V0.element();
+  assert(e0);
+  if (e0->map_type() != basix::maps::type::covariantPiola)
+  {
+    throw std::runtime_error(
+        "Finite element for parent space must be covariant Piola.");
+  }
+
+  std::shared_ptr<const FiniteElement<T>> e1 = V1.element();
+  assert(e1);
+  if (e1->map_type() != basix::maps::type::contravariantPiola)
+  {
+    throw std::runtime_error(
+        "Finite element for target space must be contracovariant Piola.");
+  }
+
+  // Get cell orientation information
+  std::span<const std::uint32_t> cell_info;
+  if (e1->needs_dof_transformations() or e0->needs_dof_transformations())
+  {
+    mesh->topology_mutable()->create_entity_permutations();
+    cell_info = std::span(mesh->topology()->get_cell_permutation_info());
+  }
+
+  // Get dofmaps
+  auto dofmap0 = V0.dofmap();
+  assert(dofmap0);
+  auto dofmap1 = V1.dofmap();
+  assert(dofmap1);
+
+  // Get dof transformation operators
+  auto apply_dof_transformation0
+      = e0->template dof_transformation_fn<T>(doftransform::standard, false);
+  auto apply_inverse_dof_transform1 = e1->template dof_transformation_fn<U>(
+      doftransform::inverse_transpose, false);
+
+  // Get sizes of elements
+  const std::size_t space_dim0 = e0->space_dimension();
+  const std::size_t space_dim1 = e1->space_dimension();
+  if (e0->reference_value_size() != 3)
+    throw std::runtime_error("Value size for parent space should be 3.");
+  if (e1->reference_value_size() != 3)
+    throw std::runtime_error("Value size for target space should be 3.");
+
+  // Get the V1 reference interpolation points
+  const auto [X, Xshape] = e1->interpolation_points();
+
+  // Get/compute geometry map and evaluate at interpolation points
+  const CoordinateElement<T>& cmap = mesh->geometry().cmap();
+  auto x_dofmap = mesh->geometry().dofmap();
+  const std::size_t num_dofs_g = cmap.dim();
+  std::span<const T> x_g = mesh->geometry().x();
+  std::array<std::size_t, 4> Phi_g_shape = cmap.tabulate_shape(1, Xshape[0]);
+  std::vector<T> Phi_g_b(std::reduce(Phi_g_shape.begin(), Phi_g_shape.end(), 1,
+                                     std::multiplies{}));
+  // (derivative,  point index, phi, component)
+  md::mdspan<const T, md::extents<std::size_t, gdim + 1, md::dynamic_extent,
+                                  md::dynamic_extent, 1>>
+      Phi_g(Phi_g_b.data(), Phi_g_shape);
+  cmap.tabulate(1, X, Xshape, Phi_g_b);
+
+  // Geometry data structures
+  std::vector<T> coord_dofs_b(num_dofs_g * gdim);
+  md::mdspan<T, md::extents<std::size_t, md::dynamic_extent, 3>> coord_dofs(
+      coord_dofs_b.data(), num_dofs_g, gdim);
+  std::vector<T> J_b(Xshape[0] * gdim * gdim);
+  md::mdspan<T, md::extents<std::size_t, md::dynamic_extent, 3, 3>> J(
+      J_b.data(), Xshape[0], gdim, gdim);
+  std::vector<T> K_b(Xshape[0] * gdim * gdim);
+  md::mdspan<T, typename decltype(J)::extents_type> K(K_b.data(), J.extents());
+  std::vector<T> detJ(Xshape[0]);
+  std::vector<T> det_scratch(2 * gdim * gdim);
+
+  // Evaluate V0 basis function derivatives at reference interpolation
+  // points for V1, (deriv, pt_idx, phi (dof), comp)
+  const auto [Phi0_b, Phi0_shape] = e0->tabulate(X, Xshape, 1);
+  md::mdspan<const T, md::extents<std::size_t, 4, md::dynamic_extent,
+                                  md::dynamic_extent, 3>>
+      Phi0(Phi0_b.data(), Phi0_shape);
+
+  // Create working arrays, (point, phi (dof), deriv, comp)
+  md::extents<std::size_t, md::dynamic_extent, md::dynamic_extent, 3, 3>
+      dPhi0_ext(Phi0.extent(1), Phi0.extent(2), Phi0.extent(0) - 1,
+                Phi0.extent(3));
+  std::vector<T> dPhi0_b(dPhi0_ext.extent(0) * dPhi0_ext.extent(1)
+                         * dPhi0_ext.extent(2) * dPhi0_ext.extent(3));
+  md::mdspan<T, decltype(dPhi0_ext)> dPhi0(dPhi0_b.data(), dPhi0_ext);
+
+  // Get the interpolation operator (matrix) Pi that maps a function
+  // evaluated at the interpolation points to the V1 element degrees of
+  // freedom, i.e. dofs = Pi f_x
+  const auto [Pi1_b, pi_shape] = e1->interpolation_operator();
+  md::mdspan<const T, md::dextents<std::size_t, 2>> Pi_1(Pi1_b.data(),
+                                                         pi_shape);
+
+  // curl data structure, (pt_idx, phi (dof), comp)
+  std::vector<T> curl_b(dPhi0.extent(0) * dPhi0.extent(1) * dPhi0.extent(3));
+  md::mdspan<
+      T, md::extents<std::size_t, md::dynamic_extent, md::dynamic_extent, 3>>
+      curl(curl_b.data(), dPhi0.extent(0), dPhi0.extent(1), dPhi0.extent(3));
+
+  std::vector<U> Ab(space_dim0 * space_dim1);
+  std::vector<U> local1(space_dim1);
+
+  // Iterate over mesh and interpolate on each cell
+  assert(mesh->topology()->index_map(gdim));
+  for (std::int32_t c = 0; c < mesh->topology()->index_map(gdim)->size_local();
+       ++c)
+  {
+    // Get cell geometry (coordinate dofs)
+    auto x_dofs = md::submdspan(x_dofmap, c, md::full_extent);
+    for (std::size_t i = 0; i < x_dofs.size(); ++i)
+      for (std::size_t j = 0; j < gdim; ++j)
+        coord_dofs(i, j) = x_g[3 * x_dofs[i] + j];
+
+    // Compute Jacobians at reference points for current cell
+    std::ranges::fill(J_b, 0);
+    for (std::size_t p = 0; p < Xshape[0]; ++p)
+    {
+      auto dPhi_g
+          = md::submdspan(Phi_g, std::pair(1, gdim + 1), p, md::full_extent, 0);
+      auto _J = md::submdspan(J, p, md::full_extent, md::full_extent);
+      cmap.compute_jacobian(dPhi_g, coord_dofs, _J);
+      auto _K = md::submdspan(K, p, md::full_extent, md::full_extent);
+      cmap.compute_jacobian_inverse(_J, _K);
+      detJ[p] = cmap.compute_jacobian_determinant(_J, det_scratch);
+    }
+
+    // TODO: re-order loops and/or re-pack Phi0 to allow a simple flat
+    // copy?
+
+    // Copy (d)Phi0 (on reference) and apply DOF transformation
+    // Phi0:  (deriv, pt_idx, phi (dof), comp)
+    // dPhi0: (pt_idx, phi (dov), deriv, comp)
+    for (std::size_t p = 0; p < Phi0.extent(1); ++p)           // point
+      for (std::size_t phi = 0; phi < Phi0.extent(2); ++phi)   // phi_i
+        for (std::size_t d = 0; d < Phi0.extent(3); ++d)       // Comp. of phi
+          for (std::size_t dx = 0; dx < dPhi0.extent(2); ++dx) // dx
+            dPhi0(p, phi, dx, d) = Phi0(dx + 1, p, phi, d);
+
+    for (std::size_t p = 0; p < dPhi0.extent(0); ++p) // point
+    {
+      // Size: num_phi * num_derivs * num_components
+      std::size_t size = dPhi0.extent(1) * dPhi0.extent(2) * dPhi0.extent(3);
+      std::size_t offset = p * size; // Offset for point p
+
+      // Shape: (num_phi , (value_size * num_derivs))
+      apply_dof_transformation0(std::span(dPhi0.data_handle() + offset, size),
+                                cell_info, c,
+                                dPhi0.extent(2) * dPhi0.extent(3));
+    }
+
+    // Compute curl
+    // dPhi0: (pt_idx, phi_idx, deriv, comp)
+    // curl: (pt_idx, phi_idx, comp)
+    for (std::size_t p = 0; p < curl.extent(0); ++p) // point
+    {
+      for (std::size_t i = 0; i < curl.extent(1); ++i) // phi_i
+      {
+        curl(p, i, 0) = dPhi0(p, i, 1, 2) - dPhi0(p, i, 2, 1);
+        curl(p, i, 1) = dPhi0(p, i, 2, 0) - dPhi0(p, i, 0, 2);
+        curl(p, i, 2) = dPhi0(p, i, 0, 1) - dPhi0(p, i, 1, 0);
+      }
+    }
+
+    // Apply interpolation matrix to basis derivatives values of V0 at
+    // the interpolation points of V1
+    for (std::size_t i = 0; i < curl.extent(1); ++i)
+    {
+      auto values = md::submdspan(curl, md::full_extent, i, md::full_extent);
+      impl::interpolation_apply(Pi_1, values, std::span(local1), 1);
+      for (std::size_t j = 0; j < local1.size(); ++j)
+        Ab[space_dim0 * j + i] = local1[j];
+    }
+
+    apply_inverse_dof_transform1(Ab, cell_info, c, space_dim0);
+    mat_set(dofmap1->cell_dofs(c), dofmap0->cell_dofs(c), Ab);
+  }
+}
+
 /// @brief Assemble a discrete gradient operator.
 ///
 /// The discrete gradient operator \f$A\f$ interpolates the gradient of
@@ -59,17 +322,16 @@ void discrete_gradient(mesh::Topology& topology,
                            V1,
                        auto&& mat_set)
 {
+  namespace md = MDSPAN_IMPL_STANDARD_NAMESPACE;
+
   auto& e0 = V0.first.get();
   const DofMap& dofmap0 = V0.second.get();
   auto& e1 = V1.first.get();
   const DofMap& dofmap1 = V1.second.get();
 
-  using cmdspan2_t = MDSPAN_IMPL_STANDARD_NAMESPACE::mdspan<
-      const U, MDSPAN_IMPL_STANDARD_NAMESPACE::dextents<std::size_t, 2>>;
-  using mdspan2_t = MDSPAN_IMPL_STANDARD_NAMESPACE::mdspan<
-      U, MDSPAN_IMPL_STANDARD_NAMESPACE::dextents<std::size_t, 2>>;
-  using cmdspan4_t = MDSPAN_IMPL_STANDARD_NAMESPACE::mdspan<
-      const U, MDSPAN_IMPL_STANDARD_NAMESPACE::dextents<std::size_t, 4>>;
+  using cmdspan2_t = md::mdspan<const U, md::dextents<std::size_t, 2>>;
+  using mdspan2_t = md::mdspan<U, md::dextents<std::size_t, 2>>;
+  using cmdspan4_t = md::mdspan<const U, md::dextents<std::size_t, 4>>;
 
   // Check elements
   if (e0.map_type() != basix::maps::type::identity)
@@ -84,18 +346,18 @@ void discrete_gradient(mesh::Topology& topology,
   if (e1.block_size() != 1)
     throw std::runtime_error("Block size is greater than 1 for V1.");
 
-  // Get V0 (H(curl)) space interpolation points
+  // Get V1 (H(curl)) space interpolation points
   const auto [X, Xshape] = e1.interpolation_points();
 
-  // Tabulate first order derivatives of Lagrange space at H(curl)
-  // interpolation points
+  // Tabulate first derivatives of Lagrange space at V1 interpolation
+  // points
   const int ndofs0 = e0.space_dimension();
   const int tdim = topology.dim();
   std::vector<U> phi0_b((tdim + 1) * Xshape[0] * ndofs0 * 1);
   cmdspan4_t phi0(phi0_b.data(), tdim + 1, Xshape[0], ndofs0, 1);
   e0.tabulate(phi0_b, X, Xshape, 1);
 
-  // Reshape lagrange basis derivatives as a matrix of shape (tdim *
+  // Reshape Lagrange basis derivatives as a matrix of shape (tdim *
   // num_points, num_dofs_per_cell)
   cmdspan2_t dphi_reshaped(
       phi0_b.data() + phi0.extent(3) * phi0.extent(2) * phi0.extent(1),
@@ -115,9 +377,8 @@ void discrete_gradient(mesh::Topology& topology,
   // Build the element interpolation matrix
   std::vector<T> Ab(e1.space_dimension() * ndofs0);
   {
-    MDSPAN_IMPL_STANDARD_NAMESPACE::mdspan<
-        T, MDSPAN_IMPL_STANDARD_NAMESPACE::dextents<std::size_t, 2>>
-        A(Ab.data(), e1.space_dimension(), ndofs0);
+    md::mdspan<T, md::dextents<std::size_t, 2>> A(Ab.data(),
+                                                  e1.space_dimension(), ndofs0);
     const auto [Pi, shape] = e1.interpolation_operator();
     cmdspan2_t _Pi(Pi.data(), shape);
     math::dot(_Pi, dphi_reshaped, A);
@@ -155,6 +416,8 @@ template <dolfinx::scalar T, std::floating_point U>
 void interpolation_matrix(const FunctionSpace<U>& V0,
                           const FunctionSpace<U>& V1, auto&& mat_set)
 {
+  namespace md = MDSPAN_IMPL_STANDARD_NAMESPACE;
+
   // Get mesh
   auto mesh = V0.mesh();
   assert(mesh);
@@ -204,16 +467,11 @@ void interpolation_matrix(const FunctionSpace<U>& V0,
   const std::size_t num_dofs_g = cmap.dim();
   std::span<const U> x_g = mesh->geometry().x();
 
-  using mdspan2_t = MDSPAN_IMPL_STANDARD_NAMESPACE::mdspan<
-      U, MDSPAN_IMPL_STANDARD_NAMESPACE::dextents<std::size_t, 2>>;
-  using cmdspan2_t = MDSPAN_IMPL_STANDARD_NAMESPACE::mdspan<
-      const U, MDSPAN_IMPL_STANDARD_NAMESPACE::dextents<std::size_t, 2>>;
-  using cmdspan3_t = MDSPAN_IMPL_STANDARD_NAMESPACE::mdspan<
-      const U, MDSPAN_IMPL_STANDARD_NAMESPACE::dextents<std::size_t, 3>>;
-  using cmdspan4_t = MDSPAN_IMPL_STANDARD_NAMESPACE::mdspan<
-      const U, MDSPAN_IMPL_STANDARD_NAMESPACE::dextents<std::size_t, 4>>;
-  using mdspan3_t = MDSPAN_IMPL_STANDARD_NAMESPACE::mdspan<
-      U, MDSPAN_IMPL_STANDARD_NAMESPACE::dextents<std::size_t, 3>>;
+  using mdspan2_t = md::mdspan<U, md::dextents<std::size_t, 2>>;
+  using cmdspan2_t = md::mdspan<const U, md::dextents<std::size_t, 2>>;
+  using cmdspan3_t = md::mdspan<const U, md::dextents<std::size_t, 3>>;
+  using cmdspan4_t = md::mdspan<const U, md::dextents<std::size_t, 4>>;
+  using mdspan3_t = md::mdspan<U, md::dextents<std::size_t, 3>>;
 
   // Evaluate coordinate map basis at reference interpolation points
   const auto [X, Xshape] = e1->interpolation_points();
@@ -254,14 +512,10 @@ void interpolation_matrix(const FunctionSpace<U>& V0,
 
   bool interpolation_ident = e1->interpolation_ident();
 
-  using u_t = MDSPAN_IMPL_STANDARD_NAMESPACE::mdspan<
-      U, MDSPAN_IMPL_STANDARD_NAMESPACE::dextents<std::size_t, 2>>;
-  using U_t = MDSPAN_IMPL_STANDARD_NAMESPACE::mdspan<
-      const U, MDSPAN_IMPL_STANDARD_NAMESPACE::dextents<std::size_t, 2>>;
-  using J_t = MDSPAN_IMPL_STANDARD_NAMESPACE::mdspan<
-      const U, MDSPAN_IMPL_STANDARD_NAMESPACE::dextents<std::size_t, 2>>;
-  using K_t = MDSPAN_IMPL_STANDARD_NAMESPACE::mdspan<
-      const U, MDSPAN_IMPL_STANDARD_NAMESPACE::dextents<std::size_t, 2>>;
+  using u_t = md::mdspan<U, md::dextents<std::size_t, 2>>;
+  using U_t = md::mdspan<const U, md::dextents<std::size_t, 2>>;
+  using J_t = md::mdspan<const U, md::dextents<std::size_t, 2>>;
+  using K_t = md::mdspan<const U, md::dextents<std::size_t, 2>>;
   auto push_forward_fn0
       = e0->basix_element().template map_fn<u_t, U_t, J_t, K_t>();
 
@@ -295,8 +549,7 @@ void interpolation_matrix(const FunctionSpace<U>& V0,
   for (std::int32_t c = 0; c < num_cells; ++c)
   {
     // Get cell geometry (coordinate dofs)
-    auto x_dofs = MDSPAN_IMPL_STANDARD_NAMESPACE::submdspan(
-        x_dofmap, c, MDSPAN_IMPL_STANDARD_NAMESPACE::full_extent);
+    auto x_dofs = md::submdspan(x_dofmap, c, md::full_extent);
     for (std::size_t i = 0; i < x_dofs.size(); ++i)
     {
       for (std::size_t j = 0; j < gdim; ++j)
@@ -307,16 +560,11 @@ void interpolation_matrix(const FunctionSpace<U>& V0,
     std::ranges::fill(J_b, 0);
     for (std::size_t p = 0; p < Xshape[0]; ++p)
     {
-      auto dphi = MDSPAN_IMPL_STANDARD_NAMESPACE::submdspan(
-          phi, std::pair(1, tdim + 1), p,
-          MDSPAN_IMPL_STANDARD_NAMESPACE::full_extent, 0);
-      auto _J = MDSPAN_IMPL_STANDARD_NAMESPACE::submdspan(
-          J, p, MDSPAN_IMPL_STANDARD_NAMESPACE::full_extent,
-          MDSPAN_IMPL_STANDARD_NAMESPACE::full_extent);
+      auto dphi
+          = md::submdspan(phi, std::pair(1, tdim + 1), p, md::full_extent, 0);
+      auto _J = md::submdspan(J, p, md::full_extent, md::full_extent);
       cmap.compute_jacobian(dphi, coord_dofs, _J);
-      auto _K = MDSPAN_IMPL_STANDARD_NAMESPACE::submdspan(
-          K, p, MDSPAN_IMPL_STANDARD_NAMESPACE::full_extent,
-          MDSPAN_IMPL_STANDARD_NAMESPACE::full_extent);
+      auto _K = md::submdspan(K, p, md::full_extent, md::full_extent);
       cmap.compute_jacobian_inverse(_J, _K);
       detJ[p] = cmap.compute_jacobian_determinant(_J, det_scratch);
     }
@@ -338,18 +586,11 @@ void interpolation_matrix(const FunctionSpace<U>& V0,
 
     for (std::size_t p = 0; p < basis0.extent(0); ++p)
     {
-      auto _u = MDSPAN_IMPL_STANDARD_NAMESPACE::submdspan(
-          basis0, p, MDSPAN_IMPL_STANDARD_NAMESPACE::full_extent,
-          MDSPAN_IMPL_STANDARD_NAMESPACE::full_extent);
-      auto _U = MDSPAN_IMPL_STANDARD_NAMESPACE::submdspan(
-          basis_reference0, p, MDSPAN_IMPL_STANDARD_NAMESPACE::full_extent,
-          MDSPAN_IMPL_STANDARD_NAMESPACE::full_extent);
-      auto _K = MDSPAN_IMPL_STANDARD_NAMESPACE::submdspan(
-          K, p, MDSPAN_IMPL_STANDARD_NAMESPACE::full_extent,
-          MDSPAN_IMPL_STANDARD_NAMESPACE::full_extent);
-      auto _J = MDSPAN_IMPL_STANDARD_NAMESPACE::submdspan(
-          J, p, MDSPAN_IMPL_STANDARD_NAMESPACE::full_extent,
-          MDSPAN_IMPL_STANDARD_NAMESPACE::full_extent);
+      auto _u = md::submdspan(basis0, p, md::full_extent, md::full_extent);
+      auto _U = md::submdspan(basis_reference0, p, md::full_extent,
+                              md::full_extent);
+      auto _K = md::submdspan(K, p, md::full_extent, md::full_extent);
+      auto _J = md::submdspan(J, p, md::full_extent, md::full_extent);
       push_forward_fn0(_u, _U, _J, detJ[p], _K);
     }
 
@@ -363,18 +604,12 @@ void interpolation_matrix(const FunctionSpace<U>& V0,
     // Pull back the physical values to the reference of output space
     for (std::size_t p = 0; p < basis_values.extent(0); ++p)
     {
-      auto _u = MDSPAN_IMPL_STANDARD_NAMESPACE::submdspan(
-          basis_values, p, MDSPAN_IMPL_STANDARD_NAMESPACE::full_extent,
-          MDSPAN_IMPL_STANDARD_NAMESPACE::full_extent);
-      auto _U = MDSPAN_IMPL_STANDARD_NAMESPACE::submdspan(
-          mapped_values, p, MDSPAN_IMPL_STANDARD_NAMESPACE::full_extent,
-          MDSPAN_IMPL_STANDARD_NAMESPACE::full_extent);
-      auto _K = MDSPAN_IMPL_STANDARD_NAMESPACE::submdspan(
-          K, p, MDSPAN_IMPL_STANDARD_NAMESPACE::full_extent,
-          MDSPAN_IMPL_STANDARD_NAMESPACE::full_extent);
-      auto _J = MDSPAN_IMPL_STANDARD_NAMESPACE::submdspan(
-          J, p, MDSPAN_IMPL_STANDARD_NAMESPACE::full_extent,
-          MDSPAN_IMPL_STANDARD_NAMESPACE::full_extent);
+      auto _u
+          = md::submdspan(basis_values, p, md::full_extent, md::full_extent);
+      auto _U
+          = md::submdspan(mapped_values, p, md::full_extent, md::full_extent);
+      auto _K = md::submdspan(K, p, md::full_extent, md::full_extent);
+      auto _J = md::submdspan(J, p, md::full_extent, md::full_extent);
       pull_back_fn1(_U, _u, _K, 1.0 / detJ[p], _J);
     }
 
@@ -382,9 +617,8 @@ void interpolation_matrix(const FunctionSpace<U>& V0,
     // interpolation points of V1
     if (interpolation_ident)
     {
-      MDSPAN_IMPL_STANDARD_NAMESPACE::mdspan<
-          T, MDSPAN_IMPL_STANDARD_NAMESPACE::dextents<std::size_t, 3>>
-          A(Ab.data(), Xshape[0], V1.element()->value_size(), space_dim0);
+      md::mdspan<T, md::dextents<std::size_t, 3>> A(
+          Ab.data(), Xshape[0], V1.element()->value_size(), space_dim0);
       for (std::size_t i = 0; i < mapped_values.extent(0); ++i)
         for (std::size_t j = 0; j < mapped_values.extent(1); ++j)
           for (std::size_t k = 0; k < mapped_values.extent(2); ++k)
@@ -394,9 +628,8 @@ void interpolation_matrix(const FunctionSpace<U>& V0,
     {
       for (std::size_t i = 0; i < mapped_values.extent(1); ++i)
       {
-        auto values = MDSPAN_IMPL_STANDARD_NAMESPACE::submdspan(
-            mapped_values, MDSPAN_IMPL_STANDARD_NAMESPACE::full_extent, i,
-            MDSPAN_IMPL_STANDARD_NAMESPACE::full_extent);
+        auto values
+            = md::submdspan(mapped_values, md::full_extent, i, md::full_extent);
         impl::interpolation_apply(Pi_1, values, std::span(local1), bs1);
         for (std::size_t j = 0; j < local1.size(); j++)
           Ab[space_dim0 * j + i] = local1[j];
diff --git a/cpp/dolfinx/fem/interpolate.h b/cpp/dolfinx/fem/interpolate.h
index f1ee49e1e42..e339d9d4e6b 100644
--- a/cpp/dolfinx/fem/interpolate.h
+++ b/cpp/dolfinx/fem/interpolate.h
@@ -45,13 +45,15 @@ concept MDSpan = requires(T x, std::size_t idx) {
 /// @param[in] cells Indices of the cells in the mesh to compute
 /// interpolation coordinates for.
 /// @return The coordinates in the physical space at which to evaluate
-/// an expression. The shape is (3, num_points) and storage is
+/// an expression. The shape is `(3, num_points)` and storage is
 /// row-major.
 template <std::floating_point T>
 std::vector<T> interpolation_coords(const fem::FiniteElement<T>& element,
                                     const mesh::Geometry<T>& geometry,
                                     std::span<const std::int32_t> cells)
 {
+  namespace md = MDSPAN_IMPL_STANDARD_NAMESPACE;
+
   // Get geometry data and the element coordinate map
   const std::size_t gdim = geometry.dim();
   auto x_dofmap = geometry.dofmap();
@@ -67,13 +69,11 @@ std::vector<T> interpolation_coords(const fem::FiniteElement<T>& element,
   std::array<std::size_t, 4> phi_shape = cmap.tabulate_shape(0, Xshape[0]);
   std::vector<T> phi_b(
       std::reduce(phi_shape.begin(), phi_shape.end(), 1, std::multiplies{}));
-  MDSPAN_IMPL_STANDARD_NAMESPACE::mdspan<
-      const T, MDSPAN_IMPL_STANDARD_NAMESPACE::dextents<std::size_t, 4>>
+  md::mdspan<const T, md::extents<std::size_t, 1, md::dynamic_extent,
+                                  md::dynamic_extent, 1>>
       phi_full(phi_b.data(), phi_shape);
   cmap.tabulate(0, X, Xshape, phi_b);
-  auto phi = MDSPAN_IMPL_STANDARD_NAMESPACE::submdspan(
-      phi_full, 0, MDSPAN_IMPL_STANDARD_NAMESPACE::full_extent,
-      MDSPAN_IMPL_STANDARD_NAMESPACE::full_extent, 0);
+  auto phi = md::submdspan(phi_full, 0, md::full_extent, md::full_extent, 0);
 
   // Push reference coordinates (X) forward to the physical coordinates
   // (x) for each cell
@@ -82,8 +82,7 @@ std::vector<T> interpolation_coords(const fem::FiniteElement<T>& element,
   for (std::size_t c = 0; c < cells.size(); ++c)
   {
     // Get geometry data for current cell
-    auto x_dofs = MDSPAN_IMPL_STANDARD_NAMESPACE::submdspan(
-        x_dofmap, cells[c], MDSPAN_IMPL_STANDARD_NAMESPACE::full_extent);
+    auto x_dofs = md::submdspan(x_dofmap, cells[c], md::full_extent);
     for (std::size_t i = 0; i < x_dofs.size(); ++i)
     {
       std::copy_n(std::next(x_g.begin(), 3 * x_dofs[i]), gdim,
@@ -109,18 +108,19 @@ std::vector<T> interpolation_coords(const fem::FiniteElement<T>& element,
 /// @brief Interpolate an evaluated expression f(x) in a finite element
 /// space.
 ///
-/// @tparam T Scalar type
-/// @tparam U Mesh geometry type
-/// @param[out] u Function object to interpolate into
+/// @tparam T Scalar type.
+/// @tparam U Mesh geometry type.
+///
+/// @param[out] u Function object to interpolate into.
 /// @param[in] f Evaluation of the function `f(x)` at the physical
-/// points `x` given by \ref interpolation_coords. The element used in
-/// \ref interpolation_coords should be the same element as associated
-/// with `u`. The shape of `f` is  `(value_size, num_points)`, with
-/// row-major storage.
+/// points `x` given by `interpolation_coords`. The element used in
+/// `interpolation_coords` should be the same element as associated with
+/// `u`. The shape of `f` is  `(value_size, num_points)`, with row-major
+/// storage.
 /// @param[in] fshape Shape of `f`.
 /// @param[in] cells Indices of the cells in the mesh on which to
 /// interpolate. Should be the same as the list of cells used when
-/// calling \ref interpolation_coords.
+/// calling `interpolation_coords`.
 template <dolfinx::scalar T, std::floating_point U>
 void interpolate(Function<T, U>& u, std::span<const T> f,
                  std::array<std::size_t, 2> fshape,
@@ -292,11 +292,11 @@ void scatter_values(MPI_Comm comm, std::span<const std::int32_t> src_ranks,
 
 /// @brief Apply interpolation operator Pi to data to evaluate the dof
 /// coefficients.
-/// @param[in] Pi The interpolation matrix (shape = (num dofs,
-/// num_points * value_size)).
+/// @param[in] Pi The interpolation matrix (`shape=(num dofs, num_points
+/// * value_size)`).
 /// @param[in] data Function evaluations, by point, e.g. (f0(x0),
 /// f1(x0), f0(x1), f1(x1), ...).
-/// @param[out] coeffs The degrees of freedom to compute.
+/// @param[out] coeffs Degrees of freedom to compute.
 /// @param[in] bs The block size.
 template <MDSpan U, MDSpan V, dolfinx::scalar T>
 void interpolation_apply(U&& Pi, V&& data, std::span<T> coeffs, int bs)
@@ -459,6 +459,8 @@ void interpolate_nonmatching_maps(Function<T, U>& u1,
                                   const Function<T, U>& u0,
                                   std::span<const std::int32_t> cells0)
 {
+  namespace md = MDSPAN_IMPL_STANDARD_NAMESPACE;
+
   // Get mesh
   auto V0 = u0.function_space();
   assert(V0);
@@ -515,46 +517,60 @@ void interpolate_nonmatching_maps(Function<T, U>& u1,
   const std::size_t num_dofs_g = cmap.dim();
   std::span<const U> x_g = mesh0->geometry().x();
 
+  // (0) is derivative index, (1) is the point index, (2) is the basis
+  // function index and (3) is the basis function component.
+
   // Evaluate coordinate map basis at reference interpolation points
   const std::array<std::size_t, 4> phi_shape
       = cmap.tabulate_shape(1, Xshape[0]);
   std::vector<U> phi_b(
       std::reduce(phi_shape.begin(), phi_shape.end(), 1, std::multiplies{}));
-  mdspan_t<const U, 4> phi(phi_b.data(), phi_shape);
+  md::mdspan<const U, md::extents<std::size_t, md::dynamic_extent,
+                                  md::dynamic_extent, md::dynamic_extent, 1>>
+      phi(phi_b.data(), phi_shape);
   cmap.tabulate(1, X, Xshape, phi_b);
 
   // Evaluate v basis functions at reference interpolation points
   const auto [_basis_derivatives_reference0, b0shape]
       = element0->tabulate(X, Xshape, 0);
-  mdspan_t<const U, 4> basis_derivatives_reference0(
-      _basis_derivatives_reference0.data(), b0shape);
+  md::mdspan<const U, std::extents<std::size_t, 1, md::dynamic_extent,
+                                   md::dynamic_extent, md::dynamic_extent>>
+      basis_derivatives_reference0(_basis_derivatives_reference0.data(),
+                                   b0shape);
 
   // Create working arrays
   std::vector<T> local1(element1->space_dimension());
   std::vector<T> coeffs0(element0->space_dimension());
 
   std::vector<U> basis0_b(Xshape[0] * dim0 * value_size0);
-  impl::mdspan_t<U, 3> basis0(basis0_b.data(), Xshape[0], dim0, value_size0);
+  md::mdspan<U, std::dextents<std::size_t, 3>> basis0(
+      basis0_b.data(), Xshape[0], dim0, value_size0);
 
   std::vector<U> basis_reference0_b(Xshape[0] * dim0 * value_size_ref0);
-  impl::mdspan_t<U, 3> basis_reference0(basis_reference0_b.data(), Xshape[0],
-                                        dim0, value_size_ref0);
+  md::mdspan<U, std::dextents<std::size_t, 3>> basis_reference0(
+      basis_reference0_b.data(), Xshape[0], dim0, value_size_ref0);
 
   std::vector<T> values0_b(Xshape[0] * 1 * V1->element()->value_size());
-  impl::mdspan_t<T, 3> values0(values0_b.data(), Xshape[0], 1,
-                               V1->element()->value_size());
+  md::mdspan<
+      T, md::extents<std::size_t, md::dynamic_extent, 1, md::dynamic_extent>>
+      values0(values0_b.data(), Xshape[0], 1, V1->element()->value_size());
 
   std::vector<T> mapped_values_b(Xshape[0] * 1 * V1->element()->value_size());
-  impl::mdspan_t<T, 3> mapped_values0(mapped_values_b.data(), Xshape[0], 1,
-                                      V1->element()->value_size());
+  md::mdspan<
+      T, md::extents<std::size_t, md::dynamic_extent, 1, md::dynamic_extent>>
+      mapped_values0(mapped_values_b.data(), Xshape[0], 1,
+                     V1->element()->value_size());
 
   std::vector<U> coord_dofs_b(num_dofs_g * gdim);
-  impl::mdspan_t<U, 2> coord_dofs(coord_dofs_b.data(), num_dofs_g, gdim);
+  md::mdspan<U, std::dextents<std::size_t, 2>> coord_dofs(coord_dofs_b.data(),
+                                                          num_dofs_g, gdim);
 
   std::vector<U> J_b(Xshape[0] * gdim * tdim);
-  impl::mdspan_t<U, 3> J(J_b.data(), Xshape[0], gdim, tdim);
+  md::mdspan<U, std::dextents<std::size_t, 3>> J(J_b.data(), Xshape[0], gdim,
+                                                 tdim);
   std::vector<U> K_b(Xshape[0] * tdim * gdim);
-  impl::mdspan_t<U, 3> K(K_b.data(), Xshape[0], tdim, gdim);
+  md::mdspan<U, std::dextents<std::size_t, 3>> K(K_b.data(), Xshape[0], tdim,
+                                                 gdim);
   std::vector<U> detJ(Xshape[0]);
   std::vector<U> det_scratch(2 * gdim * tdim);
 
@@ -562,15 +578,16 @@ void interpolate_nonmatching_maps(Function<T, U>& u1,
   const auto [_Pi_1, pi_shape] = element1->interpolation_operator();
   impl::mdspan_t<const U, 2> Pi_1(_Pi_1.data(), pi_shape);
 
-  using u_t = impl::mdspan_t<U, 2>;
-  using U_t = impl::mdspan_t<const U, 2>;
-  using J_t = impl::mdspan_t<const U, 2>;
-  using K_t = impl::mdspan_t<const U, 2>;
+  using u_t = md::mdspan<U, std::dextents<std::size_t, 2>>;
+  using U_t = md::mdspan<const U, std::dextents<std::size_t, 2>>;
+  using J_t = md::mdspan<const U, std::dextents<std::size_t, 2>>;
+  using K_t = md::mdspan<const U, std::dextents<std::size_t, 2>>;
   auto push_forward_fn0
       = element0->basix_element().template map_fn<u_t, U_t, J_t, K_t>();
 
-  using v_t = impl::mdspan_t<const T, 2>;
-  using V_t = impl::mdspan_t<T, 2>;
+  using v_t = md::mdspan<const T, std::dextents<std::size_t, 2>>;
+  using V_t = decltype(md::submdspan(mapped_values0, 0, md::full_extent,
+                                     md::full_extent));
   auto pull_back_fn1
       = element1->basix_element().template map_fn<V_t, v_t, K_t, J_t>();
 
@@ -580,8 +597,7 @@ void interpolate_nonmatching_maps(Function<T, U>& u1,
   for (std::size_t c = 0; c < cells0.size(); c++)
   {
     // Get cell geometry (coordinate dofs)
-    auto x_dofs = MDSPAN_IMPL_STANDARD_NAMESPACE::submdspan(
-        x_dofmap, cells0[c], MDSPAN_IMPL_STANDARD_NAMESPACE::full_extent);
+    auto x_dofs = md::submdspan(x_dofmap, cells0[c], md::full_extent);
     for (std::size_t i = 0; i < num_dofs_g; ++i)
     {
       const int pos = 3 * x_dofs[i];
@@ -593,17 +609,11 @@ void interpolate_nonmatching_maps(Function<T, U>& u1,
     std::ranges::fill(J_b, 0);
     for (std::size_t p = 0; p < Xshape[0]; ++p)
     {
-      auto dphi = MDSPAN_IMPL_STANDARD_NAMESPACE::submdspan(
-          phi, std::pair(1, tdim + 1), p,
-          MDSPAN_IMPL_STANDARD_NAMESPACE::full_extent, 0);
-
-      auto _J = MDSPAN_IMPL_STANDARD_NAMESPACE::submdspan(
-          J, p, MDSPAN_IMPL_STANDARD_NAMESPACE::full_extent,
-          MDSPAN_IMPL_STANDARD_NAMESPACE::full_extent);
+      auto dphi
+          = md::submdspan(phi, std::pair(1, tdim + 1), p, md::full_extent, 0);
+      auto _J = md::submdspan(J, p, md::full_extent, md::full_extent);
       cmap.compute_jacobian(dphi, coord_dofs, _J);
-      auto _K = MDSPAN_IMPL_STANDARD_NAMESPACE::submdspan(
-          K, p, MDSPAN_IMPL_STANDARD_NAMESPACE::full_extent,
-          MDSPAN_IMPL_STANDARD_NAMESPACE::full_extent);
+      auto _K = md::submdspan(K, p, md::full_extent, md::full_extent);
       cmap.compute_jacobian_inverse(_J, _K);
       detJ[p] = cmap.compute_jacobian_determinant(_J, det_scratch);
     }
@@ -626,18 +636,11 @@ void interpolate_nonmatching_maps(Function<T, U>& u1,
 
     for (std::size_t i = 0; i < basis0.extent(0); ++i)
     {
-      auto _u = MDSPAN_IMPL_STANDARD_NAMESPACE::submdspan(
-          basis0, i, MDSPAN_IMPL_STANDARD_NAMESPACE::full_extent,
-          MDSPAN_IMPL_STANDARD_NAMESPACE::full_extent);
-      auto _U = MDSPAN_IMPL_STANDARD_NAMESPACE::submdspan(
-          basis_reference0, i, MDSPAN_IMPL_STANDARD_NAMESPACE::full_extent,
-          MDSPAN_IMPL_STANDARD_NAMESPACE::full_extent);
-      auto _K = MDSPAN_IMPL_STANDARD_NAMESPACE::submdspan(
-          K, i, MDSPAN_IMPL_STANDARD_NAMESPACE::full_extent,
-          MDSPAN_IMPL_STANDARD_NAMESPACE::full_extent);
-      auto _J = MDSPAN_IMPL_STANDARD_NAMESPACE::submdspan(
-          J, i, MDSPAN_IMPL_STANDARD_NAMESPACE::full_extent,
-          MDSPAN_IMPL_STANDARD_NAMESPACE::full_extent);
+      auto _u = md::submdspan(basis0, i, md::full_extent, md::full_extent);
+      auto _U = md::submdspan(basis_reference0, i, md::full_extent,
+                              md::full_extent);
+      auto _K = md::submdspan(K, i, md::full_extent, md::full_extent);
+      auto _J = md::submdspan(J, i, md::full_extent, md::full_extent);
       push_forward_fn0(_u, _U, _J, detJ[i], _K);
     }
 
@@ -667,24 +670,16 @@ void interpolate_nonmatching_maps(Function<T, U>& u1,
     // Pull back the physical values to the u reference
     for (std::size_t i = 0; i < values0.extent(0); ++i)
     {
-      auto _u = MDSPAN_IMPL_STANDARD_NAMESPACE::submdspan(
-          values0, i, MDSPAN_IMPL_STANDARD_NAMESPACE::full_extent,
-          MDSPAN_IMPL_STANDARD_NAMESPACE::full_extent);
-      auto _U = MDSPAN_IMPL_STANDARD_NAMESPACE::submdspan(
-          mapped_values0, i, MDSPAN_IMPL_STANDARD_NAMESPACE::full_extent,
-          MDSPAN_IMPL_STANDARD_NAMESPACE::full_extent);
-      auto _K = MDSPAN_IMPL_STANDARD_NAMESPACE::submdspan(
-          K, i, MDSPAN_IMPL_STANDARD_NAMESPACE::full_extent,
-          MDSPAN_IMPL_STANDARD_NAMESPACE::full_extent);
-      auto _J = MDSPAN_IMPL_STANDARD_NAMESPACE::submdspan(
-          J, i, MDSPAN_IMPL_STANDARD_NAMESPACE::full_extent,
-          MDSPAN_IMPL_STANDARD_NAMESPACE::full_extent);
+      auto _u = md::submdspan(values0, i, md::full_extent, md::full_extent);
+      auto _U
+          = md::submdspan(mapped_values0, i, md::full_extent, md::full_extent);
+      auto _K = md::submdspan(K, i, md::full_extent, md::full_extent);
+      auto _J = md::submdspan(J, i, md::full_extent, md::full_extent);
       pull_back_fn1(_U, _u, _K, 1.0 / detJ[i], _J);
     }
 
-    auto values = MDSPAN_IMPL_STANDARD_NAMESPACE::submdspan(
-        mapped_values0, MDSPAN_IMPL_STANDARD_NAMESPACE::full_extent, 0,
-        MDSPAN_IMPL_STANDARD_NAMESPACE::full_extent);
+    auto values
+        = md::submdspan(mapped_values0, md::full_extent, 0, md::full_extent);
     interpolation_apply(Pi_1, values, std::span(local1), bs1);
     apply_inverse_dof_transform1(local1, cell_info1, cells1[c], 1);
 
@@ -705,14 +700,8 @@ void interpolate(Function<T, U>& u, std::span<const T> f,
                  std::array<std::size_t, 2> fshape,
                  std::span<const std::int32_t> cells)
 {
-  using cmdspan2_t = MDSPAN_IMPL_STANDARD_NAMESPACE::mdspan<
-      const U, MDSPAN_IMPL_STANDARD_NAMESPACE::dextents<std::size_t, 2>>;
-  using cmdspan4_t = MDSPAN_IMPL_STANDARD_NAMESPACE::mdspan<
-      const U, MDSPAN_IMPL_STANDARD_NAMESPACE::dextents<std::size_t, 4>>;
-  using mdspan2_t = MDSPAN_IMPL_STANDARD_NAMESPACE::mdspan<
-      U, MDSPAN_IMPL_STANDARD_NAMESPACE::dextents<std::size_t, 2>>;
-  using mdspan3_t = MDSPAN_IMPL_STANDARD_NAMESPACE::mdspan<
-      U, MDSPAN_IMPL_STANDARD_NAMESPACE::dextents<std::size_t, 3>>;
+  namespace md = MDSPAN_IMPL_STANDARD_NAMESPACE;
+
   auto element = u.function_space()->element();
   assert(element);
   const int element_bs = element->block_size();
@@ -743,9 +732,7 @@ void interpolate(Function<T, U>& u, std::span<const T> f,
 
   const std::size_t f_shape1
       = f.size() / u.function_space()->element()->value_size();
-  MDSPAN_IMPL_STANDARD_NAMESPACE::mdspan<
-      const T, MDSPAN_IMPL_STANDARD_NAMESPACE::dextents<std::size_t, 2>>
-      _f(f.data(), fshape);
+  md::mdspan<const T, md::dextents<std::size_t, 2>> _f(f.data(), fshape);
 
   // Get dofmap
   const auto dofmap = u.function_space()->dofmap();
@@ -862,7 +849,7 @@ void interpolate(Function<T, U>& u, std::span<const T> f,
 
     // Get interpolation operator
     const auto [_Pi, pi_shape] = element->interpolation_operator();
-    cmdspan2_t Pi(_Pi.data(), pi_shape);
+    md::mdspan<const U, std::dextents<std::size_t, 2>> Pi(_Pi.data(), pi_shape);
     const std::size_t num_interp_points = Pi.extent(1);
     assert(Pi.extent(0) == num_scalar_dofs);
 
@@ -872,10 +859,8 @@ void interpolate(Function<T, U>& u, std::span<const T> f,
 
     // Loop over cells
     std::vector<T> ref_data_b(num_interp_points);
-    MDSPAN_IMPL_STANDARD_NAMESPACE::mdspan<
-        T, MDSPAN_IMPL_STANDARD_NAMESPACE::extents<
-               std::size_t, MDSPAN_IMPL_STANDARD_NAMESPACE::dynamic_extent, 1>>
-        ref_data(ref_data_b.data(), num_interp_points, 1);
+    md::mdspan<T, md::extents<std::size_t, md::dynamic_extent, 1>> ref_data(
+        ref_data_b.data(), num_interp_points, 1);
     for (std::size_t c = 0; c < cells.size(); ++c)
     {
       const std::int32_t cell = cells[c];
@@ -930,23 +915,26 @@ void interpolate(Function<T, U>& u, std::span<const T> f,
 
     // Create data structures for Jacobian info
     std::vector<U> J_b(Xshape[0] * gdim * tdim);
-    mdspan3_t J(J_b.data(), Xshape[0], gdim, tdim);
+    md::mdspan<U, std::dextents<std::size_t, 3>> J(J_b.data(), Xshape[0], gdim,
+                                                   tdim);
     std::vector<U> K_b(Xshape[0] * tdim * gdim);
-    mdspan3_t K(K_b.data(), Xshape[0], tdim, gdim);
+    md::mdspan<U, std::dextents<std::size_t, 3>> K(K_b.data(), Xshape[0], tdim,
+                                                   gdim);
     std::vector<U> detJ(Xshape[0]);
     std::vector<U> det_scratch(2 * gdim * tdim);
 
     std::vector<U> coord_dofs_b(num_dofs_g * gdim);
-    mdspan2_t coord_dofs(coord_dofs_b.data(), num_dofs_g, gdim);
+    md::mdspan<U, std::dextents<std::size_t, 2>> coord_dofs(coord_dofs_b.data(),
+                                                            num_dofs_g, gdim);
     const std::size_t value_size_ref = element->reference_value_size();
     std::vector<T> ref_data_b(Xshape[0] * 1 * value_size_ref);
-    MDSPAN_IMPL_STANDARD_NAMESPACE::mdspan<
-        T, MDSPAN_IMPL_STANDARD_NAMESPACE::dextents<std::size_t, 3>>
+    md::mdspan<
+        T, md::extents<std::size_t, md::dynamic_extent, 1, md::dynamic_extent>>
         ref_data(ref_data_b.data(), Xshape[0], 1, value_size_ref);
 
     std::vector<T> _vals_b(Xshape[0] * 1 * value_size);
-    MDSPAN_IMPL_STANDARD_NAMESPACE::mdspan<
-        T, MDSPAN_IMPL_STANDARD_NAMESPACE::dextents<std::size_t, 3>>
+    md::mdspan<
+        T, md::extents<std::size_t, md::dynamic_extent, 1, md::dynamic_extent>>
         _vals(_vals_b.data(), Xshape[0], 1, value_size);
 
     // Tabulate 1st derivative of shape functions at interpolation
@@ -954,12 +942,12 @@ void interpolate(Function<T, U>& u, std::span<const T> f,
     std::array<std::size_t, 4> phi_shape = cmap.tabulate_shape(1, Xshape[0]);
     std::vector<U> phi_b(
         std::reduce(phi_shape.begin(), phi_shape.end(), 1, std::multiplies{}));
-    cmdspan4_t phi(phi_b.data(), phi_shape);
+    md::mdspan<const U, md::extents<std::size_t, md::dynamic_extent,
+                                    md::dynamic_extent, md::dynamic_extent, 1>>
+        phi(phi_b.data(), phi_shape);
     cmap.tabulate(1, X, Xshape, phi_b);
-    auto dphi = MDSPAN_IMPL_STANDARD_NAMESPACE::submdspan(
-        phi, std::pair(1, tdim + 1),
-        MDSPAN_IMPL_STANDARD_NAMESPACE::full_extent,
-        MDSPAN_IMPL_STANDARD_NAMESPACE::full_extent, 0);
+    auto dphi = md::submdspan(phi, std::pair(1, tdim + 1), md::full_extent,
+                              md::full_extent, 0);
 
     const std::function<void(std::span<T>, std::span<const std::uint32_t>,
                              std::int32_t, int)>
@@ -969,24 +957,20 @@ void interpolate(Function<T, U>& u, std::span<const T> f,
 
     // Get interpolation operator
     const auto [_Pi, pi_shape] = element->interpolation_operator();
-    cmdspan2_t Pi(_Pi.data(), pi_shape);
-
-    using u_t = MDSPAN_IMPL_STANDARD_NAMESPACE::mdspan<
-        const T, MDSPAN_IMPL_STANDARD_NAMESPACE::dextents<std::size_t, 2>>;
-    using U_t = MDSPAN_IMPL_STANDARD_NAMESPACE::mdspan<
-        T, MDSPAN_IMPL_STANDARD_NAMESPACE::dextents<std::size_t, 2>>;
-    using J_t = MDSPAN_IMPL_STANDARD_NAMESPACE::mdspan<
-        const U, MDSPAN_IMPL_STANDARD_NAMESPACE::dextents<std::size_t, 2>>;
-    using K_t = MDSPAN_IMPL_STANDARD_NAMESPACE::mdspan<
-        const U, MDSPAN_IMPL_STANDARD_NAMESPACE::dextents<std::size_t, 2>>;
+    md::mdspan<const U, std::dextents<std::size_t, 2>> Pi(_Pi.data(), pi_shape);
+
+    using u_t = md::mdspan<const T, md::dextents<std::size_t, 2>>;
+    using U_t = decltype(md::submdspan(ref_data, 0, md::full_extent,
+                                       md::full_extent));
+    using J_t = md::mdspan<const U, md::dextents<std::size_t, 2>>;
+    using K_t = md::mdspan<const U, md::dextents<std::size_t, 2>>;
     auto pull_back_fn
         = element->basix_element().template map_fn<U_t, u_t, J_t, K_t>();
 
     for (std::size_t c = 0; c < cells.size(); ++c)
     {
       const std::int32_t cell = cells[c];
-      auto x_dofs = MDSPAN_IMPL_STANDARD_NAMESPACE::submdspan(
-          x_dofmap, cell, MDSPAN_IMPL_STANDARD_NAMESPACE::full_extent);
+      auto x_dofs = md::submdspan(x_dofmap, cell, md::full_extent);
       for (int i = 0; i < num_dofs_g; ++i)
       {
         const int pos = 3 * x_dofs[i];
@@ -998,16 +982,10 @@ void interpolate(Function<T, U>& u, std::span<const T> f,
       std::ranges::fill(J_b, 0);
       for (std::size_t p = 0; p < Xshape[0]; ++p)
       {
-        auto _dphi = MDSPAN_IMPL_STANDARD_NAMESPACE::submdspan(
-            dphi, MDSPAN_IMPL_STANDARD_NAMESPACE::full_extent, p,
-            MDSPAN_IMPL_STANDARD_NAMESPACE::full_extent);
-        auto _J = MDSPAN_IMPL_STANDARD_NAMESPACE::submdspan(
-            J, p, MDSPAN_IMPL_STANDARD_NAMESPACE::full_extent,
-            MDSPAN_IMPL_STANDARD_NAMESPACE::full_extent);
+        auto _dphi = md::submdspan(dphi, md::full_extent, p, md::full_extent);
+        auto _J = md::submdspan(J, p, md::full_extent, md::full_extent);
         cmap.compute_jacobian(_dphi, coord_dofs, _J);
-        auto _K = MDSPAN_IMPL_STANDARD_NAMESPACE::submdspan(
-            K, p, MDSPAN_IMPL_STANDARD_NAMESPACE::full_extent,
-            MDSPAN_IMPL_STANDARD_NAMESPACE::full_extent);
+        auto _K = md::submdspan(K, p, md::full_extent, md::full_extent);
         cmap.compute_jacobian_inverse(_J, _K);
         detJ[p] = cmap.compute_jacobian_determinant(_J, det_scratch);
       }
@@ -1028,24 +1006,15 @@ void interpolate(Function<T, U>& u, std::span<const T> f,
         // Get element degrees of freedom for block
         for (std::size_t i = 0; i < Xshape[0]; ++i)
         {
-          auto _u = MDSPAN_IMPL_STANDARD_NAMESPACE::submdspan(
-              _vals, i, MDSPAN_IMPL_STANDARD_NAMESPACE::full_extent,
-              MDSPAN_IMPL_STANDARD_NAMESPACE::full_extent);
-          auto _U = MDSPAN_IMPL_STANDARD_NAMESPACE::submdspan(
-              ref_data, i, MDSPAN_IMPL_STANDARD_NAMESPACE::full_extent,
-              MDSPAN_IMPL_STANDARD_NAMESPACE::full_extent);
-          auto _K = MDSPAN_IMPL_STANDARD_NAMESPACE::submdspan(
-              K, i, MDSPAN_IMPL_STANDARD_NAMESPACE::full_extent,
-              MDSPAN_IMPL_STANDARD_NAMESPACE::full_extent);
-          auto _J = MDSPAN_IMPL_STANDARD_NAMESPACE::submdspan(
-              J, i, MDSPAN_IMPL_STANDARD_NAMESPACE::full_extent,
-              MDSPAN_IMPL_STANDARD_NAMESPACE::full_extent);
+          auto _u = md::submdspan(_vals, i, md::full_extent, md::full_extent);
+          auto _U
+              = md::submdspan(ref_data, i, md::full_extent, md::full_extent);
+          auto _K = md::submdspan(K, i, md::full_extent, md::full_extent);
+          auto _J = md::submdspan(J, i, md::full_extent, md::full_extent);
           pull_back_fn(_U, _u, _K, 1.0 / detJ[i], _J);
         }
 
-        auto ref = MDSPAN_IMPL_STANDARD_NAMESPACE::submdspan(
-            ref_data, MDSPAN_IMPL_STANDARD_NAMESPACE::full_extent, 0,
-            MDSPAN_IMPL_STANDARD_NAMESPACE::full_extent);
+        auto ref = md::submdspan(ref_data, md::full_extent, 0, md::full_extent);
         impl::interpolation_apply(Pi, ref, std::span(_coeffs), element_bs);
         apply_inverse_transpose_dof_transformation(_coeffs, cell_info, cell, 1);
 
@@ -1062,15 +1031,15 @@ void interpolate(Function<T, U>& u, std::span<const T> f,
   }
 }
 
-/// @brief Generate data needed to interpolate finite element Functions
-/// across different meshes.
+/// @brief Generate data needed to interpolate finite element
+/// fem::Function's across different meshes.
 ///
-/// @param[in] geometry0 Mesh geometry of the space to interpolate into
-/// @param[in] element0 Element of the space to interpolate into
-/// @param[in] mesh1 Mesh of the function to interpolate from
+/// @param[in] geometry0 Mesh geometry of the space to interpolate into.
+/// @param[in] element0 Element of the space to interpolate into.
+/// @param[in] mesh1 Mesh of the function to interpolate from.
 /// @param[in] cells Indices of the cells in the destination mesh on
 /// which to interpolate. Should be the same as the list used when
-/// calling \ref interpolation_coords.
+/// calling `interpolation_coords`.
 /// @param[in] padding Absolute padding of bounding boxes of all
 /// entities on `mesh1`. This is used avoid floating point issues when
 /// an interpolation point from `mesh0` is on the surface of a cell in
@@ -1102,6 +1071,7 @@ geometry::PointOwnershipData<T> create_interpolation_data(
 
 /// @brief Interpolate a finite element Function defined on a mesh to a
 /// finite element Function defined on different (non-matching) mesh.
+///
 /// @tparam T Function scalar type.
 /// @tparam U mesh::Mesh geometry scalar type.
 /// @param u Function to interpolate into.
@@ -1116,6 +1086,8 @@ void interpolate(Function<T, U>& u, const Function<T, U>& v,
                  std::span<const std::int32_t> cells,
                  const geometry::PointOwnershipData<U>& interpolation_data)
 {
+  namespace md = MDSPAN_IMPL_STANDARD_NAMESPACE;
+
   auto mesh = u.function_space()->mesh();
   assert(mesh);
   MPI_Comm comm = mesh->comm();
@@ -1149,20 +1121,18 @@ void interpolate(Function<T, U>& u, const Function<T, U>& v,
   v.eval(recv_points, {recv_points.size() / 3, (std::size_t)3},
          evaluation_cells, send_values, {recv_points.size() / 3, value_size});
 
-  using dextents2 = MDSPAN_IMPL_STANDARD_NAMESPACE::dextents<std::size_t, 2>;
-
   // Send values back to owning process
   std::vector<T> values_b(dest_ranks.size() * value_size);
-  MDSPAN_IMPL_STANDARD_NAMESPACE::mdspan<const T, dextents2> _send_values(
+  md::mdspan<const T, md::dextents<std::size_t, 2>> _send_values(
       send_values.data(), src_ranks.size(), value_size);
   impl::scatter_values(comm, src_ranks, dest_ranks, _send_values,
                        std::span(values_b));
 
   // Transpose received data
-  MDSPAN_IMPL_STANDARD_NAMESPACE::mdspan<const T, dextents2> values(
+  md::mdspan<const T, md::dextents<std::size_t, 2>> values(
       values_b.data(), dest_ranks.size(), value_size);
   std::vector<T> valuesT_b(value_size * dest_ranks.size());
-  MDSPAN_IMPL_STANDARD_NAMESPACE::mdspan<T, dextents2> valuesT(
+  md::mdspan<T, md::dextents<std::size_t, 2>> valuesT(
       valuesT_b.data(), value_size, dest_ranks.size());
   for (std::size_t i = 0; i < values.extent(0); ++i)
     for (std::size_t j = 0; j < values.extent(1); ++j)
diff --git a/python/dolfinx/fem/__init__.py b/python/dolfinx/fem/__init__.py
index 44ca560ffea..8430eee2b9a 100644
--- a/python/dolfinx/fem/__init__.py
+++ b/python/dolfinx/fem/__init__.py
@@ -12,6 +12,7 @@
 from dolfinx.cpp.fem import compute_integration_domains as _compute_integration_domains
 from dolfinx.cpp.fem import create_interpolation_data as _create_interpolation_data
 from dolfinx.cpp.fem import create_sparsity_pattern as _create_sparsity_pattern
+from dolfinx.cpp.fem import discrete_curl as _discrete_curl
 from dolfinx.cpp.fem import discrete_gradient as _discrete_gradient
 from dolfinx.cpp.fem import interpolation_matrix as _interpolation_matrix
 from dolfinx.cpp.mesh import Topology
@@ -100,21 +101,38 @@ def create_interpolation_data(
     )
 
 
+def discrete_curl(V0: FunctionSpace, V1: FunctionSpace) -> _MatrixCSR:
+    """Assemble a discrete curl operator.
+
+    The discrete curl operator interpolates the curl of H(curl) finite
+    element function into a H(div) space.
+
+    Args:
+        V0: H1(curl) space to interpolate the curl from.
+        V1: H(div) space to interpolate into.
+
+    Returns:
+        Discrete curl operator.
+    """
+    return _MatrixCSR(_discrete_curl(V0._cpp_object, V1._cpp_object))
+
+
 def discrete_gradient(space0: FunctionSpace, space1: FunctionSpace) -> _MatrixCSR:
     """Assemble a discrete gradient operator.
 
-    The discrete gradient operator interpolates the gradient of
-    a H1 finite element function into a H(curl) space. It is assumed that
-    the H1 space uses an identity map and the H(curl) space uses a covariant Piola map.
+    The discrete gradient operator interpolates the gradient of a H1
+    finite element function into a H(curl) space. It is assumed that the
+    H1 space uses an identity map and the H(curl) space uses a covariant
+    Piola map.
 
     Args:
-        space0: H1 space to interpolate the gradient from
-        space1: H(curl) space to interpolate into
+        space0: H1 space to interpolate the gradient from.
+        space1: H(curl) space to interpolate into.
 
     Returns:
-        Discrete gradient operator
+        Discrete gradient operator.
     """
-    return _discrete_gradient(space0._cpp_object, space1._cpp_object)
+    return _MatrixCSR(_discrete_gradient(space0._cpp_object, space1._cpp_object))
 
 
 def interpolation_matrix(space0: FunctionSpace, space1: FunctionSpace) -> _MatrixCSR:
@@ -135,13 +153,14 @@ def compute_integration_domains(
 ):
     """Given an integral type and a set of entities compute integration entities.
 
-    This function returns a list `[(id, entities)]`. For cell integrals
-    `entities` are the cell indices. For exterior facet integrals,
-    `entities` is a list of `(cell_index, local_facet_index)` pairs. For
-    interior facet integrals, `entities` is a list of `(cell_index0,
-    local_facet_index0, cell_index1, local_facet_index1)`. `id` refers
-    to the subdomain id used in the definition of the integration
-    measures of the variational form.
+    This function returns a list ``[(id, entities)]``. For cell
+    integrals ``entities`` are the cell indices. For exterior facet
+    integrals, ``entities`` is a list of ``(cell_index,
+    local_facet_index)`` pairs. For interior facet integrals,
+    ``entities`` is a list of ``(cell_index0, local_facet_index0,
+    cell_index1, local_facet_index1)``. ``id`` refers to the subdomain
+    id used in the definition of the integration measures of the
+    variational form.
 
     Note:
         Owned mesh entities only are returned. Ghost entities are not
diff --git a/python/dolfinx/fem/petsc.py b/python/dolfinx/fem/petsc.py
index 4305c004ab0..902f0c9bc0d 100644
--- a/python/dolfinx/fem/petsc.py
+++ b/python/dolfinx/fem/petsc.py
@@ -31,6 +31,7 @@
 from dolfinx import la
 from dolfinx.cpp.fem import pack_coefficients as _pack_coefficients
 from dolfinx.cpp.fem import pack_constants as _pack_constants
+from dolfinx.cpp.fem.petsc import discrete_curl as _discrete_curl
 from dolfinx.cpp.fem.petsc import discrete_gradient as _discrete_gradient
 from dolfinx.cpp.fem.petsc import interpolation_matrix as _interpolation_matrix
 from dolfinx.fem import assemble as _assemble
@@ -60,6 +61,7 @@
     "create_vector",
     "create_vector_block",
     "create_vector_nest",
+    "discrete_curl",
     "discrete_gradient",
     "interpolation_matrix",
     "set_bc",
@@ -1043,6 +1045,19 @@ def J(self, x: PETSc.Vec, A: PETSc.Mat) -> None:
         A.assemble()
 
 
+def discrete_curl(space0: _FunctionSpace, space1: _FunctionSpace) -> PETSc.Mat:
+    """Assemble a discrete curl operator.
+
+    Args:
+        space0: H1 space to interpolate the gradient from.
+        space1: H(curl) space to interpolate into.
+
+    Returns:
+        Discrete curl operator.
+    """
+    return _discrete_curl(space0._cpp_object, space1._cpp_object)
+
+
 def discrete_gradient(space0: _FunctionSpace, space1: _FunctionSpace) -> PETSc.Mat:
     """Assemble a discrete gradient operator.
 
diff --git a/python/dolfinx/wrappers/assemble.cpp b/python/dolfinx/wrappers/assemble.cpp
index 2d85dd7501f..107a5a24ea7 100644
--- a/python/dolfinx/wrappers/assemble.cpp
+++ b/python/dolfinx/wrappers/assemble.cpp
@@ -139,6 +139,20 @@ void declare_discrete_operators(nb::module_& m)
           return A;
         });
 
+  m.def(
+      "discrete_curl",
+      [](const dolfinx::fem::FunctionSpace<U>& V0,
+         const dolfinx::fem::FunctionSpace<U>& V1)
+      {
+        dolfinx::la::SparsityPattern sp = create_sparsity(V0, V1);
+
+        // Build operator
+        dolfinx::la::MatrixCSR<T> A(sp);
+        dolfinx::fem::discrete_curl<U, T>(V0, V1, A.mat_set_values());
+        return A;
+      },
+      nb::arg("V0"), nb::arg("V1"));
+
   m.def(
       "discrete_gradient",
       [](const dolfinx::fem::FunctionSpace<U>& V0,
diff --git a/python/dolfinx/wrappers/petsc.cpp b/python/dolfinx/wrappers/petsc.cpp
index f0841f54854..553bffe0ca4 100644
--- a/python/dolfinx/wrappers/petsc.cpp
+++ b/python/dolfinx/wrappers/petsc.cpp
@@ -1,4 +1,4 @@
-// Copyright (C) 2017-2023 Chris Richardson and Garth N. Wells
+// Copyright (C) 2017-2025 Chris Richardson and Garth N. Wells
 //
 // This file is part of DOLFINx (https://www.fenicsproject.org)
 //
@@ -10,6 +10,7 @@
 #include "caster_mpi.h"
 #include "caster_petsc.h"
 #include "pycoeff.h"
+#include <concepts>
 #include <dolfinx/common/IndexMap.h>
 #include <dolfinx/fem/DirichletBC.h>
 #include <dolfinx/fem/DofMap.h>
@@ -41,11 +42,11 @@
 namespace
 {
 // Declare assembler function that have multiple scalar types
-template <typename T, typename U>
+template <typename T, std::floating_point U>
 void declare_petsc_discrete_operators(nb::module_& m)
 {
   m.def(
-      "discrete_gradient",
+      "discrete_curl",
       [](const dolfinx::fem::FunctionSpace<U>& V0,
          const dolfinx::fem::FunctionSpace<U>& V1)
       {
@@ -53,7 +54,46 @@ void declare_petsc_discrete_operators(nb::module_& m)
         auto mesh = V0.mesh();
         assert(V1.mesh());
         assert(mesh == V1.mesh());
+
+        auto dofmap0 = V0.dofmap();
+        assert(dofmap0);
+        auto dofmap1 = V1.dofmap();
+        assert(dofmap1);
+
+        // Create and build  sparsity pattern
+        assert(dofmap0->index_map);
+        assert(dofmap1->index_map);
         MPI_Comm comm = mesh->comm();
+        dolfinx::la::SparsityPattern sp(
+            comm, {dofmap1->index_map, dofmap0->index_map},
+            {dofmap1->index_map_bs(), dofmap0->index_map_bs()});
+
+        int tdim = mesh->topology()->dim();
+        auto map = mesh->topology()->index_map(tdim);
+        assert(map);
+        std::vector<std::int32_t> c(map->size_local(), 0);
+        std::iota(c.begin(), c.end(), 0);
+        dolfinx::fem::sparsitybuild::cells(sp, {c, c}, {*dofmap1, *dofmap0});
+        sp.finalize();
+
+        // Build operator
+        Mat A = dolfinx::la::petsc::create_matrix(comm, sp);
+        MatSetOption(A, MAT_IGNORE_ZERO_ENTRIES, PETSC_TRUE);
+        dolfinx::fem::discrete_curl<U, T>(
+            V0, V1, dolfinx::la::petsc::Matrix::set_fn(A, INSERT_VALUES));
+        return A;
+      },
+      nb::rv_policy::take_ownership, nb::arg("V0"), nb::arg("V1"));
+
+  m.def(
+      "discrete_gradient",
+      [](const dolfinx::fem::FunctionSpace<U>& V0,
+         const dolfinx::fem::FunctionSpace<U>& V1)
+      {
+        assert(V0.mesh());
+        auto mesh = V0.mesh();
+        assert(V1.mesh());
+        assert(mesh == V1.mesh());
 
         auto dofmap0 = V0.dofmap();
         assert(dofmap0);
@@ -63,6 +103,7 @@ void declare_petsc_discrete_operators(nb::module_& m)
         // Create and build  sparsity pattern
         assert(dofmap0->index_map);
         assert(dofmap1->index_map);
+        MPI_Comm comm = mesh->comm();
         dolfinx::la::SparsityPattern sp(
             comm, {dofmap1->index_map, dofmap0->index_map},
             {dofmap1->index_map_bs(), dofmap0->index_map_bs()});
@@ -94,7 +135,6 @@ void declare_petsc_discrete_operators(nb::module_& m)
         auto mesh = V0.mesh();
         assert(V1.mesh());
         assert(mesh == V1.mesh());
-        MPI_Comm comm = mesh->comm();
 
         auto dofmap0 = V0.dofmap();
         assert(dofmap0);
@@ -104,6 +144,7 @@ void declare_petsc_discrete_operators(nb::module_& m)
         // Create and build  sparsity pattern
         assert(dofmap0->index_map);
         assert(dofmap1->index_map);
+        MPI_Comm comm = mesh->comm();
         dolfinx::la::SparsityPattern sp(
             comm, {dofmap1->index_map, dofmap0->index_map},
             {dofmap1->index_map_bs(), dofmap0->index_map_bs()});
diff --git a/python/test/unit/fem/test_discrete_operators.py b/python/test/unit/fem/test_discrete_operators.py
index eab5d82dd26..dfc20564bdb 100644
--- a/python/test/unit/fem/test_discrete_operators.py
+++ b/python/test/unit/fem/test_discrete_operators.py
@@ -1,20 +1,19 @@
-# Copyright (C) 2015-2022 Garth N. Wells, Jørgen S. Dokken
+# Copyright (C) 2015-2025 Garth N. Wells and Jørgen S. Dokken
 #
 # This file is part of DOLFINx (https://www.fenicsproject.org)
 #
 # SPDX-License-Identifier:    LGPL-3.0-or-later
-"""Unit tests for the DiscreteOperator class"""
+"""Unit tests for the discrete operators."""
 
 from mpi4py import MPI
 
 import numpy as np
 import pytest
-import scipy
 
 import dolfinx.la
 import ufl
 from basix.ufl import element
-from dolfinx.fem import Expression, Function, discrete_gradient, functionspace
+from dolfinx.fem import Expression, Function, discrete_curl, discrete_gradient, functionspace
 from dolfinx.mesh import CellType, GhostMode, create_unit_cube, create_unit_square
 
 
@@ -35,9 +34,10 @@ def test_gradient(mesh):
     """Test discrete gradient computation for lowest order elements."""
     V = functionspace(mesh, ("Lagrange", 1))
     W = functionspace(mesh, ("Nedelec 1st kind H(curl)", 1))
+
+    # N.B. do not scatter_rev G - doing so would transfer rows to other
+    # processes where they will be summed to give an incorrect matrix
     G = discrete_gradient(V, W)
-    # N.B. do not scatter_rev G - doing so would transfer rows to other processes
-    # where they will be summed to give an incorrect matrix
 
     num_edges = mesh.topology.index_map(1).size_global
     m, n = G.index_map(0).size_global, G.index_map(1).size_global
@@ -46,6 +46,114 @@ def test_gradient(mesh):
     assert np.isclose(G.squared_norm(), 2.0 * num_edges)
 
 
+@pytest.mark.parametrize("cell", [CellType.triangle, CellType.quadrilateral])
+def test_discrete_curl_gdim_raises(cell):
+    """Test that discrete curl function raises for gdim != 3."""
+    msh = create_unit_square(MPI.COMM_WORLD, 3, 3, cell_type=cell, dtype=np.float64)
+    E0 = element("N1curl", msh.basix_cell(), 2, dtype=np.float64)
+    E1 = element("RT", msh.basix_cell(), 2, dtype=np.float64)
+    V0, V1 = functionspace(msh, E0), functionspace(msh, E1)
+    with pytest.raises(RuntimeError):
+        discrete_curl(V0, V1)
+
+
+@pytest.mark.parametrize(
+    "elements",
+    [
+        (
+            element("Lagrange", "tetrahedron", 2, shape=(3,), dtype=np.float64),
+            element("Lagrange", "tetrahedron", 2, shape=(3,), dtype=np.float64),
+        ),
+        (
+            element("N1curl", "tetrahedron", 2, dtype=np.float64),
+            element("N1curl", "tetrahedron", 2, dtype=np.float64),
+        ),
+        (
+            element("RT", "tetrahedron", 2, dtype=np.float64),
+            element("N1curl", "tetrahedron", 2, dtype=np.float64),
+        ),
+        (
+            element("RT", "tetrahedron", 2, dtype=np.float64),
+            element("RT", "tetrahedron", 2, dtype=np.float64),
+        ),
+    ],
+)
+def test_discrete_curl_map_raises(elements):
+    """Test that discrete curl function raises for incorrect spaces."""
+    msh = create_unit_cube(
+        MPI.COMM_WORLD, 3, 3, 3, cell_type=CellType.tetrahedron, dtype=np.float64
+    )
+    V0, V1 = functionspace(msh, elements[0]), functionspace(msh, elements[1])
+    with pytest.raises(RuntimeError):
+        discrete_curl(V0, V1)
+
+
+@pytest.mark.parametrize("dtype", [np.float32, np.complex64, np.float64, np.complex128])
+@pytest.mark.parametrize("p", range(2, 5))
+@pytest.mark.parametrize(
+    "element_data",
+    [
+        (CellType.tetrahedron, "Nedelec 1st kind H(curl)", "Raviart-Thomas"),
+        (CellType.hexahedron, "Nedelec 1st kind H(curl)", "Raviart-Thomas"),
+    ],
+)
+def test_discrete_curl(element_data, p, dtype):
+    """Compute discrete curl operator, with verification using Expression."""
+    xdtype = dtype(0).real.dtype
+
+    celltype, E0, E1 = element_data
+    N = 3
+    msh = create_unit_cube(
+        MPI.COMM_WORLD,
+        N,
+        N // 2,
+        2 * N,
+        ghost_mode=GhostMode.none,
+        cell_type=celltype,
+        dtype=xdtype,
+    )
+
+    # Perturb mesh (making hexahedral cells no longer affine) in serial.
+    # Do not perturb in parallel - would make mesh con-conforming.
+    rng = np.random.default_rng(0)
+    delta_x = 1 / (2 * N - 1) if MPI.COMM_WORLD.size == 1 else 0
+    msh.geometry.x[:] = msh.geometry.x + 0.2 * delta_x * (rng.random(msh.geometry.x.shape) - 0.5)
+
+    V0 = functionspace(msh, (E0, p))
+    V1 = functionspace(msh, (E1, p - 1))
+
+    u0 = Function(V0, dtype=dtype)
+    u0.interpolate(
+        lambda x: np.vstack(
+            (
+                x[1] ** 4 + 3 * x[2] ** 2 + (x[1] * x[2]) ** 3,
+                3 * x[0] ** 4 + 3 * x[2] ** 2,
+                x[0] ** 3 + x[1] ** 4,
+            )
+        )
+    )
+
+    # Create discrete curl operator and get local part of G (including
+    # ghost rows) as a SciPy sparse matrix
+    # Note: do not 'assemble' (scatter_rev) G. This would wrongly
+    # accumulate data for ghost entries.
+    G = discrete_curl(V0, V1)
+    Glocal = G.to_scipy(ghosted=True)
+
+    # Apply discrete curl operator to the u0 vector
+    u1 = Function(V1, dtype=dtype)
+    x0 = u0.x.array
+    u1.x.array[:] = Glocal[:, : x0.shape[0]] @ x0
+
+    # Interpolate curl of u0 using Expression
+    curl_u = Expression(ufl.curl(u0), V1.element.interpolation_points, dtype=dtype)
+    u1_expr = Function(V1, dtype=dtype)
+    u1_expr.interpolate(curl_u)
+
+    atol = 1000 * np.finfo(dtype).resolution
+    assert np.allclose(u1_expr.x.array, u1.x.array, atol=atol)
+
+
 @pytest.mark.parametrize("p", range(1, 4))
 @pytest.mark.parametrize("q", range(1, 4))
 @pytest.mark.parametrize(
@@ -177,20 +285,17 @@ def test_gradient_interpolation(cell_type, p, q):
     w = Function(W, dtype=dtype)
 
     # Get the local part of G (no ghost rows)
-    nrlocal = G.index_map(0).size_local
-    nnzlocal = G.indptr[nrlocal]
-    Glocal = scipy.sparse.csr_matrix(
-        (G.data[:nnzlocal], G.indices[:nnzlocal], G.indptr[: nrlocal + 1])
-    )
+    Glocal = G.to_scipy(ghosted=False)
 
     # MatVec
-    w.x.array[:nrlocal] = Glocal @ u.x.array
+    w.x.array[: Glocal.shape[0]] = Glocal @ u.x.array
     w.x.scatter_forward()
 
     atol = 1000 * np.finfo(dtype).resolution
     assert np.allclose(w_expr.x.array, w.x.array, atol=atol)
 
 
+@pytest.mark.parametrize("dtype", [np.float32, np.float64])
 @pytest.mark.parametrize("p", range(1, 4))
 @pytest.mark.parametrize("q", range(1, 4))
 @pytest.mark.parametrize("from_lagrange", [True, False])
@@ -198,32 +303,38 @@ def test_gradient_interpolation(cell_type, p, q):
     "cell_type",
     [CellType.quadrilateral, CellType.triangle, CellType.tetrahedron, CellType.hexahedron],
 )
-def test_interpolation_matrix(cell_type, p, q, from_lagrange):
+def test_interpolation_matrix(dtype, cell_type, p, q, from_lagrange):
     """Test that discrete interpolation matrix yields the same result as interpolation."""
-    from dolfinx import default_real_type
     from dolfinx.fem import interpolation_matrix
 
     comm = MPI.COMM_WORLD
     if cell_type == CellType.triangle:
-        mesh = create_unit_square(comm, 7, 5, ghost_mode=GhostMode.none, cell_type=cell_type)
+        mesh = create_unit_square(
+            comm, 7, 5, ghost_mode=GhostMode.none, cell_type=cell_type, dtype=dtype
+        )
         lagrange = "Lagrange" if from_lagrange else "DG"
         nedelec = "Nedelec 1st kind H(curl)"
     elif cell_type == CellType.quadrilateral:
-        mesh = create_unit_square(comm, 11, 6, ghost_mode=GhostMode.none, cell_type=cell_type)
+        mesh = create_unit_square(
+            comm, 11, 6, ghost_mode=GhostMode.none, cell_type=cell_type, dtype=dtype
+        )
         lagrange = "Q" if from_lagrange else "DQ"
         nedelec = "RTCE"
     elif cell_type == CellType.hexahedron:
-        mesh = create_unit_cube(comm, 3, 2, 1, ghost_mode=GhostMode.none, cell_type=cell_type)
+        mesh = create_unit_cube(
+            comm, 3, 2, 1, ghost_mode=GhostMode.none, cell_type=cell_type, dtype=dtype
+        )
         lagrange = "Q" if from_lagrange else "DQ"
         nedelec = "NCE"
     elif cell_type == CellType.tetrahedron:
-        mesh = create_unit_cube(comm, 3, 2, 2, ghost_mode=GhostMode.none, cell_type=cell_type)
+        mesh = create_unit_cube(
+            comm, 3, 2, 2, ghost_mode=GhostMode.none, cell_type=cell_type, dtype=dtype
+        )
         lagrange = "Lagrange" if from_lagrange else "DG"
         nedelec = "Nedelec 1st kind H(curl)"
-    v_el = element(
-        lagrange, mesh.basix_cell(), p, shape=(mesh.geometry.dim,), dtype=default_real_type
-    )
-    s_el = element(nedelec, mesh.basix_cell(), q, dtype=default_real_type)
+
+    v_el = element(lagrange, mesh.basix_cell(), p, shape=(mesh.geometry.dim,), dtype=dtype)
+    s_el = element(nedelec, mesh.basix_cell(), q, dtype=dtype)
     if from_lagrange:
         el0 = v_el
         el1 = s_el
@@ -231,12 +342,8 @@ def test_interpolation_matrix(cell_type, p, q, from_lagrange):
         el0 = s_el
         el1 = v_el
 
-    V = functionspace(mesh, el0)
-    W = functionspace(mesh, el1)
-    G = interpolation_matrix(V, W)
-
-    uvec = dolfinx.la.vector(G.index_map(1), bs=G.block_size[1])
-    u = Function(V, uvec)
+    V, W = functionspace(mesh, el0), functionspace(mesh, el1)
+    G = interpolation_matrix(V, W).to_scipy()
 
     def f(x):
         if mesh.geometry.dim == 2:
@@ -244,16 +351,17 @@ def f(x):
         else:
             return (x[0] ** p, x[2] ** p, x[1] ** p)
 
+    u = Function(V, dtype=dtype)
     u.interpolate(f)
-    w_vec = Function(W)
+    w_vec = Function(W, dtype=dtype)
     w_vec.interpolate(u)
 
     # Compute global matrix vector product
-    w = Function(W)
-    w.x.array[:] = 0
-    G.mult(u.x, w.x)
+    w = Function(W, dtype=dtype)
+    ux = np.zeros(G.shape[1])
+    ux[: len(u.x.array)] = u.x.array[:]
+    w.x.array[: G.shape[0]] = G @ ux
     w.x.scatter_forward()
 
-    atol = 100 * np.finfo(default_real_type).resolution
-
+    atol = 100 * np.finfo(dtype).resolution
     assert np.allclose(w_vec.x.array, w.x.array, atol=atol)

From d407cbee50c1867e07d0879337f19f9a5121242c Mon Sep 17 00:00:00 2001
From: "Garth N. Wells" <gnw20@cam.ac.uk>
Date: Sat, 11 Jan 2025 22:57:04 +0000
Subject: [PATCH 5/9] Version bumps (#3592)

---
 docker/Dockerfile.test-env | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/docker/Dockerfile.test-env b/docker/Dockerfile.test-env
index 1d9147950ed..b973d45bc3d 100644
--- a/docker/Dockerfile.test-env
+++ b/docker/Dockerfile.test-env
@@ -13,21 +13,21 @@
 #
 
 ARG ADIOS2_VERSION=2.10.2
-ARG DOXYGEN_VERSION=1_12_0
+ARG DOXYGEN_VERSION=1_13_1
 ARG GMSH_VERSION=4_13_1
 ARG HDF5_VERSION=1.14.5
-ARG KAHIP_VERSION=3.16
+ARG KAHIP_VERSION=3.17
 # NOTE: The NumPy version (https://pypi.org/project/numpy/#history)
 # should be pinned to the most recent NumPy release that is supported by
 # the most recent Numba release, see
 # https://numba.readthedocs.io/en/stable/user/installing.html#version-support-information
 ARG NUMPY_VERSION=2.0.2
-ARG PETSC_VERSION=3.22.1
-ARG SLEPC_VERSION=3.22.1
+ARG PETSC_VERSION=3.22.2
+ARG SLEPC_VERSION=3.22.2
 
 ARG MPICH_VERSION=4.2.3
 ARG OPENMPI_SERIES=5.0
-ARG OPENMPI_PATCH=5
+ARG OPENMPI_PATCH=6
 
 ########################################
 

From f045c42cedab63de6142a7f1001c886ad30063c1 Mon Sep 17 00:00:00 2001
From: "Garth N. Wells" <gnw20@cam.ac.uk>
Date: Mon, 13 Jan 2025 08:14:25 +0000
Subject: [PATCH 6/9] Tidy up spare matrix code (#3596)

---
 cpp/dolfinx/la/MatrixCSR.h       | 223 +++++++++++++++----------------
 cpp/dolfinx/la/matrix_csr_impl.h | 168 +++++++++++------------
 2 files changed, 187 insertions(+), 204 deletions(-)

diff --git a/cpp/dolfinx/la/MatrixCSR.h b/cpp/dolfinx/la/MatrixCSR.h
index c15e7248dd5..39d8dec1d76 100644
--- a/cpp/dolfinx/la/MatrixCSR.h
+++ b/cpp/dolfinx/la/MatrixCSR.h
@@ -289,7 +289,31 @@ class MatrixCSR
   /// expanded.
   /// @return Dense copy of the part of the matrix on the calling rank.
   /// Storage is row-major.
-  std::vector<value_type> to_dense() const;
+  std::vector<value_type> to_dense() const
+  {
+    const std::size_t nrows = num_all_rows();
+    const std::size_t ncols = _index_maps[1]->size_global();
+    std::vector<value_type> A(nrows * ncols * _bs[0] * _bs[1], 0.0);
+    for (std::size_t r = 0; r < nrows; ++r)
+    {
+      for (std::int32_t j = _row_ptr[r]; j < _row_ptr[r + 1]; ++j)
+      {
+        for (int i0 = 0; i0 < _bs[0]; ++i0)
+        {
+          for (int i1 = 0; i1 < _bs[1]; ++i1)
+          {
+            std::array<std::int32_t, 1> local_col{_cols[j]};
+            std::array<std::int64_t, 1> global_col{0};
+            _index_maps[1]->local_to_global(local_col, global_col);
+            A[(r * _bs[1] + i0) * ncols * _bs[0] + global_col[0] * _bs[1] + i1]
+                = _data[j * _bs[0] * _bs[1] + i0 * _bs[1] + i1];
+          }
+        }
+      }
+    }
+
+    return A;
+  }
 
   /// @brief Transfer ghost row data to the owning ranks accumulating
   /// received values on the owned rows, and zeroing any existing data
@@ -310,18 +334,93 @@ class MatrixCSR
   /// must not be changed.
   /// @note This function does not change the matrix data. Data update
   /// only occurs with `scatter_rev_end()`.
-  void scatter_rev_begin();
+  void scatter_rev_begin()
+  {
+    const std::int32_t local_size0 = _index_maps[0]->size_local();
+    const std::int32_t num_ghosts0 = _index_maps[0]->num_ghosts();
+    const int bs2 = _bs[0] * _bs[1];
+
+    // For each ghost row, pack and send values to send to neighborhood
+    std::vector<int> insert_pos = _val_send_disp;
+    _ghost_value_data.resize(_val_send_disp.back());
+    for (int i = 0; i < num_ghosts0; ++i)
+    {
+      const int rank = _ghost_row_to_rank[i];
+
+      // Get position in send buffer to place data to send to this
+      // neighbour
+      const std::int32_t val_pos = insert_pos[rank];
+      std::copy(std::next(_data.data(), _row_ptr[local_size0 + i] * bs2),
+                std::next(_data.data(), _row_ptr[local_size0 + i + 1] * bs2),
+                std::next(_ghost_value_data.begin(), val_pos));
+      insert_pos[rank]
+          += bs2 * (_row_ptr[local_size0 + i + 1] - _row_ptr[local_size0 + i]);
+    }
+
+    _ghost_value_data_in.resize(_val_recv_disp.back());
+
+    // Compute data sizes for send and receive from displacements
+    std::vector<int> val_send_count(_val_send_disp.size() - 1);
+    std::adjacent_difference(std::next(_val_send_disp.begin()),
+                             _val_send_disp.end(), val_send_count.begin());
+
+    std::vector<int> val_recv_count(_val_recv_disp.size() - 1);
+    std::adjacent_difference(std::next(_val_recv_disp.begin()),
+                             _val_recv_disp.end(), val_recv_count.begin());
+
+    int status = MPI_Ineighbor_alltoallv(
+        _ghost_value_data.data(), val_send_count.data(), _val_send_disp.data(),
+        dolfinx::MPI::mpi_t<value_type>, _ghost_value_data_in.data(),
+        val_recv_count.data(), _val_recv_disp.data(),
+        dolfinx::MPI::mpi_t<value_type>, _comm.comm(), &_request);
+    assert(status == MPI_SUCCESS);
+  }
 
   /// @brief End transfer of ghost row data to owning ranks.
   /// @note Must be preceded by MatrixCSR::scatter_rev_begin().
   /// @note Matrix data received from other processes will be
   /// accumulated into locally owned rows, and ghost rows will be
   /// zeroed.
-  void scatter_rev_end();
+  void scatter_rev_end()
+  {
+    int status = MPI_Wait(&_request, MPI_STATUS_IGNORE);
+    assert(status == MPI_SUCCESS);
+
+    _ghost_value_data.clear();
+    _ghost_value_data.shrink_to_fit();
+
+    // Add to local rows
+    const int bs2 = _bs[0] * _bs[1];
+    assert(_ghost_value_data_in.size() == _unpack_pos.size() * bs2);
+    for (std::size_t i = 0; i < _unpack_pos.size(); ++i)
+      for (int j = 0; j < bs2; ++j)
+        _data[_unpack_pos[i] * bs2 + j] += _ghost_value_data_in[i * bs2 + j];
+
+    _ghost_value_data_in.clear();
+    _ghost_value_data_in.shrink_to_fit();
+
+    // Set ghost row data to zero
+    const std::int32_t local_size0 = _index_maps[0]->size_local();
+    std::fill(std::next(_data.begin(), _row_ptr[local_size0] * bs2),
+              _data.end(), 0);
+  }
 
   /// @brief Compute the Frobenius norm squared across all processes.
   /// @note MPI Collective
-  double squared_norm() const;
+  double squared_norm() const
+  {
+    const std::size_t num_owned_rows = _index_maps[0]->size_local();
+    const int bs2 = _bs[0] * _bs[1];
+    assert(num_owned_rows < _row_ptr.size());
+    double norm_sq_local = std::accumulate(
+        _data.cbegin(),
+        std::next(_data.cbegin(), _row_ptr[num_owned_rows] * bs2), double(0),
+        [](auto norm, value_type y) { return norm + std::norm(y); });
+    double norm_sq;
+    MPI_Allreduce(&norm_sq_local, &norm_sq, 1, MPI_DOUBLE, MPI_SUM,
+                  _comm.comm());
+    return norm_sq;
+  }
 
   /// @brief Compute the product `y += Ax`.
   ///
@@ -434,7 +533,7 @@ MatrixCSR<U, V, W, X>::MatrixCSR(const SparsityPattern& p, BlockMode mode)
     for (int i = 0; i < 2; ++i)
     {
       const auto im = _index_maps[i];
-      const int size_local = im->size_local() * _bs[i];
+      const std::int32_t size_local = im->size_local() * _bs[i];
       std::span ghost_i = im->ghosts();
       std::vector<std::int64_t> ghosts;
       const std::vector<int> ghost_owner_i(im->owners().begin(),
@@ -448,7 +547,8 @@ MatrixCSR<U, V, W, X>::MatrixCSR(const SparsityPattern& p, BlockMode mode)
           src_rank.push_back(ghost_owner_i[j]);
         }
       }
-      const std::array<std::vector<int>, 2> src_dest0
+
+      std::array<std::vector<int>, 2> src_dest0
           = {std::vector(_index_maps[i]->src().begin(),
                          _index_maps[i]->src().end()),
              std::vector(_index_maps[i]->dest().begin(),
@@ -520,7 +620,7 @@ MatrixCSR<U, V, W, X>::MatrixCSR(const SparsityPattern& p, BlockMode mode)
   {
     auto it = std::ranges::lower_bound(src_ranks, r);
     assert(it != src_ranks.end() and *it == r);
-    int pos = std::distance(src_ranks.begin(), it);
+    std::size_t pos = std::distance(src_ranks.begin(), it);
     _ghost_row_to_rank.push_back(pos);
   }
 
@@ -545,7 +645,7 @@ MatrixCSR<U, V, W, X>::MatrixCSR(const SparsityPattern& p, BlockMode mode)
     for (std::size_t i = 0; i < _ghost_row_to_rank.size(); ++i)
     {
       const int rank = _ghost_row_to_rank[i];
-      int row_id = local_size[0] + i;
+      std::int32_t row_id = local_size[0] + i;
       for (int j = _row_ptr[row_id]; j < _row_ptr[row_id + 1]; ++j)
       {
         // Get position in send buffer
@@ -639,110 +739,6 @@ MatrixCSR<U, V, W, X>::MatrixCSR(const SparsityPattern& p, BlockMode mode)
   }
 }
 //-----------------------------------------------------------------------------
-template <typename U, typename V, typename W, typename X>
-std::vector<typename MatrixCSR<U, V, W, X>::value_type>
-MatrixCSR<U, V, W, X>::to_dense() const
-{
-  const std::size_t nrows = num_all_rows();
-  const std::size_t ncols = _index_maps[1]->size_global();
-  std::vector<value_type> A(nrows * ncols * _bs[0] * _bs[1], 0.0);
-  for (std::size_t r = 0; r < nrows; ++r)
-    for (std::int32_t j = _row_ptr[r]; j < _row_ptr[r + 1]; ++j)
-      for (int i0 = 0; i0 < _bs[0]; ++i0)
-        for (int i1 = 0; i1 < _bs[1]; ++i1)
-        {
-          std::array<std::int32_t, 1> local_col{_cols[j]};
-          std::array<std::int64_t, 1> global_col{0};
-          _index_maps[1]->local_to_global(local_col, global_col);
-          A[(r * _bs[1] + i0) * ncols * _bs[0] + global_col[0] * _bs[1] + i1]
-              = _data[j * _bs[0] * _bs[1] + i0 * _bs[1] + i1];
-        }
-
-  return A;
-}
-//-----------------------------------------------------------------------------
-template <typename U, typename V, typename W, typename X>
-void MatrixCSR<U, V, W, X>::scatter_rev_begin()
-{
-  const std::int32_t local_size0 = _index_maps[0]->size_local();
-  const std::int32_t num_ghosts0 = _index_maps[0]->num_ghosts();
-  const int bs2 = _bs[0] * _bs[1];
-
-  // For each ghost row, pack and send values to send to neighborhood
-  std::vector<int> insert_pos = _val_send_disp;
-  _ghost_value_data.resize(_val_send_disp.back());
-  for (int i = 0; i < num_ghosts0; ++i)
-  {
-    const int rank = _ghost_row_to_rank[i];
-
-    // Get position in send buffer to place data to send to this
-    // neighbour
-    const std::int32_t val_pos = insert_pos[rank];
-    std::copy(std::next(_data.data(), _row_ptr[local_size0 + i] * bs2),
-              std::next(_data.data(), _row_ptr[local_size0 + i + 1] * bs2),
-              std::next(_ghost_value_data.begin(), val_pos));
-    insert_pos[rank]
-        += bs2 * (_row_ptr[local_size0 + i + 1] - _row_ptr[local_size0 + i]);
-  }
-
-  _ghost_value_data_in.resize(_val_recv_disp.back());
-
-  // Compute data sizes for send and receive from displacements
-  std::vector<int> val_send_count(_val_send_disp.size() - 1);
-  std::adjacent_difference(std::next(_val_send_disp.begin()),
-                           _val_send_disp.end(), val_send_count.begin());
-
-  std::vector<int> val_recv_count(_val_recv_disp.size() - 1);
-  std::adjacent_difference(std::next(_val_recv_disp.begin()),
-                           _val_recv_disp.end(), val_recv_count.begin());
-
-  int status = MPI_Ineighbor_alltoallv(
-      _ghost_value_data.data(), val_send_count.data(), _val_send_disp.data(),
-      dolfinx::MPI::mpi_t<value_type>, _ghost_value_data_in.data(),
-      val_recv_count.data(), _val_recv_disp.data(),
-      dolfinx::MPI::mpi_t<value_type>, _comm.comm(), &_request);
-  assert(status == MPI_SUCCESS);
-}
-//-----------------------------------------------------------------------------
-template <typename U, typename V, typename W, typename X>
-void MatrixCSR<U, V, W, X>::scatter_rev_end()
-{
-  int status = MPI_Wait(&_request, MPI_STATUS_IGNORE);
-  assert(status == MPI_SUCCESS);
-
-  _ghost_value_data.clear();
-  _ghost_value_data.shrink_to_fit();
-
-  // Add to local rows
-  const int bs2 = _bs[0] * _bs[1];
-  assert(_ghost_value_data_in.size() == _unpack_pos.size() * bs2);
-  for (std::size_t i = 0; i < _unpack_pos.size(); ++i)
-    for (int j = 0; j < bs2; ++j)
-      _data[_unpack_pos[i] * bs2 + j] += _ghost_value_data_in[i * bs2 + j];
-
-  _ghost_value_data_in.clear();
-  _ghost_value_data_in.shrink_to_fit();
-
-  // Set ghost row data to zero
-  const std::int32_t local_size0 = _index_maps[0]->size_local();
-  std::fill(std::next(_data.begin(), _row_ptr[local_size0] * bs2), _data.end(),
-            0);
-}
-//-----------------------------------------------------------------------------
-template <typename U, typename V, typename W, typename X>
-double MatrixCSR<U, V, W, X>::squared_norm() const
-{
-  const std::size_t num_owned_rows = _index_maps[0]->size_local();
-  const int bs2 = _bs[0] * _bs[1];
-  assert(num_owned_rows < _row_ptr.size());
-  double norm_sq_local = std::accumulate(
-      _data.cbegin(), std::next(_data.cbegin(), _row_ptr[num_owned_rows] * bs2),
-      double(0), [](auto norm, value_type y) { return norm + std::norm(y); });
-  double norm_sq;
-  MPI_Allreduce(&norm_sq_local, &norm_sq, 1, MPI_DOUBLE, MPI_SUM, _comm.comm());
-  return norm_sq;
-}
-//-----------------------------------------------------------------------------
 
 // The matrix A is distributed across P  processes by blocks of rows:
 //  A = |   A_0  |
@@ -766,7 +762,8 @@ double MatrixCSR<U, V, W, X>::squared_norm() const
 //  y[0] = |Ai[0] Ai[1]| |   x[0]  | = Ai[0] x[0] + Ai[1] x[1]
 //                       |   x[1]  |
 //
-/// Computes y += A*x for a parallel CSR matrix A and parallel dense vectors x,y
+/// Computes y += A*x for a parallel CSR matrix A and parallel dense vectors
+/// x,y
 template <typename Scalar, typename V, typename W, typename X>
 void MatrixCSR<Scalar, V, W, X>::mult(la::Vector<Scalar>& x,
                                       la::Vector<Scalar>& y)
diff --git a/cpp/dolfinx/la/matrix_csr_impl.h b/cpp/dolfinx/la/matrix_csr_impl.h
index 8a218a130e4..a7a918d177a 100644
--- a/cpp/dolfinx/la/matrix_csr_impl.h
+++ b/cpp/dolfinx/la/matrix_csr_impl.h
@@ -6,7 +6,6 @@
 
 #pragma once
 
-#include <iostream>
 #include <numeric>
 #include <span>
 #include <utility>
@@ -16,28 +15,28 @@ namespace dolfinx::la
 {
 namespace impl
 {
-/// @brief Incorporate data into a CSR matrix
+/// @brief Incorporate data into a CSR matrix.
 ///
-/// @tparam BS0 Row block size (of both matrix and data)
-/// @tparam BS1 Column block size (of both matrix and data)
-/// @tparam OP The operation (usually "set" or "add")
-/// @param[out] data The CSR matrix data
-/// @param[in] cols The CSR column indices
-/// @param[in] row_ptr The pointer to the ith row in the CSR data
+/// @tparam BS0 Row block size (of both matrix and data).
+/// @tparam BS1 Column block size (of both matrix and data).
+/// @tparam OP The operation (usually "set" or "add").
+/// @param[out] data CSR matrix data.
+/// @param[in] cols CSR column indices.
+/// @param[in] row_ptr Pointer to the ith row in the CSR data.
 /// @param[in] x The `m` by `n` dense block of values (row-major) to add
-/// to the matrix
-/// @param[in] xrows The row indices of `x`
-/// @param[in] xcols The column indices of `x`
-/// @param[in] op The operation (set or add)
-/// @param[in] num_rows The maximum row index that can be set. Used
-/// when debugging to check that rows beyond a permitted range
-/// are not being set.
+/// to the matrix.
+/// @param[in] xrows Row indices of `x`.
+/// @param[in] xcols Column indices of `x`.
+/// @param[in] op The operation (set or add),
+/// @param[in] num_rows Maximum row index that can be set. Used when
+/// debugging to check that rows beyond a permitted range are not being
+/// set.
 ///
 /// @note In the case of block data, where BS0 or BS1 are greater than
-/// one, the layout of the input data is still the same. For example, the
-/// following can be inserted into the top-left corner
-/// with xrows={0,1} and xcols={0,1}, BS0=2, BS1=2 and
-/// x={0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15}.
+/// one, the layout of the input data is still the same. For example,
+/// the following can be inserted into the top-left corner with
+/// `xrows={0,1}` and `xcols={0,1}`, `BS0=2`, `BS1=2` and
+/// `x={0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15}`.
 ///
 /// 0  1  | 2  3
 /// 4  5  | 6  7
@@ -45,62 +44,6 @@ namespace impl
 /// 8  9  | 10 11
 /// 12 13 | 14 15
 ///
-template <int BS0, int BS1, typename OP, typename U, typename V, typename W,
-          typename X, typename Y>
-void insert_csr(U&& data, const V& cols, const W& row_ptr, const X& x,
-                const Y& xrows, const Y& xcols, OP op,
-                typename Y::value_type num_rows);
-
-/// @brief Incorporate blocked data with given block sizes into a non-blocked
-/// MatrixCSR
-/// @note Matrix block size (bs=1). Matrix sparsity must be correct to accept
-/// the data.
-/// @note see `insert_csr` for data layout
-///
-/// @tparam BS0 Row block size of Data
-/// @tparam BS1 Column block size of Data
-/// @tparam OP The operation (usually "set" or "add")
-/// @param[out] data The CSR matrix data
-/// @param[in] cols The CSR column indices
-/// @param[in] row_ptr The pointer to the ith row in the CSR data
-/// @param[in] x The `m` by `n` dense block of values (row-major) to add
-/// to the matrix
-/// @param[in] xrows The row indices of `x`
-/// @param[in] xcols The column indices of `x`
-/// @param[in] op The operation (set or add)
-/// @param[in] num_rows The maximum row index that can be set. Used
-/// when debugging to check that rows beyond a permitted range
-/// are not being set.
-template <int BS0, int BS1, typename OP, typename U, typename V, typename W,
-          typename X, typename Y>
-void insert_blocked_csr(U&& data, const V& cols, const W& row_ptr, const X& x,
-                        const Y& xrows, const Y& xcols, OP op,
-                        typename Y::value_type num_rows);
-
-/// @brief Incorporate non-blocked data into a blocked matrix (Data block size =
-/// 1)
-/// @note Matrix sparsity must be correct to accept the data
-/// @note see `insert_csr` for data layout
-/// @param[out] data The CSR matrix data
-/// @param[in] cols The CSR column indices
-/// @param[in] row_ptr The pointer to the ith row in the CSR data
-/// @param[in] x The `m` by `n` dense block of values (row-major) to add
-/// to the matrix
-/// @param[in] xrows The row indices of `x`
-/// @param[in] xcols The column indices of `x`
-/// @param[in] op The operation (set or add)
-/// @param[in] num_rows The maximum row index that can be set. Used
-/// when debugging to check that rows beyond a permitted range
-/// are not being set.
-/// @param[in] bs0 Row block size of Matrix
-/// @param[in] bs1 Column block size of Matrix
-template <typename OP, typename U, typename V, typename W, typename X,
-          typename Y>
-void insert_nonblocked_csr(U&& data, const V& cols, const W& row_ptr,
-                           const X& x, const Y& xrows, const Y& xcols, OP op,
-                           typename Y::value_type num_rows, int bs0, int bs1);
-
-//-----------------------------------------------------------------------------
 template <int BS0, int BS1, typename OP, typename U, typename V, typename W,
           typename X, typename Y>
 void insert_csr(U&& data, const V& cols, const W& row_ptr, const X& x,
@@ -127,13 +70,12 @@ void insert_csr(U&& data, const V& cols, const W& row_ptr, const X& x,
     {
       // Find position of column index
       auto it = std::lower_bound(cit0, cit1, xcols[c]);
-
       if (it == cit1 or *it != xcols[c])
         throw std::runtime_error("Entry not in sparsity");
 
       std::size_t d = std::distance(cols.begin(), it);
-      int di = d * BS0 * BS1;
-      int xi = c * BS1;
+      std::size_t di = d * BS0 * BS1;
+      std::size_t xi = c * BS1;
       assert(di < data.size());
       for (int i = 0; i < BS0; ++i)
       {
@@ -145,8 +87,28 @@ void insert_csr(U&& data, const V& cols, const W& row_ptr, const X& x,
     }
   }
 }
-//-----------------------------------------------------------------------------
-// Insert with block insertion into a regular CSR (block size 1)
+
+/// @brief Incorporate blocked data with given block sizes into a
+/// non-blocked MatrixCSR.
+///
+/// @note Matrix block size (bs=1). Matrix sparsity must be correct to
+/// accept the data.
+/// @note See ::insert_csr for data layout.
+///
+/// @tparam BS0 Row block size of data.
+/// @tparam BS1 Column block size of data.
+/// @tparam OP The operation (usually "set" or "add").
+/// @param[out] data CSR matrix data.
+/// @param[in] cols CSR column indices.
+/// @param[in] row_ptr Pointer to the ith row in the CSR data.
+/// @param[in] x The `m` by `n` dense block of values (row-major) to add
+/// to the matrix.
+/// @param[in] xrows Row indices of `x`.
+/// @param[in] xcols Column indices of `x`.
+/// @param[in] op The operation (set or add).
+/// @param[in] num_rows Maximum row index that can be set. Used when
+/// debugging to check that rows beyond a permitted range are not being
+/// set.
 template <int BS0, int BS1, typename OP, typename U, typename V, typename W,
           typename X, typename Y>
 void insert_blocked_csr(U&& data, const V& cols, const W& row_ptr, const X& x,
@@ -159,7 +121,6 @@ void insert_blocked_csr(U&& data, const V& cols, const W& row_ptr, const X& x,
   {
     // Row index and current data row
     auto row = xrows[r] * BS0;
-
 #ifndef NDEBUG
     if (row >= num_rows)
       throw std::runtime_error("Local row out of range");
@@ -169,6 +130,7 @@ void insert_blocked_csr(U&& data, const V& cols, const W& row_ptr, const X& x,
     {
       using T = typename X::value_type;
       const T* xr = x.data() + (r * BS0 + i) * nc * BS1;
+
       // Columns indices for row
       auto cit0 = std::next(cols.begin(), row_ptr[row + i]);
       auto cit1 = std::next(cols.begin(), row_ptr[row + i + 1]);
@@ -176,27 +138,43 @@ void insert_blocked_csr(U&& data, const V& cols, const W& row_ptr, const X& x,
       {
         // Find position of column index
         auto it = std::lower_bound(cit0, cit1, xcols[c] * BS1);
-
         if (it == cit1 or *it != xcols[c] * BS1)
           throw std::runtime_error("Entry not in sparsity");
 
         std::size_t d = std::distance(cols.begin(), it);
         assert(d < data.size());
-        int xi = c * BS1;
+        std::size_t xi = c * BS1;
         for (int j = 0; j < BS1; ++j)
           op(data[d + j], xr[xi + j]);
       }
     }
   }
 }
-//-----------------------------------------------------------------------------
-// Add individual entries in block-CSR storage
+
+/// @brief Incorporate non-blocked data into a blocked matrix (data
+/// block size=1)
+///
+/// @note Matrix sparsity must be correct to accept the data.
+/// @note See ::insert_csr for data layout.
+///
+/// @param[out] data CSR matrix data.
+/// @param[in] cols CSR column indices.
+/// @param[in] row_ptr Pointer to the ith row in the CSR data.
+/// @param[in] x The `m` by `n` dense block of values (row-major) to add
+/// to the matrix.
+/// @param[in] xrows Rrow indices of `x`.
+/// @param[in] xcols Column indices of `x`.
+/// @param[in] op The operation (set or add).
+/// @param[in] num_rows Maximum row index that can be set. Used when
+/// debugging to check that rows beyond a permitted range are not being
+/// set.
+/// @param[in] bs0 Row block size of matrix.
+/// @param[in] bs1 Column block size of matrix.
 template <typename OP, typename U, typename V, typename W, typename X,
           typename Y>
 void insert_nonblocked_csr(U&& data, const V& cols, const W& row_ptr,
                            const X& x, const Y& xrows, const Y& xcols, OP op,
-                           [[maybe_unused]] typename Y::value_type num_rows,
-                           int bs0, int bs1)
+                           typename Y::value_type num_rows, int bs0, int bs1)
 {
   const std::size_t nc = xcols.size();
   const int nbs = bs0 * bs1;
@@ -221,19 +199,28 @@ void insert_nonblocked_csr(U&& data, const V& cols, const W& row_ptr,
       // Find position of column index
       auto cdiv = std::div(xcols[c], bs1);
       auto it = std::lower_bound(cit0, cit1, cdiv.quot);
-
       if (it == cit1 or *it != cdiv.quot)
         throw std::runtime_error("Entry not in sparsity");
 
       std::size_t d = std::distance(cols.begin(), it);
-      const int di = d * nbs + rdiv.rem * bs1 + cdiv.rem;
+      std::size_t di = d * nbs + rdiv.rem * bs1 + cdiv.rem;
       assert(di < data.size());
       op(data[di], xr[c]);
     }
   }
 }
-//-----------------------------------------------------------------------------
 
+/// @brief  Sparse matrix-vector product implementation.
+/// @tparam T
+/// @tparam BS1
+/// @param values
+/// @param row_begin
+/// @param row_end
+/// @param indices
+/// @param x
+/// @param y
+/// @param bs0
+/// @param bs1
 template <typename T, int BS1>
 void spmv(std::span<const T> values, std::span<const std::int64_t> row_begin,
           std::span<const std::int64_t> row_end,
@@ -241,7 +228,6 @@ void spmv(std::span<const T> values, std::span<const std::int64_t> row_begin,
           std::span<T> y, int bs0, int bs1)
 {
   assert(row_begin.size() == row_end.size());
-
   for (int k0 = 0; k0 < bs0; ++k0)
   {
     for (std::size_t i = 0; i < row_begin.size(); i++)

From 477076efebd381922ed2b9adef1fb6bcda04b9a7 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?J=C3=B8rgen=20Schartum=20Dokken?= <dokken92@gmail.com>
Date: Mon, 13 Jan 2025 15:48:00 +0100
Subject: [PATCH 7/9] Fix mapping of coefficients and constants for form
 independent compilation. (#3597)

* Fix packing of constants and coefficients

* Fix naming

* Add test

* Fix numbering of constants
---
 python/dolfinx/fem/forms.py                   | 35 +++++++--
 .../test_assemble_mesh_independent_form.py    | 73 +++++++++++++++++++
 2 files changed, 101 insertions(+), 7 deletions(-)

diff --git a/python/dolfinx/fem/forms.py b/python/dolfinx/fem/forms.py
index 85ed796d011..335ef212594 100644
--- a/python/dolfinx/fem/forms.py
+++ b/python/dolfinx/fem/forms.py
@@ -508,13 +508,34 @@ def create_form(
     for _, idomain in _subdomain_data.items():
         idomain.sort(key=lambda x: x[0])
 
-    # Extract name of ufl objects and map them to their corresponding C++ object
-    ufl_coefficients = ufl.algorithms.extract_coefficients(form.ufl_form)
-    coefficients = {
-        f"w{ufl_coefficients.index(u)}": uh._cpp_object for (u, uh) in coefficient_map.items()
-    }
-    ufl_constants = ufl.algorithms.analysis.extract_constants(form.ufl_form)
-    constants = {f"c{ufl_constants.index(u)}": uh._cpp_object for (u, uh) in constant_map.items()}
+    # Extract all coefficients of the compiled form in correct order
+    coefficients = {}
+    original_coefficients = ufl.algorithms.extract_coefficients(form.ufl_form)
+    num_coefficients = form.ufcx_form.num_coefficients
+    for c in range(num_coefficients):
+        original_index = form.ufcx_form.original_coefficient_positions[c]
+        original_coeff = original_coefficients[original_index]
+        try:
+            coefficients[f"w{c}"] = coefficient_map[original_coeff]._cpp_object
+        except KeyError:
+            raise RuntimeError(f"Missing coefficient {original_coeff}")
+
+    # Extract all constants of the compiled form in correct order
+    # NOTE: Constants are not eliminated
+    original_constants = ufl.algorithms.analysis.extract_constants(form.ufl_form)
+    num_constants = form.ufcx_form.num_constants
+    if num_constants != len(original_constants):
+        raise RuntimeError(
+            f"Number of constants in compiled form ({num_constants})",
+            f"does not match the original form {len(original_constants)}",
+        )
+    constants = {}
+    for counter, constant in enumerate(original_constants):
+        try:
+            mapped_constant = constant_map[constant]
+            constants[f"c{counter}"] = mapped_constant._cpp_object
+        except KeyError:
+            raise RuntimeError(f"Missing constant {constant}")
 
     ftype = form_cpp_creator(form.dtype)
     f = ftype(
diff --git a/python/test/unit/fem/test_assemble_mesh_independent_form.py b/python/test/unit/fem/test_assemble_mesh_independent_form.py
index ce8895bba22..0c7ed671ca3 100644
--- a/python/test/unit/fem/test_assemble_mesh_independent_form.py
+++ b/python/test/unit/fem/test_assemble_mesh_independent_form.py
@@ -1,3 +1,9 @@
+# Copyright (C) 2024-2025 Jørgen S. Dokken
+#
+# This file is part of DOLFINx (https://www.fenicsproject.org)
+#
+# SPDX-License-Identifier:    LGPL-3.0-or-later
+
 from mpi4py import MPI
 
 import numpy as np
@@ -152,3 +158,70 @@ def g(x):
     # to dolfinx functions and constants
     for i in range(1, 4):
         create_and_integrate(i, compiled_form)
+
+
+@pytest.mark.parametrize(
+    "dtype",
+    [
+        np.float32,
+        np.float64,
+        pytest.param(np.complex64, marks=pytest.mark.xfail_win32_complex),
+        pytest.param(np.complex128, marks=pytest.mark.xfail_win32_complex),
+    ],
+)
+def test_eliminated_data(dtype):
+    """
+    Test that mesh independent compilation handles the re-ordering of coefficients and constants
+    when removed through differentiation
+    """
+
+    cell_name = "triangle"
+    real_type = dtype(0).real.dtype
+    c_el = basix.ufl.element("Lagrange", cell_name, 1, shape=(2,), dtype=real_type)
+    domain = ufl.Mesh(c_el)
+    el = basix.ufl.element("Lagrange", cell_name, 2, dtype=real_type)
+
+    V = ufl.FunctionSpace(domain, el)
+
+    c = ufl.Constant(domain)
+    d = ufl.Constant(domain)
+    u = ufl.Coefficient(V)
+    v = ufl.Coefficient(V)
+
+    J = (c * u**2 + d * v**2) * ufl.dx
+    dv = ufl.conj(ufl.TestFunction(V))
+    L = ufl.derivative(J, v, dv)
+
+    # Compile form using dolfinx.jit.ffcx_jit
+    compiled_form = dolfinx.fem.compile_form(
+        MPI.COMM_WORLD, L, form_compiler_options={"scalar_type": dtype}
+    )
+
+    # Pack discrete data
+    cell_type = dolfinx.mesh.to_type(cell_name)
+    mesh = dolfinx.mesh.create_unit_square(
+        MPI.COMM_WORLD, 5, 2, dtype=real_type, cell_type=cell_type
+    )
+    Vh = dolfinx.fem.functionspace(mesh, el)
+    uh = dolfinx.fem.Function(Vh, dtype=dtype)
+    uh.interpolate(lambda x: x[0])
+    vh = dolfinx.fem.Function(Vh, dtype=dtype)
+    vh.interpolate(lambda x: x[1])
+    dh = dolfinx.fem.Constant(mesh, dtype(3.0))
+    ch = dolfinx.fem.Constant(mesh, dtype(2.0))
+
+    # Assemble discrete vector
+    form = dolfinx.fem.create_form(compiled_form, [Vh], mesh, {}, {u: uh, v: vh}, {c: ch, d: dh})
+    b = dolfinx.fem.assemble_vector(form)
+    b.scatter_reverse(dolfinx.la.InsertMode.add)
+    b.scatter_forward()
+
+    # Compare to reference solution
+    dvh = ufl.conj(ufl.TestFunction(Vh))
+    exact_form = 2 * dh * vh * dvh * ufl.dx
+    b_exact = dolfinx.fem.assemble_vector(dolfinx.fem.form(exact_form, dtype=dtype))
+    b_exact.scatter_reverse(dolfinx.la.InsertMode.add)
+    b_exact.scatter_forward()
+
+    tol = np.finfo(dtype).resolution * 1e3
+    np.testing.assert_allclose(b.array, b_exact.array, atol=tol)

From d12e5123a19eaa9d7ce5a3bf374302f94dcef956 Mon Sep 17 00:00:00 2001
From: Joe Dean <jpd62@cam.ac.uk>
Date: Thu, 16 Jan 2025 07:53:26 +0000
Subject: [PATCH 8/9] More work towards supporting mixed-topology meshes
 (#3590)

* Add very low level example code

* Update to latest dolfinx

* Add comments

* Use vtk permutation

* Call ffcx once

* Remove unused

* Fix some mypy moaning

* Exit in parallel

* Work on create_mesh [skip ci]

* push back spans [skip ci]

* add test

* Adjust docstring

* Enable no-partitioning mode

* Use create_mesh

* Update to API

* Slight tidy up

* More dolfinx integration

* Simplify and add more docs

* Fix for MPI

* Start on generalising assembler

* Used custom SP for now

* Ranges

* Check only one type of imap when Topology::index_map called

* Fix default integration entities

* Combine

* Add sparsity build function

* Simplify

* Tidy

* Format

* Fix docs

* Docs

* Add FIXMEs

* Add list of eles

* Multiple dofmaps

* Add new FS contructor

* Add getters

* Bind new constructor

* Start on form

* Add func to get cell types

* Pass multiple ufcx_forms

* Work on demo

* Remove commented line

* Start on adding cell_type to integral data

* More work

* More work

* More hacking

* More work

* More work

* Multiple kernels

* Fix sparsity pattern creation

* Add back rest of demo

* Tidy

* Tidy

* Tidy

* Tidy

* Uncomment code

* Tidy

* Tidy

* Add FIXME

* Tidy

* Tidy

* Uncomment code

* Simplify

* Uncomment more code

* Rename

* Simplify

* Tidy

* Add comment

* Get kernel properly

* Comments

* More work

* Check number of elements/dofmaps

* Add note

* Tidy

* Tidy

* Tidy

* Ruff fix

* Add []

* Fix vector vs span

* Ruff formatting

* Ruff formatting

* Doc fix

* Add separate function

* Add FIXME

* mypy

* Use std::size_t

* Docs

* Fix float type

* Fix type

* Fix test

* Updates

* Add check

* Update cpp/dolfinx/fem/FunctionSpace.h

* Style updates

* Docs

---------

Co-authored-by: Chris Richardson <chris@bpi.cam.ac.uk>
Co-authored-by: Garth N. Wells <gnw20@cam.ac.uk>
---
 cpp/dolfinx/fem/Form.h                 | 112 ++++-
 cpp/dolfinx/fem/FunctionSpace.h        | 120 +++--
 cpp/dolfinx/fem/assemble_matrix_impl.h | 190 ++++----
 cpp/dolfinx/fem/assembler.h            |  27 +-
 cpp/dolfinx/fem/utils.h                | 583 +++++++++++++------------
 cpp/dolfinx/mesh/Geometry.h            |   2 +-
 cpp/dolfinx/mesh/Topology.cpp          |  15 +-
 cpp/dolfinx/mesh/Topology.h            |   6 +-
 python/demo/demo_mixed-topology.py     | 187 ++++++++
 python/doc/source/demos.rst            |   2 +
 python/dolfinx/fem/__init__.py         |  21 +
 python/dolfinx/fem/forms.py            |  93 +++-
 python/dolfinx/wrappers/assemble.cpp   |   9 +-
 python/dolfinx/wrappers/fem.cpp        |  22 +-
 python/dolfinx/wrappers/mesh.cpp       |   1 +
 python/test/unit/fem/test_forms.py     |   4 +-
 16 files changed, 963 insertions(+), 431 deletions(-)
 create mode 100644 python/demo/demo_mixed-topology.py

diff --git a/cpp/dolfinx/fem/Form.h b/cpp/dolfinx/fem/Form.h
index 58f3eef2ebd..c75a2af9e81 100644
--- a/cpp/dolfinx/fem/Form.h
+++ b/cpp/dolfinx/fem/Form.h
@@ -258,6 +258,35 @@ class Form
     return _function_spaces;
   }
 
+  /// @brief Get the kernel function for integral `i` on given domain
+  /// type.
+  /// @param[in] type Integral type.
+  /// @param[in] i The subdomain ID.
+  /// @param[in] kernel_idx Index of the kernel (we may have multiple kernels
+  /// for a given ID in mixed-topology meshes).
+  /// @return Function to call for `tabulate_tensor`.
+  std::function<void(scalar_type*, const scalar_type*, const scalar_type*,
+                     const geometry_type*, const int*, const uint8_t*)>
+  kernel(IntegralType type, int i, int kernel_idx) const
+  {
+    const std::vector<integral_data<scalar_type, geometry_type>>& integrals
+        = _integrals[static_cast<std::size_t>(type)];
+
+    // Get the range of integrals with a given ID
+    auto get_id = [](const auto& a) { return a.id; };
+    auto start = std::ranges::lower_bound(integrals, i, std::less<>{}, get_id);
+    auto end = std::ranges::upper_bound(integrals, i, std::less<>{}, get_id);
+
+    // Check that the kernel is valid and return it if so
+    if (start == integrals.end() or start->id != i
+        or std::distance(start, end) <= kernel_idx)
+    {
+      throw std::runtime_error("No kernel for requested domain index.");
+    }
+
+    return std::next(start, kernel_idx)->kernel;
+  }
+
   /// @brief Get the kernel function for integral `i` on given domain
   /// type.
   /// @param[in] type Integral type.
@@ -267,13 +296,7 @@ class Form
                      const geometry_type*, const int*, const uint8_t*)>
   kernel(IntegralType type, int i) const
   {
-    const auto& integrals = _integrals[static_cast<std::size_t>(type)];
-    auto it = std::ranges::lower_bound(integrals, i, std::less<>{},
-                                       [](const auto& a) { return a.id; });
-    if (it != integrals.end() and it->id == i)
-      return it->kernel;
-    else
-      throw std::runtime_error("No kernel for requested domain index.");
+    return kernel(type, i, 0);
   }
 
   /// @brief Get types of integrals in the form.
@@ -310,8 +333,7 @@ class Form
   /// @param[in] i Index of the integral.
   std::vector<int> active_coeffs(IntegralType type, std::size_t i) const
   {
-    assert(i < _integrals[static_cast<std::size_t>(type)].size());
-    return _integrals[static_cast<std::size_t>(type)][i].coeffs;
+    return _integrals[static_cast<std::size_t>(type)].at(i).coeffs;
   }
 
   /// @brief Get the IDs for integrals (kernels) for given integral type.
@@ -324,9 +346,15 @@ class Form
   std::vector<int> integral_ids(IntegralType type) const
   {
     std::vector<int> ids;
-    const auto& integrals = _integrals[static_cast<std::size_t>(type)];
+    const std::vector<integral_data<scalar_type, geometry_type>>& integrals
+        = _integrals[static_cast<std::size_t>(type)];
     std::ranges::transform(integrals, std::back_inserter(ids),
                            [](auto& integral) { return integral.id; });
+
+    // IDs may be repeated in mixed-topology meshes, so remove duplicates
+    std::sort(ids.begin(), ids.end());
+    auto it = std::unique(ids.begin(), ids.end());
+    ids.erase(it, ids.end());
     return ids;
   }
 
@@ -344,12 +372,14 @@ class Form
   /// local_facet_index_1)`. Data is flattened with row-major layout,
   /// `shape=(num_facets, 4)`.
   ///
-  /// @param[in] type Integral domain type.
+  /// @param[in] type Integral type.
   /// @param[in] i Integral ID, i.e. (sub)domain index.
   /// @return List of active entities for the given integral (kernel).
   std::span<const std::int32_t> domain(IntegralType type, int i) const
   {
-    const auto& integrals = _integrals[static_cast<std::size_t>(type)];
+    // FIXME This should call domain with kernel_idx=0
+    const std::vector<integral_data<scalar_type, geometry_type>>& integrals
+        = _integrals[static_cast<std::size_t>(type)];
     auto it = std::ranges::lower_bound(integrals, i, std::less<>{},
                                        [](const auto& a) { return a.id; });
     if (it != integrals.end() and it->id == i)
@@ -358,24 +388,52 @@ class Form
       throw std::runtime_error("No mesh entities for requested domain index.");
   }
 
+  /// @brief Get the list of mesh entity indices for the ith integral
+  /// (kernel) of a given type.
+  /// @param[in] type Integral type.
+  /// @param[in] i Integral ID, i.e. (sub)domain index.
+  /// @param[in] kernel_idx Index of the kernel (we may have multiple kernels
+  /// for a given ID in mixed-topology meshes).
+  /// @return List of active entities for the given integral (kernel).
+  std::vector<std::int32_t> domain(IntegralType type, int i,
+                                   int kernel_idx) const
+  {
+    const std::vector<integral_data<scalar_type, geometry_type>>& integrals
+        = _integrals[static_cast<std::size_t>(type)];
+    auto get_id = [](const auto& a) { return a.id; };
+    auto start = std::ranges::lower_bound(integrals, i, std::less<>{}, get_id);
+    auto end = std::ranges::upper_bound(integrals, i, std::less<>{}, get_id);
+
+    // Check that the kernel is valid and return it if so
+    if (start == integrals.end() or start->id != i
+        or std::distance(start, end) <= kernel_idx)
+    {
+      throw std::runtime_error("No kernel for requested domain index.");
+    }
+
+    return std::next(start, kernel_idx)->entities;
+  }
+
   /// @brief Compute the list of entity indices in `mesh` for the ith
   /// integral (kernel) of a given type (i.e. cell, exterior facet, or
   /// interior facet).
   ///
   /// @param type Integral type.
   /// @param i Integral ID, i.e. the (sub)domain index.
+  /// @param kernel_idx Index of the kernel (we may have multiple
+  /// kernels for a given ID in mixed-topology meshes).
   /// @param mesh The mesh the entities are numbered with respect to.
   /// @return List of active entities in `mesh` for the given integral.
-  std::vector<std::int32_t> domain(IntegralType type, int i,
+  std::vector<std::int32_t> domain(IntegralType type, int i, int kernel_idx,
                                    const mesh::Mesh<geometry_type>& mesh) const
   {
     // Hack to avoid passing shared pointer to this function
     std::shared_ptr<const mesh::Mesh<geometry_type>> msh_ptr(
         &mesh, [](const mesh::Mesh<geometry_type>*) {});
 
-    std::span<const std::int32_t> entities = domain(type, i);
+    std::vector<std::int32_t> entities = domain(type, i, kernel_idx);
     if (msh_ptr == _mesh)
-      return std::vector(entities.begin(), entities.end());
+      return entities;
     else
     {
       std::span<const std::int32_t> entity_map = _entity_maps.at(msh_ptr);
@@ -416,6 +474,7 @@ class Form
             // Get the facet index
             const std::int32_t facet
                 = c_to_f->links(entities[i])[entities[i + 1]];
+
             // Add cell and the local facet index
             mapped_entities.insert(mapped_entities.end(),
                                    {entity_map[facet], entities[i + 1]});
@@ -443,9 +502,9 @@ class Form
         }
         else if (codim == 1)
         {
-          // In this case, the entity maps take facets in (`_mesh`) to cells in
-          // `mesh`, so we need to get the facet number from the (cell,
-          // local_facet pair) first.
+          // In this case, the entity maps take facets in (`_mesh`) to
+          // cells in `mesh`, so we need to get the facet number from
+          // the (cell, local_facet pair) first.
           auto c_to_f = _mesh->topology()->connectivity(tdim, tdim - 1);
           assert(c_to_f);
           for (std::size_t i = 0; i < entities.size(); i += 2)
@@ -453,6 +512,7 @@ class Form
             // Get the facet index
             const std::int32_t facet
                 = c_to_f->links(entities[i])[entities[i + 1]];
+
             // Add cell and the local facet index
             mapped_entities.insert(mapped_entities.end(),
                                    {entity_map[facet], entities[i + 1]});
@@ -468,6 +528,20 @@ class Form
     }
   }
 
+  /// @brief Compute the list of entity indices in `mesh` for the ith
+  /// integral (kernel) of a given type (i.e. cell, exterior facet, or
+  /// interior facet).
+  ///
+  /// @param type Integral type.
+  /// @param i Integral ID, i.e. the (sub)domain index.
+  /// @param mesh The mesh the entities are numbered with respect to.
+  /// @return List of active entities in `mesh` for the given integral.
+  std::vector<std::int32_t> domain(IntegralType type, int i,
+                                   const mesh::Mesh<geometry_type>& mesh) const
+  {
+    return domain(type, i, 0, mesh);
+  }
+
   /// @brief Access coefficients.
   const std::vector<
       std::shared_ptr<const Function<scalar_type, geometry_type>>>&
@@ -531,5 +605,5 @@ class Form
   std::map<std::shared_ptr<const mesh::Mesh<geometry_type>>,
            std::vector<std::int32_t>>
       _entity_maps;
-}; // namespace dolfinx::fem
+};
 } // namespace dolfinx::fem
diff --git a/cpp/dolfinx/fem/FunctionSpace.h b/cpp/dolfinx/fem/FunctionSpace.h
index e7b82f9c8a3..58518283281 100644
--- a/cpp/dolfinx/fem/FunctionSpace.h
+++ b/cpp/dolfinx/fem/FunctionSpace.h
@@ -44,12 +44,47 @@ class FunctionSpace
   FunctionSpace(std::shared_ptr<const mesh::Mesh<geometry_type>> mesh,
                 std::shared_ptr<const FiniteElement<geometry_type>> element,
                 std::shared_ptr<const DofMap> dofmap)
-      : _mesh(mesh), _element(element), _dofmap(dofmap),
+      : _mesh(mesh), _dofmaps{dofmap}, _elements{element},
         _id(boost::uuids::random_generator()()), _root_space_id(_id)
   {
     // Do nothing
   }
 
+  /// @brief Create function space for given mesh, elements and
+  /// degree-of-freedom maps.
+  /// @param[in] mesh Mesh that the space is defined on.
+  /// @param[in] elements Finite elements for the space, one for each cell type.
+  /// The elements must be ordered to be consistent with
+  /// mesh::topology::cell_types.
+  /// @param[in] dofmaps Degree-of-freedom maps for the space, one for each
+  /// element. The dofmaps must be ordered in the same way as the elements.
+  FunctionSpace(
+      std::shared_ptr<const mesh::Mesh<geometry_type>> mesh,
+      std::vector<std::shared_ptr<const FiniteElement<geometry_type>>> elements,
+      std::vector<std::shared_ptr<const DofMap>> dofmaps)
+      : _mesh(mesh), _dofmaps(dofmaps), _elements(elements),
+        _id(boost::uuids::random_generator()()), _root_space_id(_id)
+  {
+    std::vector<mesh::CellType> cell_types = mesh->topology()->cell_types();
+    int num_cell_types = cell_types.size();
+    if (elements.size() != num_cell_types)
+    {
+      throw std::runtime_error(
+          "Number of elements must match number of cell types");
+    }
+    if (dofmaps.size() != num_cell_types)
+    {
+      throw std::runtime_error(
+          "Number of dofmaps must match number of cell types");
+    }
+    for (std::size_t i = 0; i < num_cell_types; ++i)
+    {
+      if (elements.at(i)->cell_type() != cell_types.at(i))
+        throw std::runtime_error(
+            "Element cell types must match mesh cell types");
+    }
+  }
+
   // Copy constructor (deleted)
   FunctionSpace(const FunctionSpace& V) = delete;
 
@@ -76,19 +111,19 @@ class FunctionSpace
   FunctionSpace sub(const std::vector<int>& component) const
   {
     assert(_mesh);
-    assert(_element);
-    assert(_dofmap);
+    assert(_elements.front());
+    assert(_dofmaps.front());
 
     // Check that component is valid
     if (component.empty())
       throw std::runtime_error("Component must be non-empty");
 
     // Extract sub-element
-    auto element = this->_element->extract_sub_element(component);
+    auto element = this->_elements.front()->extract_sub_element(component);
 
     // Extract sub dofmap
-    auto dofmap
-        = std::make_shared<DofMap>(_dofmap->extract_sub_dofmap(component));
+    auto dofmap = std::make_shared<DofMap>(
+        _dofmaps.front()->extract_sub_dofmap(component));
 
     // Create new sub space
     FunctionSpace sub_space(_mesh, element, dofmap);
@@ -137,11 +172,11 @@ class FunctionSpace
 
     // Create collapsed DofMap
     auto [_collapsed_dofmap, collapsed_dofs]
-        = _dofmap->collapse(_mesh->comm(), *_mesh->topology());
+        = _dofmaps.front()->collapse(_mesh->comm(), *_mesh->topology());
     auto collapsed_dofmap
         = std::make_shared<DofMap>(std::move(_collapsed_dofmap));
 
-    return {FunctionSpace(_mesh, _element, collapsed_dofmap),
+    return {FunctionSpace(_mesh, _elements.front(), collapsed_dofmap),
             std::move(collapsed_dofs)};
   }
 
@@ -154,8 +189,8 @@ class FunctionSpace
   /// 2-tensor.
   bool symmetric() const
   {
-    if (_element)
-      return _element->symmetric();
+    if (_elements.front())
+      return _elements.front()->symmetric();
     return false;
   }
 
@@ -178,8 +213,8 @@ class FunctionSpace
                                "FunctionSpace that is a subspace.");
     }
 
-    assert(_element);
-    if (_element->is_mixed())
+    assert(_elements.front());
+    if (_elements.front()->is_mixed())
     {
       throw std::runtime_error(
           "Cannot tabulate coordinates for a mixed FunctionSpace.");
@@ -187,31 +222,32 @@ class FunctionSpace
 
     // Geometric dimension
     assert(_mesh);
-    assert(_element);
+    assert(_elements.front());
     const std::size_t gdim = _mesh->geometry().dim();
     const int tdim = _mesh->topology()->dim();
 
     // Get dofmap local size
-    assert(_dofmap);
-    std::shared_ptr<const common::IndexMap> index_map = _dofmap->index_map;
+    assert(_dofmaps.front());
+    std::shared_ptr<const common::IndexMap> index_map
+        = _dofmaps.front()->index_map;
     assert(index_map);
-    const int index_map_bs = _dofmap->index_map_bs();
-    const int dofmap_bs = _dofmap->bs();
+    const int index_map_bs = _dofmaps.front()->index_map_bs();
+    const int dofmap_bs = _dofmaps.front()->bs();
 
-    const int element_block_size = _element->block_size();
+    const int element_block_size = _elements.front()->block_size();
     const std::size_t scalar_dofs
-        = _element->space_dimension() / element_block_size;
+        = _elements.front()->space_dimension() / element_block_size;
     const std::int32_t num_dofs
         = index_map_bs * (index_map->size_local() + index_map->num_ghosts())
           / dofmap_bs;
 
     // Get the dof coordinates on the reference element
-    if (!_element->interpolation_ident())
+    if (!_elements.front()->interpolation_ident())
     {
       throw std::runtime_error("Cannot evaluate dof coordinates - this element "
                                "does not have pointwise evaluation.");
     }
-    const auto [X, Xshape] = _element->interpolation_points();
+    const auto [X, Xshape] = _elements.front()->interpolation_points();
 
     // Get coordinate map
     const CoordinateElement<geometry_type>& cmap = _mesh->geometry().cmap();
@@ -245,7 +281,7 @@ class FunctionSpace
     const int num_cells = map->size_local() + map->num_ghosts();
 
     std::span<const std::uint32_t> cell_info;
-    if (_element->needs_dof_transformations())
+    if (_elements.front()->needs_dof_transformations())
     {
       _mesh->topology_mutable()->create_entity_permutations();
       cell_info = std::span(_mesh->topology()->get_cell_permutation_info());
@@ -264,7 +300,7 @@ class FunctionSpace
     // TODO: Check transform
     // Basis function reference-to-conforming transformation function
     auto apply_dof_transformation
-        = _element->template dof_transformation_fn<geometry_type>(
+        = _elements.front()->template dof_transformation_fn<geometry_type>(
             doftransform::standard);
 
     for (int c = 0; c < num_cells; ++c)
@@ -282,7 +318,7 @@ class FunctionSpace
           x_b, std::span(cell_info.data(), cell_info.size()), c, x.extent(1));
 
       // Get cell dofmap
-      auto dofs = _dofmap->cell_dofs(c);
+      auto dofs = _dofmaps.front()->cell_dofs(c);
 
       // Copy dof coordinates into vector
       if (!transpose)
@@ -311,21 +347,49 @@ class FunctionSpace
   /// The finite element
   std::shared_ptr<const FiniteElement<geometry_type>> element() const
   {
-    return _element;
+    if (_elements.size() > 1)
+    {
+      throw std::runtime_error(
+          "FunctionSpace has multiple elements, call `elements` instead.");
+    }
+
+    return elements(0);
+  }
+
+  /// The finite elements
+  std::shared_ptr<const FiniteElement<geometry_type>>
+  elements(int cell_type_idx) const
+  {
+    return _elements.at(cell_type_idx);
   }
 
   /// The dofmap
-  std::shared_ptr<const DofMap> dofmap() const { return _dofmap; }
+  std::shared_ptr<const DofMap> dofmap() const
+  {
+    if (_dofmaps.size() > 1)
+    {
+      throw std::runtime_error(
+          "FunctionSpace has multiple dofmaps, call `dofmaps` instead.");
+    }
+
+    return dofmaps(0);
+  }
+
+  /// The dofmaps
+  std::shared_ptr<const DofMap> dofmaps(int cell_type_idx) const
+  {
+    return _dofmaps.at(cell_type_idx);
+  }
 
 private:
   // The mesh
   std::shared_ptr<const mesh::Mesh<geometry_type>> _mesh;
 
   // The finite element
-  std::shared_ptr<const FiniteElement<geometry_type>> _element;
+  std::vector<std::shared_ptr<const FiniteElement<geometry_type>>> _elements;
 
   // The dofmap
-  std::shared_ptr<const DofMap> _dofmap;
+  std::vector<std::shared_ptr<const DofMap>> _dofmaps;
 
   // The component w.r.t. to root space
   std::vector<int> _component;
diff --git a/cpp/dolfinx/fem/assemble_matrix_impl.h b/cpp/dolfinx/fem/assemble_matrix_impl.h
index 78582892f4f..b1abf6a6a80 100644
--- a/cpp/dolfinx/fem/assemble_matrix_impl.h
+++ b/cpp/dolfinx/fem/assemble_matrix_impl.h
@@ -492,7 +492,7 @@ void assemble_interior_facets(
 /// are applied. Matrix is not finalised.
 template <dolfinx::scalar T, std::floating_point U>
 void assemble_matrix(
-    la::MatSet<T> auto mat_set, const Form<T, U>& a, mdspan2_t x_dofmap,
+    la::MatSet<T> auto mat_set, const Form<T, U>& a,
     std::span<const scalar_value_type_t<T>> x, std::span<const T> constants,
     const std::map<std::pair<IntegralType, int>,
                    std::pair<std::span<const T>, int>>& coefficients,
@@ -510,90 +510,120 @@ void assemble_matrix(
   auto mesh1 = a.function_spaces().at(1)->mesh();
   assert(mesh1);
 
-  // Get dofmap data
-  std::shared_ptr<const fem::DofMap> dofmap0
-      = a.function_spaces().at(0)->dofmap();
-  std::shared_ptr<const fem::DofMap> dofmap1
-      = a.function_spaces().at(1)->dofmap();
-  assert(dofmap0);
-  assert(dofmap1);
-  auto dofs0 = dofmap0->map();
-  const int bs0 = dofmap0->bs();
-  auto dofs1 = dofmap1->map();
-  const int bs1 = dofmap1->bs();
-
-  auto element0 = a.function_spaces().at(0)->element();
-  assert(element0);
-  auto element1 = a.function_spaces().at(1)->element();
-  assert(element1);
-  fem::DofTransformKernel<T> auto P0
-      = element0->template dof_transformation_fn<T>(doftransform::standard);
-  fem::DofTransformKernel<T> auto P1T
-      = element1->template dof_transformation_right_fn<T>(
-          doftransform::transpose);
-
-  std::span<const std::uint32_t> cell_info0;
-  std::span<const std::uint32_t> cell_info1;
-  if (element0->needs_dof_transformations()
-      or element1->needs_dof_transformations() or a.needs_facet_permutations())
+  // TODO: Mixed topology with exterior and interior facet integrals.
+  //
+  // NOTE: Can't just loop over cell types for interior facet integrals
+  // because we have a kernel per combination of comparable cell types,
+  // rather than one per cell type. Also, we need the dofmaps for two
+  // different cell types at the same time.
+  const int num_cell_types = mesh->topology()->cell_types().size();
+  for (int cell_type_idx = 0; cell_type_idx < num_cell_types; ++cell_type_idx)
   {
-    mesh0->topology_mutable()->create_entity_permutations();
-    mesh1->topology_mutable()->create_entity_permutations();
-    cell_info0 = std::span(mesh0->topology()->get_cell_permutation_info());
-    cell_info1 = std::span(mesh1->topology()->get_cell_permutation_info());
-  }
+    // Geometry dofmap and data
+    mdspan2_t x_dofmap = mesh->geometry().dofmap(cell_type_idx);
+
+    // Get dofmap data
+    std::shared_ptr<const fem::DofMap> dofmap0
+        = a.function_spaces().at(0)->dofmaps(cell_type_idx);
+    std::shared_ptr<const fem::DofMap> dofmap1
+        = a.function_spaces().at(1)->dofmaps(cell_type_idx);
+    assert(dofmap0);
+    assert(dofmap1);
+    auto dofs0 = dofmap0->map();
+    const int bs0 = dofmap0->bs();
+    auto dofs1 = dofmap1->map();
+    const int bs1 = dofmap1->bs();
+
+    auto element0 = a.function_spaces().at(0)->elements(cell_type_idx);
+    assert(element0);
+    auto element1 = a.function_spaces().at(1)->elements(cell_type_idx);
+    assert(element1);
+    fem::DofTransformKernel<T> auto P0
+        = element0->template dof_transformation_fn<T>(doftransform::standard);
+    fem::DofTransformKernel<T> auto P1T
+        = element1->template dof_transformation_right_fn<T>(
+            doftransform::transpose);
+
+    std::span<const std::uint32_t> cell_info0;
+    std::span<const std::uint32_t> cell_info1;
+    if (element0->needs_dof_transformations()
+        or element1->needs_dof_transformations()
+        or a.needs_facet_permutations())
+    {
+      mesh0->topology_mutable()->create_entity_permutations();
+      mesh1->topology_mutable()->create_entity_permutations();
+      cell_info0 = std::span(mesh0->topology()->get_cell_permutation_info());
+      cell_info1 = std::span(mesh1->topology()->get_cell_permutation_info());
+    }
 
-  for (int i : a.integral_ids(IntegralType::cell))
-  {
-    auto fn = a.kernel(IntegralType::cell, i);
-    assert(fn);
-    auto& [coeffs, cstride] = coefficients.at({IntegralType::cell, i});
-    impl::assemble_cells(
-        mat_set, x_dofmap, x, a.domain(IntegralType::cell, i),
-        {dofs0, bs0, a.domain(IntegralType::cell, i, *mesh0)}, P0,
-        {dofs1, bs1, a.domain(IntegralType::cell, i, *mesh1)}, P1T, bc0, bc1,
-        fn, coeffs, cstride, constants, cell_info0, cell_info1);
-  }
+    for (int i : a.integral_ids(IntegralType::cell))
+    {
+      auto fn = a.kernel(IntegralType::cell, i, cell_type_idx);
+      assert(fn);
+      auto& [coeffs, cstride] = coefficients.at({IntegralType::cell, i});
+
+      impl::assemble_cells(
+          mat_set, x_dofmap, x, a.domain(IntegralType::cell, i, cell_type_idx),
+          {dofs0, bs0, a.domain(IntegralType::cell, i, cell_type_idx, *mesh0)},
+          P0,
+          {dofs1, bs1, a.domain(IntegralType::cell, i, cell_type_idx, *mesh1)},
+          P1T, bc0, bc1, fn, coeffs, cstride, constants, cell_info0,
+          cell_info1);
+    }
 
-  std::span<const std::uint8_t> perms;
-  if (a.needs_facet_permutations())
-  {
-    mesh->topology_mutable()->create_entity_permutations();
-    perms = std::span(mesh->topology()->get_facet_permutations());
-  }
+    std::span<const std::uint8_t> perms;
+    if (a.needs_facet_permutations())
+    {
+      mesh->topology_mutable()->create_entity_permutations();
+      perms = std::span(mesh->topology()->get_facet_permutations());
+    }
 
-  mesh::CellType cell_type = mesh->topology()->cell_type();
-  int num_facets_per_cell
-      = mesh::cell_num_entities(cell_type, mesh->topology()->dim() - 1);
-  for (int i : a.integral_ids(IntegralType::exterior_facet))
-  {
-    auto fn = a.kernel(IntegralType::exterior_facet, i);
-    assert(fn);
-    auto& [coeffs, cstride]
-        = coefficients.at({IntegralType::exterior_facet, i});
-    impl::assemble_exterior_facets(
-        mat_set, x_dofmap, x, num_facets_per_cell,
-        a.domain(IntegralType::exterior_facet, i),
-        {dofs0, bs0, a.domain(IntegralType::exterior_facet, i, *mesh0)}, P0,
-        {dofs1, bs1, a.domain(IntegralType::exterior_facet, i, *mesh1)}, P1T,
-        bc0, bc1, fn, coeffs, cstride, constants, cell_info0, cell_info1,
-        perms);
-  }
+    mesh::CellType cell_type = mesh->topology()->cell_types()[cell_type_idx];
+    int num_facets_per_cell
+        = mesh::cell_num_entities(cell_type, mesh->topology()->dim() - 1);
+    for (int i : a.integral_ids(IntegralType::exterior_facet))
+    {
+      if (num_cell_types > 1)
+      {
+        throw std::runtime_error("Exterior facet integrals with mixed "
+                                 "topology aren't supported yet");
+      }
 
-  for (int i : a.integral_ids(IntegralType::interior_facet))
-  {
-    const std::vector<int> c_offsets = a.coefficient_offsets();
-    auto fn = a.kernel(IntegralType::interior_facet, i);
-    assert(fn);
-    auto& [coeffs, cstride]
-        = coefficients.at({IntegralType::interior_facet, i});
-    impl::assemble_interior_facets(
-        mat_set, x_dofmap, x, num_facets_per_cell,
-        a.domain(IntegralType::interior_facet, i),
-        {*dofmap0, bs0, a.domain(IntegralType::interior_facet, i, *mesh0)}, P0,
-        {*dofmap1, bs1, a.domain(IntegralType::interior_facet, i, *mesh1)}, P1T,
-        bc0, bc1, fn, coeffs, cstride, c_offsets, constants, cell_info0,
-        cell_info1, perms);
+      auto fn = a.kernel(IntegralType::exterior_facet, i);
+      assert(fn);
+      auto& [coeffs, cstride]
+          = coefficients.at({IntegralType::exterior_facet, i});
+      impl::assemble_exterior_facets(
+          mat_set, x_dofmap, x, num_facets_per_cell,
+          a.domain(IntegralType::exterior_facet, i),
+          {dofs0, bs0, a.domain(IntegralType::exterior_facet, i, *mesh0)}, P0,
+          {dofs1, bs1, a.domain(IntegralType::exterior_facet, i, *mesh1)}, P1T,
+          bc0, bc1, fn, coeffs, cstride, constants, cell_info0, cell_info1,
+          perms);
+    }
+
+    for (int i : a.integral_ids(IntegralType::interior_facet))
+    {
+      if (num_cell_types > 1)
+      {
+        throw std::runtime_error("Interior facet integrals with mixed "
+                                 "topology aren't supported yet");
+      }
+
+      const std::vector<int> c_offsets = a.coefficient_offsets();
+      auto fn = a.kernel(IntegralType::interior_facet, i);
+      assert(fn);
+      auto& [coeffs, cstride]
+          = coefficients.at({IntegralType::interior_facet, i});
+      impl::assemble_interior_facets(
+          mat_set, x_dofmap, x, num_facets_per_cell,
+          a.domain(IntegralType::interior_facet, i),
+          {*dofmap0, bs0, a.domain(IntegralType::interior_facet, i, *mesh0)},
+          P0,
+          {*dofmap1, bs1, a.domain(IntegralType::interior_facet, i, *mesh1)},
+          P1T, bc0, bc1, fn, coeffs, cstride, c_offsets, constants, cell_info0,
+          cell_info1, perms);
+    }
   }
 }
 
diff --git a/cpp/dolfinx/fem/assembler.h b/cpp/dolfinx/fem/assembler.h
index ab41da82792..4a15a3229e9 100644
--- a/cpp/dolfinx/fem/assembler.h
+++ b/cpp/dolfinx/fem/assembler.h
@@ -38,10 +38,10 @@ make_coefficients_span(const std::map<std::pair<IntegralType, int>,
 {
   using Key = typename std::remove_reference_t<decltype(coeffs)>::key_type;
   std::map<Key, std::pair<std::span<const T>, int>> c;
-  std::ranges::transform(
-      coeffs, std::inserter(c, c.end()),
-      [](auto& e) -> typename decltype(c)::value_type
-      { return {e.first, {e.second.first, e.second.second}}; });
+  std::ranges::transform(coeffs, std::inserter(c, c.end()),
+                         [](auto& e) -> typename decltype(c)::value_type {
+                           return {e.first, {e.second.first, e.second.second}};
+                         });
   return c;
 }
 
@@ -245,16 +245,15 @@ void assemble_matrix(
   assert(mesh);
   if constexpr (std::is_same_v<U, scalar_value_type_t<T>>)
   {
-    impl::assemble_matrix(mat_add, a, mesh->geometry().dofmap(),
-                          mesh->geometry().x(), constants, coefficients,
-                          dof_marker0, dof_marker1);
+    impl::assemble_matrix(mat_add, a, mesh->geometry().x(), constants,
+                          coefficients, dof_marker0, dof_marker1);
   }
   else
   {
     auto x = mesh->geometry().x();
     std::vector<scalar_value_type_t<T>> _x(x.begin(), x.end());
-    impl::assemble_matrix(mat_add, a, mesh->geometry().dofmap(), _x, constants,
-                          coefficients, dof_marker0, dof_marker1);
+    impl::assemble_matrix(mat_add, a, _x, constants, coefficients, dof_marker0,
+                          dof_marker1);
   }
 }
 
@@ -273,10 +272,12 @@ void assemble_matrix(
     const std::vector<std::reference_wrapper<const DirichletBC<T, U>>>& bcs)
 {
   // Index maps for dof ranges
-  auto map0 = a.function_spaces().at(0)->dofmap()->index_map;
-  auto map1 = a.function_spaces().at(1)->dofmap()->index_map;
-  auto bs0 = a.function_spaces().at(0)->dofmap()->index_map_bs();
-  auto bs1 = a.function_spaces().at(1)->dofmap()->index_map_bs();
+  // NOTE: For mixed-topology meshes, there will be multiple DOF maps,
+  // but the index maps are the same.
+  auto map0 = a.function_spaces().at(0)->dofmaps(0)->index_map;
+  auto map1 = a.function_spaces().at(1)->dofmaps(0)->index_map;
+  auto bs0 = a.function_spaces().at(0)->dofmaps(0)->index_map_bs();
+  auto bs1 = a.function_spaces().at(1)->dofmaps(0)->index_map_bs();
 
   // Build dof markers
   std::vector<std::int8_t> dof_marker0, dof_marker1;
diff --git a/cpp/dolfinx/fem/utils.h b/cpp/dolfinx/fem/utils.h
index d0d5698744a..2dab25772ca 100644
--- a/cpp/dolfinx/fem/utils.h
+++ b/cpp/dolfinx/fem/utils.h
@@ -150,6 +150,34 @@ extract_function_spaces(const std::vector<std::vector<const Form<T, U>*>>& a)
 /// @return The corresponding sparsity pattern
 template <dolfinx::scalar T, std::floating_point U>
 la::SparsityPattern create_sparsity_pattern(const Form<T, U>& a)
+{
+  std::shared_ptr mesh = a.mesh();
+  assert(mesh);
+
+  // Get index maps and block sizes from the DOF maps. Note that in
+  // mixed-topology meshes, despite there being multiple DOF maps, the
+  // index maps and block sizes are the same.
+  std::array<std::reference_wrapper<const DofMap>, 2> dofmaps{
+      *a.function_spaces().at(0)->dofmaps(0),
+      *a.function_spaces().at(1)->dofmaps(0)};
+
+  const std::array index_maps{dofmaps[0].get().index_map,
+                              dofmaps[1].get().index_map};
+  const std::array bs
+      = {dofmaps[0].get().index_map_bs(), dofmaps[1].get().index_map_bs()};
+
+  la::SparsityPattern pattern(mesh->comm(), index_maps, bs);
+  build_sparsity_pattern(pattern, a);
+  return pattern;
+}
+
+/// @brief Build a sparsity pattern for a given form.
+/// @note The pattern is not finalised, i.e. the caller is responsible
+/// for calling SparsityPattern::assemble.
+/// @param[in] pattern The sparsity pattern to add to
+/// @param[in] a A bilinear form
+template <dolfinx::scalar T, std::floating_point U>
+void build_sparsity_pattern(la::SparsityPattern& pattern, const Form<T, U>& a)
 {
   if (a.rank() != 2)
   {
@@ -157,13 +185,8 @@ la::SparsityPattern create_sparsity_pattern(const Form<T, U>& a)
         "Cannot create sparsity pattern. Form is not a bilinear.");
   }
 
-  // Get dof maps and mesh
-  std::array<std::reference_wrapper<const DofMap>, 2> dofmaps{
-      *a.function_spaces().at(0)->dofmap(),
-      *a.function_spaces().at(1)->dofmap()};
   std::shared_ptr mesh = a.mesh();
   assert(mesh);
-
   std::shared_ptr mesh0 = a.function_spaces().at(0)->mesh();
   assert(mesh0);
   std::shared_ptr mesh1 = a.function_spaces().at(1)->mesh();
@@ -181,12 +204,6 @@ la::SparsityPattern create_sparsity_pattern(const Form<T, U>& a)
 
   common::Timer t0("Build sparsity");
 
-  // Get common::IndexMaps for each dimension
-  const std::array index_maps{dofmaps[0].get().index_map,
-                              dofmaps[1].get().index_map};
-  const std::array bs
-      = {dofmaps[0].get().index_map_bs(), dofmaps[1].get().index_map_bs()};
-
   auto extract_cells = [](std::span<const std::int32_t> facets)
   {
     assert(facets.size() % 2 == 0);
@@ -197,48 +214,54 @@ la::SparsityPattern create_sparsity_pattern(const Form<T, U>& a)
     return cells;
   };
 
-  // Create and build sparsity pattern
-  la::SparsityPattern pattern(mesh->comm(), index_maps, bs);
-  for (auto type : types)
+  const int num_cell_types = mesh->topology()->cell_types().size();
+  for (int cell_type_idx = 0; cell_type_idx < num_cell_types; ++cell_type_idx)
   {
-    std::vector<int> ids = a.integral_ids(type);
-    switch (type)
+    std::array<std::reference_wrapper<const DofMap>, 2> dofmaps{
+        *a.function_spaces().at(0)->dofmaps(cell_type_idx),
+        *a.function_spaces().at(1)->dofmaps(cell_type_idx)};
+
+    // Create and build sparsity pattern
+    for (auto type : types)
     {
-    case IntegralType::cell:
-      for (int id : ids)
-      {
-        sparsitybuild::cells(
-            pattern, {a.domain(type, id, *mesh0), a.domain(type, id, *mesh1)},
-            {{dofmaps[0], dofmaps[1]}});
-      }
-      break;
-    case IntegralType::interior_facet:
-      for (int id : ids)
+      std::vector<int> ids = a.integral_ids(type);
+      switch (type)
       {
-        sparsitybuild::interior_facets(
-            pattern,
-            {extract_cells(a.domain(type, id, *mesh0)),
-             extract_cells(a.domain(type, id, *mesh1))},
-            {{dofmaps[0], dofmaps[1]}});
-      }
-      break;
-    case IntegralType::exterior_facet:
-      for (int id : ids)
-      {
-        sparsitybuild::cells(pattern,
-                             {extract_cells(a.domain(type, id, *mesh0)),
-                              extract_cells(a.domain(type, id, *mesh1))},
-                             {{dofmaps[0], dofmaps[1]}});
+      case IntegralType::cell:
+        for (int id : ids)
+        {
+          sparsitybuild::cells(pattern,
+                               {a.domain(type, id, cell_type_idx, *mesh0),
+                                a.domain(type, id, cell_type_idx, *mesh1)},
+                               {{dofmaps[0], dofmaps[1]}});
+        }
+        break;
+      case IntegralType::interior_facet:
+        for (int id : ids)
+        {
+          sparsitybuild::interior_facets(
+              pattern,
+              {extract_cells(a.domain(type, id, *mesh0)),
+               extract_cells(a.domain(type, id, *mesh1))},
+              {{dofmaps[0], dofmaps[1]}});
+        }
+        break;
+      case IntegralType::exterior_facet:
+        for (int id : ids)
+        {
+          sparsitybuild::cells(pattern,
+                               {extract_cells(a.domain(type, id, *mesh0)),
+                                extract_cells(a.domain(type, id, *mesh1))},
+                               {{dofmaps[0], dofmaps[1]}});
+        }
+        break;
+      default:
+        throw std::runtime_error("Unsupported integral type");
       }
-      break;
-    default:
-      throw std::runtime_error("Unsupported integral type");
     }
   }
 
   t0.stop();
-
-  return pattern;
 }
 
 /// Create an ElementDofLayout from a FiniteElement
@@ -323,7 +346,7 @@ std::vector<std::string> get_constant_names(const ufcx_form& ufcx_form);
 /// Use fem::create_form to create a fem::Form with coefficients and
 /// constants associated with the name/string.
 ///
-/// @param[in] ufcx_form The UFCx form.
+/// @param[in] ufcx_forms A list of UFCx forms, one for each cell type.
 /// @param[in] spaces Vector of function spaces. The number of spaces is
 /// equal to the rank of the form.
 /// @param[in] coefficients Coefficient fields in the form.
@@ -337,7 +360,7 @@ std::vector<std::string> get_constant_names(const ufcx_form& ufcx_form);
 /// @pre Each value in `subdomains` must be sorted by domain id.
 template <dolfinx::scalar T, std::floating_point U = scalar_value_type_t<T>>
 Form<T, U> create_form_factory(
-    const ufcx_form& ufcx_form,
+    const std::vector<std::reference_wrapper<const ufcx_form>>& ufcx_forms,
     const std::vector<std::shared_ptr<const FunctionSpace<U>>>& spaces,
     const std::vector<std::shared_ptr<const Function<T, U>>>& coefficients,
     const std::vector<std::shared_ptr<const Constant<T>>>& constants,
@@ -349,29 +372,38 @@ Form<T, U> create_form_factory(
                    std::span<const std::int32_t>>& entity_maps,
     std::shared_ptr<const mesh::Mesh<U>> mesh = nullptr)
 {
-  if (ufcx_form.rank != (int)spaces.size())
-    throw std::runtime_error("Wrong number of argument spaces for Form.");
-  if (ufcx_form.num_coefficients != (int)coefficients.size())
+  for (const ufcx_form& ufcx_form : ufcx_forms)
   {
-    throw std::runtime_error(
-        "Mismatch between number of expected and provided Form coefficients.");
-  }
-  if (ufcx_form.num_constants != (int)constants.size())
-  {
-    throw std::runtime_error(
-        "Mismatch between number of expected and provided Form constants.");
+    if (ufcx_form.rank != (int)spaces.size())
+      throw std::runtime_error("Wrong number of argument spaces for Form.");
+    if (ufcx_form.num_coefficients != (int)coefficients.size())
+    {
+      throw std::runtime_error("Mismatch between number of expected and "
+                               "provided Form coefficients.");
+    }
+    if (ufcx_form.num_constants != (int)constants.size())
+    {
+      throw std::runtime_error(
+          "Mismatch between number of expected and provided Form constants.");
+    }
   }
 
   // Check argument function spaces
-  for (std::size_t i = 0; i < spaces.size(); ++i)
+  for (std::size_t form_idx = 0; form_idx < ufcx_forms.size(); ++form_idx)
   {
-    assert(spaces[i]->element());
-    if (auto element_hash = ufcx_form.finite_element_hashes[i];
-        element_hash != 0
-        and element_hash != spaces[i]->element()->basix_element().hash())
+    for (std::size_t i = 0; i < spaces.size(); ++i)
     {
-      throw std::runtime_error("Cannot create form. Elements are different to "
-                               "those used to compile the form.");
+      assert(spaces[i]->elements(form_idx));
+      if (auto element_hash
+          = ufcx_forms[form_idx].get().finite_element_hashes[i];
+          element_hash != 0
+          and element_hash
+                  != spaces[i]->elements(form_idx)->basix_element().hash())
+      {
+        throw std::runtime_error(
+            "Cannot create form. Elements are different to "
+            "those used to compile the form.");
+      }
     }
   }
 
@@ -391,7 +423,10 @@ Form<T, U> create_form_factory(
   assert(topology);
   const int tdim = topology->dim();
 
-  const int* integral_offsets = ufcx_form.form_integral_offsets;
+  // NOTE: This assumes all forms in mixed-topology meshes have the same
+  // integral offsets. Since the UFL forms for each type of cell should be
+  // the same, I think this assumption is OK.
+  const int* integral_offsets = ufcx_forms[0].get().form_integral_offsets;
   std::vector<int> num_integrals_type(3);
   for (int i = 0; i < 3; ++i)
     num_integrals_type[i] = integral_offsets[i + 1] - integral_offsets[i];
@@ -413,267 +448,277 @@ Form<T, U> create_form_factory(
 
   // Attach cell kernels
   bool needs_facet_permutations = false;
-  std::vector<std::int32_t> default_cells;
   {
-    std::span<const int> ids(ufcx_form.form_integral_ids
+    std::vector<std::int32_t> default_cells;
+    std::span<const int> ids(ufcx_forms[0].get().form_integral_ids
                                  + integral_offsets[cell],
                              num_integrals_type[cell]);
     auto itg = integrals.insert({IntegralType::cell, {}});
     auto sd = subdomains.find(IntegralType::cell);
-    for (int i = 0; i < num_integrals_type[cell]; ++i)
+    for (std::size_t form_idx = 0; form_idx < ufcx_forms.size(); ++form_idx)
     {
-      const int id = ids[i];
-      ufcx_integral* integral
-          = ufcx_form.form_integrals[integral_offsets[cell] + i];
-      assert(integral);
-
-      // Build list of active coefficients
-      std::vector<int> active_coeffs;
-      for (int j = 0; j < ufcx_form.num_coefficients; ++j)
+      const ufcx_form& ufcx_form = ufcx_forms[form_idx];
+      for (int i = 0; i < num_integrals_type[cell]; ++i)
       {
-        if (integral->enabled_coefficients[j])
-          active_coeffs.push_back(j);
-      }
+        const int id = ids[i];
+        ufcx_integral* integral
+            = ufcx_form.form_integrals[integral_offsets[cell] + i];
+        assert(integral);
+
+        // Build list of active coefficients
+        std::vector<int> active_coeffs;
+        for (int j = 0; j < ufcx_form.num_coefficients; ++j)
+        {
+          if (integral->enabled_coefficients[j])
+            active_coeffs.push_back(j);
+        }
 
-      kern_t k = nullptr;
-      if constexpr (std::is_same_v<T, float>)
-        k = integral->tabulate_tensor_float32;
+        kern_t k = nullptr;
+        if constexpr (std::is_same_v<T, float>)
+          k = integral->tabulate_tensor_float32;
 #ifndef DOLFINX_NO_STDC_COMPLEX_KERNELS
-      else if constexpr (std::is_same_v<T, std::complex<float>>)
-      {
-        k = reinterpret_cast<void (*)(
-            T*, const T*, const T*,
-            const typename scalar_value_type<T>::value_type*, const int*,
-            const unsigned char*)>(integral->tabulate_tensor_complex64);
-      }
+        else if constexpr (std::is_same_v<T, std::complex<float>>)
+        {
+          k = reinterpret_cast<void (*)(
+              T*, const T*, const T*,
+              const typename scalar_value_type<T>::value_type*, const int*,
+              const unsigned char*)>(integral->tabulate_tensor_complex64);
+        }
 #endif // DOLFINX_NO_STDC_COMPLEX_KERNELS
-      else if constexpr (std::is_same_v<T, double>)
-        k = integral->tabulate_tensor_float64;
+        else if constexpr (std::is_same_v<T, double>)
+          k = integral->tabulate_tensor_float64;
 #ifndef DOLFINX_NO_STDC_COMPLEX_KERNELS
-      else if constexpr (std::is_same_v<T, std::complex<double>>)
-      {
-        k = reinterpret_cast<void (*)(
-            T*, const T*, const T*,
-            const typename scalar_value_type<T>::value_type*, const int*,
-            const unsigned char*)>(integral->tabulate_tensor_complex128);
-      }
+        else if constexpr (std::is_same_v<T, std::complex<double>>)
+        {
+          k = reinterpret_cast<void (*)(
+              T*, const T*, const T*,
+              const typename scalar_value_type<T>::value_type*, const int*,
+              const unsigned char*)>(integral->tabulate_tensor_complex128);
+        }
 #endif // DOLFINX_NO_STDC_COMPLEX_KERNELS
 
-      if (!k)
-      {
-        throw std::runtime_error(
-            "UFCx kernel function is NULL. Check requested types.");
-      }
+        if (!k)
+        {
+          throw std::runtime_error(
+              "UFCx kernel function is NULL. Check requested types.");
+        }
 
-      // Build list of entities to assemble over
-      if (id == -1)
-      {
-        // Default kernel, operates on all (owned) cells
-        assert(topology->index_map(tdim));
-        default_cells.resize(topology->index_map(tdim)->size_local(), 0);
-        std::iota(default_cells.begin(), default_cells.end(), 0);
-        itg.first->second.emplace_back(id, k, default_cells, active_coeffs);
-      }
-      else if (sd != subdomains.end())
-      {
-        // NOTE: This requires that pairs are sorted
-        auto it
-            = std::ranges::lower_bound(sd->second, id, std::less<>{},
-                                       [](const auto& a) { return a.first; });
-        if (it != sd->second.end() and it->first == id)
-          itg.first->second.emplace_back(id, k, it->second, active_coeffs);
-      }
+        // Build list of entities to assemble over
+        if (id == -1)
+        {
+          // Default kernel, operates on all (owned) cells
+          assert(topology->index_maps(tdim).at(form_idx));
+          default_cells.resize(
+              topology->index_maps(tdim).at(form_idx)->size_local(), 0);
+          std::iota(default_cells.begin(), default_cells.end(), 0);
+          itg.first->second.emplace_back(id, k, default_cells, active_coeffs);
+        }
+        else if (sd != subdomains.end())
+        {
+          // NOTE: This requires that pairs are sorted
+          auto it = std::ranges::lower_bound(sd->second, id, std::less<>{},
+                                             [](auto& a) { return a.first; });
+          if (it != sd->second.end() and it->first == id)
+            itg.first->second.emplace_back(id, k, it->second, active_coeffs);
+        }
 
-      if (integral->needs_facet_permutations)
-        needs_facet_permutations = true;
+        if (integral->needs_facet_permutations)
+          needs_facet_permutations = true;
+      }
     }
   }
 
   // Attach exterior facet kernels
   std::vector<std::int32_t> default_facets_ext;
   {
-    std::span<const int> ids(ufcx_form.form_integral_ids
+    std::span<const int> ids(ufcx_forms[0].get().form_integral_ids
                                  + integral_offsets[exterior_facet],
                              num_integrals_type[exterior_facet]);
     auto itg = integrals.insert({IntegralType::exterior_facet, {}});
     auto sd = subdomains.find(IntegralType::exterior_facet);
-    for (int i = 0; i < num_integrals_type[exterior_facet]; ++i)
+    for (std::size_t form_idx = 0; form_idx < ufcx_forms.size(); ++form_idx)
     {
-      const int id = ids[i];
-      ufcx_integral* integral
-          = ufcx_form.form_integrals[integral_offsets[exterior_facet] + i];
-      assert(integral);
-      std::vector<int> active_coeffs;
-      for (int j = 0; j < ufcx_form.num_coefficients; ++j)
+      const ufcx_form& ufcx_form = ufcx_forms[form_idx];
+      for (int i = 0; i < num_integrals_type[exterior_facet]; ++i)
       {
-        if (integral->enabled_coefficients[j])
-          active_coeffs.push_back(j);
-      }
+        const int id = ids[i];
+        ufcx_integral* integral
+            = ufcx_form.form_integrals[integral_offsets[exterior_facet] + i];
+        assert(integral);
+        std::vector<int> active_coeffs;
+        for (int j = 0; j < ufcx_form.num_coefficients; ++j)
+        {
+          if (integral->enabled_coefficients[j])
+            active_coeffs.push_back(j);
+        }
 
-      kern_t k = nullptr;
-      if constexpr (std::is_same_v<T, float>)
-        k = integral->tabulate_tensor_float32;
+        kern_t k = nullptr;
+        if constexpr (std::is_same_v<T, float>)
+          k = integral->tabulate_tensor_float32;
 #ifndef DOLFINX_NO_STDC_COMPLEX_KERNELS
-      else if constexpr (std::is_same_v<T, std::complex<float>>)
-      {
-        k = reinterpret_cast<void (*)(
-            T*, const T*, const T*,
-            const typename scalar_value_type<T>::value_type*, const int*,
-            const unsigned char*)>(integral->tabulate_tensor_complex64);
-      }
+        else if constexpr (std::is_same_v<T, std::complex<float>>)
+        {
+          k = reinterpret_cast<void (*)(
+              T*, const T*, const T*,
+              const typename scalar_value_type<T>::value_type*, const int*,
+              const unsigned char*)>(integral->tabulate_tensor_complex64);
+        }
 #endif // DOLFINX_NO_STDC_COMPLEX_KERNELS
-      else if constexpr (std::is_same_v<T, double>)
-        k = integral->tabulate_tensor_float64;
+        else if constexpr (std::is_same_v<T, double>)
+          k = integral->tabulate_tensor_float64;
 #ifndef DOLFINX_NO_STDC_COMPLEX_KERNELS
-      else if constexpr (std::is_same_v<T, std::complex<double>>)
-      {
-        k = reinterpret_cast<void (*)(
-            T*, const T*, const T*,
-            const typename scalar_value_type<T>::value_type*, const int*,
-            const unsigned char*)>(integral->tabulate_tensor_complex128);
-      }
+        else if constexpr (std::is_same_v<T, std::complex<double>>)
+        {
+          k = reinterpret_cast<void (*)(
+              T*, const T*, const T*,
+              const typename scalar_value_type<T>::value_type*, const int*,
+              const unsigned char*)>(integral->tabulate_tensor_complex128);
+        }
 #endif // DOLFINX_NO_STDC_COMPLEX_KERNELS
-      assert(k);
-
-      // Build list of entities to assembler over
-      const std::vector bfacets = mesh::exterior_facet_indices(*topology);
-      auto f_to_c = topology->connectivity(tdim - 1, tdim);
-      assert(f_to_c);
-      auto c_to_f = topology->connectivity(tdim, tdim - 1);
-      assert(c_to_f);
-      if (id == -1)
-      {
-        // Default kernel, operates on all (owned) exterior facets
-        default_facets_ext.reserve(2 * bfacets.size());
-        for (std::int32_t f : bfacets)
+        assert(k);
+
+        // Build list of entities to assembler over
+        const std::vector bfacets = mesh::exterior_facet_indices(*topology);
+        auto f_to_c = topology->connectivity(tdim - 1, tdim);
+        assert(f_to_c);
+        auto c_to_f = topology->connectivity(tdim, tdim - 1);
+        assert(c_to_f);
+        if (id == -1)
+        {
+          // Default kernel, operates on all (owned) exterior facets
+          default_facets_ext.reserve(2 * bfacets.size());
+          for (std::int32_t f : bfacets)
+          {
+            // There will only be one pair for an exterior facet integral
+            auto pair
+                = impl::get_cell_facet_pairs<1>(f, f_to_c->links(f), *c_to_f);
+            default_facets_ext.insert(default_facets_ext.end(), pair.begin(),
+                                      pair.end());
+          }
+          itg.first->second.emplace_back(id, k, default_facets_ext,
+                                         active_coeffs);
+        }
+        else if (sd != subdomains.end())
         {
-          // There will only be one pair for an exterior facet integral
-          auto pair
-              = impl::get_cell_facet_pairs<1>(f, f_to_c->links(f), *c_to_f);
-          default_facets_ext.insert(default_facets_ext.end(), pair.begin(),
-                                    pair.end());
+          // NOTE: This requires that pairs are sorted
+          auto it = std::ranges::lower_bound(sd->second, id, std::less<>{},
+                                             [](auto& a) { return a.first; });
+          if (it != sd->second.end() and it->first == id)
+            itg.first->second.emplace_back(id, k, it->second, active_coeffs);
         }
-        itg.first->second.emplace_back(id, k, default_facets_ext,
-                                       active_coeffs);
-      }
-      else if (sd != subdomains.end())
-      {
-        // NOTE: This requires that pairs are sorted
-        auto it
-            = std::ranges::lower_bound(sd->second, id, std::less<>{},
-                                       [](const auto& a) { return a.first; });
-        if (it != sd->second.end() and it->first == id)
-          itg.first->second.emplace_back(id, k, it->second, active_coeffs);
-      }
 
-      if (integral->needs_facet_permutations)
-        needs_facet_permutations = true;
+        if (integral->needs_facet_permutations)
+          needs_facet_permutations = true;
+      }
     }
   }
 
   // Attach interior facet kernels
   std::vector<std::int32_t> default_facets_int;
   {
-    std::span<const int> ids(ufcx_form.form_integral_ids
+    std::span<const int> ids(ufcx_forms[0].get().form_integral_ids
                                  + integral_offsets[interior_facet],
                              num_integrals_type[interior_facet]);
     auto itg = integrals.insert({IntegralType::interior_facet, {}});
     auto sd = subdomains.find(IntegralType::interior_facet);
-
-    // Create indicator for interprocess facets
-    std::vector<std::int8_t> interprocess_marker;
-    if (num_integrals_type[interior_facet] > 0)
+    for (std::size_t form_idx = 0; form_idx < ufcx_forms.size(); ++form_idx)
     {
-      assert(topology->index_map(tdim - 1));
-      const std::vector<std::int32_t>& interprocess_facets
-          = topology->interprocess_facets();
-      std::int32_t num_facets = topology->index_map(tdim - 1)->size_local()
-                                + topology->index_map(tdim - 1)->num_ghosts();
-      interprocess_marker.resize(num_facets, 0);
-      std::ranges::for_each(interprocess_facets, [&interprocess_marker](auto f)
-                            { interprocess_marker[f] = 1; });
-    }
-
-    for (int i = 0; i < num_integrals_type[interior_facet]; ++i)
-    {
-      const int id = ids[i];
-      ufcx_integral* integral
-          = ufcx_form.form_integrals[integral_offsets[interior_facet] + i];
-      assert(integral);
-      std::vector<int> active_coeffs;
-      for (int j = 0; j < ufcx_form.num_coefficients; ++j)
+      const ufcx_form& ufcx_form = ufcx_forms[form_idx];
+      // Create indicator for interprocess facets
+      std::vector<std::int8_t> interprocess_marker;
+      if (num_integrals_type[interior_facet] > 0)
       {
-        if (integral->enabled_coefficients[j])
-          active_coeffs.push_back(j);
+        assert(topology->index_map(tdim - 1));
+        const std::vector<std::int32_t>& interprocess_facets
+            = topology->interprocess_facets();
+        std::int32_t num_facets = topology->index_map(tdim - 1)->size_local()
+                                  + topology->index_map(tdim - 1)->num_ghosts();
+        interprocess_marker.resize(num_facets, 0);
+        std::ranges::for_each(interprocess_facets,
+                              [&interprocess_marker](auto f)
+                              { interprocess_marker[f] = 1; });
       }
 
-      kern_t k = nullptr;
-      if constexpr (std::is_same_v<T, float>)
-        k = integral->tabulate_tensor_float32;
-#ifndef DOLFINX_NO_STDC_COMPLEX_KERNELS
-      else if constexpr (std::is_same_v<T, std::complex<float>>)
+      for (int i = 0; i < num_integrals_type[interior_facet]; ++i)
       {
-        k = reinterpret_cast<void (*)(
-            T*, const T*, const T*,
-            const typename scalar_value_type<T>::value_type*, const int*,
-            const unsigned char*)>(integral->tabulate_tensor_complex64);
-      }
+        const int id = ids[i];
+        ufcx_integral* integral
+            = ufcx_form.form_integrals[integral_offsets[interior_facet] + i];
+        assert(integral);
+        std::vector<int> active_coeffs;
+        for (int j = 0; j < ufcx_form.num_coefficients; ++j)
+        {
+          if (integral->enabled_coefficients[j])
+            active_coeffs.push_back(j);
+        }
+
+        kern_t k = nullptr;
+        if constexpr (std::is_same_v<T, float>)
+          k = integral->tabulate_tensor_float32;
+#ifndef DOLFINX_NO_STDC_COMPLEX_KERNELS
+        else if constexpr (std::is_same_v<T, std::complex<float>>)
+        {
+          k = reinterpret_cast<void (*)(
+              T*, const T*, const T*,
+              const typename scalar_value_type<T>::value_type*, const int*,
+              const unsigned char*)>(integral->tabulate_tensor_complex64);
+        }
 #endif // DOLFINX_NO_STDC_COMPLEX_KERNELS
-      else if constexpr (std::is_same_v<T, double>)
-        k = integral->tabulate_tensor_float64;
+        else if constexpr (std::is_same_v<T, double>)
+          k = integral->tabulate_tensor_float64;
 #ifndef DOLFINX_NO_STDC_COMPLEX_KERNELS
-      else if constexpr (std::is_same_v<T, std::complex<double>>)
-      {
-        k = reinterpret_cast<void (*)(
-            T*, const T*, const T*,
-            const typename scalar_value_type<T>::value_type*, const int*,
-            const unsigned char*)>(integral->tabulate_tensor_complex128);
-      }
+        else if constexpr (std::is_same_v<T, std::complex<double>>)
+        {
+          k = reinterpret_cast<void (*)(
+              T*, const T*, const T*,
+              const typename scalar_value_type<T>::value_type*, const int*,
+              const unsigned char*)>(integral->tabulate_tensor_complex128);
+        }
 #endif // DOLFINX_NO_STDC_COMPLEX_KERNELS
-      assert(k);
-
-      // Build list of entities to assembler over
-      auto f_to_c = topology->connectivity(tdim - 1, tdim);
-      assert(f_to_c);
-      auto c_to_f = topology->connectivity(tdim, tdim - 1);
-      assert(c_to_f);
-      if (id == -1)
-      {
-        // Default kernel, operates on all (owned) interior facets
-        assert(topology->index_map(tdim - 1));
-        std::int32_t num_facets = topology->index_map(tdim - 1)->size_local();
-        default_facets_int.reserve(4 * num_facets);
-        for (std::int32_t f = 0; f < num_facets; ++f)
+        assert(k);
+
+        // Build list of entities to assembler over
+        auto f_to_c = topology->connectivity(tdim - 1, tdim);
+        assert(f_to_c);
+        auto c_to_f = topology->connectivity(tdim, tdim - 1);
+        assert(c_to_f);
+        if (id == -1)
         {
-          if (f_to_c->num_links(f) == 2)
+          // Default kernel, operates on all (owned) interior facets
+          assert(topology->index_map(tdim - 1));
+          std::int32_t num_facets = topology->index_map(tdim - 1)->size_local();
+          default_facets_int.reserve(4 * num_facets);
+          for (std::int32_t f = 0; f < num_facets; ++f)
           {
-            auto pairs
-                = impl::get_cell_facet_pairs<2>(f, f_to_c->links(f), *c_to_f);
-            default_facets_int.insert(default_facets_int.end(), pairs.begin(),
-                                      pairs.end());
-          }
-          else if (interprocess_marker[f])
-          {
-            throw std::runtime_error(
-                "Cannot compute interior facet integral over interprocess "
-                "facet. Please use ghost mode shared facet when creating the "
-                "mesh");
+            if (f_to_c->num_links(f) == 2)
+            {
+              auto pairs
+                  = impl::get_cell_facet_pairs<2>(f, f_to_c->links(f), *c_to_f);
+              default_facets_int.insert(default_facets_int.end(), pairs.begin(),
+                                        pairs.end());
+            }
+            else if (interprocess_marker[f])
+            {
+              throw std::runtime_error(
+                  "Cannot compute interior facet integral over interprocess "
+                  "facet. Please use ghost mode shared facet when creating the "
+                  "mesh");
+            }
           }
+          itg.first->second.emplace_back(id, k, default_facets_int,
+                                         active_coeffs);
+        }
+        else if (sd != subdomains.end())
+        {
+          auto it = std::ranges::lower_bound(sd->second, id, std::less{},
+                                             [](auto& a) { return a.first; });
+          if (it != sd->second.end() and it->first == id)
+            itg.first->second.emplace_back(id, k, it->second, active_coeffs);
         }
-        itg.first->second.emplace_back(id, k, default_facets_int,
-                                       active_coeffs);
-      }
-      else if (sd != subdomains.end())
-      {
-        auto it
-            = std::ranges::lower_bound(sd->second, id, std::less<>{},
-                                       [](const auto& a) { return a.first; });
-        if (it != sd->second.end() and it->first == id)
-          itg.first->second.emplace_back(id, k, it->second, active_coeffs);
-      }
 
-      if (integral->needs_facet_permutations)
-        needs_facet_permutations = true;
+        if (integral->needs_facet_permutations)
+          needs_facet_permutations = true;
+      }
     }
   }
 
@@ -744,7 +789,7 @@ Form<T, U> create_form(
       throw std::runtime_error("Form constant \"" + name + "\" not provided.");
   }
 
-  return create_form_factory(ufcx_form, spaces, coeff_map, const_map,
+  return create_form_factory({ufcx_form}, spaces, coeff_map, const_map,
                              subdomains, entity_maps, mesh);
 }
 
diff --git a/cpp/dolfinx/mesh/Geometry.h b/cpp/dolfinx/mesh/Geometry.h
index b5bffbc0dd5..ba628ad2fa3 100644
--- a/cpp/dolfinx/mesh/Geometry.h
+++ b/cpp/dolfinx/mesh/Geometry.h
@@ -269,7 +269,7 @@ create_geometry(
 
   //  Build 'geometry' dofmap on the topology
   auto [_dof_index_map, bs, dofmaps]
-      = fem::build_dofmap_data(topology.index_map(topology.dim())->comm(),
+      = fem::build_dofmap_data(topology.index_maps(topology.dim())[0]->comm(),
                                topology, dof_layouts, reorder_fn);
   auto dof_index_map
       = std::make_shared<common::IndexMap>(std::move(_dof_index_map));
diff --git a/cpp/dolfinx/mesh/Topology.cpp b/cpp/dolfinx/mesh/Topology.cpp
index fa7f27fefb8..d11378b71f8 100644
--- a/cpp/dolfinx/mesh/Topology.cpp
+++ b/cpp/dolfinx/mesh/Topology.cpp
@@ -768,7 +768,17 @@ const std::vector<CellType>& Topology::entity_types(int dim) const
 //-----------------------------------------------------------------------------
 mesh::CellType Topology::cell_type() const
 {
-  return _entity_types.back().at(0);
+  std::vector<CellType> cell_types = entity_types(dim());
+  if (cell_types.size() > 1)
+    throw std::runtime_error(
+        "Multiple cell types of this dimension. Call cell_types "
+        "instead.");
+  return cell_types.front();
+}
+//-----------------------------------------------------------------------------
+std::vector<mesh::CellType> Topology::cell_types() const
+{
+  return entity_types(dim());
 }
 //-----------------------------------------------------------------------------
 std::vector<std::shared_ptr<const common::IndexMap>>
@@ -786,6 +796,9 @@ Topology::index_maps(int dim) const
 //-----------------------------------------------------------------------------
 std::shared_ptr<const common::IndexMap> Topology::index_map(int dim) const
 {
+  if (_entity_types[dim].size() > 1)
+    throw std::runtime_error(
+        "Multiple index maps of this dimension. Call index_maps instead.");
   return this->index_maps(dim).at(0);
 }
 //-----------------------------------------------------------------------------
diff --git a/cpp/dolfinx/mesh/Topology.h b/cpp/dolfinx/mesh/Topology.h
index 7043bfa5e66..4966add5668 100644
--- a/cpp/dolfinx/mesh/Topology.h
+++ b/cpp/dolfinx/mesh/Topology.h
@@ -164,6 +164,10 @@ class Topology
   /// computed
   const std::vector<std::uint8_t>& get_facet_permutations() const;
 
+  /// @brief Get the types of cells in the topology
+  /// @return The cell types
+  std::vector<CellType> cell_types() const;
+
   /// @brief List of inter-process facets of a given type.
   ///
   /// "Inter-process" facets are facets that are connected (1) to a cell
@@ -194,7 +198,7 @@ class Topology
   /// existed.
   bool create_entities(int dim);
 
-  /// @brief Create connectivity between given pair of dimensions, `d0
+    /// @brief Create connectivity between given pair of dimensions, `d0
   /// -> d1`.
   /// @param[in] d0 Topological dimension.
   /// @param[in] d1 Topological dimension.
diff --git a/python/demo/demo_mixed-topology.py b/python/demo/demo_mixed-topology.py
new file mode 100644
index 00000000000..88b935bae36
--- /dev/null
+++ b/python/demo/demo_mixed-topology.py
@@ -0,0 +1,187 @@
+# ---
+# jupyter:
+#   jupytext:
+#     text_representation:
+#       extension: .py
+#       format_name: light
+#       format_version: '1.5'
+#       jupytext_version: 1.13.6
+# ---
+
+# # Poisson equation
+#
+# This demo illustrates how to solve a simple Helmholtz problem on a
+# mixed-topology mesh.
+#
+# NOTE: Mixed-topology meshes are a work in progress and are not yet fully
+# supported in DOLFINx.
+
+from mpi4py import MPI
+
+import numpy as np
+from scipy.sparse.linalg import spsolve
+
+import basix
+import dolfinx.cpp as _cpp
+import ufl
+from dolfinx.cpp.mesh import GhostMode, create_cell_partitioner, create_mesh
+from dolfinx.fem import (
+    FunctionSpace,
+    assemble_matrix,
+    coordinate_element,
+    mixed_topology_form,
+)
+from dolfinx.io.utils import cell_perm_vtk
+from dolfinx.mesh import CellType, Mesh
+
+if MPI.COMM_WORLD.size > 1:
+    print("Not yet running in parallel")
+    exit(0)
+
+
+# Create a mixed-topology mesh
+nx = 16
+ny = 16
+nz = 16
+n_cells = nx * ny * nz
+
+cells: list = [[], []]
+orig_idx: list = [[], []]
+geom = []
+
+if MPI.COMM_WORLD.rank == 0:
+    idx = 0
+    for i in range(n_cells):
+        iz = i // (nx * ny)
+        j = i % (nx * ny)
+        iy = j // nx
+        ix = j % nx
+
+        v0 = (iz * (ny + 1) + iy) * (nx + 1) + ix
+        v1 = v0 + 1
+        v2 = v0 + (nx + 1)
+        v3 = v1 + (nx + 1)
+        v4 = v0 + (nx + 1) * (ny + 1)
+        v5 = v1 + (nx + 1) * (ny + 1)
+        v6 = v2 + (nx + 1) * (ny + 1)
+        v7 = v3 + (nx + 1) * (ny + 1)
+        if ix < nx / 2:
+            cells[0] += [v0, v1, v2, v3, v4, v5, v6, v7]
+            orig_idx[0] += [idx]
+            idx += 1
+        else:
+            cells[1] += [v0, v1, v2, v4, v5, v6]
+            orig_idx[1] += [idx]
+            idx += 1
+            cells[1] += [v1, v2, v3, v5, v6, v7]
+            orig_idx[1] += [idx]
+            idx += 1
+
+    n_points = (nx + 1) * (ny + 1) * (nz + 1)
+    sqxy = (nx + 1) * (ny + 1)
+    for v in range(n_points):
+        iz = v // sqxy
+        p = v % sqxy
+        iy = p // (nx + 1)
+        ix = p % (nx + 1)
+        geom += [[ix / nx, iy / ny, iz / nz]]
+
+cells_np = [np.array(c) for c in cells]
+geomx = np.array(geom, dtype=np.float64)
+hexahedron = coordinate_element(CellType.hexahedron, 1)
+prism = coordinate_element(CellType.prism, 1)
+
+part = create_cell_partitioner(GhostMode.none)
+mesh = create_mesh(
+    MPI.COMM_WORLD, cells_np, [hexahedron._cpp_object, prism._cpp_object], geomx, part
+)
+
+# Create elements and dofmaps for each cell type
+elements = [
+    basix.create_element(basix.ElementFamily.P, basix.CellType.hexahedron, 1),
+    basix.create_element(basix.ElementFamily.P, basix.CellType.prism, 1),
+]
+elements_cpp = [_cpp.fem.FiniteElement_float64(e._e, None, True) for e in elements]
+# NOTE: Both dofmaps have the same IndexMap, but different cell_dofs
+dofmaps = _cpp.fem.create_dofmaps(mesh.comm, mesh.topology, elements_cpp)
+
+# Create C++ function space
+V_cpp = _cpp.fem.FunctionSpace_float64(mesh, elements_cpp, dofmaps)
+
+# Create forms for each cell type.
+# FIXME This hack is required at the moment because UFL does not yet know about
+# mixed topology meshes.
+a = []
+for i, cell_name in enumerate(["hexahedron", "prism"]):
+    print(f"Creating form for {cell_name}")
+    element = basix.ufl.wrap_element(elements[i])
+    domain = ufl.Mesh(basix.ufl.element("Lagrange", cell_name, 1, shape=(3,)))
+    V = FunctionSpace(Mesh(mesh, domain), element, V_cpp)
+    u, v = ufl.TrialFunction(V), ufl.TestFunction(V)
+    k = 12.0
+    a += [(ufl.inner(ufl.grad(u), ufl.grad(v)) - k**2 * u * v) * ufl.dx]
+
+# Compile the form
+# FIXME: For the time being, since UFL doesn't understand mixed topology meshes,
+# we have to call mixed_topology_form instead of form.
+a_form = mixed_topology_form(a, dtype=np.float64)
+
+# Assemble the matrix
+A = assemble_matrix(a_form)
+
+# Solve
+A_scipy = A.to_scipy()
+b_scipy = np.ones(A_scipy.shape[1])
+
+x = spsolve(A_scipy, b_scipy)
+print(f"Solution vector norm {np.linalg.norm(x)}")
+
+# I/O
+# Save to XDMF
+xdmf = """<?xml version="1.0"?>
+<!DOCTYPE Xdmf SYSTEM "Xdmf.dtd" []>
+<Xdmf Version="3.0" xmlns:xi="https://www.w3.org/2001/XInclude">
+  <Domain>
+    <Grid Name="mesh" GridType="Collection" CollectionType="spatial">
+
+"""
+
+perm = [cell_perm_vtk(CellType.hexahedron, 8), cell_perm_vtk(CellType.prism, 6)]
+topologies = ["Hexahedron", "Wedge"]
+
+for j in range(2):
+    vtk_topology = []
+    geom_dm = mesh.geometry.dofmaps(j)
+    for c in geom_dm:
+        vtk_topology += list(c[perm[j]])
+    topology_type = topologies[j]
+
+    xdmf += f"""
+      <Grid Name="{topology_type}" GridType="Uniform">
+        <Topology TopologyType="{topology_type}">
+          <DataItem Dimensions="{geom_dm.shape[0]} {geom_dm.shape[1]}"
+           Precision="4" NumberType="Int" Format="XML">
+          {" ".join(str(val) for val in vtk_topology)}
+          </DataItem>
+        </Topology>
+        <Geometry GeometryType="XYZ" NumberType="float" Rank="2" Precision="8">
+          <DataItem Dimensions="{mesh.geometry.x.shape[0]} 3" Format="XML">
+            {" ".join(str(val) for val in mesh.geometry.x.flatten())}
+          </DataItem>
+        </Geometry>
+        <Attribute Name="u" Center="Node" NumberType="float" Precision="8">
+          <DataItem Dimensions="{len(x)}" Format="XML">
+            {" ".join(str(val) for val in x)}
+          </DataItem>
+       </Attribute>
+      </Grid>"""
+
+xdmf += """
+    </Grid>
+  </Domain>
+</Xdmf>
+"""
+
+fd = open("mixed-mesh.xdmf", "w")
+fd.write(xdmf)
+fd.close()
diff --git a/python/doc/source/demos.rst b/python/doc/source/demos.rst
index bbbcb889137..ec6b921beb5 100644
--- a/python/doc/source/demos.rst
+++ b/python/doc/source/demos.rst
@@ -37,6 +37,7 @@ PDEs (advanced)
    demos/demo_poisson_matrix_free.md
    demos/demo_pyamg.md
    demos/demo_hdg.md
+   demos/demo_mixed-topology.md
 
 
 Nonlinear problems
@@ -114,3 +115,4 @@ List of all demos
    demos/demo_mixed-poisson.md
    demos/demo_pyamg.md
    demos/demo_hdg.md
+   demos/demo_mixed-topology.md
diff --git a/python/dolfinx/fem/__init__.py b/python/dolfinx/fem/__init__.py
index 8430eee2b9a..ef528c317f4 100644
--- a/python/dolfinx/fem/__init__.py
+++ b/python/dolfinx/fem/__init__.py
@@ -9,12 +9,14 @@
 import numpy.typing as npt
 
 from dolfinx.cpp.fem import IntegralType, transpose_dofmap
+from dolfinx.cpp.fem import build_sparsity_pattern as _build_sparsity_pattern
 from dolfinx.cpp.fem import compute_integration_domains as _compute_integration_domains
 from dolfinx.cpp.fem import create_interpolation_data as _create_interpolation_data
 from dolfinx.cpp.fem import create_sparsity_pattern as _create_sparsity_pattern
 from dolfinx.cpp.fem import discrete_curl as _discrete_curl
 from dolfinx.cpp.fem import discrete_gradient as _discrete_gradient
 from dolfinx.cpp.fem import interpolation_matrix as _interpolation_matrix
+from dolfinx.cpp.la import SparsityPattern
 from dolfinx.cpp.mesh import Topology
 from dolfinx.fem.assemble import (
     apply_lifting,
@@ -41,6 +43,7 @@
     extract_function_spaces,
     form,
     form_cpp_class,
+    mixed_topology_form,
 )
 from dolfinx.fem.function import (
     Constant,
@@ -70,6 +73,23 @@ def create_sparsity_pattern(a: Form):
     return _create_sparsity_pattern(a._cpp_object)
 
 
+def build_sparsity_pattern(pattern: SparsityPattern, a: Form):
+    """Build a sparsity pattern from a bilinear form.
+
+    Args:
+        pattern: The sparsity pattern to add to
+        a: Bilinear form to build a sparsity pattern for.
+
+    Returns:
+        Sparsity pattern for the form ``a``.
+
+    Note:
+        The pattern is not finalised, i.e. the caller is responsible for
+        calling ``assemble`` on the sparsity pattern.
+    """
+    return _build_sparsity_pattern(pattern, a._cpp_object)
+
+
 def create_interpolation_data(
     V_to: FunctionSpace,
     V_from: FunctionSpace,
@@ -219,6 +239,7 @@ def compute_integration_domains(
     "functionspace",
     "locate_dofs_geometrical",
     "locate_dofs_topological",
+    "mixed_topology_form",
     "set_bc",
     "transpose_dofmap",
 ]
diff --git a/python/dolfinx/fem/forms.py b/python/dolfinx/fem/forms.py
index 335ef212594..b3a0ee7e914 100644
--- a/python/dolfinx/fem/forms.py
+++ b/python/dolfinx/fem/forms.py
@@ -37,7 +37,7 @@ class Form:
         _cpp.fem.Form_float32,
         _cpp.fem.Form_float64,
     ]
-    _code: typing.Optional[str]
+    _code: typing.Optional[typing.Union[str, list[str]]]
 
     def __init__(
         self,
@@ -48,8 +48,8 @@ def __init__(
             _cpp.fem.Form_float64,
         ],
         ufcx_form=None,
-        code: typing.Optional[str] = None,
-        module: typing.Optional[types.ModuleType] = None,
+        code: typing.Optional[typing.Union[str, list[str]]] = None,
+        module: typing.Optional[typing.Union[types.ModuleType, list[types.ModuleType]]] = None,
     ):
         """A finite element form.
 
@@ -76,12 +76,12 @@ def ufcx_form(self):
         return self._ufcx_form
 
     @property
-    def code(self) -> typing.Union[str, None]:
+    def code(self) -> typing.Union[str, list[str], None]:
         """C code strings."""
         return self._code
 
     @property
-    def module(self) -> typing.Union[types.ModuleType, None]:
+    def module(self) -> typing.Union[types.ModuleType, list[types.ModuleType], None]:
         """The CFFI module"""
         return self._module
 
@@ -197,6 +197,87 @@ def form_cpp_class(
 }
 
 
+def mixed_topology_form(
+    forms: typing.Iterable[ufl.Form],
+    dtype: npt.DTypeLike = default_scalar_type,
+    form_compiler_options: typing.Optional[dict] = None,
+    jit_options: typing.Optional[dict] = None,
+    entity_maps: typing.Optional[dict[Mesh, np.typing.NDArray[np.int32]]] = None,
+):
+    """
+    Create a mixed-topology from from an array of Forms.
+
+    # FIXME: This function is a temporary hack for mixed-topology meshes. It is needed
+    # because UFL does not know about mixed-topology meshes, so we need
+    # to pass a list of forms for each cell type.
+
+    Args:
+        form: A list of UFL forms. Each form should be the same, just
+            defined on different cell types.
+        dtype: Scalar type to use for the compiled form.
+        form_compiler_options: See :func:`ffcx_jit <dolfinx.jit.ffcx_jit>`
+        jit_options: See :func:`ffcx_jit <dolfinx.jit.ffcx_jit>`.
+        entity_maps: If any trial functions, test functions, or
+            coefficients in the form are not defined over the same mesh
+            as the integration domain, `entity_maps` must be supplied.
+            For each key (a mesh, different to the integration domain
+            mesh) a map should be provided relating the entities in the
+            integration domain mesh to the entities in the key mesh e.g.
+            for a key-value pair (msh, emap) in `entity_maps`, `emap[i]`
+            is the entity in `msh` corresponding to entity `i` in the
+            integration domain mesh.
+
+    Returns:
+        Compiled finite element Form.
+    """
+
+    if form_compiler_options is None:
+        form_compiler_options = dict()
+
+    form_compiler_options["scalar_type"] = dtype
+    ftype = form_cpp_class(dtype)
+
+    # Extract subdomain data from UFL form
+    sd = next(iter(forms)).subdomain_data()
+    (domain,) = list(sd.keys())  # Assuming single domain
+
+    # Check that subdomain data for each integral type is the same
+    for data in sd.get(domain).values():
+        assert all([d is data[0] for d in data if d is not None])
+
+    mesh = domain.ufl_cargo()
+
+    ufcx_forms = []
+    modules = []
+    codes = []
+    for form in forms:
+        ufcx_form, module, code = jit.ffcx_jit(
+            mesh.comm,
+            form,
+            form_compiler_options=form_compiler_options,
+            jit_options=jit_options,
+        )
+        ufcx_forms.append(ufcx_form)
+        modules.append(module)
+        codes.append(code)
+
+    # In a mixed-topology mesh, each form has the same C++ function space,
+    # so we can extract it from any of them
+    V = [arg.ufl_function_space()._cpp_object for arg in form.arguments()]
+
+    # TODO coeffs, constants, subdomains, entity_maps
+    f = ftype(
+        [module.ffi.cast("uintptr_t", module.ffi.addressof(ufcx_form)) for ufcx_form in ufcx_forms],
+        V,
+        [],
+        [],
+        {},
+        {},
+        mesh,
+    )
+    return Form(f, ufcx_forms, codes, modules)
+
+
 def form(
     form: typing.Union[ufl.Form, typing.Iterable[ufl.Form]],
     dtype: npt.DTypeLike = default_scalar_type,
@@ -303,7 +384,7 @@ def _form(form):
             _entity_maps = {msh._cpp_object: emap for (msh, emap) in entity_maps.items()}
 
         f = ftype(
-            module.ffi.cast("uintptr_t", module.ffi.addressof(ufcx_form)),
+            [module.ffi.cast("uintptr_t", module.ffi.addressof(ufcx_form))],
             V,
             coeffs,
             constants,
diff --git a/python/dolfinx/wrappers/assemble.cpp b/python/dolfinx/wrappers/assemble.cpp
index 107a5a24ea7..c897d383a77 100644
--- a/python/dolfinx/wrappers/assemble.cpp
+++ b/python/dolfinx/wrappers/assemble.cpp
@@ -209,8 +209,7 @@ void declare_assembly_functions(nb::module_& m)
       nb::arg("form"), "Pack coefficients for a Form.");
   m.def(
       "pack_constants",
-      [](const dolfinx::fem::Form<T, U>& form)
-      {
+      [](const dolfinx::fem::Form<T, U>& form) {
         return dolfinx_wrappers::as_nbarray(dolfinx::fem::pack_constants(form));
       },
       nb::arg("form"), "Pack constants for a Form.");
@@ -272,9 +271,11 @@ void declare_assembly_functions(nb::module_& m)
           _bcs.push_back(*bc);
         }
 
+        // Get index map block size. Note that mixed-topology meshes
+        // will have multiple DOF maps, but the block sizes are the same.
         const std::array<int, 2> data_bs
-            = {a.function_spaces().at(0)->dofmap()->index_map_bs(),
-               a.function_spaces().at(1)->dofmap()->index_map_bs()};
+            = {a.function_spaces().at(0)->dofmaps(0)->index_map_bs(),
+               a.function_spaces().at(1)->dofmaps(0)->index_map_bs()};
 
         if (data_bs[0] != data_bs[1])
         {
diff --git a/python/dolfinx/wrappers/fem.cpp b/python/dolfinx/wrappers/fem.cpp
index a80f0786410..b0c09ad05a5 100644
--- a/python/dolfinx/wrappers/fem.cpp
+++ b/python/dolfinx/wrappers/fem.cpp
@@ -74,6 +74,10 @@ void declare_function_space(nb::module_& m, std::string type)
                       std::shared_ptr<const dolfinx::fem::FiniteElement<T>>,
                       std::shared_ptr<const dolfinx::fem::DofMap>>(),
              nb::arg("mesh"), nb::arg("element"), nb::arg("dofmap"))
+        .def(nb::init<std::shared_ptr<const dolfinx::mesh::Mesh<T>>,
+                      std::vector<std::shared_ptr<const dolfinx::fem::FiniteElement<T>>>,
+                      std::vector<std::shared_ptr<const dolfinx::fem::DofMap>>>(),
+             nb::arg("mesh"), nb::arg("elements"), nb::arg("dofmaps"))
         .def("collapse", &dolfinx::fem::FunctionSpace<T>::collapse)
         .def("component", &dolfinx::fem::FunctionSpace<T>::component)
         .def("contains", &dolfinx::fem::FunctionSpace<T>::contains,
@@ -688,7 +692,7 @@ void declare_form(nb::module_& m, std::string type)
           nb::arg("entity_maps"), nb::arg("mesh").none())
       .def(
           "__init__",
-          [](dolfinx::fem::Form<T, U>* fp, std::uintptr_t form,
+          [](dolfinx::fem::Form<T, U>* fp, std::vector<std::uintptr_t> forms,
              const std::vector<
                  std::shared_ptr<const dolfinx::fem::FunctionSpace<U>>>& spaces,
              const std::vector<std::shared_ptr<
@@ -724,10 +728,12 @@ void declare_form(nb::module_& m, std::string type)
                 _entity_maps;
             for (auto& [msh, map] : entity_maps)
               _entity_maps.emplace(msh, std::span(map.data(), map.size()));
-            ufcx_form* p = reinterpret_cast<ufcx_form*>(form);
+            std::vector<std::reference_wrapper<const ufcx_form>> ps;
+            for (auto form : forms)
+                ps.push_back(*(reinterpret_cast<ufcx_form*>(form)));
             new (fp)
                 dolfinx::fem::Form<T, U>(dolfinx::fem::create_form_factory<T>(
-                    *p, spaces, coefficients, constants, sd, _entity_maps,
+                    ps, spaces, coefficients, constants, sd, _entity_maps,
                     mesh));
           },
           nb::arg("form"), nb::arg("spaces"), nb::arg("coefficients"),
@@ -807,7 +813,7 @@ void declare_form(nb::module_& m, std::string type)
         }
 
         ufcx_form* p = reinterpret_cast<ufcx_form*>(form);
-        return dolfinx::fem::create_form_factory<T>(*p, spaces, coefficients,
+        return dolfinx::fem::create_form_factory<T>({*p}, spaces, coefficients,
                                                     constants, sd, {}, mesh);
       },
       nb::arg("form"), nb::arg("spaces"), nb::arg("coefficients"),
@@ -858,9 +864,11 @@ void declare_form(nb::module_& m, std::string type)
       nb::arg("constants"), nb::arg("subdomains"), nb::arg("entity_maps"),
       nb::arg("mesh"), "Create Form from a pointer to ufcx_form.");
 
-  m.def("create_sparsity_pattern",
-        &dolfinx::fem ::create_sparsity_pattern<T, U>, nb::arg("a"),
-        "Create a sparsity pattern.");
+  m.def("create_sparsity_pattern", &dolfinx::fem::create_sparsity_pattern<T, U>,
+        nb::arg("a"), "Create a sparsity pattern.");
+
+  m.def("build_sparsity_pattern", &dolfinx::fem::build_sparsity_pattern<T, U>,
+        nb::arg("pattern"), nb::arg("a"), "Build a sparsity pattern.");
 }
 
 template <typename T>
diff --git a/python/dolfinx/wrappers/mesh.cpp b/python/dolfinx/wrappers/mesh.cpp
index f0198352adb..53199b6accb 100644
--- a/python/dolfinx/wrappers/mesh.cpp
+++ b/python/dolfinx/wrappers/mesh.cpp
@@ -650,6 +650,7 @@ void mesh(nb::module_& m)
       .def("index_map", &dolfinx::mesh::Topology::index_map, nb::arg("dim"))
       .def("index_maps", &dolfinx::mesh::Topology::index_maps, nb::arg("dim"))
       .def_prop_ro("cell_type", &dolfinx::mesh::Topology::cell_type)
+      .def_prop_ro("cell_types", &dolfinx::mesh::Topology::cell_types)
       .def_prop_ro(
           "entity_types",
           [](const dolfinx::mesh::Topology& self)
diff --git a/python/test/unit/fem/test_forms.py b/python/test/unit/fem/test_forms.py
index b1a2e5e09cf..f4ff8cccf33 100644
--- a/python/test/unit/fem/test_forms.py
+++ b/python/test/unit/fem/test_forms.py
@@ -89,7 +89,7 @@ def test_incorrect_element():
     )
 
     f = ftype(
-        module.ffi.cast("uintptr_t", module.ffi.addressof(ufcx_form)),
+        [module.ffi.cast("uintptr_t", module.ffi.addressof(ufcx_form))],
         [space._cpp_object, space._cpp_object],
         [],
         [],
@@ -101,7 +101,7 @@ def test_incorrect_element():
 
     with pytest.raises(RuntimeError):
         f = ftype(
-            module.ffi.cast("uintptr_t", module.ffi.addressof(ufcx_form)),
+            [module.ffi.cast("uintptr_t", module.ffi.addressof(ufcx_form))],
             [incorrect_space._cpp_object, incorrect_space._cpp_object],
             [],
             [],

From 372f8b76ec1653e6074c86fc4a93947200ef47d1 Mon Sep 17 00:00:00 2001
From: "Jack S. Hale" <mail@jackhale.co.uk>
Date: Fri, 17 Jan 2025 14:21:17 +0100
Subject: [PATCH 9/9] Switch to GitHub native runners (#3601)

* Try arm runner to build end user image.

* Looks OK for end user, also switch dev images.
---
 .github/workflows/docker-dev-test-env.yml | 2 +-
 .github/workflows/docker-end-user.yml     | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/docker-dev-test-env.yml b/.github/workflows/docker-dev-test-env.yml
index 58a1b8bf497..e092da65535 100644
--- a/.github/workflows/docker-dev-test-env.yml
+++ b/.github/workflows/docker-dev-test-env.yml
@@ -29,7 +29,7 @@ jobs:
       matrix:
         variant: ["test-env", "dev-env"]
         mpi: ["openmpi", "mpich"]
-        os: ["ubuntu-latest", "buildjet-4vcpu-ubuntu-2204-arm"]
+        os: ["ubuntu-latest", "ubuntu-24.04-arm"]
 
     runs-on: ${{ matrix.os }}
     steps:
diff --git a/.github/workflows/docker-end-user.yml b/.github/workflows/docker-end-user.yml
index eee7fa51a55..b1eb10d7ba3 100644
--- a/.github/workflows/docker-end-user.yml
+++ b/.github/workflows/docker-end-user.yml
@@ -64,7 +64,7 @@ jobs:
           - arch_tag: amd64
             os: ubuntu-latest
           - arch_tag: arm64
-            os: buildjet-8vcpu-ubuntu-2204-arm
+            os: ubuntu-24.04-arm
     runs-on: ${{ matrix.os }}
     steps:
       - name: Create tag without image name