From 77a2be903a6d9a33ae671bf9f069b36217415995 Mon Sep 17 00:00:00 2001
From: Sebastian Grimberg <sjg@amazon.com>
Date: Tue, 27 Jun 2023 11:26:38 -0700
Subject: [PATCH] Update MFEM commit and patches

---
 palace/deps/CMakeLists.txt                    |     6 +-
 .../patch/mfem/patch_bilinearform_marker.diff |   720 -
 .../deps/patch/mfem/patch_direct_solvers.diff |  4600 +-
 .../deps/patch/mfem/patch_hypre_blocks.diff   |    41 -
 palace/deps/patch/mfem/patch_mesh_part.diff   |    70 +-
 palace/deps/patch/mfem/patch_pa_libceed.diff  | 28397 -----------
 palace/deps/patch/mfem/patch_pa_prereq.diff   | 41184 ----------------
 palace/deps/patch/mfem/patch_submesh.diff     |   197 +-
 8 files changed, 1017 insertions(+), 74198 deletions(-)
 delete mode 100644 palace/deps/patch/mfem/patch_bilinearform_marker.diff
 delete mode 100644 palace/deps/patch/mfem/patch_hypre_blocks.diff
 delete mode 100644 palace/deps/patch/mfem/patch_pa_libceed.diff
 delete mode 100644 palace/deps/patch/mfem/patch_pa_prereq.diff

diff --git a/palace/deps/CMakeLists.txt b/palace/deps/CMakeLists.txt
index 90c0f4aa6..a9eece82a 100644
--- a/palace/deps/CMakeLists.txt
+++ b/palace/deps/CMakeLists.txt
@@ -19,7 +19,7 @@ set(PALACE_INTERNAL_EIGEN_URL
   "URL for internal Eigen build"
 )
 set(PALACE_INTERNAL_MFEM_GIT_TAG
-  "0f5d34b2b490819789d6c85546e17724ff37f021" CACHE STRING  # master @ 05/13/2023
+  "6470d3a7b2edf868aace2b9454d95d124ff98173" CACHE STRING  # master @ 06/26/2023
   "Git tag for internal MFEM build"
 )
 
@@ -108,14 +108,10 @@ if(PALACE_WITH_INTERNAL_MFEM)
 
   # A number of patches to MFEM for our uses
   set(PALACE_MFEM_PATCH_FILES
-    # "${CMAKE_CURRENT_SOURCE_DIR}/patch/mfem/patch_bilinearform_marker.diff"
     "${CMAKE_CURRENT_SOURCE_DIR}/patch/mfem/patch_mesh_part.diff"
     "${CMAKE_CURRENT_SOURCE_DIR}/patch/mfem/patch_mesh_vis.diff"
     "${CMAKE_CURRENT_SOURCE_DIR}/patch/mfem/patch_submesh.diff"
-    "${CMAKE_CURRENT_SOURCE_DIR}/patch/mfem/patch_hypre_blocks.diff"
     "${CMAKE_CURRENT_SOURCE_DIR}/patch/mfem/patch_direct_solvers.diff"
-    "${CMAKE_CURRENT_SOURCE_DIR}/patch/mfem/patch_pa_prereq.diff"
-    "${CMAKE_CURRENT_SOURCE_DIR}/patch/mfem/patch_pa_libceed.diff"
   )
 
   include(FetchContent)
diff --git a/palace/deps/patch/mfem/patch_bilinearform_marker.diff b/palace/deps/patch/mfem/patch_bilinearform_marker.diff
deleted file mode 100644
index 7263fe75d..000000000
--- a/palace/deps/patch/mfem/patch_bilinearform_marker.diff
+++ /dev/null
@@ -1,720 +0,0 @@
-diff --git a/fem/bilinearform.cpp b/fem/bilinearform.cpp
-index fad9717aa..f2fdd5c60 100644
---- a/fem/bilinearform.cpp
-+++ b/fem/bilinearform.cpp
-@@ -100,6 +100,7 @@ BilinearForm::BilinearForm (FiniteElementSpace * f, BilinearForm * bf, int ps)
- 
-    // Copy the pointers to the integrators
-    domain_integs = bf->domain_integs;
-+   domain_integs_marker = bf->domain_integs_marker;
- 
-    boundary_integs = bf->boundary_integs;
-    boundary_integs_marker = bf->boundary_integs_marker;
-@@ -425,7 +426,7 @@ void BilinearForm::Assemble(int skip_zeros)
- 
-       for (int i = 0; i < fes -> GetNE(); i++)
-       {
--         int elem_attr = fes->GetMesh()->GetAttribute(i);
-+         int elem_attr = mesh->GetAttribute(i);
-          doftrans = fes->GetElementVDofs(i, vdofs);
-          if (element_matrices)
-          {
-@@ -436,8 +437,8 @@ void BilinearForm::Assemble(int skip_zeros)
-             elmat.SetSize(0);
-             for (int k = 0; k < domain_integs.Size(); k++)
-             {
--               if ( domain_integs_marker[k] == NULL ||
--                    (*(domain_integs_marker[k]))[elem_attr-1] == 1)
-+               if (domain_integs_marker[k] == NULL ||
-+                   (*(domain_integs_marker[k]))[elem_attr-1] == 1)
-                {
-                   const FiniteElement &fe = *fes->GetFE(i);
-                   eltrans = fes->GetElementTransformation(i);
-@@ -1176,11 +1177,14 @@ MixedBilinearForm::MixedBilinearForm (FiniteElementSpace *tr_fes,
- 
-    // Copy the pointers to the integrators
-    domain_integs = mbf->domain_integs;
-+   domain_integs_marker = mbf->domain_integs_marker;
-+
-    boundary_integs = mbf->boundary_integs;
-+   boundary_integs_marker = mbf->boundary_integs_marker;
-+
-    trace_face_integs = mbf->trace_face_integs;
--   boundary_trace_face_integs = mbf->boundary_trace_face_integs;
- 
--   boundary_integs_marker = mbf->boundary_integs_marker;
-+   boundary_trace_face_integs = mbf->boundary_trace_face_integs;
-    boundary_trace_face_integs_marker = mbf->boundary_trace_face_integs_marker;
- 
-    assembly = AssemblyLevel::LEGACY;
-@@ -1303,6 +1307,14 @@ void MixedBilinearForm::GetBlocks(Array2D<SparseMatrix *> &blocks) const
- void MixedBilinearForm::AddDomainIntegrator (BilinearFormIntegrator * bfi)
- {
-    domain_integs.Append (bfi);
-+   domain_integs_marker.Append(NULL); // NULL marker means apply everywhere
-+}
-+
-+void MixedBilinearForm::AddDomainIntegrator (BilinearFormIntegrator * bfi,
-+                                             Array<int> &elem_marker)
-+{
-+   domain_integs.Append (bfi);
-+   domain_integs_marker.Append(&elem_marker);
- }
- 
- void MixedBilinearForm::AddBoundaryIntegrator (BilinearFormIntegrator * bfi)
-@@ -1337,7 +1349,7 @@ void MixedBilinearForm::AddBdrTraceFaceIntegrator(BilinearFormIntegrator *bfi,
-    boundary_trace_face_integs_marker.Append(&bdr_marker);
- }
- 
--void MixedBilinearForm::Assemble (int skip_zeros)
-+void MixedBilinearForm::Assemble(int skip_zeros)
- {
-    if (ext)
-    {
-@@ -1359,8 +1371,20 @@ void MixedBilinearForm::Assemble (int skip_zeros)
- 
-    if (domain_integs.Size())
-    {
-+      for (int k = 0; k < domain_integs.Size(); k++)
-+      {
-+         if (domain_integs_marker[k] != NULL)
-+         {
-+            MFEM_VERIFY(domain_integs_marker[k]->Size() ==
-+                        (mesh->attributes.Size() ? mesh->attributes.Max() : 0),
-+                        "invalid element marker for domain integrator #"
-+                        << k << ", counting from zero");
-+         }
-+      }
-+
-       for (int i = 0; i < test_fes -> GetNE(); i++)
-       {
-+         int elem_attr = mesh->GetAttribute(i);
-          dom_dof_trans = trial_fes -> GetElementVDofs (i, trial_vdofs);
-          ran_dof_trans = test_fes  -> GetElementVDofs (i, test_vdofs);
-          eltrans = test_fes -> GetElementTransformation (i);
-@@ -1369,10 +1393,14 @@ void MixedBilinearForm::Assemble (int skip_zeros)
-          elmat = 0.0;
-          for (int k = 0; k < domain_integs.Size(); k++)
-          {
--            domain_integs[k] -> AssembleElementMatrix2 (*trial_fes -> GetFE(i),
--                                                        *test_fes  -> GetFE(i),
--                                                        *eltrans, elemmat);
--            elmat += elemmat;
-+            if (domain_integs_marker[k] == NULL ||
-+                (*(domain_integs_marker[k]))[elem_attr-1] == 1)
-+            {
-+               domain_integs[k] -> AssembleElementMatrix2 (*trial_fes -> GetFE(i),
-+                                                           *test_fes  -> GetFE(i),
-+                                                           *eltrans, elemmat);
-+               elmat += elemmat;
-+            }
-          }
-          if (ran_dof_trans || dom_dof_trans)
-          {
-@@ -1895,41 +1923,56 @@ void DiscreteLinearOperator::Assemble(int skip_zeros)
-       return;
-    }
- 
--   Array<int> dom_vdofs, ran_vdofs;
--   ElementTransformation *T;
-+   ElementTransformation *eltrans;
-    DofTransformation * dom_dof_trans;
-    DofTransformation * ran_dof_trans;
--   const FiniteElement *dom_fe, *ran_fe;
--   DenseMatrix totelmat, elmat;
-+   DenseMatrix elmat;
-+
-+   Mesh *mesh = test_fes->GetMesh();
- 
-    if (mat == NULL)
-    {
-       mat = new SparseMatrix(height, width);
-    }
- 
--   if (domain_integs.Size() > 0)
-+   if (domain_integs.Size())
-    {
-+      for (int k = 0; k < domain_integs.Size(); k++)
-+      {
-+         if (domain_integs_marker[k] != NULL)
-+         {
-+            MFEM_VERIFY(domain_integs_marker[k]->Size() ==
-+                        (mesh->attributes.Size() ? mesh->attributes.Max() : 0),
-+                        "invalid element marker for domain integrator #"
-+                        << k << ", counting from zero");
-+         }
-+      }
-+
-       for (int i = 0; i < test_fes->GetNE(); i++)
-       {
--         dom_dof_trans = trial_fes->GetElementVDofs(i, dom_vdofs);
--         ran_dof_trans = test_fes->GetElementVDofs(i, ran_vdofs);
--         T = test_fes->GetElementTransformation(i);
--         dom_fe = trial_fes->GetFE(i);
--         ran_fe = test_fes->GetFE(i);
--
--         domain_integs[0]->AssembleElementMatrix2(*dom_fe, *ran_fe, *T,
--                                                  totelmat);
--         for (int j = 1; j < domain_integs.Size(); j++)
-+         int elem_attr = mesh->GetAttribute(i);
-+         dom_dof_trans = trial_fes->GetElementVDofs(i, trial_vdofs);
-+         ran_dof_trans = test_fes->GetElementVDofs(i, test_vdofs);
-+         eltrans = test_fes->GetElementTransformation(i);
-+
-+         elmat.SetSize(test_vdofs.Size(), trial_vdofs.Size());
-+         elmat = 0.0;
-+         for (int k = 0; k < domain_integs.Size(); k++)
-          {
--            domain_integs[j]->AssembleElementMatrix2(*dom_fe, *ran_fe, *T,
--                                                     elmat);
--            totelmat += elmat;
-+            if (domain_integs_marker[k] == NULL ||
-+                (*(domain_integs_marker[k]))[elem_attr-1] == 1)
-+            {
-+               domain_integs[k]->AssembleElementMatrix2(*trial_fes->GetFE(i),
-+                                                        *test_fes->GetFE(i),
-+                                                        *eltrans, elemmat);
-+               elmat += elemmat;
-+            }
-          }
-          if (ran_dof_trans || dom_dof_trans)
-          {
--            TransformPrimal(ran_dof_trans, dom_dof_trans, totelmat);
-+            TransformPrimal(ran_dof_trans, dom_dof_trans, elemmat);
-          }
--         mat->SetSubMatrix(ran_vdofs, dom_vdofs, totelmat, skip_zeros);
-+         mat->SetSubMatrix(test_vdofs, trial_vdofs, elemmat, skip_zeros);
-       }
-    }
- 
-@@ -1938,21 +1981,20 @@ void DiscreteLinearOperator::Assemble(int skip_zeros)
-       const int nfaces = test_fes->GetMesh()->GetNumFaces();
-       for (int i = 0; i < nfaces; i++)
-       {
--         trial_fes->GetFaceVDofs(i, dom_vdofs);
--         test_fes->GetFaceVDofs(i, ran_vdofs);
--         T = test_fes->GetMesh()->GetFaceTransformation(i);
--         dom_fe = trial_fes->GetFaceElement(i);
--         ran_fe = test_fes->GetFaceElement(i);
--
--         trace_face_integs[0]->AssembleElementMatrix2(*dom_fe, *ran_fe, *T,
--                                                      totelmat);
--         for (int j = 1; j < trace_face_integs.Size(); j++)
-+         trial_fes->GetFaceVDofs(i, trial_vdofs);
-+         test_fes->GetFaceVDofs(i, test_vdofs);
-+         eltrans = test_fes->GetMesh()->GetFaceTransformation(i);
-+
-+         elmat.SetSize(test_vdofs.Size(), trial_vdofs.Size());
-+         elmat = 0.0;
-+         for (int k = 0; k < trace_face_integs.Size(); k++)
-          {
--            trace_face_integs[j]->AssembleElementMatrix2(*dom_fe, *ran_fe, *T,
--                                                         elmat);
--            totelmat += elmat;
-+            trace_face_integs[k]->AssembleElementMatrix2(*trial_fes->GetFaceElement(i),
-+                                                         *test_fes->GetFaceElement(i),
-+                                                         *eltrans, elemmat);
-+            elmat += elemmat;
-          }
--         mat->SetSubMatrix(ran_vdofs, dom_vdofs, totelmat, skip_zeros);
-+         mat->SetSubMatrix(test_vdofs, trial_vdofs, elmat, skip_zeros);
-       }
-    }
- }
-diff --git a/fem/bilinearform.hpp b/fem/bilinearform.hpp
-index b23df9280..876bc1b17 100644
---- a/fem/bilinearform.hpp
-+++ b/fem/bilinearform.hpp
-@@ -100,7 +100,7 @@ protected:
-    /// Includes all by default.
-    /// 0 - ignore attribute
-    /// 1 - include attribute
--   Array<Array<int>*>             domain_integs_marker;
-+   Array<Array<int>*> domain_integs_marker; ///< Entries are not owned.
- 
-    /// Set of Boundary Integrators to be applied.
-    Array<BilinearFormIntegrator*> boundary_integs;
-@@ -716,10 +716,13 @@ protected:
- 
-    /// Domain integrators.
-    Array<BilinearFormIntegrator*> domain_integs;
-+   /// Entries are not owned.
-+   Array<Array<int>*> domain_integs_marker;
- 
-    /// Boundary integrators.
-    Array<BilinearFormIntegrator*> boundary_integs;
--   Array<Array<int>*> boundary_integs_marker; ///< Entries are not owned.
-+   /// Entries are not owned.
-+   Array<Array<int>*> boundary_integs_marker;
- 
-    /// Trace face (skeleton) integrators.
-    Array<BilinearFormIntegrator*> trace_face_integs;
-@@ -799,12 +802,16 @@ public:
-    /// Adds a domain integrator. Assumes ownership of @a bfi.
-    void AddDomainIntegrator(BilinearFormIntegrator *bfi);
- 
-+   /// Adds a domain integrator. Assumes ownership of @a bfi.
-+   void AddDomainIntegrator(BilinearFormIntegrator *bfi,
-+                            Array<int> &elem_marker);
-+
-    /// Adds a boundary integrator. Assumes ownership of @a bfi.
-    void AddBoundaryIntegrator(BilinearFormIntegrator *bfi);
- 
-    /// Adds a boundary integrator. Assumes ownership of @a bfi.
--   void AddBoundaryIntegrator (BilinearFormIntegrator * bfi,
--                               Array<int> &bdr_marker);
-+   void AddBoundaryIntegrator(BilinearFormIntegrator * bfi,
-+                              Array<int> &bdr_marker);
- 
-    /** @brief Add a trace face integrator. Assumes ownership of @a bfi.
- 
-@@ -814,14 +821,18 @@ public:
-    void AddTraceFaceIntegrator(BilinearFormIntegrator *bfi);
- 
-    /// Adds a boundary trace face integrator. Assumes ownership of @a bfi.
--   void AddBdrTraceFaceIntegrator (BilinearFormIntegrator * bfi);
-+   void AddBdrTraceFaceIntegrator(BilinearFormIntegrator * bfi);
- 
-    /// Adds a boundary trace face integrator. Assumes ownership of @a bfi.
--   void AddBdrTraceFaceIntegrator (BilinearFormIntegrator * bfi,
--                                   Array<int> &bdr_marker);
-+   void AddBdrTraceFaceIntegrator(BilinearFormIntegrator * bfi,
-+                                  Array<int> &bdr_marker);
- 
-    /// Access all integrators added with AddDomainIntegrator().
-    Array<BilinearFormIntegrator*> *GetDBFI() { return &domain_integs; }
-+   /** @brief Access all domain markers added with AddDomainIntegrator().
-+       If no marker was specified when the integrator was added, the
-+       corresponding pointer (to Array<int>) will be NULL. */
-+   Array<Array<int>*> *GetDBFI_Marker() { return &domain_integs_marker; }
- 
-    /// Access all integrators added with AddBoundaryIntegrator().
-    Array<BilinearFormIntegrator*> *GetBBFI() { return &boundary_integs; }
-@@ -1059,6 +1070,9 @@ public:
-    /// Adds a domain interpolator. Assumes ownership of @a di.
-    void AddDomainInterpolator(DiscreteInterpolator *di)
-    { AddDomainIntegrator(di); }
-+   void AddDomainInterpolator(DiscreteInterpolator *di,
-+                              Array<int> &elem_marker)
-+   { AddDomainIntegrator(di, elem_marker); }
- 
-    /// Adds a trace face interpolator. Assumes ownership of @a di.
-    void AddTraceFaceInterpolator(DiscreteInterpolator *di)
-@@ -1066,6 +1080,7 @@ public:
- 
-    /// Access all interpolators added with AddDomainInterpolator().
-    Array<BilinearFormIntegrator*> *GetDI() { return &domain_integs; }
-+   Array<Array<int>*> *GetDI_Marker() { return &domain_integs_marker; }
- 
-    /// Set the desired assembly level. The default is AssemblyLevel::FULL.
-    /** This method must be called before assembly. */
-diff --git a/fem/nonlinearform.cpp b/fem/nonlinearform.cpp
-index a01e83ebe..88271e234 100644
---- a/fem/nonlinearform.cpp
-+++ b/fem/nonlinearform.cpp
-@@ -97,12 +97,37 @@ double NonlinearForm::GetGridFunctionEnergy(const Vector &x) const
-    const FiniteElement *fe;
-    ElementTransformation *T;
-    DofTransformation *doftrans;
-+   Mesh *mesh = fes->GetMesh();
-    double energy = 0.0;
- 
-    if (dnfi.Size())
-    {
-+      // Which attributes need to be processed?
-+      Array<int> attr_marker(mesh->attributes.Size() ?
-+                             mesh->attributes.Max() : 0);
-+      attr_marker = 0;
-+      for (int k = 0; k < dnfi.Size(); k++)
-+      {
-+         if (dnfi_marker[k] == NULL)
-+         {
-+            attr_marker = 1;
-+            break;
-+         }
-+         Array<int> &marker = *dnfi_marker[k];
-+         MFEM_ASSERT(marker.Size() == attr_marker.Size(),
-+                     "invalid marker for domain integrator #"
-+                     << k << ", counting from zero");
-+         for (int i = 0; i < attr_marker.Size(); i++)
-+         {
-+            attr_marker[i] |= marker[i];
-+         }
-+      }
-+
-       for (int i = 0; i < fes->GetNE(); i++)
-       {
-+         const int attr = mesh->GetAttribute(i);
-+         if (attr_marker[attr-1] == 0) { continue; }
-+
-          fe = fes->GetFE(i);
-          doftrans = fes->GetElementVDofs(i, vdofs);
-          T = fes->GetElementTransformation(i);
-@@ -110,6 +135,9 @@ double NonlinearForm::GetGridFunctionEnergy(const Vector &x) const
-          if (doftrans) {doftrans->InvTransformPrimal(el_x); }
-          for (int k = 0; k < dnfi.Size(); k++)
-          {
-+            if (dnfi_marker[k] &&
-+                (*dnfi_marker[k])[attr-1] == 0) { continue; }
-+
-             energy += dnfi[k]->GetElementEnergy(*fe, *T, el_x);
-          }
-       }
-@@ -175,8 +203,32 @@ void NonlinearForm::Mult(const Vector &x, Vector &y) const
- 
-    if (dnfi.Size())
-    {
-+      // Which attributes need to be processed?
-+      Array<int> attr_marker(mesh->attributes.Size() ?
-+                             mesh->attributes.Max() : 0);
-+      attr_marker = 0;
-+      for (int k = 0; k < dnfi.Size(); k++)
-+      {
-+         if (dnfi_marker[k] == NULL)
-+         {
-+            attr_marker = 1;
-+            break;
-+         }
-+         Array<int> &marker = *dnfi_marker[k];
-+         MFEM_ASSERT(marker.Size() == attr_marker.Size(),
-+                     "invalid marker for domain integrator #"
-+                     << k << ", counting from zero");
-+         for (int i = 0; i < attr_marker.Size(); i++)
-+         {
-+            attr_marker[i] |= marker[i];
-+         }
-+      }
-+
-       for (int i = 0; i < fes->GetNE(); i++)
-       {
-+         const int attr = mesh->GetAttribute(i);
-+         if (attr_marker[attr-1] == 0) { continue; }
-+
-          fe = fes->GetFE(i);
-          doftrans = fes->GetElementVDofs(i, vdofs);
-          T = fes->GetElementTransformation(i);
-@@ -184,6 +236,9 @@ void NonlinearForm::Mult(const Vector &x, Vector &y) const
-          if (doftrans) {doftrans->InvTransformPrimal(el_x); }
-          for (int k = 0; k < dnfi.Size(); k++)
-          {
-+            if (dnfi_marker[k] &&
-+                (*dnfi_marker[k])[attr-1] == 0) { continue; }
-+
-             dnfi[k]->AssembleElementVector(*fe, *T, el_x, el_y);
-             if (doftrans) {doftrans->TransformDual(el_y); }
-             py.AddElementVector(vdofs, el_y);
-@@ -322,8 +377,32 @@ Operator &NonlinearForm::GetGradient(const Vector &x) const
- 
-    if (dnfi.Size())
-    {
-+      // Which attributes need to be processed?
-+      Array<int> attr_marker(mesh->attributes.Size() ?
-+                             mesh->attributes.Max() : 0);
-+      attr_marker = 0;
-+      for (int k = 0; k < dnfi.Size(); k++)
-+      {
-+         if (dnfi_marker[k] == NULL)
-+         {
-+            attr_marker = 1;
-+            break;
-+         }
-+         Array<int> &marker = *dnfi_marker[k];
-+         MFEM_ASSERT(marker.Size() == attr_marker.Size(),
-+                     "invalid marker for domain integrator #"
-+                     << k << ", counting from zero");
-+         for (int i = 0; i < attr_marker.Size(); i++)
-+         {
-+            attr_marker[i] |= marker[i];
-+         }
-+      }
-+
-       for (int i = 0; i < fes->GetNE(); i++)
-       {
-+         const int attr = mesh->GetAttribute(i);
-+         if (attr_marker[attr-1] == 0) { continue; }
-+
-          fe = fes->GetFE(i);
-          doftrans = fes->GetElementVDofs(i, vdofs);
-          T = fes->GetElementTransformation(i);
-@@ -331,6 +410,9 @@ Operator &NonlinearForm::GetGradient(const Vector &x) const
-          if (doftrans) {doftrans->InvTransformPrimal(el_x); }
-          for (int k = 0; k < dnfi.Size(); k++)
-          {
-+            if (dnfi_marker[k] &&
-+                (*dnfi_marker[k])[attr-1] == 0) { continue; }
-+
-             dnfi[k]->AssembleElementGrad(*fe, *T, el_x, elmat);
-             if (doftrans) { doftrans->TransformDual(elmat); }
-             Grad->AddSubMatrix(vdofs, vdofs, elmat, skip_zeros);
-@@ -561,13 +643,6 @@ BlockNonlinearForm::BlockNonlinearForm(Array<FiniteElementSpace *> &f) :
-    SetSpaces(f);
- }
- 
--void BlockNonlinearForm::AddBdrFaceIntegrator(BlockNonlinearFormIntegrator *nfi,
--                                              Array<int> &bdr_attr_marker)
--{
--   bfnfi.Append(nfi);
--   bfnfi_marker.Append(&bdr_attr_marker);
--}
--
- void BlockNonlinearForm::SetEssentialBC(
-    const Array<Array<int> *> &bdr_attr_is_ess, Array<Vector *> &rhs)
- {
-@@ -592,6 +667,7 @@ double BlockNonlinearForm::GetEnergyBlocked(const BlockVector &bx) const
-    Array<const FiniteElement *> fe(fes.Size());
-    ElementTransformation *T;
-    DofTransformation *doftrans;
-+   Mesh *mesh = fes[0]->GetMesh();
-    double energy = 0.0;
- 
-    for (int i=0; i<fes.Size(); ++i)
-@@ -601,8 +677,33 @@ double BlockNonlinearForm::GetEnergyBlocked(const BlockVector &bx) const
-    }
- 
-    if (dnfi.Size())
-+   {
-+      // Which attributes need to be processed?
-+      Array<int> attr_marker(mesh->attributes.Size() ?
-+                             mesh->attributes.Max() : 0);
-+      attr_marker = 0;
-+      for (int k = 0; k < dnfi.Size(); k++)
-+      {
-+         if (dnfi_marker[k] == NULL)
-+         {
-+            attr_marker = 1;
-+            break;
-+         }
-+         Array<int> &marker = *dnfi_marker[k];
-+         MFEM_ASSERT(marker.Size() == attr_marker.Size(),
-+                     "invalid marker for domain integrator #"
-+                     << k << ", counting from zero");
-+         for (int i = 0; i < attr_marker.Size(); i++)
-+         {
-+            attr_marker[i] |= marker[i];
-+         }
-+      }
-+
-       for (int i = 0; i < fes[0]->GetNE(); ++i)
-       {
-+         const int attr = mesh->GetAttribute(i);
-+         if (attr_marker[attr-1] == 0) { continue; }
-+
-          T = fes[0]->GetElementTransformation(i);
-          for (int s=0; s<fes.Size(); ++s)
-          {
-@@ -614,9 +715,13 @@ double BlockNonlinearForm::GetEnergyBlocked(const BlockVector &bx) const
- 
-          for (int k = 0; k < dnfi.Size(); ++k)
-          {
-+            if (dnfi_marker[k] &&
-+                (*dnfi_marker[k])[attr-1] == 0) { continue; }
-+
-             energy += dnfi[k]->GetElementEnergy(fe, *T, el_x_const);
-          }
-       }
-+   }
- 
-    // free the allocated memory
-    for (int i = 0; i < fes.Size(); ++i)
-@@ -656,6 +761,7 @@ void BlockNonlinearForm::MultBlocked(const BlockVector &bx,
-    Array<const FiniteElement *> fe2(fes.Size());
-    ElementTransformation *T;
-    Array<DofTransformation *> doftrans(fes.Size()); doftrans = nullptr;
-+   Mesh *mesh = fes[0]->GetMesh();
- 
-    by.UseDevice(true);
-    by = 0.0;
-@@ -670,8 +776,32 @@ void BlockNonlinearForm::MultBlocked(const BlockVector &bx,
- 
-    if (dnfi.Size())
-    {
-+      // Which attributes need to be processed?
-+      Array<int> attr_marker(mesh->attributes.Size() ?
-+                             mesh->attributes.Max() : 0);
-+      attr_marker = 0;
-+      for (int k = 0; k < dnfi.Size(); k++)
-+      {
-+         if (dnfi_marker[k] == NULL)
-+         {
-+            attr_marker = 1;
-+            break;
-+         }
-+         Array<int> &marker = *dnfi_marker[k];
-+         MFEM_ASSERT(marker.Size() == attr_marker.Size(),
-+                     "invalid marker for domain integrator #"
-+                     << k << ", counting from zero");
-+         for (int i = 0; i < attr_marker.Size(); i++)
-+         {
-+            attr_marker[i] |= marker[i];
-+         }
-+      }
-+
-       for (int i = 0; i < fes[0]->GetNE(); ++i)
-       {
-+         const int attr = mesh->GetAttribute(i);
-+         if (attr_marker[attr-1] == 0) { continue; }
-+
-          T = fes[0]->GetElementTransformation(i);
-          for (int s = 0; s < fes.Size(); ++s)
-          {
-@@ -683,6 +813,9 @@ void BlockNonlinearForm::MultBlocked(const BlockVector &bx,
- 
-          for (int k = 0; k < dnfi.Size(); ++k)
-          {
-+            if (dnfi_marker[k] &&
-+                (*dnfi_marker[k])[attr-1] == 0) { continue; }
-+
-             dnfi[k]->AssembleElementVector(fe, *T,
-                                            el_x_const, el_y);
- 
-@@ -698,7 +831,6 @@ void BlockNonlinearForm::MultBlocked(const BlockVector &bx,
- 
-    if (fnfi.Size())
-    {
--      Mesh *mesh = fes[0]->GetMesh();
-       FaceElementTransformations *tr;
- 
-       for (int i = 0; i < mesh->GetNumFaces(); ++i)
-@@ -736,8 +868,8 @@ void BlockNonlinearForm::MultBlocked(const BlockVector &bx,
- 
-    if (bfnfi.Size())
-    {
--      Mesh *mesh = fes[0]->GetMesh();
-       FaceElementTransformations *tr;
-+
-       // Which boundary attributes need to be processed?
-       Array<int> bdr_attr_marker(mesh->bdr_attributes.Size() ?
-                                  mesh->bdr_attributes.Max() : 0);
-@@ -858,6 +990,7 @@ void BlockNonlinearForm::ComputeGradientBlocked(const BlockVector &bx) const
-    Array<const FiniteElement *>fe2(fes.Size());
-    ElementTransformation * T;
-    Array<DofTransformation *> doftrans(fes.Size()); doftrans = nullptr;
-+   Mesh *mesh = fes[0]->GetMesh();
- 
-    for (int i=0; i<fes.Size(); ++i)
-    {
-@@ -888,8 +1021,32 @@ void BlockNonlinearForm::ComputeGradientBlocked(const BlockVector &bx) const
- 
-    if (dnfi.Size())
-    {
-+      // Which attributes need to be processed?
-+      Array<int> attr_marker(mesh->attributes.Size() ?
-+                             mesh->attributes.Max() : 0);
-+      attr_marker = 0;
-+      for (int k = 0; k < dnfi.Size(); k++)
-+      {
-+         if (dnfi_marker[k] == NULL)
-+         {
-+            attr_marker = 1;
-+            break;
-+         }
-+         Array<int> &marker = *dnfi_marker[k];
-+         MFEM_ASSERT(marker.Size() == attr_marker.Size(),
-+                     "invalid marker for domain integrator #"
-+                     << k << ", counting from zero");
-+         for (int i = 0; i < attr_marker.Size(); i++)
-+         {
-+            attr_marker[i] |= marker[i];
-+         }
-+      }
-+
-       for (int i = 0; i < fes[0]->GetNE(); ++i)
-       {
-+         const int attr = mesh->GetAttribute(i);
-+         if (attr_marker[attr-1] == 0) { continue; }
-+
-          T = fes[0]->GetElementTransformation(i);
-          for (int s = 0; s < fes.Size(); ++s)
-          {
-@@ -901,6 +1058,9 @@ void BlockNonlinearForm::ComputeGradientBlocked(const BlockVector &bx) const
- 
-          for (int k = 0; k < dnfi.Size(); ++k)
-          {
-+            if (dnfi_marker[k] &&
-+                (*dnfi_marker[k])[attr-1] == 0) { continue; }
-+
-             dnfi[k]->AssembleElementGrad(fe, *T, el_x_const, elmats);
- 
-             for (int j=0; j<fes.Size(); ++j)
-@@ -923,7 +1083,6 @@ void BlockNonlinearForm::ComputeGradientBlocked(const BlockVector &bx) const
-    if (fnfi.Size())
-    {
-       FaceElementTransformations *tr;
--      Mesh *mesh = fes[0]->GetMesh();
- 
-       for (int i = 0; i < mesh->GetNumFaces(); ++i)
-       {
-@@ -960,7 +1119,6 @@ void BlockNonlinearForm::ComputeGradientBlocked(const BlockVector &bx) const
-    if (bfnfi.Size())
-    {
-       FaceElementTransformations *tr;
--      Mesh *mesh = fes[0]->GetMesh();
- 
-       // Which boundary attributes need to be processed?
-       Array<int> bdr_attr_marker(mesh->bdr_attributes.Size() ?
-diff --git a/fem/nonlinearform.hpp b/fem/nonlinearform.hpp
-index d15d09e04..77da539f7 100644
---- a/fem/nonlinearform.hpp
-+++ b/fem/nonlinearform.hpp
-@@ -37,6 +37,7 @@ protected:
- 
-    /// Set of Domain Integrators to be assembled (added).
-    Array<NonlinearFormIntegrator*> dnfi; // owned
-+   Array<Array<int>*>              dnfi_marker; // not owned
- 
-    /// Set of interior face Integrators to be assembled (added).
-    Array<NonlinearFormIntegrator*> fnfi; // owned
-@@ -108,7 +109,12 @@ public:
- 
-    /// Adds new Domain Integrator.
-    void AddDomainIntegrator(NonlinearFormIntegrator *nlfi)
--   { dnfi.Append(nlfi); }
-+   { dnfi.Append(nlfi); dnfi_marker.Append(NULL); }
-+
-+   /// Adds new Domain Integrator, restricted to specific attributes.
-+   void AddDomainIntegrator(NonlinearFormIntegrator *nlfi,
-+                            Array<int> &elem_marker)
-+   { dnfi.Append(nlfi); dnfi_marker.Append(&elem_marker); }
- 
-    /// Access all integrators added with AddDomainIntegrator().
-    Array<NonlinearFormIntegrator*> *GetDNFI() { return &dnfi; }
-@@ -227,13 +233,14 @@ protected:
- 
-    /// Set of Domain Integrators to be assembled (added).
-    Array<BlockNonlinearFormIntegrator*> dnfi;
-+   Array<Array<int>*>                   dnfi_marker;
- 
-    /// Set of interior face Integrators to be assembled (added).
-    Array<BlockNonlinearFormIntegrator*> fnfi;
- 
-    /// Set of Boundary Face Integrators to be assembled (added).
-    Array<BlockNonlinearFormIntegrator*> bfnfi;
--   Array<Array<int>*>           bfnfi_marker;
-+   Array<Array<int>*>                   bfnfi_marker;
- 
-    /** Auxiliary block-vectors for wrapping input and output vectors or holding
-        GridFunction-like block-vector data (e.g. in parallel). */
-@@ -298,7 +305,12 @@ public:
- 
-    /// Adds new Domain Integrator.
-    void AddDomainIntegrator(BlockNonlinearFormIntegrator *nlfi)
--   { dnfi.Append(nlfi); }
-+   { dnfi.Append(nlfi); dnfi_marker.Append(NULL); }
-+
-+   /// Adds new Domain Integrator, restricted to specific attributes.
-+   void AddDomainIntegrator(BlockNonlinearFormIntegrator *nlfi,
-+                            Array<int> &elem_marker)
-+   { dnfi.Append(nlfi); dnfi_marker.Append(&elem_marker); }
- 
-    /// Adds new Interior Face Integrator.
-    void AddInteriorFaceIntegrator(BlockNonlinearFormIntegrator *nlfi)
-@@ -311,7 +323,8 @@ public:
-    /** @brief Adds new Boundary Face Integrator, restricted to specific boundary
-        attributes. */
-    void AddBdrFaceIntegrator(BlockNonlinearFormIntegrator *nlfi,
--                             Array<int> &bdr_marker);
-+                             Array<int> &bdr_marker)
-+   { bfnfi.Append(nlfi); bfnfi_marker.Append(&bdr_marker); }
- 
-    virtual void SetEssentialBC(const Array<Array<int> *>&bdr_attr_is_ess,
-                                Array<Vector *> &rhs);
diff --git a/palace/deps/patch/mfem/patch_direct_solvers.diff b/palace/deps/patch/mfem/patch_direct_solvers.diff
index 98c0b7c53..cbe9eba3e 100644
--- a/palace/deps/patch/mfem/patch_direct_solvers.diff
+++ b/palace/deps/patch/mfem/patch_direct_solvers.diff
@@ -1,24 +1,3 @@
-diff --git a/CMakeLists.txt b/CMakeLists.txt
-index 32112b549..e9e6ae0b7 100644
---- a/CMakeLists.txt
-+++ b/CMakeLists.txt
-@@ -82,7 +82,7 @@ if (MFEM_USE_CONDUIT OR
-    #  * find_package(PETSc REQUIRED)
-    set(XSDK_ENABLE_C ON)
- endif()
--if (MFEM_USE_STRUMPACK)
-+if (MFEM_USE_STRUMPACK OR MFEM_USE_MUMPS)
-   # Just needed to find the MPI_Fortran libraries to link with
-   set(XSDK_ENABLE_Fortran ON)
- endif()
-@@ -333,6 +333,7 @@ endif()
- if (MFEM_USE_MUMPS)
-   if (MFEM_USE_MPI)
-     find_package(MUMPS REQUIRED mumps_common pord)
-+    set(MFEM_MUMPS_VERSION ${MUMPS_VERSION})
-   else()
-     message(FATAL_ERROR " *** MUMPS requires that MPI be enabled.")
-   endif()
 diff --git a/INSTALL b/INSTALL
 index cb092cc1b..9a7deaa43 100644
 --- a/INSTALL
@@ -33,332 +12,11 @@ index cb092cc1b..9a7deaa43 100644
    URL: http://portal.nersc.gov/project/sparse/strumpack
    Options: STRUMPACK_OPT, STRUMPACK_LIB.
    Versions: STRUMPACK >= 3.0.0.
-diff --git a/config/cmake/config.hpp.in b/config/cmake/config.hpp.in
-index 7e820088a..adde511fb 100644
---- a/config/cmake/config.hpp.in
-+++ b/config/cmake/config.hpp.in
-@@ -80,96 +80,101 @@
- // Internal MFEM option: enable group/batch allocation for some small objects.
- #cmakedefine MFEM_USE_MEMALLOC
- 
-+// Which library functions to use in class StopWatch for measuring time.
-+// For a list of the available options, see INSTALL.
-+// If not defined, an option is selected automatically.
-+#cmakedefine MFEM_TIMER_TYPE @MFEM_TIMER_TYPE@
-+
-+// Enable MFEM functionality based on the SUNDIALS libraries.
-+#cmakedefine MFEM_USE_SUNDIALS
-+
- // Enable MFEM functionality based on the SuiteSparse library.
- #cmakedefine MFEM_USE_SUITESPARSE
- 
- // Enable MFEM functionality based on the SuperLU_DIST library.
- #cmakedefine MFEM_USE_SUPERLU
-+#cmakedefine MFEM_USE_SUPERLU5
- 
- // Enable MFEM functionality based on the MUMPS library.
- #cmakedefine MFEM_USE_MUMPS
-+#cmakedefine MFEM_MUMPS_VERSION @MFEM_MUMPS_VERSION@
- 
- // Enable MFEM functionality based on the STRUMPACK library.
- #cmakedefine MFEM_USE_STRUMPACK
- 
--// Enable functionality based on the Ginkgo library
-+// Enable functionality based on the Ginkgo library.
- #cmakedefine MFEM_USE_GINKGO
- 
--// Enable MFEM functionality based on the AmgX library
-+// Enable MFEM functionality based on the AmgX library.
- #cmakedefine MFEM_USE_AMGX
- 
--// Enable MFEM functionality based on the GnuTLS library
-+// Enable secure socket streams based on the GNUTLS library.
- #cmakedefine MFEM_USE_GNUTLS
- 
--// Enable MFEM functionality based on the GSLIB library
--#cmakedefine MFEM_USE_GSLIB
--
--// Enable MFEM functionality based on the NetCDF library
--#cmakedefine MFEM_USE_NETCDF
--
--// Enable MFEM functionality based on the PETSc library
--#cmakedefine MFEM_USE_PETSC
--
--// Enable MFEM functionality based on the SLEPc library
--#cmakedefine MFEM_USE_SLEPC
--
--// Enable MFEM functionality based on the Sidre library
-+// Enable Sidre support.
- #cmakedefine MFEM_USE_SIDRE
- 
--// Enable the use of SIMD in the high performance templated classes
-+// Enable the use of SIMD in the high performance templated classes.
- #cmakedefine MFEM_USE_SIMD
- 
--// Enable MFEM functionality based on the FMS library
-+// Enable FMS support.
- #cmakedefine MFEM_USE_FMS
- 
--// Enable MFEM functionality based on Conduit
-+// Enable Conduit support.
- #cmakedefine MFEM_USE_CONDUIT
- 
--// Enable MFEM functionality based on the PUMI library
-+// Enable functionality based on the NetCDF library (reading CUBIT files).
-+#cmakedefine MFEM_USE_NETCDF
-+
-+// Enable functionality based on the PETSc library.
-+#cmakedefine MFEM_USE_PETSC
-+
-+// Enable functionality based on the SLEPc library.
-+#cmakedefine MFEM_USE_SLEPC
-+
-+// Enable functionality based on the MPFR library.
-+#cmakedefine MFEM_USE_MPFR
-+
-+// Enable MFEM functionality based on the PUMI library.
- #cmakedefine MFEM_USE_PUMI
- 
--// Enable MFEM functionality based on the Moonolith library
-+// Enable Moonolith-based general interpolation between finite element spaces.
- #cmakedefine MFEM_USE_MOONOLITH
- 
--// Enable MFEM functionality based on the HiOp library
-+// Enable MFEM functionality based on the HIOP library.
- #cmakedefine MFEM_USE_HIOP
- 
--// Build the GPU/CUDA-enabled version of the MFEM library.
-+// Enable MFEM functionality based on the GSLIB library.
-+#cmakedefine MFEM_USE_GSLIB
-+
-+// Build the NVIDIA GPU/CUDA-enabled version of the MFEM library.
- // Requires a CUDA compiler (nvcc).
- #cmakedefine MFEM_USE_CUDA
- 
--// Build the HIP-enabled version of the MFEM library.
-+// Build the AMD GPU/HIP-enabled version of the MFEM library.
- // Requires a HIP compiler (hipcc).
- #cmakedefine MFEM_USE_HIP
- 
--// Enable MFEM functionality based on the RAJA library
-+// Enable functionality based on the RAJA library.
- #cmakedefine MFEM_USE_RAJA
- 
--// Enable MFEM functionality based on the OCCA library
-+// Enable functionality based on the OCCA library.
- #cmakedefine MFEM_USE_OCCA
- 
--// Enable MFEM functionality based on the libCEED library
-+// Enable functionality based on the libCEED library.
- #cmakedefine MFEM_USE_CEED
- 
--// Enable MFEM functionality based on the Umpire library
--#cmakedefine MFEM_USE_UMPIRE
--
--// Enable MFEM functionality based on the ADIOS2 library
--#cmakedefine MFEM_USE_ADIOS2
--
--// Enable MFEM functionality based on the Caliper library
-+// Enable functionality based on the Caliper library.
- #cmakedefine MFEM_USE_CALIPER
- 
--// Enable MFEM functionality based on the Algoim library
-+// Enable functionality based on the Algoim library.
- #cmakedefine MFEM_USE_ALGOIM
- 
--// Which library functions to use in class StopWatch for measuring time.
--// For a list of the available options, see INSTALL.
--// If not defined, an option is selected automatically.
--#define MFEM_TIMER_TYPE @MFEM_TIMER_TYPE@
-+// Enable functionality based on the Umpire library.
-+#cmakedefine MFEM_USE_UMPIRE
- 
--// Enable MFEM functionality based on the SUNDIALS libraries.
--#cmakedefine MFEM_USE_SUNDIALS
-+// Enable IO functionality based on the ADIOS2 library.
-+#cmakedefine MFEM_USE_ADIOS2
- 
- // Version of HYPRE used for building MFEM.
- #cmakedefine MFEM_HYPRE_VERSION @MFEM_HYPRE_VERSION@
-@@ -181,13 +186,13 @@
- // Enable interface to the MKL CPardiso library.
- #cmakedefine MFEM_USE_MKL_CPARDISO
- 
--// Use forward mode for automatic differentiation
-+// Use forward mode for automatic differentiation.
- #cmakedefine MFEM_USE_ADFORWARD
- 
--// Enable the use of the CoDiPack library for AD
-+// Enable the use of the CoDiPack library for AD.
- #cmakedefine MFEM_USE_CODIPACK
- 
--// Enable MFEM functionality based on the Google Benchmark library.
-+// Enable functionality based on the Google Benchmark library.
- #cmakedefine MFEM_USE_BENCHMARK
- 
- // Enable Enzyme for AD
-diff --git a/config/cmake/modules/FindMUMPS.cmake b/config/cmake/modules/FindMUMPS.cmake
-index acdfd55a6..2b034d216 100644
---- a/config/cmake/modules/FindMUMPS.cmake
-+++ b/config/cmake/modules/FindMUMPS.cmake
-@@ -11,8 +11,9 @@
- 
- # Sets the following variables:
- #   - MUMPS_FOUND
--#   - MUMPS_INCLUDE_DIRS
- #   - MUMPS_LIBRARIES
-+#   - MUMPS_INCLUDE_DIRS
-+#   - MUMPS_VERSION
- 
- include(MfemCmakeUtilities)
- mfem_find_package(MUMPS MUMPS MUMPS_DIR
-@@ -21,3 +22,18 @@ mfem_find_package(MUMPS MUMPS MUMPS_DIR
-   "Libraries required by MUMPS."
-   ADD_COMPONENT mumps_common "include" dmumps_c.h "lib" mumps_common
-   ADD_COMPONENT pord "include" dmumps_c.h "lib" pord)
-+
-+if (MUMPS_FOUND AND (NOT MUMPS_VERSION))
-+  try_run(MUMPS_VERSION_RUN_RESULT MUMPS_VERSION_COMPILE_RESULT
-+          ${CMAKE_CURRENT_BINARY_DIR}/config
-+          ${CMAKE_CURRENT_SOURCE_DIR}/config/get_mumps_version.cpp
-+          CMAKE_FLAGS -DINCLUDE_DIRECTORIES:STRING=${MUMPS_INCLUDE_DIRS}
-+          RUN_OUTPUT_VARIABLE MUMPS_VERSION_OUTPUT)
-+  if ((MUMPS_VERSION_RUN_RESULT EQUAL 0) AND MUMPS_VERSION_OUTPUT)
-+    string(STRIP "${MUMPS_VERSION_OUTPUT}" MUMPS_VERSION)
-+    set(MUMPS_VERSION ${MUMPS_VERSION} CACHE STRING "MUMPS version." FORCE)
-+    message(STATUS "Found MUMPS version ${MUMPS_VERSION}")
-+  else()
-+    message(FATAL_ERROR "Unable to determine MUMPS version.")
-+  endif()
-+endif()
-diff --git a/config/config.hpp.in b/config/config.hpp.in
-index 76145927b..7a737720e 100644
---- a/config/config.hpp.in
-+++ b/config/config.hpp.in
-@@ -30,10 +30,10 @@
- #define MFEM_VERSION_MINOR (((MFEM_VERSION)/100)%100)
- #define MFEM_VERSION_PATCH ((MFEM_VERSION)%100)
- 
--// The absolute path of the MFEM source prefix
-+// The absolute path of the MFEM source prefix.
- // #define MFEM_SOURCE_DIR "@MFEM_SOURCE_DIR@"
- 
--// The absolute path of the MFEM installation prefix
-+// The absolute path of the MFEM installation prefix.
- // #define MFEM_INSTALL_DIR "@MFEM_INSTALL_DIR@"
- 
- // Description of the git commit used to build MFEM.
-@@ -91,7 +91,7 @@
- // Enable MFEM functionality based on the SuiteSparse library.
- // #define MFEM_USE_SUITESPARSE
- 
--// Enable MFEM functionality based on the SuperLU library.
-+// Enable MFEM functionality based on the SuperLU_DIST library.
- // #define MFEM_USE_SUPERLU
- // #define MFEM_USE_SUPERLU5
- 
-@@ -102,40 +102,40 @@
- // Enable MFEM functionality based on the STRUMPACK library.
- // #define MFEM_USE_STRUMPACK
- 
--// Enable MFEM features based on the Ginkgo library
-+// Enable MFEM features based on the Ginkgo library.
- // #define MFEM_USE_GINKGO
- 
- // Enable MFEM functionality based on the AmgX library.
- // #define MFEM_USE_AMGX
- 
--// Enable secure socket streams based on the GNUTLS library
-+// Enable secure socket streams based on the GNUTLS library.
- // #define MFEM_USE_GNUTLS
- 
--// Enable Sidre support
-+// Enable Sidre support.
- // #define MFEM_USE_SIDRE
- 
--// Enable the use of SIMD in the high performance templated classes
-+// Enable the use of SIMD in the high performance templated classes.
- // #define MFEM_USE_SIMD
- 
--// Enable FMS support
-+// Enable FMS support.
- // #define MFEM_USE_FMS
- 
--// Enable Conduit support
-+// Enable Conduit support.
- // #define MFEM_USE_CONDUIT
- 
--// Enable functionality based on the NetCDF library (reading CUBIT files)
-+// Enable functionality based on the NetCDF library (reading CUBIT files).
- // #define MFEM_USE_NETCDF
- 
--// Enable functionality based on the PETSc library
-+// Enable functionality based on the PETSc library.
- // #define MFEM_USE_PETSC
- 
--// Enable functionality based on the SLEPc library
-+// Enable functionality based on the SLEPc library.
- // #define MFEM_USE_SLEPC
- 
- // Enable functionality based on the MPFR library.
- // #define MFEM_USE_MPFR
- 
--// Enable MFEM functionality based on the PUMI library
-+// Enable MFEM functionality based on the PUMI library.
- // #define MFEM_USE_PUMI
- 
- // Enable Moonolith-based general interpolation between finite element spaces.
-@@ -144,7 +144,7 @@
- // Enable MFEM functionality based on the HIOP library.
- // #define MFEM_USE_HIOP
- 
--// Enable MFEM functionality based on the GSLIB library
-+// Enable MFEM functionality based on the GSLIB library.
- // #define MFEM_USE_GSLIB
- 
- // Build the NVIDIA GPU/CUDA-enabled version of the MFEM library.
-@@ -186,10 +186,10 @@
- // Enable interface to the MKL CPardiso library.
- // #define MFEM_USE_MKL_CPARDISO
- 
--// Use forward mode for automatic differentiation
-+// Use forward mode for automatic differentiation.
- // #define MFEM_USE_ADFORWARD
- 
--// Enable the use of the CoDiPack library for AD
-+// Enable the use of the CoDiPack library for AD.
- // #define MFEM_USE_CODIPACK
- 
- // Enable functionality based on the Google Benchmark library.
 diff --git a/config/defaults.cmake b/config/defaults.cmake
-index d5104092b..4386ce53a 100644
+index 390026414..4386ce53a 100644
 --- a/config/defaults.cmake
 +++ b/config/defaults.cmake
-@@ -134,16 +134,18 @@ set(ParMETIS_DIR "${MFEM_DIR}/../parmetis-4.0.3" CACHE PATH
- set(ParMETIS_REQUIRED_PACKAGES "METIS" CACHE STRING
-     "Additional packages required by ParMETIS.")
- 
--set(SuperLUDist_DIR "${MFEM_DIR}/../SuperLU_DIST_6.3.1" CACHE PATH
-+set(SuperLUDist_DIR "${MFEM_DIR}/../SuperLU_DIST_8.1.2" CACHE PATH
-     "Path to the SuperLU_DIST library.")
- # SuperLU_DIST may also depend on "OpenMP", depending on how it was compiled.
--set(SuperLUDist_REQUIRED_PACKAGES "MPI" "BLAS" "ParMETIS" CACHE STRING
-+set(SuperLUDist_REQUIRED_PACKAGES "MPI" "ParMETIS" "METIS"
-+    "LAPACK" "BLAS" CACHE STRING
-     "Additional packages required by SuperLU_DIST.")
- 
--set(MUMPS_DIR "${MFEM_DIR}/../MUMPS_5.2.0" CACHE PATH
-+set(MUMPS_DIR "${MFEM_DIR}/../MUMPS_5.5.0" CACHE PATH
-     "Path to the MUMPS library.")
--# Packages required by MUMPS, depending on how it was compiled.
--set(MUMPS_REQUIRED_PACKAGES "MPI" "BLAS" "METIS" "ScaLAPACK" CACHE STRING
-+# MUMPS may also depend on "OpenMP", depending on how it was compiled.
-+set(MUMPS_REQUIRED_PACKAGES "MPI" "MPI_Fortran" "ParMETIS" "METIS"
-+    "ScaLAPACK" "LAPACK" "BLAS" CACHE STRING
-     "Additional packages required by MUMPS.")
- # If the MPI package does not find all required Fortran libraries:
- # set(MUMPS_REQUIRED_LIBRARIES "gfortran" "mpi_mpifh" CACHE STRING
-@@ -154,7 +156,8 @@ set(STRUMPACK_DIR "${MFEM_DIR}/../STRUMPACK-build" CACHE PATH
+@@ -156,7 +156,8 @@ set(STRUMPACK_DIR "${MFEM_DIR}/../STRUMPACK-build" CACHE PATH
  # STRUMPACK may also depend on "OpenMP", depending on how it was compiled.
  # Starting with v2.2.0 of STRUMPACK, ParMETIS and Scotch are optional.
  set(STRUMPACK_REQUIRED_PACKAGES "MPI" "MPI_Fortran" "ParMETIS" "METIS"
@@ -368,52 +26,6 @@ index d5104092b..4386ce53a 100644
      "Additional packages required by STRUMPACK.")
  # If the MPI package does not find all required Fortran libraries:
  # set(STRUMPACK_REQUIRED_LIBRARIES "gfortran" "mpi_mpifh" CACHE STRING
-diff --git a/config/defaults.mk b/config/defaults.mk
-index ca5dc3c45..e149ae452 100644
---- a/config/defaults.mk
-+++ b/config/defaults.mk
-@@ -284,10 +284,10 @@ ifeq ($(MFEM_USE_SUPERLU5),YES)
-    SUPERLU_LIB = $(XLINKER)-rpath,$(SUPERLU_DIR)/lib -L$(SUPERLU_DIR)/lib\
-       -lsuperlu_dist_5.1.0
- else
--   SUPERLU_DIR = @MFEM_DIR@/../SuperLU_DIST_6.3.1
-+   SUPERLU_DIR = @MFEM_DIR@/../SuperLU_DIST_8.1.2
-    SUPERLU_OPT = -I$(SUPERLU_DIR)/include
-    SUPERLU_LIB = $(XLINKER)-rpath,$(SUPERLU_DIR)/lib64 -L$(SUPERLU_DIR)/lib64\
--      -lsuperlu_dist -lblas
-+      -lsuperlu_dist $(LAPACK_LIB)
- endif
- 
- # SCOTCH library configuration (required by STRUMPACK <= v2.1.0, optional in
-@@ -311,7 +311,7 @@ MPI_FORTRAN_LIB = -lmpifort
- # MPI_FORTRAN_LIB += -lgfortran
- 
- # MUMPS library configuration
--MUMPS_DIR = @MFEM_DIR@/../MUMPS_5.2.0
-+MUMPS_DIR = @MFEM_DIR@/../MUMPS_5.5.0
- MUMPS_OPT = -I$(MUMPS_DIR)/include
- MUMPS_LIB = $(XLINKER)-rpath,$(MUMPS_DIR)/lib -L$(MUMPS_DIR)/lib -ldmumps\
-  -lmumps_common -lpord $(SCALAPACK_LIB) $(LAPACK_LIB) $(MPI_FORTRAN_LIB)
-diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt
-index afa98324c..7d9c835c9 100644
---- a/examples/CMakeLists.txt
-+++ b/examples/CMakeLists.txt
-@@ -161,6 +161,15 @@ if (MFEM_ENABLE_TESTING)
-       $<TARGET_FILE:ex11p> "-no-vis" "--superlu"
-       ${MPIEXEC_POSTFLAGS})
-   endif()
-+
-+  # If MUMPS is enabled, add a test run that uses it.
-+  if (MFEM_USE_MUMPS)
-+    add_test(NAME ex25p_mumps_np=${MFEM_MPI_NP}
-+      COMMAND ${MPIEXEC} ${MPIEXEC_NUMPROC_FLAG} ${MFEM_MPI_NP}
-+      ${MPIEXEC_PREFLAGS}
-+      $<TARGET_FILE:ex25p> "-no-vis" "--mumps-solver"
-+      ${MPIEXEC_POSTFLAGS})
-+  endif()
- endif()
- 
- # Include the examples/amgx directory if AmgX is enabled
 diff --git a/examples/ex11p.cpp b/examples/ex11p.cpp
 index 216a6f443..eca3ce929 100644
 --- a/examples/ex11p.cpp
@@ -435,7 +47,7 @@ index 216a6f443..eca3ce929 100644
           strumpack->SetFromCommandLine();
           precond = strumpack;
 diff --git a/examples/ex25p.cpp b/examples/ex25p.cpp
-index 160145719..2e36471bd 100644
+index e3848b848..cf5daf412 100644
 --- a/examples/ex25p.cpp
 +++ b/examples/ex25p.cpp
 @@ -170,6 +170,7 @@ int main(int argc, char *argv[])
@@ -476,7 +88,7 @@ index 160145719..2e36471bd 100644
     }
  
     if (iprob > 4) { iprob = 4; }
-@@ -474,15 +481,33 @@ int main(int argc, char *argv[])
+@@ -474,6 +481,24 @@ int main(int argc, char *argv[])
        delete A;
     }
  #endif
@@ -501,17 +113,6 @@ index 160145719..2e36471bd 100644
  #ifdef MFEM_USE_MUMPS
     if (!pa && mumps_solver)
     {
-       HypreParMatrix *A = Ah.As<ComplexHypreParMatrix>()->GetSystemMatrix();
--      MUMPSSolver mumps;
-+      MUMPSSolver mumps(A->GetComm());
-       mumps.SetPrintLevel(0);
-       mumps.SetMatrixSymType(MUMPSSolver::MatType::UNSYMMETRIC);
-       mumps.SetOperator(*A);
--      mumps.Mult(B,X);
-+      mumps.Mult(B, X);
-       delete A;
-    }
- #endif
 @@ -493,7 +518,7 @@ int main(int argc, char *argv[])
     //
     //    In PML:   1/mu (abs(1/det(J) J^T J) Curl E, Curl F)
@@ -541,3291 +142,957 @@ index 51238c4d7..e6f4730fe 100644
              strumpack->SetOperator(*Arow);
              strumpack->SetFromCommandLine();
              precond = strumpack;
-diff --git a/examples/superlu/ex1p.cpp b/examples/superlu/ex1p.cpp
-index 2bd220b07..a00f00af8 100644
---- a/examples/superlu/ex1p.cpp
-+++ b/examples/superlu/ex1p.cpp
-@@ -67,6 +67,7 @@ int main(int argc, char *argv[])
-    int slu_colperm = 4;
-    int slu_rowperm = 1;
-    int slu_iterref = 2;
-+   int slu_npdep = 1;
- 
-    OptionsParser args(argc, argv);
-    args.AddOption(&mesh_file, "-m", "--mesh",
-@@ -85,9 +86,11 @@ int main(int argc, char *argv[])
-                   "6-ZOLTAN");
-    args.AddOption(&slu_rowperm, "-rp", "--rowperm",
-                   "SuperLU Row Permutation Method:  0-NOROWPERM, 1-LargeDiag");
--   args.AddOption(&slu_iterref, "-rp", "--rowperm",
-+   args.AddOption(&slu_iterref, "-ir", "--iterref",
-                   "SuperLU Iterative Refinement:  0-NOREFINE, 1-Single, "
-                   "2-Double, 3-Extra");
-+   args.AddOption(&slu_npdep, "-npdep", "--npdepth",
-+                  "Depth of 3D parition for SuperLU (>= 7.2.0)");
- 
-    args.Parse();
-    if (!args.Good())
-@@ -214,7 +217,7 @@ int main(int argc, char *argv[])
-    a.FormLinearSystem(ess_tdof_list, x, b, A, X, B);
- 
-    // 13. Solve the linear system A X = B utilizing SuperLU.
--   SuperLUSolver *superlu = new SuperLUSolver(MPI_COMM_WORLD);
-+   SuperLUSolver *superlu = new SuperLUSolver(MPI_COMM_WORLD, slu_npdep);
-    Operator *SLU_A = new SuperLURowLocMatrix(*A.As<HypreParMatrix>());
-    superlu->SetPrintStatistics(true);
-    superlu->SetSymmetricPattern(false);
-@@ -281,10 +284,9 @@ int main(int argc, char *argv[])
-    superlu->SetOperator(*SLU_A);
-    superlu->SetPrintStatistics(true);
-    superlu->Mult(B, X);
--   superlu->DismantleGrid();
- 
--   delete SLU_A;
-    delete superlu;
-+   delete SLU_A;
- 
-    // 14. Recover the parallel grid function corresponding to X. This is the
-    //     local finite element solution on each processor.
-diff --git a/linalg/mumps.cpp b/linalg/mumps.cpp
-index fb6c7c15a..6efb98e3e 100644
---- a/linalg/mumps.cpp
-+++ b/linalg/mumps.cpp
-@@ -16,58 +16,123 @@
- 
- #include "mumps.hpp"
- 
--#ifdef HYPRE_BIGINT
--#error "MUMPSSolver requires HYPRE_Int == int, for now."
-+#include <algorithm>
-+
-+#if MFEM_MUMPS_VERSION >= 530
-+#ifdef MUMPS_INTSIZE64
-+#error "Full 64-bit MUMPS is not yet supported"
+diff --git a/general/communication.hpp b/general/communication.hpp
+index 474486f1b..be8145689 100644
+--- a/general/communication.hpp
++++ b/general/communication.hpp
+@@ -76,6 +76,14 @@ private:
+    static void Init_(int *argc, char ***argv)
+    {
+       MFEM_VERIFY(!IsInitialized(), "MPI already initialized!")
++#if defined(MFEM_USE_STRUMPACK)
++#if defined(STRUMPACK_USE_PTSCOTCH) || defined(STRUMPACK_USE_SLATE_SCALAPACK)
++      if (Root())
++      {
++         MFEM_WARNING("STRUMPACK built with SLATE or PT-Scotch may require MPI_Init_thread with MPI_THREAD_MULTIPLE!");
++      }
 +#endif
-+#else
-+#ifdef INTSIZE64
-+#error "Full 64-bit MUMPS is not yet supported"
 +#endif
- #endif
+       MPI_Init(argc, argv);
+       // The "mpi" object below needs to be created after MPI_Init() for some
+       // MPI implementations
+diff --git a/linalg/strumpack.cpp b/linalg/strumpack.cpp
+index f0ff11ab4..5b54994aa 100644
+--- a/linalg/strumpack.cpp
++++ b/linalg/strumpack.cpp
+@@ -16,238 +16,470 @@
  
--// macro s.t. indices match MUMPS documentation
-+// Macro s.t. indices match MUMPS documentation
- #define MUMPS_ICNTL(I) icntl[(I) -1]
-+#define MUMPS_CNTL(I) cntl[(I) -1]
- #define MUMPS_INFO(I) info[(I) -1]
-+#define MUMPS_INFOG(I) infog[(I) -1]
+ #include "strumpack.hpp"
  
+-using namespace std;
+-using namespace strumpack;
+-
  namespace mfem
  {
  
--void MUMPSSolver::SetOperator(const Operator &op)
-+MUMPSSolver::MUMPSSolver(MPI_Comm comm_)
+ STRUMPACKRowLocMatrix::STRUMPACKRowLocMatrix(MPI_Comm comm,
+-                                             int num_loc_rows, int first_loc_row,
+-                                             int glob_nrows, int glob_ncols,
+-                                             int *I, int *J, double *data)
+-   : comm_(comm), A_(NULL)
++                                             int num_loc_rows,
++                                             HYPRE_BigInt first_loc_row,
++                                             HYPRE_BigInt glob_nrows,
++                                             HYPRE_BigInt glob_ncols,
++                                             int *I, HYPRE_BigInt *J,
++                                             double *data, bool sym_sparse)
  {
--   auto APtr = dynamic_cast<const HypreParMatrix *>(&op);
--
--   MFEM_VERIFY(APtr, "Not compatible matrix type");
-+   Init(comm_);
-+}
- 
--   height = op.Height();
--   width = op.Width();
-+MUMPSSolver::MUMPSSolver(const Operator &op)
-+{
-+   auto APtr = dynamic_cast<const HypreParMatrix *>(&op);
-+   MFEM_VERIFY(APtr, "Not a compatible matrix type");
-+   Init(APtr->GetComm());
-+   SetOperator(op);
-+}
- 
--   comm = APtr->GetComm();
-+void MUMPSSolver::Init(MPI_Comm comm_)
-+{
-+   id = nullptr;
-+   comm = comm_;
-    MPI_Comm_size(comm, &numProcs);
-    MPI_Comm_rank(comm, &myid);
+    // Set mfem::Operator member data
+    height = num_loc_rows;
+    width  = num_loc_rows;
  
--   auto parcsr_op = (hypre_ParCSRMatrix *) const_cast<HypreParMatrix &>(*APtr);
-+   mat_type = MatType::UNSYMMETRIC;
-+   print_level = 0;
-+   reorder_method = ReorderingStrategy::AUTOMATIC;
-+   reorder_reuse = false;
-+   blr_tol = 0.0;
-+
-+#if MFEM_MUMPS_VERSION >= 530
-+   irhs_loc = nullptr;
-+   rhs_loc = nullptr;
-+   isol_loc = nullptr;
-+   sol_loc = nullptr;
-+#else
-+   recv_counts = nullptr;
-+   displs = nullptr;
-+   rhs_glob = nullptr;
-+#endif
-+}
+-   // Allocate STRUMPACK's CSRMatrixMPI
+-   int nprocs, rank;
+-   MPI_Comm_rank(comm_, &rank);
+-   MPI_Comm_size(comm_, &nprocs);
+-   int * dist = new int[nprocs + 1];
+-   dist[rank + 1] = first_loc_row + num_loc_rows;
++   // Allocate STRUMPACK's CSRMatrixMPI (copies all inputs)
++   int rank, nprocs;
++   MPI_Comm_rank(comm, &rank);
++   MPI_Comm_size(comm, &nprocs);
++   Array<HYPRE_BigInt> dist(nprocs + 1);
+    dist[0] = 0;
+-   MPI_Allgather(MPI_IN_PLACE, 0, MPI_INT, dist + 1, 1, MPI_INT, comm_);
+-   A_ = new CSRMatrixMPI<double,int>(num_loc_rows, I, J, data, dist, comm_, false);
+-   delete[] dist;
++   dist[rank + 1] = first_loc_row + (HYPRE_BigInt)num_loc_rows;
++   MPI_Allgather(MPI_IN_PLACE, 0, MPI_DATATYPE_NULL,
++                 dist.GetData() + 1, 1, HYPRE_MPI_BIG_INT, comm);
 +
-+MUMPSSolver::~MUMPSSolver()
-+{
-+#if MFEM_MUMPS_VERSION >= 530
-+   delete [] irhs_loc;
-+   delete [] rhs_loc;
-+   delete [] isol_loc;
-+   delete [] sol_loc;
++#if !(defined(HYPRE_BIGINT) || defined(HYPRE_MIXEDINT))
++   A_ = new strumpack::CSRMatrixMPI<double, HYPRE_BigInt>(
++      (HYPRE_BigInt)num_loc_rows, I, J, data, dist.GetData(),
++      comm, sym_sparse);
 +#else
-+   delete [] recv_counts;
-+   delete [] displs;
-+   delete [] rhs_glob;
++   Array<HYPRE_BigInt> II(num_loc_rows+1);
++   for (int i = 0; i <= num_loc_rows; i++) { II[i] = (HYPRE_BigInt)I[i]; }
++   A_ = new strumpack::CSRMatrixMPI<double, HYPRE_BigInt>(
++      (HYPRE_BigInt)num_loc_rows, II.GetData(), J, data, dist.GetData(),
++      comm, sym_sparse);
 +#endif
-+   if (id)
-+   {
-+      id->job = -2;
-+      dmumps_c(id);
-+      delete id;
-+   }
-+}
-+
-+void MUMPSSolver::SetOperator(const Operator &op)
-+{
-+   auto APtr = dynamic_cast<const HypreParMatrix *>(&op);
+ }
+ 
+-STRUMPACKRowLocMatrix::STRUMPACKRowLocMatrix(const HypreParMatrix & hypParMat)
+-   : comm_(hypParMat.GetComm()),
+-     A_(NULL)
++STRUMPACKRowLocMatrix::STRUMPACKRowLocMatrix(const Operator &op,
++                                             bool sym_sparse)
+ {
+-   // First cast the parameter to a hypre_ParCSRMatrix
+-   hypre_ParCSRMatrix * parcsr_op =
+-      (hypre_ParCSRMatrix *)const_cast<HypreParMatrix&>(hypParMat);
++   const HypreParMatrix *APtr = dynamic_cast<const HypreParMatrix *>(&op);
 +   MFEM_VERIFY(APtr, "Not a compatible matrix type");
-+
++   MPI_Comm comm = APtr->GetComm();
+ 
+-   MFEM_ASSERT(parcsr_op != NULL,"STRUMPACK: const_cast failed in SetOperator");
++   // Set mfem::Operator member data
 +   height = op.Height();
-+   width = op.Width();
++   width  = op.Width();
  
-+   auto parcsr_op = (hypre_ParCSRMatrix *)const_cast<HypreParMatrix &>(*APtr);
-    APtr->HostRead();
-    hypre_CSRMatrix *csr_op = hypre_MergeDiagAndOffd(parcsr_op);
-    APtr->HypreRead();
+-   // Create the CSRMatrixMPI A_ by borrowing the internal data from a
+-   // hypre_CSRMatrix.
+-   hypParMat.HostRead();
+-   hypre_CSRMatrix * csr_op = hypre_MergeDiagAndOffd(parcsr_op);
+-   hypParMat.HypreRead();
+-   hypre_CSRMatrixSetDataOwner(csr_op,0);
++   // First cast the parameter to a hypre_ParCSRMatrix
++   hypre_ParCSRMatrix *parcsr_op =
++      (hypre_ParCSRMatrix *)const_cast<HypreParMatrix &>(*APtr);
++
++   // Create the CSRMatrixMPI A by taking the internal data from a
++   // hypre_CSRMatrix
++   APtr->HostRead();
++   hypre_CSRMatrix *csr_op = hypre_MergeDiagAndOffd(parcsr_op);
++   APtr->HypreRead();
 +   HYPRE_Int       *Iptr   = csr_op->i;
  #if MFEM_HYPRE_VERSION >= 21600
+-   // For now, this method assumes that HYPRE_Int is int. Also, csr_op->num_cols
+-   // is of type HYPRE_Int, so if we want to check for big indices in
+-   // csr_op->big_j, we'll have to check all entries and that check will only be
+-   // necessary in HYPRE_MIXEDINT mode which is not supported at the moment.
 -   hypre_CSRMatrixBigJtoJ(csr_op);
 +   HYPRE_BigInt    *Jptr   = csr_op->big_j;
 +#else
 +   HYPRE_Int       *Jptr   = csr_op->j;
  #endif
++   double          *data   = csr_op->data;
  
--   int *Iptr = csr_op->i;
--   int *Jptr = csr_op->j;
--   int n_loc = csr_op->num_rows;
--
--   row_start = parcsr_op->first_row_index;
-+   int n_loc = internal::to_int(csr_op->num_rows);
-+   row_start = internal::to_int(parcsr_op->first_row_index);
+-   height = csr_op->num_rows;
+-   width  = csr_op->num_rows;
++   HYPRE_BigInt fst_row = parcsr_op->first_row_index;
++   HYPRE_Int    m_loc   = csr_op->num_rows;
  
--   MUMPS_INT8 nnz = 0;
-+   MUMPS_INT8 nnz = 0, k = 0;
-    if (mat_type)
-    {
--      // count nnz in case of symmetric mode
--      int k = 0;
-+      // Count nnz in case of symmetric mode
-       for (int i = 0; i < n_loc; i++)
-       {
--         for (int j = Iptr[i]; j < Iptr[i + 1]; j++)
-+         for (HYPRE_Int j = Iptr[i]; j < Iptr[i + 1]; j++)
-          {
-             int ii = row_start + i + 1;
--            int jj = Jptr[k] + 1;
-+#if MFEM_HYPRE_VERSION >= 21600
-+            HYPRE_BigInt jj = Jptr[k] + 1;
+-   int nprocs, rank;
+-   MPI_Comm_rank(comm_, &rank);
+-   MPI_Comm_size(comm_, &nprocs);
+-   int * dist = new int[nprocs + 1];
+-   dist[rank + 1] = parcsr_op->first_row_index + csr_op->num_rows;
++   // Allocate STRUMPACK's CSRMatrixMPI
++   int rank, nprocs;
++   MPI_Comm_rank(comm, &rank);
++   MPI_Comm_size(comm, &nprocs);
++   Array<HYPRE_BigInt> dist(nprocs + 1);
+    dist[0] = 0;
+-   MPI_Allgather(MPI_IN_PLACE, 0, MPI_INT, dist + 1, 1, MPI_INT, comm_);
+-   A_ = new CSRMatrixMPI<double,int>(csr_op->num_rows, csr_op->i, csr_op->j,
+-                                     csr_op->data, dist, comm_, false);
+-   delete[] dist;
++   dist[rank + 1] = fst_row + (HYPRE_BigInt)m_loc;
++   MPI_Allgather(MPI_IN_PLACE, 0, MPI_DATATYPE_NULL,
++                 dist.GetData() + 1, 1, HYPRE_MPI_BIG_INT, comm);
++
++#if !defined(HYPRE_MIXEDINT)
++   A_ = new strumpack::CSRMatrixMPI<double, HYPRE_BigInt>(
++      (HYPRE_BigInt)m_loc, Iptr, Jptr, data, dist.GetData(),
++      comm, sym_sparse);
 +#else
-+            HYPRE_Int jj = Jptr[k] + 1;
++   Array<HYPRE_BigInt> II(m_loc+1);
++   for (int i = 0; i <= m_loc; i++) { II[i] = (HYPRE_BigInt)Iptr[i]; }
++   A_ = new strumpack::CSRMatrixMPI<double, HYPRE_BigInt>(
++      (HYPRE_BigInt)m_loc, II.GetData(), Jptr, data, dist.GetData(),
++      comm, sym_sparse);
 +#endif
-+            if (ii >= jj) { nnz++; }
-             k++;
--            if (ii>=jj) { nnz++; }
-          }
-       }
-    }
-@@ -75,28 +140,31 @@ void MUMPSSolver::SetOperator(const Operator &op)
-    {
-       nnz = csr_op->num_nonzeros;
-    }
--
--   int * I = new int[nnz];
--   int * J = new int[nnz];
-+   int *I = new int[nnz];
-+   int *J = new int[nnz];
  
-    // Fill in I and J arrays for
-    // COO format in 1-based indexing
--   int k = 0;
--   double * data;
-+   k = 0;
-+   double *data;
-    if (mat_type)
-    {
--      int l = 0;
-+      MUMPS_INT8 l = 0;
-       data = new double[nnz];
-       for (int i = 0; i < n_loc; i++)
-       {
--         for (int j = Iptr[i]; j < Iptr[i + 1]; j++)
-+         for (HYPRE_Int j = Iptr[i]; j < Iptr[i + 1]; j++)
-          {
-             int ii = row_start + i + 1;
--            int jj = Jptr[k] + 1;
-+#if MFEM_HYPRE_VERSION >= 21600
-+            HYPRE_BigInt jj = Jptr[k] + 1;
-+#else
-+            HYPRE_Int jj = Jptr[k] + 1;
-+#endif
-             if (ii >= jj)
-             {
-                I[l] = ii;
--               J[l] = jj;
-+               J[l] = internal::to_int(jj);
-                data[l++] = csr_op->data[k];
-             }
-             k++;
-@@ -107,84 +175,122 @@ void MUMPSSolver::SetOperator(const Operator &op)
-    {
-       for (int i = 0; i < n_loc; i++)
-       {
--         for (int j = Iptr[i]; j < Iptr[i + 1]; j++)
-+         for (HYPRE_Int j = Iptr[i]; j < Iptr[i + 1]; j++)
-          {
-             I[k] = row_start + i + 1;
--            J[k] = Jptr[k] + 1;
-+            J[k] = internal::to_int(Jptr[k] + 1);
-             k++;
-          }
-       }
-       data = csr_op->data;
-    }
+-   // Everything has been copied or abducted so delete the structure
++   // Everything has been copied so delete the structure
+    hypre_CSRMatrixDestroy(csr_op);
+ }
  
--   // new MUMPS object
--   if (id)
-+   // New MUMPS object or reuse the one from a previous matrix
-+   if (!id || !reorder_reuse)
-    {
--      id->job = -2;
--      dmumps_c(id);
--      delete id;
--   }
--   id = new DMUMPS_STRUC_C;
--   // C to Fortran communicator
--   id->comm_fortran = (MUMPS_INT) MPI_Comm_c2f(comm);
--
--   // Host is involved in computation
--   id->par = 1;
--
--   id->sym = mat_type;
--
--   // MUMPS init
--   id->job = -1;
--   dmumps_c(id);
--
--   // Set MUMPS default parameters
--   SetParameters();
-+      if (id)
-+      {
-+         id->job = -2;
-+         dmumps_c(id);
-+         delete id;
-+      }
-+      id = new DMUMPS_STRUC_C;
-+      id->sym = mat_type;
- 
--   id->n = parcsr_op->global_num_rows;
-+      // C to Fortran communicator
-+      id->comm_fortran = (MUMPS_INT)MPI_Comm_c2f(comm);
- 
--   id->nnz_loc = nnz;
-+      // Host is involved in computation
-+      id->par = 1;
- 
--   id->irn_loc = I;
-+      // MUMPS init
-+      id->job = -1;
-+      dmumps_c(id);
- 
--   id->jcn_loc = J;
-+      // Set MUMPS default parameters
-+      SetParameters();
- 
--   id->a_loc = data;
-+      id->n = internal::to_int(parcsr_op->global_num_rows);
-+      id->nnz_loc = nnz;
-+      id->irn_loc = I;
-+      id->jcn_loc = J;
-+      id->a_loc = data;
+ STRUMPACKRowLocMatrix::~STRUMPACKRowLocMatrix()
+ {
+-   // Delete the struct
+-   if ( A_ != NULL ) { delete A_; }
++   delete A_;
+ }
  
--   // MUMPS Analysis
--   id->job = 1;
--   dmumps_c(id);
-+      // MUMPS analysis
-+      id->job = 1;
-+      dmumps_c(id);
-+   }
-+   else
-+   {
-+      id->irn_loc = I;
-+      id->jcn_loc = J;
-+      id->a_loc = data;
-+   }
+-STRUMPACKSolver::STRUMPACKSolver( int argc, char* argv[], MPI_Comm comm )
+-   : comm_(comm),
+-     APtr_(NULL),
+-     solver_(NULL)
++template <typename STRUMPACKSolverType>
++STRUMPACKSolverBase<STRUMPACKSolverType>::
++STRUMPACKSolverBase(MPI_Comm comm, int argc, char *argv[])
++   : APtr_(NULL),
++     factor_verbose_(false),
++     solve_verbose_(false),
++     reorder_reuse_(false),
++     nrhs_(-1)
+ {
+-   this->Init(argc, argv);
++   solver_ = new STRUMPACKSolverType(comm, argc, argv, false);
+ }
  
--   // MUMPS Factorization
-+   // MUMPS factorization
-    id->job = 2;
--   dmumps_c(id);
-+   {
-+      const int mem_relax_lim = 200;
-+      while (true)
-+      {
-+         dmumps_c(id);
-+         if (id->MUMPS_INFOG(1) < 0)
-+         {
-+            if (id->MUMPS_INFOG(1) == -8 || id->MUMPS_INFOG(1) == -9)
-+            {
-+               id->MUMPS_ICNTL(14) += 20;
-+               MFEM_VERIFY(id->MUMPS_ICNTL(14) <= mem_relax_lim,
-+                           "Memory relaxation limit reached for MUMPS factorization");
-+               if (myid == 0 && print_level > 0)
-+               {
-+                  mfem::out << "Re-running MUMPS factorization with memory relaxation "
-+                            << id->MUMPS_ICNTL(14) << '\n';
-+               }
-+            }
-+            else
-+            {
-+               MFEM_ABORT("Error during MUMPS numerical factorization");
-+            }
-+         }
-+         else { break; }
-+      }
-+   }
+-STRUMPACKSolver::STRUMPACKSolver( STRUMPACKRowLocMatrix & A )
+-   : comm_(A.GetComm()),
+-     APtr_(&A),
+-     solver_(NULL)
++template <typename STRUMPACKSolverType>
++STRUMPACKSolverBase<STRUMPACKSolverType>::
++STRUMPACKSolverBase(STRUMPACKRowLocMatrix &A, int argc, char *argv[])
++   : APtr_(&A),
++     factor_verbose_(false),
++     solve_verbose_(false),
++     reorder_reuse_(false),
++     nrhs_(-1)
+ {
+-   height = A.Height();
+-   width  = A.Width();
++   solver_ = new STRUMPACKSolverType(A.GetComm(), argc, argv, false);
++   SetOperator(A);
++}
  
-    hypre_CSRMatrixDestroy(csr_op);
-    delete [] I;
-    delete [] J;
-    if (mat_type) { delete [] data; }
+-   this->Init(0, NULL);
++template <typename STRUMPACKSolverType>
++STRUMPACKSolverBase<STRUMPACKSolverType>::
++~STRUMPACKSolverBase()
++{
++   delete solver_;
+ }
  
-+   id->nrhs = -1;  // Set up solution storage on first call to Mult
- #if MFEM_MUMPS_VERSION >= 530
-    delete [] irhs_loc;
--   irhs_loc = new int[n_loc];
-+   delete [] isol_loc;
-+   id->nloc_rhs = n_loc;
-+   id->lrhs_loc = n_loc;
-+   id->lsol_loc = id->MUMPS_INFO(23);
-+   irhs_loc = new int[id->lrhs_loc];
-+   isol_loc = new int[id->lsol_loc];
-    for (int i = 0; i < n_loc; i++)
-    {
-       irhs_loc[i] = row_start + i + 1;
-    }
-+   id->irhs_loc = irhs_loc;
-+   id->isol_loc = isol_loc;
-+
-    row_starts.SetSize(numProcs);
-    MPI_Allgather(&row_start, 1, MPI_INT, row_starts, 1, MPI_INT, comm);
- #else
-+   id->lrhs = id->n;
-    if (myid == 0)
-    {
--      delete [] rhs_glob;
-       delete [] recv_counts;
--      rhs_glob = new double[parcsr_op->global_num_rows];
-+      delete [] displs;
-       recv_counts = new int[numProcs];
-+      displs = new int[numProcs];
-    }
-    MPI_Gather(&n_loc, 1, MPI_INT, recv_counts, 1, MPI_INT, 0, comm);
-    if (myid == 0)
-    {
--      delete [] displs;
--      displs = new int[numProcs];
-       displs[0] = 0;
-       int s = 0;
-       for (int k = 0; k < numProcs-1; k++)
-@@ -196,54 +302,109 @@ void MUMPSSolver::SetOperator(const Operator &op)
- #endif
+-STRUMPACKSolver::~STRUMPACKSolver()
++template <typename STRUMPACKSolverType>
++void STRUMPACKSolverBase<STRUMPACKSolverType>::
++SetFromCommandLine()
+ {
+-   if ( solver_ != NULL ) { delete solver_; }
++   solver_->options().set_from_command_line();
  }
  
--void MUMPSSolver::Mult(const Vector &x, Vector &y) const
-+void MUMPSSolver::InitRhsSol(int nrhs) const
+-void STRUMPACKSolver::Init( int argc, char* argv[] )
++template <typename STRUMPACKSolverType>
++void STRUMPACKSolverBase<STRUMPACKSolverType>::
++SetPrintFactorStatistics(bool print_stat)
  {
--   x.HostRead();
--   y.HostReadWrite();
-+   if (id->nrhs != nrhs)
-+   {
- #if MFEM_MUMPS_VERSION >= 530
-+      delete [] rhs_loc;
-+      delete [] sol_loc;
-+      rhs_loc = (nrhs > 1) ? new double[nrhs * id->lrhs_loc] : nullptr;
-+      sol_loc = new double[nrhs * id->lsol_loc];
-+      id->rhs_loc = rhs_loc;
-+      id->sol_loc = sol_loc;
-+#else
-+      if (myid == 0)
-+      {
-+         delete rhs_glob;
-+         rhs_glob = new double[nrhs * id->lrhs];
-+         id->rhs = rhs_glob;
-+      }
-+#endif
-+   }
-+   id->nrhs = nrhs;
+-   MPI_Comm_size(comm_, &numProcs_);
+-   MPI_Comm_rank(comm_, &myid_);
++   factor_verbose_ = print_stat;
 +}
  
--   id->nloc_rhs = x.Size();
--   id->lrhs_loc = x.Size();
--   id->rhs_loc = x.GetData();
--   id->irhs_loc = irhs_loc;
-+void MUMPSSolver::Mult(const Vector &x, Vector &y) const
+-   factor_verbose_ = false;
+-   solve_verbose_ = false;
++template <typename STRUMPACKSolverType>
++void STRUMPACKSolverBase<STRUMPACKSolverType>::
++SetPrintSolveStatistics(bool print_stat)
 +{
-+   Array<const Vector *> X(1);
-+   Array<Vector *> Y(1);
-+   X[0] = &x;
-+   Y[0] = &y;
-+   ArrayMult(X, Y);
++   solve_verbose_ = print_stat;
 +}
  
--   id->lsol_loc = id->MUMPS_INFO(23);
--   id->isol_loc = new int[id->MUMPS_INFO(23)];
--   id->sol_loc = new double[id->MUMPS_INFO(23)];
-+void MUMPSSolver::ArrayMult(const Array<const Vector *> &X,
-+                            Array<Vector *> &Y) const
+-   solver_ = new StrumpackSparseSolverMPIDist<double,int>(comm_, argc, argv,
+-                                                          false);
++template <typename STRUMPACKSolverType>
++void STRUMPACKSolverBase<STRUMPACKSolverType>
++::SetRelTol(double rtol)
 +{
-+   MFEM_ASSERT(X.Size() == Y.Size(),
-+               "Number of columns mismatch in MUMPSSolver::Mult!");
-+   InitRhsSol(X.Size());
-+#if MFEM_MUMPS_VERSION >= 530
-+   if (id->nrhs == 1)
-+   {
-+      MFEM_ASSERT(X.Size() == 1 && X[0], "Missing Vector in MUMPSSolver::Mult!");
-+      X[0]->HostRead();
-+      id->rhs_loc = X[0]->GetData();
-+   }
-+   else
-+   {
-+      for (int i = 0; i < id->nrhs; i++)
-+      {
-+         MFEM_ASSERT(X[i], "Missing Vector in MUMPSSolver::Mult!");
-+         X[i]->HostRead();
-+         std::copy(X[i]->GetData(), X[i]->GetData() + X[i]->Size(),
-+                   id->rhs_loc + i * id->lrhs_loc);
-+      }
-+   }
- 
-    // MUMPS solve
-    id->job = 3;
-    dmumps_c(id);
- 
--   RedistributeSol(id->isol_loc, id->sol_loc, y.GetData());
--
--   delete [] id->sol_loc;
--   delete [] id->isol_loc;
-+   RedistributeSol(id->isol_loc, id->sol_loc, id->lsol_loc, Y);
- #else
--   MPI_Gatherv(x.GetData(), x.Size(), MPI_DOUBLE,
--               rhs_glob, recv_counts,
--               displs, MPI_DOUBLE, 0, comm);
--
--   if (myid == 0) { id->rhs = rhs_glob; }
-+   for (int i = 0; i < id->nrhs; i++)
-+   {
-+      MFEM_ASSERT(X[i], "Missing Vector in MUMPSSolver::Mult!");
-+      X[i]->HostRead();
-+      MPI_Gatherv(X[i]->GetData(), X[i]->Size(), MPI_DOUBLE,
-+                  id->rhs + i * id->lrhs, recv_counts, displs, MPI_DOUBLE, 0, comm);
-+   }
++   solver_->options().set_rel_tol(rtol);
+ }
  
-    // MUMPS solve
-    id->job = 3;
-    dmumps_c(id);
+-void STRUMPACKSolver::SetFromCommandLine( )
++template <typename STRUMPACKSolverType>
++void STRUMPACKSolverBase<STRUMPACKSolverType>
++::SetAbsTol(double atol)
+ {
+-   solver_->options().set_from_command_line( );
++   solver_->options().set_abs_tol(atol);
+ }
  
--   MPI_Scatterv(rhs_glob, recv_counts, displs,
--                MPI_DOUBLE, y.GetData(), y.Size(),
--                MPI_DOUBLE, 0, comm);
-+   for (int i = 0; i < id->nrhs; i++)
-+   {
-+      MFEM_ASSERT(Y[i], "Missing Vector in MUMPSSolver::Mult!");
-+      Y[i]->HostWrite();
-+      MPI_Scatterv(id->rhs + i * id->lrhs, recv_counts, displs, MPI_DOUBLE,
-+                   Y[i]->GetData(), Y[i]->Size(), MPI_DOUBLE, 0, comm);
-+   }
- #endif
+-void STRUMPACKSolver::SetPrintFactorStatistics( bool print_stat )
++template <typename STRUMPACKSolverType>
++void STRUMPACKSolverBase<STRUMPACKSolverType>
++::SetMaxIter(int max_it)
+ {
+-   factor_verbose_ = print_stat;
++   solver_->options().set_maxit(max_it);
  }
  
- void MUMPSSolver::MultTranspose(const Vector &x, Vector &y) const
+-void STRUMPACKSolver::SetPrintSolveStatistics( bool print_stat )
++template <typename STRUMPACKSolverType>
++void STRUMPACKSolverBase<STRUMPACKSolverType>
++::SetReorderingReuse(bool reuse)
  {
--   // Set flag for Transpose Solve
-+   // Set flag for transpose solve
-    id->MUMPS_ICNTL(9) = 0;
--   Mult(x,y);
-+   Mult(x, y);
-+
-    // Reset the flag
-    id->MUMPS_ICNTL(9) = 1;
+-   solve_verbose_ = print_stat;
++   reorder_reuse_ = reuse;
 +}
- 
-+void MUMPSSolver::ArrayMultTranspose(const Array<const Vector *> &X,
-+                                     Array<Vector *> &Y) const
-+{
-+   // Set flag for transpose solve
-+   id->MUMPS_ICNTL(9) = 0;
-+   ArrayMult(X, Y);
 +
-+   // Reset the flag
-+   id->MUMPS_ICNTL(9) = 1;
++template <typename STRUMPACKSolverType>
++void STRUMPACKSolverBase<STRUMPACKSolverType>
++::EnableGPU()
++{
++   solver_->options().enable_gpu();
+ }
+ 
+-void STRUMPACKSolver::SetKrylovSolver( strumpack::KrylovSolver method )
++template <typename STRUMPACKSolverType>
++void STRUMPACKSolverBase<STRUMPACKSolverType>
++::DisableGPU()
+ {
+-   solver_->options().set_Krylov_solver( method );
++   solver_->options().disable_gpu();
  }
  
- void MUMPSSolver::SetPrintLevel(int print_lvl)
-@@ -256,34 +417,34 @@ void MUMPSSolver::SetMatrixSymType(MatType mtype)
-    mat_type = mtype;
+-void STRUMPACKSolver::SetReorderingStrategy( strumpack::ReorderingStrategy
+-                                             method )
++template <typename STRUMPACKSolverType>
++void STRUMPACKSolverBase<STRUMPACKSolverType>::
++SetKrylovSolver(strumpack::KrylovSolver method)
+ {
+-   solver_->options().set_reordering_method( method );
++   solver_->options().set_Krylov_solver(method);
  }
  
--MUMPSSolver::~MUMPSSolver()
-+void MUMPSSolver::SetReorderingStrategy(ReorderingStrategy method)
+-void STRUMPACKSolver::DisableMatching( )
++template <typename STRUMPACKSolverType>
++void STRUMPACKSolverBase<STRUMPACKSolverType>::
++SetReorderingStrategy(strumpack::ReorderingStrategy method)
  {
--   if (id)
--   {
--#if MFEM_MUMPS_VERSION >= 530
--      delete [] irhs_loc;
--#else
--      delete [] recv_counts;
--      delete [] displs;
--      delete [] rhs_glob;
--#endif
--      id->job = -2;
--      dmumps_c(id);
--      delete id;
--   }
-+   reorder_method = method;
+-#if STRUMPACK_VERSION_MAJOR >= 3
+-   solver_->options().set_matching( strumpack::MatchingJob::NONE );
++   solver_->options().set_reordering_method(method);
 +}
 +
-+void MUMPSSolver::SetReorderingReuse(bool reuse)
-+{
-+   reorder_reuse = reuse;
- }
- 
-+#if MFEM_MUMPS_VERSION >= 510
-+void MUMPSSolver::SetBLRTol(double tol)
++template <typename STRUMPACKSolverType>
++void STRUMPACKSolverBase<STRUMPACKSolverType>::
++SetMatching(strumpack::MatchingJob job)
 +{
-+   blr_tol = tol;
++   solver_->options().set_matching(job);
 +}
-+#endif
 +
- void MUMPSSolver::SetParameters()
- {
--   // output stream for error messages
-+   // Output stream for error messages
-    id->MUMPS_ICNTL(1) = 6;
--   // output stream for diagnosting printing local to each proc
--   id->MUMPS_ICNTL(2) = 6;
--   // output stream for global info
-+   // Output stream for diagnosting printing local to each proc
-+   id->MUMPS_ICNTL(2) = 0;
-+   // Output stream for global info
-    id->MUMPS_ICNTL(3) = 6;
-    // Level of error printing
-    id->MUMPS_ICNTL(4) = print_level;
--   //input matrix format (assembled)
-+   // Input matrix format (assembled)
-    id->MUMPS_ICNTL(5) = 0;
-    // Use A or A^T
-    id->MUMPS_ICNTL(9) = 1;
-@@ -301,7 +462,6 @@ void MUMPSSolver::SetParameters()
-    id->MUMPS_ICNTL(18) = 3;
-    // Schur complement (no Schur complement matrix returned)
-    id->MUMPS_ICNTL(19) = 0;
--
- #if MFEM_MUMPS_VERSION >= 530
-    // Distributed RHS
-    id->MUMPS_ICNTL(20) = 10;
-@@ -317,6 +477,53 @@ void MUMPSSolver::SetParameters()
-    id->MUMPS_ICNTL(22) = 0;
-    // Max size of working memory (default = based on estimates)
-    id->MUMPS_ICNTL(23) = 0;
-+   // Configure reordering
-+   switch (reorder_method)
++template <typename STRUMPACKSolverType>
++void STRUMPACKSolverBase<STRUMPACKSolverType>::
++SetCompression(strumpack::CompressionType type)
++{
++#if STRUMPACK_VERSION_MAJOR >= 5
++   solver_->options().set_compression(type);
+ #else
+-   solver_->options().set_mc64job( strumpack::MC64Job::NONE );
++   switch (type)
 +   {
-+      case ReorderingStrategy::AUTOMATIC:
-+         id->MUMPS_ICNTL(28) = 0;
-+         id->MUMPS_ICNTL(7) = 7;
-+         id->MUMPS_ICNTL(29) = 0;
-+         break;
-+      case ReorderingStrategy::AMD:
-+         id->MUMPS_ICNTL(28) = 1;
-+         id->MUMPS_ICNTL(7) = 0;
-+         break;
-+      case ReorderingStrategy::AMF:
-+         id->MUMPS_ICNTL(28) = 1;
-+         id->MUMPS_ICNTL(7) = 2;
-+         break;
-+      case ReorderingStrategy::PORD:
-+         id->MUMPS_ICNTL(28) = 1;
-+         id->MUMPS_ICNTL(7) = 4;
-+         break;
-+      case ReorderingStrategy::METIS:
-+         id->MUMPS_ICNTL(28) = 1;
-+         id->MUMPS_ICNTL(7) = 5;
-+         break;
-+      case ReorderingStrategy::PARMETIS:
-+         id->MUMPS_ICNTL(28) = 2;
-+         id->MUMPS_ICNTL(29) = 2;
++      case strumpack::NONE:
++         solver_->options().disable_BLR();
++         solver_->options().disable_HSS();
 +         break;
-+      case ReorderingStrategy::SCOTCH:
-+         id->MUMPS_ICNTL(28) = 1;
-+         id->MUMPS_ICNTL(7) = 3;
++      case strumpack::BLR:
++         solver_->options().enable_BLR();
 +         break;
-+      case ReorderingStrategy::PTSCOTCH:
-+         id->MUMPS_ICNTL(28) = 2;
-+         id->MUMPS_ICNTL(29) = 1;
++      case strumpack::HSS:
++         solver_->options().enable_HSS();
 +         break;
 +      default:
-+         break; // This should be unreachable
-+   }
-+   // Option to activate BLR factorization
-+#if MFEM_MUMPS_VERSION >= 510
-+   if (blr_tol > 0.0)
-+   {
-+      id->MUMPS_ICNTL(35) = 1;
-+      id->MUMPS_CNTL(7) = blr_tol;
++         MFEM_ABORT("Invalid compression type for STRUMPACK version " <<
++                    STRUMPACK_VERSION_MAJOR << "!");
++         break;
 +   }
-+#endif
+ #endif
  }
  
- #if MFEM_MUMPS_VERSION >= 530
-@@ -330,24 +537,23 @@ int MUMPSSolver::GetRowRank(int i, const Array<int> &row_starts_) const
-    return std::distance(row_starts_.begin(), up) - 1;
+-void STRUMPACKSolver::EnableMatching( )
++template <typename STRUMPACKSolverType>
++void STRUMPACKSolverBase<STRUMPACKSolverType>::
++SetCompressionRelTol(double rtol)
+ {
+-#if STRUMPACK_VERSION_MAJOR >= 3
+-   solver_->options().set_matching
+-   ( strumpack::MatchingJob::MAX_DIAGONAL_PRODUCT_SCALING );
++#if STRUMPACK_VERSION_MAJOR >= 5
++   solver_->options().set_compression_rel_tol(rtol);
+ #else
+-   solver_->options().set_mc64job
+-   ( strumpack::MC64Job::MAX_DIAGONAL_PRODUCT_SCALING );
++   solver_->options().BLR_options().set_rel_tol(rtol);
++   solver_->options().HSS_options().set_rel_tol(rtol);
+ #endif
  }
  
--void MUMPSSolver::RedistributeSol(const int * row_map,
--                                  const double * x, double * y) const
-+void MUMPSSolver::RedistributeSol(const int *rmap, const double *x,
-+                                  const int lx_loc, Array<Vector *> &Y) const
+-#if STRUMPACK_VERSION_MAJOR >= 3
+-void STRUMPACKSolver::EnableParallelMatching( )
++template <typename STRUMPACKSolverType>
++void STRUMPACKSolverBase<STRUMPACKSolverType>::
++SetCompressionAbsTol(double atol)
  {
--   int size = id->MUMPS_INFO(23);
--   int * send_count = new int[numProcs]();
--   for (int i = 0; i < size; i++)
-+   int *send_count = new int[numProcs]();
-+   for (int i = 0; i < lx_loc; i++)
-    {
--      int j = row_map[i] - 1;
-+      int j = rmap[i] - 1;
-       int row_rank = GetRowRank(j, row_starts);
-       if (myid == row_rank) { continue; }
-       send_count[row_rank]++;
-    }
- 
--   int * recv_count = new int[numProcs];
-+   int *recv_count = new int[numProcs];
-    MPI_Alltoall(send_count, 1, MPI_INT, recv_count, 1, MPI_INT, comm);
- 
--   int * send_displ = new int [numProcs]; send_displ[0] = 0;
--   int * recv_displ = new int [numProcs]; recv_displ[0] = 0;
-+   int *send_displ = new int[numProcs]; send_displ[0] = 0;
-+   int *recv_displ = new int[numProcs]; recv_displ[0] = 0;
-    int sbuff_size = send_count[numProcs-1];
-    int rbuff_size = recv_count[numProcs-1];
-    for (int k = 0; k < numProcs - 1; k++)
-@@ -358,54 +564,59 @@ void MUMPSSolver::RedistributeSol(const int * row_map,
-       rbuff_size += recv_count[k];
-    }
- 
--   int * sendbuf_index = new int[sbuff_size];
--   double * sendbuf_values = new double[sbuff_size];
--   int * soffs = new int[numProcs]();
-+   int *sendbuf_index = new int[sbuff_size];
-+   double *sendbuf_values = new double[sbuff_size];
-+   int *recvbuf_index = new int[rbuff_size];
-+   double *recvbuf_values = new double[rbuff_size];
-+   int *soffs = new int[numProcs]();
+-   solver_->options().set_matching
+-   ( strumpack::MatchingJob::COMBBLAS );
+-}
++#if STRUMPACK_VERSION_MAJOR >= 5
++   solver_->options().set_compression_abs_tol(atol);
++#else
++   solver_->options().BLR_options().set_abs_tol(atol);
++   solver_->options().HSS_options().set_abs_tol(atol);
+ #endif
++}
  
--   for (int i = 0; i < size; i++)
-+   for (int i = 0; i < lx_loc; i++)
-    {
--      int j = row_map[i] - 1;
-+      int j = rmap[i] - 1;
-       int row_rank = GetRowRank(j, row_starts);
--      if (myid == row_rank)
--      {
--         int local_index = j - row_start;
--         y[local_index] = x[i];
--      }
--      else
-+      if (myid != row_rank)
-       {
-          int k = send_displ[row_rank] + soffs[row_rank];
-          sendbuf_index[k] = j;
--         sendbuf_values[k] = x[i];
-          soffs[row_rank]++;
-       }
-    }
- 
--   int * recvbuf_index = new int[rbuff_size];
--   double * recvbuf_values = new double[rbuff_size];
--   MPI_Alltoallv(sendbuf_index,
--                 send_count,
--                 send_displ,
--                 MPI_INT,
--                 recvbuf_index,
--                 recv_count,
--                 recv_displ,
--                 MPI_INT,
--                 comm);
--   MPI_Alltoallv(sendbuf_values,
--                 send_count,
--                 send_displ,
--                 MPI_DOUBLE,
--                 recvbuf_values,
--                 recv_count,
--                 recv_displ,
--                 MPI_DOUBLE,
--                 comm);
--
--   // Unpack recv buffer
--   for (int i = 0; i < rbuff_size; i++)
-+   MPI_Alltoallv(sendbuf_index, send_count, send_displ, MPI_INT,
-+                 recvbuf_index, recv_count, recv_displ, MPI_INT, comm);
-+
-+   for (int rhs = 0; rhs < Y.Size(); rhs++)
-    {
--      int local_index = recvbuf_index[i] - row_start;
--      y[local_index] = recvbuf_values[i];
-+      MFEM_ASSERT(Y[rhs], "Missing Vector in MUMPSSolver::Mult!");
-+      Y[rhs]->HostWrite();
-+
-+      std::fill(soffs, soffs + numProcs, 0);
-+      for (int i = 0; i < lx_loc; i++)
-+      {
-+         int j = rmap[i] - 1;
-+         int row_rank = GetRowRank(j, row_starts);
-+         if (myid == row_rank)
-+         {
-+            int local_index = j - row_start;
-+            (*Y[rhs])(local_index) = x[rhs * lx_loc + i];
-+         }
-+         else
-+         {
-+            int k = send_displ[row_rank] + soffs[row_rank];
-+            sendbuf_values[k] = x[rhs * lx_loc + i];
-+            soffs[row_rank]++;
-+         }
-+      }
-+
-+      MPI_Alltoallv(sendbuf_values, send_count, send_displ, MPI_DOUBLE,
-+                    recvbuf_values, recv_count, recv_displ, MPI_DOUBLE, comm);
-+
-+      // Unpack recv buffer
-+      for (int i = 0; i < rbuff_size; i++)
-+      {
-+         int local_index = recvbuf_index[i] - row_start;
-+         (*Y[rhs])(local_index) = recvbuf_values[i];
-+      }
-    }
- 
-    delete [] recvbuf_values;
-diff --git a/linalg/mumps.hpp b/linalg/mumps.hpp
-index 43604f4cc..9fef9a292 100644
---- a/linalg/mumps.hpp
-+++ b/linalg/mumps.hpp
-@@ -16,12 +16,12 @@
- 
- #ifdef MFEM_USE_MUMPS
- #ifdef MFEM_USE_MPI
-+
- #include "operator.hpp"
- #include "hypre.hpp"
--
- #include <mpi.h>
-+
- #include "dmumps_c.h"
--#include <vector>
- 
- namespace mfem
- {
-@@ -31,20 +31,37 @@ namespace mfem
-  *
-  * Interface for the distributed MUMPS solver
-  */
--class MUMPSSolver : public mfem::Solver
-+class MUMPSSolver : public Solver
- {
- public:
-    enum MatType
-    {
-       UNSYMMETRIC = 0,
--      SYMMETRIC_INDEFINITE = 1,
--      SYMMETRIC_POSITIVE_DEFINITE = 2
-+      SYMMETRIC_POSITIVE_DEFINITE = 1,
-+      SYMMETRIC_INDEFINITE = 2
-+   };
-+
-+   enum ReorderingStrategy
-+   {
-+      AUTOMATIC = 0,
-+      AMD,
-+      AMF,
-+      PORD,
-+      METIS,
-+      PARMETIS,
-+      SCOTCH,
-+      PTSCOTCH
-    };
- 
-    /**
--    * @brief Default Constructor
-+    * @brief Constructor with MPI_Comm parameter.
-     */
--   MUMPSSolver() {}
-+   MUMPSSolver(MPI_Comm comm_);
-+
-+   /**
-+    * @brief Constructor with a HypreParMatrix Operator.
-+    */
-+   MUMPSSolver(const Operator &op);
- 
-    /**
-     * @brief Set the Operator and perform factorization
-@@ -62,6 +79,7 @@ public:
-     * @param y Solution vector
-     */
-    void Mult(const Vector &x, Vector &y) const;
-+   void ArrayMult(const Array<const Vector *> &X, Array<Vector *> &Y) const;
- 
-    /**
-     * @brief Transpose Solve y = Op^{-T} x.
-@@ -70,13 +88,15 @@ public:
-     * @param y Solution vector
-     */
-    void MultTranspose(const Vector &x, Vector &y) const;
-+   void ArrayMultTranspose(const Array<const Vector *> &X,
-+                           Array<Vector *> &Y) const;
- 
-    /**
-     * @brief Set the error print level for MUMPS
-     *
-     * @param print_lvl Print level
-     *
--    * @note This method has to be called before SetOperator.
-+    * @note This method has to be called before SetOperator
-     */
-    void SetPrintLevel(int print_lvl);
- 
-@@ -88,65 +108,109 @@ public:
-     *
-     * @param mtype Matrix type
-     *
--    * @note This method has to be called before SetOperator.
-+    * @note This method has to be called before SetOperator
-     */
-    void SetMatrixSymType(MatType mtype);
- 
-+   /**
-+    * @brief Set the reordering strategy
-+    *
-+    * Supported reorderings are: AUTOMATIC, AMD, AMF, PORD, METIS, PARMETIS,
-+    * SCOTCH, and PTSCOTCH
-+    *
-+    * @param method Reordering method
-+    *
-+    * @note This method has to be called before SetOperator
-+    */
-+   void SetReorderingStrategy(ReorderingStrategy method);
-+
-+   /**
-+    * @brief Set the flag controlling reuse of the symbolic factorization
-+    * for multiple operators
-+    *
-+    * @param reuse Flag to reuse symbolic factorization
-+    *
-+    * @note This method has to be called before repeated calls to SetOperator
-+    */
-+   void SetReorderingReuse(bool reuse);
-+
-+   /**
-+    * @brief Set the tolerance for activating block low-rank (BLR) approximate
-+    * factorization
-+    *
-+    * @param tol Tolerance
-+    *
-+    * @note This method has to be called before SetOperator
-+    */
-+#if MFEM_MUMPS_VERSION >= 510
-+   void SetBLRTol(double tol);
-+#endif
-+
-    // Destructor
-    ~MUMPSSolver();
- 
- private:
--
-    // MPI communicator
-    MPI_Comm comm;
- 
-    // Number of procs
-    int numProcs;
- 
--   // local mpi id
-+   // MPI rank
-    int myid;
- 
--   // parameter controlling the matrix type
--   MatType mat_type = MatType::UNSYMMETRIC;
-+   // Parameter controlling the matrix type
-+   MatType mat_type;
-+
-+   // Parameter controlling the printing level
-+   int print_level;
-+
-+   // Parameter controlling the reordering strategy
-+   ReorderingStrategy reorder_method;
-+
-+   // Parameter controlling whether or not to reuse the symbolic factorization
-+   // for multiple calls to SetOperator
-+   bool reorder_reuse;
- 
--   // parameter controlling the printing level
--   int print_level = 0;
-+#if MFEM_MUMPS_VERSION >= 510
-+   // Parameter controlling the Block Low-Rank (BLR) feature in MUMPS
-+   double blr_tol;
-+#endif
- 
--   // local row offsets
-+   // Local row offsets
-    int row_start;
- 
-    // MUMPS object
--   DMUMPS_STRUC_C *id=nullptr;
-+   DMUMPS_STRUC_C *id;
-+
-+   // Method for initialization
-+   void Init(MPI_Comm comm_);
- 
-    // Method for setting MUMPS internal parameters
-    void SetParameters();
- 
--#if MFEM_MUMPS_VERSION >= 530
-+   // Method for configuring storage for distributed/centralized RHS and
-+   // solution
-+   void InitRhsSol(int nrhs) const;
- 
--   // row offsets array on all procs
-+#if MFEM_MUMPS_VERSION >= 530
-+   // Row offests array on all procs
-    Array<int> row_starts;
- 
--   // row map
--   int * irhs_loc = nullptr;
-+   // Row maps and storage for distributed RHS and solution
-+   int *irhs_loc, *isol_loc;
-+   mutable double *rhs_loc, *sol_loc;
- 
-    // These two methods are needed to distribute the local solution
-    // vectors returned by MUMPS to the original MFEM parallel partition
-    int GetRowRank(int i, const Array<int> &row_starts_) const;
--
--   void RedistributeSol(const int * row_map,
--                        const double * x,
--                        double * y) const;
-+   void RedistributeSol(const int *rmap, const double *x, const int lx_loc,
-+                        Array<Vector *> &Y) const;
- #else
--
--   // Arrays needed for MPI_Gather and MPI_Scatter
--   int * recv_counts = nullptr;
--
--   int * displs = nullptr;
--
--   double * rhs_glob = nullptr;
--
-+   // Arrays needed for MPI_Gatherv and MPI_Scatterv
-+   int *recv_counts, *displs;
-+   mutable double *rhs_glob;
- #endif
--
- }; // mfem::MUMPSSolver class
- 
- } // namespace mfem
-diff --git a/linalg/strumpack.cpp b/linalg/strumpack.cpp
-index f0ff11ab4..5b54994aa 100644
---- a/linalg/strumpack.cpp
-+++ b/linalg/strumpack.cpp
-@@ -16,238 +16,470 @@
- 
- #include "strumpack.hpp"
- 
--using namespace std;
--using namespace strumpack;
--
- namespace mfem
- {
- 
- STRUMPACKRowLocMatrix::STRUMPACKRowLocMatrix(MPI_Comm comm,
--                                             int num_loc_rows, int first_loc_row,
--                                             int glob_nrows, int glob_ncols,
--                                             int *I, int *J, double *data)
--   : comm_(comm), A_(NULL)
-+                                             int num_loc_rows,
-+                                             HYPRE_BigInt first_loc_row,
-+                                             HYPRE_BigInt glob_nrows,
-+                                             HYPRE_BigInt glob_ncols,
-+                                             int *I, HYPRE_BigInt *J,
-+                                             double *data, bool sym_sparse)
- {
-    // Set mfem::Operator member data
-    height = num_loc_rows;
-    width  = num_loc_rows;
- 
--   // Allocate STRUMPACK's CSRMatrixMPI
--   int nprocs, rank;
--   MPI_Comm_rank(comm_, &rank);
--   MPI_Comm_size(comm_, &nprocs);
--   int * dist = new int[nprocs + 1];
--   dist[rank + 1] = first_loc_row + num_loc_rows;
-+   // Allocate STRUMPACK's CSRMatrixMPI (copies all inputs)
-+   int rank, nprocs;
-+   MPI_Comm_rank(comm, &rank);
-+   MPI_Comm_size(comm, &nprocs);
-+   Array<HYPRE_BigInt> dist(nprocs + 1);
-    dist[0] = 0;
--   MPI_Allgather(MPI_IN_PLACE, 0, MPI_INT, dist + 1, 1, MPI_INT, comm_);
--   A_ = new CSRMatrixMPI<double,int>(num_loc_rows, I, J, data, dist, comm_, false);
--   delete[] dist;
-+   dist[rank + 1] = first_loc_row + (HYPRE_BigInt)num_loc_rows;
-+   MPI_Allgather(MPI_IN_PLACE, 0, MPI_DATATYPE_NULL,
-+                 dist.GetData() + 1, 1, HYPRE_MPI_BIG_INT, comm);
-+
-+#if !(defined(HYPRE_BIGINT) || defined(HYPRE_MIXEDINT))
-+   A_ = new strumpack::CSRMatrixMPI<double, HYPRE_BigInt>(
-+      (HYPRE_BigInt)num_loc_rows, I, J, data, dist.GetData(),
-+      comm, sym_sparse);
-+#else
-+   Array<HYPRE_BigInt> II(num_loc_rows+1);
-+   for (int i = 0; i <= num_loc_rows; i++) { II[i] = (HYPRE_BigInt)I[i]; }
-+   A_ = new strumpack::CSRMatrixMPI<double, HYPRE_BigInt>(
-+      (HYPRE_BigInt)num_loc_rows, II.GetData(), J, data, dist.GetData(),
-+      comm, sym_sparse);
-+#endif
- }
- 
--STRUMPACKRowLocMatrix::STRUMPACKRowLocMatrix(const HypreParMatrix & hypParMat)
--   : comm_(hypParMat.GetComm()),
--     A_(NULL)
-+STRUMPACKRowLocMatrix::STRUMPACKRowLocMatrix(const Operator &op,
-+                                             bool sym_sparse)
- {
--   // First cast the parameter to a hypre_ParCSRMatrix
--   hypre_ParCSRMatrix * parcsr_op =
--      (hypre_ParCSRMatrix *)const_cast<HypreParMatrix&>(hypParMat);
-+   const HypreParMatrix *APtr = dynamic_cast<const HypreParMatrix *>(&op);
-+   MFEM_VERIFY(APtr, "Not a compatible matrix type");
-+   MPI_Comm comm = APtr->GetComm();
- 
--   MFEM_ASSERT(parcsr_op != NULL,"STRUMPACK: const_cast failed in SetOperator");
-+   // Set mfem::Operator member data
-+   height = op.Height();
-+   width  = op.Width();
- 
--   // Create the CSRMatrixMPI A_ by borrowing the internal data from a
--   // hypre_CSRMatrix.
--   hypParMat.HostRead();
--   hypre_CSRMatrix * csr_op = hypre_MergeDiagAndOffd(parcsr_op);
--   hypParMat.HypreRead();
--   hypre_CSRMatrixSetDataOwner(csr_op,0);
-+   // First cast the parameter to a hypre_ParCSRMatrix
-+   hypre_ParCSRMatrix *parcsr_op =
-+      (hypre_ParCSRMatrix *)const_cast<HypreParMatrix &>(*APtr);
-+
-+   // Create the CSRMatrixMPI A by taking the internal data from a
-+   // hypre_CSRMatrix
-+   APtr->HostRead();
-+   hypre_CSRMatrix *csr_op = hypre_MergeDiagAndOffd(parcsr_op);
-+   APtr->HypreRead();
-+   HYPRE_Int       *Iptr   = csr_op->i;
- #if MFEM_HYPRE_VERSION >= 21600
--   // For now, this method assumes that HYPRE_Int is int. Also, csr_op->num_cols
--   // is of type HYPRE_Int, so if we want to check for big indices in
--   // csr_op->big_j, we'll have to check all entries and that check will only be
--   // necessary in HYPRE_MIXEDINT mode which is not supported at the moment.
--   hypre_CSRMatrixBigJtoJ(csr_op);
-+   HYPRE_BigInt    *Jptr   = csr_op->big_j;
-+#else
-+   HYPRE_Int       *Jptr   = csr_op->j;
- #endif
-+   double          *data   = csr_op->data;
- 
--   height = csr_op->num_rows;
--   width  = csr_op->num_rows;
-+   HYPRE_BigInt fst_row = parcsr_op->first_row_index;
-+   HYPRE_Int    m_loc   = csr_op->num_rows;
- 
--   int nprocs, rank;
--   MPI_Comm_rank(comm_, &rank);
--   MPI_Comm_size(comm_, &nprocs);
--   int * dist = new int[nprocs + 1];
--   dist[rank + 1] = parcsr_op->first_row_index + csr_op->num_rows;
-+   // Allocate STRUMPACK's CSRMatrixMPI
-+   int rank, nprocs;
-+   MPI_Comm_rank(comm, &rank);
-+   MPI_Comm_size(comm, &nprocs);
-+   Array<HYPRE_BigInt> dist(nprocs + 1);
-    dist[0] = 0;
--   MPI_Allgather(MPI_IN_PLACE, 0, MPI_INT, dist + 1, 1, MPI_INT, comm_);
--   A_ = new CSRMatrixMPI<double,int>(csr_op->num_rows, csr_op->i, csr_op->j,
--                                     csr_op->data, dist, comm_, false);
--   delete[] dist;
-+   dist[rank + 1] = fst_row + (HYPRE_BigInt)m_loc;
-+   MPI_Allgather(MPI_IN_PLACE, 0, MPI_DATATYPE_NULL,
-+                 dist.GetData() + 1, 1, HYPRE_MPI_BIG_INT, comm);
-+
-+#if !defined(HYPRE_MIXEDINT)
-+   A_ = new strumpack::CSRMatrixMPI<double, HYPRE_BigInt>(
-+      (HYPRE_BigInt)m_loc, Iptr, Jptr, data, dist.GetData(),
-+      comm, sym_sparse);
-+#else
-+   Array<HYPRE_BigInt> II(m_loc+1);
-+   for (int i = 0; i <= m_loc; i++) { II[i] = (HYPRE_BigInt)Iptr[i]; }
-+   A_ = new strumpack::CSRMatrixMPI<double, HYPRE_BigInt>(
-+      (HYPRE_BigInt)m_loc, II.GetData(), Jptr, data, dist.GetData(),
-+      comm, sym_sparse);
-+#endif
- 
--   // Everything has been copied or abducted so delete the structure
-+   // Everything has been copied so delete the structure
-    hypre_CSRMatrixDestroy(csr_op);
- }
- 
- STRUMPACKRowLocMatrix::~STRUMPACKRowLocMatrix()
- {
--   // Delete the struct
--   if ( A_ != NULL ) { delete A_; }
-+   delete A_;
- }
- 
--STRUMPACKSolver::STRUMPACKSolver( int argc, char* argv[], MPI_Comm comm )
--   : comm_(comm),
--     APtr_(NULL),
--     solver_(NULL)
-+template <typename STRUMPACKSolverType>
-+STRUMPACKSolverBase<STRUMPACKSolverType>::
-+STRUMPACKSolverBase(MPI_Comm comm, int argc, char *argv[])
-+   : APtr_(NULL),
-+     factor_verbose_(false),
-+     solve_verbose_(false),
-+     reorder_reuse_(false),
-+     nrhs_(-1)
- {
--   this->Init(argc, argv);
-+   solver_ = new STRUMPACKSolverType(comm, argc, argv, false);
- }
- 
--STRUMPACKSolver::STRUMPACKSolver( STRUMPACKRowLocMatrix & A )
--   : comm_(A.GetComm()),
--     APtr_(&A),
--     solver_(NULL)
-+template <typename STRUMPACKSolverType>
-+STRUMPACKSolverBase<STRUMPACKSolverType>::
-+STRUMPACKSolverBase(STRUMPACKRowLocMatrix &A, int argc, char *argv[])
-+   : APtr_(&A),
-+     factor_verbose_(false),
-+     solve_verbose_(false),
-+     reorder_reuse_(false),
-+     nrhs_(-1)
- {
--   height = A.Height();
--   width  = A.Width();
-+   solver_ = new STRUMPACKSolverType(A.GetComm(), argc, argv, false);
-+   SetOperator(A);
-+}
- 
--   this->Init(0, NULL);
-+template <typename STRUMPACKSolverType>
-+STRUMPACKSolverBase<STRUMPACKSolverType>::
-+~STRUMPACKSolverBase()
-+{
-+   delete solver_;
- }
- 
--STRUMPACKSolver::~STRUMPACKSolver()
-+template <typename STRUMPACKSolverType>
-+void STRUMPACKSolverBase<STRUMPACKSolverType>::
-+SetFromCommandLine()
- {
--   if ( solver_ != NULL ) { delete solver_; }
-+   solver_->options().set_from_command_line();
- }
- 
--void STRUMPACKSolver::Init( int argc, char* argv[] )
-+template <typename STRUMPACKSolverType>
-+void STRUMPACKSolverBase<STRUMPACKSolverType>::
-+SetPrintFactorStatistics(bool print_stat)
- {
--   MPI_Comm_size(comm_, &numProcs_);
--   MPI_Comm_rank(comm_, &myid_);
-+   factor_verbose_ = print_stat;
-+}
- 
--   factor_verbose_ = false;
--   solve_verbose_ = false;
-+template <typename STRUMPACKSolverType>
-+void STRUMPACKSolverBase<STRUMPACKSolverType>::
-+SetPrintSolveStatistics(bool print_stat)
-+{
-+   solve_verbose_ = print_stat;
-+}
- 
--   solver_ = new StrumpackSparseSolverMPIDist<double,int>(comm_, argc, argv,
--                                                          false);
-+template <typename STRUMPACKSolverType>
-+void STRUMPACKSolverBase<STRUMPACKSolverType>
-+::SetRelTol(double rtol)
-+{
-+   solver_->options().set_rel_tol(rtol);
- }
- 
--void STRUMPACKSolver::SetFromCommandLine( )
-+template <typename STRUMPACKSolverType>
-+void STRUMPACKSolverBase<STRUMPACKSolverType>
-+::SetAbsTol(double atol)
- {
--   solver_->options().set_from_command_line( );
-+   solver_->options().set_abs_tol(atol);
- }
- 
--void STRUMPACKSolver::SetPrintFactorStatistics( bool print_stat )
-+template <typename STRUMPACKSolverType>
-+void STRUMPACKSolverBase<STRUMPACKSolverType>
-+::SetMaxIter(int max_it)
- {
--   factor_verbose_ = print_stat;
-+   solver_->options().set_maxit(max_it);
- }
- 
--void STRUMPACKSolver::SetPrintSolveStatistics( bool print_stat )
-+template <typename STRUMPACKSolverType>
-+void STRUMPACKSolverBase<STRUMPACKSolverType>
-+::SetReorderingReuse(bool reuse)
- {
--   solve_verbose_ = print_stat;
-+   reorder_reuse_ = reuse;
-+}
-+
-+template <typename STRUMPACKSolverType>
-+void STRUMPACKSolverBase<STRUMPACKSolverType>
-+::EnableGPU()
-+{
-+   solver_->options().enable_gpu();
- }
- 
--void STRUMPACKSolver::SetKrylovSolver( strumpack::KrylovSolver method )
-+template <typename STRUMPACKSolverType>
-+void STRUMPACKSolverBase<STRUMPACKSolverType>
-+::DisableGPU()
- {
--   solver_->options().set_Krylov_solver( method );
-+   solver_->options().disable_gpu();
- }
- 
--void STRUMPACKSolver::SetReorderingStrategy( strumpack::ReorderingStrategy
--                                             method )
-+template <typename STRUMPACKSolverType>
-+void STRUMPACKSolverBase<STRUMPACKSolverType>::
-+SetKrylovSolver(strumpack::KrylovSolver method)
- {
--   solver_->options().set_reordering_method( method );
-+   solver_->options().set_Krylov_solver(method);
- }
- 
--void STRUMPACKSolver::DisableMatching( )
-+template <typename STRUMPACKSolverType>
-+void STRUMPACKSolverBase<STRUMPACKSolverType>::
-+SetReorderingStrategy(strumpack::ReorderingStrategy method)
- {
--#if STRUMPACK_VERSION_MAJOR >= 3
--   solver_->options().set_matching( strumpack::MatchingJob::NONE );
-+   solver_->options().set_reordering_method(method);
-+}
-+
-+template <typename STRUMPACKSolverType>
-+void STRUMPACKSolverBase<STRUMPACKSolverType>::
-+SetMatching(strumpack::MatchingJob job)
-+{
-+   solver_->options().set_matching(job);
-+}
-+
-+template <typename STRUMPACKSolverType>
-+void STRUMPACKSolverBase<STRUMPACKSolverType>::
-+SetCompression(strumpack::CompressionType type)
-+{
-+#if STRUMPACK_VERSION_MAJOR >= 5
-+   solver_->options().set_compression(type);
- #else
--   solver_->options().set_mc64job( strumpack::MC64Job::NONE );
-+   switch (type)
-+   {
-+      case strumpack::NONE:
-+         solver_->options().disable_BLR();
-+         solver_->options().disable_HSS();
-+         break;
-+      case strumpack::BLR:
-+         solver_->options().enable_BLR();
-+         break;
-+      case strumpack::HSS:
-+         solver_->options().enable_HSS();
-+         break;
-+      default:
-+         MFEM_ABORT("Invalid compression type for STRUMPACK version " <<
-+                    STRUMPACK_VERSION_MAJOR << "!");
-+         break;
-+   }
- #endif
- }
- 
--void STRUMPACKSolver::EnableMatching( )
-+template <typename STRUMPACKSolverType>
-+void STRUMPACKSolverBase<STRUMPACKSolverType>::
-+SetCompressionRelTol(double rtol)
- {
--#if STRUMPACK_VERSION_MAJOR >= 3
--   solver_->options().set_matching
--   ( strumpack::MatchingJob::MAX_DIAGONAL_PRODUCT_SCALING );
-+#if STRUMPACK_VERSION_MAJOR >= 5
-+   solver_->options().set_compression_rel_tol(rtol);
- #else
--   solver_->options().set_mc64job
--   ( strumpack::MC64Job::MAX_DIAGONAL_PRODUCT_SCALING );
-+   solver_->options().BLR_options().set_rel_tol(rtol);
-+   solver_->options().HSS_options().set_rel_tol(rtol);
- #endif
- }
- 
--#if STRUMPACK_VERSION_MAJOR >= 3
--void STRUMPACKSolver::EnableParallelMatching( )
-+template <typename STRUMPACKSolverType>
-+void STRUMPACKSolverBase<STRUMPACKSolverType>::
-+SetCompressionAbsTol(double atol)
- {
--   solver_->options().set_matching
--   ( strumpack::MatchingJob::COMBBLAS );
--}
-+#if STRUMPACK_VERSION_MAJOR >= 5
-+   solver_->options().set_compression_abs_tol(atol);
-+#else
-+   solver_->options().BLR_options().set_abs_tol(atol);
-+   solver_->options().HSS_options().set_abs_tol(atol);
- #endif
-+}
- 
--void STRUMPACKSolver::SetRelTol( double rtol )
-+#if STRUMPACK_VERSION_MAJOR >= 5
-+template <typename STRUMPACKSolverType>
-+void STRUMPACKSolverBase<STRUMPACKSolverType>::
-+SetCompressionLossyPrecision(int precision)
- {
--   solver_->options().set_rel_tol( rtol );
-+   solver_->options().set_lossy_precision(precision);
- }
- 
--void STRUMPACKSolver::SetAbsTol( double atol )
-+template <typename STRUMPACKSolverType>
-+void STRUMPACKSolverBase<STRUMPACKSolverType>::
-+SetCompressionButterflyLevels(int levels)
- {
--   solver_->options().set_abs_tol( atol );
-+   solver_->options().HODLR_options().set_butterfly_levels(levels);
- }
-+#endif
- 
--
--void STRUMPACKSolver::Mult( const Vector & x, Vector & y ) const
-+template <typename STRUMPACKSolverType>
-+void STRUMPACKSolverBase<STRUMPACKSolverType>::
-+SetOperator(const Operator &op)
- {
--   MFEM_ASSERT(APtr_ != NULL,
--               "STRUMPACK Error: The operator must be set before"
--               " the system can be solved.");
--   MFEM_ASSERT(x.Size() == Width(), "invalid x.Size() = " << x.Size()
--               << ", expected size = " << Width());
--   MFEM_ASSERT(y.Size() == Height(), "invalid y.Size() = " << y.Size()
--               << ", expected size = " << Height());
-+   // Verify that we have a compatible operator
-+   bool first_mat = !APtr_;
-+   APtr_ = dynamic_cast<const STRUMPACKRowLocMatrix *>(&op);
-+   MFEM_VERIFY(APtr_,
-+               "STRUMPACK: Operator is not a STRUMPACKRowLocMatrix!");
- 
--   double*  yPtr = y.HostWrite();
--   const double*  xPtr = x.HostRead();
-+   // Set mfem::Operator member data
-+   height = op.Height();
-+   width  = op.Width();
- 
--   solver_->options().set_verbose( factor_verbose_ );
--   ReturnCode ret = solver_->factor();
--   switch (ret)
-+   if (first_mat || !reorder_reuse_)
-    {
--      case ReturnCode::SUCCESS: break;
--      case ReturnCode::MATRIX_NOT_SET:
--      {
--         MFEM_ABORT("STRUMPACK:  Matrix was not set!");
--      }
--      break;
--      case ReturnCode::REORDERING_ERROR:
--      {
--         MFEM_ABORT("STRUMPACK:  Matrix reordering failed!");
--      }
--      break;
--      default:
--      {
--         MFEM_ABORT("STRUMPACK: 'factor()' error code = " << ret);
--      }
-+      solver_->set_matrix(*(APtr_->GetA()));
-+   }
-+   else
-+   {
-+      solver_->update_matrix_values(*(APtr_->GetA()));
-    }
--   solver_->options().set_verbose( solve_verbose_ );
--   solver_->solve(xPtr, yPtr);
-+}
- 
-+template <typename STRUMPACKSolverType>
-+void STRUMPACKSolverBase<STRUMPACKSolverType>::
-+FactorInternal() const
-+{
-+   MFEM_ASSERT(APtr_,
-+               "STRUMPACK: Operator must be set before the system can be "
-+               "solved!");
-+   solver_->options().set_verbose(factor_verbose_);
-+   strumpack::ReturnCode ret = solver_->factor();
-+   if (ret != strumpack::ReturnCode::SUCCESS)
-+   {
-+#if STRUMPACK_VERSION_MAJOR >= 7
-+      MFEM_ABORT("STRUMPACK: Factor failed with return code " << ret << "!");
-+#else
-+      MFEM_ABORT("STRUMPACK: Factor failed!");
-+#endif
-+   }
- }
- 
--void STRUMPACKSolver::SetOperator( const Operator & op )
-+template <typename STRUMPACKSolverType>
-+void STRUMPACKSolverBase<STRUMPACKSolverType>::
-+Mult(const Vector &x, Vector &y) const
- {
--   // Verify that we have a compatible operator
--   APtr_ = dynamic_cast<const STRUMPACKRowLocMatrix*>(&op);
--   if ( APtr_ == NULL )
-+   MFEM_ASSERT(x.Size() == Width(),
-+               "STRUMPACK: Invalid x.Size() = " << x.Size() <<
-+               ", expected size = " << Width() << "!");
-+   MFEM_ASSERT(y.Size() == Height(),
-+               "STRUMPACK: Invalid y.Size() = " << y.Size() <<
-+               ", expected size = " << Height() << "!");
-+
-+   const double *xPtr = x.HostRead();
-+   double *yPtr       = y.HostReadWrite();
-+
-+   FactorInternal();
-+   solver_->options().set_verbose(solve_verbose_);
-+   strumpack::ReturnCode ret = solver_->solve(xPtr, yPtr, false);
-+   if (ret != strumpack::ReturnCode::SUCCESS)
-    {
--      mfem_error("STRUMPACKSolver::SetOperator : not STRUMPACKRowLocMatrix!");
-+#if STRUMPACK_VERSION_MAJOR >= 7
-+      MFEM_ABORT("STRUMPACK: Solve failed with return code " << ret << "!");
-+#else
-+      MFEM_ABORT("STRUMPACK: Solve failed!");
-+#endif
-    }
-+}
- 
--   solver_->set_matrix( *(APtr_->getA()) );
-+template <typename STRUMPACKSolverType>
-+void STRUMPACKSolverBase<STRUMPACKSolverType>::
-+ArrayMult(const Array<const Vector *> &X, Array<Vector *> &Y) const
-+{
-+   MFEM_ASSERT(X.Size() == Y.Size(),
-+               "Number of columns mismatch in STRUMPACK solve!");
-+   if (X.Size() == 1)
-+   {
-+      nrhs_ = 1;
-+      MFEM_ASSERT(X[0] && Y[0], "Missing Vector in STRUMPACK solve!");
-+      Mult(*X[0], *Y[0]);
-+      return;
-+   }
- 
--   // Set mfem::Operator member data
--   height = op.Height();
--   width  = op.Width();
-+   // Multiple RHS case
-+   int ldx = Height();
-+   if (nrhs_ != X.Size())
-+   {
-+      rhs_.SetSize(X.Size() * ldx);
-+      sol_.SetSize(X.Size() * ldx);
-+      nrhs_ = X.Size();
-+   }
-+   for (int i = 0; i < nrhs_; i++)
-+   {
-+      MFEM_ASSERT(X[i] && X[i]->Size() == Width(),
-+                  "STRUMPACK: Missing or invalid sized RHS Vector in solve!");
-+      Vector s(rhs_, i * ldx, ldx);
-+      s = *X[i];
-+   }
-+   const double *xPtr = rhs_.HostRead();
-+   double *yPtr       = sol_.HostReadWrite();
-+
-+   FactorInternal();
-+   solver_->options().set_verbose(solve_verbose_);
-+   strumpack::ReturnCode ret = solver_->solve(nrhs_, xPtr, ldx, yPtr, ldx,
-+                                              false);
-+   if (ret != strumpack::ReturnCode::SUCCESS)
-+   {
-+#if STRUMPACK_VERSION_MAJOR >= 7
-+      MFEM_ABORT("STRUMPACK: Solve failed with return code " << ret << "!");
-+#else
-+      MFEM_ABORT("STRUMPACK: Solve failed!");
-+#endif
-+   }
- 
-+   for (int i = 0; i < nrhs_; i++)
-+   {
-+      MFEM_ASSERT(Y[i] && Y[i]->Size() == Width(),
-+                  "STRUMPACK: Missing or invalid sized solution Vector in solve!");
-+      Vector s(sol_, i * ldx, ldx);
-+      *Y[i] = s;
-+   }
- }
- 
-+STRUMPACKSolver::
-+STRUMPACKSolver(MPI_Comm comm)
-+   : STRUMPACKSolverBase<strumpack::
-+     SparseSolverMPIDist<double, HYPRE_BigInt>>
-+     (comm, 0, NULL) {}
-+
-+STRUMPACKSolver::
-+STRUMPACKSolver(STRUMPACKRowLocMatrix &A)
-+   : STRUMPACKSolverBase<strumpack::
-+     SparseSolverMPIDist<double, HYPRE_BigInt>>
-+     (A, 0, NULL) {}
-+
-+STRUMPACKSolver::
-+STRUMPACKSolver(MPI_Comm comm, int argc, char *argv[])
-+   : STRUMPACKSolverBase<strumpack::
-+     SparseSolverMPIDist<double, HYPRE_BigInt>>
-+     (comm, argc, argv) {}
-+
-+STRUMPACKSolver::
-+STRUMPACKSolver(STRUMPACKRowLocMatrix &A, int argc, char *argv[])
-+   : STRUMPACKSolverBase<strumpack::
-+     SparseSolverMPIDist<double, HYPRE_BigInt>>
-+     (A, argc, argv) {}
-+
-+#if STRUMPACK_VERSION_MAJOR >= 7
-+STRUMPACKMixedPrecisionSolver::
-+STRUMPACKMixedPrecisionSolver(MPI_Comm comm)
-+   : STRUMPACKSolverBase<strumpack::
-+     SparseSolverMixedPrecisionMPIDist<float, double, HYPRE_BigInt>>
-+     (comm, 0, NULL) {}
-+
-+STRUMPACKMixedPrecisionSolver::
-+STRUMPACKMixedPrecisionSolver(STRUMPACKRowLocMatrix &A)
-+   : STRUMPACKSolverBase<strumpack::
-+     SparseSolverMixedPrecisionMPIDist<float, double, HYPRE_BigInt>>
-+     (A, 0, NULL) {}
-+
-+STRUMPACKMixedPrecisionSolver::
-+STRUMPACKMixedPrecisionSolver(MPI_Comm comm, int argc, char *argv[])
-+   : STRUMPACKSolverBase<strumpack::
-+     SparseSolverMixedPrecisionMPIDist<float, double, HYPRE_BigInt>>
-+     (comm, argc, argv) {}
-+
-+STRUMPACKMixedPrecisionSolver::
-+STRUMPACKMixedPrecisionSolver(STRUMPACKRowLocMatrix &A, int argc, char *argv[])
-+   : STRUMPACKSolverBase<strumpack::
-+     SparseSolverMixedPrecisionMPIDist<float, double, HYPRE_BigInt>>
-+     (A, argc, argv) {}
-+#endif
-+
-+template class STRUMPACKSolverBase<strumpack::
-+                                   SparseSolverMPIDist<double, HYPRE_BigInt>>;
-+#if STRUMPACK_VERSION_MAJOR >= 7
-+template class STRUMPACKSolverBase<strumpack::
-+                                   SparseSolverMixedPrecisionMPIDist<float, double, HYPRE_BigInt>>;
-+#endif
-+
- } // mfem namespace
- 
- #endif // MFEM_USE_MPI
-diff --git a/linalg/strumpack.hpp b/linalg/strumpack.hpp
-index 300b8415e..6a8ac4c30 100644
---- a/linalg/strumpack.hpp
-+++ b/linalg/strumpack.hpp
-@@ -16,12 +16,14 @@
- 
- #ifdef MFEM_USE_STRUMPACK
- #ifdef MFEM_USE_MPI
-+
- #include "operator.hpp"
- #include "hypre.hpp"
--
- #include <mpi.h>
- 
-+// STRUMPACK headers
- #include "StrumpackSparseSolverMPIDist.hpp"
-+#include "StrumpackSparseSolverMixedPrecisionMPIDist.hpp"
- 
- namespace mfem
- {
-@@ -34,63 +36,80 @@ public:
-        be of size (local) nrows by (global) glob_ncols. The new parallel matrix
-        contains copies of all input arrays (so they can be deleted). */
-    STRUMPACKRowLocMatrix(MPI_Comm comm,
--                         int num_loc_rows, int first_loc_row,
--                         int glob_nrows, int glob_ncols,
--                         int *I, int *J, double *data);
-+                         int num_loc_rows, HYPRE_BigInt first_loc_row,
-+                         HYPRE_BigInt glob_nrows, HYPRE_BigInt glob_ncols,
-+                         int *I, HYPRE_BigInt *J, double *data,
-+                         bool sym_sparse = false);
- 
-    /** Creates a copy of the parallel matrix hypParMat in STRUMPACK's RowLoc
-        format. All data is copied so the original matrix may be deleted. */
--   STRUMPACKRowLocMatrix(const HypreParMatrix & hypParMat);
-+   STRUMPACKRowLocMatrix(const Operator &op, bool sym_sparse = false);
- 
-    ~STRUMPACKRowLocMatrix();
- 
-    void Mult(const Vector &x, Vector &y) const
-    {
--      mfem_error("STRUMPACKRowLocMatrix::Mult(...)\n"
--                 "  matrix vector products are not supported.");
-+      MFEM_ABORT("STRUMPACKRowLocMatrix::Mult: Matrix vector products are not "
-+                 "supported!");
-    }
- 
--   MPI_Comm GetComm() const { return comm_; }
-+   MPI_Comm GetComm() const { return A_->comm(); }
- 
--   strumpack::CSRMatrixMPI<double,int>* getA() const { return A_; }
-+   strumpack::CSRMatrixMPI<double, HYPRE_BigInt> *GetA() const { return A_; }
- 
- private:
--   MPI_Comm   comm_;
--   strumpack::CSRMatrixMPI<double,int>* A_;
--
--}; // mfem::STRUMPACKRowLocMatrix
-+   strumpack::CSRMatrixMPI<double, HYPRE_BigInt> *A_;
-+};
- 
- /** The MFEM STRUMPACK Direct Solver class.
- 
-     The mfem::STRUMPACKSolver class uses the STRUMPACK library to perform LU
-     factorization of a parallel sparse matrix. The solver is capable of handling
--    double precision types. See http://portal.nersc.gov/project/sparse/strumpack
-+    double precision types. See
-+    http://portal.nersc.gov/project/sparse/strumpack/.
- */
--class STRUMPACKSolver : public mfem::Solver
-+template <typename STRUMPACKSolverType>
-+class STRUMPACKSolverBase : public Solver
- {
--public:
--   // Constructor with MPI_Comm parameter.
--   STRUMPACKSolver( int argc, char* argv[], MPI_Comm comm );
-+protected:
-+   // Constructor with MPI_Comm parameter and command line arguments.
-+   STRUMPACKSolverBase(MPI_Comm comm, int argc, char *argv[]);
- 
--   // Constructor with STRUMPACK Matrix Object.
--   STRUMPACKSolver( STRUMPACKRowLocMatrix & A);
-+   // Constructor with STRUMPACK matrix object and command line arguments.
-+   STRUMPACKSolverBase(STRUMPACKRowLocMatrix &A, int argc, char *argv[]);
- 
-+public:
-    // Default destructor.
--   ~STRUMPACKSolver( void );
-+   virtual ~STRUMPACKSolverBase();
- 
-    // Factor and solve the linear system y = Op^{-1} x.
--   void Mult( const Vector & x, Vector & y ) const;
-+   void Mult(const Vector &x, Vector &y) const;
-+   void ArrayMult(const Array<const Vector *> &X, Array<Vector *> &Y) const;
- 
-    // Set the operator.
--   void SetOperator( const Operator & op );
-+   void SetOperator(const Operator &op);
- 
-    // Set various solver options. Refer to STRUMPACK documentation for
-    // details.
--   void SetFromCommandLine( );
--   void SetPrintFactorStatistics( bool print_stat );
--   void SetPrintSolveStatistics( bool print_stat );
--   void SetRelTol( double rtol );
--   void SetAbsTol( double atol );
-+   void SetFromCommandLine();
-+   void SetPrintFactorStatistics(bool print_stat);
-+   void SetPrintSolveStatistics(bool print_stat);
-+
-+   // Set tolerances and iterations for iterative solvers. Compression
-+   // tolerance is handled below.
-+   void SetRelTol(double rtol);
-+   void SetAbsTol(double atol);
-+   void SetMaxIter(int max_it);
-+
-+   // Set the flag controlling reuse of the symbolic factorization for multiple
-+   // operators. This method has to be called before repeated calls to
-+   // SetOperator.
-+   void SetReorderingReuse(bool reuse);
-+
-+   // Enable or not GPU off-loading available if STRUMPACK was compiled with CUDA. Note
-+   // that input/output from MFEM to STRUMPACK is all still through host memory.
-+   void EnableGPU();
-+   void DisableGPU();
- 
-    /**
-     * STRUMPACK is an (approximate) direct solver. It can be used as a direct
-@@ -100,70 +119,151 @@ public:
-     * used without preconditioner.
-     *
-     * Supported values are:
--    *    AUTO:           Use iterative refinement if no HSS compression is used,
--    *                    otherwise use GMRes.
--    *    DIRECT:         No outer iterative solver, just a single application of
--    *                    the multifrontal solver.
--    *    REFINE:         Iterative refinement.
--    *    PREC_GMRES:     Preconditioned GMRes.
--    *                    The preconditioner is the (approx) multifrontal solver.
--    *    GMRES:          UN-preconditioned GMRes. (for testing mainly)
--    *    PREC_BICGSTAB:  Preconditioned BiCGStab.
--    *                    The preconditioner is the (approx) multifrontal solver.
-+    *    AUTO:           Use iterative refinement if no HSS compression is
-+    *                    used, otherwise use GMRes
-+    *    DIRECT:         No outer iterative solver, just a single application
-+    *                    of the multifrontal solver
-+    *    REFINE:         Iterative refinement
-+    *    PREC_GMRES:     Preconditioned GMRes
-+    *                    The preconditioner is the (approx) multifrontal solver
-+    *    GMRES:          UN-preconditioned GMRes (for testing mainly)
-+    *    PREC_BICGSTAB:  Preconditioned BiCGStab
-+    *                    The preconditioner is the (approx) multifrontal solver
-     *    BICGSTAB:       UN-preconditioned BiCGStab. (for testing mainly)
-     */
--   void SetKrylovSolver( strumpack::KrylovSolver method );
-+   void SetKrylovSolver(strumpack::KrylovSolver method);
- 
-    /**
-     * Supported reorderings are:
--    *    METIS, PARMETIS, SCOTCH, PTSCOTCH, RCM
-+    *    NATURAL:    Do not reorder the system
-+    *    METIS:      Use Metis nested-dissection reordering (default)
-+    *    PARMETIS:   Use ParMetis nested-dissection reordering
-+    *    SCOTCH:     Use Scotch nested-dissection reordering
-+    *    PTSCOTCH:   Use PT-Scotch nested-dissection reordering
-+    *    RCM:        Use RCM reordering
-+    *    GEOMETRIC:  A simple geometric nested dissection code that
-+    *                only works for regular meshes
-+    *    AMD:        Approximate minimum degree
-+    *    MMD:        Multiple minimum degree
-+    *    AND:        Nested dissection
-+    *    MLF:        Minimum local fill
-+    *    SPECTRAL:   Spectral nested dissection
-     */
--   void SetReorderingStrategy( strumpack::ReorderingStrategy method );
-+   void SetReorderingStrategy(strumpack::ReorderingStrategy method);
- 
-    /**
--    * Disable static pivoting for stability. The static pivoting in strumpack
-+    * Configure static pivoting for stability. The static pivoting in STRUMPACK
-     * permutes the sparse input matrix in order to get large (nonzero) elements
-     * on the diagonal. If the input matrix is already diagonally dominant, this
-     * reordering can be disabled.
-+    *
-+    * Supported matching algorithms are:
-+    *    NONE:                          Don't do anything
-+    *    MAX_CARDINALITY:               Maximum cardinality
-+    *    MAX_SMALLEST_DIAGONAL:         Maximum smallest diagonal value
-+    *    MAX_SMALLEST_DIAGONAL_2:       Same as MAX_SMALLEST_DIAGONAL
-+    *                                   but different algorithm
-+    *    MAX_DIAGONAL_SUM:              Maximum sum of diagonal values
-+    *    MAX_DIAGONAL_PRODUCT_SCALING:  Maximum product of diagonal values
-+    *                                   and row and column scaling (default)
-+    *    COMBBLAS:                      Use AWPM from CombBLAS (only with
-+    *                                   version >= 3)
-     */
--   void DisableMatching();
--
--   /**
--    * Enable static pivoting for stability using the MC64 algorithm with
--    * job=5. Using a matching algorithm, this will permute the sparse input
--    * matrix in order to get nonzero elements (as large as possible) on the
--    * diagonal. And will also scale the rows and columns of the matrix.
--    */
--   void EnableMatching();
-+   void SetMatching(strumpack::MatchingJob job);
- 
--#if STRUMPACK_VERSION_MAJOR >= 3
-    /**
--    * Use the AWPM (approximate weight perfect matching) algorithm from the
--    * Combinatorial BLAS library for static pivoting, i.e. getting large
--    * nonzeros on the diagonal. This requires that strumpack was compiled with
--    * support for Combinatorial BLAS.
-+    * Enable support for rank-structured data formats, which can be used
-+    * for compression within the sparse solver.
-+    *
-+    * Supported compression types are:
-+    *    NONE:           No compression, purely direct solver (default)
-+    *    HSS:            HSS compression of frontal matrices
-+    *    BLR:            Block low-rank compression of fronts
-+    *    HODLR:          Hierarchically Off-diagonal Low-Rank
-+    *                    compression of frontal matrices
-+    *    BLR_HODLR:      Block low-rank compression of medium
-+    *                    fronts and Hierarchically Off-diagonal
-+    *                    Low-Rank compression of large fronts
-+    *    ZFP_BLR_HODLR:  ZFP compression for small fronts,
-+    *                    Block low-rank compression of medium
-+    *                    fronts and Hierarchically Off-diagonal
-+    *                    Low-Rank compression of large fronts
-+    *    LOSSLESS:       Lossless compression
-+    *    LOSSY:          Lossy compression
-+    *
-+    * For versions of STRUMPACK < 5, we support only NONE, HSS, and BLR.
-+    * BLR_HODLR and ZPR_BLR_HODLR are supported in STRUMPACK >= 6.
-     */
--   void EnableParallelMatching();
-+   void SetCompression(strumpack::CompressionType type);
-+   void SetCompressionRelTol(double rtol);
-+   void SetCompressionAbsTol(double atol);
-+#if STRUMPACK_VERSION_MAJOR >= 5
-+   void SetCompressionLossyPrecision(int precision);
-+   void SetCompressionButterflyLevels(int levels);
- #endif
- 
- private:
--   void Init( int argc, char* argv[] );
-+   // Helper method for calling the STRUMPACK factoriation routine.
-+   void FactorInternal() const;
- 
- protected:
--
--   MPI_Comm      comm_;
--   int           numProcs_;
--   int           myid_;
-+   const STRUMPACKRowLocMatrix *APtr_;
-+   STRUMPACKSolverType         *solver_;
- 
-    bool factor_verbose_;
-    bool solve_verbose_;
-+   bool reorder_reuse_;
-+
-+   mutable Vector rhs_, sol_;
-+   mutable int    nrhs_;
-+};
- 
--   const STRUMPACKRowLocMatrix * APtr_;
--   strumpack::StrumpackSparseSolverMPIDist<double,int> * solver_;
-+class STRUMPACKSolver :
-+   public STRUMPACKSolverBase<strumpack::
-+   SparseSolverMPIDist<double, HYPRE_BigInt>>
-+{
-+public:
-+   // Constructor with MPI_Comm parameter.
-+   STRUMPACKSolver(MPI_Comm comm);
-+
-+   // Constructor with STRUMPACK matrix object.
-+   STRUMPACKSolver(STRUMPACKRowLocMatrix &A);
- 
--}; // mfem::STRUMPACKSolver class
-+   // Constructor with MPI_Comm parameter and command line arguments.
-+   STRUMPACKSolver(MPI_Comm comm, int argc, char *argv[]);
-+
-+   // Constructor with STRUMPACK matrix object and command line arguments.
-+   STRUMPACKSolver(STRUMPACKRowLocMatrix &A, int argc, char *argv[]);
-+
-+   // Destructor.
-+   ~STRUMPACKSolver() {}
-+};
-+
-+#if STRUMPACK_VERSION_MAJOR >= 7
-+class STRUMPACKMixedPrecisionSolver :
-+   public STRUMPACKSolverBase<strumpack::
-+   SparseSolverMixedPrecisionMPIDist<float, double, HYPRE_BigInt>>
-+{
-+public:
-+   // Constructor with MPI_Comm parameter.
-+   STRUMPACKMixedPrecisionSolver(MPI_Comm comm);
-+
-+   // Constructor with STRUMPACK matrix object.
-+   STRUMPACKMixedPrecisionSolver(STRUMPACKRowLocMatrix &A);
-+
-+   // Constructor with MPI_Comm parameter and command line arguments.
-+   STRUMPACKMixedPrecisionSolver(MPI_Comm comm, int argc, char *argv[]);
-+
-+   // Constructor with STRUMPACK matrix object and command line arguments.
-+   STRUMPACKMixedPrecisionSolver(STRUMPACKRowLocMatrix &A,
-+                                 int argc, char *argv[]);
-+
-+   // Destructor.
-+   ~STRUMPACKMixedPrecisionSolver() {}
-+};
-+#endif
- 
--} // mfem namespace
-+} // namespace mfem
- 
- #endif // MFEM_USE_MPI
- #endif // MFEM_USE_STRUMPACK
-diff --git a/linalg/superlu.cpp b/linalg/superlu.cpp
-index bec377739..948415d32 100644
---- a/linalg/superlu.cpp
-+++ b/linalg/superlu.cpp
-@@ -16,48 +16,50 @@
- 
- #include "superlu.hpp"
- 
--// SuperLU headers
--#include "superlu_defs.h"
-+// SuperLU header
- #include "superlu_ddefs.h"
- 
--#if XSDK_INDEX_SIZE == 64
--#error "SuperLUDist has been built with 64bit integers. This is not supported"
-+#if XSDK_INDEX_SIZE == 64 && !(defined(HYPRE_BIGINT) || defined(HYPRE_MIXEDINT))
-+#error "Mismatch between HYPRE (32bit) and SuperLU (64bit) integer types"
- #endif
--
--// For now, it is assumed that HYPRE_BigInt is int.
--#if defined(HYPRE_BIGINT) || defined(HYPRE_MIXEDINT)
--#error "SuperLUDist support requires HYPRE_BigInt == int, for now."
-+#if XSDK_INDEX_SIZE == 32 && (defined(HYPRE_BIGINT) || defined(HYPRE_MIXEDINT))
-+#error "Mismatch between HYPRE (64bit) and SuperLU (32bit) integer types"
- #endif
- 
--#if SUPERLU_DIST_MAJOR_VERSION > 6 ||                                   \
--  (SUPERLU_DIST_MAJOR_VERSION == 6 && SUPERLU_DIST_MINOR_VERSION > 2)
-+#if SUPERLU_DIST_MAJOR_VERSION > 6 || \
-+   (SUPERLU_DIST_MAJOR_VERSION == 6 && SUPERLU_DIST_MINOR_VERSION >= 3)
- #define ScalePermstruct_t dScalePermstruct_t
- #define LUstruct_t dLUstruct_t
- #define SOLVEstruct_t dSOLVEstruct_t
--#define ScalePermstructFree dScalePermstructFree
-+#define ZeroLblocks dZeroLblocks
-+#define ZeroUblocks dZeroUblocks
- #define Destroy_LU dDestroy_LU
-+#define SolveFinalize dSolveFinalize
-+#define ScalePermstructInit dScalePermstructInit
-+#define ScalePermstructFree dScalePermstructFree
- #define LUstructFree dLUstructFree
- #define LUstructInit dLUstructInit
- #endif
- 
-+#if SUPERLU_DIST_MAJOR_VERSION > 7 || \
-+   (SUPERLU_DIST_MAJOR_VERSION == 7 && SUPERLU_DIST_MINOR_VERSION >= 2)
-+#define DeAllocLlu_3d dDeAllocLlu_3d
-+#define DeAllocGlu_3d dDeAllocGlu_3d
-+#define Destroy_A3d_gathered_on_2d dDestroy_A3d_gathered_on_2d
-+#endif
- 
--using namespace std;
--
--namespace mfem
--{
--unsigned int superlu_internal::sqrti( const unsigned int & a )
-+unsigned int sqrti(unsigned int a)
- {
--   unsigned int a_ = a;
--   unsigned int rem = 0;
--   unsigned int root = 0;
-+   unsigned int rem     = 0;
-+   unsigned int root    = 0;
-    unsigned short len   = sizeof(int); len <<= 2;
--   unsigned short shift = (unsigned short)((len<<1) - 2);
-+   unsigned short shift = (unsigned short)((len << 1) - 2);
- 
--   for (int i=0; i<len; i++)
-+   for (int i = 0; i < len; i++)
-    {
-       root <<= 1;
--      rem = ((rem << 2) + (a_ >> shift));
--      a_ <<= 2;
-+      rem = ((rem << 2) + (a >> shift));
-+      a <<= 2;
-       root ++;
-       if (root <= rem)
-       {
-@@ -72,546 +74,692 @@ unsigned int superlu_internal::sqrti( const unsigned int & a )
-    return (root >> 1);
- }
- 
-+int GetGridRows(MPI_Comm comm, int npdep)
-+{
-+   int np;
-+   MPI_Comm_size(comm, &np);
-+   MFEM_VERIFY(npdep > 0 && np % npdep == 0 && !(npdep & (npdep - 1)),
-+               "SuperLUSolver: 3D partition depth must be a power of two "
-+               "and evenly divide the number of processors!");
-+   int nr = (int)sqrti((unsigned int)(np / npdep));
-+   while (np % nr != 0 && nr > 0)
-+   {
-+      nr--;
-+   }
-+   MFEM_VERIFY(nr > 0,
-+               "SuperLUSolver: Unable to determine processor grid for np = " << np);
-+   return nr;
-+}
-+
-+int GetGridCols(MPI_Comm comm, int npdep, int nr)
-+{
-+   int np;
-+   MPI_Comm_size(comm, &np);
-+   int nc = np / (nr * npdep);
-+   MFEM_VERIFY(nr * nc * npdep == np,
-+               "SuperLUSolver: Impossible processor partition!");
-+   return nc;
-+}
-+
-+namespace mfem
-+{
-+
- SuperLURowLocMatrix::SuperLURowLocMatrix(MPI_Comm comm,
--                                         int num_loc_rows, int first_loc_row,
--                                         int glob_nrows, int glob_ncols,
--                                         int *I, int *J, double *data)
--   : comm_(comm),
--     rowLocPtr_(NULL)
-+                                         int num_loc_rows,
-+                                         HYPRE_BigInt first_loc_row,
-+                                         HYPRE_BigInt glob_nrows,
-+                                         HYPRE_BigInt glob_ncols,
-+                                         int *I, HYPRE_BigInt *J,
-+                                         double *data)
-+   : comm_(comm)
- {
-    // Set mfem::Operator member data
-    height = num_loc_rows;
-    width  = num_loc_rows;
- 
-    // Allocate SuperLU's SuperMatrix struct
--   rowLocPtr_      = new SuperMatrix;
--   SuperMatrix * A = (SuperMatrix*)rowLocPtr_;
--
--   A->Store = NULL;
-+   rowLocPtr_     = new SuperMatrix;
-+   SuperMatrix *A = (SuperMatrix *)rowLocPtr_;
-+   A->Store       = NULL;
- 
--   int m       = glob_nrows;
--   int n       = glob_ncols;
--   int nnz_loc = I[num_loc_rows];
--   int m_loc   = num_loc_rows;
--   int fst_row = first_loc_row;
-+   int_t m       = glob_nrows;
-+   int_t n       = glob_ncols;
-+   int_t nnz_loc = I[num_loc_rows];
-+   int_t m_loc   = num_loc_rows;
-+   int_t fst_row = first_loc_row;
- 
--   double * nzval  = NULL;
--   int    * colind = NULL;
--   int    * rowptr = NULL;
-+   double *nzval  = NULL;
-+   int_t  *colind = NULL;
-+   int_t  *rowptr = NULL;
- 
--   if ( !(nzval  = doubleMalloc_dist(nnz_loc)) )
-+   if (!(nzval = doubleMalloc_dist(nnz_loc)))
-    {
--      ABORT("Malloc fails for nzval[].");
-+      MFEM_ABORT("SuperLURowLocMatrix: Malloc failed for nzval!");
-    }
--   for (int i=0; i<nnz_loc; i++)
-+   for (int_t i = 0; i < nnz_loc; i++)
-    {
-       nzval[i] = data[i];
-    }
- 
--   if ( !(colind = intMalloc_dist(nnz_loc)) )
-+   if (!(colind = intMalloc_dist(nnz_loc)))
-    {
--      ABORT("Malloc fails for colind[].");
-+      MFEM_ABORT("SuperLURowLocMatrix: Malloc failed for colind!")
-    }
--   for (int i=0; i<nnz_loc; i++)
-+   for (int_t i = 0; i < nnz_loc; i++)
-    {
-       colind[i] = J[i];
-    }
- 
--   if ( !(rowptr = intMalloc_dist(m_loc+1)) )
-+   if (!(rowptr = intMalloc_dist(m_loc+1)))
-    {
--      ABORT("Malloc fails for rowptr[].");
-+      MFEM_ABORT("SuperLURowLocMatrix: Malloc failed for rowptr!")
-    }
--   for (int i=0; i<=m_loc; i++)
-+   for (int_t i = 0; i <= m_loc; i++)
-    {
-       rowptr[i] = I[i];
-    }
- 
--   // Assign he matrix data to SuperLU's SuperMatrix structure
-+   // Assign the matrix data to SuperLU's SuperMatrix structure
-    dCreate_CompRowLoc_Matrix_dist(A, m, n, nnz_loc, m_loc, fst_row,
-                                   nzval, colind, rowptr,
-                                   SLU_NR_loc, SLU_D, SLU_GE);
-+
-+   // Save global number of rows and columns of the matrix
-+   num_global_rows_ = m;
-+   num_global_cols_ = n;
- }
- 
--SuperLURowLocMatrix::SuperLURowLocMatrix( const HypreParMatrix & hypParMat )
--   : comm_(hypParMat.GetComm()),
--     rowLocPtr_(NULL)
-+SuperLURowLocMatrix::SuperLURowLocMatrix(const Operator &op)
- {
--   rowLocPtr_      = new SuperMatrix;
--   SuperMatrix * A = (SuperMatrix*)rowLocPtr_;
--
--   A->Store = NULL;
-+   const HypreParMatrix *APtr = dynamic_cast<const HypreParMatrix *>(&op);
-+   MFEM_VERIFY(APtr, "Not a compatible matrix type");
-+   comm_ = APtr->GetComm();
- 
--   // First cast the parameter to a hypre_ParCSRMatrix
--   hypre_ParCSRMatrix * parcsr_op =
--      (hypre_ParCSRMatrix *)const_cast<HypreParMatrix&>(hypParMat);
-+   // Set mfem::Operator member data
-+   height = op.Height();
-+   width  = op.Width();
- 
--   MFEM_ASSERT(parcsr_op != NULL,"SuperLU: const_cast failed in SetOperator");
-+   // Allocate SuperLU's SuperMatrix struct
-+   rowLocPtr_     = new SuperMatrix;
-+   SuperMatrix *A = (SuperMatrix *)rowLocPtr_;
-+   A->Store       = NULL;
- 
--   // Create the SuperMatrix A by borrowing the internal data from a
--   // hypre_CSRMatrix.
--   hypParMat.HostRead();
--   hypre_CSRMatrix * csr_op = hypre_MergeDiagAndOffd(parcsr_op);
--   hypParMat.HypreRead();
--   hypre_CSRMatrixSetDataOwner(csr_op,0);
-+   // First cast the parameter to a hypre_ParCSRMatrix
-+   hypre_ParCSRMatrix *parcsr_op =
-+      (hypre_ParCSRMatrix *)const_cast<HypreParMatrix &>(*APtr);
-+
-+   // Create the SuperMatrix A by taking the internal data from a
-+   // hypre_CSRMatrix
-+   APtr->HostRead();
-+   hypre_CSRMatrix *csr_op = hypre_MergeDiagAndOffd(parcsr_op);
-+   APtr->HypreRead();
-+   HYPRE_Int       *Iptr   = csr_op->i;
- #if MFEM_HYPRE_VERSION >= 21600
--   // For now, this method assumes that HYPRE_BigInt is int. Also, csr_op->num_cols
--   // is of type HYPRE_Int, so if we want to check for big indices in
--   // csr_op->big_j, we'll have to check all entries and that check will only be
--   // necessary in HYPRE_MIXEDINT mode which is not supported at the moment.
--   hypre_CSRMatrixBigJtoJ(csr_op);
-+   HYPRE_BigInt    *Jptr   = csr_op->big_j;
-+#else
-+   HYPRE_Int       *Jptr   = csr_op->j;
- #endif
-+   int_t m       = parcsr_op->global_num_rows;
-+   int_t n       = parcsr_op->global_num_cols;
-+   int_t fst_row = parcsr_op->first_row_index;
-+   int_t nnz_loc = csr_op->num_nonzeros;
-+   int_t m_loc   = csr_op->num_rows;
- 
--   int m         = parcsr_op->global_num_rows;
--   int n         = parcsr_op->global_num_cols;
--   int fst_row   = parcsr_op->first_row_index;
--   int nnz_loc   = csr_op->num_nonzeros;
--   int m_loc     = csr_op->num_rows;
--
--   height = m_loc;
--   width  = m_loc;
-+   double *nzval  = csr_op->data;
-+   int_t  *colind = NULL;
-+   int_t  *rowptr = NULL;
- 
--   double * nzval  = csr_op->data;
--   int    * colind = csr_op->j;
--   int    * rowptr = NULL;
-+   // Some machines don't like HYPRE_BigInt to int_t
-+#if defined(HYPRE_BIGINT) || defined(HYPRE_MIXEDINT)
-+   if (!(colind = intMalloc_dist(nnz_loc)))
-+   {
-+      MFEM_ABORT("SuperLURowLocMatrix: Malloc failed for colind!")
-+   }
-+   for (int_t i = 0; i < nnz_loc; i++)
-+   {
-+      colind[i] = Jptr[i];
-+   }
-+#else
-+   colind = Jptr;
-+#endif
- 
-    // The "i" array cannot be stolen from the hypre_CSRMatrix so we'll copy it
--   if ( !(rowptr = intMalloc_dist(m_loc+1)) )
-+   if (!(rowptr = intMalloc_dist(m_loc+1)))
-    {
--      ABORT("Malloc fails for rowptr[].");
-+      MFEM_ABORT("SuperLURowLocMatrix: Malloc failed for rowptr!")
-    }
--   for (int i=0; i<=m_loc; i++)
-+   for (int_t i = 0; i <= m_loc; i++)
-    {
--      rowptr[i] = (csr_op->i)[i];
-+      rowptr[i] = (int_t)Iptr[i];  // Promotion for HYPRE_MIXEDINT
-    }
- 
--   // Everything has been copied or abducted so delete the structure
--   hypre_CSRMatrixDestroy(csr_op);
--
-    // Assign he matrix data to SuperLU's SuperMatrix structure
-    dCreate_CompRowLoc_Matrix_dist(A, m, n, nnz_loc, m_loc, fst_row,
-                                   nzval, colind, rowptr,
-                                   SLU_NR_loc, SLU_D, SLU_GE);
- 
--   // Save global number of columns (width) of the matrix
--   num_global_cols = n;
-+   // SuperLU will free the passed CSR data arrays
-+   hypre_CSRMatrixSetDataOwner(csr_op, 0);
-+   hypre_CSRMatrixDestroy(csr_op);
-+#if defined(HYPRE_BIGINT) || defined(HYPRE_MIXEDINT)
-+   delete Jptr;
-+#endif
-+
-+   // Save global number of rows and columns of the matrix
-+   num_global_rows_ = m;
-+   num_global_cols_ = n;
- }
- 
- SuperLURowLocMatrix::~SuperLURowLocMatrix()
- {
--   SuperMatrix * A = (SuperMatrix*)rowLocPtr_;
--
--   // Delete the internal data
-+   SuperMatrix *A = (SuperMatrix *)rowLocPtr_;
-    Destroy_CompRowLoc_Matrix_dist(A);
--
--   // Delete the struct
--   if ( A != NULL ) { delete A; }
-+   delete A;
- }
- 
--SuperLUSolver::SuperLUSolver( MPI_Comm comm )
--   : comm_(comm),
-+SuperLUSolver::SuperLUSolver(MPI_Comm comm, int npdep)
-+   : nprow_(GetGridRows(comm, npdep)),
-+     npcol_(GetGridCols(comm, npdep, nprow_)),
-+     npdep_(npdep),
-      APtr_(NULL),
--     optionsPtr_(NULL),
--     statPtr_(NULL),
--     ScalePermstructPtr_(NULL),
--     LUstructPtr_(NULL),
--     SOLVEstructPtr_(NULL),
--     gridPtr_(NULL),
--     berr_(NULL),
--     perm_r_(NULL),
--     nrhs_(1),
--     nprow_(0),
--     npcol_(0),
--     firstSolveWithThisA_(false),
--     gridInitialized_(false),
--     LUStructInitialized_(false)
-+     nrhs_(0)
- {
--   this->Init();
-+   Init(comm);
- }
- 
--SuperLUSolver::SuperLUSolver( SuperLURowLocMatrix & A )
--   : comm_(A.GetComm()),
--     APtr_(&A),
--     optionsPtr_(NULL),
--     statPtr_(NULL),
--     ScalePermstructPtr_(NULL),
--     LUstructPtr_(NULL),
--     SOLVEstructPtr_(NULL),
--     gridPtr_(NULL),
--     berr_(NULL),
--     perm_r_(NULL),
--     nrhs_(1),
--     nprow_(0),
--     npcol_(0),
--     firstSolveWithThisA_(true),
--     gridInitialized_(false),
--     LUStructInitialized_(false)
-+SuperLUSolver::SuperLUSolver(SuperLURowLocMatrix &A, int npdep)
-+   : SuperLUSolver(A.GetComm(), npdep)
- {
--   height = A.Height();
--   width  = A.Width();
--
--   this->Init();
-+   SetOperator(A);
- }
- 
- SuperLUSolver::~SuperLUSolver()
- {
--   superlu_dist_options_t * options = (superlu_dist_options_t*)optionsPtr_;
--   SuperLUStat_t     * stat         = (SuperLUStat_t*)statPtr_;
--   ScalePermstruct_t * SPstruct     = (ScalePermstruct_t*)ScalePermstructPtr_;
--   LUstruct_t        * LUstruct     = (LUstruct_t*)LUstructPtr_;
--   SOLVEstruct_t     * SOLVEstruct  = (SOLVEstruct_t*)SOLVEstructPtr_;
--   gridinfo_t        * grid         = (gridinfo_t*)gridPtr_;
-+   superlu_dist_options_t *options = (superlu_dist_options_t *)optionsPtr_;
- 
--   SUPERLU_FREE(berr_);
--   PStatFree(stat);
-+   ScalePermstruct_t *ScalePermstruct = (ScalePermstruct_t *)ScalePermstructPtr_;
-+   LUstruct_t        *LUstruct        = (LUstruct_t *)LUstructPtr_;
-+   SOLVEstruct_t     *SOLVEstruct     = (SOLVEstruct_t *)SOLVEstructPtr_;
- 
--   if ( LUStructInitialized_ )
-+#if SUPERLU_DIST_MAJOR_VERSION > 7 || \
-+   (SUPERLU_DIST_MAJOR_VERSION == 7 && SUPERLU_DIST_MINOR_VERSION >= 2)
-+   if (npdep_ > 1)
-    {
--      ScalePermstructFree(SPstruct);
--      Destroy_LU(APtr_->GetGlobalNumColumns(), grid, LUstruct);
--      LUstructFree(LUstruct);
--   }
-+      gridinfo3d_t *grid3d = (gridinfo3d_t *)gridPtr_;
- 
--   if ( options->SolveInitialized )
-+      if (APtr_)
-+      {
-+         if (grid3d->zscp.Iam == 0)
-+         {
-+            // Process layer 0
-+            Destroy_LU(APtr_->GetGlobalNumColumns(), &(grid3d->grid2d),
-+                       LUstruct);
-+            SolveFinalize(options, SOLVEstruct);
-+         }
-+         else
-+         {
-+            // Process layers not equal 0
-+            DeAllocLlu_3d(APtr_->GetGlobalNumColumns(), LUstruct, grid3d);
-+            DeAllocGlu_3d(LUstruct);
-+         }
-+         Destroy_A3d_gathered_on_2d(SOLVEstruct, grid3d);
-+         ScalePermstructFree(ScalePermstruct);
-+         LUstructFree(LUstruct);
-+      }
-+
-+      superlu_gridexit3d(grid3d);
-+      delete grid3d;
-+   }
-+   else
-+#endif
-    {
--      dSolveFinalize(options, SOLVEstruct);
-+      gridinfo_t *grid = (gridinfo_t *)gridPtr_;
-+
-+      if (APtr_)
-+      {
-+         Destroy_LU(APtr_->GetGlobalNumColumns(), grid, LUstruct);
-+         SolveFinalize(options, SOLVEstruct);
-+         ScalePermstructFree(ScalePermstruct);
-+         LUstructFree(LUstruct);
-+      }
-+
-+      superlu_gridexit(grid);
-+      delete grid;
-    }
- 
--   if (     options != NULL ) { delete options; }
--   if (        stat != NULL ) { delete stat; }
--   if (    SPstruct != NULL ) { delete SPstruct; }
--   if (    LUstruct != NULL ) { delete LUstruct; }
--   if ( SOLVEstruct != NULL ) { delete SOLVEstruct; }
--   if (        grid != NULL ) { delete grid; }
--   if (     perm_r_ != NULL ) { SUPERLU_FREE(perm_r_); }
-+   delete options;
-+   delete ScalePermstruct;
-+   delete LUstruct;
-+   delete SOLVEstruct;
- }
- 
--void SuperLUSolver::Init()
-+void SuperLUSolver::Init(MPI_Comm comm)
- {
--   MPI_Comm_size(comm_, &numProcs_);
--   MPI_Comm_rank(comm_, &myid_);
--
-    optionsPtr_         = new superlu_dist_options_t;
--   statPtr_            = new SuperLUStat_t;
-    ScalePermstructPtr_ = new ScalePermstruct_t;
-    LUstructPtr_        = new LUstruct_t;
-    SOLVEstructPtr_     = new SOLVEstruct_t;
--   gridPtr_            = new gridinfo_t;
--
--   superlu_dist_options_t * options = (superlu_dist_options_t*)optionsPtr_;
--   SuperLUStat_t          *    stat = (SuperLUStat_t*)statPtr_;
- 
--   if ( !(berr_ = doubleMalloc_dist(nrhs_)) )
-+   // Initialize process grid
-+#if SUPERLU_DIST_MAJOR_VERSION > 7 || \
-+   (SUPERLU_DIST_MAJOR_VERSION == 7 && SUPERLU_DIST_MINOR_VERSION >= 2)
-+   if (npdep_ > 1)
-+   {
-+      gridPtr_ = new gridinfo3d_t;
-+      superlu_gridinit3d(comm, nprow_, npcol_, npdep_, (gridinfo3d_t *)gridPtr_);
-+   }
-+   else
-+#endif
-    {
--      ABORT("Malloc fails for berr[].");
-+      gridPtr_ = new gridinfo_t;
-+      MFEM_VERIFY(npdep_ == 1,
-+                  "SuperLUSolver: 3D partitioning is only available for "
-+                  "SuperLU_DIST version >= 7.2.0!");
-+      superlu_gridinit(comm, nprow_, npcol_, (gridinfo_t *)gridPtr_);
-    }
- 
--   // Set default options
-+   // Set default options:
-+   //    options.Fact = DOFACT;
-+   //    options.Equil = YES;
-+   //    options.ColPerm = METIS_AT_PLUS_A;
-+   //    options.RowPerm = LargeDiag_MC64;
-+   //    options.ReplaceTinyPivot = NO;
-+   //    options.Trans = NOTRANS;
-+   //    options.IterRefine = SLU_DOUBLE;
-+   //    options.SolveInitialized = NO;
-+   //    options.RefineInitialized = NO;
-+   //    options.PrintStat = YES;
-+   superlu_dist_options_t *options = (superlu_dist_options_t *)optionsPtr_;
-    set_default_options_dist(options);
--
--   // Choose nprow and npcol so that the process grid is as square as possible.
--   // If the processes cannot be divided evenly, keep the row dimension smaller
--   // than the column dimension.
--
--   nprow_ = (int)superlu_internal::sqrti((unsigned int)numProcs_);
--   while (numProcs_ % nprow_ != 0 && nprow_ > 0)
-+#if SUPERLU_DIST_MAJOR_VERSION > 7 || \
-+   (SUPERLU_DIST_MAJOR_VERSION == 7 && SUPERLU_DIST_MINOR_VERSION >= 2)
-+   if (npdep_ > 1)
-    {
--      nprow_--;
-+      options->Algo3d = YES;
-    }
--
--   npcol_ = (int)(numProcs_ / nprow_);
--   MFEM_ASSERT(nprow_ * npcol_ == numProcs_, "");
--
--   PStatInit(stat); // Initialize the statistics variables.
-+#endif
- }
- 
--void SuperLUSolver::SetPrintStatistics( bool print_stat )
-+void SuperLUSolver::SetPrintStatistics(bool print_stat)
+-void STRUMPACKSolver::SetRelTol( double rtol )
++#if STRUMPACK_VERSION_MAJOR >= 5
++template <typename STRUMPACKSolverType>
++void STRUMPACKSolverBase<STRUMPACKSolverType>::
++SetCompressionLossyPrecision(int precision)
  {
--   superlu_dist_options_t * options = (superlu_dist_options_t*)optionsPtr_;
--
--   yes_no_t opt = print_stat?YES:NO;
--
-+   superlu_dist_options_t *options = (superlu_dist_options_t *)optionsPtr_;
-+   yes_no_t opt = print_stat ? YES : NO;
-    options->PrintStat = opt;
+-   solver_->options().set_rel_tol( rtol );
++   solver_->options().set_lossy_precision(precision);
  }
  
--void SuperLUSolver::SetEquilibriate( bool equil )
-+void SuperLUSolver::SetEquilibriate(bool equil)
+-void STRUMPACKSolver::SetAbsTol( double atol )
++template <typename STRUMPACKSolverType>
++void STRUMPACKSolverBase<STRUMPACKSolverType>::
++SetCompressionButterflyLevels(int levels)
  {
--   superlu_dist_options_t * options = (superlu_dist_options_t*)optionsPtr_;
--
--   yes_no_t opt = equil?YES:NO;
--
-+   superlu_dist_options_t *options = (superlu_dist_options_t *)optionsPtr_;
-+   yes_no_t opt = equil ? YES : NO;
-    options->Equil = opt;
+-   solver_->options().set_abs_tol( atol );
++   solver_->options().HODLR_options().set_butterfly_levels(levels);
  }
++#endif
  
--void SuperLUSolver::SetColumnPermutation( superlu::ColPerm col_perm )
-+void SuperLUSolver::SetColumnPermutation(superlu::ColPerm col_perm)
- {
--   superlu_dist_options_t * options = (superlu_dist_options_t*)optionsPtr_;
--
-+   superlu_dist_options_t *options = (superlu_dist_options_t *)optionsPtr_;
-    colperm_t opt = (colperm_t)col_perm;
 -
-+   if (opt == MY_PERMC)
-+   {
-+      MFEM_ABORT("SuperLUSolver::SetColumnPermutation does not yet support "
-+                 "MY_PERMC!");
-+   }
-+   else if (opt == PARMETIS)
-+   {
-+      options->ParSymbFact = YES;
-+   }
-    options->ColPerm = opt;
- }
- 
--void SuperLUSolver::SetRowPermutation( superlu::RowPerm row_perm,
--                                       Array<int> * perm )
-+void SuperLUSolver::SetRowPermutation(superlu::RowPerm row_perm)
+-void STRUMPACKSolver::Mult( const Vector & x, Vector & y ) const
++template <typename STRUMPACKSolverType>
++void STRUMPACKSolverBase<STRUMPACKSolverType>::
++SetOperator(const Operator &op)
  {
--   superlu_dist_options_t * options = (superlu_dist_options_t*)optionsPtr_;
--
-+   superlu_dist_options_t *options = (superlu_dist_options_t *)optionsPtr_;
-    rowperm_t opt = (rowperm_t)row_perm;
--
--   options->RowPerm = opt;
--
--   if ( opt == MY_PERMR )
-+   if (opt == MY_PERMR)
+-   MFEM_ASSERT(APtr_ != NULL,
+-               "STRUMPACK Error: The operator must be set before"
+-               " the system can be solved.");
+-   MFEM_ASSERT(x.Size() == Width(), "invalid x.Size() = " << x.Size()
+-               << ", expected size = " << Width());
+-   MFEM_ASSERT(y.Size() == Height(), "invalid y.Size() = " << y.Size()
+-               << ", expected size = " << Height());
++   // Verify that we have a compatible operator
++   bool first_mat = !APtr_;
++   APtr_ = dynamic_cast<const STRUMPACKRowLocMatrix *>(&op);
++   MFEM_VERIFY(APtr_,
++               "STRUMPACK: Operator is not a STRUMPACKRowLocMatrix!");
+ 
+-   double*  yPtr = y.HostWrite();
+-   const double*  xPtr = x.HostRead();
++   // Set mfem::Operator member data
++   height = op.Height();
++   width  = op.Width();
+ 
+-   solver_->options().set_verbose( factor_verbose_ );
+-   ReturnCode ret = solver_->factor();
+-   switch (ret)
++   if (first_mat || !reorder_reuse_)
     {
--      if ( perm == NULL )
+-      case ReturnCode::SUCCESS: break;
+-      case ReturnCode::MATRIX_NOT_SET:
 -      {
--         mfem_error("SuperLUSolver::SetRowPermutation :"
--                    " permutation vector not set!");
+-         MFEM_ABORT("STRUMPACK:  Matrix was not set!");
 -      }
--
--      if ( !(perm_r_ = intMalloc_dist(perm->Size())) )
+-      break;
+-      case ReturnCode::REORDERING_ERROR:
 -      {
--         ABORT("Malloc fails for perm_r[].");
+-         MFEM_ABORT("STRUMPACK:  Matrix reordering failed!");
 -      }
--      for (int i=0; i<perm->Size(); i++)
+-      break;
+-      default:
 -      {
--         perm_r_[i] = (*perm)[i];
+-         MFEM_ABORT("STRUMPACK: 'factor()' error code = " << ret);
 -      }
-+      MFEM_ABORT("SuperLUSolver::SetRowPermutation does not yet support "
-+                 "MY_PERMR!");
++      solver_->set_matrix(*(APtr_->GetA()));
++   }
++   else
++   {
++      solver_->update_matrix_values(*(APtr_->GetA()));
     }
-+   options->RowPerm = opt;
- }
+-   solver_->options().set_verbose( solve_verbose_ );
+-   solver_->solve(xPtr, yPtr);
++}
  
--void SuperLUSolver::SetTranspose( superlu::Trans trans )
--{
--   superlu_dist_options_t * options = (superlu_dist_options_t*)optionsPtr_;
--
--   trans_t opt = (trans_t)trans;
--
--   options->Trans = opt;
--}
--
--void SuperLUSolver::SetIterativeRefine( superlu::IterRefine iter_ref )
-+void SuperLUSolver::SetIterativeRefine(superlu::IterRefine iter_ref)
- {
--   superlu_dist_options_t * options = (superlu_dist_options_t*)optionsPtr_;
--
-+   superlu_dist_options_t *options = (superlu_dist_options_t *)optionsPtr_;
-    IterRefine_t opt = (IterRefine_t)iter_ref;
--
-    options->IterRefine = opt;
++template <typename STRUMPACKSolverType>
++void STRUMPACKSolverBase<STRUMPACKSolverType>::
++FactorInternal() const
++{
++   MFEM_ASSERT(APtr_,
++               "STRUMPACK: Operator must be set before the system can be "
++               "solved!");
++   solver_->options().set_verbose(factor_verbose_);
++   strumpack::ReturnCode ret = solver_->factor();
++   if (ret != strumpack::ReturnCode::SUCCESS)
++   {
++#if STRUMPACK_VERSION_MAJOR >= 7
++      MFEM_ABORT("STRUMPACK: Factor failed with return code " << ret << "!");
++#else
++      MFEM_ABORT("STRUMPACK: Factor failed!");
++#endif
++   }
  }
  
--void SuperLUSolver::SetReplaceTinyPivot( bool rtp )
-+void SuperLUSolver::SetReplaceTinyPivot(bool rtp)
+-void STRUMPACKSolver::SetOperator( const Operator & op )
++template <typename STRUMPACKSolverType>
++void STRUMPACKSolverBase<STRUMPACKSolverType>::
++Mult(const Vector &x, Vector &y) const
  {
--   superlu_dist_options_t * options = (superlu_dist_options_t*)optionsPtr_;
--
--   yes_no_t opt = rtp?YES:NO;
--
-+   superlu_dist_options_t *options = (superlu_dist_options_t *)optionsPtr_;
-+   yes_no_t opt = rtp ? YES : NO;
-    options->ReplaceTinyPivot = opt;
- }
+-   // Verify that we have a compatible operator
+-   APtr_ = dynamic_cast<const STRUMPACKRowLocMatrix*>(&op);
+-   if ( APtr_ == NULL )
++   MFEM_ASSERT(x.Size() == Width(),
++               "STRUMPACK: Invalid x.Size() = " << x.Size() <<
++               ", expected size = " << Width() << "!");
++   MFEM_ASSERT(y.Size() == Height(),
++               "STRUMPACK: Invalid y.Size() = " << y.Size() <<
++               ", expected size = " << Height() << "!");
++
++   const double *xPtr = x.HostRead();
++   double *yPtr       = y.HostReadWrite();
++
++   FactorInternal();
++   solver_->options().set_verbose(solve_verbose_);
++   strumpack::ReturnCode ret = solver_->solve(xPtr, yPtr, false);
++   if (ret != strumpack::ReturnCode::SUCCESS)
+    {
+-      mfem_error("STRUMPACKSolver::SetOperator : not STRUMPACKRowLocMatrix!");
++#if STRUMPACK_VERSION_MAJOR >= 7
++      MFEM_ABORT("STRUMPACK: Solve failed with return code " << ret << "!");
++#else
++      MFEM_ABORT("STRUMPACK: Solve failed!");
++#endif
+    }
++}
  
--void SuperLUSolver::SetNumLookAheads( int num_lookaheads )
-+void SuperLUSolver::SetNumLookAheads(int num_lookaheads)
- {
--   superlu_dist_options_t * options = (superlu_dist_options_t*)optionsPtr_;
--
-+   superlu_dist_options_t *options = (superlu_dist_options_t *)optionsPtr_;
-    options->num_lookaheads = num_lookaheads;
- }
+-   solver_->set_matrix( *(APtr_->getA()) );
++template <typename STRUMPACKSolverType>
++void STRUMPACKSolverBase<STRUMPACKSolverType>::
++ArrayMult(const Array<const Vector *> &X, Array<Vector *> &Y) const
++{
++   MFEM_ASSERT(X.Size() == Y.Size(),
++               "Number of columns mismatch in STRUMPACK solve!");
++   if (X.Size() == 1)
++   {
++      nrhs_ = 1;
++      MFEM_ASSERT(X[0] && Y[0], "Missing Vector in STRUMPACK solve!");
++      Mult(*X[0], *Y[0]);
++      return;
++   }
  
--void SuperLUSolver::SetLookAheadElimTree( bool etree )
-+void SuperLUSolver::SetLookAheadElimTree(bool etree)
- {
--   superlu_dist_options_t * options = (superlu_dist_options_t*)optionsPtr_;
--
--   yes_no_t opt = etree?YES:NO;
--
-+   superlu_dist_options_t *options = (superlu_dist_options_t *)optionsPtr_;
-+   yes_no_t opt = etree ? YES : NO;
-    options->lookahead_etree = opt;
- }
+-   // Set mfem::Operator member data
+-   height = op.Height();
+-   width  = op.Width();
++   // Multiple RHS case
++   int ldx = Height();
++   if (nrhs_ != X.Size())
++   {
++      rhs_.SetSize(X.Size() * ldx);
++      sol_.SetSize(X.Size() * ldx);
++      nrhs_ = X.Size();
++   }
++   for (int i = 0; i < nrhs_; i++)
++   {
++      MFEM_ASSERT(X[i] && X[i]->Size() == Width(),
++                  "STRUMPACK: Missing or invalid sized RHS Vector in solve!");
++      Vector s(rhs_, i * ldx, ldx);
++      s = *X[i];
++   }
++   const double *xPtr = rhs_.HostRead();
++   double *yPtr       = sol_.HostReadWrite();
++
++   FactorInternal();
++   solver_->options().set_verbose(solve_verbose_);
++   strumpack::ReturnCode ret = solver_->solve(nrhs_, xPtr, ldx, yPtr, ldx,
++                                              false);
++   if (ret != strumpack::ReturnCode::SUCCESS)
++   {
++#if STRUMPACK_VERSION_MAJOR >= 7
++      MFEM_ABORT("STRUMPACK: Solve failed with return code " << ret << "!");
++#else
++      MFEM_ABORT("STRUMPACK: Solve failed!");
++#endif
++   }
  
--void SuperLUSolver::SetSymmetricPattern( bool sym )
-+void SuperLUSolver::SetSymmetricPattern(bool sym)
- {
--   superlu_dist_options_t * options = (superlu_dist_options_t*)optionsPtr_;
--
--   yes_no_t opt = sym?YES:NO;
--
-+   superlu_dist_options_t *options = (superlu_dist_options_t *)optionsPtr_;
-+   yes_no_t opt = sym ? YES : NO;
-    options->SymPattern = opt;
++   for (int i = 0; i < nrhs_; i++)
++   {
++      MFEM_ASSERT(Y[i] && Y[i]->Size() == Width(),
++                  "STRUMPACK: Missing or invalid sized solution Vector in solve!");
++      Vector s(sol_, i * ldx, ldx);
++      *Y[i] = s;
++   }
  }
  
--void SuperLUSolver::SetParSymbFact( bool par )
-+void SuperLUSolver::SetParSymbFact(bool par)
- {
--   superlu_dist_options_t * options = (superlu_dist_options_t*)optionsPtr_;
--
--   yes_no_t opt = par?YES:NO;
++STRUMPACKSolver::
++STRUMPACKSolver(MPI_Comm comm)
++   : STRUMPACKSolverBase<strumpack::
++     SparseSolverMPIDist<double, HYPRE_BigInt>>
++     (comm, 0, NULL) {}
++
++STRUMPACKSolver::
++STRUMPACKSolver(STRUMPACKRowLocMatrix &A)
++   : STRUMPACKSolverBase<strumpack::
++     SparseSolverMPIDist<double, HYPRE_BigInt>>
++     (A, 0, NULL) {}
++
++STRUMPACKSolver::
++STRUMPACKSolver(MPI_Comm comm, int argc, char *argv[])
++   : STRUMPACKSolverBase<strumpack::
++     SparseSolverMPIDist<double, HYPRE_BigInt>>
++     (comm, argc, argv) {}
++
++STRUMPACKSolver::
++STRUMPACKSolver(STRUMPACKRowLocMatrix &A, int argc, char *argv[])
++   : STRUMPACKSolverBase<strumpack::
++     SparseSolverMPIDist<double, HYPRE_BigInt>>
++     (A, argc, argv) {}
++
++#if STRUMPACK_VERSION_MAJOR >= 7
++STRUMPACKMixedPrecisionSolver::
++STRUMPACKMixedPrecisionSolver(MPI_Comm comm)
++   : STRUMPACKSolverBase<strumpack::
++     SparseSolverMixedPrecisionMPIDist<float, double, HYPRE_BigInt>>
++     (comm, 0, NULL) {}
++
++STRUMPACKMixedPrecisionSolver::
++STRUMPACKMixedPrecisionSolver(STRUMPACKRowLocMatrix &A)
++   : STRUMPACKSolverBase<strumpack::
++     SparseSolverMixedPrecisionMPIDist<float, double, HYPRE_BigInt>>
++     (A, 0, NULL) {}
++
++STRUMPACKMixedPrecisionSolver::
++STRUMPACKMixedPrecisionSolver(MPI_Comm comm, int argc, char *argv[])
++   : STRUMPACKSolverBase<strumpack::
++     SparseSolverMixedPrecisionMPIDist<float, double, HYPRE_BigInt>>
++     (comm, argc, argv) {}
++
++STRUMPACKMixedPrecisionSolver::
++STRUMPACKMixedPrecisionSolver(STRUMPACKRowLocMatrix &A, int argc, char *argv[])
++   : STRUMPACKSolverBase<strumpack::
++     SparseSolverMixedPrecisionMPIDist<float, double, HYPRE_BigInt>>
++     (A, argc, argv) {}
++#endif
++
++template class STRUMPACKSolverBase<strumpack::
++                                   SparseSolverMPIDist<double, HYPRE_BigInt>>;
++#if STRUMPACK_VERSION_MAJOR >= 7
++template class STRUMPACKSolverBase<strumpack::
++                                   SparseSolverMixedPrecisionMPIDist<float, double, HYPRE_BigInt>>;
++#endif
++
+ } // mfem namespace
+ 
+ #endif // MFEM_USE_MPI
+diff --git a/linalg/strumpack.hpp b/linalg/strumpack.hpp
+index 300b8415e..6a8ac4c30 100644
+--- a/linalg/strumpack.hpp
++++ b/linalg/strumpack.hpp
+@@ -16,12 +16,14 @@
+ 
+ #ifdef MFEM_USE_STRUMPACK
+ #ifdef MFEM_USE_MPI
++
+ #include "operator.hpp"
+ #include "hypre.hpp"
 -
-+   superlu_dist_options_t *options = (superlu_dist_options_t *)optionsPtr_;
-+   yes_no_t opt = par ? YES : NO;
-    options->ParSymbFact = opt;
- }
+ #include <mpi.h>
+ 
++// STRUMPACK headers
+ #include "StrumpackSparseSolverMPIDist.hpp"
++#include "StrumpackSparseSolverMixedPrecisionMPIDist.hpp"
  
--void SuperLUSolver::SetupGrid()
-+void SuperLUSolver::SetFact(superlu::Fact fact)
+ namespace mfem
  {
--   gridinfo_t * grid = (gridinfo_t*)gridPtr_;
-+   superlu_dist_options_t *options = (superlu_dist_options_t *)optionsPtr_;
-+   fact_t opt = (fact_t)fact;
-+   options->Fact = opt;
-+}
+@@ -34,63 +36,80 @@ public:
+        be of size (local) nrows by (global) glob_ncols. The new parallel matrix
+        contains copies of all input arrays (so they can be deleted). */
+    STRUMPACKRowLocMatrix(MPI_Comm comm,
+-                         int num_loc_rows, int first_loc_row,
+-                         int glob_nrows, int glob_ncols,
+-                         int *I, int *J, double *data);
++                         int num_loc_rows, HYPRE_BigInt first_loc_row,
++                         HYPRE_BigInt glob_nrows, HYPRE_BigInt glob_ncols,
++                         int *I, HYPRE_BigInt *J, double *data,
++                         bool sym_sparse = false);
  
--   // Make sure the values of nprow and npcol are reasonable
--   if ( ((nprow_ * npcol_) > numProcs_) || ((nprow_ * npcol_) < 1) )
--   {
--      if ( myid_ == 0 )
--      {
--         mfem::err << "Warning: User specified nprow and npcol are such that "
--                   << "(nprow * npcol) > numProcs or (nprow * npcol) < 1.  "
--                   << "Using default values for nprow and npcol instead."
--                   << endl;
--      }
-+void SuperLUSolver::SetOperator(const Operator &op)
-+{
-+   // Verify that we have a compatible operator
-+   bool LUStructInitialized = (APtr_ != NULL);
-+   APtr_ = dynamic_cast<const SuperLURowLocMatrix *>(&op);
-+   MFEM_VERIFY(APtr_, "SuperLUSolver::SetOperator: Not a SuperLURowLocMatrix!");
+    /** Creates a copy of the parallel matrix hypParMat in STRUMPACK's RowLoc
+        format. All data is copied so the original matrix may be deleted. */
+-   STRUMPACKRowLocMatrix(const HypreParMatrix & hypParMat);
++   STRUMPACKRowLocMatrix(const Operator &op, bool sym_sparse = false);
  
--      nprow_ = (int)superlu_internal::sqrti((unsigned int)numProcs_);
--      while (numProcs_ % nprow_ != 0 && nprow_ > 0)
--      {
--         nprow_--;
--      }
-+   superlu_dist_options_t *options = (superlu_dist_options_t *)optionsPtr_;
+    ~STRUMPACKRowLocMatrix();
  
--      npcol_ = (int)(numProcs_ / nprow_);
--      MFEM_ASSERT(nprow_ * npcol_ == numProcs_, "");
--   }
-+   ScalePermstruct_t *ScalePermstruct = (ScalePermstruct_t *)ScalePermstructPtr_;
-+   LUstruct_t        *LUstruct        = (LUstruct_t *)LUstructPtr_;
+    void Mult(const Vector &x, Vector &y) const
+    {
+-      mfem_error("STRUMPACKRowLocMatrix::Mult(...)\n"
+-                 "  matrix vector products are not supported.");
++      MFEM_ABORT("STRUMPACKRowLocMatrix::Mult: Matrix vector products are not "
++                 "supported!");
+    }
  
--   superlu_gridinit(comm_, nprow_, npcol_, grid);
-+   gridinfo_t        *grid;
-+#if SUPERLU_DIST_MAJOR_VERSION > 7 || \
-+   (SUPERLU_DIST_MAJOR_VERSION == 7 && SUPERLU_DIST_MINOR_VERSION >= 2)
-+   gridinfo3d_t      *grid3d = NULL;
-+   if (npdep_ > 1)
-+   {
-+      grid3d = (gridinfo3d_t *)gridPtr_;
-+      grid = NULL;
-+   }
-+   else
-+#endif
-+   {
-+      grid = (gridinfo_t *)gridPtr_;
-+   }
+-   MPI_Comm GetComm() const { return comm_; }
++   MPI_Comm GetComm() const { return A_->comm(); }
  
--   gridInitialized_ = true;
--}
-+   // Set mfem::Operator member data
-+   MFEM_VERIFY(!LUStructInitialized ||
-+               (height == op.Height() && width == op.Width()),
-+               "SuperLUSolver::SetOperator: Inconsistent new matrix size!");
-+   height = op.Height();
-+   width  = op.Width();
+-   strumpack::CSRMatrixMPI<double,int>* getA() const { return A_; }
++   strumpack::CSRMatrixMPI<double, HYPRE_BigInt> *GetA() const { return A_; }
  
--void SuperLUSolver::DismantleGrid()
--{
--   if ( gridInitialized_ )
-+   if (!LUStructInitialized)
-    {
--      gridinfo_t * grid = (gridinfo_t*)gridPtr_;
+ private:
+-   MPI_Comm   comm_;
+-   strumpack::CSRMatrixMPI<double,int>* A_;
 -
--      superlu_gridexit(grid);
-+      // Initialize ScalePermstruct and LUstruct once for all operators (must
-+      // have same dimensions)
-+      ScalePermstructInit(APtr_->GetGlobalNumRows(),
-+                          APtr_->GetGlobalNumColumns(), ScalePermstruct);
-+      LUstructInit(APtr_->GetGlobalNumColumns(), LUstruct);
-+      options->Fact = DOFACT;
-    }
-+   else
-+   {
-+      // A previous matrix has already been set and factored
-+      switch (options->Fact)
-+      {
-+         case DOFACT:
-+            MFEM_ABORT("SuperLUSolver::SetOperator: Previous matrix was never used!");
-+            break;
-+         case SamePattern_SameRowPerm:
-+         {
-+            // Just zero the LU factors
-+#if SUPERLU_DIST_MAJOR_VERSION > 7 || \
-+(SUPERLU_DIST_MAJOR_VERSION == 7 && SUPERLU_DIST_MINOR_VERSION >= 2)
-+            if (npdep_ > 1)
-+            {
-+               if (grid3d->zscp.Iam == 0)
-+               {
-+                  ZeroLblocks(grid3d->iam, APtr_->GetGlobalNumColumns(),
-+                              &(grid3d->grid2d), LUstruct);
-+                  ZeroUblocks(grid3d->iam, APtr_->GetGlobalNumColumns(),
-+                              &(grid3d->grid2d), LUstruct);
-+               }
-+            }
-+            else
-+#endif
-+            {
-+               ZeroLblocks(grid->iam, APtr_->GetGlobalNumColumns(),
-+                           grid, LUstruct);
-+               ZeroUblocks(grid->iam, APtr_->GetGlobalNumColumns(),
-+                           grid, LUstruct);
-+            }
-+         }
-+         break;
-+         case SamePattern:
-+         case FACTORED:
-+         {
-+            // Delete factors from the prior factorization
-+#if SUPERLU_DIST_MAJOR_VERSION > 7 || \
-+(SUPERLU_DIST_MAJOR_VERSION == 7 && SUPERLU_DIST_MINOR_VERSION >= 2)
-+            if (npdep_ > 1)
-+            {
-+               if (grid3d->zscp.Iam == 0)
-+               {
-+                  Destroy_LU(APtr_->GetGlobalNumColumns(), &(grid3d->grid2d),
-+                             LUstruct);
-+               }
-+               else
-+               {
-+                  DeAllocLlu_3d(APtr_->GetGlobalNumColumns(), LUstruct,
-+                                grid3d);
-+                  DeAllocGlu_3d(LUstruct);
-+               }
-+            }
-+            else
-+#endif
-+            {
-+               Destroy_LU(APtr_->GetGlobalNumColumns(), grid, LUstruct);
-+            }
-+         }
-+         break;
-+         default:
-+            MFEM_ABORT("SuperLUSolver::SetOperator: Unexpected value for "
-+                       "options->Fact!");
-+            break;
-+      }
-+      if (options->Fact == FACTORED) { options->Fact = DOFACT; }
-+   }
-+}
+-}; // mfem::STRUMPACKRowLocMatrix
++   strumpack::CSRMatrixMPI<double, HYPRE_BigInt> *A_;
++};
  
--   gridInitialized_ = false;
-+void SuperLUSolver::Mult(const Vector &x, Vector &y) const
-+{
-+   Array<const Vector *> X(1);
-+   Array<Vector *> Y(1);
-+   X[0] = &x;
-+   Y[0] = &y;
-+   ArrayMult(X, Y);
- }
+ /** The MFEM STRUMPACK Direct Solver class.
  
--void SuperLUSolver::Mult( const Vector & x, Vector & y ) const
-+void SuperLUSolver::ArrayMult(const Array<const Vector *> &X,
-+                              Array<Vector *> &Y) const
+     The mfem::STRUMPACKSolver class uses the STRUMPACK library to perform LU
+     factorization of a parallel sparse matrix. The solver is capable of handling
+-    double precision types. See http://portal.nersc.gov/project/sparse/strumpack
++    double precision types. See
++    http://portal.nersc.gov/project/sparse/strumpack/.
+ */
+-class STRUMPACKSolver : public mfem::Solver
++template <typename STRUMPACKSolverType>
++class STRUMPACKSolverBase : public Solver
  {
-    MFEM_ASSERT(APtr_ != NULL,
-                "SuperLU Error: The operator must be set before"
-                " the system can be solved.");
--
--   superlu_dist_options_t * options = (superlu_dist_options_t*)optionsPtr_;
--   SuperLUStat_t     * stat         = (SuperLUStat_t*)statPtr_;
--   SuperMatrix       * A            = (SuperMatrix*)APtr_->InternalData();
--
--   ScalePermstruct_t * SPstruct     = (ScalePermstruct_t*)ScalePermstructPtr_;
--   LUstruct_t        * LUstruct     = (LUstruct_t*)LUstructPtr_;
--   SOLVEstruct_t     * SOLVEstruct  = (SOLVEstruct_t*)SOLVEstructPtr_;
--   gridinfo_t        * grid         = (gridinfo_t*)gridPtr_;
--
--   if (!firstSolveWithThisA_)
-+   SuperMatrix            *A       = (SuperMatrix *)APtr_->InternalData();
-+   superlu_dist_options_t *options = (superlu_dist_options_t *)optionsPtr_;
-+
-+   ScalePermstruct_t *ScalePermstruct = (ScalePermstruct_t *)ScalePermstructPtr_;
-+   LUstruct_t        *LUstruct        = (LUstruct_t *)LUstructPtr_;
-+   SOLVEstruct_t     *SOLVEstruct     = (SOLVEstruct_t *)SOLVEstructPtr_;
-+
-+   gridinfo_t        *grid;
-+#if SUPERLU_DIST_MAJOR_VERSION > 7 || \
-+   (SUPERLU_DIST_MAJOR_VERSION == 7 && SUPERLU_DIST_MINOR_VERSION >= 2)
-+   gridinfo3d_t      *grid3d = NULL;
-+   if (npdep_ > 1)
-    {
--      options->Fact = FACTORED; // Indicate the factored form of A is supplied.
-+      grid3d = (gridinfo3d_t *)gridPtr_;
-+      grid = NULL;
-    }
--   else // This is the first solve with this A
-+   else
-+#endif
-    {
--      firstSolveWithThisA_ = false;
--
--      // Make sure that the parameters have been initialized The only parameter
--      // we might have to worry about is ScalePermstruct, if the user is
--      // supplying a row or column permutation.
--
--      // Initialize ScalePermstruct and LUstruct.
--      SPstruct->DiagScale = NOEQUIL;
-+      grid = (gridinfo_t *)gridPtr_;
-+   }
- 
--      // Transfer ownership of the row permutations if available
--      if ( perm_r_ != NULL )
-+   // SuperLU overwrites x with y, so copy x to y and pass that to the solve
-+   // routine. Due to issues with repeated solves and changes in the number
-+   // of RHS vectors, this is not supported.
-+   MFEM_ASSERT(X.Size() == Y.Size(),
-+               "Number of columns mismatch in SuperLUSolver::Mult!");
-+   MFEM_VERIFY(nrhs_ < 1 || nrhs_ == X.Size(),
-+               "SuperLUSolver does not support multiple solves with different "
-+               "numbers of RHS vectors!");
-+   int ldx = Height();
-+   if (X.Size() == 1)
-+   {
-+      MFEM_ASSERT(X[0] && Y[0], "Missing Vector in SuperLUSolver::Mult!");
-+      sol_.MakeRef(*Y[0], 0, Y[0]->Size());
-+      sol_ = *X[0];
-+      nrhs_ = 1;
-+   }
-+   else
-+   {
-+      if (nrhs_ < 1)
-       {
--         SPstruct->perm_r = perm_r_;
--         perm_r_ = NULL;
-+         sol_.SetSize(X.Size() * ldx);
-+         nrhs_ = X.Size();
-       }
--      else
-+      for (int i = 0; i < nrhs_; i++)
-       {
--         if ( !(SPstruct->perm_r = intMalloc_dist(A->nrow)) )
--         {
--            ABORT("Malloc fails for perm_r[].");
--         }
-+         MFEM_ASSERT(X[i], "Missing Vector in SuperLUSolver::Mult!");
-+         Vector s(sol_, i * ldx, ldx);
-+         s = *X[i];
-       }
--      if ( !(SPstruct->perm_c = intMalloc_dist(A->ncol)) )
-+   }
-+
-+   // Solve the system
-+   double *B = sol_.HostReadWrite(), *berr;
-+   if (!(berr = doubleMalloc_dist(nrhs_)))
-+   {
-+      MFEM_ABORT("SuperLUSolver::Mult: Malloc failed for berr!");
-+   }
-+   SuperLUStat_t stat;
-+   PStatInit(&stat);
-+   int info = -1;
-+#if SUPERLU_DIST_MAJOR_VERSION > 7 || \
-+   (SUPERLU_DIST_MAJOR_VERSION == 7 && SUPERLU_DIST_MINOR_VERSION >= 2)
-+   if (npdep_ > 1)
-+   {
-+      pdgssvx3d(options, A, ScalePermstruct, B, ldx, nrhs_,
-+                grid3d, LUstruct, SOLVEstruct, berr, &stat, &info);
-+   }
-+   else
-+#endif
-+   {
-+      pdgssvx(options, A, ScalePermstruct, B, ldx, nrhs_,
-+              grid, LUstruct, SOLVEstruct, berr, &stat, &info);
-+   }
-+   HandleError(info);
-+   SUPERLU_FREE(berr);
-+   PStatFree(&stat);
-+   options->Fact = FACTORED;
-+
-+   // Copy solution into output (no need to do anything for single RHS since
-+   // solution is written directly into output Vector)
-+   if (nrhs_ > 1)
-+   {
-+      for (int i = 0; i < nrhs_; i++)
-       {
--         ABORT("Malloc fails for perm_c[].");
-+         MFEM_ASSERT(Y[i], "Missing Vector in SuperLUSolver::Mult!");
-+         Vector s(sol_, i * ldx, ldx);
-+         *Y[i] = s;
-       }
--
--      LUstructInit(A->ncol, LUstruct);
--      LUStructInitialized_ = true;
-    }
-+}
+-public:
+-   // Constructor with MPI_Comm parameter.
+-   STRUMPACKSolver( int argc, char* argv[], MPI_Comm comm );
++protected:
++   // Constructor with MPI_Comm parameter and command line arguments.
++   STRUMPACKSolverBase(MPI_Comm comm, int argc, char *argv[]);
  
--   // SuperLU overwrites x with y, so copy x to y and pass that to the solve
--   // routine.
-+void SuperLUSolver::MultTranspose(const Vector &x, Vector &y) const
-+{
-+   // Set flag for transpose solve
-+   superlu_dist_options_t *options = (superlu_dist_options_t *)optionsPtr_;
-+   options->Trans = TRANS;
-+   Mult(x, y);
+-   // Constructor with STRUMPACK Matrix Object.
+-   STRUMPACKSolver( STRUMPACKRowLocMatrix & A);
++   // Constructor with STRUMPACK matrix object and command line arguments.
++   STRUMPACKSolverBase(STRUMPACKRowLocMatrix &A, int argc, char *argv[]);
  
--   const double *xPtr = x.HostRead();
--   y = xPtr;
--   double * yPtr = y.HostReadWrite();
-+   // Reset the flag
-+   options->Trans = NOTRANS;
-+}
++public:
+    // Default destructor.
+-   ~STRUMPACKSolver( void );
++   virtual ~STRUMPACKSolverBase();
  
--   int      info = -1, locSize = y.Size();
-+void SuperLUSolver::ArrayMultTranspose(const Array<const Vector *> &X,
-+                                       Array<Vector *> &Y) const
-+{
-+   // Set flag for transpose solve
-+   superlu_dist_options_t *options = (superlu_dist_options_t *)optionsPtr_;
-+   options->Trans = TRANS;
-+   ArrayMult(X, Y);
+    // Factor and solve the linear system y = Op^{-1} x.
+-   void Mult( const Vector & x, Vector & y ) const;
++   void Mult(const Vector &x, Vector &y) const;
++   void ArrayMult(const Array<const Vector *> &X, Array<Vector *> &Y) const;
  
--   // Solve the system
--   pdgssvx(options, A, SPstruct, yPtr, locSize, nrhs_, grid,
--           LUstruct, SOLVEstruct, berr_, stat, &info);
-+   // Reset the flag
-+   options->Trans = NOTRANS;
-+}
+    // Set the operator.
+-   void SetOperator( const Operator & op );
++   void SetOperator(const Operator &op);
  
--   if ( info != 0 )
-+void SuperLUSolver::HandleError(int info) const
-+{
-+   if (info != 0)
-    {
--      if ( info < 0 )
-+      SuperMatrix *A = (SuperMatrix *)APtr_->InternalData();
-+      if (info < 0)
-       {
-          switch (-info)
-          {
-             case 1:
--               MFEM_ABORT("SuperLU:  SuperLU options are invalid.");
-+               MFEM_ABORT("SuperLUSolver: SuperLU options are invalid!");
-                break;
-             case 2:
--               MFEM_ABORT("SuperLU:  Matrix A (in Ax=b) is invalid.");
-+               MFEM_ABORT("SuperLUSolver: Matrix A (in Ax=b) is invalid!");
-                break;
-             case 5:
--               MFEM_ABORT("SuperLU:  Vector b dimension (in Ax=b) is invalid.");
-+               MFEM_ABORT("SuperLUSolver: Vector b dimension (in Ax=b) is "
-+                          "invalid!");
-                break;
-             case 6:
--               MFEM_ABORT("SuperLU:  Number of right-hand sides is invalid.");
-+               MFEM_ABORT("SuperLUSolver: Number of right-hand sides is "
-+                          "invalid!");
-                break;
-             default:
--               MFEM_ABORT("SuperLU:  Parameter with index "
--                          << -info << "invalid. (1-indexed)");
-+               MFEM_ABORT("SuperLUSolver: Parameter with index "
-+                          << -info << "invalid (1-indexed)!");
-                break;
-          }
-       }
--      else if ( info <= A->ncol )
-+      else if (info <= A->ncol)
-       {
--         MFEM_ABORT("SuperLU:  Found a singular matrix, U("
--                    << info << "," << info << ") is exactly zero.");
-+         MFEM_ABORT("SuperLUSolver: Found a singular matrix, U("
-+                    << info << "," << info << ") is exactly zero!");
-       }
--      else if ( info > A->ncol )
-+      else if (info > A->ncol)
-       {
--         MFEM_ABORT("SuperLU:  Memory allocation error with "
--                    << info - A->ncol << " bytes already allocated,");
-+         MFEM_ABORT("SuperLUSolver: Memory allocation error with "
-+                    << info - A->ncol << " bytes already allocated!");
-       }
-       else
-       {
--         MFEM_ABORT("Unknown SuperLU Error");
-+         MFEM_ABORT("Unknown SuperLU error: info = " << info << "!");
-       }
-    }
- }
+    // Set various solver options. Refer to STRUMPACK documentation for
+    // details.
+-   void SetFromCommandLine( );
+-   void SetPrintFactorStatistics( bool print_stat );
+-   void SetPrintSolveStatistics( bool print_stat );
+-   void SetRelTol( double rtol );
+-   void SetAbsTol( double atol );
++   void SetFromCommandLine();
++   void SetPrintFactorStatistics(bool print_stat);
++   void SetPrintSolveStatistics(bool print_stat);
++
++   // Set tolerances and iterations for iterative solvers. Compression
++   // tolerance is handled below.
++   void SetRelTol(double rtol);
++   void SetAbsTol(double atol);
++   void SetMaxIter(int max_it);
++
++   // Set the flag controlling reuse of the symbolic factorization for multiple
++   // operators. This method has to be called before repeated calls to
++   // SetOperator.
++   void SetReorderingReuse(bool reuse);
++
++   // Enable or disable GPU off-loading, available if STRUMPACK was compiled with CUDA.
++   // Note that input/output from MFEM to STRUMPACK is all still through host memory.
++   void EnableGPU();
++   void DisableGPU();
  
--void SuperLUSolver::SetOperator( const Operator & op )
--{
--   // Verify that we have a compatible operator
--   APtr_ = dynamic_cast<const SuperLURowLocMatrix*>(&op);
--   if ( APtr_ == NULL )
--   {
--      mfem_error("SuperLUSolver::SetOperator : not SuperLURowLocMatrix!");
--   }
--
--   // Everything is OK so finish setting the operator
--   firstSolveWithThisA_ = true;
--
--   // Set mfem::Operator member data
--   height = op.Height();
--   width  = op.Width();
--
--   // Initialize the processor grid if necessary
--   if (!gridInitialized_)
--   {
--      this->SetupGrid();
--   }
--}
--
--} // mfem namespace
-+} // namespace mfem
+    /**
+     * STRUMPACK is an (approximate) direct solver. It can be used as a direct
+@@ -100,70 +119,151 @@ public:
+     * used without preconditioner.
+     *
+     * Supported values are:
+-    *    AUTO:           Use iterative refinement if no HSS compression is used,
+-    *                    otherwise use GMRes.
+-    *    DIRECT:         No outer iterative solver, just a single application of
+-    *                    the multifrontal solver.
+-    *    REFINE:         Iterative refinement.
+-    *    PREC_GMRES:     Preconditioned GMRes.
+-    *                    The preconditioner is the (approx) multifrontal solver.
+-    *    GMRES:          UN-preconditioned GMRes. (for testing mainly)
+-    *    PREC_BICGSTAB:  Preconditioned BiCGStab.
+-    *                    The preconditioner is the (approx) multifrontal solver.
++    *    AUTO:           Use iterative refinement if no HSS compression is
++    *                    used, otherwise use GMRes
++    *    DIRECT:         No outer iterative solver, just a single application
++    *                    of the multifrontal solver
++    *    REFINE:         Iterative refinement
++    *    PREC_GMRES:     Preconditioned GMRes
++    *                    The preconditioner is the (approx) multifrontal solver
++    *    GMRES:          UN-preconditioned GMRes (for testing mainly)
++    *    PREC_BICGSTAB:  Preconditioned BiCGStab
++    *                    The preconditioner is the (approx) multifrontal solver
+     *    BICGSTAB:       UN-preconditioned BiCGStab. (for testing mainly)
+     */
+-   void SetKrylovSolver( strumpack::KrylovSolver method );
++   void SetKrylovSolver(strumpack::KrylovSolver method);
  
- #endif // MFEM_USE_MPI
- #endif // MFEM_USE_SUPERLU
-diff --git a/linalg/superlu.hpp b/linalg/superlu.hpp
-index 1edec0a08..e22020751 100644
---- a/linalg/superlu.hpp
-+++ b/linalg/superlu.hpp
-@@ -16,33 +16,30 @@
+    /**
+     * Supported reorderings are:
+-    *    METIS, PARMETIS, SCOTCH, PTSCOTCH, RCM
++    *    NATURAL:    Do not reorder the system
++    *    METIS:      Use Metis nested-dissection reordering (default)
++    *    PARMETIS:   Use ParMetis nested-dissection reordering
++    *    SCOTCH:     Use Scotch nested-dissection reordering
++    *    PTSCOTCH:   Use PT-Scotch nested-dissection reordering
++    *    RCM:        Use RCM reordering
++    *    GEOMETRIC:  A simple geometric nested dissection code that
++    *                only works for regular meshes
++    *    AMD:        Approximate minimum degree
++    *    MMD:        Multiple minimum degree
++    *    AND:        Nested dissection
++    *    MLF:        Minimum local fill
++    *    SPECTRAL:   Spectral nested dissection
+     */
+-   void SetReorderingStrategy( strumpack::ReorderingStrategy method );
++   void SetReorderingStrategy(strumpack::ReorderingStrategy method);
  
- #ifdef MFEM_USE_SUPERLU
- #ifdef MFEM_USE_MPI
-+
- #include "operator.hpp"
- #include "hypre.hpp"
+    /**
+-    * Disable static pivoting for stability. The static pivoting in strumpack
++    * Configure static pivoting for stability. The static pivoting in STRUMPACK
+     * permutes the sparse input matrix in order to get large (nonzero) elements
+     * on the diagonal. If the input matrix is already diagonally dominant, this
+     * reordering can be disabled.
++    *
++    * Supported matching algorithms are:
++    *    NONE:                          Don't do anything
++    *    MAX_CARDINALITY:               Maximum cardinality
++    *    MAX_SMALLEST_DIAGONAL:         Maximum smallest diagonal value
++    *    MAX_SMALLEST_DIAGONAL_2:       Same as MAX_SMALLEST_DIAGONAL
++    *                                   but different algorithm
++    *    MAX_DIAGONAL_SUM:              Maximum sum of diagonal values
++    *    MAX_DIAGONAL_PRODUCT_SCALING:  Maximum product of diagonal values
++    *                                   and row and column scaling (default)
++    *    COMBBLAS:                      Use AWPM from CombBLAS (only with
++    *                                   version >= 3)
+     */
+-   void DisableMatching();
 -
- #include <mpi.h>
- 
- namespace mfem
- {
+-   /**
+-    * Enable static pivoting for stability using the MC64 algorithm with
+-    * job=5. Using a matching algorithm, this will permute the sparse input
+-    * matrix in order to get nonzero elements (as large as possible) on the
+-    * diagonal. And will also scale the rows and columns of the matrix.
+-    */
+-   void EnableMatching();
++   void SetMatching(strumpack::MatchingJob job);
  
--namespace superlu_internal
--{
--unsigned int sqrti(const unsigned int & a);
--}
--
- namespace superlu
- {
--// Copy selected enumerations from SuperLU
-+
-+// Copy selected enumerations from SuperLU (from superlu_enum_consts.h)
- #ifdef MFEM_USE_SUPERLU5
--typedef enum {NOROWPERM, LargeDiag, MY_PERMR}                       RowPerm;
-+typedef enum {NOROWPERM, LargeDiag, MY_PERMR}                      RowPerm;
- #else
--typedef enum {NOROWPERM, LargeDiag_MC64, LargeDiag_HWPM, MY_PERMR}  RowPerm;
-+typedef enum {NOROWPERM, LargeDiag_MC64, LargeDiag_HWPM, MY_PERMR} RowPerm;
+-#if STRUMPACK_VERSION_MAJOR >= 3
+    /**
+-    * Use the AWPM (approximate weight perfect matching) algorithm from the
+-    * Combinatorial BLAS library for static pivoting, i.e. getting large
+-    * nonzeros on the diagonal. This requires that strumpack was compiled with
+-    * support for Combinatorial BLAS.
++    * Enable support for rank-structured data formats, which can be used
++    * for compression within the sparse solver.
++    *
++    * Supported compression types are:
++    *    NONE:           No compression, purely direct solver (default)
++    *    HSS:            HSS compression of frontal matrices
++    *    BLR:            Block low-rank compression of fronts
++    *    HODLR:          Hierarchically Off-diagonal Low-Rank
++    *                    compression of frontal matrices
++    *    BLR_HODLR:      Block low-rank compression of medium
++    *                    fronts and Hierarchically Off-diagonal
++    *                    Low-Rank compression of large fronts
++    *    ZFP_BLR_HODLR:  ZFP compression for small fronts,
++    *                    Block low-rank compression of medium
++    *                    fronts and Hierarchically Off-diagonal
++    *                    Low-Rank compression of large fronts
++    *    LOSSLESS:       Lossless compression
++    *    LOSSY:          Lossy compression
++    *
++    * For versions of STRUMPACK < 5, we support only NONE, HSS, and BLR.
++    * BLR_HODLR and ZFP_BLR_HODLR are supported in STRUMPACK >= 6.
+     */
+-   void EnableParallelMatching();
++   void SetCompression(strumpack::CompressionType type);
++   void SetCompressionRelTol(double rtol);
++   void SetCompressionAbsTol(double atol);
++#if STRUMPACK_VERSION_MAJOR >= 5
++   void SetCompressionLossyPrecision(int precision);
++   void SetCompressionButterflyLevels(int levels);
  #endif
- typedef enum {NATURAL, MMD_ATA, MMD_AT_PLUS_A, COLAMD,
-               METIS_AT_PLUS_A, PARMETIS, ZOLTAN, MY_PERMC
--             }          ColPerm;
--typedef enum {NOTRANS, TRANS, CONJ}                                 Trans;
--typedef enum {NOREFINE, SLU_SINGLE=1, SLU_DOUBLE, SLU_EXTRA}        IterRefine;
--}
-+             } ColPerm;
-+typedef enum {NOREFINE, SLU_SINGLE=1, SLU_DOUBLE, SLU_EXTRA} IterRefine;
-+typedef enum {DOFACT, SamePattern, SamePattern_SameRowPerm, FACTORED} Fact;
-+
-+} // namespace superlu
- 
- class SuperLURowLocMatrix : public Operator
- {
-@@ -52,34 +49,35 @@ public:
-        be of size (local) nrows by (global) glob_ncols. The new parallel matrix
-        contains copies of all input arrays (so they can be deleted). */
-    SuperLURowLocMatrix(MPI_Comm comm,
--                       int num_loc_rows, int first_loc_row,
--                       int glob_nrows, int glob_ncols,
--                       int *I, int *J, double *data);
-+                       int num_loc_rows, HYPRE_BigInt first_loc_row,
-+                       HYPRE_BigInt glob_nrows, HYPRE_BigInt glob_ncols,
-+                       int *I, HYPRE_BigInt *J, double *data);
- 
-    /** Creates a copy of the parallel matrix hypParMat in SuperLU's RowLoc
-        format. All data is copied so the original matrix may be deleted. */
--   SuperLURowLocMatrix(const HypreParMatrix & hypParMat);
-+   SuperLURowLocMatrix(const Operator &op);
  
-    ~SuperLURowLocMatrix();
+ private:
+-   void Init( int argc, char* argv[] );
++   // Helper method for calling the STRUMPACK factorization routine.
++   void FactorInternal() const;
  
-    void Mult(const Vector &x, Vector &y) const
-    {
--      mfem_error("SuperLURowLocMatrix::Mult(...)\n"
--                 "  matrix vector products are not supported.");
-+      MFEM_ABORT("SuperLURowLocMatrix::Mult: Matrix vector products are not "
-+                 "supported!");
-    }
+ protected:
+-
+-   MPI_Comm      comm_;
+-   int           numProcs_;
+-   int           myid_;
++   const STRUMPACKRowLocMatrix *APtr_;
++   STRUMPACKSolverType         *solver_;
  
-+   void *InternalData() const { return rowLocPtr_; }
+    bool factor_verbose_;
+    bool solve_verbose_;
++   bool reorder_reuse_;
 +
-    MPI_Comm GetComm() const { return comm_; }
- 
--   void * InternalData() const { return rowLocPtr_; }
-+   HYPRE_BigInt GetGlobalNumRows() const { return num_global_rows_; }
- 
--   HYPRE_BigInt GetGlobalNumColumns() const { return num_global_cols; }
-+   HYPRE_BigInt GetGlobalNumColumns() const { return num_global_cols_; }
- 
- private:
--   MPI_Comm   comm_;
--   void     * rowLocPtr_;
--   HYPRE_BigInt num_global_cols;
--
--}; // mfem::SuperLURowLocMatrix
-+   MPI_Comm     comm_;
-+   void        *rowLocPtr_;
-+   HYPRE_BigInt num_global_rows_, num_global_cols_;
++   mutable Vector rhs_, sol_;
++   mutable int    nrhs_;
 +};
  
- /** The MFEM SuperLU Direct Solver class.
- 
-@@ -88,80 +86,75 @@ private:
-     double precision types. It is currently maintained by Xiaoye Sherry Li at
-     NERSC, see http://crd-legacy.lbl.gov/~xiaoye/SuperLU/.
- */
--class SuperLUSolver : public mfem::Solver
-+class SuperLUSolver : public Solver
- {
- public:
-    // Constructor with MPI_Comm parameter.
--   SuperLUSolver( MPI_Comm comm );
-+   SuperLUSolver(MPI_Comm comm, int npdep = 1);
- 
--   // Constructor with SuperLU Matrix Object.
--   SuperLUSolver( SuperLURowLocMatrix & A);
-+   // Constructor with SuperLU matrix object.
-+   SuperLUSolver(SuperLURowLocMatrix &A, int npdep = 1);
- 
-    // Default destructor.
--   ~SuperLUSolver( void );
-+   ~SuperLUSolver();
- 
--   // Allocate and deallocate the MPI communicators. This routine is called
--   // internally by SetOperator().
--   void SetupGrid();
--   // This routing must be called after the solve, but before destruction.
--   void DismantleGrid();
-+   // Set the operator.
-+   void SetOperator(const Operator &op);
+-   const STRUMPACKRowLocMatrix * APtr_;
+-   strumpack::StrumpackSparseSolverMPIDist<double,int> * solver_;
++class STRUMPACKSolver :
++   public STRUMPACKSolverBase<strumpack::
++   SparseSolverMPIDist<double, HYPRE_BigInt>>
++{
++public:
++   // Constructor with MPI_Comm parameter.
++   STRUMPACKSolver(MPI_Comm comm);
++
++   // Constructor with STRUMPACK matrix object.
++   STRUMPACKSolver(STRUMPACKRowLocMatrix &A);
  
-    // Factor and solve the linear system y = Op^{-1} x.
--   void Mult( const Vector & x, Vector & y ) const;
--
--   // Set the operator.
--   void SetOperator( const Operator & op );
--
--   // Set various solver options. Refer to SuperLU documentation for details.
--   void SetPrintStatistics  ( bool              print_stat );
--   void SetEquilibriate     ( bool                   equil );
--   void SetColumnPermutation( superlu::ColPerm    col_perm );
--   void SetRowPermutation   ( superlu::RowPerm    row_perm,
--                              Array<int> *     perm = NULL );
--   void SetTranspose        ( superlu::Trans         trans );
--   void SetIterativeRefine  ( superlu::IterRefine iter_ref );
--   void SetReplaceTinyPivot ( bool                     rtp );
--   void SetNumLookAheads    ( int           num_lookaheads );
--   void SetLookAheadElimTree( bool                   etree );
--   void SetSymmetricPattern ( bool                     sym );
--   void SetParSymbFact      ( bool                     par );
-+   // Note: Factorization modifies the operator matrix.
-+   void Mult(const Vector &x, Vector &y) const;
-+   void ArrayMult(const Array<const Vector *> &X, Array<Vector *> &Y) const;
+-}; // mfem::STRUMPACKSolver class
++   // Constructor with MPI_Comm parameter and command line arguments.
++   STRUMPACKSolver(MPI_Comm comm, int argc, char *argv[]);
 +
-+   // Factor and solve the linear system y = Op^{-T} x.
-+   // Note: Factorization modifies the operator matrix.
-+   void MultTranspose(const Vector &x, Vector &y) const;
-+   void ArrayMultTranspose(const Array<const Vector *> &X,
-+                           Array<Vector *> &Y) const;
++   // Constructor with STRUMPACK matrix object and command line arguments.
++   STRUMPACKSolver(STRUMPACKRowLocMatrix &A, int argc, char *argv[]);
 +
-+   // Set various solver options. Refer to SuperLU_DIST documentation for
-+   // details.
-+   void SetPrintStatistics(bool print_stat);
-+   void SetEquilibriate(bool equil);
-+   void SetColumnPermutation(superlu::ColPerm col_perm);
-+   void SetRowPermutation(superlu::RowPerm row_perm);
-+   void SetIterativeRefine(superlu::IterRefine iter_ref);
-+   void SetReplaceTinyPivot(bool rtp);
-+   void SetNumLookAheads(int num_lookaheads);
-+   void SetLookAheadElimTree(bool etree);
-+   void SetSymmetricPattern(bool sym);
-+   void SetParSymbFact(bool par);
-+   void SetFact(superlu::Fact fact);
++   // Destructor.
++   ~STRUMPACKSolver() {}
++};
 +
-+   // Processor grid for SuperLU_DIST.
-+   const int nprow_, npcol_, npdep_;
- 
- private:
--   void Init();
-+   // Initialize the solver.
-+   void Init(MPI_Comm comm);
- 
--protected:
-+   // Handle error message from call to SuperLU solver.
-+   void HandleError(int info) const;
- 
--   MPI_Comm      comm_;
--   int           numProcs_;
--   int           myid_;
--
--   const SuperLURowLocMatrix * APtr_;
--
--   // The actual types of the following pointers are hidden to avoid exposing
--   // the SuperLU header files to the entire library. Their types are given in
--   // the trailing comments. The reason that this is necessary is that SuperLU
--   // defines these structs differently for use with its real and complex
--   // solvers. If we want to add support for SuperLU's complex solvers one day
--   // we will need to hide these types to avoid name conflicts.
--   void*         optionsPtr_;         // superlu_options_t *
--   void*         statPtr_;            //     SuperLUStat_t *
--   void*         ScalePermstructPtr_; //  ScalePermsruct_t *
--   void*         LUstructPtr_;        //        LUstruct_t *
--   void*         SOLVEstructPtr_;     //     SOLVEstruct_t *
--   void*         gridPtr_;            //        gridinfo_t *
--
--   double*       berr_;
--   mutable int*  perm_r_;
--   int           nrhs_;
--   int           nprow_;
--   int           npcol_;
--   mutable bool  firstSolveWithThisA_;
--   bool          gridInitialized_;
--   mutable bool  LUStructInitialized_;
--
--}; // mfem::SuperLUSolver class
--
--} // mfem namespace
-+protected:
-+   const SuperLURowLocMatrix *APtr_;
-+   mutable Vector             sol_;
-+   mutable int                nrhs_;
++#if STRUMPACK_VERSION_MAJOR >= 7
++class STRUMPACKMixedPrecisionSolver :
++   public STRUMPACKSolverBase<strumpack::
++   SparseSolverMixedPrecisionMPIDist<float, double, HYPRE_BigInt>>
++{
++public:
++   // Constructor with MPI_Comm parameter.
++   STRUMPACKMixedPrecisionSolver(MPI_Comm comm);
 +
-+   /** The actual types of the following pointers are hidden to avoid exposing
-+       the SuperLU header files to the entire library. Their types are given in
-+       the trailing comments. The reason that this is necessary is that SuperLU
-+       defines these structs differently for use with its real and complex
-+       solvers. If we want to add support for SuperLU's complex solvers one day
-+       we will need to hide these types to avoid name conflicts. */
-+   void *optionsPtr_;          // superlu_options_t *
-+   void *ScalePermstructPtr_;  //  ScalePermsruct_t *
-+   void *LUstructPtr_;         //        LUstruct_t *
-+   void *SOLVEstructPtr_;      //     SOLVEstruct_t *
-+   void *gridPtr_;             //        gridinfo_t * or gridinfo3d_t *
-+};
++   // Constructor with STRUMPACK matrix object.
++   STRUMPACKMixedPrecisionSolver(STRUMPACKRowLocMatrix &A);
++
++   // Constructor with MPI_Comm parameter and command line arguments.
++   STRUMPACKMixedPrecisionSolver(MPI_Comm comm, int argc, char *argv[]);
++
++   // Constructor with STRUMPACK matrix object and command line arguments.
++   STRUMPACKMixedPrecisionSolver(STRUMPACKRowLocMatrix &A,
++                                 int argc, char *argv[]);
 +
++   // Destructor.
++   ~STRUMPACKMixedPrecisionSolver() {}
++};
++#endif
+ 
+-} // mfem namespace
 +} // namespace mfem
  
  #endif // MFEM_USE_MPI
- #endif // MFEM_USE_SUPERLU
+ #endif // MFEM_USE_STRUMPACK
 diff --git a/miniapps/nurbs/nurbs_ex11p.cpp b/miniapps/nurbs/nurbs_ex11p.cpp
 index 7b8e3bd2d..e5cf95062 100644
 --- a/miniapps/nurbs/nurbs_ex11p.cpp
@@ -3847,10 +1114,10 @@ index 7b8e3bd2d..e5cf95062 100644
           strumpack->SetFromCommandLine();
           precond = strumpack;
 diff --git a/tests/unit/linalg/test_direct_solvers.cpp b/tests/unit/linalg/test_direct_solvers.cpp
-index 838bb4009..848cf76df 100644
+index d015a8b94..de2d074c1 100644
 --- a/tests/unit/linalg/test_direct_solvers.cpp
 +++ b/tests/unit/linalg/test_direct_solvers.cpp
-@@ -23,14 +23,16 @@ using namespace mfem;
+@@ -23,6 +23,9 @@ using namespace mfem;
  #ifdef MFEM_USE_SUPERLU
  #define DIRECT_SOLVE_PARALLEL
  #endif
@@ -3860,211 +1127,35 @@ index 838bb4009..848cf76df 100644
  
  #if defined(DIRECT_SOLVE_SERIAL) || defined(DIRECT_SOLVE_PARALLEL)
  
--int dim;
--double uexact(const Vector& x)
-+double uexact(const Vector &x)
- {
-    double u;
--   switch (dim)
-+   switch (x.Size())
-    {
-       case 1:
-          u  = 3.0 + 2.0 * x(0) - 0.5 * x(0) * x(0);
-@@ -45,16 +47,16 @@ double uexact(const Vector& x)
-    return u;
- }
- 
--void gradexact(const Vector& x, Vector & grad)
-+void gradexact(const Vector &x, Vector &grad)
- {
--   grad.SetSize(dim);
--   switch (dim)
-+   grad.SetSize(x.Size());
-+   switch (x.Size())
-    {
-       case 1:
-          grad[0] = 2.0 - x(0);
-          break;
-       case 2:
--         grad[0] = 0.2 - 0.9 * x(1) + x(1) * x (1);
-+         grad[0] = 0.2 - 0.9 * x(1) + x(1) * x(1);
-          grad[1] = - 0.9 * x(0) + 2.0 * x(0) * x(1);
-          break;
-       default:
-@@ -68,7 +70,7 @@ void gradexact(const Vector& x, Vector & grad)
- double d2uexact(const Vector& x) // returns \Delta u
- {
-    double d2u;
--   switch (dim)
-+   switch (x.Size())
-    {
-       case 1:
-          d2u  = -1.0;
-@@ -83,7 +85,7 @@ double d2uexact(const Vector& x) // returns \Delta u
-    return d2u;
- }
- 
--double fexact(const Vector& x) // returns -\Delta u
-+double fexact(const Vector &x) // returns -\Delta u
- {
-    double d2u = d2uexact(x);
-    return -d2u;
-@@ -93,7 +95,7 @@ double fexact(const Vector& x) // returns -\Delta u
- 
- #ifdef DIRECT_SOLVE_SERIAL
- 
--TEST_CASE("direct-serial","[CUDA]")
-+TEST_CASE("Serial Direct Solvers", "[CUDA]")
- {
-    const int ne = 2;
-    for (dim = 1; dim < 4; ++dim)
-@@ -114,10 +116,9 @@ TEST_CASE("direct-serial","[CUDA]")
-                    ne, ne, ne, Element::HEXAHEDRON, 1.0, 1.0, 1.0);
+@@ -100,7 +103,7 @@ TEST_CASE("Serial Direct Solvers", "[CUDA]")
+       Mesh mesh;
+       if (dim == 1)
+       {
+-         mesh = Mesh::MakeCartesian1D(ne,  1.0);
++         mesh = Mesh::MakeCartesian1D(ne, 1.0);
        }
-       int order = 3;
--      FiniteElementCollection* fec = new H1_FECollection(order, dim);
--      FiniteElementSpace fespace(&mesh, fec);
--      Array<int> ess_tdof_list;
--      Array<int> ess_bdr(mesh.bdr_attributes.Max());
-+      H1_FECollection fec(order, dim);
-+      FiniteElementSpace fespace(&mesh, &fec);
-+      Array<int> ess_tdof_list, ess_bdr(mesh.bdr_attributes.Max());
-       ess_bdr = 1;
-       fespace.GetEssentialTrueDofs(ess_bdr, ess_tdof_list);
- 
-@@ -146,15 +147,14 @@ TEST_CASE("direct-serial","[CUDA]")
-       umf_solver.Mult(B, X);
- 
-       Vector Y(X.Size());
--      A->Mult(X,Y);
--      Y-=B;
-+      A->Mult(X, Y);
-+      Y -= B;
-       REQUIRE(Y.Norml2() < 1.e-12);
- 
-       a.RecoverFEMSolution(X, b, x);
--      VectorFunctionCoefficient grad(dim,gradexact);
--      double error = x.ComputeH1Error(&uex,&grad);
-+      VectorFunctionCoefficient grad(dim, gradexact);
-+      double error = x.ComputeH1Error(&uex, &grad);
-       REQUIRE(error < 1.e-12);
--      delete fec;
-    }
- }
- 
-@@ -162,12 +162,12 @@ TEST_CASE("direct-serial","[CUDA]")
- 
- #ifdef DIRECT_SOLVE_PARALLEL
- 
--TEST_CASE("direct-parallel", "[Parallel], [CUDA]")
-+TEST_CASE("Parallel Direct Solvers", "[Parallel], [CUDA]")
+       else if (dim == 2)
+       {
+@@ -163,13 +166,13 @@ TEST_CASE("Parallel Direct Solvers", "[Parallel], [CUDA]")
  {
     int rank;
     MPI_Comm_rank(MPI_COMM_WORLD, &rank);
 -   const int ne = 2;
--   for (dim = 1; dim < 4; ++dim)
-+   const int ne = 8;
-+   for (int dim = 1; dim <= 3; ++dim)
++   const int ne = 4;
+    for (int dim = 1; dim < 4; ++dim)
     {
        Mesh mesh;
        if (dim == 1)
-@@ -185,16 +185,15 @@ TEST_CASE("direct-parallel", "[Parallel], [CUDA]")
-                    ne, ne, ne, Element::HEXAHEDRON, 1.0, 1.0, 1.0);
-       }
- 
--      ParMesh *pmesh = new ParMesh(MPI_COMM_WORLD, mesh);
-+      ParMesh pmesh(MPI_COMM_WORLD, mesh);
-       mesh.Clear();
-       int order = 3;
--      FiniteElementCollection* fec = new H1_FECollection(order, dim);
--      ParFiniteElementSpace fespace(pmesh, fec);
--      Array<int> ess_tdof_list;
--      Array<int> ess_bdr;
--      if (pmesh->bdr_attributes.Size())
-+      H1_FECollection fec(order, dim);
-+      ParFiniteElementSpace fespace(&pmesh, &fec);
-+      Array<int> ess_tdof_list, ess_bdr;
-+      if (pmesh.bdr_attributes.Size())
        {
--         ess_bdr.SetSize(pmesh->bdr_attributes.Max());
-+         ess_bdr.SetSize(pmesh.bdr_attributes.Max());
-          ess_bdr = 1;
-          fespace.GetEssentialTrueDofs(ess_bdr, ess_tdof_list);
+-         mesh = Mesh::MakeCartesian1D(ne,  1.0);
++         mesh = Mesh::MakeCartesian1D(ne, 1.0);
        }
-@@ -217,20 +216,40 @@ TEST_CASE("direct-parallel", "[Parallel], [CUDA]")
-       Vector B, X;
-       a.FormLinearSystem(ess_tdof_list, x, b, A, X, B);
- 
-+      Vector B0(X.Size()), B1(X.Size()), X0(X.Size()), X1(X.Size());
-+      B0 = B;
-+      B1 = B;
-+      B1 *= 2.0;
-+      Array<Vector *> BB(2), XX(2);
-+      BB[0] = &B0;
-+      BB[1] = &B1;
-+      XX[0] = &X0;
-+      XX[1] = &X1;
-+
- #ifdef MFEM_USE_MUMPS
+       else if (dim == 2)
        {
--         MUMPSSolver mumps;
-+         MUMPSSolver mumps(MPI_COMM_WORLD);
-          mumps.SetPrintLevel(0);
-          mumps.SetOperator(*A.As<HypreParMatrix>());
--         mumps.Mult(B,X);
-+         mumps.Mult(B, X);
-+
-          Vector Y(X.Size());
--         A->Mult(X,Y);
--         Y-=B;
-+         A->Mult(X, Y);
-+         Y -= B;
-          REQUIRE(Y.Norml2() < 1.e-12);
- 
-+         mumps.ArrayMult(BB,XX);
-+
-+         for (int i = 0; i < XX.Size(); i++)
-+         {
-+            A->Mult(*XX[i], Y);
-+            Y -= *BB[i];
-+            REQUIRE(Y.Norml2() < 1.e-12);
-+         }
-+
-          a.RecoverFEMSolution(X, b, x);
--         VectorFunctionCoefficient grad(dim,gradexact);
--         double error = x.ComputeH1Error(&uex,&grad);
-+         VectorFunctionCoefficient grad(dim, gradexact);
-+         double error = x.ComputeH1Error(&uex, &grad);
+@@ -288,6 +291,39 @@ TEST_CASE("Parallel Direct Solvers", "[Parallel], [CUDA]")
           REQUIRE(error < 1.e-12);
        }
  #endif
-@@ -244,18 +263,59 @@ TEST_CASE("direct-parallel", "[Parallel], [CUDA]")
-          superlu.SetColumnPermutation(superlu::METIS_AT_PLUS_A);
-          superlu.SetOperator(SA);
-          superlu.Mult(B, X);
-+
-          Vector Y(X.Size());
--         A->Mult(X,Y);
--         Y-=B;
-+         A->Mult(X, Y);
-+         Y -= B;
-          REQUIRE(Y.Norml2() < 1.e-12);
-+
-+         // SuperLUSolver requires constant number of RHS across solves
-+         SuperLURowLocMatrix SA2(*A.As<HypreParMatrix>());
-+         SuperLUSolver superlu2(MPI_COMM_WORLD);
-+         superlu2.SetPrintStatistics(false);
-+         superlu2.SetSymmetricPattern(false);
-+         superlu2.SetColumnPermutation(superlu::METIS_AT_PLUS_A);
-+         superlu2.SetOperator(SA2);
-+         superlu2.ArrayMult(BB, XX);
-+
-+         a.RecoverFEMSolution(X, b, x);
-+         VectorFunctionCoefficient grad(dim, gradexact);
-+         double error = x.ComputeH1Error(&uex, &grad);
-+         REQUIRE(error < 1.e-12);
-+      }
-+#endif
 +#ifdef MFEM_USE_STRUMPACK
 +      // Transform to monolithic HypreParMatrix
 +      {
@@ -4073,7 +1164,8 @@ index 838bb4009..848cf76df 100644
 +         strumpack.SetPrintFactorStatistics(false);
 +         strumpack.SetPrintSolveStatistics(false);
 +         strumpack.SetKrylovSolver(strumpack::KrylovSolver::DIRECT);
-+         strumpack.SetReorderingStrategy(strumpack::ReorderingStrategy::METIS);
++         strumpack.SetReorderingStrategy(dim > 1 ? strumpack::ReorderingStrategy::METIS :
++                                         strumpack::ReorderingStrategy::NATURAL);
 +         strumpack.SetOperator(SA);
 +         strumpack.Mult(B, X);
 +
@@ -4091,16 +1183,12 @@ index 838bb4009..848cf76df 100644
 +            REQUIRE(Y.Norml2() < 1.e-12);
 +         }
 +
-          a.RecoverFEMSolution(X, b, x);
--         VectorFunctionCoefficient grad(dim,gradexact);
--         double error = x.ComputeH1Error(&uex,&grad);
++         a.RecoverFEMSolution(X, b, x);
 +         VectorFunctionCoefficient grad(dim, gradexact);
 +         double error = x.ComputeH1Error(&uex, &grad);
-          REQUIRE(error < 1.e-12);
-       }
- #endif
--      delete fec;
--      delete pmesh;
++         REQUIRE(error < 1.e-12);
++      }
++#endif
     }
  }
  
diff --git a/palace/deps/patch/mfem/patch_hypre_blocks.diff b/palace/deps/patch/mfem/patch_hypre_blocks.diff
deleted file mode 100644
index 346657dcb..000000000
--- a/palace/deps/patch/mfem/patch_hypre_blocks.diff
+++ /dev/null
@@ -1,41 +0,0 @@
-diff --git a/linalg/hypre.cpp b/linalg/hypre.cpp
-index d7069d4e6..689ce675e 100644
---- a/linalg/hypre.cpp
-+++ b/linalg/hypre.cpp
-@@ -2996,10 +2996,7 @@ void GatherBlockOffsetData(MPI_Comm comm, const int rank, const int nprocs,
-    for (int i = 0; i < nprocs; ++i)
-    {
-       globalNum += all_num_loc[i];
--      if (rank == 0)
--      {
--         MFEM_VERIFY(globalNum >= 0, "overflow in global size");
--      }
-+      MFEM_VERIFY(globalNum >= 0, "overflow in global size");
-       if (i < rank)
-       {
-          firstLocal += all_num_loc[i];
-@@ -3064,9 +3061,6 @@ HypreParMatrix * HypreParMatrixFromBlocks(Array2D<HypreParMatrix*> &blocks,
-             const int nrows = blocks(i,j)->NumRows();
-             const int ncols = blocks(i,j)->NumCols();
- 
--            MFEM_VERIFY(nrows > 0 &&
--                        ncols > 0, "Invalid block in HypreParMatrixFromBlocks");
--
-             if (rowOffsets[i+1] == 0)
-             {
-                rowOffsets[i+1] = nrows;
-@@ -3088,14 +3082,11 @@ HypreParMatrix * HypreParMatrixFromBlocks(Array2D<HypreParMatrix*> &blocks,
-             }
-          }
-       }
--
--      MFEM_VERIFY(rowOffsets[i+1] > 0, "Invalid input blocks");
-       rowOffsets[i+1] += rowOffsets[i];
-    }
- 
-    for (int j=0; j<numBlockCols; ++j)
-    {
--      MFEM_VERIFY(colOffsets[j+1] > 0, "Invalid input blocks");
-       colOffsets[j+1] += colOffsets[j];
-    }
- 
diff --git a/palace/deps/patch/mfem/patch_mesh_part.diff b/palace/deps/patch/mfem/patch_mesh_part.diff
index 347ce63a5..55c132611 100644
--- a/palace/deps/patch/mfem/patch_mesh_part.diff
+++ b/palace/deps/patch/mfem/patch_mesh_part.diff
@@ -1,8 +1,8 @@
 diff --git a/.gitignore b/.gitignore
-index 553aa8582..e26611bdc 100644
+index dbdb54ce1..4f405c132 100644
 --- a/.gitignore
 +++ b/.gitignore
-@@ -218,7 +218,7 @@ miniapps/meshing/mobius-strip.mesh
+@@ -220,7 +220,7 @@ miniapps/meshing/mobius-strip.mesh
  miniapps/meshing/klein-bottle.mesh
  miniapps/meshing/toroid-*.mesh
  miniapps/meshing/twist-*.mesh
@@ -383,7 +383,7 @@ index 2ed9f4a1b..96373b2d1 100644
  
  ///  C = A * B  (as boolean matrices)
 diff --git a/mesh/mesh.cpp b/mesh/mesh.cpp
-index de2d80f15..0e0ce23c6 100644
+index 87e606510..a4f4e2069 100644
 --- a/mesh/mesh.cpp
 +++ b/mesh/mesh.cpp
 @@ -19,6 +19,7 @@
@@ -402,7 +402,7 @@ index de2d80f15..0e0ce23c6 100644
  
  // Include the METIS header, if using version 5. If using METIS 4, the needed
  // declarations are inlined below, i.e. no header is needed.
-@@ -2979,7 +2981,7 @@ void Mesh::FinalizeTopology(bool generate_bdr)
+@@ -2986,7 +2988,7 @@ void Mesh::FinalizeTopology(bool generate_bdr)
     {
        GetElementToFaceTable();
        GenerateFaces();
@@ -411,7 +411,7 @@ index de2d80f15..0e0ce23c6 100644
        {
           GenerateBoundaryElements();
           GetElementToFaceTable(); // update be_to_face
-@@ -2999,7 +3001,7 @@ void Mesh::FinalizeTopology(bool generate_bdr)
+@@ -3006,7 +3008,7 @@ void Mesh::FinalizeTopology(bool generate_bdr)
        if (Dim == 2)
        {
           GenerateFaces(); // 'Faces' in 2D refers to the edges
@@ -420,7 +420,7 @@ index de2d80f15..0e0ce23c6 100644
           {
              GenerateBoundaryElements();
           }
-@@ -5387,6 +5389,12 @@ const FiniteElementSpace *Mesh::GetNodalFESpace() const
+@@ -5394,6 +5396,12 @@ const FiniteElementSpace *Mesh::GetNodalFESpace() const
  
  void Mesh::SetCurvature(int order, bool discont, int space_dim, int ordering)
  {
@@ -433,7 +433,7 @@ index de2d80f15..0e0ce23c6 100644
     space_dim = (space_dim == -1) ? spaceDim : space_dim;
     FiniteElementCollection* nfec;
     if (discont)
-@@ -12108,6 +12116,878 @@ int Mesh::FindPoints(DenseMatrix &point_mat, Array<int>& elem_ids,
+@@ -12230,6 +12238,878 @@ void Mesh::GetGeometricParametersFromJacobian(const DenseMatrix &J,
  }
  
  
@@ -1313,7 +1313,7 @@ index de2d80f15..0e0ce23c6 100644
                                     int flags, MemoryType d_mt)
  {
 diff --git a/mesh/mesh.hpp b/mesh/mesh.hpp
-index 3e0590067..f8aa5706a 100644
+index 64cf55ae4..6bddc43e9 100644
 --- a/mesh/mesh.hpp
 +++ b/mesh/mesh.hpp
 @@ -27,6 +27,7 @@
@@ -1337,7 +1337,7 @@ index 3e0590067..f8aa5706a 100644
  
     // Counter for Mesh transformations: refinement, derefinement, rebalancing.
     // Used for checking during Update operations on objects depending on the
-@@ -771,7 +774,7 @@ public:
+@@ -767,7 +770,7 @@ public:
  
     int AddBdrPoint(int v, int attr = 1);
  
@@ -1346,7 +1346,7 @@ index 3e0590067..f8aa5706a 100644
     /// Finalize the construction of a triangular Mesh.
     void FinalizeTriMesh(int generate_edges = 0, int refine = 0,
                          bool fix_orientation = true);
-@@ -1978,6 +1981,195 @@ public:
+@@ -1995,6 +1998,195 @@ public:
  std::ostream &operator<<(std::ostream &out, const Mesh &mesh);
  
  
@@ -1632,22 +1632,20 @@ index c434ae903..b72e5db55 100644
     void SetRefinementFlag(int rf) { refinement_flag = rf; }
  
 diff --git a/miniapps/meshing/makefile b/miniapps/meshing/makefile
-index ce82f238d..b9cd9a30c 100644
+index 315dc4465..656f80d0e 100644
 --- a/miniapps/meshing/makefile
 +++ b/miniapps/meshing/makefile
-@@ -118,7 +118,9 @@ clean-build:
- 	rm -rf *.dSYM *.TVD.*breakpoints
+@@ -121,7 +121,7 @@ clean-build:
+    rm -rf *.dSYM *.TVD.*breakpoints
  
  clean-exec:
--	@rm -f mobius-strip.mesh klein-bottle.mesh mesh-explorer.mesh
-+	@rm -f mobius-strip.mesh klein-bottle.mesh mesh-explorer.mesh*
-+	@rm -f toroid-*.mesh twist-*.mesh trimmer.mesh
-+	@rm -f mobius-strip.mesh klein-bottle.mesh mesh-explorer.mesh*
- 	@rm -f toroid-*.mesh twist-*.mesh trimmer.mesh reflected.mesh
- 	@rm -f partitioning.txt shaper.mesh extruder.mesh
- 	@rm -f optimized* perturbed* polar-nc.mesh
+-   @rm -f mobius-strip.mesh klein-bottle.mesh mesh-explorer.mesh
++   @rm -f mobius-strip.mesh klein-bottle.mesh mesh-explorer.mesh*
+    @rm -f toroid-*.mesh twist-*.mesh trimmer.mesh reflected.mesh
+    @rm -f partitioning.txt shaper.mesh extruder.mesh
+    @rm -f optimized* perturbed* polar-nc.mesh
 diff --git a/miniapps/meshing/mesh-explorer.cpp b/miniapps/meshing/mesh-explorer.cpp
-index 49f3f9690..068512670 100644
+index f05e18e83..6a2a4e78b 100644
 --- a/miniapps/meshing/mesh-explorer.cpp
 +++ b/miniapps/meshing/mesh-explorer.cpp
 @@ -308,6 +308,7 @@ int main (int argc, char *argv[])
@@ -1668,19 +1666,7 @@ index 49f3f9690..068512670 100644
             "V) Save in VTK format (only linear and quadratic meshes)\n"
             "D) Save as a DataCollection\n"
             "q) Quit\n"
-@@ -959,9 +961,8 @@ int main (int argc, char *argv[])
-                      cin >> nxyz[2]; np *= nxyz[2];
-                   }
-                }
--               int *part = mesh->CartesianPartitioning(nxyz);
--               partitioning = Array<int>(part, mesh->GetNE());
--               delete [] part;
-+               partitioning.MakeRef(mesh->CartesianPartitioning(nxyz),
-+                                    mesh->GetNE(), true);
-                recover_bdr_partitioning(mesh, partitioning, bdr_partitioning);
-             }
-             else if (pk == 's')
-@@ -972,7 +973,7 @@ int main (int argc, char *argv[])
+@@ -984,7 +986,7 @@ int main (int argc, char *argv[])
                 partitioning.SetSize(mesh->GetNE());
                 for (int i = 0; i < mesh->GetNE(); i++)
                 {
@@ -1689,19 +1675,15 @@ index 49f3f9690..068512670 100644
                 }
                 recover_bdr_partitioning(mesh, partitioning, bdr_partitioning);
              }
-@@ -985,9 +986,8 @@ int main (int argc, char *argv[])
+@@ -997,6 +999,7 @@ int main (int argc, char *argv[])
                 }
                 cout << "Enter number of processors: " << flush;
                 cin >> np;
--               int *part = mesh->GeneratePartitioning(np, part_method);
--               partitioning = Array<int>(part, mesh->GetNE());
--               delete [] part;
-+               partitioning.MakeRef(mesh->GeneratePartitioning(np, part_method),
-+                                    mesh->GetNE(), true);
-                recover_bdr_partitioning(mesh, partitioning, bdr_partitioning);
-             }
-             if (partitioning)
-@@ -1185,6 +1185,25 @@ int main (int argc, char *argv[])
++
+                int *part = mesh->GeneratePartitioning(np, part_method);
+                partitioning = Array<int>(part, mesh->GetNE());
+                delete [] part;
+@@ -1197,6 +1200,25 @@ int main (int argc, char *argv[])
           cout << "New mesh file: " << omesh_file << endl;
        }
  
diff --git a/palace/deps/patch/mfem/patch_pa_libceed.diff b/palace/deps/patch/mfem/patch_pa_libceed.diff
deleted file mode 100644
index b62f80880..000000000
--- a/palace/deps/patch/mfem/patch_pa_libceed.diff
+++ /dev/null
@@ -1,28397 +0,0 @@
-diff --git a/fem/CMakeLists.txt b/fem/CMakeLists.txt
-index 6da0cfea3..f11a5a72b 100644
---- a/fem/CMakeLists.txt
-+++ b/fem/CMakeLists.txt
-@@ -17,19 +17,25 @@ set(SRCS
-   integ/bilininteg_convection_mf.cpp
-   integ/bilininteg_convection_pa.cpp
-   integ/bilininteg_convection_ea.cpp
-+  integ/bilininteg_curlcurl_mf.cpp
-   integ/bilininteg_curlcurl_pa.cpp
-   integ/bilininteg_dgtrace_pa.cpp
-   integ/bilininteg_dgtrace_ea.cpp
-   integ/bilininteg_diffusion_mf.cpp
-   integ/bilininteg_diffusion_pa.cpp
-   integ/bilininteg_diffusion_ea.cpp
-+  integ/bilininteg_divdiv_mf.cpp
-   integ/bilininteg_divdiv_pa.cpp
-   integ/bilininteg_gradient_pa.cpp
--  integ/bilininteg_interp_pa.cpp
-+  integ/bilininteg_interp_id_pa.cpp
-+  integ/bilininteg_interp_grad_pa.cpp
-+  integ/bilininteg_interp_curl_pa.cpp
-   integ/bilininteg_mass_mf.cpp
-   integ/bilininteg_mass_pa.cpp
-   integ/bilininteg_mass_ea.cpp
-+  integ/bilininteg_mixedcurl_mf.cpp
-   integ/bilininteg_mixedcurl_pa.cpp
-+  integ/bilininteg_mixedvecgrad_mf.cpp
-   integ/bilininteg_mixedvecgrad_pa.cpp
-   integ/bilininteg_transpose_ea.cpp
-   integ/bilininteg_vecdiffusion_mf.cpp
-@@ -38,6 +44,7 @@ set(SRCS
-   integ/bilininteg_vecmass_mf.cpp
-   integ/bilininteg_vecmass_pa.cpp
-   integ/bilininteg_vectorfediv_pa.cpp
-+  integ/bilininteg_vectorfemass_mf.cpp
-   integ/bilininteg_vectorfemass_pa.cpp
-   integ/lininteg_boundary.cpp
-   integ/lininteg_boundary_flux.cpp
-@@ -72,15 +79,20 @@ set(SRCS
-   hybridization.cpp
-   intrules.cpp
-   ceed/interface/basis.cpp
--  ceed/interface/restriction.cpp
-   ceed/interface/operator.cpp
-+  ceed/interface/restriction.cpp
-   ceed/interface/util.cpp
-+  ceed/integrators/mass/mass.cpp
-   ceed/integrators/convection/convection.cpp
-   ceed/integrators/diffusion/diffusion.cpp
-   ceed/integrators/nlconvection/nlconvection.cpp
--  ceed/integrators/mass/mass.cpp
-+  ceed/integrators/vecfemass/vecfemass.cpp
-+  ceed/integrators/divdiv/divdiv.cpp
-+  ceed/integrators/curlcurl/curlcurl.cpp
-+  ceed/integrators/mixedvecgrad/mixedvecgrad.cpp
-+  ceed/integrators/mixedveccurl/mixedveccurl.cpp
-+  ceed/integrators/interp/interp.cpp
-   ceed/solvers/algebraic.cpp
--  ceed/solvers/full-assembly.cpp
-   ceed/solvers/solvers-atpmg.cpp
-   linearform.cpp
-   linearform_ext.cpp
-@@ -180,18 +192,33 @@ set(HDRS
-   hybridization.hpp
-   intrules.hpp
-   ceed/interface/basis.hpp
-+  ceed/interface/ceed.hpp
-+  ceed/interface/coefficient.hpp
-   ceed/interface/integrator.hpp
-   ceed/interface/interface.hpp
-+  ceed/interface/mixed_operator.hpp
-   ceed/interface/operator.hpp
-   ceed/interface/restriction.hpp
-   ceed/interface/util.hpp
-+  ceed/integrators/mass/mass.hpp
-+  ceed/integrators/mass/mass_qf.h
-   ceed/integrators/convection/convection.hpp
-+  ceed/integrators/convection/convection_qf.h
-   ceed/integrators/diffusion/diffusion.hpp
--  ceed/integrators/mass/mass.hpp
-+  ceed/integrators/diffusion/diffusion_qf.h
-   ceed/integrators/nlconvection/nlconvection.hpp
--  ceed/interface/coefficient.hpp
-+  ceed/integrators/nlconvection/nlconvection_qf.h
-+  ceed/integrators/vecfemass/vecfemass.hpp
-+  ceed/integrators/vecfemass/vecfemass_qf.h
-+  ceed/integrators/divdiv/divdiv.hpp
-+  ceed/integrators/divdiv/divdiv_qf.h
-+  ceed/integrators/curlcurl/curlcurl.hpp
-+  ceed/integrators/curlcurl/curlcurl_qf.h
-+  ceed/integrators/mixedvecgrad/mixedvecgrad.hpp
-+  ceed/integrators/mixedveccurl/mixedveccurl.hpp
-+  ceed/integrators/interp/interp.hpp
-+  ceed/integrators/util/util_qf.h
-   ceed/solvers/algebraic.hpp
--  ceed/solvers/full-assembly.hpp
-   ceed/solvers/solvers-atpmg.hpp
-   linearform.hpp
-   linearform_ext.hpp
-diff --git a/fem/bilinearform.cpp b/fem/bilinearform.cpp
-index a549d03a7..0a566cba5 100644
---- a/fem/bilinearform.cpp
-+++ b/fem/bilinearform.cpp
-@@ -1353,8 +1353,8 @@ void MixedBilinearForm::Assemble(int skip_zeros)
-          }
-          for (int k = 0; k < trace_face_integs.Size(); k++)
-          {
--            trace_face_integs[k]->AssembleFaceMatrix(*trial_face_fe, *test_fe1,
--                                                     *test_fe2, *ftr, elemmat);
-+            trace_face_integs[k]->AssembleFaceMatrix2(*trial_face_fe, *test_fe1,
-+                                                      *test_fe2, *ftr, elemmat);
-             mat->AddSubMatrix(test_vdofs, trial_vdofs, elemmat, skip_zeros);
-          }
-       }
-@@ -1409,10 +1409,10 @@ void MixedBilinearForm::Assemble(int skip_zeros)
-                    (*boundary_trace_face_integs_marker[k])[bdr_attr-1] == 0)
-                { continue; }
- 
--               boundary_trace_face_integs[k]->AssembleFaceMatrix(*trial_face_fe,
--                                                                 *test_fe1,
--                                                                 *test_fe2,
--                                                                 *ftr, elemmat);
-+               boundary_trace_face_integs[k]->AssembleFaceMatrix2(*trial_face_fe,
-+                                                                  *test_fe1,
-+                                                                  *test_fe2,
-+                                                                  *ftr, elemmat);
-                mat->AddSubMatrix(test_vdofs, trial_vdofs, elemmat, skip_zeros);
-             }
-          }
-diff --git a/fem/bilinearform_ext.cpp b/fem/bilinearform_ext.cpp
-index 90a1655f4..0c88fdf91 100644
---- a/fem/bilinearform_ext.cpp
-+++ b/fem/bilinearform_ext.cpp
-@@ -2131,6 +2131,33 @@ void PADiscreteLinearOperatorExtension::Assemble()
-    test_multiplicity.Reciprocal();
- }
- 
-+void PADiscreteLinearOperatorExtension::Mult(const Vector &x, Vector &y) const
-+{
-+   Array<BilinearFormIntegrator *> &interpolators = *a->GetDBFI();
-+   if (elem_restrict_trial)
-+   {
-+      elem_restrict_trial->Mult(x, local_trial);
-+   }
-+   if (elem_restrict_test)
-+   {
-+      local_test = 0.0;
-+      for (BilinearFormIntegrator *interp : interpolators)
-+      {
-+         interp->AddMultPA(elem_restrict_trial ? local_trial : x, local_test);
-+      }
-+      elem_restrict_test->MultTranspose(local_test, y);
-+   }
-+   else
-+   {
-+      y = 0.0;
-+      for (BilinearFormIntegrator *interp : interpolators)
-+      {
-+         interp->AddMultPA(elem_restrict_trial ? local_trial : x, y);
-+      }
-+   }
-+   y *= test_multiplicity;
-+}
-+
- void PADiscreteLinearOperatorExtension::AddMult(const Vector &x, Vector &y,
-                                                 const double c) const
- {
-@@ -2152,6 +2179,7 @@ void PADiscreteLinearOperatorExtension::AddMult(const Vector &x, Vector &y,
-    }
-    else
-    {
-+      temp_test = 0.0;
-       for (BilinearFormIntegrator *interp : interpolators)
-       {
-          interp->AddMultPA(elem_restrict_trial ? local_trial : x, temp_test);
-@@ -2165,8 +2193,10 @@ void PADiscreteLinearOperatorExtension::AddMultTranspose(const Vector &x,
-                                                          Vector &y,
-                                                          const double c) const
- {
-+   MFEM_VERIFY(c == 1.0,
-+               "General coefficient case for PADiscreteLinearOperatorExtension::"
-+               "AddMultTranspose is not yet supported!");
-    Array<BilinearFormIntegrator *> &interpolators = *a->GetDBFI();
--   temp_test.SetSize(y.Size());
-    temp_test.UseDevice(true);
-    temp_test = x;
-    temp_test *= test_multiplicity;
-@@ -2182,26 +2212,14 @@ void PADiscreteLinearOperatorExtension::AddMultTranspose(const Vector &x,
-          interp->AddMultTransposePA(elem_restrict_test ? local_test : temp_test,
-                                     local_trial);
-       }
--      if (c != 1.0)
--      {
--         local_trial *= c;
--      }
-       elem_restrict_trial->AddMultTranspose(local_trial, y);
-    }
-    else
-    {
-       y.UseDevice(true); // typically this is a large vector, so store on device
--      if (c != 1.0)
--      {
--         MFEM_ABORT("General coefficient case for PADiscreteLinearOperatorExtension::"
--                    "AddMultTranspose is not yet supported!");
--      }
--      else
-+      for (BilinearFormIntegrator *interp : interpolators)
-       {
--         for (BilinearFormIntegrator *interp : interpolators)
--         {
--            interp->AddMultTransposePA(elem_restrict_test ? local_test : temp_test, y);
--         }
-+         interp->AddMultTransposePA(elem_restrict_test ? local_test : temp_test, y);
-       }
-    }
- }
-diff --git a/fem/bilinearform_ext.hpp b/fem/bilinearform_ext.hpp
-index db26eb801..aa9271d8f 100644
---- a/fem/bilinearform_ext.hpp
-+++ b/fem/bilinearform_ext.hpp
-@@ -227,6 +227,7 @@ public:
-    PADiscreteLinearOperatorExtension(DiscreteLinearOperator *linop);
- 
-    void Assemble();
-+   void Mult(const Vector &x, Vector &y) const;
-    void AddMult(const Vector &x, Vector &y, const double c = 1.0) const;
-    void AddMultTranspose(const Vector &x, Vector &y, const double c = 1.0) const;
- };
-diff --git a/fem/bilininteg.cpp b/fem/bilininteg.cpp
-index e6fc2a6ee..096285723 100644
---- a/fem/bilininteg.cpp
-+++ b/fem/bilininteg.cpp
-@@ -141,7 +141,7 @@ void BilinearFormIntegrator::AssembleEA(const FiniteElementSpace&,
-               "   is not implemented for this class.");
- }
- 
--void BilinearFormIntegrator::AssembleEAInteriorFaces(const FiniteElementSpace &,
-+void BilinearFormIntegrator::AssembleEAInteriorFaces(const FiniteElementSpace&,
-                                                      Vector&,
-                                                      Vector&)
- {
-@@ -156,47 +156,50 @@ void BilinearFormIntegrator::AssembleEABoundaryFaces(const FiniteElementSpace&,
-               "   is not implemented for this class.");
- }
- 
--void BilinearFormIntegrator::AssembleElementMatrix(
--   const FiniteElement &el, ElementTransformation &Trans,
--   DenseMatrix &elmat)
-+void BilinearFormIntegrator::AssembleElementMatrix(const FiniteElement&,
-+                                                   ElementTransformation&,
-+                                                   DenseMatrix&)
- {
-    MFEM_ABORT("BilinearFormIntegrator::AssembleElementMatrix(...)\n"
-               "   is not implemented for this class.");
- }
- 
--void BilinearFormIntegrator::AssembleElementMatrix2(
--   const FiniteElement &el1, const FiniteElement &el2,
--   ElementTransformation &Trans, DenseMatrix &elmat)
-+void BilinearFormIntegrator::AssembleElementMatrix2(const FiniteElement&,
-+                                                    const FiniteElement&,
-+                                                    ElementTransformation&,
-+                                                    DenseMatrix&)
- {
-    MFEM_ABORT("BilinearFormIntegrator::AssembleElementMatrix2(...)\n"
-               "   is not implemented for this class.");
- }
- 
--void BilinearFormIntegrator::AssembleFaceMatrix(
--   const FiniteElement &el1, const FiniteElement &el2,
--   FaceElementTransformations &Trans, DenseMatrix &elmat)
-+void BilinearFormIntegrator::AssembleFaceMatrix(const FiniteElement&,
-+                                                const FiniteElement&,
-+                                                FaceElementTransformations&Trans,
-+                                                DenseMatrix&)
- {
-    MFEM_ABORT("BilinearFormIntegrator::AssembleFaceMatrix(...)\n"
-               "   is not implemented for this class.");
- }
- 
--void BilinearFormIntegrator::AssembleFaceMatrix(
--   const FiniteElement &trial_face_fe, const FiniteElement &test_fe1,
--   const FiniteElement &test_fe2, FaceElementTransformations &Trans,
--   DenseMatrix &elmat)
-+void BilinearFormIntegrator::AssembleFaceMatrix2(const FiniteElement&,
-+                                                 const FiniteElement&,
-+                                                 const FiniteElement&,
-+                                                 FaceElementTransformations&,
-+                                                 DenseMatrix&)
- {
--   MFEM_ABORT("AssembleFaceMatrix (mixed form) is not implemented for this"
--              " Integrator class.");
-+   MFEM_ABORT("BilinearFormIntegrator::AssembleFaceMatrix2(...)\n"
-+              "   is not implemented for this class.");
- }
- 
--void BilinearFormIntegrator::AssembleTraceFaceMatrix (int elem,
--                                                      const FiniteElement &trial_face_fe,
--                                                      const FiniteElement &test_fe1,
--                                                      FaceElementTransformations &Trans,
--                                                      DenseMatrix &elmat)
-+void BilinearFormIntegrator::AssembleTraceFaceMatrix(int,
-+                                                     const FiniteElement&,
-+                                                     const FiniteElement&,
-+                                                     FaceElementTransformations&,
-+                                                     DenseMatrix&)
- {
--   MFEM_ABORT("AssembleTraceFaceMatrix (DPG form) is not implemented for this"
--              " Integrator class.");
-+   MFEM_ABORT("BilinearFormIntegrator::AssembleTraceFaceMatrix(...)\n"
-+              "   is not implemented for this class.");
- }
- 
- void BilinearFormIntegrator::AssembleElementVector(
-@@ -221,17 +224,10 @@ void BilinearFormIntegrator::AssembleFaceVector(
-    elmat.Mult(elfun, elvect);
- }
- 
--void TransposeIntegrator::SetIntRule(const IntegrationRule *ir)
--{
--   IntRule = ir;
--   bfi->SetIntRule(ir);
--}
--
- void TransposeIntegrator::AssembleElementMatrix(
-    const FiniteElement &el, ElementTransformation &Trans, DenseMatrix &elmat)
- {
-    bfi->AssembleElementMatrix(el, Trans, bfi_elmat);
--   // elmat = bfi_elmat^t
-    elmat.Transpose(bfi_elmat);
- }
- 
-@@ -240,7 +236,6 @@ void TransposeIntegrator::AssembleElementMatrix2(
-    ElementTransformation &Trans, DenseMatrix &elmat)
- {
-    bfi->AssembleElementMatrix2(test_fe, trial_fe, Trans, bfi_elmat);
--   // elmat = bfi_elmat^t
-    elmat.Transpose(bfi_elmat);
- }
- 
-@@ -249,16 +244,9 @@ void TransposeIntegrator::AssembleFaceMatrix(
-    FaceElementTransformations &Trans, DenseMatrix &elmat)
- {
-    bfi->AssembleFaceMatrix(el1, el2, Trans, bfi_elmat);
--   // elmat = bfi_elmat^t
-    elmat.Transpose(bfi_elmat);
- }
- 
--void LumpedIntegrator::SetIntRule(const IntegrationRule *ir)
--{
--   IntRule = ir;
--   bfi->SetIntRule(ir);
--}
--
- void LumpedIntegrator::AssembleElementMatrix(
-    const FiniteElement &el, ElementTransformation &Trans, DenseMatrix &elmat)
- {
-@@ -266,38 +254,41 @@ void LumpedIntegrator::AssembleElementMatrix(
-    elmat.Lump();
- }
- 
--void InverseIntegrator::SetIntRule(const IntegrationRule *ir)
--{
--   IntRule = ir;
--   integrator->SetIntRule(ir);
--}
--
- void InverseIntegrator::AssembleElementMatrix(
-    const FiniteElement &el, ElementTransformation &Trans, DenseMatrix &elmat)
- {
--   integrator->AssembleElementMatrix(el, Trans, elmat);
-+   bfi->AssembleElementMatrix(el, Trans, elmat);
-    elmat.Invert();
- }
- 
-+bool SumIntegrator::SupportsCeed() const
-+{
-+   for (int i = 0; i < bfis.Size(); i++)
-+   {
-+      if (!bfis[i]->SupportsCeed()) { return false; }
-+   }
-+   return true;
-+}
-+
- void SumIntegrator::SetIntRule(const IntegrationRule *ir)
- {
-    IntRule = ir;
--   for (int i = 0; i < integrators.Size(); i++)
-+   for (int i = 0; i < bfis.Size(); i++)
-    {
--      integrators[i]->SetIntRule(ir);
-+      bfis[i]->SetIntRule(ir);
-    }
- }
- 
- void SumIntegrator::AssembleElementMatrix(
-    const FiniteElement &el, ElementTransformation &Trans, DenseMatrix &elmat)
- {
--   MFEM_ASSERT(integrators.Size() > 0, "empty SumIntegrator.");
-+   MFEM_ASSERT(bfis.Size() > 0, "empty SumIntegrator.");
- 
--   integrators[0]->AssembleElementMatrix(el, Trans, elmat);
--   for (int i = 1; i < integrators.Size(); i++)
-+   bfis[0]->AssembleElementMatrix(el, Trans, elmat);
-+   for (int i = 1; i < bfis.Size(); i++)
-    {
--      integrators[i]->AssembleElementMatrix(el, Trans, elem_mat);
--      elmat += elem_mat;
-+      bfis[i]->AssembleElementMatrix(el, Trans, bfi_elmat);
-+      elmat += bfi_elmat;
-    }
- }
- 
-@@ -305,13 +296,13 @@ void SumIntegrator::AssembleElementMatrix2(
-    const FiniteElement &el1, const FiniteElement &el2,
-    ElementTransformation &Trans, DenseMatrix &elmat)
- {
--   MFEM_ASSERT(integrators.Size() > 0, "empty SumIntegrator.");
-+   MFEM_ASSERT(bfis.Size() > 0, "empty SumIntegrator.");
- 
--   integrators[0]->AssembleElementMatrix2(el1, el2, Trans, elmat);
--   for (int i = 1; i < integrators.Size(); i++)
-+   bfis[0]->AssembleElementMatrix2(el1, el2, Trans, elmat);
-+   for (int i = 1; i < bfis.Size(); i++)
-    {
--      integrators[i]->AssembleElementMatrix2(el1, el2, Trans, elem_mat);
--      elmat += elem_mat;
-+      bfis[i]->AssembleElementMatrix2(el1, el2, Trans, bfi_elmat);
-+      elmat += bfi_elmat;
-    }
- }
- 
-@@ -319,134 +310,134 @@ void SumIntegrator::AssembleFaceMatrix(
-    const FiniteElement &el1, const FiniteElement &el2,
-    FaceElementTransformations &Trans, DenseMatrix &elmat)
- {
--   MFEM_ASSERT(integrators.Size() > 0, "empty SumIntegrator.");
-+   MFEM_ASSERT(bfis.Size() > 0, "empty SumIntegrator.");
- 
--   integrators[0]->AssembleFaceMatrix(el1, el2, Trans, elmat);
--   for (int i = 1; i < integrators.Size(); i++)
-+   bfis[0]->AssembleFaceMatrix(el1, el2, Trans, elmat);
-+   for (int i = 1; i < bfis.Size(); i++)
-    {
--      integrators[i]->AssembleFaceMatrix(el1, el2, Trans, elem_mat);
--      elmat += elem_mat;
-+      bfis[i]->AssembleFaceMatrix(el1, el2, Trans, bfi_elmat);
-+      elmat += bfi_elmat;
-    }
- }
- 
--void SumIntegrator::AssembleFaceMatrix(
--   const FiniteElement &tr_fe,
--   const FiniteElement &te_fe1, const FiniteElement &te_fe2,
--   FaceElementTransformations &Trans, DenseMatrix &elmat)
-+void SumIntegrator::AssembleFaceMatrix2(
-+   const FiniteElement &tr_fe, const FiniteElement &te_fe1,
-+   const FiniteElement &te_fe2, FaceElementTransformations &Trans,
-+   DenseMatrix &elmat)
- {
--   MFEM_ASSERT(integrators.Size() > 0, "empty SumIntegrator.");
-+   MFEM_ASSERT(bfis.Size() > 0, "empty SumIntegrator.");
- 
--   integrators[0]->AssembleFaceMatrix(tr_fe, te_fe1, te_fe2, Trans, elmat);
--   for (int i = 1; i < integrators.Size(); i++)
-+   bfis[0]->AssembleFaceMatrix2(tr_fe, te_fe1, te_fe2, Trans, elmat);
-+   for (int i = 1; i < bfis.Size(); i++)
-    {
--      integrators[i]->AssembleFaceMatrix(tr_fe, te_fe1, te_fe2, Trans, elem_mat);
--      elmat += elem_mat;
-+      bfis[i]->AssembleFaceMatrix2(tr_fe, te_fe1, te_fe2, Trans, bfi_elmat);
-+      elmat += bfi_elmat;
-    }
- }
- 
- void SumIntegrator::AssemblePA(const FiniteElementSpace& fes)
- {
--   for (int i = 0; i < integrators.Size(); i++)
-+   for (int i = 0; i < bfis.Size(); i++)
-    {
--      integrators[i]->AssemblePA(fes);
-+      bfis[i]->AssemblePA(fes);
-    }
- }
- 
- void SumIntegrator::AssemblePA(const FiniteElementSpace& trial_fes,
-                                const FiniteElementSpace& test_fes)
- {
--   for (int i = 0; i < integrators.Size(); i++)
-+   for (int i = 0; i < bfis.Size(); i++)
-    {
--      integrators[i]->AssemblePA(trial_fes, test_fes);
-+      bfis[i]->AssemblePA(trial_fes, test_fes);
-    }
- }
- 
- void SumIntegrator::AssembleDiagonalPA(Vector &diag)
- {
--   for (int i = 0; i < integrators.Size(); i++)
-+   for (int i = 0; i < bfis.Size(); i++)
-    {
--      integrators[i]->AssembleDiagonalPA(diag);
-+      bfis[i]->AssembleDiagonalPA(diag);
-    }
- }
- 
- void SumIntegrator::AssemblePAInteriorFaces(const FiniteElementSpace &fes)
- {
--   for (int i = 0; i < integrators.Size(); i++)
-+   for (int i = 0; i < bfis.Size(); i++)
-    {
--      integrators[i]->AssemblePAInteriorFaces(fes);
-+      bfis[i]->AssemblePAInteriorFaces(fes);
-    }
- }
- 
- void SumIntegrator::AssemblePABoundaryFaces(const FiniteElementSpace &fes)
- {
--   for (int i = 0; i < integrators.Size(); i++)
-+   for (int i = 0; i < bfis.Size(); i++)
-    {
--      integrators[i]->AssemblePABoundaryFaces(fes);
-+      bfis[i]->AssemblePABoundaryFaces(fes);
-    }
- }
- 
- void SumIntegrator::AddMultPA(const Vector& x, Vector& y) const
- {
--   for (int i = 0; i < integrators.Size(); i++)
-+   for (int i = 0; i < bfis.Size(); i++)
-    {
--      integrators[i]->AddMultPA(x, y);
-+      bfis[i]->AddMultPA(x, y);
-    }
- }
- 
- void SumIntegrator::AddMultTransposePA(const Vector &x, Vector &y) const
- {
--   for (int i = 0; i < integrators.Size(); i++)
-+   for (int i = 0; i < bfis.Size(); i++)
-    {
--      integrators[i]->AddMultTransposePA(x, y);
-+      bfis[i]->AddMultTransposePA(x, y);
-    }
- }
- 
- void SumIntegrator::AssembleMF(const FiniteElementSpace &fes)
- {
--   for (int i = 0; i < integrators.Size(); i++)
-+   for (int i = 0; i < bfis.Size(); i++)
-    {
--      integrators[i]->AssembleMF(fes);
-+      bfis[i]->AssembleMF(fes);
-    }
- }
- 
- void SumIntegrator::AssembleMF(const FiniteElementSpace& trial_fes,
-                                const FiniteElementSpace& test_fes)
- {
--   for (int i = 0; i < integrators.Size(); i++)
-+   for (int i = 0; i < bfis.Size(); i++)
-    {
--      integrators[i]->AssembleMF(trial_fes, test_fes);
-+      bfis[i]->AssembleMF(trial_fes, test_fes);
-    }
- }
- 
- void SumIntegrator::AssembleDiagonalMF(Vector &diag)
- {
--   for (int i = 0; i < integrators.Size(); i++)
-+   for (int i = 0; i < bfis.Size(); i++)
-    {
--      integrators[i]->AssembleDiagonalMF(diag);
-+      bfis[i]->AssembleDiagonalMF(diag);
-    }
- }
- 
- void SumIntegrator::AddMultMF(const Vector& x, Vector& y) const
- {
--   for (int i = 0; i < integrators.Size(); i++)
-+   for (int i = 0; i < bfis.Size(); i++)
-    {
--      integrators[i]->AddMultTransposeMF(x, y);
-+      bfis[i]->AddMultTransposeMF(x, y);
-    }
- }
- 
- void SumIntegrator::AddMultTransposeMF(const Vector &x, Vector &y) const
- {
--   for (int i = 0; i < integrators.Size(); i++)
-+   for (int i = 0; i < bfis.Size(); i++)
-    {
--      integrators[i]->AddMultMF(x, y);
-+      bfis[i]->AddMultMF(x, y);
-    }
- }
- 
- void SumIntegrator::AssembleEA(const FiniteElementSpace &fes, Vector &emat)
- {
--   for (int i = 0; i < integrators.Size(); i++)
-+   for (int i = 0; i < bfis.Size(); i++)
-    {
--      integrators[i]->AssembleEA(fes, emat);
-+      bfis[i]->AssembleEA(fes, emat);
-    }
- }
- 
-@@ -454,32 +445,41 @@ void SumIntegrator::AssembleEAInteriorFaces(const FiniteElementSpace &fes,
-                                             Vector &ea_data_int,
-                                             Vector &ea_data_ext)
- {
--   for (int i = 0; i < integrators.Size(); i++)
-+   for (int i = 0; i < bfis.Size(); i++)
-    {
--      integrators[i]->AssembleEAInteriorFaces(fes, ea_data_int, ea_data_ext);
-+      bfis[i]->AssembleEAInteriorFaces(fes, ea_data_int, ea_data_ext);
-    }
- }
- 
- void SumIntegrator::AssembleEABoundaryFaces(const FiniteElementSpace &fes,
-                                             Vector &ea_data_bdr)
- {
--   for (int i = 0; i < integrators.Size(); i++)
-+   for (int i = 0; i < bfis.Size(); i++)
-    {
--      integrators[i]->AssembleEABoundaryFaces(fes, ea_data_bdr);
-+      bfis[i]->AssembleEABoundaryFaces(fes, ea_data_bdr);
-    }
- }
- 
- SumIntegrator::~SumIntegrator()
- {
--   if (own_integrators)
-+   if (own_bfis)
-    {
--      for (int i = 0; i < integrators.Size(); i++)
-+      for (int i = 0; i < bfis.Size(); i++)
-       {
--         delete integrators[i];
-+         delete bfis[i];
-       }
-    }
- }
- 
-+const IntegrationRule &MixedScalarIntegrator::GetRule(
-+   const FiniteElement &trial_fe,
-+   const FiniteElement &test_fe,
-+   ElementTransformation &Trans) const
-+{
-+   int order = this->GetIntegrationOrder(trial_fe, test_fe, Trans);
-+   return IntRules.Get(trial_fe.GetGeomType(), order);
-+}
-+
- void MixedScalarIntegrator::AssembleElementMatrix2(
-    const FiniteElement &trial_fe, const FiniteElement &test_fe,
-    ElementTransformation &Trans, DenseMatrix &elmat)
-@@ -507,12 +507,8 @@ void MixedScalarIntegrator::AssembleElementMatrix2(
- 
-    elmat.SetSize(test_nd, trial_nd);
- 
--   const IntegrationRule *ir = IntRule;
--   if (ir == NULL)
--   {
--      int ir_order = this->GetIntegrationOrder(trial_fe, test_fe, Trans);
--      ir = &IntRules.Get(trial_fe.GetGeomType(), ir_order);
--   }
-+   const IntegrationRule *ir = IntRule ? IntRule : &GetRule(trial_fe, test_fe,
-+                                                            Trans);
- 
-    elmat = 0.0;
-    for (i = 0; i < ir->GetNPoints(); i++)
-@@ -539,6 +535,15 @@ void MixedScalarIntegrator::AssembleElementMatrix2(
- #endif
- }
- 
-+const IntegrationRule &MixedVectorIntegrator::GetRule(
-+   const FiniteElement &trial_fe,
-+   const FiniteElement &test_fe,
-+   ElementTransformation &Trans) const
-+{
-+   int order = this->GetIntegrationOrder(trial_fe, test_fe, Trans);
-+   return IntRules.Get(trial_fe.GetGeomType(), order);
-+}
-+
- void MixedVectorIntegrator::AssembleElementMatrix2(
-    const FiniteElement &trial_fe, const FiniteElement &test_fe,
-    ElementTransformation &Trans, DenseMatrix &elmat)
-@@ -598,12 +603,8 @@ void MixedVectorIntegrator::AssembleElementMatrix2(
- 
-    elmat.SetSize(test_nd, trial_nd);
- 
--   const IntegrationRule *ir = IntRule;
--   if (ir == NULL)
--   {
--      int ir_order = this->GetIntegrationOrder(trial_fe, test_fe, Trans);
--      ir = &IntRules.Get(trial_fe.GetGeomType(), ir_order);
--   }
-+   const IntegrationRule *ir = IntRule ? IntRule : &GetRule(trial_fe, test_fe,
-+                                                            Trans);
- 
-    elmat = 0.0;
-    for (i = 0; i < ir->GetNPoints(); i++)
-@@ -713,6 +714,15 @@ void MixedVectorIntegrator::AssembleElementMatrix2(
- #endif
- }
- 
-+const IntegrationRule &MixedScalarVectorIntegrator::GetRule(
-+   const FiniteElement &trial_fe,
-+   const FiniteElement &test_fe,
-+   ElementTransformation &Trans) const
-+{
-+   int order = this->GetIntegrationOrder(trial_fe, test_fe, Trans);
-+   return IntRules.Get(trial_fe.GetGeomType(), order);
-+}
-+
- void MixedScalarVectorIntegrator::AssembleElementMatrix2(
-    const FiniteElement &trial_fe, const FiniteElement &test_fe,
-    ElementTransformation &Trans, DenseMatrix &elmat)
-@@ -754,12 +764,8 @@ void MixedScalarVectorIntegrator::AssembleElementMatrix2(
- 
-    elmat.SetSize(test_nd, trial_nd);
- 
--   const IntegrationRule *ir = IntRule;
--   if (ir == NULL)
--   {
--      int ir_order = this->GetIntegrationOrder(trial_fe, test_fe, Trans);
--      ir = &IntRules.Get(trial_fe.GetGeomType(), ir_order);
--   }
-+   const IntegrationRule *ir = IntRule ? IntRule : &GetRule(trial_fe, test_fe,
-+                                                            Trans);
- 
-    elmat = 0.0;
-    for (i = 0; i < ir->GetNPoints(); i++)
-@@ -787,6 +793,15 @@ void MixedScalarVectorIntegrator::AssembleElementMatrix2(
-    }
- }
- 
-+const IntegrationRule &GradientIntegrator::GetRule(
-+   const FiniteElement &trial_fe,
-+   const FiniteElement &test_fe,
-+   ElementTransformation &Trans) const
-+{
-+   int order = Trans.OrderGrad(&trial_fe) + test_fe.GetOrder() + Trans.OrderJ();
-+   return IntRules.Get(trial_fe.GetGeomType(), order);
-+}
-+
- void GradientIntegrator::AssembleElementMatrix2(
-    const FiniteElement &trial_fe, const FiniteElement &test_fe,
-    ElementTransformation &Trans,  DenseMatrix &elmat)
-@@ -843,18 +858,30 @@ void GradientIntegrator::AssembleElementMatrix2(
-    }
- }
- 
--const IntegrationRule &GradientIntegrator::GetRule(
--   const FiniteElement &trial_fe,
--   const FiniteElement &test_fe,
-+const IntegrationRule &DiffusionIntegrator::GetRuleStatic(
-+   const FiniteElement &trial_fe, const FiniteElement &test_fe,
-    ElementTransformation &Trans)
- {
--   int order = Trans.OrderGrad(&trial_fe) + test_fe.GetOrder() + Trans.OrderJ();
-+   int order;
-+   if (trial_fe.Space() == FunctionSpace::Pk)
-+   {
-+      order = trial_fe.GetOrder() + test_fe.GetOrder() - 2;
-+   }
-+   else
-+   {
-+      // order = 2 * el.GetOrder() - 2;  // <-- this seems to work fine too
-+      order = trial_fe.GetOrder() + test_fe.GetOrder() + trial_fe.GetDim() - 1;
-+   }
-+   if (trial_fe.Space() == FunctionSpace::rQk)
-+   {
-+      return RefinedIntRules.Get(trial_fe.GetGeomType(), order);
-+   }
-    return IntRules.Get(trial_fe.GetGeomType(), order);
- }
- 
--void DiffusionIntegrator::AssembleElementMatrix(
--   const FiniteElement &el, ElementTransformation &Trans,
--   DenseMatrix &elmat)
-+void DiffusionIntegrator::AssembleElementMatrix(const FiniteElement &el,
-+                                                ElementTransformation &Trans,
-+                                                DenseMatrix &elmat)
- {
-    int nd = el.GetDof();
-    dim = el.GetDim();
-@@ -889,7 +916,7 @@ void DiffusionIntegrator::AssembleElementMatrix(
- #endif
-    elmat.SetSize(nd);
- 
--   const IntegrationRule *ir = IntRule ? IntRule : &GetRule(el, el);
-+   const IntegrationRule *ir = IntRule ? IntRule : &GetRule(el, Trans);
- 
-    elmat = 0.0;
-    for (int i = 0; i < ir->GetNPoints(); i++)
-@@ -970,7 +997,8 @@ void DiffusionIntegrator::AssembleElementMatrix2(
- #endif
-    elmat.SetSize(te_nd, tr_nd);
- 
--   const IntegrationRule *ir = IntRule ? IntRule : &GetRule(trial_fe, test_fe);
-+   const IntegrationRule *ir = IntRule ? IntRule : &GetRule(trial_fe, test_fe,
-+                                                            Trans);
- 
-    elmat = 0.0;
-    for (int i = 0; i < ir->GetNPoints(); i++)
-@@ -1048,7 +1076,7 @@ void DiffusionIntegrator::AssembleElementVector(
- 
-    elvect.SetSize(nd);
- 
--   const IntegrationRule *ir = IntRule ? IntRule : &GetRule(el, el);
-+   const IntegrationRule *ir = IntRule ? IntRule : &GetRule(el, Tr);
- 
-    elvect = 0.0;
-    for (int i = 0; i < ir->GetNPoints(); i++)
-@@ -1094,13 +1122,9 @@ void DiffusionIntegrator::AssembleElementVector(
- }
- 
- void DiffusionIntegrator::ComputeElementFlux(
--   const FiniteElement &el,
--   ElementTransformation &Trans,
--   Vector &u,
--   const FiniteElement &fluxelem,
--   Vector &flux,
--   bool with_coef,
--   const IntegrationRule *ir)
-+   const FiniteElement &el, ElementTransformation &Trans,
-+   Vector &u, const FiniteElement &fluxelem, Vector &flux,
-+   bool with_coef, const IntegrationRule *ir)
- {
-    int nd, spaceDim, fnd;
- 
-@@ -1196,11 +1220,9 @@ void DiffusionIntegrator::ComputeElementFlux(
-    }
- }
- 
--double DiffusionIntegrator::ComputeFluxEnergy(
--   const FiniteElement &fluxelem,
--   ElementTransformation &Trans,
--   Vector &flux,
--   Vector* d_energy)
-+double DiffusionIntegrator::ComputeFluxEnergy(const FiniteElement &fluxelem,
-+                                              ElementTransformation &Trans,
-+                                              Vector &flux, Vector* d_energy)
- {
-    int nd = fluxelem.GetDof();
-    dim = fluxelem.GetDim();
-@@ -1218,7 +1240,7 @@ double DiffusionIntegrator::ComputeFluxEnergy(
-    if (d_energy) { vec.SetSize(spaceDim); }
-    if (MQ) { M.SetSize(spaceDim); }
- 
--   int order = 2 * fluxelem.GetOrder(); // <--
-+   int order = 2 * fluxelem.GetOrder();
-    const IntegrationRule *ir = &IntRules.Get(fluxelem.GetGeomType(), order);
- 
-    double energy = 0.0;
-@@ -1274,20 +1296,13 @@ double DiffusionIntegrator::ComputeFluxEnergy(
-    return energy;
- }
- 
--const IntegrationRule &DiffusionIntegrator::GetRule(
-+const IntegrationRule &MassIntegrator::GetRuleStatic(
-    const FiniteElement &trial_fe,
--   const FiniteElement &test_fe)
-+   const FiniteElement &test_fe,
-+   ElementTransformation &Trans,
-+   int Q_order)
- {
--   int order;
--   if (trial_fe.Space() == FunctionSpace::Pk)
--   {
--      order = trial_fe.GetOrder() + test_fe.GetOrder() - 2;
--   }
--   else
--   {
--      // order = 2*el.GetOrder() - 2;  // <-- this seems to work fine too
--      order = trial_fe.GetOrder() + test_fe.GetOrder() + trial_fe.GetDim() - 1;
--   }
-+   int order = trial_fe.GetOrder() + test_fe.GetOrder() + Trans.OrderW() + Q_order;
-    if (trial_fe.Space() == FunctionSpace::rQk)
-    {
-       return RefinedIntRules.Get(trial_fe.GetGeomType(), order);
-@@ -1295,22 +1310,20 @@ const IntegrationRule &DiffusionIntegrator::GetRule(
-    return IntRules.Get(trial_fe.GetGeomType(), order);
- }
- 
--void MassIntegrator::AssembleElementMatrix(
--   const FiniteElement &el,
--   ElementTransformation &Trans,
--   DenseMatrix &elmat)
-+void MassIntegrator::AssembleElementMatrix(const FiniteElement &el,
-+                                           ElementTransformation &Trans,
-+                                           DenseMatrix &elmat)
- {
-    int nd = el.GetDof();
--   // int dim = el.GetDim();
-    double w;
- 
- #ifdef MFEM_THREAD_SAFE
-    Vector shape;
- #endif
--   shape.SetSize(nd);
-    elmat.SetSize(nd);
-+   shape.SetSize(nd);
- 
--   const IntegrationRule *ir = IntRule ? IntRule : &GetRule(el, el, Trans);
-+   const IntegrationRule *ir = IntRule ? IntRule : &GetRule(el, Trans);
- 
-    elmat = 0.0;
-    for (int i = 0; i < ir->GetNPoints(); i++)
-@@ -1345,8 +1358,8 @@ void MassIntegrator::AssembleElementMatrix2(
-    shape.SetSize(tr_nd);
-    te_shape.SetSize(te_nd);
- 
--   const IntegrationRule *ir = IntRule ? IntRule :
--                               &GetRule(trial_fe, test_fe, Trans);
-+   const IntegrationRule *ir = IntRule ? IntRule : &GetRule(trial_fe, test_fe,
-+                                                            Trans);
- 
-    elmat = 0.0;
-    for (int i = 0; i < ir->GetNPoints(); i++)
-@@ -1367,18 +1380,13 @@ void MassIntegrator::AssembleElementMatrix2(
-    }
- }
- 
--const IntegrationRule &MassIntegrator::GetRule(const FiniteElement &trial_fe,
--                                               const FiniteElement &test_fe,
--                                               ElementTransformation &Trans)
-+const IntegrationRule &BoundaryMassIntegrator::GetRule(
-+   const FiniteElement &el1,
-+   const FiniteElement &el2,
-+   FaceElementTransformations &Trans) const
- {
--   // int order = trial_fe.GetOrder() + test_fe.GetOrder();
--   const int order = trial_fe.GetOrder() + test_fe.GetOrder() + Trans.OrderW();
--
--   if (trial_fe.Space() == FunctionSpace::rQk)
--   {
--      return RefinedIntRules.Get(trial_fe.GetGeomType(), order);
--   }
--   return IntRules.Get(trial_fe.GetGeomType(), order);
-+   int order = el1.GetOrder() + el2.GetOrder();
-+   return IntRules.Get(Trans.GetGeometryType(), order);
- }
- 
- void BoundaryMassIntegrator::AssembleFaceMatrix(
-@@ -1397,13 +1405,7 @@ void BoundaryMassIntegrator::AssembleFaceMatrix(
-    elmat.SetSize(nd1);
-    shape.SetSize(nd1);
- 
--   const IntegrationRule *ir = IntRule;
--   if (ir == NULL)
--   {
--      int order = 2 * el1.GetOrder();
--
--      ir = &IntRules.Get(Trans.GetGeometryType(), order);
--   }
-+   const IntegrationRule *ir = IntRule ? IntRule : &GetRule(el1, Trans);
- 
-    elmat = 0.0;
-    for (int i = 0; i < ir->GetNPoints(); i++)
-@@ -1427,6 +1429,15 @@ void BoundaryMassIntegrator::AssembleFaceMatrix(
-    }
- }
- 
-+const IntegrationRule &ConvectionIntegrator::GetRule(
-+   const FiniteElement &trial_fe,
-+   const FiniteElement &test_fe,
-+   ElementTransformation &Trans) const
-+{
-+   int order = Trans.OrderGrad(&trial_fe) + Trans.Order() + test_fe.GetOrder();
-+   return IntRules.Get(trial_fe.GetGeomType(), order);
-+}
-+
- void ConvectionIntegrator::AssembleElementMatrix(
-    const FiniteElement &el, ElementTransformation &Trans, DenseMatrix &elmat)
- {
-@@ -1446,12 +1457,7 @@ void ConvectionIntegrator::AssembleElementMatrix(
- 
-    Vector vec1;
- 
--   const IntegrationRule *ir = IntRule;
--   if (ir == NULL)
--   {
--      int order = Trans.OrderGrad(&el) + Trans.Order() + el.GetOrder();
--      ir = &IntRules.Get(el.GetGeomType(), order);
--   }
-+   const IntegrationRule *ir = IntRule ? IntRule : &GetRule(el, Trans);
- 
-    Q->Eval(Q_ir, Trans, *ir);
- 
-@@ -1474,18 +1480,8 @@ void ConvectionIntegrator::AssembleElementMatrix(
-    }
- }
- 
--const IntegrationRule &ConvectionIntegrator::GetRule(
--   const FiniteElement &fe,
--   ElementTransformation &Trans)
--{
--   int order = Trans.OrderGrad(&fe) + Trans.Order() + fe.GetOrder();
--   return IntRules.Get(fe.GetGeomType(), order);
--}
--
- void GroupConvectionIntegrator::AssembleElementMatrix(
--   const FiniteElement &el,
--   ElementTransformation &Trans,
--   DenseMatrix &elmat)
-+   const FiniteElement &el, ElementTransformation &Trans, DenseMatrix &elmat)
- {
-    int nd = el.GetDof();
-    int dim = el.GetDim();
-@@ -1496,12 +1492,7 @@ void GroupConvectionIntegrator::AssembleElementMatrix(
-    shape.SetSize(nd);
-    grad.SetSize(nd,dim);
- 
--   const IntegrationRule *ir = IntRule;
--   if (ir == NULL)
--   {
--      int order = Trans.OrderGrad(&el) + el.GetOrder();
--      ir = &IntRules.Get(el.GetGeomType(), order);
--   }
-+   const IntegrationRule *ir = IntRule ? IntRule : &GetRule(el, Trans);
- 
-    Q->Eval(Q_nodal, Trans, el.GetNodes()); // sets the size of Q_nodal
- 
-@@ -1536,10 +1527,9 @@ void GroupConvectionIntegrator::AssembleElementMatrix(
-    }
- }
- 
--void VectorMassIntegrator::AssembleElementMatrix(
--   const FiniteElement &el,
--   ElementTransformation &Trans,
--   DenseMatrix &elmat)
-+void VectorMassIntegrator::AssembleElementMatrix(const FiniteElement &el,
-+                                                 ElementTransformation &Trans,
-+                                                 DenseMatrix &elmat)
- {
-    int nd = el.GetDof();
-    int spaceDim = Trans.GetSpaceDim();
-@@ -1561,20 +1551,7 @@ void VectorMassIntegrator::AssembleElementMatrix(
-       mcoeff.SetSize(vdim);
-    }
- 
--   const IntegrationRule *ir = IntRule;
--   if (ir == NULL)
--   {
--      int order = 2 * el.GetOrder() + Trans.OrderW() + Q_order;
--
--      if (el.Space() == FunctionSpace::rQk)
--      {
--         ir = &RefinedIntRules.Get(el.GetGeomType(), order);
--      }
--      else
--      {
--         ir = &IntRules.Get(el.GetGeomType(), order);
--      }
--   }
-+   const IntegrationRule *ir = IntRule ? IntRule : &GetRule(el, Trans);
- 
-    elmat = 0.0;
-    for (int s = 0; s < ir->GetNPoints(); s++)
-@@ -1644,21 +1621,8 @@ void VectorMassIntegrator::AssembleElementMatrix2(
-       mcoeff.SetSize(vdim);
-    }
- 
--   const IntegrationRule *ir = IntRule;
--   if (ir == NULL)
--   {
--      int order = (trial_fe.GetOrder() + test_fe.GetOrder() +
--                   Trans.OrderW() + Q_order);
--
--      if (trial_fe.Space() == FunctionSpace::rQk)
--      {
--         ir = &RefinedIntRules.Get(trial_fe.GetGeomType(), order);
--      }
--      else
--      {
--         ir = &IntRules.Get(trial_fe.GetGeomType(), order);
--      }
--   }
-+   const IntegrationRule *ir = IntRule ? IntRule : &GetRule(trial_fe, test_fe,
-+                                                            Trans);
- 
-    elmat = 0.0;
-    for (int s = 0; s < ir->GetNPoints(); s++)
-@@ -1704,6 +1668,179 @@ void VectorMassIntegrator::AssembleElementMatrix2(
-    }
- }
- 
-+void VectorDiffusionIntegrator::AssembleElementMatrix(
-+   const FiniteElement &el,
-+   ElementTransformation &Trans,
-+   DenseMatrix &elmat)
-+{
-+   const int dof = el.GetDof();
-+   dim = el.GetDim();
-+   sdim = Trans.GetSpaceDim();
-+
-+   // If vdim is not set, set it to the space dimension;
-+   vdim = (vdim <= 0) ? sdim : vdim;
-+   const bool square = (dim == sdim);
-+
-+   if (VQ)
-+   {
-+      vcoeff.SetSize(vdim);
-+   }
-+   else if (MQ)
-+   {
-+      mcoeff.SetSize(vdim);
-+   }
-+
-+   dshape.SetSize(dof, dim);
-+   dshapedxt.SetSize(dof, sdim);
-+
-+   elmat.SetSize(vdim * dof);
-+   pelmat.SetSize(dof);
-+
-+   const IntegrationRule *ir = IntRule ? IntRule : &GetRule(el, Trans);
-+
-+   elmat = 0.0;
-+
-+   for (int i = 0; i < ir->GetNPoints(); i++)
-+   {
-+
-+      const IntegrationPoint &ip = ir->IntPoint(i);
-+      el.CalcDShape(ip, dshape);
-+
-+      Trans.SetIntPoint(&ip);
-+      double w = Trans.Weight();
-+      w = ip.weight / (square ? w : w*w*w);
-+      // AdjugateJacobian = / adj(J),         if J is square
-+      //                    \ adj(J^t.J).J^t, otherwise
-+      Mult(dshape, Trans.AdjugateJacobian(), dshapedxt);
-+
-+      if (VQ)
-+      {
-+         VQ->Eval(vcoeff, Trans, ip);
-+         for (int k = 0; k < vdim; ++k)
-+         {
-+            Mult_a_AAt(w*vcoeff(k), dshapedxt, pelmat);
-+            elmat.AddMatrix(pelmat, dof*k, dof*k);
-+         }
-+      }
-+      else if (MQ)
-+      {
-+         MQ->Eval(mcoeff, Trans, ip);
-+         for (int ii = 0; ii < vdim; ++ii)
-+         {
-+            for (int jj = 0; jj < vdim; ++jj)
-+            {
-+               Mult_a_AAt(w*mcoeff(ii,jj), dshapedxt, pelmat);
-+               elmat.AddMatrix(pelmat, dof*ii, dof*jj);
-+            }
-+         }
-+      }
-+      else
-+      {
-+         if (Q) { w *= Q->Eval(Trans, ip); }
-+         Mult_a_AAt(w, dshapedxt, pelmat);
-+         for (int k = 0; k < vdim; ++k)
-+         {
-+            elmat.AddMatrix(pelmat, dof*k, dof*k);
-+         }
-+      }
-+   }
-+}
-+
-+void VectorDiffusionIntegrator::AssembleElementVector(
-+   const FiniteElement &el, ElementTransformation &Tr,
-+   const Vector &elfun, Vector &elvect)
-+{
-+   const int dof = el.GetDof();
-+   dim = el.GetDim();
-+   sdim = Tr.GetSpaceDim();
-+
-+   // If vdim is not set, set it to the space dimension;
-+   vdim = (vdim <= 0) ? sdim : vdim;
-+   const bool square = (dim == sdim);
-+
-+   if (VQ)
-+   {
-+      vcoeff.SetSize(vdim);
-+   }
-+   else if (MQ)
-+   {
-+      mcoeff.SetSize(vdim);
-+   }
-+
-+   dshape.SetSize(dof, dim);
-+   dshapedxt.SetSize(dof, dim);
-+   // pelmat.SetSize(dim);
-+
-+   elvect.SetSize(dim*dof);
-+
-+   // NOTE: DenseMatrix is in column-major order. This is consistent with
-+   // vectors ordered byNODES. In the resulting DenseMatrix, each column
-+   // corresponds to a particular vdim.
-+   DenseMatrix mat_in(elfun.GetData(), dof, dim);
-+   DenseMatrix mat_out(elvect.GetData(), dof, dim);
-+
-+   const IntegrationRule *ir = IntRule ? IntRule : &GetRule(el, Tr);
-+
-+   elvect = 0.0;
-+   for (int i = 0; i < ir->GetNPoints(); i++)
-+   {
-+      const IntegrationPoint &ip = ir->IntPoint(i);
-+      el.CalcDShape(ip, dshape);
-+
-+      Tr.SetIntPoint(&ip);
-+      double w = Tr.Weight();
-+      w = ip.weight / (square ? w : w*w*w);
-+      Mult(dshape, Tr.AdjugateJacobian(), dshapedxt);
-+      MultAAt(dshapedxt, pelmat);
-+
-+      if (VQ)
-+      {
-+         VQ->Eval(vcoeff, Tr, ip);
-+         for (int k = 0; k < vdim; ++k)
-+         {
-+            pelmat *= w*vcoeff(k);
-+            const Vector vec_in(mat_in.GetColumn(k), dof);
-+            Vector vec_out(mat_out.GetColumn(k), dof);
-+            pelmat.AddMult(vec_in, vec_out);
-+         }
-+      }
-+      else if (MQ)
-+      {
-+         MQ->Eval(mcoeff, Tr, ip);
-+         for (int ii = 0; ii < vdim; ++ii)
-+         {
-+            Vector vec_out(mat_out.GetColumn(ii), dof);
-+            for (int jj = 0; jj < vdim; ++jj)
-+            {
-+               pelmat *= w*mcoeff(ii,jj);
-+               const Vector vec_in(mat_in.GetColumn(jj), dof);
-+               pelmat.Mult(vec_in, vec_out);
-+            }
-+         }
-+      }
-+      else
-+      {
-+         if (Q) { w *= Q->Eval(Tr, ip); }
-+         pelmat *= w;
-+         for (int k = 0; k < vdim; ++k)
-+         {
-+            const Vector vec_in(mat_in.GetColumn(k), dof);
-+            Vector vec_out(mat_out.GetColumn(k), dof);
-+            pelmat.AddMult(vec_in, vec_out);
-+         }
-+      }
-+   }
-+}
-+
-+const IntegrationRule &VectorFEDivergenceIntegrator::GetRule(
-+   const FiniteElement &trial_fe,
-+   const FiniteElement &test_fe,
-+   ElementTransformation &Trans) const
-+{
-+   int order = trial_fe.GetOrder() + test_fe.GetOrder() - 1;
-+   return IntRules.Get(trial_fe.GetGeomType(), order);
-+}
-+
- void VectorFEDivergenceIntegrator::AssembleElementMatrix2(
-    const FiniteElement &trial_fe, const FiniteElement &test_fe,
-    ElementTransformation &Trans, DenseMatrix &elmat)
-@@ -1719,12 +1856,8 @@ void VectorFEDivergenceIntegrator::AssembleElementMatrix2(
- 
-    elmat.SetSize(test_nd, trial_nd);
- 
--   const IntegrationRule *ir = IntRule;
--   if (ir == NULL)
--   {
--      int order = trial_fe.GetOrder() + test_fe.GetOrder() - 1; // <--
--      ir = &IntRules.Get(trial_fe.GetGeomType(), order);
--   }
-+   const IntegrationRule *ir = IntRule ? IntRule : &GetRule(trial_fe, test_fe,
-+                                                            Trans);
- 
-    elmat = 0.0;
-    for (i = 0; i < ir->GetNPoints(); i++)
-@@ -1744,6 +1877,46 @@ void VectorFEDivergenceIntegrator::AssembleElementMatrix2(
-    }
- }
- 
-+const IntegrationRule &VectorFEWeakDivergenceIntegrator::GetRule(
-+   const FiniteElement &trial_fe,
-+   const FiniteElement &test_fe,
-+   ElementTransformation &Trans) const
-+{
-+   // The integrand on the reference element is:
-+   //    -( Q/det(J) ) u_hat^T adj(J) adj(J)^T grad_hat(v_hat).
-+   //
-+   // For Trans in (P_k)^d, v_hat in P_l, u_hat in ND_m, and dim=sdim=d>=1
-+   // - J_{ij} is in P_{k-1}, so adj(J)_{ij} is in P_{(d-1)*(k-1)}
-+   // - so adj(J)^T grad_hat(v_hat) is in (P_{(d-1)*(k-1)+(l-1)})^d
-+   // - u_hat is in (P_m)^d
-+   // - adj(J)^T u_hat is in (P_{(d-1)*(k-1)+m})^d
-+   // - and u_hat^T adj(J) adj(J)^T grad_hat(v_hat) is in P_n with
-+   //   n = 2*(d-1)*(k-1)+(l-1)+m
-+   //
-+   // For Trans in (Q_k)^d, v_hat in Q_l, u_hat in ND_m, and dim=sdim=d>1
-+   // - J_{i*}, J's i-th row, is in ( Q_{k-1,k,k}, Q_{k,k-1,k}, Q_{k,k,k-1} )
-+   // - adj(J)_{*j} is in ( Q_{s,s-1,s-1}, Q_{s-1,s,s-1}, Q_{s-1,s-1,s} )
-+   //   with s = (d-1)*k
-+   // - adj(J)^T grad_hat(v_hat) is in Q_{(d-1)*k+(l-1)}
-+   // - u_hat is in ( Q_{m-1,m,m}, Q_{m,m-1,m}, Q_{m,m,m-1} )
-+   // - adj(J)^T u_hat is in Q_{(d-1)*k+(m-1)}
-+   // - and u_hat^T adj(J) adj(J)^T grad_hat(v_hat) is in Q_n with
-+   //   n = 2*(d-1)*k+(l-1)+(m-1)
-+   //
-+   // In the next formula we use the expressions for n with k=1, which means
-+   // that the term Q/det(J) is disregarded:
-+   int order;
-+   if (trial_fe.Space() == FunctionSpace::Pk)
-+   {
-+      order = trial_fe.GetOrder() + test_fe.GetOrder() - 1;
-+   }
-+   else
-+   {
-+      order = trial_fe.GetOrder() + test_fe.GetOrder() + 2 * (trial_fe.GetDim() - 2);
-+   }
-+   return IntRules.Get(trial_fe.GetGeomType(), order);
-+}
-+
- void VectorFEWeakDivergenceIntegrator::AssembleElementMatrix2(
-    const FiniteElement &trial_fe, const FiniteElement &test_fe,
-    ElementTransformation &Trans, DenseMatrix &elmat)
-@@ -1770,37 +1943,8 @@ void VectorFEWeakDivergenceIntegrator::AssembleElementMatrix2(
- 
-    elmat.SetSize(test_nd, trial_nd);
- 
--   const IntegrationRule *ir = IntRule;
--   if (ir == NULL)
--   {
--      // The integrand on the reference element is:
--      //    -( Q/det(J) ) u_hat^T adj(J) adj(J)^T grad_hat(v_hat).
--      //
--      // For Trans in (P_k)^d, v_hat in P_l, u_hat in ND_m, and dim=sdim=d>=1
--      // - J_{ij} is in P_{k-1}, so adj(J)_{ij} is in P_{(d-1)*(k-1)}
--      // - so adj(J)^T grad_hat(v_hat) is in (P_{(d-1)*(k-1)+(l-1)})^d
--      // - u_hat is in (P_m)^d
--      // - adj(J)^T u_hat is in (P_{(d-1)*(k-1)+m})^d
--      // - and u_hat^T adj(J) adj(J)^T grad_hat(v_hat) is in P_n with
--      //   n = 2*(d-1)*(k-1)+(l-1)+m
--      //
--      // For Trans in (Q_k)^d, v_hat in Q_l, u_hat in ND_m, and dim=sdim=d>1
--      // - J_{i*}, J's i-th row, is in ( Q_{k-1,k,k}, Q_{k,k-1,k}, Q_{k,k,k-1} )
--      // - adj(J)_{*j} is in ( Q_{s,s-1,s-1}, Q_{s-1,s,s-1}, Q_{s-1,s-1,s} )
--      //   with s = (d-1)*k
--      // - adj(J)^T grad_hat(v_hat) is in Q_{(d-1)*k+(l-1)}
--      // - u_hat is in ( Q_{m-1,m,m}, Q_{m,m-1,m}, Q_{m,m,m-1} )
--      // - adj(J)^T u_hat is in Q_{(d-1)*k+(m-1)}
--      // - and u_hat^T adj(J) adj(J)^T grad_hat(v_hat) is in Q_n with
--      //   n = 2*(d-1)*k+(l-1)+(m-1)
--      //
--      // In the next formula we use the expressions for n with k=1, which means
--      // that the term Q/det(J) is disregarded:
--      int ir_order = (trial_fe.Space() == FunctionSpace::Pk) ?
--                     (trial_fe.GetOrder() + test_fe.GetOrder() - 1) :
--                     (trial_fe.GetOrder() + test_fe.GetOrder() + 2*(dim-2));
--      ir = &IntRules.Get(trial_fe.GetGeomType(), ir_order);
--   }
-+   const IntegrationRule *ir = IntRule ? IntRule : &GetRule(trial_fe, test_fe,
-+                                                            Trans);
- 
-    elmat = 0.0;
-    for (i = 0; i < ir->GetNPoints(); i++)
-@@ -1826,6 +1970,15 @@ void VectorFEWeakDivergenceIntegrator::AssembleElementMatrix2(
-    }
- }
- 
-+const IntegrationRule &VectorFECurlIntegrator::GetRule(
-+   const FiniteElement &trial_fe,
-+   const FiniteElement &test_fe,
-+   ElementTransformation &Trans) const
-+{
-+   int order = trial_fe.GetOrder() + test_fe.GetOrder() - 1;
-+   return IntRules.Get(trial_fe.GetGeomType(), order);
-+}
-+
- void VectorFECurlIntegrator::AssembleElementMatrix2(
-    const FiniteElement &trial_fe, const FiniteElement &test_fe,
-    ElementTransformation &Trans, DenseMatrix &elmat)
-@@ -1863,12 +2016,8 @@ void VectorFECurlIntegrator::AssembleElementMatrix2(
- 
-    elmat.SetSize(test_nd, trial_nd);
- 
--   const IntegrationRule *ir = IntRule;
--   if (ir == NULL)
--   {
--      int order = trial_fe.GetOrder() + test_fe.GetOrder() - 1; // <--
--      ir = &IntRules.Get(trial_fe.GetGeomType(), order);
--   }
-+   const IntegrationRule *ir = IntRule ? IntRule : &GetRule(trial_fe, test_fe,
-+                                                            Trans);
- 
-    elmat = 0.0;
-    for (i = 0; i < ir->GetNPoints(); i++)
-@@ -1923,6 +2072,27 @@ void VectorFECurlIntegrator::AssembleElementMatrix2(
-    }
- }
- 
-+const IntegrationRule &DerivativeIntegrator::GetRule(
-+   const FiniteElement &trial_fe,
-+   const FiniteElement &test_fe,
-+   ElementTransformation &Trans) const
-+{
-+   int order;
-+   if (trial_fe.Space() == FunctionSpace::Pk)
-+   {
-+      order = trial_fe.GetOrder() + test_fe.GetOrder() - 1;
-+   }
-+   else
-+   {
-+      order = trial_fe.GetOrder() + test_fe.GetOrder() + trial_fe.GetDim();
-+   }
-+   if (trial_fe.Space() == FunctionSpace::rQk)
-+   {
-+      return RefinedIntRules.Get(trial_fe.GetGeomType(), order);
-+   }
-+   return IntRules.Get(trial_fe.GetGeomType(), order);
-+}
-+
- void DerivativeIntegrator::AssembleElementMatrix2(
-    const FiniteElement &trial_fe,
-    const FiniteElement &test_fe,
-@@ -1944,28 +2114,8 @@ void DerivativeIntegrator::AssembleElementMatrix2(
-    invdfdx.SetSize(dim, spaceDim);
-    shape.SetSize(test_nd);
- 
--   const IntegrationRule *ir = IntRule;
--   if (ir == NULL)
--   {
--      int order;
--      if (trial_fe.Space() == FunctionSpace::Pk)
--      {
--         order = trial_fe.GetOrder() + test_fe.GetOrder() - 1;
--      }
--      else
--      {
--         order = trial_fe.GetOrder() + test_fe.GetOrder() + dim;
--      }
--
--      if (trial_fe.Space() == FunctionSpace::rQk)
--      {
--         ir = &RefinedIntRules.Get(trial_fe.GetGeomType(), order);
--      }
--      else
--      {
--         ir = &IntRules.Get(trial_fe.GetGeomType(), order);
--      }
--   }
-+   const IntegrationRule *ir = IntRule ? IntRule : &GetRule(trial_fe, test_fe,
-+                                                            Trans);
- 
-    elmat = 0.0;
-    for (i = 0; i < ir->GetNPoints(); i++)
-@@ -1991,10 +2141,26 @@ void DerivativeIntegrator::AssembleElementMatrix2(
-    }
- }
- 
--void CurlCurlIntegrator::AssembleElementMatrix(
--   const FiniteElement &el,
--   ElementTransformation &Trans,
--   DenseMatrix &elmat)
-+const IntegrationRule &CurlCurlIntegrator::GetRule(
-+   const FiniteElement &trial_fe,
-+   const FiniteElement &test_fe,
-+   ElementTransformation &Trans) const
-+{
-+   int order;
-+   if (trial_fe.Space() == FunctionSpace::Pk)
-+   {
-+      order = test_fe.GetOrder() + trial_fe.GetOrder() - 2;
-+   }
-+   else
-+   {
-+      order = test_fe.GetOrder() + trial_fe.GetOrder() + trial_fe.GetDim() - 1;
-+   }
-+   return IntRules.Get(trial_fe.GetGeomType(), order);
-+}
-+
-+void CurlCurlIntegrator::AssembleElementMatrix(const FiniteElement &el,
-+                                               ElementTransformation &Trans,
-+                                               DenseMatrix &elmat)
- {
-    int nd = el.GetDof();
-    dim = el.GetDim();
-@@ -2009,25 +2175,10 @@ void CurlCurlIntegrator::AssembleElementMatrix(
-    curlshape_dFt.SetSize(nd,dimc);
- #endif
-    elmat.SetSize(nd);
--
-    if (MQ) { M.SetSize(dimc); }
-    if (DQ) { D.SetSize(dimc); }
- 
--   const IntegrationRule *ir = IntRule;
--   if (ir == NULL)
--   {
--      int order;
--      if (el.Space() == FunctionSpace::Pk)
--      {
--         order = 2*el.GetOrder() - 2;
--      }
--      else
--      {
--         order = 2*el.GetOrder();
--      }
--
--      ir = &IntRules.Get(el.GetGeomType(), order);
--   }
-+   const IntegrationRule *ir = IntRule ? IntRule : &GetRule(el, Trans);
- 
-    elmat = 0.0;
-    for (int i = 0; i < ir->GetNPoints(); i++)
-@@ -2090,20 +2241,8 @@ void CurlCurlIntegrator::AssembleElementMatrix2(const FiniteElement &trial_fe,
-    if (MQ) { M.SetSize(dimc); }
-    if (DQ) { D.SetSize(dimc); }
- 
--   const IntegrationRule *ir = IntRule;
--   if (ir == NULL)
--   {
--      int order;
--      if (trial_fe.Space() == FunctionSpace::Pk)
--      {
--         order = test_fe.GetOrder() + trial_fe.GetOrder() - 2;
--      }
--      else
--      {
--         order = test_fe.GetOrder() + trial_fe.GetOrder() + trial_fe.GetDim() - 1;
--      }
--      ir = &IntRules.Get(trial_fe.GetGeomType(), order);
--   }
-+   const IntegrationRule *ir = IntRule ? IntRule : &GetRule(trial_fe, test_fe,
-+                                                            Trans);
- 
-    elmat = 0.0;
-    for (int i = 0; i < ir->GetNPoints(); i++)
-@@ -2174,7 +2313,7 @@ double CurlCurlIntegrator::ComputeFluxEnergy(const FiniteElement &fluxelem,
-    pointflux.SetSize(dim);
-    if (d_energy) { vec.SetSize(dim); }
- 
--   int order = 2 * fluxelem.GetOrder(); // <--
-+   int order = 2 * fluxelem.GetOrder();
-    const IntegrationRule &ir = IntRules.Get(fluxelem.GetGeomType(), order);
- 
-    double energy = 0.0;
-@@ -2265,10 +2404,18 @@ double CurlCurlIntegrator::ComputeFluxEnergy(const FiniteElement &fluxelem,
-    return energy;
- }
- 
-+const IntegrationRule &VectorCurlCurlIntegrator::GetRule(
-+   const FiniteElement &trial_fe,
-+   const FiniteElement &test_fe,
-+   ElementTransformation &Trans) const
-+{
-+   // Use the same integration rule as diffusion
-+   int order = Trans.OrderGrad(&trial_fe) + Trans.OrderGrad(&test_fe);
-+   return IntRules.Get(trial_fe.GetGeomType(), order);
-+}
-+
- void VectorCurlCurlIntegrator::AssembleElementMatrix(
--   const FiniteElement &el,
--   ElementTransformation &Trans,
--   DenseMatrix &elmat)
-+   const FiniteElement &el, ElementTransformation &Trans, DenseMatrix &elmat)
- {
-    int dim = el.GetDim();
-    int dof = el.GetDof();
-@@ -2284,13 +2431,7 @@ void VectorCurlCurlIntegrator::AssembleElementMatrix(
-    Jadj.SetSize(dim);
- #endif
- 
--   const IntegrationRule *ir = IntRule;
--   if (ir == NULL)
--   {
--      // use the same integration rule as diffusion
--      int order = 2 * Trans.OrderGrad(&el);
--      ir = &IntRules.Get(el.GetGeomType(), order);
--   }
-+   const IntegrationRule *ir = IntRule ? IntRule : &GetRule(el, Trans);
- 
-    elmat.SetSize(dof*dim);
-    elmat = 0.0;
-@@ -2332,13 +2473,7 @@ double VectorCurlCurlIntegrator::GetElementEnergy(
- #endif
-    DenseMatrix elfun_mat(elfun.GetData(), dof, dim);
- 
--   const IntegrationRule *ir = IntRule;
--   if (ir == NULL)
--   {
--      // use the same integration rule as diffusion
--      int order = 2 * Tr.OrderGrad(&el);
--      ir = &IntRules.Get(el.GetGeomType(), order);
--   }
-+   const IntegrationRule *ir = IntRule ? IntRule : &GetRule(el, Tr);
- 
-    double energy = 0.;
-    for (int i = 0; i < ir->GetNPoints(); i++)
-@@ -2380,6 +2515,15 @@ double VectorCurlCurlIntegrator::GetElementEnergy(
-    return 0.5 * energy;
- }
- 
-+const IntegrationRule &MixedCurlIntegrator::GetRule(
-+   const FiniteElement &trial_fe,
-+   const FiniteElement &test_fe,
-+   ElementTransformation &Trans) const
-+{
-+   int order = trial_fe.GetOrder() + test_fe.GetOrder() + Trans.OrderJ();
-+   return IntRules.Get(trial_fe.GetGeomType(), order);
-+}
-+
- void MixedCurlIntegrator::AssembleElementMatrix2(
-    const FiniteElement &trial_fe, const FiniteElement &test_fe,
-    ElementTransformation &Trans, DenseMatrix &elmat)
-@@ -2415,13 +2559,9 @@ void MixedCurlIntegrator::AssembleElementMatrix2(
- 
-    double c;
-    Vector d_col;
--   const IntegrationRule *ir = IntRule;
- 
--   if (ir == NULL)
--   {
--      int order = trial_fe.GetOrder() + test_fe.GetOrder() + Trans.OrderJ();
--      ir = &IntRules.Get(trial_fe.GetGeomType(), order);
--   }
-+   const IntegrationRule *ir = IntRule ? IntRule : &GetRule(trial_fe, test_fe,
-+                                                            Trans);
- 
-    for (int i = 0; i < ir->GetNPoints(); i++)
-    {
-@@ -2458,6 +2598,15 @@ void MixedCurlIntegrator::AssembleElementMatrix2(
-    }
- }
- 
-+const IntegrationRule &VectorFEMassIntegrator::GetRule(
-+   const FiniteElement &trial_fe,
-+   const FiniteElement &test_fe,
-+   ElementTransformation &Trans) const
-+{
-+   int order = Trans.OrderW() + test_fe.GetOrder() + trial_fe.GetOrder();
-+   return IntRules.Get(test_fe.GetGeomType(), order);
-+}
-+
- void VectorFEMassIntegrator::AssembleElementMatrix(
-    const FiniteElement &el,
-    ElementTransformation &Trans,
-@@ -2483,13 +2632,7 @@ void VectorFEMassIntegrator::AssembleElementMatrix(
-    elmat.SetSize(dof);
-    elmat = 0.0;
- 
--   const IntegrationRule *ir = IntRule;
--   if (ir == NULL)
--   {
--      // int order = 2 * el.GetOrder();
--      int order = Trans.OrderW() + 2 * el.GetOrder();
--      ir = &IntRules.Get(el.GetGeomType(), order);
--   }
-+   const IntegrationRule *ir = IntRule ? IntRule : &GetRule(el, Trans);
- 
-    for (int i = 0; i < ir->GetNPoints(); i++)
-    {
-@@ -2517,7 +2660,7 @@ void VectorFEMassIntegrator::AssembleElementMatrix(
-       {
-          if (Q)
-          {
--            w *= Q->Eval (Trans, ip);
-+            w *= Q->Eval(Trans, ip);
-          }
-          AddMult_a_AAt(w, trial_vshape, elmat);
-       }
-@@ -2551,15 +2694,11 @@ void VectorFEMassIntegrator::AssembleElementMatrix2(
- #endif
- 
-       elmat.SetSize(vdim*test_dof, trial_dof);
-+      elmat = 0.0;
- 
--      const IntegrationRule *ir = IntRule;
--      if (ir == NULL)
--      {
--         int order = (Trans.OrderW() + test_fe.GetOrder() + trial_fe.GetOrder());
--         ir = &IntRules.Get(test_fe.GetGeomType(), order);
--      }
-+      const IntegrationRule *ir = IntRule ? IntRule : &GetRule(trial_fe, test_fe,
-+                                                               Trans);
- 
--      elmat = 0.0;
-       for (int i = 0; i < ir->GetNPoints(); i++)
-       {
-          const IntegrationPoint &ip = ir->IntPoint(i);
-@@ -2652,12 +2791,8 @@ void VectorFEMassIntegrator::AssembleElementMatrix2(
- 
-       elmat.SetSize(test_dof, trial_dof);
- 
--      const IntegrationRule *ir = IntRule;
--      if (ir == NULL)
--      {
--         int order = (Trans.OrderW() + test_fe.GetOrder() + trial_fe.GetOrder());
--         ir = &IntRules.Get(test_fe.GetGeomType(), order);
--      }
-+      const IntegrationRule *ir = IntRule ? IntRule : &GetRule(trial_fe, test_fe,
-+                                                               Trans);
- 
-       elmat = 0.0;
-       for (int i = 0; i < ir->GetNPoints(); i++)
-@@ -2687,7 +2822,7 @@ void VectorFEMassIntegrator::AssembleElementMatrix2(
-          {
-             if (Q)
-             {
--               w *= Q->Eval (Trans, ip);
-+               w *= Q->Eval(Trans, ip);
-             }
-             AddMult_a_ABt(w,test_vshape,trial_vshape,elmat);
-          }
-@@ -2700,6 +2835,15 @@ void VectorFEMassIntegrator::AssembleElementMatrix2(
-    }
- }
- 
-+const IntegrationRule &VectorDivergenceIntegrator::GetRule(
-+   const FiniteElement &trial_fe,
-+   const FiniteElement &test_fe,
-+   ElementTransformation &Trans) const
-+{
-+   int order = Trans.OrderGrad(&trial_fe) + test_fe.GetOrder() + Trans.OrderJ();
-+   return IntRules.Get(trial_fe.GetGeomType(), order);
-+}
-+
- void VectorDivergenceIntegrator::AssembleElementMatrix2(
-    const FiniteElement &trial_fe,
-    const FiniteElement &test_fe,
-@@ -2741,7 +2885,7 @@ void VectorDivergenceIntegrator::AssembleElementMatrix2(
-       c = ip.weight;
-       if (Q)
-       {
--         c *= Q->Eval (Trans, ip);
-+         c *= Q->Eval(Trans, ip);
-       }
- 
-       // elmat += c * shape * divshape ^ t
-@@ -2750,12 +2894,13 @@ void VectorDivergenceIntegrator::AssembleElementMatrix2(
-    }
- }
- 
--const IntegrationRule &VectorDivergenceIntegrator::GetRule(
-+const IntegrationRule &DivDivIntegrator::GetRule(
-    const FiniteElement &trial_fe,
-    const FiniteElement &test_fe,
--   ElementTransformation &Trans)
-+   ElementTransformation &Trans) const
- {
--   int order = Trans.OrderGrad(&trial_fe) + test_fe.GetOrder() + Trans.OrderJ();
-+   int order = 2 * max(trial_fe.GetOrder(),
-+                       test_fe.GetOrder()) - 2; // <--- OK for RTk
-    return IntRules.Get(trial_fe.GetGeomType(), order);
- }
- 
-@@ -2774,12 +2919,7 @@ void DivDivIntegrator::AssembleElementMatrix(
- #endif
-    elmat.SetSize(dof);
- 
--   const IntegrationRule *ir = IntRule;
--   if (ir == NULL)
--   {
--      int order = 2 * el.GetOrder() - 2; // <--- OK for RTk
--      ir = &IntRules.Get(el.GetGeomType(), order);
--   }
-+   const IntegrationRule *ir = IntRule ? IntRule : &GetRule(el, Trans);
- 
-    elmat = 0.0;
-    for (int i = 0; i < ir->GetNPoints(); i++)
-@@ -2793,7 +2933,7 @@ void DivDivIntegrator::AssembleElementMatrix(
- 
-       if (Q)
-       {
--         c *= Q->Eval (Trans, ip);
-+         c *= Q->Eval(Trans, ip);
-       }
- 
-       // elmat += c * divshape * divshape ^ t
-@@ -2820,13 +2960,8 @@ void DivDivIntegrator::AssembleElementMatrix2(
- #endif
-    elmat.SetSize(te_nd,tr_nd);
- 
--   const IntegrationRule *ir = IntRule;
--   if (ir == NULL)
--   {
--      int order = 2 * max(test_fe.GetOrder(),
--                          trial_fe.GetOrder()) - 2; // <--- OK for RTk
--      ir = &IntRules.Get(test_fe.GetGeomType(), order);
--   }
-+   const IntegrationRule *ir = IntRule ? IntRule : &GetRule(trial_fe, test_fe,
-+                                                            Trans);
- 
-    elmat = 0.0;
-    for (int i = 0; i < ir->GetNPoints(); i++)
-@@ -2841,7 +2976,7 @@ void DivDivIntegrator::AssembleElementMatrix2(
- 
-       if (Q)
-       {
--         c *= Q->Eval (Trans, ip);
-+         c *= Q->Eval(Trans, ip);
-       }
- 
-       te_divshape *= c;
-@@ -2849,176 +2984,13 @@ void DivDivIntegrator::AssembleElementMatrix2(
-    }
- }
- 
--void VectorDiffusionIntegrator::AssembleElementMatrix(
--   const FiniteElement &el,
--   ElementTransformation &Trans,
--   DenseMatrix &elmat)
--{
--   const int dof = el.GetDof();
--   dim = el.GetDim();
--   sdim = Trans.GetSpaceDim();
--
--   // If vdim is not set, set it to the space dimension;
--   vdim = (vdim <= 0) ? sdim : vdim;
--   const bool square = (dim == sdim);
--
--   if (VQ)
--   {
--      vcoeff.SetSize(vdim);
--   }
--   else if (MQ)
--   {
--      mcoeff.SetSize(vdim);
--   }
--
--   dshape.SetSize(dof, dim);
--   dshapedxt.SetSize(dof, sdim);
--
--   elmat.SetSize(vdim * dof);
--   pelmat.SetSize(dof);
--
--   const IntegrationRule *ir = IntRule;
--   if (ir == NULL)
--   {
--      ir = &DiffusionIntegrator::GetRule(el,el);
--   }
--
--   elmat = 0.0;
--
--   for (int i = 0; i < ir->GetNPoints(); i++)
--   {
--
--      const IntegrationPoint &ip = ir->IntPoint(i);
--      el.CalcDShape(ip, dshape);
--
--      Trans.SetIntPoint(&ip);
--      double w = Trans.Weight();
--      w = ip.weight / (square ? w : w*w*w);
--      // AdjugateJacobian = / adj(J),         if J is square
--      //                    \ adj(J^t.J).J^t, otherwise
--      Mult(dshape, Trans.AdjugateJacobian(), dshapedxt);
--
--      if (VQ)
--      {
--         VQ->Eval(vcoeff, Trans, ip);
--         for (int k = 0; k < vdim; ++k)
--         {
--            Mult_a_AAt(w*vcoeff(k), dshapedxt, pelmat);
--            elmat.AddMatrix(pelmat, dof*k, dof*k);
--         }
--      }
--      else if (MQ)
--      {
--         MQ->Eval(mcoeff, Trans, ip);
--         for (int ii = 0; ii < vdim; ++ii)
--         {
--            for (int jj = 0; jj < vdim; ++jj)
--            {
--               Mult_a_AAt(w*mcoeff(ii,jj), dshapedxt, pelmat);
--               elmat.AddMatrix(pelmat, dof*ii, dof*jj);
--            }
--         }
--      }
--      else
--      {
--         if (Q) { w *= Q->Eval(Trans, ip); }
--         Mult_a_AAt(w, dshapedxt, pelmat);
--         for (int k = 0; k < vdim; ++k)
--         {
--            elmat.AddMatrix(pelmat, dof*k, dof*k);
--         }
--      }
--   }
--}
--
--void VectorDiffusionIntegrator::AssembleElementVector(
--   const FiniteElement &el, ElementTransformation &Tr,
--   const Vector &elfun, Vector &elvect)
-+const IntegrationRule &ElasticityIntegrator::GetRule(
-+   const FiniteElement &trial_fe,
-+   const FiniteElement &test_fe,
-+   ElementTransformation &Trans) const
- {
--   const int dof = el.GetDof();
--   dim = el.GetDim();
--   sdim = Tr.GetSpaceDim();
--
--   // If vdim is not set, set it to the space dimension;
--   vdim = (vdim <= 0) ? sdim : vdim;
--   const bool square = (dim == sdim);
--
--   if (VQ)
--   {
--      vcoeff.SetSize(vdim);
--   }
--   else if (MQ)
--   {
--      mcoeff.SetSize(vdim);
--   }
--
--   dshape.SetSize(dof, dim);
--   dshapedxt.SetSize(dof, dim);
--   // pelmat.SetSize(dim);
--
--   elvect.SetSize(dim*dof);
--
--   // NOTE: DenseMatrix is in column-major order. This is consistent with
--   // vectors ordered byNODES. In the resulting DenseMatrix, each column
--   // corresponds to a particular vdim.
--   DenseMatrix mat_in(elfun.GetData(), dof, dim);
--   DenseMatrix mat_out(elvect.GetData(), dof, dim);
--
--   const IntegrationRule *ir = IntRule;
--   if (ir == NULL)
--   {
--      ir = &DiffusionIntegrator::GetRule(el,el);
--   }
--
--   elvect = 0.0;
--   for (int i = 0; i < ir->GetNPoints(); i++)
--   {
--      const IntegrationPoint &ip = ir->IntPoint(i);
--      el.CalcDShape(ip, dshape);
--
--      Tr.SetIntPoint(&ip);
--      double w = Tr.Weight();
--      w = ip.weight / (square ? w : w*w*w);
--      Mult(dshape, Tr.AdjugateJacobian(), dshapedxt);
--      MultAAt(dshapedxt, pelmat);
--
--      if (VQ)
--      {
--         VQ->Eval(vcoeff, Tr, ip);
--         for (int k = 0; k < vdim; ++k)
--         {
--            pelmat *= w*vcoeff(k);
--            const Vector vec_in(mat_in.GetColumn(k), dof);
--            Vector vec_out(mat_out.GetColumn(k), dof);
--            pelmat.AddMult(vec_in, vec_out);
--         }
--      }
--      else if (MQ)
--      {
--         MQ->Eval(mcoeff, Tr, ip);
--         for (int ii = 0; ii < vdim; ++ii)
--         {
--            Vector vec_out(mat_out.GetColumn(ii), dof);
--            for (int jj = 0; jj < vdim; ++jj)
--            {
--               pelmat *= w*mcoeff(ii,jj);
--               const Vector vec_in(mat_in.GetColumn(jj), dof);
--               pelmat.Mult(vec_in, vec_out);
--            }
--         }
--      }
--      else
--      {
--         if (Q) { w *= Q->Eval(Tr, ip); }
--         pelmat *= w;
--         for (int k = 0; k < vdim; ++k)
--         {
--            const Vector vec_in(mat_in.GetColumn(k), dof);
--            Vector vec_out(mat_out.GetColumn(k), dof);
--            pelmat.AddMult(vec_in, vec_out);
--         }
--      }
--   }
-+   int order = Trans.OrderGrad(&trial_fe) + Trans.OrderGrad(&test_fe);
-+   return IntRules.Get(trial_fe.GetGeomType(), order);
- }
- 
- void ElasticityIntegrator::AssembleElementMatrix(
-@@ -3042,12 +3014,7 @@ void ElasticityIntegrator::AssembleElementMatrix(
- 
-    elmat.SetSize(dof * dim);
- 
--   const IntegrationRule *ir = IntRule;
--   if (ir == NULL)
--   {
--      int order = 2 * Trans.OrderGrad(&el); // correct order?
--      ir = &IntRules.Get(el.GetGeomType(), order);
--   }
-+   const IntegrationRule *ir = IntRule ? IntRule : &GetRule(el, Trans);
- 
-    elmat = 0.0;
- 
-@@ -3210,12 +3177,7 @@ double ElasticityIntegrator::ComputeFluxEnergy(const FiniteElement &fluxelem,
-    // Use the same integration rule as in AssembleElementMatrix, replacing 'el'
-    // with 'fluxelem' when 'IntRule' is not set.
-    // Should we be using a different (more accurate) rule here?
--   const IntegrationRule *ir = IntRule;
--   if (ir == NULL)
--   {
--      int order = 2 * Trans.OrderGrad(&fluxelem);
--      ir = &IntRules.Get(fluxelem.GetGeomType(), order);
--   }
-+   const IntegrationRule *ir = IntRule ? IntRule : &GetRule(fluxelem, Trans);
- 
-    double energy = 0.0;
- 
-@@ -3275,6 +3237,28 @@ double ElasticityIntegrator::ComputeFluxEnergy(const FiniteElement &fluxelem,
-    return energy;
- }
- 
-+const IntegrationRule &DGTraceIntegrator::GetRule(
-+   const FiniteElement &el1, const FiniteElement &el2,
-+   FaceElementTransformations &Trans) const
-+{
-+   // Assuming order(u) == order(mesh)
-+   int order;
-+   if (Trans.Elem2No >= 0)
-+   {
-+      order = (min(Trans.Elem1->OrderW(), Trans.Elem2->OrderW()) +
-+               2 * max(el1.GetOrder(), el2.GetOrder()));
-+   }
-+   else
-+   {
-+      order = Trans.Elem1->OrderW() + 2 * el1.GetOrder();
-+   }
-+   if (el1.Space() == FunctionSpace::Pk)
-+   {
-+      order++;
-+   }
-+   return IntRules.Get(Trans.GetGeometryType(), order);
-+}
-+
- void DGTraceIntegrator::AssembleFaceMatrix(const FiniteElement &el1,
-                                            const FiniteElement &el2,
-                                            FaceElementTransformations &Trans,
-@@ -3302,24 +3286,7 @@ void DGTraceIntegrator::AssembleFaceMatrix(const FiniteElement &el1,
-    elmat.SetSize(ndof1 + ndof2);
-    elmat = 0.0;
- 
--   const IntegrationRule *ir = IntRule;
--   if (ir == NULL)
--   {
--      int order;
--      // Assuming order(u)==order(mesh)
--      if (Trans.Elem2No >= 0)
--         order = (min(Trans.Elem1->OrderW(), Trans.Elem2->OrderW()) +
--                  2*max(el1.GetOrder(), el2.GetOrder()));
--      else
--      {
--         order = Trans.Elem1->OrderW() + 2*el1.GetOrder();
--      }
--      if (el1.Space() == FunctionSpace::Pk)
--      {
--         order++;
--      }
--      ir = &IntRules.Get(Trans.GetGeometryType(), order);
--   }
-+   const IntegrationRule *ir = IntRule ? IntRule : &GetRule(el1, el2, Trans);
- 
-    for (int p = 0; p < ir->GetNPoints(); p++)
-    {
-@@ -3408,11 +3375,21 @@ void DGTraceIntegrator::AssembleFaceMatrix(const FiniteElement &el1,
-    }
- }
- 
--const IntegrationRule &DGTraceIntegrator::GetRule(
--   Geometry::Type geom, int order, FaceElementTransformations &T)
-+const IntegrationRule &DGDiffusionIntegrator::GetRuleStatic(
-+   const FiniteElement &el1, const FiniteElement &el2,
-+   FaceElementTransformations &Trans)
- {
--   int int_order = T.Elem1->OrderW() + 2*order;
--   return IntRules.Get(geom, int_order);
-+   // A simple choice for the integration order; is this OK?
-+   int order;
-+   if (Trans.Elem2No >= 0)
-+   {
-+      order = 2 * max(el1.GetOrder(), el2.GetOrder());
-+   }
-+   else
-+   {
-+      order = 2 * el1.GetOrder();
-+   }
-+   return IntRules.Get(Trans.GetGeometryType(), order);
- }
- 
- void DGDiffusionIntegrator::AssembleFaceMatrix(
-@@ -3459,21 +3436,7 @@ void DGDiffusionIntegrator::AssembleFaceMatrix(
-       jmat = 0.;
-    }
- 
--   const IntegrationRule *ir = IntRule;
--   if (ir == NULL)
--   {
--      // a simple choice for the integration order; is this OK?
--      int order;
--      if (ndof2)
--      {
--         order = 2*max(el1.GetOrder(), el2.GetOrder());
--      }
--      else
--      {
--         order = 2*el1.GetOrder();
--      }
--      ir = &IntRules.Get(Trans.GetGeometryType(), order);
--   }
-+   const IntegrationRule *ir = IntRule ? IntRule : &GetRule(el1, el2, Trans);
- 
-    // assemble: < {(Q \nabla u).n},[v] >      --> elmat
-    //           kappa < {h^{-1} Q} [u],[v] >  --> jmat
-@@ -3749,13 +3712,7 @@ void DGElasticityIntegrator::AssembleFaceMatrix(
-       dshape2_dnM.SetSize(ndofs2);
-    }
- 
--   const IntegrationRule *ir = IntRule;
--   if (ir == NULL)
--   {
--      // a simple choice for the integration order; is this OK?
--      const int order = 2 * max(el1.GetOrder(), ndofs2 ? el2.GetOrder() : 0);
--      ir = &IntRules.Get(Trans.GetGeometryType(), order);
--   }
-+   const IntegrationRule *ir = IntRule ? IntRule : &GetRule(el1, el2, Trans);
- 
-    for (int pind = 0; pind < ir->GetNPoints(); ++pind)
-    {
-@@ -3872,7 +3829,7 @@ void DGElasticityIntegrator::AssembleFaceMatrix(
-    }
- }
- 
--void TraceJumpIntegrator::AssembleFaceMatrix(
-+void TraceJumpIntegrator::AssembleFaceMatrix2(
-    const FiniteElement &trial_face_fe, const FiniteElement &test_fe1,
-    const FiniteElement &test_fe2, FaceElementTransformations &Trans,
-    DenseMatrix &elmat)
-@@ -3964,7 +3921,7 @@ void TraceJumpIntegrator::AssembleFaceMatrix(
-    }
- }
- 
--void NormalTraceJumpIntegrator::AssembleFaceMatrix(
-+void NormalTraceJumpIntegrator::AssembleFaceMatrix2(
-    const FiniteElement &trial_face_fe, const FiniteElement &test_fe1,
-    const FiniteElement &test_fe2, FaceElementTransformations &Trans,
-    DenseMatrix &elmat)
-@@ -4312,11 +4269,11 @@ struct ShapeCoefficient : public VectorCoefficient
- 
- }
- 
--void
--ScalarProductInterpolator::AssembleElementMatrix2(const FiniteElement &dom_fe,
--                                                  const FiniteElement &ran_fe,
--                                                  ElementTransformation &Trans,
--                                                  DenseMatrix &elmat)
-+void ScalarProductInterpolator::AssembleElementMatrix2(
-+   const FiniteElement &dom_fe,
-+   const FiniteElement &ran_fe,
-+   ElementTransformation &Trans,
-+   DenseMatrix &elmat)
- {
-    internal::ShapeCoefficient dom_shape_coeff(*Q, dom_fe);
- 
-@@ -4327,8 +4284,7 @@ ScalarProductInterpolator::AssembleElementMatrix2(const FiniteElement &dom_fe,
-    ran_fe.Project(dom_shape_coeff, Trans, elmat_as_vec);
- }
- 
--void
--ScalarVectorProductInterpolator::AssembleElementMatrix2(
-+void ScalarVectorProductInterpolator::AssembleElementMatrix2(
-    const FiniteElement &dom_fe,
-    const FiniteElement &ran_fe,
-    ElementTransformation &Trans,
-@@ -4361,8 +4317,7 @@ ScalarVectorProductInterpolator::AssembleElementMatrix2(
-    ran_fe.ProjectMatrixCoefficient(dom_shape_coeff, Trans, elmat_as_vec);
- }
- 
--void
--VectorScalarProductInterpolator::AssembleElementMatrix2(
-+void VectorScalarProductInterpolator::AssembleElementMatrix2(
-    const FiniteElement &dom_fe,
-    const FiniteElement &ran_fe,
-    ElementTransformation &Trans,
-@@ -4398,8 +4353,7 @@ VectorScalarProductInterpolator::AssembleElementMatrix2(
-    ran_fe.ProjectMatrixCoefficient(dom_shape_coeff, Trans, elmat_as_vec);
- }
- 
--void
--ScalarCrossProductInterpolator::AssembleElementMatrix2(
-+void ScalarCrossProductInterpolator::AssembleElementMatrix2(
-    const FiniteElement &dom_fe,
-    const FiniteElement &ran_fe,
-    ElementTransformation &Trans,
-@@ -4440,8 +4394,7 @@ ScalarCrossProductInterpolator::AssembleElementMatrix2(
-    ran_fe.Project(dom_shape_coeff, Trans, elmat_as_vec);
- }
- 
--void
--VectorCrossProductInterpolator::AssembleElementMatrix2(
-+void VectorCrossProductInterpolator::AssembleElementMatrix2(
-    const FiniteElement &dom_fe,
-    const FiniteElement &ran_fe,
-    ElementTransformation &Trans,
-diff --git a/fem/bilininteg.hpp b/fem/bilininteg.hpp
-index 209898714..598da405d 100644
---- a/fem/bilininteg.hpp
-+++ b/fem/bilininteg.hpp
-@@ -41,10 +41,6 @@ public:
-    // TODO: add support for other assembly levels (in addition to PA) and their
-    // actions.
- 
--   // TODO: for mixed meshes the quadrature rules to be used by methods like
--   // AssemblePA() can be given as a QuadratureSpace, e.g. using a new method:
--   // SetQuadratureSpace().
--
-    // TODO: the methods for the various assembly levels make sense even in the
-    // base class NonlinearFormIntegrator, except that not all assembly levels
-    // make sense for the action of the nonlinear operator (but they all make
-@@ -159,11 +155,11 @@ public:
- 
-    /** Abstract method used for assembling TraceFaceIntegrators in a
-        MixedBilinearForm. */
--   virtual void AssembleFaceMatrix(const FiniteElement &trial_face_fe,
--                                   const FiniteElement &test_fe1,
--                                   const FiniteElement &test_fe2,
--                                   FaceElementTransformations &Trans,
--                                   DenseMatrix &elmat);
-+   virtual void AssembleFaceMatrix2(const FiniteElement &trial_face_fe,
-+                                    const FiniteElement &test_fe1,
-+                                    const FiniteElement &test_fe2,
-+                                    FaceElementTransformations &Trans,
-+                                    DenseMatrix &elmat);
- 
-    /** Abstract method used for assembling TraceFaceIntegrators for
-        DPG weak formulations. */
-@@ -173,7 +169,6 @@ public:
-                                         FaceElementTransformations &Trans,
-                                         DenseMatrix &elmat);
- 
--
-    /// @brief Perform the local action of the BilinearFormIntegrator.
-    /// Note that the default implementation in the base class is general but not
-    /// efficient.
-@@ -282,7 +277,12 @@ public:
-    TransposeIntegrator(BilinearFormIntegrator *bfi_, bool own_bfi_ = true)
-    { bfi = bfi_; own_bfi = own_bfi_; }
- 
--   virtual void SetIntRule(const IntegrationRule *ir);
-+   virtual bool SupportsCeed() const { return bfi->SupportsCeed(); }
-+
-+   virtual void SetIntRule(const IntegrationRule *ir)
-+   {
-+      IntRule = ir; bfi->SetIntRule(ir);
-+   }
- 
-    virtual void AssembleElementMatrix(const FiniteElement &el,
-                                       ElementTransformation &Trans,
-@@ -293,7 +293,6 @@ public:
-                                        ElementTransformation &Trans,
-                                        DenseMatrix &elmat);
- 
--   using BilinearFormIntegrator::AssembleFaceMatrix;
-    virtual void AssembleFaceMatrix(const FiniteElement &el1,
-                                    const FiniteElement &el2,
-                                    FaceElementTransformations &Trans,
-@@ -362,7 +361,12 @@ public:
-    LumpedIntegrator(BilinearFormIntegrator *bfi_, bool own_bfi_ = true)
-    { bfi = bfi_; own_bfi = own_bfi_; }
- 
--   virtual void SetIntRule(const IntegrationRule *ir);
-+   virtual bool SupportsCeed() const { return bfi->SupportsCeed(); }
-+
-+   virtual void SetIntRule(const IntegrationRule *ir)
-+   {
-+      IntRule = ir; bfi->SetIntRule(ir);
-+   }
- 
-    virtual void AssembleElementMatrix(const FiniteElement &el,
-                                       ElementTransformation &Trans,
-@@ -375,57 +379,64 @@ public:
- class InverseIntegrator : public BilinearFormIntegrator
- {
- private:
--   bool own_integrator;
--   BilinearFormIntegrator *integrator;
-+   bool own_bfi;
-+   BilinearFormIntegrator *bfi;
- 
- public:
--   InverseIntegrator(BilinearFormIntegrator *integ, bool own_integ = 1)
--   { integrator = integ; own_integrator = own_integ; }
-+   InverseIntegrator(BilinearFormIntegrator *bfi_, bool own_bfi_ = true)
-+   { bfi = bfi_; own_bfi = own_bfi_; }
- 
--   virtual void SetIntRule(const IntegrationRule *ir);
-+   virtual bool SupportsCeed() const { return bfi->SupportsCeed(); }
-+
-+   virtual void SetIntRule(const IntegrationRule *ir)
-+   {
-+      IntRule = ir; bfi->SetIntRule(ir);
-+   }
- 
-    virtual void AssembleElementMatrix(const FiniteElement &el,
-                                       ElementTransformation &Trans,
-                                       DenseMatrix &elmat);
- 
--   virtual ~InverseIntegrator() { if (own_integrator) { delete integrator; } }
-+   virtual ~InverseIntegrator() { if (own_bfi) { delete bfi; } }
- };
- 
- /// Integrator defining a sum of multiple Integrators.
- class SumIntegrator : public BilinearFormIntegrator
- {
- private:
--   bool own_integrators;
--   mutable DenseMatrix elem_mat;
--   Array<BilinearFormIntegrator*> integrators;
-+   bool own_bfis;
-+   Array<BilinearFormIntegrator*> bfis;
-+   DenseMatrix bfi_elmat;
- 
- public:
--   SumIntegrator(bool own_integs = true) { own_integrators = own_integs; }
-+   SumIntegrator(bool own_bfis_ = true) { own_bfis = own_bfis_; }
- 
--   virtual void SetIntRule(const IntegrationRule *ir);
-+   void AddIntegrator(BilinearFormIntegrator *bfi)
-+   { bfis.Append(bfi); }
-+
-+   virtual bool SupportsCeed() const;
- 
--   void AddIntegrator(BilinearFormIntegrator *integ)
--   { integrators.Append(integ); }
-+   virtual void SetIntRule(const IntegrationRule *ir);
- 
-    virtual void AssembleElementMatrix(const FiniteElement &el,
-                                       ElementTransformation &Trans,
-                                       DenseMatrix &elmat);
-+
-    virtual void AssembleElementMatrix2(const FiniteElement &trial_fe,
-                                        const FiniteElement &test_fe,
-                                        ElementTransformation &Trans,
-                                        DenseMatrix &elmat);
- 
--   using BilinearFormIntegrator::AssembleFaceMatrix;
-    virtual void AssembleFaceMatrix(const FiniteElement &el1,
-                                    const FiniteElement &el2,
-                                    FaceElementTransformations &Trans,
-                                    DenseMatrix &elmat);
- 
--   virtual void AssembleFaceMatrix(const FiniteElement &trial_face_fe,
--                                   const FiniteElement &test_fe1,
--                                   const FiniteElement &test_fe2,
--                                   FaceElementTransformations &Trans,
--                                   DenseMatrix &elmat);
-+   virtual void AssembleFaceMatrix2(const FiniteElement &trial_face_fe,
-+                                    const FiniteElement &test_fe1,
-+                                    const FiniteElement &test_fe2,
-+                                    FaceElementTransformations &Trans,
-+                                    DenseMatrix &elmat);
- 
-    virtual void AssemblePA(const FiniteElementSpace &fes);
-    virtual void AssemblePA(const FiniteElementSpace &trial_fes,
-@@ -469,6 +480,11 @@ public:
- class MixedScalarIntegrator: public BilinearFormIntegrator
- {
- public:
-+   using NonlinearFormIntegrator::GetRule;
-+   virtual const IntegrationRule &GetRule(const FiniteElement &trial_fe,
-+                                          const FiniteElement &test_fe,
-+                                          ElementTransformation &Trans) const;
-+
-    virtual void AssembleElementMatrix2(const FiniteElement &trial_fe,
-                                        const FiniteElement &test_fe,
-                                        ElementTransformation &Trans,
-@@ -486,39 +502,40 @@ protected:
-    /// result if given the same FiniteElement. The default is false.
-    bool same_calc_shape;
- 
-+   Coefficient *Q;
-+
-    MixedScalarIntegrator() : same_calc_shape(false), Q(NULL) {}
-    MixedScalarIntegrator(Coefficient &q) : same_calc_shape(false), Q(&q) {}
- 
--   inline virtual bool VerifyFiniteElementTypes(
--      const FiniteElement &trial_fe, const FiniteElement &test_fe) const
-+   virtual bool VerifyFiniteElementTypes(
-+      const FiniteElement &trial_fe,
-+      const FiniteElement &test_fe) const
-    {
-       return (trial_fe.GetRangeType() == mfem::FiniteElement::SCALAR &&
--              test_fe.GetRangeType()  == mfem::FiniteElement::SCALAR );
-+              test_fe.GetRangeType()  == mfem::FiniteElement::SCALAR);
-    }
- 
--   inline virtual const char *FiniteElementTypeFailureMessage() const
-+   virtual const char *FiniteElementTypeFailureMessage() const
-    {
-       return "MixedScalarIntegrator:  "
-              "Trial and test spaces must both be scalar fields.";
-    }
- 
--   inline virtual int GetIntegrationOrder(const FiniteElement &trial_fe,
--                                          const FiniteElement &test_fe,
--                                          ElementTransformation &Trans)
-+   virtual int GetIntegrationOrder(const FiniteElement &trial_fe,
-+                                   const FiniteElement &test_fe,
-+                                   ElementTransformation &Trans) const
-    { return trial_fe.GetOrder() + test_fe.GetOrder() + Trans.OrderW(); }
- 
--   inline virtual void CalcTestShape(const FiniteElement &test_fe,
--                                     ElementTransformation &Trans,
--                                     Vector &shape)
-+   virtual void CalcTestShape(const FiniteElement &test_fe,
-+                              ElementTransformation &Trans,
-+                              Vector &shape)
-    { test_fe.CalcPhysShape(Trans, shape); }
- 
--   inline virtual void CalcTrialShape(const FiniteElement &trial_fe,
--                                      ElementTransformation &Trans,
--                                      Vector &shape)
-+   virtual void CalcTrialShape(const FiniteElement &trial_fe,
-+                               ElementTransformation &Trans,
-+                               Vector &shape)
-    { trial_fe.CalcPhysShape(Trans, shape); }
- 
--   Coefficient *Q;
--
- private:
- #ifndef MFEM_THREAD_SAFE
-    Vector test_shape, trial_shape;
-@@ -530,6 +547,11 @@ private:
- class MixedVectorIntegrator: public BilinearFormIntegrator
- {
- public:
-+   using NonlinearFormIntegrator::GetRule;
-+   virtual const IntegrationRule &GetRule(const FiniteElement &trial_fe,
-+                                          const FiniteElement &test_fe,
-+                                          ElementTransformation &Trans) const;
-+
-    virtual void AssembleElementMatrix2(const FiniteElement &trial_fe,
-                                        const FiniteElement &test_fe,
-                                        ElementTransformation &Trans,
-@@ -547,6 +569,12 @@ protected:
-    /// result if given the same FiniteElement. The default is false.
-    bool same_calc_shape;
- 
-+   int space_dim;
-+   Coefficient *Q;
-+   VectorCoefficient *VQ;
-+   DiagonalMatrixCoefficient *DQ;
-+   MatrixCoefficient *MQ;
-+
-    MixedVectorIntegrator()
-       : same_calc_shape(false), Q(NULL), VQ(NULL), DQ(NULL), MQ(NULL) {}
-    MixedVectorIntegrator(Coefficient &q)
-@@ -557,47 +585,41 @@ protected:
-    MixedVectorIntegrator(MatrixCoefficient &mq)
-       : same_calc_shape(false), Q(NULL), VQ(NULL), DQ(NULL), MQ(&mq) {}
- 
--   inline virtual bool VerifyFiniteElementTypes(
-+   virtual bool VerifyFiniteElementTypes(
-       const FiniteElement &trial_fe,
-       const FiniteElement &test_fe) const
-    {
-       return (trial_fe.GetRangeType() == mfem::FiniteElement::VECTOR &&
--              test_fe.GetRangeType()  == mfem::FiniteElement::VECTOR );
-+              test_fe.GetRangeType()  == mfem::FiniteElement::VECTOR);
-    }
- 
--   inline virtual const char *FiniteElementTypeFailureMessage() const
-+   virtual const char *FiniteElementTypeFailureMessage() const
-    {
-       return "MixedVectorIntegrator:  "
-              "Trial and test spaces must both be vector fields";
-    }
- 
--   inline virtual int GetIntegrationOrder(const FiniteElement &trial_fe,
--                                          const FiniteElement &test_fe,
--                                          ElementTransformation &Trans)
-+   virtual int GetIntegrationOrder(const FiniteElement &trial_fe,
-+                                   const FiniteElement &test_fe,
-+                                   ElementTransformation &Trans) const
-    { return trial_fe.GetOrder() + test_fe.GetOrder() + Trans.OrderW(); }
- 
--   inline virtual int GetTestVDim(const FiniteElement &test_fe)
-+   virtual int GetTestVDim(const FiniteElement &test_fe)
-    { return std::max(space_dim, test_fe.GetVDim()); }
- 
--   inline virtual void CalcTestShape(const FiniteElement &test_fe,
--                                     ElementTransformation &Trans,
--                                     DenseMatrix &shape)
-+   virtual void CalcTestShape(const FiniteElement &test_fe,
-+                              ElementTransformation &Trans,
-+                              DenseMatrix &shape)
-    { test_fe.CalcVShape(Trans, shape); }
- 
--   inline virtual int GetTrialVDim(const FiniteElement &trial_fe)
-+   virtual int GetTrialVDim(const FiniteElement &trial_fe)
-    { return std::max(space_dim, trial_fe.GetVDim()); }
- 
--   inline virtual void CalcTrialShape(const FiniteElement &trial_fe,
--                                      ElementTransformation &Trans,
--                                      DenseMatrix &shape)
-+   virtual void CalcTrialShape(const FiniteElement &trial_fe,
-+                               ElementTransformation &Trans,
-+                               DenseMatrix &shape)
-    { trial_fe.CalcVShape(Trans, shape); }
- 
--   int space_dim;
--   Coefficient *Q;
--   VectorCoefficient *VQ;
--   DiagonalMatrixCoefficient *DQ;
--   MatrixCoefficient *MQ;
--
- private:
- #ifndef MFEM_THREAD_SAFE
-    Vector V, D;
-@@ -611,6 +633,11 @@ private:
- class MixedScalarVectorIntegrator: public BilinearFormIntegrator
- {
- public:
-+   using NonlinearFormIntegrator::GetRule;
-+   virtual const IntegrationRule &GetRule(const FiniteElement &trial_fe,
-+                                          const FiniteElement &test_fe,
-+                                          ElementTransformation &Trans) const;
-+
-    virtual void AssembleElementMatrix2(const FiniteElement &trial_fe,
-                                        const FiniteElement &test_fe,
-                                        ElementTransformation &Trans,
-@@ -627,24 +654,28 @@ public:
-    { AssembleElementMatrix2(fe, fe, Trans, elmat); }
- 
- protected:
-+   VectorCoefficient *VQ;
-+   int space_dim;
-+   bool transpose;
-+   bool cross_2d;  // In 2D use a cross product rather than a dot product
-+
-    MixedScalarVectorIntegrator(VectorCoefficient &vq, bool transpose_ = false,
-                                bool cross_2d_ = false)
-       : VQ(&vq), transpose(transpose_), cross_2d(cross_2d_) {}
- 
--   inline virtual bool VerifyFiniteElementTypes(
-+   virtual bool VerifyFiniteElementTypes(
-       const FiniteElement &trial_fe,
-       const FiniteElement &test_fe) const
-    {
-       return ((transpose &&
-                trial_fe.GetRangeType() == mfem::FiniteElement::VECTOR &&
--               test_fe.GetRangeType()  == mfem::FiniteElement::SCALAR ) ||
-+               test_fe.GetRangeType()  == mfem::FiniteElement::SCALAR) ||
-               (!transpose &&
-                trial_fe.GetRangeType() == mfem::FiniteElement::SCALAR &&
--               test_fe.GetRangeType()  == mfem::FiniteElement::VECTOR )
--             );
-+               test_fe.GetRangeType()  == mfem::FiniteElement::VECTOR));
-    }
- 
--   inline virtual const char *FiniteElementTypeFailureMessage() const
-+   virtual const char *FiniteElementTypeFailureMessage() const
-    {
-       if (transpose)
-       {
-@@ -660,35 +691,28 @@ protected:
-       }
-    }
- 
--   inline virtual int GetIntegrationOrder(const FiniteElement &trial_fe,
--                                          const FiniteElement &test_fe,
--                                          ElementTransformation &Trans)
-+   virtual int GetIntegrationOrder(const FiniteElement &trial_fe,
-+                                   const FiniteElement &test_fe,
-+                                   ElementTransformation &Trans) const
-    { return trial_fe.GetOrder() + test_fe.GetOrder() + Trans.OrderW(); }
- 
--   inline virtual int GetVDim(const FiniteElement &vector_fe)
-+   virtual int GetVDim(const FiniteElement &vector_fe) const
-    { return std::max(space_dim, vector_fe.GetVDim()); }
- 
--   inline virtual void CalcVShape(const FiniteElement &vector_fe,
--                                  ElementTransformation &Trans,
--                                  DenseMatrix &shape_)
-+   virtual void CalcVShape(const FiniteElement &vector_fe,
-+                           ElementTransformation &Trans,
-+                           DenseMatrix &shape_)
-    { vector_fe.CalcVShape(Trans, shape_); }
- 
--   inline virtual void CalcShape(const FiniteElement &scalar_fe,
--                                 ElementTransformation &Trans,
--                                 Vector &shape_)
-+   virtual void CalcShape(const FiniteElement &scalar_fe,
-+                          ElementTransformation &Trans,
-+                          Vector &shape_)
-    { scalar_fe.CalcPhysShape(Trans, shape_); }
- 
--   VectorCoefficient *VQ;
--   int space_dim;
--   bool transpose;
--   bool cross_2d;  // In 2D use a cross product rather than a dot product
--
- private:
- #ifndef MFEM_THREAD_SAFE
--   Vector V;
-+   Vector V, shape, vshape_tmp;
-    DenseMatrix vshape;
--   Vector      shape;
--   Vector      vshape_tmp;
- #endif
- };
- 
-@@ -723,25 +747,25 @@ public:
-       : MixedScalarIntegrator(q) {}
- 
- protected:
--   inline virtual bool VerifyFiniteElementTypes(
-+   virtual bool VerifyFiniteElementTypes(
-       const FiniteElement &trial_fe,
-       const FiniteElement &test_fe) const
-    {
-       return (trial_fe.GetDim() == 1 && test_fe.GetDim() == 1 &&
--              trial_fe.GetDerivType() == mfem::FiniteElement::GRAD  &&
--              test_fe.GetRangeType()  == mfem::FiniteElement::SCALAR );
-+              trial_fe.GetDerivType() == mfem::FiniteElement::GRAD &&
-+              test_fe.GetRangeType()  == mfem::FiniteElement::SCALAR);
-    }
- 
--   inline virtual const char *FiniteElementTypeFailureMessage() const
-+   virtual const char *FiniteElementTypeFailureMessage() const
-    {
-       return "MixedScalarDerivativeIntegrator:  "
-              "Trial and test spaces must both be scalar fields in 1D "
-              "and the trial space must implement CalcDShape.";
-    }
- 
--   inline virtual void CalcTrialShape(const FiniteElement &trial_fe,
--                                      ElementTransformation &Trans,
--                                      Vector &shape)
-+   virtual void CalcTrialShape(const FiniteElement &trial_fe,
-+                               ElementTransformation &Trans,
-+                               Vector &shape)
-    {
-       DenseMatrix dshape(shape.GetData(), shape.Size(), 1);
-       trial_fe.CalcPhysDShape(Trans, dshape);
-@@ -758,16 +782,16 @@ public:
-       : MixedScalarIntegrator(q) {}
- 
- protected:
--   inline virtual bool VerifyFiniteElementTypes(
-+   virtual bool VerifyFiniteElementTypes(
-       const FiniteElement &trial_fe,
-       const FiniteElement &test_fe) const
-    {
-       return (trial_fe.GetDim() == 1 && test_fe.GetDim() == 1 &&
-               trial_fe.GetRangeType() == mfem::FiniteElement::SCALAR &&
--              test_fe.GetDerivType()  == mfem::FiniteElement::GRAD );
-+              test_fe.GetDerivType()  == mfem::FiniteElement::GRAD);
-    }
- 
--   inline virtual const char *FiniteElementTypeFailureMessage() const
-+   virtual const char *FiniteElementTypeFailureMessage() const
-    {
-       return "MixedScalarWeakDerivativeIntegrator:  "
-              "Trial and test spaces must both be scalar fields in 1D "
-@@ -775,9 +799,9 @@ protected:
-              "map type \"VALUE\".";
-    }
- 
--   inline virtual void CalcTestShape(const FiniteElement &test_fe,
--                                     ElementTransformation &Trans,
--                                     Vector &shape)
-+   virtual void CalcTestShape(const FiniteElement &test_fe,
-+                              ElementTransformation &Trans,
-+                              Vector &shape)
-    {
-       DenseMatrix dshape(shape.GetData(), shape.Size(), 1);
-       test_fe.CalcPhysDShape(Trans, dshape);
-@@ -796,29 +820,29 @@ public:
-       : MixedScalarIntegrator(q) {}
- 
- protected:
--   inline virtual bool VerifyFiniteElementTypes(
-+   virtual bool VerifyFiniteElementTypes(
-       const FiniteElement &trial_fe,
-       const FiniteElement &test_fe) const
-    {
--      return (trial_fe.GetDerivType() == mfem::FiniteElement::DIV  &&
--              test_fe.GetRangeType()  == mfem::FiniteElement::SCALAR );
-+      return (trial_fe.GetDerivType() == mfem::FiniteElement::DIV &&
-+              test_fe.GetRangeType()  == mfem::FiniteElement::SCALAR);
-    }
- 
--   inline virtual const char *FiniteElementTypeFailureMessage() const
-+   virtual const char *FiniteElementTypeFailureMessage() const
-    {
-       return "MixedScalarDivergenceIntegrator:  "
-              "Trial must be H(Div) and the test space must be a "
-              "scalar field";
-    }
- 
--   inline virtual int GetIntegrationOrder(const FiniteElement &trial_fe,
--                                          const FiniteElement &test_fe,
--                                          ElementTransformation &Trans)
-+   virtual int GetIntegrationOrder(const FiniteElement &trial_fe,
-+                                   const FiniteElement &test_fe,
-+                                   ElementTransformation &Trans) const
-    { return trial_fe.GetOrder() + test_fe.GetOrder() + Trans.OrderW() - 1; }
- 
--   inline virtual void CalcTrialShape(const FiniteElement &trial_fe,
--                                      ElementTransformation &Trans,
--                                      Vector &shape)
-+   virtual void CalcTrialShape(const FiniteElement &trial_fe,
-+                               ElementTransformation &Trans,
-+                               Vector &shape)
-    { trial_fe.CalcPhysDivShape(Trans, shape); }
- };
- 
-@@ -832,15 +856,15 @@ public:
-       : MixedScalarVectorIntegrator(vq) {}
- 
- protected:
--   inline virtual bool VerifyFiniteElementTypes(
-+   virtual bool VerifyFiniteElementTypes(
-       const FiniteElement &trial_fe,
-       const FiniteElement &test_fe) const
-    {
--      return (trial_fe.GetDerivType() == mfem::FiniteElement::DIV  &&
--              test_fe.GetRangeType()  == mfem::FiniteElement::VECTOR );
-+      return (trial_fe.GetDerivType() == mfem::FiniteElement::DIV &&
-+              test_fe.GetRangeType()  == mfem::FiniteElement::VECTOR);
-    }
- 
--   inline virtual const char *FiniteElementTypeFailureMessage() const
-+   virtual const char *FiniteElementTypeFailureMessage() const
-    {
-       return "MixedVectorDivergenceIntegrator:  "
-              "Trial must be H(Div) and the test space must be a "
-@@ -849,14 +873,14 @@ protected:
- 
-    // Subtract one due to the divergence and add one for the coefficient
-    // which is assumed to be at least linear.
--   inline virtual int GetIntegrationOrder(const FiniteElement &trial_fe,
--                                          const FiniteElement &test_fe,
--                                          ElementTransformation &Trans)
-+   virtual int GetIntegrationOrder(const FiniteElement &trial_fe,
-+                                   const FiniteElement &test_fe,
-+                                   ElementTransformation &Trans) const
-    { return trial_fe.GetOrder() + test_fe.GetOrder() + Trans.OrderW() - 1 + 1; }
- 
--   inline virtual void CalcShape(const FiniteElement &scalar_fe,
--                                 ElementTransformation &Trans,
--                                 Vector &shape)
-+   virtual void CalcShape(const FiniteElement &scalar_fe,
-+                          ElementTransformation &Trans,
-+                          Vector &shape)
-    { scalar_fe.CalcPhysDivShape(Trans, shape); }
- };
- 
-@@ -871,24 +895,24 @@ public:
-       : MixedScalarIntegrator(q) {}
- 
- protected:
--   inline virtual bool VerifyFiniteElementTypes(
-+   virtual bool VerifyFiniteElementTypes(
-       const FiniteElement &trial_fe,
-       const FiniteElement &test_fe) const
-    {
-       return (trial_fe.GetRangeType() == mfem::FiniteElement::SCALAR &&
--              test_fe.GetDerivType()  == mfem::FiniteElement::DIV );
-+              test_fe.GetDerivType()  == mfem::FiniteElement::DIV);
-    }
- 
--   inline virtual const char *FiniteElementTypeFailureMessage() const
-+   virtual const char *FiniteElementTypeFailureMessage() const
-    {
-       return "MixedScalarWeakGradientIntegrator:  "
-              "Trial space must be a scalar field "
-              "and the test space must be H(Div)";
-    }
- 
--   inline virtual int GetIntegrationOrder(const FiniteElement &trial_fe,
--                                          const FiniteElement &test_fe,
--                                          ElementTransformation &Trans)
-+   virtual int GetIntegrationOrder(const FiniteElement &trial_fe,
-+                                   const FiniteElement &test_fe,
-+                                   ElementTransformation &Trans) const
-    { return trial_fe.GetOrder() + test_fe.GetOrder() + Trans.OrderW() - 1; }
- 
-    virtual void CalcTestShape(const FiniteElement &test_fe,
-@@ -911,7 +935,7 @@ public:
-       : MixedScalarIntegrator(q) {}
- 
- protected:
--   inline virtual bool VerifyFiniteElementTypes(
-+   virtual bool VerifyFiniteElementTypes(
-       const FiniteElement &trial_fe,
-       const FiniteElement &test_fe) const
-    {
-@@ -920,21 +944,21 @@ protected:
-               test_fe.GetRangeType()  == mfem::FiniteElement::SCALAR);
-    }
- 
--   inline virtual const char *FiniteElementTypeFailureMessage() const
-+   virtual const char *FiniteElementTypeFailureMessage() const
-    {
-       return "MixedScalarCurlIntegrator:  "
-              "Trial must be H(Curl) and the test space must be a "
-              "scalar field";
-    }
- 
--   inline virtual int GetIntegrationOrder(const FiniteElement &trial_fe,
--                                          const FiniteElement &test_fe,
--                                          ElementTransformation &Trans)
-+   virtual int GetIntegrationOrder(const FiniteElement &trial_fe,
-+                                   const FiniteElement &test_fe,
-+                                   ElementTransformation &Trans) const
-    { return trial_fe.GetOrder() + test_fe.GetOrder() + Trans.OrderW() - 1; }
- 
--   inline virtual void CalcTrialShape(const FiniteElement &trial_fe,
--                                      ElementTransformation &Trans,
--                                      Vector &shape)
-+   virtual void CalcTrialShape(const FiniteElement &trial_fe,
-+                               ElementTransformation &Trans,
-+                               Vector &shape)
-    {
-       DenseMatrix dshape(shape.GetData(), shape.Size(), 1);
-       trial_fe.CalcPhysCurlShape(Trans, dshape);
-@@ -968,25 +992,25 @@ public:
-       : MixedScalarIntegrator(q) {}
- 
- protected:
--   inline virtual bool VerifyFiniteElementTypes(
-+   virtual bool VerifyFiniteElementTypes(
-       const FiniteElement &trial_fe,
-       const FiniteElement &test_fe) const
-    {
-       return (trial_fe.GetDim() == 2 && test_fe.GetDim() == 2 &&
-               trial_fe.GetRangeType() == mfem::FiniteElement::SCALAR &&
--              test_fe.GetDerivType()  == mfem::FiniteElement::CURL );
-+              test_fe.GetDerivType()  == mfem::FiniteElement::CURL);
-    }
- 
--   inline virtual const char *FiniteElementTypeFailureMessage() const
-+   virtual const char *FiniteElementTypeFailureMessage() const
-    {
-       return "MixedScalarWeakCurlIntegrator:  "
-              "Trial space must be a scalar field "
-              "and the test space must be H(Curl)";
-    }
- 
--   inline virtual void CalcTestShape(const FiniteElement &test_fe,
--                                     ElementTransformation &Trans,
--                                     Vector &shape)
-+   virtual void CalcTestShape(const FiniteElement &test_fe,
-+                              ElementTransformation &Trans,
-+                              Vector &shape)
-    {
-       DenseMatrix dshape(shape.GetData(), shape.Size(), 1);
-       test_fe.CalcPhysCurlShape(Trans, dshape);
-@@ -1026,15 +1050,15 @@ public:
-    MixedDotProductIntegrator(VectorCoefficient &vq)
-       : MixedScalarVectorIntegrator(vq, true) {}
- 
--   inline virtual bool VerifyFiniteElementTypes(
-+   virtual bool VerifyFiniteElementTypes(
-       const FiniteElement &trial_fe,
-       const FiniteElement &test_fe) const
-    {
-       return (trial_fe.GetRangeType() == mfem::FiniteElement::VECTOR &&
--              test_fe.GetRangeType()  == mfem::FiniteElement::SCALAR );
-+              test_fe.GetRangeType()  == mfem::FiniteElement::SCALAR);
-    }
- 
--   inline virtual const char *FiniteElementTypeFailureMessage() const
-+   virtual const char *FiniteElementTypeFailureMessage() const
-    {
-       return "MixedDotProductIntegrator:  "
-              "Trial space must be a vector field "
-@@ -1051,16 +1075,16 @@ public:
-    MixedWeakGradDotIntegrator(VectorCoefficient &vq)
-       : MixedScalarVectorIntegrator(vq, true) {}
- 
--   inline virtual bool VerifyFiniteElementTypes(
-+   virtual bool VerifyFiniteElementTypes(
-       const FiniteElement &trial_fe,
-       const FiniteElement &test_fe) const
-    {
-       return (trial_fe.GetRangeType() == mfem::FiniteElement::VECTOR &&
-               test_fe.GetRangeType()  == mfem::FiniteElement::VECTOR &&
--              test_fe.GetDerivType()  == mfem::FiniteElement::DIV );
-+              test_fe.GetDerivType()  == mfem::FiniteElement::DIV);
-    }
- 
--   inline virtual const char *FiniteElementTypeFailureMessage() const
-+   virtual const char *FiniteElementTypeFailureMessage() const
-    {
-       return "MixedWeakGradDotIntegrator:  "
-              "Trial space must be a vector field "
-@@ -1069,14 +1093,14 @@ public:
- 
-    // Subtract one due to the gradient and add one for the coefficient
-    // which is assumed to be at least linear.
--   inline virtual int GetIntegrationOrder(const FiniteElement &trial_fe,
--                                          const FiniteElement &test_fe,
--                                          ElementTransformation &Trans)
-+   virtual int GetIntegrationOrder(const FiniteElement &trial_fe,
-+                                   const FiniteElement &test_fe,
-+                                   ElementTransformation &Trans) const
-    { return trial_fe.GetOrder() + test_fe.GetOrder() + Trans.OrderW() - 1 + 1; }
- 
--   inline virtual void CalcShape(const FiniteElement &scalar_fe,
--                                 ElementTransformation &Trans,
--                                 Vector &shape)
-+   virtual void CalcShape(const FiniteElement &scalar_fe,
-+                          ElementTransformation &Trans,
-+                          Vector &shape)
-    { scalar_fe.CalcPhysDivShape(Trans, shape); shape *= -1.0; }
- };
- 
-@@ -1088,29 +1112,29 @@ public:
-    MixedWeakDivCrossIntegrator(VectorCoefficient &vq)
-       : MixedVectorIntegrator(vq, false) {}
- 
--   inline virtual bool VerifyFiniteElementTypes(
-+   virtual bool VerifyFiniteElementTypes(
-       const FiniteElement &trial_fe,
-       const FiniteElement &test_fe) const
-    {
-       return (trial_fe.GetVDim() == 3 &&
-               trial_fe.GetRangeType() == mfem::FiniteElement::VECTOR &&
-               test_fe.GetRangeType()  == mfem::FiniteElement::SCALAR &&
--              test_fe.GetDerivType()  == mfem::FiniteElement::GRAD );
-+              test_fe.GetDerivType()  == mfem::FiniteElement::GRAD);
-    }
- 
--   inline virtual const char *FiniteElementTypeFailureMessage() const
-+   virtual const char *FiniteElementTypeFailureMessage() const
-    {
-       return "MixedWeakDivCrossIntegrator:  "
-              "Trial space must be a vector field in 3D "
-              "and the test space must be a scalar field with a gradient";
-    }
- 
--   inline virtual int GetTestVDim(const FiniteElement &test_fe)
-+   virtual int GetTestVDim(const FiniteElement &test_fe)
-    { return space_dim; }
- 
--   inline virtual void CalcTestShape(const FiniteElement &test_fe,
--                                     ElementTransformation &Trans,
--                                     DenseMatrix &shape)
-+   virtual void CalcTestShape(const FiniteElement &test_fe,
-+                              ElementTransformation &Trans,
-+                              DenseMatrix &shape)
-    { test_fe.CalcPhysDShape(Trans, shape); shape *= -1.0; }
- };
- 
-@@ -1128,26 +1152,26 @@ public:
-    MixedGradGradIntegrator(MatrixCoefficient &mq)
-       : MixedVectorIntegrator(mq) { same_calc_shape = true; }
- 
--   inline virtual bool VerifyFiniteElementTypes(
-+   virtual bool VerifyFiniteElementTypes(
-       const FiniteElement &trial_fe,
-       const FiniteElement &test_fe) const
-    {
-       return (trial_fe.GetRangeType() == mfem::FiniteElement::SCALAR &&
-               trial_fe.GetDerivType() == mfem::FiniteElement::GRAD &&
-               test_fe.GetRangeType()  == mfem::FiniteElement::SCALAR &&
--              test_fe.GetDerivType()  == mfem::FiniteElement::GRAD );
-+              test_fe.GetDerivType()  == mfem::FiniteElement::GRAD);
-    }
- 
--   inline virtual const char *FiniteElementTypeFailureMessage() const
-+   virtual const char *FiniteElementTypeFailureMessage() const
-    {
-       return "MixedGradGradIntegrator:  "
-              "Trial and test spaces must both be scalar fields "
-              "with a gradient operator.";
-    }
- 
--   inline virtual int GetIntegrationOrder(const FiniteElement &trial_fe,
--                                          const FiniteElement &test_fe,
--                                          ElementTransformation &Trans)
-+   virtual int GetIntegrationOrder(const FiniteElement &trial_fe,
-+                                   const FiniteElement &test_fe,
-+                                   ElementTransformation &Trans) const
-    {
-       // Same as DiffusionIntegrator
-       return test_fe.Space() == FunctionSpace::Pk ?
-@@ -1155,20 +1179,20 @@ public:
-              trial_fe.GetOrder() + test_fe.GetOrder() + test_fe.GetDim() - 1;
-    }
- 
--   inline virtual int GetTrialVDim(const FiniteElement &trial_fe)
-+   virtual int GetTrialVDim(const FiniteElement &trial_fe)
-    { return space_dim; }
- 
--   inline virtual void CalcTrialShape(const FiniteElement &trial_fe,
--                                      ElementTransformation &Trans,
--                                      DenseMatrix &shape)
-+   virtual void CalcTrialShape(const FiniteElement &trial_fe,
-+                               ElementTransformation &Trans,
-+                               DenseMatrix &shape)
-    { trial_fe.CalcPhysDShape(Trans, shape); }
- 
--   inline virtual int GetTestVDim(const FiniteElement &test_fe)
-+   virtual int GetTestVDim(const FiniteElement &test_fe)
-    { return space_dim; }
- 
--   inline virtual void CalcTestShape(const FiniteElement &test_fe,
--                                     ElementTransformation &Trans,
--                                     DenseMatrix &shape)
-+   virtual void CalcTestShape(const FiniteElement &test_fe,
-+                              ElementTransformation &Trans,
-+                              DenseMatrix &shape)
-    { test_fe.CalcPhysDShape(Trans, shape); }
- };
- 
-@@ -1180,37 +1204,37 @@ public:
-    MixedCrossGradGradIntegrator(VectorCoefficient &vq)
-       : MixedVectorIntegrator(vq, false) { same_calc_shape = true; }
- 
--   inline virtual bool VerifyFiniteElementTypes(
-+   virtual bool VerifyFiniteElementTypes(
-       const FiniteElement &trial_fe,
-       const FiniteElement &test_fe) const
-    {
-       return (trial_fe.GetRangeType() == mfem::FiniteElement::SCALAR &&
-               trial_fe.GetDerivType() == mfem::FiniteElement::GRAD &&
-               test_fe.GetRangeType()  == mfem::FiniteElement::SCALAR &&
--              test_fe.GetDerivType()  == mfem::FiniteElement::GRAD );
-+              test_fe.GetDerivType()  == mfem::FiniteElement::GRAD);
-    }
- 
--   inline virtual const char *FiniteElementTypeFailureMessage() const
-+   virtual const char *FiniteElementTypeFailureMessage() const
-    {
-       return "MixedCrossGradGradIntegrator:  "
-              "Trial and test spaces must both be scalar fields "
-              "with a gradient operator.";
-    }
- 
--   inline virtual int GetTrialVDim(const FiniteElement &trial_fe)
-+   virtual int GetTrialVDim(const FiniteElement &trial_fe)
-    { return space_dim; }
- 
--   inline virtual void CalcTrialShape(const FiniteElement &trial_fe,
--                                      ElementTransformation &Trans,
--                                      DenseMatrix &shape)
-+   virtual void CalcTrialShape(const FiniteElement &trial_fe,
-+                               ElementTransformation &Trans,
-+                               DenseMatrix &shape)
-    { trial_fe.CalcPhysDShape(Trans, shape); }
- 
--   inline virtual int GetTestVDim(const FiniteElement &test_fe)
-+   virtual int GetTestVDim(const FiniteElement &test_fe)
-    { return space_dim; }
- 
--   inline virtual void CalcTestShape(const FiniteElement &test_fe,
--                                     ElementTransformation &Trans,
--                                     DenseMatrix &shape)
-+   virtual void CalcTestShape(const FiniteElement &test_fe,
-+                              ElementTransformation &Trans,
-+                              DenseMatrix &shape)
-    { test_fe.CalcPhysDShape(Trans, shape); }
- };
- 
-@@ -1228,7 +1252,7 @@ public:
-    MixedCurlCurlIntegrator(MatrixCoefficient &mq)
-       : MixedVectorIntegrator(mq) { same_calc_shape = true; }
- 
--   inline virtual bool VerifyFiniteElementTypes(
-+   virtual bool VerifyFiniteElementTypes(
-       const FiniteElement &trial_fe,
-       const FiniteElement &test_fe) const
-    {
-@@ -1236,30 +1260,30 @@ public:
-               trial_fe.GetRangeType() == mfem::FiniteElement::VECTOR &&
-               trial_fe.GetDerivType() == mfem::FiniteElement::CURL &&
-               test_fe.GetRangeType()  == mfem::FiniteElement::VECTOR &&
--              test_fe.GetDerivType()  == mfem::FiniteElement::CURL );
-+              test_fe.GetDerivType()  == mfem::FiniteElement::CURL);
-    }
- 
--   inline virtual const char *FiniteElementTypeFailureMessage() const
-+   virtual const char *FiniteElementTypeFailureMessage() const
-    {
-       return "MixedCurlCurlIntegrator"
-              "Trial and test spaces must both be vector fields in 3D "
-              "with a curl.";
-    }
- 
--   inline virtual int GetTrialVDim(const FiniteElement &trial_fe)
-+   virtual int GetTrialVDim(const FiniteElement &trial_fe)
-    { return trial_fe.GetCurlDim(); }
- 
--   inline virtual void CalcTrialShape(const FiniteElement &trial_fe,
--                                      ElementTransformation &Trans,
--                                      DenseMatrix &shape)
-+   virtual void CalcTrialShape(const FiniteElement &trial_fe,
-+                               ElementTransformation &Trans,
-+                               DenseMatrix &shape)
-    { trial_fe.CalcPhysCurlShape(Trans, shape); }
- 
--   inline virtual int GetTestVDim(const FiniteElement &test_fe)
-+   virtual int GetTestVDim(const FiniteElement &test_fe)
-    { return test_fe.GetCurlDim(); }
- 
--   inline virtual void CalcTestShape(const FiniteElement &test_fe,
--                                     ElementTransformation &Trans,
--                                     DenseMatrix &shape)
-+   virtual void CalcTestShape(const FiniteElement &test_fe,
-+                              ElementTransformation &Trans,
-+                              DenseMatrix &shape)
-    { test_fe.CalcPhysCurlShape(Trans, shape); }
- };
- 
-@@ -1271,7 +1295,7 @@ public:
-    MixedCrossCurlCurlIntegrator(VectorCoefficient &vq)
-       : MixedVectorIntegrator(vq, false) { same_calc_shape = true; }
- 
--   inline virtual bool VerifyFiniteElementTypes(
-+   virtual bool VerifyFiniteElementTypes(
-       const FiniteElement &trial_fe,
-       const FiniteElement &test_fe) const
-    {
-@@ -1280,30 +1304,30 @@ public:
-               trial_fe.GetRangeType() == mfem::FiniteElement::VECTOR &&
-               trial_fe.GetDerivType() == mfem::FiniteElement::CURL &&
-               test_fe.GetRangeType()  == mfem::FiniteElement::VECTOR &&
--              test_fe.GetDerivType()  == mfem::FiniteElement::CURL );
-+              test_fe.GetDerivType()  == mfem::FiniteElement::CURL);
-    }
- 
--   inline virtual const char *FiniteElementTypeFailureMessage() const
-+   virtual const char *FiniteElementTypeFailureMessage() const
-    {
-       return "MixedCrossCurlCurlIntegrator:  "
-              "Trial and test spaces must both be vector fields in 3D "
-              "with a curl.";
-    }
- 
--   inline virtual int GetTrialVDim(const FiniteElement &trial_fe)
-+   virtual int GetTrialVDim(const FiniteElement &trial_fe)
-    { return trial_fe.GetCurlDim(); }
- 
--   inline virtual void CalcTrialShape(const FiniteElement &trial_fe,
--                                      ElementTransformation &Trans,
--                                      DenseMatrix &shape)
-+   virtual void CalcTrialShape(const FiniteElement &trial_fe,
-+                               ElementTransformation &Trans,
-+                               DenseMatrix &shape)
-    { trial_fe.CalcPhysCurlShape(Trans, shape); }
- 
--   inline virtual int GetTestVDim(const FiniteElement &test_fe)
-+   virtual int GetTestVDim(const FiniteElement &test_fe)
-    { return test_fe.GetCurlDim(); }
- 
--   inline virtual void CalcTestShape(const FiniteElement &test_fe,
--                                     ElementTransformation &Trans,
--                                     DenseMatrix &shape)
-+   virtual void CalcTestShape(const FiniteElement &test_fe,
-+                              ElementTransformation &Trans,
-+                              DenseMatrix &shape)
-    { test_fe.CalcPhysCurlShape(Trans, shape); }
- };
- 
-@@ -1315,7 +1339,7 @@ public:
-    MixedCrossCurlGradIntegrator(VectorCoefficient &vq)
-       : MixedVectorIntegrator(vq, false) {}
- 
--   inline virtual bool VerifyFiniteElementTypes(
-+   virtual bool VerifyFiniteElementTypes(
-       const FiniteElement &trial_fe,
-       const FiniteElement &test_fe) const
-    {
-@@ -1323,30 +1347,30 @@ public:
-               trial_fe.GetRangeType() == mfem::FiniteElement::VECTOR &&
-               trial_fe.GetDerivType() == mfem::FiniteElement::CURL &&
-               test_fe.GetRangeType()  == mfem::FiniteElement::SCALAR &&
--              test_fe.GetDerivType()  == mfem::FiniteElement::GRAD );
-+              test_fe.GetDerivType()  == mfem::FiniteElement::GRAD);
-    }
- 
--   inline virtual const char *FiniteElementTypeFailureMessage() const
-+   virtual const char *FiniteElementTypeFailureMessage() const
-    {
-       return "MixedCrossCurlGradIntegrator"
-              "Trial space must be a vector field in 3D with a curl"
-              "and the test space must be a scalar field with a gradient";
-    }
- 
--   inline virtual int GetTrialVDim(const FiniteElement &trial_fe)
-+   virtual int GetTrialVDim(const FiniteElement &trial_fe)
-    { return trial_fe.GetCurlDim(); }
- 
--   inline virtual void CalcTrialShape(const FiniteElement &trial_fe,
--                                      ElementTransformation &Trans,
--                                      DenseMatrix &shape)
-+   virtual void CalcTrialShape(const FiniteElement &trial_fe,
-+                               ElementTransformation &Trans,
-+                               DenseMatrix &shape)
-    { trial_fe.CalcPhysCurlShape(Trans, shape); }
- 
--   inline virtual int GetTestVDim(const FiniteElement &test_fe)
-+   virtual int GetTestVDim(const FiniteElement &test_fe)
-    { return space_dim; }
- 
--   inline virtual void CalcTestShape(const FiniteElement &test_fe,
--                                     ElementTransformation &Trans,
--                                     DenseMatrix &shape)
-+   virtual void CalcTestShape(const FiniteElement &test_fe,
-+                              ElementTransformation &Trans,
-+                              DenseMatrix &shape)
-    { test_fe.CalcPhysDShape(Trans, shape); }
- };
- 
-@@ -1358,7 +1382,7 @@ public:
-    MixedCrossGradCurlIntegrator(VectorCoefficient &vq)
-       : MixedVectorIntegrator(vq, false) {}
- 
--   inline virtual bool VerifyFiniteElementTypes(
-+   virtual bool VerifyFiniteElementTypes(
-       const FiniteElement &trial_fe,
-       const FiniteElement &test_fe) const
-    {
-@@ -1366,30 +1390,30 @@ public:
-               trial_fe.GetRangeType()  == mfem::FiniteElement::SCALAR &&
-               trial_fe.GetDerivType()  == mfem::FiniteElement::GRAD &&
-               test_fe.GetRangeType() == mfem::FiniteElement::VECTOR &&
--              test_fe.GetDerivType() == mfem::FiniteElement::CURL );
-+              test_fe.GetDerivType() == mfem::FiniteElement::CURL);
-    }
- 
--   inline virtual const char *FiniteElementTypeFailureMessage() const
-+   virtual const char *FiniteElementTypeFailureMessage() const
-    {
-       return "MixedCrossGradCurlIntegrator"
-              "Trial space must be a scalar field in 3D with a gradient"
-              "and the test space must be a vector field with a curl";
-    }
- 
--   inline virtual int GetTrialVDim(const FiniteElement &trial_fe)
-+   virtual int GetTrialVDim(const FiniteElement &trial_fe)
-    { return space_dim; }
- 
--   inline virtual void CalcTrialShape(const FiniteElement &trial_fe,
--                                      ElementTransformation &Trans,
--                                      DenseMatrix &shape)
-+   virtual void CalcTrialShape(const FiniteElement &trial_fe,
-+                               ElementTransformation &Trans,
-+                               DenseMatrix &shape)
-    { trial_fe.CalcPhysDShape(Trans, shape); }
- 
--   inline virtual int GetTestVDim(const FiniteElement &test_fe)
-+   virtual int GetTestVDim(const FiniteElement &test_fe)
-    { return test_fe.GetCurlDim(); }
- 
--   inline virtual void CalcTestShape(const FiniteElement &test_fe,
--                                     ElementTransformation &Trans,
--                                     DenseMatrix &shape)
-+   virtual void CalcTestShape(const FiniteElement &test_fe,
-+                              ElementTransformation &Trans,
-+                              DenseMatrix &shape)
-    { test_fe.CalcPhysCurlShape(Trans, shape); }
- };
- 
-@@ -1402,29 +1426,29 @@ public:
-    MixedWeakCurlCrossIntegrator(VectorCoefficient &vq)
-       : MixedVectorIntegrator(vq, false) {}
- 
--   inline virtual bool VerifyFiniteElementTypes(
-+   virtual bool VerifyFiniteElementTypes(
-       const FiniteElement &trial_fe,
-       const FiniteElement &test_fe) const
-    {
-       return (trial_fe.GetVDim() == 3 && test_fe.GetCurlDim() == 3 &&
-               trial_fe.GetRangeType() == mfem::FiniteElement::VECTOR &&
-               test_fe.GetRangeType()  == mfem::FiniteElement::VECTOR &&
--              test_fe.GetDerivType()  == mfem::FiniteElement::CURL );
-+              test_fe.GetDerivType()  == mfem::FiniteElement::CURL);
-    }
- 
--   inline virtual const char *FiniteElementTypeFailureMessage() const
-+   virtual const char *FiniteElementTypeFailureMessage() const
-    {
-       return "MixedWeakCurlCrossIntegrator:  "
-              "Trial space must be a vector field in 3D "
-              "and the test space must be a vector field with a curl";
-    }
- 
--   inline virtual int GetTestVDim(const FiniteElement &test_fe)
-+   virtual int GetTestVDim(const FiniteElement &test_fe)
-    { return test_fe.GetCurlDim(); }
- 
--   inline virtual void CalcTestShape(const FiniteElement &test_fe,
--                                     ElementTransformation &Trans,
--                                     DenseMatrix &shape)
-+   virtual void CalcTestShape(const FiniteElement &test_fe,
-+                              ElementTransformation &Trans,
-+                              DenseMatrix &shape)
-    { test_fe.CalcPhysCurlShape(Trans, shape); }
- };
- 
-@@ -1437,26 +1461,26 @@ public:
-    MixedScalarWeakCurlCrossIntegrator(VectorCoefficient &vq)
-       : MixedScalarVectorIntegrator(vq, true, true) {}
- 
--   inline virtual bool VerifyFiniteElementTypes(
-+   virtual bool VerifyFiniteElementTypes(
-       const FiniteElement &trial_fe,
-       const FiniteElement &test_fe) const
-    {
-       return (trial_fe.GetDim() == 2 && test_fe.GetDim() == 2 &&
-               trial_fe.GetRangeType() == mfem::FiniteElement::VECTOR &&
-               test_fe.GetRangeType()  == mfem::FiniteElement::VECTOR &&
--              test_fe.GetDerivType()  == mfem::FiniteElement::CURL );
-+              test_fe.GetDerivType()  == mfem::FiniteElement::CURL);
-    }
- 
--   inline virtual const char *FiniteElementTypeFailureMessage() const
-+   virtual const char *FiniteElementTypeFailureMessage() const
-    {
-       return "MixedScalarWeakCurlCrossIntegrator:  "
-              "Trial space must be a vector field in 2D "
-              "and the test space must be a vector field with a curl";
-    }
- 
--   inline virtual void CalcShape(const FiniteElement &scalar_fe,
--                                 ElementTransformation &Trans,
--                                 Vector &shape)
-+   virtual void CalcShape(const FiniteElement &scalar_fe,
-+                          ElementTransformation &Trans,
-+                          Vector &shape)
-    {
-       DenseMatrix dshape(shape.GetData(), shape.Size(), 1);
-       scalar_fe.CalcPhysCurlShape(Trans, dshape);
-@@ -1472,34 +1496,34 @@ public:
-    MixedCrossGradIntegrator(VectorCoefficient &vq)
-       : MixedVectorIntegrator(vq, false) {}
- 
--   inline virtual bool VerifyFiniteElementTypes(
-+   virtual bool VerifyFiniteElementTypes(
-       const FiniteElement &trial_fe,
-       const FiniteElement &test_fe) const
-    {
-       return (test_fe.GetVDim() == 3 &&
-               trial_fe.GetRangeType() == mfem::FiniteElement::SCALAR &&
-               trial_fe.GetDerivType() == mfem::FiniteElement::GRAD &&
--              test_fe.GetRangeType()  == mfem::FiniteElement::VECTOR );
-+              test_fe.GetRangeType()  == mfem::FiniteElement::VECTOR);
-    }
- 
--   inline virtual const char *FiniteElementTypeFailureMessage() const
-+   virtual const char *FiniteElementTypeFailureMessage() const
-    {
-       return "MixedCrossGradIntegrator:  "
-              "Trial space must be a scalar field with a gradient operator"
-              " and the test space must be a vector field both in 3D.";
-    }
- 
--   inline virtual int GetTrialVDim(const FiniteElement &trial_fe)
-+   virtual int GetTrialVDim(const FiniteElement &trial_fe)
-    { return space_dim; }
- 
--   inline virtual void CalcTrialShape(const FiniteElement &trial_fe,
--                                      ElementTransformation &Trans,
--                                      DenseMatrix &shape)
-+   virtual void CalcTrialShape(const FiniteElement &trial_fe,
-+                               ElementTransformation &Trans,
-+                               DenseMatrix &shape)
-    { trial_fe.CalcPhysDShape(Trans, shape); }
- 
--   inline virtual void CalcTestShape(const FiniteElement &test_fe,
--                                     ElementTransformation &Trans,
--                                     DenseMatrix &shape)
-+   virtual void CalcTestShape(const FiniteElement &test_fe,
-+                              ElementTransformation &Trans,
-+                              DenseMatrix &shape)
-    { test_fe.CalcVShape(Trans, shape); }
- };
- 
-@@ -1512,29 +1536,29 @@ public:
-    MixedCrossCurlIntegrator(VectorCoefficient &vq)
-       : MixedVectorIntegrator(vq, false) {}
- 
--   inline virtual bool VerifyFiniteElementTypes(
-+   virtual bool VerifyFiniteElementTypes(
-       const FiniteElement &trial_fe,
-       const FiniteElement &test_fe) const
-    {
-       return (trial_fe.GetCurlDim() == 3 && test_fe.GetVDim() == 3 &&
-               trial_fe.GetRangeType() == mfem::FiniteElement::VECTOR &&
--              trial_fe.GetDerivType() == mfem::FiniteElement::CURL   &&
--              test_fe.GetRangeType()  == mfem::FiniteElement::VECTOR );
-+              trial_fe.GetDerivType() == mfem::FiniteElement::CURL &&
-+              test_fe.GetRangeType()  == mfem::FiniteElement::VECTOR);
-    }
- 
--   inline virtual const char *FiniteElementTypeFailureMessage() const
-+   virtual const char *FiniteElementTypeFailureMessage() const
-    {
-       return "MixedCrossCurlIntegrator:  "
-              "Trial space must be a vector field in 3D with a curl "
-              "and the test space must be a vector field";
-    }
- 
--   inline virtual int GetTrialVDim(const FiniteElement &trial_fe)
-+   virtual int GetTrialVDim(const FiniteElement &trial_fe)
-    { return trial_fe.GetCurlDim(); }
- 
--   inline virtual void CalcTrialShape(const FiniteElement &trial_fe,
--                                      ElementTransformation &Trans,
--                                      DenseMatrix &shape)
-+   virtual void CalcTrialShape(const FiniteElement &trial_fe,
-+                               ElementTransformation &Trans,
-+                               DenseMatrix &shape)
-    { trial_fe.CalcPhysCurlShape(Trans, shape); }
- };
- 
-@@ -1547,26 +1571,26 @@ public:
-    MixedScalarCrossCurlIntegrator(VectorCoefficient &vq)
-       : MixedScalarVectorIntegrator(vq, false, true) {}
- 
--   inline virtual bool VerifyFiniteElementTypes(
-+   virtual bool VerifyFiniteElementTypes(
-       const FiniteElement &trial_fe,
-       const FiniteElement &test_fe) const
-    {
-       return (trial_fe.GetDim() == 2 && test_fe.GetDim() == 2 &&
-               trial_fe.GetRangeType() == mfem::FiniteElement::VECTOR &&
--              trial_fe.GetDerivType() == mfem::FiniteElement::CURL   &&
--              test_fe.GetRangeType()  == mfem::FiniteElement::VECTOR );
-+              trial_fe.GetDerivType() == mfem::FiniteElement::CURL &&
-+              test_fe.GetRangeType()  == mfem::FiniteElement::VECTOR);
-    }
- 
--   inline virtual const char *FiniteElementTypeFailureMessage() const
-+   virtual const char *FiniteElementTypeFailureMessage() const
-    {
-       return "MixedCrossCurlIntegrator:  "
-              "Trial space must be a vector field in 2D with a curl "
-              "and the test space must be a vector field";
-    }
- 
--   inline virtual void CalcShape(const FiniteElement &scalar_fe,
--                                 ElementTransformation &Trans,
--                                 Vector &shape)
-+   virtual void CalcShape(const FiniteElement &scalar_fe,
-+                          ElementTransformation &Trans,
-+                          Vector &shape)
-    {
-       DenseMatrix dshape(shape.GetData(), shape.Size(), 1);
-       scalar_fe.CalcPhysCurlShape(Trans, dshape); shape *= -1.0;
-@@ -1581,29 +1605,29 @@ public:
-    MixedScalarCrossGradIntegrator(VectorCoefficient &vq)
-       : MixedScalarVectorIntegrator(vq, true, true) {}
- 
--   inline virtual bool VerifyFiniteElementTypes(
-+   virtual bool VerifyFiniteElementTypes(
-       const FiniteElement &trial_fe,
-       const FiniteElement &test_fe) const
-    {
-       return (trial_fe.GetDim() == 2 && test_fe.GetDim() == 2 &&
-               trial_fe.GetRangeType() == mfem::FiniteElement::SCALAR &&
--              trial_fe.GetDerivType() == mfem::FiniteElement::GRAD   &&
--              test_fe.GetRangeType()  == mfem::FiniteElement::SCALAR );
-+              trial_fe.GetDerivType() == mfem::FiniteElement::GRAD &&
-+              test_fe.GetRangeType()  == mfem::FiniteElement::SCALAR);
-    }
- 
--   inline virtual const char *FiniteElementTypeFailureMessage() const
-+   virtual const char *FiniteElementTypeFailureMessage() const
-    {
-       return "MixedScalarCrossGradIntegrator:  "
-              "Trial space must be a scalar field in 2D with a gradient "
-              "and the test space must be a scalar field";
-    }
- 
--   inline int GetVDim(const FiniteElement &vector_fe)
-+   virtual int GetVDim(const FiniteElement &vector_fe) const
-    { return space_dim; }
- 
--   inline virtual void CalcVShape(const FiniteElement &vector_fe,
--                                  ElementTransformation &Trans,
--                                  DenseMatrix &shape)
-+   virtual void CalcVShape(const FiniteElement &vector_fe,
-+                           ElementTransformation &Trans,
-+                           DenseMatrix &shape)
-    { vector_fe.CalcPhysDShape(Trans, shape); }
- };
- 
-@@ -1615,16 +1639,16 @@ public:
-    MixedScalarCrossProductIntegrator(VectorCoefficient &vq)
-       : MixedScalarVectorIntegrator(vq, true, true) {}
- 
--   inline virtual bool VerifyFiniteElementTypes(
-+   virtual bool VerifyFiniteElementTypes(
-       const FiniteElement &trial_fe,
-       const FiniteElement &test_fe) const
-    {
-       return (trial_fe.GetDim() == 2 && test_fe.GetDim() == 2 &&
-               trial_fe.GetRangeType() == mfem::FiniteElement::VECTOR &&
--              test_fe.GetRangeType()  == mfem::FiniteElement::SCALAR );
-+              test_fe.GetRangeType()  == mfem::FiniteElement::SCALAR);
-    }
- 
--   inline virtual const char *FiniteElementTypeFailureMessage() const
-+   virtual const char *FiniteElementTypeFailureMessage() const
-    {
-       return "MixedScalarCrossProductIntegrator:  "
-              "Trial space must be a vector field in 2D "
-@@ -1640,25 +1664,25 @@ public:
-    MixedScalarWeakCrossProductIntegrator(VectorCoefficient &vq)
-       : MixedScalarVectorIntegrator(vq, false, true) {}
- 
--   inline virtual bool VerifyFiniteElementTypes(
-+   virtual bool VerifyFiniteElementTypes(
-       const FiniteElement &trial_fe,
-       const FiniteElement &test_fe) const
-    {
-       return (trial_fe.GetDim() == 2 && test_fe.GetDim() == 2 &&
-               trial_fe.GetRangeType() == mfem::FiniteElement::SCALAR &&
--              test_fe.GetRangeType()  == mfem::FiniteElement::VECTOR );
-+              test_fe.GetRangeType()  == mfem::FiniteElement::VECTOR);
-    }
- 
--   inline virtual const char *FiniteElementTypeFailureMessage() const
-+   virtual const char *FiniteElementTypeFailureMessage() const
-    {
-       return "MixedScalarWeakCrossProductIntegrator:  "
-              "Trial space must be a scalar field in 2D "
-              "and the test space must be a vector field";
-    }
- 
--   inline virtual void CalcShape(const FiniteElement &scalar_fe,
--                                 ElementTransformation &Trans,
--                                 Vector &shape)
-+   virtual void CalcShape(const FiniteElement &scalar_fe,
-+                          ElementTransformation &Trans,
-+                          Vector &shape)
-    { scalar_fe.CalcPhysShape(Trans, shape); shape *= -1.0; }
- };
- 
-@@ -1670,28 +1694,28 @@ public:
-    MixedDirectionalDerivativeIntegrator(VectorCoefficient &vq)
-       : MixedScalarVectorIntegrator(vq, true) {}
- 
--   inline virtual bool VerifyFiniteElementTypes(
-+   virtual bool VerifyFiniteElementTypes(
-       const FiniteElement &trial_fe,
-       const FiniteElement &test_fe) const
-    {
-       return (trial_fe.GetRangeType() == mfem::FiniteElement::SCALAR &&
--              trial_fe.GetDerivType() == mfem::FiniteElement::GRAD   &&
--              test_fe.GetRangeType()  == mfem::FiniteElement::SCALAR );
-+              trial_fe.GetDerivType() == mfem::FiniteElement::GRAD &&
-+              test_fe.GetRangeType()  == mfem::FiniteElement::SCALAR);
-    }
- 
--   inline virtual const char *FiniteElementTypeFailureMessage() const
-+   virtual const char *FiniteElementTypeFailureMessage() const
-    {
-       return "MixedDirectionalDerivativeIntegrator:  "
-              "Trial space must be a scalar field with a gradient "
-              "and the test space must be a scalar field";
-    }
- 
--   inline virtual int GetVDim(const FiniteElement &vector_fe)
-+   virtual int GetVDim(const FiniteElement &vector_fe) const
-    { return space_dim; }
- 
--   inline virtual void CalcVShape(const FiniteElement &vector_fe,
--                                  ElementTransformation &Trans,
--                                  DenseMatrix &shape)
-+   virtual void CalcVShape(const FiniteElement &vector_fe,
-+                           ElementTransformation &Trans,
-+                           DenseMatrix &shape)
-    { vector_fe.CalcPhysDShape(Trans, shape); }
- };
- 
-@@ -1703,34 +1727,34 @@ public:
-    MixedGradDivIntegrator(VectorCoefficient &vq)
-       : MixedScalarVectorIntegrator(vq, true) {}
- 
--   inline virtual bool VerifyFiniteElementTypes(
-+   virtual bool VerifyFiniteElementTypes(
-       const FiniteElement &trial_fe,
-       const FiniteElement &test_fe) const
-    {
-       return (trial_fe.GetRangeType() == mfem::FiniteElement::SCALAR &&
--              trial_fe.GetDerivType() == mfem::FiniteElement::GRAD   &&
-+              trial_fe.GetDerivType() == mfem::FiniteElement::GRAD &&
-               test_fe.GetRangeType()  == mfem::FiniteElement::VECTOR &&
--              test_fe.GetDerivType()  == mfem::FiniteElement::DIV   );
-+              test_fe.GetDerivType()  == mfem::FiniteElement::DIV);
-    }
- 
--   inline virtual const char *FiniteElementTypeFailureMessage() const
-+   virtual const char *FiniteElementTypeFailureMessage() const
-    {
-       return "MixedGradDivIntegrator:  "
-              "Trial space must be a scalar field with a gradient"
-              "and the test space must be a vector field with a divergence";
-    }
- 
--   inline virtual int GetVDim(const FiniteElement &vector_fe)
-+   virtual int GetVDim(const FiniteElement &vector_fe) const
-    { return space_dim; }
- 
--   inline virtual void CalcVShape(const FiniteElement &vector_fe,
--                                  ElementTransformation &Trans,
--                                  DenseMatrix &shape)
-+   virtual void CalcVShape(const FiniteElement &vector_fe,
-+                           ElementTransformation &Trans,
-+                           DenseMatrix &shape)
-    { vector_fe.CalcPhysDShape(Trans, shape); shape *= -1.0; }
- 
--   inline virtual void CalcShape(const FiniteElement &scalar_fe,
--                                 ElementTransformation &Trans,
--                                 Vector &shape)
-+   virtual void CalcShape(const FiniteElement &scalar_fe,
-+                          ElementTransformation &Trans,
-+                          Vector &shape)
-    { scalar_fe.CalcPhysDivShape(Trans, shape); }
- };
- 
-@@ -1742,35 +1766,34 @@ public:
-    MixedDivGradIntegrator(VectorCoefficient &vq)
-       : MixedScalarVectorIntegrator(vq, false) {}
- 
--   inline virtual bool VerifyFiniteElementTypes(
-+   virtual bool VerifyFiniteElementTypes(
-       const FiniteElement &trial_fe,
-       const FiniteElement &test_fe) const
-    {
-       return (trial_fe.GetRangeType() == mfem::FiniteElement::VECTOR &&
--              trial_fe.GetDerivType() == mfem::FiniteElement::DIV    &&
-+              trial_fe.GetDerivType() == mfem::FiniteElement::DIV &&
-               test_fe.GetRangeType()  == mfem::FiniteElement::SCALAR &&
--              test_fe.GetDerivType()  == mfem::FiniteElement::GRAD
--             );
-+              test_fe.GetDerivType()  == mfem::FiniteElement::GRAD);
-    }
- 
--   inline virtual const char *FiniteElementTypeFailureMessage() const
-+   virtual const char *FiniteElementTypeFailureMessage() const
-    {
-       return "MixedDivGradIntegrator:  "
-              "Trial space must be a vector field with a divergence"
-              "and the test space must be a scalar field with a gradient";
-    }
- 
--   inline virtual int GetVDim(const FiniteElement &vector_fe)
-+   virtual int GetVDim(const FiniteElement &vector_fe) const
-    { return space_dim; }
- 
--   inline virtual void CalcVShape(const FiniteElement &vector_fe,
--                                  ElementTransformation &Trans,
--                                  DenseMatrix &shape)
-+   virtual void CalcVShape(const FiniteElement &vector_fe,
-+                           ElementTransformation &Trans,
-+                           DenseMatrix &shape)
-    { vector_fe.CalcPhysDShape(Trans, shape); shape *= -1.0; }
- 
--   inline virtual void CalcShape(const FiniteElement &scalar_fe,
--                                 ElementTransformation &Trans,
--                                 Vector &shape)
-+   virtual void CalcShape(const FiniteElement &scalar_fe,
-+                          ElementTransformation &Trans,
-+                          Vector &shape)
-    { scalar_fe.CalcPhysDivShape(Trans, shape); }
- };
- 
-@@ -1782,28 +1805,28 @@ public:
-    MixedScalarWeakDivergenceIntegrator(VectorCoefficient &vq)
-       : MixedScalarVectorIntegrator(vq, false) {}
- 
--   inline virtual bool VerifyFiniteElementTypes(
-+   virtual bool VerifyFiniteElementTypes(
-       const FiniteElement &trial_fe,
-       const FiniteElement &test_fe) const
-    {
-       return (trial_fe.GetRangeType() == mfem::FiniteElement::SCALAR &&
-               test_fe.GetRangeType()  == mfem::FiniteElement::SCALAR &&
--              test_fe.GetDerivType()  == mfem::FiniteElement::GRAD   );
-+              test_fe.GetDerivType()  == mfem::FiniteElement::GRAD);
-    }
- 
--   inline virtual const char *FiniteElementTypeFailureMessage() const
-+   virtual const char *FiniteElementTypeFailureMessage() const
-    {
-       return "MixedScalarWeakDivergenceIntegrator:  "
-              "Trial space must be a scalar field "
-              "and the test space must be a scalar field with a gradient";
-    }
- 
--   inline int GetVDim(const FiniteElement &vector_fe)
-+   virtual int GetVDim(const FiniteElement &vector_fe) const
-    { return space_dim; }
- 
--   inline virtual void CalcVShape(const FiniteElement &vector_fe,
--                                  ElementTransformation &Trans,
--                                  DenseMatrix &shape)
-+   virtual void CalcVShape(const FiniteElement &vector_fe,
-+                           ElementTransformation &Trans,
-+                           DenseMatrix &shape)
-    { vector_fe.CalcPhysDShape(Trans, shape); shape *= -1.0; }
- };
- 
-@@ -1824,40 +1847,57 @@ public:
-    MixedVectorGradientIntegrator(MatrixCoefficient &mq)
-       : MixedVectorIntegrator(mq) {}
- 
-+   virtual bool SupportsCeed() const { return DeviceCanUseCeed(); }
-+
-    using BilinearFormIntegrator::AssemblePA;
-    virtual void AssemblePA(const FiniteElementSpace &trial_fes,
-                            const FiniteElementSpace &test_fes);
- 
-+   using BilinearFormIntegrator::AssemblePABoundary;
-+   virtual void AssemblePABoundary(const FiniteElementSpace &trial_fes,
-+                                   const FiniteElementSpace &test_fes);
-+
-    virtual void AddMultPA(const Vector &x, Vector &y) const;
- 
-    virtual void AddMultTransposePA(const Vector &x, Vector &y) const;
- 
-+   using BilinearFormIntegrator::AssembleMF;
-+   virtual void AssembleMF(const FiniteElementSpace &trial_fes,
-+                           const FiniteElementSpace &test_fes);
-+
-+   using BilinearFormIntegrator::AssembleMFBoundary;
-+   virtual void AssembleMFBoundary(const FiniteElementSpace &trial_fes,
-+                                   const FiniteElementSpace &test_fes);
-+
-+   virtual void AddMultMF(const Vector &x, Vector &y) const;
-+
- protected:
--   inline virtual bool VerifyFiniteElementTypes(
-+   virtual bool VerifyFiniteElementTypes(
-       const FiniteElement &trial_fe,
-       const FiniteElement &test_fe) const
-    {
-       return (trial_fe.GetDerivType() == mfem::FiniteElement::GRAD &&
--              test_fe.GetRangeType()  == mfem::FiniteElement::VECTOR );
-+              test_fe.GetRangeType()  == mfem::FiniteElement::VECTOR);
-    }
- 
--   inline virtual const char *FiniteElementTypeFailureMessage() const
-+   virtual const char *FiniteElementTypeFailureMessage() const
-    {
-       return "MixedVectorGradientIntegrator:  "
-              "Trial spaces must be H1 and the test space must be a "
-              "vector field in 2D or 3D";
-    }
- 
--   inline virtual int GetTrialVDim(const FiniteElement &trial_fe)
-+   virtual int GetTrialVDim(const FiniteElement &trial_fe)
-    { return space_dim; }
- 
--   inline virtual void CalcTrialShape(const FiniteElement &trial_fe,
--                                      ElementTransformation &Trans,
--                                      DenseMatrix &shape)
-+   virtual void CalcTrialShape(const FiniteElement &trial_fe,
-+                               ElementTransformation &Trans,
-+                               DenseMatrix &shape)
-    {
-       trial_fe.CalcPhysDShape(Trans, shape);
-    }
- 
-+private:
-    DenseMatrix Jinv;
- 
-    // PA extension
-@@ -1882,6 +1922,8 @@ public:
-    MixedVectorCurlIntegrator(MatrixCoefficient &mq)
-       : MixedVectorIntegrator(mq) {}
- 
-+   virtual bool SupportsCeed() const { return DeviceCanUseCeed(); }
-+
-    using BilinearFormIntegrator::AssemblePA;
-    virtual void AssemblePA(const FiniteElementSpace &trial_fes,
-                            const FiniteElementSpace &test_fes);
-@@ -1890,29 +1932,35 @@ public:
- 
-    virtual void AddMultTransposePA(const Vector &x, Vector &y) const;
- 
-+   using BilinearFormIntegrator::AssembleMF;
-+   virtual void AssembleMF(const FiniteElementSpace &trial_fes,
-+                           const FiniteElementSpace &test_fes);
-+
-+   virtual void AddMultMF(const Vector &x, Vector &y) const;
-+
- protected:
--   inline virtual bool VerifyFiniteElementTypes(
-+   virtual bool VerifyFiniteElementTypes(
-       const FiniteElement &trial_fe,
-       const FiniteElement &test_fe) const
-    {
-       return (trial_fe.GetCurlDim() == 3 && test_fe.GetVDim() == 3 &&
--              trial_fe.GetDerivType() == mfem::FiniteElement::CURL  &&
--              test_fe.GetRangeType()  == mfem::FiniteElement::VECTOR );
-+              trial_fe.GetDerivType() == mfem::FiniteElement::CURL &&
-+              test_fe.GetRangeType()  == mfem::FiniteElement::VECTOR);
-    }
- 
--   inline virtual const char *FiniteElementTypeFailureMessage() const
-+   virtual const char *FiniteElementTypeFailureMessage() const
-    {
-       return "MixedVectorCurlIntegrator:  "
-              "Trial space must be H(Curl) and the test space must be a "
-              "vector field in 3D";
-    }
- 
--   inline virtual int GetTrialVDim(const FiniteElement &trial_fe)
-+   virtual int GetTrialVDim(const FiniteElement &trial_fe)
-    { return trial_fe.GetCurlDim(); }
- 
--   inline virtual void CalcTrialShape(const FiniteElement &trial_fe,
--                                      ElementTransformation &Trans,
--                                      DenseMatrix &shape)
-+   virtual void CalcTrialShape(const FiniteElement &trial_fe,
-+                               ElementTransformation &Trans,
-+                               DenseMatrix &shape)
-    {
-       trial_fe.CalcPhysCurlShape(Trans, shape);
-    }
-@@ -1942,6 +1990,8 @@ public:
-    MixedVectorWeakCurlIntegrator(MatrixCoefficient &mq)
-       : MixedVectorIntegrator(mq) {}
- 
-+   virtual bool SupportsCeed() const { return DeviceCanUseCeed(); }
-+
-    using BilinearFormIntegrator::AssemblePA;
-    virtual void AssemblePA(const FiniteElementSpace &trial_fes,
-                            const FiniteElementSpace &test_fes);
-@@ -1950,29 +2000,35 @@ public:
- 
-    virtual void AddMultTransposePA(const Vector &x, Vector &y) const;
- 
-+   using BilinearFormIntegrator::AssembleMF;
-+   virtual void AssembleMF(const FiniteElementSpace &trial_fes,
-+                           const FiniteElementSpace &test_fes);
-+
-+   virtual void AddMultMF(const Vector &x, Vector &y) const;
-+
- protected:
--   inline virtual bool VerifyFiniteElementTypes(
-+   virtual bool VerifyFiniteElementTypes(
-       const FiniteElement &trial_fe,
-       const FiniteElement &test_fe) const
-    {
-       return (trial_fe.GetVDim() == 3 && test_fe.GetCurlDim() == 3 &&
-               trial_fe.GetRangeType() == mfem::FiniteElement::VECTOR &&
--              test_fe.GetDerivType()  == mfem::FiniteElement::CURL );
-+              test_fe.GetDerivType()  == mfem::FiniteElement::CURL);
-    }
- 
--   inline virtual const char *FiniteElementTypeFailureMessage() const
-+   virtual const char *FiniteElementTypeFailureMessage() const
-    {
-       return "MixedVectorWeakCurlIntegrator:  "
-              "Trial space must be vector field in 3D and the "
-              "test space must be H(Curl)";
-    }
- 
--   inline virtual int GetTestVDim(const FiniteElement &test_fe)
-+   virtual int GetTestVDim(const FiniteElement &test_fe)
-    { return test_fe.GetCurlDim(); }
- 
--   inline virtual void CalcTestShape(const FiniteElement &test_fe,
--                                     ElementTransformation &Trans,
--                                     DenseMatrix &shape)
-+   virtual void CalcTestShape(const FiniteElement &test_fe,
-+                              ElementTransformation &Trans,
-+                              DenseMatrix &shape)
-    {
-       test_fe.CalcPhysCurlShape(Trans, shape);
-    }
-@@ -2000,28 +2056,50 @@ public:
-    MixedVectorWeakDivergenceIntegrator(MatrixCoefficient &mq)
-       : MixedVectorIntegrator(mq) {}
- 
-+   virtual bool SupportsCeed() const { return DeviceCanUseCeed(); }
-+
-+   using BilinearFormIntegrator::AssemblePA;
-+   virtual void AssemblePA(const FiniteElementSpace &trial_fes,
-+                           const FiniteElementSpace &test_fes);
-+
-+   using BilinearFormIntegrator::AssemblePABoundary;
-+   virtual void AssemblePABoundary(const FiniteElementSpace &trial_fes,
-+                                   const FiniteElementSpace &test_fes);
-+
-+   virtual void AddMultPA(const Vector &x, Vector &y) const;
-+
-+   using BilinearFormIntegrator::AssembleMF;
-+   virtual void AssembleMF(const FiniteElementSpace &trial_fes,
-+                           const FiniteElementSpace &test_fes);
-+
-+   using BilinearFormIntegrator::AssembleMFBoundary;
-+   virtual void AssembleMFBoundary(const FiniteElementSpace &trial_fes,
-+                                   const FiniteElementSpace &test_fes);
-+
-+   virtual void AddMultMF(const Vector &x, Vector &y) const;
-+
- protected:
--   inline virtual bool VerifyFiniteElementTypes(
-+   virtual bool VerifyFiniteElementTypes(
-       const FiniteElement &trial_fe,
-       const FiniteElement &test_fe) const
-    {
-       return (trial_fe.GetRangeType() == mfem::FiniteElement::VECTOR &&
--              test_fe.GetDerivType()  == mfem::FiniteElement::GRAD );
-+              test_fe.GetDerivType()  == mfem::FiniteElement::GRAD);
-    }
- 
--   inline virtual const char *FiniteElementTypeFailureMessage() const
-+   virtual const char *FiniteElementTypeFailureMessage() const
-    {
-       return "MixedVectorWeakDivergenceIntegrator:  "
-              "Trial space must be vector field and the "
-              "test space must be H1";
-    }
- 
--   inline virtual int GetTestVDim(const FiniteElement &test_fe)
-+   virtual int GetTestVDim(const FiniteElement &test_fe)
-    { return space_dim; }
- 
--   inline virtual void CalcTestShape(const FiniteElement &test_fe,
--                                     ElementTransformation &Trans,
--                                     DenseMatrix &shape)
-+   virtual void CalcTestShape(const FiniteElement &test_fe,
-+                              ElementTransformation &Trans,
-+                              DenseMatrix &shape)
-    {
-       test_fe.CalcPhysDShape(Trans, shape);
-       shape *= -1.0;
-@@ -2063,6 +2141,11 @@ public:
-       Q{&q}, trial_maps{NULL}, test_maps{NULL}, geom{NULL}
-    {}
- 
-+   using NonlinearFormIntegrator::GetRule;
-+   virtual const IntegrationRule &GetRule(const FiniteElement &trial_fe,
-+                                          const FiniteElement &test_fe,
-+                                          ElementTransformation &Trans) const;
-+
-    virtual void AssembleElementMatrix2(const FiniteElement &trial_fe,
-                                        const FiniteElement &test_fe,
-                                        ElementTransformation &Trans,
-@@ -2075,10 +2158,6 @@ public:
-    virtual void AddMultPA(const Vector &x, Vector &y) const;
- 
-    virtual void AddMultTransposePA(const Vector &x, Vector &y) const;
--
--   static const IntegrationRule &GetRule(const FiniteElement &trial_fe,
--                                         const FiniteElement &test_fe,
--                                         ElementTransformation &Trans);
- };
- 
- /** Class for integrating the bilinear form a(u,v) := (Q grad u, grad v) where Q
-@@ -2128,6 +2207,18 @@ public:
-       : BilinearFormIntegrator(ir),
-         Q(NULL), VQ(NULL), MQ(&q), maps(NULL), geom(NULL) {}
- 
-+   virtual bool SupportsCeed() const { return DeviceCanUseCeed(); }
-+
-+   static const IntegrationRule &GetRuleStatic(const FiniteElement &trial_fe,
-+                                               const FiniteElement &test_fe,
-+                                               ElementTransformation &Trans);
-+
-+   using NonlinearFormIntegrator::GetRule;
-+   virtual const IntegrationRule &GetRule(const FiniteElement &trial_fe,
-+                                          const FiniteElement &test_fe,
-+                                          ElementTransformation &Trans) const
-+   { return GetRuleStatic(trial_fe, test_fe, Trans); }
-+
-    /** Given a particular Finite Element computes the element stiffness matrix
-        elmat. */
-    virtual void AssembleElementMatrix(const FiniteElement &el,
-@@ -2159,6 +2250,9 @@ public:
-    using BilinearFormIntegrator::AssemblePA;
-    virtual void AssemblePA(const FiniteElementSpace &fes);
- 
-+   using BilinearFormIntegrator::AssemblePABoundary;
-+   virtual void AssemblePABoundary(const FiniteElementSpace &fes);
-+
-    virtual void AssembleDiagonalPA(Vector &diag);
- 
-    virtual void AddMultPA(const Vector &x, Vector &y) const;
-@@ -2168,6 +2262,9 @@ public:
-    using BilinearFormIntegrator::AssembleMF;
-    virtual void AssembleMF(const FiniteElementSpace &fes);
- 
-+   using BilinearFormIntegrator::AssembleMFBoundary;
-+   virtual void AssembleMFBoundary(const FiniteElementSpace &fes);
-+
-    virtual void AssembleDiagonalMF(Vector &diag);
- 
-    virtual void AddMultMF(const Vector &x, Vector &y) const;
-@@ -2175,11 +2272,6 @@ public:
-    using BilinearFormIntegrator::AssembleEA;
-    virtual void AssembleEA(const FiniteElementSpace &fes, Vector &emat);
- 
--   static const IntegrationRule &GetRule(const FiniteElement &trial_fe,
--                                         const FiniteElement &test_fe);
--
--   bool SupportsCeed() const { return DeviceCanUseCeed(); }
--
-    Coefficient *GetCoefficient() const { return Q; }
- };
- 
-@@ -2210,6 +2302,19 @@ public:
-    MassIntegrator(Coefficient &q, const IntegrationRule *ir = NULL)
-       : BilinearFormIntegrator(ir), Q(&q), maps(NULL), geom(NULL) {}
- 
-+   virtual bool SupportsCeed() const { return DeviceCanUseCeed(); }
-+
-+   static const IntegrationRule &GetRuleStatic(const FiniteElement &trial_fe,
-+                                               const FiniteElement &test_fe,
-+                                               ElementTransformation &Trans,
-+                                               int Q_order = 0);
-+
-+   using NonlinearFormIntegrator::GetRule;
-+   virtual const IntegrationRule &GetRule(const FiniteElement &trial_fe,
-+                                          const FiniteElement &test_fe,
-+                                          ElementTransformation &Trans) const
-+   { return GetRuleStatic(trial_fe, test_fe, Trans); }
-+
-    /** Given a particular Finite Element computes the element mass matrix
-        elmat. */
-    virtual void AssembleElementMatrix(const FiniteElement &el,
-@@ -2236,6 +2341,9 @@ public:
-    using BilinearFormIntegrator::AssembleMF;
-    virtual void AssembleMF(const FiniteElementSpace &fes);
- 
-+   using BilinearFormIntegrator::AssembleMFBoundary;
-+   virtual void AssembleMFBoundary(const FiniteElementSpace &fes);
-+
-    virtual void AssembleDiagonalMF(Vector &diag);
- 
-    virtual void AddMultMF(const Vector &x, Vector &y) const;
-@@ -2243,12 +2351,6 @@ public:
-    using BilinearFormIntegrator::AssembleEA;
-    virtual void AssembleEA(const FiniteElementSpace &fes, Vector &emat);
- 
--   static const IntegrationRule &GetRule(const FiniteElement &trial_fe,
--                                         const FiniteElement &test_fe,
--                                         ElementTransformation &Trans);
--
--   bool SupportsCeed() const { return DeviceCanUseCeed(); }
--
-    const Coefficient *GetCoefficient() const { return Q; }
- };
- 
-@@ -2258,7 +2360,10 @@ class BoundaryMassIntegrator : public MassIntegrator
- public:
-    BoundaryMassIntegrator(Coefficient &q) : MassIntegrator(q) {}
- 
--   using BilinearFormIntegrator::AssembleFaceMatrix;
-+   using NonlinearFormIntegrator::GetRule;
-+   virtual const IntegrationRule &GetRule(const FiniteElement &el1,
-+                                          const FiniteElement &el2,
-+                                          FaceElementTransformations &Trans) const;
- 
-    virtual void AssembleFaceMatrix(const FiniteElement &el1,
-                                    const FiniteElement &el2,
-@@ -2289,6 +2394,13 @@ public:
-    ConvectionIntegrator(VectorCoefficient &q, double a = 1.0)
-       : Q(&q) { alpha = a; }
- 
-+   virtual bool SupportsCeed() const { return DeviceCanUseCeed(); }
-+
-+   using NonlinearFormIntegrator::GetRule;
-+   virtual const IntegrationRule &GetRule(const FiniteElement &trial_fe,
-+                                          const FiniteElement &test_fe,
-+                                          ElementTransformation &Trans) const;
-+
-    virtual void AssembleElementMatrix(const FiniteElement &fes,
-                                       ElementTransformation &Trans,
-                                       DenseMatrix &elmat);
-@@ -2296,6 +2408,9 @@ public:
-    using BilinearFormIntegrator::AssemblePA;
-    virtual void AssemblePA(const FiniteElementSpace &fes);
- 
-+   using BilinearFormIntegrator::AssemblePABoundary;
-+   virtual void AssemblePABoundary(const FiniteElementSpace &fes);
-+
-    virtual void AssembleDiagonalPA(Vector &diag);
- 
-    virtual void AddMultPA(const Vector &x, Vector &y) const;
-@@ -2305,17 +2420,15 @@ public:
-    using BilinearFormIntegrator::AssembleMF;
-    virtual void AssembleMF(const FiniteElementSpace &fes);
- 
-+   using BilinearFormIntegrator::AssembleMFBoundary;
-+   virtual void AssembleMFBoundary(const FiniteElementSpace &fes);
-+
-    virtual void AssembleDiagonalMF(Vector &diag);
- 
-    virtual void AddMultMF(const Vector &x, Vector &y) const;
- 
-    using BilinearFormIntegrator::AssembleEA;
-    virtual void AssembleEA(const FiniteElementSpace &fes, Vector &emat);
--
--   static const IntegrationRule &GetRule(const FiniteElement &fe,
--                                         ElementTransformation &Trans);
--
--   bool SupportsCeed() const { return DeviceCanUseCeed(); }
- };
- 
- // Alias for @ConvectionIntegrator.
-@@ -2349,33 +2462,154 @@ public:
-                                       DenseMatrix &);
- };
- 
--/** Class for integrating the bilinear form a(u,v) := (Q u, v),
--    where u=(u1,...,un) and v=(v1,...,vn); ui and vi are defined
--    by scalar FE through standard transformation. */
--class VectorMassIntegrator: public BilinearFormIntegrator
--{
--private:
--   int vdim;
--   Vector shape, te_shape, vec;
--   DenseMatrix partelmat;
--   DenseMatrix mcoeff;
--   int Q_order;
-+/** Integrator for
- 
-+      (Q grad u, grad v) = sum_i (Q grad u_i, grad v_i) e_i e_i^T
-+
-+    for vector FE spaces, where e_i is the unit vector in the i-th direction.
-+    The resulting local element matrix is square, of size <tt> vdim*dof </tt>,
-+    where \c vdim is the vector dimension space and \c dof is the local degrees
-+    of freedom. The integrator is not aware of the true vector dimension and
-+    must use \c VectorCoefficient, \c MatrixCoefficient, or a caller-specified
-+    value to determine the vector space. For a scalar coefficient, the caller
-+    may manually specify the vector dimension or the vector dimension is assumed
-+    to be the spatial dimension (i.e. 2-dimension or 3-dimension).
-+*/
-+class VectorDiffusionIntegrator : public BilinearFormIntegrator
-+{
- protected:
--   Coefficient *Q;
--   VectorCoefficient *VQ;
--   MatrixCoefficient *MQ;
-+   Coefficient *Q = NULL;
-+   VectorCoefficient *VQ = NULL;
-+   MatrixCoefficient *MQ = NULL;
- 
-    // PA extension
--   Vector pa_data;
-    const DofToQuad *maps;         ///< Not owned
-    const GeometricFactors *geom;  ///< Not owned
--   int dim, ne, nq, dofs1D, quad1D;
-+   int dim, sdim, ne, dofs1D, quad1D;
-+   Vector pa_data;
-+
-+private:
-+   DenseMatrix dshape, dshapedxt, pelmat;
-+   int vdim = -1;
-+   DenseMatrix mcoeff;
-+   Vector vcoeff;
- 
- public:
--   /// Construct an integrator with coefficient 1.0
--   VectorMassIntegrator()
--      : vdim(-1), Q_order(0), Q(NULL), VQ(NULL), MQ(NULL) {}
-+   VectorDiffusionIntegrator() {}
-+
-+   /** \brief Integrator with unit coefficient for caller-specified vector
-+       dimension.
-+
-+       If the vector dimension does not match the true dimension of the space,
-+       the resulting element matrix will be mathematically invalid. */
-+   VectorDiffusionIntegrator(int vector_dimension)
-+      : vdim(vector_dimension) {}
-+
-+   VectorDiffusionIntegrator(Coefficient &q)
-+      : Q(&q) {}
-+
-+   VectorDiffusionIntegrator(Coefficient &q, const IntegrationRule *ir)
-+      : BilinearFormIntegrator(ir), Q(&q) {}
-+
-+   /** \brief Integrator with scalar coefficient for caller-specified vector
-+       dimension.
-+
-+       The element matrix is block-diagonal with \c vdim copies of the element
-+       matrix integrated with the \c Coefficient.
-+
-+       If the vector dimension does not match the true dimension of the space,
-+       the resulting element matrix will be mathematically invalid. */
-+   VectorDiffusionIntegrator(Coefficient &q, int vector_dimension)
-+      : Q(&q), vdim(vector_dimension) {}
-+
-+   /** \brief Integrator with \c VectorCoefficient. The vector dimension of the
-+       \c FiniteElementSpace is assumed to be the same as the dimension of the
-+       \c Vector.
-+
-+       The element matrix is block-diagonal and each block is integrated with
-+       coefficient q_i.
-+
-+       If the vector dimension does not match the true dimension of the space,
-+       the resulting element matrix will be mathematically invalid. */
-+   VectorDiffusionIntegrator(VectorCoefficient &vq)
-+      : VQ(&vq), vdim(vq.GetVDim()) {}
-+
-+   /** \brief Integrator with \c MatrixCoefficient. The vector dimension of the
-+       \c FiniteElementSpace is assumed to be the same as the dimension of the
-+       \c Matrix.
-+
-+       The element matrix is populated in each block. Each block is integrated
-+       with coefficient q_ij.
-+
-+       If the vector dimension does not match the true dimension of the space,
-+       the resulting element matrix will be mathematically invalid. */
-+   VectorDiffusionIntegrator(MatrixCoefficient& mq)
-+      : MQ(&mq), vdim(mq.GetVDim()) {}
-+
-+   virtual bool SupportsCeed() const { return DeviceCanUseCeed(); }
-+
-+   using NonlinearFormIntegrator::GetRule;
-+   virtual const IntegrationRule &GetRule(const FiniteElement &trial_fe,
-+                                          const FiniteElement &test_fe,
-+                                          ElementTransformation &Trans) const
-+   { return DiffusionIntegrator::GetRuleStatic(trial_fe, test_fe, Trans); }
-+
-+   virtual void AssembleElementMatrix(const FiniteElement &el,
-+                                      ElementTransformation &Trans,
-+                                      DenseMatrix &elmat);
-+
-+   virtual void AssembleElementVector(const FiniteElement &el,
-+                                      ElementTransformation &Tr,
-+                                      const Vector &elfun, Vector &elvect);
-+
-+   using BilinearFormIntegrator::AssemblePA;
-+   virtual void AssemblePA(const FiniteElementSpace &fes);
-+
-+   using BilinearFormIntegrator::AssemblePABoundary;
-+   virtual void AssemblePABoundary(const FiniteElementSpace &fes);
-+
-+   virtual void AssembleDiagonalPA(Vector &diag);
-+
-+   virtual void AddMultPA(const Vector &x, Vector &y) const;
-+
-+   using BilinearFormIntegrator::AssembleMF;
-+   virtual void AssembleMF(const FiniteElementSpace &fes);
-+
-+   using BilinearFormIntegrator::AssembleMFBoundary;
-+   virtual void AssembleMFBoundary(const FiniteElementSpace &fes);
-+
-+   virtual void AssembleDiagonalMF(Vector &diag);
-+
-+   virtual void AddMultMF(const Vector &x, Vector &y) const;
-+};
-+
-+/** Class for integrating the bilinear form a(u,v) := (Q u, v),
-+    where u=(u1,...,un) and v=(v1,...,vn); ui and vi are defined
-+    by scalar FE through standard transformation. */
-+class VectorMassIntegrator: public BilinearFormIntegrator
-+{
-+private:
-+   int vdim;
-+   Vector shape, te_shape, vec;
-+   DenseMatrix partelmat;
-+   DenseMatrix mcoeff;
-+   int Q_order;
-+
-+protected:
-+   Coefficient *Q;
-+   VectorCoefficient *VQ;
-+   MatrixCoefficient *MQ;
-+
-+   // PA extension
-+   Vector pa_data;
-+   const DofToQuad *maps;         ///< Not owned
-+   const GeometricFactors *geom;  ///< Not owned
-+   int dim, ne, nq, dofs1D, quad1D;
-+
-+public:
-+   /// Construct an integrator with coefficient 1.0
-+   VectorMassIntegrator()
-+      : vdim(-1), Q_order(0), Q(NULL), VQ(NULL), MQ(NULL) {}
-    /** Construct an integrator with scalar coefficient q.  If possible, save
-        memory by using a scalar integrator since the resulting matrix is block
-        diagonal with the same diagonal block repeated. */
-@@ -2394,6 +2628,14 @@ public:
-    int GetVDim() const { return vdim; }
-    void SetVDim(int vdim_) { vdim = vdim_; }
- 
-+   virtual bool SupportsCeed() const { return DeviceCanUseCeed(); }
-+
-+   using NonlinearFormIntegrator::GetRule;
-+   virtual const IntegrationRule &GetRule(const FiniteElement &trial_fe,
-+                                          const FiniteElement &test_fe,
-+                                          ElementTransformation &Trans) const
-+   { return MassIntegrator::GetRuleStatic(trial_fe, test_fe, Trans, Q_order); }
-+
-    virtual void AssembleElementMatrix(const FiniteElement &el,
-                                       ElementTransformation &Trans,
-                                       DenseMatrix &elmat);
-@@ -2406,6 +2648,9 @@ public:
-    using BilinearFormIntegrator::AssemblePA;
-    virtual void AssemblePA(const FiniteElementSpace &fes);
- 
-+   using BilinearFormIntegrator::AssemblePABoundary;
-+   virtual void AssemblePABoundary(const FiniteElementSpace &fes);
-+
-    virtual void AssembleDiagonalPA(Vector &diag);
- 
-    virtual void AddMultPA(const Vector &x, Vector &y) const;
-@@ -2413,11 +2658,12 @@ public:
-    using BilinearFormIntegrator::AssembleMF;
-    virtual void AssembleMF(const FiniteElementSpace &fes);
- 
-+   using BilinearFormIntegrator::AssembleMFBoundary;
-+   virtual void AssembleMFBoundary(const FiniteElementSpace &fes);
-+
-    virtual void AssembleDiagonalMF(Vector &diag);
- 
-    virtual void AddMultMF(const Vector &x, Vector &y) const;
--
--   bool SupportsCeed() const { return DeviceCanUseCeed(); }
- };
- 
- /** Class for integrating (div u, p) where u is a vector field given by
-@@ -2430,14 +2676,14 @@ public:
-     ElementTransformation Trans. */
- class VectorFEDivergenceIntegrator : public BilinearFormIntegrator
- {
--protected:
--   Coefficient *Q;
--
- private:
- #ifndef MFEM_THREAD_SAFE
-    Vector divshape, shape;
- #endif
- 
-+protected:
-+   Coefficient *Q;
-+
-    // PA extension
-    Vector pa_data;
-    const DofToQuad *mapsO;         ///< Not owned. DOF-to-quad map, open.
-@@ -2449,9 +2695,10 @@ public:
-    VectorFEDivergenceIntegrator() { Q = NULL; }
-    VectorFEDivergenceIntegrator(Coefficient &q) { Q = &q; }
- 
--   virtual void AssembleElementMatrix(const FiniteElement &el,
--                                      ElementTransformation &Trans,
--                                      DenseMatrix &elmat) {}
-+   using NonlinearFormIntegrator::GetRule;
-+   virtual const IntegrationRule &GetRule(const FiniteElement &trial_fe,
-+                                          const FiniteElement &test_fe,
-+                                          ElementTransformation &Trans) const;
- 
-    virtual void AssembleElementMatrix2(const FiniteElement &trial_fe,
-                                        const FiniteElement &test_fe,
-@@ -2473,9 +2720,6 @@ public:
-     This is equivalent to a weak divergence of the Nedelec basis functions. */
- class VectorFEWeakDivergenceIntegrator: public BilinearFormIntegrator
- {
--protected:
--   Coefficient *Q;
--
- private:
- #ifndef MFEM_THREAD_SAFE
-    DenseMatrix dshape;
-@@ -2484,13 +2728,17 @@ private:
-    DenseMatrix invdfdx;
- #endif
- 
-+protected:
-+   Coefficient *Q;
-+
- public:
-    VectorFEWeakDivergenceIntegrator() { Q = NULL; }
-    VectorFEWeakDivergenceIntegrator(Coefficient &q) { Q = &q; }
- 
--   virtual void AssembleElementMatrix(const FiniteElement &el,
--                                      ElementTransformation &Trans,
--                                      DenseMatrix &elmat) {}
-+   using NonlinearFormIntegrator::GetRule;
-+   virtual const IntegrationRule &GetRule(const FiniteElement &trial_fe,
-+                                          const FiniteElement &test_fe,
-+                                          ElementTransformation &Trans) const;
- 
-    virtual void AssembleElementMatrix2(const FiniteElement &trial_fe,
-                                        const FiniteElement &test_fe,
-@@ -2502,9 +2750,6 @@ public:
-     test spaces are switched, assembles the form (u, curl v). */
- class VectorFECurlIntegrator: public BilinearFormIntegrator
- {
--protected:
--   Coefficient *Q;
--
- private:
- #ifndef MFEM_THREAD_SAFE
-    DenseMatrix curlshapeTrial;
-@@ -2512,13 +2757,17 @@ private:
-    DenseMatrix curlshapeTrial_dFT;
- #endif
- 
-+protected:
-+   Coefficient *Q;
-+
- public:
-    VectorFECurlIntegrator() { Q = NULL; }
-    VectorFECurlIntegrator(Coefficient &q) { Q = &q; }
- 
--   virtual void AssembleElementMatrix(const FiniteElement &el,
--                                      ElementTransformation &Trans,
--                                      DenseMatrix &elmat) {}
-+   using NonlinearFormIntegrator::GetRule;
-+   virtual const IntegrationRule &GetRule(const FiniteElement &trial_fe,
-+                                          const FiniteElement &test_fe,
-+                                          ElementTransformation &Trans) const;
- 
-    virtual void AssembleElementMatrix2(const FiniteElement &trial_fe,
-                                        const FiniteElement &test_fe,
-@@ -2540,6 +2789,11 @@ private:
- public:
-    DerivativeIntegrator(Coefficient &q, int i) : Q(&q), xi(i) {}
- 
-+   using NonlinearFormIntegrator::GetRule;
-+   virtual const IntegrationRule &GetRule(const FiniteElement &trial_fe,
-+                                          const FiniteElement &test_fe,
-+                                          ElementTransformation &Trans) const;
-+
-    virtual void AssembleElementMatrix(const FiniteElement &el,
-                                       ElementTransformation &Trans,
-                                       DenseMatrix &elmat)
-@@ -2587,6 +2841,13 @@ public:
-    CurlCurlIntegrator(MatrixCoefficient &mq, const IntegrationRule *ir = NULL) :
-       BilinearFormIntegrator(ir), Q(NULL), DQ(NULL), MQ(&mq) {}
- 
-+   virtual bool SupportsCeed() const { return DeviceCanUseCeed(); }
-+
-+   using NonlinearFormIntegrator::GetRule;
-+   virtual const IntegrationRule &GetRule(const FiniteElement &trial_fe,
-+                                          const FiniteElement &test_fe,
-+                                          ElementTransformation &Trans) const;
-+
-    /* Given a particular Finite Element, compute the
-       element curl-curl matrix elmat */
-    virtual void AssembleElementMatrix(const FiniteElement &el,
-@@ -2611,10 +2872,23 @@ public:
-    using BilinearFormIntegrator::AssemblePA;
-    virtual void AssemblePA(const FiniteElementSpace &fes);
- 
-+   using BilinearFormIntegrator::AssemblePABoundary;
-+   virtual void AssemblePABoundary(const FiniteElementSpace &fes);
-+
-    virtual void AssembleDiagonalPA(Vector &diag);
- 
-    virtual void AddMultPA(const Vector &x, Vector &y) const;
- 
-+   using BilinearFormIntegrator::AssembleMF;
-+   virtual void AssembleMF(const FiniteElementSpace &fes);
-+
-+   using BilinearFormIntegrator::AssembleMFBoundary;
-+   virtual void AssembleMFBoundary(const FiniteElementSpace &fes);
-+
-+   virtual void AssembleDiagonalMF(Vector &diag);
-+
-+   virtual void AddMultMF(const Vector &x, Vector &y) const;
-+
-    const Coefficient *GetCoefficient() const { return Q; }
- };
- 
-@@ -2632,13 +2906,18 @@ protected:
- 
- public:
-    VectorCurlCurlIntegrator() { Q = NULL; }
--
-    VectorCurlCurlIntegrator(Coefficient &q) : Q(&q) {}
- 
-+   using NonlinearFormIntegrator::GetRule;
-+   virtual const IntegrationRule &GetRule(const FiniteElement &trial_fe,
-+                                          const FiniteElement &test_fe,
-+                                          ElementTransformation &Trans) const;
-+
-    /// Assemble an element matrix
-    virtual void AssembleElementMatrix(const FiniteElement &el,
-                                       ElementTransformation &Trans,
-                                       DenseMatrix &elmat);
-+
-    /// Compute element energy: (1/2) (curl u, curl u)_E
-    virtual double GetElementEnergy(const FiniteElement &el,
-                                    ElementTransformation &Tr,
-@@ -2663,11 +2942,17 @@ private:
-    DenseMatrix dshape;
-    DenseMatrix curlshape;
-    DenseMatrix elmat_comp;
-+
- public:
-    MixedCurlIntegrator() : Q{NULL} {}
-    MixedCurlIntegrator(Coefficient *q_) :  Q{q_} {}
-    MixedCurlIntegrator(Coefficient &q) :  Q{&q} {}
- 
-+   using NonlinearFormIntegrator::GetRule;
-+   virtual const IntegrationRule &GetRule(const FiniteElement &trial_fe,
-+                                          const FiniteElement &test_fe,
-+                                          ElementTransformation &Trans) const;
-+
-    virtual void AssembleElementMatrix2(const FiniteElement &trial_fe,
-                                        const FiniteElement &test_fe,
-                                        ElementTransformation &Trans,
-@@ -2681,9 +2966,6 @@ public:
- class VectorFEMassIntegrator: public BilinearFormIntegrator
- {
- private:
--   void Init(Coefficient *q, DiagonalMatrixCoefficient *dq, MatrixCoefficient *mq)
--   { Q = q; DQ = dq; MQ = mq; }
--
- #ifndef MFEM_THREAD_SAFE
-    Vector shape;
-    Vector D;
-@@ -2693,6 +2975,9 @@ private:
-    DenseMatrix trial_vshape;
- #endif
- 
-+   void Init(Coefficient *q, DiagonalMatrixCoefficient *dq, MatrixCoefficient *mq)
-+   { Q = q; DQ = dq; MQ = mq; }
-+
- protected:
-    Coefficient *Q;
-    DiagonalMatrixCoefficient *DQ;
-@@ -2717,6 +3002,13 @@ public:
-    VectorFEMassIntegrator(MatrixCoefficient *mq_) { Init(NULL, NULL, mq_); }
-    VectorFEMassIntegrator(MatrixCoefficient &mq) { Init(NULL, NULL, &mq); }
- 
-+   virtual bool SupportsCeed() const { return DeviceCanUseCeed(); }
-+
-+   using NonlinearFormIntegrator::GetRule;
-+   virtual const IntegrationRule &GetRule(const FiniteElement &trial_fe,
-+                                          const FiniteElement &test_fe,
-+                                          ElementTransformation &Trans) const;
-+
-    virtual void AssembleElementMatrix(const FiniteElement &el,
-                                       ElementTransformation &Trans,
-                                       DenseMatrix &elmat);
-@@ -2730,12 +3022,25 @@ public:
-    virtual void AssemblePA(const FiniteElementSpace &trial_fes,
-                            const FiniteElementSpace &test_fes);
- 
-+   using BilinearFormIntegrator::AssemblePABoundary;
-+   virtual void AssemblePABoundary(const FiniteElementSpace &fes);
-+
-    virtual void AssembleDiagonalPA(Vector &diag);
- 
-    virtual void AddMultPA(const Vector &x, Vector &y) const;
- 
-    virtual void AddMultTransposePA(const Vector &x, Vector &y) const;
- 
-+   using BilinearFormIntegrator::AssembleMF;
-+   virtual void AssembleMF(const FiniteElementSpace &fes);
-+
-+   using BilinearFormIntegrator::AssembleMFBoundary;
-+   virtual void AssembleMFBoundary(const FiniteElementSpace &fes);
-+
-+   virtual void AssembleDiagonalMF(Vector &diag);
-+
-+   virtual void AddMultMF(const Vector &x, Vector &y) const;
-+
-    const Coefficient *GetCoefficient() const { return Q; }
- };
- 
-@@ -2762,14 +3067,16 @@ private:
- 
- public:
-    VectorDivergenceIntegrator() :
--      Q(NULL), trial_maps(NULL), test_maps(NULL), geom(NULL)
--   {  }
-+      Q(NULL), trial_maps(NULL), test_maps(NULL), geom(NULL) {}
-    VectorDivergenceIntegrator(Coefficient *q_) :
--      Q(q_), trial_maps(NULL), test_maps(NULL), geom(NULL)
--   {}
-+      Q(q_), trial_maps(NULL), test_maps(NULL), geom(NULL) {}
-    VectorDivergenceIntegrator(Coefficient &q) :
--      Q(&q), trial_maps(NULL), test_maps(NULL), geom(NULL)
--   {}
-+      Q(&q), trial_maps(NULL), test_maps(NULL), geom(NULL) {}
-+
-+   using NonlinearFormIntegrator::GetRule;
-+   virtual const IntegrationRule &GetRule(const FiniteElement &trial_fe,
-+                                          const FiniteElement &test_fe,
-+                                          ElementTransformation &Trans) const;
- 
-    virtual void AssembleElementMatrix2(const FiniteElement &trial_fe,
-                                        const FiniteElement &test_fe,
-@@ -2783,10 +3090,6 @@ public:
-    virtual void AddMultPA(const Vector &x, Vector &y) const;
- 
-    virtual void AddMultTransposePA(const Vector &x, Vector &y) const;
--
--   static const IntegrationRule &GetRule(const FiniteElement &trial_fe,
--                                         const FiniteElement &test_fe,
--                                         ElementTransformation &Trans);
- };
- 
- /// (Q div u, div v) for RT elements
-@@ -2812,6 +3115,13 @@ public:
-    DivDivIntegrator(Coefficient &q, const IntegrationRule *ir = NULL) :
-       BilinearFormIntegrator(ir), Q(&q) {}
- 
-+   virtual bool SupportsCeed() const { return DeviceCanUseCeed(); }
-+
-+   using NonlinearFormIntegrator::GetRule;
-+   virtual const IntegrationRule &GetRule(const FiniteElement &trial_fe,
-+                                          const FiniteElement &test_fe,
-+                                          ElementTransformation &Trans) const;
-+
-    virtual void AssembleElementMatrix(const FiniteElement &el,
-                                       ElementTransformation &Trans,
-                                       DenseMatrix &elmat);
-@@ -2824,107 +3134,8 @@ public:
-    using BilinearFormIntegrator::AssemblePA;
-    virtual void AssemblePA(const FiniteElementSpace &fes);
- 
--   virtual void AssembleDiagonalPA(Vector &diag);
--
--   virtual void AddMultPA(const Vector &x, Vector &y) const;
--
--   const Coefficient *GetCoefficient() const { return Q; }
--};
--
--/** Integrator for
--
--      (Q grad u, grad v) = sum_i (Q grad u_i, grad v_i) e_i e_i^T
--
--    for vector FE spaces, where e_i is the unit vector in the i-th direction.
--    The resulting local element matrix is square, of size <tt> vdim*dof </tt>,
--    where \c vdim is the vector dimension space and \c dof is the local degrees
--    of freedom. The integrator is not aware of the true vector dimension and
--    must use \c VectorCoefficient, \c MatrixCoefficient, or a caller-specified
--    value to determine the vector space. For a scalar coefficient, the caller
--    may manually specify the vector dimension or the vector dimension is assumed
--    to be the spatial dimension (i.e. 2-dimension or 3-dimension).
--*/
--class VectorDiffusionIntegrator : public BilinearFormIntegrator
--{
--protected:
--   Coefficient *Q = NULL;
--   VectorCoefficient *VQ = NULL;
--   MatrixCoefficient *MQ = NULL;
--
--   // PA extension
--   const DofToQuad *maps;         ///< Not owned
--   const GeometricFactors *geom;  ///< Not owned
--   int dim, sdim, ne, dofs1D, quad1D;
--   Vector pa_data;
--
--private:
--   DenseMatrix dshape, dshapedxt, pelmat;
--   int vdim = -1;
--   DenseMatrix mcoeff;
--   Vector vcoeff;
--
--public:
--   VectorDiffusionIntegrator() {}
--
--   /** \brief Integrator with unit coefficient for caller-specified vector
--       dimension.
--
--       If the vector dimension does not match the true dimension of the space,
--       the resulting element matrix will be mathematically invalid. */
--   VectorDiffusionIntegrator(int vector_dimension)
--      : vdim(vector_dimension) {}
--
--   VectorDiffusionIntegrator(Coefficient &q)
--      : Q(&q) {}
--
--   VectorDiffusionIntegrator(Coefficient &q, const IntegrationRule *ir)
--      : BilinearFormIntegrator(ir), Q(&q) {}
--
--   /** \brief Integrator with scalar coefficient for caller-specified vector
--       dimension.
--
--       The element matrix is block-diagonal with \c vdim copies of the element
--       matrix integrated with the \c Coefficient.
--
--       If the vector dimension does not match the true dimension of the space,
--       the resulting element matrix will be mathematically invalid. */
--   VectorDiffusionIntegrator(Coefficient &q, int vector_dimension)
--      : Q(&q), vdim(vector_dimension) {}
--
--   /** \brief Integrator with \c VectorCoefficient. The vector dimension of the
--       \c FiniteElementSpace is assumed to be the same as the dimension of the
--       \c Vector.
--
--       The element matrix is block-diagonal and each block is integrated with
--       coefficient q_i.
--
--       If the vector dimension does not match the true dimension of the space,
--       the resulting element matrix will be mathematically invalid. */
--   VectorDiffusionIntegrator(VectorCoefficient &vq)
--      : VQ(&vq), vdim(vq.GetVDim()) {}
--
--   /** \brief Integrator with \c MatrixCoefficient. The vector dimension of the
--       \c FiniteElementSpace is assumed to be the same as the dimension of the
--       \c Matrix.
--
--       The element matrix is populated in each block. Each block is integrated
--       with coefficient q_ij.
--
--       If the vector dimension does not match the true dimension of the space,
--       the resulting element matrix will be mathematically invalid. */
--   VectorDiffusionIntegrator(MatrixCoefficient& mq)
--      : MQ(&mq), vdim(mq.GetVDim()) {}
--
--   virtual void AssembleElementMatrix(const FiniteElement &el,
--                                      ElementTransformation &Trans,
--                                      DenseMatrix &elmat);
--
--   virtual void AssembleElementVector(const FiniteElement &el,
--                                      ElementTransformation &Tr,
--                                      const Vector &elfun, Vector &elvect);
--
--   using BilinearFormIntegrator::AssemblePA;
--   virtual void AssemblePA(const FiniteElementSpace &fes);
-+   using BilinearFormIntegrator::AssemblePABoundary;
-+   virtual void AssemblePABoundary(const FiniteElementSpace &fes);
- 
-    virtual void AssembleDiagonalPA(Vector &diag);
- 
-@@ -2933,11 +3144,14 @@ public:
-    using BilinearFormIntegrator::AssembleMF;
-    virtual void AssembleMF(const FiniteElementSpace &fes);
- 
-+   using BilinearFormIntegrator::AssembleMFBoundary;
-+   virtual void AssembleMFBoundary(const FiniteElementSpace &fes);
-+
-    virtual void AssembleDiagonalMF(Vector &diag);
- 
-    virtual void AddMultMF(const Vector &x, Vector &y) const;
- 
--   bool SupportsCeed() const { return DeviceCanUseCeed(); }
-+   const Coefficient *GetCoefficient() const { return Q; }
- };
- 
- /** Integrator for the linear elasticity form:
-@@ -2966,6 +3180,11 @@ public:
-    ElasticityIntegrator(Coefficient &m, double q_l, double q_m)
-    { lambda = NULL; mu = &m; q_lambda = q_l; q_mu = q_m; }
- 
-+   using NonlinearFormIntegrator::GetRule;
-+   virtual const IntegrationRule &GetRule(const FiniteElement &trial_fe,
-+                                          const FiniteElement &test_fe,
-+                                          ElementTransformation &Trans) const;
-+
-    virtual void AssembleElementMatrix(const FiniteElement &,
-                                       ElementTransformation &,
-                                       DenseMatrix &);
-@@ -3054,7 +3273,11 @@ public:
-                      double a, double b)
-    { rho = &rho_; u = &u_; alpha = a; beta = b; }
- 
--   using BilinearFormIntegrator::AssembleFaceMatrix;
-+   using NonlinearFormIntegrator::GetRule;
-+   virtual const IntegrationRule &GetRule(const FiniteElement &el1,
-+                                          const FiniteElement &el2,
-+                                          FaceElementTransformations &Trans) const;
-+
-    virtual void AssembleFaceMatrix(const FiniteElement &el1,
-                                    const FiniteElement &el2,
-                                    FaceElementTransformations &Trans,
-@@ -3075,9 +3298,6 @@ public:
-    virtual void AssembleEABoundaryFaces(const FiniteElementSpace& fes,
-                                         Vector &ea_data_bdr);
- 
--   static const IntegrationRule &GetRule(Geometry::Type geom, int order,
--                                         FaceElementTransformations &T);
--
- private:
-    void SetupPA(const FiniteElementSpace &fes, FaceType type);
- };
-@@ -3137,7 +3357,16 @@ public:
-    DGDiffusionIntegrator(MatrixCoefficient &q, const double s, const double k)
-       : Q(NULL), MQ(&q), sigma(s), kappa(k) {}
- 
--   using BilinearFormIntegrator::AssembleFaceMatrix;
-+   static const IntegrationRule &GetRuleStatic(const FiniteElement &el1,
-+                                               const FiniteElement &el2,
-+                                               FaceElementTransformations &Trans);
-+
-+   using NonlinearFormIntegrator::GetRule;
-+   virtual const IntegrationRule &GetRule(const FiniteElement &el1,
-+                                          const FiniteElement &el2,
-+                                          FaceElementTransformations &Trans) const
-+   { return GetRuleStatic(el1, el2, Trans); }
-+
-    virtual void AssembleFaceMatrix(const FiniteElement &el1,
-                                    const FiniteElement &el2,
-                                    FaceElementTransformations &Trans,
-@@ -3197,7 +3426,12 @@ public:
-    MFEM_DEPRECATED DGDiffusionBR2Integrator(class FiniteElementSpace *fes,
-                                             double e = 1.0);
- 
--   using BilinearFormIntegrator::AssembleFaceMatrix;
-+   using NonlinearFormIntegrator::GetRule;
-+   virtual const IntegrationRule &GetRule(const FiniteElement &el1,
-+                                          const FiniteElement &el2,
-+                                          FaceElementTransformations &Trans) const
-+   { return DGDiffusionIntegrator::GetRuleStatic(el1, el2, Trans); }
-+
-    virtual void AssembleFaceMatrix(const FiniteElement &el1,
-                                    const FiniteElement &el2,
-                                    FaceElementTransformations &Trans,
-@@ -3274,7 +3508,12 @@ public:
-                           double alpha_, double kappa_)
-       : lambda(&lambda_), mu(&mu_), alpha(alpha_), kappa(kappa_) {}
- 
--   using BilinearFormIntegrator::AssembleFaceMatrix;
-+   using NonlinearFormIntegrator::GetRule;
-+   virtual const IntegrationRule &GetRule(const FiniteElement &el1,
-+                                          const FiniteElement &el2,
-+                                          FaceElementTransformations &Trans) const
-+   { return DGDiffusionIntegrator::GetRuleStatic(el1, el2, Trans); }
-+
-    virtual void AssembleFaceMatrix(const FiniteElement &el1,
-                                    const FiniteElement &el2,
-                                    FaceElementTransformations &Trans,
-@@ -3325,12 +3564,11 @@ private:
- public:
-    TraceJumpIntegrator() {}
- 
--   using BilinearFormIntegrator::AssembleFaceMatrix;
--   virtual void AssembleFaceMatrix(const FiniteElement &trial_face_fe,
--                                   const FiniteElement &test_fe1,
--                                   const FiniteElement &test_fe2,
--                                   FaceElementTransformations &Trans,
--                                   DenseMatrix &elmat);
-+   virtual void AssembleFaceMatrix2(const FiniteElement &trial_face_fe,
-+                                    const FiniteElement &test_fe1,
-+                                    const FiniteElement &test_fe2,
-+                                    FaceElementTransformations &Trans,
-+                                    DenseMatrix &elmat);
- };
- 
- /** Integrator for the form: < v, [w.n] > over all faces (the interface) where
-@@ -3345,12 +3583,11 @@ private:
- public:
-    NormalTraceJumpIntegrator() {}
- 
--   using BilinearFormIntegrator::AssembleFaceMatrix;
--   virtual void AssembleFaceMatrix(const FiniteElement &trial_face_fe,
--                                   const FiniteElement &test_fe1,
--                                   const FiniteElement &test_fe2,
--                                   FaceElementTransformations &Trans,
--                                   DenseMatrix &elmat);
-+   virtual void AssembleFaceMatrix2(const FiniteElement &trial_face_fe,
-+                                    const FiniteElement &test_fe1,
-+                                    const FiniteElement &test_fe2,
-+                                    FaceElementTransformations &Trans,
-+                                    DenseMatrix &elmat);
- };
- 
- /** Integrator for the DPG form: < v, w > over a face (the interface) where
-@@ -3361,13 +3598,15 @@ class TraceIntegrator : public BilinearFormIntegrator
- {
- private:
-    Vector face_shape, shape;
-+
- public:
--   TraceIntegrator() { }
--   void AssembleTraceFaceMatrix(int elem,
--                                const FiniteElement &trial_face_fe,
--                                const FiniteElement &test_fe,
--                                FaceElementTransformations &Trans,
--                                DenseMatrix &elmat);
-+   TraceIntegrator() {}
-+
-+   virtual void AssembleTraceFaceMatrix(int elem,
-+                                        const FiniteElement &trial_face_fe,
-+                                        const FiniteElement &test_fe,
-+                                        FaceElementTransformations &Trans,
-+                                        DenseMatrix &elmat);
- };
- 
- /** Integrator for the form: < v, w.n > over a face (the interface) where
-@@ -3380,15 +3619,15 @@ private:
-    DenseMatrix shape;
- 
- public:
--   NormalTraceIntegrator() { }
--   virtual void AssembleTraceFaceMatrix(int ielem,
-+   NormalTraceIntegrator() {}
-+
-+   virtual void AssembleTraceFaceMatrix(int elem,
-                                         const FiniteElement &trial_face_fe,
-                                         const FiniteElement &test_fe,
-                                         FaceElementTransformations &Trans,
-                                         DenseMatrix &elmat);
- };
- 
--
- /** Integrator for the form: < v, w × n > over a face (the interface)
-  *  In 3D the trial variable v is defined on the interface (H^-1/2(curl), trace of H(curl))
-  *  In 2D it's defined on the interface (H^1/2, trace of H1)
-@@ -3426,17 +3665,30 @@ private:
-    }
- 
- public:
--   TangentTraceIntegrator() { }
--   void AssembleTraceFaceMatrix(int elem,
--                                const FiniteElement &trial_face_fe,
--                                const FiniteElement &test_fe,
--                                FaceElementTransformations &Trans,
--                                DenseMatrix &elmat);
-+   TangentTraceIntegrator() {}
-+
-+   virtual void AssembleTraceFaceMatrix(int elem,
-+                                        const FiniteElement &trial_face_fe,
-+                                        const FiniteElement &test_fe,
-+                                        FaceElementTransformations &Trans,
-+                                        DenseMatrix &elmat);
- };
- 
- /** Abstract class to serve as a base for local interpolators to be used in the
-     DiscreteLinearOperator class. */
--class DiscreteInterpolator : public BilinearFormIntegrator {};
-+class DiscreteInterpolator : public BilinearFormIntegrator
-+{
-+public:
-+   // This avoids an error when GetRule is called with an interpolator even if
-+   // it is never used.
-+   using NonlinearFormIntegrator::GetRule;
-+   virtual const IntegrationRule &GetRule(const FiniteElement &trial_fe,
-+                                          const FiniteElement &test_fe,
-+                                          ElementTransformation &Trans) const
-+   {
-+      return IntRules.Get(0, 0);
-+   }
-+};
- 
- /** Class for constructing the gradient as a DiscreteLinearOperator from an
-     H1-conforming space to an H(curl)-conforming space. The range space can be
-@@ -3447,17 +3699,14 @@ public:
-    GradientInterpolator() : dofquad_fe(NULL) {}
-    virtual ~GradientInterpolator() { delete dofquad_fe; }
- 
-+   virtual bool SupportsCeed() const { return DeviceCanUseCeed(); }
-+
-    virtual void AssembleElementMatrix2(const FiniteElement &h1_fe,
-                                        const FiniteElement &nd_fe,
-                                        ElementTransformation &Trans,
-                                        DenseMatrix &elmat)
-    { nd_fe.ProjectGrad(h1_fe, Trans, elmat); }
- 
--   /** @brief Setup method for PA data.
--
--       @param[in] trial_fes   H1 Lagrange space
--       @param[in] test_fes    H(curl) Nedelec space
--    */
-    using BilinearFormIntegrator::AssemblePA;
-    virtual void AssemblePA(const FiniteElementSpace &trial_fes,
-                            const FiniteElementSpace &test_fes);
-@@ -3469,7 +3718,6 @@ public:
- private:
-    /// 1D finite element that generates and owns the 1D DofToQuad maps below
-    FiniteElement *dofquad_fe;
--
-    bool B_id; // is the B basis operator (maps_C_C) the identity?
-    const DofToQuad *maps_C_C; // one-d map with Lobatto rows, Lobatto columns
-    const DofToQuad *maps_O_C; // one-d map with Legendre rows, Lobatto columns
-@@ -3482,7 +3730,9 @@ private:
- class IdentityInterpolator : public DiscreteInterpolator
- {
- public:
--   IdentityInterpolator(): dofquad_fe(NULL) { }
-+   IdentityInterpolator(): dofquad_fe(NULL) {}
-+
-+   virtual bool SupportsCeed() const { return DeviceCanUseCeed(); }
- 
-    virtual void AssembleElementMatrix2(const FiniteElement &dom_fe,
-                                        const FiniteElement &ran_fe,
-@@ -3503,11 +3753,9 @@ public:
- private:
-    /// 1D finite element that generates and owns the 1D DofToQuad maps below
-    FiniteElement *dofquad_fe;
--
-    const DofToQuad *maps_C_C; // one-d map with Lobatto rows, Lobatto columns
-    const DofToQuad *maps_O_C; // one-d map with Legendre rows, Lobatto columns
-    int dim, ne, o_dofs1D, c_dofs1D;
--
-    Vector pa_data;
- };
- 
-@@ -3517,11 +3765,21 @@ private:
- class CurlInterpolator : public DiscreteInterpolator
- {
- public:
-+   virtual bool SupportsCeed() const { return DeviceCanUseCeed(); }
-+
-    virtual void AssembleElementMatrix2(const FiniteElement &dom_fe,
-                                        const FiniteElement &ran_fe,
-                                        ElementTransformation &Trans,
-                                        DenseMatrix &elmat)
-    { ran_fe.ProjectCurl(dom_fe, Trans, elmat); }
-+
-+   using BilinearFormIntegrator::AssemblePA;
-+   virtual void AssemblePA(const FiniteElementSpace &trial_fes,
-+                           const FiniteElementSpace &test_fes);
-+
-+   virtual void AddMultPA(const Vector &x, Vector &y) const;
-+
-+   virtual void AddMultTransposePA(const Vector &x, Vector &y) const;
- };
- 
- /** Class for constructing the (local) discrete divergence matrix which can
-@@ -3577,8 +3835,7 @@ protected:
- class ScalarVectorProductInterpolator : public DiscreteInterpolator
- {
- public:
--   ScalarVectorProductInterpolator(Coefficient &sc)
--      : Q(&sc) {}
-+   ScalarVectorProductInterpolator(Coefficient &sc) : Q(&sc) {}
- 
-    virtual void AssembleElementMatrix2(const FiniteElement &dom_fe,
-                                        const FiniteElement &ran_fe,
-@@ -3595,8 +3852,7 @@ protected:
- class VectorScalarProductInterpolator : public DiscreteInterpolator
- {
- public:
--   VectorScalarProductInterpolator(VectorCoefficient &vc)
--      : VQ(&vc) {}
-+   VectorScalarProductInterpolator(VectorCoefficient &vc) : VQ(&vc) {}
- 
-    virtual void AssembleElementMatrix2(const FiniteElement &dom_fe,
-                                        const FiniteElement &ran_fe,
-@@ -3630,8 +3886,7 @@ protected:
- class VectorCrossProductInterpolator : public DiscreteInterpolator
- {
- public:
--   VectorCrossProductInterpolator(VectorCoefficient &vc)
--      : VQ(&vc) {}
-+   VectorCrossProductInterpolator(VectorCoefficient &vc) : VQ(&vc) {}
- 
-    virtual void AssembleElementMatrix2(const FiniteElement &nd_fe,
-                                        const FiniteElement &rt_fe,
-diff --git a/fem/ceed/integrators/convection/convection.cpp b/fem/ceed/integrators/convection/convection.cpp
-index c5560f354..c980123ba 100644
---- a/fem/ceed/integrators/convection/convection.cpp
-+++ b/fem/ceed/integrators/convection/convection.cpp
-@@ -25,81 +25,94 @@ namespace ceed
- #ifdef MFEM_USE_CEED
- struct ConvectionOperatorInfo : public OperatorInfo
- {
--   ConvectionContext ctx;
--   ConvectionOperatorInfo(int dim, double alpha)
-+   ConvectionContext ctx = {0};
-+   ConvectionOperatorInfo(const mfem::FiniteElementSpace &fes,
-+                          mfem::VectorCoefficient *VQ, double alpha,
-+                          bool use_bdr = false, bool use_mf = false)
-    {
-+      MFEM_VERIFY(VQ && VQ->GetVDim() == fes.GetMesh()->SpaceDimension(),
-+                  "Incorrect coefficient dimensions in ceed::ConvectionOperatorInfo!");
-+      ctx.dim = fes.GetMesh()->Dimension() - use_bdr;
-+      ctx.space_dim = fes.GetMesh()->SpaceDimension();
-+      ctx.alpha = alpha;
-+      if (!use_mf)
-+      {
-+         apply_func = ":f_apply_conv";
-+         apply_qf = &f_apply_conv;
-+      }
-+      else
-+      {
-+         build_func = "";
-+         build_qf = nullptr;
-+      }
-+      if (mfem::VectorConstantCoefficient *const_coeff =
-+             dynamic_cast<mfem::VectorConstantCoefficient *>(VQ))
-+      {
-+         const int vdim = VQ->GetVDim();
-+         MFEM_VERIFY(vdim <= LIBCEED_CONV_COEFF_COMP_MAX,
-+                     "VectorCoefficient dimension exceeds context storage!");
-+         const mfem::Vector &val = const_coeff->GetVec();
-+         for (int i = 0; i < vdim; i++)
-+         {
-+            ctx.coeff[i] = val[i];
-+         }
-+         if (!use_mf)
-+         {
-+            build_func = ":f_build_conv_const";
-+            build_qf = &f_build_conv_const;
-+         }
-+         else
-+         {
-+            apply_func = ":f_apply_conv_mf_const";
-+            apply_qf = &f_apply_conv_mf_const;
-+         }
-+      }
-+      else
-+      {
-+         if (!use_mf)
-+         {
-+            build_func = ":f_build_conv_quad";
-+            build_qf = &f_build_conv_quad;
-+         }
-+         else
-+         {
-+            apply_func = ":f_apply_conv_mf_quad";
-+            apply_qf = &f_apply_conv_mf_quad;
-+         }
-+      }
-       header = "/integrators/convection/convection_qf.h";
--      build_func_const = ":f_build_conv_const";
--      build_qf_const = &f_build_conv_const;
--      build_func_quad = ":f_build_conv_quad";
--      build_qf_quad = &f_build_conv_quad;
--      apply_func = ":f_apply_conv";
--      apply_qf = &f_apply_conv;
--      apply_func_mf_const = ":f_apply_conv_mf_const";
--      apply_qf_mf_const = &f_apply_conv_mf_const;
--      apply_func_mf_quad = ":f_apply_conv_mf_quad";
--      apply_qf_mf_quad = &f_apply_conv_mf_quad;
-       trial_op = EvalMode::Grad;
-       test_op = EvalMode::Interp;
--      qdatasize = dim * (dim + 1) / 2;
--      ctx.alpha = alpha;
-+      qdatasize = ctx.dim;
-    }
- };
- #endif
- 
- PAConvectionIntegrator::PAConvectionIntegrator(
-+   const mfem::ConvectionIntegrator &integ,
-    const mfem::FiniteElementSpace &fes,
--   const mfem::IntegrationRule &irm,
--   mfem::VectorCoefficient *Q,
--   const double alpha)
--   : PAIntegrator()
--{
--#ifdef MFEM_USE_CEED
--   ConvectionOperatorInfo info(fes.GetMesh()->Dimension(), alpha);
--   Assemble(info, fes, irm, Q);
--#else
--   MFEM_ABORT("MFEM must be built with MFEM_USE_CEED=YES to use libCEED.");
--#endif
--}
--
--MixedPAConvectionIntegrator::MixedPAConvectionIntegrator(
--   const ConvectionIntegrator &integ,
--   const mfem::FiniteElementSpace &fes,
--   mfem::VectorCoefficient *Q,
--   const double alpha)
-+   mfem::VectorCoefficient *VQ,
-+   const double alpha,
-+   const bool use_bdr)
- {
- #ifdef MFEM_USE_CEED
--   ConvectionOperatorInfo info(fes.GetMesh()->Dimension(), alpha);
--   Assemble(integ, info, fes, Q);
-+   ConvectionOperatorInfo info(fes, VQ, alpha, use_bdr);
-+   Assemble(integ, info, fes, VQ, use_bdr);
- #else
-    MFEM_ABORT("MFEM must be built with MFEM_USE_CEED=YES to use libCEED.");
- #endif
- }
- 
- MFConvectionIntegrator::MFConvectionIntegrator(
-+   const mfem::ConvectionIntegrator &integ,
-    const mfem::FiniteElementSpace &fes,
--   const mfem::IntegrationRule &irm,
--   mfem::VectorCoefficient *Q,
--   const double alpha)
--   : MFIntegrator()
--{
--#ifdef MFEM_USE_CEED
--   ConvectionOperatorInfo info(fes.GetMesh()->Dimension(), alpha);
--   Assemble(info, fes, irm, Q);
--#else
--   MFEM_ABORT("MFEM must be built with MFEM_USE_CEED=YES to use libCEED.");
--#endif
--}
--
--MixedMFConvectionIntegrator::MixedMFConvectionIntegrator(
--   const ConvectionIntegrator &integ,
--   const mfem::FiniteElementSpace &fes,
--   mfem::VectorCoefficient *Q,
--   const double alpha)
-+   mfem::VectorCoefficient *VQ,
-+   const double alpha,
-+   const bool use_bdr)
- {
- #ifdef MFEM_USE_CEED
--   ConvectionOperatorInfo info(fes.GetMesh()->Dimension(), alpha);
--   Assemble(integ, info, fes, Q);
-+   ConvectionOperatorInfo info(fes, VQ, alpha, use_bdr, true);
-+   Assemble(integ, info, fes, VQ, use_bdr, true);
- #else
-    MFEM_ABORT("MFEM must be built with MFEM_USE_CEED=YES to use libCEED.");
- #endif
-diff --git a/fem/ceed/integrators/convection/convection.hpp b/fem/ceed/integrators/convection/convection.hpp
-index 1cd968770..713b98c2c 100644
---- a/fem/ceed/integrators/convection/convection.hpp
-+++ b/fem/ceed/integrators/convection/convection.hpp
-@@ -13,7 +13,7 @@
- #define MFEM_LIBCEED_CONV_HPP
- 
- #include "../../interface/integrator.hpp"
--#include "../../interface/mixed_integrator.hpp"
-+#include "../../interface/mixed_operator.hpp"
- #include "../../../fespace.hpp"
- 
- namespace mfem
-@@ -23,41 +23,25 @@ namespace ceed
- {
- 
- /// Represent a ConvectionIntegrator with AssemblyLevel::Partial using libCEED.
--class PAConvectionIntegrator : public PAIntegrator
-+class PAConvectionIntegrator : public MixedOperator<Integrator>
- {
- public:
--   PAConvectionIntegrator(const mfem::FiniteElementSpace &fes,
--                          const mfem::IntegrationRule &ir,
--                          mfem::VectorCoefficient *Q,
--                          const double alpha);
--};
--
--class MixedPAConvectionIntegrator : public MixedIntegrator<PAIntegrator>
--{
--public:
--   MixedPAConvectionIntegrator(const ConvectionIntegrator &integ,
--                               const mfem::FiniteElementSpace &fes,
--                               mfem::VectorCoefficient *Q,
--                               const double alpha);
-+   PAConvectionIntegrator(const mfem::ConvectionIntegrator &integ,
-+                          const mfem::FiniteElementSpace &fes,
-+                          mfem::VectorCoefficient *VQ,
-+                          const double alpha,
-+                          const bool use_bdr = false);
- };
- 
- /// Represent a ConvectionIntegrator with AssemblyLevel::None using libCEED.
--class MFConvectionIntegrator : public MFIntegrator
--{
--public:
--   MFConvectionIntegrator(const mfem::FiniteElementSpace &fes,
--                          const mfem::IntegrationRule &ir,
--                          mfem::VectorCoefficient *Q,
--                          const double alpha);
--};
--
--class MixedMFConvectionIntegrator : public MixedIntegrator<MFIntegrator>
-+class MFConvectionIntegrator : public MixedOperator<Integrator>
- {
- public:
--   MixedMFConvectionIntegrator(const ConvectionIntegrator &integ,
--                               const mfem::FiniteElementSpace &fes,
--                               mfem::VectorCoefficient *Q,
--                               const double alpha);
-+   MFConvectionIntegrator(const mfem::ConvectionIntegrator &integ,
-+                          const mfem::FiniteElementSpace &fes,
-+                          mfem::VectorCoefficient *VQ,
-+                          const double alpha,
-+                          const bool use_bdr = false);
- };
- 
- }
-diff --git a/fem/ceed/integrators/convection/convection_qf.h b/fem/ceed/integrators/convection/convection_qf.h
-index 68e96895e..0dd11387c 100644
---- a/fem/ceed/integrators/convection/convection_qf.h
-+++ b/fem/ceed/integrators/convection/convection_qf.h
-@@ -9,207 +9,151 @@
- // terms of the BSD-3 license. We welcome feedback and contributions, see file
- // CONTRIBUTING.md for details.
- 
--/// A structure used to pass additional data to f_build_conv and f_apply_conv
--struct ConvectionContext {
--   CeedInt dim, space_dim, vdim;
--   CeedScalar coeff[3];
-+#ifndef MFEM_LIBCEED_CONV_QF_H
-+#define MFEM_LIBCEED_CONV_QF_H
-+
-+#include "../util/util_qf.h"
-+
-+#define LIBCEED_CONV_COEFF_COMP_MAX 3
-+
-+struct ConvectionContext
-+{
-+   CeedInt dim, space_dim;
-    CeedScalar alpha;
-+   CeedScalar coeff[LIBCEED_CONV_COEFF_COMP_MAX];
- };
- 
--/// libCEED Q-function for building quadrature data for a convection operator
-+/// libCEED QFunction for building quadrature data for a convection operator
- /// with a constant coefficient
- CEED_QFUNCTION(f_build_conv_const)(void *ctx, CeedInt Q,
-                                    const CeedScalar *const *in,
-                                    CeedScalar *const *out)
- {
--   ConvectionContext *bc = (ConvectionContext*)ctx;
--   // in[0] is Jacobians with shape [dim, nc=dim, Q]
-+   ConvectionContext *bc = (ConvectionContext *)ctx;
-+   // in[0] is Jacobians with shape [dim, ncomp=space_dim, Q]
-    // in[1] is quadrature weights, size (Q)
-    //
--   // At every quadrature point, compute and store qw * adj(J).
--   const CeedScalar coeff0 = bc->coeff[0];
--   const CeedScalar coeff1 = bc->coeff[1];
--   const CeedScalar coeff2 = bc->coeff[2];
-+   // At every quadrature point, compute and store qw * α * c^T adj(J)^T
-    const CeedScalar alpha  = bc->alpha;
-+   const CeedScalar *coeff = bc->coeff;
-    const CeedScalar *J = in[0], *qw = in[1];
-    CeedScalar *qd = out[0];
--   switch (bc->dim + 10 * bc->space_dim)
-+   switch (10 * bc->space_dim + bc->dim)
-    {
-       case 11:
--         for (CeedInt i = 0; i < Q; i++)
-+         CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++)
-          {
--            qd[i] = alpha * coeff0 * qw[i] * J[i];
-+            const CeedScalar coeff0 = coeff[0];
-+            qd[i] = qw[i] * alpha * coeff0 * J[i];
-+         }
-+         break;
-+      case 21:
-+         CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++)
-+         {
-+            MultCtAdjJt21(J + i, Q, coeff, 1, qw[i] * alpha, Q, qd + i);
-          }
-          break;
-       case 22:
--         for (CeedInt i = 0; i < Q; i++)
-+         CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++)
-+         {
-+            MultCtAdjJt22(J + i, Q, coeff, 1, qw[i] * alpha, Q, qd + i);
-+         }
-+         break;
-+      case 32:
-+         CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++)
-          {
--            // J: 0 2   qd: 0 1   adj(J):  J22 -J12
--            //    1 3       1 2           -J21  J11
--            const CeedScalar J11 = J[i + Q * 0];
--            const CeedScalar J21 = J[i + Q * 1];
--            const CeedScalar J12 = J[i + Q * 2];
--            const CeedScalar J22 = J[i + Q * 3];
--            const CeedScalar w = alpha * qw[i];
--            const CeedScalar wx = w * coeff0;
--            const CeedScalar wy = w * coeff1;
--            qd[i + Q * 0] =  wx * J22 - wy * J12;
--            qd[i + Q * 1] = -wx * J21 + wy * J11;
-+            MultCtAdjJt32(J + i, Q, coeff, 1, qw[i] * alpha, Q, qd + i);
-          }
-          break;
-       case 33:
--         for (CeedInt i = 0; i < Q; i++)
-+         CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++)
-          {
--            // J: 0 3 6   qd: 0 1 2
--            //    1 4 7       1 3 4
--            //    2 5 8       2 4 5
--            const CeedScalar J11 = J[i + Q * 0];
--            const CeedScalar J21 = J[i + Q * 1];
--            const CeedScalar J31 = J[i + Q * 2];
--            const CeedScalar J12 = J[i + Q * 3];
--            const CeedScalar J22 = J[i + Q * 4];
--            const CeedScalar J32 = J[i + Q * 5];
--            const CeedScalar J13 = J[i + Q * 6];
--            const CeedScalar J23 = J[i + Q * 7];
--            const CeedScalar J33 = J[i + Q * 8];
--            const CeedScalar A11 = J22 * J33 - J23 * J32;
--            const CeedScalar A12 = J13 * J32 - J12 * J33;
--            const CeedScalar A13 = J12 * J23 - J13 * J22;
--            const CeedScalar A21 = J23 * J31 - J21 * J33;
--            const CeedScalar A22 = J11 * J33 - J13 * J31;
--            const CeedScalar A23 = J13 * J21 - J11 * J23;
--            const CeedScalar A31 = J21 * J32 - J22 * J31;
--            const CeedScalar A32 = J12 * J31 - J11 * J32;
--            const CeedScalar A33 = J11 * J22 - J12 * J21;
--            const CeedScalar w = alpha * qw[i];
--            const CeedScalar wx = w * coeff0;
--            const CeedScalar wy = w * coeff1;
--            const CeedScalar wz = w * coeff2;
--            qd[i + Q * 0] = wx * A11 + wy * A12 + wz * A13;
--            qd[i + Q * 1] = wx * A21 + wy * A22 + wz * A23;
--            qd[i + Q * 2] = wx * A31 + wy * A32 + wz * A33;
-+            MultCtAdjJt33(J + i, Q, coeff, 1, qw[i] * alpha, Q, qd + i);
-          }
-          break;
-    }
-    return 0;
- }
- 
--/// libCEED Q-function for building quadrature data for a convection operator
--/// coefficient evaluated at quadrature points.
-+/// libCEED QFunction for building quadrature data for a convection operator
-+/// with a coefficient evaluated at quadrature points
- CEED_QFUNCTION(f_build_conv_quad)(void *ctx, CeedInt Q,
-                                   const CeedScalar *const *in,
-                                   CeedScalar *const *out)
- {
-    ConvectionContext *bc = (ConvectionContext *)ctx;
--   // in[1] is Jacobians with shape [dim, nc=dim, Q]
-+   // in[0] is coefficients with shape [ncomp=space_dim, Q]
-+   // in[1] is Jacobians with shape [dim, ncomp=space_dim, Q]
-    // in[2] is quadrature weights, size (Q)
-    //
--   // At every quadrature point, compute and store qw * adj(J).
--   const CeedScalar *c = in[0], *J = in[1], *qw = in[2];
-+   // At every quadrature point, compute and store qw * α * c^T adj(J)^T
-    const CeedScalar alpha  = bc->alpha;
-+   const CeedScalar *c = in[0], *J = in[1], *qw = in[2];
-    CeedScalar *qd = out[0];
--   switch (bc->dim + 10 * bc->space_dim)
-+   switch (10 * bc->space_dim + bc->dim)
-    {
-       case 11:
--         for (CeedInt i = 0; i < Q; i++)
-+         CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++)
-          {
--            const CeedScalar coeff = c[i];
--            qd[i] = alpha * coeff * qw[i] * J[i];
-+            qd[i] = qw[i] * alpha * c[i] * J[i];
-+         }
-+         break;
-+      case 21:
-+         CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++)
-+         {
-+            MultCtAdjJt21(J + i, Q, c + i, Q, qw[i] * alpha, Q, qd + i);
-          }
-          break;
-       case 22:
--         for (CeedInt i = 0; i < Q; i++)
-+         CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++)
-          {
--            // J: 0 2   qd: 0 1   adj(J):  J22 -J12
--            //    1 3       1 2           -J21  J11
--            const CeedScalar J11 = J[i + Q * 0];
--            const CeedScalar J21 = J[i + Q * 1];
--            const CeedScalar J12 = J[i + Q * 2];
--            const CeedScalar J22 = J[i + Q * 3];
--            const CeedScalar w = alpha * qw[i];
--            const CeedScalar wx = w * c[i + Q * 0];
--            const CeedScalar wy = w * c[i + Q * 1];
--            qd[i + Q * 0] =  wx * J22 - wy * J12;
--            qd[i + Q * 1] = -wx * J21 + wy * J11;
-+            MultCtAdjJt22(J + i, Q, c + i, Q, qw[i] * alpha, Q, qd + i);
-+         }
-+         break;
-+      case 32:
-+         CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++)
-+         {
-+            MultCtAdjJt32(J + i, Q, c + i, Q, qw[i] * alpha, Q, qd + i);
-          }
-          break;
-       case 33:
--         for (CeedInt i = 0; i < Q; i++)
-+         CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++)
-          {
--            // J: 0 3 6   qd: 0 1 2
--            //    1 4 7       1 3 4
--            //    2 5 8       2 4 5
--            const CeedScalar J11 = J[i + Q * 0];
--            const CeedScalar J21 = J[i + Q * 1];
--            const CeedScalar J31 = J[i + Q * 2];
--            const CeedScalar J12 = J[i + Q * 3];
--            const CeedScalar J22 = J[i + Q * 4];
--            const CeedScalar J32 = J[i + Q * 5];
--            const CeedScalar J13 = J[i + Q * 6];
--            const CeedScalar J23 = J[i + Q * 7];
--            const CeedScalar J33 = J[i + Q * 8];
--            const CeedScalar A11 = J22 * J33 - J23 * J32;
--            const CeedScalar A12 = J13 * J32 - J12 * J33;
--            const CeedScalar A13 = J12 * J23 - J13 * J22;
--            const CeedScalar A21 = J23 * J31 - J21 * J33;
--            const CeedScalar A22 = J11 * J33 - J13 * J31;
--            const CeedScalar A23 = J13 * J21 - J11 * J23;
--            const CeedScalar A31 = J21 * J32 - J22 * J31;
--            const CeedScalar A32 = J12 * J31 - J11 * J32;
--            const CeedScalar A33 = J11 * J22 - J12 * J21;
--            const CeedScalar w = alpha * qw[i];
--            const CeedScalar wx = w * c[i + Q * 0];
--            const CeedScalar wy = w * c[i + Q * 1];
--            const CeedScalar wz = w * c[i + Q * 2];
--            qd[i + Q * 0] = wx * A11 + wy * A12 + wz * A13;
--            qd[i + Q * 1] = wx * A21 + wy * A22 + wz * A23;
--            qd[i + Q * 2] = wx * A31 + wy * A32 + wz * A33;
-+            MultCtAdjJt33(J + i, Q, c + i, Q, qw[i] * alpha, Q, qd + i);
-          }
-          break;
-    }
-    return 0;
- }
- 
--/// libCEED Q-function for applying a conv operator
-+/// libCEED QFunction for applying a convection operator
- CEED_QFUNCTION(f_apply_conv)(void *ctx, CeedInt Q,
-                              const CeedScalar *const *in,
-                              CeedScalar *const *out)
- {
-    ConvectionContext *bc = (ConvectionContext *)ctx;
--   // in[0], out[0] have shape [dim, nc=1, Q]
-+   // in[0] has shape [dim, ncomp=1, Q]
-+   // out[0] has shape [ncomp=1, Q]
-    const CeedScalar *ug = in[0], *qd = in[1];
-    CeedScalar *vg = out[0];
--   switch (10*bc->dim + bc->vdim)
-+   switch (bc->dim)
-    {
--      case 11:
--         for (CeedInt i = 0; i < Q; i++)
-+      case 1:
-+         CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++)
-          {
--            vg[i] = ug[i] * qd[i];
-+            vg[i] = qd[i] * ug[i];
-          }
-          break;
--      case 21:
--         for (CeedInt i = 0; i < Q; i++)
-+      case 2:
-+         CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++)
-          {
-             const CeedScalar ug0 = ug[i + Q * 0];
-             const CeedScalar ug1 = ug[i + Q * 1];
-             vg[i] = qd[i + Q * 0] * ug0 + qd[i + Q * 1] * ug1;
-          }
-          break;
--      case 22:
--         for (CeedInt i = 0; i < Q; i++)
--         {
--            const CeedScalar qd0 = qd[i + Q * 0];
--            const CeedScalar qd1 = qd[i + Q * 1];
--            for (CeedInt c = 0; c < 2; c++)
--            {
--               const CeedScalar ug0 = ug[i + Q * (c+2*0)];
--               const CeedScalar ug1 = ug[i + Q * (c+2*1)];
--               vg[i + Q * c] = qd0 * ug0 + qd1 * ug1;
--            }
--         }
--         break;
--      case 31:
--         for (CeedInt i = 0; i < Q; i++)
-+      case 3:
-+         CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++)
-          {
-             const CeedScalar ug0 = ug[i + Q * 0];
-             const CeedScalar ug1 = ug[i + Q * 1];
-@@ -217,313 +161,147 @@ CEED_QFUNCTION(f_apply_conv)(void *ctx, CeedInt Q,
-             vg[i] = qd[i + Q * 0] * ug0 + qd[i + Q * 1] * ug1 + qd[i + Q * 2] * ug2;
-          }
-          break;
--      case 33:
--         for (CeedInt i = 0; i < Q; i++)
--         {
--            const CeedScalar qd0 = qd[i + Q * 0];
--            const CeedScalar qd1 = qd[i + Q * 1];
--            const CeedScalar qd2 = qd[i + Q * 2];
--            for (CeedInt c = 0; c < 3; c++)
--            {
--               const CeedScalar ug0 = ug[i + Q * (c+3*0)];
--               const CeedScalar ug1 = ug[i + Q * (c+3*1)];
--               const CeedScalar ug2 = ug[i + Q * (c+3*2)];
--               vg[i + Q * c] = qd0 * ug0 + qd1 * ug1 + qd2 * ug2;
--            }
--         }
--         break;
-    }
-    return 0;
- }
- 
--/// libCEED Q-function for applying a conv operator
-+/// libCEED QFunction for applying a convection operator with a constant
-+/// coefficient
- CEED_QFUNCTION(f_apply_conv_mf_const)(void *ctx, CeedInt Q,
-                                       const CeedScalar *const *in,
-                                       CeedScalar *const *out)
- {
--   ConvectionContext *bc = (ConvectionContext*)ctx;
--   // in[0], out[0] have shape [dim, nc=1, Q]
--   // in[1] is Jacobians with shape [dim, nc=dim, Q]
-+   ConvectionContext *bc = (ConvectionContext *)ctx;
-+   // in[0] has shape [dim, ncomp=1, Q]
-+   // in[1] is Jacobians with shape [dim, ncomp=space_dim, Q]
-    // in[2] is quadrature weights, size (Q)
-+   // out[0] has shape [ncomp=1, Q]
-    //
--   // At every quadrature point, compute qw * adj(J).
--   const CeedScalar coeff0 = bc->coeff[0];
--   const CeedScalar coeff1 = bc->coeff[1];
--   const CeedScalar coeff2 = bc->coeff[2];
-+   // At every quadrature point, compute qw * α * c^T adj(J)^T
-    const CeedScalar alpha  = bc->alpha;
-+   const CeedScalar *coeff = bc->coeff;
-    const CeedScalar *ug = in[0], *J = in[1], *qw = in[2];
-    CeedScalar *vg = out[0];
--   switch (10 * bc->dim + bc->vdim)
-+   switch (10 * bc->space_dim + bc->dim)
-    {
-       case 11:
--         for (CeedInt i = 0; i < Q; i++)
-+         CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++)
-          {
--            const CeedScalar qd = alpha * coeff0 * qw[i] * J[i];
--            vg[i] = ug[i] * qd;
-+            const CeedScalar coeff0 = coeff[0];
-+            const CeedScalar qd = qw[i] * alpha * coeff0 * J[i];
-+            vg[i] = qd * ug[i];
-          }
-          break;
-       case 21:
--         for (CeedInt i = 0; i < Q; i++)
-+         CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++)
-          {
--            // J: 0 2   qd: 0 1   adj(J):  J22 -J12
--            //    1 3       1 2           -J21  J11
--            const CeedScalar J11 = J[i + Q * 0];
--            const CeedScalar J21 = J[i + Q * 1];
--            const CeedScalar J12 = J[i + Q * 2];
--            const CeedScalar J22 = J[i + Q * 3];
--            const CeedScalar w = alpha * qw[i];
--            const CeedScalar wx = w * coeff0;
--            const CeedScalar wy = w * coeff1;
--            const CeedScalar qd0 =  wx * J22 - wy * J12;
--            const CeedScalar qd1 = -wx * J21 + wy * J11;
--            const CeedScalar ug0 = ug[i + Q * 0];
--            const CeedScalar ug1 = ug[i + Q * 1];
--            vg[i] = qd0 * ug0 + qd1 * ug1;
-+            CeedScalar qd;
-+            MultCtAdjJt21(J + i, Q, coeff, 1, qw[i] * alpha, 1, &qd);
-+            vg[i] = qd * ug[i];
-          }
-          break;
-       case 22:
--         for (CeedInt i = 0; i < Q; i++)
-+         CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++)
-          {
--            // J: 0 2   qd: 0 1   adj(J):  J22 -J12
--            //    1 3       1 2           -J21  J11
--            const CeedScalar J11 = J[i + Q * 0];
--            const CeedScalar J21 = J[i + Q * 1];
--            const CeedScalar J12 = J[i + Q * 2];
--            const CeedScalar J22 = J[i + Q * 3];
--            const CeedScalar w = alpha * qw[i];
--            const CeedScalar wx = w * coeff0;
--            const CeedScalar wy = w * coeff1;
--            const CeedScalar qd0 =  wx * J22 - wy * J12;
--            const CeedScalar qd1 = -wx * J21 + wy * J11;
--            for (CeedInt c = 0; c < 2; c++)
--            {
--               const CeedScalar ug0 = ug[i + Q * (c+2*0)];
--               const CeedScalar ug1 = ug[i + Q * (c+2*1)];
--               vg[i + Q * c] = qd0 * ug0 + qd1 * ug1;
--            }
-+            CeedScalar qd[2];
-+            MultCtAdjJt22(J + i, Q, coeff, 1, qw[i] * alpha, 1, qd);
-+            const CeedScalar ug0 = ug[i + Q * 0];
-+            const CeedScalar ug1 = ug[i + Q * 1];
-+            vg[i] = qd[0] * ug0 + qd[1] * ug1;
-          }
-          break;
--      case 31:
--         for (CeedInt i = 0; i < Q; i++)
-+      case 32:
-+         CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++)
-          {
--            // J: 0 3 6   qd: 0 1 2
--            //    1 4 7       1 3 4
--            //    2 5 8       2 4 5
--            const CeedScalar J11 = J[i + Q * 0];
--            const CeedScalar J21 = J[i + Q * 1];
--            const CeedScalar J31 = J[i + Q * 2];
--            const CeedScalar J12 = J[i + Q * 3];
--            const CeedScalar J22 = J[i + Q * 4];
--            const CeedScalar J32 = J[i + Q * 5];
--            const CeedScalar J13 = J[i + Q * 6];
--            const CeedScalar J23 = J[i + Q * 7];
--            const CeedScalar J33 = J[i + Q * 8];
--            const CeedScalar A11 = J22 * J33 - J23 * J32;
--            const CeedScalar A12 = J13 * J32 - J12 * J33;
--            const CeedScalar A13 = J12 * J23 - J13 * J22;
--            const CeedScalar A21 = J23 * J31 - J21 * J33;
--            const CeedScalar A22 = J11 * J33 - J13 * J31;
--            const CeedScalar A23 = J13 * J21 - J11 * J23;
--            const CeedScalar A31 = J21 * J32 - J22 * J31;
--            const CeedScalar A32 = J12 * J31 - J11 * J32;
--            const CeedScalar A33 = J11 * J22 - J12 * J21;
--            const CeedScalar w = alpha * qw[i];
--            const CeedScalar wx = w * coeff0;
--            const CeedScalar wy = w * coeff1;
--            const CeedScalar wz = w * coeff2;
--            const CeedScalar qd0 = wx * A11 + wy * A12 + wz * A13;
--            const CeedScalar qd1 = wx * A21 + wy * A22 + wz * A23;
--            const CeedScalar qd2 = wx * A31 + wy * A32 + wz * A33;
-+            CeedScalar qd[2];
-+            MultCtAdjJt32(J + i, Q, coeff, 1, qw[i] * alpha, 1, qd);
-             const CeedScalar ug0 = ug[i + Q * 0];
-             const CeedScalar ug1 = ug[i + Q * 1];
--            const CeedScalar ug2 = ug[i + Q * 2];
--            vg[i] = qd0 * ug0 + qd1 * ug1 + qd2 * ug2;
-+            vg[i] = qd[0] * ug0 + qd[1] * ug1;
-          }
-          break;
-       case 33:
--         for (CeedInt i = 0; i < Q; i++)
-+         CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++)
-          {
--            // J: 0 3 6   qd: 0 1 2
--            //    1 4 7       1 3 4
--            //    2 5 8       2 4 5
--            const CeedScalar J11 = J[i + Q * 0];
--            const CeedScalar J21 = J[i + Q * 1];
--            const CeedScalar J31 = J[i + Q * 2];
--            const CeedScalar J12 = J[i + Q * 3];
--            const CeedScalar J22 = J[i + Q * 4];
--            const CeedScalar J32 = J[i + Q * 5];
--            const CeedScalar J13 = J[i + Q * 6];
--            const CeedScalar J23 = J[i + Q * 7];
--            const CeedScalar J33 = J[i + Q * 8];
--            const CeedScalar A11 = J22 * J33 - J23 * J32;
--            const CeedScalar A12 = J13 * J32 - J12 * J33;
--            const CeedScalar A13 = J12 * J23 - J13 * J22;
--            const CeedScalar A21 = J23 * J31 - J21 * J33;
--            const CeedScalar A22 = J11 * J33 - J13 * J31;
--            const CeedScalar A23 = J13 * J21 - J11 * J23;
--            const CeedScalar A31 = J21 * J32 - J22 * J31;
--            const CeedScalar A32 = J12 * J31 - J11 * J32;
--            const CeedScalar A33 = J11 * J22 - J12 * J21;
--            const CeedScalar w = alpha * qw[i];
--            const CeedScalar wx = w * coeff0;
--            const CeedScalar wy = w * coeff1;
--            const CeedScalar wz = w * coeff2;
--            const CeedScalar qd0 = wx * A11 + wy * A12 + wz * A13;
--            const CeedScalar qd1 = wx * A21 + wy * A22 + wz * A23;
--            const CeedScalar qd2 = wx * A31 + wy * A32 + wz * A33;
--            for (CeedInt c = 0; c < 3; c++)
--            {
--               const CeedScalar ug0 = ug[i + Q * (c+3*0)];
--               const CeedScalar ug1 = ug[i + Q * (c+3*1)];
--               const CeedScalar ug2 = ug[i + Q * (c+3*2)];
--               vg[i + Q * c] = qd0 * ug0 + qd1 * ug1 + qd2 * ug2;
--            }
-+            CeedScalar qd[3];
-+            MultCtAdjJt33(J + i, Q, coeff, 1, qw[i] * alpha, 1, qd);
-+            const CeedScalar ug0 = ug[i + Q * 0];
-+            const CeedScalar ug1 = ug[i + Q * 1];
-+            const CeedScalar ug2 = ug[i + Q * 2];
-+            vg[i] = qd[0] * ug0 + qd[1] * ug1 + qd[2] * ug2;
-          }
-          break;
-    }
-    return 0;
- }
- 
-+/// libCEED QFunction for applying a convection operator with a coefficient
-+/// evaluated at quadrature points
- CEED_QFUNCTION(f_apply_conv_mf_quad)(void *ctx, CeedInt Q,
-                                      const CeedScalar *const *in,
-                                      CeedScalar *const *out)
- {
--   ConvectionContext *bc = (ConvectionContext*)ctx;
--   // in[0], out[0] have shape [dim, nc=1, Q]
--   // in[1] is Jacobians with shape [dim, nc=dim, Q]
--   // in[2] is quadrature weights, size (Q)
-+   ConvectionContext *bc = (ConvectionContext *)ctx;
-+   // in[0] has shape [dim, ncomp=1, Q]
-+   // in[1] is coefficients with shape [ncomp=space_dim, Q]
-+   // in[2] is Jacobians with shape [dim, ncomp=space_dim, Q]
-+   // in[3] is quadrature weights, size (Q)
-+   // out[0] has shape [ncomp=1, Q]
-    //
--   // At every quadrature point, compute qw * adj(J).
--   const CeedScalar *c = in[0], *ug = in[1], *J = in[2], *qw = in[3];
-+   // At every quadrature point, compute qw * α * c^T adj(J)^T
-    const CeedScalar alpha  = bc->alpha;
-+   const CeedScalar *ug = in[0], *c = in[1], *J = in[2], *qw = in[3];
-    CeedScalar *vg = out[0];
--   switch (10 * bc->dim + bc->vdim)
-+   switch (10 * bc->space_dim + bc->dim)
-    {
-       case 11:
--         for (CeedInt i = 0; i < Q; i++)
-+         CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++)
-          {
--            const CeedScalar qd = alpha * c[i] * qw[i] * J[i];
--            vg[i] = ug[i] * qd;
-+            const CeedScalar qd = qw[i] * alpha * c[i] * J[i];
-+            vg[i] = qd * ug[i];
-          }
-          break;
-       case 21:
--         for (CeedInt i = 0; i < Q; i++)
-+         CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++)
-          {
--            // J: 0 2   qd: 0 1   adj(J):  J22 -J12
--            //    1 3       1 2           -J21  J11
--            const CeedScalar J11 = J[i + Q * 0];
--            const CeedScalar J21 = J[i + Q * 1];
--            const CeedScalar J12 = J[i + Q * 2];
--            const CeedScalar J22 = J[i + Q * 3];
--            const CeedScalar w = alpha * qw[i];
--            const CeedScalar wx = w * c[i + Q * 0];
--            const CeedScalar wy = w * c[i + Q * 1];
--            const CeedScalar qd0 =  wx * J22 - wy * J12;
--            const CeedScalar qd1 = -wx * J21 + wy * J11;
--            const CeedScalar ug0 = ug[i + Q * 0];
--            const CeedScalar ug1 = ug[i + Q * 1];
--            vg[i] = qd0 * ug0 + qd1 * ug1;
-+            CeedScalar qd;
-+            MultCtAdjJt21(J + i, Q, c + i, Q, qw[i] * alpha, 1, &qd);
-+            vg[i] = qd * ug[i];
-          }
-          break;
-       case 22:
--         for (CeedInt i = 0; i < Q; i++)
-+         CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++)
-          {
--            // J: 0 2   qd: 0 1   adj(J):  J22 -J12
--            //    1 3       1 2           -J21  J11
--            const CeedScalar J11 = J[i + Q * 0];
--            const CeedScalar J21 = J[i + Q * 1];
--            const CeedScalar J12 = J[i + Q * 2];
--            const CeedScalar J22 = J[i + Q * 3];
--            const CeedScalar w = alpha * qw[i];
--            const CeedScalar wx = w * c[i + Q * 0];
--            const CeedScalar wy = w * c[i + Q * 1];
--            const CeedScalar qd0 =  wx * J22 - wy * J12;
--            const CeedScalar qd1 = -wx * J21 + wy * J11;
--            for (CeedInt d = 0; d < 2; d++)
--            {
--               const CeedScalar ug0 = ug[i + Q * (d+2*0)];
--               const CeedScalar ug1 = ug[i + Q * (d+2*1)];
--               vg[i + Q * d] = qd0 * ug0 + qd1 * ug1;
--            }
-+            CeedScalar qd[2];
-+            MultCtAdjJt22(J + i, Q, c + i, Q, qw[i] * alpha, 1, qd);
-+            const CeedScalar ug0 = ug[i + Q * 0];
-+            const CeedScalar ug1 = ug[i + Q * 1];
-+            vg[i] = qd[0] * ug0 + qd[1] * ug1;
-          }
-          break;
--      case 31:
--         for (CeedInt i = 0; i < Q; i++)
-+      case 32:
-+         CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++)
-          {
--            // J: 0 3 6   qd: 0 1 2
--            //    1 4 7       1 3 4
--            //    2 5 8       2 4 5
--            const CeedScalar J11 = J[i + Q * 0];
--            const CeedScalar J21 = J[i + Q * 1];
--            const CeedScalar J31 = J[i + Q * 2];
--            const CeedScalar J12 = J[i + Q * 3];
--            const CeedScalar J22 = J[i + Q * 4];
--            const CeedScalar J32 = J[i + Q * 5];
--            const CeedScalar J13 = J[i + Q * 6];
--            const CeedScalar J23 = J[i + Q * 7];
--            const CeedScalar J33 = J[i + Q * 8];
--            const CeedScalar A11 = J22 * J33 - J23 * J32;
--            const CeedScalar A12 = J13 * J32 - J12 * J33;
--            const CeedScalar A13 = J12 * J23 - J13 * J22;
--            const CeedScalar A21 = J23 * J31 - J21 * J33;
--            const CeedScalar A22 = J11 * J33 - J13 * J31;
--            const CeedScalar A23 = J13 * J21 - J11 * J23;
--            const CeedScalar A31 = J21 * J32 - J22 * J31;
--            const CeedScalar A32 = J12 * J31 - J11 * J32;
--            const CeedScalar A33 = J11 * J22 - J12 * J21;
--            const CeedScalar w = alpha * qw[i];
--            const CeedScalar wx = w * c[i + Q * 0];
--            const CeedScalar wy = w * c[i + Q * 1];
--            const CeedScalar wz = w * c[i + Q * 2];
--            const CeedScalar qd0 = wx * A11 + wy * A12 + wz * A13;
--            const CeedScalar qd1 = wx * A21 + wy * A22 + wz * A23;
--            const CeedScalar qd2 = wx * A31 + wy * A32 + wz * A33;
-+            CeedScalar qd[2];
-+            MultCtAdjJt32(J + i, Q, c + i, Q, qw[i] * alpha, 1, qd);
-             const CeedScalar ug0 = ug[i + Q * 0];
-             const CeedScalar ug1 = ug[i + Q * 1];
--            const CeedScalar ug2 = ug[i + Q * 2];
--            vg[i] = qd0 * ug0 + qd1 * ug1 + qd2 * ug2;
-+            vg[i] = qd[0] * ug0 + qd[1] * ug1;
-          }
-          break;
-       case 33:
--         for (CeedInt i = 0; i < Q; i++)
-+         CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++)
-          {
--            // J: 0 3 6   qd: 0 1 2
--            //    1 4 7       1 3 4
--            //    2 5 8       2 4 5
--            const CeedScalar J11 = J[i + Q * 0];
--            const CeedScalar J21 = J[i + Q * 1];
--            const CeedScalar J31 = J[i + Q * 2];
--            const CeedScalar J12 = J[i + Q * 3];
--            const CeedScalar J22 = J[i + Q * 4];
--            const CeedScalar J32 = J[i + Q * 5];
--            const CeedScalar J13 = J[i + Q * 6];
--            const CeedScalar J23 = J[i + Q * 7];
--            const CeedScalar J33 = J[i + Q * 8];
--            const CeedScalar A11 = J22 * J33 - J23 * J32;
--            const CeedScalar A12 = J13 * J32 - J12 * J33;
--            const CeedScalar A13 = J12 * J23 - J13 * J22;
--            const CeedScalar A21 = J23 * J31 - J21 * J33;
--            const CeedScalar A22 = J11 * J33 - J13 * J31;
--            const CeedScalar A23 = J13 * J21 - J11 * J23;
--            const CeedScalar A31 = J21 * J32 - J22 * J31;
--            const CeedScalar A32 = J12 * J31 - J11 * J32;
--            const CeedScalar A33 = J11 * J22 - J12 * J21;
--            const CeedScalar w = alpha * qw[i];
--            const CeedScalar wx = w * c[i + Q * 0];
--            const CeedScalar wy = w * c[i + Q * 1];
--            const CeedScalar wz = w * c[i + Q * 2];
--            const CeedScalar qd0 = wx * A11 + wy * A12 + wz * A13;
--            const CeedScalar qd1 = wx * A21 + wy * A22 + wz * A23;
--            const CeedScalar qd2 = wx * A31 + wy * A32 + wz * A33;
--            for (CeedInt d = 0; d < 3; d++)
--            {
--               const CeedScalar ug0 = ug[i + Q * (d+3*0)];
--               const CeedScalar ug1 = ug[i + Q * (d+3*1)];
--               const CeedScalar ug2 = ug[i + Q * (d+3*2)];
--               vg[i + Q * d] = qd0 * ug0 + qd1 * ug1 + qd2 * ug2;
--            }
-+            CeedScalar qd[3];
-+            MultCtAdjJt33(J + i, Q, c + i, Q, qw[i] * alpha, 1, qd);
-+            const CeedScalar ug0 = ug[i + Q * 0];
-+            const CeedScalar ug1 = ug[i + Q * 1];
-+            const CeedScalar ug2 = ug[i + Q * 2];
-+            vg[i] = qd[0] * ug0 + qd[1] * ug1 + qd[2] * ug2;
-          }
-          break;
-    }
-    return 0;
- }
-+
-+#endif // MFEM_LIBCEED_CONV_QF_H
-diff --git a/fem/ceed/integrators/curlcurl/curlcurl.cpp b/fem/ceed/integrators/curlcurl/curlcurl.cpp
-new file mode 100644
-index 000000000..ace7c18e4
---- /dev/null
-+++ b/fem/ceed/integrators/curlcurl/curlcurl.cpp
-@@ -0,0 +1,244 @@
-+// Copyright (c) 2010-2023, Lawrence Livermore National Security, LLC. Produced
-+// at the Lawrence Livermore National Laboratory. All Rights reserved. See files
-+// LICENSE and NOTICE for details. LLNL-CODE-806117.
-+//
-+// This file is part of the MFEM library. For more information and source code
-+// availability visit https://mfem.org.
-+//
-+// MFEM is free software; you can redistribute it and/or modify it under the
-+// terms of the BSD-3 license. We welcome feedback and contributions, see file
-+// CONTRIBUTING.md for details.
-+
-+#include "curlcurl.hpp"
-+
-+#include "../../../../config/config.hpp"
-+#ifdef MFEM_USE_CEED
-+#include "curlcurl_qf.h"
-+#endif
-+
-+namespace mfem
-+{
-+
-+namespace ceed
-+{
-+
-+#ifdef MFEM_USE_CEED
-+struct CurlCurlOperatorInfo : public OperatorInfo
-+{
-+   CurlCurlContext ctx = {0};
-+   template <typename CoeffType>
-+   CurlCurlOperatorInfo(const mfem::FiniteElementSpace &fes, CoeffType *Q,
-+                        bool use_bdr = false, bool use_mf = false)
-+   {
-+      MFEM_VERIFY(fes.GetVDim() == 1,
-+                  "libCEED interface for vector FE does not support vdim > 1!");
-+      ctx.dim = fes.GetMesh()->Dimension() - use_bdr;
-+      MFEM_VERIFY(ctx.dim == 2 || ctx.dim == 3,
-+                  "CurlCurlIntegrator requires dim == 2 or dim == 3!");
-+      ctx.space_dim = fes.GetMesh()->SpaceDimension();
-+      ctx.curl_dim = (ctx.dim < 3) ? 1 : ctx.dim;
-+      if (!use_mf)
-+      {
-+         apply_func = ":f_apply_curlcurl";
-+         apply_qf = &f_apply_curlcurl;
-+      }
-+      else
-+      {
-+         build_func = "";
-+         build_qf = nullptr;
-+      }
-+      if (Q == nullptr)
-+      {
-+         ctx.coeff[0] = 1.0;
-+         if (!use_mf)
-+         {
-+            build_func = ":f_build_curlcurl_const_scalar";
-+            build_qf = &f_build_curlcurl_const_scalar;
-+         }
-+         else
-+         {
-+            apply_func = ":f_apply_curlcurl_mf_const_scalar";
-+            apply_qf = &f_apply_curlcurl_mf_const_scalar;
-+         }
-+      }
-+      else
-+      {
-+         InitCoefficient(*Q, use_mf);
-+      }
-+      header = "/integrators/curlcurl/curlcurl_qf.h";
-+      trial_op = EvalMode::Curl;
-+      test_op = EvalMode::Curl;
-+      qdatasize = (ctx.curl_dim * (ctx.curl_dim + 1)) / 2;
-+   }
-+   void InitCoefficient(mfem::Coefficient &Q, bool use_mf)
-+   {
-+      if (mfem::ConstantCoefficient *const_coeff =
-+             dynamic_cast<mfem::ConstantCoefficient *>(&Q))
-+      {
-+         ctx.coeff[0] = const_coeff->constant;
-+         if (!use_mf)
-+         {
-+            build_func = ":f_build_curlcurl_const_scalar";
-+            build_qf = &f_build_curlcurl_const_scalar;
-+         }
-+         else
-+         {
-+            apply_func = ":f_apply_curlcurl_mf_const_scalar";
-+            apply_qf = &f_apply_curlcurl_mf_const_scalar;
-+         }
-+      }
-+      else
-+      {
-+         if (!use_mf)
-+         {
-+            build_func = ":f_build_curlcurl_quad_scalar";
-+            build_qf = &f_build_curlcurl_quad_scalar;
-+         }
-+         else
-+         {
-+            apply_func = ":f_apply_curlcurl_mf_quad_scalar";
-+            apply_qf = &f_apply_curlcurl_mf_quad_scalar;
-+         }
-+      }
-+   }
-+   void InitCoefficient(mfem::VectorCoefficient &VQ, bool use_mf)
-+   {
-+      if (mfem::VectorConstantCoefficient *const_coeff =
-+             dynamic_cast<mfem::VectorConstantCoefficient *>(&VQ))
-+      {
-+         const int vdim = VQ.GetVDim();
-+         MFEM_VERIFY(vdim <= LIBCEED_CURLCURL_COEFF_COMP_MAX,
-+                     "VectorCoefficient dimension exceeds context storage!");
-+         const mfem::Vector &val = const_coeff->GetVec();
-+         for (int i = 0; i < vdim; i++)
-+         {
-+            ctx.coeff[i] = val[i];
-+         }
-+         if (!use_mf)
-+         {
-+            build_func = ":f_build_curlcurl_const_vector";
-+            build_qf = &f_build_curlcurl_const_vector;
-+         }
-+         else
-+         {
-+            apply_func = ":f_apply_curlcurl_mf_const_vector";
-+            apply_qf = &f_apply_curlcurl_mf_const_vector;
-+         }
-+      }
-+      else
-+      {
-+         if (!use_mf)
-+         {
-+            build_func = ":f_build_curlcurl_quad_vector";
-+            build_qf = &f_build_curlcurl_quad_vector;
-+         }
-+         else
-+         {
-+            apply_func = ":f_apply_curlcurl_mf_quad_vector";
-+            apply_qf = &f_apply_curlcurl_mf_quad_vector;
-+         }
-+      }
-+   }
-+   void InitCoefficient(mfem::MatrixCoefficient &MQ, bool use_mf)
-+   {
-+      // Assumes matrix coefficient is symmetric
-+      if (mfem::MatrixConstantCoefficient *const_coeff =
-+             dynamic_cast<mfem::MatrixConstantCoefficient *>(&MQ))
-+      {
-+         const int vdim = MQ.GetVDim();
-+         MFEM_VERIFY((vdim * (vdim + 1)) / 2 <= LIBCEED_CURLCURL_COEFF_COMP_MAX,
-+                     "MatrixCoefficient dimensions exceed context storage!");
-+         const mfem::DenseMatrix &val = const_coeff->GetMatrix();
-+         for (int j = 0; j < vdim; j++)
-+         {
-+            for (int i = j; i < vdim; i++)
-+            {
-+               const int idx = (j * vdim) - (((j - 1) * j) / 2) + i - j;
-+               ctx.coeff[idx] = val(i, j);
-+            }
-+         }
-+         if (!use_mf)
-+         {
-+            build_func = ":f_build_curlcurl_const_matrix";
-+            build_qf = &f_build_curlcurl_const_matrix;
-+         }
-+         else
-+         {
-+            apply_func = ":f_apply_curlcurl_mf_const_matrix";
-+            apply_qf = &f_apply_curlcurl_mf_const_matrix;
-+         }
-+      }
-+      else
-+      {
-+         if (!use_mf)
-+         {
-+            build_func = ":f_build_curlcurl_quad_matrix";
-+            build_qf = &f_build_curlcurl_quad_matrix;
-+         }
-+         else
-+         {
-+            apply_func = ":f_apply_curlcurl_mf_quad_matrix";
-+            apply_qf = &f_apply_curlcurl_mf_quad_matrix;
-+         }
-+      }
-+   }
-+};
-+#endif
-+
-+template <typename CoeffType>
-+PACurlCurlIntegrator::PACurlCurlIntegrator(
-+   const mfem::CurlCurlIntegrator &integ,
-+   const mfem::FiniteElementSpace &fes,
-+   CoeffType *Q,
-+   const bool use_bdr)
-+{
-+#ifdef MFEM_USE_CEED
-+   CurlCurlOperatorInfo info(fes, Q, use_bdr);
-+   Assemble(integ, info, fes, Q, use_bdr);
-+#else
-+   MFEM_ABORT("MFEM must be built with MFEM_USE_CEED=YES to use libCEED.");
-+#endif
-+}
-+
-+template <typename CoeffType>
-+MFCurlCurlIntegrator::MFCurlCurlIntegrator(
-+   const mfem::CurlCurlIntegrator &integ,
-+   const mfem::FiniteElementSpace &fes,
-+   CoeffType *Q,
-+   const bool use_bdr)
-+{
-+#ifdef MFEM_USE_CEED
-+   CurlCurlOperatorInfo info(fes, Q, use_bdr, true);
-+   Assemble(integ, info, fes, Q, use_bdr, true);
-+#else
-+   MFEM_ABORT("MFEM must be built with MFEM_USE_CEED=YES to use libCEED.");
-+#endif
-+}
-+
-+// @cond DOXYGEN_SKIP
-+
-+template PACurlCurlIntegrator::PACurlCurlIntegrator(
-+   const mfem::CurlCurlIntegrator &, const mfem::FiniteElementSpace &,
-+   mfem::Coefficient *, const bool);
-+template PACurlCurlIntegrator::PACurlCurlIntegrator(
-+   const mfem::CurlCurlIntegrator &, const mfem::FiniteElementSpace &,
-+   mfem::VectorCoefficient *, const bool);
-+template PACurlCurlIntegrator::PACurlCurlIntegrator(
-+   const mfem::CurlCurlIntegrator &, const mfem::FiniteElementSpace &,
-+   mfem::MatrixCoefficient *, const bool);
-+
-+template MFCurlCurlIntegrator::MFCurlCurlIntegrator(
-+   const mfem::CurlCurlIntegrator &, const mfem::FiniteElementSpace &,
-+   mfem::Coefficient *, const bool);
-+template MFCurlCurlIntegrator::MFCurlCurlIntegrator(
-+   const mfem::CurlCurlIntegrator &, const mfem::FiniteElementSpace &,
-+   mfem::VectorCoefficient *, const bool);
-+template MFCurlCurlIntegrator::MFCurlCurlIntegrator(
-+   const mfem::CurlCurlIntegrator &, const mfem::FiniteElementSpace &,
-+   mfem::MatrixCoefficient *, const bool);
-+
-+// @endcond
-+
-+} // namespace ceed
-+
-+} // namespace mfem
-diff --git a/fem/ceed/integrators/curlcurl/curlcurl.hpp b/fem/ceed/integrators/curlcurl/curlcurl.hpp
-new file mode 100644
-index 000000000..71d62c915
---- /dev/null
-+++ b/fem/ceed/integrators/curlcurl/curlcurl.hpp
-@@ -0,0 +1,51 @@
-+// Copyright (c) 2010-2023, Lawrence Livermore National Security, LLC. Produced
-+// at the Lawrence Livermore National Laboratory. All Rights reserved. See files
-+// LICENSE and NOTICE for details. LLNL-CODE-806117.
-+//
-+// This file is part of the MFEM library. For more information and source code
-+// availability visit https://mfem.org.
-+//
-+// MFEM is free software; you can redistribute it and/or modify it under the
-+// terms of the BSD-3 license. We welcome feedback and contributions, see file
-+// CONTRIBUTING.md for details.
-+
-+#ifndef MFEM_LIBCEED_CURLCURL_HPP
-+#define MFEM_LIBCEED_CURLCURL_HPP
-+
-+#include "../../interface/integrator.hpp"
-+#include "../../interface/mixed_operator.hpp"
-+#include "../../../fespace.hpp"
-+
-+namespace mfem
-+{
-+
-+namespace ceed
-+{
-+
-+/// Represent a CurlCurlIntegrator with AssemblyLevel::Partial using libCEED.
-+class PACurlCurlIntegrator : public MixedOperator<Integrator>
-+{
-+public:
-+   template <typename CoeffType>
-+   PACurlCurlIntegrator(const mfem::CurlCurlIntegrator &integ,
-+                        const mfem::FiniteElementSpace &fes,
-+                        CoeffType *Q,
-+                        const bool use_bdr = false);
-+};
-+
-+/// Represent a CurlCurlIntegrator with AssemblyLevel::None using libCEED.
-+class MFCurlCurlIntegrator : public MixedOperator<Integrator>
-+{
-+public:
-+   template <typename CoeffType>
-+   MFCurlCurlIntegrator(const mfem::CurlCurlIntegrator &integ,
-+                        const mfem::FiniteElementSpace &fes,
-+                        CoeffType *Q,
-+                        const bool use_bdr = false);
-+};
-+
-+}
-+
-+}
-+
-+#endif // MFEM_LIBCEED_CURLCURL_HPP
-diff --git a/fem/ceed/integrators/curlcurl/curlcurl_qf.h b/fem/ceed/integrators/curlcurl/curlcurl_qf.h
-new file mode 100644
-index 000000000..6fbace54b
---- /dev/null
-+++ b/fem/ceed/integrators/curlcurl/curlcurl_qf.h
-@@ -0,0 +1,479 @@
-+// Copyright (c) 2010-2023, Lawrence Livermore National Security, LLC. Produced
-+// at the Lawrence Livermore National Laboratory. All Rights reserved. See files
-+// LICENSE and NOTICE for details. LLNL-CODE-806117.
-+//
-+// This file is part of the MFEM library. For more information and source code
-+// availability visit https://mfem.org.
-+//
-+// MFEM is free software; you can redistribute it and/or modify it under the
-+// terms of the BSD-3 license. We welcome feedback and contributions, see file
-+// CONTRIBUTING.md for details.
-+
-+#ifndef MFEM_LIBCEED_CURLCURL_QF_H
-+#define MFEM_LIBCEED_CURLCURL_QF_H
-+
-+#include "../util/util_qf.h"
-+
-+#define LIBCEED_CURLCURL_COEFF_COMP_MAX 6
-+
-+struct CurlCurlContext
-+{
-+   CeedInt dim, space_dim, curl_dim;
-+   CeedScalar coeff[LIBCEED_CURLCURL_COEFF_COMP_MAX];
-+};
-+
-+/// libCEED QFunction for building quadrature data for a curl-curl operator
-+/// with a scalar constant coefficient
-+CEED_QFUNCTION(f_build_curlcurl_const_scalar)(void *ctx, CeedInt Q,
-+                                              const CeedScalar *const *in,
-+                                              CeedScalar *const *out)
-+{
-+   CurlCurlContext *bc = (CurlCurlContext *)ctx;
-+   // in[0] is Jacobians with shape [dim, ncomp=space_dim, Q]
-+   // in[1] is quadrature weights, size (Q)
-+   //
-+   // At every quadrature point, compute qw/det(J) J^T C J and store the
-+   // symmetric part of the result. In 2D, compute and store qw * c / det(J)
-+   const CeedScalar *coeff = bc->coeff;
-+   const CeedScalar *J = in[0], *qw = in[1];
-+   CeedScalar *qd = out[0];
-+   switch (100 * bc->space_dim + 10 * bc->dim + bc->curl_dim)
-+   {
-+      case 221:
-+         CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++)
-+         {
-+            const CeedScalar coeff0 = coeff[0];
-+            qd[i] = qw[i] * coeff0 / DetJ22(J + i, Q);
-+         }
-+         break;
-+      case 321:
-+         CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++)
-+         {
-+            const CeedScalar coeff0 = coeff[0];
-+            qd[i] = qw[i] * coeff0 / DetJ32(J + i, Q);
-+         }
-+         break;
-+      case 333:
-+         CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++)
-+         {
-+            MultJtCJ33(J + i, Q, coeff, 1, 1, qw[i], Q, qd + i);
-+         }
-+         break;
-+   }
-+   return 0;
-+}
-+
-+/// libCEED QFunction for building quadrature data for a curl-curl operator
-+/// with a vector constant coefficient
-+CEED_QFUNCTION(f_build_curlcurl_const_vector)(void *ctx, CeedInt Q,
-+                                              const CeedScalar *const *in,
-+                                              CeedScalar *const *out)
-+{
-+   CurlCurlContext *bc = (CurlCurlContext *)ctx;
-+   // in[0] is Jacobians with shape [dim, ncomp=space_dim, Q]
-+   // in[1] is quadrature weights, size (Q)
-+   //
-+   // At every quadrature point, compute qw/det(J) J^T C J and store the
-+   // symmetric part of the result. In 2D, compute and store qw * c / det(J)
-+   const CeedScalar *coeff = bc->coeff;
-+   const CeedScalar *J = in[0], *qw = in[1];
-+   CeedScalar *qd = out[0];
-+   switch (100 * bc->space_dim + 10 * bc->dim + bc->curl_dim)
-+   {
-+      case 333:
-+         CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++)
-+         {
-+            MultJtCJ33(J + i, Q, coeff, 1, 3, qw[i], Q, qd + i);
-+         }
-+         break;
-+   }
-+   return 0;
-+}
-+
-+/// libCEED QFunction for building quadrature data for a curl-curl operator
-+/// with a matrix constant coefficient
-+CEED_QFUNCTION(f_build_curlcurl_const_matrix)(void *ctx, CeedInt Q,
-+                                              const CeedScalar *const *in,
-+                                              CeedScalar *const *out)
-+{
-+   CurlCurlContext *bc = (CurlCurlContext *)ctx;
-+   // in[0] is Jacobians with shape [dim, ncomp=space_dim, Q]
-+   // in[1] is quadrature weights, size (Q)
-+   //
-+   // At every quadrature point, compute qw/det(J) J^T C J and store the
-+   // symmetric part of the result. In 2D, compute and store qw * c / det(J)
-+   const CeedScalar *coeff = bc->coeff;
-+   const CeedScalar *J = in[0], *qw = in[1];
-+   CeedScalar *qd = out[0];
-+   switch (100 * bc->space_dim + 10 * bc->dim + bc->curl_dim)
-+   {
-+      case 333:
-+         CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++)
-+         {
-+            MultJtCJ33(J + i, Q, coeff, 1, 6, qw[i], Q, qd + i);
-+         }
-+         break;
-+   }
-+   return 0;
-+}
-+
-+/// libCEED QFunction for building quadrature data for a curl-curl operator
-+/// with a scalar coefficient evaluated at quadrature points
-+CEED_QFUNCTION(f_build_curlcurl_quad_scalar)(void *ctx, CeedInt Q,
-+                                             const CeedScalar *const *in,
-+                                             CeedScalar *const *out)
-+{
-+   CurlCurlContext *bc = (CurlCurlContext *)ctx;
-+   // in[0] is coefficients with shape [ncomp=1, Q]
-+   // in[1] is Jacobians with shape [dim, ncomp=space_dim, Q]
-+   // in[2] is quadrature weights, size (Q)
-+   //
-+   // At every quadrature point, compute qw/det(J) J^T C J and store the
-+   // symmetric part of the result. In 2D, compute and store qw * c / det(J)
-+   const CeedScalar *c = in[0], *J = in[1], *qw = in[2];
-+   CeedScalar *qd = out[0];
-+   switch (100 * bc->space_dim + 10 * bc->dim + bc->curl_dim)
-+   {
-+      case 221:
-+         CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++)
-+         {
-+            qd[i] = qw[i] * c[i] / DetJ22(J + i, Q);
-+         }
-+         break;
-+      case 321:
-+         CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++)
-+         {
-+            qd[i] = qw[i] * c[i] / DetJ32(J + i, Q);
-+         }
-+         break;
-+      case 333:
-+         CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++)
-+         {
-+            MultJtCJ33(J + i, Q, c + i, Q, 1, qw[i], Q, qd + i);
-+         }
-+         break;
-+   }
-+   return 0;
-+}
-+
-+/// libCEED QFunction for building quadrature data for a curl-curl operator
-+/// with a vector coefficient evaluated at quadrature points
-+CEED_QFUNCTION(f_build_curlcurl_quad_vector)(void *ctx, CeedInt Q,
-+                                             const CeedScalar *const *in,
-+                                             CeedScalar *const *out)
-+{
-+   CurlCurlContext *bc = (CurlCurlContext *)ctx;
-+   // in[0] is coefficients with shape [ncomp=space_dim, Q]
-+   // in[1] is Jacobians with shape [dim, ncomp=space_dim, Q]
-+   // in[2] is quadrature weights, size (Q)
-+   //
-+   // At every quadrature point, compute qw/det(J) J^T C J and store the
-+   // symmetric part of the result. In 2D, compute and store qw * c / det(J)
-+   const CeedScalar *c = in[0], *J = in[1], *qw = in[2];
-+   CeedScalar *qd = out[0];
-+   switch (100 * bc->space_dim + 10 * bc->dim + bc->curl_dim)
-+   {
-+      case 333:
-+         CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++)
-+         {
-+            MultJtCJ33(J + i, Q, c + i, Q, 3, qw[i], Q, qd + i);
-+         }
-+         break;
-+   }
-+   return 0;
-+}
-+
-+/// libCEED QFunction for building quadrature data for a curl-curl operator
-+/// with a matrix coefficient evaluated at quadrature points
-+CEED_QFUNCTION(f_build_curlcurl_quad_matrix)(void *ctx, CeedInt Q,
-+                                             const CeedScalar *const *in,
-+                                             CeedScalar *const *out)
-+{
-+   CurlCurlContext *bc = (CurlCurlContext *)ctx;
-+   // in[0] is coefficients with shape [ncomp=space_dim*(space_dim+1)/2, Q]
-+   // in[1] is Jacobians with shape [dim, ncomp=space_dim, Q]
-+   // in[2] is quadrature weights, size (Q)
-+   //
-+   // At every quadrature point, compute qw/det(J) J^T C J and store the
-+   // symmetric part of the result. In 2D, compute and store qw * c / det(J)
-+   const CeedScalar *c = in[0], *J = in[1], *qw = in[2];
-+   CeedScalar *qd = out[0];
-+   switch (100 * bc->space_dim + 10 * bc->dim + bc->curl_dim)
-+   {
-+      case 333:
-+         CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++)
-+         {
-+            MultJtCJ33(J + i, Q, c + i, Q, 6, qw[i], Q, qd + i);
-+         }
-+         break;
-+   }
-+   return 0;
-+}
-+
-+/// libCEED QFunction for applying a curl-curl operator
-+CEED_QFUNCTION(f_apply_curlcurl)(void *ctx, CeedInt Q,
-+                                 const CeedScalar *const *in,
-+                                 CeedScalar *const *out)
-+{
-+   CurlCurlContext *bc = (CurlCurlContext *)ctx;
-+   // in[0], out[0] have shape [curl_dim, ncomp=1, Q]
-+   const CeedScalar *uc = in[0], *qd = in[1];
-+   CeedScalar *vc = out[0];
-+   switch (10 * bc->dim + bc->curl_dim)
-+   {
-+      case 21:
-+         CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++)
-+         {
-+            vc[i] = qd[i] * uc[i];
-+         }
-+         break;
-+      case 33:
-+         CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++)
-+         {
-+            const CeedScalar uc0 = uc[i + Q * 0];
-+            const CeedScalar uc1 = uc[i + Q * 1];
-+            const CeedScalar uc2 = uc[i + Q * 2];
-+            vc[i + Q * 0] = qd[i + Q * 0] * uc0 + qd[i + Q * 1] * uc1 + qd[i + Q * 2] * uc2;
-+            vc[i + Q * 1] = qd[i + Q * 1] * uc0 + qd[i + Q * 3] * uc1 + qd[i + Q * 4] * uc2;
-+            vc[i + Q * 2] = qd[i + Q * 2] * uc0 + qd[i + Q * 4] * uc1 + qd[i + Q * 5] * uc2;
-+         }
-+         break;
-+   }
-+   return 0;
-+}
-+
-+/// libCEED QFunction for applying a curl-curl operator with a scalar constant
-+/// coefficient
-+CEED_QFUNCTION(f_apply_curlcurl_mf_const_scalar)(void *ctx, CeedInt Q,
-+                                                 const CeedScalar *const *in,
-+                                                 CeedScalar *const *out)
-+{
-+   CurlCurlContext *bc = (CurlCurlContext *)ctx;
-+   // in[0], out[0] have shape [curl_dim, ncomp=1, Q]
-+   // in[1] is Jacobians with shape [dim, ncomp=space_dim, Q]
-+   // in[2] is quadrature weights, size (Q)
-+   //
-+   // At every quadrature point, compute qw/det(J) J^T C J
-+   const CeedScalar *coeff = bc->coeff;
-+   const CeedScalar *uc = in[0], *J = in[1], *qw = in[2];
-+   CeedScalar *vc = out[0];
-+   switch (100 * bc->space_dim + 10 * bc->dim + bc->curl_dim)
-+   {
-+      case 221:
-+         CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++)
-+         {
-+            const CeedScalar coeff0 = coeff[0];
-+            const CeedScalar qd = qw[i] * coeff0 / DetJ22(J + i, Q);
-+            vc[i] = qd * uc[i];
-+         }
-+         break;
-+      case 321:
-+         CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++)
-+         {
-+            const CeedScalar coeff0 = coeff[0];
-+            const CeedScalar qd = qw[i] * coeff0 / DetJ32(J + i, Q);
-+            vc[i] = qd * uc[i];
-+         }
-+         break;
-+      case 333:
-+         CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++)
-+         {
-+            CeedScalar qd[6];
-+            MultJtCJ33(J + i, Q, coeff, 1, 1, qw[i], 1, qd);
-+            const CeedScalar uc0 = uc[i + Q * 0];
-+            const CeedScalar uc1 = uc[i + Q * 1];
-+            const CeedScalar uc2 = uc[i + Q * 2];
-+            vc[i + Q * 0] = qd[0] * uc0 + qd[1] * uc1 + qd[2] * uc2;
-+            vc[i + Q * 1] = qd[1] * uc0 + qd[3] * uc1 + qd[4] * uc2;
-+            vc[i + Q * 2] = qd[2] * uc0 + qd[4] * uc1 + qd[5] * uc2;
-+         }
-+         break;
-+   }
-+   return 0;
-+}
-+
-+/// libCEED QFunction for applying a curl-curl operator with a vector constant
-+/// coefficient
-+CEED_QFUNCTION(f_apply_curlcurl_mf_const_vector)(void *ctx, CeedInt Q,
-+                                                 const CeedScalar *const *in,
-+                                                 CeedScalar *const *out)
-+{
-+   CurlCurlContext *bc = (CurlCurlContext *)ctx;
-+   // in[0], out[0] have shape [curl_dim, ncomp=1, Q]
-+   // in[1] is Jacobians with shape [dim, ncomp=space_dim, Q]
-+   // in[2] is quadrature weights, size (Q)
-+   //
-+   // At every quadrature point, compute qw/det(J) J^T C J
-+   const CeedScalar *coeff = bc->coeff;
-+   const CeedScalar *uc = in[0], *J = in[1], *qw = in[2];
-+   CeedScalar *vc = out[0];
-+   switch (100 * bc->space_dim + 10 * bc->dim + bc->curl_dim)
-+   {
-+      case 333:
-+         CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++)
-+         {
-+            CeedScalar qd[6];
-+            MultJtCJ33(J + i, Q, coeff, 1, 3, qw[i], 1, qd);
-+            const CeedScalar uc0 = uc[i + Q * 0];
-+            const CeedScalar uc1 = uc[i + Q * 1];
-+            const CeedScalar uc2 = uc[i + Q * 2];
-+            vc[i + Q * 0] = qd[0] * uc0 + qd[1] * uc1 + qd[2] * uc2;
-+            vc[i + Q * 1] = qd[1] * uc0 + qd[3] * uc1 + qd[4] * uc2;
-+            vc[i + Q * 2] = qd[2] * uc0 + qd[4] * uc1 + qd[5] * uc2;
-+         }
-+         break;
-+   }
-+   return 0;
-+}
-+
-+/// libCEED QFunction for applying a curl-curl operator with a matrix constant
-+/// coefficient
-+CEED_QFUNCTION(f_apply_curlcurl_mf_const_matrix)(void *ctx, CeedInt Q,
-+                                                 const CeedScalar *const *in,
-+                                                 CeedScalar *const *out)
-+{
-+   CurlCurlContext *bc = (CurlCurlContext *)ctx;
-+   // in[0], out[0] have shape [curl_dim, ncomp=1, Q]
-+   // in[1] is Jacobians with shape [dim, ncomp=space_dim, Q]
-+   // in[2] is quadrature weights, size (Q)
-+   //
-+   // At every quadrature point, compute qw/det(J) J^T C J
-+   const CeedScalar *coeff = bc->coeff;
-+   const CeedScalar *uc = in[0], *J = in[1], *qw = in[2];
-+   CeedScalar *vc = out[0];
-+   switch (100 * bc->space_dim + 10 * bc->dim + bc->curl_dim)
-+   {
-+      case 333:
-+         CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++)
-+         {
-+            CeedScalar qd[6];
-+            MultJtCJ33(J + i, Q, coeff, 1, 6, qw[i], 1, qd);
-+            const CeedScalar uc0 = uc[i + Q * 0];
-+            const CeedScalar uc1 = uc[i + Q * 1];
-+            const CeedScalar uc2 = uc[i + Q * 2];
-+            vc[i + Q * 0] = qd[0] * uc0 + qd[1] * uc1 + qd[2] * uc2;
-+            vc[i + Q * 1] = qd[1] * uc0 + qd[3] * uc1 + qd[4] * uc2;
-+            vc[i + Q * 2] = qd[2] * uc0 + qd[4] * uc1 + qd[5] * uc2;
-+         }
-+         break;
-+   }
-+   return 0;
-+}
-+
-+/// libCEED QFunction for applying a curl-curl operator with a scalar
-+/// coefficient evaluated at quadrature points
-+CEED_QFUNCTION(f_apply_curlcurl_mf_quad_scalar)(void *ctx, CeedInt Q,
-+                                                const CeedScalar *const *in,
-+                                                CeedScalar *const *out)
-+{
-+   CurlCurlContext *bc = (CurlCurlContext *)ctx;
-+   // in[0], out[0] have shape [curl_dim, ncomp=1, Q]
-+   // in[1] is coefficients with shape [ncomp=1, Q]
-+   // in[2] is Jacobians with shape [dim, ncomp=space_dim, Q]
-+   // in[3] is quadrature weights, size (Q)
-+   //
-+   // At every quadrature point, compute qw/det(J) J^T C J
-+   const CeedScalar *uc = in[0], *c = in[1], *J = in[2], *qw = in[3];
-+   CeedScalar *vc = out[0];
-+   switch (100 * bc->space_dim + 10 * bc->dim + bc->curl_dim)
-+   {
-+      case 221:
-+         CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++)
-+         {
-+            const CeedScalar qd = qw[i] * c[i] / DetJ22(J + i, Q);
-+            vc[i] = qd * uc[i];
-+         }
-+         break;
-+      case 321:
-+         CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++)
-+         {
-+            const CeedScalar qd = qw[i] * c[i] / DetJ32(J + i, Q);
-+            vc[i] = qd * uc[i];
-+         }
-+         break;
-+      case 333:
-+         CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++)
-+         {
-+            CeedScalar qd[6];
-+            MultJtCJ33(J + i, Q, c + i, Q, 1, qw[i], 1, qd);
-+            const CeedScalar uc0 = uc[i + Q * 0];
-+            const CeedScalar uc1 = uc[i + Q * 1];
-+            const CeedScalar uc2 = uc[i + Q * 2];
-+            vc[i + Q * 0] = qd[0] * uc0 + qd[1] * uc1 + qd[2] * uc2;
-+            vc[i + Q * 1] = qd[1] * uc0 + qd[3] * uc1 + qd[4] * uc2;
-+            vc[i + Q * 2] = qd[2] * uc0 + qd[4] * uc1 + qd[5] * uc2;
-+         }
-+         break;
-+   }
-+   return 0;
-+}
-+
-+/// libCEED QFunction for applying a curl-curl operator with a vector
-+/// coefficient evaluated at quadrature points
-+CEED_QFUNCTION(f_apply_curlcurl_mf_quad_vector)(void *ctx, CeedInt Q,
-+                                                const CeedScalar *const *in,
-+                                                CeedScalar *const *out)
-+{
-+   CurlCurlContext *bc = (CurlCurlContext *)ctx;
-+   // in[0], out[0] have shape [curl_dim, ncomp=1, Q]
-+   // in[1] is coefficients with shape [ncomp=space_dim, Q]
-+   // in[2] is Jacobians with shape [dim, ncomp=space_dim, Q]
-+   // in[3] is quadrature weights, size (Q)
-+   //
-+   // At every quadrature point, compute qw/det(J) J^T C J
-+   const CeedScalar *uc = in[0], *c = in[1], *J = in[2], *qw = in[3];
-+   CeedScalar *vc = out[0];
-+   switch (100 * bc->space_dim + 10 * bc->dim + bc->curl_dim)
-+   {
-+      case 333:
-+         CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++)
-+         {
-+            CeedScalar qd[6];
-+            MultJtCJ33(J + i, Q, c + i, Q, 3, qw[i], 1, qd);
-+            const CeedScalar uc0 = uc[i + Q * 0];
-+            const CeedScalar uc1 = uc[i + Q * 1];
-+            const CeedScalar uc2 = uc[i + Q * 2];
-+            vc[i + Q * 0] = qd[0] * uc0 + qd[1] * uc1 + qd[2] * uc2;
-+            vc[i + Q * 1] = qd[1] * uc0 + qd[3] * uc1 + qd[4] * uc2;
-+            vc[i + Q * 2] = qd[2] * uc0 + qd[4] * uc1 + qd[5] * uc2;
-+         }
-+         break;
-+   }
-+   return 0;
-+}
-+
-+/// libCEED QFunction for applying a curl-curl operator with a matrix
-+/// coefficient evaluated at quadrature points
-+CEED_QFUNCTION(f_apply_curlcurl_mf_quad_matrix)(void *ctx, CeedInt Q,
-+                                                const CeedScalar *const *in,
-+                                                CeedScalar *const *out)
-+{
-+   CurlCurlContext *bc = (CurlCurlContext *)ctx;
-+   // in[0], out[0] have shape [curl_dim, ncomp=1, Q]
-+   // in[1] is coefficients with shape [ncomp=space_dim*(space_dim+1)/2, Q]
-+   // in[2] is Jacobians with shape [dim, ncomp=space_dim, Q]
-+   // in[3] is quadrature weights, size (Q)
-+   //
-+   // At every quadrature point, compute qw/det(J) J^T C J
-+   const CeedScalar *uc = in[0], *c = in[1], *J = in[2], *qw = in[3];
-+   CeedScalar *vc = out[0];
-+   switch (100 * bc->space_dim + 10 * bc->dim + bc->curl_dim)
-+   {
-+      case 333:
-+         CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++)
-+         {
-+            CeedScalar qd[6];
-+            MultJtCJ33(J + i, Q, c + i, Q, 6, qw[i], 1, qd);
-+            const CeedScalar uc0 = uc[i + Q * 0];
-+            const CeedScalar uc1 = uc[i + Q * 1];
-+            const CeedScalar uc2 = uc[i + Q * 2];
-+            vc[i + Q * 0] = qd[0] * uc0 + qd[1] * uc1 + qd[2] * uc2;
-+            vc[i + Q * 1] = qd[1] * uc0 + qd[3] * uc1 + qd[4] * uc2;
-+            vc[i + Q * 2] = qd[2] * uc0 + qd[4] * uc1 + qd[5] * uc2;
-+         }
-+         break;
-+   }
-+   return 0;
-+}
-+
-+#endif // MFEM_LIBCEED_CURLCURL_QF_H
-diff --git a/fem/ceed/integrators/diffusion/diffusion.cpp b/fem/ceed/integrators/diffusion/diffusion.cpp
-index 4cd68669f..d2f56db82 100644
---- a/fem/ceed/integrators/diffusion/diffusion.cpp
-+++ b/fem/ceed/integrators/diffusion/diffusion.cpp
-@@ -25,106 +25,253 @@ namespace ceed
- #ifdef MFEM_USE_CEED
- struct DiffusionOperatorInfo : public OperatorInfo
- {
--   DiffusionContext ctx;
--   DiffusionOperatorInfo(int dim)
-+   DiffusionContext ctx = {0};
-+   template <typename CoeffType>
-+   DiffusionOperatorInfo(const mfem::FiniteElementSpace &fes, CoeffType *Q,
-+                         bool use_bdr = false, bool use_mf = false)
-    {
-+      ctx.dim = fes.GetMesh()->Dimension() - use_bdr;
-+      ctx.space_dim = fes.GetMesh()->SpaceDimension();
-+      ctx.vdim = fes.GetVDim();
-+      if (!use_mf)
-+      {
-+         apply_func = ":f_apply_diff";
-+         apply_qf = &f_apply_diff;
-+      }
-+      else
-+      {
-+         build_func = "";
-+         build_qf = nullptr;
-+      }
-+      if (Q == nullptr)
-+      {
-+         ctx.coeff[0] = 1.0;
-+         if (!use_mf)
-+         {
-+            build_func = ":f_build_diff_const_scalar";
-+            build_qf = &f_build_diff_const_scalar;
-+         }
-+         else
-+         {
-+            apply_func = ":f_apply_diff_mf_const_scalar";
-+            apply_qf = &f_apply_diff_mf_const_scalar;
-+         }
-+      }
-+      else
-+      {
-+         InitCoefficient(*Q, use_mf);
-+      }
-       header = "/integrators/diffusion/diffusion_qf.h";
--      build_func_const = ":f_build_diff_const";
--      build_qf_const = &f_build_diff_const;
--      build_func_quad = ":f_build_diff_quad";
--      build_qf_quad = &f_build_diff_quad;
--      apply_func = ":f_apply_diff";
--      apply_qf = &f_apply_diff;
--      apply_func_mf_const = ":f_apply_diff_mf_const";
--      apply_qf_mf_const = &f_apply_diff_mf_const;
--      apply_func_mf_quad = ":f_apply_diff_mf_quad";
--      apply_qf_mf_quad = &f_apply_diff_mf_quad;
-       trial_op = EvalMode::Grad;
-       test_op = EvalMode::Grad;
--      qdatasize = dim*(dim+1)/2;
-+      qdatasize = (ctx.dim * (ctx.dim + 1)) / 2;
-+   }
-+   void InitCoefficient(mfem::Coefficient &Q, bool use_mf)
-+   {
-+      if (mfem::ConstantCoefficient *const_coeff =
-+             dynamic_cast<mfem::ConstantCoefficient *>(&Q))
-+      {
-+         ctx.coeff[0] = const_coeff->constant;
-+         if (!use_mf)
-+         {
-+            build_func = ":f_build_diff_const_scalar";
-+            build_qf = &f_build_diff_const_scalar;
-+         }
-+         else
-+         {
-+            apply_func = ":f_apply_diff_mf_const_scalar";
-+            apply_qf = &f_apply_diff_mf_const_scalar;
-+         }
-+      }
-+      else
-+      {
-+         if (!use_mf)
-+         {
-+            build_func = ":f_build_diff_quad_scalar";
-+            build_qf = &f_build_diff_quad_scalar;
-+         }
-+         else
-+         {
-+            apply_func = ":f_apply_diff_mf_quad_scalar";
-+            apply_qf = &f_apply_diff_mf_quad_scalar;
-+         }
-+      }
-+   }
-+   void InitCoefficient(mfem::VectorCoefficient &VQ, bool use_mf)
-+   {
-+      if (mfem::VectorConstantCoefficient *const_coeff =
-+             dynamic_cast<mfem::VectorConstantCoefficient *>(&VQ))
-+      {
-+         const int vdim = VQ.GetVDim();
-+         MFEM_VERIFY(vdim <= LIBCEED_DIFF_COEFF_COMP_MAX,
-+                     "VectorCoefficient dimension exceeds context storage!");
-+         const mfem::Vector &val = const_coeff->GetVec();
-+         for (int i = 0; i < vdim; i++)
-+         {
-+            ctx.coeff[i] = val[i];
-+         }
-+         if (!use_mf)
-+         {
-+            build_func = ":f_build_diff_const_vector";
-+            build_qf = &f_build_diff_const_vector;
-+         }
-+         else
-+         {
-+            apply_func = ":f_apply_diff_mf_const_vector";
-+            apply_qf = &f_apply_diff_mf_const_vector;
-+         }
-+      }
-+      else
-+      {
-+         if (!use_mf)
-+         {
-+            build_func = ":f_build_diff_quad_vector";
-+            build_qf = &f_build_diff_quad_vector;
-+         }
-+         else
-+         {
-+            apply_func = ":f_apply_diff_mf_quad_vector";
-+            apply_qf = &f_apply_diff_mf_quad_vector;
-+         }
-+      }
-+   }
-+   void InitCoefficient(mfem::MatrixCoefficient &MQ, bool use_mf)
-+   {
-+      // Assumes matrix coefficient is symmetric
-+      if (mfem::MatrixConstantCoefficient *const_coeff =
-+             dynamic_cast<mfem::MatrixConstantCoefficient *>(&MQ))
-+      {
-+         const int vdim = MQ.GetVDim();
-+         MFEM_VERIFY((vdim * (vdim + 1)) / 2 <= LIBCEED_DIFF_COEFF_COMP_MAX,
-+                     "MatrixCoefficient dimensions exceed context storage!");
-+         const mfem::DenseMatrix &val = const_coeff->GetMatrix();
-+         for (int j = 0; j < vdim; j++)
-+         {
-+            for (int i = j; i < vdim; i++)
-+            {
-+               const int idx = (j * vdim) - (((j - 1) * j) / 2) + i - j;
-+               ctx.coeff[idx] = val(i, j);
-+            }
-+         }
-+         if (!use_mf)
-+         {
-+            build_func = ":f_build_diff_const_matrix";
-+            build_qf = &f_build_diff_const_matrix;
-+         }
-+         else
-+         {
-+            apply_func = ":f_apply_diff_mf_const_matrix";
-+            apply_qf = &f_apply_diff_mf_const_matrix;
-+         }
-+      }
-+      else
-+      {
-+         if (!use_mf)
-+         {
-+            build_func = ":f_build_diff_quad_matrix";
-+            build_qf = &f_build_diff_quad_matrix;
-+         }
-+         else
-+         {
-+            apply_func = ":f_apply_diff_mf_quad_matrix";
-+            apply_qf = &f_apply_diff_mf_quad_matrix;
-+         }
-+      }
-    }
- };
- #endif
- 
-+template <typename CoeffType>
- PADiffusionIntegrator::PADiffusionIntegrator(
-+   const mfem::DiffusionIntegrator &integ,
-    const mfem::FiniteElementSpace &fes,
--   const mfem::IntegrationRule &irm,
--   mfem::Coefficient *Q)
--   : PAIntegrator()
-+   CoeffType *Q,
-+   const bool use_bdr)
- {
- #ifdef MFEM_USE_CEED
--   DiffusionOperatorInfo info(fes.GetMesh()->Dimension());
--   Assemble(info, fes, irm, Q);
-+   DiffusionOperatorInfo info(fes, Q, use_bdr);
-+   Assemble(integ, info, fes, Q, use_bdr);
- #else
-    MFEM_ABORT("MFEM must be built with MFEM_USE_CEED=YES to use libCEED.");
- #endif
- }
- 
--MixedPADiffusionIntegrator::MixedPADiffusionIntegrator(
--   const DiffusionIntegrator &integ,
-+template <typename CoeffType>
-+PADiffusionIntegrator::PADiffusionIntegrator(
-+   const mfem::VectorDiffusionIntegrator &integ,
-    const mfem::FiniteElementSpace &fes,
--   mfem::Coefficient *Q)
-+   CoeffType *Q,
-+   const bool use_bdr)
- {
- #ifdef MFEM_USE_CEED
--   DiffusionOperatorInfo info(fes.GetMesh()->Dimension());
--   Assemble(integ, info, fes, Q);
-+   DiffusionOperatorInfo info(fes, Q, use_bdr);
-+   Assemble(integ, info, fes, Q, use_bdr);
- #else
-    MFEM_ABORT("MFEM must be built with MFEM_USE_CEED=YES to use libCEED.");
- #endif
- }
- 
--MixedPADiffusionIntegrator::MixedPADiffusionIntegrator(
--   const VectorDiffusionIntegrator &integ,
-+template <typename CoeffType>
-+MFDiffusionIntegrator::MFDiffusionIntegrator(
-+   const mfem::DiffusionIntegrator &integ,
-    const mfem::FiniteElementSpace &fes,
--   mfem::Coefficient *Q)
-+   CoeffType *Q,
-+   const bool use_bdr)
- {
- #ifdef MFEM_USE_CEED
--   DiffusionOperatorInfo info(fes.GetMesh()->Dimension());
--   Assemble(integ, info, fes, Q);
-+   DiffusionOperatorInfo info(fes, Q, use_bdr, true);
-+   Assemble(integ, info, fes, Q, use_bdr, true);
- #else
-    MFEM_ABORT("MFEM must be built with MFEM_USE_CEED=YES to use libCEED.");
- #endif
- }
- 
-+template <typename CoeffType>
- MFDiffusionIntegrator::MFDiffusionIntegrator(
-+   const mfem::VectorDiffusionIntegrator &integ,
-    const mfem::FiniteElementSpace &fes,
--   const mfem::IntegrationRule &irm,
--   mfem::Coefficient *Q)
--   : MFIntegrator()
-+   CoeffType *Q,
-+   const bool use_bdr)
- {
- #ifdef MFEM_USE_CEED
--   DiffusionOperatorInfo info(fes.GetMesh()->Dimension());
--   Assemble(info, fes, irm, Q);
-+   DiffusionOperatorInfo info(fes, Q, use_bdr, true);
-+   Assemble(integ, info, fes, Q, use_bdr, true);
- #else
-    MFEM_ABORT("MFEM must be built with MFEM_USE_CEED=YES to use libCEED.");
- #endif
- }
- 
--MixedMFDiffusionIntegrator::MixedMFDiffusionIntegrator(
--   const DiffusionIntegrator &integ,
--   const mfem::FiniteElementSpace &fes,
--   mfem::Coefficient *Q)
--{
--#ifdef MFEM_USE_CEED
--   DiffusionOperatorInfo info(fes.GetMesh()->Dimension());
--   Assemble(integ, info, fes, Q);
--#else
--   MFEM_ABORT("MFEM must be built with MFEM_USE_CEED=YES to use libCEED.");
--#endif
--}
-+// @cond DOXYGEN_SKIP
- 
--MixedMFDiffusionIntegrator::MixedMFDiffusionIntegrator(
--   const VectorDiffusionIntegrator &integ,
--   const mfem::FiniteElementSpace &fes,
--   mfem::Coefficient *Q)
--{
--#ifdef MFEM_USE_CEED
--   DiffusionOperatorInfo info(fes.GetMesh()->Dimension());
--   Assemble(integ, info, fes, Q);
--#else
--   MFEM_ABORT("MFEM must be built with MFEM_USE_CEED=YES to use libCEED.");
--#endif
--}
-+template PADiffusionIntegrator::PADiffusionIntegrator(
-+   const mfem::DiffusionIntegrator &, const mfem::FiniteElementSpace &,
-+   mfem::Coefficient *, const bool);
-+template PADiffusionIntegrator::PADiffusionIntegrator(
-+   const mfem::DiffusionIntegrator &, const mfem::FiniteElementSpace &,
-+   mfem::VectorCoefficient *, const bool);
-+template PADiffusionIntegrator::PADiffusionIntegrator(
-+   const mfem::DiffusionIntegrator &, const mfem::FiniteElementSpace &,
-+   mfem::MatrixCoefficient *, const bool);
-+
-+template PADiffusionIntegrator::PADiffusionIntegrator(
-+   const mfem::VectorDiffusionIntegrator &, const mfem::FiniteElementSpace &,
-+   mfem::Coefficient *, const bool);
-+
-+template MFDiffusionIntegrator::MFDiffusionIntegrator(
-+   const mfem::DiffusionIntegrator &, const mfem::FiniteElementSpace &,
-+   mfem::Coefficient *, const bool);
-+template MFDiffusionIntegrator::MFDiffusionIntegrator(
-+   const mfem::DiffusionIntegrator &, const mfem::FiniteElementSpace &,
-+   mfem::VectorCoefficient *, const bool);
-+template MFDiffusionIntegrator::MFDiffusionIntegrator(
-+   const mfem::DiffusionIntegrator &, const mfem::FiniteElementSpace &,
-+   mfem::MatrixCoefficient *, const bool);
-+
-+template MFDiffusionIntegrator::MFDiffusionIntegrator(
-+   const mfem::VectorDiffusionIntegrator &, const mfem::FiniteElementSpace &,
-+   mfem::Coefficient *, const bool);
-+
-+// @endcond
- 
- } // namespace ceed
- 
-diff --git a/fem/ceed/integrators/diffusion/diffusion.hpp b/fem/ceed/integrators/diffusion/diffusion.hpp
-index dd28c9d16..b92710bad 100644
---- a/fem/ceed/integrators/diffusion/diffusion.hpp
-+++ b/fem/ceed/integrators/diffusion/diffusion.hpp
-@@ -13,7 +13,7 @@
- #define MFEM_LIBCEED_DIFF_HPP
- 
- #include "../../interface/integrator.hpp"
--#include "../../interface/mixed_integrator.hpp"
-+#include "../../interface/mixed_operator.hpp"
- #include "../../../fespace.hpp"
- 
- namespace mfem
-@@ -23,45 +23,37 @@ namespace ceed
- {
- 
- /// Represent a DiffusionIntegrator with AssemblyLevel::Partial using libCEED.
--class PADiffusionIntegrator : public PAIntegrator
-+class PADiffusionIntegrator : public MixedOperator<Integrator>
- {
- public:
--   PADiffusionIntegrator(const mfem::FiniteElementSpace &fes,
--                         const mfem::IntegrationRule &ir,
--                         mfem::Coefficient *Q);
--};
--
--class MixedPADiffusionIntegrator : public MixedIntegrator<PAIntegrator>
--{
--public:
--   MixedPADiffusionIntegrator(const DiffusionIntegrator &integ,
--                              const mfem::FiniteElementSpace &fes,
--                              mfem::Coefficient *Q);
--
--   MixedPADiffusionIntegrator(const VectorDiffusionIntegrator &integ,
--                              const mfem::FiniteElementSpace &fes,
--                              mfem::Coefficient *Q);
-+   template <typename CoeffType>
-+   PADiffusionIntegrator(const mfem::DiffusionIntegrator &integ,
-+                         const mfem::FiniteElementSpace &fes,
-+                         CoeffType *Q,
-+                         const bool use_bdr = false);
-+
-+   template <typename CoeffType>
-+   PADiffusionIntegrator(const mfem::VectorDiffusionIntegrator &integ,
-+                         const mfem::FiniteElementSpace &fes,
-+                         CoeffType *Q,
-+                         const bool use_bdr = false);
- };
- 
- /// Represent a DiffusionIntegrator with AssemblyLevel::None using libCEED.
--class MFDiffusionIntegrator : public MFIntegrator
-+class MFDiffusionIntegrator : public MixedOperator<Integrator>
- {
- public:
--   MFDiffusionIntegrator(const mfem::FiniteElementSpace &fes,
--                         const mfem::IntegrationRule &ir,
--                         mfem::Coefficient *Q);
--};
--
--class MixedMFDiffusionIntegrator : public MixedIntegrator<MFIntegrator>
--{
--public:
--   MixedMFDiffusionIntegrator(const DiffusionIntegrator &integ,
--                              const mfem::FiniteElementSpace &fes,
--                              mfem::Coefficient *Q);
--
--   MixedMFDiffusionIntegrator(const VectorDiffusionIntegrator &integ,
--                              const mfem::FiniteElementSpace &fes,
--                              mfem::Coefficient *Q);
-+   template <typename CoeffType>
-+   MFDiffusionIntegrator(const mfem::DiffusionIntegrator &integ,
-+                         const mfem::FiniteElementSpace &fes,
-+                         CoeffType *Q,
-+                         const bool use_bdr = false);
-+
-+   template <typename CoeffType>
-+   MFDiffusionIntegrator(const mfem::VectorDiffusionIntegrator &integ,
-+                         const mfem::FiniteElementSpace &fes,
-+                         CoeffType *Q,
-+                         const bool use_bdr = false);
- };
- 
- }
-diff --git a/fem/ceed/integrators/diffusion/diffusion_qf.h b/fem/ceed/integrators/diffusion/diffusion_qf.h
-index aa4850e37..9ab50a3ed 100644
---- a/fem/ceed/integrators/diffusion/diffusion_qf.h
-+++ b/fem/ceed/integrators/diffusion/diffusion_qf.h
-@@ -9,180 +9,331 @@
- // terms of the BSD-3 license. We welcome feedback and contributions, see file
- // CONTRIBUTING.md for details.
- 
-+#ifndef MFEM_LIBCEED_DIFF_QF_H
-+#define MFEM_LIBCEED_DIFF_QF_H
- 
--/// A structure used to pass additional data to f_build_diff and f_apply_diff
--struct DiffusionContext { CeedInt dim, space_dim, vdim; CeedScalar coeff; };
-+#include "../util/util_qf.h"
- 
--/// libCEED Q-function for building quadrature data for a diffusion operator
--/// with a constant coefficient
--CEED_QFUNCTION(f_build_diff_const)(void *ctx, CeedInt Q,
--                                   const CeedScalar *const *in,
--                                   CeedScalar *const *out)
-+#define LIBCEED_DIFF_COEFF_COMP_MAX 6
-+
-+struct DiffusionContext
-+{
-+   CeedInt dim, space_dim, vdim;
-+   CeedScalar coeff[LIBCEED_DIFF_COEFF_COMP_MAX];
-+};
-+
-+/// libCEED QFunction for building quadrature data for a diffusion operator
-+/// with a scalar constant coefficient
-+CEED_QFUNCTION(f_build_diff_const_scalar)(void *ctx, CeedInt Q,
-+                                          const CeedScalar *const *in,
-+                                          CeedScalar *const *out)
- {
--   DiffusionContext *bc = (DiffusionContext*)ctx;
--   // in[0] is Jacobians with shape [dim, nc=dim, Q]
-+   DiffusionContext *bc = (DiffusionContext *)ctx;
-+   // in[0] is Jacobians with shape [dim, ncomp=space_dim, Q]
-    // in[1] is quadrature weights, size (Q)
-    //
--   // At every quadrature point, compute qw/det(J).adj(J).adj(J)^T and store
--   // the symmetric part of the result.
--   const CeedScalar coeff = bc->coeff;
-+   // At every quadrature point, compute qw/det(J) adj(J) C adj(J)^T and store
-+   // the symmetric part of the result
-+   const CeedScalar *coeff = bc->coeff;
-    const CeedScalar *J = in[0], *qw = in[1];
-    CeedScalar *qd = out[0];
--   switch (bc->dim + 10 * bc->space_dim)
-+   switch (10 * bc->space_dim + bc->dim)
-    {
-       case 11:
--         for (CeedInt i = 0; i < Q; i++)
-+         CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++)
-+         {
-+            const CeedScalar coeff0 = coeff[0];
-+            qd[i] = qw[i] * coeff0 / J[i];
-+         }
-+         break;
-+      case 21:
-+         CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++)
-+         {
-+            MultAdjJCAdjJt21(J + i, Q, coeff, 1, 1, qw[i], Q, qd + i);
-+         }
-+         break;
-+      case 22:
-+         CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++)
-+         {
-+            MultAdjJCAdjJt22(J + i, Q, coeff, 1, 1, qw[i], Q, qd + i);
-+         }
-+         break;
-+      case 32:
-+         CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++)
-+         {
-+            MultAdjJCAdjJt32(J + i, Q, coeff, 1, 1, qw[i], Q, qd + i);
-+         }
-+         break;
-+      case 33:
-+         CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++)
-+         {
-+            MultAdjJCAdjJt33(J + i, Q, coeff, 1, 1, qw[i], Q, qd + i);
-+         }
-+         break;
-+   }
-+   return 0;
-+}
-+
-+/// libCEED QFunction for building quadrature data for a diffusion operator
-+/// with a vector constant coefficient
-+CEED_QFUNCTION(f_build_diff_const_vector)(void *ctx, CeedInt Q,
-+                                          const CeedScalar *const *in,
-+                                          CeedScalar *const *out)
-+{
-+   DiffusionContext *bc = (DiffusionContext *)ctx;
-+   // in[0] is Jacobians with shape [dim, ncomp=space_dim, Q]
-+   // in[1] is quadrature weights, size (Q)
-+   //
-+   // At every quadrature point, compute qw/det(J) adj(J) C adj(J)^T and store
-+   // the symmetric part of the result
-+   const CeedScalar *coeff = bc->coeff;
-+   const CeedScalar *J = in[0], *qw = in[1];
-+   CeedScalar *qd = out[0];
-+   switch (10 * bc->space_dim + bc->dim)
-+   {
-+      case 21:
-+         CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++)
-+         {
-+            MultAdjJCAdjJt21(J + i, Q, coeff, 1, 2, qw[i], Q, qd + i);
-+         }
-+         break;
-+      case 22:
-+         CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++)
-+         {
-+            MultAdjJCAdjJt22(J + i, Q, coeff, 1, 2, qw[i], Q, qd + i);
-+         }
-+         break;
-+      case 32:
-+         CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++)
-+         {
-+            MultAdjJCAdjJt32(J + i, Q, coeff, 1, 3, qw[i], Q, qd + i);
-+         }
-+         break;
-+      case 33:
-+         CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++)
-+         {
-+            MultAdjJCAdjJt33(J + i, Q, coeff, 1, 3, qw[i], Q, qd + i);
-+         }
-+         break;
-+   }
-+   return 0;
-+}
-+
-+/// libCEED QFunction for building quadrature data for a diffusion operator
-+/// with a matrix constant coefficient
-+CEED_QFUNCTION(f_build_diff_const_matrix)(void *ctx, CeedInt Q,
-+                                          const CeedScalar *const *in,
-+                                          CeedScalar *const *out)
-+{
-+   DiffusionContext *bc = (DiffusionContext *)ctx;
-+   // in[0] is Jacobians with shape [dim, ncomp=space_dim, Q]
-+   // in[1] is quadrature weights, size (Q)
-+   //
-+   // At every quadrature point, compute qw/det(J) adj(J) C adj(J)^T and store
-+   // the symmetric part of the result
-+   const CeedScalar *coeff = bc->coeff;
-+   const CeedScalar *J = in[0], *qw = in[1];
-+   CeedScalar *qd = out[0];
-+   switch (10 * bc->space_dim + bc->dim)
-+   {
-+      case 21:
-+         CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++)
-          {
--            qd[i] = coeff * qw[i] / J[i];
-+            MultAdjJCAdjJt21(J + i, Q, coeff, 1, 3, qw[i], Q, qd + i);
-          }
-          break;
-       case 22:
--         for (CeedInt i = 0; i < Q; i++)
-+         CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++)
-+         {
-+            MultAdjJCAdjJt22(J + i, Q, coeff, 1, 3, qw[i], Q, qd + i);
-+         }
-+         break;
-+      case 32:
-+         CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++)
-          {
--            // J: 0 2   qd: 0 1   adj(J):  J22 -J12
--            //    1 3       1 2           -J21  J11
--            const CeedScalar J11 = J[i + Q * 0];
--            const CeedScalar J21 = J[i + Q * 1];
--            const CeedScalar J12 = J[i + Q * 2];
--            const CeedScalar J22 = J[i + Q * 3];
--            const CeedScalar w = qw[i] / (J11 * J22 - J21 * J12);
--            qd[i + Q * 0] =   coeff * w * (J12 * J12 + J22 * J22);
--            qd[i + Q * 1] = - coeff * w * (J11 * J12 + J21 * J22);
--            qd[i + Q * 2] =   coeff * w * (J11 * J11 + J21 * J21);
-+            MultAdjJCAdjJt32(J + i, Q, coeff, 1, 6, qw[i], Q, qd + i);
-          }
-          break;
-       case 33:
--         for (CeedInt i = 0; i < Q; i++)
--         {
--            // J: 0 3 6   qd: 0 1 2
--            //    1 4 7       1 3 4
--            //    2 5 8       2 4 5
--            const CeedScalar J11 = J[i + Q * 0];
--            const CeedScalar J21 = J[i + Q * 1];
--            const CeedScalar J31 = J[i + Q * 2];
--            const CeedScalar J12 = J[i + Q * 3];
--            const CeedScalar J22 = J[i + Q * 4];
--            const CeedScalar J32 = J[i + Q * 5];
--            const CeedScalar J13 = J[i + Q * 6];
--            const CeedScalar J23 = J[i + Q * 7];
--            const CeedScalar J33 = J[i + Q * 8];
--            const CeedScalar A11 = J22 * J33 - J23 * J32;
--            const CeedScalar A12 = J13 * J32 - J12 * J33;
--            const CeedScalar A13 = J12 * J23 - J13 * J22;
--            const CeedScalar A21 = J23 * J31 - J21 * J33;
--            const CeedScalar A22 = J11 * J33 - J13 * J31;
--            const CeedScalar A23 = J13 * J21 - J11 * J23;
--            const CeedScalar A31 = J21 * J32 - J22 * J31;
--            const CeedScalar A32 = J12 * J31 - J11 * J32;
--            const CeedScalar A33 = J11 * J22 - J12 * J21;
--            const CeedScalar w = qw[i] / (J11 * A11 + J21 * A12 + J31 * A13);
--            qd[i + Q * 0] = coeff * w * (A11 * A11 + A12 * A12 + A13 * A13);
--            qd[i + Q * 1] = coeff * w * (A11 * A21 + A12 * A22 + A13 * A23);
--            qd[i + Q * 2] = coeff * w * (A11 * A31 + A12 * A32 + A13 * A33);
--            qd[i + Q * 3] = coeff * w * (A21 * A21 + A22 * A22 + A23 * A23);
--            qd[i + Q * 4] = coeff * w * (A21 * A31 + A22 * A32 + A23 * A33);
--            qd[i + Q * 5] = coeff * w * (A31 * A31 + A32 * A32 + A33 * A33);
-+         CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++)
-+         {
-+            MultAdjJCAdjJt33(J + i, Q, coeff, 1, 6, qw[i], Q, qd + i);
-          }
-          break;
-    }
-    return 0;
- }
- 
--/// libCEED Q-function for building quadrature data for a diffusion operator
--/// coefficient evaluated at quadrature points.
--CEED_QFUNCTION(f_build_diff_quad)(void *ctx, CeedInt Q,
--                                  const CeedScalar *const *in,
--                                  CeedScalar *const *out)
-+/// libCEED QFunction for building quadrature data for a diffusion operator
-+/// with a scalar coefficient evaluated at quadrature points
-+CEED_QFUNCTION(f_build_diff_quad_scalar)(void *ctx, CeedInt Q,
-+                                         const CeedScalar *const *in,
-+                                         CeedScalar *const *out)
- {
-    DiffusionContext *bc = (DiffusionContext *)ctx;
--   // in[1] is Jacobians with shape [dim, nc=dim, Q]
-+   // in[0] is coefficients with shape [ncomp=1, Q]
-+   // in[1] is Jacobians with shape [dim, ncomp=space_dim, Q]
-    // in[2] is quadrature weights, size (Q)
-    //
--   // At every quadrature point, compute qw/det(J).adj(J).adj(J)^T and store
--   // the symmetric part of the result.
-+   // At every quadrature point, compute qw/det(J) adj(J) C adj(J)^T and store
-+   // the symmetric part of the result
-    const CeedScalar *c = in[0], *J = in[1], *qw = in[2];
-    CeedScalar *qd = out[0];
--   switch (bc->dim + 10 * bc->space_dim)
-+   switch (10 * bc->space_dim + bc->dim)
-    {
-       case 11:
--         for (CeedInt i = 0; i < Q; i++)
-+         CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++)
-          {
--            qd[i] = c[i] * qw[i] / J[i];
-+            qd[i] = qw[i] * c[i] / J[i];
-+         }
-+         break;
-+      case 21:
-+         CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++)
-+         {
-+            MultAdjJCAdjJt21(J + i, Q, c + i, Q, 1, qw[i], Q, qd + i);
-          }
-          break;
-       case 22:
--         for (CeedInt i = 0; i < Q; i++)
-+         CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++)
-          {
--            // J: 0 2   qd: 0 1   adj(J):  J22 -J12
--            //    1 3       1 2           -J21  J11
--            const CeedScalar coeff = c[i];
--            const CeedScalar J11 = J[i + Q * 0];
--            const CeedScalar J21 = J[i + Q * 1];
--            const CeedScalar J12 = J[i + Q * 2];
--            const CeedScalar J22 = J[i + Q * 3];
--            const CeedScalar w = qw[i] / (J11 * J22 - J21 * J12);
--            qd[i + Q * 0] =   coeff * w * (J12 * J12 + J22 * J22);
--            qd[i + Q * 1] = - coeff * w * (J11 * J12 + J21 * J22);
--            qd[i + Q * 2] =   coeff * w * (J11 * J11 + J21 * J21);
-+            MultAdjJCAdjJt22(J + i, Q, c + i, Q, 1, qw[i], Q, qd + i);
-+         }
-+         break;
-+      case 32:
-+         CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++)
-+         {
-+            MultAdjJCAdjJt32(J + i, Q, c + i, Q, 1, qw[i], Q, qd + i);
-          }
-          break;
-       case 33:
--         for (CeedInt i = 0; i < Q; i++)
--         {
--            // J: 0 3 6   qd: 0 1 2
--            //    1 4 7       1 3 4
--            //    2 5 8       2 4 5
--            const CeedScalar coeff = c[i];
--            const CeedScalar J11 = J[i + Q * 0];
--            const CeedScalar J21 = J[i + Q * 1];
--            const CeedScalar J31 = J[i + Q * 2];
--            const CeedScalar J12 = J[i + Q * 3];
--            const CeedScalar J22 = J[i + Q * 4];
--            const CeedScalar J32 = J[i + Q * 5];
--            const CeedScalar J13 = J[i + Q * 6];
--            const CeedScalar J23 = J[i + Q * 7];
--            const CeedScalar J33 = J[i + Q * 8];
--            const CeedScalar A11 = J22 * J33 - J23 * J32;
--            const CeedScalar A12 = J13 * J32 - J12 * J33;
--            const CeedScalar A13 = J12 * J23 - J13 * J22;
--            const CeedScalar A21 = J23 * J31 - J21 * J33;
--            const CeedScalar A22 = J11 * J33 - J13 * J31;
--            const CeedScalar A23 = J13 * J21 - J11 * J23;
--            const CeedScalar A31 = J21 * J32 - J22 * J31;
--            const CeedScalar A32 = J12 * J31 - J11 * J32;
--            const CeedScalar A33 = J11 * J22 - J12 * J21;
--            const CeedScalar w = qw[i] / (J11 * A11 + J21 * A12 + J31 * A13);
--            qd[i + Q * 0] = coeff * w * (A11 * A11 + A12 * A12 + A13 * A13);
--            qd[i + Q * 1] = coeff * w * (A11 * A21 + A12 * A22 + A13 * A23);
--            qd[i + Q * 2] = coeff * w * (A11 * A31 + A12 * A32 + A13 * A33);
--            qd[i + Q * 3] = coeff * w * (A21 * A21 + A22 * A22 + A23 * A23);
--            qd[i + Q * 4] = coeff * w * (A21 * A31 + A22 * A32 + A23 * A33);
--            qd[i + Q * 5] = coeff * w * (A31 * A31 + A32 * A32 + A33 * A33);
-+         CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++)
-+         {
-+            MultAdjJCAdjJt33(J + i, Q, c + i, Q, 1, qw[i], Q, qd + i);
-          }
-          break;
-    }
-    return 0;
- }
- 
--/// libCEED Q-function for applying a diff operator
-+/// libCEED QFunction for building quadrature data for a diffusion operator
-+/// with a vector coefficient evaluated at quadrature points
-+CEED_QFUNCTION(f_build_diff_quad_vector)(void *ctx, CeedInt Q,
-+                                         const CeedScalar *const *in,
-+                                         CeedScalar *const *out)
-+{
-+   DiffusionContext *bc = (DiffusionContext *)ctx;
-+   // in[0] is coefficients with shape [ncomp=space_dim, Q]
-+   // in[1] is Jacobians with shape [dim, ncomp=space_dim, Q]
-+   // in[2] is quadrature weights, size (Q)
-+   //
-+   // At every quadrature point, compute qw/det(J) adj(J) C adj(J)^T and store
-+   // the symmetric part of the result
-+   const CeedScalar *c = in[0], *J = in[1], *qw = in[2];
-+   CeedScalar *qd = out[0];
-+   switch (10 * bc->space_dim + bc->dim)
-+   {
-+      case 21:
-+         CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++)
-+         {
-+            MultAdjJCAdjJt21(J + i, Q, c + i, Q, 2, qw[i], Q, qd + i);
-+         }
-+         break;
-+      case 22:
-+         CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++)
-+         {
-+            MultAdjJCAdjJt22(J + i, Q, c + i, Q, 2, qw[i], Q, qd + i);
-+         }
-+         break;
-+      case 32:
-+         CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++)
-+         {
-+            MultAdjJCAdjJt32(J + i, Q, c + i, Q, 3, qw[i], Q, qd + i);
-+         }
-+         break;
-+      case 33:
-+         CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++)
-+         {
-+            MultAdjJCAdjJt33(J + i, Q, c + i, Q, 3, qw[i], Q, qd + i);
-+         }
-+         break;
-+   }
-+   return 0;
-+}
-+
-+/// libCEED QFunction for building quadrature data for a diffusion operator
-+/// with a matrix coefficient evaluated at quadrature points
-+CEED_QFUNCTION(f_build_diff_quad_matrix)(void *ctx, CeedInt Q,
-+                                         const CeedScalar *const *in,
-+                                         CeedScalar *const *out)
-+{
-+   DiffusionContext *bc = (DiffusionContext *)ctx;
-+   // in[0] is coefficients with shape [ncomp=space_dim*(space_dim+1)/2, Q]
-+   // in[1] is Jacobians with shape [dim, ncomp=space_dim, Q]
-+   // in[2] is quadrature weights, size (Q)
-+   //
-+   // At every quadrature point, compute qw/det(J) adj(J) C adj(J)^T and store
-+   // the symmetric part of the result
-+   const CeedScalar *c = in[0], *J = in[1], *qw = in[2];
-+   CeedScalar *qd = out[0];
-+   switch (10 * bc->space_dim + bc->dim)
-+   {
-+      case 21:
-+         CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++)
-+         {
-+            MultAdjJCAdjJt21(J + i, Q, c + i, Q, 3, qw[i], Q, qd + i);
-+         }
-+         break;
-+      case 22:
-+         CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++)
-+         {
-+            MultAdjJCAdjJt22(J + i, Q, c + i, Q, 3, qw[i], Q, qd + i);
-+         }
-+         break;
-+      case 32:
-+         CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++)
-+         {
-+            MultAdjJCAdjJt32(J + i, Q, c + i, Q, 6, qw[i], Q, qd + i);
-+         }
-+         break;
-+      case 33:
-+         CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++)
-+         {
-+            MultAdjJCAdjJt33(J + i, Q, c + i, Q, 6, qw[i], Q, qd + i);
-+         }
-+         break;
-+   }
-+   return 0;
-+}
-+
-+/// libCEED QFunction for applying a diffusion operator
- CEED_QFUNCTION(f_apply_diff)(void *ctx, CeedInt Q,
-                              const CeedScalar *const *in,
-                              CeedScalar *const *out)
- {
-    DiffusionContext *bc = (DiffusionContext *)ctx;
--   // in[0], out[0] have shape [dim, nc=1, Q]
-+   // in[0], out[0] have shape [dim, ncomp=vdim, Q]
-    const CeedScalar *ug = in[0], *qd = in[1];
-    CeedScalar *vg = out[0];
--   switch (10*bc->dim + bc->vdim)
-+   switch (10 * bc->dim + bc->vdim)
-    {
-       case 11:
--         for (CeedInt i = 0; i < Q; i++)
-+         CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++)
-          {
--            vg[i] = ug[i] * qd[i];
-+            vg[i] = qd[i] * ug[i];
-+         }
-+         break;
-+      case 12:
-+         CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++)
-+         {
-+            const CeedScalar qd0 = qd[i];
-+            CeedPragmaSIMD for (CeedInt d = 0; d < 2; d++)
-+            {
-+               vg[i + Q * d] = qd0 * ug[i + Q * d];
-+            }
-          }
-          break;
-       case 21:
--         for (CeedInt i = 0; i < Q; i++)
-+         CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++)
-          {
-             const CeedScalar ug0 = ug[i + Q * 0];
-             const CeedScalar ug1 = ug[i + Q * 1];
-@@ -191,23 +342,23 @@ CEED_QFUNCTION(f_apply_diff)(void *ctx, CeedInt Q,
-          }
-          break;
-       case 22:
--         for (CeedInt i = 0; i < Q; i++)
-+         CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++)
-          {
-             const CeedScalar qd00 = qd[i + Q * 0];
-             const CeedScalar qd01 = qd[i + Q * 1];
-             const CeedScalar qd10 = qd01;
-             const CeedScalar qd11 = qd[i + Q * 2];
--            for (CeedInt c = 0; c < 2; c++)
-+            CeedPragmaSIMD for (CeedInt d = 0; d < 2; d++)
-             {
--               const CeedScalar ug0 = ug[i + Q * (c+2*0)];
--               const CeedScalar ug1 = ug[i + Q * (c+2*1)];
--               vg[i + Q * (c+2*0)] = qd00 * ug0 + qd01 * ug1;
--               vg[i + Q * (c+2*1)] = qd10 * ug0 + qd11 * ug1;
-+               const CeedScalar ug0 = ug[i + Q * (d + 2 * 0)];
-+               const CeedScalar ug1 = ug[i + Q * (d + 2 * 1)];
-+               vg[i + Q * (d + 2 * 0)] = qd00 * ug0 + qd01 * ug1;
-+               vg[i + Q * (d + 2 * 1)] = qd10 * ug0 + qd11 * ug1;
-             }
-          }
-          break;
-       case 31:
--         for (CeedInt i = 0; i < Q; i++)
-+         CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++)
-          {
-             const CeedScalar ug0 = ug[i + Q * 0];
-             const CeedScalar ug1 = ug[i + Q * 1];
-@@ -217,8 +368,24 @@ CEED_QFUNCTION(f_apply_diff)(void *ctx, CeedInt Q,
-             vg[i + Q * 2] = qd[i + Q * 2] * ug0 + qd[i + Q * 4] * ug1 + qd[i + Q * 5] * ug2;
-          }
-          break;
-+      case 23:
-+         CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++)
-+         {
-+            const CeedScalar qd00 = qd[i + Q * 0];
-+            const CeedScalar qd01 = qd[i + Q * 1];
-+            const CeedScalar qd10 = qd01;
-+            const CeedScalar qd11 = qd[i + Q * 2];
-+            CeedPragmaSIMD for (CeedInt d = 0; d < 3; d++)
-+            {
-+               const CeedScalar ug0 = ug[i + Q * (d + 3 * 0)];
-+               const CeedScalar ug1 = ug[i + Q * (d + 3 * 1)];
-+               vg[i + Q * (d + 3 * 0)] = qd00 * ug0 + qd01 * ug1;
-+               vg[i + Q * (d + 3 * 1)] = qd10 * ug0 + qd11 * ug1;
-+            }
-+         }
-+         break;
-       case 33:
--         for (CeedInt i = 0; i < Q; i++)
-+         CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++)
-          {
-             const CeedScalar qd00 = qd[i + Q * 0];
-             const CeedScalar qd01 = qd[i + Q * 1];
-@@ -229,14 +396,14 @@ CEED_QFUNCTION(f_apply_diff)(void *ctx, CeedInt Q,
-             const CeedScalar qd20 = qd02;
-             const CeedScalar qd21 = qd12;
-             const CeedScalar qd22 = qd[i + Q * 5];
--            for (CeedInt c = 0; c < 3; c++)
-+            CeedPragmaSIMD for (CeedInt d = 0; d < 3; d++)
-             {
--               const CeedScalar ug0 = ug[i + Q * (c+3*0)];
--               const CeedScalar ug1 = ug[i + Q * (c+3*1)];
--               const CeedScalar ug2 = ug[i + Q * (c+3*2)];
--               vg[i + Q * (c+3*0)] = qd00 * ug0 + qd01 * ug1 + qd02 * ug2;
--               vg[i + Q * (c+3*1)] = qd10 * ug0 + qd11 * ug1 + qd12 * ug2;
--               vg[i + Q * (c+3*2)] = qd20 * ug0 + qd21 * ug1 + qd22 * ug2;
-+               const CeedScalar ug0 = ug[i + Q * (d + 3 * 0)];
-+               const CeedScalar ug1 = ug[i + Q * (d + 3 * 1)];
-+               const CeedScalar ug2 = ug[i + Q * (d + 3 * 2)];
-+               vg[i + Q * (d + 3 * 0)] = qd00 * ug0 + qd01 * ug1 + qd02 * ug2;
-+               vg[i + Q * (d + 3 * 1)] = qd10 * ug0 + qd11 * ug1 + qd12 * ug2;
-+               vg[i + Q * (d + 3 * 2)] = qd20 * ug0 + qd21 * ug1 + qd22 * ug2;
-             }
-          }
-          break;
-@@ -244,104 +411,105 @@ CEED_QFUNCTION(f_apply_diff)(void *ctx, CeedInt Q,
-    return 0;
- }
- 
--/// libCEED Q-function for applying a diff operator
--CEED_QFUNCTION(f_apply_diff_mf_const)(void *ctx, CeedInt Q,
--                                      const CeedScalar *const *in,
--                                      CeedScalar *const *out)
-+/// libCEED QFunction for applying a diffusion operator with a scalar constant
-+/// coefficient
-+CEED_QFUNCTION(f_apply_diff_mf_const_scalar)(void *ctx, CeedInt Q,
-+                                             const CeedScalar *const *in,
-+                                             CeedScalar *const *out)
- {
--   DiffusionContext *bc = (DiffusionContext*)ctx;
--   // in[0], out[0] have shape [dim, nc=1, Q]
--   // in[1] is Jacobians with shape [dim, nc=dim, Q]
-+   DiffusionContext *bc = (DiffusionContext *)ctx;
-+   // in[0], out[0] have shape [dim, ncomp=vdim, Q]
-+   // in[1] is Jacobians with shape [dim, ncomp=space_dim, Q]
-    // in[2] is quadrature weights, size (Q)
-    //
--   // At every quadrature point, compute qw/det(J).adj(J).adj(J)^T
--   const CeedScalar coeff = bc->coeff;
-+   // At every quadrature point, compute qw/det(J) adj(J) C adj(J)^T
-+   const CeedScalar *coeff = bc->coeff;
-    const CeedScalar *ug = in[0], *J = in[1], *qw = in[2];
-    CeedScalar *vg = out[0];
--   switch (10 * bc->dim + bc->vdim)
-+   switch (100 * bc->space_dim + 10 * bc->dim + bc->vdim)
-    {
--      case 11:
--         for (CeedInt i = 0; i < Q; i++)
-+      case 111:
-+         CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++)
-          {
--            const CeedScalar qd = coeff * qw[i] / J[i];
--            vg[i] = ug[i] * qd;
-+            const CeedScalar coeff0 = coeff[0];
-+            const CeedScalar qd = qw[i] * coeff0 / J[i];
-+            vg[i] = qd * ug[i];
-          }
-          break;
--      case 21:
--         for (CeedInt i = 0; i < Q; i++)
--         {
--            // J: 0 2   qd: 0 1   adj(J):  J22 -J12
--            //    1 3       1 2           -J21  J11
--            const CeedScalar J11 = J[i + Q * 0];
--            const CeedScalar J21 = J[i + Q * 1];
--            const CeedScalar J12 = J[i + Q * 2];
--            const CeedScalar J22 = J[i + Q * 3];
--            const CeedScalar w = qw[i] / (J11 * J22 - J21 * J12);
-+      case 211:
-+         CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++)
-+         {
-+            CeedScalar qd;
-+            MultAdjJCAdjJt21(J + i, Q, coeff, 1, 1, qw[i], 1, &qd);
-+            vg[i] = qd * ug[i];
-+         }
-+         break;
-+      case 212:
-+         CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++)
-+         {
-+            CeedScalar qd;
-+            MultAdjJCAdjJt21(J + i, Q, coeff, 1, 1, qw[i], 1, &qd);
-+            CeedPragmaSIMD for (CeedInt d = 0; d < 2; d++)
-+            {
-+               vg[i + Q * d] = qd * ug[i + Q * d];
-+            }
-+         }
-+         break;
-+      case 221:
-+         CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++)
-+         {
-             CeedScalar qd[3];
--            qd[0] =   coeff * w * (J12 * J12 + J22 * J22);
--            qd[1] = - coeff * w * (J11 * J12 + J21 * J22);
--            qd[2] =   coeff * w * (J11 * J11 + J21 * J21);
-+            MultAdjJCAdjJt22(J + i, Q, coeff, 1, 1, qw[i], 1, qd);
-             const CeedScalar ug0 = ug[i + Q * 0];
-             const CeedScalar ug1 = ug[i + Q * 1];
-             vg[i + Q * 0] = qd[0] * ug0 + qd[1] * ug1;
-             vg[i + Q * 1] = qd[1] * ug0 + qd[2] * ug1;
-          }
-          break;
--      case 22:
--         for (CeedInt i = 0; i < Q; i++)
--         {
--            // J: 0 2   qd: 0 1   adj(J):  J22 -J12
--            //    1 3       1 2           -J21  J11
--            const CeedScalar J11 = J[i + Q * 0];
--            const CeedScalar J21 = J[i + Q * 1];
--            const CeedScalar J12 = J[i + Q * 2];
--            const CeedScalar J22 = J[i + Q * 3];
--            const CeedScalar w = qw[i] / (J11 * J22 - J21 * J12);
-+      case 222:
-+         CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++)
-+         {
-             CeedScalar qd[3];
--            qd[0] =   coeff * w * (J12 * J12 + J22 * J22);
--            qd[1] = - coeff * w * (J11 * J12 + J21 * J22);
--            qd[2] =   coeff * w * (J11 * J11 + J21 * J21);
--            for (CeedInt c = 0; c < 2; c++)
-+            MultAdjJCAdjJt22(J + i, Q, coeff, 1, 1, qw[i], 1, qd);
-+            CeedPragmaSIMD for (CeedInt d = 0; d < 2; d++)
-             {
--               const CeedScalar ug0 = ug[i + Q * (c+2*0)];
--               const CeedScalar ug1 = ug[i + Q * (c+2*1)];
--               vg[i + Q * (c+2*0)] = qd[0] * ug0 + qd[1] * ug1;
--               vg[i + Q * (c+2*1)] = qd[1] * ug0 + qd[2] * ug1;
-+               const CeedScalar ug0 = ug[i + Q * (d + 2 * 0)];
-+               const CeedScalar ug1 = ug[i + Q * (d + 2 * 1)];
-+               vg[i + Q * (d + 2 * 0)] = qd[0] * ug0 + qd[1] * ug1;
-+               vg[i + Q * (d + 2 * 1)] = qd[1] * ug0 + qd[2] * ug1;
-             }
-          }
-          break;
--      case 31:
--         for (CeedInt i = 0; i < Q; i++)
--         {
--            // J: 0 3 6   qd: 0 1 2
--            //    1 4 7       1 3 4
--            //    2 5 8       2 4 5
--            const CeedScalar J11 = J[i + Q * 0];
--            const CeedScalar J21 = J[i + Q * 1];
--            const CeedScalar J31 = J[i + Q * 2];
--            const CeedScalar J12 = J[i + Q * 3];
--            const CeedScalar J22 = J[i + Q * 4];
--            const CeedScalar J32 = J[i + Q * 5];
--            const CeedScalar J13 = J[i + Q * 6];
--            const CeedScalar J23 = J[i + Q * 7];
--            const CeedScalar J33 = J[i + Q * 8];
--            const CeedScalar A11 = J22 * J33 - J23 * J32;
--            const CeedScalar A12 = J13 * J32 - J12 * J33;
--            const CeedScalar A13 = J12 * J23 - J13 * J22;
--            const CeedScalar A21 = J23 * J31 - J21 * J33;
--            const CeedScalar A22 = J11 * J33 - J13 * J31;
--            const CeedScalar A23 = J13 * J21 - J11 * J23;
--            const CeedScalar A31 = J21 * J32 - J22 * J31;
--            const CeedScalar A32 = J12 * J31 - J11 * J32;
--            const CeedScalar A33 = J11 * J22 - J12 * J21;
--            const CeedScalar w = qw[i] / (J11 * A11 + J21 * A12 + J31 * A13);
-+      case 321:
-+         CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++)
-+         {
-+            CeedScalar qd[3];
-+            MultAdjJCAdjJt32(J + i, Q, coeff, 1, 1, qw[i], 1, qd);
-+            const CeedScalar ug0 = ug[i + Q * 0];
-+            const CeedScalar ug1 = ug[i + Q * 1];
-+            vg[i + Q * 0] = qd[0] * ug0 + qd[1] * ug1;
-+            vg[i + Q * 1] = qd[1] * ug0 + qd[2] * ug1;
-+         }
-+         break;
-+      case 323:
-+         CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++)
-+         {
-+            CeedScalar qd[3];
-+            MultAdjJCAdjJt32(J + i, Q, coeff, 1, 1, qw[i], 1, qd);
-+            CeedPragmaSIMD for (CeedInt d = 0; d < 3; d++)
-+            {
-+               const CeedScalar ug0 = ug[i + Q * (d + 3 * 0)];
-+               const CeedScalar ug1 = ug[i + Q * (d + 3 * 1)];
-+               vg[i + Q * (d + 3 * 0)] = qd[0] * ug0 + qd[1] * ug1;
-+               vg[i + Q * (d + 3 * 1)] = qd[1] * ug0 + qd[2] * ug1;
-+            }
-+         }
-+         break;
-+      case 331:
-+         CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++)
-+         {
-             CeedScalar qd[6];
--            qd[0] = coeff * w * (A11 * A11 + A12 * A12 + A13 * A13);
--            qd[1] = coeff * w * (A11 * A21 + A12 * A22 + A13 * A23);
--            qd[2] = coeff * w * (A11 * A31 + A12 * A32 + A13 * A33);
--            qd[3] = coeff * w * (A21 * A21 + A22 * A22 + A23 * A23);
--            qd[4] = coeff * w * (A21 * A31 + A22 * A32 + A23 * A33);
--            qd[5] = coeff * w * (A31 * A31 + A32 * A32 + A33 * A33);
-+            MultAdjJCAdjJt33(J + i, Q, coeff, 1, 1, qw[i], 1, qd);
-             const CeedScalar ug0 = ug[i + Q * 0];
-             const CeedScalar ug1 = ug[i + Q * 1];
-             const CeedScalar ug2 = ug[i + Q * 2];
-@@ -350,46 +518,19 @@ CEED_QFUNCTION(f_apply_diff_mf_const)(void *ctx, CeedInt Q,
-             vg[i + Q * 2] = qd[2] * ug0 + qd[4] * ug1 + qd[5] * ug2;
-          }
-          break;
--      case 33:
--         for (CeedInt i = 0; i < Q; i++)
--         {
--            // J: 0 3 6   qd: 0 1 2
--            //    1 4 7       1 3 4
--            //    2 5 8       2 4 5
--            const CeedScalar J11 = J[i + Q * 0];
--            const CeedScalar J21 = J[i + Q * 1];
--            const CeedScalar J31 = J[i + Q * 2];
--            const CeedScalar J12 = J[i + Q * 3];
--            const CeedScalar J22 = J[i + Q * 4];
--            const CeedScalar J32 = J[i + Q * 5];
--            const CeedScalar J13 = J[i + Q * 6];
--            const CeedScalar J23 = J[i + Q * 7];
--            const CeedScalar J33 = J[i + Q * 8];
--            const CeedScalar A11 = J22 * J33 - J23 * J32;
--            const CeedScalar A12 = J13 * J32 - J12 * J33;
--            const CeedScalar A13 = J12 * J23 - J13 * J22;
--            const CeedScalar A21 = J23 * J31 - J21 * J33;
--            const CeedScalar A22 = J11 * J33 - J13 * J31;
--            const CeedScalar A23 = J13 * J21 - J11 * J23;
--            const CeedScalar A31 = J21 * J32 - J22 * J31;
--            const CeedScalar A32 = J12 * J31 - J11 * J32;
--            const CeedScalar A33 = J11 * J22 - J12 * J21;
--            const CeedScalar w = qw[i] / (J11 * A11 + J21 * A12 + J31 * A13);
-+      case 333:
-+         CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++)
-+         {
-             CeedScalar qd[6];
--            qd[0] = coeff * w * (A11 * A11 + A12 * A12 + A13 * A13);
--            qd[1] = coeff * w * (A11 * A21 + A12 * A22 + A13 * A23);
--            qd[2] = coeff * w * (A11 * A31 + A12 * A32 + A13 * A33);
--            qd[3] = coeff * w * (A21 * A21 + A22 * A22 + A23 * A23);
--            qd[4] = coeff * w * (A21 * A31 + A22 * A32 + A23 * A33);
--            qd[5] = coeff * w * (A31 * A31 + A32 * A32 + A33 * A33);
--            for (CeedInt c = 0; c < 3; c++)
-+            MultAdjJCAdjJt33(J + i, Q, coeff, 1, 1, qw[i], 1, qd);
-+            CeedPragmaSIMD for (CeedInt d = 0; d < 3; d++)
-             {
--               const CeedScalar ug0 = ug[i + Q * (c+3*0)];
--               const CeedScalar ug1 = ug[i + Q * (c+3*1)];
--               const CeedScalar ug2 = ug[i + Q * (c+3*2)];
--               vg[i + Q * (c+3*0)] = qd[0] * ug0 + qd[1] * ug1 + qd[2] * ug2;
--               vg[i + Q * (c+3*1)] = qd[1] * ug0 + qd[3] * ug1 + qd[4] * ug2;
--               vg[i + Q * (c+3*2)] = qd[2] * ug0 + qd[4] * ug1 + qd[5] * ug2;
-+               const CeedScalar ug0 = ug[i + Q * (d + 3 * 0)];
-+               const CeedScalar ug1 = ug[i + Q * (d + 3 * 1)];
-+               const CeedScalar ug2 = ug[i + Q * (d + 3 * 2)];
-+               vg[i + Q * (d + 3 * 0)] = qd[0] * ug0 + qd[1] * ug1 + qd[2] * ug2;
-+               vg[i + Q * (d + 3 * 1)] = qd[1] * ug0 + qd[3] * ug1 + qd[4] * ug2;
-+               vg[i + Q * (d + 3 * 2)] = qd[2] * ug0 + qd[4] * ug1 + qd[5] * ug2;
-             }
-          }
-          break;
-@@ -397,105 +538,97 @@ CEED_QFUNCTION(f_apply_diff_mf_const)(void *ctx, CeedInt Q,
-    return 0;
- }
- 
--CEED_QFUNCTION(f_apply_diff_mf_quad)(void *ctx, CeedInt Q,
--                                     const CeedScalar *const *in,
--                                     CeedScalar *const *out)
-+/// libCEED QFunction for applying a diffusion operator with a vector constant
-+/// coefficient
-+CEED_QFUNCTION(f_apply_diff_mf_const_vector)(void *ctx, CeedInt Q,
-+                                             const CeedScalar *const *in,
-+                                             CeedScalar *const *out)
- {
--   DiffusionContext *bc = (DiffusionContext*)ctx;
--   // in[0], out[0] have shape [dim, nc=1, Q]
--   // in[1] is Jacobians with shape [dim, nc=dim, Q]
-+   DiffusionContext *bc = (DiffusionContext *)ctx;
-+   // in[0], out[0] have shape [dim, ncomp=vdim, Q]
-+   // in[1] is Jacobians with shape [dim, ncomp=space_dim, Q]
-    // in[2] is quadrature weights, size (Q)
-    //
--   // At every quadrature point, compute qw/det(J).adj(J).adj(J)^T
--   const CeedScalar *c = in[0], *ug = in[1], *J = in[2], *qw = in[3];
-+   // At every quadrature point, compute qw/det(J) adj(J) C adj(J)^T
-+   const CeedScalar *coeff = bc->coeff;
-+   const CeedScalar *ug = in[0], *J = in[1], *qw = in[2];
-    CeedScalar *vg = out[0];
--   switch (10 * bc->dim + bc->vdim)
-+   switch (100 * bc->space_dim + 10 * bc->dim + bc->vdim)
-    {
--      case 11:
--         for (CeedInt i = 0; i < Q; i++)
-+      case 211:
-+         CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++)
-          {
--            const CeedScalar qd = c[i] * qw[i] / J[i];
--            vg[i] = ug[i] * qd;
-+            CeedScalar qd;
-+            MultAdjJCAdjJt21(J + i, Q, coeff, 1, 2, qw[i], 1, &qd);
-+            vg[i] = qd * ug[i];
-          }
-          break;
--      case 21:
--         for (CeedInt i = 0; i < Q; i++)
--         {
--            // J: 0 2   qd: 0 1   adj(J):  J22 -J12
--            //    1 3       1 2           -J21  J11
--            const CeedScalar J11 = J[i + Q * 0];
--            const CeedScalar J21 = J[i + Q * 1];
--            const CeedScalar J12 = J[i + Q * 2];
--            const CeedScalar J22 = J[i + Q * 3];
--            const CeedScalar w = qw[i] / (J11 * J22 - J21 * J12);
-+      case 212:
-+         CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++)
-+         {
-+            CeedScalar qd;
-+            MultAdjJCAdjJt21(J + i, Q, coeff, 1, 2, qw[i], 1, &qd);
-+            CeedPragmaSIMD for (CeedInt d = 0; d < 2; d++)
-+            {
-+               vg[i + Q * d] = qd * ug[i + Q * d];
-+            }
-+         }
-+         break;
-+      case 221:
-+         CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++)
-+         {
-             CeedScalar qd[3];
--            const CeedScalar coeff = c[i];
--            qd[0] =   coeff * w * (J12 * J12 + J22 * J22);
--            qd[1] = - coeff * w * (J11 * J12 + J21 * J22);
--            qd[2] =   coeff * w * (J11 * J11 + J21 * J21);
-+            MultAdjJCAdjJt22(J + i, Q, coeff, 1, 2, qw[i], 1, qd);
-             const CeedScalar ug0 = ug[i + Q * 0];
-             const CeedScalar ug1 = ug[i + Q * 1];
-             vg[i + Q * 0] = qd[0] * ug0 + qd[1] * ug1;
-             vg[i + Q * 1] = qd[1] * ug0 + qd[2] * ug1;
-          }
-          break;
--      case 22:
--         for (CeedInt i = 0; i < Q; i++)
--         {
--            // J: 0 2   qd: 0 1   adj(J):  J22 -J12
--            //    1 3       1 2           -J21  J11
--            const CeedScalar J11 = J[i + Q * 0];
--            const CeedScalar J21 = J[i + Q * 1];
--            const CeedScalar J12 = J[i + Q * 2];
--            const CeedScalar J22 = J[i + Q * 3];
--            const CeedScalar w = qw[i] / (J11 * J22 - J21 * J12);
-+      case 222:
-+         CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++)
-+         {
-             CeedScalar qd[3];
--            const CeedScalar coeff = c[i];
--            qd[0] =   coeff * w * (J12 * J12 + J22 * J22);
--            qd[1] = - coeff * w * (J11 * J12 + J21 * J22);
--            qd[2] =   coeff * w * (J11 * J11 + J21 * J21);
--            for (CeedInt d = 0; d < 2; d++)
-+            MultAdjJCAdjJt22(J + i, Q, coeff, 1, 2, qw[i], 1, qd);
-+            CeedPragmaSIMD for (CeedInt d = 0; d < 2; d++)
-             {
--               const CeedScalar ug0 = ug[i + Q * (d+2*0)];
--               const CeedScalar ug1 = ug[i + Q * (d+2*1)];
--               vg[i + Q * (d+2*0)] = qd[0] * ug0 + qd[1] * ug1;
--               vg[i + Q * (d+2*1)] = qd[1] * ug0 + qd[2] * ug1;
-+               const CeedScalar ug0 = ug[i + Q * (d + 2 * 0)];
-+               const CeedScalar ug1 = ug[i + Q * (d + 2 * 1)];
-+               vg[i + Q * (d + 2 * 0)] = qd[0] * ug0 + qd[1] * ug1;
-+               vg[i + Q * (d + 2 * 1)] = qd[1] * ug0 + qd[2] * ug1;
-             }
-          }
-          break;
--      case 31:
--         for (CeedInt i = 0; i < Q; i++)
--         {
--            // J: 0 3 6   qd: 0 1 2
--            //    1 4 7       1 3 4
--            //    2 5 8       2 4 5
--            const CeedScalar J11 = J[i + Q * 0];
--            const CeedScalar J21 = J[i + Q * 1];
--            const CeedScalar J31 = J[i + Q * 2];
--            const CeedScalar J12 = J[i + Q * 3];
--            const CeedScalar J22 = J[i + Q * 4];
--            const CeedScalar J32 = J[i + Q * 5];
--            const CeedScalar J13 = J[i + Q * 6];
--            const CeedScalar J23 = J[i + Q * 7];
--            const CeedScalar J33 = J[i + Q * 8];
--            const CeedScalar A11 = J22 * J33 - J23 * J32;
--            const CeedScalar A12 = J13 * J32 - J12 * J33;
--            const CeedScalar A13 = J12 * J23 - J13 * J22;
--            const CeedScalar A21 = J23 * J31 - J21 * J33;
--            const CeedScalar A22 = J11 * J33 - J13 * J31;
--            const CeedScalar A23 = J13 * J21 - J11 * J23;
--            const CeedScalar A31 = J21 * J32 - J22 * J31;
--            const CeedScalar A32 = J12 * J31 - J11 * J32;
--            const CeedScalar A33 = J11 * J22 - J12 * J21;
--            const CeedScalar w = qw[i] / (J11 * A11 + J21 * A12 + J31 * A13);
-+      case 321:
-+         CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++)
-+         {
-+            CeedScalar qd[3];
-+            MultAdjJCAdjJt32(J + i, Q, coeff, 1, 3, qw[i], 1, qd);
-+            const CeedScalar ug0 = ug[i + Q * 0];
-+            const CeedScalar ug1 = ug[i + Q * 1];
-+            vg[i + Q * 0] = qd[0] * ug0 + qd[1] * ug1;
-+            vg[i + Q * 1] = qd[1] * ug0 + qd[2] * ug1;
-+         }
-+         break;
-+      case 323:
-+         CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++)
-+         {
-+            CeedScalar qd[3];
-+            MultAdjJCAdjJt32(J + i, Q, coeff, 1, 3, qw[i], 1, qd);
-+            CeedPragmaSIMD for (CeedInt d = 0; d < 3; d++)
-+            {
-+               const CeedScalar ug0 = ug[i + Q * (d + 3 * 0)];
-+               const CeedScalar ug1 = ug[i + Q * (d + 3 * 1)];
-+               vg[i + Q * (d + 3 * 0)] = qd[0] * ug0 + qd[1] * ug1;
-+               vg[i + Q * (d + 3 * 1)] = qd[1] * ug0 + qd[2] * ug1;
-+            }
-+         }
-+         break;
-+      case 331:
-+         CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++)
-+         {
-             CeedScalar qd[6];
--            const CeedScalar coeff = c[i];
--            qd[0] = coeff * w * (A11 * A11 + A12 * A12 + A13 * A13);
--            qd[1] = coeff * w * (A11 * A21 + A12 * A22 + A13 * A23);
--            qd[2] = coeff * w * (A11 * A31 + A12 * A32 + A13 * A33);
--            qd[3] = coeff * w * (A21 * A21 + A22 * A22 + A23 * A23);
--            qd[4] = coeff * w * (A21 * A31 + A22 * A32 + A23 * A33);
--            qd[5] = coeff * w * (A31 * A31 + A32 * A32 + A33 * A33);
-+            MultAdjJCAdjJt33(J + i, Q, coeff, 1, 3, qw[i], 1, qd);
-             const CeedScalar ug0 = ug[i + Q * 0];
-             const CeedScalar ug1 = ug[i + Q * 1];
-             const CeedScalar ug2 = ug[i + Q * 2];
-@@ -504,50 +637,507 @@ CEED_QFUNCTION(f_apply_diff_mf_quad)(void *ctx, CeedInt Q,
-             vg[i + Q * 2] = qd[2] * ug0 + qd[4] * ug1 + qd[5] * ug2;
-          }
-          break;
--      case 33:
--         for (CeedInt i = 0; i < Q; i++)
--         {
--            // J: 0 3 6   qd: 0 1 2
--            //    1 4 7       1 3 4
--            //    2 5 8       2 4 5
--            const CeedScalar J11 = J[i + Q * 0];
--            const CeedScalar J21 = J[i + Q * 1];
--            const CeedScalar J31 = J[i + Q * 2];
--            const CeedScalar J12 = J[i + Q * 3];
--            const CeedScalar J22 = J[i + Q * 4];
--            const CeedScalar J32 = J[i + Q * 5];
--            const CeedScalar J13 = J[i + Q * 6];
--            const CeedScalar J23 = J[i + Q * 7];
--            const CeedScalar J33 = J[i + Q * 8];
--            const CeedScalar A11 = J22 * J33 - J23 * J32;
--            const CeedScalar A12 = J13 * J32 - J12 * J33;
--            const CeedScalar A13 = J12 * J23 - J13 * J22;
--            const CeedScalar A21 = J23 * J31 - J21 * J33;
--            const CeedScalar A22 = J11 * J33 - J13 * J31;
--            const CeedScalar A23 = J13 * J21 - J11 * J23;
--            const CeedScalar A31 = J21 * J32 - J22 * J31;
--            const CeedScalar A32 = J12 * J31 - J11 * J32;
--            const CeedScalar A33 = J11 * J22 - J12 * J21;
--            const CeedScalar w = qw[i] / (J11 * A11 + J21 * A12 + J31 * A13);
-+      case 333:
-+         CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++)
-+         {
-             CeedScalar qd[6];
--            const CeedScalar coeff = c[i];
--            qd[0] = coeff * w * (A11 * A11 + A12 * A12 + A13 * A13);
--            qd[1] = coeff * w * (A11 * A21 + A12 * A22 + A13 * A23);
--            qd[2] = coeff * w * (A11 * A31 + A12 * A32 + A13 * A33);
--            qd[3] = coeff * w * (A21 * A21 + A22 * A22 + A23 * A23);
--            qd[4] = coeff * w * (A21 * A31 + A22 * A32 + A23 * A33);
--            qd[5] = coeff * w * (A31 * A31 + A32 * A32 + A33 * A33);
--            for (CeedInt d = 0; d < 3; d++)
-+            MultAdjJCAdjJt33(J + i, Q, coeff, 1, 3, qw[i], 1, qd);
-+            CeedPragmaSIMD for (CeedInt d = 0; d < 3; d++)
-             {
--               const CeedScalar ug0 = ug[i + Q * (d+3*0)];
--               const CeedScalar ug1 = ug[i + Q * (d+3*1)];
--               const CeedScalar ug2 = ug[i + Q * (d+3*2)];
--               vg[i + Q * (d+3*0)] = qd[0] * ug0 + qd[1] * ug1 + qd[2] * ug2;
--               vg[i + Q * (d+3*1)] = qd[1] * ug0 + qd[3] * ug1 + qd[4] * ug2;
--               vg[i + Q * (d+3*2)] = qd[2] * ug0 + qd[4] * ug1 + qd[5] * ug2;
-+               const CeedScalar ug0 = ug[i + Q * (d + 3 * 0)];
-+               const CeedScalar ug1 = ug[i + Q * (d + 3 * 1)];
-+               const CeedScalar ug2 = ug[i + Q * (d + 3 * 2)];
-+               vg[i + Q * (d + 3 * 0)] = qd[0] * ug0 + qd[1] * ug1 + qd[2] * ug2;
-+               vg[i + Q * (d + 3 * 1)] = qd[1] * ug0 + qd[3] * ug1 + qd[4] * ug2;
-+               vg[i + Q * (d + 3 * 2)] = qd[2] * ug0 + qd[4] * ug1 + qd[5] * ug2;
-             }
-          }
-          break;
-    }
-    return 0;
- }
-+
-+/// libCEED QFunction for applying a diffusion operator with a matrix constant
-+/// coefficient
-+CEED_QFUNCTION(f_apply_diff_mf_const_matrix)(void *ctx, CeedInt Q,
-+                                             const CeedScalar *const *in,
-+                                             CeedScalar *const *out)
-+{
-+   DiffusionContext *bc = (DiffusionContext *)ctx;
-+   // in[0], out[0] have shape [dim, ncomp=vdim, Q]
-+   // in[1] is Jacobians with shape [dim, ncomp=space_dim, Q]
-+   // in[2] is quadrature weights, size (Q)
-+   //
-+   // At every quadrature point, compute qw/det(J) adj(J) C adj(J)^T
-+   const CeedScalar *coeff = bc->coeff;
-+   const CeedScalar *ug = in[0], *J = in[1], *qw = in[2];
-+   CeedScalar *vg = out[0];
-+   switch (100 * bc->space_dim + 10 * bc->dim + bc->vdim)
-+   {
-+      case 211:
-+         CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++)
-+         {
-+            CeedScalar qd;
-+            MultAdjJCAdjJt21(J + i, Q, coeff, 1, 3, qw[i], 1, &qd);
-+            vg[i] = qd * ug[i];
-+         }
-+         break;
-+      case 212:
-+         CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++)
-+         {
-+            CeedScalar qd;
-+            MultAdjJCAdjJt21(J + i, Q, coeff, 1, 3, qw[i], 1, &qd);
-+            CeedPragmaSIMD for (CeedInt d = 0; d < 2; d++)
-+            {
-+               vg[i + Q * d] = qd * ug[i + Q * d];
-+            }
-+         }
-+         break;
-+      case 221:
-+         CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++)
-+         {
-+            CeedScalar qd[3];
-+            MultAdjJCAdjJt22(J + i, Q, coeff, 1, 3, qw[i], 1, qd);
-+            const CeedScalar ug0 = ug[i + Q * 0];
-+            const CeedScalar ug1 = ug[i + Q * 1];
-+            vg[i + Q * 0] = qd[0] * ug0 + qd[1] * ug1;
-+            vg[i + Q * 1] = qd[1] * ug0 + qd[2] * ug1;
-+         }
-+         break;
-+      case 222:
-+         CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++)
-+         {
-+            CeedScalar qd[3];
-+            MultAdjJCAdjJt22(J + i, Q, coeff, 1, 3, qw[i], 1, qd);
-+            CeedPragmaSIMD for (CeedInt d = 0; d < 2; d++)
-+            {
-+               const CeedScalar ug0 = ug[i + Q * (d + 2 * 0)];
-+               const CeedScalar ug1 = ug[i + Q * (d + 2 * 1)];
-+               vg[i + Q * (d + 2 * 0)] = qd[0] * ug0 + qd[1] * ug1;
-+               vg[i + Q * (d + 2 * 1)] = qd[1] * ug0 + qd[2] * ug1;
-+            }
-+         }
-+         break;
-+      case 321:
-+         CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++)
-+         {
-+            CeedScalar qd[3];
-+            MultAdjJCAdjJt32(J + i, Q, coeff, 1, 6, qw[i], 1, qd);
-+            const CeedScalar ug0 = ug[i + Q * 0];
-+            const CeedScalar ug1 = ug[i + Q * 1];
-+            vg[i + Q * 0] = qd[0] * ug0 + qd[1] * ug1;
-+            vg[i + Q * 1] = qd[1] * ug0 + qd[2] * ug1;
-+         }
-+         break;
-+      case 323:
-+         CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++)
-+         {
-+            CeedScalar qd[3];
-+            MultAdjJCAdjJt32(J + i, Q, coeff, 1, 6, qw[i], 1, qd);
-+            CeedPragmaSIMD for (CeedInt d = 0; d < 3; d++)
-+            {
-+               const CeedScalar ug0 = ug[i + Q * (d + 3 * 0)];
-+               const CeedScalar ug1 = ug[i + Q * (d + 3 * 1)];
-+               vg[i + Q * (d + 3 * 0)] = qd[0] * ug0 + qd[1] * ug1;
-+               vg[i + Q * (d + 3 * 1)] = qd[1] * ug0 + qd[2] * ug1;
-+            }
-+         }
-+         break;
-+      case 331:
-+         CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++)
-+         {
-+            CeedScalar qd[6];
-+            MultAdjJCAdjJt33(J + i, Q, coeff, 1, 6, qw[i], 1, qd);
-+            const CeedScalar ug0 = ug[i + Q * 0];
-+            const CeedScalar ug1 = ug[i + Q * 1];
-+            const CeedScalar ug2 = ug[i + Q * 2];
-+            vg[i + Q * 0] = qd[0] * ug0 + qd[1] * ug1 + qd[2] * ug2;
-+            vg[i + Q * 1] = qd[1] * ug0 + qd[3] * ug1 + qd[4] * ug2;
-+            vg[i + Q * 2] = qd[2] * ug0 + qd[4] * ug1 + qd[5] * ug2;
-+         }
-+         break;
-+      case 333:
-+         CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++)
-+         {
-+            CeedScalar qd[6];
-+            MultAdjJCAdjJt33(J + i, Q, coeff, 1, 6, qw[i], 1, qd);
-+            CeedPragmaSIMD for (CeedInt d = 0; d < 3; d++)
-+            {
-+               const CeedScalar ug0 = ug[i + Q * (d + 3 * 0)];
-+               const CeedScalar ug1 = ug[i + Q * (d + 3 * 1)];
-+               const CeedScalar ug2 = ug[i + Q * (d + 3 * 2)];
-+               vg[i + Q * (d + 3 * 0)] = qd[0] * ug0 + qd[1] * ug1 + qd[2] * ug2;
-+               vg[i + Q * (d + 3 * 1)] = qd[1] * ug0 + qd[3] * ug1 + qd[4] * ug2;
-+               vg[i + Q * (d + 3 * 2)] = qd[2] * ug0 + qd[4] * ug1 + qd[5] * ug2;
-+            }
-+         }
-+         break;
-+   }
-+   return 0;
-+}
-+
-+/// libCEED QFunction for applying a diffusion operator with a scalar
-+/// coefficient evaluated at quadrature points
-+CEED_QFUNCTION(f_apply_diff_mf_quad_scalar)(void *ctx, CeedInt Q,
-+                                            const CeedScalar *const *in,
-+                                            CeedScalar *const *out)
-+{
-+   DiffusionContext *bc = (DiffusionContext *)ctx;
-+   // in[0], out[0] have shape [dim, ncomp=vdim, Q]
-+   // in[1] is coefficients with shape [ncomp=1, Q]
-+   // in[2] is Jacobians with shape [dim, ncomp=space_dim, Q]
-+   // in[3] is quadrature weights, size (Q)
-+   //
-+   // At every quadrature point, compute qw/det(J) adj(J) C adj(J)^T
-+   const CeedScalar *ug = in[0], *c = in[1], *J = in[2], *qw = in[3];
-+   CeedScalar *vg = out[0];
-+   switch (100 * bc->space_dim + 10 * bc->dim + bc->vdim)
-+   {
-+      case 111:
-+         CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++)
-+         {
-+            const CeedScalar qd = qw[i] * c[i] / J[i];
-+            vg[i] = qd * ug[i];
-+         }
-+         break;
-+      case 211:
-+         CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++)
-+         {
-+            CeedScalar qd;
-+            MultAdjJCAdjJt21(J + i, Q, c + i, Q, 1, qw[i], 1, &qd);
-+            vg[i] = qd * ug[i];
-+         }
-+         break;
-+      case 212:
-+         CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++)
-+         {
-+            CeedScalar qd;
-+            MultAdjJCAdjJt21(J + i, Q, c + i, Q, 1, qw[i], 1, &qd);
-+            CeedPragmaSIMD for (CeedInt d = 0; d < 2; d++)
-+            {
-+               vg[i + Q * d] = qd * ug[i + Q * d];
-+            }
-+         }
-+         break;
-+      case 221:
-+         CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++)
-+         {
-+            CeedScalar qd[3];
-+            MultAdjJCAdjJt22(J + i, Q, c + i, Q, 1, qw[i], 1, qd);
-+            const CeedScalar ug0 = ug[i + Q * 0];
-+            const CeedScalar ug1 = ug[i + Q * 1];
-+            vg[i + Q * 0] = qd[0] * ug0 + qd[1] * ug1;
-+            vg[i + Q * 1] = qd[1] * ug0 + qd[2] * ug1;
-+         }
-+         break;
-+      case 222:
-+         CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++)
-+         {
-+            CeedScalar qd[3];
-+            MultAdjJCAdjJt22(J + i, Q, c + i, Q, 1, qw[i], 1, qd);
-+            CeedPragmaSIMD for (CeedInt d = 0; d < 2; d++)
-+            {
-+               const CeedScalar ug0 = ug[i + Q * (d + 2 * 0)];
-+               const CeedScalar ug1 = ug[i + Q * (d + 2 * 1)];
-+               vg[i + Q * (d + 2 * 0)] = qd[0] * ug0 + qd[1] * ug1;
-+               vg[i + Q * (d + 2 * 1)] = qd[1] * ug0 + qd[2] * ug1;
-+            }
-+         }
-+         break;
-+      case 321:
-+         CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++)
-+         {
-+            CeedScalar qd[3];
-+            MultAdjJCAdjJt32(J + i, Q, c + i, Q, 1, qw[i], 1, qd);
-+            const CeedScalar ug0 = ug[i + Q * 0];
-+            const CeedScalar ug1 = ug[i + Q * 1];
-+            vg[i + Q * 0] = qd[0] * ug0 + qd[1] * ug1;
-+            vg[i + Q * 1] = qd[1] * ug0 + qd[2] * ug1;
-+         }
-+         break;
-+      case 323:
-+         CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++)
-+         {
-+            CeedScalar qd[3];
-+            MultAdjJCAdjJt32(J + i, Q, c + i, Q, 1, qw[i], 1, qd);
-+            CeedPragmaSIMD for (CeedInt d = 0; d < 3; d++)
-+            {
-+               const CeedScalar ug0 = ug[i + Q * (d + 3 * 0)];
-+               const CeedScalar ug1 = ug[i + Q * (d + 3 * 1)];
-+               vg[i + Q * (d + 3 * 0)] = qd[0] * ug0 + qd[1] * ug1;
-+               vg[i + Q * (d + 3 * 1)] = qd[1] * ug0 + qd[2] * ug1;
-+            }
-+         }
-+         break;
-+      case 331:
-+         CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++)
-+         {
-+            CeedScalar qd[6];
-+            MultAdjJCAdjJt33(J + i, Q, c + i, Q, 1, qw[i], 1, qd);
-+            const CeedScalar ug0 = ug[i + Q * 0];
-+            const CeedScalar ug1 = ug[i + Q * 1];
-+            const CeedScalar ug2 = ug[i + Q * 2];
-+            vg[i + Q * 0] = qd[0] * ug0 + qd[1] * ug1 + qd[2] * ug2;
-+            vg[i + Q * 1] = qd[1] * ug0 + qd[3] * ug1 + qd[4] * ug2;
-+            vg[i + Q * 2] = qd[2] * ug0 + qd[4] * ug1 + qd[5] * ug2;
-+         }
-+         break;
-+      case 333:
-+         CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++)
-+         {
-+            CeedScalar qd[6];
-+            MultAdjJCAdjJt33(J + i, Q, c + i, Q, 1, qw[i], 1, qd);
-+            CeedPragmaSIMD for (CeedInt d = 0; d < 3; d++)
-+            {
-+               const CeedScalar ug0 = ug[i + Q * (d + 3 * 0)];
-+               const CeedScalar ug1 = ug[i + Q * (d + 3 * 1)];
-+               const CeedScalar ug2 = ug[i + Q * (d + 3 * 2)];
-+               vg[i + Q * (d + 3 * 0)] = qd[0] * ug0 + qd[1] * ug1 + qd[2] * ug2;
-+               vg[i + Q * (d + 3 * 1)] = qd[1] * ug0 + qd[3] * ug1 + qd[4] * ug2;
-+               vg[i + Q * (d + 3 * 2)] = qd[2] * ug0 + qd[4] * ug1 + qd[5] * ug2;
-+            }
-+         }
-+         break;
-+   }
-+   return 0;
-+}
-+
-+/// libCEED QFunction for applying a diffusion operator with a vector
-+/// coefficient evaluated at quadrature points
-+CEED_QFUNCTION(f_apply_diff_mf_quad_vector)(void *ctx, CeedInt Q,
-+                                            const CeedScalar *const *in,
-+                                            CeedScalar *const *out)
-+{
-+   DiffusionContext *bc = (DiffusionContext *)ctx;
-+   // in[0], out[0] have shape [dim, ncomp=vdim, Q]
-+   // in[1] is coefficients with shape [ncomp=space_dim, Q]
-+   // in[2] is Jacobians with shape [dim, ncomp=space_dim, Q]
-+   // in[3] is quadrature weights, size (Q)
-+   //
-+   // At every quadrature point, compute qw/det(J) adj(J) C adj(J)^T
-+   const CeedScalar *ug = in[0], *c = in[1], *J = in[2], *qw = in[3];
-+   CeedScalar *vg = out[0];
-+   switch (100 * bc->space_dim + 10 * bc->dim + bc->vdim)
-+   {
-+      case 211:
-+         CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++)
-+         {
-+            CeedScalar qd;
-+            MultAdjJCAdjJt21(J + i, Q, c + i, Q, 2, qw[i], 1, &qd);
-+            vg[i] = qd * ug[i];
-+         }
-+         break;
-+      case 212:
-+         CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++)
-+         {
-+            CeedScalar qd;
-+            MultAdjJCAdjJt21(J + i, Q, c + i, Q, 2, qw[i], 1, &qd);
-+            CeedPragmaSIMD for (CeedInt d = 0; d < 2; d++)
-+            {
-+               vg[i + Q * d] = qd * ug[i + Q * d];
-+            }
-+         }
-+         break;
-+      case 221:
-+         CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++)
-+         {
-+            CeedScalar qd[3];
-+            MultAdjJCAdjJt22(J + i, Q, c + i, Q, 2, qw[i], 1, qd);
-+            const CeedScalar ug0 = ug[i + Q * 0];
-+            const CeedScalar ug1 = ug[i + Q * 1];
-+            vg[i + Q * 0] = qd[0] * ug0 + qd[1] * ug1;
-+            vg[i + Q * 1] = qd[1] * ug0 + qd[2] * ug1;
-+         }
-+         break;
-+      case 222:
-+         CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++)
-+         {
-+            CeedScalar qd[3];
-+            MultAdjJCAdjJt22(J + i, Q, c + i, Q, 2, qw[i], 1, qd);
-+            CeedPragmaSIMD for (CeedInt d = 0; d < 2; d++)
-+            {
-+               const CeedScalar ug0 = ug[i + Q * (d + 2 * 0)];
-+               const CeedScalar ug1 = ug[i + Q * (d + 2 * 1)];
-+               vg[i + Q * (d + 2 * 0)] = qd[0] * ug0 + qd[1] * ug1;
-+               vg[i + Q * (d + 2 * 1)] = qd[1] * ug0 + qd[2] * ug1;
-+            }
-+         }
-+         break;
-+      case 321:
-+         CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++)
-+         {
-+            CeedScalar qd[3];
-+            MultAdjJCAdjJt32(J + i, Q, c + i, Q, 3, qw[i], 1, qd);
-+            const CeedScalar ug0 = ug[i + Q * 0];
-+            const CeedScalar ug1 = ug[i + Q * 1];
-+            vg[i + Q * 0] = qd[0] * ug0 + qd[1] * ug1;
-+            vg[i + Q * 1] = qd[1] * ug0 + qd[2] * ug1;
-+         }
-+         break;
-+      case 323:
-+         CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++)
-+         {
-+            CeedScalar qd[3];
-+            MultAdjJCAdjJt32(J + i, Q, c + i, Q, 3, qw[i], 1, qd);
-+            CeedPragmaSIMD for (CeedInt d = 0; d < 3; d++)
-+            {
-+               const CeedScalar ug0 = ug[i + Q * (d + 3 * 0)];
-+               const CeedScalar ug1 = ug[i + Q * (d + 3 * 1)];
-+               vg[i + Q * (d + 3 * 0)] = qd[0] * ug0 + qd[1] * ug1;
-+               vg[i + Q * (d + 3 * 1)] = qd[1] * ug0 + qd[2] * ug1;
-+            }
-+         }
-+         break;
-+      case 331:
-+         CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++)
-+         {
-+            CeedScalar qd[6];
-+            MultAdjJCAdjJt33(J + i, Q, c + i, Q, 3, qw[i], 1, qd);
-+            const CeedScalar ug0 = ug[i + Q * 0];
-+            const CeedScalar ug1 = ug[i + Q * 1];
-+            const CeedScalar ug2 = ug[i + Q * 2];
-+            vg[i + Q * 0] = qd[0] * ug0 + qd[1] * ug1 + qd[2] * ug2;
-+            vg[i + Q * 1] = qd[1] * ug0 + qd[3] * ug1 + qd[4] * ug2;
-+            vg[i + Q * 2] = qd[2] * ug0 + qd[4] * ug1 + qd[5] * ug2;
-+         }
-+         break;
-+      case 333:
-+         CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++)
-+         {
-+            CeedScalar qd[6];
-+            MultAdjJCAdjJt33(J + i, Q, c + i, Q, 3, qw[i], 1, qd);
-+            CeedPragmaSIMD for (CeedInt d = 0; d < 3; d++)
-+            {
-+               const CeedScalar ug0 = ug[i + Q * (d + 3 * 0)];
-+               const CeedScalar ug1 = ug[i + Q * (d + 3 * 1)];
-+               const CeedScalar ug2 = ug[i + Q * (d + 3 * 2)];
-+               vg[i + Q * (d + 3 * 0)] = qd[0] * ug0 + qd[1] * ug1 + qd[2] * ug2;
-+               vg[i + Q * (d + 3 * 1)] = qd[1] * ug0 + qd[3] * ug1 + qd[4] * ug2;
-+               vg[i + Q * (d + 3 * 2)] = qd[2] * ug0 + qd[4] * ug1 + qd[5] * ug2;
-+            }
-+         }
-+         break;
-+   }
-+   return 0;
-+}
-+
-+/// libCEED QFunction for applying a diffusion operator with a matrix
-+/// coefficient evaluated at quadrature points
-+CEED_QFUNCTION(f_apply_diff_mf_quad_matrix)(void *ctx, CeedInt Q,
-+                                            const CeedScalar *const *in,
-+                                            CeedScalar *const *out)
-+{
-+   DiffusionContext *bc = (DiffusionContext *)ctx;
-+   // in[0], out[0] have shape [dim, ncomp=vdim, Q]
-+   // in[1] is coefficients with shape [ncomp=space_dim*(space_dim+1)/2, Q]
-+   // in[2] is Jacobians with shape [dim, ncomp=space_dim, Q]
-+   // in[3] is quadrature weights, size (Q)
-+   //
-+   // At every quadrature point, compute qw/det(J) adj(J) C adj(J)^T
-+   const CeedScalar *ug = in[0], *c = in[1], *J = in[2], *qw = in[3];
-+   CeedScalar *vg = out[0];
-+   switch (100 * bc->space_dim + 10 * bc->dim + bc->vdim)
-+   {
-+      case 211:
-+         CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++)
-+         {
-+            CeedScalar qd;
-+            MultAdjJCAdjJt21(J + i, Q, c + i, Q, 3, qw[i], 1, &qd);
-+            vg[i] = qd * ug[i];
-+         }
-+         break;
-+      case 212:
-+         CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++)
-+         {
-+            CeedScalar qd;
-+            MultAdjJCAdjJt21(J + i, Q, c + i, Q, 3, qw[i], 1, &qd);
-+            CeedPragmaSIMD for (CeedInt d = 0; d < 2; d++)
-+            {
-+               vg[i + Q * d] = qd * ug[i + Q * d];
-+            }
-+         }
-+         break;
-+      case 221:
-+         CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++)
-+         {
-+            CeedScalar qd[3];
-+            MultAdjJCAdjJt22(J + i, Q, c + i, Q, 3, qw[i], 1, qd);
-+            const CeedScalar ug0 = ug[i + Q * 0];
-+            const CeedScalar ug1 = ug[i + Q * 1];
-+            vg[i + Q * 0] = qd[0] * ug0 + qd[1] * ug1;
-+            vg[i + Q * 1] = qd[1] * ug0 + qd[2] * ug1;
-+         }
-+         break;
-+      case 222:
-+         CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++)
-+         {
-+            CeedScalar qd[3];
-+            MultAdjJCAdjJt22(J + i, Q, c + i, Q, 3, qw[i], 1, qd);
-+            CeedPragmaSIMD for (CeedInt d = 0; d < 2; d++)
-+            {
-+               const CeedScalar ug0 = ug[i + Q * (d + 2 * 0)];
-+               const CeedScalar ug1 = ug[i + Q * (d + 2 * 1)];
-+               vg[i + Q * (d + 2 * 0)] = qd[0] * ug0 + qd[1] * ug1;
-+               vg[i + Q * (d + 2 * 1)] = qd[1] * ug0 + qd[2] * ug1;
-+            }
-+         }
-+         break;
-+      case 321:
-+         CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++)
-+         {
-+            CeedScalar qd[3];
-+            MultAdjJCAdjJt32(J + i, Q, c + i, Q, 6, qw[i], 1, qd);
-+            const CeedScalar ug0 = ug[i + Q * 0];
-+            const CeedScalar ug1 = ug[i + Q * 1];
-+            vg[i + Q * 0] = qd[0] * ug0 + qd[1] * ug1;
-+            vg[i + Q * 1] = qd[1] * ug0 + qd[2] * ug1;
-+         }
-+         break;
-+      case 323:
-+         CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++)
-+         {
-+            CeedScalar qd[3];
-+            MultAdjJCAdjJt32(J + i, Q, c + i, Q, 6, qw[i], 1, qd);
-+            CeedPragmaSIMD for (CeedInt d = 0; d < 3; d++)
-+            {
-+               const CeedScalar ug0 = ug[i + Q * (d + 3 * 0)];
-+               const CeedScalar ug1 = ug[i + Q * (d + 3 * 1)];
-+               vg[i + Q * (d + 3 * 0)] = qd[0] * ug0 + qd[1] * ug1;
-+               vg[i + Q * (d + 3 * 1)] = qd[1] * ug0 + qd[2] * ug1;
-+            }
-+         }
-+         break;
-+      case 331:
-+         CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++)
-+         {
-+            CeedScalar qd[6];
-+            MultAdjJCAdjJt33(J + i, Q, c + i, Q, 6, qw[i], 1, qd);
-+            const CeedScalar ug0 = ug[i + Q * 0];
-+            const CeedScalar ug1 = ug[i + Q * 1];
-+            const CeedScalar ug2 = ug[i + Q * 2];
-+            vg[i + Q * 0] = qd[0] * ug0 + qd[1] * ug1 + qd[2] * ug2;
-+            vg[i + Q * 1] = qd[1] * ug0 + qd[3] * ug1 + qd[4] * ug2;
-+            vg[i + Q * 2] = qd[2] * ug0 + qd[4] * ug1 + qd[5] * ug2;
-+         }
-+         break;
-+      case 333:
-+         CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++)
-+         {
-+            CeedScalar qd[6];
-+            MultAdjJCAdjJt33(J + i, Q, c + i, Q, 6, qw[i], 1, qd);
-+            CeedPragmaSIMD for (CeedInt d = 0; d < 3; d++)
-+            {
-+               const CeedScalar ug0 = ug[i + Q * (d + 3 * 0)];
-+               const CeedScalar ug1 = ug[i + Q * (d + 3 * 1)];
-+               const CeedScalar ug2 = ug[i + Q * (d + 3 * 2)];
-+               vg[i + Q * (d + 3 * 0)] = qd[0] * ug0 + qd[1] * ug1 + qd[2] * ug2;
-+               vg[i + Q * (d + 3 * 1)] = qd[1] * ug0 + qd[3] * ug1 + qd[4] * ug2;
-+               vg[i + Q * (d + 3 * 2)] = qd[2] * ug0 + qd[4] * ug1 + qd[5] * ug2;
-+            }
-+         }
-+         break;
-+   }
-+   return 0;
-+}
-+
-+#endif // MFEM_LIBCEED_DIFF_QF_H
-diff --git a/fem/ceed/integrators/divdiv/divdiv.cpp b/fem/ceed/integrators/divdiv/divdiv.cpp
-new file mode 100644
-index 000000000..f574c0f4e
---- /dev/null
-+++ b/fem/ceed/integrators/divdiv/divdiv.cpp
-@@ -0,0 +1,124 @@
-+// Copyright (c) 2010-2023, Lawrence Livermore National Security, LLC. Produced
-+// at the Lawrence Livermore National Laboratory. All Rights reserved. See files
-+// LICENSE and NOTICE for details. LLNL-CODE-806117.
-+//
-+// This file is part of the MFEM library. For more information and source code
-+// availability visit https://mfem.org.
-+//
-+// MFEM is free software; you can redistribute it and/or modify it under the
-+// terms of the BSD-3 license. We welcome feedback and contributions, see file
-+// CONTRIBUTING.md for details.
-+
-+#include "divdiv.hpp"
-+
-+#include "../../../../config/config.hpp"
-+#ifdef MFEM_USE_CEED
-+#include "divdiv_qf.h"
-+#endif
-+
-+namespace mfem
-+{
-+
-+namespace ceed
-+{
-+
-+#ifdef MFEM_USE_CEED
-+struct DivDivOperatorInfo : public OperatorInfo
-+{
-+   DivDivContext ctx = {0};
-+   DivDivOperatorInfo(const mfem::FiniteElementSpace &fes, mfem::Coefficient *Q,
-+                      bool use_bdr = false, bool use_mf = false)
-+   {
-+      MFEM_VERIFY(fes.GetVDim() == 1,
-+                  "libCEED interface for vector FE does not support vdim > 1!");
-+      ctx.dim = fes.GetMesh()->Dimension() - use_bdr;
-+      ctx.space_dim = fes.GetMesh()->SpaceDimension();
-+      if (!use_mf)
-+      {
-+         apply_func = ":f_apply_divdiv";
-+         apply_qf = &f_apply_divdiv;
-+      }
-+      else
-+      {
-+         build_func = "";
-+         build_qf = nullptr;
-+      }
-+      if (Q == nullptr)
-+      {
-+         ctx.coeff = 1.0;
-+         if (!use_mf)
-+         {
-+            build_func = ":f_build_divdiv_const";
-+            build_qf = &f_build_divdiv_const;
-+         }
-+         else
-+         {
-+            apply_func = ":f_apply_divdiv_mf_const";
-+            apply_qf = &f_apply_divdiv_mf_const;
-+         }
-+      }
-+      else if (mfem::ConstantCoefficient *const_coeff =
-+                  dynamic_cast<mfem::ConstantCoefficient *>(Q))
-+      {
-+         ctx.coeff = const_coeff->constant;
-+         if (!use_mf)
-+         {
-+            build_func = ":f_build_divdiv_const";
-+            build_qf = &f_build_divdiv_const;
-+         }
-+         else
-+         {
-+            apply_func = ":f_apply_divdiv_mf_const";
-+            apply_qf = &f_apply_divdiv_mf_const;
-+         }
-+      }
-+      else
-+      {
-+         if (!use_mf)
-+         {
-+            build_func = ":f_build_divdiv_quad";
-+            build_qf = &f_build_divdiv_quad;
-+         }
-+         else
-+         {
-+            apply_func = ":f_apply_divdiv_mf_quad";
-+            apply_qf = &f_apply_divdiv_mf_quad;
-+         }
-+      }
-+      header = "/integrators/divdiv/divdiv_qf.h";
-+      trial_op = EvalMode::Div;
-+      test_op = EvalMode::Div;
-+      qdatasize = 1;
-+   }
-+};
-+#endif
-+
-+PADivDivIntegrator::PADivDivIntegrator(const mfem::DivDivIntegrator &integ,
-+                                       const mfem::FiniteElementSpace &fes,
-+                                       mfem::Coefficient *Q,
-+                                       const bool use_bdr)
-+{
-+#ifdef MFEM_USE_CEED
-+   DivDivOperatorInfo info(fes, Q, use_bdr);
-+   Assemble(integ, info, fes, Q, use_bdr);
-+#else
-+   MFEM_ABORT("MFEM must be built with MFEM_USE_CEED=YES to use libCEED.");
-+#endif
-+}
-+
-+MFDivDivIntegrator::MFDivDivIntegrator(const mfem::DivDivIntegrator &integ,
-+                                       const mfem::FiniteElementSpace &fes,
-+                                       mfem::Coefficient *Q,
-+                                       const bool use_bdr)
-+{
-+#ifdef MFEM_USE_CEED
-+   DivDivOperatorInfo info(fes, Q, use_bdr, true);
-+   Assemble(integ, info, fes, Q, use_bdr, true);
-+#else
-+   MFEM_ABORT("MFEM must be built with MFEM_USE_CEED=YES to use libCEED.");
-+#endif
-+}
-+
-+} // namespace ceed
-+
-+} // namespace mfem
-diff --git a/fem/ceed/integrators/divdiv/divdiv.hpp b/fem/ceed/integrators/divdiv/divdiv.hpp
-new file mode 100644
-index 000000000..1e5bf163e
---- /dev/null
-+++ b/fem/ceed/integrators/divdiv/divdiv.hpp
-@@ -0,0 +1,49 @@
-+// Copyright (c) 2010-2023, Lawrence Livermore National Security, LLC. Produced
-+// at the Lawrence Livermore National Laboratory. All Rights reserved. See files
-+// LICENSE and NOTICE for details. LLNL-CODE-806117.
-+//
-+// This file is part of the MFEM library. For more information and source code
-+// availability visit https://mfem.org.
-+//
-+// MFEM is free software; you can redistribute it and/or modify it under the
-+// terms of the BSD-3 license. We welcome feedback and contributions, see file
-+// CONTRIBUTING.md for details.
-+
-+#ifndef MFEM_LIBCEED_DIVDIV_HPP
-+#define MFEM_LIBCEED_DIVDIV_HPP
-+
-+#include "../../interface/integrator.hpp"
-+#include "../../interface/mixed_operator.hpp"
-+#include "../../../fespace.hpp"
-+
-+namespace mfem
-+{
-+
-+namespace ceed
-+{
-+
-+/// Represent a DivDivIntegrator with AssemblyLevel::Partial using libCEED.
-+class PADivDivIntegrator : public MixedOperator<Integrator>
-+{
-+public:
-+   PADivDivIntegrator(const mfem::DivDivIntegrator &integ,
-+                      const mfem::FiniteElementSpace &fes,
-+                      mfem::Coefficient *Q,
-+                      const bool use_bdr = false);
-+};
-+
-+/// Represent a DivDivIntegrator with AssemblyLevel::None using libCEED.
-+class MFDivDivIntegrator : public MixedOperator<Integrator>
-+{
-+public:
-+   MFDivDivIntegrator(const mfem::DivDivIntegrator &integ,
-+                      const mfem::FiniteElementSpace &fes,
-+                      mfem::Coefficient *Q,
-+                      const bool use_bdr = false);
-+};
-+
-+}
-+
-+}
-+
-+#endif // MFEM_LIBCEED_DIVDIV_HPP
-diff --git a/fem/ceed/integrators/divdiv/divdiv_qf.h b/fem/ceed/integrators/divdiv/divdiv_qf.h
-new file mode 100644
-index 000000000..853aa0011
---- /dev/null
-+++ b/fem/ceed/integrators/divdiv/divdiv_qf.h
-@@ -0,0 +1,250 @@
-+// Copyright (c) 2010-2023, Lawrence Livermore National Security, LLC. Produced
-+// at the Lawrence Livermore National Laboratory. All Rights reserved. See files
-+// LICENSE and NOTICE for details. LLNL-CODE-806117.
-+//
-+// This file is part of the MFEM library. For more information and source code
-+// availability visit https://mfem.org.
-+//
-+// MFEM is free software; you can redistribute it and/or modify it under the
-+// terms of the BSD-3 license. We welcome feedback and contributions, see file
-+// CONTRIBUTING.md for details.
-+
-+#ifndef MFEM_LIBCEED_DIVDIV_QF_H
-+#define MFEM_LIBCEED_DIVDIV_QF_H
-+
-+#include "../util/util_qf.h"
-+
-+struct DivDivContext
-+{
-+   CeedInt dim, space_dim;
-+   CeedScalar coeff;
-+};
-+
-+/// libCEED QFunction for building quadrature data for a div-div operator
-+/// with a constant coefficient
-+CEED_QFUNCTION(f_build_divdiv_const)(void *ctx, CeedInt Q,
-+                                     const CeedScalar *const *in,
-+                                     CeedScalar *const *out)
-+{
-+   DivDivContext *bc = (DivDivContext *)ctx;
-+   // in[0] is Jacobians with shape [dim, ncomp=space_dim, Q]
-+   // in[1] is quadrature weights, size (Q)
-+   //
-+   // At every quadrature point, compute and store qw * c / det(J)
-+   const CeedScalar coeff = bc->coeff;
-+   const CeedScalar *J = in[0], *qw = in[1];
-+   CeedScalar *qd = out[0];
-+   switch (10 * bc->space_dim + bc->dim)
-+   {
-+      case 11:
-+         CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++)
-+         {
-+            qd[i] = qw[i] * coeff / J[i];
-+         }
-+         break;
-+      case 21:
-+         CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++)
-+         {
-+            qd[i] = qw[i] * coeff / DetJ21(J + i, Q);
-+         }
-+         break;
-+      case 22:
-+         CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++)
-+         {
-+            qd[i] = qw[i] * coeff / DetJ22(J + i, Q);
-+         }
-+         break;
-+      case 32:
-+         CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++)
-+         {
-+            qd[i] = qw[i] * coeff / DetJ32(J + i, Q);
-+         }
-+         break;
-+      case 33:
-+         CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++)
-+         {
-+            qd[i] = qw[i] * coeff / DetJ33(J + i, Q);
-+         }
-+         break;
-+   }
-+   return 0;
-+}
-+
-+/// libCEED QFunction for building quadrature data for a div-div operator
-+/// with a coefficient evaluated at quadrature points
-+CEED_QFUNCTION(f_build_divdiv_quad)(void *ctx, CeedInt Q,
-+                                    const CeedScalar *const *in,
-+                                    CeedScalar *const *out)
-+{
-+   DivDivContext *bc = (DivDivContext *)ctx;
-+   // in[0] is coefficients, size (Q)
-+   // in[1] is Jacobians with shape [dim, ncomp=space_dim, Q]
-+   // in[2] is quadrature weights, size (Q)
-+   //
-+   // At every quadrature point, compute and store qw * c / det(J)
-+   const CeedScalar *c = in[0], *J = in[1], *qw = in[2];
-+   CeedScalar *qd = out[0];
-+   switch (10 * bc->space_dim + bc->dim)
-+   {
-+      case 11:
-+         CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++)
-+         {
-+            qd[i] = qw[i] * c[i] / J[i];
-+         }
-+         break;
-+      case 21:
-+         CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++)
-+         {
-+            qd[i] = qw[i] * c[i] / DetJ21(J + i, Q);
-+         }
-+         break;
-+      case 22:
-+         CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++)
-+         {
-+            qd[i] = qw[i] * c[i] / DetJ22(J + i, Q);
-+         }
-+         break;
-+      case 32:
-+         CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++)
-+         {
-+            qd[i] = qw[i] * c[i] / DetJ32(J + i, Q);
-+         }
-+         break;
-+      case 33:
-+         CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++)
-+         {
-+            qd[i] = qw[i] * c[i] / DetJ33(J + i, Q);
-+         }
-+         break;
-+   }
-+   return 0;
-+}
-+
-+/// libCEED QFunction for applying a div-div operator
-+CEED_QFUNCTION(f_apply_divdiv)(void *ctx, CeedInt Q,
-+                               const CeedScalar *const *in,
-+                               CeedScalar *const *out)
-+{
-+   // in[0], out[0] have shape [ncomp=1, Q]
-+   const CeedScalar *ud = in[0], *qd = in[1];
-+   CeedScalar *vd = out[0];
-+   CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++)
-+   {
-+      vd[i] = qd[i] * ud[i];
-+   }
-+   return 0;
-+}
-+
-+/// libCEED QFunction for applying a div-div operator with a constant
-+/// coefficient
-+CEED_QFUNCTION(f_apply_divdiv_mf_const)(void *ctx, CeedInt Q,
-+                                        const CeedScalar *const *in,
-+                                        CeedScalar *const *out)
-+{
-+   DivDivContext *bc = (DivDivContext *)ctx;
-+   // in[0], out[0] have shape [ncomp=1, Q]
-+   // in[1] is Jacobians with shape [dim, ncomp=space_dim, Q]
-+   // in[2] is quadrature weights, size (Q)
-+   //
-+   // At every quadrature point, compute qw * c / det(J)
-+   const CeedScalar coeff = bc->coeff;
-+   const CeedScalar *ud = in[0], *J = in[1], *qw = in[2];
-+   CeedScalar *vd = out[0];
-+   switch (10 * bc->space_dim + bc->dim)
-+   {
-+      case 11:
-+         CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++)
-+         {
-+            const CeedScalar qd = qw[i] * coeff / J[i];
-+            vd[i] = qd * ud[i];
-+         }
-+         break;
-+      case 21:
-+         CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++)
-+         {
-+            const CeedScalar qd = qw[i] * coeff / DetJ21(J + i, Q);
-+            vd[i] = qd * ud[i];
-+         }
-+         break;
-+      case 22:
-+         CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++)
-+         {
-+            const CeedScalar qd = qw[i] * coeff / DetJ22(J + i, Q);
-+            vd[i] = qd * ud[i];
-+         }
-+         break;
-+      case 32:
-+         CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++)
-+         {
-+            const CeedScalar qd = qw[i] * coeff / DetJ32(J + i, Q);
-+            vd[i] = qd * ud[i];
-+         }
-+         break;
-+      case 33:
-+         CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++)
-+         {
-+            const CeedScalar qd = qw[i] * coeff / DetJ33(J + i, Q);
-+            vd[i] = qd * ud[i];
-+         }
-+         break;
-+   }
-+   return 0;
-+}
-+
-+/// libCEED QFunction for applying a div-div operator with a coefficient
-+/// evaluated at quadrature points
-+CEED_QFUNCTION(f_apply_divdiv_mf_quad)(void *ctx, CeedInt Q,
-+                                       const CeedScalar *const *in,
-+                                       CeedScalar *const *out)
-+{
-+   DivDivContext *bc = (DivDivContext *)ctx;
-+   // in[0], out[0] have shape [ncomp=1, Q]
-+   // in[0] is coefficients, size (Q)
-+   // in[2] is Jacobians with shape [dim, ncomp=space_dim, Q]
-+   // in[3] is quadrature weights, size (Q)
-+   //
-+   // At every quadrature point, compute qw * c / det(J)
-+   const CeedScalar *ud = in[0], *c = in[1], *J = in[2], *qw = in[3];
-+   CeedScalar *vd = out[0];
-+   switch (10 * bc->space_dim + bc->dim)
-+   {
-+      case 11:
-+         CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++)
-+         {
-+            const CeedScalar qd = qw[i] * c[i] / J[i];
-+            vd[i] = qd * ud[i];
-+         }
-+         break;
-+      case 21:
-+         CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++)
-+         {
-+            const CeedScalar qd = qw[i] * c[i] / DetJ21(J + i, Q);
-+            vd[i] = qd * ud[i];
-+         }
-+         break;
-+      case 22:
-+         CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++)
-+         {
-+            const CeedScalar qd = qw[i] * c[i] / DetJ22(J + i, Q);
-+            vd[i] = qd * ud[i];
-+         }
-+         break;
-+      case 32:
-+         CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++)
-+         {
-+            const CeedScalar qd = qw[i] * c[i] / DetJ32(J + i, Q);
-+            vd[i] = qd * ud[i];
-+         }
-+         break;
-+      case 33:
-+         CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++)
-+         {
-+            const CeedScalar qd = qw[i] * c[i] / DetJ33(J + i, Q);
-+            vd[i] = qd * ud[i];
-+         }
-+         break;
-+   }
-+   return 0;
-+}
-+
-+#endif // MFEM_LIBCEED_DIVDIV_QF_H
-diff --git a/fem/ceed/integrators/interp/interp.cpp b/fem/ceed/integrators/interp/interp.cpp
-new file mode 100644
-index 000000000..6aab4f47f
---- /dev/null
-+++ b/fem/ceed/integrators/interp/interp.cpp
-@@ -0,0 +1,58 @@
-+// Copyright (c) 2010-2023, Lawrence Livermore National Security, LLC. Produced
-+// at the Lawrence Livermore National Laboratory. All Rights reserved. See files
-+// LICENSE and NOTICE for details. LLNL-CODE-806117.
-+//
-+// This file is part of the MFEM library. For more information and source code
-+// availability visit https://mfem.org.
-+//
-+// MFEM is free software; you can redistribute it and/or modify it under the
-+// terms of the BSD-3 license. We welcome feedback and contributions, see file
-+// CONTRIBUTING.md for details.
-+
-+#include "interp.hpp"
-+
-+#include "../../../../config/config.hpp"
-+
-+namespace mfem
-+{
-+
-+namespace ceed
-+{
-+
-+#ifdef MFEM_USE_CEED
-+struct DiscreteInterpolatorOperatorInfo : public OperatorInfo
-+{
-+   DiscreteInterpolatorOperatorInfo()
-+   {
-+      // Discrete interpolators use a built-in QFunction
-+      header = "";
-+      header = "";
-+      build_func = "";
-+      build_qf = nullptr;
-+      apply_func = "";
-+      apply_qf = nullptr;
-+      apply_func_mf = "";
-+      apply_qf_mf = nullptr;
-+      trial_op = EvalMode::Interp;
-+      test_op = EvalMode::None;
-+      qdatasize = 0;
-+   }
-+};
-+#endif
-+
-+PADiscreteInterpolator::PADiscreteInterpolator(
-+   const mfem::DiscreteInterpolator &interp,
-+   const mfem::FiniteElementSpace &trial_fes,
-+   const mfem::FiniteElementSpace &test_fes)
-+{
-+#ifdef MFEM_USE_CEED
-+   DiscreteInterpolatorOperatorInfo info;
-+   Assemble(interp, info, trial_fes, test_fes, (mfem::Coefficient *)nullptr);
-+#else
-+   MFEM_ABORT("MFEM must be built with MFEM_USE_CEED=YES to use libCEED.");
-+#endif
-+}
-+
-+} // namespace ceed
-+
-+} // namespace mfem
-diff --git a/fem/ceed/solvers/full-assembly.hpp b/fem/ceed/integrators/interp/interp.hpp
-similarity index 50%
-rename from fem/ceed/solvers/full-assembly.hpp
-rename to fem/ceed/integrators/interp/interp.hpp
-index ed338f725..a923df572 100644
---- a/fem/ceed/solvers/full-assembly.hpp
-+++ b/fem/ceed/integrators/interp/interp.hpp
-@@ -9,12 +9,12 @@
- // terms of the BSD-3 license. We welcome feedback and contributions, see file
- // CONTRIBUTING.md for details.
- 
--#ifndef MFEM_CEED_ASSEMBLE_HPP
--#define MFEM_CEED_ASSEMBLE_HPP
-+#ifndef MFEM_LIBCEED_INTERP_HPP
-+#define MFEM_LIBCEED_INTERP_HPP
- 
--#include "../interface/ceed.hpp"
--
--#ifdef MFEM_USE_CEED
-+#include "../../interface/integrator.hpp"
-+#include "../../interface/mixed_operator.hpp"
-+#include "../../../fespace.hpp"
- 
- namespace mfem
- {
-@@ -22,18 +22,19 @@ namespace mfem
- namespace ceed
- {
- 
--/** @brief Assembles a CeedOperator as an mfem::SparseMatrix
--
--    In parallel, this assembles independently on each processor, that is, it
--    assembles at the L-vector level. The assembly procedure is always performed
--    on the host, but this works also for operators stored on device by copying
--    memory. */
--int CeedOperatorFullAssemble(CeedOperator op, SparseMatrix **mat);
--
--} // namespace ceed
-+/** Represent DiscreteInterpolator classes with AssemblyLevel::Partial
-+    using libCEED. */
-+class PADiscreteInterpolator : public MixedOperator<Interpolator>
-+{
-+public:
-+   PADiscreteInterpolator(
-+      const mfem::DiscreteInterpolator &interp,
-+      const mfem::FiniteElementSpace &trial_fes,
-+      const mfem::FiniteElementSpace &test_fes);
-+};
- 
--} // namespace mfem
-+}
- 
--#endif
-+}
- 
--#endif
-+#endif // MFEM_LIBCEED_INTERP_HPP
-diff --git a/fem/ceed/integrators/mass/mass.cpp b/fem/ceed/integrators/mass/mass.cpp
-index dfcc9a8ce..6a8d67ddc 100644
---- a/fem/ceed/integrators/mass/mass.cpp
-+++ b/fem/ceed/integrators/mass/mass.cpp
-@@ -25,20 +25,66 @@ namespace ceed
- #ifdef MFEM_USE_CEED
- struct MassOperatorInfo : public OperatorInfo
- {
--   MassContext ctx;
--   MassOperatorInfo()
-+   MassContext ctx = {0};
-+   MassOperatorInfo(const mfem::FiniteElementSpace &fes, mfem::Coefficient *Q,
-+                    bool use_bdr = false, bool use_mf = false)
-    {
-+      ctx.dim = fes.GetMesh()->Dimension() - use_bdr;
-+      ctx.space_dim = fes.GetMesh()->SpaceDimension();
-+      ctx.vdim = fes.GetVDim();
-+      if (!use_mf)
-+      {
-+         apply_func = ":f_apply_mass";
-+         apply_qf = &f_apply_mass;
-+      }
-+      else
-+      {
-+         build_func = "";
-+         build_qf = nullptr;
-+      }
-+      if (Q == nullptr)
-+      {
-+         ctx.coeff = 1.0;
-+         if (!use_mf)
-+         {
-+            build_func = ":f_build_mass_const";
-+            build_qf = &f_build_mass_const;
-+         }
-+         else
-+         {
-+            apply_func = ":f_apply_mass_mf_const";
-+            apply_qf = &f_apply_mass_mf_const;
-+         }
-+      }
-+      else if (mfem::ConstantCoefficient *const_coeff =
-+                  dynamic_cast<mfem::ConstantCoefficient *>(Q))
-+      {
-+         ctx.coeff = const_coeff->constant;
-+         if (!use_mf)
-+         {
-+            build_func = ":f_build_mass_const";
-+            build_qf = &f_build_mass_const;
-+         }
-+         else
-+         {
-+            apply_func = ":f_apply_mass_mf_const";
-+            apply_qf = &f_apply_mass_mf_const;
-+         }
-+      }
-+      else
-+      {
-+         if (!use_mf)
-+         {
-+            build_func = ":f_build_mass_quad";
-+            build_qf = &f_build_mass_quad;
-+         }
-+         else
-+         {
-+            apply_func = ":f_apply_mass_mf_quad";
-+            apply_qf = &f_apply_mass_mf_quad;
-+         }
-+      }
-       header = "/integrators/mass/mass_qf.h";
--      build_func_const = ":f_build_mass_const";
--      build_qf_const = &f_build_mass_const;
--      build_func_quad = ":f_build_mass_quad";
--      build_qf_quad = &f_build_mass_quad;
--      apply_func = ":f_apply_mass";
--      apply_qf = &f_apply_mass;
--      apply_func_mf_const = ":f_apply_mass_mf_const";
--      apply_qf_mf_const = &f_apply_mass_mf_const;
--      apply_func_mf_quad = ":f_apply_mass_mf_quad";
--      apply_qf_mf_quad = &f_apply_mass_mf_quad;
-       trial_op = EvalMode::Interp;
-       test_op = EvalMode::Interp;
-       qdatasize = 1;
-@@ -46,75 +92,53 @@ struct MassOperatorInfo : public OperatorInfo
- };
- #endif
- 
--PAMassIntegrator::PAMassIntegrator(const mfem::FiniteElementSpace &fes,
--                                   const mfem::IntegrationRule &irm,
--                                   mfem::Coefficient *Q)
--   : PAIntegrator()
-+PAMassIntegrator::PAMassIntegrator(const mfem::MassIntegrator &integ,
-+                                   const mfem::FiniteElementSpace &fes,
-+                                   mfem::Coefficient *Q,
-+                                   const bool use_bdr)
- {
- #ifdef MFEM_USE_CEED
--   MassOperatorInfo info;
--   Assemble(info, fes, irm, Q);
-+   MassOperatorInfo info(fes, Q, use_bdr);
-+   Assemble(integ, info, fes, Q, use_bdr);
- #else
-    MFEM_ABORT("MFEM must be built with MFEM_USE_CEED=YES to use libCEED.");
- #endif
- }
- 
--MixedPAMassIntegrator::MixedPAMassIntegrator(const MassIntegrator &integ,
--                                             const mfem::FiniteElementSpace &fes,
--                                             mfem::Coefficient *Q)
-+PAMassIntegrator::PAMassIntegrator(const mfem::VectorMassIntegrator &integ,
-+                                   const mfem::FiniteElementSpace &fes,
-+                                   mfem::Coefficient *Q,
-+                                   const bool use_bdr)
- {
- #ifdef MFEM_USE_CEED
--   MassOperatorInfo info;
--   Assemble(integ, info, fes, Q);
-+   MassOperatorInfo info(fes, Q, use_bdr);
-+   Assemble(integ, info, fes, Q, use_bdr);
- #else
-    MFEM_ABORT("MFEM must be built with MFEM_USE_CEED=YES to use libCEED.");
- #endif
- }
- 
--MixedPAMassIntegrator::MixedPAMassIntegrator(const VectorMassIntegrator &integ,
--                                             const mfem::FiniteElementSpace &fes,
--                                             mfem::Coefficient *Q)
-+MFMassIntegrator::MFMassIntegrator(const mfem::MassIntegrator &integ,
-+                                   const mfem::FiniteElementSpace &fes,
-+                                   mfem::Coefficient *Q,
-+                                   const bool use_bdr)
- {
- #ifdef MFEM_USE_CEED
--   MassOperatorInfo info;
--   Assemble(integ, info, fes, Q);
-+   MassOperatorInfo info(fes, Q, use_bdr, true);
-+   Assemble(integ, info, fes, Q, use_bdr, true);
- #else
-    MFEM_ABORT("MFEM must be built with MFEM_USE_CEED=YES to use libCEED.");
- #endif
- }
- 
--MFMassIntegrator::MFMassIntegrator(const mfem::FiniteElementSpace &fes,
--                                   const mfem::IntegrationRule &irm,
--                                   mfem::Coefficient *Q)
--   : MFIntegrator()
-+MFMassIntegrator::MFMassIntegrator(const mfem::VectorMassIntegrator &integ,
-+                                   const mfem::FiniteElementSpace &fes,
-+                                   mfem::Coefficient *Q,
-+                                   const bool use_bdr)
- {
- #ifdef MFEM_USE_CEED
--   MassOperatorInfo info;
--   Assemble(info, fes, irm, Q);
--#else
--   MFEM_ABORT("MFEM must be built with MFEM_USE_CEED=YES to use libCEED.");
--#endif
--}
--
--MixedMFMassIntegrator::MixedMFMassIntegrator(const MassIntegrator &integ,
--                                             const mfem::FiniteElementSpace &fes,
--                                             mfem::Coefficient *Q)
--{
--#ifdef MFEM_USE_CEED
--   MassOperatorInfo info;
--   Assemble(integ, info, fes, Q);
--#else
--   MFEM_ABORT("MFEM must be built with MFEM_USE_CEED=YES to use libCEED.");
--#endif
--}
--
--MixedMFMassIntegrator::MixedMFMassIntegrator(const VectorMassIntegrator &integ,
--                                             const mfem::FiniteElementSpace &fes,
--                                             mfem::Coefficient *Q)
--{
--#ifdef MFEM_USE_CEED
--   MassOperatorInfo info;
--   Assemble(integ, info, fes, Q);
-+   MassOperatorInfo info(fes, Q, use_bdr, true);
-+   Assemble(integ, info, fes, Q, use_bdr, true);
- #else
-    MFEM_ABORT("MFEM must be built with MFEM_USE_CEED=YES to use libCEED.");
- #endif
-diff --git a/fem/ceed/integrators/mass/mass.hpp b/fem/ceed/integrators/mass/mass.hpp
-index 696f8c3dc..4125fc6ed 100644
---- a/fem/ceed/integrators/mass/mass.hpp
-+++ b/fem/ceed/integrators/mass/mass.hpp
-@@ -13,7 +13,7 @@
- #define MFEM_LIBCEED_MASS_HPP
- 
- #include "../../interface/integrator.hpp"
--#include "../../interface/mixed_integrator.hpp"
-+#include "../../interface/mixed_operator.hpp"
- #include "../../../fespace.hpp"
- 
- namespace mfem
-@@ -23,45 +23,33 @@ namespace ceed
- {
- 
- /// Represent a MassIntegrator with AssemblyLevel::Partial using libCEED.
--class PAMassIntegrator : public PAIntegrator
-+class PAMassIntegrator : public MixedOperator<Integrator>
- {
- public:
--   PAMassIntegrator(const mfem::FiniteElementSpace &fes,
--                    const mfem::IntegrationRule &ir,
--                    mfem::Coefficient *Q);
--};
--
--class MixedPAMassIntegrator : public MixedIntegrator<PAIntegrator>
--{
--public:
--   MixedPAMassIntegrator(const MassIntegrator &integ,
--                         const mfem::FiniteElementSpace &fes,
--                         mfem::Coefficient *Q);
--
--   MixedPAMassIntegrator(const VectorMassIntegrator &integ,
--                         const mfem::FiniteElementSpace &fes,
--                         mfem::Coefficient *Q);
-+   PAMassIntegrator(const mfem::MassIntegrator &integ,
-+                    const mfem::FiniteElementSpace &fes,
-+                    mfem::Coefficient *Q,
-+                    const bool use_bdr = false);
-+
-+   PAMassIntegrator(const mfem::VectorMassIntegrator &integ,
-+                    const mfem::FiniteElementSpace &fes,
-+                    mfem::Coefficient *Q,
-+                    const bool use_bdr = false);
- };
- 
- /// Represent a MassIntegrator with AssemblyLevel::None using libCEED.
--class MFMassIntegrator : public MFIntegrator
-+class MFMassIntegrator : public MixedOperator<Integrator>
- {
- public:
--   MFMassIntegrator(const mfem::FiniteElementSpace &fes,
--                    const mfem::IntegrationRule &ir,
--                    mfem::Coefficient *Q);
--};
--
--class MixedMFMassIntegrator : public MixedIntegrator<MFIntegrator>
--{
--public:
--   MixedMFMassIntegrator(const MassIntegrator &integ,
--                         const mfem::FiniteElementSpace &fes,
--                         mfem::Coefficient *Q);
--
--   MixedMFMassIntegrator(const VectorMassIntegrator &integ,
--                         const mfem::FiniteElementSpace &fes,
--                         mfem::Coefficient *Q);
-+   MFMassIntegrator(const mfem::MassIntegrator &integ,
-+                    const mfem::FiniteElementSpace &fes,
-+                    mfem::Coefficient *Q,
-+                    const bool use_bdr = false);
-+
-+   MFMassIntegrator(const mfem::VectorMassIntegrator &integ,
-+                    const mfem::FiniteElementSpace &fes,
-+                    mfem::Coefficient *Q,
-+                    const bool use_bdr = false);
- };
- 
- }
-diff --git a/fem/ceed/integrators/mass/mass_qf.h b/fem/ceed/integrators/mass/mass_qf.h
-index 85002ae04..3cdd3b5e3 100644
---- a/fem/ceed/integrators/mass/mass_qf.h
-+++ b/fem/ceed/integrators/mass/mass_qf.h
-@@ -9,128 +9,151 @@
- // terms of the BSD-3 license. We welcome feedback and contributions, see file
- // CONTRIBUTING.md for details.
- 
-+#ifndef MFEM_LIBCEED_MASS_QF_H
-+#define MFEM_LIBCEED_MASS_QF_H
- 
--/// A structure used to pass additional data to f_build_diff and f_apply_diff
--struct MassContext { CeedInt dim, space_dim, vdim; CeedScalar coeff; };
-+#include "../util/util_qf.h"
- 
--/// libCEED Q-function for building quadrature data for a mass operator with a
--/// constant coefficient
-+struct MassContext
-+{
-+   CeedInt dim, space_dim, vdim;
-+   CeedScalar coeff;
-+};
-+
-+/// libCEED QFunction for building quadrature data for a mass operator
-+/// with a constant coefficient
- CEED_QFUNCTION(f_build_mass_const)(void *ctx, CeedInt Q,
-                                    const CeedScalar *const *in,
-                                    CeedScalar *const *out)
- {
--   // in[0] is Jacobians with shape [dim, nc=dim, Q]
-+   // in[0] is Jacobians with shape [dim, ncomp=space_dim, Q]
-    // in[1] is quadrature weights, size (Q)
-+   //
-+   // At every quadrature point, compute and store qw * c * det(J)
-    MassContext *bc = (MassContext *)ctx;
-    const CeedScalar coeff = bc->coeff;
-    const CeedScalar *J = in[0], *qw = in[1];
--   CeedScalar *rho = out[0];
--   switch (bc->dim + 10*bc->space_dim)
-+   CeedScalar *qd = out[0];
-+   switch (10 * bc->space_dim + bc->dim)
-    {
-       case 11:
--         for (CeedInt i=0; i<Q; i++)
-+         CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++)
-+         {
-+            qd[i] = qw[i] * coeff * J[i];
-+         }
-+         break;
-+      case 21:
-+         CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++)
-          {
--            rho[i] = coeff * J[i] * qw[i];
-+            qd[i] = qw[i] * coeff * DetJ21(J + i, Q);
-          }
-          break;
-       case 22:
--         for (CeedInt i=0; i<Q; i++)
-+         CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++)
-          {
--            // 0 2
--            // 1 3
--            rho[i] = coeff * (J[i+Q*0]*J[i+Q*3] - J[i+Q*1]*J[i+Q*2]) * qw[i];
-+            qd[i] = qw[i] * coeff * DetJ22(J + i, Q);
-+         }
-+         break;
-+      case 32:
-+         CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++)
-+         {
-+            qd[i] = qw[i] * coeff * DetJ32(J + i, Q);
-          }
-          break;
-       case 33:
--         for (CeedInt i=0; i<Q; i++)
-+         CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++)
-          {
--            // 0 3 6
--            // 1 4 7
--            // 2 5 8
--            rho[i] = (J[i+Q*0]*(J[i+Q*4]*J[i+Q*8] - J[i+Q*5]*J[i+Q*7]) -
--                      J[i+Q*1]*(J[i+Q*3]*J[i+Q*8] - J[i+Q*5]*J[i+Q*6]) +
--                      J[i+Q*2]*(J[i+Q*3]*J[i+Q*7] - J[i+Q*4]*J[i+Q*6])) * coeff * qw[i];
-+            qd[i] = qw[i] * coeff * DetJ33(J + i, Q);
-          }
-          break;
-    }
-    return 0;
- }
- 
--/// libCEED Q-function for building quadrature data for a mass operator with a
--/// coefficient evaluated at quadrature points.
-+/// libCEED QFunction for building quadrature data for a mass operator
-+/// with a coefficient evaluated at quadrature points
- CEED_QFUNCTION(f_build_mass_quad)(void *ctx, CeedInt Q,
-                                   const CeedScalar *const *in,
-                                   CeedScalar *const *out)
- {
--   // in[0] is Jacobians with shape [dim, nc=dim, Q]
--   // in[1] is quadrature weights, size (Q)
-+   // in[0] is coefficients, size (Q)
-+   // in[1] is Jacobians with shape [dim, ncomp=space_dim, Q]
-+   // in[2] is quadrature weights, size (Q)
-+   //
-+   // At every quadrature point, compute and store qw * c * det(J)
-    MassContext *bc = (MassContext *)ctx;
-    const CeedScalar *c = in[0], *J = in[1], *qw = in[2];
--   CeedScalar *rho = out[0];
--   switch (bc->dim + 10*bc->space_dim)
-+   CeedScalar *qd = out[0];
-+   switch (10 * bc->space_dim + bc->dim)
-    {
-       case 11:
--         for (CeedInt i=0; i<Q; i++)
-+         CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++)
-          {
--            rho[i] = c[i] * J[i] * qw[i];
-+            qd[i] = qw[i] * c[i] * J[i];
-+         }
-+         break;
-+      case 21:
-+         CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++)
-+         {
-+            qd[i] = qw[i] * c[i] * DetJ21(J + i, Q);
-          }
-          break;
-       case 22:
--         for (CeedInt i=0; i<Q; i++)
-+         CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++)
-+         {
-+            qd[i] = qw[i] * c[i] * DetJ22(J + i, Q);
-+         }
-+         break;
-+      case 32:
-+         CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++)
-          {
--            // 0 2
--            // 1 3
--            rho[i] = c[i] * (J[i+Q*0]*J[i+Q*3] - J[i+Q*1]*J[i+Q*2]) * qw[i];
-+            qd[i] = qw[i] * c[i] * DetJ32(J + i, Q);
-          }
-          break;
-       case 33:
--         for (CeedInt i=0; i<Q; i++)
-+         CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++)
-          {
--            // 0 3 6
--            // 1 4 7
--            // 2 5 8
--            rho[i] = (J[i+Q*0]*(J[i+Q*4]*J[i+Q*8] - J[i+Q*5]*J[i+Q*7]) -
--                      J[i+Q*1]*(J[i+Q*3]*J[i+Q*8] - J[i+Q*5]*J[i+Q*6]) +
--                      J[i+Q*2]*(J[i+Q*3]*J[i+Q*7] - J[i+Q*4]*J[i+Q*6])) * c[i] * qw[i];
-+            qd[i] = qw[i] * c[i] * DetJ33(J + i, Q);
-          }
-          break;
-    }
-    return 0;
- }
- 
--/// libCEED Q-function for applying a mass operator
-+/// libCEED QFunction for applying a mass operator
- CEED_QFUNCTION(f_apply_mass)(void *ctx, CeedInt Q,
-                              const CeedScalar *const *in,
-                              CeedScalar *const *out)
- {
-    MassContext *bc = (MassContext *)ctx;
--   const CeedScalar *u = in[0], *w = in[1];
-+   // in[0], out[0] have shape [ncomp=vdim, Q]
-+   const CeedScalar *u = in[0], *qd = in[1];
-    CeedScalar *v = out[0];
-    switch (bc->vdim)
-    {
-       case 1:
--         for (CeedInt i=0; i<Q; i++)
-+         CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++)
-          {
--            v[i] = w[i] * u[i];
-+            v[i] = qd[i] * u[i];
-          }
-          break;
-       case 2:
--         for (CeedInt i=0; i<Q; i++)
-+         CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++)
-          {
--            const CeedScalar W = w[i];
--            for (CeedInt c = 0; c < 2; c++)
-+            const CeedScalar qdi = qd[i];
-+            CeedPragmaSIMD for (CeedInt d = 0; d < 2; d++)
-             {
--               v[i+c*Q] = W * u[i+c*Q];
-+               v[i + d * Q] = qdi * u[i + d * Q];
-             }
-          }
-          break;
-       case 3:
--         for (CeedInt i=0; i<Q; i++)
-+         CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++)
-          {
--            const CeedScalar W = w[i];
--            for (CeedInt c = 0; c < 3; c++)
-+            const CeedScalar qdi = qd[i];
-+            CeedPragmaSIMD for (CeedInt d = 0; d < 3; d++)
-             {
--               v[i+c*Q] = W * u[i+c*Q];
-+               v[i + d * Q] = qdi * u[i + d * Q];
-             }
-          }
-          break;
-@@ -138,63 +161,95 @@ CEED_QFUNCTION(f_apply_mass)(void *ctx, CeedInt Q,
-    return 0;
- }
- 
--/// libCEED Q-function for applying a diff operator
-+/// libCEED QFunction for applying a mass operator with a constant
-+/// coefficient
- CEED_QFUNCTION(f_apply_mass_mf_const)(void *ctx, CeedInt Q,
--                                      const CeedScalar *const *in, CeedScalar *const *out)
-+                                      const CeedScalar *const *in,
-+                                      CeedScalar *const *out)
- {
--   MassContext *bc = (MassContext*)ctx;
-+   MassContext *bc = (MassContext *)ctx;
-+   // in[0], out[0] have shape [ncomp=vdim, Q]
-+   // in[1] is Jacobians with shape [dim, ncomp=space_dim, Q]
-+   // in[2] is quadrature weights, size (Q)
-+   //
-+   // At every quadrature point, compute qw * c * det(J)
-    const CeedScalar coeff = bc->coeff;
-    const CeedScalar *u = in[0], *J = in[1], *qw = in[2];
-    CeedScalar *v = out[0];
--   switch (10 * bc->dim + bc->vdim)
-+   switch (100 * bc->space_dim + 10 * bc->dim + bc->vdim)
-    {
--      case 11:
--         for (CeedInt i = 0; i < Q; i++)
-+      case 111:
-+         CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++)
-          {
--            const CeedScalar rho = coeff * qw[i] / J[i];
--            v[i] = rho * u[i];
-+            const CeedScalar qd = qw[i] * coeff * J[i];
-+            v[i] = qd * u[i];
-          }
-          break;
--      case 21:
--         for (CeedInt i = 0; i < Q; i++)
-+      case 211:
-+         CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++)
-          {
--            const CeedScalar rho = coeff * (J[i+Q*0]*J[i+Q*3] - J[i+Q*1]*J[i+Q*2]) * qw[i];
--            v[i] = rho * u[i];
-+            const CeedScalar qd = qw[i] * coeff * DetJ21(J + i, Q);
-+            v[i] = qd * u[i];
-          }
-          break;
--      case 22:
--         for (CeedInt i=0; i<Q; i++)
-+      case 212:
-+         CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++)
-          {
--            // 0 2
--            // 1 3
--            const CeedScalar rho = coeff * (J[i+Q*0]*J[i+Q*3] - J[i+Q*1]*J[i+Q*2]) * qw[i];
--            for (CeedInt c = 0; c < 2; c++)
-+            const CeedScalar qd = qw[i] * coeff * DetJ21(J + i, Q);
-+            CeedPragmaSIMD for (CeedInt d = 0; d < 2; d++)
-             {
--               v[i+c*Q] = rho * u[i+c*Q];
-+               v[i + d * Q] = qd * u[i + d * Q];
-             }
-          }
-          break;
--      case 31:
--         for (CeedInt i = 0; i < Q; i++)
-+      case 221:
-+         CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++)
-          {
--            const CeedScalar rho = (J[i+Q*0]*(J[i+Q*4]*J[i+Q*8] - J[i+Q*5]*J[i+Q*7]) -
--                                    J[i+Q*1]*(J[i+Q*3]*J[i+Q*8] - J[i+Q*5]*J[i+Q*6]) +
--                                    J[i+Q*2]*(J[i+Q*3]*J[i+Q*7] - J[i+Q*4]*J[i+Q*6])) * coeff * qw[i];
--            v[i] = rho * u[i];
-+            const CeedScalar qd = qw[i] * coeff * DetJ22(J + i, Q);
-+            v[i] = qd * u[i];
-          }
-          break;
--      case 33:
--         for (CeedInt i=0; i<Q; i++)
-+      case 222:
-+         CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++)
-+         {
-+            const CeedScalar qd = qw[i] * coeff * DetJ22(J + i, Q);
-+            CeedPragmaSIMD for (CeedInt d = 0; d < 2; d++)
-+            {
-+               v[i + d * Q] = qd * u[i + d * Q];
-+            }
-+         }
-+         break;
-+      case 321:
-+         CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++)
-+         {
-+            const CeedScalar qd = qw[i] * coeff * DetJ32(J + i, Q);
-+            v[i] = qd * u[i];
-+         }
-+         break;
-+      case 323:
-+         CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++)
-          {
--            // 0 3 6
--            // 1 4 7
--            // 2 5 8
--            const CeedScalar rho = (J[i+Q*0]*(J[i+Q*4]*J[i+Q*8] - J[i+Q*5]*J[i+Q*7]) -
--                                    J[i+Q*1]*(J[i+Q*3]*J[i+Q*8] - J[i+Q*5]*J[i+Q*6]) +
--                                    J[i+Q*2]*(J[i+Q*3]*J[i+Q*7] - J[i+Q*4]*J[i+Q*6])) * coeff * qw[i];
--            for (CeedInt c = 0; c < 3; c++)
-+            const CeedScalar qd = qw[i] * coeff * DetJ32(J + i, Q);
-+            CeedPragmaSIMD for (CeedInt d = 0; d < 3; d++)
-             {
--               v[i+c*Q] = rho * u[i+c*Q];
-+               v[i + d * Q] = qd * u[i + d * Q];
-+            }
-+         }
-+         break;
-+      case 331:
-+         CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++)
-+         {
-+            const CeedScalar qd = qw[i] * coeff * DetJ33(J + i, Q);
-+            v[i] = qd * u[i];
-+         }
-+         break;
-+      case 333:
-+         CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++)
-+         {
-+            const CeedScalar qd = qw[i] * coeff * DetJ33(J + i, Q);
-+            CeedPragmaSIMD for (CeedInt d = 0; d < 3; d++)
-+            {
-+               v[i + d * Q] = qd * u[i + d * Q];
-             }
-          }
-          break;
-@@ -202,69 +257,99 @@ CEED_QFUNCTION(f_apply_mass_mf_const)(void *ctx, CeedInt Q,
-    return 0;
- }
- 
-+/// libCEED QFunction for applying a mass operator with a coefficient
-+/// evaluated at quadrature points
- CEED_QFUNCTION(f_apply_mass_mf_quad)(void *ctx, CeedInt Q,
--                                     const CeedScalar *const *in, CeedScalar *const *out)
-+                                     const CeedScalar *const *in,
-+                                     CeedScalar *const *out)
- {
--   MassContext *bc = (MassContext*)ctx;
--   const CeedScalar *c = in[0], *u = in[1], *J = in[2], *qw = in[3];
-+   MassContext *bc = (MassContext *)ctx;
-+   // in[0], out[0] have shape [ncomp=vdim, Q]
-+   // in[1] is Jacobians with shape [dim, ncomp=space_dim, Q]
-+   // in[2] is quadrature weights, size (Q)
-+   //
-+   // At every quadrature point, compute qw * c * det(J)
-+   const CeedScalar *u = in[0], *c = in[1], *J = in[2], *qw = in[3];
-    CeedScalar *v = out[0];
--   switch (10 * bc->dim + bc->vdim)
-+   switch (100 * bc->space_dim + 10 * bc->dim + bc->vdim)
-    {
--      case 11:
--         for (CeedInt i=0; i<Q; i++)
-+      case 111:
-+         CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++)
-          {
--            const CeedScalar rho = c[i] * J[i] * qw[i];
--            v[i] = rho * u[i];
-+            const CeedScalar qd = qw[i] * c[i] * J[i];
-+            v[i] = qd * u[i];
-          }
-          break;
--      case 21:
--         for (CeedInt i=0; i<Q; i++)
-+      case 211:
-+         CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++)
-          {
--            // 0 2
--            // 1 3
--            const CeedScalar rho = c[i] * (J[i+Q*0]*J[i+Q*3] - J[i+Q*1]*J[i+Q*2]) * qw[i];
--            v[i] = rho * u[i];
-+            const CeedScalar qd = qw[i] * c[i] * DetJ21(J + i, Q);
-+            v[i] = qd * u[i];
-          }
-          break;
--      case 22:
--         for (CeedInt i=0; i<Q; i++)
-+      case 212:
-+         CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++)
-          {
--            // 0 2
--            // 1 3
--            const CeedScalar rho = c[i] * (J[i+Q*0]*J[i+Q*3] - J[i+Q*1]*J[i+Q*2]) * qw[i];
--            for (CeedInt d = 0; d < 2; d++)
-+            const CeedScalar qd = qw[i] * c[i] * DetJ21(J + i, Q);
-+            CeedPragmaSIMD for (CeedInt d = 0; d < 2; d++)
-             {
--               v[i+d*Q] = rho * u[i+d*Q];
-+               v[i + d * Q] = qd * u[i + d * Q];
-             }
-          }
-          break;
--      case 31:
--         for (CeedInt i=0; i<Q; i++)
-+      case 221:
-+         CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++)
-          {
--            // 0 3 6
--            // 1 4 7
--            // 2 5 8
--            const CeedScalar rho = (J[i+Q*0]*(J[i+Q*4]*J[i+Q*8] - J[i+Q*5]*J[i+Q*7]) -
--                                    J[i+Q*1]*(J[i+Q*3]*J[i+Q*8] - J[i+Q*5]*J[i+Q*6]) +
--                                    J[i+Q*2]*(J[i+Q*3]*J[i+Q*7] - J[i+Q*4]*J[i+Q*6])) * c[i] * qw[i];
--            v[i] = rho * u[i];
-+            const CeedScalar qd = qw[i] * c[i] * DetJ22(J + i, Q);
-+            v[i] = qd * u[i];
-          }
-          break;
--      case 33:
--         for (CeedInt i=0; i<Q; i++)
-+      case 222:
-+         CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++)
-+         {
-+            const CeedScalar qd = qw[i] * c[i] * DetJ22(J + i, Q);
-+            CeedPragmaSIMD for (CeedInt d = 0; d < 2; d++)
-+            {
-+               v[i + d * Q] = qd * u[i + d * Q];
-+            }
-+         }
-+         break;
-+      case 321:
-+         CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++)
-+         {
-+            const CeedScalar qd = qw[i] * c[i] * DetJ32(J + i, Q);
-+            v[i] = qd * u[i];
-+         }
-+         break;
-+      case 323:
-+         CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++)
-          {
--            // 0 3 6
--            // 1 4 7
--            // 2 5 8
--            const CeedScalar rho = (J[i+Q*0]*(J[i+Q*4]*J[i+Q*8] - J[i+Q*5]*J[i+Q*7]) -
--                                    J[i+Q*1]*(J[i+Q*3]*J[i+Q*8] - J[i+Q*5]*J[i+Q*6]) +
--                                    J[i+Q*2]*(J[i+Q*3]*J[i+Q*7] - J[i+Q*4]*J[i+Q*6])) * c[i] * qw[i];
--            for (CeedInt d = 0; d < 3; d++)
-+            const CeedScalar qd = qw[i] * c[i] * DetJ32(J + i, Q);
-+            CeedPragmaSIMD for (CeedInt d = 0; d < 3; d++)
-             {
--               v[i+d*Q] = rho * u[i+d*Q];
-+               v[i + d * Q] = qd * u[i + d * Q];
-+            }
-+         }
-+         break;
-+      case 331:
-+         CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++)
-+         {
-+            const CeedScalar qd = qw[i] * c[i] * DetJ33(J + i, Q);
-+            v[i] = qd * u[i];
-+         }
-+         break;
-+      case 333:
-+         CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++)
-+         {
-+            const CeedScalar qd = qw[i] * c[i] * DetJ33(J + i, Q);
-+            CeedPragmaSIMD for (CeedInt d = 0; d < 3; d++)
-+            {
-+               v[i + d * Q] = qd * u[i + d * Q];
-             }
-          }
-          break;
-    }
-    return 0;
- }
-+
-+#endif // MFEM_LIBCEED_MASS_QF_H
-diff --git a/fem/ceed/integrators/mixedveccurl/mixedveccurl.cpp b/fem/ceed/integrators/mixedveccurl/mixedveccurl.cpp
-new file mode 100644
-index 000000000..a0186e391
---- /dev/null
-+++ b/fem/ceed/integrators/mixedveccurl/mixedveccurl.cpp
-@@ -0,0 +1,338 @@
-+// Copyright (c) 2010-2023, Lawrence Livermore National Security, LLC. Produced
-+// at the Lawrence Livermore National Laboratory. All Rights reserved. See files
-+// LICENSE and NOTICE for details. LLNL-CODE-806117.
-+//
-+// This file is part of the MFEM library. For more information and source code
-+// availability visit https://mfem.org.
-+//
-+// MFEM is free software; you can redistribute it and/or modify it under the
-+// terms of the BSD-3 license. We welcome feedback and contributions, see file
-+// CONTRIBUTING.md for details.
-+
-+#include "mixedveccurl.hpp"
-+
-+#include "../../../../config/config.hpp"
-+#ifdef MFEM_USE_CEED
-+#include "../curlcurl/curlcurl_qf.h"
-+#endif
-+
-+namespace mfem
-+{
-+
-+namespace ceed
-+{
-+
-+#ifdef MFEM_USE_CEED
-+struct MixedVectorCurlOperatorInfoBase : public OperatorInfo
-+{
-+   CurlCurlContext ctx = {0};
-+   template <typename CoeffType>
-+   MixedVectorCurlOperatorInfoBase(const mfem::FiniteElementSpace &trial_fes,
-+                                   const mfem::FiniteElementSpace &test_fes,
-+                                   CoeffType *Q, bool use_bdr = false,
-+                                   bool use_mf = false)
-+   {
-+      // Reuse H(div) quadrature functions for CurlCurlIntegrator
-+      MFEM_VERIFY(trial_fes.GetVDim() == 1 && test_fes.GetVDim() == 1,
-+                  "libCEED interface for vector FE does not support vdim > 1!");
-+      ctx.dim = trial_fes.GetMesh()->Dimension() - use_bdr;
-+      MFEM_VERIFY(ctx.dim == 3,
-+                  "MixedVectorCurlIntegrator and MixedVectorWeakCurlIntegrator "
-+                  "require dim == 3!");
-+      ctx.space_dim = trial_fes.GetMesh()->SpaceDimension();
-+      ctx.curl_dim = (ctx.dim < 3) ? 1 : ctx.dim;
-+      if (!use_mf)
-+      {
-+         apply_func = ":f_apply_curlcurl";
-+         apply_qf = &f_apply_curlcurl;
-+      }
-+      else
-+      {
-+         build_func = "";
-+         build_qf = nullptr;
-+      }
-+      if (Q == nullptr)
-+      {
-+         ctx.coeff[0] = 1.0;
-+         if (!use_mf)
-+         {
-+            build_func = ":f_build_curlcurl_const_scalar";
-+            build_qf = &f_build_curlcurl_const_scalar;
-+         }
-+         else
-+         {
-+            apply_func = ":f_apply_curlcurl_mf_const_scalar";
-+            apply_qf = &f_apply_curlcurl_mf_const_scalar;
-+         }
-+      }
-+      else
-+      {
-+         InitCoefficient(*Q, use_mf);
-+      }
-+      header = "/integrators/curlcurl/curlcurl_qf.h";
-+      qdatasize = (ctx.curl_dim * (ctx.curl_dim + 1)) / 2;
-+   }
-+   void InitCoefficient(mfem::Coefficient &Q, bool use_mf)
-+   {
-+      if (mfem::ConstantCoefficient *const_coeff =
-+             dynamic_cast<mfem::ConstantCoefficient *>(&Q))
-+      {
-+         ctx.coeff[0] = const_coeff->constant;
-+         if (!use_mf)
-+         {
-+            build_func = ":f_build_curlcurl_const_scalar";
-+            build_qf = &f_build_curlcurl_const_scalar;
-+         }
-+         else
-+         {
-+            apply_func = ":f_apply_curlcurl_mf_const_scalar";
-+            apply_qf = &f_apply_curlcurl_mf_const_scalar;
-+         }
-+      }
-+      else
-+      {
-+         if (!use_mf)
-+         {
-+            build_func = ":f_build_curlcurl_quad_scalar";
-+            build_qf = &f_build_curlcurl_quad_scalar;
-+         }
-+         else
-+         {
-+            apply_func = ":f_apply_curlcurl_mf_quad_scalar";
-+            apply_qf = &f_apply_curlcurl_mf_quad_scalar;
-+         }
-+      }
-+   }
-+   void InitCoefficient(mfem::VectorCoefficient &VQ, bool use_mf)
-+   {
-+      if (mfem::VectorConstantCoefficient *const_coeff =
-+             dynamic_cast<mfem::VectorConstantCoefficient *>(&VQ))
-+      {
-+         const int vdim = VQ.GetVDim();
-+         MFEM_VERIFY(vdim <= LIBCEED_CURLCURL_COEFF_COMP_MAX,
-+                     "VectorCoefficient dimension exceeds context storage!");
-+         const mfem::Vector &val = const_coeff->GetVec();
-+         for (int i = 0; i < vdim; i++)
-+         {
-+            ctx.coeff[i] = val[i];
-+         }
-+         if (!use_mf)
-+         {
-+            build_func = ":f_build_curlcurl_const_vector";
-+            build_qf = &f_build_curlcurl_const_vector;
-+         }
-+         else
-+         {
-+            apply_func = ":f_apply_curlcurl_mf_const_vector";
-+            apply_qf = &f_apply_curlcurl_mf_const_vector;
-+         }
-+      }
-+      else
-+      {
-+         if (!use_mf)
-+         {
-+            build_func = ":f_build_curlcurl_quad_vector";
-+            build_qf = &f_build_curlcurl_quad_vector;
-+         }
-+         else
-+         {
-+            apply_func = ":f_apply_curlcurl_mf_quad_vector";
-+            apply_qf = &f_apply_curlcurl_mf_quad_vector;
-+         }
-+      }
-+   }
-+   void InitCoefficient(mfem::MatrixCoefficient &MQ, bool use_mf)
-+   {
-+      // Assumes matrix coefficient is symmetric
-+      if (mfem::MatrixConstantCoefficient *const_coeff =
-+             dynamic_cast<mfem::MatrixConstantCoefficient *>(&MQ))
-+      {
-+         const int vdim = MQ.GetVDim();
-+         MFEM_VERIFY((vdim * (vdim + 1)) / 2 <= LIBCEED_CURLCURL_COEFF_COMP_MAX,
-+                     "MatrixCoefficient dimensions exceed context storage!");
-+         const mfem::DenseMatrix &val = const_coeff->GetMatrix();
-+         for (int j = 0; j < vdim; j++)
-+         {
-+            for (int i = j; i < vdim; i++)
-+            {
-+               const int idx = (j * vdim) - (((j - 1) * j) / 2) + i - j;
-+               ctx.coeff[idx] = val(i, j);
-+            }
-+         }
-+         if (!use_mf)
-+         {
-+            build_func = ":f_build_curlcurl_const_matrix";
-+            build_qf = &f_build_curlcurl_const_matrix;
-+         }
-+         else
-+         {
-+            apply_func = ":f_apply_curlcurl_mf_const_matrix";
-+            apply_qf = &f_apply_curlcurl_mf_const_matrix;
-+         }
-+      }
-+      else
-+      {
-+         if (!use_mf)
-+         {
-+            build_func = ":f_build_curlcurl_quad_matrix";
-+            build_qf = &f_build_curlcurl_quad_matrix;
-+         }
-+         else
-+         {
-+            apply_func = ":f_apply_curlcurl_mf_quad_matrix";
-+            apply_qf = &f_apply_curlcurl_mf_quad_matrix;
-+         }
-+      }
-+   }
-+};
-+
-+struct MixedVectorCurlOperatorInfo : public MixedVectorCurlOperatorInfoBase
-+{
-+   template <typename CoeffType>
-+   MixedVectorCurlOperatorInfo(const mfem::FiniteElementSpace &trial_fes,
-+                               const mfem::FiniteElementSpace &test_fes,
-+                               CoeffType *Q, bool use_bdr = false,
-+                               bool use_mf = false)
-+      : MixedVectorCurlOperatorInfoBase(trial_fes, test_fes, Q, use_bdr, use_mf)
-+   {
-+      MFEM_VERIFY(
-+         trial_fes.FEColl()->GetDerivMapType(ctx.dim) == mfem::FiniteElement::H_DIV &&
-+         test_fes.FEColl()->GetMapType(ctx.dim) == mfem::FiniteElement::H_DIV,
-+         "libCEED interface for MixedVectorCurlIntegrator requires "
-+         "H(curl) domain and H(div) range FE spaces!");
-+      trial_op = EvalMode::Curl;
-+      test_op = EvalMode::Interp;
-+   }
-+};
-+
-+struct MixedVectorWeakCurlOperatorInfo : public MixedVectorCurlOperatorInfoBase
-+{
-+   template <typename CoeffType>
-+   MixedVectorWeakCurlOperatorInfo(const mfem::FiniteElementSpace &trial_fes,
-+                                   const mfem::FiniteElementSpace &test_fes,
-+                                   CoeffType *Q, bool use_bdr = false,
-+                                   bool use_mf = false)
-+      : MixedVectorCurlOperatorInfoBase(trial_fes, test_fes, Q, use_bdr, use_mf)
-+   {
-+      MFEM_VERIFY(
-+         trial_fes.FEColl()->GetMapType(ctx.dim) == mfem::FiniteElement::H_DIV &&
-+         test_fes.FEColl()->GetDerivMapType(ctx.dim) == mfem::FiniteElement::H_DIV,
-+         "libCEED interface for MixedVectorWeakCurlIntegrator requires "
-+         "H(div) domain and H(curl) range FE spaces!");
-+      trial_op = EvalMode::Interp;
-+      test_op = EvalMode::Curl;
-+   }
-+};
-+#endif
-+
-+template <typename CoeffType>
-+PAMixedVectorCurlIntegrator::PAMixedVectorCurlIntegrator(
-+   const mfem::MixedVectorCurlIntegrator &integ,
-+   const mfem::FiniteElementSpace &trial_fes,
-+   const mfem::FiniteElementSpace &test_fes,
-+   CoeffType *Q,
-+   const bool use_bdr)
-+{
-+#ifdef MFEM_USE_CEED
-+   MixedVectorCurlOperatorInfo info(trial_fes, test_fes, Q, use_bdr);
-+   Assemble(integ, info, trial_fes, test_fes, Q, use_bdr);
-+#else
-+   MFEM_ABORT("MFEM must be built with MFEM_USE_CEED=YES to use libCEED.");
-+#endif
-+}
-+
-+template <typename CoeffType>
-+MFMixedVectorCurlIntegrator::MFMixedVectorCurlIntegrator(
-+   const mfem::MixedVectorCurlIntegrator &integ,
-+   const mfem::FiniteElementSpace &trial_fes,
-+   const mfem::FiniteElementSpace &test_fes,
-+   CoeffType *Q,
-+   const bool use_bdr)
-+{
-+#ifdef MFEM_USE_CEED
-+   MixedVectorCurlOperatorInfo info(trial_fes, test_fes, Q, use_bdr, true);
-+   Assemble(integ, info, trial_fes, test_fes, Q, use_bdr, true);
-+#else
-+   MFEM_ABORT("MFEM must be built with MFEM_USE_CEED=YES to use libCEED.");
-+#endif
-+}
-+
-+template <typename CoeffType>
-+PAMixedVectorWeakCurlIntegrator::PAMixedVectorWeakCurlIntegrator(
-+   const mfem::MixedVectorWeakCurlIntegrator &integ,
-+   const mfem::FiniteElementSpace &trial_fes,
-+   const mfem::FiniteElementSpace &test_fes,
-+   CoeffType *Q,
-+   const bool use_bdr)
-+{
-+#ifdef MFEM_USE_CEED
-+   MixedVectorWeakCurlOperatorInfo info(trial_fes, test_fes, Q, use_bdr);
-+   Assemble(integ, info, trial_fes, test_fes, Q, use_bdr);
-+#else
-+   MFEM_ABORT("MFEM must be built with MFEM_USE_CEED=YES to use libCEED.");
-+#endif
-+}
-+
-+template <typename CoeffType>
-+MFMixedVectorWeakCurlIntegrator::MFMixedVectorWeakCurlIntegrator(
-+   const mfem::MixedVectorWeakCurlIntegrator &integ,
-+   const mfem::FiniteElementSpace &trial_fes,
-+   const mfem::FiniteElementSpace &test_fes,
-+   CoeffType *Q,
-+   const bool use_bdr)
-+{
-+#ifdef MFEM_USE_CEED
-+   MixedVectorWeakCurlOperatorInfo info(trial_fes, test_fes, Q, use_bdr, true);
-+   Assemble(integ, info, trial_fes, test_fes, Q, use_bdr, true);
-+#else
-+   MFEM_ABORT("MFEM must be built with MFEM_USE_CEED=YES to use libCEED.");
-+#endif
-+}
-+
-+// @cond DOXYGEN_SKIP
-+
-+template PAMixedVectorCurlIntegrator::PAMixedVectorCurlIntegrator(
-+   const mfem::MixedVectorCurlIntegrator &, const mfem::FiniteElementSpace &,
-+   const mfem::FiniteElementSpace &, mfem::Coefficient *, const bool);
-+template PAMixedVectorCurlIntegrator::PAMixedVectorCurlIntegrator(
-+   const mfem::MixedVectorCurlIntegrator &, const mfem::FiniteElementSpace &,
-+   const mfem::FiniteElementSpace &, mfem::VectorCoefficient *, const bool);
-+template PAMixedVectorCurlIntegrator::PAMixedVectorCurlIntegrator(
-+   const mfem::MixedVectorCurlIntegrator &, const mfem::FiniteElementSpace &,
-+   const mfem::FiniteElementSpace &, mfem::MatrixCoefficient *, const bool);
-+
-+template MFMixedVectorCurlIntegrator::MFMixedVectorCurlIntegrator(
-+   const mfem::MixedVectorCurlIntegrator &, const mfem::FiniteElementSpace &,
-+   const mfem::FiniteElementSpace &, mfem::Coefficient *, const bool);
-+template MFMixedVectorCurlIntegrator::MFMixedVectorCurlIntegrator(
-+   const mfem::MixedVectorCurlIntegrator &, const mfem::FiniteElementSpace &,
-+   const mfem::FiniteElementSpace &, mfem::VectorCoefficient *, const bool);
-+template MFMixedVectorCurlIntegrator::MFMixedVectorCurlIntegrator(
-+   const mfem::MixedVectorCurlIntegrator &, const mfem::FiniteElementSpace &,
-+   const mfem::FiniteElementSpace &, mfem::MatrixCoefficient *, const bool);
-+
-+template PAMixedVectorWeakCurlIntegrator::PAMixedVectorWeakCurlIntegrator(
-+   const mfem::MixedVectorWeakCurlIntegrator &, const mfem::FiniteElementSpace &,
-+   const mfem::FiniteElementSpace &, mfem::Coefficient *, const bool);
-+template PAMixedVectorWeakCurlIntegrator::PAMixedVectorWeakCurlIntegrator(
-+   const mfem::MixedVectorWeakCurlIntegrator &, const mfem::FiniteElementSpace &,
-+   const mfem::FiniteElementSpace &, mfem::VectorCoefficient *, const bool);
-+template PAMixedVectorWeakCurlIntegrator::PAMixedVectorWeakCurlIntegrator(
-+   const mfem::MixedVectorWeakCurlIntegrator &, const mfem::FiniteElementSpace &,
-+   const mfem::FiniteElementSpace &, mfem::MatrixCoefficient *, const bool);
-+
-+template MFMixedVectorWeakCurlIntegrator::MFMixedVectorWeakCurlIntegrator(
-+   const mfem::MixedVectorWeakCurlIntegrator &, const mfem::FiniteElementSpace &,
-+   const mfem::FiniteElementSpace &, mfem::Coefficient *, const bool);
-+template MFMixedVectorWeakCurlIntegrator::MFMixedVectorWeakCurlIntegrator(
-+   const mfem::MixedVectorWeakCurlIntegrator &, const mfem::FiniteElementSpace &,
-+   const mfem::FiniteElementSpace &, mfem::VectorCoefficient *, const bool);
-+template MFMixedVectorWeakCurlIntegrator::MFMixedVectorWeakCurlIntegrator(
-+   const mfem::MixedVectorWeakCurlIntegrator &, const mfem::FiniteElementSpace &,
-+   const mfem::FiniteElementSpace &, mfem::MatrixCoefficient *, const bool);
-+
-+// @endcond
-+
-+} // namespace ceed
-+
-+} // namespace mfem
-diff --git a/fem/ceed/integrators/mixedveccurl/mixedveccurl.hpp b/fem/ceed/integrators/mixedveccurl/mixedveccurl.hpp
-new file mode 100644
-index 000000000..3bae19e0c
---- /dev/null
-+++ b/fem/ceed/integrators/mixedveccurl/mixedveccurl.hpp
-@@ -0,0 +1,85 @@
-+// Copyright (c) 2010-2023, Lawrence Livermore National Security, LLC. Produced
-+// at the Lawrence Livermore National Laboratory. All Rights reserved. See files
-+// LICENSE and NOTICE for details. LLNL-CODE-806117.
-+//
-+// This file is part of the MFEM library. For more information and source code
-+// availability visit https://mfem.org.
-+//
-+// MFEM is free software; you can redistribute it and/or modify it under the
-+// terms of the BSD-3 license. We welcome feedback and contributions, see file
-+// CONTRIBUTING.md for details.
-+
-+#ifndef MFEM_LIBCEED_MIXEDVECCURL_HPP
-+#define MFEM_LIBCEED_MIXEDVECCURL_HPP
-+
-+#include "../../interface/integrator.hpp"
-+#include "../../interface/mixed_operator.hpp"
-+#include "../../../fespace.hpp"
-+
-+namespace mfem
-+{
-+
-+namespace ceed
-+{
-+
-+/** Represent a MixedVectorCurlIntegrator with AssemblyLevel::Partial
-+    using libCEED. */
-+class PAMixedVectorCurlIntegrator : public MixedOperator<Integrator>
-+{
-+public:
-+   template <typename CoeffType>
-+   PAMixedVectorCurlIntegrator(
-+      const mfem::MixedVectorCurlIntegrator &integ,
-+      const mfem::FiniteElementSpace &trial_fes,
-+      const mfem::FiniteElementSpace &test_fes,
-+      CoeffType *Q,
-+      const bool use_bdr = false);
-+};
-+
-+/** Represent a MixedVectorCurlIntegrator with AssemblyLevel::None
-+    using libCEED. */
-+class MFMixedVectorCurlIntegrator : public MixedOperator<Integrator>
-+{
-+public:
-+   template <typename CoeffType>
-+   MFMixedVectorCurlIntegrator(
-+      const mfem::MixedVectorCurlIntegrator &integ,
-+      const mfem::FiniteElementSpace &trial_fes,
-+      const mfem::FiniteElementSpace &test_fes,
-+      CoeffType *Q,
-+      const bool use_bdr = false);
-+};
-+
-+/** Represent a MixedVectorWeakCurlIntegrator with AssemblyLevel::Partial
-+    using libCEED. */
-+class PAMixedVectorWeakCurlIntegrator : public MixedOperator<Integrator>
-+{
-+public:
-+   template <typename CoeffType>
-+   PAMixedVectorWeakCurlIntegrator(
-+      const mfem::MixedVectorWeakCurlIntegrator &integ,
-+      const mfem::FiniteElementSpace &trial_fes,
-+      const mfem::FiniteElementSpace &test_fes,
-+      CoeffType *Q,
-+      const bool use_bdr = false);
-+};
-+
-+/** Represent a MixedVectorWeakCurlIntegrator with AssemblyLevel::None
-+    using libCEED. */
-+class MFMixedVectorWeakCurlIntegrator : public MixedOperator<Integrator>
-+{
-+public:
-+   template <typename CoeffType>
-+   MFMixedVectorWeakCurlIntegrator(
-+      const mfem::MixedVectorWeakCurlIntegrator &integ,
-+      const mfem::FiniteElementSpace &trial_fes,
-+      const mfem::FiniteElementSpace &test_fes,
-+      CoeffType *Q,
-+      const bool use_bdr = false);
-+};
-+
-+}
-+
-+}
-+
-+#endif // MFEM_LIBCEED_MIXEDVECCURL_HPP
-diff --git a/fem/ceed/integrators/mixedvecgrad/mixedvecgrad.cpp b/fem/ceed/integrators/mixedvecgrad/mixedvecgrad.cpp
-new file mode 100644
-index 000000000..f8f708bab
---- /dev/null
-+++ b/fem/ceed/integrators/mixedvecgrad/mixedvecgrad.cpp
-@@ -0,0 +1,396 @@
-+// Copyright (c) 2010-2023, Lawrence Livermore National Security, LLC. Produced
-+// at the Lawrence Livermore National Laboratory. All Rights reserved. See files
-+// LICENSE and NOTICE for details. LLNL-CODE-806117.
-+//
-+// This file is part of the MFEM library. For more information and source code
-+// availability visit https://mfem.org.
-+//
-+// MFEM is free software; you can redistribute it and/or modify it under the
-+// terms of the BSD-3 license. We welcome feedback and contributions, see file
-+// CONTRIBUTING.md for details.
-+
-+#include "mixedvecgrad.hpp"
-+
-+#include "../../../../config/config.hpp"
-+#ifdef MFEM_USE_CEED
-+#include "../diffusion/diffusion_qf.h"
-+#endif
-+
-+namespace mfem
-+{
-+
-+namespace ceed
-+{
-+
-+#ifdef MFEM_USE_CEED
-+struct MixedVectorGradientOperatorInfoBase : public OperatorInfo
-+{
-+   DiffusionContext ctx = {0};
-+   template <typename CoeffType>
-+   MixedVectorGradientOperatorInfoBase(const mfem::FiniteElementSpace &trial_fes,
-+                                       const mfem::FiniteElementSpace &test_fes,
-+                                       CoeffType *Q, bool use_bdr = false,
-+                                       bool use_mf = false)
-+   {
-+      // Reuse H(curl) quadrature functions for DiffusionIntegrator
-+      MFEM_VERIFY(trial_fes.GetVDim() == 1 && test_fes.GetVDim() == 1,
-+                  "libCEED interface for vector FE does not support vdim > 1!");
-+      ctx.dim = trial_fes.GetMesh()->Dimension() - use_bdr;
-+      MFEM_VERIFY(ctx.dim == 2 || ctx.dim == 3,
-+                  "MixedVectorGradientIntegrator and MixedVectorWeakDivergenceIntegrator "
-+                  "require dim == 2 or dim == 3!");
-+      ctx.space_dim = trial_fes.GetMesh()->SpaceDimension();
-+      ctx.vdim = 1;
-+      if (!use_mf)
-+      {
-+         apply_func = ":f_apply_diff";
-+         apply_qf = &f_apply_diff;
-+      }
-+      else
-+      {
-+         build_func = "";
-+         build_qf = nullptr;
-+      }
-+      if (Q == nullptr)
-+      {
-+         ctx.coeff[0] = 1.0;
-+         if (!use_mf)
-+         {
-+            build_func = ":f_build_diff_const_scalar";
-+            build_qf = &f_build_diff_const_scalar;
-+         }
-+         else
-+         {
-+            apply_func = ":f_apply_diff_mf_const_scalar";
-+            apply_qf = &f_apply_diff_mf_const_scalar;
-+         }
-+      }
-+      else
-+      {
-+         InitCoefficient(*Q, use_mf);
-+      }
-+      header = "/integrators/diffusion/diffusion_qf.h";
-+      qdatasize = (ctx.dim * (ctx.dim + 1)) / 2;
-+   }
-+   void InitCoefficient(mfem::Coefficient &Q, bool use_mf)
-+   {
-+      if (mfem::ConstantCoefficient *const_coeff =
-+             dynamic_cast<mfem::ConstantCoefficient *>(&Q))
-+      {
-+         ctx.coeff[0] = const_coeff->constant;
-+         if (!use_mf)
-+         {
-+            build_func = ":f_build_diff_const_scalar";
-+            build_qf = &f_build_diff_const_scalar;
-+         }
-+         else
-+         {
-+            apply_func = ":f_apply_diff_mf_const_scalar";
-+            apply_qf = &f_apply_diff_mf_const_scalar;
-+         }
-+      }
-+      else
-+      {
-+         if (!use_mf)
-+         {
-+            build_func = ":f_build_diff_quad_scalar";
-+            build_qf = &f_build_diff_quad_scalar;
-+         }
-+         else
-+         {
-+            apply_func = ":f_apply_diff_mf_quad_scalar";
-+            apply_qf = &f_apply_diff_mf_quad_scalar;
-+         }
-+      }
-+   }
-+   void InitCoefficient(mfem::VectorCoefficient &VQ, bool use_mf)
-+   {
-+      if (mfem::VectorConstantCoefficient *const_coeff =
-+             dynamic_cast<mfem::VectorConstantCoefficient *>(&VQ))
-+      {
-+         const int vdim = VQ.GetVDim();
-+         MFEM_VERIFY(vdim <= LIBCEED_DIFF_COEFF_COMP_MAX,
-+                     "VectorCoefficient dimension exceeds context storage!");
-+         const mfem::Vector &val = const_coeff->GetVec();
-+         for (int i = 0; i < vdim; i++)
-+         {
-+            ctx.coeff[i] = val[i];
-+         }
-+         if (!use_mf)
-+         {
-+            build_func = ":f_build_diff_const_vector";
-+            build_qf = &f_build_diff_const_vector;
-+         }
-+         else
-+         {
-+            apply_func = ":f_apply_diff_mf_const_vector";
-+            apply_qf = &f_apply_diff_mf_const_vector;
-+         }
-+      }
-+      else
-+      {
-+         if (!use_mf)
-+         {
-+            build_func = ":f_build_diff_quad_vector";
-+            build_qf = &f_build_diff_quad_vector;
-+         }
-+         else
-+         {
-+            apply_func = ":f_apply_diff_mf_quad_vector";
-+            apply_qf = &f_apply_diff_mf_quad_vector;
-+         }
-+      }
-+   }
-+   void InitCoefficient(mfem::MatrixCoefficient &MQ, bool use_mf)
-+   {
-+      // Assumes matrix coefficient is symmetric
-+      if (mfem::MatrixConstantCoefficient *const_coeff =
-+             dynamic_cast<mfem::MatrixConstantCoefficient *>(&MQ))
-+      {
-+         const int vdim = MQ.GetVDim();
-+         MFEM_VERIFY((vdim * (vdim + 1)) / 2 <= LIBCEED_DIFF_COEFF_COMP_MAX,
-+                     "MatrixCoefficient dimensions exceed context storage!");
-+         const mfem::DenseMatrix &val = const_coeff->GetMatrix();
-+         for (int j = 0; j < vdim; j++)
-+         {
-+            for (int i = j; i < vdim; i++)
-+            {
-+               const int idx = (j * vdim) - (((j - 1) * j) / 2) + i - j;
-+               ctx.coeff[idx] = val(i, j);
-+            }
-+         }
-+         if (!use_mf)
-+         {
-+            build_func = ":f_build_diff_const_matrix";
-+            build_qf = &f_build_diff_const_matrix;
-+         }
-+         else
-+         {
-+            apply_func = ":f_apply_diff_mf_const_matrix";
-+            apply_qf = &f_apply_diff_mf_const_matrix;
-+         }
-+      }
-+      else
-+      {
-+         if (!use_mf)
-+         {
-+            build_func = ":f_build_diff_quad_matrix";
-+            build_qf = &f_build_diff_quad_matrix;
-+         }
-+         else
-+         {
-+            apply_func = ":f_apply_diff_mf_quad_matrix";
-+            apply_qf = &f_apply_diff_mf_quad_matrix;
-+         }
-+      }
-+   }
-+};
-+
-+struct MixedVectorGradientOperatorInfo :
-+   public MixedVectorGradientOperatorInfoBase
-+{
-+   template <typename CoeffType>
-+   MixedVectorGradientOperatorInfo(const mfem::FiniteElementSpace &trial_fes,
-+                                   const mfem::FiniteElementSpace &test_fes,
-+                                   CoeffType *Q, bool use_bdr = false,
-+                                   bool use_mf = false)
-+      : MixedVectorGradientOperatorInfoBase(trial_fes, test_fes, Q, use_bdr, use_mf)
-+   {
-+      MFEM_VERIFY(
-+         (trial_fes.FEColl()->GetDerivMapType(ctx.dim) == mfem::FiniteElement::H_CURL &&
-+          test_fes.FEColl()->GetMapType(ctx.dim) == mfem::FiniteElement::H_CURL),
-+         "libCEED interface for MixedVectorGradientIntegrator requires "
-+         "H^1 domain and H(curl) range FE spaces!");
-+      trial_op = EvalMode::Grad;
-+      test_op = EvalMode::Interp;
-+   }
-+};
-+
-+struct MixedVectorWeakDivergenceOperatorInfo :
-+   public MixedVectorGradientOperatorInfoBase
-+{
-+   template <typename CoeffType>
-+   MixedVectorWeakDivergenceOperatorInfo(const mfem::FiniteElementSpace &trial_fes,
-+                                         const mfem::FiniteElementSpace &test_fes,
-+                                         CoeffType *Q, bool use_bdr = false,
-+                                         bool use_mf = false)
-+      : MixedVectorGradientOperatorInfoBase(trial_fes, test_fes, Q, use_bdr, use_mf)
-+   {
-+      MFEM_VERIFY(
-+         (trial_fes.FEColl()->GetMapType(ctx.dim) == mfem::FiniteElement::H_CURL &&
-+          test_fes.FEColl()->GetDerivMapType(ctx.dim) == mfem::FiniteElement::H_CURL),
-+         "libCEED interface for MixedVectorWeakDivergenceIntegrator requires "
-+         "H(curl) domain and H^1 range FE spaces!");
-+      trial_op = EvalMode::Interp;
-+      test_op = EvalMode::Grad;
-+      for (int i = 0; i < LIBCEED_DIFF_COEFF_COMP_MAX; i++)
-+      {
-+         ctx.coeff[i] *= -1.0;
-+      }
-+   }
-+};
-+#endif
-+
-+template <typename CoeffType>
-+PAMixedVectorGradientIntegrator::PAMixedVectorGradientIntegrator(
-+   const mfem::MixedVectorGradientIntegrator &integ,
-+   const mfem::FiniteElementSpace &trial_fes,
-+   const mfem::FiniteElementSpace &test_fes,
-+   CoeffType *Q,
-+   const bool use_bdr)
-+{
-+#ifdef MFEM_USE_CEED
-+   MixedVectorGradientOperatorInfo info(trial_fes, test_fes, Q, use_bdr);
-+   Assemble(integ, info, trial_fes, test_fes, Q, use_bdr);
-+#else
-+   MFEM_ABORT("MFEM must be built with MFEM_USE_CEED=YES to use libCEED.");
-+#endif
-+}
-+
-+template <typename CoeffType>
-+MFMixedVectorGradientIntegrator::MFMixedVectorGradientIntegrator(
-+   const mfem::MixedVectorGradientIntegrator &integ,
-+   const mfem::FiniteElementSpace &trial_fes,
-+   const mfem::FiniteElementSpace &test_fes,
-+   CoeffType *Q,
-+   const bool use_bdr)
-+{
-+#ifdef MFEM_USE_CEED
-+   MixedVectorGradientOperatorInfo info(trial_fes, test_fes, Q, use_bdr, true);
-+   Assemble(integ, info, trial_fes, test_fes, Q, use_bdr, true);
-+#else
-+   MFEM_ABORT("MFEM must be built with MFEM_USE_CEED=YES to use libCEED.");
-+#endif
-+}
-+
-+namespace
-+{
-+
-+#ifdef MFEM_USE_CEED
-+mfem::Coefficient *NegativeCoeff(mfem::Coefficient &Q)
-+{
-+   return (dynamic_cast<mfem::ConstantCoefficient *>(&Q) != nullptr) ?
-+          nullptr : new mfem::ProductCoefficient(-1.0, Q);
-+}
-+
-+mfem::VectorCoefficient *NegativeCoeff(mfem::VectorCoefficient &Q)
-+{
-+   return (dynamic_cast<mfem::VectorConstantCoefficient *>(&Q) != nullptr) ?
-+          nullptr : new mfem::ScalarVectorProductCoefficient(-1.0, Q);
-+}
-+
-+mfem::MatrixCoefficient *NegativeCoeff(mfem::MatrixCoefficient &Q)
-+{
-+   return (dynamic_cast<mfem::MatrixConstantCoefficient *>(&Q) != nullptr) ?
-+          nullptr : new mfem::ScalarMatrixProductCoefficient(-1.0, Q);
-+}
-+#endif
-+
-+} // namespace
-+
-+template <typename CoeffType>
-+PAMixedVectorWeakDivergenceIntegrator::PAMixedVectorWeakDivergenceIntegrator(
-+   const mfem::MixedVectorWeakDivergenceIntegrator &integ,
-+   const mfem::FiniteElementSpace &trial_fes,
-+   const mfem::FiniteElementSpace &test_fes,
-+   CoeffType *Q,
-+   const bool use_bdr)
-+{
-+#ifdef MFEM_USE_CEED
-+   MixedVectorWeakDivergenceOperatorInfo info(trial_fes, test_fes, Q, use_bdr);
-+   if (Q)
-+   {
-+      // Does not inherit ownership of old Q
-+      auto *nQ = NegativeCoeff(*Q);
-+      Assemble(integ, info, trial_fes, test_fes, nQ, use_bdr);
-+      delete nQ;
-+   }
-+   else
-+   {
-+      Assemble(integ, info, trial_fes, test_fes, Q, use_bdr);
-+   }
-+#else
-+   MFEM_ABORT("MFEM must be built with MFEM_USE_CEED=YES to use libCEED.");
-+#endif
-+}
-+
-+template <typename CoeffType>
-+MFMixedVectorWeakDivergenceIntegrator::MFMixedVectorWeakDivergenceIntegrator(
-+   const mfem::MixedVectorWeakDivergenceIntegrator &integ,
-+   const mfem::FiniteElementSpace &trial_fes,
-+   const mfem::FiniteElementSpace &test_fes,
-+   CoeffType *Q,
-+   const bool use_bdr)
-+{
-+#ifdef MFEM_USE_CEED
-+   MixedVectorWeakDivergenceOperatorInfo info(trial_fes, test_fes, Q, use_bdr,
-+                                              true);
-+   if (Q)
-+   {
-+      // Does not inherit ownership of old Q
-+      auto *nQ = NegativeCoeff(*Q);
-+      Assemble(integ, info, trial_fes, test_fes, nQ, use_bdr, true);
-+      delete nQ;
-+   }
-+   else
-+   {
-+      Assemble(integ, info, trial_fes, test_fes, Q, use_bdr, true);
-+   }
-+#else
-+   MFEM_ABORT("MFEM must be built with MFEM_USE_CEED=YES to use libCEED.");
-+#endif
-+}
-+
-+// @cond DOXYGEN_SKIP
-+
-+template PAMixedVectorGradientIntegrator::PAMixedVectorGradientIntegrator(
-+   const mfem::MixedVectorGradientIntegrator &, const mfem::FiniteElementSpace &,
-+   const mfem::FiniteElementSpace &, mfem::Coefficient *, const bool);
-+template PAMixedVectorGradientIntegrator::PAMixedVectorGradientIntegrator(
-+   const mfem::MixedVectorGradientIntegrator &, const mfem::FiniteElementSpace &,
-+   const mfem::FiniteElementSpace &, mfem::VectorCoefficient *, const bool);
-+template PAMixedVectorGradientIntegrator::PAMixedVectorGradientIntegrator(
-+   const mfem::MixedVectorGradientIntegrator &, const mfem::FiniteElementSpace &,
-+   const mfem::FiniteElementSpace &, mfem::MatrixCoefficient *, const bool);
-+
-+template PAMixedVectorWeakDivergenceIntegrator::PAMixedVectorWeakDivergenceIntegrator(
-+   const mfem::MixedVectorWeakDivergenceIntegrator &,
-+   const mfem::FiniteElementSpace &, const mfem::FiniteElementSpace &,
-+   mfem::Coefficient *, const bool);
-+template PAMixedVectorWeakDivergenceIntegrator::PAMixedVectorWeakDivergenceIntegrator(
-+   const mfem::MixedVectorWeakDivergenceIntegrator &,
-+   const mfem::FiniteElementSpace &, const mfem::FiniteElementSpace &,
-+   mfem::VectorCoefficient *, const bool);
-+template PAMixedVectorWeakDivergenceIntegrator::PAMixedVectorWeakDivergenceIntegrator(
-+   const mfem::MixedVectorWeakDivergenceIntegrator &,
-+   const mfem::FiniteElementSpace &, const mfem::FiniteElementSpace &,
-+   mfem::MatrixCoefficient *, const bool);
-+
-+template MFMixedVectorGradientIntegrator::MFMixedVectorGradientIntegrator(
-+   const mfem::MixedVectorGradientIntegrator &, const mfem::FiniteElementSpace &,
-+   const mfem::FiniteElementSpace &, mfem::Coefficient *, const bool);
-+template MFMixedVectorGradientIntegrator::MFMixedVectorGradientIntegrator(
-+   const mfem::MixedVectorGradientIntegrator &, const mfem::FiniteElementSpace &,
-+   const mfem::FiniteElementSpace &, mfem::VectorCoefficient *, const bool);
-+template MFMixedVectorGradientIntegrator::MFMixedVectorGradientIntegrator(
-+   const mfem::MixedVectorGradientIntegrator &, const mfem::FiniteElementSpace &,
-+   const mfem::FiniteElementSpace &, mfem::MatrixCoefficient *, const bool);
-+
-+template MFMixedVectorWeakDivergenceIntegrator::MFMixedVectorWeakDivergenceIntegrator(
-+   const mfem::MixedVectorWeakDivergenceIntegrator &,
-+   const mfem::FiniteElementSpace &, const mfem::FiniteElementSpace &,
-+   mfem::Coefficient *, const bool);
-+template MFMixedVectorWeakDivergenceIntegrator::MFMixedVectorWeakDivergenceIntegrator(
-+   const mfem::MixedVectorWeakDivergenceIntegrator &,
-+   const mfem::FiniteElementSpace &, const mfem::FiniteElementSpace &,
-+   mfem::VectorCoefficient *, const bool);
-+template MFMixedVectorWeakDivergenceIntegrator::MFMixedVectorWeakDivergenceIntegrator(
-+   const mfem::MixedVectorWeakDivergenceIntegrator &,
-+   const mfem::FiniteElementSpace &, const mfem::FiniteElementSpace &,
-+   mfem::MatrixCoefficient *, const bool);
-+
-+// @endcond
-+
-+} // namespace ceed
-+
-+} // namespace mfem
-diff --git a/fem/ceed/integrators/mixedvecgrad/mixedvecgrad.hpp b/fem/ceed/integrators/mixedvecgrad/mixedvecgrad.hpp
-new file mode 100644
-index 000000000..c4220ea0f
---- /dev/null
-+++ b/fem/ceed/integrators/mixedvecgrad/mixedvecgrad.hpp
-@@ -0,0 +1,85 @@
-+// Copyright (c) 2010-2023, Lawrence Livermore National Security, LLC. Produced
-+// at the Lawrence Livermore National Laboratory. All Rights reserved. See files
-+// LICENSE and NOTICE for details. LLNL-CODE-806117.
-+//
-+// This file is part of the MFEM library. For more information and source code
-+// availability visit https://mfem.org.
-+//
-+// MFEM is free software; you can redistribute it and/or modify it under the
-+// terms of the BSD-3 license. We welcome feedback and contributions, see file
-+// CONTRIBUTING.md for details.
-+
-+#ifndef MFEM_LIBCEED_MIXEDVECGRAD_HPP
-+#define MFEM_LIBCEED_MIXEDVECGRAD_HPP
-+
-+#include "../../interface/integrator.hpp"
-+#include "../../interface/mixed_operator.hpp"
-+#include "../../../fespace.hpp"
-+
-+namespace mfem
-+{
-+
-+namespace ceed
-+{
-+
-+/** Represent a MixedVectorGradientIntegrator with AssemblyLevel::Partial
-+    using libCEED. */
-+class PAMixedVectorGradientIntegrator : public MixedOperator<Integrator>
-+{
-+public:
-+   template <typename CoeffType>
-+   PAMixedVectorGradientIntegrator(
-+      const mfem::MixedVectorGradientIntegrator &integ,
-+      const mfem::FiniteElementSpace &trial_fes,
-+      const mfem::FiniteElementSpace &test_fes,
-+      CoeffType *Q,
-+      const bool use_bdr = false);
-+};
-+
-+/** Represent a MixedVectorGradientIntegrator with AssemblyLevel::None
-+    using libCEED. */
-+class MFMixedVectorGradientIntegrator : public MixedOperator<Integrator>
-+{
-+public:
-+   template <typename CoeffType>
-+   MFMixedVectorGradientIntegrator(
-+      const mfem::MixedVectorGradientIntegrator &integ,
-+      const mfem::FiniteElementSpace &trial_fes,
-+      const mfem::FiniteElementSpace &test_fes,
-+      CoeffType *Q,
-+      const bool use_bdr = false);
-+};
-+
-+/** Represent a MixedVectorWeakDivergenceIntegrator with AssemblyLevel::Partial
-+    using libCEED. */
-+class PAMixedVectorWeakDivergenceIntegrator : public MixedOperator<Integrator>
-+{
-+public:
-+   template <typename CoeffType>
-+   PAMixedVectorWeakDivergenceIntegrator(
-+      const mfem::MixedVectorWeakDivergenceIntegrator &integ,
-+      const mfem::FiniteElementSpace &trial_fes,
-+      const mfem::FiniteElementSpace &test_fes,
-+      CoeffType *Q,
-+      const bool use_bdr = false);
-+};
-+
-+/** Represent a MixedVectorWeakDivergenceIntegrator with AssemblyLevel::None
-+    using libCEED. */
-+class MFMixedVectorWeakDivergenceIntegrator : public MixedOperator<Integrator>
-+{
-+public:
-+   template <typename CoeffType>
-+   MFMixedVectorWeakDivergenceIntegrator(
-+      const mfem::MixedVectorWeakDivergenceIntegrator &integ,
-+      const mfem::FiniteElementSpace &trial_fes,
-+      const mfem::FiniteElementSpace &test_fes,
-+      CoeffType *Q,
-+      const bool use_bdr = false);
-+};
-+
-+}
-+
-+}
-+
-+#endif // MFEM_LIBCEED_MIXEDVECGRAD_HPP
-diff --git a/fem/ceed/integrators/nlconvection/nlconvection.cpp b/fem/ceed/integrators/nlconvection/nlconvection.cpp
-index ba4a274dc..c285051ee 100644
---- a/fem/ceed/integrators/nlconvection/nlconvection.cpp
-+++ b/fem/ceed/integrators/nlconvection/nlconvection.cpp
-@@ -25,76 +25,98 @@ namespace ceed
- #ifdef MFEM_USE_CEED
- struct NLConvectionOperatorInfo : public OperatorInfo
- {
--   NLConvectionContext ctx;
--   NLConvectionOperatorInfo(int dim)
-+   NLConvectionContext ctx = {0};
-+   NLConvectionOperatorInfo(const mfem::FiniteElementSpace &fes,
-+                            mfem::Coefficient *Q, bool use_bdr = false,
-+                            bool use_mf = false)
-    {
-+      MFEM_VERIFY(fes.GetVDim() == fes.GetMesh()->SpaceDimension(),
-+                  "Missing coefficient in ceed::NLConvectionOperatorInfo!");
-+      ctx.dim = fes.GetMesh()->Dimension() - use_bdr;
-+      ctx.space_dim = fes.GetMesh()->SpaceDimension();
-+      if (!use_mf)
-+      {
-+         apply_func = ":f_apply_conv";
-+         apply_qf = &f_apply_conv;
-+      }
-+      else
-+      {
-+         build_func = "";
-+         build_qf = nullptr;
-+      }
-+      if (Q == nullptr)
-+      {
-+         ctx.coeff = 1.0;
-+         if (!use_mf)
-+         {
-+            build_func = ":f_build_conv_const";
-+            build_qf = &f_build_conv_const;
-+         }
-+         else
-+         {
-+            apply_func = ":f_apply_conv_mf_const";
-+            apply_qf = &f_apply_conv_mf_const;
-+         }
-+      }
-+      else if (mfem::ConstantCoefficient *const_coeff =
-+                  dynamic_cast<mfem::ConstantCoefficient *>(Q))
-+      {
-+         ctx.coeff = const_coeff->constant;
-+         if (!use_mf)
-+         {
-+            build_func = ":f_build_conv_const";
-+            build_qf = &f_build_conv_const;
-+         }
-+         else
-+         {
-+            apply_func = ":f_apply_conv_mf_const";
-+            apply_qf = &f_apply_conv_mf_const;
-+         }
-+      }
-+      else
-+      {
-+         if (!use_mf)
-+         {
-+            build_func = ":f_build_conv_quad";
-+            build_qf = &f_build_conv_quad;
-+         }
-+         else
-+         {
-+            apply_func = ":f_apply_conv_mf_quad";
-+            apply_qf = &f_apply_conv_mf_quad;
-+         }
-+      }
-       header = "/integrators/nlconvection/nlconvection_qf.h";
--      build_func_const = ":f_build_conv_const";
--      build_qf_const = &f_build_conv_const;
--      build_func_quad = ":f_build_conv_quad";
--      build_qf_quad = &f_build_conv_quad;
--      apply_func = ":f_apply_conv";
--      apply_qf = &f_apply_conv;
--      apply_func_mf_const = ":f_apply_conv_mf_const";
--      apply_qf_mf_const = &f_apply_conv_mf_const;
--      apply_func_mf_quad = ":f_apply_conv_mf_quad";
--      apply_qf_mf_quad = &f_apply_conv_mf_quad;
-       trial_op = EvalMode::InterpAndGrad;
-       test_op = EvalMode::Interp;
--      qdatasize = dim * dim;
-+      qdatasize = ctx.dim * ctx.space_dim;
-    }
- };
- #endif
- 
--PAVectorConvectionNLFIntegrator::PAVectorConvectionNLFIntegrator(
-+PAVectorConvectionNLIntegrator::PAVectorConvectionNLIntegrator(
-+   const mfem::VectorConvectionNLFIntegrator &integ,
-    const mfem::FiniteElementSpace &fes,
--   const mfem::IntegrationRule &irm,
--   mfem::Coefficient *Q)
--   : PAIntegrator()
-+   mfem::Coefficient *Q,
-+   const bool use_bdr)
- {
- #ifdef MFEM_USE_CEED
--   NLConvectionOperatorInfo info(fes.GetMesh()->Dimension());
--   Assemble(info, fes, irm, Q);
-+   NLConvectionOperatorInfo info(fes, Q, use_bdr);
-+   Assemble(integ, info, fes, Q, use_bdr);
- #else
-    MFEM_ABORT("MFEM must be built with MFEM_USE_CEED=YES to use libCEED.");
- #endif
- }
- 
--MixedPAVectorConvectionNLIntegrator::MixedPAVectorConvectionNLIntegrator(
--   const VectorConvectionNLFIntegrator &integ,
-+MFVectorConvectionNLIntegrator::MFVectorConvectionNLIntegrator(
-+   const mfem::VectorConvectionNLFIntegrator &integ,
-    const mfem::FiniteElementSpace &fes,
--   mfem::Coefficient *Q)
-+   mfem::Coefficient *Q,
-+   const bool use_bdr)
- {
- #ifdef MFEM_USE_CEED
--   NLConvectionOperatorInfo info(fes.GetMesh()->Dimension());
--   Assemble(integ, info, fes, Q);
--#else
--   MFEM_ABORT("MFEM must be built with MFEM_USE_CEED=YES to use libCEED.");
--#endif
--}
--
--MFVectorConvectionNLFIntegrator::MFVectorConvectionNLFIntegrator(
--   const mfem::FiniteElementSpace &fes,
--   const mfem::IntegrationRule &irm,
--   mfem::Coefficient *Q)
--   : MFIntegrator()
--{
--#ifdef MFEM_USE_CEED
--   NLConvectionOperatorInfo info(fes.GetMesh()->Dimension());
--   Assemble(info, fes, irm, Q);
--#else
--   MFEM_ABORT("MFEM must be built with MFEM_USE_CEED=YES to use libCEED.");
--#endif
--}
--
--MixedMFVectorConvectionNLIntegrator::MixedMFVectorConvectionNLIntegrator(
--   const VectorConvectionNLFIntegrator &integ,
--   const mfem::FiniteElementSpace &fes,
--   mfem::Coefficient *Q)
--{
--#ifdef MFEM_USE_CEED
--   NLConvectionOperatorInfo info(fes.GetMesh()->Dimension());
--   Assemble(integ, info, fes, Q);
-+   NLConvectionOperatorInfo info(fes, Q, use_bdr, true);
-+   Assemble(integ, info, fes, Q, use_bdr, true);
- #else
-    MFEM_ABORT("MFEM must be built with MFEM_USE_CEED=YES to use libCEED.");
- #endif
-diff --git a/fem/ceed/integrators/nlconvection/nlconvection.hpp b/fem/ceed/integrators/nlconvection/nlconvection.hpp
-index 3efe88728..cf245322a 100644
---- a/fem/ceed/integrators/nlconvection/nlconvection.hpp
-+++ b/fem/ceed/integrators/nlconvection/nlconvection.hpp
-@@ -13,7 +13,7 @@
- #define MFEM_LIBCEED_NLCONV_HPP
- 
- #include "../../interface/integrator.hpp"
--#include "../../interface/mixed_integrator.hpp"
-+#include "../../interface/mixed_operator.hpp"
- #include "../../../fespace.hpp"
- 
- namespace mfem
-@@ -24,40 +24,26 @@ namespace ceed
- 
- /** Represent a VectorConvectionNLFIntegrator with AssemblyLevel::Partial
-     using libCEED. */
--class PAVectorConvectionNLFIntegrator : public PAIntegrator
-+class PAVectorConvectionNLIntegrator : public MixedOperator<Integrator>
- {
- public:
--   PAVectorConvectionNLFIntegrator(const mfem::FiniteElementSpace &fes,
--                                   const mfem::IntegrationRule &irm,
--                                   mfem::Coefficient *coeff);
--};
--
--class MixedPAVectorConvectionNLIntegrator : public MixedIntegrator<PAIntegrator>
--{
--public:
--   MixedPAVectorConvectionNLIntegrator(
--      const VectorConvectionNLFIntegrator &integ,
-+   PAVectorConvectionNLIntegrator(
-+      const mfem::VectorConvectionNLFIntegrator &integ,
-       const mfem::FiniteElementSpace &fes,
--      mfem::Coefficient *Q);
-+      mfem::Coefficient *Q,
-+      const bool use_bdr = false);
- };
- 
- /** Represent a VectorConvectionNLFIntegrator with AssemblyLevel::None
-     using libCEED. */
--class MFVectorConvectionNLFIntegrator : public MFIntegrator
--{
--public:
--   MFVectorConvectionNLFIntegrator(const mfem::FiniteElementSpace &fes,
--                                   const mfem::IntegrationRule &irm,
--                                   mfem::Coefficient *coeff);
--};
--
--class MixedMFVectorConvectionNLIntegrator : public MixedIntegrator<MFIntegrator>
-+class MFVectorConvectionNLIntegrator : public MixedOperator<Integrator>
- {
- public:
--   MixedMFVectorConvectionNLIntegrator(
--      const VectorConvectionNLFIntegrator &integ,
-+   MFVectorConvectionNLIntegrator(
-+      const mfem::VectorConvectionNLFIntegrator &integ,
-       const mfem::FiniteElementSpace &fes,
--      mfem::Coefficient *Q);
-+      mfem::Coefficient *Q,
-+      const bool use_bdr = false);
- };
- 
- }
-diff --git a/fem/ceed/integrators/nlconvection/nlconvection_qf.h b/fem/ceed/integrators/nlconvection/nlconvection_qf.h
-index ef0d41327..ee1782784 100644
---- a/fem/ceed/integrators/nlconvection/nlconvection_qf.h
-+++ b/fem/ceed/integrators/nlconvection/nlconvection_qf.h
-@@ -9,186 +9,155 @@
- // terms of the BSD-3 license. We welcome feedback and contributions, see file
- // CONTRIBUTING.md for details.
- 
--/// A structure used to pass additional data to f_build_conv and f_apply_conv
--struct NLConvectionContext { CeedInt dim, space_dim, vdim; CeedScalar coeff; };
-+#ifndef MFEM_LIBCEED_NLCONV_QF_H
-+#define MFEM_LIBCEED_NLCONV_QF_H
- 
--/// libCEED Q-function for building quadrature data for a convection operator
--/// with a constant coefficient
-+#include "../util/util_qf.h"
-+
-+struct NLConvectionContext
-+{
-+   CeedInt dim, space_dim;
-+   CeedScalar coeff;
-+};
-+
-+/// libCEED QFunction for building quadrature data for a convection
-+/// operator with a constant coefficient
- CEED_QFUNCTION(f_build_conv_const)(void *ctx, CeedInt Q,
-                                    const CeedScalar *const *in,
-                                    CeedScalar *const *out)
- {
--   NLConvectionContext *bc = (NLConvectionContext*)ctx;
--   // in[0] is Jacobians with shape [dim, nc=dim, Q]
-+   NLConvectionContext *bc = (NLConvectionContext *)ctx;
-+   // in[0] is Jacobians with shape [dim, ncomp=space_dim, Q]
-    // in[1] is quadrature weights, size (Q)
-    //
--   // At every quadrature point, compute and store qw * adj(J).
-+   // At every quadrature point, compute and store qw * c * adj(J)^T
-    const CeedScalar coeff = bc->coeff;
-    const CeedScalar *J = in[0], *qw = in[1];
-    CeedScalar *qd = out[0];
--   switch (bc->dim + 10 * bc->space_dim)
-+   switch (10 * bc->space_dim + bc->dim)
-    {
-       case 11:
--         for (CeedInt i = 0; i < Q; i++)
-+         CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++)
-+         {
-+            qd[i] = qw[i] * coeff * J[i];
-+         }
-+         break;
-+      case 21:
-+         CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++)
-          {
--            qd[i] = coeff * qw[i] * J[i];
-+            MultAdjJt21(J + i, Q, qw[i] * coeff, Q, qd + i);
-          }
-          break;
-       case 22:
--         for (CeedInt i = 0; i < Q; i++)
-+         CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++)
-          {
--            // J: 0 2   qd: 0 2   adj(J):  J22 -J12
--            //    1 3       1 3           -J21  J11
--            const CeedScalar J11 = J[i + Q * 0];
--            const CeedScalar J21 = J[i + Q * 1];
--            const CeedScalar J12 = J[i + Q * 2];
--            const CeedScalar J22 = J[i + Q * 3];
--            const CeedScalar w = qw[i] * coeff;
--            qd[i + Q * 0] =  w * J22;
--            qd[i + Q * 1] = -w * J21;
--            qd[i + Q * 2] = -w * J12;
--            qd[i + Q * 3] =  w * J11;
-+            MultAdjJt22(J + i, Q, qw[i] * coeff, Q, qd + i);
-+         }
-+         break;
-+      case 32:
-+         CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++)
-+         {
-+            MultAdjJt32(J + i, Q, qw[i] * coeff, Q, qd + i);
-          }
-          break;
-       case 33:
--         for (CeedInt i = 0; i < Q; i++)
-+         CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++)
-          {
--            // J: 0 3 6   qd: 0 3 6
--            //    1 4 7       1 4 7
--            //    2 5 8       2 5 8
--            const CeedScalar J11 = J[i + Q * 0];
--            const CeedScalar J21 = J[i + Q * 1];
--            const CeedScalar J31 = J[i + Q * 2];
--            const CeedScalar J12 = J[i + Q * 3];
--            const CeedScalar J22 = J[i + Q * 4];
--            const CeedScalar J32 = J[i + Q * 5];
--            const CeedScalar J13 = J[i + Q * 6];
--            const CeedScalar J23 = J[i + Q * 7];
--            const CeedScalar J33 = J[i + Q * 8];
--            const CeedScalar A11 = J22 * J33 - J23 * J32;
--            const CeedScalar A12 = J13 * J32 - J12 * J33;
--            const CeedScalar A13 = J12 * J23 - J13 * J22;
--            const CeedScalar A21 = J23 * J31 - J21 * J33;
--            const CeedScalar A22 = J11 * J33 - J13 * J31;
--            const CeedScalar A23 = J13 * J21 - J11 * J23;
--            const CeedScalar A31 = J21 * J32 - J22 * J31;
--            const CeedScalar A32 = J12 * J31 - J11 * J32;
--            const CeedScalar A33 = J11 * J22 - J12 * J21;
--            const CeedScalar w = qw[i] * coeff;
--            qd[i + Q * 0] = w * A11;
--            qd[i + Q * 1] = w * A21;
--            qd[i + Q * 2] = w * A31;
--            qd[i + Q * 3] = w * A12;
--            qd[i + Q * 4] = w * A22;
--            qd[i + Q * 5] = w * A32;
--            qd[i + Q * 6] = w * A13;
--            qd[i + Q * 7] = w * A23;
--            qd[i + Q * 8] = w * A33;
-+            MultAdjJt33(J + i, Q, qw[i] * coeff, Q, qd + i);
-          }
-          break;
-    }
-    return 0;
- }
- 
--/// libCEED Q-function for building quadrature data for a convection operator
--/// coefficient evaluated at quadrature points.
-+/// libCEED QFunction for building quadrature data for a convection
-+/// operator with a coefficient evaluated at quadrature points
- CEED_QFUNCTION(f_build_conv_quad)(void *ctx, CeedInt Q,
-                                   const CeedScalar *const *in,
-                                   CeedScalar *const *out)
- {
-    NLConvectionContext *bc = (NLConvectionContext *)ctx;
--   // in[1] is Jacobians with shape [dim, nc=dim, Q]
-+   // in[0] is coefficients, size (Q)
-+   // in[1] is Jacobians with shape [dim, ncomp=space_dim, Q]
-    // in[2] is quadrature weights, size (Q)
-    //
--   // At every quadrature point, compute and store qw * adj(J).
-+   // At every quadrature point, compute and store qw * c * adj(J)^T
-    const CeedScalar *c = in[0], *J = in[1], *qw = in[2];
-    CeedScalar *qd = out[0];
--   switch (bc->dim + 10 * bc->space_dim)
-+   switch (10 * bc->space_dim + bc->dim)
-    {
-       case 11:
--         for (CeedInt i = 0; i < Q; i++)
-+         CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++)
-+         {
-+            qd[i] = qw[i] * c[i] * J[i];
-+         }
-+         break;
-+      case 21:
-+         CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++)
-          {
--            const CeedScalar coeff = c[i];
--            qd[i] = coeff * qw[i] * J[i];
-+            MultAdjJt21(J + i, Q, qw[i] * c[i], Q, qd + i);
-          }
-          break;
-       case 22:
--         for (CeedInt i = 0; i < Q; i++)
-+         CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++)
-          {
--            // J: 0 2   qd: 0 2   adj(J):  J22 -J12
--            //    1 3       1 3           -J21  J11
--            const CeedScalar J11 = J[i + Q * 0];
--            const CeedScalar J21 = J[i + Q * 1];
--            const CeedScalar J12 = J[i + Q * 2];
--            const CeedScalar J22 = J[i + Q * 3];
--            const CeedScalar coeff = c[i];
--            const CeedScalar w = qw[i] * coeff;
--            qd[i + Q * 0] =  w * J22;
--            qd[i + Q * 1] = -w * J21;
--            qd[i + Q * 2] = -w * J12;
--            qd[i + Q * 3] =  w * J11;
-+            MultAdjJt22(J + i, Q, qw[i] * c[i], Q, qd + i);
-+         }
-+         break;
-+      case 32:
-+         CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++)
-+         {
-+            MultAdjJt32(J + i, Q, qw[i] * c[i], Q, qd + i);
-          }
-          break;
-       case 33:
--         for (CeedInt i = 0; i < Q; i++)
-+         CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++)
-          {
--            // J: 0 3 6   qd: 0 3 6
--            //    1 4 7       1 4 7
--            //    2 5 8       2 5 8
--            const CeedScalar J11 = J[i + Q * 0];
--            const CeedScalar J21 = J[i + Q * 1];
--            const CeedScalar J31 = J[i + Q * 2];
--            const CeedScalar J12 = J[i + Q * 3];
--            const CeedScalar J22 = J[i + Q * 4];
--            const CeedScalar J32 = J[i + Q * 5];
--            const CeedScalar J13 = J[i + Q * 6];
--            const CeedScalar J23 = J[i + Q * 7];
--            const CeedScalar J33 = J[i + Q * 8];
--            const CeedScalar A11 = J22 * J33 - J23 * J32;
--            const CeedScalar A12 = J13 * J32 - J12 * J33;
--            const CeedScalar A13 = J12 * J23 - J13 * J22;
--            const CeedScalar A21 = J23 * J31 - J21 * J33;
--            const CeedScalar A22 = J11 * J33 - J13 * J31;
--            const CeedScalar A23 = J13 * J21 - J11 * J23;
--            const CeedScalar A31 = J21 * J32 - J22 * J31;
--            const CeedScalar A32 = J12 * J31 - J11 * J32;
--            const CeedScalar A33 = J11 * J22 - J12 * J21;
--            const CeedScalar coeff = c[i];
--            const CeedScalar w = qw[i] * coeff;
--            qd[i + Q * 0] = w * A11;
--            qd[i + Q * 1] = w * A21;
--            qd[i + Q * 2] = w * A31;
--            qd[i + Q * 3] = w * A12;
--            qd[i + Q * 4] = w * A22;
--            qd[i + Q * 5] = w * A32;
--            qd[i + Q * 6] = w * A13;
--            qd[i + Q * 7] = w * A23;
--            qd[i + Q * 8] = w * A33;
-+            MultAdjJt33(J + i, Q, qw[i] * c[i], Q, qd + i);
-          }
-          break;
-    }
-    return 0;
- }
- 
--/// libCEED Q-function for applying a conv operator
-+/// libCEED QFunction for applying a convection operator
- CEED_QFUNCTION(f_apply_conv)(void *ctx, CeedInt Q,
-                              const CeedScalar *const *in,
-                              CeedScalar *const *out)
- {
-    NLConvectionContext *bc = (NLConvectionContext *)ctx;
--   // in[0], out[0] have shape [dim, nc=1, Q]
-+   // in[0] has shape [ncomp=space_dim, Q]
-+   // in[1] has shape [dim, ncomp=space_dim, Q]
-+   // out[0] has shape [ncomp=space_dim, Q]
-    const CeedScalar *u = in[0], *ug = in[1], *qd = in[2];
-    CeedScalar *vg = out[0];
--   switch (10*bc->dim + bc->vdim)
-+   switch (10 * bc->space_dim + bc->dim)
-    {
-       case 11:
--         for (CeedInt i = 0; i < Q; i++)
-+         CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++)
-+         {
-+            vg[i] = qd[i] * u[i] * ug[i];
-+         }
-+         break;
-+      case 21:
-+         CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++)
-          {
--            vg[i] = u[i] * ug[i] * qd[i];
-+            const CeedScalar qd00 = qd[i + Q * 0];
-+            const CeedScalar qd10 = qd[i + Q * 1];
-+            const CeedScalar u0   = u[i + Q * 0];
-+            const CeedScalar u1   = u[i + Q * 1];
-+            const CeedScalar ug00 = ug[i + Q * 0];
-+            const CeedScalar ug10 = ug[i + Q * 1];
-+            const CeedScalar Dxu0 = qd00 * ug00;
-+            const CeedScalar Dyu0 = qd10 * ug00;
-+            const CeedScalar Dxu1 = qd00 * ug10;
-+            const CeedScalar Dyu1 = qd10 * ug10;
-+            vg[i + Q * 0] = u0 * Dxu0 + u1 * Dyu0;
-+            vg[i + Q * 1] = u0 * Dxu1 + u1 * Dyu1;
-          }
-          break;
-       case 22:
--         for (CeedInt i = 0; i < Q; i++)
-+         CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++)
-          {
-             const CeedScalar qd00 = qd[i + Q * 0];
-             const CeedScalar qd10 = qd[i + Q * 1];
-@@ -200,16 +169,47 @@ CEED_QFUNCTION(f_apply_conv)(void *ctx, CeedInt Q,
-             const CeedScalar ug10 = ug[i + Q * 1];
-             const CeedScalar ug01 = ug[i + Q * 2];
-             const CeedScalar ug11 = ug[i + Q * 3];
--            const CeedScalar Dxu0 = ug00 * qd00 + ug01 * qd10;
--            const CeedScalar Dyu0 = ug00 * qd01 + ug01 * qd11;
--            const CeedScalar Dxu1 = ug10 * qd00 + ug11 * qd10;
--            const CeedScalar Dyu1 = ug10 * qd01 + ug11 * qd11;
-+            const CeedScalar Dxu0 = qd00 * ug00 + qd01 * ug01;
-+            const CeedScalar Dyu0 = qd10 * ug00 + qd11 * ug01;
-+            const CeedScalar Dxu1 = qd00 * ug10 + qd01 * ug11;
-+            const CeedScalar Dyu1 = qd10 * ug10 + qd11 * ug11;
-             vg[i + Q * 0] = u0 * Dxu0 + u1 * Dyu0;
-             vg[i + Q * 1] = u0 * Dxu1 + u1 * Dyu1;
-          }
-          break;
-+      case 32:
-+         CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++)
-+         {
-+            const CeedScalar qd00 = qd[i + Q * 0];
-+            const CeedScalar qd10 = qd[i + Q * 1];
-+            const CeedScalar qd20 = qd[i + Q * 2];
-+            const CeedScalar qd01 = qd[i + Q * 3];
-+            const CeedScalar qd11 = qd[i + Q * 4];
-+            const CeedScalar qd21 = qd[i + Q * 5];
-+            const CeedScalar u0   = u[i + Q * 0];
-+            const CeedScalar u1   = u[i + Q * 1];
-+            const CeedScalar u2   = u[i + Q * 2];
-+            const CeedScalar ug00 = ug[i + Q * 0];
-+            const CeedScalar ug10 = ug[i + Q * 1];
-+            const CeedScalar ug20 = ug[i + Q * 2];
-+            const CeedScalar ug01 = ug[i + Q * 3];
-+            const CeedScalar ug11 = ug[i + Q * 4];
-+            const CeedScalar ug21 = ug[i + Q * 5];
-+            const CeedScalar Dxu0 = qd00 * ug00 + qd01 * ug01;
-+            const CeedScalar Dyu0 = qd10 * ug00 + qd11 * ug01;
-+            const CeedScalar Dzu0 = qd20 * ug00 + qd21 * ug01;
-+            const CeedScalar Dxu1 = qd00 * ug10 + qd01 * ug11;
-+            const CeedScalar Dyu1 = qd10 * ug10 + qd11 * ug11;
-+            const CeedScalar Dzu1 = qd20 * ug10 + qd21 * ug11;
-+            const CeedScalar Dxu2 = qd00 * ug20 + qd01 * ug21;
-+            const CeedScalar Dyu2 = qd10 * ug20 + qd11 * ug21;
-+            const CeedScalar Dzu2 = qd20 * ug20 + qd21 * ug21;
-+            vg[i + Q * 0] = u0 * Dxu0 + u1 * Dyu0 + u2 * Dzu0;
-+            vg[i + Q * 1] = u0 * Dxu1 + u1 * Dyu1 + u2 * Dzu1;
-+            vg[i + Q * 2] = u0 * Dxu2 + u1 * Dyu2 + u2 * Dzu2;
-+         }
-       case 33:
--         for (CeedInt i = 0; i < Q; i++)
-+         CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++)
-          {
-             const CeedScalar qd00 = qd[i + Q * 0];
-             const CeedScalar qd10 = qd[i + Q * 1];
-@@ -232,15 +232,15 @@ CEED_QFUNCTION(f_apply_conv)(void *ctx, CeedInt Q,
-             const CeedScalar ug02 = ug[i + Q * 6];
-             const CeedScalar ug12 = ug[i + Q * 7];
-             const CeedScalar ug22 = ug[i + Q * 8];
--            const CeedScalar Dxu0 = ug00 * qd00 + ug01 * qd10 + ug02 * qd20;
--            const CeedScalar Dyu0 = ug00 * qd01 + ug01 * qd11 + ug02 * qd21;
--            const CeedScalar Dzu0 = ug00 * qd02 + ug01 * qd12 + ug02 * qd22;
--            const CeedScalar Dxu1 = ug10 * qd00 + ug11 * qd10 + ug12 * qd20;
--            const CeedScalar Dyu1 = ug10 * qd01 + ug11 * qd11 + ug12 * qd21;
--            const CeedScalar Dzu1 = ug10 * qd02 + ug11 * qd12 + ug12 * qd22;
--            const CeedScalar Dxu2 = ug20 * qd00 + ug21 * qd10 + ug22 * qd20;
--            const CeedScalar Dyu2 = ug20 * qd01 + ug21 * qd11 + ug22 * qd21;
--            const CeedScalar Dzu2 = ug20 * qd02 + ug21 * qd12 + ug22 * qd22;
-+            const CeedScalar Dxu0 = qd00 * ug00 + qd01 * ug01 + qd02 * ug02;
-+            const CeedScalar Dyu0 = qd10 * ug00 + qd11 * ug01 + qd12 * ug02;
-+            const CeedScalar Dzu0 = qd20 * ug00 + qd21 * ug01 + qd22 * ug02;
-+            const CeedScalar Dxu1 = qd00 * ug10 + qd01 * ug11 + qd02 * ug12;
-+            const CeedScalar Dyu1 = qd10 * ug10 + qd11 * ug11 + qd12 * ug12;
-+            const CeedScalar Dzu1 = qd20 * ug10 + qd21 * ug11 + qd22 * ug12;
-+            const CeedScalar Dxu2 = qd00 * ug20 + qd01 * ug21 + qd02 * ug22;
-+            const CeedScalar Dyu2 = qd10 * ug20 + qd11 * ug21 + qd12 * ug22;
-+            const CeedScalar Dzu2 = qd20 * ug20 + qd21 * ug21 + qd22 * ug22;
-             vg[i + Q * 0] = u0 * Dxu0 + u1 * Dyu0 + u2 * Dzu0;
-             vg[i + Q * 1] = u0 * Dxu1 + u1 * Dyu1 + u2 * Dzu1;
-             vg[i + Q * 2] = u0 * Dxu2 + u1 * Dyu2 + u2 * Dzu2;
-@@ -250,91 +250,101 @@ CEED_QFUNCTION(f_apply_conv)(void *ctx, CeedInt Q,
-    return 0;
- }
- 
--/// libCEED Q-function for applying a conv operator
-+/// libCEED QFunction for applying a convection operator with a constant
-+/// coefficient
- CEED_QFUNCTION(f_apply_conv_mf_const)(void *ctx, CeedInt Q,
-                                       const CeedScalar *const *in,
-                                       CeedScalar *const *out)
- {
--   NLConvectionContext *bc = (NLConvectionContext*)ctx;
--   // in[0], out[0] have shape [dim, nc=1, Q]
--   // in[1] is Jacobians with shape [dim, nc=dim, Q]
--   // in[2] is quadrature weights, size (Q)
-+   NLConvectionContext *bc = (NLConvectionContext *)ctx;
-+   // in[0] has shape [ncomp=space_dim, Q]
-+   // in[1] has shape [dim, ncomp=space_dim, Q]
-+   // in[2] is Jacobians with shape [dim, ncomp=space_dim, Q]
-+   // in[3] is quadrature weights, size (Q)
-+   // out[0] has shape [ncomp=space_dim, Q]
-    //
--   // At every quadrature point, compute qw * adj(J).
-+   // At every quadrature point, compute qw * c * adj(J)^T
-    const CeedScalar coeff = bc->coeff;
-    const CeedScalar *u = in[0], *ug = in[1], *J = in[2], *qw = in[3];
-    CeedScalar *vg = out[0];
--   switch (10 * bc->dim + bc->vdim)
-+   switch (10 * bc->space_dim + bc->dim)
-    {
-       case 11:
--         for (CeedInt i = 0; i < Q; i++)
-+         CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++)
-+         {
-+            const CeedScalar qd = qw[i] * coeff * J[i];
-+            vg[i] = u[i] * qd * ug[i];
-+         }
-+         break;
-+      case 21:
-+         CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++)
-          {
--            const CeedScalar qd = coeff * qw[i] * J[i];
--            vg[i] = u[i] * ug[i] * qd;
-+            CeedScalar qd[2];
-+            MultAdjJt21(J + i, Q, qw[i] * coeff, 1, qd);
-+            const CeedScalar u0   = u[i + Q * 0];
-+            const CeedScalar u1   = u[i + Q * 1];
-+            const CeedScalar ug00 = ug[i + Q * 0];
-+            const CeedScalar ug10 = ug[i + Q * 1];
-+            const CeedScalar Dxu0 = qd[0] * ug00;
-+            const CeedScalar Dyu0 = qd[1] * ug00;
-+            const CeedScalar Dxu1 = qd[0] * ug10;
-+            const CeedScalar Dyu1 = qd[1] * ug10;
-+            vg[i + Q * 0] = u0 * Dxu0 + u1 * Dyu0;
-+            vg[i + Q * 1] = u0 * Dxu1 + u1 * Dyu1;
-          }
-          break;
-       case 22:
--         for (CeedInt i = 0; i < Q; i++)
-+         CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++)
-          {
--            // J: 0 2   qd: 0 2   adj(J):  J22 -J12
--            //    1 3       1 3           -J21  J11
--            const CeedScalar J11 = J[i + Q * 0];
--            const CeedScalar J21 = J[i + Q * 1];
--            const CeedScalar J12 = J[i + Q * 2];
--            const CeedScalar J22 = J[i + Q * 3];
--            const CeedScalar w = qw[i] * coeff;
--            const CeedScalar qd00 =  w * J22;
--            const CeedScalar qd10 = -w * J21;
--            const CeedScalar qd01 = -w * J12;
--            const CeedScalar qd11 =  w * J11;
-+            CeedScalar qd[4];
-+            MultAdjJt22(J + i, Q, qw[i] * coeff, 1, qd);
-             const CeedScalar u0   = u[i + Q * 0];
-             const CeedScalar u1   = u[i + Q * 1];
-             const CeedScalar ug00 = ug[i + Q * 0];
-             const CeedScalar ug10 = ug[i + Q * 1];
-             const CeedScalar ug01 = ug[i + Q * 2];
-             const CeedScalar ug11 = ug[i + Q * 3];
--            const CeedScalar Dxu0 = ug00 * qd00 + ug01 * qd10;
--            const CeedScalar Dyu0 = ug00 * qd01 + ug01 * qd11;
--            const CeedScalar Dxu1 = ug10 * qd00 + ug11 * qd10;
--            const CeedScalar Dyu1 = ug10 * qd01 + ug11 * qd11;
-+            const CeedScalar Dxu0 = qd[0] * ug00 + qd[2] * ug01;
-+            const CeedScalar Dyu0 = qd[1] * ug00 + qd[3] * ug01;
-+            const CeedScalar Dxu1 = qd[0] * ug10 + qd[2] * ug11;
-+            const CeedScalar Dyu1 = qd[1] * ug10 + qd[3] * ug11;
-             vg[i + Q * 0] = u0 * Dxu0 + u1 * Dyu0;
-             vg[i + Q * 1] = u0 * Dxu1 + u1 * Dyu1;
-          }
-          break;
-+      case 32:
-+         CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++)
-+         {
-+            CeedScalar qd[6];
-+            MultAdjJt32(J + i, Q, qw[i] * coeff, 1, qd);
-+            const CeedScalar u0   = u[i + Q * 0];
-+            const CeedScalar u1   = u[i + Q * 1];
-+            const CeedScalar u2   = u[i + Q * 2];
-+            const CeedScalar ug00 = ug[i + Q * 0];
-+            const CeedScalar ug10 = ug[i + Q * 1];
-+            const CeedScalar ug20 = ug[i + Q * 2];
-+            const CeedScalar ug01 = ug[i + Q * 3];
-+            const CeedScalar ug11 = ug[i + Q * 4];
-+            const CeedScalar ug21 = ug[i + Q * 5];
-+            const CeedScalar Dxu0 = qd[0] * ug00 + qd[3] * ug01;
-+            const CeedScalar Dyu0 = qd[1] * ug00 + qd[4] * ug01;
-+            const CeedScalar Dzu0 = qd[2] * ug00 + qd[5] * ug01;
-+            const CeedScalar Dxu1 = qd[0] * ug10 + qd[3] * ug11;
-+            const CeedScalar Dyu1 = qd[1] * ug10 + qd[4] * ug11;
-+            const CeedScalar Dzu1 = qd[2] * ug10 + qd[5] * ug11;
-+            const CeedScalar Dxu2 = qd[0] * ug20 + qd[3] * ug21;
-+            const CeedScalar Dyu2 = qd[1] * ug20 + qd[4] * ug21;
-+            const CeedScalar Dzu2 = qd[2] * ug20 + qd[5] * ug21;
-+            vg[i + Q * 0] = u0 * Dxu0 + u1 * Dyu0 + u2 * Dzu0;
-+            vg[i + Q * 1] = u0 * Dxu1 + u1 * Dyu1 + u2 * Dzu1;
-+            vg[i + Q * 2] = u0 * Dxu2 + u1 * Dyu2 + u2 * Dzu2;
-+         }
-+         break;
-       case 33:
--         for (CeedInt i = 0; i < Q; i++)
-+         CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++)
-          {
--            // J: 0 3 6   qd: 0 3 6
--            //    1 4 7       1 4 7
--            //    2 5 8       2 5 8
--            const CeedScalar J11 = J[i + Q * 0];
--            const CeedScalar J21 = J[i + Q * 1];
--            const CeedScalar J31 = J[i + Q * 2];
--            const CeedScalar J12 = J[i + Q * 3];
--            const CeedScalar J22 = J[i + Q * 4];
--            const CeedScalar J32 = J[i + Q * 5];
--            const CeedScalar J13 = J[i + Q * 6];
--            const CeedScalar J23 = J[i + Q * 7];
--            const CeedScalar J33 = J[i + Q * 8];
--            const CeedScalar A11 = J22 * J33 - J23 * J32;
--            const CeedScalar A12 = J13 * J32 - J12 * J33;
--            const CeedScalar A13 = J12 * J23 - J13 * J22;
--            const CeedScalar A21 = J23 * J31 - J21 * J33;
--            const CeedScalar A22 = J11 * J33 - J13 * J31;
--            const CeedScalar A23 = J13 * J21 - J11 * J23;
--            const CeedScalar A31 = J21 * J32 - J22 * J31;
--            const CeedScalar A32 = J12 * J31 - J11 * J32;
--            const CeedScalar A33 = J11 * J22 - J12 * J21;
--            const CeedScalar w = qw[i] * coeff;
--            const CeedScalar qd00 = w * A11;
--            const CeedScalar qd10 = w * A21;
--            const CeedScalar qd20 = w * A31;
--            const CeedScalar qd01 = w * A12;
--            const CeedScalar qd11 = w * A22;
--            const CeedScalar qd21 = w * A32;
--            const CeedScalar qd02 = w * A13;
--            const CeedScalar qd12 = w * A23;
--            const CeedScalar qd22 = w * A33;
-+            CeedScalar qd[9];
-+            MultAdjJt33(J + i, Q, qw[i] * coeff, 1, qd);
-             const CeedScalar u0   = u[i + Q * 0];
-             const CeedScalar u1   = u[i + Q * 1];
-             const CeedScalar u2   = u[i + Q * 2];
-@@ -347,15 +357,15 @@ CEED_QFUNCTION(f_apply_conv_mf_const)(void *ctx, CeedInt Q,
-             const CeedScalar ug02 = ug[i + Q * 6];
-             const CeedScalar ug12 = ug[i + Q * 7];
-             const CeedScalar ug22 = ug[i + Q * 8];
--            const CeedScalar Dxu0 = ug00 * qd00 + ug01 * qd10 + ug02 * qd20;
--            const CeedScalar Dyu0 = ug00 * qd01 + ug01 * qd11 + ug02 * qd21;
--            const CeedScalar Dzu0 = ug00 * qd02 + ug01 * qd12 + ug02 * qd22;
--            const CeedScalar Dxu1 = ug10 * qd00 + ug11 * qd10 + ug12 * qd20;
--            const CeedScalar Dyu1 = ug10 * qd01 + ug11 * qd11 + ug12 * qd21;
--            const CeedScalar Dzu1 = ug10 * qd02 + ug11 * qd12 + ug12 * qd22;
--            const CeedScalar Dxu2 = ug20 * qd00 + ug21 * qd10 + ug22 * qd20;
--            const CeedScalar Dyu2 = ug20 * qd01 + ug21 * qd11 + ug22 * qd21;
--            const CeedScalar Dzu2 = ug20 * qd02 + ug21 * qd12 + ug22 * qd22;
-+            const CeedScalar Dxu0 = qd[0] * ug00 + qd[3] * ug01 + qd[6] * ug02;
-+            const CeedScalar Dyu0 = qd[1] * ug00 + qd[4] * ug01 + qd[7] * ug02;
-+            const CeedScalar Dzu0 = qd[2] * ug00 + qd[5] * ug01 + qd[8] * ug02;
-+            const CeedScalar Dxu1 = qd[0] * ug10 + qd[3] * ug11 + qd[6] * ug12;
-+            const CeedScalar Dyu1 = qd[1] * ug10 + qd[4] * ug11 + qd[7] * ug12;
-+            const CeedScalar Dzu1 = qd[2] * ug10 + qd[5] * ug11 + qd[8] * ug12;
-+            const CeedScalar Dxu2 = qd[0] * ug20 + qd[3] * ug21 + qd[6] * ug22;
-+            const CeedScalar Dyu2 = qd[1] * ug20 + qd[4] * ug21 + qd[7] * ug22;
-+            const CeedScalar Dzu2 = qd[2] * ug20 + qd[5] * ug21 + qd[8] * ug22;
-             vg[i + Q * 0] = u0 * Dxu0 + u1 * Dyu0 + u2 * Dzu0;
-             vg[i + Q * 1] = u0 * Dxu1 + u1 * Dyu1 + u2 * Dzu1;
-             vg[i + Q * 2] = u0 * Dxu2 + u1 * Dyu2 + u2 * Dzu2;
-@@ -365,89 +375,101 @@ CEED_QFUNCTION(f_apply_conv_mf_const)(void *ctx, CeedInt Q,
-    return 0;
- }
- 
-+/// libCEED QFunction for applying a convection operator with a coefficient
-+/// evaluated at quadrature points
- CEED_QFUNCTION(f_apply_conv_mf_quad)(void *ctx, CeedInt Q,
-                                      const CeedScalar *const *in,
-                                      CeedScalar *const *out)
- {
--   NLConvectionContext *bc = (NLConvectionContext*)ctx;
--   // in[0], out[0] have shape [dim, nc=1, Q]
--   // in[1] is Jacobians with shape [dim, nc=dim, Q]
--   // in[2] is quadrature weights, size (Q)
-+   NLConvectionContext *bc = (NLConvectionContext *)ctx;
-+   // in[0] has shape [ncomp=space_dim, Q]
-+   // in[1] has shape [dim, ncomp=space_dim, Q]
-+   // in[2] is coefficients, size (Q)
-+   // in[3] is Jacobians with shape [dim, ncomp=space_dim, Q]
-+   // in[4] is quadrature weights, size (Q)
-+   // out[0] has shape [ncomp=space_dim, Q]
-    //
--   // At every quadrature point, compute qw * adj(J).
--   const CeedScalar *c = in[0], *u = in[1], *ug = in[2], *J = in[3], *qw = in[4];
-+   // At every quadrature point, compute qw * c * adj(J)^T
-+   const CeedScalar *u = in[0], *ug = in[1], *c = in[2], *J = in[3], *qw = in[4];
-    CeedScalar *vg = out[0];
--   switch (10 * bc->dim + bc->vdim)
-+   switch (10 * bc->space_dim + bc->dim)
-    {
-       case 11:
--         for (CeedInt i = 0; i < Q; i++)
-+         CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++)
-          {
--            const CeedScalar qd = c[i] * qw[i] * J[i];
--            vg[i] = u[i] * ug[i] * qd;
-+            const CeedScalar qd = qw[i] * c[i] * J[i];
-+            vg[i] = u[i] * qd * ug[i];
-+         }
-+         break;
-+      case 21:
-+         CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++)
-+         {
-+            CeedScalar qd[2];
-+            MultAdjJt21(J + i, Q, qw[i] * c[i], 1, qd);
-+            const CeedScalar u0   = u[i + Q * 0];
-+            const CeedScalar u1   = u[i + Q * 1];
-+            const CeedScalar ug00 = ug[i + Q * 0];
-+            const CeedScalar ug10 = ug[i + Q * 1];
-+            const CeedScalar Dxu0 = qd[0] * ug00;
-+            const CeedScalar Dyu0 = qd[1] * ug00;
-+            const CeedScalar Dxu1 = qd[0] * ug10;
-+            const CeedScalar Dyu1 = qd[1] * ug10;
-+            vg[i + Q * 0] = u0 * Dxu0 + u1 * Dyu0;
-+            vg[i + Q * 1] = u0 * Dxu1 + u1 * Dyu1;
-          }
-          break;
-       case 22:
--         for (CeedInt i = 0; i < Q; i++)
-+         CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++)
-          {
--            // J: 0 2   qd: 0 2   adj(J):  J22 -J12
--            //    1 3       1 3           -J21  J11
--            const CeedScalar J11 = J[i + Q * 0];
--            const CeedScalar J21 = J[i + Q * 1];
--            const CeedScalar J12 = J[i + Q * 2];
--            const CeedScalar J22 = J[i + Q * 3];
--            const CeedScalar w = qw[i] * c[i];
--            const CeedScalar qd00 =  w * J22;
--            const CeedScalar qd10 = -w * J21;
--            const CeedScalar qd01 = -w * J12;
--            const CeedScalar qd11 =  w * J11;
-+            CeedScalar qd[4];
-+            MultAdjJt22(J + i, Q, qw[i] * c[i], 1, qd);
-             const CeedScalar u0   = u[i + Q * 0];
-             const CeedScalar u1   = u[i + Q * 1];
-             const CeedScalar ug00 = ug[i + Q * 0];
-             const CeedScalar ug10 = ug[i + Q * 1];
-             const CeedScalar ug01 = ug[i + Q * 2];
-             const CeedScalar ug11 = ug[i + Q * 3];
--            const CeedScalar Dxu0 = ug00 * qd00 + ug01 * qd10;
--            const CeedScalar Dyu0 = ug00 * qd01 + ug01 * qd11;
--            const CeedScalar Dxu1 = ug10 * qd00 + ug11 * qd10;
--            const CeedScalar Dyu1 = ug10 * qd01 + ug11 * qd11;
-+            const CeedScalar Dxu0 = qd[0] * ug00 + qd[2] * ug01;
-+            const CeedScalar Dyu0 = qd[1] * ug00 + qd[3] * ug01;
-+            const CeedScalar Dxu1 = qd[0] * ug10 + qd[2] * ug11;
-+            const CeedScalar Dyu1 = qd[1] * ug10 + qd[3] * ug11;
-             vg[i + Q * 0] = u0 * Dxu0 + u1 * Dyu0;
-             vg[i + Q * 1] = u0 * Dxu1 + u1 * Dyu1;
-          }
-          break;
-+      case 32:
-+         CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++)
-+         {
-+            CeedScalar qd[6];
-+            MultAdjJt32(J + i, Q, qw[i] * c[i], 1, qd);
-+            const CeedScalar u0   = u[i + Q * 0];
-+            const CeedScalar u1   = u[i + Q * 1];
-+            const CeedScalar u2   = u[i + Q * 2];
-+            const CeedScalar ug00 = ug[i + Q * 0];
-+            const CeedScalar ug10 = ug[i + Q * 1];
-+            const CeedScalar ug20 = ug[i + Q * 2];
-+            const CeedScalar ug01 = ug[i + Q * 3];
-+            const CeedScalar ug11 = ug[i + Q * 4];
-+            const CeedScalar ug21 = ug[i + Q * 5];
-+            const CeedScalar Dxu0 = qd[0] * ug00 + qd[3] * ug01;
-+            const CeedScalar Dyu0 = qd[1] * ug00 + qd[4] * ug01;
-+            const CeedScalar Dzu0 = qd[2] * ug00 + qd[5] * ug01;
-+            const CeedScalar Dxu1 = qd[0] * ug10 + qd[3] * ug11;
-+            const CeedScalar Dyu1 = qd[1] * ug10 + qd[4] * ug11;
-+            const CeedScalar Dzu1 = qd[2] * ug10 + qd[5] * ug11;
-+            const CeedScalar Dxu2 = qd[0] * ug20 + qd[3] * ug21;
-+            const CeedScalar Dyu2 = qd[1] * ug20 + qd[4] * ug21;
-+            const CeedScalar Dzu2 = qd[2] * ug20 + qd[5] * ug21;
-+            vg[i + Q * 0] = u0 * Dxu0 + u1 * Dyu0 + u2 * Dzu0;
-+            vg[i + Q * 1] = u0 * Dxu1 + u1 * Dyu1 + u2 * Dzu1;
-+            vg[i + Q * 2] = u0 * Dxu2 + u1 * Dyu2 + u2 * Dzu2;
-+         }
-+         break;
-       case 33:
--         for (CeedInt i = 0; i < Q; i++)
-+         CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++)
-          {
--            // J: 0 3 6   qd: 0 3 6
--            //    1 4 7       1 4 7
--            //    2 5 8       2 5 8
--            const CeedScalar J11 = J[i + Q * 0];
--            const CeedScalar J21 = J[i + Q * 1];
--            const CeedScalar J31 = J[i + Q * 2];
--            const CeedScalar J12 = J[i + Q * 3];
--            const CeedScalar J22 = J[i + Q * 4];
--            const CeedScalar J32 = J[i + Q * 5];
--            const CeedScalar J13 = J[i + Q * 6];
--            const CeedScalar J23 = J[i + Q * 7];
--            const CeedScalar J33 = J[i + Q * 8];
--            const CeedScalar A11 = J22 * J33 - J23 * J32;
--            const CeedScalar A12 = J13 * J32 - J12 * J33;
--            const CeedScalar A13 = J12 * J23 - J13 * J22;
--            const CeedScalar A21 = J23 * J31 - J21 * J33;
--            const CeedScalar A22 = J11 * J33 - J13 * J31;
--            const CeedScalar A23 = J13 * J21 - J11 * J23;
--            const CeedScalar A31 = J21 * J32 - J22 * J31;
--            const CeedScalar A32 = J12 * J31 - J11 * J32;
--            const CeedScalar A33 = J11 * J22 - J12 * J21;
--            const CeedScalar w = qw[i] * c[i];
--            const CeedScalar qd00 = w * A11;
--            const CeedScalar qd10 = w * A21;
--            const CeedScalar qd20 = w * A31;
--            const CeedScalar qd01 = w * A12;
--            const CeedScalar qd11 = w * A22;
--            const CeedScalar qd21 = w * A32;
--            const CeedScalar qd02 = w * A13;
--            const CeedScalar qd12 = w * A23;
--            const CeedScalar qd22 = w * A33;
-+            CeedScalar qd[9];
-+            MultAdjJt33(J + i, Q, qw[i] * c[i], 1, qd);
-             const CeedScalar u0   = u[i + Q * 0];
-             const CeedScalar u1   = u[i + Q * 1];
-             const CeedScalar u2   = u[i + Q * 2];
-@@ -460,15 +482,15 @@ CEED_QFUNCTION(f_apply_conv_mf_quad)(void *ctx, CeedInt Q,
-             const CeedScalar ug02 = ug[i + Q * 6];
-             const CeedScalar ug12 = ug[i + Q * 7];
-             const CeedScalar ug22 = ug[i + Q * 8];
--            const CeedScalar Dxu0 = ug00 * qd00 + ug01 * qd10 + ug02 * qd20;
--            const CeedScalar Dyu0 = ug00 * qd01 + ug01 * qd11 + ug02 * qd21;
--            const CeedScalar Dzu0 = ug00 * qd02 + ug01 * qd12 + ug02 * qd22;
--            const CeedScalar Dxu1 = ug10 * qd00 + ug11 * qd10 + ug12 * qd20;
--            const CeedScalar Dyu1 = ug10 * qd01 + ug11 * qd11 + ug12 * qd21;
--            const CeedScalar Dzu1 = ug10 * qd02 + ug11 * qd12 + ug12 * qd22;
--            const CeedScalar Dxu2 = ug20 * qd00 + ug21 * qd10 + ug22 * qd20;
--            const CeedScalar Dyu2 = ug20 * qd01 + ug21 * qd11 + ug22 * qd21;
--            const CeedScalar Dzu2 = ug20 * qd02 + ug21 * qd12 + ug22 * qd22;
-+            const CeedScalar Dxu0 = qd[0] * ug00 + qd[3] * ug01 + qd[6] * ug02;
-+            const CeedScalar Dyu0 = qd[1] * ug00 + qd[4] * ug01 + qd[7] * ug02;
-+            const CeedScalar Dzu0 = qd[2] * ug00 + qd[5] * ug01 + qd[8] * ug02;
-+            const CeedScalar Dxu1 = qd[0] * ug10 + qd[3] * ug11 + qd[6] * ug12;
-+            const CeedScalar Dyu1 = qd[1] * ug10 + qd[4] * ug11 + qd[7] * ug12;
-+            const CeedScalar Dzu1 = qd[2] * ug10 + qd[5] * ug11 + qd[8] * ug12;
-+            const CeedScalar Dxu2 = qd[0] * ug20 + qd[3] * ug21 + qd[6] * ug22;
-+            const CeedScalar Dyu2 = qd[1] * ug20 + qd[4] * ug21 + qd[7] * ug22;
-+            const CeedScalar Dzu2 = qd[2] * ug20 + qd[5] * ug21 + qd[8] * ug22;
-             vg[i + Q * 0] = u0 * Dxu0 + u1 * Dyu0 + u2 * Dzu0;
-             vg[i + Q * 1] = u0 * Dxu1 + u1 * Dyu1 + u2 * Dzu1;
-             vg[i + Q * 2] = u0 * Dxu2 + u1 * Dyu2 + u2 * Dzu2;
-@@ -477,3 +499,5 @@ CEED_QFUNCTION(f_apply_conv_mf_quad)(void *ctx, CeedInt Q,
-    }
-    return 0;
- }
-+
-+#endif // MFEM_LIBCEED_NLCONV_QF_H
-diff --git a/fem/ceed/integrators/util/util_qf.h b/fem/ceed/integrators/util/util_qf.h
-new file mode 100644
-index 000000000..fa7ca763b
---- /dev/null
-+++ b/fem/ceed/integrators/util/util_qf.h
-@@ -0,0 +1,855 @@
-+// Copyright (c) 2010-2023, Lawrence Livermore National Security, LLC. Produced
-+// at the Lawrence Livermore National Laboratory. All Rights reserved. See files
-+// LICENSE and NOTICE for details. LLNL-CODE-806117.
-+//
-+// This file is part of the MFEM library. For more information and source code
-+// availability visit https://mfem.org.
-+//
-+// MFEM is free software; you can redistribute it and/or modify it under the
-+// terms of the BSD-3 license. We welcome feedback and contributions, see file
-+// CONTRIBUTING.md for details.
-+
-+#ifndef MFEM_LIBCEED_UTIL_QF_H
-+#define MFEM_LIBCEED_UTIL_QF_H
-+
-+#include <math.h>
-+
-+CEED_QFUNCTION_HELPER CeedScalar DetJ22(const CeedScalar *J,
-+                                        const CeedInt J_stride)
-+{
-+   // J: 0 2
-+   //    1 3
-+   return J[J_stride * 0] * J[J_stride * 3] -
-+          J[J_stride * 1] * J[J_stride * 2];
-+}
-+
-+CEED_QFUNCTION_HELPER CeedScalar DetJ21(const CeedScalar *J,
-+                                        const CeedInt J_stride)
-+{
-+   // J: 0
-+   //    1
-+   return sqrt(J[J_stride * 0] * J[J_stride * 0] +
-+               J[J_stride * 1] * J[J_stride * 1]);
-+}
-+
-+CEED_QFUNCTION_HELPER CeedScalar DetJ33(const CeedScalar *J,
-+                                        const CeedInt J_stride)
-+{
-+   // J: 0 3 6
-+   //    1 4 7
-+   //    2 5 8
-+   return J[J_stride * 0] * (J[J_stride * 4] * J[J_stride * 8] -
-+                             J[J_stride * 5] * J[J_stride * 7]) -
-+          J[J_stride * 1] * (J[J_stride * 3] * J[J_stride * 8] -
-+                             J[J_stride * 5] * J[J_stride * 6]) +
-+          J[J_stride * 2] * (J[J_stride * 3] * J[J_stride * 7] -
-+                             J[J_stride * 4] * J[J_stride * 6]);
-+}
-+
-+CEED_QFUNCTION_HELPER CeedScalar DetJ32(const CeedScalar *J,
-+                                        const CeedInt J_stride)
-+{
-+   // J: 0 3
-+   //    1 4
-+   //    2 5
-+   const CeedScalar E = J[J_stride * 0] * J[J_stride * 0] +
-+                        J[J_stride * 1] * J[J_stride * 1] +
-+                        J[J_stride * 2] * J[J_stride * 2];
-+   const CeedScalar G = J[J_stride * 3] * J[J_stride * 3] +
-+                        J[J_stride * 4] * J[J_stride * 4] +
-+                        J[J_stride * 5] * J[J_stride * 5];
-+   const CeedScalar F = J[J_stride * 0] * J[J_stride * 3] +
-+                        J[J_stride * 1] * J[J_stride * 4] +
-+                        J[J_stride * 2] * J[J_stride * 5];
-+   return sqrt(E * G - F * F);
-+}
-+
-+CEED_QFUNCTION_HELPER void MultAdjJCAdjJt22(const CeedScalar *J,
-+                                            const CeedInt J_stride,
-+                                            const CeedScalar *c,
-+                                            const CeedInt c_stride,
-+                                            const CeedInt c_comp,
-+                                            const CeedScalar qw,
-+                                            const CeedInt qd_stride,
-+                                            CeedScalar *qd)
-+{
-+   // compute qw/det(J) adj(J) C adj(J)^T and store the symmetric part of the result
-+   // J: 0 2   adj(J):  J22 -J12   qd: 0 1
-+   //    1 3           -J21  J11       1 2
-+   const CeedScalar J11 = J[J_stride * 0];
-+   const CeedScalar J21 = J[J_stride * 1];
-+   const CeedScalar J12 = J[J_stride * 2];
-+   const CeedScalar J22 = J[J_stride * 3];
-+   const CeedScalar w = qw / (J11 * J22 - J21 * J12);
-+   if (c_comp == 3)  // Matrix coefficient (symmetric)
-+   {
-+      // First compute entries of R = C adj(J)^T
-+      // c: 0 1
-+      //    1 2
-+      const CeedScalar R11 =  c[c_stride * 0] * J22 - c[c_stride * 1] * J12;
-+      const CeedScalar R21 =  c[c_stride * 1] * J22 - c[c_stride * 2] * J12;
-+      const CeedScalar R12 = -c[c_stride * 0] * J21 + c[c_stride * 1] * J11;
-+      const CeedScalar R22 = -c[c_stride * 1] * J21 + c[c_stride * 2] * J11;
-+      qd[qd_stride * 0] = w * (J22 * R11 - J12 * R21);
-+      qd[qd_stride * 1] = w * (J11 * R21 - J21 * R11);
-+      qd[qd_stride * 2] = w * (J11 * R22 - J21 * R12);
-+   }
-+   else if (c_comp == 2)  // Vector coefficient
-+   {
-+      // c: 0
-+      //      1
-+      qd[qd_stride * 0] =  w * (c[c_stride * 1] * J12 * J12 +
-+                                c[c_stride * 0] * J22 * J22);
-+      qd[qd_stride * 1] = -w * (c[c_stride * 1] * J11 * J12 +
-+                                c[c_stride * 0] * J21 * J22);
-+      qd[qd_stride * 2] =  w * (c[c_stride * 1] * J11 * J11 +
-+                                c[c_stride * 0] * J21 * J21);
-+   }
-+   else  // Scalar coefficient
-+   {
-+      qd[qd_stride * 0] =  w * c[c_stride * 0] * (J12 * J12 + J22 * J22);
-+      qd[qd_stride * 1] = -w * c[c_stride * 0] * (J11 * J12 + J21 * J22);
-+      qd[qd_stride * 2] =  w * c[c_stride * 0] * (J11 * J11 + J21 * J21);
-+   }
-+}
-+
-+CEED_QFUNCTION_HELPER void MultAdjJCAdjJt21(const CeedScalar *J,
-+                                            const CeedInt J_stride,
-+                                            const CeedScalar *c,
-+                                            const CeedInt c_stride,
-+                                            const CeedInt c_comp,
-+                                            const CeedScalar qw,
-+                                            const CeedInt qd_stride,
-+                                            CeedScalar *qd)
-+{
-+   // compute qw/det(J) adj(J) C adj(J)^T and store the symmetric part of the result
-+   // J: 0   adj(J): 1/sqrt(J^T J) J^T   qd: 0
-+   //    1
-+   const CeedScalar J11 = J[J_stride * 0];
-+   const CeedScalar J21 = J[J_stride * 1];
-+   const CeedScalar d = J11 * J11 + J21 * J21;
-+   const CeedScalar w = qw / sqrt(d);
-+   if (c_comp == 3)  // Matrix coefficient (symmetric)
-+   {
-+      // First compute entries of R = C adj(J)^T
-+      // c: 0 1
-+      //    1 2
-+      const CeedScalar R11 = c[c_stride * 0] * J11 + c[c_stride * 1] * J21;
-+      const CeedScalar R21 = c[c_stride * 1] * J11 + c[c_stride * 2] * J21;
-+      qd[qd_stride * 0] = w * (J11 * R11 + J21 * R21) / d;
-+   }
-+   else if (c_comp == 2)  // Vector coefficient
-+   {
-+      // c: 0
-+      //      1
-+      qd[qd_stride * 0] = w * (c[c_stride * 0] * J11 * J11 +
-+                               c[c_stride * 1] * J21 * J21) / d;
-+   }
-+   else  // Scalar coefficient
-+   {
-+      qd[qd_stride * 0] = w * c[c_stride * 0];
-+   }
-+}
-+
-+CEED_QFUNCTION_HELPER void MultAdjJCAdjJt33(const CeedScalar *J,
-+                                            const CeedInt J_stride,
-+                                            const CeedScalar *c,
-+                                            const CeedInt c_stride,
-+                                            const CeedInt c_comp,
-+                                            const CeedScalar qw,
-+                                            const CeedInt qd_stride,
-+                                            CeedScalar *qd)
-+{
-+   // compute qw/det(J) adj(J) C adj(J)^T and store the symmetric part of the result
-+   // J: 0 3 6   qd: 0 1 2
-+   //    1 4 7       1 3 4
-+   //    2 5 8       2 4 5
-+   const CeedScalar J11 = J[J_stride * 0];
-+   const CeedScalar J21 = J[J_stride * 1];
-+   const CeedScalar J31 = J[J_stride * 2];
-+   const CeedScalar J12 = J[J_stride * 3];
-+   const CeedScalar J22 = J[J_stride * 4];
-+   const CeedScalar J32 = J[J_stride * 5];
-+   const CeedScalar J13 = J[J_stride * 6];
-+   const CeedScalar J23 = J[J_stride * 7];
-+   const CeedScalar J33 = J[J_stride * 8];
-+   const CeedScalar A11 = J22 * J33 - J23 * J32;
-+   const CeedScalar A12 = J13 * J32 - J12 * J33;
-+   const CeedScalar A13 = J12 * J23 - J13 * J22;
-+   const CeedScalar A21 = J23 * J31 - J21 * J33;
-+   const CeedScalar A22 = J11 * J33 - J13 * J31;
-+   const CeedScalar A23 = J13 * J21 - J11 * J23;
-+   const CeedScalar A31 = J21 * J32 - J22 * J31;
-+   const CeedScalar A32 = J12 * J31 - J11 * J32;
-+   const CeedScalar A33 = J11 * J22 - J12 * J21;
-+   const CeedScalar w = qw / (J11 * A11 + J21 * A12 + J31 * A13);
-+   if (c_comp == 6)  // Matrix coefficient (symmetric)
-+   {
-+      // First compute entries of R = C adj(J)^T
-+      // c: 0 1 2
-+      //    1 3 4
-+      //    2 4 5
-+      const CeedScalar R11 = c[c_stride * 0] * A11 +
-+                             c[c_stride * 1] * A12 +
-+                             c[c_stride * 2] * A13;
-+      const CeedScalar R12 = c[c_stride * 0] * A21 +
-+                             c[c_stride * 1] * A22 +
-+                             c[c_stride * 2] * A23;
-+      const CeedScalar R13 = c[c_stride * 0] * A31 +
-+                             c[c_stride * 1] * A32 +
-+                             c[c_stride * 2] * A33;
-+      const CeedScalar R21 = c[c_stride * 1] * A11 +
-+                             c[c_stride * 3] * A12 +
-+                             c[c_stride * 4] * A13;
-+      const CeedScalar R22 = c[c_stride * 1] * A21 +
-+                             c[c_stride * 3] * A22 +
-+                             c[c_stride * 4] * A23;
-+      const CeedScalar R23 = c[c_stride * 1] * A31 +
-+                             c[c_stride * 3] * A32 +
-+                             c[c_stride * 4] * A33;
-+      const CeedScalar R31 = c[c_stride * 2] * A11 +
-+                             c[c_stride * 4] * A12 +
-+                             c[c_stride * 5] * A13;
-+      const CeedScalar R32 = c[c_stride * 2] * A21 +
-+                             c[c_stride * 4] * A22 +
-+                             c[c_stride * 5] * A23;
-+      const CeedScalar R33 = c[c_stride * 2] * A31 +
-+                             c[c_stride * 4] * A32 +
-+                             c[c_stride * 5] * A33;
-+      qd[qd_stride * 0] = w * (A11 * R11 + A12 * R21 + A13 * R31);
-+      qd[qd_stride * 1] = w * (A11 * R12 + A12 * R22 + A13 * R32);
-+      qd[qd_stride * 2] = w * (A11 * R13 + A12 * R23 + A13 * R33);
-+      qd[qd_stride * 3] = w * (A21 * R12 + A22 * R22 + A23 * R32);
-+      qd[qd_stride * 4] = w * (A21 * R13 + A22 * R23 + A23 * R33);
-+      qd[qd_stride * 5] = w * (A31 * R13 + A32 * R23 + A33 * R33);
-+   }
-+   else if (c_comp == 3)  // Vector coefficient
-+   {
-+      // c: 0
-+      //      1
-+      //        2
-+      qd[qd_stride * 0] = w * (c[c_stride * 0] * A11 * A11 +
-+                               c[c_stride * 1] * A12 * A12 +
-+                               c[c_stride * 2] * A13 * A13);
-+      qd[qd_stride * 1] = w * (c[c_stride * 0] * A11 * A21 +
-+                               c[c_stride * 1] * A12 * A22 +
-+                               c[c_stride * 2] * A13 * A23);
-+      qd[qd_stride * 2] = w * (c[c_stride * 0] * A11 * A31 +
-+                               c[c_stride * 1] * A12 * A32 +
-+                               c[c_stride * 2] * A13 * A33);
-+      qd[qd_stride * 3] = w * (c[c_stride * 0] * A21 * A21 +
-+                               c[c_stride * 1] * A22 * A22 +
-+                               c[c_stride * 2] * A23 * A23);
-+      qd[qd_stride * 4] = w * (c[c_stride * 0] * A21 * A31 +
-+                               c[c_stride * 1] * A22 * A32 +
-+                               c[c_stride * 2] * A23 * A33);
-+      qd[qd_stride * 5] = w * (c[c_stride * 0] * A31 * A31 +
-+                               c[c_stride * 1] * A32 * A32 +
-+                               c[c_stride * 2] * A33 * A33);
-+   }
-+   else  // Scalar coefficient
-+   {
-+      qd[qd_stride * 0] =
-+         w * c[c_stride * 0] * (A11 * A11 + A12 * A12 + A13 * A13);
-+      qd[qd_stride * 1] =
-+         w * c[c_stride * 0] * (A11 * A21 + A12 * A22 + A13 * A23);
-+      qd[qd_stride * 2] =
-+         w * c[c_stride * 0] * (A11 * A31 + A12 * A32 + A13 * A33);
-+      qd[qd_stride * 3] =
-+         w * c[c_stride * 0] * (A21 * A21 + A22 * A22 + A23 * A23);
-+      qd[qd_stride * 4] =
-+         w * c[c_stride * 0] * (A21 * A31 + A22 * A32 + A23 * A33);
-+      qd[qd_stride * 5] =
-+         w * c[c_stride * 0] * (A31 * A31 + A32 * A32 + A33 * A33);
-+   }
-+}
-+
-+CEED_QFUNCTION_HELPER void MultAdjJCAdjJt32(const CeedScalar *J,
-+                                            const CeedInt J_stride,
-+                                            const CeedScalar *c,
-+                                            const CeedInt c_stride,
-+                                            const CeedInt c_comp,
-+                                            const CeedScalar qw,
-+                                            const CeedInt qd_stride,
-+                                            CeedScalar *qd)
-+{
-+   // compute qw/det(J) adj(J) C adj(J)^T and store the symmetric part of the result
-+   // J: 0 3   qd: 0 1
-+   //    1 4       1 2
-+   //    2 5
-+   const CeedScalar J11 = J[J_stride * 0];
-+   const CeedScalar J21 = J[J_stride * 1];
-+   const CeedScalar J31 = J[J_stride * 2];
-+   const CeedScalar J12 = J[J_stride * 3];
-+   const CeedScalar J22 = J[J_stride * 4];
-+   const CeedScalar J32 = J[J_stride * 5];
-+   const CeedScalar E = J11 * J11 + J21 * J21 + J31 * J31;
-+   const CeedScalar G = J12 * J12 + J22 * J22 + J32 * J32;
-+   const CeedScalar F = J11 * J12 + J21 * J22 + J31 * J32;
-+   const CeedScalar d = E * G - F * F;
-+   const CeedScalar w = qw / sqrt(d);
-+   if (c_comp == 6)  // Matrix coefficient (symmetric)
-+   {
-+      // First compute entries of R = C adj(J)^T
-+      // c: 0 1 2
-+      //    1 3 4
-+      //    2 4 5
-+      const CeedScalar R11 = G * (c[c_stride * 0] * J11 +
-+                                  c[c_stride * 1] * J21 +
-+                                  c[c_stride * 2] * J31) -
-+                             F * (c[c_stride * 0] * J12 +
-+                                  c[c_stride * 1] * J22 +
-+                                  c[c_stride * 2] * J32);
-+      const CeedScalar R21 = G * (c[c_stride * 1] * J11 +
-+                                  c[c_stride * 3] * J21 +
-+                                  c[c_stride * 4] * J31) -
-+                             F * (c[c_stride * 1] * J12 +
-+                                  c[c_stride * 3] * J22 +
-+                                  c[c_stride * 4] * J32);
-+      const CeedScalar R31 = G * (c[c_stride * 2] * J11 +
-+                                  c[c_stride * 4] * J21 +
-+                                  c[c_stride * 5] * J31) -
-+                             F * (c[c_stride * 2] * J12 +
-+                                  c[c_stride * 4] * J22 +
-+                                  c[c_stride * 5] * J32);
-+      const CeedScalar R12 = E * (c[c_stride * 0] * J12 +
-+                                  c[c_stride * 1] * J22 +
-+                                  c[c_stride * 2] * J32) -
-+                             F * (c[c_stride * 0] * J11 +
-+                                  c[c_stride * 1] * J21 +
-+                                  c[c_stride * 2] * J31);
-+      const CeedScalar R22 = E * (c[c_stride * 1] * J12 +
-+                                  c[c_stride * 3] * J22 +
-+                                  c[c_stride * 4] * J32) -
-+                             F * (c[c_stride * 1] * J11 +
-+                                  c[c_stride * 3] * J21 +
-+                                  c[c_stride * 4] * J31);
-+      const CeedScalar R32 = E * (c[c_stride * 2] * J12 +
-+                                  c[c_stride * 4] * J22 +
-+                                  c[c_stride * 5] * J32) -
-+                             F * (c[c_stride * 2] * J11 +
-+                                  c[c_stride * 4] * J21 +
-+                                  c[c_stride * 5] * J31);
-+      qd[qd_stride * 0] = w * (G * (J11 * R11 + J21 * R21 + J31 * R31) -
-+                               F * (J12 * R11 + J22 * R21 + J32 * R31)) / d;
-+      qd[qd_stride * 1] = w * (G * (J11 * R12 + J21 * R22 + J31 * R32) -
-+                               F * (J12 * R12 + J22 * R22 + J32 * R32)) / d;
-+      qd[qd_stride * 2] = w * (E * (J12 * R12 + J22 * R22 + J32 * R32) -
-+                               F * (J11 * R12 + J21 * R22 + J31 * R32)) / d;
-+   }
-+   else if (c_comp == 3)  // Vector coefficient
-+   {
-+      // First compute entries of R = C adj(J)^T
-+      // c: 0
-+      //      1
-+      //        2
-+      const CeedScalar R11 = c[c_stride * 0] * (G * J11 - F * J12);
-+      const CeedScalar R21 = c[c_stride * 1] * (G * J21 - F * J22);
-+      const CeedScalar R31 = c[c_stride * 2] * (G * J31 - F * J32);
-+      const CeedScalar R12 = c[c_stride * 0] * (E * J12 - F * J11);
-+      const CeedScalar R22 = c[c_stride * 1] * (E * J22 - F * J21);
-+      const CeedScalar R32 = c[c_stride * 2] * (E * J32 - F * J31);
-+      qd[qd_stride * 0] = w * (G * (J11 * R11 + J21 * R21 + J31 * R31) -
-+                               F * (J12 * R11 + J22 * R21 + J32 * R31)) / d;
-+      qd[qd_stride * 1] = w * (G * (J11 * R12 + J21 * R22 + J31 * R32) -
-+                               F * (J12 * R12 + J22 * R22 + J32 * R32)) / d;
-+      qd[qd_stride * 2] = w * (E * (J12 * R12 + J22 * R22 + J32 * R32) -
-+                               F * (J11 * R12 + J21 * R22 + J31 * R32)) / d;
-+   }
-+   else  // Scalar coefficient
-+   {
-+      qd[qd_stride * 0] =  w * c[c_stride * 0] * G;
-+      qd[qd_stride * 1] = -w * c[c_stride * 0] * F;
-+      qd[qd_stride * 2] =  w * c[c_stride * 0] * E;
-+   }
-+}
-+
-+CEED_QFUNCTION_HELPER void MultJtCJ22(const CeedScalar *J,
-+                                      const CeedInt J_stride,
-+                                      const CeedScalar *c,
-+                                      const CeedInt c_stride,
-+                                      const CeedInt c_comp,
-+                                      const CeedScalar qw,
-+                                      const CeedInt qd_stride,
-+                                      CeedScalar *qd)
-+{
-+   // compute qw/det(J) J^T C J and store the symmetric part of the result
-+   // J: 0 2   qd: 0 1
-+   //    1 3       1 2
-+   const CeedScalar J11 = J[J_stride * 0];
-+   const CeedScalar J21 = J[J_stride * 1];
-+   const CeedScalar J12 = J[J_stride * 2];
-+   const CeedScalar J22 = J[J_stride * 3];
-+   const CeedScalar w = qw / (J11 * J22 - J21 * J12);
-+   if (c_comp == 3)  // Matrix coefficient (symmetric)
-+   {
-+      // First compute entries of R = C J
-+      // c: 0 1
-+      //    1 2
-+      const CeedScalar R11 = c[c_stride * 0] * J11 + c[c_stride * 1] * J21;
-+      const CeedScalar R21 = c[c_stride * 1] * J11 + c[c_stride * 2] * J21;
-+      const CeedScalar R12 = c[c_stride * 0] * J12 + c[c_stride * 1] * J22;
-+      const CeedScalar R22 = c[c_stride * 1] * J12 + c[c_stride * 2] * J22;
-+      qd[qd_stride * 0] = w * (J11 * R11 + J21 * R21);
-+      qd[qd_stride * 1] = w * (J11 * R12 + J21 * R22);
-+      qd[qd_stride * 2] = w * (J12 * R12 + J22 * R22);
-+   }
-+   else if (c_comp == 2)  // Vector coefficient
-+   {
-+      // c: 0
-+      //      1
-+      qd[qd_stride * 0] = w * (c[c_stride * 0] * J11 * J11 +
-+                               c[c_stride * 1] * J21 * J21);
-+      qd[qd_stride * 1] = w * (c[c_stride * 0] * J11 * J12 +
-+                               c[c_stride * 1] * J21 * J22);
-+      qd[qd_stride * 2] = w * (c[c_stride * 0] * J12 * J12 +
-+                               c[c_stride * 1] * J22 * J22);
-+   }
-+   else  // Scalar coefficient
-+   {
-+      qd[qd_stride * 0] = w * c[c_stride * 0] * (J11 * J11 + J21 * J21);
-+      qd[qd_stride * 1] = w * c[c_stride * 0] * (J11 * J12 + J21 * J22);
-+      qd[qd_stride * 2] = w * c[c_stride * 0] * (J12 * J12 + J22 * J22);
-+   }
-+}
-+
-+CEED_QFUNCTION_HELPER void MultJtCJ21(const CeedScalar *J,
-+                                      const CeedInt J_stride,
-+                                      const CeedScalar *c,
-+                                      const CeedInt c_stride,
-+                                      const CeedInt c_comp,
-+                                      const CeedScalar qw,
-+                                      const CeedInt qd_stride,
-+                                      CeedScalar *qd)
-+{
-+   // compute qw/det(J) J^T C J and store the symmetric part of the result
-+   // J: 0   qd: 0
-+   //    1
-+   const CeedScalar J11 = J[J_stride * 0];
-+   const CeedScalar J21 = J[J_stride * 1];
-+   if (c_comp == 3)  // Matrix coefficient (symmetric)
-+   {
-+      // First compute entries of R = C J
-+      // c: 0 1
-+      //    1 2
-+      const CeedScalar w = qw / sqrt(J11 * J11 + J21 * J21);
-+      const CeedScalar R11 = c[c_stride * 0] * J11 + c[c_stride * 1] * J21;
-+      const CeedScalar R21 = c[c_stride * 1] * J11 + c[c_stride * 2] * J21;
-+      qd[qd_stride * 0] = w * (J11 * R11 + J21 * R21);
-+   }
-+   else if (c_comp == 2)  // Vector coefficient
-+   {
-+      // c: 0
-+      //      1
-+      const CeedScalar w = qw / sqrt(J11 * J11 + J21 * J21);
-+      qd[qd_stride * 0] = w * (c[c_stride * 0] * J11 * J11 +
-+                               c[c_stride * 1] * J21 * J21);
-+   }
-+   else  // Scalar coefficient
-+   {
-+      qd[qd_stride * 0] = qw * c[c_stride * 0] * sqrt(J11 * J11 + J21 * J21);
-+   }
-+}
-+
-+CEED_QFUNCTION_HELPER void MultJtCJ33(const CeedScalar *J,
-+                                      const CeedInt J_stride,
-+                                      const CeedScalar *c,
-+                                      const CeedInt c_stride,
-+                                      const CeedInt c_comp,
-+                                      const CeedScalar qw,
-+                                      const CeedInt qd_stride,
-+                                      CeedScalar *qd)
-+{
-+   // compute qw/det(J) J^T C J and store the symmetric part of the result
-+   // J: 0 3 6   qd: 0 1 2
-+   //    1 4 7       1 3 4
-+   //    2 5 8       2 4 5
-+   const CeedScalar J11 = J[J_stride * 0];
-+   const CeedScalar J21 = J[J_stride * 1];
-+   const CeedScalar J31 = J[J_stride * 2];
-+   const CeedScalar J12 = J[J_stride * 3];
-+   const CeedScalar J22 = J[J_stride * 4];
-+   const CeedScalar J32 = J[J_stride * 5];
-+   const CeedScalar J13 = J[J_stride * 6];
-+   const CeedScalar J23 = J[J_stride * 7];
-+   const CeedScalar J33 = J[J_stride * 8];
-+   const CeedScalar w = qw / (J11 * (J22 * J33 - J23 * J32) +
-+                              J21 * (J13 * J32 - J12 * J33) +
-+                              J31 * (J12 * J23 - J13 * J22));
-+   if (c_comp == 6)  // Matrix coefficient (symmetric)
-+   {
-+      // First compute entries of R = C J
-+      // c: 0 1 2
-+      //    1 3 4
-+      //    2 4 5
-+      const CeedScalar R11 = c[c_stride * 0] * J11 +
-+                             c[c_stride * 1] * J21 +
-+                             c[c_stride * 2] * J31;
-+      const CeedScalar R12 = c[c_stride * 0] * J12 +
-+                             c[c_stride * 1] * J22 +
-+                             c[c_stride * 2] * J32;
-+      const CeedScalar R13 = c[c_stride * 0] * J13 +
-+                             c[c_stride * 1] * J23 +
-+                             c[c_stride * 2] * J33;
-+      const CeedScalar R21 = c[c_stride * 1] * J11 +
-+                             c[c_stride * 3] * J21 +
-+                             c[c_stride * 4] * J31;
-+      const CeedScalar R22 = c[c_stride * 1] * J12 +
-+                             c[c_stride * 3] * J22 +
-+                             c[c_stride * 4] * J32;
-+      const CeedScalar R23 = c[c_stride * 1] * J13 +
-+                             c[c_stride * 3] * J23 +
-+                             c[c_stride * 4] * J33;
-+      const CeedScalar R31 = c[c_stride * 2] * J11 +
-+                             c[c_stride * 4] * J21 +
-+                             c[c_stride * 5] * J31;
-+      const CeedScalar R32 = c[c_stride * 2] * J12 +
-+                             c[c_stride * 4] * J22 +
-+                             c[c_stride * 5] * J32;
-+      const CeedScalar R33 = c[c_stride * 2] * J13 +
-+                             c[c_stride * 4] * J23 +
-+                             c[c_stride * 5] * J33;
-+      qd[qd_stride * 0] = w * (J11 * R11 + J21 * R21 + J31 * R31);
-+      qd[qd_stride * 1] = w * (J11 * R12 + J21 * R22 + J31 * R32);
-+      qd[qd_stride * 2] = w * (J11 * R13 + J21 * R23 + J31 * R33);
-+      qd[qd_stride * 3] = w * (J12 * R12 + J22 * R22 + J32 * R32);
-+      qd[qd_stride * 4] = w * (J12 * R13 + J22 * R23 + J32 * R33);
-+      qd[qd_stride * 5] = w * (J13 * R13 + J23 * R23 + J33 * R33);
-+   }
-+   else if (c_comp == 3)  // Vector coefficient
-+   {
-+      // c: 0
-+      //      1
-+      //        2
-+      qd[qd_stride * 0] = w * (c[c_stride * 0] * J11 * J11 +
-+                               c[c_stride * 1] * J21 * J21 +
-+                               c[c_stride * 2] * J31 * J31);
-+      qd[qd_stride * 1] = w * (c[c_stride * 0] * J11 * J12 +
-+                               c[c_stride * 1] * J21 * J22 +
-+                               c[c_stride * 2] * J31 * J32);
-+      qd[qd_stride * 2] = w * (c[c_stride * 0] * J11 * J13 +
-+                               c[c_stride * 1] * J21 * J23 +
-+                               c[c_stride * 2] * J31 * J33);
-+      qd[qd_stride * 3] = w * (c[c_stride * 0] * J12 * J12 +
-+                               c[c_stride * 1] * J22 * J22 +
-+                               c[c_stride * 2] * J32 * J32);
-+      qd[qd_stride * 4] = w * (c[c_stride * 0] * J12 * J13 +
-+                               c[c_stride * 1] * J22 * J23 +
-+                               c[c_stride * 2] * J32 * J33);
-+      qd[qd_stride * 5] = w * (c[c_stride * 0] * J13 * J13 +
-+                               c[c_stride * 1] * J23 * J23 +
-+                               c[c_stride * 2] * J33 * J33);
-+   }
-+   else  // Scalar coefficient
-+   {
-+      qd[qd_stride * 0] =
-+         w * c[c_stride * 0] * (J11 * J11 + J21 * J21 + J31 * J31);
-+      qd[qd_stride * 1] =
-+         w * c[c_stride * 0] * (J11 * J12 + J21 * J22 + J31 * J32);
-+      qd[qd_stride * 2] =
-+         w * c[c_stride * 0] * (J11 * J13 + J21 * J23 + J31 * J33);
-+      qd[qd_stride * 3] =
-+         w * c[c_stride * 0] * (J12 * J12 + J22 * J22 + J32 * J32);
-+      qd[qd_stride * 4] =
-+         w * c[c_stride * 0] * (J12 * J13 + J22 * J23 + J32 * J33);
-+      qd[qd_stride * 5] =
-+         w * c[c_stride * 0] * (J13 * J13 + J23 * J23 + J33 * J33);
-+   }
-+}
-+
-+CEED_QFUNCTION_HELPER void MultJtCJ32(const CeedScalar *J,
-+                                      const CeedInt J_stride,
-+                                      const CeedScalar *c,
-+                                      const CeedInt c_stride,
-+                                      const CeedInt c_comp,
-+                                      const CeedScalar qw,
-+                                      const CeedInt qd_stride,
-+                                      CeedScalar *qd)
-+{
-+   // compute qw/det(J) J^T C J and store the symmetric part of the result
-+   // J: 0 3   qd: 0 1
-+   //    1 4       1 2
-+   //    2 5
-+   const CeedScalar J11 = J[J_stride * 0];
-+   const CeedScalar J21 = J[J_stride * 1];
-+   const CeedScalar J31 = J[J_stride * 2];
-+   const CeedScalar J12 = J[J_stride * 3];
-+   const CeedScalar J22 = J[J_stride * 4];
-+   const CeedScalar J32 = J[J_stride * 5];
-+   const CeedScalar E = J11 * J11 + J21 * J21 + J31 * J31;
-+   const CeedScalar G = J12 * J12 + J22 * J22 + J32 * J32;
-+   const CeedScalar F = J11 * J12 + J21 * J22 + J31 * J32;
-+   const CeedScalar w = qw / sqrt(E * G - F * F);
-+   if (c_comp == 6)  // Matrix coefficient (symmetric)
-+   {
-+      // First compute entries of R = C J
-+      // c: 0 1 2
-+      //    1 3 4
-+      //    2 4 5
-+      const CeedScalar R11 = c[c_stride * 0] * J11 +
-+                             c[c_stride * 1] * J21 +
-+                             c[c_stride * 2] * J31;
-+      const CeedScalar R21 = c[c_stride * 1] * J11 +
-+                             c[c_stride * 3] * J21 +
-+                             c[c_stride * 4] * J31;
-+      const CeedScalar R31 = c[c_stride * 2] * J11 +
-+                             c[c_stride * 4] * J21 +
-+                             c[c_stride * 5] * J31;
-+      const CeedScalar R12 = c[c_stride * 0] * J12 +
-+                             c[c_stride * 1] * J22 +
-+                             c[c_stride * 2] * J32;
-+      const CeedScalar R22 = c[c_stride * 1] * J12 +
-+                             c[c_stride * 3] * J22 +
-+                             c[c_stride * 4] * J32;
-+      const CeedScalar R32 = c[c_stride * 2] * J12 +
-+                             c[c_stride * 4] * J22 +
-+                             c[c_stride * 5] * J32;
-+      qd[qd_stride * 0] = w * (J11 * R11 + J21 * R21 + J31 * R31);
-+      qd[qd_stride * 1] = w * (J11 * R12 + J21 * R22 + J31 * R32);
-+      qd[qd_stride * 2] = w * (J12 * R12 + J22 * R22 + J32 * R32);
-+   }
-+   else if (c_comp == 3)  // Vector coefficient
-+   {
-+      // c: 0
-+      //      1
-+      //        2
-+      qd[qd_stride * 0] = w * (c[c_stride * 0] * J11 * J11 +
-+                               c[c_stride * 1] * J21 * J21 +
-+                               c[c_stride * 2] * J31 * J31);
-+      qd[qd_stride * 1] = w * (c[c_stride * 0] * J11 * J12 +
-+                               c[c_stride * 1] * J21 * J22 +
-+                               c[c_stride * 2] * J31 * J32);
-+      qd[qd_stride * 2] = w * (c[c_stride * 0] * J12 * J12 +
-+                               c[c_stride * 1] * J22 * J22 +
-+                               c[c_stride * 2] * J32 * J32);
-+   }
-+   else  // Scalar coefficient
-+   {
-+      qd[qd_stride * 0] = w * c[c_stride * 0] * E;
-+      qd[qd_stride * 1] = w * c[c_stride * 0] * F;
-+      qd[qd_stride * 2] = w * c[c_stride * 0] * G;
-+   }
-+}
-+
-+CEED_QFUNCTION_HELPER void MultCtAdjJt22(const CeedScalar *J,
-+                                         const CeedInt J_stride,
-+                                         const CeedScalar *c,
-+                                         const CeedInt c_stride,
-+                                         const CeedScalar qw,
-+                                         const CeedInt qd_stride,
-+                                         CeedScalar *qd)
-+{
-+   // compute qw c^T adj(J)^T and store the result vector
-+   // J: 0 2   adj(J):  J22 -J12
-+   //    1 3           -J21  J11
-+   const CeedScalar J11 = J[J_stride * 0];
-+   const CeedScalar J21 = J[J_stride * 1];
-+   const CeedScalar J12 = J[J_stride * 2];
-+   const CeedScalar J22 = J[J_stride * 3];
-+   const CeedScalar w1 = qw * c[c_stride * 0];
-+   const CeedScalar w2 = qw * c[c_stride * 1];
-+   qd[qd_stride * 0] =  w1 * J22 - w2 * J12;
-+   qd[qd_stride * 1] = -w1 * J21 + w2 * J11;
-+}
-+
-+CEED_QFUNCTION_HELPER void MultCtAdjJt21(const CeedScalar *J,
-+                                         const CeedInt J_stride,
-+                                         const CeedScalar *c,
-+                                         const CeedInt c_stride,
-+                                         const CeedScalar qw,
-+                                         const CeedInt qd_stride,
-+                                         CeedScalar *qd)
-+{
-+   // compute qw c^T adj(J)^T and store the result vector
-+   // J: 0   adj(J): 1/sqrt(J^T J) J^T
-+   //    1
-+   const CeedScalar J11 = J[J_stride * 0];
-+   const CeedScalar J21 = J[J_stride * 1];
-+   const CeedScalar w = qw / sqrt(J11 * J11 + J21 * J21);
-+   const CeedScalar w1 = w * c[c_stride * 0];
-+   const CeedScalar w2 = w * c[c_stride * 1];
-+   qd[qd_stride * 0] =  w1 * J11 + w2 * J21;
-+}
-+
-+CEED_QFUNCTION_HELPER void MultCtAdjJt33(const CeedScalar *J,
-+                                         const CeedInt J_stride,
-+                                         const CeedScalar *c,
-+                                         const CeedInt c_stride,
-+                                         const CeedScalar qw,
-+                                         const CeedInt qd_stride,
-+                                         CeedScalar *qd)
-+{
-+   // compute qw c^T adj(J)^T and store the result vector
-+   // J: 0 3 6
-+   //    1 4 7
-+   //    2 5 8
-+   const CeedScalar J11 = J[J_stride * 0];
-+   const CeedScalar J21 = J[J_stride * 1];
-+   const CeedScalar J31 = J[J_stride * 2];
-+   const CeedScalar J12 = J[J_stride * 3];
-+   const CeedScalar J22 = J[J_stride * 4];
-+   const CeedScalar J32 = J[J_stride * 5];
-+   const CeedScalar J13 = J[J_stride * 6];
-+   const CeedScalar J23 = J[J_stride * 7];
-+   const CeedScalar J33 = J[J_stride * 8];
-+   const CeedScalar A11 = J22 * J33 - J23 * J32;
-+   const CeedScalar A12 = J13 * J32 - J12 * J33;
-+   const CeedScalar A13 = J12 * J23 - J13 * J22;
-+   const CeedScalar A21 = J23 * J31 - J21 * J33;
-+   const CeedScalar A22 = J11 * J33 - J13 * J31;
-+   const CeedScalar A23 = J13 * J21 - J11 * J23;
-+   const CeedScalar A31 = J21 * J32 - J22 * J31;
-+   const CeedScalar A32 = J12 * J31 - J11 * J32;
-+   const CeedScalar A33 = J11 * J22 - J12 * J21;
-+   const CeedScalar w1 = qw * c[c_stride * 0];
-+   const CeedScalar w2 = qw * c[c_stride * 1];
-+   const CeedScalar w3 = qw * c[c_stride * 2];
-+   qd[qd_stride * 0] = w1 * A11 + w2 * A12 + w3 * A13;
-+   qd[qd_stride * 1] = w1 * A21 + w2 * A22 + w3 * A23;
-+   qd[qd_stride * 2] = w1 * A31 + w2 * A32 + w3 * A33;
-+}
-+
-+CEED_QFUNCTION_HELPER void MultCtAdjJt32(const CeedScalar *J,
-+                                         const CeedInt J_stride,
-+                                         const CeedScalar *c,
-+                                         const CeedInt c_stride,
-+                                         const CeedScalar qw,
-+                                         const CeedInt qd_stride,
-+                                         CeedScalar *qd)
-+{
-+   // compute qw c^T adj(J)^T and store the result vector
-+   // J: 0 3
-+   //    1 4
-+   //    2 5
-+   const CeedScalar J11 = J[J_stride * 0];
-+   const CeedScalar J21 = J[J_stride * 1];
-+   const CeedScalar J31 = J[J_stride * 2];
-+   const CeedScalar J12 = J[J_stride * 3];
-+   const CeedScalar J22 = J[J_stride * 4];
-+   const CeedScalar J32 = J[J_stride * 5];
-+   const CeedScalar E = J11 * J11 + J21 * J21 + J31 * J31;
-+   const CeedScalar G = J12 * J12 + J22 * J22 + J32 * J32;
-+   const CeedScalar F = J11 * J12 + J21 * J22 + J31 * J32;
-+   const CeedScalar A11 = G * J11 - F * J12;
-+   const CeedScalar A21 = E * J12 - F * J11;
-+   const CeedScalar A12 = G * J21 - F * J22;
-+   const CeedScalar A22 = E * J22 - F * J21;
-+   const CeedScalar A13 = G * J31 - F * J32;
-+   const CeedScalar A23 = E * J32 - F * J31;
-+   const CeedScalar w = qw / sqrt(E * G - F * F);
-+   const CeedScalar w1 = w * c[c_stride * 0];
-+   const CeedScalar w2 = w * c[c_stride * 1];
-+   const CeedScalar w3 = w * c[c_stride * 2];
-+   qd[qd_stride * 0] = w1 * A11 + w2 * A12 + w3 * A13;
-+   qd[qd_stride * 1] = w1 * A21 + w2 * A22 + w3 * A23;
-+}
-+
-+CEED_QFUNCTION_HELPER void MultAdjJt22(const CeedScalar *J,
-+                                       const CeedInt J_stride,
-+                                       const CeedScalar qw,
-+                                       const CeedInt qd_stride,
-+                                       CeedScalar *qd)
-+{
-+   // compute qw adj(J)^T and store the result matrix
-+   // J: 0 2   adj(J):  J22 -J12   qd: 0 2
-+   //    1 3           -J21  J11       1 3
-+   const CeedScalar J11 = J[J_stride * 0];
-+   const CeedScalar J21 = J[J_stride * 1];
-+   const CeedScalar J12 = J[J_stride * 2];
-+   const CeedScalar J22 = J[J_stride * 3];
-+   qd[qd_stride * 0] =  qw * J22;
-+   qd[qd_stride * 1] = -qw * J12;
-+   qd[qd_stride * 2] = -qw * J21;
-+   qd[qd_stride * 3] =  qw * J11;
-+}
-+
-+CEED_QFUNCTION_HELPER void MultAdjJt21(const CeedScalar *J,
-+                                       const CeedInt J_stride,
-+                                       const CeedScalar qw,
-+                                       const CeedInt qd_stride,
-+                                       CeedScalar *qd)
-+{
-+   // compute qw adj(J)^T and store the result matrix
-+   // J: 0   adj(J):  1/sqrt(J^T J) J^T   qd: 0
-+   //    1                                    1
-+   const CeedScalar J11 = J[J_stride * 0];
-+   const CeedScalar J21 = J[J_stride * 1];
-+   const CeedScalar w = qw / sqrt(J11 * J11 + J21 * J21);
-+   qd[qd_stride * 0] = w * J11;
-+   qd[qd_stride * 1] = w * J21;
-+}
-+
-+CEED_QFUNCTION_HELPER void MultAdjJt33(const CeedScalar *J,
-+                                       const CeedInt J_stride,
-+                                       const CeedScalar qw,
-+                                       const CeedInt qd_stride,
-+                                       CeedScalar *qd)
-+{
-+   // compute qw adj(J)^T and store the result matrix
-+   // J: 0 3 6   qd: 0 3 6
-+   //    1 4 7       1 4 7
-+   //    2 5 8       2 5 8
-+   const CeedScalar J11 = J[J_stride * 0];
-+   const CeedScalar J21 = J[J_stride * 1];
-+   const CeedScalar J31 = J[J_stride * 2];
-+   const CeedScalar J12 = J[J_stride * 3];
-+   const CeedScalar J22 = J[J_stride * 4];
-+   const CeedScalar J32 = J[J_stride * 5];
-+   const CeedScalar J13 = J[J_stride * 6];
-+   const CeedScalar J23 = J[J_stride * 7];
-+   const CeedScalar J33 = J[J_stride * 8];
-+   const CeedScalar A11 = J22 * J33 - J23 * J32;
-+   const CeedScalar A12 = J13 * J32 - J12 * J33;
-+   const CeedScalar A13 = J12 * J23 - J13 * J22;
-+   const CeedScalar A21 = J23 * J31 - J21 * J33;
-+   const CeedScalar A22 = J11 * J33 - J13 * J31;
-+   const CeedScalar A23 = J13 * J21 - J11 * J23;
-+   const CeedScalar A31 = J21 * J32 - J22 * J31;
-+   const CeedScalar A32 = J12 * J31 - J11 * J32;
-+   const CeedScalar A33 = J11 * J22 - J12 * J21;
-+   qd[qd_stride * 0] = qw * A11;
-+   qd[qd_stride * 1] = qw * A12;
-+   qd[qd_stride * 2] = qw * A13;
-+   qd[qd_stride * 3] = qw * A21;
-+   qd[qd_stride * 4] = qw * A22;
-+   qd[qd_stride * 5] = qw * A23;
-+   qd[qd_stride * 6] = qw * A31;
-+   qd[qd_stride * 7] = qw * A32;
-+   qd[qd_stride * 8] = qw * A33;
-+}
-+
-+CEED_QFUNCTION_HELPER void MultAdjJt32(const CeedScalar *J,
-+                                       const CeedInt J_stride,
-+                                       const CeedScalar qw,
-+                                       const CeedInt qd_stride,
-+                                       CeedScalar *qd)
-+{
-+   // compute qw adj(J)^T and store the result matrix
-+   // J: 0 3   qd: 0 3
-+   //    1 4       1 4
-+   //    2 5       2 5
-+   const CeedScalar J11 = J[J_stride * 0];
-+   const CeedScalar J21 = J[J_stride * 1];
-+   const CeedScalar J31 = J[J_stride * 2];
-+   const CeedScalar J12 = J[J_stride * 3];
-+   const CeedScalar J22 = J[J_stride * 4];
-+   const CeedScalar J32 = J[J_stride * 5];
-+   const CeedScalar E = J11 * J11 + J21 * J21 + J31 * J31;
-+   const CeedScalar G = J12 * J12 + J22 * J22 + J32 * J32;
-+   const CeedScalar F = J11 * J12 + J21 * J22 + J31 * J32;
-+   const CeedScalar A11 = G * J11 - F * J12;
-+   const CeedScalar A21 = E * J12 - F * J11;
-+   const CeedScalar A12 = G * J21 - F * J22;
-+   const CeedScalar A22 = E * J22 - F * J21;
-+   const CeedScalar A13 = G * J31 - F * J32;
-+   const CeedScalar A23 = E * J32 - F * J31;
-+   const CeedScalar w = qw / sqrt(E * G - F * F);
-+   qd[qd_stride * 0] = w * A11;
-+   qd[qd_stride * 1] = w * A12;
-+   qd[qd_stride * 2] = w * A13;
-+   qd[qd_stride * 3] = w * A21;
-+   qd[qd_stride * 4] = w * A22;
-+   qd[qd_stride * 5] = w * A23;
-+}
-+
-+#endif // MFEM_LIBCEED_UTIL_QF_H
-diff --git a/fem/ceed/integrators/vecfemass/vecfemass.cpp b/fem/ceed/integrators/vecfemass/vecfemass.cpp
-new file mode 100644
-index 000000000..87d624eb6
---- /dev/null
-+++ b/fem/ceed/integrators/vecfemass/vecfemass.cpp
-@@ -0,0 +1,274 @@
-+// Copyright (c) 2010-2023, Lawrence Livermore National Security, LLC. Produced
-+// at the Lawrence Livermore National Laboratory. All Rights reserved. See files
-+// LICENSE and NOTICE for details. LLNL-CODE-806117.
-+//
-+// This file is part of the MFEM library. For more information and source code
-+// availability visit https://mfem.org.
-+//
-+// MFEM is free software; you can redistribute it and/or modify it under the
-+// terms of the BSD-3 license. We welcome feedback and contributions, see file
-+// CONTRIBUTING.md for details.
-+
-+#include "vecfemass.hpp"
-+
-+#include "../../../../config/config.hpp"
-+#ifdef MFEM_USE_CEED
-+#include "vecfemass_qf.h"
-+#endif
-+
-+namespace mfem
-+{
-+
-+namespace ceed
-+{
-+
-+#ifdef MFEM_USE_CEED
-+struct VectorFEMassOperatorInfo : public OperatorInfo
-+{
-+   VectorFEMassContext ctx = {0};
-+   template <typename CoeffType>
-+   VectorFEMassOperatorInfo(const mfem::FiniteElementSpace &fes, CoeffType *Q,
-+                            bool use_bdr = false, bool use_mf = false)
-+   {
-+      MFEM_VERIFY(fes.GetVDim() == 1,
-+                  "libCEED interface for vector FE does not support vdim > 1!");
-+      ctx.dim = fes.GetMesh()->Dimension() - use_bdr;
-+      ctx.space_dim = fes.GetMesh()->SpaceDimension();
-+      bool is_hdiv = (fes.FEColl()->GetMapType(ctx.dim) ==
-+                      mfem::FiniteElement::H_DIV);
-+      MFEM_VERIFY(is_hdiv ||
-+                  fes.FEColl()->GetMapType(ctx.dim) == mfem::FiniteElement::H_CURL,
-+                  "VectorFEMassIntegrator requires H(div) or H(curl) FE space!");
-+      if (!use_mf)
-+      {
-+         apply_func = ":f_apply_vecfemass";
-+         apply_qf = &f_apply_vecfemass;
-+      }
-+      else
-+      {
-+         build_func = "";
-+         build_qf = nullptr;
-+      }
-+      if (Q == nullptr)
-+      {
-+         ctx.coeff[0] = 1.0;
-+         if (!use_mf)
-+         {
-+            build_func = is_hdiv ? ":f_build_hdivmass_const_scalar" :
-+                         ":f_build_hcurlmass_const_scalar";
-+            build_qf = is_hdiv ? &f_build_hdivmass_const_scalar :
-+                       &f_build_hcurlmass_const_scalar;
-+         }
-+         else
-+         {
-+            apply_func = is_hdiv ? ":f_apply_hdivmass_mf_const_scalar" :
-+                         ":f_apply_hcurlmass_mf_const_scalar";
-+            apply_qf = is_hdiv ? &f_apply_hdivmass_mf_const_scalar :
-+                       &f_apply_hcurlmass_mf_const_scalar;
-+         }
-+      }
-+      else
-+      {
-+         InitCoefficient(*Q, is_hdiv, use_mf);
-+      }
-+      header = "/integrators/vecfemass/vecfemass_qf.h";
-+      trial_op = EvalMode::Interp;
-+      test_op = EvalMode::Interp;
-+      qdatasize = (ctx.dim * (ctx.dim + 1)) / 2;
-+   }
-+   void InitCoefficient(mfem::Coefficient &Q, bool is_hdiv, bool use_mf)
-+   {
-+      if (mfem::ConstantCoefficient *const_coeff =
-+             dynamic_cast<mfem::ConstantCoefficient *>(&Q))
-+      {
-+         ctx.coeff[0] = const_coeff->constant;
-+         if (!use_mf)
-+         {
-+            build_func = is_hdiv ? ":f_build_hdivmass_const_scalar" :
-+                         ":f_build_hcurlmass_const_scalar";
-+            build_qf = is_hdiv ? &f_build_hdivmass_const_scalar :
-+                       &f_build_hcurlmass_const_scalar;
-+         }
-+         else
-+         {
-+            apply_func = is_hdiv ? ":f_apply_hdivmass_mf_const_scalar" :
-+                         ":f_apply_hcurlmass_mf_const_scalar";
-+            apply_qf = is_hdiv ? &f_apply_hdivmass_mf_const_scalar :
-+                       &f_apply_hcurlmass_mf_const_scalar;
-+         }
-+      }
-+      else
-+      {
-+         if (!use_mf)
-+         {
-+            build_func = is_hdiv ? ":f_build_hdivmass_quad_scalar" :
-+                         ":f_build_hcurlmass_quad_scalar";
-+            build_qf = is_hdiv ? &f_build_hdivmass_quad_scalar :
-+                       &f_build_hcurlmass_quad_scalar;
-+         }
-+         else
-+         {
-+            apply_func = is_hdiv ? ":f_apply_hdivmass_mf_quad_scalar" :
-+                         ":f_apply_hcurlmass_mf_quad_scalar";
-+            apply_qf = is_hdiv ? &f_apply_hdivmass_mf_quad_scalar :
-+                       &f_apply_hcurlmass_mf_quad_scalar;
-+         }
-+      }
-+   }
-+   void InitCoefficient(mfem::VectorCoefficient &VQ, bool is_hdiv, bool use_mf)
-+   {
-+      if (mfem::VectorConstantCoefficient *const_coeff =
-+             dynamic_cast<mfem::VectorConstantCoefficient *>(&VQ))
-+      {
-+         const int vdim = VQ.GetVDim();
-+         MFEM_VERIFY(vdim <= LIBCEED_VECFEMASS_COEFF_COMP_MAX,
-+                     "VectorCoefficient dimension exceeds context storage!");
-+         const mfem::Vector &val = const_coeff->GetVec();
-+         for (int i = 0; i < vdim; i++)
-+         {
-+            ctx.coeff[i] = val[i];
-+         }
-+         if (!use_mf)
-+         {
-+            build_func = is_hdiv ? ":f_build_hdivmass_const_vector" :
-+                         ":f_build_hcurlmass_const_vector";
-+            build_qf = is_hdiv ? &f_build_hdivmass_const_vector :
-+                       &f_build_hcurlmass_const_vector;
-+         }
-+         else
-+         {
-+            apply_func = is_hdiv ? ":f_apply_hdivmass_mf_const_vector" :
-+                         ":f_apply_hcurlmass_mf_const_vector";
-+            apply_qf = is_hdiv ? &f_apply_hdivmass_mf_const_vector :
-+                       &f_apply_hcurlmass_mf_const_vector;
-+         }
-+      }
-+      else
-+      {
-+         if (!use_mf)
-+         {
-+            build_func = is_hdiv ? ":f_build_hdivmass_quad_vector" :
-+                         ":f_build_hcurlmass_quad_vector";
-+            build_qf = is_hdiv ? &f_build_hdivmass_quad_vector :
-+                       &f_build_hcurlmass_quad_vector;
-+         }
-+         else
-+         {
-+            apply_func = is_hdiv ? ":f_apply_hdivmass_mf_quad_vector" :
-+                         ":f_apply_hcurlmass_mf_quad_vector";
-+            apply_qf = is_hdiv ? &f_apply_hdivmass_mf_quad_vector :
-+                       &f_apply_hcurlmass_mf_quad_vector;
-+         }
-+      }
-+   }
-+   void InitCoefficient(mfem::MatrixCoefficient &MQ, bool is_hdiv, bool use_mf)
-+   {
-+      // Assumes matrix coefficient is symmetric
-+      if (mfem::MatrixConstantCoefficient *const_coeff =
-+             dynamic_cast<mfem::MatrixConstantCoefficient *>(&MQ))
-+      {
-+         const int vdim = MQ.GetVDim();
-+         MFEM_VERIFY((vdim * (vdim + 1)) / 2 <= LIBCEED_VECFEMASS_COEFF_COMP_MAX,
-+                     "MatrixCoefficient dimensions exceed context storage!");
-+         const mfem::DenseMatrix &val = const_coeff->GetMatrix();
-+         for (int j = 0; j < vdim; j++)
-+         {
-+            for (int i = j; i < vdim; i++)
-+            {
-+               const int idx = (j * vdim) - (((j - 1) * j) / 2) + i - j;
-+               ctx.coeff[idx] = val(i, j);
-+            }
-+         }
-+         if (!use_mf)
-+         {
-+            build_func = is_hdiv ? ":f_build_hdivmass_const_matrix" :
-+                         ":f_build_hcurlmass_const_matrix";
-+            build_qf = is_hdiv ? &f_build_hdivmass_const_matrix :
-+                       &f_build_hcurlmass_const_matrix;
-+         }
-+         else
-+         {
-+            apply_func = is_hdiv ? ":f_apply_hdivmass_mf_const_matrix" :
-+                         ":f_apply_hcurlmass_mf_const_matrix";
-+            apply_qf = is_hdiv ? &f_apply_hdivmass_mf_const_matrix :
-+                       &f_apply_hcurlmass_mf_const_matrix;
-+         }
-+      }
-+      else
-+      {
-+         if (!use_mf)
-+         {
-+            build_func = is_hdiv ? ":f_build_hdivmass_quad_matrix" :
-+                         ":f_build_hcurlmass_quad_matrix";
-+            build_qf = is_hdiv ? &f_build_hdivmass_quad_matrix :
-+                       &f_build_hcurlmass_quad_matrix;
-+         }
-+         else
-+         {
-+            apply_func = is_hdiv ? ":f_apply_hdivmass_mf_quad_matrix" :
-+                         ":f_apply_hcurlmass_mf_quad_matrix";
-+            apply_qf = is_hdiv ? &f_apply_hdivmass_mf_quad_matrix :
-+                       &f_apply_hcurlmass_mf_quad_matrix;
-+         }
-+      }
-+   }
-+};
-+#endif
-+
-+template <typename CoeffType>
-+PAVectorFEMassIntegrator::PAVectorFEMassIntegrator(
-+   const mfem::VectorFEMassIntegrator &integ,
-+   const mfem::FiniteElementSpace &fes,
-+   CoeffType *Q,
-+   const bool use_bdr)
-+{
-+#ifdef MFEM_USE_CEED
-+   VectorFEMassOperatorInfo info(fes, Q, use_bdr);
-+   Assemble(integ, info, fes, Q, use_bdr);
-+#else
-+   MFEM_ABORT("MFEM must be built with MFEM_USE_CEED=YES to use libCEED.");
-+#endif
-+}
-+
-+template <typename CoeffType>
-+MFVectorFEMassIntegrator::MFVectorFEMassIntegrator(
-+   const mfem::VectorFEMassIntegrator &integ,
-+   const mfem::FiniteElementSpace &fes,
-+   CoeffType *Q,
-+   const bool use_bdr)
-+{
-+#ifdef MFEM_USE_CEED
-+   VectorFEMassOperatorInfo info(fes, Q, use_bdr, true);
-+   Assemble(integ, info, fes, Q, use_bdr, true);
-+#else
-+   MFEM_ABORT("MFEM must be built with MFEM_USE_CEED=YES to use libCEED.");
-+#endif
-+}
-+
-+// @cond DOXYGEN_SKIP
-+
-+template PAVectorFEMassIntegrator::PAVectorFEMassIntegrator(
-+   const mfem::VectorFEMassIntegrator &, const mfem::FiniteElementSpace &,
-+   mfem::Coefficient *, const bool);
-+template PAVectorFEMassIntegrator::PAVectorFEMassIntegrator(
-+   const mfem::VectorFEMassIntegrator &, const mfem::FiniteElementSpace &,
-+   mfem::VectorCoefficient *, const bool);
-+template PAVectorFEMassIntegrator::PAVectorFEMassIntegrator(
-+   const mfem::VectorFEMassIntegrator &, const mfem::FiniteElementSpace &,
-+   mfem::MatrixCoefficient *, const bool);
-+
-+template MFVectorFEMassIntegrator::MFVectorFEMassIntegrator(
-+   const mfem::VectorFEMassIntegrator &, const mfem::FiniteElementSpace &,
-+   mfem::Coefficient *, const bool);
-+template MFVectorFEMassIntegrator::MFVectorFEMassIntegrator(
-+   const mfem::VectorFEMassIntegrator &, const mfem::FiniteElementSpace &,
-+   mfem::VectorCoefficient *, const bool);
-+template MFVectorFEMassIntegrator::MFVectorFEMassIntegrator(
-+   const mfem::VectorFEMassIntegrator &, const mfem::FiniteElementSpace &,
-+   mfem::MatrixCoefficient *, const bool);
-+
-+// @endcond
-+
-+} // namespace ceed
-+
-+} // namespace mfem
-diff --git a/fem/ceed/integrators/vecfemass/vecfemass.hpp b/fem/ceed/integrators/vecfemass/vecfemass.hpp
-new file mode 100644
-index 000000000..aa0ca2ea3
---- /dev/null
-+++ b/fem/ceed/integrators/vecfemass/vecfemass.hpp
-@@ -0,0 +1,51 @@
-+// Copyright (c) 2010-2023, Lawrence Livermore National Security, LLC. Produced
-+// at the Lawrence Livermore National Laboratory. All Rights reserved. See files
-+// LICENSE and NOTICE for details. LLNL-CODE-806117.
-+//
-+// This file is part of the MFEM library. For more information and source code
-+// availability visit https://mfem.org.
-+//
-+// MFEM is free software; you can redistribute it and/or modify it under the
-+// terms of the BSD-3 license. We welcome feedback and contributions, see file
-+// CONTRIBUTING.md for details.
-+
-+#ifndef MFEM_LIBCEED_VECFEMASS_HPP
-+#define MFEM_LIBCEED_VECFEMASS_HPP
-+
-+#include "../../interface/integrator.hpp"
-+#include "../../interface/mixed_operator.hpp"
-+#include "../../../fespace.hpp"
-+
-+namespace mfem
-+{
-+
-+namespace ceed
-+{
-+
-+/// Represent a VectorFEMassIntegrator with AssemblyLevel::Partial using libCEED.
-+class PAVectorFEMassIntegrator : public MixedOperator<Integrator>
-+{
-+public:
-+   template <typename CoeffType>
-+   PAVectorFEMassIntegrator(const mfem::VectorFEMassIntegrator &integ,
-+                            const mfem::FiniteElementSpace &fes,
-+                            CoeffType *Q,
-+                            const bool use_bdr = false);
-+};
-+
-+/// Represent a VectorFEMassIntegrator with AssemblyLevel::None using libCEED.
-+class MFVectorFEMassIntegrator : public MixedOperator<Integrator>
-+{
-+public:
-+   template <typename CoeffType>
-+   MFVectorFEMassIntegrator(const mfem::VectorFEMassIntegrator &integ,
-+                            const mfem::FiniteElementSpace &fes,
-+                            CoeffType *Q,
-+                            const bool use_bdr = false);
-+};
-+
-+}
-+
-+}
-+
-+#endif // MFEM_LIBCEED_VECFEMASS_HPP
-diff --git a/fem/ceed/integrators/vecfemass/vecfemass_qf.h b/fem/ceed/integrators/vecfemass/vecfemass_qf.h
-new file mode 100644
-index 000000000..571316ba8
---- /dev/null
-+++ b/fem/ceed/integrators/vecfemass/vecfemass_qf.h
-@@ -0,0 +1,1454 @@
-+// Copyright (c) 2010-2023, Lawrence Livermore National Security, LLC. Produced
-+// at the Lawrence Livermore National Laboratory. All Rights reserved. See files
-+// LICENSE and NOTICE for details. LLNL-CODE-806117.
-+//
-+// This file is part of the MFEM library. For more information and source code
-+// availability visit https://mfem.org.
-+//
-+// MFEM is free software; you can redistribute it and/or modify it under the
-+// terms of the BSD-3 license. We welcome feedback and contributions, see file
-+// CONTRIBUTING.md for details.
-+
-+#ifndef MFEM_LIBCEED_VECFEMASS_QF_H
-+#define MFEM_LIBCEED_VECFEMASS_QF_H
-+
-+#include "../util/util_qf.h"
-+
-+#define LIBCEED_VECFEMASS_COEFF_COMP_MAX 6
-+
-+struct VectorFEMassContext
-+{
-+   CeedInt dim, space_dim;
-+   CeedScalar coeff[LIBCEED_VECFEMASS_COEFF_COMP_MAX];
-+};
-+
-+/// libCEED QFunction for building quadrature data for an H(div) mass operator
-+/// with a scalar constant coefficient
-+CEED_QFUNCTION(f_build_hdivmass_const_scalar)(void *ctx, CeedInt Q,
-+                                              const CeedScalar *const *in,
-+                                              CeedScalar *const *out)
-+{
-+   VectorFEMassContext *bc = (VectorFEMassContext *)ctx;
-+   // in[0] is Jacobians with shape [dim, ncomp=space_dim, Q]
-+   // in[1] is quadrature weights, size (Q)
-+   //
-+   // At every quadrature point, compute qw/det(J) adj(J) C adj(J)^T (for
-+   // H(curl)) or qw/det(J) J^T C J (for H(div)) and store the symmetric part
-+   // of the result
-+   const CeedScalar *coeff = bc->coeff;
-+   const CeedScalar *J = in[0], *qw = in[1];
-+   CeedScalar *qd = out[0];
-+   switch (10 * bc->space_dim + bc->dim)
-+   {
-+      case 11:
-+         CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++)
-+         {
-+            const CeedScalar coeff0 = coeff[0];
-+            qd[i] = qw[i] * coeff0 * J[i];
-+         }
-+         break;
-+      case 21:
-+         CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++)
-+         {
-+            MultJtCJ21(J + i, Q, coeff, 1, 1, qw[i], Q, qd + i);
-+         }
-+         break;
-+      case 22:
-+         CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++)
-+         {
-+            MultJtCJ22(J + i, Q, coeff, 1, 1, qw[i], Q, qd + i);
-+         }
-+         break;
-+      case 32:
-+         CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++)
-+         {
-+            MultJtCJ32(J + i, Q, coeff, 1, 1, qw[i], Q, qd + i);
-+         }
-+         break;
-+      case 33:
-+         CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++)
-+         {
-+            MultJtCJ33(J + i, Q, coeff, 1, 1, qw[i], Q, qd + i);
-+         }
-+         break;
-+   }
-+   return 0;
-+}
-+
-+/// libCEED QFunction for building quadrature data for an H(div) mass operator
-+/// with a vector constant coefficient
-+CEED_QFUNCTION(f_build_hdivmass_const_vector)(void *ctx, CeedInt Q,
-+                                              const CeedScalar *const *in,
-+                                              CeedScalar *const *out)
-+{
-+   VectorFEMassContext *bc = (VectorFEMassContext *)ctx;
-+   // in[0] is Jacobians with shape [dim, ncomp=space_dim, Q]
-+   // in[1] is quadrature weights, size (Q)
-+   //
-+   // At every quadrature point, compute qw/det(J) adj(J) C adj(J)^T (for
-+   // H(curl)) or qw/det(J) J^T C J (for H(div)) and store the symmetric part
-+   // of the result
-+   const CeedScalar *coeff = bc->coeff;
-+   const CeedScalar *J = in[0], *qw = in[1];
-+   CeedScalar *qd = out[0];
-+   switch (10 * bc->space_dim + bc->dim)
-+   {
-+      case 21:
-+         CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++)
-+         {
-+            MultJtCJ21(J + i, Q, coeff, 1, 2, qw[i], Q, qd + i);
-+         }
-+         break;
-+      case 22:
-+         CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++)
-+         {
-+            MultJtCJ22(J + i, Q, coeff, 1, 2, qw[i], Q, qd + i);
-+         }
-+         break;
-+      case 32:
-+         CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++)
-+         {
-+            MultJtCJ32(J + i, Q, coeff, 1, 3, qw[i], Q, qd + i);
-+         }
-+         break;
-+      case 33:
-+         CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++)
-+         {
-+            MultJtCJ33(J + i, Q, coeff, 1, 3, qw[i], Q, qd + i);
-+         }
-+         break;
-+   }
-+   return 0;
-+}
-+
-+/// libCEED QFunction for building quadrature data for an H(div) mass operator
-+/// with a matrix constant coefficient
-+CEED_QFUNCTION(f_build_hdivmass_const_matrix)(void *ctx, CeedInt Q,
-+                                              const CeedScalar *const *in,
-+                                              CeedScalar *const *out)
-+{
-+   VectorFEMassContext *bc = (VectorFEMassContext *)ctx;
-+   // in[0] is Jacobians with shape [dim, ncomp=space_dim, Q]
-+   // in[1] is quadrature weights, size (Q)
-+   //
-+   // At every quadrature point, compute qw/det(J) adj(J) C adj(J)^T (for
-+   // H(curl)) or qw/det(J) J^T C J (for H(div)) and store the symmetric part
-+   // of the result
-+   const CeedScalar *coeff = bc->coeff;
-+   const CeedScalar *J = in[0], *qw = in[1];
-+   CeedScalar *qd = out[0];
-+   switch (10 * bc->space_dim + bc->dim)
-+   {
-+      case 21:
-+         CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++)
-+         {
-+            MultJtCJ21(J + i, Q, coeff, 1, 3, qw[i], Q, qd + i);
-+         }
-+         break;
-+      case 22:
-+         CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++)
-+         {
-+            MultJtCJ22(J + i, Q, coeff, 1, 3, qw[i], Q, qd + i);
-+         }
-+         break;
-+      case 32:
-+         CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++)
-+         {
-+            MultJtCJ32(J + i, Q, coeff, 1, 6, qw[i], Q, qd + i);
-+         }
-+         break;
-+      case 33:
-+         CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++)
-+         {
-+            MultJtCJ33(J + i, Q, coeff, 1, 6, qw[i], Q, qd + i);
-+         }
-+         break;
-+   }
-+   return 0;
-+}
-+
-+/// libCEED QFunction for building quadrature data for an H(curl) mass operator
-+/// with a scalar constant coefficient
-+CEED_QFUNCTION(f_build_hcurlmass_const_scalar)(void *ctx, CeedInt Q,
-+                                               const CeedScalar *const *in,
-+                                               CeedScalar *const *out)
-+{
-+   VectorFEMassContext *bc = (VectorFEMassContext *)ctx;
-+   // in[0] is Jacobians with shape [dim, ncomp=space_dim, Q]
-+   // in[1] is quadrature weights, size (Q)
-+   //
-+   // At every quadrature point, compute qw/det(J) adj(J) C adj(J)^T (for
-+   // H(curl)) or qw/det(J) J^T C J (for H(div)) and store the symmetric part
-+   // of the result
-+   const CeedScalar *coeff = bc->coeff;
-+   const CeedScalar *J = in[0], *qw = in[1];
-+   CeedScalar *qd = out[0];
-+   switch (10 * bc->space_dim + bc->dim)
-+   {
-+      case 11:
-+         CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++)
-+         {
-+            const CeedScalar coeff0 = coeff[0];
-+            qd[i] = qw[i] * coeff0 / J[i];
-+         }
-+         break;
-+      case 21:
-+         CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++)
-+         {
-+            MultAdjJCAdjJt21(J + i, Q, coeff, 1, 1, qw[i], Q, qd + i);
-+         }
-+         break;
-+      case 22:
-+         CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++)
-+         {
-+            MultAdjJCAdjJt22(J + i, Q, coeff, 1, 1, qw[i], Q, qd + i);
-+         }
-+         break;
-+      case 32:
-+         CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++)
-+         {
-+            MultAdjJCAdjJt32(J + i, Q, coeff, 1, 1, qw[i], Q, qd + i);
-+         }
-+         break;
-+      case 33:
-+         CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++)
-+         {
-+            MultAdjJCAdjJt33(J + i, Q, coeff, 1, 1, qw[i], Q, qd + i);
-+         }
-+         break;
-+   }
-+   return 0;
-+}
-+
-+/// libCEED QFunction for building quadrature data for an H(curl) mass operator
-+/// with a vector constant coefficient
-+CEED_QFUNCTION(f_build_hcurlmass_const_vector)(void *ctx, CeedInt Q,
-+                                               const CeedScalar *const *in,
-+                                               CeedScalar *const *out)
-+{
-+   VectorFEMassContext *bc = (VectorFEMassContext *)ctx;
-+   // in[0] is Jacobians with shape [dim, ncomp=space_dim, Q]
-+   // in[1] is quadrature weights, size (Q)
-+   //
-+   // At every quadrature point, compute qw/det(J) adj(J) C adj(J)^T (for
-+   // H(curl)) or qw/det(J) J^T C J (for H(div)) and store the symmetric part
-+   // of the result
-+   const CeedScalar *coeff = bc->coeff;
-+   const CeedScalar *J = in[0], *qw = in[1];
-+   CeedScalar *qd = out[0];
-+   switch (10 * bc->space_dim + bc->dim)
-+   {
-+      case 21:
-+         CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++)
-+         {
-+            MultAdjJCAdjJt21(J + i, Q, coeff, 1, 2, qw[i], Q, qd + i);
-+         }
-+         break;
-+      case 22:
-+         CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++)
-+         {
-+            MultAdjJCAdjJt22(J + i, Q, coeff, 1, 2, qw[i], Q, qd + i);
-+         }
-+         break;
-+      case 32:
-+         CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++)
-+         {
-+            MultAdjJCAdjJt32(J + i, Q, coeff, 1, 3, qw[i], Q, qd + i);
-+         }
-+         break;
-+      case 33:
-+         CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++)
-+         {
-+            MultAdjJCAdjJt33(J + i, Q, coeff, 1, 3, qw[i], Q, qd + i);
-+         }
-+         break;
-+   }
-+   return 0;
-+}
-+
-+/// libCEED QFunction for building quadrature data for an H(curl) mass operator
-+/// with a matrix constant coefficient
-+CEED_QFUNCTION(f_build_hcurlmass_const_matrix)(void *ctx, CeedInt Q,
-+                                               const CeedScalar *const *in,
-+                                               CeedScalar *const *out)
-+{
-+   VectorFEMassContext *bc = (VectorFEMassContext *)ctx;
-+   // in[0] is Jacobians with shape [dim, ncomp=space_dim, Q]
-+   // in[1] is quadrature weights, size (Q)
-+   //
-+   // At every quadrature point, compute qw/det(J) adj(J) C adj(J)^T (for
-+   // H(curl)) or qw/det(J) J^T C J (for H(div)) and store the symmetric part
-+   // of the result
-+   const CeedScalar *coeff = bc->coeff;
-+   const CeedScalar *J = in[0], *qw = in[1];
-+   CeedScalar *qd = out[0];
-+   switch (10 * bc->space_dim + bc->dim)
-+   {
-+      case 21:
-+         CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++)
-+         {
-+            MultAdjJCAdjJt21(J + i, Q, coeff, 1, 3, qw[i], Q, qd + i);
-+         }
-+         break;
-+      case 22:
-+         CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++)
-+         {
-+            MultAdjJCAdjJt22(J + i, Q, coeff, 1, 3, qw[i], Q, qd + i);
-+         }
-+         break;
-+      case 32:
-+         CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++)
-+         {
-+            MultAdjJCAdjJt32(J + i, Q, coeff, 1, 6, qw[i], Q, qd + i);
-+         }
-+         break;
-+      case 33:
-+         CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++)
-+         {
-+            MultAdjJCAdjJt33(J + i, Q, coeff, 1, 6, qw[i], Q, qd + i);
-+         }
-+         break;
-+   }
-+   return 0;
-+}
-+
-+/// libCEED QFunction for building quadrature data for an H(div) mass operator
-+/// with a scalar coefficient evaluated at quadrature points
-+CEED_QFUNCTION(f_build_hdivmass_quad_scalar)(void *ctx, CeedInt Q,
-+                                             const CeedScalar *const *in,
-+                                             CeedScalar *const *out)
-+{
-+   VectorFEMassContext *bc = (VectorFEMassContext *)ctx;
-+   // in[0] is coefficients with shape [ncomp=1, Q]
-+   // in[1] is Jacobians with shape [dim, ncomp=space_dim, Q]
-+   // in[2] is quadrature weights, size (Q)
-+   //
-+   // At every quadrature point, compute qw/det(J) adj(J) C adj(J)^T (for
-+   // H(curl)) or qw/det(J) J^T C J (for H(div)) and store the symmetric part
-+   // of the result
-+   const CeedScalar *c = in[0], *J = in[1], *qw = in[2];
-+   CeedScalar *qd = out[0];
-+   switch (10 * bc->space_dim + bc->dim)
-+   {
-+      case 11:
-+         CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++)
-+         {
-+            qd[i] = qw[i] * c[i] * J[i];
-+         }
-+         break;
-+      case 21:
-+         CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++)
-+         {
-+            MultJtCJ21(J + i, Q, c + i, Q, 1, qw[i], Q, qd + i);
-+         }
-+         break;
-+      case 22:
-+         CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++)
-+         {
-+            MultJtCJ22(J + i, Q, c + i, Q, 1, qw[i], Q, qd + i);
-+         }
-+         break;
-+      case 32:
-+         CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++)
-+         {
-+            MultJtCJ32(J + i, Q, c + i, Q, 1, qw[i], Q, qd + i);
-+         }
-+         break;
-+      case 33:
-+         CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++)
-+         {
-+            MultJtCJ33(J + i, Q, c + i, Q, 1, qw[i], Q, qd + i);
-+         }
-+         break;
-+   }
-+   return 0;
-+}
-+
-+/// libCEED QFunction for building quadrature data for an H(div) mass operator
-+/// with a vector coefficient evaluated at quadrature points
-+CEED_QFUNCTION(f_build_hdivmass_quad_vector)(void *ctx, CeedInt Q,
-+                                             const CeedScalar *const *in,
-+                                             CeedScalar *const *out)
-+{
-+   VectorFEMassContext *bc = (VectorFEMassContext *)ctx;
-+   // in[0] is coefficients with shape [ncomp=space_dim, Q]
-+   // in[1] is Jacobians with shape [dim, ncomp=space_dim, Q]
-+   // in[2] is quadrature weights, size (Q)
-+   //
-+   // At every quadrature point, compute qw/det(J) adj(J) C adj(J)^T (for
-+   // H(curl)) or qw/det(J) J^T C J (for H(div)) and store the symmetric part
-+   // of the result
-+   const CeedScalar *c = in[0], *J = in[1], *qw = in[2];
-+   CeedScalar *qd = out[0];
-+   switch (10 * bc->space_dim + bc->dim)
-+   {
-+      case 21:
-+         CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++)
-+         {
-+            MultJtCJ21(J + i, Q, c + i, Q, 2, qw[i], Q, qd + i);
-+         }
-+         break;
-+      case 22:
-+         CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++)
-+         {
-+            MultJtCJ22(J + i, Q, c + i, Q, 2, qw[i], Q, qd + i);
-+         }
-+         break;
-+      case 32:
-+         CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++)
-+         {
-+            MultJtCJ32(J + i, Q, c + i, Q, 3, qw[i], Q, qd + i);
-+         }
-+         break;
-+      case 33:
-+         CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++)
-+         {
-+            MultJtCJ33(J + i, Q, c + i, Q, 3, qw[i], Q, qd + i);
-+         }
-+         break;
-+   }
-+   return 0;
-+}
-+
-+/// libCEED QFunction for building quadrature data for an H(div) mass operator
-+/// with a matrix coefficient evaluated at quadrature points
-+CEED_QFUNCTION(f_build_hdivmass_quad_matrix)(void *ctx, CeedInt Q,
-+                                             const CeedScalar *const *in,
-+                                             CeedScalar *const *out)
-+{
-+   VectorFEMassContext *bc = (VectorFEMassContext *)ctx;
-+   // in[0] is coefficients with shape [ncomp=space_dim*(space_dim+1)/2, Q]
-+   // in[1] is Jacobians with shape [dim, ncomp=space_dim, Q]
-+   // in[2] is quadrature weights, size (Q)
-+   //
-+   // At every quadrature point, compute qw/det(J) adj(J) C adj(J)^T (for
-+   // H(curl)) or qw/det(J) J^T C J (for H(div)) and store the symmetric part
-+   // of the result
-+   const CeedScalar *c = in[0], *J = in[1], *qw = in[2];
-+   CeedScalar *qd = out[0];
-+   switch (10 * bc->space_dim + bc->dim)
-+   {
-+      case 21:
-+         CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++)
-+         {
-+            MultJtCJ21(J + i, Q, c + i, Q, 3, qw[i], Q, qd + i);
-+         }
-+         break;
-+      case 22:
-+         CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++)
-+         {
-+            MultJtCJ22(J + i, Q, c + i, Q, 3, qw[i], Q, qd + i);
-+         }
-+         break;
-+      case 32:
-+         CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++)
-+         {
-+            MultJtCJ32(J + i, Q, c + i, Q, 6, qw[i], Q, qd + i);
-+         }
-+         break;
-+      case 33:
-+         CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++)
-+         {
-+            MultJtCJ33(J + i, Q, c + i, Q, 6, qw[i], Q, qd + i);
-+         }
-+         break;
-+   }
-+   return 0;
-+}
-+
-+/// libCEED QFunction for building quadrature data for an H(curl) mass operator
-+/// with a scalar coefficient evaluated at quadrature points
-+CEED_QFUNCTION(f_build_hcurlmass_quad_scalar)(void *ctx, CeedInt Q,
-+                                              const CeedScalar *const *in,
-+                                              CeedScalar *const *out)
-+{
-+   VectorFEMassContext *bc = (VectorFEMassContext *)ctx;
-+   // in[0] is coefficients with shape [ncomp=1, Q]
-+   // in[1] is Jacobians with shape [dim, ncomp=space_dim, Q]
-+   // in[2] is quadrature weights, size (Q)
-+   //
-+   // At every quadrature point, compute qw/det(J) adj(J) C adj(J)^T (for
-+   // H(curl)) or qw/det(J) J^T C J (for H(div)) and store the symmetric part
-+   // of the result
-+   const CeedScalar *c = in[0], *J = in[1], *qw = in[2];
-+   CeedScalar *qd = out[0];
-+   switch (10 * bc->space_dim + bc->dim)
-+   {
-+      case 11:
-+         CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++)
-+         {
-+            qd[i] = qw[i] * c[i] / J[i];
-+         }
-+         break;
-+      case 21:
-+         CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++)
-+         {
-+            MultAdjJCAdjJt21(J + i, Q, c + i, Q, 1, qw[i], Q, qd + i);
-+         }
-+         break;
-+      case 22:
-+         CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++)
-+         {
-+            MultAdjJCAdjJt22(J + i, Q, c + i, Q, 1, qw[i], Q, qd + i);
-+         }
-+         break;
-+      case 32:
-+         CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++)
-+         {
-+            MultAdjJCAdjJt32(J + i, Q, c + i, Q, 1, qw[i], Q, qd + i);
-+         }
-+         break;
-+      case 33:
-+         CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++)
-+         {
-+            MultAdjJCAdjJt33(J + i, Q, c + i, Q, 1, qw[i], Q, qd + i);
-+         }
-+         break;
-+   }
-+   return 0;
-+}
-+
-+/// libCEED QFunction for building quadrature data for an H(curl) mass operator
-+/// with a vector coefficient evaluated at quadrature points
-+CEED_QFUNCTION(f_build_hcurlmass_quad_vector)(void *ctx, CeedInt Q,
-+                                              const CeedScalar *const *in,
-+                                              CeedScalar *const *out)
-+{
-+   VectorFEMassContext *bc = (VectorFEMassContext *)ctx;
-+   // in[0] is coefficients with shape [ncomp=space_dim, Q]
-+   // in[1] is Jacobians with shape [dim, ncomp=space_dim, Q]
-+   // in[2] is quadrature weights, size (Q)
-+   //
-+   // At every quadrature point, compute qw/det(J) adj(J) C adj(J)^T (for
-+   // H(curl)) or qw/det(J) J^T C J (for H(div)) and store the symmetric part
-+   // of the result
-+   const CeedScalar *c = in[0], *J = in[1], *qw = in[2];
-+   CeedScalar *qd = out[0];
-+   switch (10 * bc->space_dim + bc->dim)
-+   {
-+      case 21:
-+         CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++)
-+         {
-+            MultAdjJCAdjJt21(J + i, Q, c + i, Q, 2, qw[i], Q, qd + i);
-+         }
-+         break;
-+      case 22:
-+         CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++)
-+         {
-+            MultAdjJCAdjJt22(J + i, Q, c + i, Q, 2, qw[i], Q, qd + i);
-+         }
-+         break;
-+      case 32:
-+         CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++)
-+         {
-+            MultAdjJCAdjJt32(J + i, Q, c + i, Q, 3, qw[i], Q, qd + i);
-+         }
-+         break;
-+      case 33:
-+         CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++)
-+         {
-+            MultAdjJCAdjJt33(J + i, Q, c + i, Q, 3, qw[i], Q, qd + i);
-+         }
-+         break;
-+   }
-+   return 0;
-+}
-+
-+/// libCEED QFunction for building quadrature data for an H(curl) mass operator
-+/// with a matrix coefficient evaluated at quadrature points
-+CEED_QFUNCTION(f_build_hcurlmass_quad_matrix)(void *ctx, CeedInt Q,
-+                                              const CeedScalar *const *in,
-+                                              CeedScalar *const *out)
-+{
-+   VectorFEMassContext *bc = (VectorFEMassContext *)ctx;
-+   // in[0] is coefficients with shape [ncomp=space_dim*(space_dim+1)/2, Q]
-+   // in[1] is Jacobians with shape [dim, ncomp=space_dim, Q]
-+   // in[2] is quadrature weights, size (Q)
-+   //
-+   // At every quadrature point, compute qw/det(J) adj(J) C adj(J)^T (for
-+   // H(curl)) or qw/det(J) J^T C J (for H(div)) and store the symmetric part
-+   // of the result
-+   const CeedScalar *c = in[0], *J = in[1], *qw = in[2];
-+   CeedScalar *qd = out[0];
-+   switch (10 * bc->space_dim + bc->dim)
-+   {
-+      case 21:
-+         CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++)
-+         {
-+            MultAdjJCAdjJt21(J + i, Q, c + i, Q, 3, qw[i], Q, qd + i);
-+         }
-+         break;
-+      case 22:
-+         CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++)
-+         {
-+            MultAdjJCAdjJt22(J + i, Q, c + i, Q, 3, qw[i], Q, qd + i);
-+         }
-+         break;
-+      case 32:
-+         CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++)
-+         {
-+            MultAdjJCAdjJt32(J + i, Q, c + i, Q, 6, qw[i], Q, qd + i);
-+         }
-+         break;
-+      case 33:
-+         CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++)
-+         {
-+            MultAdjJCAdjJt33(J + i, Q, c + i, Q, 6, qw[i], Q, qd + i);
-+         }
-+         break;
-+   }
-+   return 0;
-+}
-+
-+/// libCEED QFunction for applying a vector FE mass operator
-+CEED_QFUNCTION(f_apply_vecfemass)(void *ctx, CeedInt Q,
-+                                  const CeedScalar *const *in,
-+                                  CeedScalar *const *out)
-+{
-+   VectorFEMassContext *bc = (VectorFEMassContext *)ctx;
-+   // in[0], out[0] have shape [dim, ncomp=1, Q]
-+   const CeedScalar *u = in[0], *qd = in[1];
-+   CeedScalar *v = out[0];
-+   switch (bc->dim)
-+   {
-+      case 1:
-+         CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++)
-+         {
-+            v[i] = qd[i] * u[i];
-+         }
-+         break;
-+      case 2:
-+         CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++)
-+         {
-+            const CeedScalar u0 = u[i + Q * 0];
-+            const CeedScalar u1 = u[i + Q * 1];
-+            v[i + Q * 0] = qd[i + Q * 0] * u0 + qd[i + Q * 1] * u1;
-+            v[i + Q * 1] = qd[i + Q * 1] * u0 + qd[i + Q * 2] * u1;
-+         }
-+         break;
-+      case 3:
-+         CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++)
-+         {
-+            const CeedScalar u0 = u[i + Q * 0];
-+            const CeedScalar u1 = u[i + Q * 1];
-+            const CeedScalar u2 = u[i + Q * 2];
-+            v[i + Q * 0] = qd[i + Q * 0] * u0 + qd[i + Q * 1] * u1 + qd[i + Q * 2] * u2;
-+            v[i + Q * 1] = qd[i + Q * 1] * u0 + qd[i + Q * 3] * u1 + qd[i + Q * 4] * u2;
-+            v[i + Q * 2] = qd[i + Q * 2] * u0 + qd[i + Q * 4] * u1 + qd[i + Q * 5] * u2;
-+         }
-+         break;
-+   }
-+   return 0;
-+}
-+
-+/// libCEED QFunction for applying an H(div) mass operator with a scalar
-+/// constant coefficient
-+CEED_QFUNCTION(f_apply_hdivmass_mf_const_scalar)(void *ctx, CeedInt Q,
-+                                                 const CeedScalar *const *in,
-+                                                 CeedScalar *const *out)
-+{
-+   VectorFEMassContext *bc = (VectorFEMassContext *)ctx;
-+   // in[0], out[0] have shape [dim, ncomp=1, Q]
-+   // in[1] is Jacobians with shape [dim, ncomp=space_dim, Q]
-+   // in[2] is quadrature weights, size (Q)
-+   //
-+   // At every quadrature point, compute qw/det(J) adj(J) C adj(J)^T (for
-+   // H(curl)) or qw/det(J) J^T C J (for H(div))
-+   const CeedScalar *coeff = bc->coeff;
-+   const CeedScalar *u = in[0], *J = in[1], *qw = in[2];
-+   CeedScalar *v = out[0];
-+   switch (10 * bc->space_dim + bc->dim)
-+   {
-+      case 11:
-+         CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++)
-+         {
-+            const CeedScalar coeff0 = coeff[0];
-+            const CeedScalar qd = qw[i] * coeff0 * J[i];
-+            v[i] = qd * u[i];
-+         }
-+         break;
-+      case 21:
-+         CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++)
-+         {
-+            CeedScalar qd;
-+            MultJtCJ21(J + i, Q, coeff, 1, 1, qw[i], 1, &qd);
-+            v[i] = qd * u[i];
-+         }
-+         break;
-+      case 22:
-+         CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++)
-+         {
-+            CeedScalar qd[3];
-+            MultJtCJ22(J + i, Q, coeff, 1, 1, qw[i], 1, qd);
-+            const CeedScalar u0 = u[i + Q * 0];
-+            const CeedScalar u1 = u[i + Q * 1];
-+            v[i + Q * 0] = qd[0] * u0 + qd[1] * u1;
-+            v[i + Q * 1] = qd[1] * u0 + qd[2] * u1;
-+         }
-+         break;
-+      case 32:
-+         CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++)
-+         {
-+            CeedScalar qd[3];
-+            MultJtCJ32(J + i, Q, coeff, 1, 1, qw[i], 1, qd);
-+            const CeedScalar u0 = u[i + Q * 0];
-+            const CeedScalar u1 = u[i + Q * 1];
-+            v[i + Q * 0] = qd[0] * u0 + qd[1] * u1;
-+            v[i + Q * 1] = qd[1] * u0 + qd[2] * u1;
-+         }
-+         break;
-+      case 33:
-+         CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++)
-+         {
-+            CeedScalar qd[6];
-+            MultJtCJ33(J + i, Q, coeff, 1, 1, qw[i], 1, qd);
-+            const CeedScalar u0 = u[i + Q * 0];
-+            const CeedScalar u1 = u[i + Q * 1];
-+            const CeedScalar u2 = u[i + Q * 2];
-+            v[i + Q * 0] = qd[0] * u0 + qd[1] * u1 + qd[2] * u2;
-+            v[i + Q * 1] = qd[1] * u0 + qd[3] * u1 + qd[4] * u2;
-+            v[i + Q * 2] = qd[2] * u0 + qd[4] * u1 + qd[5] * u2;
-+         }
-+         break;
-+   }
-+   return 0;
-+}
-+
-+/// libCEED QFunction for applying an H(div) mass operator with a vector
-+/// constant coefficient
-+CEED_QFUNCTION(f_apply_hdivmass_mf_const_vector)(void *ctx, CeedInt Q,
-+                                                 const CeedScalar *const *in,
-+                                                 CeedScalar *const *out)
-+{
-+   VectorFEMassContext *bc = (VectorFEMassContext *)ctx;
-+   // in[0], out[0] have shape [dim, ncomp=1, Q]
-+   // in[1] is Jacobians with shape [dim, ncomp=space_dim, Q]
-+   // in[2] is quadrature weights, size (Q)
-+   //
-+   // At every quadrature point, compute qw/det(J) adj(J) C adj(J)^T (for
-+   // H(curl)) or qw/det(J) J^T C J (for H(div))
-+   const CeedScalar *coeff = bc->coeff;
-+   const CeedScalar *u = in[0], *J = in[1], *qw = in[2];
-+   CeedScalar *v = out[0];
-+   switch (10 * bc->space_dim + bc->dim)
-+   {
-+      case 21:
-+         CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++)
-+         {
-+            CeedScalar qd;
-+            MultJtCJ21(J + i, Q, coeff, 1, 2, qw[i], 1, &qd);
-+            v[i] = qd * u[i];
-+         }
-+         break;
-+      case 22:
-+         CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++)
-+         {
-+            CeedScalar qd[3];
-+            MultJtCJ22(J + i, Q, coeff, 1, 2, qw[i], 1, qd);
-+            const CeedScalar u0 = u[i + Q * 0];
-+            const CeedScalar u1 = u[i + Q * 1];
-+            v[i + Q * 0] = qd[0] * u0 + qd[1] * u1;
-+            v[i + Q * 1] = qd[1] * u0 + qd[2] * u1;
-+         }
-+         break;
-+      case 32:
-+         CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++)
-+         {
-+            CeedScalar qd[3];
-+            MultJtCJ32(J + i, Q, coeff, 1, 3, qw[i], 1, qd);
-+            const CeedScalar u0 = u[i + Q * 0];
-+            const CeedScalar u1 = u[i + Q * 1];
-+            v[i + Q * 0] = qd[0] * u0 + qd[1] * u1;
-+            v[i + Q * 1] = qd[1] * u0 + qd[2] * u1;
-+         }
-+         break;
-+      case 33:
-+         CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++)
-+         {
-+            CeedScalar qd[6];
-+            MultJtCJ33(J + i, Q, coeff, 1, 3, qw[i], 1, qd);
-+            const CeedScalar u0 = u[i + Q * 0];
-+            const CeedScalar u1 = u[i + Q * 1];
-+            const CeedScalar u2 = u[i + Q * 2];
-+            v[i + Q * 0] = qd[0] * u0 + qd[1] * u1 + qd[2] * u2;
-+            v[i + Q * 1] = qd[1] * u0 + qd[3] * u1 + qd[4] * u2;
-+            v[i + Q * 2] = qd[2] * u0 + qd[4] * u1 + qd[5] * u2;
-+         }
-+         break;
-+   }
-+   return 0;
-+}
-+
-+/// libCEED QFunction for applying an H(div) mass operator with a matrix
-+/// constant coefficient
-+CEED_QFUNCTION(f_apply_hdivmass_mf_const_matrix)(void *ctx, CeedInt Q,
-+                                                 const CeedScalar *const *in,
-+                                                 CeedScalar *const *out)
-+{
-+   VectorFEMassContext *bc = (VectorFEMassContext *)ctx;
-+   // in[0], out[0] have shape [dim, ncomp=1, Q]
-+   // in[1] is Jacobians with shape [dim, ncomp=space_dim, Q]
-+   // in[2] is quadrature weights, size (Q)
-+   //
-+   // At every quadrature point, compute qw/det(J) adj(J) C adj(J)^T (for
-+   // H(curl)) or qw/det(J) J^T C J (for H(div))
-+   const CeedScalar *coeff = bc->coeff;
-+   const CeedScalar *u = in[0], *J = in[1], *qw = in[2];
-+   CeedScalar *v = out[0];
-+   switch (10 * bc->space_dim + bc->dim)
-+   {
-+      case 21:
-+         CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++)
-+         {
-+            CeedScalar qd;
-+            MultJtCJ21(J + i, Q, coeff, 1, 3, qw[i], 1, &qd);
-+            v[i] = qd * u[i];
-+         }
-+         break;
-+      case 22:
-+         CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++)
-+         {
-+            CeedScalar qd[3];
-+            MultJtCJ22(J + i, Q, coeff, 1, 3, qw[i], 1, qd);
-+            const CeedScalar u0 = u[i + Q * 0];
-+            const CeedScalar u1 = u[i + Q * 1];
-+            v[i + Q * 0] = qd[0] * u0 + qd[1] * u1;
-+            v[i + Q * 1] = qd[1] * u0 + qd[2] * u1;
-+         }
-+         break;
-+      case 32:
-+         CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++)
-+         {
-+            CeedScalar qd[3];
-+            MultJtCJ32(J + i, Q, coeff, 1, 6, qw[i], 1, qd);
-+            const CeedScalar u0 = u[i + Q * 0];
-+            const CeedScalar u1 = u[i + Q * 1];
-+            v[i + Q * 0] = qd[0] * u0 + qd[1] * u1;
-+            v[i + Q * 1] = qd[1] * u0 + qd[2] * u1;
-+         }
-+         break;
-+      case 33:
-+         CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++)
-+         {
-+            CeedScalar qd[6];
-+            MultJtCJ33(J + i, Q, coeff, 1, 6, qw[i], 1, qd);
-+            const CeedScalar u0 = u[i + Q * 0];
-+            const CeedScalar u1 = u[i + Q * 1];
-+            const CeedScalar u2 = u[i + Q * 2];
-+            v[i + Q * 0] = qd[0] * u0 + qd[1] * u1 + qd[2] * u2;
-+            v[i + Q * 1] = qd[1] * u0 + qd[3] * u1 + qd[4] * u2;
-+            v[i + Q * 2] = qd[2] * u0 + qd[4] * u1 + qd[5] * u2;
-+         }
-+         break;
-+   }
-+   return 0;
-+}
-+
-+/// libCEED QFunction for applying an H(curl) mass operator with a scalar
-+/// constant coefficient
-+CEED_QFUNCTION(f_apply_hcurlmass_mf_const_scalar)(void *ctx, CeedInt Q,
-+                                                  const CeedScalar *const *in,
-+                                                  CeedScalar *const *out)
-+{
-+   VectorFEMassContext *bc = (VectorFEMassContext *)ctx;
-+   // in[0], out[0] have shape [dim, ncomp=1, Q]
-+   // in[1] is Jacobians with shape [dim, ncomp=space_dim, Q]
-+   // in[2] is quadrature weights, size (Q)
-+   //
-+   // At every quadrature point, compute qw/det(J) adj(J) C adj(J)^T (for
-+   // H(curl)) or qw/det(J) J^T C J (for H(div))
-+   const CeedScalar *coeff = bc->coeff;
-+   const CeedScalar *u = in[0], *J = in[1], *qw = in[2];
-+   CeedScalar *v = out[0];
-+   switch (10 * bc->space_dim + bc->dim)
-+   {
-+      case 11:
-+         CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++)
-+         {
-+            const CeedScalar coeff0 = coeff[0];
-+            const CeedScalar qd = qw[i] * coeff0 / J[i];
-+            v[i] = qd * u[i];
-+         }
-+         break;
-+      case 21:
-+         CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++)
-+         {
-+            CeedScalar qd;
-+            MultAdjJCAdjJt21(J + i, Q, coeff, 1, 1, qw[i], 1, &qd);
-+            v[i] = qd * u[i];
-+         }
-+         break;
-+      case 22:
-+         CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++)
-+         {
-+            CeedScalar qd[3];
-+            MultAdjJCAdjJt22(J + i, Q, coeff, 1, 1, qw[i], 1, qd);
-+            const CeedScalar u0 = u[i + Q * 0];
-+            const CeedScalar u1 = u[i + Q * 1];
-+            v[i + Q * 0] = qd[0] * u0 + qd[1] * u1;
-+            v[i + Q * 1] = qd[1] * u0 + qd[2] * u1;
-+         }
-+         break;
-+      case 32:
-+         CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++)
-+         {
-+            CeedScalar qd[3];
-+            MultAdjJCAdjJt32(J + i, Q, coeff, 1, 1, qw[i], 1, qd);
-+            const CeedScalar u0 = u[i + Q * 0];
-+            const CeedScalar u1 = u[i + Q * 1];
-+            v[i + Q * 0] = qd[0] * u0 + qd[1] * u1;
-+            v[i + Q * 1] = qd[1] * u0 + qd[2] * u1;
-+         }
-+         break;
-+      case 33:
-+         CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++)
-+         {
-+            CeedScalar qd[6];
-+            MultAdjJCAdjJt33(J + i, Q, coeff, 1, 1, qw[i], 1, qd);
-+            const CeedScalar u0 = u[i + Q * 0];
-+            const CeedScalar u1 = u[i + Q * 1];
-+            const CeedScalar u2 = u[i + Q * 2];
-+            v[i + Q * 0] = qd[0] * u0 + qd[1] * u1 + qd[2] * u2;
-+            v[i + Q * 1] = qd[1] * u0 + qd[3] * u1 + qd[4] * u2;
-+            v[i + Q * 2] = qd[2] * u0 + qd[4] * u1 + qd[5] * u2;
-+         }
-+         break;
-+   }
-+   return 0;
-+}
-+
-+/// libCEED QFunction for applying an H(curl) mass operator with a vector
-+/// constant coefficient
-+CEED_QFUNCTION(f_apply_hcurlmass_mf_const_vector)(void *ctx, CeedInt Q,
-+                                                  const CeedScalar *const *in,
-+                                                  CeedScalar *const *out)
-+{
-+   VectorFEMassContext *bc = (VectorFEMassContext *)ctx;
-+   // in[0], out[0] have shape [dim, ncomp=1, Q]
-+   // in[1] is Jacobians with shape [dim, ncomp=space_dim, Q]
-+   // in[2] is quadrature weights, size (Q)
-+   //
-+   // At every quadrature point, compute qw/det(J) adj(J) C adj(J)^T (for
-+   // H(curl)) or qw/det(J) J^T C J (for H(div))
-+   const CeedScalar *coeff = bc->coeff;
-+   const CeedScalar *u = in[0], *J = in[1], *qw = in[2];
-+   CeedScalar *v = out[0];
-+   switch (10 * bc->space_dim + bc->dim)
-+   {
-+      case 21:
-+         CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++)
-+         {
-+            CeedScalar qd;
-+            MultAdjJCAdjJt21(J + i, Q, coeff, 1, 2, qw[i], 1, &qd);
-+            v[i] = qd * u[i];
-+         }
-+         break;
-+      case 22:
-+         CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++)
-+         {
-+            CeedScalar qd[3];
-+            MultAdjJCAdjJt22(J + i, Q, coeff, 1, 2, qw[i], 1, qd);
-+            const CeedScalar u0 = u[i + Q * 0];
-+            const CeedScalar u1 = u[i + Q * 1];
-+            v[i + Q * 0] = qd[0] * u0 + qd[1] * u1;
-+            v[i + Q * 1] = qd[1] * u0 + qd[2] * u1;
-+         }
-+         break;
-+      case 32:
-+         CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++)
-+         {
-+            CeedScalar qd[3];
-+            MultAdjJCAdjJt32(J + i, Q, coeff, 1, 3, qw[i], 1, qd);
-+            const CeedScalar u0 = u[i + Q * 0];
-+            const CeedScalar u1 = u[i + Q * 1];
-+            v[i + Q * 0] = qd[0] * u0 + qd[1] * u1;
-+            v[i + Q * 1] = qd[1] * u0 + qd[2] * u1;
-+         }
-+         break;
-+      case 33:
-+         CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++)
-+         {
-+            CeedScalar qd[6];
-+            MultAdjJCAdjJt33(J + i, Q, coeff, 1, 3, qw[i], 1, qd);
-+            const CeedScalar u0 = u[i + Q * 0];
-+            const CeedScalar u1 = u[i + Q * 1];
-+            const CeedScalar u2 = u[i + Q * 2];
-+            v[i + Q * 0] = qd[0] * u0 + qd[1] * u1 + qd[2] * u2;
-+            v[i + Q * 1] = qd[1] * u0 + qd[3] * u1 + qd[4] * u2;
-+            v[i + Q * 2] = qd[2] * u0 + qd[4] * u1 + qd[5] * u2;
-+         }
-+         break;
-+   }
-+   return 0;
-+}
-+
-+/// libCEED QFunction for applying an H(curl) mass operator with a matrix
-+/// constant coefficient
-+CEED_QFUNCTION(f_apply_hcurlmass_mf_const_matrix)(void *ctx, CeedInt Q,
-+                                                  const CeedScalar *const *in,
-+                                                  CeedScalar *const *out)
-+{
-+   VectorFEMassContext *bc = (VectorFEMassContext *)ctx;
-+   // in[0], out[0] have shape [dim, ncomp=1, Q]
-+   // in[1] is Jacobians with shape [dim, ncomp=space_dim, Q]
-+   // in[2] is quadrature weights, size (Q)
-+   //
-+   // At every quadrature point, compute qw/det(J) adj(J) C adj(J)^T (for
-+   // H(curl)) or qw/det(J) J^T C J (for H(div))
-+   const CeedScalar *coeff = bc->coeff;
-+   const CeedScalar *u = in[0], *J = in[1], *qw = in[2];
-+   CeedScalar *v = out[0];
-+   switch (10 * bc->space_dim + bc->dim)
-+   {
-+      case 21:
-+         CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++)
-+         {
-+            CeedScalar qd;
-+            MultAdjJCAdjJt21(J + i, Q, coeff, 1, 3, qw[i], 1, &qd);
-+            v[i] = qd * u[i];
-+         }
-+         break;
-+      case 22:
-+         CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++)
-+         {
-+            CeedScalar qd[3];
-+            MultAdjJCAdjJt22(J + i, Q, coeff, 1, 3, qw[i], 1, qd);
-+            const CeedScalar u0 = u[i + Q * 0];
-+            const CeedScalar u1 = u[i + Q * 1];
-+            v[i + Q * 0] = qd[0] * u0 + qd[1] * u1;
-+            v[i + Q * 1] = qd[1] * u0 + qd[2] * u1;
-+         }
-+         break;
-+      case 32:
-+         CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++)
-+         {
-+            CeedScalar qd[3];
-+            MultAdjJCAdjJt32(J + i, Q, coeff, 1, 6, qw[i], 1, qd);
-+            const CeedScalar u0 = u[i + Q * 0];
-+            const CeedScalar u1 = u[i + Q * 1];
-+            v[i + Q * 0] = qd[0] * u0 + qd[1] * u1;
-+            v[i + Q * 1] = qd[1] * u0 + qd[2] * u1;
-+         }
-+         break;
-+      case 33:
-+         CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++)
-+         {
-+            CeedScalar qd[6];
-+            MultAdjJCAdjJt33(J + i, Q, coeff, 1, 6, qw[i], 1, qd);
-+            const CeedScalar u0 = u[i + Q * 0];
-+            const CeedScalar u1 = u[i + Q * 1];
-+            const CeedScalar u2 = u[i + Q * 2];
-+            v[i + Q * 0] = qd[0] * u0 + qd[1] * u1 + qd[2] * u2;
-+            v[i + Q * 1] = qd[1] * u0 + qd[3] * u1 + qd[4] * u2;
-+            v[i + Q * 2] = qd[2] * u0 + qd[4] * u1 + qd[5] * u2;
-+         }
-+         break;
-+   }
-+   return 0;
-+}
-+
-+/// libCEED QFunction for applying an H(div) operator with a scalar
-+/// coefficient evaluated at quadrature points
-+CEED_QFUNCTION(f_apply_hdivmass_mf_quad_scalar)(void *ctx, CeedInt Q,
-+                                                const CeedScalar *const *in,
-+                                                CeedScalar *const *out)
-+{
-+   VectorFEMassContext *bc = (VectorFEMassContext *)ctx;
-+   // in[0], out[0] have shape [dim, ncomp=1, Q]
-+   // in[1] is coefficients with shape [ncomp=1, Q]
-+   // in[2] is Jacobians with shape [dim, ncomp=space_dim, Q]
-+   // in[3] is quadrature weights, size (Q)
-+   //
-+   // At every quadrature point, compute qw/det(J) adj(J) C adj(J)^T (for
-+   // H(curl)) or qw/det(J) J^T C J (for H(div))
-+   const CeedScalar *u = in[0], *c = in[1], *J = in[2], *qw = in[3];
-+   CeedScalar *v = out[0];
-+   switch (10 * bc->space_dim + bc->dim)
-+   {
-+      case 11:
-+         CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++)
-+         {
-+            const CeedScalar qd = qw[i] * c[i] * J[i];
-+            v[i] = qd * u[i];
-+         }
-+         break;
-+      case 21:
-+         CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++)
-+         {
-+            CeedScalar qd;
-+            MultJtCJ21(J + i, Q, c + i, Q, 1, qw[i], 1, &qd);
-+            v[i] = qd * u[i];
-+         }
-+         break;
-+      case 22:
-+         CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++)
-+         {
-+            CeedScalar qd[3];
-+            MultJtCJ22(J + i, Q, c + i, Q, 1, qw[i], 1, qd);
-+            const CeedScalar u0 = u[i + Q * 0];
-+            const CeedScalar u1 = u[i + Q * 1];
-+            v[i + Q * 0] = qd[0] * u0 + qd[1] * u1;
-+            v[i + Q * 1] = qd[1] * u0 + qd[2] * u1;
-+         }
-+         break;
-+      case 32:
-+         CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++)
-+         {
-+            CeedScalar qd[3];
-+            MultJtCJ32(J + i, Q, c + i, Q, 1, qw[i], 1, qd);
-+            const CeedScalar u0 = u[i + Q * 0];
-+            const CeedScalar u1 = u[i + Q * 1];
-+            v[i + Q * 0] = qd[0] * u0 + qd[1] * u1;
-+            v[i + Q * 1] = qd[1] * u0 + qd[2] * u1;
-+         }
-+         break;
-+      case 33:
-+         CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++)
-+         {
-+            CeedScalar qd[6];
-+            MultJtCJ33(J + i, Q, c + i, Q, 1, qw[i], 1, qd);
-+            const CeedScalar u0 = u[i + Q * 0];
-+            const CeedScalar u1 = u[i + Q * 1];
-+            const CeedScalar u2 = u[i + Q * 2];
-+            v[i + Q * 0] = qd[0] * u0 + qd[1] * u1 + qd[2] * u2;
-+            v[i + Q * 1] = qd[1] * u0 + qd[3] * u1 + qd[4] * u2;
-+            v[i + Q * 2] = qd[2] * u0 + qd[4] * u1 + qd[5] * u2;
-+         }
-+         break;
-+   }
-+   return 0;
-+}
-+
-+/// libCEED QFunction for applying an H(div) operator with a vector
-+/// coefficient evaluated at quadrature points
-+CEED_QFUNCTION(f_apply_hdivmass_mf_quad_vector)(void *ctx, CeedInt Q,
-+                                                const CeedScalar *const *in,
-+                                                CeedScalar *const *out)
-+{
-+   VectorFEMassContext *bc = (VectorFEMassContext *)ctx;
-+   // in[0], out[0] have shape [dim, ncomp=1, Q]
-+   // in[1] is coefficients with shape [ncomp=space_dim, Q]
-+   // in[2] is Jacobians with shape [dim, ncomp=space_dim, Q]
-+   // in[3] is quadrature weights, size (Q)
-+   //
-+   // At every quadrature point, compute qw/det(J) adj(J) C adj(J)^T (for
-+   // H(curl)) or qw/det(J) J^T C J (for H(div))
-+   const CeedScalar *u = in[0], *c = in[1], *J = in[2], *qw = in[3];
-+   CeedScalar *v = out[0];
-+   switch (10 * bc->space_dim + bc->dim)
-+   {
-+      case 21:
-+         CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++)
-+         {
-+            CeedScalar qd;
-+            MultJtCJ21(J + i, Q, c + i, Q, 2, qw[i], 1, &qd);
-+            v[i] = qd * u[i];
-+         }
-+         break;
-+      case 22:
-+         CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++)
-+         {
-+            CeedScalar qd[3];
-+            MultJtCJ22(J + i, Q, c + i, Q, 2, qw[i], 1, qd);
-+            const CeedScalar u0 = u[i + Q * 0];
-+            const CeedScalar u1 = u[i + Q * 1];
-+            v[i + Q * 0] = qd[0] * u0 + qd[1] * u1;
-+            v[i + Q * 1] = qd[1] * u0 + qd[2] * u1;
-+         }
-+         break;
-+      case 32:
-+         CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++)
-+         {
-+            CeedScalar qd[3];
-+            MultJtCJ32(J + i, Q, c + i, Q, 3, qw[i], 1, qd);
-+            const CeedScalar u0 = u[i + Q * 0];
-+            const CeedScalar u1 = u[i + Q * 1];
-+            v[i + Q * 0] = qd[0] * u0 + qd[1] * u1;
-+            v[i + Q * 1] = qd[1] * u0 + qd[2] * u1;
-+         }
-+         break;
-+      case 33:
-+         CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++)
-+         {
-+            CeedScalar qd[6];
-+            MultJtCJ33(J + i, Q, c + i, Q, 3, qw[i], 1, qd);
-+            const CeedScalar u0 = u[i + Q * 0];
-+            const CeedScalar u1 = u[i + Q * 1];
-+            const CeedScalar u2 = u[i + Q * 2];
-+            v[i + Q * 0] = qd[0] * u0 + qd[1] * u1 + qd[2] * u2;
-+            v[i + Q * 1] = qd[1] * u0 + qd[3] * u1 + qd[4] * u2;
-+            v[i + Q * 2] = qd[2] * u0 + qd[4] * u1 + qd[5] * u2;
-+         }
-+         break;
-+   }
-+   return 0;
-+}
-+
-+/// libCEED QFunction for applying an H(div) operator with a matrix
-+/// coefficient evaluated at quadrature points
-+CEED_QFUNCTION(f_apply_hdivmass_mf_quad_matrix)(void *ctx, CeedInt Q,
-+                                                const CeedScalar *const *in,
-+                                                CeedScalar *const *out)
-+{
-+   VectorFEMassContext *bc = (VectorFEMassContext *)ctx;
-+   // in[0], out[0] have shape [dim, ncomp=1, Q]
-+   // in[1] is coefficients with shape [ncomp=space_dim*(space_dim+1)/2, Q]
-+   // in[2] is Jacobians with shape [dim, ncomp=space_dim, Q]
-+   // in[3] is quadrature weights, size (Q)
-+   //
-+   // At every quadrature point, compute qw/det(J) adj(J) C adj(J)^T (for
-+   // H(curl)) or qw/det(J) J^T C J (for H(div))
-+   const CeedScalar *u = in[0], *c = in[1], *J = in[2], *qw = in[3];
-+   CeedScalar *v = out[0];
-+   switch (10 * bc->space_dim + bc->dim)
-+   {
-+      case 21:
-+         CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++)
-+         {
-+            CeedScalar qd;
-+            MultJtCJ21(J + i, Q, c + i, Q, 3, qw[i], 1, &qd);
-+            v[i] = qd * u[i];
-+         }
-+         break;
-+      case 22:
-+         CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++)
-+         {
-+            CeedScalar qd[3];
-+            MultJtCJ22(J + i, Q, c + i, Q, 3, qw[i], 1, qd);
-+            const CeedScalar u0 = u[i + Q * 0];
-+            const CeedScalar u1 = u[i + Q * 1];
-+            v[i + Q * 0] = qd[0] * u0 + qd[1] * u1;
-+            v[i + Q * 1] = qd[1] * u0 + qd[2] * u1;
-+         }
-+         break;
-+      case 32:
-+         CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++)
-+         {
-+            CeedScalar qd[3];
-+            MultJtCJ32(J + i, Q, c + i, Q, 6, qw[i], 1, qd);
-+            const CeedScalar u0 = u[i + Q * 0];
-+            const CeedScalar u1 = u[i + Q * 1];
-+            v[i + Q * 0] = qd[0] * u0 + qd[1] * u1;
-+            v[i + Q * 1] = qd[1] * u0 + qd[2] * u1;
-+         }
-+         break;
-+      case 33:
-+         CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++)
-+         {
-+            CeedScalar qd[6];
-+            MultJtCJ33(J + i, Q, c + i, Q, 6, qw[i], 1, qd);
-+            const CeedScalar u0 = u[i + Q * 0];
-+            const CeedScalar u1 = u[i + Q * 1];
-+            const CeedScalar u2 = u[i + Q * 2];
-+            v[i + Q * 0] = qd[0] * u0 + qd[1] * u1 + qd[2] * u2;
-+            v[i + Q * 1] = qd[1] * u0 + qd[3] * u1 + qd[4] * u2;
-+            v[i + Q * 2] = qd[2] * u0 + qd[4] * u1 + qd[5] * u2;
-+         }
-+         break;
-+   }
-+   return 0;
-+}
-+
-+/// libCEED QFunction for applying an H(curl) operator with a scalar
-+/// coefficient evaluated at quadrature points
-+CEED_QFUNCTION(f_apply_hcurlmass_mf_quad_scalar)(void *ctx, CeedInt Q,
-+                                                 const CeedScalar *const *in,
-+                                                 CeedScalar *const *out)
-+{
-+   VectorFEMassContext *bc = (VectorFEMassContext *)ctx;
-+   // in[0], out[0] have shape [dim, ncomp=1, Q]
-+   // in[1] is coefficients with shape [ncomp=1, Q]
-+   // in[2] is Jacobians with shape [dim, ncomp=space_dim, Q]
-+   // in[3] is quadrature weights, size (Q)
-+   //
-+   // At every quadrature point, compute qw/det(J) adj(J) C adj(J)^T (for
-+   // H(curl)) or qw/det(J) J^T C J (for H(div))
-+   const CeedScalar *u = in[0], *c = in[1], *J = in[2], *qw = in[3];
-+   CeedScalar *v = out[0];
-+   switch (10 * bc->space_dim + bc->dim)
-+   {
-+      case 11:
-+         CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++)
-+         {
-+            const CeedScalar qd = qw[i] * c[i] / J[i];
-+            v[i] = qd * u[i];
-+         }
-+         break;
-+      case 21:
-+         CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++)
-+         {
-+            CeedScalar qd;
-+            MultAdjJCAdjJt21(J + i, Q, c + i, Q, 1, qw[i], 1, &qd);
-+            v[i] = qd * u[i];
-+         }
-+         break;
-+      case 22:
-+         CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++)
-+         {
-+            CeedScalar qd[3];
-+            MultAdjJCAdjJt22(J + i, Q, c + i, Q, 1, qw[i], 1, qd);
-+            const CeedScalar u0 = u[i + Q * 0];
-+            const CeedScalar u1 = u[i + Q * 1];
-+            v[i + Q * 0] = qd[0] * u0 + qd[1] * u1;
-+            v[i + Q * 1] = qd[1] * u0 + qd[2] * u1;
-+         }
-+         break;
-+      case 32:
-+         CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++)
-+         {
-+            CeedScalar qd[3];
-+            MultAdjJCAdjJt32(J + i, Q, c + i, Q, 1, qw[i], 1, qd);
-+            const CeedScalar u0 = u[i + Q * 0];
-+            const CeedScalar u1 = u[i + Q * 1];
-+            v[i + Q * 0] = qd[0] * u0 + qd[1] * u1;
-+            v[i + Q * 1] = qd[1] * u0 + qd[2] * u1;
-+         }
-+         break;
-+      case 33:
-+         CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++)
-+         {
-+            CeedScalar qd[6];
-+            MultAdjJCAdjJt33(J + i, Q, c + i, Q, 1, qw[i], 1, qd);
-+            const CeedScalar u0 = u[i + Q * 0];
-+            const CeedScalar u1 = u[i + Q * 1];
-+            const CeedScalar u2 = u[i + Q * 2];
-+            v[i + Q * 0] = qd[0] * u0 + qd[1] * u1 + qd[2] * u2;
-+            v[i + Q * 1] = qd[1] * u0 + qd[3] * u1 + qd[4] * u2;
-+            v[i + Q * 2] = qd[2] * u0 + qd[4] * u1 + qd[5] * u2;
-+         }
-+         break;
-+   }
-+   return 0;
-+}
-+
-+/// libCEED QFunction for applying an H(curl) operator with a vector
-+/// coefficient evaluated at quadrature points
-+CEED_QFUNCTION(f_apply_hcurlmass_mf_quad_vector)(void *ctx, CeedInt Q,
-+                                                 const CeedScalar *const *in,
-+                                                 CeedScalar *const *out)
-+{
-+   VectorFEMassContext *bc = (VectorFEMassContext *)ctx;
-+   // in[0], out[0] have shape [dim, ncomp=1, Q]
-+   // in[1] is coefficients with shape [ncomp=space_dim, Q]
-+   // in[2] is Jacobians with shape [dim, ncomp=space_dim, Q]
-+   // in[3] is quadrature weights, size (Q)
-+   //
-+   // At every quadrature point, compute qw/det(J) adj(J) C adj(J)^T (for
-+   // H(curl)) or qw/det(J) J^T C J (for H(div))
-+   const CeedScalar *u = in[0], *c = in[1], *J = in[2], *qw = in[3];
-+   CeedScalar *v = out[0];
-+   switch (10 * bc->space_dim + bc->dim)
-+   {
-+      case 21:
-+         CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++)
-+         {
-+            CeedScalar qd;
-+            MultAdjJCAdjJt21(J + i, Q, c + i, Q, 2, qw[i], 1, &qd);
-+            v[i] = qd * u[i];
-+         }
-+         break;
-+      case 22:
-+         CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++)
-+         {
-+            CeedScalar qd[3];
-+            MultAdjJCAdjJt22(J + i, Q, c + i, Q, 2, qw[i], 1, qd);
-+            const CeedScalar u0 = u[i + Q * 0];
-+            const CeedScalar u1 = u[i + Q * 1];
-+            v[i + Q * 0] = qd[0] * u0 + qd[1] * u1;
-+            v[i + Q * 1] = qd[1] * u0 + qd[2] * u1;
-+         }
-+         break;
-+      case 32:
-+         CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++)
-+         {
-+            CeedScalar qd[3];
-+            MultAdjJCAdjJt32(J + i, Q, c + i, Q, 3, qw[i], 1, qd);
-+            const CeedScalar u0 = u[i + Q * 0];
-+            const CeedScalar u1 = u[i + Q * 1];
-+            v[i + Q * 0] = qd[0] * u0 + qd[1] * u1;
-+            v[i + Q * 1] = qd[1] * u0 + qd[2] * u1;
-+         }
-+         break;
-+      case 33:
-+         CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++)
-+         {
-+            CeedScalar qd[6];
-+            MultAdjJCAdjJt33(J + i, Q, c + i, Q, 3, qw[i], 1, qd);
-+            const CeedScalar u0 = u[i + Q * 0];
-+            const CeedScalar u1 = u[i + Q * 1];
-+            const CeedScalar u2 = u[i + Q * 2];
-+            v[i + Q * 0] = qd[0] * u0 + qd[1] * u1 + qd[2] * u2;
-+            v[i + Q * 1] = qd[1] * u0 + qd[3] * u1 + qd[4] * u2;
-+            v[i + Q * 2] = qd[2] * u0 + qd[4] * u1 + qd[5] * u2;
-+         }
-+         break;
-+   }
-+   return 0;
-+}
-+
-+/// libCEED QFunction for applying an H(curl) operator with a matrix
-+/// coefficient evaluated at quadrature points
-+CEED_QFUNCTION(f_apply_hcurlmass_mf_quad_matrix)(void *ctx, CeedInt Q,
-+                                                 const CeedScalar *const *in,
-+                                                 CeedScalar *const *out)
-+{
-+   VectorFEMassContext *bc = (VectorFEMassContext *)ctx;
-+   // in[0], out[0] have shape [dim, ncomp=1, Q]
-+   // in[1] is coefficients with shape [ncomp=space_dim*(space_dim+1)/2, Q]
-+   // in[2] is Jacobians with shape [dim, ncomp=space_dim, Q]
-+   // in[3] is quadrature weights, size (Q)
-+   //
-+   // At every quadrature point, compute qw/det(J) adj(J) C adj(J)^T (for
-+   // H(curl)) or qw/det(J) J^T C J (for H(div))
-+   const CeedScalar *u = in[0], *c = in[1], *J = in[2], *qw = in[3];
-+   CeedScalar *v = out[0];
-+   switch (10 * bc->space_dim + bc->dim)
-+   {
-+      case 21:
-+         CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++)
-+         {
-+            CeedScalar qd;
-+            MultAdjJCAdjJt21(J + i, Q, c + i, Q, 3, qw[i], 1, &qd);
-+            v[i] = qd * u[i];
-+         }
-+         break;
-+      case 22:
-+         CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++)
-+         {
-+            CeedScalar qd[3];
-+            MultAdjJCAdjJt22(J + i, Q, c + i, Q, 3, qw[i], 1, qd);
-+            const CeedScalar u0 = u[i + Q * 0];
-+            const CeedScalar u1 = u[i + Q * 1];
-+            v[i + Q * 0] = qd[0] * u0 + qd[1] * u1;
-+            v[i + Q * 1] = qd[1] * u0 + qd[2] * u1;
-+         }
-+         break;
-+      case 32:
-+         CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++)
-+         {
-+            CeedScalar qd[3];
-+            MultAdjJCAdjJt32(J + i, Q, c + i, Q, 6, qw[i], 1, qd);
-+            const CeedScalar u0 = u[i + Q * 0];
-+            const CeedScalar u1 = u[i + Q * 1];
-+            v[i + Q * 0] = qd[0] * u0 + qd[1] * u1;
-+            v[i + Q * 1] = qd[1] * u0 + qd[2] * u1;
-+         }
-+         break;
-+      case 33:
-+         CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++)
-+         {
-+            CeedScalar qd[6];
-+            MultAdjJCAdjJt33(J + i, Q, c + i, Q, 6, qw[i], 1, qd);
-+            const CeedScalar u0 = u[i + Q * 0];
-+            const CeedScalar u1 = u[i + Q * 1];
-+            const CeedScalar u2 = u[i + Q * 2];
-+            v[i + Q * 0] = qd[0] * u0 + qd[1] * u1 + qd[2] * u2;
-+            v[i + Q * 1] = qd[1] * u0 + qd[3] * u1 + qd[4] * u2;
-+            v[i + Q * 2] = qd[2] * u0 + qd[4] * u1 + qd[5] * u2;
-+         }
-+         break;
-+   }
-+   return 0;
-+}
-+
-+#endif // MFEM_LIBCEED_VECFEMASS_QF_H
-diff --git a/fem/ceed/interface/basis.cpp b/fem/ceed/interface/basis.cpp
-index 37858cb78..487108ef4 100644
---- a/fem/ceed/interface/basis.cpp
-+++ b/fem/ceed/interface/basis.cpp
-@@ -9,7 +9,8 @@
- // terms of the BSD-3 license. We welcome feedback and contributions, see file
- // CONTRIBUTING.md for details.
- 
--#include "../../gridfunc.hpp"
-+#include "basis.hpp"
-+
- #include "util.hpp"
- 
- namespace mfem
-@@ -47,75 +48,171 @@ static CeedElemTopology GetCeedTopology(Geometry::Type geom)
- static void InitNonTensorBasis(const mfem::FiniteElementSpace &fes,
-                                const mfem::FiniteElement &fe,
-                                const mfem::IntegrationRule &ir,
--                               Ceed ceed, CeedBasis *basis)
-+                               Ceed ceed,
-+                               CeedBasis *basis)
- {
-    const mfem::DofToQuad &maps = fe.GetDofToQuad(ir, mfem::DofToQuad::FULL);
--   mfem::Mesh *mesh = fes.GetMesh();
--   const int dim = mesh->Dimension();
--   const int ndofs = maps.ndof;
--   const int nqpts = maps.nqpt;
--   mfem::DenseMatrix qX(dim,nqpts);
--   mfem::Vector qW(nqpts);
--   for (int i = 0; i < nqpts; i++)
-+   const int dim = fe.GetDim();
-+   const int ncomp = fes.GetVDim();
-+   const int P = maps.ndof;
-+   const int Q = maps.nqpt;
-+   mfem::DenseMatrix qX(dim, Q);
-+   mfem::Vector qW(Q);
-+   for (int i = 0; i < Q; i++)
-    {
-       const mfem::IntegrationPoint &ip = ir.IntPoint(i);
--      qX(0,i) = ip.x;
--      if (dim>1) { qX(1,i) = ip.y; }
--      if (dim>2) { qX(2,i) = ip.z; }
-+      qX(0, i) = ip.x;
-+      if (dim > 1) { qX(1, i) = ip.y; }
-+      if (dim > 2) { qX(2, i) = ip.z; }
-       qW(i) = ip.weight;
-    }
--   CeedBasisCreateH1(ceed, GetCeedTopology(fe.GetGeomType()),
--                     fes.GetVDim(), ndofs, nqpts,
--                     maps.Bt.GetData(), maps.Gt.GetData(),
--                     qX.GetData(), qW.GetData(), basis);
-+   if (fe.GetMapType() == mfem::FiniteElement::H_DIV)
-+   {
-+      CeedBasisCreateHdiv(ceed, GetCeedTopology(fe.GetGeomType()), ncomp, P, Q,
-+                          maps.Bt.GetData(), maps.Gt.GetData(),
-+                          qX.GetData(), qW.GetData(), basis);
-+   }
-+   else if (fe.GetMapType() == mfem::FiniteElement::H_CURL)
-+   {
-+      CeedBasisCreateHcurl(ceed, GetCeedTopology(fe.GetGeomType()), ncomp, P, Q,
-+                           maps.Bt.GetData(), maps.Gt.GetData(),
-+                           qX.GetData(), qW.GetData(), basis);
-+   }
-+   else
-+   {
-+      CeedBasisCreateH1(ceed, GetCeedTopology(fe.GetGeomType()), ncomp, P, Q,
-+                        maps.Bt.GetData(), maps.Gt.GetData(),
-+                        qX.GetData(), qW.GetData(), basis);
-+   }
- }
- 
- static void InitTensorBasis(const mfem::FiniteElementSpace &fes,
-                             const mfem::FiniteElement &fe,
-                             const mfem::IntegrationRule &ir,
--                            Ceed ceed, CeedBasis *basis)
-+                            Ceed ceed,
-+                            CeedBasis *basis)
- {
-    const mfem::DofToQuad &maps = fe.GetDofToQuad(ir, mfem::DofToQuad::TENSOR);
--   mfem::Mesh *mesh = fes.GetMesh();
--   const int ndofs = maps.ndof;
--   const int nqpts = maps.nqpt;
--   mfem::Vector qX(nqpts), qW(nqpts);
--   // The x-coordinates of the first `nqpts` points of the integration rule are
-+   const int dim = fe.GetDim();
-+   const int ncomp = fes.GetVDim();
-+   const int P = maps.ndof;
-+   const int Q = maps.nqpt;
-+   mfem::Vector qX(Q), qW(Q);
-+   // The x-coordinates of the first `Q` points of the integration rule are
-    // the points of the corresponding 1D rule. We also scale the weights
-    // accordingly.
-    double w_sum = 0.0;
--   for (int i = 0; i < nqpts; i++)
-+   for (int i = 0; i < Q; i++)
-    {
-       const mfem::IntegrationPoint &ip = ir.IntPoint(i);
-       qX(i) = ip.x;
-       qW(i) = ip.weight;
-       w_sum += ip.weight;
-    }
--   qW *= 1.0/w_sum;
--   CeedBasisCreateTensorH1(ceed, mesh->Dimension(), fes.GetVDim(), ndofs,
--                           nqpts, maps.Bt.GetData(),
--                           maps.Gt.GetData(), qX.GetData(),
--                           qW.GetData(), basis);
-+   qW *= 1.0 / w_sum;
-+   CeedBasisCreateTensorH1(ceed, dim, ncomp, P, Q,
-+                           maps.Bt.GetData(), maps.Gt.GetData(),
-+                           qX.GetData(), qW.GetData(), basis);
-+}
-+
-+#if 0
-+static void InitCeedInterpolatorBasis(const FiniteElementSpace &trial_fes,
-+                                      const FiniteElementSpace &test_fes,
-+                                      const FiniteElement &trial_fe,
-+                                      const FiniteElement &test_fe,
-+                                      Ceed ceed,
-+                                      CeedBasis *basis)
-+{
-+   // Basis projection operator using libCEED
-+   CeedBasis trial_basis, test_basis;
-+   const int P = std::max(trial_fe.GetDof(), test_fe.GetDof()), ir_order_max = 100;
-+   int ir_order = std::max(trial_fe.GetOrder(), test_fe.GetOrder());
-+   for (; ir_order < ir_order_max; ir_order++)
-+   {
-+      if (IntRules.Get(trial_fe.GetGeomType(), ir_order).GetNPoints() >= P) { break; }
-+   }
-+   const IntegrationRule &ir = IntRules.Get(trial_fe.GetGeomType(), ir_order);
-+   InitBasis(trial_fes, trial_fe, ir, ceed, &trial_basis);
-+   InitBasis(test_fes, test_fe, ir, ceed, &test_basis);
-+   CeedBasisCreateProjection(trial_basis, test_basis, basis);
-+}
-+#endif
-+
-+static void InitMfemInterpolatorBasis(const FiniteElementSpace &trial_fes,
-+                                      const FiniteElementSpace &test_fes,
-+                                      const FiniteElement &trial_fe,
-+                                      const FiniteElement &test_fe,
-+                                      Ceed ceed,
-+                                      CeedBasis *basis)
-+{
-+   MFEM_VERIFY(trial_fes.GetVDim() == test_fes.GetVDim(),
-+               "libCEED discrete linear operator requires same vdim for trial "
-+               "and test FE spaces.");
-+   const int dim = trial_fe.GetDim();
-+   const int ncomp = trial_fes.GetVDim();
-+   const int trial_P = trial_fe.GetDof();
-+   const int test_P = test_fe.GetDof();
-+   mfem::DenseMatrix qX(dim, test_P), Gt(trial_P, test_P * dim), Bt;
-+   mfem::Vector qW(test_P);
-+   mfem::IsoparametricTransformation dummy;
-+   dummy.SetIdentityTransformation(trial_fe.GetGeomType());
-+   if (trial_fe.GetMapType() == test_fe.GetMapType())
-+   {
-+      // Prolongation
-+      test_fe.GetTransferMatrix(trial_fe, dummy, Bt);
-+   }
-+   else if (trial_fe.GetMapType() == mfem::FiniteElement::VALUE &&
-+            test_fe.GetMapType() == mfem::FiniteElement::H_CURL)
-+   {
-+      // Discrete gradient interpolator
-+      test_fe.ProjectGrad(trial_fe, dummy, Bt);
-+   }
-+   else if (trial_fe.GetMapType() == mfem::FiniteElement::H_CURL &&
-+            test_fe.GetMapType() == mfem::FiniteElement::H_DIV)
-+   {
-+      // Discrete curl interpolator
-+      test_fe.ProjectCurl(trial_fe, dummy, Bt);
-+   }
-+   else if (trial_fe.GetMapType() == mfem::FiniteElement::H_DIV &&
-+            test_fe.GetMapType() == mfem::FiniteElement::INTEGRAL)
-+   {
-+      // Discrete divergence interpolator
-+      test_fe.ProjectDiv(trial_fe, dummy, Bt);
-+   }
-+   else
-+   {
-+      MFEM_ABORT("Unsupported trial/test FE spaces for libCEED discrete "
-+                 "linear operator");
-+   }
-+   Bt.Transpose();
-+   Gt = 0.0;
-+   qX = 0.0;
-+   qW = 0.0;
-+   CeedBasisCreateH1(ceed, GetCeedTopology(trial_fe.GetGeomType()), ncomp,
-+                     trial_P, test_P, Bt.GetData(), Gt.GetData(),
-+                     qX.GetData(), qW.GetData(), basis);
- }
- 
--static void InitBasisImpl(const FiniteElementSpace &fes,
--                          const FiniteElement &fe,
--                          const IntegrationRule &ir,
--                          Ceed ceed, CeedBasis *basis)
-+void InitBasis(const FiniteElementSpace &fes,
-+               const FiniteElement &fe,
-+               const IntegrationRule &ir,
-+               Ceed ceed,
-+               CeedBasis *basis)
- {
--   // Check for FES -> basis, restriction in hash tables
-+   // Check for fes -> basis in hash table
-+   const int ncomp = fes.GetVDim();
-    const int P = fe.GetDof();
-    const int Q = ir.GetNPoints();
--   const int ncomp = fes.GetVDim();
--   BasisKey basis_key(&fes, &ir, ncomp, P, Q);
-+   BasisKey basis_key(&fes, nullptr, &ir, {ncomp, P, Q});
-    auto basis_itr = mfem::internal::ceed_basis_map.find(basis_key);
--   const bool tensor = dynamic_cast<const mfem::TensorBasisElement *>
--                       (&fe) != nullptr;
- 
-    // Init or retrieve key values
-    if (basis_itr == mfem::internal::ceed_basis_map.end())
-    {
--      if ( tensor )
-+      const bool tensor =
-+         dynamic_cast<const mfem::TensorBasisElement *>(&fe) != nullptr;
-+      const bool vector = fe.GetRangeType() == mfem::FiniteElement::VECTOR;
-+      if (tensor && !vector)
-       {
-          InitTensorBasis(fes, fe, ir, ceed, basis);
-       }
-@@ -131,22 +228,41 @@ static void InitBasisImpl(const FiniteElementSpace &fes,
-    }
- }
- 
--void InitBasis(const FiniteElementSpace &fes,
--               const IntegrationRule &ir,
--               Ceed ceed, CeedBasis *basis)
-+void InitInterpolatorBasis(const FiniteElementSpace &trial_fes,
-+                           const FiniteElementSpace &test_fes,
-+                           const FiniteElement &trial_fe,
-+                           const FiniteElement &test_fe,
-+                           Ceed ceed,
-+                           CeedBasis *basis)
- {
--   const mfem::FiniteElement &fe = *fes.GetFE(0);
--   InitBasisImpl(fes, fe, ir, ceed, basis);
--}
-+   // Check for fes -> basis in hash table
-+   const int ncomp = trial_fes.GetVDim() + test_fes.GetVDim();
-+   const int P = trial_fe.GetDof();
-+   const int Q = test_fe.GetDof();
-+   BasisKey basis_key(&trial_fes, &test_fes, nullptr, {ncomp, P, Q});
-+   auto basis_itr = mfem::internal::ceed_basis_map.find(basis_key);
- 
--void InitBasisWithIndices(const FiniteElementSpace &fes,
--                          const IntegrationRule &ir,
--                          int nelem,
--                          const int* indices,
--                          Ceed ceed, CeedBasis *basis)
--{
--   const mfem::FiniteElement &fe = *fes.GetFE(indices[0]);
--   InitBasisImpl(fes, fe, ir, ceed, basis);
-+   // Init or retrieve key values
-+   if (basis_itr == mfem::internal::ceed_basis_map.end())
-+   {
-+#if 0
-+      if (trial_fe.GetMapType() == test_fe.GetMapType())
-+      {
-+         InitCeedInterpolatorBasis(trial_fes, test_fes, trial_fe, test_fe,
-+                                   ceed, basis);
-+      }
-+      else
-+#endif
-+      {
-+         InitMfemInterpolatorBasis(trial_fes, test_fes, trial_fe, test_fe,
-+                                   ceed, basis);
-+      }
-+      mfem::internal::ceed_basis_map[basis_key] = *basis;
-+   }
-+   else
-+   {
-+      *basis = basis_itr->second;
-+   }
- }
- 
- #endif
-diff --git a/fem/ceed/interface/basis.hpp b/fem/ceed/interface/basis.hpp
-index 3781f4cf7..361f97863 100644
---- a/fem/ceed/interface/basis.hpp
-+++ b/fem/ceed/interface/basis.hpp
-@@ -12,6 +12,7 @@
- #ifndef MFEM_LIBCEED_BASIS
- #define MFEM_LIBCEED_BASIS
- 
-+#include "../../fespace.hpp"
- #include "ceed.hpp"
- 
- namespace mfem
-@@ -22,31 +23,117 @@ namespace ceed
- 
- #ifdef MFEM_USE_CEED
- 
--/** @brief Initialize a CeedBasis for non-mixed meshes.
-+/** @brief Initialize a CeedBasis based on an mfem::FiniteElementSpace @a fes,
-+    an mfem::FiniteElement @a fe, and an mfem::IntegrationRule @a ir.
- 
--   @param[in] fes Input finite element space.
--   @param[in] ir Input integration rule.
--   @param[in] ceed Input Ceed object.
--   @param[out] basis The address of the initialized CeedBasis object.
--*/
-+    @param[in] fes The finite element space.
-+    @param[in] fe The finite element.
-+    @param[in] ir The integration rule.
-+    @param[in] ceed The Ceed object.
-+    @param[out] basis The `CeedBasis` to initialize. */
- void InitBasis(const FiniteElementSpace &fes,
-+               const FiniteElement &fe,
-                const IntegrationRule &ir,
--               Ceed ceed, CeedBasis *basis);
-+               Ceed ceed,
-+               CeedBasis *basis);
- 
--/** @brief Initialize a CeedBasis for mixed meshes.
-+/** @brief Initialize a CeedBasis based on an mfem::FiniteElementSpace @a fes,
-+    an mfem::IntegrationRule @a ir, and an optional list of element indices
-+    @a indices.
- 
-     @param[in] fes The finite element space.
--    @param[in] ir is the integration rule for the operator.
--    @param[in] nelem The number of elements.
-+    @param[in] ir The integration rule.
-+    @param[in] use_bdr Create the basis and restriction for boundary elements.
-+    @param[in] indices The indices of the elements of same type in the
-+                       `FiniteElementSpace`. If `indices == nullptr`, assumes
-+                       that the `FiniteElementSpace` is not mixed.
-+    @param[in] ceed The Ceed object.
-+    @param[out] basis The `CeedBasis` to initialize. */
-+inline void InitBasis(const FiniteElementSpace &fes,
-+                      const IntegrationRule &ir,
-+                      bool use_bdr,
-+                      const int *indices,
-+                      Ceed ceed,
-+                      CeedBasis *basis)
-+{
-+   const mfem::FiniteElement *fe;
-+   if (indices)
-+   {
-+      fe = use_bdr ? fes.GetBE(indices[0]) : fes.GetFE(indices[0]);
-+   }
-+   else
-+   {
-+      fe = use_bdr ? fes.GetBE(0) : fes.GetFE(0);
-+   }
-+   InitBasis(fes, *fe, ir, ceed, basis);
-+}
-+
-+inline void InitBasis(const FiniteElementSpace &fes,
-+                      const IntegrationRule &ir,
-+                      bool use_bdr,
-+                      Ceed ceed,
-+                      CeedBasis *basis)
-+{
-+   InitBasis(fes, ir, use_bdr, nullptr, ceed, basis);
-+}
-+
-+/** @brief Initialize a CeedBasis based on an interpolation from
-+    mfem::FiniteElementSpace @a trial_fes to @a test_fes. The type of
-+    interpolation will be chosen based on the map type of the provided
-+    mfem::FiniteElement objects.
-+
-+    @param[in] trial_fes The trial finite element space.
-+    @param[in] test_fes The test finite element space.
-+    @param[in] trial_fe The trial finite element.
-+    @param[in] test_fe The test finite element.
-+    @param[in] ceed The Ceed object.
-+    @param[out] basis The `CeedBasis` to initialize. */
-+void InitInterpolatorBasis(const FiniteElementSpace &trial_fes,
-+                           const FiniteElementSpace &test_fes,
-+                           const FiniteElement &trial_fe,
-+                           const FiniteElement &test_fe,
-+                           Ceed ceed,
-+                           CeedBasis *basis);
-+
-+/** @brief Initialize a CeedBasis based on an interpolation from
-+    mfem::FiniteElementSpace @a trial_fes to @a test_fes, with an optional list
-+    of element indices @a indices. The type of interpolation will be chosen
-+    based on the map type of the provided spaces.
-+
-+    @param[in] trial_fes The trial finite element space.
-+    @param[in] test_fes The test finite element space.
-     @param[in] indices The indices of the elements of same type in the
--                       `FiniteElementSpace`.
-+                       `FiniteElementSpace`. If `indices == nullptr`, assumes
-+                       that the `FiniteElementSpace` is not mixed.
-     @param[in] ceed The Ceed object.
-     @param[out] basis The `CeedBasis` to initialize. */
--void InitBasisWithIndices(const FiniteElementSpace &fes,
--                          const IntegrationRule &ir,
--                          int nelem,
--                          const int* indices,
--                          Ceed ceed, CeedBasis *basis);
-+inline void InitInterpolatorBasis(const FiniteElementSpace &trial_fes,
-+                                  const FiniteElementSpace &test_fes,
-+                                  const int *indices,
-+                                  Ceed ceed,
-+                                  CeedBasis *basis)
-+{
-+   const mfem::FiniteElement *trial_fe, *test_fe;
-+   if (indices)
-+   {
-+      trial_fe = trial_fes.GetFE(indices[0]);
-+      test_fe = test_fes.GetFE(indices[0]);
-+   }
-+   else
-+   {
-+      trial_fe = trial_fes.GetFE(0);
-+      test_fe = test_fes.GetFE(0);
-+   }
-+   InitInterpolatorBasis(trial_fes, test_fes, *trial_fe, *test_fe, ceed, basis);
-+}
-+
-+inline void InitInterpolatorBasis(const FiniteElementSpace &trial_fes,
-+                                  const FiniteElementSpace &test_fes,
-+                                  Ceed ceed,
-+                                  CeedBasis *basis)
-+{
-+   InitInterpolatorBasis(trial_fes, test_fes, nullptr, ceed, basis);
-+}
- 
- #endif
- 
-diff --git a/fem/ceed/interface/ceed.hpp b/fem/ceed/interface/ceed.hpp
-index 1e06d9ab5..1945ce3e5 100644
---- a/fem/ceed/interface/ceed.hpp
-+++ b/fem/ceed/interface/ceed.hpp
-@@ -12,6 +12,7 @@
- #ifndef MFEM_LIBCEED_CEED
- #define MFEM_LIBCEED_CEED
- 
-+#include "../../../config/config.hpp"
- #ifdef MFEM_USE_CEED
- #include <ceed.h>
- #if !CEED_VERSION_GE(0,10,0)
-@@ -24,6 +25,7 @@ namespace mfem
- namespace internal
- {
- 
-+// Definition in general/device.cpp.
- extern Ceed ceed;
- 
- } // namespace internal
-diff --git a/fem/ceed/interface/coefficient.hpp b/fem/ceed/interface/coefficient.hpp
-index abb70e8b8..d4e067d7a 100644
---- a/fem/ceed/interface/coefficient.hpp
-+++ b/fem/ceed/interface/coefficient.hpp
-@@ -15,7 +15,6 @@
- #ifdef MFEM_USE_CEED
- 
- #include "../../../general/forall.hpp"
--#include "../../../config/config.hpp"
- #include "../../../linalg/vector.hpp"
- #include "../../../linalg/dtensor.hpp"
- #include "../../../mesh/mesh.hpp"
-@@ -27,54 +26,38 @@
- namespace mfem
- {
- 
--class Mesh;
--class IntegrationRule;
--class Coefficient;
--class VectorCoefficient;
--class GridFunction;
--
- namespace ceed
- {
- 
- struct Coefficient
- {
-+   CeedVector coeff_vector = nullptr;
-    const int ncomp;
--   Coefficient(int ncomp_) : ncomp(ncomp_) { }
--   virtual bool IsConstant() const { return true; }
--   virtual ~Coefficient() { }
--};
--
--struct VariableCoefficient : Coefficient
--{
--   CeedVector coeffVector = nullptr;
-    const CeedEvalMode emode;
--   VariableCoefficient(int ncomp_, CeedEvalMode emode_)
--      : Coefficient(ncomp_), emode(emode_) { }
--   virtual bool IsConstant() const override { return false; }
--   ~VariableCoefficient()
-+   Coefficient(int ncomp_, CeedEvalMode emode_) : ncomp(ncomp_), emode(emode_) {}
-+   virtual ~Coefficient()
-    {
--      CeedVectorDestroy(&coeffVector);
-+      CeedVectorDestroy(&coeff_vector);
-    }
- };
- 
--struct GridCoefficient : VariableCoefficient
-+struct GridCoefficient : Coefficient
- {
-    const mfem::GridFunction &gf;
--   CeedBasis basis;
--   CeedElemRestriction restr;
-+   CeedBasis basis = nullptr;
-+   CeedElemRestriction restr = nullptr;
-    GridCoefficient(const mfem::GridFunction &gf_)
--      : VariableCoefficient(gf_.VectorDim(), CEED_EVAL_INTERP),
--        gf(gf_)
-+      : Coefficient(gf_.VectorDim(), CEED_EVAL_INTERP), gf(gf_)
-    {
--      InitVector(gf, coeffVector);
-+      InitVector(gf, coeff_vector);
-    }
- };
- 
--struct QuadCoefficient : VariableCoefficient
-+struct QuadCoefficient : Coefficient
- {
--   mfem::Vector coeff;
--   CeedElemRestriction restr;
--   QuadCoefficient(int ncomp_) : VariableCoefficient(ncomp_, CEED_EVAL_NONE) { }
-+   mfem::Vector vector;
-+   CeedElemRestriction restr = nullptr;
-+   QuadCoefficient(int ncomp_) : Coefficient(ncomp_, CEED_EVAL_NONE) {}
- };
- 
- /** @brief Initializes an mfem::ceed::Coefficient @a coeff_ptr from an
-@@ -84,73 +67,65 @@ struct QuadCoefficient : VariableCoefficient
-     @param[in] Q is the coefficient from the `Integrator`.
-     @param[in] mesh is the mesh.
-     @param[in] ir is the integration rule.
-+    @param[in] use_bdr is a flag to construct the coefficient on mesh boundaries.
-     @param[out] coeff_ptr is the structure to store the coefficient for the
--                          `CeedOperator`.
--    @param[out] ctx is the Context associated to the QFunction. */
--template <typename Context>
--void InitCoefficient(mfem::Coefficient *Q, mfem::Mesh &mesh,
--                     const mfem::IntegrationRule &ir,
--                     Coefficient*& coeff_ptr, Context &ctx)
-+                          `CeedOperator`. */
-+inline void InitCoefficient(mfem::Coefficient *Q, mfem::Mesh &mesh,
-+                            const mfem::IntegrationRule &ir, bool use_bdr,
-+                            Coefficient *&coeff_ptr)
- {
--   if ( Q == nullptr )
-+   if (Q == nullptr || dynamic_cast<mfem::ConstantCoefficient *>(Q))
-    {
--      Coefficient *ceedCoeff = new Coefficient(1);
--      ctx.coeff = 1.0;
--      coeff_ptr = ceedCoeff;
-+      // The constant coefficient case is handled by the QFunction context
-+      coeff_ptr = nullptr;
-    }
--   else if (ConstantCoefficient *const_coeff =
--               dynamic_cast<ConstantCoefficient*>(Q))
-+   else if (mfem::GridFunctionCoefficient *gf_coeff =
-+               dynamic_cast<mfem::GridFunctionCoefficient *>(Q))
-    {
--      Coefficient *ceedCoeff = new Coefficient(1);
--      ctx.coeff = const_coeff->constant;
--      coeff_ptr = ceedCoeff;
--   }
--   else if (GridFunctionCoefficient* gf_coeff =
--               dynamic_cast<GridFunctionCoefficient*>(Q))
--   {
--      GridCoefficient *ceedCoeff =
-+      GridCoefficient *ceed_coeff =
-          new GridCoefficient(*gf_coeff->GetGridFunction());
--      coeff_ptr = ceedCoeff;
-+      coeff_ptr = ceed_coeff;
-    }
--   else if (QuadratureFunctionCoefficient *cQ =
--               dynamic_cast<QuadratureFunctionCoefficient*>(Q))
-+   else if (mfem::QuadratureFunctionCoefficient *qf_coeff =
-+               dynamic_cast<mfem::QuadratureFunctionCoefficient *>(Q))
-    {
--      QuadCoefficient *ceedCoeff = new QuadCoefficient(1);
--      const int ne = mesh.GetNE();
-+      const int ne = use_bdr ? mesh.GetNBE() : mesh.GetNE();
-       const int nq = ir.GetNPoints();
--      const mfem::QuadratureFunction &qFun = cQ->GetQuadFunction();
--      MFEM_VERIFY(qFun.Size() == nq * ne,
--                  "Incompatible QuadratureFunction dimension \n");
--
--      MFEM_VERIFY(&ir == &qFun.GetSpace()->GetIntRule(0),
-+      QuadCoefficient *ceed_coeff = new QuadCoefficient(1);
-+      const mfem::QuadratureFunction &qfunc = qf_coeff->GetQuadFunction();
-+      MFEM_VERIFY(qfunc.Size() == nq * ne,
-+                  "Incompatible QuadratureFunction dimension.");
-+      MFEM_VERIFY(&ir == &qfunc.GetSpace()->GetIntRule(0),
-                   "IntegrationRule used within integrator and in"
--                  " QuadratureFunction appear to be different");
--      qFun.Read();
--      ceedCoeff->coeff.MakeRef(const_cast<mfem::QuadratureFunction &>(qFun),0);
--      InitVector(ceedCoeff->coeff, ceedCoeff->coeffVector);
--      coeff_ptr = ceedCoeff;
-+                  " QuadratureFunction appear to be different.");
-+      qfunc.Read();
-+      ceed_coeff->vector.MakeRef(const_cast<mfem::QuadratureFunction &>(qfunc), 0);
-+      InitVector(ceed_coeff->vector, ceed_coeff->coeff_vector);
-+      coeff_ptr = ceed_coeff;
-    }
-    else
-    {
--      QuadCoefficient *ceedCoeff = new QuadCoefficient(1);
--      const int ne = mesh.GetNE();
-+      const int ne = use_bdr ? mesh.GetNBE() : mesh.GetNE();
-       const int nq = ir.GetNPoints();
--      ceedCoeff->coeff.SetSize(nq * ne);
--      auto C = Reshape(ceedCoeff->coeff.HostWrite(), nq, ne);
-+      QuadCoefficient *ceed_coeff = new QuadCoefficient(1);
-+      ceed_coeff->vector.SetSize(nq * ne);
-+      auto C = Reshape(ceed_coeff->vector.HostWrite(), nq, ne);
-       for (int e = 0; e < ne; ++e)
-       {
--         mfem::ElementTransformation &T = *mesh.GetElementTransformation(e);
-+         auto &T = use_bdr ? *mesh.GetBdrElementTransformation(e) :
-+                   *mesh.GetElementTransformation(e);
-          for (int q = 0; q < nq; ++q)
-          {
--            C(q,e) = Q->Eval(T, ir.IntPoint(q));
-+            const IntegrationPoint &ip = ir.IntPoint(q);
-+            T.SetIntPoint(&ip);
-+            C(q, e) = Q->Eval(T, ip);
-          }
-       }
--      InitVector(ceedCoeff->coeff, ceedCoeff->coeffVector);
--      coeff_ptr = ceedCoeff;
-+      InitVector(ceed_coeff->vector, ceed_coeff->coeff_vector);
-+      coeff_ptr = ceed_coeff;
-    }
- }
- 
--
- /** @brief Initializes an mfem::ceed::Coefficient @a coeff_ptr from an
-     mfem::VectorCoefficient @a VQ, an mfem::Mesh @a mesh, and an
-     mfem::IntegrationRule @a ir.
-@@ -158,75 +133,121 @@ void InitCoefficient(mfem::Coefficient *Q, mfem::Mesh &mesh,
-     @param[in] VQ is the vector coefficient from the `Integrator`.
-     @param[in] mesh is the mesh.
-     @param[in] ir is the integration rule.
-+    @param[in] use_bdr is a flag to construct the coefficient on mesh boundaries.
-     @param[out] coeff_ptr is the structure to store the coefficient for the
--                          `CeedOperator`.
--    @param[out] ctx is the Context associated to the QFunction. */
--template <typename Context>
--void InitCoefficient(mfem::VectorCoefficient *VQ, mfem::Mesh &mesh,
--                     const mfem::IntegrationRule &ir,
--                     Coefficient *&coeff_ptr, Context &ctx)
-+                          `CeedOperator`. */
-+inline void InitCoefficient(mfem::VectorCoefficient *VQ, mfem::Mesh &mesh,
-+                            const mfem::IntegrationRule &ir, bool use_bdr,
-+                            Coefficient *&coeff_ptr)
- {
--   if (VectorConstantCoefficient *const_coeff =
--          dynamic_cast<VectorConstantCoefficient*>(VQ))
-+   if (VQ == nullptr || dynamic_cast<mfem::VectorConstantCoefficient *>(VQ))
-    {
--      const int vdim = const_coeff->GetVDim();
--      const mfem::Vector &val = const_coeff->GetVec();
--      Coefficient *ceedCoeff = new Coefficient(vdim);
--      for (int i = 0; i < vdim; i++)
--      {
--         ctx.coeff[i] = val[i];
--      }
--      coeff_ptr = ceedCoeff;
-+      // The constant coefficient case is handled by the QFunction context
-+      coeff_ptr = nullptr;
-    }
--   else if (VectorGridFunctionCoefficient* vgf_coeff =
--               dynamic_cast<VectorGridFunctionCoefficient*>(VQ))
-+   else if (mfem::VectorGridFunctionCoefficient *vgf_coeff =
-+               dynamic_cast<mfem::VectorGridFunctionCoefficient *>(VQ))
-    {
--      GridCoefficient *ceedCoeff =
-+      GridCoefficient *ceed_coeff =
-          new GridCoefficient(*vgf_coeff->GetGridFunction());
--      coeff_ptr = ceedCoeff;
-+      coeff_ptr = ceed_coeff;
-    }
--   else if (VectorQuadratureFunctionCoefficient *cQ =
--               dynamic_cast<VectorQuadratureFunctionCoefficient*>(VQ))
-+   else if (mfem::VectorQuadratureFunctionCoefficient *vqf_coeff =
-+               dynamic_cast<mfem::VectorQuadratureFunctionCoefficient *>(VQ))
-    {
--      QuadCoefficient *ceedCoeff = new QuadCoefficient(cQ->GetVDim());
--      const int dim = mesh.Dimension();
--      const int ne = mesh.GetNE();
-+      const int vdim = vqf_coeff->GetVDim();
-+      const int ne = use_bdr ? mesh.GetNBE() : mesh.GetNE();
-       const int nq = ir.GetNPoints();
--      const mfem::QuadratureFunction &qFun = cQ->GetQuadFunction();
--      MFEM_VERIFY(qFun.Size() == dim * nq * ne,
--                  "Incompatible QuadratureFunction dimension \n");
--
--      MFEM_VERIFY(&ir == &qFun.GetSpace()->GetIntRule(0),
-+      QuadCoefficient *ceed_coeff = new QuadCoefficient(vdim);
-+      const mfem::QuadratureFunction &qfunc = vqf_coeff->GetQuadFunction();
-+      MFEM_VERIFY(qfunc.Size() == vdim * nq * ne,
-+                  "Incompatible QuadratureFunction dimension.");
-+      MFEM_VERIFY(&ir == &qfunc.GetSpace()->GetIntRule(0),
-                   "IntegrationRule used within integrator and in"
--                  " QuadratureFunction appear to be different");
--      qFun.Read();
--      ceedCoeff->coeff.MakeRef(const_cast<mfem::QuadratureFunction &>(qFun),0);
--      InitVector(ceedCoeff->coeff, ceedCoeff->coeffVector);
--      coeff_ptr = ceedCoeff;
-+                  " QuadratureFunction appear to be different.");
-+      qfunc.Read();
-+      ceed_coeff->vector.MakeRef(const_cast<mfem::QuadratureFunction &>(qfunc), 0);
-+      InitVector(ceed_coeff->vector, ceed_coeff->coeff_vector);
-+      coeff_ptr = ceed_coeff;
-    }
-    else
-    {
--      const int dim = mesh.Dimension();
--      QuadCoefficient *ceedCoeff = new QuadCoefficient(dim);
--      const int ne = mesh.GetNE();
-+      const int vdim = VQ->GetVDim();
-+      const int ne = use_bdr ? mesh.GetNBE() : mesh.GetNE();
-       const int nq = ir.GetNPoints();
--      ceedCoeff->coeff.SetSize(dim * nq * ne);
--      auto C = Reshape(ceedCoeff->coeff.HostWrite(), dim, nq, ne);
-+      QuadCoefficient *ceed_coeff = new QuadCoefficient(vdim);
-+      ceed_coeff->vector.SetSize(vdim * nq * ne);
-+      auto C = Reshape(ceed_coeff->vector.HostWrite(), vdim, nq, ne);
-       mfem::DenseMatrix Q_ir;
-       for (int e = 0; e < ne; ++e)
-       {
--         mfem::ElementTransformation &T = *mesh.GetElementTransformation(e);
-+         auto &T = use_bdr ? *mesh.GetBdrElementTransformation(e) :
-+                   *mesh.GetElementTransformation(e);
-          VQ->Eval(Q_ir, T, ir);
-          for (int q = 0; q < nq; ++q)
-          {
--            for (int i = 0; i < dim; ++i)
-+            for (int i = 0; i < vdim; ++i)
-+            {
-+               C(i, q, e) = Q_ir(i, q);
-+            }
-+         }
-+      }
-+      InitVector(ceed_coeff->vector, ceed_coeff->coeff_vector);
-+      coeff_ptr = ceed_coeff;
-+   }
-+}
-+
-+/** @brief Initializes an mfem::ceed::Coefficient @a coeff_ptr from an
-+    mfem::MatrixCoefficient @a MQ, an mfem::Mesh @a mesh, and an
-+    mfem::IntegrationRule @a ir.
-+
-+    @param[in] MQ is the matrix coefficient from the `Integrator`.
-+    @param[in] mesh is the mesh.
-+    @param[in] ir is the integration rule.
-+    @param[in] use_bdr is a flag to construct the coefficient on mesh boundaries.
-+    @param[out] coeff_ptr is the structure to store the coefficient for the
-+                          `CeedOperator`. */
-+inline void InitCoefficient(mfem::MatrixCoefficient *MQ, mfem::Mesh &mesh,
-+                            const mfem::IntegrationRule &ir, bool use_bdr,
-+                            Coefficient *&coeff_ptr)
-+{
-+   if (MQ == nullptr || dynamic_cast<mfem::MatrixConstantCoefficient *>(MQ))
-+   {
-+      // The constant coefficient case is handled by the QFunction context
-+      coeff_ptr = nullptr;
-+   }
-+   else
-+   {
-+      // Assumes matrix coefficient is symmetric
-+      const int vdim = MQ->GetVDim();
-+      const int ncomp = (vdim * (vdim + 1)) / 2;
-+      const int ne = use_bdr ? mesh.GetNBE() : mesh.GetNE();
-+      const int nq = ir.GetNPoints();
-+      QuadCoefficient *ceed_coeff = new QuadCoefficient(ncomp);
-+      ceed_coeff->vector.SetSize(ncomp * nq * ne);
-+      auto C = Reshape(ceed_coeff->vector.HostWrite(), ncomp, nq, ne);
-+      mfem::DenseMatrix Q_ip;
-+      for (int e = 0; e < ne; ++e)
-+      {
-+         auto &T = use_bdr ? *mesh.GetBdrElementTransformation(e) :
-+                   *mesh.GetElementTransformation(e);
-+         for (int q = 0; q < nq; ++q)
-+         {
-+            const IntegrationPoint &ip = ir.IntPoint(q);
-+            T.SetIntPoint(&ip);
-+            MQ->Eval(Q_ip, T, ip);
-+            for (int j = 0; j < vdim; ++j)
-             {
--               C(i,q,e) = Q_ir(i,q);
-+               for (int i = j; i < vdim; ++i)
-+               {
-+                  const int idx = (j * vdim) - (((j - 1) * j) / 2) + i - j;
-+                  C(idx, q, e) = Q_ip(i, j);  // Column-major
-+               }
-             }
-          }
-       }
--      InitVector(ceedCoeff->coeff, ceedCoeff->coeffVector);
--      coeff_ptr = ceedCoeff;
-+      InitVector(ceed_coeff->vector, ceed_coeff->coeff_vector);
-+      coeff_ptr = ceed_coeff;
-    }
- }
- 
-@@ -237,57 +258,49 @@ void InitCoefficient(mfem::VectorCoefficient *VQ, mfem::Mesh &mesh,
-     @param[in] Q is the coefficient from the `Integrator`.
-     @param[in] mesh is the mesh.
-     @param[in] ir is the integration rule.
--    @param[in] nelem The number of elements.
--    @param[in] indices The indices of the elements of same type in the
-+    @param[in] use_bdr is a flag to construct the coefficient on mesh boundaries.
-+    @param[in] nelem is the number of elements.
-+    @param[in] indices are the indices of the elements of same type in the
-                        `FiniteElementSpace`.
-     @param[out] coeff_ptr is the structure to store the coefficient for the
--                          `CeedOperator`.
--    @param[out] ctx is the Context associated to the QFunction. */
--template <typename Context>
--void InitCoefficientWithIndices(mfem::Coefficient *Q, mfem::Mesh &mesh,
--                                const mfem::IntegrationRule &ir,
--                                int nelem,
--                                const int* indices,
--                                Coefficient*& coeff_ptr, Context &ctx)
-+                          `CeedOperator`. */
-+inline void InitCoefficientWithIndices(mfem::Coefficient *Q,
-+                                       mfem::Mesh &mesh,
-+                                       const mfem::IntegrationRule &ir,
-+                                       bool use_bdr,
-+                                       int nelem,
-+                                       const int *indices,
-+                                       Coefficient *&coeff_ptr)
- {
--   if ( Q == nullptr )
--   {
--      Coefficient *ceedCoeff = new Coefficient(1);
--      ctx.coeff = 1.0;
--      coeff_ptr = ceedCoeff;
--   }
--   else if (ConstantCoefficient *const_coeff =
--               dynamic_cast<ConstantCoefficient*>(Q))
-+   if (Q == nullptr || dynamic_cast<mfem::ConstantCoefficient *>(Q))
-    {
--      Coefficient *ceedCoeff = new Coefficient(1);
--      ctx.coeff = const_coeff->constant;
--      coeff_ptr = ceedCoeff;
-+      // The constant coefficient case is handled by the QFunction context
-+      coeff_ptr = nullptr;
-    }
--   else if (GridFunctionCoefficient* gf_coeff =
--               dynamic_cast<GridFunctionCoefficient*>(Q))
-+   else if (mfem::GridFunctionCoefficient *gf_coeff =
-+               dynamic_cast<mfem::GridFunctionCoefficient *>(Q))
-    {
--      GridCoefficient *ceedCoeff =
-+      GridCoefficient *ceed_coeff =
-          new GridCoefficient(*gf_coeff->GetGridFunction());
--      coeff_ptr = ceedCoeff;
-+      coeff_ptr = ceed_coeff;
-    }
--   else if (QuadratureFunctionCoefficient *cQ =
--               dynamic_cast<QuadratureFunctionCoefficient*>(Q))
-+   else if (mfem::QuadratureFunctionCoefficient *qf_coeff =
-+               dynamic_cast<mfem::QuadratureFunctionCoefficient *>(Q))
-    {
--      QuadCoefficient *ceedCoeff = new QuadCoefficient(1);
--      const int ne = mesh.GetNE();
-+      const int ne = use_bdr ? mesh.GetNBE() : mesh.GetNE();
-       const int nq = ir.GetNPoints();
--      const mfem::QuadratureFunction &qFun = cQ->GetQuadFunction();
--      MFEM_VERIFY(qFun.Size() == nq * ne,
--                  "Incompatible QuadratureFunction dimension \n");
--
--      MFEM_VERIFY(&ir == &qFun.GetSpace()->GetIntRule(0),
-+      QuadCoefficient *ceed_coeff = new QuadCoefficient(1);
-+      ceed_coeff->vector.SetSize(nq * nelem);
-+      const mfem::QuadratureFunction &qfunc = qf_coeff->GetQuadFunction();
-+      MFEM_VERIFY(qfunc.Size() == nq * ne,
-+                  "Incompatible QuadratureFunction dimension.");
-+      MFEM_VERIFY(&ir == &qfunc.GetSpace()->GetIntRule(0),
-                   "IntegrationRule used within integrator and in"
--                  " QuadratureFunction appear to be different");
--      ceedCoeff->coeff.SetSize(nq * nelem);
-+                  " QuadratureFunction appear to be different.");
-       Memory<int> m_indices((int*)indices, nelem, false);
--      auto in = Reshape(qFun.Read(), nq, ne);
-+      auto in = Reshape(qfunc.Read(), nq, ne);
-       auto d_indices = Read(m_indices, nelem);
--      auto out = Reshape(ceedCoeff->coeff.Write(), nq, nelem);
-+      auto out = Reshape(ceed_coeff->vector.Write(), nq, nelem);
-       mfem::forall(nelem * nq, [=] MFEM_HOST_DEVICE (int i)
-       {
-          const int q = i%nq;
-@@ -296,30 +309,32 @@ void InitCoefficientWithIndices(mfem::Coefficient *Q, mfem::Mesh &mesh,
-          out(q, sub_e) = in(q, e);
-       });
-       m_indices.DeleteDevice();
--      InitVector(ceedCoeff->coeff, ceedCoeff->coeffVector);
--      coeff_ptr = ceedCoeff;
-+      InitVector(ceed_coeff->vector, ceed_coeff->coeff_vector);
-+      coeff_ptr = ceed_coeff;
-    }
-    else
-    {
--      QuadCoefficient *ceedCoeff = new QuadCoefficient(1);
-       const int nq = ir.GetNPoints();
--      ceedCoeff->coeff.SetSize(nq * nelem);
--      auto C = Reshape(ceedCoeff->coeff.HostWrite(), nq, nelem);
-+      QuadCoefficient *ceed_coeff = new QuadCoefficient(1);
-+      ceed_coeff->vector.SetSize(nq * nelem);
-+      auto C = Reshape(ceed_coeff->vector.HostWrite(), nq, nelem);
-       for (int i = 0; i < nelem; ++i)
-       {
-          const int e = indices[i];
--         mfem::ElementTransformation &T = *mesh.GetElementTransformation(e);
-+         auto &T = use_bdr ? *mesh.GetBdrElementTransformation(e) :
-+                   *mesh.GetElementTransformation(e);
-          for (int q = 0; q < nq; ++q)
-          {
--            C(q, i) = Q->Eval(T, ir.IntPoint(q));
-+            const IntegrationPoint &ip = ir.IntPoint(q);
-+            T.SetIntPoint(&ip);
-+            C(q, i) = Q->Eval(T, ip);
-          }
-       }
--      InitVector(ceedCoeff->coeff, ceedCoeff->coeffVector);
--      coeff_ptr = ceedCoeff;
-+      InitVector(ceed_coeff->vector, ceed_coeff->coeff_vector);
-+      coeff_ptr = ceed_coeff;
-    }
- }
- 
--
- /** @brief Initializes an mfem::ceed::Coefficient @a coeff_ptr from an
-     mfem::VectorCoefficient @a Q, an mfem::Mesh @a mesh, and an
-     mfem::IntegrationRule @a ir for the elements given by the indices @a indices.
-@@ -327,109 +342,165 @@ void InitCoefficientWithIndices(mfem::Coefficient *Q, mfem::Mesh &mesh,
-     @param[in] VQ is the vector coefficient from the `Integrator`.
-     @param[in] mesh is the mesh.
-     @param[in] ir is the integration rule.
--    @param[in] nelem The number of elements.
--    @param[in] indices The indices of the elements of same type in the
-+    @param[in] use_bdr is a flag to construct the coefficient on mesh boundaries.
-+    @param[in] nelem is the number of elements.
-+    @param[in] indices are the indices of the elements of same type in the
-                        `FiniteElementSpace`.
-     @param[out] coeff_ptr is the structure to store the coefficient for the
--                          `CeedOperator`.
--    @param[out] ctx is the Context associated to the QFunction. */
--template <typename Context>
--void InitCoefficientWithIndices(mfem::VectorCoefficient *VQ, mfem::Mesh &mesh,
--                                const mfem::IntegrationRule &ir,
--                                int nelem,
--                                const int* indices,
--                                Coefficient *&coeff_ptr, Context &ctx)
-+                          `CeedOperator`. */
-+inline void InitCoefficientWithIndices(mfem::VectorCoefficient *VQ,
-+                                       mfem::Mesh &mesh,
-+                                       const mfem::IntegrationRule &ir,
-+                                       bool use_bdr,
-+                                       int nelem, const int *indices,
-+                                       Coefficient *&coeff_ptr)
- {
--   if (VectorConstantCoefficient *const_coeff =
--          dynamic_cast<VectorConstantCoefficient*>(VQ))
-+   if (VQ == nullptr || dynamic_cast<mfem::VectorConstantCoefficient *>(VQ))
-    {
--      const int vdim = const_coeff->GetVDim();
--      const mfem::Vector &val = const_coeff->GetVec();
--      Coefficient *ceedCoeff = new Coefficient(vdim);
--      for (int i = 0; i < vdim; i++)
--      {
--         ctx.coeff[i] = val[i];
--      }
--      coeff_ptr = ceedCoeff;
-+      // The constant coefficient case is handled by the QFunction context
-+      coeff_ptr = nullptr;
-    }
--   else if (VectorGridFunctionCoefficient* vgf_coeff =
--               dynamic_cast<VectorGridFunctionCoefficient*>(VQ))
-+   else if (mfem::VectorGridFunctionCoefficient *vgf_coeff =
-+               dynamic_cast<mfem::VectorGridFunctionCoefficient *>(VQ))
-    {
--      GridCoefficient *ceedCoeff =
-+      GridCoefficient *ceed_coeff =
-          new GridCoefficient(*vgf_coeff->GetGridFunction());
--      coeff_ptr = ceedCoeff;
-+      coeff_ptr = ceed_coeff;
-    }
--   else if (VectorQuadratureFunctionCoefficient *cQ =
--               dynamic_cast<VectorQuadratureFunctionCoefficient*>(VQ))
-+   else if (mfem::VectorQuadratureFunctionCoefficient *vqf_coeff =
-+               dynamic_cast<mfem::VectorQuadratureFunctionCoefficient *>(VQ))
-    {
--      QuadCoefficient *ceedCoeff = new QuadCoefficient(cQ->GetVDim());
--      const int dim = mesh.Dimension();
--      const int ne = mesh.GetNE();
-+      const int vdim = vqf_coeff->GetVDim();
-+      const int ne = use_bdr ? mesh.GetNBE() : mesh.GetNE();
-       const int nq = ir.GetNPoints();
--      const mfem::QuadratureFunction &qFun = cQ->GetQuadFunction();
--      MFEM_VERIFY(qFun.Size() == dim * nq * ne,
--                  "Incompatible QuadratureFunction dimension \n");
--
--      MFEM_VERIFY(&ir == &qFun.GetSpace()->GetIntRule(0),
-+      QuadCoefficient *ceed_coeff = new QuadCoefficient(vdim);
-+      ceed_coeff->vector.SetSize(vdim * nq * nelem);
-+      const mfem::QuadratureFunction &qfunc = vqf_coeff->GetQuadFunction();
-+      MFEM_VERIFY(qfunc.Size() == vdim * nq * ne,
-+                  "Incompatible QuadratureFunction dimension.");
-+      MFEM_VERIFY(&ir == &qfunc.GetSpace()->GetIntRule(0),
-                   "IntegrationRule used within integrator and in"
--                  " QuadratureFunction appear to be different");
--      ceedCoeff->coeff.SetSize(dim * nq * nelem);
-+                  " QuadratureFunction appear to be different.");
-       Memory<int> m_indices((int*)indices, nelem, false);
--      auto in = Reshape(qFun.Read(), dim, nq, ne);
-+      auto in = Reshape(qfunc.Read(), vdim, nq, ne);
-       auto d_indices = Read(m_indices, nelem);
--      auto out = Reshape(ceedCoeff->coeff.Write(), dim, nq, nelem);
-+      auto out = Reshape(ceed_coeff->vector.Write(), vdim, nq, nelem);
-       mfem::forall(nelem * nq, [=] MFEM_HOST_DEVICE (int i)
-       {
-          const int q = i%nq;
-          const int sub_e = i/nq;
-          const int e = d_indices[sub_e];
--         for (int d = 0; d < dim; d++)
-+         for (int d = 0; d < vdim; d++)
-          {
-             out(d, q, sub_e) = in(d, q, e);
-          }
-       });
-       m_indices.DeleteDevice();
--      InitVector(ceedCoeff->coeff, ceedCoeff->coeffVector);
--      coeff_ptr = ceedCoeff;
-+      InitVector(ceed_coeff->vector, ceed_coeff->coeff_vector);
-+      coeff_ptr = ceed_coeff;
-    }
-    else
-    {
--      const int dim = mesh.Dimension();
--      QuadCoefficient *ceedCoeff = new QuadCoefficient(dim);
-+      const int vdim = VQ->GetVDim();
-       const int nq = ir.GetNPoints();
--      ceedCoeff->coeff.SetSize(dim * nq * nelem);
--      auto C = Reshape(ceedCoeff->coeff.HostWrite(), dim, nq, nelem);
-+      QuadCoefficient *ceed_coeff = new QuadCoefficient(vdim);
-+      ceed_coeff->vector.SetSize(vdim * nq * nelem);
-+      auto C = Reshape(ceed_coeff->vector.HostWrite(), vdim, nq, nelem);
-       mfem::DenseMatrix Q_ir;
-       for (int i = 0; i < nelem; ++i)
-       {
-          const int e = indices[i];
--         mfem::ElementTransformation &T = *mesh.GetElementTransformation(e);
-+         auto &T = use_bdr ? *mesh.GetBdrElementTransformation(e) :
-+                   *mesh.GetElementTransformation(e);
-          VQ->Eval(Q_ir, T, ir);
-          for (int q = 0; q < nq; ++q)
-          {
--            for (int d = 0; d < dim; ++d)
-+            for (int d = 0; d < vdim; ++d)
-             {
-                C(d, q, i) = Q_ir(d, q);
-             }
-          }
-       }
--      InitVector(ceedCoeff->coeff, ceedCoeff->coeffVector);
--      coeff_ptr = ceedCoeff;
-+      InitVector(ceed_coeff->vector, ceed_coeff->coeff_vector);
-+      coeff_ptr = ceed_coeff;
-+   }
-+}
-+
-+/** @brief Initializes an mfem::ceed::Coefficient @a coeff_ptr from an
-+    mfem::MatrixCoefficient @a Q, an mfem::Mesh @a mesh, and an
-+    mfem::IntegrationRule @a ir for the elements given by the indices @a indices.
-+
-+    @param[in] MQ is the matrix coefficient from the `Integrator`.
-+    @param[in] mesh is the mesh.
-+    @param[in] ir is the integration rule.
-+    @param[in] use_bdr is a flag to construct the coefficient on mesh boundaries.
-+    @param[in] nelem is the number of elements.
-+    @param[in] indices are the indices of the elements of same type in the
-+                       `FiniteElementSpace`.
-+    @param[out] coeff_ptr is the structure to store the coefficient for the
-+                          `CeedOperator`. */
-+inline void InitCoefficientWithIndices(mfem::MatrixCoefficient *MQ,
-+                                       mfem::Mesh &mesh,
-+                                       const mfem::IntegrationRule &ir,
-+                                       bool use_bdr,
-+                                       int nelem, const int *indices,
-+                                       Coefficient *&coeff_ptr)
-+{
-+   if (MQ == nullptr || dynamic_cast<mfem::MatrixConstantCoefficient *>(MQ))
-+   {
-+      // The constant coefficient case is handled by the QFunction context
-+      coeff_ptr = nullptr;
-+   }
-+   else
-+   {
-+      // Assumes matrix coefficient is symmetric
-+      const int vdim = MQ->GetVDim();
-+      const int ncomp = (vdim * (vdim + 1)) / 2;
-+      const int nq = ir.GetNPoints();
-+      QuadCoefficient *ceed_coeff = new QuadCoefficient(ncomp);
-+      ceed_coeff->vector.SetSize(ncomp * nq * nelem);
-+      auto C = Reshape(ceed_coeff->vector.HostWrite(), ncomp, nq, nelem);
-+      mfem::DenseMatrix Q_ip;
-+      for (int i = 0; i < nelem; ++i)
-+      {
-+         const int e = indices[i];
-+         auto &T = use_bdr ? *mesh.GetBdrElementTransformation(e) :
-+                   *mesh.GetElementTransformation(e);
-+         for (int q = 0; q < nq; ++q)
-+         {
-+            const IntegrationPoint &ip = ir.IntPoint(q);
-+            T.SetIntPoint(&ip);
-+            MQ->Eval(Q_ip, T, ip);
-+            for (int dj = 0; dj < vdim; ++dj)
-+            {
-+               for (int di = dj; di < vdim; ++di)
-+               {
-+                  const int idx = (dj * vdim) - (((dj - 1) * dj) / 2) + di - dj;
-+                  C(idx, q, i) = Q_ip(di, dj);  // Column-major
-+               }
-+            }
-+         }
-+      }
-+      InitVector(ceed_coeff->vector, ceed_coeff->coeff_vector);
-+      coeff_ptr = ceed_coeff;
-    }
- }
- 
--template <typename Coeff, typename Context>
--void InitCoefficient(Coeff *Q, mfem::Mesh &mesh,
--                     const mfem::IntegrationRule &ir, int nelem,
--                     const int* indices, Coefficient *&coeff_ptr, Context &ctx)
-+template <typename CoeffType>
-+inline void InitCoefficient(CoeffType *Q, mfem::Mesh &mesh,
-+                            const mfem::IntegrationRule &ir,
-+                            bool use_bdr,
-+                            int nelem,
-+                            const int *indices,
-+                            Coefficient *&coeff_ptr)
- {
-    if (indices)
-    {
--      InitCoefficientWithIndices(Q, mesh, ir, nelem, indices, coeff_ptr, ctx);
-+      InitCoefficientWithIndices(Q, mesh, ir, use_bdr, nelem, indices, coeff_ptr);
-    }
-    else
-    {
--      InitCoefficient(Q, mesh, ir, coeff_ptr, ctx);
-+      InitCoefficient(Q, mesh, ir, use_bdr, coeff_ptr);
-    }
- }
- 
-diff --git a/fem/ceed/interface/integrator.hpp b/fem/ceed/interface/integrator.hpp
-index eea79388d..aa24ec207 100644
---- a/fem/ceed/interface/integrator.hpp
-+++ b/fem/ceed/interface/integrator.hpp
-@@ -9,14 +9,14 @@
- // terms of the BSD-3 license. We welcome feedback and contributions, see file
- // CONTRIBUTING.md for details.
- 
--#ifndef MFEM_LIBCEED_INTEG
--#define MFEM_LIBCEED_INTEG
-+#ifndef MFEM_LIBCEED_INTEGRATOR
-+#define MFEM_LIBCEED_INTEGRATOR
- 
--#include "../../../config/config.hpp"
- #include "../../fespace.hpp"
- #include "../../gridfunc.hpp"
--#include "operator.hpp"
-+#include "basis.hpp"
- #include "coefficient.hpp"
-+#include "operator.hpp"
- #include "restriction.hpp"
- #include "util.hpp"
- #include "ceed.hpp"
-@@ -28,43 +28,27 @@ namespace ceed
- {
- 
- /** The different evaluation modes available for PA and MF CeedIntegrator. */
--enum class EvalMode { None, Interp, Grad, InterpAndGrad };
-+enum class EvalMode { None, Interp, Grad, InterpAndGrad, Div, Curl };
- 
- #ifdef MFEM_USE_CEED
- /** This structure is a template interface for the Assemble methods of
-     PAIntegrator and MFIntegrator. See ceed/mass.cpp for an example. */
- struct OperatorInfo
- {
--   /** The path to the qFunction header. */
-+   /** The path to the QFunction header. */
-    const char *header;
--   /** The name of the qFunction to build a partially assembled CeedOperator
--       with a constant Coefficient. */
--   const char *build_func_const;
--   /** The qFunction to build a partially assembled CeedOperator with a constant
--       Coefficient. */
--   CeedQFunctionUser build_qf_const;
--   /** The name of the qFunction to build a partially assembled CeedOperator
--       with a variable Coefficient. */
--   const char *build_func_quad;
--   /** The qFunction to build a partially assembled CeedOperator with a variable
--       Coefficient. */
--   CeedQFunctionUser build_qf_quad;
--   /** The name of the qFunction to apply a partially assembled CeedOperator. */
-+   /** The name of the QFunction to build a partially assembled CeedOperator. */
-+   const char *build_func;
-+   /** The QFunction to build a partially assembled CeedOperator. */
-+   CeedQFunctionUser build_qf;
-+   /** The name of the QFunction to apply a partially assembled CeedOperator. */
-    const char *apply_func;
--   /** The qFunction to apply a partially assembled CeedOperator. */
-+   /** The QFunction to apply a partially assembled CeedOperator. */
-    CeedQFunctionUser apply_qf;
--   /** The name of the qFunction to apply a matrix-free CeedOperator with a
--       constant Coefficient. */
--   const char *apply_func_mf_const;
--   /** The qFunction to apply a matrix-free CeedOperator with a constant
--       Coefficient. */
--   CeedQFunctionUser apply_qf_mf_const;
--   /** The name of the qFunction to apply a matrix-free CeedOperator with a
--       variable Coefficient. */
--   const char *apply_func_mf_quad;
--   /** The qFunction to apply a matrix-free CeedOperator with a variable
--       Coefficient. */
--   CeedQFunctionUser apply_qf_mf_quad;
-+   /** The name of the QFunction to apply a matrix-free CeedOperator. */
-+   const char *apply_func_mf;
-+   /** The QFunction to apply a matrix-free CeedOperator. */
-+   CeedQFunctionUser apply_qf_mf;
-    /** The EvalMode on the trial basis functions. */
-    EvalMode trial_op;
-    /** The EvalMode on the test basis functions. */
-@@ -74,284 +58,347 @@ struct OperatorInfo
- };
- #endif
- 
--/** This class represent a partially assembled operator using libCEED. */
--class PAIntegrator : public ceed::Operator
-+/** This class represents a matrix-free or partially assembled bilinear,
-+    mixed bilinear, or nonlinear form operator using libCEED. */
-+class Integrator : public Operator
- {
- #ifdef MFEM_USE_CEED
- protected:
--   CeedBasis  trial_basis, test_basis, mesh_basis;
--   CeedElemRestriction trial_restr, test_restr, mesh_restr, restr_i;
--   CeedQFunction build_qfunc, apply_qfunc;
-+   CeedBasis trial_basis, test_basis, mesh_basis;
-+   CeedElemRestriction trial_restr, test_restr, mesh_restr, qdata_restr;
-+   CeedQFunction apply_qfunc;
-+   CeedQFunctionContext apply_ctx;
-    CeedVector node_coords, qdata;
-    Coefficient *coeff;
--   CeedQFunctionContext build_ctx;
--   CeedOperator build_oper;
- 
- public:
--   PAIntegrator()
-+   Integrator()
-       : Operator(),
-         trial_basis(nullptr), test_basis(nullptr), mesh_basis(nullptr),
-         trial_restr(nullptr), test_restr(nullptr), mesh_restr(nullptr),
--        restr_i(nullptr),
--        build_qfunc(nullptr), apply_qfunc(nullptr), node_coords(nullptr),
--        qdata(nullptr), coeff(nullptr), build_ctx(nullptr), build_oper(nullptr)
--   { }
-+        qdata_restr(nullptr),
-+        apply_qfunc(nullptr), apply_ctx(nullptr),
-+        node_coords(nullptr), qdata(nullptr), coeff(nullptr) {}
- 
--   /** @brief This method assembles the `PAIntegrator` with the given
-+   /** @brief This method assembles the `Integrator` with the given
-        `CeedOperatorInfo` @a info, an `mfem::FiniteElementSpace` @a fes, an
--       `mfem::IntegrationRule` @a ir, and `mfem::Coefficient` or
--       `mfem::VectorCoefficient` @a Q.
-+       `mfem::IntegrationRule` @a ir, and `mfem::Coefficient`,
-+       `mfem::VectorCoefficient`, or `mfem::MatrixCoefficient` @a Q.
-        The `CeedOperatorInfo` type is expected to inherit from `OperatorInfo`,
--       and contain a `Context` type relevant to the qFunctions.
--
--       @param[in] info is the structure describing the CeedOperator to assemble.
--       @param[in] fes is the finite element space.
--       @param[in] ir is the integration rule for the operator.
--       @param[in] Q is the coefficient from the `Integrator`. */
-+       and contain a `Context` type relevant to the QFunctions.
-+
-+       @param[in] info The structure describing the CeedOperator to assemble.
-+       @param[in] fes The finite element space.
-+       @param[in] ir The integration rule for the operator.
-+       @param[in] Q The coefficient from the `Integrator`.
-+       @param[in] use_bdr Controls whether to construct the operator for the domain
-+                          or domain boundary.
-+       @param[in] use_mf Controls whether to construct a matrix-free or partially
-+                         assembled operator. */
-    template <typename CeedOperatorInfo, typename CoeffType>
-    void Assemble(CeedOperatorInfo &info,
-                  const mfem::FiniteElementSpace &fes,
-                  const mfem::IntegrationRule &ir,
--                 CoeffType *Q)
-+                 CoeffType *Q,
-+                 const bool use_bdr = false,
-+                 const bool use_mf = false)
-    {
--      Assemble(info, fes, ir, fes.GetNE(), nullptr, Q);
-+      Assemble(info, fes, fes, ir,
-+               use_bdr ? fes.GetNBE() : fes.GetNE(),
-+               nullptr, Q, use_bdr, use_mf);
-    }
- 
--   /** @brief This method assembles the `PAIntegrator` with the given
-+   /** @brief This method assembles the `Integrator` with the given
-        `CeedOperatorInfo` @a info, an `mfem::FiniteElementSpace` @a fes, an
--       `mfem::IntegrationRule` @a ir, and `mfem::Coefficient` or
--       `mfem::VectorCoefficient` @a Q for the elements given by the indices
--       @a indices.
-+       `mfem::IntegrationRule` @a ir, and `mfem::Coefficient`,
-+       `mfem::VectorCoefficient`, or `mfem::MatrixCoefficient` @a Q for the
-+       elements given by the indices @a indices.
-        The `CeedOperatorInfo` type is expected to inherit from `OperatorInfo`,
--       and contain a `Context` type relevant to the qFunctions.
-+       and contain a `Context` type relevant to the QFunctions.
- 
--       @param[in] info is the structure describing the CeedOperator to assemble.
--       @param[in] fes is the finite element space.
--       @param[in] ir is the integration rule for the operator.
-+       @param[in] info The structure describing the CeedOperator to assemble.
-+       @param[in] fes The finite element space.
-+       @param[in] ir The integration rule for the operator.
-        @param[in] nelem The number of elements.
-        @param[in] indices The indices of the elements of same type in the
-                           `FiniteElementSpace`. If `indices == nullptr`, assumes
-                           that the `FiniteElementSpace` is not mixed.
--       @param[in] Q is the coefficient from the `Integrator`. */
-+       @param[in] Q The coefficient from the `Integrator`.
-+       @param[in] use_bdr Controls whether to construct the operator for the domain
-+                          or domain boundary.
-+       @param[in] use_mf Controls whether to construct a matrix-free or partially
-+                         assembled operator. */
-    template <typename CeedOperatorInfo, typename CoeffType>
-    void Assemble(CeedOperatorInfo &info,
-                  const mfem::FiniteElementSpace &fes,
-                  const mfem::IntegrationRule &ir,
-                  int nelem,
--                 const int* indices,
--                 CoeffType *Q)
-+                 const int *indices,
-+                 CoeffType *Q,
-+                 const bool use_bdr = false,
-+                 const bool use_mf = false)
-    {
--      Assemble(info, fes, fes, ir, nelem, indices, Q);
-+      Assemble(info, fes, fes, ir, nelem, indices, Q, use_bdr, use_mf);
-    }
- 
--   /** This method assembles the PAIntegrator for mixed forms.
-+   /** This method assembles the `Integrator` for mixed forms.
- 
--       @param[in] info the `CeedOperatorInfo` describing the `CeedOperator`,
-+       @param[in] info The `CeedOperatorInfo` describing the `CeedOperator`,
-                        the `CeedOperatorInfo` type is expected to inherit from
-                        `OperatorInfo` and contain a `Context` type relevant to
--                       the qFunctions.
--       @param[in] trial_fes the trial `FiniteElementSpace` for the form,
--       @param[in] test_fes the test `FiniteElementSpace` for the form,
--       @param[in] ir the `IntegrationRule` for the numerical integration,
--       @param[in] Q `Coefficient` or `VectorCoefficient`. */
-+                       the QFunctions.
-+       @param[in] trial_fes The trial `FiniteElementSpace` for the form.
-+       @param[in] test_fes The test `FiniteElementSpace` for the form.
-+       @param[in] ir The `IntegrationRule` for the numerical integration.
-+       @param[in] Q `Coefficient`, `VectorCoefficient`, or
-+                    `MatrixCoefficient`.
-+       @param[in] use_bdr Controls whether to construct the operator for the domain
-+                          or domain boundary.
-+       @param[in] use_mf Controls whether to construct a matrix-free or partially
-+                         assembled operator. */
-    template <typename CeedOperatorInfo, typename CoeffType>
-    void Assemble(CeedOperatorInfo &info,
-                  const mfem::FiniteElementSpace &trial_fes,
-                  const mfem::FiniteElementSpace &test_fes,
-                  const mfem::IntegrationRule &ir,
--                 CoeffType *Q)
-+                 CoeffType *Q,
-+                 const bool use_bdr = false,
-+                 const bool use_mf = false)
-    {
--      Assemble(info, trial_fes, test_fes, ir, trial_fes.GetNE(), nullptr, Q);
-+      Assemble(info, trial_fes, test_fes, ir,
-+               use_bdr ? trial_fes.GetNBE() : trial_fes.GetNE(),
-+               nullptr, Q, use_bdr, use_mf);
-    }
- 
--   /** This method assembles the PAIntegrator for mixed forms on mixed meshes.
-+   /** This method assembles the `Integrator` for mixed forms on mixed meshes.
- 
--       @param[in] info the `CeedOperatorInfo` describing the `CeedOperator`,
-+       @param[in] info The `CeedOperatorInfo` describing the `CeedOperator`,
-                        the `CeedOperatorInfo` type is expected to inherit from
-                        `OperatorInfo` and contain a `Context` type relevant to
--                       the qFunctions.
--       @param[in] trial_fes the trial `FiniteElementSpace` for the form,
--       @param[in] test_fes the test `FiniteElementSpace` for the form,
--       @param[in] ir the `IntegrationRule` for the numerical integration,
--       @param[in] nelem The number of elements,
-+                       the QFunctions.
-+       @param[in] trial_fes The trial `FiniteElementSpace` for the form.
-+       @param[in] test_fes The test `FiniteElementSpace` for the form.
-+       @param[in] ir The `IntegrationRule` for the numerical integration.
-+       @param[in] nelem The number of elements.
-        @param[in] indices The indices of the elements of same type in the
-                           `FiniteElementSpace`. If `indices == nullptr`, assumes
--                          that the `FiniteElementSpace` is not mixed,
--       @param[in] Q `Coefficient` or `VectorCoefficient`. */
-+                          that the `FiniteElementSpace` is not mixed.
-+       @param[in] Q `Coefficient`, `VectorCoefficient`, or
-+                    `MatrixCoefficient`.
-+       @param[in] use_bdr Controls whether to construct the operator for the domain
-+                          or domain boundary.
-+       @param[in] use_mf Controls whether to construct a matrix-free or partially
-+                         assembled operator. */
-    template <typename CeedOperatorInfo, typename CoeffType>
-    void Assemble(CeedOperatorInfo &info,
-                  const mfem::FiniteElementSpace &trial_fes,
-                  const mfem::FiniteElementSpace &test_fes,
-                  const mfem::IntegrationRule &ir,
-                  int nelem,
--                 const int* indices,
--                 CoeffType *Q)
-+                 const int *indices,
-+                 CoeffType *Q,
-+                 const bool use_bdr = false,
-+                 const bool use_mf = false)
-    {
-       Ceed ceed(internal::ceed);
-       mfem::Mesh &mesh = *trial_fes.GetMesh();
--      MFEM_VERIFY(!(!indices && mesh.GetNumGeometries(mesh.Dimension()) > 1),
--                  "Use ceed::MixedIntegrator on mixed meshes.");
--      InitCoefficient(Q, mesh, ir, nelem, indices, coeff, info.ctx);
--      bool const_coeff = coeff->IsConstant();
--      std::string build_func = const_coeff ? info.build_func_const
--                               : info.build_func_quad;
--      CeedQFunctionUser build_qf = const_coeff ? info.build_qf_const
--                                   : info.build_qf_quad;
--      PAOperator op {info.qdatasize, info.header,
--                     build_func, build_qf,
--                     info.apply_func, info.apply_qf,
--                     info.trial_op,
--                     info.test_op
--                    };
--      CeedInt dim = mesh.SpaceDimension();
-+      CeedInt dim = mesh.Dimension() - use_bdr;
-+      CeedInt space_dim = mesh.SpaceDimension();
-+      CeedInt curl_dim = (dim < 3) ? 1 : dim;
-       CeedInt trial_vdim = trial_fes.GetVDim();
-       CeedInt test_vdim = test_fes.GetVDim();
--
--      mesh.EnsureNodes();
--      if ( &trial_fes == &test_fes )
-+      bool trial_vectorfe =
-+         (trial_fes.FEColl()->GetRangeType(dim) == mfem::FiniteElement::VECTOR);
-+      bool test_vectorfe =
-+         (test_fes.FEColl()->GetRangeType(dim) == mfem::FiniteElement::VECTOR);
-+      MFEM_VERIFY(!(!indices && mesh.GetNumGeometries(dim) > 1),
-+                  "Use ceed::MixedOperator<ceed::Integrator> on mixed meshes.");
-+      InitCoefficient(Q, mesh, ir, use_bdr, nelem, indices, coeff);
-+
-+      if (&trial_fes == &test_fes)
-       {
--         InitBasisAndRestriction(trial_fes, ir, nelem, indices,
--                                 ceed, &trial_basis, &trial_restr);
--         test_basis = trial_basis;
--         test_restr = trial_restr;
-+         InitBasis(trial_fes, ir, use_bdr, indices, ceed,
-+                   &trial_basis);
-+         InitRestriction(trial_fes, use_bdr, nelem, indices, ceed,
-+                         &trial_restr);
-+         CeedBasisReferenceCopy(trial_basis, &test_basis);
-+         CeedElemRestrictionReferenceCopy(trial_restr, &test_restr);
-       }
-       else
-       {
--         InitBasisAndRestriction(trial_fes, ir, nelem, indices,
--                                 ceed, &trial_basis, &trial_restr);
--         InitBasisAndRestriction(test_fes, ir, nelem, indices,
--                                 ceed, &test_basis, &test_restr);
-+         InitBasis(trial_fes, ir, use_bdr, indices, ceed,
-+                   &trial_basis);
-+         InitBasis(test_fes, ir, use_bdr, indices, ceed,
-+                   &test_basis);
-+         InitRestriction(trial_fes, use_bdr, nelem, indices, ceed,
-+                         &trial_restr);
-+         InitRestriction(test_fes, use_bdr, nelem, indices, ceed,
-+                         &test_restr);
-       }
- 
--      const mfem::FiniteElementSpace *mesh_fes = mesh.GetNodalFESpace();
--      MFEM_VERIFY(mesh_fes, "the Mesh has no nodal FE space");
--      InitBasisAndRestriction(*mesh_fes, ir, nelem, indices,
--                              ceed, &mesh_basis, &mesh_restr);
--
-       CeedInt trial_nqpts, test_nqpts;
-       CeedBasisGetNumQuadraturePoints(trial_basis, &trial_nqpts);
-       CeedBasisGetNumQuadraturePoints(test_basis, &test_nqpts);
-       MFEM_VERIFY(trial_nqpts == test_nqpts,
-                   "Trial and test basis must have the same number of quadrature"
-                   " points.");
--      CeedInt nqpts = trial_nqpts;
--
--      const int qdatasize = op.qdatasize;
--      InitStridedRestriction(*mesh_fes, nelem, nqpts, qdatasize,
--                             CEED_STRIDES_BACKEND,
--                             &restr_i);
-+      const CeedInt nqpts = trial_nqpts;
- 
-+      mesh.EnsureNodes();
-+      const mfem::FiniteElementSpace *mesh_fes = mesh.GetNodalFESpace();
-+      MFEM_VERIFY(mesh_fes, "The mesh has no nodal FE space.");
-+      InitBasis(*mesh_fes, ir, use_bdr, indices, ceed, &mesh_basis);
-+      InitRestriction(*mesh_fes, use_bdr, nelem, indices, ceed, &mesh_restr);
-       InitVector(*mesh.GetNodes(), node_coords);
- 
--      CeedVectorCreate(ceed, nelem * nqpts * qdatasize, &qdata);
--
--      // Context data to be passed to the Q-function.
--      info.ctx.dim = mesh.Dimension();
--      info.ctx.space_dim = mesh.SpaceDimension();
--      info.ctx.vdim = trial_fes.GetVDim();
--
--      std::string qf_file = GetCeedPath() + op.header;
--      std::string qf = qf_file + op.build_func;
--      CeedQFunctionCreateInterior(ceed, 1, op.build_qf, qf.c_str(),
--                                  &build_qfunc);
--
--      // Create the Q-function that builds the operator (i.e. computes its
--      // quadrature data) and set its context data.
--      if (VariableCoefficient *var_coeff =
--             dynamic_cast<VariableCoefficient*>(coeff))
--      {
--         CeedQFunctionAddInput(build_qfunc, "coeff", coeff->ncomp,
--                               var_coeff->emode);
--      }
--      CeedQFunctionAddInput(build_qfunc, "dx", dim * dim, CEED_EVAL_GRAD);
--      CeedQFunctionAddInput(build_qfunc, "weights", 1, CEED_EVAL_WEIGHT);
--      CeedQFunctionAddOutput(build_qfunc, "qdata", qdatasize, CEED_EVAL_NONE);
--
--      CeedQFunctionContextCreate(ceed, &build_ctx);
--      CeedQFunctionContextSetData(build_ctx, CEED_MEM_HOST,
-+      CeedQFunctionContextCreate(ceed, &apply_ctx);
-+      CeedQFunctionContextSetData(apply_ctx, CEED_MEM_HOST,
-                                   CEED_COPY_VALUES,
-                                   sizeof(info.ctx),
-                                   &info.ctx);
--      CeedQFunctionSetContext(build_qfunc, build_ctx);
- 
--      // Create the operator that builds the quadrature data for the operator.
--      CeedOperatorCreate(ceed, build_qfunc, NULL, NULL, &build_oper);
--      if (GridCoefficient *gridCoeff = dynamic_cast<GridCoefficient*>(coeff))
--      {
--         InitBasisAndRestriction(*gridCoeff->gf.FESpace(), ir,
--                                 nelem, indices, ceed,
--                                 &gridCoeff->basis,
--                                 &gridCoeff->restr);
--         CeedOperatorSetField(build_oper, "coeff", gridCoeff->restr,
--                              gridCoeff->basis, gridCoeff->coeffVector);
--      }
--      else if (QuadCoefficient *quadCoeff =
--                  dynamic_cast<QuadCoefficient*>(coeff))
-+      if (!use_mf)
-       {
--         const int ncomp = quadCoeff->ncomp;
--         CeedInt strides[3] = {ncomp, 1, ncomp*nqpts};
--         InitStridedRestriction(*mesh.GetNodalFESpace(),
--                                nelem, nqpts, ncomp, strides,
--                                &quadCoeff->restr);
--         CeedOperatorSetField(build_oper, "coeff", quadCoeff->restr,
--                              CEED_BASIS_COLLOCATED, quadCoeff->coeffVector);
-+         const int qdatasize = info.qdatasize;
-+         InitStridedRestriction(*mesh_fes, nelem, nqpts, qdatasize,
-+                                CEED_STRIDES_BACKEND, ceed,
-+                                &qdata_restr);
-+         CeedVectorCreate(ceed, nelem * nqpts * qdatasize, &qdata);
-+
-+         // Create the QFunction that builds the operator (i.e. computes its
-+         // quadrature data) and set its context data.
-+         CeedQFunction build_qfunc;
-+         std::string qf = GetCeedPath() + info.header + info.build_func;
-+         CeedQFunctionCreateInterior(ceed, 1, info.build_qf, qf.c_str(),
-+                                     &build_qfunc);
-+         if (coeff)
-+         {
-+            CeedQFunctionAddInput(build_qfunc, "coeff", coeff->ncomp, coeff->emode);
-+         }
-+         CeedQFunctionAddInput(build_qfunc, "dx", dim * space_dim, CEED_EVAL_GRAD);
-+         CeedQFunctionAddInput(build_qfunc, "weights", 1, CEED_EVAL_WEIGHT);
-+         CeedQFunctionAddOutput(build_qfunc, "qdata", qdatasize, CEED_EVAL_NONE);
-+         CeedQFunctionSetContext(build_qfunc, apply_ctx);
-+
-+         // Create the operator that builds the quadrature data for the operator.
-+         CeedOperator build_oper;
-+         CeedOperatorCreate(ceed, build_qfunc, NULL, NULL, &build_oper);
-+         if (GridCoefficient *grid_coeff = dynamic_cast<GridCoefficient *>(coeff))
-+         {
-+            const mfem::FiniteElementSpace *coeff_fes = grid_coeff->gf.FESpace();
-+            InitBasis(*coeff_fes, ir, use_bdr, indices, ceed,
-+                      &grid_coeff->basis);
-+            InitRestriction(*coeff_fes, use_bdr, nelem, indices, ceed,
-+                            &grid_coeff->restr);
-+            CeedOperatorSetField(build_oper, "coeff", grid_coeff->restr,
-+                                 grid_coeff->basis, grid_coeff->coeff_vector);
-+         }
-+         else if (QuadCoefficient *quad_coeff = dynamic_cast<QuadCoefficient *>(coeff))
-+         {
-+            const int ncomp = quad_coeff->ncomp;
-+            CeedInt strides[3] = {ncomp, 1, ncomp * nqpts};
-+            InitStridedRestriction(*mesh_fes, nelem, nqpts, ncomp, strides, ceed,
-+                                   &quad_coeff->restr);
-+            CeedOperatorSetField(build_oper, "coeff", quad_coeff->restr,
-+                                 CEED_BASIS_COLLOCATED, quad_coeff->coeff_vector);
-+         }
-+         CeedOperatorSetField(build_oper, "dx", mesh_restr,
-+                              mesh_basis, CEED_VECTOR_ACTIVE);
-+         CeedOperatorSetField(build_oper, "weights", CEED_ELEMRESTRICTION_NONE,
-+                              mesh_basis, CEED_VECTOR_NONE);
-+         CeedOperatorSetField(build_oper, "qdata", qdata_restr,
-+                              CEED_BASIS_COLLOCATED, CEED_VECTOR_ACTIVE);
-+         CeedOperatorCheckReady(build_oper);
-+
-+         // Compute the quadrature data for the operator.
-+         CeedOperatorApply(build_oper, node_coords, qdata, CEED_REQUEST_IMMEDIATE);
-+
-+         CeedOperatorDestroy(&build_oper);
-+         CeedQFunctionDestroy(&build_qfunc);
-+
-+         CeedVectorDestroy(&node_coords);
-+         node_coords = nullptr;
-+         delete coeff;
-+         coeff = nullptr;
-       }
--      CeedOperatorSetField(build_oper, "dx", mesh_restr,
--                           mesh_basis, CEED_VECTOR_ACTIVE);
--      CeedOperatorSetField(build_oper, "weights", CEED_ELEMRESTRICTION_NONE,
--                           mesh_basis, CEED_VECTOR_NONE);
--      CeedOperatorSetField(build_oper, "qdata", restr_i,
--                           CEED_BASIS_COLLOCATED, CEED_VECTOR_ACTIVE);
--
--      // Compute the quadrature data for the operator.
--      CeedOperatorApply(build_oper, node_coords, qdata, CEED_REQUEST_IMMEDIATE);
--
--      // Create the Q-function that defines the action of the operator.
--      qf = qf_file + op.apply_func;
--      CeedQFunctionCreateInterior(ceed, 1, op.apply_qf, qf.c_str(),
-+
-+      // Create the QFunction that defines the action of the operator.
-+      std::string qf = GetCeedPath() + info.header + info.apply_func;
-+      CeedQFunctionCreateInterior(ceed, 1, info.apply_qf, qf.c_str(),
-                                   &apply_qfunc);
-       // input
--      switch (op.trial_op)
-+      switch (info.trial_op)
-       {
-          case EvalMode::None:
-             CeedQFunctionAddInput(apply_qfunc, "u", trial_vdim, CEED_EVAL_NONE);
-             break;
-          case EvalMode::Interp:
--            CeedQFunctionAddInput(apply_qfunc, "u", trial_vdim, CEED_EVAL_INTERP);
-+            CeedQFunctionAddInput(apply_qfunc, "u", trial_vdim * (trial_vectorfe ? dim : 1),
-+                                  CEED_EVAL_INTERP);
-             break;
-          case EvalMode::Grad:
--            CeedQFunctionAddInput(apply_qfunc, "gu", trial_vdim*dim, CEED_EVAL_GRAD);
-+            CeedQFunctionAddInput(apply_qfunc, "gu", trial_vdim * dim, CEED_EVAL_GRAD);
-             break;
-          case EvalMode::InterpAndGrad:
-+            MFEM_VERIFY(!trial_vectorfe,
-+                        "EvalMode::InterpAndGrad is not intended for vector FE.");
-             CeedQFunctionAddInput(apply_qfunc, "u", trial_vdim, CEED_EVAL_INTERP);
--            CeedQFunctionAddInput(apply_qfunc, "gu", trial_vdim*dim, CEED_EVAL_GRAD);
-+            CeedQFunctionAddInput(apply_qfunc, "gu", trial_vdim * dim, CEED_EVAL_GRAD);
-+            break;
-+         case EvalMode::Div:
-+            CeedQFunctionAddInput(apply_qfunc, "du", trial_vdim, CEED_EVAL_DIV);
-+            break;
-+         case EvalMode::Curl:
-+            CeedQFunctionAddInput(apply_qfunc, "cu", trial_vdim * curl_dim, CEED_EVAL_CURL);
-             break;
-       }
--      // qdata
--      CeedQFunctionAddInput(apply_qfunc, "qdata", qdatasize, CEED_EVAL_NONE);
-+      if (use_mf)
-+      {
-+         if (coeff)
-+         {
-+            // coefficient
-+            CeedQFunctionAddInput(apply_qfunc, "coeff", coeff->ncomp, coeff->emode);
-+         }
-+         CeedQFunctionAddInput(apply_qfunc, "dx", dim * space_dim, CEED_EVAL_GRAD);
-+         CeedQFunctionAddInput(apply_qfunc, "weights", 1, CEED_EVAL_WEIGHT);
-+      }
-+      else
-+      {
-+         // qdata
-+         CeedQFunctionAddInput(apply_qfunc, "qdata", info.qdatasize, CEED_EVAL_NONE);
-+      }
-       // output
--      switch (op.test_op)
-+      switch (info.test_op)
-       {
-          case EvalMode::None:
-             CeedQFunctionAddOutput(apply_qfunc, "v", test_vdim, CEED_EVAL_NONE);
-             break;
-          case EvalMode::Interp:
--            CeedQFunctionAddOutput(apply_qfunc, "v", test_vdim, CEED_EVAL_INTERP);
-+            CeedQFunctionAddOutput(apply_qfunc, "v", test_vdim * (test_vectorfe ? dim : 1),
-+                                   CEED_EVAL_INTERP);
-             break;
-          case EvalMode::Grad:
--            CeedQFunctionAddOutput(apply_qfunc, "gv", test_vdim*dim, CEED_EVAL_GRAD);
-+            CeedQFunctionAddOutput(apply_qfunc, "gv", test_vdim * dim, CEED_EVAL_GRAD);
-             break;
-          case EvalMode::InterpAndGrad:
-+            MFEM_VERIFY(!test_vectorfe,
-+                        "EvalMode::InterpAndGrad is not intended for vector FE.");
-             CeedQFunctionAddOutput(apply_qfunc, "v", test_vdim, CEED_EVAL_INTERP);
--            CeedQFunctionAddOutput(apply_qfunc, "gv", test_vdim*dim, CEED_EVAL_GRAD);
-+            CeedQFunctionAddOutput(apply_qfunc, "gv", test_vdim * dim, CEED_EVAL_GRAD);
-+            break;
-+         case EvalMode::Div:
-+            CeedQFunctionAddOutput(apply_qfunc, "dv", test_vdim, CEED_EVAL_DIV);
-+            break;
-+         case EvalMode::Curl:
-+            CeedQFunctionAddOutput(apply_qfunc, "cv", test_vdim * curl_dim, CEED_EVAL_CURL);
-             break;
-       }
--      CeedQFunctionSetContext(apply_qfunc, build_ctx);
-+      CeedQFunctionSetContext(apply_qfunc, apply_ctx);
- 
-       // Create the operator.
-       CeedOperatorCreate(ceed, apply_qfunc, NULL, NULL, &oper);
-       // input
--      switch (op.trial_op)
-+      switch (info.trial_op)
-       {
-          case EvalMode::None:
-             CeedOperatorSetField(oper, "u", trial_restr,
-@@ -367,12 +414,46 @@ public:
-             CeedOperatorSetField(oper, "u", trial_restr, trial_basis, CEED_VECTOR_ACTIVE);
-             CeedOperatorSetField(oper, "gu", trial_restr, trial_basis, CEED_VECTOR_ACTIVE);
-             break;
-+         case EvalMode::Div:
-+            CeedOperatorSetField(oper, "du", trial_restr, trial_basis, CEED_VECTOR_ACTIVE);
-+            break;
-+         case EvalMode::Curl:
-+            CeedOperatorSetField(oper, "cu", trial_restr, trial_basis, CEED_VECTOR_ACTIVE);
-+            break;
-+      }
-+      if (use_mf)
-+      {
-+         // coefficient
-+         if (GridCoefficient *grid_coeff = dynamic_cast<GridCoefficient *>(coeff))
-+         {
-+            const mfem::FiniteElementSpace *coeff_fes = grid_coeff->gf.FESpace();
-+            InitBasis(*coeff_fes, ir, use_bdr, indices, ceed,
-+                      &grid_coeff->basis);
-+            InitRestriction(*coeff_fes, use_bdr, nelem, indices, ceed,
-+                            &grid_coeff->restr);
-+            CeedOperatorSetField(oper, "coeff", grid_coeff->restr,
-+                                 grid_coeff->basis, grid_coeff->coeff_vector);
-+         }
-+         else if (QuadCoefficient *quad_coeff = dynamic_cast<QuadCoefficient *>(coeff))
-+         {
-+            const int ncomp = quad_coeff->ncomp;
-+            CeedInt strides[3] = {ncomp, 1, ncomp * nqpts};
-+            InitStridedRestriction(*mesh_fes, nelem, nqpts, ncomp, strides, ceed,
-+                                   &quad_coeff->restr);
-+            CeedOperatorSetField(oper, "coeff", quad_coeff->restr,
-+                                 CEED_BASIS_COLLOCATED, quad_coeff->coeff_vector);
-+         }
-+         CeedOperatorSetField(oper, "dx", mesh_restr, mesh_basis, node_coords);
-+         CeedOperatorSetField(oper, "weights", CEED_ELEMRESTRICTION_NONE,
-+                              mesh_basis, CEED_VECTOR_NONE);
-+      }
-+      else
-+      {
-+         // qdata
-+         CeedOperatorSetField(oper, "qdata", qdata_restr, CEED_BASIS_COLLOCATED, qdata);
-       }
--      // qdata
--      CeedOperatorSetField(oper, "qdata", restr_i, CEED_BASIS_COLLOCATED,
--                           qdata);
-       // output
--      switch (op.test_op)
-+      switch (info.test_op)
-       {
-          case EvalMode::None:
-             CeedOperatorSetField(oper, "v", test_restr,
-@@ -388,385 +469,154 @@ public:
-             CeedOperatorSetField(oper, "v", test_restr, test_basis, CEED_VECTOR_ACTIVE);
-             CeedOperatorSetField(oper, "gv", test_restr, test_basis, CEED_VECTOR_ACTIVE);
-             break;
-+         case EvalMode::Div:
-+            CeedOperatorSetField(oper, "dv", test_restr, test_basis, CEED_VECTOR_ACTIVE);
-+            break;
-+         case EvalMode::Curl:
-+            CeedOperatorSetField(oper, "cv", test_restr, test_basis, CEED_VECTOR_ACTIVE);
-+            break;
-       }
-+      CeedOperatorCheckReady(oper);
- 
--      CeedVectorCreate(ceed, trial_vdim*trial_fes.GetNDofs(), &u);
--      CeedVectorCreate(ceed, test_vdim*test_fes.GetNDofs(), &v);
-+      CeedVectorCreate(ceed, trial_vdim * trial_fes.GetNDofs(), &u);
-+      CeedVectorCreate(ceed, test_vdim * test_fes.GetNDofs(), &v);
-    }
- 
--   virtual ~PAIntegrator()
-+   virtual ~Integrator()
-    {
--      CeedQFunctionDestroy(&build_qfunc);
-+      // All basis and restriction objects are destroyed by fes destructor
-       CeedQFunctionDestroy(&apply_qfunc);
--      CeedQFunctionContextDestroy(&build_ctx);
-+      CeedQFunctionContextDestroy(&apply_ctx);
-       CeedVectorDestroy(&node_coords);
-       CeedVectorDestroy(&qdata);
-       delete coeff;
--      CeedOperatorDestroy(&build_oper);
-    }
--
--private:
--   /** This structure contains the data to assemble a partially assembled
--       operator with libCEED. */
--   struct PAOperator
--   {
--      /** The number of quadrature data at each quadrature point. */
--      int qdatasize;
--      /** The path to the header containing the functions for libCEED. */
--      std::string header;
--      /** The name of the Qfunction to build the quadrature data. */
--      std::string build_func;
--      /** The Qfunction to build the quadrature data. */
--      CeedQFunctionUser build_qf;
--      /** The name of the Qfunction to apply the operator. */
--      std::string apply_func;
--      /** The Qfunction to apply the operator. */
--      CeedQFunctionUser apply_qf;
--      /** The evaluation mode to apply to the trial function (CEED_EVAL_INTERP,
--          CEED_EVAL_GRAD, etc.) */
--      EvalMode trial_op;
--      /** The evaluation mode to apply to the test function ( CEED_EVAL_INTERP,
--          CEED_EVAL_GRAD, etc.)*/
--      EvalMode test_op;
--   };
- #endif
- };
- 
--/** This class represent a matrix-free operator using libCEED. */
--class MFIntegrator : public ceed::Operator
-+/** This class represents a matrix-free or partially assembled discrete linear
-+    operator using libCEED. */
-+class Interpolator : public Operator
- {
- #ifdef MFEM_USE_CEED
- protected:
--   CeedBasis trial_basis, test_basis, mesh_basis;
--   CeedElemRestriction trial_restr, test_restr, mesh_restr, restr_i;
--   CeedQFunction apply_qfunc;
--   CeedVector node_coords, qdata;
--   Coefficient *coeff;
--   CeedQFunctionContext build_ctx;
-+   CeedBasis basis_ctof;
-+   CeedElemRestriction trial_restr, test_restr;
-+   CeedQFunction apply_qfunc, apply_qfunc_t;
- 
- public:
--   MFIntegrator()
-+   Interpolator()
-       : Operator(),
--        trial_basis(nullptr), test_basis(nullptr), mesh_basis(nullptr),
--        trial_restr(nullptr), test_restr(nullptr), mesh_restr(nullptr),
--        restr_i(nullptr),
--        apply_qfunc(nullptr), node_coords(nullptr),
--        qdata(nullptr), coeff(nullptr), build_ctx(nullptr) { }
--
--   /** @brief This method assembles the `MFIntegrator` with the given
--       `CeedOperatorInfo` @a info, an `mfem::FiniteElementSpace` @a fes, an
--       `mfem::IntegrationRule` @a ir, and `mfem::Coefficient` or
--       `mfem::VectorCoefficient` @a Q.
--       The `CeedOperatorInfo` type is expected to inherit from `OperatorInfo`,
--       and contain a `Context` type relevant to the qFunctions.
--
--       @param[in] info is the structure describing the CeedOperator to assemble.
--       @param[in] fes is the finite element space.
--       @param[in] ir is the integration rule for the operator.
--       @param[in] Q is the coefficient from the `Integrator`. */
--   template <typename CeedOperatorInfo, typename CoeffType>
--   void Assemble(CeedOperatorInfo &info,
--                 const mfem::FiniteElementSpace &fes,
--                 const mfem::IntegrationRule &ir,
--                 CoeffType *Q)
--   {
--      Assemble(info, fes, ir, fes.GetNE(), nullptr, Q);
--   }
--
--   /** @brief This method assembles the `MFIntegrator` with the given
--       `CeedOperatorInfo` @a info, an `mfem::FiniteElementSpace` @a fes, an
--       `mfem::IntegrationRule` @a ir, and `mfem::Coefficient` or
--       `mfem::VectorCoefficient` @a Q for the elements given by the indices
--       @a indices.
--       The `CeedOperatorInfo` type is expected to inherit from `OperatorInfo`,
--       and contain a `Context` type relevant to the qFunctions.
-+        basis_ctof(nullptr),
-+        trial_restr(nullptr), test_restr(nullptr),
-+        apply_qfunc(nullptr), apply_qfunc_t(nullptr) {}
- 
--       @param[in] info is the structure describing the CeedOperator to assemble.
--       @param[in] fes is the finite element space.
--       @param[in] ir is the integration rule for the operator.
--       @param[in] nelem The number of elements.
--       @param[in] indices The indices of the elements of same type in the
--                          `FiniteElementSpace`. If `indices == nullptr`, assumes
--                          that the `FiniteElementSpace` is not mixed.
--       @param[in] Q is the coefficient from the `Integrator`. */
--   template <typename CeedOperatorInfo, typename CoeffType>
--   void Assemble(CeedOperatorInfo &info,
--                 const mfem::FiniteElementSpace &fes,
--                 const mfem::IntegrationRule &ir,
--                 int nelem,
--                 const int* indices,
--                 CoeffType *Q)
--   {
--      Assemble(info, fes, fes, ir, nelem, indices, Q);
--   }
--
--   /** This method assembles the MFIntegrator for mixed forms.
-+   /** This method assembles the `Interpolator`.
- 
--       @param[in] info the `CeedOperatorInfo` describing the `CeedOperator`,
-+       @param[in] info The `CeedOperatorInfo` describing the `CeedOperator`,
-                        the `CeedOperatorInfo` type is expected to inherit from
-                        `OperatorInfo` and contain a `Context` type relevant to
--                       the qFunctions.
--       @param[in] trial_fes the trial `FiniteElementSpace` for the form,
--       @param[in] test_fes the test `FiniteElementSpace` for the form,
--       @param[in] ir the `IntegrationRule` for the numerical integration,
--       @param[in] Q `Coefficient` or `VectorCoefficient`. */
-+                       the QFunctions.
-+       @param[in] trial_fes The trial `FiniteElementSpace` for the form.
-+       @param[in] test_fes The test `FiniteElementSpace` for the form.
-+       @param[in] ir Not supported by `Interpolator`.
-+       @param[in] Q Not supported by `Interpolator`.
-+       @param[in] use_bdr Not supported by `Interpolator`.
-+       @param[in] use_mf Controls whether to construct a matrix-free or partially
-+                         assembled operator. */
-    template <typename CeedOperatorInfo, typename CoeffType>
-    void Assemble(CeedOperatorInfo &info,
-                  const mfem::FiniteElementSpace &trial_fes,
-                  const mfem::FiniteElementSpace &test_fes,
-                  const mfem::IntegrationRule &ir,
--                 CoeffType *Q)
-+                 CoeffType *Q,
-+                 const bool use_bdr = false,
-+                 const bool use_mf = false)
-    {
--      Assemble(info, trial_fes, test_fes, ir, trial_fes.GetNE(), nullptr, Q);
-+      Assemble(info, trial_fes, test_fes, ir,
-+               use_bdr ? trial_fes.GetNBE() : trial_fes.GetNE(),
-+               nullptr, Q, use_bdr, use_mf);
-    }
- 
--   /** This method assembles the MFIntegrator for mixed forms.
-+   /** This method assembles the `Interpolator` on mixed meshes. Its signature
-+       matches that for `Integrator`.
- 
--       @param[in] info the `CeedOperatorInfo` describing the `CeedOperator`,
-+       @param[in] info The `CeedOperatorInfo` describing the `CeedOperator`,
-                        the `CeedOperatorInfo` type is expected to inherit from
-                        `OperatorInfo` and contain a `Context` type relevant to
--                       the qFunctions.
--       @param[in] trial_fes the trial `FiniteElementSpace` for the form,
--       @param[in] test_fes the test `FiniteElementSpace` for the form,
--       @param[in] ir the `IntegrationRule` for the numerical integration,
--       @param[in] nelem The number of elements,
-+                       the QFunctions.
-+       @param[in] trial_fes The trial `FiniteElementSpace` for the form.
-+       @param[in] test_fes The test `FiniteElementSpace` for the form.
-+       @param[in] ir Not supported by `Interpolator`.
-+       @param[in] nelem The number of elements.
-        @param[in] indices The indices of the elements of same type in the
-                           `FiniteElementSpace`. If `indices == nullptr`, assumes
--                          that the `FiniteElementSpace` is not mixed,
--       @param[in] Q `Coefficient` or `VectorCoefficient`. */
-+                          that the `FiniteElementSpace` is not mixed.
-+       @param[in] Q Not supported by `Interpolator`.
-+       @param[in] use_bdr Not supported by `Interpolator`.
-+       @param[in] use_mf Controls whether to construct a matrix-free or partially
-+                         assembled operator. */
-    template <typename CeedOperatorInfo, typename CoeffType>
-    void Assemble(CeedOperatorInfo &info,
-                  const mfem::FiniteElementSpace &trial_fes,
-                  const mfem::FiniteElementSpace &test_fes,
-                  const mfem::IntegrationRule &ir,
-                  int nelem,
--                 const int* indices,
--                 CoeffType *Q)
-+                 const int *indices,
-+                 CoeffType *Q,
-+                 const bool use_bdr = false,
-+                 const bool use_mf = false)
-    {
-       Ceed ceed(internal::ceed);
--      Mesh &mesh = *trial_fes.GetMesh();
--      MFEM_VERIFY(!(!indices && mesh.GetNumGeometries(mesh.Dimension()) > 1),
--                  "Use ceed::MixedIntegrator on mixed meshes.");
--      InitCoefficient(Q, mesh, ir, nelem, indices, coeff, info.ctx);
--      bool const_coeff = coeff->IsConstant();
--      std::string apply_func = const_coeff ? info.apply_func_mf_const
--                               : info.apply_func_mf_quad;
--      CeedQFunctionUser apply_qf = const_coeff ? info.apply_qf_mf_const
--                                   : info.apply_qf_mf_quad;
--      MFOperator op {info.header,
--                     apply_func, apply_qf,
--                     info.trial_op,
--                     info.test_op
--                    };
--
--      CeedInt dim = mesh.SpaceDimension();
-       CeedInt trial_vdim = trial_fes.GetVDim();
-       CeedInt test_vdim = test_fes.GetVDim();
--
--      mesh.EnsureNodes();
--      if ( &trial_fes == &test_fes )
--      {
--         InitBasisAndRestriction(trial_fes, ir, nelem, indices, ceed,
--                                 &trial_basis, &trial_restr);
--         test_basis = trial_basis;
--         test_restr = trial_restr;
--      }
--      else
--      {
--         InitBasisAndRestriction(trial_fes, ir, nelem, indices, ceed,
--                                 &trial_basis, &trial_restr);
--         InitBasisAndRestriction(test_fes, ir, nelem, indices, ceed,
--                                 &test_basis, &test_restr);
--      }
--
--      const mfem::FiniteElementSpace *mesh_fes = mesh.GetNodalFESpace();
--      MFEM_VERIFY(mesh_fes, "the Mesh has no nodal FE space");
--      InitBasisAndRestriction(*mesh_fes, ir, nelem, indices, ceed, &mesh_basis,
--                              &mesh_restr);
--
--      CeedInt trial_nqpts, test_nqpts;
--      CeedBasisGetNumQuadraturePoints(trial_basis, &trial_nqpts);
--      CeedBasisGetNumQuadraturePoints(trial_basis, &test_nqpts);
--      MFEM_VERIFY(trial_nqpts == test_nqpts,
--                  "Trial and test basis must have the same number of quadrature"
--                  " points.");
--      CeedInt nqpts = trial_nqpts;
--
--      InitVector(*mesh.GetNodes(), node_coords);
--
--      // Context data to be passed to the Q-function.
--      info.ctx.dim = mesh.Dimension();
--      info.ctx.space_dim = mesh.SpaceDimension();
--      info.ctx.vdim = trial_fes.GetVDim();
--
--      std::string qf_file = GetCeedPath() + op.header;
--      std::string qf = qf_file + op.apply_func;
--      CeedQFunctionCreateInterior(ceed, 1, op.apply_qf, qf.c_str(),
-+      MFEM_VERIFY(!Q, "ceed:Interpolator does not support coefficients.");
-+      MFEM_VERIFY(!use_bdr,
-+                  "ceed:Interpolator does not support boundary interpolators.");
-+      MFEM_VERIFY(trial_vdim == 1 && test_vdim == 1,
-+                  "ceed:Interpolator does not support spaces with vdim > 1.");
-+
-+      InitInterpolatorBasis(trial_fes, test_fes, indices, ceed, &basis_ctof);
-+      InitInterpolatorRestrictions(trial_fes, test_fes, nelem, indices, ceed,
-+                                   &trial_restr, &test_restr);
-+      MFEM_VERIFY(info.trial_op == EvalMode::Interp,
-+                  "ceed:Interpolator only supports trial_op == Interp.");
-+      MFEM_VERIFY(info.test_op == EvalMode::None,
-+                  "ceed:Interpolator only supports test_op == None.");
-+
-+      // Create the QFunction that defines the action of the operator
-+      // (only an identity as element dof multiplicity is handled outside of libCEED)
-+      CeedQFunctionCreateIdentity(ceed, trial_vdim, CEED_EVAL_INTERP, CEED_EVAL_NONE,
-                                   &apply_qfunc);
-+      CeedQFunctionCreateIdentity(ceed, trial_vdim, CEED_EVAL_NONE, CEED_EVAL_INTERP,
-+                                  &apply_qfunc_t);
- 
--      // Create the Q-function that builds the operator (i.e. computes its
--      // quadrature data) and set its context data.
--      if (VariableCoefficient *var_coeff =
--             dynamic_cast<VariableCoefficient*>(coeff))
--      {
--         CeedQFunctionAddInput(apply_qfunc, "coeff", coeff->ncomp,
--                               var_coeff->emode);
--      }
--      // input
--      switch (op.trial_op)
--      {
--         case EvalMode::None:
--            CeedQFunctionAddInput(apply_qfunc, "u", trial_vdim,
--                                  CEED_EVAL_NONE);
--            break;
--         case EvalMode::Interp:
--            CeedQFunctionAddInput(apply_qfunc, "u", trial_vdim,
--                                  CEED_EVAL_INTERP);
--            break;
--         case EvalMode::Grad:
--            CeedQFunctionAddInput(apply_qfunc, "gu", trial_vdim*dim,
--                                  CEED_EVAL_GRAD);
--            break;
--         case EvalMode::InterpAndGrad:
--            CeedQFunctionAddInput(apply_qfunc, "u", trial_vdim,
--                                  CEED_EVAL_INTERP);
--            CeedQFunctionAddInput(apply_qfunc, "gu", trial_vdim*dim,
--                                  CEED_EVAL_GRAD);
--            break;
--      }
--      CeedQFunctionAddInput(apply_qfunc, "dx", dim * dim, CEED_EVAL_GRAD);
--      CeedQFunctionAddInput(apply_qfunc, "weights", 1, CEED_EVAL_WEIGHT);
--      // output
--      switch (op.test_op)
--      {
--         case EvalMode::None:
--            CeedQFunctionAddOutput(apply_qfunc, "v", test_vdim,
--                                   CEED_EVAL_NONE);
--            break;
--         case EvalMode::Interp:
--            CeedQFunctionAddOutput(apply_qfunc, "v", test_vdim,
--                                   CEED_EVAL_INTERP);
--            break;
--         case EvalMode::Grad:
--            CeedQFunctionAddOutput(apply_qfunc, "gv", test_vdim*dim,
--                                   CEED_EVAL_GRAD);
--            break;
--         case EvalMode::InterpAndGrad:
--            CeedQFunctionAddOutput(apply_qfunc, "v", test_vdim,
--                                   CEED_EVAL_INTERP);
--            CeedQFunctionAddOutput(apply_qfunc, "gv", test_vdim*dim,
--                                   CEED_EVAL_GRAD);
--            break;
--      }
--
--      CeedQFunctionContextCreate(ceed, &build_ctx);
--      CeedQFunctionContextSetData(build_ctx, CEED_MEM_HOST,
--                                  CEED_COPY_VALUES,
--                                  sizeof(info.ctx),
--                                  &info.ctx);
--      CeedQFunctionSetContext(apply_qfunc, build_ctx);
--
--      // Create the operator.
-+      // Create the operator
-       CeedOperatorCreate(ceed, apply_qfunc, NULL, NULL, &oper);
--      // coefficient
--      if (GridCoefficient *gridCoeff = dynamic_cast<GridCoefficient*>(coeff))
--      {
--         InitBasisAndRestriction(*gridCoeff->gf.FESpace(), ir, nelem, indices,
--                                 ceed, &gridCoeff->basis, &gridCoeff->restr);
--         CeedOperatorSetField(oper, "coeff", gridCoeff->restr,
--                              gridCoeff->basis, gridCoeff->coeffVector);
--      }
--      else if (QuadCoefficient *quadCoeff =
--                  dynamic_cast<QuadCoefficient*>(coeff))
--      {
--         const int ncomp = quadCoeff->ncomp;
--         CeedInt strides[3] = {ncomp, 1, ncomp*nqpts};
--         InitStridedRestriction(*mesh.GetNodalFESpace(),
--                                nelem, nqpts, ncomp, strides,
--                                &quadCoeff->restr);
--         CeedOperatorSetField(oper, "coeff", quadCoeff->restr,
--                              CEED_BASIS_COLLOCATED, quadCoeff->coeffVector);
--      }
--      // input
--      switch (op.trial_op)
--      {
--         case EvalMode::None:
--            CeedOperatorSetField(oper, "u", trial_restr,
--                                 CEED_BASIS_COLLOCATED, CEED_VECTOR_ACTIVE);
--            break;
--         case EvalMode::Interp:
--            CeedOperatorSetField(oper, "u", trial_restr, trial_basis,
--                                 CEED_VECTOR_ACTIVE);
--            break;
--         case EvalMode::Grad:
--            CeedOperatorSetField(oper, "gu", trial_restr, trial_basis,
--                                 CEED_VECTOR_ACTIVE);
--            break;
--         case EvalMode::InterpAndGrad:
--            CeedOperatorSetField(oper, "u", trial_restr, trial_basis,
--                                 CEED_VECTOR_ACTIVE);
--            CeedOperatorSetField(oper, "gu", trial_restr, trial_basis,
--                                 CEED_VECTOR_ACTIVE);
--            break;
--      }
--      CeedOperatorSetField(oper, "dx", mesh_restr,
--                           mesh_basis, node_coords);
--      CeedOperatorSetField(oper, "weights", CEED_ELEMRESTRICTION_NONE,
--                           mesh_basis, CEED_VECTOR_NONE);
--      // output
--      switch (op.test_op)
--      {
--         case EvalMode::None:
--            CeedOperatorSetField(oper, "v", test_restr,
--                                 CEED_BASIS_COLLOCATED, CEED_VECTOR_ACTIVE);
--            break;
--         case EvalMode::Interp:
--            CeedOperatorSetField(oper, "v", test_restr, test_basis,
--                                 CEED_VECTOR_ACTIVE);
--            break;
--         case EvalMode::Grad:
--            CeedOperatorSetField(oper, "gv", test_restr, test_basis,
--                                 CEED_VECTOR_ACTIVE);
--            break;
--         case EvalMode::InterpAndGrad:
--            CeedOperatorSetField(oper, "v", test_restr, test_basis,
--                                 CEED_VECTOR_ACTIVE);
--            CeedOperatorSetField(oper, "gv", test_restr, test_basis,
--                                 CEED_VECTOR_ACTIVE);
--            break;
--      }
--
--      CeedVectorCreate(ceed, trial_vdim*trial_fes.GetNDofs(), &u);
--      CeedVectorCreate(ceed, test_vdim*test_fes.GetNDofs(), &v);
-+      CeedOperatorSetField(oper, "input", trial_restr, basis_ctof,
-+                           CEED_VECTOR_ACTIVE);
-+      CeedOperatorSetField(oper, "output", test_restr, CEED_BASIS_COLLOCATED,
-+                           CEED_VECTOR_ACTIVE);
-+      CeedOperatorCheckReady(oper);
-+
-+      // Create the transpose operator
-+      CeedOperatorCreate(ceed, apply_qfunc_t, NULL, NULL, &oper_t);
-+      CeedOperatorSetField(oper_t, "input", test_restr, CEED_BASIS_COLLOCATED,
-+                           CEED_VECTOR_ACTIVE);
-+      CeedOperatorSetField(oper_t, "output", trial_restr, basis_ctof,
-+                           CEED_VECTOR_ACTIVE);
-+      CeedOperatorCheckReady(oper_t);
-+
-+      CeedVectorCreate(ceed, trial_vdim * trial_fes.GetNDofs(), &u);
-+      CeedVectorCreate(ceed, test_vdim * test_fes.GetNDofs(), &v);
-    }
- 
--   virtual ~MFIntegrator()
-+   virtual ~Interpolator()
-    {
-+      // All basis and restriction objects are destroyed by fes destructor
-       CeedQFunctionDestroy(&apply_qfunc);
--      CeedQFunctionContextDestroy(&build_ctx);
--      CeedVectorDestroy(&node_coords);
--      CeedVectorDestroy(&qdata);
--      delete coeff;
-+      CeedQFunctionDestroy(&apply_qfunc_t);
-    }
--
--private:
--   /** This structure contains the data to assemble a matrix-free operator with
--       libCEED. */
--   struct MFOperator
--   {
--      /** The path to the header containing the functions for libCEED. */
--      std::string header;
--      /** The name of the Qfunction to apply the operator. */
--      std::string apply_func;
--      /** The Qfunction to apply the operator. */
--      CeedQFunctionUser apply_qf;
--      /** The evaluation mode to apply to the trial function (CEED_EVAL_INTERP,
--          CEED_EVAL_GRAD, etc.) */
--      EvalMode trial_op;
--      /** The evaluation mode to apply to the test function ( CEED_EVAL_INTERP,
--          CEED_EVAL_GRAD, etc.) */
--      EvalMode test_op;
--   };
- #endif
- };
- 
-@@ -774,4 +624,4 @@ private:
- 
- } // namespace mfem
- 
--#endif // MFEM_LIBCEED_INTEG
-+#endif // MFEM_LIBCEED_INTEGRATOR
-diff --git a/fem/ceed/interface/interface.hpp b/fem/ceed/interface/interface.hpp
-index 0a69121ad..8b877188c 100644
---- a/fem/ceed/interface/interface.hpp
-+++ b/fem/ceed/interface/interface.hpp
-@@ -14,6 +14,8 @@
- 
- // Object wrapping a CeedOperator in a mfem::Operator.
- #include "operator.hpp"
-+// Operator supporting mixed finite element spaces.
-+#include "mixed_operator.hpp"
- // Functions to initialize CeedBasis objects.
- #include "basis.hpp"
- // Functions to initialize CeedRestriction objects.
-@@ -22,8 +24,6 @@
- #include "coefficient.hpp"
- // PA or MF Operator using libCEED.
- #include "integrator.hpp"
--// PA Operator supporting mixed finite element spaces.
--#include "mixed_integrator.hpp"
- // Utility functions
- #include "util.hpp"
- // Wrapper to include <ceed.h>
-diff --git a/fem/ceed/interface/mixed_integrator.hpp b/fem/ceed/interface/mixed_integrator.hpp
-deleted file mode 100644
-index 8d344f4d9..000000000
---- a/fem/ceed/interface/mixed_integrator.hpp
-+++ /dev/null
-@@ -1,126 +0,0 @@
--// Copyright (c) 2010-2023, Lawrence Livermore National Security, LLC. Produced
--// at the Lawrence Livermore National Laboratory. All Rights reserved. See files
--// LICENSE and NOTICE for details. LLNL-CODE-806117.
--//
--// This file is part of the MFEM library. For more information and source code
--// availability visit https://mfem.org.
--//
--// MFEM is free software; you can redistribute it and/or modify it under the
--// terms of the BSD-3 license. We welcome feedback and contributions, see file
--// CONTRIBUTING.md for details.
--
--#ifndef MFEM_LIBCEED_MIXED_INTEGRATOR
--#define MFEM_LIBCEED_MIXED_INTEGRATOR
--
--#include "ceed.hpp"
--#include "integrator.hpp"
--#include <unordered_map>
--
--namespace mfem
--{
--
--namespace ceed
--{
--
--/** @brief This class wraps a `ceed::PAIntegrator` or `ceed::MFIntegrator` to
--    support mixed finite element spaces. */
--template <typename CeedInteg>
--class MixedIntegrator : public ceed::Operator
--{
--#ifdef MFEM_USE_CEED
--   using ElementKey = std::pair<int, int>; //< Element::Type, Order >
--   struct key_hash
--   {
--      std::size_t operator()(const ElementKey& k) const
--      {
--         return k.first + 2 * k.second;
--      }
--   };
--   using ElementsMap = std::unordered_map<const ElementKey, int*, key_hash>;
--   std::vector<CeedInteg*> sub_ops;
--
--public:
--   template <typename Integrator, typename CeedOperatorInfo, typename CoeffType>
--   void Assemble(const Integrator &integ,
--                 CeedOperatorInfo &info,
--                 const mfem::FiniteElementSpace &fes,
--                 CoeffType *Q)
--   {
--      ElementsMap count;
--      ElementsMap element_indices;
--      ElementsMap offsets;
--
--      // Count the number of elements of each type
--      for (int i = 0; i < fes.GetNE(); i++)
--      {
--         ElementKey key(fes.GetElementType(i), fes.GetElementOrder(i));
--         auto value = count.find(key);
--         if (value == count.end())
--         {
--            count[key] = new int(1);
--         }
--         else
--         {
--            (*value->second)++;
--         }
--      }
--
--      // Initialization of the arrays
--      for ( const auto& value : count )
--      {
--         element_indices[value.first] = new int[*value.second];
--         offsets[value.first] = new int(0);
--      }
--
--      // Populates the indices arrays for each element type
--      for (int i = 0; i < fes.GetNE(); i++)
--      {
--         ElementKey key(fes.GetElementType(i), fes.GetElementOrder(i));
--         int &offset = *(offsets[key]);
--         int* indices_array = element_indices[key];
--         indices_array[offset] = i;
--         offset++;
--      }
--
--      // Create composite CeedOperator
--      CeedCompositeOperatorCreate(internal::ceed, &oper);
--
--      // Create each sub-CeedOperator
--      sub_ops.reserve(element_indices.size());
--      for (const auto& value : element_indices)
--      {
--         const int* indices = value.second;
--         const int first_index = indices[0];
--         const mfem::FiniteElement &el = *fes.GetFE(first_index);
--         auto &T = *fes.GetMesh()->GetElementTransformation(first_index);
--         MFEM_ASSERT(!integ.GetIntegrationRule(),
--                     "Mixed mesh integrators should not have an"
--                     " IntegrationRule.");
--         const IntegrationRule &ir = GetRule(integ, el, el, T);
--         auto sub_op = new CeedInteg();
--         int nelem = *count[value.first];
--         sub_op->Assemble(info, fes, ir, nelem, indices, Q);
--         sub_ops.push_back(sub_op);
--         CeedCompositeOperatorAddSub(oper, sub_op->GetCeedOperator());
--      }
--
--      const int ndofs = fes.GetVDim() * fes.GetNDofs();
--      CeedVectorCreate(internal::ceed, ndofs, &u);
--      CeedVectorCreate(internal::ceed, ndofs, &v);
--   }
--
--   virtual ~MixedIntegrator()
--   {
--      for (auto sub_op : sub_ops)
--      {
--         delete sub_op;
--      }
--   }
--#endif
--};
--
--} // namespace ceed
--
--} // namespace mfem
--
--#endif // MFEM_LIBCEED_MIXED_INTEGRATOR
-diff --git a/fem/ceed/interface/mixed_operator.hpp b/fem/ceed/interface/mixed_operator.hpp
-new file mode 100644
-index 000000000..963e367be
---- /dev/null
-+++ b/fem/ceed/interface/mixed_operator.hpp
-@@ -0,0 +1,204 @@
-+// Copyright (c) 2010-2023, Lawrence Livermore National Security, LLC. Produced
-+// at the Lawrence Livermore National Laboratory. All Rights reserved. See files
-+// LICENSE and NOTICE for details. LLNL-CODE-806117.
-+//
-+// This file is part of the MFEM library. For more information and source code
-+// availability visit https://mfem.org.
-+//
-+// MFEM is free software; you can redistribute it and/or modify it under the
-+// terms of the BSD-3 license. We welcome feedback and contributions, see file
-+// CONTRIBUTING.md for details.
-+
-+#ifndef MFEM_LIBCEED_MIXED_OPERATOR
-+#define MFEM_LIBCEED_MIXED_OPERATOR
-+
-+#include <array>
-+#include <unordered_map>
-+#include "../../fespace.hpp"
-+#include "operator.hpp"
-+#include "ceed.hpp"
-+#ifdef MFEM_USE_CEED
-+#include <ceed/hash.h>
-+#endif
-+
-+namespace mfem
-+{
-+
-+namespace ceed
-+{
-+
-+/** @brief This class wraps one or more `OpType` objects to support
-+    finite element spaces on mixed meshes. */
-+template <typename OpType>
-+class MixedOperator : public Operator
-+{
-+#ifdef MFEM_USE_CEED
-+   using ElementKey =
-+      std::array<int, 3>; // <mfem::Element::Type, TrialOrder, TestOrder>
-+   struct key_hash
-+   {
-+      std::size_t operator()(const ElementKey &k) const
-+      {
-+         return CeedHashCombine(
-+                   CeedHashCombine(CeedHashInt(k[0]),
-+                                   CeedHashInt(k[1])),
-+                   CeedHashInt(k[2]));
-+      }
-+   };
-+   using ElementsMap = std::unordered_map<const ElementKey, int *, key_hash>;
-+   std::vector<OpType *> sub_ops;
-+
-+public:
-+   template <typename IntegratorType, typename CeedOperatorInfo, typename CoeffType>
-+   void Assemble(const IntegratorType &integ,
-+                 CeedOperatorInfo &info,
-+                 const mfem::FiniteElementSpace &fes,
-+                 CoeffType *Q,
-+                 const bool use_bdr = false,
-+                 const bool use_mf = false)
-+   {
-+      Assemble(integ, info, fes, fes, Q, use_bdr, use_mf);
-+   }
-+
-+   template <typename IntegratorType, typename CeedOperatorInfo, typename CoeffType>
-+   void Assemble(const IntegratorType &integ,
-+                 CeedOperatorInfo &info,
-+                 const mfem::FiniteElementSpace &trial_fes,
-+                 const mfem::FiniteElementSpace &test_fes,
-+                 CoeffType *Q,
-+                 const bool use_bdr = false,
-+                 const bool use_mf = false)
-+   {
-+      MFEM_VERIFY(trial_fes.GetMesh() == test_fes.GetMesh(),
-+                  "Trial and test basis must correspond to the same Mesh.");
-+      mfem::Mesh &mesh = *trial_fes.GetMesh();
-+      const bool mixed =
-+         mesh.GetNumGeometries(mesh.Dimension() - use_bdr) > 1 ||
-+         trial_fes.IsVariableOrder() || test_fes.IsVariableOrder();
-+      if (!mixed)
-+      {
-+         const mfem::FiniteElement &trial_fe = use_bdr ? *trial_fes.GetBE(0) :
-+                                               *trial_fes.GetFE(0);
-+         const mfem::FiniteElement &test_fe = use_bdr ? *test_fes.GetBE(0) :
-+                                              *test_fes.GetFE(0);
-+         auto &T = use_bdr ? *mesh.GetBdrElementTransformation(0) :
-+                   *mesh.GetElementTransformation(0);
-+         const mfem::IntegrationRule &ir =
-+            integ.GetIntegrationRule() ? *integ.GetIntegrationRule() :
-+            integ.GetRule(trial_fe, test_fe, T);
-+         sub_ops.reserve(1);
-+         auto *sub_op = new OpType();
-+         sub_op->Assemble(info, trial_fes, test_fes, ir, Q, use_bdr, use_mf);
-+         sub_ops.push_back(sub_op);
-+
-+         CeedOperatorReferenceCopy(sub_op->GetCeedOperator(), &oper);
-+         if (sub_op->GetCeedOperatorTranspose())
-+         {
-+            CeedOperatorReferenceCopy(sub_op->GetCeedOperatorTranspose(), &oper_t);
-+         }
-+         CeedVectorReferenceCopy(sub_op->GetCeedVectorU(), &u);
-+         CeedVectorReferenceCopy(sub_op->GetCeedVectorV(), &v);
-+         return;
-+      }
-+
-+      // Count the number of elements of each type
-+      ElementsMap count;
-+      ElementsMap element_indices;
-+      ElementsMap offsets;
-+
-+      const int ne = use_bdr ? mesh.GetNBE() : mesh.GetNE();
-+      for (int i = 0; i < ne; i++)
-+      {
-+         const mfem::FiniteElement &trial_fe = use_bdr ? *trial_fes.GetBE(i) :
-+                                               *trial_fes.GetFE(i);
-+         const mfem::FiniteElement &test_fe = use_bdr ? *test_fes.GetBE(i) :
-+                                              *test_fes.GetFE(i);
-+         mfem::Element::Type type = use_bdr ? mesh.GetBdrElementType(i) :
-+                                    mesh.GetElementType(i);
-+         ElementKey key = {type, trial_fe.GetOrder(), test_fe.GetOrder()};
-+         auto value = count.find(key);
-+         if (value == count.end())
-+         {
-+            count[key] = new int(1);
-+         }
-+         else
-+         {
-+            (*value->second)++;
-+         }
-+      }
-+
-+      // Initialization of the arrays
-+      for (const auto &value : count)
-+      {
-+         element_indices[value.first] = new int[*value.second];
-+         offsets[value.first] = new int(0);
-+      }
-+
-+      // Populates the indices arrays for each element type
-+      for (int i = 0; i < ne; i++)
-+      {
-+         const mfem::FiniteElement &trial_fe = use_bdr ? *trial_fes.GetBE(i) :
-+                                               *trial_fes.GetFE(i);
-+         const mfem::FiniteElement &test_fe = use_bdr ? *test_fes.GetBE(i) :
-+                                              *test_fes.GetFE(i);
-+         mfem::Element::Type type = use_bdr ? mesh.GetBdrElementType(i) :
-+                                    mesh.GetElementType(i);
-+         ElementKey key = {type, trial_fe.GetOrder(), test_fe.GetOrder()};
-+         int &offset = *(offsets[key]);
-+         int *indices_array = element_indices[key];
-+         indices_array[offset] = i;
-+         offset++;
-+      }
-+
-+      // Create composite CeedOperator
-+      CeedCompositeOperatorCreate(internal::ceed, &oper);
-+
-+      // Create each sub-CeedOperator
-+      sub_ops.reserve(element_indices.size());
-+      for (const auto &value : element_indices)
-+      {
-+         const int *indices = value.second;
-+         const int first_index = indices[0];
-+         const mfem::FiniteElement &trial_fe =
-+            use_bdr ? *trial_fes.GetBE(first_index) : *trial_fes.GetFE(first_index);
-+         const mfem::FiniteElement &test_fe =
-+            use_bdr ? *test_fes.GetBE(first_index) : *test_fes.GetFE(first_index);
-+         auto &T = use_bdr ? *mesh.GetBdrElementTransformation(first_index) :
-+                   *mesh.GetElementTransformation(first_index);
-+         MFEM_VERIFY(!integ.GetIntegrationRule(),
-+                     "Mixed mesh integrators should not have an IntegrationRule.");
-+         const IntegrationRule &ir = integ.GetRule(trial_fe, test_fe, T);
-+         auto *sub_op = new OpType();
-+         sub_op->Assemble(info, trial_fes, test_fes, ir, *count[value.first], indices, Q,
-+                          use_bdr, use_mf);
-+         sub_ops.push_back(sub_op);
-+         CeedCompositeOperatorAddSub(oper, sub_op->GetCeedOperator());
-+         if (sub_op->GetCeedOperatorTranspose())
-+         {
-+            if (!oper_t) { CeedCompositeOperatorCreate(internal::ceed, &oper_t); }
-+            CeedCompositeOperatorAddSub(oper_t, sub_op->GetCeedOperatorTranspose());
-+         }
-+      }
-+      CeedOperatorCheckReady(oper);
-+      if (oper_t) { CeedOperatorCheckReady(oper_t); }
-+
-+      CeedVectorCreate(internal::ceed, trial_fes.GetVDim() * trial_fes.GetNDofs(),
-+                       &u);
-+      CeedVectorCreate(internal::ceed, test_fes.GetVDim() * test_fes.GetNDofs(), &v);
-+   }
-+
-+   virtual ~MixedOperator()
-+   {
-+      for (auto *sub_op : sub_ops)
-+      {
-+         delete sub_op;
-+      }
-+   }
-+#endif
-+};
-+
-+} // namespace ceed
-+
-+} // namespace mfem
-+
-+#endif // MFEM_LIBCEED_MIXED_OPERATOR
-diff --git a/fem/ceed/interface/operator.cpp b/fem/ceed/interface/operator.cpp
-index 745e474e5..7f21e7ce4 100644
---- a/fem/ceed/interface/operator.cpp
-+++ b/fem/ceed/interface/operator.cpp
-@@ -11,11 +11,9 @@
- 
- #include "operator.hpp"
- 
--#include "../../../config/config.hpp"
- #include "../../../linalg/vector.hpp"
- #include "../../fespace.hpp"
- #include "util.hpp"
--#include "ceed.hpp"
- 
- namespace mfem
- {
-@@ -27,6 +25,7 @@ namespace ceed
- Operator::Operator(CeedOperator op)
- {
-    oper = op;
-+   oper_t = nullptr;
-    CeedSize in_len, out_len;
-    int ierr = CeedOperatorGetActiveVectorLengths(oper, &in_len, &out_len);
-    PCeedChk(ierr);
-@@ -39,9 +38,15 @@ Operator::Operator(CeedOperator op)
- }
- #endif
- 
--void Operator::Mult(const mfem::Vector &x, mfem::Vector &y) const
-+namespace
- {
-+
- #ifdef MFEM_USE_CEED
-+void CeedAddMult(CeedOperator oper, CeedVector u, CeedVector v,
-+                 const mfem::Vector &x, mfem::Vector &y, double a)
-+{
-+   MFEM_VERIFY(a == 0.0 || a == 1.0,
-+               "General coefficient case is not yet supported!");
-    const CeedScalar *x_ptr;
-    CeedScalar *y_ptr;
-    CeedMemType mem;
-@@ -49,21 +54,37 @@ void Operator::Mult(const mfem::Vector &x, mfem::Vector &y) const
-    if (Device::Allows(Backend::DEVICE_MASK) && mem == CEED_MEM_DEVICE)
-    {
-       x_ptr = x.Read();
--      y_ptr = y.Write();
-+      y_ptr = (a == 0.0) ? y.Write() : y.ReadWrite();
-    }
-    else
-    {
-       x_ptr = x.HostRead();
--      y_ptr = y.HostWrite();
-+      y_ptr = (a == 0.0) ? y.HostWrite() : y.HostReadWrite();
-       mem = CEED_MEM_HOST;
-    }
-    CeedVectorSetArray(u, mem, CEED_USE_POINTER, const_cast<CeedScalar*>(x_ptr));
-    CeedVectorSetArray(v, mem, CEED_USE_POINTER, y_ptr);
- 
--   CeedOperatorApply(oper, u, v, CEED_REQUEST_IMMEDIATE);
-+   if (a == 0.0)
-+   {
-+      CeedOperatorApply(oper, u, v, CEED_REQUEST_IMMEDIATE);
-+   }
-+   else
-+   {
-+      CeedOperatorApplyAdd(oper, u, v, CEED_REQUEST_IMMEDIATE);
-+   }
- 
-    CeedVectorTakeArray(u, mem, const_cast<CeedScalar**>(&x_ptr));
-    CeedVectorTakeArray(v, mem, &y_ptr);
-+}
-+#endif
-+
-+} // namespace
-+
-+void Operator::Mult(const mfem::Vector &x, mfem::Vector &y) const
-+{
-+#ifdef MFEM_USE_CEED
-+   CeedAddMult(oper, u, v, x, y, 0.0);
- #else
-    MFEM_ABORT("MFEM must be built with MFEM_USE_CEED=YES to use libCEED.");
- #endif
-@@ -73,29 +94,30 @@ void Operator::AddMult(const mfem::Vector &x, mfem::Vector &y,
-                        const double a) const
- {
- #ifdef MFEM_USE_CEED
--   MFEM_VERIFY(a == 1.0, "General coefficient case is not yet supported!");
--   const CeedScalar *x_ptr;
--   CeedScalar *y_ptr;
--   CeedMemType mem;
--   CeedGetPreferredMemType(mfem::internal::ceed, &mem);
--   if (Device::Allows(Backend::DEVICE_MASK) && mem == CEED_MEM_DEVICE)
--   {
--      x_ptr = x.Read();
--      y_ptr = y.ReadWrite();
--   }
--   else
--   {
--      x_ptr = x.HostRead();
--      y_ptr = y.HostReadWrite();
--      mem = CEED_MEM_HOST;
--   }
--   CeedVectorSetArray(u, mem, CEED_USE_POINTER, const_cast<CeedScalar*>(x_ptr));
--   CeedVectorSetArray(v, mem, CEED_USE_POINTER, y_ptr);
-+   CeedAddMult(oper, u, v, x, y, 1.0);
-+#else
-+   MFEM_ABORT("MFEM must be built with MFEM_USE_CEED=YES to use libCEED.");
-+#endif
-+}
- 
--   CeedOperatorApplyAdd(oper, u, v, CEED_REQUEST_IMMEDIATE);
-+void Operator::MultTranspose(const mfem::Vector &x, mfem::Vector &y) const
-+{
-+#ifdef MFEM_USE_CEED
-+   MFEM_ASSERT(oper_t,
-+               "No transpose operator defined for ceed::Operator::MultTranspose.");
-+   CeedAddMult(oper_t, v, u, x, y, 0.0);
-+#else
-+   MFEM_ABORT("MFEM must be built with MFEM_USE_CEED=YES to use libCEED.");
-+#endif
-+}
- 
--   CeedVectorTakeArray(u, mem, const_cast<CeedScalar**>(&x_ptr));
--   CeedVectorTakeArray(v, mem, &y_ptr);
-+void Operator::AddMultTranspose(const mfem::Vector &x, mfem::Vector &y,
-+                                const double a) const
-+{
-+#ifdef MFEM_USE_CEED
-+   MFEM_ASSERT(oper_t,
-+               "No transpose operator defined for ceed::Operator::AddMultTranspose.");
-+   CeedAddMult(oper_t, v, u, x, y, 1.0);
- #else
-    MFEM_ABORT("MFEM must be built with MFEM_USE_CEED=YES to use libCEED.");
- #endif
-diff --git a/fem/ceed/interface/operator.hpp b/fem/ceed/interface/operator.hpp
-index 9e4a4faaf..2f131e64d 100644
---- a/fem/ceed/interface/operator.hpp
-+++ b/fem/ceed/interface/operator.hpp
-@@ -26,35 +26,40 @@ class Operator : public mfem::Operator
- {
- protected:
- #ifdef MFEM_USE_CEED
--   CeedOperator oper;
-+   CeedOperator oper, oper_t;
-    CeedVector u, v;
- 
--   Operator() : oper(nullptr), u(nullptr), v(nullptr) { }
-+   Operator() : oper(nullptr), oper_t(nullptr), u(nullptr), v(nullptr) {}
- #endif
- 
- public:
- #ifdef MFEM_USE_CEED
-    /// This class takes ownership of op and will delete it
-    Operator(CeedOperator op);
-+
-+   CeedOperator &GetCeedOperator() { return oper; }
-+   CeedOperator &GetCeedOperatorTranspose() { return oper_t; }
-+   CeedVector &GetCeedVectorU() { return u; }
-+   CeedVector &GetCeedVectorV() { return v; }
- #endif
- 
-    void Mult(const mfem::Vector &x, mfem::Vector &y) const override;
-    void AddMult(const mfem::Vector &x, mfem::Vector &y,
-                 const double a = 1.0) const override;
-+   void MultTranspose(const mfem::Vector &x, mfem::Vector &y) const override;
-+   void AddMultTranspose(const mfem::Vector &x, mfem::Vector &y,
-+                         const double a = 1.0) const override;
-    void GetDiagonal(mfem::Vector &diag) const;
- 
-    virtual ~Operator()
-    {
- #ifdef MFEM_USE_CEED
-       CeedOperatorDestroy(&oper);
-+      CeedOperatorDestroy(&oper_t);
-       CeedVectorDestroy(&u);
-       CeedVectorDestroy(&v);
- #endif
-    }
--
--#ifdef MFEM_USE_CEED
--   CeedOperator& GetCeedOperator() { return oper; }
--#endif
- };
- 
- } // namespace ceed
-diff --git a/fem/ceed/interface/restriction.cpp b/fem/ceed/interface/restriction.cpp
-index e7e8539bd..32a00223f 100644
---- a/fem/ceed/interface/restriction.cpp
-+++ b/fem/ceed/interface/restriction.cpp
-@@ -9,8 +9,9 @@
- // terms of the BSD-3 license. We welcome feedback and contributions, see file
- // CONTRIBUTING.md for details.
- 
--#include "../../../fem/gridfunc.hpp"
--#include "ceed.hpp"
-+#include "restriction.hpp"
-+
-+#include "util.hpp"
- 
- namespace mfem
- {
-@@ -20,222 +21,368 @@ namespace ceed
- 
- #ifdef MFEM_USE_CEED
- 
--static void InitNativeRestr(const mfem::FiniteElementSpace &fes,
--                            Ceed ceed, CeedElemRestriction *restr)
-+enum RestrType {Strided = 0, Lexico, Native, NativeRange};
-+
-+static void InitLexicoRestr(const mfem::FiniteElementSpace &fes,
-+                            bool use_bdr,
-+                            int nelem,
-+                            Ceed ceed,
-+                            CeedElemRestriction *restr)
- {
--   const mfem::FiniteElement *fe = fes.GetFE(0);
-+   const mfem::FiniteElement *fe = use_bdr ? fes.GetBE(0) :
-+                                   fes.GetFE(0);
-    const int P = fe->GetDof();
--   CeedInt compstride = fes.GetOrdering()==Ordering::byVDIM ? 1 : fes.GetNDofs();
--   const mfem::Table &el_dof = fes.GetElementToDofTable();
--   mfem::Array<int> tp_el_dof(el_dof.Size_of_connections());
--   const mfem::TensorBasisElement * tfe =
-+   const mfem::TensorBasisElement *tfe =
-       dynamic_cast<const mfem::TensorBasisElement *>(fe);
--   const int stride = compstride == 1 ? fes.GetVDim() : 1;
--   const mfem::Array<int>& dof_map = tfe->GetDofMap();
-+   const mfem::Array<int> &dof_map = tfe->GetDofMap();
-+   CeedInt compstride =
-+      (fes.GetOrdering() == Ordering::byVDIM) ? 1 : fes.GetNDofs();
-+   const int stride = (compstride == 1) ? fes.GetVDim() : 1;
-+   const mfem::Table &el_dof = use_bdr ? fes.GetBdrElementToDofTable() :
-+                               fes.GetElementToDofTable();
-+   const int *el_map = el_dof.GetJ();
-+   mfem::Array<int> tp_el_dof(el_dof.Size_of_connections());
-+   mfem::Array<bool> tp_el_orients(el_dof.Size_of_connections());
-+   bool use_orients = false;
- 
--   for (int i = 0; i < fes.GetNE(); i++)
-+   for (int i = 0; i < nelem; i++)
-    {
--      const int el_offset = P * i;
-+      // No need to handle DofTransformation for tensor-product elements
-       for (int j = 0; j < P; j++)
-       {
--         tp_el_dof[j+el_offset] = stride*el_dof.GetJ()[dof_map[j]+el_offset];
-+         const int sdid = dof_map[j];  // signed
-+         const int did = (sdid >= 0) ? sdid : -1 - sdid;
-+         const int sgid = el_map[did + P * i];  // signed
-+         const int gid = (sgid >= 0) ? sgid : -1 - sgid;
-+         tp_el_dof[j + P * i] = stride * gid;
-+         tp_el_orients[j + P * i] =
-+            (sgid >= 0 && sdid < 0) || (sgid < 0 && sdid >= 0);
-+         use_orients = use_orients || tp_el_orients[j + P * i];
-       }
-    }
- 
--   CeedElemRestrictionCreate(ceed, fes.GetNE(), P, fes.GetVDim(),
--                             compstride, (fes.GetVDim())*(fes.GetNDofs()),
--                             CEED_MEM_HOST, CEED_COPY_VALUES,
--                             tp_el_dof.GetData(), restr);
-+   if (use_orients)
-+   {
-+      CeedElemRestrictionCreateOriented(ceed, nelem, P, fes.GetVDim(),
-+                                        compstride, fes.GetVDim() * fes.GetNDofs(),
-+                                        CEED_MEM_HOST, CEED_COPY_VALUES,
-+                                        tp_el_dof.GetData(), tp_el_orients.GetData(),
-+                                        restr);
-+   }
-+   else
-+   {
-+      CeedElemRestrictionCreate(ceed, nelem, P, fes.GetVDim(),
-+                                compstride, fes.GetVDim() * fes.GetNDofs(),
-+                                CEED_MEM_HOST, CEED_COPY_VALUES,
-+                                tp_el_dof.GetData(), restr);
-+   }
- }
- 
--static void InitLexicoRestr(const mfem::FiniteElementSpace &fes,
--                            Ceed ceed, CeedElemRestriction *restr)
-+static void InitNativeRestr(const mfem::FiniteElementSpace &fes,
-+                            bool use_bdr,
-+                            int nelem,
-+                            Ceed ceed,
-+                            CeedElemRestriction *restr)
- {
--   const mfem::FiniteElement *fe = fes.GetFE(0);
-+   const mfem::FiniteElement *fe = use_bdr ? fes.GetBE(0) :
-+                                   fes.GetFE(0);
-    const int P = fe->GetDof();
--   CeedInt compstride = fes.GetOrdering()==Ordering::byVDIM ? 1 : fes.GetNDofs();
--   const mfem::Table &el_dof = fes.GetElementToDofTable();
-+   CeedInt compstride =
-+      (fes.GetOrdering() == Ordering::byVDIM) ? 1 : fes.GetNDofs();
-+   const int stride = (compstride == 1) ? fes.GetVDim() : 1;
-+   const mfem::Table &el_dof = use_bdr ? fes.GetBdrElementToDofTable() :
-+                               fes.GetElementToDofTable();
-+   const int *el_map = el_dof.GetJ();
-    mfem::Array<int> tp_el_dof(el_dof.Size_of_connections());
--   const int stride = compstride == 1 ? fes.GetVDim() : 1;
-+   mfem::Array<bool> tp_el_orients(el_dof.Size_of_connections());
-+   bool use_orients = false;
- 
--   for (int e = 0; e < fes.GetNE(); e++)
-+   for (int i = 0; i < nelem; i++)
-    {
--      for (int i = 0; i < P; i++)
-+      // DofTransformation support uses InitNativeRestrWithIndices
-+      for (int j = 0; j < P; j++)
-       {
--         tp_el_dof[i + e*P] = stride*el_dof.GetJ()[i + e*P];
-+         const int sgid = el_map[j + P * i];  // signed
-+         const int gid = (sgid >= 0) ? sgid : -1 - sgid;
-+         tp_el_dof[j + P * i] = stride * gid;
-+         tp_el_orients[j + P * i] = (sgid < 0);
-+         use_orients = use_orients || tp_el_orients[j + P * i];
-       }
-    }
- 
--   CeedElemRestrictionCreate(ceed, fes.GetNE(), P, fes.GetVDim(),
--                             compstride, (fes.GetVDim())*(fes.GetNDofs()),
--                             CEED_MEM_HOST, CEED_COPY_VALUES,
--                             tp_el_dof.GetData(), restr);
--}
--
--static void InitRestrictionImpl(const mfem::FiniteElementSpace &fes,
--                                Ceed ceed, CeedElemRestriction *restr)
--{
--   const mfem::FiniteElement *fe = fes.GetFE(0);
--   const mfem::TensorBasisElement * tfe =
--      dynamic_cast<const mfem::TensorBasisElement *>(fe);
--   if ( tfe && tfe->GetDofMap().Size()>0 ) // Native ordering using dof_map
-+   if (use_orients)
-    {
--      InitNativeRestr(fes, ceed, restr);
-+      CeedElemRestrictionCreateOriented(ceed, nelem, P, fes.GetVDim(),
-+                                        compstride, fes.GetVDim() * fes.GetNDofs(),
-+                                        CEED_MEM_HOST, CEED_COPY_VALUES,
-+                                        tp_el_dof.GetData(), tp_el_orients.GetData(),
-+                                        restr);
-    }
--   else  // Lexicographic ordering
-+   else
-    {
--      InitLexicoRestr(fes, ceed, restr);
-+      CeedElemRestrictionCreate(ceed, nelem, P, fes.GetVDim(),
-+                                compstride, fes.GetVDim() * fes.GetNDofs(),
-+                                CEED_MEM_HOST, CEED_COPY_VALUES,
-+                                tp_el_dof.GetData(), restr);
-    }
- }
- 
--static void InitNativeRestrWithIndices(
--   const mfem::FiniteElementSpace &fes,
--   int nelem,
--   const int* indices,
--   Ceed ceed, CeedElemRestriction *restr)
-+static void InitLexicoRestrWithIndices(const mfem::FiniteElementSpace &fes,
-+                                       bool use_bdr,
-+                                       int nelem,
-+                                       const int *indices,
-+                                       Ceed ceed,
-+                                       CeedElemRestriction *restr)
- {
--   const mfem::FiniteElement *fe = fes.GetFE(indices[0]);
-+   const mfem::FiniteElement *fe = use_bdr ? fes.GetBE(indices[0]) :
-+                                   fes.GetFE(indices[0]);
-    const int P = fe->GetDof();
--   CeedInt compstride = fes.GetOrdering()==Ordering::byVDIM ? 1 : fes.GetNDofs();
--   mfem::Array<int> tp_el_dof(nelem*P);
--   const mfem::TensorBasisElement * tfe =
-+   const mfem::TensorBasisElement *tfe =
-       dynamic_cast<const mfem::TensorBasisElement *>(fe);
--   Array<int> dofs;
--   const int stride = compstride == 1 ? fes.GetVDim() : 1;
--   const mfem::Array<int>& dof_map = tfe->GetDofMap();
-+   const mfem::Array<int> &dof_map = tfe->GetDofMap();
-+   CeedInt compstride =
-+      (fes.GetOrdering() == Ordering::byVDIM) ? 1 : fes.GetNDofs();
-+   const int stride = (compstride == 1) ? fes.GetVDim() : 1;
-+   mfem::Array<int> tp_el_dof(nelem * P), dofs;
-+   mfem::Array<bool> tp_el_orients(nelem * P);
-+   bool use_orients = false;
- 
-    for (int i = 0; i < nelem; i++)
-    {
-+      // No need to handle DofTransformation for tensor-product elements
-       const int elem_index = indices[i];
--      fes.GetElementDofs(elem_index, dofs);
--      const int el_offset = P * i;
--      for (int j = 0; j < P; j++)
-+      mfem::DofTransformation *dof_trans;
-+      if (use_bdr)
-       {
--         tp_el_dof[j + el_offset] = stride*dofs[dof_map[j]];
-+         dof_trans = fes.GetBdrElementDofs(elem_index, dofs);
-       }
--   }
--
--   CeedElemRestrictionCreate(ceed, nelem, P, fes.GetVDim(),
--                             compstride, (fes.GetVDim())*(fes.GetNDofs()),
--                             CEED_MEM_HOST, CEED_COPY_VALUES,
--                             tp_el_dof.GetData(), restr);
--}
--
--static void InitLexicoRestrWithIndices(
--   const mfem::FiniteElementSpace &fes,
--   int nelem,
--   const int* indices,
--   Ceed ceed, CeedElemRestriction *restr)
--{
--   const mfem::FiniteElement *fe = fes.GetFE(indices[0]);
--   const int P = fe->GetDof();
--   CeedInt compstride = fes.GetOrdering()==Ordering::byVDIM ? 1 : fes.GetNDofs();
--   mfem::Array<int> tp_el_dof(nelem*P);
--   Array<int> dofs;
--   const int stride = compstride == 1 ? fes.GetVDim() : 1;
--
--   for (int i = 0; i < nelem; i++)
--   {
--      const int elem_index = indices[i];
--      fes.GetElementDofs(elem_index, dofs);
--      const int el_offset = P * i;
-+      else
-+      {
-+         dof_trans = fes.GetElementDofs(elem_index, dofs);
-+      }
-+      MFEM_VERIFY(!dof_trans,
-+                  "Unexpected DofTransformation for lexicographic element "
-+                  "restriction.");
-       for (int j = 0; j < P; j++)
-       {
--         tp_el_dof[j + el_offset] = stride*dofs[j];
-+         const int sdid = dof_map[j];  // signed
-+         const int did = (sdid >= 0) ? sdid : -1 - sdid;
-+         const int sgid = dofs[did];  // signed
-+         const int gid = (sgid >= 0) ? sgid : -1 - sgid;
-+         tp_el_dof[j + P * i] = stride * gid;
-+         tp_el_orients[j + P * i] =
-+            (sgid >= 0 && sdid < 0) || (sgid < 0 && sdid >= 0);
-+         use_orients = use_orients || tp_el_orients[j + P * i];
-       }
-    }
- 
--   CeedElemRestrictionCreate(ceed, nelem, P, fes.GetVDim(),
--                             compstride, (fes.GetVDim())*(fes.GetNDofs()),
--                             CEED_MEM_HOST, CEED_COPY_VALUES,
--                             tp_el_dof.GetData(), restr);
-+   if (use_orients)
-+   {
-+      CeedElemRestrictionCreateOriented(ceed, nelem, P, fes.GetVDim(),
-+                                        compstride, fes.GetVDim() * fes.GetNDofs(),
-+                                        CEED_MEM_HOST, CEED_COPY_VALUES,
-+                                        tp_el_dof.GetData(), tp_el_orients.GetData(),
-+                                        restr);
-+   }
-+   else
-+   {
-+      CeedElemRestrictionCreate(ceed, nelem, P, fes.GetVDim(),
-+                                compstride, fes.GetVDim() * fes.GetNDofs(),
-+                                CEED_MEM_HOST, CEED_COPY_VALUES,
-+                                tp_el_dof.GetData(), restr);
-+   }
- }
- 
--static void InitRestrictionWithIndicesImpl(
--   const mfem::FiniteElementSpace &fes,
--   int nelem,
--   const int* indices,
--   Ceed ceed, CeedElemRestriction *restr)
-+static void InitNativeRestrWithIndices(const mfem::FiniteElementSpace &fes,
-+                                       bool use_bdr,
-+                                       bool is_interp_range,
-+                                       int nelem,
-+                                       const int *indices,
-+                                       Ceed ceed,
-+                                       CeedElemRestriction *restr)
- {
--   const mfem::FiniteElement *fe = fes.GetFE(indices[0]);
--   const mfem::TensorBasisElement * tfe =
--      dynamic_cast<const mfem::TensorBasisElement *>(fe);
--   if ( tfe && tfe->GetDofMap().Size()>0 ) // Native ordering using dof_map
-+   const int i0 = indices ? indices[0] : 0;
-+   const mfem::FiniteElement *fe = use_bdr ? fes.GetBE(i0) : fes.GetFE(i0);
-+   const int P = fe->GetDof();
-+   CeedInt compstride =
-+      (fes.GetOrdering() == Ordering::byVDIM) ? 1 : fes.GetNDofs();
-+   const int stride = (compstride == 1) ? fes.GetVDim() : 1;
-+   mfem::Array<int> tp_el_dof(nelem * P), dofs;
-+   mfem::Array<bool> tp_el_orients;
-+   mfem::Array<int> tp_el_curl_orients;
-+   mfem::Vector el_trans_j;
-+   mfem::DofTransformation *dof_trans = use_bdr ? fes.GetBdrElementDofs(i0, dofs) :
-+                                        fes.GetElementDofs(i0, dofs);
-+   if (!dof_trans || dof_trans->IsEmpty())
-    {
--      InitNativeRestrWithIndices(fes, nelem, indices, ceed, restr);
-+      tp_el_orients.SetSize(nelem * P);
-    }
--   else  // Lexicographic ordering
-+   else
-    {
--      InitLexicoRestrWithIndices(fes, nelem, indices, ceed, restr);
-+      tp_el_curl_orients.SetSize(nelem * P * 3, 0.0);
-+      el_trans_j.SetSize(P);
-    }
--}
- 
--static void InitCoeffRestrictionWithIndicesImpl(
--   const mfem::FiniteElementSpace &fes,
--   int nelem,
--   const int* indices,
--   int nquads,
--   int ncomp,
--   Ceed ceed,
--   CeedElemRestriction *restr)
--{
--   mfem::Array<int> tp_el_dof(nelem*nquads);
--   const int stride_quad = ncomp;
--   const int stride_elem = ncomp*nquads;
--   // TODO generalize to support different #quads
-    for (int i = 0; i < nelem; i++)
-    {
--      const int elem_index = indices[i];
--      const int el_offset = elem_index * stride_elem;
--      for (int j = 0; j < nquads; j++)
-+      const int elem_index = indices ? indices[i] : i;
-+      if (use_bdr)
-+      {
-+         dof_trans = fes.GetBdrElementDofs(elem_index, dofs);
-+      }
-+      else
-       {
--         tp_el_dof[j + nquads * i] = j * stride_quad + el_offset;
-+         dof_trans = fes.GetElementDofs(elem_index, dofs);
-+      }
-+      if (!dof_trans || dof_trans->IsEmpty())
-+      {
-+         for (int j = 0; j < P; j++)
-+         {
-+            const int sgid = dofs[j];  // signed
-+            const int gid = (sgid >= 0) ? sgid : -1 - sgid;
-+            tp_el_dof[j + P * i] = stride * gid;
-+            tp_el_orients[j + P * i] = (sgid < 0);
-+         }
-+      }
-+      else
-+      {
-+         for (int j = 0; j < P; j++)
-+         {
-+            const int sgid = dofs[j];  // signed
-+            const int gid = (sgid >= 0) ? sgid : -1 - sgid;
-+            tp_el_dof[j + P * i] = stride * gid;
-+
-+            // Fill column j of element tridiagonal matrix tp_el_curl_orients
-+            el_trans_j = 0.0;
-+            el_trans_j(j) = 1.0;
-+            if (is_interp_range)
-+            {
-+               dof_trans->InvTransformDual(el_trans_j);
-+            }
-+            else
-+            {
-+               dof_trans->InvTransformPrimal(el_trans_j);
-+            }
-+            el_trans_j *= (sgid < 0) ? -1.0 : 1.0;
-+            tp_el_curl_orients[3 * (j + 0 + P * i) + 1] = el_trans_j(j + 0);
-+            if (j > 0)
-+            {
-+               tp_el_curl_orients[3 * (j - 1 + P * i) + 2] = el_trans_j(j - 1);
-+            }
-+            if (j < P - 1)
-+            {
-+               tp_el_curl_orients[3 * (j + 1 + P * i) + 0] = el_trans_j(j + 1);
-+            }
-+#ifdef MFEM_DEBUG
-+            int nnz = 0;
-+            for (int k = 0; k < P; k++)
-+            {
-+               if (k < j - 1 && k > j + 1 && el_trans_j(k) != 0.0) { nnz++; }
-+            }
-+            MFEM_ASSERT(nnz == 0,
-+                        "Element transformation matrix is not tridiagonal at column "
-+                        << j << " (nnz = " << nnz << ")!");
-+#endif
-+         }
-       }
-    }
--   CeedElemRestrictionCreate(ceed, nelem, nquads, ncomp, 1,
--                             ncomp*fes.GetNE()*nquads,
--                             CEED_MEM_HOST, CEED_COPY_VALUES,
--                             tp_el_dof.GetData(), restr);
--}
- 
--void InitStridedRestriction(const mfem::FiniteElementSpace &fes,
--                            CeedInt nelem, CeedInt nqpts, CeedInt qdatasize,
--                            const CeedInt *strides,
--                            CeedElemRestriction *restr)
--{
--   RestrKey restr_key(&fes, nelem, nqpts, qdatasize, restr_type::Strided);
--   auto restr_itr = mfem::internal::ceed_restr_map.find(restr_key);
--   if (restr_itr == mfem::internal::ceed_restr_map.end())
-+   if (tp_el_curl_orients.Size())
-    {
--      CeedElemRestrictionCreateStrided(mfem::internal::ceed, nelem, nqpts, qdatasize,
--                                       nelem*nqpts*qdatasize,
--                                       strides,
--                                       restr);
--      // Will be automatically destroyed when @a fes gets destroyed.
--      mfem::internal::ceed_restr_map[restr_key] = *restr;
-+      CeedElemRestrictionCreateCurlOriented(ceed, nelem, P, fes.GetVDim(),
-+                                            compstride, fes.GetVDim() * fes.GetNDofs(),
-+                                            CEED_MEM_HOST, CEED_COPY_VALUES,
-+                                            tp_el_dof.GetData(), tp_el_curl_orients.GetData(),
-+                                            restr);
-+   }
-+   else if (tp_el_orients.Size())
-+   {
-+      CeedElemRestrictionCreateOriented(ceed, nelem, P, fes.GetVDim(),
-+                                        compstride, fes.GetVDim() * fes.GetNDofs(),
-+                                        CEED_MEM_HOST, CEED_COPY_VALUES,
-+                                        tp_el_dof.GetData(), tp_el_orients.GetData(),
-+                                        restr);
-    }
-    else
-    {
--      *restr = restr_itr->second;
-+      CeedElemRestrictionCreate(ceed, nelem, P, fes.GetVDim(),
-+                                compstride, fes.GetVDim() * fes.GetNDofs(),
-+                                CEED_MEM_HOST, CEED_COPY_VALUES,
-+                                tp_el_dof.GetData(), restr);
-    }
- }
- 
- void InitRestriction(const FiniteElementSpace &fes,
-+                     bool use_bdr,
-+                     int nelem,
-+                     const int *indices,
-                      Ceed ceed,
-                      CeedElemRestriction *restr)
- {
--   // Check for FES -> basis, restriction in hash tables
--   const mfem::FiniteElement *fe = fes.GetFE(0);
-+   // Check for fes -> restriction in hash table
-+   // {-1, -1, -1} is unique from CEED_STRIDES_BACKEND for strided restrictions
-+   const mfem::FiniteElement *fe;
-+   if (indices)
-+   {
-+      fe = use_bdr ? fes.GetBE(indices[0]) : fes.GetFE(indices[0]);
-+   }
-+   else
-+   {
-+      fe = use_bdr ? fes.GetBE(0) : fes.GetFE(0);
-+   }
-    const int P = fe->GetDof();
--   const int nelem = fes.GetNE();
-    const int ncomp = fes.GetVDim();
--   RestrKey restr_key(&fes, nelem, P, ncomp, restr_type::Standard);
-+   const mfem::TensorBasisElement *tfe =
-+      dynamic_cast<const mfem::TensorBasisElement *>(fe);
-+   const bool vector = fe->GetRangeType() == mfem::FiniteElement::VECTOR;
-+   const RestrType type = (tfe && tfe->GetDofMap().Size() > 0 && !vector) ?
-+                          RestrType::Lexico : RestrType::Native;
-+   RestrKey restr_key(&fes, {nelem, P, ncomp}, {-1, -1, -1}, type);
-    auto restr_itr = mfem::internal::ceed_restr_map.find(restr_key);
- 
-    // Init or retrieve key values
-    if (restr_itr == mfem::internal::ceed_restr_map.end())
-    {
--      InitRestrictionImpl(fes, ceed, restr);
-+      if (indices)
-+      {
-+         if (type == RestrType::Lexico)
-+         {
-+            // Lexicographic ordering using dof_map
-+            InitLexicoRestrWithIndices(fes, use_bdr, nelem, indices,
-+                                       ceed, restr);
-+         }
-+         else
-+         {
-+            // Native ordering
-+            InitNativeRestrWithIndices(fes, use_bdr, false, nelem, indices,
-+                                       ceed, restr);
-+         }
-+      }
-+      else
-+      {
-+         mfem::Array<int> dofs;
-+         mfem::DofTransformation *dof_trans = use_bdr ? fes.GetBdrElementDofs(0, dofs) :
-+                                              fes.GetElementDofs(0, dofs);
-+         if (type == RestrType::Lexico)
-+         {
-+            // Lexicographic ordering using dof_map
-+            MFEM_VERIFY(!dof_trans,
-+                        "Unexpected DofTransformation for lexicographic element "
-+                        "restriction.");
-+            InitLexicoRestr(fes, use_bdr, nelem, ceed, restr);
-+         }
-+         else if (!dof_trans || dof_trans->IsEmpty())
-+         {
-+            // Native ordering without dof_trans
-+            InitNativeRestr(fes, use_bdr, nelem, ceed, restr);
-+         }
-+         else
-+         {
-+            // Native ordering with dof_trans
-+            InitNativeRestrWithIndices(fes, use_bdr, false, nelem, nullptr,
-+                                       ceed, restr);
-+         }
-+      }
-       mfem::internal::ceed_restr_map[restr_key] = *restr;
-    }
-    else
-@@ -244,48 +391,116 @@ void InitRestriction(const FiniteElementSpace &fes,
-    }
- }
- 
--void InitRestrictionWithIndices(const FiniteElementSpace &fes,
--                                int nelem,
--                                const int* indices,
--                                Ceed ceed,
--                                CeedElemRestriction *restr)
-+void InitInterpolatorRestrictions(const FiniteElementSpace &trial_fes,
-+                                  const FiniteElementSpace &test_fes,
-+                                  int nelem,
-+                                  const int *indices,
-+                                  Ceed ceed,
-+                                  CeedElemRestriction *trial_restr,
-+                                  CeedElemRestriction *test_restr)
- {
--   // Check for FES -> basis, restriction in hash tables
--   const mfem::FiniteElement *fe = fes.GetFE(indices[0]);
--   const int P = fe->GetDof();
--   const int ncomp = fes.GetVDim();
--   RestrKey restr_key(&fes, nelem, P, ncomp, restr_type::Standard);
--   auto restr_itr = mfem::internal::ceed_restr_map.find(restr_key);
--
--   // Init or retrieve key values
--   if (restr_itr == mfem::internal::ceed_restr_map.end())
-+   // Check for fes -> restriction in hash table
-+   // {-1, -1, -1} is unique from CEED_STRIDES_BACKEND for strided restrictions
-+   const mfem::FiniteElement *trial_fe, *test_fe;
-+   if (indices)
-    {
--      InitRestrictionWithIndicesImpl(fes, nelem, indices, ceed, restr);
--      mfem::internal::ceed_restr_map[restr_key] = *restr;
-+      trial_fe = trial_fes.GetFE(indices[0]);
-+      test_fe = test_fes.GetFE(indices[0]);
-    }
-    else
-    {
--      *restr = restr_itr->second;
-+      trial_fe = trial_fes.GetFE(0);
-+      test_fe = test_fes.GetFE(0);
-+   }
-+   for (int s = 0; s < 2; s++)
-+   {
-+      // The restriction for the test space is slightly different as the output
-+      // is a primal vector instead of a dual vector, and lexicographic ordering
-+      // is never used (no use of tensor-product basis)
-+      CeedElemRestriction *restr = (s == 0) ? trial_restr : test_restr;
-+      const FiniteElementSpace &fes = (s == 0) ? trial_fes : test_fes;
-+      const mfem::FiniteElement *fe = (s == 0) ? trial_fe : test_fe;
-+      const int P = fe->GetDof();
-+      const int ncomp = fes.GetVDim();
-+      mfem::Array<int> dofs;
-+      mfem::DofTransformation *dof_trans =
-+         indices ? fes.GetElementDofs(indices[0], dofs) : fes.GetElementDofs(0, dofs);
-+      const RestrType type = (dof_trans && s > 0) ? RestrType::NativeRange :
-+                             RestrType::Native;
-+      RestrKey restr_key(&fes, {nelem, P, ncomp}, {-1, -1, -1}, type);
-+      auto restr_itr = mfem::internal::ceed_restr_map.find(restr_key);
-+
-+      // Init or retrieve key values
-+      if (restr_itr == mfem::internal::ceed_restr_map.end())
-+      {
-+         if (indices)
-+         {
-+            if (type == RestrType::Lexico)
-+            {
-+               // Lexicographic ordering using dof_map
-+               MFEM_VERIFY(!dof_trans,
-+                           "Unexpected DofTransformation for lexicographic element "
-+                           "restriction.");
-+               InitLexicoRestrWithIndices(fes, false, nelem, indices,
-+                                          ceed, restr);
-+            }
-+            else
-+            {
-+               // Native ordering
-+               InitNativeRestrWithIndices(fes, false, (s > 0), nelem, indices,
-+                                          ceed, restr);
-+            }
-+         }
-+         else
-+         {
-+            if (type == RestrType::Lexico)
-+            {
-+               // Lexicographic ordering using dof_map
-+               MFEM_VERIFY(!dof_trans,
-+                           "Unexpected DofTransformation for lexicographic element "
-+                           "restriction.");
-+               InitLexicoRestr(fes, false, nelem, ceed, restr);
-+            }
-+            else if (!dof_trans || dof_trans->IsEmpty())
-+            {
-+               // Native ordering without dof_trans
-+               InitNativeRestr(fes, false, nelem, ceed, restr);
-+            }
-+            else
-+            {
-+               // Native ordering with dof_trans
-+               InitNativeRestrWithIndices(fes, false, (s > 0), nelem, nullptr,
-+                                          ceed, restr);
-+            }
-+         }
-+         mfem::internal::ceed_restr_map[restr_key] = *restr;
-+      }
-+      else
-+      {
-+         *restr = restr_itr->second;
-+      }
-    }
- }
- 
--void InitCoeffRestrictionWithIndices(const FiniteElementSpace &fes,
--                                     int nelem,
--                                     const int* indices,
--                                     int nquads,
--                                     int ncomp,
--                                     Ceed ceed,
--                                     CeedElemRestriction *restr)
-+void InitStridedRestriction(const mfem::FiniteElementSpace &fes,
-+                            CeedInt nelem,
-+                            CeedInt nqpts,
-+                            CeedInt qdatasize,
-+                            const CeedInt strides[3],
-+                            Ceed ceed,
-+                            CeedElemRestriction *restr)
- {
--   // Check for FES -> basis, restriction in hash tables
--   RestrKey restr_key(&fes, nelem, nquads, ncomp, restr_type::Coeff);
-+   // Check for fes -> restriction in hash table
-+   RestrKey restr_key(&fes, {nelem, nqpts, qdatasize},
-+   {strides[0], strides[1], strides[2]}, RestrType::Strided);
-    auto restr_itr = mfem::internal::ceed_restr_map.find(restr_key);
- 
-    // Init or retrieve key values
-    if (restr_itr == mfem::internal::ceed_restr_map.end())
-    {
--      InitCoeffRestrictionWithIndicesImpl(fes, nelem, indices, nquads, ncomp,
--                                          ceed, restr);
-+      CeedElemRestrictionCreateStrided(ceed, nelem, nqpts, qdatasize,
-+                                       nelem * nqpts * qdatasize, strides,
-+                                       restr);
-       mfem::internal::ceed_restr_map[restr_key] = *restr;
-    }
-    else
-diff --git a/fem/ceed/interface/restriction.hpp b/fem/ceed/interface/restriction.hpp
-index 221716b39..612754dc7 100644
---- a/fem/ceed/interface/restriction.hpp
-+++ b/fem/ceed/interface/restriction.hpp
-@@ -12,6 +12,7 @@
- #ifndef MFEM_LIBCEED_RESTR
- #define MFEM_LIBCEED_RESTR
- 
-+#include "../../fespace.hpp"
- #include "ceed.hpp"
- 
- namespace mfem
-@@ -21,65 +22,89 @@ namespace ceed
- {
- 
- #ifdef MFEM_USE_CEED
--/** @brief Initialize a CeedElemRestriction for non-mixed meshes.
- 
--    @param[in] fes Input finite element space.
--    @param[in] ceed Input Ceed object.
--    @param[out] restr The address of the initialized CeedElemRestriction object.
--*/
-+/** @brief Initialize a CeedElemRestriction based on an
-+    mfem::FiniteElementSpace @a fes and an optional list of @a nelem elements
-+    of indices @a indices.
-+
-+    @param[in] fes The finite element space.
-+    @param[in] use_bdr Create the basis and restriction for boundary elements.
-+    @param[in] nelem The number of elements.
-+    @param[in] indices The indices of the elements of same type in the
-+                       `FiniteElementSpace`. If `indices == nullptr`, assumes
-+                       that the `FiniteElementSpace` is not mixed.
-+    @param[in] ceed The Ceed object.
-+    @param[out] restr The `CeedElemRestriction` to initialize. */
- void InitRestriction(const FiniteElementSpace &fes,
-+                     bool use_bdr,
-+                     int nelem,
-+                     const int *indices,
-                      Ceed ceed,
-                      CeedElemRestriction *restr);
- 
--/** @brief Initialize a CeedElemRestriction for mixed meshes.
-+inline void InitRestriction(const FiniteElementSpace &fes,
-+                            bool use_bdr,
-+                            Ceed ceed,
-+                            CeedElemRestriction *restr)
-+{
-+   InitRestriction(fes, use_bdr, use_bdr ? fes.GetNBE() : fes.GetNE(),
-+                   nullptr, ceed, restr);
-+}
- 
--    @param[in] fes The finite element space.
--    @param[in] ceed The Ceed object.
-+/** @brief Initialize a pair of CeedElemRestriction objects based on a
-+    mfem::FiniteElementSpace @a trial_fes and @a test_fes, and an optional list
-+    of @a nelem elements of indices @a indices.
-+
-+    @param[in] trial_fes The trial finite element space.
-+    @param[in] test_fes The test finite element space.
-     @param[in] nelem The number of elements.
-     @param[in] indices The indices of the elements of same type in the
--                       `FiniteElementSpace`.
--    @param[out] restr The `CeedElemRestriction` to initialize. */
--void InitRestrictionWithIndices(const FiniteElementSpace &fes,
--                                int nelem,
--                                const int* indices,
--                                Ceed ceed,
--                                CeedElemRestriction *restr);
-+                       `FiniteElementSpace`. If `indices == nullptr`, assumes
-+                       that the `FiniteElementSpace` is not mixed.
-+    @param[in] ceed The Ceed object.
-+    @param[out] trial_restr The `CeedElemRestriction` to initialize for the
-+                            trial space.
-+    @param[out] test_restr The `CeedElemRestriction` to initialize for the
-+                           test space. */
-+void InitInterpolatorRestrictions(const FiniteElementSpace &trial_fes,
-+                                  const FiniteElementSpace &test_fes,
-+                                  int nelem,
-+                                  const int *indices,
-+                                  Ceed ceed,
-+                                  CeedElemRestriction *trial_restr,
-+                                  CeedElemRestriction *test_restr);
-+
-+inline void InitInterpolatorRestrictions(const FiniteElementSpace &trial_fes,
-+                                         const FiniteElementSpace &test_fes,
-+                                         Ceed ceed,
-+                                         CeedElemRestriction *trial_restr,
-+                                         CeedElemRestriction *test_restr)
-+{
-+   InitInterpolatorRestrictions(trial_fes, test_fes, trial_fes.GetNE(),
-+                                nullptr, ceed, trial_restr, test_restr);
-+}
- 
--/** @brief Initialize a strided CeedElemRestriction
-+/** @brief Initialize a strided CeedElemRestriction.
- 
-     @param[in] fes Input finite element space.
-     @param[in] nelem is the number of elements.
-     @param[in] nqpts is the total number of quadrature points.
-     @param[in] qdatasize is the number of data per quadrature point.
-     @param[in] strides Array for strides between [nodes, components, elements].
--    Data for node i, component j, element k can be found in the L-vector at
--    index i*strides[0] + j*strides[1] + k*strides[2]. CEED_STRIDES_BACKEND may
--    be used with vectors created by a Ceed backend.
-+                       Data for node i, component j, element k can be found in
-+                       the L-vector at index i*strides[0] + j*strides[1] +
-+                       k*strides[2]. CEED_STRIDES_BACKEND may be used with
-+                       vectors created by a Ceed backend.
-+    @param[in] ceed The Ceed object.
-     @param[out] restr The `CeedElemRestriction` to initialize. */
- void InitStridedRestriction(const mfem::FiniteElementSpace &fes,
--                            CeedInt nelem, CeedInt nqpts, CeedInt qdatasize,
--                            const CeedInt *strides,
-+                            CeedInt nelem,
-+                            CeedInt nqpts,
-+                            CeedInt qdatasize,
-+                            const CeedInt strides[3],
-+                            Ceed ceed,
-                             CeedElemRestriction *restr);
- 
--/** @brief Initialize a CeedElemRestriction for a mfem::Coefficient on a mixed
--    mesh.
--
--    @param[in] fes The finite element space.
--    @param[in] nelem is the number of elements.
--    @param[in] indices The indices of the elements of same type in the
--                       `FiniteElementSpace`.
--    @param[in] nquads is the total number of quadrature points
--    @param[in] ncomp is the number of data per quadrature point
--    @param[in] ceed The Ceed object.
--    @param[out] restr The `CeedElemRestriction` to initialize. */
--void InitCoeffRestrictionWithIndices(const FiniteElementSpace &fes,
--                                     int nelem,
--                                     const int* indices,
--                                     int nquads,
--                                     int ncomp,
--                                     Ceed ceed,
--                                     CeedElemRestriction *restr);
--
- #endif
- 
- } // namespace ceed
-diff --git a/fem/ceed/interface/util.cpp b/fem/ceed/interface/util.cpp
-index b65fd2197..4eecc7841 100644
---- a/fem/ceed/interface/util.cpp
-+++ b/fem/ceed/interface/util.cpp
-@@ -9,14 +9,11 @@
- // terms of the BSD-3 license. We welcome feedback and contributions, see file
- // CONTRIBUTING.md for details.
- 
-+#include "util.hpp"
-+
- #include "../../../general/device.hpp"
--#include "../../../fem/gridfunc.hpp"
- #include "../../../linalg/dtensor.hpp"
--
--#include "basis.hpp"
--#include "restriction.hpp"
--#include "ceed.hpp"
--
-+#include "../../gridfunc.hpp"
- #include <sys/types.h>
- #include <sys/stat.h>
- #if !defined(_WIN32) || !defined(_MSC_VER)
-@@ -44,7 +41,7 @@ void RemoveBasisAndRestriction(const mfem::FiniteElementSpace *fes)
-    auto itb = mfem::internal::ceed_basis_map.begin();
-    while (itb != mfem::internal::ceed_basis_map.end())
-    {
--      if (std::get<0>(itb->first)==fes)
-+      if (std::get<0>(itb->first) == fes)
-       {
-          CeedBasisDestroy(&itb->second);
-          itb = mfem::internal::ceed_basis_map.erase(itb);
-@@ -57,7 +54,7 @@ void RemoveBasisAndRestriction(const mfem::FiniteElementSpace *fes)
-    auto itr = mfem::internal::ceed_restr_map.begin();
-    while (itr != mfem::internal::ceed_restr_map.end())
-    {
--      if (std::get<0>(itr->first)==fes)
-+      if (std::get<0>(itr->first) == fes)
-       {
-          CeedElemRestrictionDestroy(&itr->second);
-          itr = mfem::internal::ceed_restr_map.erase(itr);
-@@ -78,78 +75,41 @@ void InitVector(const mfem::Vector &v, CeedVector &cv)
-    CeedScalar *cv_ptr;
-    CeedMemType mem;
-    CeedGetPreferredMemType(mfem::internal::ceed, &mem);
--   if ( Device::Allows(Backend::DEVICE_MASK) && mem==CEED_MEM_DEVICE )
-+   if (Device::Allows(Backend::DEVICE_MASK) && mem == CEED_MEM_DEVICE)
-    {
--      cv_ptr = const_cast<CeedScalar*>(v.Read());
-+      cv_ptr = const_cast<CeedScalar *>(v.Read());
-    }
-    else
-    {
--      cv_ptr = const_cast<CeedScalar*>(v.HostRead());
-+      cv_ptr = const_cast<CeedScalar *>(v.HostRead());
-       mem = CEED_MEM_HOST;
-    }
-    CeedVectorSetArray(cv, mem, CEED_USE_POINTER, cv_ptr);
- }
- 
--void InitBasisAndRestriction(const FiniteElementSpace &fes,
--                             const IntegrationRule &irm,
--                             Ceed ceed, CeedBasis *basis,
--                             CeedElemRestriction *restr)
--{
--   InitBasis(fes, irm, ceed, basis);
--   InitRestriction(fes, ceed, restr);
--}
--
--void InitBasisAndRestrictionWithIndices(const FiniteElementSpace &fes,
--                                        const IntegrationRule &irm,
--                                        int nelem,
--                                        const int* indices,
--                                        Ceed ceed, CeedBasis *basis,
--                                        CeedElemRestriction *restr)
--{
--   InitBasisWithIndices(fes, irm, nelem, indices, ceed, basis);
--   InitRestrictionWithIndices(fes, nelem, indices, ceed, restr);
--}
--
--void InitBasisAndRestriction(const FiniteElementSpace &fes,
--                             const IntegrationRule &irm,
--                             int nelem,
--                             const int* indices,
--                             Ceed ceed, CeedBasis *basis,
--                             CeedElemRestriction *restr)
--{
--   if (indices)
--   {
--      InitBasisAndRestrictionWithIndices(fes,irm,nelem,indices,ceed,basis,restr);
--   }
--   else
--   {
--      InitBasisAndRestriction(fes,irm,ceed,basis,restr);
--   }
--}
--
- // Assumes a tensor-product operator with one active field
- int CeedOperatorGetActiveField(CeedOperator oper, CeedOperatorField *field)
- {
-    int ierr;
-    Ceed ceed;
--   ierr = CeedOperatorGetCeed(oper, &ceed); CeedChk(ierr);
-+   ierr = CeedOperatorGetCeed(oper, &ceed); PCeedChk(ierr);
- 
-    CeedQFunction qf;
-    bool isComposite;
--   ierr = CeedOperatorIsComposite(oper, &isComposite); CeedChk(ierr);
-+   ierr = CeedOperatorIsComposite(oper, &isComposite); PCeedChk(ierr);
-    CeedOperator *subops;
-    if (isComposite)
-    {
- #if CEED_VERSION_GE(0, 10, 2)
--      ierr = CeedCompositeOperatorGetSubList(oper, &subops); CeedChk(ierr);
-+      ierr = CeedCompositeOperatorGetSubList(oper, &subops); PCeedChk(ierr);
- #else
--      ierr = CeedOperatorGetSubList(oper, &subops); CeedChk(ierr);
-+      ierr = CeedOperatorGetSubList(oper, &subops); PCeedChk(ierr);
- #endif
--      ierr = CeedOperatorGetQFunction(subops[0], &qf); CeedChk(ierr);
-+      ierr = CeedOperatorGetQFunction(subops[0], &qf); PCeedChk(ierr);
-    }
-    else
-    {
--      ierr = CeedOperatorGetQFunction(oper, &qf); CeedChk(ierr);
-+      ierr = CeedOperatorGetQFunction(oper, &qf); PCeedChk(ierr);
-    }
-    CeedInt numinputfields, numoutputfields;
-    ierr = CeedQFunctionGetNumArgs(qf, &numinputfields, &numoutputfields);
-@@ -157,12 +117,12 @@ int CeedOperatorGetActiveField(CeedOperator oper, CeedOperatorField *field)
-    if (isComposite)
-    {
-       ierr = CeedOperatorGetFields(subops[0], &numinputfields, &inputfields,
--                                   &numoutputfields, NULL); CeedChk(ierr);
-+                                   &numoutputfields, NULL); PCeedChk(ierr);
-    }
-    else
-    {
-       ierr = CeedOperatorGetFields(oper, &numinputfields, &inputfields,
--                                   &numoutputfields, NULL); CeedChk(ierr);
-+                                   &numoutputfields, NULL); PCeedChk(ierr);
-    }
- 
-    CeedVector if_vector;
-@@ -170,7 +130,7 @@ int CeedOperatorGetActiveField(CeedOperator oper, CeedOperatorField *field)
-    int found_index = -1;
-    for (int i = 0; i < numinputfields; ++i)
-    {
--      ierr = CeedOperatorFieldGetVector(inputfields[i], &if_vector); CeedChk(ierr);
-+      ierr = CeedOperatorFieldGetVector(inputfields[i], &if_vector); PCeedChk(ierr);
-       if (if_vector == CEED_VECTOR_ACTIVE)
-       {
-          if (found)
-@@ -190,66 +150,6 @@ int CeedOperatorGetActiveField(CeedOperator oper, CeedOperatorField *field)
-    return 0;
- }
- 
--template <>
--const IntegrationRule & GetRule<MassIntegrator>(
--   const MassIntegrator &integ,
--   const FiniteElement &trial_fe,
--   const FiniteElement &test_fe,
--   ElementTransformation &trans)
--{
--   return MassIntegrator::GetRule(trial_fe, test_fe, trans);
--}
--
--template <>
--const IntegrationRule & GetRule<VectorMassIntegrator>(
--   const VectorMassIntegrator &integ,
--   const FiniteElement &trial_fe,
--   const FiniteElement &test_fe,
--   ElementTransformation &trans)
--{
--   return MassIntegrator::GetRule(trial_fe, test_fe, trans);
--}
--
--template <>
--const IntegrationRule & GetRule<ConvectionIntegrator>(
--   const ConvectionIntegrator &integ,
--   const FiniteElement &trial_fe,
--   const FiniteElement &test_fe,
--   ElementTransformation &trans)
--{
--   return ConvectionIntegrator::GetRule(trial_fe, trans);
--}
--
--template <>
--const IntegrationRule & GetRule<VectorConvectionNLFIntegrator>(
--   const VectorConvectionNLFIntegrator &integ,
--   const FiniteElement &trial_fe,
--   const FiniteElement &test_fe,
--   ElementTransformation &trans)
--{
--   return VectorConvectionNLFIntegrator::GetRule(trial_fe, trans);
--}
--
--template <>
--const IntegrationRule & GetRule<DiffusionIntegrator>(
--   const DiffusionIntegrator &integ,
--   const FiniteElement &trial_fe,
--   const FiniteElement &test_fe,
--   ElementTransformation &trans)
--{
--   return DiffusionIntegrator::GetRule(trial_fe, test_fe);
--}
--
--template <>
--const IntegrationRule & GetRule<VectorDiffusionIntegrator>(
--   const VectorDiffusionIntegrator &integ,
--   const FiniteElement &trial_fe,
--   const FiniteElement &test_fe,
--   ElementTransformation &trans)
--{
--   return DiffusionIntegrator::GetRule(trial_fe, test_fe);
--}
--
- std::string ceed_path;
- 
- const std::string &GetCeedPath()
-diff --git a/fem/ceed/interface/util.hpp b/fem/ceed/interface/util.hpp
-index 17ae4adae..823cd09e3 100644
---- a/fem/ceed/interface/util.hpp
-+++ b/fem/ceed/interface/util.hpp
-@@ -12,15 +12,15 @@
- #ifndef MFEM_LIBCEED_UTIL
- #define MFEM_LIBCEED_UTIL
- 
--#include "../../../config/config.hpp"
-+#include <array>
- #include <tuple>
- #include <unordered_map>
- #include <string>
--
-+#include "../../../general/error.hpp"
- #include "ceed.hpp"
- #ifdef MFEM_USE_CEED
- #include <ceed/hash.h>
--#include <ceed/backend.h>  // for CeedOperatorField
-+#include <ceed/backend.h>
- #endif
- 
- namespace mfem
-@@ -40,7 +40,7 @@ namespace ceed
- {
- 
- /** @brief Remove from ceed_basis_map and ceed_restr_map the entries associated
--    with the given @a fes. */
-+    with the given @a fes when @a fes gets destroyed. */
- void RemoveBasisAndRestriction(const mfem::FiniteElementSpace *fes);
- 
- #ifdef MFEM_USE_CEED
-@@ -48,7 +48,7 @@ void RemoveBasisAndRestriction(const mfem::FiniteElementSpace *fes);
- #define PCeedChk(err) do {                                                     \
-      if ((err))                                                                \
-      {                                                                         \
--        const char * errmsg;                                                   \
-+        const char *errmsg;                                                    \
-         CeedGetErrorMessage(internal::ceed, &errmsg);                          \
-         MFEM_ABORT(errmsg);                                                    \
-      }                                                                         \
-@@ -57,91 +57,54 @@ void RemoveBasisAndRestriction(const mfem::FiniteElementSpace *fes);
- /// Initialize a CeedVector from an mfem::Vector
- void InitVector(const mfem::Vector &v, CeedVector &cv);
- 
--/** @brief Initialize a CeedBasis and a CeedElemRestriction based on an
--    mfem::FiniteElementSpace @a fes, and an mfem::IntegrationRule @a ir.
--
--    @param[in] fes The finite element space.
--    @param[in] ir The integration rule.
--    @param[in] ceed The Ceed object.
--    @param[out] basis The `CeedBasis` to initialize.
--    @param[out] restr The `CeedElemRestriction` to initialize.
--
--    @warning Only for non-mixed finite element spaces. */
--void InitBasisAndRestriction(const mfem::FiniteElementSpace &fes,
--                             const mfem::IntegrationRule &ir,
--                             Ceed ceed, CeedBasis *basis,
--                             CeedElemRestriction *restr);
--
--/** @brief Initialize a CeedBasis and a CeedElemRestriction based on an
--    mfem::FiniteElementSpace @a fes, and an mfem::IntegrationRule @a ir,
--    and a list of @a nelem elements of indices @a indices.
--
--    @param[in] fes The finite element space.
--    @param[in] ir The integration rule.
--    @param[in] nelem The number of elements.
--    @param[in] indices The indices of the elements of same type in the
--                       `FiniteElementSpace`. If `indices == nullptr`, assumes
--                       that the `FiniteElementSpace` is not mixed.
--    @param[in] ceed The Ceed object.
--    @param[out] basis The `CeedBasis` to initialize.
--    @param[out] restr The `CeedElemRestriction` to initialize. */
--void InitBasisAndRestriction(const FiniteElementSpace &fes,
--                             const IntegrationRule &ir,
--                             int nelem,
--                             const int* indices,
--                             Ceed ceed, CeedBasis *basis,
--                             CeedElemRestriction *restr);
--
- int CeedOperatorGetActiveField(CeedOperator oper, CeedOperatorField *field);
- 
--
--template <typename Integrator>
--const IntegrationRule & GetRule(
--   const Integrator &integ,
--   const FiniteElement &trial_fe,
--   const FiniteElement &test_fe,
--   ElementTransformation &Trans);
--
--/// Return the path to the libCEED q-function headers.
-+/// Return the path to the libCEED QFunction headers.
- const std::string &GetCeedPath();
- 
- // Hash table for CeedBasis
--using BasisKey = std::tuple<const mfem::FiniteElementSpace*,
--      const mfem::IntegrationRule*,
--      int, int, int>;
-+using BasisKey =
-+   std::tuple<const mfem::FiniteElementSpace *, const mfem::FiniteElementSpace *,
-+   const mfem::IntegrationRule *, std::array<int, 3>>;
- struct BasisHash
- {
--   std::size_t operator()(const BasisKey& k) const
-+   std::size_t operator()(const BasisKey &k) const
-    {
-       return CeedHashCombine(
-                 CeedHashCombine(
--                   CeedHashInt(reinterpret_cast<CeedHash64_t>(std::get<0>(k))),
--                   CeedHashInt(reinterpret_cast<CeedHash64_t>(std::get<1>(k)))),
-+                   CeedHashCombine(
-+                      CeedHashInt(reinterpret_cast<CeedHash64_t>(std::get<0>(k))),
-+                      CeedHashInt(reinterpret_cast<CeedHash64_t>(std::get<1>(k)))),
-+                   CeedHashInt(reinterpret_cast<CeedHash64_t>(std::get<2>(k)))),
-                 CeedHashCombine(
--                   CeedHashCombine(CeedHashInt(std::get<2>(k)),
--                                   CeedHashInt(std::get<3>(k))),
--                   CeedHashInt(std::get<4>(k))));
-+                   CeedHashCombine(CeedHashInt(std::get<3>(k)[0]),
-+                                   CeedHashInt(std::get<3>(k)[1])),
-+                   CeedHashInt(std::get<3>(k)[2])));
-    }
- };
- using BasisMap = std::unordered_map<const BasisKey, CeedBasis, BasisHash>;
- 
--enum restr_type {Standard, Strided, Coeff};
--
- // Hash table for CeedElemRestriction
- using RestrKey =
--   std::tuple<const mfem::FiniteElementSpace*, int, int, int, int>;
-+   std::tuple<const mfem::FiniteElementSpace *, std::array<int, 3>,
-+   std::array<int, 3>, int>;
- struct RestrHash
- {
--   std::size_t operator()(const RestrKey& k) const
-+   std::size_t operator()(const RestrKey &k) const
-    {
-       return CeedHashCombine(
-+                CeedHashInt(reinterpret_cast<CeedHash64_t>(std::get<0>(k))),
-                 CeedHashCombine(
-                    CeedHashCombine(
--                      CeedHashInt(reinterpret_cast<CeedHash64_t>(std::get<0>(k))),
--                      CeedHashInt(std::get<1>(k))),
--                   CeedHashCombine(CeedHashInt(std::get<2>(k)),
--                                   CeedHashInt(std::get<3>(k)))),
--                CeedHashInt(std::get<4>(k)));
-+                      CeedHashCombine(
-+                         CeedHashCombine(CeedHashInt(std::get<1>(k)[0]),
-+                                         CeedHashInt(std::get<1>(k)[1])),
-+                         CeedHashInt(std::get<1>(k)[2])),
-+                      CeedHashCombine(
-+                         CeedHashCombine(CeedHashInt(std::get<2>(k)[0]),
-+                                         CeedHashInt(std::get<2>(k)[1])),
-+                         CeedHashInt(std::get<2>(k)[2]))),
-+                   CeedHashInt(std::get<3>(k))));
-    }
- };
- using RestrMap =
-@@ -156,7 +119,7 @@ namespace internal
- 
- #ifdef MFEM_USE_CEED
- /** @warning These maps have a tendency to create bugs when adding new "types"
--    of CeedBasis and CeedElemRestriction. */
-+    of CeedBasis and CeedElemRestriction. Definitions in general/device.cpp. */
- extern ceed::BasisMap ceed_basis_map;
- extern ceed::RestrMap ceed_restr_map;
- #endif
-diff --git a/fem/ceed/solvers/algebraic.cpp b/fem/ceed/solvers/algebraic.cpp
-index 280a19960..72eca5c2e 100644
---- a/fem/ceed/solvers/algebraic.cpp
-+++ b/fem/ceed/solvers/algebraic.cpp
-@@ -11,14 +11,16 @@
- 
- #include "algebraic.hpp"
- 
-+#include "../../../general/forall.hpp"
- #include "../../bilinearform.hpp"
- #include "../../fespace.hpp"
- #include "../../pfespace.hpp"
--#include "../../../general/forall.hpp"
--#include "solvers-atpmg.hpp"
--#include "full-assembly.hpp"
- #include "../interface/restriction.hpp"
--#include "../interface/ceed.hpp"
-+#include "../interface/util.hpp"
-+#include "solvers-atpmg.hpp"
-+#ifdef MFEM_USE_CEED
-+#include <ceed/backend.h>
-+#endif
- 
- namespace mfem
- {
-@@ -28,6 +30,13 @@ namespace ceed
- 
- #ifdef MFEM_USE_CEED
- 
-+int CeedInternalFree(void *p)
-+{
-+   free(*(void **)p);
-+   *(void **)p = NULL;
-+   return 0;
-+}
-+
- /** Wraps a CeedOperator in an mfem::Operator, with essential boundary
-     conditions and a prolongation operator for parallel application. */
- class ConstrainedOperator : public mfem::Operator
-@@ -91,29 +100,22 @@ const mfem::Operator *ConstrainedOperator::GetProlongation() const
-    return P;
- }
- 
--/// assumes a square operator (you could do rectangular, you'd have
--/// to find separate active input and output fields/restrictions)
--int CeedOperatorGetSize(CeedOperator oper, CeedInt * size)
--{
--   CeedSize in_len, out_len;
--   int ierr = CeedOperatorGetActiveVectorLengths(oper, &in_len, &out_len);
--   CeedChk(ierr);
--   *size = (CeedInt)in_len;
--   MFEM_VERIFY(in_len == out_len, "not a square CeedOperator");
--   MFEM_VERIFY(in_len == *size, "size overflow");
--   return 0;
--}
--
- Solver *BuildSmootherFromCeed(ConstrainedOperator &op, bool chebyshev)
- {
-    int ierr;
-+
-    CeedOperator ceed_op = op.GetCeedOperator();
-    const Array<int> &ess_tdofs = op.GetEssentialTrueDofs();
-    const mfem::Operator *P = op.GetProlongation();
-+
-    // Assemble the a local diagonal, in the sense of L-vector
-    CeedVector diagceed;
--   CeedInt length;
--   ierr = CeedOperatorGetSize(ceed_op, &length); PCeedChk(ierr);
-+   CeedSize l_in, l_out;
-+   ierr = CeedOperatorGetActiveVectorLengths(ceed_op, &l_in, &l_out);
-+   PCeedChk(ierr);
-+   MFEM_VERIFY(l_in == l_out, "Not a square CeedOperator.");
-+   MFEM_VERIFY((CeedInt)l_in == l_in, "Size overflow.");
-+   CeedInt length = l_in;
-    ierr = CeedVectorCreate(internal::ceed, length, &diagceed); PCeedChk(ierr);
-    CeedMemType mem;
-    ierr = CeedGetPreferredMemType(internal::ceed, &mem); PCeedChk(ierr);
-@@ -239,20 +241,18 @@ CeedOperator CreateCeedCompositeOperatorFromBilinearForm(BilinearForm &form)
-    int ierr;
-    CeedOperator op;
-    ierr = CeedCompositeOperatorCreate(internal::ceed, &op); PCeedChk(ierr);
--
--   MFEM_VERIFY(form.GetBBFI()->Size() == 0,
--               "Not implemented for this integrator!");
--   MFEM_VERIFY(form.GetFBFI()->Size() == 0,
--               "Not implemented for this integrator!");
--   MFEM_VERIFY(form.GetBFBFI()->Size() == 0,
--               "Not implemented for this integrator!");
--
--   // Get the domain bilinear form integrators (DBFIs)
--   Array<BilinearFormIntegrator*> *bffis = form.GetDBFI();
--   for (int i = 0; i < bffis->Size(); ++i)
-+   for (BilinearFormIntegrator *integ : *form.GetDBFI())
-+   {
-+      AddToCompositeOperator(integ, op);
-+   }
-+   for (BilinearFormIntegrator *integ : *form.GetBBFI())
-    {
--      AddToCompositeOperator((*bffis)[i], op);
-+      AddToCompositeOperator(integ, op);
-    }
-+   MFEM_VERIFY(form.GetFBFI()->Size() == 0, "AddInteriorFaceIntegrator is not "
-+               "currently supported in CreateCeedCompositeOperatorFromBilinearForm");
-+   MFEM_VERIFY(form.GetBFBFI()->Size() == 0, "AddBdrFaceIntegrator is not "
-+               "currently supported in CreateCeedCompositeOperatorFromBilinearForm");
-    return op;
- }
- 
-@@ -266,7 +266,8 @@ CeedOperator CoarsenCeedCompositeOperator(
-    int ierr;
-    bool isComposite;
-    ierr = CeedOperatorIsComposite(op, &isComposite); PCeedChk(ierr);
--   MFEM_ASSERT(isComposite, "");
-+   MFEM_ASSERT(isComposite,
-+               "CoarsenCeedCompositeOperator requires a composite operator.");
- 
-    CeedOperator op_coarse;
-    ierr = CeedCompositeOperatorCreate(internal::ceed,
-@@ -376,67 +377,68 @@ int AlgebraicInterpolation::Initialize(
- 
-    CeedSize height, width;
-    ierr = CeedElemRestrictionGetLVectorSize(erestrictu_coarse, &width);
--   CeedChk(ierr);
-+   PCeedChk(ierr);
-    ierr = CeedElemRestrictionGetLVectorSize(erestrictu_fine, &height);
--   CeedChk(ierr);
-+   PCeedChk(ierr);
- 
-    // interpolation qfunction
-    const int bp3_ncompu = 1;
-    CeedQFunction l_qf_restrict, l_qf_prolong;
-    ierr = CeedQFunctionCreateIdentity(ceed, bp3_ncompu, CEED_EVAL_NONE,
--                                      CEED_EVAL_INTERP, &l_qf_restrict); CeedChk(ierr);
-+                                      CEED_EVAL_INTERP, &l_qf_restrict); PCeedChk(ierr);
-    ierr = CeedQFunctionCreateIdentity(ceed, bp3_ncompu, CEED_EVAL_INTERP,
--                                      CEED_EVAL_NONE, &l_qf_prolong); CeedChk(ierr);
-+                                      CEED_EVAL_NONE, &l_qf_prolong); PCeedChk(ierr);
- 
-    qf_restrict = l_qf_restrict;
-    qf_prolong = l_qf_prolong;
- 
-    CeedVector c_fine_multiplicity;
--   ierr = CeedVectorCreate(ceed, height, &c_fine_multiplicity); CeedChk(ierr);
--   ierr = CeedVectorSetValue(c_fine_multiplicity, 0.0); CeedChk(ierr);
-+   ierr = CeedVectorCreate(ceed, height, &c_fine_multiplicity); PCeedChk(ierr);
-+   ierr = CeedVectorSetValue(c_fine_multiplicity, 0.0); PCeedChk(ierr);
- 
-    // Create the restriction operator
-    // Restriction - Fine to coarse
-    ierr = CeedOperatorCreate(ceed, qf_restrict, CEED_QFUNCTION_NONE,
--                             CEED_QFUNCTION_NONE, &op_restrict); CeedChk(ierr);
-+                             CEED_QFUNCTION_NONE, &op_restrict); PCeedChk(ierr);
-    ierr = CeedOperatorSetField(op_restrict, "input", erestrictu_fine,
--                               CEED_BASIS_COLLOCATED, CEED_VECTOR_ACTIVE); CeedChk(ierr);
-+                               CEED_BASIS_COLLOCATED, CEED_VECTOR_ACTIVE); PCeedChk(ierr);
-    ierr = CeedOperatorSetField(op_restrict, "output", erestrictu_coarse,
--                               basisctof, CEED_VECTOR_ACTIVE); CeedChk(ierr);
-+                               basisctof, CEED_VECTOR_ACTIVE); PCeedChk(ierr);
- 
-    // Interpolation - Coarse to fine
-    // Create the prolongation operator
-    ierr =  CeedOperatorCreate(ceed, qf_prolong, CEED_QFUNCTION_NONE,
--                              CEED_QFUNCTION_NONE, &op_interp); CeedChk(ierr);
-+                              CEED_QFUNCTION_NONE, &op_interp); PCeedChk(ierr);
-    ierr =  CeedOperatorSetField(op_interp, "input", erestrictu_coarse,
--                                basisctof, CEED_VECTOR_ACTIVE); CeedChk(ierr);
-+                                basisctof, CEED_VECTOR_ACTIVE); PCeedChk(ierr);
-    ierr = CeedOperatorSetField(op_interp, "output", erestrictu_fine,
--                               CEED_BASIS_COLLOCATED, CEED_VECTOR_ACTIVE); CeedChk(ierr);
-+                               CEED_BASIS_COLLOCATED, CEED_VECTOR_ACTIVE); PCeedChk(ierr);
- 
-    ierr = CeedElemRestrictionGetMultiplicity(erestrictu_fine,
--                                             c_fine_multiplicity); CeedChk(ierr);
--   ierr = CeedVectorCreate(ceed, height, &fine_multiplicity_r); CeedChk(ierr);
-+                                             c_fine_multiplicity); PCeedChk(ierr);
-+   ierr = CeedVectorCreate(ceed, height, &fine_multiplicity_r); PCeedChk(ierr);
- 
-    CeedScalar* fine_r_data;
-    const CeedScalar* fine_data;
-    ierr = CeedVectorGetArrayWrite(fine_multiplicity_r, CEED_MEM_HOST,
--                                  &fine_r_data); CeedChk(ierr);
-+                                  &fine_r_data); PCeedChk(ierr);
-    ierr = CeedVectorGetArrayRead(c_fine_multiplicity, CEED_MEM_HOST,
--                                 &fine_data); CeedChk(ierr);
-+                                 &fine_data); PCeedChk(ierr);
-    for (CeedSize i = 0; i < height; ++i)
-    {
-       fine_r_data[i] = 1.0 / fine_data[i];
-    }
- 
--   ierr = CeedVectorRestoreArray(fine_multiplicity_r, &fine_r_data); CeedChk(ierr);
-+   ierr = CeedVectorRestoreArray(fine_multiplicity_r, &fine_r_data);
-+   PCeedChk(ierr);
-    ierr = CeedVectorRestoreArrayRead(c_fine_multiplicity, &fine_data);
--   CeedChk(ierr);
--   ierr = CeedVectorDestroy(&c_fine_multiplicity); CeedChk(ierr);
-+   PCeedChk(ierr);
-+   ierr = CeedVectorDestroy(&c_fine_multiplicity); PCeedChk(ierr);
- 
--   ierr = CeedVectorCreate(ceed, height, &fine_work); CeedChk(ierr);
-+   ierr = CeedVectorCreate(ceed, height, &fine_work); PCeedChk(ierr);
- 
--   ierr = CeedVectorCreate(ceed, height, &v_); CeedChk(ierr);
--   ierr = CeedVectorCreate(ceed, width, &u_); CeedChk(ierr);
-+   ierr = CeedVectorCreate(ceed, height, &v_); PCeedChk(ierr);
-+   ierr = CeedVectorCreate(ceed, width, &u_); PCeedChk(ierr);
- 
-    return 0;
- }
-@@ -445,12 +447,12 @@ int AlgebraicInterpolation::Finalize()
- {
-    int ierr;
- 
--   ierr = CeedQFunctionDestroy(&qf_restrict); CeedChk(ierr);
--   ierr = CeedQFunctionDestroy(&qf_prolong); CeedChk(ierr);
--   ierr = CeedOperatorDestroy(&op_interp); CeedChk(ierr);
--   ierr = CeedOperatorDestroy(&op_restrict); CeedChk(ierr);
--   ierr = CeedVectorDestroy(&fine_multiplicity_r); CeedChk(ierr);
--   ierr = CeedVectorDestroy(&fine_work); CeedChk(ierr);
-+   ierr = CeedQFunctionDestroy(&qf_restrict); PCeedChk(ierr);
-+   ierr = CeedQFunctionDestroy(&qf_prolong); PCeedChk(ierr);
-+   ierr = CeedOperatorDestroy(&op_interp); PCeedChk(ierr);
-+   ierr = CeedOperatorDestroy(&op_restrict); PCeedChk(ierr);
-+   ierr = CeedVectorDestroy(&fine_multiplicity_r); PCeedChk(ierr);
-+   ierr = CeedVectorDestroy(&fine_work); PCeedChk(ierr);
- 
-    return 0;
- }
-@@ -468,8 +470,8 @@ AlgebraicInterpolation::AlgebraicInterpolation(
-                                             &ho_nldofs); PCeedChk(ierr);
-    height = (int)ho_nldofs;
-    width = (int)lo_nldofs;
--   MFEM_VERIFY(ho_nldofs == height, "height overflow");
--   MFEM_VERIFY(lo_nldofs == width, "width overflow");
-+   MFEM_VERIFY(ho_nldofs == height, "Height overflow.");
-+   MFEM_VERIFY(lo_nldofs == width, "Width overflow.");
-    owns_basis_ = false;
-    ierr = Initialize(ceed, basisctof, erestrictu_coarse, erestrictu_fine);
-    PCeedChk(ierr);
-@@ -488,7 +490,6 @@ AlgebraicInterpolation::~AlgebraicInterpolation()
- }
- 
- /// a = a (pointwise*) b
--/// @todo: using MPI_FORALL in this Ceed-like function is ugly
- int CeedVectorPointwiseMult(CeedVector a, const CeedVector b)
- {
-    int ierr;
-@@ -496,8 +497,8 @@ int CeedVectorPointwiseMult(CeedVector a, const CeedVector b)
-    CeedVectorGetCeed(a, &ceed);
- 
-    CeedSize length, length2;
--   ierr = CeedVectorGetLength(a, &length); CeedChk(ierr);
--   ierr = CeedVectorGetLength(b, &length2); CeedChk(ierr);
-+   ierr = CeedVectorGetLength(a, &length); PCeedChk(ierr);
-+   ierr = CeedVectorGetLength(b, &length2); PCeedChk(ierr);
-    if (length != length2)
-    {
-       return CeedError(ceed, 1, "Vector sizes don't match");
-@@ -514,14 +515,16 @@ int CeedVectorPointwiseMult(CeedVector a, const CeedVector b)
-    }
-    CeedScalar *a_data;
-    const CeedScalar *b_data;
--   ierr = CeedVectorGetArray(a, mem, &a_data); CeedChk(ierr);
--   ierr = CeedVectorGetArrayRead(b, mem, &b_data); CeedChk(ierr);
--   MFEM_VERIFY(int(length) == length, "length overflow");
-+   ierr = CeedVectorGetArray(a, mem, &a_data); PCeedChk(ierr);
-+   ierr = CeedVectorGetArrayRead(b, mem, &b_data); PCeedChk(ierr);
-+   MFEM_VERIFY(int(length) == length, "Length overflow.");
-    mfem::forall(length, [=] MFEM_HOST_DEVICE (int i)
--   {a_data[i] *= b_data[i];});
-+   {
-+      a_data[i] *= b_data[i];
-+   });
- 
--   ierr = CeedVectorRestoreArray(a, &a_data); CeedChk(ierr);
--   ierr = CeedVectorRestoreArrayRead(b, &b_data); CeedChk(ierr);
-+   ierr = CeedVectorRestoreArray(a, &a_data); PCeedChk(ierr);
-+   ierr = CeedVectorRestoreArrayRead(b, &b_data); PCeedChk(ierr);
- 
-    return 0;
- }
-@@ -590,7 +593,7 @@ void AlgebraicInterpolation::MultTranspose(const mfem::Vector& x,
-    ierr = CeedVectorGetArrayRead(fine_multiplicity_r, mem,
-                                  &multiplicitydata); PCeedChk(ierr);
-    ierr = CeedVectorGetArrayWrite(fine_work, mem, &workdata); PCeedChk(ierr);
--   MFEM_VERIFY((int)length == length, "length overflow");
-+   MFEM_VERIFY((int)length == length, "Length overflow.");
-    mfem::forall(length, [=] MFEM_HOST_DEVICE (int i)
-    {workdata[i] = in_ptr[i] * multiplicitydata[i];});
-    ierr = CeedVectorRestoreArrayRead(fine_multiplicity_r,
-@@ -636,7 +639,7 @@ AlgebraicSpaceHierarchy::AlgebraicSpaceHierarchy(FiniteElementSpace &fes)
-    current_order = order;
- 
-    Ceed ceed = internal::ceed;
--   InitRestriction(fes, ceed, &fine_er);
-+   InitRestriction(fes, false, ceed, &fine_er);
-    CeedElemRestriction er = fine_er;
- 
-    int dim = fes.GetMesh()->Dimension();
-@@ -715,7 +718,7 @@ AlgebraicCoarseSpace::AlgebraicCoarseSpace(
-    ierr = CeedElemRestrictionGetLVectorSize(ceed_elem_restriction, &ndofs_);
-    PCeedChk(ierr);
-    ndofs = ndofs_;
--   MFEM_VERIFY(ndofs == ndofs_, "ndofs overflow");
-+   MFEM_VERIFY(ndofs == ndofs_, "Overflow in ndofs.");
- 
-    mesh = fine_fes.GetMesh();
- }
-@@ -741,9 +744,8 @@ ParAlgebraicCoarseSpace::ParAlgebraicCoarseSpace(
- {
-    CeedSize lsize;
-    CeedElemRestrictionGetLVectorSize(ceed_elem_restriction, &lsize);
-+   MFEM_VERIFY((int)lsize == lsize, "Size overflow.");
-    const Table &group_ldof_fine = gc_fine->GroupLDofTable();
--
--   MFEM_VERIFY((int)lsize == lsize, "size overflow");
-    ldof_group.SetSize(lsize);
-    ldof_group = 0;
- 
-@@ -814,7 +816,8 @@ HypreParMatrix *ParAlgebraicCoarseSpace::GetProlongationHypreParMatrix()
-    if (P_mat) { return P_mat; }
- 
-    ParMesh *pmesh = dynamic_cast<ParMesh*>(mesh);
--   MFEM_VERIFY(pmesh != NULL, "");
-+   MFEM_VERIFY(pmesh != NULL,
-+               "ParAlgebraicCoarseSpace requires a ParMesh mesh object.");
-    Array<HYPRE_BigInt> dof_offsets, tdof_offsets, tdof_nb_offsets;
-    Array<HYPRE_BigInt> *offsets[2] = {&dof_offsets, &tdof_offsets};
-    int ltsize = R_mat->Height();
-@@ -943,7 +946,7 @@ AlgebraicSolver::AlgebraicSolver(BilinearForm &form,
-                form.GetAssemblyLevel() == AssemblyLevel::NONE,
-                "AlgebraicSolver requires partial assembly or fully matrix-free.");
-    MFEM_VERIFY(UsesTensorBasis(*form.FESpace()),
--               "AlgebraicSolver requires tensor product basis functions.");
-+               "AlgebraicSolver requires tensor-product basis functions.");
- #ifdef MFEM_USE_CEED
-    fespaces = new AlgebraicSpaceHierarchy(*form.FESpace());
-    multigrid = new AlgebraicMultigrid(*fespaces, form, ess_tdofs);
-@@ -974,6 +977,88 @@ void AlgebraicSolver::SetOperator(const mfem::Operator& op)
- #endif
- }
- 
-+#ifdef MFEM_USE_CEED
-+SparseMatrix *CeedOperatorFullAssemble(BilinearForm &form, bool set)
-+{
-+   Array<SparseMatrix *> mat_i;
-+   for (BilinearFormIntegrator *integ : *form.GetDBFI())
-+   {
-+      if (!integ->SupportsCeed()) { continue; }
-+      SparseMatrix *mat_integ;
-+      int ierr = CeedOperatorFullAssemble(integ->GetCeedOp().GetCeedOperator(),
-+                                          &mat_integ, set);
-+      PCeedChk(ierr);
-+      mat_i.Append(mat_integ);
-+   }
-+   for (BilinearFormIntegrator *integ : *form.GetBBFI())
-+   {
-+      if (!integ->SupportsCeed()) { continue; }
-+      SparseMatrix *mat_integ;
-+      int ierr = CeedOperatorFullAssemble(integ->GetCeedOp().GetCeedOperator(),
-+                                          &mat_integ, set);
-+      PCeedChk(ierr);
-+      mat_i.Append(mat_integ);
-+   }
-+   MFEM_VERIFY(form.GetFBFI()->Size() == 0, "AddInteriorFaceIntegrator is not "
-+               "currently supported in CeedOperatorFullAssemble");
-+   MFEM_VERIFY(form.GetBFBFI()->Size() == 0, "AddBdrFaceIntegrator is not "
-+               "currently supported in CeedOperatorFullAssemble");
-+
-+   SparseMatrix *mat = Add(mat_i);
-+   for (SparseMatrix *mat_integ : mat_i)
-+   {
-+      delete mat_integ;
-+   }
-+   return mat;
-+}
-+
-+int CeedOperatorFullAssemble(CeedOperator op, SparseMatrix **mat, bool set)
-+{
-+   int ierr;
-+   Ceed ceed;
-+   ierr = CeedOperatorGetCeed(op, &ceed); PCeedChk(ierr);
-+
-+   CeedSize l_in, l_out;
-+   ierr = CeedOperatorGetActiveVectorLengths(op, &l_in, &l_out); PCeedChk(ierr);
-+   MFEM_VERIFY((int)l_in == l_in && (int)l_out == l_out, "Size overflow.");
-+   *mat = new SparseMatrix(l_out, l_in);
-+
-+   CeedSize nnz;
-+   CeedInt *rows, *cols;
-+   ierr = CeedOperatorLinearAssembleSymbolic(op, &nnz, &rows, &cols);
-+   PCeedChk(ierr);
-+
-+   CeedVector vals;
-+   ierr = CeedVectorCreate(ceed, nnz, &vals); PCeedChk(ierr);
-+   ierr = CeedOperatorLinearAssemble(op, vals); PCeedChk(ierr);
-+
-+   const CeedScalar *val_array;
-+   ierr = CeedVectorGetArrayRead(vals, CEED_MEM_HOST, &val_array); PCeedChk(ierr);
-+   for (CeedSize k = 0; k < nnz; ++k)
-+   {
-+      if (!set)
-+      {
-+         (*mat)->Add(rows[k], cols[k], val_array[k]);
-+      }
-+      else
-+      {
-+         (*mat)->Set(rows[k], cols[k], val_array[k]);
-+      }
-+   }
-+   ierr = CeedVectorRestoreArrayRead(vals, &val_array); PCeedChk(ierr);
-+
-+   ierr = CeedVectorDestroy(&vals); PCeedChk(ierr);
-+   ierr = CeedInternalFree(&rows); PCeedChk(ierr);
-+   ierr = CeedInternalFree(&cols); PCeedChk(ierr);
-+
-+   // Enforce structurally symmetric for later elimination
-+   const int skip_zeros = 0;
-+   (*mat)->Finalize(skip_zeros);
-+
-+   return 0;
-+}
-+#endif
-+
- } // namespace ceed
- 
- } // namespace mfem
-diff --git a/fem/ceed/solvers/algebraic.hpp b/fem/ceed/solvers/algebraic.hpp
-index 8ede8324e..b8a37b7ec 100644
---- a/fem/ceed/solvers/algebraic.hpp
-+++ b/fem/ceed/solvers/algebraic.hpp
-@@ -12,9 +12,9 @@
- #ifndef MFEM_CEED_ALGEBRAIC_HPP
- #define MFEM_CEED_ALGEBRAIC_HPP
- 
-+#include "../../../linalg/sparsemat.hpp"
- #include "../../fespacehierarchy.hpp"
- #include "../../multigrid.hpp"
--#include "../interface/operator.hpp"
- #include "../interface/ceed.hpp"
- 
- namespace mfem
-@@ -190,7 +190,7 @@ private:
- #endif
- 
- public:
--   /** @brief Constructs algebraic multigrid hierarchy and solver.
-+   /** @brief Constructs algebraic multigrid hierarchy and solver
- 
-        This only works if the Ceed device backend is enabled.
- 
-@@ -204,6 +204,26 @@ public:
-    void SetOperator(const mfem::Operator& op);
- };
- 
-+#ifdef MFEM_USE_CEED
-+/** @brief Assemble the CeedOperators from a BilinearForm as an
-+    mfem::SparseMatrix
-+
-+    In parallel, this assembles independently on each processor, that is, it
-+    assembles at the L-vector level. The assembly procedure is always performed
-+    on the host, but this works also for operators stored on device by copying
-+    memory. */
-+SparseMatrix *CeedOperatorFullAssemble(BilinearForm &form, bool set = false);
-+
-+/** @brief Assembles a CeedOperator as an mfem::SparseMatrix
-+
-+    In parallel, this assembles independently on each processor, that is, it
-+    assembles at the L-vector level. The assembly procedure is always performed
-+    on the host, but this works also for operators stored on device by copying
-+    memory. */
-+int CeedOperatorFullAssemble(CeedOperator op, SparseMatrix **mat,
-+                             bool set = false);
-+#endif
-+
- } // namespace ceed
- 
- } // namespace mfem
-diff --git a/fem/ceed/solvers/full-assembly.cpp b/fem/ceed/solvers/full-assembly.cpp
-deleted file mode 100644
-index ccf9b145a..000000000
---- a/fem/ceed/solvers/full-assembly.cpp
-+++ /dev/null
-@@ -1,341 +0,0 @@
--// Copyright (c) 2010-2023, Lawrence Livermore National Security, LLC. Produced
--// at the Lawrence Livermore National Laboratory. All Rights reserved. See files
--// LICENSE and NOTICE for details. LLNL-CODE-806117.
--//
--// This file is part of the MFEM library. For more information and source code
--// availability visit https://mfem.org.
--//
--// MFEM is free software; you can redistribute it and/or modify it under the
--// terms of the BSD-3 license. We welcome feedback and contributions, see file
--// CONTRIBUTING.md for details.
--
--#include "full-assembly.hpp"
--
--#include "../../../linalg/sparsemat.hpp"
--#include "../interface/util.hpp"
--#include "../interface/ceed.hpp"
--
--#ifdef MFEM_USE_CEED
--
--namespace mfem
--{
--
--namespace ceed
--{
--
--int CeedHackReallocArray(size_t n, size_t unit, void *p)
--{
--   *(void **)p = realloc(*(void **)p, n*unit);
--   if (n && unit && !*(void **)p)
--      return CeedError(NULL, 1, "realloc failed to allocate %zd members of size "
--                       "%zd\n", n, unit);
--   return 0;
--}
--
--#define CeedHackRealloc(n, p) CeedHackReallocArray((n), sizeof(**(p)), p)
--
--int CeedHackFree(void *p)
--{
--   free(*(void **)p);
--   *(void **)p = NULL;
--   return 0;
--}
--
--int CeedSingleOperatorFullAssemble(CeedOperator op, SparseMatrix *out)
--{
--   int ierr;
--   Ceed ceed;
--   ierr = CeedOperatorGetCeed(op, &ceed); CeedChk(ierr);
--
--   // Assemble QFunction
--   CeedQFunction qf;
--   ierr = CeedOperatorGetQFunction(op, &qf); CeedChk(ierr);
--   CeedInt numinputfields, numoutputfields;
--   CeedChk(ierr);
--   CeedVector assembledqf;
--   CeedElemRestriction rstr_q;
--   ierr = CeedOperatorLinearAssembleQFunction(
--             op, &assembledqf, &rstr_q, CEED_REQUEST_IMMEDIATE); CeedChk(ierr);
--
--   CeedSize qflength;
--   ierr = CeedVectorGetLength(assembledqf, &qflength); CeedChk(ierr);
--
--   CeedOperatorField *input_fields;
--   CeedOperatorField *output_fields;
--   ierr = CeedOperatorGetFields(op, &numinputfields, &input_fields,
--                                &numoutputfields, &output_fields);
--   CeedChk(ierr);
--
--   // Determine active input basis
--   CeedQFunctionField *qffields;
--   ierr = CeedQFunctionGetFields(qf, &numinputfields, &qffields,
--                                 &numoutputfields, NULL);
--   CeedChk(ierr);
--   CeedInt numemodein = 0, ncomp, dim = 1;
--   CeedEvalMode *emodein = NULL;
--   CeedBasis basisin = NULL;
--   CeedElemRestriction rstrin = NULL;
--   for (CeedInt i=0; i<numinputfields; i++)
--   {
--      CeedVector vec;
--      ierr = CeedOperatorFieldGetVector(input_fields[i], &vec); CeedChk(ierr);
--      if (vec == CEED_VECTOR_ACTIVE)
--      {
--         ierr = CeedOperatorFieldGetBasis(input_fields[i], &basisin);
--         CeedChk(ierr);
--         ierr = CeedBasisGetNumComponents(basisin, &ncomp); CeedChk(ierr);
--         ierr = CeedBasisGetDimension(basisin, &dim); CeedChk(ierr);
--         ierr = CeedOperatorFieldGetElemRestriction(input_fields[i], &rstrin);
--         CeedChk(ierr);
--         CeedEvalMode emode;
--         ierr = CeedQFunctionFieldGetEvalMode(qffields[i], &emode);
--         CeedChk(ierr);
--         switch (emode)
--         {
--            case CEED_EVAL_NONE:
--            case CEED_EVAL_INTERP:
--               ierr = CeedHackRealloc(numemodein + 1, &emodein); CeedChk(ierr);
--               emodein[numemodein] = emode;
--               numemodein += 1;
--               break;
--            case CEED_EVAL_GRAD:
--               ierr = CeedHackRealloc(numemodein + dim, &emodein); CeedChk(ierr);
--               for (CeedInt d=0; d<dim; d++)
--               {
--                  emodein[numemodein+d] = emode;
--               }
--               numemodein += dim;
--               break;
--            case CEED_EVAL_WEIGHT:
--            case CEED_EVAL_DIV:
--            case CEED_EVAL_CURL:
--               break; // Caught by QF Assembly
--         }
--      }
--   }
--
--   // Determine active output basis
--   ierr = CeedQFunctionGetFields(qf, &numinputfields, NULL, &numoutputfields,
--                                 &qffields); CeedChk(ierr);
--   CeedInt numemodeout = 0;
--   CeedEvalMode *emodeout = NULL;
--   CeedBasis basisout = NULL;
--   CeedElemRestriction rstrout = NULL;
--   for (CeedInt i=0; i<numoutputfields; i++)
--   {
--      CeedVector vec;
--      ierr = CeedOperatorFieldGetVector(output_fields[i], &vec); CeedChk(ierr);
--      if (vec == CEED_VECTOR_ACTIVE)
--      {
--         ierr = CeedOperatorFieldGetBasis(output_fields[i], &basisout);
--         CeedChk(ierr);
--         ierr = CeedOperatorFieldGetElemRestriction(output_fields[i], &rstrout);
--         CeedChk(ierr);
--         CeedChk(ierr);
--         CeedEvalMode emode;
--         ierr = CeedQFunctionFieldGetEvalMode(qffields[i], &emode);
--         CeedChk(ierr);
--         switch (emode)
--         {
--            case CEED_EVAL_NONE:
--            case CEED_EVAL_INTERP:
--               ierr = CeedHackRealloc(numemodeout + 1, &emodeout); CeedChk(ierr);
--               emodeout[numemodeout] = emode;
--               numemodeout += 1;
--               break;
--            case CEED_EVAL_GRAD:
--               ierr = CeedHackRealloc(numemodeout + dim, &emodeout); CeedChk(ierr);
--               for (CeedInt d=0; d<dim; d++)
--               {
--                  emodeout[numemodeout+d] = emode;
--               }
--               numemodeout += dim;
--               break;
--            case CEED_EVAL_WEIGHT:
--            case CEED_EVAL_DIV:
--            case CEED_EVAL_CURL:
--               break; // Caught by QF Assembly
--         }
--      }
--   }
--
--   CeedInt nelem, elemsize, nqpts;
--   CeedSize nnodes;
--   ierr = CeedElemRestrictionGetNumElements(rstrin, &nelem); CeedChk(ierr);
--   ierr = CeedElemRestrictionGetElementSize(rstrin, &elemsize); CeedChk(ierr);
--   ierr = CeedElemRestrictionGetLVectorSize(rstrin, &nnodes); CeedChk(ierr);
--   ierr = CeedBasisGetNumQuadraturePoints(basisin, &nqpts); CeedChk(ierr);
--
--   // Determine elem_dof relation
--   CeedVector index_vec;
--   ierr = CeedVectorCreate(ceed, nnodes, &index_vec); CeedChk(ierr);
--   CeedScalar *array;
--   ierr = CeedVectorGetArrayWrite(index_vec, CEED_MEM_HOST, &array);
--   CeedChk(ierr);
--   for (CeedSize i = 0; i < nnodes; ++i)
--   {
--      array[i] = i;
--   }
--   ierr = CeedVectorRestoreArray(index_vec, &array); CeedChk(ierr);
--   CeedVector elem_dof;
--   ierr = CeedVectorCreate(ceed, nelem * elemsize, &elem_dof); CeedChk(ierr);
--   ierr = CeedVectorSetValue(elem_dof, 0.0); CeedChk(ierr);
--   CeedElemRestrictionApply(rstrin, CEED_NOTRANSPOSE, index_vec,
--                            elem_dof, CEED_REQUEST_IMMEDIATE); CeedChk(ierr);
--   const CeedScalar * elem_dof_a;
--   ierr = CeedVectorGetArrayRead(elem_dof, CEED_MEM_HOST, &elem_dof_a);
--   CeedChk(ierr);
--   ierr = CeedVectorDestroy(&index_vec); CeedChk(ierr);
--
--   // loop over elements and put in SparseMatrix
--   // SparseMatrix * out = new SparseMatrix(nnodes, nnodes);
--   MFEM_ASSERT(out->Height() == nnodes, "Sizes don't match!");
--   MFEM_ASSERT(out->Width() == nnodes, "Sizes don't match!");
--   const CeedScalar *interpin, *gradin;
--   ierr = CeedBasisGetInterp(basisin, &interpin); CeedChk(ierr);
--   ierr = CeedBasisGetGrad(basisin, &gradin); CeedChk(ierr);
--
--   const CeedScalar * assembledqfarray;
--   ierr = CeedVectorGetArrayRead(assembledqf, CEED_MEM_HOST, &assembledqfarray);
--   CeedChk(ierr);
--
--   CeedInt layout[3];
--   ierr = CeedElemRestrictionGetELayout(rstr_q, &layout); CeedChk(ierr);
--   ierr = CeedElemRestrictionDestroy(&rstr_q); CeedChk(ierr);
--
--   // enforce structurally symmetric for later elimination
--   const int skip_zeros = 0;
--   MFEM_ASSERT(numemodein == numemodeout,
--               "Ceed full assembly not implemented for this case.");
--   for (int e = 0; e < nelem; ++e)
--   {
--      // get Array<int> for use in SparseMatrix::AddSubMatrix()
--      Array<int> rows(elemsize);
--      for (int i = 0; i < elemsize; ++i)
--      {
--         rows[i] = elem_dof_a[e * elemsize + i];
--      }
--
--      // form element matrix itself
--      DenseMatrix Bmat(nqpts * numemodein, elemsize);
--      Bmat = 0.0;
--      // Store block-diagonal D matrix as collection of small dense blocks
--      DenseTensor Dmat(numemodeout, numemodein, nqpts);
--      Dmat = 0.0;
--      DenseMatrix elem_mat(elemsize, elemsize);
--      elem_mat = 0.0;
--      for (int q = 0; q < nqpts; ++q)
--      {
--         for (int n = 0; n < elemsize; ++n)
--         {
--            CeedInt din = -1;
--            for (int ein = 0; ein < numemodein; ++ein)
--            {
--               if (emodein[ein] == CEED_EVAL_INTERP)
--               {
--                  Bmat(numemodein * q + ein, n) += interpin[q * elemsize + n];
--               }
--               else if (emodein[ein] == CEED_EVAL_GRAD)
--               {
--                  din += 1;
--                  Bmat(numemodein * q + ein, n) += gradin[(din*nqpts+q) * elemsize + n];
--               }
--               else
--               {
--                  MFEM_ASSERT(false, "Not implemented!");
--               }
--            }
--         }
--         for (int ei = 0; ei < numemodein; ++ei)
--         {
--            for (int ej = 0; ej < numemodein; ++ej)
--            {
--               const int comp = ei * numemodein + ej;
--               const int index = q*layout[0] + comp*layout[1] + e*layout[2];
--               Dmat(ei, ej, q) += assembledqfarray[index];
--            }
--         }
--      }
--      DenseMatrix BTD(elemsize, nqpts*numemodein);
--      // Compute B^T*D
--      BTD = 0.0;
--      for (int j=0; j<elemsize; ++j)
--      {
--         for (int q=0; q<nqpts; ++q)
--         {
--            int qq = numemodein*q;
--            for (int ei = 0; ei < numemodein; ++ei)
--            {
--               for (int ej = 0; ej < numemodein; ++ej)
--               {
--                  BTD(j,qq+ei) += Bmat(qq+ej,j)*Dmat(ej,ei,q);
--               }
--            }
--         }
--      }
--
--      Mult(BTD, Bmat, elem_mat);
--
--      // put element matrix in sparsemat
--      out->AddSubMatrix(rows, rows, elem_mat, skip_zeros);
--   }
--
--   ierr = CeedVectorRestoreArrayRead(elem_dof, &elem_dof_a); CeedChk(ierr);
--   ierr = CeedVectorDestroy(&elem_dof); CeedChk(ierr);
--   ierr = CeedVectorRestoreArrayRead(assembledqf, &assembledqfarray);
--   CeedChk(ierr);
--   ierr = CeedVectorDestroy(&assembledqf); CeedChk(ierr);
--   ierr = CeedHackFree(&emodein); CeedChk(ierr);
--   ierr = CeedHackFree(&emodeout); CeedChk(ierr);
--
--   return 0;
--}
--
--int CeedOperatorFullAssemble(CeedOperator op, SparseMatrix **mat)
--{
--   int ierr;
--
--   CeedSize in_len, out_len;
--   ierr = CeedOperatorGetActiveVectorLengths(op, &in_len, &out_len);
--   CeedChk(ierr);
--   const int nnodes = in_len;
--   MFEM_VERIFY(in_len == out_len, "not a square CeedOperator");
--   MFEM_VERIFY(in_len == nnodes, "size overflow");
--
--   SparseMatrix *out = new SparseMatrix(nnodes, nnodes);
--
--   bool isComposite;
--   ierr = CeedOperatorIsComposite(op, &isComposite); CeedChk(ierr);
--   if (isComposite)
--   {
--      CeedInt numsub;
--      CeedOperator *subops;
--#if CEED_VERSION_GE(0, 10, 2)
--      CeedCompositeOperatorGetNumSub(op, &numsub);
--      ierr = CeedCompositeOperatorGetSubList(op, &subops); CeedChk(ierr);
--#else
--      CeedOperatorGetNumSub(op, &numsub);
--      ierr = CeedOperatorGetSubList(op, &subops); CeedChk(ierr);
--#endif
--      for (int i = 0; i < numsub; ++i)
--      {
--         ierr = CeedSingleOperatorFullAssemble(subops[i], out); CeedChk(ierr);
--      }
--   }
--   else
--   {
--      ierr = CeedSingleOperatorFullAssemble(op, out); CeedChk(ierr);
--   }
--   // enforce structurally symmetric for later elimination
--   const int skip_zeros = 0;
--   out->Finalize(skip_zeros);
--   *mat = out;
--
--   return 0;
--}
--
--} // namespace ceed
--
--} // namespace mfem
--
--#endif
-diff --git a/fem/ceed/solvers/solvers-atpmg.cpp b/fem/ceed/solvers/solvers-atpmg.cpp
-index 8f4be00b2..052f21927 100644
---- a/fem/ceed/solvers/solvers-atpmg.cpp
-+++ b/fem/ceed/solvers/solvers-atpmg.cpp
-@@ -11,15 +11,13 @@
- 
- #include "solvers-atpmg.hpp"
- 
--#include "../interface/ceed.hpp"
-+#include <math.h>
- #include "../interface/util.hpp"
--
- #ifdef MFEM_USE_CEED
- #include <ceed/backend.h>
-+#endif
- 
--#include <math.h>
--// todo: should probably use Ceed memory wrappers instead of calloc/free?
--#include <stdlib.h>
-+#ifdef MFEM_USE_CEED
- 
- namespace mfem
- {
-@@ -86,17 +84,17 @@ int CeedATPMGElemRestriction(int order,
- {
-    int ierr;
-    Ceed ceed;
--   ierr = CeedElemRestrictionGetCeed(er_in, &ceed); CeedChk(ierr);
-+   ierr = CeedElemRestrictionGetCeed(er_in, &ceed); PCeedChk(ierr);
- 
-    CeedInt numelem, numcomp, elemsize;
-    CeedSize numnodes;
--   ierr = CeedElemRestrictionGetNumElements(er_in, &numelem); CeedChk(ierr);
--   ierr = CeedElemRestrictionGetLVectorSize(er_in, &numnodes); CeedChk(ierr);
--   ierr = CeedElemRestrictionGetElementSize(er_in, &elemsize); CeedChk(ierr);
--   ierr = CeedElemRestrictionGetNumComponents(er_in, &numcomp); CeedChk(ierr);
-+   ierr = CeedElemRestrictionGetNumElements(er_in, &numelem); PCeedChk(ierr);
-+   ierr = CeedElemRestrictionGetLVectorSize(er_in, &numnodes); PCeedChk(ierr);
-+   ierr = CeedElemRestrictionGetElementSize(er_in, &elemsize); PCeedChk(ierr);
-+   ierr = CeedElemRestrictionGetNumComponents(er_in, &numcomp); PCeedChk(ierr);
-    if (numcomp != 1)
-    {
--      // todo: multi-component will require more thought
-+      // TODO: multi-component will require more thought
-       return CeedError(ceed, 1, "Algebraic element restriction not "
-                        "implemented for multiple components.");
-    }
-@@ -107,31 +105,31 @@ int CeedATPMGElemRestriction(int order,
- 
-    CeedVector in_lvec, in_evec;
-    ierr = CeedElemRestrictionCreateVector(er_in, &in_lvec, &in_evec);
--   CeedChk(ierr);
-+   PCeedChk(ierr);
- 
-    // Create the elem_dof array from the given high-order ElemRestriction
-    // by using it to map the L-vector indices to an E-vector
-    CeedScalar * lvec_data;
-    ierr = CeedVectorGetArrayWrite(in_lvec, CEED_MEM_HOST, &lvec_data);
--   CeedChk(ierr);
-+   PCeedChk(ierr);
-    for (CeedSize i = 0; i < numnodes; ++i)
-    {
-       lvec_data[i] = (CeedScalar) i;
-    }
--   ierr = CeedVectorRestoreArray(in_lvec, &lvec_data); CeedChk(ierr);
-+   ierr = CeedVectorRestoreArray(in_lvec, &lvec_data); PCeedChk(ierr);
-    CeedInt in_layout[3];
--   ierr = CeedElemRestrictionGetELayout(er_in, &in_layout); CeedChk(ierr);
-+   ierr = CeedElemRestrictionGetELayout(er_in, &in_layout); PCeedChk(ierr);
-    if (in_layout[0] == 0 && in_layout[1] == 0 && in_layout[2] == 0)
-    {
-       return CeedError(ceed, 1, "Cannot interpret e-vector ordering of given"
-                        "CeedElemRestriction!");
-    }
-    ierr = CeedElemRestrictionApply(er_in, CEED_NOTRANSPOSE, in_lvec, in_evec,
--                                   CEED_REQUEST_IMMEDIATE); CeedChk(ierr);
--   ierr = CeedVectorDestroy(&in_lvec); CeedChk(ierr);
-+                                   CEED_REQUEST_IMMEDIATE); PCeedChk(ierr);
-+   ierr = CeedVectorDestroy(&in_lvec); PCeedChk(ierr);
-    const CeedScalar * in_elem_dof;
-    ierr = CeedVectorGetArrayRead(in_evec, CEED_MEM_HOST, &in_elem_dof);
--   CeedChk(ierr);
-+   PCeedChk(ierr);
- 
-    // Create a map (dof_map) that maps high-order ldof indices to
-    // low-order ldof indices, with -1 indicating no correspondence
-@@ -469,13 +467,13 @@ int CeedATPMGElemRestriction(int order,
-                        "CeedATPMGElemRestriction does not yet support this dimension.");
-    }
- 
--   ierr = CeedVectorRestoreArrayRead(in_evec, &in_elem_dof); CeedChk(ierr);
--   ierr = CeedVectorDestroy(&in_evec); CeedChk(ierr);
-+   ierr = CeedVectorRestoreArrayRead(in_evec, &in_elem_dof); PCeedChk(ierr);
-+   ierr = CeedVectorDestroy(&in_evec); PCeedChk(ierr);
- 
-    ierr = CeedElemRestrictionCreate(ceed, numelem, coarse_elemsize, numcomp,
-                                     0, running_out_ldof_count,
-                                     CEED_MEM_HOST, CEED_COPY_VALUES, out_elem_dof,
--                                    er_out); CeedChk(ierr);
-+                                    er_out); PCeedChk(ierr);
- 
-    delete [] out_elem_dof;
- 
-@@ -491,7 +489,7 @@ int CeedBasisATPMGCoarseToFine(Ceed ceed, int P1d, int dim, int order_reduction,
-    // calling the following Ceed function)
-    int ierr;
-    ierr = CeedBasisCreateTensorH1Lagrange(ceed, dim, 1, P1d - order_reduction, P1d,
--                                          CEED_GAUSS_LOBATTO, basisc2f); CeedChk(ierr);
-+                                          CEED_GAUSS_LOBATTO, basisc2f); PCeedChk(ierr);
-    return 0;
- }
- 
-@@ -501,13 +499,13 @@ int CeedBasisATPMGCoarseToFine(CeedBasis basisin,
- {
-    int ierr;
-    Ceed ceed;
--   ierr = CeedBasisGetCeed(basisin, &ceed); CeedChk(ierr);
-+   ierr = CeedBasisGetCeed(basisin, &ceed); PCeedChk(ierr);
- 
-    CeedInt dim, P1d;
--   ierr = CeedBasisGetDimension(basisin, &dim); CeedChk(ierr);
--   ierr = CeedBasisGetNumNodes1D(basisin, &P1d); CeedChk(ierr);
-+   ierr = CeedBasisGetDimension(basisin, &dim); PCeedChk(ierr);
-+   ierr = CeedBasisGetNumNodes1D(basisin, &P1d); PCeedChk(ierr);
-    ierr = CeedBasisATPMGCoarseToFine(ceed, P1d, dim, order_reduction,
--                                     basisc2f); CeedChk(ierr);
-+                                     basisc2f); PCeedChk(ierr);
-    return 0;
- }
- 
-@@ -518,38 +516,38 @@ int CeedBasisATPMGCoarsen(CeedBasis basisin,
- {
-    int ierr;
-    Ceed ceed;
--   ierr = CeedBasisGetCeed(basisin, &ceed); CeedChk(ierr);
-+   ierr = CeedBasisGetCeed(basisin, &ceed); PCeedChk(ierr);
- 
-    CeedInt dim, ncomp, P1d, Q1d;
--   ierr = CeedBasisGetDimension(basisin, &dim); CeedChk(ierr);
--   ierr = CeedBasisGetNumComponents(basisin, &ncomp); CeedChk(ierr);
--   ierr = CeedBasisGetNumNodes1D(basisin, &P1d); CeedChk(ierr);
--   ierr = CeedBasisGetNumQuadraturePoints1D(basisin, &Q1d); CeedChk(ierr);
-+   ierr = CeedBasisGetDimension(basisin, &dim); PCeedChk(ierr);
-+   ierr = CeedBasisGetNumComponents(basisin, &ncomp); PCeedChk(ierr);
-+   ierr = CeedBasisGetNumNodes1D(basisin, &P1d); PCeedChk(ierr);
-+   ierr = CeedBasisGetNumQuadraturePoints1D(basisin, &Q1d); PCeedChk(ierr);
- 
-    CeedInt coarse_P1d = P1d - order_reduction;
- 
-    const CeedScalar *interp1d;
--   ierr = CeedBasisGetInterp1D(basisin, &interp1d); CeedChk(ierr);
-+   ierr = CeedBasisGetInterp1D(basisin, &interp1d); PCeedChk(ierr);
-    const CeedScalar * grad1d;
--   ierr = CeedBasisGetGrad1D(basisin, &grad1d); CeedChk(ierr);
-+   ierr = CeedBasisGetGrad1D(basisin, &grad1d); PCeedChk(ierr);
- 
-    CeedScalar * coarse_interp1d = new CeedScalar[coarse_P1d * Q1d];
-    CeedScalar * coarse_grad1d = new CeedScalar[coarse_P1d * Q1d];
-    CeedScalar * fine_nodal_points = new CeedScalar[P1d];
- 
-    // these things are in [-1, 1], not [0, 1], which matters
--   // (todo: how can we determine this or something related, algebraically?)
-+   // (TODO: how can we determine this or something related, algebraically?)
-    /* one way you might be able to tell is to just run this algorithm
-       with coarse_P1d = 2 (i.e., linear) and look for symmetry in the coarse
-       basis matrix? */
--   ierr = CeedLobattoQuadrature(P1d, fine_nodal_points, NULL); CeedChk(ierr);
-+   ierr = CeedLobattoQuadrature(P1d, fine_nodal_points, NULL); PCeedChk(ierr);
-    for (int i = 0; i < P1d; ++i)
-    {
-       fine_nodal_points[i] = 0.5 * fine_nodal_points[i] + 0.5; // cheating
-    }
- 
-    const CeedScalar *interp_ctof;
--   ierr = CeedBasisGetInterp1D(basisc2f, &interp_ctof); CeedChk(ierr);
-+   ierr = CeedBasisGetInterp1D(basisc2f, &interp_ctof); PCeedChk(ierr);
- 
-    for (int i = 0; i < Q1d; ++i)
-    {
-@@ -568,12 +566,12 @@ int CeedBasisATPMGCoarsen(CeedBasis basisin,
-    }
- 
-    const CeedScalar * qref1d;
--   ierr = CeedBasisGetQRef(basisin, &qref1d); CeedChk(ierr);
-+   ierr = CeedBasisGetQRef(basisin, &qref1d); PCeedChk(ierr);
-    const CeedScalar * qweight1d;
--   ierr = CeedBasisGetQWeights(basisin, &qweight1d); CeedChk(ierr);
-+   ierr = CeedBasisGetQWeights(basisin, &qweight1d); PCeedChk(ierr);
-    ierr = CeedBasisCreateTensorH1(ceed, dim, ncomp,
-                                   coarse_P1d, Q1d, coarse_interp1d, coarse_grad1d,
--                                  qref1d, qweight1d, basisout); CeedChk(ierr);
-+                                  qref1d, qweight1d, basisout); PCeedChk(ierr);
- 
-    delete [] fine_nodal_points;
-    delete [] coarse_interp1d;
-@@ -593,19 +591,19 @@ int CeedATPMGOperator(CeedOperator oper, int order_reduction,
- 
-    int ierr;
-    Ceed ceed;
--   ierr = CeedOperatorGetCeed(oper, &ceed); CeedChk(ierr);
-+   ierr = CeedOperatorGetCeed(oper, &ceed); PCeedChk(ierr);
- 
-    CeedQFunction qf;
--   ierr = CeedOperatorGetQFunction(oper, &qf); CeedChk(ierr);
-+   ierr = CeedOperatorGetQFunction(oper, &qf); PCeedChk(ierr);
-    CeedInt numinputfields, numoutputfields;
-    CeedQFunctionField *inputqfields, *outputqfields;
-    ierr = CeedQFunctionGetFields(qf, &numinputfields, &inputqfields,
-                                  &numoutputfields, &outputqfields);
--   CeedChk(ierr);
-+   PCeedChk(ierr);
-    CeedOperatorField *inputfields, *outputfields;
-    ierr = CeedOperatorGetFields(oper, &numinputfields, &inputfields,
-                                 &numoutputfields, &outputfields);
--   CeedChk(ierr);
-+   PCeedChk(ierr);
- 
-    CeedElemRestriction * er_input = new CeedElemRestriction[numinputfields];
-    CeedElemRestriction * er_output = new CeedElemRestriction[numoutputfields];
-@@ -619,10 +617,11 @@ int CeedATPMGOperator(CeedOperator oper, int order_reduction,
-    for (int i = 0; i < numinputfields; ++i)
-    {
-       ierr = CeedOperatorFieldGetElemRestriction(inputfields[i],
--                                                 &er_input[i]); CeedChk(ierr);
--      ierr = CeedOperatorFieldGetVector(inputfields[i], &if_vector[i]); CeedChk(ierr);
-+                                                 &er_input[i]); PCeedChk(ierr);
-+      ierr = CeedOperatorFieldGetVector(inputfields[i], &if_vector[i]);
-+      PCeedChk(ierr);
-       ierr = CeedOperatorFieldGetBasis(inputfields[i], &basis_input[i]);
--      CeedChk(ierr);
-+      PCeedChk(ierr);
-       if (if_vector[i] == CEED_VECTOR_ACTIVE)
-       {
-          if (active_input_basis < 0)
-@@ -638,11 +637,11 @@ int CeedATPMGOperator(CeedOperator oper, int order_reduction,
-    for (int i = 0; i < numoutputfields; ++i)
-    {
-       ierr = CeedOperatorFieldGetElemRestriction(outputfields[i],
--                                                 &er_output[i]); CeedChk(ierr);
-+                                                 &er_output[i]); PCeedChk(ierr);
-       ierr = CeedOperatorFieldGetVector(outputfields[i], &of_vector[i]);
--      CeedChk(ierr);
-+      PCeedChk(ierr);
-       ierr = CeedOperatorFieldGetBasis(outputfields[i], &basis_output[i]);
--      CeedChk(ierr);
-+      PCeedChk(ierr);
-       if (of_vector[i] == CEED_VECTOR_ACTIVE)
-       {
-          // should already be coarsened
-@@ -659,36 +658,36 @@ int CeedATPMGOperator(CeedOperator oper, int order_reduction,
- 
-    CeedOperator coper;
-    ierr = CeedOperatorCreate(ceed, qf, CEED_QFUNCTION_NONE, CEED_QFUNCTION_NONE,
--                             &coper); CeedChk(ierr);
-+                             &coper); PCeedChk(ierr);
- 
-    for (int i = 0; i < numinputfields; ++i)
-    {
-       char * fieldname;
--      ierr = CeedQFunctionFieldGetName(inputqfields[i], &fieldname); CeedChk(ierr);
-+      ierr = CeedQFunctionFieldGetName(inputqfields[i], &fieldname); PCeedChk(ierr);
-       if (if_vector[i] == CEED_VECTOR_ACTIVE)
-       {
-          ierr = CeedOperatorSetField(coper, fieldname, coarse_er, cbasis,
--                                     if_vector[i]); CeedChk(ierr);
-+                                     if_vector[i]); PCeedChk(ierr);
-       }
-       else
-       {
-          ierr = CeedOperatorSetField(coper, fieldname, er_input[i], basis_input[i],
--                                     if_vector[i]); CeedChk(ierr);
-+                                     if_vector[i]); PCeedChk(ierr);
-       }
-    }
-    for (int i = 0; i < numoutputfields; ++i)
-    {
-       char * fieldname;
--      ierr = CeedQFunctionFieldGetName(outputqfields[i], &fieldname); CeedChk(ierr);
-+      ierr = CeedQFunctionFieldGetName(outputqfields[i], &fieldname); PCeedChk(ierr);
-       if (of_vector[i] == CEED_VECTOR_ACTIVE)
-       {
-          ierr = CeedOperatorSetField(coper, fieldname, coarse_er, cbasis,
--                                     of_vector[i]); CeedChk(ierr);
-+                                     of_vector[i]); PCeedChk(ierr);
-       }
-       else
-       {
-          ierr = CeedOperatorSetField(coper, fieldname, er_output[i], basis_output[i],
--                                     of_vector[i]); CeedChk(ierr);
-+                                     of_vector[i]); PCeedChk(ierr);
-       }
-    }
-    delete [] er_input;
-@@ -711,21 +710,21 @@ int CeedATPMGOperator(CeedOperator oper, int order_reduction,
-    int ierr;
- 
-    CeedQFunction qf;
--   ierr = CeedOperatorGetQFunction(oper, &qf); CeedChk(ierr);
-+   ierr = CeedOperatorGetQFunction(oper, &qf); PCeedChk(ierr);
-    CeedInt numinputfields, numoutputfields;
-    CeedOperatorField *inputfields;
-    ierr = CeedOperatorGetFields(oper, &numinputfields, &inputfields,
-                                 &numoutputfields, NULL);
--   CeedChk(ierr);
-+   PCeedChk(ierr);
- 
-    CeedBasis basis;
--   ierr = CeedOperatorGetActiveBasis(oper, &basis); CeedChk(ierr);
-+   ierr = CeedOperatorGetActiveBasis(oper, &basis); PCeedChk(ierr);
-    ierr = CeedBasisATPMGCoarseToFine(basis, basis_ctof_out, order_reduction);
--   CeedChk(ierr);
-+   PCeedChk(ierr);
-    ierr = CeedBasisATPMGCoarsen(basis, *basis_ctof_out, coarse_basis_out,
--                                order_reduction); CeedChk(ierr);
-+                                order_reduction); PCeedChk(ierr);
-    ierr = CeedATPMGOperator(oper, order_reduction, coarse_er, *coarse_basis_out,
--                            *basis_ctof_out, out); CeedChk(ierr);
-+                            *basis_ctof_out, out); PCeedChk(ierr);
-    return 0;
- }
- 
-@@ -734,11 +733,11 @@ int CeedOperatorGetOrder(CeedOperator oper, CeedInt * order)
-    int ierr;
- 
-    CeedOperatorField active_field;
--   ierr = CeedOperatorGetActiveField(oper, &active_field); CeedChk(ierr);
-+   ierr = CeedOperatorGetActiveField(oper, &active_field); PCeedChk(ierr);
-    CeedBasis basis;
--   ierr = CeedOperatorFieldGetBasis(active_field, &basis); CeedChk(ierr);
-+   ierr = CeedOperatorFieldGetBasis(active_field, &basis); PCeedChk(ierr);
-    int P1d;
--   ierr = CeedBasisGetNumNodes1D(basis, &P1d); CeedChk(ierr);
-+   ierr = CeedBasisGetNumNodes1D(basis, &P1d); PCeedChk(ierr);
-    *order = P1d - 1;
- 
-    return 0;
-@@ -753,13 +752,13 @@ int CeedATPMGBundle(CeedOperator oper, int order_reduction,
- {
-    int ierr;
-    CeedInt order;
--   ierr = CeedOperatorGetOrder(oper, &order); CeedChk(ierr);
-+   ierr = CeedOperatorGetOrder(oper, &order); PCeedChk(ierr);
-    CeedElemRestriction ho_er;
--   ierr = CeedOperatorGetActiveElemRestriction(oper, &ho_er); CeedChk(ierr);
-+   ierr = CeedOperatorGetActiveElemRestriction(oper, &ho_er); PCeedChk(ierr);
-    ierr = CeedATPMGElemRestriction(order, order_reduction, ho_er, er_out, dof_map);
--   CeedChk(ierr);
-+   PCeedChk(ierr);
-    ierr = CeedATPMGOperator(oper, order_reduction, *er_out, coarse_basis_out,
--                            basis_ctof_out, coarse_oper); CeedChk(ierr);
-+                            basis_ctof_out, coarse_oper); PCeedChk(ierr);
-    return 0;
- }
- 
-diff --git a/fem/ceed/solvers/solvers-atpmg.hpp b/fem/ceed/solvers/solvers-atpmg.hpp
-index 8d85b1840..62a1640fe 100644
---- a/fem/ceed/solvers/solvers-atpmg.hpp
-+++ b/fem/ceed/solvers/solvers-atpmg.hpp
-@@ -25,7 +25,7 @@ namespace ceed
- /** @brief Take given (high-order) CeedElemRestriction and make a new
-     CeedElemRestriction, which corresponds to a lower-order problem.
- 
--    Assumes a Gauss-Lobatto basis and tensor product elements, and assumes that
-+    Assumes a Gauss-Lobatto basis and tensor-product elements, and assumes that
-     the nodes in er_in are ordered in a tensor-product way.
- 
-     This is a setup routine that operates on the host.
-diff --git a/fem/fe/fe_base.cpp b/fem/fe/fe_base.cpp
-index 6f665bfa3..ded2ffc9c 100644
---- a/fem/fe/fe_base.cpp
-+++ b/fem/fe/fe_base.cpp
-@@ -401,7 +401,7 @@ const DofToQuad &FiniteElement::GetDofToQuad(const IntegrationRule &ir,
-          }
-       }
-    }
--   else
-+   else if (range_type == VECTOR)
-    {
-       d2q->B.SetSize(nqpt*dim*dof);
-       d2q->Bt.SetSize(dof*nqpt*dim);
-@@ -419,6 +419,10 @@ const DofToQuad &FiniteElement::GetDofToQuad(const IntegrationRule &ir,
-          }
-       }
-    }
-+   else
-+   {
-+      // Skip B and Bt for unknown range type
-+   }
-    switch (deriv_type)
-    {
-       case GRAD:
-@@ -472,7 +476,7 @@ const DofToQuad &FiniteElement::GetDofToQuad(const IntegrationRule &ir,
-             {
-                for (int j = 0; j < dof; j++)
-                {
--                  d2q->G[i+nqpt*(d+dim*j)] = d2q->Gt[j+dof*(i+nqpt*d)] = curlshape(j, d);
-+                  d2q->G[i+nqpt*(d+cdim*j)] = d2q->Gt[j+dof*(i+nqpt*d)] = curlshape(j, d);
-                }
-             }
-          }
-@@ -480,7 +484,8 @@ const DofToQuad &FiniteElement::GetDofToQuad(const IntegrationRule &ir,
-       }
-       case NONE:
-       default:
--         MFEM_ABORT("invalid finite element derivative type");
-+         // Skip G and Gt for unknown derivative type
-+         break;
-    }
-    dof2quad_array.Append(d2q);
-    return *d2q;
-diff --git a/fem/fe/fe_base.hpp b/fem/fe/fe_base.hpp
-index b533525f0..6582140df 100644
---- a/fem/fe/fe_base.hpp
-+++ b/fem/fe/fe_base.hpp
-@@ -1288,9 +1288,9 @@ public:
-    const DofToQuad &GetDofToQuad(const IntegrationRule &ir,
-                                  DofToQuad::Mode mode) const override
-    {
--      MFEM_VERIFY(mode != DofToQuad::FULL, "invalid mode requested");
--      return GetTensorDofToQuad(*this, ir, mode, basis1d, true,
--                                dof2quad_array);
-+      return (mode == DofToQuad::FULL) ?
-+             FiniteElement::GetDofToQuad(ir, mode) :
-+             GetTensorDofToQuad(*this, ir, mode, basis1d, true, dof2quad_array);
-    }
- 
-    const DofToQuad &GetDofToQuadOpen(const IntegrationRule &ir,
-diff --git a/fem/fespace.cpp b/fem/fespace.cpp
-index cb82c6008..59b4b7e31 100644
---- a/fem/fespace.cpp
-+++ b/fem/fespace.cpp
-@@ -1599,7 +1599,7 @@ void FiniteElementSpace::RefinementOperator
- 
-       subY.SetSize(lP.Height());
- 
--      DofTransformation *doftrans = fespace->GetElementDofs(k, dofs);
-+      const DofTransformation *doftrans = fespace->GetElementDofs(k, dofs);
-       old_elem_dof->GetRow(emb.parent, old_dofs);
- 
-       if (!doftrans)
-@@ -1620,9 +1620,9 @@ void FiniteElementSpace::RefinementOperator
-          old_elem_fos->GetRow(emb.parent, old_Fo);
-          old_DoFTrans[geom]->SetFaceOrientations(old_Fo);
- 
--         DofTransformation *new_doftrans = NULL;
--         VDofTransformation *vdoftrans =
--            dynamic_cast<VDofTransformation*>(doftrans);
-+         const DofTransformation *new_doftrans = NULL;
-+         const VDofTransformation *vdoftrans =
-+            dynamic_cast<const VDofTransformation *>(doftrans);
-          if (vdoftrans)
-          {
-             new_doftrans = doftrans;
-@@ -1675,7 +1675,7 @@ void FiniteElementSpace::RefinementOperator
-       const Geometry::Type geom = mesh_ref->GetElementBaseGeometry(k);
-       const DenseMatrix &lP = localP[geom](emb.matrix);
- 
--      DofTransformation * doftrans = fespace->GetElementDofs(k, f_dofs);
-+      const DofTransformation * doftrans = fespace->GetElementDofs(k, f_dofs);
-       old_elem_dof->GetRow(emb.parent, c_dofs);
- 
-       if (!doftrans)
-@@ -1710,9 +1710,9 @@ void FiniteElementSpace::RefinementOperator
-          old_elem_fos->GetRow(emb.parent, old_Fo);
-          old_DoFTrans[geom]->SetFaceOrientations(old_Fo);
- 
--         DofTransformation *new_doftrans = NULL;
--         VDofTransformation *vdoftrans =
--            dynamic_cast<VDofTransformation*>(doftrans);
-+         const DofTransformation *new_doftrans = NULL;
-+         const VDofTransformation *vdoftrans =
-+            dynamic_cast<const VDofTransformation *>(doftrans);
-          if (vdoftrans)
-          {
-             new_doftrans = doftrans;
-diff --git a/fem/fespace.hpp b/fem/fespace.hpp
-index 00b290c09..588de9199 100644
---- a/fem/fespace.hpp
-+++ b/fem/fespace.hpp
-@@ -271,7 +271,7 @@ protected:
-    int own_ext;
-    mutable Array<int> face_to_be; // NURBS FE space only
- 
--   Array<DofTransformation*> DoFTrans;
-+   Array<DofTransformation *> DoFTrans;
-    mutable VDofTransformation VDoFTrans;
- 
-    /** Matrix representing the prolongation from the global conforming dofs to
-@@ -1268,11 +1268,9 @@ public:
- /// @brief Return true if the mesh contains only one topology and the elements are tensor elements.
- inline bool UsesTensorBasis(const FiniteElementSpace& fes)
- {
--   Mesh & mesh = *fes.GetMesh();
--   const bool mixed = mesh.GetNumGeometries(mesh.Dimension()) > 1;
--   // Potential issue: empty local mesh --> no element 0.
--   return !mixed &&
--          dynamic_cast<const mfem::TensorBasisElement *>(fes.GetFE(0))!=nullptr;
-+   Mesh &mesh = *fes.GetMesh();
-+   return mesh.GetNE() > 0 && mesh.GetNumGeometries(mesh.Dimension()) == 1 &&
-+          dynamic_cast<const mfem::TensorBasisElement *>(fes.GetFE(0)) != nullptr;
- }
- 
- }
-diff --git a/fem/hybridization.cpp b/fem/hybridization.cpp
-index f9d4699c2..06934064c 100644
---- a/fem/hybridization.cpp
-+++ b/fem/hybridization.cpp
-@@ -120,10 +120,10 @@ void Hybridization::ConstructC()
-             vdofs[s1+j] = o2 + j;
-          }
-          c_fes->GetFaceVDofs(i, c_vdofs);
--         c_bfi->AssembleFaceMatrix(*c_fes->GetFaceElement(i),
--                                   *fes->GetFE(FTr->Elem1No),
--                                   *fes->GetFE(FTr->Elem2No),
--                                   *FTr, elmat);
-+         c_bfi->AssembleFaceMatrix2(*c_fes->GetFaceElement(i),
-+                                    *fes->GetFE(FTr->Elem1No),
-+                                    *fes->GetFE(FTr->Elem2No),
-+                                    *FTr, elmat);
-          // zero-out small elements in elmat
-          elmat.Threshold(1e-12 * elmat.MaxMaxNorm());
-          Ct->AddSubMatrix(vdofs, c_vdofs, elmat, skip_zeros);
-@@ -165,7 +165,7 @@ void Hybridization::ConstructC()
-                vdofs[j] = o1 + j;
-             }
-             fe = fes->GetFE(FTr->Elem1No);
--            c_bfi->AssembleFaceMatrix(*face_fe, *fe, *fe, *FTr, elmat);
-+            c_bfi->AssembleFaceMatrix2(*face_fe, *fe, *fe, *FTr, elmat);
-             // zero-out small elements in elmat
-             elmat.Threshold(1e-12 * elmat.MaxMaxNorm());
-             Ct->AddSubMatrix(vdofs, c_vdofs, elmat, skip_zeros);
-diff --git a/fem/integ/bilininteg_br2.cpp b/fem/integ/bilininteg_br2.cpp
-index 159947029..846d72c7c 100644
---- a/fem/integ/bilininteg_br2.cpp
-+++ b/fem/integ/bilininteg_br2.cpp
-@@ -152,20 +152,7 @@ void DGDiffusionBR2Integrator::AssembleFaceMatrix(
-    elmat.SetSize(ndofs);
-    elmat = 0.0;
- 
--   const IntegrationRule *ir = IntRule;
--   if (ir == NULL)
--   {
--      int order;
--      if (ndof2)
--      {
--         order = 2*std::max(el1.GetOrder(), el2.GetOrder());
--      }
--      else
--      {
--         order = 2*el1.GetOrder();
--      }
--      ir = &IntRules.Get(Trans.FaceGeom, order);
--   }
-+   const IntegrationRule *ir = IntRule ? IntRule : &GetRule(el1, el2, Trans);
- 
-    for (int p = 0; p < ir->GetNPoints(); p++)
-    {
-diff --git a/fem/integ/bilininteg_convection_mf.cpp b/fem/integ/bilininteg_convection_mf.cpp
-index bbaf82788..c7078d407 100644
---- a/fem/integ/bilininteg_convection_mf.cpp
-+++ b/fem/integ/bilininteg_convection_mf.cpp
-@@ -19,31 +19,42 @@ namespace mfem
- 
- void ConvectionIntegrator::AssembleMF(const FiniteElementSpace &fes)
- {
--   // Assuming the same element type
-    Mesh *mesh = fes.GetMesh();
-    if (mesh->GetNE() == 0) { return; }
--   const FiniteElement &el = *fes.GetFE(0);
--   ElementTransformation &Trans = *fes.GetElementTransformation(0);
--   const IntegrationRule *ir = IntRule ? IntRule : &GetRule(el, Trans);
-    if (DeviceCanUseCeed())
-    {
-       delete ceedOp;
--      const bool mixed = mesh->GetNumGeometries(mesh->Dimension()) > 1 ||
--                         fes.IsVariableOrder();
--      if (mixed)
--      {
--         ceedOp = new ceed::MixedMFConvectionIntegrator(*this, fes, Q, alpha);
--      }
--      else
--      {
--         ceedOp = new ceed::MFConvectionIntegrator(fes, *ir, Q, alpha);
--      }
-+      ceedOp = new ceed::MFConvectionIntegrator(*this, fes, Q, alpha);
-       return;
-    }
-+
-+   // Assuming the same element type
-+   // const FiniteElement &el = *fes.GetFE(0);
-+   // ElementTransformation &T = *fes.GetElementTransformation(0);
-+   // const IntegrationRule *ir = IntRule ? IntRule : &GetRule(el, T);
-    MFEM_ABORT("Error: ConvectionIntegrator::AssembleMF only implemented with"
-               " libCEED");
- }
- 
-+void ConvectionIntegrator::AssembleMFBoundary(const FiniteElementSpace &fes)
-+{
-+   Mesh *mesh = fes.GetMesh();
-+   if (mesh->GetNBE() == 0) { return; }
-+   if (DeviceCanUseCeed())
-+   {
-+      delete ceedOp;
-+      ceedOp = new ceed::MFConvectionIntegrator(*this, fes, Q, alpha, true);
-+      return;
-+   }
-+
-+   // Assuming the same element type
-+   // const FiniteElement &el = *fes.GetBE(0);
-+   // ElementTransformation &T = *fes.GetBdrElementTransformation(0);
-+   // const IntegrationRule *ir = IntRule ? IntRule : &GetRule(el, T);
-+   MFEM_ABORT("Error: ConvectionIntegrator::AssembleMFBoundary only implemented with"
-+              " libCEED");
-+}
-+
- void ConvectionIntegrator::AssembleDiagonalMF(Vector &diag)
- {
-    if (DeviceCanUseCeed())
-diff --git a/fem/integ/bilininteg_convection_pa.cpp b/fem/integ/bilininteg_convection_pa.cpp
-index 25928f002..74156c726 100644
---- a/fem/integ/bilininteg_convection_pa.cpp
-+++ b/fem/integ/bilininteg_convection_pa.cpp
-@@ -116,26 +116,19 @@ void ConvectionIntegrator::AssemblePA(const FiniteElementSpace &fes)
- {
-    const MemoryType mt = (pa_mt == MemoryType::DEFAULT) ?
-                          Device::GetDeviceMemoryType() : pa_mt;
--   // Assumes tensor-product elements
-    Mesh *mesh = fes.GetMesh();
--   const FiniteElement &el = *fes.GetFE(0);
--   ElementTransformation &Trans = *fes.GetElementTransformation(0);
--   const IntegrationRule *ir = IntRule ? IntRule : &GetRule(el, Trans);
-+   if (mesh->GetNE() == 0) { return; }
-    if (DeviceCanUseCeed())
-    {
-       delete ceedOp;
--      const bool mixed = mesh->GetNumGeometries(mesh->Dimension()) > 1 ||
--                         fes.IsVariableOrder();
--      if (mixed)
--      {
--         ceedOp = new ceed::MixedPAConvectionIntegrator(*this, fes, Q, alpha);
--      }
--      else
--      {
--         ceedOp = new ceed::PAConvectionIntegrator(fes, *ir, Q, alpha);
--      }
-+      ceedOp = new ceed::PAConvectionIntegrator(*this, fes, Q, alpha);
-       return;
-    }
-+
-+   // Assumes tensor-product elements
-+   const FiniteElement &el = *fes.GetFE(0);
-+   ElementTransformation &T = *fes.GetElementTransformation(0);
-+   const IntegrationRule *ir = IntRule ? IntRule : &GetRule(el, T);
-    const int dims = el.GetDim();
-    const int symmDims = dims;
-    nq = ir->GetNPoints();
-@@ -166,6 +159,25 @@ void ConvectionIntegrator::AssemblePA(const FiniteElementSpace &fes)
-    }
- }
- 
-+void ConvectionIntegrator::AssemblePABoundary(const FiniteElementSpace &fes)
-+{
-+   Mesh *mesh = fes.GetMesh();
-+   if (mesh->GetNBE() == 0) { return; }
-+   if (DeviceCanUseCeed())
-+   {
-+      delete ceedOp;
-+      ceedOp = new ceed::PAConvectionIntegrator(*this, fes, Q, alpha, true);
-+      return;
-+   }
-+
-+   // Assuming the same element type
-+   // const FiniteElement &el = *fes.GetBE(0);
-+   // ElementTransformation &T = *fes.GetBdrElementTransformation(0);
-+   // const IntegrationRule *ir = IntRule ? IntRule : &GetRule(el, T);
-+   MFEM_ABORT("Error: ConvectionIntegrator::AssemblePABoundary only implemented with"
-+              " libCEED");
-+}
-+
- void ConvectionIntegrator::AssembleDiagonalPA(Vector &diag)
- {
-    if (DeviceCanUseCeed())
-diff --git a/fem/integ/bilininteg_curlcurl_mf.cpp b/fem/integ/bilininteg_curlcurl_mf.cpp
-new file mode 100644
-index 000000000..54c88055b
---- /dev/null
-+++ b/fem/integ/bilininteg_curlcurl_mf.cpp
-@@ -0,0 +1,89 @@
-+// Copyright (c) 2010-2023, Lawrence Livermore National Security, LLC. Produced
-+// at the Lawrence Livermore National Laboratory. All Rights reserved. See files
-+// LICENSE and NOTICE for details. LLNL-CODE-806117.
-+//
-+// This file is part of the MFEM library. For more information and source code
-+// availability visit https://mfem.org.
-+//
-+// MFEM is free software; you can redistribute it and/or modify it under the
-+// terms of the BSD-3 license. We welcome feedback and contributions, see file
-+// CONTRIBUTING.md for details.
-+
-+#include "../bilininteg.hpp"
-+#include "../gridfunc.hpp"
-+#include "../ceed/integrators/curlcurl/curlcurl.hpp"
-+
-+using namespace std;
-+
-+namespace mfem
-+{
-+
-+void CurlCurlIntegrator::AssembleMF(const FiniteElementSpace &fes)
-+{
-+   Mesh *mesh = fes.GetMesh();
-+   if (mesh->GetNE() == 0) { return; }
-+   if (DeviceCanUseCeed())
-+   {
-+      delete ceedOp;
-+      if (MQ) { ceedOp = new ceed::MFCurlCurlIntegrator(*this, fes, MQ); }
-+      else if (DQ) { ceedOp = new ceed::MFCurlCurlIntegrator(*this, fes, DQ); }
-+      else { ceedOp = new ceed::MFCurlCurlIntegrator(*this, fes, Q); }
-+      return;
-+   }
-+
-+   // Assumes tensor-product elements
-+   // const FiniteElement &el = *fes.GetFE(0);
-+   // ElementTransformation &T = *mesh->GetElementTransformation(0);
-+   // const IntegrationRule *ir = IntRule ? IntRule : &GetRule(*el, T);
-+   MFEM_ABORT("Error: CurlCurlIntegrator::AssembleMF only implemented with"
-+              " libCEED");
-+}
-+
-+void CurlCurlIntegrator::AssembleMFBoundary(const FiniteElementSpace &fes)
-+{
-+   Mesh *mesh = fes.GetMesh();
-+   if (mesh->GetNBE() == 0) { return; }
-+   if (DeviceCanUseCeed())
-+   {
-+      delete ceedOp;
-+      if (MQ) { ceedOp = new ceed::MFCurlCurlIntegrator(*this, fes, MQ, true); }
-+      else if (DQ) { ceedOp = new ceed::MFCurlCurlIntegrator(*this, fes, DQ, true); }
-+      else { ceedOp = new ceed::MFCurlCurlIntegrator(*this, fes, Q, true); }
-+      return;
-+   }
-+
-+   // Assumes tensor-product elements
-+   // const FiniteElement &el = *fes.GetBE(0);
-+   // ElementTransformation &T = *mesh->GetBdrElementTransformation(0);
-+   // const IntegrationRule *ir = IntRule ? IntRule : &GetRule(*el, T);
-+   MFEM_ABORT("Error: CurlCurlIntegrator::AssembleMFBoundary only implemented with"
-+              " libCEED");
-+}
-+
-+void CurlCurlIntegrator::AssembleDiagonalMF(Vector &diag)
-+{
-+   if (DeviceCanUseCeed())
-+   {
-+      ceedOp->GetDiagonal(diag);
-+   }
-+   else
-+   {
-+      MFEM_ABORT("Error: CurlCurlIntegrator::AssembleDiagonalMF only"
-+                 " implemented with libCEED");
-+   }
-+}
-+
-+void CurlCurlIntegrator::AddMultMF(const Vector &x, Vector &y) const
-+{
-+   if (DeviceCanUseCeed())
-+   {
-+      ceedOp->AddMult(x, y);
-+   }
-+   else
-+   {
-+      MFEM_ABORT("Error: CurlCurlIntegrator::AddMultMF only implemented with"
-+                 " libCEED");
-+   }
-+}
-+
-+}
-diff --git a/fem/integ/bilininteg_curlcurl_pa.cpp b/fem/integ/bilininteg_curlcurl_pa.cpp
-index 3d12d978a..7b17fc94d 100644
---- a/fem/integ/bilininteg_curlcurl_pa.cpp
-+++ b/fem/integ/bilininteg_curlcurl_pa.cpp
-@@ -13,6 +13,7 @@
- #include "../bilininteg.hpp"
- #include "../gridfunc.hpp"
- #include "../qfunction.hpp"
-+#include "../ceed/integrators/curlcurl/curlcurl.hpp"
- #include "bilininteg_hcurl_kernels.hpp"
- 
- namespace mfem
-@@ -20,32 +21,35 @@ namespace mfem
- 
- void CurlCurlIntegrator::AssemblePA(const FiniteElementSpace &fes)
- {
--   // Assumes tensor-product elements
-    Mesh *mesh = fes.GetMesh();
--   const FiniteElement *fel = fes.GetFE(0);
-+   if (mesh->GetNE() == 0) { return; }
-+   if (DeviceCanUseCeed())
-+   {
-+      delete ceedOp;
-+      if (MQ) { ceedOp = new ceed::PACurlCurlIntegrator(*this, fes, MQ); }
-+      else if (DQ) { ceedOp = new ceed::PACurlCurlIntegrator(*this, fes, DQ); }
-+      else { ceedOp = new ceed::PACurlCurlIntegrator(*this, fes, Q); }
-+      return;
-+   }
- 
-+   // Assumes tensor-product elements
-+   const FiniteElement *fel = fes.GetFE(0);
-    const VectorTensorFiniteElement *el =
-       dynamic_cast<const VectorTensorFiniteElement*>(fel);
-    MFEM_VERIFY(el != NULL, "Only VectorTensorFiniteElement is supported!");
--
--   const IntegrationRule *ir
--      = IntRule ? IntRule : &MassIntegrator::GetRule(*el, *el,
--                                                     *mesh->GetElementTransformation(0));
--
-+   ElementTransformation &T = *mesh->GetElementTransformation(0);
-+   const IntegrationRule *ir = IntRule ? IntRule : &GetRule(*el, T);
-    const int dims = el->GetDim();
-    MFEM_VERIFY(dims == 2 || dims == 3, "");
--
-    nq = ir->GetNPoints();
-    dim = mesh->Dimension();
-    MFEM_VERIFY(dim == 2 || dim == 3, "");
--
-    ne = fes.GetNE();
-    geom = mesh->GetGeometricFactors(*ir, GeometricFactors::JACOBIANS);
-    mapsC = &el->GetDofToQuad(*ir, DofToQuad::TENSOR);
-    mapsO = &el->GetDofToQuadOpen(*ir, DofToQuad::TENSOR);
-    dofs1D = mapsC->ndof;
-    quad1D = mapsC->nqpt;
--
-    MFEM_VERIFY(dofs1D == mapsO->ndof + 1 && quad1D == mapsO->nqpt, "");
- 
-    QuadratureSpace qs(*mesh, *ir);
-@@ -78,131 +82,153 @@ void CurlCurlIntegrator::AssemblePA(const FiniteElementSpace &fes)
-    }
- }
- 
--void CurlCurlIntegrator::AssembleDiagonalPA(Vector& diag)
-+void CurlCurlIntegrator::AssemblePABoundary(const FiniteElementSpace &fes)
- {
--   if (dim == 3)
-+   Mesh *mesh = fes.GetMesh();
-+   if (mesh->GetNBE() == 0) { return; }
-+   if (DeviceCanUseCeed())
-+   {
-+      delete ceedOp;
-+      if (MQ) { ceedOp = new ceed::PACurlCurlIntegrator(*this, fes, MQ, true); }
-+      else if (DQ) { ceedOp = new ceed::PACurlCurlIntegrator(*this, fes, DQ, true); }
-+      else { ceedOp = new ceed::PACurlCurlIntegrator(*this, fes, Q, true); }
-+      return;
-+   }
-+
-+   // Assumes tensor-product elements
-+   // const FiniteElement &el = *fes.GetBE(0);
-+   // ElementTransformation &T = *mesh->GetBdrElementTransformation(0);
-+   // const IntegrationRule *ir = IntRule ? IntRule : &GetRule(*el, T);
-+   MFEM_ABORT("Error: CurlCurlIntegrator::AssemblePABoundary only implemented with"
-+              " libCEED");
-+}
-+
-+void CurlCurlIntegrator::AssembleDiagonalPA(Vector &diag)
-+{
-+   if (DeviceCanUseCeed())
-+   {
-+      ceedOp->GetDiagonal(diag);
-+   }
-+   else
-    {
--      if (Device::Allows(Backend::DEVICE_MASK))
-+      if (dim == 3)
-       {
--         const int ID = (dofs1D << 4) | quad1D;
--         switch (ID)
-+         if (Device::Allows(Backend::DEVICE_MASK))
-          {
--            case 0x23:
--               return internal::SmemPACurlCurlAssembleDiagonal3D<2,3>(
--                         dofs1D,
--                         quad1D,
--                         symmetric, ne,
--                         mapsO->B, mapsC->B,
--                         mapsO->G, mapsC->G,
--                         pa_data, diag);
--            case 0x34:
--               return internal::SmemPACurlCurlAssembleDiagonal3D<3,4>(
--                         dofs1D,
--                         quad1D,
--                         symmetric, ne,
--                         mapsO->B, mapsC->B,
--                         mapsO->G, mapsC->G,
--                         pa_data, diag);
--            case 0x45:
--               return internal::SmemPACurlCurlAssembleDiagonal3D<4,5>(
--                         dofs1D,
--                         quad1D,
--                         symmetric, ne,
--                         mapsO->B, mapsC->B,
--                         mapsO->G, mapsC->G,
--                         pa_data, diag);
--            case 0x56:
--               return internal::SmemPACurlCurlAssembleDiagonal3D<5,6>(
--                         dofs1D,
--                         quad1D,
--                         symmetric, ne,
--                         mapsO->B, mapsC->B,
--                         mapsO->G, mapsC->G,
--                         pa_data, diag);
--            default:
--               return internal::SmemPACurlCurlAssembleDiagonal3D(
--                         dofs1D, quad1D,
--                         symmetric, ne,
--                         mapsO->B, mapsC->B,
--                         mapsO->G, mapsC->G,
--                         pa_data, diag);
-+            const int ID = (dofs1D << 4) | quad1D;
-+            switch (ID)
-+            {
-+               case 0x23:
-+                  return internal::SmemPACurlCurlAssembleDiagonal3D<2,3>(
-+                            dofs1D, quad1D, symmetric, ne,
-+                            mapsO->B, mapsC->B,
-+                            mapsO->G, mapsC->G,
-+                            pa_data, diag);
-+               case 0x34:
-+                  return internal::SmemPACurlCurlAssembleDiagonal3D<3,4>(
-+                            dofs1D, quad1D, symmetric, ne,
-+                            mapsO->B, mapsC->B,
-+                            mapsO->G, mapsC->G,
-+                            pa_data, diag);
-+               case 0x45:
-+                  return internal::SmemPACurlCurlAssembleDiagonal3D<4,5>(
-+                            dofs1D, quad1D, symmetric, ne,
-+                            mapsO->B, mapsC->B,
-+                            mapsO->G, mapsC->G,
-+                            pa_data, diag);
-+               case 0x56:
-+                  return internal::SmemPACurlCurlAssembleDiagonal3D<5,6>(
-+                            dofs1D, quad1D, symmetric, ne,
-+                            mapsO->B, mapsC->B,
-+                            mapsO->G, mapsC->G,
-+                            pa_data, diag);
-+               default:
-+                  return internal::SmemPACurlCurlAssembleDiagonal3D(
-+                            dofs1D, quad1D, symmetric, ne,
-+                            mapsO->B, mapsC->B,
-+                            mapsO->G, mapsC->G,
-+                            pa_data, diag);
-+            }
-          }
-+         else
-+         {
-+            internal::PACurlCurlAssembleDiagonal3D(dofs1D, quad1D, symmetric, ne,
-+                                                   mapsO->B, mapsC->B,
-+                                                   mapsO->G, mapsC->G,
-+                                                   pa_data, diag);
-+         }
-+      }
-+      else if (dim == 2)
-+      {
-+         internal::PACurlCurlAssembleDiagonal2D(dofs1D, quad1D, ne,
-+                                                mapsO->B, mapsC->G, pa_data, diag);
-       }
-       else
-       {
--         internal::PACurlCurlAssembleDiagonal3D(dofs1D, quad1D, symmetric, ne,
--                                                mapsO->B, mapsC->B,
--                                                mapsO->G, mapsC->G,
--                                                pa_data, diag);
-+         MFEM_ABORT("Unsupported dimension!");
-       }
-    }
--   else if (dim == 2)
--   {
--      internal::PACurlCurlAssembleDiagonal2D(dofs1D, quad1D, ne,
--                                             mapsO->B, mapsC->G, pa_data, diag);
--   }
--   else
--   {
--      MFEM_ABORT("Unsupported dimension!");
--   }
- }
- 
- void CurlCurlIntegrator::AddMultPA(const Vector &x, Vector &y) const
- {
--   if (dim == 3)
-+   if (DeviceCanUseCeed())
-+   {
-+      ceedOp->AddMult(x, y);
-+   }
-+   else
-    {
--      if (Device::Allows(Backend::DEVICE_MASK))
-+      if (dim == 3)
-       {
--         const int ID = (dofs1D << 4) | quad1D;
--         switch (ID)
-+         if (Device::Allows(Backend::DEVICE_MASK))
-+         {
-+            const int ID = (dofs1D << 4) | quad1D;
-+            switch (ID)
-+            {
-+               case 0x23:
-+                  return internal::SmemPACurlCurlApply3D<2,3>(
-+                            dofs1D, quad1D, symmetric, ne,
-+                            mapsO->B, mapsC->B, mapsO->Bt, mapsC->Bt,
-+                            mapsC->G, mapsC->Gt, pa_data, x, y);
-+               case 0x34:
-+                  return internal::SmemPACurlCurlApply3D<3,4>(
-+                            dofs1D, quad1D, symmetric, ne,
-+                            mapsO->B, mapsC->B, mapsO->Bt, mapsC->Bt,
-+                            mapsC->G, mapsC->Gt, pa_data, x, y);
-+               case 0x45:
-+                  return internal::SmemPACurlCurlApply3D<4,5>(
-+                            dofs1D, quad1D, symmetric, ne,
-+                            mapsO->B, mapsC->B, mapsO->Bt, mapsC->Bt,
-+                            mapsC->G, mapsC->Gt, pa_data, x, y);
-+               case 0x56:
-+                  return internal::SmemPACurlCurlApply3D<5,6>(
-+                            dofs1D, quad1D, symmetric, ne,
-+                            mapsO->B, mapsC->B, mapsO->Bt, mapsC->Bt,
-+                            mapsC->G, mapsC->Gt, pa_data, x, y);
-+               default:
-+                  return internal::SmemPACurlCurlApply3D(
-+                            dofs1D, quad1D, symmetric, ne,
-+                            mapsO->B, mapsC->B, mapsO->Bt, mapsC->Bt,
-+                            mapsC->G, mapsC->Gt, pa_data, x, y);
-+            }
-+         }
-+         else
-          {
--            case 0x23:
--               return internal::SmemPACurlCurlApply3D<2,3>(
--                         dofs1D, quad1D,
--                         symmetric, ne,
--                         mapsO->B, mapsC->B, mapsO->Bt, mapsC->Bt,
--                         mapsC->G, mapsC->Gt, pa_data, x, y);
--            case 0x34:
--               return internal::SmemPACurlCurlApply3D<3,4>(
--                         dofs1D, quad1D,
--                         symmetric, ne,
--                         mapsO->B, mapsC->B, mapsO->Bt, mapsC->Bt,
--                         mapsC->G, mapsC->Gt, pa_data, x, y);
--            case 0x45:
--               return internal::SmemPACurlCurlApply3D<4,5>(
--                         dofs1D, quad1D,
--                         symmetric, ne,
--                         mapsO->B, mapsC->B, mapsO->Bt, mapsC->Bt,
--                         mapsC->G, mapsC->Gt, pa_data, x, y);
--            case 0x56:
--               return internal::SmemPACurlCurlApply3D<5,6>(
--                         dofs1D, quad1D,
--                         symmetric, ne,
--                         mapsO->B, mapsC->B, mapsO->Bt, mapsC->Bt,
--                         mapsC->G, mapsC->Gt, pa_data, x, y);
--            default:
--               return internal::SmemPACurlCurlApply3D(
--                         dofs1D, quad1D, symmetric, ne,
--                         mapsO->B, mapsC->B, mapsO->Bt, mapsC->Bt,
--                         mapsC->G, mapsC->Gt, pa_data, x, y);
-+            internal::PACurlCurlApply3D(dofs1D, quad1D, symmetric, ne, mapsO->B, mapsC->B,
-+                                        mapsO->Bt, mapsC->Bt, mapsC->G, mapsC->Gt,
-+                                        pa_data, x, y);
-          }
-       }
-+      else if (dim == 2)
-+      {
-+         internal::PACurlCurlApply2D(dofs1D, quad1D, ne, mapsO->B, mapsO->Bt,
-+                                     mapsC->G, mapsC->Gt, pa_data, x, y);
-+      }
-       else
-       {
--         internal::PACurlCurlApply3D(dofs1D, quad1D, symmetric, ne, mapsO->B, mapsC->B,
--                                     mapsO->Bt, mapsC->Bt, mapsC->G, mapsC->Gt,
--                                     pa_data, x, y);
-+         MFEM_ABORT("Unsupported dimension!");
-       }
-    }
--   else if (dim == 2)
--   {
--      internal::PACurlCurlApply2D(dofs1D, quad1D, ne, mapsO->B, mapsO->Bt,
--                                  mapsC->G, mapsC->Gt, pa_data, x, y);
--   }
--   else
--   {
--      MFEM_ABORT("Unsupported dimension!");
--   }
- }
- 
- } // namespace mfem
-diff --git a/fem/integ/bilininteg_dgtrace_pa.cpp b/fem/integ/bilininteg_dgtrace_pa.cpp
-index f4b8d837c..284191c13 100644
---- a/fem/integ/bilininteg_dgtrace_pa.cpp
-+++ b/fem/integ/bilininteg_dgtrace_pa.cpp
-@@ -123,9 +123,7 @@ void DGTraceIntegrator::SetupPA(const FiniteElementSpace &fes, FaceType type)
-       *fes.GetTraceElement(0, fes.GetMesh()->GetFaceGeometry(0));
-    FaceElementTransformations &T0 =
-       *fes.GetMesh()->GetFaceElementTransformations(0);
--   const IntegrationRule *ir = IntRule?
--                               IntRule:
--                               &GetRule(el.GetGeomType(), el.GetOrder(), T0);
-+   const IntegrationRule *ir = IntRule ? IntRule : &GetRule(*fes.GetFE(0), T0);
-    const int symmDims = 4;
-    nq = ir->GetNPoints();
-    dim = mesh->Dimension();
-diff --git a/fem/integ/bilininteg_diffusion_mf.cpp b/fem/integ/bilininteg_diffusion_mf.cpp
-index 0896b8bf9..0a39a442c 100644
---- a/fem/integ/bilininteg_diffusion_mf.cpp
-+++ b/fem/integ/bilininteg_diffusion_mf.cpp
-@@ -18,33 +18,46 @@ namespace mfem
- 
- void DiffusionIntegrator::AssembleMF(const FiniteElementSpace &fes)
- {
--   // Assuming the same element type
-    Mesh *mesh = fes.GetMesh();
-    if (mesh->GetNE() == 0) { return; }
--   const FiniteElement &el = *fes.GetFE(0);
--   const IntegrationRule *ir = IntRule ? IntRule : &GetRule(el, el);
-    if (DeviceCanUseCeed())
-    {
-       delete ceedOp;
--      MFEM_VERIFY(!VQ && !MQ,
--                  "Only scalar coefficient supported for DiffusionIntegrator"
--                  " with libCEED");
--      const bool mixed = mesh->GetNumGeometries(mesh->Dimension()) > 1 ||
--                         fes.IsVariableOrder();
--      if (mixed)
--      {
--         ceedOp = new ceed::MixedMFDiffusionIntegrator(*this, fes, Q);
--      }
--      else
--      {
--         ceedOp = new ceed::MFDiffusionIntegrator(fes, *ir, Q);
--      }
-+      if (MQ) { ceedOp = new ceed::MFDiffusionIntegrator(*this, fes, MQ); }
-+      else if (VQ) { ceedOp = new ceed::MFDiffusionIntegrator(*this, fes, VQ); }
-+      else { ceedOp = new ceed::MFDiffusionIntegrator(*this, fes, Q); }
-       return;
-    }
-+
-+   // Assuming the same element type
-+   // const FiniteElement &el = *fes.GetFE(0);
-+   // ElementTransformation &T = *mesh->GetElementTransformation(0);
-+   // const IntegrationRule *ir = IntRule ? IntRule : &GetRule(el, T);
-    MFEM_ABORT("Error: DiffusionIntegrator::AssembleMF only implemented with"
-               " libCEED");
- }
- 
-+void DiffusionIntegrator::AssembleMFBoundary(const FiniteElementSpace &fes)
-+{
-+   Mesh *mesh = fes.GetMesh();
-+   if (mesh->GetNBE() == 0) { return; }
-+   if (DeviceCanUseCeed())
-+   {
-+      delete ceedOp;
-+      if (MQ) { ceedOp = new ceed::MFDiffusionIntegrator(*this, fes, MQ, true); }
-+      else if (VQ) { ceedOp = new ceed::MFDiffusionIntegrator(*this, fes, VQ, true); }
-+      else { ceedOp = new ceed::MFDiffusionIntegrator(*this, fes, Q, true); }
-+      return;
-+   }
-+
-+   // Assuming the same element type
-+   // const FiniteElement &el = *fes.GetBE(0);
-+   // ElementTransformation &T = *mesh->GetBdrElementTransformation(0);
-+   // const IntegrationRule *ir = IntRule ? IntRule : &GetRule(el, T);
-+   MFEM_ABORT("Error: DiffusionIntegrator::AssembleMFBoundary only implemented with"
-+              " libCEED");
-+}
-+
- void DiffusionIntegrator::AssembleDiagonalMF(Vector &diag)
- {
-    if (DeviceCanUseCeed())
-diff --git a/fem/integ/bilininteg_diffusion_pa.cpp b/fem/integ/bilininteg_diffusion_pa.cpp
-index a966c8520..7cf050870 100644
---- a/fem/integ/bilininteg_diffusion_pa.cpp
-+++ b/fem/integ/bilininteg_diffusion_pa.cpp
-@@ -22,29 +22,21 @@ void DiffusionIntegrator::AssemblePA(const FiniteElementSpace &fes)
- {
-    const MemoryType mt = (pa_mt == MemoryType::DEFAULT) ?
-                          Device::GetDeviceMemoryType() : pa_mt;
--   // Assuming the same element type
-    Mesh *mesh = fes.GetMesh();
-    if (mesh->GetNE() == 0) { return; }
--   const FiniteElement &el = *fes.GetFE(0);
--   const IntegrationRule *ir = IntRule ? IntRule : &GetRule(el, el);
-    if (DeviceCanUseCeed())
-    {
-       delete ceedOp;
--      MFEM_VERIFY(!VQ && !MQ,
--                  "Only scalar coefficient supported for DiffusionIntegrator"
--                  " with libCEED");
--      const bool mixed = mesh->GetNumGeometries(mesh->Dimension()) > 1 ||
--                         fes.IsVariableOrder();
--      if (mixed)
--      {
--         ceedOp = new ceed::MixedPADiffusionIntegrator(*this, fes, Q);
--      }
--      else
--      {
--         ceedOp = new ceed::PADiffusionIntegrator(fes, *ir, Q);
--      }
-+      if (MQ) { ceedOp = new ceed::PADiffusionIntegrator(*this, fes, MQ); }
-+      else if (VQ) { ceedOp = new ceed::PADiffusionIntegrator(*this, fes, VQ); }
-+      else { ceedOp = new ceed::PADiffusionIntegrator(*this, fes, Q); }
-       return;
-    }
-+
-+   // Assuming the same element type
-+   const FiniteElement &el = *fes.GetFE(0);
-+   ElementTransformation &T = *mesh->GetElementTransformation(0);
-+   const IntegrationRule *ir = IntRule ? IntRule : &GetRule(el, T);
-    const int dims = el.GetDim();
-    const int symmDims = (dims * (dims + 1)) / 2; // 1x1: 1, 2x2: 3, 3x3: 6
-    const int nq = ir->GetNPoints();
-@@ -73,6 +65,27 @@ void DiffusionIntegrator::AssemblePA(const FiniteElementSpace &fes)
-                               ir->GetWeights(), geom->J, coeff, pa_data);
- }
- 
-+void DiffusionIntegrator::AssemblePABoundary(const FiniteElementSpace &fes)
-+{
-+   Mesh *mesh = fes.GetMesh();
-+   if (mesh->GetNBE() == 0) { return; }
-+   if (DeviceCanUseCeed())
-+   {
-+      delete ceedOp;
-+      if (MQ) { ceedOp = new ceed::PADiffusionIntegrator(*this, fes, MQ, true); }
-+      else if (VQ) { ceedOp = new ceed::PADiffusionIntegrator(*this, fes, VQ, true); }
-+      else { ceedOp = new ceed::PADiffusionIntegrator(*this, fes, Q, true); }
-+      return;
-+   }
-+
-+   // Assuming the same element type
-+   // const FiniteElement &el = *fes.GetBE(0);
-+   // ElementTransformation &T = *mesh->GetBdrElementTransformation(0);
-+   // const IntegrationRule *ir = IntRule ? IntRule : &GetRule(el, T);
-+   MFEM_ABORT("Error: DiffusionIntegrator::AssemblePABoundary only implemented with"
-+              " libCEED");
-+}
-+
- void DiffusionIntegrator::AssembleDiagonalPA(Vector &diag)
- {
-    if (DeviceCanUseCeed())
-diff --git a/fem/integ/bilininteg_divdiv_mf.cpp b/fem/integ/bilininteg_divdiv_mf.cpp
-new file mode 100644
-index 000000000..7d8ea409e
---- /dev/null
-+++ b/fem/integ/bilininteg_divdiv_mf.cpp
-@@ -0,0 +1,85 @@
-+// Copyright (c) 2010-2023, Lawrence Livermore National Security, LLC. Produced
-+// at the Lawrence Livermore National Laboratory. All Rights reserved. See files
-+// LICENSE and NOTICE for details. LLNL-CODE-806117.
-+//
-+// This file is part of the MFEM library. For more information and source code
-+// availability visit https://mfem.org.
-+//
-+// MFEM is free software; you can redistribute it and/or modify it under the
-+// terms of the BSD-3 license. We welcome feedback and contributions, see file
-+// CONTRIBUTING.md for details.
-+
-+#include "../bilininteg.hpp"
-+#include "../gridfunc.hpp"
-+#include "../ceed/integrators/divdiv/divdiv.hpp"
-+
-+using namespace std;
-+
-+namespace mfem
-+{
-+
-+void DivDivIntegrator::AssembleMF(const FiniteElementSpace &fes)
-+{
-+   Mesh *mesh = fes.GetMesh();
-+   if (mesh->GetNE() == 0) { return; }
-+   if (DeviceCanUseCeed())
-+   {
-+      delete ceedOp;
-+      ceedOp = new ceed::MFDivDivIntegrator(*this, fes, Q);
-+      return;
-+   }
-+
-+   // Assumes tensor-product elements
-+   // const FiniteElement &el = *fes.GetFE(0);
-+   // ElementTransformation &T = *mesh->GetElementTransformation(0);
-+   // const IntegrationRule *ir = IntRule ? IntRule : &GetRule(*el, T);
-+   MFEM_ABORT("Error: DivDivIntegrator::AssembleMF only implemented with"
-+              " libCEED");
-+}
-+
-+void DivDivIntegrator::AssembleMFBoundary(const FiniteElementSpace &fes)
-+{
-+   Mesh *mesh = fes.GetMesh();
-+   if (mesh->GetNBE() == 0) { return; }
-+   if (DeviceCanUseCeed())
-+   {
-+      delete ceedOp;
-+      ceedOp = new ceed::MFDivDivIntegrator(*this, fes, Q, true);
-+      return;
-+   }
-+
-+   // Assumes tensor-product elements
-+   // const FiniteElement &el = *fes.GetBE(0);
-+   // ElementTransformation &T = *mesh->GetBdrElementTransformation(0);
-+   // const IntegrationRule *ir = IntRule ? IntRule : &GetRule(*el, T);
-+   MFEM_ABORT("Error: DivDivIntegrator::AssembleMFBoundary only implemented with"
-+              " libCEED");
-+}
-+
-+void DivDivIntegrator::AssembleDiagonalMF(Vector &diag)
-+{
-+   if (DeviceCanUseCeed())
-+   {
-+      ceedOp->GetDiagonal(diag);
-+   }
-+   else
-+   {
-+      MFEM_ABORT("Error: DivDivIntegrator::AssembleDiagonalMF only"
-+                 " implemented with libCEED");
-+   }
-+}
-+
-+void DivDivIntegrator::AddMultMF(const Vector &x, Vector &y) const
-+{
-+   if (DeviceCanUseCeed())
-+   {
-+      ceedOp->AddMult(x, y);
-+   }
-+   else
-+   {
-+      MFEM_ABORT("Error: DivDivIntegrator::AddMultMF only implemented with"
-+                 " libCEED");
-+   }
-+}
-+
-+}
-diff --git a/fem/integ/bilininteg_divdiv_pa.cpp b/fem/integ/bilininteg_divdiv_pa.cpp
-index 8abf233a7..ec85f6c22 100644
---- a/fem/integ/bilininteg_divdiv_pa.cpp
-+++ b/fem/integ/bilininteg_divdiv_pa.cpp
-@@ -13,6 +13,7 @@
- #include "../bilininteg.hpp"
- #include "../gridfunc.hpp"
- #include "../qfunction.hpp"
-+#include "../ceed/integrators/divdiv/divdiv.hpp"
- #include "bilininteg_hdiv_kernels.hpp"
- 
- namespace mfem
-@@ -20,33 +21,34 @@ namespace mfem
- 
- void DivDivIntegrator::AssemblePA(const FiniteElementSpace &fes)
- {
--   // Assumes tensor-product elements
-    Mesh *mesh = fes.GetMesh();
--   const FiniteElement *fel = fes.GetFE(0);
-+   if (mesh->GetNE() == 0) { return; }
-+   if (DeviceCanUseCeed())
-+   {
-+      delete ceedOp;
-+      ceedOp = new ceed::PADivDivIntegrator(*this, fes, Q);
-+      return;
-+   }
- 
-+   // Assumes tensor-product elements
-+   const FiniteElement *fel = fes.GetFE(0);
-    const VectorTensorFiniteElement *el =
-       dynamic_cast<const VectorTensorFiniteElement*>(fel);
-    MFEM_VERIFY(el != NULL, "Only VectorTensorFiniteElement is supported!");
--
--   const IntegrationRule *ir = IntRule ? IntRule : &MassIntegrator::GetRule
--                               (*el, *el, *mesh->GetElementTransformation(0));
--
-+   ElementTransformation &T = *mesh->GetElementTransformation(0);
-+   const IntegrationRule *ir = IntRule ? IntRule : &GetRule(*el, T);
-    const int dims = el->GetDim();
-    MFEM_VERIFY(dims == 2 || dims == 3, "");
--
-    const int nq = ir->GetNPoints();
-    dim = mesh->Dimension();
-    MFEM_VERIFY(dim == 2 || dim == 3, "");
--
-    ne = fes.GetNE();
-    geom = mesh->GetGeometricFactors(*ir, GeometricFactors::JACOBIANS);
-    mapsC = &el->GetDofToQuad(*ir, DofToQuad::TENSOR);
-    mapsO = &el->GetDofToQuadOpen(*ir, DofToQuad::TENSOR);
-    dofs1D = mapsC->ndof;
-    quad1D = mapsC->nqpt;
--
-    MFEM_VERIFY(dofs1D == mapsO->ndof + 1 && quad1D == mapsO->nqpt, "");
--
-    pa_data.SetSize(nq * ne, Device::GetMemoryType());
- 
-    QuadratureSpace qs(*mesh, *ir);
-@@ -68,31 +70,72 @@ void DivDivIntegrator::AssemblePA(const FiniteElementSpace &fes)
-    }
- }
- 
--void DivDivIntegrator::AssembleDiagonalPA(Vector& diag)
-+void DivDivIntegrator::AssemblePABoundary(const FiniteElementSpace &fes)
- {
--   if (dim == 3)
-+   Mesh *mesh = fes.GetMesh();
-+   if (mesh->GetNBE() == 0) { return; }
-+   if (DeviceCanUseCeed())
-    {
--      internal::PADivDivAssembleDiagonal3D(dofs1D, quad1D, ne,
--                                           mapsO->B, mapsC->G, pa_data, diag);
-+      delete ceedOp;
-+      ceedOp = new ceed::PADivDivIntegrator(*this, fes, Q, true);
-+      return;
-+   }
-+
-+   // Assumes tensor-product elements
-+   // const FiniteElement &el = *fes.GetBE(0);
-+   // ElementTransformation &T = *mesh->GetBdrElementTransformation(0);
-+   // const IntegrationRule *ir = IntRule ? IntRule : &GetRule(*el, T);
-+   MFEM_ABORT("Error: DivDivIntegrator::AssemblePABoundary only implemented with"
-+              " libCEED");
-+}
-+
-+void DivDivIntegrator::AssembleDiagonalPA(Vector &diag)
-+{
-+   if (DeviceCanUseCeed())
-+   {
-+      ceedOp->GetDiagonal(diag);
-    }
-    else
-    {
--      internal::PADivDivAssembleDiagonal2D(dofs1D, quad1D, ne,
--                                           mapsO->B, mapsC->G, pa_data, diag);
-+      if (dim == 3)
-+      {
-+         internal::PADivDivAssembleDiagonal3D(dofs1D, quad1D, ne,
-+                                              mapsO->B, mapsC->G, pa_data, diag);
-+      }
-+      else if (dim == 2)
-+      {
-+         internal::PADivDivAssembleDiagonal2D(dofs1D, quad1D, ne,
-+                                              mapsO->B, mapsC->G, pa_data, diag);
-+      }
-+      else
-+      {
-+         MFEM_ABORT("Unsupported dimension!");
-+      }
-    }
- }
- 
- void DivDivIntegrator::AddMultPA(const Vector &x, Vector &y) const
- {
--   if (dim == 3)
--      internal::PADivDivApply3D(dofs1D, quad1D, ne, mapsO->B, mapsC->G,
--                                mapsO->Bt, mapsC->Gt, pa_data, x, y);
--   else if (dim == 2)
--      internal::PADivDivApply2D(dofs1D, quad1D, ne, mapsO->B, mapsC->G,
--                                mapsO->Bt, mapsC->Gt, pa_data, x, y);
-+   if (DeviceCanUseCeed())
-+   {
-+      ceedOp->AddMult(x, y);
-+   }
-    else
-    {
--      MFEM_ABORT("Unsupported dimension!");
-+      if (dim == 3)
-+      {
-+         internal::PADivDivApply3D(dofs1D, quad1D, ne, mapsO->B, mapsC->G,
-+                                   mapsO->Bt, mapsC->Gt, pa_data, x, y);
-+      }
-+      else if (dim == 2)
-+      {
-+         internal::PADivDivApply2D(dofs1D, quad1D, ne, mapsO->B, mapsC->G,
-+                                   mapsO->Bt, mapsC->Gt, pa_data, x, y);
-+      }
-+      else
-+      {
-+         MFEM_ABORT("Unsupported dimension!");
-+      }
-    }
- }
- 
-diff --git a/fem/integ/bilininteg_gradient_pa.cpp b/fem/integ/bilininteg_gradient_pa.cpp
-index 20ef4684d..cb37e981a 100644
---- a/fem/integ/bilininteg_gradient_pa.cpp
-+++ b/fem/integ/bilininteg_gradient_pa.cpp
-@@ -167,9 +167,8 @@ void GradientIntegrator::AssemblePA(const FiniteElementSpace &trial_fes,
-    Mesh *mesh = trial_fes.GetMesh();
-    const FiniteElement &trial_fe = *trial_fes.GetFE(0);
-    const FiniteElement &test_fe = *test_fes.GetFE(0);
--   ElementTransformation *trans = mesh->GetElementTransformation(0);
--   const IntegrationRule *ir = IntRule ? IntRule : &GetRule(trial_fe, test_fe,
--                                                            *trans);
-+   ElementTransformation &T = *mesh->GetElementTransformation(0);
-+   const IntegrationRule *ir = IntRule ? IntRule : &GetRule(trial_fe, test_fe, T);
-    const int dims = trial_fe.GetDim();
-    const int dimsToStore = dims * dims;
-    nq = ir->GetNPoints();
-diff --git a/fem/integ/bilininteg_interp_curl_pa.cpp b/fem/integ/bilininteg_interp_curl_pa.cpp
-new file mode 100644
-index 000000000..cc9502ce0
---- /dev/null
-+++ b/fem/integ/bilininteg_interp_curl_pa.cpp
-@@ -0,0 +1,65 @@
-+// Copyright (c) 2010-2023, Lawrence Livermore National Security, LLC. Produced
-+// at the Lawrence Livermore National Laboratory. All Rights reserved. See files
-+// LICENSE and NOTICE for details. LLNL-CODE-806117.
-+//
-+// This file is part of the MFEM library. For more information and source code
-+// availability visit https://mfem.org.
-+//
-+// MFEM is free software; you can redistribute it and/or modify it under the
-+// terms of the BSD-3 license. We welcome feedback and contributions, see file
-+// CONTRIBUTING.md for details.
-+
-+#include "../../general/forall.hpp"
-+#include "../bilininteg.hpp"
-+#include "../gridfunc.hpp"
-+#include "../qfunction.hpp"
-+#include "../ceed/integrators/interp/interp.hpp"
-+
-+namespace mfem
-+{
-+
-+void CurlInterpolator::AssemblePA(const FiniteElementSpace &trial_fes,
-+                                  const FiniteElementSpace &test_fes)
-+{
-+   Mesh *mesh = trial_fes.GetMesh();
-+   if (mesh->GetNE() == 0) { return; }
-+   if (DeviceCanUseCeed())
-+   {
-+      delete ceedOp;
-+      ceedOp = new ceed::PADiscreteInterpolator(*this, trial_fes, test_fes);
-+      return;
-+   }
-+
-+   // Assumes tensor-product elements, with a vector test space and H^1 trial space.
-+   // const FiniteElement *trial_fel = trial_fes.GetFE(0);
-+   // const FiniteElement *test_fel = test_fes.GetFE(0);
-+   MFEM_ABORT("Error: CurlInterpolator::AssemblePA only implemented with libCEED");
-+}
-+
-+void CurlInterpolator::AddMultPA(const Vector &x, Vector &y) const
-+{
-+   if (DeviceCanUseCeed())
-+   {
-+      ceedOp->AddMult(x, y);
-+   }
-+   else
-+   {
-+      MFEM_ABORT("Error: CurlInterpolator::AddMultPA only implemented with"
-+                 " libCEED");
-+   }
-+}
-+
-+void CurlInterpolator::AddMultTransposePA(const Vector &x, Vector &y) const
-+{
-+   if (DeviceCanUseCeed())
-+   {
-+      ceedOp->AddMultTranspose(x, y);
-+   }
-+   else
-+   {
-+      MFEM_ABORT("Error: CurlInterpolator::AddMultTransposePA only implemented"
-+                 "with libCEED");
-+   }
-+}
-+
-+} // namespace mfem
-diff --git a/fem/integ/bilininteg_interp_pa.cpp b/fem/integ/bilininteg_interp_grad_pa.cpp
-similarity index 55%
-rename from fem/integ/bilininteg_interp_pa.cpp
-rename to fem/integ/bilininteg_interp_grad_pa.cpp
-index 3cac18c65..4ec50e083 100644
---- a/fem/integ/bilininteg_interp_pa.cpp
-+++ b/fem/integ/bilininteg_interp_grad_pa.cpp
-@@ -13,10 +13,68 @@
- #include "../bilininteg.hpp"
- #include "../gridfunc.hpp"
- #include "../qfunction.hpp"
-+#include "../ceed/integrators/interp/interp.hpp"
- 
- namespace mfem
- {
- 
-+void GradientInterpolator::AssemblePA(const FiniteElementSpace &trial_fes,
-+                                      const FiniteElementSpace &test_fes)
-+{
-+   Mesh *mesh = trial_fes.GetMesh();
-+   if (mesh->GetNE() == 0) { return; }
-+   if (DeviceCanUseCeed())
-+   {
-+      delete ceedOp;
-+      ceedOp = new ceed::PADiscreteInterpolator(*this, trial_fes, test_fes);
-+      return;
-+   }
-+
-+   // Assumes tensor-product elements, with a vector test space and H^1 trial space.
-+   const FiniteElement *trial_fel = trial_fes.GetFE(0);
-+   const FiniteElement *test_fel = test_fes.GetFE(0);
-+
-+   const NodalTensorFiniteElement *trial_el =
-+      dynamic_cast<const NodalTensorFiniteElement*>(trial_fel);
-+   MFEM_VERIFY(trial_el != NULL, "Only NodalTensorFiniteElement is supported!");
-+
-+   const VectorTensorFiniteElement *test_el =
-+      dynamic_cast<const VectorTensorFiniteElement*>(test_fel);
-+   MFEM_VERIFY(test_el != NULL, "Only VectorTensorFiniteElement is supported!");
-+
-+   const int dims = trial_el->GetDim();
-+   MFEM_VERIFY(dims == 2 || dims == 3, "Bad dimension!");
-+   dim = mesh->Dimension();
-+   MFEM_VERIFY(dim == 2 || dim == 3, "Bad dimension!");
-+   MFEM_VERIFY(trial_el->GetOrder() == test_el->GetOrder(),
-+               "Orders do not match!");
-+   ne = trial_fes.GetNE();
-+
-+   const int order = trial_el->GetOrder();
-+   dofquad_fe = new H1_SegmentElement(order, trial_el->GetBasisType());
-+   mfem::QuadratureFunctions1D qf1d;
-+   mfem::IntegrationRule closed_ir;
-+   closed_ir.SetSize(order + 1);
-+   qf1d.GaussLobatto(order + 1, &closed_ir);
-+   mfem::IntegrationRule open_ir;
-+   open_ir.SetSize(order);
-+   qf1d.GaussLegendre(order, &open_ir);
-+
-+   maps_O_C = &dofquad_fe->GetDofToQuad(open_ir, DofToQuad::TENSOR);
-+   o_dofs1D = maps_O_C->nqpt;
-+   if (trial_el->GetBasisType() == BasisType::GaussLobatto)
-+   {
-+      B_id = true;
-+      c_dofs1D = maps_O_C->ndof;
-+   }
-+   else
-+   {
-+      B_id = false;
-+      maps_C_C = &dofquad_fe->GetDofToQuad(closed_ir, DofToQuad::TENSOR);
-+      c_dofs1D = maps_C_C->nqpt;
-+   }
-+}
-+
- // Apply to x corresponding to DOFs in H^1 (domain) the (topological) gradient
- // to get a dof in H(curl) (range). You can think of the range as the "test" space
- // and the domain as the "trial" space, but there's no integration.
-@@ -1017,920 +1075,85 @@ static void PAHcurlApplyGradientTranspose3DBId(
-    });
- }
- 
--void GradientInterpolator::AssemblePA(const FiniteElementSpace &trial_fes,
--                                      const FiniteElementSpace &test_fes)
-+void GradientInterpolator::AddMultPA(const Vector &x, Vector &y) const
- {
--   // Assumes tensor-product elements, with a vector test space and H^1 trial space.
--   Mesh *mesh = trial_fes.GetMesh();
--   const FiniteElement *trial_fel = trial_fes.GetFE(0);
--   const FiniteElement *test_fel = test_fes.GetFE(0);
--
--   const NodalTensorFiniteElement *trial_el =
--      dynamic_cast<const NodalTensorFiniteElement*>(trial_fel);
--   MFEM_VERIFY(trial_el != NULL, "Only NodalTensorFiniteElement is supported!");
--
--   const VectorTensorFiniteElement *test_el =
--      dynamic_cast<const VectorTensorFiniteElement*>(test_fel);
--   MFEM_VERIFY(test_el != NULL, "Only VectorTensorFiniteElement is supported!");
--
--   const int dims = trial_el->GetDim();
--   MFEM_VERIFY(dims == 2 || dims == 3, "Bad dimension!");
--   dim = mesh->Dimension();
--   MFEM_VERIFY(dim == 2 || dim == 3, "Bad dimension!");
--   MFEM_VERIFY(trial_el->GetOrder() == test_el->GetOrder(),
--               "Orders do not match!");
--   ne = trial_fes.GetNE();
--
--   const int order = trial_el->GetOrder();
--   dofquad_fe = new H1_SegmentElement(order, trial_el->GetBasisType());
--   mfem::QuadratureFunctions1D qf1d;
--   mfem::IntegrationRule closed_ir;
--   closed_ir.SetSize(order + 1);
--   qf1d.GaussLobatto(order + 1, &closed_ir);
--   mfem::IntegrationRule open_ir;
--   open_ir.SetSize(order);
--   qf1d.GaussLegendre(order, &open_ir);
--
--   maps_O_C = &dofquad_fe->GetDofToQuad(open_ir, DofToQuad::TENSOR);
--   o_dofs1D = maps_O_C->nqpt;
--   if (trial_el->GetBasisType() == BasisType::GaussLobatto)
-+   if (DeviceCanUseCeed())
-    {
--      B_id = true;
--      c_dofs1D = maps_O_C->ndof;
-+      ceedOp->AddMult(x, y);
-    }
-    else
-    {
--      B_id = false;
--      maps_C_C = &dofquad_fe->GetDofToQuad(closed_ir, DofToQuad::TENSOR);
--      c_dofs1D = maps_C_C->nqpt;
--   }
--}
--
--void GradientInterpolator::AddMultPA(const Vector &x, Vector &y) const
--{
--   if (dim == 3)
--   {
--      if (B_id)
-+      if (dim == 3)
-       {
--         PAHcurlApplyGradient3DBId(c_dofs1D, o_dofs1D, ne,
-+         if (B_id)
-+         {
-+            PAHcurlApplyGradient3DBId(c_dofs1D, o_dofs1D, ne,
-+                                      maps_O_C->G, x, y);
-+         }
-+         else
-+         {
-+            PAHcurlApplyGradient3D(c_dofs1D, o_dofs1D, ne, maps_C_C->B,
-                                    maps_O_C->G, x, y);
-+         }
-       }
--      else
--      {
--         PAHcurlApplyGradient3D(c_dofs1D, o_dofs1D, ne, maps_C_C->B,
--                                maps_O_C->G, x, y);
--      }
--   }
--   else if (dim == 2)
--   {
--      if (B_id)
-+      else if (dim == 2)
-       {
--         PAHcurlApplyGradient2DBId(c_dofs1D, o_dofs1D, ne,
--                                   maps_O_C->G, x, y);
-+         if (B_id)
-+         {
-+            PAHcurlApplyGradient2DBId(c_dofs1D, o_dofs1D, ne,
-+                                      maps_O_C->G, x, y);
-+         }
-+         else
-+         {
-+            PAHcurlApplyGradient2D(c_dofs1D, o_dofs1D, ne, maps_C_C->B, maps_O_C->G,
-+                                   x, y);
-+         }
-       }
-       else
-       {
--         PAHcurlApplyGradient2D(c_dofs1D, o_dofs1D, ne, maps_C_C->B, maps_O_C->G,
--                                x, y);
-+         MFEM_ABORT("Bad dimension!");
-       }
-    }
--   else
--   {
--      mfem_error("Bad dimension!");
--   }
- }
- 
- void GradientInterpolator::AddMultTransposePA(const Vector &x, Vector &y) const
- {
--   if (dim == 3)
--   {
--      if (B_id)
--      {
--         PAHcurlApplyGradientTranspose3DBId(c_dofs1D, o_dofs1D, ne,
--                                            maps_O_C->G, x, y);
--      }
--      else
--      {
--         PAHcurlApplyGradientTranspose3D(c_dofs1D, o_dofs1D, ne, maps_C_C->B,
--                                         maps_O_C->G, x, y);
--      }
--   }
--   else if (dim == 2)
-+   if (DeviceCanUseCeed())
-    {
--      if (B_id)
--      {
--         PAHcurlApplyGradientTranspose2DBId(c_dofs1D, o_dofs1D, ne,
--                                            maps_O_C->G, x, y);
--      }
--      else
--      {
--         PAHcurlApplyGradientTranspose2D(c_dofs1D, o_dofs1D, ne, maps_C_C->B,
--                                         maps_O_C->G, x, y);
--      }
-+      ceedOp->AddMultTranspose(x, y);
-    }
-    else
-    {
--      mfem_error("Bad dimension!");
--   }
--}
--
--static void PAHcurlVecH1IdentityApply2D(const int c_dofs1D,
--                                        const int o_dofs1D,
--                                        const int NE,
--                                        const Array<double> &Bclosed,
--                                        const Array<double> &Bopen,
--                                        const Vector &pa_data,
--                                        const Vector &x_,
--                                        Vector &y_)
--{
--   auto Bc = Reshape(Bclosed.Read(), c_dofs1D, c_dofs1D);
--   auto Bo = Reshape(Bopen.Read(), o_dofs1D, c_dofs1D);
--
--   auto x = Reshape(x_.Read(), c_dofs1D, c_dofs1D, 2, NE);
--   auto y = Reshape(y_.ReadWrite(), (2 * c_dofs1D * o_dofs1D), NE);
--
--   auto vk = Reshape(pa_data.Read(), 2, (2 * c_dofs1D * o_dofs1D), NE);
--
--   constexpr static int MAX_D1D = HCURL_MAX_D1D;
--
--   MFEM_VERIFY(c_dofs1D <= MAX_D1D && o_dofs1D <= c_dofs1D, "");
--
--   mfem::forall(NE, [=] MFEM_HOST_DEVICE (int e)
--   {
--      double w[2][MAX_D1D][MAX_D1D];
--
--      // dofs that point parallel to x-axis (open in x, closed in y)
--
--      // contract in y
--      for (int ey = 0; ey < c_dofs1D; ++ey)
--      {
--         for (int dx = 0; dx < c_dofs1D; ++dx)
--         {
--            for (int j=0; j<2; ++j)
--            {
--               w[j][dx][ey] = 0.0;
--               for (int dy = 0; dy < c_dofs1D; ++dy)
--               {
--                  w[j][dx][ey] += Bc(ey, dy) * x(dx, dy, j, e);
--               }
--            }
--         }
--      }
--
--      // contract in x
--      for (int ey = 0; ey < c_dofs1D; ++ey)
--      {
--         for (int ex = 0; ex < o_dofs1D; ++ex)
--         {
--            for (int j=0; j<2; ++j)
--            {
--               double s = 0.0;
--               for (int dx = 0; dx < c_dofs1D; ++dx)
--               {
--                  s += Bo(ex, dx) * w[j][dx][ey];
--               }
--               const int local_index = ey*o_dofs1D + ex;
--               y(local_index, e) += s * vk(j, local_index, e);
--            }
--         }
--      }
--
--      // dofs that point parallel to y-axis (open in y, closed in x)
--
--      // contract in y
--      for (int ey = 0; ey < o_dofs1D; ++ey)
-+      if (dim == 3)
-       {
--         for (int dx = 0; dx < c_dofs1D; ++dx)
-+         if (B_id)
-          {
--            for (int j=0; j<2; ++j)
--            {
--               w[j][dx][ey] = 0.0;
--               for (int dy = 0; dy < c_dofs1D; ++dy)
--               {
--                  w[j][dx][ey] += Bo(ey, dy) * x(dx, dy, j, e);
--               }
--            }
-+            PAHcurlApplyGradientTranspose3DBId(c_dofs1D, o_dofs1D, ne,
-+                                               maps_O_C->G, x, y);
-          }
--      }
--
--      // contract in x
--      for (int ey = 0; ey < o_dofs1D; ++ey)
--      {
--         for (int ex = 0; ex < c_dofs1D; ++ex)
-+         else
-          {
--            for (int j=0; j<2; ++j)
--            {
--               double s = 0.0;
--               for (int dx = 0; dx < c_dofs1D; ++dx)
--               {
--                  s += Bc(ex, dx) * w[j][dx][ey];
--               }
--               const int local_index = c_dofs1D*o_dofs1D + ey*c_dofs1D + ex;
--               y(local_index, e) += s * vk(j, local_index, e);
--            }
-+            PAHcurlApplyGradientTranspose3D(c_dofs1D, o_dofs1D, ne, maps_C_C->B,
-+                                            maps_O_C->G, x, y);
-          }
-       }
--   });
--}
--
--static void PAHcurlVecH1IdentityApplyTranspose2D(const int c_dofs1D,
--                                                 const int o_dofs1D,
--                                                 const int NE,
--                                                 const Array<double> &Bclosed,
--                                                 const Array<double> &Bopen,
--                                                 const Vector &pa_data,
--                                                 const Vector &x_,
--                                                 Vector &y_)
--{
--   auto Bc = Reshape(Bclosed.Read(), c_dofs1D, c_dofs1D);
--   auto Bo = Reshape(Bopen.Read(), o_dofs1D, c_dofs1D);
--
--   auto x = Reshape(x_.Read(), (2 * c_dofs1D * o_dofs1D), NE);
--   auto y = Reshape(y_.ReadWrite(), c_dofs1D, c_dofs1D, 2, NE);
--
--   auto vk = Reshape(pa_data.Read(), 2, (2 * c_dofs1D * o_dofs1D), NE);
--
--   constexpr static int MAX_D1D = HCURL_MAX_D1D;
--   //constexpr static int MAX_Q1D = HCURL_MAX_Q1D;
--
--   MFEM_VERIFY(c_dofs1D <= MAX_D1D && o_dofs1D <= c_dofs1D, "");
--
--   mfem::forall(NE, [=] MFEM_HOST_DEVICE (int e)
--   {
--      double w[2][MAX_D1D][MAX_D1D];
--
--      // dofs that point parallel to x-axis (open in x, closed in y)
--
--      // contract in x
--      for (int ey = 0; ey < c_dofs1D; ++ey)
-+      else if (dim == 2)
-       {
--         for (int dx = 0; dx < c_dofs1D; ++dx)
--         {
--            for (int j=0; j<2; ++j) { w[j][dx][ey] = 0.0; }
--         }
--         for (int ex = 0; ex < o_dofs1D; ++ex)
-+         if (B_id)
-          {
--            const int local_index = ey*o_dofs1D + ex;
--            const double xd = x(local_index, e);
--
--            for (int dx = 0; dx < c_dofs1D; ++dx)
--            {
--               for (int j=0; j<2; ++j)
--               {
--                  w[j][dx][ey] += Bo(ex, dx) * xd * vk(j, local_index, e);
--               }
--            }
-+            PAHcurlApplyGradientTranspose2DBId(c_dofs1D, o_dofs1D, ne,
-+                                               maps_O_C->G, x, y);
-          }
--      }
--
--      // contract in y
--      for (int dx = 0; dx < c_dofs1D; ++dx)
--      {
--         for (int dy = 0; dy < c_dofs1D; ++dy)
-+         else
-          {
--            for (int j=0; j<2; ++j)
--            {
--               double s = 0.0;
--               for (int ey = 0; ey < c_dofs1D; ++ey)
--               {
--                  s += w[j][dx][ey] * Bc(ey, dy);
--               }
--               y(dx, dy, j, e) += s;
--            }
-+            PAHcurlApplyGradientTranspose2D(c_dofs1D, o_dofs1D, ne, maps_C_C->B,
-+                                            maps_O_C->G, x, y);
-          }
-       }
--
--      // dofs that point parallel to y-axis (open in y, closed in x)
--
--      // contract in x
--      for (int ey = 0; ey < o_dofs1D; ++ey)
-+      else
-       {
--         for (int dx = 0; dx < c_dofs1D; ++dx)
--         {
--            for (int j=0; j<2; ++j) { w[j][dx][ey] = 0.0; }
--         }
--         for (int ex = 0; ex < c_dofs1D; ++ex)
--         {
--            const int local_index = c_dofs1D*o_dofs1D + ey*c_dofs1D + ex;
--            const double xd = x(local_index, e);
--            for (int dx = 0; dx < c_dofs1D; ++dx)
--            {
--               for (int j=0; j<2; ++j)
--               {
--                  w[j][dx][ey] += Bc(ex, dx) * xd * vk(j, local_index, e);
--               }
--            }
--         }
-+         MFEM_ABORT("Bad dimension!");
-       }
--
--      // contract in y
--      for (int dx = 0; dx < c_dofs1D; ++dx)
--      {
--         for (int dy = 0; dy < c_dofs1D; ++dy)
--         {
--            for (int j=0; j<2; ++j)
--            {
--               double s = 0.0;
--               for (int ey = 0; ey < o_dofs1D; ++ey)
--               {
--                  s += w[j][dx][ey] * Bo(ey, dy);
--               }
--               y(dx, dy, j, e) += s;
--            }
--         }
--      }
--   });
--}
--
--static void PAHcurlVecH1IdentityApply3D(const int c_dofs1D,
--                                        const int o_dofs1D,
--                                        const int NE,
--                                        const Array<double> &Bclosed,
--                                        const Array<double> &Bopen,
--                                        const Vector &pa_data,
--                                        const Vector &x_,
--                                        Vector &y_)
--{
--   auto Bc = Reshape(Bclosed.Read(), c_dofs1D, c_dofs1D);
--   auto Bo = Reshape(Bopen.Read(), o_dofs1D, c_dofs1D);
--
--   auto x = Reshape(x_.Read(), c_dofs1D, c_dofs1D, c_dofs1D, 3, NE);
--   auto y = Reshape(y_.ReadWrite(), (3 * c_dofs1D * c_dofs1D * o_dofs1D), NE);
--
--   auto vk = Reshape(pa_data.Read(), 3, (3 * c_dofs1D * c_dofs1D * o_dofs1D),
--                     NE);
--
--   constexpr static int MAX_D1D = HCURL_MAX_D1D;
--   MFEM_VERIFY(c_dofs1D <= MAX_D1D && o_dofs1D <= c_dofs1D, "");
--
--   mfem::forall(NE, [=] MFEM_HOST_DEVICE (int e)
--   {
--      double w1[3][MAX_D1D][MAX_D1D][MAX_D1D];
--      double w2[3][MAX_D1D][MAX_D1D][MAX_D1D];
--
--      // dofs that point parallel to x-axis (open in x, closed in y, z)
--
--      // contract in z
--      for (int ez = 0; ez < c_dofs1D; ++ez)
--      {
--         for (int dx = 0; dx < c_dofs1D; ++dx)
--         {
--            for (int dy = 0; dy < c_dofs1D; ++dy)
--            {
--               for (int j=0; j<3; ++j)
--               {
--                  w1[j][dx][dy][ez] = 0.0;
--                  for (int dz = 0; dz < c_dofs1D; ++dz)
--                  {
--                     w1[j][dx][dy][ez] += Bc(ez, dz) * x(dx, dy, dz, j, e);
--                  }
--               }
--            }
--         }
--      }
--
--      // contract in y
--      for (int ez = 0; ez < c_dofs1D; ++ez)
--      {
--         for (int ey = 0; ey < c_dofs1D; ++ey)
--         {
--            for (int dx = 0; dx < c_dofs1D; ++dx)
--            {
--               for (int j=0; j<3; ++j)
--               {
--                  w2[j][dx][ey][ez] = 0.0;
--                  for (int dy = 0; dy < c_dofs1D; ++dy)
--                  {
--                     w2[j][dx][ey][ez] += Bc(ey, dy) * w1[j][dx][dy][ez];
--                  }
--               }
--            }
--         }
--      }
--
--      // contract in x
--      for (int ez = 0; ez < c_dofs1D; ++ez)
--      {
--         for (int ey = 0; ey < c_dofs1D; ++ey)
--         {
--            for (int ex = 0; ex < o_dofs1D; ++ex)
--            {
--               for (int j=0; j<3; ++j)
--               {
--                  double s = 0.0;
--                  for (int dx = 0; dx < c_dofs1D; ++dx)
--                  {
--                     s += Bo(ex, dx) * w2[j][dx][ey][ez];
--                  }
--                  const int local_index = ez*c_dofs1D*o_dofs1D + ey*o_dofs1D + ex;
--                  y(local_index, e) += s * vk(j, local_index, e);
--               }
--            }
--         }
--      }
--
--      // dofs that point parallel to y-axis (open in y, closed in x, z)
--
--      // contract in z
--      for (int ez = 0; ez < c_dofs1D; ++ez)
--      {
--         for (int dx = 0; dx < c_dofs1D; ++dx)
--         {
--            for (int dy = 0; dy < c_dofs1D; ++dy)
--            {
--               for (int j=0; j<3; ++j)
--               {
--                  w1[j][dx][dy][ez] = 0.0;
--                  for (int dz = 0; dz < c_dofs1D; ++dz)
--                  {
--                     w1[j][dx][dy][ez] += Bc(ez, dz) * x(dx, dy, dz, j, e);
--                  }
--               }
--            }
--         }
--      }
--
--      // contract in y
--      for (int ez = 0; ez < c_dofs1D; ++ez)
--      {
--         for (int ey = 0; ey < o_dofs1D; ++ey)
--         {
--            for (int dx = 0; dx < c_dofs1D; ++dx)
--            {
--               for (int j=0; j<3; ++j)
--               {
--                  w2[j][dx][ey][ez] = 0.0;
--                  for (int dy = 0; dy < c_dofs1D; ++dy)
--                  {
--                     w2[j][dx][ey][ez] += Bo(ey, dy) * w1[j][dx][dy][ez];
--                  }
--               }
--            }
--         }
--      }
--
--      // contract in x
--      for (int ez = 0; ez < c_dofs1D; ++ez)
--      {
--         for (int ey = 0; ey < o_dofs1D; ++ey)
--         {
--            for (int ex = 0; ex < c_dofs1D; ++ex)
--            {
--               for (int j=0; j<3; ++j)
--               {
--                  double s = 0.0;
--                  for (int dx = 0; dx < c_dofs1D; ++dx)
--                  {
--                     s += Bc(ex, dx) * w2[j][dx][ey][ez];
--                  }
--                  const int local_index = c_dofs1D*c_dofs1D*o_dofs1D +
--                                          ez*c_dofs1D*o_dofs1D + ey*c_dofs1D + ex;
--                  y(local_index, e) += s * vk(j, local_index, e);
--               }
--            }
--         }
--      }
--
--      // dofs that point parallel to z-axis (open in z, closed in x, y)
--
--      // contract in z
--      for (int ez = 0; ez < o_dofs1D; ++ez)
--      {
--         for (int dx = 0; dx < c_dofs1D; ++dx)
--         {
--            for (int dy = 0; dy < c_dofs1D; ++dy)
--            {
--               for (int j=0; j<3; ++j)
--               {
--                  w1[j][dx][dy][ez] = 0.0;
--                  for (int dz = 0; dz < c_dofs1D; ++dz)
--                  {
--                     w1[j][dx][dy][ez] += Bo(ez, dz) * x(dx, dy, dz, j, e);
--                  }
--               }
--            }
--         }
--      }
--
--      // contract in y
--      for (int ez = 0; ez < o_dofs1D; ++ez)
--      {
--         for (int ey = 0; ey < c_dofs1D; ++ey)
--         {
--            for (int dx = 0; dx < c_dofs1D; ++dx)
--            {
--               for (int j=0; j<3; ++j)
--               {
--                  w2[j][dx][ey][ez] = 0.0;
--                  for (int dy = 0; dy < c_dofs1D; ++dy)
--                  {
--                     w2[j][dx][ey][ez] += Bc(ey, dy) * w1[j][dx][dy][ez];
--                  }
--               }
--            }
--         }
--      }
--
--      // contract in x
--      for (int ez = 0; ez < o_dofs1D; ++ez)
--      {
--         for (int ey = 0; ey < c_dofs1D; ++ey)
--         {
--            for (int ex = 0; ex < c_dofs1D; ++ex)
--            {
--               for (int j=0; j<3; ++j)
--               {
--                  double s = 0.0;
--                  for (int dx = 0; dx < c_dofs1D; ++dx)
--                  {
--                     s += Bc(ex, dx) * w2[j][dx][ey][ez];
--                  }
--                  const int local_index = 2*c_dofs1D*c_dofs1D*o_dofs1D +
--                                          ez*c_dofs1D*c_dofs1D + ey*c_dofs1D + ex;
--                  y(local_index, e) += s * vk(j, local_index, e);
--               }
--            }
--         }
--      }
--   });
--}
--
--static void PAHcurlVecH1IdentityApplyTranspose3D(const int c_dofs1D,
--                                                 const int o_dofs1D,
--                                                 const int NE,
--                                                 const Array<double> &Bclosed,
--                                                 const Array<double> &Bopen,
--                                                 const Vector &pa_data,
--                                                 const Vector &x_,
--                                                 Vector &y_)
--{
--   auto Bc = Reshape(Bclosed.Read(), c_dofs1D, c_dofs1D);
--   auto Bo = Reshape(Bopen.Read(), o_dofs1D, c_dofs1D);
--
--   auto x = Reshape(x_.Read(), (3 * c_dofs1D * c_dofs1D * o_dofs1D), NE);
--   auto y = Reshape(y_.ReadWrite(), c_dofs1D, c_dofs1D, c_dofs1D, 3, NE);
--
--   auto vk = Reshape(pa_data.Read(), 3, (3 * c_dofs1D * c_dofs1D * o_dofs1D),
--                     NE);
--
--   constexpr static int MAX_D1D = HCURL_MAX_D1D;
--
--   MFEM_VERIFY(c_dofs1D <= MAX_D1D && o_dofs1D <= c_dofs1D, "");
--
--   mfem::forall(NE, [=] MFEM_HOST_DEVICE (int e)
--   {
--      double w1[3][MAX_D1D][MAX_D1D][MAX_D1D];
--      double w2[3][MAX_D1D][MAX_D1D][MAX_D1D];
--
--      // dofs that point parallel to x-axis (open in x, closed in y, z)
--
--      // contract in x
--      for (int ez = 0; ez < c_dofs1D; ++ez)
--      {
--         for (int ey = 0; ey < c_dofs1D; ++ey)
--         {
--            for (int j=0; j<3; ++j)
--            {
--               for (int dx = 0; dx < c_dofs1D; ++dx)
--               {
--                  w2[j][dx][ey][ez] = 0.0;
--               }
--               for (int ex = 0; ex < o_dofs1D; ++ex)
--               {
--                  const int local_index = ez*c_dofs1D*o_dofs1D + ey*o_dofs1D + ex;
--                  const double xv = x(local_index, e) * vk(j, local_index, e);
--                  for (int dx = 0; dx < c_dofs1D; ++dx)
--                  {
--                     w2[j][dx][ey][ez] += xv * Bo(ex, dx);
--                  }
--               }
--            }
--         }
--      }
--
--      // contract in y
--      for (int ez = 0; ez < c_dofs1D; ++ez)
--      {
--         for (int dx = 0; dx < c_dofs1D; ++dx)
--         {
--            for (int dy = 0; dy < c_dofs1D; ++dy)
--            {
--               for (int j=0; j<3; ++j)
--               {
--                  w1[j][dx][dy][ez] = 0.0;
--                  for (int ey = 0; ey < c_dofs1D; ++ey)
--                  {
--                     w1[j][dx][dy][ez] += w2[j][dx][ey][ez] * Bc(ey, dy);
--                  }
--               }
--            }
--         }
--      }
--
--      // contract in z
--      for (int dx = 0; dx < c_dofs1D; ++dx)
--      {
--         for (int dy = 0; dy < c_dofs1D; ++dy)
--         {
--            for (int dz = 0; dz < c_dofs1D; ++dz)
--            {
--               for (int j=0; j<3; ++j)
--               {
--                  double s = 0.0;
--                  for (int ez = 0; ez < c_dofs1D; ++ez)
--                  {
--                     s += w1[j][dx][dy][ez] * Bc(ez, dz);
--                  }
--                  y(dx, dy, dz, j, e) += s;
--               }
--            }
--         }
--      }
--
--      // dofs that point parallel to y-axis (open in y, closed in x, z)
--
--      // contract in x
--      for (int ez = 0; ez < c_dofs1D; ++ez)
--      {
--         for (int ey = 0; ey < o_dofs1D; ++ey)
--         {
--            for (int j=0; j<3; ++j)
--            {
--               for (int dx = 0; dx < c_dofs1D; ++dx)
--               {
--                  w2[j][dx][ey][ez] = 0.0;
--               }
--               for (int ex = 0; ex < c_dofs1D; ++ex)
--               {
--                  const int local_index = c_dofs1D*c_dofs1D*o_dofs1D +
--                                          ez*c_dofs1D*o_dofs1D + ey*c_dofs1D + ex;
--                  const double xv = x(local_index, e) * vk(j, local_index, e);
--                  for (int dx = 0; dx < c_dofs1D; ++dx)
--                  {
--                     w2[j][dx][ey][ez] += xv * Bc(ex, dx);
--                  }
--               }
--            }
--         }
--      }
--
--      // contract in y
--      for (int ez = 0; ez < c_dofs1D; ++ez)
--      {
--         for (int dx = 0; dx < c_dofs1D; ++dx)
--         {
--            for (int dy = 0; dy < c_dofs1D; ++dy)
--            {
--               for (int j=0; j<3; ++j)
--               {
--                  w1[j][dx][dy][ez] = 0.0;
--                  for (int ey = 0; ey < o_dofs1D; ++ey)
--                  {
--                     w1[j][dx][dy][ez] += w2[j][dx][ey][ez] * Bo(ey, dy);
--                  }
--               }
--            }
--         }
--      }
--
--      // contract in z
--      for (int dx = 0; dx < c_dofs1D; ++dx)
--      {
--         for (int dy = 0; dy < c_dofs1D; ++dy)
--         {
--            for (int dz = 0; dz < c_dofs1D; ++dz)
--            {
--               for (int j=0; j<3; ++j)
--               {
--                  double s = 0.0;
--                  for (int ez = 0; ez < c_dofs1D; ++ez)
--                  {
--                     s += w1[j][dx][dy][ez] * Bc(ez, dz);
--                  }
--                  y(dx, dy, dz, j, e) += s;
--               }
--            }
--         }
--      }
--
--      // dofs that point parallel to z-axis (open in z, closed in x, y)
--
--      // contract in x
--      for (int ez = 0; ez < o_dofs1D; ++ez)
--      {
--         for (int ey = 0; ey < c_dofs1D; ++ey)
--         {
--            for (int j=0; j<3; ++j)
--            {
--               for (int dx = 0; dx < c_dofs1D; ++dx)
--               {
--                  w2[j][dx][ey][ez] = 0.0;
--               }
--               for (int ex = 0; ex < c_dofs1D; ++ex)
--               {
--                  const int local_index = 2*c_dofs1D*c_dofs1D*o_dofs1D +
--                                          ez*c_dofs1D*c_dofs1D + ey*c_dofs1D + ex;
--                  const double xv = x(local_index, e) * vk(j, local_index, e);
--                  for (int dx = 0; dx < c_dofs1D; ++dx)
--                  {
--                     w2[j][dx][ey][ez] += xv * Bc(ex, dx);
--                  }
--               }
--            }
--         }
--      }
--
--      // contract in y
--      for (int ez = 0; ez < o_dofs1D; ++ez)
--      {
--         for (int dx = 0; dx < c_dofs1D; ++dx)
--         {
--            for (int dy = 0; dy < c_dofs1D; ++dy)
--            {
--               for (int j=0; j<3; ++j)
--               {
--                  w1[j][dx][dy][ez] = 0.0;
--                  for (int ey = 0; ey < c_dofs1D; ++ey)
--                  {
--                     w1[j][dx][dy][ez] += w2[j][dx][ey][ez] * Bc(ey, dy);
--                  }
--               }
--            }
--         }
--      }
--
--      // contract in z
--      for (int dx = 0; dx < c_dofs1D; ++dx)
--      {
--         for (int dy = 0; dy < c_dofs1D; ++dy)
--         {
--            for (int dz = 0; dz < c_dofs1D; ++dz)
--            {
--               for (int j=0; j<3; ++j)
--               {
--                  double s = 0.0;
--                  for (int ez = 0; ez < o_dofs1D; ++ez)
--                  {
--                     s += w1[j][dx][dy][ez] * Bo(ez, dz);
--                  }
--                  y(dx, dy, dz, j, e) += s;
--               }
--            }
--         }
--      }
--   });
--}
--
--void IdentityInterpolator::AssemblePA(const FiniteElementSpace &trial_fes,
--                                      const FiniteElementSpace &test_fes)
--{
--   // Assumes tensor-product elements, with a vector test space and H^1 trial space.
--   Mesh *mesh = trial_fes.GetMesh();
--   const FiniteElement *trial_fel = trial_fes.GetFE(0);
--   const FiniteElement *test_fel = test_fes.GetFE(0);
--
--   const NodalTensorFiniteElement *trial_el =
--      dynamic_cast<const NodalTensorFiniteElement*>(trial_fel);
--   MFEM_VERIFY(trial_el != NULL, "Only NodalTensorFiniteElement is supported!");
--
--   const VectorTensorFiniteElement *test_el =
--      dynamic_cast<const VectorTensorFiniteElement*>(test_fel);
--   MFEM_VERIFY(test_el != NULL, "Only VectorTensorFiniteElement is supported!");
--
--   const int dims = trial_el->GetDim();
--   MFEM_VERIFY(dims == 2 || dims == 3, "");
--
--   dim = mesh->Dimension();
--   MFEM_VERIFY(dim == 2 || dim == 3, "");
--
--   MFEM_VERIFY(trial_el->GetOrder() == test_el->GetOrder(), "");
--
--   ne = trial_fes.GetNE();
--
--   const int order = trial_el->GetOrder();
--   dofquad_fe = new H1_SegmentElement(order);
--   mfem::QuadratureFunctions1D qf1d;
--   mfem::IntegrationRule closed_ir;
--   closed_ir.SetSize(order + 1);
--   qf1d.GaussLobatto(order + 1, &closed_ir);
--   mfem::IntegrationRule open_ir;
--   open_ir.SetSize(order);
--   qf1d.GaussLegendre(order, &open_ir);
--
--   maps_C_C = &dofquad_fe->GetDofToQuad(closed_ir, DofToQuad::TENSOR);
--   maps_O_C = &dofquad_fe->GetDofToQuad(open_ir, DofToQuad::TENSOR);
--
--   o_dofs1D = maps_O_C->nqpt;
--   c_dofs1D = maps_C_C->nqpt;
--   MFEM_VERIFY(maps_O_C->ndof == c_dofs1D &&
--               maps_C_C->ndof == c_dofs1D, "Discrepancy in the number of DOFs");
--
--   const int ndof_test = (dim == 3) ? 3 * c_dofs1D * c_dofs1D * o_dofs1D
--                         : 2 * c_dofs1D * o_dofs1D;
--
--   const IntegrationRule & Nodes = test_el->GetNodes();
--
--   pa_data.SetSize(dim * ndof_test * ne, Device::GetMemoryType());
--   auto op = Reshape(pa_data.HostWrite(), dim, ndof_test, ne);
--
--   const Array<int> &dofmap = test_el->GetDofMap();
--
--   if (dim == 3)
--   {
--      // Note that ND_HexahedronElement uses 6 vectors in tk rather than 3, with
--      // the last 3 having negative signs. Here the signs are all positive, as
--      // signs are applied in ElementRestriction.
--
--      const double tk[9] = { 1.,0.,0.,  0.,1.,0.,  0.,0.,1. };
--
--      for (int c=0; c<3; ++c)
--      {
--         for (int i=0; i<ndof_test/3; ++i)
--         {
--            const int d = (c*ndof_test/3) + i;
--            // ND_HexahedronElement sets dof2tk = (dofmap < 0) ? 3+c : c, but here
--            // no signs should be applied due to ElementRestriction.
--            const int dof2tk = c;
--            const int id = (dofmap[d] >= 0) ? dofmap[d] : -1 - dofmap[d];
--
--            for (int e=0; e<ne; ++e)
--            {
--               double v[3];
--               ElementTransformation *tr = mesh->GetElementTransformation(e);
--               tr->SetIntPoint(&Nodes.IntPoint(id));
--               tr->Jacobian().Mult(tk + dof2tk*dim, v);
--
--               for (int j=0; j<3; ++j)
--               {
--                  op(j,d,e) = v[j];
--               }
--            }
--         }
--      }
--   }
--   else // 2D case
--   {
--      const double tk[4] = { 1.,0.,  0.,1. };
--      for (int c=0; c<2; ++c)
--      {
--         for (int i=0; i<ndof_test/2; ++i)
--         {
--            const int d = (c*ndof_test/2) + i;
--            // ND_QuadrilateralElement sets dof2tk = (dofmap < 0) ? 2+c : c, but here
--            // no signs should be applied due to ElementRestriction.
--            const int dof2tk = c;
--            const int id = (dofmap[d] >= 0) ? dofmap[d] : -1 - dofmap[d];
--
--            for (int e=0; e<ne; ++e)
--            {
--               double v[2];
--               ElementTransformation *tr = mesh->GetElementTransformation(e);
--               tr->SetIntPoint(&Nodes.IntPoint(id));
--               tr->Jacobian().Mult(tk + dof2tk*dim, v);
--
--               for (int j=0; j<2; ++j)
--               {
--                  op(j,d,e) = v[j];
--               }
--            }
--         }
--      }
--   }
--}
--
--void IdentityInterpolator::AddMultPA(const Vector &x, Vector &y) const
--{
--   if (dim == 3)
--   {
--      PAHcurlVecH1IdentityApply3D(c_dofs1D, o_dofs1D, ne, maps_C_C->B, maps_O_C->B,
--                                  pa_data, x, y);
--   }
--   else if (dim == 2)
--   {
--      PAHcurlVecH1IdentityApply2D(c_dofs1D, o_dofs1D, ne, maps_C_C->B, maps_O_C->B,
--                                  pa_data, x, y);
--   }
--   else
--   {
--      mfem_error("Bad dimension!");
--   }
--}
--
--void IdentityInterpolator::AddMultTransposePA(const Vector &x, Vector &y) const
--{
--   if (dim == 3)
--   {
--      PAHcurlVecH1IdentityApplyTranspose3D(c_dofs1D, o_dofs1D, ne, maps_C_C->B,
--                                           maps_O_C->B, pa_data, x, y);
--   }
--   else if (dim == 2)
--   {
--      PAHcurlVecH1IdentityApplyTranspose2D(c_dofs1D, o_dofs1D, ne, maps_C_C->B,
--                                           maps_O_C->B, pa_data, x, y);
--   }
--   else
--   {
--      mfem_error("Bad dimension!");
-    }
- }
- 
-diff --git a/fem/integ/bilininteg_interp_id_pa.cpp b/fem/integ/bilininteg_interp_id_pa.cpp
-new file mode 100644
-index 000000000..efabe7c80
---- /dev/null
-+++ b/fem/integ/bilininteg_interp_id_pa.cpp
-@@ -0,0 +1,843 @@
-+// Copyright (c) 2010-2023, Lawrence Livermore National Security, LLC. Produced
-+// at the Lawrence Livermore National Laboratory. All Rights reserved. See files
-+// LICENSE and NOTICE for details. LLNL-CODE-806117.
-+//
-+// This file is part of the MFEM library. For more information and source code
-+// availability visit https://mfem.org.
-+//
-+// MFEM is free software; you can redistribute it and/or modify it under the
-+// terms of the BSD-3 license. We welcome feedback and contributions, see file
-+// CONTRIBUTING.md for details.
-+
-+#include "../../general/forall.hpp"
-+#include "../bilininteg.hpp"
-+#include "../gridfunc.hpp"
-+#include "../qfunction.hpp"
-+#include "../ceed/integrators/interp/interp.hpp"
-+
-+namespace mfem
-+{
-+
-+void IdentityInterpolator::AssemblePA(const FiniteElementSpace &trial_fes,
-+                                      const FiniteElementSpace &test_fes)
-+{
-+   Mesh *mesh = trial_fes.GetMesh();
-+   if (mesh->GetNE() == 0) { return; }
-+   if (DeviceCanUseCeed())
-+   {
-+      delete ceedOp;
-+      ceedOp = new ceed::PADiscreteInterpolator(*this, trial_fes, test_fes);
-+      return;
-+   }
-+
-+   // Assumes tensor-product elements, with a vector test space and H^1 trial space.
-+   const FiniteElement *trial_fel = trial_fes.GetFE(0);
-+   const FiniteElement *test_fel = test_fes.GetFE(0);
-+
-+   const NodalTensorFiniteElement *trial_el =
-+      dynamic_cast<const NodalTensorFiniteElement*>(trial_fel);
-+   MFEM_VERIFY(trial_el != NULL, "Only NodalTensorFiniteElement is supported!");
-+
-+   const VectorTensorFiniteElement *test_el =
-+      dynamic_cast<const VectorTensorFiniteElement*>(test_fel);
-+   MFEM_VERIFY(test_el != NULL, "Only VectorTensorFiniteElement is supported!");
-+
-+   const int dims = trial_el->GetDim();
-+   MFEM_VERIFY(dims == 2 || dims == 3, "");
-+
-+   dim = mesh->Dimension();
-+   MFEM_VERIFY(dim == 2 || dim == 3, "");
-+
-+   MFEM_VERIFY(trial_el->GetOrder() == test_el->GetOrder(), "");
-+
-+   ne = trial_fes.GetNE();
-+
-+   const int order = trial_el->GetOrder();
-+   dofquad_fe = new H1_SegmentElement(order);
-+   mfem::QuadratureFunctions1D qf1d;
-+   mfem::IntegrationRule closed_ir;
-+   closed_ir.SetSize(order + 1);
-+   qf1d.GaussLobatto(order + 1, &closed_ir);
-+   mfem::IntegrationRule open_ir;
-+   open_ir.SetSize(order);
-+   qf1d.GaussLegendre(order, &open_ir);
-+
-+   maps_C_C = &dofquad_fe->GetDofToQuad(closed_ir, DofToQuad::TENSOR);
-+   maps_O_C = &dofquad_fe->GetDofToQuad(open_ir, DofToQuad::TENSOR);
-+
-+   o_dofs1D = maps_O_C->nqpt;
-+   c_dofs1D = maps_C_C->nqpt;
-+   MFEM_VERIFY(maps_O_C->ndof == c_dofs1D &&
-+               maps_C_C->ndof == c_dofs1D, "Discrepancy in the number of DOFs");
-+
-+   const int ndof_test = (dim == 3) ? 3 * c_dofs1D * c_dofs1D * o_dofs1D
-+                         : 2 * c_dofs1D * o_dofs1D;
-+
-+   const IntegrationRule & Nodes = test_el->GetNodes();
-+
-+   pa_data.SetSize(dim * ndof_test * ne, Device::GetMemoryType());
-+   auto op = Reshape(pa_data.HostWrite(), dim, ndof_test, ne);
-+
-+   const Array<int> &dofmap = test_el->GetDofMap();
-+
-+   if (dim == 3)
-+   {
-+      // Note that ND_HexahedronElement uses 6 vectors in tk rather than 3, with
-+      // the last 3 having negative signs. Here the signs are all positive, as
-+      // signs are applied in ElementRestriction.
-+
-+      const double tk[9] = { 1.,0.,0.,  0.,1.,0.,  0.,0.,1. };
-+
-+      for (int c=0; c<3; ++c)
-+      {
-+         for (int i=0; i<ndof_test/3; ++i)
-+         {
-+            const int d = (c*ndof_test/3) + i;
-+            // ND_HexahedronElement sets dof2tk = (dofmap < 0) ? 3+c : c, but here
-+            // no signs should be applied due to ElementRestriction.
-+            const int dof2tk = c;
-+            const int id = (dofmap[d] >= 0) ? dofmap[d] : -1 - dofmap[d];
-+
-+            for (int e=0; e<ne; ++e)
-+            {
-+               double v[3];
-+               ElementTransformation *tr = mesh->GetElementTransformation(e);
-+               tr->SetIntPoint(&Nodes.IntPoint(id));
-+               tr->Jacobian().Mult(tk + dof2tk*dim, v);
-+
-+               for (int j=0; j<3; ++j)
-+               {
-+                  op(j,d,e) = v[j];
-+               }
-+            }
-+         }
-+      }
-+   }
-+   else // 2D case
-+   {
-+      const double tk[4] = { 1.,0.,  0.,1. };
-+      for (int c=0; c<2; ++c)
-+      {
-+         for (int i=0; i<ndof_test/2; ++i)
-+         {
-+            const int d = (c*ndof_test/2) + i;
-+            // ND_QuadrilateralElement sets dof2tk = (dofmap < 0) ? 2+c : c, but here
-+            // no signs should be applied due to ElementRestriction.
-+            const int dof2tk = c;
-+            const int id = (dofmap[d] >= 0) ? dofmap[d] : -1 - dofmap[d];
-+
-+            for (int e=0; e<ne; ++e)
-+            {
-+               double v[2];
-+               ElementTransformation *tr = mesh->GetElementTransformation(e);
-+               tr->SetIntPoint(&Nodes.IntPoint(id));
-+               tr->Jacobian().Mult(tk + dof2tk*dim, v);
-+
-+               for (int j=0; j<2; ++j)
-+               {
-+                  op(j,d,e) = v[j];
-+               }
-+            }
-+         }
-+      }
-+   }
-+}
-+
-+static void PAHcurlVecH1IdentityApply2D(const int c_dofs1D,
-+                                        const int o_dofs1D,
-+                                        const int NE,
-+                                        const Array<double> &Bclosed,
-+                                        const Array<double> &Bopen,
-+                                        const Vector &pa_data,
-+                                        const Vector &x_,
-+                                        Vector &y_)
-+{
-+   auto Bc = Reshape(Bclosed.Read(), c_dofs1D, c_dofs1D);
-+   auto Bo = Reshape(Bopen.Read(), o_dofs1D, c_dofs1D);
-+
-+   auto x = Reshape(x_.Read(), c_dofs1D, c_dofs1D, 2, NE);
-+   auto y = Reshape(y_.ReadWrite(), (2 * c_dofs1D * o_dofs1D), NE);
-+
-+   auto vk = Reshape(pa_data.Read(), 2, (2 * c_dofs1D * o_dofs1D), NE);
-+
-+   constexpr static int MAX_D1D = HCURL_MAX_D1D;
-+
-+   MFEM_VERIFY(c_dofs1D <= MAX_D1D && o_dofs1D <= c_dofs1D, "");
-+
-+   mfem::forall(NE, [=] MFEM_HOST_DEVICE (int e)
-+   {
-+      double w[2][MAX_D1D][MAX_D1D];
-+
-+      // dofs that point parallel to x-axis (open in x, closed in y)
-+
-+      // contract in y
-+      for (int ey = 0; ey < c_dofs1D; ++ey)
-+      {
-+         for (int dx = 0; dx < c_dofs1D; ++dx)
-+         {
-+            for (int j=0; j<2; ++j)
-+            {
-+               w[j][dx][ey] = 0.0;
-+               for (int dy = 0; dy < c_dofs1D; ++dy)
-+               {
-+                  w[j][dx][ey] += Bc(ey, dy) * x(dx, dy, j, e);
-+               }
-+            }
-+         }
-+      }
-+
-+      // contract in x
-+      for (int ey = 0; ey < c_dofs1D; ++ey)
-+      {
-+         for (int ex = 0; ex < o_dofs1D; ++ex)
-+         {
-+            for (int j=0; j<2; ++j)
-+            {
-+               double s = 0.0;
-+               for (int dx = 0; dx < c_dofs1D; ++dx)
-+               {
-+                  s += Bo(ex, dx) * w[j][dx][ey];
-+               }
-+               const int local_index = ey*o_dofs1D + ex;
-+               y(local_index, e) += s * vk(j, local_index, e);
-+            }
-+         }
-+      }
-+
-+      // dofs that point parallel to y-axis (open in y, closed in x)
-+
-+      // contract in y
-+      for (int ey = 0; ey < o_dofs1D; ++ey)
-+      {
-+         for (int dx = 0; dx < c_dofs1D; ++dx)
-+         {
-+            for (int j=0; j<2; ++j)
-+            {
-+               w[j][dx][ey] = 0.0;
-+               for (int dy = 0; dy < c_dofs1D; ++dy)
-+               {
-+                  w[j][dx][ey] += Bo(ey, dy) * x(dx, dy, j, e);
-+               }
-+            }
-+         }
-+      }
-+
-+      // contract in x
-+      for (int ey = 0; ey < o_dofs1D; ++ey)
-+      {
-+         for (int ex = 0; ex < c_dofs1D; ++ex)
-+         {
-+            for (int j=0; j<2; ++j)
-+            {
-+               double s = 0.0;
-+               for (int dx = 0; dx < c_dofs1D; ++dx)
-+               {
-+                  s += Bc(ex, dx) * w[j][dx][ey];
-+               }
-+               const int local_index = c_dofs1D*o_dofs1D + ey*c_dofs1D + ex;
-+               y(local_index, e) += s * vk(j, local_index, e);
-+            }
-+         }
-+      }
-+   });
-+}
-+
-+static void PAHcurlVecH1IdentityApplyTranspose2D(const int c_dofs1D,
-+                                                 const int o_dofs1D,
-+                                                 const int NE,
-+                                                 const Array<double> &Bclosed,
-+                                                 const Array<double> &Bopen,
-+                                                 const Vector &pa_data,
-+                                                 const Vector &x_,
-+                                                 Vector &y_)
-+{
-+   auto Bc = Reshape(Bclosed.Read(), c_dofs1D, c_dofs1D);
-+   auto Bo = Reshape(Bopen.Read(), o_dofs1D, c_dofs1D);
-+
-+   auto x = Reshape(x_.Read(), (2 * c_dofs1D * o_dofs1D), NE);
-+   auto y = Reshape(y_.ReadWrite(), c_dofs1D, c_dofs1D, 2, NE);
-+
-+   auto vk = Reshape(pa_data.Read(), 2, (2 * c_dofs1D * o_dofs1D), NE);
-+
-+   constexpr static int MAX_D1D = HCURL_MAX_D1D;
-+   //constexpr static int MAX_Q1D = HCURL_MAX_Q1D;
-+
-+   MFEM_VERIFY(c_dofs1D <= MAX_D1D && o_dofs1D <= c_dofs1D, "");
-+
-+   mfem::forall(NE, [=] MFEM_HOST_DEVICE (int e)
-+   {
-+      double w[2][MAX_D1D][MAX_D1D];
-+
-+      // dofs that point parallel to x-axis (open in x, closed in y)
-+
-+      // contract in x
-+      for (int ey = 0; ey < c_dofs1D; ++ey)
-+      {
-+         for (int dx = 0; dx < c_dofs1D; ++dx)
-+         {
-+            for (int j=0; j<2; ++j) { w[j][dx][ey] = 0.0; }
-+         }
-+         for (int ex = 0; ex < o_dofs1D; ++ex)
-+         {
-+            const int local_index = ey*o_dofs1D + ex;
-+            const double xd = x(local_index, e);
-+
-+            for (int dx = 0; dx < c_dofs1D; ++dx)
-+            {
-+               for (int j=0; j<2; ++j)
-+               {
-+                  w[j][dx][ey] += Bo(ex, dx) * xd * vk(j, local_index, e);
-+               }
-+            }
-+         }
-+      }
-+
-+      // contract in y
-+      for (int dx = 0; dx < c_dofs1D; ++dx)
-+      {
-+         for (int dy = 0; dy < c_dofs1D; ++dy)
-+         {
-+            for (int j=0; j<2; ++j)
-+            {
-+               double s = 0.0;
-+               for (int ey = 0; ey < c_dofs1D; ++ey)
-+               {
-+                  s += w[j][dx][ey] * Bc(ey, dy);
-+               }
-+               y(dx, dy, j, e) += s;
-+            }
-+         }
-+      }
-+
-+      // dofs that point parallel to y-axis (open in y, closed in x)
-+
-+      // contract in x
-+      for (int ey = 0; ey < o_dofs1D; ++ey)
-+      {
-+         for (int dx = 0; dx < c_dofs1D; ++dx)
-+         {
-+            for (int j=0; j<2; ++j) { w[j][dx][ey] = 0.0; }
-+         }
-+         for (int ex = 0; ex < c_dofs1D; ++ex)
-+         {
-+            const int local_index = c_dofs1D*o_dofs1D + ey*c_dofs1D + ex;
-+            const double xd = x(local_index, e);
-+            for (int dx = 0; dx < c_dofs1D; ++dx)
-+            {
-+               for (int j=0; j<2; ++j)
-+               {
-+                  w[j][dx][ey] += Bc(ex, dx) * xd * vk(j, local_index, e);
-+               }
-+            }
-+         }
-+      }
-+
-+      // contract in y
-+      for (int dx = 0; dx < c_dofs1D; ++dx)
-+      {
-+         for (int dy = 0; dy < c_dofs1D; ++dy)
-+         {
-+            for (int j=0; j<2; ++j)
-+            {
-+               double s = 0.0;
-+               for (int ey = 0; ey < o_dofs1D; ++ey)
-+               {
-+                  s += w[j][dx][ey] * Bo(ey, dy);
-+               }
-+               y(dx, dy, j, e) += s;
-+            }
-+         }
-+      }
-+   });
-+}
-+
-+static void PAHcurlVecH1IdentityApply3D(const int c_dofs1D,
-+                                        const int o_dofs1D,
-+                                        const int NE,
-+                                        const Array<double> &Bclosed,
-+                                        const Array<double> &Bopen,
-+                                        const Vector &pa_data,
-+                                        const Vector &x_,
-+                                        Vector &y_)
-+{
-+   auto Bc = Reshape(Bclosed.Read(), c_dofs1D, c_dofs1D);
-+   auto Bo = Reshape(Bopen.Read(), o_dofs1D, c_dofs1D);
-+
-+   auto x = Reshape(x_.Read(), c_dofs1D, c_dofs1D, c_dofs1D, 3, NE);
-+   auto y = Reshape(y_.ReadWrite(), (3 * c_dofs1D * c_dofs1D * o_dofs1D), NE);
-+
-+   auto vk = Reshape(pa_data.Read(), 3, (3 * c_dofs1D * c_dofs1D * o_dofs1D),
-+                     NE);
-+
-+   constexpr static int MAX_D1D = HCURL_MAX_D1D;
-+   MFEM_VERIFY(c_dofs1D <= MAX_D1D && o_dofs1D <= c_dofs1D, "");
-+
-+   mfem::forall(NE, [=] MFEM_HOST_DEVICE (int e)
-+   {
-+      double w1[3][MAX_D1D][MAX_D1D][MAX_D1D];
-+      double w2[3][MAX_D1D][MAX_D1D][MAX_D1D];
-+
-+      // dofs that point parallel to x-axis (open in x, closed in y, z)
-+
-+      // contract in z
-+      for (int ez = 0; ez < c_dofs1D; ++ez)
-+      {
-+         for (int dx = 0; dx < c_dofs1D; ++dx)
-+         {
-+            for (int dy = 0; dy < c_dofs1D; ++dy)
-+            {
-+               for (int j=0; j<3; ++j)
-+               {
-+                  w1[j][dx][dy][ez] = 0.0;
-+                  for (int dz = 0; dz < c_dofs1D; ++dz)
-+                  {
-+                     w1[j][dx][dy][ez] += Bc(ez, dz) * x(dx, dy, dz, j, e);
-+                  }
-+               }
-+            }
-+         }
-+      }
-+
-+      // contract in y
-+      for (int ez = 0; ez < c_dofs1D; ++ez)
-+      {
-+         for (int ey = 0; ey < c_dofs1D; ++ey)
-+         {
-+            for (int dx = 0; dx < c_dofs1D; ++dx)
-+            {
-+               for (int j=0; j<3; ++j)
-+               {
-+                  w2[j][dx][ey][ez] = 0.0;
-+                  for (int dy = 0; dy < c_dofs1D; ++dy)
-+                  {
-+                     w2[j][dx][ey][ez] += Bc(ey, dy) * w1[j][dx][dy][ez];
-+                  }
-+               }
-+            }
-+         }
-+      }
-+
-+      // contract in x
-+      for (int ez = 0; ez < c_dofs1D; ++ez)
-+      {
-+         for (int ey = 0; ey < c_dofs1D; ++ey)
-+         {
-+            for (int ex = 0; ex < o_dofs1D; ++ex)
-+            {
-+               for (int j=0; j<3; ++j)
-+               {
-+                  double s = 0.0;
-+                  for (int dx = 0; dx < c_dofs1D; ++dx)
-+                  {
-+                     s += Bo(ex, dx) * w2[j][dx][ey][ez];
-+                  }
-+                  const int local_index = ez*c_dofs1D*o_dofs1D + ey*o_dofs1D + ex;
-+                  y(local_index, e) += s * vk(j, local_index, e);
-+               }
-+            }
-+         }
-+      }
-+
-+      // dofs that point parallel to y-axis (open in y, closed in x, z)
-+
-+      // contract in z
-+      for (int ez = 0; ez < c_dofs1D; ++ez)
-+      {
-+         for (int dx = 0; dx < c_dofs1D; ++dx)
-+         {
-+            for (int dy = 0; dy < c_dofs1D; ++dy)
-+            {
-+               for (int j=0; j<3; ++j)
-+               {
-+                  w1[j][dx][dy][ez] = 0.0;
-+                  for (int dz = 0; dz < c_dofs1D; ++dz)
-+                  {
-+                     w1[j][dx][dy][ez] += Bc(ez, dz) * x(dx, dy, dz, j, e);
-+                  }
-+               }
-+            }
-+         }
-+      }
-+
-+      // contract in y
-+      for (int ez = 0; ez < c_dofs1D; ++ez)
-+      {
-+         for (int ey = 0; ey < o_dofs1D; ++ey)
-+         {
-+            for (int dx = 0; dx < c_dofs1D; ++dx)
-+            {
-+               for (int j=0; j<3; ++j)
-+               {
-+                  w2[j][dx][ey][ez] = 0.0;
-+                  for (int dy = 0; dy < c_dofs1D; ++dy)
-+                  {
-+                     w2[j][dx][ey][ez] += Bo(ey, dy) * w1[j][dx][dy][ez];
-+                  }
-+               }
-+            }
-+         }
-+      }
-+
-+      // contract in x
-+      for (int ez = 0; ez < c_dofs1D; ++ez)
-+      {
-+         for (int ey = 0; ey < o_dofs1D; ++ey)
-+         {
-+            for (int ex = 0; ex < c_dofs1D; ++ex)
-+            {
-+               for (int j=0; j<3; ++j)
-+               {
-+                  double s = 0.0;
-+                  for (int dx = 0; dx < c_dofs1D; ++dx)
-+                  {
-+                     s += Bc(ex, dx) * w2[j][dx][ey][ez];
-+                  }
-+                  const int local_index = c_dofs1D*c_dofs1D*o_dofs1D +
-+                                          ez*c_dofs1D*o_dofs1D + ey*c_dofs1D + ex;
-+                  y(local_index, e) += s * vk(j, local_index, e);
-+               }
-+            }
-+         }
-+      }
-+
-+      // dofs that point parallel to z-axis (open in z, closed in x, y)
-+
-+      // contract in z
-+      for (int ez = 0; ez < o_dofs1D; ++ez)
-+      {
-+         for (int dx = 0; dx < c_dofs1D; ++dx)
-+         {
-+            for (int dy = 0; dy < c_dofs1D; ++dy)
-+            {
-+               for (int j=0; j<3; ++j)
-+               {
-+                  w1[j][dx][dy][ez] = 0.0;
-+                  for (int dz = 0; dz < c_dofs1D; ++dz)
-+                  {
-+                     w1[j][dx][dy][ez] += Bo(ez, dz) * x(dx, dy, dz, j, e);
-+                  }
-+               }
-+            }
-+         }
-+      }
-+
-+      // contract in y
-+      for (int ez = 0; ez < o_dofs1D; ++ez)
-+      {
-+         for (int ey = 0; ey < c_dofs1D; ++ey)
-+         {
-+            for (int dx = 0; dx < c_dofs1D; ++dx)
-+            {
-+               for (int j=0; j<3; ++j)
-+               {
-+                  w2[j][dx][ey][ez] = 0.0;
-+                  for (int dy = 0; dy < c_dofs1D; ++dy)
-+                  {
-+                     w2[j][dx][ey][ez] += Bc(ey, dy) * w1[j][dx][dy][ez];
-+                  }
-+               }
-+            }
-+         }
-+      }
-+
-+      // contract in x
-+      for (int ez = 0; ez < o_dofs1D; ++ez)
-+      {
-+         for (int ey = 0; ey < c_dofs1D; ++ey)
-+         {
-+            for (int ex = 0; ex < c_dofs1D; ++ex)
-+            {
-+               for (int j=0; j<3; ++j)
-+               {
-+                  double s = 0.0;
-+                  for (int dx = 0; dx < c_dofs1D; ++dx)
-+                  {
-+                     s += Bc(ex, dx) * w2[j][dx][ey][ez];
-+                  }
-+                  const int local_index = 2*c_dofs1D*c_dofs1D*o_dofs1D +
-+                                          ez*c_dofs1D*c_dofs1D + ey*c_dofs1D + ex;
-+                  y(local_index, e) += s * vk(j, local_index, e);
-+               }
-+            }
-+         }
-+      }
-+   });
-+}
-+
-+static void PAHcurlVecH1IdentityApplyTranspose3D(const int c_dofs1D,
-+                                                 const int o_dofs1D,
-+                                                 const int NE,
-+                                                 const Array<double> &Bclosed,
-+                                                 const Array<double> &Bopen,
-+                                                 const Vector &pa_data,
-+                                                 const Vector &x_,
-+                                                 Vector &y_)
-+{
-+   auto Bc = Reshape(Bclosed.Read(), c_dofs1D, c_dofs1D);
-+   auto Bo = Reshape(Bopen.Read(), o_dofs1D, c_dofs1D);
-+
-+   auto x = Reshape(x_.Read(), (3 * c_dofs1D * c_dofs1D * o_dofs1D), NE);
-+   auto y = Reshape(y_.ReadWrite(), c_dofs1D, c_dofs1D, c_dofs1D, 3, NE);
-+
-+   auto vk = Reshape(pa_data.Read(), 3, (3 * c_dofs1D * c_dofs1D * o_dofs1D),
-+                     NE);
-+
-+   constexpr static int MAX_D1D = HCURL_MAX_D1D;
-+
-+   MFEM_VERIFY(c_dofs1D <= MAX_D1D && o_dofs1D <= c_dofs1D, "");
-+
-+   mfem::forall(NE, [=] MFEM_HOST_DEVICE (int e)
-+   {
-+      double w1[3][MAX_D1D][MAX_D1D][MAX_D1D];
-+      double w2[3][MAX_D1D][MAX_D1D][MAX_D1D];
-+
-+      // dofs that point parallel to x-axis (open in x, closed in y, z)
-+
-+      // contract in x
-+      for (int ez = 0; ez < c_dofs1D; ++ez)
-+      {
-+         for (int ey = 0; ey < c_dofs1D; ++ey)
-+         {
-+            for (int j=0; j<3; ++j)
-+            {
-+               for (int dx = 0; dx < c_dofs1D; ++dx)
-+               {
-+                  w2[j][dx][ey][ez] = 0.0;
-+               }
-+               for (int ex = 0; ex < o_dofs1D; ++ex)
-+               {
-+                  const int local_index = ez*c_dofs1D*o_dofs1D + ey*o_dofs1D + ex;
-+                  const double xv = x(local_index, e) * vk(j, local_index, e);
-+                  for (int dx = 0; dx < c_dofs1D; ++dx)
-+                  {
-+                     w2[j][dx][ey][ez] += xv * Bo(ex, dx);
-+                  }
-+               }
-+            }
-+         }
-+      }
-+
-+      // contract in y
-+      for (int ez = 0; ez < c_dofs1D; ++ez)
-+      {
-+         for (int dx = 0; dx < c_dofs1D; ++dx)
-+         {
-+            for (int dy = 0; dy < c_dofs1D; ++dy)
-+            {
-+               for (int j=0; j<3; ++j)
-+               {
-+                  w1[j][dx][dy][ez] = 0.0;
-+                  for (int ey = 0; ey < c_dofs1D; ++ey)
-+                  {
-+                     w1[j][dx][dy][ez] += w2[j][dx][ey][ez] * Bc(ey, dy);
-+                  }
-+               }
-+            }
-+         }
-+      }
-+
-+      // contract in z
-+      for (int dx = 0; dx < c_dofs1D; ++dx)
-+      {
-+         for (int dy = 0; dy < c_dofs1D; ++dy)
-+         {
-+            for (int dz = 0; dz < c_dofs1D; ++dz)
-+            {
-+               for (int j=0; j<3; ++j)
-+               {
-+                  double s = 0.0;
-+                  for (int ez = 0; ez < c_dofs1D; ++ez)
-+                  {
-+                     s += w1[j][dx][dy][ez] * Bc(ez, dz);
-+                  }
-+                  y(dx, dy, dz, j, e) += s;
-+               }
-+            }
-+         }
-+      }
-+
-+      // dofs that point parallel to y-axis (open in y, closed in x, z)
-+
-+      // contract in x
-+      for (int ez = 0; ez < c_dofs1D; ++ez)
-+      {
-+         for (int ey = 0; ey < o_dofs1D; ++ey)
-+         {
-+            for (int j=0; j<3; ++j)
-+            {
-+               for (int dx = 0; dx < c_dofs1D; ++dx)
-+               {
-+                  w2[j][dx][ey][ez] = 0.0;
-+               }
-+               for (int ex = 0; ex < c_dofs1D; ++ex)
-+               {
-+                  const int local_index = c_dofs1D*c_dofs1D*o_dofs1D +
-+                                          ez*c_dofs1D*o_dofs1D + ey*c_dofs1D + ex;
-+                  const double xv = x(local_index, e) * vk(j, local_index, e);
-+                  for (int dx = 0; dx < c_dofs1D; ++dx)
-+                  {
-+                     w2[j][dx][ey][ez] += xv * Bc(ex, dx);
-+                  }
-+               }
-+            }
-+         }
-+      }
-+
-+      // contract in y
-+      for (int ez = 0; ez < c_dofs1D; ++ez)
-+      {
-+         for (int dx = 0; dx < c_dofs1D; ++dx)
-+         {
-+            for (int dy = 0; dy < c_dofs1D; ++dy)
-+            {
-+               for (int j=0; j<3; ++j)
-+               {
-+                  w1[j][dx][dy][ez] = 0.0;
-+                  for (int ey = 0; ey < o_dofs1D; ++ey)
-+                  {
-+                     w1[j][dx][dy][ez] += w2[j][dx][ey][ez] * Bo(ey, dy);
-+                  }
-+               }
-+            }
-+         }
-+      }
-+
-+      // contract in z
-+      for (int dx = 0; dx < c_dofs1D; ++dx)
-+      {
-+         for (int dy = 0; dy < c_dofs1D; ++dy)
-+         {
-+            for (int dz = 0; dz < c_dofs1D; ++dz)
-+            {
-+               for (int j=0; j<3; ++j)
-+               {
-+                  double s = 0.0;
-+                  for (int ez = 0; ez < c_dofs1D; ++ez)
-+                  {
-+                     s += w1[j][dx][dy][ez] * Bc(ez, dz);
-+                  }
-+                  y(dx, dy, dz, j, e) += s;
-+               }
-+            }
-+         }
-+      }
-+
-+      // dofs that point parallel to z-axis (open in z, closed in x, y)
-+
-+      // contract in x
-+      for (int ez = 0; ez < o_dofs1D; ++ez)
-+      {
-+         for (int ey = 0; ey < c_dofs1D; ++ey)
-+         {
-+            for (int j=0; j<3; ++j)
-+            {
-+               for (int dx = 0; dx < c_dofs1D; ++dx)
-+               {
-+                  w2[j][dx][ey][ez] = 0.0;
-+               }
-+               for (int ex = 0; ex < c_dofs1D; ++ex)
-+               {
-+                  const int local_index = 2*c_dofs1D*c_dofs1D*o_dofs1D +
-+                                          ez*c_dofs1D*c_dofs1D + ey*c_dofs1D + ex;
-+                  const double xv = x(local_index, e) * vk(j, local_index, e);
-+                  for (int dx = 0; dx < c_dofs1D; ++dx)
-+                  {
-+                     w2[j][dx][ey][ez] += xv * Bc(ex, dx);
-+                  }
-+               }
-+            }
-+         }
-+      }
-+
-+      // contract in y
-+      for (int ez = 0; ez < o_dofs1D; ++ez)
-+      {
-+         for (int dx = 0; dx < c_dofs1D; ++dx)
-+         {
-+            for (int dy = 0; dy < c_dofs1D; ++dy)
-+            {
-+               for (int j=0; j<3; ++j)
-+               {
-+                  w1[j][dx][dy][ez] = 0.0;
-+                  for (int ey = 0; ey < c_dofs1D; ++ey)
-+                  {
-+                     w1[j][dx][dy][ez] += w2[j][dx][ey][ez] * Bc(ey, dy);
-+                  }
-+               }
-+            }
-+         }
-+      }
-+
-+      // contract in z
-+      for (int dx = 0; dx < c_dofs1D; ++dx)
-+      {
-+         for (int dy = 0; dy < c_dofs1D; ++dy)
-+         {
-+            for (int dz = 0; dz < c_dofs1D; ++dz)
-+            {
-+               for (int j=0; j<3; ++j)
-+               {
-+                  double s = 0.0;
-+                  for (int ez = 0; ez < o_dofs1D; ++ez)
-+                  {
-+                     s += w1[j][dx][dy][ez] * Bo(ez, dz);
-+                  }
-+                  y(dx, dy, dz, j, e) += s;
-+               }
-+            }
-+         }
-+      }
-+   });
-+}
-+
-+void IdentityInterpolator::AddMultPA(const Vector &x, Vector &y) const
-+{
-+   if (DeviceCanUseCeed())
-+   {
-+      ceedOp->AddMult(x, y);
-+   }
-+   else
-+   {
-+      if (dim == 3)
-+      {
-+         PAHcurlVecH1IdentityApply3D(c_dofs1D, o_dofs1D, ne, maps_C_C->B, maps_O_C->B,
-+                                     pa_data, x, y);
-+      }
-+      else if (dim == 2)
-+      {
-+         PAHcurlVecH1IdentityApply2D(c_dofs1D, o_dofs1D, ne, maps_C_C->B, maps_O_C->B,
-+                                     pa_data, x, y);
-+      }
-+      else
-+      {
-+         MFEM_ABORT("Bad dimension!");
-+      }
-+   }
-+}
-+
-+void IdentityInterpolator::AddMultTransposePA(const Vector &x, Vector &y) const
-+{
-+   if (DeviceCanUseCeed())
-+   {
-+      ceedOp->AddMultTranspose(x, y);
-+   }
-+   else
-+   {
-+      if (dim == 3)
-+      {
-+         PAHcurlVecH1IdentityApplyTranspose3D(c_dofs1D, o_dofs1D, ne, maps_C_C->B,
-+                                              maps_O_C->B, pa_data, x, y);
-+      }
-+      else if (dim == 2)
-+      {
-+         PAHcurlVecH1IdentityApplyTranspose2D(c_dofs1D, o_dofs1D, ne, maps_C_C->B,
-+                                              maps_O_C->B, pa_data, x, y);
-+      }
-+      else
-+      {
-+         MFEM_ABORT("Bad dimension!");
-+      }
-+   }
-+}
-+
-+} // namespace mfem
-diff --git a/fem/integ/bilininteg_mass_mf.cpp b/fem/integ/bilininteg_mass_mf.cpp
-index 34a118b6d..41ab07b94 100644
---- a/fem/integ/bilininteg_mass_mf.cpp
-+++ b/fem/integ/bilininteg_mass_mf.cpp
-@@ -19,42 +19,40 @@ namespace mfem
- 
- void MassIntegrator::AssembleMF(const FiniteElementSpace &fes)
- {
--   // Assuming the same element type
-    Mesh *mesh = fes.GetMesh();
-    if (mesh->GetNE() == 0) { return; }
--   const FiniteElement &el = *fes.GetFE(0);
--   ElementTransformation *T = mesh->GetElementTransformation(0);
--   const IntegrationRule *ir = IntRule ? IntRule : &GetRule(el, el, *T);
-    if (DeviceCanUseCeed())
-    {
-       delete ceedOp;
--      const bool mixed = mesh->GetNumGeometries(mesh->Dimension()) > 1 ||
--                         fes.IsVariableOrder();
--      if (mixed)
--      {
--         ceedOp = new ceed::MixedMFMassIntegrator(*this, fes, Q);
--      }
--      else
--      {
--         ceedOp = new ceed::MFMassIntegrator(fes, *ir, Q);
--      }
-+      ceedOp = new ceed::MFMassIntegrator(*this, fes, Q);
-       return;
-    }
-+
-+   // Assuming the same element type
-+   // const FiniteElement &el = *fes.GetFE(0);
-+   // ElementTransformation &T = *mesh->GetElementTransformation(0);
-+   // const IntegrationRule *ir = IntRule ? IntRule : &GetRule(el, T);
-    MFEM_ABORT("Error: MassIntegrator::AssembleMF only implemented with"
-               " libCEED");
- }
- 
--void MassIntegrator::AddMultMF(const Vector &x, Vector &y) const
-+void MassIntegrator::AssembleMFBoundary(const FiniteElementSpace &fes)
- {
-+   Mesh *mesh = fes.GetMesh();
-+   if (mesh->GetNBE() == 0) { return; }
-    if (DeviceCanUseCeed())
-    {
--      ceedOp->AddMult(x, y);
--   }
--   else
--   {
--      MFEM_ABORT("Error: MassIntegrator::AddMultMF only implemented with"
--                 " libCEED");
-+      delete ceedOp;
-+      ceedOp = new ceed::MFMassIntegrator(*this, fes, Q, true);
-+      return;
-    }
-+
-+   // Assuming the same element type
-+   // const FiniteElement &el = *fes.GetBE(0);
-+   // ElementTransformation &T = *mesh->GetBdrElementTransformation(0);
-+   // const IntegrationRule *ir = IntRule ? IntRule : &GetRule(el, T);
-+   MFEM_ABORT("Error: MassIntegrator::AssembleMFBoundary only implemented with"
-+              " libCEED");
- }
- 
- void MassIntegrator::AssembleDiagonalMF(Vector &diag)
-@@ -70,4 +68,17 @@ void MassIntegrator::AssembleDiagonalMF(Vector &diag)
-    }
- }
- 
-+void MassIntegrator::AddMultMF(const Vector &x, Vector &y) const
-+{
-+   if (DeviceCanUseCeed())
-+   {
-+      ceedOp->AddMult(x, y);
-+   }
-+   else
-+   {
-+      MFEM_ABORT("Error: MassIntegrator::AddMultMF only implemented with"
-+                 " libCEED");
-+   }
-+}
-+
- } // namespace mfem
-diff --git a/fem/integ/bilininteg_mass_pa.cpp b/fem/integ/bilininteg_mass_pa.cpp
-index ffdec1edb..0e4cd8f41 100644
---- a/fem/integ/bilininteg_mass_pa.cpp
-+++ b/fem/integ/bilininteg_mass_pa.cpp
-@@ -23,28 +23,19 @@ void MassIntegrator::AssemblePA(const FiniteElementSpace &fes)
- {
-    const MemoryType mt = (pa_mt == MemoryType::DEFAULT) ?
-                          Device::GetDeviceMemoryType() : pa_mt;
--
--   // Assuming the same element type
-    Mesh *mesh = fes.GetMesh();
-    if (mesh->GetNE() == 0) { return; }
--   const FiniteElement &el = *fes.GetFE(0);
--   ElementTransformation *T0 = mesh->GetElementTransformation(0);
--   const IntegrationRule *ir = IntRule ? IntRule : &GetRule(el, el, *T0);
-    if (DeviceCanUseCeed())
-    {
-       delete ceedOp;
--      const bool mixed = mesh->GetNumGeometries(mesh->Dimension()) > 1 ||
--                         fes.IsVariableOrder();
--      if (mixed)
--      {
--         ceedOp = new ceed::MixedPAMassIntegrator(*this, fes, Q);
--      }
--      else
--      {
--         ceedOp = new ceed::PAMassIntegrator(fes, *ir, Q);
--      }
-+      ceedOp = new ceed::PAMassIntegrator(*this, fes, Q);
-       return;
-    }
-+
-+   // Assuming the same element type
-+   const FiniteElement &el = *fes.GetFE(0);
-+   ElementTransformation &T =* mesh->GetElementTransformation(0);
-+   const IntegrationRule *ir = IntRule ? IntRule : &GetRule(el, T);
-    int map_type = el.GetMapType();
-    dim = mesh->Dimension();
-    ne = fes.GetMesh()->GetNE();
-@@ -116,14 +107,19 @@ void MassIntegrator::AssemblePABoundary(const FiniteElementSpace &fes)
- {
-    const MemoryType mt = (pa_mt == MemoryType::DEFAULT) ?
-                          Device::GetDeviceMemoryType() : pa_mt;
--
--   // Assuming the same element type
-    Mesh *mesh = fes.GetMesh();
-    if (mesh->GetNBE() == 0) { return; }
-+   if (DeviceCanUseCeed())
-+   {
-+      delete ceedOp;
-+      ceedOp = new ceed::PAMassIntegrator(*this, fes, Q, true);
-+      return;
-+   }
-+
-+   // Assuming the same element type
-    const FiniteElement &el = *fes.GetBE(0);
-    ElementTransformation *T0 = mesh->GetBdrElementTransformation(0);
--   const IntegrationRule *ir = IntRule ? IntRule : &GetRule(el, el, *T0);
--
-+   const IntegrationRule *ir = IntRule ? IntRule : &GetRule(el, *T0);
-    int map_type = el.GetMapType();
-    dim = el.GetDim(); // Dimension of the boundary element, *not* the mesh
-    ne = fes.GetMesh()->GetNBE();
-diff --git a/fem/integ/bilininteg_mixedcurl_mf.cpp b/fem/integ/bilininteg_mixedcurl_mf.cpp
-new file mode 100644
-index 000000000..b22cc297e
---- /dev/null
-+++ b/fem/integ/bilininteg_mixedcurl_mf.cpp
-@@ -0,0 +1,108 @@
-+// Copyright (c) 2010-2023, Lawrence Livermore National Security, LLC. Produced
-+// at the Lawrence Livermore National Laboratory. All Rights reserved. See files
-+// LICENSE and NOTICE for details. LLNL-CODE-806117.
-+//
-+// This file is part of the MFEM library. For more information and source code
-+// availability visit https://mfem.org.
-+//
-+// MFEM is free software; you can redistribute it and/or modify it under the
-+// terms of the BSD-3 license. We welcome feedback and contributions, see file
-+// CONTRIBUTING.md for details.
-+
-+#include "../bilininteg.hpp"
-+#include "../gridfunc.hpp"
-+#include "../ceed/integrators/mixedveccurl/mixedveccurl.hpp"
-+
-+namespace mfem
-+{
-+
-+void MixedVectorCurlIntegrator::AssembleMF(const FiniteElementSpace &trial_fes,
-+                                           const FiniteElementSpace &test_fes)
-+{
-+   Mesh *mesh = trial_fes.GetMesh();
-+   if (mesh->GetNE() == 0) { return; }
-+   if (DeviceCanUseCeed())
-+   {
-+      delete ceedOp;
-+      if (MQ)
-+      {
-+         ceedOp = new ceed::MFMixedVectorCurlIntegrator(*this, trial_fes,
-+                                                        test_fes, MQ);
-+      }
-+      else if (DQ)
-+      {
-+         ceedOp = new ceed::MFMixedVectorCurlIntegrator(*this, trial_fes,
-+                                                        test_fes, DQ);
-+      }
-+      else
-+      {
-+         ceedOp = new ceed::MFMixedVectorCurlIntegrator(*this, trial_fes,
-+                                                        test_fes, Q);
-+      }
-+      return;
-+   }
-+
-+   // Assuming the same element type
-+   MFEM_ABORT("Error: MixedVectorCurlIntegrator::AssembleMF only implemented with"
-+              " libCEED");
-+}
-+
-+void MixedVectorCurlIntegrator::AddMultMF(const Vector &x, Vector &y) const
-+{
-+   if (DeviceCanUseCeed())
-+   {
-+      ceedOp->AddMult(x, y);
-+   }
-+   else
-+   {
-+      MFEM_ABORT("Error: MixedVectorCurlIntegrator::AddMultMF only"
-+                 " implemented with libCEED");
-+   }
-+}
-+
-+void MixedVectorWeakCurlIntegrator::AssembleMF(
-+   const FiniteElementSpace &trial_fes,
-+   const FiniteElementSpace &test_fes)
-+{
-+   Mesh *mesh = trial_fes.GetMesh();
-+   if (mesh->GetNE() == 0) { return; }
-+   if (DeviceCanUseCeed())
-+   {
-+      delete ceedOp;
-+      if (MQ)
-+      {
-+         ceedOp = new ceed::MFMixedVectorWeakCurlIntegrator(*this, trial_fes,
-+                                                            test_fes, MQ);
-+      }
-+      else if (DQ)
-+      {
-+         ceedOp = new ceed::MFMixedVectorWeakCurlIntegrator(*this, trial_fes,
-+                                                            test_fes, DQ);
-+      }
-+      else
-+      {
-+         ceedOp = new ceed::MFMixedVectorWeakCurlIntegrator(*this, trial_fes,
-+                                                            test_fes, Q);
-+      }
-+      return;
-+   }
-+
-+   // Assuming the same element type
-+   MFEM_ABORT("Error: MixedVectorWeakCurlIntegrator::AssembleMF only"
-+              " implemented with libCEED");
-+}
-+
-+void MixedVectorWeakCurlIntegrator::AddMultMF(const Vector &x, Vector &y) const
-+{
-+   if (DeviceCanUseCeed())
-+   {
-+      ceedOp->AddMult(x, y);
-+   }
-+   else
-+   {
-+      MFEM_ABORT("Error: MixedVectorWeakCurlIntegrator::AddMultMF only"
-+                 " implemented with libCEED");
-+   }
-+}
-+
-+} // namespace mfem
-diff --git a/fem/integ/bilininteg_mixedcurl_pa.cpp b/fem/integ/bilininteg_mixedcurl_pa.cpp
-index 3d70bc4c9..dd7a9375f 100644
---- a/fem/integ/bilininteg_mixedcurl_pa.cpp
-+++ b/fem/integ/bilininteg_mixedcurl_pa.cpp
-@@ -13,6 +13,7 @@
- #include "../bilininteg.hpp"
- #include "../gridfunc.hpp"
- #include "../qfunction.hpp"
-+#include "../ceed/integrators/mixedveccurl/mixedveccurl.hpp"
- #include "bilininteg_hcurl_kernels.hpp"
- #include "bilininteg_hcurlhdiv_kernels.hpp"
- 
-@@ -36,9 +37,8 @@ void MixedScalarCurlIntegrator::AssemblePA(const FiniteElementSpace &trial_fes,
-       MFEM_ABORT("Unknown kernel.");
-    }
- 
--   const IntegrationRule *ir
--      = IntRule ? IntRule : &MassIntegrator::GetRule(*eltest, *eltest,
--                                                     *mesh->GetElementTransformation(0));
-+   ElementTransformation &T = *mesh->GetElementTransformation(0);
-+   const IntegrationRule *ir = IntRule ? IntRule : &GetRule(*fel, *eltest, T);
- 
-    const int dims = el->GetDim();
-    MFEM_VERIFY(dims == 2, "");
-@@ -111,8 +111,30 @@ void MixedScalarCurlIntegrator::AddMultTransposePA(const Vector &x,
- void MixedVectorCurlIntegrator::AssemblePA(const FiniteElementSpace &trial_fes,
-                                            const FiniteElementSpace &test_fes)
- {
--   // Assumes tensor-product elements, with vector test and trial spaces.
-    Mesh *mesh = trial_fes.GetMesh();
-+   if (mesh->GetNE() == 0) { return; }
-+   if (DeviceCanUseCeed())
-+   {
-+      delete ceedOp;
-+      if (MQ)
-+      {
-+         ceedOp = new ceed::PAMixedVectorCurlIntegrator(*this, trial_fes,
-+                                                        test_fes, MQ);
-+      }
-+      else if (DQ)
-+      {
-+         ceedOp = new ceed::PAMixedVectorCurlIntegrator(*this, trial_fes,
-+                                                        test_fes, DQ);
-+      }
-+      else
-+      {
-+         ceedOp = new ceed::PAMixedVectorCurlIntegrator(*this, trial_fes,
-+                                                        test_fes, Q);
-+      }
-+      return;
-+   }
-+
-+   // Assumes tensor-product elements, with vector test and trial spaces.
-    const FiniteElement *trial_fel = trial_fes.GetFE(0);
-    const FiniteElement *test_fel = test_fes.GetFE(0);
- 
-@@ -124,9 +146,10 @@ void MixedVectorCurlIntegrator::AssemblePA(const FiniteElementSpace &trial_fes,
-       dynamic_cast<const VectorTensorFiniteElement*>(test_fel);
-    MFEM_VERIFY(test_el != NULL, "Only VectorTensorFiniteElement is supported!");
- 
--   const IntegrationRule *ir
--      = IntRule ? IntRule : &MassIntegrator::GetRule(*trial_el, *trial_el,
--                                                     *mesh->GetElementTransformation(0));
-+   ElementTransformation &T = *mesh->GetElementTransformation(0);
-+   const IntegrationRule *ir = IntRule ? IntRule : &GetRule(*trial_el, *test_el,
-+                                                            T);
-+
-    const int dims = trial_el->GetDim();
-    MFEM_VERIFY(dims == 3, "");
- 
-@@ -194,75 +217,90 @@ void MixedVectorCurlIntegrator::AssemblePA(const FiniteElementSpace &trial_fes,
- 
- void MixedVectorCurlIntegrator::AddMultPA(const Vector &x, Vector &y) const
- {
--   if (testType == mfem::FiniteElement::CURL &&
--       trialType == mfem::FiniteElement::CURL && dim == 3)
-+   if (DeviceCanUseCeed())
-    {
--      const int ndata = coeffDim == 1 ? 1 : 9;
--
--      if (Device::Allows(Backend::DEVICE_MASK))
-+      ceedOp->AddMult(x, y);
-+   }
-+   else
-+   {
-+      if (testType == mfem::FiniteElement::CURL &&
-+          trialType == mfem::FiniteElement::CURL && dim == 3)
-       {
--         const int ID = (dofs1D << 4) | quad1D;
--         switch (ID)
-+         const int ndata = coeffDim == 1 ? 1 : 9;
-+
-+         if (Device::Allows(Backend::DEVICE_MASK))
-          {
--            case 0x23:
--               return internal::SmemPAHcurlL2Apply3D<2,3>(
--                         dofs1D, quad1D, ndata, ne,
--                         mapsO->B, mapsC->B, mapsC->G,
--                         pa_data, x, y);
--            case 0x34:
--               return internal::SmemPAHcurlL2Apply3D<3,4>(
--                         dofs1D, quad1D, ndata, ne,
--                         mapsO->B, mapsC->B, mapsC->G,
--                         pa_data, x, y);
--            case 0x45:
--               return internal::SmemPAHcurlL2Apply3D<4,5>(
--                         dofs1D, quad1D, ndata, ne,
--                         mapsO->B, mapsC->B, mapsC->G,
--                         pa_data, x, y);
--            case 0x56:
--               return internal::SmemPAHcurlL2Apply3D<5,6>(
--                         dofs1D, quad1D, ndata, ne,
--                         mapsO->B, mapsC->B, mapsC->G,
--                         pa_data, x, y);
--            default:
--               return internal::SmemPAHcurlL2Apply3D(
--                         dofs1D, quad1D, ndata, ne,
--                         mapsO->B, mapsC->B, mapsC->G,
--                         pa_data, x, y);
-+            const int ID = (dofs1D << 4) | quad1D;
-+            switch (ID)
-+            {
-+               case 0x23:
-+                  return internal::SmemPAHcurlL2Apply3D<2,3>(
-+                            dofs1D, quad1D, ndata, ne,
-+                            mapsO->B, mapsC->B, mapsC->G,
-+                            pa_data, x, y);
-+               case 0x34:
-+                  return internal::SmemPAHcurlL2Apply3D<3,4>(
-+                            dofs1D, quad1D, ndata, ne,
-+                            mapsO->B, mapsC->B, mapsC->G,
-+                            pa_data, x, y);
-+               case 0x45:
-+                  return internal::SmemPAHcurlL2Apply3D<4,5>(
-+                            dofs1D, quad1D, ndata, ne,
-+                            mapsO->B, mapsC->B, mapsC->G,
-+                            pa_data, x, y);
-+               case 0x56:
-+                  return internal::SmemPAHcurlL2Apply3D<5,6>(
-+                            dofs1D, quad1D, ndata, ne,
-+                            mapsO->B, mapsC->B, mapsC->G,
-+                            pa_data, x, y);
-+               default:
-+                  return internal::SmemPAHcurlL2Apply3D(
-+                            dofs1D, quad1D, ndata, ne,
-+                            mapsO->B, mapsC->B, mapsC->G,
-+                            pa_data, x, y);
-+            }
-+         }
-+         else
-+         {
-+            internal::PAHcurlL2Apply3D(dofs1D, quad1D, ndata, ne, mapsO->B, mapsC->B,
-+                                       mapsO->Bt, mapsC->Bt, mapsC->G, pa_data, x, y);
-          }
-       }
-+      else if (testType == mfem::FiniteElement::DIV &&
-+               trialType == mfem::FiniteElement::CURL && dim == 3)
-+      {
-+         internal::PAHcurlHdivApply3D(dofs1D, dofs1Dtest, quad1D, ne, mapsO->B,
-+                                      mapsC->B, mapsOtest->Bt, mapsCtest->Bt, mapsC->G,
-+                                      pa_data, x, y);
-+      }
-       else
-       {
--         internal::PAHcurlL2Apply3D(dofs1D, quad1D, ndata, ne, mapsO->B, mapsC->B,
--                                    mapsO->Bt, mapsC->Bt, mapsC->G, pa_data, x, y);
-+         MFEM_ABORT("Unsupported dimension or space!");
-       }
-    }
--   else if (testType == mfem::FiniteElement::DIV &&
--            trialType == mfem::FiniteElement::CURL && dim == 3)
--   {
--      internal::PAHcurlHdivApply3D(dofs1D, dofs1Dtest, quad1D, ne, mapsO->B,
--                                   mapsC->B, mapsOtest->Bt, mapsCtest->Bt, mapsC->G,
--                                   pa_data, x, y);
--   }
--   else
--   {
--      MFEM_ABORT("Unsupported dimension or space!");
--   }
- }
- 
- void MixedVectorCurlIntegrator::AddMultTransposePA(const Vector &x,
-                                                    Vector &y) const
- {
--   if (testType == mfem::FiniteElement::DIV &&
--       trialType == mfem::FiniteElement::CURL && dim == 3)
-+   if (DeviceCanUseCeed())
-    {
--      internal::PAHcurlHdivApply3DTranspose(dofs1D, dofs1Dtest, quad1D, ne, mapsO->B,
--                                            mapsC->B, mapsOtest->Bt, mapsCtest->Bt,
--                                            mapsC->Gt, pa_data, x, y);
-+      MFEM_ABORT("AddMultTransposePA not yet implemented with libCEED for"
-+                 " MixedVectorCurlIntegrator.");
-    }
-    else
-    {
--      MFEM_ABORT("Unsupported dimension or space!");
-+      if (testType == mfem::FiniteElement::DIV &&
-+          trialType == mfem::FiniteElement::CURL && dim == 3)
-+      {
-+         internal::PAHcurlHdivApply3DTranspose(dofs1D, dofs1Dtest, quad1D, ne, mapsO->B,
-+                                               mapsC->B, mapsOtest->Bt, mapsCtest->Bt,
-+                                               mapsC->Gt, pa_data, x, y);
-+      }
-+      else
-+      {
-+         MFEM_ABORT("Unsupported dimension or space!");
-+      }
-    }
- }
- 
-@@ -270,8 +308,30 @@ void MixedVectorWeakCurlIntegrator::AssemblePA(
-    const FiniteElementSpace &trial_fes,
-    const FiniteElementSpace &test_fes)
- {
--   // Assumes tensor-product elements, with vector test and trial spaces.
-    Mesh *mesh = trial_fes.GetMesh();
-+   if (mesh->GetNE() == 0) { return; }
-+   if (DeviceCanUseCeed())
-+   {
-+      delete ceedOp;
-+      if (MQ)
-+      {
-+         ceedOp = new ceed::PAMixedVectorWeakCurlIntegrator(*this, trial_fes,
-+                                                            test_fes, MQ);
-+      }
-+      else if (DQ)
-+      {
-+         ceedOp = new ceed::PAMixedVectorWeakCurlIntegrator(*this, trial_fes,
-+                                                            test_fes, DQ);
-+      }
-+      else
-+      {
-+         ceedOp = new ceed::PAMixedVectorWeakCurlIntegrator(*this, trial_fes,
-+                                                            test_fes, Q);
-+      }
-+      return;
-+   }
-+
-+   // Assumes tensor-product elements, with vector test and trial spaces.
-    const FiniteElement *trial_fel = trial_fes.GetFE(0);
-    const FiniteElement *test_fel = test_fes.GetFE(0);
- 
-@@ -283,9 +343,10 @@ void MixedVectorWeakCurlIntegrator::AssemblePA(
-       dynamic_cast<const VectorTensorFiniteElement*>(test_fel);
-    MFEM_VERIFY(test_el != NULL, "Only VectorTensorFiniteElement is supported!");
- 
--   const IntegrationRule *ir
--      = IntRule ? IntRule : &MassIntegrator::GetRule(*trial_el, *trial_el,
--                                                     *mesh->GetElementTransformation(0));
-+   ElementTransformation &T = *mesh->GetElementTransformation(0);
-+   const IntegrationRule *ir = IntRule ? IntRule : &GetRule(*trial_el, *test_el,
-+                                                            T);
-+
-    const int dims = trial_el->GetDim();
-    MFEM_VERIFY(dims == 3, "");
- 
-@@ -349,75 +410,90 @@ void MixedVectorWeakCurlIntegrator::AssemblePA(
- 
- void MixedVectorWeakCurlIntegrator::AddMultPA(const Vector &x, Vector &y) const
- {
--   if (testType == mfem::FiniteElement::CURL &&
--       trialType == mfem::FiniteElement::CURL && dim == 3)
-+   if (DeviceCanUseCeed())
-+   {
-+      ceedOp->AddMult(x, y);
-+   }
-+   else
-    {
--      const int ndata = coeffDim == 1 ? 1 : 9;
--      if (Device::Allows(Backend::DEVICE_MASK))
-+      if (testType == mfem::FiniteElement::CURL &&
-+          trialType == mfem::FiniteElement::CURL && dim == 3)
-       {
--         const int ID = (dofs1D << 4) | quad1D;
--         switch (ID)
-+         const int ndata = coeffDim == 1 ? 1 : 9;
-+         if (Device::Allows(Backend::DEVICE_MASK))
-          {
--            case 0x23:
--               return internal::SmemPAHcurlL2Apply3DTranspose<2,3>(
--                         dofs1D, quad1D, ndata,
--                         ne, mapsO->B, mapsC->B,
--                         mapsC->G, pa_data, x, y);
--            case 0x34:
--               return internal::SmemPAHcurlL2Apply3DTranspose<3,4>(
--                         dofs1D, quad1D, ndata,
--                         ne, mapsO->B, mapsC->B,
--                         mapsC->G, pa_data, x, y);
--            case 0x45:
--               return internal::SmemPAHcurlL2Apply3DTranspose<4,5>(
--                         dofs1D, quad1D, ndata,
--                         ne, mapsO->B, mapsC->B,
--                         mapsC->G, pa_data, x, y);
--            case 0x56:
--               return internal::SmemPAHcurlL2Apply3DTranspose<5,6>(
--                         dofs1D, quad1D, ndata,
--                         ne, mapsO->B, mapsC->B,
--                         mapsC->G, pa_data, x, y);
--            default:
--               return internal::SmemPAHcurlL2Apply3DTranspose(
--                         dofs1D, quad1D, ndata, ne,
--                         mapsO->B, mapsC->B,
--                         mapsC->G, pa_data, x, y);
-+            const int ID = (dofs1D << 4) | quad1D;
-+            switch (ID)
-+            {
-+               case 0x23:
-+                  return internal::SmemPAHcurlL2Apply3DTranspose<2,3>(
-+                            dofs1D, quad1D, ndata,
-+                            ne, mapsO->B, mapsC->B,
-+                            mapsC->G, pa_data, x, y);
-+               case 0x34:
-+                  return internal::SmemPAHcurlL2Apply3DTranspose<3,4>(
-+                            dofs1D, quad1D, ndata,
-+                            ne, mapsO->B, mapsC->B,
-+                            mapsC->G, pa_data, x, y);
-+               case 0x45:
-+                  return internal::SmemPAHcurlL2Apply3DTranspose<4,5>(
-+                            dofs1D, quad1D, ndata,
-+                            ne, mapsO->B, mapsC->B,
-+                            mapsC->G, pa_data, x, y);
-+               case 0x56:
-+                  return internal::SmemPAHcurlL2Apply3DTranspose<5,6>(
-+                            dofs1D, quad1D, ndata,
-+                            ne, mapsO->B, mapsC->B,
-+                            mapsC->G, pa_data, x, y);
-+               default:
-+                  return internal::SmemPAHcurlL2Apply3DTranspose(
-+                            dofs1D, quad1D, ndata, ne,
-+                            mapsO->B, mapsC->B,
-+                            mapsC->G, pa_data, x, y);
-+            }
-+         }
-+         else
-+         {
-+            internal::PAHcurlL2Apply3DTranspose(dofs1D, quad1D, ndata, ne, mapsO->B,
-+                                                mapsC->B, mapsO->Bt, mapsC->Bt,
-+                                                mapsC->Gt, pa_data, x, y);
-          }
-       }
-+      else if (testType == mfem::FiniteElement::CURL &&
-+               trialType == mfem::FiniteElement::DIV && dim == 3)
-+      {
-+         internal::PAHcurlHdivApply3DTranspose(dofs1D, dofs1D, quad1D, ne, mapsO->B,
-+                                               mapsC->B, mapsO->Bt, mapsC->Bt,
-+                                               mapsC->Gt, pa_data, x, y);
-+      }
-       else
-       {
--         internal::PAHcurlL2Apply3DTranspose(dofs1D, quad1D, ndata, ne, mapsO->B,
--                                             mapsC->B, mapsO->Bt, mapsC->Bt, mapsC->Gt,
--                                             pa_data, x, y);
-+         MFEM_ABORT("Unsupported dimension or space!");
-       }
-    }
--   else if (testType == mfem::FiniteElement::CURL &&
--            trialType == mfem::FiniteElement::DIV && dim == 3)
--   {
--      internal::PAHcurlHdivApply3DTranspose(dofs1D, dofs1D, quad1D, ne, mapsO->B,
--                                            mapsC->B, mapsO->Bt, mapsC->Bt,
--                                            mapsC->Gt, pa_data, x, y);
--   }
--   else
--   {
--      MFEM_ABORT("Unsupported dimension or space!");
--   }
- }
- 
- void MixedVectorWeakCurlIntegrator::AddMultTransposePA(const Vector &x,
-                                                        Vector &y) const
- {
--   if (testType == mfem::FiniteElement::CURL &&
--       trialType == mfem::FiniteElement::DIV && dim == 3)
-+   if (DeviceCanUseCeed())
-    {
--      internal::PAHcurlHdivApply3D(dofs1D, dofs1D, quad1D, ne, mapsO->B,
--                                   mapsC->B, mapsO->Bt, mapsC->Bt, mapsC->G,
--                                   pa_data, x, y);
-+      MFEM_ABORT("AddMultTransposePA not yet implemented with libCEED for"
-+                 " MixedVectorWeakCurlIntegrator.");
-    }
-    else
-    {
--      MFEM_ABORT("Unsupported dimension or space!");
-+      if (testType == mfem::FiniteElement::CURL &&
-+          trialType == mfem::FiniteElement::DIV && dim == 3)
-+      {
-+         internal::PAHcurlHdivApply3D(dofs1D, dofs1D, quad1D, ne, mapsO->B,
-+                                      mapsC->B, mapsO->Bt, mapsC->Bt, mapsC->G,
-+                                      pa_data, x, y);
-+      }
-+      else
-+      {
-+         MFEM_ABORT("Unsupported dimension or space!");
-+      }
-    }
- }
- 
-diff --git a/fem/integ/bilininteg_mixedvecgrad_mf.cpp b/fem/integ/bilininteg_mixedvecgrad_mf.cpp
-new file mode 100644
-index 000000000..10b3b9686
---- /dev/null
-+++ b/fem/integ/bilininteg_mixedvecgrad_mf.cpp
-@@ -0,0 +1,174 @@
-+// Copyright (c) 2010-2023, Lawrence Livermore National Security, LLC. Produced
-+// at the Lawrence Livermore National Laboratory. All Rights reserved. See files
-+// LICENSE and NOTICE for details. LLNL-CODE-806117.
-+//
-+// This file is part of the MFEM library. For more information and source code
-+// availability visit https://mfem.org.
-+//
-+// MFEM is free software; you can redistribute it and/or modify it under the
-+// terms of the BSD-3 license. We welcome feedback and contributions, see file
-+// CONTRIBUTING.md for details.
-+
-+#include "../bilininteg.hpp"
-+#include "../gridfunc.hpp"
-+#include "../ceed/integrators/mixedvecgrad/mixedvecgrad.hpp"
-+
-+namespace mfem
-+{
-+
-+void MixedVectorGradientIntegrator::AssembleMF(
-+   const FiniteElementSpace &trial_fes,
-+   const FiniteElementSpace &test_fes)
-+{
-+   Mesh *mesh = trial_fes.GetMesh();
-+   if (mesh->GetNE() == 0) { return; }
-+   if (DeviceCanUseCeed())
-+   {
-+      delete ceedOp;
-+      if (MQ)
-+      {
-+         ceedOp = new ceed::MFMixedVectorGradientIntegrator(*this, trial_fes,
-+                                                            test_fes, MQ);
-+      }
-+      else if (DQ)
-+      {
-+         ceedOp = new ceed::MFMixedVectorGradientIntegrator(*this, trial_fes,
-+                                                            test_fes, DQ);
-+      }
-+      else
-+      {
-+         ceedOp = new ceed::MFMixedVectorGradientIntegrator(*this, trial_fes,
-+                                                            test_fes, Q);
-+      }
-+      return;
-+   }
-+
-+   // Assuming the same element type
-+   MFEM_ABORT("Error: MixedVectorGradientIntegrator::AssembleMF only"
-+              " implemented with libCEED");
-+}
-+
-+void MixedVectorGradientIntegrator::AssembleMFBoundary(
-+   const FiniteElementSpace &trial_fes,
-+   const FiniteElementSpace &test_fes)
-+{
-+   Mesh *mesh = trial_fes.GetMesh();
-+   if (mesh->GetNBE() == 0) { return; }
-+   if (DeviceCanUseCeed())
-+   {
-+      delete ceedOp;
-+      if (MQ)
-+      {
-+         ceedOp = new ceed::MFMixedVectorGradientIntegrator(*this, trial_fes,
-+                                                            test_fes, MQ, true);
-+      }
-+      else if (DQ)
-+      {
-+         ceedOp = new ceed::MFMixedVectorGradientIntegrator(*this, trial_fes,
-+                                                            test_fes, DQ, true);
-+      }
-+      else
-+      {
-+         ceedOp = new ceed::MFMixedVectorGradientIntegrator(*this, trial_fes,
-+                                                            test_fes, Q, true);
-+      }
-+      return;
-+   }
-+
-+   // Assuming the same element type
-+   MFEM_ABORT("Error: MixedVectorGradientIntegrator::AssembleMFBoundary only"
-+              " implemented with libCEED");
-+}
-+
-+void MixedVectorGradientIntegrator::AddMultMF(const Vector &x, Vector &y) const
-+{
-+   if (DeviceCanUseCeed())
-+   {
-+      ceedOp->AddMult(x, y);
-+   }
-+   else
-+   {
-+      MFEM_ABORT("Error: MixedVectorGradientIntegrator::AddMultMF only"
-+                 " implemented with libCEED");
-+   }
-+}
-+
-+void MixedVectorWeakDivergenceIntegrator::AssembleMF(
-+   const FiniteElementSpace &trial_fes,
-+   const FiniteElementSpace &test_fes)
-+{
-+   Mesh *mesh = trial_fes.GetMesh();
-+   if (mesh->GetNE() == 0) { return; }
-+   if (DeviceCanUseCeed())
-+   {
-+      delete ceedOp;
-+      if (MQ)
-+      {
-+         ceedOp = new ceed::MFMixedVectorWeakDivergenceIntegrator(*this, trial_fes,
-+                                                                  test_fes, MQ);
-+      }
-+      else if (DQ)
-+      {
-+         ceedOp = new ceed::MFMixedVectorWeakDivergenceIntegrator(*this, trial_fes,
-+                                                                  test_fes, DQ);
-+      }
-+      else
-+      {
-+         ceedOp = new ceed::MFMixedVectorWeakDivergenceIntegrator(*this, trial_fes,
-+                                                                  test_fes, Q);
-+      }
-+      return;
-+   }
-+
-+   // Assuming the same element type
-+   MFEM_ABORT("Error: MixedVectorWeakDivergenceIntegrator::AssembleMF only"
-+              " implemented with libCEED");
-+}
-+
-+void MixedVectorWeakDivergenceIntegrator::AssembleMFBoundary(
-+   const FiniteElementSpace &trial_fes,
-+   const FiniteElementSpace &test_fes)
-+{
-+   Mesh *mesh = trial_fes.GetMesh();
-+   if (mesh->GetNBE() == 0) { return; }
-+   if (DeviceCanUseCeed())
-+   {
-+      delete ceedOp;
-+      if (MQ)
-+      {
-+         ceedOp = new ceed::MFMixedVectorWeakDivergenceIntegrator(*this, trial_fes,
-+                                                                  test_fes, MQ, true);
-+      }
-+      else if (DQ)
-+      {
-+         ceedOp = new ceed::MFMixedVectorWeakDivergenceIntegrator(*this, trial_fes,
-+                                                                  test_fes, DQ, true);
-+      }
-+      else
-+      {
-+         ceedOp = new ceed::MFMixedVectorWeakDivergenceIntegrator(*this, trial_fes,
-+                                                                  test_fes, Q, true);
-+      }
-+      return;
-+   }
-+
-+   // Assuming the same element type
-+   MFEM_ABORT("Error: MixedVectorWeakDivergenceIntegrator::AssembleMFBoundary only"
-+              " implemented with libCEED");
-+}
-+
-+void MixedVectorWeakDivergenceIntegrator::AddMultMF(const Vector &x,
-+                                                    Vector &y) const
-+{
-+   if (DeviceCanUseCeed())
-+   {
-+      ceedOp->AddMult(x, y);
-+   }
-+   else
-+   {
-+      MFEM_ABORT("Error: MixedVectorWeakDivergenceIntegrator::AddMultMF only"
-+                 " implemented with libCEED");
-+   }
-+}
-+
-+} // namespace mfem
-diff --git a/fem/integ/bilininteg_mixedvecgrad_pa.cpp b/fem/integ/bilininteg_mixedvecgrad_pa.cpp
-index f9e6d3ee8..5acf3367b 100644
---- a/fem/integ/bilininteg_mixedvecgrad_pa.cpp
-+++ b/fem/integ/bilininteg_mixedvecgrad_pa.cpp
-@@ -13,11 +13,128 @@
- #include "../bilininteg.hpp"
- #include "../gridfunc.hpp"
- #include "../qfunction.hpp"
-+#include "../ceed/integrators/mixedvecgrad/mixedvecgrad.hpp"
- #include "bilininteg_diffusion_kernels.hpp"
- 
- namespace mfem
- {
- 
-+void MixedVectorGradientIntegrator::AssemblePA(
-+   const FiniteElementSpace &trial_fes,
-+   const FiniteElementSpace &test_fes)
-+{
-+   Mesh *mesh = trial_fes.GetMesh();
-+   if (mesh->GetNE() == 0) { return; }
-+   if (DeviceCanUseCeed())
-+   {
-+      delete ceedOp;
-+      if (MQ)
-+      {
-+         ceedOp = new ceed::PAMixedVectorGradientIntegrator(*this, trial_fes,
-+                                                            test_fes, MQ);
-+      }
-+      else if (DQ)
-+      {
-+         ceedOp = new ceed::PAMixedVectorGradientIntegrator(*this, trial_fes,
-+                                                            test_fes, DQ);
-+      }
-+      else
-+      {
-+         ceedOp = new ceed::PAMixedVectorGradientIntegrator(*this, trial_fes,
-+                                                            test_fes, Q);
-+      }
-+      return;
-+   }
-+
-+   // Assumes tensor-product elements, with a vector test space and H^1 trial space.
-+   const FiniteElement *trial_fel = trial_fes.GetFE(0);
-+   const FiniteElement *test_fel = test_fes.GetFE(0);
-+
-+   const NodalTensorFiniteElement *trial_el =
-+      dynamic_cast<const NodalTensorFiniteElement*>(trial_fel);
-+   MFEM_VERIFY(trial_el != NULL, "Only NodalTensorFiniteElement is supported!");
-+
-+   const VectorTensorFiniteElement *test_el =
-+      dynamic_cast<const VectorTensorFiniteElement*>(test_fel);
-+   MFEM_VERIFY(test_el != NULL, "Only VectorTensorFiniteElement is supported!");
-+
-+   ElementTransformation &T = *mesh->GetElementTransformation(0);
-+   const IntegrationRule *ir = IntRule ? IntRule : &GetRule(*trial_el, *test_el,
-+                                                            T);
-+
-+   const int dims = trial_el->GetDim();
-+   MFEM_VERIFY(dims == 2 || dims == 3, "");
-+
-+   const int symmDims = (dims * (dims + 1)) / 2; // 1x1: 1, 2x2: 3, 3x3: 6
-+   const int nq = ir->GetNPoints();
-+   dim = mesh->Dimension();
-+   MFEM_VERIFY(dim == 2 || dim == 3, "");
-+
-+   MFEM_VERIFY(trial_el->GetOrder() == test_el->GetOrder(), "");
-+
-+   ne = trial_fes.GetNE();
-+   geom = mesh->GetGeometricFactors(*ir, GeometricFactors::JACOBIANS);
-+   mapsC = &test_el->GetDofToQuad(*ir, DofToQuad::TENSOR);
-+   mapsO = &test_el->GetDofToQuadOpen(*ir, DofToQuad::TENSOR);
-+   dofs1D = mapsC->ndof;
-+   quad1D = mapsC->nqpt;
-+
-+   MFEM_VERIFY(dofs1D == mapsO->ndof + 1 && quad1D == mapsO->nqpt, "");
-+
-+   pa_data.SetSize(symmDims * nq * ne, Device::GetMemoryType());
-+
-+   QuadratureSpace qs(*mesh, *ir);
-+   CoefficientVector coeff(Q, qs, CoefficientStorage::FULL);
-+
-+   // Use the same setup functions as VectorFEMassIntegrator.
-+   if (test_el->GetDerivType() == mfem::FiniteElement::CURL && dim == 3)
-+   {
-+      internal::PADiffusionSetup3D(quad1D, 1, ne, ir->GetWeights(), geom->J,
-+                                   coeff, pa_data);
-+   }
-+   else if (test_el->GetDerivType() == mfem::FiniteElement::CURL && dim == 2)
-+   {
-+      internal::PADiffusionSetup2D<2>(quad1D, 1, ne, ir->GetWeights(), geom->J,
-+                                      coeff, pa_data);
-+   }
-+   else
-+   {
-+      MFEM_ABORT("Unknown kernel.");
-+   }
-+}
-+
-+void MixedVectorGradientIntegrator::AssemblePABoundary(
-+   const FiniteElementSpace &trial_fes,
-+   const FiniteElementSpace &test_fes)
-+{
-+   Mesh *mesh = trial_fes.GetMesh();
-+   if (mesh->GetNBE() == 0) { return; }
-+   if (DeviceCanUseCeed())
-+   {
-+      delete ceedOp;
-+      if (MQ)
-+      {
-+         ceedOp = new ceed::PAMixedVectorGradientIntegrator(*this, trial_fes,
-+                                                            test_fes, MQ, true);
-+      }
-+      else if (DQ)
-+      {
-+         ceedOp = new ceed::PAMixedVectorGradientIntegrator(*this, trial_fes,
-+                                                            test_fes, DQ, true);
-+      }
-+      else
-+      {
-+         ceedOp = new ceed::PAMixedVectorGradientIntegrator(*this, trial_fes,
-+                                                            test_fes, Q, true);
-+      }
-+      return;
-+   }
-+
-+   // Assuming the same element type
-+   MFEM_ABORT("Error: MixedVectorGradientIntegrator::AssemblePABoundary only"
-+              " implemented with libCEED");
-+}
-+
- // Apply to x corresponding to DOFs in H^1 (trial), whose gradients are
- // integrated against H(curl) test functions corresponding to y.
- static void PAHcurlH1Apply2D(const int D1D,
-@@ -656,101 +773,133 @@ static void PAHcurlH1ApplyTranspose3D(const int D1D,
-    }); // end of element loop
- }
- 
--void MixedVectorGradientIntegrator::AssemblePA(
--   const FiniteElementSpace &trial_fes,
--   const FiniteElementSpace &test_fes)
-+void MixedVectorGradientIntegrator::AddMultPA(const Vector &x, Vector &y) const
- {
--   // Assumes tensor-product elements, with a vector test space and H^1 trial space.
--   Mesh *mesh = trial_fes.GetMesh();
--   const FiniteElement *trial_fel = trial_fes.GetFE(0);
--   const FiniteElement *test_fel = test_fes.GetFE(0);
--
--   const NodalTensorFiniteElement *trial_el =
--      dynamic_cast<const NodalTensorFiniteElement*>(trial_fel);
--   MFEM_VERIFY(trial_el != NULL, "Only NodalTensorFiniteElement is supported!");
--
--   const VectorTensorFiniteElement *test_el =
--      dynamic_cast<const VectorTensorFiniteElement*>(test_fel);
--   MFEM_VERIFY(test_el != NULL, "Only VectorTensorFiniteElement is supported!");
--
--   const IntegrationRule *ir
--      = IntRule ? IntRule : &MassIntegrator::GetRule(*trial_el, *trial_el,
--                                                     *mesh->GetElementTransformation(0));
--   const int dims = trial_el->GetDim();
--   MFEM_VERIFY(dims == 2 || dims == 3, "");
--
--   const int symmDims = (dims * (dims + 1)) / 2; // 1x1: 1, 2x2: 3, 3x3: 6
--   const int nq = ir->GetNPoints();
--   dim = mesh->Dimension();
--   MFEM_VERIFY(dim == 2 || dim == 3, "");
--
--   MFEM_VERIFY(trial_el->GetOrder() == test_el->GetOrder(), "");
--
--   ne = trial_fes.GetNE();
--   geom = mesh->GetGeometricFactors(*ir, GeometricFactors::JACOBIANS);
--   mapsC = &test_el->GetDofToQuad(*ir, DofToQuad::TENSOR);
--   mapsO = &test_el->GetDofToQuadOpen(*ir, DofToQuad::TENSOR);
--   dofs1D = mapsC->ndof;
--   quad1D = mapsC->nqpt;
--
--   MFEM_VERIFY(dofs1D == mapsO->ndof + 1 && quad1D == mapsO->nqpt, "");
--
--   pa_data.SetSize(symmDims * nq * ne, Device::GetMemoryType());
--
--   QuadratureSpace qs(*mesh, *ir);
--   CoefficientVector coeff(Q, qs, CoefficientStorage::FULL);
--
--   // Use the same setup functions as VectorFEMassIntegrator.
--   if (test_el->GetDerivType() == mfem::FiniteElement::CURL && dim == 3)
-+   if (DeviceCanUseCeed())
-    {
--      internal::PADiffusionSetup3D(quad1D, 1, ne, ir->GetWeights(), geom->J,
--                                   coeff, pa_data);
--   }
--   else if (test_el->GetDerivType() == mfem::FiniteElement::CURL && dim == 2)
--   {
--      internal::PADiffusionSetup2D<2>(quad1D, 1, ne, ir->GetWeights(), geom->J,
--                                      coeff, pa_data);
-+      ceedOp->AddMult(x, y);
-    }
-    else
-    {
--      MFEM_ABORT("Unknown kernel.");
-+      if (dim == 3)
-+      {
-+         PAHcurlH1Apply3D(dofs1D, quad1D, ne, mapsC->B, mapsC->G,
-+                          mapsO->Bt, mapsC->Bt, pa_data, x, y);
-+      }
-+      else if (dim == 2)
-+      {
-+         PAHcurlH1Apply2D(dofs1D, quad1D, ne, mapsC->B, mapsC->G,
-+                          mapsO->Bt, mapsC->Bt, pa_data, x, y);
-+      }
-+      else
-+      {
-+         MFEM_ABORT("Unsupported dimension!");
-+      }
-    }
- }
- 
--void MixedVectorGradientIntegrator::AddMultPA(const Vector &x, Vector &y) const
-+void MixedVectorGradientIntegrator::AddMultTransposePA(const Vector &x,
-+                                                       Vector &y) const
- {
--   if (dim == 3)
-+   if (DeviceCanUseCeed())
-    {
--      PAHcurlH1Apply3D(dofs1D, quad1D, ne, mapsC->B, mapsC->G,
--                       mapsO->Bt, mapsC->Bt, pa_data, x, y);
-+      MFEM_ABORT("AddMultTransposePA not yet implemented with libCEED for"
-+                 " MixedVectorGradientIntegrator.");
-    }
--   else if (dim == 2)
-+   else
-    {
--      PAHcurlH1Apply2D(dofs1D, quad1D, ne, mapsC->B, mapsC->G,
--                       mapsO->Bt, mapsC->Bt, pa_data, x, y);
-+      if (dim == 3)
-+      {
-+         PAHcurlH1ApplyTranspose3D(dofs1D, quad1D, ne, mapsC->B, mapsO->B,
-+                                   mapsC->Bt, mapsC->Gt, pa_data, x, y);
-+      }
-+      else if (dim == 2)
-+      {
-+         PAHcurlH1ApplyTranspose2D(dofs1D, quad1D, ne, mapsC->B, mapsO->B,
-+                                   mapsC->Bt, mapsC->Gt, pa_data, x, y);
-+      }
-+      else
-+      {
-+         MFEM_ABORT("Unsupported dimension!");
-+      }
-    }
--   else
-+}
-+
-+void MixedVectorWeakDivergenceIntegrator::AssemblePA(
-+   const FiniteElementSpace &trial_fes,
-+   const FiniteElementSpace &test_fes)
-+{
-+   Mesh *mesh = trial_fes.GetMesh();
-+   if (mesh->GetNE() == 0) { return; }
-+   if (DeviceCanUseCeed())
-    {
--      MFEM_ABORT("Unsupported dimension!");
-+      delete ceedOp;
-+      if (MQ)
-+      {
-+         ceedOp = new ceed::PAMixedVectorWeakDivergenceIntegrator(*this, trial_fes,
-+                                                                  test_fes, MQ);
-+      }
-+      else if (DQ)
-+      {
-+         ceedOp = new ceed::PAMixedVectorWeakDivergenceIntegrator(*this, trial_fes,
-+                                                                  test_fes, DQ);
-+      }
-+      else
-+      {
-+         ceedOp = new ceed::PAMixedVectorWeakDivergenceIntegrator(*this, trial_fes,
-+                                                                  test_fes, Q);
-+      }
-+      return;
-    }
-+
-+   // Assuming the same element type
-+   MFEM_ABORT("Error: MixedVectorWeakDivergenceIntegrator::AssemblePA only"
-+              " implemented with libCEED");
- }
- 
--void MixedVectorGradientIntegrator::AddMultTransposePA(const Vector &x,
--                                                       Vector &y) const
-+void MixedVectorWeakDivergenceIntegrator::AssemblePABoundary(
-+   const FiniteElementSpace &trial_fes,
-+   const FiniteElementSpace &test_fes)
- {
--   if (dim == 3)
-+   Mesh *mesh = trial_fes.GetMesh();
-+   if (mesh->GetNBE() == 0) { return; }
-+   if (DeviceCanUseCeed())
-    {
--      PAHcurlH1ApplyTranspose3D(dofs1D, quad1D, ne, mapsC->B, mapsO->B,
--                                mapsC->Bt, mapsC->Gt, pa_data, x, y);
-+      delete ceedOp;
-+      if (MQ)
-+      {
-+         ceedOp = new ceed::PAMixedVectorWeakDivergenceIntegrator(*this, trial_fes,
-+                                                                  test_fes, MQ, true);
-+      }
-+      else if (DQ)
-+      {
-+         ceedOp = new ceed::PAMixedVectorWeakDivergenceIntegrator(*this, trial_fes,
-+                                                                  test_fes, DQ, true);
-+      }
-+      else
-+      {
-+         ceedOp = new ceed::PAMixedVectorWeakDivergenceIntegrator(*this, trial_fes,
-+                                                                  test_fes, Q, true);
-+      }
-+      return;
-    }
--   else if (dim == 2)
-+
-+   // Assuming the same element type
-+   MFEM_ABORT("Error: MixedVectorWeakDivergenceIntegrator::AssemblePABoundary only"
-+              " implemented with libCEED");
-+}
-+
-+void MixedVectorWeakDivergenceIntegrator::AddMultPA(const Vector &x,
-+                                                    Vector &y) const
-+{
-+   if (DeviceCanUseCeed())
-    {
--      PAHcurlH1ApplyTranspose2D(dofs1D, quad1D, ne, mapsC->B, mapsO->B,
--                                mapsC->Bt, mapsC->Gt, pa_data, x, y);
-+      ceedOp->AddMult(x, y);
-    }
-    else
-    {
--      MFEM_ABORT("Unsupported dimension!");
-+      MFEM_ABORT("Error: MixedVectorWeakDivergenceIntegrator::AddMultMF only"
-+                 " implemented with libCEED");
-    }
- }
- 
-diff --git a/fem/integ/bilininteg_vecdiffusion_mf.cpp b/fem/integ/bilininteg_vecdiffusion_mf.cpp
-index 7cad61496..56139d2ef 100644
---- a/fem/integ/bilininteg_vecdiffusion_mf.cpp
-+++ b/fem/integ/bilininteg_vecdiffusion_mf.cpp
-@@ -19,45 +19,45 @@ namespace mfem
- 
- void VectorDiffusionIntegrator::AssembleMF(const FiniteElementSpace &fes)
- {
--   // Assumes tensor-product elements
-    Mesh *mesh = fes.GetMesh();
-    if (mesh->GetNE() == 0) { return; }
--   const FiniteElement &el = *fes.GetFE(0);
--   const IntegrationRule *ir
--      = IntRule ? IntRule : &DiffusionIntegrator::GetRule(el, el);
-    if (DeviceCanUseCeed())
-    {
--      delete ceedOp;
-       MFEM_VERIFY(!VQ && !MQ,
--                  "Only scalar coefficient supported for DiffusionIntegrator"
--                  " with libCEED");
--      const bool mixed = mesh->GetNumGeometries(mesh->Dimension()) > 1 ||
--                         fes.IsVariableOrder();
--      if (mixed)
--      {
--         ceedOp = new ceed::MixedMFDiffusionIntegrator(*this, fes, Q);
--      }
--      else
--      {
--         ceedOp = new ceed::MFDiffusionIntegrator(fes, *ir, Q);
--      }
-+                  "Only scalar coefficient is supported for matrix-free assembly for VectorDiffusionIntegrator");
-+      delete ceedOp;
-+      ceedOp = new ceed::MFDiffusionIntegrator(*this, fes, Q);
-       return;
-    }
-+
-+   // Assumes tensor-product elements
-+   // const FiniteElement &el = *fes.GetFE(0);
-+   // ElementTransformation &T = *mesh->GetElementTransformation(0);
-+   // const IntegrationRule *ir = IntRule ? IntRule : &GetRule(el, T);
-    MFEM_ABORT("Error: VectorDiffusionIntegrator::AssembleMF only implemented"
-               " with libCEED");
- }
- 
--void VectorDiffusionIntegrator::AddMultMF(const Vector &x, Vector &y) const
-+void VectorDiffusionIntegrator::AssembleMFBoundary(
-+   const FiniteElementSpace &fes)
- {
-+   Mesh *mesh = fes.GetMesh();
-+   if (mesh->GetNBE() == 0) { return; }
-    if (DeviceCanUseCeed())
-    {
--      ceedOp->AddMult(x, y);
--   }
--   else
--   {
--      MFEM_ABORT("Error: VectorDiffusionIntegrator::AddMultMF only implemented"
--                 " with libCEED");
-+      MFEM_VERIFY(!VQ && !MQ,
-+                  "Only scalar coefficient is supported for matrix-free assembly for VectorDiffusionIntegrator");
-+      delete ceedOp;
-+      ceedOp = new ceed::MFDiffusionIntegrator(*this, fes, Q, true);
-+      return;
-    }
-+
-+   // Assumes tensor-product elements
-+   // const FiniteElement &el = *fes.GetBE(0);
-+   // ElementTransformation &T = *mesh->GetBdrElementTransformation(0);
-+   // const IntegrationRule *ir = IntRule ? IntRule : &GetRule(el, T);
-+   MFEM_ABORT("Error: VectorDiffusionIntegrator::AssembleMFBoundary only implemented"
-+              " with libCEED");
- }
- 
- void VectorDiffusionIntegrator::AssembleDiagonalMF(Vector &diag)
-@@ -73,4 +73,17 @@ void VectorDiffusionIntegrator::AssembleDiagonalMF(Vector &diag)
-    }
- }
- 
-+void VectorDiffusionIntegrator::AddMultMF(const Vector &x, Vector &y) const
-+{
-+   if (DeviceCanUseCeed())
-+   {
-+      ceedOp->AddMult(x, y);
-+   }
-+   else
-+   {
-+      MFEM_ABORT("Error: VectorDiffusionIntegrator::AddMultMF only implemented"
-+                 " with libCEED");
-+   }
-+}
-+
- } // namespace mfem
-diff --git a/fem/integ/bilininteg_vecdiffusion_pa.cpp b/fem/integ/bilininteg_vecdiffusion_pa.cpp
-index 84e4d5b2a..3fe58e1c1 100644
---- a/fem/integ/bilininteg_vecdiffusion_pa.cpp
-+++ b/fem/integ/bilininteg_vecdiffusion_pa.cpp
-@@ -114,26 +114,21 @@ static void PAVectorDiffusionSetup3D(const int Q1D,
- 
- void VectorDiffusionIntegrator::AssemblePA(const FiniteElementSpace &fes)
- {
--   // Assumes tensor-product elements
-    Mesh *mesh = fes.GetMesh();
--   const FiniteElement &el = *fes.GetFE(0);
--   const IntegrationRule *ir
--      = IntRule ? IntRule : &DiffusionIntegrator::GetRule(el, el);
-+   if (mesh->GetNE() == 0) { return; }
-    if (DeviceCanUseCeed())
-    {
-+      MFEM_VERIFY(!VQ && !MQ,
-+                  "Only scalar coefficient is supported for partial assembly for VectorDiffusionIntegrator");
-       delete ceedOp;
--      const bool mixed = mesh->GetNumGeometries(mesh->Dimension()) > 1 ||
--                         fes.IsVariableOrder();
--      if (mixed)
--      {
--         ceedOp = new ceed::MixedPADiffusionIntegrator(*this, fes, Q);
--      }
--      else
--      {
--         ceedOp = new ceed::PADiffusionIntegrator(fes, *ir, Q);
--      }
-+      ceedOp = new ceed::PADiffusionIntegrator(*this, fes, Q);
-       return;
-    }
-+
-+   // Assumes tensor-product elements
-+   const FiniteElement &el = *fes.GetFE(0);
-+   ElementTransformation &T = *mesh->GetElementTransformation(0);
-+   const IntegrationRule *ir = IntRule ? IntRule : &GetRule(el, T);
-    const int dims = el.GetDim();
-    const int symmDims = (dims * (dims + 1)) / 2; // 1x1: 1, 2x2: 3, 3x3: 6
-    const int nq = ir->GetNPoints();
-@@ -209,6 +204,28 @@ void VectorDiffusionIntegrator::AssemblePA(const FiniteElementSpace &fes)
-    }
- }
- 
-+void VectorDiffusionIntegrator::AssemblePABoundary(
-+   const FiniteElementSpace &fes)
-+{
-+   Mesh *mesh = fes.GetMesh();
-+   if (mesh->GetNBE() == 0) { return; }
-+   if (DeviceCanUseCeed())
-+   {
-+      MFEM_VERIFY(!VQ && !MQ,
-+                  "Only scalar coefficient is supported for partial assembly for VectorDiffusionIntegrator");
-+      delete ceedOp;
-+      ceedOp = new ceed::PADiffusionIntegrator(*this, fes, Q, true);
-+      return;
-+   }
-+
-+   // Assumes tensor-product elements
-+   // const FiniteElement &el = *fes.GetBE(0);
-+   // ElementTransformation &T = *mesh->GetBdrElementTransformation(0);
-+   // const IntegrationRule *ir = IntRule ? IntRule : &GetRule(el, T);
-+   MFEM_ABORT("Error: VectorDiffusionIntegrator::AssemblePABoundary only implemented"
-+              " with libCEED");
-+}
-+
- template<int T_D1D = 0, int T_Q1D = 0>
- static void PAVectorDiffusionDiagonal2D(const int NE,
-                                         const Array<double> &b,
-diff --git a/fem/integ/bilininteg_vecdiv_pa.cpp b/fem/integ/bilininteg_vecdiv_pa.cpp
-index 63f7a3308..cf58df9ea 100644
---- a/fem/integ/bilininteg_vecdiv_pa.cpp
-+++ b/fem/integ/bilininteg_vecdiv_pa.cpp
-@@ -105,9 +105,8 @@ void VectorDivergenceIntegrator::AssemblePA(const FiniteElementSpace &trial_fes,
-    Mesh *mesh = trial_fes.GetMesh();
-    const FiniteElement &trial_fe = *trial_fes.GetFE(0);
-    const FiniteElement &test_fe = *test_fes.GetFE(0);
--   ElementTransformation *trans = mesh->GetElementTransformation(0);
--   const IntegrationRule *ir = IntRule ? IntRule : &GetRule(trial_fe, test_fe,
--                                                            *trans);
-+   ElementTransformation &T = *mesh->GetElementTransformation(0);
-+   const IntegrationRule *ir = IntRule ? IntRule : &GetRule(trial_fe, test_fe, T);
-    const int dims = trial_fe.GetDim();
-    const int dimsToStore = dims * dims;
-    nq = ir->GetNPoints();
-diff --git a/fem/integ/bilininteg_vecmass_mf.cpp b/fem/integ/bilininteg_vecmass_mf.cpp
-index cc2eb0174..59d7209db 100644
---- a/fem/integ/bilininteg_vecmass_mf.cpp
-+++ b/fem/integ/bilininteg_vecmass_mf.cpp
-@@ -19,43 +19,40 @@ namespace mfem
- 
- void VectorMassIntegrator::AssembleMF(const FiniteElementSpace &fes)
- {
--   // Assuming the same element type
-    Mesh *mesh = fes.GetMesh();
-    if (mesh->GetNE() == 0) { return; }
--   const FiniteElement &el = *fes.GetFE(0);
--   ElementTransformation *T = mesh->GetElementTransformation(0);
--   const IntegrationRule *ir
--      = IntRule ? IntRule : &MassIntegrator::GetRule(el, el, *T);
-    if (DeviceCanUseCeed())
-    {
-       delete ceedOp;
--      const bool mixed = mesh->GetNumGeometries(mesh->Dimension()) > 1 ||
--                         fes.IsVariableOrder();
--      if (mixed)
--      {
--         ceedOp = new ceed::MixedMFMassIntegrator(*this, fes, Q);
--      }
--      else
--      {
--         ceedOp = new ceed::MFMassIntegrator(fes, *ir, Q);
--      }
-+      ceedOp = new ceed::MFMassIntegrator(*this, fes, Q);
-       return;
-    }
-+
-+   // Assuming the same element type
-+   // const FiniteElement &el = *fes.GetFE(0);
-+   // ElementTransformation &T = *mesh->GetElementTransformation(0);
-+   // const IntegrationRule *ir = IntRule ? IntRule : &GetRule(el, T);
-    MFEM_ABORT("Error: VectorMassIntegrator::AssembleMF only implemented with"
-               " libCEED");
- }
- 
--void VectorMassIntegrator::AddMultMF(const Vector &x, Vector &y) const
-+void VectorMassIntegrator::AssembleMFBoundary(const FiniteElementSpace &fes)
- {
-+   Mesh *mesh = fes.GetMesh();
-+   if (mesh->GetNBE() == 0) { return; }
-    if (DeviceCanUseCeed())
-    {
--      ceedOp->AddMult(x, y);
--   }
--   else
--   {
--      MFEM_ABORT("Error: VectorMassIntegrator::AddMultMF only implemented with"
--                 " libCEED");
-+      delete ceedOp;
-+      ceedOp = new ceed::MFMassIntegrator(*this, fes, Q, true);
-+      return;
-    }
-+
-+   // Assuming the same element type
-+   // const FiniteElement &el = *fes.GetBE(0);
-+   // ElementTransformation &T = *mesh->GetBdrElementTransformation(0);
-+   // const IntegrationRule *ir = IntRule ? IntRule : &GetRule(el, T);
-+   MFEM_ABORT("Error: VectorMassIntegrator::AssembleMFBoundary only implemented with"
-+              " libCEED");
- }
- 
- void VectorMassIntegrator::AssembleDiagonalMF(Vector &diag)
-@@ -71,4 +68,17 @@ void VectorMassIntegrator::AssembleDiagonalMF(Vector &diag)
-    }
- }
- 
-+void VectorMassIntegrator::AddMultMF(const Vector &x, Vector &y) const
-+{
-+   if (DeviceCanUseCeed())
-+   {
-+      ceedOp->AddMult(x, y);
-+   }
-+   else
-+   {
-+      MFEM_ABORT("Error: VectorMassIntegrator::AddMultMF only implemented with"
-+                 " libCEED");
-+   }
-+}
-+
- } // namespace mfem
-diff --git a/fem/integ/bilininteg_vecmass_pa.cpp b/fem/integ/bilininteg_vecmass_pa.cpp
-index b1c20b4c4..3be9e2283 100644
---- a/fem/integ/bilininteg_vecmass_pa.cpp
-+++ b/fem/integ/bilininteg_vecmass_pa.cpp
-@@ -19,33 +19,23 @@ namespace mfem
- 
- void VectorMassIntegrator::AssemblePA(const FiniteElementSpace &fes)
- {
--   // Assuming the same element type
-    Mesh *mesh = fes.GetMesh();
-    if (mesh->GetNE() == 0) { return; }
--   const FiniteElement &el = *fes.GetFE(0);
--   ElementTransformation *T = mesh->GetElementTransformation(0);
--   const IntegrationRule *ir
--      = IntRule ? IntRule : &MassIntegrator::GetRule(el, el, *T);
-    if (DeviceCanUseCeed())
-    {
-       delete ceedOp;
--      const bool mixed = mesh->GetNumGeometries(mesh->Dimension()) > 1 ||
--                         fes.IsVariableOrder();
--      if (mixed)
--      {
--         ceedOp = new ceed::MixedPAMassIntegrator(*this, fes, Q);
--      }
--      else
--      {
--         ceedOp = new ceed::PAMassIntegrator(fes, *ir, Q);
--      }
-+      ceedOp = new ceed::PAMassIntegrator(*this, fes, Q);
-       return;
-    }
-+
-+   // Assuming the same element type
-+   const FiniteElement &el = *fes.GetFE(0);
-+   ElementTransformation &T = *mesh->GetElementTransformation(0);
-+   const IntegrationRule *ir = IntRule ? IntRule : &GetRule(el, T);
-    dim = mesh->Dimension();
-    ne = fes.GetMesh()->GetNE();
-    nq = ir->GetNPoints();
--   geom = mesh->GetGeometricFactors(*ir, GeometricFactors::COORDINATES |
--                                    GeometricFactors::JACOBIANS);
-+   geom = mesh->GetGeometricFactors(*ir, GeometricFactors::JACOBIANS);
-    maps = &el.GetDofToQuad(*ir, DofToQuad::TENSOR);
-    dofs1D = maps->ndof;
-    quad1D = maps->nqpt;
-@@ -106,6 +96,25 @@ void VectorMassIntegrator::AssemblePA(const FiniteElementSpace &fes)
-    }
- }
- 
-+void VectorMassIntegrator::AssemblePABoundary(const FiniteElementSpace &fes)
-+{
-+   Mesh *mesh = fes.GetMesh();
-+   if (mesh->GetNBE() == 0) { return; }
-+   if (DeviceCanUseCeed())
-+   {
-+      delete ceedOp;
-+      ceedOp = new ceed::PAMassIntegrator(*this, fes, Q, true);
-+      return;
-+   }
-+
-+   // Assuming the same element type
-+   // const FiniteElement &el = *fes.GetBE(0);
-+   // ElementTransformation &T = *mesh->GetBdrElementTransformation(0);
-+   // const IntegrationRule *ir = IntRule ? IntRule : &GetRule(el, T);
-+   MFEM_ABORT("Error: VectorMassIntegrator::AssemblePABoundary only implemented with"
-+              " libCEED");
-+}
-+
- template<const int T_D1D = 0, const int T_Q1D = 0>
- static void PAVectorMassAssembleDiagonal2D(const int NE,
-                                            const Array<double> &B_,
-diff --git a/fem/integ/bilininteg_vectorfediv_pa.cpp b/fem/integ/bilininteg_vectorfediv_pa.cpp
-index 2915a253b..4a2c2a055 100644
---- a/fem/integ/bilininteg_vectorfediv_pa.cpp
-+++ b/fem/integ/bilininteg_vectorfediv_pa.cpp
-@@ -38,9 +38,9 @@ VectorFEDivergenceIntegrator::AssemblePA(const FiniteElementSpace &trial_fes,
-       dynamic_cast<const NodalTensorFiniteElement*>(test_fel);
-    MFEM_VERIFY(test_el != NULL, "Only NodalTensorFiniteElement is supported!");
- 
--   const IntegrationRule *ir = IntRule ? IntRule : &MassIntegrator::GetRule(
--                                  *trial_el, *trial_el,
--                                  *mesh->GetElementTransformation(0));
-+   ElementTransformation &T = *mesh->GetElementTransformation(0);
-+   const IntegrationRule *ir = IntRule ? IntRule : &GetRule(*trial_el, *test_el,
-+                                                            T);
- 
-    const int dims = trial_el->GetDim();
-    MFEM_VERIFY(dims == 2 || dims == 3, "");
-diff --git a/fem/integ/bilininteg_vectorfemass_mf.cpp b/fem/integ/bilininteg_vectorfemass_mf.cpp
-new file mode 100644
-index 000000000..91d2b6b5a
---- /dev/null
-+++ b/fem/integ/bilininteg_vectorfemass_mf.cpp
-@@ -0,0 +1,89 @@
-+// Copyright (c) 2010-2023, Lawrence Livermore National Security, LLC. Produced
-+// at the Lawrence Livermore National Laboratory. All Rights reserved. See files
-+// LICENSE and NOTICE for details. LLNL-CODE-806117.
-+//
-+// This file is part of the MFEM library. For more information and source code
-+// availability visit https://mfem.org.
-+//
-+// MFEM is free software; you can redistribute it and/or modify it under the
-+// terms of the BSD-3 license. We welcome feedback and contributions, see file
-+// CONTRIBUTING.md for details.
-+
-+#include "../bilininteg.hpp"
-+#include "../gridfunc.hpp"
-+#include "../ceed/integrators/vecfemass/vecfemass.hpp"
-+
-+using namespace std;
-+
-+namespace mfem
-+{
-+
-+void VectorFEMassIntegrator::AssembleMF(const FiniteElementSpace &fes)
-+{
-+   Mesh *mesh = fes.GetMesh();
-+   if (mesh->GetNE() == 0) { return; }
-+   if (DeviceCanUseCeed())
-+   {
-+      delete ceedOp;
-+      if (MQ) { ceedOp = new ceed::MFVectorFEMassIntegrator(*this, fes, MQ); }
-+      else if (DQ) { ceedOp = new ceed::MFVectorFEMassIntegrator(*this, fes, DQ); }
-+      else { ceedOp = new ceed::MFVectorFEMassIntegrator(*this, fes, Q); }
-+      return;
-+   }
-+
-+   // Assumes tensor-product elements
-+   // const FiniteElement &el = *fes.GetFE(0);
-+   // ElementTransformation &T = *mesh->GetElementTransformation(0);
-+   // const IntegrationRule *ir = IntRule ? IntRule : &GetRule(el, T);
-+   MFEM_ABORT("Error: VectorFEMassIntegrator::AssembleMF only implemented with"
-+              " libCEED");
-+}
-+
-+void VectorFEMassIntegrator::AssembleMFBoundary(const FiniteElementSpace &fes)
-+{
-+   Mesh *mesh = fes.GetMesh();
-+   if (mesh->GetNBE() == 0) { return; }
-+   if (DeviceCanUseCeed())
-+   {
-+      delete ceedOp;
-+      if (MQ) { ceedOp = new ceed::MFVectorFEMassIntegrator(*this, fes, MQ, true); }
-+      else if (DQ) { ceedOp = new ceed::MFVectorFEMassIntegrator(*this, fes, DQ, true); }
-+      else { ceedOp = new ceed::MFVectorFEMassIntegrator(*this, fes, Q, true); }
-+      return;
-+   }
-+
-+   // Assumes tensor-product elements
-+   // const FiniteElement &el = *fes.GetBE(0);
-+   // ElementTransformation &T = *mesh->GetBdrElementTransformation(0);
-+   // const IntegrationRule *ir = IntRule ? IntRule : &GetRule(el, T);
-+   MFEM_ABORT("Error: VectorFEMassIntegrator::AssembleMFBoundary only implemented with"
-+              " libCEED");
-+}
-+
-+void VectorFEMassIntegrator::AssembleDiagonalMF(Vector &diag)
-+{
-+   if (DeviceCanUseCeed())
-+   {
-+      ceedOp->GetDiagonal(diag);
-+   }
-+   else
-+   {
-+      MFEM_ABORT("Error: VectorFEMassIntegrator::AssembleDiagonalMF only"
-+                 " implemented with libCEED");
-+   }
-+}
-+
-+void VectorFEMassIntegrator::AddMultMF(const Vector &x, Vector &y) const
-+{
-+   if (DeviceCanUseCeed())
-+   {
-+      ceedOp->AddMult(x, y);
-+   }
-+   else
-+   {
-+      MFEM_ABORT("Error: VectorFEMassIntegrator::AddMultMF only implemented with"
-+                 " libCEED");
-+   }
-+}
-+
-+}
-diff --git a/fem/integ/bilininteg_vectorfemass_pa.cpp b/fem/integ/bilininteg_vectorfemass_pa.cpp
-index c07e9f816..a49a9daa7 100644
---- a/fem/integ/bilininteg_vectorfemass_pa.cpp
-+++ b/fem/integ/bilininteg_vectorfemass_pa.cpp
-@@ -12,6 +12,7 @@
- #include "../bilininteg.hpp"
- #include "../gridfunc.hpp"
- #include "../qfunction.hpp"
-+#include "../ceed/integrators/vecfemass/vecfemass.hpp"
- #include "bilininteg_diffusion_kernels.hpp"
- #include "bilininteg_hcurl_kernels.hpp"
- #include "bilininteg_hdiv_kernels.hpp"
-@@ -23,30 +24,37 @@ namespace mfem
- void VectorFEMassIntegrator::AssemblePA(const FiniteElementSpace &trial_fes,
-                                         const FiniteElementSpace &test_fes)
- {
--   // Assumes tensor-product elements
-    Mesh *mesh = trial_fes.GetMesh();
-+   if (mesh->GetNE() == 0) { return; }
-+   if (DeviceCanUseCeed())
-+   {
-+      MFEM_VERIFY(&trial_fes == &test_fes,
-+                  "VectorFEMassIntegrator with mixed FE spaces is not supported by libCEED!");
-+      delete ceedOp;
-+      if (MQ) { ceedOp = new ceed::PAVectorFEMassIntegrator(*this, trial_fes, MQ); }
-+      else if (DQ) { ceedOp = new ceed::PAVectorFEMassIntegrator(*this, trial_fes, DQ); }
-+      else { ceedOp = new ceed::PAVectorFEMassIntegrator(*this, trial_fes, Q); }
-+      return;
-+   }
- 
-+   // Assumes tensor-product elements
-    const FiniteElement *trial_fel = trial_fes.GetFE(0);
-    const VectorTensorFiniteElement *trial_el =
-       dynamic_cast<const VectorTensorFiniteElement*>(trial_fel);
-    MFEM_VERIFY(trial_el != NULL, "Only VectorTensorFiniteElement is supported!");
--
-    const FiniteElement *test_fel = test_fes.GetFE(0);
-    const VectorTensorFiniteElement *test_el =
-       dynamic_cast<const VectorTensorFiniteElement*>(test_fel);
-    MFEM_VERIFY(test_el != NULL, "Only VectorTensorFiniteElement is supported!");
--
--   const IntegrationRule *ir
--      = IntRule ? IntRule : &MassIntegrator::GetRule(*trial_el, *trial_el,
--                                                     *mesh->GetElementTransformation(0));
-+   ElementTransformation &T = *mesh->GetElementTransformation(0);
-+   const IntegrationRule *ir = IntRule ? IntRule : &GetRule(*trial_el, *test_el,
-+                                                            T);
-    const int dims = trial_el->GetDim();
-    MFEM_VERIFY(dims == 2 || dims == 3, "");
--
-    const int symmDims = (dims * (dims + 1)) / 2; // 1x1: 1, 2x2: 3, 3x3: 6
-    nq = ir->GetNPoints();
-    dim = mesh->Dimension();
-    MFEM_VERIFY(dim == 2 || dim == 3, "");
--
-    ne = trial_fes.GetNE();
-    MFEM_VERIFY(ne == test_fes.GetNE(),
-                "Different meshes for test and trial spaces");
-@@ -55,13 +63,10 @@ void VectorFEMassIntegrator::AssemblePA(const FiniteElementSpace &trial_fes,
-    mapsO = &trial_el->GetDofToQuadOpen(*ir, DofToQuad::TENSOR);
-    dofs1D = mapsC->ndof;
-    quad1D = mapsC->nqpt;
--
-    mapsCtest = &test_el->GetDofToQuad(*ir, DofToQuad::TENSOR);
-    mapsOtest = &test_el->GetDofToQuadOpen(*ir, DofToQuad::TENSOR);
-    dofs1Dtest = mapsCtest->ndof;
--
-    MFEM_VERIFY(dofs1D == mapsO->ndof + 1 && quad1D == mapsO->nqpt, "");
--
-    trial_fetype = trial_el->GetDerivType();
-    test_fetype = test_el->GetDerivType();
- 
-@@ -72,6 +77,7 @@ void VectorFEMassIntegrator::AssemblePA(const FiniteElementSpace &trial_fes,
- 
-    QuadratureSpace qs(*mesh, *ir);
-    CoefficientVector coeff(qs, CoefficientStorage::SYMMETRIC);
-+
-    if (Q) { coeff.Project(*Q); }
-    else if (MQ) { coeff.ProjectTranspose(*MQ); }
-    else if (DQ) { coeff.Project(*DQ); }
-@@ -138,172 +144,206 @@ void VectorFEMassIntegrator::AssemblePA(const FiniteElementSpace &trial_fes,
-    }
- }
- 
--void VectorFEMassIntegrator::AssembleDiagonalPA(Vector& diag)
-+void VectorFEMassIntegrator::AssemblePABoundary(const FiniteElementSpace &fes)
- {
--   if (dim == 3)
-+   Mesh *mesh = fes.GetMesh();
-+   if (mesh->GetNBE() == 0) { return; }
-+   if (DeviceCanUseCeed())
-    {
--      if (trial_fetype == mfem::FiniteElement::CURL && test_fetype == trial_fetype)
-+      delete ceedOp;
-+      if (MQ) { ceedOp = new ceed::PAVectorFEMassIntegrator(*this, fes, MQ, true); }
-+      else if (DQ) { ceedOp = new ceed::PAVectorFEMassIntegrator(*this, fes, DQ, true); }
-+      else { ceedOp = new ceed::PAVectorFEMassIntegrator(*this, fes, Q, true); }
-+      return;
-+   }
-+
-+   // Assuming the same element type
-+   // const FiniteElement &el = *fes.GetBE(0);
-+   // ElementTransformation &T = *mesh->GetBdrElementTransformation(0);
-+   // const IntegrationRule *ir = IntRule ? IntRule : &GetRule(el, T);
-+   MFEM_ABORT("Error: VectorFEMassIntegrator::AssemblePABoundary only implemented with"
-+              " libCEED");
-+}
-+
-+void VectorFEMassIntegrator::AssembleDiagonalPA(Vector &diag)
-+{
-+   if (DeviceCanUseCeed())
-+   {
-+      ceedOp->GetDiagonal(diag);
-+   }
-+   else
-+   {
-+      if (dim == 3)
-       {
--         if (Device::Allows(Backend::DEVICE_MASK))
-+         if (trial_fetype == mfem::FiniteElement::CURL && test_fetype == trial_fetype)
-          {
--            const int ID = (dofs1D << 4) | quad1D;
--            switch (ID)
-+            if (Device::Allows(Backend::DEVICE_MASK))
-+            {
-+               const int ID = (dofs1D << 4) | quad1D;
-+               switch (ID)
-+               {
-+                  case 0x23:
-+                     return internal::SmemPAHcurlMassAssembleDiagonal3D<2,3>(
-+                               dofs1D, quad1D, ne, symmetric,
-+                               mapsO->B, mapsC->B, pa_data, diag);
-+                  case 0x34:
-+                     return internal::SmemPAHcurlMassAssembleDiagonal3D<3,4>(
-+                               dofs1D, quad1D, ne, symmetric,
-+                               mapsO->B, mapsC->B, pa_data, diag);
-+                  case 0x45:
-+                     return internal::SmemPAHcurlMassAssembleDiagonal3D<4,5>(
-+                               dofs1D, quad1D, ne, symmetric,
-+                               mapsO->B, mapsC->B, pa_data, diag);
-+                  case 0x56:
-+                     return internal::SmemPAHcurlMassAssembleDiagonal3D<5,6>(
-+                               dofs1D, quad1D, ne, symmetric,
-+                               mapsO->B, mapsC->B, pa_data, diag);
-+                  default:
-+                     return internal::SmemPAHcurlMassAssembleDiagonal3D(
-+                               dofs1D, quad1D, ne, symmetric,
-+                               mapsO->B, mapsC->B, pa_data, diag);
-+               }
-+            }
-+            else
-             {
--               case 0x23:
--                  return internal::SmemPAHcurlMassAssembleDiagonal3D<2,3>(
--                            dofs1D, quad1D, ne, symmetric,
--                            mapsO->B, mapsC->B, pa_data, diag);
--               case 0x34:
--                  return internal::SmemPAHcurlMassAssembleDiagonal3D<3,4>(
--                            dofs1D, quad1D, ne, symmetric,
--                            mapsO->B, mapsC->B, pa_data, diag);
--               case 0x45:
--                  return internal::SmemPAHcurlMassAssembleDiagonal3D<4,5>(
--                            dofs1D, quad1D, ne, symmetric,
--                            mapsO->B, mapsC->B, pa_data, diag);
--               case 0x56:
--                  return internal::SmemPAHcurlMassAssembleDiagonal3D<5,6>(
--                            dofs1D, quad1D, ne, symmetric,
--                            mapsO->B, mapsC->B, pa_data, diag);
--               default:
--                  return internal::SmemPAHcurlMassAssembleDiagonal3D(
--                            dofs1D, quad1D, ne, symmetric,
--                            mapsO->B, mapsC->B, pa_data, diag);
-+               internal::PAHcurlMassAssembleDiagonal3D(dofs1D, quad1D, ne, symmetric,
-+                                                       mapsO->B, mapsC->B, pa_data, diag);
-             }
-          }
-+         else if (trial_fetype == mfem::FiniteElement::DIV &&
-+                  test_fetype == trial_fetype)
-+         {
-+            internal::PAHdivMassAssembleDiagonal3D(dofs1D, quad1D, ne, symmetric,
-+                                                   mapsO->B, mapsC->B, pa_data, diag);
-+         }
-          else
-          {
--            internal::PAHcurlMassAssembleDiagonal3D(dofs1D, quad1D, ne, symmetric,
--                                                    mapsO->B, mapsC->B, pa_data, diag);
-+            MFEM_ABORT("Unknown kernel.");
-          }
-       }
--      else if (trial_fetype == mfem::FiniteElement::DIV &&
--               test_fetype == trial_fetype)
-+      else // 2D
-       {
--         internal::PAHdivMassAssembleDiagonal3D(dofs1D, quad1D, ne, symmetric,
--                                                mapsO->B, mapsC->B, pa_data, diag);
--      }
--      else
--      {
--         MFEM_ABORT("Unknown kernel.");
--      }
--   }
--   else // 2D
--   {
--      if (trial_fetype == mfem::FiniteElement::CURL && test_fetype == trial_fetype)
--      {
--         internal::PAHcurlMassAssembleDiagonal2D(dofs1D, quad1D, ne, symmetric,
--                                                 mapsO->B, mapsC->B, pa_data, diag);
--      }
--      else if (trial_fetype == mfem::FiniteElement::DIV &&
--               test_fetype == trial_fetype)
--      {
--         internal::PAHdivMassAssembleDiagonal2D(dofs1D, quad1D, ne, symmetric,
--                                                mapsO->B, mapsC->B, pa_data, diag);
--      }
--      else
--      {
--         MFEM_ABORT("Unknown kernel.");
-+         if (trial_fetype == mfem::FiniteElement::CURL && test_fetype == trial_fetype)
-+         {
-+            internal::PAHcurlMassAssembleDiagonal2D(dofs1D, quad1D, ne, symmetric,
-+                                                    mapsO->B, mapsC->B, pa_data, diag);
-+         }
-+         else if (trial_fetype == mfem::FiniteElement::DIV &&
-+                  test_fetype == trial_fetype)
-+         {
-+            internal::PAHdivMassAssembleDiagonal2D(dofs1D, quad1D, ne, symmetric,
-+                                                   mapsO->B, mapsC->B, pa_data, diag);
-+         }
-+         else
-+         {
-+            MFEM_ABORT("Unknown kernel.");
-+         }
-       }
-    }
- }
- 
- void VectorFEMassIntegrator::AddMultPA(const Vector &x, Vector &y) const
- {
--   const bool trial_curl = (trial_fetype == mfem::FiniteElement::CURL);
--   const bool trial_div = (trial_fetype == mfem::FiniteElement::DIV);
--   const bool test_curl = (test_fetype == mfem::FiniteElement::CURL);
--   const bool test_div = (test_fetype == mfem::FiniteElement::DIV);
--
--   if (dim == 3)
-+   if (DeviceCanUseCeed())
-+   {
-+      ceedOp->AddMult(x, y);
-+   }
-+   else
-    {
--      if (trial_curl && test_curl)
-+      const bool trial_curl = (trial_fetype == mfem::FiniteElement::CURL);
-+      const bool trial_div = (trial_fetype == mfem::FiniteElement::DIV);
-+      const bool test_curl = (test_fetype == mfem::FiniteElement::CURL);
-+      const bool test_div = (test_fetype == mfem::FiniteElement::DIV);
-+
-+      if (dim == 3)
-       {
--         if (Device::Allows(Backend::DEVICE_MASK))
-+         if (trial_curl && test_curl)
-          {
--            const int ID = (dofs1D << 4) | quad1D;
--            switch (ID)
-+            if (Device::Allows(Backend::DEVICE_MASK))
-+            {
-+               const int ID = (dofs1D << 4) | quad1D;
-+               switch (ID)
-+               {
-+                  case 0x23:
-+                     return internal::SmemPAHcurlMassApply3D<2,3>(
-+                               dofs1D, quad1D, ne, symmetric,
-+                               mapsO->B, mapsC->B, mapsO->Bt,
-+                               mapsC->Bt, pa_data, x, y);
-+                  case 0x34:
-+                     return internal::SmemPAHcurlMassApply3D<3,4>(
-+                               dofs1D, quad1D, ne, symmetric,
-+                               mapsO->B, mapsC->B, mapsO->Bt,
-+                               mapsC->Bt, pa_data, x, y);
-+                  case 0x45:
-+                     return internal::SmemPAHcurlMassApply3D<4,5>(
-+                               dofs1D, quad1D, ne, symmetric,
-+                               mapsO->B, mapsC->B, mapsO->Bt,
-+                               mapsC->Bt, pa_data, x, y);
-+                  case 0x56:
-+                     return internal::SmemPAHcurlMassApply3D<5,6>(
-+                               dofs1D, quad1D, ne, symmetric,
-+                               mapsO->B, mapsC->B, mapsO->Bt,
-+                               mapsC->Bt, pa_data, x, y);
-+                  default:
-+                     return internal::SmemPAHcurlMassApply3D(
-+                               dofs1D, quad1D, ne, symmetric,
-+                               mapsO->B, mapsC->B, mapsO->Bt,
-+                               mapsC->Bt, pa_data, x, y);
-+               }
-+            }
-+            else
-             {
--               case 0x23:
--                  return internal::SmemPAHcurlMassApply3D<2,3>(
--                            dofs1D, quad1D, ne, symmetric,
--                            mapsO->B, mapsC->B, mapsO->Bt,
--                            mapsC->Bt, pa_data, x, y);
--               case 0x34:
--                  return internal::SmemPAHcurlMassApply3D<3,4>(
--                            dofs1D, quad1D, ne, symmetric,
--                            mapsO->B, mapsC->B, mapsO->Bt,
--                            mapsC->Bt, pa_data, x, y);
--               case 0x45:
--                  return internal::SmemPAHcurlMassApply3D<4,5>(
--                            dofs1D, quad1D, ne, symmetric,
--                            mapsO->B, mapsC->B, mapsO->Bt,
--                            mapsC->Bt, pa_data, x, y);
--               case 0x56:
--                  return internal::SmemPAHcurlMassApply3D<5,6>(
--                            dofs1D, quad1D, ne, symmetric,
--                            mapsO->B, mapsC->B, mapsO->Bt,
--                            mapsC->Bt, pa_data, x, y);
--               default:
--                  return internal::SmemPAHcurlMassApply3D(
--                            dofs1D, quad1D, ne, symmetric,
--                            mapsO->B, mapsC->B, mapsO->Bt,
--                            mapsC->Bt, pa_data, x, y);
-+               internal::PAHcurlMassApply3D(dofs1D, quad1D, ne, symmetric, mapsO->B, mapsC->B,
-+                                            mapsO->Bt, mapsC->Bt, pa_data, x, y);
-             }
-          }
-+         else if (trial_div && test_div)
-+         {
-+            internal::PAHdivMassApply(3, dofs1D, quad1D, ne, symmetric, mapsO->B, mapsC->B,
-+                                      mapsO->Bt, mapsC->Bt, pa_data, x, y);
-+         }
-+         else if (trial_curl && test_div)
-+         {
-+            const bool scalarCoeff = !(DQ || MQ);
-+            internal::PAHcurlHdivMassApply3D(dofs1D, dofs1Dtest, quad1D, ne, scalarCoeff,
-+                                             true, false, mapsO->B, mapsC->B, mapsOtest->Bt,
-+                                             mapsCtest->Bt, pa_data, x, y);
-+         }
-+         else if (trial_div && test_curl)
-+         {
-+            const bool scalarCoeff = !(DQ || MQ);
-+            internal::PAHcurlHdivMassApply3D(dofs1D, dofs1Dtest, quad1D, ne, scalarCoeff,
-+                                             false, false, mapsO->B, mapsC->B, mapsOtest->Bt,
-+                                             mapsCtest->Bt, pa_data, x, y);
-+         }
-          else
-          {
--            internal::PAHcurlMassApply3D(dofs1D, quad1D, ne, symmetric, mapsO->B, mapsC->B,
--                                         mapsO->Bt, mapsC->Bt, pa_data, x, y);
-+            MFEM_ABORT("Unknown kernel.");
-          }
-       }
--      else if (trial_div && test_div)
--      {
--         internal::PAHdivMassApply(3, dofs1D, quad1D, ne, symmetric, mapsO->B, mapsC->B,
--                                   mapsO->Bt, mapsC->Bt, pa_data, x, y);
--      }
--      else if (trial_curl && test_div)
--      {
--         const bool scalarCoeff = !(DQ || MQ);
--         internal::PAHcurlHdivMassApply3D(dofs1D, dofs1Dtest, quad1D, ne, scalarCoeff,
--                                          true, false, mapsO->B, mapsC->B, mapsOtest->Bt,
--                                          mapsCtest->Bt, pa_data, x, y);
--      }
--      else if (trial_div && test_curl)
--      {
--         const bool scalarCoeff = !(DQ || MQ);
--         internal::PAHcurlHdivMassApply3D(dofs1D, dofs1Dtest, quad1D, ne, scalarCoeff,
--                                          false, false, mapsO->B, mapsC->B, mapsOtest->Bt,
--                                          mapsCtest->Bt, pa_data, x, y);
--      }
--      else
--      {
--         MFEM_ABORT("Unknown kernel.");
--      }
--   }
--   else // 2D
--   {
--      if (trial_curl && test_curl)
-+      else // 2D
-       {
--         internal::PAHcurlMassApply2D(dofs1D, quad1D, ne, symmetric, mapsO->B, mapsC->B,
-+         if (trial_curl && test_curl)
-+         {
-+            internal::PAHcurlMassApply2D(dofs1D, quad1D, ne, symmetric, mapsO->B, mapsC->B,
-+                                         mapsO->Bt, mapsC->Bt, pa_data, x, y);
-+         }
-+         else if (trial_div && test_div)
-+         {
-+            internal::PAHdivMassApply(2, dofs1D, quad1D, ne, symmetric, mapsO->B, mapsC->B,
-                                       mapsO->Bt, mapsC->Bt, pa_data, x, y);
--      }
--      else if (trial_div && test_div)
--      {
--         internal::PAHdivMassApply(2, dofs1D, quad1D, ne, symmetric, mapsO->B, mapsC->B,
--                                   mapsO->Bt,
--                                   mapsC->Bt, pa_data, x, y);
--      }
--      else if ((trial_curl && test_div) || (trial_div && test_curl))
--      {
--         const bool scalarCoeff = !(DQ || MQ);
--         internal::PAHcurlHdivMassApply2D(dofs1D, dofs1Dtest, quad1D, ne, scalarCoeff,
--                                          trial_curl, false, mapsO->B, mapsC->B,
--                                          mapsOtest->Bt, mapsCtest->Bt, pa_data, x, y);
--      }
--      else
--      {
--         MFEM_ABORT("Unknown kernel.");
-+         }
-+         else if ((trial_curl && test_div) || (trial_div && test_curl))
-+         {
-+            const bool scalarCoeff = !(DQ || MQ);
-+            internal::PAHcurlHdivMassApply2D(dofs1D, dofs1Dtest, quad1D, ne, scalarCoeff,
-+                                             trial_curl, false, mapsO->B, mapsC->B,
-+                                             mapsOtest->Bt, mapsCtest->Bt, pa_data, x, y);
-+         }
-+         else
-+         {
-+            MFEM_ABORT("Unknown kernel.");
-+         }
-       }
-    }
- }
-@@ -311,35 +351,43 @@ void VectorFEMassIntegrator::AddMultPA(const Vector &x, Vector &y) const
- void VectorFEMassIntegrator::AddMultTransposePA(const Vector &x,
-                                                 Vector &y) const
- {
--   const bool trial_curl = (trial_fetype == mfem::FiniteElement::CURL);
--   const bool trial_div = (trial_fetype == mfem::FiniteElement::DIV);
--   const bool test_curl = (test_fetype == mfem::FiniteElement::CURL);
--   const bool test_div = (test_fetype == mfem::FiniteElement::DIV);
--
--   bool symmetricSpaces = true;
--   if (dim == 3 && ((trial_div && test_curl) || (trial_curl && test_div)))
--   {
--      const bool scalarCoeff = !(DQ || MQ);
--      internal::PAHcurlHdivMassApply3D(dofs1D, dofs1Dtest, quad1D, ne, scalarCoeff,
--                                       trial_div, true, mapsO->B, mapsC->B,
--                                       mapsOtest->Bt, mapsCtest->Bt, pa_data, x, y);
--      symmetricSpaces = false;
--   }
--   else if (dim == 2 && ((trial_curl && test_div) || (trial_div && test_curl)))
-+   if (DeviceCanUseCeed())
-    {
--      const bool scalarCoeff = !(DQ || MQ);
--      internal::PAHcurlHdivMassApply2D(dofs1D, dofs1Dtest, quad1D, ne, scalarCoeff,
--                                       !trial_curl, true, mapsO->B, mapsC->B,
--                                       mapsOtest->Bt, mapsCtest->Bt, pa_data, x, y);
--      symmetricSpaces = false;
-+      MFEM_ABORT("AddMultTransposePA not yet implemented with libCEED for"
-+                 " VectorFEMassIntegrator.");
-    }
--   if (symmetricSpaces)
-+   else
-    {
--      if (MQ && dynamic_cast<SymmetricMatrixCoefficient*>(MQ) == NULL)
-+      const bool trial_curl = (trial_fetype == mfem::FiniteElement::CURL);
-+      const bool trial_div = (trial_fetype == mfem::FiniteElement::DIV);
-+      const bool test_curl = (test_fetype == mfem::FiniteElement::CURL);
-+      const bool test_div = (test_fetype == mfem::FiniteElement::DIV);
-+
-+      bool symmetricSpaces = true;
-+      if (dim == 3 && ((trial_div && test_curl) || (trial_curl && test_div)))
-+      {
-+         const bool scalarCoeff = !(DQ || MQ);
-+         internal::PAHcurlHdivMassApply3D(dofs1D, dofs1Dtest, quad1D, ne, scalarCoeff,
-+                                          trial_div, true, mapsO->B, mapsC->B,
-+                                          mapsOtest->Bt, mapsCtest->Bt, pa_data, x, y);
-+         symmetricSpaces = false;
-+      }
-+      else if (dim == 2 && ((trial_curl && test_div) || (trial_div && test_curl)))
-       {
--         MFEM_ABORT("VectorFEMassIntegrator transpose not implemented for asymmetric MatrixCoefficient");
-+         const bool scalarCoeff = !(DQ || MQ);
-+         internal::PAHcurlHdivMassApply2D(dofs1D, dofs1Dtest, quad1D, ne, scalarCoeff,
-+                                          !trial_curl, true, mapsO->B, mapsC->B,
-+                                          mapsOtest->Bt, mapsCtest->Bt, pa_data, x, y);
-+         symmetricSpaces = false;
-+      }
-+      if (symmetricSpaces)
-+      {
-+         if (MQ && dynamic_cast<SymmetricMatrixCoefficient*>(MQ) == NULL)
-+         {
-+            MFEM_ABORT("VectorFEMassIntegrator transpose not implemented for asymmetric MatrixCoefficient");
-+         }
-+         AddMultPA(x, y);
-       }
--      AddMultPA(x, y);
-    }
- }
- 
-diff --git a/fem/integ/lininteg_boundary.cpp b/fem/integ/lininteg_boundary.cpp
-index 9b785335c..92f1ff8a5 100644
---- a/fem/integ/lininteg_boundary.cpp
-+++ b/fem/integ/lininteg_boundary.cpp
-@@ -214,30 +214,28 @@ void BoundaryLFIntegrator::AssembleDevice(const FiniteElementSpace &fes,
-                                           const Array<int> &markers,
-                                           Vector &b)
- {
--   const FiniteElement &fe = *fes.GetBE(0);
--   const int qorder = oa * fe.GetOrder() + ob;
--   const Geometry::Type gtype = fe.GetGeomType();
--   const IntegrationRule &ir = IntRule ? *IntRule : IntRules.Get(gtype, qorder);
-    Mesh &mesh = *fes.GetMesh();
-+   const FiniteElement &fe = *fes.GetBE(0);
-+   ElementTransformation &T = *mesh.GetBdrElementTransformation(0);
-+   const IntegrationRule *ir = IntRule ? IntRule : &GetRule(fe, T, oa, ob);
- 
--   FaceQuadratureSpace qs(mesh, ir, FaceType::Boundary);
-+   FaceQuadratureSpace qs(mesh, *ir, FaceType::Boundary);
-    CoefficientVector coeff(Q, qs, CoefficientStorage::COMPRESSED);
--   BLFEvalAssemble(fes, ir, markers, coeff, false, b);
-+   BLFEvalAssemble(fes, *ir, markers, coeff, false, b);
- }
- 
- void BoundaryNormalLFIntegrator::AssembleDevice(const FiniteElementSpace &fes,
-                                                 const Array<int> &markers,
-                                                 Vector &b)
- {
--   const FiniteElement &fe = *fes.GetBE(0);
--   const int qorder = oa * fe.GetOrder() + ob;
--   const Geometry::Type gtype = fe.GetGeomType();
--   const IntegrationRule &ir = IntRule ? *IntRule : IntRules.Get(gtype, qorder);
-    Mesh &mesh = *fes.GetMesh();
-+   const FiniteElement &fe = *fes.GetBE(0);
-+   ElementTransformation &T = *mesh.GetBdrElementTransformation(0);
-+   const IntegrationRule *ir = IntRule ? IntRule : &GetRule(fe, T, oa, ob);
- 
--   FaceQuadratureSpace qs(mesh, ir, FaceType::Boundary);
-+   FaceQuadratureSpace qs(mesh, *ir, FaceType::Boundary);
-    CoefficientVector coeff(Q, qs, CoefficientStorage::COMPRESSED);
--   BLFEvalAssemble(fes, ir, markers, coeff, true, b);
-+   BLFEvalAssemble(fes, *ir, markers, coeff, true, b);
- }
- 
- } // namespace mfem
-diff --git a/fem/integ/lininteg_boundary_flux.cpp b/fem/integ/lininteg_boundary_flux.cpp
-index b9f047817..a6e422b94 100644
---- a/fem/integ/lininteg_boundary_flux.cpp
-+++ b/fem/integ/lininteg_boundary_flux.cpp
-@@ -166,15 +166,14 @@ void VectorFEBoundaryFluxLFIntegrator::AssembleDevice(
-    const Array<int> &markers,
-    Vector &b)
- {
--   const FiniteElement &fe = *fes.GetBE(0);
--   const int qorder = oa * fe.GetOrder() + ob;
--   const Geometry::Type gtype = fe.GetGeomType();
--   const IntegrationRule &ir = IntRule ? *IntRule : IntRules.Get(gtype, qorder);
-    Mesh &mesh = *fes.GetMesh();
-+   const FiniteElement &fe = *fes.GetBE(0);
-+   ElementTransformation &T = *mesh.GetBdrElementTransformation(0);
-+   const IntegrationRule *ir = IntRule ? IntRule : &GetRule(fe, T, oa, ob);
- 
--   FaceQuadratureSpace qs(mesh, ir, FaceType::Boundary);
-+   FaceQuadratureSpace qs(mesh, *ir, FaceType::Boundary);
-    CoefficientVector coeff(F, qs, CoefficientStorage::COMPRESSED);
--   BFLFEvalAssemble(fes, ir, markers, coeff, b);
-+   BFLFEvalAssemble(fes, *ir, markers, coeff, b);
- }
- 
- } // namespace mfem
-diff --git a/fem/integ/lininteg_domain.cpp b/fem/integ/lininteg_domain.cpp
-index 6ff7b090d..438da4df2 100644
---- a/fem/integ/lininteg_domain.cpp
-+++ b/fem/integ/lininteg_domain.cpp
-@@ -242,10 +242,10 @@ void DomainLFIntegrator::AssembleDevice(const FiniteElementSpace &fes,
-                                         const Array<int> &markers,
-                                         Vector &b)
- {
-+   Mesh &mesh = *fes.GetMesh();
-    const FiniteElement &fe = *fes.GetFE(0);
--   const int qorder = oa * fe.GetOrder() + ob;
--   const Geometry::Type gtype = fe.GetGeomType();
--   const IntegrationRule *ir = IntRule ? IntRule : &IntRules.Get(gtype, qorder);
-+   ElementTransformation &T = *mesh.GetElementTransformation(0);
-+   const IntegrationRule *ir = IntRule ? IntRule : &GetRule(fe, T, oa, ob);
- 
-    QuadratureSpace qs(*fes.GetMesh(), *ir);
-    CoefficientVector coeff(Q, qs, CoefficientStorage::COMPRESSED);
-@@ -256,10 +256,10 @@ void VectorDomainLFIntegrator::AssembleDevice(const FiniteElementSpace &fes,
-                                               const Array<int> &markers,
-                                               Vector &b)
- {
-+   Mesh &mesh = *fes.GetMesh();
-    const FiniteElement &fe = *fes.GetFE(0);
--   const int qorder = 2 * fe.GetOrder();
--   const Geometry::Type gtype = fe.GetGeomType();
--   const IntegrationRule *ir = IntRule ? IntRule : &IntRules.Get(gtype, qorder);
-+   ElementTransformation &T = *mesh.GetElementTransformation(0);
-+   const IntegrationRule *ir = IntRule ? IntRule : &GetRule(fe, T);
- 
-    QuadratureSpace qs(*fes.GetMesh(), *ir);
-    CoefficientVector coeff(Q, qs, CoefficientStorage::COMPRESSED);
-diff --git a/fem/integ/lininteg_domain_grad.cpp b/fem/integ/lininteg_domain_grad.cpp
-index 5cca01a1d..735ea56c4 100644
---- a/fem/integ/lininteg_domain_grad.cpp
-+++ b/fem/integ/lininteg_domain_grad.cpp
-@@ -321,11 +321,10 @@ void DomainLFGradIntegrator::AssembleDevice(const FiniteElementSpace &fes,
-                                             const Array<int> &markers,
-                                             Vector &b)
- {
--
-+   Mesh &mesh = *fes.GetMesh();
-    const FiniteElement &fe = *fes.GetFE(0);
--   const int qorder = 2 * fe.GetOrder();
--   const Geometry::Type gtype = fe.GetGeomType();
--   const IntegrationRule *ir = IntRule ? IntRule : &IntRules.Get(gtype, qorder);
-+   ElementTransformation &T = *mesh.GetElementTransformation(0);
-+   const IntegrationRule *ir = IntRule ? IntRule : &GetRule(fe, T);
- 
-    QuadratureSpace qs(*fes.GetMesh(), *ir);
-    CoefficientVector coeff(Q, qs, CoefficientStorage::COMPRESSED);
-@@ -336,10 +335,10 @@ void VectorDomainLFGradIntegrator::AssembleDevice(const FiniteElementSpace &fes,
-                                                   const Array<int> &markers,
-                                                   Vector &b)
- {
-+   Mesh &mesh = *fes.GetMesh();
-    const FiniteElement &fe = *fes.GetFE(0);
--   const int qorder = 2 * fe.GetOrder();
--   const Geometry::Type gtype = fe.GetGeomType();
--   const IntegrationRule *ir = IntRule ? IntRule : &IntRules.Get(gtype, qorder);
-+   ElementTransformation &T = *mesh.GetElementTransformation(0);
-+   const IntegrationRule *ir = IntRule ? IntRule : &GetRule(fe, T);
- 
-    QuadratureSpace qs(*fes.GetMesh(), *ir);
-    CoefficientVector coeff(Q, qs, CoefficientStorage::COMPRESSED);
-diff --git a/fem/integ/lininteg_domain_vectorfe.cpp b/fem/integ/lininteg_domain_vectorfe.cpp
-index 16d9e866c..0765d58bc 100644
---- a/fem/integ/lininteg_domain_vectorfe.cpp
-+++ b/fem/integ/lininteg_domain_vectorfe.cpp
-@@ -325,10 +325,10 @@ void VectorFEDomainLFIntegrator::AssembleDevice(const FiniteElementSpace &fes,
-                                                 const Array<int> &markers,
-                                                 Vector &b)
- {
-+   Mesh &mesh = *fes.GetMesh();
-    const FiniteElement &fe = *fes.GetFE(0);
--   const int qorder = 2 * fe.GetOrder();
--   const Geometry::Type gtype = fe.GetGeomType();
--   const IntegrationRule *ir = IntRule ? IntRule : &IntRules.Get(gtype, qorder);
-+   ElementTransformation &T = *mesh.GetElementTransformation(0);
-+   const IntegrationRule *ir = IntRule ? IntRule : &GetRule(fe, T);
- 
-    QuadratureSpace qs(*fes.GetMesh(), *ir);
-    CoefficientVector coeff(QF, qs, CoefficientStorage::COMPRESSED);
-diff --git a/fem/integ/nonlininteg_vecconvection_mf.cpp b/fem/integ/nonlininteg_vecconvection_mf.cpp
-index 4005d6836..370fd7991 100644
---- a/fem/integ/nonlininteg_vecconvection_mf.cpp
-+++ b/fem/integ/nonlininteg_vecconvection_mf.cpp
-@@ -19,27 +19,22 @@ namespace mfem
- void VectorConvectionNLFIntegrator::AssembleMF(const FiniteElementSpace &fes)
- {
-    MFEM_ASSERT(fes.GetOrdering() == Ordering::byNODES,
--               "PA Only supports Ordering::byNODES!");
-+               "MF only supports Ordering::byNODES!");
-    Mesh *mesh = fes.GetMesh();
--   const FiniteElement &el = *fes.GetFE(0);
--   ElementTransformation &T = *mesh->GetElementTransformation(0);
--   const IntegrationRule *ir = IntRule ? IntRule : &GetRule(el, T);
-+   if (mesh->GetNE() == 0) { return; }
-    if (DeviceCanUseCeed())
-    {
-       delete ceedOp;
--      const bool mixed = mesh->GetNumGeometries(mesh->Dimension()) > 1 ||
--                         fes.IsVariableOrder();
--      if (mixed)
--      {
--         ceedOp = new ceed::MixedMFVectorConvectionNLIntegrator(*this, fes, Q);
--      }
--      else
--      {
--         ceedOp = new ceed::MFVectorConvectionNLFIntegrator(fes, *ir, Q);
--      }
-+      ceedOp = new ceed::MFVectorConvectionNLIntegrator(*this, fes, Q);
-       return;
-    }
--   MFEM_ABORT("Not yet implemented.");
-+
-+   // Assuming the same element type
-+   // const FiniteElement &el = *fes.GetFE(0);
-+   // ElementTransformation &T = *mesh->GetElementTransformation(0);
-+   // const IntegrationRule *ir = IntRule ? IntRule : &GetRule(el, T);
-+   MFEM_ABORT("Error: VectorConvectionNLFIntegrator::AssembleMF only"
-+              " implemented with libCEED");
- }
- 
- void VectorConvectionNLFIntegrator::AddMultMF(const Vector &x, Vector &y) const
-diff --git a/fem/integ/nonlininteg_vecconvection_pa.cpp b/fem/integ/nonlininteg_vecconvection_pa.cpp
-index 7bed31800..d8ca8f899 100644
---- a/fem/integ/nonlininteg_vecconvection_pa.cpp
-+++ b/fem/integ/nonlininteg_vecconvection_pa.cpp
-@@ -21,24 +21,18 @@ void VectorConvectionNLFIntegrator::AssemblePA(const FiniteElementSpace &fes)
-    MFEM_ASSERT(fes.GetOrdering() == Ordering::byNODES,
-                "PA Only supports Ordering::byNODES!");
-    Mesh *mesh = fes.GetMesh();
--   const FiniteElement &el = *fes.GetFE(0);
--   ElementTransformation &T = *mesh->GetElementTransformation(0);
--   const IntegrationRule *ir = IntRule ? IntRule : &GetRule(el, T);
-+   if (mesh->GetNE() == 0) { return; }
-    if (DeviceCanUseCeed())
-    {
-       delete ceedOp;
--      const bool mixed = mesh->GetNumGeometries(mesh->Dimension()) > 1 ||
--                         fes.IsVariableOrder();
--      if (mixed)
--      {
--         ceedOp = new ceed::MixedPAVectorConvectionNLIntegrator(*this, fes, Q);
--      }
--      else
--      {
--         ceedOp = new ceed::PAVectorConvectionNLFIntegrator(fes, *ir, Q);
--      }
-+      ceedOp = new ceed::PAVectorConvectionNLIntegrator(*this, fes, Q);
-       return;
-    }
-+
-+   // Assumes tensor-product elements
-+   const FiniteElement &el = *fes.GetFE(0);
-+   ElementTransformation &T = *mesh->GetElementTransformation(0);
-+   const IntegrationRule *ir = IntRule ? IntRule : &GetRule(el, T);
-    dim = mesh->Dimension();
-    ne = fes.GetMesh()->GetNE();
-    nq = ir->GetNPoints();
-diff --git a/fem/lininteg.cpp b/fem/lininteg.cpp
-index c9b6b4699..25e7c2823 100644
---- a/fem/lininteg.cpp
-+++ b/fem/lininteg.cpp
-@@ -15,6 +15,22 @@
- namespace mfem
- {
- 
-+const IntegrationRule &LinearFormIntegrator::GetRule(
-+   const FiniteElement &el,
-+   ElementTransformation &Tr,
-+   int oa, int ob) const
-+{
-+   return IntRules.Get(el.GetGeomType(), oa * el.GetOrder() + ob);
-+}
-+
-+const IntegrationRule &LinearFormIntegrator::GetRule(
-+   const FiniteElement &el,
-+   FaceElementTransformations &Tr,
-+   int oa, int ob) const
-+{
-+   return IntRules.Get(Tr.GetGeometryType(), oa * el.GetOrder() + ob);
-+}
-+
- void LinearFormIntegrator::AssembleDevice(const FiniteElementSpace &fes,
-                                           const Array<int> &markers,
-                                           Vector &b)
-@@ -41,17 +57,11 @@ void DomainLFIntegrator::AssembleRHSElementVect(const FiniteElement &el,
- {
-    int dof = el.GetDof();
- 
--   shape.SetSize(dof);       // vector of size dof
-+   shape.SetSize(dof);  // vector of size dof
-    elvect.SetSize(dof);
-    elvect = 0.0;
- 
--   const IntegrationRule *ir = IntRule;
--   if (ir == NULL)
--   {
--      // ir = &IntRules.Get(el.GetGeomType(),
--      //                    oa * el.GetOrder() + ob + Tr.OrderW());
--      ir = &IntRules.Get(el.GetGeomType(), oa * el.GetOrder() + ob);
--   }
-+   const IntegrationRule *ir = IntRule ? IntRule : &GetRule(el, Tr, oa, ob);
- 
-    for (int i = 0; i < ir->GetNPoints(); i++)
-    {
-@@ -86,12 +96,7 @@ void DomainLFGradIntegrator::AssembleRHSElementVect(
-    elvect.SetSize(dof);
-    elvect = 0.0;
- 
--   const IntegrationRule *ir = IntRule;
--   if (ir == NULL)
--   {
--      int intorder = 2 * el.GetOrder();
--      ir = &IntRules.Get(el.GetGeomType(), intorder);
--   }
-+   const IntegrationRule *ir = IntRule ? IntRule : &GetRule(el, Tr);
- 
-    for (int i = 0; i < ir->GetNPoints(); i++)
-    {
-@@ -128,16 +133,11 @@ void BoundaryLFIntegrator::AssembleRHSElementVect(
- {
-    int dof = el.GetDof();
- 
--   shape.SetSize(dof);        // vector of size dof
-+   shape.SetSize(dof);  // vector of size dof
-    elvect.SetSize(dof);
-    elvect = 0.0;
- 
--   const IntegrationRule *ir = IntRule;
--   if (ir == NULL)
--   {
--      int intorder = oa * el.GetOrder() + ob;  // <----------
--      ir = &IntRules.Get(el.GetGeomType(), intorder);
--   }
-+   const IntegrationRule *ir = IntRule ? IntRule : &GetRule(el, Tr, oa, ob);
- 
-    for (int i = 0; i < ir->GetNPoints(); i++)
-    {
-@@ -157,16 +157,11 @@ void BoundaryLFIntegrator::AssembleRHSElementVect(
- {
-    int dof = el.GetDof();
- 
--   shape.SetSize(dof);        // vector of size dof
-+   shape.SetSize(dof);  // vector of size dof
-    elvect.SetSize(dof);
-    elvect = 0.0;
- 
--   const IntegrationRule *ir = IntRule;
--   if (ir == NULL)
--   {
--      int intorder = oa * el.GetOrder() + ob;    // <------ user control
--      ir = &IntRules.Get(Tr.FaceGeom, intorder); // of integration order
--   }
-+   const IntegrationRule *ir = IntRule ? IntRule : &GetRule(el, Tr, oa, ob);
- 
-    for (int i = 0; i < ir->GetNPoints(); i++)
-    {
-@@ -197,12 +192,7 @@ void BoundaryNormalLFIntegrator::AssembleRHSElementVect(
-    elvect.SetSize(dof);
-    elvect = 0.0;
- 
--   const IntegrationRule *ir = IntRule;
--   if (ir == NULL)
--   {
--      int intorder = oa * el.GetOrder() + ob;  // <----------
--      ir = &IntRules.Get(el.GetGeomType(), intorder);
--   }
-+   const IntegrationRule *ir = IntRule ? IntRule : &GetRule(el, Tr, oa, ob);
- 
-    for (int i = 0; i < ir->GetNPoints(); i++)
-    {
-@@ -241,12 +231,7 @@ void BoundaryTangentialLFIntegrator::AssembleRHSElementVect(
-       mfem_error("These methods make sense only in 2D problems.");
-    }
- 
--   const IntegrationRule *ir = IntRule;
--   if (ir == NULL)
--   {
--      int intorder = oa * el.GetOrder() + ob;  // <----------
--      ir = &IntRules.Get(el.GetGeomType(), intorder);
--   }
-+   const IntegrationRule *ir = IntRule ? IntRule : &GetRule(el, Tr, oa, ob);
- 
-    for (int i = 0; i < ir->GetNPoints(); i++)
-    {
-@@ -273,17 +258,12 @@ void VectorDomainLFIntegrator::AssembleRHSElementVect(
- 
-    double val,cf;
- 
--   shape.SetSize(dof);       // vector of size dof
-+   shape.SetSize(dof);  // vector of size dof
- 
-    elvect.SetSize(dof * vdim);
-    elvect = 0.0;
- 
--   const IntegrationRule *ir = IntRule;
--   if (ir == NULL)
--   {
--      int intorder = 2*el.GetOrder();
--      ir = &IntRules.Get(el.GetGeomType(), intorder);
--   }
-+   const IntegrationRule *ir = IntRule ? IntRule : &GetRule(el, Tr);
- 
-    for (int i = 0; i < ir->GetNPoints(); i++)
-    {
-@@ -337,12 +317,7 @@ void VectorDomainLFGradIntegrator::AssembleRHSElementVect(
-    elvect.SetSize(dof*(vdim/sdim));
-    elvect = 0.0;
- 
--   const IntegrationRule *ir = IntRule;
--   if (ir == NULL)
--   {
--      int intorder = 2 * el.GetOrder();
--      ir = &IntRules.Get(el.GetGeomType(), intorder);
--   }
-+   const IntegrationRule *ir = IntRule ? IntRule : &GetRule(el, Tr);
- 
-    Vector pelvect(dof);
-    Vector part_x(dim);
-@@ -384,12 +359,7 @@ void VectorBoundaryLFIntegrator::AssembleRHSElementVect(
-    elvect.SetSize(dof * vdim);
-    elvect = 0.0;
- 
--   const IntegrationRule *ir = IntRule;
--   if (ir == NULL)
--   {
--      int intorder = 2*el.GetOrder();
--      ir = &IntRules.Get(el.GetGeomType(), intorder);
--   }
-+   const IntegrationRule *ir = IntRule ? IntRule : &GetRule(el, Tr);
- 
-    for (int i = 0; i < ir->GetNPoints(); i++)
-    {
-@@ -419,12 +389,7 @@ void VectorBoundaryLFIntegrator::AssembleRHSElementVect(
-    elvect.SetSize(dof * vdim);
-    elvect = 0.0;
- 
--   const IntegrationRule *ir = IntRule;
--   if (ir == NULL)
--   {
--      int intorder = 2*el.GetOrder();
--      ir = &IntRules.Get(Tr.GetGeometryType(), intorder);
--   }
-+   const IntegrationRule *ir = IntRule ? IntRule : &GetRule(el, Tr);
- 
-    for (int i = 0; i < ir->GetNPoints(); i++)
-    {
-@@ -463,13 +428,8 @@ void VectorFEDomainLFIntegrator::AssembleRHSElementVect(
-    elvect.SetSize(dof);
-    elvect = 0.0;
- 
--   const IntegrationRule *ir = IntRule;
--   if (ir == NULL)
--   {
--      // int intorder = 2*el.GetOrder() - 1; // ok for O(h^{k+1}) conv. in L2
--      int intorder = 2*el.GetOrder();
--      ir = &IntRules.Get(el.GetGeomType(), intorder);
--   }
-+   // Previously: 2 * el.GetOrder() - 1; // ok for O(h^{k+1}) conv. in L2
-+   const IntegrationRule *ir = IntRule ? IntRule : &GetRule(el, Tr);
- 
-    for (int i = 0; i < ir->GetNPoints(); i++)
-    {
-@@ -512,12 +472,7 @@ void VectorFEDomainLFCurlIntegrator::AssembleRHSElementVect(
-    elvect.SetSize(dof);
-    elvect = 0.0;
- 
--   const IntegrationRule *ir = IntRule;
--   if (ir == NULL)
--   {
--      int intorder = 2*el.GetOrder();
--      ir = &IntRules.Get(el.GetGeomType(), intorder);
--   }
-+   const IntegrationRule *ir = IntRule ? IntRule : &GetRule(el, Tr);
- 
-    for (int i = 0; i < ir->GetNPoints(); i++)
-    {
-@@ -554,16 +509,11 @@ void VectorFEDomainLFDivIntegrator::AssembleRHSElementVect(
- {
-    int dof = el.GetDof();
- 
--   divshape.SetSize(dof);       // vector of size dof
-+   divshape.SetSize(dof);  // vector of size dof
-    elvect.SetSize(dof);
-    elvect = 0.0;
- 
--   const IntegrationRule *ir = IntRule;
--   if (ir == NULL)
--   {
--      int intorder = 2 * el.GetOrder();
--      ir = &IntRules.Get(el.GetGeomType(), intorder);
--   }
-+   const IntegrationRule *ir = IntRule ? IntRule : &GetRule(el, Tr);
- 
-    for (int i = 0; i < ir->GetNPoints(); i++)
-    {
-@@ -596,11 +546,7 @@ void VectorBoundaryFluxLFIntegrator::AssembleRHSElementVect(
-    nor.SetSize (dim);
-    elvect.SetSize (dim*dof);
- 
--   const IntegrationRule *ir = IntRule;
--   if (ir == NULL)
--   {
--      ir = &IntRules.Get(el.GetGeomType(), el.GetOrder() + 1);
--   }
-+   const IntegrationRule *ir = IntRule ? IntRule : &GetRule(el, Tr, 1, 1);
- 
-    elvect = 0.0;
-    for (int i = 0; i < ir->GetNPoints(); i++)
-@@ -618,7 +564,6 @@ void VectorBoundaryFluxLFIntegrator::AssembleRHSElementVect(
-    }
- }
- 
--
- void VectorFEBoundaryFluxLFIntegrator::AssembleRHSElementVect(
-    const FiniteElement &el, ElementTransformation &Tr, Vector &elvect)
- {
-@@ -628,12 +573,7 @@ void VectorFEBoundaryFluxLFIntegrator::AssembleRHSElementVect(
-    elvect.SetSize(dof);
-    elvect = 0.0;
- 
--   const IntegrationRule *ir = IntRule;
--   if (ir == NULL)
--   {
--      int intorder = oa * el.GetOrder() + ob;  // <----------
--      ir = &IntRules.Get(el.GetGeomType(), intorder);
--   }
-+   const IntegrationRule *ir = IntRule ? IntRule : &GetRule(el, Tr, oa, ob);
- 
-    for (int i = 0; i < ir->GetNPoints(); i++)
-    {
-@@ -667,12 +607,7 @@ void VectorFEBoundaryTangentLFIntegrator::AssembleRHSElementVect(
-    elvect.SetSize(dof);
-    elvect = 0.0;
- 
--   const IntegrationRule *ir = IntRule;
--   if (ir == NULL)
--   {
--      int intorder = oa * el.GetOrder() + ob;  // <----------
--      ir = &IntRules.Get(el.GetGeomType(), intorder);
--   }
-+   const IntegrationRule *ir = IntRule ? IntRule : &GetRule(el, Tr, oa, ob);
- 
-    for (int i = 0; i < ir->GetNPoints(); i++)
-    {
-@@ -707,6 +642,20 @@ void VectorFEBoundaryTangentLFIntegrator::AssembleRHSElementVect(
-    }
- }
- 
-+const IntegrationRule &BoundaryFlowIntegrator::GetRule(
-+   const FiniteElement &el,
-+   FaceElementTransformations &Tr,
-+   int oa, int ob) const
-+{
-+   // Assuming order(u) == order(mesh)
-+   int order = 2 * el.GetOrder() + Tr.Elem1->OrderW();
-+   if (el.Space() == FunctionSpace::Pk)
-+   {
-+      order++;
-+   }
-+   return IntRules.Get(Tr.GetGeometryType(), order);
-+}
-+
- void BoundaryFlowIntegrator::AssembleRHSElementVect(
-    const FiniteElement &el, ElementTransformation &Tr, Vector &elvect)
- {
-@@ -719,24 +668,14 @@ void BoundaryFlowIntegrator::AssembleRHSElementVect(
- void BoundaryFlowIntegrator::AssembleRHSElementVect(
-    const FiniteElement &el, FaceElementTransformations &Tr, Vector &elvect)
- {
--   int dim, ndof, order;
-+   int dim, ndof;
-    double un, w, vu_data[3], nor_data[3];
- 
-    dim  = el.GetDim();
-    ndof = el.GetDof();
-    Vector vu(vu_data, dim), nor(nor_data, dim);
- 
--   const IntegrationRule *ir = IntRule;
--   if (ir == NULL)
--   {
--      // Assuming order(u)==order(mesh)
--      order = Tr.Elem1->OrderW() + 2*el.GetOrder();
--      if (el.Space() == FunctionSpace::Pk)
--      {
--         order++;
--      }
--      ir = &IntRules.Get(Tr.GetGeometryType(), order);
--   }
-+   const IntegrationRule *ir = IntRule ? IntRule : &GetRule(el, Tr);
- 
-    shape.SetSize(ndof);
-    elvect.SetSize(ndof);
-@@ -805,13 +744,7 @@ void DGDirichletLFIntegrator::AssembleRHSElementVect(
-    elvect.SetSize(ndof);
-    elvect = 0.0;
- 
--   const IntegrationRule *ir = IntRule;
--   if (ir == NULL)
--   {
--      // a simple choice for the integration order; is this OK?
--      int order = 2*el.GetOrder();
--      ir = &IntRules.Get(Tr.GetGeometryType(), order);
--   }
-+   const IntegrationRule *ir = IntRule ? IntRule : &GetRule(el, Tr);
- 
-    for (int p = 0; p < ir->GetNPoints(); p++)
-    {
-@@ -902,12 +835,7 @@ void DGElasticityDirichletLFIntegrator::AssembleRHSElementVect(
-    dshape_du.SetSize(ndofs);
-    u_dir.SetSize(dim);
- 
--   const IntegrationRule *ir = IntRule;
--   if (ir == NULL)
--   {
--      const int order = 2*el.GetOrder(); // <-----
--      ir = &IntRules.Get(Tr.GetGeometryType(), order);
--   }
-+   const IntegrationRule *ir = IntRule ? IntRule : &GetRule(el, Tr);
- 
-    for (int pi = 0; pi < ir->GetNPoints(); ++pi)
-    {
-@@ -1000,12 +928,10 @@ void DGElasticityDirichletLFIntegrator::AssembleRHSElementVect(
-    }
- }
- 
--
--
--void WhiteGaussianNoiseDomainLFIntegrator::AssembleRHSElementVect
--(const FiniteElement &el,
-- ElementTransformation &Tr,
-- Vector &elvect)
-+void WhiteGaussianNoiseDomainLFIntegrator::AssembleRHSElementVect(
-+   const FiniteElement &el,
-+   ElementTransformation &Tr,
-+   Vector &elvect)
- {
-    int n = el.GetDof();
-    elvect.SetSize(n);
-@@ -1040,13 +966,18 @@ void WhiteGaussianNoiseDomainLFIntegrator::AssembleRHSElementVect
-    }
- }
- 
-+const IntegrationRule &VectorQuadratureLFIntegrator::GetRule(
-+   const FiniteElement &el,
-+   ElementTransformation &Tr,
-+   int oa, int ob) const
-+{
-+   return vqfc.GetQuadFunction().GetSpace()->GetIntRule(Tr.ElementNo);
-+}
- 
- void VectorQuadratureLFIntegrator::AssembleRHSElementVect(
-    const FiniteElement &fe, ElementTransformation &Tr, Vector &elvect)
- {
--   const IntegrationRule *ir =
--      &vqfc.GetQuadFunction().GetSpace()->GetIntRule(Tr.ElementNo);
--
-+   const IntegrationRule *ir = &GetRule(fe, Tr);
-    const int nqp = ir->GetNPoints();
-    const int vdim = vqfc.GetVDim();
-    const int ndofs = fe.GetDof();
-@@ -1071,14 +1002,19 @@ void VectorQuadratureLFIntegrator::AssembleRHSElementVect(
-    }
- }
- 
-+const IntegrationRule &QuadratureLFIntegrator::GetRule(
-+   const FiniteElement &el,
-+   ElementTransformation &Tr,
-+   int oa, int ob) const
-+{
-+   return qfc.GetQuadFunction().GetSpace()->GetIntRule(Tr.ElementNo);
-+}
- 
- void QuadratureLFIntegrator::AssembleRHSElementVect(const FiniteElement &fe,
-                                                     ElementTransformation &Tr,
-                                                     Vector &elvect)
- {
--   const IntegrationRule *ir =
--      &qfc.GetQuadFunction().GetSpace()->GetIntRule(Tr.ElementNo);
--
-+   const IntegrationRule *ir = &GetRule(fe, Tr);
-    const int nqp = ir->GetNPoints();
-    const int ndofs = fe.GetDof();
-    Vector shape(ndofs);
-diff --git a/fem/lininteg.hpp b/fem/lininteg.hpp
-index 02fde00c9..5bcdfdb83 100644
---- a/fem/lininteg.hpp
-+++ b/fem/lininteg.hpp
-@@ -29,10 +29,20 @@ protected:
-    LinearFormIntegrator(const IntegrationRule *ir = NULL) { IntRule = ir; }
- 
- public:
--
-    /// Method probing for assembly on device
-    virtual bool SupportsDevice() const { return false; }
- 
-+   virtual void SetIntRule(const IntegrationRule *ir) { IntRule = ir; }
-+
-+   const IntegrationRule *GetIntRule() { return IntRule; }
-+
-+   virtual const IntegrationRule &GetRule(const FiniteElement &el,
-+                                          ElementTransformation &Tr,
-+                                          int oa = 2, int ob = 0) const;
-+   virtual const IntegrationRule &GetRule(const FiniteElement &el,
-+                                          FaceElementTransformations &Tr,
-+                                          int oa = 2, int ob = 0) const;
-+
-    /// Method defining assembly on device
-    virtual void AssembleDevice(const FiniteElementSpace &fes,
-                                const Array<int> &markers,
-@@ -51,13 +61,9 @@ public:
-                                        FaceElementTransformations &Tr,
-                                        Vector &elvect);
- 
--   virtual void SetIntRule(const IntegrationRule *ir) { IntRule = ir; }
--   const IntegrationRule* GetIntRule() { return IntRule; }
--
--   virtual ~LinearFormIntegrator() { }
-+   virtual ~LinearFormIntegrator() {}
- };
- 
--
- /// Abstract class for integrators that support delta coefficients
- class DeltaLFIntegrator : public LinearFormIntegrator
- {
-@@ -70,7 +76,7 @@ protected:
-    DeltaLFIntegrator(Coefficient &q, const IntegrationRule *ir = NULL)
-       : LinearFormIntegrator(ir),
-         delta(dynamic_cast<DeltaCoefficient*>(&q)),
--        vec_delta(NULL) { }
-+        vec_delta(NULL) {}
- 
-    /** @brief This constructor should be used by derived classes that use a
-        VectorDeltaCoefficient. */
-@@ -78,7 +84,7 @@ protected:
-                      const IntegrationRule *ir = NULL)
-       : LinearFormIntegrator(ir),
-         delta(NULL),
--        vec_delta(dynamic_cast<VectorDeltaCoefficient*>(&vq)) { }
-+        vec_delta(dynamic_cast<VectorDeltaCoefficient*>(&vq)) {}
- 
- public:
-    /// Returns true if the derived class instance uses a delta coefficient.
-@@ -103,23 +109,23 @@ public:
-                                          Vector &elvect) = 0;
- };
- 
--
- /// Class for domain integration L(v) := (f, v)
- class DomainLFIntegrator : public DeltaLFIntegrator
- {
-    Vector shape;
-    Coefficient &Q;
-    int oa, ob;
-+
- public:
-    /// Constructs a domain integrator with a given Coefficient
-+   /// the old default was a = 1, b = 1
-+   /// for simple elliptic problems a = 2, b = -2 is OK
-    DomainLFIntegrator(Coefficient &QF, int a = 2, int b = 0)
--   // the old default was a = 1, b = 1
--   // for simple elliptic problems a = 2, b = -2 is OK
--      : DeltaLFIntegrator(QF), Q(QF), oa(a), ob(b) { }
-+      : DeltaLFIntegrator(QF), Q(QF), oa(a), ob(b) {}
- 
-    /// Constructs a domain integrator with a given Coefficient
-    DomainLFIntegrator(Coefficient &QF, const IntegrationRule *ir)
--      : DeltaLFIntegrator(QF, ir), Q(QF), oa(1), ob(1) { }
-+      : DeltaLFIntegrator(QF, ir), Q(QF), oa(1), ob(1) {}
- 
-    virtual bool SupportsDevice() const { return true; }
- 
-@@ -152,7 +158,7 @@ private:
- public:
-    /// Constructs the domain integrator (Q, grad v)
-    DomainLFGradIntegrator(VectorCoefficient &QF)
--      : DeltaLFIntegrator(QF), Q(QF) { }
-+      : DeltaLFIntegrator(QF), Q(QF) {}
- 
-    virtual bool SupportsDevice() const { return true; }
- 
-@@ -174,18 +180,18 @@ public:
-    using LinearFormIntegrator::AssembleRHSElementVect;
- };
- 
--
- /// Class for boundary integration L(v) := (g, v)
- class BoundaryLFIntegrator : public LinearFormIntegrator
- {
-    Vector shape;
-    Coefficient &Q;
-    int oa, ob;
-+
- public:
-    /** @brief Constructs a boundary integrator with a given Coefficient @a QG.
-        Integration order will be @a a * basis_order + @a b. */
-    BoundaryLFIntegrator(Coefficient &QG, int a = 1, int b = 1)
--      : Q(QG), oa(a), ob(b) { }
-+      : Q(QG), oa(a), ob(b) {}
- 
-    virtual bool SupportsDevice() const { return true; }
- 
-@@ -212,10 +218,11 @@ class BoundaryNormalLFIntegrator : public LinearFormIntegrator
-    Vector shape;
-    VectorCoefficient &Q;
-    int oa, ob;
-+
- public:
-    /// Constructs a boundary integrator with a given Coefficient QG
-    BoundaryNormalLFIntegrator(VectorCoefficient &QG, int a = 1, int b = 1)
--      : Q(QG), oa(a), ob(b) { }
-+      : Q(QG), oa(a), ob(b) {}
- 
-    virtual bool SupportsDevice() const { return true; }
- 
-@@ -237,10 +244,11 @@ class BoundaryTangentialLFIntegrator : public LinearFormIntegrator
-    Vector shape;
-    VectorCoefficient &Q;
-    int oa, ob;
-+
- public:
-    /// Constructs a boundary integrator with a given Coefficient QG
-    BoundaryTangentialLFIntegrator(VectorCoefficient &QG, int a = 1, int b = 1)
--      : Q(QG), oa(a), ob(b) { }
-+      : Q(QG), oa(a), ob(b) {}
- 
-    virtual void AssembleRHSElementVect(const FiniteElement &el,
-                                        ElementTransformation &Tr,
-@@ -260,7 +268,7 @@ private:
- public:
-    /// Constructs a domain integrator with a given VectorCoefficient
-    VectorDomainLFIntegrator(VectorCoefficient &QF)
--      : DeltaLFIntegrator(QF), Q(QF) { }
-+      : DeltaLFIntegrator(QF), Q(QF) {}
- 
-    virtual bool SupportsDevice() const { return true; }
- 
-@@ -294,7 +302,7 @@ private:
- public:
-    /// Constructs the domain integrator (Q, grad v)
-    VectorDomainLFGradIntegrator(VectorCoefficient &QF)
--      : DeltaLFIntegrator(QF), Q(QF) { }
-+      : DeltaLFIntegrator(QF), Q(QF) {}
- 
-    virtual bool SupportsDevice() const override { return true; }
- 
-@@ -326,7 +334,7 @@ private:
- 
- public:
-    /// Constructs a boundary integrator with a given VectorCoefficient QG
--   VectorBoundaryLFIntegrator(VectorCoefficient &QG) : Q(QG) { }
-+   VectorBoundaryLFIntegrator(VectorCoefficient &QG) : Q(QG) {}
- 
-    /** Given a particular boundary Finite Element and a transformation (Tr)
-        computes the element boundary vector, elvect. */
-@@ -352,7 +360,9 @@ private:
- 
- public:
-    VectorFEDomainLFIntegrator(VectorCoefficient &F)
--      : DeltaLFIntegrator(F), QF(F) { }
-+      : DeltaLFIntegrator(F), QF(F) {}
-+
-+   virtual bool SupportsDevice() const { return true; }
- 
-    virtual void AssembleRHSElementVect(const FiniteElement &el,
-                                        ElementTransformation &Tr,
-@@ -362,8 +372,6 @@ public:
-                                          ElementTransformation &Trans,
-                                          Vector &elvect);
- 
--   virtual bool SupportsDevice() const { return true; }
--
-    virtual void AssembleDevice(const FiniteElementSpace &fes,
-                                const Array<int> &markers,
-                                Vector &b);
-@@ -382,7 +390,7 @@ private:
- public:
-    /// Constructs the domain integrator (Q, curl v)
-    VectorFEDomainLFCurlIntegrator(VectorCoefficient &F)
--      : DeltaLFIntegrator(F), QF(&F) { }
-+      : DeltaLFIntegrator(F), QF(&F) {}
- 
-    virtual void AssembleRHSElementVect(const FiniteElement &el,
-                                        ElementTransformation &Tr,
-@@ -401,10 +409,11 @@ class VectorFEDomainLFDivIntegrator : public DeltaLFIntegrator
- private:
-    Vector divshape;
-    Coefficient &Q;
-+
- public:
-    /// Constructs the domain integrator (Q, div v)
-    VectorFEDomainLFDivIntegrator(Coefficient &QF)
--      : DeltaLFIntegrator(QF), Q(QF) { }
-+      : DeltaLFIntegrator(QF), Q(QF) {}
- 
-    /** Given a particular Finite Element and a transformation (Tr)
-        computes the element right hand side element vector, elvect. */
-@@ -432,7 +441,7 @@ private:
- public:
-    VectorBoundaryFluxLFIntegrator(Coefficient &f, double s = 1.0,
-                                   const IntegrationRule *ir = NULL)
--      : LinearFormIntegrator(ir), Sign(s), F(&f) { }
-+      : LinearFormIntegrator(ir), Sign(s), F(&f) {}
- 
-    virtual void AssembleRHSElementVect(const FiniteElement &el,
-                                        ElementTransformation &Tr,
-@@ -453,21 +462,21 @@ private:
- 
- public:
-    VectorFEBoundaryFluxLFIntegrator(int a = 1, int b = -1)
--      : F(NULL), oa(a), ob(b) { }
-+      : F(NULL), oa(a), ob(b) {}
-    VectorFEBoundaryFluxLFIntegrator(Coefficient &f, int a = 2, int b = 0)
--      : F(&f), oa(a), ob(b) { }
-+      : F(&f), oa(a), ob(b) {}
-+
-+   virtual bool SupportsDevice() const { return true; }
- 
-    virtual void AssembleRHSElementVect(const FiniteElement &el,
-                                        ElementTransformation &Tr,
-                                        Vector &elvect);
- 
--   using LinearFormIntegrator::AssembleRHSElementVect;
--
--   virtual bool SupportsDevice() const { return true; }
--
-    virtual void AssembleDevice(const FiniteElementSpace &fes,
-                                const Array<int> &markers,
-                                Vector &b);
-+
-+   using LinearFormIntegrator::AssembleRHSElementVect;
- };
- 
- /// Class for boundary integration \f$ L(v) = (n \times f, v) \f$
-@@ -480,7 +489,7 @@ private:
- public:
-    VectorFEBoundaryTangentLFIntegrator(VectorCoefficient &QG,
-                                        int a = 2, int b = 0)
--      : f(QG), oa(a), ob(b) { }
-+      : f(QG), oa(a), ob(b) {}
- 
-    virtual void AssembleRHSElementVect(const FiniteElement &el,
-                                        ElementTransformation &Tr,
-@@ -489,7 +498,6 @@ public:
-    using LinearFormIntegrator::AssembleRHSElementVect;
- };
- 
--
- /** Class for boundary integration of the linear form:
-     (alpha/2) < (u.n) f, w > - beta < |u.n| f, w >,
-     where f and u are given scalar and vector coefficients, respectively,
-@@ -512,6 +520,11 @@ public:
-                           double a, double b)
-    { f = &f_; u = &u_; alpha = a; beta = b; }
- 
-+   using LinearFormIntegrator::GetRule;
-+   virtual const IntegrationRule &GetRule(const FiniteElement &el,
-+                                          FaceElementTransformations &Tr,
-+                                          int oa = 2, int ob = 0) const;
-+
-    virtual void AssembleRHSElementVect(const FiniteElement &el,
-                                        ElementTransformation &Tr,
-                                        Vector &elvect);
-@@ -522,7 +535,6 @@ public:
-    using LinearFormIntegrator::AssembleRHSElementVect;
- };
- 
--
- /** Boundary linear integrator for imposing non-zero Dirichlet boundary
-     conditions, to be used in conjunction with DGDiffusionIntegrator.
-     Specifically, given the Dirichlet data u_D, the linear form assembles the
-@@ -546,13 +558,13 @@ protected:
- 
- public:
-    DGDirichletLFIntegrator(Coefficient &u, const double s, const double k)
--      : uD(&u), Q(NULL), MQ(NULL), sigma(s), kappa(k) { }
-+      : uD(&u), Q(NULL), MQ(NULL), sigma(s), kappa(k) {}
-    DGDirichletLFIntegrator(Coefficient &u, Coefficient &q,
-                            const double s, const double k)
--      : uD(&u), Q(&q), MQ(NULL), sigma(s), kappa(k) { }
-+      : uD(&u), Q(&q), MQ(NULL), sigma(s), kappa(k) {}
-    DGDirichletLFIntegrator(Coefficient &u, MatrixCoefficient &q,
-                            const double s, const double k)
--      : uD(&u), Q(NULL), MQ(&q), sigma(s), kappa(k) { }
-+      : uD(&u), Q(NULL), MQ(&q), sigma(s), kappa(k) {}
- 
-    virtual void AssembleRHSElementVect(const FiniteElement &el,
-                                        ElementTransformation &Tr,
-@@ -564,7 +576,6 @@ public:
-    using LinearFormIntegrator::AssembleRHSElementVect;
- };
- 
--
- /** Boundary linear form integrator for imposing non-zero Dirichlet boundary
-     conditions, in a DG elasticity formulation. Specifically, the linear form is
-     given by
-@@ -597,7 +608,7 @@ public:
-    DGElasticityDirichletLFIntegrator(VectorCoefficient &uD_,
-                                      Coefficient &lambda_, Coefficient &mu_,
-                                      double alpha_, double kappa_)
--      : uD(uD_), lambda(&lambda_), mu(&mu_), alpha(alpha_), kappa(kappa_) { }
-+      : uD(uD_), lambda(&lambda_), mu(&mu_), alpha(alpha_), kappa(kappa_) {}
- 
-    virtual void AssembleRHSElementVect(const FiniteElement &el,
-                                        ElementTransformation &Tr,
-@@ -609,7 +620,6 @@ public:
-    using LinearFormIntegrator::AssembleRHSElementVect;
- };
- 
--
- /** Class for spatial white Gaussian noise integration.
- 
-     The target problem is the linear SPDE a(u,v) = F(v) with F(v) := <Ẇ,v>,
-@@ -637,8 +647,8 @@ class WhiteGaussianNoiseDomainLFIntegrator : public LinearFormIntegrator
-    std::normal_distribution<double> dist;
- 
-    bool save_factors = false;
--public:
- 
-+public:
- #ifdef MFEM_USE_MPI
-    /** @brief Sets the @a seed_ of the random number generator. A fixed seed
-        allows for a reproducible sequence of white noise vectors. */
-@@ -669,13 +679,13 @@ public:
-       if (seed_ > 0) { SetSeed(seed_); }
-    }
- #endif
-+
-    /// @brief Sets/resets the @a seed of the random number generator.
-    void SetSeed(int seed)
-    {
-       generator.seed(seed);
-    }
- 
--   using LinearFormIntegrator::AssembleRHSElementVect;
-    virtual void AssembleRHSElementVect(const FiniteElement &el,
-                                        ElementTransformation &Tr,
-                                        Vector &elvect);
-@@ -715,8 +725,9 @@ public:
-       }
-       L.DeleteAll();
-    }
--};
- 
-+   using LinearFormIntegrator::AssembleRHSElementVect;
-+};
- 
- /** Class for domain integration of L(v) := (f, v), where
-     f=(f1,...,fn) and v=(v1,...,vn). that makes use of
-@@ -738,18 +749,23 @@ public:
-       }
-    }
- 
--   using LinearFormIntegrator::AssembleRHSElementVect;
--   virtual void AssembleRHSElementVect(const FiniteElement &fe,
--                                       ElementTransformation &Tr,
--                                       Vector &elvect);
--
-    virtual void SetIntRule(const IntegrationRule *ir)
-    {
-       MFEM_WARNING("Integration rule not used in this class. "
-                    "The QuadratureFunction integration rules are used instead");
-    }
--};
- 
-+   using LinearFormIntegrator::GetRule;
-+   virtual const IntegrationRule &GetRule(const FiniteElement &el,
-+                                          ElementTransformation &Tr,
-+                                          int oa = 2, int ob = 0) const;
-+
-+   virtual void AssembleRHSElementVect(const FiniteElement &fe,
-+                                       ElementTransformation &Tr,
-+                                       Vector &elvect);
-+
-+   using LinearFormIntegrator::AssembleRHSElementVect;
-+};
- 
- /** Class for domain integration L(v) := (f, v) that makes use
-     of QuadratureFunctionCoefficient. */
-@@ -770,19 +786,24 @@ public:
-       }
-    }
- 
--   using LinearFormIntegrator::AssembleRHSElementVect;
--   virtual void AssembleRHSElementVect(const FiniteElement &fe,
--                                       ElementTransformation &Tr,
--                                       Vector &elvect);
--
-    virtual void SetIntRule(const IntegrationRule *ir)
-    {
-       MFEM_WARNING("Integration rule not used in this class. "
-                    "The QuadratureFunction integration rules are used instead");
-    }
-+
-+   using LinearFormIntegrator::GetRule;
-+   virtual const IntegrationRule &GetRule(const FiniteElement &el,
-+                                          ElementTransformation &Tr,
-+                                          int oa = 2, int ob = 0) const;
-+
-+   virtual void AssembleRHSElementVect(const FiniteElement &fe,
-+                                       ElementTransformation &Tr,
-+                                       Vector &elvect);
-+
-+   using LinearFormIntegrator::AssembleRHSElementVect;
- };
- 
- }
- 
--
- #endif
-diff --git a/fem/nonlininteg.cpp b/fem/nonlininteg.cpp
-index 5ee1febea..a704ee207 100644
---- a/fem/nonlininteg.cpp
-+++ b/fem/nonlininteg.cpp
-@@ -15,26 +15,44 @@
- namespace mfem
- {
- 
-+const IntegrationRule &NonlinearFormIntegrator::GetRule(
-+   const FiniteElement&, const FiniteElement&,
-+   ElementTransformation&) const
-+{
-+   MFEM_ABORT("NonlinearFormIntegrator::GetRule(...)\n"
-+              "   is not implemented for this class.");
-+   return IntRules.Get(0, 0);
-+}
-+
-+const IntegrationRule &NonlinearFormIntegrator::GetRule(
-+   const FiniteElement&, const FiniteElement&,
-+   FaceElementTransformations&) const
-+{
-+   MFEM_ABORT("NonlinearFormIntegrator::GetRule(...)\n"
-+              "   is not implemented for this class.");
-+   return IntRules.Get(0, 0);
-+}
-+
- void NonlinearFormIntegrator::AssemblePA(const FiniteElementSpace&)
- {
-    MFEM_ABORT("NonlinearFormIntegrator::AssemblePA(...)\n"
-               "   is not implemented for this class.");
- }
- 
--void NonlinearFormIntegrator::AssembleGradPA(const Vector &x,
--                                             const FiniteElementSpace &fes)
-+void NonlinearFormIntegrator::AssembleGradPA(const Vector&,
-+                                             const FiniteElementSpace&)
- {
-    MFEM_ABORT("NonlinearFormIntegrator::AssembleGradPA(...)\n"
-               "   is not implemented for this class.");
- }
- 
--void NonlinearFormIntegrator::AssembleGradDiagonalPA(Vector &diag) const
-+void NonlinearFormIntegrator::AssembleGradDiagonalPA(Vector&) const
- {
-    MFEM_ABORT("NonlinearFormIntegrator::AssembleGradDiagonalPA(...)\n"
-               "   is not implemented for this class.");
- }
- 
--void NonlinearFormIntegrator::AddMultPA(const Vector &, Vector &) const
-+void NonlinearFormIntegrator::AddMultPA(const Vector&, Vector&) const
- {
-    MFEM_ABORT("NonlinearFormIntegrator::AddMultPA(...)\n"
-               "   is not implemented for this class.");
-@@ -46,119 +64,141 @@ void NonlinearFormIntegrator::AddMultGradPA(const Vector&, Vector&) const
-               "   is not implemented for this class.");
- }
- 
--double NonlinearFormIntegrator::GetLocalStateEnergyPA(const Vector &x) const
-+double NonlinearFormIntegrator::GetLocalStateEnergyPA(const Vector&) const
- {
-    MFEM_ABORT("NonlinearFormIntegrator::GetLocalStateEnergyPA(...)\n"
-               "   is not implemented for this class.");
-    return 0.0;
- }
- 
--void NonlinearFormIntegrator::AssembleMF(const FiniteElementSpace &fes)
-+void NonlinearFormIntegrator::AssembleMF(const FiniteElementSpace&)
- {
-    MFEM_ABORT("NonlinearFormIntegrator::AssembleMF(...)\n"
-               "   is not implemented for this class.");
- }
- 
--void NonlinearFormIntegrator::AddMultMF(const Vector &, Vector &) const
-+void NonlinearFormIntegrator::AddMultMF(const Vector&, Vector&) const
- {
-    MFEM_ABORT("NonlinearFormIntegrator::AddMultMF(...)\n"
-               "   is not implemented for this class.");
- }
- 
--double NonlinearFormIntegrator::GetElementEnergy(
--   const FiniteElement &el, ElementTransformation &Tr, const Vector &elfun)
-+double NonlinearFormIntegrator::GetElementEnergy(const FiniteElement&,
-+                                                 ElementTransformation&,
-+                                                 const Vector&)
- {
--   MFEM_ABORT("NonlinearFormIntegrator::GetElementEnergy"
--              " is not overloaded!");
-+   MFEM_ABORT("NonlinearFormIntegrator::GetElementEnergy(...)"
-+              "   is not implemented for this class.");
-    return 0.0;
- }
- 
--void NonlinearFormIntegrator::AssembleElementVector(
--   const FiniteElement &el, ElementTransformation &Tr,
--   const Vector &elfun, Vector &elvect)
-+void NonlinearFormIntegrator::AssembleElementVector(const FiniteElement&,
-+                                                    ElementTransformation&,
-+                                                    const Vector&,
-+                                                    Vector&)
- {
--   MFEM_ABORT("NonlinearFormIntegrator::AssembleElementVector"
--              " is not overloaded!");
-+   MFEM_ABORT("NonlinearFormIntegrator::AssembleElementVector(...)\n"
-+              "   is not implemented for this class.");
- }
- 
--void NonlinearFormIntegrator::AssembleFaceVector(
--   const FiniteElement &el1, const FiniteElement &el2,
--   FaceElementTransformations &Tr, const Vector &elfun, Vector &elvect)
-+void NonlinearFormIntegrator::AssembleFaceVector(const FiniteElement&,
-+                                                 const FiniteElement&,
-+                                                 FaceElementTransformations&,
-+                                                 const Vector&,
-+                                                 Vector&)
- {
--   MFEM_ABORT("NonlinearFormIntegrator::AssembleFaceVector"
--              " is not overloaded!");
-+   MFEM_ABORT("NonlinearFormIntegrator::AssembleFaceVector(...)\n"
-+              "   is not implemented for this class.");
- }
- 
--void NonlinearFormIntegrator::AssembleElementGrad(
--   const FiniteElement &el, ElementTransformation &Tr, const Vector &elfun,
--   DenseMatrix &elmat)
-+void NonlinearFormIntegrator::AssembleElementGrad(const FiniteElement&,
-+                                                  ElementTransformation&,
-+                                                  const Vector&,
-+                                                  DenseMatrix&)
- {
--   MFEM_ABORT("NonlinearFormIntegrator::AssembleElementGrad"
--              " is not overloaded!");
-+   MFEM_ABORT("NonlinearFormIntegrator::AssembleElementGrad(...)\n"
-+              "   is not implemented for this class.");
- }
- 
--void NonlinearFormIntegrator::AssembleFaceGrad(
--   const FiniteElement &el1, const FiniteElement &el2,
--   FaceElementTransformations &Tr, const Vector &elfun,
--   DenseMatrix &elmat)
-+void NonlinearFormIntegrator::AssembleFaceGrad(const FiniteElement&,
-+                                               const FiniteElement&,
-+                                               FaceElementTransformations&,
-+                                               const Vector&,
-+                                               DenseMatrix&)
- {
--   MFEM_ABORT("NonlinearFormIntegrator::AssembleFaceGrad"
--              " is not overloaded!");
-+   MFEM_ABORT("NonlinearFormIntegrator::AssembleFaceGrad(...)\n"
-+              "   is not implemented for this class.");
- }
- 
-+const IntegrationRule &BlockNonlinearFormIntegrator::GetRule(
-+   const FiniteElement&, const FiniteElement&test_fe,
-+   ElementTransformation&) const
-+{
-+   MFEM_ABORT("BlockNonlinearFormIntegrator::GetRule(...)\n"
-+              "   is not implemented for this class.");
-+   return IntRules.Get(0, 0);
-+}
- 
--void BlockNonlinearFormIntegrator::AssembleElementVector(
--   const Array<const FiniteElement *> &el,
--   ElementTransformation &Tr,
--   const Array<const Vector *> &elfun,
--   const Array<Vector *> &elvec)
-+const IntegrationRule &BlockNonlinearFormIntegrator::GetRule(
-+   const FiniteElement&, const FiniteElement&,
-+   FaceElementTransformations&) const
- {
--   MFEM_ABORT("BlockNonlinearFormIntegrator::AssembleElementVector"
--              " is not overloaded!");
-+   MFEM_ABORT("BlockNonlinearFormIntegrator::GetRule(...)\n"
-+              "   is not implemented for this class.");
-+   return IntRules.Get(0, 0);
- }
- 
--void BlockNonlinearFormIntegrator::AssembleFaceVector(
--   const Array<const FiniteElement *> &el1,
--   const Array<const FiniteElement *> &el2,
--   FaceElementTransformations &Tr,
--   const Array<const Vector *> &elfun,
--   const Array<Vector *> &elvect)
-+double BlockNonlinearFormIntegrator::GetElementEnergy(
-+   const Array<const FiniteElement *>&,
-+   ElementTransformation&,
-+   const Array<const Vector *>&)
- {
--   MFEM_ABORT("BlockNonlinearFormIntegrator::AssembleFaceVector"
--              " is not overloaded!");
-+   MFEM_ABORT("BlockNonlinearFormIntegrator::GetElementEnergy(...)\n"
-+              "   is not implemented for this class.");
-+   return 0.0;
- }
- 
--void BlockNonlinearFormIntegrator::AssembleElementGrad(
--   const Array<const FiniteElement*> &el,
--   ElementTransformation &Tr,
--   const Array<const Vector *> &elfun,
--   const Array2D<DenseMatrix *> &elmats)
-+void BlockNonlinearFormIntegrator::AssembleElementVector(
-+   const Array<const FiniteElement *>&,
-+   ElementTransformation&,
-+   const Array<const Vector *>&,
-+   const Array<Vector *>&)
- {
--   MFEM_ABORT("BlockNonlinearFormIntegrator::AssembleElementGrad"
--              " is not overloaded!");
-+   MFEM_ABORT("BlockNonlinearFormIntegrator::AssembleElementVector(...)\n"
-+              "   is not implemented for this class.");
- }
- 
--void BlockNonlinearFormIntegrator::AssembleFaceGrad(
--   const Array<const FiniteElement *>&el1,
--   const Array<const FiniteElement *>&el2,
--   FaceElementTransformations &Tr,
--   const Array<const Vector *> &elfun,
--   const Array2D<DenseMatrix *> &elmats)
-+void BlockNonlinearFormIntegrator::AssembleFaceVector(
-+   const Array<const FiniteElement *>&,
-+   const Array<const FiniteElement *>&,
-+   FaceElementTransformations&,
-+   const Array<const Vector *>&,
-+   const Array<Vector *>&)
- {
--   MFEM_ABORT("BlockNonlinearFormIntegrator::AssembleFaceGrad"
--              " is not overloaded!");
-+   MFEM_ABORT("BlockNonlinearFormIntegrator::AssembleFaceVector(...)\n"
-+              "   is not implemented for this class.");
- }
- 
--double BlockNonlinearFormIntegrator::GetElementEnergy(
--   const Array<const FiniteElement *>&el,
--   ElementTransformation &Tr,
--   const Array<const Vector *>&elfun)
-+void BlockNonlinearFormIntegrator::AssembleElementGrad(
-+   const Array<const FiniteElement*>&,
-+   ElementTransformation&,
-+   const Array<const Vector *>&,
-+   const Array2D<DenseMatrix *>&)
- {
--   MFEM_ABORT("BlockNonlinearFormIntegrator::GetElementEnergy"
--              " is not overloaded!");
--   return 0.0;
-+   MFEM_ABORT("BlockNonlinearFormIntegrator::AssembleElementGrad(...)\n"
-+              "   is not implemented for this class.");
- }
- 
-+void BlockNonlinearFormIntegrator::AssembleFaceGrad(
-+   const Array<const FiniteElement *>&,
-+   const Array<const FiniteElement *>&,
-+   FaceElementTransformations&,
-+   const Array<const Vector *>&,
-+   const Array2D<DenseMatrix *>&)
-+{
-+   MFEM_ABORT("BlockNonlinearFormIntegrator::AssembleFaceGrad(...)\n"
-+              "   is not implemented for this class.");
-+}
- 
- double InverseHarmonicModel::EvalW(const DenseMatrix &J) const
- {
-@@ -260,7 +300,6 @@ void InverseHarmonicModel::AssembleH(
-       }
- }
- 
--
- inline void NeoHookeanModel::EvalCoeffs() const
- {
-    mu = c_mu->Eval(*Ttr, Ttr->GetIntPoint());
-@@ -376,6 +415,13 @@ void NeoHookeanModel::AssembleH(const DenseMatrix &J, const DenseMatrix &DS,
-             }
- }
- 
-+const IntegrationRule &HyperelasticNLFIntegrator::GetRule(
-+   const FiniteElement &trial_fe, const FiniteElement &test_fe,
-+   ElementTransformation &Trans) const
-+{
-+   int order = trial_fe.GetOrder() + test_fe.GetOrder() + 3;
-+   return IntRules.Get(trial_fe.GetGeomType(), order);
-+}
- 
- double HyperelasticNLFIntegrator::GetElementEnergy(const FiniteElement &el,
-                                                    ElementTransformation &Ttr,
-@@ -390,11 +436,7 @@ double HyperelasticNLFIntegrator::GetElementEnergy(const FiniteElement &el,
-    Jpt.SetSize(dim);
-    PMatI.UseExternalData(elfun.GetData(), dof, dim);
- 
--   const IntegrationRule *ir = IntRule;
--   if (!ir)
--   {
--      ir = &(IntRules.Get(el.GetGeomType(), 2*el.GetOrder() + 3)); // <---
--   }
-+   const IntegrationRule *ir = IntRule ? IntRule : &GetRule(el, Ttr);
- 
-    energy = 0.0;
-    model->SetTransformation(Ttr);
-@@ -429,11 +471,7 @@ void HyperelasticNLFIntegrator::AssembleElementVector(
-    elvect.SetSize(dof*dim);
-    PMatO.UseExternalData(elvect.GetData(), dof, dim);
- 
--   const IntegrationRule *ir = IntRule;
--   if (!ir)
--   {
--      ir = &(IntRules.Get(el.GetGeomType(), 2*el.GetOrder() + 3)); // <---
--   }
-+   const IntegrationRule *ir = IntRule ? IntRule : &GetRule(el, Ttr);
- 
-    elvect = 0.0;
-    model->SetTransformation(Ttr);
-@@ -468,11 +506,7 @@ void HyperelasticNLFIntegrator::AssembleElementGrad(const FiniteElement &el,
-    PMatI.UseExternalData(elfun.GetData(), dof, dim);
-    elmat.SetSize(dof*dim);
- 
--   const IntegrationRule *ir = IntRule;
--   if (!ir)
--   {
--      ir = &(IntRules.Get(el.GetGeomType(), 2*el.GetOrder() + 3)); // <---
--   }
-+   const IntegrationRule *ir = IntRule ? IntRule : &GetRule(el, Ttr);
- 
-    elmat = 0.0;
-    model->SetTransformation(Ttr);
-@@ -490,6 +524,13 @@ void HyperelasticNLFIntegrator::AssembleElementGrad(const FiniteElement &el,
-    }
- }
- 
-+const IntegrationRule &IncompressibleNeoHookeanIntegrator::GetRule(
-+   const FiniteElement &trial_fe, const FiniteElement &test_fe,
-+   ElementTransformation &Trans) const
-+{
-+   int order = trial_fe.GetOrder() + test_fe.GetOrder() + 3;
-+   return IntRules.Get(trial_fe.GetGeomType(), order);
-+}
- 
- double IncompressibleNeoHookeanIntegrator::GetElementEnergy(
-    const Array<const FiniteElement *>&el,
-@@ -511,15 +552,14 @@ double IncompressibleNeoHookeanIntegrator::GetElementEnergy(
-    J.SetSize(dim);
-    PMatI_u.UseExternalData(elfun[0]->GetData(), dof_u, dim);
- 
--   int intorder = 2*el[0]->GetOrder() + 3; // <---
--   const IntegrationRule &ir = IntRules.Get(el[0]->GetGeomType(), intorder);
-+   const IntegrationRule *ir = IntRule ? IntRule : &GetRule(*el[0], Tr);
- 
-    double energy = 0.0;
-    double mu = 0.0;
- 
--   for (int i = 0; i < ir.GetNPoints(); ++i)
-+   for (int i = 0; i < ir->GetNPoints(); ++i)
-    {
--      const IntegrationPoint &ip = ir.IntPoint(i);
-+      const IntegrationPoint &ip = ir->IntPoint(i);
-       Tr.SetIntPoint(&ip);
-       CalcInverse(Tr.Jacobian(), J0i);
- 
-@@ -572,15 +612,14 @@ void IncompressibleNeoHookeanIntegrator::AssembleElementVector(
-    Sh_p.SetSize(dof_p);
-    elvec[1]->SetSize(dof_p);
- 
--   int intorder = 2*el[0]->GetOrder() + 3; // <---
--   const IntegrationRule &ir = IntRules.Get(el[0]->GetGeomType(), intorder);
-+   const IntegrationRule *ir = IntRule ? IntRule : &GetRule(*el[0], Tr);
- 
-    *elvec[0] = 0.0;
-    *elvec[1] = 0.0;
- 
--   for (int i = 0; i < ir.GetNPoints(); ++i)
-+   for (int i = 0; i < ir->GetNPoints(); ++i)
-    {
--      const IntegrationPoint &ip = ir.IntPoint(i);
-+      const IntegrationPoint &ip = ir->IntPoint(i);
-       Tr.SetIntPoint(&ip);
-       CalcInverse(Tr.Jacobian(), J0i);
- 
-@@ -605,7 +644,6 @@ void IncompressibleNeoHookeanIntegrator::AssembleElementVector(
- 
-       elvec[1]->Add(ip.weight * Tr.Weight() * (dJ - 1.0), Sh_p);
-    }
--
- }
- 
- void IncompressibleNeoHookeanIntegrator::AssembleElementGrad(
-@@ -639,12 +677,11 @@ void IncompressibleNeoHookeanIntegrator::AssembleElementGrad(
-    PMatI_u.UseExternalData(elfun[0]->GetData(), dof_u, dim);
-    Sh_p.SetSize(dof_p);
- 
--   int intorder = 2*el[0]->GetOrder() + 3; // <---
--   const IntegrationRule &ir = IntRules.Get(el[0]->GetGeomType(), intorder);
-+   const IntegrationRule *ir = IntRule ? IntRule : &GetRule(*el[0], Tr);
- 
--   for (int i = 0; i < ir.GetNPoints(); ++i)
-+   for (int i = 0; i < ir->GetNPoints(); ++i)
-    {
--      const IntegrationPoint &ip = ir.IntPoint(i);
-+      const IntegrationPoint &ip = ir->IntPoint(i);
-       Tr.SetIntPoint(&ip);
-       CalcInverse(Tr.Jacobian(), J0i);
- 
-@@ -721,16 +758,15 @@ void IncompressibleNeoHookeanIntegrator::AssembleElementGrad(
-          }
-       }
-    }
--
- }
- 
--
--const IntegrationRule&
--VectorConvectionNLFIntegrator::GetRule(const FiniteElement &fe,
--                                       ElementTransformation &T)
-+const IntegrationRule &VectorConvectionNLFIntegrator::GetRule(
-+   const FiniteElement &trial_fe, const FiniteElement &test_fe,
-+   ElementTransformation &Trans) const
- {
--   const int order = 2 * fe.GetOrder() + T.OrderGrad(&fe);
--   return IntRules.Get(fe.GetGeomType(), order);
-+   int order = Trans.OrderGrad(&trial_fe) + trial_fe.GetOrder() +
-+               test_fe.GetOrder();
-+   return IntRules.Get(trial_fe.GetGeomType(), order);
- }
- 
- void VectorConvectionNLFIntegrator::AssembleElementVector(
-@@ -751,7 +787,9 @@ void VectorConvectionNLFIntegrator::AssembleElementVector(
-    ELV.UseExternalData(elvect.GetData(), nd, dim);
- 
-    Vector vec1(dim), vec2(dim);
-+
-    const IntegrationRule *ir = IntRule ? IntRule : &GetRule(el, T);
-+
-    ELV = 0.0;
-    for (int i = 0; i < ir->GetNPoints(); i++)
-    {
-@@ -841,7 +879,6 @@ void VectorConvectionNLFIntegrator::AssembleElementGrad(
-    }
- }
- 
--
- void ConvectiveVectorConvectionNLFIntegrator::AssembleElementGrad(
-    const FiniteElement &el,
-    ElementTransformation &trans,
-@@ -890,7 +927,6 @@ void ConvectiveVectorConvectionNLFIntegrator::AssembleElementGrad(
-    }
- }
- 
--
- void SkewSymmetricVectorConvectionNLFIntegrator::AssembleElementGrad(
-    const FiniteElement &el,
-    ElementTransformation &trans,
-diff --git a/fem/nonlininteg.hpp b/fem/nonlininteg.hpp
-index 38b133244..4c2ee2470 100644
---- a/fem/nonlininteg.hpp
-+++ b/fem/nonlininteg.hpp
-@@ -17,6 +17,7 @@
- #include "coefficient.hpp"
- #include "fespace.hpp"
- #include "ceed/interface/operator.hpp"
-+#include "ceed/interface/util.hpp"
- 
- namespace mfem
- {
-@@ -37,23 +38,43 @@ protected:
-       : IntRule(ir), ceedOp(NULL) {}
- 
- public:
-+   /// Set the memory type used for GeometricFactors and other large allocations
-+   /// in PA extensions.
-+   void SetPAMemoryType(MemoryType mt) { pa_mt = mt; }
-+
-+   /// Indicates whether this integrator can use a Ceed backend.
-+   virtual bool SupportsCeed() const { return false; }
-+
-+   /// Access the underlying ceed::Operator for libCEED backends, after the
-+   /// integrator has been assembled.
-+   ceed::Operator &GetCeedOp() { return *ceedOp; }
-+
-    /** @brief Prescribe a fixed IntegrationRule to use (when @a ir != NULL) or
-        let the integrator choose (when @a ir == NULL). */
-    virtual void SetIntRule(const IntegrationRule *ir) { IntRule = ir; }
--
--   /// Prescribe a fixed IntegrationRule to use.
-    void SetIntegrationRule(const IntegrationRule &ir) { SetIntRule(&ir); }
- 
--   /// Set the memory type used for GeometricFactors and other large allocations
--   /// in PA extensions.
--   void SetPAMemoryType(MemoryType mt) { pa_mt = mt; }
--
-    /// Get the integration rule of the integrator (possibly NULL).
-    const IntegrationRule *GetIntegrationRule() const { return IntRule; }
- 
-+   /// Get the integration rule of the integrator as a function of the finite
-+   /// element and geometry orders.
-+   virtual const IntegrationRule &GetRule(const FiniteElement &el1,
-+                                          const FiniteElement &el2,
-+                                          ElementTransformation &Trans) const;
-+   virtual const IntegrationRule &GetRule(const FiniteElement &el1,
-+                                          const FiniteElement &el2,
-+                                          FaceElementTransformations &Trans) const;
-+   const IntegrationRule &GetRule(const FiniteElement &el,
-+                                  ElementTransformation &Trans) const
-+   { return GetRule(el, el, Trans); }
-+   const IntegrationRule &GetRule(const FiniteElement &el,
-+                                  FaceElementTransformations &Trans) const
-+   { return GetRule(el, el, Trans); }
-+
-    /// Method defining partial assembly.
-    /** The result of the partial assembly is stored internally so that it can be
--       used later in the methods AddMultPA(). */
-+       used later in the methods AddMultPA() and AddMultTransposePA(). */
-    virtual void AssemblePA(const FiniteElementSpace &fes);
- 
-    /** @brief Prepare the integrator for partial assembly (PA) gradient
-@@ -132,11 +153,6 @@ public:
-                                  FaceElementTransformations &Tr,
-                                  const Vector &elfun, DenseMatrix &elmat);
- 
--   /// Indicates whether this integrator can use a Ceed backend.
--   virtual bool SupportsCeed() const { return false; }
--
--   ceed::Operator &GetCeedOp() { return *ceedOp; }
--
-    virtual ~NonlinearFormIntegrator()
-    {
-       delete ceedOp;
-@@ -149,7 +165,36 @@ public:
-     for block state vectors. */
- class BlockNonlinearFormIntegrator
- {
-+protected:
-+   const IntegrationRule *IntRule;
-+
-+   BlockNonlinearFormIntegrator(const IntegrationRule *ir = NULL)
-+      : IntRule(ir) {}
-+
- public:
-+   /** @brief Prescribe a fixed IntegrationRule to use (when @a ir != NULL) or
-+       let the integrator choose (when @a ir == NULL). */
-+   virtual void SetIntRule(const IntegrationRule *ir) { IntRule = ir; }
-+   void SetIntegrationRule(const IntegrationRule &ir) { SetIntRule(&ir); }
-+
-+   /// Get the integration rule of the integrator (possibly NULL).
-+   const IntegrationRule *GetIntegrationRule() const { return IntRule; }
-+
-+   /// Get the integration rule of the integrator as a function of the finite
-+   /// element and geometry orders.
-+   virtual const IntegrationRule &GetRule(const FiniteElement &el1,
-+                                          const FiniteElement &el2,
-+                                          ElementTransformation &Tr) const;
-+   virtual const IntegrationRule &GetRule(const FiniteElement &el1,
-+                                          const FiniteElement &el2,
-+                                          FaceElementTransformations &Tr) const;
-+   const IntegrationRule &GetRule(const FiniteElement &el,
-+                                  ElementTransformation &Trans) const
-+   { return GetRule(el, el, Trans); }
-+   const IntegrationRule &GetRule(const FiniteElement &el,
-+                                  FaceElementTransformations &Trans) const
-+   { return GetRule(el, el, Trans); }
-+
-    /// Compute the local energy
-    virtual double GetElementEnergy(const Array<const FiniteElement *>&el,
-                                    ElementTransformation &Tr,
-@@ -182,8 +227,7 @@ public:
-    virtual ~BlockNonlinearFormIntegrator() {}
- };
- 
--
--/// Abstract class for hyperelastic models
-+/// Abstract base class for hyperelastic models
- class HyperelasticModel
- {
- protected:
-@@ -228,7 +272,6 @@ public:
-                           const double weight, DenseMatrix &A) const = 0;
- };
- 
--
- /** Inverse-harmonic hyperelastic model with a strain energy density function
-     given by the formula: W(J) = (1/2) det(J) Tr((J J^t)^{-1}) where J is the
-     deformation gradient. */
-@@ -247,7 +290,6 @@ public:
-                           const double weight, DenseMatrix &A) const;
- };
- 
--
- /** Neo-Hookean hyperelastic model with a strain energy density function given
-     by the formula: \f$(\mu/2)(\bar{I}_1 - dim) + (K/2)(det(J)/g - 1)^2\f$ where
-     J is the deformation gradient and \f$\bar{I}_1 = (det(J))^{-2/dim} Tr(J
-@@ -281,7 +323,6 @@ public:
-                           const double weight, DenseMatrix &A) const;
- };
- 
--
- /** Hyperelastic integrator for any given HyperelasticModel.
- 
-     Represents @f$ \int W(Jpt) dx @f$ over a target zone, where W is the
-@@ -310,10 +351,11 @@ public:
-    /** @param[in] m  HyperelasticModel that will be integrated. */
-    HyperelasticNLFIntegrator(HyperelasticModel *m) : model(m) {}
- 
--   /** @brief Computes the integral of W(Jacobian(Trt)) over a target zone
--       @param[in] el     Type of FiniteElement.
--       @param[in] Ttr    Represents ref->target coordinates transformation.
--       @param[in] elfun  Physical coordinates of the zone. */
-+   using NonlinearFormIntegrator::GetRule;
-+   virtual const IntegrationRule &GetRule(const FiniteElement &el1,
-+                                          const FiniteElement &el2,
-+                                          ElementTransformation &Trans) const;
-+
-    virtual double GetElementEnergy(const FiniteElement &el,
-                                    ElementTransformation &Ttr,
-                                    const Vector &elfun);
-@@ -342,24 +384,26 @@ private:
- public:
-    IncompressibleNeoHookeanIntegrator(Coefficient &mu_) : c_mu(&mu_) {}
- 
-+   using BlockNonlinearFormIntegrator::GetRule;
-+   virtual const IntegrationRule &GetRule(const FiniteElement &el1,
-+                                          const FiniteElement &el2,
-+                                          ElementTransformation &Trans) const;
-+
-    virtual double GetElementEnergy(const Array<const FiniteElement *>&el,
-                                    ElementTransformation &Tr,
-                                    const Array<const Vector *> &elfun);
- 
--   /// Perform the local action of the NonlinearFormIntegrator
-    virtual void AssembleElementVector(const Array<const FiniteElement *> &el,
-                                       ElementTransformation &Tr,
-                                       const Array<const Vector *> &elfun,
-                                       const Array<Vector *> &elvec);
- 
--   /// Assemble the local gradient matrix
-    virtual void AssembleElementGrad(const Array<const FiniteElement*> &el,
-                                     ElementTransformation &Tr,
-                                     const Array<const Vector *> &elfun,
-                                     const Array2D<DenseMatrix *> &elmats);
- };
- 
--
- class VectorConvectionNLFIntegrator : public NonlinearFormIntegrator
- {
- private:
-@@ -378,8 +422,12 @@ public:
- 
-    VectorConvectionNLFIntegrator() = default;
- 
--   static const IntegrationRule &GetRule(const FiniteElement &fe,
--                                         ElementTransformation &T);
-+   virtual bool SupportsCeed() const { return DeviceCanUseCeed(); }
-+
-+   using NonlinearFormIntegrator::GetRule;
-+   virtual const IntegrationRule &GetRule(const FiniteElement &el1,
-+                                          const FiniteElement &el2,
-+                                          ElementTransformation &Trans) const;
- 
-    virtual void AssembleElementVector(const FiniteElement &el,
-                                       ElementTransformation &trans,
-@@ -400,7 +448,6 @@ public:
-    virtual void AddMultMF(const Vector &x, Vector &y) const;
- };
- 
--
- /** This class is used to assemble the convective form of the nonlinear term
-     arising in the Navier-Stokes equations \f$(u \cdot \nabla v, w )\f$ */
- class ConvectiveVectorConvectionNLFIntegrator :
-@@ -422,7 +469,6 @@ public:
-                                     DenseMatrix &elmat);
- };
- 
--
- /** This class is used to assemble the skew-symmetric form of the nonlinear term
-     arising in the Navier-Stokes equations
-     \f$.5*(u \cdot \nabla v, w ) - .5*(u \cdot \nabla w, v )\f$ */
-diff --git a/fem/transfer.cpp b/fem/transfer.cpp
-index 7f95ca9fe..a7dd0731c 100644
---- a/fem/transfer.cpp
-+++ b/fem/transfer.cpp
-@@ -930,17 +930,13 @@ TransferOperator::TransferOperator(const FiniteElementSpace& lFESpace_,
-       P.SetOperatorOwner(false);
-       opr = P.Ptr();
-    }
--   else if (lFESpace_.GetMesh()->GetNE() > 0
--            && hFESpace_.GetMesh()->GetNE() > 0
--            && lFESpace_.GetVDim() == 1
--            && hFESpace_.GetVDim() == 1
--            && dynamic_cast<const TensorBasisElement*>(lFESpace_.GetFE(0))
--            && dynamic_cast<const TensorBasisElement*>(hFESpace_.GetFE(0))
--            && !isvar_order
--            && (hFESpace_.FEColl()->GetContType() ==
--                mfem::FiniteElementCollection::CONTINUOUS ||
--                hFESpace_.FEColl()->GetContType() ==
--                mfem::FiniteElementCollection::DISCONTINUOUS))
-+   else if (UsesTensorBasis(lFESpace_) && UsesTensorBasis(hFESpace_) &&
-+            lFESpace_.GetVDim() == 1 && hFESpace_.GetVDim() == 1 &&
-+            !isvar_order &&
-+            (hFESpace_.FEColl()->GetContType() ==
-+             mfem::FiniteElementCollection::CONTINUOUS ||
-+             hFESpace_.FEColl()->GetContType() ==
-+             mfem::FiniteElementCollection::DISCONTINUOUS))
-    {
-       opr = new TensorProductPRefinementTransferOperator(lFESpace_, hFESpace_);
-    }
-diff --git a/general/array.cpp b/general/array.cpp
-index 12c3e3c06..e1a81e2d1 100644
---- a/general/array.cpp
-+++ b/general/array.cpp
-@@ -175,6 +175,7 @@ void Array2D<T>::Print(std::ostream &os, int width_)
-    }
- }
- 
-+template class Array<bool>;
- template class Array<char>;
- template class Array<int>;
- template class Array<long long>;
-diff --git a/general/device.cpp b/general/device.cpp
-index ccee71cd7..1ea480245 100644
---- a/general/device.cpp
-+++ b/general/device.cpp
-@@ -481,14 +481,14 @@ static void OccaDeviceSetup(const int dev)
- #endif
- }
- 
--static void CeedDeviceSetup(const char* ceed_spec)
-+static void CeedDeviceSetup(const char *ceed_spec)
- {
- #ifdef MFEM_USE_CEED
-    CeedInit(ceed_spec, &internal::ceed);
-    const char *ceed_backend;
-    CeedGetResource(internal::ceed, &ceed_backend);
--   if (strcmp(ceed_spec, ceed_backend) && strcmp(ceed_spec, "/cpu/self") &&
--       strcmp(ceed_spec, "/gpu/hip"))
-+   size_t ceed_spec_len = strlen(ceed_spec);
-+   if (strncmp(ceed_spec, ceed_backend, ceed_spec_len))
-    {
-       mfem::out << std::endl << "WARNING!!!\n"
-                 "libCEED is not using the requested backend!!!\n"
-diff --git a/makefile b/makefile
-index a606f6dfe..ee5cd6b0d 100644
---- a/makefile
-+++ b/makefile
-@@ -271,7 +271,6 @@ MFEM_REQ_LIB_DEPS = ENZYME SUPERLU MUMPS METIS FMS CONDUIT SIDRE LAPACK SUNDIALS
-  GSLIB OCCA CEED RAJA UMPIRE MKL_CPARDISO AMGX CALIPER PARELAG BENCHMARK\
-  MOONOLITH ALGOIM
- 
--
- PETSC_ERROR_MSG = $(if $(PETSC_FOUND),,. PETSC config not found: $(PETSC_VARS))
- SLEPC_ERROR_MSG = $(if $(SLEPC_FOUND),,. SLEPC config not found: $(SLEPC_VARS))
- 
-@@ -409,7 +408,11 @@ endif
- DIRS = general linalg linalg/simd mesh mesh/submesh fem fem/ceed/interface \
-        fem/ceed/integrators/mass fem/ceed/integrators/convection \
-        fem/ceed/integrators/diffusion fem/ceed/integrators/nlconvection \
--       fem/ceed/solvers fem/fe fem/lor fem/qinterp fem/integ fem/tmop
-+       fem/ceed/integrators/vecfemass fem/ceed/integrators/divdiv \
-+       fem/ceed/integrators/curlcurl fem/ceed/integrators/mixedvecgrad \
-+       fem/ceed/integrators/mixedveccurl fem/ceed/integrators/interp \
-+       fem/ceed/integrators/util fem/ceed/solvers \
-+       fem/fe fem/lor fem/qinterp fem/integ fem/tmop
- 
- ifeq ($(MFEM_USE_MOONOLITH),YES)
-    MFEM_CXXFLAGS += $(MOONOLITH_CXX_FLAGS)
-@@ -423,7 +426,7 @@ RELSRC_FILES = $(patsubst $(SRC)%,%,$(SOURCE_FILES))
- OBJECT_FILES = $(patsubst $(SRC)%,$(BLD)%,$(SOURCE_FILES:.cpp=.o))
- OKL_DIRS = fem
- 
--.PHONY: lib all clean distclean install config status info deps serial parallel	\
-+.PHONY: lib all clean distclean install config status info deps serial parallel \
- 	debug pdebug cuda hip pcuda cudebug pcudebug hpc style check test unittest \
- 	deprecation-warnings
- 
-@@ -603,6 +606,14 @@ install: $(if $(static),$(BLD)libmfem.a) $(if $(shared),$(BLD)libmfem.$(SO_EXT))
- 	$(INSTALL) -m 640 $(SRC)fem/ceed/integrators/diffusion/*.h $(PREFIX_INC)/mfem/fem/ceed/integrators/diffusion
- 	mkdir -p $(PREFIX_INC)/mfem/fem/ceed/integrators/nlconvection
- 	$(INSTALL) -m 640 $(SRC)fem/ceed/integrators/nlconvection/*.h $(PREFIX_INC)/mfem/fem/ceed/integrators/nlconvection
-+	mkdir -p $(PREFIX_INC)/mfem/fem/ceed/integrators/vecfemass
-+	$(INSTALL) -m 640 $(SRC)fem/ceed/integrators/vecfemass/*.h $(PREFIX_INC)/mfem/fem/ceed/integrators/vecfemass
-+	mkdir -p $(PREFIX_INC)/mfem/fem/ceed/integrators/divdiv
-+	$(INSTALL) -m 640 $(SRC)fem/ceed/integrators/divdiv/*.h $(PREFIX_INC)/mfem/fem/ceed/integrators/divdiv
-+	mkdir -p $(PREFIX_INC)/mfem/fem/ceed/integrators/curlcurl
-+	$(INSTALL) -m 640 $(SRC)fem/ceed/integrators/curlcurl/*.h $(PREFIX_INC)/mfem/fem/ceed/integrators/curlcurl
-+	mkdir -p $(PREFIX_INC)/mfem/fem/ceed/integrators/util
-+	$(INSTALL) -m 640 $(SRC)fem/ceed/integrators/util/*.h $(PREFIX_INC)/mfem/fem/ceed/integrators/util
- # install config.mk in $(PREFIX_SHARE)
- 	mkdir -p $(PREFIX_SHARE)
- 	$(MAKE) -C $(BLD)config config-mk CONFIG_MK=config-install.mk
-@@ -726,17 +737,16 @@ status info:
- ASTYLE_BIN = astyle
- ASTYLE = $(ASTYLE_BIN) --options=$(SRC)config/mfem.astylerc
- ASTYLE_VER = "Artistic Style Version 3.1"
--FORMAT_FILES = $(foreach dir,$(DIRS) $(EM_DIRS) config,$(dir)/*.?pp)
--FORMAT_FILES += tests/unit/*.?pp
--UNIT_TESTS_SUBDIRS = general linalg mesh fem miniapps ceed
--MINIAPPS_SUBDIRS = dpg/util hooke/operators hooke/preconditioners hooke/materials hooke/kernels
--FORMAT_FILES += $(foreach dir,$(UNIT_TESTS_SUBDIRS),tests/unit/$(dir)/*.?pp)
--FORMAT_FILES += $(foreach dir,$(MINIAPPS_SUBDIRS),miniapps/$(dir)/*.?pp)
--FORMAT_EXCLUDE = general/tinyxml2.cpp tests/unit/catch.hpp
-+FORMAT_FILES = $(foreach dir,$(DIRS) $(EM_DIRS) config,$(dir)/*.[ch]pp $(dir)/*.[ch])
-+FORMAT_FILES += tests/unit/*.[ch]pp
-+FORMAT_FILES += $(foreach dir,$(wildcard tests/unit/*),$(dir)/*.[ch]pp $(dir)/*.[ch])
-+FORMAT_FILES += $(foreach dir,$(wildcard miniapps/*/*),$(dir)/*.[ch]pp $(dir)/*.[ch])
-+FORMAT_EXCLUDE = general/tinyxml2.cpp tests/unit/catch.hpp fem/picojson.h general/tinyxml2.h
- FORMAT_LIST = $(filter-out $(FORMAT_EXCLUDE),$(wildcard $(FORMAT_FILES)))
- 
--COUT_CERR_FILES = $(foreach dir,$(DIRS),$(dir)/*.[ch]pp)
--COUT_CERR_EXCLUDE = '^general/error\.cpp' '^general/globals\.[ch]pp'
-+COUT_CERR_FILES = $(foreach dir,$(DIRS),$(dir)/*.[ch]pp $(dir)/*.[ch])
-+COUT_CERR_EXCLUDE = general/error.cpp general/globals.cpp general/globals.hpp
-+COUT_CERR_LIST = $(filter-out $(COUT_CERR_EXCLUDE),$(wildcard $(COUT_CERR_FILES)))
- 
- DEPRECATION_WARNING := \
- "This feature is planned for removal in the next release."\
-@@ -772,12 +782,12 @@ style:
- 	    "Please make sure the changes are committed");\
- 	echo "Checking for use of std::cout...";\
- 	$(call mfem_check_command,\
--	   grep cout $(COUT_CERR_FILES) | grep -v $(COUT_CERR_EXCLUDE:%=-e %),\
-+	   grep cout $(COUT_CERR_LIST),\
- 	   "No use of std::cout found", "Use mfem::out instead of std::cout");\
- 	echo "Checking for use of std::cerr...";\
- 	$(call mfem_check_command,\
--	   grep cerr $(COUT_CERR_FILES) |\
--	      grep -v $(COUT_CERR_EXCLUDE:%=-e %) -e cerrno,\
-+	   grep cerr $(COUT_CERR_LIST) |\
-+	      grep -v -e cerrno,\
- 	   "No use of std::cerr found", "Use mfem::err instead of std::cerr");\
- 	exit $$err_code
- 
-diff --git a/miniapps/shifted/sbm_solver.hpp b/miniapps/shifted/sbm_solver.hpp
-index db16738d5..b8830bf31 100644
---- a/miniapps/shifted/sbm_solver.hpp
-+++ b/miniapps/shifted/sbm_solver.hpp
-@@ -121,7 +121,6 @@ public:
-         par_shared_face_count(0),
-         cut_marker(cut_marker_) { }
- 
--   using BilinearFormIntegrator::AssembleFaceMatrix;
-    virtual void AssembleFaceMatrix(const FiniteElement &el1,
-                                    const FiniteElement &el2,
-                                    FaceElementTransformations &Trans,
-@@ -241,7 +240,6 @@ public:
-         par_shared_face_count(0),
-         cut_marker(cut_marker_) { }
- 
--   using BilinearFormIntegrator::AssembleFaceMatrix;
-    virtual void AssembleFaceMatrix(const FiniteElement &el1,
-                                    const FiniteElement &el2,
-                                    FaceElementTransformations &Trans,
-diff --git a/tests/unit/ceed/test_ceed.cpp b/tests/unit/ceed/test_ceed.cpp
-index 971b68e24..46f5e6bab 100644
---- a/tests/unit/ceed/test_ceed.cpp
-+++ b/tests/unit/ceed/test_ceed.cpp
-@@ -21,7 +21,15 @@ namespace ceed_test
- 
- #ifdef MFEM_USE_CEED
- 
--enum class CeedCoeffType { Const, Grid, Quad, VecConst, VecGrid, VecQuad };
-+enum class CeedCoeffType { Const,
-+                           Grid,
-+                           Quad,
-+                           VecConst,
-+                           VecGrid,
-+                           VecQuad,
-+                           MatConst,
-+                           MatQuad
-+                         };
- 
- double coeff_function(const Vector &x)
- {
-@@ -41,11 +49,24 @@ void velocity_function(const Vector &x, Vector &v)
-    }
- }
- 
-+// Matrix-valued velocity coefficient
-+void matrix_velocity_function(const Vector &x, DenseMatrix &m)
-+{
-+   int dim = x.Size();
-+   Vector v(dim);
-+   velocity_function(x, v);
-+   m.SetSize(dim);
-+   m = 0.5;
-+   for (int i = 0; i < dim; i++)
-+   {
-+      m(i, i) = 1.0 + v(i);
-+   }
-+}
-+
- // Vector valued quantity to convect
- void quantity(const Vector &x, Vector &u)
- {
-    int dim = x.Size();
--
-    switch (dim)
-    {
-       case 1: u(0) = x[0]*x[0]; break;
-@@ -59,7 +80,6 @@ void quantity(const Vector &x, Vector &u)
- void convected_quantity(const Vector &x, Vector &u)
- {
-    double a, b, c;
--
-    int dim = x.Size();
-    switch (dim)
-    {
-@@ -82,7 +102,7 @@ void convected_quantity(const Vector &x, Vector &u)
-    }
- }
- 
--std::string getString(AssemblyLevel assembly)
-+std::string GetString(AssemblyLevel assembly)
- {
-    switch (assembly)
-    {
-@@ -106,7 +126,7 @@ std::string getString(AssemblyLevel assembly)
-    return "";
- }
- 
--std::string getString(CeedCoeffType coeff_type)
-+std::string GetString(CeedCoeffType coeff_type)
- {
-    switch (coeff_type)
-    {
-@@ -128,6 +148,12 @@ std::string getString(CeedCoeffType coeff_type)
-       case CeedCoeffType::VecQuad:
-          return "VecQuad";
-          break;
-+      case CeedCoeffType::MatConst:
-+         return "MatConst";
-+         break;
-+      case CeedCoeffType::MatQuad:
-+         return "MatQuad";
-+         break;
-    }
-    MFEM_ABORT("Unknown CeedCoeffType.");
-    return "";
-@@ -138,10 +164,16 @@ enum class Problem { Mass,
-                      Diffusion,
-                      VectorMass,
-                      VectorDiffusion,
--                     MassDiffusion
-+                     MassDiffusion,
-+                     HDivMass,
-+                     HCurlMass,
-+                     DivDiv,
-+                     CurlCurl,
-+                     MixedVectorGradient,
-+                     MixedVectorCurl
-                    };
- 
--std::string getString(Problem pb)
-+std::string GetString(Problem pb)
- {
-    switch (pb)
-    {
-@@ -163,6 +195,24 @@ std::string getString(Problem pb)
-       case Problem::MassDiffusion:
-          return "MassDiffusion";
-          break;
-+      case Problem::HDivMass:
-+         return "HDivMass";
-+         break;
-+      case Problem::HCurlMass:
-+         return "HCurlMass";
-+         break;
-+      case Problem::DivDiv:
-+         return "DivDiv";
-+         break;
-+      case Problem::CurlCurl:
-+         return "CurlCurl";
-+         break;
-+      case Problem::MixedVectorGradient:
-+         return "MixedVectorGradient";
-+         break;
-+      case Problem::MixedVectorCurl:
-+         return "MixedVectorCurl";
-+         break;
-    }
-    MFEM_ABORT("Unknown Problem.");
-    return "";
-@@ -170,7 +220,7 @@ std::string getString(Problem pb)
- 
- enum class NLProblem {Convection};
- 
--std::string getString(NLProblem pb)
-+std::string GetString(NLProblem pb)
- {
-    switch (pb)
-    {
-@@ -178,14 +228,15 @@ std::string getString(NLProblem pb)
-          return "Convection";
-          break;
-    }
--   MFEM_ABORT("Unknown Problem.");
-+   MFEM_ABORT("Unknown NLProblem.");
-    return "";
- }
- 
- void InitCoeff(Mesh &mesh, FiniteElementCollection &fec, const int dim,
-                const CeedCoeffType coeff_type, GridFunction *&gf,
--               FiniteElementSpace *& coeff_fes,
--               Coefficient *&coeff, VectorCoefficient *&vcoeff)
-+               FiniteElementSpace *&coeff_fes,
-+               Coefficient *&coeff, VectorCoefficient *&vcoeff,
-+               MatrixCoefficient *&mcoeff)
- {
-    switch (coeff_type)
-    {
-@@ -209,7 +260,7 @@ void InitCoeff(Mesh &mesh, FiniteElementCollection &fec, const int dim,
-          Vector val(dim);
-          for (int i = 0; i < dim; i++)
-          {
--            val(i) = 1.0;
-+            val(i) = 1.0 + i;
-          }
-          vcoeff = new VectorConstantCoefficient(val);
-          break;
-@@ -226,21 +277,38 @@ void InitCoeff(Mesh &mesh, FiniteElementCollection &fec, const int dim,
-       case CeedCoeffType::VecQuad:
-          vcoeff = new VectorFunctionCoefficient(dim, velocity_function);
-          break;
-+      case CeedCoeffType::MatConst:
-+      {
-+         DenseMatrix val(dim);
-+         val = 0.5;
-+         for (int i = 0; i < dim; i++)
-+         {
-+            val(i, i) = 1.0 + i;
-+         }
-+         mcoeff = new MatrixConstantCoefficient(val);
-+         break;
-+      }
-+      case CeedCoeffType::MatQuad:
-+         mcoeff = new MatrixFunctionCoefficient(dim, matrix_velocity_function);
-+         break;
-    }
- }
- 
--void test_ceed_operator(const char* input, int order,
-+void test_ceed_operator(const char *input, int order,
-                         const CeedCoeffType coeff_type, const Problem pb,
--                        const AssemblyLevel assembly)
-+                        const AssemblyLevel assembly, bool mixed_p, bool bdr_integ)
- {
--   std::string section = "assembly: " + getString(assembly) + "\n" +
--                         "coeff_type: " + getString(coeff_type) + "\n" +
--                         "pb: " + getString(pb) + "\n" +
-+   std::string section = "assembly: " + GetString(assembly) + "\n" +
-+                         "coeff_type: " + GetString(coeff_type) + "\n" +
-+                         "pb: " + GetString(pb) + "\n" +
-                          "order: " + std::to_string(order) + "\n" +
-+                         (mixed_p ? "mixed_p: true\n" : "") +
-+                         (bdr_integ ? "bdr_integ: true\n" : "") +
-                          "mesh: " + input;
-    INFO(section);
-    Mesh mesh(input, 1, 1);
-    mesh.EnsureNodes();
-+   if (mixed_p) { mesh.EnsureNCMesh(); }
-    int dim = mesh.Dimension();
-    H1_FECollection fec(order, dim);
- 
-@@ -249,43 +317,65 @@ void test_ceed_operator(const char* input, int order,
-    FiniteElementSpace *coeff_fes = nullptr;
-    Coefficient *coeff = nullptr;
-    VectorCoefficient *vcoeff = nullptr;
--   InitCoeff(mesh, fec, dim, coeff_type, gf, coeff_fes, coeff, vcoeff);
-+   MatrixCoefficient *mcoeff = nullptr;
-+   InitCoeff(mesh, fec, dim, coeff_type, gf, coeff_fes, coeff, vcoeff, mcoeff);
-+   MFEM_VERIFY(!mcoeff,
-+               "Unexpected matrix-valued coefficient in test_ceed_operator.");
- 
-    // Build the BilinearForm
-    bool vecOp = pb == Problem::VectorMass || pb == Problem::VectorDiffusion;
-    const int vdim = vecOp ? dim : 1;
-    FiniteElementSpace fes(&mesh, &fec, vdim);
-+   if (mixed_p)
-+   {
-+      fes.SetElementOrder(0, order+1);
-+      fes.SetElementOrder(fes.GetNE() - 1, order+1);
-+      fes.Update(false);
-+   }
- 
--   BilinearForm k_test(&fes);
-    BilinearForm k_ref(&fes);
-+   BilinearForm k_test(&fes);
-+   auto AddIntegrator = [&bdr_integ](BilinearForm &k, BilinearFormIntegrator *blfi)
-+   {
-+      if (bdr_integ)
-+      {
-+         k.AddBoundaryIntegrator(blfi);
-+      }
-+      else
-+      {
-+         k.AddDomainIntegrator(blfi);
-+      }
-+   };
-    switch (pb)
-    {
-       case Problem::Mass:
--         k_ref.AddDomainIntegrator(new MassIntegrator(*coeff));
--         k_test.AddDomainIntegrator(new MassIntegrator(*coeff));
-+         AddIntegrator(k_ref, new MassIntegrator(*coeff));
-+         AddIntegrator(k_test, new MassIntegrator(*coeff));
-          break;
-       case Problem::Convection:
--         k_ref.AddDomainIntegrator(new ConvectionIntegrator(*vcoeff,-1));
--         k_test.AddDomainIntegrator(new ConvectionIntegrator(*vcoeff,-1));
-+         AddIntegrator(k_ref, new ConvectionIntegrator(*vcoeff, -1));
-+         AddIntegrator(k_test, new ConvectionIntegrator(*vcoeff, -1));
-          break;
-       case Problem::Diffusion:
--         k_ref.AddDomainIntegrator(new DiffusionIntegrator(*coeff));
--         k_test.AddDomainIntegrator(new DiffusionIntegrator(*coeff));
-+         AddIntegrator(k_ref, new DiffusionIntegrator(*coeff));
-+         AddIntegrator(k_test, new DiffusionIntegrator(*coeff));
-          break;
-       case Problem::VectorMass:
--         k_ref.AddDomainIntegrator(new VectorMassIntegrator(*coeff));
--         k_test.AddDomainIntegrator(new VectorMassIntegrator(*coeff));
-+         AddIntegrator(k_ref, new VectorMassIntegrator(*coeff));
-+         AddIntegrator(k_test, new VectorMassIntegrator(*coeff));
-          break;
-       case Problem::VectorDiffusion:
--         k_ref.AddDomainIntegrator(new VectorDiffusionIntegrator(*coeff));
--         k_test.AddDomainIntegrator(new VectorDiffusionIntegrator(*coeff));
-+         AddIntegrator(k_ref, new VectorDiffusionIntegrator(*coeff));
-+         AddIntegrator(k_test, new VectorDiffusionIntegrator(*coeff));
-          break;
-       case Problem::MassDiffusion:
--         k_ref.AddDomainIntegrator(new MassIntegrator(*coeff));
--         k_test.AddDomainIntegrator(new MassIntegrator(*coeff));
--         k_ref.AddDomainIntegrator(new DiffusionIntegrator(*coeff));
--         k_test.AddDomainIntegrator(new DiffusionIntegrator(*coeff));
-+         AddIntegrator(k_ref, new MassIntegrator(*coeff));
-+         AddIntegrator(k_test, new MassIntegrator(*coeff));
-+         AddIntegrator(k_ref, new DiffusionIntegrator(*coeff));
-+         AddIntegrator(k_test, new DiffusionIntegrator(*coeff));
-          break;
-+      default:
-+         MFEM_ABORT("Unexpected problem type.");
-    }
- 
-    k_ref.Assemble();
-@@ -294,121 +384,499 @@ void test_ceed_operator(const char* input, int order,
-    k_test.SetAssemblyLevel(assembly);
-    k_test.Assemble();
- 
--   // Compare ceed with mfem.
-+   // Compare ceed with mfem
-    GridFunction x(&fes), y_ref(&fes), y_test(&fes);
-+   Vector d_ref(fes.GetTrueVSize()), d_test(fes.GetTrueVSize());
- 
-    x.Randomize(1);
- 
--   k_ref.Mult(x,y_ref);
--   k_test.Mult(x,y_test);
-+   k_ref.Mult(x, y_ref);
-+   k_test.Mult(x, y_test);
- 
-    y_test -= y_ref;
- 
--   REQUIRE(y_test.Norml2() < 1.e-12);
-+   REQUIRE(y_test.Norml2() < 1.e-12 * std::max(y_ref.Norml2(), 1.0));
-+
-+   if (mesh.Nonconforming())
-+   {
-+      k_ref.ConformingAssemble();
-+   }
-+   k_ref.AssembleDiagonal(d_ref);
-+   k_test.AssembleDiagonal(d_test);
-+
-+   d_test -= d_ref;
-+
-+   // // TODO: Debug
-+   // if (mesh.Nonconforming() &&
-+   //    d_test.Norml2() > 0.1 * d_ref.Norml2())
-+   // {
-+   //    out << "\nDIAGONAL ASSEMBLY DELTA\n\n";
-+   //    d_test.Print();
-+   //    out << "\nDIAGONAL ASSEMBLY REF\n\n";
-+   //    d_ref.Print();
-+   //    // Vector temp(d_test);
-+   //    // temp += d_ref;
-+   //    // out << "\nDIAGONAL ASSEMBLY TEST\n\n";
-+   //    // temp.Print();
-+   // }
-+
-+   REQUIRE(d_test.Norml2() <
-+           (mesh.Nonconforming() ? 1.0 : 1.e-12) * std::max(d_ref.Norml2(), 1.0));
-    delete gf;
-    delete coeff_fes;
-    delete coeff;
-    delete vcoeff;
-+   delete mcoeff;
- }
- 
--void test_mixed_p_ceed_operator(const char* input, int order,
--                                const CeedCoeffType coeff_type, const Problem pb,
--                                const AssemblyLevel assembly)
-+void test_ceed_vectorfe_operator(const char *input, int order,
-+                                 const CeedCoeffType coeff_type, const Problem pb,
-+                                 const AssemblyLevel assembly, bool bdr_integ)
- {
--   std::string section = "assembly: " + getString(assembly) + "\n" +
--                         "coeff_type: " + getString(coeff_type) + "\n" +
--                         "pb: " + getString(pb) + "\n" +
-+   std::string section = "assembly: " + GetString(assembly) + "\n" +
-+                         "coeff_type: " + GetString(coeff_type) + "\n" +
-+                         "pb: " + GetString(pb) + "\n" +
-                          "order: " + std::to_string(order) + "\n" +
-+                         (bdr_integ ? "bdr_integ: true\n" : "") +
-                          "mesh: " + input;
-    INFO(section);
-    Mesh mesh(input, 1, 1);
-    mesh.EnsureNodes();
--   mesh.EnsureNCMesh();
-    int dim = mesh.Dimension();
--   MFEM_VERIFY(dim == 2, "p-adaptivity only supported in serial 2D.");
--   H1_FECollection fec(order, dim);
-+   FiniteElementCollection *fec = nullptr;
-+   if ((pb == Problem::HDivMass || pb == Problem::DivDiv) && bdr_integ)
-+   {
-+      // Boundary RT elements in 2D and 3D are actually L2
-+      return;
-+   }
-+   if (pb == Problem::CurlCurl && dim - bdr_integ < 2)
-+   {
-+      // No 1D ND curl shape
-+      return;
-+   }
-+   switch (pb)
-+   {
-+      case Problem::Mass:
-+      case Problem::Diffusion:
-+         fec = new H1_FECollection(order, dim);
-+         break;
-+      case Problem::HDivMass:
-+      case Problem::DivDiv:
-+         fec = new RT_FECollection(order-1, dim);
-+         break;
-+      case Problem::HCurlMass:
-+      case Problem::CurlCurl:
-+         fec = new ND_FECollection(order, dim);
-+         break;
-+      default:
-+         MFEM_ABORT("Unexpected problem type.");
-+   }
- 
-    // Coefficient Initialization
-    GridFunction *gf = nullptr;
-    FiniteElementSpace *coeff_fes = nullptr;
-    Coefficient *coeff = nullptr;
-    VectorCoefficient *vcoeff = nullptr;
--   InitCoeff(mesh, fec, dim, coeff_type, gf, coeff_fes, coeff, vcoeff);
-+   MatrixCoefficient *mcoeff = nullptr;
-+   InitCoeff(mesh, *fec, dim, coeff_type, gf, coeff_fes, coeff, vcoeff, mcoeff);
-+   if (!coeff && (pb == Problem::Mass || pb == Problem::DivDiv ||
-+                  (pb == Problem::CurlCurl && dim - bdr_integ < 3)))
-+   {
-+      delete gf;
-+      delete coeff_fes;
-+      delete coeff;
-+      delete vcoeff;
-+      delete mcoeff;
-+      delete fec;
-+      return;
-+   }
- 
-    // Build the BilinearForm
--   bool vecOp = pb == Problem::VectorMass || pb == Problem::VectorDiffusion;
--   const int vdim = vecOp ? dim : 1;
--   FiniteElementSpace fes(&mesh, &fec, vdim);
--   fes.SetElementOrder(0, order+1);
--   fes.SetElementOrder(fes.GetNE() - 1, order+1);
--   fes.Update(false);
-+   FiniteElementSpace fes(&mesh, fec);
- 
--   BilinearForm k_test(&fes);
-    BilinearForm k_ref(&fes);
-+   BilinearForm k_test(&fes);
-+   auto AddIntegrator = [&bdr_integ](BilinearForm &k, BilinearFormIntegrator *blfi)
-+   {
-+      if (bdr_integ)
-+      {
-+         k.AddBoundaryIntegrator(blfi);
-+      }
-+      else
-+      {
-+         k.AddDomainIntegrator(blfi);
-+      }
-+   };
-    switch (pb)
-    {
-       case Problem::Mass:
--         k_ref.AddDomainIntegrator(new MassIntegrator(*coeff));
--         k_test.AddDomainIntegrator(new MassIntegrator(*coeff));
--         break;
--      case Problem::Convection:
--         k_ref.AddDomainIntegrator(new ConvectionIntegrator(*vcoeff,-1));
--         k_test.AddDomainIntegrator(new ConvectionIntegrator(*vcoeff,-1));
-+         AddIntegrator(k_ref, new MassIntegrator(*coeff));
-+         AddIntegrator(k_test, new MassIntegrator(*coeff));
-          break;
-       case Problem::Diffusion:
--         k_ref.AddDomainIntegrator(new DiffusionIntegrator(*coeff));
--         k_test.AddDomainIntegrator(new DiffusionIntegrator(*coeff));
-+         if (coeff)
-+         {
-+            AddIntegrator(k_ref, new DiffusionIntegrator(*coeff));
-+            AddIntegrator(k_test, new DiffusionIntegrator(*coeff));
-+         }
-+         else if (vcoeff)
-+         {
-+            AddIntegrator(k_ref, new DiffusionIntegrator(*vcoeff));
-+            AddIntegrator(k_test, new DiffusionIntegrator(*vcoeff));
-+         }
-+         else if (mcoeff)
-+         {
-+            AddIntegrator(k_ref, new DiffusionIntegrator(*mcoeff));
-+            AddIntegrator(k_test, new DiffusionIntegrator(*mcoeff));
-+         }
-          break;
--      case Problem::VectorMass:
--         k_ref.AddDomainIntegrator(new VectorMassIntegrator(*coeff));
--         k_test.AddDomainIntegrator(new VectorMassIntegrator(*coeff));
-+      case Problem::HDivMass:
-+      case Problem::HCurlMass:
-+         if (coeff)
-+         {
-+            AddIntegrator(k_ref, new VectorFEMassIntegrator(*coeff));
-+            AddIntegrator(k_test, new VectorFEMassIntegrator(*coeff));
-+         }
-+         else if (vcoeff)
-+         {
-+            AddIntegrator(k_ref, new VectorFEMassIntegrator(*vcoeff));
-+            AddIntegrator(k_test, new VectorFEMassIntegrator(*vcoeff));
-+         }
-+         else if (mcoeff)
-+         {
-+            AddIntegrator(k_ref, new VectorFEMassIntegrator(*mcoeff));
-+            AddIntegrator(k_test, new VectorFEMassIntegrator(*mcoeff));
-+         }
-          break;
--      case Problem::VectorDiffusion:
--         k_ref.AddDomainIntegrator(new VectorDiffusionIntegrator(*coeff));
--         k_test.AddDomainIntegrator(new VectorDiffusionIntegrator(*coeff));
-+      case Problem::DivDiv:
-+         AddIntegrator(k_ref, new DivDivIntegrator(*coeff));
-+         AddIntegrator(k_test, new DivDivIntegrator(*coeff));
-          break;
--      case Problem::MassDiffusion:
--         k_ref.AddDomainIntegrator(new MassIntegrator(*coeff));
--         k_test.AddDomainIntegrator(new MassIntegrator(*coeff));
--         k_ref.AddDomainIntegrator(new DiffusionIntegrator(*coeff));
--         k_test.AddDomainIntegrator(new DiffusionIntegrator(*coeff));
-+      case Problem::CurlCurl:
-+         if (coeff)
-+         {
-+            AddIntegrator(k_ref, new CurlCurlIntegrator(*coeff));
-+            AddIntegrator(k_test, new CurlCurlIntegrator(*coeff));
-+         }
-+         else if (vcoeff)
-+         {
-+            AddIntegrator(k_ref, new CurlCurlIntegrator(*vcoeff));
-+            AddIntegrator(k_test, new CurlCurlIntegrator(*vcoeff));
-+         }
-+         else if (mcoeff)
-+         {
-+            AddIntegrator(k_ref, new CurlCurlIntegrator(*mcoeff));
-+            AddIntegrator(k_test, new CurlCurlIntegrator(*mcoeff));
-+         }
-          break;
-+      default:
-+         MFEM_ABORT("Unexpected problem type.");
-    }
- 
-+   // Timer for profiling
-+   const int trials = 1;
-+   const bool debug = false;
-+   StopWatch chrono_setup_ref, chrono_setup_test;
-+   StopWatch chrono_apply_ref, chrono_apply_test;
-+   chrono_setup_ref.Clear();
-+   chrono_setup_ref.Start();
-+
-    k_ref.Assemble();
-    k_ref.Finalize();
- 
-+   chrono_setup_ref.Stop();
-+   chrono_setup_test.Clear();
-+   chrono_setup_test.Start();
-+
-    k_test.SetAssemblyLevel(assembly);
-    k_test.Assemble();
- 
--   // Compare ceed with mfem.
-+   chrono_setup_test.Stop();
-+
-+   // Compare ceed with mfem
-    GridFunction x(&fes), y_ref(&fes), y_test(&fes);
-+   Vector d_ref(fes.GetTrueVSize()), d_test(fes.GetTrueVSize());
- 
-    x.Randomize(1);
- 
--   k_ref.Mult(x,y_ref);
--   k_test.Mult(x,y_test);
-+   chrono_apply_ref.Clear();
-+   chrono_apply_ref.Start();
-+
-+   for (int trial = 0; trial < trials; trial++)
-+   {
-+      k_ref.Mult(x, y_ref);
-+   }
-+
-+   chrono_apply_ref.Stop();
-+   chrono_apply_test.Clear();
-+   chrono_apply_test.Start();
-+
-+   for (int trial = 0; trial < trials; trial++)
-+   {
-+      k_test.Mult(x, y_test);
-+   }
-+
-+   chrono_apply_test.Stop();
- 
-    y_test -= y_ref;
- 
--   REQUIRE(y_test.Norml2() < 1.e-12);
-+   REQUIRE(y_test.Norml2() < 1.e-12 * std::max(y_ref.Norml2(), 1.0));
-+
-+   if (mesh.Nonconforming())
-+   {
-+      k_ref.ConformingAssemble();
-+   }
-+   k_ref.AssembleDiagonal(d_ref);
-+   k_test.AssembleDiagonal(d_test);
-+
-+   d_test -= d_ref;
-+
-+   // // TODO: Debug
-+   // if (!UsesTensorBasis(fes) && order > 1 &&
-+   //     (pb == Problem::HCurlMass || pb == Problem::CurlCurl) &&
-+   //    d_test.Norml2() > 0.1 * d_ref.Norml2())
-+   // {
-+   //    out << "\nH(CURL) DIAGONAL ASSEMBLY DELTA\n\n";
-+   //    d_test.Print();
-+   //    out << "\nH(CURL) DIAGONAL ASSEMBLY REF\n\n";
-+   //    d_ref.Print();
-+   //    // Vector temp(d_test);
-+   //    // temp += d_ref;
-+   //    // out << "\nH(CURL) DIAGONAL ASSEMBLY TEST\n\n";
-+   //    // temp.Print();
-+   // }
-+
-+   REQUIRE(d_test.Norml2() <
-+           (mesh.Nonconforming() ||
-+            (!UsesTensorBasis(fes) && order > 1 &&
-+             (pb == Problem::HCurlMass || pb == Problem::CurlCurl)) ?
-+            1.0 : 1.e-12) * std::max(d_ref.Norml2(), 1.0));
-+
-+   if (debug)
-+   {
-+      // Estimates only for !bdr_integ, non-mixed meshes
-+      std::size_t mem_test = 0;
-+      if (!bdr_integ && mesh.GetNumGeometries(dim) == 1)
-+      {
-+         const FiniteElement &fe = *fes.GetFE(0);
-+         ElementTransformation &T = *mesh.GetElementTransformation(0);
-+         const int Q = (*k_ref.GetDBFI())[0]->GetRule(fe, T).GetNPoints();
-+         const int P = fe.GetDof();
-+         switch (pb)
-+         {
-+            case Problem::Mass:
-+               mem_test = Q * 1 * 8;
-+               mem_test += P * 4;
-+            case Problem::Diffusion:
-+               mem_test = Q * (dim * (dim + 1)) / 2 * 8;
-+               mem_test += P * 4;
-+               break;
-+            case Problem::HDivMass:
-+               mem_test = Q * (dim * (dim + 1)) / 2 * 8;
-+               mem_test += P * 4;
-+            case Problem::DivDiv:
-+               mem_test = Q * 1 * 8;
-+               mem_test += P * 4;
-+               break;
-+            case Problem::HCurlMass:
-+               mem_test = Q * (dim * (dim + 1)) / 2 * 8;
-+               mem_test += P * 3 * 4;  // Tri-diagonal curl orientations
-+            case Problem::CurlCurl:
-+               mem_test = Q * (dim - bdr_integ < 3 ? 1 : dim * (dim + 1) / 2) * 8;
-+               mem_test += P * 3 * 4;
-+               break;
-+            default:
-+               MFEM_ABORT("Unexpected problem type.");
-+         }
-+         mem_test *= mesh.GetNE();  // Estimate for QFunction memory
-+      }
-+      std::size_t mem_ref = k_ref.SpMat().NumNonZeroElems() * (8 + 4) +
-+                            k_ref.Height() * 4;
-+
-+      out << "\n" << section << "\n";
-+      out << "benchmark (" << fes.GetTrueVSize() << " unknowns)\n"
-+          << "    setup: ref = "
-+          << chrono_setup_ref.RealTime() * 1e3 << " ms\n"
-+          << "           test = "
-+          << chrono_setup_test.RealTime() * 1e3 << " ms\n"
-+          << "    apply: ref = "
-+          << chrono_apply_ref.RealTime() * 1e3 / trials << " ms\n"
-+          << "           test = "
-+          << chrono_apply_test.RealTime() * 1e3 / trials << " ms\n"
-+          << "    mem usage: ref = " << mem_ref / 1e6 << " MB\n"
-+          << "               test = " << mem_test / 1e6 << " MB\n";
-+   }
-    delete gf;
-    delete coeff_fes;
-    delete coeff;
-    delete vcoeff;
-+   delete mcoeff;
-+   delete fec;
- }
- 
--void test_ceed_nloperator(const char* mesh_filename, int order,
-+void test_ceed_mixed_operator(const char *input, int order,
-+                              const CeedCoeffType coeff_type, const Problem pb,
-+                              const AssemblyLevel assembly, bool bdr_integ)
-+{
-+   std::string section = "assembly: " + GetString(assembly) + "\n" +
-+                         "coeff_type: " + GetString(coeff_type) + "\n" +
-+                         "pb: " + GetString(pb) + "\n" +
-+                         "order: " + std::to_string(order) + "\n" +
-+                         (bdr_integ ? "bdr_integ: true\n" : "") +
-+                         "mesh: " + input;
-+   INFO(section);
-+   Mesh mesh(input, 1, 1);
-+   mesh.EnsureNodes();
-+   int dim = mesh.Dimension();
-+   FiniteElementCollection *trial_fec = nullptr, *test_fec = nullptr;
-+   if (pb == Problem::MixedVectorGradient && dim - bdr_integ < 2)
-+   {
-+      // MixedVectorGradient is only supported in 2D or 3D
-+      return;
-+   }
-+   if (pb == Problem::MixedVectorCurl && dim - bdr_integ < 3)
-+   {
-+      // MixedVectorCurl is only supported in 3D
-+      return;
-+   }
-+   switch (pb)
-+   {
-+      case Problem::MixedVectorGradient:
-+         trial_fec = new H1_FECollection(order, dim);
-+         test_fec = new ND_FECollection(order, dim);
-+         break;
-+      case Problem::MixedVectorCurl:
-+         trial_fec = new ND_FECollection(order, dim);
-+         test_fec = new RT_FECollection(order - 1, dim);
-+         break;
-+      default:
-+         MFEM_ABORT("Unexpected problem type.");
-+   }
-+
-+   // Coefficient Initialization
-+   GridFunction *gf = nullptr;
-+   FiniteElementSpace *coeff_fes = nullptr;
-+   Coefficient *coeff = nullptr;
-+   VectorCoefficient *vcoeff = nullptr;
-+   MatrixCoefficient *mcoeff = nullptr;
-+   InitCoeff(mesh, *trial_fec, dim, coeff_type, gf, coeff_fes, coeff, vcoeff,
-+             mcoeff);
-+
-+   // Build the BilinearForm
-+   FiniteElementSpace trial_fes(&mesh, trial_fec);
-+   FiniteElementSpace test_fes(&mesh, test_fec);
-+
-+   MixedBilinearForm k_ref(&trial_fes, &test_fes);
-+   MixedBilinearForm k_test(&trial_fes, &test_fes);
-+   MixedBilinearForm k_test_t(&test_fes, &trial_fes);
-+   auto AddIntegrator = [&bdr_integ](MixedBilinearForm &k,
-+                                     BilinearFormIntegrator *blfi)
-+   {
-+      if (bdr_integ)
-+      {
-+         k.AddBoundaryIntegrator(blfi);
-+      }
-+      else
-+      {
-+         k.AddDomainIntegrator(blfi);
-+      }
-+   };
-+   switch (pb)
-+   {
-+      case Problem::MixedVectorGradient:
-+         if (coeff)
-+         {
-+            AddIntegrator(k_ref, new MixedVectorGradientIntegrator(*coeff));
-+            AddIntegrator(k_test, new MixedVectorGradientIntegrator(*coeff));
-+            AddIntegrator(k_test_t, new MixedVectorWeakDivergenceIntegrator(*coeff));
-+         }
-+         else if (vcoeff)
-+         {
-+            AddIntegrator(k_ref, new MixedVectorGradientIntegrator(*vcoeff));
-+            AddIntegrator(k_test, new MixedVectorGradientIntegrator(*vcoeff));
-+            AddIntegrator(k_test_t, new MixedVectorWeakDivergenceIntegrator(*vcoeff));
-+         }
-+         else if (mcoeff)
-+         {
-+            AddIntegrator(k_ref, new MixedVectorGradientIntegrator(*mcoeff));
-+            AddIntegrator(k_test, new MixedVectorGradientIntegrator(*mcoeff));
-+            AddIntegrator(k_test_t, new MixedVectorWeakDivergenceIntegrator(*mcoeff));
-+         }
-+         break;
-+      case Problem::MixedVectorCurl:
-+         if (coeff)
-+         {
-+            AddIntegrator(k_ref, new MixedVectorCurlIntegrator(*coeff));
-+            AddIntegrator(k_test, new MixedVectorCurlIntegrator(*coeff));
-+            AddIntegrator(k_test_t, new MixedVectorWeakCurlIntegrator(*coeff));
-+         }
-+         else if (vcoeff)
-+         {
-+            AddIntegrator(k_ref, new MixedVectorCurlIntegrator(*vcoeff));
-+            AddIntegrator(k_test, new MixedVectorCurlIntegrator(*vcoeff));
-+            AddIntegrator(k_test_t, new MixedVectorWeakCurlIntegrator(*vcoeff));
-+         }
-+         else if (mcoeff)
-+         {
-+            AddIntegrator(k_ref, new MixedVectorCurlIntegrator(*mcoeff));
-+            AddIntegrator(k_test, new MixedVectorCurlIntegrator(*mcoeff));
-+            AddIntegrator(k_test_t, new MixedVectorWeakCurlIntegrator(*mcoeff));
-+         }
-+         break;
-+      default:
-+         MFEM_ABORT("Unexpected problem type.");
-+   }
-+
-+   k_ref.Assemble();
-+   k_ref.Finalize();
-+
-+   k_test.SetAssemblyLevel(assembly);
-+   k_test.Assemble();
-+
-+   k_test_t.SetAssemblyLevel(assembly);
-+   k_test_t.Assemble();
-+
-+   // Compare ceed with mfem
-+   GridFunction x(&trial_fes), y_ref(&test_fes), y_test(&test_fes);
-+   GridFunction x_t(&test_fes), y_t_ref(&trial_fes), y_t_test(&trial_fes);
-+
-+   x.Randomize(1);
-+
-+   k_ref.Mult(x, y_ref);
-+   k_test.Mult(x, y_test);
-+
-+   y_test -= y_ref;
-+
-+   REQUIRE(y_test.Norml2() < 1.e-12 * std::max(y_ref.Norml2(), 1.0));
-+
-+   x_t.Randomize(1);
-+
-+   k_ref.MultTranspose(x_t, y_t_ref);
-+   k_test_t.Mult(x_t, y_t_test);
-+
-+   y_t_test.Add((pb == Problem::MixedVectorCurl) ? -1.0 : 1.0, y_t_ref);
-+
-+   REQUIRE(y_t_test.Norml2() < 1.e-12 * std::max(y_t_ref.Norml2(), 1.0));
-+   delete gf;
-+   delete coeff_fes;
-+   delete coeff;
-+   delete vcoeff;
-+   delete mcoeff;
-+   delete trial_fec;
-+   delete test_fec;
-+}
-+
-+void test_ceed_nloperator(const char *input, int order,
-                           const CeedCoeffType coeff_type,
-                           const NLProblem pb, const AssemblyLevel assembly)
- {
--   std::string section = "assembly: " + getString(assembly) + "\n" +
--                         "coeff_type: " + getString(coeff_type) + "\n" +
--                         "pb: " + getString(pb) + "\n" +
-+   std::string section = "assembly: " + GetString(assembly) + "\n" +
-+                         "coeff_type: " + GetString(coeff_type) + "\n" +
-+                         "pb: " + GetString(pb) + "\n" +
-                          "order: " + std::to_string(order) + "\n" +
--                         "mesh: " + mesh_filename;
-+                         "mesh: " + input;
-    INFO(section);
--   Mesh mesh(mesh_filename, 1, 1);
-+   Mesh mesh(input, 1, 1);
-    mesh.EnsureNodes();
-    int dim = mesh.Dimension();
-    H1_FECollection fec(order, dim);
-@@ -418,15 +886,18 @@ void test_ceed_nloperator(const char* mesh_filename, int order,
-    FiniteElementSpace *coeff_fes = nullptr;
-    Coefficient *coeff = nullptr;
-    VectorCoefficient *vcoeff = nullptr;
--   InitCoeff(mesh, fec, dim, coeff_type, gf, coeff_fes, coeff, vcoeff);
-+   MatrixCoefficient *mcoeff = nullptr;
-+   InitCoeff(mesh, fec, dim, coeff_type, gf, coeff_fes, coeff, vcoeff, mcoeff);
-+   MFEM_VERIFY(!vcoeff && !mcoeff,
-+               "Unexpected vector- or matrix-valued coefficient in test_ceed_nloperator.");
- 
-    // Build the NonlinearForm
-    bool vecOp = pb == NLProblem::Convection;
-    const int vdim = vecOp ? dim : 1;
-    FiniteElementSpace fes(&mesh, &fec, vdim);
- 
--   NonlinearForm k_test(&fes);
-    NonlinearForm k_ref(&fes);
-+   NonlinearForm k_test(&fes);
-    switch (pb)
-    {
-       case NLProblem::Convection:
-@@ -435,42 +906,48 @@ void test_ceed_nloperator(const char* mesh_filename, int order,
-          break;
-    }
- 
-+   k_ref.Setup();
-    k_test.SetAssemblyLevel(assembly);
-    k_test.Setup();
--   k_ref.Setup();
- 
--   // Compare ceed with mfem.
-+   // Compare ceed with mfem
-    GridFunction x(&fes), y_ref(&fes), y_test(&fes);
- 
-    x.Randomize(1);
- 
--   k_ref.Mult(x,y_ref);
--   k_test.Mult(x,y_test);
-+   k_ref.Mult(x, y_ref);
-+   k_test.Mult(x, y_test);
- 
-    y_test -= y_ref;
- 
--   REQUIRE(y_test.Norml2() < 1.e-12);
-+   REQUIRE(y_test.Norml2() < 1.e-12 * std::max(y_ref.Norml2(), 1.0));
-    delete gf;
-    delete coeff_fes;
-    delete coeff;
-    delete vcoeff;
-+   delete mcoeff;
- }
- 
- // This function specifically tests convection of a vector valued quantity and
- // using a custom integration rule. The integration rule is chosen s.t. in
- // combination with an appropriate order, it can represent the analytical
- // polynomial functions correctly.
--void test_ceed_convection(const char* mesh_filename, int order,
-+void test_ceed_convection(const char *input, int order,
-                           const AssemblyLevel assembly)
- {
--   Mesh mesh(mesh_filename, 1, 1);
-+   std::string section = "assembly: " + GetString(assembly) + "\n" +
-+                         "order: " + std::to_string(order) + "\n" +
-+                         "mesh: " + input;
-+   INFO(section);
-+   Mesh mesh(input, 1, 1);
-    mesh.EnsureNodes();
-    int dim = mesh.Dimension();
-    H1_FECollection fec(order, dim);
- 
-    VectorFunctionCoefficient velocity_coeff(dim, velocity_function);
- 
--   FiniteElementSpace fes(&mesh, &fec, dim);
-+   FiniteElementSpace fes(&mesh, &fec, 1);
-+   FiniteElementSpace vfes(&mesh, &fec, dim);
-    BilinearForm conv_op(&fes);
- 
-    IntegrationRules rules(0, Quadrature1D::GaussLobatto);
-@@ -483,7 +960,7 @@ void test_ceed_convection(const char* mesh_filename, int order,
-    conv_op.SetAssemblyLevel(assembly);
-    conv_op.Assemble();
- 
--   GridFunction q(&fes), r(&fes), ex(&fes);
-+   GridFunction q(&vfes), r(&vfes), ex(&vfes);
- 
-    VectorFunctionCoefficient quantity_coeff(dim, quantity);
-    q.ProjectCoefficient(quantity_coeff);
-@@ -492,9 +969,15 @@ void test_ceed_convection(const char* mesh_filename, int order,
-    ex.ProjectCoefficient(convected_quantity_coeff);
- 
-    r = 0.0;
--   conv_op.Mult(q, r);
-+   for (int i = 0; i < dim; i++)
-+   {
-+      GridFunction qi, ri;
-+      qi.MakeRef(&fes, q, i * fes.GetVSize());
-+      ri.MakeRef(&fes, r, i * fes.GetVSize());
-+      conv_op.Mult(qi, ri);
-+   }
- 
--   LinearForm f(&fes);
-+   LinearForm f(&vfes);
-    VectorDomainLFIntegrator *vlf_integ = new VectorDomainLFIntegrator(
-       convected_quantity_coeff);
-    vlf_integ->SetIntRule(&ir);
-@@ -503,7 +986,360 @@ void test_ceed_convection(const char* mesh_filename, int order,
- 
-    r -= f;
- 
--   REQUIRE(r.Norml2() < 1e-12);
-+   REQUIRE(r.Norml2() < 1.e-12 * std::max(f.Norml2(), 1.0));
-+}
-+
-+void test_ceed_full_assembly(const char *input, int order,
-+                             const AssemblyLevel assembly)
-+{
-+   std::string section = "assembly: " + GetString(assembly) + "\n" +
-+                         "order: " + std::to_string(order) + "\n" +
-+                         "mesh: " + input;
-+   INFO(section);
-+   Mesh mesh(input, 1, 1);
-+   mesh.EnsureNodes();
-+   int dim = mesh.Dimension();
-+   H1_FECollection fec(order, dim);
-+
-+   DenseMatrix val(dim);
-+   val = 0.0;
-+   for (int i = 0; i < dim; i++)
-+   {
-+      val(i, i) = 1.0 + i;
-+   }
-+   MatrixConstantCoefficient diff_coeff(val);
-+   ConstantCoefficient mass_coeff(1.0);
-+
-+   FiniteElementSpace fes(&mesh, &fec, 1);
-+   BilinearForm k_test(&fes);
-+   BilinearForm k_ref(&fes);
-+
-+   k_ref.AddDomainIntegrator(new MassIntegrator(mass_coeff));
-+   k_test.AddDomainIntegrator(new MassIntegrator(mass_coeff));
-+   k_ref.AddBoundaryIntegrator(new MassIntegrator(mass_coeff));
-+   k_test.AddBoundaryIntegrator(new MassIntegrator(mass_coeff));
-+   k_ref.AddDomainIntegrator(new DiffusionIntegrator(diff_coeff));
-+   k_test.AddDomainIntegrator(new DiffusionIntegrator(diff_coeff));
-+
-+   k_ref.Assemble();
-+   k_ref.Finalize();
-+
-+   k_test.SetAssemblyLevel(assembly);
-+   k_test.Assemble();
-+
-+   SparseMatrix *mat_ref = &k_ref.SpMat();
-+   SparseMatrix *mat_test = ceed::CeedOperatorFullAssemble(k_test);
-+   SparseMatrix *mat_diff = Add(1.0, *mat_ref, -1.0, *mat_test);
-+
-+   REQUIRE(mat_diff->MaxNorm() < 1.e-12 * std::max(mat_ref->MaxNorm(), 1.0));
-+   delete mat_diff;
-+   delete mat_test;
-+}
-+
-+void test_ceed_linear_interpolator(const char *input, int order)
-+{
-+   std::string section = "order: " + std::to_string(order) + "\n" +
-+                         "mesh: " + input;
-+   INFO(section);
-+   Mesh mesh(input, 1, 1);
-+   mesh.EnsureNodes();
-+   int dim = mesh.Dimension();
-+   H1_FECollection h1_fec(order, dim);
-+   ND_FECollection nd_fec(order, dim);
-+   RT_FECollection rt_fec(order - 1, dim);
-+
-+   // Build the DiscreteLinearOperator
-+   FiniteElementSpace h1_fes(&mesh, &h1_fec);
-+   FiniteElementSpace nd_fes(&mesh, &nd_fec);
-+   FiniteElementSpace rt_fes(&mesh, &rt_fec);
-+
-+   // Discrete gradient
-+   DiscreteLinearOperator grad_ref(&h1_fes, &nd_fes);
-+   DiscreteLinearOperator grad_test(&h1_fes, &nd_fes);
-+   grad_ref.AddDomainInterpolator(new GradientInterpolator);
-+   grad_test.AddDomainInterpolator(new GradientInterpolator);
-+
-+   // Timer for profiling
-+   const int trials = 1;
-+   const bool debug = false;
-+   StopWatch chrono_setup_grad_ref, chrono_setup_grad_test;
-+   StopWatch chrono_apply_grad_ref, chrono_apply_grad_test;
-+   StopWatch chrono_apply_id_ref, chrono_apply_id_test;
-+   chrono_setup_grad_ref.Clear();
-+   chrono_setup_grad_ref.Start();
-+
-+   grad_ref.Assemble();
-+   grad_ref.Finalize();
-+
-+   chrono_setup_grad_ref.Stop();
-+   chrono_setup_grad_test.Clear();
-+   chrono_setup_grad_test.Start();
-+
-+   grad_test.SetAssemblyLevel(AssemblyLevel::PARTIAL);
-+   grad_test.Assemble();
-+
-+   chrono_setup_grad_test.Stop();
-+
-+   // Compare ceed with mfem
-+   {
-+      GridFunction x(&h1_fes), y_ref(&nd_fes), y_test(&nd_fes);
-+      GridFunction x_t(&nd_fes), y_t_ref(&h1_fes), y_t_test(&h1_fes);
-+
-+      x.Randomize(1);
-+
-+      chrono_apply_grad_ref.Clear();
-+      chrono_apply_grad_ref.Start();
-+
-+      for (int trial = 0; trial < trials; trial++)
-+      {
-+         grad_ref.Mult(x, y_ref);
-+      }
-+
-+      chrono_apply_grad_ref.Stop();
-+      chrono_apply_grad_test.Clear();
-+      chrono_apply_grad_test.Start();
-+
-+      for (int trial = 0; trial < trials; trial++)
-+      {
-+         grad_test.Mult(x, y_test);
-+      }
-+
-+      chrono_apply_grad_test.Stop();
-+
-+      y_test -= y_ref;
-+
-+      REQUIRE(y_test.Norml2() < 1.e-12);
-+
-+      x_t.Randomize(1);
-+
-+      chrono_apply_grad_ref.Start();
-+
-+      for (int trial = 0; trial < trials; trial++)
-+      {
-+         grad_ref.MultTranspose(x_t, y_t_ref);
-+      }
-+
-+      chrono_apply_grad_ref.Stop();
-+      chrono_apply_grad_test.Start();
-+
-+      for (int trial = 0; trial < trials; trial++)
-+      {
-+         grad_test.MultTranspose(x_t, y_t_test);
-+      }
-+
-+      chrono_apply_grad_test.Stop();
-+
-+      y_t_test -= y_t_ref;
-+
-+      REQUIRE(y_t_test.Norml2() < 1.e-12);
-+   }
-+
-+   // Discrete curl
-+   if (dim == 3)
-+   {
-+      DiscreteLinearOperator curl_ref(&nd_fes, &rt_fes);
-+      DiscreteLinearOperator curl_test(&nd_fes, &rt_fes);
-+      curl_ref.AddDomainInterpolator(new CurlInterpolator);
-+      curl_test.AddDomainInterpolator(new CurlInterpolator);
-+
-+      curl_ref.Assemble();
-+      curl_ref.Finalize();
-+
-+      curl_test.SetAssemblyLevel(AssemblyLevel::PARTIAL);
-+      curl_test.Assemble();
-+
-+      // Compare ceed with mfem
-+      {
-+         GridFunction x(&nd_fes), y_ref(&rt_fes), y_test(&rt_fes);
-+         GridFunction x_t(&rt_fes), y_t_ref(&nd_fes), y_t_test(&nd_fes);
-+
-+         x.Randomize(1);
-+
-+         curl_ref.Mult(x, y_ref);
-+         curl_test.Mult(x, y_test);
-+
-+         y_test -= y_ref;
-+
-+         REQUIRE(y_test.Norml2() < 1.e-10);
-+
-+         x_t.Randomize(1);
-+
-+         curl_ref.MultTranspose(x_t, y_t_ref);
-+         curl_test.MultTranspose(x_t, y_t_test);
-+
-+         y_t_test -= y_t_ref;
-+
-+         REQUIRE(y_t_test.Norml2() < 1.e-10);
-+      }
-+   }
-+
-+   // Prolongation and restriction
-+   H1_FECollection fine_h1_fec(order + 1, dim);
-+   ND_FECollection fine_nd_fec(order + 1, dim);
-+   RT_FECollection fine_rt_fec(order, dim);
-+
-+   FiniteElementSpace fine_h1_fes(&mesh, &fine_h1_fec);
-+   FiniteElementSpace fine_nd_fes(&mesh, &fine_nd_fec);
-+   FiniteElementSpace fine_rt_fes(&mesh, &fine_rt_fec);
-+
-+   DiscreteLinearOperator id_h1_test(&h1_fes, &fine_h1_fes);
-+   DiscreteLinearOperator id_nd_test(&nd_fes, &fine_nd_fes);
-+   DiscreteLinearOperator id_rt_test(&rt_fes, &fine_rt_fes);
-+   id_h1_test.AddDomainInterpolator(new IdentityInterpolator);
-+   id_nd_test.AddDomainInterpolator(new IdentityInterpolator);
-+   id_rt_test.AddDomainInterpolator(new IdentityInterpolator);
-+
-+   id_h1_test.SetAssemblyLevel(AssemblyLevel::PARTIAL);
-+   id_h1_test.Assemble();
-+
-+   id_nd_test.SetAssemblyLevel(AssemblyLevel::PARTIAL);
-+   id_nd_test.Assemble();
-+
-+   id_rt_test.SetAssemblyLevel(AssemblyLevel::PARTIAL);
-+   id_rt_test.Assemble();
-+
-+   TransferOperator id_h1_ref(h1_fes, fine_h1_fes);
-+   TransferOperator id_nd_ref(nd_fes, fine_nd_fes);
-+   TransferOperator id_rt_ref(rt_fes, fine_rt_fes);
-+
-+   // Compare ceed with mfem
-+   for (int t = 0; t < 3; t++)
-+   {
-+      GridFunction *x = nullptr, *y_t_ref = nullptr, *y_t_test = nullptr;
-+      GridFunction *y_exact = nullptr, *y_ref = nullptr, *y_test = nullptr;
-+      switch (t)
-+      {
-+         case 0:
-+            x = new GridFunction(&h1_fes);
-+            y_t_ref = new GridFunction(&h1_fes);
-+            y_t_test = new GridFunction(&h1_fes);
-+            y_exact = new GridFunction(&fine_h1_fes);
-+            y_ref = new GridFunction(&fine_h1_fes);
-+            y_test = new GridFunction(&fine_h1_fes);
-+            break;
-+         case 1:
-+            x = new GridFunction(&nd_fes);
-+            y_t_ref = new GridFunction(&nd_fes);
-+            y_t_test = new GridFunction(&nd_fes);
-+            y_exact = new GridFunction(&fine_nd_fes);
-+            y_ref = new GridFunction(&fine_nd_fes);
-+            y_test = new GridFunction(&fine_nd_fes);
-+            break;
-+         case 2:
-+            x = new GridFunction(&rt_fes);
-+            y_t_ref = new GridFunction(&rt_fes);
-+            y_t_test = new GridFunction(&rt_fes);
-+            y_exact = new GridFunction(&fine_rt_fes);
-+            y_ref = new GridFunction(&fine_rt_fes);
-+            y_test = new GridFunction(&fine_rt_fes);
-+            break;
-+         default:
-+            MFEM_ABORT("Unexpected problem type.");
-+      }
-+
-+      if (t == 0)
-+      {
-+         FunctionCoefficient f_coeff(coeff_function);
-+         x->ProjectCoefficient(f_coeff);
-+         y_exact->ProjectCoefficient(f_coeff);
-+      }
-+      else
-+      {
-+         VectorFunctionCoefficient vf_coeff(dim, velocity_function);
-+         x->ProjectCoefficient(vf_coeff);
-+         y_exact->ProjectCoefficient(vf_coeff);
-+      }
-+
-+      chrono_apply_id_ref.Clear();
-+      chrono_apply_id_ref.Start();
-+
-+      for (int trial = 0; trial < trials; trial++)
-+      {
-+         switch (t)
-+         {
-+            case 0:
-+               id_h1_ref.Mult(*x, *y_ref);
-+               id_h1_ref.MultTranspose(*y_exact, *y_t_ref);
-+               break;
-+            case 1:
-+               id_nd_ref.Mult(*x, *y_ref);
-+               id_nd_ref.MultTranspose(*y_exact, *y_t_ref);
-+               break;
-+            case 2:
-+               id_rt_ref.Mult(*x, *y_ref);
-+               id_rt_ref.MultTranspose(*y_exact, *y_t_ref);
-+               break;
-+            default:
-+               MFEM_ABORT("Unexpected problem type.");
-+         }
-+      }
-+
-+      chrono_apply_id_ref.Stop();
-+      chrono_apply_id_test.Clear();
-+      chrono_apply_id_test.Start();
-+
-+      for (int trial = 0; trial < trials; trial++)
-+      {
-+         switch (t)
-+         {
-+            case 0:
-+               id_h1_test.Mult(*x, *y_test);
-+               id_h1_test.MultTranspose(*y_exact, *y_t_test);
-+               break;
-+            case 1:
-+               id_nd_test.Mult(*x, *y_test);
-+               id_nd_test.MultTranspose(*y_exact, *y_t_test);
-+               break;
-+            case 2:
-+               id_rt_test.Mult(*x, *y_test);
-+               id_rt_test.MultTranspose(*y_exact, *y_t_test);
-+               break;
-+            default:
-+               MFEM_ABORT("Unexpected problem type.");
-+         }
-+      }
-+
-+      chrono_apply_id_test.Stop();
-+
-+      *y_test -= *y_ref;
-+
-+      REQUIRE(y_test->Norml2() < 1.e-10);
-+
-+      *y_t_test -= *y_t_ref;
-+
-+      REQUIRE(y_t_test->Norml2() < 1.e-10);
-+      delete x;
-+      delete y_t_ref;
-+      delete y_t_test;
-+      delete y_exact;
-+      delete y_ref;
-+      delete y_test;
-+   }
-+
-+   if (debug)
-+   {
-+      out << "\n" << section << "\n";
-+      out << "benchmark (unknowns: H1: " << h1_fes.GetTrueVSize()
-+          << ", ND: " << nd_fes.GetTrueVSize()
-+          << ", RT: " << rt_fes.GetTrueVSize() << ",\n"
-+          << "                     fine H1: " << fine_h1_fes.GetTrueVSize()
-+          << ", fine ND: " << fine_nd_fes.GetTrueVSize()
-+          << ", fine RT: " << fine_rt_fes.GetTrueVSize() << ")\n"
-+          << "    discrete gradient interpolator\n"
-+          << "    setup: ref = "
-+          << chrono_setup_grad_ref.RealTime() * 1e3 << " ms\n"
-+          << "           test = "
-+          << chrono_setup_grad_test.RealTime() * 1e3 << " ms\n"
-+          << "    apply: ref = "
-+          << chrono_apply_grad_ref.RealTime() * 1e3 / trials << " ms\n"
-+          << "           test = "
-+          << chrono_apply_grad_test.RealTime() * 1e3 / trials << " ms\n"
-+          << "    identity interpolator\n"
-+          << "    apply: ref = "
-+          << chrono_apply_id_ref.RealTime() * 1e3 / trials << " ms\n"
-+          << "           test = "
-+          << chrono_apply_id_test.RealTime() * 1e3 / trials << " ms\n";
-+   }
- }
- 
- TEST_CASE("CEED mass & diffusion", "[CEED]")
-@@ -513,17 +1349,20 @@ TEST_CASE("CEED mass & diffusion", "[CEED]")
-                               CeedCoeffType::Quad);
-    auto pb = GENERATE(Problem::Mass,Problem::Diffusion,Problem::MassDiffusion,
-                       Problem::VectorMass,Problem::VectorDiffusion);
--   auto order = GENERATE(1);
-+   auto order = GENERATE(1,2);
-+   auto bdr_integ = GENERATE(false,true);
-    auto mesh = GENERATE("../../data/inline-quad.mesh",
-                         "../../data/inline-hex.mesh",
--                        "../../data/periodic-square.mesh",
-+                        "../../data/inline-tri.mesh",
-+                        "../../data/inline-tet.mesh",
-                         "../../data/star-q2.mesh",
-                         "../../data/fichera-q2.mesh",
-                         "../../data/amr-quad.mesh",
-                         "../../data/fichera-amr.mesh",
-                         "../../data/square-mixed.mesh",
-                         "../../data/fichera-mixed.mesh");
--   test_ceed_operator(mesh, order, coeff_type, pb, assembly);
-+   bool mixed_p = false;
-+   test_ceed_operator(mesh, order, coeff_type, pb, assembly, mixed_p, bdr_integ);
- } // test case
- 
- TEST_CASE("CEED p-adaptivity", "[CEED]")
-@@ -539,46 +1378,99 @@ TEST_CASE("CEED p-adaptivity", "[CEED]")
-                         "../../data/star-q2.mesh",
-                         "../../data/amr-quad.mesh",
-                         "../../data/square-mixed.mesh");
--   test_mixed_p_ceed_operator(mesh, order, coeff_type, pb, assembly);
-+   bool mixed_p = true;
-+   bool bdr_integ = false;
-+   test_ceed_operator(mesh, order, coeff_type, pb, assembly, mixed_p, bdr_integ);
- } // test case
- 
--TEST_CASE("CEED convection low", "[CEED],[Convection]")
-+TEST_CASE("CEED vector and matrix coefficients and vector FE operators",
-+          "[CEED], [VectorFE]")
- {
-    auto assembly = GENERATE(AssemblyLevel::PARTIAL,AssemblyLevel::NONE);
--   auto coeff_type = GENERATE(CeedCoeffType::VecConst,CeedCoeffType::VecGrid,
--                              CeedCoeffType::VecQuad);
-+   auto coeff_type = GENERATE(CeedCoeffType::Const,CeedCoeffType::Quad,
-+                              CeedCoeffType::VecConst,CeedCoeffType::VecQuad,
-+                              CeedCoeffType::MatConst,CeedCoeffType::MatQuad);
-+   auto pb = GENERATE(Problem::Mass,Problem::Diffusion,
-+                      Problem::HDivMass,Problem::DivDiv,
-+                      Problem::HCurlMass,Problem::CurlCurl);
-+   auto order = GENERATE(1,3);
-+   auto bdr_integ = GENERATE(false,true);
-    auto mesh = GENERATE("../../data/inline-quad.mesh",
-                         "../../data/inline-hex.mesh",
-+                        "../../data/inline-tri.mesh",
-+                        "../../data/inline-tet.mesh",
-                         "../../data/star-q2.mesh",
-                         "../../data/fichera-q2.mesh",
-                         "../../data/amr-quad.mesh",
-                         "../../data/fichera-amr.mesh",
-                         "../../data/square-mixed.mesh",
-                         "../../data/fichera-mixed.mesh");
--   Problem pb = Problem::Convection;
-+   test_ceed_vectorfe_operator(mesh, order, coeff_type, pb, assembly, bdr_integ);
-+} // test case
- 
--   // Test that the CEED and MFEM integrators give the same answer
--   int low_order = 1;
--   test_ceed_operator(mesh, low_order, coeff_type, pb, assembly);
-+TEST_CASE("CEED mixed integrators",
-+          "[CEED], [MixedVectorIntegrator], [VectorFE]")
-+{
-+   auto assembly = GENERATE(AssemblyLevel::PARTIAL,AssemblyLevel::NONE);
-+   auto coeff_type = GENERATE(CeedCoeffType::Const,CeedCoeffType::Quad,
-+                              CeedCoeffType::VecConst,CeedCoeffType::VecQuad,
-+                              CeedCoeffType::MatConst,CeedCoeffType::MatQuad);
-+   auto pb = GENERATE(Problem::MixedVectorGradient,Problem::MixedVectorCurl);
-+   auto order = GENERATE(2);
-+   auto bdr_integ = GENERATE(false,true);
-+   auto mesh = GENERATE("../../data/inline-quad.mesh",
-+                        "../../data/inline-hex.mesh",
-+                        "../../data/inline-tri.mesh",
-+                        "../../data/inline-tet.mesh",
-+                        "../../data/star-q2.mesh",
-+                        "../../data/fichera-q2.mesh",
-+                        "../../data/amr-quad.mesh",
-+                        "../../data/fichera-amr.mesh",
-+                        "../../data/square-mixed.mesh",
-+                        "../../data/fichera-mixed.mesh");
-+   test_ceed_mixed_operator(mesh, order, coeff_type, pb, assembly, bdr_integ);
- } // test case
- 
--TEST_CASE("CEED convection high", "[CEED],[Convection]")
-+TEST_CASE("CEED convection low", "[CEED], [Convection]")
- {
-    auto assembly = GENERATE(AssemblyLevel::PARTIAL,AssemblyLevel::NONE);
-+   auto coeff_type = GENERATE(CeedCoeffType::VecConst,CeedCoeffType::VecGrid,
-+                              CeedCoeffType::VecQuad);
-    auto mesh = GENERATE("../../data/inline-quad.mesh",
-                         "../../data/inline-hex.mesh",
-+                        "../../data/inline-tri.mesh",
-+                        "../../data/inline-tet.mesh",
-+                        "../../data/periodic-square.mesh",
-                         "../../data/star-q2.mesh",
-                         "../../data/fichera-q2.mesh",
-                         "../../data/amr-quad.mesh",
--                        "../../data/fichera-amr.mesh");
-+                        "../../data/fichera-amr.mesh",
-+                        "../../data/square-mixed.mesh",
-+                        "../../data/fichera-mixed.mesh");
-+   Problem pb = Problem::Convection;
-+   int low_order = 1;
-+   bool mixed_p = false;
-+   bool bdr_integ = false;
-+   test_ceed_operator(mesh, low_order, coeff_type, pb, assembly, mixed_p,
-+                      bdr_integ);
-+} // test case
- 
-+TEST_CASE("CEED convection high", "[CEED], [Convection]")
-+{
-    // Apply the CEED convection integrator applied to a vector quantity, check
-    // that we get the exact answer (with sufficiently high polynomial degree)
-+   auto assembly = GENERATE(AssemblyLevel::PARTIAL,AssemblyLevel::NONE);
-+   auto mesh = GENERATE("../../data/inline-quad.mesh",
-+                        "../../data/periodic-square.mesh",
-+                        "../../data/star-q2.mesh",
-+                        "../../data/fichera-q2.mesh",
-+                        "../../data/amr-quad.mesh",
-+                        "../../data/fichera-amr.mesh");
-    int high_order = 4;
-    test_ceed_convection(mesh, high_order, assembly);
- } // test case
- 
--TEST_CASE("CEED non-linear convection", "[CEED],[NLConvection]")
-+TEST_CASE("CEED nonlinear convection", "[CEED], [NLConvection]")
- {
-    auto assembly = GENERATE(AssemblyLevel::PARTIAL,AssemblyLevel::NONE);
-    auto coeff_type = GENERATE(CeedCoeffType::Const,CeedCoeffType::Grid,
-@@ -587,6 +1479,8 @@ TEST_CASE("CEED non-linear convection", "[CEED],[NLConvection]")
-    auto order = GENERATE(1);
-    auto mesh = GENERATE("../../data/inline-quad.mesh",
-                         "../../data/inline-hex.mesh",
-+                        "../../data/inline-tri.mesh",
-+                        "../../data/inline-tet.mesh",
-                         "../../data/periodic-square.mesh",
-                         "../../data/star-q2.mesh",
-                         "../../data/fichera.mesh",
-@@ -595,6 +1489,35 @@ TEST_CASE("CEED non-linear convection", "[CEED],[NLConvection]")
-    test_ceed_nloperator(mesh, order, coeff_type, pb, assembly);
- } // test case
- 
-+TEST_CASE("CEED full assembly", "[CEED]")
-+{
-+   auto assembly = GENERATE(AssemblyLevel::PARTIAL,AssemblyLevel::NONE);
-+   auto mesh = GENERATE("../../data/inline-quad.mesh",
-+                        "../../data/inline-hex.mesh",
-+                        "../../data/star-q2.mesh",
-+                        "../../data/fichera-q2.mesh",
-+                        "../../data/amr-quad.mesh",
-+                        "../../data/fichera-amr.mesh",
-+                        "../../data/square-mixed.mesh",
-+                        "../../data/fichera-mixed.mesh");
-+   int order = 1;
-+   test_ceed_full_assembly(mesh, order, assembly);
-+} // test case
-+
-+TEST_CASE("CEED linear interpolators", "[CEED]")
-+{
-+   auto mesh = GENERATE("../../data/inline-quad.mesh",
-+                        "../../data/inline-hex.mesh",
-+                        "../../data/star-q2.mesh",
-+                        "../../data/fichera-q2.mesh",
-+                        "../../data/amr-quad.mesh",
-+                        "../../data/fichera-amr.mesh",
-+                        "../../data/square-mixed.mesh",
-+                        "../../data/fichera-mixed.mesh");
-+   int order = 2;
-+   test_ceed_linear_interpolator(mesh, order);
-+} // test case
-+
- #endif
- 
- } // namespace ceed_test
-diff --git a/tests/unit/enzyme/compatibility.cpp b/tests/unit/enzyme/compatibility.cpp
-index 8cbb658d1..118f11b12 100644
---- a/tests/unit/enzyme/compatibility.cpp
-+++ b/tests/unit/enzyme/compatibility.cpp
-@@ -8,38 +8,40 @@
- template<typename VectorT>
- void square(const VectorT& v, double& y)
- {
--  for (int i = 0; i < 4; i++) {
--    y += v[i]*v[i];
--  }
-+   for (int i = 0; i < 4; i++)
-+   {
-+      y += v[i]*v[i];
-+   }
- }
- 
- template<typename VectorT>
- void dsquare(const VectorT& v, double& y, VectorT& dydv)
- {
--  double seed = 1.0;
--  __enzyme_autodiff<void>(square<VectorT>, &v, &dydv, &y, &seed);
-+   double seed = 1.0;
-+   __enzyme_autodiff<void>(square<VectorT>, &v, &dydv, &y, &seed);
- }
- 
- template<typename VectorT>
--void run_test() {
--  VectorT v(4);
--  v[0] = 2.0;
--  v[1] = 3.0;
--  v[2] = 1.0;
--  v[3] = 7.0;
--
--  double yy = 0;
--  VectorT dydv(4);
--  dydv[0] = 0;
--  dydv[1] = 0;
--  dydv[2] = 0;
--  dydv[3] = 0;
--  dsquare(v, yy, dydv);
--
--  REQUIRE(dydv[0] == MFEM_Approx(4.0));
--  REQUIRE(dydv[1] == MFEM_Approx(6.0));
--  REQUIRE(dydv[2] == MFEM_Approx(2.0));
--  REQUIRE(dydv[3] == MFEM_Approx(14.0));
-+void run_test()
-+{
-+   VectorT v(4);
-+   v[0] = 2.0;
-+   v[1] = 3.0;
-+   v[2] = 1.0;
-+   v[3] = 7.0;
-+
-+   double yy = 0;
-+   VectorT dydv(4);
-+   dydv[0] = 0;
-+   dydv[1] = 0;
-+   dydv[2] = 0;
-+   dydv[3] = 0;
-+   dsquare(v, yy, dydv);
-+
-+   REQUIRE(dydv[0] == MFEM_Approx(4.0));
-+   REQUIRE(dydv[1] == MFEM_Approx(6.0));
-+   REQUIRE(dydv[2] == MFEM_Approx(2.0));
-+   REQUIRE(dydv[3] == MFEM_Approx(14.0));
- }
- 
- TEST_CASE("AD Vector implementation", "[Enzyme]")
-diff --git a/tests/unit/fem/test_assemblediagonalpa.cpp b/tests/unit/fem/test_assemblediagonalpa.cpp
-index 050561e1d..42cdc76ba 100644
---- a/tests/unit/fem/test_assemblediagonalpa.cpp
-+++ b/tests/unit/fem/test_assemblediagonalpa.cpp
-@@ -447,8 +447,8 @@ TEST_CASE("Hcurl/Hdiv diagonal PA",
-                      else
-                      {
-                         const FiniteElement *fel = fespace.GetFE(0);
--                        const IntegrationRule *intRule = &MassIntegrator::GetRule(*fel, *fel,
--                                                                                  *mesh.GetElementTransformation(0));
-+                        ElementTransformation *T = mesh.GetElementTransformation(0);
-+                        const IntegrationRule *intRule = &MassIntegrator::GetRuleStatic(*fel, *fel, *T);
- 
-                         if (spaceType == Hcurl)
-                         {
-diff --git a/tests/unit/fem/test_pa_coeff.cpp b/tests/unit/fem/test_pa_coeff.cpp
-index b45738fad..270359f3c 100644
---- a/tests/unit/fem/test_pa_coeff.cpp
-+++ b/tests/unit/fem/test_pa_coeff.cpp
-@@ -530,8 +530,8 @@ TEST_CASE("Hcurl/Hdiv PA Coefficient",
-                         if (spaceType == Hcurl)
-                         {
-                            const FiniteElement *fel = fespace.GetFE(0);
--                           const IntegrationRule *intRule = &MassIntegrator::GetRule(*fel, *fel,
--                                                                                     *mesh.GetElementTransformation(0));
-+                           ElementTransformation *T = mesh.GetElementTransformation(0);
-+                           const IntegrationRule *intRule = &MassIntegrator::GetRuleStatic(*fel, *fel, *T);
- 
-                            if (coeffType >= 3 && dimension == 3)
-                            {
diff --git a/palace/deps/patch/mfem/patch_pa_prereq.diff b/palace/deps/patch/mfem/patch_pa_prereq.diff
deleted file mode 100644
index 647d15c1d..000000000
--- a/palace/deps/patch/mfem/patch_pa_prereq.diff
+++ /dev/null
@@ -1,41184 +0,0 @@
-diff --git a/CHANGELOG b/CHANGELOG
-index eb4f4e4bd..1026d6957 100644
---- a/CHANGELOG
-+++ b/CHANGELOG
-@@ -22,6 +22,10 @@ Version 4.5.3 (development)
-   338. Added the tmop-metric-magnitude tool for tracking how metrics change
-   under geometric perturbations.
- 
-+- Reorganized files for bilinear form, linear form, and nonlinear form integrators
-+  in the fem/integ/ subdirectory.
-+
-+
- New and updated examples and miniapps
- -------------------------------------
- - Added a miniapp pmesh-fitting in miniapps/meshing for interface and boundary
-@@ -53,6 +57,7 @@ Integrations, testing and documentation
- - Added an address sanitizer GitHub action for a serial build/test on Ubuntu,
-   based on Clang/LLVM (https://clang.llvm.org/docs/AddressSanitizer.html).
- 
-+
- Version 4.5.2, released on March 23, 2023
- =========================================
- 
-diff --git a/CMakeLists.txt b/CMakeLists.txt
-index 32112b549..9e46030ad 100644
---- a/CMakeLists.txt
-+++ b/CMakeLists.txt
-@@ -218,10 +218,7 @@ if (MFEM_USE_HIP)
- endif()
- 
- # OpenMP
--if (MFEM_USE_OPENMP OR MFEM_USE_LEGACY_OPENMP)
--  if (NOT MFEM_THREAD_SAFE AND MFEM_USE_LEGACY_OPENMP)
--    message(FATAL_ERROR " *** MFEM_USE_LEGACY_OPENMP requires MFEM_THREAD_SAFE=ON.")
--  endif()
-+if (MFEM_USE_OPENMP)
-   find_package(OpenMP REQUIRED)
-   set(OPENMP_LIBRARIES ${OpenMP_CXX_LIBRARIES})
-   if(APPLE)
-diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
-index ba372df94..a6111c3e0 100644
---- a/CONTRIBUTING.md
-+++ b/CONTRIBUTING.md
-@@ -121,6 +121,7 @@ The MFEM source code has the following structure:
-   ├── fem
-   │   ├── ceed
-   │   ├── fe
-+  │   ├── integ
-   │   ├── lor
-   │   ├── moonolith
-   │   ├── qinterp
-diff --git a/INSTALL b/INSTALL
-index cb092cc1b..e48618821 100644
---- a/INSTALL
-+++ b/INSTALL
-@@ -309,10 +309,6 @@ MFEM_THREAD_SAFE = YES/NO
-    Use thread-safe implementation for some classes/methods. This comes at the
-    cost of extra memory allocation and de-allocation.
- 
--MFEM_USE_LEGACY_OPENMP = YES/NO
--   Enable (basic) experimental OpenMP support. Requires MFEM_THREAD_SAFE.
--   This option is deprecated.
--
- MFEM_USE_OPENMP = YES/NO
-    Enable the OpenMP backend.
- 
-@@ -611,8 +607,8 @@ The specific libraries and their options are:
-        http://math-atlas.sourceforge.net (ATLAS)
-   Options: LAPACK_OPT (currently not used/needed), LAPACK_LIB.
- 
--- OpenMP (optional), usually part of compiler, used when either MFEM_USE_OPENMP
--  or MFEM_USE_LEGACY_OPENMP is set to YES.
-+- OpenMP (optional), usually part of compiler, used when MFEM_USE_OPENMP is set
-+  to YES.
-   Options: OPENMP_OPT, OPENMP_LIB.
- 
- - High-resolution POSIX clocks: when using MFEM_TIMER_TYPE = 2, it may be
-@@ -956,7 +952,6 @@ MFEM_USE_METIS - Set to ${MFEM_USE_MPI}, can be overwritten.
- MFEM_USE_LIBUNWIND
- MFEM_USE_LAPACK
- MFEM_THREAD_SAFE
--MFEM_USE_LEGACY_OPENMP
- MFEM_USE_OPENMP
- MFEM_USE_MEMALLOC
- MFEM_TIMER_TYPE - Set automatically, can be overwritten.
-diff --git a/config/cmake/MFEMConfig.cmake.in b/config/cmake/MFEMConfig.cmake.in
-index 9d5eef52f..43b6d0671 100644
---- a/config/cmake/MFEMConfig.cmake.in
-+++ b/config/cmake/MFEMConfig.cmake.in
-@@ -25,7 +25,6 @@ set(MFEM_USE_LIBUNWIND @MFEM_USE_LIBUNWIND@)
- set(MFEM_USE_LAPACK @MFEM_USE_LAPACK@)
- set(MFEM_THREAD_SAFE @MFEM_THREAD_SAFE@)
- set(MFEM_USE_OPENMP @MFEM_USE_OPENMP@)
--set(MFEM_USE_LEGACY_OPENMP @MFEM_USE_LEGACY_OPENMP@)
- set(MFEM_USE_MEMALLOC @MFEM_USE_MEMALLOC@)
- set(MFEM_TIMER_TYPE @MFEM_TIMER_TYPE@)
- set(MFEM_USE_SUNDIALS @MFEM_USE_SUNDIALS@)
-diff --git a/config/cmake/config.hpp.in b/config/cmake/config.hpp.in
-index 7e820088a..ba75ff79e 100644
---- a/config/cmake/config.hpp.in
-+++ b/config/cmake/config.hpp.in
-@@ -74,9 +74,6 @@
- // Enable the OpenMP backend.
- #cmakedefine MFEM_USE_OPENMP
- 
--// [Deprecated] Enable experimental OpenMP support. Requires MFEM_THREAD_SAFE.
--#cmakedefine MFEM_USE_LEGACY_OPENMP
--
- // Internal MFEM option: enable group/batch allocation for some small objects.
- #cmakedefine MFEM_USE_MEMALLOC
- 
-diff --git a/config/cmake/modules/MfemCmakeUtilities.cmake b/config/cmake/modules/MfemCmakeUtilities.cmake
-index 204b7d87f..9a629330b 100644
---- a/config/cmake/modules/MfemCmakeUtilities.cmake
-+++ b/config/cmake/modules/MfemCmakeUtilities.cmake
-@@ -843,16 +843,16 @@ function(mfem_export_mk_files)
-   # Convert Boolean vars to YES/NO without writing the values to cache
-   set(CONFIG_MK_BOOL_VARS MFEM_USE_MPI MFEM_USE_METIS MFEM_USE_METIS_5
-       MFEM_DEBUG MFEM_USE_EXCEPTIONS MFEM_USE_ZLIB MFEM_USE_LIBUNWIND
--      MFEM_USE_LAPACK MFEM_THREAD_SAFE MFEM_USE_LEGACY_OPENMP MFEM_USE_OPENMP
--      MFEM_USE_MEMALLOC MFEM_USE_SUNDIALS MFEM_USE_SUITESPARSE
--      MFEM_USE_SUPERLU MFEM_USE_SUPERLU5 MFEM_USE_MUMPS MFEM_USE_STRUMPACK
--      MFEM_USE_GINKGO MFEM_USE_AMGX MFEM_USE_GNUTLS MFEM_USE_NETCDF
--      MFEM_USE_PETSC MFEM_USE_SLEPC MFEM_USE_MPFR MFEM_USE_SIDRE MFEM_USE_FMS
--      MFEM_USE_CONDUIT MFEM_USE_PUMI MFEM_USE_HIOP MFEM_USE_GSLIB MFEM_USE_CUDA
--      MFEM_USE_HIP MFEM_USE_RAJA MFEM_USE_OCCA MFEM_USE_CEED MFEM_USE_CALIPER
--      MFEM_USE_UMPIRE MFEM_USE_SIMD MFEM_USE_ADIOS2 MFEM_USE_MKL_CPARDISO
--      MFEM_USE_ADFORWARD MFEM_USE_CODIPACK MFEM_USE_BENCHMARK MFEM_USE_PARELAG
--      MFEM_USE_MOONOLITH MFEM_USE_ALGOIM MFEM_USE_ENZYME)
-+      MFEM_USE_LAPACK MFEM_THREAD_SAFE MFEM_USE_OPENMP MFEM_USE_MEMALLOC
-+      MFEM_USE_SUNDIALS MFEM_USE_SUITESPARSE MFEM_USE_SUPERLU MFEM_USE_SUPERLU5
-+      MFEM_USE_MUMPS MFEM_USE_STRUMPACK MFEM_USE_GINKGO MFEM_USE_AMGX
-+      MFEM_USE_GNUTLS MFEM_USE_NETCDF MFEM_USE_PETSC MFEM_USE_SLEPC
-+      MFEM_USE_MPFR MFEM_USE_SIDRE MFEM_USE_FMS MFEM_USE_CONDUIT MFEM_USE_PUMI
-+      MFEM_USE_HIOP MFEM_USE_GSLIB MFEM_USE_CUDA MFEM_USE_HIP MFEM_USE_RAJA
-+      MFEM_USE_OCCA MFEM_USE_CEED MFEM_USE_CALIPER MFEM_USE_UMPIRE
-+      MFEM_USE_SIMD MFEM_USE_ADIOS2 MFEM_USE_MKL_CPARDISO MFEM_USE_ADFORWARD
-+      MFEM_USE_CODIPACK MFEM_USE_BENCHMARK MFEM_USE_PARELAG MFEM_USE_MOONOLITH
-+      MFEM_USE_ALGOIM MFEM_USE_ENZYME)
-   foreach(var ${CONFIG_MK_BOOL_VARS})
-     if (${var})
-       set(${var} YES)
-diff --git a/config/config.hpp.in b/config/config.hpp.in
-index 76145927b..d82bf192c 100644
---- a/config/config.hpp.in
-+++ b/config/config.hpp.in
-@@ -74,9 +74,6 @@
- // Enable the OpenMP backend.
- // #define MFEM_USE_OPENMP
- 
--// [Deprecated] Enable experimental OpenMP support. Requires MFEM_THREAD_SAFE.
--// #define MFEM_USE_LEGACY_OPENMP
--
- // Internal MFEM option: enable group/batch allocation for some small objects.
- // #define MFEM_USE_MEMALLOC
- 
-diff --git a/config/config.mk.in b/config/config.mk.in
-index baf5c2955..8858d01b8 100644
---- a/config/config.mk.in
-+++ b/config/config.mk.in
-@@ -24,7 +24,6 @@ MFEM_USE_ZLIB          = @MFEM_USE_ZLIB@
- MFEM_USE_LIBUNWIND     = @MFEM_USE_LIBUNWIND@
- MFEM_USE_LAPACK        = @MFEM_USE_LAPACK@
- MFEM_THREAD_SAFE       = @MFEM_THREAD_SAFE@
--MFEM_USE_LEGACY_OPENMP = @MFEM_USE_LEGACY_OPENMP@
- MFEM_USE_OPENMP        = @MFEM_USE_OPENMP@
- MFEM_USE_MEMALLOC      = @MFEM_USE_MEMALLOC@
- MFEM_TIMER_TYPE        = @MFEM_TIMER_TYPE@
-diff --git a/config/defaults.cmake b/config/defaults.cmake
-index d5104092b..a72be813f 100644
---- a/config/defaults.cmake
-+++ b/config/defaults.cmake
-@@ -28,7 +28,6 @@ option(MFEM_USE_LIBUNWIND "Enable backtrace for errors." OFF)
- option(MFEM_USE_LAPACK "Enable LAPACK usage" OFF)
- option(MFEM_THREAD_SAFE "Enable thread safety" OFF)
- option(MFEM_USE_OPENMP "Enable the OpenMP backend" OFF)
--option(MFEM_USE_LEGACY_OPENMP "Enable legacy OpenMP usage" OFF)
- option(MFEM_USE_MEMALLOC "Enable the internal MEMALLOC option." ON)
- option(MFEM_USE_SUNDIALS "Enable SUNDIALS usage" OFF)
- option(MFEM_USE_SUITESPARSE "Enable SuiteSparse usage" OFF)
-diff --git a/config/defaults.mk b/config/defaults.mk
-index ca5dc3c45..364627756 100644
---- a/config/defaults.mk
-+++ b/config/defaults.mk
-@@ -127,7 +127,6 @@ MFEM_USE_LIBUNWIND     = NO
- MFEM_USE_LAPACK        = NO
- MFEM_THREAD_SAFE       = NO
- MFEM_USE_OPENMP        = NO
--MFEM_USE_LEGACY_OPENMP = NO
- MFEM_USE_MEMALLOC      = YES
- MFEM_TIMER_TYPE        = $(if $(NOTMAC),2,4)
- MFEM_USE_SUNDIALS      = NO
-diff --git a/fem/CMakeLists.txt b/fem/CMakeLists.txt
-index 462ef72aa..6da0cfea3 100644
---- a/fem/CMakeLists.txt
-+++ b/fem/CMakeLists.txt
-@@ -13,28 +13,39 @@ set(SRCS
-   bilinearform.cpp
-   bilinearform_ext.cpp
-   bilininteg.cpp
--  bilininteg_br2.cpp
--  bilininteg_convection_mf.cpp
--  bilininteg_convection_pa.cpp
--  bilininteg_convection_ea.cpp
--  bilininteg_dgtrace_pa.cpp
--  bilininteg_dgtrace_ea.cpp
--  bilininteg_diffusion_mf.cpp
--  bilininteg_diffusion_pa.cpp
--  bilininteg_diffusion_ea.cpp
--  bilininteg_divergence.cpp
--  bilininteg_hcurl.cpp
--  bilininteg_hdiv.cpp
--  bilininteg_vectorfe.cpp
--  bilininteg_gradient.cpp
--  bilininteg_mass_mf.cpp
--  bilininteg_mass_pa.cpp
--  bilininteg_mass_ea.cpp
--  bilininteg_transpose_ea.cpp
--  bilininteg_vecdiffusion.cpp
--  bilininteg_vecdiffusion_mf.cpp
--  bilininteg_vecmass.cpp
--  bilininteg_vecmass_mf.cpp
-+  integ/bilininteg_br2.cpp
-+  integ/bilininteg_convection_mf.cpp
-+  integ/bilininteg_convection_pa.cpp
-+  integ/bilininteg_convection_ea.cpp
-+  integ/bilininteg_curlcurl_pa.cpp
-+  integ/bilininteg_dgtrace_pa.cpp
-+  integ/bilininteg_dgtrace_ea.cpp
-+  integ/bilininteg_diffusion_mf.cpp
-+  integ/bilininteg_diffusion_pa.cpp
-+  integ/bilininteg_diffusion_ea.cpp
-+  integ/bilininteg_divdiv_pa.cpp
-+  integ/bilininteg_gradient_pa.cpp
-+  integ/bilininteg_interp_pa.cpp
-+  integ/bilininteg_mass_mf.cpp
-+  integ/bilininteg_mass_pa.cpp
-+  integ/bilininteg_mass_ea.cpp
-+  integ/bilininteg_mixedcurl_pa.cpp
-+  integ/bilininteg_mixedvecgrad_pa.cpp
-+  integ/bilininteg_transpose_ea.cpp
-+  integ/bilininteg_vecdiffusion_mf.cpp
-+  integ/bilininteg_vecdiffusion_pa.cpp
-+  integ/bilininteg_vecdiv_pa.cpp
-+  integ/bilininteg_vecmass_mf.cpp
-+  integ/bilininteg_vecmass_pa.cpp
-+  integ/bilininteg_vectorfediv_pa.cpp
-+  integ/bilininteg_vectorfemass_pa.cpp
-+  integ/lininteg_boundary.cpp
-+  integ/lininteg_boundary_flux.cpp
-+  integ/lininteg_domain.cpp
-+  integ/lininteg_domain_grad.cpp
-+  integ/lininteg_domain_vectorfe.cpp
-+  integ/nonlininteg_vecconvection_pa.cpp
-+  integ/nonlininteg_vecconvection_mf.cpp
-   coefficient.cpp
-   complex_fem.cpp
-   convergence.cpp
-@@ -74,11 +85,6 @@ set(SRCS
-   linearform.cpp
-   linearform_ext.cpp
-   lininteg.cpp
--  lininteg_boundary.cpp
--  lininteg_boundary_flux.cpp
--  lininteg_domain.cpp
--  lininteg_domain_grad.cpp
--  lininteg_vectorfe_domain.cpp
-   lor/lor.cpp
-   lor/lor_ads.cpp
-   lor/lor_ams.cpp
-@@ -91,8 +97,6 @@ set(SRCS
-   nonlinearform_ext.cpp
-   nonlininteg.cpp
-   fespacehierarchy.cpp
--  nonlininteg_vectorconvection.cpp
--  nonlininteg_vectorconvection_mf.cpp
-   qfunction.cpp
-   qinterp/det.cpp
-   qinterp/eval_by_nodes.cpp
-@@ -143,7 +147,11 @@ set(HDRS
-   bilinearform.hpp
-   bilinearform_ext.hpp
-   bilininteg.hpp
--  bilininteg_mass_pa.hpp
-+  integ/bilininteg_diffusion_kernels.hpp
-+  integ/bilininteg_hcurl_kernels.hpp
-+  integ/bilininteg_hdiv_kernels.hpp
-+  integ/bilininteg_hcurlhdiv_kernels.hpp
-+  integ/bilininteg_mass_kernels.hpp
-   coefficient.hpp
-   complex_fem.hpp
-   convergence.hpp
-diff --git a/fem/bilinearform.cpp b/fem/bilinearform.cpp
-index fad9717aa..a549d03a7 100644
---- a/fem/bilinearform.cpp
-+++ b/fem/bilinearform.cpp
-@@ -18,84 +18,31 @@
- namespace mfem
- {
- 
--void BilinearForm::AllocMat()
--{
--   if (static_cond) { return; }
--
--   if (precompute_sparsity == 0 || fes->GetVDim() > 1)
--   {
--      mat = new SparseMatrix(height);
--      return;
--   }
--
--   const Table &elem_dof = fes->GetElementToDofTable();
--   Table dof_dof;
--
--   if (interior_face_integs.Size() > 0)
--   {
--      // the sparsity pattern is defined from the map: face->element->dof
--      Table face_dof, dof_face;
--      {
--         Table *face_elem = fes->GetMesh()->GetFaceToElementTable();
--         mfem::Mult(*face_elem, elem_dof, face_dof);
--         delete face_elem;
--      }
--      Transpose(face_dof, dof_face, height);
--      mfem::Mult(dof_face, face_dof, dof_dof);
--   }
--   else
--   {
--      // the sparsity pattern is defined from the map: element->dof
--      Table dof_elem;
--      Transpose(elem_dof, dof_elem, height);
--      mfem::Mult(dof_elem, elem_dof, dof_dof);
--   }
--
--   dof_dof.SortRows();
--
--   int *I = dof_dof.GetI();
--   int *J = dof_dof.GetJ();
--   double *data = Memory<double>(I[height]);
--
--   mat = new SparseMatrix(I, J, data, height, height, true, true, true);
--   *mat = 0.0;
--
--   dof_dof.LoseData();
--}
--
--BilinearForm::BilinearForm(FiniteElementSpace * f)
--   : Matrix (f->GetVSize())
-+BilinearForm::BilinearForm(FiniteElementSpace *f)
-+   : Matrix(f->GetVSize())
- {
-    fes = f;
-    sequence = f->GetSequence();
-    mat = mat_e = NULL;
-    extern_bfs = 0;
--   element_matrices = NULL;
-    static_cond = NULL;
-    hybridization = NULL;
--   precompute_sparsity = 0;
-    diag_policy = DIAG_KEEP;
--
-    assembly = AssemblyLevel::LEGACY;
--   batch = 1;
-    ext = NULL;
- }
- 
--BilinearForm::BilinearForm (FiniteElementSpace * f, BilinearForm * bf, int ps)
--   : Matrix (f->GetVSize())
-+BilinearForm::BilinearForm(FiniteElementSpace *f, BilinearForm *bf)
-+   : Matrix(f->GetVSize())
- {
-    fes = f;
-    sequence = f->GetSequence();
--   mat_e = NULL;
-+   mat = mat_e = NULL;
-    extern_bfs = 1;
--   element_matrices = NULL;
-    static_cond = NULL;
-    hybridization = NULL;
--   precompute_sparsity = ps;
-    diag_policy = DIAG_KEEP;
--
-    assembly = AssemblyLevel::LEGACY;
--   batch = 1;
-    ext = NULL;
- 
-    // Copy the pointers to the integrators
-@@ -108,8 +55,6 @@ BilinearForm::BilinearForm (FiniteElementSpace * f, BilinearForm * bf, int ps)
- 
-    boundary_face_integs = bf->boundary_face_integs;
-    boundary_face_integs_marker = bf->boundary_face_integs_marker;
--
--   AllocMat();
- }
- 
- void BilinearForm::SetAssemblyLevel(AssemblyLevel assembly_level)
-@@ -124,7 +69,7 @@ void BilinearForm::SetAssemblyLevel(AssemblyLevel assembly_level)
-       case AssemblyLevel::LEGACY:
-          break;
-       case AssemblyLevel::FULL:
--         SetDiagonalPolicy( DIAG_ONE ); // Only diagonal policy supported on device
-+         SetDiagonalPolicy(DIAG_ONE); // Only diagonal policy supported on device
-          ext = new FABilinearFormExtension(this);
-          break;
-       case AssemblyLevel::ELEMENT:
-@@ -181,48 +126,71 @@ void BilinearForm::EnableHybridization(FiniteElementSpace *constr_space,
-    hybridization->Init(ess_tdof_list);
- }
- 
--void BilinearForm::UseSparsity(int *I, int *J, bool isSorted)
-+double &BilinearForm::Elem(int i, int j)
- {
--   if (static_cond) { return; }
-+   return mat->Elem(i,j);
-+}
-+
-+const double &BilinearForm::Elem(int i, int j) const
-+{
-+   return mat->Elem(i,j);
-+}
- 
--   if (mat)
-+void BilinearForm::Mult(const Vector &x, Vector &y) const
-+{
-+   if (ext)
-    {
--      if (mat->Finalized() && mat->GetI() == I && mat->GetJ() == J)
--      {
--         return; // mat is already using the given sparsity
--      }
--      delete mat;
-+      ext->Mult(x, y);
-+   }
-+   else
-+   {
-+      mat->Mult(x, y);
-    }
--   height = width = fes->GetVSize();
--   mat = new SparseMatrix(I, J, NULL, height, width, false, true, isSorted);
- }
- 
--void BilinearForm::UseSparsity(SparseMatrix &A)
-+void BilinearForm::AddMult(const Vector &x, Vector &y, const double a) const
- {
--   MFEM_ASSERT(A.Height() == fes->GetVSize() && A.Width() == fes->GetVSize(),
--               "invalid matrix A dimensions: "
--               << A.Height() << " x " << A.Width());
--   MFEM_ASSERT(A.Finalized(), "matrix A must be Finalized");
--
--   UseSparsity(A.GetI(), A.GetJ(), A.ColumnsAreSorted());
-+   if (ext)
-+   {
-+      ext->AddMult(x, y, a);
-+   }
-+   else
-+   {
-+      mat->AddMult(x, y, a);
-+   }
- }
- 
--double& BilinearForm::Elem (int i, int j)
-+void BilinearForm::MultTranspose(const Vector &x, Vector &y) const
- {
--   return mat -> Elem(i,j);
-+   if (ext)
-+   {
-+      ext->MultTranspose(x, y);
-+   }
-+   else
-+   {
-+      mat->MultTranspose(x, y);
-+   }
- }
- 
--const double& BilinearForm::Elem (int i, int j) const
-+void BilinearForm::AddMultTranspose(const Vector &x, Vector &y,
-+                                    const double a) const
- {
--   return mat -> Elem(i,j);
-+   if (ext)
-+   {
-+      ext->AddMultTranspose(x, y, a);
-+   }
-+   else
-+   {
-+      mat->AddMultTranspose(x, y, a);
-+   }
- }
- 
--MatrixInverse * BilinearForm::Inverse() const
-+MatrixInverse *BilinearForm::Inverse() const
- {
--   return mat -> Inverse();
-+   return mat->Inverse();
- }
- 
--void BilinearForm::Finalize (int skip_zeros)
-+void BilinearForm::Finalize(int skip_zeros)
- {
-    if (assembly == AssemblyLevel::LEGACY)
-    {
-@@ -246,22 +214,22 @@ void BilinearForm::AddDomainIntegrator(BilinearFormIntegrator *bfi,
-    domain_integs_marker.Append(&elem_marker);
- }
- 
--void BilinearForm::AddBoundaryIntegrator (BilinearFormIntegrator * bfi)
-+void BilinearForm::AddBoundaryIntegrator(BilinearFormIntegrator *bfi)
- {
--   boundary_integs.Append (bfi);
-+   boundary_integs.Append(bfi);
-    boundary_integs_marker.Append(NULL); // NULL marker means apply everywhere
- }
- 
--void BilinearForm::AddBoundaryIntegrator (BilinearFormIntegrator * bfi,
--                                          Array<int> &bdr_marker)
-+void BilinearForm::AddBoundaryIntegrator(BilinearFormIntegrator *bfi,
-+                                         Array<int> &bdr_marker)
- {
--   boundary_integs.Append (bfi);
-+   boundary_integs.Append(bfi);
-    boundary_integs_marker.Append(&bdr_marker);
- }
- 
--void BilinearForm::AddInteriorFaceIntegrator(BilinearFormIntegrator * bfi)
-+void BilinearForm::AddInteriorFaceIntegrator(BilinearFormIntegrator *bfi)
- {
--   interior_face_integs.Append (bfi);
-+   interior_face_integs.Append(bfi);
- }
- 
- void BilinearForm::AddBdrFaceIntegrator(BilinearFormIntegrator *bfi)
-@@ -278,55 +246,6 @@ void BilinearForm::AddBdrFaceIntegrator(BilinearFormIntegrator *bfi,
-    boundary_face_integs_marker.Append(&bdr_marker);
- }
- 
--void BilinearForm::ComputeElementMatrix(int i, DenseMatrix &elmat)
--{
--   if (element_matrices)
--   {
--      elmat.SetSize(element_matrices->SizeI(), element_matrices->SizeJ());
--      elmat = element_matrices->GetData(i);
--      return;
--   }
--
--   if (domain_integs.Size())
--   {
--      const FiniteElement &fe = *fes->GetFE(i);
--      ElementTransformation *eltrans = fes->GetElementTransformation(i);
--      domain_integs[0]->AssembleElementMatrix(fe, *eltrans, elmat);
--      for (int k = 1; k < domain_integs.Size(); k++)
--      {
--         domain_integs[k]->AssembleElementMatrix(fe, *eltrans, elemmat);
--         elmat += elemmat;
--      }
--   }
--   else
--   {
--      fes->GetElementVDofs(i, vdofs);
--      elmat.SetSize(vdofs.Size());
--      elmat = 0.0;
--   }
--}
--
--void BilinearForm::ComputeBdrElementMatrix(int i, DenseMatrix &elmat)
--{
--   if (boundary_integs.Size())
--   {
--      const FiniteElement &be = *fes->GetBE(i);
--      ElementTransformation *eltrans = fes->GetBdrElementTransformation(i);
--      boundary_integs[0]->AssembleElementMatrix(be, *eltrans, elmat);
--      for (int k = 1; k < boundary_integs.Size(); k++)
--      {
--         boundary_integs[k]->AssembleElementMatrix(be, *eltrans, elemmat);
--         elmat += elemmat;
--      }
--   }
--   else
--   {
--      fes->GetBdrElementVDofs(i, vdofs);
--      elmat.SetSize(vdofs.Size());
--      elmat = 0.0;
--   }
--}
--
- void BilinearForm::AssembleElementMatrix(
-    int i, const DenseMatrix &elmat, int skip_zeros)
- {
-@@ -345,7 +264,7 @@ void BilinearForm::AssembleElementMatrix(
-    {
-       if (mat == NULL)
-       {
--         AllocMat();
-+         mat = new SparseMatrix(height);
-       }
-       mat->AddSubMatrix(vdofs_, vdofs_, elmat, skip_zeros);
-       if (hybridization)
-@@ -373,7 +292,7 @@ void BilinearForm::AssembleBdrElementMatrix(
-    {
-       if (mat == NULL)
-       {
--         AllocMat();
-+         mat = new SparseMatrix(height);
-       }
-       mat->AddSubMatrix(vdofs_, vdofs_, elmat, skip_zeros);
-       if (hybridization)
-@@ -392,23 +311,14 @@ void BilinearForm::Assemble(int skip_zeros)
-    }
- 
-    ElementTransformation *eltrans;
--   DofTransformation * doftrans;
--   Mesh *mesh = fes -> GetMesh();
--   DenseMatrix elmat, *elmat_p;
--
--   if (mat == NULL)
--   {
--      AllocMat();
--   }
-+   DofTransformation *doftrans;
-+   Mesh *mesh = fes->GetMesh();
-+   DenseMatrix elmat;
- 
--#ifdef MFEM_USE_LEGACY_OPENMP
--   int free_element_matrices = 0;
--   if (!element_matrices)
-+   if (mat == NULL && !static_cond)
-    {
--      ComputeElementMatrices();
--      free_element_matrices = 1;
-+      mat = new SparseMatrix(height);
-    }
--#endif
- 
-    if (domain_integs.Size())
-    {
-@@ -423,61 +333,49 @@ void BilinearForm::Assemble(int skip_zeros)
-          }
-       }
- 
--      for (int i = 0; i < fes -> GetNE(); i++)
-+      for (int i = 0; i < fes->GetNE(); i++)
-       {
-          int elem_attr = fes->GetMesh()->GetAttribute(i);
-          doftrans = fes->GetElementVDofs(i, vdofs);
--         if (element_matrices)
--         {
--            elmat_p = &(*element_matrices)(i);
--         }
--         else
-+         elmat.SetSize(0);
-+         for (int k = 0; k < domain_integs.Size(); k++)
-          {
--            elmat.SetSize(0);
--            for (int k = 0; k < domain_integs.Size(); k++)
-+            if ( domain_integs_marker[k] == NULL ||
-+                 (*(domain_integs_marker[k]))[elem_attr-1] == 1)
-             {
--               if ( domain_integs_marker[k] == NULL ||
--                    (*(domain_integs_marker[k]))[elem_attr-1] == 1)
-+               const FiniteElement &fe = *fes->GetFE(i);
-+               eltrans = fes->GetElementTransformation(i);
-+               domain_integs[k]->AssembleElementMatrix(fe, *eltrans, elemmat);
-+               if (elmat.Size() == 0)
-                {
--                  const FiniteElement &fe = *fes->GetFE(i);
--                  eltrans = fes->GetElementTransformation(i);
--                  domain_integs[k]->AssembleElementMatrix(fe, *eltrans, elemmat);
--                  if (elmat.Size() == 0)
--                  {
--                     elmat = elemmat;
--                  }
--                  else
--                  {
--                     elmat += elemmat;
--                  }
-+                  elmat = elemmat;
-+               }
-+               else
-+               {
-+                  elmat += elemmat;
-                }
-             }
--            if (elmat.Size() == 0)
--            {
--               continue;
--            }
--            else
--            {
--               elmat_p = &elmat;
--            }
--            if (doftrans)
--            {
--               doftrans->TransformDual(elmat);
--            }
--            elmat_p = &elmat;
-          }
--         if (static_cond)
-+         if (elmat.Size() == 0)
-          {
--            static_cond->AssembleMatrix(i, *elmat_p);
-+            continue;
-          }
--         else
-+         if (doftrans)
-          {
--            mat->AddSubMatrix(vdofs, vdofs, *elmat_p, skip_zeros);
-+            doftrans->TransformDual(elmat);
-+         }
-+         if (!static_cond)
-+         {
-+            mat->AddSubMatrix(vdofs, vdofs, elmat, skip_zeros);
-             if (hybridization)
-             {
--               hybridization->AssembleMatrix(i, *elmat_p);
-+               hybridization->AssembleMatrix(i, elmat);
-             }
-          }
-+         else
-+         {
-+            static_cond->AssembleMatrix(i, elmat);
-+         }
-       }
-    }
- 
-@@ -504,14 +402,14 @@ void BilinearForm::Assemble(int skip_zeros)
-          }
-       }
- 
--      for (int i = 0; i < fes -> GetNBE(); i++)
-+      for (int i = 0; i < fes->GetNBE(); i++)
-       {
-          const int bdr_attr = mesh->GetBdrAttribute(i);
-          if (bdr_attr_marker[bdr_attr-1] == 0) { continue; }
- 
-          const FiniteElement &be = *fes->GetBE(i);
--         doftrans = fes -> GetBdrElementVDofs (i, vdofs);
--         eltrans = fes -> GetBdrElementTransformation (i);
-+         doftrans = fes->GetBdrElementVDofs(i, vdofs);
-+         eltrans = fes->GetBdrElementTransformation(i);
-          int k = 0;
-          for (; k < boundary_integs.Size(); k++)
-          {
-@@ -534,18 +432,17 @@ void BilinearForm::Assemble(int skip_zeros)
-          {
-             doftrans->TransformDual(elmat);
-          }
--         elmat_p = &elmat;
-          if (!static_cond)
-          {
--            mat->AddSubMatrix(vdofs, vdofs, *elmat_p, skip_zeros);
-+            mat->AddSubMatrix(vdofs, vdofs, elmat, skip_zeros);
-             if (hybridization)
-             {
--               hybridization->AssembleBdrMatrix(i, *elmat_p);
-+               hybridization->AssembleBdrMatrix(i, elmat);
-             }
-          }
-          else
-          {
--            static_cond->AssembleBdrMatrix(i, *elmat_p);
-+            static_cond->AssembleBdrMatrix(i, elmat);
-          }
-       }
-    }
-@@ -558,19 +455,18 @@ void BilinearForm::Assemble(int skip_zeros)
-       int nfaces = mesh->GetNumFaces();
-       for (int i = 0; i < nfaces; i++)
-       {
--         tr = mesh -> GetInteriorFaceTransformations (i);
-+         tr = mesh->GetInteriorFaceTransformations(i);
-          if (tr != NULL)
-          {
--            fes -> GetElementVDofs (tr -> Elem1No, vdofs);
--            fes -> GetElementVDofs (tr -> Elem2No, vdofs2);
--            vdofs.Append (vdofs2);
-+            fes->GetElementVDofs(tr->Elem1No, vdofs);
-+            fes->GetElementVDofs(tr->Elem2No, vdofs2);
-+            vdofs.Append(vdofs2);
-             for (int k = 0; k < interior_face_integs.Size(); k++)
-             {
--               interior_face_integs[k]->
--               AssembleFaceMatrix(*fes->GetFE(tr->Elem1No),
--                                  *fes->GetFE(tr->Elem2No),
--                                  *tr, elemmat);
--               mat -> AddSubMatrix (vdofs, vdofs, elemmat, skip_zeros);
-+               interior_face_integs[k]->AssembleFaceMatrix(*fes->GetFE(tr->Elem1No),
-+                                                           *fes->GetFE(tr->Elem2No),
-+                                                           *tr, elemmat);
-+               mat->AddSubMatrix(vdofs, vdofs, elemmat, skip_zeros);
-             }
-          }
-       }
-@@ -602,16 +498,16 @@ void BilinearForm::Assemble(int skip_zeros)
-          }
-       }
- 
--      for (int i = 0; i < fes -> GetNBE(); i++)
-+      for (int i = 0; i < fes->GetNBE(); i++)
-       {
-          const int bdr_attr = mesh->GetBdrAttribute(i);
-          if (bdr_attr_marker[bdr_attr-1] == 0) { continue; }
- 
--         tr = mesh -> GetBdrFaceTransformations (i);
-+         tr = mesh->GetBdrFaceTransformations(i);
-          if (tr != NULL)
-          {
--            fes -> GetElementVDofs (tr -> Elem1No, vdofs);
--            fe1 = fes -> GetFE (tr -> Elem1No);
-+            fes->GetElementVDofs(tr->Elem1No, vdofs);
-+            fe1 = fes->GetFE(tr->Elem1No);
-             // The fe2 object is really a dummy and not used on the boundaries,
-             // but we can't dereference a NULL pointer, and we don't want to
-             // actually make a fake element.
-@@ -622,20 +518,13 @@ void BilinearForm::Assemble(int skip_zeros)
-                    (*boundary_face_integs_marker[k])[bdr_attr-1] == 0)
-                { continue; }
- 
--               boundary_face_integs[k] -> AssembleFaceMatrix (*fe1, *fe2, *tr,
--                                                              elemmat);
--               mat -> AddSubMatrix (vdofs, vdofs, elemmat, skip_zeros);
-+               boundary_face_integs[k]->AssembleFaceMatrix(*fe1, *fe2, *tr,
-+                                                           elemmat);
-+               mat->AddSubMatrix(vdofs, vdofs, elemmat, skip_zeros);
-             }
-          }
-       }
-    }
--
--#ifdef MFEM_USE_LEGACY_OPENMP
--   if (free_element_matrices)
--   {
--      FreeElementMatrices();
--   }
--#endif
- }
- 
- void BilinearForm::ConformingAssemble()
-@@ -644,8 +533,9 @@ void BilinearForm::ConformingAssemble()
-    // matrix which in turn will give rise to symmetric structure in the new
-    // matrix. This ensures that subsequent calls to EliminateRowCol will work
-    // correctly.
--   Finalize(0);
-    MFEM_ASSERT(mat, "the BilinearForm is not assembled");
-+   const int remove_zeros = 0;
-+   Finalize(remove_zeros);
- 
-    const SparseMatrix *P = fes->GetConformingProlongation();
-    if (!P) { return; } // conforming mesh
-@@ -693,7 +583,6 @@ void BilinearForm::AssembleDiagonal(Vector &diag) const
-       return;
-    }
-    // Here, we have extension, ext, and conforming prolongation, cP.
--
-    // For an AMR mesh, a convergent diagonal is assembled with |P^T| d_l,
-    // where |P^T| has the entry-wise absolute values of the conforming
-    // prolongation transpose operator.
-@@ -708,12 +597,26 @@ void BilinearForm::FormLinearSystem(const Array<int> &ess_tdof_list, Vector &x,
- {
-    if (ext)
-    {
--      ext->FormLinearSystem(ess_tdof_list, x, b, A, X, B, copy_interior);
-+      Operator *oper;
-+      ext->FormLinearSystem(ess_tdof_list, x, b, oper, X, B, copy_interior);
-+      if (assembly == AssemblyLevel::FULL)
-+      {
-+         delete oper;
-+         FormSystemMatrix(ess_tdof_list, A);
-+      }
-+      else
-+      {
-+         A.Reset(oper);
-+      }
-       return;
-    }
--   const SparseMatrix *P = fes->GetConformingProlongation();
-+
-+   // Finish the matrix assembly and perform BC elimination, storing the
-+   // eliminated part of the matrix.
-    FormSystemMatrix(ess_tdof_list, A);
- 
-+   const SparseMatrix *P = fes->GetConformingProlongation();
-+
-    // Transform the system and perform the elimination in B, based on the
-    // essential BC values from x. Restrict the BC part of x in X, and set the
-    // non-BC part to zero. Since there is no good initial guess for the Lagrange
-@@ -776,7 +679,23 @@ void BilinearForm::FormSystemMatrix(const Array<int> &ess_tdof_list,
- {
-    if (ext)
-    {
--      ext->FormSystemMatrix(ess_tdof_list, A);
-+      if (assembly == AssemblyLevel::FULL)
-+      {
-+         // Always does `DIAG_ONE` policy to be consistent with
-+         // `Operator::FormConstrainedSystemOperator`.
-+         MFEM_VERIFY(diag_policy == DiagonalPolicy::DIAG_ONE,
-+                     "Only DiagonalPolicy::DIAG_ONE supported with"
-+                     " FABilinearFormExtension.");
-+         ConformingAssemble();
-+         mat->EliminateBC(ess_tdof_list, DiagonalPolicy::DIAG_ONE);
-+         A.Reset(mat, false);
-+      }
-+      else
-+      {
-+         Operator *oper;
-+         ext->FormSystemOperator(ess_tdof_list, oper);
-+         A.Reset(oper);
-+      }
-       return;
-    }
- 
-@@ -797,8 +716,7 @@ void BilinearForm::FormSystemMatrix(const Array<int> &ess_tdof_list,
-    {
-       if (!mat_e)
-       {
--         const SparseMatrix *P = fes->GetConformingProlongation();
--         if (P) { ConformingAssemble(); }
-+         ConformingAssemble();
-          EliminateVDofs(ess_tdof_list, diag_policy);
-          const int remove_zeros = 0;
-          Finalize(remove_zeros);
-@@ -872,48 +790,6 @@ void BilinearForm::RecoverFEMSolution(const Vector &X,
-    }
- }
- 
--void BilinearForm::ComputeElementMatrices()
--{
--   if (element_matrices || domain_integs.Size() == 0 || fes->GetNE() == 0)
--   {
--      return;
--   }
--
--   int num_elements = fes->GetNE();
--   int num_dofs_per_el = fes->GetFE(0)->GetDof() * fes->GetVDim();
--
--   element_matrices = new DenseTensor(num_dofs_per_el, num_dofs_per_el,
--                                      num_elements);
--
--   DenseMatrix tmp;
--   IsoparametricTransformation eltrans;
--
--#ifdef MFEM_USE_LEGACY_OPENMP
--   #pragma omp parallel for private(tmp,eltrans)
--#endif
--   for (int i = 0; i < num_elements; i++)
--   {
--      DenseMatrix elmat(element_matrices->GetData(i),
--                        num_dofs_per_el, num_dofs_per_el);
--      const FiniteElement &fe = *fes->GetFE(i);
--#ifdef MFEM_DEBUG
--      if (num_dofs_per_el != fe.GetDof()*fes->GetVDim())
--         mfem_error("BilinearForm::ComputeElementMatrices:"
--                    " all elements must have same number of dofs");
--#endif
--      fes->GetElementTransformation(i, &eltrans);
--
--      domain_integs[0]->AssembleElementMatrix(fe, eltrans, elmat);
--      for (int k = 1; k < domain_integs.Size(); k++)
--      {
--         // note: some integrators may not be thread-safe
--         domain_integs[k]->AssembleElementMatrix(fe, eltrans, tmp);
--         elmat += tmp;
--      }
--      elmat.ClearExternalData();
--   }
--}
--
- void BilinearForm::EliminateEssentialBC(const Array<int> &bdr_attr_is_ess,
-                                         const Vector &sol, Vector &rhs,
-                                         DiagonalPolicy dpolicy)
-@@ -949,8 +825,8 @@ void BilinearForm::EliminateEssentialBC(const Array<int> &bdr_attr_is_ess,
-    }
- }
- 
--void BilinearForm::EliminateEssentialBCDiag (const Array<int> &bdr_attr_is_ess,
--                                             double value)
-+void BilinearForm::EliminateEssentialBCDiag(const Array<int> &bdr_attr_is_ess,
-+                                            double value)
- {
-    Array<int> ess_dofs, conf_ess_dofs;
-    fes->GetEssentialVDofs(bdr_attr_is_ess, ess_dofs);
-@@ -976,11 +852,11 @@ void BilinearForm::EliminateVDofs(const Array<int> &vdofs_,
-       int vdof = vdofs_[i];
-       if ( vdof >= 0 )
-       {
--         mat -> EliminateRowCol (vdof, sol(vdof), rhs, dpolicy);
-+         mat->EliminateRowCol(vdof, sol(vdof), rhs, dpolicy);
-       }
-       else
-       {
--         mat -> EliminateRowCol (-1-vdof, sol(-1-vdof), rhs, dpolicy);
-+         mat->EliminateRowCol(-1-vdof, sol(-1-vdof), rhs, dpolicy);
-       }
-    }
- }
-@@ -999,11 +875,11 @@ void BilinearForm::EliminateVDofs(const Array<int> &vdofs_,
-       int vdof = vdofs_[i];
-       if ( vdof >= 0 )
-       {
--         mat -> EliminateRowCol (vdof, *mat_e, dpolicy);
-+         mat->EliminateRowCol(vdof, *mat_e, dpolicy);
-       }
-       else
-       {
--         mat -> EliminateRowCol (-1-vdof, *mat_e, dpolicy);
-+         mat->EliminateRowCol(-1-vdof, *mat_e, dpolicy);
-       }
-    }
- }
-@@ -1019,31 +895,31 @@ void BilinearForm::EliminateEssentialBCFromDofs(
-    for (int i = 0; i < ess_dofs.Size(); i++)
-       if (ess_dofs[i] < 0)
-       {
--         mat -> EliminateRowCol (i, sol(i), rhs, dpolicy);
-+         mat->EliminateRowCol(i, sol(i), rhs, dpolicy);
-       }
- }
- 
--void BilinearForm::EliminateEssentialBCFromDofs (const Array<int> &ess_dofs,
--                                                 DiagonalPolicy dpolicy)
-+void BilinearForm::EliminateEssentialBCFromDofs(const Array<int> &ess_dofs,
-+                                                DiagonalPolicy dpolicy)
- {
-    MFEM_ASSERT(ess_dofs.Size() == height, "incorrect dof Array size");
- 
-    for (int i = 0; i < ess_dofs.Size(); i++)
-       if (ess_dofs[i] < 0)
-       {
--         mat -> EliminateRowCol (i, dpolicy);
-+         mat->EliminateRowCol(i, dpolicy);
-       }
- }
- 
--void BilinearForm::EliminateEssentialBCFromDofsDiag (const Array<int> &ess_dofs,
--                                                     double value)
-+void BilinearForm::EliminateEssentialBCFromDofsDiag(const Array<int> &ess_dofs,
-+                                                    double value)
- {
-    MFEM_ASSERT(ess_dofs.Size() == height, "incorrect dof Array size");
- 
-    for (int i = 0; i < ess_dofs.Size(); i++)
-       if (ess_dofs[i] < 0)
-       {
--         mat -> EliminateRowColDiag (i, value);
-+         mat->EliminateRowColDiag(i, value);
-       }
- }
- 
-@@ -1054,31 +930,6 @@ void BilinearForm::EliminateVDofsInRHS(
-    mat->PartMult(vdofs_, x, b);
- }
- 
--void BilinearForm::Mult(const Vector &x, Vector &y) const
--{
--   if (ext)
--   {
--      ext->Mult(x, y);
--   }
--   else
--   {
--      mat->Mult(x, y);
--   }
--}
--
--void BilinearForm::MultTranspose(const Vector & x, Vector & y) const
--{
--   if (ext)
--   {
--      ext->MultTranspose(x, y);
--   }
--   else
--   {
--      y = 0.0;
--      AddMultTranspose (x, y);
--   }
--}
--
- void BilinearForm::Update(FiniteElementSpace *nfes)
- {
-    bool full_update;
-@@ -1098,7 +949,6 @@ void BilinearForm::Update(FiniteElementSpace *nfes)
- 
-    delete mat_e;
-    mat_e = NULL;
--   FreeElementMatrices();
-    delete static_cond;
-    static_cond = NULL;
- 
-@@ -1121,16 +971,10 @@ void BilinearForm::Update(FiniteElementSpace *nfes)
-    if (ext) { ext->Update(); }
- }
- 
--void BilinearForm::SetDiagonalPolicy(DiagonalPolicy policy)
--{
--   diag_policy = policy;
--}
--
- BilinearForm::~BilinearForm()
- {
-    delete mat_e;
-    delete mat;
--   delete element_matrices;
-    delete static_cond;
-    delete hybridization;
- 
-@@ -1148,9 +992,8 @@ BilinearForm::~BilinearForm()
-    delete ext;
- }
- 
--
--MixedBilinearForm::MixedBilinearForm (FiniteElementSpace *tr_fes,
--                                      FiniteElementSpace *te_fes)
-+MixedBilinearForm::MixedBilinearForm(FiniteElementSpace *tr_fes,
-+                                     FiniteElementSpace *te_fes)
-    : Matrix(te_fes->GetVSize(), tr_fes->GetVSize())
- {
-    trial_fes = tr_fes;
-@@ -1162,9 +1005,9 @@ MixedBilinearForm::MixedBilinearForm (FiniteElementSpace *tr_fes,
-    ext = NULL;
- }
- 
--MixedBilinearForm::MixedBilinearForm (FiniteElementSpace *tr_fes,
--                                      FiniteElementSpace *te_fes,
--                                      MixedBilinearForm * mbf)
-+MixedBilinearForm::MixedBilinearForm(FiniteElementSpace *tr_fes,
-+                                     FiniteElementSpace *te_fes,
-+                                     MixedBilinearForm *mbf)
-    : Matrix(te_fes->GetVSize(), tr_fes->GetVSize())
- {
-    trial_fes = tr_fes;
-@@ -1173,6 +1016,8 @@ MixedBilinearForm::MixedBilinearForm (FiniteElementSpace *tr_fes,
-    mat_e = NULL;
-    extern_bfs = 1;
-    ext = NULL;
-+   assembly = AssemblyLevel::LEGACY;
-+   ext = NULL;
- 
-    // Copy the pointers to the integrators
-    domain_integs = mbf->domain_integs;
-@@ -1182,9 +1027,6 @@ MixedBilinearForm::MixedBilinearForm (FiniteElementSpace *tr_fes,
- 
-    boundary_integs_marker = mbf->boundary_integs_marker;
-    boundary_trace_face_integs_marker = mbf->boundary_trace_face_integs_marker;
--
--   assembly = AssemblyLevel::LEGACY;
--   ext = NULL;
- }
- 
- void MixedBilinearForm::SetAssemblyLevel(AssemblyLevel assembly_level)
-@@ -1210,31 +1052,36 @@ void MixedBilinearForm::SetAssemblyLevel(AssemblyLevel assembly_level)
-          ext = new PAMixedBilinearFormExtension(this);
-          break;
-       case AssemblyLevel::NONE:
--         mfem_error("Matrix-free action not supported yet... stay tuned!");
--         // ext = new MFMixedBilinearFormExtension(this);
-+         ext = new MFMixedBilinearFormExtension(this);
-          break;
-       default:
-          mfem_error("Unknown assembly level");
-    }
- }
- 
--double & MixedBilinearForm::Elem (int i, int j)
-+double &MixedBilinearForm::Elem(int i, int j)
- {
-    return (*mat)(i, j);
- }
- 
--const double & MixedBilinearForm::Elem (int i, int j) const
-+const double &MixedBilinearForm::Elem(int i, int j) const
- {
-    return (*mat)(i, j);
- }
- 
--void MixedBilinearForm::Mult(const Vector & x, Vector & y) const
-+void MixedBilinearForm::Mult(const Vector &x, Vector &y) const
- {
--   y = 0.0;
--   AddMult(x, y);
-+   if (ext)
-+   {
-+      ext->Mult(x, y);
-+   }
-+   else
-+   {
-+      mat->Mult(x, y);
-+   }
- }
- 
--void MixedBilinearForm::AddMult(const Vector & x, Vector & y,
-+void MixedBilinearForm::AddMult(const Vector &x, Vector &y,
-                                 const double a) const
- {
-    if (ext)
-@@ -1247,13 +1094,19 @@ void MixedBilinearForm::AddMult(const Vector & x, Vector & y,
-    }
- }
- 
--void MixedBilinearForm::MultTranspose(const Vector & x, Vector & y) const
-+void MixedBilinearForm::MultTranspose(const Vector &x, Vector &y) const
- {
--   y = 0.0;
--   AddMultTranspose(x, y);
-+   if (ext)
-+   {
-+      ext->MultTranspose(x, y);
-+   }
-+   else
-+   {
-+      mat->MultTranspose(x, y);
-+   }
- }
- 
--void MixedBilinearForm::AddMultTranspose(const Vector & x, Vector & y,
-+void MixedBilinearForm::AddMultTranspose(const Vector &x, Vector &y,
-                                          const double a) const
- {
-    if (ext)
-@@ -1266,7 +1119,7 @@ void MixedBilinearForm::AddMultTranspose(const Vector & x, Vector & y,
-    }
- }
- 
--MatrixInverse * MixedBilinearForm::Inverse() const
-+MatrixInverse *MixedBilinearForm::Inverse() const
- {
-    if (assembly != AssemblyLevel::LEGACY)
-    {
-@@ -1276,15 +1129,15 @@ MatrixInverse * MixedBilinearForm::Inverse() const
-    }
-    else
-    {
--      return mat -> Inverse ();
-+      return mat->Inverse();
-    }
- }
- 
--void MixedBilinearForm::Finalize (int skip_zeros)
-+void MixedBilinearForm::Finalize(int skip_zeros)
- {
-    if (assembly == AssemblyLevel::LEGACY)
-    {
--      mat -> Finalize (skip_zeros);
-+      mat->Finalize(skip_zeros);
-    }
- }
- 
-@@ -1300,27 +1153,27 @@ void MixedBilinearForm::GetBlocks(Array2D<SparseMatrix *> &blocks) const
-    mat->GetBlocks(blocks);
- }
- 
--void MixedBilinearForm::AddDomainIntegrator (BilinearFormIntegrator * bfi)
-+void MixedBilinearForm::AddDomainIntegrator(BilinearFormIntegrator *bfi)
- {
--   domain_integs.Append (bfi);
-+   domain_integs.Append(bfi);
- }
- 
--void MixedBilinearForm::AddBoundaryIntegrator (BilinearFormIntegrator * bfi)
-+void MixedBilinearForm::AddBoundaryIntegrator(BilinearFormIntegrator *bfi)
- {
--   boundary_integs.Append (bfi);
-+   boundary_integs.Append(bfi);
-    boundary_integs_marker.Append(NULL); // NULL marker means apply everywhere
- }
- 
--void MixedBilinearForm::AddBoundaryIntegrator (BilinearFormIntegrator * bfi,
--                                               Array<int> &bdr_marker)
-+void MixedBilinearForm::AddBoundaryIntegrator(BilinearFormIntegrator *bfi,
-+                                              Array<int> &bdr_marker)
- {
--   boundary_integs.Append (bfi);
-+   boundary_integs.Append(bfi);
-    boundary_integs_marker.Append(&bdr_marker);
- }
- 
--void MixedBilinearForm::AddTraceFaceIntegrator (BilinearFormIntegrator * bfi)
-+void MixedBilinearForm::AddTraceFaceIntegrator(BilinearFormIntegrator *bfi)
- {
--   trace_face_integs.Append (bfi);
-+   trace_face_integs.Append(bfi);
- }
- 
- void MixedBilinearForm::AddBdrTraceFaceIntegrator(BilinearFormIntegrator *bfi)
-@@ -1337,7 +1190,45 @@ void MixedBilinearForm::AddBdrTraceFaceIntegrator(BilinearFormIntegrator *bfi,
-    boundary_trace_face_integs_marker.Append(&bdr_marker);
- }
- 
--void MixedBilinearForm::Assemble (int skip_zeros)
-+void MixedBilinearForm::AssembleElementMatrix(
-+   int i, const DenseMatrix &elmat, int skip_zeros)
-+{
-+   AssembleElementMatrix(i, elmat, trial_vdofs, test_vdofs, skip_zeros);
-+}
-+
-+void MixedBilinearForm::AssembleElementMatrix(
-+   int i, const DenseMatrix &elmat, Array<int> &trial_vdofs_,
-+   Array<int> &test_vdofs_, int skip_zeros)
-+{
-+   trial_fes->GetElementVDofs(i, trial_vdofs_);
-+   test_fes->GetElementVDofs(i, test_vdofs_);
-+   if (mat == NULL)
-+   {
-+      mat = new SparseMatrix(height, width);
-+   }
-+   mat->AddSubMatrix(test_vdofs_, trial_vdofs_, elmat, skip_zeros);
-+}
-+
-+void MixedBilinearForm::AssembleBdrElementMatrix(
-+   int i, const DenseMatrix &elmat, int skip_zeros)
-+{
-+   AssembleBdrElementMatrix(i, elmat, trial_vdofs, test_vdofs, skip_zeros);
-+}
-+
-+void MixedBilinearForm::AssembleBdrElementMatrix(
-+   int i, const DenseMatrix &elmat, Array<int> &trial_vdofs_,
-+   Array<int> &test_vdofs_, int skip_zeros)
-+{
-+   trial_fes->GetBdrElementVDofs(i, trial_vdofs_);
-+   test_fes->GetBdrElementVDofs(i, test_vdofs_);
-+   if (mat == NULL)
-+   {
-+      mat = new SparseMatrix(height, width);
-+   }
-+   mat->AddSubMatrix(test_vdofs_, trial_vdofs_, elmat, skip_zeros);
-+}
-+
-+void MixedBilinearForm::Assemble(int skip_zeros)
- {
-    if (ext)
-    {
-@@ -1346,12 +1237,11 @@ void MixedBilinearForm::Assemble (int skip_zeros)
-    }
- 
-    ElementTransformation *eltrans;
--   DofTransformation * dom_dof_trans;
--   DofTransformation * ran_dof_trans;
-+   DofTransformation *dom_dof_trans;
-+   DofTransformation *ran_dof_trans;
-+   Mesh *mesh = test_fes->GetMesh();
-    DenseMatrix elmat;
- 
--   Mesh *mesh = test_fes -> GetMesh();
--
-    if (mat == NULL)
-    {
-       mat = new SparseMatrix(height, width);
-@@ -1359,26 +1249,26 @@ void MixedBilinearForm::Assemble (int skip_zeros)
- 
-    if (domain_integs.Size())
-    {
--      for (int i = 0; i < test_fes -> GetNE(); i++)
-+      for (int i = 0; i < test_fes->GetNE(); i++)
-       {
--         dom_dof_trans = trial_fes -> GetElementVDofs (i, trial_vdofs);
--         ran_dof_trans = test_fes  -> GetElementVDofs (i, test_vdofs);
--         eltrans = test_fes -> GetElementTransformation (i);
-+         dom_dof_trans = trial_fes->GetElementVDofs(i, trial_vdofs);
-+         ran_dof_trans = test_fes->GetElementVDofs(i, test_vdofs);
-+         eltrans = test_fes->GetElementTransformation(i);
- 
-          elmat.SetSize(test_vdofs.Size(), trial_vdofs.Size());
-          elmat = 0.0;
-          for (int k = 0; k < domain_integs.Size(); k++)
-          {
--            domain_integs[k] -> AssembleElementMatrix2 (*trial_fes -> GetFE(i),
--                                                        *test_fes  -> GetFE(i),
--                                                        *eltrans, elemmat);
-+            domain_integs[k]->AssembleElementMatrix2(*trial_fes->GetFE(i),
-+                                                     *test_fes->GetFE(i),
-+                                                     *eltrans, elemmat);
-             elmat += elemmat;
-          }
-          if (ran_dof_trans || dom_dof_trans)
-          {
-             TransformDual(ran_dof_trans, dom_dof_trans, elmat);
-          }
--         mat -> AddSubMatrix (test_vdofs, trial_vdofs, elmat, skip_zeros);
-+         mat->AddSubMatrix(test_vdofs, trial_vdofs, elmat, skip_zeros);
-       }
-    }
- 
-@@ -1405,14 +1295,14 @@ void MixedBilinearForm::Assemble (int skip_zeros)
-          }
-       }
- 
--      for (int i = 0; i < test_fes -> GetNBE(); i++)
-+      for (int i = 0; i < test_fes->GetNBE(); i++)
-       {
-          const int bdr_attr = mesh->GetBdrAttribute(i);
-          if (bdr_attr_marker[bdr_attr-1] == 0) { continue; }
- 
--         dom_dof_trans = trial_fes -> GetBdrElementVDofs (i, trial_vdofs);
--         ran_dof_trans = test_fes  -> GetBdrElementVDofs (i, test_vdofs);
--         eltrans = test_fes -> GetBdrElementTransformation (i);
-+         dom_dof_trans = trial_fes->GetBdrElementVDofs(i, trial_vdofs);
-+         ran_dof_trans = test_fes->GetBdrElementVDofs(i, test_vdofs);
-+         eltrans = test_fes->GetBdrElementTransformation(i);
- 
-          elmat.SetSize(test_vdofs.Size(), trial_vdofs.Size());
-          elmat = 0.0;
-@@ -1421,16 +1311,16 @@ void MixedBilinearForm::Assemble (int skip_zeros)
-             if (boundary_integs_marker[k] &&
-                 (*boundary_integs_marker[k])[bdr_attr-1] == 0) { continue; }
- 
--            boundary_integs[k]->AssembleElementMatrix2 (*trial_fes -> GetBE(i),
--                                                        *test_fes  -> GetBE(i),
--                                                        *eltrans, elemmat);
-+            boundary_integs[k]->AssembleElementMatrix2(*trial_fes->GetBE(i),
-+                                                       *test_fes->GetBE(i),
-+                                                       *eltrans, elemmat);
-             elmat += elemmat;
-          }
-          if (ran_dof_trans || dom_dof_trans)
-          {
-             TransformDual(ran_dof_trans, dom_dof_trans, elmat);
-          }
--         mat -> AddSubMatrix (test_vdofs, trial_vdofs, elmat, skip_zeros);
-+         mat->AddSubMatrix(test_vdofs, trial_vdofs, elmat, skip_zeros);
-       }
-    }
- 
-@@ -1497,7 +1387,7 @@ void MixedBilinearForm::Assemble (int skip_zeros)
-          }
-       }
- 
--      for (int i = 0; i < trial_fes -> GetNBE(); i++)
-+      for (int i = 0; i < trial_fes->GetNBE(); i++)
-       {
-          const int bdr_attr = mesh->GetBdrAttribute(i);
-          if (bdr_attr_marker[bdr_attr-1] == 0) { continue; }
-@@ -1530,6 +1420,37 @@ void MixedBilinearForm::Assemble (int skip_zeros)
-    }
- }
- 
-+void MixedBilinearForm::ConformingAssemble()
-+{
-+   if (assembly != AssemblyLevel::LEGACY)
-+   {
-+      MFEM_WARNING("Conforming assemble not supported for this assembly level!");
-+      return;
-+   }
-+
-+   const int remove_zeros = 0;
-+   Finalize(remove_zeros);
-+
-+   const SparseMatrix *test_P = test_fes->GetConformingProlongation();
-+   if (test_P)
-+   {
-+      SparseMatrix *RA = mfem::TransposeMult(*test_P, *mat);
-+      delete mat;
-+      mat = RA;
-+   }
-+
-+   const SparseMatrix *trial_P = trial_fes->GetConformingProlongation();
-+   if (trial_P)
-+   {
-+      SparseMatrix *RAP = mfem::Mult(*mat, *trial_P);
-+      delete mat;
-+      mat = RAP;
-+   }
-+
-+   height = mat->Height();
-+   width = mat->Width();
-+}
-+
- void MixedBilinearForm::AssembleDiagonal_ADAt(const Vector &D,
-                                               Vector &diag) const
- {
-@@ -1578,259 +1499,119 @@ void MixedBilinearForm::AssembleDiagonal_ADAt(const Vector &D,
-    }
- }
- 
--void MixedBilinearForm::ConformingAssemble()
-+void MixedBilinearForm::FormRectangularLinearSystem(
-+   const Array<int> &trial_tdof_list,
-+   const Array<int> &test_tdof_list,
-+   Vector &x, Vector &b,
-+   OperatorHandle &A,
-+   Vector &X, Vector &B)
- {
--   if (assembly != AssemblyLevel::LEGACY)
-+   if (ext)
-    {
--      MFEM_WARNING("Conforming assemble not supported for this assembly level!");
-+      Operator *oper;
-+      ext->FormRectangularLinearSystem(trial_tdof_list, test_tdof_list,
-+                                       x, b, oper, X, B);
-+      A.Reset(oper);
-       return;
-    }
- 
--   Finalize();
--
--   const SparseMatrix *P2 = test_fes->GetConformingProlongation();
--   if (P2)
--   {
--      SparseMatrix *R = Transpose(*P2);
--      SparseMatrix *RA = mfem::Mult(*R, *mat);
--      delete R;
--      delete mat;
--      mat = RA;
--   }
-+   const Operator *Pi = this->GetProlongation();
-+   const Operator *Po = this->GetOutputProlongation();
-+   const Operator *Ri = this->GetRestriction();
-+   InitTVectors(Po, Ri, Pi, x, b, X, B);
- 
--   const SparseMatrix *P1 = trial_fes->GetConformingProlongation();
--   if (P1)
-+   if (!mat_e)
-    {
--      SparseMatrix *RAP = mfem::Mult(*mat, *P1);
--      delete mat;
--      mat = RAP;
-+      // Set A = mat_e
-+      FormRectangularSystemMatrix(trial_tdof_list, test_tdof_list, A);
-    }
-+   // Eliminate essential BCs with B -= Ab xb
-+   mat_e->AddMult(X, B, -1.0);
- 
--   height = mat->Height();
--   width = mat->Width();
--}
--
--
--void MixedBilinearForm::ComputeElementMatrix(int i, DenseMatrix &elmat)
--{
--   if (domain_integs.Size())
--   {
--      const FiniteElement &trial_fe = *trial_fes->GetFE(i);
--      const FiniteElement &test_fe = *test_fes->GetFE(i);
--      ElementTransformation *eltrans = test_fes->GetElementTransformation(i);
--      domain_integs[0]->AssembleElementMatrix2(trial_fe, test_fe, *eltrans,
--                                               elmat);
--      for (int k = 1; k < domain_integs.Size(); k++)
--      {
--         domain_integs[k]->AssembleElementMatrix2(trial_fe, test_fe, *eltrans,
--                                                  elemmat);
--         elmat += elemmat;
--      }
--   }
--   else
--   {
--      trial_fes->GetElementVDofs(i, trial_vdofs);
--      test_fes->GetElementVDofs(i, test_vdofs);
--      elmat.SetSize(test_vdofs.Size(), trial_vdofs.Size());
--      elmat = 0.0;
--   }
-+   B.SetSubVector(test_tdof_list, 0.0);
- }
- 
--void MixedBilinearForm::ComputeBdrElementMatrix(int i, DenseMatrix &elmat)
-+void MixedBilinearForm::FormRectangularSystemMatrix(
-+   const Array<int> &trial_tdof_list,
-+   const Array<int> &test_tdof_list,
-+   OperatorHandle &A)
- {
--   if (boundary_integs.Size())
--   {
--      const FiniteElement &trial_be = *trial_fes->GetBE(i);
--      const FiniteElement &test_be = *test_fes->GetBE(i);
--      ElementTransformation *eltrans = test_fes->GetBdrElementTransformation(i);
--      boundary_integs[0]->AssembleElementMatrix2(trial_be, test_be, *eltrans,
--                                                 elmat);
--      for (int k = 1; k < boundary_integs.Size(); k++)
--      {
--         boundary_integs[k]->AssembleElementMatrix2(trial_be, test_be, *eltrans,
--                                                    elemmat);
--         elmat += elemmat;
--      }
--   }
--   else
-+   if (ext)
-    {
--      trial_fes->GetBdrElementVDofs(i, trial_vdofs);
--      test_fes->GetBdrElementVDofs(i, test_vdofs);
--      elmat.SetSize(test_vdofs.Size(), trial_vdofs.Size());
--      elmat = 0.0;
-+      Operator *oper;
-+      ext->FormRectangularSystemOperator(trial_tdof_list, test_tdof_list, oper);
-+      A.Reset(oper);
-+      return;
-    }
--}
- 
--void MixedBilinearForm::AssembleElementMatrix(
--   int i, const DenseMatrix &elmat, int skip_zeros)
--{
--   AssembleElementMatrix(i, elmat, trial_vdofs, test_vdofs, skip_zeros);
--}
-+   ConformingAssemble();
- 
--void MixedBilinearForm::AssembleElementMatrix(
--   int i, const DenseMatrix &elmat, Array<int> &trial_vdofs_,
--   Array<int> &test_vdofs_, int skip_zeros)
--{
--   trial_fes->GetElementVDofs(i, trial_vdofs_);
--   test_fes->GetElementVDofs(i, test_vdofs_);
--   if (mat == NULL)
--   {
--      mat = new SparseMatrix(height, width);
--   }
--   mat->AddSubMatrix(test_vdofs_, trial_vdofs_, elmat, skip_zeros);
--}
-+   Array<int> ess_trial_tdof_marker, ess_test_tdof_marker;
-+   FiniteElementSpace::ListToMarker(trial_tdof_list, trial_fes->GetTrueVSize(),
-+                                    ess_trial_tdof_marker);
-+   FiniteElementSpace::ListToMarker(test_tdof_list, test_fes->GetTrueVSize(),
-+                                    ess_test_tdof_marker);
- 
--void MixedBilinearForm::AssembleBdrElementMatrix(
--   int i, const DenseMatrix &elmat, int skip_zeros)
--{
--   AssembleBdrElementMatrix(i, elmat, trial_vdofs, test_vdofs, skip_zeros);
--}
-+   mat_e = new SparseMatrix(mat->Height(), mat->Width());
-+   mat->EliminateCols(ess_trial_tdof_marker, *mat_e);
- 
--void MixedBilinearForm::AssembleBdrElementMatrix(
--   int i, const DenseMatrix &elmat, Array<int> &trial_vdofs_,
--   Array<int> &test_vdofs_, int skip_zeros)
--{
--   trial_fes->GetBdrElementVDofs(i, trial_vdofs_);
--   test_fes->GetBdrElementVDofs(i, test_vdofs_);
--   if (mat == NULL)
-+   for (int i = 0; i < test_tdof_list.Size(); i++)
-    {
--      mat = new SparseMatrix(height, width);
-+      mat->EliminateRow(test_tdof_list[i]);
-    }
--   mat->AddSubMatrix(test_vdofs_, trial_vdofs_, elmat, skip_zeros);
-+   mat_e->Finalize();
-+   A.Reset(mat, false);
- }
- 
--void MixedBilinearForm::EliminateTrialDofs (
-+void MixedBilinearForm::EliminateTrialDofs(
-    const Array<int> &bdr_attr_is_ess, const Vector &sol, Vector &rhs )
- {
-    int i, j, k;
--   Array<int> tr_vdofs, cols_marker (trial_fes -> GetVSize());
-+   Array<int> tr_vdofs, cols_marker(trial_fes->GetVSize());
- 
-    cols_marker = 0;
--   for (i = 0; i < trial_fes -> GetNBE(); i++)
--      if (bdr_attr_is_ess[trial_fes -> GetBdrAttribute (i)-1])
-+   for (i = 0; i < trial_fes->GetNBE(); i++)
-+      if (bdr_attr_is_ess[trial_fes->GetBdrAttribute(i)-1])
-       {
--         trial_fes -> GetBdrElementVDofs (i, tr_vdofs);
-+         trial_fes->GetBdrElementVDofs(i, tr_vdofs);
-          for (j = 0; j < tr_vdofs.Size(); j++)
-          {
--            if ( (k = tr_vdofs[j]) < 0 )
-+            if ((k = tr_vdofs[j]) < 0)
-             {
-                k = -1-k;
-             }
-             cols_marker[k] = 1;
-          }
-       }
--   mat -> EliminateCols (cols_marker, &sol, &rhs);
-+   mat->EliminateCols(cols_marker, &sol, &rhs);
- }
- 
--void MixedBilinearForm::EliminateEssentialBCFromTrialDofs (
-+void MixedBilinearForm::EliminateEssentialBCFromTrialDofs(
-    const Array<int> &marked_vdofs, const Vector &sol, Vector &rhs)
- {
--   mat -> EliminateCols (marked_vdofs, &sol, &rhs);
-+   mat->EliminateCols(marked_vdofs, &sol, &rhs);
- }
- 
--void MixedBilinearForm::EliminateTestDofs (const Array<int> &bdr_attr_is_ess)
-+void MixedBilinearForm::EliminateTestDofs(const Array<int> &bdr_attr_is_ess)
- {
-    int i, j, k;
-    Array<int> te_vdofs;
- 
--   for (i = 0; i < test_fes -> GetNBE(); i++)
--      if (bdr_attr_is_ess[test_fes -> GetBdrAttribute (i)-1])
-+   for (i = 0; i < test_fes->GetNBE(); i++)
-+      if (bdr_attr_is_ess[test_fes->GetBdrAttribute(i)-1])
-       {
--         test_fes -> GetBdrElementVDofs (i, te_vdofs);
-+         test_fes->GetBdrElementVDofs(i, te_vdofs);
-          for (j = 0; j < te_vdofs.Size(); j++)
-          {
--            if ( (k = te_vdofs[j]) < 0 )
-+            if ((k = te_vdofs[j]) < 0)
-             {
-                k = -1-k;
-             }
--            mat -> EliminateRow (k);
-+            mat->EliminateRow(k);
-          }
-       }
- }
- 
--void MixedBilinearForm::FormRectangularSystemMatrix(
--   const Array<int> &trial_tdof_list,
--   const Array<int> &test_tdof_list,
--   OperatorHandle &A)
--
--{
--   if (ext)
--   {
--      ext->FormRectangularSystemOperator(trial_tdof_list, test_tdof_list, A);
--      return;
--   }
--
--   const SparseMatrix *test_P = test_fes->GetConformingProlongation();
--   const SparseMatrix *trial_P = trial_fes->GetConformingProlongation();
--
--   mat->Finalize();
--
--   if (test_P && trial_P)
--   {
--      SparseMatrix *m = RAP(*test_P, *mat, *trial_P);
--      delete mat;
--      mat = m;
--   }
--   else if (test_P)
--   {
--      SparseMatrix *m = TransposeMult(*test_P, *mat);
--      delete mat;
--      mat = m;
--   }
--   else if (trial_P)
--   {
--      SparseMatrix *m = mfem::Mult(*mat, *trial_P);
--      delete mat;
--      mat = m;
--   }
--
--   Array<int> ess_trial_tdof_marker, ess_test_tdof_marker;
--   FiniteElementSpace::ListToMarker(trial_tdof_list, trial_fes->GetTrueVSize(),
--                                    ess_trial_tdof_marker);
--   FiniteElementSpace::ListToMarker(test_tdof_list, test_fes->GetTrueVSize(),
--                                    ess_test_tdof_marker);
--
--   mat_e = new SparseMatrix(mat->Height(), mat->Width());
--   mat->EliminateCols(ess_trial_tdof_marker, *mat_e);
--
--   for (int i=0; i<test_tdof_list.Size(); ++i)
--   {
--      mat->EliminateRow(test_tdof_list[i]);
--   }
--   mat_e->Finalize();
--   A.Reset(mat, false);
--}
--
--void MixedBilinearForm::FormRectangularLinearSystem(
--   const Array<int> &trial_tdof_list,
--   const Array<int> &test_tdof_list,
--   Vector &x, Vector &b,
--   OperatorHandle &A,
--   Vector &X, Vector &B)
--{
--   if (ext)
--   {
--      ext->FormRectangularLinearSystem(trial_tdof_list, test_tdof_list,
--                                       x, b, A, X, B);
--      return;
--   }
--
--   const Operator *Pi = this->GetProlongation();
--   const Operator *Po = this->GetOutputProlongation();
--   const Operator *Ri = this->GetRestriction();
--   InitTVectors(Po, Ri, Pi, x, b, X, B);
--
--   if (!mat_e)
--   {
--      FormRectangularSystemMatrix(trial_tdof_list, test_tdof_list,
--                                  A); // Set A = mat_e
--   }
--   // Eliminate essential BCs with B -= Ab xb
--   mat_e->AddMult(X, B, -1.0);
--
--   B.SetSubVector(test_tdof_list, 0.0);
--}
--
- void MixedBilinearForm::Update()
- {
-    delete mat;
-@@ -1895,66 +1676,97 @@ void DiscreteLinearOperator::Assemble(int skip_zeros)
-       return;
-    }
- 
--   Array<int> dom_vdofs, ran_vdofs;
--   ElementTransformation *T;
--   DofTransformation * dom_dof_trans;
--   DofTransformation * ran_dof_trans;
--   const FiniteElement *dom_fe, *ran_fe;
--   DenseMatrix totelmat, elmat;
-+   ElementTransformation *eltrans;
-+   DofTransformation *dom_dof_trans;
-+   DofTransformation *ran_dof_trans;
-+   Mesh *mesh = test_fes->GetMesh();
-+   DenseMatrix elmat;
- 
-    if (mat == NULL)
-    {
-       mat = new SparseMatrix(height, width);
-    }
- 
--   if (domain_integs.Size() > 0)
-+   if (domain_integs.Size())
-    {
-       for (int i = 0; i < test_fes->GetNE(); i++)
-       {
--         dom_dof_trans = trial_fes->GetElementVDofs(i, dom_vdofs);
--         ran_dof_trans = test_fes->GetElementVDofs(i, ran_vdofs);
--         T = test_fes->GetElementTransformation(i);
--         dom_fe = trial_fes->GetFE(i);
--         ran_fe = test_fes->GetFE(i);
--
--         domain_integs[0]->AssembleElementMatrix2(*dom_fe, *ran_fe, *T,
--                                                  totelmat);
--         for (int j = 1; j < domain_integs.Size(); j++)
-+         dom_dof_trans = trial_fes->GetElementVDofs(i, trial_vdofs);
-+         ran_dof_trans = test_fes->GetElementVDofs(i, test_vdofs);
-+         eltrans = test_fes->GetElementTransformation(i);
-+
-+         elmat.SetSize(test_vdofs.Size(), trial_vdofs.Size());
-+         elmat = 0.0;
-+         for (int j = 0; j < domain_integs.Size(); j++)
-          {
--            domain_integs[j]->AssembleElementMatrix2(*dom_fe, *ran_fe, *T,
--                                                     elmat);
--            totelmat += elmat;
-+            domain_integs[j]->AssembleElementMatrix2(*trial_fes->GetFE(i),
-+                                                     *test_fes->GetFE(i),
-+                                                     *eltrans, elemmat);
-+            elmat += elemmat;
-          }
-          if (ran_dof_trans || dom_dof_trans)
-          {
--            TransformPrimal(ran_dof_trans, dom_dof_trans, totelmat);
-+            TransformPrimal(ran_dof_trans, dom_dof_trans, elmat);
-          }
--         mat->SetSubMatrix(ran_vdofs, dom_vdofs, totelmat, skip_zeros);
-+         mat->SetSubMatrix(test_vdofs, trial_vdofs, elmat, skip_zeros);
-       }
-    }
- 
-    if (trace_face_integs.Size())
-    {
--      const int nfaces = test_fes->GetMesh()->GetNumFaces();
-+      const int nfaces = mesh->GetNumFaces();
-       for (int i = 0; i < nfaces; i++)
-       {
--         trial_fes->GetFaceVDofs(i, dom_vdofs);
--         test_fes->GetFaceVDofs(i, ran_vdofs);
--         T = test_fes->GetMesh()->GetFaceTransformation(i);
--         dom_fe = trial_fes->GetFaceElement(i);
--         ran_fe = test_fes->GetFaceElement(i);
--
--         trace_face_integs[0]->AssembleElementMatrix2(*dom_fe, *ran_fe, *T,
--                                                      totelmat);
--         for (int j = 1; j < trace_face_integs.Size(); j++)
-+         trial_fes->GetFaceVDofs(i, trial_vdofs);
-+         test_fes->GetFaceVDofs(i, test_vdofs);
-+         eltrans = mesh->GetFaceTransformation(i);
-+
-+         elmat.SetSize(test_vdofs.Size(), trial_vdofs.Size());
-+         elmat = 0.0;
-+         for (int j = 0; j < trace_face_integs.Size(); j++)
-          {
--            trace_face_integs[j]->AssembleElementMatrix2(*dom_fe, *ran_fe, *T,
--                                                         elmat);
--            totelmat += elmat;
-+            trace_face_integs[j]->AssembleElementMatrix2(*trial_fes->GetFaceElement(i),
-+                                                         *test_fes->GetFaceElement(i),
-+                                                         *eltrans, elemmat);
-+            elmat += elemmat;
-          }
--         mat->SetSubMatrix(ran_vdofs, dom_vdofs, totelmat, skip_zeros);
-+         mat->SetSubMatrix(test_vdofs, trial_vdofs, elmat, skip_zeros);
-       }
-    }
- }
- 
-+void DiscreteLinearOperator::FormDiscreteOperatorMatrix(OperatorHandle &A)
-+{
-+   if (ext)
-+   {
-+      Operator *oper;
-+      ext->FormDiscreteOperator(oper);
-+      A.Reset(oper);
-+      return;
-+   }
-+
-+   mat->Finalize();
-+
-+   const SparseMatrix *test_R = test_fes->GetConformingRestriction();
-+   if (test_R)
-+   {
-+      SparseMatrix *RA = mfem::Mult(*test_R, *mat);
-+      delete mat;
-+      mat = RA;
-+   }
-+
-+   const SparseMatrix *trial_P = trial_fes->GetConformingProlongation();
-+   if (trial_P)
-+   {
-+      SparseMatrix *RAP = mfem::Mult(*mat, *trial_P);
-+      delete mat;
-+      mat = RAP;
-+   }
-+
-+   height = mat->Height();
-+   width = mat->Width();
-+
-+   A.Reset(mat, false);
-+}
-+
- }
-diff --git a/fem/bilinearform.hpp b/fem/bilinearform.hpp
-index b23df9280..b878b8d27 100644
---- a/fem/bilinearform.hpp
-+++ b/fem/bilinearform.hpp
-@@ -36,8 +36,6 @@ enum class AssemblyLevel
-    /// is fully evaluated on the fly.
-    /// This assembly level is ALWAYS performed on the host.
-    LEGACY = 0,
--   /// @deprecated Use LEGACY instead.
--   LEGACYFULL = 0,
-    /// Fully assembled form, i.e. a global sparse matrix in MFEM format. This
-    /// assembly is compatible with device execution.
-    FULL,
-@@ -66,7 +64,7 @@ protected:
-    SparseMatrix *mat;
- 
-    /** @brief Sparse Matrix \f$ M_e \f$ used to store the eliminations
--        from the b.c.  Owned.
-+        from the b.c. Owned.
-        \f$ M + M_e = M_{original} \f$ */
-    SparseMatrix *mat_e;
- 
-@@ -75,11 +73,11 @@ protected:
- 
-    /// The assembly level of the form (full, partial, etc.)
-    AssemblyLevel assembly;
--   /// Element batch size used in the form action (1, 8, num_elems, etc.)
--   int batch;
-+
-    /** @brief Extension for supporting Full Assembly (FA), Element Assembly (EA),
-        Partial Assembly (PA), or Matrix Free assembly (MF). */
-    BilinearFormExtension *ext;
-+
-    /** Indicates if the sparse matrix is sorted after assembly when using
-        Full Assembly (FA). */
-    bool sort_sparse_matrix = false;
-@@ -113,11 +111,6 @@ protected:
-    Array<BilinearFormIntegrator*> boundary_face_integs;
-    Array<Array<int>*> boundary_face_integs_marker; ///< Entries are not owned.
- 
--   DenseMatrix elemmat;
--   Array<int>  vdofs;
--
--   DenseTensor *element_matrices; ///< Owned.
--
-    StaticCondensation *static_cond; ///< Owned.
-    Hybridization *hybridization; ///< Owned.
- 
-@@ -126,31 +119,29 @@ protected:
-        the constrained DoFs. */
-    DiagonalPolicy diag_policy;
- 
--   int precompute_sparsity;
--   // Allocate appropriate SparseMatrix and assign it to mat
--   void AllocMat();
--
--   void ConformingAssemble();
-+   DenseMatrix elemmat;
-+   Array<int>  vdofs;
- 
-    // may be used in the construction of derived classes
--   BilinearForm() : Matrix (0)
-+   BilinearForm() : Matrix(0)
-    {
--      fes = NULL; sequence = -1;
--      mat = mat_e = NULL; extern_bfs = 0; element_matrices = NULL;
--      static_cond = NULL; hybridization = NULL;
--      precompute_sparsity = 0;
-+      fes = NULL;
-+      sequence = -1;
-+      mat = mat_e = NULL;
-+      extern_bfs = 0;
-+      static_cond = NULL;
-+      hybridization = NULL;
-       diag_policy = DIAG_KEEP;
-       assembly = AssemblyLevel::LEGACY;
--      batch = 1;
-       ext = NULL;
-    }
- 
- private:
--   /// Copy construction is not supported; body is undefined.
--   BilinearForm(const BilinearForm &);
-+   /// Copy construction is not supported.
-+   BilinearForm(const BilinearForm &) = delete;
- 
--   /// Copy assignment is not supported; body is undefined.
--   BilinearForm &operator=(const BilinearForm &);
-+   /// Copy assignment is not supported.
-+   BilinearForm &operator=(const BilinearForm &) = delete;
- 
- public:
-    /// Creates bilinear form associated with FE space @a *f.
-@@ -163,11 +154,8 @@ public:
-        The pointer @a f is not owned by the newly constructed object.
- 
-        The integrators in @a bf are copied as pointers and they are not owned by
--       the newly constructed BilinearForm.
--
--       The optional parameter @a ps is used to initialize the internal flag
--       #precompute_sparsity, see UsePrecomputedSparsity() for details. */
--   BilinearForm(FiniteElementSpace *f, BilinearForm *bf, int ps = 0);
-+       the newly constructed BilinearForm. */
-+   BilinearForm(FiniteElementSpace *f, BilinearForm *bf);
- 
-    /// Get the size of the BilinearForm as a square matrix.
-    int Size() const { return height; }
-@@ -184,6 +172,18 @@ public:
-        If used, this method must be called before assembly. */
-    void SetAssemblyLevel(AssemblyLevel assembly_level);
- 
-+   /// Returns the assembly level
-+   AssemblyLevel GetAssemblyLevel() const { return assembly; }
-+
-+   /// Sets diagonal policy used upon construction of the linear system.
-+   /** Policies include:
-+
-+       - DIAG_ZERO (Set the diagonal values to zero)
-+       - DIAG_ONE  (Set the diagonal values to one)
-+       - DIAG_KEEP (Keep the diagonal values)
-+   */
-+   void SetDiagonalPolicy(DiagonalPolicy policy) { diag_policy = policy; }
-+
-    /** @brief Force the sparse matrix column indices to be sorted when using
-        AssemblyLevel::FULL.
- 
-@@ -199,8 +199,16 @@ public:
-       sort_sparse_matrix = enable_it;
-    }
- 
--   /// Returns the assembly level
--   AssemblyLevel GetAssemblyLevel() const { return assembly; }
-+   /// Indicate that integrators are not owned by the BilinearForm
-+   void UseExternalIntegrators() { extern_bfs = 1; }
-+
-+   /// Enable hybridization.
-+   /** For details see the description for class
-+       Hybridization in fem/hybridization.hpp. This method should be called
-+       before assembly. */
-+   void EnableHybridization(FiniteElementSpace *constr_space,
-+                            BilinearFormIntegrator *constr_integ,
-+                            const Array<int> &ess_tdof_list);
- 
-    Hybridization *GetHybridization() const { return hybridization; }
- 
-@@ -218,60 +226,6 @@ public:
-    FiniteElementSpace *SCFESpace() const
-    { return static_cond ? static_cond->GetTraceFESpace() : NULL; }
- 
--   /// Enable hybridization.
--   /** For details see the description for class
--       Hybridization in fem/hybridization.hpp. This method should be called
--       before assembly. */
--   void EnableHybridization(FiniteElementSpace *constr_space,
--                            BilinearFormIntegrator *constr_integ,
--                            const Array<int> &ess_tdof_list);
--
--   /** @brief For scalar FE spaces, precompute the sparsity pattern of the matrix
--       (assuming dense element matrices) based on the types of integrators
--       present in the bilinear form. */
--   void UsePrecomputedSparsity(int ps = 1) { precompute_sparsity = ps; }
--
--   /** @brief Use the given CSR sparsity pattern to allocate the internal
--       SparseMatrix.
--
--       - The @a I and @a J arrays must define a square graph with size equal to
--         GetVSize() of the associated FiniteElementSpace.
--       - This method should be called after enabling static condensation or
--         hybridization, if used.
--       - In the case of static condensation, @a I and @a J are not used.
--       - The ownership of the arrays @a I and @a J remains with the caller. */
--   void UseSparsity(int *I, int *J, bool isSorted);
--
--   /// Use the sparsity of @a A to allocate the internal SparseMatrix.
--   void UseSparsity(SparseMatrix &A);
--
--   /// Pre-allocate the internal SparseMatrix before assembly.
--   /**  If the flag 'precompute sparsity'
--       is set, the matrix is allocated in CSR format (i.e.
--       finalized) and the entries are initialized with zeros. */
--   void AllocateMatrix() { if (mat == NULL) { AllocMat(); } }
--
--   /// Access all the integrators added with AddDomainIntegrator().
--   Array<BilinearFormIntegrator*> *GetDBFI() { return &domain_integs; }
--
--   /// Access all the integrators added with AddBoundaryIntegrator().
--   Array<BilinearFormIntegrator*> *GetBBFI() { return &boundary_integs; }
--   /** @brief Access all boundary markers added with AddBoundaryIntegrator().
--       If no marker was specified when the integrator was added, the
--       corresponding pointer (to Array<int>) will be NULL. */
--   Array<Array<int>*> *GetBBFI_Marker() { return &boundary_integs_marker; }
--
--   /// Access all integrators added with AddInteriorFaceIntegrator().
--   Array<BilinearFormIntegrator*> *GetFBFI() { return &interior_face_integs; }
--
--   /// Access all integrators added with AddBdrFaceIntegrator().
--   Array<BilinearFormIntegrator*> *GetBFBFI() { return &boundary_face_integs; }
--   /** @brief Access all boundary markers added with AddBdrFaceIntegrator().
--       If no marker was specified when the integrator was added, the
--       corresponding pointer (to Array<int>) will be NULL. */
--   Array<Array<int>*> *GetBFBFI_Marker()
--   { return &boundary_face_integs_marker; }
--
-    /// Returns a reference to: \f$ M_{ij} \f$
-    const double &operator()(int i, int j) { return (*mat)(i,j); }
- 
-@@ -284,42 +238,52 @@ public:
-    /// Matrix vector multiplication:  \f$ y = M x \f$
-    virtual void Mult(const Vector &x, Vector &y) const;
- 
-+   /// Add the matrix vector multiple to a vector:  \f$ y += a M x \f$
-+   virtual void AddMult(const Vector &x, Vector &y,
-+                        const double a = 1.0) const;
-+
-+   /// Matrix transpose vector multiplication:  \f$ y = M^T x \f$
-+   virtual void MultTranspose(const Vector &x, Vector &y) const;
-+
-+   /// Add the matrix transpose vector multiplication:  \f$ y += a M^T x \f$
-+   virtual void AddMultTranspose(const Vector &x, Vector &y,
-+                                 const double a = 1.0) const;
-+
-    /** @brief Matrix vector multiplication with the original uneliminated
-        matrix.  The original matrix is \f$ M + M_e \f$ so we have:
-        \f$ y = M x + M_e x \f$ */
-    void FullMult(const Vector &x, Vector &y) const
-    { mat->Mult(x, y); mat_e->AddMult(x, y); }
- 
--   /// Add the matrix vector multiple to a vector:  \f$ y += a M x \f$
--   virtual void AddMult(const Vector &x, Vector &y, const double a = 1.0) const
--   { mat -> AddMult (x, y, a); }
--
-    /** @brief Add the original uneliminated matrix vector multiple to a vector.
-        The original matrix is \f$ M + Me \f$ so we have:
-        \f$ y += M x + M_e x \f$ */
-    void FullAddMult(const Vector &x, Vector &y) const
-    { mat->AddMult(x, y); mat_e->AddMult(x, y); }
- 
--   /// Add the matrix transpose vector multiplication:  \f$ y += a M^T x \f$
--   virtual void AddMultTranspose(const Vector & x, Vector & y,
--                                 const double a = 1.0) const
--   { mat->AddMultTranspose(x, y, a); }
--
-    /** @brief Add the original uneliminated matrix transpose vector
-        multiple to a vector. The original matrix is \f$ M + M_e \f$
-        so we have: \f$ y += M^T x + {M_e}^T x \f$ */
--   void FullAddMultTranspose(const Vector & x, Vector & y) const
-+   void FullAddMultTranspose(const Vector &x, Vector &y) const
-    { mat->AddMultTranspose(x, y); mat_e->AddMultTranspose(x, y); }
- 
--   /// Matrix transpose vector multiplication:  \f$ y = M^T x \f$
--   virtual void MultTranspose(const Vector & x, Vector & y) const;
-+   /// Compute inner product for full uneliminated matrix \f$ y^T M x + y^T M_e x \f$
-+   double FullInnerProduct(const Vector &x, const Vector &y) const
-+   { return mat->InnerProduct(x, y) + mat_e->InnerProduct(x, y); }
-+
-+   /// Returns a pointer to (approximation) of the matrix inverse:  \f$ M^{-1} \f$
-+   virtual MatrixInverse *Inverse() const;
- 
-    /// Compute \f$ y^T M x \f$
-    double InnerProduct(const Vector &x, const Vector &y) const
--   { return mat->InnerProduct (x, y); }
-+   { return mat->InnerProduct(x, y); }
- 
--   /// Returns a pointer to (approximation) of the matrix inverse:  \f$ M^{-1} \f$
--   virtual MatrixInverse *Inverse() const;
-+   /// Sets all sparse values of \f$ M \f$ and \f$ M_e \f$ to 'a'.
-+   void operator=(const double a)
-+   {
-+      if (mat != NULL) { *mat = a; }
-+      if (mat_e != NULL) { *mat_e = a; }
-+   }
- 
-    /// Finalizes the matrix initialization.
-    virtual void Finalize(int skip_zeros = 1);
-@@ -345,15 +309,7 @@ public:
-    /** @brief Returns true if the sparse matrix is not null, false otherwise.
- 
-        @sa SpMat(). */
--   bool HasSpMat()
--   {
--      return mat != nullptr;
--   }
--
--
--   /**  @brief Nullifies the internal matrix \f$ M \f$ and returns a pointer
--        to it.  Used for transferring ownership. */
--   SparseMatrix *LoseMat() { SparseMatrix *tmp = mat; mat = NULL; return tmp; }
-+   bool HasSpMat() const { return mat != nullptr; }
- 
-    /** @brief Returns a const reference to the sparse matrix of eliminated b.c.:
-        \f$ M_e \f$
-@@ -379,11 +335,31 @@ public:
-         false otherwise.
- 
-         @sa SpMatElim(). */
--   bool HasSpMatElim()
-+   bool HasSpMatElim()const { return mat_e != nullptr; }
-+
-+   /**  @brief Nullifies the internal matrix \f$ M \f$ and returns a pointer
-+        to it.  Used for transferring ownership. */
-+   SparseMatrix *LoseMat() { SparseMatrix *tmp = mat; mat = NULL; return tmp; }
-+
-+   /** Returns a const reference to the extension for assembly levels other
-+    than AssemblyLevel::LEGACY. */
-+   const BilinearFormExtension &Ext() const
-    {
--      return mat_e != nullptr;
-+      MFEM_VERIFY(ext, "ext is NULL and can't be dereferenced");
-+      return *ext;
-    }
- 
-+   /** Returns a reference to the extension for assembly levels other than
-+       AssemblyLevel::LEGACY. */
-+   BilinearFormExtension &Ext()
-+   {
-+      MFEM_VERIFY(ext, "ext is NULL and can't be dereferenced");
-+      return *ext;
-+   }
-+
-+   /// Returns true if the extension is not null, false otherwise.
-+   bool HasExt() const { return ext != nullptr; }
-+
-    /// Adds new Domain Integrator. Assumes ownership of @a bfi.
-    void AddDomainIntegrator(BilinearFormIntegrator *bfi);
-    /// Adds new Domain Integrator restricted to certain elements specified by
-@@ -416,16 +392,72 @@ public:
-    void AddBdrFaceIntegrator(BilinearFormIntegrator *bfi,
-                              Array<int> &bdr_marker);
- 
--   /// Sets all sparse values of \f$ M \f$ and \f$ M_e \f$ to 'a'.
--   void operator=(const double a)
--   {
--      if (mat != NULL) { *mat = a; }
--      if (mat_e != NULL) { *mat_e = a; }
--   }
-+   /// Access all the integrators added with AddDomainIntegrator().
-+   Array<BilinearFormIntegrator*> *GetDBFI() { return &domain_integs; }
-+
-+   /// Access all the integrators added with AddBoundaryIntegrator().
-+   Array<BilinearFormIntegrator*> *GetBBFI() { return &boundary_integs; }
-+   /** @brief Access all boundary markers added with AddBoundaryIntegrator().
-+       If no marker was specified when the integrator was added, the
-+       corresponding pointer (to Array<int>) will be NULL. */
-+   Array<Array<int>*> *GetBBFI_Marker() { return &boundary_integs_marker; }
-+
-+   /// Access all integrators added with AddInteriorFaceIntegrator().
-+   Array<BilinearFormIntegrator*> *GetFBFI() { return &interior_face_integs; }
-+
-+   /// Access all integrators added with AddBdrFaceIntegrator().
-+   Array<BilinearFormIntegrator*> *GetBFBFI() { return &boundary_face_integs; }
-+   /** @brief Access all boundary markers added with AddBdrFaceIntegrator().
-+       If no marker was specified when the integrator was added, the
-+       corresponding pointer (to Array<int>) will be NULL. */
-+   Array<Array<int>*> *GetBFBFI_Marker()
-+   { return &boundary_face_integs_marker; }
-+
-+   /// Assemble the given element matrix
-+   /** The element matrix @a elmat is assembled for the element @a i, i.e.
-+       added to the system matrix. The flag @a skip_zeros skips the zero
-+       elements of the matrix, unless they are breaking the symmetry of
-+       the system matrix.
-+   */
-+   void AssembleElementMatrix(int i, const DenseMatrix &elmat,
-+                              int skip_zeros = 1);
-+
-+   /// Assemble the given element matrix
-+   /** The element matrix @a elmat is assembled for the element @a i, i.e.
-+       added to the system matrix. The vdofs of the element are returned
-+       in @a vdofs. The flag @a skip_zeros skips the zero elements of the
-+       matrix, unless they are breaking the symmetry of the system matrix.
-+   */
-+   void AssembleElementMatrix(int i, const DenseMatrix &elmat,
-+                              Array<int> &vdofs, int skip_zeros = 1);
-+
-+   /// Assemble the given boundary element matrix
-+   /** The boundary element matrix @a elmat is assembled for the boundary
-+       element @a i, i.e. added to the system matrix. The flag @a skip_zeros
-+       skips the zero elements of the matrix, unless they are breaking the
-+       symmetry of the system matrix.
-+   */
-+   void AssembleBdrElementMatrix(int i, const DenseMatrix &elmat,
-+                                 int skip_zeros = 1);
-+
-+   /// Assemble the given boundary element matrix
-+   /** The boundary element matrix @a elmat is assembled for the boundary
-+       element @a i, i.e. added to the system matrix. The vdofs of the element
-+       are returned in @a vdofs. The flag @a skip_zeros skips the zero elements
-+       of the matrix, unless they are breaking the symmetry of the system matrix.
-+   */
-+   void AssembleBdrElementMatrix(int i, const DenseMatrix &elmat,
-+                                 Array<int> &vdofs, int skip_zeros = 1);
- 
-    /// Assembles the form i.e. sums over all domain/bdr integrators.
-    void Assemble(int skip_zeros = 1);
- 
-+   /** For a partially conforming FE space, complete the assembly process by
-+       performing A := P^t A P where A is the internal sparse matrix; P is the
-+       conforming prolongation matrices of the FE space. After this call the
-+       BilinearForm becomes an operator on the conforming FE spaces. */
-+   void ConformingAssemble();
-+
-    /** @brief Assemble the diagonal of the bilinear form into @a diag. Note that
-        @a diag is a tdof Vector.
- 
-@@ -439,32 +471,20 @@ public:
- 
-    /// Get the finite element space prolongation operator.
-    virtual const Operator *GetProlongation() const
--   { return fes->GetConformingProlongation(); }
-+   { return fes->GetProlongationMatrix(); }
-+
-    /// Get the finite element space restriction operator
-    virtual const Operator *GetRestriction() const
--   { return fes->GetConformingRestriction(); }
-+   { return fes->GetRestrictionMatrix(); }
-+
-    /// Get the output finite element space prolongation matrix
-    virtual const Operator *GetOutputProlongation() const
-    { return GetProlongation(); }
--   /** @brief Returns the output fe space restriction matrix, transposed
- 
--       Logically, this is the transpose of GetOutputRestriction, but in
--       practice it is convenient to have it in transposed form for
--       construction of RAP operators in matrix-free methods. */
--   virtual const Operator *GetOutputRestrictionTranspose() const
--   { return GetOutputProlongation(); }
-    /// Get the output finite element space restriction matrix
-    virtual const Operator *GetOutputRestriction() const
-    { return GetRestriction(); }
- 
--   /// @brief Compute serial RAP operator and store it in @a A as a SparseMatrix.
--   void SerialRAP(OperatorHandle &A)
--   {
--      MFEM_ASSERT(mat, "SerialRAP requires the SparseMatrix to be assembled.");
--      ConformingAssemble();
--      A.Reset(mat, false);
--   }
--
-    /** @brief Form the linear system A X = B, corresponding to this bilinear
-        form and the linear form @a b(.). */
-    /** This method applies any necessary transformations to the linear system
-@@ -541,59 +561,6 @@ public:
-    */
-    virtual void RecoverFEMSolution(const Vector &X, const Vector &b, Vector &x);
- 
--   /// Compute and store internally all element matrices.
--   void ComputeElementMatrices();
--
--   /// Free the memory used by the element matrices.
--   void FreeElementMatrices()
--   { delete element_matrices; element_matrices = NULL; }
--
--   /// Compute the element matrix of the given element
--   /** The element matrix is computed by calling the domain integrators
--       or the one stored internally by a prior call of ComputeElementMatrices()
--       is returned when available.
--   */
--   void ComputeElementMatrix(int i, DenseMatrix &elmat);
--
--   /// Compute the boundary element matrix of the given boundary element
--   void ComputeBdrElementMatrix(int i, DenseMatrix &elmat);
--
--   /// Assemble the given element matrix
--   /** The element matrix @a elmat is assembled for the element @a i, i.e.
--       added to the system matrix. The flag @a skip_zeros skips the zero
--       elements of the matrix, unless they are breaking the symmetry of
--       the system matrix.
--   */
--   void AssembleElementMatrix(int i, const DenseMatrix &elmat,
--                              int skip_zeros = 1);
--
--   /// Assemble the given element matrix
--   /** The element matrix @a elmat is assembled for the element @a i, i.e.
--       added to the system matrix. The vdofs of the element are returned
--       in @a vdofs. The flag @a skip_zeros skips the zero elements of the
--       matrix, unless they are breaking the symmetry of the system matrix.
--   */
--   void AssembleElementMatrix(int i, const DenseMatrix &elmat,
--                              Array<int> &vdofs, int skip_zeros = 1);
--
--   /// Assemble the given boundary element matrix
--   /** The boundary element matrix @a elmat is assembled for the boundary
--       element @a i, i.e. added to the system matrix. The flag @a skip_zeros
--       skips the zero elements of the matrix, unless they are breaking the
--       symmetry of the system matrix.
--   */
--   void AssembleBdrElementMatrix(int i, const DenseMatrix &elmat,
--                                 int skip_zeros = 1);
--
--   /// Assemble the given boundary element matrix
--   /** The boundary element matrix @a elmat is assembled for the boundary
--       element @a i, i.e. added to the system matrix. The vdofs of the element
--       are returned in @a vdofs. The flag @a skip_zeros skips the zero elements
--       of the matrix, unless they are breaking the symmetry of the system matrix.
--   */
--   void AssembleBdrElementMatrix(int i, const DenseMatrix &elmat,
--                                 Array<int> &vdofs, int skip_zeros = 1);
--
-    /// Eliminate essential boundary DOFs from the system.
-    /** The array @a bdr_attr_is_ess marks boundary attributes that constitute
-        the essential part of the boundary. By default, the diagonal at the
-@@ -645,34 +612,14 @@ public:
-    void EliminateVDofsInRHS(const Array<int> &vdofs, const Vector &x,
-                             Vector &b);
- 
--   /// Compute inner product for full uneliminated matrix \f$ y^T M x + y^T M_e x \f$
--   double FullInnerProduct(const Vector &x, const Vector &y) const
--   { return mat->InnerProduct(x, y) + mat_e->InnerProduct(x, y); }
--
-    /// Update the @a FiniteElementSpace and delete all data associated with the old one.
-    virtual void Update(FiniteElementSpace *nfes = NULL);
- 
--   /// (DEPRECATED) Return the FE space associated with the BilinearForm.
--   /** @deprecated Use FESpace() instead. */
--   MFEM_DEPRECATED FiniteElementSpace *GetFES() { return fes; }
--
-    /// Return the FE space associated with the BilinearForm.
-    FiniteElementSpace *FESpace() { return fes; }
-    /// Read-only access to the associated FiniteElementSpace.
-    const FiniteElementSpace *FESpace() const { return fes; }
- 
--   /// Sets diagonal policy used upon construction of the linear system.
--   /** Policies include:
--
--       - DIAG_ZERO (Set the diagonal values to zero)
--       - DIAG_ONE  (Set the diagonal values to one)
--       - DIAG_KEEP (Keep the diagonal values)
--   */
--   void SetDiagonalPolicy(DiagonalPolicy policy);
--
--   /// Indicate that integrators are not owned by the BilinearForm
--   void UseExternalIntegrators() { extern_bfs = 1; }
--
-    /// Destroys bilinear form.
-    virtual ~BilinearForm();
- };
-@@ -696,11 +643,12 @@ public:
- class MixedBilinearForm : public Matrix
- {
- protected:
--   SparseMatrix *mat; ///< Owned.
--   SparseMatrix *mat_e; ///< Owned.
-+   /** Sparse matrices associated with the form and the eliminations from
-+       the b.c. Owned. */
-+   SparseMatrix *mat, *mat_e;
- 
--   FiniteElementSpace *trial_fes, ///< Not owned
--                      *test_fes;  ///< Not owned
-+   /// FE space on which the form lives. Not owned.
-+   FiniteElementSpace *trial_fes, *test_fes;
- 
-    /// The form assembly level (full, partial, etc.)
-    AssemblyLevel assembly;
-@@ -733,11 +681,11 @@ protected:
-    Array<int>  trial_vdofs, test_vdofs;
- 
- private:
--   /// Copy construction is not supported; body is undefined.
--   MixedBilinearForm(const MixedBilinearForm &);
-+   /// Copy construction is not supported.
-+   MixedBilinearForm(const MixedBilinearForm &) = delete;
- 
--   /// Copy assignment is not supported; body is undefined.
--   MixedBilinearForm &operator=(const MixedBilinearForm &);
-+   /// Copy assignment is not supported.
-+   MixedBilinearForm &operator=(const MixedBilinearForm &) = delete;
- 
- public:
-    /** @brief Construct a MixedBilinearForm on the given trial, @a tr_fes, and
-@@ -760,6 +708,13 @@ public:
-                      FiniteElementSpace *te_fes,
-                      MixedBilinearForm *mbf);
- 
-+   /// Set the desired assembly level. The default is AssemblyLevel::LEGACY.
-+   /** This method must be called before assembly. */
-+   void SetAssemblyLevel(AssemblyLevel assembly_level);
-+
-+   /// Returns the assembly level
-+   AssemblyLevel GetAssemblyLevel() const { return assembly; }
-+
-    /// Returns a reference to: \f$ M_{ij} \f$
-    virtual double &Elem(int i, int j);
- 
-@@ -767,17 +722,21 @@ public:
-    virtual const double &Elem(int i, int j) const;
- 
-    /// Matrix multiplication: \f$ y = M x \f$
--   virtual void Mult(const Vector & x, Vector & y) const;
-+   virtual void Mult(const Vector &x, Vector &y) const;
- 
--   virtual void AddMult(const Vector & x, Vector & y,
-+   virtual void AddMult(const Vector &x, Vector &y,
-                         const double a = 1.0) const;
- 
--   virtual void MultTranspose(const Vector & x, Vector & y) const;
--   virtual void AddMultTranspose(const Vector & x, Vector & y,
-+   virtual void MultTranspose(const Vector &x, Vector &y) const;
-+
-+   virtual void AddMultTranspose(const Vector &x, Vector &y,
-                                  const double a = 1.0) const;
- 
-    virtual MatrixInverse *Inverse() const;
- 
-+   /// Sets all sparse values of \f$ M \f$ to @a a.
-+   void operator=(const double a) { *mat = a; }
-+
-    /// Finalizes the matrix initialization.
-    virtual void Finalize(int skip_zeros = 1);
- 
-@@ -787,15 +746,45 @@ public:
-    void GetBlocks(Array2D<SparseMatrix *> &blocks) const;
- 
-    /// Returns a const reference to the sparse matrix:  \f$ M \f$
--   const SparseMatrix &SpMat() const { return *mat; }
-+   const SparseMatrix &SpMat() const
-+   {
-+      MFEM_VERIFY(mat, "mat is NULL and can't be dereferenced");
-+      return *mat;
-+   }
- 
-    /// Returns a reference to the sparse matrix:  \f$ M \f$
--   SparseMatrix &SpMat() { return *mat; }
-+   SparseMatrix &SpMat()
-+   {
-+      MFEM_VERIFY(mat, "mat is NULL and can't be dereferenced");
-+      return *mat;
-+   }
-+
-+   /// Returns true if the sparse matrix is not null, false otherwise.
-+   bool HasSpMat() const { return mat != nullptr; }
- 
-    /**  @brief Nullifies the internal matrix \f$ M \f$ and returns a pointer
-         to it.  Used for transferring ownership. */
-    SparseMatrix *LoseMat() { SparseMatrix *tmp = mat; mat = NULL; return tmp; }
- 
-+   /** Returns a const reference to the extension for assembly levels other
-+    than AssemblyLevel::LEGACY. */
-+   const MixedBilinearFormExtension &Ext() const
-+   {
-+      MFEM_VERIFY(ext, "ext is NULL and can't be dereferenced");
-+      return *ext;
-+   }
-+
-+   /** Returns a reference to the extension for assembly levels other than
-+       AssemblyLevel::LEGACY. */
-+   MixedBilinearFormExtension &Ext()
-+   {
-+      MFEM_VERIFY(ext, "ext is NULL and can't be dereferenced");
-+      return *ext;
-+   }
-+
-+   /// Returns true if the extension is not null, false otherwise.
-+   bool HasExt() const { return ext != nullptr; }
-+
-    /// Adds a domain integrator. Assumes ownership of @a bfi.
-    void AddDomainIntegrator(BilinearFormIntegrator *bfi);
- 
-@@ -803,8 +792,8 @@ public:
-    void AddBoundaryIntegrator(BilinearFormIntegrator *bfi);
- 
-    /// Adds a boundary integrator. Assumes ownership of @a bfi.
--   void AddBoundaryIntegrator (BilinearFormIntegrator * bfi,
--                               Array<int> &bdr_marker);
-+   void AddBoundaryIntegrator(BilinearFormIntegrator *bfi,
-+                              Array<int> &bdr_marker);
- 
-    /** @brief Add a trace face integrator. Assumes ownership of @a bfi.
- 
-@@ -814,11 +803,11 @@ public:
-    void AddTraceFaceIntegrator(BilinearFormIntegrator *bfi);
- 
-    /// Adds a boundary trace face integrator. Assumes ownership of @a bfi.
--   void AddBdrTraceFaceIntegrator (BilinearFormIntegrator * bfi);
-+   void AddBdrTraceFaceIntegrator(BilinearFormIntegrator *bfi);
- 
-    /// Adds a boundary trace face integrator. Assumes ownership of @a bfi.
--   void AddBdrTraceFaceIntegrator (BilinearFormIntegrator * bfi,
--                                   Array<int> &bdr_marker);
-+   void AddBdrTraceFaceIntegrator(BilinearFormIntegrator *bfi,
-+                                  Array<int> &bdr_marker);
- 
-    /// Access all integrators added with AddDomainIntegrator().
-    Array<BilinearFormIntegrator*> *GetDBFI() { return &domain_integs; }
-@@ -842,48 +831,6 @@ public:
-    Array<Array<int>*> *GetBTFBFI_Marker()
-    { return &boundary_trace_face_integs_marker; }
- 
--   /// Sets all sparse values of \f$ M \f$ to @a a.
--   void operator=(const double a) { *mat = a; }
--
--   /// Set the desired assembly level. The default is AssemblyLevel::LEGACY.
--   /** This method must be called before assembly. */
--   void SetAssemblyLevel(AssemblyLevel assembly_level);
--
--   void Assemble(int skip_zeros = 1);
--
--   /** @brief Assemble the diagonal of ADA^T into diag, where A is this mixed
--       bilinear form and D is a diagonal. */
--   void AssembleDiagonal_ADAt(const Vector &D, Vector &diag) const;
--
--   /// Get the input finite element space prolongation matrix
--   virtual const Operator *GetProlongation() const
--   { return trial_fes->GetProlongationMatrix(); }
--
--   /// Get the input finite element space restriction matrix
--   virtual const Operator *GetRestriction() const
--   { return trial_fes->GetRestrictionMatrix(); }
--
--   /// Get the test finite element space prolongation matrix
--   virtual const Operator *GetOutputProlongation() const
--   { return test_fes->GetProlongationMatrix(); }
--
--   /// Get the test finite element space restriction matrix
--   virtual const Operator *GetOutputRestriction() const
--   { return test_fes->GetRestrictionMatrix(); }
--
--   /** For partially conforming trial and/or test FE spaces, complete the
--       assembly process by performing A := P2^t A P1 where A is the internal
--       sparse matrix; P1 and P2 are the conforming prolongation matrices of the
--       trial and test FE spaces, respectively. After this call the
--       MixedBilinearForm becomes an operator on the conforming FE spaces. */
--   void ConformingAssemble();
--
--   /// Compute the element matrix of the given element
--   void ComputeElementMatrix(int i, DenseMatrix &elmat);
--
--   /// Compute the boundary element matrix of the given boundary element
--   void ComputeBdrElementMatrix(int i, DenseMatrix &elmat);
--
-    /// Assemble the given element matrix
-    /** The element matrix @a elmat is assembled for the element @a i, i.e.
-        added to the system matrix. The flag @a skip_zeros skips the zero
-@@ -924,40 +871,34 @@ public:
-                                  Array<int> &trial_vdofs, Array<int> &test_vdofs,
-                                  int skip_zeros = 1);
- 
--   void EliminateTrialDofs(const Array<int> &bdr_attr_is_ess,
--                           const Vector &sol, Vector &rhs);
-+   void Assemble(int skip_zeros = 1);
- 
--   void EliminateEssentialBCFromTrialDofs(const Array<int> &marked_vdofs,
--                                          const Vector &sol, Vector &rhs);
-+   /** For partially conforming trial and/or test FE spaces, complete the
-+       assembly process by performing A := P2^t A P1 where A is the internal
-+       sparse matrix; P1 and P2 are the conforming prolongation matrices of the
-+       trial and test FE spaces, respectively. After this call the
-+       MixedBilinearForm becomes an operator on the conforming FE spaces. */
-+   void ConformingAssemble();
- 
--   virtual void EliminateTestDofs(const Array<int> &bdr_attr_is_ess);
-+   /** @brief Assemble the diagonal of ADA^T into diag, where A is this mixed
-+       bilinear form and D is a diagonal. */
-+   void AssembleDiagonal_ADAt(const Vector &D, Vector &diag) const;
- 
--   /** @brief Return in @a A that is column-constrained.
-+   /// Get the input finite element space prolongation matrix
-+   virtual const Operator *GetProlongation() const
-+   { return trial_fes->GetProlongationMatrix(); }
- 
--      This returns the same operator as FormRectangularLinearSystem(), but does
--      without the transformations of the right-hand side. */
--   virtual void FormRectangularSystemMatrix(const Array<int> &trial_tdof_list,
--                                            const Array<int> &test_tdof_list,
--                                            OperatorHandle &A);
-+   /// Get the input finite element space restriction matrix
-+   virtual const Operator *GetRestriction() const
-+   { return trial_fes->GetRestrictionMatrix(); }
- 
--   /** @brief Form the column-constrained linear system matrix A.
--       See FormRectangularSystemMatrix() for details.
-+   /// Get the test finite element space prolongation matrix
-+   virtual const Operator *GetOutputProlongation() const
-+   { return test_fes->GetProlongationMatrix(); }
- 
--       Version of the method FormRectangularSystemMatrix() where the system matrix is
--       returned in the variable @a A, of type OpType, holding a *reference* to
--       the system matrix (created with the method OpType::MakeRef()). The
--       reference will be invalidated when SetOperatorType(), Update(), or the
--       destructor is called. */
--   template <typename OpType>
--   void FormRectangularSystemMatrix(const Array<int> &trial_tdof_list,
--                                    const Array<int> &test_tdof_list, OpType &A)
--   {
--      OperatorHandle Ah;
--      FormRectangularSystemMatrix(trial_tdof_list, test_tdof_list, Ah);
--      OpType *A_ptr = Ah.Is<OpType>();
--      MFEM_VERIFY(A_ptr, "invalid OpType used");
--      A.MakeRef(*A_ptr);
--   }
-+   /// Get the test finite element space restriction matrix
-+   virtual const Operator *GetOutputRestriction() const
-+   { return test_fes->GetRestrictionMatrix(); }
- 
-    /** @brief Form the linear system A X = B, corresponding to this mixed bilinear
-        form and the linear form @a b(.).
-@@ -992,6 +933,41 @@ public:
-       A.MakeRef(*A_ptr);
-    }
- 
-+   /** @brief Return in @a A that is column-constrained.
-+
-+      This returns the same operator as FormRectangularLinearSystem(), but does
-+      without the transformations of the right-hand side. */
-+   virtual void FormRectangularSystemMatrix(const Array<int> &trial_tdof_list,
-+                                            const Array<int> &test_tdof_list,
-+                                            OperatorHandle &A);
-+
-+   /** @brief Form the column-constrained linear system matrix A.
-+       See FormRectangularSystemMatrix() for details.
-+
-+       Version of the method FormRectangularSystemMatrix() where the system matrix is
-+       returned in the variable @a A, of type OpType, holding a *reference* to
-+       the system matrix (created with the method OpType::MakeRef()). The
-+       reference will be invalidated when SetOperatorType(), Update(), or the
-+       destructor is called. */
-+   template <typename OpType>
-+   void FormRectangularSystemMatrix(const Array<int> &trial_tdof_list,
-+                                    const Array<int> &test_tdof_list, OpType &A)
-+   {
-+      OperatorHandle Ah;
-+      FormRectangularSystemMatrix(trial_tdof_list, test_tdof_list, Ah);
-+      OpType *A_ptr = Ah.Is<OpType>();
-+      MFEM_VERIFY(A_ptr, "invalid OpType used");
-+      A.MakeRef(*A_ptr);
-+   }
-+
-+   void EliminateTrialDofs(const Array<int> &bdr_attr_is_ess,
-+                           const Vector &sol, Vector &rhs);
-+
-+   void EliminateEssentialBCFromTrialDofs(const Array<int> &marked_vdofs,
-+                                          const Vector &sol, Vector &rhs);
-+
-+   virtual void EliminateTestDofs(const Array<int> &bdr_attr_is_ess);
-+
-    void Update();
- 
-    /// Return the trial FE space associated with the BilinearForm.
-@@ -1041,11 +1017,11 @@ public:
- class DiscreteLinearOperator : public MixedBilinearForm
- {
- private:
--   /// Copy construction is not supported; body is undefined.
--   DiscreteLinearOperator(const DiscreteLinearOperator &);
-+   /// Copy construction is not supported.
-+   DiscreteLinearOperator(const DiscreteLinearOperator &) = delete;
- 
--   /// Copy assignment is not supported; body is undefined.
--   DiscreteLinearOperator &operator=(const DiscreteLinearOperator &);
-+   /// Copy assignment is not supported.
-+   DiscreteLinearOperator &operator=(const DiscreteLinearOperator &) = delete;
- 
- public:
-    /** @brief Construct a DiscreteLinearOperator on the given
-@@ -1065,9 +1041,12 @@ public:
-    { AddTraceFaceIntegrator(di); }
- 
-    /// Access all interpolators added with AddDomainInterpolator().
--   Array<BilinearFormIntegrator*> *GetDI() { return &domain_integs; }
-+   Array<BilinearFormIntegrator*> *GetDI() { return GetDBFI(); }
-+
-+   /// Access all interpolators added with AddTraceFaceInterpolator().
-+   Array<BilinearFormIntegrator*> *GetTFI() { return GetTFBFI(); }
- 
--   /// Set the desired assembly level. The default is AssemblyLevel::FULL.
-+   /// Set the desired assembly level. The default is AssemblyLevel::LEGACY.
-    /** This method must be called before assembly. */
-    void SetAssemblyLevel(AssemblyLevel assembly_level);
- 
-@@ -1075,10 +1054,26 @@ public:
-        linear operator. */
-    virtual void Assemble(int skip_zeros = 1);
- 
--   /** @brief Get the output finite element space restriction matrix in
--       transposed form. */
--   virtual const Operator *GetOutputRestrictionTranspose() const
--   { return test_fes->GetRestrictionTransposeOperator(); }
-+   /** @brief Return in @a A that is column-constrained. */
-+   virtual void FormDiscreteOperatorMatrix(OperatorHandle &A);
-+
-+   /** @brief Form the column-constrained discrete linear operator matrix A.
-+       See FormDiscreteOperatorMatrix() for details.
-+
-+       Version of the method FormDiscreteOperatorMatrix() where the discrete
-+       operator matrix is returned in the variable @a A, of type OpType,
-+       holding a *reference* to the discrete operator  matrix (created with the
-+       method OpType::MakeRef()). The reference will be invalidated when
-+       SetOperatorType(), Update(), or the destructor is called. */
-+   template <typename OpType>
-+   void FormDiscreteOperatorMatrix(OpType &A)
-+   {
-+      OperatorHandle Ah;
-+      FormDiscreteOperatorMatrix(Ah);
-+      OpType *A_ptr = Ah.Is<OpType>();
-+      MFEM_VERIFY(A_ptr, "invalid OpType used");
-+      A.MakeRef(*A_ptr);
-+   }
- };
- 
- }
-diff --git a/fem/bilinearform_ext.cpp b/fem/bilinearform_ext.cpp
-index 50c2cf198..90a1655f4 100644
---- a/fem/bilinearform_ext.cpp
-+++ b/fem/bilinearform_ext.cpp
-@@ -21,10 +21,10 @@
- namespace mfem
- {
- 
-+/// Base class for extensions to the BilinearForm class
- BilinearFormExtension::BilinearFormExtension(BilinearForm *form)
-    : Operator(form->Size()), a(form)
- {
--   // empty
- }
- 
- const Operator *BilinearFormExtension::GetProlongation() const
-@@ -37,812 +37,1226 @@ const Operator *BilinearFormExtension::GetRestriction() const
-    return a->GetRestriction();
- }
- 
--// Data and methods for partially-assembled bilinear forms
-+/// Data and methods for matrix-free bilinear forms
- MFBilinearFormExtension::MFBilinearFormExtension(BilinearForm *form)
--   : BilinearFormExtension(form),
--     trial_fes(a->FESpace()),
--     test_fes(a->FESpace())
-+   : BilinearFormExtension(form)
- {
--   elem_restrict = NULL;
--   int_face_restrict_lex = NULL;
--   bdr_face_restrict_lex = NULL;
-+   Update();
- }
- 
--void MFBilinearFormExtension::Assemble()
-+void MFBilinearFormExtension::SetupRestrictionOperators(const L2FaceValues m)
- {
--   Array<BilinearFormIntegrator*> &integrators = *a->GetDBFI();
--   const int integratorCount = integrators.Size();
--   for (int i = 0; i < integratorCount; ++i)
-+   if (DeviceCanUseCeed()) { return; }
-+   ElementDofOrdering ordering = UsesTensorBasis(*fes) ?
-+                                 ElementDofOrdering::LEXICOGRAPHIC :
-+                                 ElementDofOrdering::NATIVE;
-+   elem_restrict = fes->GetElementRestriction(ordering);
-+   if (elem_restrict)
-+   {
-+      local_x.SetSize(elem_restrict->Height(), Device::GetDeviceMemoryType());
-+      local_y.SetSize(elem_restrict->Height(), Device::GetDeviceMemoryType());
-+      local_y.UseDevice(true); // ensure 'local_y = 0.0' is done on device
-+   }
-+
-+   // Construct face restriction operators only if the bilinear form has
-+   // interior or boundary face integrators
-+   if (int_face_restrict_lex == nullptr && a->GetFBFI()->Size() > 0)
-+   {
-+      int_face_restrict_lex = fes->GetFaceRestriction(
-+                                 ElementDofOrdering::LEXICOGRAPHIC,
-+                                 FaceType::Interior);
-+      int_face_x.SetSize(int_face_restrict_lex->Height(),
-+                         Device::GetDeviceMemoryType());
-+      int_face_y.SetSize(int_face_restrict_lex->Height(),
-+                         Device::GetDeviceMemoryType());
-+      int_face_y.UseDevice(true);
-+   }
-+
-+   const bool has_bdr_integs = (a->GetBFBFI()->Size() > 0 ||
-+                                a->GetBBFI()->Size() > 0);
-+   if (bdr_face_restrict_lex == nullptr && has_bdr_integs)
-    {
--      integrators[i]->AssembleMF(*a->FESpace());
-+      bdr_face_restrict_lex = fes->GetFaceRestriction(
-+                                 ElementDofOrdering::LEXICOGRAPHIC,
-+                                 FaceType::Boundary,
-+                                 m);
-+      bdr_face_x.SetSize(bdr_face_restrict_lex->Height(),
-+                         Device::GetDeviceMemoryType());
-+      bdr_face_y.SetSize(bdr_face_restrict_lex->Height(),
-+                         Device::GetDeviceMemoryType());
-+      bdr_face_y.UseDevice(true);
-    }
- }
- 
--void MFBilinearFormExtension::AssembleDiagonal(Vector &y) const
-+void MFBilinearFormExtension::Assemble()
- {
--   Array<BilinearFormIntegrator*> &integrators = *a->GetDBFI();
-+   SetupRestrictionOperators(L2FaceValues::DoubleValued);
-+
-+   Array<BilinearFormIntegrator *> &integrators = *a->GetDBFI();
-+   for (BilinearFormIntegrator *integ : integrators)
-+   {
-+      integ->AssembleMF(*fes);
-+   }
-+
-+   Array<BilinearFormIntegrator *> &bdr_integrators = *a->GetBBFI();
-+   for (BilinearFormIntegrator *integ : bdr_integrators)
-+   {
-+      integ->AssembleMFBoundary(*fes);
-+   }
-+
-+   MFEM_VERIFY(a->GetFBFI()->Size() == 0, "AddInteriorFaceIntegrator is not "
-+               "currently supported in MFBilinearFormExtension");
-+
-+   MFEM_VERIFY(a->GetBFBFI()->Size() == 0, "AddBdrFaceIntegrator is not "
-+               "currently supported in MFBilinearFormExtension");
-+}
- 
--   const int iSz = integrators.Size();
--   if (elem_restrict && !DeviceCanUseCeed())
-+void MFBilinearFormExtension::AssembleDiagonal(Vector &diag) const
-+{
-+   Array<BilinearFormIntegrator *> &integrators = *a->GetDBFI();
-+   if (elem_restrict && integrators.Size() > 0)
-    {
--      localY = 0.0;
--      for (int i = 0; i < iSz; ++i)
-+      local_y = 0.0;
-+      for (BilinearFormIntegrator *integ : integrators)
-       {
--         integrators[i]->AssembleDiagonalMF(localY);
-+         integ->AssembleDiagonalMF(local_y);
-       }
--      const ElementRestriction* H1elem_restrict =
--         dynamic_cast<const ElementRestriction*>(elem_restrict);
--      if (H1elem_restrict)
-+      elem_restrict->MultTransposeUnsigned(local_y, diag);
-+   }
-+   else
-+   {
-+      diag.UseDevice(true); // typically this is a large vector, so store on device
-+      diag = 0.0;
-+      for (BilinearFormIntegrator *integ : integrators)
-       {
--         H1elem_restrict->MultTransposeUnsigned(localY, y);
-+         integ->AssembleDiagonalMF(diag);
-       }
--      else
-+   }
-+
-+   Array<BilinearFormIntegrator *> &bdr_integrators = *a->GetBBFI();
-+   if (bdr_face_restrict_lex && bdr_integrators.Size() > 0)
-+   {
-+      bdr_face_y = 0.0;
-+      for (BilinearFormIntegrator *integ : bdr_integrators)
-       {
--         elem_restrict->MultTranspose(localY, y);
-+         integ->AssembleDiagonalMF(bdr_face_y);
-       }
-+      bdr_face_restrict_lex->AddMultTransposeUnsigned(bdr_face_y, diag);
-    }
-    else
-    {
--      y.UseDevice(true); // typically this is a large vector, so store on device
--      y = 0.0;
--      for (int i = 0; i < iSz; ++i)
-+      for (BilinearFormIntegrator *integ : bdr_integrators)
-       {
--         integrators[i]->AssembleDiagonalMF(y);
-+         integ->AssembleDiagonalMF(diag);
-       }
-    }
- }
- 
--void MFBilinearFormExtension::Update()
--{
--   FiniteElementSpace *fes = a->FESpace();
--   height = width = fes->GetVSize();
--   trial_fes = fes;
--   test_fes = fes;
--
--   elem_restrict = nullptr;
--   int_face_restrict_lex = nullptr;
--   bdr_face_restrict_lex = nullptr;
--}
--
--void MFBilinearFormExtension::FormSystemMatrix(const Array<int> &ess_tdof_list,
--                                               OperatorHandle &A)
--{
--   Operator *oper;
--   Operator::FormSystemOperator(ess_tdof_list, oper);
--   A.Reset(oper); // A will own oper
--}
--
--void MFBilinearFormExtension::FormLinearSystem(const Array<int> &ess_tdof_list,
--                                               Vector &x, Vector &b,
--                                               OperatorHandle &A,
--                                               Vector &X, Vector &B,
--                                               int copy_interior)
--{
--   Operator *oper;
--   Operator::FormLinearSystem(ess_tdof_list, x, b, oper, X, B, copy_interior);
--   A.Reset(oper); // A will own oper
--}
--
- void MFBilinearFormExtension::Mult(const Vector &x, Vector &y) const
- {
--   Array<BilinearFormIntegrator*> &integrators = *a->GetDBFI();
--
--   const int iSz = integrators.Size();
--   if (DeviceCanUseCeed() || !elem_restrict)
-+   Array<BilinearFormIntegrator *> &integrators = *a->GetDBFI();
-+   if (elem_restrict && integrators.Size() > 0)
-    {
--      y.UseDevice(true); // typically this is a large vector, so store on device
--      y = 0.0;
--      for (int i = 0; i < iSz; ++i)
-+      elem_restrict->Mult(x, local_x);
-+      local_y = 0.0;
-+      for (BilinearFormIntegrator *integ : integrators)
-       {
--         integrators[i]->AddMultMF(x, y);
-+         integ->AddMultMF(local_x, local_y);
-       }
-+      elem_restrict->MultTranspose(local_y, y);
-    }
-    else
-    {
--      elem_restrict->Mult(x, localX);
--      localY = 0.0;
--      for (int i = 0; i < iSz; ++i)
-+      y.UseDevice(true); // typically this is a large vector, so store on device
-+      y = 0.0;
-+      for (BilinearFormIntegrator *integ : integrators)
-       {
--         integrators[i]->AddMultMF(localX, localY);
-+         integ->AddMultMF(x, y);
-       }
--      elem_restrict->MultTranspose(localY, y);
-    }
- 
--   Array<BilinearFormIntegrator*> &intFaceIntegrators = *a->GetFBFI();
--   const int iFISz = intFaceIntegrators.Size();
--   if (int_face_restrict_lex && iFISz>0)
-+   Array<BilinearFormIntegrator *> &bdr_integrators = *a->GetBBFI();
-+   if (bdr_face_restrict_lex && bdr_integrators.Size() > 0)
-    {
--      int_face_restrict_lex->Mult(x, int_face_X);
--      if (int_face_X.Size()>0)
-+      bdr_face_restrict_lex->Mult(x, bdr_face_x);
-+      if (bdr_face_x.Size() > 0)
-       {
--         int_face_Y = 0.0;
--         for (int i = 0; i < iFISz; ++i)
-+         bdr_face_y = 0.0;
-+         for (BilinearFormIntegrator *integ : bdr_integrators)
-          {
--            intFaceIntegrators[i]->AddMultMF(int_face_X, int_face_Y);
-+            integ->AddMultMF(bdr_face_x, bdr_face_y);
-          }
--         int_face_restrict_lex->AddMultTransposeInPlace(int_face_Y, y);
-+         bdr_face_restrict_lex->AddMultTransposeInPlace(bdr_face_y, y);
-       }
-    }
--
--   Array<BilinearFormIntegrator*> &bdrFaceIntegrators = *a->GetBFBFI();
--   const int bFISz = bdrFaceIntegrators.Size();
--   if (bdr_face_restrict_lex && bFISz>0)
-+   else
-    {
--      bdr_face_restrict_lex->Mult(x, bdr_face_X);
--      if (bdr_face_X.Size()>0)
-+      for (BilinearFormIntegrator *integ : bdr_integrators)
-       {
--         bdr_face_Y = 0.0;
--         for (int i = 0; i < bFISz; ++i)
--         {
--            bdrFaceIntegrators[i]->AddMultMF(bdr_face_X, bdr_face_Y);
--         }
--         bdr_face_restrict_lex->AddMultTransposeInPlace(bdr_face_Y, y);
-+         integ->AddMultMF(x, y);
-       }
-    }
- }
- 
--void MFBilinearFormExtension::MultTranspose(const Vector &x, Vector &y) const
-+void MFBilinearFormExtension::AddMult(const Vector &x, Vector &y,
-+                                      const double c) const
- {
--   Array<BilinearFormIntegrator*> &integrators = *a->GetDBFI();
--   const int iSz = integrators.Size();
--   if (elem_restrict)
-+   Array<BilinearFormIntegrator *> &integrators = *a->GetDBFI();
-+   if (elem_restrict && integrators.Size() > 0)
-    {
--      elem_restrict->Mult(x, localX);
--      localY = 0.0;
--      for (int i = 0; i < iSz; ++i)
-+      elem_restrict->Mult(x, local_x);
-+      local_y = 0.0;
-+      for (BilinearFormIntegrator *integ : integrators)
-+      {
-+         integ->AddMultMF(local_x, local_y);
-+      }
-+      if (c != 1.0)
-       {
--         integrators[i]->AddMultTransposeMF(localX, localY);
-+         local_y *= c;
-       }
--      elem_restrict->MultTranspose(localY, y);
-+      elem_restrict->AddMultTranspose(local_y, y);
-    }
-    else
-    {
--      y.UseDevice(true);
--      y = 0.0;
--      for (int i = 0; i < iSz; ++i)
-+      y.UseDevice(true); // typically this is a large vector, so store on device
-+      if (c != 1.0 && integrators.Size() > 0)
-       {
--         integrators[i]->AddMultTransposeMF(x, y);
-+         temp_y.SetSize(y.Size());
-+         temp_y.UseDevice(true);
-+         temp_y = 0.0;
-+         for (BilinearFormIntegrator *integ : integrators)
-+         {
-+            integ->AddMultMF(x, temp_y);
-+         }
-+         y.Add(c, temp_y);
-+      }
-+      else
-+      {
-+         for (BilinearFormIntegrator *integ : integrators)
-+         {
-+            integ->AddMultMF(x, y);
-+         }
-       }
-    }
- 
--   Array<BilinearFormIntegrator*> &intFaceIntegrators = *a->GetFBFI();
--   const int iFISz = intFaceIntegrators.Size();
--   if (int_face_restrict_lex && iFISz>0)
-+   Array<BilinearFormIntegrator *> &bdr_integrators = *a->GetBBFI();
-+   if (bdr_face_restrict_lex && bdr_integrators.Size() > 0)
-    {
--      int_face_restrict_lex->Mult(x, int_face_X);
--      if (int_face_X.Size()>0)
-+      bdr_face_restrict_lex->Mult(x, bdr_face_x);
-+      if (bdr_face_x.Size() > 0)
-       {
--         int_face_Y = 0.0;
--         for (int i = 0; i < iFISz; ++i)
-+         bdr_face_y = 0.0;
-+         for (BilinearFormIntegrator *integ : bdr_integrators)
-+         {
-+            integ->AddMultMF(bdr_face_x, bdr_face_y);
-+         }
-+         if (c != 1.0)
-          {
--            intFaceIntegrators[i]->AddMultTransposeMF(int_face_X, int_face_Y);
-+            bdr_face_y *= c;
-          }
--         int_face_restrict_lex->AddMultTransposeInPlace(int_face_Y, y);
-+         bdr_face_restrict_lex->AddMultTransposeInPlace(bdr_face_y, y);
-       }
-    }
--
--   Array<BilinearFormIntegrator*> &bdrFaceIntegrators = *a->GetBFBFI();
--   const int bFISz = bdrFaceIntegrators.Size();
--   if (bdr_face_restrict_lex && bFISz>0)
-+   else
-    {
--      bdr_face_restrict_lex->Mult(x, bdr_face_X);
--      if (bdr_face_X.Size()>0)
-+      if (c != 1.0 && bdr_integrators.Size() > 0)
-+      {
-+         temp_y.SetSize(y.Size());
-+         temp_y.UseDevice(true);
-+         temp_y = 0.0;
-+         for (BilinearFormIntegrator *integ : bdr_integrators)
-+         {
-+            integ->AddMultMF(x, temp_y);
-+         }
-+         y.Add(c, temp_y);
-+      }
-+      else
-       {
--         bdr_face_Y = 0.0;
--         for (int i = 0; i < bFISz; ++i)
-+         for (BilinearFormIntegrator *integ : bdr_integrators)
-          {
--            bdrFaceIntegrators[i]->AddMultTransposeMF(bdr_face_X, bdr_face_Y);
-+            integ->AddMultMF(x, y);
-          }
--         bdr_face_restrict_lex->AddMultTransposeInPlace(bdr_face_Y, y);
-       }
-    }
- }
- 
--// Data and methods for partially-assembled bilinear forms
--PABilinearFormExtension::PABilinearFormExtension(BilinearForm *form)
--   : BilinearFormExtension(form),
--     trial_fes(a->FESpace()),
--     test_fes(a->FESpace())
--{
--   elem_restrict = NULL;
--   int_face_restrict_lex = NULL;
--   bdr_face_restrict_lex = NULL;
--}
--
--void PABilinearFormExtension::SetupRestrictionOperators(const L2FaceValues m)
-+void MFBilinearFormExtension::MultTranspose(const Vector &x, Vector &y) const
- {
--   if ( Device::Allows(Backend::CEED_MASK) ) { return; }
--   ElementDofOrdering ordering = UsesTensorBasis(*a->FESpace())?
--                                 ElementDofOrdering::LEXICOGRAPHIC:
--                                 ElementDofOrdering::NATIVE;
--   elem_restrict = trial_fes->GetElementRestriction(ordering);
--   if (elem_restrict)
--   {
--      localX.SetSize(elem_restrict->Height(), Device::GetDeviceMemoryType());
--      localY.SetSize(elem_restrict->Height(), Device::GetDeviceMemoryType());
--      localY.UseDevice(true); // ensure 'localY = 0.0' is done on device
--   }
--
--   // Construct face restriction operators only if the bilinear form has
--   // interior or boundary face integrators
--   if (int_face_restrict_lex == NULL && a->GetFBFI()->Size() > 0)
--   {
--      int_face_restrict_lex = trial_fes->GetFaceRestriction(
--                                 ElementDofOrdering::LEXICOGRAPHIC,
--                                 FaceType::Interior);
--      int_face_X.SetSize(int_face_restrict_lex->Height(), Device::GetMemoryType());
--      int_face_Y.SetSize(int_face_restrict_lex->Height(), Device::GetMemoryType());
--      int_face_Y.UseDevice(true); // ensure 'int_face_Y = 0.0' is done on device
--   }
--
--   if (bdr_face_restrict_lex == NULL && a->GetBFBFI()->Size() > 0)
-+   Array<BilinearFormIntegrator *> &integrators = *a->GetDBFI();
-+   if (elem_restrict && integrators.Size() > 0)
-    {
--      bdr_face_restrict_lex = trial_fes->GetFaceRestriction(
--                                 ElementDofOrdering::LEXICOGRAPHIC,
--                                 FaceType::Boundary,
--                                 m);
--      bdr_face_X.SetSize(bdr_face_restrict_lex->Height(), Device::GetMemoryType());
--      bdr_face_Y.SetSize(bdr_face_restrict_lex->Height(), Device::GetMemoryType());
--      bdr_face_Y.UseDevice(true); // ensure 'faceBoundY = 0.0' is done on device
-+      elem_restrict->Mult(x, local_x);
-+      local_y = 0.0;
-+      for (BilinearFormIntegrator *integ : integrators)
-+      {
-+         integ->AddMultTransposeMF(local_x, local_y);
-+      }
-+      elem_restrict->MultTranspose(local_y, y);
-    }
--}
--
--void PABilinearFormExtension::Assemble()
--{
--   SetupRestrictionOperators(L2FaceValues::DoubleValued);
--
--   Array<BilinearFormIntegrator*> &integrators = *a->GetDBFI();
--   const int integratorCount = integrators.Size();
--   for (int i = 0; i < integratorCount; ++i)
-+   else
-    {
--      integrators[i]->AssemblePA(*a->FESpace());
-+      y.UseDevice(true); // typically this is a large vector, so store on device
-+      y = 0.0;
-+      for (BilinearFormIntegrator *integ : integrators)
-+      {
-+         integ->AddMultTransposeMF(x, y);
-+      }
-    }
- 
--   MFEM_VERIFY(a->GetBBFI()->Size() == 0,
--               "Partial assembly does not support AddBoundaryIntegrator yet.");
--
--   Array<BilinearFormIntegrator*> &intFaceIntegrators = *a->GetFBFI();
--   const int intFaceIntegratorCount = intFaceIntegrators.Size();
--   for (int i = 0; i < intFaceIntegratorCount; ++i)
-+   Array<BilinearFormIntegrator *> &bdr_integrators = *a->GetBBFI();
-+   if (bdr_face_restrict_lex && bdr_integrators.Size() > 0)
-    {
--      intFaceIntegrators[i]->AssemblePAInteriorFaces(*a->FESpace());
-+      bdr_face_restrict_lex->Mult(x, bdr_face_x);
-+      if (bdr_face_x.Size() > 0)
-+      {
-+         bdr_face_y = 0.0;
-+         for (BilinearFormIntegrator *integ : bdr_integrators)
-+         {
-+            integ->AddMultTransposeMF(bdr_face_x, bdr_face_y);
-+         }
-+         bdr_face_restrict_lex->AddMultTransposeInPlace(bdr_face_y, y);
-+      }
-    }
--
--   Array<BilinearFormIntegrator*> &bdrFaceIntegrators = *a->GetBFBFI();
--   const int boundFaceIntegratorCount = bdrFaceIntegrators.Size();
--   for (int i = 0; i < boundFaceIntegratorCount; ++i)
-+   else
-    {
--      bdrFaceIntegrators[i]->AssemblePABoundaryFaces(*a->FESpace());
-+      for (BilinearFormIntegrator *integ : bdr_integrators)
-+      {
-+         integ->AddMultTransposeMF(x, y);
-+      }
-    }
- }
- 
--void PABilinearFormExtension::AssembleDiagonal(Vector &y) const
-+void MFBilinearFormExtension::AddMultTranspose(const Vector &x, Vector &y,
-+                                               const double c) const
- {
--   Array<BilinearFormIntegrator*> &integrators = *a->GetDBFI();
--
--   const int iSz = integrators.Size();
--   if (elem_restrict && !DeviceCanUseCeed())
-+   Array<BilinearFormIntegrator *> &integrators = *a->GetDBFI();
-+   if (elem_restrict && integrators.Size() > 0)
-    {
--      localY = 0.0;
--      for (int i = 0; i < iSz; ++i)
-+      elem_restrict->Mult(x, local_x);
-+      local_y = 0.0;
-+      for (BilinearFormIntegrator *integ : integrators)
-       {
--         integrators[i]->AssembleDiagonalPA(localY);
-+         integ->AddMultTransposeMF(local_x, local_y);
-       }
--      const ElementRestriction* H1elem_restrict =
--         dynamic_cast<const ElementRestriction*>(elem_restrict);
--      if (H1elem_restrict)
-+      if (c != 1.0)
-+      {
-+         local_y *= c;
-+      }
-+      elem_restrict->AddMultTranspose(local_y, y);
-+   }
-+   else
-+   {
-+      y.UseDevice(true); // typically this is a large vector, so store on device
-+      if (c != 1.0 && integrators.Size() > 0)
-       {
--         H1elem_restrict->MultTransposeUnsigned(localY, y);
-+         temp_y.SetSize(y.Size());
-+         temp_y.UseDevice(true);
-+         temp_y = 0.0;
-+         for (BilinearFormIntegrator *integ : integrators)
-+         {
-+            integ->AddMultTransposeMF(x, temp_y);
-+         }
-+         y.Add(c, temp_y);
-       }
-       else
-       {
--         elem_restrict->MultTranspose(localY, y);
-+         for (BilinearFormIntegrator *integ : integrators)
-+         {
-+            integ->AddMultTransposeMF(x, y);
-+         }
-+      }
-+   }
-+
-+   Array<BilinearFormIntegrator *> &bdr_integrators = *a->GetBBFI();
-+   if (bdr_face_restrict_lex && bdr_integrators.Size() > 0)
-+   {
-+      bdr_face_restrict_lex->Mult(x, bdr_face_x);
-+      if (bdr_face_x.Size() > 0)
-+      {
-+         bdr_face_y = 0.0;
-+         for (BilinearFormIntegrator *integ : bdr_integrators)
-+         {
-+            integ->AddMultTransposeMF(bdr_face_x, bdr_face_y);
-+         }
-+         if (c != 1.0)
-+         {
-+            bdr_face_y *= c;
-+         }
-+         bdr_face_restrict_lex->AddMultTransposeInPlace(bdr_face_y, y);
-       }
-    }
-    else
-    {
--      y.UseDevice(true); // typically this is a large vector, so store on device
--      y = 0.0;
--      for (int i = 0; i < iSz; ++i)
-+      if (c != 1.0 && bdr_integrators.Size() > 0)
-+      {
-+         temp_y.SetSize(y.Size());
-+         temp_y.UseDevice(true);
-+         temp_y = 0.0;
-+         for (BilinearFormIntegrator *integ : bdr_integrators)
-+         {
-+            integ->AddMultTransposeMF(x, temp_y);
-+         }
-+         y.Add(c, temp_y);
-+      }
-+      else
-       {
--         integrators[i]->AssembleDiagonalPA(y);
-+         for (BilinearFormIntegrator *integ : bdr_integrators)
-+         {
-+            integ->AddMultTransposeMF(x, y);
-+         }
-       }
-    }
- }
- 
--void PABilinearFormExtension::Update()
-+void MFBilinearFormExtension::Update()
- {
--   FiniteElementSpace *fes = a->FESpace();
-+   fes = a->FESpace();
-    height = width = fes->GetVSize();
--   trial_fes = fes;
--   test_fes = fes;
- 
-    elem_restrict = nullptr;
-    int_face_restrict_lex = nullptr;
-    bdr_face_restrict_lex = nullptr;
- }
- 
--void PABilinearFormExtension::FormSystemMatrix(const Array<int> &ess_tdof_list,
--                                               OperatorHandle &A)
-+/// Data and methods for partially-assembled bilinear forms
-+PABilinearFormExtension::PABilinearFormExtension(BilinearForm *form)
-+   : MFBilinearFormExtension(form)
- {
--   Operator *oper;
--   Operator::FormSystemOperator(ess_tdof_list, oper);
--   A.Reset(oper); // A will own oper
- }
- 
--void PABilinearFormExtension::FormLinearSystem(const Array<int> &ess_tdof_list,
--                                               Vector &x, Vector &b,
--                                               OperatorHandle &A,
--                                               Vector &X, Vector &B,
--                                               int copy_interior)
-+void PABilinearFormExtension::Assemble()
- {
--   Operator *oper;
--   Operator::FormLinearSystem(ess_tdof_list, x, b, oper, X, B, copy_interior);
--   A.Reset(oper); // A will own oper
-+   SetupRestrictionOperators(L2FaceValues::DoubleValued);
-+
-+   Array<BilinearFormIntegrator *> &integrators = *a->GetDBFI();
-+   for (BilinearFormIntegrator *integ : integrators)
-+   {
-+      integ->AssemblePA(*fes);
-+   }
-+
-+   Array<BilinearFormIntegrator *> &bdr_integrators = *a->GetBBFI();
-+   for (BilinearFormIntegrator *integ : bdr_integrators)
-+   {
-+      integ->AssemblePABoundary(*fes);
-+   }
-+
-+   Array<BilinearFormIntegrator *> &int_face_integrators = *a->GetFBFI();
-+   for (BilinearFormIntegrator *integ : int_face_integrators)
-+   {
-+      integ->AssemblePAInteriorFaces(*fes);
-+   }
-+
-+   Array<BilinearFormIntegrator *> &bdr_face_integrators = *a->GetBFBFI();
-+   for (BilinearFormIntegrator *integ : bdr_face_integrators)
-+   {
-+      integ->AssemblePABoundaryFaces(*fes);
-+   }
- }
- 
--void PABilinearFormExtension::Mult(const Vector &x, Vector &y) const
-+void PABilinearFormExtension::AssembleDiagonal(Vector &diag) const
- {
--   Array<BilinearFormIntegrator*> &integrators = *a->GetDBFI();
--
--   const int iSz = integrators.Size();
--   if (DeviceCanUseCeed() || !elem_restrict)
-+   Array<BilinearFormIntegrator *> &integrators = *a->GetDBFI();
-+   if (elem_restrict && integrators.Size() > 0)
-    {
--      y.UseDevice(true); // typically this is a large vector, so store on device
--      y = 0.0;
--      for (int i = 0; i < iSz; ++i)
-+      local_y = 0.0;
-+      for (BilinearFormIntegrator *integ : integrators)
-       {
--         integrators[i]->AddMultPA(x, y);
-+         integ->AssembleDiagonalPA(local_y);
-       }
-+      elem_restrict->MultTransposeUnsigned(local_y, diag);
-    }
-    else
-    {
--      elem_restrict->Mult(x, localX);
--      localY = 0.0;
--      for (int i = 0; i < iSz; ++i)
-+      diag.UseDevice(true); // typically this is a large vector, so store on device
-+      diag = 0.0;
-+      for (BilinearFormIntegrator *integ : integrators)
-       {
--         integrators[i]->AddMultPA(localX, localY);
-+         integ->AssembleDiagonalPA(diag);
-       }
--      elem_restrict->MultTranspose(localY, y);
-    }
- 
--   Array<BilinearFormIntegrator*> &intFaceIntegrators = *a->GetFBFI();
--   const int iFISz = intFaceIntegrators.Size();
--   if (int_face_restrict_lex && iFISz>0)
-+   Array<BilinearFormIntegrator *> &bdr_integrators = *a->GetBBFI();
-+   if (bdr_face_restrict_lex && bdr_integrators.Size() > 0)
-    {
--      int_face_restrict_lex->Mult(x, int_face_X);
--      if (int_face_X.Size()>0)
-+      bdr_face_y = 0.0;
-+      for (BilinearFormIntegrator *integ : bdr_integrators)
-       {
--         int_face_Y = 0.0;
--         for (int i = 0; i < iFISz; ++i)
--         {
--            intFaceIntegrators[i]->AddMultPA(int_face_X, int_face_Y);
--         }
--         int_face_restrict_lex->AddMultTransposeInPlace(int_face_Y, y);
-+         integ->AssembleDiagonalPA(bdr_face_y);
-       }
-+      bdr_face_restrict_lex->AddMultTransposeUnsigned(bdr_face_y, diag);
-    }
--
--   Array<BilinearFormIntegrator*> &bdrFaceIntegrators = *a->GetBFBFI();
--   const int bFISz = bdrFaceIntegrators.Size();
--   if (bdr_face_restrict_lex && bFISz>0)
-+   else
-    {
--      bdr_face_restrict_lex->Mult(x, bdr_face_X);
--      if (bdr_face_X.Size()>0)
-+      for (BilinearFormIntegrator *integ : bdr_integrators)
-       {
--         bdr_face_Y = 0.0;
--         for (int i = 0; i < bFISz; ++i)
--         {
--            bdrFaceIntegrators[i]->AddMultPA(bdr_face_X, bdr_face_Y);
--         }
--         bdr_face_restrict_lex->AddMultTransposeInPlace(bdr_face_Y, y);
-+         integ->AssembleDiagonalPA(diag);
-       }
-    }
- }
- 
--void PABilinearFormExtension::MultTranspose(const Vector &x, Vector &y) const
-+void PABilinearFormExtension::Mult(const Vector &x, Vector &y) const
- {
--   Array<BilinearFormIntegrator*> &integrators = *a->GetDBFI();
--   const int iSz = integrators.Size();
--   if (elem_restrict)
-+   Array<BilinearFormIntegrator *> &integrators = *a->GetDBFI();
-+   if (elem_restrict && integrators.Size() > 0)
-    {
--      elem_restrict->Mult(x, localX);
--      localY = 0.0;
--      for (int i = 0; i < iSz; ++i)
-+      elem_restrict->Mult(x, local_x);
-+      local_y = 0.0;
-+      for (BilinearFormIntegrator *integ : integrators)
-       {
--         integrators[i]->AddMultTransposePA(localX, localY);
-+         integ->AddMultPA(local_x, local_y);
-       }
--      elem_restrict->MultTranspose(localY, y);
-+      elem_restrict->MultTranspose(local_y, y);
-    }
-    else
-    {
--      y.UseDevice(true);
-+      y.UseDevice(true); // typically this is a large vector, so store on device
-       y = 0.0;
--      for (int i = 0; i < iSz; ++i)
-+      for (BilinearFormIntegrator *integ : integrators)
-       {
--         integrators[i]->AddMultTransposePA(x, y);
-+         integ->AddMultPA(x, y);
-       }
-    }
- 
--   Array<BilinearFormIntegrator*> &intFaceIntegrators = *a->GetFBFI();
--   const int iFISz = intFaceIntegrators.Size();
--   if (int_face_restrict_lex && iFISz>0)
-+   Array<BilinearFormIntegrator *> &int_face_integrators = *a->GetFBFI();
-+   if (int_face_restrict_lex && int_face_integrators.Size() > 0)
-    {
--      int_face_restrict_lex->Mult(x, int_face_X);
--      if (int_face_X.Size()>0)
-+      int_face_restrict_lex->Mult(x, int_face_x);
-+      if (int_face_x.Size() > 0)
-       {
--         int_face_Y = 0.0;
--         for (int i = 0; i < iFISz; ++i)
-+         int_face_y = 0.0;
-+         for (BilinearFormIntegrator *integ : int_face_integrators)
-          {
--            intFaceIntegrators[i]->AddMultTransposePA(int_face_X, int_face_Y);
-+            integ->AddMultPA(int_face_x, int_face_y);
-          }
--         int_face_restrict_lex->AddMultTransposeInPlace(int_face_Y, y);
-+         int_face_restrict_lex->AddMultTransposeInPlace(int_face_y, y);
-+      }
-+   }
-+   else
-+   {
-+      for (BilinearFormIntegrator *integ : int_face_integrators)
-+      {
-+         integ->AddMultPA(x, y);
-       }
-    }
- 
--   Array<BilinearFormIntegrator*> &bdrFaceIntegrators = *a->GetBFBFI();
--   const int bFISz = bdrFaceIntegrators.Size();
--   if (bdr_face_restrict_lex && bFISz>0)
-+   Array<BilinearFormIntegrator *> &bdr_integrators = *a->GetBBFI();
-+   Array<BilinearFormIntegrator *> &bdr_face_integrators = *a->GetBFBFI();
-+   if (bdr_face_restrict_lex && (bdr_integrators.Size() > 0 ||
-+                                 bdr_face_integrators.Size() > 0))
-    {
--      bdr_face_restrict_lex->Mult(x, bdr_face_X);
--      if (bdr_face_X.Size()>0)
-+      bdr_face_restrict_lex->Mult(x, bdr_face_x);
-+      if (bdr_face_x.Size() > 0)
-       {
--         bdr_face_Y = 0.0;
--         for (int i = 0; i < bFISz; ++i)
-+         bdr_face_y = 0.0;
-+         for (BilinearFormIntegrator *integ : bdr_integrators)
-+         {
-+            integ->AddMultPA(bdr_face_x, bdr_face_y);
-+         }
-+         for (BilinearFormIntegrator *integ : bdr_face_integrators)
-          {
--            bdrFaceIntegrators[i]->AddMultTransposePA(bdr_face_X, bdr_face_Y);
-+            integ->AddMultPA(bdr_face_x, bdr_face_y);
-          }
--         bdr_face_restrict_lex->AddMultTransposeInPlace(bdr_face_Y, y);
-+         bdr_face_restrict_lex->AddMultTransposeInPlace(bdr_face_y, y);
-       }
-    }
--}
--
--// Data and methods for element-assembled bilinear forms
--EABilinearFormExtension::EABilinearFormExtension(BilinearForm *form)
--   : PABilinearFormExtension(form),
--     factorize_face_terms(false)
--{
--   if (form->FESpace()->IsDGSpace() && form->FESpace()->Conforming())
-+   else
-    {
--      factorize_face_terms = true;
-+      for (BilinearFormIntegrator *integ : bdr_integrators)
-+      {
-+         integ->AddMultPA(x, y);
-+      }
-+      for (BilinearFormIntegrator *integ : bdr_face_integrators)
-+      {
-+         integ->AddMultPA(x, y);
-+      }
-    }
- }
- 
--void EABilinearFormExtension::Assemble()
-+void PABilinearFormExtension::AddMult(const Vector &x, Vector &y,
-+                                      const double c) const
- {
--   SetupRestrictionOperators(L2FaceValues::SingleValued);
--
--   ne = trial_fes->GetMesh()->GetNE();
--   elemDofs = trial_fes->GetFE(0)->GetDof();
--
--   ea_data.SetSize(ne*elemDofs*elemDofs, Device::GetMemoryType());
--   ea_data.UseDevice(true);
--
--   Array<BilinearFormIntegrator*> &integrators = *a->GetDBFI();
--   const int integratorCount = integrators.Size();
--   if ( integratorCount == 0 )
-+   Array<BilinearFormIntegrator *> &integrators = *a->GetDBFI();
-+   if (elem_restrict && integrators.Size() > 0)
-    {
--      ea_data = 0.0;
-+      elem_restrict->Mult(x, local_x);
-+      local_y = 0.0;
-+      for (BilinearFormIntegrator *integ : integrators)
-+      {
-+         integ->AddMultPA(local_x, local_y);
-+      }
-+      if (c != 1.0)
-+      {
-+         local_y *= c;
-+      }
-+      elem_restrict->AddMultTranspose(local_y, y);
-    }
--   for (int i = 0; i < integratorCount; ++i)
-+   else
-    {
--      integrators[i]->AssembleEA(*a->FESpace(), ea_data, i);
-+      y.UseDevice(true); // typically this is a large vector, so store on device
-+      if (c != 1.0 && integrators.Size() > 0)
-+      {
-+         temp_y.SetSize(y.Size());
-+         temp_y.UseDevice(true);
-+         temp_y = 0.0;
-+         for (BilinearFormIntegrator *integ : integrators)
-+         {
-+            integ->AddMultPA(x, temp_y);
-+         }
-+         y.Add(c, temp_y);
-+      }
-+      else
-+      {
-+         for (BilinearFormIntegrator *integ : integrators)
-+         {
-+            integ->AddMultPA(x, y);
-+         }
-+      }
-    }
- 
--   faceDofs = trial_fes ->
--              GetTraceElement(0, trial_fes->GetMesh()->GetFaceGeometry(0)) ->
--              GetDof();
--
--   MFEM_VERIFY(a->GetBBFI()->Size() == 0,
--               "Element assembly does not support AddBoundaryIntegrator yet.");
--
--   Array<BilinearFormIntegrator*> &intFaceIntegrators = *a->GetFBFI();
--   const int intFaceIntegratorCount = intFaceIntegrators.Size();
--   if (intFaceIntegratorCount>0)
-+   Array<BilinearFormIntegrator *> &int_face_integrators = *a->GetFBFI();
-+   if (int_face_restrict_lex && int_face_integrators.Size() > 0)
-    {
--      nf_int = trial_fes->GetNFbyType(FaceType::Interior);
--      ea_data_int.SetSize(2*nf_int*faceDofs*faceDofs, Device::GetMemoryType());
--      ea_data_ext.SetSize(2*nf_int*faceDofs*faceDofs, Device::GetMemoryType());
-+      int_face_restrict_lex->Mult(x, int_face_x);
-+      if (int_face_x.Size() > 0)
-+      {
-+         int_face_y = 0.0;
-+         for (BilinearFormIntegrator *integ : int_face_integrators)
-+         {
-+            integ->AddMultPA(int_face_x, int_face_y);
-+         }
-+         if (c != 1.0)
-+         {
-+            int_face_y *= c;
-+         }
-+         int_face_restrict_lex->AddMultTransposeInPlace(int_face_y, y);
-+      }
-    }
--   for (int i = 0; i < intFaceIntegratorCount; ++i)
-+   else
-    {
--      intFaceIntegrators[i]->AssembleEAInteriorFaces(*a->FESpace(),
--                                                     ea_data_int,
--                                                     ea_data_ext,
--                                                     i);
-+      if (c != 1.0 && int_face_integrators.Size() > 0)
-+      {
-+         temp_y.SetSize(y.Size());
-+         temp_y.UseDevice(true);
-+         temp_y = 0.0;
-+         for (BilinearFormIntegrator *integ : int_face_integrators)
-+         {
-+            integ->AddMultPA(x, temp_y);
-+         }
-+         y.Add(c, temp_y);
-+      }
-+      else
-+      {
-+         for (BilinearFormIntegrator *integ : int_face_integrators)
-+         {
-+            integ->AddMultPA(x, y);
-+         }
-+      }
-    }
- 
--   Array<BilinearFormIntegrator*> &bdrFaceIntegrators = *a->GetBFBFI();
--   const int boundFaceIntegratorCount = bdrFaceIntegrators.Size();
--   if (boundFaceIntegratorCount>0)
--   {
--      nf_bdr = trial_fes->GetNFbyType(FaceType::Boundary);
--      ea_data_bdr.SetSize(nf_bdr*faceDofs*faceDofs, Device::GetMemoryType());
--      ea_data_bdr = 0.0;
--   }
--   for (int i = 0; i < boundFaceIntegratorCount; ++i)
-+   Array<BilinearFormIntegrator *> &bdr_integrators = *a->GetBBFI();
-+   Array<BilinearFormIntegrator *> &bdr_face_integrators = *a->GetBFBFI();
-+   if (bdr_face_restrict_lex && (bdr_integrators.Size() > 0 ||
-+                                 bdr_face_integrators.Size() > 0))
-    {
--      bdrFaceIntegrators[i]->AssembleEABoundaryFaces(*a->FESpace(),ea_data_bdr,i);
-+      bdr_face_restrict_lex->Mult(x, bdr_face_x);
-+      if (bdr_face_x.Size() > 0)
-+      {
-+         bdr_face_y = 0.0;
-+         for (BilinearFormIntegrator *integ : bdr_integrators)
-+         {
-+            integ->AddMultPA(bdr_face_x, bdr_face_y);
-+         }
-+         for (BilinearFormIntegrator *integ : bdr_face_integrators)
-+         {
-+            integ->AddMultPA(bdr_face_x, bdr_face_y);
-+         }
-+         if (c != 1.0)
-+         {
-+            bdr_face_y *= c;
-+         }
-+         bdr_face_restrict_lex->AddMultTransposeInPlace(bdr_face_y, y);
-+      }
-    }
-+   else
-+   {
-+      if (c != 1.0 && (bdr_integrators.Size() > 0 || bdr_face_integrators.Size() > 0))
-+      {
-+         temp_y.SetSize(y.Size());
-+         temp_y.UseDevice(true);
-+         temp_y = 0.0;
-+         for (BilinearFormIntegrator *integ : bdr_integrators)
-+         {
-+            integ->AddMultPA(x, temp_y);
-+         }
-+         for (BilinearFormIntegrator *integ : bdr_face_integrators)
-+         {
-+            integ->AddMultPA(x, temp_y);
-+         }
-+         y.Add(c, temp_y);
-+      }
-+      else
-+      {
-+         for (BilinearFormIntegrator *integ : bdr_integrators)
-+         {
-+            integ->AddMultPA(x, y);
-+         }
-+         for (BilinearFormIntegrator *integ : bdr_face_integrators)
-+         {
-+            integ->AddMultPA(x, y);
-+         }
-+      }
-+   }
-+}
- 
--   if (factorize_face_terms && int_face_restrict_lex)
-+void PABilinearFormExtension::MultTranspose(const Vector &x, Vector &y) const
-+{
-+   Array<BilinearFormIntegrator *> &integrators = *a->GetDBFI();
-+   if (elem_restrict && integrators.Size() > 0)
-    {
--      auto restFint = dynamic_cast<const L2FaceRestriction*>(int_face_restrict_lex);
--      restFint->AddFaceMatricesToElementMatrices(ea_data_int, ea_data);
-+      elem_restrict->Mult(x, local_x);
-+      local_y = 0.0;
-+      for (BilinearFormIntegrator *integ : integrators)
-+      {
-+         integ->AddMultTransposePA(local_x, local_y);
-+      }
-+      elem_restrict->MultTranspose(local_y, y);
-    }
--   if (factorize_face_terms && bdr_face_restrict_lex)
-+   else
-+   {
-+      y.UseDevice(true);
-+      y = 0.0;
-+      for (BilinearFormIntegrator *integ : integrators)
-+      {
-+         integ->AddMultTransposePA(x, y);
-+      }
-+   }
-+
-+   Array<BilinearFormIntegrator *> &int_face_integrators = *a->GetFBFI();
-+   if (int_face_restrict_lex && int_face_integrators.Size() > 0)
-    {
--      auto restFbdr = dynamic_cast<const L2FaceRestriction*>(bdr_face_restrict_lex);
--      restFbdr->AddFaceMatricesToElementMatrices(ea_data_bdr, ea_data);
-+      int_face_restrict_lex->Mult(x, int_face_x);
-+      if (int_face_x.Size() > 0)
-+      {
-+         int_face_y = 0.0;
-+         for (BilinearFormIntegrator *integ : int_face_integrators)
-+         {
-+            integ->AddMultTransposePA(int_face_x, int_face_y);
-+         }
-+         int_face_restrict_lex->AddMultTransposeInPlace(int_face_y, y);
-+      }
-+   }
-+   else
-+   {
-+      for (BilinearFormIntegrator *integ : int_face_integrators)
-+      {
-+         integ->AddMultTransposePA(x, y);
-+      }
-+   }
-+
-+   Array<BilinearFormIntegrator *> &bdr_integrators = *a->GetBBFI();
-+   Array<BilinearFormIntegrator *> &bdr_face_integrators = *a->GetBFBFI();
-+   if (bdr_face_restrict_lex && (bdr_integrators.Size() > 0 ||
-+                                 bdr_face_integrators.Size() > 0))
-+   {
-+      bdr_face_restrict_lex->Mult(x, bdr_face_x);
-+      if (bdr_face_x.Size() > 0)
-+      {
-+         bdr_face_y = 0.0;
-+         for (BilinearFormIntegrator *integ : bdr_integrators)
-+         {
-+            integ->AddMultTransposePA(bdr_face_x, bdr_face_y);
-+         }
-+         for (BilinearFormIntegrator *integ : bdr_face_integrators)
-+         {
-+            integ->AddMultTransposePA(bdr_face_x, bdr_face_y);
-+         }
-+         bdr_face_restrict_lex->AddMultTransposeInPlace(bdr_face_y, y);
-+      }
-+   }
-+   else
-+   {
-+      for (BilinearFormIntegrator *integ : bdr_integrators)
-+      {
-+         integ->AddMultTransposePA(x, y);
-+      }
-+      for (BilinearFormIntegrator *integ : bdr_face_integrators)
-+      {
-+         integ->AddMultTransposePA(x, y);
-+      }
-    }
- }
- 
--void EABilinearFormExtension::Mult(const Vector &x, Vector &y) const
-+void PABilinearFormExtension::AddMultTranspose(const Vector &x, Vector &y,
-+                                               const double c) const
- {
--   // Apply the Element Restriction
--   const bool useRestrict = !DeviceCanUseCeed() && elem_restrict;
--   if (!useRestrict)
-+   Array<BilinearFormIntegrator *> &integrators = *a->GetDBFI();
-+   if (elem_restrict && integrators.Size() > 0)
-+   {
-+      elem_restrict->Mult(x, local_x);
-+      local_y = 0.0;
-+      for (BilinearFormIntegrator *integ : integrators)
-+      {
-+         integ->AddMultTransposePA(local_x, local_y);
-+      }
-+      if (c != 1.0)
-+      {
-+         local_y *= c;
-+      }
-+      elem_restrict->AddMultTranspose(local_y, y);
-+   }
-+   else
-    {
-       y.UseDevice(true); // typically this is a large vector, so store on device
--      y = 0.0;
-+      if (c != 1.0 && integrators.Size() > 0)
-+      {
-+         temp_y.SetSize(y.Size());
-+         temp_y.UseDevice(true);
-+         temp_y = 0.0;
-+         for (BilinearFormIntegrator *integ : integrators)
-+         {
-+            integ->AddMultTransposePA(x, temp_y);
-+         }
-+         y.Add(c, temp_y);
-+      }
-+      else
-+      {
-+         for (BilinearFormIntegrator *integ : integrators)
-+         {
-+            integ->AddMultTransposePA(x, y);
-+         }
-+      }
-+   }
-+
-+   Array<BilinearFormIntegrator *> &int_face_integrators = *a->GetFBFI();
-+   if (int_face_restrict_lex && int_face_integrators.Size() > 0)
-+   {
-+      int_face_restrict_lex->Mult(x, int_face_x);
-+      if (int_face_x.Size() > 0)
-+      {
-+         int_face_y = 0.0;
-+         for (BilinearFormIntegrator *integ : int_face_integrators)
-+         {
-+            integ->AddMultTransposePA(int_face_x, int_face_y);
-+         }
-+         if (c != 1.0)
-+         {
-+            int_face_y *= c;
-+         }
-+         int_face_restrict_lex->AddMultTransposeInPlace(int_face_y, y);
-+      }
-    }
-    else
-    {
--      elem_restrict->Mult(x, localX);
--      localY = 0.0;
-+      if (c != 1.0 && int_face_integrators.Size() > 0)
-+      {
-+         temp_y.SetSize(y.Size());
-+         temp_y.UseDevice(true);
-+         temp_y = 0.0;
-+         for (BilinearFormIntegrator *integ : int_face_integrators)
-+         {
-+            integ->AddMultTransposePA(x, temp_y);
-+         }
-+         y.Add(c, temp_y);
-+      }
-+      else
-+      {
-+         for (BilinearFormIntegrator *integ : int_face_integrators)
-+         {
-+            integ->AddMultTransposePA(x, y);
-+         }
-+      }
-+   }
-+
-+   Array<BilinearFormIntegrator *> &bdr_integrators = *a->GetBBFI();
-+   Array<BilinearFormIntegrator *> &bdr_face_integrators = *a->GetBFBFI();
-+   if (bdr_face_restrict_lex && (bdr_integrators.Size() > 0 ||
-+                                 bdr_face_integrators.Size() > 0))
-+   {
-+      bdr_face_restrict_lex->Mult(x, bdr_face_x);
-+      if (bdr_face_x.Size() > 0)
-+      {
-+         bdr_face_y = 0.0;
-+         for (BilinearFormIntegrator *integ : bdr_integrators)
-+         {
-+            integ->AddMultTransposePA(bdr_face_x, bdr_face_y);
-+         }
-+         for (BilinearFormIntegrator *integ : bdr_face_integrators)
-+         {
-+            integ->AddMultTransposePA(bdr_face_x, bdr_face_y);
-+         }
-+         if (c != 1.0)
-+         {
-+            bdr_face_y *= c;
-+         }
-+         bdr_face_restrict_lex->AddMultTransposeInPlace(bdr_face_y, y);
-+      }
-+   }
-+   else
-+   {
-+      if (c != 1.0 && (bdr_integrators.Size() > 0 || bdr_face_integrators.Size() > 0))
-+      {
-+         temp_y.SetSize(y.Size());
-+         temp_y.UseDevice(true);
-+         temp_y = 0.0;
-+         for (BilinearFormIntegrator *integ : bdr_integrators)
-+         {
-+            integ->AddMultTransposePA(x, temp_y);
-+         }
-+         for (BilinearFormIntegrator *integ : bdr_face_integrators)
-+         {
-+            integ->AddMultTransposePA(x, temp_y);
-+         }
-+         y.Add(c, temp_y);
-+      }
-+      else
-+      {
-+         for (BilinearFormIntegrator *integ : bdr_integrators)
-+         {
-+            integ->AddMultTransposePA(x, y);
-+         }
-+         for (BilinearFormIntegrator *integ : bdr_face_integrators)
-+         {
-+            integ->AddMultTransposePA(x, y);
-+         }
-+      }
-+   }
-+}
-+
-+/// Data and methods for element-assembled bilinear forms
-+EABilinearFormExtension::EABilinearFormExtension(BilinearForm *form)
-+   : PABilinearFormExtension(form),
-+     factorize_face_terms(fes->IsDGSpace() && fes->Conforming())
-+{
-+}
-+
-+void EABilinearFormExtension::Assemble()
-+{
-+   SetupRestrictionOperators(L2FaceValues::SingleValued);
-+
-+   ne = fes->GetNE();
-+   elem_dofs = fes->GetFE(0)->GetDof();
-+
-+   Array<BilinearFormIntegrator *> &integrators = *a->GetDBFI();
-+   if (integrators.Size() > 0)
-+   {
-+      ea_data.SetSize(ne * elem_dofs * elem_dofs, Device::GetMemoryType());
-+      ea_data.UseDevice(true);
-+      ea_data = 0.0;
-+      for (BilinearFormIntegrator *integ : integrators)
-+      {
-+         integ->AssembleEA(*fes, ea_data);
-+      }
-+   }
-+
-+   MFEM_VERIFY(a->GetBBFI()->Size() == 0,
-+               "Element assembly does not support AddBoundaryIntegrator yet.");
-+
-+   nf_int = fes->GetNFbyType(FaceType::Interior);
-+   nf_bdr = fes->GetNFbyType(FaceType::Boundary);
-+   face_dofs = fes->GetTraceElement(0,
-+                                    fes->GetMesh()->GetFaceGeometry(0))->GetDof();
-+
-+   Array<BilinearFormIntegrator *> &int_face_integrators = *a->GetFBFI();
-+   if (int_face_integrators.Size() > 0)
-+   {
-+      ea_data_int.SetSize(2 * nf_int * face_dofs * face_dofs,
-+                          Device::GetMemoryType());
-+      ea_data_ext.SetSize(2 * nf_int * face_dofs * face_dofs,
-+                          Device::GetMemoryType());
-+      ea_data_int = 0.0;
-+      ea_data_ext = 0.0;
-+      for (BilinearFormIntegrator *integ : int_face_integrators)
-+      {
-+         integ->AssembleEAInteriorFaces(*fes, ea_data_int, ea_data_ext);
-+      }
-+   }
-+
-+   Array<BilinearFormIntegrator *> &bdr_face_integrators = *a->GetBFBFI();
-+   if (bdr_face_integrators.Size() > 0)
-+   {
-+      ea_data_bdr.SetSize(nf_bdr * face_dofs * face_dofs, Device::GetMemoryType());
-+      ea_data_bdr = 0.0;
-+      for (BilinearFormIntegrator *integ : bdr_face_integrators)
-+      {
-+         integ->AssembleEABoundaryFaces(*fes, ea_data_bdr);
-+      }
-+   }
-+
-+   if (factorize_face_terms && int_face_restrict_lex)
-+   {
-+      auto l2_face_restrict = dynamic_cast<const L2FaceRestriction &>
-+                              (*int_face_restrict_lex);
-+      l2_face_restrict.AddFaceMatricesToElementMatrices(ea_data_int, ea_data);
-    }
--   // Apply the Element Matrices
-+   if (factorize_face_terms && bdr_face_restrict_lex)
-    {
--      const int NDOFS = elemDofs;
--      auto X = Reshape(useRestrict?localX.Read():x.Read(), NDOFS, ne);
--      auto Y = Reshape(useRestrict?localY.ReadWrite():y.ReadWrite(), NDOFS, ne);
--      auto A = Reshape(ea_data.Read(), NDOFS, NDOFS, ne);
--      mfem::forall(ne*NDOFS, [=] MFEM_HOST_DEVICE (int glob_j)
-+      auto l2_face_restrict = dynamic_cast<const L2FaceRestriction &>
-+                              (*bdr_face_restrict_lex);
-+      l2_face_restrict.AddFaceMatricesToElementMatrices(ea_data_bdr, ea_data);
-+   }
-+}
-+
-+void EABilinearFormExtension::Mult(const Vector &x, Vector &y) const
-+{
-+   Array<BilinearFormIntegrator *> &integrators = *a->GetDBFI();
-+   auto Apply = [](const int nelem, const int ndofs, const Vector &data,
-+                   const Vector &x, Vector &y)
-+   {
-+      auto X = Reshape(x.Read(), ndofs, nelem);
-+      auto Y = Reshape(y.ReadWrite(), ndofs, nelem);
-+      auto A = Reshape(data.Read(), ndofs, ndofs, nelem);
-+      mfem::forall(nelem * ndofs, [=] MFEM_HOST_DEVICE (int k)
-       {
--         const int e = glob_j/NDOFS;
--         const int j = glob_j%NDOFS;
-+         const int e = k / ndofs;
-+         const int j = k % ndofs;
-          double res = 0.0;
--         for (int i = 0; i < NDOFS; i++)
-+         for (int i = 0; i < ndofs; i++)
-          {
--            res += A(i, j, e)*X(i, e);
-+            res += A(i, j, e) * X(i, e);
-          }
-          Y(j, e) += res;
-       });
--      // Apply the Element Restriction transposed
--      if (useRestrict)
-+   };
-+   if (elem_restrict)
-+   {
-+      if (integrators.Size() > 0)
-       {
--         elem_restrict->MultTranspose(localY, y);
-+         elem_restrict->Mult(x, local_x);
-+         local_y = 0.0;
-+         Apply(ne, elem_dofs, ea_data, local_x, local_y);
-+         elem_restrict->MultTranspose(local_y, y);
-+      }
-+      else
-+      {
-+         y = 0.0;
-+      }
-+   }
-+   else
-+   {
-+      y.UseDevice(true); // typically this is a large vector, so store on device
-+      y = 0.0;
-+      if (integrators.Size() > 0)
-+      {
-+         Apply(ne, elem_dofs, ea_data, x, y);
-       }
-    }
- 
-    // Treatment of interior faces
--   Array<BilinearFormIntegrator*> &intFaceIntegrators = *a->GetFBFI();
--   const int iFISz = intFaceIntegrators.Size();
--   if (int_face_restrict_lex && iFISz>0)
--   {
--      // Apply the Interior Face Restriction
--      int_face_restrict_lex->Mult(x, int_face_X);
--      if (int_face_X.Size()>0)
--      {
--         int_face_Y = 0.0;
--         // Apply the interior face matrices
--         const int NDOFS = faceDofs;
--         auto X = Reshape(int_face_X.Read(), NDOFS, 2, nf_int);
--         auto Y = Reshape(int_face_Y.ReadWrite(), NDOFS, 2, nf_int);
-+   Array<BilinearFormIntegrator *> &int_face_integrators = *a->GetFBFI();
-+   auto ApplyIntFace = [](const int nface, const int ndofs, const Vector &data,
-+                          const Vector &x, Vector &y)
-+   {
-+      auto X = Reshape(x.Read(), ndofs, 2, nface);
-+      auto Y = Reshape(y.ReadWrite(), ndofs, 2, nface);
-+      auto A = Reshape(data.Read(), ndofs, ndofs, 2, nface);
-+      mfem::forall(nface * ndofs, [=] MFEM_HOST_DEVICE (int k)
-+      {
-+         const int f = k / ndofs;
-+         const int j = k % ndofs;
-+         double res = 0.0;
-+         for (int i = 0; i < ndofs; i++)
-+         {
-+            res += A(i, j, 0, f) * X(i, 0, f);
-+         }
-+         Y(j, 0, f) += res;
-+         res = 0.0;
-+         for (int i = 0; i < ndofs; i++)
-+         {
-+            res += A(i, j, 1, f) * X(i, 1, f);
-+         }
-+         Y(j, 1, f) += res;
-+      });
-+   };
-+   auto ApplyExtFace = [](const int nface, const int ndofs, const Vector &data,
-+                          const Vector &x, Vector &y)
-+   {
-+      auto X = Reshape(x.Read(), ndofs, 2, nface);
-+      auto Y = Reshape(y.ReadWrite(), ndofs, 2, nface);
-+      auto A = Reshape(data.Read(), ndofs, ndofs, 2, nface);
-+      mfem::forall(nface * ndofs, [=] MFEM_HOST_DEVICE (int k)
-+      {
-+         const int f = k / ndofs;
-+         const int j = k % ndofs;
-+         double res = 0.0;
-+         for (int i = 0; i < ndofs; i++)
-+         {
-+            res += A(i, j, 0, f) * X(i, 0, f);
-+         }
-+         Y(j, 1, f) += res;
-+         res = 0.0;
-+         for (int i = 0; i < ndofs; i++)
-+         {
-+            res += A(i, j, 1, f) * X(i, 1, f);
-+         }
-+         Y(j, 0, f) += res;
-+      });
-+   };
-+   if (int_face_restrict_lex && int_face_integrators.Size() > 0)
-+   {
-+      int_face_restrict_lex->Mult(x, int_face_x);
-+      if (int_face_x.Size() > 0)
-+      {
-+         int_face_y = 0.0;
-          if (!factorize_face_terms)
-          {
--            auto A_int = Reshape(ea_data_int.Read(), NDOFS, NDOFS, 2, nf_int);
--            mfem::forall(nf_int*NDOFS, [=] MFEM_HOST_DEVICE (int glob_j)
--            {
--               const int f = glob_j/NDOFS;
--               const int j = glob_j%NDOFS;
--               double res = 0.0;
--               for (int i = 0; i < NDOFS; i++)
--               {
--                  res += A_int(i, j, 0, f)*X(i, 0, f);
--               }
--               Y(j, 0, f) += res;
--               res = 0.0;
--               for (int i = 0; i < NDOFS; i++)
--               {
--                  res += A_int(i, j, 1, f)*X(i, 1, f);
--               }
--               Y(j, 1, f) += res;
--            });
--         }
--         auto A_ext = Reshape(ea_data_ext.Read(), NDOFS, NDOFS, 2, nf_int);
--         mfem::forall(nf_int*NDOFS, [=] MFEM_HOST_DEVICE (int glob_j)
--         {
--            const int f = glob_j/NDOFS;
--            const int j = glob_j%NDOFS;
--            double res = 0.0;
--            for (int i = 0; i < NDOFS; i++)
--            {
--               res += A_ext(i, j, 0, f)*X(i, 0, f);
--            }
--            Y(j, 1, f) += res;
--            res = 0.0;
--            for (int i = 0; i < NDOFS; i++)
--            {
--               res += A_ext(i, j, 1, f)*X(i, 1, f);
--            }
--            Y(j, 0, f) += res;
--         });
--         // Apply the Interior Face Restriction transposed
--         int_face_restrict_lex->AddMultTransposeInPlace(int_face_Y, y);
-+            ApplyIntFace(nf_int, face_dofs, ea_data_int, int_face_x, int_face_y);
-+         }
-+         ApplyExtFace(nf_int, face_dofs, ea_data_ext, int_face_x, int_face_y);
-+         int_face_restrict_lex->AddMultTransposeInPlace(int_face_y, y);
-       }
-    }
- 
-    // Treatment of boundary faces
--   Array<BilinearFormIntegrator*> &bdrFaceIntegrators = *a->GetBFBFI();
--   const int bFISz = bdrFaceIntegrators.Size();
--   if (!factorize_face_terms && bdr_face_restrict_lex && bFISz>0)
--   {
--      // Apply the Boundary Face Restriction
--      bdr_face_restrict_lex->Mult(x, bdr_face_X);
--      if (bdr_face_X.Size()>0)
--      {
--         bdr_face_Y = 0.0;
--         // Apply the boundary face matrices
--         const int NDOFS = faceDofs;
--         auto X = Reshape(bdr_face_X.Read(), NDOFS, nf_bdr);
--         auto Y = Reshape(bdr_face_Y.ReadWrite(), NDOFS, nf_bdr);
--         auto A = Reshape(ea_data_bdr.Read(), NDOFS, NDOFS, nf_bdr);
--         mfem::forall(nf_bdr*NDOFS, [=] MFEM_HOST_DEVICE (int glob_j)
--         {
--            const int f = glob_j/NDOFS;
--            const int j = glob_j%NDOFS;
--            double res = 0.0;
--            for (int i = 0; i < NDOFS; i++)
--            {
--               res += A(i, j, f)*X(i, f);
--            }
--            Y(j, f) += res;
--         });
--         // Apply the Boundary Face Restriction transposed
--         bdr_face_restrict_lex->AddMultTransposeInPlace(bdr_face_Y, y);
-+   Array<BilinearFormIntegrator *> &bdr_face_integrators = *a->GetBFBFI();
-+   if (!factorize_face_terms && bdr_face_restrict_lex &&
-+       bdr_face_integrators.Size() > 0)
-+   {
-+      bdr_face_restrict_lex->Mult(x, bdr_face_x);
-+      if (bdr_face_x.Size() > 0)
-+      {
-+         bdr_face_y = 0.0;
-+         Apply(nf_bdr, face_dofs, ea_data_bdr, bdr_face_x, bdr_face_y);
-+         bdr_face_restrict_lex->AddMultTransposeInPlace(bdr_face_y, y);
-       }
-    }
- }
- 
- void EABilinearFormExtension::MultTranspose(const Vector &x, Vector &y) const
- {
--   // Apply the Element Restriction
--   const bool useRestrict = !DeviceCanUseCeed() && elem_restrict;
--   if (!useRestrict)
--   {
--      y.UseDevice(true); // typically this is a large vector, so store on device
--      y = 0.0;
--   }
--   else
--   {
--      elem_restrict->Mult(x, localX);
--      localY = 0.0;
--   }
--   // Apply the Element Matrices transposed
--   {
--      const int NDOFS = elemDofs;
--      auto X = Reshape(useRestrict?localX.Read():x.Read(), NDOFS, ne);
--      auto Y = Reshape(useRestrict?localY.ReadWrite():y.ReadWrite(), NDOFS, ne);
--      auto A = Reshape(ea_data.Read(), NDOFS, NDOFS, ne);
--      mfem::forall(ne*NDOFS, [=] MFEM_HOST_DEVICE (int glob_j)
-+   Array<BilinearFormIntegrator *> &integrators = *a->GetDBFI();
-+   auto ApplyTranspose = [](const int nelem, const int ndofs, const Vector &data,
-+                            const Vector &x, Vector &y)
-+   {
-+      auto X = Reshape(x.Read(), ndofs, nelem);
-+      auto Y = Reshape(y.ReadWrite(), ndofs, nelem);
-+      auto A = Reshape(data.Read(), ndofs, ndofs, nelem);
-+      mfem::forall(nelem * ndofs, [=] MFEM_HOST_DEVICE (int k)
-       {
--         const int e = glob_j/NDOFS;
--         const int j = glob_j%NDOFS;
-+         const int e = k / ndofs;
-+         const int j = k % ndofs;
-          double res = 0.0;
--         for (int i = 0; i < NDOFS; i++)
-+         for (int i = 0; i < ndofs; i++)
-          {
--            res += A(j, i, e)*X(i, e);
-+            res += A(j, i, e) * X(i, e);
-          }
-          Y(j, e) += res;
-       });
--      // Apply the Element Restriction transposed
--      if (useRestrict)
-+   };
-+   if (elem_restrict)
-+   {
-+      if (integrators.Size() > 0)
-+      {
-+         elem_restrict->Mult(x, local_x);
-+         local_y = 0.0;
-+         ApplyTranspose(ne, elem_dofs, ea_data, local_x, local_y);
-+         elem_restrict->MultTranspose(local_y, y);
-+      }
-+      else
-       {
--         elem_restrict->MultTranspose(localY, y);
-+         y = 0.0;
-+      }
-+   }
-+   else
-+   {
-+      y.UseDevice(true); // typically this is a large vector, so store on device
-+      y = 0.0;
-+      if (integrators.Size() > 0)
-+      {
-+         ApplyTranspose(ne, elem_dofs, ea_data, x, y);
-       }
-    }
- 
-    // Treatment of interior faces
--   Array<BilinearFormIntegrator*> &intFaceIntegrators = *a->GetFBFI();
--   const int iFISz = intFaceIntegrators.Size();
--   if (int_face_restrict_lex && iFISz>0)
--   {
--      // Apply the Interior Face Restriction
--      int_face_restrict_lex->Mult(x, int_face_X);
--      if (int_face_X.Size()>0)
--      {
--         int_face_Y = 0.0;
--         // Apply the interior face matrices transposed
--         const int NDOFS = faceDofs;
--         auto X = Reshape(int_face_X.Read(), NDOFS, 2, nf_int);
--         auto Y = Reshape(int_face_Y.ReadWrite(), NDOFS, 2, nf_int);
-+   Array<BilinearFormIntegrator *> &int_face_integrators = *a->GetFBFI();
-+   auto ApplyIntFaceTranspose = [](const int nface, const int ndofs,
-+                                   const Vector &data, const Vector &x, Vector &y)
-+   {
-+      auto X = Reshape(x.Read(), ndofs, 2, nface);
-+      auto Y = Reshape(y.ReadWrite(), ndofs, 2, nface);
-+      auto A = Reshape(data.Read(), ndofs, ndofs, 2, nface);
-+      mfem::forall(nface * ndofs, [=] MFEM_HOST_DEVICE (int k)
-+      {
-+         const int f = k / ndofs;
-+         const int j = k % ndofs;
-+         double res = 0.0;
-+         for (int i = 0; i < ndofs; i++)
-+         {
-+            res += A(j, i, 0, f) * X(i, 0, f);
-+         }
-+         Y(j, 0, f) += res;
-+         res = 0.0;
-+         for (int i = 0; i < ndofs; i++)
-+         {
-+            res += A(j, i, 1, f) * X(i, 1, f);
-+         }
-+         Y(j, 1, f) += res;
-+      });
-+   };
-+   auto ApplyExtFaceTranspose = [](const int nface, const int ndofs,
-+                                   const Vector &data, const Vector &x, Vector &y)
-+   {
-+      auto X = Reshape(x.Read(), ndofs, 2, nface);
-+      auto Y = Reshape(y.ReadWrite(), ndofs, 2, nface);
-+      auto A = Reshape(data.Read(), ndofs, ndofs, 2, nface);
-+      mfem::forall(nface * ndofs, [=] MFEM_HOST_DEVICE (int k)
-+      {
-+         const int f = k / ndofs;
-+         const int j = k % ndofs;
-+         double res = 0.0;
-+         for (int i = 0; i < ndofs; i++)
-+         {
-+            res += A(j, i, 1, f) * X(i, 0, f);
-+         }
-+         Y(j, 1, f) += res;
-+         res = 0.0;
-+         for (int i = 0; i < ndofs; i++)
-+         {
-+            res += A(j, i, 0, f) * X(i, 1, f);
-+         }
-+         Y(j, 0, f) += res;
-+      });
-+   };
-+   if (int_face_restrict_lex && int_face_integrators.Size() > 0)
-+   {
-+      int_face_restrict_lex->Mult(x, int_face_x);
-+      if (int_face_x.Size() > 0)
-+      {
-+         int_face_y = 0.0;
-          if (!factorize_face_terms)
-          {
--            auto A_int = Reshape(ea_data_int.Read(), NDOFS, NDOFS, 2, nf_int);
--            mfem::forall(nf_int*NDOFS, [=] MFEM_HOST_DEVICE (int glob_j)
--            {
--               const int f = glob_j/NDOFS;
--               const int j = glob_j%NDOFS;
--               double res = 0.0;
--               for (int i = 0; i < NDOFS; i++)
--               {
--                  res += A_int(j, i, 0, f)*X(i, 0, f);
--               }
--               Y(j, 0, f) += res;
--               res = 0.0;
--               for (int i = 0; i < NDOFS; i++)
--               {
--                  res += A_int(j, i, 1, f)*X(i, 1, f);
--               }
--               Y(j, 1, f) += res;
--            });
--         }
--         auto A_ext = Reshape(ea_data_ext.Read(), NDOFS, NDOFS, 2, nf_int);
--         mfem::forall(nf_int*NDOFS, [=] MFEM_HOST_DEVICE (int glob_j)
--         {
--            const int f = glob_j/NDOFS;
--            const int j = glob_j%NDOFS;
--            double res = 0.0;
--            for (int i = 0; i < NDOFS; i++)
--            {
--               res += A_ext(j, i, 1, f)*X(i, 0, f);
--            }
--            Y(j, 1, f) += res;
--            res = 0.0;
--            for (int i = 0; i < NDOFS; i++)
--            {
--               res += A_ext(j, i, 0, f)*X(i, 1, f);
--            }
--            Y(j, 0, f) += res;
--         });
--         // Apply the Interior Face Restriction transposed
--         int_face_restrict_lex->AddMultTransposeInPlace(int_face_Y, y);
-+            ApplyIntFaceTranspose(nf_int, face_dofs, ea_data_int, int_face_x, int_face_y);
-+         }
-+         ApplyExtFaceTranspose(nf_int, face_dofs, ea_data_ext, int_face_x, int_face_y);
-+         int_face_restrict_lex->AddMultTransposeInPlace(int_face_y, y);
-       }
-    }
- 
-    // Treatment of boundary faces
--   Array<BilinearFormIntegrator*> &bdrFaceIntegrators = *a->GetBFBFI();
--   const int bFISz = bdrFaceIntegrators.Size();
--   if (!factorize_face_terms && bdr_face_restrict_lex && bFISz>0)
--   {
--      // Apply the Boundary Face Restriction
--      bdr_face_restrict_lex->Mult(x, bdr_face_X);
--      if (bdr_face_X.Size()>0)
--      {
--         bdr_face_Y = 0.0;
--         // Apply the boundary face matrices transposed
--         const int NDOFS = faceDofs;
--         auto X = Reshape(bdr_face_X.Read(), NDOFS, nf_bdr);
--         auto Y = Reshape(bdr_face_Y.ReadWrite(), NDOFS, nf_bdr);
--         auto A = Reshape(ea_data_bdr.Read(), NDOFS, NDOFS, nf_bdr);
--         mfem::forall(nf_bdr*NDOFS, [=] MFEM_HOST_DEVICE (int glob_j)
--         {
--            const int f = glob_j/NDOFS;
--            const int j = glob_j%NDOFS;
--            double res = 0.0;
--            for (int i = 0; i < NDOFS; i++)
--            {
--               res += A(j, i, f)*X(i, f);
--            }
--            Y(j, f) += res;
--         });
--         // Apply the Boundary Face Restriction transposed
--         bdr_face_restrict_lex->AddMultTransposeInPlace(bdr_face_Y, y);
-+   Array<BilinearFormIntegrator *> &bdr_face_integrators = *a->GetBFBFI();
-+   if (!factorize_face_terms && bdr_face_restrict_lex &&
-+       bdr_face_integrators.Size() > 0)
-+   {
-+      bdr_face_restrict_lex->Mult(x, bdr_face_x);
-+      if (bdr_face_x.Size() > 0)
-+      {
-+         bdr_face_y = 0.0;
-+         ApplyTranspose(nf_bdr, face_dofs, ea_data_bdr, bdr_face_x, bdr_face_y);
-+         bdr_face_restrict_lex->AddMultTransposeInPlace(bdr_face_y, y);
-       }
-    }
- }
- 
--// Data and methods for fully-assembled bilinear forms
-+/// Data and methods for fully-assembled bilinear forms
- FABilinearFormExtension::FABilinearFormExtension(BilinearForm *form)
-    : EABilinearFormExtension(form),
-      mat(a->mat)
- {
- #ifdef MFEM_USE_MPI
--   ParFiniteElementSpace *pfes = nullptr;
--   if ( a->GetFBFI()->Size()>0 &&
--        (pfes = dynamic_cast<ParFiniteElementSpace*>(form->FESpace())) )
-+   const ParFiniteElementSpace *pfes = nullptr;
-+   if (a->GetFBFI()->Size() > 0 &&
-+       (pfes = dynamic_cast<const ParFiniteElementSpace *>(form->FESpace())))
-    {
--      pfes->ExchangeFaceNbrData();
-+      const_cast<ParFiniteElementSpace *>(pfes)->ExchangeFaceNbrData();
-    }
- #endif
- }
-@@ -850,20 +1264,20 @@ FABilinearFormExtension::FABilinearFormExtension(BilinearForm *form)
- void FABilinearFormExtension::Assemble()
- {
-    EABilinearFormExtension::Assemble();
--   FiniteElementSpace &fes = *a->FESpace();
--   int width = fes.GetVSize();
--   int height = fes.GetVSize();
-+
-+   int width = fes->GetVSize();
-+   int height = fes->GetVSize();
-    bool keep_nbr_block = false;
- #ifdef MFEM_USE_MPI
--   ParFiniteElementSpace *pfes = nullptr;
--   if ( a->GetFBFI()->Size()>0 &&
--        (pfes = dynamic_cast<ParFiniteElementSpace*>(&fes)) )
-+   const ParFiniteElementSpace *pfes = nullptr;
-+   if (a->GetFBFI()->Size() > 0 &&
-+       (pfes = dynamic_cast<const ParFiniteElementSpace *>(fes)))
-    {
--      pfes->ExchangeFaceNbrData();
-+      const_cast<ParFiniteElementSpace *>(pfes)->ExchangeFaceNbrData();
-       width += pfes->GetFaceNbrVSize();
-       dg_x.SetSize(width);
-       ParBilinearForm *pb = nullptr;
--      if ((pb = dynamic_cast<ParBilinearForm*>(a)) && (pb->keep_nbr_block))
-+      if ((pb = dynamic_cast<ParBilinearForm *>(a)) && pb->keep_nbr_block)
-       {
-          height += pfes->GetFaceNbrVSize();
-          dg_y.SetSize(height);
-@@ -873,15 +1287,14 @@ void FABilinearFormExtension::Assemble()
- #endif
-    if (a->mat) // We reuse the sparse matrix memory
-    {
--      if (fes.IsDGSpace())
-+      if (fes->IsDGSpace())
-       {
--         const L2ElementRestriction *restE =
--            static_cast<const L2ElementRestriction*>(elem_restrict);
--         const L2FaceRestriction *restF =
--            static_cast<const L2FaceRestriction*>(int_face_restrict_lex);
--         MFEM_VERIFY(
--            fes.Conforming(),
--            "Full Assembly not yet supported on NCMesh.");
-+         const auto *restE =
-+            static_cast<const L2ElementRestriction *>(elem_restrict);
-+         const auto *restF =
-+            static_cast<const L2FaceRestriction *>(int_face_restrict_lex);
-+         MFEM_VERIFY(fes->Conforming(),
-+                     "Full Assembly not yet supported on NCMesh.");
-          // 1. Fill J and Data
-          // 1.1 Fill J and Data with Elem ea_data
-          restE->FillJAndData(ea_data, *mat);
-@@ -897,8 +1310,8 @@ void FABilinearFormExtension::Assemble()
-       }
-       else
-       {
--         const ElementRestriction &rest =
--            static_cast<const ElementRestriction&>(*elem_restrict);
-+         const auto &rest =
-+            static_cast<const ConformingElementRestriction&>(*elem_restrict);
-          rest.FillJAndData(ea_data, *mat);
-       }
-    }
-@@ -906,15 +1319,14 @@ void FABilinearFormExtension::Assemble()
-    {
-       mat = new SparseMatrix;
-       mat->OverrideSize(height, width);
--      if (fes.IsDGSpace())
--      {
--         const L2ElementRestriction *restE =
--            static_cast<const L2ElementRestriction*>(elem_restrict);
--         const L2FaceRestriction *restF =
--            static_cast<const L2FaceRestriction*>(int_face_restrict_lex);
--         MFEM_VERIFY(
--            fes.Conforming(),
--            "Full Assembly not yet supported on NCMesh.");
-+      if (fes->IsDGSpace())
-+      {
-+         const auto *restE =
-+            static_cast<const L2ElementRestriction *>(elem_restrict);
-+         const auto *restF =
-+            static_cast<const L2FaceRestriction *>(int_face_restrict_lex);
-+         MFEM_VERIFY(fes->Conforming(),
-+                     "Full Assembly not yet supported on NCMesh.");
-          // 1. Fill I
-          mat->GetMemoryI().New(height+1, mat->GetMemoryI().GetMemoryType());
-          //  1.1 Increment with restE
-@@ -947,87 +1359,32 @@ void FABilinearFormExtension::Assemble()
-          }
-          I[0] = 0;
-       }
--      else // continuous Galerkin case
-+      else
-       {
--         const ElementRestriction &rest =
--            static_cast<const ElementRestriction&>(*elem_restrict);
-+         const auto &rest =
-+            static_cast<const ConformingElementRestriction &>(*elem_restrict);
-          rest.FillSparseMatrix(ea_data, *mat);
-       }
-       a->mat = mat;
-    }
--   if ( a->sort_sparse_matrix )
-+   if (a->sort_sparse_matrix)
-    {
-       a->mat->SortColumnIndices();
-    }
- }
- 
--
--void FABilinearFormExtension::RAP(OperatorHandle &A)
--{
--#ifdef MFEM_USE_MPI
--   if ( auto pa = dynamic_cast<ParBilinearForm*>(a) )
--   {
--      pa->ParallelRAP(*pa->mat, A);
--   }
--   else
--#endif
--   {
--      a->SerialRAP(A);
--   }
--}
--
--void FABilinearFormExtension::EliminateBC(const Array<int> &ess_dofs,
--                                          OperatorHandle &A)
--{
--   MFEM_VERIFY(a->diag_policy == DiagonalPolicy::DIAG_ONE,
--               "Only DiagonalPolicy::DIAG_ONE supported with"
--               " FABilinearFormExtension.");
--#ifdef MFEM_USE_MPI
--   if ( dynamic_cast<ParBilinearForm*>(a) )
--   {
--      A.As<HypreParMatrix>()->EliminateBC(ess_dofs,
--                                          DiagonalPolicy::DIAG_ONE);
--   }
--   else
--#endif
--   {
--      A.As<SparseMatrix>()->EliminateBC(ess_dofs,
--                                        DiagonalPolicy::DIAG_ONE);
--   }
--}
--
--void FABilinearFormExtension::FormSystemMatrix(const Array<int> &ess_dofs,
--                                               OperatorHandle &A)
--{
--   RAP(A);
--   EliminateBC(ess_dofs, A);
--}
--
--void FABilinearFormExtension::FormLinearSystem(const Array<int> &ess_tdof_list,
--                                               Vector &x, Vector &b,
--                                               OperatorHandle &A,
--                                               Vector &X, Vector &B,
--                                               int copy_interior)
--{
--   Operator *A_out;
--   Operator::FormLinearSystem(ess_tdof_list, x, b, A_out, X, B, copy_interior);
--   delete A_out;
--   FormSystemMatrix(ess_tdof_list, A);
--}
--
- void FABilinearFormExtension::DGMult(const Vector &x, Vector &y) const
- {
- #ifdef MFEM_USE_MPI
--   const ParFiniteElementSpace *pfes;
--   if ( (pfes = dynamic_cast<const ParFiniteElementSpace*>(test_fes)) )
-+   if (const auto pfes = dynamic_cast<const ParFiniteElementSpace *>(fes))
-    {
-       // DG Prolongation
-       ParGridFunction x_gf;
--      x_gf.MakeRef(const_cast<ParFiniteElementSpace*>(pfes),
--                   const_cast<Vector&>(x),0);
-+      x_gf.MakeRef(const_cast<ParFiniteElementSpace *>(pfes),
-+                   const_cast<Vector &>(x), 0);
-       x_gf.ExchangeFaceNbrData();
-       Vector &shared_x = x_gf.FaceNbrData();
--      const int local_size = a->FESpace()->GetVSize();
-+      const int local_size = fes->GetVSize();
-       auto dg_x_ptr = dg_x.Write();
-       auto x_ptr = x.Read();
-       mfem::forall(local_size, [=] MFEM_HOST_DEVICE (int i)
-@@ -1040,8 +1397,8 @@ void FABilinearFormExtension::DGMult(const Vector &x, Vector &y) const
-       {
-          dg_x_ptr[local_size+i] = shared_x_ptr[i];
-       });
--      ParBilinearForm *pform = nullptr;
--      if ((pform = dynamic_cast<ParBilinearForm*>(a)) && (pform->keep_nbr_block))
-+      ParBilinearForm *pb = nullptr;
-+      if ((pb = dynamic_cast<ParBilinearForm *>(a)) && pb->keep_nbr_block)
-       {
-          mat->Mult(dg_x, dg_y);
-          // DG Restriction
-@@ -1066,7 +1423,7 @@ void FABilinearFormExtension::DGMult(const Vector &x, Vector &y) const
- 
- void FABilinearFormExtension::Mult(const Vector &x, Vector &y) const
- {
--   if ( a->GetFBFI()->Size()>0 )
-+   if (a->GetFBFI()->Size() > 0)
-    {
-       DGMult(x, y);
-    }
-@@ -1079,16 +1436,15 @@ void FABilinearFormExtension::Mult(const Vector &x, Vector &y) const
- void FABilinearFormExtension::DGMultTranspose(const Vector &x, Vector &y) const
- {
- #ifdef MFEM_USE_MPI
--   const ParFiniteElementSpace *pfes;
--   if ( (pfes = dynamic_cast<const ParFiniteElementSpace*>(test_fes)) )
-+   if (const auto pfes = dynamic_cast<const ParFiniteElementSpace *>(fes))
-    {
-       // DG Prolongation
-       ParGridFunction x_gf;
--      x_gf.MakeRef(const_cast<ParFiniteElementSpace*>(pfes),
--                   const_cast<Vector&>(x),0);
-+      x_gf.MakeRef(const_cast<ParFiniteElementSpace *>(pfes),
-+                   const_cast<Vector &>(x), 0);
-       x_gf.ExchangeFaceNbrData();
-       Vector &shared_x = x_gf.FaceNbrData();
--      const int local_size = a->FESpace()->GetVSize();
-+      const int local_size = fes->GetVSize();
-       auto dg_x_ptr = dg_x.Write();
-       auto x_ptr = x.Read();
-       mfem::forall(local_size, [=] MFEM_HOST_DEVICE (int i)
-@@ -1102,7 +1458,7 @@ void FABilinearFormExtension::DGMultTranspose(const Vector &x, Vector &y) const
-          dg_x_ptr[local_size+i] = shared_x_ptr[i];
-       });
-       ParBilinearForm *pb = nullptr;
--      if ((pb = dynamic_cast<ParBilinearForm*>(a)) && (pb->keep_nbr_block))
-+      if ((pb = dynamic_cast<ParBilinearForm *>(a)) && (pb->keep_nbr_block))
-       {
-          mat->MultTranspose(dg_x, dg_y);
-          // DG Restriction
-@@ -1127,7 +1483,7 @@ void FABilinearFormExtension::DGMultTranspose(const Vector &x, Vector &y) const
- 
- void FABilinearFormExtension::MultTranspose(const Vector &x, Vector &y) const
- {
--   if ( a->GetFBFI()->Size()>0 )
-+   if (a->GetFBFI()->Size() > 0)
-    {
-       DGMultTranspose(x, y);
-    }
-@@ -1138,10 +1494,10 @@ void FABilinearFormExtension::MultTranspose(const Vector &x, Vector &y) const
- }
- 
- 
-+/// Base class for extensions to the MixedBilinearForm class
- MixedBilinearFormExtension::MixedBilinearFormExtension(MixedBilinearForm *form)
-    : Operator(form->Height(), form->Width()), a(form)
- {
--   // empty
- }
- 
- const Operator *MixedBilinearFormExtension::GetProlongation() const
-@@ -1164,377 +1520,690 @@ const Operator *MixedBilinearFormExtension::GetOutputRestriction() const
-    return a->GetOutputRestriction();
- }
- 
--// Data and methods for partially-assembled bilinear forms
--
--PAMixedBilinearFormExtension::PAMixedBilinearFormExtension(
-+/// Data and methods for matrix-free mixed bilinear forms
-+MFMixedBilinearFormExtension::MFMixedBilinearFormExtension(
-    MixedBilinearForm *form)
--   : MixedBilinearFormExtension(form),
--     trial_fes(form->TrialFESpace()),
--     test_fes(form->TestFESpace()),
--     elem_restrict_trial(NULL),
--     elem_restrict_test(NULL)
-+   : MixedBilinearFormExtension(form)
- {
-    Update();
- }
- 
--void PAMixedBilinearFormExtension::Assemble()
-+void MFMixedBilinearFormExtension::SetupRestrictionOperators(
-+   const L2FaceValues m)
- {
--   Array<BilinearFormIntegrator*> &integrators = *a->GetDBFI();
--   const int integratorCount = integrators.Size();
--   for (int i = 0; i < integratorCount; ++i)
-+   if (DeviceCanUseCeed()) { return; }
-+   ElementDofOrdering trial_ordering = UsesTensorBasis(*trial_fes) ?
-+                                       ElementDofOrdering::LEXICOGRAPHIC :
-+                                       ElementDofOrdering::NATIVE;
-+   ElementDofOrdering test_ordering = UsesTensorBasis(*test_fes) ?
-+                                      ElementDofOrdering::LEXICOGRAPHIC :
-+                                      ElementDofOrdering::NATIVE;
-+   elem_restrict_trial = trial_fes->GetElementRestriction(trial_ordering);
-+   elem_restrict_test = test_fes->GetElementRestriction(test_ordering);
-+   if (elem_restrict_trial)
-    {
--      integrators[i]->AssemblePA(*trial_fes, *test_fes);
-+      local_trial.SetSize(elem_restrict_trial->Height(),
-+                          Device::GetDeviceMemoryType());
-+      local_trial.UseDevice(true); // ensure 'local_trial = 0.0' is done on device
-+   }
-+   if (elem_restrict_test)
-+   {
-+      local_test.SetSize(elem_restrict_test->Height(),
-+                         Device::GetDeviceMemoryType());
-+      local_test.UseDevice(true); // ensure 'local_test = 0.0' is done on device
-+   }
-+
-+   // Construct face restriction operators only if the bilinear form has
-+   // interior or boundary face integrators
-+   if (a->GetTFBFI()->Size() > 0)
-+   {
-+      if (int_face_restrict_lex_trial == nullptr)
-+      {
-+         int_face_restrict_lex_trial = trial_fes->GetFaceRestriction(
-+                                          ElementDofOrdering::LEXICOGRAPHIC,
-+                                          FaceType::Interior);
-+         int_face_trial.SetSize(int_face_restrict_lex_trial->Height(),
-+                                Device::GetDeviceMemoryType());
-+         int_face_trial.UseDevice(true);
-+      }
-+      if (int_face_restrict_lex_test == nullptr)
-+      {
-+         int_face_restrict_lex_test = test_fes->GetFaceRestriction(
-+                                         ElementDofOrdering::LEXICOGRAPHIC,
-+                                         FaceType::Interior);
-+         int_face_test.SetSize(int_face_restrict_lex_test->Height(),
-+                               Device::GetDeviceMemoryType());
-+         int_face_test.UseDevice(true);
-+      }
-+   }
-+
-+   const bool has_bdr_integs = (a->GetBTFBFI()->Size() > 0 ||
-+                                a->GetBBFI()->Size() > 0);
-+   if (has_bdr_integs)
-+   {
-+      if (bdr_face_restrict_lex_trial == nullptr)
-+      {
-+         bdr_face_restrict_lex_trial = trial_fes->GetFaceRestriction(
-+                                          ElementDofOrdering::LEXICOGRAPHIC,
-+                                          FaceType::Boundary,
-+                                          m);
-+         bdr_face_trial.SetSize(bdr_face_restrict_lex_trial->Height(),
-+                                Device::GetDeviceMemoryType());
-+         bdr_face_trial.UseDevice(true);
-+      }
-+      if (bdr_face_restrict_lex_test == nullptr)
-+      {
-+         bdr_face_restrict_lex_test = test_fes->GetFaceRestriction(
-+                                         ElementDofOrdering::LEXICOGRAPHIC,
-+                                         FaceType::Boundary,
-+                                         m);
-+         bdr_face_test.SetSize(bdr_face_restrict_lex_test->Height(),
-+                               Device::GetDeviceMemoryType());
-+         bdr_face_test.UseDevice(true);
-+      }
-    }
--   MFEM_VERIFY(a->GetBBFI()->Size() == 0,
--               "Partial assembly does not support AddBoundaryIntegrator yet.");
--   MFEM_VERIFY(a->GetTFBFI()->Size() == 0,
--               "Partial assembly does not support AddTraceFaceIntegrator yet.");
--   MFEM_VERIFY(a->GetBTFBFI()->Size() == 0,
--               "Partial assembly does not support AddBdrTraceFaceIntegrator yet.");
- }
- 
--void PAMixedBilinearFormExtension::Update()
-+void MFMixedBilinearFormExtension::Assemble()
- {
--   trial_fes = a->TrialFESpace();
--   test_fes  = a->TestFESpace();
--   height = test_fes->GetVSize();
--   width = trial_fes->GetVSize();
--   elem_restrict_trial = trial_fes->GetElementRestriction(
--                            ElementDofOrdering::LEXICOGRAPHIC);
--   elem_restrict_test  =  test_fes->GetElementRestriction(
--                             ElementDofOrdering::LEXICOGRAPHIC);
--   if (elem_restrict_trial)
-+   SetupRestrictionOperators(L2FaceValues::DoubleValued);
-+
-+   Array<BilinearFormIntegrator *> &integrators = *a->GetDBFI();
-+   for (BilinearFormIntegrator *integ : integrators)
-    {
--      localTrial.UseDevice(true);
--      localTrial.SetSize(elem_restrict_trial->Height(),
--                         Device::GetMemoryType());
-+      integ->AssembleMF(*trial_fes, *test_fes);
-    }
--   if (elem_restrict_test)
-+
-+   Array<BilinearFormIntegrator *> &bdr_integrators = *a->GetBBFI();
-+   for (BilinearFormIntegrator *integ : bdr_integrators)
-    {
--      localTest.UseDevice(true); // ensure 'localY = 0.0' is done on device
--      localTest.SetSize(elem_restrict_test->Height(), Device::GetMemoryType());
-+      integ->AssembleMFBoundary(*trial_fes, *test_fes);
-    }
--}
- 
--void PAMixedBilinearFormExtension::FormRectangularSystemOperator(
--   const Array<int> &trial_tdof_list,
--   const Array<int> &test_tdof_list,
--   OperatorHandle &A)
--{
--   Operator * oper;
--   Operator::FormRectangularSystemOperator(trial_tdof_list, test_tdof_list,
--                                           oper);
--   A.Reset(oper); // A will own oper
-+   MFEM_VERIFY(a->GetTFBFI()->Size() == 0, "AddInteriorFaceIntegrator is not "
-+               "currently supported in MFMixedBilinearFormExtension");
-+
-+   MFEM_VERIFY(a->GetBTFBFI()->Size() == 0, "AddBdrFaceIntegrator is not "
-+               "currently supported in MFMixedBilinearFormExtension");
- }
- 
--void PAMixedBilinearFormExtension::FormRectangularLinearSystem(
--   const Array<int> &trial_tdof_list,
--   const Array<int> &test_tdof_list,
--   Vector &x, Vector &b,
--   OperatorHandle &A,
--   Vector &X, Vector &B)
-+void MFMixedBilinearFormExtension::Mult(const Vector &x, Vector &y) const
- {
--   Operator *oper;
--   Operator::FormRectangularLinearSystem(trial_tdof_list, test_tdof_list, x, b,
--                                         oper, X, B);
--   A.Reset(oper); // A will own oper
-+   y = 0.0;
-+   AddMult(x, y);
- }
- 
--void PAMixedBilinearFormExtension::SetupMultInputs(
--   const Operator *elem_restrict_x,
--   const Vector &x,
--   Vector &localX,
--   const Operator *elem_restrict_y,
--   Vector &y,
--   Vector &localY,
--   const double c) const
-+void MFMixedBilinearFormExtension::AddMult(const Vector &x, Vector &y,
-+                                           const double c) const
- {
--   // * G operation: localX = c*local(x)
--   if (elem_restrict_x)
-+   Array<BilinearFormIntegrator *> &integrators = *a->GetDBFI();
-+   if (elem_restrict_trial && integrators.Size() > 0)
-+   {
-+      elem_restrict_trial->Mult(x, local_trial);
-+   }
-+   if (elem_restrict_test && integrators.Size() > 0)
-    {
--      elem_restrict_x->Mult(x, localX);
-+      local_test = 0.0;
-+      for (BilinearFormIntegrator *integ : integrators)
-+      {
-+         integ->AddMultMF(elem_restrict_trial ? local_trial : x, local_test);
-+      }
-       if (c != 1.0)
-       {
--         localX *= c;
-+         local_test *= c;
-       }
-+      elem_restrict_test->AddMultTranspose(local_test, y);
-    }
-    else
-    {
--      if (c == 1.0)
-+      y.UseDevice(true); // typically this is a large vector, so store on device
-+      if (c != 1.0 && integrators.Size() > 0)
-       {
--         localX.SyncAliasMemory(x);
-+         temp_test.SetSize(y.Size());
-+         temp_test.UseDevice(true);
-+         temp_test = 0.0;
-+         for (BilinearFormIntegrator *integ : integrators)
-+         {
-+            integ->AddMultMF(elem_restrict_trial ? local_trial : x, temp_test);
-+         }
-+         y.Add(c, temp_test);
-       }
-       else
-       {
--         localX.Set(c, x);
-+         for (BilinearFormIntegrator *integ : integrators)
-+         {
-+            integ->AddMultMF(elem_restrict_trial ? local_trial : x, y);
-+         }
-       }
-    }
--   if (elem_restrict_y)
-+
-+   Array<BilinearFormIntegrator *> &bdr_integrators = *a->GetBBFI();
-+   if (bdr_face_restrict_lex_trial && bdr_integrators.Size() > 0)
-+   {
-+      bdr_face_restrict_lex_trial->Mult(x, bdr_face_trial);
-+   }
-+   if (bdr_face_restrict_lex_test && bdr_integrators.Size() > 0)
-    {
--      localY = 0.0;
-+      bdr_face_test = 0.0;
-+      for (BilinearFormIntegrator *integ : bdr_integrators)
-+      {
-+         integ->AddMultMF(bdr_face_restrict_lex_trial ? bdr_face_trial : x,
-+                          bdr_face_test);
-+      }
-+      if (c != 1.0)
-+      {
-+         bdr_face_test *= c;
-+      }
-+      bdr_face_restrict_lex_test->AddMultTranspose(bdr_face_test, y);
-    }
-    else
-    {
--      y.UseDevice(true);
--      localY.SyncAliasMemory(y);
-+      if (c != 1.0 && bdr_integrators.Size() > 0)
-+      {
-+         temp_test.SetSize(y.Size());
-+         temp_test.UseDevice(true);
-+         temp_test = 0.0;
-+         for (BilinearFormIntegrator *integ : bdr_integrators)
-+         {
-+            integ->AddMultMF(bdr_face_restrict_lex_trial ? bdr_face_trial : x,
-+                             temp_test);
-+         }
-+         y.Add(c, temp_test);
-+      }
-+      else
-+      {
-+         for (BilinearFormIntegrator *integ : bdr_integrators)
-+         {
-+            integ->AddMultMF(bdr_face_restrict_lex_trial ? bdr_face_trial : x, y);
-+         }
-+      }
-    }
- }
- 
--void PAMixedBilinearFormExtension::Mult(const Vector &x, Vector &y) const
-+void MFMixedBilinearFormExtension::MultTranspose(const Vector &x,
-+                                                 Vector &y) const
- {
-    y = 0.0;
--   AddMult(x, y);
-+   AddMultTranspose(x, y);
- }
- 
--void PAMixedBilinearFormExtension::AddMult(const Vector &x, Vector &y,
--                                           const double c) const
-+void MFMixedBilinearFormExtension::AddMultTranspose(const Vector &x, Vector &y,
-+                                                    const double c) const
- {
--   Array<BilinearFormIntegrator*> &integrators = *a->GetDBFI();
--   const int iSz = integrators.Size();
--
--   // * G operation
--   SetupMultInputs(elem_restrict_trial, x, localTrial,
--                   elem_restrict_test, y, localTest, c);
--
--   // * B^TDB operation
--   for (int i = 0; i < iSz; ++i)
-+   Array<BilinearFormIntegrator *> &integrators = *a->GetDBFI();
-+   if (integrators.Size() > 0)
-    {
--      integrators[i]->AddMultPA(localTrial, localTest);
-+      if (elem_restrict_test)
-+      {
-+         elem_restrict_test->Mult(x, local_test);
-+      }
-+      if (elem_restrict_trial)
-+      {
-+         local_trial = 0.0;
-+         for (BilinearFormIntegrator *integ : integrators)
-+         {
-+            integ->AddMultTransposeMF(elem_restrict_test ? local_test : x,
-+                                      local_trial);
-+         }
-+         if (c != 1.0)
-+         {
-+            local_trial *= c;
-+         }
-+         elem_restrict_trial->AddMultTranspose(local_trial, y);
-+      }
-+      else
-+      {
-+         y.UseDevice(true); // typically this is a large vector, so store on device
-+         if (c != 1.0)
-+         {
-+            temp_trial.SetSize(y.Size());
-+            temp_trial.UseDevice(true);
-+            temp_trial = 0.0;
-+            for (BilinearFormIntegrator *integ : integrators)
-+            {
-+               integ->AddMultTransposeMF(elem_restrict_test ? local_test : x,
-+                                         temp_trial);
-+            }
-+            y.Add(c, temp_trial);
-+         }
-+         else
-+         {
-+            for (BilinearFormIntegrator *integ : integrators)
-+            {
-+               integ->AddMultTransposeMF(elem_restrict_test ? local_test : x, y);
-+            }
-+         }
-+      }
-    }
- 
--   // * G^T operation
--   if (elem_restrict_test)
-+   Array<BilinearFormIntegrator *> &bdr_integrators = *a->GetBBFI();
-+   if (bdr_face_restrict_lex_test && bdr_integrators.Size() > 0)
-+   {
-+      bdr_face_restrict_lex_test->Mult(x, bdr_face_test);
-+   }
-+   if (bdr_face_restrict_lex_trial && bdr_integrators.Size() > 0)
-+   {
-+      bdr_face_trial = 0.0;
-+      for (BilinearFormIntegrator *integ : bdr_integrators)
-+      {
-+         integ->AddMultTransposeMF(bdr_face_restrict_lex_test ? bdr_face_test : x,
-+                                   bdr_face_trial);
-+      }
-+      if (c != 1.0)
-+      {
-+         bdr_face_trial *= c;
-+      }
-+      bdr_face_restrict_lex_trial->AddMultTranspose(bdr_face_trial, y);
-+   }
-+   else
-    {
--      tempY.SetSize(y.Size());
--      elem_restrict_test->MultTranspose(localTest, tempY);
--      y += tempY;
-+      y.UseDevice(true); // typically this is a large vector, so store on device
-+      if (c != 1.0 && bdr_integrators.Size() > 0)
-+      {
-+         temp_trial.SetSize(y.Size());
-+         temp_trial.UseDevice(true);
-+         temp_trial = 0.0;
-+         for (BilinearFormIntegrator *integ : bdr_integrators)
-+         {
-+            integ->AddMultTransposeMF(bdr_face_restrict_lex_test ? bdr_face_test : x,
-+                                      temp_trial);
-+         }
-+         y.Add(c, temp_trial);
-+      }
-+      else
-+      {
-+         for (BilinearFormIntegrator *integ : integrators)
-+         {
-+            integ->AddMultTransposeMF(bdr_face_restrict_lex_test ? bdr_face_test : x, y);
-+         }
-+      }
-    }
- }
- 
--void PAMixedBilinearFormExtension::MultTranspose(const Vector &x,
--                                                 Vector &y) const
-+void MFMixedBilinearFormExtension::Update()
- {
--   y = 0.0;
--   AddMultTranspose(x, y);
-+   trial_fes = a->TrialFESpace();
-+   test_fes  = a->TestFESpace();
-+   height = test_fes->GetVSize();
-+   width  = trial_fes->GetVSize();
-+
-+   elem_restrict_trial = nullptr;
-+   elem_restrict_test = nullptr;
-+   int_face_restrict_lex_trial = nullptr;
-+   int_face_restrict_lex_test = nullptr;
-+   bdr_face_restrict_lex_trial = nullptr;
-+   bdr_face_restrict_lex_test = nullptr;
- }
- 
--void PAMixedBilinearFormExtension::AddMultTranspose(const Vector &x, Vector &y,
--                                                    const double c) const
-+/// Data and methods for partially-assembled mixed bilinear forms
-+PAMixedBilinearFormExtension::PAMixedBilinearFormExtension(
-+   MixedBilinearForm *form)
-+   : MFMixedBilinearFormExtension(form)
- {
--   Array<BilinearFormIntegrator*> &integrators = *a->GetDBFI();
--   const int iSz = integrators.Size();
-+}
- 
--   // * G operation
--   SetupMultInputs(elem_restrict_test, x, localTest,
--                   elem_restrict_trial, y, localTrial, c);
-+void PAMixedBilinearFormExtension::Assemble()
-+{
-+   SetupRestrictionOperators(L2FaceValues::DoubleValued);
- 
--   // * B^TD^TB operation
--   for (int i = 0; i < iSz; ++i)
-+   Array<BilinearFormIntegrator *> &integrators = *a->GetDBFI();
-+   for (BilinearFormIntegrator *integ : integrators)
-    {
--      integrators[i]->AddMultTransposePA(localTest, localTrial);
-+      integ->AssemblePA(*trial_fes, *test_fes);
-    }
- 
--   // * G^T operation
--   if (elem_restrict_trial)
-+   Array<BilinearFormIntegrator *> &bdr_integrators = *a->GetBBFI();
-+   for (BilinearFormIntegrator *integ : bdr_integrators)
-    {
--      tempY.SetSize(y.Size());
--      elem_restrict_trial->MultTranspose(localTrial, tempY);
--      y += tempY;
-+      integ->AssemblePABoundary(*trial_fes, *test_fes);
-    }
-+
-+   MFEM_VERIFY(a->GetTFBFI()->Size() == 0, "AddInteriorFaceIntegrator is not "
-+               "currently supported in PAMixedBilinearFormExtension");
-+
-+   MFEM_VERIFY(a->GetBTFBFI()->Size() == 0, "AddBdrFaceIntegrator is not "
-+               "currently supported in PAMixedBilinearFormExtension");
- }
- 
- void PAMixedBilinearFormExtension::AssembleDiagonal_ADAt(const Vector &D,
-                                                          Vector &diag) const
- {
--   Array<BilinearFormIntegrator*> &integrators = *a->GetDBFI();
-+   Array<BilinearFormIntegrator *> &integrators = *a->GetDBFI();
-+   if (elem_restrict_trial && integrators.Size() > 0)
-+   {
-+      elem_restrict_trial->MultUnsigned(D, local_trial);
-+   }
-+   if (elem_restrict_test && integrators.Size() > 0)
-+   {
-+      local_test = 0.0;
-+      for (BilinearFormIntegrator *integ : integrators)
-+      {
-+         integ->AssembleDiagonalPA_ADAt(elem_restrict_trial ? local_trial : D,
-+                                        local_test);
-+      }
-+      elem_restrict_test->MultTransposeUnsigned(local_test, diag);
-+   }
-+   else
-+   {
-+      diag.UseDevice(true); // typically this is a large vector, so store on device
-+      diag = 0.0;
-+      for (BilinearFormIntegrator *integ : integrators)
-+      {
-+         integ->AssembleDiagonalPA_ADAt(elem_restrict_trial ? local_trial : D, diag);
-+      }
-+   }
-+
-+   Array<BilinearFormIntegrator *> &bdr_integrators = *a->GetBBFI();
-+   if (bdr_face_restrict_lex_trial && bdr_integrators.Size() > 0)
-+   {
-+      bdr_face_restrict_lex_trial->MultUnsigned(D, bdr_face_trial);
-+   }
-+   if (bdr_face_restrict_lex_test && bdr_integrators.Size() > 0)
-+   {
-+      bdr_face_test = 0.0;
-+      for (BilinearFormIntegrator *integ : bdr_integrators)
-+      {
-+         integ->AssembleDiagonalPA_ADAt(bdr_face_restrict_lex_trial ? bdr_face_trial : D,
-+                                        bdr_face_test);
-+      }
-+      bdr_face_restrict_lex_test->AddMultTransposeUnsigned(bdr_face_test, diag);
-+   }
-+   else
-+   {
-+      for (BilinearFormIntegrator *integ : bdr_integrators)
-+      {
-+         integ->AssembleDiagonalPA_ADAt(bdr_face_restrict_lex_trial ? bdr_face_trial : D,
-+                                        diag);
-+      }
-+   }
-+}
- 
--   const int iSz = integrators.Size();
-+void PAMixedBilinearFormExtension::AddMult(const Vector &x, Vector &y,
-+                                           const double c) const
-+{
-+   Array<BilinearFormIntegrator *> &integrators = *a->GetDBFI();
-+   if (elem_restrict_trial && integrators.Size() > 0)
-+   {
-+      elem_restrict_trial->Mult(x, local_trial);
-+   }
-+   if (elem_restrict_test && integrators.Size() > 0)
-+   {
-+      local_test = 0.0;
-+      for (BilinearFormIntegrator *integ : integrators)
-+      {
-+         integ->AddMultPA(elem_restrict_trial ? local_trial : x, local_test);
-+      }
-+      if (c != 1.0)
-+      {
-+         local_test *= c;
-+      }
-+      elem_restrict_test->AddMultTranspose(local_test, y);
-+   }
-+   else
-+   {
-+      y.UseDevice(true); // typically this is a large vector, so store on device
-+      if (c != 1.0 && integrators.Size() > 0)
-+      {
-+         temp_test.SetSize(y.Size());
-+         temp_test.UseDevice(true);
-+         temp_test = 0.0;
-+         for (BilinearFormIntegrator *integ : integrators)
-+         {
-+            integ->AddMultPA(elem_restrict_trial ? local_trial : x, temp_test);
-+         }
-+         y.Add(c, temp_test);
-+      }
-+      else
-+      {
-+         for (BilinearFormIntegrator *integ : integrators)
-+         {
-+            integ->AddMultPA(elem_restrict_trial ? local_trial : x, y);
-+         }
-+      }
-+   }
- 
--   if (elem_restrict_trial)
-+   Array<BilinearFormIntegrator *> &bdr_integrators = *a->GetBBFI();
-+   if (bdr_face_restrict_lex_trial && bdr_integrators.Size() > 0)
-    {
--      const ElementRestriction* H1elem_restrict_trial =
--         dynamic_cast<const ElementRestriction*>(elem_restrict_trial);
--      if (H1elem_restrict_trial)
-+      bdr_face_restrict_lex_trial->Mult(x, bdr_face_trial);
-+   }
-+   if (bdr_face_restrict_lex_test && bdr_integrators.Size() > 0)
-+   {
-+      bdr_face_test = 0.0;
-+      for (BilinearFormIntegrator *integ : bdr_integrators)
-+      {
-+         integ->AddMultPA(bdr_face_restrict_lex_trial ? bdr_face_trial : x,
-+                          bdr_face_test);
-+      }
-+      if (c != 1.0)
-+      {
-+         bdr_face_test *= c;
-+      }
-+      bdr_face_restrict_lex_test->AddMultTranspose(bdr_face_test, y);
-+   }
-+   else
-+   {
-+      y.UseDevice(true); // typically this is a large vector, so store on device
-+      if (c != 1.0 && bdr_integrators.Size() > 0)
-       {
--         H1elem_restrict_trial->MultUnsigned(D, localTrial);
-+         temp_test.SetSize(y.Size());
-+         temp_test.UseDevice(true);
-+         temp_test = 0.0;
-+         for (BilinearFormIntegrator *integ : bdr_integrators)
-+         {
-+            integ->AddMultPA(bdr_face_restrict_lex_trial ? bdr_face_trial : x,
-+                             temp_test);
-+         }
-+         y.Add(c, temp_test);
-       }
-       else
-       {
--         elem_restrict_trial->Mult(D, localTrial);
-+         for (BilinearFormIntegrator *integ : bdr_integrators)
-+         {
-+            integ->AddMultPA(bdr_face_restrict_lex_trial ? bdr_face_trial : x, y);
-+         }
-       }
-    }
-+}
- 
--   if (elem_restrict_test)
-+void PAMixedBilinearFormExtension::AddMultTranspose(const Vector &x, Vector &y,
-+                                                    const double c) const
-+{
-+   Array<BilinearFormIntegrator *> &integrators = *a->GetDBFI();
-+   if (integrators.Size() > 0)
-    {
--      localTest = 0.0;
--      for (int i = 0; i < iSz; ++i)
-+      if (elem_restrict_test)
-+      {
-+         elem_restrict_test->Mult(x, local_test);
-+      }
-+      if (elem_restrict_trial)
-+      {
-+         local_trial = 0.0;
-+         for (BilinearFormIntegrator *integ : integrators)
-+         {
-+            integ->AddMultTransposePA(elem_restrict_test ? local_test : x,
-+                                      local_trial);
-+         }
-+         if (c != 1.0)
-+         {
-+            local_trial *= c;
-+         }
-+         elem_restrict_trial->AddMultTranspose(local_trial, y);
-+      }
-+      else
-       {
--         if (elem_restrict_trial)
-+         y.UseDevice(true); // typically this is a large vector, so store on device
-+         if (c != 1.0)
-          {
--            integrators[i]->AssembleDiagonalPA_ADAt(localTrial, localTest);
-+            temp_trial.SetSize(y.Size());
-+            temp_trial.UseDevice(true);
-+            temp_trial = 0.0;
-+            for (BilinearFormIntegrator *integ : integrators)
-+            {
-+               integ->AddMultTransposePA(elem_restrict_test ? local_test : x,
-+                                         temp_trial);
-+            }
-+            y.Add(c, temp_trial);
-          }
-          else
-          {
--            integrators[i]->AssembleDiagonalPA_ADAt(D, localTest);
-+            for (BilinearFormIntegrator *integ : integrators)
-+            {
-+               integ->AddMultTransposePA(elem_restrict_test ? local_test : x, y);
-+            }
-          }
-       }
--      const ElementRestriction* H1elem_restrict_test =
--         dynamic_cast<const ElementRestriction*>(elem_restrict_test);
--      if (H1elem_restrict_test)
-+   }
-+
-+   Array<BilinearFormIntegrator *> &bdr_integrators = *a->GetBBFI();
-+   if (bdr_face_restrict_lex_test && bdr_integrators.Size() > 0)
-+   {
-+      bdr_face_restrict_lex_test->Mult(x, bdr_face_test);
-+   }
-+   if (bdr_face_restrict_lex_trial && bdr_integrators.Size() > 0)
-+   {
-+      bdr_face_trial = 0.0;
-+      for (BilinearFormIntegrator *integ : bdr_integrators)
-       {
--         H1elem_restrict_test->MultTransposeUnsigned(localTest, diag);
-+         integ->AddMultTransposePA(bdr_face_restrict_lex_test ? bdr_face_test : x,
-+                                   bdr_face_trial);
-       }
--      else
-+      if (c != 1.0)
-       {
--         elem_restrict_test->MultTranspose(localTest, diag);
-+         bdr_face_trial *= c;
-       }
-+      bdr_face_restrict_lex_trial->AddMultTranspose(bdr_face_trial, y);
-    }
-    else
-    {
--      diag.UseDevice(true); // typically this is a large vector, so store on device
--      diag = 0.0;
--      for (int i = 0; i < iSz; ++i)
-+      y.UseDevice(true); // typically this is a large vector, so store on device
-+      if (c != 1.0 && bdr_integrators.Size() > 0)
-       {
--         if (elem_restrict_trial)
-+         temp_trial.SetSize(y.Size());
-+         temp_trial.UseDevice(true);
-+         temp_trial = 0.0;
-+         for (BilinearFormIntegrator *integ : bdr_integrators)
-          {
--            integrators[i]->AssembleDiagonalPA_ADAt(localTrial, diag);
-+            integ->AddMultTransposePA(bdr_face_restrict_lex_test ? bdr_face_test : x,
-+                                      temp_trial);
-          }
--         else
-+         y.Add(c, temp_trial);
-+      }
-+      else
-+      {
-+         for (BilinearFormIntegrator *integ : integrators)
-          {
--            integrators[i]->AssembleDiagonalPA_ADAt(D, diag);
-+            integ->AddMultTransposePA(bdr_face_restrict_lex_test ? bdr_face_test : x, y);
-          }
-       }
-    }
- }
- 
-+/// Data and methods for partially-assembled discrete linear operators
- PADiscreteLinearOperatorExtension::PADiscreteLinearOperatorExtension(
-    DiscreteLinearOperator *linop) :
-    PAMixedBilinearFormExtension(linop)
- {
- }
- 
--const
--Operator *PADiscreteLinearOperatorExtension::GetOutputRestrictionTranspose()
--const
--{
--   return a->GetOutputRestrictionTranspose();
--}
--
- void PADiscreteLinearOperatorExtension::Assemble()
- {
--   Array<BilinearFormIntegrator*> &integrators = *a->GetDBFI();
--   const int integratorCount = integrators.Size();
--   for (int i = 0; i < integratorCount; ++i)
--   {
--      integrators[i]->AssemblePA(*trial_fes, *test_fes);
--   }
-+   PAMixedBilinearFormExtension::Assemble();
- 
-+   // Construct element vdof multiplicity (avoid use of elem_restrict_test
-+   // because it might not exist for libCEED)
-+   test_multiplicity.SetSize(height);
-    test_multiplicity.UseDevice(true);
--   test_multiplicity.SetSize(elem_restrict_test->Width()); // l-vector
--   Vector ones(elem_restrict_test->Height()); // e-vector
--   ones = 1.0;
--
--   const ElementRestriction* elem_restrict =
--      dynamic_cast<const ElementRestriction*>(elem_restrict_test);
--   if (elem_restrict)
--   {
--      elem_restrict->MultTransposeUnsigned(ones, test_multiplicity);
--   }
--   else
--   {
--      mfem_error("A real ElementRestriction is required in this setting!");
-+   test_multiplicity = 0.0;
-+   Array<int> dofs;
-+   for (int i = 0; i < test_fes->GetNE(); i++)
-+   {
-+      test_fes->GetElementVDofs(i, dofs);
-+      const int ndofs = dofs.Size();
-+      auto d_mult = test_multiplicity.HostReadWrite();
-+      auto d_dofs = dofs.HostRead();
-+      mfem::forall(ndofs, [=] MFEM_HOST_DEVICE (int i)
-+      {
-+         const int j = d_dofs[i];
-+         d_mult[(j >= 0) ? j : -1 - j] += 1.0;
-+      });
-    }
--
--   auto tm = test_multiplicity.ReadWrite();
--   mfem::forall(test_multiplicity.Size(), [=] MFEM_HOST_DEVICE (int i)
--   {
--      tm[i] = 1.0 / tm[i];
--   });
-+   test_multiplicity.Reciprocal();
- }
- 
--void PADiscreteLinearOperatorExtension::AddMult(
--   const Vector &x, Vector &y, const double c) const
-+void PADiscreteLinearOperatorExtension::AddMult(const Vector &x, Vector &y,
-+                                                const double c) const
- {
--   Array<BilinearFormIntegrator*> &integrators = *a->GetDBFI();
--   const int iSz = integrators.Size();
--
--   // * G operation
--   SetupMultInputs(elem_restrict_trial, x, localTrial,
--                   elem_restrict_test, y, localTest, c);
--
--   // * B^TDB operation
--   for (int i = 0; i < iSz; ++i)
-+   Array<BilinearFormIntegrator *> &interpolators = *a->GetDBFI();
-+   temp_test.SetSize(y.Size());
-+   temp_test.UseDevice(true);
-+   if (elem_restrict_trial)
-    {
--      integrators[i]->AddMultPA(localTrial, localTest);
-+      elem_restrict_trial->Mult(x, local_trial);
-    }
--
--   // do a kind of "set" rather than "add" in the below
--   // operation as compared to the BilinearForm case
--   // * G^T operation (kind of...)
--   const ElementRestriction* elem_restrict =
--      dynamic_cast<const ElementRestriction*>(elem_restrict_test);
--   if (elem_restrict)
-+   if (elem_restrict_test)
-    {
--      tempY.SetSize(y.Size());
--      elem_restrict->MultLeftInverse(localTest, tempY);
--      y += tempY;
-+      local_test = 0.0;
-+      for (BilinearFormIntegrator *interp : interpolators)
-+      {
-+         interp->AddMultPA(elem_restrict_trial ? local_trial : x, local_test);
-+      }
-+      elem_restrict_test->MultTranspose(local_test, temp_test);
-    }
-    else
-    {
--      mfem_error("In this setting you need a real ElementRestriction!");
-+      for (BilinearFormIntegrator *interp : interpolators)
-+      {
-+         interp->AddMultPA(elem_restrict_trial ? local_trial : x, temp_test);
-+      }
-    }
-+   temp_test *= test_multiplicity;
-+   y.Add(c, temp_test);
- }
- 
--void PADiscreteLinearOperatorExtension::AddMultTranspose(
--   const Vector &x, Vector &y, const double c) const
-+void PADiscreteLinearOperatorExtension::AddMultTranspose(const Vector &x,
-+                                                         Vector &y,
-+                                                         const double c) const
- {
--   Array<BilinearFormIntegrator*> &integrators = *a->GetDBFI();
--   const int iSz = integrators.Size();
--
--   // do a kind of "set" rather than "add" in the below
--   // operation as compared to the BilinearForm case
--   // * G operation (kinda)
--   Vector xscaled(x);
--   MFEM_VERIFY(x.Size() == test_multiplicity.Size(), "Input vector of wrong size");
--   auto xs = xscaled.ReadWrite();
--   auto tm = test_multiplicity.Read();
--   mfem::forall(x.Size(), [=] MFEM_HOST_DEVICE (int i)
--   {
--      xs[i] *= tm[i];
--   });
--   SetupMultInputs(elem_restrict_test, xscaled, localTest,
--                   elem_restrict_trial, y, localTrial, c);
--
--   // * B^TD^TB operation
--   for (int i = 0; i < iSz; ++i)
-+   Array<BilinearFormIntegrator *> &interpolators = *a->GetDBFI();
-+   temp_test.SetSize(y.Size());
-+   temp_test.UseDevice(true);
-+   temp_test = x;
-+   temp_test *= test_multiplicity;
-+   if (elem_restrict_test)
-    {
--      integrators[i]->AddMultTransposePA(localTest, localTrial);
-+      elem_restrict_test->Mult(temp_test, local_test);
-    }
--
--   // * G^T operation
-    if (elem_restrict_trial)
-    {
--      tempY.SetSize(y.Size());
--      elem_restrict_trial->MultTranspose(localTrial, tempY);
--      y += tempY;
-+      local_trial = 0.0;
-+      for (BilinearFormIntegrator *interp : interpolators)
-+      {
-+         interp->AddMultTransposePA(elem_restrict_test ? local_test : temp_test,
-+                                    local_trial);
-+      }
-+      if (c != 1.0)
-+      {
-+         local_trial *= c;
-+      }
-+      elem_restrict_trial->AddMultTranspose(local_trial, y);
-    }
-    else
-    {
--      mfem_error("Trial ElementRestriction not defined");
-+      y.UseDevice(true); // typically this is a large vector, so store on device
-+      if (c != 1.0)
-+      {
-+         MFEM_ABORT("General coefficient case for PADiscreteLinearOperatorExtension::"
-+                    "AddMultTranspose is not yet supported!");
-+      }
-+      else
-+      {
-+         for (BilinearFormIntegrator *interp : interpolators)
-+         {
-+            interp->AddMultTransposePA(elem_restrict_test ? local_test : temp_test, y);
-+         }
-+      }
-    }
- }
- 
--void PADiscreteLinearOperatorExtension::FormRectangularSystemOperator(
--   const Array<int>& ess1, const Array<int>& ess2, OperatorHandle &A)
--{
--   const Operator *Pi = this->GetProlongation();
--   const Operator *RoT = this->GetOutputRestrictionTranspose();
--   Operator *rap = SetupRAP(Pi, RoT);
--
--   RectangularConstrainedOperator *Arco
--      = new RectangularConstrainedOperator(rap, ess1, ess2, rap != this);
--
--   A.Reset(Arco);
--}
--
- } // namespace mfem
-diff --git a/fem/bilinearform_ext.hpp b/fem/bilinearform_ext.hpp
-index ef54dc71c..db26eb801 100644
---- a/fem/bilinearform_ext.hpp
-+++ b/fem/bilinearform_ext.hpp
-@@ -25,8 +25,8 @@ class DiscreteLinearOperator;
- 
- /// Class extending the BilinearForm class to support different AssemblyLevels.
- /**  FA - Full Assembly
--     PA - Partial Assembly
-      EA - Element Assembly
-+     PA - Partial Assembly
-      MF - Matrix Free
- */
- class BilinearFormExtension : public Operator
-@@ -54,57 +54,59 @@ public:
-       MFEM_ABORT("AssembleDiagonal not implemented for this assembly level!");
-    }
- 
--   virtual void FormSystemMatrix(const Array<int> &ess_tdof_list,
--                                 OperatorHandle &A) = 0;
--   virtual void FormLinearSystem(const Array<int> &ess_tdof_list,
--                                 Vector &x, Vector &b,
--                                 OperatorHandle &A, Vector &X, Vector &B,
--                                 int copy_interior = 0) = 0;
-    virtual void Update() = 0;
- };
- 
--/// Data and methods for partially-assembled bilinear forms
--class PABilinearFormExtension : public BilinearFormExtension
-+/// Data and methods for matrix-free bilinear forms
-+class MFBilinearFormExtension : public BilinearFormExtension
- {
- protected:
--   const FiniteElementSpace *trial_fes, *test_fes; // Not owned
--   mutable Vector localX, localY;
--   mutable Vector int_face_X, int_face_Y;
--   mutable Vector bdr_face_X, bdr_face_Y;
--   const Operator *elem_restrict; // Not owned
-+   const FiniteElementSpace *fes; // Not owned
-+   mutable Vector local_x, local_y, temp_y;
-+   mutable Vector int_face_x, int_face_y;
-+   mutable Vector bdr_face_x, bdr_face_y;
-+   const ElementRestriction *elem_restrict; // Not owned
-    const FaceRestriction *int_face_restrict_lex; // Not owned
-    const FaceRestriction *bdr_face_restrict_lex; // Not owned
- 
- public:
--   PABilinearFormExtension(BilinearForm*);
-+   MFBilinearFormExtension(BilinearForm *form);
- 
-    void Assemble();
-    void AssembleDiagonal(Vector &diag) const;
--   void FormSystemMatrix(const Array<int> &ess_tdof_list, OperatorHandle &A);
--   void FormLinearSystem(const Array<int> &ess_tdof_list,
--                         Vector &x, Vector &b,
--                         OperatorHandle &A, Vector &X, Vector &B,
--                         int copy_interior = 0);
-    void Mult(const Vector &x, Vector &y) const;
-+   void AddMult(const Vector &x, Vector &y, const double c = 1.0) const;
-    void MultTranspose(const Vector &x, Vector &y) const;
-+   void AddMultTranspose(const Vector &x, Vector &y, const double c = 1.0) const;
-    void Update();
- 
- protected:
-    void SetupRestrictionOperators(const L2FaceValues m);
- };
- 
-+/// Data and methods for partially-assembled bilinear forms
-+class PABilinearFormExtension : public MFBilinearFormExtension
-+{
-+public:
-+   PABilinearFormExtension(BilinearForm *form);
-+
-+   void Assemble();
-+   void AssembleDiagonal(Vector &diag) const;
-+   void Mult(const Vector &x, Vector &y) const;
-+   void AddMult(const Vector &x, Vector &y, const double c = 1.0) const;
-+   void MultTranspose(const Vector &x, Vector &y) const;
-+   void AddMultTranspose(const Vector &x, Vector &y, const double c = 1.0) const;
-+};
-+
- /// Data and methods for element-assembled bilinear forms
- class EABilinearFormExtension : public PABilinearFormExtension
- {
- protected:
--   int ne;
--   int elemDofs;
--   // The element matrices are stored row major
--   Vector ea_data;
--   int nf_int, nf_bdr;
--   int faceDofs;
-+   const bool factorize_face_terms;
-+   int ne, elem_dofs;
-+   Vector ea_data;  // The element matrices are stored row major
-+   int nf_int, nf_bdr, face_dofs;
-    Vector ea_data_int, ea_data_ext, ea_data_bdr;
--   bool factorize_face_terms;
- 
- public:
-    EABilinearFormExtension(BilinearForm *form);
-@@ -125,15 +127,6 @@ public:
-    FABilinearFormExtension(BilinearForm *form);
- 
-    void Assemble();
--   void RAP(OperatorHandle &A);
--   /** @note Always does `DIAG_ONE` policy to be consistent with
--       `Operator::FormConstrainedSystemOperator`. */
--   void EliminateBC(const Array<int> &ess_dofs, OperatorHandle &A);
--   void FormSystemMatrix(const Array<int> &ess_tdof_list, OperatorHandle &A);
--   void FormLinearSystem(const Array<int> &ess_tdof_list,
--                         Vector &x, Vector &b,
--                         OperatorHandle &A, Vector &X, Vector &B,
--                         int copy_interior = 0);
-    void Mult(const Vector &x, Vector &y) const;
-    void MultTranspose(const Vector &x, Vector &y) const;
- 
-@@ -143,37 +136,10 @@ public:
-    void DGMultTranspose(const Vector &x, Vector &y) const;
- };
- 
--/// Data and methods for matrix-free bilinear forms
--class MFBilinearFormExtension : public BilinearFormExtension
--{
--protected:
--   const FiniteElementSpace *trial_fes, *test_fes; // Not owned
--   mutable Vector localX, localY;
--   mutable Vector int_face_X, int_face_Y;
--   mutable Vector bdr_face_X, bdr_face_Y;
--   const Operator *elem_restrict; // Not owned
--   const FaceRestriction *int_face_restrict_lex; // Not owned
--   const FaceRestriction *bdr_face_restrict_lex; // Not owned
--
--public:
--   MFBilinearFormExtension(BilinearForm *form);
--
--   void Assemble();
--   void AssembleDiagonal(Vector &diag) const;
--   void FormSystemMatrix(const Array<int> &ess_tdof_list, OperatorHandle &A);
--   void FormLinearSystem(const Array<int> &ess_tdof_list,
--                         Vector &x, Vector &b,
--                         OperatorHandle &A, Vector &X, Vector &B,
--                         int copy_interior = 0);
--   void Mult(const Vector &x, Vector &y) const;
--   void MultTranspose(const Vector &x, Vector &y) const;
--   void Update();
--};
--
- /// Class extending the MixedBilinearForm class to support different AssemblyLevels.
- /**  FA - Full Assembly
--     PA - Partial Assembly
-      EA - Element Assembly
-+     PA - Partial Assembly
-      MF - Matrix Free
- */
- class MixedBilinearFormExtension : public Operator
-@@ -185,7 +151,7 @@ public:
-    MixedBilinearFormExtension(MixedBilinearForm *form);
- 
-    virtual MemoryClass GetMemoryClass() const
--   { return Device::GetMemoryClass(); }
-+   { return Device::GetDeviceMemoryClass(); }
- 
-    /// Get the finite element space prolongation matrix
-    virtual const Operator *GetProlongation() const;
-@@ -199,101 +165,70 @@ public:
-    /// Get the output finite element space restriction matrix
-    virtual const Operator *GetOutputRestriction() const;
- 
-+   /// Assemble at the level given for the BilinearFormExtension subclass
-    virtual void Assemble() = 0;
--   virtual void FormRectangularSystemOperator(const Array<int> &trial_tdof_list,
--                                              const Array<int> &test_tdof_list,
--                                              OperatorHandle &A) = 0;
--   virtual void FormRectangularLinearSystem(const Array<int> &trial_tdof_list,
--                                            const Array<int> &test_tdof_list,
--                                            Vector &x, Vector &b,
--                                            OperatorHandle &A, Vector &X, Vector &B) = 0;
- 
--   virtual void AssembleDiagonal_ADAt(const Vector &D, Vector &diag) const = 0;
-+   virtual void AssembleDiagonal_ADAt(const Vector &D, Vector &diag) const
-+   {
-+      MFEM_ABORT("AssembleDiagonal_ADAt not implemented for this assembly level!");
-+   }
- 
-    virtual void Update() = 0;
- };
- 
--/// Data and methods for partially-assembled mixed bilinear forms
--class PAMixedBilinearFormExtension : public MixedBilinearFormExtension
-+/// Data and methods for matrix-free mixed bilinear forms
-+class MFMixedBilinearFormExtension : public MixedBilinearFormExtension
- {
- protected:
-    const FiniteElementSpace *trial_fes, *test_fes; // Not owned
--   mutable Vector localTrial, localTest, tempY;
--   const Operator *elem_restrict_trial; // Not owned
--   const Operator *elem_restrict_test;  // Not owned
--
--   /// Helper function to set up inputs/outputs for Mult or MultTranspose
--   void SetupMultInputs(const Operator *elem_restrict_x,
--                        const Vector &x, Vector &localX,
--                        const Operator *elem_restrict_y,
--                        Vector &y, Vector &localY, const double c) const;
-+   mutable Vector local_trial, local_test, temp_trial, temp_test;
-+   mutable Vector int_face_trial, int_face_test, int_face_y;
-+   mutable Vector bdr_face_trial, bdr_face_test, bdr_face_y;
-+   const ElementRestriction *elem_restrict_trial; // Not owned
-+   const ElementRestriction *elem_restrict_test;  // Not owned
-+   const FaceRestriction *int_face_restrict_lex_trial; // Not owned
-+   const FaceRestriction *int_face_restrict_lex_test;  // Not owned
-+   const FaceRestriction *bdr_face_restrict_lex_trial; // Not owned
-+   const FaceRestriction *bdr_face_restrict_lex_test;  // Not owned
- 
- public:
--   PAMixedBilinearFormExtension(MixedBilinearForm *form);
-+   MFMixedBilinearFormExtension(MixedBilinearForm *form);
- 
--   /// Partial assembly of all internal integrators
-    void Assemble();
--   /**
--      @brief Setup OperatorHandle A to contain constrained linear operator
--
--      OperatorHandle A contains matrix-free constrained operator formed for RAP
--      system where ess_tdof_list are in trial space and eliminated from
--      "columns" of A.
--   */
--   void FormRectangularSystemOperator(const Array<int> &trial_tdof_list,
--                                      const Array<int> &test_tdof_list,
--                                      OperatorHandle &A);
--   /**
--      Setup OperatorHandle A to contain constrained linear operator and
--      eliminate columns corresponding to essential dofs from system,
--      updating RHS B vector with the results.
--   */
--   void FormRectangularLinearSystem(const Array<int> &trial_tdof_list,
--                                    const Array<int> &test_tdof_list,
--                                    Vector &x, Vector &b,
--                                    OperatorHandle &A, Vector &X, Vector &B);
--   /// y = A*x
-    void Mult(const Vector &x, Vector &y) const;
--   /// y += c*A*x
--   void AddMult(const Vector &x, Vector &y, const double c=1.0) const;
--   /// y = A^T*x
-+   void AddMult(const Vector &x, Vector &y, const double c = 1.0) const;
-    void MultTranspose(const Vector &x, Vector &y) const;
--   /// y += c*A^T*x
--   void AddMultTranspose(const Vector &x, Vector &y, const double c=1.0) const;
--   /// Assemble the diagonal of ADA^T for a diagonal vector D.
--   void AssembleDiagonal_ADAt(const Vector &D, Vector &diag) const;
--
--   /// Update internals for when a new MixedBilinearForm is given to this class
-+   void AddMultTranspose(const Vector &x, Vector &y, const double c = 1.0) const;
-    void Update();
-+
-+protected:
-+   void SetupRestrictionOperators(const L2FaceValues m);
- };
- 
-+/// Data and methods for partially-assembled mixed bilinear forms
-+class PAMixedBilinearFormExtension : public MFMixedBilinearFormExtension
-+{
-+public:
-+   PAMixedBilinearFormExtension(MixedBilinearForm *form);
- 
--/**
--   @brief Partial assembly extension for DiscreteLinearOperator
-+   void Assemble();
-+   void AssembleDiagonal_ADAt(const Vector &D, Vector &diag) const;
-+   void AddMult(const Vector &x, Vector &y, const double c = 1.0) const;
-+   void AddMultTranspose(const Vector &x, Vector &y, const double c = 1.0) const;
-+};
- 
--   This acts very much like PAMixedBilinearFormExtension, but its
--   FormRectangularSystemOperator implementation emulates 'Set' rather than
--   'Add' in the assembly case.
--*/
-+/// Data and methods for partially-assembled discrete linear operators
- class PADiscreteLinearOperatorExtension : public PAMixedBilinearFormExtension
- {
-+private:
-+   Vector test_multiplicity;
-+
- public:
-    PADiscreteLinearOperatorExtension(DiscreteLinearOperator *linop);
- 
--   /// Partial assembly of all internal integrators
-    void Assemble();
--
--   void AddMult(const Vector &x, Vector &y, const double c=1.0) const;
--
--   void AddMultTranspose(const Vector &x, Vector &y, const double c=1.0) const;
--
--   void FormRectangularSystemOperator(const Array<int>&, const Array<int>&,
--                                      OperatorHandle& A);
--
--   const Operator * GetOutputRestrictionTranspose() const;
--
--private:
--   Vector test_multiplicity;
-+   void AddMult(const Vector &x, Vector &y, const double c = 1.0) const;
-+   void AddMultTranspose(const Vector &x, Vector &y, const double c = 1.0) const;
- };
- 
- }
-diff --git a/fem/bilininteg.cpp b/fem/bilininteg.cpp
-index c552e9510..e6fc2a6ee 100644
---- a/fem/bilininteg.cpp
-+++ b/fem/bilininteg.cpp
-@@ -22,126 +22,162 @@ namespace mfem
- 
- void BilinearFormIntegrator::AssemblePA(const FiniteElementSpace&)
- {
--   mfem_error ("BilinearFormIntegrator::AssemblePA(fes)\n"
--               "   is not implemented for this class.");
-+   MFEM_ABORT("BilinearFormIntegrator::AssemblePA(fes)\n"
-+              "   is not implemented for this class.");
- }
- 
- void BilinearFormIntegrator::AssemblePA(const FiniteElementSpace&,
-                                         const FiniteElementSpace&)
- {
--   mfem_error ("BilinearFormIntegrator::AssemblePA(fes, fes)\n"
--               "   is not implemented for this class.");
-+   MFEM_ABORT("BilinearFormIntegrator::AssemblePA(fes, fes)\n"
-+              "   is not implemented for this class.");
-+}
-+
-+void BilinearFormIntegrator::AssemblePABoundary(const FiniteElementSpace&)
-+{
-+   MFEM_ABORT("BilinearFormIntegrator::AssemblePABoundary(fes)\n"
-+              "   is not implemented for this class.");
-+}
-+
-+void BilinearFormIntegrator::AssemblePABoundary(const FiniteElementSpace&,
-+                                                const FiniteElementSpace&)
-+{
-+   MFEM_ABORT("BilinearFormIntegrator::AssemblePABoundary(fes, fes)\n"
-+              "   is not implemented for this class.");
- }
- 
- void BilinearFormIntegrator::AssemblePAInteriorFaces(const FiniteElementSpace&)
- {
--   mfem_error ("BilinearFormIntegrator::AssemblePAInteriorFaces(...)\n"
--               "   is not implemented for this class.");
-+   MFEM_ABORT("BilinearFormIntegrator::AssemblePAInteriorFaces(fes)\n"
-+              "   is not implemented for this class.");
- }
- 
- void BilinearFormIntegrator::AssemblePABoundaryFaces(const FiniteElementSpace&)
- {
--   mfem_error ("BilinearFormIntegrator::AssemblePABoundaryFaces(...)\n"
--               "   is not implemented for this class.");
-+   MFEM_ABORT("BilinearFormIntegrator::AssemblePABoundaryFaces(fes)\n"
-+              "   is not implemented for this class.");
- }
- 
--void BilinearFormIntegrator::AssembleDiagonalPA(Vector &)
-+void BilinearFormIntegrator::AssembleDiagonalPA(Vector&)
- {
--   mfem_error ("BilinearFormIntegrator::AssembleDiagonalPA(...)\n"
--               "   is not implemented for this class.");
-+   MFEM_ABORT("BilinearFormIntegrator::AssembleDiagonalPA(...)\n"
-+              "   is not implemented for this class.");
- }
- 
--void BilinearFormIntegrator::AssembleEA(const FiniteElementSpace &fes,
--                                        Vector &emat,
--                                        const bool add)
-+void BilinearFormIntegrator::AssembleDiagonalPA_ADAt(const Vector&, Vector&)
- {
--   mfem_error ("BilinearFormIntegrator::AssembleEA(...)\n"
--               "   is not implemented for this class.");
-+   MFEM_ABORT("BilinearFormIntegrator::AssembleDiagonalPA_ADAt(...)\n"
-+              "   is not implemented for this class.");
- }
- 
--void BilinearFormIntegrator::AssembleEAInteriorFaces(const FiniteElementSpace
--                                                     &fes,
--                                                     Vector &ea_data_int,
--                                                     Vector &ea_data_ext,
--                                                     const bool add)
-+void BilinearFormIntegrator::AddMultPA(const Vector&, Vector&) const
- {
--   mfem_error ("BilinearFormIntegrator::AssembleEAInteriorFaces(...)\n"
--               "   is not implemented for this class.");
-+   MFEM_ABORT("BilinearFormIntegrator::MultAssembled(...)\n"
-+              "   is not implemented for this class.");
- }
- 
--void BilinearFormIntegrator::AssembleEABoundaryFaces(const FiniteElementSpace
--                                                     &fes,
--                                                     Vector &ea_data_bdr,
--                                                     const bool add)
-+void BilinearFormIntegrator::AddMultTransposePA(const Vector&, Vector&) const
- {
--   mfem_error ("BilinearFormIntegrator::AssembleEABoundaryFaces(...)\n"
--               "   is not implemented for this class.");
-+   MFEM_ABORT("BilinearFormIntegrator::AddMultTransposePA(...)\n"
-+              "   is not implemented for this class.");
- }
- 
--void BilinearFormIntegrator::AssembleDiagonalPA_ADAt(const Vector &, Vector &)
-+void BilinearFormIntegrator::AssembleMF(const FiniteElementSpace&)
- {
--   MFEM_ABORT("BilinearFormIntegrator::AssembleDiagonalPA_ADAt(...)\n"
-+   MFEM_ABORT("BilinearFormIntegrator::AssembleMF(fes)\n"
-               "   is not implemented for this class.");
- }
- 
--void BilinearFormIntegrator::AddMultPA(const Vector &, Vector &) const
-+void BilinearFormIntegrator::AssembleMF(const FiniteElementSpace&,
-+                                        const FiniteElementSpace&)
- {
--   mfem_error ("BilinearFormIntegrator::MultAssembled(...)\n"
--               "   is not implemented for this class.");
-+   MFEM_ABORT("BilinearFormIntegrator::AssembleMF(fes, fes)\n"
-+              "   is not implemented for this class.");
- }
- 
--void BilinearFormIntegrator::AddMultTransposePA(const Vector &, Vector &) const
-+void BilinearFormIntegrator::AssembleMFBoundary(const FiniteElementSpace&)
- {
--   mfem_error ("BilinearFormIntegrator::AddMultTransposePA(...)\n"
--               "   is not implemented for this class.");
-+   MFEM_ABORT("BilinearFormIntegrator::AssembleMFBoundary(fes)\n"
-+              "   is not implemented for this class.");
- }
- 
--void BilinearFormIntegrator::AssembleMF(const FiniteElementSpace &fes)
-+void BilinearFormIntegrator::AssembleMFBoundary(const FiniteElementSpace&,
-+                                                const FiniteElementSpace&)
- {
--   mfem_error ("BilinearFormIntegrator::AssembleMF(...)\n"
--               "   is not implemented for this class.");
-+   MFEM_ABORT("BilinearFormIntegrator::AssembleMFBoundary(fes, fes)\n"
-+              "   is not implemented for this class.");
- }
- 
--void BilinearFormIntegrator::AddMultMF(const Vector &, Vector &) const
-+void BilinearFormIntegrator::AssembleDiagonalMF(Vector&)
- {
--   mfem_error ("BilinearFormIntegrator::AddMultMF(...)\n"
--               "   is not implemented for this class.");
-+   MFEM_ABORT("BilinearFormIntegrator::AssembleDiagonalMF(...)\n"
-+              "   is not implemented for this class.");
-+}
-+
-+void BilinearFormIntegrator::AddMultMF(const Vector&, Vector&) const
-+{
-+   MFEM_ABORT("BilinearFormIntegrator::AddMultMF(...)\n"
-+              "   is not implemented for this class.");
- }
- 
--void BilinearFormIntegrator::AddMultTransposeMF(const Vector &, Vector &) const
-+void BilinearFormIntegrator::AddMultTransposeMF(const Vector&, Vector&) const
- {
--   mfem_error ("BilinearFormIntegrator::AddMultTransposeMF(...)\n"
--               "   is not implemented for this class.");
-+   MFEM_ABORT("BilinearFormIntegrator::AddMultTransposeMF(...)\n"
-+              "   is not implemented for this class.");
- }
- 
--void BilinearFormIntegrator::AssembleDiagonalMF(Vector &)
-+void BilinearFormIntegrator::AssembleEA(const FiniteElementSpace&,
-+                                        Vector&)
- {
--   mfem_error ("BilinearFormIntegrator::AssembleDiagonalMF(...)\n"
--               "   is not implemented for this class.");
-+   MFEM_ABORT("BilinearFormIntegrator::AssembleEA(...)\n"
-+              "   is not implemented for this class.");
-+}
-+
-+void BilinearFormIntegrator::AssembleEA(const FiniteElementSpace&,
-+                                        const FiniteElementSpace&,
-+                                        Vector&)
-+{
-+   MFEM_ABORT("BilinearFormIntegrator::AssembleEA(...)\n"
-+              "   is not implemented for this class.");
- }
- 
--void BilinearFormIntegrator::AssembleElementMatrix (
-+void BilinearFormIntegrator::AssembleEAInteriorFaces(const FiniteElementSpace &,
-+                                                     Vector&,
-+                                                     Vector&)
-+{
-+   MFEM_ABORT("BilinearFormIntegrator::AssembleEAInteriorFaces(...)\n"
-+              "   is not implemented for this class.");
-+}
-+
-+void BilinearFormIntegrator::AssembleEABoundaryFaces(const FiniteElementSpace&,
-+                                                     Vector&)
-+{
-+   MFEM_ABORT("BilinearFormIntegrator::AssembleEABoundaryFaces(...)\n"
-+              "   is not implemented for this class.");
-+}
-+
-+void BilinearFormIntegrator::AssembleElementMatrix(
-    const FiniteElement &el, ElementTransformation &Trans,
--   DenseMatrix &elmat )
-+   DenseMatrix &elmat)
- {
--   mfem_error ("BilinearFormIntegrator::AssembleElementMatrix(...)\n"
--               "   is not implemented for this class.");
-+   MFEM_ABORT("BilinearFormIntegrator::AssembleElementMatrix(...)\n"
-+              "   is not implemented for this class.");
- }
- 
--void BilinearFormIntegrator::AssembleElementMatrix2 (
-+void BilinearFormIntegrator::AssembleElementMatrix2(
-    const FiniteElement &el1, const FiniteElement &el2,
--   ElementTransformation &Trans, DenseMatrix &elmat )
-+   ElementTransformation &Trans, DenseMatrix &elmat)
- {
--   mfem_error ("BilinearFormIntegrator::AssembleElementMatrix2(...)\n"
--               "   is not implemented for this class.");
-+   MFEM_ABORT("BilinearFormIntegrator::AssembleElementMatrix2(...)\n"
-+              "   is not implemented for this class.");
- }
- 
--void BilinearFormIntegrator::AssembleFaceMatrix (
-+void BilinearFormIntegrator::AssembleFaceMatrix(
-    const FiniteElement &el1, const FiniteElement &el2,
-    FaceElementTransformations &Trans, DenseMatrix &elmat)
- {
--   mfem_error ("BilinearFormIntegrator::AssembleFaceMatrix(...)\n"
--               "   is not implemented for this class.");
-+   MFEM_ABORT("BilinearFormIntegrator::AssembleFaceMatrix(...)\n"
-+              "   is not implemented for this class.");
- }
- 
- void BilinearFormIntegrator::AssembleFaceMatrix(
-@@ -191,30 +227,30 @@ void TransposeIntegrator::SetIntRule(const IntegrationRule *ir)
-    bfi->SetIntRule(ir);
- }
- 
--void TransposeIntegrator::AssembleElementMatrix (
-+void TransposeIntegrator::AssembleElementMatrix(
-    const FiniteElement &el, ElementTransformation &Trans, DenseMatrix &elmat)
- {
--   bfi -> AssembleElementMatrix (el, Trans, bfi_elmat);
-+   bfi->AssembleElementMatrix(el, Trans, bfi_elmat);
-    // elmat = bfi_elmat^t
--   elmat.Transpose (bfi_elmat);
-+   elmat.Transpose(bfi_elmat);
- }
- 
--void TransposeIntegrator::AssembleElementMatrix2 (
-+void TransposeIntegrator::AssembleElementMatrix2(
-    const FiniteElement &trial_fe, const FiniteElement &test_fe,
-    ElementTransformation &Trans, DenseMatrix &elmat)
- {
--   bfi -> AssembleElementMatrix2 (test_fe, trial_fe, Trans, bfi_elmat);
-+   bfi->AssembleElementMatrix2(test_fe, trial_fe, Trans, bfi_elmat);
-    // elmat = bfi_elmat^t
--   elmat.Transpose (bfi_elmat);
-+   elmat.Transpose(bfi_elmat);
- }
- 
--void TransposeIntegrator::AssembleFaceMatrix (
-+void TransposeIntegrator::AssembleFaceMatrix(
-    const FiniteElement &el1, const FiniteElement &el2,
-    FaceElementTransformations &Trans, DenseMatrix &elmat)
- {
--   bfi -> AssembleFaceMatrix (el1, el2, Trans, bfi_elmat);
-+   bfi->AssembleFaceMatrix(el1, el2, Trans, bfi_elmat);
-    // elmat = bfi_elmat^t
--   elmat.Transpose (bfi_elmat);
-+   elmat.Transpose(bfi_elmat);
- }
- 
- void LumpedIntegrator::SetIntRule(const IntegrationRule *ir)
-@@ -223,10 +259,10 @@ void LumpedIntegrator::SetIntRule(const IntegrationRule *ir)
-    bfi->SetIntRule(ir);
- }
- 
--void LumpedIntegrator::AssembleElementMatrix (
-+void LumpedIntegrator::AssembleElementMatrix(
-    const FiniteElement &el, ElementTransformation &Trans, DenseMatrix &elmat)
- {
--   bfi -> AssembleElementMatrix (el, Trans, elmat);
-+   bfi->AssembleElementMatrix(el, Trans, elmat);
-    elmat.Lump();
- }
- 
-@@ -316,6 +352,15 @@ void SumIntegrator::AssemblePA(const FiniteElementSpace& fes)
-    }
- }
- 
-+void SumIntegrator::AssemblePA(const FiniteElementSpace& trial_fes,
-+                               const FiniteElementSpace& test_fes)
-+{
-+   for (int i = 0; i < integrators.Size(); i++)
-+   {
-+      integrators[i]->AssemblePA(trial_fes, test_fes);
-+   }
-+}
-+
- void SumIntegrator::AssembleDiagonalPA(Vector &diag)
- {
-    for (int i = 0; i < integrators.Size(); i++)
-@@ -364,57 +409,63 @@ void SumIntegrator::AssembleMF(const FiniteElementSpace &fes)
-    }
- }
- 
--void SumIntegrator::AddMultMF(const Vector& x, Vector& y) const
-+void SumIntegrator::AssembleMF(const FiniteElementSpace& trial_fes,
-+                               const FiniteElementSpace& test_fes)
- {
-    for (int i = 0; i < integrators.Size(); i++)
-    {
--      integrators[i]->AddMultTransposeMF(x, y);
-+      integrators[i]->AssembleMF(trial_fes, test_fes);
-    }
- }
- 
--void SumIntegrator::AddMultTransposeMF(const Vector &x, Vector &y) const
-+void SumIntegrator::AssembleDiagonalMF(Vector &diag)
- {
-    for (int i = 0; i < integrators.Size(); i++)
-    {
--      integrators[i]->AddMultMF(x, y);
-+      integrators[i]->AssembleDiagonalMF(diag);
-    }
- }
- 
--void SumIntegrator::AssembleDiagonalMF(Vector &diag)
-+void SumIntegrator::AddMultMF(const Vector& x, Vector& y) const
- {
-    for (int i = 0; i < integrators.Size(); i++)
-    {
--      integrators[i]->AssembleDiagonalMF(diag);
-+      integrators[i]->AddMultTransposeMF(x, y);
-+   }
-+}
-+
-+void SumIntegrator::AddMultTransposeMF(const Vector &x, Vector &y) const
-+{
-+   for (int i = 0; i < integrators.Size(); i++)
-+   {
-+      integrators[i]->AddMultMF(x, y);
-    }
- }
- 
--void SumIntegrator::AssembleEA(const FiniteElementSpace &fes, Vector &emat,
--                               const bool add)
-+void SumIntegrator::AssembleEA(const FiniteElementSpace &fes, Vector &emat)
- {
-    for (int i = 0; i < integrators.Size(); i++)
-    {
--      integrators[i]->AssembleEA(fes, emat, add);
-+      integrators[i]->AssembleEA(fes, emat);
-    }
- }
- 
- void SumIntegrator::AssembleEAInteriorFaces(const FiniteElementSpace &fes,
-                                             Vector &ea_data_int,
--                                            Vector &ea_data_ext,
--                                            const bool add)
-+                                            Vector &ea_data_ext)
- {
-    for (int i = 0; i < integrators.Size(); i++)
-    {
--      integrators[i]->AssembleEAInteriorFaces(fes,ea_data_int,ea_data_ext,add);
-+      integrators[i]->AssembleEAInteriorFaces(fes, ea_data_int, ea_data_ext);
-    }
- }
- 
- void SumIntegrator::AssembleEABoundaryFaces(const FiniteElementSpace &fes,
--                                            Vector &ea_data_bdr,
--                                            const bool add)
-+                                            Vector &ea_data_bdr)
- {
-    for (int i = 0; i < integrators.Size(); i++)
-    {
--      integrators[i]->AssembleEABoundaryFaces(fes, ea_data_bdr, add);
-+      integrators[i]->AssembleEABoundaryFaces(fes, ea_data_bdr);
-    }
- }
- 
-@@ -642,15 +693,15 @@ void MixedVectorIntegrator::AssembleElementMatrix2(
-       {
-          if (Q)
-          {
--            w *= Q -> Eval (Trans, ip);
-+            w *= Q->Eval(Trans, ip);
-          }
-          if (same_shapes)
-          {
--            AddMult_a_AAt (w, test_shape, elmat);
-+            AddMult_a_AAt(w, test_shape, elmat);
-          }
-          else
-          {
--            AddMult_a_ABt (w, test_shape, trial_shape, elmat);
-+            AddMult_a_ABt(w, test_shape, trial_shape, elmat);
-          }
-       }
-    }
-@@ -724,7 +775,7 @@ void MixedScalarVectorIntegrator::AssembleElementMatrix2(
-       VQ->Eval(V, Trans, ip);
-       V *= w;
- 
--      if ( vdim == 2 && cross_2d )
-+      if (vdim == 2 && cross_2d)
-       {
-          vtmp = V[0];
-          V[0] = -V[1];
-@@ -736,7 +787,6 @@ void MixedScalarVectorIntegrator::AssembleElementMatrix2(
-    }
- }
- 
--
- void GradientIntegrator::AssembleElementMatrix2(
-    const FiniteElement &trial_fe, const FiniteElement &test_fe,
-    ElementTransformation &Trans,  DenseMatrix &elmat)
-@@ -793,19 +843,18 @@ void GradientIntegrator::AssembleElementMatrix2(
-    }
- }
- 
--const IntegrationRule &GradientIntegrator::GetRule(const FiniteElement
--                                                   &trial_fe,
--                                                   const FiniteElement &test_fe,
--                                                   ElementTransformation &Trans)
-+const IntegrationRule &GradientIntegrator::GetRule(
-+   const FiniteElement &trial_fe,
-+   const FiniteElement &test_fe,
-+   ElementTransformation &Trans)
- {
-    int order = Trans.OrderGrad(&trial_fe) + test_fe.GetOrder() + Trans.OrderJ();
-    return IntRules.Get(trial_fe.GetGeomType(), order);
- }
- 
--
--void DiffusionIntegrator::AssembleElementMatrix
--( const FiniteElement &el, ElementTransformation &Trans,
--  DenseMatrix &elmat )
-+void DiffusionIntegrator::AssembleElementMatrix(
-+   const FiniteElement &el, ElementTransformation &Trans,
-+   DenseMatrix &elmat)
- {
-    int nd = el.GetDof();
-    dim = el.GetDim();
-@@ -1044,10 +1093,14 @@ void DiffusionIntegrator::AssembleElementVector(
-    }
- }
- 
--void DiffusionIntegrator::ComputeElementFlux
--( const FiniteElement &el, ElementTransformation &Trans,
--  Vector &u, const FiniteElement &fluxelem, Vector &flux, bool with_coef,
--  const IntegrationRule *ir)
-+void DiffusionIntegrator::ComputeElementFlux(
-+   const FiniteElement &el,
-+   ElementTransformation &Trans,
-+   Vector &u,
-+   const FiniteElement &fluxelem,
-+   Vector &flux,
-+   bool with_coef,
-+   const IntegrationRule *ir)
- {
-    int nd, spaceDim, fnd;
- 
-@@ -1087,7 +1140,7 @@ void DiffusionIntegrator::ComputeElementFlux
-       ir = &fluxelem.GetNodes();
-    }
-    fnd = ir->GetNPoints();
--   flux.SetSize( fnd * spaceDim );
-+   flux.SetSize(fnd * spaceDim);
- 
-    for (int i = 0; i < fnd; i++)
-    {
-@@ -1095,7 +1148,7 @@ void DiffusionIntegrator::ComputeElementFlux
-       el.CalcDShape(ip, dshape);
-       dshape.MultTranspose(u, vec);
- 
--      Trans.SetIntPoint (&ip);
-+      Trans.SetIntPoint(&ip);
-       CalcInverse(Trans.Jacobian(), invdfdx);
-       invdfdx.MultTranspose(vec, vecdxt);
- 
-@@ -1143,9 +1196,11 @@ void DiffusionIntegrator::ComputeElementFlux
-    }
- }
- 
--double DiffusionIntegrator::ComputeFluxEnergy
--( const FiniteElement &fluxelem, ElementTransformation &Trans,
--  Vector &flux, Vector* d_energy)
-+double DiffusionIntegrator::ComputeFluxEnergy(
-+   const FiniteElement &fluxelem,
-+   ElementTransformation &Trans,
-+   Vector &flux,
-+   Vector* d_energy)
- {
-    int nd = fluxelem.GetDof();
-    dim = fluxelem.GetDim();
-@@ -1220,7 +1275,8 @@ double DiffusionIntegrator::ComputeFluxEnergy
- }
- 
- const IntegrationRule &DiffusionIntegrator::GetRule(
--   const FiniteElement &trial_fe, const FiniteElement &test_fe)
-+   const FiniteElement &trial_fe,
-+   const FiniteElement &test_fe)
- {
-    int order;
-    if (trial_fe.Space() == FunctionSpace::Pk)
-@@ -1232,7 +1288,6 @@ const IntegrationRule &DiffusionIntegrator::GetRule(
-       // order = 2*el.GetOrder() - 2;  // <-- this seems to work fine too
-       order = trial_fe.GetOrder() + test_fe.GetOrder() + trial_fe.GetDim() - 1;
-    }
--
-    if (trial_fe.Space() == FunctionSpace::rQk)
-    {
-       return RefinedIntRules.Get(trial_fe.GetGeomType(), order);
-@@ -1240,10 +1295,10 @@ const IntegrationRule &DiffusionIntegrator::GetRule(
-    return IntRules.Get(trial_fe.GetGeomType(), order);
- }
- 
--
--void MassIntegrator::AssembleElementMatrix
--( const FiniteElement &el, ElementTransformation &Trans,
--  DenseMatrix &elmat )
-+void MassIntegrator::AssembleElementMatrix(
-+   const FiniteElement &el,
-+   ElementTransformation &Trans,
-+   DenseMatrix &elmat)
- {
-    int nd = el.GetDof();
-    // int dim = el.GetDim();
-@@ -1252,8 +1307,8 @@ void MassIntegrator::AssembleElementMatrix
- #ifdef MFEM_THREAD_SAFE
-    Vector shape;
- #endif
--   elmat.SetSize(nd);
-    shape.SetSize(nd);
-+   elmat.SetSize(nd);
- 
-    const IntegrationRule *ir = IntRule ? IntRule : &GetRule(el, el, Trans);
- 
-@@ -1261,14 +1316,14 @@ void MassIntegrator::AssembleElementMatrix
-    for (int i = 0; i < ir->GetNPoints(); i++)
-    {
-       const IntegrationPoint &ip = ir->IntPoint(i);
--      Trans.SetIntPoint (&ip);
-+      Trans.SetIntPoint(&ip);
- 
-       el.CalcPhysShape(Trans, shape);
- 
-       w = Trans.Weight() * ip.weight;
-       if (Q)
-       {
--         w *= Q -> Eval(Trans, ip);
-+         w *= Q->Eval(Trans, ip);
-       }
- 
-       AddMult_a_VVt(w, shape, elmat);
-@@ -1300,11 +1355,11 @@ void MassIntegrator::AssembleElementMatrix2(
-       trial_fe.CalcShape(ip, shape);
-       test_fe.CalcShape(ip, te_shape);
- 
--      Trans.SetIntPoint (&ip);
-+      Trans.SetIntPoint(&ip);
-       w = Trans.Weight() * ip.weight;
-       if (Q)
-       {
--         w *= Q -> Eval(Trans, ip);
-+         w *= Q->Eval(Trans, ip);
-       }
- 
-       te_shape *= w;
-@@ -1326,7 +1381,6 @@ const IntegrationRule &MassIntegrator::GetRule(const FiniteElement &trial_fe,
-    return IntRules.Get(trial_fe.GetGeomType(), order);
- }
- 
--
- void BoundaryMassIntegrator::AssembleFaceMatrix(
-    const FiniteElement &el1, const FiniteElement &el2,
-    FaceElementTransformations &Trans, DenseMatrix &elmat)
-@@ -1366,7 +1420,7 @@ void BoundaryMassIntegrator::AssembleFaceMatrix(
-       w = Trans.Weight() * ip.weight;
-       if (Q)
-       {
--         w *= Q -> Eval(Trans, ip);
-+         w *= Q->Eval(Trans, ip);
-       }
- 
-       AddMult_a_VVt(w, shape, elmat);
-@@ -1420,9 +1474,18 @@ void ConvectionIntegrator::AssembleElementMatrix(
-    }
- }
- 
-+const IntegrationRule &ConvectionIntegrator::GetRule(
-+   const FiniteElement &fe,
-+   ElementTransformation &Trans)
-+{
-+   int order = Trans.OrderGrad(&fe) + Trans.Order() + fe.GetOrder();
-+   return IntRules.Get(fe.GetGeomType(), order);
-+}
- 
- void GroupConvectionIntegrator::AssembleElementMatrix(
--   const FiniteElement &el, ElementTransformation &Trans, DenseMatrix &elmat)
-+   const FiniteElement &el,
-+   ElementTransformation &Trans,
-+   DenseMatrix &elmat)
- {
-    int nd = el.GetDof();
-    int dim = el.GetDim();
-@@ -1473,24 +1536,10 @@ void GroupConvectionIntegrator::AssembleElementMatrix(
-    }
- }
- 
--const IntegrationRule &ConvectionIntegrator::GetRule(
--   const FiniteElement &trial_fe, const FiniteElement &test_fe,
--   ElementTransformation &Trans)
--{
--   int order = Trans.OrderGrad(&trial_fe) + Trans.Order() + test_fe.GetOrder();
--
--   return IntRules.Get(trial_fe.GetGeomType(), order);
--}
--
--const IntegrationRule &ConvectionIntegrator::GetRule(
--   const FiniteElement &el, ElementTransformation &Trans)
--{
--   return GetRule(el,el,Trans);
--}
--
--void VectorMassIntegrator::AssembleElementMatrix
--( const FiniteElement &el, ElementTransformation &Trans,
--  DenseMatrix &elmat )
-+void VectorMassIntegrator::AssembleElementMatrix(
-+   const FiniteElement &el,
-+   ElementTransformation &Trans,
-+   DenseMatrix &elmat)
- {
-    int nd = el.GetDof();
-    int spaceDim = Trans.GetSpaceDim();
-@@ -1533,7 +1582,7 @@ void VectorMassIntegrator::AssembleElementMatrix
-       const IntegrationPoint &ip = ir->IntPoint(s);
-       el.CalcShape(ip, shape);
- 
--      Trans.SetIntPoint (&ip);
-+      Trans.SetIntPoint(&ip);
-       norm = ip.weight * Trans.Weight();
- 
-       MultVVt(shape, partelmat);
-@@ -1790,7 +1839,7 @@ void VectorFECurlIntegrator::AssembleElementMatrix2(
-                "At least one of the finite elements must be in H(Curl)");
- 
-    int curl_nd, vec_nd;
--   if ( trial_fe.GetMapType() == mfem::FiniteElement::H_CURL )
-+   if (trial_fe.GetMapType() == mfem::FiniteElement::H_CURL)
-    {
-       curl_nd = trial_nd;
-       vec_nd  = test_nd;
-@@ -1829,7 +1878,7 @@ void VectorFECurlIntegrator::AssembleElementMatrix2(
-       Trans.SetIntPoint(&ip);
-       if (dim == 3)
-       {
--         if ( trial_fe.GetMapType() == mfem::FiniteElement::H_CURL )
-+         if (trial_fe.GetMapType() == mfem::FiniteElement::H_CURL)
-          {
-             trial_fe.CalcCurlShape(ip, curlshapeTrial);
-             test_fe.CalcVShape(Trans, vshapeTest);
-@@ -1843,7 +1892,7 @@ void VectorFECurlIntegrator::AssembleElementMatrix2(
-       }
-       else
-       {
--         if ( trial_fe.GetMapType() == mfem::FiniteElement::H_CURL )
-+         if (trial_fe.GetMapType() == mfem::FiniteElement::H_CURL)
-          {
-             trial_fe.CalcCurlShape(ip, curlshapeTrial_dFT);
-             test_fe.CalcShape(ip, shapeTest);
-@@ -1863,7 +1912,7 @@ void VectorFECurlIntegrator::AssembleElementMatrix2(
-       }
-       // Note: shapeTest points to the same data as vshapeTest
-       vshapeTest *= w;
--      if ( trial_fe.GetMapType() == mfem::FiniteElement::H_CURL )
-+      if (trial_fe.GetMapType() == mfem::FiniteElement::H_CURL)
-       {
-          AddMultABt(vshapeTest, curlshapeTrial_dFT, elmat);
-       }
-@@ -1874,7 +1923,7 @@ void VectorFECurlIntegrator::AssembleElementMatrix2(
-    }
- }
- 
--void DerivativeIntegrator::AssembleElementMatrix2 (
-+void DerivativeIntegrator::AssembleElementMatrix2(
-    const FiniteElement &trial_fe,
-    const FiniteElement &test_fe,
-    ElementTransformation &Trans,
-@@ -1888,12 +1937,12 @@ void DerivativeIntegrator::AssembleElementMatrix2 (
-    int i, l;
-    double det;
- 
--   elmat.SetSize (test_nd,trial_nd);
--   dshape.SetSize (trial_nd,dim);
-+   elmat.SetSize(test_nd,trial_nd);
-+   dshape.SetSize(trial_nd,dim);
-    dshapedxt.SetSize(trial_nd, spaceDim);
-    dshapedxi.SetSize(trial_nd);
-    invdfdx.SetSize(dim, spaceDim);
--   shape.SetSize (test_nd);
-+   shape.SetSize(test_nd);
- 
-    const IntegrationRule *ir = IntRule;
-    if (ir == NULL)
-@@ -1925,10 +1974,10 @@ void DerivativeIntegrator::AssembleElementMatrix2 (
- 
-       trial_fe.CalcDShape(ip, dshape);
- 
--      Trans.SetIntPoint (&ip);
--      CalcInverse (Trans.Jacobian(), invdfdx);
-+      Trans.SetIntPoint(&ip);
-+      CalcInverse(Trans.Jacobian(), invdfdx);
-       det = Trans.Weight();
--      Mult (dshape, invdfdx, dshapedxt);
-+      Mult(dshape, invdfdx, dshapedxt);
- 
-       test_fe.CalcShape(ip, shape);
- 
-@@ -1938,13 +1987,14 @@ void DerivativeIntegrator::AssembleElementMatrix2 (
-       }
- 
-       shape *= Q->Eval(Trans,ip) * det * ip.weight;
--      AddMultVWt (shape, dshapedxi, elmat);
-+      AddMultVWt(shape, dshapedxi, elmat);
-    }
- }
- 
--void CurlCurlIntegrator::AssembleElementMatrix
--( const FiniteElement &el, ElementTransformation &Trans,
--  DenseMatrix &elmat )
-+void CurlCurlIntegrator::AssembleElementMatrix(
-+   const FiniteElement &el,
-+   ElementTransformation &Trans,
-+   DenseMatrix &elmat)
- {
-    int nd = el.GetDof();
-    dim = el.GetDim();
-@@ -1959,6 +2009,7 @@ void CurlCurlIntegrator::AssembleElementMatrix
-    curlshape_dFt.SetSize(nd,dimc);
- #endif
-    elmat.SetSize(nd);
-+
-    if (MQ) { M.SetSize(dimc); }
-    if (DQ) { D.SetSize(dimc); }
- 
-@@ -1983,7 +2034,7 @@ void CurlCurlIntegrator::AssembleElementMatrix
-    {
-       const IntegrationPoint &ip = ir->IntPoint(i);
- 
--      Trans.SetIntPoint (&ip);
-+      Trans.SetIntPoint(&ip);
- 
-       w = ip.weight * Trans.Weight();
-       el.CalcPhysCurlShape(Trans, curlshape_dFt);
-@@ -2090,10 +2141,10 @@ void CurlCurlIntegrator::AssembleElementMatrix2(const FiniteElement &trial_fe,
-    }
- }
- 
--void CurlCurlIntegrator
--::ComputeElementFlux(const FiniteElement &el, ElementTransformation &Trans,
--                     Vector &u, const FiniteElement &fluxelem, Vector &flux,
--                     bool with_coef, const IntegrationRule *ir)
-+void CurlCurlIntegrator::ComputeElementFlux(
-+   const FiniteElement &el, ElementTransformation &Trans,
-+   Vector &u, const FiniteElement &fluxelem, Vector &flux,
-+   bool with_coef, const IntegrationRule *ir)
- {
- #ifdef MFEM_THREAD_SAFE
-    DenseMatrix projcurl;
-@@ -2215,7 +2266,9 @@ double CurlCurlIntegrator::ComputeFluxEnergy(const FiniteElement &fluxelem,
- }
- 
- void VectorCurlCurlIntegrator::AssembleElementMatrix(
--   const FiniteElement &el, ElementTransformation &Trans, DenseMatrix &elmat)
-+   const FiniteElement &el,
-+   ElementTransformation &Trans,
-+   DenseMatrix &elmat)
- {
-    int dim = el.GetDim();
-    int dof = el.GetDof();
-@@ -2405,7 +2458,6 @@ void MixedCurlIntegrator::AssembleElementMatrix2(
-    }
- }
- 
--
- void VectorFEMassIntegrator::AssembleElementMatrix(
-    const FiniteElement &el,
-    ElementTransformation &Trans,
-@@ -2443,7 +2495,7 @@ void VectorFEMassIntegrator::AssembleElementMatrix(
-    {
-       const IntegrationPoint &ip = ir->IntPoint(i);
- 
--      Trans.SetIntPoint (&ip);
-+      Trans.SetIntPoint(&ip);
- 
-       el.CalcVShape(Trans, trial_vshape);
- 
-@@ -2465,9 +2517,9 @@ void VectorFEMassIntegrator::AssembleElementMatrix(
-       {
-          if (Q)
-          {
--            w *= Q -> Eval (Trans, ip);
-+            w *= Q->Eval (Trans, ip);
-          }
--         AddMult_a_AAt (w, trial_vshape, elmat);
-+         AddMult_a_AAt(w, trial_vshape, elmat);
-       }
-    }
- }
-@@ -2512,7 +2564,7 @@ void VectorFEMassIntegrator::AssembleElementMatrix2(
-       {
-          const IntegrationPoint &ip = ir->IntPoint(i);
- 
--         Trans.SetIntPoint (&ip);
-+         Trans.SetIntPoint(&ip);
- 
-          trial_fe.CalcVShape(Trans, trial_vshape);
-          test_fe.CalcShape(ip, shape);
-@@ -2598,7 +2650,7 @@ void VectorFEMassIntegrator::AssembleElementMatrix2(
- #endif
-       DenseMatrix tmp(test_vshape.Height(), K.Width());
- 
--      elmat.SetSize (test_dof, trial_dof);
-+      elmat.SetSize(test_dof, trial_dof);
- 
-       const IntegrationRule *ir = IntRule;
-       if (ir == NULL)
-@@ -2612,7 +2664,7 @@ void VectorFEMassIntegrator::AssembleElementMatrix2(
-       {
-          const IntegrationPoint &ip = ir->IntPoint(i);
- 
--         Trans.SetIntPoint (&ip);
-+         Trans.SetIntPoint(&ip);
- 
-          trial_fe.CalcVShape(Trans, trial_vshape);
-          test_fe.CalcVShape(Trans, test_vshape);
-@@ -2635,7 +2687,7 @@ void VectorFEMassIntegrator::AssembleElementMatrix2(
-          {
-             if (Q)
-             {
--               w *= Q -> Eval (Trans, ip);
-+               w *= Q->Eval (Trans, ip);
-             }
-             AddMult_a_ABt(w,test_vshape,trial_vshape,elmat);
-          }
-@@ -2643,7 +2695,7 @@ void VectorFEMassIntegrator::AssembleElementMatrix2(
-    }
-    else
-    {
--      mfem_error("VectorFEMassIntegrator::AssembleElementMatrix2(...)\n"
-+      MFEM_ABORT("VectorFEMassIntegrator::AssembleElementMatrix2(...)\n"
-                  "   is not implemented for given trial and test bases.");
-    }
- }
-@@ -2659,42 +2711,42 @@ void VectorDivergenceIntegrator::AssembleElementMatrix2(
-    int test_dof = test_fe.GetDof();
-    double c;
- 
--   dshape.SetSize (trial_dof, dim);
--   gshape.SetSize (trial_dof, dim);
--   Jadj.SetSize (dim);
--   divshape.SetSize (dim*trial_dof);
--   shape.SetSize (test_dof);
-+   dshape.SetSize(trial_dof, dim);
-+   gshape.SetSize(trial_dof, dim);
-+   Jadj.SetSize(dim);
-+   divshape.SetSize(dim*trial_dof);
-+   shape.SetSize(test_dof);
- 
--   elmat.SetSize (test_dof, dim*trial_dof);
-+   elmat.SetSize(test_dof, dim*trial_dof);
- 
-    const IntegrationRule *ir = IntRule ? IntRule : &GetRule(trial_fe, test_fe,
-                                                             Trans);
- 
-    elmat = 0.0;
- 
--   for (int i = 0; i < ir -> GetNPoints(); i++)
-+   for (int i = 0; i < ir->GetNPoints(); i++)
-    {
-       const IntegrationPoint &ip = ir->IntPoint(i);
- 
--      trial_fe.CalcDShape (ip, dshape);
--      test_fe.CalcShape (ip, shape);
-+      trial_fe.CalcDShape(ip, dshape);
-+      test_fe.CalcShape(ip, shape);
- 
--      Trans.SetIntPoint (&ip);
-+      Trans.SetIntPoint(&ip);
-       CalcAdjugate(Trans.Jacobian(), Jadj);
- 
--      Mult (dshape, Jadj, gshape);
-+      Mult(dshape, Jadj, gshape);
- 
-       gshape.GradToDiv (divshape);
- 
-       c = ip.weight;
-       if (Q)
-       {
--         c *= Q -> Eval (Trans, ip);
-+         c *= Q->Eval (Trans, ip);
-       }
- 
-       // elmat += c * shape * divshape ^ t
-       shape *= c;
--      AddMultVWt (shape, divshape, elmat);
-+      AddMultVWt(shape, divshape, elmat);
-    }
- }
- 
-@@ -2707,7 +2759,6 @@ const IntegrationRule &VectorDivergenceIntegrator::GetRule(
-    return IntRules.Get(trial_fe.GetGeomType(), order);
- }
- 
--
- void DivDivIntegrator::AssembleElementMatrix(
-    const FiniteElement &el,
-    ElementTransformation &Trans,
-@@ -2731,23 +2782,22 @@ void DivDivIntegrator::AssembleElementMatrix(
-    }
- 
-    elmat = 0.0;
--
--   for (int i = 0; i < ir -> GetNPoints(); i++)
-+   for (int i = 0; i < ir->GetNPoints(); i++)
-    {
-       const IntegrationPoint &ip = ir->IntPoint(i);
- 
--      el.CalcDivShape (ip, divshape);
-+      el.CalcDivShape(ip, divshape);
- 
--      Trans.SetIntPoint (&ip);
-+      Trans.SetIntPoint(&ip);
-       c = ip.weight / Trans.Weight();
- 
-       if (Q)
-       {
--         c *= Q -> Eval (Trans, ip);
-+         c *= Q->Eval (Trans, ip);
-       }
- 
-       // elmat += c * divshape * divshape ^ t
--      AddMult_a_VVt (c, divshape, elmat);
-+      AddMult_a_VVt(c, divshape, elmat);
-    }
- }
- 
-@@ -2779,20 +2829,19 @@ void DivDivIntegrator::AssembleElementMatrix2(
-    }
- 
-    elmat = 0.0;
--
--   for (int i = 0; i < ir -> GetNPoints(); i++)
-+   for (int i = 0; i < ir->GetNPoints(); i++)
-    {
-       const IntegrationPoint &ip = ir->IntPoint(i);
- 
-       trial_fe.CalcDivShape(ip,divshape);
-       test_fe.CalcDivShape(ip,te_divshape);
- 
--      Trans.SetIntPoint (&ip);
-+      Trans.SetIntPoint(&ip);
-       c = ip.weight / Trans.Weight();
- 
-       if (Q)
-       {
--         c *= Q -> Eval (Trans, ip);
-+         c *= Q->Eval (Trans, ip);
-       }
- 
-       te_divshape *= c;
-@@ -2836,7 +2885,7 @@ void VectorDiffusionIntegrator::AssembleElementMatrix(
- 
-    elmat = 0.0;
- 
--   for (int i = 0; i < ir -> GetNPoints(); i++)
-+   for (int i = 0; i < ir->GetNPoints(); i++)
-    {
- 
-       const IntegrationPoint &ip = ir->IntPoint(i);
-@@ -2972,7 +3021,6 @@ void VectorDiffusionIntegrator::AssembleElementVector(
-    }
- }
- 
--
- void ElasticityIntegrator::AssembleElementMatrix(
-    const FiniteElement &el, ElementTransformation &Trans, DenseMatrix &elmat)
- {
-@@ -3003,7 +3051,7 @@ void ElasticityIntegrator::AssembleElementMatrix(
- 
-    elmat = 0.0;
- 
--   for (int i = 0; i < ir -> GetNPoints(); i++)
-+   for (int i = 0; i < ir->GetNPoints(); i++)
-    {
-       const IntegrationPoint &ip = ir->IntPoint(i);
- 
-@@ -3038,7 +3086,7 @@ void ElasticityIntegrator::AssembleElementMatrix(
-             for (int k = 0; k < dof; k++)
-                for (int l = 0; l < dof; l++)
-                {
--                  elmat (dof*d+k, dof*d+l) += (M * w) * pelmat(k, l);
-+                  elmat(dof*d+k, dof*d+l) += (M * w) * pelmat(k, l);
-                }
-          }
-          for (int ii = 0; ii < dim; ii++)
-@@ -3360,7 +3408,6 @@ void DGTraceIntegrator::AssembleFaceMatrix(const FiniteElement &el1,
-    }
- }
- 
--
- const IntegrationRule &DGTraceIntegrator::GetRule(
-    Geometry::Type geom, int order, FaceElementTransformations &T)
- {
-@@ -3603,8 +3650,6 @@ void DGDiffusionIntegrator::AssembleFaceMatrix(
-    }
- }
- 
--
--// static method
- void DGElasticityIntegrator::AssembleBlock(
-    const int dim, const int row_ndofs, const int col_ndofs,
-    const int row_offset, const int col_offset,
-@@ -3827,7 +3872,6 @@ void DGElasticityIntegrator::AssembleFaceMatrix(
-    }
- }
- 
--
- void TraceJumpIntegrator::AssembleFaceMatrix(
-    const FiniteElement &trial_face_fe, const FiniteElement &test_fe1,
-    const FiniteElement &test_fe2, FaceElementTransformations &Trans,
-@@ -4243,7 +4287,6 @@ void NormalInterpolator::AssembleElementMatrix2(
-    }
- }
- 
--
- namespace internal
- {
- 
-@@ -4284,7 +4327,6 @@ ScalarProductInterpolator::AssembleElementMatrix2(const FiniteElement &dom_fe,
-    ran_fe.Project(dom_shape_coeff, Trans, elmat_as_vec);
- }
- 
--
- void
- ScalarVectorProductInterpolator::AssembleElementMatrix2(
-    const FiniteElement &dom_fe,
-@@ -4319,7 +4361,6 @@ ScalarVectorProductInterpolator::AssembleElementMatrix2(
-    ran_fe.ProjectMatrixCoefficient(dom_shape_coeff, Trans, elmat_as_vec);
- }
- 
--
- void
- VectorScalarProductInterpolator::AssembleElementMatrix2(
-    const FiniteElement &dom_fe,
-@@ -4357,7 +4398,6 @@ VectorScalarProductInterpolator::AssembleElementMatrix2(
-    ran_fe.ProjectMatrixCoefficient(dom_shape_coeff, Trans, elmat_as_vec);
- }
- 
--
- void
- ScalarCrossProductInterpolator::AssembleElementMatrix2(
-    const FiniteElement &dom_fe,
-@@ -4453,7 +4493,6 @@ VectorCrossProductInterpolator::AssembleElementMatrix2(
-    ran_fe.ProjectMatrixCoefficient(dom_shape_coeff, Trans, elmat_as_vec);
- }
- 
--
- namespace internal
- {
- 
-@@ -4483,8 +4522,7 @@ struct VDotVShapeCoefficient : public VectorCoefficient
- 
- }
- 
--void
--VectorInnerProductInterpolator::AssembleElementMatrix2(
-+void VectorInnerProductInterpolator::AssembleElementMatrix2(
-    const FiniteElement &dom_fe,
-    const FiniteElement &ran_fe,
-    ElementTransformation &Trans,
-diff --git a/fem/bilininteg.hpp b/fem/bilininteg.hpp
-index 11922cff0..209898714 100644
---- a/fem/bilininteg.hpp
-+++ b/fem/bilininteg.hpp
-@@ -27,7 +27,6 @@ constexpr int HCURL_MAX_Q1D = 5;
- #else
- constexpr int HCURL_MAX_Q1D = 6;
- #endif
--
- constexpr int HDIV_MAX_D1D = 5;
- constexpr int HDIV_MAX_Q1D = 6;
- 
-@@ -36,7 +35,7 @@ class BilinearFormIntegrator : public NonlinearFormIntegrator
- {
- protected:
-    BilinearFormIntegrator(const IntegrationRule *ir = NULL)
--      : NonlinearFormIntegrator(ir) { }
-+      : NonlinearFormIntegrator(ir) {}
- 
- public:
-    // TODO: add support for other assembly levels (in addition to PA) and their
-@@ -51,8 +50,6 @@ public:
-    // make sense for the action of the nonlinear operator (but they all make
-    // sense for its Jacobian).
- 
--   using NonlinearFormIntegrator::AssemblePA;
--
-    /// Method defining partial assembly.
-    /** The result of the partial assembly is stored internally so that it can be
-        used later in the methods AddMultPA() and AddMultTransposePA(). */
-@@ -61,6 +58,11 @@ public:
-    virtual void AssemblePA(const FiniteElementSpace &trial_fes,
-                            const FiniteElementSpace &test_fes);
- 
-+   virtual void AssemblePABoundary(const FiniteElementSpace &fes);
-+   /** Used with BilinearFormIntegrators that have different spaces. */
-+   virtual void AssemblePABoundary(const FiniteElementSpace &trial_fes,
-+                                   const FiniteElementSpace &test_fes);
-+
-    virtual void AssemblePAInteriorFaces(const FiniteElementSpace &fes);
- 
-    virtual void AssemblePABoundaryFaces(const FiniteElementSpace &fes);
-@@ -89,20 +91,21 @@ public:
-        called. */
-    virtual void AddMultTransposePA(const Vector &x, Vector &y) const;
- 
--   /// Method defining element assembly.
--   /** The result of the element assembly is added to the @a emat Vector if
--       @a add is true. Otherwise, if @a add is false, we set @a emat. */
--   virtual void AssembleEA(const FiniteElementSpace &fes, Vector &emat,
--                           const bool add = true);
--   /** Used with BilinearFormIntegrators that have different spaces. */
--   // virtual void AssembleEA(const FiniteElementSpace &trial_fes,
--   //                         const FiniteElementSpace &test_fes,
--   //                         Vector &emat);
--
-    /// Method defining matrix-free assembly.
-    /** The result of fully matrix-free assembly is stored internally so that it
-        can be used later in the methods AddMultMF() and AddMultTransposeMF(). */
-    virtual void AssembleMF(const FiniteElementSpace &fes);
-+   /** Used with BilinearFormIntegrators that have different spaces. */
-+   virtual void AssembleMF(const FiniteElementSpace &trial_fes,
-+                           const FiniteElementSpace &test_fes);
-+
-+   virtual void AssembleMFBoundary(const FiniteElementSpace &fes);
-+   /** Used with BilinearFormIntegrators that have different spaces. */
-+   virtual void AssembleMFBoundary(const FiniteElementSpace &trial_fes,
-+                                   const FiniteElementSpace &test_fes);
-+
-+   /// Assemble diagonal and add it to Vector @a diag.
-+   virtual void AssembleDiagonalMF(Vector &diag);
- 
-    /** Perform the action of integrator on the input @a x and add the result to
-        the output @a y. Both @a x and @a y are E-vectors, i.e. they represent
-@@ -120,17 +123,20 @@ public:
-        called. */
-    virtual void AddMultTransposeMF(const Vector &x, Vector &y) const;
- 
--   /// Assemble diagonal and add it to Vector @a diag.
--   virtual void AssembleDiagonalMF(Vector &diag);
-+   /// Method defining element assembly.
-+   /** The result of the element assembly is added to the @a emat Vector. */
-+   virtual void AssembleEA(const FiniteElementSpace &fes, Vector &emat);
-+   /** Used with BilinearFormIntegrators that have different spaces. */
-+   virtual void AssembleEA(const FiniteElementSpace &trial_fes,
-+                           const FiniteElementSpace &test_fes,
-+                           Vector &emat);
- 
-    virtual void AssembleEAInteriorFaces(const FiniteElementSpace &fes,
-                                         Vector &ea_data_int,
--                                        Vector &ea_data_ext,
--                                        const bool add = true);
-+                                        Vector &ea_data_ext);
- 
-    virtual void AssembleEABoundaryFaces(const FiniteElementSpace &fes,
--                                        Vector &ea_data_bdr,
--                                        const bool add = true);
-+                                        Vector &ea_data_bdr);
- 
-    /// Given a particular Finite Element computes the element matrix elmat.
-    virtual void AssembleElementMatrix(const FiniteElement &el,
-@@ -234,7 +240,7 @@ public:
-                                    Vector &u,
-                                    const FiniteElement &fluxelem,
-                                    Vector &flux, bool with_coef = true,
--                                   const IntegrationRule *ir = NULL) { }
-+                                   const IntegrationRule *ir = NULL) {}
- 
-    /** @brief Virtual method required for Zienkiewicz-Zhu type error estimators.
- 
-@@ -260,7 +266,7 @@ public:
-                                     Vector &flux, Vector *d_energy = NULL)
-    { return 0.0; }
- 
--   virtual ~BilinearFormIntegrator() { }
-+   virtual ~BilinearFormIntegrator() {}
- };
- 
- /** Wraps a given @a BilinearFormIntegrator and transposes the resulting element
-@@ -268,13 +274,12 @@ public:
- class TransposeIntegrator : public BilinearFormIntegrator
- {
- private:
--   int own_bfi;
-+   bool own_bfi;
-    BilinearFormIntegrator *bfi;
--
-    DenseMatrix bfi_elmat;
- 
- public:
--   TransposeIntegrator (BilinearFormIntegrator *bfi_, int own_bfi_ = 1)
-+   TransposeIntegrator(BilinearFormIntegrator *bfi_, bool own_bfi_ = true)
-    { bfi = bfi_; own_bfi = own_bfi_; }
- 
-    virtual void SetIntRule(const IntegrationRule *ir);
-@@ -294,12 +299,25 @@ public:
-                                    FaceElementTransformations &Trans,
-                                    DenseMatrix &elmat);
- 
--   using BilinearFormIntegrator::AssemblePA;
--
--   virtual void AssemblePA(const FiniteElementSpace& fes)
-+   virtual void AssemblePA(const FiniteElementSpace &fes)
-    {
-       bfi->AssemblePA(fes);
-    }
-+   virtual void AssemblePA(const FiniteElementSpace &trial_fes,
-+                           const FiniteElementSpace &test_fes)
-+   {
-+      bfi->AssemblePA(trial_fes, test_fes);
-+   }
-+
-+   virtual void AssemblePABoundary(const FiniteElementSpace &fes)
-+   {
-+      bfi->AssemblePABoundary(fes);
-+   }
-+   virtual void AssemblePABoundary(const FiniteElementSpace &trial_fes,
-+                                   const FiniteElementSpace &test_fes)
-+   {
-+      bfi->AssemblePABoundary(trial_fes, test_fes);
-+   }
- 
-    virtual void AssemblePAInteriorFaces(const FiniteElementSpace &fes)
-    {
-@@ -316,22 +334,20 @@ public:
-       bfi->AddMultPA(x, y);
-    }
- 
--   virtual void AddMultPA(const Vector& x, Vector& y) const
-+   virtual void AddMultPA(const Vector &x, Vector &y) const
-    {
-       bfi->AddMultTransposePA(x, y);
-    }
- 
--   virtual void AssembleEA(const FiniteElementSpace &fes, Vector &emat,
--                           const bool add);
-+   using BilinearFormIntegrator::AssembleEA;
-+   virtual void AssembleEA(const FiniteElementSpace &fes, Vector &emat);
- 
-    virtual void AssembleEAInteriorFaces(const FiniteElementSpace &fes,
-                                         Vector &ea_data_int,
--                                        Vector &ea_data_ext,
--                                        const bool add);
-+                                        Vector &ea_data_ext);
- 
-    virtual void AssembleEABoundaryFaces(const FiniteElementSpace &fes,
--                                        Vector &ea_data_bdr,
--                                        const bool add);
-+                                        Vector &ea_data_bdr);
- 
-    virtual ~TransposeIntegrator() { if (own_bfi) { delete bfi; } }
- };
-@@ -339,11 +355,11 @@ public:
- class LumpedIntegrator : public BilinearFormIntegrator
- {
- private:
--   int own_bfi;
-+   bool own_bfi;
-    BilinearFormIntegrator *bfi;
- 
- public:
--   LumpedIntegrator (BilinearFormIntegrator *bfi_, int own_bfi_ = 1)
-+   LumpedIntegrator(BilinearFormIntegrator *bfi_, bool own_bfi_ = true)
-    { bfi = bfi_; own_bfi = own_bfi_; }
- 
-    virtual void SetIntRule(const IntegrationRule *ir);
-@@ -359,11 +375,11 @@ public:
- class InverseIntegrator : public BilinearFormIntegrator
- {
- private:
--   int own_integrator;
-+   bool own_integrator;
-    BilinearFormIntegrator *integrator;
- 
- public:
--   InverseIntegrator(BilinearFormIntegrator *integ, int own_integ = 1)
-+   InverseIntegrator(BilinearFormIntegrator *integ, bool own_integ = 1)
-    { integrator = integ; own_integrator = own_integ; }
- 
-    virtual void SetIntRule(const IntegrationRule *ir);
-@@ -379,12 +395,12 @@ public:
- class SumIntegrator : public BilinearFormIntegrator
- {
- private:
--   int own_integrators;
-+   bool own_integrators;
-    mutable DenseMatrix elem_mat;
-    Array<BilinearFormIntegrator*> integrators;
- 
- public:
--   SumIntegrator(int own_integs = 1) { own_integrators = own_integs; }
-+   SumIntegrator(bool own_integs = true) { own_integrators = own_integs; }
- 
-    virtual void SetIntRule(const IntegrationRule *ir);
- 
-@@ -411,8 +427,9 @@ public:
-                                    FaceElementTransformations &Trans,
-                                    DenseMatrix &elmat);
- 
--   using BilinearFormIntegrator::AssemblePA;
--   virtual void AssemblePA(const FiniteElementSpace& fes);
-+   virtual void AssemblePA(const FiniteElementSpace &fes);
-+   virtual void AssemblePA(const FiniteElementSpace &trial_fes,
-+                           const FiniteElementSpace &test_fes);
- 
-    virtual void AssembleDiagonalPA(Vector &diag);
- 
-@@ -422,27 +439,27 @@ public:
- 
-    virtual void AddMultTransposePA(const Vector &x, Vector &y) const;
- 
--   virtual void AddMultPA(const Vector& x, Vector& y) const;
-+   virtual void AddMultPA(const Vector &x, Vector &y) const;
- 
-    virtual void AssembleMF(const FiniteElementSpace &fes);
-+   virtual void AssembleMF(const FiniteElementSpace &trial_fes,
-+                           const FiniteElementSpace &test_fes);
-+
-+   virtual void AssembleDiagonalMF(Vector &diag);
- 
-    virtual void AddMultMF(const Vector &x, Vector &y) const;
- 
-    virtual void AddMultTransposeMF(const Vector &x, Vector &y) const;
- 
--   virtual void AssembleDiagonalMF(Vector &diag);
--
--   virtual void AssembleEA(const FiniteElementSpace &fes, Vector &emat,
--                           const bool add);
-+   using BilinearFormIntegrator::AssembleEA;
-+   virtual void AssembleEA(const FiniteElementSpace &fes, Vector &emat);
- 
-    virtual void AssembleEAInteriorFaces(const FiniteElementSpace &fes,
-                                         Vector &ea_data_int,
--                                        Vector &ea_data_ext,
--                                        const bool add);
-+                                        Vector &ea_data_ext);
- 
-    virtual void AssembleEABoundaryFaces(const FiniteElementSpace &fes,
--                                        Vector &ea_data_bdr,
--                                        const bool add);
-+                                        Vector &ea_data_bdr);
- 
-    virtual ~SumIntegrator();
- };
-@@ -452,7 +469,6 @@ public:
- class MixedScalarIntegrator: public BilinearFormIntegrator
- {
- public:
--
-    virtual void AssembleElementMatrix2(const FiniteElement &trial_fe,
-                                        const FiniteElement &test_fe,
-                                        ElementTransformation &Trans,
-@@ -474,44 +490,39 @@ protected:
-    MixedScalarIntegrator(Coefficient &q) : same_calc_shape(false), Q(&q) {}
- 
-    inline virtual bool VerifyFiniteElementTypes(
--      const FiniteElement & trial_fe,
--      const FiniteElement & test_fe) const
-+      const FiniteElement &trial_fe, const FiniteElement &test_fe) const
-    {
-       return (trial_fe.GetRangeType() == mfem::FiniteElement::SCALAR &&
-               test_fe.GetRangeType()  == mfem::FiniteElement::SCALAR );
-    }
- 
--   inline virtual const char * FiniteElementTypeFailureMessage() const
-+   inline virtual const char *FiniteElementTypeFailureMessage() const
-    {
-       return "MixedScalarIntegrator:  "
-              "Trial and test spaces must both be scalar fields.";
-    }
- 
--   inline virtual int GetIntegrationOrder(const FiniteElement & trial_fe,
--                                          const FiniteElement & test_fe,
-+   inline virtual int GetIntegrationOrder(const FiniteElement &trial_fe,
-+                                          const FiniteElement &test_fe,
-                                           ElementTransformation &Trans)
-    { return trial_fe.GetOrder() + test_fe.GetOrder() + Trans.OrderW(); }
- 
--
--   inline virtual void CalcTestShape(const FiniteElement & test_fe,
-+   inline virtual void CalcTestShape(const FiniteElement &test_fe,
-                                      ElementTransformation &Trans,
--                                     Vector & shape)
-+                                     Vector &shape)
-    { test_fe.CalcPhysShape(Trans, shape); }
- 
--   inline virtual void CalcTrialShape(const FiniteElement & trial_fe,
-+   inline virtual void CalcTrialShape(const FiniteElement &trial_fe,
-                                       ElementTransformation &Trans,
--                                      Vector & shape)
-+                                      Vector &shape)
-    { trial_fe.CalcPhysShape(Trans, shape); }
- 
-    Coefficient *Q;
- 
- private:
--
- #ifndef MFEM_THREAD_SAFE
--   Vector test_shape;
--   Vector trial_shape;
-+   Vector test_shape, trial_shape;
- #endif
--
- };
- 
- /** An abstract class for integrating the inner product of two vector basis
-@@ -519,7 +530,6 @@ private:
- class MixedVectorIntegrator: public BilinearFormIntegrator
- {
- public:
--
-    virtual void AssembleElementMatrix2(const FiniteElement &trial_fe,
-                                        const FiniteElement &test_fe,
-                                        ElementTransformation &Trans,
-@@ -548,39 +558,38 @@ protected:
-       : same_calc_shape(false), Q(NULL), VQ(NULL), DQ(NULL), MQ(&mq) {}
- 
-    inline virtual bool VerifyFiniteElementTypes(
--      const FiniteElement & trial_fe,
--      const FiniteElement & test_fe) const
-+      const FiniteElement &trial_fe,
-+      const FiniteElement &test_fe) const
-    {
-       return (trial_fe.GetRangeType() == mfem::FiniteElement::VECTOR &&
-               test_fe.GetRangeType()  == mfem::FiniteElement::VECTOR );
-    }
- 
--   inline virtual const char * FiniteElementTypeFailureMessage() const
-+   inline virtual const char *FiniteElementTypeFailureMessage() const
-    {
-       return "MixedVectorIntegrator:  "
-              "Trial and test spaces must both be vector fields";
-    }
- 
--   inline virtual int GetIntegrationOrder(const FiniteElement & trial_fe,
--                                          const FiniteElement & test_fe,
-+   inline virtual int GetIntegrationOrder(const FiniteElement &trial_fe,
-+                                          const FiniteElement &test_fe,
-                                           ElementTransformation &Trans)
-    { return trial_fe.GetOrder() + test_fe.GetOrder() + Trans.OrderW(); }
- 
--
--   inline virtual int GetTestVDim(const FiniteElement & test_fe)
-+   inline virtual int GetTestVDim(const FiniteElement &test_fe)
-    { return std::max(space_dim, test_fe.GetVDim()); }
- 
--   inline virtual void CalcTestShape(const FiniteElement & test_fe,
-+   inline virtual void CalcTestShape(const FiniteElement &test_fe,
-                                      ElementTransformation &Trans,
--                                     DenseMatrix & shape)
-+                                     DenseMatrix &shape)
-    { test_fe.CalcVShape(Trans, shape); }
- 
--   inline virtual int GetTrialVDim(const FiniteElement & trial_fe)
-+   inline virtual int GetTrialVDim(const FiniteElement &trial_fe)
-    { return std::max(space_dim, trial_fe.GetVDim()); }
- 
--   inline virtual void CalcTrialShape(const FiniteElement & trial_fe,
-+   inline virtual void CalcTrialShape(const FiniteElement &trial_fe,
-                                       ElementTransformation &Trans,
--                                      DenseMatrix & shape)
-+                                      DenseMatrix &shape)
-    { trial_fe.CalcVShape(Trans, shape); }
- 
-    int space_dim;
-@@ -590,16 +599,10 @@ protected:
-    MatrixCoefficient *MQ;
- 
- private:
--
- #ifndef MFEM_THREAD_SAFE
--   Vector V;
--   Vector D;
--   DenseMatrix M;
--   DenseMatrix test_shape;
--   DenseMatrix trial_shape;
--   DenseMatrix shape_tmp;
-+   Vector V, D;
-+   DenseMatrix M, test_shape, trial_shape, shape_tmp;
- #endif
--
- };
- 
- /** An abstract class for integrating the product of a scalar basis function and
-@@ -608,7 +611,6 @@ private:
- class MixedScalarVectorIntegrator: public BilinearFormIntegrator
- {
- public:
--
-    virtual void AssembleElementMatrix2(const FiniteElement &trial_fe,
-                                        const FiniteElement &test_fe,
-                                        ElementTransformation &Trans,
-@@ -625,14 +627,13 @@ public:
-    { AssembleElementMatrix2(fe, fe, Trans, elmat); }
- 
- protected:
--
-    MixedScalarVectorIntegrator(VectorCoefficient &vq, bool transpose_ = false,
-                                bool cross_2d_ = false)
-       : VQ(&vq), transpose(transpose_), cross_2d(cross_2d_) {}
- 
-    inline virtual bool VerifyFiniteElementTypes(
--      const FiniteElement & trial_fe,
--      const FiniteElement & test_fe) const
-+      const FiniteElement &trial_fe,
-+      const FiniteElement &test_fe) const
-    {
-       return ((transpose &&
-                trial_fe.GetRangeType() == mfem::FiniteElement::VECTOR &&
-@@ -643,9 +644,9 @@ protected:
-              );
-    }
- 
--   inline virtual const char * FiniteElementTypeFailureMessage() const
-+   inline virtual const char *FiniteElementTypeFailureMessage() const
-    {
--      if ( transpose )
-+      if (transpose)
-       {
-          return "MixedScalarVectorIntegrator:  "
-                 "Trial space must be a vector field "
-@@ -659,23 +660,22 @@ protected:
-       }
-    }
- 
--   inline virtual int GetIntegrationOrder(const FiniteElement & trial_fe,
--                                          const FiniteElement & test_fe,
-+   inline virtual int GetIntegrationOrder(const FiniteElement &trial_fe,
-+                                          const FiniteElement &test_fe,
-                                           ElementTransformation &Trans)
-    { return trial_fe.GetOrder() + test_fe.GetOrder() + Trans.OrderW(); }
- 
--
--   inline virtual int GetVDim(const FiniteElement & vector_fe)
-+   inline virtual int GetVDim(const FiniteElement &vector_fe)
-    { return std::max(space_dim, vector_fe.GetVDim()); }
- 
--   inline virtual void CalcVShape(const FiniteElement & vector_fe,
-+   inline virtual void CalcVShape(const FiniteElement &vector_fe,
-                                   ElementTransformation &Trans,
--                                  DenseMatrix & shape_)
-+                                  DenseMatrix &shape_)
-    { vector_fe.CalcVShape(Trans, shape_); }
- 
--   inline virtual void CalcShape(const FiniteElement & scalar_fe,
-+   inline virtual void CalcShape(const FiniteElement &scalar_fe,
-                                  ElementTransformation &Trans,
--                                 Vector & shape_)
-+                                 Vector &shape_)
-    { scalar_fe.CalcPhysShape(Trans, shape_); }
- 
-    VectorCoefficient *VQ;
-@@ -684,14 +684,12 @@ protected:
-    bool cross_2d;  // In 2D use a cross product rather than a dot product
- 
- private:
--
- #ifndef MFEM_THREAD_SAFE
-    Vector V;
-    DenseMatrix vshape;
-    Vector      shape;
-    Vector      vshape_tmp;
- #endif
--
- };
- 
- /** Class for integrating the bilinear form a(u,v) := (Q u, v) in either 1D, 2D,
-@@ -726,24 +724,24 @@ public:
- 
- protected:
-    inline virtual bool VerifyFiniteElementTypes(
--      const FiniteElement & trial_fe,
--      const FiniteElement & test_fe) const
-+      const FiniteElement &trial_fe,
-+      const FiniteElement &test_fe) const
-    {
-       return (trial_fe.GetDim() == 1 && test_fe.GetDim() == 1 &&
-               trial_fe.GetDerivType() == mfem::FiniteElement::GRAD  &&
-               test_fe.GetRangeType()  == mfem::FiniteElement::SCALAR );
-    }
- 
--   inline virtual const char * FiniteElementTypeFailureMessage() const
-+   inline virtual const char *FiniteElementTypeFailureMessage() const
-    {
-       return "MixedScalarDerivativeIntegrator:  "
-              "Trial and test spaces must both be scalar fields in 1D "
-              "and the trial space must implement CalcDShape.";
-    }
- 
--   inline virtual void CalcTrialShape(const FiniteElement & trial_fe,
-+   inline virtual void CalcTrialShape(const FiniteElement &trial_fe,
-                                       ElementTransformation &Trans,
--                                      Vector & shape)
-+                                      Vector &shape)
-    {
-       DenseMatrix dshape(shape.GetData(), shape.Size(), 1);
-       trial_fe.CalcPhysDShape(Trans, dshape);
-@@ -761,15 +759,15 @@ public:
- 
- protected:
-    inline virtual bool VerifyFiniteElementTypes(
--      const FiniteElement & trial_fe,
--      const FiniteElement & test_fe) const
-+      const FiniteElement &trial_fe,
-+      const FiniteElement &test_fe) const
-    {
-       return (trial_fe.GetDim() == 1 && test_fe.GetDim() == 1 &&
-               trial_fe.GetRangeType() == mfem::FiniteElement::SCALAR &&
-               test_fe.GetDerivType()  == mfem::FiniteElement::GRAD );
-    }
- 
--   inline virtual const char * FiniteElementTypeFailureMessage() const
-+   inline virtual const char *FiniteElementTypeFailureMessage() const
-    {
-       return "MixedScalarWeakDerivativeIntegrator:  "
-              "Trial and test spaces must both be scalar fields in 1D "
-@@ -777,9 +775,9 @@ protected:
-              "map type \"VALUE\".";
-    }
- 
--   inline virtual void CalcTestShape(const FiniteElement & test_fe,
-+   inline virtual void CalcTestShape(const FiniteElement &test_fe,
-                                      ElementTransformation &Trans,
--                                     Vector & shape)
-+                                     Vector &shape)
-    {
-       DenseMatrix dshape(shape.GetData(), shape.Size(), 1);
-       test_fe.CalcPhysDShape(Trans, dshape);
-@@ -799,28 +797,28 @@ public:
- 
- protected:
-    inline virtual bool VerifyFiniteElementTypes(
--      const FiniteElement & trial_fe,
--      const FiniteElement & test_fe) const
-+      const FiniteElement &trial_fe,
-+      const FiniteElement &test_fe) const
-    {
-       return (trial_fe.GetDerivType() == mfem::FiniteElement::DIV  &&
-               test_fe.GetRangeType()  == mfem::FiniteElement::SCALAR );
-    }
- 
--   inline virtual const char * FiniteElementTypeFailureMessage() const
-+   inline virtual const char *FiniteElementTypeFailureMessage() const
-    {
-       return "MixedScalarDivergenceIntegrator:  "
-              "Trial must be H(Div) and the test space must be a "
-              "scalar field";
-    }
- 
--   inline virtual int GetIntegrationOrder(const FiniteElement & trial_fe,
--                                          const FiniteElement & test_fe,
-+   inline virtual int GetIntegrationOrder(const FiniteElement &trial_fe,
-+                                          const FiniteElement &test_fe,
-                                           ElementTransformation &Trans)
-    { return trial_fe.GetOrder() + test_fe.GetOrder() + Trans.OrderW() - 1; }
- 
--   inline virtual void CalcTrialShape(const FiniteElement & trial_fe,
-+   inline virtual void CalcTrialShape(const FiniteElement &trial_fe,
-                                       ElementTransformation &Trans,
--                                      Vector & shape)
-+                                      Vector &shape)
-    { trial_fe.CalcPhysDivShape(Trans, shape); }
- };
- 
-@@ -835,14 +833,14 @@ public:
- 
- protected:
-    inline virtual bool VerifyFiniteElementTypes(
--      const FiniteElement & trial_fe,
--      const FiniteElement & test_fe) const
-+      const FiniteElement &trial_fe,
-+      const FiniteElement &test_fe) const
-    {
-       return (trial_fe.GetDerivType() == mfem::FiniteElement::DIV  &&
-               test_fe.GetRangeType()  == mfem::FiniteElement::VECTOR );
-    }
- 
--   inline virtual const char * FiniteElementTypeFailureMessage() const
-+   inline virtual const char *FiniteElementTypeFailureMessage() const
-    {
-       return "MixedVectorDivergenceIntegrator:  "
-              "Trial must be H(Div) and the test space must be a "
-@@ -851,14 +849,14 @@ protected:
- 
-    // Subtract one due to the divergence and add one for the coefficient
-    // which is assumed to be at least linear.
--   inline virtual int GetIntegrationOrder(const FiniteElement & trial_fe,
--                                          const FiniteElement & test_fe,
-+   inline virtual int GetIntegrationOrder(const FiniteElement &trial_fe,
-+                                          const FiniteElement &test_fe,
-                                           ElementTransformation &Trans)
-    { return trial_fe.GetOrder() + test_fe.GetOrder() + Trans.OrderW() - 1 + 1; }
- 
--   inline virtual void CalcShape(const FiniteElement & scalar_fe,
-+   inline virtual void CalcShape(const FiniteElement &scalar_fe,
-                                  ElementTransformation &Trans,
--                                 Vector & shape)
-+                                 Vector &shape)
-    { scalar_fe.CalcPhysDivShape(Trans, shape); }
- };
- 
-@@ -874,28 +872,28 @@ public:
- 
- protected:
-    inline virtual bool VerifyFiniteElementTypes(
--      const FiniteElement & trial_fe,
--      const FiniteElement & test_fe) const
-+      const FiniteElement &trial_fe,
-+      const FiniteElement &test_fe) const
-    {
-       return (trial_fe.GetRangeType() == mfem::FiniteElement::SCALAR &&
-               test_fe.GetDerivType()  == mfem::FiniteElement::DIV );
-    }
- 
--   inline virtual const char * FiniteElementTypeFailureMessage() const
-+   inline virtual const char *FiniteElementTypeFailureMessage() const
-    {
-       return "MixedScalarWeakGradientIntegrator:  "
-              "Trial space must be a scalar field "
-              "and the test space must be H(Div)";
-    }
- 
--   inline virtual int GetIntegrationOrder(const FiniteElement & trial_fe,
--                                          const FiniteElement & test_fe,
-+   inline virtual int GetIntegrationOrder(const FiniteElement &trial_fe,
-+                                          const FiniteElement &test_fe,
-                                           ElementTransformation &Trans)
-    { return trial_fe.GetOrder() + test_fe.GetOrder() + Trans.OrderW() - 1; }
- 
--   virtual void CalcTestShape(const FiniteElement & test_fe,
-+   virtual void CalcTestShape(const FiniteElement &test_fe,
-                               ElementTransformation &Trans,
--                              Vector & shape)
-+                              Vector &shape)
-    {
-       test_fe.CalcPhysDivShape(Trans, shape);
-       shape *= -1.0;
-@@ -914,29 +912,29 @@ public:
- 
- protected:
-    inline virtual bool VerifyFiniteElementTypes(
--      const FiniteElement & trial_fe,
--      const FiniteElement & test_fe) const
-+      const FiniteElement &trial_fe,
-+      const FiniteElement &test_fe) const
-    {
-       return (trial_fe.GetDim() == 2 && test_fe.GetDim() == 2 &&
-               trial_fe.GetDerivType() == mfem::FiniteElement::CURL &&
-               test_fe.GetRangeType()  == mfem::FiniteElement::SCALAR);
-    }
- 
--   inline virtual const char * FiniteElementTypeFailureMessage() const
-+   inline virtual const char *FiniteElementTypeFailureMessage() const
-    {
-       return "MixedScalarCurlIntegrator:  "
-              "Trial must be H(Curl) and the test space must be a "
-              "scalar field";
-    }
- 
--   inline virtual int GetIntegrationOrder(const FiniteElement & trial_fe,
--                                          const FiniteElement & test_fe,
-+   inline virtual int GetIntegrationOrder(const FiniteElement &trial_fe,
-+                                          const FiniteElement &test_fe,
-                                           ElementTransformation &Trans)
-    { return trial_fe.GetOrder() + test_fe.GetOrder() + Trans.OrderW() - 1; }
- 
--   inline virtual void CalcTrialShape(const FiniteElement & trial_fe,
-+   inline virtual void CalcTrialShape(const FiniteElement &trial_fe,
-                                       ElementTransformation &Trans,
--                                      Vector & shape)
-+                                      Vector &shape)
-    {
-       DenseMatrix dshape(shape.GetData(), shape.Size(), 1);
-       trial_fe.CalcPhysCurlShape(Trans, dshape);
-@@ -946,7 +944,8 @@ protected:
-    virtual void AssemblePA(const FiniteElementSpace &trial_fes,
-                            const FiniteElementSpace &test_fes);
- 
--   virtual void AddMultPA(const Vector&, Vector&) const;
-+   virtual void AddMultPA(const Vector &x, Vector &y) const;
-+
-    virtual void AddMultTransposePA(const Vector &x, Vector &y) const;
- 
-    // PA extension
-@@ -970,24 +969,24 @@ public:
- 
- protected:
-    inline virtual bool VerifyFiniteElementTypes(
--      const FiniteElement & trial_fe,
--      const FiniteElement & test_fe) const
-+      const FiniteElement &trial_fe,
-+      const FiniteElement &test_fe) const
-    {
-       return (trial_fe.GetDim() == 2 && test_fe.GetDim() == 2 &&
-               trial_fe.GetRangeType() == mfem::FiniteElement::SCALAR &&
-               test_fe.GetDerivType()  == mfem::FiniteElement::CURL );
-    }
- 
--   inline virtual const char * FiniteElementTypeFailureMessage() const
-+   inline virtual const char *FiniteElementTypeFailureMessage() const
-    {
-       return "MixedScalarWeakCurlIntegrator:  "
-              "Trial space must be a scalar field "
-              "and the test space must be H(Curl)";
-    }
- 
--   inline virtual void CalcTestShape(const FiniteElement & test_fe,
-+   inline virtual void CalcTestShape(const FiniteElement &test_fe,
-                                      ElementTransformation &Trans,
--                                     Vector & shape)
-+                                     Vector &shape)
-    {
-       DenseMatrix dshape(shape.GetData(), shape.Size(), 1);
-       test_fe.CalcPhysCurlShape(Trans, dshape);
-@@ -1028,14 +1027,14 @@ public:
-       : MixedScalarVectorIntegrator(vq, true) {}
- 
-    inline virtual bool VerifyFiniteElementTypes(
--      const FiniteElement & trial_fe,
--      const FiniteElement & test_fe) const
-+      const FiniteElement &trial_fe,
-+      const FiniteElement &test_fe) const
-    {
-       return (trial_fe.GetRangeType() == mfem::FiniteElement::VECTOR &&
-               test_fe.GetRangeType()  == mfem::FiniteElement::SCALAR );
-    }
- 
--   inline virtual const char * FiniteElementTypeFailureMessage() const
-+   inline virtual const char *FiniteElementTypeFailureMessage() const
-    {
-       return "MixedDotProductIntegrator:  "
-              "Trial space must be a vector field "
-@@ -1053,15 +1052,15 @@ public:
-       : MixedScalarVectorIntegrator(vq, true) {}
- 
-    inline virtual bool VerifyFiniteElementTypes(
--      const FiniteElement & trial_fe,
--      const FiniteElement & test_fe) const
-+      const FiniteElement &trial_fe,
-+      const FiniteElement &test_fe) const
-    {
-       return (trial_fe.GetRangeType() == mfem::FiniteElement::VECTOR &&
-               test_fe.GetRangeType()  == mfem::FiniteElement::VECTOR &&
-               test_fe.GetDerivType()  == mfem::FiniteElement::DIV );
-    }
- 
--   inline virtual const char * FiniteElementTypeFailureMessage() const
-+   inline virtual const char *FiniteElementTypeFailureMessage() const
-    {
-       return "MixedWeakGradDotIntegrator:  "
-              "Trial space must be a vector field "
-@@ -1070,14 +1069,14 @@ public:
- 
-    // Subtract one due to the gradient and add one for the coefficient
-    // which is assumed to be at least linear.
--   inline virtual int GetIntegrationOrder(const FiniteElement & trial_fe,
--                                          const FiniteElement & test_fe,
-+   inline virtual int GetIntegrationOrder(const FiniteElement &trial_fe,
-+                                          const FiniteElement &test_fe,
-                                           ElementTransformation &Trans)
-    { return trial_fe.GetOrder() + test_fe.GetOrder() + Trans.OrderW() - 1 + 1; }
- 
--   inline virtual void CalcShape(const FiniteElement & scalar_fe,
-+   inline virtual void CalcShape(const FiniteElement &scalar_fe,
-                                  ElementTransformation &Trans,
--                                 Vector & shape)
-+                                 Vector &shape)
-    { scalar_fe.CalcPhysDivShape(Trans, shape); shape *= -1.0; }
- };
- 
-@@ -1090,8 +1089,8 @@ public:
-       : MixedVectorIntegrator(vq, false) {}
- 
-    inline virtual bool VerifyFiniteElementTypes(
--      const FiniteElement & trial_fe,
--      const FiniteElement & test_fe) const
-+      const FiniteElement &trial_fe,
-+      const FiniteElement &test_fe) const
-    {
-       return (trial_fe.GetVDim() == 3 &&
-               trial_fe.GetRangeType() == mfem::FiniteElement::VECTOR &&
-@@ -1099,19 +1098,19 @@ public:
-               test_fe.GetDerivType()  == mfem::FiniteElement::GRAD );
-    }
- 
--   inline virtual const char * FiniteElementTypeFailureMessage() const
-+   inline virtual const char *FiniteElementTypeFailureMessage() const
-    {
-       return "MixedWeakDivCrossIntegrator:  "
-              "Trial space must be a vector field in 3D "
-              "and the test space must be a scalar field with a gradient";
-    }
- 
--   inline virtual int GetTestVDim(const FiniteElement & test_fe)
-+   inline virtual int GetTestVDim(const FiniteElement &test_fe)
-    { return space_dim; }
- 
--   inline virtual void CalcTestShape(const FiniteElement & test_fe,
-+   inline virtual void CalcTestShape(const FiniteElement &test_fe,
-                                      ElementTransformation &Trans,
--                                     DenseMatrix & shape)
-+                                     DenseMatrix &shape)
-    { test_fe.CalcPhysDShape(Trans, shape); shape *= -1.0; }
- };
- 
-@@ -1130,8 +1129,8 @@ public:
-       : MixedVectorIntegrator(mq) { same_calc_shape = true; }
- 
-    inline virtual bool VerifyFiniteElementTypes(
--      const FiniteElement & trial_fe,
--      const FiniteElement & test_fe) const
-+      const FiniteElement &trial_fe,
-+      const FiniteElement &test_fe) const
-    {
-       return (trial_fe.GetRangeType() == mfem::FiniteElement::SCALAR &&
-               trial_fe.GetDerivType() == mfem::FiniteElement::GRAD &&
-@@ -1139,15 +1138,15 @@ public:
-               test_fe.GetDerivType()  == mfem::FiniteElement::GRAD );
-    }
- 
--   inline virtual const char * FiniteElementTypeFailureMessage() const
-+   inline virtual const char *FiniteElementTypeFailureMessage() const
-    {
-       return "MixedGradGradIntegrator:  "
-              "Trial and test spaces must both be scalar fields "
-              "with a gradient operator.";
-    }
- 
--   inline virtual int GetIntegrationOrder(const FiniteElement & trial_fe,
--                                          const FiniteElement & test_fe,
-+   inline virtual int GetIntegrationOrder(const FiniteElement &trial_fe,
-+                                          const FiniteElement &test_fe,
-                                           ElementTransformation &Trans)
-    {
-       // Same as DiffusionIntegrator
-@@ -1156,20 +1155,20 @@ public:
-              trial_fe.GetOrder() + test_fe.GetOrder() + test_fe.GetDim() - 1;
-    }
- 
--   inline virtual int GetTrialVDim(const FiniteElement & trial_fe)
-+   inline virtual int GetTrialVDim(const FiniteElement &trial_fe)
-    { return space_dim; }
- 
--   inline virtual void CalcTrialShape(const FiniteElement & trial_fe,
-+   inline virtual void CalcTrialShape(const FiniteElement &trial_fe,
-                                       ElementTransformation &Trans,
--                                      DenseMatrix & shape)
-+                                      DenseMatrix &shape)
-    { trial_fe.CalcPhysDShape(Trans, shape); }
- 
--   inline virtual int GetTestVDim(const FiniteElement & test_fe)
-+   inline virtual int GetTestVDim(const FiniteElement &test_fe)
-    { return space_dim; }
- 
--   inline virtual void CalcTestShape(const FiniteElement & test_fe,
-+   inline virtual void CalcTestShape(const FiniteElement &test_fe,
-                                      ElementTransformation &Trans,
--                                     DenseMatrix & shape)
-+                                     DenseMatrix &shape)
-    { test_fe.CalcPhysDShape(Trans, shape); }
- };
- 
-@@ -1182,8 +1181,8 @@ public:
-       : MixedVectorIntegrator(vq, false) { same_calc_shape = true; }
- 
-    inline virtual bool VerifyFiniteElementTypes(
--      const FiniteElement & trial_fe,
--      const FiniteElement & test_fe) const
-+      const FiniteElement &trial_fe,
-+      const FiniteElement &test_fe) const
-    {
-       return (trial_fe.GetRangeType() == mfem::FiniteElement::SCALAR &&
-               trial_fe.GetDerivType() == mfem::FiniteElement::GRAD &&
-@@ -1191,27 +1190,27 @@ public:
-               test_fe.GetDerivType()  == mfem::FiniteElement::GRAD );
-    }
- 
--   inline virtual const char * FiniteElementTypeFailureMessage() const
-+   inline virtual const char *FiniteElementTypeFailureMessage() const
-    {
-       return "MixedCrossGradGradIntegrator:  "
-              "Trial and test spaces must both be scalar fields "
-              "with a gradient operator.";
-    }
- 
--   inline virtual int GetTrialVDim(const FiniteElement & trial_fe)
-+   inline virtual int GetTrialVDim(const FiniteElement &trial_fe)
-    { return space_dim; }
- 
--   inline virtual void CalcTrialShape(const FiniteElement & trial_fe,
-+   inline virtual void CalcTrialShape(const FiniteElement &trial_fe,
-                                       ElementTransformation &Trans,
--                                      DenseMatrix & shape)
-+                                      DenseMatrix &shape)
-    { trial_fe.CalcPhysDShape(Trans, shape); }
- 
--   inline virtual int GetTestVDim(const FiniteElement & test_fe)
-+   inline virtual int GetTestVDim(const FiniteElement &test_fe)
-    { return space_dim; }
- 
--   inline virtual void CalcTestShape(const FiniteElement & test_fe,
-+   inline virtual void CalcTestShape(const FiniteElement &test_fe,
-                                      ElementTransformation &Trans,
--                                     DenseMatrix & shape)
-+                                     DenseMatrix &shape)
-    { test_fe.CalcPhysDShape(Trans, shape); }
- };
- 
-@@ -1230,8 +1229,8 @@ public:
-       : MixedVectorIntegrator(mq) { same_calc_shape = true; }
- 
-    inline virtual bool VerifyFiniteElementTypes(
--      const FiniteElement & trial_fe,
--      const FiniteElement & test_fe) const
-+      const FiniteElement &trial_fe,
-+      const FiniteElement &test_fe) const
-    {
-       return (trial_fe.GetCurlDim() == 3 && test_fe.GetCurlDim() == 3 &&
-               trial_fe.GetRangeType() == mfem::FiniteElement::VECTOR &&
-@@ -1240,27 +1239,27 @@ public:
-               test_fe.GetDerivType()  == mfem::FiniteElement::CURL );
-    }
- 
--   inline virtual const char * FiniteElementTypeFailureMessage() const
-+   inline virtual const char *FiniteElementTypeFailureMessage() const
-    {
-       return "MixedCurlCurlIntegrator"
-              "Trial and test spaces must both be vector fields in 3D "
-              "with a curl.";
-    }
- 
--   inline virtual int GetTrialVDim(const FiniteElement & trial_fe)
-+   inline virtual int GetTrialVDim(const FiniteElement &trial_fe)
-    { return trial_fe.GetCurlDim(); }
- 
--   inline virtual void CalcTrialShape(const FiniteElement & trial_fe,
-+   inline virtual void CalcTrialShape(const FiniteElement &trial_fe,
-                                       ElementTransformation &Trans,
--                                      DenseMatrix & shape)
-+                                      DenseMatrix &shape)
-    { trial_fe.CalcPhysCurlShape(Trans, shape); }
- 
--   inline virtual int GetTestVDim(const FiniteElement & test_fe)
-+   inline virtual int GetTestVDim(const FiniteElement &test_fe)
-    { return test_fe.GetCurlDim(); }
- 
--   inline virtual void CalcTestShape(const FiniteElement & test_fe,
-+   inline virtual void CalcTestShape(const FiniteElement &test_fe,
-                                      ElementTransformation &Trans,
--                                     DenseMatrix & shape)
-+                                     DenseMatrix &shape)
-    { test_fe.CalcPhysCurlShape(Trans, shape); }
- };
- 
-@@ -1273,8 +1272,8 @@ public:
-       : MixedVectorIntegrator(vq, false) { same_calc_shape = true; }
- 
-    inline virtual bool VerifyFiniteElementTypes(
--      const FiniteElement & trial_fe,
--      const FiniteElement & test_fe) const
-+      const FiniteElement &trial_fe,
-+      const FiniteElement &test_fe) const
-    {
-       return (trial_fe.GetCurlDim() == 3 && trial_fe.GetVDim() == 3 &&
-               test_fe.GetCurlDim() == 3 && test_fe.GetVDim() == 3 &&
-@@ -1284,27 +1283,27 @@ public:
-               test_fe.GetDerivType()  == mfem::FiniteElement::CURL );
-    }
- 
--   inline virtual const char * FiniteElementTypeFailureMessage() const
-+   inline virtual const char *FiniteElementTypeFailureMessage() const
-    {
-       return "MixedCrossCurlCurlIntegrator:  "
-              "Trial and test spaces must both be vector fields in 3D "
-              "with a curl.";
-    }
- 
--   inline virtual int GetTrialVDim(const FiniteElement & trial_fe)
-+   inline virtual int GetTrialVDim(const FiniteElement &trial_fe)
-    { return trial_fe.GetCurlDim(); }
- 
--   inline virtual void CalcTrialShape(const FiniteElement & trial_fe,
-+   inline virtual void CalcTrialShape(const FiniteElement &trial_fe,
-                                       ElementTransformation &Trans,
--                                      DenseMatrix & shape)
-+                                      DenseMatrix &shape)
-    { trial_fe.CalcPhysCurlShape(Trans, shape); }
- 
--   inline virtual int GetTestVDim(const FiniteElement & test_fe)
-+   inline virtual int GetTestVDim(const FiniteElement &test_fe)
-    { return test_fe.GetCurlDim(); }
- 
--   inline virtual void CalcTestShape(const FiniteElement & test_fe,
-+   inline virtual void CalcTestShape(const FiniteElement &test_fe,
-                                      ElementTransformation &Trans,
--                                     DenseMatrix & shape)
-+                                     DenseMatrix &shape)
-    { test_fe.CalcPhysCurlShape(Trans, shape); }
- };
- 
-@@ -1317,8 +1316,8 @@ public:
-       : MixedVectorIntegrator(vq, false) {}
- 
-    inline virtual bool VerifyFiniteElementTypes(
--      const FiniteElement & trial_fe,
--      const FiniteElement & test_fe) const
-+      const FiniteElement &trial_fe,
-+      const FiniteElement &test_fe) const
-    {
-       return (trial_fe.GetCurlDim() == 3 &&
-               trial_fe.GetRangeType() == mfem::FiniteElement::VECTOR &&
-@@ -1327,27 +1326,27 @@ public:
-               test_fe.GetDerivType()  == mfem::FiniteElement::GRAD );
-    }
- 
--   inline virtual const char * FiniteElementTypeFailureMessage() const
-+   inline virtual const char *FiniteElementTypeFailureMessage() const
-    {
-       return "MixedCrossCurlGradIntegrator"
-              "Trial space must be a vector field in 3D with a curl"
-              "and the test space must be a scalar field with a gradient";
-    }
- 
--   inline virtual int GetTrialVDim(const FiniteElement & trial_fe)
-+   inline virtual int GetTrialVDim(const FiniteElement &trial_fe)
-    { return trial_fe.GetCurlDim(); }
- 
--   inline virtual void CalcTrialShape(const FiniteElement & trial_fe,
-+   inline virtual void CalcTrialShape(const FiniteElement &trial_fe,
-                                       ElementTransformation &Trans,
--                                      DenseMatrix & shape)
-+                                      DenseMatrix &shape)
-    { trial_fe.CalcPhysCurlShape(Trans, shape); }
- 
--   inline virtual int GetTestVDim(const FiniteElement & test_fe)
-+   inline virtual int GetTestVDim(const FiniteElement &test_fe)
-    { return space_dim; }
- 
--   inline virtual void CalcTestShape(const FiniteElement & test_fe,
-+   inline virtual void CalcTestShape(const FiniteElement &test_fe,
-                                      ElementTransformation &Trans,
--                                     DenseMatrix & shape)
-+                                     DenseMatrix &shape)
-    { test_fe.CalcPhysDShape(Trans, shape); }
- };
- 
-@@ -1360,8 +1359,8 @@ public:
-       : MixedVectorIntegrator(vq, false) {}
- 
-    inline virtual bool VerifyFiniteElementTypes(
--      const FiniteElement & trial_fe,
--      const FiniteElement & test_fe) const
-+      const FiniteElement &trial_fe,
-+      const FiniteElement &test_fe) const
-    {
-       return (test_fe.GetCurlDim() == 3 &&
-               trial_fe.GetRangeType()  == mfem::FiniteElement::SCALAR &&
-@@ -1370,27 +1369,27 @@ public:
-               test_fe.GetDerivType() == mfem::FiniteElement::CURL );
-    }
- 
--   inline virtual const char * FiniteElementTypeFailureMessage() const
-+   inline virtual const char *FiniteElementTypeFailureMessage() const
-    {
-       return "MixedCrossGradCurlIntegrator"
-              "Trial space must be a scalar field in 3D with a gradient"
-              "and the test space must be a vector field with a curl";
-    }
- 
--   inline virtual int GetTrialVDim(const FiniteElement & trial_fe)
-+   inline virtual int GetTrialVDim(const FiniteElement &trial_fe)
-    { return space_dim; }
- 
--   inline virtual void CalcTrialShape(const FiniteElement & trial_fe,
-+   inline virtual void CalcTrialShape(const FiniteElement &trial_fe,
-                                       ElementTransformation &Trans,
--                                      DenseMatrix & shape)
-+                                      DenseMatrix &shape)
-    { trial_fe.CalcPhysDShape(Trans, shape); }
- 
--   inline virtual int GetTestVDim(const FiniteElement & test_fe)
-+   inline virtual int GetTestVDim(const FiniteElement &test_fe)
-    { return test_fe.GetCurlDim(); }
- 
--   inline virtual void CalcTestShape(const FiniteElement & test_fe,
-+   inline virtual void CalcTestShape(const FiniteElement &test_fe,
-                                      ElementTransformation &Trans,
--                                     DenseMatrix & shape)
-+                                     DenseMatrix &shape)
-    { test_fe.CalcPhysCurlShape(Trans, shape); }
- };
- 
-@@ -1404,8 +1403,8 @@ public:
-       : MixedVectorIntegrator(vq, false) {}
- 
-    inline virtual bool VerifyFiniteElementTypes(
--      const FiniteElement & trial_fe,
--      const FiniteElement & test_fe) const
-+      const FiniteElement &trial_fe,
-+      const FiniteElement &test_fe) const
-    {
-       return (trial_fe.GetVDim() == 3 && test_fe.GetCurlDim() == 3 &&
-               trial_fe.GetRangeType() == mfem::FiniteElement::VECTOR &&
-@@ -1413,19 +1412,19 @@ public:
-               test_fe.GetDerivType()  == mfem::FiniteElement::CURL );
-    }
- 
--   inline virtual const char * FiniteElementTypeFailureMessage() const
-+   inline virtual const char *FiniteElementTypeFailureMessage() const
-    {
-       return "MixedWeakCurlCrossIntegrator:  "
-              "Trial space must be a vector field in 3D "
-              "and the test space must be a vector field with a curl";
-    }
- 
--   inline virtual int GetTestVDim(const FiniteElement & test_fe)
-+   inline virtual int GetTestVDim(const FiniteElement &test_fe)
-    { return test_fe.GetCurlDim(); }
- 
--   inline virtual void CalcTestShape(const FiniteElement & test_fe,
-+   inline virtual void CalcTestShape(const FiniteElement &test_fe,
-                                      ElementTransformation &Trans,
--                                     DenseMatrix & shape)
-+                                     DenseMatrix &shape)
-    { test_fe.CalcPhysCurlShape(Trans, shape); }
- };
- 
-@@ -1439,8 +1438,8 @@ public:
-       : MixedScalarVectorIntegrator(vq, true, true) {}
- 
-    inline virtual bool VerifyFiniteElementTypes(
--      const FiniteElement & trial_fe,
--      const FiniteElement & test_fe) const
-+      const FiniteElement &trial_fe,
-+      const FiniteElement &test_fe) const
-    {
-       return (trial_fe.GetDim() == 2 && test_fe.GetDim() == 2 &&
-               trial_fe.GetRangeType() == mfem::FiniteElement::VECTOR &&
-@@ -1448,16 +1447,16 @@ public:
-               test_fe.GetDerivType()  == mfem::FiniteElement::CURL );
-    }
- 
--   inline virtual const char * FiniteElementTypeFailureMessage() const
-+   inline virtual const char *FiniteElementTypeFailureMessage() const
-    {
-       return "MixedScalarWeakCurlCrossIntegrator:  "
-              "Trial space must be a vector field in 2D "
-              "and the test space must be a vector field with a curl";
-    }
- 
--   inline virtual void CalcShape(const FiniteElement & scalar_fe,
-+   inline virtual void CalcShape(const FiniteElement &scalar_fe,
-                                  ElementTransformation &Trans,
--                                 Vector & shape)
-+                                 Vector &shape)
-    {
-       DenseMatrix dshape(shape.GetData(), shape.Size(), 1);
-       scalar_fe.CalcPhysCurlShape(Trans, dshape);
-@@ -1474,8 +1473,8 @@ public:
-       : MixedVectorIntegrator(vq, false) {}
- 
-    inline virtual bool VerifyFiniteElementTypes(
--      const FiniteElement & trial_fe,
--      const FiniteElement & test_fe) const
-+      const FiniteElement &trial_fe,
-+      const FiniteElement &test_fe) const
-    {
-       return (test_fe.GetVDim() == 3 &&
-               trial_fe.GetRangeType() == mfem::FiniteElement::SCALAR &&
-@@ -1483,24 +1482,24 @@ public:
-               test_fe.GetRangeType()  == mfem::FiniteElement::VECTOR );
-    }
- 
--   inline virtual const char * FiniteElementTypeFailureMessage() const
-+   inline virtual const char *FiniteElementTypeFailureMessage() const
-    {
-       return "MixedCrossGradIntegrator:  "
-              "Trial space must be a scalar field with a gradient operator"
-              " and the test space must be a vector field both in 3D.";
-    }
- 
--   inline virtual int GetTrialVDim(const FiniteElement & trial_fe)
-+   inline virtual int GetTrialVDim(const FiniteElement &trial_fe)
-    { return space_dim; }
- 
--   inline virtual void CalcTrialShape(const FiniteElement & trial_fe,
-+   inline virtual void CalcTrialShape(const FiniteElement &trial_fe,
-                                       ElementTransformation &Trans,
--                                      DenseMatrix & shape)
-+                                      DenseMatrix &shape)
-    { trial_fe.CalcPhysDShape(Trans, shape); }
- 
--   inline virtual void CalcTestShape(const FiniteElement & test_fe,
-+   inline virtual void CalcTestShape(const FiniteElement &test_fe,
-                                      ElementTransformation &Trans,
--                                     DenseMatrix & shape)
-+                                     DenseMatrix &shape)
-    { test_fe.CalcVShape(Trans, shape); }
- };
- 
-@@ -1514,8 +1513,8 @@ public:
-       : MixedVectorIntegrator(vq, false) {}
- 
-    inline virtual bool VerifyFiniteElementTypes(
--      const FiniteElement & trial_fe,
--      const FiniteElement & test_fe) const
-+      const FiniteElement &trial_fe,
-+      const FiniteElement &test_fe) const
-    {
-       return (trial_fe.GetCurlDim() == 3 && test_fe.GetVDim() == 3 &&
-               trial_fe.GetRangeType() == mfem::FiniteElement::VECTOR &&
-@@ -1523,19 +1522,19 @@ public:
-               test_fe.GetRangeType()  == mfem::FiniteElement::VECTOR );
-    }
- 
--   inline virtual const char * FiniteElementTypeFailureMessage() const
-+   inline virtual const char *FiniteElementTypeFailureMessage() const
-    {
-       return "MixedCrossCurlIntegrator:  "
-              "Trial space must be a vector field in 3D with a curl "
-              "and the test space must be a vector field";
-    }
- 
--   inline virtual int GetTrialVDim(const FiniteElement & trial_fe)
-+   inline virtual int GetTrialVDim(const FiniteElement &trial_fe)
-    { return trial_fe.GetCurlDim(); }
- 
--   inline virtual void CalcTrialShape(const FiniteElement & trial_fe,
-+   inline virtual void CalcTrialShape(const FiniteElement &trial_fe,
-                                       ElementTransformation &Trans,
--                                      DenseMatrix & shape)
-+                                      DenseMatrix &shape)
-    { trial_fe.CalcPhysCurlShape(Trans, shape); }
- };
- 
-@@ -1549,8 +1548,8 @@ public:
-       : MixedScalarVectorIntegrator(vq, false, true) {}
- 
-    inline virtual bool VerifyFiniteElementTypes(
--      const FiniteElement & trial_fe,
--      const FiniteElement & test_fe) const
-+      const FiniteElement &trial_fe,
-+      const FiniteElement &test_fe) const
-    {
-       return (trial_fe.GetDim() == 2 && test_fe.GetDim() == 2 &&
-               trial_fe.GetRangeType() == mfem::FiniteElement::VECTOR &&
-@@ -1558,16 +1557,16 @@ public:
-               test_fe.GetRangeType()  == mfem::FiniteElement::VECTOR );
-    }
- 
--   inline virtual const char * FiniteElementTypeFailureMessage() const
-+   inline virtual const char *FiniteElementTypeFailureMessage() const
-    {
-       return "MixedCrossCurlIntegrator:  "
-              "Trial space must be a vector field in 2D with a curl "
-              "and the test space must be a vector field";
-    }
- 
--   inline virtual void CalcShape(const FiniteElement & scalar_fe,
-+   inline virtual void CalcShape(const FiniteElement &scalar_fe,
-                                  ElementTransformation &Trans,
--                                 Vector & shape)
-+                                 Vector &shape)
-    {
-       DenseMatrix dshape(shape.GetData(), shape.Size(), 1);
-       scalar_fe.CalcPhysCurlShape(Trans, dshape); shape *= -1.0;
-@@ -1583,8 +1582,8 @@ public:
-       : MixedScalarVectorIntegrator(vq, true, true) {}
- 
-    inline virtual bool VerifyFiniteElementTypes(
--      const FiniteElement & trial_fe,
--      const FiniteElement & test_fe) const
-+      const FiniteElement &trial_fe,
-+      const FiniteElement &test_fe) const
-    {
-       return (trial_fe.GetDim() == 2 && test_fe.GetDim() == 2 &&
-               trial_fe.GetRangeType() == mfem::FiniteElement::SCALAR &&
-@@ -1592,19 +1591,19 @@ public:
-               test_fe.GetRangeType()  == mfem::FiniteElement::SCALAR );
-    }
- 
--   inline virtual const char * FiniteElementTypeFailureMessage() const
-+   inline virtual const char *FiniteElementTypeFailureMessage() const
-    {
-       return "MixedScalarCrossGradIntegrator:  "
-              "Trial space must be a scalar field in 2D with a gradient "
-              "and the test space must be a scalar field";
-    }
- 
--   inline int GetVDim(const FiniteElement & vector_fe)
-+   inline int GetVDim(const FiniteElement &vector_fe)
-    { return space_dim; }
- 
--   inline virtual void CalcVShape(const FiniteElement & vector_fe,
-+   inline virtual void CalcVShape(const FiniteElement &vector_fe,
-                                   ElementTransformation &Trans,
--                                  DenseMatrix & shape)
-+                                  DenseMatrix &shape)
-    { vector_fe.CalcPhysDShape(Trans, shape); }
- };
- 
-@@ -1617,15 +1616,15 @@ public:
-       : MixedScalarVectorIntegrator(vq, true, true) {}
- 
-    inline virtual bool VerifyFiniteElementTypes(
--      const FiniteElement & trial_fe,
--      const FiniteElement & test_fe) const
-+      const FiniteElement &trial_fe,
-+      const FiniteElement &test_fe) const
-    {
-       return (trial_fe.GetDim() == 2 && test_fe.GetDim() == 2 &&
-               trial_fe.GetRangeType() == mfem::FiniteElement::VECTOR &&
-               test_fe.GetRangeType()  == mfem::FiniteElement::SCALAR );
-    }
- 
--   inline virtual const char * FiniteElementTypeFailureMessage() const
-+   inline virtual const char *FiniteElementTypeFailureMessage() const
-    {
-       return "MixedScalarCrossProductIntegrator:  "
-              "Trial space must be a vector field in 2D "
-@@ -1642,24 +1641,24 @@ public:
-       : MixedScalarVectorIntegrator(vq, false, true) {}
- 
-    inline virtual bool VerifyFiniteElementTypes(
--      const FiniteElement & trial_fe,
--      const FiniteElement & test_fe) const
-+      const FiniteElement &trial_fe,
-+      const FiniteElement &test_fe) const
-    {
-       return (trial_fe.GetDim() == 2 && test_fe.GetDim() == 2 &&
-               trial_fe.GetRangeType() == mfem::FiniteElement::SCALAR &&
-               test_fe.GetRangeType()  == mfem::FiniteElement::VECTOR );
-    }
- 
--   inline virtual const char * FiniteElementTypeFailureMessage() const
-+   inline virtual const char *FiniteElementTypeFailureMessage() const
-    {
-       return "MixedScalarWeakCrossProductIntegrator:  "
-              "Trial space must be a scalar field in 2D "
-              "and the test space must be a vector field";
-    }
- 
--   inline virtual void CalcShape(const FiniteElement & scalar_fe,
-+   inline virtual void CalcShape(const FiniteElement &scalar_fe,
-                                  ElementTransformation &Trans,
--                                 Vector & shape)
-+                                 Vector &shape)
-    { scalar_fe.CalcPhysShape(Trans, shape); shape *= -1.0; }
- };
- 
-@@ -1672,27 +1671,27 @@ public:
-       : MixedScalarVectorIntegrator(vq, true) {}
- 
-    inline virtual bool VerifyFiniteElementTypes(
--      const FiniteElement & trial_fe,
--      const FiniteElement & test_fe) const
-+      const FiniteElement &trial_fe,
-+      const FiniteElement &test_fe) const
-    {
-       return (trial_fe.GetRangeType() == mfem::FiniteElement::SCALAR &&
-               trial_fe.GetDerivType() == mfem::FiniteElement::GRAD   &&
-               test_fe.GetRangeType()  == mfem::FiniteElement::SCALAR );
-    }
- 
--   inline virtual const char * FiniteElementTypeFailureMessage() const
-+   inline virtual const char *FiniteElementTypeFailureMessage() const
-    {
-       return "MixedDirectionalDerivativeIntegrator:  "
-              "Trial space must be a scalar field with a gradient "
-              "and the test space must be a scalar field";
-    }
- 
--   inline virtual int GetVDim(const FiniteElement & vector_fe)
-+   inline virtual int GetVDim(const FiniteElement &vector_fe)
-    { return space_dim; }
- 
--   inline virtual void CalcVShape(const FiniteElement & vector_fe,
-+   inline virtual void CalcVShape(const FiniteElement &vector_fe,
-                                   ElementTransformation &Trans,
--                                  DenseMatrix & shape)
-+                                  DenseMatrix &shape)
-    { vector_fe.CalcPhysDShape(Trans, shape); }
- };
- 
-@@ -1705,8 +1704,8 @@ public:
-       : MixedScalarVectorIntegrator(vq, true) {}
- 
-    inline virtual bool VerifyFiniteElementTypes(
--      const FiniteElement & trial_fe,
--      const FiniteElement & test_fe) const
-+      const FiniteElement &trial_fe,
-+      const FiniteElement &test_fe) const
-    {
-       return (trial_fe.GetRangeType() == mfem::FiniteElement::SCALAR &&
-               trial_fe.GetDerivType() == mfem::FiniteElement::GRAD   &&
-@@ -1714,24 +1713,24 @@ public:
-               test_fe.GetDerivType()  == mfem::FiniteElement::DIV   );
-    }
- 
--   inline virtual const char * FiniteElementTypeFailureMessage() const
-+   inline virtual const char *FiniteElementTypeFailureMessage() const
-    {
-       return "MixedGradDivIntegrator:  "
-              "Trial space must be a scalar field with a gradient"
-              "and the test space must be a vector field with a divergence";
-    }
- 
--   inline virtual int GetVDim(const FiniteElement & vector_fe)
-+   inline virtual int GetVDim(const FiniteElement &vector_fe)
-    { return space_dim; }
- 
--   inline virtual void CalcVShape(const FiniteElement & vector_fe,
-+   inline virtual void CalcVShape(const FiniteElement &vector_fe,
-                                   ElementTransformation &Trans,
--                                  DenseMatrix & shape)
-+                                  DenseMatrix &shape)
-    { vector_fe.CalcPhysDShape(Trans, shape); shape *= -1.0; }
- 
--   inline virtual void CalcShape(const FiniteElement & scalar_fe,
-+   inline virtual void CalcShape(const FiniteElement &scalar_fe,
-                                  ElementTransformation &Trans,
--                                 Vector & shape)
-+                                 Vector &shape)
-    { scalar_fe.CalcPhysDivShape(Trans, shape); }
- };
- 
-@@ -1744,8 +1743,8 @@ public:
-       : MixedScalarVectorIntegrator(vq, false) {}
- 
-    inline virtual bool VerifyFiniteElementTypes(
--      const FiniteElement & trial_fe,
--      const FiniteElement & test_fe) const
-+      const FiniteElement &trial_fe,
-+      const FiniteElement &test_fe) const
-    {
-       return (trial_fe.GetRangeType() == mfem::FiniteElement::VECTOR &&
-               trial_fe.GetDerivType() == mfem::FiniteElement::DIV    &&
-@@ -1754,24 +1753,24 @@ public:
-              );
-    }
- 
--   inline virtual const char * FiniteElementTypeFailureMessage() const
-+   inline virtual const char *FiniteElementTypeFailureMessage() const
-    {
-       return "MixedDivGradIntegrator:  "
-              "Trial space must be a vector field with a divergence"
-              "and the test space must be a scalar field with a gradient";
-    }
- 
--   inline virtual int GetVDim(const FiniteElement & vector_fe)
-+   inline virtual int GetVDim(const FiniteElement &vector_fe)
-    { return space_dim; }
- 
--   inline virtual void CalcVShape(const FiniteElement & vector_fe,
-+   inline virtual void CalcVShape(const FiniteElement &vector_fe,
-                                   ElementTransformation &Trans,
--                                  DenseMatrix & shape)
-+                                  DenseMatrix &shape)
-    { vector_fe.CalcPhysDShape(Trans, shape); shape *= -1.0; }
- 
--   inline virtual void CalcShape(const FiniteElement & scalar_fe,
-+   inline virtual void CalcShape(const FiniteElement &scalar_fe,
-                                  ElementTransformation &Trans,
--                                 Vector & shape)
-+                                 Vector &shape)
-    { scalar_fe.CalcPhysDivShape(Trans, shape); }
- };
- 
-@@ -1784,27 +1783,27 @@ public:
-       : MixedScalarVectorIntegrator(vq, false) {}
- 
-    inline virtual bool VerifyFiniteElementTypes(
--      const FiniteElement & trial_fe,
--      const FiniteElement & test_fe) const
-+      const FiniteElement &trial_fe,
-+      const FiniteElement &test_fe) const
-    {
-       return (trial_fe.GetRangeType() == mfem::FiniteElement::SCALAR &&
-               test_fe.GetRangeType()  == mfem::FiniteElement::SCALAR &&
-               test_fe.GetDerivType()  == mfem::FiniteElement::GRAD   );
-    }
- 
--   inline virtual const char * FiniteElementTypeFailureMessage() const
-+   inline virtual const char *FiniteElementTypeFailureMessage() const
-    {
-       return "MixedScalarWeakDivergenceIntegrator:  "
-              "Trial space must be a scalar field "
-              "and the test space must be a scalar field with a gradient";
-    }
- 
--   inline int GetVDim(const FiniteElement & vector_fe)
-+   inline int GetVDim(const FiniteElement &vector_fe)
-    { return space_dim; }
- 
--   inline virtual void CalcVShape(const FiniteElement & vector_fe,
-+   inline virtual void CalcVShape(const FiniteElement &vector_fe,
-                                   ElementTransformation &Trans,
--                                  DenseMatrix & shape)
-+                                  DenseMatrix &shape)
-    { vector_fe.CalcPhysDShape(Trans, shape); shape *= -1.0; }
- };
- 
-@@ -1825,40 +1824,40 @@ public:
-    MixedVectorGradientIntegrator(MatrixCoefficient &mq)
-       : MixedVectorIntegrator(mq) {}
- 
-+   using BilinearFormIntegrator::AssemblePA;
-+   virtual void AssemblePA(const FiniteElementSpace &trial_fes,
-+                           const FiniteElementSpace &test_fes);
-+
-+   virtual void AddMultPA(const Vector &x, Vector &y) const;
-+
-+   virtual void AddMultTransposePA(const Vector &x, Vector &y) const;
-+
- protected:
-    inline virtual bool VerifyFiniteElementTypes(
--      const FiniteElement & trial_fe,
--      const FiniteElement & test_fe) const
-+      const FiniteElement &trial_fe,
-+      const FiniteElement &test_fe) const
-    {
-       return (trial_fe.GetDerivType() == mfem::FiniteElement::GRAD &&
-               test_fe.GetRangeType()  == mfem::FiniteElement::VECTOR );
-    }
- 
--   inline virtual const char * FiniteElementTypeFailureMessage() const
-+   inline virtual const char *FiniteElementTypeFailureMessage() const
-    {
-       return "MixedVectorGradientIntegrator:  "
-              "Trial spaces must be H1 and the test space must be a "
-              "vector field in 2D or 3D";
-    }
- 
--   inline virtual int GetTrialVDim(const FiniteElement & trial_fe)
-+   inline virtual int GetTrialVDim(const FiniteElement &trial_fe)
-    { return space_dim; }
- 
--   inline virtual void CalcTrialShape(const FiniteElement & trial_fe,
-+   inline virtual void CalcTrialShape(const FiniteElement &trial_fe,
-                                       ElementTransformation &Trans,
--                                      DenseMatrix & shape)
-+                                      DenseMatrix &shape)
-    {
-       trial_fe.CalcPhysDShape(Trans, shape);
-    }
- 
--   using BilinearFormIntegrator::AssemblePA;
--   virtual void AssemblePA(const FiniteElementSpace &trial_fes,
--                           const FiniteElementSpace &test_fes);
--
--   virtual void AddMultPA(const Vector&, Vector&) const;
--   virtual void AddMultTransposePA(const Vector&, Vector&) const;
--
--private:
-    DenseMatrix Jinv;
- 
-    // PA extension
-@@ -1883,40 +1882,41 @@ public:
-    MixedVectorCurlIntegrator(MatrixCoefficient &mq)
-       : MixedVectorIntegrator(mq) {}
- 
-+   using BilinearFormIntegrator::AssemblePA;
-+   virtual void AssemblePA(const FiniteElementSpace &trial_fes,
-+                           const FiniteElementSpace &test_fes);
-+
-+   virtual void AddMultPA(const Vector &x, Vector &y) const;
-+
-+   virtual void AddMultTransposePA(const Vector &x, Vector &y) const;
-+
- protected:
-    inline virtual bool VerifyFiniteElementTypes(
--      const FiniteElement & trial_fe,
--      const FiniteElement & test_fe) const
-+      const FiniteElement &trial_fe,
-+      const FiniteElement &test_fe) const
-    {
-       return (trial_fe.GetCurlDim() == 3 && test_fe.GetVDim() == 3 &&
-               trial_fe.GetDerivType() == mfem::FiniteElement::CURL  &&
-               test_fe.GetRangeType()  == mfem::FiniteElement::VECTOR );
-    }
- 
--   inline virtual const char * FiniteElementTypeFailureMessage() const
-+   inline virtual const char *FiniteElementTypeFailureMessage() const
-    {
-       return "MixedVectorCurlIntegrator:  "
-              "Trial space must be H(Curl) and the test space must be a "
-              "vector field in 3D";
-    }
- 
--   inline virtual int GetTrialVDim(const FiniteElement & trial_fe)
-+   inline virtual int GetTrialVDim(const FiniteElement &trial_fe)
-    { return trial_fe.GetCurlDim(); }
- 
--   inline virtual void CalcTrialShape(const FiniteElement & trial_fe,
-+   inline virtual void CalcTrialShape(const FiniteElement &trial_fe,
-                                       ElementTransformation &Trans,
--                                      DenseMatrix & shape)
-+                                      DenseMatrix &shape)
-    {
-       trial_fe.CalcPhysCurlShape(Trans, shape);
-    }
- 
--   using BilinearFormIntegrator::AssemblePA;
--   virtual void AssemblePA(const FiniteElementSpace &trial_fes,
--                           const FiniteElementSpace &test_fes);
--
--   virtual void AddMultPA(const Vector&, Vector&) const;
--   virtual void AddMultTransposePA(const Vector&, Vector&) const;
--
- private:
-    // PA extension
-    Vector pa_data;
-@@ -1942,40 +1942,41 @@ public:
-    MixedVectorWeakCurlIntegrator(MatrixCoefficient &mq)
-       : MixedVectorIntegrator(mq) {}
- 
-+   using BilinearFormIntegrator::AssemblePA;
-+   virtual void AssemblePA(const FiniteElementSpace &trial_fes,
-+                           const FiniteElementSpace &test_fes);
-+
-+   virtual void AddMultPA(const Vector &x, Vector &y) const;
-+
-+   virtual void AddMultTransposePA(const Vector &x, Vector &y) const;
-+
- protected:
-    inline virtual bool VerifyFiniteElementTypes(
--      const FiniteElement & trial_fe,
--      const FiniteElement & test_fe) const
-+      const FiniteElement &trial_fe,
-+      const FiniteElement &test_fe) const
-    {
-       return (trial_fe.GetVDim() == 3 && test_fe.GetCurlDim() == 3 &&
-               trial_fe.GetRangeType() == mfem::FiniteElement::VECTOR &&
-               test_fe.GetDerivType()  == mfem::FiniteElement::CURL );
-    }
- 
--   inline virtual const char * FiniteElementTypeFailureMessage() const
-+   inline virtual const char *FiniteElementTypeFailureMessage() const
-    {
-       return "MixedVectorWeakCurlIntegrator:  "
-              "Trial space must be vector field in 3D and the "
-              "test space must be H(Curl)";
-    }
- 
--   inline virtual int GetTestVDim(const FiniteElement & test_fe)
-+   inline virtual int GetTestVDim(const FiniteElement &test_fe)
-    { return test_fe.GetCurlDim(); }
- 
--   inline virtual void CalcTestShape(const FiniteElement & test_fe,
-+   inline virtual void CalcTestShape(const FiniteElement &test_fe,
-                                      ElementTransformation &Trans,
--                                     DenseMatrix & shape)
-+                                     DenseMatrix &shape)
-    {
-       test_fe.CalcPhysCurlShape(Trans, shape);
-    }
- 
--   using BilinearFormIntegrator::AssemblePA;
--   virtual void AssemblePA(const FiniteElementSpace &trial_fes,
--                           const FiniteElementSpace &test_fes);
--
--   virtual void AddMultPA(const Vector&, Vector&) const;
--   virtual void AddMultTransposePA(const Vector&, Vector&) const;
--
- private:
-    // PA extension
-    Vector pa_data;
-@@ -2001,26 +2002,26 @@ public:
- 
- protected:
-    inline virtual bool VerifyFiniteElementTypes(
--      const FiniteElement & trial_fe,
--      const FiniteElement & test_fe) const
-+      const FiniteElement &trial_fe,
-+      const FiniteElement &test_fe) const
-    {
-       return (trial_fe.GetRangeType() == mfem::FiniteElement::VECTOR &&
-               test_fe.GetDerivType()  == mfem::FiniteElement::GRAD );
-    }
- 
--   inline virtual const char * FiniteElementTypeFailureMessage() const
-+   inline virtual const char *FiniteElementTypeFailureMessage() const
-    {
-       return "MixedVectorWeakDivergenceIntegrator:  "
-              "Trial space must be vector field and the "
-              "test space must be H1";
-    }
- 
--   inline virtual int GetTestVDim(const FiniteElement & test_fe)
-+   inline virtual int GetTestVDim(const FiniteElement &test_fe)
-    { return space_dim; }
- 
--   inline virtual void CalcTestShape(const FiniteElement & test_fe,
-+   inline virtual void CalcTestShape(const FiniteElement &test_fe,
-                                      ElementTransformation &Trans,
--                                     DenseMatrix & shape)
-+                                     DenseMatrix &shape)
-    {
-       test_fe.CalcPhysDShape(Trans, shape);
-       shape *= -1.0;
-@@ -2043,6 +2044,7 @@ private:
-    DenseMatrix gshape;
-    DenseMatrix Jadj;
-    DenseMatrix elmat_comp;
-+
-    // PA extension
-    Vector pa_data;
-    const DofToQuad *trial_maps, *test_maps; ///< Not owned
-@@ -2053,13 +2055,13 @@ private:
- public:
-    GradientIntegrator() :
-       Q{NULL}, trial_maps{NULL}, test_maps{NULL}, geom{NULL}
--   { }
-+   {}
-    GradientIntegrator(Coefficient *q_) :
-       Q{q_}, trial_maps{NULL}, test_maps{NULL}, geom{NULL}
--   { }
-+   {}
-    GradientIntegrator(Coefficient &q) :
-       Q{&q}, trial_maps{NULL}, test_maps{NULL}, geom{NULL}
--   { }
-+   {}
- 
-    virtual void AssembleElementMatrix2(const FiniteElement &trial_fe,
-                                        const FiniteElement &test_fe,
-@@ -2071,6 +2073,7 @@ public:
-                            const FiniteElementSpace &test_fes);
- 
-    virtual void AddMultPA(const Vector &x, Vector &y) const;
-+
-    virtual void AddMultTransposePA(const Vector &x, Vector &y) const;
- 
-    static const IntegrationRule &GetRule(const FiniteElement &trial_fe,
-@@ -2096,7 +2099,6 @@ private:
- #endif
- 
-    // PA extension
--   const FiniteElementSpace *fespace;
-    const DofToQuad *maps;         ///< Not owned
-    const GeometricFactors *geom;  ///< Not owned
-    int dim, ne, dofs1D, quad1D;
-@@ -2107,30 +2109,31 @@ public:
-    /// Construct a diffusion integrator with coefficient Q = 1
-    DiffusionIntegrator(const IntegrationRule *ir = nullptr)
-       : BilinearFormIntegrator(ir),
--        Q(NULL), VQ(NULL), MQ(NULL), maps(NULL), geom(NULL) { }
-+        Q(NULL), VQ(NULL), MQ(NULL), maps(NULL), geom(NULL) {}
- 
-    /// Construct a diffusion integrator with a scalar coefficient q
-    DiffusionIntegrator(Coefficient &q, const IntegrationRule *ir = nullptr)
-       : BilinearFormIntegrator(ir),
--        Q(&q), VQ(NULL), MQ(NULL), maps(NULL), geom(NULL) { }
-+        Q(&q), VQ(NULL), MQ(NULL), maps(NULL), geom(NULL) {}
- 
-    /// Construct a diffusion integrator with a vector coefficient q
-    DiffusionIntegrator(VectorCoefficient &q,
-                        const IntegrationRule *ir = nullptr)
-       : BilinearFormIntegrator(ir),
--        Q(NULL), VQ(&q), MQ(NULL), maps(NULL), geom(NULL) { }
-+        Q(NULL), VQ(&q), MQ(NULL), maps(NULL), geom(NULL) {}
- 
-    /// Construct a diffusion integrator with a matrix coefficient q
-    DiffusionIntegrator(MatrixCoefficient &q,
-                        const IntegrationRule *ir = nullptr)
-       : BilinearFormIntegrator(ir),
--        Q(NULL), VQ(NULL), MQ(&q), maps(NULL), geom(NULL) { }
-+        Q(NULL), VQ(NULL), MQ(&q), maps(NULL), geom(NULL) {}
- 
-    /** Given a particular Finite Element computes the element stiffness matrix
-        elmat. */
-    virtual void AssembleElementMatrix(const FiniteElement &el,
-                                       ElementTransformation &Trans,
-                                       DenseMatrix &elmat);
-+
-    /** Given a trial and test Finite Element computes the element stiffness
-        matrix elmat. */
-    virtual void AssembleElementMatrix2(const FiniteElement &trial_fe,
-@@ -2154,23 +2157,23 @@ public:
-                                     Vector &flux, Vector *d_energy = NULL);
- 
-    using BilinearFormIntegrator::AssemblePA;
-+   virtual void AssemblePA(const FiniteElementSpace &fes);
- 
--   virtual void AssembleMF(const FiniteElementSpace &fes);
-+   virtual void AssembleDiagonalPA(Vector &diag);
- 
--   virtual void AssemblePA(const FiniteElementSpace &fes);
-+   virtual void AddMultPA(const Vector &x, Vector &y) const;
- 
--   virtual void AssembleEA(const FiniteElementSpace &fes, Vector &emat,
--                           const bool add);
-+   virtual void AddMultTransposePA(const Vector &x, Vector &y) const;
- 
--   virtual void AssembleDiagonalPA(Vector &diag);
-+   using BilinearFormIntegrator::AssembleMF;
-+   virtual void AssembleMF(const FiniteElementSpace &fes);
- 
-    virtual void AssembleDiagonalMF(Vector &diag);
- 
--   virtual void AddMultMF(const Vector&, Vector&) const;
--
--   virtual void AddMultPA(const Vector&, Vector&) const;
-+   virtual void AddMultMF(const Vector &x, Vector &y) const;
- 
--   virtual void AddMultTransposePA(const Vector&, Vector&) const;
-+   using BilinearFormIntegrator::AssembleEA;
-+   virtual void AssembleEA(const FiniteElementSpace &fes, Vector &emat);
- 
-    static const IntegrationRule &GetRule(const FiniteElement &trial_fe,
-                                          const FiniteElement &test_fe);
-@@ -2183,55 +2186,62 @@ public:
- /** Class for local mass matrix assembling a(u,v) := (Q u, v) */
- class MassIntegrator: public BilinearFormIntegrator
- {
-+private:
-    friend class DGMassInverse;
-+
- protected:
- #ifndef MFEM_THREAD_SAFE
-    Vector shape, te_shape;
- #endif
-    Coefficient *Q;
-+
-    // PA extension
--   const FiniteElementSpace *fespace;
-    Vector pa_data;
--   const DofToQuad *maps;         ///< Not owned
--   const GeometricFactors *geom;  ///< Not owned
-+   const DofToQuad *maps;                 ///< Not owned
-+   const GeometricFactors *geom;          ///< Not owned
-+   const FaceGeometricFactors *face_geom; ///< Not owned
-    int dim, ne, nq, dofs1D, quad1D;
- 
- public:
-    MassIntegrator(const IntegrationRule *ir = NULL)
--      : BilinearFormIntegrator(ir), Q(NULL), maps(NULL), geom(NULL) { }
-+      : BilinearFormIntegrator(ir), Q(NULL), maps(NULL), geom(NULL) {}
- 
-    /// Construct a mass integrator with coefficient q
-    MassIntegrator(Coefficient &q, const IntegrationRule *ir = NULL)
--      : BilinearFormIntegrator(ir), Q(&q), maps(NULL), geom(NULL) { }
-+      : BilinearFormIntegrator(ir), Q(&q), maps(NULL), geom(NULL) {}
- 
-    /** Given a particular Finite Element computes the element mass matrix
-        elmat. */
-    virtual void AssembleElementMatrix(const FiniteElement &el,
-                                       ElementTransformation &Trans,
-                                       DenseMatrix &elmat);
-+
-    virtual void AssembleElementMatrix2(const FiniteElement &trial_fe,
-                                        const FiniteElement &test_fe,
-                                        ElementTransformation &Trans,
-                                        DenseMatrix &elmat);
- 
-    using BilinearFormIntegrator::AssemblePA;
--
--   virtual void AssembleMF(const FiniteElementSpace &fes);
--
-    virtual void AssemblePA(const FiniteElementSpace &fes);
- 
--   virtual void AssembleEA(const FiniteElementSpace &fes, Vector &emat,
--                           const bool add);
-+   using BilinearFormIntegrator::AssemblePABoundary;
-+   virtual void AssemblePABoundary(const FiniteElementSpace &fes);
- 
-    virtual void AssembleDiagonalPA(Vector &diag);
- 
--   virtual void AssembleDiagonalMF(Vector &diag);
-+   virtual void AddMultPA(const Vector &x, Vector &y) const;
- 
--   virtual void AddMultMF(const Vector&, Vector&) const;
-+   virtual void AddMultTransposePA(const Vector &x, Vector &y) const;
- 
--   virtual void AddMultPA(const Vector&, Vector&) const;
-+   using BilinearFormIntegrator::AssembleMF;
-+   virtual void AssembleMF(const FiniteElementSpace &fes);
- 
--   virtual void AddMultTransposePA(const Vector&, Vector&) const;
-+   virtual void AssembleDiagonalMF(Vector &diag);
-+
-+   virtual void AddMultMF(const Vector &x, Vector &y) const;
-+
-+   using BilinearFormIntegrator::AssembleEA;
-+   virtual void AssembleEA(const FiniteElementSpace &fes, Vector &emat);
- 
-    static const IntegrationRule &GetRule(const FiniteElement &trial_fe,
-                                          const FiniteElement &test_fe,
-@@ -2246,7 +2256,7 @@ public:
- class BoundaryMassIntegrator : public MassIntegrator
- {
- public:
--   BoundaryMassIntegrator(Coefficient &q) : MassIntegrator(q) { }
-+   BoundaryMassIntegrator(Coefficient &q) : MassIntegrator(q) {}
- 
-    using BilinearFormIntegrator::AssembleFaceMatrix;
- 
-@@ -2262,6 +2272,7 @@ class ConvectionIntegrator : public BilinearFormIntegrator
- protected:
-    VectorCoefficient *Q;
-    double alpha;
-+
-    // PA extension
-    Vector pa_data;
-    const DofToQuad *maps;         ///< Not owned
-@@ -2278,34 +2289,30 @@ public:
-    ConvectionIntegrator(VectorCoefficient &q, double a = 1.0)
-       : Q(&q) { alpha = a; }
- 
--   virtual void AssembleElementMatrix(const FiniteElement &,
--                                      ElementTransformation &,
--                                      DenseMatrix &);
-+   virtual void AssembleElementMatrix(const FiniteElement &fes,
-+                                      ElementTransformation &Trans,
-+                                      DenseMatrix &elmat);
- 
-    using BilinearFormIntegrator::AssemblePA;
-+   virtual void AssemblePA(const FiniteElementSpace &fes);
- 
--   virtual void AssembleMF(const FiniteElementSpace &fes);
-+   virtual void AssembleDiagonalPA(Vector &diag);
- 
--   virtual void AssemblePA(const FiniteElementSpace&);
-+   virtual void AddMultPA(const Vector &x, Vector &y) const;
- 
--   virtual void AssembleEA(const FiniteElementSpace &fes, Vector &emat,
--                           const bool add);
-+   virtual void AddMultTransposePA(const Vector &x, Vector &y) const;
- 
--   virtual void AssembleDiagonalPA(Vector &diag);
-+   using BilinearFormIntegrator::AssembleMF;
-+   virtual void AssembleMF(const FiniteElementSpace &fes);
- 
-    virtual void AssembleDiagonalMF(Vector &diag);
- 
--   virtual void AddMultMF(const Vector&, Vector&) const;
--
--   virtual void AddMultPA(const Vector&, Vector&) const;
--
--   virtual void AddMultTransposePA(const Vector &x, Vector &y) const;
-+   virtual void AddMultMF(const Vector &x, Vector &y) const;
- 
--   static const IntegrationRule &GetRule(const FiniteElement &el,
--                                         ElementTransformation &Trans);
-+   using BilinearFormIntegrator::AssembleEA;
-+   virtual void AssembleEA(const FiniteElementSpace &fes, Vector &emat);
- 
--   static const IntegrationRule &GetRule(const FiniteElement &trial_fe,
--                                         const FiniteElement &test_fe,
-+   static const IntegrationRule &GetRule(const FiniteElement &fe,
-                                          ElementTransformation &Trans);
- 
-    bool SupportsCeed() const { return DeviceCanUseCeed(); }
-@@ -2319,7 +2326,7 @@ class ConservativeConvectionIntegrator : public TransposeIntegrator
- {
- public:
-    ConservativeConvectionIntegrator(VectorCoefficient &q, double a = 1.0)
--      : TransposeIntegrator(new ConvectionIntegrator(q, -a)) { }
-+      : TransposeIntegrator(new ConvectionIntegrator(q, -a)) {}
- };
- 
- /// alpha (q . grad u, v) using the "group" FE discretization
-@@ -2336,6 +2343,7 @@ private:
- public:
-    GroupConvectionIntegrator(VectorCoefficient &q, double a = 1.0)
-       : Q(&q) { alpha = a; }
-+
-    virtual void AssembleElementMatrix(const FiniteElement &,
-                                       ElementTransformation &,
-                                       DenseMatrix &);
-@@ -2357,6 +2365,7 @@ protected:
-    Coefficient *Q;
-    VectorCoefficient *VQ;
-    MatrixCoefficient *MQ;
-+
-    // PA extension
-    Vector pa_data;
-    const DofToQuad *maps;         ///< Not owned
-@@ -2366,21 +2375,21 @@ protected:
- public:
-    /// Construct an integrator with coefficient 1.0
-    VectorMassIntegrator()
--      : vdim(-1), Q_order(0), Q(NULL), VQ(NULL), MQ(NULL) { }
-+      : vdim(-1), Q_order(0), Q(NULL), VQ(NULL), MQ(NULL) {}
-    /** Construct an integrator with scalar coefficient q.  If possible, save
-        memory by using a scalar integrator since the resulting matrix is block
-        diagonal with the same diagonal block repeated. */
-    VectorMassIntegrator(Coefficient &q, int qo = 0)
--      : vdim(-1), Q_order(qo), Q(&q), VQ(NULL), MQ(NULL) { }
-+      : vdim(-1), Q_order(qo), Q(&q), VQ(NULL), MQ(NULL) {}
-    VectorMassIntegrator(Coefficient &q, const IntegrationRule *ir)
-       : BilinearFormIntegrator(ir), vdim(-1), Q_order(0), Q(&q), VQ(NULL),
--        MQ(NULL) { }
-+        MQ(NULL) {}
-    /// Construct an integrator with diagonal coefficient q
-    VectorMassIntegrator(VectorCoefficient &q, int qo = 0)
--      : vdim(q.GetVDim()), Q_order(qo), Q(NULL), VQ(&q), MQ(NULL) { }
-+      : vdim(q.GetVDim()), Q_order(qo), Q(NULL), VQ(&q), MQ(NULL) {}
-    /// Construct an integrator with matrix coefficient q
-    VectorMassIntegrator(MatrixCoefficient &q, int qo = 0)
--      : vdim(q.GetVDim()), Q_order(qo), Q(NULL), VQ(NULL), MQ(&q) { }
-+      : vdim(q.GetVDim()), Q_order(qo), Q(NULL), VQ(NULL), MQ(&q) {}
- 
-    int GetVDim() const { return vdim; }
-    void SetVDim(int vdim_) { vdim = vdim_; }
-@@ -2388,21 +2397,29 @@ public:
-    virtual void AssembleElementMatrix(const FiniteElement &el,
-                                       ElementTransformation &Trans,
-                                       DenseMatrix &elmat);
-+
-    virtual void AssembleElementMatrix2(const FiniteElement &trial_fe,
-                                        const FiniteElement &test_fe,
-                                        ElementTransformation &Trans,
-                                        DenseMatrix &elmat);
-+
-    using BilinearFormIntegrator::AssemblePA;
-    virtual void AssemblePA(const FiniteElementSpace &fes);
--   virtual void AssembleMF(const FiniteElementSpace &fes);
-+
-    virtual void AssembleDiagonalPA(Vector &diag);
--   virtual void AssembleDiagonalMF(Vector &diag);
-+
-    virtual void AddMultPA(const Vector &x, Vector &y) const;
-+
-+   using BilinearFormIntegrator::AssembleMF;
-+   virtual void AssembleMF(const FiniteElementSpace &fes);
-+
-+   virtual void AssembleDiagonalMF(Vector &diag);
-+
-    virtual void AddMultMF(const Vector &x, Vector &y) const;
-+
-    bool SupportsCeed() const { return DeviceCanUseCeed(); }
- };
- 
--
- /** Class for integrating (div u, p) where u is a vector field given by
-     VectorFiniteElement through Piola transformation (for RT elements); p is
-     scalar function given by FiniteElement through standard transformation.
-@@ -2416,13 +2433,6 @@ class VectorFEDivergenceIntegrator : public BilinearFormIntegrator
- protected:
-    Coefficient *Q;
- 
--   using BilinearFormIntegrator::AssemblePA;
--   virtual void AssemblePA(const FiniteElementSpace &trial_fes,
--                           const FiniteElementSpace &test_fes);
--
--   virtual void AddMultPA(const Vector&, Vector&) const;
--   virtual void AddMultTransposePA(const Vector&, Vector&) const;
--
- private:
- #ifndef MFEM_THREAD_SAFE
-    Vector divshape, shape;
-@@ -2438,17 +2448,26 @@ private:
- public:
-    VectorFEDivergenceIntegrator() { Q = NULL; }
-    VectorFEDivergenceIntegrator(Coefficient &q) { Q = &q; }
-+
-    virtual void AssembleElementMatrix(const FiniteElement &el,
-                                       ElementTransformation &Trans,
--                                      DenseMatrix &elmat) { }
-+                                      DenseMatrix &elmat) {}
-+
-    virtual void AssembleElementMatrix2(const FiniteElement &trial_fe,
-                                        const FiniteElement &test_fe,
-                                        ElementTransformation &Trans,
-                                        DenseMatrix &elmat);
- 
-+   using BilinearFormIntegrator::AssemblePA;
-+   virtual void AssemblePA(const FiniteElementSpace &trial_fes,
-+                           const FiniteElementSpace &test_fes);
-+
-    virtual void AssembleDiagonalPA_ADAt(const Vector &D, Vector &diag);
--};
- 
-+   virtual void AddMultPA(const Vector &x, Vector &y) const;
-+
-+   virtual void AddMultTransposePA(const Vector &x, Vector &y) const;
-+};
- 
- /** Integrator for `(-Q u, grad v)` for Nedelec (`u`) and H1 (`v`) elements.
-     This is equivalent to a weak divergence of the Nedelec basis functions. */
-@@ -2468,9 +2487,11 @@ private:
- public:
-    VectorFEWeakDivergenceIntegrator() { Q = NULL; }
-    VectorFEWeakDivergenceIntegrator(Coefficient &q) { Q = &q; }
-+
-    virtual void AssembleElementMatrix(const FiniteElement &el,
-                                       ElementTransformation &Trans,
--                                      DenseMatrix &elmat) { }
-+                                      DenseMatrix &elmat) {}
-+
-    virtual void AssembleElementMatrix2(const FiniteElement &trial_fe,
-                                        const FiniteElement &test_fe,
-                                        ElementTransformation &Trans,
-@@ -2494,9 +2515,11 @@ private:
- public:
-    VectorFECurlIntegrator() { Q = NULL; }
-    VectorFECurlIntegrator(Coefficient &q) { Q = &q; }
-+
-    virtual void AssembleElementMatrix(const FiniteElement &el,
-                                       ElementTransformation &Trans,
--                                      DenseMatrix &elmat) { }
-+                                      DenseMatrix &elmat) {}
-+
-    virtual void AssembleElementMatrix2(const FiniteElement &trial_fe,
-                                        const FiniteElement &test_fe,
-                                        ElementTransformation &Trans,
-@@ -2515,11 +2538,13 @@ private:
-    Vector shape, dshapedxi;
- 
- public:
--   DerivativeIntegrator(Coefficient &q, int i) : Q(&q), xi(i) { }
-+   DerivativeIntegrator(Coefficient &q, int i) : Q(&q), xi(i) {}
-+
-    virtual void AssembleElementMatrix(const FiniteElement &el,
-                                       ElementTransformation &Trans,
-                                       DenseMatrix &elmat)
-    { AssembleElementMatrix2(el,el,Trans,elmat); }
-+
-    virtual void AssembleElementMatrix2(const FiniteElement &trial_fe,
-                                        const FiniteElement &test_fe,
-                                        ElementTransformation &Trans,
-@@ -2555,12 +2580,12 @@ public:
-    CurlCurlIntegrator() { Q = NULL; DQ = NULL; MQ = NULL; }
-    /// Construct a bilinear form integrator for Nedelec elements
-    CurlCurlIntegrator(Coefficient &q, const IntegrationRule *ir = NULL) :
--      BilinearFormIntegrator(ir), Q(&q), DQ(NULL), MQ(NULL) { }
-+      BilinearFormIntegrator(ir), Q(&q), DQ(NULL), MQ(NULL) {}
-    CurlCurlIntegrator(DiagonalMatrixCoefficient &dq,
-                       const IntegrationRule *ir = NULL) :
--      BilinearFormIntegrator(ir), Q(NULL), DQ(&dq), MQ(NULL) { }
-+      BilinearFormIntegrator(ir), Q(NULL), DQ(&dq), MQ(NULL) {}
-    CurlCurlIntegrator(MatrixCoefficient &mq, const IntegrationRule *ir = NULL) :
--      BilinearFormIntegrator(ir), Q(NULL), DQ(NULL), MQ(&mq) { }
-+      BilinearFormIntegrator(ir), Q(NULL), DQ(NULL), MQ(&mq) {}
- 
-    /* Given a particular Finite Element, compute the
-       element curl-curl matrix elmat */
-@@ -2585,8 +2610,10 @@ public:
- 
-    using BilinearFormIntegrator::AssemblePA;
-    virtual void AssemblePA(const FiniteElementSpace &fes);
-+
-+   virtual void AssembleDiagonalPA(Vector &diag);
-+
-    virtual void AddMultPA(const Vector &x, Vector &y) const;
--   virtual void AssembleDiagonalPA(Vector& diag);
- 
-    const Coefficient *GetCoefficient() const { return Q; }
- };
-@@ -2606,7 +2633,7 @@ protected:
- public:
-    VectorCurlCurlIntegrator() { Q = NULL; }
- 
--   VectorCurlCurlIntegrator(Coefficient &q) : Q(&q) { }
-+   VectorCurlCurlIntegrator(Coefficient &q) : Q(&q) {}
- 
-    /// Assemble an element matrix
-    virtual void AssembleElementMatrix(const FiniteElement &el,
-@@ -2637,9 +2664,9 @@ private:
-    DenseMatrix curlshape;
-    DenseMatrix elmat_comp;
- public:
--   MixedCurlIntegrator() : Q{NULL} { }
--   MixedCurlIntegrator(Coefficient *q_) :  Q{q_} { }
--   MixedCurlIntegrator(Coefficient &q) :  Q{&q} { }
-+   MixedCurlIntegrator() : Q{NULL} {}
-+   MixedCurlIntegrator(Coefficient *q_) :  Q{q_} {}
-+   MixedCurlIntegrator(Coefficient &q) :  Q{&q} {}
- 
-    virtual void AssembleElementMatrix2(const FiniteElement &trial_fe,
-                                        const FiniteElement &test_fe,
-@@ -2693,18 +2720,21 @@ public:
-    virtual void AssembleElementMatrix(const FiniteElement &el,
-                                       ElementTransformation &Trans,
-                                       DenseMatrix &elmat);
-+
-    virtual void AssembleElementMatrix2(const FiniteElement &trial_fe,
-                                        const FiniteElement &test_fe,
-                                        ElementTransformation &Trans,
-                                        DenseMatrix &elmat);
- 
--   using BilinearFormIntegrator::AssemblePA;
--   virtual void AssemblePA(const FiniteElementSpace &fes);
-+   virtual void AssemblePA(const FiniteElementSpace &fes) { AssemblePA(fes, fes); }
-    virtual void AssemblePA(const FiniteElementSpace &trial_fes,
-                            const FiniteElementSpace &test_fes);
-+
-+   virtual void AssembleDiagonalPA(Vector &diag);
-+
-    virtual void AddMultPA(const Vector &x, Vector &y) const;
-+
-    virtual void AddMultTransposePA(const Vector &x, Vector &y) const;
--   virtual void AssembleDiagonalPA(Vector& diag);
- 
-    const Coefficient *GetCoefficient() const { return Q; }
- };
-@@ -2722,6 +2752,7 @@ private:
-    DenseMatrix dshape;
-    DenseMatrix gshape;
-    DenseMatrix Jadj;
-+
-    // PA extension
-    Vector pa_data;
-    const DofToQuad *trial_maps, *test_maps; ///< Not owned
-@@ -2735,10 +2766,10 @@ public:
-    {  }
-    VectorDivergenceIntegrator(Coefficient *q_) :
-       Q(q_), trial_maps(NULL), test_maps(NULL), geom(NULL)
--   { }
-+   {}
-    VectorDivergenceIntegrator(Coefficient &q) :
-       Q(&q), trial_maps(NULL), test_maps(NULL), geom(NULL)
--   { }
-+   {}
- 
-    virtual void AssembleElementMatrix2(const FiniteElement &trial_fe,
-                                        const FiniteElement &test_fe,
-@@ -2750,6 +2781,7 @@ public:
-                            const FiniteElementSpace &test_fes);
- 
-    virtual void AddMultPA(const Vector &x, Vector &y) const;
-+
-    virtual void AddMultTransposePA(const Vector &x, Vector &y) const;
- 
-    static const IntegrationRule &GetRule(const FiniteElement &trial_fe,
-@@ -2763,11 +2795,6 @@ class DivDivIntegrator: public BilinearFormIntegrator
- protected:
-    Coefficient *Q;
- 
--   using BilinearFormIntegrator::AssemblePA;
--   virtual void AssemblePA(const FiniteElementSpace &fes);
--   virtual void AddMultPA(const Vector &x, Vector &y) const;
--   virtual void AssembleDiagonalPA(Vector& diag);
--
- private:
- #ifndef MFEM_THREAD_SAFE
-    Vector divshape, te_divshape;
-@@ -2783,7 +2810,7 @@ private:
- public:
-    DivDivIntegrator() { Q = NULL; }
-    DivDivIntegrator(Coefficient &q, const IntegrationRule *ir = NULL) :
--      BilinearFormIntegrator(ir), Q(&q) { }
-+      BilinearFormIntegrator(ir), Q(&q) {}
- 
-    virtual void AssembleElementMatrix(const FiniteElement &el,
-                                       ElementTransformation &Trans,
-@@ -2794,6 +2821,13 @@ public:
-                                        ElementTransformation &Trans,
-                                        DenseMatrix &elmat);
- 
-+   using BilinearFormIntegrator::AssemblePA;
-+   virtual void AssemblePA(const FiniteElementSpace &fes);
-+
-+   virtual void AssembleDiagonalPA(Vector &diag);
-+
-+   virtual void AddMultPA(const Vector &x, Vector &y) const;
-+
-    const Coefficient *GetCoefficient() const { return Q; }
- };
- 
-@@ -2830,7 +2864,7 @@ private:
-    Vector vcoeff;
- 
- public:
--   VectorDiffusionIntegrator() { }
-+   VectorDiffusionIntegrator() {}
- 
-    /** \brief Integrator with unit coefficient for caller-specified vector
-        dimension.
-@@ -2838,13 +2872,13 @@ public:
-        If the vector dimension does not match the true dimension of the space,
-        the resulting element matrix will be mathematically invalid. */
-    VectorDiffusionIntegrator(int vector_dimension)
--      : vdim(vector_dimension) { }
-+      : vdim(vector_dimension) {}
- 
-    VectorDiffusionIntegrator(Coefficient &q)
--      : Q(&q) { }
-+      : Q(&q) {}
- 
-    VectorDiffusionIntegrator(Coefficient &q, const IntegrationRule *ir)
--      : BilinearFormIntegrator(ir), Q(&q) { }
-+      : BilinearFormIntegrator(ir), Q(&q) {}
- 
-    /** \brief Integrator with scalar coefficient for caller-specified vector
-        dimension.
-@@ -2855,7 +2889,7 @@ public:
-        If the vector dimension does not match the true dimension of the space,
-        the resulting element matrix will be mathematically invalid. */
-    VectorDiffusionIntegrator(Coefficient &q, int vector_dimension)
--      : Q(&q), vdim(vector_dimension) { }
-+      : Q(&q), vdim(vector_dimension) {}
- 
-    /** \brief Integrator with \c VectorCoefficient. The vector dimension of the
-        \c FiniteElementSpace is assumed to be the same as the dimension of the
-@@ -2867,7 +2901,7 @@ public:
-        If the vector dimension does not match the true dimension of the space,
-        the resulting element matrix will be mathematically invalid. */
-    VectorDiffusionIntegrator(VectorCoefficient &vq)
--      : VQ(&vq), vdim(vq.GetVDim()) { }
-+      : VQ(&vq), vdim(vq.GetVDim()) {}
- 
-    /** \brief Integrator with \c MatrixCoefficient. The vector dimension of the
-        \c FiniteElementSpace is assumed to be the same as the dimension of the
-@@ -2879,21 +2913,30 @@ public:
-        If the vector dimension does not match the true dimension of the space,
-        the resulting element matrix will be mathematically invalid. */
-    VectorDiffusionIntegrator(MatrixCoefficient& mq)
--      : MQ(&mq), vdim(mq.GetVDim()) { }
-+      : MQ(&mq), vdim(mq.GetVDim()) {}
- 
-    virtual void AssembleElementMatrix(const FiniteElement &el,
-                                       ElementTransformation &Trans,
-                                       DenseMatrix &elmat);
-+
-    virtual void AssembleElementVector(const FiniteElement &el,
-                                       ElementTransformation &Tr,
-                                       const Vector &elfun, Vector &elvect);
-+
-    using BilinearFormIntegrator::AssemblePA;
-    virtual void AssemblePA(const FiniteElementSpace &fes);
--   virtual void AssembleMF(const FiniteElementSpace &fes);
-+
-    virtual void AssembleDiagonalPA(Vector &diag);
--   virtual void AssembleDiagonalMF(Vector &diag);
-+
-    virtual void AddMultPA(const Vector &x, Vector &y) const;
-+
-+   using BilinearFormIntegrator::AssembleMF;
-+   virtual void AssembleMF(const FiniteElementSpace &fes);
-+
-+   virtual void AssembleDiagonalMF(Vector &diag);
-+
-    virtual void AddMultMF(const Vector &x, Vector &y) const;
-+
-    bool SupportsCeed() const { return DeviceCanUseCeed(); }
- };
- 
-@@ -2988,6 +3031,7 @@ protected:
-    Coefficient *rho;
-    VectorCoefficient *u;
-    double alpha, beta;
-+
-    // PA extension
-    Vector pa_data;
-    const DofToQuad *maps;             ///< Not owned
-@@ -3016,24 +3060,20 @@ public:
-                                    FaceElementTransformations &Trans,
-                                    DenseMatrix &elmat);
- 
--   using BilinearFormIntegrator::AssemblePA;
--
-    virtual void AssemblePAInteriorFaces(const FiniteElementSpace &fes);
- 
-    virtual void AssemblePABoundaryFaces(const FiniteElementSpace &fes);
- 
--   virtual void AddMultTransposePA(const Vector &x, Vector &y) const;
-+   virtual void AddMultPA(const Vector &x, Vector &y) const;
- 
--   virtual void AddMultPA(const Vector&, Vector&) const;
-+   virtual void AddMultTransposePA(const Vector &x, Vector &y) const;
- 
-    virtual void AssembleEAInteriorFaces(const FiniteElementSpace& fes,
-                                         Vector &ea_data_int,
--                                        Vector &ea_data_ext,
--                                        const bool add);
-+                                        Vector &ea_data_ext);
- 
-    virtual void AssembleEABoundaryFaces(const FiniteElementSpace& fes,
--                                        Vector &ea_data_bdr,
--                                        const bool add);
-+                                        Vector &ea_data_bdr);
- 
-    static const IntegrationRule &GetRule(Geometry::Type geom, int order,
-                                          FaceElementTransformations &T);
-@@ -3056,14 +3096,14 @@ class NonconservativeDGTraceIntegrator : public TransposeIntegrator
- {
- public:
-    NonconservativeDGTraceIntegrator(VectorCoefficient &u, double a)
--      : TransposeIntegrator(new DGTraceIntegrator(u, -a, 0.5*a)) { }
-+      : TransposeIntegrator(new DGTraceIntegrator(u, -a, 0.5*a)) {}
- 
-    NonconservativeDGTraceIntegrator(VectorCoefficient &u, double a, double b)
--      : TransposeIntegrator(new DGTraceIntegrator(u, -a, b)) { }
-+      : TransposeIntegrator(new DGTraceIntegrator(u, -a, b)) {}
- 
-    NonconservativeDGTraceIntegrator(Coefficient &rho, VectorCoefficient &u,
-                                     double a, double b)
--      : TransposeIntegrator(new DGTraceIntegrator(rho, u, -a, b)) { }
-+      : TransposeIntegrator(new DGTraceIntegrator(rho, u, -a, b)) {}
- };
- 
- /** Integrator for the DG form:
-@@ -3091,11 +3131,12 @@ protected:
- 
- public:
-    DGDiffusionIntegrator(const double s, const double k)
--      : Q(NULL), MQ(NULL), sigma(s), kappa(k) { }
-+      : Q(NULL), MQ(NULL), sigma(s), kappa(k) {}
-    DGDiffusionIntegrator(Coefficient &q, const double s, const double k)
--      : Q(&q), MQ(NULL), sigma(s), kappa(k) { }
-+      : Q(&q), MQ(NULL), sigma(s), kappa(k) {}
-    DGDiffusionIntegrator(MatrixCoefficient &q, const double s, const double k)
--      : Q(NULL), MQ(&q), sigma(s), kappa(k) { }
-+      : Q(NULL), MQ(&q), sigma(s), kappa(k) {}
-+
-    using BilinearFormIntegrator::AssembleFaceMatrix;
-    virtual void AssembleFaceMatrix(const FiniteElement &el1,
-                                    const FiniteElement &el2,
-@@ -3227,11 +3268,11 @@ class DGElasticityIntegrator : public BilinearFormIntegrator
- {
- public:
-    DGElasticityIntegrator(double alpha_, double kappa_)
--      : lambda(NULL), mu(NULL), alpha(alpha_), kappa(kappa_) { }
-+      : lambda(NULL), mu(NULL), alpha(alpha_), kappa(kappa_) {}
- 
-    DGElasticityIntegrator(Coefficient &lambda_, Coefficient &mu_,
-                           double alpha_, double kappa_)
--      : lambda(&lambda_), mu(&mu_), alpha(alpha_), kappa(kappa_) { }
-+      : lambda(&lambda_), mu(&mu_), alpha(alpha_), kappa(kappa_) {}
- 
-    using BilinearFormIntegrator::AssembleFaceMatrix;
-    virtual void AssembleFaceMatrix(const FiniteElement &el1,
-@@ -3282,7 +3323,8 @@ private:
-    Vector face_shape, shape1, shape2;
- 
- public:
--   TraceJumpIntegrator() { }
-+   TraceJumpIntegrator() {}
-+
-    using BilinearFormIntegrator::AssembleFaceMatrix;
-    virtual void AssembleFaceMatrix(const FiniteElement &trial_face_fe,
-                                    const FiniteElement &test_fe1,
-@@ -3301,7 +3343,8 @@ private:
-    DenseMatrix shape1, shape2;
- 
- public:
--   NormalTraceJumpIntegrator() { }
-+   NormalTraceJumpIntegrator() {}
-+
-    using BilinearFormIntegrator::AssembleFaceMatrix;
-    virtual void AssembleFaceMatrix(const FiniteElement &trial_face_fe,
-                                    const FiniteElement &test_fe1,
-@@ -3393,8 +3436,7 @@ public:
- 
- /** Abstract class to serve as a base for local interpolators to be used in the
-     DiscreteLinearOperator class. */
--class DiscreteInterpolator : public BilinearFormIntegrator { };
--
-+class DiscreteInterpolator : public BilinearFormIntegrator {};
- 
- /** Class for constructing the gradient as a DiscreteLinearOperator from an
-     H1-conforming space to an H(curl)-conforming space. The range space can be
-@@ -3402,7 +3444,7 @@ class DiscreteInterpolator : public BilinearFormIntegrator { };
- class GradientInterpolator : public DiscreteInterpolator
- {
- public:
--   GradientInterpolator() : dofquad_fe(NULL) { }
-+   GradientInterpolator() : dofquad_fe(NULL) {}
-    virtual ~GradientInterpolator() { delete dofquad_fe; }
- 
-    virtual void AssembleElementMatrix2(const FiniteElement &h1_fe,
-@@ -3411,17 +3453,17 @@ public:
-                                        DenseMatrix &elmat)
-    { nd_fe.ProjectGrad(h1_fe, Trans, elmat); }
- 
--   using BilinearFormIntegrator::AssemblePA;
--
-    /** @brief Setup method for PA data.
- 
-        @param[in] trial_fes   H1 Lagrange space
-        @param[in] test_fes    H(curl) Nedelec space
-     */
-+   using BilinearFormIntegrator::AssemblePA;
-    virtual void AssemblePA(const FiniteElementSpace &trial_fes,
-                            const FiniteElementSpace &test_fes);
- 
-    virtual void AddMultPA(const Vector &x, Vector &y) const;
-+
-    virtual void AddMultTransposePA(const Vector &x, Vector &y) const;
- 
- private:
-@@ -3434,7 +3476,6 @@ private:
-    int dim, ne, o_dofs1D, c_dofs1D;
- };
- 
--
- /** Class for constructing the identity map as a DiscreteLinearOperator. This
-     is the discrete embedding matrix when the domain space is a subspace of
-     the range space. Otherwise, a dof projection matrix is constructed. */
-@@ -3450,11 +3491,11 @@ public:
-    { ran_fe.Project(dom_fe, Trans, elmat); }
- 
-    using BilinearFormIntegrator::AssemblePA;
--
-    virtual void AssemblePA(const FiniteElementSpace &trial_fes,
-                            const FiniteElementSpace &test_fes);
- 
-    virtual void AddMultPA(const Vector &x, Vector &y) const;
-+
-    virtual void AddMultTransposePA(const Vector &x, Vector &y) const;
- 
-    virtual ~IdentityInterpolator() { delete dofquad_fe; }
-@@ -3470,7 +3511,6 @@ private:
-    Vector pa_data;
- };
- 
--
- /** Class for constructing the (local) discrete curl matrix which can be used
-     as an integrator in a DiscreteLinearOperator object to assemble the global
-     discrete curl matrix. */
-@@ -3484,7 +3524,6 @@ public:
-    { ran_fe.ProjectCurl(dom_fe, Trans, elmat); }
- };
- 
--
- /** Class for constructing the (local) discrete divergence matrix which can
-     be used as an integrator in a DiscreteLinearOperator object to assemble
-     the global discrete divergence matrix.
-@@ -3503,7 +3542,6 @@ public:
-    { ran_fe.ProjectDiv(dom_fe, Trans, elmat); }
- };
- 
--
- /** A trace face interpolator class for interpolating the normal component of
-     the domain space, e.g. vector H1, into the range space, e.g. the trace of
-     RT which uses FiniteElement::INTEGRAL map type. */
-@@ -3522,7 +3560,7 @@ public:
- class ScalarProductInterpolator : public DiscreteInterpolator
- {
- public:
--   ScalarProductInterpolator(Coefficient & sc) : Q(&sc) { }
-+   ScalarProductInterpolator(Coefficient &sc) : Q(&sc) {}
- 
-    virtual void AssembleElementMatrix2(const FiniteElement &dom_fe,
-                                        const FiniteElement &ran_fe,
-@@ -3539,13 +3577,14 @@ protected:
- class ScalarVectorProductInterpolator : public DiscreteInterpolator
- {
- public:
--   ScalarVectorProductInterpolator(Coefficient & sc)
--      : Q(&sc) { }
-+   ScalarVectorProductInterpolator(Coefficient &sc)
-+      : Q(&sc) {}
- 
-    virtual void AssembleElementMatrix2(const FiniteElement &dom_fe,
-                                        const FiniteElement &ran_fe,
-                                        ElementTransformation &Trans,
-                                        DenseMatrix &elmat);
-+
- protected:
-    Coefficient *Q;
- };
-@@ -3556,13 +3595,14 @@ protected:
- class VectorScalarProductInterpolator : public DiscreteInterpolator
- {
- public:
--   VectorScalarProductInterpolator(VectorCoefficient & vc)
--      : VQ(&vc) { }
-+   VectorScalarProductInterpolator(VectorCoefficient &vc)
-+      : VQ(&vc) {}
- 
-    virtual void AssembleElementMatrix2(const FiniteElement &dom_fe,
-                                        const FiniteElement &ran_fe,
-                                        ElementTransformation &Trans,
-                                        DenseMatrix &elmat);
-+
- protected:
-    VectorCoefficient *VQ;
- };
-@@ -3572,13 +3612,14 @@ protected:
- class ScalarCrossProductInterpolator : public DiscreteInterpolator
- {
- public:
--   ScalarCrossProductInterpolator(VectorCoefficient & vc)
--      : VQ(&vc) { }
-+   ScalarCrossProductInterpolator(VectorCoefficient &vc)
-+      : VQ(&vc) {}
- 
-    virtual void AssembleElementMatrix2(const FiniteElement &nd_fe,
-                                        const FiniteElement &l2_fe,
-                                        ElementTransformation &Trans,
-                                        DenseMatrix &elmat);
-+
- protected:
-    VectorCoefficient *VQ;
- };
-@@ -3589,13 +3630,14 @@ protected:
- class VectorCrossProductInterpolator : public DiscreteInterpolator
- {
- public:
--   VectorCrossProductInterpolator(VectorCoefficient & vc)
--      : VQ(&vc) { }
-+   VectorCrossProductInterpolator(VectorCoefficient &vc)
-+      : VQ(&vc) {}
- 
-    virtual void AssembleElementMatrix2(const FiniteElement &nd_fe,
-                                        const FiniteElement &rt_fe,
-                                        ElementTransformation &Trans,
-                                        DenseMatrix &elmat);
-+
- protected:
-    VectorCoefficient *VQ;
- };
-@@ -3606,27 +3648,16 @@ protected:
- class VectorInnerProductInterpolator : public DiscreteInterpolator
- {
- public:
--   VectorInnerProductInterpolator(VectorCoefficient & vc) : VQ(&vc) { }
-+   VectorInnerProductInterpolator(VectorCoefficient &vc) : VQ(&vc) {}
- 
-    virtual void AssembleElementMatrix2(const FiniteElement &rt_fe,
-                                        const FiniteElement &l2_fe,
-                                        ElementTransformation &Trans,
-                                        DenseMatrix &elmat);
-+
- protected:
-    VectorCoefficient *VQ;
- };
- 
--
--
--// PA Diffusion Assemble 2D kernel
--template<const int T_SDIM>
--void PADiffusionSetup2D(const int Q1D,
--                        const int coeffDim,
--                        const int NE,
--                        const Array<double> &w,
--                        const Vector &j,
--                        const Vector &c,
--                        Vector &d);
--
- }
- #endif
-diff --git a/fem/bilininteg_hcurl.cpp b/fem/bilininteg_hcurl.cpp
-deleted file mode 100644
-index e8762a71e..000000000
---- a/fem/bilininteg_hcurl.cpp
-+++ /dev/null
-@@ -1,7764 +0,0 @@
--// Copyright (c) 2010-2023, Lawrence Livermore National Security, LLC. Produced
--// at the Lawrence Livermore National Laboratory. All Rights reserved. See files
--// LICENSE and NOTICE for details. LLNL-CODE-806117.
--//
--// This file is part of the MFEM library. For more information and source code
--// availability visit https://mfem.org.
--//
--// MFEM is free software; you can redistribute it and/or modify it under the
--// terms of the BSD-3 license. We welcome feedback and contributions, see file
--// CONTRIBUTING.md for details.
--
--#include "../general/forall.hpp"
--#include "bilininteg.hpp"
--#include "gridfunc.hpp"
--#include "qspace.hpp"
--
--using namespace std;
--
--namespace mfem
--{
--
--void PAHcurlHdivSetup3D(const int Q1D,
--                        const int coeffDim,
--                        const int NE,
--                        const bool transpose,
--                        const Array<double> &w_,
--                        const Vector &j,
--                        Vector &coeff_,
--                        Vector &op);
--
--void PAHcurlMassApply2D(const int D1D,
--                        const int Q1D,
--                        const int NE,
--                        const bool symmetric,
--                        const Array<double> &bo,
--                        const Array<double> &bc,
--                        const Array<double> &bot,
--                        const Array<double> &bct,
--                        const Vector &pa_data,
--                        const Vector &x,
--                        Vector &y)
--{
--   constexpr static int VDIM = 2;
--   constexpr static int MAX_D1D = HCURL_MAX_D1D;
--   constexpr static int MAX_Q1D = HCURL_MAX_Q1D;
--
--   auto Bo = Reshape(bo.Read(), Q1D, D1D-1);
--   auto Bc = Reshape(bc.Read(), Q1D, D1D);
--   auto Bot = Reshape(bot.Read(), D1D-1, Q1D);
--   auto Bct = Reshape(bct.Read(), D1D, Q1D);
--   auto op = Reshape(pa_data.Read(), Q1D, Q1D, symmetric ? 3 : 4, NE);
--   auto X = Reshape(x.Read(), 2*(D1D-1)*D1D, NE);
--   auto Y = Reshape(y.ReadWrite(), 2*(D1D-1)*D1D, NE);
--
--   mfem::forall(NE, [=] MFEM_HOST_DEVICE (int e)
--   {
--      double mass[MAX_Q1D][MAX_Q1D][VDIM];
--
--      for (int qy = 0; qy < Q1D; ++qy)
--      {
--         for (int qx = 0; qx < Q1D; ++qx)
--         {
--            for (int c = 0; c < VDIM; ++c)
--            {
--               mass[qy][qx][c] = 0.0;
--            }
--         }
--      }
--
--      int osc = 0;
--
--      for (int c = 0; c < VDIM; ++c)  // loop over x, y components
--      {
--         const int D1Dy = (c == 1) ? D1D - 1 : D1D;
--         const int D1Dx = (c == 0) ? D1D - 1 : D1D;
--
--         for (int dy = 0; dy < D1Dy; ++dy)
--         {
--            double massX[MAX_Q1D];
--            for (int qx = 0; qx < Q1D; ++qx)
--            {
--               massX[qx] = 0.0;
--            }
--
--            for (int dx = 0; dx < D1Dx; ++dx)
--            {
--               const double t = X(dx + (dy * D1Dx) + osc, e);
--               for (int qx = 0; qx < Q1D; ++qx)
--               {
--                  massX[qx] += t * ((c == 0) ? Bo(qx,dx) : Bc(qx,dx));
--               }
--            }
--
--            for (int qy = 0; qy < Q1D; ++qy)
--            {
--               const double wy = (c == 1) ? Bo(qy,dy) : Bc(qy,dy);
--               for (int qx = 0; qx < Q1D; ++qx)
--               {
--                  mass[qy][qx][c] += massX[qx] * wy;
--               }
--            }
--         }
--
--         osc += D1Dx * D1Dy;
--      }  // loop (c) over components
--
--      // Apply D operator.
--      for (int qy = 0; qy < Q1D; ++qy)
--      {
--         for (int qx = 0; qx < Q1D; ++qx)
--         {
--            const double O11 = op(qx,qy,0,e);
--            const double O21 = op(qx,qy,1,e);
--            const double O12 = symmetric ? O21 : op(qx,qy,2,e);
--            const double O22 = symmetric ? op(qx,qy,2,e) : op(qx,qy,3,e);
--            const double massX = mass[qy][qx][0];
--            const double massY = mass[qy][qx][1];
--            mass[qy][qx][0] = (O11*massX)+(O12*massY);
--            mass[qy][qx][1] = (O21*massX)+(O22*massY);
--         }
--      }
--
--      for (int qy = 0; qy < Q1D; ++qy)
--      {
--         osc = 0;
--
--         for (int c = 0; c < VDIM; ++c)  // loop over x, y components
--         {
--            const int D1Dy = (c == 1) ? D1D - 1 : D1D;
--            const int D1Dx = (c == 0) ? D1D - 1 : D1D;
--
--            double massX[MAX_D1D];
--            for (int dx = 0; dx < D1Dx; ++dx)
--            {
--               massX[dx] = 0.0;
--            }
--            for (int qx = 0; qx < Q1D; ++qx)
--            {
--               for (int dx = 0; dx < D1Dx; ++dx)
--               {
--                  massX[dx] += mass[qy][qx][c] * ((c == 0) ? Bot(dx,qx) : Bct(dx,qx));
--               }
--            }
--
--            for (int dy = 0; dy < D1Dy; ++dy)
--            {
--               const double wy = (c == 1) ? Bot(dy,qy) : Bct(dy,qy);
--
--               for (int dx = 0; dx < D1Dx; ++dx)
--               {
--                  Y(dx + (dy * D1Dx) + osc, e) += massX[dx] * wy;
--               }
--            }
--
--            osc += D1Dx * D1Dy;
--         }  // loop c
--      }  // loop qy
--   }); // end of element loop
--}
--
--void PAHcurlMassAssembleDiagonal2D(const int D1D,
--                                   const int Q1D,
--                                   const int NE,
--                                   const bool symmetric,
--                                   const Array<double> &bo,
--                                   const Array<double> &bc,
--                                   const Vector &pa_data,
--                                   Vector &diag)
--{
--   constexpr static int VDIM = 2;
--   constexpr static int MAX_Q1D = HCURL_MAX_Q1D;
--
--   auto Bo = Reshape(bo.Read(), Q1D, D1D-1);
--   auto Bc = Reshape(bc.Read(), Q1D, D1D);
--   auto op = Reshape(pa_data.Read(), Q1D, Q1D, symmetric ? 3 : 4, NE);
--   auto D = Reshape(diag.ReadWrite(), 2*(D1D-1)*D1D, NE);
--
--   mfem::forall(NE, [=] MFEM_HOST_DEVICE (int e)
--   {
--      int osc = 0;
--
--      for (int c = 0; c < VDIM; ++c)  // loop over x, y components
--      {
--         const int D1Dy = (c == 1) ? D1D - 1 : D1D;
--         const int D1Dx = (c == 0) ? D1D - 1 : D1D;
--
--         double mass[MAX_Q1D];
--
--         for (int dy = 0; dy < D1Dy; ++dy)
--         {
--            for (int qx = 0; qx < Q1D; ++qx)
--            {
--               mass[qx] = 0.0;
--               for (int qy = 0; qy < Q1D; ++qy)
--               {
--                  const double wy = (c == 1) ? Bo(qy,dy) : Bc(qy,dy);
--
--                  mass[qx] += wy * wy * ((c == 0) ? op(qx,qy,0,e) :
--                                         op(qx,qy,symmetric ? 2 : 3, e));
--               }
--            }
--
--            for (int dx = 0; dx < D1Dx; ++dx)
--            {
--               for (int qx = 0; qx < Q1D; ++qx)
--               {
--                  const double wx = ((c == 0) ? Bo(qx,dx) : Bc(qx,dx));
--                  D(dx + (dy * D1Dx) + osc, e) += mass[qx] * wx * wx;
--               }
--            }
--         }
--
--         osc += D1Dx * D1Dy;
--      }  // loop c
--   }); // end of element loop
--}
--
--void PAHcurlMassAssembleDiagonal3D(const int D1D,
--                                   const int Q1D,
--                                   const int NE,
--                                   const bool symmetric,
--                                   const Array<double> &bo,
--                                   const Array<double> &bc,
--                                   const Vector &pa_data,
--                                   Vector &diag)
--{
--   constexpr static int MAX_D1D = HCURL_MAX_D1D;
--   constexpr static int MAX_Q1D = HCURL_MAX_Q1D;
--
--   MFEM_VERIFY(D1D <= MAX_D1D, "Error: D1D > MAX_D1D");
--   MFEM_VERIFY(Q1D <= MAX_Q1D, "Error: Q1D > MAX_Q1D");
--   constexpr static int VDIM = 3;
--
--   auto Bo = Reshape(bo.Read(), Q1D, D1D-1);
--   auto Bc = Reshape(bc.Read(), Q1D, D1D);
--   auto op = Reshape(pa_data.Read(), Q1D, Q1D, Q1D, symmetric ? 6 : 9, NE);
--   auto D = Reshape(diag.ReadWrite(), 3*(D1D-1)*D1D*D1D, NE);
--
--   mfem::forall(NE, [=] MFEM_HOST_DEVICE (int e)
--   {
--      int osc = 0;
--
--      for (int c = 0; c < VDIM; ++c)  // loop over x, y, z components
--      {
--         const int D1Dz = (c == 2) ? D1D - 1 : D1D;
--         const int D1Dy = (c == 1) ? D1D - 1 : D1D;
--         const int D1Dx = (c == 0) ? D1D - 1 : D1D;
--
--         const int opc = (c == 0) ? 0 : ((c == 1) ? (symmetric ? 3 : 4) :
--                                         (symmetric ? 5 : 8));
--
--         double mass[MAX_Q1D];
--
--         for (int dz = 0; dz < D1Dz; ++dz)
--         {
--            for (int dy = 0; dy < D1Dy; ++dy)
--            {
--               for (int qx = 0; qx < Q1D; ++qx)
--               {
--                  mass[qx] = 0.0;
--                  for (int qy = 0; qy < Q1D; ++qy)
--                  {
--                     const double wy = (c == 1) ? Bo(qy,dy) : Bc(qy,dy);
--
--                     for (int qz = 0; qz < Q1D; ++qz)
--                     {
--                        const double wz = (c == 2) ? Bo(qz,dz) : Bc(qz,dz);
--
--                        mass[qx] += wy * wy * wz * wz * op(qx,qy,qz,opc,e);
--                     }
--                  }
--               }
--
--               for (int dx = 0; dx < D1Dx; ++dx)
--               {
--                  for (int qx = 0; qx < Q1D; ++qx)
--                  {
--                     const double wx = ((c == 0) ? Bo(qx,dx) : Bc(qx,dx));
--                     D(dx + ((dy + (dz * D1Dy)) * D1Dx) + osc, e) += mass[qx] * wx * wx;
--                  }
--               }
--            }
--         }
--
--         osc += D1Dx * D1Dy * D1Dz;
--      }  // loop c
--   }); // end of element loop
--}
--
--template<int T_D1D, int T_Q1D>
--void SmemPAHcurlMassAssembleDiagonal3D(const int D1D,
--                                       const int Q1D,
--                                       const int NE,
--                                       const bool symmetric,
--                                       const Array<double> &bo,
--                                       const Array<double> &bc,
--                                       const Vector &pa_data,
--                                       Vector &diag)
--{
--   MFEM_VERIFY(D1D <= HCURL_MAX_D1D, "Error: D1D > MAX_D1D");
--   MFEM_VERIFY(Q1D <= HCURL_MAX_Q1D, "Error: Q1D > MAX_Q1D");
--
--   auto Bo = Reshape(bo.Read(), Q1D, D1D-1);
--   auto Bc = Reshape(bc.Read(), Q1D, D1D);
--   auto op = Reshape(pa_data.Read(), Q1D, Q1D, Q1D, symmetric ? 6 : 9, NE);
--   auto D = Reshape(diag.ReadWrite(), 3*(D1D-1)*D1D*D1D, NE);
--
--   mfem::forall_3D(NE, Q1D, Q1D, Q1D, [=] MFEM_HOST_DEVICE (int e)
--   {
--      constexpr int VDIM = 3;
--      constexpr int tD1D = T_D1D ? T_D1D : HCURL_MAX_D1D;
--      constexpr int tQ1D = T_Q1D ? T_Q1D : HCURL_MAX_Q1D;
--
--      MFEM_SHARED double sBo[tQ1D][tD1D];
--      MFEM_SHARED double sBc[tQ1D][tD1D];
--
--      double op3[3];
--      MFEM_SHARED double sop[3][tQ1D][tQ1D];
--
--      MFEM_FOREACH_THREAD(qx,x,Q1D)
--      {
--         MFEM_FOREACH_THREAD(qy,y,Q1D)
--         {
--            MFEM_FOREACH_THREAD(qz,z,Q1D)
--            {
--               op3[0] = op(qx,qy,qz,0,e);
--               op3[1] = op(qx,qy,qz,symmetric ? 3 : 4,e);
--               op3[2] = op(qx,qy,qz,symmetric ? 5 : 8,e);
--            }
--         }
--      }
--
--      const int tidx = MFEM_THREAD_ID(x);
--      const int tidy = MFEM_THREAD_ID(y);
--      const int tidz = MFEM_THREAD_ID(z);
--
--      if (tidz == 0)
--      {
--         MFEM_FOREACH_THREAD(d,y,D1D)
--         {
--            MFEM_FOREACH_THREAD(q,x,Q1D)
--            {
--               sBc[q][d] = Bc(q,d);
--               if (d < D1D-1)
--               {
--                  sBo[q][d] = Bo(q,d);
--               }
--            }
--         }
--      }
--      MFEM_SYNC_THREAD;
--
--      int osc = 0;
--      for (int c = 0; c < VDIM; ++c)  // loop over x, y, z components
--      {
--         const int D1Dz = (c == 2) ? D1D - 1 : D1D;
--         const int D1Dy = (c == 1) ? D1D - 1 : D1D;
--         const int D1Dx = (c == 0) ? D1D - 1 : D1D;
--
--         double dxyz = 0.0;
--
--         for (int qz=0; qz < Q1D; ++qz)
--         {
--            if (tidz == qz)
--            {
--               for (int i=0; i<3; ++i)
--               {
--                  sop[i][tidx][tidy] = op3[i];
--               }
--            }
--
--            MFEM_SYNC_THREAD;
--
--            MFEM_FOREACH_THREAD(dz,z,D1Dz)
--            {
--               const double wz = ((c == 2) ? sBo[qz][dz] : sBc[qz][dz]);
--
--               MFEM_FOREACH_THREAD(dy,y,D1Dy)
--               {
--                  MFEM_FOREACH_THREAD(dx,x,D1Dx)
--                  {
--                     for (int qy = 0; qy < Q1D; ++qy)
--                     {
--                        const double wy = ((c == 1) ? sBo[qy][dy] : sBc[qy][dy]);
--
--                        for (int qx = 0; qx < Q1D; ++qx)
--                        {
--                           const double wx = ((c == 0) ? sBo[qx][dx] : sBc[qx][dx]);
--                           dxyz += sop[c][qx][qy] * wx * wx * wy * wy * wz * wz;
--                        }
--                     }
--                  }
--               }
--            }
--
--            MFEM_SYNC_THREAD;
--         }  // qz loop
--
--         MFEM_FOREACH_THREAD(dz,z,D1Dz)
--         {
--            MFEM_FOREACH_THREAD(dy,y,D1Dy)
--            {
--               MFEM_FOREACH_THREAD(dx,x,D1Dx)
--               {
--                  D(dx + ((dy + (dz * D1Dy)) * D1Dx) + osc, e) += dxyz;
--               }
--            }
--         }
--
--         osc += D1Dx * D1Dy * D1Dz;
--      }  // c loop
--   }); // end of element loop
--}
--
--void PAHcurlMassApply3D(const int D1D,
--                        const int Q1D,
--                        const int NE,
--                        const bool symmetric,
--                        const Array<double> &bo,
--                        const Array<double> &bc,
--                        const Array<double> &bot,
--                        const Array<double> &bct,
--                        const Vector &pa_data,
--                        const Vector &x,
--                        Vector &y)
--{
--   constexpr static int MAX_D1D = HCURL_MAX_D1D;
--   constexpr static int MAX_Q1D = HCURL_MAX_Q1D;
--
--   MFEM_VERIFY(D1D <= MAX_D1D, "Error: D1D > MAX_D1D");
--   MFEM_VERIFY(Q1D <= MAX_Q1D, "Error: Q1D > MAX_Q1D");
--   constexpr static int VDIM = 3;
--
--   auto Bo = Reshape(bo.Read(), Q1D, D1D-1);
--   auto Bc = Reshape(bc.Read(), Q1D, D1D);
--   auto Bot = Reshape(bot.Read(), D1D-1, Q1D);
--   auto Bct = Reshape(bct.Read(), D1D, Q1D);
--   auto op = Reshape(pa_data.Read(), Q1D, Q1D, Q1D, symmetric ? 6 : 9, NE);
--   auto X = Reshape(x.Read(), 3*(D1D-1)*D1D*D1D, NE);
--   auto Y = Reshape(y.ReadWrite(), 3*(D1D-1)*D1D*D1D, NE);
--
--   mfem::forall(NE, [=] MFEM_HOST_DEVICE (int e)
--   {
--      double mass[MAX_Q1D][MAX_Q1D][MAX_Q1D][VDIM];
--
--      for (int qz = 0; qz < Q1D; ++qz)
--      {
--         for (int qy = 0; qy < Q1D; ++qy)
--         {
--            for (int qx = 0; qx < Q1D; ++qx)
--            {
--               for (int c = 0; c < VDIM; ++c)
--               {
--                  mass[qz][qy][qx][c] = 0.0;
--               }
--            }
--         }
--      }
--
--      int osc = 0;
--
--      for (int c = 0; c < VDIM; ++c)  // loop over x, y, z components
--      {
--         const int D1Dz = (c == 2) ? D1D - 1 : D1D;
--         const int D1Dy = (c == 1) ? D1D - 1 : D1D;
--         const int D1Dx = (c == 0) ? D1D - 1 : D1D;
--
--         for (int dz = 0; dz < D1Dz; ++dz)
--         {
--            double massXY[MAX_Q1D][MAX_Q1D];
--            for (int qy = 0; qy < Q1D; ++qy)
--            {
--               for (int qx = 0; qx < Q1D; ++qx)
--               {
--                  massXY[qy][qx] = 0.0;
--               }
--            }
--
--            for (int dy = 0; dy < D1Dy; ++dy)
--            {
--               double massX[MAX_Q1D];
--               for (int qx = 0; qx < Q1D; ++qx)
--               {
--                  massX[qx] = 0.0;
--               }
--
--               for (int dx = 0; dx < D1Dx; ++dx)
--               {
--                  const double t = X(dx + ((dy + (dz * D1Dy)) * D1Dx) + osc, e);
--                  for (int qx = 0; qx < Q1D; ++qx)
--                  {
--                     massX[qx] += t * ((c == 0) ? Bo(qx,dx) : Bc(qx,dx));
--                  }
--               }
--
--               for (int qy = 0; qy < Q1D; ++qy)
--               {
--                  const double wy = (c == 1) ? Bo(qy,dy) : Bc(qy,dy);
--                  for (int qx = 0; qx < Q1D; ++qx)
--                  {
--                     const double wx = massX[qx];
--                     massXY[qy][qx] += wx * wy;
--                  }
--               }
--            }
--
--            for (int qz = 0; qz < Q1D; ++qz)
--            {
--               const double wz = (c == 2) ? Bo(qz,dz) : Bc(qz,dz);
--               for (int qy = 0; qy < Q1D; ++qy)
--               {
--                  for (int qx = 0; qx < Q1D; ++qx)
--                  {
--                     mass[qz][qy][qx][c] += massXY[qy][qx] * wz;
--                  }
--               }
--            }
--         }
--
--         osc += D1Dx * D1Dy * D1Dz;
--      }  // loop (c) over components
--
--      // Apply D operator.
--      for (int qz = 0; qz < Q1D; ++qz)
--      {
--         for (int qy = 0; qy < Q1D; ++qy)
--         {
--            for (int qx = 0; qx < Q1D; ++qx)
--            {
--               const double O11 = op(qx,qy,qz,0,e);
--               const double O12 = op(qx,qy,qz,1,e);
--               const double O13 = op(qx,qy,qz,2,e);
--               const double O21 = symmetric ? O12 : op(qx,qy,qz,3,e);
--               const double O22 = symmetric ? op(qx,qy,qz,3,e) : op(qx,qy,qz,4,e);
--               const double O23 = symmetric ? op(qx,qy,qz,4,e) : op(qx,qy,qz,5,e);
--               const double O31 = symmetric ? O13 : op(qx,qy,qz,6,e);
--               const double O32 = symmetric ? O23 : op(qx,qy,qz,7,e);
--               const double O33 = symmetric ? op(qx,qy,qz,5,e) : op(qx,qy,qz,8,e);
--               const double massX = mass[qz][qy][qx][0];
--               const double massY = mass[qz][qy][qx][1];
--               const double massZ = mass[qz][qy][qx][2];
--               mass[qz][qy][qx][0] = (O11*massX)+(O12*massY)+(O13*massZ);
--               mass[qz][qy][qx][1] = (O21*massX)+(O22*massY)+(O23*massZ);
--               mass[qz][qy][qx][2] = (O31*massX)+(O32*massY)+(O33*massZ);
--            }
--         }
--      }
--
--      for (int qz = 0; qz < Q1D; ++qz)
--      {
--         double massXY[MAX_D1D][MAX_D1D];
--
--         osc = 0;
--
--         for (int c = 0; c < VDIM; ++c)  // loop over x, y, z components
--         {
--            const int D1Dz = (c == 2) ? D1D - 1 : D1D;
--            const int D1Dy = (c == 1) ? D1D - 1 : D1D;
--            const int D1Dx = (c == 0) ? D1D - 1 : D1D;
--
--            for (int dy = 0; dy < D1Dy; ++dy)
--            {
--               for (int dx = 0; dx < D1Dx; ++dx)
--               {
--                  massXY[dy][dx] = 0.0;
--               }
--            }
--            for (int qy = 0; qy < Q1D; ++qy)
--            {
--               double massX[MAX_D1D];
--               for (int dx = 0; dx < D1Dx; ++dx)
--               {
--                  massX[dx] = 0;
--               }
--               for (int qx = 0; qx < Q1D; ++qx)
--               {
--                  for (int dx = 0; dx < D1Dx; ++dx)
--                  {
--                     massX[dx] += mass[qz][qy][qx][c] * ((c == 0) ? Bot(dx,qx) : Bct(dx,qx));
--                  }
--               }
--               for (int dy = 0; dy < D1Dy; ++dy)
--               {
--                  const double wy = (c == 1) ? Bot(dy,qy) : Bct(dy,qy);
--                  for (int dx = 0; dx < D1Dx; ++dx)
--                  {
--                     massXY[dy][dx] += massX[dx] * wy;
--                  }
--               }
--            }
--
--            for (int dz = 0; dz < D1Dz; ++dz)
--            {
--               const double wz = (c == 2) ? Bot(dz,qz) : Bct(dz,qz);
--               for (int dy = 0; dy < D1Dy; ++dy)
--               {
--                  for (int dx = 0; dx < D1Dx; ++dx)
--                  {
--                     Y(dx + ((dy + (dz * D1Dy)) * D1Dx) + osc, e) += massXY[dy][dx] * wz;
--                  }
--               }
--            }
--
--            osc += D1Dx * D1Dy * D1Dz;
--         }  // loop c
--      }  // loop qz
--   }); // end of element loop
--}
--
--template<int T_D1D, int T_Q1D>
--void SmemPAHcurlMassApply3D(const int D1D,
--                            const int Q1D,
--                            const int NE,
--                            const bool symmetric,
--                            const Array<double> &bo,
--                            const Array<double> &bc,
--                            const Array<double> &bot,
--                            const Array<double> &bct,
--                            const Vector &pa_data,
--                            const Vector &x,
--                            Vector &y)
--{
--   MFEM_VERIFY(D1D <= HCURL_MAX_D1D, "Error: D1D > MAX_D1D");
--   MFEM_VERIFY(Q1D <= HCURL_MAX_Q1D, "Error: Q1D > MAX_Q1D");
--
--   const int dataSize = symmetric ? 6 : 9;
--
--   auto Bo = Reshape(bo.Read(), Q1D, D1D-1);
--   auto Bc = Reshape(bc.Read(), Q1D, D1D);
--   auto op = Reshape(pa_data.Read(), Q1D, Q1D, Q1D, dataSize, NE);
--   auto X = Reshape(x.Read(), 3*(D1D-1)*D1D*D1D, NE);
--   auto Y = Reshape(y.ReadWrite(), 3*(D1D-1)*D1D*D1D, NE);
--
--   mfem::forall_3D(NE, Q1D, Q1D, Q1D, [=] MFEM_HOST_DEVICE (int e)
--   {
--      constexpr int VDIM = 3;
--      constexpr int tD1D = T_D1D ? T_D1D : HCURL_MAX_D1D;
--      constexpr int tQ1D = T_Q1D ? T_Q1D : HCURL_MAX_Q1D;
--
--      MFEM_SHARED double sBo[tQ1D][tD1D];
--      MFEM_SHARED double sBc[tQ1D][tD1D];
--
--      double op9[9];
--      MFEM_SHARED double sop[9*tQ1D*tQ1D];
--      MFEM_SHARED double mass[tQ1D][tQ1D][3];
--
--      MFEM_SHARED double sX[tD1D][tD1D][tD1D];
--
--      MFEM_FOREACH_THREAD(qx,x,Q1D)
--      {
--         MFEM_FOREACH_THREAD(qy,y,Q1D)
--         {
--            MFEM_FOREACH_THREAD(qz,z,Q1D)
--            {
--               for (int i=0; i<dataSize; ++i)
--               {
--                  op9[i] = op(qx,qy,qz,i,e);
--               }
--            }
--         }
--      }
--
--      const int tidx = MFEM_THREAD_ID(x);
--      const int tidy = MFEM_THREAD_ID(y);
--      const int tidz = MFEM_THREAD_ID(z);
--
--      if (tidz == 0)
--      {
--         MFEM_FOREACH_THREAD(d,y,D1D)
--         {
--            MFEM_FOREACH_THREAD(q,x,Q1D)
--            {
--               sBc[q][d] = Bc(q,d);
--               if (d < D1D-1)
--               {
--                  sBo[q][d] = Bo(q,d);
--               }
--            }
--         }
--      }
--      MFEM_SYNC_THREAD;
--
--      for (int qz=0; qz < Q1D; ++qz)
--      {
--         int osc = 0;
--         for (int c = 0; c < VDIM; ++c)  // loop over x, y, z components
--         {
--            const int D1Dz = (c == 2) ? D1D - 1 : D1D;
--            const int D1Dy = (c == 1) ? D1D - 1 : D1D;
--            const int D1Dx = (c == 0) ? D1D - 1 : D1D;
--
--            MFEM_FOREACH_THREAD(dz,z,D1Dz)
--            {
--               MFEM_FOREACH_THREAD(dy,y,D1Dy)
--               {
--                  MFEM_FOREACH_THREAD(dx,x,D1Dx)
--                  {
--                     sX[dz][dy][dx] = X(dx + ((dy + (dz * D1Dy)) * D1Dx) + osc, e);
--                  }
--               }
--            }
--            MFEM_SYNC_THREAD;
--
--            if (tidz == qz)
--            {
--               for (int i=0; i<dataSize; ++i)
--               {
--                  sop[i + (dataSize*tidx) + (dataSize*Q1D*tidy)] = op9[i];
--               }
--
--               MFEM_FOREACH_THREAD(qy,y,Q1D)
--               {
--                  MFEM_FOREACH_THREAD(qx,x,Q1D)
--                  {
--                     double u = 0.0;
--
--                     for (int dz = 0; dz < D1Dz; ++dz)
--                     {
--                        const double wz = (c == 2) ? sBo[qz][dz] : sBc[qz][dz];
--                        for (int dy = 0; dy < D1Dy; ++dy)
--                        {
--                           const double wy = (c == 1) ? sBo[qy][dy] : sBc[qy][dy];
--                           for (int dx = 0; dx < D1Dx; ++dx)
--                           {
--                              const double t = sX[dz][dy][dx];
--                              const double wx = (c == 0) ? sBo[qx][dx] : sBc[qx][dx];
--                              u += t * wx * wy * wz;
--                           }
--                        }
--                     }
--
--                     mass[qy][qx][c] = u;
--                  } // qx
--               } // qy
--            } // tidz == qz
--
--            osc += D1Dx * D1Dy * D1Dz;
--            MFEM_SYNC_THREAD;
--         } // c
--
--         MFEM_SYNC_THREAD;  // Sync mass[qy][qx][d] and sop
--
--         osc = 0;
--         for (int c = 0; c < VDIM; ++c)  // loop over x, y, z components
--         {
--            const int D1Dz = (c == 2) ? D1D - 1 : D1D;
--            const int D1Dy = (c == 1) ? D1D - 1 : D1D;
--            const int D1Dx = (c == 0) ? D1D - 1 : D1D;
--
--            double dxyz = 0.0;
--
--            MFEM_FOREACH_THREAD(dz,z,D1Dz)
--            {
--               const double wz = (c == 2) ? sBo[qz][dz] : sBc[qz][dz];
--
--               MFEM_FOREACH_THREAD(dy,y,D1Dy)
--               {
--                  MFEM_FOREACH_THREAD(dx,x,D1Dx)
--                  {
--                     for (int qy = 0; qy < Q1D; ++qy)
--                     {
--                        const double wy = (c == 1) ? sBo[qy][dy] : sBc[qy][dy];
--                        for (int qx = 0; qx < Q1D; ++qx)
--                        {
--                           const int os = (dataSize*qx) + (dataSize*Q1D*qy);
--                           const int id1 = os + ((c == 0) ? 0 : ((c == 1) ? (symmetric ? 1 : 3) :
--                                                                 (symmetric ? 2 : 6))); // O11, O21, O31
--                           const int id2 = os + ((c == 0) ? 1 : ((c == 1) ? (symmetric ? 3 : 4) :
--                                                                 (symmetric ? 4 : 7))); // O12, O22, O32
--                           const int id3 = os + ((c == 0) ? 2 : ((c == 1) ? (symmetric ? 4 : 5) :
--                                                                 (symmetric ? 5 : 8))); // O13, O23, O33
--
--                           const double m_c = (sop[id1] * mass[qy][qx][0]) + (sop[id2] * mass[qy][qx][1]) +
--                                              (sop[id3] * mass[qy][qx][2]);
--
--                           const double wx = (c == 0) ? sBo[qx][dx] : sBc[qx][dx];
--                           dxyz += m_c * wx * wy * wz;
--                        }
--                     }
--                  }
--               }
--            }
--
--            MFEM_SYNC_THREAD;
--
--            MFEM_FOREACH_THREAD(dz,z,D1Dz)
--            {
--               MFEM_FOREACH_THREAD(dy,y,D1Dy)
--               {
--                  MFEM_FOREACH_THREAD(dx,x,D1Dx)
--                  {
--                     Y(dx + ((dy + (dz * D1Dy)) * D1Dx) + osc, e) += dxyz;
--                  }
--               }
--            }
--
--            osc += D1Dx * D1Dy * D1Dz;
--         } // c loop
--      } // qz
--   }); // end of element loop
--}
--
--// PA H(curl) curl-curl assemble 2D kernel
--static void PACurlCurlSetup2D(const int Q1D,
--                              const int NE,
--                              const Array<double> &w,
--                              const Vector &j,
--                              Vector &coeff,
--                              Vector &op)
--{
--   const int NQ = Q1D*Q1D;
--   auto W = w.Read();
--   auto J = Reshape(j.Read(), NQ, 2, 2, NE);
--   auto C = Reshape(coeff.Read(), NQ, NE);
--   auto y = Reshape(op.Write(), NQ, NE);
--   mfem::forall(NE, [=] MFEM_HOST_DEVICE (int e)
--   {
--      for (int q = 0; q < NQ; ++q)
--      {
--         const double J11 = J(q,0,0,e);
--         const double J21 = J(q,1,0,e);
--         const double J12 = J(q,0,1,e);
--         const double J22 = J(q,1,1,e);
--         const double detJ = (J11*J22)-(J21*J12);
--         y(q,e) = W[q] * C(q,e) / detJ;
--      }
--   });
--}
--
--// PA H(curl) curl-curl assemble 3D kernel
--static void PACurlCurlSetup3D(const int Q1D,
--                              const int coeffDim,
--                              const int NE,
--                              const Array<double> &w,
--                              const Vector &j,
--                              Vector &coeff,
--                              Vector &op)
--{
--   const int NQ = Q1D*Q1D*Q1D;
--   const bool symmetric = (coeffDim != 9);
--   auto W = w.Read();
--   auto J = Reshape(j.Read(), NQ, 3, 3, NE);
--   auto C = Reshape(coeff.Read(), coeffDim, NQ, NE);
--   auto y = Reshape(op.Write(), NQ, symmetric ? 6 : 9, NE);
--
--   mfem::forall(NE, [=] MFEM_HOST_DEVICE (int e)
--   {
--      for (int q = 0; q < NQ; ++q)
--      {
--         const double J11 = J(q,0,0,e);
--         const double J21 = J(q,1,0,e);
--         const double J31 = J(q,2,0,e);
--         const double J12 = J(q,0,1,e);
--         const double J22 = J(q,1,1,e);
--         const double J32 = J(q,2,1,e);
--         const double J13 = J(q,0,2,e);
--         const double J23 = J(q,1,2,e);
--         const double J33 = J(q,2,2,e);
--         const double detJ = J11 * (J22 * J33 - J32 * J23) -
--                             J21 * (J12 * J33 - J32 * J13) +
--                             J31 * (J12 * J23 - J22 * J13);
--
--         const double c_detJ = W[q] / detJ;
--
--         if (coeffDim == 6 || coeffDim == 9) // Matrix coefficient version
--         {
--            // Set y to the 6 or 9 entries of J^T M J / det
--            const double M11 = C(0, q, e);
--            const double M12 = C(1, q, e);
--            const double M13 = C(2, q, e);
--            const double M21 = (!symmetric) ? C(3, q, e) : M12;
--            const double M22 = (!symmetric) ? C(4, q, e) : C(3, q, e);
--            const double M23 = (!symmetric) ? C(5, q, e) : C(4, q, e);
--            const double M31 = (!symmetric) ? C(6, q, e) : M13;
--            const double M32 = (!symmetric) ? C(7, q, e) : M23;
--            const double M33 = (!symmetric) ? C(8, q, e) : C(5, q, e);
--
--            // First compute R = MJ
--            const double R11 = M11*J11 + M12*J21 + M13*J31;
--            const double R12 = M11*J12 + M12*J22 + M13*J32;
--            const double R13 = M11*J13 + M12*J23 + M13*J33;
--            const double R21 = M21*J11 + M22*J21 + M23*J31;
--            const double R22 = M21*J12 + M22*J22 + M23*J32;
--            const double R23 = M21*J13 + M22*J23 + M23*J33;
--            const double R31 = M31*J11 + M32*J21 + M33*J31;
--            const double R32 = M31*J12 + M32*J22 + M33*J32;
--            const double R33 = M31*J13 + M32*J23 + M33*J33;
--
--            // Now set y to J^T R / det
--            y(q,0,e) = c_detJ * (J11*R11 + J21*R21 + J31*R31); // 1,1
--            const double Y12 = c_detJ * (J11*R12 + J21*R22 + J31*R32);
--            y(q,1,e) = Y12; // 1,2
--            y(q,2,e) = c_detJ * (J11*R13 + J21*R23 + J31*R33); // 1,3
--
--            const double Y21 = c_detJ * (J12*R11 + J22*R21 + J32*R31);
--            const double Y22 = c_detJ * (J12*R12 + J22*R22 + J32*R32);
--            const double Y23 = c_detJ * (J12*R13 + J22*R23 + J32*R33);
--
--            const double Y33 = c_detJ * (J13*R13 + J23*R23 + J33*R33);
--
--            y(q,3,e) = symmetric ? Y22 : Y21; // 2,2 or 2,1
--            y(q,4,e) = symmetric ? Y23 : Y22; // 2,3 or 2,2
--            y(q,5,e) = symmetric ? Y33 : Y23; // 3,3 or 2,3
--
--            if (!symmetric)
--            {
--               y(q,6,e) = c_detJ * (J13*R11 + J23*R21 + J33*R31); // 3,1
--               y(q,7,e) = c_detJ * (J13*R12 + J23*R22 + J33*R32); // 3,2
--               y(q,8,e) = Y33; // 3,3
--            }
--         }
--         else  // Vector or scalar coefficient version
--         {
--            // Set y to the 6 entries of J^T D J / det^2
--            const double D1 = C(0, q, e);
--            const double D2 = coeffDim == 3 ? C(1, q, e) : D1;
--            const double D3 = coeffDim == 3 ? C(2, q, e) : D1;
--
--            y(q,0,e) = c_detJ * (D1*J11*J11 + D2*J21*J21 + D3*J31*J31); // 1,1
--            y(q,1,e) = c_detJ * (D1*J11*J12 + D2*J21*J22 + D3*J31*J32); // 1,2
--            y(q,2,e) = c_detJ * (D1*J11*J13 + D2*J21*J23 + D3*J31*J33); // 1,3
--            y(q,3,e) = c_detJ * (D1*J12*J12 + D2*J22*J22 + D3*J32*J32); // 2,2
--            y(q,4,e) = c_detJ * (D1*J12*J13 + D2*J22*J23 + D3*J32*J33); // 2,3
--            y(q,5,e) = c_detJ * (D1*J13*J13 + D2*J23*J23 + D3*J33*J33); // 3,3
--         }
--      }
--   });
--}
--
--// PA H(curl)-L2 assemble 2D kernel
--static void PACurlL2Setup2D(const int Q1D,
--                            const int NE,
--                            const Array<double> &w,
--                            Vector &coeff,
--                            Vector &op)
--{
--   const int NQ = Q1D*Q1D;
--   auto W = w.Read();
--   auto C = Reshape(coeff.Read(), NQ, NE);
--   auto y = Reshape(op.Write(), NQ, NE);
--   mfem::forall(NE, [=] MFEM_HOST_DEVICE (int e)
--   {
--      for (int q = 0; q < NQ; ++q)
--      {
--         y(q,e) = W[q] * C(q,e);
--      }
--   });
--}
--
--void CurlCurlIntegrator::AssemblePA(const FiniteElementSpace &fes)
--{
--   // Assumes tensor-product elements
--   Mesh *mesh = fes.GetMesh();
--   const FiniteElement *fel = fes.GetFE(0);
--
--   const VectorTensorFiniteElement *el =
--      dynamic_cast<const VectorTensorFiniteElement*>(fel);
--   MFEM_VERIFY(el != NULL, "Only VectorTensorFiniteElement is supported!");
--
--   const IntegrationRule *ir
--      = IntRule ? IntRule : &MassIntegrator::GetRule(*el, *el,
--                                                     *mesh->GetElementTransformation(0));
--
--   const int dims = el->GetDim();
--   MFEM_VERIFY(dims == 2 || dims == 3, "");
--
--   nq = ir->GetNPoints();
--   dim = mesh->Dimension();
--   MFEM_VERIFY(dim == 2 || dim == 3, "");
--
--   ne = fes.GetNE();
--   geom = mesh->GetGeometricFactors(*ir, GeometricFactors::JACOBIANS);
--   mapsC = &el->GetDofToQuad(*ir, DofToQuad::TENSOR);
--   mapsO = &el->GetDofToQuadOpen(*ir, DofToQuad::TENSOR);
--   dofs1D = mapsC->ndof;
--   quad1D = mapsC->nqpt;
--
--   MFEM_VERIFY(dofs1D == mapsO->ndof + 1 && quad1D == mapsO->nqpt, "");
--
--   QuadratureSpace qs(*mesh, *ir);
--   CoefficientVector coeff(qs, CoefficientStorage::SYMMETRIC);
--   if (Q) { coeff.Project(*Q); }
--   else if (MQ) { coeff.ProjectTranspose(*MQ); }
--   else if (DQ) { coeff.Project(*DQ); }
--   else { coeff.SetConstant(1.0); }
--
--   const int coeff_dim = coeff.GetVDim();
--   symmetric = (coeff_dim != dim*dim);
--   const int sym_dims = (dims * (dims + 1)) / 2; // 1x1: 1, 2x2: 3, 3x3: 6
--   const int ndata = (dim == 2) ? 1 : (symmetric ? sym_dims : dim*dim);
--   pa_data.SetSize(ndata * nq * ne, Device::GetMemoryType());
--
--   if (el->GetDerivType() != mfem::FiniteElement::CURL)
--   {
--      MFEM_ABORT("Unknown kernel.");
--   }
--
--   if (dim == 3)
--   {
--      PACurlCurlSetup3D(quad1D, coeff_dim, ne, ir->GetWeights(), geom->J, coeff,
--                        pa_data);
--   }
--   else
--   {
--      PACurlCurlSetup2D(quad1D, ne, ir->GetWeights(), geom->J, coeff, pa_data);
--   }
--}
--
--static void PACurlCurlApply2D(const int D1D,
--                              const int Q1D,
--                              const int NE,
--                              const Array<double> &bo,
--                              const Array<double> &bot,
--                              const Array<double> &gc,
--                              const Array<double> &gct,
--                              const Vector &pa_data,
--                              const Vector &x,
--                              Vector &y)
--{
--   constexpr static int VDIM = 2;
--   constexpr static int MAX_D1D = HCURL_MAX_D1D;
--   constexpr static int MAX_Q1D = HCURL_MAX_Q1D;
--
--   auto Bo = Reshape(bo.Read(), Q1D, D1D-1);
--   auto Bot = Reshape(bot.Read(), D1D-1, Q1D);
--   auto Gc = Reshape(gc.Read(), Q1D, D1D);
--   auto Gct = Reshape(gct.Read(), D1D, Q1D);
--   auto op = Reshape(pa_data.Read(), Q1D, Q1D, NE);
--   auto X = Reshape(x.Read(), 2*(D1D-1)*D1D, NE);
--   auto Y = Reshape(y.ReadWrite(), 2*(D1D-1)*D1D, NE);
--
--   mfem::forall(NE, [=] MFEM_HOST_DEVICE (int e)
--   {
--      double curl[MAX_Q1D][MAX_Q1D];
--
--      // curl[qy][qx] will be computed as du_y/dx - du_x/dy
--
--      for (int qy = 0; qy < Q1D; ++qy)
--      {
--         for (int qx = 0; qx < Q1D; ++qx)
--         {
--            curl[qy][qx] = 0.0;
--         }
--      }
--
--      int osc = 0;
--
--      for (int c = 0; c < VDIM; ++c)  // loop over x, y components
--      {
--         const int D1Dy = (c == 1) ? D1D - 1 : D1D;
--         const int D1Dx = (c == 0) ? D1D - 1 : D1D;
--
--         for (int dy = 0; dy < D1Dy; ++dy)
--         {
--            double gradX[MAX_Q1D];
--            for (int qx = 0; qx < Q1D; ++qx)
--            {
--               gradX[qx] = 0;
--            }
--
--            for (int dx = 0; dx < D1Dx; ++dx)
--            {
--               const double t = X(dx + (dy * D1Dx) + osc, e);
--               for (int qx = 0; qx < Q1D; ++qx)
--               {
--                  gradX[qx] += t * ((c == 0) ? Bo(qx,dx) : Gc(qx,dx));
--               }
--            }
--
--            for (int qy = 0; qy < Q1D; ++qy)
--            {
--               const double wy = (c == 0) ? -Gc(qy,dy) : Bo(qy,dy);
--               for (int qx = 0; qx < Q1D; ++qx)
--               {
--                  curl[qy][qx] += gradX[qx] * wy;
--               }
--            }
--         }
--
--         osc += D1Dx * D1Dy;
--      }  // loop (c) over components
--
--      // Apply D operator.
--      for (int qy = 0; qy < Q1D; ++qy)
--      {
--         for (int qx = 0; qx < Q1D; ++qx)
--         {
--            curl[qy][qx] *= op(qx,qy,e);
--         }
--      }
--
--      for (int qy = 0; qy < Q1D; ++qy)
--      {
--         osc = 0;
--
--         for (int c = 0; c < VDIM; ++c)  // loop over x, y components
--         {
--            const int D1Dy = (c == 1) ? D1D - 1 : D1D;
--            const int D1Dx = (c == 0) ? D1D - 1 : D1D;
--
--            double gradX[MAX_D1D];
--            for (int dx = 0; dx < D1Dx; ++dx)
--            {
--               gradX[dx] = 0.0;
--            }
--            for (int qx = 0; qx < Q1D; ++qx)
--            {
--               for (int dx = 0; dx < D1Dx; ++dx)
--               {
--                  gradX[dx] += curl[qy][qx] * ((c == 0) ? Bot(dx,qx) : Gct(dx,qx));
--               }
--            }
--            for (int dy = 0; dy < D1Dy; ++dy)
--            {
--               const double wy = (c == 0) ? -Gct(dy,qy) : Bot(dy,qy);
--
--               for (int dx = 0; dx < D1Dx; ++dx)
--               {
--                  Y(dx + (dy * D1Dx) + osc, e) += gradX[dx] * wy;
--               }
--            }
--
--            osc += D1Dx * D1Dy;
--         }  // loop c
--      }  // loop qy
--   }); // end of element loop
--}
--
--template<int MAX_D1D = HCURL_MAX_D1D, int MAX_Q1D = HCURL_MAX_Q1D>
--static void PACurlCurlApply3D(const int D1D,
--                              const int Q1D,
--                              const bool symmetric,
--                              const int NE,
--                              const Array<double> &bo,
--                              const Array<double> &bc,
--                              const Array<double> &bot,
--                              const Array<double> &bct,
--                              const Array<double> &gc,
--                              const Array<double> &gct,
--                              const Vector &pa_data,
--                              const Vector &x,
--                              Vector &y)
--{
--   MFEM_VERIFY(D1D <= MAX_D1D, "Error: D1D > MAX_D1D");
--   MFEM_VERIFY(Q1D <= MAX_Q1D, "Error: Q1D > MAX_Q1D");
--   // Using (\nabla\times u) F = 1/det(dF) dF \hat{\nabla}\times\hat{u} (p. 78 of Monk), we get
--   // (\nabla\times u) \cdot (\nabla\times v) = 1/det(dF)^2 \hat{\nabla}\times\hat{u}^T dF^T dF \hat{\nabla}\times\hat{v}
--   // If c = 0, \hat{\nabla}\times\hat{u} reduces to [0, (u_0)_{x_2}, -(u_0)_{x_1}]
--   // If c = 1, \hat{\nabla}\times\hat{u} reduces to [-(u_1)_{x_2}, 0, (u_1)_{x_0}]
--   // If c = 2, \hat{\nabla}\times\hat{u} reduces to [(u_2)_{x_1}, -(u_2)_{x_0}, 0]
--
--   constexpr static int VDIM = 3;
--
--   auto Bo = Reshape(bo.Read(), Q1D, D1D-1);
--   auto Bc = Reshape(bc.Read(), Q1D, D1D);
--   auto Bot = Reshape(bot.Read(), D1D-1, Q1D);
--   auto Bct = Reshape(bct.Read(), D1D, Q1D);
--   auto Gc = Reshape(gc.Read(), Q1D, D1D);
--   auto Gct = Reshape(gct.Read(), D1D, Q1D);
--   auto op = Reshape(pa_data.Read(), Q1D, Q1D, Q1D, (symmetric ? 6 : 9), NE);
--   auto X = Reshape(x.Read(), 3*(D1D-1)*D1D*D1D, NE);
--   auto Y = Reshape(y.ReadWrite(), 3*(D1D-1)*D1D*D1D, NE);
--
--   mfem::forall(NE, [=] MFEM_HOST_DEVICE (int e)
--   {
--      double curl[MAX_Q1D][MAX_Q1D][MAX_Q1D][VDIM];
--      // curl[qz][qy][qx] will be computed as the vector curl at each quadrature point.
--
--      for (int qz = 0; qz < Q1D; ++qz)
--      {
--         for (int qy = 0; qy < Q1D; ++qy)
--         {
--            for (int qx = 0; qx < Q1D; ++qx)
--            {
--               for (int c = 0; c < VDIM; ++c)
--               {
--                  curl[qz][qy][qx][c] = 0.0;
--               }
--            }
--         }
--      }
--
--      // We treat x, y, z components separately for optimization specific to each.
--
--      int osc = 0;
--
--      {
--         // x component
--         const int D1Dz = D1D;
--         const int D1Dy = D1D;
--         const int D1Dx = D1D - 1;
--
--         for (int dz = 0; dz < D1Dz; ++dz)
--         {
--            double gradXY[MAX_Q1D][MAX_Q1D][2];
--            for (int qy = 0; qy < Q1D; ++qy)
--            {
--               for (int qx = 0; qx < Q1D; ++qx)
--               {
--                  for (int d = 0; d < 2; ++d)
--                  {
--                     gradXY[qy][qx][d] = 0.0;
--                  }
--               }
--            }
--
--            for (int dy = 0; dy < D1Dy; ++dy)
--            {
--               double massX[MAX_Q1D];
--               for (int qx = 0; qx < Q1D; ++qx)
--               {
--                  massX[qx] = 0.0;
--               }
--
--               for (int dx = 0; dx < D1Dx; ++dx)
--               {
--                  const double t = X(dx + ((dy + (dz * D1Dy)) * D1Dx) + osc, e);
--                  for (int qx = 0; qx < Q1D; ++qx)
--                  {
--                     massX[qx] += t * Bo(qx,dx);
--                  }
--               }
--
--               for (int qy = 0; qy < Q1D; ++qy)
--               {
--                  const double wy = Bc(qy,dy);
--                  const double wDy = Gc(qy,dy);
--                  for (int qx = 0; qx < Q1D; ++qx)
--                  {
--                     const double wx = massX[qx];
--                     gradXY[qy][qx][0] += wx * wDy;
--                     gradXY[qy][qx][1] += wx * wy;
--                  }
--               }
--            }
--
--            for (int qz = 0; qz < Q1D; ++qz)
--            {
--               const double wz = Bc(qz,dz);
--               const double wDz = Gc(qz,dz);
--               for (int qy = 0; qy < Q1D; ++qy)
--               {
--                  for (int qx = 0; qx < Q1D; ++qx)
--                  {
--                     // \hat{\nabla}\times\hat{u} is [0, (u_0)_{x_2}, -(u_0)_{x_1}]
--                     curl[qz][qy][qx][1] += gradXY[qy][qx][1] * wDz; // (u_0)_{x_2}
--                     curl[qz][qy][qx][2] -= gradXY[qy][qx][0] * wz;  // -(u_0)_{x_1}
--                  }
--               }
--            }
--         }
--
--         osc += D1Dx * D1Dy * D1Dz;
--      }
--
--      {
--         // y component
--         const int D1Dz = D1D;
--         const int D1Dy = D1D - 1;
--         const int D1Dx = D1D;
--
--         for (int dz = 0; dz < D1Dz; ++dz)
--         {
--            double gradXY[MAX_Q1D][MAX_Q1D][2];
--            for (int qy = 0; qy < Q1D; ++qy)
--            {
--               for (int qx = 0; qx < Q1D; ++qx)
--               {
--                  for (int d = 0; d < 2; ++d)
--                  {
--                     gradXY[qy][qx][d] = 0.0;
--                  }
--               }
--            }
--
--            for (int dx = 0; dx < D1Dx; ++dx)
--            {
--               double massY[MAX_Q1D];
--               for (int qy = 0; qy < Q1D; ++qy)
--               {
--                  massY[qy] = 0.0;
--               }
--
--               for (int dy = 0; dy < D1Dy; ++dy)
--               {
--                  const double t = X(dx + ((dy + (dz * D1Dy)) * D1Dx) + osc, e);
--                  for (int qy = 0; qy < Q1D; ++qy)
--                  {
--                     massY[qy] += t * Bo(qy,dy);
--                  }
--               }
--
--               for (int qx = 0; qx < Q1D; ++qx)
--               {
--                  const double wx = Bc(qx,dx);
--                  const double wDx = Gc(qx,dx);
--                  for (int qy = 0; qy < Q1D; ++qy)
--                  {
--                     const double wy = massY[qy];
--                     gradXY[qy][qx][0] += wDx * wy;
--                     gradXY[qy][qx][1] += wx * wy;
--                  }
--               }
--            }
--
--            for (int qz = 0; qz < Q1D; ++qz)
--            {
--               const double wz = Bc(qz,dz);
--               const double wDz = Gc(qz,dz);
--               for (int qy = 0; qy < Q1D; ++qy)
--               {
--                  for (int qx = 0; qx < Q1D; ++qx)
--                  {
--                     // \hat{\nabla}\times\hat{u} is [-(u_1)_{x_2}, 0, (u_1)_{x_0}]
--                     curl[qz][qy][qx][0] -= gradXY[qy][qx][1] * wDz; // -(u_1)_{x_2}
--                     curl[qz][qy][qx][2] += gradXY[qy][qx][0] * wz;  // (u_1)_{x_0}
--                  }
--               }
--            }
--         }
--
--         osc += D1Dx * D1Dy * D1Dz;
--      }
--
--      {
--         // z component
--         const int D1Dz = D1D - 1;
--         const int D1Dy = D1D;
--         const int D1Dx = D1D;
--
--         for (int dx = 0; dx < D1Dx; ++dx)
--         {
--            double gradYZ[MAX_Q1D][MAX_Q1D][2];
--            for (int qz = 0; qz < Q1D; ++qz)
--            {
--               for (int qy = 0; qy < Q1D; ++qy)
--               {
--                  for (int d = 0; d < 2; ++d)
--                  {
--                     gradYZ[qz][qy][d] = 0.0;
--                  }
--               }
--            }
--
--            for (int dy = 0; dy < D1Dy; ++dy)
--            {
--               double massZ[MAX_Q1D];
--               for (int qz = 0; qz < Q1D; ++qz)
--               {
--                  massZ[qz] = 0.0;
--               }
--
--               for (int dz = 0; dz < D1Dz; ++dz)
--               {
--                  const double t = X(dx + ((dy + (dz * D1Dy)) * D1Dx) + osc, e);
--                  for (int qz = 0; qz < Q1D; ++qz)
--                  {
--                     massZ[qz] += t * Bo(qz,dz);
--                  }
--               }
--
--               for (int qy = 0; qy < Q1D; ++qy)
--               {
--                  const double wy = Bc(qy,dy);
--                  const double wDy = Gc(qy,dy);
--                  for (int qz = 0; qz < Q1D; ++qz)
--                  {
--                     const double wz = massZ[qz];
--                     gradYZ[qz][qy][0] += wz * wy;
--                     gradYZ[qz][qy][1] += wz * wDy;
--                  }
--               }
--            }
--
--            for (int qx = 0; qx < Q1D; ++qx)
--            {
--               const double wx = Bc(qx,dx);
--               const double wDx = Gc(qx,dx);
--
--               for (int qy = 0; qy < Q1D; ++qy)
--               {
--                  for (int qz = 0; qz < Q1D; ++qz)
--                  {
--                     // \hat{\nabla}\times\hat{u} is [(u_2)_{x_1}, -(u_2)_{x_0}, 0]
--                     curl[qz][qy][qx][0] += gradYZ[qz][qy][1] * wx;  // (u_2)_{x_1}
--                     curl[qz][qy][qx][1] -= gradYZ[qz][qy][0] * wDx; // -(u_2)_{x_0}
--                  }
--               }
--            }
--         }
--      }
--
--      // Apply D operator.
--      for (int qz = 0; qz < Q1D; ++qz)
--      {
--         for (int qy = 0; qy < Q1D; ++qy)
--         {
--            for (int qx = 0; qx < Q1D; ++qx)
--            {
--               const double O11 = op(qx,qy,qz,0,e);
--               const double O12 = op(qx,qy,qz,1,e);
--               const double O13 = op(qx,qy,qz,2,e);
--               const double O21 = symmetric ? O12 : op(qx,qy,qz,3,e);
--               const double O22 = symmetric ? op(qx,qy,qz,3,e) : op(qx,qy,qz,4,e);
--               const double O23 = symmetric ? op(qx,qy,qz,4,e) : op(qx,qy,qz,5,e);
--               const double O31 = symmetric ? O13 : op(qx,qy,qz,6,e);
--               const double O32 = symmetric ? O23 : op(qx,qy,qz,7,e);
--               const double O33 = symmetric ? op(qx,qy,qz,5,e) : op(qx,qy,qz,8,e);
--
--               const double c1 = (O11 * curl[qz][qy][qx][0]) + (O12 * curl[qz][qy][qx][1]) +
--                                 (O13 * curl[qz][qy][qx][2]);
--               const double c2 = (O21 * curl[qz][qy][qx][0]) + (O22 * curl[qz][qy][qx][1]) +
--                                 (O23 * curl[qz][qy][qx][2]);
--               const double c3 = (O31 * curl[qz][qy][qx][0]) + (O32 * curl[qz][qy][qx][1]) +
--                                 (O33 * curl[qz][qy][qx][2]);
--
--               curl[qz][qy][qx][0] = c1;
--               curl[qz][qy][qx][1] = c2;
--               curl[qz][qy][qx][2] = c3;
--            }
--         }
--      }
--
--      // x component
--      osc = 0;
--      {
--         const int D1Dz = D1D;
--         const int D1Dy = D1D;
--         const int D1Dx = D1D - 1;
--
--         for (int qz = 0; qz < Q1D; ++qz)
--         {
--            double gradXY12[MAX_D1D][MAX_D1D];
--            double gradXY21[MAX_D1D][MAX_D1D];
--
--            for (int dy = 0; dy < D1Dy; ++dy)
--            {
--               for (int dx = 0; dx < D1Dx; ++dx)
--               {
--                  gradXY12[dy][dx] = 0.0;
--                  gradXY21[dy][dx] = 0.0;
--               }
--            }
--            for (int qy = 0; qy < Q1D; ++qy)
--            {
--               double massX[MAX_D1D][2];
--               for (int dx = 0; dx < D1Dx; ++dx)
--               {
--                  for (int n = 0; n < 2; ++n)
--                  {
--                     massX[dx][n] = 0.0;
--                  }
--               }
--               for (int qx = 0; qx < Q1D; ++qx)
--               {
--                  for (int dx = 0; dx < D1Dx; ++dx)
--                  {
--                     const double wx = Bot(dx,qx);
--
--                     massX[dx][0] += wx * curl[qz][qy][qx][1];
--                     massX[dx][1] += wx * curl[qz][qy][qx][2];
--                  }
--               }
--               for (int dy = 0; dy < D1Dy; ++dy)
--               {
--                  const double wy = Bct(dy,qy);
--                  const double wDy = Gct(dy,qy);
--
--                  for (int dx = 0; dx < D1Dx; ++dx)
--                  {
--                     gradXY21[dy][dx] += massX[dx][0] * wy;
--                     gradXY12[dy][dx] += massX[dx][1] * wDy;
--                  }
--               }
--            }
--
--            for (int dz = 0; dz < D1Dz; ++dz)
--            {
--               const double wz = Bct(dz,qz);
--               const double wDz = Gct(dz,qz);
--               for (int dy = 0; dy < D1Dy; ++dy)
--               {
--                  for (int dx = 0; dx < D1Dx; ++dx)
--                  {
--                     // \hat{\nabla}\times\hat{u} is [0, (u_0)_{x_2}, -(u_0)_{x_1}]
--                     // (u_0)_{x_2} * (op * curl)_1 - (u_0)_{x_1} * (op * curl)_2
--                     Y(dx + ((dy + (dz * D1Dy)) * D1Dx) + osc,
--                       e) += (gradXY21[dy][dx] * wDz) - (gradXY12[dy][dx] * wz);
--                  }
--               }
--            }
--         }  // loop qz
--
--         osc += D1Dx * D1Dy * D1Dz;
--      }
--
--      // y component
--      {
--         const int D1Dz = D1D;
--         const int D1Dy = D1D - 1;
--         const int D1Dx = D1D;
--
--         for (int qz = 0; qz < Q1D; ++qz)
--         {
--            double gradXY02[MAX_D1D][MAX_D1D];
--            double gradXY20[MAX_D1D][MAX_D1D];
--
--            for (int dy = 0; dy < D1Dy; ++dy)
--            {
--               for (int dx = 0; dx < D1Dx; ++dx)
--               {
--                  gradXY02[dy][dx] = 0.0;
--                  gradXY20[dy][dx] = 0.0;
--               }
--            }
--            for (int qx = 0; qx < Q1D; ++qx)
--            {
--               double massY[MAX_D1D][2];
--               for (int dy = 0; dy < D1Dy; ++dy)
--               {
--                  massY[dy][0] = 0.0;
--                  massY[dy][1] = 0.0;
--               }
--               for (int qy = 0; qy < Q1D; ++qy)
--               {
--                  for (int dy = 0; dy < D1Dy; ++dy)
--                  {
--                     const double wy = Bot(dy,qy);
--
--                     massY[dy][0] += wy * curl[qz][qy][qx][2];
--                     massY[dy][1] += wy * curl[qz][qy][qx][0];
--                  }
--               }
--               for (int dx = 0; dx < D1Dx; ++dx)
--               {
--                  const double wx = Bct(dx,qx);
--                  const double wDx = Gct(dx,qx);
--
--                  for (int dy = 0; dy < D1Dy; ++dy)
--                  {
--                     gradXY02[dy][dx] += massY[dy][0] * wDx;
--                     gradXY20[dy][dx] += massY[dy][1] * wx;
--                  }
--               }
--            }
--
--            for (int dz = 0; dz < D1Dz; ++dz)
--            {
--               const double wz = Bct(dz,qz);
--               const double wDz = Gct(dz,qz);
--               for (int dy = 0; dy < D1Dy; ++dy)
--               {
--                  for (int dx = 0; dx < D1Dx; ++dx)
--                  {
--                     // \hat{\nabla}\times\hat{u} is [-(u_1)_{x_2}, 0, (u_1)_{x_0}]
--                     // -(u_1)_{x_2} * (op * curl)_0 + (u_1)_{x_0} * (op * curl)_2
--                     Y(dx + ((dy + (dz * D1Dy)) * D1Dx) + osc,
--                       e) += (-gradXY20[dy][dx] * wDz) + (gradXY02[dy][dx] * wz);
--                  }
--               }
--            }
--         }  // loop qz
--
--         osc += D1Dx * D1Dy * D1Dz;
--      }
--
--      // z component
--      {
--         const int D1Dz = D1D - 1;
--         const int D1Dy = D1D;
--         const int D1Dx = D1D;
--
--         for (int qx = 0; qx < Q1D; ++qx)
--         {
--            double gradYZ01[MAX_D1D][MAX_D1D];
--            double gradYZ10[MAX_D1D][MAX_D1D];
--
--            for (int dy = 0; dy < D1Dy; ++dy)
--            {
--               for (int dz = 0; dz < D1Dz; ++dz)
--               {
--                  gradYZ01[dz][dy] = 0.0;
--                  gradYZ10[dz][dy] = 0.0;
--               }
--            }
--            for (int qy = 0; qy < Q1D; ++qy)
--            {
--               double massZ[MAX_D1D][2];
--               for (int dz = 0; dz < D1Dz; ++dz)
--               {
--                  for (int n = 0; n < 2; ++n)
--                  {
--                     massZ[dz][n] = 0.0;
--                  }
--               }
--               for (int qz = 0; qz < Q1D; ++qz)
--               {
--                  for (int dz = 0; dz < D1Dz; ++dz)
--                  {
--                     const double wz = Bot(dz,qz);
--
--                     massZ[dz][0] += wz * curl[qz][qy][qx][0];
--                     massZ[dz][1] += wz * curl[qz][qy][qx][1];
--                  }
--               }
--               for (int dy = 0; dy < D1Dy; ++dy)
--               {
--                  const double wy = Bct(dy,qy);
--                  const double wDy = Gct(dy,qy);
--
--                  for (int dz = 0; dz < D1Dz; ++dz)
--                  {
--                     gradYZ01[dz][dy] += wy * massZ[dz][1];
--                     gradYZ10[dz][dy] += wDy * massZ[dz][0];
--                  }
--               }
--            }
--
--            for (int dx = 0; dx < D1Dx; ++dx)
--            {
--               const double wx = Bct(dx,qx);
--               const double wDx = Gct(dx,qx);
--
--               for (int dy = 0; dy < D1Dy; ++dy)
--               {
--                  for (int dz = 0; dz < D1Dz; ++dz)
--                  {
--                     // \hat{\nabla}\times\hat{u} is [(u_2)_{x_1}, -(u_2)_{x_0}, 0]
--                     // (u_2)_{x_1} * (op * curl)_0 - (u_2)_{x_0} * (op * curl)_1
--                     Y(dx + ((dy + (dz * D1Dy)) * D1Dx) + osc,
--                       e) += (gradYZ10[dz][dy] * wx) - (gradYZ01[dz][dy] * wDx);
--                  }
--               }
--            }
--         }  // loop qx
--      }
--   }); // end of element loop
--}
--
--template<int MAX_D1D = HCURL_MAX_D1D, int MAX_Q1D = HCURL_MAX_Q1D>
--static void SmemPACurlCurlApply3D(const int D1D,
--                                  const int Q1D,
--                                  const bool symmetric,
--                                  const int NE,
--                                  const Array<double> &bo,
--                                  const Array<double> &bc,
--                                  const Array<double> &bot,
--                                  const Array<double> &bct,
--                                  const Array<double> &gc,
--                                  const Array<double> &gct,
--                                  const Vector &pa_data,
--                                  const Vector &x,
--                                  Vector &y)
--{
--   MFEM_VERIFY(D1D <= MAX_D1D, "Error: D1D > MAX_D1D");
--   MFEM_VERIFY(Q1D <= MAX_Q1D, "Error: Q1D > MAX_Q1D");
--   // Using (\nabla\times u) F = 1/det(dF) dF \hat{\nabla}\times\hat{u} (p. 78 of Monk), we get
--   // (\nabla\times u) \cdot (\nabla\times v) = 1/det(dF)^2 \hat{\nabla}\times\hat{u}^T dF^T dF \hat{\nabla}\times\hat{v}
--   // If c = 0, \hat{\nabla}\times\hat{u} reduces to [0, (u_0)_{x_2}, -(u_0)_{x_1}]
--   // If c = 1, \hat{\nabla}\times\hat{u} reduces to [-(u_1)_{x_2}, 0, (u_1)_{x_0}]
--   // If c = 2, \hat{\nabla}\times\hat{u} reduces to [(u_2)_{x_1}, -(u_2)_{x_0}, 0]
--
--   auto Bo = Reshape(bo.Read(), Q1D, D1D-1);
--   auto Bc = Reshape(bc.Read(), Q1D, D1D);
--   auto Gc = Reshape(gc.Read(), Q1D, D1D);
--   auto op = Reshape(pa_data.Read(), Q1D, Q1D, Q1D, symmetric ? 6 : 9, NE);
--   auto X = Reshape(x.Read(), 3*(D1D-1)*D1D*D1D, NE);
--   auto Y = Reshape(y.ReadWrite(), 3*(D1D-1)*D1D*D1D, NE);
--
--   const int s = symmetric ? 6 : 9;
--
--   auto device_kernel = [=] MFEM_DEVICE (int e)
--   {
--      constexpr int VDIM = 3;
--
--      MFEM_SHARED double sBo[MAX_D1D][MAX_Q1D];
--      MFEM_SHARED double sBc[MAX_D1D][MAX_Q1D];
--      MFEM_SHARED double sGc[MAX_D1D][MAX_Q1D];
--
--      double ope[9];
--      MFEM_SHARED double sop[9][MAX_Q1D][MAX_Q1D];
--      MFEM_SHARED double curl[MAX_Q1D][MAX_Q1D][3];
--
--      MFEM_SHARED double sX[MAX_D1D][MAX_D1D][MAX_D1D];
--
--      MFEM_FOREACH_THREAD(qx,x,Q1D)
--      {
--         MFEM_FOREACH_THREAD(qy,y,Q1D)
--         {
--            MFEM_FOREACH_THREAD(qz,z,Q1D)
--            {
--               for (int i=0; i<s; ++i)
--               {
--                  ope[i] = op(qx,qy,qz,i,e);
--               }
--            }
--         }
--      }
--
--      const int tidx = MFEM_THREAD_ID(x);
--      const int tidy = MFEM_THREAD_ID(y);
--      const int tidz = MFEM_THREAD_ID(z);
--
--      if (tidz == 0)
--      {
--         MFEM_FOREACH_THREAD(d,y,D1D)
--         {
--            MFEM_FOREACH_THREAD(q,x,Q1D)
--            {
--               sBc[d][q] = Bc(q,d);
--               sGc[d][q] = Gc(q,d);
--               if (d < D1D-1)
--               {
--                  sBo[d][q] = Bo(q,d);
--               }
--            }
--         }
--      }
--      MFEM_SYNC_THREAD;
--
--      for (int qz=0; qz < Q1D; ++qz)
--      {
--         if (tidz == qz)
--         {
--            MFEM_FOREACH_THREAD(qy,y,Q1D)
--            {
--               MFEM_FOREACH_THREAD(qx,x,Q1D)
--               {
--                  for (int i=0; i<3; ++i)
--                  {
--                     curl[qy][qx][i] = 0.0;
--                  }
--               }
--            }
--         }
--
--         int osc = 0;
--         for (int c = 0; c < VDIM; ++c)  // loop over x, y, z components
--         {
--            const int D1Dz = (c == 2) ? D1D - 1 : D1D;
--            const int D1Dy = (c == 1) ? D1D - 1 : D1D;
--            const int D1Dx = (c == 0) ? D1D - 1 : D1D;
--
--            MFEM_FOREACH_THREAD(dz,z,D1Dz)
--            {
--               MFEM_FOREACH_THREAD(dy,y,D1Dy)
--               {
--                  MFEM_FOREACH_THREAD(dx,x,D1Dx)
--                  {
--                     sX[dz][dy][dx] = X(dx + ((dy + (dz * D1Dy)) * D1Dx) + osc, e);
--                  }
--               }
--            }
--            MFEM_SYNC_THREAD;
--
--            if (tidz == qz)
--            {
--               if (c == 0)
--               {
--                  for (int i=0; i<s; ++i)
--                  {
--                     sop[i][tidx][tidy] = ope[i];
--                  }
--               }
--
--               MFEM_FOREACH_THREAD(qy,y,Q1D)
--               {
--                  MFEM_FOREACH_THREAD(qx,x,Q1D)
--                  {
--                     double u = 0.0;
--                     double v = 0.0;
--
--                     // We treat x, y, z components separately for optimization specific to each.
--                     if (c == 0) // x component
--                     {
--                        // \hat{\nabla}\times\hat{u} is [0, (u_0)_{x_2}, -(u_0)_{x_1}]
--
--                        for (int dz = 0; dz < D1Dz; ++dz)
--                        {
--                           const double wz = sBc[dz][qz];
--                           const double wDz = sGc[dz][qz];
--
--                           for (int dy = 0; dy < D1Dy; ++dy)
--                           {
--                              const double wy = sBc[dy][qy];
--                              const double wDy = sGc[dy][qy];
--
--                              for (int dx = 0; dx < D1Dx; ++dx)
--                              {
--                                 const double wx = sX[dz][dy][dx] * sBo[dx][qx];
--                                 u += wx * wDy * wz;
--                                 v += wx * wy * wDz;
--                              }
--                           }
--                        }
--
--                        curl[qy][qx][1] += v; // (u_0)_{x_2}
--                        curl[qy][qx][2] -= u;  // -(u_0)_{x_1}
--                     }
--                     else if (c == 1)  // y component
--                     {
--                        // \hat{\nabla}\times\hat{u} is [-(u_1)_{x_2}, 0, (u_1)_{x_0}]
--
--                        for (int dz = 0; dz < D1Dz; ++dz)
--                        {
--                           const double wz = sBc[dz][qz];
--                           const double wDz = sGc[dz][qz];
--
--                           for (int dy = 0; dy < D1Dy; ++dy)
--                           {
--                              const double wy = sBo[dy][qy];
--
--                              for (int dx = 0; dx < D1Dx; ++dx)
--                              {
--                                 const double t = sX[dz][dy][dx];
--                                 const double wx = t * sBc[dx][qx];
--                                 const double wDx = t * sGc[dx][qx];
--
--                                 u += wDx * wy * wz;
--                                 v += wx * wy * wDz;
--                              }
--                           }
--                        }
--
--                        curl[qy][qx][0] -= v; // -(u_1)_{x_2}
--                        curl[qy][qx][2] += u; // (u_1)_{x_0}
--                     }
--                     else // z component
--                     {
--                        // \hat{\nabla}\times\hat{u} is [(u_2)_{x_1}, -(u_2)_{x_0}, 0]
--
--                        for (int dz = 0; dz < D1Dz; ++dz)
--                        {
--                           const double wz = sBo[dz][qz];
--
--                           for (int dy = 0; dy < D1Dy; ++dy)
--                           {
--                              const double wy = sBc[dy][qy];
--                              const double wDy = sGc[dy][qy];
--
--                              for (int dx = 0; dx < D1Dx; ++dx)
--                              {
--                                 const double t = sX[dz][dy][dx];
--                                 const double wx = t * sBc[dx][qx];
--                                 const double wDx = t * sGc[dx][qx];
--
--                                 u += wDx * wy * wz;
--                                 v += wx * wDy * wz;
--                              }
--                           }
--                        }
--
--                        curl[qy][qx][0] += v; // (u_2)_{x_1}
--                        curl[qy][qx][1] -= u; // -(u_2)_{x_0}
--                     }
--                  } // qx
--               } // qy
--            } // tidz == qz
--
--            osc += D1Dx * D1Dy * D1Dz;
--            MFEM_SYNC_THREAD;
--         } // c
--
--         double dxyz1 = 0.0;
--         double dxyz2 = 0.0;
--         double dxyz3 = 0.0;
--
--         MFEM_FOREACH_THREAD(dz,z,D1D)
--         {
--            const double wcz = sBc[dz][qz];
--            const double wcDz = sGc[dz][qz];
--            const double wz = (dz < D1D-1) ? sBo[dz][qz] : 0.0;
--
--            MFEM_FOREACH_THREAD(dy,y,D1D)
--            {
--               MFEM_FOREACH_THREAD(dx,x,D1D)
--               {
--                  for (int qy = 0; qy < Q1D; ++qy)
--                  {
--                     const double wcy = sBc[dy][qy];
--                     const double wcDy = sGc[dy][qy];
--                     const double wy = (dy < D1D-1) ? sBo[dy][qy] : 0.0;
--
--                     for (int qx = 0; qx < Q1D; ++qx)
--                     {
--                        const double O11 = sop[0][qx][qy];
--                        const double O12 = sop[1][qx][qy];
--                        const double O13 = sop[2][qx][qy];
--                        const double O21 = symmetric ? O12 : sop[3][qx][qy];
--                        const double O22 = symmetric ? sop[3][qx][qy] : sop[4][qx][qy];
--                        const double O23 = symmetric ? sop[4][qx][qy] : sop[5][qx][qy];
--                        const double O31 = symmetric ? O13 : sop[6][qx][qy];
--                        const double O32 = symmetric ? O23 : sop[7][qx][qy];
--                        const double O33 = symmetric ? sop[5][qx][qy] : sop[8][qx][qy];
--
--                        const double c1 = (O11 * curl[qy][qx][0]) + (O12 * curl[qy][qx][1]) +
--                                          (O13 * curl[qy][qx][2]);
--                        const double c2 = (O21 * curl[qy][qx][0]) + (O22 * curl[qy][qx][1]) +
--                                          (O23 * curl[qy][qx][2]);
--                        const double c3 = (O31 * curl[qy][qx][0]) + (O32 * curl[qy][qx][1]) +
--                                          (O33 * curl[qy][qx][2]);
--
--                        const double wcx = sBc[dx][qx];
--                        const double wDx = sGc[dx][qx];
--
--                        if (dx < D1D-1)
--                        {
--                           // \hat{\nabla}\times\hat{u} is [0, (u_0)_{x_2}, -(u_0)_{x_1}]
--                           // (u_0)_{x_2} * (op * curl)_1 - (u_0)_{x_1} * (op * curl)_2
--                           const double wx = sBo[dx][qx];
--                           dxyz1 += (wx * c2 * wcy * wcDz) - (wx * c3 * wcDy * wcz);
--                        }
--
--                        // \hat{\nabla}\times\hat{u} is [-(u_1)_{x_2}, 0, (u_1)_{x_0}]
--                        // -(u_1)_{x_2} * (op * curl)_0 + (u_1)_{x_0} * (op * curl)_2
--                        dxyz2 += (-wy * c1 * wcx * wcDz) + (wy * c3 * wDx * wcz);
--
--                        // \hat{\nabla}\times\hat{u} is [(u_2)_{x_1}, -(u_2)_{x_0}, 0]
--                        // (u_2)_{x_1} * (op * curl)_0 - (u_2)_{x_0} * (op * curl)_1
--                        dxyz3 += (wcDy * wz * c1 * wcx) - (wcy * wz * c2 * wDx);
--                     } // qx
--                  } // qy
--               } // dx
--            } // dy
--         } // dz
--
--         MFEM_SYNC_THREAD;
--
--         MFEM_FOREACH_THREAD(dz,z,D1D)
--         {
--            MFEM_FOREACH_THREAD(dy,y,D1D)
--            {
--               MFEM_FOREACH_THREAD(dx,x,D1D)
--               {
--                  if (dx < D1D-1)
--                  {
--                     Y(dx + ((dy + (dz * D1D)) * (D1D-1)), e) += dxyz1;
--                  }
--                  if (dy < D1D-1)
--                  {
--                     Y(dx + ((dy + (dz * (D1D-1))) * D1D) + ((D1D-1)*D1D*D1D), e) += dxyz2;
--                  }
--                  if (dz < D1D-1)
--                  {
--                     Y(dx + ((dy + (dz * D1D)) * D1D) + (2*(D1D-1)*D1D*D1D), e) += dxyz3;
--                  }
--               }
--            }
--         }
--      } // qz
--   }; // end of element loop
--
--   auto host_kernel = [&] MFEM_LAMBDA (int)
--   {
--      MFEM_ABORT_KERNEL("This kernel should only be used on GPU.");
--   };
--
--   ForallWrap<3>(true, NE, device_kernel, host_kernel, Q1D, Q1D, Q1D);
--}
--
--static void PACurlL2Apply2D(const int D1D,
--                            const int D1Dtest,
--                            const int Q1D,
--                            const int NE,
--                            const Array<double> &bo,
--                            const Array<double> &bot,
--                            const Array<double> &bt,
--                            const Array<double> &gc,
--                            const Vector &pa_data,
--                            const Vector &x, // trial = H(curl)
--                            Vector &y)  // test = L2 or H1
--{
--   constexpr static int VDIM = 2;
--   constexpr static int MAX_D1D = HCURL_MAX_D1D;
--   constexpr static int MAX_Q1D = HCURL_MAX_Q1D;
--   const int H1 = (D1Dtest == D1D);
--
--   MFEM_VERIFY(y.Size() == NE*D1Dtest*D1Dtest, "Test vector of wrong dimension");
--
--   auto Bo = Reshape(bo.Read(), Q1D, D1D-1);
--   auto Bot = Reshape(bot.Read(), D1D-1, Q1D);
--   auto Bt = Reshape(bt.Read(), D1D, Q1D);
--   auto Gc = Reshape(gc.Read(), Q1D, D1D);
--   auto op = Reshape(pa_data.Read(), Q1D, Q1D, NE);
--   auto X = Reshape(x.Read(), 2*(D1D-1)*D1D, NE);
--   auto Y = Reshape(y.ReadWrite(), D1Dtest, D1Dtest, NE);
--
--   mfem::forall(NE, [=] MFEM_HOST_DEVICE (int e)
--   {
--      double curl[MAX_Q1D][MAX_Q1D];
--
--      // curl[qy][qx] will be computed as du_y/dx - du_x/dy
--
--      for (int qy = 0; qy < Q1D; ++qy)
--      {
--         for (int qx = 0; qx < Q1D; ++qx)
--         {
--            curl[qy][qx] = 0.0;
--         }
--      }
--
--      int osc = 0;
--
--      for (int c = 0; c < VDIM; ++c)  // loop over x, y components
--      {
--         const int D1Dy = (c == 1) ? D1D - 1 : D1D;
--         const int D1Dx = (c == 0) ? D1D - 1 : D1D;
--
--         for (int dy = 0; dy < D1Dy; ++dy)
--         {
--            double gradX[MAX_Q1D];
--            for (int qx = 0; qx < Q1D; ++qx)
--            {
--               gradX[qx] = 0;
--            }
--
--            for (int dx = 0; dx < D1Dx; ++dx)
--            {
--               const double t = X(dx + (dy * D1Dx) + osc, e);
--               for (int qx = 0; qx < Q1D; ++qx)
--               {
--                  gradX[qx] += t * ((c == 0) ? Bo(qx,dx) : Gc(qx,dx));
--               }
--            }
--
--            for (int qy = 0; qy < Q1D; ++qy)
--            {
--               const double wy = (c == 0) ? -Gc(qy,dy) : Bo(qy,dy);
--               for (int qx = 0; qx < Q1D; ++qx)
--               {
--                  curl[qy][qx] += gradX[qx] * wy;
--               }
--            }
--         }
--
--         osc += D1Dx * D1Dy;
--      }  // loop (c) over components
--
--      // Apply D operator.
--      for (int qy = 0; qy < Q1D; ++qy)
--      {
--         for (int qx = 0; qx < Q1D; ++qx)
--         {
--            curl[qy][qx] *= op(qx,qy,e);
--         }
--      }
--
--      for (int qy = 0; qy < Q1D; ++qy)
--      {
--         double sol_x[MAX_D1D];
--         for (int dx = 0; dx < D1Dtest; ++dx)
--         {
--            sol_x[dx] = 0.0;
--         }
--         for (int qx = 0; qx < Q1D; ++qx)
--         {
--            const double s = curl[qy][qx];
--            for (int dx = 0; dx < D1Dtest; ++dx)
--            {
--               sol_x[dx] += s * ((H1 == 1) ? Bt(dx,qx) : Bot(dx,qx));
--            }
--         }
--         for (int dy = 0; dy < D1Dtest; ++dy)
--         {
--            const double wy = (H1 == 1) ? Bt(dy,qy) : Bot(dy,qy);
--
--            for (int dx = 0; dx < D1Dtest; ++dx)
--            {
--               Y(dx,dy,e) += sol_x[dx] * wy;
--            }
--         }
--      }  // loop qy
--   }); // end of element loop
--}
--
--static void PACurlL2ApplyTranspose2D(const int D1D,
--                                     const int D1Dtest,
--                                     const int Q1D,
--                                     const int NE,
--                                     const Array<double> &bo,
--                                     const Array<double> &bot,
--                                     const Array<double> &b,
--                                     const Array<double> &gct,
--                                     const Vector &pa_data,
--                                     const Vector &x, // trial = H(curl)
--                                     Vector &y)  // test = L2 or H1
--{
--   constexpr static int VDIM = 2;
--   constexpr static int MAX_D1D = HCURL_MAX_D1D;
--   constexpr static int MAX_Q1D = HCURL_MAX_Q1D;
--   const int H1 = (D1Dtest == D1D);
--
--   MFEM_VERIFY(x.Size() == NE*D1Dtest*D1Dtest, "Test vector of wrong dimension");
--
--   auto Bo = Reshape(bo.Read(), Q1D, D1D-1);
--   auto B = Reshape(b.Read(), Q1D, D1D);
--   auto Bot = Reshape(bot.Read(), D1D-1, Q1D);
--   auto Gct = Reshape(gct.Read(), D1D, Q1D);
--   auto op = Reshape(pa_data.Read(), Q1D, Q1D, NE);
--   auto X = Reshape(x.Read(), D1Dtest, D1Dtest, NE);
--   auto Y = Reshape(y.ReadWrite(), 2*(D1D-1)*D1D, NE);
--
--   mfem::forall(NE, [=] MFEM_HOST_DEVICE (int e)
--   {
--      double mass[MAX_Q1D][MAX_Q1D];
--
--      // Zero-order term in L2 or H1 test space
--
--      for (int qy = 0; qy < Q1D; ++qy)
--      {
--         for (int qx = 0; qx < Q1D; ++qx)
--         {
--            mass[qy][qx] = 0.0;
--         }
--      }
--
--      for (int dy = 0; dy < D1Dtest; ++dy)
--      {
--         double sol_x[MAX_Q1D];
--         for (int qy = 0; qy < Q1D; ++qy)
--         {
--            sol_x[qy] = 0.0;
--         }
--         for (int dx = 0; dx < D1Dtest; ++dx)
--         {
--            const double s = X(dx,dy,e);
--            for (int qx = 0; qx < Q1D; ++qx)
--            {
--               sol_x[qx] += s * ((H1 == 1) ? B(qx,dx) : Bo(qx,dx));
--            }
--         }
--         for (int qy = 0; qy < Q1D; ++qy)
--         {
--            const double d2q = (H1 == 1) ? B(qy,dy) : Bo(qy,dy);
--            for (int qx = 0; qx < Q1D; ++qx)
--            {
--               mass[qy][qx] += d2q * sol_x[qx];
--            }
--         }
--      }
--
--      // Apply D operator.
--      for (int qy = 0; qy < Q1D; ++qy)
--      {
--         for (int qx = 0; qx < Q1D; ++qx)
--         {
--            mass[qy][qx] *= op(qx,qy,e);
--         }
--      }
--
--      for (int qy = 0; qy < Q1D; ++qy)
--      {
--         int osc = 0;
--
--         for (int c = 0; c < VDIM; ++c)  // loop over x, y components
--         {
--            const int D1Dy = (c == 1) ? D1D - 1 : D1D;
--            const int D1Dx = (c == 0) ? D1D - 1 : D1D;
--
--            double gradX[MAX_D1D];
--            for (int dx = 0; dx < D1Dx; ++dx)
--            {
--               gradX[dx] = 0.0;
--            }
--            for (int qx = 0; qx < Q1D; ++qx)
--            {
--               for (int dx = 0; dx < D1Dx; ++dx)
--               {
--                  gradX[dx] += mass[qy][qx] * ((c == 0) ? Bot(dx,qx) : Gct(dx,qx));
--               }
--            }
--            for (int dy = 0; dy < D1Dy; ++dy)
--            {
--               const double wy = (c == 0) ? -Gct(dy,qy) : Bot(dy,qy);
--
--               for (int dx = 0; dx < D1Dx; ++dx)
--               {
--                  Y(dx + (dy * D1Dx) + osc, e) += gradX[dx] * wy;
--               }
--            }
--
--            osc += D1Dx * D1Dy;
--         }  // loop c
--      }  // loop qy
--   }); // end of element loop
--}
--
--void CurlCurlIntegrator::AddMultPA(const Vector &x, Vector &y) const
--{
--   if (dim == 3)
--   {
--      if (Device::Allows(Backend::DEVICE_MASK))
--      {
--         const int ID = (dofs1D << 4) | quad1D;
--         switch (ID)
--         {
--            case 0x23: return SmemPACurlCurlApply3D<2,3>(dofs1D, quad1D, symmetric, ne,
--                                                            mapsO->B, mapsC->B, mapsO->Bt,
--                                                            mapsC->Bt, mapsC->G, mapsC->Gt, pa_data, x, y);
--            case 0x34: return SmemPACurlCurlApply3D<3,4>(dofs1D, quad1D, symmetric, ne,
--                                                            mapsO->B, mapsC->B, mapsO->Bt,
--                                                            mapsC->Bt, mapsC->G, mapsC->Gt, pa_data, x, y);
--            case 0x45: return SmemPACurlCurlApply3D<4,5>(dofs1D, quad1D, symmetric, ne,
--                                                            mapsO->B,
--                                                            mapsC->B, mapsO->Bt,
--                                                            mapsC->Bt, mapsC->G, mapsC->Gt, pa_data, x, y);
--            case 0x56: return SmemPACurlCurlApply3D<5,6>(dofs1D, quad1D, symmetric, ne,
--                                                            mapsO->B, mapsC->B, mapsO->Bt,
--                                                            mapsC->Bt, mapsC->G, mapsC->Gt, pa_data, x, y);
--            default: return SmemPACurlCurlApply3D(dofs1D, quad1D, symmetric, ne, mapsO->B,
--                                                     mapsC->B, mapsO->Bt, mapsC->Bt,
--                                                     mapsC->G, mapsC->Gt, pa_data, x, y);
--         }
--      }
--      else
--         PACurlCurlApply3D(dofs1D, quad1D, symmetric, ne, mapsO->B, mapsC->B, mapsO->Bt,
--                           mapsC->Bt, mapsC->G, mapsC->Gt, pa_data, x, y);
--   }
--   else if (dim == 2)
--   {
--      PACurlCurlApply2D(dofs1D, quad1D, ne, mapsO->B, mapsO->Bt,
--                        mapsC->G, mapsC->Gt, pa_data, x, y);
--   }
--   else
--   {
--      MFEM_ABORT("Unsupported dimension!");
--   }
--}
--
--static void PACurlCurlAssembleDiagonal2D(const int D1D,
--                                         const int Q1D,
--                                         const int NE,
--                                         const Array<double> &bo,
--                                         const Array<double> &gc,
--                                         const Vector &pa_data,
--                                         Vector &diag)
--{
--   constexpr static int VDIM = 2;
--   constexpr static int MAX_Q1D = HCURL_MAX_Q1D;
--
--   auto Bo = Reshape(bo.Read(), Q1D, D1D-1);
--   auto Gc = Reshape(gc.Read(), Q1D, D1D);
--   auto op = Reshape(pa_data.Read(), Q1D, Q1D, NE);
--   auto D = Reshape(diag.ReadWrite(), 2*(D1D-1)*D1D, NE);
--
--   mfem::forall(NE, [=] MFEM_HOST_DEVICE (int e)
--   {
--      int osc = 0;
--
--      for (int c = 0; c < VDIM; ++c)  // loop over x, y components
--      {
--         const int D1Dy = (c == 1) ? D1D - 1 : D1D;
--         const int D1Dx = (c == 0) ? D1D - 1 : D1D;
--
--         double t[MAX_Q1D];
--
--         for (int dy = 0; dy < D1Dy; ++dy)
--         {
--            for (int qx = 0; qx < Q1D; ++qx)
--            {
--               t[qx] = 0.0;
--               for (int qy = 0; qy < Q1D; ++qy)
--               {
--                  const double wy = (c == 1) ? Bo(qy,dy) : -Gc(qy,dy);
--                  t[qx] += wy * wy * op(qx,qy,e);
--               }
--            }
--
--            for (int dx = 0; dx < D1Dx; ++dx)
--            {
--               for (int qx = 0; qx < Q1D; ++qx)
--               {
--                  const double wx = ((c == 0) ? Bo(qx,dx) : Gc(qx,dx));
--                  D(dx + (dy * D1Dx) + osc, e) += t[qx] * wx * wx;
--               }
--            }
--         }
--
--         osc += D1Dx * D1Dy;
--      }  // loop c
--   }); // end of element loop
--}
--
--template<int MAX_D1D = HCURL_MAX_D1D, int MAX_Q1D = HCURL_MAX_Q1D>
--static void PACurlCurlAssembleDiagonal3D(const int D1D,
--                                         const int Q1D,
--                                         const bool symmetric,
--                                         const int NE,
--                                         const Array<double> &bo,
--                                         const Array<double> &bc,
--                                         const Array<double> &go,
--                                         const Array<double> &gc,
--                                         const Vector &pa_data,
--                                         Vector &diag)
--{
--   constexpr static int VDIM = 3;
--   MFEM_VERIFY(D1D <= MAX_D1D, "Error: D1D > MAX_D1D");
--   MFEM_VERIFY(Q1D <= MAX_Q1D, "Error: Q1D > MAX_Q1D");
--
--   auto Bo = Reshape(bo.Read(), Q1D, D1D-1);
--   auto Bc = Reshape(bc.Read(), Q1D, D1D);
--   auto Go = Reshape(go.Read(), Q1D, D1D-1);
--   auto Gc = Reshape(gc.Read(), Q1D, D1D);
--   auto op = Reshape(pa_data.Read(), Q1D, Q1D, Q1D, (symmetric ? 6 : 9), NE);
--   auto D = Reshape(diag.ReadWrite(), 3*(D1D-1)*D1D*D1D, NE);
--
--   const int s = symmetric ? 6 : 9;
--   const int i11 = 0;
--   const int i12 = 1;
--   const int i13 = 2;
--   const int i21 = symmetric ? i12 : 3;
--   const int i22 = symmetric ? 3 : 4;
--   const int i23 = symmetric ? 4 : 5;
--   const int i31 = symmetric ? i13 : 6;
--   const int i32 = symmetric ? i23 : 7;
--   const int i33 = symmetric ? 5 : 8;
--
--   mfem::forall(NE, [=] MFEM_HOST_DEVICE (int e)
--   {
--      // Using (\nabla\times u) F = 1/det(dF) dF \hat{\nabla}\times\hat{u} (p. 78 of Monk), we get
--      // (\nabla\times u) \cdot (\nabla\times u) = 1/det(dF)^2 \hat{\nabla}\times\hat{u}^T dF^T dF \hat{\nabla}\times\hat{u}
--      // If c = 0, \hat{\nabla}\times\hat{u} reduces to [0, (u_0)_{x_2}, -(u_0)_{x_1}]
--      // If c = 1, \hat{\nabla}\times\hat{u} reduces to [-(u_1)_{x_2}, 0, (u_1)_{x_0}]
--      // If c = 2, \hat{\nabla}\times\hat{u} reduces to [(u_2)_{x_1}, -(u_2)_{x_0}, 0]
--
--      // For each c, we will keep 9 arrays for derivatives multiplied by the 9 entries of the 3x3 matrix (dF^T C dF),
--      // which may be non-symmetric depending on a possibly non-symmetric matrix coefficient.
--
--      int osc = 0;
--
--      for (int c = 0; c < VDIM; ++c)  // loop over x, y, z components
--      {
--         const int D1Dz = (c == 2) ? D1D - 1 : D1D;
--         const int D1Dy = (c == 1) ? D1D - 1 : D1D;
--         const int D1Dx = (c == 0) ? D1D - 1 : D1D;
--
--         double zt[MAX_Q1D][MAX_Q1D][MAX_D1D][9][3];
--
--         // z contraction
--         for (int qx = 0; qx < Q1D; ++qx)
--         {
--            for (int qy = 0; qy < Q1D; ++qy)
--            {
--               for (int dz = 0; dz < D1Dz; ++dz)
--               {
--                  for (int i=0; i<s; ++i)
--                  {
--                     for (int d=0; d<3; ++d)
--                     {
--                        zt[qx][qy][dz][i][d] = 0.0;
--                     }
--                  }
--
--                  for (int qz = 0; qz < Q1D; ++qz)
--                  {
--                     const double wz = ((c == 2) ? Bo(qz,dz) : Bc(qz,dz));
--                     const double wDz = ((c == 2) ? Go(qz,dz) : Gc(qz,dz));
--
--                     for (int i=0; i<s; ++i)
--                     {
--                        zt[qx][qy][dz][i][0] += wz * wz * op(qx,qy,qz,i,e);
--                        zt[qx][qy][dz][i][1] += wDz * wz * op(qx,qy,qz,i,e);
--                        zt[qx][qy][dz][i][2] += wDz * wDz * op(qx,qy,qz,i,e);
--                     }
--                  }
--               }
--            }
--         }  // end of z contraction
--
--         double yt[MAX_Q1D][MAX_D1D][MAX_D1D][9][3][3];
--
--         // y contraction
--         for (int qx = 0; qx < Q1D; ++qx)
--         {
--            for (int dz = 0; dz < D1Dz; ++dz)
--            {
--               for (int dy = 0; dy < D1Dy; ++dy)
--               {
--                  for (int i=0; i<s; ++i)
--                  {
--                     for (int d=0; d<3; ++d)
--                        for (int j=0; j<3; ++j)
--                        {
--                           yt[qx][dy][dz][i][d][j] = 0.0;
--                        }
--                  }
--
--                  for (int qy = 0; qy < Q1D; ++qy)
--                  {
--                     const double wy = ((c == 1) ? Bo(qy,dy) : Bc(qy,dy));
--                     const double wDy = ((c == 1) ? Go(qy,dy) : Gc(qy,dy));
--
--                     for (int i=0; i<s; ++i)
--                     {
--                        for (int d=0; d<3; ++d)
--                        {
--                           yt[qx][dy][dz][i][d][0] += wy * wy * zt[qx][qy][dz][i][d];
--                           yt[qx][dy][dz][i][d][1] += wDy * wy * zt[qx][qy][dz][i][d];
--                           yt[qx][dy][dz][i][d][2] += wDy * wDy * zt[qx][qy][dz][i][d];
--                        }
--                     }
--                  }
--               }
--            }
--         }  // end of y contraction
--
--         // x contraction
--         for (int dz = 0; dz < D1Dz; ++dz)
--         {
--            for (int dy = 0; dy < D1Dy; ++dy)
--            {
--               for (int dx = 0; dx < D1Dx; ++dx)
--               {
--                  for (int qx = 0; qx < Q1D; ++qx)
--                  {
--                     const double wx = ((c == 0) ? Bo(qx,dx) : Bc(qx,dx));
--                     const double wDx = ((c == 0) ? Go(qx,dx) : Gc(qx,dx));
--
--                     // Using (\nabla\times u) F = 1/det(dF) dF \hat{\nabla}\times\hat{u} (p. 78 of Monk), we get
--                     // (\nabla\times u) \cdot (\nabla\times u) = 1/det(dF)^2 \hat{\nabla}\times\hat{u}^T dF^T dF \hat{\nabla}\times\hat{u}
--                     // If c = 0, \hat{\nabla}\times\hat{u} reduces to [0, (u_0)_{x_2}, -(u_0)_{x_1}]
--                     // If c = 1, \hat{\nabla}\times\hat{u} reduces to [-(u_1)_{x_2}, 0, (u_1)_{x_0}]
--                     // If c = 2, \hat{\nabla}\times\hat{u} reduces to [(u_2)_{x_1}, -(u_2)_{x_0}, 0]
--
--                     /*
--                       const double O11 = op(q,0,e);
--                       const double O12 = op(q,1,e);
--                       const double O13 = op(q,2,e);
--                       const double O22 = op(q,3,e);
--                       const double O23 = op(q,4,e);
--                       const double O33 = op(q,5,e);
--                     */
--
--                     if (c == 0)
--                     {
--                        // (u_0)_{x_2} (O22 (u_0)_{x_2} - O23 (u_0)_{x_1}) - (u_0)_{x_1} (O32 (u_0)_{x_2} - O33 (u_0)_{x_1})
--                        const double sumy = yt[qx][dy][dz][i22][2][0] - yt[qx][dy][dz][i23][1][1]
--                                            - yt[qx][dy][dz][i32][1][1] + yt[qx][dy][dz][i33][0][2];
--
--                        D(dx + ((dy + (dz * D1Dy)) * D1Dx) + osc, e) += sumy * wx * wx;
--                     }
--                     else if (c == 1)
--                     {
--                        // (u_1)_{x_2} (O11 (u_1)_{x_2} - O13 (u_1)_{x_0}) + (u_1)_{x_0} (-O31 (u_1)_{x_2} + O33 (u_1)_{x_0})
--                        const double d = (yt[qx][dy][dz][i11][2][0] * wx * wx)
--                                         - ((yt[qx][dy][dz][i13][1][0] + yt[qx][dy][dz][i31][1][0]) * wDx * wx)
--                                         + (yt[qx][dy][dz][i33][0][0] * wDx * wDx);
--
--                        D(dx + ((dy + (dz * D1Dy)) * D1Dx) + osc, e) += d;
--                     }
--                     else
--                     {
--                        // (u_2)_{x_1} (O11 (u_2)_{x_1} - O12 (u_2)_{x_0}) - (u_2)_{x_0} (O21 (u_2)_{x_1} - O22 (u_2)_{x_0})
--                        const double d = (yt[qx][dy][dz][i11][0][2] * wx * wx)
--                                         - ((yt[qx][dy][dz][i12][0][1] + yt[qx][dy][dz][i21][0][1]) * wDx * wx)
--                                         + (yt[qx][dy][dz][i22][0][0] * wDx * wDx);
--
--                        D(dx + ((dy + (dz * D1Dy)) * D1Dx) + osc, e) += d;
--                     }
--                  }
--               }
--            }
--         }  // end of x contraction
--
--         osc += D1Dx * D1Dy * D1Dz;
--      }  // loop c
--   }); // end of element loop
--}
--
--template<int MAX_D1D = HCURL_MAX_D1D, int MAX_Q1D = HCURL_MAX_Q1D>
--static void SmemPACurlCurlAssembleDiagonal3D(const int D1D,
--                                             const int Q1D,
--                                             const bool symmetric,
--                                             const int NE,
--                                             const Array<double> &bo,
--                                             const Array<double> &bc,
--                                             const Array<double> &go,
--                                             const Array<double> &gc,
--                                             const Vector &pa_data,
--                                             Vector &diag)
--{
--   MFEM_VERIFY(D1D <= MAX_D1D, "Error: D1D > MAX_D1D");
--   MFEM_VERIFY(Q1D <= MAX_Q1D, "Error: Q1D > MAX_Q1D");
--
--   auto Bo = Reshape(bo.Read(), Q1D, D1D-1);
--   auto Bc = Reshape(bc.Read(), Q1D, D1D);
--   auto Go = Reshape(go.Read(), Q1D, D1D-1);
--   auto Gc = Reshape(gc.Read(), Q1D, D1D);
--   auto op = Reshape(pa_data.Read(), Q1D, Q1D, Q1D, (symmetric ? 6 : 9), NE);
--   auto D = Reshape(diag.ReadWrite(), 3*(D1D-1)*D1D*D1D, NE);
--
--   const int s = symmetric ? 6 : 9;
--   const int i11 = 0;
--   const int i12 = 1;
--   const int i13 = 2;
--   const int i21 = symmetric ? i12 : 3;
--   const int i22 = symmetric ? 3 : 4;
--   const int i23 = symmetric ? 4 : 5;
--   const int i31 = symmetric ? i13 : 6;
--   const int i32 = symmetric ? i23 : 7;
--   const int i33 = symmetric ? 5 : 8;
--
--   mfem::forall_3D(NE, Q1D, Q1D, Q1D, [=] MFEM_HOST_DEVICE (int e)
--   {
--      // Using (\nabla\times u) F = 1/det(dF) dF \hat{\nabla}\times\hat{u} (p. 78 of Monk), we get
--      // (\nabla\times u) \cdot (\nabla\times u) = 1/det(dF)^2 \hat{\nabla}\times\hat{u}^T dF^T dF \hat{\nabla}\times\hat{u}
--      // If c = 0, \hat{\nabla}\times\hat{u} reduces to [0, (u_0)_{x_2}, -(u_0)_{x_1}]
--      // If c = 1, \hat{\nabla}\times\hat{u} reduces to [-(u_1)_{x_2}, 0, (u_1)_{x_0}]
--      // If c = 2, \hat{\nabla}\times\hat{u} reduces to [(u_2)_{x_1}, -(u_2)_{x_0}, 0]
--
--      constexpr int VDIM = 3;
--
--      MFEM_SHARED double sBo[MAX_Q1D][MAX_D1D];
--      MFEM_SHARED double sBc[MAX_Q1D][MAX_D1D];
--      MFEM_SHARED double sGo[MAX_Q1D][MAX_D1D];
--      MFEM_SHARED double sGc[MAX_Q1D][MAX_D1D];
--
--      double ope[9];
--      MFEM_SHARED double sop[9][MAX_Q1D][MAX_Q1D];
--
--      MFEM_FOREACH_THREAD(qx,x,Q1D)
--      {
--         MFEM_FOREACH_THREAD(qy,y,Q1D)
--         {
--            MFEM_FOREACH_THREAD(qz,z,Q1D)
--            {
--               for (int i=0; i<s; ++i)
--               {
--                  ope[i] = op(qx,qy,qz,i,e);
--               }
--            }
--         }
--      }
--
--      const int tidx = MFEM_THREAD_ID(x);
--      const int tidy = MFEM_THREAD_ID(y);
--      const int tidz = MFEM_THREAD_ID(z);
--
--      if (tidz == 0)
--      {
--         MFEM_FOREACH_THREAD(d,y,D1D)
--         {
--            MFEM_FOREACH_THREAD(q,x,Q1D)
--            {
--               sBc[q][d] = Bc(q,d);
--               sGc[q][d] = Gc(q,d);
--               if (d < D1D-1)
--               {
--                  sBo[q][d] = Bo(q,d);
--                  sGo[q][d] = Go(q,d);
--               }
--            }
--         }
--      }
--      MFEM_SYNC_THREAD;
--
--      int osc = 0;
--      for (int c = 0; c < VDIM; ++c)  // loop over x, y, z components
--      {
--         const int D1Dz = (c == 2) ? D1D - 1 : D1D;
--         const int D1Dy = (c == 1) ? D1D - 1 : D1D;
--         const int D1Dx = (c == 0) ? D1D - 1 : D1D;
--
--         double dxyz = 0.0;
--
--         for (int qz=0; qz < Q1D; ++qz)
--         {
--            if (tidz == qz)
--            {
--               for (int i=0; i<s; ++i)
--               {
--                  sop[i][tidx][tidy] = ope[i];
--               }
--            }
--
--            MFEM_SYNC_THREAD;
--
--            MFEM_FOREACH_THREAD(dz,z,D1Dz)
--            {
--               const double wz = ((c == 2) ? sBo[qz][dz] : sBc[qz][dz]);
--               const double wDz = ((c == 2) ? sGo[qz][dz] : sGc[qz][dz]);
--
--               MFEM_FOREACH_THREAD(dy,y,D1Dy)
--               {
--                  MFEM_FOREACH_THREAD(dx,x,D1Dx)
--                  {
--                     for (int qy = 0; qy < Q1D; ++qy)
--                     {
--                        const double wy = ((c == 1) ? sBo[qy][dy] : sBc[qy][dy]);
--                        const double wDy = ((c == 1) ? sGo[qy][dy] : sGc[qy][dy]);
--
--                        for (int qx = 0; qx < Q1D; ++qx)
--                        {
--                           const double wx = ((c == 0) ? sBo[qx][dx] : sBc[qx][dx]);
--                           const double wDx = ((c == 0) ? sGo[qx][dx] : sGc[qx][dx]);
--
--                           if (c == 0)
--                           {
--                              // (u_0)_{x_2} (O22 (u_0)_{x_2} - O23 (u_0)_{x_1}) - (u_0)_{x_1} (O32 (u_0)_{x_2} - O33 (u_0)_{x_1})
--
--                              // (u_0)_{x_2} O22 (u_0)_{x_2}
--                              dxyz += sop[i22][qx][qy] * wx * wx * wy * wy * wDz * wDz;
--
--                              // -(u_0)_{x_2} O23 (u_0)_{x_1} - (u_0)_{x_1} O32 (u_0)_{x_2}
--                              dxyz += -(sop[i23][qx][qy] + sop[i32][qx][qy]) * wx * wx * wDy * wy * wDz * wz;
--
--                              // (u_0)_{x_1} O33 (u_0)_{x_1}
--                              dxyz += sop[i33][qx][qy] * wx * wx * wDy * wDy * wz * wz;
--                           }
--                           else if (c == 1)
--                           {
--                              // (u_1)_{x_2} (O11 (u_1)_{x_2} - O13 (u_1)_{x_0}) + (u_1)_{x_0} (-O31 (u_1)_{x_2} + O33 (u_1)_{x_0})
--
--                              // (u_1)_{x_2} O11 (u_1)_{x_2}
--                              dxyz += sop[i11][qx][qy] * wx * wx * wy * wy * wDz * wDz;
--
--                              // -(u_1)_{x_2} O13 (u_1)_{x_0} - (u_1)_{x_0} O31 (u_1)_{x_2}
--                              dxyz += -(sop[i13][qx][qy] + sop[i31][qx][qy]) * wDx * wx * wy * wy * wDz * wz;
--
--                              // (u_1)_{x_0} O33 (u_1)_{x_0})
--                              dxyz += sop[i33][qx][qy] * wDx * wDx * wy * wy * wz * wz;
--                           }
--                           else
--                           {
--                              // (u_2)_{x_1} (O11 (u_2)_{x_1} - O12 (u_2)_{x_0}) - (u_2)_{x_0} (O21 (u_2)_{x_1} - O22 (u_2)_{x_0})
--
--                              // (u_2)_{x_1} O11 (u_2)_{x_1}
--                              dxyz += sop[i11][qx][qy] * wx * wx * wDy * wDy * wz * wz;
--
--                              // -(u_2)_{x_1} O12 (u_2)_{x_0} - (u_2)_{x_0} O21 (u_2)_{x_1}
--                              dxyz += -(sop[i12][qx][qy] + sop[i21][qx][qy]) * wDx * wx * wDy * wy * wz * wz;
--
--                              // (u_2)_{x_0} O22 (u_2)_{x_0}
--                              dxyz += sop[i22][qx][qy] * wDx * wDx * wy * wy * wz * wz;
--                           }
--                        }
--                     }
--                  }
--               }
--            }
--
--            MFEM_SYNC_THREAD;
--         }  // qz loop
--
--         MFEM_FOREACH_THREAD(dz,z,D1Dz)
--         {
--            MFEM_FOREACH_THREAD(dy,y,D1Dy)
--            {
--               MFEM_FOREACH_THREAD(dx,x,D1Dx)
--               {
--                  D(dx + ((dy + (dz * D1Dy)) * D1Dx) + osc, e) += dxyz;
--               }
--            }
--         }
--
--         osc += D1Dx * D1Dy * D1Dz;
--      }  // c loop
--   }); // end of element loop
--}
--
--void CurlCurlIntegrator::AssembleDiagonalPA(Vector& diag)
--{
--   if (dim == 3)
--   {
--      if (Device::Allows(Backend::DEVICE_MASK))
--      {
--         const int ID = (dofs1D << 4) | quad1D;
--         switch (ID)
--         {
--            case 0x23: return SmemPACurlCurlAssembleDiagonal3D<2,3>(dofs1D, quad1D,
--                                                                       symmetric, ne,
--                                                                       mapsO->B, mapsC->B,
--                                                                       mapsO->G, mapsC->G,
--                                                                       pa_data, diag);
--            case 0x34: return SmemPACurlCurlAssembleDiagonal3D<3,4>(dofs1D, quad1D,
--                                                                       symmetric, ne,
--                                                                       mapsO->B, mapsC->B,
--                                                                       mapsO->G, mapsC->G,
--                                                                       pa_data, diag);
--            case 0x45: return SmemPACurlCurlAssembleDiagonal3D<4,5>(dofs1D, quad1D,
--                                                                       symmetric, ne,
--                                                                       mapsO->B, mapsC->B,
--                                                                       mapsO->G, mapsC->G,
--                                                                       pa_data, diag);
--            case 0x56: return SmemPACurlCurlAssembleDiagonal3D<5,6>(dofs1D, quad1D,
--                                                                       symmetric, ne,
--                                                                       mapsO->B, mapsC->B,
--                                                                       mapsO->G, mapsC->G,
--                                                                       pa_data, diag);
--            default: return SmemPACurlCurlAssembleDiagonal3D(dofs1D, quad1D, symmetric, ne,
--                                                                mapsO->B, mapsC->B,
--                                                                mapsO->G, mapsC->G,
--                                                                pa_data, diag);
--         }
--      }
--      else
--         PACurlCurlAssembleDiagonal3D(dofs1D, quad1D, symmetric, ne,
--                                      mapsO->B, mapsC->B,
--                                      mapsO->G, mapsC->G,
--                                      pa_data, diag);
--   }
--   else if (dim == 2)
--   {
--      PACurlCurlAssembleDiagonal2D(dofs1D, quad1D, ne,
--                                   mapsO->B, mapsC->G, pa_data, diag);
--   }
--   else
--   {
--      MFEM_ABORT("Unsupported dimension!");
--   }
--}
--
--// Apply to x corresponding to DOFs in H^1 (trial), whose gradients are
--// integrated against H(curl) test functions corresponding to y.
--void PAHcurlH1Apply3D(const int D1D,
--                      const int Q1D,
--                      const int NE,
--                      const Array<double> &bc,
--                      const Array<double> &gc,
--                      const Array<double> &bot,
--                      const Array<double> &bct,
--                      const Vector &pa_data,
--                      const Vector &x,
--                      Vector &y)
--{
--   constexpr static int MAX_D1D = HCURL_MAX_D1D;
--   constexpr static int MAX_Q1D = HCURL_MAX_Q1D;
--
--   MFEM_VERIFY(D1D <= MAX_D1D, "Error: D1D > MAX_D1D");
--   MFEM_VERIFY(Q1D <= MAX_Q1D, "Error: Q1D > MAX_Q1D");
--
--   constexpr static int VDIM = 3;
--
--   auto Bc = Reshape(bc.Read(), Q1D, D1D);
--   auto Gc = Reshape(gc.Read(), Q1D, D1D);
--   auto Bot = Reshape(bot.Read(), D1D-1, Q1D);
--   auto Bct = Reshape(bct.Read(), D1D, Q1D);
--   auto op = Reshape(pa_data.Read(), Q1D, Q1D, Q1D, 6, NE);
--   auto X = Reshape(x.Read(), D1D, D1D, D1D, NE);
--   auto Y = Reshape(y.ReadWrite(), 3*(D1D-1)*D1D*D1D, NE);
--
--   mfem::forall(NE, [=] MFEM_HOST_DEVICE (int e)
--   {
--      double mass[MAX_Q1D][MAX_Q1D][MAX_Q1D][VDIM];
--
--      for (int qz = 0; qz < Q1D; ++qz)
--      {
--         for (int qy = 0; qy < Q1D; ++qy)
--         {
--            for (int qx = 0; qx < Q1D; ++qx)
--            {
--               for (int c = 0; c < VDIM; ++c)
--               {
--                  mass[qz][qy][qx][c] = 0.0;
--               }
--            }
--         }
--      }
--
--      for (int dz = 0; dz < D1D; ++dz)
--      {
--         double gradXY[MAX_Q1D][MAX_Q1D][3];
--         for (int qy = 0; qy < Q1D; ++qy)
--         {
--            for (int qx = 0; qx < Q1D; ++qx)
--            {
--               gradXY[qy][qx][0] = 0.0;
--               gradXY[qy][qx][1] = 0.0;
--               gradXY[qy][qx][2] = 0.0;
--            }
--         }
--         for (int dy = 0; dy < D1D; ++dy)
--         {
--            double gradX[MAX_Q1D][2];
--            for (int qx = 0; qx < Q1D; ++qx)
--            {
--               gradX[qx][0] = 0.0;
--               gradX[qx][1] = 0.0;
--            }
--            for (int dx = 0; dx < D1D; ++dx)
--            {
--               const double s = X(dx,dy,dz,e);
--               for (int qx = 0; qx < Q1D; ++qx)
--               {
--                  gradX[qx][0] += s * Bc(qx,dx);
--                  gradX[qx][1] += s * Gc(qx,dx);
--               }
--            }
--            for (int qy = 0; qy < Q1D; ++qy)
--            {
--               const double wy  = Bc(qy,dy);
--               const double wDy = Gc(qy,dy);
--               for (int qx = 0; qx < Q1D; ++qx)
--               {
--                  const double wx  = gradX[qx][0];
--                  const double wDx = gradX[qx][1];
--                  gradXY[qy][qx][0] += wDx * wy;
--                  gradXY[qy][qx][1] += wx * wDy;
--                  gradXY[qy][qx][2] += wx * wy;
--               }
--            }
--         }
--         for (int qz = 0; qz < Q1D; ++qz)
--         {
--            const double wz  = Bc(qz,dz);
--            const double wDz = Gc(qz,dz);
--            for (int qy = 0; qy < Q1D; ++qy)
--            {
--               for (int qx = 0; qx < Q1D; ++qx)
--               {
--                  mass[qz][qy][qx][0] += gradXY[qy][qx][0] * wz;
--                  mass[qz][qy][qx][1] += gradXY[qy][qx][1] * wz;
--                  mass[qz][qy][qx][2] += gradXY[qy][qx][2] * wDz;
--               }
--            }
--         }
--      }
--
--      // Apply D operator.
--      for (int qz = 0; qz < Q1D; ++qz)
--      {
--         for (int qy = 0; qy < Q1D; ++qy)
--         {
--            for (int qx = 0; qx < Q1D; ++qx)
--            {
--               const double O11 = op(qx,qy,qz,0,e);
--               const double O12 = op(qx,qy,qz,1,e);
--               const double O13 = op(qx,qy,qz,2,e);
--               const double O22 = op(qx,qy,qz,3,e);
--               const double O23 = op(qx,qy,qz,4,e);
--               const double O33 = op(qx,qy,qz,5,e);
--               const double massX = mass[qz][qy][qx][0];
--               const double massY = mass[qz][qy][qx][1];
--               const double massZ = mass[qz][qy][qx][2];
--               mass[qz][qy][qx][0] = (O11*massX)+(O12*massY)+(O13*massZ);
--               mass[qz][qy][qx][1] = (O12*massX)+(O22*massY)+(O23*massZ);
--               mass[qz][qy][qx][2] = (O13*massX)+(O23*massY)+(O33*massZ);
--            }
--         }
--      }
--
--      for (int qz = 0; qz < Q1D; ++qz)
--      {
--         double massXY[MAX_D1D][MAX_D1D];
--
--         int osc = 0;
--
--         for (int c = 0; c < VDIM; ++c)  // loop over x, y, z components
--         {
--            const int D1Dz = (c == 2) ? D1D - 1 : D1D;
--            const int D1Dy = (c == 1) ? D1D - 1 : D1D;
--            const int D1Dx = (c == 0) ? D1D - 1 : D1D;
--
--            for (int dy = 0; dy < D1Dy; ++dy)
--            {
--               for (int dx = 0; dx < D1Dx; ++dx)
--               {
--                  massXY[dy][dx] = 0.0;
--               }
--            }
--            for (int qy = 0; qy < Q1D; ++qy)
--            {
--               double massX[MAX_D1D];
--               for (int dx = 0; dx < D1Dx; ++dx)
--               {
--                  massX[dx] = 0;
--               }
--               for (int qx = 0; qx < Q1D; ++qx)
--               {
--                  for (int dx = 0; dx < D1Dx; ++dx)
--                  {
--                     massX[dx] += mass[qz][qy][qx][c] * ((c == 0) ? Bot(dx,qx) : Bct(dx,qx));
--                  }
--               }
--               for (int dy = 0; dy < D1Dy; ++dy)
--               {
--                  const double wy = (c == 1) ? Bot(dy,qy) : Bct(dy,qy);
--                  for (int dx = 0; dx < D1Dx; ++dx)
--                  {
--                     massXY[dy][dx] += massX[dx] * wy;
--                  }
--               }
--            }
--
--            for (int dz = 0; dz < D1Dz; ++dz)
--            {
--               const double wz = (c == 2) ? Bot(dz,qz) : Bct(dz,qz);
--               for (int dy = 0; dy < D1Dy; ++dy)
--               {
--                  for (int dx = 0; dx < D1Dx; ++dx)
--                  {
--                     Y(dx + ((dy + (dz * D1Dy)) * D1Dx) + osc, e) += massXY[dy][dx] * wz;
--                  }
--               }
--            }
--
--            osc += D1Dx * D1Dy * D1Dz;
--         }  // loop c
--      }  // loop qz
--   }); // end of element loop
--}
--
--// Apply to x corresponding to DOFs in H(curl), integrated
--// against gradients of H^1 functions corresponding to y.
--void PAHcurlH1ApplyTranspose3D(const int D1D,
--                               const int Q1D,
--                               const int NE,
--                               const Array<double> &bc,
--                               const Array<double> &bo,
--                               const Array<double> &bct,
--                               const Array<double> &gct,
--                               const Vector &pa_data,
--                               const Vector &x,
--                               Vector &y)
--{
--   constexpr static int MAX_D1D = HCURL_MAX_D1D;
--   constexpr static int MAX_Q1D = HCURL_MAX_Q1D;
--
--   MFEM_VERIFY(D1D <= MAX_D1D, "Error: D1D > MAX_D1D");
--   MFEM_VERIFY(Q1D <= MAX_Q1D, "Error: Q1D > MAX_Q1D");
--
--   constexpr static int VDIM = 3;
--
--   auto Bc = Reshape(bc.Read(), Q1D, D1D);
--   auto Bo = Reshape(bo.Read(), Q1D, D1D-1);
--   auto Bt = Reshape(bct.Read(), D1D, Q1D);
--   auto Gt = Reshape(gct.Read(), D1D, Q1D);
--   auto op = Reshape(pa_data.Read(), Q1D, Q1D, Q1D, 6, NE);
--   auto X = Reshape(x.Read(), 3*(D1D-1)*D1D*D1D, NE);
--   auto Y = Reshape(y.ReadWrite(), D1D, D1D, D1D, NE);
--
--   mfem::forall(NE, [=] MFEM_HOST_DEVICE (int e)
--   {
--      double mass[MAX_Q1D][MAX_Q1D][MAX_Q1D][VDIM];
--
--      for (int qz = 0; qz < Q1D; ++qz)
--      {
--         for (int qy = 0; qy < Q1D; ++qy)
--         {
--            for (int qx = 0; qx < Q1D; ++qx)
--            {
--               for (int c = 0; c < VDIM; ++c)
--               {
--                  mass[qz][qy][qx][c] = 0.0;
--               }
--            }
--         }
--      }
--
--      int osc = 0;
--
--      for (int c = 0; c < VDIM; ++c)  // loop over x, y, z components
--      {
--         const int D1Dz = (c == 2) ? D1D - 1 : D1D;
--         const int D1Dy = (c == 1) ? D1D - 1 : D1D;
--         const int D1Dx = (c == 0) ? D1D - 1 : D1D;
--
--         for (int dz = 0; dz < D1Dz; ++dz)
--         {
--            double massXY[MAX_Q1D][MAX_Q1D];
--            for (int qy = 0; qy < Q1D; ++qy)
--            {
--               for (int qx = 0; qx < Q1D; ++qx)
--               {
--                  massXY[qy][qx] = 0.0;
--               }
--            }
--
--            for (int dy = 0; dy < D1Dy; ++dy)
--            {
--               double massX[MAX_Q1D];
--               for (int qx = 0; qx < Q1D; ++qx)
--               {
--                  massX[qx] = 0.0;
--               }
--
--               for (int dx = 0; dx < D1Dx; ++dx)
--               {
--                  const double t = X(dx + ((dy + (dz * D1Dy)) * D1Dx) + osc, e);
--                  for (int qx = 0; qx < Q1D; ++qx)
--                  {
--                     massX[qx] += t * ((c == 0) ? Bo(qx,dx) : Bc(qx,dx));
--                  }
--               }
--
--               for (int qy = 0; qy < Q1D; ++qy)
--               {
--                  const double wy = (c == 1) ? Bo(qy,dy) : Bc(qy,dy);
--                  for (int qx = 0; qx < Q1D; ++qx)
--                  {
--                     const double wx = massX[qx];
--                     massXY[qy][qx] += wx * wy;
--                  }
--               }
--            }
--
--            for (int qz = 0; qz < Q1D; ++qz)
--            {
--               const double wz = (c == 2) ? Bo(qz,dz) : Bc(qz,dz);
--               for (int qy = 0; qy < Q1D; ++qy)
--               {
--                  for (int qx = 0; qx < Q1D; ++qx)
--                  {
--                     mass[qz][qy][qx][c] += massXY[qy][qx] * wz;
--                  }
--               }
--            }
--         }
--
--         osc += D1Dx * D1Dy * D1Dz;
--      }  // loop (c) over components
--
--      // Apply D operator.
--      for (int qz = 0; qz < Q1D; ++qz)
--      {
--         for (int qy = 0; qy < Q1D; ++qy)
--         {
--            for (int qx = 0; qx < Q1D; ++qx)
--            {
--               const double O11 = op(qx,qy,qz,0,e);
--               const double O12 = op(qx,qy,qz,1,e);
--               const double O13 = op(qx,qy,qz,2,e);
--               const double O22 = op(qx,qy,qz,3,e);
--               const double O23 = op(qx,qy,qz,4,e);
--               const double O33 = op(qx,qy,qz,5,e);
--               const double massX = mass[qz][qy][qx][0];
--               const double massY = mass[qz][qy][qx][1];
--               const double massZ = mass[qz][qy][qx][2];
--               mass[qz][qy][qx][0] = (O11*massX)+(O12*massY)+(O13*massZ);
--               mass[qz][qy][qx][1] = (O12*massX)+(O22*massY)+(O23*massZ);
--               mass[qz][qy][qx][2] = (O13*massX)+(O23*massY)+(O33*massZ);
--            }
--         }
--      }
--
--      for (int qz = 0; qz < Q1D; ++qz)
--      {
--         double gradXY[MAX_D1D][MAX_D1D][3];
--         for (int dy = 0; dy < D1D; ++dy)
--         {
--            for (int dx = 0; dx < D1D; ++dx)
--            {
--               gradXY[dy][dx][0] = 0;
--               gradXY[dy][dx][1] = 0;
--               gradXY[dy][dx][2] = 0;
--            }
--         }
--         for (int qy = 0; qy < Q1D; ++qy)
--         {
--            double gradX[MAX_D1D][3];
--            for (int dx = 0; dx < D1D; ++dx)
--            {
--               gradX[dx][0] = 0;
--               gradX[dx][1] = 0;
--               gradX[dx][2] = 0;
--            }
--            for (int qx = 0; qx < Q1D; ++qx)
--            {
--               const double gX = mass[qz][qy][qx][0];
--               const double gY = mass[qz][qy][qx][1];
--               const double gZ = mass[qz][qy][qx][2];
--               for (int dx = 0; dx < D1D; ++dx)
--               {
--                  const double wx  = Bt(dx,qx);
--                  const double wDx = Gt(dx,qx);
--                  gradX[dx][0] += gX * wDx;
--                  gradX[dx][1] += gY * wx;
--                  gradX[dx][2] += gZ * wx;
--               }
--            }
--            for (int dy = 0; dy < D1D; ++dy)
--            {
--               const double wy  = Bt(dy,qy);
--               const double wDy = Gt(dy,qy);
--               for (int dx = 0; dx < D1D; ++dx)
--               {
--                  gradXY[dy][dx][0] += gradX[dx][0] * wy;
--                  gradXY[dy][dx][1] += gradX[dx][1] * wDy;
--                  gradXY[dy][dx][2] += gradX[dx][2] * wy;
--               }
--            }
--         }
--         for (int dz = 0; dz < D1D; ++dz)
--         {
--            const double wz  = Bt(dz,qz);
--            const double wDz = Gt(dz,qz);
--            for (int dy = 0; dy < D1D; ++dy)
--            {
--               for (int dx = 0; dx < D1D; ++dx)
--               {
--                  Y(dx,dy,dz,e) +=
--                     ((gradXY[dy][dx][0] * wz) +
--                      (gradXY[dy][dx][1] * wz) +
--                      (gradXY[dy][dx][2] * wDz));
--               }
--            }
--         }
--      }  // loop qz
--   }); // end of element loop
--}
--
--// Apply to x corresponding to DOFs in H^1 (trial), whose gradients are
--// integrated against H(curl) test functions corresponding to y.
--void PAHcurlH1Apply2D(const int D1D,
--                      const int Q1D,
--                      const int NE,
--                      const Array<double> &bc,
--                      const Array<double> &gc,
--                      const Array<double> &bot,
--                      const Array<double> &bct,
--                      const Vector &pa_data,
--                      const Vector &x,
--                      Vector &y)
--{
--   constexpr static int VDIM = 2;
--   constexpr static int MAX_D1D = HCURL_MAX_D1D;
--   constexpr static int MAX_Q1D = HCURL_MAX_Q1D;
--
--   auto Bc = Reshape(bc.Read(), Q1D, D1D);
--   auto Gc = Reshape(gc.Read(), Q1D, D1D);
--   auto Bot = Reshape(bot.Read(), D1D-1, Q1D);
--   auto Bct = Reshape(bct.Read(), D1D, Q1D);
--   auto op = Reshape(pa_data.Read(), Q1D, Q1D, 3, NE);
--   auto X = Reshape(x.Read(), D1D, D1D, NE);
--   auto Y = Reshape(y.ReadWrite(), 2*(D1D-1)*D1D, NE);
--
--   mfem::forall(NE, [=] MFEM_HOST_DEVICE (int e)
--   {
--      double mass[MAX_Q1D][MAX_Q1D][VDIM];
--
--      for (int qy = 0; qy < Q1D; ++qy)
--      {
--         for (int qx = 0; qx < Q1D; ++qx)
--         {
--            for (int c = 0; c < VDIM; ++c)
--            {
--               mass[qy][qx][c] = 0.0;
--            }
--         }
--      }
--
--      for (int dy = 0; dy < D1D; ++dy)
--      {
--         double gradX[MAX_Q1D][2];
--         for (int qx = 0; qx < Q1D; ++qx)
--         {
--            gradX[qx][0] = 0.0;
--            gradX[qx][1] = 0.0;
--         }
--         for (int dx = 0; dx < D1D; ++dx)
--         {
--            const double s = X(dx,dy,e);
--            for (int qx = 0; qx < Q1D; ++qx)
--            {
--               gradX[qx][0] += s * Bc(qx,dx);
--               gradX[qx][1] += s * Gc(qx,dx);
--            }
--         }
--         for (int qy = 0; qy < Q1D; ++qy)
--         {
--            const double wy  = Bc(qy,dy);
--            const double wDy = Gc(qy,dy);
--            for (int qx = 0; qx < Q1D; ++qx)
--            {
--               const double wx  = gradX[qx][0];
--               const double wDx = gradX[qx][1];
--               mass[qy][qx][0] += wDx * wy;
--               mass[qy][qx][1] += wx * wDy;
--            }
--         }
--      }
--
--      // Apply D operator.
--      for (int qy = 0; qy < Q1D; ++qy)
--      {
--         for (int qx = 0; qx < Q1D; ++qx)
--         {
--            const double O11 = op(qx,qy,0,e);
--            const double O12 = op(qx,qy,1,e);
--            const double O22 = op(qx,qy,2,e);
--            const double massX = mass[qy][qx][0];
--            const double massY = mass[qy][qx][1];
--            mass[qy][qx][0] = (O11*massX)+(O12*massY);
--            mass[qy][qx][1] = (O12*massX)+(O22*massY);
--         }
--      }
--
--      for (int qy = 0; qy < Q1D; ++qy)
--      {
--         int osc = 0;
--
--         for (int c = 0; c < VDIM; ++c)  // loop over x, y components
--         {
--            const int D1Dy = (c == 1) ? D1D - 1 : D1D;
--            const int D1Dx = (c == 0) ? D1D - 1 : D1D;
--
--            double massX[MAX_D1D];
--            for (int dx = 0; dx < D1Dx; ++dx)
--            {
--               massX[dx] = 0;
--            }
--            for (int qx = 0; qx < Q1D; ++qx)
--            {
--               for (int dx = 0; dx < D1Dx; ++dx)
--               {
--                  massX[dx] += mass[qy][qx][c] * ((c == 0) ? Bot(dx,qx) : Bct(dx,qx));
--               }
--            }
--
--            for (int dy = 0; dy < D1Dy; ++dy)
--            {
--               const double wy = (c == 1) ? Bot(dy,qy) : Bct(dy,qy);
--
--               for (int dx = 0; dx < D1Dx; ++dx)
--               {
--                  Y(dx + (dy * D1Dx) + osc, e) += massX[dx] * wy;
--               }
--            }
--
--            osc += D1Dx * D1Dy;
--         }  // loop c
--      }
--   }); // end of element loop
--}
--
--// Apply to x corresponding to DOFs in H(curl), integrated
--// against gradients of H^1 functions corresponding to y.
--void PAHcurlH1ApplyTranspose2D(const int D1D,
--                               const int Q1D,
--                               const int NE,
--                               const Array<double> &bc,
--                               const Array<double> &bo,
--                               const Array<double> &bct,
--                               const Array<double> &gct,
--                               const Vector &pa_data,
--                               const Vector &x,
--                               Vector &y)
--{
--   constexpr static int VDIM = 2;
--   constexpr static int MAX_D1D = HCURL_MAX_D1D;
--   constexpr static int MAX_Q1D = HCURL_MAX_Q1D;
--
--   auto Bc = Reshape(bc.Read(), Q1D, D1D);
--   auto Bo = Reshape(bo.Read(), Q1D, D1D-1);
--   auto Bt = Reshape(bct.Read(), D1D, Q1D);
--   auto Gt = Reshape(gct.Read(), D1D, Q1D);
--   auto op = Reshape(pa_data.Read(), Q1D, Q1D, 3, NE);
--   auto X = Reshape(x.Read(), 2*(D1D-1)*D1D, NE);
--   auto Y = Reshape(y.ReadWrite(), D1D, D1D, NE);
--
--   mfem::forall(NE, [=] MFEM_HOST_DEVICE (int e)
--   {
--      double mass[MAX_Q1D][MAX_Q1D][VDIM];
--
--      for (int qy = 0; qy < Q1D; ++qy)
--      {
--         for (int qx = 0; qx < Q1D; ++qx)
--         {
--            for (int c = 0; c < VDIM; ++c)
--            {
--               mass[qy][qx][c] = 0.0;
--            }
--         }
--      }
--
--      int osc = 0;
--
--      for (int c = 0; c < VDIM; ++c)  // loop over x, y components
--      {
--         const int D1Dy = (c == 1) ? D1D - 1 : D1D;
--         const int D1Dx = (c == 0) ? D1D - 1 : D1D;
--
--         for (int dy = 0; dy < D1Dy; ++dy)
--         {
--            double massX[MAX_Q1D];
--            for (int qx = 0; qx < Q1D; ++qx)
--            {
--               massX[qx] = 0.0;
--            }
--
--            for (int dx = 0; dx < D1Dx; ++dx)
--            {
--               const double t = X(dx + (dy * D1Dx) + osc, e);
--               for (int qx = 0; qx < Q1D; ++qx)
--               {
--                  massX[qx] += t * ((c == 0) ? Bo(qx,dx) : Bc(qx,dx));
--               }
--            }
--
--            for (int qy = 0; qy < Q1D; ++qy)
--            {
--               const double wy = (c == 1) ? Bo(qy,dy) : Bc(qy,dy);
--               for (int qx = 0; qx < Q1D; ++qx)
--               {
--                  mass[qy][qx][c] += massX[qx] * wy;
--               }
--            }
--         }
--
--         osc += D1Dx * D1Dy;
--      }  // loop (c) over components
--
--      // Apply D operator.
--      for (int qy = 0; qy < Q1D; ++qy)
--      {
--         for (int qx = 0; qx < Q1D; ++qx)
--         {
--            const double O11 = op(qx,qy,0,e);
--            const double O12 = op(qx,qy,1,e);
--            const double O22 = op(qx,qy,2,e);
--            const double massX = mass[qy][qx][0];
--            const double massY = mass[qy][qx][1];
--            mass[qy][qx][0] = (O11*massX)+(O12*massY);
--            mass[qy][qx][1] = (O12*massX)+(O22*massY);
--         }
--      }
--
--      for (int qy = 0; qy < Q1D; ++qy)
--      {
--         double gradX[MAX_D1D][2];
--         for (int dx = 0; dx < D1D; ++dx)
--         {
--            gradX[dx][0] = 0;
--            gradX[dx][1] = 0;
--         }
--         for (int qx = 0; qx < Q1D; ++qx)
--         {
--            const double gX = mass[qy][qx][0];
--            const double gY = mass[qy][qx][1];
--            for (int dx = 0; dx < D1D; ++dx)
--            {
--               const double wx  = Bt(dx,qx);
--               const double wDx = Gt(dx,qx);
--               gradX[dx][0] += gX * wDx;
--               gradX[dx][1] += gY * wx;
--            }
--         }
--         for (int dy = 0; dy < D1D; ++dy)
--         {
--            const double wy  = Bt(dy,qy);
--            const double wDy = Gt(dy,qy);
--            for (int dx = 0; dx < D1D; ++dx)
--            {
--               Y(dx,dy,e) += ((gradX[dx][0] * wy) + (gradX[dx][1] * wDy));
--            }
--         }
--      }
--   }); // end of element loop
--}
--
--// PA H(curl) Mass Assemble 3D kernel
--void PAHcurlL2Setup(const int NQ,
--                    const int coeffDim,
--                    const int NE,
--                    const Array<double> &w,
--                    Vector &coeff,
--                    Vector &op)
--{
--   auto W = w.Read();
--   auto C = Reshape(coeff.Read(), coeffDim, NQ, NE);
--   auto y = Reshape(op.Write(), coeffDim, NQ, NE);
--
--   mfem::forall(NE, [=] MFEM_HOST_DEVICE (int e)
--   {
--      for (int q = 0; q < NQ; ++q)
--      {
--         for (int c=0; c<coeffDim; ++c)
--         {
--            y(c,q,e) = W[q] * C(c,q,e);
--         }
--      }
--   });
--}
--
--void MixedScalarCurlIntegrator::AssemblePA(const FiniteElementSpace &trial_fes,
--                                           const FiniteElementSpace &test_fes)
--{
--   // Assumes tensor-product elements
--   Mesh *mesh = trial_fes.GetMesh();
--   const FiniteElement *fel = trial_fes.GetFE(0); // In H(curl)
--   const FiniteElement *eltest = test_fes.GetFE(0); // In scalar space
--
--   const VectorTensorFiniteElement *el =
--      dynamic_cast<const VectorTensorFiniteElement*>(fel);
--   MFEM_VERIFY(el != NULL, "Only VectorTensorFiniteElement is supported!");
--
--   if (el->GetDerivType() != mfem::FiniteElement::CURL)
--   {
--      MFEM_ABORT("Unknown kernel.");
--   }
--
--   const IntegrationRule *ir
--      = IntRule ? IntRule : &MassIntegrator::GetRule(*eltest, *eltest,
--                                                     *mesh->GetElementTransformation(0));
--
--   const int dims = el->GetDim();
--   MFEM_VERIFY(dims == 2, "");
--
--   const int nq = ir->GetNPoints();
--   dim = mesh->Dimension();
--   MFEM_VERIFY(dim == 2, "");
--
--   ne = test_fes.GetNE();
--   mapsC = &el->GetDofToQuad(*ir, DofToQuad::TENSOR);
--   mapsO = &el->GetDofToQuadOpen(*ir, DofToQuad::TENSOR);
--   dofs1D = mapsC->ndof;
--   quad1D = mapsC->nqpt;
--
--   MFEM_VERIFY(dofs1D == mapsO->ndof + 1 && quad1D == mapsO->nqpt, "");
--
--   if (el->GetOrder() == eltest->GetOrder())
--   {
--      dofs1Dtest = dofs1D;
--   }
--   else
--   {
--      dofs1Dtest = dofs1D - 1;
--   }
--
--   pa_data.SetSize(nq * ne, Device::GetMemoryType());
--
--   QuadratureSpace qs(*mesh, *ir);
--   CoefficientVector coeff(Q, qs, CoefficientStorage::FULL);
--
--   if (dim == 2)
--   {
--      PACurlL2Setup2D(quad1D, ne, ir->GetWeights(), coeff, pa_data);
--   }
--   else
--   {
--      MFEM_ABORT("Unsupported dimension!");
--   }
--}
--
--void MixedScalarCurlIntegrator::AddMultPA(const Vector &x, Vector &y) const
--{
--   if (dim == 2)
--   {
--      PACurlL2Apply2D(dofs1D, dofs1Dtest, quad1D, ne, mapsO->B, mapsO->Bt,
--                      mapsC->Bt, mapsC->G, pa_data, x, y);
--   }
--   else
--   {
--      MFEM_ABORT("Unsupported dimension!");
--   }
--}
--
--void MixedScalarCurlIntegrator::AddMultTransposePA(const Vector &x,
--                                                   Vector &y) const
--{
--   if (dim == 2)
--   {
--      PACurlL2ApplyTranspose2D(dofs1D, dofs1Dtest, quad1D, ne, mapsO->B, mapsO->Bt,
--                               mapsC->B, mapsC->Gt, pa_data, x, y);
--   }
--   else
--   {
--      MFEM_ABORT("Unsupported dimension!");
--   }
--}
--
--void MixedVectorCurlIntegrator::AssemblePA(const FiniteElementSpace &trial_fes,
--                                           const FiniteElementSpace &test_fes)
--{
--   // Assumes tensor-product elements, with vector test and trial spaces.
--   Mesh *mesh = trial_fes.GetMesh();
--   const FiniteElement *trial_fel = trial_fes.GetFE(0);
--   const FiniteElement *test_fel = test_fes.GetFE(0);
--
--   const VectorTensorFiniteElement *trial_el =
--      dynamic_cast<const VectorTensorFiniteElement*>(trial_fel);
--   MFEM_VERIFY(trial_el != NULL, "Only VectorTensorFiniteElement is supported!");
--
--   const VectorTensorFiniteElement *test_el =
--      dynamic_cast<const VectorTensorFiniteElement*>(test_fel);
--   MFEM_VERIFY(test_el != NULL, "Only VectorTensorFiniteElement is supported!");
--
--   const IntegrationRule *ir
--      = IntRule ? IntRule : &MassIntegrator::GetRule(*trial_el, *trial_el,
--                                                     *mesh->GetElementTransformation(0));
--   const int dims = trial_el->GetDim();
--   MFEM_VERIFY(dims == 3, "");
--
--   const int nq = ir->GetNPoints();
--   dim = mesh->Dimension();
--   MFEM_VERIFY(dim == 3, "");
--
--   MFEM_VERIFY(trial_el->GetOrder() == test_el->GetOrder(), "");
--
--   ne = trial_fes.GetNE();
--   geom = mesh->GetGeometricFactors(*ir, GeometricFactors::JACOBIANS);
--   mapsC = &trial_el->GetDofToQuad(*ir, DofToQuad::TENSOR);
--   mapsO = &trial_el->GetDofToQuadOpen(*ir, DofToQuad::TENSOR);
--   mapsCtest = &test_el->GetDofToQuad(*ir, DofToQuad::TENSOR);
--   mapsOtest = &test_el->GetDofToQuadOpen(*ir, DofToQuad::TENSOR);
--   dofs1D = mapsC->ndof;
--   quad1D = mapsC->nqpt;
--   dofs1Dtest = mapsCtest->ndof;
--
--   MFEM_VERIFY(dofs1D == mapsO->ndof + 1 && quad1D == mapsO->nqpt, "");
--
--   testType = test_el->GetDerivType();
--   trialType = trial_el->GetDerivType();
--
--   const int symmDims = (dims * (dims + 1)) / 2; // 1x1: 1, 2x2: 3, 3x3: 6
--   coeffDim = (DQ ? 3 : 1);
--
--   const bool curlSpaces = (testType == mfem::FiniteElement::CURL &&
--                            trialType == mfem::FiniteElement::CURL);
--
--   const int ndata = curlSpaces ? (coeffDim == 1 ? 1 : 9) : symmDims;
--   pa_data.SetSize(ndata * nq * ne, Device::GetMemoryType());
--
--   QuadratureSpace qs(*mesh, *ir);
--   CoefficientVector coeff(qs, CoefficientStorage::FULL);
--   if (Q) { coeff.Project(*Q); }
--   else if (DQ) { coeff.Project(*DQ); }
--   else { coeff.SetConstant(1.0); }
--
--   if (testType == mfem::FiniteElement::CURL &&
--       trialType == mfem::FiniteElement::CURL && dim == 3)
--   {
--      if (coeffDim == 1)
--      {
--         PAHcurlL2Setup(nq, coeffDim, ne, ir->GetWeights(), coeff, pa_data);
--      }
--      else
--      {
--         PAHcurlHdivSetup3D(quad1D, coeffDim, ne, false, ir->GetWeights(),
--                            geom->J, coeff, pa_data);
--      }
--   }
--   else if (testType == mfem::FiniteElement::DIV &&
--            trialType == mfem::FiniteElement::CURL && dim == 3 &&
--            test_fel->GetOrder() == trial_fel->GetOrder())
--   {
--      PACurlCurlSetup3D(quad1D, coeffDim, ne, ir->GetWeights(), geom->J, coeff,
--                        pa_data);
--   }
--   else
--   {
--      MFEM_ABORT("Unknown kernel.");
--   }
--}
--
--// Apply to x corresponding to DOFs in H(curl) (trial), whose curl is
--// integrated against H(curl) test functions corresponding to y.
--template<int MAX_D1D = HCURL_MAX_D1D, int MAX_Q1D = HCURL_MAX_Q1D>
--static void PAHcurlL2Apply3D(const int D1D,
--                             const int Q1D,
--                             const int coeffDim,
--                             const int NE,
--                             const Array<double> &bo,
--                             const Array<double> &bc,
--                             const Array<double> &bot,
--                             const Array<double> &bct,
--                             const Array<double> &gc,
--                             const Vector &pa_data,
--                             const Vector &x,
--                             Vector &y)
--{
--   MFEM_VERIFY(D1D <= MAX_D1D, "Error: D1D > MAX_D1D");
--   MFEM_VERIFY(Q1D <= MAX_Q1D, "Error: Q1D > MAX_Q1D");
--   // Using u = dF^{-T} \hat{u} and (\nabla\times u) F = 1/det(dF) dF \hat{\nabla}\times\hat{u} (p. 78 of Monk), we get
--   // (\nabla\times u) \cdot v = 1/det(dF) \hat{\nabla}\times\hat{u}^T dF^T dF^{-T} \hat{v}
--   // = 1/det(dF) \hat{\nabla}\times\hat{u}^T \hat{v}
--   // If c = 0, \hat{\nabla}\times\hat{u} reduces to [0, (u_0)_{x_2}, -(u_0)_{x_1}]
--   // If c = 1, \hat{\nabla}\times\hat{u} reduces to [-(u_1)_{x_2}, 0, (u_1)_{x_0}]
--   // If c = 2, \hat{\nabla}\times\hat{u} reduces to [(u_2)_{x_1}, -(u_2)_{x_0}, 0]
--
--   constexpr static int VDIM = 3;
--
--   auto Bo = Reshape(bo.Read(), Q1D, D1D-1);
--   auto Bc = Reshape(bc.Read(), Q1D, D1D);
--   auto Bot = Reshape(bot.Read(), D1D-1, Q1D);
--   auto Bct = Reshape(bct.Read(), D1D, Q1D);
--   auto Gc = Reshape(gc.Read(), Q1D, D1D);
--   auto op = Reshape(pa_data.Read(), coeffDim, Q1D, Q1D, Q1D, NE);
--   auto X = Reshape(x.Read(), 3*(D1D-1)*D1D*D1D, NE);
--   auto Y = Reshape(y.ReadWrite(), 3*(D1D-1)*D1D*D1D, NE);
--
--   mfem::forall(NE, [=] MFEM_HOST_DEVICE (int e)
--   {
--      double curl[MAX_Q1D][MAX_Q1D][MAX_Q1D][VDIM];
--      // curl[qz][qy][qx] will be computed as the vector curl at each quadrature point.
--
--      for (int qz = 0; qz < Q1D; ++qz)
--      {
--         for (int qy = 0; qy < Q1D; ++qy)
--         {
--            for (int qx = 0; qx < Q1D; ++qx)
--            {
--               for (int c = 0; c < VDIM; ++c)
--               {
--                  curl[qz][qy][qx][c] = 0.0;
--               }
--            }
--         }
--      }
--
--      // We treat x, y, z components separately for optimization specific to each.
--
--      int osc = 0;
--
--      {
--         // x component
--         const int D1Dz = D1D;
--         const int D1Dy = D1D;
--         const int D1Dx = D1D - 1;
--
--         for (int dz = 0; dz < D1Dz; ++dz)
--         {
--            double gradXY[MAX_Q1D][MAX_Q1D][2];
--            for (int qy = 0; qy < Q1D; ++qy)
--            {
--               for (int qx = 0; qx < Q1D; ++qx)
--               {
--                  for (int d = 0; d < 2; ++d)
--                  {
--                     gradXY[qy][qx][d] = 0.0;
--                  }
--               }
--            }
--
--            for (int dy = 0; dy < D1Dy; ++dy)
--            {
--               double massX[MAX_Q1D];
--               for (int qx = 0; qx < Q1D; ++qx)
--               {
--                  massX[qx] = 0.0;
--               }
--
--               for (int dx = 0; dx < D1Dx; ++dx)
--               {
--                  const double t = X(dx + ((dy + (dz * D1Dy)) * D1Dx) + osc, e);
--                  for (int qx = 0; qx < Q1D; ++qx)
--                  {
--                     massX[qx] += t * Bo(qx,dx);
--                  }
--               }
--
--               for (int qy = 0; qy < Q1D; ++qy)
--               {
--                  const double wy = Bc(qy,dy);
--                  const double wDy = Gc(qy,dy);
--                  for (int qx = 0; qx < Q1D; ++qx)
--                  {
--                     const double wx = massX[qx];
--                     gradXY[qy][qx][0] += wx * wDy;
--                     gradXY[qy][qx][1] += wx * wy;
--                  }
--               }
--            }
--
--            for (int qz = 0; qz < Q1D; ++qz)
--            {
--               const double wz = Bc(qz,dz);
--               const double wDz = Gc(qz,dz);
--               for (int qy = 0; qy < Q1D; ++qy)
--               {
--                  for (int qx = 0; qx < Q1D; ++qx)
--                  {
--                     // \hat{\nabla}\times\hat{u} is [0, (u_0)_{x_2}, -(u_0)_{x_1}]
--                     curl[qz][qy][qx][1] += gradXY[qy][qx][1] * wDz; // (u_0)_{x_2}
--                     curl[qz][qy][qx][2] -= gradXY[qy][qx][0] * wz;  // -(u_0)_{x_1}
--                  }
--               }
--            }
--         }
--
--         osc += D1Dx * D1Dy * D1Dz;
--      }
--
--      {
--         // y component
--         const int D1Dz = D1D;
--         const int D1Dy = D1D - 1;
--         const int D1Dx = D1D;
--
--         for (int dz = 0; dz < D1Dz; ++dz)
--         {
--            double gradXY[MAX_Q1D][MAX_Q1D][2];
--            for (int qy = 0; qy < Q1D; ++qy)
--            {
--               for (int qx = 0; qx < Q1D; ++qx)
--               {
--                  for (int d = 0; d < 2; ++d)
--                  {
--                     gradXY[qy][qx][d] = 0.0;
--                  }
--               }
--            }
--
--            for (int dx = 0; dx < D1Dx; ++dx)
--            {
--               double massY[MAX_Q1D];
--               for (int qy = 0; qy < Q1D; ++qy)
--               {
--                  massY[qy] = 0.0;
--               }
--
--               for (int dy = 0; dy < D1Dy; ++dy)
--               {
--                  const double t = X(dx + ((dy + (dz * D1Dy)) * D1Dx) + osc, e);
--                  for (int qy = 0; qy < Q1D; ++qy)
--                  {
--                     massY[qy] += t * Bo(qy,dy);
--                  }
--               }
--
--               for (int qx = 0; qx < Q1D; ++qx)
--               {
--                  const double wx = Bc(qx,dx);
--                  const double wDx = Gc(qx,dx);
--                  for (int qy = 0; qy < Q1D; ++qy)
--                  {
--                     const double wy = massY[qy];
--                     gradXY[qy][qx][0] += wDx * wy;
--                     gradXY[qy][qx][1] += wx * wy;
--                  }
--               }
--            }
--
--            for (int qz = 0; qz < Q1D; ++qz)
--            {
--               const double wz = Bc(qz,dz);
--               const double wDz = Gc(qz,dz);
--               for (int qy = 0; qy < Q1D; ++qy)
--               {
--                  for (int qx = 0; qx < Q1D; ++qx)
--                  {
--                     // \hat{\nabla}\times\hat{u} is [-(u_1)_{x_2}, 0, (u_1)_{x_0}]
--                     curl[qz][qy][qx][0] -= gradXY[qy][qx][1] * wDz; // -(u_1)_{x_2}
--                     curl[qz][qy][qx][2] += gradXY[qy][qx][0] * wz;  // (u_1)_{x_0}
--                  }
--               }
--            }
--         }
--
--         osc += D1Dx * D1Dy * D1Dz;
--      }
--
--      {
--         // z component
--         const int D1Dz = D1D - 1;
--         const int D1Dy = D1D;
--         const int D1Dx = D1D;
--
--         for (int dx = 0; dx < D1Dx; ++dx)
--         {
--            double gradYZ[MAX_Q1D][MAX_Q1D][2];
--            for (int qz = 0; qz < Q1D; ++qz)
--            {
--               for (int qy = 0; qy < Q1D; ++qy)
--               {
--                  for (int d = 0; d < 2; ++d)
--                  {
--                     gradYZ[qz][qy][d] = 0.0;
--                  }
--               }
--            }
--
--            for (int dy = 0; dy < D1Dy; ++dy)
--            {
--               double massZ[MAX_Q1D];
--               for (int qz = 0; qz < Q1D; ++qz)
--               {
--                  massZ[qz] = 0.0;
--               }
--
--               for (int dz = 0; dz < D1Dz; ++dz)
--               {
--                  const double t = X(dx + ((dy + (dz * D1Dy)) * D1Dx) + osc, e);
--                  for (int qz = 0; qz < Q1D; ++qz)
--                  {
--                     massZ[qz] += t * Bo(qz,dz);
--                  }
--               }
--
--               for (int qy = 0; qy < Q1D; ++qy)
--               {
--                  const double wy = Bc(qy,dy);
--                  const double wDy = Gc(qy,dy);
--                  for (int qz = 0; qz < Q1D; ++qz)
--                  {
--                     const double wz = massZ[qz];
--                     gradYZ[qz][qy][0] += wz * wy;
--                     gradYZ[qz][qy][1] += wz * wDy;
--                  }
--               }
--            }
--
--            for (int qx = 0; qx < Q1D; ++qx)
--            {
--               const double wx = Bc(qx,dx);
--               const double wDx = Gc(qx,dx);
--
--               for (int qy = 0; qy < Q1D; ++qy)
--               {
--                  for (int qz = 0; qz < Q1D; ++qz)
--                  {
--                     // \hat{\nabla}\times\hat{u} is [(u_2)_{x_1}, -(u_2)_{x_0}, 0]
--                     curl[qz][qy][qx][0] += gradYZ[qz][qy][1] * wx;  // (u_2)_{x_1}
--                     curl[qz][qy][qx][1] -= gradYZ[qz][qy][0] * wDx; // -(u_2)_{x_0}
--                  }
--               }
--            }
--         }
--      }
--
--      // Apply D operator.
--      for (int qz = 0; qz < Q1D; ++qz)
--      {
--         for (int qy = 0; qy < Q1D; ++qy)
--         {
--            for (int qx = 0; qx < Q1D; ++qx)
--            {
--               const double O11 = op(0,qx,qy,qz,e);
--               if (coeffDim == 1)
--               {
--                  for (int c = 0; c < VDIM; ++c)
--                  {
--                     curl[qz][qy][qx][c] *= O11;
--                  }
--               }
--               else
--               {
--                  const double O21 = op(1,qx,qy,qz,e);
--                  const double O31 = op(2,qx,qy,qz,e);
--                  const double O12 = op(3,qx,qy,qz,e);
--                  const double O22 = op(4,qx,qy,qz,e);
--                  const double O32 = op(5,qx,qy,qz,e);
--                  const double O13 = op(6,qx,qy,qz,e);
--                  const double O23 = op(7,qx,qy,qz,e);
--                  const double O33 = op(8,qx,qy,qz,e);
--                  const double curlX = curl[qz][qy][qx][0];
--                  const double curlY = curl[qz][qy][qx][1];
--                  const double curlZ = curl[qz][qy][qx][2];
--                  curl[qz][qy][qx][0] = (O11*curlX)+(O12*curlY)+(O13*curlZ);
--                  curl[qz][qy][qx][1] = (O21*curlX)+(O22*curlY)+(O23*curlZ);
--                  curl[qz][qy][qx][2] = (O31*curlX)+(O32*curlY)+(O33*curlZ);
--               }
--            }
--         }
--      }
--
--      for (int qz = 0; qz < Q1D; ++qz)
--      {
--         double massXY[MAX_D1D][MAX_D1D];
--
--         osc = 0;
--
--         for (int c = 0; c < VDIM; ++c)  // loop over x, y, z components
--         {
--            const int D1Dz = (c == 2) ? D1D - 1 : D1D;
--            const int D1Dy = (c == 1) ? D1D - 1 : D1D;
--            const int D1Dx = (c == 0) ? D1D - 1 : D1D;
--
--            for (int dy = 0; dy < D1Dy; ++dy)
--            {
--               for (int dx = 0; dx < D1Dx; ++dx)
--               {
--                  massXY[dy][dx] = 0;
--               }
--            }
--            for (int qy = 0; qy < Q1D; ++qy)
--            {
--               double massX[MAX_D1D];
--               for (int dx = 0; dx < D1Dx; ++dx)
--               {
--                  massX[dx] = 0.0;
--               }
--               for (int qx = 0; qx < Q1D; ++qx)
--               {
--                  for (int dx = 0; dx < D1Dx; ++dx)
--                  {
--                     massX[dx] += curl[qz][qy][qx][c] * ((c == 0) ? Bot(dx,qx) : Bct(dx,qx));
--                  }
--               }
--
--               for (int dy = 0; dy < D1Dy; ++dy)
--               {
--                  const double wy = (c == 1) ? Bot(dy,qy) : Bct(dy,qy);
--                  for (int dx = 0; dx < D1Dx; ++dx)
--                  {
--                     massXY[dy][dx] += massX[dx] * wy;
--                  }
--               }
--            }
--
--            for (int dz = 0; dz < D1Dz; ++dz)
--            {
--               const double wz = (c == 2) ? Bot(dz,qz) : Bct(dz,qz);
--               for (int dy = 0; dy < D1Dy; ++dy)
--               {
--                  for (int dx = 0; dx < D1Dx; ++dx)
--                  {
--                     Y(dx + ((dy + (dz * D1Dy)) * D1Dx) + osc, e) += massXY[dy][dx] * wz;
--                  }
--               }
--            }
--
--            osc += D1Dx * D1Dy * D1Dz;
--         }  // loop c
--      }  // loop qz
--   }); // end of element loop
--}
--
--// Apply to x corresponding to DOFs in H(curl) (trial), whose curl is
--// integrated against H(curl) test functions corresponding to y.
--template<int MAX_D1D = HCURL_MAX_D1D, int MAX_Q1D = HCURL_MAX_Q1D>
--static void SmemPAHcurlL2Apply3D(const int D1D,
--                                 const int Q1D,
--                                 const int coeffDim,
--                                 const int NE,
--                                 const Array<double> &bo,
--                                 const Array<double> &bc,
--                                 const Array<double> &gc,
--                                 const Vector &pa_data,
--                                 const Vector &x,
--                                 Vector &y)
--{
--   MFEM_VERIFY(D1D <= MAX_D1D, "Error: D1D > MAX_D1D");
--   MFEM_VERIFY(Q1D <= MAX_Q1D, "Error: Q1D > MAX_Q1D");
--
--   auto Bo = Reshape(bo.Read(), Q1D, D1D-1);
--   auto Bc = Reshape(bc.Read(), Q1D, D1D);
--   auto Gc = Reshape(gc.Read(), Q1D, D1D);
--   auto op = Reshape(pa_data.Read(), coeffDim, Q1D, Q1D, Q1D, NE);
--   auto X = Reshape(x.Read(), 3*(D1D-1)*D1D*D1D, NE);
--   auto Y = Reshape(y.ReadWrite(), 3*(D1D-1)*D1D*D1D, NE);
--
--   auto device_kernel = [=] MFEM_DEVICE (int e)
--   {
--      constexpr int VDIM = 3;
--      constexpr int maxCoeffDim = 9;
--
--      MFEM_SHARED double sBo[MAX_D1D][MAX_Q1D];
--      MFEM_SHARED double sBc[MAX_D1D][MAX_Q1D];
--      MFEM_SHARED double sGc[MAX_D1D][MAX_Q1D];
--
--      double opc[maxCoeffDim];
--      MFEM_SHARED double sop[maxCoeffDim][MAX_Q1D][MAX_Q1D];
--      MFEM_SHARED double curl[MAX_Q1D][MAX_Q1D][3];
--
--      MFEM_SHARED double sX[MAX_D1D][MAX_D1D][MAX_D1D];
--
--      MFEM_FOREACH_THREAD(qx,x,Q1D)
--      {
--         MFEM_FOREACH_THREAD(qy,y,Q1D)
--         {
--            MFEM_FOREACH_THREAD(qz,z,Q1D)
--            {
--               for (int i=0; i<coeffDim; ++i)
--               {
--                  opc[i] = op(i,qx,qy,qz,e);
--               }
--            }
--         }
--      }
--
--      const int tidx = MFEM_THREAD_ID(x);
--      const int tidy = MFEM_THREAD_ID(y);
--      const int tidz = MFEM_THREAD_ID(z);
--
--      if (tidz == 0)
--      {
--         MFEM_FOREACH_THREAD(d,y,D1D)
--         {
--            MFEM_FOREACH_THREAD(q,x,Q1D)
--            {
--               sBc[d][q] = Bc(q,d);
--               sGc[d][q] = Gc(q,d);
--               if (d < D1D-1)
--               {
--                  sBo[d][q] = Bo(q,d);
--               }
--            }
--         }
--      }
--      MFEM_SYNC_THREAD;
--
--      for (int qz=0; qz < Q1D; ++qz)
--      {
--         if (tidz == qz)
--         {
--            MFEM_FOREACH_THREAD(qy,y,Q1D)
--            {
--               MFEM_FOREACH_THREAD(qx,x,Q1D)
--               {
--                  for (int i=0; i<3; ++i)
--                  {
--                     curl[qy][qx][i] = 0.0;
--                  }
--               }
--            }
--         }
--
--         int osc = 0;
--         for (int c = 0; c < VDIM; ++c)  // loop over x, y, z components
--         {
--            const int D1Dz = (c == 2) ? D1D - 1 : D1D;
--            const int D1Dy = (c == 1) ? D1D - 1 : D1D;
--            const int D1Dx = (c == 0) ? D1D - 1 : D1D;
--
--            MFEM_FOREACH_THREAD(dz,z,D1Dz)
--            {
--               MFEM_FOREACH_THREAD(dy,y,D1Dy)
--               {
--                  MFEM_FOREACH_THREAD(dx,x,D1Dx)
--                  {
--                     sX[dz][dy][dx] = X(dx + ((dy + (dz * D1Dy)) * D1Dx) + osc, e);
--                  }
--               }
--            }
--            MFEM_SYNC_THREAD;
--
--            if (tidz == qz)
--            {
--               if (c == 0)
--               {
--                  for (int i=0; i<coeffDim; ++i)
--                  {
--                     sop[i][tidx][tidy] = opc[i];
--                  }
--               }
--
--               MFEM_FOREACH_THREAD(qy,y,Q1D)
--               {
--                  MFEM_FOREACH_THREAD(qx,x,Q1D)
--                  {
--                     double u = 0.0;
--                     double v = 0.0;
--
--                     // We treat x, y, z components separately for optimization specific to each.
--                     if (c == 0) // x component
--                     {
--                        // \hat{\nabla}\times\hat{u} is [0, (u_0)_{x_2}, -(u_0)_{x_1}]
--
--                        for (int dz = 0; dz < D1Dz; ++dz)
--                        {
--                           const double wz = sBc[dz][qz];
--                           const double wDz = sGc[dz][qz];
--
--                           for (int dy = 0; dy < D1Dy; ++dy)
--                           {
--                              const double wy = sBc[dy][qy];
--                              const double wDy = sGc[dy][qy];
--
--                              for (int dx = 0; dx < D1Dx; ++dx)
--                              {
--                                 const double wx = sX[dz][dy][dx] * sBo[dx][qx];
--                                 u += wx * wDy * wz;
--                                 v += wx * wy * wDz;
--                              }
--                           }
--                        }
--
--                        curl[qy][qx][1] += v; // (u_0)_{x_2}
--                        curl[qy][qx][2] -= u;  // -(u_0)_{x_1}
--                     }
--                     else if (c == 1)  // y component
--                     {
--                        // \hat{\nabla}\times\hat{u} is [-(u_1)_{x_2}, 0, (u_1)_{x_0}]
--
--                        for (int dz = 0; dz < D1Dz; ++dz)
--                        {
--                           const double wz = sBc[dz][qz];
--                           const double wDz = sGc[dz][qz];
--
--                           for (int dy = 0; dy < D1Dy; ++dy)
--                           {
--                              const double wy = sBo[dy][qy];
--
--                              for (int dx = 0; dx < D1Dx; ++dx)
--                              {
--                                 const double t = sX[dz][dy][dx];
--                                 const double wx = t * sBc[dx][qx];
--                                 const double wDx = t * sGc[dx][qx];
--
--                                 u += wDx * wy * wz;
--                                 v += wx * wy * wDz;
--                              }
--                           }
--                        }
--
--                        curl[qy][qx][0] -= v; // -(u_1)_{x_2}
--                        curl[qy][qx][2] += u; // (u_1)_{x_0}
--                     }
--                     else // z component
--                     {
--                        // \hat{\nabla}\times\hat{u} is [(u_2)_{x_1}, -(u_2)_{x_0}, 0]
--
--                        for (int dz = 0; dz < D1Dz; ++dz)
--                        {
--                           const double wz = sBo[dz][qz];
--
--                           for (int dy = 0; dy < D1Dy; ++dy)
--                           {
--                              const double wy = sBc[dy][qy];
--                              const double wDy = sGc[dy][qy];
--
--                              for (int dx = 0; dx < D1Dx; ++dx)
--                              {
--                                 const double t = sX[dz][dy][dx];
--                                 const double wx = t * sBc[dx][qx];
--                                 const double wDx = t * sGc[dx][qx];
--
--                                 u += wDx * wy * wz;
--                                 v += wx * wDy * wz;
--                              }
--                           }
--                        }
--
--                        curl[qy][qx][0] += v; // (u_2)_{x_1}
--                        curl[qy][qx][1] -= u; // -(u_2)_{x_0}
--                     }
--                  } // qx
--               } // qy
--            } // tidz == qz
--
--            osc += D1Dx * D1Dy * D1Dz;
--            MFEM_SYNC_THREAD;
--         } // c
--
--         double dxyz1 = 0.0;
--         double dxyz2 = 0.0;
--         double dxyz3 = 0.0;
--
--         MFEM_FOREACH_THREAD(dz,z,D1D)
--         {
--            const double wcz = sBc[dz][qz];
--            const double wz = (dz < D1D-1) ? sBo[dz][qz] : 0.0;
--
--            MFEM_FOREACH_THREAD(dy,y,D1D)
--            {
--               MFEM_FOREACH_THREAD(dx,x,D1D)
--               {
--                  for (int qy = 0; qy < Q1D; ++qy)
--                  {
--                     const double wcy = sBc[dy][qy];
--                     const double wy = (dy < D1D-1) ? sBo[dy][qy] : 0.0;
--
--                     for (int qx = 0; qx < Q1D; ++qx)
--                     {
--                        const double O11 = sop[0][qx][qy];
--                        double c1, c2, c3;
--                        if (coeffDim == 1)
--                        {
--                           c1 = O11 * curl[qy][qx][0];
--                           c2 = O11 * curl[qy][qx][1];
--                           c3 = O11 * curl[qy][qx][2];
--                        }
--                        else
--                        {
--                           const double O21 = sop[1][qx][qy];
--                           const double O31 = sop[2][qx][qy];
--                           const double O12 = sop[3][qx][qy];
--                           const double O22 = sop[4][qx][qy];
--                           const double O32 = sop[5][qx][qy];
--                           const double O13 = sop[6][qx][qy];
--                           const double O23 = sop[7][qx][qy];
--                           const double O33 = sop[8][qx][qy];
--                           c1 = (O11*curl[qy][qx][0])+(O12*curl[qy][qx][1])+(O13*curl[qy][qx][2]);
--                           c2 = (O21*curl[qy][qx][0])+(O22*curl[qy][qx][1])+(O23*curl[qy][qx][2]);
--                           c3 = (O31*curl[qy][qx][0])+(O32*curl[qy][qx][1])+(O33*curl[qy][qx][2]);
--                        }
--
--                        const double wcx = sBc[dx][qx];
--
--                        if (dx < D1D-1)
--                        {
--                           const double wx = sBo[dx][qx];
--                           dxyz1 += c1 * wx * wcy * wcz;
--                        }
--
--                        dxyz2 += c2 * wcx * wy * wcz;
--                        dxyz3 += c3 * wcx * wcy * wz;
--                     } // qx
--                  } // qy
--               } // dx
--            } // dy
--         } // dz
--
--         MFEM_SYNC_THREAD;
--
--         MFEM_FOREACH_THREAD(dz,z,D1D)
--         {
--            MFEM_FOREACH_THREAD(dy,y,D1D)
--            {
--               MFEM_FOREACH_THREAD(dx,x,D1D)
--               {
--                  if (dx < D1D-1)
--                  {
--                     Y(dx + ((dy + (dz * D1D)) * (D1D-1)), e) += dxyz1;
--                  }
--                  if (dy < D1D-1)
--                  {
--                     Y(dx + ((dy + (dz * (D1D-1))) * D1D) + ((D1D-1)*D1D*D1D), e) += dxyz2;
--                  }
--                  if (dz < D1D-1)
--                  {
--                     Y(dx + ((dy + (dz * D1D)) * D1D) + (2*(D1D-1)*D1D*D1D), e) += dxyz3;
--                  }
--               }
--            }
--         }
--      } // qz
--   }; // end of element loop
--
--   auto host_kernel = [&] MFEM_LAMBDA (int)
--   {
--      MFEM_ABORT_KERNEL("This kernel should only be used on GPU.");
--   };
--
--   ForallWrap<3>(true, NE, device_kernel, host_kernel, Q1D, Q1D, Q1D);
--}
--
--// Apply to x corresponding to DOFs in H(curl) (trial), whose curl is
--// integrated against H(div) test functions corresponding to y.
--template<int MAX_D1D = HCURL_MAX_D1D, int MAX_Q1D = HCURL_MAX_Q1D>
--static void PAHcurlHdivApply3D(const int D1D,
--                               const int D1Dtest,
--                               const int Q1D,
--                               const int NE,
--                               const Array<double> &bo,
--                               const Array<double> &bc,
--                               const Array<double> &bot,
--                               const Array<double> &bct,
--                               const Array<double> &gc,
--                               const Vector &pa_data,
--                               const Vector &x,
--                               Vector &y)
--{
--   MFEM_VERIFY(D1D <= MAX_D1D, "Error: D1D > MAX_D1D");
--   MFEM_VERIFY(Q1D <= MAX_Q1D, "Error: Q1D > MAX_Q1D");
--   // Using Piola transformations (\nabla\times u) F = 1/det(dF) dF \hat{\nabla}\times\hat{u}
--   // for u in H(curl) and w = (1 / det (dF)) dF \hat{w} for w in H(div), we get
--   // (\nabla\times u) \cdot w = 1/det(dF)^2 \hat{\nabla}\times\hat{u}^T dF^T dF \hat{w}
--   // If c = 0, \hat{\nabla}\times\hat{u} reduces to [0, (u_0)_{x_2}, -(u_0)_{x_1}]
--   // If c = 1, \hat{\nabla}\times\hat{u} reduces to [-(u_1)_{x_2}, 0, (u_1)_{x_0}]
--   // If c = 2, \hat{\nabla}\times\hat{u} reduces to [(u_2)_{x_1}, -(u_2)_{x_0}, 0]
--
--   constexpr static int VDIM = 3;
--
--   auto Bo = Reshape(bo.Read(), Q1D, D1D-1);
--   auto Bc = Reshape(bc.Read(), Q1D, D1D);
--   auto Bot = Reshape(bot.Read(), D1Dtest-1, Q1D);
--   auto Bct = Reshape(bct.Read(), D1Dtest, Q1D);
--   auto Gc = Reshape(gc.Read(), Q1D, D1D);
--   auto op = Reshape(pa_data.Read(), Q1D, Q1D, Q1D, 6, NE);
--   auto X = Reshape(x.Read(), 3*(D1D-1)*D1D*D1D, NE);
--   auto Y = Reshape(y.ReadWrite(), 3*(D1Dtest-1)*(D1Dtest-1)*D1D, NE);
--
--   mfem::forall(NE, [=] MFEM_HOST_DEVICE (int e)
--   {
--      double curl[MAX_Q1D][MAX_Q1D][MAX_Q1D][VDIM];
--      // curl[qz][qy][qx] will be computed as the vector curl at each quadrature point.
--
--      for (int qz = 0; qz < Q1D; ++qz)
--      {
--         for (int qy = 0; qy < Q1D; ++qy)
--         {
--            for (int qx = 0; qx < Q1D; ++qx)
--            {
--               for (int c = 0; c < VDIM; ++c)
--               {
--                  curl[qz][qy][qx][c] = 0.0;
--               }
--            }
--         }
--      }
--
--      // We treat x, y, z components separately for optimization specific to each.
--
--      int osc = 0;
--
--      {
--         // x component
--         const int D1Dz = D1D;
--         const int D1Dy = D1D;
--         const int D1Dx = D1D - 1;
--
--         for (int dz = 0; dz < D1Dz; ++dz)
--         {
--            double gradXY[MAX_Q1D][MAX_Q1D][2];
--            for (int qy = 0; qy < Q1D; ++qy)
--            {
--               for (int qx = 0; qx < Q1D; ++qx)
--               {
--                  for (int d = 0; d < 2; ++d)
--                  {
--                     gradXY[qy][qx][d] = 0.0;
--                  }
--               }
--            }
--
--            for (int dy = 0; dy < D1Dy; ++dy)
--            {
--               double massX[MAX_Q1D];
--               for (int qx = 0; qx < Q1D; ++qx)
--               {
--                  massX[qx] = 0.0;
--               }
--
--               for (int dx = 0; dx < D1Dx; ++dx)
--               {
--                  const double t = X(dx + ((dy + (dz * D1Dy)) * D1Dx) + osc, e);
--                  for (int qx = 0; qx < Q1D; ++qx)
--                  {
--                     massX[qx] += t * Bo(qx,dx);
--                  }
--               }
--
--               for (int qy = 0; qy < Q1D; ++qy)
--               {
--                  const double wy = Bc(qy,dy);
--                  const double wDy = Gc(qy,dy);
--                  for (int qx = 0; qx < Q1D; ++qx)
--                  {
--                     const double wx = massX[qx];
--                     gradXY[qy][qx][0] += wx * wDy;
--                     gradXY[qy][qx][1] += wx * wy;
--                  }
--               }
--            }
--
--            for (int qz = 0; qz < Q1D; ++qz)
--            {
--               const double wz = Bc(qz,dz);
--               const double wDz = Gc(qz,dz);
--               for (int qy = 0; qy < Q1D; ++qy)
--               {
--                  for (int qx = 0; qx < Q1D; ++qx)
--                  {
--                     // \hat{\nabla}\times\hat{u} is [0, (u_0)_{x_2}, -(u_0)_{x_1}]
--                     curl[qz][qy][qx][1] += gradXY[qy][qx][1] * wDz; // (u_0)_{x_2}
--                     curl[qz][qy][qx][2] -= gradXY[qy][qx][0] * wz;  // -(u_0)_{x_1}
--                  }
--               }
--            }
--         }
--
--         osc += D1Dx * D1Dy * D1Dz;
--      }
--
--      {
--         // y component
--         const int D1Dz = D1D;
--         const int D1Dy = D1D - 1;
--         const int D1Dx = D1D;
--
--         for (int dz = 0; dz < D1Dz; ++dz)
--         {
--            double gradXY[MAX_Q1D][MAX_Q1D][2];
--            for (int qy = 0; qy < Q1D; ++qy)
--            {
--               for (int qx = 0; qx < Q1D; ++qx)
--               {
--                  for (int d = 0; d < 2; ++d)
--                  {
--                     gradXY[qy][qx][d] = 0.0;
--                  }
--               }
--            }
--
--            for (int dx = 0; dx < D1Dx; ++dx)
--            {
--               double massY[MAX_Q1D];
--               for (int qy = 0; qy < Q1D; ++qy)
--               {
--                  massY[qy] = 0.0;
--               }
--
--               for (int dy = 0; dy < D1Dy; ++dy)
--               {
--                  const double t = X(dx + ((dy + (dz * D1Dy)) * D1Dx) + osc, e);
--                  for (int qy = 0; qy < Q1D; ++qy)
--                  {
--                     massY[qy] += t * Bo(qy,dy);
--                  }
--               }
--
--               for (int qx = 0; qx < Q1D; ++qx)
--               {
--                  const double wx = Bc(qx,dx);
--                  const double wDx = Gc(qx,dx);
--                  for (int qy = 0; qy < Q1D; ++qy)
--                  {
--                     const double wy = massY[qy];
--                     gradXY[qy][qx][0] += wDx * wy;
--                     gradXY[qy][qx][1] += wx * wy;
--                  }
--               }
--            }
--
--            for (int qz = 0; qz < Q1D; ++qz)
--            {
--               const double wz = Bc(qz,dz);
--               const double wDz = Gc(qz,dz);
--               for (int qy = 0; qy < Q1D; ++qy)
--               {
--                  for (int qx = 0; qx < Q1D; ++qx)
--                  {
--                     // \hat{\nabla}\times\hat{u} is [-(u_1)_{x_2}, 0, (u_1)_{x_0}]
--                     curl[qz][qy][qx][0] -= gradXY[qy][qx][1] * wDz; // -(u_1)_{x_2}
--                     curl[qz][qy][qx][2] += gradXY[qy][qx][0] * wz;  // (u_1)_{x_0}
--                  }
--               }
--            }
--         }
--
--         osc += D1Dx * D1Dy * D1Dz;
--      }
--
--      {
--         // z component
--         const int D1Dz = D1D - 1;
--         const int D1Dy = D1D;
--         const int D1Dx = D1D;
--
--         for (int dx = 0; dx < D1Dx; ++dx)
--         {
--            double gradYZ[MAX_Q1D][MAX_Q1D][2];
--            for (int qz = 0; qz < Q1D; ++qz)
--            {
--               for (int qy = 0; qy < Q1D; ++qy)
--               {
--                  for (int d = 0; d < 2; ++d)
--                  {
--                     gradYZ[qz][qy][d] = 0.0;
--                  }
--               }
--            }
--
--            for (int dy = 0; dy < D1Dy; ++dy)
--            {
--               double massZ[MAX_Q1D];
--               for (int qz = 0; qz < Q1D; ++qz)
--               {
--                  massZ[qz] = 0.0;
--               }
--
--               for (int dz = 0; dz < D1Dz; ++dz)
--               {
--                  const double t = X(dx + ((dy + (dz * D1Dy)) * D1Dx) + osc, e);
--                  for (int qz = 0; qz < Q1D; ++qz)
--                  {
--                     massZ[qz] += t * Bo(qz,dz);
--                  }
--               }
--
--               for (int qy = 0; qy < Q1D; ++qy)
--               {
--                  const double wy = Bc(qy,dy);
--                  const double wDy = Gc(qy,dy);
--                  for (int qz = 0; qz < Q1D; ++qz)
--                  {
--                     const double wz = massZ[qz];
--                     gradYZ[qz][qy][0] += wz * wy;
--                     gradYZ[qz][qy][1] += wz * wDy;
--                  }
--               }
--            }
--
--            for (int qx = 0; qx < Q1D; ++qx)
--            {
--               const double wx = Bc(qx,dx);
--               const double wDx = Gc(qx,dx);
--
--               for (int qy = 0; qy < Q1D; ++qy)
--               {
--                  for (int qz = 0; qz < Q1D; ++qz)
--                  {
--                     // \hat{\nabla}\times\hat{u} is [(u_2)_{x_1}, -(u_2)_{x_0}, 0]
--                     curl[qz][qy][qx][0] += gradYZ[qz][qy][1] * wx;  // (u_2)_{x_1}
--                     curl[qz][qy][qx][1] -= gradYZ[qz][qy][0] * wDx; // -(u_2)_{x_0}
--                  }
--               }
--            }
--         }
--      }
--
--      // Apply D operator.
--      for (int qz = 0; qz < Q1D; ++qz)
--      {
--         for (int qy = 0; qy < Q1D; ++qy)
--         {
--            for (int qx = 0; qx < Q1D; ++qx)
--            {
--               const double O11 = op(qx,qy,qz,0,e);
--               const double O12 = op(qx,qy,qz,1,e);
--               const double O13 = op(qx,qy,qz,2,e);
--               const double O22 = op(qx,qy,qz,3,e);
--               const double O23 = op(qx,qy,qz,4,e);
--               const double O33 = op(qx,qy,qz,5,e);
--
--               const double c1 = (O11 * curl[qz][qy][qx][0]) + (O12 * curl[qz][qy][qx][1]) +
--                                 (O13 * curl[qz][qy][qx][2]);
--               const double c2 = (O12 * curl[qz][qy][qx][0]) + (O22 * curl[qz][qy][qx][1]) +
--                                 (O23 * curl[qz][qy][qx][2]);
--               const double c3 = (O13 * curl[qz][qy][qx][0]) + (O23 * curl[qz][qy][qx][1]) +
--                                 (O33 * curl[qz][qy][qx][2]);
--
--               curl[qz][qy][qx][0] = c1;
--               curl[qz][qy][qx][1] = c2;
--               curl[qz][qy][qx][2] = c3;
--            }
--         }
--      }
--
--      for (int qz = 0; qz < Q1D; ++qz)
--      {
--         double massXY[HCURL_MAX_D1D][HCURL_MAX_D1D];  // Assuming HDIV_MAX_D1D <= HCURL_MAX_D1D
--
--         osc = 0;
--
--         for (int c = 0; c < VDIM; ++c)  // loop over x, y, z components
--         {
--            const int D1Dz = (c == 2) ? D1Dtest : D1Dtest - 1;
--            const int D1Dy = (c == 1) ? D1Dtest : D1Dtest - 1;
--            const int D1Dx = (c == 0) ? D1Dtest : D1Dtest - 1;
--
--            for (int dy = 0; dy < D1Dy; ++dy)
--            {
--               for (int dx = 0; dx < D1Dx; ++dx)
--               {
--                  massXY[dy][dx] = 0;
--               }
--            }
--            for (int qy = 0; qy < Q1D; ++qy)
--            {
--               double massX[HCURL_MAX_D1D];
--               for (int dx = 0; dx < D1Dx; ++dx)
--               {
--                  massX[dx] = 0;
--               }
--               for (int qx = 0; qx < Q1D; ++qx)
--               {
--                  for (int dx = 0; dx < D1Dx; ++dx)
--                  {
--                     massX[dx] += curl[qz][qy][qx][c] *
--                                  ((c == 0) ? Bct(dx,qx) : Bot(dx,qx));
--                  }
--               }
--               for (int dy = 0; dy < D1Dy; ++dy)
--               {
--                  const double wy = (c == 1) ? Bct(dy,qy) : Bot(dy,qy);
--                  for (int dx = 0; dx < D1Dx; ++dx)
--                  {
--                     massXY[dy][dx] += massX[dx] * wy;
--                  }
--               }
--            }
--
--            for (int dz = 0; dz < D1Dz; ++dz)
--            {
--               const double wz = (c == 2) ? Bct(dz,qz) : Bot(dz,qz);
--               for (int dy = 0; dy < D1Dy; ++dy)
--               {
--                  for (int dx = 0; dx < D1Dx; ++dx)
--                  {
--                     Y(dx + ((dy + (dz * D1Dy)) * D1Dx) + osc, e) +=
--                        massXY[dy][dx] * wz;
--                  }
--               }
--            }
--
--            osc += D1Dx * D1Dy * D1Dz;
--         }  // loop c
--      }  // loop qz
--   }); // end of element loop
--}
--
--// Apply to x corresponding to DOFs in H(div) (test), integrated against the
--// curl of H(curl) trial functions corresponding to y.
--template<int MAX_D1D = HCURL_MAX_D1D, int MAX_Q1D = HCURL_MAX_Q1D>
--static void PAHcurlHdivApply3DTranspose(const int D1D,
--                                        const int D1Dtest,
--                                        const int Q1D,
--                                        const int NE,
--                                        const Array<double> &bo,
--                                        const Array<double> &bc,
--                                        const Array<double> &bot,
--                                        const Array<double> &bct,
--                                        const Array<double> &gct,
--                                        const Vector &pa_data,
--                                        const Vector &x,
--                                        Vector &y)
--{
--   MFEM_VERIFY(D1D <= MAX_D1D, "Error: D1D > MAX_D1D");
--   MFEM_VERIFY(Q1D <= MAX_Q1D, "Error: Q1D > MAX_Q1D");
--   // Using Piola transformations (\nabla\times u) F = 1/det(dF) dF \hat{\nabla}\times\hat{u}
--   // for u in H(curl) and w = (1 / det (dF)) dF \hat{w} for w in H(div), we get
--   // (\nabla\times u) \cdot w = 1/det(dF)^2 \hat{\nabla}\times\hat{u}^T dF^T dF \hat{w}
--   // If c = 0, \hat{\nabla}\times\hat{u} reduces to [0, (u_0)_{x_2}, -(u_0)_{x_1}]
--   // If c = 1, \hat{\nabla}\times\hat{u} reduces to [-(u_1)_{x_2}, 0, (u_1)_{x_0}]
--   // If c = 2, \hat{\nabla}\times\hat{u} reduces to [(u_2)_{x_1}, -(u_2)_{x_0}, 0]
--
--   constexpr static int VDIM = 3;
--
--   auto Bo = Reshape(bo.Read(), Q1D, D1D-1);
--   auto Bc = Reshape(bc.Read(), Q1D, D1D);
--   auto Bot = Reshape(bot.Read(), D1Dtest-1, Q1D);
--   auto Bct = Reshape(bct.Read(), D1Dtest, Q1D);
--   auto Gct = Reshape(gct.Read(), D1D, Q1D);
--   auto op = Reshape(pa_data.Read(), Q1D, Q1D, Q1D, 6, NE);
--   auto X = Reshape(x.Read(), 3*(D1Dtest-1)*(D1Dtest-1)*D1D, NE);
--   auto Y = Reshape(y.ReadWrite(), 3*(D1D-1)*D1D*D1D, NE);
--
--   mfem::forall(NE, [=] MFEM_HOST_DEVICE (int e)
--   {
--      double mass[MAX_Q1D][MAX_Q1D][MAX_Q1D][VDIM];  // Assuming HDIV_MAX_D1D <= HCURL_MAX_D1D
--
--      for (int qz = 0; qz < Q1D; ++qz)
--      {
--         for (int qy = 0; qy < Q1D; ++qy)
--         {
--            for (int qx = 0; qx < Q1D; ++qx)
--            {
--               for (int c = 0; c < VDIM; ++c)
--               {
--                  mass[qz][qy][qx][c] = 0.0;
--               }
--            }
--         }
--      }
--
--      int osc = 0;
--
--      for (int c = 0; c < VDIM; ++c)  // loop over x, y, z components
--      {
--         const int D1Dz = (c == 2) ? D1D : D1D - 1;
--         const int D1Dy = (c == 1) ? D1D : D1D - 1;
--         const int D1Dx = (c == 0) ? D1D : D1D - 1;
--
--         for (int dz = 0; dz < D1Dz; ++dz)
--         {
--            double massXY[HDIV_MAX_Q1D][HDIV_MAX_Q1D];
--            for (int qy = 0; qy < Q1D; ++qy)
--            {
--               for (int qx = 0; qx < Q1D; ++qx)
--               {
--                  massXY[qy][qx] = 0.0;
--               }
--            }
--
--            for (int dy = 0; dy < D1Dy; ++dy)
--            {
--               double massX[HDIV_MAX_Q1D];
--               for (int qx = 0; qx < Q1D; ++qx)
--               {
--                  massX[qx] = 0.0;
--               }
--
--               for (int dx = 0; dx < D1Dx; ++dx)
--               {
--                  const double t = X(dx + ((dy + (dz * D1Dy)) * D1Dx) + osc, e);
--                  for (int qx = 0; qx < Q1D; ++qx)
--                  {
--                     massX[qx] += t * ((c == 0) ? Bc(qx,dx) : Bo(qx,dx));
--                  }
--               }
--
--               for (int qy = 0; qy < Q1D; ++qy)
--               {
--                  const double wy = (c == 1) ? Bc(qy,dy) : Bo(qy,dy);
--                  for (int qx = 0; qx < Q1D; ++qx)
--                  {
--                     const double wx = massX[qx];
--                     massXY[qy][qx] += wx * wy;
--                  }
--               }
--            }
--
--            for (int qz = 0; qz < Q1D; ++qz)
--            {
--               const double wz = (c == 2) ? Bc(qz,dz) : Bo(qz,dz);
--               for (int qy = 0; qy < Q1D; ++qy)
--               {
--                  for (int qx = 0; qx < Q1D; ++qx)
--                  {
--                     mass[qz][qy][qx][c] += massXY[qy][qx] * wz;
--                  }
--               }
--            }
--         }
--
--         osc += D1Dx * D1Dy * D1Dz;
--      }  // loop (c) over components
--
--      // Apply D operator.
--      for (int qz = 0; qz < Q1D; ++qz)
--      {
--         for (int qy = 0; qy < Q1D; ++qy)
--         {
--            for (int qx = 0; qx < Q1D; ++qx)
--            {
--               const double O11 = op(qx,qy,qz,0,e);
--               const double O12 = op(qx,qy,qz,1,e);
--               const double O13 = op(qx,qy,qz,2,e);
--               const double O22 = op(qx,qy,qz,3,e);
--               const double O23 = op(qx,qy,qz,4,e);
--               const double O33 = op(qx,qy,qz,5,e);
--               const double massX = mass[qz][qy][qx][0];
--               const double massY = mass[qz][qy][qx][1];
--               const double massZ = mass[qz][qy][qx][2];
--               mass[qz][qy][qx][0] = (O11*massX)+(O12*massY)+(O13*massZ);
--               mass[qz][qy][qx][1] = (O12*massX)+(O22*massY)+(O23*massZ);
--               mass[qz][qy][qx][2] = (O13*massX)+(O23*massY)+(O33*massZ);
--            }
--         }
--      }
--
--      // x component
--      osc = 0;
--      {
--         const int D1Dz = D1D;
--         const int D1Dy = D1D;
--         const int D1Dx = D1D - 1;
--
--         for (int qz = 0; qz < Q1D; ++qz)
--         {
--            double gradXY12[MAX_D1D][MAX_D1D];
--            double gradXY21[MAX_D1D][MAX_D1D];
--
--            for (int dy = 0; dy < D1Dy; ++dy)
--            {
--               for (int dx = 0; dx < D1Dx; ++dx)
--               {
--                  gradXY12[dy][dx] = 0.0;
--                  gradXY21[dy][dx] = 0.0;
--               }
--            }
--            for (int qy = 0; qy < Q1D; ++qy)
--            {
--               double massX[MAX_D1D][2];
--               for (int dx = 0; dx < D1Dx; ++dx)
--               {
--                  for (int n = 0; n < 2; ++n)
--                  {
--                     massX[dx][n] = 0.0;
--                  }
--               }
--               for (int qx = 0; qx < Q1D; ++qx)
--               {
--                  for (int dx = 0; dx < D1Dx; ++dx)
--                  {
--                     const double wx = Bot(dx,qx);
--
--                     massX[dx][0] += wx * mass[qz][qy][qx][1];
--                     massX[dx][1] += wx * mass[qz][qy][qx][2];
--                  }
--               }
--               for (int dy = 0; dy < D1Dy; ++dy)
--               {
--                  const double wy = Bct(dy,qy);
--                  const double wDy = Gct(dy,qy);
--
--                  for (int dx = 0; dx < D1Dx; ++dx)
--                  {
--                     gradXY21[dy][dx] += massX[dx][0] * wy;
--                     gradXY12[dy][dx] += massX[dx][1] * wDy;
--                  }
--               }
--            }
--
--            for (int dz = 0; dz < D1Dz; ++dz)
--            {
--               const double wz = Bct(dz,qz);
--               const double wDz = Gct(dz,qz);
--               for (int dy = 0; dy < D1Dy; ++dy)
--               {
--                  for (int dx = 0; dx < D1Dx; ++dx)
--                  {
--                     // \hat{\nabla}\times\hat{u} is [0, (u_0)_{x_2}, -(u_0)_{x_1}]
--                     // (u_0)_{x_2} * (op * curl)_1 - (u_0)_{x_1} * (op * curl)_2
--                     Y(dx + ((dy + (dz * D1Dy)) * D1Dx) + osc,
--                       e) += (gradXY21[dy][dx] * wDz) - (gradXY12[dy][dx] * wz);
--                  }
--               }
--            }
--         }  // loop qz
--
--         osc += D1Dx * D1Dy * D1Dz;
--      }
--
--      // y component
--      {
--         const int D1Dz = D1D;
--         const int D1Dy = D1D - 1;
--         const int D1Dx = D1D;
--
--         for (int qz = 0; qz < Q1D; ++qz)
--         {
--            double gradXY02[MAX_D1D][MAX_D1D];
--            double gradXY20[MAX_D1D][MAX_D1D];
--
--            for (int dy = 0; dy < D1Dy; ++dy)
--            {
--               for (int dx = 0; dx < D1Dx; ++dx)
--               {
--                  gradXY02[dy][dx] = 0.0;
--                  gradXY20[dy][dx] = 0.0;
--               }
--            }
--            for (int qx = 0; qx < Q1D; ++qx)
--            {
--               double massY[MAX_D1D][2];
--               for (int dy = 0; dy < D1Dy; ++dy)
--               {
--                  massY[dy][0] = 0.0;
--                  massY[dy][1] = 0.0;
--               }
--               for (int qy = 0; qy < Q1D; ++qy)
--               {
--                  for (int dy = 0; dy < D1Dy; ++dy)
--                  {
--                     const double wy = Bot(dy,qy);
--
--                     massY[dy][0] += wy * mass[qz][qy][qx][2];
--                     massY[dy][1] += wy * mass[qz][qy][qx][0];
--                  }
--               }
--               for (int dx = 0; dx < D1Dx; ++dx)
--               {
--                  const double wx = Bct(dx,qx);
--                  const double wDx = Gct(dx,qx);
--
--                  for (int dy = 0; dy < D1Dy; ++dy)
--                  {
--                     gradXY02[dy][dx] += massY[dy][0] * wDx;
--                     gradXY20[dy][dx] += massY[dy][1] * wx;
--                  }
--               }
--            }
--
--            for (int dz = 0; dz < D1Dz; ++dz)
--            {
--               const double wz = Bct(dz,qz);
--               const double wDz = Gct(dz,qz);
--               for (int dy = 0; dy < D1Dy; ++dy)
--               {
--                  for (int dx = 0; dx < D1Dx; ++dx)
--                  {
--                     // \hat{\nabla}\times\hat{u} is [-(u_1)_{x_2}, 0, (u_1)_{x_0}]
--                     // -(u_1)_{x_2} * (op * curl)_0 + (u_1)_{x_0} * (op * curl)_2
--                     Y(dx + ((dy + (dz * D1Dy)) * D1Dx) + osc,
--                       e) += (-gradXY20[dy][dx] * wDz) + (gradXY02[dy][dx] * wz);
--                  }
--               }
--            }
--         }  // loop qz
--
--         osc += D1Dx * D1Dy * D1Dz;
--      }
--
--      // z component
--      {
--         const int D1Dz = D1D - 1;
--         const int D1Dy = D1D;
--         const int D1Dx = D1D;
--
--         for (int qx = 0; qx < Q1D; ++qx)
--         {
--            double gradYZ01[MAX_D1D][MAX_D1D];
--            double gradYZ10[MAX_D1D][MAX_D1D];
--
--            for (int dy = 0; dy < D1Dy; ++dy)
--            {
--               for (int dz = 0; dz < D1Dz; ++dz)
--               {
--                  gradYZ01[dz][dy] = 0.0;
--                  gradYZ10[dz][dy] = 0.0;
--               }
--            }
--            for (int qy = 0; qy < Q1D; ++qy)
--            {
--               double massZ[MAX_D1D][2];
--               for (int dz = 0; dz < D1Dz; ++dz)
--               {
--                  for (int n = 0; n < 2; ++n)
--                  {
--                     massZ[dz][n] = 0.0;
--                  }
--               }
--               for (int qz = 0; qz < Q1D; ++qz)
--               {
--                  for (int dz = 0; dz < D1Dz; ++dz)
--                  {
--                     const double wz = Bot(dz,qz);
--
--                     massZ[dz][0] += wz * mass[qz][qy][qx][0];
--                     massZ[dz][1] += wz * mass[qz][qy][qx][1];
--                  }
--               }
--               for (int dy = 0; dy < D1Dy; ++dy)
--               {
--                  const double wy = Bct(dy,qy);
--                  const double wDy = Gct(dy,qy);
--
--                  for (int dz = 0; dz < D1Dz; ++dz)
--                  {
--                     gradYZ01[dz][dy] += wy * massZ[dz][1];
--                     gradYZ10[dz][dy] += wDy * massZ[dz][0];
--                  }
--               }
--            }
--
--            for (int dx = 0; dx < D1Dx; ++dx)
--            {
--               const double wx = Bct(dx,qx);
--               const double wDx = Gct(dx,qx);
--
--               for (int dy = 0; dy < D1Dy; ++dy)
--               {
--                  for (int dz = 0; dz < D1Dz; ++dz)
--                  {
--                     // \hat{\nabla}\times\hat{u} is [(u_2)_{x_1}, -(u_2)_{x_0}, 0]
--                     // (u_2)_{x_1} * (op * curl)_0 - (u_2)_{x_0} * (op * curl)_1
--                     Y(dx + ((dy + (dz * D1Dy)) * D1Dx) + osc,
--                       e) += (gradYZ10[dz][dy] * wx) - (gradYZ01[dz][dy] * wDx);
--                  }
--               }
--            }
--         }  // loop qx
--      }
--   }); // end of element loop
--}
--
--void MixedVectorCurlIntegrator::AddMultPA(const Vector &x, Vector &y) const
--{
--   if (testType == mfem::FiniteElement::CURL &&
--       trialType == mfem::FiniteElement::CURL && dim == 3)
--   {
--      const int ndata = coeffDim == 1 ? 1 : 9;
--
--      if (Device::Allows(Backend::DEVICE_MASK))
--      {
--         const int ID = (dofs1D << 4) | quad1D;
--         switch (ID)
--         {
--            case 0x23: return SmemPAHcurlL2Apply3D<2,3>(dofs1D, quad1D, ndata, ne,
--                                                           mapsO->B, mapsC->B,
--                                                           mapsC->G, pa_data, x, y);
--            case 0x34: return SmemPAHcurlL2Apply3D<3,4>(dofs1D, quad1D, ndata, ne,
--                                                           mapsO->B, mapsC->B,
--                                                           mapsC->G, pa_data, x, y);
--            case 0x45: return SmemPAHcurlL2Apply3D<4,5>(dofs1D, quad1D, ndata, ne,
--                                                           mapsO->B, mapsC->B,
--                                                           mapsC->G, pa_data, x, y);
--            case 0x56: return SmemPAHcurlL2Apply3D<5,6>(dofs1D, quad1D, ndata, ne,
--                                                           mapsO->B, mapsC->B,
--                                                           mapsC->G, pa_data, x, y);
--            default: return SmemPAHcurlL2Apply3D(dofs1D, quad1D, ndata, ne,
--                                                    mapsO->B, mapsC->B, mapsC->G,
--                                                    pa_data, x, y);
--         }
--      }
--      else
--         PAHcurlL2Apply3D(dofs1D, quad1D, ndata, ne, mapsO->B, mapsC->B,
--                          mapsO->Bt, mapsC->Bt, mapsC->G, pa_data, x, y);
--   }
--   else if (testType == mfem::FiniteElement::DIV &&
--            trialType == mfem::FiniteElement::CURL && dim == 3)
--      PAHcurlHdivApply3D(dofs1D, dofs1Dtest, quad1D, ne, mapsO->B,
--                         mapsC->B, mapsOtest->Bt, mapsCtest->Bt, mapsC->G,
--                         pa_data, x, y);
--   else
--   {
--      MFEM_ABORT("Unsupported dimension or space!");
--   }
--}
--
--void MixedVectorCurlIntegrator::AddMultTransposePA(const Vector &x,
--                                                   Vector &y) const
--{
--   if (testType == mfem::FiniteElement::DIV &&
--       trialType == mfem::FiniteElement::CURL && dim == 3)
--      PAHcurlHdivApply3DTranspose(dofs1D, dofs1Dtest, quad1D, ne, mapsO->B,
--                                  mapsC->B, mapsOtest->Bt, mapsCtest->Bt,
--                                  mapsC->Gt, pa_data, x, y);
--   else
--   {
--      MFEM_ABORT("Unsupported dimension or space!");
--   }
--}
--
--void MixedVectorWeakCurlIntegrator::AssemblePA(const FiniteElementSpace
--                                               &trial_fes,
--                                               const FiniteElementSpace &test_fes)
--{
--   // Assumes tensor-product elements, with vector test and trial spaces.
--   Mesh *mesh = trial_fes.GetMesh();
--   const FiniteElement *trial_fel = trial_fes.GetFE(0);
--   const FiniteElement *test_fel = test_fes.GetFE(0);
--
--   const VectorTensorFiniteElement *trial_el =
--      dynamic_cast<const VectorTensorFiniteElement*>(trial_fel);
--   MFEM_VERIFY(trial_el != NULL, "Only VectorTensorFiniteElement is supported!");
--
--   const VectorTensorFiniteElement *test_el =
--      dynamic_cast<const VectorTensorFiniteElement*>(test_fel);
--   MFEM_VERIFY(test_el != NULL, "Only VectorTensorFiniteElement is supported!");
--
--   const IntegrationRule *ir
--      = IntRule ? IntRule : &MassIntegrator::GetRule(*trial_el, *trial_el,
--                                                     *mesh->GetElementTransformation(0));
--   const int dims = trial_el->GetDim();
--   MFEM_VERIFY(dims == 3, "");
--
--   const int nq = ir->GetNPoints();
--   dim = mesh->Dimension();
--   MFEM_VERIFY(dim == 3, "");
--
--   MFEM_VERIFY(trial_el->GetOrder() == test_el->GetOrder(), "");
--
--   ne = trial_fes.GetNE();
--   geom = mesh->GetGeometricFactors(*ir, GeometricFactors::JACOBIANS);
--   mapsC = &test_el->GetDofToQuad(*ir, DofToQuad::TENSOR);
--   mapsO = &test_el->GetDofToQuadOpen(*ir, DofToQuad::TENSOR);
--   dofs1D = mapsC->ndof;
--   quad1D = mapsC->nqpt;
--
--   MFEM_VERIFY(dofs1D == mapsO->ndof + 1 && quad1D == mapsO->nqpt, "");
--
--   testType = test_el->GetDerivType();
--   trialType = trial_el->GetDerivType();
--
--   const bool curlSpaces = (testType == mfem::FiniteElement::CURL &&
--                            trialType == mfem::FiniteElement::CURL);
--
--   const int symmDims = (dims * (dims + 1)) / 2; // 1x1: 1, 2x2: 3, 3x3: 6
--
--   coeffDim = DQ ? 3 : 1;
--   const int ndata = curlSpaces ? (DQ ? 9 : 1) : symmDims;
--
--   pa_data.SetSize(ndata * nq * ne, Device::GetMemoryType());
--
--   QuadratureSpace qs(*mesh, *ir);
--   CoefficientVector coeff(qs, CoefficientStorage::FULL);
--   if (Q) { coeff.Project(*Q); }
--   else if (DQ) { coeff.Project(*DQ); }
--   else { coeff.SetConstant(1.0); }
--
--   if (trialType == mfem::FiniteElement::CURL && dim == 3)
--   {
--      if (coeffDim == 1)
--      {
--         PAHcurlL2Setup(nq, coeffDim, ne, ir->GetWeights(), coeff, pa_data);
--      }
--      else
--      {
--         PAHcurlHdivSetup3D(quad1D, coeffDim, ne, false, ir->GetWeights(),
--                            geom->J, coeff, pa_data);
--      }
--   }
--   else if (trialType == mfem::FiniteElement::DIV && dim == 3 &&
--            test_el->GetOrder() == trial_el->GetOrder())
--   {
--      PACurlCurlSetup3D(quad1D, coeffDim, ne, ir->GetWeights(), geom->J, coeff,
--                        pa_data);
--   }
--   else
--   {
--      MFEM_ABORT("Unknown kernel.");
--   }
--}
--
--// Apply to x corresponding to DOFs in H(curl) (trial), integrated against curl
--// of H(curl) test functions corresponding to y.
--template<int MAX_D1D = HCURL_MAX_D1D, int MAX_Q1D = HCURL_MAX_Q1D>
--static void PAHcurlL2Apply3DTranspose(const int D1D,
--                                      const int Q1D,
--                                      const int coeffDim,
--                                      const int NE,
--                                      const Array<double> &bo,
--                                      const Array<double> &bc,
--                                      const Array<double> &bot,
--                                      const Array<double> &bct,
--                                      const Array<double> &gct,
--                                      const Vector &pa_data,
--                                      const Vector &x,
--                                      Vector &y)
--{
--   // See PAHcurlL2Apply3D for comments.
--
--   MFEM_VERIFY(D1D <= MAX_D1D, "Error: D1D > MAX_D1D");
--   MFEM_VERIFY(Q1D <= MAX_Q1D, "Error: Q1D > MAX_Q1D");
--
--   constexpr static int VDIM = 3;
--
--   auto Bo = Reshape(bo.Read(), Q1D, D1D-1);
--   auto Bc = Reshape(bc.Read(), Q1D, D1D);
--   auto Bot = Reshape(bot.Read(), D1D-1, Q1D);
--   auto Bct = Reshape(bct.Read(), D1D, Q1D);
--   auto Gct = Reshape(gct.Read(), D1D, Q1D);
--   auto op = Reshape(pa_data.Read(), coeffDim, Q1D, Q1D, Q1D, NE);
--   auto X = Reshape(x.Read(), 3*(D1D-1)*D1D*D1D, NE);
--   auto Y = Reshape(y.ReadWrite(), 3*(D1D-1)*D1D*D1D, NE);
--
--   mfem::forall(NE, [=] MFEM_HOST_DEVICE (int e)
--   {
--      double mass[MAX_Q1D][MAX_Q1D][MAX_Q1D][VDIM];
--
--      for (int qz = 0; qz < Q1D; ++qz)
--      {
--         for (int qy = 0; qy < Q1D; ++qy)
--         {
--            for (int qx = 0; qx < Q1D; ++qx)
--            {
--               for (int c = 0; c < VDIM; ++c)
--               {
--                  mass[qz][qy][qx][c] = 0.0;
--               }
--            }
--         }
--      }
--
--      int osc = 0;
--
--      for (int c = 0; c < VDIM; ++c)  // loop over x, y, z components
--      {
--         const int D1Dz = (c == 2) ? D1D - 1 : D1D;
--         const int D1Dy = (c == 1) ? D1D - 1 : D1D;
--         const int D1Dx = (c == 0) ? D1D - 1 : D1D;
--
--         for (int dz = 0; dz < D1Dz; ++dz)
--         {
--            double massXY[MAX_Q1D][MAX_Q1D];
--            for (int qy = 0; qy < Q1D; ++qy)
--            {
--               for (int qx = 0; qx < Q1D; ++qx)
--               {
--                  massXY[qy][qx] = 0.0;
--               }
--            }
--
--            for (int dy = 0; dy < D1Dy; ++dy)
--            {
--               double massX[MAX_Q1D];
--               for (int qx = 0; qx < Q1D; ++qx)
--               {
--                  massX[qx] = 0.0;
--               }
--
--               for (int dx = 0; dx < D1Dx; ++dx)
--               {
--                  const double t = X(dx + ((dy + (dz * D1Dy)) * D1Dx) + osc, e);
--                  for (int qx = 0; qx < Q1D; ++qx)
--                  {
--                     massX[qx] += t * ((c == 0) ? Bo(qx,dx) : Bc(qx,dx));
--                  }
--               }
--
--               for (int qy = 0; qy < Q1D; ++qy)
--               {
--                  const double wy = (c == 1) ? Bo(qy,dy) : Bc(qy,dy);
--                  for (int qx = 0; qx < Q1D; ++qx)
--                  {
--                     const double wx = massX[qx];
--                     massXY[qy][qx] += wx * wy;
--                  }
--               }
--            }
--
--            for (int qz = 0; qz < Q1D; ++qz)
--            {
--               const double wz = (c == 2) ? Bo(qz,dz) : Bc(qz,dz);
--               for (int qy = 0; qy < Q1D; ++qy)
--               {
--                  for (int qx = 0; qx < Q1D; ++qx)
--                  {
--                     mass[qz][qy][qx][c] += massXY[qy][qx] * wz;
--                  }
--               }
--            }
--         }
--
--         osc += D1Dx * D1Dy * D1Dz;
--      }  // loop (c) over components
--
--      // Apply D operator.
--      for (int qz = 0; qz < Q1D; ++qz)
--      {
--         for (int qy = 0; qy < Q1D; ++qy)
--         {
--            for (int qx = 0; qx < Q1D; ++qx)
--            {
--               const double O11 = op(0,qx,qy,qz,e);
--               if (coeffDim == 1)
--               {
--                  for (int c = 0; c < VDIM; ++c)
--                  {
--                     mass[qz][qy][qx][c] *= O11;
--                  }
--               }
--               else
--               {
--                  const double O12 = op(1,qx,qy,qz,e);
--                  const double O13 = op(2,qx,qy,qz,e);
--                  const double O21 = op(3,qx,qy,qz,e);
--                  const double O22 = op(4,qx,qy,qz,e);
--                  const double O23 = op(5,qx,qy,qz,e);
--                  const double O31 = op(6,qx,qy,qz,e);
--                  const double O32 = op(7,qx,qy,qz,e);
--                  const double O33 = op(8,qx,qy,qz,e);
--                  const double massX = mass[qz][qy][qx][0];
--                  const double massY = mass[qz][qy][qx][1];
--                  const double massZ = mass[qz][qy][qx][2];
--                  mass[qz][qy][qx][0] = (O11*massX)+(O12*massY)+(O13*massZ);
--                  mass[qz][qy][qx][1] = (O21*massX)+(O22*massY)+(O23*massZ);
--                  mass[qz][qy][qx][2] = (O31*massX)+(O32*massY)+(O33*massZ);
--               }
--            }
--         }
--      }
--
--      // x component
--      osc = 0;
--      {
--         const int D1Dz = D1D;
--         const int D1Dy = D1D;
--         const int D1Dx = D1D - 1;
--
--         for (int qz = 0; qz < Q1D; ++qz)
--         {
--            double gradXY12[MAX_D1D][MAX_D1D];
--            double gradXY21[MAX_D1D][MAX_D1D];
--
--            for (int dy = 0; dy < D1Dy; ++dy)
--            {
--               for (int dx = 0; dx < D1Dx; ++dx)
--               {
--                  gradXY12[dy][dx] = 0.0;
--                  gradXY21[dy][dx] = 0.0;
--               }
--            }
--            for (int qy = 0; qy < Q1D; ++qy)
--            {
--               double massX[MAX_D1D][2];
--               for (int dx = 0; dx < D1Dx; ++dx)
--               {
--                  for (int n = 0; n < 2; ++n)
--                  {
--                     massX[dx][n] = 0.0;
--                  }
--               }
--               for (int qx = 0; qx < Q1D; ++qx)
--               {
--                  for (int dx = 0; dx < D1Dx; ++dx)
--                  {
--                     const double wx = Bot(dx,qx);
--
--                     massX[dx][0] += wx * mass[qz][qy][qx][1];
--                     massX[dx][1] += wx * mass[qz][qy][qx][2];
--                  }
--               }
--               for (int dy = 0; dy < D1Dy; ++dy)
--               {
--                  const double wy = Bct(dy,qy);
--                  const double wDy = Gct(dy,qy);
--
--                  for (int dx = 0; dx < D1Dx; ++dx)
--                  {
--                     gradXY21[dy][dx] += massX[dx][0] * wy;
--                     gradXY12[dy][dx] += massX[dx][1] * wDy;
--                  }
--               }
--            }
--
--            for (int dz = 0; dz < D1Dz; ++dz)
--            {
--               const double wz = Bct(dz,qz);
--               const double wDz = Gct(dz,qz);
--               for (int dy = 0; dy < D1Dy; ++dy)
--               {
--                  for (int dx = 0; dx < D1Dx; ++dx)
--                  {
--                     // \hat{\nabla}\times\hat{u} is [0, (u_0)_{x_2}, -(u_0)_{x_1}]
--                     // (u_0)_{x_2} * (op * curl)_1 - (u_0)_{x_1} * (op * curl)_2
--                     Y(dx + ((dy + (dz * D1Dy)) * D1Dx) + osc,
--                       e) += (gradXY21[dy][dx] * wDz) - (gradXY12[dy][dx] * wz);
--                  }
--               }
--            }
--         }  // loop qz
--
--         osc += D1Dx * D1Dy * D1Dz;
--      }
--
--      // y component
--      {
--         const int D1Dz = D1D;
--         const int D1Dy = D1D - 1;
--         const int D1Dx = D1D;
--
--         for (int qz = 0; qz < Q1D; ++qz)
--         {
--            double gradXY02[MAX_D1D][MAX_D1D];
--            double gradXY20[MAX_D1D][MAX_D1D];
--
--            for (int dy = 0; dy < D1Dy; ++dy)
--            {
--               for (int dx = 0; dx < D1Dx; ++dx)
--               {
--                  gradXY02[dy][dx] = 0.0;
--                  gradXY20[dy][dx] = 0.0;
--               }
--            }
--            for (int qx = 0; qx < Q1D; ++qx)
--            {
--               double massY[MAX_D1D][2];
--               for (int dy = 0; dy < D1Dy; ++dy)
--               {
--                  massY[dy][0] = 0.0;
--                  massY[dy][1] = 0.0;
--               }
--               for (int qy = 0; qy < Q1D; ++qy)
--               {
--                  for (int dy = 0; dy < D1Dy; ++dy)
--                  {
--                     const double wy = Bot(dy,qy);
--
--                     massY[dy][0] += wy * mass[qz][qy][qx][2];
--                     massY[dy][1] += wy * mass[qz][qy][qx][0];
--                  }
--               }
--               for (int dx = 0; dx < D1Dx; ++dx)
--               {
--                  const double wx = Bct(dx,qx);
--                  const double wDx = Gct(dx,qx);
--
--                  for (int dy = 0; dy < D1Dy; ++dy)
--                  {
--                     gradXY02[dy][dx] += massY[dy][0] * wDx;
--                     gradXY20[dy][dx] += massY[dy][1] * wx;
--                  }
--               }
--            }
--
--            for (int dz = 0; dz < D1Dz; ++dz)
--            {
--               const double wz = Bct(dz,qz);
--               const double wDz = Gct(dz,qz);
--               for (int dy = 0; dy < D1Dy; ++dy)
--               {
--                  for (int dx = 0; dx < D1Dx; ++dx)
--                  {
--                     // \hat{\nabla}\times\hat{u} is [-(u_1)_{x_2}, 0, (u_1)_{x_0}]
--                     // -(u_1)_{x_2} * (op * curl)_0 + (u_1)_{x_0} * (op * curl)_2
--                     Y(dx + ((dy + (dz * D1Dy)) * D1Dx) + osc,
--                       e) += (-gradXY20[dy][dx] * wDz) + (gradXY02[dy][dx] * wz);
--                  }
--               }
--            }
--         }  // loop qz
--
--         osc += D1Dx * D1Dy * D1Dz;
--      }
--
--      // z component
--      {
--         const int D1Dz = D1D - 1;
--         const int D1Dy = D1D;
--         const int D1Dx = D1D;
--
--         for (int qx = 0; qx < Q1D; ++qx)
--         {
--            double gradYZ01[MAX_D1D][MAX_D1D];
--            double gradYZ10[MAX_D1D][MAX_D1D];
--
--            for (int dy = 0; dy < D1Dy; ++dy)
--            {
--               for (int dz = 0; dz < D1Dz; ++dz)
--               {
--                  gradYZ01[dz][dy] = 0.0;
--                  gradYZ10[dz][dy] = 0.0;
--               }
--            }
--            for (int qy = 0; qy < Q1D; ++qy)
--            {
--               double massZ[MAX_D1D][2];
--               for (int dz = 0; dz < D1Dz; ++dz)
--               {
--                  for (int n = 0; n < 2; ++n)
--                  {
--                     massZ[dz][n] = 0.0;
--                  }
--               }
--               for (int qz = 0; qz < Q1D; ++qz)
--               {
--                  for (int dz = 0; dz < D1Dz; ++dz)
--                  {
--                     const double wz = Bot(dz,qz);
--
--                     massZ[dz][0] += wz * mass[qz][qy][qx][0];
--                     massZ[dz][1] += wz * mass[qz][qy][qx][1];
--                  }
--               }
--               for (int dy = 0; dy < D1Dy; ++dy)
--               {
--                  const double wy = Bct(dy,qy);
--                  const double wDy = Gct(dy,qy);
--
--                  for (int dz = 0; dz < D1Dz; ++dz)
--                  {
--                     gradYZ01[dz][dy] += wy * massZ[dz][1];
--                     gradYZ10[dz][dy] += wDy * massZ[dz][0];
--                  }
--               }
--            }
--
--            for (int dx = 0; dx < D1Dx; ++dx)
--            {
--               const double wx = Bct(dx,qx);
--               const double wDx = Gct(dx,qx);
--
--               for (int dy = 0; dy < D1Dy; ++dy)
--               {
--                  for (int dz = 0; dz < D1Dz; ++dz)
--                  {
--                     // \hat{\nabla}\times\hat{u} is [(u_2)_{x_1}, -(u_2)_{x_0}, 0]
--                     // (u_2)_{x_1} * (op * curl)_0 - (u_2)_{x_0} * (op * curl)_1
--                     Y(dx + ((dy + (dz * D1Dy)) * D1Dx) + osc,
--                       e) += (gradYZ10[dz][dy] * wx) - (gradYZ01[dz][dy] * wDx);
--                  }
--               }
--            }
--         }  // loop qx
--      }
--   });
--}
--
--template<int MAX_D1D = HCURL_MAX_D1D, int MAX_Q1D = HCURL_MAX_Q1D>
--static void SmemPAHcurlL2Apply3DTranspose(const int D1D,
--                                          const int Q1D,
--                                          const int coeffDim,
--                                          const int NE,
--                                          const Array<double> &bo,
--                                          const Array<double> &bc,
--                                          const Array<double> &gc,
--                                          const Vector &pa_data,
--                                          const Vector &x,
--                                          Vector &y)
--{
--   MFEM_VERIFY(D1D <= MAX_D1D, "Error: D1D > MAX_D1D");
--   MFEM_VERIFY(Q1D <= MAX_Q1D, "Error: Q1D > MAX_Q1D");
--
--   auto Bo = Reshape(bo.Read(), Q1D, D1D-1);
--   auto Bc = Reshape(bc.Read(), Q1D, D1D);
--   auto Gc = Reshape(gc.Read(), Q1D, D1D);
--   auto op = Reshape(pa_data.Read(), coeffDim, Q1D, Q1D, Q1D, NE);
--   auto X = Reshape(x.Read(), 3*(D1D-1)*D1D*D1D, NE);
--   auto Y = Reshape(y.ReadWrite(), 3*(D1D-1)*D1D*D1D, NE);
--
--   auto device_kernel = [=] MFEM_DEVICE (int e)
--   {
--      constexpr int VDIM = 3;
--      constexpr int maxCoeffDim = 9;
--
--      MFEM_SHARED double sBo[MAX_D1D][MAX_Q1D];
--      MFEM_SHARED double sBc[MAX_D1D][MAX_Q1D];
--      MFEM_SHARED double sGc[MAX_D1D][MAX_Q1D];
--
--      double opc[maxCoeffDim];
--      MFEM_SHARED double sop[maxCoeffDim][MAX_Q1D][MAX_Q1D];
--      MFEM_SHARED double mass[MAX_Q1D][MAX_Q1D][3];
--
--      MFEM_SHARED double sX[MAX_D1D][MAX_D1D][MAX_D1D];
--
--      MFEM_FOREACH_THREAD(qx,x,Q1D)
--      {
--         MFEM_FOREACH_THREAD(qy,y,Q1D)
--         {
--            MFEM_FOREACH_THREAD(qz,z,Q1D)
--            {
--               for (int i=0; i<coeffDim; ++i)
--               {
--                  opc[i] = op(i,qx,qy,qz,e);
--               }
--            }
--         }
--      }
--
--      const int tidx = MFEM_THREAD_ID(x);
--      const int tidy = MFEM_THREAD_ID(y);
--      const int tidz = MFEM_THREAD_ID(z);
--
--      if (tidz == 0)
--      {
--         MFEM_FOREACH_THREAD(d,y,D1D)
--         {
--            MFEM_FOREACH_THREAD(q,x,Q1D)
--            {
--               sBc[d][q] = Bc(q,d);
--               sGc[d][q] = Gc(q,d);
--               if (d < D1D-1)
--               {
--                  sBo[d][q] = Bo(q,d);
--               }
--            }
--         }
--      }
--      MFEM_SYNC_THREAD;
--
--      for (int qz=0; qz < Q1D; ++qz)
--      {
--         if (tidz == qz)
--         {
--            MFEM_FOREACH_THREAD(qy,y,Q1D)
--            {
--               MFEM_FOREACH_THREAD(qx,x,Q1D)
--               {
--                  for (int i=0; i<3; ++i)
--                  {
--                     mass[qy][qx][i] = 0.0;
--                  }
--               }
--            }
--         }
--
--         int osc = 0;
--         for (int c = 0; c < VDIM; ++c)  // loop over x, y, z components
--         {
--            const int D1Dz = (c == 2) ? D1D - 1 : D1D;
--            const int D1Dy = (c == 1) ? D1D - 1 : D1D;
--            const int D1Dx = (c == 0) ? D1D - 1 : D1D;
--
--            MFEM_FOREACH_THREAD(dz,z,D1Dz)
--            {
--               MFEM_FOREACH_THREAD(dy,y,D1Dy)
--               {
--                  MFEM_FOREACH_THREAD(dx,x,D1Dx)
--                  {
--                     sX[dz][dy][dx] = X(dx + ((dy + (dz * D1Dy)) * D1Dx) + osc, e);
--                  }
--               }
--            }
--            MFEM_SYNC_THREAD;
--
--            if (tidz == qz)
--            {
--               if (c == 0)
--               {
--                  for (int i=0; i<coeffDim; ++i)
--                  {
--                     sop[i][tidx][tidy] = opc[i];
--                  }
--               }
--
--               MFEM_FOREACH_THREAD(qy,y,Q1D)
--               {
--                  MFEM_FOREACH_THREAD(qx,x,Q1D)
--                  {
--                     double u = 0.0;
--
--                     for (int dz = 0; dz < D1Dz; ++dz)
--                     {
--                        const double wz = (c == 2) ? sBo[dz][qz] : sBc[dz][qz];
--
--                        for (int dy = 0; dy < D1Dy; ++dy)
--                        {
--                           const double wy = (c == 1) ? sBo[dy][qy] : sBc[dy][qy];
--
--                           for (int dx = 0; dx < D1Dx; ++dx)
--                           {
--                              const double wx = sX[dz][dy][dx] * ((c == 0) ? sBo[dx][qx] : sBc[dx][qx]);
--                              u += wx * wy * wz;
--                           }
--                        }
--                     }
--
--                     mass[qy][qx][c] += u;
--                  } // qx
--               } // qy
--            } // tidz == qz
--
--            osc += D1Dx * D1Dy * D1Dz;
--            MFEM_SYNC_THREAD;
--         } // c
--
--         double dxyz1 = 0.0;
--         double dxyz2 = 0.0;
--         double dxyz3 = 0.0;
--
--         MFEM_FOREACH_THREAD(dz,z,D1D)
--         {
--            const double wcz = sBc[dz][qz];
--            const double wcDz = sGc[dz][qz];
--            const double wz = (dz < D1D-1) ? sBo[dz][qz] : 0.0;
--
--            MFEM_FOREACH_THREAD(dy,y,D1D)
--            {
--               MFEM_FOREACH_THREAD(dx,x,D1D)
--               {
--                  for (int qy = 0; qy < Q1D; ++qy)
--                  {
--                     const double wcy = sBc[dy][qy];
--                     const double wcDy = sGc[dy][qy];
--                     const double wy = (dy < D1D-1) ? sBo[dy][qy] : 0.0;
--
--                     for (int qx = 0; qx < Q1D; ++qx)
--                     {
--                        const double O11 = sop[0][qx][qy];
--                        double c1, c2, c3;
--                        if (coeffDim == 1)
--                        {
--                           c1 = O11 * mass[qy][qx][0];
--                           c2 = O11 * mass[qy][qx][1];
--                           c3 = O11 * mass[qy][qx][2];
--                        }
--                        else
--                        {
--                           const double O12 = sop[1][qx][qy];
--                           const double O13 = sop[2][qx][qy];
--                           const double O21 = sop[3][qx][qy];
--                           const double O22 = sop[4][qx][qy];
--                           const double O23 = sop[5][qx][qy];
--                           const double O31 = sop[6][qx][qy];
--                           const double O32 = sop[7][qx][qy];
--                           const double O33 = sop[8][qx][qy];
--
--                           c1 = (O11*mass[qy][qx][0])+(O12*mass[qy][qx][1])+(O13*mass[qy][qx][2]);
--                           c2 = (O21*mass[qy][qx][0])+(O22*mass[qy][qx][1])+(O23*mass[qy][qx][2]);
--                           c3 = (O31*mass[qy][qx][0])+(O32*mass[qy][qx][1])+(O33*mass[qy][qx][2]);
--                        }
--
--                        const double wcx = sBc[dx][qx];
--                        const double wDx = sGc[dx][qx];
--
--                        if (dx < D1D-1)
--                        {
--                           const double wx = sBo[dx][qx];
--                           dxyz1 += (wx * c2 * wcy * wcDz) - (wx * c3 * wcDy * wcz);
--                        }
--
--                        dxyz2 += (-wy * c1 * wcx * wcDz) + (wy * c3 * wDx * wcz);
--
--                        dxyz3 += (wcDy * wz * c1 * wcx) - (wcy * wz * c2 * wDx);
--                     } // qx
--                  } // qy
--               } // dx
--            } // dy
--         } // dz
--
--         MFEM_SYNC_THREAD;
--
--         MFEM_FOREACH_THREAD(dz,z,D1D)
--         {
--            MFEM_FOREACH_THREAD(dy,y,D1D)
--            {
--               MFEM_FOREACH_THREAD(dx,x,D1D)
--               {
--                  if (dx < D1D-1)
--                  {
--                     Y(dx + ((dy + (dz * D1D)) * (D1D-1)), e) += dxyz1;
--                  }
--                  if (dy < D1D-1)
--                  {
--                     Y(dx + ((dy + (dz * (D1D-1))) * D1D) + ((D1D-1)*D1D*D1D), e) += dxyz2;
--                  }
--                  if (dz < D1D-1)
--                  {
--                     Y(dx + ((dy + (dz * D1D)) * D1D) + (2*(D1D-1)*D1D*D1D), e) += dxyz3;
--                  }
--               }
--            }
--         }
--      } // qz
--   }; // end of element loop
--
--   auto host_kernel = [&] MFEM_LAMBDA (int)
--   {
--      MFEM_ABORT_KERNEL("This kernel should only be used on GPU.");
--   };
--
--   ForallWrap<3>(true, NE, device_kernel, host_kernel, Q1D, Q1D, Q1D);
--}
--
--void MixedVectorWeakCurlIntegrator::AddMultPA(const Vector &x, Vector &y) const
--{
--   if (testType == mfem::FiniteElement::CURL &&
--       trialType == mfem::FiniteElement::CURL && dim == 3)
--   {
--      const int ndata = coeffDim == 1 ? 1 : 9;
--      if (Device::Allows(Backend::DEVICE_MASK))
--      {
--         const int ID = (dofs1D << 4) | quad1D;
--         switch (ID)
--         {
--            case 0x23: return SmemPAHcurlL2Apply3DTranspose<2,3>(dofs1D, quad1D, ndata,
--                                                                    ne, mapsO->B, mapsC->B,
--                                                                    mapsC->G, pa_data, x, y);
--            case 0x34: return SmemPAHcurlL2Apply3DTranspose<3,4>(dofs1D, quad1D, ndata,
--                                                                    ne, mapsO->B, mapsC->B,
--                                                                    mapsC->G, pa_data, x, y);
--            case 0x45: return SmemPAHcurlL2Apply3DTranspose<4,5>(dofs1D, quad1D, ndata,
--                                                                    ne, mapsO->B, mapsC->B,
--                                                                    mapsC->G, pa_data, x, y);
--            case 0x56: return SmemPAHcurlL2Apply3DTranspose<5,6>(dofs1D, quad1D, ndata,
--                                                                    ne, mapsO->B, mapsC->B,
--                                                                    mapsC->G, pa_data, x, y);
--            default: return SmemPAHcurlL2Apply3DTranspose(dofs1D, quad1D, ndata, ne,
--                                                             mapsO->B, mapsC->B,
--                                                             mapsC->G, pa_data, x, y);
--         }
--      }
--      else
--         PAHcurlL2Apply3DTranspose(dofs1D, quad1D, ndata, ne, mapsO->B,
--                                   mapsC->B, mapsO->Bt, mapsC->Bt, mapsC->Gt, pa_data, x, y);
--   }
--   else if (testType == mfem::FiniteElement::CURL &&
--            trialType == mfem::FiniteElement::DIV && dim == 3)
--   {
--      PAHcurlHdivApply3DTranspose(dofs1D, dofs1D, quad1D, ne, mapsO->B,
--                                  mapsC->B, mapsO->Bt, mapsC->Bt,
--                                  mapsC->Gt, pa_data, x, y);
--   }
--   else
--   {
--      MFEM_ABORT("Unsupported dimension or space!");
--   }
--}
--
--void MixedVectorWeakCurlIntegrator::AddMultTransposePA(const Vector &x,
--                                                       Vector &y) const
--{
--   if (testType == mfem::FiniteElement::CURL &&
--       trialType == mfem::FiniteElement::DIV && dim == 3)
--   {
--      PAHcurlHdivApply3D(dofs1D, dofs1D, quad1D, ne, mapsO->B,
--                         mapsC->B, mapsO->Bt, mapsC->Bt, mapsC->G,
--                         pa_data, x, y);
--   }
--   else
--   {
--      MFEM_ABORT("Unsupported dimension or space!");
--   }
--}
--
--// Apply to x corresponding to DOFs in H^1 (domain) the (topological) gradient
--// to get a dof in H(curl) (range). You can think of the range as the "test" space
--// and the domain as the "trial" space, but there's no integration.
--static void PAHcurlApplyGradient2D(const int c_dofs1D,
--                                   const int o_dofs1D,
--                                   const int NE,
--                                   const Array<double> &B_,
--                                   const Array<double> &G_,
--                                   const Vector &x_,
--                                   Vector &y_)
--{
--   auto B = Reshape(B_.Read(), c_dofs1D, c_dofs1D);
--   auto G = Reshape(G_.Read(), o_dofs1D, c_dofs1D);
--
--   auto x = Reshape(x_.Read(), c_dofs1D, c_dofs1D, NE);
--   auto y = Reshape(y_.ReadWrite(), 2 * c_dofs1D * o_dofs1D, NE);
--
--   constexpr static int MAX_D1D = HCURL_MAX_D1D;
--   MFEM_VERIFY(c_dofs1D <= MAX_D1D && o_dofs1D <= c_dofs1D, "");
--
--   mfem::forall(NE, [=] MFEM_HOST_DEVICE (int e)
--   {
--      double w[MAX_D1D][MAX_D1D];
--
--      // horizontal part
--      for (int dx = 0; dx < c_dofs1D; ++dx)
--      {
--         for (int ey = 0; ey < c_dofs1D; ++ey)
--         {
--            w[dx][ey] = 0.0;
--            for (int dy = 0; dy < c_dofs1D; ++dy)
--            {
--               w[dx][ey] += B(ey, dy) * x(dx, dy, e);
--            }
--         }
--      }
--
--      for (int ey = 0; ey < c_dofs1D; ++ey)
--      {
--         for (int ex = 0; ex < o_dofs1D; ++ex)
--         {
--            double s = 0.0;
--            for (int dx = 0; dx < c_dofs1D; ++dx)
--            {
--               s += G(ex, dx) * w[dx][ey];
--            }
--            const int local_index = ey*o_dofs1D + ex;
--            y(local_index, e) += s;
--         }
--      }
--
--      // vertical part
--      for (int dx = 0; dx < c_dofs1D; ++dx)
--      {
--         for (int ey = 0; ey < o_dofs1D; ++ey)
--         {
--            w[dx][ey] = 0.0;
--            for (int dy = 0; dy < c_dofs1D; ++dy)
--            {
--               w[dx][ey] += G(ey, dy) * x(dx, dy, e);
--            }
--         }
--      }
--
--      for (int ey = 0; ey < o_dofs1D; ++ey)
--      {
--         for (int ex = 0; ex < c_dofs1D; ++ex)
--         {
--            double s = 0.0;
--            for (int dx = 0; dx < c_dofs1D; ++dx)
--            {
--               s += B(ex, dx) * w[dx][ey];
--            }
--            const int local_index = c_dofs1D * o_dofs1D + ey*c_dofs1D + ex;
--            y(local_index, e) += s;
--         }
--      }
--   });
--}
--
--// Specialization of PAHcurlApplyGradient2D to the case where B is identity
--static void PAHcurlApplyGradient2DBId(const int c_dofs1D,
--                                      const int o_dofs1D,
--                                      const int NE,
--                                      const Array<double> &G_,
--                                      const Vector &x_,
--                                      Vector &y_)
--{
--   auto G = Reshape(G_.Read(), o_dofs1D, c_dofs1D);
--
--   auto x = Reshape(x_.Read(), c_dofs1D, c_dofs1D, NE);
--   auto y = Reshape(y_.ReadWrite(), 2 * c_dofs1D * o_dofs1D, NE);
--
--   constexpr static int MAX_D1D = HCURL_MAX_D1D;
--   MFEM_VERIFY(c_dofs1D <= MAX_D1D && o_dofs1D <= c_dofs1D, "");
--
--   mfem::forall(NE, [=] MFEM_HOST_DEVICE (int e)
--   {
--      double w[MAX_D1D][MAX_D1D];
--
--      // horizontal part
--      for (int dx = 0; dx < c_dofs1D; ++dx)
--      {
--         for (int ey = 0; ey < c_dofs1D; ++ey)
--         {
--            const int dy = ey;
--            w[dx][ey] = x(dx, dy, e);
--         }
--      }
--
--      for (int ey = 0; ey < c_dofs1D; ++ey)
--      {
--         for (int ex = 0; ex < o_dofs1D; ++ex)
--         {
--            double s = 0.0;
--            for (int dx = 0; dx < c_dofs1D; ++dx)
--            {
--               s += G(ex, dx) * w[dx][ey];
--            }
--            const int local_index = ey*o_dofs1D + ex;
--            y(local_index, e) += s;
--         }
--      }
--
--      // vertical part
--      for (int dx = 0; dx < c_dofs1D; ++dx)
--      {
--         for (int ey = 0; ey < o_dofs1D; ++ey)
--         {
--            w[dx][ey] = 0.0;
--            for (int dy = 0; dy < c_dofs1D; ++dy)
--            {
--               w[dx][ey] += G(ey, dy) * x(dx, dy, e);
--            }
--         }
--      }
--
--      for (int ey = 0; ey < o_dofs1D; ++ey)
--      {
--         for (int ex = 0; ex < c_dofs1D; ++ex)
--         {
--            const int dx = ex;
--            const double s = w[dx][ey];
--            const int local_index = c_dofs1D * o_dofs1D + ey*c_dofs1D + ex;
--            y(local_index, e) += s;
--         }
--      }
--   });
--}
--
--static void PAHcurlApplyGradientTranspose2D(
--   const int c_dofs1D, const int o_dofs1D, const int NE,
--   const Array<double> &B_, const Array<double> &G_,
--   const Vector &x_, Vector &y_)
--{
--   auto B = Reshape(B_.Read(), c_dofs1D, c_dofs1D);
--   auto G = Reshape(G_.Read(), o_dofs1D, c_dofs1D);
--
--   auto x = Reshape(x_.Read(), 2 * c_dofs1D * o_dofs1D, NE);
--   auto y = Reshape(y_.ReadWrite(), c_dofs1D, c_dofs1D, NE);
--
--   constexpr static int MAX_D1D = HCURL_MAX_D1D;
--   MFEM_VERIFY(c_dofs1D <= MAX_D1D && o_dofs1D <= c_dofs1D, "");
--
--   mfem::forall(NE, [=] MFEM_HOST_DEVICE (int e)
--   {
--      double w[MAX_D1D][MAX_D1D];
--
--      // horizontal part (open x, closed y)
--      for (int dy = 0; dy < c_dofs1D; ++dy)
--      {
--         for (int ex = 0; ex < o_dofs1D; ++ex)
--         {
--            w[dy][ex] = 0.0;
--            for (int ey = 0; ey < c_dofs1D; ++ey)
--            {
--               const int local_index = ey*o_dofs1D + ex;
--               w[dy][ex] += B(ey, dy) * x(local_index, e);
--            }
--         }
--      }
--
--      for (int dy = 0; dy < c_dofs1D; ++dy)
--      {
--         for (int dx = 0; dx < c_dofs1D; ++dx)
--         {
--            double s = 0.0;
--            for (int ex = 0; ex < o_dofs1D; ++ex)
--            {
--               s += G(ex, dx) * w[dy][ex];
--            }
--            y(dx, dy, e) += s;
--         }
--      }
--
--      // vertical part (open y, closed x)
--      for (int dy = 0; dy < c_dofs1D; ++dy)
--      {
--         for (int ex = 0; ex < c_dofs1D; ++ex)
--         {
--            w[dy][ex] = 0.0;
--            for (int ey = 0; ey < o_dofs1D; ++ey)
--            {
--               const int local_index = c_dofs1D * o_dofs1D + ey*c_dofs1D + ex;
--               w[dy][ex] += G(ey, dy) * x(local_index, e);
--            }
--         }
--      }
--
--      for (int dy = 0; dy < c_dofs1D; ++dy)
--      {
--         for (int dx = 0; dx < c_dofs1D; ++dx)
--         {
--            double s = 0.0;
--            for (int ex = 0; ex < c_dofs1D; ++ex)
--            {
--               s += B(ex, dx) * w[dy][ex];
--            }
--            y(dx, dy, e) += s;
--         }
--      }
--   });
--}
--
--// Specialization of PAHcurlApplyGradientTranspose2D to the case where
--// B is identity
--static void PAHcurlApplyGradientTranspose2DBId(
--   const int c_dofs1D, const int o_dofs1D, const int NE,
--   const Array<double> &G_,
--   const Vector &x_, Vector &y_)
--{
--   auto G = Reshape(G_.Read(), o_dofs1D, c_dofs1D);
--
--   auto x = Reshape(x_.Read(), 2 * c_dofs1D * o_dofs1D, NE);
--   auto y = Reshape(y_.ReadWrite(), c_dofs1D, c_dofs1D, NE);
--
--   constexpr static int MAX_D1D = HCURL_MAX_D1D;
--   MFEM_VERIFY(c_dofs1D <= MAX_D1D && o_dofs1D <= c_dofs1D, "");
--
--   mfem::forall(NE, [=] MFEM_HOST_DEVICE (int e)
--   {
--      double w[MAX_D1D][MAX_D1D];
--
--      // horizontal part (open x, closed y)
--      for (int dy = 0; dy < c_dofs1D; ++dy)
--      {
--         for (int ex = 0; ex < o_dofs1D; ++ex)
--         {
--            const int ey = dy;
--            const int local_index = ey*o_dofs1D + ex;
--            w[dy][ex] = x(local_index, e);
--         }
--      }
--
--      for (int dy = 0; dy < c_dofs1D; ++dy)
--      {
--         for (int dx = 0; dx < c_dofs1D; ++dx)
--         {
--            double s = 0.0;
--            for (int ex = 0; ex < o_dofs1D; ++ex)
--            {
--               s += G(ex, dx) * w[dy][ex];
--            }
--            y(dx, dy, e) += s;
--         }
--      }
--
--      // vertical part (open y, closed x)
--      for (int dy = 0; dy < c_dofs1D; ++dy)
--      {
--         for (int ex = 0; ex < c_dofs1D; ++ex)
--         {
--            w[dy][ex] = 0.0;
--            for (int ey = 0; ey < o_dofs1D; ++ey)
--            {
--               const int local_index = c_dofs1D * o_dofs1D + ey*c_dofs1D + ex;
--               w[dy][ex] += G(ey, dy) * x(local_index, e);
--            }
--         }
--      }
--
--      for (int dy = 0; dy < c_dofs1D; ++dy)
--      {
--         for (int dx = 0; dx < c_dofs1D; ++dx)
--         {
--            const int ex = dx;
--            const double s = w[dy][ex];
--            y(dx, dy, e) += s;
--         }
--      }
--   });
--}
--
--static void PAHcurlApplyGradient3D(const int c_dofs1D,
--                                   const int o_dofs1D,
--                                   const int NE,
--                                   const Array<double> &B_,
--                                   const Array<double> &G_,
--                                   const Vector &x_,
--                                   Vector &y_)
--{
--   auto B = Reshape(B_.Read(), c_dofs1D, c_dofs1D);
--   auto G = Reshape(G_.Read(), o_dofs1D, c_dofs1D);
--
--   auto x = Reshape(x_.Read(), c_dofs1D, c_dofs1D, c_dofs1D, NE);
--   auto y = Reshape(y_.ReadWrite(), (3 * c_dofs1D * c_dofs1D * o_dofs1D), NE);
--
--   constexpr static int MAX_D1D = HCURL_MAX_D1D;
--   MFEM_VERIFY(c_dofs1D <= MAX_D1D && o_dofs1D <= c_dofs1D, "");
--
--   mfem::forall(NE, [=] MFEM_HOST_DEVICE (int e)
--   {
--      double w1[MAX_D1D][MAX_D1D][MAX_D1D];
--      double w2[MAX_D1D][MAX_D1D][MAX_D1D];
--
--      // ---
--      // dofs that point parallel to x-axis (open in x, closed in y, z)
--      // ---
--
--      // contract in z
--      for (int ez = 0; ez < c_dofs1D; ++ez)
--      {
--         for (int dx = 0; dx < c_dofs1D; ++dx)
--         {
--            for (int dy = 0; dy < c_dofs1D; ++dy)
--            {
--               w1[dx][dy][ez] = 0.0;
--               for (int dz = 0; dz < c_dofs1D; ++dz)
--               {
--                  w1[dx][dy][ez] += B(ez, dz) * x(dx, dy, dz, e);
--               }
--            }
--         }
--      }
--
--      // contract in y
--      for (int ez = 0; ez < c_dofs1D; ++ez)
--      {
--         for (int ey = 0; ey < c_dofs1D; ++ey)
--         {
--            for (int dx = 0; dx < c_dofs1D; ++dx)
--            {
--               w2[dx][ey][ez] = 0.0;
--               for (int dy = 0; dy < c_dofs1D; ++dy)
--               {
--                  w2[dx][ey][ez] += B(ey, dy) * w1[dx][dy][ez];
--               }
--            }
--         }
--      }
--
--      // contract in x
--      for (int ez = 0; ez < c_dofs1D; ++ez)
--      {
--         for (int ey = 0; ey < c_dofs1D; ++ey)
--         {
--            for (int ex = 0; ex < o_dofs1D; ++ex)
--            {
--               double s = 0.0;
--               for (int dx = 0; dx < c_dofs1D; ++dx)
--               {
--                  s += G(ex, dx) * w2[dx][ey][ez];
--               }
--               const int local_index = ez*c_dofs1D*o_dofs1D + ey*o_dofs1D + ex;
--               y(local_index, e) += s;
--            }
--         }
--      }
--
--      // ---
--      // dofs that point parallel to y-axis (open in y, closed in x, z)
--      // ---
--
--      // contract in z
--      for (int ez = 0; ez < c_dofs1D; ++ez)
--      {
--         for (int dx = 0; dx < c_dofs1D; ++dx)
--         {
--            for (int dy = 0; dy < c_dofs1D; ++dy)
--            {
--               w1[dx][dy][ez] = 0.0;
--               for (int dz = 0; dz < c_dofs1D; ++dz)
--               {
--                  w1[dx][dy][ez] += B(ez, dz) * x(dx, dy, dz, e);
--               }
--            }
--         }
--      }
--
--      // contract in y
--      for (int ez = 0; ez < c_dofs1D; ++ez)
--      {
--         for (int ey = 0; ey < o_dofs1D; ++ey)
--         {
--            for (int dx = 0; dx < c_dofs1D; ++dx)
--            {
--               w2[dx][ey][ez] = 0.0;
--               for (int dy = 0; dy < c_dofs1D; ++dy)
--               {
--                  w2[dx][ey][ez] += G(ey, dy) * w1[dx][dy][ez];
--               }
--            }
--         }
--      }
--
--      // contract in x
--      for (int ez = 0; ez < c_dofs1D; ++ez)
--      {
--         for (int ey = 0; ey < o_dofs1D; ++ey)
--         {
--            for (int ex = 0; ex < c_dofs1D; ++ex)
--            {
--               double s = 0.0;
--               for (int dx = 0; dx < c_dofs1D; ++dx)
--               {
--                  s += B(ex, dx) * w2[dx][ey][ez];
--               }
--               const int local_index = c_dofs1D*c_dofs1D*o_dofs1D +
--                                       ez*c_dofs1D*o_dofs1D + ey*c_dofs1D + ex;
--               y(local_index, e) += s;
--            }
--         }
--      }
--
--      // ---
--      // dofs that point parallel to z-axis (open in z, closed in x, y)
--      // ---
--
--      // contract in z
--      for (int ez = 0; ez < o_dofs1D; ++ez)
--      {
--         for (int dx = 0; dx < c_dofs1D; ++dx)
--         {
--            for (int dy = 0; dy < c_dofs1D; ++dy)
--            {
--               w1[dx][dy][ez] = 0.0;
--               for (int dz = 0; dz < c_dofs1D; ++dz)
--               {
--                  w1[dx][dy][ez] += G(ez, dz) * x(dx, dy, dz, e);
--               }
--            }
--         }
--      }
--
--      // contract in y
--      for (int ez = 0; ez < o_dofs1D; ++ez)
--      {
--         for (int ey = 0; ey < c_dofs1D; ++ey)
--         {
--            for (int dx = 0; dx < c_dofs1D; ++dx)
--            {
--               w2[dx][ey][ez] = 0.0;
--               for (int dy = 0; dy < c_dofs1D; ++dy)
--               {
--                  w2[dx][ey][ez] += B(ey, dy) * w1[dx][dy][ez];
--               }
--            }
--         }
--      }
--
--      // contract in x
--      for (int ez = 0; ez < o_dofs1D; ++ez)
--      {
--         for (int ey = 0; ey < c_dofs1D; ++ey)
--         {
--            for (int ex = 0; ex < c_dofs1D; ++ex)
--            {
--               double s = 0.0;
--               for (int dx = 0; dx < c_dofs1D; ++dx)
--               {
--                  s += B(ex, dx) * w2[dx][ey][ez];
--               }
--               const int local_index = 2*c_dofs1D*c_dofs1D*o_dofs1D +
--                                       ez*c_dofs1D*c_dofs1D + ey*c_dofs1D + ex;
--               y(local_index, e) += s;
--            }
--         }
--      }
--   });
--}
--
--// Specialization of PAHcurlApplyGradient3D to the case where
--static void PAHcurlApplyGradient3DBId(const int c_dofs1D,
--                                      const int o_dofs1D,
--                                      const int NE,
--                                      const Array<double> &G_,
--                                      const Vector &x_,
--                                      Vector &y_)
--{
--   auto G = Reshape(G_.Read(), o_dofs1D, c_dofs1D);
--
--   auto x = Reshape(x_.Read(), c_dofs1D, c_dofs1D, c_dofs1D, NE);
--   auto y = Reshape(y_.ReadWrite(), (3 * c_dofs1D * c_dofs1D * o_dofs1D), NE);
--
--   constexpr static int MAX_D1D = HCURL_MAX_D1D;
--   MFEM_VERIFY(c_dofs1D <= MAX_D1D && o_dofs1D <= c_dofs1D, "");
--
--   mfem::forall(NE, [=] MFEM_HOST_DEVICE (int e)
--   {
--      double w1[MAX_D1D][MAX_D1D][MAX_D1D];
--      double w2[MAX_D1D][MAX_D1D][MAX_D1D];
--
--      // ---
--      // dofs that point parallel to x-axis (open in x, closed in y, z)
--      // ---
--
--      // contract in z
--      for (int ez = 0; ez < c_dofs1D; ++ez)
--      {
--         for (int dx = 0; dx < c_dofs1D; ++dx)
--         {
--            for (int dy = 0; dy < c_dofs1D; ++dy)
--            {
--               const int dz = ez;
--               w1[dx][dy][ez] = x(dx, dy, dz, e);
--            }
--         }
--      }
--
--      // contract in y
--      for (int ez = 0; ez < c_dofs1D; ++ez)
--      {
--         for (int ey = 0; ey < c_dofs1D; ++ey)
--         {
--            for (int dx = 0; dx < c_dofs1D; ++dx)
--            {
--               const int dy = ey;
--               w2[dx][ey][ez] = w1[dx][dy][ez];
--            }
--         }
--      }
--
--      // contract in x
--      for (int ez = 0; ez < c_dofs1D; ++ez)
--      {
--         for (int ey = 0; ey < c_dofs1D; ++ey)
--         {
--            for (int ex = 0; ex < o_dofs1D; ++ex)
--            {
--               double s = 0.0;
--               for (int dx = 0; dx < c_dofs1D; ++dx)
--               {
--                  s += G(ex, dx) * w2[dx][ey][ez];
--               }
--               const int local_index = ez*c_dofs1D*o_dofs1D + ey*o_dofs1D + ex;
--               y(local_index, e) += s;
--            }
--         }
--      }
--
--      // ---
--      // dofs that point parallel to y-axis (open in y, closed in x, z)
--      // ---
--
--      // contract in z
--      for (int ez = 0; ez < c_dofs1D; ++ez)
--      {
--         for (int dx = 0; dx < c_dofs1D; ++dx)
--         {
--            for (int dy = 0; dy < c_dofs1D; ++dy)
--            {
--               const int dz = ez;
--               w1[dx][dy][ez] = x(dx, dy, dz, e);
--            }
--         }
--      }
--
--      // contract in y
--      for (int ez = 0; ez < c_dofs1D; ++ez)
--      {
--         for (int ey = 0; ey < o_dofs1D; ++ey)
--         {
--            for (int dx = 0; dx < c_dofs1D; ++dx)
--            {
--               w2[dx][ey][ez] = 0.0;
--               for (int dy = 0; dy < c_dofs1D; ++dy)
--               {
--                  w2[dx][ey][ez] += G(ey, dy) * w1[dx][dy][ez];
--               }
--            }
--         }
--      }
--
--      // contract in x
--      for (int ez = 0; ez < c_dofs1D; ++ez)
--      {
--         for (int ey = 0; ey < o_dofs1D; ++ey)
--         {
--            for (int ex = 0; ex < c_dofs1D; ++ex)
--            {
--               const int dx = ex;
--               const double s = w2[dx][ey][ez];
--               const int local_index = c_dofs1D*c_dofs1D*o_dofs1D +
--                                       ez*c_dofs1D*o_dofs1D + ey*c_dofs1D + ex;
--               y(local_index, e) += s;
--            }
--         }
--      }
--
--      // ---
--      // dofs that point parallel to z-axis (open in z, closed in x, y)
--      // ---
--
--      // contract in z
--      for (int ez = 0; ez < o_dofs1D; ++ez)
--      {
--         for (int dx = 0; dx < c_dofs1D; ++dx)
--         {
--            for (int dy = 0; dy < c_dofs1D; ++dy)
--            {
--               w1[dx][dy][ez] = 0.0;
--               for (int dz = 0; dz < c_dofs1D; ++dz)
--               {
--                  w1[dx][dy][ez] += G(ez, dz) * x(dx, dy, dz, e);
--               }
--            }
--         }
--      }
--
--      // contract in y
--      for (int ez = 0; ez < o_dofs1D; ++ez)
--      {
--         for (int ey = 0; ey < c_dofs1D; ++ey)
--         {
--            for (int dx = 0; dx < c_dofs1D; ++dx)
--            {
--               const int dy = ey;
--               w2[dx][ey][ez] = w1[dx][dy][ez];
--            }
--         }
--      }
--
--      // contract in x
--      for (int ez = 0; ez < o_dofs1D; ++ez)
--      {
--         for (int ey = 0; ey < c_dofs1D; ++ey)
--         {
--            for (int ex = 0; ex < c_dofs1D; ++ex)
--            {
--               const int dx = ex;
--               const double s = w2[dx][ey][ez];
--               const int local_index = 2*c_dofs1D*c_dofs1D*o_dofs1D +
--                                       ez*c_dofs1D*c_dofs1D + ey*c_dofs1D + ex;
--               y(local_index, e) += s;
--            }
--         }
--      }
--   });
--}
--
--static void PAHcurlApplyGradientTranspose3D(
--   const int c_dofs1D, const int o_dofs1D, const int NE,
--   const Array<double> &B_, const Array<double> &G_,
--   const Vector &x_, Vector &y_)
--{
--   auto B = Reshape(B_.Read(), c_dofs1D, c_dofs1D);
--   auto G = Reshape(G_.Read(), o_dofs1D, c_dofs1D);
--
--   auto x = Reshape(x_.Read(), (3 * c_dofs1D * c_dofs1D * o_dofs1D), NE);
--   auto y = Reshape(y_.ReadWrite(), c_dofs1D, c_dofs1D, c_dofs1D, NE);
--
--   constexpr static int MAX_D1D = HCURL_MAX_D1D;
--   MFEM_VERIFY(c_dofs1D <= MAX_D1D && o_dofs1D <= c_dofs1D, "");
--
--   mfem::forall(NE, [=] MFEM_HOST_DEVICE (int e)
--   {
--      double w1[MAX_D1D][MAX_D1D][MAX_D1D];
--      double w2[MAX_D1D][MAX_D1D][MAX_D1D];
--      // ---
--      // dofs that point parallel to x-axis (open in x, closed in y, z)
--      // ---
--
--      // contract in z
--      for (int dz = 0; dz < c_dofs1D; ++dz)
--      {
--         for (int ex = 0; ex < o_dofs1D; ++ex)
--         {
--            for (int ey = 0; ey < c_dofs1D; ++ey)
--            {
--               w1[ex][ey][dz] = 0.0;
--               for (int ez = 0; ez < c_dofs1D; ++ez)
--               {
--                  const int local_index = ez*c_dofs1D*o_dofs1D + ey*o_dofs1D + ex;
--                  w1[ex][ey][dz] += B(ez, dz) * x(local_index, e);
--               }
--            }
--         }
--      }
--
--      // contract in y
--      for (int dz = 0; dz < c_dofs1D; ++dz)
--      {
--         for (int dy = 0; dy < c_dofs1D; ++dy)
--         {
--            for (int ex = 0; ex < o_dofs1D; ++ex)
--            {
--               w2[ex][dy][dz] = 0.0;
--               for (int ey = 0; ey < c_dofs1D; ++ey)
--               {
--                  w2[ex][dy][dz] += B(ey, dy) * w1[ex][ey][dz];
--               }
--            }
--         }
--      }
--
--      // contract in x
--      for (int dz = 0; dz < c_dofs1D; ++dz)
--      {
--         for (int dy = 0; dy < c_dofs1D; ++dy)
--         {
--            for (int dx = 0; dx < c_dofs1D; ++dx)
--            {
--               double s = 0.0;
--               for (int ex = 0; ex < o_dofs1D; ++ex)
--               {
--                  s += G(ex, dx) * w2[ex][dy][dz];
--               }
--               y(dx, dy, dz, e) += s;
--            }
--         }
--      }
--
--      // ---
--      // dofs that point parallel to y-axis (open in y, closed in x, z)
--      // ---
--
--      // contract in z
--      for (int dz = 0; dz < c_dofs1D; ++dz)
--      {
--         for (int ex = 0; ex < c_dofs1D; ++ex)
--         {
--            for (int ey = 0; ey < o_dofs1D; ++ey)
--            {
--               w1[ex][ey][dz] = 0.0;
--               for (int ez = 0; ez < c_dofs1D; ++ez)
--               {
--                  const int local_index = c_dofs1D*c_dofs1D*o_dofs1D +
--                                          ez*c_dofs1D*o_dofs1D + ey*c_dofs1D + ex;
--                  w1[ex][ey][dz] += B(ez, dz) * x(local_index, e);
--               }
--            }
--         }
--      }
--
--      // contract in y
--      for (int dz = 0; dz < c_dofs1D; ++dz)
--      {
--         for (int dy = 0; dy < c_dofs1D; ++dy)
--         {
--            for (int ex = 0; ex < c_dofs1D; ++ex)
--            {
--               w2[ex][dy][dz] = 0.0;
--               for (int ey = 0; ey < o_dofs1D; ++ey)
--               {
--                  w2[ex][dy][dz] += G(ey, dy) * w1[ex][ey][dz];
--               }
--            }
--         }
--      }
--
--      // contract in x
--      for (int dz = 0; dz < c_dofs1D; ++dz)
--      {
--         for (int dy = 0; dy < c_dofs1D; ++dy)
--         {
--            for (int dx = 0; dx < c_dofs1D; ++dx)
--            {
--               double s = 0.0;
--               for (int ex = 0; ex < c_dofs1D; ++ex)
--               {
--                  s += B(ex, dx) * w2[ex][dy][dz];
--               }
--               y(dx, dy, dz, e) += s;
--            }
--         }
--      }
--
--      // ---
--      // dofs that point parallel to z-axis (open in z, closed in x, y)
--      // ---
--
--      // contract in z
--      for (int dz = 0; dz < c_dofs1D; ++dz)
--      {
--         for (int ex = 0; ex < c_dofs1D; ++ex)
--         {
--            for (int ey = 0; ey < c_dofs1D; ++ey)
--            {
--               w1[ex][ey][dz] = 0.0;
--               for (int ez = 0; ez < o_dofs1D; ++ez)
--               {
--                  const int local_index = 2*c_dofs1D*c_dofs1D*o_dofs1D +
--                                          ez*c_dofs1D*c_dofs1D + ey*c_dofs1D + ex;
--                  w1[ex][ey][dz] += G(ez, dz) * x(local_index, e);
--               }
--            }
--         }
--      }
--
--      // contract in y
--      for (int dz = 0; dz < c_dofs1D; ++dz)
--      {
--         for (int dy = 0; dy < c_dofs1D; ++dy)
--         {
--            for (int ex = 0; ex < c_dofs1D; ++ex)
--            {
--               w2[ex][dy][dz] = 0.0;
--               for (int ey = 0; ey < c_dofs1D; ++ey)
--               {
--                  w2[ex][dy][dz] += B(ey, dy) * w1[ex][ey][dz];
--               }
--            }
--         }
--      }
--
--      // contract in x
--      for (int dz = 0; dz < c_dofs1D; ++dz)
--      {
--         for (int dy = 0; dy < c_dofs1D; ++dy)
--         {
--            for (int dx = 0; dx < c_dofs1D; ++dx)
--            {
--               double s = 0.0;
--               for (int ex = 0; ex < c_dofs1D; ++ex)
--               {
--                  s += B(ex, dx) * w2[ex][dy][dz];
--               }
--               y(dx, dy, dz, e) += s;
--            }
--         }
--      }
--   });
--}
--
--// Specialization of PAHcurlApplyGradientTranspose3D to the case where
--static void PAHcurlApplyGradientTranspose3DBId(
--   const int c_dofs1D, const int o_dofs1D, const int NE,
--   const Array<double> &G_,
--   const Vector &x_, Vector &y_)
--{
--   auto G = Reshape(G_.Read(), o_dofs1D, c_dofs1D);
--
--   auto x = Reshape(x_.Read(), (3 * c_dofs1D * c_dofs1D * o_dofs1D), NE);
--   auto y = Reshape(y_.ReadWrite(), c_dofs1D, c_dofs1D, c_dofs1D, NE);
--
--   constexpr static int MAX_D1D = HCURL_MAX_D1D;
--   MFEM_VERIFY(c_dofs1D <= MAX_D1D && o_dofs1D <= c_dofs1D, "");
--
--   mfem::forall(NE, [=] MFEM_HOST_DEVICE (int e)
--   {
--      double w1[MAX_D1D][MAX_D1D][MAX_D1D];
--      double w2[MAX_D1D][MAX_D1D][MAX_D1D];
--      // ---
--      // dofs that point parallel to x-axis (open in x, closed in y, z)
--      // ---
--
--      // contract in z
--      for (int dz = 0; dz < c_dofs1D; ++dz)
--      {
--         for (int ex = 0; ex < o_dofs1D; ++ex)
--         {
--            for (int ey = 0; ey < c_dofs1D; ++ey)
--            {
--               const int ez = dz;
--               const int local_index = ez*c_dofs1D*o_dofs1D + ey*o_dofs1D + ex;
--               w1[ex][ey][dz] = x(local_index, e);
--            }
--         }
--      }
--
--      // contract in y
--      for (int dz = 0; dz < c_dofs1D; ++dz)
--      {
--         for (int dy = 0; dy < c_dofs1D; ++dy)
--         {
--            for (int ex = 0; ex < o_dofs1D; ++ex)
--            {
--               const int ey = dy;
--               w2[ex][dy][dz] = w1[ex][ey][dz];
--            }
--         }
--      }
--
--      // contract in x
--      for (int dz = 0; dz < c_dofs1D; ++dz)
--      {
--         for (int dy = 0; dy < c_dofs1D; ++dy)
--         {
--            for (int dx = 0; dx < c_dofs1D; ++dx)
--            {
--               double s = 0.0;
--               for (int ex = 0; ex < o_dofs1D; ++ex)
--               {
--                  s += G(ex, dx) * w2[ex][dy][dz];
--               }
--               y(dx, dy, dz, e) += s;
--            }
--         }
--      }
--
--      // ---
--      // dofs that point parallel to y-axis (open in y, closed in x, z)
--      // ---
--
--      // contract in z
--      for (int dz = 0; dz < c_dofs1D; ++dz)
--      {
--         for (int ex = 0; ex < c_dofs1D; ++ex)
--         {
--            for (int ey = 0; ey < o_dofs1D; ++ey)
--            {
--               const int ez = dz;
--               const int local_index = c_dofs1D*c_dofs1D*o_dofs1D +
--                                       ez*c_dofs1D*o_dofs1D + ey*c_dofs1D + ex;
--               w1[ex][ey][dz] = x(local_index, e);
--            }
--         }
--      }
--
--      // contract in y
--      for (int dz = 0; dz < c_dofs1D; ++dz)
--      {
--         for (int dy = 0; dy < c_dofs1D; ++dy)
--         {
--            for (int ex = 0; ex < c_dofs1D; ++ex)
--            {
--               w2[ex][dy][dz] = 0.0;
--               for (int ey = 0; ey < o_dofs1D; ++ey)
--               {
--                  w2[ex][dy][dz] += G(ey, dy) * w1[ex][ey][dz];
--               }
--            }
--         }
--      }
--
--      // contract in x
--      for (int dz = 0; dz < c_dofs1D; ++dz)
--      {
--         for (int dy = 0; dy < c_dofs1D; ++dy)
--         {
--            for (int dx = 0; dx < c_dofs1D; ++dx)
--            {
--               const int ex = dx;
--               double s = w2[ex][dy][dz];
--               y(dx, dy, dz, e) += s;
--            }
--         }
--      }
--
--      // ---
--      // dofs that point parallel to z-axis (open in z, closed in x, y)
--      // ---
--
--      // contract in z
--      for (int dz = 0; dz < c_dofs1D; ++dz)
--      {
--         for (int ex = 0; ex < c_dofs1D; ++ex)
--         {
--            for (int ey = 0; ey < c_dofs1D; ++ey)
--            {
--               w1[ex][ey][dz] = 0.0;
--               for (int ez = 0; ez < o_dofs1D; ++ez)
--               {
--                  const int local_index = 2*c_dofs1D*c_dofs1D*o_dofs1D +
--                                          ez*c_dofs1D*c_dofs1D + ey*c_dofs1D + ex;
--                  w1[ex][ey][dz] += G(ez, dz) * x(local_index, e);
--               }
--            }
--         }
--      }
--
--      // contract in y
--      for (int dz = 0; dz < c_dofs1D; ++dz)
--      {
--         for (int dy = 0; dy < c_dofs1D; ++dy)
--         {
--            for (int ex = 0; ex < c_dofs1D; ++ex)
--            {
--               const int ey = dy;
--               w2[ex][dy][dz] = w1[ex][ey][dz];
--            }
--         }
--      }
--
--      // contract in x
--      for (int dz = 0; dz < c_dofs1D; ++dz)
--      {
--         for (int dy = 0; dy < c_dofs1D; ++dy)
--         {
--            for (int dx = 0; dx < c_dofs1D; ++dx)
--            {
--               const int ex = dx;
--               double s = w2[ex][dy][dz];
--               y(dx, dy, dz, e) += s;
--            }
--         }
--      }
--   });
--}
--
--void GradientInterpolator::AssemblePA(const FiniteElementSpace &trial_fes,
--                                      const FiniteElementSpace &test_fes)
--{
--   // Assumes tensor-product elements, with a vector test space and H^1 trial space.
--   Mesh *mesh = trial_fes.GetMesh();
--   const FiniteElement *trial_fel = trial_fes.GetFE(0);
--   const FiniteElement *test_fel = test_fes.GetFE(0);
--
--   const NodalTensorFiniteElement *trial_el =
--      dynamic_cast<const NodalTensorFiniteElement*>(trial_fel);
--   MFEM_VERIFY(trial_el != NULL, "Only NodalTensorFiniteElement is supported!");
--
--   const VectorTensorFiniteElement *test_el =
--      dynamic_cast<const VectorTensorFiniteElement*>(test_fel);
--   MFEM_VERIFY(test_el != NULL, "Only VectorTensorFiniteElement is supported!");
--
--   const int dims = trial_el->GetDim();
--   MFEM_VERIFY(dims == 2 || dims == 3, "Bad dimension!");
--   dim = mesh->Dimension();
--   MFEM_VERIFY(dim == 2 || dim == 3, "Bad dimension!");
--   MFEM_VERIFY(trial_el->GetOrder() == test_el->GetOrder(),
--               "Orders do not match!");
--   ne = trial_fes.GetNE();
--
--   const int order = trial_el->GetOrder();
--   dofquad_fe = new H1_SegmentElement(order, trial_el->GetBasisType());
--   mfem::QuadratureFunctions1D qf1d;
--   mfem::IntegrationRule closed_ir;
--   closed_ir.SetSize(order + 1);
--   qf1d.GaussLobatto(order + 1, &closed_ir);
--   mfem::IntegrationRule open_ir;
--   open_ir.SetSize(order);
--   qf1d.GaussLegendre(order, &open_ir);
--
--   maps_O_C = &dofquad_fe->GetDofToQuad(open_ir, DofToQuad::TENSOR);
--   o_dofs1D = maps_O_C->nqpt;
--   if (trial_el->GetBasisType() == BasisType::GaussLobatto)
--   {
--      B_id = true;
--      c_dofs1D = maps_O_C->ndof;
--   }
--   else
--   {
--      B_id = false;
--      maps_C_C = &dofquad_fe->GetDofToQuad(closed_ir, DofToQuad::TENSOR);
--      c_dofs1D = maps_C_C->nqpt;
--   }
--}
--
--void GradientInterpolator::AddMultPA(const Vector &x, Vector &y) const
--{
--   if (dim == 3)
--   {
--      if (B_id)
--      {
--         PAHcurlApplyGradient3DBId(c_dofs1D, o_dofs1D, ne,
--                                   maps_O_C->G, x, y);
--      }
--      else
--      {
--         PAHcurlApplyGradient3D(c_dofs1D, o_dofs1D, ne, maps_C_C->B,
--                                maps_O_C->G, x, y);
--      }
--   }
--   else if (dim == 2)
--   {
--      if (B_id)
--      {
--         PAHcurlApplyGradient2DBId(c_dofs1D, o_dofs1D, ne,
--                                   maps_O_C->G, x, y);
--      }
--      else
--      {
--         PAHcurlApplyGradient2D(c_dofs1D, o_dofs1D, ne, maps_C_C->B, maps_O_C->G,
--                                x, y);
--      }
--   }
--   else
--   {
--      mfem_error("Bad dimension!");
--   }
--}
--
--void GradientInterpolator::AddMultTransposePA(const Vector &x, Vector &y) const
--{
--   if (dim == 3)
--   {
--      if (B_id)
--      {
--         PAHcurlApplyGradientTranspose3DBId(c_dofs1D, o_dofs1D, ne,
--                                            maps_O_C->G, x, y);
--      }
--      else
--      {
--         PAHcurlApplyGradientTranspose3D(c_dofs1D, o_dofs1D, ne, maps_C_C->B,
--                                         maps_O_C->G, x, y);
--      }
--   }
--   else if (dim == 2)
--   {
--      if (B_id)
--      {
--         PAHcurlApplyGradientTranspose2DBId(c_dofs1D, o_dofs1D, ne,
--                                            maps_O_C->G, x, y);
--      }
--      else
--      {
--         PAHcurlApplyGradientTranspose2D(c_dofs1D, o_dofs1D, ne, maps_C_C->B,
--                                         maps_O_C->G, x, y);
--      }
--   }
--   else
--   {
--      mfem_error("Bad dimension!");
--   }
--}
--
--static void PAHcurlVecH1IdentityApply3D(const int c_dofs1D,
--                                        const int o_dofs1D,
--                                        const int NE,
--                                        const Array<double> &Bclosed,
--                                        const Array<double> &Bopen,
--                                        const Vector &pa_data,
--                                        const Vector &x_,
--                                        Vector &y_)
--{
--   auto Bc = Reshape(Bclosed.Read(), c_dofs1D, c_dofs1D);
--   auto Bo = Reshape(Bopen.Read(), o_dofs1D, c_dofs1D);
--
--   auto x = Reshape(x_.Read(), c_dofs1D, c_dofs1D, c_dofs1D, 3, NE);
--   auto y = Reshape(y_.ReadWrite(), (3 * c_dofs1D * c_dofs1D * o_dofs1D), NE);
--
--   auto vk = Reshape(pa_data.Read(), 3, (3 * c_dofs1D * c_dofs1D * o_dofs1D),
--                     NE);
--
--   constexpr static int MAX_D1D = HCURL_MAX_D1D;
--   MFEM_VERIFY(c_dofs1D <= MAX_D1D && o_dofs1D <= c_dofs1D, "");
--
--   mfem::forall(NE, [=] MFEM_HOST_DEVICE (int e)
--   {
--      double w1[3][MAX_D1D][MAX_D1D][MAX_D1D];
--      double w2[3][MAX_D1D][MAX_D1D][MAX_D1D];
--
--      // dofs that point parallel to x-axis (open in x, closed in y, z)
--
--      // contract in z
--      for (int ez = 0; ez < c_dofs1D; ++ez)
--      {
--         for (int dx = 0; dx < c_dofs1D; ++dx)
--         {
--            for (int dy = 0; dy < c_dofs1D; ++dy)
--            {
--               for (int j=0; j<3; ++j)
--               {
--                  w1[j][dx][dy][ez] = 0.0;
--                  for (int dz = 0; dz < c_dofs1D; ++dz)
--                  {
--                     w1[j][dx][dy][ez] += Bc(ez, dz) * x(dx, dy, dz, j, e);
--                  }
--               }
--            }
--         }
--      }
--
--      // contract in y
--      for (int ez = 0; ez < c_dofs1D; ++ez)
--      {
--         for (int ey = 0; ey < c_dofs1D; ++ey)
--         {
--            for (int dx = 0; dx < c_dofs1D; ++dx)
--            {
--               for (int j=0; j<3; ++j)
--               {
--                  w2[j][dx][ey][ez] = 0.0;
--                  for (int dy = 0; dy < c_dofs1D; ++dy)
--                  {
--                     w2[j][dx][ey][ez] += Bc(ey, dy) * w1[j][dx][dy][ez];
--                  }
--               }
--            }
--         }
--      }
--
--      // contract in x
--      for (int ez = 0; ez < c_dofs1D; ++ez)
--      {
--         for (int ey = 0; ey < c_dofs1D; ++ey)
--         {
--            for (int ex = 0; ex < o_dofs1D; ++ex)
--            {
--               for (int j=0; j<3; ++j)
--               {
--                  double s = 0.0;
--                  for (int dx = 0; dx < c_dofs1D; ++dx)
--                  {
--                     s += Bo(ex, dx) * w2[j][dx][ey][ez];
--                  }
--                  const int local_index = ez*c_dofs1D*o_dofs1D + ey*o_dofs1D + ex;
--                  y(local_index, e) += s * vk(j, local_index, e);
--               }
--            }
--         }
--      }
--
--      // dofs that point parallel to y-axis (open in y, closed in x, z)
--
--      // contract in z
--      for (int ez = 0; ez < c_dofs1D; ++ez)
--      {
--         for (int dx = 0; dx < c_dofs1D; ++dx)
--         {
--            for (int dy = 0; dy < c_dofs1D; ++dy)
--            {
--               for (int j=0; j<3; ++j)
--               {
--                  w1[j][dx][dy][ez] = 0.0;
--                  for (int dz = 0; dz < c_dofs1D; ++dz)
--                  {
--                     w1[j][dx][dy][ez] += Bc(ez, dz) * x(dx, dy, dz, j, e);
--                  }
--               }
--            }
--         }
--      }
--
--      // contract in y
--      for (int ez = 0; ez < c_dofs1D; ++ez)
--      {
--         for (int ey = 0; ey < o_dofs1D; ++ey)
--         {
--            for (int dx = 0; dx < c_dofs1D; ++dx)
--            {
--               for (int j=0; j<3; ++j)
--               {
--                  w2[j][dx][ey][ez] = 0.0;
--                  for (int dy = 0; dy < c_dofs1D; ++dy)
--                  {
--                     w2[j][dx][ey][ez] += Bo(ey, dy) * w1[j][dx][dy][ez];
--                  }
--               }
--            }
--         }
--      }
--
--      // contract in x
--      for (int ez = 0; ez < c_dofs1D; ++ez)
--      {
--         for (int ey = 0; ey < o_dofs1D; ++ey)
--         {
--            for (int ex = 0; ex < c_dofs1D; ++ex)
--            {
--               for (int j=0; j<3; ++j)
--               {
--                  double s = 0.0;
--                  for (int dx = 0; dx < c_dofs1D; ++dx)
--                  {
--                     s += Bc(ex, dx) * w2[j][dx][ey][ez];
--                  }
--                  const int local_index = c_dofs1D*c_dofs1D*o_dofs1D +
--                                          ez*c_dofs1D*o_dofs1D + ey*c_dofs1D + ex;
--                  y(local_index, e) += s * vk(j, local_index, e);
--               }
--            }
--         }
--      }
--
--      // dofs that point parallel to z-axis (open in z, closed in x, y)
--
--      // contract in z
--      for (int ez = 0; ez < o_dofs1D; ++ez)
--      {
--         for (int dx = 0; dx < c_dofs1D; ++dx)
--         {
--            for (int dy = 0; dy < c_dofs1D; ++dy)
--            {
--               for (int j=0; j<3; ++j)
--               {
--                  w1[j][dx][dy][ez] = 0.0;
--                  for (int dz = 0; dz < c_dofs1D; ++dz)
--                  {
--                     w1[j][dx][dy][ez] += Bo(ez, dz) * x(dx, dy, dz, j, e);
--                  }
--               }
--            }
--         }
--      }
--
--      // contract in y
--      for (int ez = 0; ez < o_dofs1D; ++ez)
--      {
--         for (int ey = 0; ey < c_dofs1D; ++ey)
--         {
--            for (int dx = 0; dx < c_dofs1D; ++dx)
--            {
--               for (int j=0; j<3; ++j)
--               {
--                  w2[j][dx][ey][ez] = 0.0;
--                  for (int dy = 0; dy < c_dofs1D; ++dy)
--                  {
--                     w2[j][dx][ey][ez] += Bc(ey, dy) * w1[j][dx][dy][ez];
--                  }
--               }
--            }
--         }
--      }
--
--      // contract in x
--      for (int ez = 0; ez < o_dofs1D; ++ez)
--      {
--         for (int ey = 0; ey < c_dofs1D; ++ey)
--         {
--            for (int ex = 0; ex < c_dofs1D; ++ex)
--            {
--               for (int j=0; j<3; ++j)
--               {
--                  double s = 0.0;
--                  for (int dx = 0; dx < c_dofs1D; ++dx)
--                  {
--                     s += Bc(ex, dx) * w2[j][dx][ey][ez];
--                  }
--                  const int local_index = 2*c_dofs1D*c_dofs1D*o_dofs1D +
--                                          ez*c_dofs1D*c_dofs1D + ey*c_dofs1D + ex;
--                  y(local_index, e) += s * vk(j, local_index, e);
--               }
--            }
--         }
--      }
--   });
--}
--
--static void PAHcurlVecH1IdentityApplyTranspose3D(const int c_dofs1D,
--                                                 const int o_dofs1D,
--                                                 const int NE,
--                                                 const Array<double> &Bclosed,
--                                                 const Array<double> &Bopen,
--                                                 const Vector &pa_data,
--                                                 const Vector &x_,
--                                                 Vector &y_)
--{
--   auto Bc = Reshape(Bclosed.Read(), c_dofs1D, c_dofs1D);
--   auto Bo = Reshape(Bopen.Read(), o_dofs1D, c_dofs1D);
--
--   auto x = Reshape(x_.Read(), (3 * c_dofs1D * c_dofs1D * o_dofs1D), NE);
--   auto y = Reshape(y_.ReadWrite(), c_dofs1D, c_dofs1D, c_dofs1D, 3, NE);
--
--   auto vk = Reshape(pa_data.Read(), 3, (3 * c_dofs1D * c_dofs1D * o_dofs1D),
--                     NE);
--
--   constexpr static int MAX_D1D = HCURL_MAX_D1D;
--
--   MFEM_VERIFY(c_dofs1D <= MAX_D1D && o_dofs1D <= c_dofs1D, "");
--
--   mfem::forall(NE, [=] MFEM_HOST_DEVICE (int e)
--   {
--      double w1[3][MAX_D1D][MAX_D1D][MAX_D1D];
--      double w2[3][MAX_D1D][MAX_D1D][MAX_D1D];
--
--      // dofs that point parallel to x-axis (open in x, closed in y, z)
--
--      // contract in x
--      for (int ez = 0; ez < c_dofs1D; ++ez)
--      {
--         for (int ey = 0; ey < c_dofs1D; ++ey)
--         {
--            for (int j=0; j<3; ++j)
--            {
--               for (int dx = 0; dx < c_dofs1D; ++dx)
--               {
--                  w2[j][dx][ey][ez] = 0.0;
--               }
--               for (int ex = 0; ex < o_dofs1D; ++ex)
--               {
--                  const int local_index = ez*c_dofs1D*o_dofs1D + ey*o_dofs1D + ex;
--                  const double xv = x(local_index, e) * vk(j, local_index, e);
--                  for (int dx = 0; dx < c_dofs1D; ++dx)
--                  {
--                     w2[j][dx][ey][ez] += xv * Bo(ex, dx);
--                  }
--               }
--            }
--         }
--      }
--
--      // contract in y
--      for (int ez = 0; ez < c_dofs1D; ++ez)
--      {
--         for (int dx = 0; dx < c_dofs1D; ++dx)
--         {
--            for (int dy = 0; dy < c_dofs1D; ++dy)
--            {
--               for (int j=0; j<3; ++j)
--               {
--                  w1[j][dx][dy][ez] = 0.0;
--                  for (int ey = 0; ey < c_dofs1D; ++ey)
--                  {
--                     w1[j][dx][dy][ez] += w2[j][dx][ey][ez] * Bc(ey, dy);
--                  }
--               }
--            }
--         }
--      }
--
--      // contract in z
--      for (int dx = 0; dx < c_dofs1D; ++dx)
--      {
--         for (int dy = 0; dy < c_dofs1D; ++dy)
--         {
--            for (int dz = 0; dz < c_dofs1D; ++dz)
--            {
--               for (int j=0; j<3; ++j)
--               {
--                  double s = 0.0;
--                  for (int ez = 0; ez < c_dofs1D; ++ez)
--                  {
--                     s += w1[j][dx][dy][ez] * Bc(ez, dz);
--                  }
--                  y(dx, dy, dz, j, e) += s;
--               }
--            }
--         }
--      }
--
--      // dofs that point parallel to y-axis (open in y, closed in x, z)
--
--      // contract in x
--      for (int ez = 0; ez < c_dofs1D; ++ez)
--      {
--         for (int ey = 0; ey < o_dofs1D; ++ey)
--         {
--            for (int j=0; j<3; ++j)
--            {
--               for (int dx = 0; dx < c_dofs1D; ++dx)
--               {
--                  w2[j][dx][ey][ez] = 0.0;
--               }
--               for (int ex = 0; ex < c_dofs1D; ++ex)
--               {
--                  const int local_index = c_dofs1D*c_dofs1D*o_dofs1D +
--                                          ez*c_dofs1D*o_dofs1D + ey*c_dofs1D + ex;
--                  const double xv = x(local_index, e) * vk(j, local_index, e);
--                  for (int dx = 0; dx < c_dofs1D; ++dx)
--                  {
--                     w2[j][dx][ey][ez] += xv * Bc(ex, dx);
--                  }
--               }
--            }
--         }
--      }
--
--      // contract in y
--      for (int ez = 0; ez < c_dofs1D; ++ez)
--      {
--         for (int dx = 0; dx < c_dofs1D; ++dx)
--         {
--            for (int dy = 0; dy < c_dofs1D; ++dy)
--            {
--               for (int j=0; j<3; ++j)
--               {
--                  w1[j][dx][dy][ez] = 0.0;
--                  for (int ey = 0; ey < o_dofs1D; ++ey)
--                  {
--                     w1[j][dx][dy][ez] += w2[j][dx][ey][ez] * Bo(ey, dy);
--                  }
--               }
--            }
--         }
--      }
--
--      // contract in z
--      for (int dx = 0; dx < c_dofs1D; ++dx)
--      {
--         for (int dy = 0; dy < c_dofs1D; ++dy)
--         {
--            for (int dz = 0; dz < c_dofs1D; ++dz)
--            {
--               for (int j=0; j<3; ++j)
--               {
--                  double s = 0.0;
--                  for (int ez = 0; ez < c_dofs1D; ++ez)
--                  {
--                     s += w1[j][dx][dy][ez] * Bc(ez, dz);
--                  }
--                  y(dx, dy, dz, j, e) += s;
--               }
--            }
--         }
--      }
--
--      // dofs that point parallel to z-axis (open in z, closed in x, y)
--
--      // contract in x
--      for (int ez = 0; ez < o_dofs1D; ++ez)
--      {
--         for (int ey = 0; ey < c_dofs1D; ++ey)
--         {
--            for (int j=0; j<3; ++j)
--            {
--               for (int dx = 0; dx < c_dofs1D; ++dx)
--               {
--                  w2[j][dx][ey][ez] = 0.0;
--               }
--               for (int ex = 0; ex < c_dofs1D; ++ex)
--               {
--                  const int local_index = 2*c_dofs1D*c_dofs1D*o_dofs1D +
--                                          ez*c_dofs1D*c_dofs1D + ey*c_dofs1D + ex;
--                  const double xv = x(local_index, e) * vk(j, local_index, e);
--                  for (int dx = 0; dx < c_dofs1D; ++dx)
--                  {
--                     w2[j][dx][ey][ez] += xv * Bc(ex, dx);
--                  }
--               }
--            }
--         }
--      }
--
--      // contract in y
--      for (int ez = 0; ez < o_dofs1D; ++ez)
--      {
--         for (int dx = 0; dx < c_dofs1D; ++dx)
--         {
--            for (int dy = 0; dy < c_dofs1D; ++dy)
--            {
--               for (int j=0; j<3; ++j)
--               {
--                  w1[j][dx][dy][ez] = 0.0;
--                  for (int ey = 0; ey < c_dofs1D; ++ey)
--                  {
--                     w1[j][dx][dy][ez] += w2[j][dx][ey][ez] * Bc(ey, dy);
--                  }
--               }
--            }
--         }
--      }
--
--      // contract in z
--      for (int dx = 0; dx < c_dofs1D; ++dx)
--      {
--         for (int dy = 0; dy < c_dofs1D; ++dy)
--         {
--            for (int dz = 0; dz < c_dofs1D; ++dz)
--            {
--               for (int j=0; j<3; ++j)
--               {
--                  double s = 0.0;
--                  for (int ez = 0; ez < o_dofs1D; ++ez)
--                  {
--                     s += w1[j][dx][dy][ez] * Bo(ez, dz);
--                  }
--                  y(dx, dy, dz, j, e) += s;
--               }
--            }
--         }
--      }
--   });
--}
--
--static void PAHcurlVecH1IdentityApply2D(const int c_dofs1D,
--                                        const int o_dofs1D,
--                                        const int NE,
--                                        const Array<double> &Bclosed,
--                                        const Array<double> &Bopen,
--                                        const Vector &pa_data,
--                                        const Vector &x_,
--                                        Vector &y_)
--{
--   auto Bc = Reshape(Bclosed.Read(), c_dofs1D, c_dofs1D);
--   auto Bo = Reshape(Bopen.Read(), o_dofs1D, c_dofs1D);
--
--   auto x = Reshape(x_.Read(), c_dofs1D, c_dofs1D, 2, NE);
--   auto y = Reshape(y_.ReadWrite(), (2 * c_dofs1D * o_dofs1D), NE);
--
--   auto vk = Reshape(pa_data.Read(), 2, (2 * c_dofs1D * o_dofs1D), NE);
--
--   constexpr static int MAX_D1D = HCURL_MAX_D1D;
--
--   MFEM_VERIFY(c_dofs1D <= MAX_D1D && o_dofs1D <= c_dofs1D, "");
--
--   mfem::forall(NE, [=] MFEM_HOST_DEVICE (int e)
--   {
--      double w[2][MAX_D1D][MAX_D1D];
--
--      // dofs that point parallel to x-axis (open in x, closed in y)
--
--      // contract in y
--      for (int ey = 0; ey < c_dofs1D; ++ey)
--      {
--         for (int dx = 0; dx < c_dofs1D; ++dx)
--         {
--            for (int j=0; j<2; ++j)
--            {
--               w[j][dx][ey] = 0.0;
--               for (int dy = 0; dy < c_dofs1D; ++dy)
--               {
--                  w[j][dx][ey] += Bc(ey, dy) * x(dx, dy, j, e);
--               }
--            }
--         }
--      }
--
--      // contract in x
--      for (int ey = 0; ey < c_dofs1D; ++ey)
--      {
--         for (int ex = 0; ex < o_dofs1D; ++ex)
--         {
--            for (int j=0; j<2; ++j)
--            {
--               double s = 0.0;
--               for (int dx = 0; dx < c_dofs1D; ++dx)
--               {
--                  s += Bo(ex, dx) * w[j][dx][ey];
--               }
--               const int local_index = ey*o_dofs1D + ex;
--               y(local_index, e) += s * vk(j, local_index, e);
--            }
--         }
--      }
--
--      // dofs that point parallel to y-axis (open in y, closed in x)
--
--      // contract in y
--      for (int ey = 0; ey < o_dofs1D; ++ey)
--      {
--         for (int dx = 0; dx < c_dofs1D; ++dx)
--         {
--            for (int j=0; j<2; ++j)
--            {
--               w[j][dx][ey] = 0.0;
--               for (int dy = 0; dy < c_dofs1D; ++dy)
--               {
--                  w[j][dx][ey] += Bo(ey, dy) * x(dx, dy, j, e);
--               }
--            }
--         }
--      }
--
--      // contract in x
--      for (int ey = 0; ey < o_dofs1D; ++ey)
--      {
--         for (int ex = 0; ex < c_dofs1D; ++ex)
--         {
--            for (int j=0; j<2; ++j)
--            {
--               double s = 0.0;
--               for (int dx = 0; dx < c_dofs1D; ++dx)
--               {
--                  s += Bc(ex, dx) * w[j][dx][ey];
--               }
--               const int local_index = c_dofs1D*o_dofs1D + ey*c_dofs1D + ex;
--               y(local_index, e) += s * vk(j, local_index, e);
--            }
--         }
--      }
--   });
--}
--
--static void PAHcurlVecH1IdentityApplyTranspose2D(const int c_dofs1D,
--                                                 const int o_dofs1D,
--                                                 const int NE,
--                                                 const Array<double> &Bclosed,
--                                                 const Array<double> &Bopen,
--                                                 const Vector &pa_data,
--                                                 const Vector &x_,
--                                                 Vector &y_)
--{
--   auto Bc = Reshape(Bclosed.Read(), c_dofs1D, c_dofs1D);
--   auto Bo = Reshape(Bopen.Read(), o_dofs1D, c_dofs1D);
--
--   auto x = Reshape(x_.Read(), (2 * c_dofs1D * o_dofs1D), NE);
--   auto y = Reshape(y_.ReadWrite(), c_dofs1D, c_dofs1D, 2, NE);
--
--   auto vk = Reshape(pa_data.Read(), 2, (2 * c_dofs1D * o_dofs1D), NE);
--
--   constexpr static int MAX_D1D = HCURL_MAX_D1D;
--   //constexpr static int MAX_Q1D = HCURL_MAX_Q1D;
--
--   MFEM_VERIFY(c_dofs1D <= MAX_D1D && o_dofs1D <= c_dofs1D, "");
--
--   mfem::forall(NE, [=] MFEM_HOST_DEVICE (int e)
--   {
--      double w[2][MAX_D1D][MAX_D1D];
--
--      // dofs that point parallel to x-axis (open in x, closed in y)
--
--      // contract in x
--      for (int ey = 0; ey < c_dofs1D; ++ey)
--      {
--         for (int dx = 0; dx < c_dofs1D; ++dx)
--         {
--            for (int j=0; j<2; ++j) { w[j][dx][ey] = 0.0; }
--         }
--         for (int ex = 0; ex < o_dofs1D; ++ex)
--         {
--            const int local_index = ey*o_dofs1D + ex;
--            const double xd = x(local_index, e);
--
--            for (int dx = 0; dx < c_dofs1D; ++dx)
--            {
--               for (int j=0; j<2; ++j)
--               {
--                  w[j][dx][ey] += Bo(ex, dx) * xd * vk(j, local_index, e);
--               }
--            }
--         }
--      }
--
--      // contract in y
--      for (int dx = 0; dx < c_dofs1D; ++dx)
--      {
--         for (int dy = 0; dy < c_dofs1D; ++dy)
--         {
--            for (int j=0; j<2; ++j)
--            {
--               double s = 0.0;
--               for (int ey = 0; ey < c_dofs1D; ++ey)
--               {
--                  s += w[j][dx][ey] * Bc(ey, dy);
--               }
--               y(dx, dy, j, e) += s;
--            }
--         }
--      }
--
--      // dofs that point parallel to y-axis (open in y, closed in x)
--
--      // contract in x
--      for (int ey = 0; ey < o_dofs1D; ++ey)
--      {
--         for (int dx = 0; dx < c_dofs1D; ++dx)
--         {
--            for (int j=0; j<2; ++j) { w[j][dx][ey] = 0.0; }
--         }
--         for (int ex = 0; ex < c_dofs1D; ++ex)
--         {
--            const int local_index = c_dofs1D*o_dofs1D + ey*c_dofs1D + ex;
--            const double xd = x(local_index, e);
--            for (int dx = 0; dx < c_dofs1D; ++dx)
--            {
--               for (int j=0; j<2; ++j)
--               {
--                  w[j][dx][ey] += Bc(ex, dx) * xd * vk(j, local_index, e);
--               }
--            }
--         }
--      }
--
--      // contract in y
--      for (int dx = 0; dx < c_dofs1D; ++dx)
--      {
--         for (int dy = 0; dy < c_dofs1D; ++dy)
--         {
--            for (int j=0; j<2; ++j)
--            {
--               double s = 0.0;
--               for (int ey = 0; ey < o_dofs1D; ++ey)
--               {
--                  s += w[j][dx][ey] * Bo(ey, dy);
--               }
--               y(dx, dy, j, e) += s;
--            }
--         }
--      }
--   });
--}
--
--void IdentityInterpolator::AssemblePA(const FiniteElementSpace &trial_fes,
--                                      const FiniteElementSpace &test_fes)
--{
--   // Assumes tensor-product elements, with a vector test space and H^1 trial space.
--   Mesh *mesh = trial_fes.GetMesh();
--   const FiniteElement *trial_fel = trial_fes.GetFE(0);
--   const FiniteElement *test_fel = test_fes.GetFE(0);
--
--   const NodalTensorFiniteElement *trial_el =
--      dynamic_cast<const NodalTensorFiniteElement*>(trial_fel);
--   MFEM_VERIFY(trial_el != NULL, "Only NodalTensorFiniteElement is supported!");
--
--   const VectorTensorFiniteElement *test_el =
--      dynamic_cast<const VectorTensorFiniteElement*>(test_fel);
--   MFEM_VERIFY(test_el != NULL, "Only VectorTensorFiniteElement is supported!");
--
--   const int dims = trial_el->GetDim();
--   MFEM_VERIFY(dims == 2 || dims == 3, "");
--
--   dim = mesh->Dimension();
--   MFEM_VERIFY(dim == 2 || dim == 3, "");
--
--   MFEM_VERIFY(trial_el->GetOrder() == test_el->GetOrder(), "");
--
--   ne = trial_fes.GetNE();
--
--   const int order = trial_el->GetOrder();
--   dofquad_fe = new H1_SegmentElement(order);
--   mfem::QuadratureFunctions1D qf1d;
--   mfem::IntegrationRule closed_ir;
--   closed_ir.SetSize(order + 1);
--   qf1d.GaussLobatto(order + 1, &closed_ir);
--   mfem::IntegrationRule open_ir;
--   open_ir.SetSize(order);
--   qf1d.GaussLegendre(order, &open_ir);
--
--   maps_C_C = &dofquad_fe->GetDofToQuad(closed_ir, DofToQuad::TENSOR);
--   maps_O_C = &dofquad_fe->GetDofToQuad(open_ir, DofToQuad::TENSOR);
--
--   o_dofs1D = maps_O_C->nqpt;
--   c_dofs1D = maps_C_C->nqpt;
--   MFEM_VERIFY(maps_O_C->ndof == c_dofs1D &&
--               maps_C_C->ndof == c_dofs1D, "Discrepancy in the number of DOFs");
--
--   const int ndof_test = (dim == 3) ? 3 * c_dofs1D * c_dofs1D * o_dofs1D
--                         : 2 * c_dofs1D * o_dofs1D;
--
--   const IntegrationRule & Nodes = test_el->GetNodes();
--
--   pa_data.SetSize(dim * ndof_test * ne, Device::GetMemoryType());
--   auto op = Reshape(pa_data.HostWrite(), dim, ndof_test, ne);
--
--   const Array<int> &dofmap = test_el->GetDofMap();
--
--   if (dim == 3)
--   {
--      // Note that ND_HexahedronElement uses 6 vectors in tk rather than 3, with
--      // the last 3 having negative signs. Here the signs are all positive, as
--      // signs are applied in ElementRestriction.
--
--      const double tk[9] = { 1.,0.,0.,  0.,1.,0.,  0.,0.,1. };
--
--      for (int c=0; c<3; ++c)
--      {
--         for (int i=0; i<ndof_test/3; ++i)
--         {
--            const int d = (c*ndof_test/3) + i;
--            // ND_HexahedronElement sets dof2tk = (dofmap < 0) ? 3+c : c, but here
--            // no signs should be applied due to ElementRestriction.
--            const int dof2tk = c;
--            const int id = (dofmap[d] >= 0) ? dofmap[d] : -1 - dofmap[d];
--
--            for (int e=0; e<ne; ++e)
--            {
--               double v[3];
--               ElementTransformation *tr = mesh->GetElementTransformation(e);
--               tr->SetIntPoint(&Nodes.IntPoint(id));
--               tr->Jacobian().Mult(tk + dof2tk*dim, v);
--
--               for (int j=0; j<3; ++j)
--               {
--                  op(j,d,e) = v[j];
--               }
--            }
--         }
--      }
--   }
--   else // 2D case
--   {
--      const double tk[4] = { 1.,0.,  0.,1. };
--      for (int c=0; c<2; ++c)
--      {
--         for (int i=0; i<ndof_test/2; ++i)
--         {
--            const int d = (c*ndof_test/2) + i;
--            // ND_QuadrilateralElement sets dof2tk = (dofmap < 0) ? 2+c : c, but here
--            // no signs should be applied due to ElementRestriction.
--            const int dof2tk = c;
--            const int id = (dofmap[d] >= 0) ? dofmap[d] : -1 - dofmap[d];
--
--            for (int e=0; e<ne; ++e)
--            {
--               double v[2];
--               ElementTransformation *tr = mesh->GetElementTransformation(e);
--               tr->SetIntPoint(&Nodes.IntPoint(id));
--               tr->Jacobian().Mult(tk + dof2tk*dim, v);
--
--               for (int j=0; j<2; ++j)
--               {
--                  op(j,d,e) = v[j];
--               }
--            }
--         }
--      }
--   }
--}
--
--void IdentityInterpolator::AddMultPA(const Vector &x, Vector &y) const
--{
--   if (dim == 3)
--   {
--      PAHcurlVecH1IdentityApply3D(c_dofs1D, o_dofs1D, ne, maps_C_C->B, maps_O_C->B,
--                                  pa_data, x, y);
--   }
--   else if (dim == 2)
--   {
--      PAHcurlVecH1IdentityApply2D(c_dofs1D, o_dofs1D, ne, maps_C_C->B, maps_O_C->B,
--                                  pa_data, x, y);
--   }
--   else
--   {
--      mfem_error("Bad dimension!");
--   }
--}
--
--void IdentityInterpolator::AddMultTransposePA(const Vector &x, Vector &y) const
--{
--   if (dim == 3)
--   {
--      PAHcurlVecH1IdentityApplyTranspose3D(c_dofs1D, o_dofs1D, ne, maps_C_C->B,
--                                           maps_O_C->B, pa_data, x, y);
--   }
--   else if (dim == 2)
--   {
--      PAHcurlVecH1IdentityApplyTranspose2D(c_dofs1D, o_dofs1D, ne, maps_C_C->B,
--                                           maps_O_C->B, pa_data, x, y);
--   }
--   else
--   {
--      mfem_error("Bad dimension!");
--   }
--}
--
--template void SmemPAHcurlMassAssembleDiagonal3D<0,0>(const int D1D,
--                                                     const int Q1D,
--                                                     const int NE,
--                                                     const bool symmetric,
--                                                     const Array<double> &bo,
--                                                     const Array<double> &bc,
--                                                     const Vector &pa_data,
--                                                     Vector &diag);
--
--template void SmemPAHcurlMassAssembleDiagonal3D<2,3>(const int D1D,
--                                                     const int Q1D,
--                                                     const int NE,
--                                                     const bool symmetric,
--                                                     const Array<double> &bo,
--                                                     const Array<double> &bc,
--                                                     const Vector &pa_data,
--                                                     Vector &diag);
--
--template void SmemPAHcurlMassAssembleDiagonal3D<3,4>(const int D1D,
--                                                     const int Q1D,
--                                                     const int NE,
--                                                     const bool symmetric,
--                                                     const Array<double> &bo,
--                                                     const Array<double> &bc,
--                                                     const Vector &pa_data,
--                                                     Vector &diag);
--
--template void SmemPAHcurlMassAssembleDiagonal3D<4,5>(const int D1D,
--                                                     const int Q1D,
--                                                     const int NE,
--                                                     const bool symmetric,
--                                                     const Array<double> &bo,
--                                                     const Array<double> &bc,
--                                                     const Vector &pa_data,
--                                                     Vector &diag);
--
--template void SmemPAHcurlMassAssembleDiagonal3D<5,6>(const int D1D,
--                                                     const int Q1D,
--                                                     const int NE,
--                                                     const bool symmetric,
--                                                     const Array<double> &bo,
--                                                     const Array<double> &bc,
--                                                     const Vector &pa_data,
--                                                     Vector &diag);
--
--template void SmemPAHcurlMassApply3D<0,0>(const int D1D,
--                                          const int Q1D,
--                                          const int NE,
--                                          const bool symmetric,
--                                          const Array<double> &bo,
--                                          const Array<double> &bc,
--                                          const Array<double> &bot,
--                                          const Array<double> &bct,
--                                          const Vector &pa_data,
--                                          const Vector &x,
--                                          Vector &y);
--
--template void SmemPAHcurlMassApply3D<2,3>(const int D1D,
--                                          const int Q1D,
--                                          const int NE,
--                                          const bool symmetric,
--                                          const Array<double> &bo,
--                                          const Array<double> &bc,
--                                          const Array<double> &bot,
--                                          const Array<double> &bct,
--                                          const Vector &pa_data,
--                                          const Vector &x,
--                                          Vector &y);
--
--template void SmemPAHcurlMassApply3D<3,4>(const int D1D,
--                                          const int Q1D,
--                                          const int NE,
--                                          const bool symmetric,
--                                          const Array<double> &bo,
--                                          const Array<double> &bc,
--                                          const Array<double> &bot,
--                                          const Array<double> &bct,
--                                          const Vector &pa_data,
--                                          const Vector &x,
--                                          Vector &y);
--
--template void SmemPAHcurlMassApply3D<4,5>(const int D1D,
--                                          const int Q1D,
--                                          const int NE,
--                                          const bool symmetric,
--                                          const Array<double> &bo,
--                                          const Array<double> &bc,
--                                          const Array<double> &bot,
--                                          const Array<double> &bct,
--                                          const Vector &pa_data,
--                                          const Vector &x,
--                                          Vector &y);
--
--template void SmemPAHcurlMassApply3D<5,6>(const int D1D,
--                                          const int Q1D,
--                                          const int NE,
--                                          const bool symmetric,
--                                          const Array<double> &bo,
--                                          const Array<double> &bc,
--                                          const Array<double> &bot,
--                                          const Array<double> &bct,
--                                          const Vector &pa_data,
--                                          const Vector &x,
--                                          Vector &y);
--
--} // namespace mfem
-diff --git a/fem/bilininteg_mass_pa.cpp b/fem/bilininteg_mass_pa.cpp
-deleted file mode 100644
-index 06156d030..000000000
---- a/fem/bilininteg_mass_pa.cpp
-+++ /dev/null
-@@ -1,737 +0,0 @@
--// Copyright (c) 2010-2023, Lawrence Livermore National Security, LLC. Produced
--// at the Lawrence Livermore National Laboratory. All Rights reserved. See files
--// LICENSE and NOTICE for details. LLNL-CODE-806117.
--//
--// This file is part of the MFEM library. For more information and source code
--// availability visit https://mfem.org.
--//
--// MFEM is free software; you can redistribute it and/or modify it under the
--// terms of the BSD-3 license. We welcome feedback and contributions, see file
--// CONTRIBUTING.md for details.
--
--#include "../general/forall.hpp"
--#include "bilininteg.hpp"
--#include "gridfunc.hpp"
--#include "qfunction.hpp"
--#include "ceed/integrators/mass/mass.hpp"
--#include "bilininteg_mass_pa.hpp"
--
--using namespace std;
--
--namespace mfem
--{
--
--// PA Mass Integrator
--
--// PA Mass Assemble kernel
--
--void MassIntegrator::AssemblePA(const FiniteElementSpace &fes)
--{
--   const MemoryType mt = (pa_mt == MemoryType::DEFAULT) ?
--                         Device::GetDeviceMemoryType() : pa_mt;
--
--   // Assuming the same element type
--   fespace = &fes;
--   Mesh *mesh = fes.GetMesh();
--   if (mesh->GetNE() == 0) { return; }
--   const FiniteElement &el = *fes.GetFE(0);
--   ElementTransformation *T0 = mesh->GetElementTransformation(0);
--   const IntegrationRule *ir = IntRule ? IntRule : &GetRule(el, el, *T0);
--   if (DeviceCanUseCeed())
--   {
--      delete ceedOp;
--      const bool mixed = mesh->GetNumGeometries(mesh->Dimension()) > 1 ||
--                         fes.IsVariableOrder();
--      if (mixed)
--      {
--         ceedOp = new ceed::MixedPAMassIntegrator(*this, fes, Q);
--      }
--      else
--      {
--         ceedOp = new ceed::PAMassIntegrator(fes, *ir, Q);
--      }
--      return;
--   }
--   int map_type = el.GetMapType();
--   dim = mesh->Dimension();
--   ne = fes.GetMesh()->GetNE();
--   nq = ir->GetNPoints();
--   geom = mesh->GetGeometricFactors(*ir, GeometricFactors::DETERMINANTS, mt);
--   maps = &el.GetDofToQuad(*ir, DofToQuad::TENSOR);
--   dofs1D = maps->ndof;
--   quad1D = maps->nqpt;
--   pa_data.SetSize(ne*nq, mt);
--
--   QuadratureSpace qs(*mesh, *ir);
--   CoefficientVector coeff(Q, qs, CoefficientStorage::COMPRESSED);
--
--   if (dim==1) { MFEM_ABORT("Not supported yet... stay tuned!"); }
--   if (dim==2)
--   {
--      const int NE = ne;
--      const int Q1D = quad1D;
--      const bool const_c = coeff.Size() == 1;
--      const bool by_val = map_type == FiniteElement::VALUE;
--      const auto W = Reshape(ir->GetWeights().Read(), Q1D,Q1D);
--      const auto J = Reshape(geom->detJ.Read(), Q1D,Q1D,NE);
--      const auto C = const_c ? Reshape(coeff.Read(), 1,1,1) :
--                     Reshape(coeff.Read(), Q1D,Q1D,NE);
--      auto v = Reshape(pa_data.Write(), Q1D,Q1D, NE);
--      mfem::forall_2D(NE,Q1D,Q1D, [=] MFEM_HOST_DEVICE (int e)
--      {
--         MFEM_FOREACH_THREAD(qx,x,Q1D)
--         {
--            MFEM_FOREACH_THREAD(qy,y,Q1D)
--            {
--               const double detJ = J(qx,qy,e);
--               const double coeff = const_c ? C(0,0,0) : C(qx,qy,e);
--               v(qx,qy,e) =  W(qx,qy) * coeff * (by_val ? detJ : 1.0/detJ);
--            }
--         }
--      });
--   }
--   if (dim==3)
--   {
--      const int NE = ne;
--      const int Q1D = quad1D;
--      const bool const_c = coeff.Size() == 1;
--      const bool by_val = map_type == FiniteElement::VALUE;
--      const auto W = Reshape(ir->GetWeights().Read(), Q1D,Q1D,Q1D);
--      const auto J = Reshape(geom->detJ.Read(), Q1D,Q1D,Q1D,NE);
--      const auto C = const_c ? Reshape(coeff.Read(), 1,1,1,1) :
--                     Reshape(coeff.Read(), Q1D,Q1D,Q1D,NE);
--      auto v = Reshape(pa_data.Write(), Q1D,Q1D,Q1D,NE);
--      mfem::forall_3D(NE, Q1D, Q1D, Q1D, [=] MFEM_HOST_DEVICE (int e)
--      {
--         MFEM_FOREACH_THREAD(qx,x,Q1D)
--         {
--            MFEM_FOREACH_THREAD(qy,y,Q1D)
--            {
--               MFEM_FOREACH_THREAD(qz,z,Q1D)
--               {
--                  const double detJ = J(qx,qy,qz,e);
--                  const double coeff = const_c ? C(0,0,0,0) : C(qx,qy,qz,e);
--                  v(qx,qy,qz,e) = W(qx,qy,qz) * coeff * (by_val ? detJ : 1.0/detJ);
--               }
--            }
--         }
--      });
--   }
--}
--
--template<int T_D1D = 0, int T_Q1D = 0>
--static void PAMassAssembleDiagonal2D(const int NE,
--                                     const Array<double> &b,
--                                     const Vector &d,
--                                     Vector &y,
--                                     const int d1d = 0,
--                                     const int q1d = 0)
--{
--   const int D1D = T_D1D ? T_D1D : d1d;
--   const int Q1D = T_Q1D ? T_Q1D : q1d;
--   MFEM_VERIFY(D1D <= MAX_D1D, "");
--   MFEM_VERIFY(Q1D <= MAX_Q1D, "");
--   auto B = Reshape(b.Read(), Q1D, D1D);
--   auto D = Reshape(d.Read(), Q1D, Q1D, NE);
--   auto Y = Reshape(y.ReadWrite(), D1D, D1D, NE);
--   mfem::forall(NE, [=] MFEM_HOST_DEVICE (int e)
--   {
--      const int D1D = T_D1D ? T_D1D : d1d;
--      const int Q1D = T_Q1D ? T_Q1D : q1d;
--      constexpr int MD1 = T_D1D ? T_D1D : MAX_D1D;
--      constexpr int MQ1 = T_Q1D ? T_Q1D : MAX_Q1D;
--      double QD[MQ1][MD1];
--      for (int qx = 0; qx < Q1D; ++qx)
--      {
--         for (int dy = 0; dy < D1D; ++dy)
--         {
--            QD[qx][dy] = 0.0;
--            for (int qy = 0; qy < Q1D; ++qy)
--            {
--               QD[qx][dy] += B(qy, dy) * B(qy, dy) * D(qx, qy, e);
--            }
--         }
--      }
--      for (int dy = 0; dy < D1D; ++dy)
--      {
--         for (int dx = 0; dx < D1D; ++dx)
--         {
--            for (int qx = 0; qx < Q1D; ++qx)
--            {
--               Y(dx,dy,e) += B(qx, dx) * B(qx, dx) * QD[qx][dy];
--            }
--         }
--      }
--   });
--}
--
--template<int T_D1D = 0, int T_Q1D = 0, int T_NBZ = 0>
--static void SmemPAMassAssembleDiagonal2D(const int NE,
--                                         const Array<double> &b_,
--                                         const Vector &d_,
--                                         Vector &y_,
--                                         const int d1d = 0,
--                                         const int q1d = 0)
--{
--   const int D1D = T_D1D ? T_D1D : d1d;
--   const int Q1D = T_Q1D ? T_Q1D : q1d;
--   constexpr int NBZ = T_NBZ ? T_NBZ : 1;
--   constexpr int MQ1 = T_Q1D ? T_Q1D : MAX_Q1D;
--   constexpr int MD1 = T_D1D ? T_D1D : MAX_D1D;
--   MFEM_VERIFY(D1D <= MD1, "");
--   MFEM_VERIFY(Q1D <= MQ1, "");
--   auto b = Reshape(b_.Read(), Q1D, D1D);
--   auto D = Reshape(d_.Read(), Q1D, Q1D, NE);
--   auto Y = Reshape(y_.ReadWrite(), D1D, D1D, NE);
--   mfem::forall_2D_batch(NE, Q1D, Q1D, NBZ, [=] MFEM_HOST_DEVICE (int e)
--   {
--      const int tidz = MFEM_THREAD_ID(z);
--      const int D1D = T_D1D ? T_D1D : d1d;
--      const int Q1D = T_Q1D ? T_Q1D : q1d;
--      constexpr int NBZ = T_NBZ ? T_NBZ : 1;
--      constexpr int MQ1 = T_Q1D ? T_Q1D : MAX_Q1D;
--      constexpr int MD1 = T_D1D ? T_D1D : MAX_D1D;
--      MFEM_SHARED double B[MQ1][MD1];
--      MFEM_SHARED double QDZ[NBZ][MQ1][MD1];
--      double (*QD)[MD1] = (double (*)[MD1])(QDZ + tidz);
--      if (tidz == 0)
--      {
--         MFEM_FOREACH_THREAD(d,y,D1D)
--         {
--            MFEM_FOREACH_THREAD(q,x,Q1D)
--            {
--               B[q][d] = b(q,d);
--            }
--         }
--      }
--      MFEM_SYNC_THREAD;
--      MFEM_FOREACH_THREAD(qx,x,Q1D)
--      {
--         MFEM_FOREACH_THREAD(dy,y,D1D)
--         {
--            QD[qx][dy] = 0.0;
--            for (int qy = 0; qy < Q1D; ++qy)
--            {
--               QD[qx][dy] += B[qy][dy] * B[qy][dy] * D(qx, qy, e);
--            }
--         }
--      }
--      MFEM_SYNC_THREAD;
--      MFEM_FOREACH_THREAD(dy,y,D1D)
--      {
--         MFEM_FOREACH_THREAD(dx,x,D1D)
--         {
--            for (int qx = 0; qx < Q1D; ++qx)
--            {
--               // might need absolute values on next line
--               Y(dx,dy,e) += B[qx][dx] * B[qx][dx] * QD[qx][dy];
--            }
--         }
--      }
--   });
--}
--
--template<int T_D1D = 0, int T_Q1D = 0>
--static void PAMassAssembleDiagonal3D(const int NE,
--                                     const Array<double> &b,
--                                     const Vector &d,
--                                     Vector &y,
--                                     const int d1d = 0,
--                                     const int q1d = 0)
--{
--   const int D1D = T_D1D ? T_D1D : d1d;
--   const int Q1D = T_Q1D ? T_Q1D : q1d;
--   MFEM_VERIFY(D1D <= MAX_D1D, "");
--   MFEM_VERIFY(Q1D <= MAX_Q1D, "");
--   auto B = Reshape(b.Read(), Q1D, D1D);
--   auto D = Reshape(d.Read(), Q1D, Q1D, Q1D, NE);
--   auto Y = Reshape(y.ReadWrite(), D1D, D1D, D1D, NE);
--   mfem::forall(NE, [=] MFEM_HOST_DEVICE (int e)
--   {
--      const int D1D = T_D1D ? T_D1D : d1d;
--      const int Q1D = T_Q1D ? T_Q1D : q1d;
--      constexpr int MD1 = T_D1D ? T_D1D : MAX_D1D;
--      constexpr int MQ1 = T_Q1D ? T_Q1D : MAX_Q1D;
--      double QQD[MQ1][MQ1][MD1];
--      double QDD[MQ1][MD1][MD1];
--      for (int qx = 0; qx < Q1D; ++qx)
--      {
--         for (int qy = 0; qy < Q1D; ++qy)
--         {
--            for (int dz = 0; dz < D1D; ++dz)
--            {
--               QQD[qx][qy][dz] = 0.0;
--               for (int qz = 0; qz < Q1D; ++qz)
--               {
--                  QQD[qx][qy][dz] += B(qz, dz) * B(qz, dz) * D(qx, qy, qz, e);
--               }
--            }
--         }
--      }
--      for (int qx = 0; qx < Q1D; ++qx)
--      {
--         for (int dz = 0; dz < D1D; ++dz)
--         {
--            for (int dy = 0; dy < D1D; ++dy)
--            {
--               QDD[qx][dy][dz] = 0.0;
--               for (int qy = 0; qy < Q1D; ++qy)
--               {
--                  QDD[qx][dy][dz] += B(qy, dy) * B(qy, dy) * QQD[qx][qy][dz];
--               }
--            }
--         }
--      }
--      for (int dz = 0; dz < D1D; ++dz)
--      {
--         for (int dy = 0; dy < D1D; ++dy)
--         {
--            for (int dx = 0; dx < D1D; ++dx)
--            {
--               double t = 0.0;
--               for (int qx = 0; qx < Q1D; ++qx)
--               {
--                  t += B(qx, dx) * B(qx, dx) * QDD[qx][dy][dz];
--               }
--               Y(dx, dy, dz, e) += t;
--            }
--         }
--      }
--   });
--}
--
--template<int T_D1D = 0, int T_Q1D = 0>
--static void SmemPAMassAssembleDiagonal3D(const int NE,
--                                         const Array<double> &b_,
--                                         const Vector &d_,
--                                         Vector &y_,
--                                         const int d1d = 0,
--                                         const int q1d = 0)
--{
--   const int D1D = T_D1D ? T_D1D : d1d;
--   const int Q1D = T_Q1D ? T_Q1D : q1d;
--   constexpr int MQ1 = T_Q1D ? T_Q1D : MAX_Q1D;
--   constexpr int MD1 = T_D1D ? T_D1D : MAX_D1D;
--   MFEM_VERIFY(D1D <= MD1, "");
--   MFEM_VERIFY(Q1D <= MQ1, "");
--   auto b = Reshape(b_.Read(), Q1D, D1D);
--   auto D = Reshape(d_.Read(), Q1D, Q1D, Q1D, NE);
--   auto Y = Reshape(y_.ReadWrite(), D1D, D1D, D1D, NE);
--   mfem::forall_3D(NE, Q1D, Q1D, Q1D, [=] MFEM_HOST_DEVICE (int e)
--   {
--      const int tidz = MFEM_THREAD_ID(z);
--      const int D1D = T_D1D ? T_D1D : d1d;
--      const int Q1D = T_Q1D ? T_Q1D : q1d;
--      constexpr int MD1 = T_D1D ? T_D1D : MAX_D1D;
--      constexpr int MQ1 = T_Q1D ? T_Q1D : MAX_Q1D;
--      MFEM_SHARED double B[MQ1][MD1];
--      MFEM_SHARED double QQD[MQ1][MQ1][MD1];
--      MFEM_SHARED double QDD[MQ1][MD1][MD1];
--      if (tidz == 0)
--      {
--         MFEM_FOREACH_THREAD(d,y,D1D)
--         {
--            MFEM_FOREACH_THREAD(q,x,Q1D)
--            {
--               B[q][d] = b(q,d);
--            }
--         }
--      }
--      MFEM_SYNC_THREAD;
--      MFEM_FOREACH_THREAD(qx,x,Q1D)
--      {
--         MFEM_FOREACH_THREAD(qy,y,Q1D)
--         {
--            MFEM_FOREACH_THREAD(dz,z,D1D)
--            {
--               QQD[qx][qy][dz] = 0.0;
--               for (int qz = 0; qz < Q1D; ++qz)
--               {
--                  QQD[qx][qy][dz] += B[qz][dz] * B[qz][dz] * D(qx, qy, qz, e);
--               }
--            }
--         }
--      }
--      MFEM_SYNC_THREAD;
--      MFEM_FOREACH_THREAD(qx,x,Q1D)
--      {
--         MFEM_FOREACH_THREAD(dz,z,D1D)
--         {
--            MFEM_FOREACH_THREAD(dy,y,D1D)
--            {
--               QDD[qx][dy][dz] = 0.0;
--               for (int qy = 0; qy < Q1D; ++qy)
--               {
--                  QDD[qx][dy][dz] += B[qy][dy] * B[qy][dy] * QQD[qx][qy][dz];
--               }
--            }
--         }
--      }
--      MFEM_SYNC_THREAD;
--      MFEM_FOREACH_THREAD(dz,z,D1D)
--      {
--         MFEM_FOREACH_THREAD(dy,y,D1D)
--         {
--            MFEM_FOREACH_THREAD(dx,x,D1D)
--            {
--               double t = 0.0;
--               for (int qx = 0; qx < Q1D; ++qx)
--               {
--                  t += B[qx][dx] * B[qx][dx] * QDD[qx][dy][dz];
--               }
--               Y(dx, dy, dz, e) += t;
--            }
--         }
--      }
--   });
--}
--
--static void PAMassAssembleDiagonal(const int dim, const int D1D,
--                                   const int Q1D, const int NE,
--                                   const Array<double> &B,
--                                   const Vector &D,
--                                   Vector &Y)
--{
--   if (dim == 2)
--   {
--      switch ((D1D << 4 ) | Q1D)
--      {
--         case 0x22: return SmemPAMassAssembleDiagonal2D<2,2,16>(NE,B,D,Y);
--         case 0x33: return SmemPAMassAssembleDiagonal2D<3,3,16>(NE,B,D,Y);
--         case 0x44: return SmemPAMassAssembleDiagonal2D<4,4,8>(NE,B,D,Y);
--         case 0x55: return SmemPAMassAssembleDiagonal2D<5,5,8>(NE,B,D,Y);
--         case 0x66: return SmemPAMassAssembleDiagonal2D<6,6,4>(NE,B,D,Y);
--         case 0x77: return SmemPAMassAssembleDiagonal2D<7,7,4>(NE,B,D,Y);
--         case 0x88: return SmemPAMassAssembleDiagonal2D<8,8,2>(NE,B,D,Y);
--         case 0x99: return SmemPAMassAssembleDiagonal2D<9,9,2>(NE,B,D,Y);
--         default:   return PAMassAssembleDiagonal2D(NE,B,D,Y,D1D,Q1D);
--      }
--   }
--   else if (dim == 3)
--   {
--      switch ((D1D << 4 ) | Q1D)
--      {
--         case 0x23: return SmemPAMassAssembleDiagonal3D<2,3>(NE,B,D,Y);
--         case 0x24: return SmemPAMassAssembleDiagonal3D<2,4>(NE,B,D,Y);
--         case 0x26: return SmemPAMassAssembleDiagonal3D<2,6>(NE,B,D,Y);
--         case 0x34: return SmemPAMassAssembleDiagonal3D<3,4>(NE,B,D,Y);
--         case 0x35: return SmemPAMassAssembleDiagonal3D<3,5>(NE,B,D,Y);
--         case 0x45: return SmemPAMassAssembleDiagonal3D<4,5>(NE,B,D,Y);
--         case 0x48: return SmemPAMassAssembleDiagonal3D<4,8>(NE,B,D,Y);
--         case 0x56: return SmemPAMassAssembleDiagonal3D<5,6>(NE,B,D,Y);
--         case 0x67: return SmemPAMassAssembleDiagonal3D<6,7>(NE,B,D,Y);
--         case 0x78: return SmemPAMassAssembleDiagonal3D<7,8>(NE,B,D,Y);
--         case 0x89: return SmemPAMassAssembleDiagonal3D<8,9>(NE,B,D,Y);
--         default:   return PAMassAssembleDiagonal3D(NE,B,D,Y,D1D,Q1D);
--      }
--   }
--   MFEM_ABORT("Unknown kernel.");
--}
--
--void MassIntegrator::AssembleDiagonalPA(Vector &diag)
--{
--   if (DeviceCanUseCeed())
--   {
--      ceedOp->GetDiagonal(diag);
--   }
--   else
--   {
--      PAMassAssembleDiagonal(dim, dofs1D, quad1D, ne, maps->B, pa_data, diag);
--   }
--}
--
--
--#ifdef MFEM_USE_OCCA
--// OCCA PA Mass Apply 2D kernel
--static void OccaPAMassApply2D(const int D1D,
--                              const int Q1D,
--                              const int NE,
--                              const Array<double> &B,
--                              const Array<double> &Bt,
--                              const Vector &D,
--                              const Vector &X,
--                              Vector &Y)
--{
--   occa::properties props;
--   props["defines/D1D"] = D1D;
--   props["defines/Q1D"] = Q1D;
--   const occa::memory o_B = OccaMemoryRead(B.GetMemory(), B.Size());
--   const occa::memory o_Bt = OccaMemoryRead(Bt.GetMemory(), Bt.Size());
--   const occa::memory o_D = OccaMemoryRead(D.GetMemory(), D.Size());
--   const occa::memory o_X = OccaMemoryRead(X.GetMemory(), X.Size());
--   occa::memory o_Y = OccaMemoryReadWrite(Y.GetMemory(), Y.Size());
--   const occa_id_t id = std::make_pair(D1D,Q1D);
--   if (!Device::Allows(Backend::OCCA_CUDA))
--   {
--      static occa_kernel_t OccaMassApply2D_cpu;
--      if (OccaMassApply2D_cpu.find(id) == OccaMassApply2D_cpu.end())
--      {
--         const occa::kernel MassApply2D_CPU =
--            mfem::OccaDev().buildKernel("occa://mfem/fem/occa.okl",
--                                        "MassApply2D_CPU", props);
--         OccaMassApply2D_cpu.emplace(id, MassApply2D_CPU);
--      }
--      OccaMassApply2D_cpu.at(id)(NE, o_B, o_Bt, o_D, o_X, o_Y);
--   }
--   else
--   {
--      static occa_kernel_t OccaMassApply2D_gpu;
--      if (OccaMassApply2D_gpu.find(id) == OccaMassApply2D_gpu.end())
--      {
--         const occa::kernel MassApply2D_GPU =
--            mfem::OccaDev().buildKernel("occa://mfem/fem/occa.okl",
--                                        "MassApply2D_GPU", props);
--         OccaMassApply2D_gpu.emplace(id, MassApply2D_GPU);
--      }
--      OccaMassApply2D_gpu.at(id)(NE, o_B, o_Bt, o_D, o_X, o_Y);
--   }
--}
--
--// OCCA PA Mass Apply 3D kernel
--static void OccaPAMassApply3D(const int D1D,
--                              const int Q1D,
--                              const int NE,
--                              const Array<double> &B,
--                              const Array<double> &Bt,
--                              const Vector &D,
--                              const Vector &X,
--                              Vector &Y)
--{
--   occa::properties props;
--   props["defines/D1D"] = D1D;
--   props["defines/Q1D"] = Q1D;
--   const occa::memory o_B = OccaMemoryRead(B.GetMemory(), B.Size());
--   const occa::memory o_Bt = OccaMemoryRead(Bt.GetMemory(), Bt.Size());
--   const occa::memory o_D = OccaMemoryRead(D.GetMemory(), D.Size());
--   const occa::memory o_X = OccaMemoryRead(X.GetMemory(), X.Size());
--   occa::memory o_Y = OccaMemoryReadWrite(Y.GetMemory(), Y.Size());
--   const occa_id_t id = std::make_pair(D1D,Q1D);
--   if (!Device::Allows(Backend::OCCA_CUDA))
--   {
--      static occa_kernel_t OccaMassApply3D_cpu;
--      if (OccaMassApply3D_cpu.find(id) == OccaMassApply3D_cpu.end())
--      {
--         const occa::kernel MassApply3D_CPU =
--            mfem::OccaDev().buildKernel("occa://mfem/fem/occa.okl",
--                                        "MassApply3D_CPU", props);
--         OccaMassApply3D_cpu.emplace(id, MassApply3D_CPU);
--      }
--      OccaMassApply3D_cpu.at(id)(NE, o_B, o_Bt, o_D, o_X, o_Y);
--   }
--   else
--   {
--      static occa_kernel_t OccaMassApply3D_gpu;
--      if (OccaMassApply3D_gpu.find(id) == OccaMassApply3D_gpu.end())
--      {
--         const occa::kernel MassApply3D_GPU =
--            mfem::OccaDev().buildKernel("occa://mfem/fem/occa.okl",
--                                        "MassApply3D_GPU", props);
--         OccaMassApply3D_gpu.emplace(id, MassApply3D_GPU);
--      }
--      OccaMassApply3D_gpu.at(id)(NE, o_B, o_Bt, o_D, o_X, o_Y);
--   }
--}
--#endif // MFEM_USE_OCCA
--
--template<int T_D1D = 0, int T_Q1D = 0>
--static void PAMassApply2D(const int NE,
--                          const Array<double> &b_,
--                          const Array<double> &bt_,
--                          const Vector &d_,
--                          const Vector &x_,
--                          Vector &y_,
--                          const int d1d = 0,
--                          const int q1d = 0)
--{
--   MFEM_VERIFY(T_D1D ? T_D1D : d1d <= MAX_D1D, "");
--   MFEM_VERIFY(T_Q1D ? T_Q1D : q1d <= MAX_Q1D, "");
--
--   const auto B = b_.Read();
--   const auto Bt = bt_.Read();
--   const auto D = d_.Read();
--   const auto X = x_.Read();
--   auto Y = y_.ReadWrite();
--
--   mfem::forall(NE, [=] MFEM_HOST_DEVICE (int e)
--   {
--      internal::PAMassApply2D_Element(e, NE, B, Bt, D, X, Y, d1d, q1d);
--   });
--}
--
--template<int T_D1D = 0, int T_Q1D = 0, int T_NBZ = 0>
--static void SmemPAMassApply2D(const int NE,
--                              const Array<double> &b_,
--                              const Array<double> &bt_,
--                              const Vector &d_,
--                              const Vector &x_,
--                              Vector &y_,
--                              const int d1d = 0,
--                              const int q1d = 0)
--{
--   MFEM_CONTRACT_VAR(bt_);
--   const int D1D = T_D1D ? T_D1D : d1d;
--   const int Q1D = T_Q1D ? T_Q1D : q1d;
--   constexpr int NBZ = T_NBZ ? T_NBZ : 1;
--   constexpr int MQ1 = T_Q1D ? T_Q1D : MAX_Q1D;
--   constexpr int MD1 = T_D1D ? T_D1D : MAX_D1D;
--   MFEM_VERIFY(D1D <= MD1, "");
--   MFEM_VERIFY(Q1D <= MQ1, "");
--   const auto b = b_.Read();
--   const auto D = d_.Read();
--   const auto x = x_.Read();
--   auto Y = y_.ReadWrite();
--   mfem::forall_2D_batch(NE, Q1D, Q1D, NBZ, [=] MFEM_HOST_DEVICE (int e)
--   {
--      internal::SmemPAMassApply2D_Element<T_D1D,T_Q1D,T_NBZ>(e, NE, b, D, x, Y, d1d,
--                                                             q1d);
--   });
--}
--
--template<int T_D1D = 0, int T_Q1D = 0>
--static void PAMassApply3D(const int NE,
--                          const Array<double> &b_,
--                          const Array<double> &bt_,
--                          const Vector &d_,
--                          const Vector &x_,
--                          Vector &y_,
--                          const int d1d = 0,
--                          const int q1d = 0)
--{
--   MFEM_VERIFY(T_D1D ? T_D1D : d1d <= MAX_D1D, "");
--   MFEM_VERIFY(T_Q1D ? T_Q1D : q1d <= MAX_Q1D, "");
--
--   const auto B = b_.Read();
--   const auto Bt = bt_.Read();
--   const auto D = d_.Read();
--   const auto X = x_.Read();
--   auto Y = y_.ReadWrite();
--
--   mfem::forall(NE, [=] MFEM_HOST_DEVICE (int e)
--   {
--      internal::PAMassApply3D_Element(e, NE, B, Bt, D, X, Y, d1d, q1d);
--   });
--}
--
--template<int T_D1D = 0, int T_Q1D = 0>
--static void SmemPAMassApply3D(const int NE,
--                              const Array<double> &b_,
--                              const Array<double> &bt_,
--                              const Vector &d_,
--                              const Vector &x_,
--                              Vector &y_,
--                              const int d1d = 0,
--                              const int q1d = 0)
--{
--   MFEM_CONTRACT_VAR(bt_);
--   const int D1D = T_D1D ? T_D1D : d1d;
--   const int Q1D = T_Q1D ? T_Q1D : q1d;
--   constexpr int M1Q = T_Q1D ? T_Q1D : MAX_Q1D;
--   constexpr int M1D = T_D1D ? T_D1D : MAX_D1D;
--   MFEM_VERIFY(D1D <= M1D, "");
--   MFEM_VERIFY(Q1D <= M1Q, "");
--   auto b = b_.Read();
--   auto d = d_.Read();
--   auto x = x_.Read();
--   auto y = y_.ReadWrite();
--   mfem::forall_2D(NE, Q1D, Q1D, [=] MFEM_HOST_DEVICE (int e)
--   {
--      internal::SmemPAMassApply3D_Element<T_D1D,T_Q1D>(e, NE, b, d, x, y, d1d, q1d);
--   });
--}
--
--static void PAMassApply(const int dim,
--                        const int D1D,
--                        const int Q1D,
--                        const int NE,
--                        const Array<double> &B,
--                        const Array<double> &Bt,
--                        const Vector &D,
--                        const Vector &X,
--                        Vector &Y)
--{
--#ifdef MFEM_USE_OCCA
--   if (DeviceCanUseOcca())
--   {
--      if (dim == 2)
--      {
--         return OccaPAMassApply2D(D1D,Q1D,NE,B,Bt,D,X,Y);
--      }
--      if (dim == 3)
--      {
--         return OccaPAMassApply3D(D1D,Q1D,NE,B,Bt,D,X,Y);
--      }
--      MFEM_ABORT("OCCA PA Mass Apply unknown kernel!");
--   }
--#endif // MFEM_USE_OCCA
--   const int id = (D1D << 4) | Q1D;
--
--   if (dim == 2)
--   {
--      switch (id)
--      {
--         case 0x22: return SmemPAMassApply2D<2,2,16>(NE,B,Bt,D,X,Y);
--         case 0x24: return SmemPAMassApply2D<2,4,16>(NE,B,Bt,D,X,Y);
--         case 0x33: return SmemPAMassApply2D<3,3,16>(NE,B,Bt,D,X,Y);
--         case 0x34: return SmemPAMassApply2D<3,4,16>(NE,B,Bt,D,X,Y);
--         case 0x35: return SmemPAMassApply2D<3,5,16>(NE,B,Bt,D,X,Y);
--         case 0x36: return SmemPAMassApply2D<3,6,16>(NE,B,Bt,D,X,Y);
--         case 0x44: return SmemPAMassApply2D<4,4,8>(NE,B,Bt,D,X,Y);
--         case 0x46: return SmemPAMassApply2D<4,6,8>(NE,B,Bt,D,X,Y);
--         case 0x48: return SmemPAMassApply2D<4,8,4>(NE,B,Bt,D,X,Y);
--         case 0x55: return SmemPAMassApply2D<5,5,8>(NE,B,Bt,D,X,Y);
--         case 0x57: return SmemPAMassApply2D<5,7,8>(NE,B,Bt,D,X,Y);
--         case 0x58: return SmemPAMassApply2D<5,8,2>(NE,B,Bt,D,X,Y);
--         case 0x66: return SmemPAMassApply2D<6,6,4>(NE,B,Bt,D,X,Y);
--         case 0x77: return SmemPAMassApply2D<7,7,4>(NE,B,Bt,D,X,Y);
--         case 0x88: return SmemPAMassApply2D<8,8,2>(NE,B,Bt,D,X,Y);
--         case 0x99: return SmemPAMassApply2D<9,9,2>(NE,B,Bt,D,X,Y);
--         default:   return PAMassApply2D(NE,B,Bt,D,X,Y,D1D,Q1D);
--      }
--   }
--   else if (dim == 3)
--   {
--      switch (id)
--      {
--         case 0x22: return SmemPAMassApply3D<2,2>(NE,B,Bt,D,X,Y);
--         case 0x23: return SmemPAMassApply3D<2,3>(NE,B,Bt,D,X,Y);
--         case 0x24: return SmemPAMassApply3D<2,4>(NE,B,Bt,D,X,Y);
--         case 0x26: return SmemPAMassApply3D<2,6>(NE,B,Bt,D,X,Y);
--         case 0x34: return SmemPAMassApply3D<3,4>(NE,B,Bt,D,X,Y);
--         case 0x35: return SmemPAMassApply3D<3,5>(NE,B,Bt,D,X,Y);
--         case 0x36: return SmemPAMassApply3D<3,6>(NE,B,Bt,D,X,Y);
--         case 0x37: return SmemPAMassApply3D<3,7>(NE,B,Bt,D,X,Y);
--         case 0x45: return SmemPAMassApply3D<4,5>(NE,B,Bt,D,X,Y);
--         case 0x46: return SmemPAMassApply3D<4,6>(NE,B,Bt,D,X,Y);
--         case 0x48: return SmemPAMassApply3D<4,8>(NE,B,Bt,D,X,Y);
--         case 0x56: return SmemPAMassApply3D<5,6>(NE,B,Bt,D,X,Y);
--         case 0x58: return SmemPAMassApply3D<5,8>(NE,B,Bt,D,X,Y);
--         case 0x67: return SmemPAMassApply3D<6,7>(NE,B,Bt,D,X,Y);
--         case 0x78: return SmemPAMassApply3D<7,8>(NE,B,Bt,D,X,Y);
--         case 0x89: return SmemPAMassApply3D<8,9>(NE,B,Bt,D,X,Y);
--         case 0x9A: return SmemPAMassApply3D<9,10>(NE,B,Bt,D,X,Y);
--         default:   return PAMassApply3D(NE,B,Bt,D,X,Y,D1D,Q1D);
--      }
--   }
--   mfem::out << "Unknown kernel 0x" << std::hex << id << std::endl;
--   MFEM_ABORT("Unknown kernel.");
--}
--
--void MassIntegrator::AddMultPA(const Vector &x, Vector &y) const
--{
--   if (DeviceCanUseCeed())
--   {
--      ceedOp->AddMult(x, y);
--   }
--   else
--   {
--      PAMassApply(dim, dofs1D, quad1D, ne, maps->B, maps->Bt, pa_data, x, y);
--   }
--}
--
--void MassIntegrator::AddMultTransposePA(const Vector &x, Vector &y) const
--{
--   // Mass integrator is symmetric
--   AddMultPA(x, y);
--}
--
--} // namespace mfem
-diff --git a/fem/bilininteg_mass_pa.hpp b/fem/bilininteg_mass_pa.hpp
-deleted file mode 100644
-index 73c8892e1..000000000
---- a/fem/bilininteg_mass_pa.hpp
-+++ /dev/null
-@@ -1,632 +0,0 @@
--// Copyright (c) 2010-2023, Lawrence Livermore National Security, LLC. Produced
--// at the Lawrence Livermore National Laboratory. All Rights reserved. See files
--// LICENSE and NOTICE for details. LLNL-CODE-806117.
--//
--// This file is part of the MFEM library. For more information and source code
--// availability visit https://mfem.org.
--//
--// MFEM is free software; you can redistribute it and/or modify it under the
--// terms of the BSD-3 license. We welcome feedback and contributions, see file
--// CONTRIBUTING.md for details.
--
--#ifndef MFEM_BILININTEG_MASS_PA_HPP
--#define MFEM_BILININTEG_MASS_PA_HPP
--
--#include "../config/config.hpp"
--#include "../general/forall.hpp"
--#include "../linalg/dtensor.hpp"
--
--namespace mfem
--{
--
--namespace internal
--{
--
--template <bool ACCUMULATE = true>
--MFEM_HOST_DEVICE inline
--void PAMassApply2D_Element(const int e,
--                           const int NE,
--                           const double *b_,
--                           const double *bt_,
--                           const double *d_,
--                           const double *x_,
--                           double *y_,
--                           const int d1d = 0,
--                           const int q1d = 0)
--{
--   const int D1D = d1d;
--   const int Q1D = q1d;
--   auto B = ConstDeviceMatrix(b_, Q1D, D1D);
--   auto Bt = ConstDeviceMatrix(bt_, D1D, Q1D);
--   auto D = ConstDeviceCube(d_, Q1D, Q1D, NE);
--   auto X = ConstDeviceCube(x_, D1D, D1D, NE);
--   auto Y = DeviceCube(y_, D1D, D1D, NE);
--
--   if (!ACCUMULATE)
--   {
--      for (int dy = 0; dy < D1D; ++dy)
--      {
--         for (int dx = 0; dx < D1D; ++dx)
--         {
--            Y(dx, dy, e) = 0.0;
--         }
--      }
--   }
--
--   constexpr int max_D1D = MAX_D1D;
--   constexpr int max_Q1D = MAX_Q1D;
--   double sol_xy[max_Q1D][max_Q1D];
--   for (int qy = 0; qy < Q1D; ++qy)
--   {
--      for (int qx = 0; qx < Q1D; ++qx)
--      {
--         sol_xy[qy][qx] = 0.0;
--      }
--   }
--   for (int dy = 0; dy < D1D; ++dy)
--   {
--      double sol_x[max_Q1D];
--      for (int qy = 0; qy < Q1D; ++qy)
--      {
--         sol_x[qy] = 0.0;
--      }
--      for (int dx = 0; dx < D1D; ++dx)
--      {
--         const double s = X(dx,dy,e);
--         for (int qx = 0; qx < Q1D; ++qx)
--         {
--            sol_x[qx] += B(qx,dx)* s;
--         }
--      }
--      for (int qy = 0; qy < Q1D; ++qy)
--      {
--         const double d2q = B(qy,dy);
--         for (int qx = 0; qx < Q1D; ++qx)
--         {
--            sol_xy[qy][qx] += d2q * sol_x[qx];
--         }
--      }
--   }
--   for (int qy = 0; qy < Q1D; ++qy)
--   {
--      for (int qx = 0; qx < Q1D; ++qx)
--      {
--         sol_xy[qy][qx] *= D(qx,qy,e);
--      }
--   }
--   for (int qy = 0; qy < Q1D; ++qy)
--   {
--      double sol_x[max_D1D];
--      for (int dx = 0; dx < D1D; ++dx)
--      {
--         sol_x[dx] = 0.0;
--      }
--      for (int qx = 0; qx < Q1D; ++qx)
--      {
--         const double s = sol_xy[qy][qx];
--         for (int dx = 0; dx < D1D; ++dx)
--         {
--            sol_x[dx] += Bt(dx,qx) * s;
--         }
--      }
--      for (int dy = 0; dy < D1D; ++dy)
--      {
--         const double q2d = Bt(dy,qy);
--         for (int dx = 0; dx < D1D; ++dx)
--         {
--            Y(dx,dy,e) += q2d * sol_x[dx];
--         }
--      }
--   }
--}
--
--template<int T_D1D, int T_Q1D, int T_NBZ, bool ACCUMULATE = true>
--MFEM_HOST_DEVICE inline
--void SmemPAMassApply2D_Element(const int e,
--                               const int NE,
--                               const double *b_,
--                               const double *d_,
--                               const double *x_,
--                               double *y_,
--                               int d1d = 0,
--                               int q1d = 0)
--{
--   const int D1D = T_D1D ? T_D1D : d1d;
--   const int Q1D = T_Q1D ? T_Q1D : q1d;
--   constexpr int NBZ = T_NBZ ? T_NBZ : 1;
--
--   constexpr int MQ1 = T_Q1D ? T_Q1D : MAX_Q1D;
--   constexpr int MD1 = T_D1D ? T_D1D : MAX_D1D;
--   constexpr int MDQ = (MQ1 > MD1) ? MQ1 : MD1;
--
--   auto b = ConstDeviceMatrix(b_, Q1D, D1D);
--   auto D = ConstDeviceCube(d_, Q1D, Q1D, NE);
--   auto x = ConstDeviceCube(x_, D1D, D1D, NE);
--   auto Y = DeviceCube(y_, D1D, D1D, NE);
--
--   const int tidz = MFEM_THREAD_ID(z);
--
--   MFEM_SHARED double BBt[MQ1*MD1];
--   double (*B)[MD1] = (double (*)[MD1]) BBt;
--   double (*Bt)[MQ1] = (double (*)[MQ1]) BBt;
--   MFEM_SHARED double sm0[NBZ][MDQ*MDQ];
--   MFEM_SHARED double sm1[NBZ][MDQ*MDQ];
--   double (*X)[MD1] = (double (*)[MD1]) (sm0 + tidz);
--   double (*DQ)[MQ1] = (double (*)[MQ1]) (sm1 + tidz);
--   double (*QQ)[MQ1] = (double (*)[MQ1]) (sm0 + tidz);
--   double (*QD)[MD1] = (double (*)[MD1]) (sm1 + tidz);
--
--
--   MFEM_FOREACH_THREAD(dy,y,D1D)
--   {
--      MFEM_FOREACH_THREAD(dx,x,D1D)
--      {
--         X[dy][dx] = x(dx,dy,e);
--      }
--   }
--   if (tidz == 0)
--   {
--      MFEM_FOREACH_THREAD(dy,y,D1D)
--      {
--         MFEM_FOREACH_THREAD(q,x,Q1D)
--         {
--            B[q][dy] = b(q,dy);
--         }
--      }
--   }
--   MFEM_SYNC_THREAD;
--   MFEM_FOREACH_THREAD(dy,y,D1D)
--   {
--      MFEM_FOREACH_THREAD(qx,x,Q1D)
--      {
--         double dq = 0.0;
--         for (int dx = 0; dx < D1D; ++dx)
--         {
--            dq += X[dy][dx] * B[qx][dx];
--         }
--         DQ[dy][qx] = dq;
--      }
--   }
--   MFEM_SYNC_THREAD;
--   MFEM_FOREACH_THREAD(qy,y,Q1D)
--   {
--      MFEM_FOREACH_THREAD(qx,x,Q1D)
--      {
--         double qq = 0.0;
--         for (int dy = 0; dy < D1D; ++dy)
--         {
--            qq += DQ[dy][qx] * B[qy][dy];
--         }
--         QQ[qy][qx] = qq * D(qx, qy, e);
--      }
--   }
--   MFEM_SYNC_THREAD;
--   if (tidz == 0)
--   {
--      MFEM_FOREACH_THREAD(dy,y,D1D)
--      {
--         MFEM_FOREACH_THREAD(q,x,Q1D)
--         {
--            Bt[dy][q] = b(q,dy);
--         }
--      }
--   }
--   MFEM_SYNC_THREAD;
--   MFEM_FOREACH_THREAD(qy,y,Q1D)
--   {
--      MFEM_FOREACH_THREAD(dx,x,D1D)
--      {
--         double dq = 0.0;
--         for (int qx = 0; qx < Q1D; ++qx)
--         {
--            dq += QQ[qy][qx] * Bt[dx][qx];
--         }
--         QD[qy][dx] = dq;
--      }
--   }
--   MFEM_SYNC_THREAD;
--   MFEM_FOREACH_THREAD(dy,y,D1D)
--   {
--      MFEM_FOREACH_THREAD(dx,x,D1D)
--      {
--         double dd = 0.0;
--         for (int qy = 0; qy < Q1D; ++qy)
--         {
--            dd += (QD[qy][dx] * Bt[dy][qy]);
--         }
--         if (ACCUMULATE)
--         {
--            Y(dx, dy, e) += dd;
--         }
--         else
--         {
--            Y(dx, dy, e) = dd;
--         }
--      }
--   }
--}
--
--template <bool ACCUMULATE = true>
--MFEM_HOST_DEVICE inline
--void PAMassApply3D_Element(const int e,
--                           const int NE,
--                           const double *b_,
--                           const double *bt_,
--                           const double *d_,
--                           const double *x_,
--                           double *y_,
--                           const int d1d,
--                           const int q1d)
--{
--   const int D1D = d1d;
--   const int Q1D = q1d;
--   auto B = ConstDeviceMatrix(b_, Q1D, D1D);
--   auto Bt = ConstDeviceMatrix(bt_, D1D, Q1D);
--   auto D = DeviceTensor<4,const double>(d_, Q1D, Q1D, Q1D, NE);
--   auto X = DeviceTensor<4,const double>(x_, D1D, D1D, D1D, NE);
--   auto Y = DeviceTensor<4,double>(y_, D1D, D1D, D1D, NE);
--
--   if (!ACCUMULATE)
--   {
--      for (int dz = 0; dz < D1D; ++dz)
--      {
--         for (int dy = 0; dy < D1D; ++dy)
--         {
--            for (int dx = 0; dx < D1D; ++dx)
--            {
--               Y(dx, dy, dz, e) = 0.0;
--            }
--         }
--      }
--   }
--
--   constexpr int max_D1D = MAX_D1D;
--   constexpr int max_Q1D = MAX_Q1D;
--   double sol_xyz[max_Q1D][max_Q1D][max_Q1D];
--   for (int qz = 0; qz < Q1D; ++qz)
--   {
--      for (int qy = 0; qy < Q1D; ++qy)
--      {
--         for (int qx = 0; qx < Q1D; ++qx)
--         {
--            sol_xyz[qz][qy][qx] = 0.0;
--         }
--      }
--   }
--   for (int dz = 0; dz < D1D; ++dz)
--   {
--      double sol_xy[max_Q1D][max_Q1D];
--      for (int qy = 0; qy < Q1D; ++qy)
--      {
--         for (int qx = 0; qx < Q1D; ++qx)
--         {
--            sol_xy[qy][qx] = 0.0;
--         }
--      }
--      for (int dy = 0; dy < D1D; ++dy)
--      {
--         double sol_x[max_Q1D];
--         for (int qx = 0; qx < Q1D; ++qx)
--         {
--            sol_x[qx] = 0;
--         }
--         for (int dx = 0; dx < D1D; ++dx)
--         {
--            const double s = X(dx,dy,dz,e);
--            for (int qx = 0; qx < Q1D; ++qx)
--            {
--               sol_x[qx] += B(qx,dx) * s;
--            }
--         }
--         for (int qy = 0; qy < Q1D; ++qy)
--         {
--            const double wy = B(qy,dy);
--            for (int qx = 0; qx < Q1D; ++qx)
--            {
--               sol_xy[qy][qx] += wy * sol_x[qx];
--            }
--         }
--      }
--      for (int qz = 0; qz < Q1D; ++qz)
--      {
--         const double wz = B(qz,dz);
--         for (int qy = 0; qy < Q1D; ++qy)
--         {
--            for (int qx = 0; qx < Q1D; ++qx)
--            {
--               sol_xyz[qz][qy][qx] += wz * sol_xy[qy][qx];
--            }
--         }
--      }
--   }
--   for (int qz = 0; qz < Q1D; ++qz)
--   {
--      for (int qy = 0; qy < Q1D; ++qy)
--      {
--         for (int qx = 0; qx < Q1D; ++qx)
--         {
--            sol_xyz[qz][qy][qx] *= D(qx,qy,qz,e);
--         }
--      }
--   }
--   for (int qz = 0; qz < Q1D; ++qz)
--   {
--      double sol_xy[max_D1D][max_D1D];
--      for (int dy = 0; dy < D1D; ++dy)
--      {
--         for (int dx = 0; dx < D1D; ++dx)
--         {
--            sol_xy[dy][dx] = 0;
--         }
--      }
--      for (int qy = 0; qy < Q1D; ++qy)
--      {
--         double sol_x[max_D1D];
--         for (int dx = 0; dx < D1D; ++dx)
--         {
--            sol_x[dx] = 0;
--         }
--         for (int qx = 0; qx < Q1D; ++qx)
--         {
--            const double s = sol_xyz[qz][qy][qx];
--            for (int dx = 0; dx < D1D; ++dx)
--            {
--               sol_x[dx] += Bt(dx,qx) * s;
--            }
--         }
--         for (int dy = 0; dy < D1D; ++dy)
--         {
--            const double wy = Bt(dy,qy);
--            for (int dx = 0; dx < D1D; ++dx)
--            {
--               sol_xy[dy][dx] += wy * sol_x[dx];
--            }
--         }
--      }
--      for (int dz = 0; dz < D1D; ++dz)
--      {
--         const double wz = Bt(dz,qz);
--         for (int dy = 0; dy < D1D; ++dy)
--         {
--            for (int dx = 0; dx < D1D; ++dx)
--            {
--               Y(dx,dy,dz,e) += wz * sol_xy[dy][dx];
--            }
--         }
--      }
--   }
--}
--
--template<int T_D1D, int T_Q1D, bool ACCUMULATE = true>
--MFEM_HOST_DEVICE inline
--void SmemPAMassApply3D_Element(const int e,
--                               const int NE,
--                               const double *b_,
--                               const double *d_,
--                               const double *x_,
--                               double *y_,
--                               const int d1d = 0,
--                               const int q1d = 0)
--{
--   constexpr int D1D = T_D1D ? T_D1D : d1d;
--   constexpr int Q1D = T_Q1D ? T_Q1D : q1d;
--   constexpr int MQ1 = T_Q1D ? T_Q1D : MAX_Q1D;
--   constexpr int MD1 = T_D1D ? T_D1D : MAX_D1D;
--   constexpr int MDQ = (MQ1 > MD1) ? MQ1 : MD1;
--
--   auto b = ConstDeviceMatrix(b_, Q1D, D1D);
--   auto d = DeviceTensor<4,const double>(d_, Q1D, Q1D, Q1D, NE);
--   auto x = DeviceTensor<4,const double>(x_, D1D, D1D, D1D, NE);
--   auto y = DeviceTensor<4,double>(y_, D1D, D1D, D1D, NE);
--
--   MFEM_SHARED double sDQ[MQ1*MD1];
--   double (*B)[MD1] = (double (*)[MD1]) sDQ;
--   double (*Bt)[MQ1] = (double (*)[MQ1]) sDQ;
--   MFEM_SHARED double sm0[MDQ*MDQ*MDQ];
--   MFEM_SHARED double sm1[MDQ*MDQ*MDQ];
--   double (*X)[MD1][MD1]   = (double (*)[MD1][MD1]) sm0;
--   double (*DDQ)[MD1][MQ1] = (double (*)[MD1][MQ1]) sm1;
--   double (*DQQ)[MQ1][MQ1] = (double (*)[MQ1][MQ1]) sm0;
--   double (*QQQ)[MQ1][MQ1] = (double (*)[MQ1][MQ1]) sm1;
--   double (*QQD)[MQ1][MD1] = (double (*)[MQ1][MD1]) sm0;
--   double (*QDD)[MD1][MD1] = (double (*)[MD1][MD1]) sm1;
--   MFEM_FOREACH_THREAD(dy,y,D1D)
--   {
--      MFEM_FOREACH_THREAD(dx,x,D1D)
--      {
--         MFEM_UNROLL(MD1)
--         for (int dz = 0; dz < D1D; ++dz)
--         {
--            X[dz][dy][dx] = x(dx,dy,dz,e);
--         }
--      }
--      MFEM_FOREACH_THREAD(dx,x,Q1D)
--      {
--         B[dx][dy] = b(dx,dy);
--      }
--   }
--   MFEM_SYNC_THREAD;
--   MFEM_FOREACH_THREAD(dy,y,D1D)
--   {
--      MFEM_FOREACH_THREAD(qx,x,Q1D)
--      {
--         double u[D1D];
--         MFEM_UNROLL(MD1)
--         for (int dz = 0; dz < D1D; dz++)
--         {
--            u[dz] = 0;
--         }
--         MFEM_UNROLL(MD1)
--         for (int dx = 0; dx < D1D; ++dx)
--         {
--            MFEM_UNROLL(MD1)
--            for (int dz = 0; dz < D1D; ++dz)
--            {
--               u[dz] += X[dz][dy][dx] * B[qx][dx];
--            }
--         }
--         MFEM_UNROLL(MD1)
--         for (int dz = 0; dz < D1D; ++dz)
--         {
--            DDQ[dz][dy][qx] = u[dz];
--         }
--      }
--   }
--   MFEM_SYNC_THREAD;
--   MFEM_FOREACH_THREAD(qy,y,Q1D)
--   {
--      MFEM_FOREACH_THREAD(qx,x,Q1D)
--      {
--         double u[D1D];
--         MFEM_UNROLL(MD1)
--         for (int dz = 0; dz < D1D; dz++)
--         {
--            u[dz] = 0;
--         }
--         MFEM_UNROLL(MD1)
--         for (int dy = 0; dy < D1D; ++dy)
--         {
--            MFEM_UNROLL(MD1)
--            for (int dz = 0; dz < D1D; dz++)
--            {
--               u[dz] += DDQ[dz][dy][qx] * B[qy][dy];
--            }
--         }
--         MFEM_UNROLL(MD1)
--         for (int dz = 0; dz < D1D; dz++)
--         {
--            DQQ[dz][qy][qx] = u[dz];
--         }
--      }
--   }
--   MFEM_SYNC_THREAD;
--   MFEM_FOREACH_THREAD(qy,y,Q1D)
--   {
--      MFEM_FOREACH_THREAD(qx,x,Q1D)
--      {
--         double u[Q1D];
--         MFEM_UNROLL(MQ1)
--         for (int qz = 0; qz < Q1D; qz++)
--         {
--            u[qz] = 0;
--         }
--         MFEM_UNROLL(MD1)
--         for (int dz = 0; dz < D1D; ++dz)
--         {
--            MFEM_UNROLL(MQ1)
--            for (int qz = 0; qz < Q1D; qz++)
--            {
--               u[qz] += DQQ[dz][qy][qx] * B[qz][dz];
--            }
--         }
--         MFEM_UNROLL(MQ1)
--         for (int qz = 0; qz < Q1D; qz++)
--         {
--            QQQ[qz][qy][qx] = u[qz] * d(qx,qy,qz,e);
--         }
--      }
--   }
--   MFEM_SYNC_THREAD;
--   MFEM_FOREACH_THREAD(di,y,D1D)
--   {
--      MFEM_FOREACH_THREAD(q,x,Q1D)
--      {
--         Bt[di][q] = b(q,di);
--      }
--   }
--   MFEM_SYNC_THREAD;
--   MFEM_FOREACH_THREAD(qy,y,Q1D)
--   {
--      MFEM_FOREACH_THREAD(dx,x,D1D)
--      {
--         double u[Q1D];
--         MFEM_UNROLL(MQ1)
--         for (int qz = 0; qz < Q1D; ++qz)
--         {
--            u[qz] = 0;
--         }
--         MFEM_UNROLL(MQ1)
--         for (int qx = 0; qx < Q1D; ++qx)
--         {
--            MFEM_UNROLL(MQ1)
--            for (int qz = 0; qz < Q1D; ++qz)
--            {
--               u[qz] += QQQ[qz][qy][qx] * Bt[dx][qx];
--            }
--         }
--         MFEM_UNROLL(MQ1)
--         for (int qz = 0; qz < Q1D; ++qz)
--         {
--            QQD[qz][qy][dx] = u[qz];
--         }
--      }
--   }
--   MFEM_SYNC_THREAD;
--   MFEM_FOREACH_THREAD(dy,y,D1D)
--   {
--      MFEM_FOREACH_THREAD(dx,x,D1D)
--      {
--         double u[Q1D];
--         MFEM_UNROLL(MQ1)
--         for (int qz = 0; qz < Q1D; ++qz)
--         {
--            u[qz] = 0;
--         }
--         MFEM_UNROLL(MQ1)
--         for (int qy = 0; qy < Q1D; ++qy)
--         {
--            MFEM_UNROLL(MQ1)
--            for (int qz = 0; qz < Q1D; ++qz)
--            {
--               u[qz] += QQD[qz][qy][dx] * Bt[dy][qy];
--            }
--         }
--         MFEM_UNROLL(MQ1)
--         for (int qz = 0; qz < Q1D; ++qz)
--         {
--            QDD[qz][dy][dx] = u[qz];
--         }
--      }
--   }
--   MFEM_SYNC_THREAD;
--   MFEM_FOREACH_THREAD(dy,y,D1D)
--   {
--      MFEM_FOREACH_THREAD(dx,x,D1D)
--      {
--         double u[D1D];
--         MFEM_UNROLL(MD1)
--         for (int dz = 0; dz < D1D; ++dz)
--         {
--            u[dz] = 0;
--         }
--         MFEM_UNROLL(MQ1)
--         for (int qz = 0; qz < Q1D; ++qz)
--         {
--            MFEM_UNROLL(MD1)
--            for (int dz = 0; dz < D1D; ++dz)
--            {
--               u[dz] += QDD[qz][dy][dx] * Bt[dz][qz];
--            }
--         }
--         MFEM_UNROLL(MD1)
--         for (int dz = 0; dz < D1D; ++dz)
--         {
--            if (ACCUMULATE)
--            {
--               y(dx,dy,dz,e) += u[dz];
--            }
--            else
--            {
--               y(dx,dy,dz,e) = u[dz];
--            }
--         }
--      }
--   }
--   MFEM_SYNC_THREAD;
--}
--
--} // namespace internal
--
--} // namespace mfem
--
--#endif
-diff --git a/fem/bilininteg_transpose_ea.cpp b/fem/bilininteg_transpose_ea.cpp
-deleted file mode 100644
-index bea53f1b3..000000000
---- a/fem/bilininteg_transpose_ea.cpp
-+++ /dev/null
-@@ -1,186 +0,0 @@
--// Copyright (c) 2010-2023, Lawrence Livermore National Security, LLC. Produced
--// at the Lawrence Livermore National Laboratory. All Rights reserved. See files
--// LICENSE and NOTICE for details. LLNL-CODE-806117.
--//
--// This file is part of the MFEM library. For more information and source code
--// availability visit https://mfem.org.
--//
--// MFEM is free software; you can redistribute it and/or modify it under the
--// terms of the BSD-3 license. We welcome feedback and contributions, see file
--// CONTRIBUTING.md for details.
--
--#include "../general/forall.hpp"
--#include "bilininteg.hpp"
--
--namespace mfem
--{
--
--void TransposeIntegrator::AssembleEA(const FiniteElementSpace &fes,
--                                     Vector &ea_data, const bool add)
--{
--   if (add)
--   {
--      Vector ea_data_tmp(ea_data.Size());
--      bfi->AssembleEA(fes, ea_data_tmp, false);
--      const int ne = fes.GetNE();
--      if (ne == 0) { return; }
--      const int dofs = fes.GetFE(0)->GetDof();
--      auto A = Reshape(ea_data_tmp.Read(), dofs, dofs, ne);
--      auto AT = Reshape(ea_data.ReadWrite(), dofs, dofs, ne);
--      mfem::forall(ne, [=] MFEM_HOST_DEVICE (int e)
--      {
--         for (int i = 0; i < dofs; i++)
--         {
--            for (int j = 0; j < dofs; j++)
--            {
--               const double a = A(i, j, e);
--               AT(j, i, e) += a;
--            }
--         }
--      });
--   }
--   else
--   {
--      bfi->AssembleEA(fes, ea_data, false);
--      const int ne = fes.GetNE();
--      if (ne == 0) { return; }
--      const int dofs = fes.GetFE(0)->GetDof();
--      auto A = Reshape(ea_data.ReadWrite(), dofs, dofs, ne);
--      mfem::forall(ne, [=] MFEM_HOST_DEVICE (int e)
--      {
--         for (int i = 0; i < dofs; i++)
--         {
--            for (int j = i+1; j < dofs; j++)
--            {
--               const double aij = A(i, j, e);
--               const double aji = A(j, i, e);
--               A(j, i, e) = aij;
--               A(i, j, e) = aji;
--            }
--         }
--      });
--   }
--}
--
--void TransposeIntegrator::AssembleEAInteriorFaces(const FiniteElementSpace& fes,
--                                                  Vector &ea_data_int,
--                                                  Vector &ea_data_ext,
--                                                  const bool add)
--{
--   const int nf = fes.GetNFbyType(FaceType::Interior);
--   if (nf == 0) { return; }
--   if (add)
--   {
--      Vector ea_data_int_tmp(ea_data_int.Size());
--      Vector ea_data_ext_tmp(ea_data_ext.Size());
--      bfi->AssembleEAInteriorFaces(fes, ea_data_int_tmp, ea_data_ext_tmp, false);
--      const int faceDofs = fes.GetTraceElement(0,
--                                               fes.GetMesh()->GetFaceGeometry(0))->GetDof();
--      auto A_int = Reshape(ea_data_int_tmp.Read(), faceDofs, faceDofs, 2, nf);
--      auto A_ext = Reshape(ea_data_ext_tmp.Read(), faceDofs, faceDofs, 2, nf);
--      auto AT_int = Reshape(ea_data_int.ReadWrite(), faceDofs, faceDofs, 2, nf);
--      auto AT_ext = Reshape(ea_data_ext.ReadWrite(), faceDofs, faceDofs, 2, nf);
--      mfem::forall(nf, [=] MFEM_HOST_DEVICE (int f)
--      {
--         for (int i = 0; i < faceDofs; i++)
--         {
--            for (int j = 0; j < faceDofs; j++)
--            {
--               const double a_int0 = A_int(i, j, 0, f);
--               const double a_int1 = A_int(i, j, 1, f);
--               const double a_ext0 = A_ext(i, j, 0, f);
--               const double a_ext1 = A_ext(i, j, 1, f);
--               AT_int(j, i, 0, f) += a_int0;
--               AT_int(j, i, 1, f) += a_int1;
--               AT_ext(j, i, 0, f) += a_ext1;
--               AT_ext(j, i, 1, f) += a_ext0;
--            }
--         }
--      });
--   }
--   else
--   {
--      bfi->AssembleEAInteriorFaces(fes, ea_data_int, ea_data_ext, false);
--      const int faceDofs = fes.GetTraceElement(0,
--                                               fes.GetMesh()->GetFaceGeometry(0))->GetDof();
--      auto A_int = Reshape(ea_data_int.ReadWrite(), faceDofs, faceDofs, 2, nf);
--      auto A_ext = Reshape(ea_data_ext.ReadWrite(), faceDofs, faceDofs, 2, nf);
--      mfem::forall(nf, [=] MFEM_HOST_DEVICE (int f)
--      {
--         for (int i = 0; i < faceDofs; i++)
--         {
--            for (int j = i+1; j < faceDofs; j++)
--            {
--               const double aij_int0 = A_int(i, j, 0, f);
--               const double aij_int1 = A_int(i, j, 1, f);
--               const double aji_int0 = A_int(j, i, 0, f);
--               const double aji_int1 = A_int(j, i, 1, f);
--               A_int(j, i, 0, f) = aij_int0;
--               A_int(j, i, 1, f) = aij_int1;
--               A_int(i, j, 0, f) = aji_int0;
--               A_int(i, j, 1, f) = aji_int1;
--            }
--         }
--         for (int i = 0; i < faceDofs; i++)
--         {
--            for (int j = 0; j < faceDofs; j++)
--            {
--               const double aij_ext0 = A_ext(i, j, 0, f);
--               const double aji_ext1 = A_ext(j, i, 1, f);
--               A_ext(j, i, 1, f) = aij_ext0;
--               A_ext(i, j, 0, f) = aji_ext1;
--            }
--         }
--      });
--   }
--}
--
--void TransposeIntegrator::AssembleEABoundaryFaces(const FiniteElementSpace& fes,
--                                                  Vector &ea_data_bdr,
--                                                  const bool add)
--{
--   const int nf = fes.GetNFbyType(FaceType::Boundary);
--   if (nf == 0) { return; }
--   if (add)
--   {
--      Vector ea_data_bdr_tmp(ea_data_bdr.Size());
--      bfi->AssembleEABoundaryFaces(fes, ea_data_bdr_tmp, false);
--      const int faceDofs = fes.GetTraceElement(0,
--                                               fes.GetMesh()->GetFaceGeometry(0))->GetDof();
--      auto A_bdr = Reshape(ea_data_bdr_tmp.Read(), faceDofs, faceDofs, nf);
--      auto AT_bdr = Reshape(ea_data_bdr.ReadWrite(), faceDofs, faceDofs, nf);
--      mfem::forall(nf, [=] MFEM_HOST_DEVICE (int f)
--      {
--         for (int i = 0; i < faceDofs; i++)
--         {
--            for (int j = 0; j < faceDofs; j++)
--            {
--               const double a_bdr = A_bdr(i, j, f);
--               AT_bdr(j, i, f) += a_bdr;
--            }
--         }
--      });
--   }
--   else
--   {
--      bfi->AssembleEABoundaryFaces(fes, ea_data_bdr, false);
--      const int faceDofs = fes.GetTraceElement(0,
--                                               fes.GetMesh()->GetFaceGeometry(0))->GetDof();
--      auto A_bdr = Reshape(ea_data_bdr.ReadWrite(), faceDofs, faceDofs, nf);
--      mfem::forall(nf, [=] MFEM_HOST_DEVICE (int f)
--      {
--         for (int i = 0; i < faceDofs; i++)
--         {
--            for (int j = i+1; j < faceDofs; j++)
--            {
--               const double aij_bdr = A_bdr(i, j, f);
--               const double aji_bdr = A_bdr(j, i, f);
--               A_bdr(j, i, f) = aij_bdr;
--               A_bdr(i, j, f) = aji_bdr;
--            }
--         }
--      });
--   }
--}
--
--}
-diff --git a/fem/bilininteg_vectorfe.cpp b/fem/bilininteg_vectorfe.cpp
-deleted file mode 100644
-index 4c2180cf5..000000000
---- a/fem/bilininteg_vectorfe.cpp
-+++ /dev/null
-@@ -1,1144 +0,0 @@
--// Copyright (c) 2010-2023, Lawrence Livermore National Security, LLC. Produced
--// at the Lawrence Livermore National Laboratory. All Rights reserved. See files
--// LICENSE and NOTICE for details. LLNL-CODE-806117.
--//
--// This file is part of the MFEM library. For more information and source code
--// availability visit https://mfem.org.
--//
--// MFEM is free software; you can redistribute it and/or modify it under the
--// terms of the BSD-3 license. We welcome feedback and contributions, see file
--// CONTRIBUTING.md for details.
--
--#include "../general/forall.hpp"
--#include "bilininteg.hpp"
--#include "qspace.hpp"
--#include "gridfunc.hpp"
--
--namespace mfem
--{
--
--void PADiffusionSetup3D(const int Q1D,
--                        const int coeffDim,
--                        const int NE,
--                        const Array<double> &w,
--                        const Vector &j,
--                        const Vector &coeff_,
--                        Vector &op);
--
--void PAHcurlMassAssembleDiagonal2D(const int D1D,
--                                   const int Q1D,
--                                   const int NE,
--                                   const bool symmetric,
--                                   const Array<double> &bo,
--                                   const Array<double> &bc,
--                                   const Vector &pa_data,
--                                   Vector &diag);
--
--void PAHcurlMassAssembleDiagonal3D(const int D1D,
--                                   const int Q1D,
--                                   const int NE,
--                                   const bool symmetric,
--                                   const Array<double> &bo,
--                                   const Array<double> &bc,
--                                   const Vector &pa_data,
--                                   Vector &diag);
--
--template<int T_D1D = 0, int T_Q1D = 0>
--void SmemPAHcurlMassAssembleDiagonal3D(const int D1D,
--                                       const int Q1D,
--                                       const int NE,
--                                       const bool symmetric,
--                                       const Array<double> &bo,
--                                       const Array<double> &bc,
--                                       const Vector &pa_data,
--                                       Vector &diag);
--
--void PAHcurlMassApply2D(const int D1D,
--                        const int Q1D,
--                        const int NE,
--                        const bool symmetric,
--                        const Array<double> &bo,
--                        const Array<double> &bc,
--                        const Array<double> &bot,
--                        const Array<double> &bct,
--                        const Vector &pa_data,
--                        const Vector &x,
--                        Vector &y);
--
--void PAHcurlMassApply3D(const int D1D,
--                        const int Q1D,
--                        const int NE,
--                        const bool symmetric,
--                        const Array<double> &bo,
--                        const Array<double> &bc,
--                        const Array<double> &bot,
--                        const Array<double> &bct,
--                        const Vector &pa_data,
--                        const Vector &x,
--                        Vector &y);
--
--template<int T_D1D = 0, int T_Q1D = 0>
--void SmemPAHcurlMassApply3D(const int D1D,
--                            const int Q1D,
--                            const int NE,
--                            const bool symmetric,
--                            const Array<double> &bo,
--                            const Array<double> &bc,
--                            const Array<double> &bot,
--                            const Array<double> &bct,
--                            const Vector &pa_data,
--                            const Vector &x,
--                            Vector &y);
--
--void PAHdivSetup2D(const int Q1D,
--                   const int coeffDim,
--                   const int NE,
--                   const Array<double> &w,
--                   const Vector &j,
--                   Vector &coeff_,
--                   Vector &op);
--
--void PAHdivSetup3D(const int Q1D,
--                   const int coeffDim,
--                   const int NE,
--                   const Array<double> &w,
--                   const Vector &j,
--                   Vector &coeff_,
--                   Vector &op);
--
--void PAHcurlH1Apply2D(const int D1D,
--                      const int Q1D,
--                      const int NE,
--                      const Array<double> &bc,
--                      const Array<double> &gc,
--                      const Array<double> &bot,
--                      const Array<double> &bct,
--                      const Vector &pa_data,
--                      const Vector &x,
--                      Vector &y);
--
--void PAHcurlH1ApplyTranspose2D(const int D1D,
--                               const int Q1D,
--                               const int NE,
--                               const Array<double> &bc,
--                               const Array<double> &bo,
--                               const Array<double> &bct,
--                               const Array<double> &gct,
--                               const Vector &pa_data,
--                               const Vector &x,
--                               Vector &y);
--
--void PAHcurlH1Apply3D(const int D1D,
--                      const int Q1D,
--                      const int NE,
--                      const Array<double> &bc,
--                      const Array<double> &gc,
--                      const Array<double> &bot,
--                      const Array<double> &bct,
--                      const Vector &pa_data,
--                      const Vector &x,
--                      Vector &y);
--
--void PAHcurlH1ApplyTranspose3D(const int D1D,
--                               const int Q1D,
--                               const int NE,
--                               const Array<double> &bc,
--                               const Array<double> &bo,
--                               const Array<double> &bct,
--                               const Array<double> &gct,
--                               const Vector &pa_data,
--                               const Vector &x,
--                               Vector &y);
--
--void PAHdivMassAssembleDiagonal2D(const int D1D,
--                                  const int Q1D,
--                                  const int NE,
--                                  const bool symmetric,
--                                  const Array<double> &Bo_,
--                                  const Array<double> &Bc_,
--                                  const Vector &op_,
--                                  Vector &diag_);
--
--void PAHdivMassAssembleDiagonal3D(const int D1D,
--                                  const int Q1D,
--                                  const int NE,
--                                  const bool symmetric,
--                                  const Array<double> &Bo_,
--                                  const Array<double> &Bc_,
--                                  const Vector &op_,
--                                  Vector &diag_);
--
--void PAHdivMassApply(const int dim,
--                     const int D1D,
--                     const int Q1D,
--                     const int NE,
--                     const bool symmetric,
--                     const Array<double> &Bo,
--                     const Array<double> &Bc,
--                     const Array<double> &Bot,
--                     const Array<double> &Bct,
--                     const Vector &op,
--                     const Vector &x,
--                     Vector &y);
--
--void PAHcurlL2Setup(const int NQ,
--                    const int coeffDim,
--                    const int NE,
--                    const Array<double> &w,
--                    Vector &coeff_,
--                    Vector &op);
--
--// PA H(curl) x H(div) mass assemble 3D kernel, with factor
--// dF^{-1} C dF for a vector or matrix coefficient C.
--// If transpose, use dF^T C dF^{-T} for H(div) x H(curl).
--void PAHcurlHdivSetup3D(const int Q1D,
--                        const int coeffDim,
--                        const int NE,
--                        const bool transpose,
--                        const Array<double> &w_,
--                        const Vector &j,
--                        Vector &coeff_,
--                        Vector &op)
--{
--   const bool symmetric = (coeffDim != 9);
--   auto W = Reshape(w_.Read(), Q1D, Q1D, Q1D);
--   auto J = Reshape(j.Read(), Q1D, Q1D, Q1D, 3, 3, NE);
--   auto coeff = Reshape(coeff_.Read(), coeffDim, Q1D, Q1D, Q1D, NE);
--   auto y = Reshape(op.Write(), 9, Q1D, Q1D, Q1D, NE);
--
--   const int i11 = 0;
--   const int i12 = transpose ? 3 : 1;
--   const int i13 = transpose ? 6 : 2;
--   const int i21 = transpose ? 1 : 3;
--   const int i22 = 4;
--   const int i23 = transpose ? 7 : 5;
--   const int i31 = transpose ? 2 : 6;
--   const int i32 = transpose ? 5 : 7;
--   const int i33 = 8;
--
--   mfem::forall_3D(NE, Q1D, Q1D, Q1D, [=] MFEM_HOST_DEVICE (int e)
--   {
--      MFEM_FOREACH_THREAD(qx,x,Q1D)
--      {
--         MFEM_FOREACH_THREAD(qy,y,Q1D)
--         {
--            MFEM_FOREACH_THREAD(qz,z,Q1D)
--            {
--               const double J11 = J(qx,qy,qz,0,0,e);
--               const double J21 = J(qx,qy,qz,1,0,e);
--               const double J31 = J(qx,qy,qz,2,0,e);
--               const double J12 = J(qx,qy,qz,0,1,e);
--               const double J22 = J(qx,qy,qz,1,1,e);
--               const double J32 = J(qx,qy,qz,2,1,e);
--               const double J13 = J(qx,qy,qz,0,2,e);
--               const double J23 = J(qx,qy,qz,1,2,e);
--               const double J33 = J(qx,qy,qz,2,2,e);
--               const double detJ = J11 * (J22 * J33 - J32 * J23) -
--                                   J21 * (J12 * J33 - J32 * J13) +
--                                   J31 * (J12 * J23 - J22 * J13);
--               const double w_detJ = W(qx,qy,qz) / detJ;
--               // adj(J)
--               const double A11 = (J22 * J33) - (J23 * J32);
--               const double A12 = (J32 * J13) - (J12 * J33);
--               const double A13 = (J12 * J23) - (J22 * J13);
--               const double A21 = (J31 * J23) - (J21 * J33);
--               const double A22 = (J11 * J33) - (J13 * J31);
--               const double A23 = (J21 * J13) - (J11 * J23);
--               const double A31 = (J21 * J32) - (J31 * J22);
--               const double A32 = (J31 * J12) - (J11 * J32);
--               const double A33 = (J11 * J22) - (J12 * J21);
--
--               if (coeffDim == 6 || coeffDim == 9) // Matrix coefficient version
--               {
--                  // First compute entries of R = M^T J
--                  const double M11 = (!symmetric) ? coeff(i11,qx,qy,qz,e) : coeff(0,qx,qy,qz,e);
--                  const double M12 = (!symmetric) ? coeff(i12,qx,qy,qz,e) : coeff(1,qx,qy,qz,e);
--                  const double M13 = (!symmetric) ? coeff(i13,qx,qy,qz,e) : coeff(2,qx,qy,qz,e);
--                  const double M21 = (!symmetric) ? coeff(i21,qx,qy,qz,e) : M12;
--                  const double M22 = (!symmetric) ? coeff(i22,qx,qy,qz,e) : coeff(3,qx,qy,qz,e);
--                  const double M23 = (!symmetric) ? coeff(i23,qx,qy,qz,e) : coeff(4,qx,qy,qz,e);
--                  const double M31 = (!symmetric) ? coeff(i31,qx,qy,qz,e) : M13;
--                  const double M32 = (!symmetric) ? coeff(i32,qx,qy,qz,e) : M23;
--                  const double M33 = (!symmetric) ? coeff(i33,qx,qy,qz,e) : coeff(5,qx,qy,qz,e);
--
--                  const double R11 = M11*J11 + M21*J21 + M31*J31;
--                  const double R12 = M11*J12 + M21*J22 + M31*J32;
--                  const double R13 = M11*J13 + M21*J23 + M31*J33;
--                  const double R21 = M12*J11 + M22*J21 + M32*J31;
--                  const double R22 = M12*J12 + M22*J22 + M32*J32;
--                  const double R23 = M12*J13 + M22*J23 + M32*J33;
--                  const double R31 = M13*J11 + M23*J21 + M33*J31;
--                  const double R32 = M13*J12 + M23*J22 + M33*J32;
--                  const double R33 = M13*J13 + M23*J23 + M33*J33;
--
--                  // y = (J^{-1} M^T J)^T
--                  y(i11,qx,qy,qz,e) = w_detJ * (A11*R11 + A12*R21 + A13*R31); // 1,1
--                  y(i21,qx,qy,qz,e) = w_detJ * (A11*R12 + A12*R22 + A13*R32); // 1,2
--                  y(i31,qx,qy,qz,e) = w_detJ * (A11*R13 + A12*R23 + A13*R33); // 1,3
--                  y(i12,qx,qy,qz,e) = w_detJ * (A21*R11 + A22*R21 + A23*R31); // 2,1
--                  y(i22,qx,qy,qz,e) = w_detJ * (A21*R12 + A22*R22 + A23*R32); // 2,2
--                  y(i32,qx,qy,qz,e) = w_detJ * (A21*R13 + A22*R23 + A23*R33); // 2,3
--                  y(i13,qx,qy,qz,e) = w_detJ * (A31*R11 + A32*R21 + A33*R31); // 3,1
--                  y(i23,qx,qy,qz,e) = w_detJ * (A31*R12 + A32*R22 + A33*R32); // 3,2
--                  y(i33,qx,qy,qz,e) = w_detJ * (A31*R13 + A32*R23 + A33*R33); // 3,3
--               }
--               else if (coeffDim == 3)  // Vector coefficient version
--               {
--                  const double D1 = coeff(0,qx,qy,qz,e);
--                  const double D2 = coeff(1,qx,qy,qz,e);
--                  const double D3 = coeff(2,qx,qy,qz,e);
--                  // detJ J^{-1} DJ = adj(J) DJ
--                  // transpose
--                  y(i11,qx,qy,qz,e) = w_detJ * (D1*A11*J11 + D2*A12*J21 + D3*A13*J31); // 1,1
--                  y(i21,qx,qy,qz,e) = w_detJ * (D1*A11*J12 + D2*A12*J22 + D3*A13*J32); // 1,2
--                  y(i31,qx,qy,qz,e) = w_detJ * (D1*A11*J13 + D2*A12*J23 + D3*A13*J33); // 1,3
--                  y(i12,qx,qy,qz,e) = w_detJ * (D1*A21*J11 + D2*A22*J21 + D3*A23*J31); // 2,1
--                  y(i22,qx,qy,qz,e) = w_detJ * (D1*A21*J12 + D2*A22*J22 + D3*A23*J32); // 2,2
--                  y(i32,qx,qy,qz,e) = w_detJ * (D1*A21*J13 + D2*A22*J23 + D3*A23*J33); // 2,3
--                  y(i13,qx,qy,qz,e) = w_detJ * (D1*A31*J11 + D2*A32*J21 + D3*A33*J31); // 3,1
--                  y(i23,qx,qy,qz,e) = w_detJ * (D1*A31*J12 + D2*A32*J22 + D3*A33*J32); // 3,2
--                  y(i33,qx,qy,qz,e) = w_detJ * (D1*A31*J13 + D2*A32*J23 + D3*A33*J33); // 3,3
--               }
--            }
--         }
--      }
--   });
--}
--
--// PA H(curl) x H(div) mass assemble 2D kernel, with factor
--// dF^{-1} C dF for a vector or matrix coefficient C.
--// If transpose, use dF^T C dF^{-T} for H(div) x H(curl).
--void PAHcurlHdivSetup2D(const int Q1D,
--                        const int coeffDim,
--                        const int NE,
--                        const bool transpose,
--                        const Array<double> &w_,
--                        const Vector &j,
--                        Vector &coeff_,
--                        Vector &op)
--{
--   const bool symmetric = (coeffDim != 4);
--   auto W = Reshape(w_.Read(), Q1D, Q1D);
--   auto J = Reshape(j.Read(), Q1D, Q1D, 2, 2, NE);
--   auto coeff = Reshape(coeff_.Read(), coeffDim, Q1D, Q1D, NE);
--   auto y = Reshape(op.Write(), 4, Q1D, Q1D, NE);
--
--   const int i11 = 0;
--   const int i12 = transpose ? 2 : 1;
--   const int i21 = transpose ? 1 : 2;
--   const int i22 = 3;
--
--   mfem::forall_2D(NE, Q1D, Q1D, [=] MFEM_HOST_DEVICE (int e)
--   {
--      MFEM_FOREACH_THREAD(qx,x,Q1D)
--      {
--         MFEM_FOREACH_THREAD(qy,y,Q1D)
--         {
--            const double J11 = J(qx,qy,0,0,e);
--            const double J21 = J(qx,qy,1,0,e);
--            const double J12 = J(qx,qy,0,1,e);
--            const double J22 = J(qx,qy,1,1,e);
--            const double w_detJ = W(qx,qy) / ((J11*J22) - (J21*J12));
--
--            if (coeffDim == 3 || coeffDim == 4) // Matrix coefficient version
--            {
--               // First compute entries of R = MJ
--               const double M11 = coeff(i11,qx,qy,e);
--               const double M12 = (!symmetric) ? coeff(i12,qx,qy,e) : coeff(1,qx,qy,e);
--               const double M21 = (!symmetric) ? coeff(i21,qx,qy,e) : M12;
--               const double M22 = (!symmetric) ? coeff(i22,qx,qy,e) : coeff(2,qx,qy,e);
--
--               // J^{-1} M^T
--               const double R11 = ( J22*M11 - J12*M12); // 1,1
--               const double R12 = ( J22*M21 - J12*M22); // 1,2
--               const double R21 = (-J21*M11 + J11*M12); // 2,1
--               const double R22 = (-J21*M21 + J11*M22); // 2,2
--
--               // (RJ)^T
--               y(i11,qx,qy,e) = w_detJ * (R11*J11 + R12*J21); // 1,1
--               y(i21,qx,qy,e) = w_detJ * (R11*J12 + R12*J22); // 1,2 (transpose)
--               y(i12,qx,qy,e) = w_detJ * (R21*J11 + R22*J21); // 2,1 (transpose)
--               y(i22,qx,qy,e) = w_detJ * (R21*J12 + R22*J22); // 2,2
--            }
--            else if (coeffDim == 2) // Vector coefficient version
--            {
--               const double D1 = coeff(0,qx,qy,e);
--               const double D2 = coeff(1,qx,qy,e);
--               const double R11 = D1*J11;
--               const double R12 = D1*J12;
--               const double R21 = D2*J21;
--               const double R22 = D2*J22;
--               y(i11,qx,qy,e) = w_detJ * ( J22*R11 - J12*R21); // 1,1
--               y(i21,qx,qy,e) = w_detJ * ( J22*R12 - J12*R22); // 1,2 (transpose)
--               y(i12,qx,qy,e) = w_detJ * (-J21*R11 + J11*R21); // 2,1 (transpose)
--               y(i22,qx,qy,e) = w_detJ * (-J21*R12 + J11*R22); // 2,2
--            }
--         }
--      }
--   });
--}
--
--// Mass operator for H(curl) and H(div) functions, using Piola transformations
--// u = dF^{-T} \hat{u} in H(curl), v = (1 / det dF) dF \hat{v} in H(div).
--void PAHcurlHdivMassApply3D(const int D1D,
--                            const int D1Dtest,
--                            const int Q1D,
--                            const int NE,
--                            const bool scalarCoeff,
--                            const bool trialHcurl,
--                            const bool transpose,
--                            const Array<double> &Bo_,
--                            const Array<double> &Bc_,
--                            const Array<double> &Bot_,
--                            const Array<double> &Bct_,
--                            const Vector &op_,
--                            const Vector &x_,
--                            Vector &y_)
--{
--   constexpr static int MAX_D1D = HCURL_MAX_D1D;
--   constexpr static int MAX_Q1D = HCURL_MAX_Q1D;
--
--   MFEM_VERIFY(D1D <= MAX_D1D, "Error: D1D > MAX_D1D");
--   MFEM_VERIFY(Q1D <= MAX_Q1D, "Error: Q1D > MAX_Q1D");
--   constexpr static int VDIM = 3;
--
--   auto Bo = Reshape(Bo_.Read(), Q1D, D1D-1);
--   auto Bc = Reshape(Bc_.Read(), Q1D, D1D);
--   auto Bot = Reshape(Bot_.Read(), D1Dtest-1, Q1D);
--   auto Bct = Reshape(Bct_.Read(), D1Dtest, Q1D);
--   auto op = Reshape(op_.Read(), scalarCoeff ? 1 : 9, Q1D, Q1D, Q1D, NE);
--   auto x = Reshape(x_.Read(), 3*(D1D-1)*D1D*(trialHcurl ? D1D : D1D-1), NE);
--   auto y = Reshape(y_.ReadWrite(), 3*(D1Dtest-1)*D1Dtest*
--                    (trialHcurl ? D1Dtest-1 : D1Dtest), NE);
--
--   const int i12 = transpose ? 3 : 1;
--   const int i13 = transpose ? 6 : 2;
--   const int i21 = transpose ? 1 : 3;
--   const int i23 = transpose ? 7 : 5;
--   const int i31 = transpose ? 2 : 6;
--   const int i32 = transpose ? 5 : 7;
--
--   mfem::forall(NE, [=] MFEM_HOST_DEVICE (int e)
--   {
--      double mass[MAX_Q1D][MAX_Q1D][MAX_Q1D][VDIM];
--
--      for (int qz = 0; qz < Q1D; ++qz)
--      {
--         for (int qy = 0; qy < Q1D; ++qy)
--         {
--            for (int qx = 0; qx < Q1D; ++qx)
--            {
--               for (int c = 0; c < VDIM; ++c)
--               {
--                  mass[qz][qy][qx][c] = 0.0;
--               }
--            }
--         }
--      }
--
--      int osc = 0;
--      for (int c = 0; c < VDIM; ++c)  // loop over x, y, z trial components
--      {
--         const int D1Dz = trialHcurl ? ((c == 2) ? D1D - 1 : D1D) :
--                          ((c == 2) ? D1D : D1D - 1);
--         const int D1Dy = trialHcurl ? ((c == 1) ? D1D - 1 : D1D) :
--                          ((c == 1) ? D1D : D1D - 1);
--         const int D1Dx = trialHcurl ? ((c == 0) ? D1D - 1 : D1D) :
--                          ((c == 0) ? D1D : D1D - 1);
--
--         for (int dz = 0; dz < D1Dz; ++dz)
--         {
--            double massXY[MAX_Q1D][MAX_Q1D];
--            for (int qy = 0; qy < Q1D; ++qy)
--            {
--               for (int qx = 0; qx < Q1D; ++qx)
--               {
--                  massXY[qy][qx] = 0.0;
--               }
--            }
--
--            for (int dy = 0; dy < D1Dy; ++dy)
--            {
--               double massX[MAX_Q1D];
--               for (int qx = 0; qx < Q1D; ++qx)
--               {
--                  massX[qx] = 0.0;
--               }
--
--               for (int dx = 0; dx < D1Dx; ++dx)
--               {
--                  const double t = x(dx + ((dy + (dz * D1Dy)) * D1Dx) + osc, e);
--                  for (int qx = 0; qx < Q1D; ++qx)
--                  {
--                     massX[qx] += t * (trialHcurl ? ((c == 0) ? Bo(qx,dx) : Bc(qx,dx)) :
--                                       ((c == 0) ? Bc(qx,dx) : Bo(qx,dx)));
--                  }
--               }
--
--               for (int qy = 0; qy < Q1D; ++qy)
--               {
--                  const double wy = trialHcurl ? ((c == 1) ? Bo(qy,dy) : Bc(qy,dy)) :
--                                    ((c == 1) ? Bc(qy,dy) : Bo(qy,dy));
--                  for (int qx = 0; qx < Q1D; ++qx)
--                  {
--                     const double wx = massX[qx];
--                     massXY[qy][qx] += wx * wy;
--                  }
--               }
--            }
--
--            for (int qz = 0; qz < Q1D; ++qz)
--            {
--               const double wz = trialHcurl ? ((c == 2) ? Bo(qz,dz) : Bc(qz,dz)) :
--                                 ((c == 2) ? Bc(qz,dz) : Bo(qz,dz));
--               for (int qy = 0; qy < Q1D; ++qy)
--               {
--                  for (int qx = 0; qx < Q1D; ++qx)
--                  {
--                     mass[qz][qy][qx][c] += massXY[qy][qx] * wz;
--                  }
--               }
--            }
--         }
--
--         osc += D1Dx * D1Dy * D1Dz;
--      }  // loop (c) over components
--
--      // Apply D operator.
--      for (int qz = 0; qz < Q1D; ++qz)
--      {
--         for (int qy = 0; qy < Q1D; ++qy)
--         {
--            for (int qx = 0; qx < Q1D; ++qx)
--            {
--               const double O11 = op(0,qx,qy,qz,e);
--               const double O12 = scalarCoeff ? 0.0 : op(i12,qx,qy,qz,e);
--               const double O13 = scalarCoeff ? 0.0 : op(i13,qx,qy,qz,e);
--               const double O21 = scalarCoeff ? 0.0 : op(i21,qx,qy,qz,e);
--               const double O22 = scalarCoeff ? O11 : op(4,qx,qy,qz,e);
--               const double O23 = scalarCoeff ? 0.0 : op(i23,qx,qy,qz,e);
--               const double O31 = scalarCoeff ? 0.0 : op(i31,qx,qy,qz,e);
--               const double O32 = scalarCoeff ? 0.0 : op(i32,qx,qy,qz,e);
--               const double O33 = scalarCoeff ? O11 : op(8,qx,qy,qz,e);
--               const double massX = mass[qz][qy][qx][0];
--               const double massY = mass[qz][qy][qx][1];
--               const double massZ = mass[qz][qy][qx][2];
--               mass[qz][qy][qx][0] = (O11*massX)+(O12*massY)+(O13*massZ);
--               mass[qz][qy][qx][1] = (O21*massX)+(O22*massY)+(O23*massZ);
--               mass[qz][qy][qx][2] = (O31*massX)+(O32*massY)+(O33*massZ);
--            }
--         }
--      }
--
--      for (int qz = 0; qz < Q1D; ++qz)
--      {
--         double massXY[HDIV_MAX_D1D][HDIV_MAX_D1D];
--
--         osc = 0;
--         for (int c = 0; c < VDIM; ++c)  // loop over x, y, z test components
--         {
--            const int D1Dz = trialHcurl ? ((c == 2) ? D1Dtest : D1Dtest - 1) :
--                             ((c == 2) ? D1Dtest - 1 : D1Dtest);
--            const int D1Dy = trialHcurl ? ((c == 1) ? D1Dtest : D1Dtest - 1) :
--                             ((c == 1) ? D1Dtest - 1 : D1Dtest);
--            const int D1Dx = trialHcurl ? ((c == 0) ? D1Dtest : D1Dtest - 1) :
--                             ((c == 0) ? D1Dtest - 1 : D1Dtest);
--
--            for (int dy = 0; dy < D1Dy; ++dy)
--            {
--               for (int dx = 0; dx < D1Dx; ++dx)
--               {
--                  massXY[dy][dx] = 0.0;
--               }
--            }
--            for (int qy = 0; qy < Q1D; ++qy)
--            {
--               double massX[HDIV_MAX_D1D];
--               for (int dx = 0; dx < D1Dx; ++dx)
--               {
--                  massX[dx] = 0.0;
--               }
--               for (int qx = 0; qx < Q1D; ++qx)
--               {
--                  for (int dx = 0; dx < D1Dx; ++dx)
--                  {
--                     massX[dx] += mass[qz][qy][qx][c] * (trialHcurl ?
--                                                         ((c == 0) ? Bct(dx,qx) : Bot(dx,qx)) :
--                                                         ((c == 0) ? Bot(dx,qx) : Bct(dx,qx)));
--                  }
--               }
--               for (int dy = 0; dy < D1Dy; ++dy)
--               {
--                  const double wy = trialHcurl ? ((c == 1) ? Bct(dy,qy) : Bot(dy,qy)) :
--                                    ((c == 1) ? Bot(dy,qy) : Bct(dy,qy));
--                  for (int dx = 0; dx < D1Dx; ++dx)
--                  {
--                     massXY[dy][dx] += massX[dx] * wy;
--                  }
--               }
--            }
--
--            for (int dz = 0; dz < D1Dz; ++dz)
--            {
--               const double wz = trialHcurl ? ((c == 2) ? Bct(dz,qz) : Bot(dz,qz)) :
--                                 ((c == 2) ? Bot(dz,qz) : Bct(dz,qz));
--               for (int dy = 0; dy < D1Dy; ++dy)
--               {
--                  for (int dx = 0; dx < D1Dx; ++dx)
--                  {
--                     y(dx + ((dy + (dz * D1Dy)) * D1Dx) + osc, e) +=
--                        massXY[dy][dx] * wz;
--                  }
--               }
--            }
--
--            osc += D1Dx * D1Dy * D1Dz;
--         }  // loop c
--      }  // loop qz
--   }); // end of element loop
--}
--
--// Mass operator for H(curl) and H(div) functions, using Piola transformations
--// u = dF^{-T} \hat{u} in H(curl), v = (1 / det dF) dF \hat{v} in H(div).
--void PAHcurlHdivMassApply2D(const int D1D,
--                            const int D1Dtest,
--                            const int Q1D,
--                            const int NE,
--                            const bool scalarCoeff,
--                            const bool trialHcurl,
--                            const bool transpose,
--                            const Array<double> &Bo_,
--                            const Array<double> &Bc_,
--                            const Array<double> &Bot_,
--                            const Array<double> &Bct_,
--                            const Vector &op_,
--                            const Vector &x_,
--                            Vector &y_)
--{
--   constexpr static int MAX_D1D = HCURL_MAX_D1D;
--   constexpr static int MAX_Q1D = HCURL_MAX_Q1D;
--
--   MFEM_VERIFY(D1D <= MAX_D1D, "Error: D1D > MAX_D1D");
--   MFEM_VERIFY(Q1D <= MAX_Q1D, "Error: Q1D > MAX_Q1D");
--   constexpr static int VDIM = 2;
--
--   auto Bo = Reshape(Bo_.Read(), Q1D, D1D-1);
--   auto Bc = Reshape(Bc_.Read(), Q1D, D1D);
--   auto Bot = Reshape(Bot_.Read(), D1Dtest-1, Q1D);
--   auto Bct = Reshape(Bct_.Read(), D1Dtest, Q1D);
--   auto op = Reshape(op_.Read(), scalarCoeff ? 1 : 4, Q1D, Q1D, NE);
--   auto x = Reshape(x_.Read(), 2*(D1D-1)*D1D, NE);
--   auto y = Reshape(y_.ReadWrite(), 2*(D1Dtest-1)*D1Dtest, NE);
--
--   const int i12 = transpose ? 2 : 1;
--   const int i21 = transpose ? 1 : 2;
--
--   mfem::forall(NE, [=] MFEM_HOST_DEVICE (int e)
--   {
--      double mass[MAX_Q1D][MAX_Q1D][VDIM];
--
--      for (int qy = 0; qy < Q1D; ++qy)
--      {
--         for (int qx = 0; qx < Q1D; ++qx)
--         {
--            for (int c = 0; c < VDIM; ++c)
--            {
--               mass[qy][qx][c] = 0.0;
--            }
--         }
--      }
--
--      int osc = 0;
--      for (int c = 0; c < VDIM; ++c)  // loop over x, y trial components
--      {
--         const int D1Dy = trialHcurl ? ((c == 1) ? D1D - 1 : D1D) :
--                          ((c == 1) ? D1D : D1D - 1);
--         const int D1Dx = trialHcurl ? ((c == 0) ? D1D - 1 : D1D) :
--                          ((c == 0) ? D1D : D1D - 1);
--
--         for (int dy = 0; dy < D1Dy; ++dy)
--         {
--            double massX[MAX_Q1D];
--            for (int qx = 0; qx < Q1D; ++qx)
--            {
--               massX[qx] = 0.0;
--            }
--
--            for (int dx = 0; dx < D1Dx; ++dx)
--            {
--               const double t = x(dx + (dy * D1Dx) + osc, e);
--               for (int qx = 0; qx < Q1D; ++qx)
--               {
--                  massX[qx] += t * (trialHcurl ? ((c == 0) ? Bo(qx,dx) : Bc(qx,dx)) :
--                                    ((c == 0) ? Bc(qx,dx) : Bo(qx,dx)));
--               }
--            }
--
--            for (int qy = 0; qy < Q1D; ++qy)
--            {
--               const double wy = trialHcurl ? ((c == 1) ? Bo(qy,dy) : Bc(qy,dy)) :
--                                 ((c == 1) ? Bc(qy,dy) : Bo(qy,dy));
--               for (int qx = 0; qx < Q1D; ++qx)
--               {
--                  mass[qy][qx][c] += massX[qx] * wy;
--               }
--            }
--         }
--
--         osc += D1Dx * D1Dy;
--      }  // loop (c) over components
--
--      // Apply D operator.
--      for (int qy = 0; qy < Q1D; ++qy)
--      {
--         for (int qx = 0; qx < Q1D; ++qx)
--         {
--            const double O11 = op(0,qx,qy,e);
--            const double O12 = scalarCoeff ? 0.0 : op(i12,qx,qy,e);
--            const double O21 = scalarCoeff ? 0.0 : op(i21,qx,qy,e);
--            const double O22 = scalarCoeff ? O11 : op(3,qx,qy,e);
--            const double massX = mass[qy][qx][0];
--            const double massY = mass[qy][qx][1];
--            mass[qy][qx][0] = (O11*massX)+(O12*massY);
--            mass[qy][qx][1] = (O21*massX)+(O22*massY);
--         }
--      }
--
--      osc = 0;
--      for (int c = 0; c < VDIM; ++c)  // loop over x, y test components
--      {
--         const int D1Dy = trialHcurl ? ((c == 1) ? D1Dtest : D1Dtest - 1) :
--                          ((c == 1) ? D1Dtest - 1 : D1Dtest);
--         const int D1Dx = trialHcurl ? ((c == 0) ? D1Dtest : D1Dtest - 1) :
--                          ((c == 0) ? D1Dtest - 1 : D1Dtest);
--
--         for (int qy = 0; qy < Q1D; ++qy)
--         {
--            double massX[HDIV_MAX_D1D];
--            for (int dx = 0; dx < D1Dx; ++dx)
--            {
--               massX[dx] = 0.0;
--            }
--            for (int qx = 0; qx < Q1D; ++qx)
--            {
--               for (int dx = 0; dx < D1Dx; ++dx)
--               {
--                  massX[dx] += mass[qy][qx][c] * (trialHcurl ?
--                                                  ((c == 0) ? Bct(dx,qx) : Bot(dx,qx)) :
--                                                  ((c == 0) ? Bot(dx,qx) : Bct(dx,qx)));
--               }
--            }
--            for (int dy = 0; dy < D1Dy; ++dy)
--            {
--               const double wy = trialHcurl ? ((c == 1) ? Bct(dy,qy) : Bot(dy,qy)) :
--                                 ((c == 1) ? Bot(dy,qy) : Bct(dy,qy));
--               for (int dx = 0; dx < D1Dx; ++dx)
--               {
--                  y(dx + (dy * D1Dx) + osc, e) += massX[dx] * wy;
--               }
--            }
--         }
--
--         osc += D1Dx * D1Dy;
--      }  // loop c
--   }); // end of element loop
--}
--
--void VectorFEMassIntegrator::AssemblePA(const FiniteElementSpace &fes)
--{
--   AssemblePA(fes, fes);
--}
--
--void VectorFEMassIntegrator::AssemblePA(const FiniteElementSpace &trial_fes,
--                                        const FiniteElementSpace &test_fes)
--{
--   // Assumes tensor-product elements
--   Mesh *mesh = trial_fes.GetMesh();
--
--   const FiniteElement *trial_fel = trial_fes.GetFE(0);
--   const VectorTensorFiniteElement *trial_el =
--      dynamic_cast<const VectorTensorFiniteElement*>(trial_fel);
--   MFEM_VERIFY(trial_el != NULL, "Only VectorTensorFiniteElement is supported!");
--
--   const FiniteElement *test_fel = test_fes.GetFE(0);
--   const VectorTensorFiniteElement *test_el =
--      dynamic_cast<const VectorTensorFiniteElement*>(test_fel);
--   MFEM_VERIFY(test_el != NULL, "Only VectorTensorFiniteElement is supported!");
--
--   const IntegrationRule *ir
--      = IntRule ? IntRule : &MassIntegrator::GetRule(*trial_el, *trial_el,
--                                                     *mesh->GetElementTransformation(0));
--   const int dims = trial_el->GetDim();
--   MFEM_VERIFY(dims == 2 || dims == 3, "");
--
--   const int symmDims = (dims * (dims + 1)) / 2; // 1x1: 1, 2x2: 3, 3x3: 6
--   nq = ir->GetNPoints();
--   dim = mesh->Dimension();
--   MFEM_VERIFY(dim == 2 || dim == 3, "");
--
--   ne = trial_fes.GetNE();
--   MFEM_VERIFY(ne == test_fes.GetNE(),
--               "Different meshes for test and trial spaces");
--   geom = mesh->GetGeometricFactors(*ir, GeometricFactors::JACOBIANS);
--   mapsC = &trial_el->GetDofToQuad(*ir, DofToQuad::TENSOR);
--   mapsO = &trial_el->GetDofToQuadOpen(*ir, DofToQuad::TENSOR);
--   dofs1D = mapsC->ndof;
--   quad1D = mapsC->nqpt;
--
--   mapsCtest = &test_el->GetDofToQuad(*ir, DofToQuad::TENSOR);
--   mapsOtest = &test_el->GetDofToQuadOpen(*ir, DofToQuad::TENSOR);
--   dofs1Dtest = mapsCtest->ndof;
--
--   MFEM_VERIFY(dofs1D == mapsO->ndof + 1 && quad1D == mapsO->nqpt, "");
--
--   trial_fetype = trial_el->GetDerivType();
--   test_fetype = test_el->GetDerivType();
--
--   const bool trial_curl = (trial_fetype == mfem::FiniteElement::CURL);
--   const bool trial_div = (trial_fetype == mfem::FiniteElement::DIV);
--   const bool test_curl = (test_fetype == mfem::FiniteElement::CURL);
--   const bool test_div = (test_fetype == mfem::FiniteElement::DIV);
--
--   QuadratureSpace qs(*mesh, *ir);
--   CoefficientVector coeff(qs, CoefficientStorage::SYMMETRIC);
--   if (Q) { coeff.Project(*Q); }
--   else if (MQ) { coeff.ProjectTranspose(*MQ); }
--   else if (DQ) { coeff.Project(*DQ); }
--   else { coeff.SetConstant(1.0); }
--
--   const int coeff_dim = coeff.GetVDim();
--   symmetric = (coeff_dim != dim*dim);
--
--   if ((trial_curl && test_div) || (trial_div && test_curl))
--      pa_data.SetSize((coeff_dim == 1 ? 1 : dim*dim) * nq * ne,
--                      Device::GetMemoryType());
--   else
--      pa_data.SetSize((symmetric ? symmDims : dims*dims) * nq * ne,
--                      Device::GetMemoryType());
--
--   if (trial_curl && test_curl && dim == 3)
--   {
--      PADiffusionSetup3D(quad1D, coeff_dim, ne, ir->GetWeights(), geom->J,
--                         coeff, pa_data);
--   }
--   else if (trial_curl && test_curl && dim == 2)
--   {
--      PADiffusionSetup2D<2>(quad1D, coeff_dim, ne, ir->GetWeights(), geom->J,
--                            coeff, pa_data);
--   }
--   else if (trial_div && test_div && dim == 3)
--   {
--      PAHdivSetup3D(quad1D, coeff_dim, ne, ir->GetWeights(), geom->J,
--                    coeff, pa_data);
--   }
--   else if (trial_div && test_div && dim == 2)
--   {
--      PAHdivSetup2D(quad1D, coeff_dim, ne, ir->GetWeights(), geom->J,
--                    coeff, pa_data);
--   }
--   else if (((trial_curl && test_div) || (trial_div && test_curl)) &&
--            test_fel->GetOrder() == trial_fel->GetOrder())
--   {
--      if (coeff_dim == 1)
--      {
--         PAHcurlL2Setup(nq, coeff_dim, ne, ir->GetWeights(), coeff, pa_data);
--      }
--      else
--      {
--         const bool tr = (trial_div && test_curl);
--         if (dim == 3)
--            PAHcurlHdivSetup3D(quad1D, coeff_dim, ne, tr, ir->GetWeights(),
--                               geom->J, coeff, pa_data);
--         else
--            PAHcurlHdivSetup2D(quad1D, coeff_dim, ne, tr, ir->GetWeights(),
--                               geom->J, coeff, pa_data);
--      }
--   }
--   else
--   {
--      MFEM_ABORT("Unknown kernel.");
--   }
--}
--
--void VectorFEMassIntegrator::AssembleDiagonalPA(Vector& diag)
--{
--   if (dim == 3)
--   {
--      if (trial_fetype == mfem::FiniteElement::CURL && test_fetype == trial_fetype)
--      {
--         if (Device::Allows(Backend::DEVICE_MASK))
--         {
--            const int ID = (dofs1D << 4) | quad1D;
--            switch (ID)
--            {
--               case 0x23: return SmemPAHcurlMassAssembleDiagonal3D<2,3>(dofs1D, quad1D, ne,
--                                                                           symmetric,
--                                                                           mapsO->B, mapsC->B, pa_data, diag);
--               case 0x34: return SmemPAHcurlMassAssembleDiagonal3D<3,4>(dofs1D, quad1D, ne,
--                                                                           symmetric,
--                                                                           mapsO->B, mapsC->B, pa_data, diag);
--               case 0x45: return SmemPAHcurlMassAssembleDiagonal3D<4,5>(dofs1D, quad1D, ne,
--                                                                           symmetric,
--                                                                           mapsO->B, mapsC->B, pa_data, diag);
--               case 0x56: return SmemPAHcurlMassAssembleDiagonal3D<5,6>(dofs1D, quad1D, ne,
--                                                                           symmetric,
--                                                                           mapsO->B, mapsC->B, pa_data, diag);
--               default: return SmemPAHcurlMassAssembleDiagonal3D(dofs1D, quad1D, ne, symmetric,
--                                                                    mapsO->B, mapsC->B, pa_data, diag);
--            }
--         }
--         else
--            PAHcurlMassAssembleDiagonal3D(dofs1D, quad1D, ne, symmetric,
--                                          mapsO->B, mapsC->B, pa_data, diag);
--      }
--      else if (trial_fetype == mfem::FiniteElement::DIV &&
--               test_fetype == trial_fetype)
--      {
--         PAHdivMassAssembleDiagonal3D(dofs1D, quad1D, ne, symmetric,
--                                      mapsO->B, mapsC->B, pa_data, diag);
--      }
--      else
--      {
--         MFEM_ABORT("Unknown kernel.");
--      }
--   }
--   else // 2D
--   {
--      if (trial_fetype == mfem::FiniteElement::CURL && test_fetype == trial_fetype)
--      {
--         PAHcurlMassAssembleDiagonal2D(dofs1D, quad1D, ne, symmetric,
--                                       mapsO->B, mapsC->B, pa_data, diag);
--      }
--      else if (trial_fetype == mfem::FiniteElement::DIV &&
--               test_fetype == trial_fetype)
--      {
--         PAHdivMassAssembleDiagonal2D(dofs1D, quad1D, ne, symmetric,
--                                      mapsO->B, mapsC->B, pa_data, diag);
--      }
--      else
--      {
--         MFEM_ABORT("Unknown kernel.");
--      }
--   }
--}
--
--void VectorFEMassIntegrator::AddMultPA(const Vector &x, Vector &y) const
--{
--   const bool trial_curl = (trial_fetype == mfem::FiniteElement::CURL);
--   const bool trial_div = (trial_fetype == mfem::FiniteElement::DIV);
--   const bool test_curl = (test_fetype == mfem::FiniteElement::CURL);
--   const bool test_div = (test_fetype == mfem::FiniteElement::DIV);
--
--   if (dim == 3)
--   {
--      if (trial_curl && test_curl)
--      {
--         if (Device::Allows(Backend::DEVICE_MASK))
--         {
--            const int ID = (dofs1D << 4) | quad1D;
--            switch (ID)
--            {
--               case 0x23: return SmemPAHcurlMassApply3D<2,3>(dofs1D, quad1D, ne, symmetric,
--                                                                mapsO->B,
--                                                                mapsC->B, mapsO->Bt,
--                                                                mapsC->Bt, pa_data, x, y);
--               case 0x34: return SmemPAHcurlMassApply3D<3,4>(dofs1D, quad1D, ne, symmetric,
--                                                                mapsO->B,
--                                                                mapsC->B, mapsO->Bt,
--                                                                mapsC->Bt, pa_data, x, y);
--               case 0x45: return SmemPAHcurlMassApply3D<4,5>(dofs1D, quad1D, ne, symmetric,
--                                                                mapsO->B,
--                                                                mapsC->B, mapsO->Bt,
--                                                                mapsC->Bt, pa_data, x, y);
--               case 0x56: return SmemPAHcurlMassApply3D<5,6>(dofs1D, quad1D, ne, symmetric,
--                                                                mapsO->B,
--                                                                mapsC->B, mapsO->Bt,
--                                                                mapsC->Bt, pa_data, x, y);
--               default: return SmemPAHcurlMassApply3D(dofs1D, quad1D, ne, symmetric, mapsO->B,
--                                                         mapsC->B,
--                                                         mapsO->Bt, mapsC->Bt, pa_data, x, y);
--            }
--         }
--         else
--            PAHcurlMassApply3D(dofs1D, quad1D, ne, symmetric, mapsO->B, mapsC->B, mapsO->Bt,
--                               mapsC->Bt, pa_data, x, y);
--      }
--      else if (trial_div && test_div)
--      {
--         PAHdivMassApply(3, dofs1D, quad1D, ne, symmetric, mapsO->B, mapsC->B, mapsO->Bt,
--                         mapsC->Bt, pa_data, x, y);
--      }
--      else if (trial_curl && test_div)
--      {
--         const bool scalarCoeff = !(DQ || MQ);
--         PAHcurlHdivMassApply3D(dofs1D, dofs1Dtest, quad1D, ne, scalarCoeff,
--                                true, false, mapsO->B, mapsC->B, mapsOtest->Bt,
--                                mapsCtest->Bt, pa_data, x, y);
--      }
--      else if (trial_div && test_curl)
--      {
--         const bool scalarCoeff = !(DQ || MQ);
--         PAHcurlHdivMassApply3D(dofs1D, dofs1Dtest, quad1D, ne, scalarCoeff,
--                                false, false, mapsO->B, mapsC->B, mapsOtest->Bt,
--                                mapsCtest->Bt, pa_data, x, y);
--      }
--      else
--      {
--         MFEM_ABORT("Unknown kernel.");
--      }
--   }
--   else // 2D
--   {
--      if (trial_curl && test_curl)
--      {
--         PAHcurlMassApply2D(dofs1D, quad1D, ne, symmetric, mapsO->B, mapsC->B,
--                            mapsO->Bt, mapsC->Bt, pa_data, x, y);
--      }
--      else if (trial_div && test_div)
--      {
--         PAHdivMassApply(2, dofs1D, quad1D, ne, symmetric, mapsO->B, mapsC->B, mapsO->Bt,
--                         mapsC->Bt, pa_data, x, y);
--      }
--      else if ((trial_curl && test_div) || (trial_div && test_curl))
--      {
--         const bool scalarCoeff = !(DQ || MQ);
--         PAHcurlHdivMassApply2D(dofs1D, dofs1Dtest, quad1D, ne, scalarCoeff,
--                                trial_curl, false, mapsO->B, mapsC->B,
--                                mapsOtest->Bt, mapsCtest->Bt, pa_data, x, y);
--      }
--      else
--      {
--         MFEM_ABORT("Unknown kernel.");
--      }
--   }
--}
--
--void VectorFEMassIntegrator::AddMultTransposePA(const Vector &x,
--                                                Vector &y) const
--{
--   const bool trial_curl = (trial_fetype == mfem::FiniteElement::CURL);
--   const bool trial_div = (trial_fetype == mfem::FiniteElement::DIV);
--   const bool test_curl = (test_fetype == mfem::FiniteElement::CURL);
--   const bool test_div = (test_fetype == mfem::FiniteElement::DIV);
--
--   bool symmetricSpaces = true;
--
--   if (dim == 3 && ((trial_div && test_curl) || (trial_curl && test_div)))
--   {
--      const bool scalarCoeff = !(DQ || MQ);
--      PAHcurlHdivMassApply3D(dofs1D, dofs1Dtest, quad1D, ne, scalarCoeff,
--                             trial_div, true, mapsO->B, mapsC->B, mapsOtest->Bt,
--                             mapsCtest->Bt, pa_data, x, y);
--      symmetricSpaces = false;
--   }
--   else if (dim == 2 && ((trial_curl && test_div) || (trial_div && test_curl)))
--   {
--      const bool scalarCoeff = !(DQ || MQ);
--      PAHcurlHdivMassApply2D(dofs1D, dofs1Dtest, quad1D, ne, scalarCoeff,
--                             !trial_curl, true, mapsO->B, mapsC->B, mapsOtest->Bt,
--                             mapsCtest->Bt, pa_data, x, y);
--      symmetricSpaces = false;
--   }
--
--   if (symmetricSpaces)
--   {
--      if (MQ && dynamic_cast<SymmetricMatrixCoefficient*>(MQ) == NULL)
--      {
--         MFEM_ABORT("VectorFEMassIntegrator transpose not implemented for asymmetric MatrixCoefficient");
--      }
--
--      this->AddMultPA(x, y);
--   }
--}
--
--void MixedVectorGradientIntegrator::AssemblePA(const FiniteElementSpace
--                                               &trial_fes,
--                                               const FiniteElementSpace &test_fes)
--{
--   // Assumes tensor-product elements, with a vector test space and H^1 trial space.
--   Mesh *mesh = trial_fes.GetMesh();
--   const FiniteElement *trial_fel = trial_fes.GetFE(0);
--   const FiniteElement *test_fel = test_fes.GetFE(0);
--
--   const NodalTensorFiniteElement *trial_el =
--      dynamic_cast<const NodalTensorFiniteElement*>(trial_fel);
--   MFEM_VERIFY(trial_el != NULL, "Only NodalTensorFiniteElement is supported!");
--
--   const VectorTensorFiniteElement *test_el =
--      dynamic_cast<const VectorTensorFiniteElement*>(test_fel);
--   MFEM_VERIFY(test_el != NULL, "Only VectorTensorFiniteElement is supported!");
--
--   const IntegrationRule *ir
--      = IntRule ? IntRule : &MassIntegrator::GetRule(*trial_el, *trial_el,
--                                                     *mesh->GetElementTransformation(0));
--   const int dims = trial_el->GetDim();
--   MFEM_VERIFY(dims == 2 || dims == 3, "");
--
--   const int symmDims = (dims * (dims + 1)) / 2; // 1x1: 1, 2x2: 3, 3x3: 6
--   const int nq = ir->GetNPoints();
--   dim = mesh->Dimension();
--   MFEM_VERIFY(dim == 2 || dim == 3, "");
--
--   MFEM_VERIFY(trial_el->GetOrder() == test_el->GetOrder(), "");
--
--   ne = trial_fes.GetNE();
--   geom = mesh->GetGeometricFactors(*ir, GeometricFactors::JACOBIANS);
--   mapsC = &test_el->GetDofToQuad(*ir, DofToQuad::TENSOR);
--   mapsO = &test_el->GetDofToQuadOpen(*ir, DofToQuad::TENSOR);
--   dofs1D = mapsC->ndof;
--   quad1D = mapsC->nqpt;
--
--   MFEM_VERIFY(dofs1D == mapsO->ndof + 1 && quad1D == mapsO->nqpt, "");
--
--   pa_data.SetSize(symmDims * nq * ne, Device::GetMemoryType());
--
--   QuadratureSpace qs(*mesh, *ir);
--   CoefficientVector coeff(Q, qs, CoefficientStorage::FULL);
--
--   // Use the same setup functions as VectorFEMassIntegrator.
--   if (test_el->GetDerivType() == mfem::FiniteElement::CURL && dim == 3)
--   {
--      PADiffusionSetup3D(quad1D, 1, ne, ir->GetWeights(), geom->J,
--                         coeff, pa_data);
--   }
--   else if (test_el->GetDerivType() == mfem::FiniteElement::CURL && dim == 2)
--   {
--      PADiffusionSetup2D<2>(quad1D, 1, ne, ir->GetWeights(), geom->J,
--                            coeff, pa_data);
--   }
--   else
--   {
--      MFEM_ABORT("Unknown kernel.");
--   }
--}
--
--void MixedVectorGradientIntegrator::AddMultPA(const Vector &x, Vector &y) const
--{
--   if (dim == 3)
--      PAHcurlH1Apply3D(dofs1D, quad1D, ne, mapsC->B, mapsC->G,
--                       mapsO->Bt, mapsC->Bt, pa_data, x, y);
--   else if (dim == 2)
--      PAHcurlH1Apply2D(dofs1D, quad1D, ne, mapsC->B, mapsC->G,
--                       mapsO->Bt, mapsC->Bt, pa_data, x, y);
--   else
--   {
--      MFEM_ABORT("Unsupported dimension!");
--   }
--}
--
--void MixedVectorGradientIntegrator::AddMultTransposePA(const Vector &x,
--                                                       Vector &y) const
--{
--   if (dim == 3)
--      PAHcurlH1ApplyTranspose3D(dofs1D, quad1D, ne, mapsC->B, mapsO->B,
--                                mapsC->Bt, mapsC->Gt, pa_data, x, y);
--   else if (dim == 2)
--      PAHcurlH1ApplyTranspose2D(dofs1D, quad1D, ne, mapsC->B, mapsO->B,
--                                mapsC->Bt, mapsC->Gt, pa_data, x, y);
--   else
--   {
--      MFEM_ABORT("Unsupported dimension!");
--   }
--}
--
--} // namespace mfem
-diff --git a/fem/ceed/interface/operator.cpp b/fem/ceed/interface/operator.cpp
-index 8545ccaa8..745e474e5 100644
---- a/fem/ceed/interface/operator.cpp
-+++ b/fem/ceed/interface/operator.cpp
-@@ -46,7 +46,7 @@ void Operator::Mult(const mfem::Vector &x, mfem::Vector &y) const
-    CeedScalar *y_ptr;
-    CeedMemType mem;
-    CeedGetPreferredMemType(mfem::internal::ceed, &mem);
--   if ( Device::Allows(Backend::DEVICE_MASK) && mem==CEED_MEM_DEVICE )
-+   if (Device::Allows(Backend::DEVICE_MASK) && mem == CEED_MEM_DEVICE)
-    {
-       x_ptr = x.Read();
-       y_ptr = y.Write();
-@@ -78,7 +78,7 @@ void Operator::AddMult(const mfem::Vector &x, mfem::Vector &y,
-    CeedScalar *y_ptr;
-    CeedMemType mem;
-    CeedGetPreferredMemType(mfem::internal::ceed, &mem);
--   if ( Device::Allows(Backend::DEVICE_MASK) && mem==CEED_MEM_DEVICE )
-+   if (Device::Allows(Backend::DEVICE_MASK) && mem == CEED_MEM_DEVICE)
-    {
-       x_ptr = x.Read();
-       y_ptr = y.ReadWrite();
-@@ -107,7 +107,7 @@ void Operator::GetDiagonal(mfem::Vector &diag) const
-    CeedScalar *d_ptr;
-    CeedMemType mem;
-    CeedGetPreferredMemType(mfem::internal::ceed, &mem);
--   if ( Device::Allows(Backend::DEVICE_MASK) && mem==CEED_MEM_DEVICE )
-+   if (Device::Allows(Backend::DEVICE_MASK) && mem == CEED_MEM_DEVICE)
-    {
-       d_ptr = diag.ReadWrite();
-    }
-diff --git a/fem/ceed/interface/operator.hpp b/fem/ceed/interface/operator.hpp
-index cffea2fc7..9e4a4faaf 100644
---- a/fem/ceed/interface/operator.hpp
-+++ b/fem/ceed/interface/operator.hpp
-@@ -37,11 +37,12 @@ public:
-    /// This class takes ownership of op and will delete it
-    Operator(CeedOperator op);
- #endif
-+
-    void Mult(const mfem::Vector &x, mfem::Vector &y) const override;
-    void AddMult(const mfem::Vector &x, mfem::Vector &y,
-                 const double a = 1.0) const override;
-    void GetDiagonal(mfem::Vector &diag) const;
--   using mfem::Operator::SetupRAP;
-+
-    virtual ~Operator()
-    {
- #ifdef MFEM_USE_CEED
-diff --git a/fem/ceed/interface/util.cpp b/fem/ceed/interface/util.cpp
-index d122c2ab5..b65fd2197 100644
---- a/fem/ceed/interface/util.cpp
-+++ b/fem/ceed/interface/util.cpp
-@@ -217,7 +217,7 @@ const IntegrationRule & GetRule<ConvectionIntegrator>(
-    const FiniteElement &test_fe,
-    ElementTransformation &trans)
- {
--   return ConvectionIntegrator::GetRule(trial_fe, test_fe, trans);
-+   return ConvectionIntegrator::GetRule(trial_fe, trans);
- }
- 
- template <>
-diff --git a/fem/ceed/solvers/algebraic.cpp b/fem/ceed/solvers/algebraic.cpp
-index 2cc325dbc..280a19960 100644
---- a/fem/ceed/solvers/algebraic.cpp
-+++ b/fem/ceed/solvers/algebraic.cpp
-@@ -46,7 +46,7 @@ private:
-    Array<int> ess_tdofs;
-    const mfem::Operator *P;
-    ceed::Operator *unconstrained_op;
--   mfem::ConstrainedOperator *constrained_op;
-+   mfem::Operator *constrained_op;
- };
- 
- ConstrainedOperator::ConstrainedOperator(
-@@ -56,10 +56,8 @@ ConstrainedOperator::ConstrainedOperator(
-    : ess_tdofs(ess_tdofs_), P(P_)
- {
-    unconstrained_op = new ceed::Operator(oper);
--   mfem::Operator *rap = unconstrained_op->SetupRAP(P, P);
--   height = width = rap->Height();
--   bool own_rap = (rap != unconstrained_op);
--   constrained_op = new mfem::ConstrainedOperator(rap, ess_tdofs, own_rap);
-+   unconstrained_op->FormSystemOperator(ess_tdofs, constrained_op);
-+   height = width = constrained_op->Height();
- }
- 
- ConstrainedOperator::ConstrainedOperator(CeedOperator oper,
-@@ -535,7 +533,7 @@ void AlgebraicInterpolation::Mult(const mfem::Vector& x, mfem::Vector& y) const
-    CeedScalar *out_ptr;
-    CeedMemType mem;
-    ierr = CeedGetPreferredMemType(internal::ceed, &mem); PCeedChk(ierr);
--   if ( Device::Allows(Backend::DEVICE_MASK) && mem==CEED_MEM_DEVICE )
-+   if (Device::Allows(Backend::DEVICE_MASK) && mem == CEED_MEM_DEVICE)
-    {
-       in_ptr = x.Read();
-       out_ptr = y.ReadWrite();
-@@ -568,7 +566,7 @@ void AlgebraicInterpolation::MultTranspose(const mfem::Vector& x,
-    ierr = CeedGetPreferredMemType(internal::ceed, &mem); PCeedChk(ierr);
-    const CeedScalar *in_ptr;
-    CeedScalar *out_ptr;
--   if ( Device::Allows(Backend::DEVICE_MASK) && mem==CEED_MEM_DEVICE )
-+   if (Device::Allows(Backend::DEVICE_MASK) && mem == CEED_MEM_DEVICE)
-    {
-       in_ptr = x.Read();
-       out_ptr = y.ReadWrite();
-@@ -808,15 +806,6 @@ ParAlgebraicCoarseSpace::ParAlgebraicCoarseSpace(
-       }
-    }
-    R_mat->Finalize();
--
--   if (Device::Allows(Backend::DEVICE_MASK))
--   {
--      P = new DeviceConformingProlongationOperator(*gc, R_mat);
--   }
--   else
--   {
--      P = new ConformingProlongationOperator(lsize, *gc);
--   }
-    P_mat = NULL;
- }
- 
-@@ -828,8 +817,8 @@ HypreParMatrix *ParAlgebraicCoarseSpace::GetProlongationHypreParMatrix()
-    MFEM_VERIFY(pmesh != NULL, "");
-    Array<HYPRE_BigInt> dof_offsets, tdof_offsets, tdof_nb_offsets;
-    Array<HYPRE_BigInt> *offsets[2] = {&dof_offsets, &tdof_offsets};
--   int lsize = P->Height();
--   int ltsize = P->Width();
-+   int ltsize = R_mat->Height();
-+   int lsize = R_mat->Width();
-    HYPRE_BigInt loc_sizes[2] = {lsize, ltsize};
-    pmesh->GenerateOffsets(2, loc_sizes, offsets);
- 
-@@ -936,7 +925,6 @@ HypreParMatrix *ParAlgebraicCoarseSpace::GetProlongationHypreParMatrix()
- 
- ParAlgebraicCoarseSpace::~ParAlgebraicCoarseSpace()
- {
--   delete P;
-    delete R_mat;
-    delete P_mat;
-    delete gc;
-diff --git a/fem/ceed/solvers/algebraic.hpp b/fem/ceed/solvers/algebraic.hpp
-index 49cdbca98..8ede8324e 100644
---- a/fem/ceed/solvers/algebraic.hpp
-+++ b/fem/ceed/solvers/algebraic.hpp
-@@ -33,12 +33,13 @@ class AlgebraicCoarseSpace : public FiniteElementSpace
- public:
-    AlgebraicCoarseSpace(FiniteElementSpace &fine_fes, CeedElemRestriction fine_er,
-                         int order, int dim, int order_reduction_);
-+   ~AlgebraicCoarseSpace();
-+
-    int GetOrderReduction() const { return order_reduction; }
-    CeedElemRestriction GetCeedElemRestriction() const { return ceed_elem_restriction; }
-    CeedBasis GetCeedCoarseToFine() const { return coarse_to_fine; }
-    virtual const mfem::Operator *GetProlongationMatrix() const override { return NULL; }
-    virtual const SparseMatrix *GetRestrictionMatrix() const override { return NULL; }
--   ~AlgebraicCoarseSpace();
- 
- protected:
-    int *dof_map;
-@@ -64,16 +65,16 @@ public:
-       int order_reduction_,
-       GroupCommunicator *gc_fine
-    );
--   virtual const mfem::Operator *GetProlongationMatrix() const override { return P; }
-+   ~ParAlgebraicCoarseSpace();
-+
-+   virtual const mfem::Operator *GetProlongationMatrix() const override { return P_mat; }
-    virtual const SparseMatrix *GetRestrictionMatrix() const override { return R_mat; }
-    GroupCommunicator *GetGroupCommunicator() const { return gc; }
-    HypreParMatrix *GetProlongationHypreParMatrix();
--   ~ParAlgebraicCoarseSpace();
- 
- private:
--   SparseMatrix *R_mat;
-    GroupCommunicator *gc;
--   ConformingProlongationOperator *P;
-+   SparseMatrix *R_mat;
-    HypreParMatrix *P_mat;
-    Array<int> ldof_group, ldof_ltdof;
- };
-@@ -92,14 +93,11 @@ public:
-       Ceed ceed, CeedBasis basisctof,
-       CeedElemRestriction erestrictu_coarse,
-       CeedElemRestriction erestrictu_fine);
--
-    ~AlgebraicInterpolation();
- 
-    virtual void Mult(const mfem::Vector& x, mfem::Vector& y) const;
--
-    virtual void MultTranspose(const mfem::Vector& x, mfem::Vector& y) const;
- 
--   using mfem::Operator::SetupRAP;
- private:
-    int Initialize(Ceed ceed, CeedBasis basisctof,
-                   CeedElemRestriction erestrictu_coarse,
-@@ -127,11 +125,6 @@ public:
-        The given space is a real (geometric) space, but the coarse spaces are
-        constructed semi-algebraically with no mesh information. */
-    AlgebraicSpaceHierarchy(FiniteElementSpace &fespace);
--   AlgebraicCoarseSpace& GetAlgebraicCoarseSpace(int level)
--   {
--      MFEM_ASSERT(level < GetNumLevels() - 1, "");
--      return static_cast<AlgebraicCoarseSpace&>(*fespaces[level]);
--   }
-    ~AlgebraicSpaceHierarchy()
-    {
-       for (int i=0; i<R_tr.Size(); ++i)
-@@ -144,6 +137,12 @@ public:
-       }
-    }
- 
-+   AlgebraicCoarseSpace& GetAlgebraicCoarseSpace(int level)
-+   {
-+      MFEM_ASSERT(level < GetNumLevels() - 1, "");
-+      return static_cast<AlgebraicCoarseSpace&>(*fespaces[level]);
-+   }
-+
- private:
-    CeedElemRestriction fine_er;
-    Array<AlgebraicInterpolation*> ceed_interpolations;
-@@ -200,6 +199,7 @@ public:
-     */
-    AlgebraicSolver(BilinearForm &form, const Array<int>& ess_tdofs);
-    ~AlgebraicSolver();
-+
-    void Mult(const Vector& x, Vector& y) const;
-    void SetOperator(const mfem::Operator& op);
- };
-diff --git a/fem/coefficient.cpp b/fem/coefficient.cpp
-index 46ad4cf4c..e47073e3f 100644
---- a/fem/coefficient.cpp
-+++ b/fem/coefficient.cpp
-@@ -144,8 +144,8 @@ double FunctionCoefficient::Eval(ElementTransformation & T,
-    }
- }
- 
--double GridFunctionCoefficient::Eval (ElementTransformation &T,
--                                      const IntegrationPoint &ip)
-+double GridFunctionCoefficient::Eval(ElementTransformation &T,
-+                                     const IntegrationPoint &ip)
- {
-    Mesh *gf_mesh = GridF->FESpace()->GetMesh();
-    if (T.mesh == gf_mesh)
-@@ -623,12 +623,6 @@ void PWMatrixCoefficient::UpdateCoefficient(int attr, MatrixCoefficient & coef)
-    MFEM_VERIFY(coef.GetWidth() == width,
-                "PWMatrixCoefficient::UpdateCoefficient:  "
-                "MatrixCoefficient has incompatible width.");
--   if (symmetric)
--   {
--      MFEM_VERIFY(coef.IsSymmetric(),
--                  "PWMatrixCoefficient::UpdateCoefficient:  "
--                  "MatrixCoefficient has incompatible symmetry.");
--   }
-    pieces[attr] = &coef;
- }
- 
-@@ -680,68 +674,17 @@ void MatrixFunctionCoefficient::Eval(DenseMatrix &K, ElementTransformation &T,
- 
-    K.SetSize(height, width);
- 
--   if (symmetric) // Use SymmFunction (deprecated version)
--   {
--      MFEM_VERIFY(height == width && SymmFunction,
--                  "MatrixFunctionCoefficient is not symmetric");
--
--      Vector Ksym((width * (width + 1)) / 2); // 1x1: 1, 2x2: 3, 3x3: 6
--
--      SymmFunction(transip, Ksym);
--
--      // Copy upper triangular values from Ksym to the full matrix K
--      int os = 0;
--      for (int i=0; i<height; ++i)
--      {
--         for (int j=i; j<width; ++j)
--         {
--            const double Kij = Ksym[j - i + os];
--            K(i,j) = Kij;
--            if (j != i) { K(j,i) = Kij; }
--         }
--
--         os += width - i;
--      }
--   }
--   else
-+   if (Function)
-    {
--      if (Function)
--      {
--         Function(transip, K);
--      }
--      else if (TDFunction)
--      {
--         TDFunction(transip, GetTime(), K);
--      }
--      else
--      {
--         K = mat;
--      }
-+      Function(transip, K);
-    }
--
--   if (Q)
-+   else if (TDFunction)
-    {
--      K *= Q->Eval(T, ip, GetTime());
-+      TDFunction(transip, GetTime(), K);
-    }
--}
--
--void MatrixFunctionCoefficient::EvalSymmetric(Vector &K,
--                                              ElementTransformation &T,
--                                              const IntegrationPoint &ip)
--{
--   MFEM_VERIFY(symmetric && height == width && SymmFunction,
--               "MatrixFunctionCoefficient is not symmetric");
--
--   double x[3];
--   Vector transip(x, 3);
--
--   T.Transform(ip, transip);
--
--   K.SetSize((width * (width + 1)) / 2); // 1x1: 1, 2x2: 3, 3x3: 6
--
--   if (SymmFunction)
-+   else
-    {
--      SymmFunction(transip, K);
-+      K = mat;
-    }
- 
-    if (Q)
-@@ -782,7 +725,7 @@ void SymmetricMatrixCoefficient::Eval(DenseMatrix &K, ElementTransformation &T,
-    Eval(mat, T, ip);
-    for (int j = 0; j < width; ++j)
-    {
--      for (int i = 0; i < height; ++ i)
-+      for (int i = 0; i < height; ++i)
-       {
-          K(i, j) = mat(i, j);
-       }
-@@ -1782,3 +1725,4 @@ CoefficientVector::~CoefficientVector()
- }
- 
- }
-+
-diff --git a/fem/coefficient.hpp b/fem/coefficient.hpp
-index ada5b91a3..fc54fd2ef 100644
---- a/fem/coefficient.hpp
-+++ b/fem/coefficient.hpp
-@@ -106,7 +106,6 @@ private:
-    Vector constants;
- 
- public:
--
-    /// Constructs a piecewise constant coefficient in NumOfSubD subdomains
-    explicit PWConstCoefficient(int NumOfSubD = 0) : constants(NumOfSubD)
-    { constants = 0.0; }
-@@ -177,7 +176,6 @@ private:
-                 const Array<Coefficient*> & coefs);
- 
- public:
--
-    /// Constructs a piecewise coefficient
-    explicit PWCoefficient() {}
- 
-@@ -340,7 +338,6 @@ protected:
-    double (*tdf)(double);
- 
- public:
--
-    /// Construct a unit delta function centered at (0.0,0.0,0.0)
-    DeltaCoefficient()
-    {
-@@ -503,15 +500,16 @@ class VectorConstantCoefficient : public VectorCoefficient
- {
- private:
-    Vector vec;
-+
- public:
-    /// Construct the coefficient with constant vector @a v.
-    VectorConstantCoefficient(const Vector &v)
-       : VectorCoefficient(v.Size()), vec(v) { }
--   using VectorCoefficient::Eval;
- 
-    ///  Evaluate the vector coefficient at @a ip.
-    virtual void Eval(Vector &V, ElementTransformation &T,
-                      const IntegrationPoint &ip) { V = vec; }
-+   using VectorCoefficient::Eval;
- 
-    /// Return a reference to the constant vector in this class.
-    const Vector& GetVec() const { return vec; }
-@@ -561,7 +559,6 @@ private:
-                 const Array<VectorCoefficient*> & coefs);
- 
- public:
--
-    /// Constructs a piecewise vector coefficient of dimension vd
-    explicit PWVectorCoefficient(int vd): VectorCoefficient(vd) {}
- 
-@@ -629,10 +626,10 @@ public:
-       : VectorCoefficient(dim), TDFunction(std::move(TDF)), Q(q)
-    { }
- 
--   using VectorCoefficient::Eval;
-    /// Evaluate the vector coefficient at @a ip.
-    virtual void Eval(Vector &V, ElementTransformation &T,
-                      const IntegrationPoint &ip);
-+   using VectorCoefficient::Eval;
- 
-    virtual ~VectorFunctionCoefficient() { }
- };
-@@ -669,11 +666,11 @@ public:
-    double Eval(int i, ElementTransformation &T, const IntegrationPoint &ip)
-    { return Coeff[i] ? Coeff[i]->Eval(T, ip, GetTime()) : 0.0; }
- 
--   using VectorCoefficient::Eval;
-    /** @brief Evaluate the coefficient. Each element of vector V comes from the
-        associated array of scalar coefficients. */
-    virtual void Eval(Vector &V, ElementTransformation &T,
-                      const IntegrationPoint &ip);
-+   using VectorCoefficient::Eval;
- 
-    /// Destroys vector coefficient.
-    virtual ~VectorArrayCoefficient();
-@@ -728,7 +725,6 @@ protected:
-    const GridFunction *GridFunc;
- 
- public:
--
-    /** @brief Construct the coefficient with a scalar grid function @a gf. The
-        grid function is not owned by the coefficient. */
-    GradientGridFunctionCoefficient(const GridFunction *gf);
-@@ -769,10 +765,10 @@ public:
-    /// Get the vector grid function.
-    const GridFunction * GetGridFunction() const { return GridFunc; }
- 
--   using VectorCoefficient::Eval;
-    /// Evaluate the vector curl coefficient at @a ip.
-    virtual void Eval(Vector &V, ElementTransformation &T,
-                      const IntegrationPoint &ip);
-+   using VectorCoefficient::Eval;
- 
-    virtual ~CurlGridFunctionCoefficient() { }
- };
-@@ -861,12 +857,13 @@ public:
-    virtual void EvalDelta(Vector &V, ElementTransformation &T,
-                           const IntegrationPoint &ip);
- 
--   using VectorCoefficient::Eval;
-    /** @brief A VectorDeltaFunction cannot be evaluated. Calling this method
-        will cause an MFEM error, terminating the application. */
-    virtual void Eval(Vector &V, ElementTransformation &T,
-                      const IntegrationPoint &ip)
-    { mfem_error("VectorDeltaCoefficient::Eval"); }
-+   using VectorCoefficient::Eval;
-+
-    virtual ~VectorDeltaCoefficient() { }
- };
- 
-@@ -908,16 +905,15 @@ class MatrixCoefficient
- protected:
-    int height, width;
-    double time;
--   bool symmetric;  // deprecated
- 
- public:
-    /// Construct a dim x dim matrix coefficient.
--   explicit MatrixCoefficient(int dim, bool symm=false)
--   { height = width = dim; time = 0.; symmetric = symm; }
-+   explicit MatrixCoefficient(int dim)
-+   { height = width = dim; time = 0.; }
- 
-    /// Construct a h x w matrix coefficient.
--   MatrixCoefficient(int h, int w, bool symm=false) :
--      height(h), width(w), time(0.), symmetric(symm) { }
-+   MatrixCoefficient(int h, int w) :
-+      height(h), width(w), time(0.) { }
- 
-    /// Set the time for time dependent coefficients
-    virtual void SetTime(double t) { time = t; }
-@@ -934,9 +930,6 @@ public:
-    /// For backward compatibility get the width of the matrix.
-    int GetVDim() const { return width; }
- 
--   /** @deprecated Use SymmetricMatrixCoefficient instead */
--   bool IsSymmetric() const { return symmetric; }
--
-    /** @brief Evaluate the matrix coefficient in the element described by @a T
-        at the point @a ip, storing the result in @a K. */
-    /** @note When this method is called, the caller must make sure that the
-@@ -953,17 +946,6 @@ public:
-    /// the width of the matrix.
-    virtual void Project(QuadratureFunction &qf, bool transpose=false);
- 
--   /// (DEPRECATED) Evaluate a symmetric matrix coefficient.
--   /** @brief Evaluate the upper triangular entries of the matrix coefficient
--       in the symmetric case, similarly to Eval. Matrix entry (i,j) is stored
--       in K[j - i + os_i] for 0 <= i <= j < width, os_0 = 0,
--       os_{i+1} = os_i + width - i. That is, K = {M(0,0), ..., M(0,w-1),
--       M(1,1), ..., M(1,w-1), ..., M(w-1,w-1) with w = width.
--       @deprecated Use Eval() instead. */
--   virtual void EvalSymmetric(Vector &K, ElementTransformation &T,
--                              const IntegrationPoint &ip)
--   { mfem_error("MatrixCoefficient::EvalSymmetric"); }
--
-    virtual ~MatrixCoefficient() { }
- };
- 
-@@ -973,14 +955,17 @@ class MatrixConstantCoefficient : public MatrixCoefficient
- {
- private:
-    DenseMatrix mat;
-+
- public:
-    ///Construct using matrix @a m for the constant.
-    MatrixConstantCoefficient(const DenseMatrix &m)
-       : MatrixCoefficient(m.Height(), m.Width()), mat(m) { }
--   using MatrixCoefficient::Eval;
-+
-    /// Evaluate the matrix coefficient at @a ip.
-    virtual void Eval(DenseMatrix &M, ElementTransformation &T,
-                      const IntegrationPoint &ip) { M = mat; }
-+   using MatrixCoefficient::Eval;
-+
-    /// Return a reference to the constant matrix.
-    const DenseMatrix& GetMatrix() { return mat; }
- };
-@@ -1030,20 +1015,18 @@ private:
-                 const Array<MatrixCoefficient*> & coefs);
- 
- public:
--
-    /// Constructs a piecewise matrix coefficient of dimension dim by dim
--   explicit PWMatrixCoefficient(int dim, bool symm = false)
--      : MatrixCoefficient(dim, symm) {}
-+   explicit PWMatrixCoefficient(int dim)
-+      : MatrixCoefficient(dim) {}
- 
-    /// Constructs a piecewise matrix coefficient of dimension h by w
--   explicit PWMatrixCoefficient(int h, int w, bool symm = false)
--      : MatrixCoefficient(h, w, symm) {}
-+   explicit PWMatrixCoefficient(int h, int w)
-+      : MatrixCoefficient(h, w) {}
- 
-    /// Construct the coefficient using arrays describing the pieces
-    /** \param dim - size of the square matrix-valued result
-        \param attr - an array of attribute numbers for each piece
-        \param coefs - the corresponding array of MatrixCoefficient pointers
--       \param symm - true if the result will be symmetric, false otherwise
-        Any missing attributes or NULL coefficient pointers will result in a
-        zero matrix being returned.
- 
-@@ -1051,16 +1034,14 @@ public:
-        transferred to this object.
-    */
-    PWMatrixCoefficient(int dim, const Array<int> & attr,
--                       const Array<MatrixCoefficient*> & coefs,
--                       bool symm=false)
--      : MatrixCoefficient(dim, symm) { InitMap(attr, coefs); }
-+                       const Array<MatrixCoefficient*> & coefs)
-+      : MatrixCoefficient(dim) { InitMap(attr, coefs); }
- 
-    /// Construct the coefficient using arrays describing the pieces
-    /** \param h - height of the matrix-valued result
-        \param w - width of the matrix-valued result
-        \param attr - an array of attribute numbers for each piece
-        \param coefs - the corresponding array of MatrixCoefficient pointers
--       \param symm - true if the result will be symmetric, false otherwise
-        Any missing attributes or NULL coefficient pointers will result in a
-        zero matrix being returned for that attribute.
- 
-@@ -1068,9 +1049,8 @@ public:
-        transferred to this object.
-    */
-    PWMatrixCoefficient(int h, int w, const Array<int> & attr,
--                       const Array<MatrixCoefficient*> & coefs,
--                       bool symm=false)
--      : MatrixCoefficient(h, w, symm) { InitMap(attr, coefs); }
-+                       const Array<MatrixCoefficient*> & coefs)
-+      : MatrixCoefficient(h, w) { InitMap(attr, coefs); }
- 
-    /// Set the time for time dependent coefficients
-    virtual void SetTime(double t);
-@@ -1099,7 +1079,6 @@ class MatrixFunctionCoefficient : public MatrixCoefficient
- {
- private:
-    std::function<void(const Vector &, DenseMatrix &)> Function;
--   std::function<void(const Vector &, Vector &)> SymmFunction;  // deprecated
-    std::function<void(const Vector &, double, DenseMatrix &)> TDFunction;
- 
-    Coefficient *Q;
-@@ -1123,28 +1102,6 @@ public:
-       : MatrixCoefficient(m.Height(), m.Width()), Q(&q), mat(m)
-    { }
- 
--   /** @brief Define a time-independent symmetric square matrix coefficient from
--       a std function */
--   /** \param dim - the size of the matrix
--       \param SymmF - function used in EvalSymmetric
--       \param q - optional scalar Coefficient to scale the matrix coefficient
--       @deprecated Use another constructor without setting SymmFunction. */
--   MatrixFunctionCoefficient(int dim,
--                             std::function<void(const Vector &, Vector &)> SymmF,
--                             Coefficient *q = NULL)
--      : MatrixCoefficient(dim, true), SymmFunction(std::move(SymmF)), Q(q), mat(0)
--   { }
--
--   /// Define a time-dependent square matrix coefficient from a std function
--   /** \param dim - the size of the matrix
--       \param TDF - time-dependent function
--       \param q - optional scalar Coefficient to scale the matrix coefficient */
--   MatrixFunctionCoefficient(int dim,
--                             std::function<void(const Vector &, double, DenseMatrix &)> TDF,
--                             Coefficient *q = nullptr)
--      : MatrixCoefficient(dim), TDFunction(std::move(TDF)), Q(q)
--   { }
--
-    /// Set the time for internally stored coefficients
-    void SetTime(double t);
- 
-@@ -1152,11 +1109,6 @@ public:
-    virtual void Eval(DenseMatrix &K, ElementTransformation &T,
-                      const IntegrationPoint &ip);
- 
--   /// (DEPRECATED) Evaluate the symmetric matrix coefficient at @a ip.
--   /** @deprecated Use Eval() instead. */
--   virtual void EvalSymmetric(Vector &K, ElementTransformation &T,
--                              const IntegrationPoint &ip);
--
-    virtual ~MatrixFunctionCoefficient() { }
- };
- 
-@@ -1186,12 +1138,11 @@ public:
-        can be overridden with the @a own parameter. */
-    void Set(int i, int j, Coefficient * c, bool own=true);
- 
--   using MatrixCoefficient::Eval;
--
-    /// Evaluate coefficient located at (i,j) in the matrix using integration
-    /// point @a ip.
-    double Eval(int i, int j, ElementTransformation &T, const IntegrationPoint &ip)
-    { return Coeff[i*width+j] ? Coeff[i*width+j] -> Eval(T, ip, GetTime()) : 0.0; }
-+   using MatrixCoefficient::Eval;
- 
-    /// Evaluate the matrix coefficient @a ip.
-    virtual void Eval(DenseMatrix &K, ElementTransformation &T,
-@@ -1294,10 +1245,11 @@ class SymmetricMatrixCoefficient : public MatrixCoefficient
- protected:
-    /// Internal matrix used when evaluating this coefficient as a DenseMatrix.
-    DenseSymmetricMatrix mat;
-+
- public:
-    /// Construct a dim x dim matrix coefficient.
-    explicit SymmetricMatrixCoefficient(int dimension)
--      : MatrixCoefficient(dimension, true) { }
-+      : MatrixCoefficient(dimension) { }
- 
-    /// Get the size of the matrix.
-    int GetSize() const { return height; }
-@@ -1347,10 +1299,11 @@ public:
-    ///Construct using matrix @a m for the constant.
-    SymmetricMatrixConstantCoefficient(const DenseSymmetricMatrix &m)
-       : SymmetricMatrixCoefficient(m.Height()), mat(m) { }
--   using SymmetricMatrixCoefficient::Eval;
-+
-    /// Evaluate the matrix coefficient at @a ip.
-    virtual void Eval(DenseSymmetricMatrix &M, ElementTransformation &T,
-                      const IntegrationPoint &ip) { M = mat; }
-+   using SymmetricMatrixCoefficient::Eval;
- };
- 
- 
-@@ -1398,10 +1351,10 @@ public:
-    /// Set the time for internally stored coefficients
-    void SetTime(double t);
- 
--   using SymmetricMatrixCoefficient::Eval;
-    /// Evaluate the matrix coefficient at @a ip.
-    virtual void Eval(DenseSymmetricMatrix &K, ElementTransformation &T,
-                      const IntegrationPoint &ip);
-+   using SymmetricMatrixCoefficient::Eval;
- 
-    virtual ~SymmetricMatrixFunctionCoefficient() { }
- };
-@@ -1548,6 +1501,7 @@ private:
- 
-    mutable Vector va;
-    mutable Vector vb;
-+
- public:
-    /// Construct with the two vector coefficients.  Result is \f$ A \cdot B \f$.
-    InnerProductCoefficient(VectorCoefficient &A, VectorCoefficient &B);
-@@ -2120,9 +2074,9 @@ public:
- 
-    const QuadratureFunction& GetQuadFunction() const { return QuadF; }
- 
--   using VectorCoefficient::Eval;
-    virtual void Eval(Vector &V, ElementTransformation &T,
-                      const IntegrationPoint &ip);
-+   using VectorCoefficient::Eval;
- 
-    virtual void Project(QuadratureFunction &qf);
- 
-@@ -2186,6 +2140,7 @@ protected:
-    int vdim; ///< Number of values per quadrature point.
-    QuadratureSpaceBase &qs; ///< Associated QuadratureSpaceBase.
-    QuadratureFunction *qf; ///< Internal QuadratureFunction (owned, may be NULL).
-+
- public:
-    /// Create an empty CoefficientVector.
-    CoefficientVector(QuadratureSpaceBase &qs_,
-@@ -2287,3 +2242,4 @@ double ComputeGlobalLpNorm(double p, VectorCoefficient &coeff, ParMesh &pmesh,
- }
- 
- #endif
-+
-diff --git a/fem/dgmassinv.cpp b/fem/dgmassinv.cpp
-index 88774b3ad..3cff5d05d 100644
---- a/fem/dgmassinv.cpp
-+++ b/fem/dgmassinv.cpp
-@@ -107,7 +107,7 @@ void DGMassInverse::Update()
- {
-    M->Assemble();
-    M->AssembleDiagonal(diag_inv);
--   internal::MakeReciprocal(diag_inv.Size(), diag_inv.ReadWrite());
-+   diag_inv.Reciprocal();
- }
- 
- DGMassInverse::~DGMassInverse()
-diff --git a/fem/dgmassinv_kernels.hpp b/fem/dgmassinv_kernels.hpp
-index c497621d4..e78a9bc98 100644
---- a/fem/dgmassinv_kernels.hpp
-+++ b/fem/dgmassinv_kernels.hpp
-@@ -12,9 +12,9 @@
- #ifndef MFEM_DGMASSINV_KERNELS_HPP
- #define MFEM_DGMASSINV_KERNELS_HPP
- 
--#include "bilininteg_mass_pa.hpp"
- #include "../linalg/kernels.hpp"
- #include "kernels.hpp"
-+#include "integ/bilininteg_mass_kernels.hpp"
- 
- namespace mfem
- {
-@@ -22,11 +22,6 @@ namespace mfem
- namespace internal
- {
- 
--void MakeReciprocal(int n, double *x)
--{
--   mfem::forall(n, [=] MFEM_HOST_DEVICE (int i) { x[i] = 1.0/x[i]; });
--}
--
- template <int DIM, int D1D, int Q1D>
- MFEM_HOST_DEVICE inline
- void DGMassApply(const int e,
-diff --git a/fem/fespace.cpp b/fem/fespace.cpp
-index 29a1aa980..cb82c6008 100644
---- a/fem/fespace.cpp
-+++ b/fem/fespace.cpp
-@@ -428,15 +428,6 @@ void FiniteElementSpace::BuildFaceToDofTable() const
-    face_dof = fc_dof;
- }
- 
--void FiniteElementSpace::RebuildElementToDofTable()
--{
--   delete elem_dof;
--   delete elem_fos;
--   elem_dof = NULL;
--   elem_fos = NULL;
--   BuildElementToDofTable();
--}
--
- void FiniteElementSpace::ReorderElementToDofTable()
- {
-    Array<int> dof_marker(ndofs);
-@@ -1256,7 +1247,7 @@ int FiniteElementSpace::GetNConformingDofs() const
-    return P ? (P->Width() / vdim) : ndofs;
- }
- 
--const ElementRestrictionOperator *FiniteElementSpace::GetElementRestriction(
-+const ElementRestriction *FiniteElementSpace::GetElementRestriction(
-    ElementDofOrdering e_ordering) const
- {
-    // Check if we have a discontinuous space using the FE collection:
-@@ -1271,22 +1262,22 @@ const ElementRestrictionOperator *FiniteElementSpace::GetElementRestriction(
-          // The output E-vector layout is: ND x VDIM x NE.
-          L2E_nat.Reset(new L2ElementRestriction(*this));
-       }
--      return L2E_nat.Is<ElementRestrictionOperator>();
-+      return L2E_nat.Is<ElementRestriction>();
-    }
-    if (e_ordering == ElementDofOrdering::LEXICOGRAPHIC)
-    {
-       if (L2E_lex.Ptr() == NULL)
-       {
--         L2E_lex.Reset(new ElementRestriction(*this, e_ordering));
-+         L2E_lex.Reset(new ConformingElementRestriction(*this, e_ordering));
-       }
--      return L2E_lex.Is<ElementRestrictionOperator>();
-+      return L2E_lex.Is<ElementRestriction>();
-    }
-    // e_ordering == ElementDofOrdering::NATIVE
-    if (L2E_nat.Ptr() == NULL)
-    {
--      L2E_nat.Reset(new ElementRestriction(*this, e_ordering));
-+      L2E_nat.Reset(new ConformingElementRestriction(*this, e_ordering));
-    }
--   return L2E_nat.Is<ElementRestrictionOperator>();
-+   return L2E_nat.Is<ElementRestriction>();
- }
- 
- const FaceRestriction *FiniteElementSpace::GetFaceRestriction(
-diff --git a/fem/fespace.hpp b/fem/fespace.hpp
-index f777bf871..00b290c09 100644
---- a/fem/fespace.hpp
-+++ b/fem/fespace.hpp
-@@ -602,18 +602,6 @@ public:
-    virtual const Operator *GetProlongationMatrix() const
-    { return GetConformingProlongation(); }
- 
--   /// Return an operator that performs the transpose of GetRestrictionOperator
--   /** The returned operator is owned by the FiniteElementSpace. In serial this
--       is the same as GetProlongationMatrix() */
--   virtual const Operator *GetRestrictionTransposeOperator() const
--   { return GetConformingProlongation(); }
--
--   /// An abstract operator that performs the same action as GetRestrictionMatrix
--   /** In some cases this is an optimized matrix-free implementation. The
--       returned operator is owned by the FiniteElementSpace. */
--   virtual const Operator *GetRestrictionOperator() const
--   { return GetConformingRestriction(); }
--
-    /// The returned SparseMatrix is owned by the FiniteElementSpace.
-    virtual const SparseMatrix *GetRestrictionMatrix() const
-    { return GetConformingRestriction(); }
-@@ -639,7 +627,7 @@ public:
-        L2ElementRestriction class.
- 
-        The returned Operator is owned by the FiniteElementSpace. */
--   const ElementRestrictionOperator *GetElementRestriction(
-+   const ElementRestriction *GetElementRestriction(
-       ElementDofOrdering e_ordering) const;
- 
-    /// Return an Operator that converts L-vectors to E-vectors on each face.
-@@ -1058,9 +1046,6 @@ public:
-    void GetEdgeInteriorVDofs(int i, Array<int> &vdofs) const;
-    /// @}
- 
--   /// (@deprecated) Use the Update() method if the space or mesh changed.
--   MFEM_DEPRECATED void RebuildElementToDofTable();
--
-    /** @brief Reorder the scalar DOFs based on the element ordering.
- 
-        The new ordering is constructed as follows: 1) loop over all elements as
-diff --git a/fem/bilininteg_br2.cpp b/fem/integ/bilininteg_br2.cpp
-similarity index 99%
-rename from fem/bilininteg_br2.cpp
-rename to fem/integ/bilininteg_br2.cpp
-index dba87a8b5..159947029 100644
---- a/fem/bilininteg_br2.cpp
-+++ b/fem/integ/bilininteg_br2.cpp
-@@ -9,8 +9,8 @@
- // terms of the BSD-3 license. We welcome feedback and contributions, see file
- // CONTRIBUTING.md for details.
- 
--#include "bilininteg.hpp"
--#include "pfespace.hpp"
-+#include "../bilininteg.hpp"
-+#include "../pfespace.hpp"
- #include <algorithm>
- 
- namespace mfem
-diff --git a/fem/bilininteg_convection_ea.cpp b/fem/integ/bilininteg_convection_ea.cpp
-similarity index 85%
-rename from fem/bilininteg_convection_ea.cpp
-rename to fem/integ/bilininteg_convection_ea.cpp
-index 52e3b4e81..08422ce86 100644
---- a/fem/bilininteg_convection_ea.cpp
-+++ b/fem/integ/bilininteg_convection_ea.cpp
-@@ -9,9 +9,9 @@
- // terms of the BSD-3 license. We welcome feedback and contributions, see file
- // CONTRIBUTING.md for details.
- 
--#include "../general/forall.hpp"
--#include "bilininteg.hpp"
--#include "gridfunc.hpp"
-+#include "../../general/forall.hpp"
-+#include "../bilininteg.hpp"
-+#include "../gridfunc.hpp"
- 
- namespace mfem
- {
-@@ -22,7 +22,6 @@ static void EAConvectionAssemble1D(const int NE,
-                                    const Array<double> &g,
-                                    const Vector &padata,
-                                    Vector &eadata,
--                                   const bool add,
-                                    const int d1d = 0,
-                                    const int q1d = 0)
- {
-@@ -55,14 +54,7 @@ static void EAConvectionAssemble1D(const int NE,
-             {
-                val += r_Bj[k1] * D(k1, e) * r_Gi[k1];
-             }
--            if (add)
--            {
--               A(i1, j1, e) += val;
--            }
--            else
--            {
--               A(i1, j1, e) = val;
--            }
-+            A(i1, j1, e) += val;
-          }
-       }
-    });
-@@ -74,7 +66,6 @@ static void EAConvectionAssemble2D(const int NE,
-                                    const Array<double> &g,
-                                    const Vector &padata,
-                                    Vector &eadata,
--                                   const bool add,
-                                    const int d1d = 0,
-                                    const int q1d = 0)
- {
-@@ -130,14 +121,7 @@ static void EAConvectionAssemble2D(const int NE,
-                                * r_B[k1][j1]* r_B[k2][j2];
-                      }
-                   }
--                  if (add)
--                  {
--                     A(i1, i2, j1, j2, e) += val;
--                  }
--                  else
--                  {
--                     A(i1, i2, j1, j2, e) = val;
--                  }
-+                  A(i1, i2, j1, j2, e) += val;
-                }
-             }
-          }
-@@ -151,7 +135,6 @@ static void EAConvectionAssemble3D(const int NE,
-                                    const Array<double> &g,
-                                    const Vector &padata,
-                                    Vector &eadata,
--                                   const bool add,
-                                    const int d1d = 0,
-                                    const int q1d = 0)
- {
-@@ -208,14 +191,7 @@ static void EAConvectionAssemble3D(const int NE,
-                               }
-                            }
-                         }
--                        if (add)
--                        {
--                           A(i1, i2, i3, j1, j2, j3, e) += val;
--                        }
--                        else
--                        {
--                           A(i1, i2, i3, j1, j2, j3, e) = val;
--                        }
-+                        A(i1, i2, i3, j1, j2, j3, e) += val;
-                      }
-                   }
-                }
-@@ -226,8 +202,7 @@ static void EAConvectionAssemble3D(const int NE,
- }
- 
- void ConvectionIntegrator::AssembleEA(const FiniteElementSpace &fes,
--                                      Vector &ea_data,
--                                      const bool add)
-+                                      Vector &ea_data)
- {
-    AssemblePA(fes);
-    ne = fes.GetMesh()->GetNE();
-@@ -237,15 +212,15 @@ void ConvectionIntegrator::AssembleEA(const FiniteElementSpace &fes,
-    {
-       switch ((dofs1D << 4 ) | quad1D)
-       {
--         case 0x22: return EAConvectionAssemble1D<2,2>(ne,B,G,pa_data,ea_data,add);
--         case 0x33: return EAConvectionAssemble1D<3,3>(ne,B,G,pa_data,ea_data,add);
--         case 0x44: return EAConvectionAssemble1D<4,4>(ne,B,G,pa_data,ea_data,add);
--         case 0x55: return EAConvectionAssemble1D<5,5>(ne,B,G,pa_data,ea_data,add);
--         case 0x66: return EAConvectionAssemble1D<6,6>(ne,B,G,pa_data,ea_data,add);
--         case 0x77: return EAConvectionAssemble1D<7,7>(ne,B,G,pa_data,ea_data,add);
--         case 0x88: return EAConvectionAssemble1D<8,8>(ne,B,G,pa_data,ea_data,add);
--         case 0x99: return EAConvectionAssemble1D<9,9>(ne,B,G,pa_data,ea_data,add);
--         default:   return EAConvectionAssemble1D(ne,B,G,pa_data,ea_data,add,
-+         case 0x22: return EAConvectionAssemble1D<2,2>(ne,B,G,pa_data,ea_data);
-+         case 0x33: return EAConvectionAssemble1D<3,3>(ne,B,G,pa_data,ea_data);
-+         case 0x44: return EAConvectionAssemble1D<4,4>(ne,B,G,pa_data,ea_data);
-+         case 0x55: return EAConvectionAssemble1D<5,5>(ne,B,G,pa_data,ea_data);
-+         case 0x66: return EAConvectionAssemble1D<6,6>(ne,B,G,pa_data,ea_data);
-+         case 0x77: return EAConvectionAssemble1D<7,7>(ne,B,G,pa_data,ea_data);
-+         case 0x88: return EAConvectionAssemble1D<8,8>(ne,B,G,pa_data,ea_data);
-+         case 0x99: return EAConvectionAssemble1D<9,9>(ne,B,G,pa_data,ea_data);
-+         default:   return EAConvectionAssemble1D(ne,B,G,pa_data,ea_data,
-                                                      dofs1D,quad1D);
-       }
-    }
-@@ -253,15 +228,15 @@ void ConvectionIntegrator::AssembleEA(const FiniteElementSpace &fes,
-    {
-       switch ((dofs1D << 4 ) | quad1D)
-       {
--         case 0x22: return EAConvectionAssemble2D<2,2>(ne,B,G,pa_data,ea_data,add);
--         case 0x33: return EAConvectionAssemble2D<3,3>(ne,B,G,pa_data,ea_data,add);
--         case 0x44: return EAConvectionAssemble2D<4,4>(ne,B,G,pa_data,ea_data,add);
--         case 0x55: return EAConvectionAssemble2D<5,5>(ne,B,G,pa_data,ea_data,add);
--         case 0x66: return EAConvectionAssemble2D<6,6>(ne,B,G,pa_data,ea_data,add);
--         case 0x77: return EAConvectionAssemble2D<7,7>(ne,B,G,pa_data,ea_data,add);
--         case 0x88: return EAConvectionAssemble2D<8,8>(ne,B,G,pa_data,ea_data,add);
--         case 0x99: return EAConvectionAssemble2D<9,9>(ne,B,G,pa_data,ea_data,add);
--         default:   return EAConvectionAssemble2D(ne,B,G,pa_data,ea_data,add,
-+         case 0x22: return EAConvectionAssemble2D<2,2>(ne,B,G,pa_data,ea_data);
-+         case 0x33: return EAConvectionAssemble2D<3,3>(ne,B,G,pa_data,ea_data);
-+         case 0x44: return EAConvectionAssemble2D<4,4>(ne,B,G,pa_data,ea_data);
-+         case 0x55: return EAConvectionAssemble2D<5,5>(ne,B,G,pa_data,ea_data);
-+         case 0x66: return EAConvectionAssemble2D<6,6>(ne,B,G,pa_data,ea_data);
-+         case 0x77: return EAConvectionAssemble2D<7,7>(ne,B,G,pa_data,ea_data);
-+         case 0x88: return EAConvectionAssemble2D<8,8>(ne,B,G,pa_data,ea_data);
-+         case 0x99: return EAConvectionAssemble2D<9,9>(ne,B,G,pa_data,ea_data);
-+         default:   return EAConvectionAssemble2D(ne,B,G,pa_data,ea_data,
-                                                      dofs1D,quad1D);
-       }
-    }
-@@ -269,14 +244,14 @@ void ConvectionIntegrator::AssembleEA(const FiniteElementSpace &fes,
-    {
-       switch ((dofs1D << 4 ) | quad1D)
-       {
--         case 0x23: return EAConvectionAssemble3D<2,3>(ne,B,G,pa_data,ea_data,add);
--         case 0x34: return EAConvectionAssemble3D<3,4>(ne,B,G,pa_data,ea_data,add);
--         case 0x45: return EAConvectionAssemble3D<4,5>(ne,B,G,pa_data,ea_data,add);
--         case 0x56: return EAConvectionAssemble3D<5,6>(ne,B,G,pa_data,ea_data,add);
--         case 0x67: return EAConvectionAssemble3D<6,7>(ne,B,G,pa_data,ea_data,add);
--         case 0x78: return EAConvectionAssemble3D<7,8>(ne,B,G,pa_data,ea_data,add);
--         case 0x89: return EAConvectionAssemble3D<8,9>(ne,B,G,pa_data,ea_data,add);
--         default:   return EAConvectionAssemble3D(ne,B,G,pa_data,ea_data,add,
-+         case 0x23: return EAConvectionAssemble3D<2,3>(ne,B,G,pa_data,ea_data);
-+         case 0x34: return EAConvectionAssemble3D<3,4>(ne,B,G,pa_data,ea_data);
-+         case 0x45: return EAConvectionAssemble3D<4,5>(ne,B,G,pa_data,ea_data);
-+         case 0x56: return EAConvectionAssemble3D<5,6>(ne,B,G,pa_data,ea_data);
-+         case 0x67: return EAConvectionAssemble3D<6,7>(ne,B,G,pa_data,ea_data);
-+         case 0x78: return EAConvectionAssemble3D<7,8>(ne,B,G,pa_data,ea_data);
-+         case 0x89: return EAConvectionAssemble3D<8,9>(ne,B,G,pa_data,ea_data);
-+         default:   return EAConvectionAssemble3D(ne,B,G,pa_data,ea_data,
-                                                      dofs1D,quad1D);
-       }
-    }
-diff --git a/fem/bilininteg_convection_mf.cpp b/fem/integ/bilininteg_convection_mf.cpp
-similarity index 92%
-rename from fem/bilininteg_convection_mf.cpp
-rename to fem/integ/bilininteg_convection_mf.cpp
-index 61520c135..bbaf82788 100644
---- a/fem/bilininteg_convection_mf.cpp
-+++ b/fem/integ/bilininteg_convection_mf.cpp
-@@ -9,12 +9,10 @@
- // terms of the BSD-3 license. We welcome feedback and contributions, see file
- // CONTRIBUTING.md for details.
- 
--#include "../general/forall.hpp"
--#include "bilininteg.hpp"
--#include "gridfunc.hpp"
--#include "ceed/integrators/convection/convection.hpp"
--
--using namespace std;
-+#include "../../general/forall.hpp"
-+#include "../bilininteg.hpp"
-+#include "../gridfunc.hpp"
-+#include "../ceed/integrators/convection/convection.hpp"
- 
- namespace mfem
- {
-diff --git a/fem/bilininteg_convection_pa.cpp b/fem/integ/bilininteg_convection_pa.cpp
-similarity index 90%
-rename from fem/bilininteg_convection_pa.cpp
-rename to fem/integ/bilininteg_convection_pa.cpp
-index 48080ecdf..25928f002 100644
---- a/fem/bilininteg_convection_pa.cpp
-+++ b/fem/integ/bilininteg_convection_pa.cpp
-@@ -9,18 +9,15 @@
- // terms of the BSD-3 license. We welcome feedback and contributions, see file
- // CONTRIBUTING.md for details.
- 
--#include "../general/forall.hpp"
--#include "bilininteg.hpp"
--#include "gridfunc.hpp"
--#include "qfunction.hpp"
--#include "ceed/integrators/convection/convection.hpp"
--#include "quadinterpolator.hpp"
-+#include "../../general/forall.hpp"
-+#include "../bilininteg.hpp"
-+#include "../gridfunc.hpp"
-+#include "../qfunction.hpp"
-+#include "../ceed/integrators/convection/convection.hpp"
- 
- namespace mfem
- {
- 
--// PA Convection Integrator
--
- // PA Convection Assemble 2D kernel
- static void PAConvectionSetup2D(const int NQ,
-                                 const int NE,
-@@ -115,38 +112,85 @@ static void PAConvectionSetup3D(const int NQ,
-    });
- }
- 
--static void PAConvectionSetup(const int dim,
--                              const int NQ,
--                              const int NE,
--                              const Array<double> &W,
--                              const Vector &J,
--                              const Vector &coeff,
--                              const double alpha,
--                              Vector &op)
-+void ConvectionIntegrator::AssemblePA(const FiniteElementSpace &fes)
- {
--   if (dim == 1) { MFEM_ABORT("dim==1 not supported in PAConvectionSetup"); }
--   if (dim == 2)
-+   const MemoryType mt = (pa_mt == MemoryType::DEFAULT) ?
-+                         Device::GetDeviceMemoryType() : pa_mt;
-+   // Assumes tensor-product elements
-+   Mesh *mesh = fes.GetMesh();
-+   const FiniteElement &el = *fes.GetFE(0);
-+   ElementTransformation &Trans = *fes.GetElementTransformation(0);
-+   const IntegrationRule *ir = IntRule ? IntRule : &GetRule(el, Trans);
-+   if (DeviceCanUseCeed())
-    {
--      PAConvectionSetup2D(NQ, NE, W, J, coeff, alpha, op);
-+      delete ceedOp;
-+      const bool mixed = mesh->GetNumGeometries(mesh->Dimension()) > 1 ||
-+                         fes.IsVariableOrder();
-+      if (mixed)
-+      {
-+         ceedOp = new ceed::MixedPAConvectionIntegrator(*this, fes, Q, alpha);
-+      }
-+      else
-+      {
-+         ceedOp = new ceed::PAConvectionIntegrator(fes, *ir, Q, alpha);
-+      }
-+      return;
-    }
--   if (dim == 3)
-+   const int dims = el.GetDim();
-+   const int symmDims = dims;
-+   nq = ir->GetNPoints();
-+   dim = mesh->Dimension();
-+   ne = fes.GetNE();
-+   geom = mesh->GetGeometricFactors(*ir, GeometricFactors::JACOBIANS, mt);
-+   maps = &el.GetDofToQuad(*ir, DofToQuad::TENSOR);
-+   dofs1D = maps->ndof;
-+   quad1D = maps->nqpt;
-+   pa_data.SetSize(symmDims * nq * ne, mt);
-+
-+   QuadratureSpace qs(*mesh, *ir);
-+   CoefficientVector vel(*Q, qs, CoefficientStorage::COMPRESSED);
-+
-+   if (dim == 1)
-    {
--      PAConvectionSetup3D(NQ, NE, W, J, coeff, alpha, op);
-+      MFEM_ABORT("dim==1 not supported in ConvectionIntegrator::AssemblePA");
-+   }
-+   else if (dim == 2)
-+   {
-+      PAConvectionSetup2D(nq, ne, ir->GetWeights(), geom->J,
-+                          vel, alpha, pa_data);
-+   }
-+   else if (dim == 3)
-+   {
-+      PAConvectionSetup3D(nq, ne, ir->GetWeights(), geom->J,
-+                          vel, alpha, pa_data);
-+   }
-+}
-+
-+void ConvectionIntegrator::AssembleDiagonalPA(Vector &diag)
-+{
-+   if (DeviceCanUseCeed())
-+   {
-+      ceedOp->GetDiagonal(diag);
-+   }
-+   else
-+   {
-+      MFEM_ABORT("AssembleDiagonalPA not yet implemented for"
-+                 " ConvectionIntegrator.");
-    }
- }
- 
- // PA Convection Apply 2D kernel
--template<int T_D1D = 0, int T_Q1D = 0> static
--void PAConvectionApply2D(const int ne,
--                         const Array<double> &b,
--                         const Array<double> &g,
--                         const Array<double> &bt,
--                         const Array<double> &gt,
--                         const Vector &op_,
--                         const Vector &x_,
--                         Vector &y_,
--                         const int d1d = 0,
--                         const int q1d = 0)
-+template<int T_D1D = 0, int T_Q1D = 0>
-+static void PAConvectionApply2D(const int ne,
-+                                const Array<double> &b,
-+                                const Array<double> &g,
-+                                const Array<double> &bt,
-+                                const Array<double> &gt,
-+                                const Vector &op_,
-+                                const Vector &x_,
-+                                Vector &y_,
-+                                const int d1d = 0,
-+                                const int q1d = 0)
- {
-    const int NE = ne;
-    const int D1D = T_D1D ? T_D1D : d1d;
-@@ -255,17 +299,17 @@ void PAConvectionApply2D(const int ne,
- }
- 
- // Optimized PA Convection Apply 2D kernel
--template<int T_D1D = 0, int T_Q1D = 0, int T_NBZ = 0> static
--void SmemPAConvectionApply2D(const int ne,
--                             const Array<double> &b,
--                             const Array<double> &g,
--                             const Array<double> &bt,
--                             const Array<double> &gt,
--                             const Vector &op_,
--                             const Vector &x_,
--                             Vector &y_,
--                             const int d1d = 0,
--                             const int q1d = 0)
-+template<int T_D1D = 0, int T_Q1D = 0, int T_NBZ = 0>
-+static void SmemPAConvectionApply2D(const int ne,
-+                                    const Array<double> &b,
-+                                    const Array<double> &g,
-+                                    const Array<double> &bt,
-+                                    const Array<double> &gt,
-+                                    const Vector &op_,
-+                                    const Vector &x_,
-+                                    Vector &y_,
-+                                    const int d1d = 0,
-+                                    const int q1d = 0)
- {
-    const int NE = ne;
-    const int D1D = T_D1D ? T_D1D : d1d;
-@@ -383,17 +427,17 @@ void SmemPAConvectionApply2D(const int ne,
- }
- 
- // PA Convection Apply 3D kernel
--template<int T_D1D = 0, int T_Q1D = 0> static
--void PAConvectionApply3D(const int ne,
--                         const Array<double> &b,
--                         const Array<double> &g,
--                         const Array<double> &bt,
--                         const Array<double> &gt,
--                         const Vector &op_,
--                         const Vector &x_,
--                         Vector &y_,
--                         const int d1d = 0,
--                         const int q1d = 0)
-+template<int T_D1D = 0, int T_Q1D = 0>
-+static void PAConvectionApply3D(const int ne,
-+                                const Array<double> &b,
-+                                const Array<double> &g,
-+                                const Array<double> &bt,
-+                                const Array<double> &gt,
-+                                const Vector &op_,
-+                                const Vector &x_,
-+                                Vector &y_,
-+                                const int d1d = 0,
-+                                const int q1d = 0)
- {
-    const int NE = ne;
-    const int D1D = T_D1D ? T_D1D : d1d;
-@@ -564,17 +608,17 @@ void PAConvectionApply3D(const int ne,
- }
- 
- // Optimized PA Convection Apply 3D kernel
--template<int T_D1D = 0, int T_Q1D = 0> static
--void SmemPAConvectionApply3D(const int ne,
--                             const Array<double> &b,
--                             const Array<double> &g,
--                             const Array<double> &bt,
--                             const Array<double> &gt,
--                             const Vector &op_,
--                             const Vector &x_,
--                             Vector &y_,
--                             const int d1d = 0,
--                             const int q1d = 0)
-+template<int T_D1D = 0, int T_Q1D = 0>
-+static void SmemPAConvectionApply3D(const int ne,
-+                                    const Array<double> &b,
-+                                    const Array<double> &g,
-+                                    const Array<double> &bt,
-+                                    const Array<double> &gt,
-+                                    const Vector &op_,
-+                                    const Vector &x_,
-+                                    Vector &y_,
-+                                    const int d1d = 0,
-+                                    const int q1d = 0)
- {
-    const int NE = ne;
-    const int D1D = T_D1D ? T_D1D : d1d;
-@@ -768,17 +812,17 @@ void SmemPAConvectionApply3D(const int ne,
- }
- 
- // PA Convection Apply 2D kernel
--template<int T_D1D = 0, int T_Q1D = 0> static
--void PAConvectionApplyT2D(const int ne,
--                          const Array<double> &b,
--                          const Array<double> &g,
--                          const Array<double> &bt,
--                          const Array<double> &gt,
--                          const Vector &op_,
--                          const Vector &x_,
--                          Vector &y_,
--                          const int d1d = 0,
--                          const int q1d = 0)
-+template<int T_D1D = 0, int T_Q1D = 0>
-+static void PAConvectionApplyT2D(const int ne,
-+                                 const Array<double> &b,
-+                                 const Array<double> &g,
-+                                 const Array<double> &bt,
-+                                 const Array<double> &gt,
-+                                 const Vector &op_,
-+                                 const Vector &x_,
-+                                 Vector &y_,
-+                                 const int d1d = 0,
-+                                 const int q1d = 0)
- {
-    const int NE = ne;
-    const int D1D = T_D1D ? T_D1D : d1d;
-@@ -883,17 +927,17 @@ void PAConvectionApplyT2D(const int ne,
- }
- 
- // Optimized PA Convection Apply 2D kernel
--template<int T_D1D = 0, int T_Q1D = 0, int T_NBZ = 0> static
--void SmemPAConvectionApplyT2D(const int ne,
--                              const Array<double> &b,
--                              const Array<double> &g,
--                              const Array<double> &bt,
--                              const Array<double> &gt,
--                              const Vector &op_,
--                              const Vector &x_,
--                              Vector &y_,
--                              const int d1d = 0,
--                              const int q1d = 0)
-+template<int T_D1D = 0, int T_Q1D = 0, int T_NBZ = 0>
-+static void SmemPAConvectionApplyT2D(const int ne,
-+                                     const Array<double> &b,
-+                                     const Array<double> &g,
-+                                     const Array<double> &bt,
-+                                     const Array<double> &gt,
-+                                     const Vector &op_,
-+                                     const Vector &x_,
-+                                     Vector &y_,
-+                                     const int d1d = 0,
-+                                     const int q1d = 0)
- {
-    const int NE = ne;
-    const int D1D = T_D1D ? T_D1D : d1d;
-@@ -1006,17 +1050,17 @@ void SmemPAConvectionApplyT2D(const int ne,
- }
- 
- // PA Convection Apply 3D kernel
--template<int T_D1D = 0, int T_Q1D = 0> static
--void PAConvectionApplyT3D(const int ne,
--                          const Array<double> &b,
--                          const Array<double> &g,
--                          const Array<double> &bt,
--                          const Array<double> &gt,
--                          const Vector &op_,
--                          const Vector &x_,
--                          Vector &y_,
--                          const int d1d = 0,
--                          const int q1d = 0)
-+template<int T_D1D = 0, int T_Q1D = 0>
-+static void PAConvectionApplyT3D(const int ne,
-+                                 const Array<double> &b,
-+                                 const Array<double> &g,
-+                                 const Array<double> &bt,
-+                                 const Array<double> &gt,
-+                                 const Vector &op_,
-+                                 const Vector &x_,
-+                                 Vector &y_,
-+                                 const int d1d = 0,
-+                                 const int q1d = 0)
- {
-    const int NE = ne;
-    const int D1D = T_D1D ? T_D1D : d1d;
-@@ -1182,17 +1226,17 @@ void PAConvectionApplyT3D(const int ne,
- }
- 
- // Optimized PA Convection Apply 3D kernel
--template<int T_D1D = 0, int T_Q1D = 0> static
--void SmemPAConvectionApplyT3D(const int ne,
--                              const Array<double> &b,
--                              const Array<double> &g,
--                              const Array<double> &bt,
--                              const Array<double> &gt,
--                              const Vector &op_,
--                              const Vector &x_,
--                              Vector &y_,
--                              const int d1d = 0,
--                              const int q1d = 0)
-+template<int T_D1D = 0, int T_Q1D = 0>
-+static void SmemPAConvectionApplyT3D(const int ne,
-+                                     const Array<double> &b,
-+                                     const Array<double> &g,
-+                                     const Array<double> &bt,
-+                                     const Array<double> &gt,
-+                                     const Vector &op_,
-+                                     const Vector &x_,
-+                                     Vector &y_,
-+                                     const int d1d = 0,
-+                                     const int q1d = 0)
- {
-    const int NE = ne;
-    const int D1D = T_D1D ? T_D1D : d1d;
-@@ -1375,48 +1419,6 @@ void SmemPAConvectionApplyT3D(const int ne,
-    });
- }
- 
--void ConvectionIntegrator::AssemblePA(const FiniteElementSpace &fes)
--{
--   const MemoryType mt = (pa_mt == MemoryType::DEFAULT) ?
--                         Device::GetDeviceMemoryType() : pa_mt;
--   // Assumes tensor-product elements
--   Mesh *mesh = fes.GetMesh();
--   const FiniteElement &el = *fes.GetFE(0);
--   ElementTransformation &Trans = *fes.GetElementTransformation(0);
--   const IntegrationRule *ir = IntRule ? IntRule : &GetRule(el, Trans);
--   if (DeviceCanUseCeed())
--   {
--      delete ceedOp;
--      const bool mixed = mesh->GetNumGeometries(mesh->Dimension()) > 1 ||
--                         fes.IsVariableOrder();
--      if (mixed)
--      {
--         ceedOp = new ceed::MixedPAConvectionIntegrator(*this, fes, Q, alpha);
--      }
--      else
--      {
--         ceedOp = new ceed::PAConvectionIntegrator(fes, *ir, Q, alpha);
--      }
--      return;
--   }
--   const int dims = el.GetDim();
--   const int symmDims = dims;
--   nq = ir->GetNPoints();
--   dim = mesh->Dimension();
--   ne = fes.GetNE();
--   geom = mesh->GetGeometricFactors(*ir, GeometricFactors::JACOBIANS, mt);
--   maps = &el.GetDofToQuad(*ir, DofToQuad::TENSOR);
--   dofs1D = maps->ndof;
--   quad1D = maps->nqpt;
--   pa_data.SetSize(symmDims * nq * ne, mt);
--
--   QuadratureSpace qs(*mesh, *ir);
--   CoefficientVector vel(*Q, qs, CoefficientStorage::COMPRESSED);
--
--   PAConvectionSetup(dim, nq, ne, ir->GetWeights(), geom->J,
--                     vel, alpha, pa_data);
--}
--
- static void PAConvectionApply(const int dim,
-                               const int D1D,
-                               const int Q1D,
-@@ -1521,7 +1523,6 @@ static void PAConvectionApplyT(const int dim,
-    MFEM_ABORT("Unknown kernel.");
- }
- 
--// PA Convection Apply kernel
- void ConvectionIntegrator::AddMultPA(const Vector &x, Vector &y) const
- {
-    if (DeviceCanUseCeed())
-@@ -1536,12 +1537,11 @@ void ConvectionIntegrator::AddMultPA(const Vector &x, Vector &y) const
-    }
- }
- 
--// PA Convection Apply transpose kernel
- void ConvectionIntegrator::AddMultTransposePA(const Vector &x, Vector &y) const
- {
-    if (DeviceCanUseCeed())
-    {
--      MFEM_ABORT("AddMultPA not yet implemented with libCEED for"
-+      MFEM_ABORT("AddMultTransposePA not yet implemented with libCEED for"
-                  " ConvectionIntegrator.");
-    }
-    else
-@@ -1552,17 +1552,4 @@ void ConvectionIntegrator::AddMultTransposePA(const Vector &x, Vector &y) const
-    }
- }
- 
--void ConvectionIntegrator::AssembleDiagonalPA(Vector &diag)
--{
--   if (DeviceCanUseCeed())
--   {
--      ceedOp->GetDiagonal(diag);
--   }
--   else
--   {
--      MFEM_ABORT("AssembleDiagonalPA not yet implemented for"
--                 " ConvectionIntegrator.");
--   }
--}
--
- } // namespace mfem
-diff --git a/fem/integ/bilininteg_curlcurl_pa.cpp b/fem/integ/bilininteg_curlcurl_pa.cpp
-new file mode 100644
-index 000000000..3d12d978a
---- /dev/null
-+++ b/fem/integ/bilininteg_curlcurl_pa.cpp
-@@ -0,0 +1,208 @@
-+// Copyright (c) 2010-2023, Lawrence Livermore National Security, LLC. Produced
-+// at the Lawrence Livermore National Laboratory. All Rights reserved. See files
-+// LICENSE and NOTICE for details. LLNL-CODE-806117.
-+//
-+// This file is part of the MFEM library. For more information and source code
-+// availability visit https://mfem.org.
-+//
-+// MFEM is free software; you can redistribute it and/or modify it under the
-+// terms of the BSD-3 license. We welcome feedback and contributions, see file
-+// CONTRIBUTING.md for details.
-+
-+#include "../../general/forall.hpp"
-+#include "../bilininteg.hpp"
-+#include "../gridfunc.hpp"
-+#include "../qfunction.hpp"
-+#include "bilininteg_hcurl_kernels.hpp"
-+
-+namespace mfem
-+{
-+
-+void CurlCurlIntegrator::AssemblePA(const FiniteElementSpace &fes)
-+{
-+   // Assumes tensor-product elements
-+   Mesh *mesh = fes.GetMesh();
-+   const FiniteElement *fel = fes.GetFE(0);
-+
-+   const VectorTensorFiniteElement *el =
-+      dynamic_cast<const VectorTensorFiniteElement*>(fel);
-+   MFEM_VERIFY(el != NULL, "Only VectorTensorFiniteElement is supported!");
-+
-+   const IntegrationRule *ir
-+      = IntRule ? IntRule : &MassIntegrator::GetRule(*el, *el,
-+                                                     *mesh->GetElementTransformation(0));
-+
-+   const int dims = el->GetDim();
-+   MFEM_VERIFY(dims == 2 || dims == 3, "");
-+
-+   nq = ir->GetNPoints();
-+   dim = mesh->Dimension();
-+   MFEM_VERIFY(dim == 2 || dim == 3, "");
-+
-+   ne = fes.GetNE();
-+   geom = mesh->GetGeometricFactors(*ir, GeometricFactors::JACOBIANS);
-+   mapsC = &el->GetDofToQuad(*ir, DofToQuad::TENSOR);
-+   mapsO = &el->GetDofToQuadOpen(*ir, DofToQuad::TENSOR);
-+   dofs1D = mapsC->ndof;
-+   quad1D = mapsC->nqpt;
-+
-+   MFEM_VERIFY(dofs1D == mapsO->ndof + 1 && quad1D == mapsO->nqpt, "");
-+
-+   QuadratureSpace qs(*mesh, *ir);
-+   CoefficientVector coeff(qs, CoefficientStorage::SYMMETRIC);
-+   if (Q) { coeff.Project(*Q); }
-+   else if (MQ) { coeff.ProjectTranspose(*MQ); }
-+   else if (DQ) { coeff.Project(*DQ); }
-+   else { coeff.SetConstant(1.0); }
-+
-+   const int coeff_dim = coeff.GetVDim();
-+   symmetric = (coeff_dim != dim*dim);
-+   const int sym_dims = (dims * (dims + 1)) / 2; // 1x1: 1, 2x2: 3, 3x3: 6
-+   const int ndata = (dim == 2) ? 1 : (symmetric ? sym_dims : dim*dim);
-+   pa_data.SetSize(ndata * nq * ne, Device::GetMemoryType());
-+
-+   if (el->GetDerivType() != mfem::FiniteElement::CURL)
-+   {
-+      MFEM_ABORT("Unknown kernel.");
-+   }
-+
-+   if (dim == 3)
-+   {
-+      internal::PACurlCurlSetup3D(quad1D, coeff_dim, ne, ir->GetWeights(), geom->J,
-+                                  coeff, pa_data);
-+   }
-+   else
-+   {
-+      internal::PACurlCurlSetup2D(quad1D, ne, ir->GetWeights(), geom->J, coeff,
-+                                  pa_data);
-+   }
-+}
-+
-+void CurlCurlIntegrator::AssembleDiagonalPA(Vector& diag)
-+{
-+   if (dim == 3)
-+   {
-+      if (Device::Allows(Backend::DEVICE_MASK))
-+      {
-+         const int ID = (dofs1D << 4) | quad1D;
-+         switch (ID)
-+         {
-+            case 0x23:
-+               return internal::SmemPACurlCurlAssembleDiagonal3D<2,3>(
-+                         dofs1D,
-+                         quad1D,
-+                         symmetric, ne,
-+                         mapsO->B, mapsC->B,
-+                         mapsO->G, mapsC->G,
-+                         pa_data, diag);
-+            case 0x34:
-+               return internal::SmemPACurlCurlAssembleDiagonal3D<3,4>(
-+                         dofs1D,
-+                         quad1D,
-+                         symmetric, ne,
-+                         mapsO->B, mapsC->B,
-+                         mapsO->G, mapsC->G,
-+                         pa_data, diag);
-+            case 0x45:
-+               return internal::SmemPACurlCurlAssembleDiagonal3D<4,5>(
-+                         dofs1D,
-+                         quad1D,
-+                         symmetric, ne,
-+                         mapsO->B, mapsC->B,
-+                         mapsO->G, mapsC->G,
-+                         pa_data, diag);
-+            case 0x56:
-+               return internal::SmemPACurlCurlAssembleDiagonal3D<5,6>(
-+                         dofs1D,
-+                         quad1D,
-+                         symmetric, ne,
-+                         mapsO->B, mapsC->B,
-+                         mapsO->G, mapsC->G,
-+                         pa_data, diag);
-+            default:
-+               return internal::SmemPACurlCurlAssembleDiagonal3D(
-+                         dofs1D, quad1D,
-+                         symmetric, ne,
-+                         mapsO->B, mapsC->B,
-+                         mapsO->G, mapsC->G,
-+                         pa_data, diag);
-+         }
-+      }
-+      else
-+      {
-+         internal::PACurlCurlAssembleDiagonal3D(dofs1D, quad1D, symmetric, ne,
-+                                                mapsO->B, mapsC->B,
-+                                                mapsO->G, mapsC->G,
-+                                                pa_data, diag);
-+      }
-+   }
-+   else if (dim == 2)
-+   {
-+      internal::PACurlCurlAssembleDiagonal2D(dofs1D, quad1D, ne,
-+                                             mapsO->B, mapsC->G, pa_data, diag);
-+   }
-+   else
-+   {
-+      MFEM_ABORT("Unsupported dimension!");
-+   }
-+}
-+
-+void CurlCurlIntegrator::AddMultPA(const Vector &x, Vector &y) const
-+{
-+   if (dim == 3)
-+   {
-+      if (Device::Allows(Backend::DEVICE_MASK))
-+      {
-+         const int ID = (dofs1D << 4) | quad1D;
-+         switch (ID)
-+         {
-+            case 0x23:
-+               return internal::SmemPACurlCurlApply3D<2,3>(
-+                         dofs1D, quad1D,
-+                         symmetric, ne,
-+                         mapsO->B, mapsC->B, mapsO->Bt, mapsC->Bt,
-+                         mapsC->G, mapsC->Gt, pa_data, x, y);
-+            case 0x34:
-+               return internal::SmemPACurlCurlApply3D<3,4>(
-+                         dofs1D, quad1D,
-+                         symmetric, ne,
-+                         mapsO->B, mapsC->B, mapsO->Bt, mapsC->Bt,
-+                         mapsC->G, mapsC->Gt, pa_data, x, y);
-+            case 0x45:
-+               return internal::SmemPACurlCurlApply3D<4,5>(
-+                         dofs1D, quad1D,
-+                         symmetric, ne,
-+                         mapsO->B, mapsC->B, mapsO->Bt, mapsC->Bt,
-+                         mapsC->G, mapsC->Gt, pa_data, x, y);
-+            case 0x56:
-+               return internal::SmemPACurlCurlApply3D<5,6>(
-+                         dofs1D, quad1D,
-+                         symmetric, ne,
-+                         mapsO->B, mapsC->B, mapsO->Bt, mapsC->Bt,
-+                         mapsC->G, mapsC->Gt, pa_data, x, y);
-+            default:
-+               return internal::SmemPACurlCurlApply3D(
-+                         dofs1D, quad1D, symmetric, ne,
-+                         mapsO->B, mapsC->B, mapsO->Bt, mapsC->Bt,
-+                         mapsC->G, mapsC->Gt, pa_data, x, y);
-+         }
-+      }
-+      else
-+      {
-+         internal::PACurlCurlApply3D(dofs1D, quad1D, symmetric, ne, mapsO->B, mapsC->B,
-+                                     mapsO->Bt, mapsC->Bt, mapsC->G, mapsC->Gt,
-+                                     pa_data, x, y);
-+      }
-+   }
-+   else if (dim == 2)
-+   {
-+      internal::PACurlCurlApply2D(dofs1D, quad1D, ne, mapsO->B, mapsO->Bt,
-+                                  mapsC->G, mapsC->Gt, pa_data, x, y);
-+   }
-+   else
-+   {
-+      MFEM_ABORT("Unsupported dimension!");
-+   }
-+}
-+
-+} // namespace mfem
-diff --git a/fem/bilininteg_dgtrace_ea.cpp b/fem/integ/bilininteg_dgtrace_ea.cpp
-similarity index 75%
-rename from fem/bilininteg_dgtrace_ea.cpp
-rename to fem/integ/bilininteg_dgtrace_ea.cpp
-index c40d2ff46..602c266ab 100644
---- a/fem/bilininteg_dgtrace_ea.cpp
-+++ b/fem/integ/bilininteg_dgtrace_ea.cpp
-@@ -9,9 +9,9 @@
- // terms of the BSD-3 license. We welcome feedback and contributions, see file
- // CONTRIBUTING.md for details.
- 
--#include "../general/forall.hpp"
--#include "bilininteg.hpp"
--#include "gridfunc.hpp"
-+#include "../../general/forall.hpp"
-+#include "../bilininteg.hpp"
-+#include "../gridfunc.hpp"
- 
- namespace mfem
- {
-@@ -20,8 +20,7 @@ static void EADGTraceAssemble1DInt(const int NF,
-                                    const Array<double> &basis,
-                                    const Vector &padata,
-                                    Vector &eadata_int,
--                                   Vector &eadata_ext,
--                                   const bool add)
-+                                   Vector &eadata_ext)
- {
-    auto D = Reshape(padata.Read(), 2, 2, NF);
-    auto A_int = Reshape(eadata_int.ReadWrite(), 2, NF);
-@@ -33,41 +32,23 @@ static void EADGTraceAssemble1DInt(const int NF,
-       val_ext10 = D(1, 0, f);
-       val_ext01 = D(0, 1, f);
-       val_int1  = D(1, 1, f);
--      if (add)
--      {
--         A_int(0, f) += val_int0;
--         A_int(1, f) += val_int1;
--         A_ext(0, f) += val_ext01;
--         A_ext(1, f) += val_ext10;
--      }
--      else
--      {
--         A_int(0, f) = val_int0;
--         A_int(1, f) = val_int1;
--         A_ext(0, f) = val_ext01;
--         A_ext(1, f) = val_ext10;
--      }
-+      A_int(0, f) += val_int0;
-+      A_int(1, f) += val_int1;
-+      A_ext(0, f) += val_ext01;
-+      A_ext(1, f) += val_ext10;
-    });
- }
- 
- static void EADGTraceAssemble1DBdr(const int NF,
-                                    const Array<double> &basis,
-                                    const Vector &padata,
--                                   Vector &eadata_bdr,
--                                   const bool add)
-+                                   Vector &eadata_bdr)
- {
-    auto D = Reshape(padata.Read(), 2, 2, NF);
-    auto A_bdr = Reshape(eadata_bdr.ReadWrite(), NF);
-    mfem::forall(NF, [=] MFEM_HOST_DEVICE (int f)
-    {
--      if (add)
--      {
--         A_bdr(f) += D(0, 0, f);
--      }
--      else
--      {
--         A_bdr(f) = D(0, 0, f);
--      }
-+      A_bdr(f) += D(0, 0, f);
-    });
- }
- 
-@@ -77,7 +58,6 @@ static void EADGTraceAssemble2DInt(const int NF,
-                                    const Vector &padata,
-                                    Vector &eadata_int,
-                                    Vector &eadata_ext,
--                                   const bool add,
-                                    const int d1d = 0,
-                                    const int q1d = 0)
- {
-@@ -108,20 +88,10 @@ static void EADGTraceAssemble2DInt(const int NF,
-                val_ext10 += B(k1,i1) * B(k1,j1) * D(k1, 1, 0, f);
-                val_int1  += B(k1,i1) * B(k1,j1) * D(k1, 1, 1, f);
-             }
--            if (add)
--            {
--               A_int(i1, j1, 0, f) += val_int0;
--               A_int(i1, j1, 1, f) += val_int1;
--               A_ext(i1, j1, 0, f) += val_ext01;
--               A_ext(i1, j1, 1, f) += val_ext10;
--            }
--            else
--            {
--               A_int(i1, j1, 0, f) = val_int0;
--               A_int(i1, j1, 1, f) = val_int1;
--               A_ext(i1, j1, 0, f) = val_ext01;
--               A_ext(i1, j1, 1, f) = val_ext10;
--            }
-+            A_int(i1, j1, 0, f) += val_int0;
-+            A_int(i1, j1, 1, f) += val_int1;
-+            A_ext(i1, j1, 0, f) += val_ext01;
-+            A_ext(i1, j1, 1, f) += val_ext10;
-          }
-       }
-    });
-@@ -132,7 +102,6 @@ static void EADGTraceAssemble2DBdr(const int NF,
-                                    const Array<double> &basis,
-                                    const Vector &padata,
-                                    Vector &eadata_bdr,
--                                   const bool add,
-                                    const int d1d = 0,
-                                    const int q1d = 0)
- {
-@@ -156,14 +125,7 @@ static void EADGTraceAssemble2DBdr(const int NF,
-             {
-                val_bdr  += B(k1,i1) * B(k1,j1) * D(k1, 0, 0, f);
-             }
--            if (add)
--            {
--               A_bdr(i1, j1, f) += val_bdr;
--            }
--            else
--            {
--               A_bdr(i1, j1, f) = val_bdr;
--            }
-+            A_bdr(i1, j1, f) += val_bdr;
-          }
-       }
-    });
-@@ -175,7 +137,6 @@ static void EADGTraceAssemble3DInt(const int NF,
-                                    const Vector &padata,
-                                    Vector &eadata_int,
-                                    Vector &eadata_ext,
--                                   const bool add,
-                                    const int d1d = 0,
-                                    const int q1d = 0)
- {
-@@ -246,20 +207,10 @@ static void EADGTraceAssemble3DInt(const int NF,
-                                     * s_D[k1][k2][1][0];
-                      }
-                   }
--                  if (add)
--                  {
--                     A_int(i1, i2, j1, j2, 0, f) += val_int0;
--                     A_int(i1, i2, j1, j2, 1, f) += val_int1;
--                     A_ext(i1, i2, j1, j2, 0, f) += val_ext01;
--                     A_ext(i1, i2, j1, j2, 1, f) += val_ext10;
--                  }
--                  else
--                  {
--                     A_int(i1, i2, j1, j2, 0, f) = val_int0;
--                     A_int(i1, i2, j1, j2, 1, f) = val_int1;
--                     A_ext(i1, i2, j1, j2, 0, f) = val_ext01;
--                     A_ext(i1, i2, j1, j2, 1, f) = val_ext10;
--                  }
-+                  A_int(i1, i2, j1, j2, 0, f) += val_int0;
-+                  A_int(i1, i2, j1, j2, 1, f) += val_int1;
-+                  A_ext(i1, i2, j1, j2, 0, f) += val_ext01;
-+                  A_ext(i1, i2, j1, j2, 1, f) += val_ext10;
-                }
-             }
-          }
-@@ -272,7 +223,6 @@ static void EADGTraceAssemble3DBdr(const int NF,
-                                    const Array<double> &basis,
-                                    const Vector &padata,
-                                    Vector &eadata_bdr,
--                                   const bool add,
-                                    const int d1d = 0,
-                                    const int q1d = 0)
- {
-@@ -330,14 +280,7 @@ static void EADGTraceAssemble3DBdr(const int NF,
-                                    * s_D[k1][k2][0][0];
-                      }
-                   }
--                  if (add)
--                  {
--                     A_bdr(i1, i2, j1, j2, f) += val_bdr;
--                  }
--                  else
--                  {
--                     A_bdr(i1, i2, j1, j2, f) = val_bdr;
--                  }
-+                  A_bdr(i1, i2, j1, j2, f) += val_bdr;
-                }
-             }
-          }
-@@ -347,8 +290,7 @@ static void EADGTraceAssemble3DBdr(const int NF,
- 
- void DGTraceIntegrator::AssembleEAInteriorFaces(const FiniteElementSpace& fes,
-                                                 Vector &ea_data_int,
--                                                Vector &ea_data_ext,
--                                                const bool add)
-+                                                Vector &ea_data_ext)
- {
-    SetupPA(fes, FaceType::Interior);
-    nf = fes.GetNFbyType(FaceType::Interior);
-@@ -356,7 +298,7 @@ void DGTraceIntegrator::AssembleEAInteriorFaces(const FiniteElementSpace& fes,
-    const Array<double> &B = maps->B;
-    if (dim == 1)
-    {
--      return EADGTraceAssemble1DInt(nf,B,pa_data,ea_data_int,ea_data_ext,add);
-+      return EADGTraceAssemble1DInt(nf,B,pa_data,ea_data_int,ea_data_ext);
-    }
-    else if (dim == 2)
-    {
-@@ -364,31 +306,31 @@ void DGTraceIntegrator::AssembleEAInteriorFaces(const FiniteElementSpace& fes,
-       {
-          case 0x22:
-             return EADGTraceAssemble2DInt<2,2>(nf,B,pa_data,ea_data_int,
--                                               ea_data_ext,add);
-+                                               ea_data_ext);
-          case 0x33:
-             return EADGTraceAssemble2DInt<3,3>(nf,B,pa_data,ea_data_int,
--                                               ea_data_ext,add);
-+                                               ea_data_ext);
-          case 0x44:
-             return EADGTraceAssemble2DInt<4,4>(nf,B,pa_data,ea_data_int,
--                                               ea_data_ext,add);
-+                                               ea_data_ext);
-          case 0x55:
-             return EADGTraceAssemble2DInt<5,5>(nf,B,pa_data,ea_data_int,
--                                               ea_data_ext,add);
-+                                               ea_data_ext);
-          case 0x66:
-             return EADGTraceAssemble2DInt<6,6>(nf,B,pa_data,ea_data_int,
--                                               ea_data_ext,add);
-+                                               ea_data_ext);
-          case 0x77:
-             return EADGTraceAssemble2DInt<7,7>(nf,B,pa_data,ea_data_int,
--                                               ea_data_ext,add);
-+                                               ea_data_ext);
-          case 0x88:
-             return EADGTraceAssemble2DInt<8,8>(nf,B,pa_data,ea_data_int,
--                                               ea_data_ext,add);
-+                                               ea_data_ext);
-          case 0x99:
-             return EADGTraceAssemble2DInt<9,9>(nf,B,pa_data,ea_data_int,
--                                               ea_data_ext,add);
-+                                               ea_data_ext);
-          default:
-             return EADGTraceAssemble2DInt(nf,B,pa_data,ea_data_int,
--                                          ea_data_ext,add,dofs1D,quad1D);
-+                                          ea_data_ext,dofs1D,quad1D);
-       }
-    }
-    else if (dim == 3)
-@@ -397,36 +339,35 @@ void DGTraceIntegrator::AssembleEAInteriorFaces(const FiniteElementSpace& fes,
-       {
-          case 0x23:
-             return EADGTraceAssemble3DInt<2,3>(nf,B,pa_data,ea_data_int,
--                                               ea_data_ext,add);
-+                                               ea_data_ext);
-          case 0x34:
-             return EADGTraceAssemble3DInt<3,4>(nf,B,pa_data,ea_data_int,
--                                               ea_data_ext,add);
-+                                               ea_data_ext);
-          case 0x45:
-             return EADGTraceAssemble3DInt<4,5>(nf,B,pa_data,ea_data_int,
--                                               ea_data_ext,add);
-+                                               ea_data_ext);
-          case 0x56:
-             return EADGTraceAssemble3DInt<5,6>(nf,B,pa_data,ea_data_int,
--                                               ea_data_ext,add);
-+                                               ea_data_ext);
-          case 0x67:
-             return EADGTraceAssemble3DInt<6,7>(nf,B,pa_data,ea_data_int,
--                                               ea_data_ext,add);
-+                                               ea_data_ext);
-          case 0x78:
-             return EADGTraceAssemble3DInt<7,8>(nf,B,pa_data,ea_data_int,
--                                               ea_data_ext,add);
-+                                               ea_data_ext);
-          case 0x89:
-             return EADGTraceAssemble3DInt<8,9>(nf,B,pa_data,ea_data_int,
--                                               ea_data_ext,add);
-+                                               ea_data_ext);
-          default:
-             return EADGTraceAssemble3DInt(nf,B,pa_data,ea_data_int,
--                                          ea_data_ext,add,dofs1D,quad1D);
-+                                          ea_data_ext,dofs1D,quad1D);
-       }
-    }
-    MFEM_ABORT("Unknown kernel.");
- }
- 
- void DGTraceIntegrator::AssembleEABoundaryFaces(const FiniteElementSpace& fes,
--                                                Vector &ea_data_bdr,
--                                                const bool add)
-+                                                Vector &ea_data_bdr)
- {
-    SetupPA(fes, FaceType::Boundary);
-    nf = fes.GetNFbyType(FaceType::Boundary);
-@@ -434,37 +375,37 @@ void DGTraceIntegrator::AssembleEABoundaryFaces(const FiniteElementSpace& fes,
-    const Array<double> &B = maps->B;
-    if (dim == 1)
-    {
--      return EADGTraceAssemble1DBdr(nf,B,pa_data,ea_data_bdr,add);
-+      return EADGTraceAssemble1DBdr(nf,B,pa_data,ea_data_bdr);
-    }
-    else if (dim == 2)
-    {
-       switch ((dofs1D << 4 ) | quad1D)
-       {
--         case 0x22: return EADGTraceAssemble2DBdr<2,2>(nf,B,pa_data,ea_data_bdr,add);
--         case 0x33: return EADGTraceAssemble2DBdr<3,3>(nf,B,pa_data,ea_data_bdr,add);
--         case 0x44: return EADGTraceAssemble2DBdr<4,4>(nf,B,pa_data,ea_data_bdr,add);
--         case 0x55: return EADGTraceAssemble2DBdr<5,5>(nf,B,pa_data,ea_data_bdr,add);
--         case 0x66: return EADGTraceAssemble2DBdr<6,6>(nf,B,pa_data,ea_data_bdr,add);
--         case 0x77: return EADGTraceAssemble2DBdr<7,7>(nf,B,pa_data,ea_data_bdr,add);
--         case 0x88: return EADGTraceAssemble2DBdr<8,8>(nf,B,pa_data,ea_data_bdr,add);
--         case 0x99: return EADGTraceAssemble2DBdr<9,9>(nf,B,pa_data,ea_data_bdr,add);
-+         case 0x22: return EADGTraceAssemble2DBdr<2,2>(nf,B,pa_data,ea_data_bdr);
-+         case 0x33: return EADGTraceAssemble2DBdr<3,3>(nf,B,pa_data,ea_data_bdr);
-+         case 0x44: return EADGTraceAssemble2DBdr<4,4>(nf,B,pa_data,ea_data_bdr);
-+         case 0x55: return EADGTraceAssemble2DBdr<5,5>(nf,B,pa_data,ea_data_bdr);
-+         case 0x66: return EADGTraceAssemble2DBdr<6,6>(nf,B,pa_data,ea_data_bdr);
-+         case 0x77: return EADGTraceAssemble2DBdr<7,7>(nf,B,pa_data,ea_data_bdr);
-+         case 0x88: return EADGTraceAssemble2DBdr<8,8>(nf,B,pa_data,ea_data_bdr);
-+         case 0x99: return EADGTraceAssemble2DBdr<9,9>(nf,B,pa_data,ea_data_bdr);
-          default:
--            return EADGTraceAssemble2DBdr(nf,B,pa_data,ea_data_bdr,add,dofs1D,quad1D);
-+            return EADGTraceAssemble2DBdr(nf,B,pa_data,ea_data_bdr,dofs1D,quad1D);
-       }
-    }
-    else if (dim == 3)
-    {
-       switch ((dofs1D << 4 ) | quad1D)
-       {
--         case 0x23: return EADGTraceAssemble3DBdr<2,3>(nf,B,pa_data,ea_data_bdr,add);
--         case 0x34: return EADGTraceAssemble3DBdr<3,4>(nf,B,pa_data,ea_data_bdr,add);
--         case 0x45: return EADGTraceAssemble3DBdr<4,5>(nf,B,pa_data,ea_data_bdr,add);
--         case 0x56: return EADGTraceAssemble3DBdr<5,6>(nf,B,pa_data,ea_data_bdr,add);
--         case 0x67: return EADGTraceAssemble3DBdr<6,7>(nf,B,pa_data,ea_data_bdr,add);
--         case 0x78: return EADGTraceAssemble3DBdr<7,8>(nf,B,pa_data,ea_data_bdr,add);
--         case 0x89: return EADGTraceAssemble3DBdr<8,9>(nf,B,pa_data,ea_data_bdr,add);
-+         case 0x23: return EADGTraceAssemble3DBdr<2,3>(nf,B,pa_data,ea_data_bdr);
-+         case 0x34: return EADGTraceAssemble3DBdr<3,4>(nf,B,pa_data,ea_data_bdr);
-+         case 0x45: return EADGTraceAssemble3DBdr<4,5>(nf,B,pa_data,ea_data_bdr);
-+         case 0x56: return EADGTraceAssemble3DBdr<5,6>(nf,B,pa_data,ea_data_bdr);
-+         case 0x67: return EADGTraceAssemble3DBdr<6,7>(nf,B,pa_data,ea_data_bdr);
-+         case 0x78: return EADGTraceAssemble3DBdr<7,8>(nf,B,pa_data,ea_data_bdr);
-+         case 0x89: return EADGTraceAssemble3DBdr<8,9>(nf,B,pa_data,ea_data_bdr);
-          default:
--            return EADGTraceAssemble3DBdr(nf,B,pa_data,ea_data_bdr,add,dofs1D,quad1D);
-+            return EADGTraceAssemble3DBdr(nf,B,pa_data,ea_data_bdr,dofs1D,quad1D);
-       }
-    }
-    MFEM_ABORT("Unknown kernel.");
-diff --git a/fem/bilininteg_dgtrace_pa.cpp b/fem/integ/bilininteg_dgtrace_pa.cpp
-similarity index 90%
-rename from fem/bilininteg_dgtrace_pa.cpp
-rename to fem/integ/bilininteg_dgtrace_pa.cpp
-index 6987d3455..f4b8d837c 100644
---- a/fem/bilininteg_dgtrace_pa.cpp
-+++ b/fem/integ/bilininteg_dgtrace_pa.cpp
-@@ -9,16 +9,15 @@
- // terms of the BSD-3 license. We welcome feedback and contributions, see file
- // CONTRIBUTING.md for details.
- 
--#include "../general/forall.hpp"
--#include "bilininteg.hpp"
--#include "gridfunc.hpp"
--#include "qfunction.hpp"
--#include "restriction.hpp"
--
--using namespace std;
-+#include "../../general/forall.hpp"
-+#include "../bilininteg.hpp"
-+#include "../gridfunc.hpp"
-+#include "../qfunction.hpp"
-+#include "../restriction.hpp"
- 
- namespace mfem
- {
-+
- // PA DG Trace Integrator
- static void PADGTraceSetup2D(const int Q1D,
-                              const int NF,
-@@ -111,30 +110,6 @@ static void PADGTraceSetup3D(const int Q1D,
-    });
- }
- 
--static void PADGTraceSetup(const int dim,
--                           const int D1D,
--                           const int Q1D,
--                           const int NF,
--                           const Array<double> &W,
--                           const Vector &det,
--                           const Vector &nor,
--                           const Vector &rho,
--                           const Vector &u,
--                           const double alpha,
--                           const double beta,
--                           Vector &op)
--{
--   if (dim == 1) { MFEM_ABORT("dim==1 not supported in PADGTraceSetup"); }
--   if (dim == 2)
--   {
--      PADGTraceSetup2D(Q1D, NF, W, det, nor, rho, u, alpha, beta, op);
--   }
--   if (dim == 3)
--   {
--      PADGTraceSetup3D(Q1D, NF, W, det, nor, rho, u, alpha, beta, op);
--   }
--}
--
- void DGTraceIntegrator::SetupPA(const FiniteElementSpace &fes, FaceType type)
- {
-    const MemoryType mt = (pa_mt == MemoryType::DEFAULT) ?
-@@ -230,9 +205,21 @@ void DGTraceIntegrator::SetupPA(const FiniteElementSpace &fes, FaceType type)
-       }
-       MFEM_VERIFY(f_ind==nf, "Incorrect number of faces.");
-    }
--   PADGTraceSetup(dim, dofs1D, quad1D, nf, ir->GetWeights(),
--                  geom->detJ, geom->normal, r, vel,
--                  alpha, beta, pa_data);
-+
-+   if (dim == 1)
-+   {
-+      MFEM_ABORT("dim==1 not supported in DGTraceIntegrator::SetupPA");
-+   }
-+   else if (dim == 2)
-+   {
-+      PADGTraceSetup2D(quad1D, nf, ir->GetWeights(), geom->detJ, geom->normal,
-+                       r, vel, alpha, beta, pa_data);
-+   }
-+   else if (dim == 3)
-+   {
-+      PADGTraceSetup3D(quad1D, nf, ir->GetWeights(), geom->detJ, geom->normal,
-+                       r, vel, alpha, beta, pa_data);
-+   }
- }
- 
- void DGTraceIntegrator::AssemblePAInteriorFaces(const FiniteElementSpace& fes)
-@@ -246,15 +233,15 @@ void DGTraceIntegrator::AssemblePABoundaryFaces(const FiniteElementSpace& fes)
- }
- 
- // PA DGTrace Apply 2D kernel for Gauss-Lobatto/Bernstein
--template<int T_D1D = 0, int T_Q1D = 0> static
--void PADGTraceApply2D(const int NF,
--                      const Array<double> &b,
--                      const Array<double> &bt,
--                      const Vector &op_,
--                      const Vector &x_,
--                      Vector &y_,
--                      const int d1d = 0,
--                      const int q1d = 0)
-+template<int T_D1D = 0, int T_Q1D = 0>
-+static void PADGTraceApply2D(const int NF,
-+                             const Array<double> &b,
-+                             const Array<double> &bt,
-+                             const Vector &op_,
-+                             const Vector &x_,
-+                             Vector &y_,
-+                             const int d1d = 0,
-+                             const int q1d = 0)
- {
-    const int VDIM = 1;
-    const int D1D = T_D1D ? T_D1D : d1d;
-@@ -337,15 +324,15 @@ void PADGTraceApply2D(const int NF,
- }
- 
- // PA DGTrace Apply 3D kernel for Gauss-Lobatto/Bernstein
--template<int T_D1D = 0, int T_Q1D = 0> static
--void PADGTraceApply3D(const int NF,
--                      const Array<double> &b,
--                      const Array<double> &bt,
--                      const Vector &op_,
--                      const Vector &x_,
--                      Vector &y_,
--                      const int d1d = 0,
--                      const int q1d = 0)
-+template<int T_D1D = 0, int T_Q1D = 0>
-+static void PADGTraceApply3D(const int NF,
-+                             const Array<double> &b,
-+                             const Array<double> &bt,
-+                             const Vector &op_,
-+                             const Vector &x_,
-+                             Vector &y_,
-+                             const int d1d = 0,
-+                             const int q1d = 0)
- {
-    const int VDIM = 1;
-    const int D1D = T_D1D ? T_D1D : d1d;
-@@ -482,15 +469,15 @@ void PADGTraceApply3D(const int NF,
- }
- 
- // Optimized PA DGTrace Apply 3D kernel for Gauss-Lobatto/Bernstein
--template<int T_D1D = 0, int T_Q1D = 0, int T_NBZ = 0> static
--void SmemPADGTraceApply3D(const int NF,
--                          const Array<double> &b,
--                          const Array<double> &bt,
--                          const Vector &op_,
--                          const Vector &x_,
--                          Vector &y_,
--                          const int d1d = 0,
--                          const int q1d = 0)
-+template<int T_D1D = 0, int T_Q1D = 0, int T_NBZ = 0>
-+static void SmemPADGTraceApply3D(const int NF,
-+                                 const Array<double> &b,
-+                                 const Array<double> &bt,
-+                                 const Vector &op_,
-+                                 const Vector &x_,
-+                                 Vector &y_,
-+                                 const int d1d = 0,
-+                                 const int q1d = 0)
- {
-    const int D1D = T_D1D ? T_D1D : d1d;
-    const int Q1D = T_Q1D ? T_Q1D : q1d;
-@@ -647,15 +634,15 @@ static void PADGTraceApply(const int dim,
- }
- 
- // PA DGTrace Apply 2D kernel for Gauss-Lobatto/Bernstein
--template<int T_D1D = 0, int T_Q1D = 0> static
--void PADGTraceApplyTranspose2D(const int NF,
--                               const Array<double> &b,
--                               const Array<double> &bt,
--                               const Vector &op_,
--                               const Vector &x_,
--                               Vector &y_,
--                               const int d1d = 0,
--                               const int q1d = 0)
-+template<int T_D1D = 0, int T_Q1D = 0>
-+static void PADGTraceApplyTranspose2D(const int NF,
-+                                      const Array<double> &b,
-+                                      const Array<double> &bt,
-+                                      const Vector &op_,
-+                                      const Vector &x_,
-+                                      Vector &y_,
-+                                      const int d1d = 0,
-+                                      const int q1d = 0)
- {
-    const int VDIM = 1;
-    const int D1D = T_D1D ? T_D1D : d1d;
-@@ -743,15 +730,15 @@ void PADGTraceApplyTranspose2D(const int NF,
- }
- 
- // PA DGTrace Apply Transpose 3D kernel for Gauss-Lobatto/Bernstein
--template<int T_D1D = 0, int T_Q1D = 0> static
--void PADGTraceApplyTranspose3D(const int NF,
--                               const Array<double> &b,
--                               const Array<double> &bt,
--                               const Vector &op_,
--                               const Vector &x_,
--                               Vector &y_,
--                               const int d1d = 0,
--                               const int q1d = 0)
-+template<int T_D1D = 0, int T_Q1D = 0>
-+static void PADGTraceApplyTranspose3D(const int NF,
-+                                      const Array<double> &b,
-+                                      const Array<double> &bt,
-+                                      const Vector &op_,
-+                                      const Vector &x_,
-+                                      Vector &y_,
-+                                      const int d1d = 0,
-+                                      const int q1d = 0)
- {
-    const int VDIM = 1;
-    const int D1D = T_D1D ? T_D1D : d1d;
-@@ -899,15 +886,15 @@ void PADGTraceApplyTranspose3D(const int NF,
- }
- 
- // Optimized PA DGTrace Apply Transpose 3D kernel for Gauss-Lobatto/Bernstein
--template<int T_D1D = 0, int T_Q1D = 0, int T_NBZ = 0> static
--void SmemPADGTraceApplyTranspose3D(const int NF,
--                                   const Array<double> &b,
--                                   const Array<double> &bt,
--                                   const Vector &op_,
--                                   const Vector &x_,
--                                   Vector &y_,
--                                   const int d1d = 0,
--                                   const int q1d = 0)
-+template<int T_D1D = 0, int T_Q1D = 0, int T_NBZ = 0>
-+static void SmemPADGTraceApplyTranspose3D(const int NF,
-+                                          const Array<double> &b,
-+                                          const Array<double> &bt,
-+                                          const Vector &op_,
-+                                          const Vector &x_,
-+                                          Vector &y_,
-+                                          const int d1d = 0,
-+                                          const int q1d = 0)
- {
-    const int D1D = T_D1D ? T_D1D : d1d;
-    const int Q1D = T_Q1D ? T_Q1D : q1d;
-@@ -1076,7 +1063,6 @@ static void PADGTraceApplyTranspose(const int dim,
-    MFEM_ABORT("Unknown kernel.");
- }
- 
--// PA DGTraceIntegrator Apply kernel
- void DGTraceIntegrator::AddMultPA(const Vector &x, Vector &y) const
- {
-    PADGTraceApply(dim, dofs1D, quad1D, nf,
-diff --git a/fem/bilininteg_diffusion_ea.cpp b/fem/integ/bilininteg_diffusion_ea.cpp
-similarity index 87%
-rename from fem/bilininteg_diffusion_ea.cpp
-rename to fem/integ/bilininteg_diffusion_ea.cpp
-index c6b43053c..aa36233c4 100644
---- a/fem/bilininteg_diffusion_ea.cpp
-+++ b/fem/integ/bilininteg_diffusion_ea.cpp
-@@ -9,9 +9,9 @@
- // terms of the BSD-3 license. We welcome feedback and contributions, see file
- // CONTRIBUTING.md for details.
- 
--#include "../general/forall.hpp"
--#include "bilininteg.hpp"
--#include "gridfunc.hpp"
-+#include "../../general/forall.hpp"
-+#include "../bilininteg.hpp"
-+#include "../gridfunc.hpp"
- 
- namespace mfem
- {
-@@ -22,7 +22,6 @@ static void EADiffusionAssemble1D(const int NE,
-                                   const Array<double> &g,
-                                   const Vector &padata,
-                                   Vector &eadata,
--                                  const bool add,
-                                   const int d1d = 0,
-                                   const int q1d = 0)
- {
-@@ -54,14 +53,7 @@ static void EADiffusionAssemble1D(const int NE,
-             {
-                val += r_Gj[k1] * D(k1, e) * r_Gi[k1];
-             }
--            if (add)
--            {
--               A(i1, j1, e) += val;
--            }
--            else
--            {
--               A(i1, j1, e) = val;
--            }
-+            A(i1, j1, e) += val;
-          }
-       }
-    });
-@@ -73,7 +65,6 @@ static void EADiffusionAssemble2D(const int NE,
-                                   const Array<double> &g,
-                                   const Vector &padata,
-                                   Vector &eadata,
--                                  const bool add,
-                                   const int d1d = 0,
-                                   const int q1d = 0)
- {
-@@ -129,14 +120,7 @@ static void EADiffusionAssemble2D(const int NE,
-                                + gbi * D11 * gbj;
-                      }
-                   }
--                  if (add)
--                  {
--                     A(i1, i2, j1, j2, e) += val;
--                  }
--                  else
--                  {
--                     A(i1, i2, j1, j2, e) = val;
--                  }
-+                  A(i1, i2, j1, j2, e) += val;
-                }
-             }
-          }
-@@ -150,7 +134,6 @@ static void EADiffusionAssemble3D(const int NE,
-                                   const Array<double> &g,
-                                   const Vector &padata,
-                                   Vector &eadata,
--                                  const bool add,
-                                   const int d1d = 0,
-                                   const int q1d = 0)
- {
-@@ -225,14 +208,7 @@ static void EADiffusionAssemble3D(const int NE,
-                               }
-                            }
-                         }
--                        if (add)
--                        {
--                           A(i1, i2, i3, j1, j2, j3, e) += val;
--                        }
--                        else
--                        {
--                           A(i1, i2, i3, j1, j2, j3, e) = val;
--                        }
-+                        A(i1, i2, i3, j1, j2, j3, e) += val;
-                      }
-                   }
-                }
-@@ -243,8 +219,7 @@ static void EADiffusionAssemble3D(const int NE,
- }
- 
- void DiffusionIntegrator::AssembleEA(const FiniteElementSpace &fes,
--                                     Vector &ea_data,
--                                     const bool add)
-+                                     Vector &ea_data)
- {
-    AssemblePA(fes);
-    ne = fes.GetMesh()->GetNE();
-@@ -254,15 +229,15 @@ void DiffusionIntegrator::AssembleEA(const FiniteElementSpace &fes,
-    {
-       switch ((dofs1D << 4 ) | quad1D)
-       {
--         case 0x22: return EADiffusionAssemble1D<2,2>(ne,B,G,pa_data,ea_data,add);
--         case 0x33: return EADiffusionAssemble1D<3,3>(ne,B,G,pa_data,ea_data,add);
--         case 0x44: return EADiffusionAssemble1D<4,4>(ne,B,G,pa_data,ea_data,add);
--         case 0x55: return EADiffusionAssemble1D<5,5>(ne,B,G,pa_data,ea_data,add);
--         case 0x66: return EADiffusionAssemble1D<6,6>(ne,B,G,pa_data,ea_data,add);
--         case 0x77: return EADiffusionAssemble1D<7,7>(ne,B,G,pa_data,ea_data,add);
--         case 0x88: return EADiffusionAssemble1D<8,8>(ne,B,G,pa_data,ea_data,add);
--         case 0x99: return EADiffusionAssemble1D<9,9>(ne,B,G,pa_data,ea_data,add);
--         default:   return EADiffusionAssemble1D(ne,B,G,pa_data,ea_data,add,
-+         case 0x22: return EADiffusionAssemble1D<2,2>(ne,B,G,pa_data,ea_data);
-+         case 0x33: return EADiffusionAssemble1D<3,3>(ne,B,G,pa_data,ea_data);
-+         case 0x44: return EADiffusionAssemble1D<4,4>(ne,B,G,pa_data,ea_data);
-+         case 0x55: return EADiffusionAssemble1D<5,5>(ne,B,G,pa_data,ea_data);
-+         case 0x66: return EADiffusionAssemble1D<6,6>(ne,B,G,pa_data,ea_data);
-+         case 0x77: return EADiffusionAssemble1D<7,7>(ne,B,G,pa_data,ea_data);
-+         case 0x88: return EADiffusionAssemble1D<8,8>(ne,B,G,pa_data,ea_data);
-+         case 0x99: return EADiffusionAssemble1D<9,9>(ne,B,G,pa_data,ea_data);
-+         default:   return EADiffusionAssemble1D(ne,B,G,pa_data,ea_data,
-                                                     dofs1D,quad1D);
-       }
-    }
-@@ -270,15 +245,15 @@ void DiffusionIntegrator::AssembleEA(const FiniteElementSpace &fes,
-    {
-       switch ((dofs1D << 4 ) | quad1D)
-       {
--         case 0x22: return EADiffusionAssemble2D<2,2>(ne,B,G,pa_data,ea_data,add);
--         case 0x33: return EADiffusionAssemble2D<3,3>(ne,B,G,pa_data,ea_data,add);
--         case 0x44: return EADiffusionAssemble2D<4,4>(ne,B,G,pa_data,ea_data,add);
--         case 0x55: return EADiffusionAssemble2D<5,5>(ne,B,G,pa_data,ea_data,add);
--         case 0x66: return EADiffusionAssemble2D<6,6>(ne,B,G,pa_data,ea_data,add);
--         case 0x77: return EADiffusionAssemble2D<7,7>(ne,B,G,pa_data,ea_data,add);
--         case 0x88: return EADiffusionAssemble2D<8,8>(ne,B,G,pa_data,ea_data,add);
--         case 0x99: return EADiffusionAssemble2D<9,9>(ne,B,G,pa_data,ea_data,add);
--         default:   return EADiffusionAssemble2D(ne,B,G,pa_data,ea_data,add,
-+         case 0x22: return EADiffusionAssemble2D<2,2>(ne,B,G,pa_data,ea_data);
-+         case 0x33: return EADiffusionAssemble2D<3,3>(ne,B,G,pa_data,ea_data);
-+         case 0x44: return EADiffusionAssemble2D<4,4>(ne,B,G,pa_data,ea_data);
-+         case 0x55: return EADiffusionAssemble2D<5,5>(ne,B,G,pa_data,ea_data);
-+         case 0x66: return EADiffusionAssemble2D<6,6>(ne,B,G,pa_data,ea_data);
-+         case 0x77: return EADiffusionAssemble2D<7,7>(ne,B,G,pa_data,ea_data);
-+         case 0x88: return EADiffusionAssemble2D<8,8>(ne,B,G,pa_data,ea_data);
-+         case 0x99: return EADiffusionAssemble2D<9,9>(ne,B,G,pa_data,ea_data);
-+         default:   return EADiffusionAssemble2D(ne,B,G,pa_data,ea_data,
-                                                     dofs1D,quad1D);
-       }
-    }
-@@ -286,14 +261,14 @@ void DiffusionIntegrator::AssembleEA(const FiniteElementSpace &fes,
-    {
-       switch ((dofs1D << 4 ) | quad1D)
-       {
--         case 0x23: return EADiffusionAssemble3D<2,3>(ne,B,G,pa_data,ea_data,add);
--         case 0x34: return EADiffusionAssemble3D<3,4>(ne,B,G,pa_data,ea_data,add);
--         case 0x45: return EADiffusionAssemble3D<4,5>(ne,B,G,pa_data,ea_data,add);
--         case 0x56: return EADiffusionAssemble3D<5,6>(ne,B,G,pa_data,ea_data,add);
--         case 0x67: return EADiffusionAssemble3D<6,7>(ne,B,G,pa_data,ea_data,add);
--         case 0x78: return EADiffusionAssemble3D<7,8>(ne,B,G,pa_data,ea_data,add);
--         case 0x89: return EADiffusionAssemble3D<8,9>(ne,B,G,pa_data,ea_data,add);
--         default:   return EADiffusionAssemble3D(ne,B,G,pa_data,ea_data,add,
-+         case 0x23: return EADiffusionAssemble3D<2,3>(ne,B,G,pa_data,ea_data);
-+         case 0x34: return EADiffusionAssemble3D<3,4>(ne,B,G,pa_data,ea_data);
-+         case 0x45: return EADiffusionAssemble3D<4,5>(ne,B,G,pa_data,ea_data);
-+         case 0x56: return EADiffusionAssemble3D<5,6>(ne,B,G,pa_data,ea_data);
-+         case 0x67: return EADiffusionAssemble3D<6,7>(ne,B,G,pa_data,ea_data);
-+         case 0x78: return EADiffusionAssemble3D<7,8>(ne,B,G,pa_data,ea_data);
-+         case 0x89: return EADiffusionAssemble3D<8,9>(ne,B,G,pa_data,ea_data);
-+         default:   return EADiffusionAssemble3D(ne,B,G,pa_data,ea_data,
-                                                     dofs1D,quad1D);
-       }
-    }
-diff --git a/fem/bilininteg_diffusion_pa.cpp b/fem/integ/bilininteg_diffusion_kernels.hpp
-similarity index 85%
-rename from fem/bilininteg_diffusion_pa.cpp
-rename to fem/integ/bilininteg_diffusion_kernels.hpp
-index 2d953952e..63bc52bd8 100644
---- a/fem/bilininteg_diffusion_pa.cpp
-+++ b/fem/integ/bilininteg_diffusion_kernels.hpp
-@@ -9,28 +9,29 @@
- // terms of the BSD-3 license. We welcome feedback and contributions, see file
- // CONTRIBUTING.md for details.
- 
--#include "../general/forall.hpp"
--#include "bilininteg.hpp"
--#include "gridfunc.hpp"
--#include "qfunction.hpp"
--#include "ceed/integrators/diffusion/diffusion.hpp"
-+#ifndef MFEM_BILININTEG_DIFFUSION_KERNELS_HPP
-+#define MFEM_BILININTEG_DIFFUSION_KERNELS_HPP
- 
--using namespace std;
-+#include "../../config/config.hpp"
-+#include "../../general/forall.hpp"
-+#include "../../linalg/dtensor.hpp"
- 
- namespace mfem
- {
- 
--// PA Diffusion Integrator
-+namespace internal
-+{
- 
- // OCCA 2D Assemble kernel
- #ifdef MFEM_USE_OCCA
--static void OccaPADiffusionSetup2D(const int D1D,
--                                   const int Q1D,
--                                   const int NE,
--                                   const Array<double> &W,
--                                   const Vector &J,
--                                   const Vector &C,
--                                   Vector &op)
-+MFEM_HOST_DEVICE inline
-+void OccaPADiffusionSetup2D(const int D1D,
-+                            const int Q1D,
-+                            const int NE,
-+                            const Array<double> &W,
-+                            const Vector &J,
-+                            const Vector &C,
-+                            Vector &op)
- {
-    occa::properties props;
-    props["defines/D1D"] = D1D;
-@@ -52,13 +53,14 @@ static void OccaPADiffusionSetup2D(const int D1D,
-    OccaDiffSetup2D_ker.at(id)(NE, o_W, o_J, o_C, o_op, const_c);
- }
- 
--static void OccaPADiffusionSetup3D(const int D1D,
--                                   const int Q1D,
--                                   const int NE,
--                                   const Array<double> &W,
--                                   const Vector &J,
--                                   const Vector &C,
--                                   Vector &op)
-+MFEM_HOST_DEVICE inline
-+void OccaPADiffusionSetup3D(const int D1D,
-+                            const int Q1D,
-+                            const int NE,
-+                            const Array<double> &W,
-+                            const Vector &J,
-+                            const Vector &C,
-+                            Vector &op)
- {
-    occa::properties props;
-    props["defines/D1D"] = D1D;
-@@ -81,7 +83,19 @@ static void OccaPADiffusionSetup3D(const int D1D,
- }
- #endif // MFEM_USE_OCCA
- 
-+// PA Diffusion Assemble 2D kernel
-+template<const int T_SDIM>
-+MFEM_HOST_DEVICE inline
-+void PADiffusionSetup2D(const int Q1D,
-+                        const int coeffDim,
-+                        const int NE,
-+                        const Array<double> &w,
-+                        const Vector &j,
-+                        const Vector &c,
-+                        Vector &d);
-+
- template<>
-+MFEM_HOST_DEVICE inline
- void PADiffusionSetup2D<2>(const int Q1D,
-                            const int coeffDim,
-                            const int NE,
-@@ -149,6 +163,7 @@ void PADiffusionSetup2D<2>(const int Q1D,
- 
- // PA Diffusion Assemble 2D kernel with 3D node coords
- template<>
-+MFEM_HOST_DEVICE inline
- void PADiffusionSetup2D<3>(const int Q1D,
-                            const int coeffDim,
-                            const int NE,
-@@ -194,6 +209,7 @@ void PADiffusionSetup2D<3>(const int Q1D,
- }
- 
- // PA Diffusion Assemble 3D kernel
-+MFEM_HOST_DEVICE inline
- void PADiffusionSetup3D(const int Q1D,
-                         const int coeffDim,
-                         const int NE,
-@@ -314,16 +330,17 @@ void PADiffusionSetup3D(const int Q1D,
-    });
- }
- 
--static void PADiffusionSetup(const int dim,
--                             const int sdim,
--                             const int D1D,
--                             const int Q1D,
--                             const int coeffDim,
--                             const int NE,
--                             const Array<double> &W,
--                             const Vector &J,
--                             const Vector &C,
--                             Vector &D)
-+MFEM_HOST_DEVICE inline
-+void PADiffusionSetup(const int dim,
-+                      const int sdim,
-+                      const int D1D,
-+                      const int Q1D,
-+                      const int coeffDim,
-+                      const int NE,
-+                      const Array<double> &W,
-+                      const Vector &J,
-+                      const Vector &C,
-+                      Vector &D)
- {
-    if (dim == 1) { MFEM_ABORT("dim==1 not supported in PADiffusionSetup"); }
-    if (dim == 2)
-@@ -353,71 +370,16 @@ static void PADiffusionSetup(const int dim,
-    }
- }
- 
--void DiffusionIntegrator::AssemblePA(const FiniteElementSpace &fes)
--{
--   const MemoryType mt = (pa_mt == MemoryType::DEFAULT) ?
--                         Device::GetDeviceMemoryType() : pa_mt;
--   // Assuming the same element type
--   fespace = &fes;
--   Mesh *mesh = fes.GetMesh();
--   if (mesh->GetNE() == 0) { return; }
--   const FiniteElement &el = *fes.GetFE(0);
--   const IntegrationRule *ir = IntRule ? IntRule : &GetRule(el, el);
--   if (DeviceCanUseCeed())
--   {
--      delete ceedOp;
--      MFEM_VERIFY(!VQ && !MQ,
--                  "Only scalar coefficient supported for DiffusionIntegrator"
--                  " with libCEED");
--      const bool mixed = mesh->GetNumGeometries(mesh->Dimension()) > 1 ||
--                         fes.IsVariableOrder();
--      if (mixed)
--      {
--         ceedOp = new ceed::MixedPADiffusionIntegrator(*this, fes, Q);
--      }
--      else
--      {
--         ceedOp = new ceed::PADiffusionIntegrator(fes, *ir, Q);
--      }
--      return;
--   }
--   const int dims = el.GetDim();
--   const int symmDims = (dims * (dims + 1)) / 2; // 1x1: 1, 2x2: 3, 3x3: 6
--   const int nq = ir->GetNPoints();
--   dim = mesh->Dimension();
--   ne = fes.GetNE();
--   geom = mesh->GetGeometricFactors(*ir, GeometricFactors::JACOBIANS, mt);
--   const int sdim = mesh->SpaceDimension();
--   maps = &el.GetDofToQuad(*ir, DofToQuad::TENSOR);
--   dofs1D = maps->ndof;
--   quad1D = maps->nqpt;
--
--   QuadratureSpace qs(*mesh, *ir);
--   CoefficientVector coeff(qs, CoefficientStorage::COMPRESSED);
--
--   if (MQ) { coeff.ProjectTranspose(*MQ); }
--   else if (VQ) { coeff.Project(*VQ); }
--   else if (Q) { coeff.Project(*Q); }
--   else { coeff.SetConstant(1.0); }
--
--   const int coeff_dim = coeff.GetVDim();
--   symmetric = (coeff_dim != dims*dims);
--   const int pa_size = symmetric ? symmDims : dims*dims;
--
--   pa_data.SetSize(pa_size * nq * ne, mt);
--   PADiffusionSetup(dim, sdim, dofs1D, quad1D, coeff_dim, ne, ir->GetWeights(),
--                    geom->J, coeff, pa_data);
--}
--
- template<int T_D1D = 0, int T_Q1D = 0>
--static void PADiffusionDiagonal2D(const int NE,
--                                  const bool symmetric,
--                                  const Array<double> &b,
--                                  const Array<double> &g,
--                                  const Vector &d,
--                                  Vector &y,
--                                  const int d1d = 0,
--                                  const int q1d = 0)
-+MFEM_HOST_DEVICE inline
-+void PADiffusionDiagonal2D(const int NE,
-+                           const bool symmetric,
-+                           const Array<double> &b,
-+                           const Array<double> &g,
-+                           const Vector &d,
-+                           Vector &y,
-+                           const int d1d = 0,
-+                           const int q1d = 0)
- {
-    const int D1D = T_D1D ? T_D1D : d1d;
-    const int Q1D = T_Q1D ? T_Q1D : q1d;
-@@ -476,14 +438,15 @@ static void PADiffusionDiagonal2D(const int NE,
- 
- // Shared memory PA Diffusion Diagonal 2D kernel
- template<int T_D1D = 0, int T_Q1D = 0, int T_NBZ = 0>
--static void SmemPADiffusionDiagonal2D(const int NE,
--                                      const bool symmetric,
--                                      const Array<double> &b_,
--                                      const Array<double> &g_,
--                                      const Vector &d_,
--                                      Vector &y_,
--                                      const int d1d = 0,
--                                      const int q1d = 0)
-+MFEM_HOST_DEVICE inline
-+void SmemPADiffusionDiagonal2D(const int NE,
-+                               const bool symmetric,
-+                               const Array<double> &b_,
-+                               const Array<double> &g_,
-+                               const Vector &d_,
-+                               Vector &y_,
-+                               const int d1d = 0,
-+                               const int q1d = 0)
- {
-    const int D1D = T_D1D ? T_D1D : d1d;
-    const int Q1D = T_Q1D ? T_Q1D : q1d;
-@@ -570,14 +533,15 @@ static void SmemPADiffusionDiagonal2D(const int NE,
- }
- 
- template<int T_D1D = 0, int T_Q1D = 0>
--static void PADiffusionDiagonal3D(const int NE,
--                                  const bool symmetric,
--                                  const Array<double> &b,
--                                  const Array<double> &g,
--                                  const Vector &d,
--                                  Vector &y,
--                                  const int d1d = 0,
--                                  const int q1d = 0)
-+MFEM_HOST_DEVICE inline
-+void PADiffusionDiagonal3D(const int NE,
-+                           const bool symmetric,
-+                           const Array<double> &b,
-+                           const Array<double> &g,
-+                           const Vector &d,
-+                           Vector &y,
-+                           const int d1d = 0,
-+                           const int q1d = 0)
- {
-    constexpr int DIM = 3;
-    const int D1D = T_D1D ? T_D1D : d1d;
-@@ -671,14 +635,15 @@ static void PADiffusionDiagonal3D(const int NE,
- 
- // Shared memory PA Diffusion Diagonal 3D kernel
- template<int T_D1D = 0, int T_Q1D = 0>
--static void SmemPADiffusionDiagonal3D(const int NE,
--                                      const bool symmetric,
--                                      const Array<double> &b_,
--                                      const Array<double> &g_,
--                                      const Vector &d_,
--                                      Vector &y_,
--                                      const int d1d = 0,
--                                      const int q1d = 0)
-+MFEM_HOST_DEVICE inline
-+void SmemPADiffusionDiagonal3D(const int NE,
-+                               const bool symmetric,
-+                               const Array<double> &b_,
-+                               const Array<double> &g_,
-+                               const Vector &d_,
-+                               Vector &y_,
-+                               const int d1d = 0,
-+                               const int q1d = 0)
- {
-    constexpr int DIM = 3;
-    const int D1D = T_D1D ? T_D1D : d1d;
-@@ -788,15 +753,16 @@ static void SmemPADiffusionDiagonal3D(const int NE,
-    });
- }
- 
--static void PADiffusionAssembleDiagonal(const int dim,
--                                        const int D1D,
--                                        const int Q1D,
--                                        const int NE,
--                                        const bool symm,
--                                        const Array<double> &B,
--                                        const Array<double> &G,
--                                        const Vector &D,
--                                        Vector &Y)
-+MFEM_HOST_DEVICE inline
-+void PADiffusionAssembleDiagonal(const int dim,
-+                                 const int D1D,
-+                                 const int Q1D,
-+                                 const int NE,
-+                                 const bool symm,
-+                                 const Array<double> &B,
-+                                 const Array<double> &G,
-+                                 const Vector &D,
-+                                 Vector &Y)
- {
-    if (dim == 2)
-    {
-@@ -833,33 +799,19 @@ static void PADiffusionAssembleDiagonal(const int dim,
-    MFEM_ABORT("Unknown kernel.");
- }
- 
--void DiffusionIntegrator::AssembleDiagonalPA(Vector &diag)
--{
--   if (DeviceCanUseCeed())
--   {
--      ceedOp->GetDiagonal(diag);
--   }
--   else
--   {
--      if (pa_data.Size()==0) { AssemblePA(*fespace); }
--      PADiffusionAssembleDiagonal(dim, dofs1D, quad1D, ne, symmetric,
--                                  maps->B, maps->G, pa_data, diag);
--   }
--}
--
--
- #ifdef MFEM_USE_OCCA
- // OCCA PA Diffusion Apply 2D kernel
--static void OccaPADiffusionApply2D(const int D1D,
--                                   const int Q1D,
--                                   const int NE,
--                                   const Array<double> &B,
--                                   const Array<double> &G,
--                                   const Array<double> &Bt,
--                                   const Array<double> &Gt,
--                                   const Vector &D,
--                                   const Vector &X,
--                                   Vector &Y)
-+MFEM_HOST_DEVICE inline
-+void OccaPADiffusionApply2D(const int D1D,
-+                            const int Q1D,
-+                            const int NE,
-+                            const Array<double> &B,
-+                            const Array<double> &G,
-+                            const Array<double> &Bt,
-+                            const Array<double> &Gt,
-+                            const Vector &D,
-+                            const Vector &X,
-+                            Vector &Y)
- {
-    occa::properties props;
-    props["defines/D1D"] = D1D;
-@@ -899,16 +851,17 @@ static void OccaPADiffusionApply2D(const int D1D,
- }
- 
- // OCCA PA Diffusion Apply 3D kernel
--static void OccaPADiffusionApply3D(const int D1D,
--                                   const int Q1D,
--                                   const int NE,
--                                   const Array<double> &B,
--                                   const Array<double> &G,
--                                   const Array<double> &Bt,
--                                   const Array<double> &Gt,
--                                   const Vector &D,
--                                   const Vector &X,
--                                   Vector &Y)
-+MFEM_HOST_DEVICE inline
-+void OccaPADiffusionApply3D(const int D1D,
-+                            const int Q1D,
-+                            const int NE,
-+                            const Array<double> &B,
-+                            const Array<double> &G,
-+                            const Array<double> &Bt,
-+                            const Array<double> &Gt,
-+                            const Vector &D,
-+                            const Vector &X,
-+                            Vector &Y)
- {
-    occa::properties props;
-    props["defines/D1D"] = D1D;
-@@ -950,17 +903,18 @@ static void OccaPADiffusionApply3D(const int D1D,
- 
- // PA Diffusion Apply 2D kernel
- template<int T_D1D = 0, int T_Q1D = 0>
--static void PADiffusionApply2D(const int NE,
--                               const bool symmetric,
--                               const Array<double> &b_,
--                               const Array<double> &g_,
--                               const Array<double> &bt_,
--                               const Array<double> &gt_,
--                               const Vector &d_,
--                               const Vector &x_,
--                               Vector &y_,
--                               const int d1d = 0,
--                               const int q1d = 0)
-+MFEM_HOST_DEVICE inline
-+void PADiffusionApply2D(const int NE,
-+                        const bool symmetric,
-+                        const Array<double> &b_,
-+                        const Array<double> &g_,
-+                        const Array<double> &bt_,
-+                        const Array<double> &gt_,
-+                        const Vector &d_,
-+                        const Vector &x_,
-+                        Vector &y_,
-+                        const int d1d = 0,
-+                        const int q1d = 0)
- {
-    const int D1D = T_D1D ? T_D1D : d1d;
-    const int Q1D = T_Q1D ? T_Q1D : q1d;
-@@ -1072,15 +1026,16 @@ static void PADiffusionApply2D(const int NE,
- 
- // Shared memory PA Diffusion Apply 2D kernel
- template<int T_D1D = 0, int T_Q1D = 0, int T_NBZ = 0>
--static void SmemPADiffusionApply2D(const int NE,
--                                   const bool symmetric,
--                                   const Array<double> &b_,
--                                   const Array<double> &g_,
--                                   const Vector &d_,
--                                   const Vector &x_,
--                                   Vector &y_,
--                                   const int d1d = 0,
--                                   const int q1d = 0)
-+MFEM_HOST_DEVICE inline
-+void SmemPADiffusionApply2D(const int NE,
-+                            const bool symmetric,
-+                            const Array<double> &b_,
-+                            const Array<double> &g_,
-+                            const Vector &d_,
-+                            const Vector &x_,
-+                            Vector &y_,
-+                            const int d1d = 0,
-+                            const int q1d = 0)
- {
-    const int D1D = T_D1D ? T_D1D : d1d;
-    const int Q1D = T_Q1D ? T_Q1D : q1d;
-@@ -1230,16 +1185,17 @@ static void SmemPADiffusionApply2D(const int NE,
- 
- // PA Diffusion Apply 3D kernel
- template<int T_D1D = 0, int T_Q1D = 0>
--static void PADiffusionApply3D(const int NE,
--                               const bool symmetric,
--                               const Array<double> &b,
--                               const Array<double> &g,
--                               const Array<double> &bt,
--                               const Array<double> &gt,
--                               const Vector &d_,
--                               const Vector &x_,
--                               Vector &y_,
--                               int d1d = 0, int q1d = 0)
-+MFEM_HOST_DEVICE inline
-+void PADiffusionApply3D(const int NE,
-+                        const bool symmetric,
-+                        const Array<double> &b,
-+                        const Array<double> &g,
-+                        const Array<double> &bt,
-+                        const Array<double> &gt,
-+                        const Vector &d_,
-+                        const Vector &x_,
-+                        Vector &y_,
-+                        int d1d = 0, int q1d = 0)
- {
-    const int D1D = T_D1D ? T_D1D : d1d;
-    const int Q1D = T_Q1D ? T_Q1D : q1d;
-@@ -1422,15 +1378,16 @@ static void PADiffusionApply3D(const int NE,
- }
- 
- template<int T_D1D = 0, int T_Q1D = 0>
--static void SmemPADiffusionApply3D(const int NE,
--                                   const bool symmetric,
--                                   const Array<double> &b_,
--                                   const Array<double> &g_,
--                                   const Vector &d_,
--                                   const Vector &x_,
--                                   Vector &y_,
--                                   const int d1d = 0,
--                                   const int q1d = 0)
-+MFEM_HOST_DEVICE inline
-+void SmemPADiffusionApply3D(const int NE,
-+                            const bool symmetric,
-+                            const Array<double> &b_,
-+                            const Array<double> &g_,
-+                            const Vector &d_,
-+                            const Vector &x_,
-+                            Vector &y_,
-+                            const int d1d = 0,
-+                            const int q1d = 0)
- {
-    const int D1D = T_D1D ? T_D1D : d1d;
-    const int Q1D = T_Q1D ? T_Q1D : q1d;
-@@ -1643,18 +1600,19 @@ static void SmemPADiffusionApply3D(const int NE,
-    });
- }
- 
--static void PADiffusionApply(const int dim,
--                             const int D1D,
--                             const int Q1D,
--                             const int NE,
--                             const bool symm,
--                             const Array<double> &B,
--                             const Array<double> &G,
--                             const Array<double> &Bt,
--                             const Array<double> &Gt,
--                             const Vector &D,
--                             const Vector &X,
--                             Vector &Y)
-+MFEM_HOST_DEVICE inline
-+void PADiffusionApply(const int dim,
-+                      const int D1D,
-+                      const int Q1D,
-+                      const int NE,
-+                      const bool symm,
-+                      const Array<double> &B,
-+                      const Array<double> &G,
-+                      const Array<double> &Bt,
-+                      const Array<double> &Gt,
-+                      const Vector &D,
-+                      const Vector &X,
-+                      Vector &Y)
- {
- #ifdef MFEM_USE_OCCA
-    if (DeviceCanUseOcca())
-@@ -1710,32 +1668,8 @@ static void PADiffusionApply(const int dim,
-    MFEM_ABORT("Unknown kernel: 0x"<<std::hex << id << std::dec);
- }
- 
--// PA Diffusion Apply kernel
--void DiffusionIntegrator::AddMultPA(const Vector &x, Vector &y) const
--{
--   if (DeviceCanUseCeed())
--   {
--      ceedOp->AddMult(x, y);
--   }
--   else
--   {
--      PADiffusionApply(dim, dofs1D, quad1D, ne, symmetric,
--                       maps->B, maps->G, maps->Bt, maps->Gt,
--                       pa_data, x, y);
--   }
--}
--
--void DiffusionIntegrator::AddMultTransposePA(const Vector &x, Vector &y) const
--{
--   if (symmetric)
--   {
--      AddMultPA(x, y);
--   }
--   else
--   {
--      MFEM_ABORT("DiffusionIntegrator::AddMultTransposePA only implemented in "
--                 "the symmetric case.")
--   }
--}
-+} // namespace internal
- 
- } // namespace mfem
-+
-+#endif
-diff --git a/fem/bilininteg_diffusion_mf.cpp b/fem/integ/bilininteg_diffusion_mf.cpp
-similarity index 91%
-rename from fem/bilininteg_diffusion_mf.cpp
-rename to fem/integ/bilininteg_diffusion_mf.cpp
-index c6bd5c728..0896b8bf9 100644
---- a/fem/bilininteg_diffusion_mf.cpp
-+++ b/fem/integ/bilininteg_diffusion_mf.cpp
-@@ -9,12 +9,9 @@
- // terms of the BSD-3 license. We welcome feedback and contributions, see file
- // CONTRIBUTING.md for details.
- 
--#include "../general/forall.hpp"
--#include "bilininteg.hpp"
--#include "gridfunc.hpp"
--#include "ceed/integrators/diffusion/diffusion.hpp"
--
--using namespace std;
-+#include "../bilininteg.hpp"
-+#include "../gridfunc.hpp"
-+#include "../ceed/integrators/diffusion/diffusion.hpp"
- 
- namespace mfem
- {
-@@ -22,7 +19,6 @@ namespace mfem
- void DiffusionIntegrator::AssembleMF(const FiniteElementSpace &fes)
- {
-    // Assuming the same element type
--   fespace = &fes;
-    Mesh *mesh = fes.GetMesh();
-    if (mesh->GetNE() == 0) { return; }
-    const FiniteElement &el = *fes.GetFE(0);
-diff --git a/fem/integ/bilininteg_diffusion_pa.cpp b/fem/integ/bilininteg_diffusion_pa.cpp
-new file mode 100644
-index 000000000..a966c8520
---- /dev/null
-+++ b/fem/integ/bilininteg_diffusion_pa.cpp
-@@ -0,0 +1,124 @@
-+// Copyright (c) 2010-2023, Lawrence Livermore National Security, LLC. Produced
-+// at the Lawrence Livermore National Laboratory. All Rights reserved. See files
-+// LICENSE and NOTICE for details. LLNL-CODE-806117.
-+//
-+// This file is part of the MFEM library. For more information and source code
-+// availability visit https://mfem.org.
-+//
-+// MFEM is free software; you can redistribute it and/or modify it under the
-+// terms of the BSD-3 license. We welcome feedback and contributions, see file
-+// CONTRIBUTING.md for details.
-+
-+#include "../bilininteg.hpp"
-+#include "../gridfunc.hpp"
-+#include "../qfunction.hpp"
-+#include "../ceed/integrators/diffusion/diffusion.hpp"
-+#include "bilininteg_diffusion_kernels.hpp"
-+
-+namespace mfem
-+{
-+
-+void DiffusionIntegrator::AssemblePA(const FiniteElementSpace &fes)
-+{
-+   const MemoryType mt = (pa_mt == MemoryType::DEFAULT) ?
-+                         Device::GetDeviceMemoryType() : pa_mt;
-+   // Assuming the same element type
-+   Mesh *mesh = fes.GetMesh();
-+   if (mesh->GetNE() == 0) { return; }
-+   const FiniteElement &el = *fes.GetFE(0);
-+   const IntegrationRule *ir = IntRule ? IntRule : &GetRule(el, el);
-+   if (DeviceCanUseCeed())
-+   {
-+      delete ceedOp;
-+      MFEM_VERIFY(!VQ && !MQ,
-+                  "Only scalar coefficient supported for DiffusionIntegrator"
-+                  " with libCEED");
-+      const bool mixed = mesh->GetNumGeometries(mesh->Dimension()) > 1 ||
-+                         fes.IsVariableOrder();
-+      if (mixed)
-+      {
-+         ceedOp = new ceed::MixedPADiffusionIntegrator(*this, fes, Q);
-+      }
-+      else
-+      {
-+         ceedOp = new ceed::PADiffusionIntegrator(fes, *ir, Q);
-+      }
-+      return;
-+   }
-+   const int dims = el.GetDim();
-+   const int symmDims = (dims * (dims + 1)) / 2; // 1x1: 1, 2x2: 3, 3x3: 6
-+   const int nq = ir->GetNPoints();
-+   dim = mesh->Dimension();
-+   ne = fes.GetNE();
-+   geom = mesh->GetGeometricFactors(*ir, GeometricFactors::JACOBIANS, mt);
-+   const int sdim = mesh->SpaceDimension();
-+   maps = &el.GetDofToQuad(*ir, DofToQuad::TENSOR);
-+   dofs1D = maps->ndof;
-+   quad1D = maps->nqpt;
-+
-+   QuadratureSpace qs(*mesh, *ir);
-+   CoefficientVector coeff(qs, CoefficientStorage::COMPRESSED);
-+
-+   if (MQ) { coeff.ProjectTranspose(*MQ); }
-+   else if (VQ) { coeff.Project(*VQ); }
-+   else if (Q) { coeff.Project(*Q); }
-+   else { coeff.SetConstant(1.0); }
-+
-+   const int coeff_dim = coeff.GetVDim();
-+   symmetric = (coeff_dim != dims*dims);
-+   const int pa_size = symmetric ? symmDims : dims*dims;
-+
-+   pa_data.SetSize(pa_size * nq * ne, mt);
-+   internal::PADiffusionSetup(dim, sdim, dofs1D, quad1D, coeff_dim, ne,
-+                              ir->GetWeights(), geom->J, coeff, pa_data);
-+}
-+
-+void DiffusionIntegrator::AssembleDiagonalPA(Vector &diag)
-+{
-+   if (DeviceCanUseCeed())
-+   {
-+      ceedOp->GetDiagonal(diag);
-+   }
-+   else
-+   {
-+      internal::PADiffusionAssembleDiagonal(dim, dofs1D, quad1D, ne, symmetric,
-+                                            maps->B, maps->G, pa_data, diag);
-+   }
-+}
-+
-+void DiffusionIntegrator::AddMultPA(const Vector &x, Vector &y) const
-+{
-+   if (DeviceCanUseCeed())
-+   {
-+      ceedOp->AddMult(x, y);
-+   }
-+   else
-+   {
-+      internal::PADiffusionApply(dim, dofs1D, quad1D, ne, symmetric,
-+                                 maps->B, maps->G, maps->Bt, maps->Gt,
-+                                 pa_data, x, y);
-+   }
-+}
-+
-+void DiffusionIntegrator::AddMultTransposePA(const Vector &x, Vector &y) const
-+{
-+   if (DeviceCanUseCeed())
-+   {
-+      MFEM_ABORT("AddMultTransposePA not yet implemented with libCEED for"
-+                 " DiffusionIntegrator.");
-+   }
-+   else
-+   {
-+      if (symmetric)
-+      {
-+         AddMultPA(x, y);
-+      }
-+      else
-+      {
-+         MFEM_ABORT("DiffusionIntegrator::AddMultTransposePA only implemented in "
-+                    "the symmetric case.")
-+      }
-+   }
-+}
-+
-+} // namespace mfem
-diff --git a/fem/integ/bilininteg_divdiv_pa.cpp b/fem/integ/bilininteg_divdiv_pa.cpp
-new file mode 100644
-index 000000000..8abf233a7
---- /dev/null
-+++ b/fem/integ/bilininteg_divdiv_pa.cpp
-@@ -0,0 +1,99 @@
-+// Copyright (c) 2010-2023, Lawrence Livermore National Security, LLC. Produced
-+// at the Lawrence Livermore National Laboratory. All Rights reserved. See files
-+// LICENSE and NOTICE for details. LLNL-CODE-806117.
-+//
-+// This file is part of the MFEM library. For more information and source code
-+// availability visit https://mfem.org.
-+//
-+// MFEM is free software; you can redistribute it and/or modify it under the
-+// terms of the BSD-3 license.  We welcome feedback and contributions, see file
-+// CONTRIBUTING.md for details.
-+
-+#include "../../general/forall.hpp"
-+#include "../bilininteg.hpp"
-+#include "../gridfunc.hpp"
-+#include "../qfunction.hpp"
-+#include "bilininteg_hdiv_kernels.hpp"
-+
-+namespace mfem
-+{
-+
-+void DivDivIntegrator::AssemblePA(const FiniteElementSpace &fes)
-+{
-+   // Assumes tensor-product elements
-+   Mesh *mesh = fes.GetMesh();
-+   const FiniteElement *fel = fes.GetFE(0);
-+
-+   const VectorTensorFiniteElement *el =
-+      dynamic_cast<const VectorTensorFiniteElement*>(fel);
-+   MFEM_VERIFY(el != NULL, "Only VectorTensorFiniteElement is supported!");
-+
-+   const IntegrationRule *ir = IntRule ? IntRule : &MassIntegrator::GetRule
-+                               (*el, *el, *mesh->GetElementTransformation(0));
-+
-+   const int dims = el->GetDim();
-+   MFEM_VERIFY(dims == 2 || dims == 3, "");
-+
-+   const int nq = ir->GetNPoints();
-+   dim = mesh->Dimension();
-+   MFEM_VERIFY(dim == 2 || dim == 3, "");
-+
-+   ne = fes.GetNE();
-+   geom = mesh->GetGeometricFactors(*ir, GeometricFactors::JACOBIANS);
-+   mapsC = &el->GetDofToQuad(*ir, DofToQuad::TENSOR);
-+   mapsO = &el->GetDofToQuadOpen(*ir, DofToQuad::TENSOR);
-+   dofs1D = mapsC->ndof;
-+   quad1D = mapsC->nqpt;
-+
-+   MFEM_VERIFY(dofs1D == mapsO->ndof + 1 && quad1D == mapsO->nqpt, "");
-+
-+   pa_data.SetSize(nq * ne, Device::GetMemoryType());
-+
-+   QuadratureSpace qs(*mesh, *ir);
-+   CoefficientVector coeff(Q, qs, CoefficientStorage::FULL);
-+
-+   if (el->GetDerivType() == mfem::FiniteElement::DIV && dim == 3)
-+   {
-+      internal::PADivDivSetup3D(quad1D, ne, ir->GetWeights(), geom->J, coeff,
-+                                pa_data);
-+   }
-+   else if (el->GetDerivType() == mfem::FiniteElement::DIV && dim == 2)
-+   {
-+      internal::PADivDivSetup2D(quad1D, ne, ir->GetWeights(), geom->J, coeff,
-+                                pa_data);
-+   }
-+   else
-+   {
-+      MFEM_ABORT("Unknown kernel.");
-+   }
-+}
-+
-+void DivDivIntegrator::AssembleDiagonalPA(Vector& diag)
-+{
-+   if (dim == 3)
-+   {
-+      internal::PADivDivAssembleDiagonal3D(dofs1D, quad1D, ne,
-+                                           mapsO->B, mapsC->G, pa_data, diag);
-+   }
-+   else
-+   {
-+      internal::PADivDivAssembleDiagonal2D(dofs1D, quad1D, ne,
-+                                           mapsO->B, mapsC->G, pa_data, diag);
-+   }
-+}
-+
-+void DivDivIntegrator::AddMultPA(const Vector &x, Vector &y) const
-+{
-+   if (dim == 3)
-+      internal::PADivDivApply3D(dofs1D, quad1D, ne, mapsO->B, mapsC->G,
-+                                mapsO->Bt, mapsC->Gt, pa_data, x, y);
-+   else if (dim == 2)
-+      internal::PADivDivApply2D(dofs1D, quad1D, ne, mapsO->B, mapsC->G,
-+                                mapsO->Bt, mapsC->Gt, pa_data, x, y);
-+   else
-+   {
-+      MFEM_ABORT("Unsupported dimension!");
-+   }
-+}
-+
-+} // namespace mfem
-diff --git a/fem/bilininteg_gradient.cpp b/fem/integ/bilininteg_gradient_pa.cpp
-similarity index 93%
-rename from fem/bilininteg_gradient.cpp
-rename to fem/integ/bilininteg_gradient_pa.cpp
-index 73b9d1859..20ef4684d 100644
---- a/fem/bilininteg_gradient.cpp
-+++ b/fem/integ/bilininteg_gradient_pa.cpp
-@@ -9,18 +9,14 @@
- // terms of the BSD-3 license. We welcome feedback and contributions, see file
- // CONTRIBUTING.md for details.
- 
--#include "../general/forall.hpp"
--#include "bilininteg.hpp"
--#include "gridfunc.hpp"
--#include "qfunction.hpp"
--
--using namespace std;
-+#include "../../general/forall.hpp"
-+#include "../bilininteg.hpp"
-+#include "../gridfunc.hpp"
-+#include "../qfunction.hpp"
- 
- namespace mfem
- {
- 
--// PA Gradient Integrator
--
- /* Description of the *SetupND functions
-    Inputs are as follows
-    \b Q1D number of quadrature points in one dimension.
-@@ -162,27 +158,6 @@ static void PAGradientSetup3D(const int Q1D,
-    });
- }
- 
--static void PAGradientSetup(const int dim,
--                            const int TR_D1D,
--                            const int TE_D1D,
--                            const int Q1D,
--                            const int NE,
--                            const Array<double> &W,
--                            const Vector &J,
--                            const Vector &COEFF,
--                            Vector &op)
--{
--   if (dim == 1) { MFEM_ABORT("dim==1 not supported in PAGradientSetup"); }
--   if (dim == 2)
--   {
--      PAGradientSetup2D(Q1D, NE, W, J, COEFF, op);
--   }
--   if (dim == 3)
--   {
--      PAGradientSetup3D(Q1D, NE, W, J, COEFF, op);
--   }
--}
--
- void GradientIntegrator::AssemblePA(const FiniteElementSpace &trial_fes,
-                                     const FiniteElementSpace &test_fes)
- {
-@@ -213,8 +188,18 @@ void GradientIntegrator::AssemblePA(const FiniteElementSpace &trial_fes,
-    QuadratureSpace qs(*mesh, *ir);
-    CoefficientVector coeff(Q, qs, CoefficientStorage::COMPRESSED);
- 
--   PAGradientSetup(dim, trial_dofs1D, test_dofs1D, quad1D,
--                   ne, ir->GetWeights(), geom->J, coeff, pa_data);
-+   if (dim == 1)
-+   {
-+      MFEM_ABORT("dim==1 not supported in GradientIntegrator::AssemblePA");
-+   }
-+   else if (dim == 2)
-+   {
-+      PAGradientSetup2D(quad1D, ne, ir->GetWeights(), geom->J, coeff, pa_data);
-+   }
-+   else if (dim == 3)
-+   {
-+      PAGradientSetup3D(quad1D, ne, ir->GetWeights(), geom->J, coeff, pa_data);
-+   }
- }
- 
- // PA Gradient Apply 2D kernel
-@@ -791,40 +776,21 @@ static void SmemPAGradientApply3D(const int NE,
-    });
- }
- 
--static void PAGradientApply(const int dim,
--                            const int TR_D1D,
--                            const int TE_D1D,
--                            const int Q1D,
--                            const int NE,
--                            const Array<double> &B,
--                            const Array<double> &G,
--                            const Array<double> &Bt,
--                            const Vector &op,
--                            const Vector &x,
--                            Vector &y,
--                            bool transpose=false)
-+void GradientIntegrator::AddMultPA(const Vector &x, Vector &y) const
- {
--
-    if (dim == 2)
-    {
--      return PAGradientApply2D(NE,B,G,Bt,op,x,y,TR_D1D,TE_D1D,Q1D);
-+      return PAGradientApply2D(ne, trial_maps->B, trial_maps->G, test_maps->Bt,
-+                               pa_data, x, y, trial_dofs1D, test_dofs1D, quad1D);
-    }
-    if (dim == 3)
-    {
--      return PAGradientApply3D(NE,B,G,Bt,op,x,y,TR_D1D,TE_D1D,Q1D);
-+      return PAGradientApply3D(ne, trial_maps->B, trial_maps->G, test_maps->Bt,
-+                               pa_data, x, y, trial_dofs1D, test_dofs1D, quad1D);
-    }
-    MFEM_ABORT("Unknown kernel.");
- }
- 
--// PA Gradient Apply kernel
--void GradientIntegrator::AddMultPA(const Vector &x, Vector &y) const
--{
--   PAGradientApply(dim, trial_dofs1D, test_dofs1D, quad1D, ne,
--                   trial_maps->B, trial_maps->G, test_maps->Bt, pa_data, x, y,
--                   false);
--}
--
--// PA Gradient Apply kernel
- void GradientIntegrator::AddMultTransposePA(const Vector &x, Vector &y) const
- {
-    MFEM_ABORT("PA Gradient AddMultTransposePA not implemented.");
-diff --git a/fem/integ/bilininteg_hcurl_kernels.hpp b/fem/integ/bilininteg_hcurl_kernels.hpp
-new file mode 100644
-index 000000000..a1545f888
---- /dev/null
-+++ b/fem/integ/bilininteg_hcurl_kernels.hpp
-@@ -0,0 +1,3891 @@
-+// Copyright (c) 2010-2023, Lawrence Livermore National Security, LLC. Produced
-+// at the Lawrence Livermore National Laboratory. All Rights reserved. See files
-+// LICENSE and NOTICE for details. LLNL-CODE-806117.
-+//
-+// This file is part of the MFEM library. For more information and source code
-+// availability visit https://mfem.org.
-+//
-+// MFEM is free software; you can redistribute it and/or modify it under the
-+// terms of the BSD-3 license. We welcome feedback and contributions, see file
-+// CONTRIBUTING.md for details.
-+
-+#ifndef MFEM_BILININTEG_HCURL_KERNELS_HPP
-+#define MFEM_BILININTEG_HCURL_KERNELS_HPP
-+
-+#include "../../config/config.hpp"
-+#include "../../general/forall.hpp"
-+#include "../../linalg/dtensor.hpp"
-+
-+// Piola transformation in H(curl): w = dF^{-T} \hat{w}
-+// curl w = (1 / det (dF)) dF \hat{curl} \hat{w}
-+
-+namespace mfem
-+{
-+
-+namespace internal
-+{
-+
-+MFEM_HOST_DEVICE inline
-+void PAHcurlMassAssembleDiagonal2D(const int D1D,
-+                                   const int Q1D,
-+                                   const int NE,
-+                                   const bool symmetric,
-+                                   const Array<double> &bo,
-+                                   const Array<double> &bc,
-+                                   const Vector &pa_data,
-+                                   Vector &diag)
-+{
-+   constexpr static int VDIM = 2;
-+   constexpr static int MAX_Q1D = HCURL_MAX_Q1D;
-+
-+   auto Bo = Reshape(bo.Read(), Q1D, D1D-1);
-+   auto Bc = Reshape(bc.Read(), Q1D, D1D);
-+   auto op = Reshape(pa_data.Read(), Q1D, Q1D, symmetric ? 3 : 4, NE);
-+   auto D = Reshape(diag.ReadWrite(), 2*(D1D-1)*D1D, NE);
-+
-+   mfem::forall(NE, [=] MFEM_HOST_DEVICE (int e)
-+   {
-+      int osc = 0;
-+
-+      for (int c = 0; c < VDIM; ++c)  // loop over x, y components
-+      {
-+         const int D1Dy = (c == 1) ? D1D - 1 : D1D;
-+         const int D1Dx = (c == 0) ? D1D - 1 : D1D;
-+
-+         double mass[MAX_Q1D];
-+
-+         for (int dy = 0; dy < D1Dy; ++dy)
-+         {
-+            for (int qx = 0; qx < Q1D; ++qx)
-+            {
-+               mass[qx] = 0.0;
-+               for (int qy = 0; qy < Q1D; ++qy)
-+               {
-+                  const double wy = (c == 1) ? Bo(qy,dy) : Bc(qy,dy);
-+
-+                  mass[qx] += wy * wy * ((c == 0) ? op(qx,qy,0,e) :
-+                                         op(qx,qy,symmetric ? 2 : 3, e));
-+               }
-+            }
-+
-+            for (int dx = 0; dx < D1Dx; ++dx)
-+            {
-+               for (int qx = 0; qx < Q1D; ++qx)
-+               {
-+                  const double wx = ((c == 0) ? Bo(qx,dx) : Bc(qx,dx));
-+                  D(dx + (dy * D1Dx) + osc, e) += mass[qx] * wx * wx;
-+               }
-+            }
-+         }
-+
-+         osc += D1Dx * D1Dy;
-+      }  // loop c
-+   }); // end of element loop
-+}
-+
-+MFEM_HOST_DEVICE inline
-+void PAHcurlMassAssembleDiagonal3D(const int D1D,
-+                                   const int Q1D,
-+                                   const int NE,
-+                                   const bool symmetric,
-+                                   const Array<double> &bo,
-+                                   const Array<double> &bc,
-+                                   const Vector &pa_data,
-+                                   Vector &diag)
-+{
-+   constexpr static int MAX_D1D = HCURL_MAX_D1D;
-+   constexpr static int MAX_Q1D = HCURL_MAX_Q1D;
-+
-+   MFEM_VERIFY(D1D <= MAX_D1D, "Error: D1D > MAX_D1D");
-+   MFEM_VERIFY(Q1D <= MAX_Q1D, "Error: Q1D > MAX_Q1D");
-+   constexpr static int VDIM = 3;
-+
-+   auto Bo = Reshape(bo.Read(), Q1D, D1D-1);
-+   auto Bc = Reshape(bc.Read(), Q1D, D1D);
-+   auto op = Reshape(pa_data.Read(), Q1D, Q1D, Q1D, symmetric ? 6 : 9, NE);
-+   auto D = Reshape(diag.ReadWrite(), 3*(D1D-1)*D1D*D1D, NE);
-+
-+   mfem::forall(NE, [=] MFEM_HOST_DEVICE (int e)
-+   {
-+      int osc = 0;
-+
-+      for (int c = 0; c < VDIM; ++c)  // loop over x, y, z components
-+      {
-+         const int D1Dz = (c == 2) ? D1D - 1 : D1D;
-+         const int D1Dy = (c == 1) ? D1D - 1 : D1D;
-+         const int D1Dx = (c == 0) ? D1D - 1 : D1D;
-+
-+         const int opc = (c == 0) ? 0 : ((c == 1) ? (symmetric ? 3 : 4) :
-+                                         (symmetric ? 5 : 8));
-+
-+         double mass[MAX_Q1D];
-+
-+         for (int dz = 0; dz < D1Dz; ++dz)
-+         {
-+            for (int dy = 0; dy < D1Dy; ++dy)
-+            {
-+               for (int qx = 0; qx < Q1D; ++qx)
-+               {
-+                  mass[qx] = 0.0;
-+                  for (int qy = 0; qy < Q1D; ++qy)
-+                  {
-+                     const double wy = (c == 1) ? Bo(qy,dy) : Bc(qy,dy);
-+
-+                     for (int qz = 0; qz < Q1D; ++qz)
-+                     {
-+                        const double wz = (c == 2) ? Bo(qz,dz) : Bc(qz,dz);
-+
-+                        mass[qx] += wy * wy * wz * wz * op(qx,qy,qz,opc,e);
-+                     }
-+                  }
-+               }
-+
-+               for (int dx = 0; dx < D1Dx; ++dx)
-+               {
-+                  for (int qx = 0; qx < Q1D; ++qx)
-+                  {
-+                     const double wx = ((c == 0) ? Bo(qx,dx) : Bc(qx,dx));
-+                     D(dx + ((dy + (dz * D1Dy)) * D1Dx) + osc, e) += mass[qx] * wx * wx;
-+                  }
-+               }
-+            }
-+         }
-+
-+         osc += D1Dx * D1Dy * D1Dz;
-+      }  // loop c
-+   }); // end of element loop
-+}
-+
-+template<int T_D1D = HCURL_MAX_D1D, int T_Q1D = HCURL_MAX_Q1D>
-+MFEM_HOST_DEVICE inline
-+void SmemPAHcurlMassAssembleDiagonal3D(const int D1D,
-+                                       const int Q1D,
-+                                       const int NE,
-+                                       const bool symmetric,
-+                                       const Array<double> &bo,
-+                                       const Array<double> &bc,
-+                                       const Vector &pa_data,
-+                                       Vector &diag)
-+{
-+   MFEM_VERIFY(D1D <= HCURL_MAX_D1D, "Error: D1D > MAX_D1D");
-+   MFEM_VERIFY(Q1D <= HCURL_MAX_Q1D, "Error: Q1D > MAX_Q1D");
-+
-+   auto Bo = Reshape(bo.Read(), Q1D, D1D-1);
-+   auto Bc = Reshape(bc.Read(), Q1D, D1D);
-+   auto op = Reshape(pa_data.Read(), Q1D, Q1D, Q1D, symmetric ? 6 : 9, NE);
-+   auto D = Reshape(diag.ReadWrite(), 3*(D1D-1)*D1D*D1D, NE);
-+
-+   mfem::forall_3D(NE, Q1D, Q1D, Q1D, [=] MFEM_HOST_DEVICE (int e)
-+   {
-+      constexpr int VDIM = 3;
-+
-+      MFEM_SHARED double sBo[T_Q1D][T_D1D];
-+      MFEM_SHARED double sBc[T_Q1D][T_D1D];
-+
-+      double op3[3];
-+      MFEM_SHARED double sop[3][T_Q1D][T_Q1D];
-+
-+      MFEM_FOREACH_THREAD(qx,x,Q1D)
-+      {
-+         MFEM_FOREACH_THREAD(qy,y,Q1D)
-+         {
-+            MFEM_FOREACH_THREAD(qz,z,Q1D)
-+            {
-+               op3[0] = op(qx,qy,qz,0,e);
-+               op3[1] = op(qx,qy,qz,symmetric ? 3 : 4,e);
-+               op3[2] = op(qx,qy,qz,symmetric ? 5 : 8,e);
-+            }
-+         }
-+      }
-+
-+      const int tidx = MFEM_THREAD_ID(x);
-+      const int tidy = MFEM_THREAD_ID(y);
-+      const int tidz = MFEM_THREAD_ID(z);
-+
-+      if (tidz == 0)
-+      {
-+         MFEM_FOREACH_THREAD(d,y,D1D)
-+         {
-+            MFEM_FOREACH_THREAD(q,x,Q1D)
-+            {
-+               sBc[q][d] = Bc(q,d);
-+               if (d < D1D-1)
-+               {
-+                  sBo[q][d] = Bo(q,d);
-+               }
-+            }
-+         }
-+      }
-+      MFEM_SYNC_THREAD;
-+
-+      int osc = 0;
-+      for (int c = 0; c < VDIM; ++c)  // loop over x, y, z components
-+      {
-+         const int D1Dz = (c == 2) ? D1D - 1 : D1D;
-+         const int D1Dy = (c == 1) ? D1D - 1 : D1D;
-+         const int D1Dx = (c == 0) ? D1D - 1 : D1D;
-+
-+         double dxyz = 0.0;
-+
-+         for (int qz=0; qz < Q1D; ++qz)
-+         {
-+            if (tidz == qz)
-+            {
-+               for (int i=0; i<3; ++i)
-+               {
-+                  sop[i][tidx][tidy] = op3[i];
-+               }
-+            }
-+
-+            MFEM_SYNC_THREAD;
-+
-+            MFEM_FOREACH_THREAD(dz,z,D1Dz)
-+            {
-+               const double wz = ((c == 2) ? sBo[qz][dz] : sBc[qz][dz]);
-+
-+               MFEM_FOREACH_THREAD(dy,y,D1Dy)
-+               {
-+                  MFEM_FOREACH_THREAD(dx,x,D1Dx)
-+                  {
-+                     for (int qy = 0; qy < Q1D; ++qy)
-+                     {
-+                        const double wy = ((c == 1) ? sBo[qy][dy] : sBc[qy][dy]);
-+
-+                        for (int qx = 0; qx < Q1D; ++qx)
-+                        {
-+                           const double wx = ((c == 0) ? sBo[qx][dx] : sBc[qx][dx]);
-+                           dxyz += sop[c][qx][qy] * wx * wx * wy * wy * wz * wz;
-+                        }
-+                     }
-+                  }
-+               }
-+            }
-+
-+            MFEM_SYNC_THREAD;
-+         }  // qz loop
-+
-+         MFEM_FOREACH_THREAD(dz,z,D1Dz)
-+         {
-+            MFEM_FOREACH_THREAD(dy,y,D1Dy)
-+            {
-+               MFEM_FOREACH_THREAD(dx,x,D1Dx)
-+               {
-+                  D(dx + ((dy + (dz * D1Dy)) * D1Dx) + osc, e) += dxyz;
-+               }
-+            }
-+         }
-+
-+         osc += D1Dx * D1Dy * D1Dz;
-+      }  // c loop
-+   }); // end of element loop
-+}
-+
-+MFEM_HOST_DEVICE inline
-+void PAHcurlMassApply2D(const int D1D,
-+                        const int Q1D,
-+                        const int NE,
-+                        const bool symmetric,
-+                        const Array<double> &bo,
-+                        const Array<double> &bc,
-+                        const Array<double> &bot,
-+                        const Array<double> &bct,
-+                        const Vector &pa_data,
-+                        const Vector &x,
-+                        Vector &y)
-+{
-+   constexpr static int VDIM = 2;
-+   constexpr static int MAX_D1D = HCURL_MAX_D1D;
-+   constexpr static int MAX_Q1D = HCURL_MAX_Q1D;
-+
-+   auto Bo = Reshape(bo.Read(), Q1D, D1D-1);
-+   auto Bc = Reshape(bc.Read(), Q1D, D1D);
-+   auto Bot = Reshape(bot.Read(), D1D-1, Q1D);
-+   auto Bct = Reshape(bct.Read(), D1D, Q1D);
-+   auto op = Reshape(pa_data.Read(), Q1D, Q1D, symmetric ? 3 : 4, NE);
-+   auto X = Reshape(x.Read(), 2*(D1D-1)*D1D, NE);
-+   auto Y = Reshape(y.ReadWrite(), 2*(D1D-1)*D1D, NE);
-+
-+   mfem::forall(NE, [=] MFEM_HOST_DEVICE (int e)
-+   {
-+      double mass[MAX_Q1D][MAX_Q1D][VDIM];
-+
-+      for (int qy = 0; qy < Q1D; ++qy)
-+      {
-+         for (int qx = 0; qx < Q1D; ++qx)
-+         {
-+            for (int c = 0; c < VDIM; ++c)
-+            {
-+               mass[qy][qx][c] = 0.0;
-+            }
-+         }
-+      }
-+
-+      int osc = 0;
-+
-+      for (int c = 0; c < VDIM; ++c)  // loop over x, y components
-+      {
-+         const int D1Dy = (c == 1) ? D1D - 1 : D1D;
-+         const int D1Dx = (c == 0) ? D1D - 1 : D1D;
-+
-+         for (int dy = 0; dy < D1Dy; ++dy)
-+         {
-+            double massX[MAX_Q1D];
-+            for (int qx = 0; qx < Q1D; ++qx)
-+            {
-+               massX[qx] = 0.0;
-+            }
-+
-+            for (int dx = 0; dx < D1Dx; ++dx)
-+            {
-+               const double t = X(dx + (dy * D1Dx) + osc, e);
-+               for (int qx = 0; qx < Q1D; ++qx)
-+               {
-+                  massX[qx] += t * ((c == 0) ? Bo(qx,dx) : Bc(qx,dx));
-+               }
-+            }
-+
-+            for (int qy = 0; qy < Q1D; ++qy)
-+            {
-+               const double wy = (c == 1) ? Bo(qy,dy) : Bc(qy,dy);
-+               for (int qx = 0; qx < Q1D; ++qx)
-+               {
-+                  mass[qy][qx][c] += massX[qx] * wy;
-+               }
-+            }
-+         }
-+
-+         osc += D1Dx * D1Dy;
-+      }  // loop (c) over components
-+
-+      // Apply D operator.
-+      for (int qy = 0; qy < Q1D; ++qy)
-+      {
-+         for (int qx = 0; qx < Q1D; ++qx)
-+         {
-+            const double O11 = op(qx,qy,0,e);
-+            const double O21 = op(qx,qy,1,e);
-+            const double O12 = symmetric ? O21 : op(qx,qy,2,e);
-+            const double O22 = symmetric ? op(qx,qy,2,e) : op(qx,qy,3,e);
-+            const double massX = mass[qy][qx][0];
-+            const double massY = mass[qy][qx][1];
-+            mass[qy][qx][0] = (O11*massX)+(O12*massY);
-+            mass[qy][qx][1] = (O21*massX)+(O22*massY);
-+         }
-+      }
-+
-+      for (int qy = 0; qy < Q1D; ++qy)
-+      {
-+         osc = 0;
-+
-+         for (int c = 0; c < VDIM; ++c)  // loop over x, y components
-+         {
-+            const int D1Dy = (c == 1) ? D1D - 1 : D1D;
-+            const int D1Dx = (c == 0) ? D1D - 1 : D1D;
-+
-+            double massX[MAX_D1D];
-+            for (int dx = 0; dx < D1Dx; ++dx)
-+            {
-+               massX[dx] = 0.0;
-+            }
-+            for (int qx = 0; qx < Q1D; ++qx)
-+            {
-+               for (int dx = 0; dx < D1Dx; ++dx)
-+               {
-+                  massX[dx] += mass[qy][qx][c] * ((c == 0) ? Bot(dx,qx) : Bct(dx,qx));
-+               }
-+            }
-+
-+            for (int dy = 0; dy < D1Dy; ++dy)
-+            {
-+               const double wy = (c == 1) ? Bot(dy,qy) : Bct(dy,qy);
-+
-+               for (int dx = 0; dx < D1Dx; ++dx)
-+               {
-+                  Y(dx + (dy * D1Dx) + osc, e) += massX[dx] * wy;
-+               }
-+            }
-+
-+            osc += D1Dx * D1Dy;
-+         }  // loop c
-+      }  // loop qy
-+   }); // end of element loop
-+}
-+
-+MFEM_HOST_DEVICE inline
-+void PAHcurlMassApply3D(const int D1D,
-+                        const int Q1D,
-+                        const int NE,
-+                        const bool symmetric,
-+                        const Array<double> &bo,
-+                        const Array<double> &bc,
-+                        const Array<double> &bot,
-+                        const Array<double> &bct,
-+                        const Vector &pa_data,
-+                        const Vector &x,
-+                        Vector &y)
-+{
-+   constexpr static int MAX_D1D = HCURL_MAX_D1D;
-+   constexpr static int MAX_Q1D = HCURL_MAX_Q1D;
-+
-+   MFEM_VERIFY(D1D <= MAX_D1D, "Error: D1D > MAX_D1D");
-+   MFEM_VERIFY(Q1D <= MAX_Q1D, "Error: Q1D > MAX_Q1D");
-+   constexpr static int VDIM = 3;
-+
-+   auto Bo = Reshape(bo.Read(), Q1D, D1D-1);
-+   auto Bc = Reshape(bc.Read(), Q1D, D1D);
-+   auto Bot = Reshape(bot.Read(), D1D-1, Q1D);
-+   auto Bct = Reshape(bct.Read(), D1D, Q1D);
-+   auto op = Reshape(pa_data.Read(), Q1D, Q1D, Q1D, symmetric ? 6 : 9, NE);
-+   auto X = Reshape(x.Read(), 3*(D1D-1)*D1D*D1D, NE);
-+   auto Y = Reshape(y.ReadWrite(), 3*(D1D-1)*D1D*D1D, NE);
-+
-+   mfem::forall(NE, [=] MFEM_HOST_DEVICE (int e)
-+   {
-+      double mass[MAX_Q1D][MAX_Q1D][MAX_Q1D][VDIM];
-+
-+      for (int qz = 0; qz < Q1D; ++qz)
-+      {
-+         for (int qy = 0; qy < Q1D; ++qy)
-+         {
-+            for (int qx = 0; qx < Q1D; ++qx)
-+            {
-+               for (int c = 0; c < VDIM; ++c)
-+               {
-+                  mass[qz][qy][qx][c] = 0.0;
-+               }
-+            }
-+         }
-+      }
-+
-+      int osc = 0;
-+
-+      for (int c = 0; c < VDIM; ++c)  // loop over x, y, z components
-+      {
-+         const int D1Dz = (c == 2) ? D1D - 1 : D1D;
-+         const int D1Dy = (c == 1) ? D1D - 1 : D1D;
-+         const int D1Dx = (c == 0) ? D1D - 1 : D1D;
-+
-+         for (int dz = 0; dz < D1Dz; ++dz)
-+         {
-+            double massXY[MAX_Q1D][MAX_Q1D];
-+            for (int qy = 0; qy < Q1D; ++qy)
-+            {
-+               for (int qx = 0; qx < Q1D; ++qx)
-+               {
-+                  massXY[qy][qx] = 0.0;
-+               }
-+            }
-+
-+            for (int dy = 0; dy < D1Dy; ++dy)
-+            {
-+               double massX[MAX_Q1D];
-+               for (int qx = 0; qx < Q1D; ++qx)
-+               {
-+                  massX[qx] = 0.0;
-+               }
-+
-+               for (int dx = 0; dx < D1Dx; ++dx)
-+               {
-+                  const double t = X(dx + ((dy + (dz * D1Dy)) * D1Dx) + osc, e);
-+                  for (int qx = 0; qx < Q1D; ++qx)
-+                  {
-+                     massX[qx] += t * ((c == 0) ? Bo(qx,dx) : Bc(qx,dx));
-+                  }
-+               }
-+
-+               for (int qy = 0; qy < Q1D; ++qy)
-+               {
-+                  const double wy = (c == 1) ? Bo(qy,dy) : Bc(qy,dy);
-+                  for (int qx = 0; qx < Q1D; ++qx)
-+                  {
-+                     const double wx = massX[qx];
-+                     massXY[qy][qx] += wx * wy;
-+                  }
-+               }
-+            }
-+
-+            for (int qz = 0; qz < Q1D; ++qz)
-+            {
-+               const double wz = (c == 2) ? Bo(qz,dz) : Bc(qz,dz);
-+               for (int qy = 0; qy < Q1D; ++qy)
-+               {
-+                  for (int qx = 0; qx < Q1D; ++qx)
-+                  {
-+                     mass[qz][qy][qx][c] += massXY[qy][qx] * wz;
-+                  }
-+               }
-+            }
-+         }
-+
-+         osc += D1Dx * D1Dy * D1Dz;
-+      }  // loop (c) over components
-+
-+      // Apply D operator.
-+      for (int qz = 0; qz < Q1D; ++qz)
-+      {
-+         for (int qy = 0; qy < Q1D; ++qy)
-+         {
-+            for (int qx = 0; qx < Q1D; ++qx)
-+            {
-+               const double O11 = op(qx,qy,qz,0,e);
-+               const double O12 = op(qx,qy,qz,1,e);
-+               const double O13 = op(qx,qy,qz,2,e);
-+               const double O21 = symmetric ? O12 : op(qx,qy,qz,3,e);
-+               const double O22 = symmetric ? op(qx,qy,qz,3,e) : op(qx,qy,qz,4,e);
-+               const double O23 = symmetric ? op(qx,qy,qz,4,e) : op(qx,qy,qz,5,e);
-+               const double O31 = symmetric ? O13 : op(qx,qy,qz,6,e);
-+               const double O32 = symmetric ? O23 : op(qx,qy,qz,7,e);
-+               const double O33 = symmetric ? op(qx,qy,qz,5,e) : op(qx,qy,qz,8,e);
-+               const double massX = mass[qz][qy][qx][0];
-+               const double massY = mass[qz][qy][qx][1];
-+               const double massZ = mass[qz][qy][qx][2];
-+               mass[qz][qy][qx][0] = (O11*massX)+(O12*massY)+(O13*massZ);
-+               mass[qz][qy][qx][1] = (O21*massX)+(O22*massY)+(O23*massZ);
-+               mass[qz][qy][qx][2] = (O31*massX)+(O32*massY)+(O33*massZ);
-+            }
-+         }
-+      }
-+
-+      for (int qz = 0; qz < Q1D; ++qz)
-+      {
-+         double massXY[MAX_D1D][MAX_D1D];
-+
-+         osc = 0;
-+
-+         for (int c = 0; c < VDIM; ++c)  // loop over x, y, z components
-+         {
-+            const int D1Dz = (c == 2) ? D1D - 1 : D1D;
-+            const int D1Dy = (c == 1) ? D1D - 1 : D1D;
-+            const int D1Dx = (c == 0) ? D1D - 1 : D1D;
-+
-+            for (int dy = 0; dy < D1Dy; ++dy)
-+            {
-+               for (int dx = 0; dx < D1Dx; ++dx)
-+               {
-+                  massXY[dy][dx] = 0.0;
-+               }
-+            }
-+            for (int qy = 0; qy < Q1D; ++qy)
-+            {
-+               double massX[MAX_D1D];
-+               for (int dx = 0; dx < D1Dx; ++dx)
-+               {
-+                  massX[dx] = 0;
-+               }
-+               for (int qx = 0; qx < Q1D; ++qx)
-+               {
-+                  for (int dx = 0; dx < D1Dx; ++dx)
-+                  {
-+                     massX[dx] += mass[qz][qy][qx][c] * ((c == 0) ? Bot(dx,qx) : Bct(dx,qx));
-+                  }
-+               }
-+               for (int dy = 0; dy < D1Dy; ++dy)
-+               {
-+                  const double wy = (c == 1) ? Bot(dy,qy) : Bct(dy,qy);
-+                  for (int dx = 0; dx < D1Dx; ++dx)
-+                  {
-+                     massXY[dy][dx] += massX[dx] * wy;
-+                  }
-+               }
-+            }
-+
-+            for (int dz = 0; dz < D1Dz; ++dz)
-+            {
-+               const double wz = (c == 2) ? Bot(dz,qz) : Bct(dz,qz);
-+               for (int dy = 0; dy < D1Dy; ++dy)
-+               {
-+                  for (int dx = 0; dx < D1Dx; ++dx)
-+                  {
-+                     Y(dx + ((dy + (dz * D1Dy)) * D1Dx) + osc, e) += massXY[dy][dx] * wz;
-+                  }
-+               }
-+            }
-+
-+            osc += D1Dx * D1Dy * D1Dz;
-+         }  // loop c
-+      }  // loop qz
-+   }); // end of element loop
-+}
-+
-+template<int T_D1D = HCURL_MAX_D1D, int T_Q1D = HCURL_MAX_Q1D>
-+MFEM_HOST_DEVICE inline
-+void SmemPAHcurlMassApply3D(const int D1D,
-+                            const int Q1D,
-+                            const int NE,
-+                            const bool symmetric,
-+                            const Array<double> &bo,
-+                            const Array<double> &bc,
-+                            const Array<double> &bot,
-+                            const Array<double> &bct,
-+                            const Vector &pa_data,
-+                            const Vector &x,
-+                            Vector &y)
-+{
-+   MFEM_VERIFY(D1D <= HCURL_MAX_D1D, "Error: D1D > MAX_D1D");
-+   MFEM_VERIFY(Q1D <= HCURL_MAX_Q1D, "Error: Q1D > MAX_Q1D");
-+
-+   const int dataSize = symmetric ? 6 : 9;
-+
-+   auto Bo = Reshape(bo.Read(), Q1D, D1D-1);
-+   auto Bc = Reshape(bc.Read(), Q1D, D1D);
-+   auto op = Reshape(pa_data.Read(), Q1D, Q1D, Q1D, dataSize, NE);
-+   auto X = Reshape(x.Read(), 3*(D1D-1)*D1D*D1D, NE);
-+   auto Y = Reshape(y.ReadWrite(), 3*(D1D-1)*D1D*D1D, NE);
-+
-+   mfem::forall_3D(NE, Q1D, Q1D, Q1D, [=] MFEM_HOST_DEVICE (int e)
-+   {
-+      constexpr int VDIM = 3;
-+
-+      MFEM_SHARED double sBo[T_Q1D][T_D1D];
-+      MFEM_SHARED double sBc[T_Q1D][T_D1D];
-+
-+      double op9[9];
-+      MFEM_SHARED double sop[9*T_Q1D*T_Q1D];
-+      MFEM_SHARED double mass[T_Q1D][T_Q1D][3];
-+
-+      MFEM_SHARED double sX[T_D1D][T_D1D][T_D1D];
-+
-+      MFEM_FOREACH_THREAD(qx,x,Q1D)
-+      {
-+         MFEM_FOREACH_THREAD(qy,y,Q1D)
-+         {
-+            MFEM_FOREACH_THREAD(qz,z,Q1D)
-+            {
-+               for (int i=0; i<dataSize; ++i)
-+               {
-+                  op9[i] = op(qx,qy,qz,i,e);
-+               }
-+            }
-+         }
-+      }
-+
-+      const int tidx = MFEM_THREAD_ID(x);
-+      const int tidy = MFEM_THREAD_ID(y);
-+      const int tidz = MFEM_THREAD_ID(z);
-+
-+      if (tidz == 0)
-+      {
-+         MFEM_FOREACH_THREAD(d,y,D1D)
-+         {
-+            MFEM_FOREACH_THREAD(q,x,Q1D)
-+            {
-+               sBc[q][d] = Bc(q,d);
-+               if (d < D1D-1)
-+               {
-+                  sBo[q][d] = Bo(q,d);
-+               }
-+            }
-+         }
-+      }
-+      MFEM_SYNC_THREAD;
-+
-+      for (int qz=0; qz < Q1D; ++qz)
-+      {
-+         int osc = 0;
-+         for (int c = 0; c < VDIM; ++c)  // loop over x, y, z components
-+         {
-+            const int D1Dz = (c == 2) ? D1D - 1 : D1D;
-+            const int D1Dy = (c == 1) ? D1D - 1 : D1D;
-+            const int D1Dx = (c == 0) ? D1D - 1 : D1D;
-+
-+            MFEM_FOREACH_THREAD(dz,z,D1Dz)
-+            {
-+               MFEM_FOREACH_THREAD(dy,y,D1Dy)
-+               {
-+                  MFEM_FOREACH_THREAD(dx,x,D1Dx)
-+                  {
-+                     sX[dz][dy][dx] = X(dx + ((dy + (dz * D1Dy)) * D1Dx) + osc, e);
-+                  }
-+               }
-+            }
-+            MFEM_SYNC_THREAD;
-+
-+            if (tidz == qz)
-+            {
-+               for (int i=0; i<dataSize; ++i)
-+               {
-+                  sop[i + (dataSize*tidx) + (dataSize*Q1D*tidy)] = op9[i];
-+               }
-+
-+               MFEM_FOREACH_THREAD(qy,y,Q1D)
-+               {
-+                  MFEM_FOREACH_THREAD(qx,x,Q1D)
-+                  {
-+                     double u = 0.0;
-+
-+                     for (int dz = 0; dz < D1Dz; ++dz)
-+                     {
-+                        const double wz = (c == 2) ? sBo[qz][dz] : sBc[qz][dz];
-+                        for (int dy = 0; dy < D1Dy; ++dy)
-+                        {
-+                           const double wy = (c == 1) ? sBo[qy][dy] : sBc[qy][dy];
-+                           for (int dx = 0; dx < D1Dx; ++dx)
-+                           {
-+                              const double t = sX[dz][dy][dx];
-+                              const double wx = (c == 0) ? sBo[qx][dx] : sBc[qx][dx];
-+                              u += t * wx * wy * wz;
-+                           }
-+                        }
-+                     }
-+
-+                     mass[qy][qx][c] = u;
-+                  } // qx
-+               } // qy
-+            } // tidz == qz
-+
-+            osc += D1Dx * D1Dy * D1Dz;
-+            MFEM_SYNC_THREAD;
-+         } // c
-+
-+         MFEM_SYNC_THREAD;  // Sync mass[qy][qx][d] and sop
-+
-+         osc = 0;
-+         for (int c = 0; c < VDIM; ++c)  // loop over x, y, z components
-+         {
-+            const int D1Dz = (c == 2) ? D1D - 1 : D1D;
-+            const int D1Dy = (c == 1) ? D1D - 1 : D1D;
-+            const int D1Dx = (c == 0) ? D1D - 1 : D1D;
-+
-+            double dxyz = 0.0;
-+
-+            MFEM_FOREACH_THREAD(dz,z,D1Dz)
-+            {
-+               const double wz = (c == 2) ? sBo[qz][dz] : sBc[qz][dz];
-+
-+               MFEM_FOREACH_THREAD(dy,y,D1Dy)
-+               {
-+                  MFEM_FOREACH_THREAD(dx,x,D1Dx)
-+                  {
-+                     for (int qy = 0; qy < Q1D; ++qy)
-+                     {
-+                        const double wy = (c == 1) ? sBo[qy][dy] : sBc[qy][dy];
-+                        for (int qx = 0; qx < Q1D; ++qx)
-+                        {
-+                           const int os = (dataSize*qx) + (dataSize*Q1D*qy);
-+                           const int id1 = os + ((c == 0) ? 0 : ((c == 1) ? (symmetric ? 1 : 3) :
-+                                                                 (symmetric ? 2 : 6))); // O11, O21, O31
-+                           const int id2 = os + ((c == 0) ? 1 : ((c == 1) ? (symmetric ? 3 : 4) :
-+                                                                 (symmetric ? 4 : 7))); // O12, O22, O32
-+                           const int id3 = os + ((c == 0) ? 2 : ((c == 1) ? (symmetric ? 4 : 5) :
-+                                                                 (symmetric ? 5 : 8))); // O13, O23, O33
-+
-+                           const double m_c = (sop[id1] * mass[qy][qx][0]) + (sop[id2] * mass[qy][qx][1]) +
-+                                              (sop[id3] * mass[qy][qx][2]);
-+
-+                           const double wx = (c == 0) ? sBo[qx][dx] : sBc[qx][dx];
-+                           dxyz += m_c * wx * wy * wz;
-+                        }
-+                     }
-+                  }
-+               }
-+            }
-+
-+            MFEM_SYNC_THREAD;
-+
-+            MFEM_FOREACH_THREAD(dz,z,D1Dz)
-+            {
-+               MFEM_FOREACH_THREAD(dy,y,D1Dy)
-+               {
-+                  MFEM_FOREACH_THREAD(dx,x,D1Dx)
-+                  {
-+                     Y(dx + ((dy + (dz * D1Dy)) * D1Dx) + osc, e) += dxyz;
-+                  }
-+               }
-+            }
-+
-+            osc += D1Dx * D1Dy * D1Dz;
-+         } // c loop
-+      } // qz
-+   }); // end of element loop
-+}
-+
-+// PA H(curl) curl-curl assemble 2D kernel
-+MFEM_HOST_DEVICE inline
-+void PACurlCurlSetup2D(const int Q1D,
-+                       const int NE,
-+                       const Array<double> &w,
-+                       const Vector &j,
-+                       Vector &coeff,
-+                       Vector &op)
-+{
-+   const int NQ = Q1D*Q1D;
-+   auto W = w.Read();
-+   auto J = Reshape(j.Read(), NQ, 2, 2, NE);
-+   auto C = Reshape(coeff.Read(), NQ, NE);
-+   auto y = Reshape(op.Write(), NQ, NE);
-+   mfem::forall(NE, [=] MFEM_HOST_DEVICE (int e)
-+   {
-+      for (int q = 0; q < NQ; ++q)
-+      {
-+         const double J11 = J(q,0,0,e);
-+         const double J21 = J(q,1,0,e);
-+         const double J12 = J(q,0,1,e);
-+         const double J22 = J(q,1,1,e);
-+         const double detJ = (J11*J22)-(J21*J12);
-+         y(q,e) = W[q] * C(q,e) / detJ;
-+      }
-+   });
-+}
-+
-+// PA H(curl) curl-curl assemble 3D kernel
-+MFEM_HOST_DEVICE inline
-+void PACurlCurlSetup3D(const int Q1D,
-+                       const int coeffDim,
-+                       const int NE,
-+                       const Array<double> &w,
-+                       const Vector &j,
-+                       Vector &coeff,
-+                       Vector &op)
-+{
-+   const int NQ = Q1D*Q1D*Q1D;
-+   const bool symmetric = (coeffDim != 9);
-+   auto W = w.Read();
-+   auto J = Reshape(j.Read(), NQ, 3, 3, NE);
-+   auto C = Reshape(coeff.Read(), coeffDim, NQ, NE);
-+   auto y = Reshape(op.Write(), NQ, symmetric ? 6 : 9, NE);
-+
-+   mfem::forall(NE, [=] MFEM_HOST_DEVICE (int e)
-+   {
-+      for (int q = 0; q < NQ; ++q)
-+      {
-+         const double J11 = J(q,0,0,e);
-+         const double J21 = J(q,1,0,e);
-+         const double J31 = J(q,2,0,e);
-+         const double J12 = J(q,0,1,e);
-+         const double J22 = J(q,1,1,e);
-+         const double J32 = J(q,2,1,e);
-+         const double J13 = J(q,0,2,e);
-+         const double J23 = J(q,1,2,e);
-+         const double J33 = J(q,2,2,e);
-+         const double detJ = J11 * (J22 * J33 - J32 * J23) -
-+                             J21 * (J12 * J33 - J32 * J13) +
-+                             J31 * (J12 * J23 - J22 * J13);
-+
-+         const double c_detJ = W[q] / detJ;
-+
-+         if (coeffDim == 6 || coeffDim == 9) // Matrix coefficient version
-+         {
-+            // Set y to the 6 or 9 entries of J^T M J / det
-+            const double M11 = C(0, q, e);
-+            const double M12 = C(1, q, e);
-+            const double M13 = C(2, q, e);
-+            const double M21 = (!symmetric) ? C(3, q, e) : M12;
-+            const double M22 = (!symmetric) ? C(4, q, e) : C(3, q, e);
-+            const double M23 = (!symmetric) ? C(5, q, e) : C(4, q, e);
-+            const double M31 = (!symmetric) ? C(6, q, e) : M13;
-+            const double M32 = (!symmetric) ? C(7, q, e) : M23;
-+            const double M33 = (!symmetric) ? C(8, q, e) : C(5, q, e);
-+
-+            // First compute R = MJ
-+            const double R11 = M11*J11 + M12*J21 + M13*J31;
-+            const double R12 = M11*J12 + M12*J22 + M13*J32;
-+            const double R13 = M11*J13 + M12*J23 + M13*J33;
-+            const double R21 = M21*J11 + M22*J21 + M23*J31;
-+            const double R22 = M21*J12 + M22*J22 + M23*J32;
-+            const double R23 = M21*J13 + M22*J23 + M23*J33;
-+            const double R31 = M31*J11 + M32*J21 + M33*J31;
-+            const double R32 = M31*J12 + M32*J22 + M33*J32;
-+            const double R33 = M31*J13 + M32*J23 + M33*J33;
-+
-+            // Now set y to J^T R / det
-+            y(q,0,e) = c_detJ * (J11*R11 + J21*R21 + J31*R31); // 1,1
-+            const double Y12 = c_detJ * (J11*R12 + J21*R22 + J31*R32);
-+            y(q,1,e) = Y12; // 1,2
-+            y(q,2,e) = c_detJ * (J11*R13 + J21*R23 + J31*R33); // 1,3
-+
-+            const double Y21 = c_detJ * (J12*R11 + J22*R21 + J32*R31);
-+            const double Y22 = c_detJ * (J12*R12 + J22*R22 + J32*R32);
-+            const double Y23 = c_detJ * (J12*R13 + J22*R23 + J32*R33);
-+
-+            const double Y33 = c_detJ * (J13*R13 + J23*R23 + J33*R33);
-+
-+            y(q,3,e) = symmetric ? Y22 : Y21; // 2,2 or 2,1
-+            y(q,4,e) = symmetric ? Y23 : Y22; // 2,3 or 2,2
-+            y(q,5,e) = symmetric ? Y33 : Y23; // 3,3 or 2,3
-+
-+            if (!symmetric)
-+            {
-+               y(q,6,e) = c_detJ * (J13*R11 + J23*R21 + J33*R31); // 3,1
-+               y(q,7,e) = c_detJ * (J13*R12 + J23*R22 + J33*R32); // 3,2
-+               y(q,8,e) = Y33; // 3,3
-+            }
-+         }
-+         else  // Vector or scalar coefficient version
-+         {
-+            // Set y to the 6 entries of J^T D J / det^2
-+            const double D1 = C(0, q, e);
-+            const double D2 = coeffDim == 3 ? C(1, q, e) : D1;
-+            const double D3 = coeffDim == 3 ? C(2, q, e) : D1;
-+
-+            y(q,0,e) = c_detJ * (D1*J11*J11 + D2*J21*J21 + D3*J31*J31); // 1,1
-+            y(q,1,e) = c_detJ * (D1*J11*J12 + D2*J21*J22 + D3*J31*J32); // 1,2
-+            y(q,2,e) = c_detJ * (D1*J11*J13 + D2*J21*J23 + D3*J31*J33); // 1,3
-+            y(q,3,e) = c_detJ * (D1*J12*J12 + D2*J22*J22 + D3*J32*J32); // 2,2
-+            y(q,4,e) = c_detJ * (D1*J12*J13 + D2*J22*J23 + D3*J32*J33); // 2,3
-+            y(q,5,e) = c_detJ * (D1*J13*J13 + D2*J23*J23 + D3*J33*J33); // 3,3
-+         }
-+      }
-+   });
-+}
-+
-+MFEM_HOST_DEVICE inline
-+void PACurlCurlAssembleDiagonal2D(const int D1D,
-+                                  const int Q1D,
-+                                  const int NE,
-+                                  const Array<double> &bo,
-+                                  const Array<double> &gc,
-+                                  const Vector &pa_data,
-+                                  Vector &diag)
-+{
-+   constexpr static int VDIM = 2;
-+   constexpr static int MAX_Q1D = HCURL_MAX_Q1D;
-+
-+   auto Bo = Reshape(bo.Read(), Q1D, D1D-1);
-+   auto Gc = Reshape(gc.Read(), Q1D, D1D);
-+   auto op = Reshape(pa_data.Read(), Q1D, Q1D, NE);
-+   auto D = Reshape(diag.ReadWrite(), 2*(D1D-1)*D1D, NE);
-+
-+   mfem::forall(NE, [=] MFEM_HOST_DEVICE (int e)
-+   {
-+      int osc = 0;
-+
-+      for (int c = 0; c < VDIM; ++c)  // loop over x, y components
-+      {
-+         const int D1Dy = (c == 1) ? D1D - 1 : D1D;
-+         const int D1Dx = (c == 0) ? D1D - 1 : D1D;
-+
-+         double t[MAX_Q1D];
-+
-+         for (int dy = 0; dy < D1Dy; ++dy)
-+         {
-+            for (int qx = 0; qx < Q1D; ++qx)
-+            {
-+               t[qx] = 0.0;
-+               for (int qy = 0; qy < Q1D; ++qy)
-+               {
-+                  const double wy = (c == 1) ? Bo(qy,dy) : -Gc(qy,dy);
-+                  t[qx] += wy * wy * op(qx,qy,e);
-+               }
-+            }
-+
-+            for (int dx = 0; dx < D1Dx; ++dx)
-+            {
-+               for (int qx = 0; qx < Q1D; ++qx)
-+               {
-+                  const double wx = ((c == 0) ? Bo(qx,dx) : Gc(qx,dx));
-+                  D(dx + (dy * D1Dx) + osc, e) += t[qx] * wx * wx;
-+               }
-+            }
-+         }
-+
-+         osc += D1Dx * D1Dy;
-+      }  // loop c
-+   }); // end of element loop
-+}
-+
-+template<int MAX_D1D = HCURL_MAX_D1D, int MAX_Q1D = HCURL_MAX_Q1D>
-+MFEM_HOST_DEVICE inline
-+void PACurlCurlAssembleDiagonal3D(const int D1D,
-+                                  const int Q1D,
-+                                  const bool symmetric,
-+                                  const int NE,
-+                                  const Array<double> &bo,
-+                                  const Array<double> &bc,
-+                                  const Array<double> &go,
-+                                  const Array<double> &gc,
-+                                  const Vector &pa_data,
-+                                  Vector &diag)
-+{
-+   constexpr static int VDIM = 3;
-+   MFEM_VERIFY(D1D <= MAX_D1D, "Error: D1D > MAX_D1D");
-+   MFEM_VERIFY(Q1D <= MAX_Q1D, "Error: Q1D > MAX_Q1D");
-+
-+   auto Bo = Reshape(bo.Read(), Q1D, D1D-1);
-+   auto Bc = Reshape(bc.Read(), Q1D, D1D);
-+   auto Go = Reshape(go.Read(), Q1D, D1D-1);
-+   auto Gc = Reshape(gc.Read(), Q1D, D1D);
-+   auto op = Reshape(pa_data.Read(), Q1D, Q1D, Q1D, (symmetric ? 6 : 9), NE);
-+   auto D = Reshape(diag.ReadWrite(), 3*(D1D-1)*D1D*D1D, NE);
-+
-+   const int s = symmetric ? 6 : 9;
-+   const int i11 = 0;
-+   const int i12 = 1;
-+   const int i13 = 2;
-+   const int i21 = symmetric ? i12 : 3;
-+   const int i22 = symmetric ? 3 : 4;
-+   const int i23 = symmetric ? 4 : 5;
-+   const int i31 = symmetric ? i13 : 6;
-+   const int i32 = symmetric ? i23 : 7;
-+   const int i33 = symmetric ? 5 : 8;
-+
-+   mfem::forall(NE, [=] MFEM_HOST_DEVICE (int e)
-+   {
-+      // Using (\nabla\times u) F = 1/det(dF) dF \hat{\nabla}\times\hat{u} (p. 78 of Monk), we get
-+      // (\nabla\times u) \cdot (\nabla\times u) = 1/det(dF)^2 \hat{\nabla}\times\hat{u}^T dF^T dF \hat{\nabla}\times\hat{u}
-+      // If c = 0, \hat{\nabla}\times\hat{u} reduces to [0, (u_0)_{x_2}, -(u_0)_{x_1}]
-+      // If c = 1, \hat{\nabla}\times\hat{u} reduces to [-(u_1)_{x_2}, 0, (u_1)_{x_0}]
-+      // If c = 2, \hat{\nabla}\times\hat{u} reduces to [(u_2)_{x_1}, -(u_2)_{x_0}, 0]
-+
-+      // For each c, we will keep 9 arrays for derivatives multiplied by the 9 entries of the 3x3 matrix (dF^T C dF),
-+      // which may be non-symmetric depending on a possibly non-symmetric matrix coefficient.
-+
-+      int osc = 0;
-+
-+      for (int c = 0; c < VDIM; ++c)  // loop over x, y, z components
-+      {
-+         const int D1Dz = (c == 2) ? D1D - 1 : D1D;
-+         const int D1Dy = (c == 1) ? D1D - 1 : D1D;
-+         const int D1Dx = (c == 0) ? D1D - 1 : D1D;
-+
-+         double zt[MAX_Q1D][MAX_Q1D][MAX_D1D][9][3];
-+
-+         // z contraction
-+         for (int qx = 0; qx < Q1D; ++qx)
-+         {
-+            for (int qy = 0; qy < Q1D; ++qy)
-+            {
-+               for (int dz = 0; dz < D1Dz; ++dz)
-+               {
-+                  for (int i=0; i<s; ++i)
-+                  {
-+                     for (int d=0; d<3; ++d)
-+                     {
-+                        zt[qx][qy][dz][i][d] = 0.0;
-+                     }
-+                  }
-+
-+                  for (int qz = 0; qz < Q1D; ++qz)
-+                  {
-+                     const double wz = ((c == 2) ? Bo(qz,dz) : Bc(qz,dz));
-+                     const double wDz = ((c == 2) ? Go(qz,dz) : Gc(qz,dz));
-+
-+                     for (int i=0; i<s; ++i)
-+                     {
-+                        zt[qx][qy][dz][i][0] += wz * wz * op(qx,qy,qz,i,e);
-+                        zt[qx][qy][dz][i][1] += wDz * wz * op(qx,qy,qz,i,e);
-+                        zt[qx][qy][dz][i][2] += wDz * wDz * op(qx,qy,qz,i,e);
-+                     }
-+                  }
-+               }
-+            }
-+         }  // end of z contraction
-+
-+         double yt[MAX_Q1D][MAX_D1D][MAX_D1D][9][3][3];
-+
-+         // y contraction
-+         for (int qx = 0; qx < Q1D; ++qx)
-+         {
-+            for (int dz = 0; dz < D1Dz; ++dz)
-+            {
-+               for (int dy = 0; dy < D1Dy; ++dy)
-+               {
-+                  for (int i=0; i<s; ++i)
-+                  {
-+                     for (int d=0; d<3; ++d)
-+                        for (int j=0; j<3; ++j)
-+                        {
-+                           yt[qx][dy][dz][i][d][j] = 0.0;
-+                        }
-+                  }
-+
-+                  for (int qy = 0; qy < Q1D; ++qy)
-+                  {
-+                     const double wy = ((c == 1) ? Bo(qy,dy) : Bc(qy,dy));
-+                     const double wDy = ((c == 1) ? Go(qy,dy) : Gc(qy,dy));
-+
-+                     for (int i=0; i<s; ++i)
-+                     {
-+                        for (int d=0; d<3; ++d)
-+                        {
-+                           yt[qx][dy][dz][i][d][0] += wy * wy * zt[qx][qy][dz][i][d];
-+                           yt[qx][dy][dz][i][d][1] += wDy * wy * zt[qx][qy][dz][i][d];
-+                           yt[qx][dy][dz][i][d][2] += wDy * wDy * zt[qx][qy][dz][i][d];
-+                        }
-+                     }
-+                  }
-+               }
-+            }
-+         }  // end of y contraction
-+
-+         // x contraction
-+         for (int dz = 0; dz < D1Dz; ++dz)
-+         {
-+            for (int dy = 0; dy < D1Dy; ++dy)
-+            {
-+               for (int dx = 0; dx < D1Dx; ++dx)
-+               {
-+                  for (int qx = 0; qx < Q1D; ++qx)
-+                  {
-+                     const double wx = ((c == 0) ? Bo(qx,dx) : Bc(qx,dx));
-+                     const double wDx = ((c == 0) ? Go(qx,dx) : Gc(qx,dx));
-+
-+                     // Using (\nabla\times u) F = 1/det(dF) dF \hat{\nabla}\times\hat{u} (p. 78 of Monk), we get
-+                     // (\nabla\times u) \cdot (\nabla\times u) = 1/det(dF)^2 \hat{\nabla}\times\hat{u}^T dF^T dF \hat{\nabla}\times\hat{u}
-+                     // If c = 0, \hat{\nabla}\times\hat{u} reduces to [0, (u_0)_{x_2}, -(u_0)_{x_1}]
-+                     // If c = 1, \hat{\nabla}\times\hat{u} reduces to [-(u_1)_{x_2}, 0, (u_1)_{x_0}]
-+                     // If c = 2, \hat{\nabla}\times\hat{u} reduces to [(u_2)_{x_1}, -(u_2)_{x_0}, 0]
-+
-+                     /*
-+                       const double O11 = op(q,0,e);
-+                       const double O12 = op(q,1,e);
-+                       const double O13 = op(q,2,e);
-+                       const double O22 = op(q,3,e);
-+                       const double O23 = op(q,4,e);
-+                       const double O33 = op(q,5,e);
-+                     */
-+
-+                     if (c == 0)
-+                     {
-+                        // (u_0)_{x_2} (O22 (u_0)_{x_2} - O23 (u_0)_{x_1}) - (u_0)_{x_1} (O32 (u_0)_{x_2} - O33 (u_0)_{x_1})
-+                        const double sumy = yt[qx][dy][dz][i22][2][0] - yt[qx][dy][dz][i23][1][1]
-+                                            - yt[qx][dy][dz][i32][1][1] + yt[qx][dy][dz][i33][0][2];
-+
-+                        D(dx + ((dy + (dz * D1Dy)) * D1Dx) + osc, e) += sumy * wx * wx;
-+                     }
-+                     else if (c == 1)
-+                     {
-+                        // (u_1)_{x_2} (O11 (u_1)_{x_2} - O13 (u_1)_{x_0}) + (u_1)_{x_0} (-O31 (u_1)_{x_2} + O33 (u_1)_{x_0})
-+                        const double d = (yt[qx][dy][dz][i11][2][0] * wx * wx)
-+                                         - ((yt[qx][dy][dz][i13][1][0] + yt[qx][dy][dz][i31][1][0]) * wDx * wx)
-+                                         + (yt[qx][dy][dz][i33][0][0] * wDx * wDx);
-+
-+                        D(dx + ((dy + (dz * D1Dy)) * D1Dx) + osc, e) += d;
-+                     }
-+                     else
-+                     {
-+                        // (u_2)_{x_1} (O11 (u_2)_{x_1} - O12 (u_2)_{x_0}) - (u_2)_{x_0} (O21 (u_2)_{x_1} - O22 (u_2)_{x_0})
-+                        const double d = (yt[qx][dy][dz][i11][0][2] * wx * wx)
-+                                         - ((yt[qx][dy][dz][i12][0][1] + yt[qx][dy][dz][i21][0][1]) * wDx * wx)
-+                                         + (yt[qx][dy][dz][i22][0][0] * wDx * wDx);
-+
-+                        D(dx + ((dy + (dz * D1Dy)) * D1Dx) + osc, e) += d;
-+                     }
-+                  }
-+               }
-+            }
-+         }  // end of x contraction
-+
-+         osc += D1Dx * D1Dy * D1Dz;
-+      }  // loop c
-+   }); // end of element loop
-+}
-+
-+template<int MAX_D1D = HCURL_MAX_D1D, int MAX_Q1D = HCURL_MAX_Q1D>
-+MFEM_HOST_DEVICE inline
-+void SmemPACurlCurlAssembleDiagonal3D(const int D1D,
-+                                      const int Q1D,
-+                                      const bool symmetric,
-+                                      const int NE,
-+                                      const Array<double> &bo,
-+                                      const Array<double> &bc,
-+                                      const Array<double> &go,
-+                                      const Array<double> &gc,
-+                                      const Vector &pa_data,
-+                                      Vector &diag)
-+{
-+   MFEM_VERIFY(D1D <= MAX_D1D, "Error: D1D > MAX_D1D");
-+   MFEM_VERIFY(Q1D <= MAX_Q1D, "Error: Q1D > MAX_Q1D");
-+
-+   auto Bo = Reshape(bo.Read(), Q1D, D1D-1);
-+   auto Bc = Reshape(bc.Read(), Q1D, D1D);
-+   auto Go = Reshape(go.Read(), Q1D, D1D-1);
-+   auto Gc = Reshape(gc.Read(), Q1D, D1D);
-+   auto op = Reshape(pa_data.Read(), Q1D, Q1D, Q1D, (symmetric ? 6 : 9), NE);
-+   auto D = Reshape(diag.ReadWrite(), 3*(D1D-1)*D1D*D1D, NE);
-+
-+   const int s = symmetric ? 6 : 9;
-+   const int i11 = 0;
-+   const int i12 = 1;
-+   const int i13 = 2;
-+   const int i21 = symmetric ? i12 : 3;
-+   const int i22 = symmetric ? 3 : 4;
-+   const int i23 = symmetric ? 4 : 5;
-+   const int i31 = symmetric ? i13 : 6;
-+   const int i32 = symmetric ? i23 : 7;
-+   const int i33 = symmetric ? 5 : 8;
-+
-+   mfem::forall_3D(NE, Q1D, Q1D, Q1D, [=] MFEM_HOST_DEVICE (int e)
-+   {
-+      // Using (\nabla\times u) F = 1/det(dF) dF \hat{\nabla}\times\hat{u} (p. 78 of Monk), we get
-+      // (\nabla\times u) \cdot (\nabla\times u) = 1/det(dF)^2 \hat{\nabla}\times\hat{u}^T dF^T dF \hat{\nabla}\times\hat{u}
-+      // If c = 0, \hat{\nabla}\times\hat{u} reduces to [0, (u_0)_{x_2}, -(u_0)_{x_1}]
-+      // If c = 1, \hat{\nabla}\times\hat{u} reduces to [-(u_1)_{x_2}, 0, (u_1)_{x_0}]
-+      // If c = 2, \hat{\nabla}\times\hat{u} reduces to [(u_2)_{x_1}, -(u_2)_{x_0}, 0]
-+
-+      constexpr int VDIM = 3;
-+
-+      MFEM_SHARED double sBo[MAX_Q1D][MAX_D1D];
-+      MFEM_SHARED double sBc[MAX_Q1D][MAX_D1D];
-+      MFEM_SHARED double sGo[MAX_Q1D][MAX_D1D];
-+      MFEM_SHARED double sGc[MAX_Q1D][MAX_D1D];
-+
-+      double ope[9];
-+      MFEM_SHARED double sop[9][MAX_Q1D][MAX_Q1D];
-+
-+      MFEM_FOREACH_THREAD(qx,x,Q1D)
-+      {
-+         MFEM_FOREACH_THREAD(qy,y,Q1D)
-+         {
-+            MFEM_FOREACH_THREAD(qz,z,Q1D)
-+            {
-+               for (int i=0; i<s; ++i)
-+               {
-+                  ope[i] = op(qx,qy,qz,i,e);
-+               }
-+            }
-+         }
-+      }
-+
-+      const int tidx = MFEM_THREAD_ID(x);
-+      const int tidy = MFEM_THREAD_ID(y);
-+      const int tidz = MFEM_THREAD_ID(z);
-+
-+      if (tidz == 0)
-+      {
-+         MFEM_FOREACH_THREAD(d,y,D1D)
-+         {
-+            MFEM_FOREACH_THREAD(q,x,Q1D)
-+            {
-+               sBc[q][d] = Bc(q,d);
-+               sGc[q][d] = Gc(q,d);
-+               if (d < D1D-1)
-+               {
-+                  sBo[q][d] = Bo(q,d);
-+                  sGo[q][d] = Go(q,d);
-+               }
-+            }
-+         }
-+      }
-+      MFEM_SYNC_THREAD;
-+
-+      int osc = 0;
-+      for (int c = 0; c < VDIM; ++c)  // loop over x, y, z components
-+      {
-+         const int D1Dz = (c == 2) ? D1D - 1 : D1D;
-+         const int D1Dy = (c == 1) ? D1D - 1 : D1D;
-+         const int D1Dx = (c == 0) ? D1D - 1 : D1D;
-+
-+         double dxyz = 0.0;
-+
-+         for (int qz=0; qz < Q1D; ++qz)
-+         {
-+            if (tidz == qz)
-+            {
-+               for (int i=0; i<s; ++i)
-+               {
-+                  sop[i][tidx][tidy] = ope[i];
-+               }
-+            }
-+
-+            MFEM_SYNC_THREAD;
-+
-+            MFEM_FOREACH_THREAD(dz,z,D1Dz)
-+            {
-+               const double wz = ((c == 2) ? sBo[qz][dz] : sBc[qz][dz]);
-+               const double wDz = ((c == 2) ? sGo[qz][dz] : sGc[qz][dz]);
-+
-+               MFEM_FOREACH_THREAD(dy,y,D1Dy)
-+               {
-+                  MFEM_FOREACH_THREAD(dx,x,D1Dx)
-+                  {
-+                     for (int qy = 0; qy < Q1D; ++qy)
-+                     {
-+                        const double wy = ((c == 1) ? sBo[qy][dy] : sBc[qy][dy]);
-+                        const double wDy = ((c == 1) ? sGo[qy][dy] : sGc[qy][dy]);
-+
-+                        for (int qx = 0; qx < Q1D; ++qx)
-+                        {
-+                           const double wx = ((c == 0) ? sBo[qx][dx] : sBc[qx][dx]);
-+                           const double wDx = ((c == 0) ? sGo[qx][dx] : sGc[qx][dx]);
-+
-+                           if (c == 0)
-+                           {
-+                              // (u_0)_{x_2} (O22 (u_0)_{x_2} - O23 (u_0)_{x_1}) - (u_0)_{x_1} (O32 (u_0)_{x_2} - O33 (u_0)_{x_1})
-+
-+                              // (u_0)_{x_2} O22 (u_0)_{x_2}
-+                              dxyz += sop[i22][qx][qy] * wx * wx * wy * wy * wDz * wDz;
-+
-+                              // -(u_0)_{x_2} O23 (u_0)_{x_1} - (u_0)_{x_1} O32 (u_0)_{x_2}
-+                              dxyz += -(sop[i23][qx][qy] + sop[i32][qx][qy]) * wx * wx * wDy * wy * wDz * wz;
-+
-+                              // (u_0)_{x_1} O33 (u_0)_{x_1}
-+                              dxyz += sop[i33][qx][qy] * wx * wx * wDy * wDy * wz * wz;
-+                           }
-+                           else if (c == 1)
-+                           {
-+                              // (u_1)_{x_2} (O11 (u_1)_{x_2} - O13 (u_1)_{x_0}) + (u_1)_{x_0} (-O31 (u_1)_{x_2} + O33 (u_1)_{x_0})
-+
-+                              // (u_1)_{x_2} O11 (u_1)_{x_2}
-+                              dxyz += sop[i11][qx][qy] * wx * wx * wy * wy * wDz * wDz;
-+
-+                              // -(u_1)_{x_2} O13 (u_1)_{x_0} - (u_1)_{x_0} O31 (u_1)_{x_2}
-+                              dxyz += -(sop[i13][qx][qy] + sop[i31][qx][qy]) * wDx * wx * wy * wy * wDz * wz;
-+
-+                              // (u_1)_{x_0} O33 (u_1)_{x_0})
-+                              dxyz += sop[i33][qx][qy] * wDx * wDx * wy * wy * wz * wz;
-+                           }
-+                           else
-+                           {
-+                              // (u_2)_{x_1} (O11 (u_2)_{x_1} - O12 (u_2)_{x_0}) - (u_2)_{x_0} (O21 (u_2)_{x_1} - O22 (u_2)_{x_0})
-+
-+                              // (u_2)_{x_1} O11 (u_2)_{x_1}
-+                              dxyz += sop[i11][qx][qy] * wx * wx * wDy * wDy * wz * wz;
-+
-+                              // -(u_2)_{x_1} O12 (u_2)_{x_0} - (u_2)_{x_0} O21 (u_2)_{x_1}
-+                              dxyz += -(sop[i12][qx][qy] + sop[i21][qx][qy]) * wDx * wx * wDy * wy * wz * wz;
-+
-+                              // (u_2)_{x_0} O22 (u_2)_{x_0}
-+                              dxyz += sop[i22][qx][qy] * wDx * wDx * wy * wy * wz * wz;
-+                           }
-+                        }
-+                     }
-+                  }
-+               }
-+            }
-+
-+            MFEM_SYNC_THREAD;
-+         }  // qz loop
-+
-+         MFEM_FOREACH_THREAD(dz,z,D1Dz)
-+         {
-+            MFEM_FOREACH_THREAD(dy,y,D1Dy)
-+            {
-+               MFEM_FOREACH_THREAD(dx,x,D1Dx)
-+               {
-+                  D(dx + ((dy + (dz * D1Dy)) * D1Dx) + osc, e) += dxyz;
-+               }
-+            }
-+         }
-+
-+         osc += D1Dx * D1Dy * D1Dz;
-+      }  // c loop
-+   }); // end of element loop
-+}
-+
-+MFEM_HOST_DEVICE inline
-+void PACurlCurlApply2D(const int D1D,
-+                       const int Q1D,
-+                       const int NE,
-+                       const Array<double> &bo,
-+                       const Array<double> &bot,
-+                       const Array<double> &gc,
-+                       const Array<double> &gct,
-+                       const Vector &pa_data,
-+                       const Vector &x,
-+                       Vector &y)
-+{
-+   constexpr static int VDIM = 2;
-+   constexpr static int MAX_D1D = HCURL_MAX_D1D;
-+   constexpr static int MAX_Q1D = HCURL_MAX_Q1D;
-+
-+   auto Bo = Reshape(bo.Read(), Q1D, D1D-1);
-+   auto Bot = Reshape(bot.Read(), D1D-1, Q1D);
-+   auto Gc = Reshape(gc.Read(), Q1D, D1D);
-+   auto Gct = Reshape(gct.Read(), D1D, Q1D);
-+   auto op = Reshape(pa_data.Read(), Q1D, Q1D, NE);
-+   auto X = Reshape(x.Read(), 2*(D1D-1)*D1D, NE);
-+   auto Y = Reshape(y.ReadWrite(), 2*(D1D-1)*D1D, NE);
-+
-+   mfem::forall(NE, [=] MFEM_HOST_DEVICE (int e)
-+   {
-+      double curl[MAX_Q1D][MAX_Q1D];
-+
-+      // curl[qy][qx] will be computed as du_y/dx - du_x/dy
-+
-+      for (int qy = 0; qy < Q1D; ++qy)
-+      {
-+         for (int qx = 0; qx < Q1D; ++qx)
-+         {
-+            curl[qy][qx] = 0.0;
-+         }
-+      }
-+
-+      int osc = 0;
-+
-+      for (int c = 0; c < VDIM; ++c)  // loop over x, y components
-+      {
-+         const int D1Dy = (c == 1) ? D1D - 1 : D1D;
-+         const int D1Dx = (c == 0) ? D1D - 1 : D1D;
-+
-+         for (int dy = 0; dy < D1Dy; ++dy)
-+         {
-+            double gradX[MAX_Q1D];
-+            for (int qx = 0; qx < Q1D; ++qx)
-+            {
-+               gradX[qx] = 0;
-+            }
-+
-+            for (int dx = 0; dx < D1Dx; ++dx)
-+            {
-+               const double t = X(dx + (dy * D1Dx) + osc, e);
-+               for (int qx = 0; qx < Q1D; ++qx)
-+               {
-+                  gradX[qx] += t * ((c == 0) ? Bo(qx,dx) : Gc(qx,dx));
-+               }
-+            }
-+
-+            for (int qy = 0; qy < Q1D; ++qy)
-+            {
-+               const double wy = (c == 0) ? -Gc(qy,dy) : Bo(qy,dy);
-+               for (int qx = 0; qx < Q1D; ++qx)
-+               {
-+                  curl[qy][qx] += gradX[qx] * wy;
-+               }
-+            }
-+         }
-+
-+         osc += D1Dx * D1Dy;
-+      }  // loop (c) over components
-+
-+      // Apply D operator.
-+      for (int qy = 0; qy < Q1D; ++qy)
-+      {
-+         for (int qx = 0; qx < Q1D; ++qx)
-+         {
-+            curl[qy][qx] *= op(qx,qy,e);
-+         }
-+      }
-+
-+      for (int qy = 0; qy < Q1D; ++qy)
-+      {
-+         osc = 0;
-+
-+         for (int c = 0; c < VDIM; ++c)  // loop over x, y components
-+         {
-+            const int D1Dy = (c == 1) ? D1D - 1 : D1D;
-+            const int D1Dx = (c == 0) ? D1D - 1 : D1D;
-+
-+            double gradX[MAX_D1D];
-+            for (int dx = 0; dx < D1Dx; ++dx)
-+            {
-+               gradX[dx] = 0.0;
-+            }
-+            for (int qx = 0; qx < Q1D; ++qx)
-+            {
-+               for (int dx = 0; dx < D1Dx; ++dx)
-+               {
-+                  gradX[dx] += curl[qy][qx] * ((c == 0) ? Bot(dx,qx) : Gct(dx,qx));
-+               }
-+            }
-+            for (int dy = 0; dy < D1Dy; ++dy)
-+            {
-+               const double wy = (c == 0) ? -Gct(dy,qy) : Bot(dy,qy);
-+
-+               for (int dx = 0; dx < D1Dx; ++dx)
-+               {
-+                  Y(dx + (dy * D1Dx) + osc, e) += gradX[dx] * wy;
-+               }
-+            }
-+
-+            osc += D1Dx * D1Dy;
-+         }  // loop c
-+      }  // loop qy
-+   }); // end of element loop
-+}
-+
-+template<int MAX_D1D = HCURL_MAX_D1D, int MAX_Q1D = HCURL_MAX_Q1D>
-+MFEM_HOST_DEVICE inline
-+void PACurlCurlApply3D(const int D1D,
-+                       const int Q1D,
-+                       const bool symmetric,
-+                       const int NE,
-+                       const Array<double> &bo,
-+                       const Array<double> &bc,
-+                       const Array<double> &bot,
-+                       const Array<double> &bct,
-+                       const Array<double> &gc,
-+                       const Array<double> &gct,
-+                       const Vector &pa_data,
-+                       const Vector &x,
-+                       Vector &y)
-+{
-+   MFEM_VERIFY(D1D <= MAX_D1D, "Error: D1D > MAX_D1D");
-+   MFEM_VERIFY(Q1D <= MAX_Q1D, "Error: Q1D > MAX_Q1D");
-+   // Using (\nabla\times u) F = 1/det(dF) dF \hat{\nabla}\times\hat{u} (p. 78 of Monk),
-+   // we get:
-+   // (\nabla\times u) \cdot (\nabla\times v)
-+   //     = 1/det(dF)^2 \hat{\nabla}\times\hat{u}^T dF^T dF \hat{\nabla}\times\hat{v}
-+   // If c = 0, \hat{\nabla}\times\hat{u} reduces to [0, (u_0)_{x_2}, -(u_0)_{x_1}]
-+   // If c = 1, \hat{\nabla}\times\hat{u} reduces to [-(u_1)_{x_2}, 0, (u_1)_{x_0}]
-+   // If c = 2, \hat{\nabla}\times\hat{u} reduces to [(u_2)_{x_1}, -(u_2)_{x_0}, 0]
-+
-+   constexpr static int VDIM = 3;
-+
-+   auto Bo = Reshape(bo.Read(), Q1D, D1D-1);
-+   auto Bc = Reshape(bc.Read(), Q1D, D1D);
-+   auto Bot = Reshape(bot.Read(), D1D-1, Q1D);
-+   auto Bct = Reshape(bct.Read(), D1D, Q1D);
-+   auto Gc = Reshape(gc.Read(), Q1D, D1D);
-+   auto Gct = Reshape(gct.Read(), D1D, Q1D);
-+   auto op = Reshape(pa_data.Read(), Q1D, Q1D, Q1D, (symmetric ? 6 : 9), NE);
-+   auto X = Reshape(x.Read(), 3*(D1D-1)*D1D*D1D, NE);
-+   auto Y = Reshape(y.ReadWrite(), 3*(D1D-1)*D1D*D1D, NE);
-+
-+   mfem::forall(NE, [=] MFEM_HOST_DEVICE (int e)
-+   {
-+      double curl[MAX_Q1D][MAX_Q1D][MAX_Q1D][VDIM];
-+      // curl[qz][qy][qx] will be computed as the vector curl at each quadrature point.
-+
-+      for (int qz = 0; qz < Q1D; ++qz)
-+      {
-+         for (int qy = 0; qy < Q1D; ++qy)
-+         {
-+            for (int qx = 0; qx < Q1D; ++qx)
-+            {
-+               for (int c = 0; c < VDIM; ++c)
-+               {
-+                  curl[qz][qy][qx][c] = 0.0;
-+               }
-+            }
-+         }
-+      }
-+
-+      // We treat x, y, z components separately for optimization specific to each.
-+
-+      int osc = 0;
-+
-+      {
-+         // x component
-+         const int D1Dz = D1D;
-+         const int D1Dy = D1D;
-+         const int D1Dx = D1D - 1;
-+
-+         for (int dz = 0; dz < D1Dz; ++dz)
-+         {
-+            double gradXY[MAX_Q1D][MAX_Q1D][2];
-+            for (int qy = 0; qy < Q1D; ++qy)
-+            {
-+               for (int qx = 0; qx < Q1D; ++qx)
-+               {
-+                  for (int d = 0; d < 2; ++d)
-+                  {
-+                     gradXY[qy][qx][d] = 0.0;
-+                  }
-+               }
-+            }
-+
-+            for (int dy = 0; dy < D1Dy; ++dy)
-+            {
-+               double massX[MAX_Q1D];
-+               for (int qx = 0; qx < Q1D; ++qx)
-+               {
-+                  massX[qx] = 0.0;
-+               }
-+
-+               for (int dx = 0; dx < D1Dx; ++dx)
-+               {
-+                  const double t = X(dx + ((dy + (dz * D1Dy)) * D1Dx) + osc, e);
-+                  for (int qx = 0; qx < Q1D; ++qx)
-+                  {
-+                     massX[qx] += t * Bo(qx,dx);
-+                  }
-+               }
-+
-+               for (int qy = 0; qy < Q1D; ++qy)
-+               {
-+                  const double wy = Bc(qy,dy);
-+                  const double wDy = Gc(qy,dy);
-+                  for (int qx = 0; qx < Q1D; ++qx)
-+                  {
-+                     const double wx = massX[qx];
-+                     gradXY[qy][qx][0] += wx * wDy;
-+                     gradXY[qy][qx][1] += wx * wy;
-+                  }
-+               }
-+            }
-+
-+            for (int qz = 0; qz < Q1D; ++qz)
-+            {
-+               const double wz = Bc(qz,dz);
-+               const double wDz = Gc(qz,dz);
-+               for (int qy = 0; qy < Q1D; ++qy)
-+               {
-+                  for (int qx = 0; qx < Q1D; ++qx)
-+                  {
-+                     // \hat{\nabla}\times\hat{u} is [0, (u_0)_{x_2}, -(u_0)_{x_1}]
-+                     curl[qz][qy][qx][1] += gradXY[qy][qx][1] * wDz; // (u_0)_{x_2}
-+                     curl[qz][qy][qx][2] -= gradXY[qy][qx][0] * wz;  // -(u_0)_{x_1}
-+                  }
-+               }
-+            }
-+         }
-+
-+         osc += D1Dx * D1Dy * D1Dz;
-+      }
-+
-+      {
-+         // y component
-+         const int D1Dz = D1D;
-+         const int D1Dy = D1D - 1;
-+         const int D1Dx = D1D;
-+
-+         for (int dz = 0; dz < D1Dz; ++dz)
-+         {
-+            double gradXY[MAX_Q1D][MAX_Q1D][2];
-+            for (int qy = 0; qy < Q1D; ++qy)
-+            {
-+               for (int qx = 0; qx < Q1D; ++qx)
-+               {
-+                  for (int d = 0; d < 2; ++d)
-+                  {
-+                     gradXY[qy][qx][d] = 0.0;
-+                  }
-+               }
-+            }
-+
-+            for (int dx = 0; dx < D1Dx; ++dx)
-+            {
-+               double massY[MAX_Q1D];
-+               for (int qy = 0; qy < Q1D; ++qy)
-+               {
-+                  massY[qy] = 0.0;
-+               }
-+
-+               for (int dy = 0; dy < D1Dy; ++dy)
-+               {
-+                  const double t = X(dx + ((dy + (dz * D1Dy)) * D1Dx) + osc, e);
-+                  for (int qy = 0; qy < Q1D; ++qy)
-+                  {
-+                     massY[qy] += t * Bo(qy,dy);
-+                  }
-+               }
-+
-+               for (int qx = 0; qx < Q1D; ++qx)
-+               {
-+                  const double wx = Bc(qx,dx);
-+                  const double wDx = Gc(qx,dx);
-+                  for (int qy = 0; qy < Q1D; ++qy)
-+                  {
-+                     const double wy = massY[qy];
-+                     gradXY[qy][qx][0] += wDx * wy;
-+                     gradXY[qy][qx][1] += wx * wy;
-+                  }
-+               }
-+            }
-+
-+            for (int qz = 0; qz < Q1D; ++qz)
-+            {
-+               const double wz = Bc(qz,dz);
-+               const double wDz = Gc(qz,dz);
-+               for (int qy = 0; qy < Q1D; ++qy)
-+               {
-+                  for (int qx = 0; qx < Q1D; ++qx)
-+                  {
-+                     // \hat{\nabla}\times\hat{u} is [-(u_1)_{x_2}, 0, (u_1)_{x_0}]
-+                     curl[qz][qy][qx][0] -= gradXY[qy][qx][1] * wDz; // -(u_1)_{x_2}
-+                     curl[qz][qy][qx][2] += gradXY[qy][qx][0] * wz;  // (u_1)_{x_0}
-+                  }
-+               }
-+            }
-+         }
-+
-+         osc += D1Dx * D1Dy * D1Dz;
-+      }
-+
-+      {
-+         // z component
-+         const int D1Dz = D1D - 1;
-+         const int D1Dy = D1D;
-+         const int D1Dx = D1D;
-+
-+         for (int dx = 0; dx < D1Dx; ++dx)
-+         {
-+            double gradYZ[MAX_Q1D][MAX_Q1D][2];
-+            for (int qz = 0; qz < Q1D; ++qz)
-+            {
-+               for (int qy = 0; qy < Q1D; ++qy)
-+               {
-+                  for (int d = 0; d < 2; ++d)
-+                  {
-+                     gradYZ[qz][qy][d] = 0.0;
-+                  }
-+               }
-+            }
-+
-+            for (int dy = 0; dy < D1Dy; ++dy)
-+            {
-+               double massZ[MAX_Q1D];
-+               for (int qz = 0; qz < Q1D; ++qz)
-+               {
-+                  massZ[qz] = 0.0;
-+               }
-+
-+               for (int dz = 0; dz < D1Dz; ++dz)
-+               {
-+                  const double t = X(dx + ((dy + (dz * D1Dy)) * D1Dx) + osc, e);
-+                  for (int qz = 0; qz < Q1D; ++qz)
-+                  {
-+                     massZ[qz] += t * Bo(qz,dz);
-+                  }
-+               }
-+
-+               for (int qy = 0; qy < Q1D; ++qy)
-+               {
-+                  const double wy = Bc(qy,dy);
-+                  const double wDy = Gc(qy,dy);
-+                  for (int qz = 0; qz < Q1D; ++qz)
-+                  {
-+                     const double wz = massZ[qz];
-+                     gradYZ[qz][qy][0] += wz * wy;
-+                     gradYZ[qz][qy][1] += wz * wDy;
-+                  }
-+               }
-+            }
-+
-+            for (int qx = 0; qx < Q1D; ++qx)
-+            {
-+               const double wx = Bc(qx,dx);
-+               const double wDx = Gc(qx,dx);
-+
-+               for (int qy = 0; qy < Q1D; ++qy)
-+               {
-+                  for (int qz = 0; qz < Q1D; ++qz)
-+                  {
-+                     // \hat{\nabla}\times\hat{u} is [(u_2)_{x_1}, -(u_2)_{x_0}, 0]
-+                     curl[qz][qy][qx][0] += gradYZ[qz][qy][1] * wx;  // (u_2)_{x_1}
-+                     curl[qz][qy][qx][1] -= gradYZ[qz][qy][0] * wDx; // -(u_2)_{x_0}
-+                  }
-+               }
-+            }
-+         }
-+      }
-+
-+      // Apply D operator.
-+      for (int qz = 0; qz < Q1D; ++qz)
-+      {
-+         for (int qy = 0; qy < Q1D; ++qy)
-+         {
-+            for (int qx = 0; qx < Q1D; ++qx)
-+            {
-+               const double O11 = op(qx,qy,qz,0,e);
-+               const double O12 = op(qx,qy,qz,1,e);
-+               const double O13 = op(qx,qy,qz,2,e);
-+               const double O21 = symmetric ? O12 : op(qx,qy,qz,3,e);
-+               const double O22 = symmetric ? op(qx,qy,qz,3,e) : op(qx,qy,qz,4,e);
-+               const double O23 = symmetric ? op(qx,qy,qz,4,e) : op(qx,qy,qz,5,e);
-+               const double O31 = symmetric ? O13 : op(qx,qy,qz,6,e);
-+               const double O32 = symmetric ? O23 : op(qx,qy,qz,7,e);
-+               const double O33 = symmetric ? op(qx,qy,qz,5,e) : op(qx,qy,qz,8,e);
-+
-+               const double c1 = (O11 * curl[qz][qy][qx][0]) + (O12 * curl[qz][qy][qx][1]) +
-+                                 (O13 * curl[qz][qy][qx][2]);
-+               const double c2 = (O21 * curl[qz][qy][qx][0]) + (O22 * curl[qz][qy][qx][1]) +
-+                                 (O23 * curl[qz][qy][qx][2]);
-+               const double c3 = (O31 * curl[qz][qy][qx][0]) + (O32 * curl[qz][qy][qx][1]) +
-+                                 (O33 * curl[qz][qy][qx][2]);
-+
-+               curl[qz][qy][qx][0] = c1;
-+               curl[qz][qy][qx][1] = c2;
-+               curl[qz][qy][qx][2] = c3;
-+            }
-+         }
-+      }
-+
-+      // x component
-+      osc = 0;
-+      {
-+         const int D1Dz = D1D;
-+         const int D1Dy = D1D;
-+         const int D1Dx = D1D - 1;
-+
-+         for (int qz = 0; qz < Q1D; ++qz)
-+         {
-+            double gradXY12[MAX_D1D][MAX_D1D];
-+            double gradXY21[MAX_D1D][MAX_D1D];
-+
-+            for (int dy = 0; dy < D1Dy; ++dy)
-+            {
-+               for (int dx = 0; dx < D1Dx; ++dx)
-+               {
-+                  gradXY12[dy][dx] = 0.0;
-+                  gradXY21[dy][dx] = 0.0;
-+               }
-+            }
-+            for (int qy = 0; qy < Q1D; ++qy)
-+            {
-+               double massX[MAX_D1D][2];
-+               for (int dx = 0; dx < D1Dx; ++dx)
-+               {
-+                  for (int n = 0; n < 2; ++n)
-+                  {
-+                     massX[dx][n] = 0.0;
-+                  }
-+               }
-+               for (int qx = 0; qx < Q1D; ++qx)
-+               {
-+                  for (int dx = 0; dx < D1Dx; ++dx)
-+                  {
-+                     const double wx = Bot(dx,qx);
-+
-+                     massX[dx][0] += wx * curl[qz][qy][qx][1];
-+                     massX[dx][1] += wx * curl[qz][qy][qx][2];
-+                  }
-+               }
-+               for (int dy = 0; dy < D1Dy; ++dy)
-+               {
-+                  const double wy = Bct(dy,qy);
-+                  const double wDy = Gct(dy,qy);
-+
-+                  for (int dx = 0; dx < D1Dx; ++dx)
-+                  {
-+                     gradXY21[dy][dx] += massX[dx][0] * wy;
-+                     gradXY12[dy][dx] += massX[dx][1] * wDy;
-+                  }
-+               }
-+            }
-+
-+            for (int dz = 0; dz < D1Dz; ++dz)
-+            {
-+               const double wz = Bct(dz,qz);
-+               const double wDz = Gct(dz,qz);
-+               for (int dy = 0; dy < D1Dy; ++dy)
-+               {
-+                  for (int dx = 0; dx < D1Dx; ++dx)
-+                  {
-+                     // \hat{\nabla}\times\hat{u} is [0, (u_0)_{x_2}, -(u_0)_{x_1}]
-+                     // (u_0)_{x_2} * (op * curl)_1 - (u_0)_{x_1} * (op * curl)_2
-+                     Y(dx + ((dy + (dz * D1Dy)) * D1Dx) + osc,
-+                       e) += (gradXY21[dy][dx] * wDz) - (gradXY12[dy][dx] * wz);
-+                  }
-+               }
-+            }
-+         }  // loop qz
-+
-+         osc += D1Dx * D1Dy * D1Dz;
-+      }
-+
-+      // y component
-+      {
-+         const int D1Dz = D1D;
-+         const int D1Dy = D1D - 1;
-+         const int D1Dx = D1D;
-+
-+         for (int qz = 0; qz < Q1D; ++qz)
-+         {
-+            double gradXY02[MAX_D1D][MAX_D1D];
-+            double gradXY20[MAX_D1D][MAX_D1D];
-+
-+            for (int dy = 0; dy < D1Dy; ++dy)
-+            {
-+               for (int dx = 0; dx < D1Dx; ++dx)
-+               {
-+                  gradXY02[dy][dx] = 0.0;
-+                  gradXY20[dy][dx] = 0.0;
-+               }
-+            }
-+            for (int qx = 0; qx < Q1D; ++qx)
-+            {
-+               double massY[MAX_D1D][2];
-+               for (int dy = 0; dy < D1Dy; ++dy)
-+               {
-+                  massY[dy][0] = 0.0;
-+                  massY[dy][1] = 0.0;
-+               }
-+               for (int qy = 0; qy < Q1D; ++qy)
-+               {
-+                  for (int dy = 0; dy < D1Dy; ++dy)
-+                  {
-+                     const double wy = Bot(dy,qy);
-+
-+                     massY[dy][0] += wy * curl[qz][qy][qx][2];
-+                     massY[dy][1] += wy * curl[qz][qy][qx][0];
-+                  }
-+               }
-+               for (int dx = 0; dx < D1Dx; ++dx)
-+               {
-+                  const double wx = Bct(dx,qx);
-+                  const double wDx = Gct(dx,qx);
-+
-+                  for (int dy = 0; dy < D1Dy; ++dy)
-+                  {
-+                     gradXY02[dy][dx] += massY[dy][0] * wDx;
-+                     gradXY20[dy][dx] += massY[dy][1] * wx;
-+                  }
-+               }
-+            }
-+
-+            for (int dz = 0; dz < D1Dz; ++dz)
-+            {
-+               const double wz = Bct(dz,qz);
-+               const double wDz = Gct(dz,qz);
-+               for (int dy = 0; dy < D1Dy; ++dy)
-+               {
-+                  for (int dx = 0; dx < D1Dx; ++dx)
-+                  {
-+                     // \hat{\nabla}\times\hat{u} is [-(u_1)_{x_2}, 0, (u_1)_{x_0}]
-+                     // -(u_1)_{x_2} * (op * curl)_0 + (u_1)_{x_0} * (op * curl)_2
-+                     Y(dx + ((dy + (dz * D1Dy)) * D1Dx) + osc,
-+                       e) += (-gradXY20[dy][dx] * wDz) + (gradXY02[dy][dx] * wz);
-+                  }
-+               }
-+            }
-+         }  // loop qz
-+
-+         osc += D1Dx * D1Dy * D1Dz;
-+      }
-+
-+      // z component
-+      {
-+         const int D1Dz = D1D - 1;
-+         const int D1Dy = D1D;
-+         const int D1Dx = D1D;
-+
-+         for (int qx = 0; qx < Q1D; ++qx)
-+         {
-+            double gradYZ01[MAX_D1D][MAX_D1D];
-+            double gradYZ10[MAX_D1D][MAX_D1D];
-+
-+            for (int dy = 0; dy < D1Dy; ++dy)
-+            {
-+               for (int dz = 0; dz < D1Dz; ++dz)
-+               {
-+                  gradYZ01[dz][dy] = 0.0;
-+                  gradYZ10[dz][dy] = 0.0;
-+               }
-+            }
-+            for (int qy = 0; qy < Q1D; ++qy)
-+            {
-+               double massZ[MAX_D1D][2];
-+               for (int dz = 0; dz < D1Dz; ++dz)
-+               {
-+                  for (int n = 0; n < 2; ++n)
-+                  {
-+                     massZ[dz][n] = 0.0;
-+                  }
-+               }
-+               for (int qz = 0; qz < Q1D; ++qz)
-+               {
-+                  for (int dz = 0; dz < D1Dz; ++dz)
-+                  {
-+                     const double wz = Bot(dz,qz);
-+
-+                     massZ[dz][0] += wz * curl[qz][qy][qx][0];
-+                     massZ[dz][1] += wz * curl[qz][qy][qx][1];
-+                  }
-+               }
-+               for (int dy = 0; dy < D1Dy; ++dy)
-+               {
-+                  const double wy = Bct(dy,qy);
-+                  const double wDy = Gct(dy,qy);
-+
-+                  for (int dz = 0; dz < D1Dz; ++dz)
-+                  {
-+                     gradYZ01[dz][dy] += wy * massZ[dz][1];
-+                     gradYZ10[dz][dy] += wDy * massZ[dz][0];
-+                  }
-+               }
-+            }
-+
-+            for (int dx = 0; dx < D1Dx; ++dx)
-+            {
-+               const double wx = Bct(dx,qx);
-+               const double wDx = Gct(dx,qx);
-+
-+               for (int dy = 0; dy < D1Dy; ++dy)
-+               {
-+                  for (int dz = 0; dz < D1Dz; ++dz)
-+                  {
-+                     // \hat{\nabla}\times\hat{u} is [(u_2)_{x_1}, -(u_2)_{x_0}, 0]
-+                     // (u_2)_{x_1} * (op * curl)_0 - (u_2)_{x_0} * (op * curl)_1
-+                     Y(dx + ((dy + (dz * D1Dy)) * D1Dx) + osc,
-+                       e) += (gradYZ10[dz][dy] * wx) - (gradYZ01[dz][dy] * wDx);
-+                  }
-+               }
-+            }
-+         }  // loop qx
-+      }
-+   }); // end of element loop
-+}
-+
-+template<int MAX_D1D = HCURL_MAX_D1D, int MAX_Q1D = HCURL_MAX_Q1D>
-+MFEM_HOST_DEVICE inline
-+void SmemPACurlCurlApply3D(const int D1D,
-+                           const int Q1D,
-+                           const bool symmetric,
-+                           const int NE,
-+                           const Array<double> &bo,
-+                           const Array<double> &bc,
-+                           const Array<double> &bot,
-+                           const Array<double> &bct,
-+                           const Array<double> &gc,
-+                           const Array<double> &gct,
-+                           const Vector &pa_data,
-+                           const Vector &x,
-+                           Vector &y)
-+{
-+   MFEM_VERIFY(D1D <= MAX_D1D, "Error: D1D > MAX_D1D");
-+   MFEM_VERIFY(Q1D <= MAX_Q1D, "Error: Q1D > MAX_Q1D");
-+   // Using (\nabla\times u) F = 1/det(dF) dF \hat{\nabla}\times\hat{u} (p. 78 of Monk), we get
-+   // (\nabla\times u) \cdot (\nabla\times v) = 1/det(dF)^2 \hat{\nabla}\times\hat{u}^T dF^T dF \hat{\nabla}\times\hat{v}
-+   // If c = 0, \hat{\nabla}\times\hat{u} reduces to [0, (u_0)_{x_2}, -(u_0)_{x_1}]
-+   // If c = 1, \hat{\nabla}\times\hat{u} reduces to [-(u_1)_{x_2}, 0, (u_1)_{x_0}]
-+   // If c = 2, \hat{\nabla}\times\hat{u} reduces to [(u_2)_{x_1}, -(u_2)_{x_0}, 0]
-+
-+   auto Bo = Reshape(bo.Read(), Q1D, D1D-1);
-+   auto Bc = Reshape(bc.Read(), Q1D, D1D);
-+   auto Gc = Reshape(gc.Read(), Q1D, D1D);
-+   auto op = Reshape(pa_data.Read(), Q1D, Q1D, Q1D, symmetric ? 6 : 9, NE);
-+   auto X = Reshape(x.Read(), 3*(D1D-1)*D1D*D1D, NE);
-+   auto Y = Reshape(y.ReadWrite(), 3*(D1D-1)*D1D*D1D, NE);
-+
-+   const int s = symmetric ? 6 : 9;
-+
-+   auto device_kernel = [=] MFEM_DEVICE (int e)
-+   {
-+      constexpr int VDIM = 3;
-+
-+      MFEM_SHARED double sBo[MAX_D1D][MAX_Q1D];
-+      MFEM_SHARED double sBc[MAX_D1D][MAX_Q1D];
-+      MFEM_SHARED double sGc[MAX_D1D][MAX_Q1D];
-+
-+      double ope[9];
-+      MFEM_SHARED double sop[9][MAX_Q1D][MAX_Q1D];
-+      MFEM_SHARED double curl[MAX_Q1D][MAX_Q1D][3];
-+
-+      MFEM_SHARED double sX[MAX_D1D][MAX_D1D][MAX_D1D];
-+
-+      MFEM_FOREACH_THREAD(qx,x,Q1D)
-+      {
-+         MFEM_FOREACH_THREAD(qy,y,Q1D)
-+         {
-+            MFEM_FOREACH_THREAD(qz,z,Q1D)
-+            {
-+               for (int i=0; i<s; ++i)
-+               {
-+                  ope[i] = op(qx,qy,qz,i,e);
-+               }
-+            }
-+         }
-+      }
-+
-+      const int tidx = MFEM_THREAD_ID(x);
-+      const int tidy = MFEM_THREAD_ID(y);
-+      const int tidz = MFEM_THREAD_ID(z);
-+
-+      if (tidz == 0)
-+      {
-+         MFEM_FOREACH_THREAD(d,y,D1D)
-+         {
-+            MFEM_FOREACH_THREAD(q,x,Q1D)
-+            {
-+               sBc[d][q] = Bc(q,d);
-+               sGc[d][q] = Gc(q,d);
-+               if (d < D1D-1)
-+               {
-+                  sBo[d][q] = Bo(q,d);
-+               }
-+            }
-+         }
-+      }
-+      MFEM_SYNC_THREAD;
-+
-+      for (int qz=0; qz < Q1D; ++qz)
-+      {
-+         if (tidz == qz)
-+         {
-+            MFEM_FOREACH_THREAD(qy,y,Q1D)
-+            {
-+               MFEM_FOREACH_THREAD(qx,x,Q1D)
-+               {
-+                  for (int i=0; i<3; ++i)
-+                  {
-+                     curl[qy][qx][i] = 0.0;
-+                  }
-+               }
-+            }
-+         }
-+
-+         int osc = 0;
-+         for (int c = 0; c < VDIM; ++c)  // loop over x, y, z components
-+         {
-+            const int D1Dz = (c == 2) ? D1D - 1 : D1D;
-+            const int D1Dy = (c == 1) ? D1D - 1 : D1D;
-+            const int D1Dx = (c == 0) ? D1D - 1 : D1D;
-+
-+            MFEM_FOREACH_THREAD(dz,z,D1Dz)
-+            {
-+               MFEM_FOREACH_THREAD(dy,y,D1Dy)
-+               {
-+                  MFEM_FOREACH_THREAD(dx,x,D1Dx)
-+                  {
-+                     sX[dz][dy][dx] = X(dx + ((dy + (dz * D1Dy)) * D1Dx) + osc, e);
-+                  }
-+               }
-+            }
-+            MFEM_SYNC_THREAD;
-+
-+            if (tidz == qz)
-+            {
-+               if (c == 0)
-+               {
-+                  for (int i=0; i<s; ++i)
-+                  {
-+                     sop[i][tidx][tidy] = ope[i];
-+                  }
-+               }
-+
-+               MFEM_FOREACH_THREAD(qy,y,Q1D)
-+               {
-+                  MFEM_FOREACH_THREAD(qx,x,Q1D)
-+                  {
-+                     double u = 0.0;
-+                     double v = 0.0;
-+
-+                     // We treat x, y, z components separately for optimization specific to each.
-+                     if (c == 0) // x component
-+                     {
-+                        // \hat{\nabla}\times\hat{u} is [0, (u_0)_{x_2}, -(u_0)_{x_1}]
-+
-+                        for (int dz = 0; dz < D1Dz; ++dz)
-+                        {
-+                           const double wz = sBc[dz][qz];
-+                           const double wDz = sGc[dz][qz];
-+
-+                           for (int dy = 0; dy < D1Dy; ++dy)
-+                           {
-+                              const double wy = sBc[dy][qy];
-+                              const double wDy = sGc[dy][qy];
-+
-+                              for (int dx = 0; dx < D1Dx; ++dx)
-+                              {
-+                                 const double wx = sX[dz][dy][dx] * sBo[dx][qx];
-+                                 u += wx * wDy * wz;
-+                                 v += wx * wy * wDz;
-+                              }
-+                           }
-+                        }
-+
-+                        curl[qy][qx][1] += v; // (u_0)_{x_2}
-+                        curl[qy][qx][2] -= u;  // -(u_0)_{x_1}
-+                     }
-+                     else if (c == 1)  // y component
-+                     {
-+                        // \hat{\nabla}\times\hat{u} is [-(u_1)_{x_2}, 0, (u_1)_{x_0}]
-+
-+                        for (int dz = 0; dz < D1Dz; ++dz)
-+                        {
-+                           const double wz = sBc[dz][qz];
-+                           const double wDz = sGc[dz][qz];
-+
-+                           for (int dy = 0; dy < D1Dy; ++dy)
-+                           {
-+                              const double wy = sBo[dy][qy];
-+
-+                              for (int dx = 0; dx < D1Dx; ++dx)
-+                              {
-+                                 const double t = sX[dz][dy][dx];
-+                                 const double wx = t * sBc[dx][qx];
-+                                 const double wDx = t * sGc[dx][qx];
-+
-+                                 u += wDx * wy * wz;
-+                                 v += wx * wy * wDz;
-+                              }
-+                           }
-+                        }
-+
-+                        curl[qy][qx][0] -= v; // -(u_1)_{x_2}
-+                        curl[qy][qx][2] += u; // (u_1)_{x_0}
-+                     }
-+                     else // z component
-+                     {
-+                        // \hat{\nabla}\times\hat{u} is [(u_2)_{x_1}, -(u_2)_{x_0}, 0]
-+
-+                        for (int dz = 0; dz < D1Dz; ++dz)
-+                        {
-+                           const double wz = sBo[dz][qz];
-+
-+                           for (int dy = 0; dy < D1Dy; ++dy)
-+                           {
-+                              const double wy = sBc[dy][qy];
-+                              const double wDy = sGc[dy][qy];
-+
-+                              for (int dx = 0; dx < D1Dx; ++dx)
-+                              {
-+                                 const double t = sX[dz][dy][dx];
-+                                 const double wx = t * sBc[dx][qx];
-+                                 const double wDx = t * sGc[dx][qx];
-+
-+                                 u += wDx * wy * wz;
-+                                 v += wx * wDy * wz;
-+                              }
-+                           }
-+                        }
-+
-+                        curl[qy][qx][0] += v; // (u_2)_{x_1}
-+                        curl[qy][qx][1] -= u; // -(u_2)_{x_0}
-+                     }
-+                  } // qx
-+               } // qy
-+            } // tidz == qz
-+
-+            osc += D1Dx * D1Dy * D1Dz;
-+            MFEM_SYNC_THREAD;
-+         } // c
-+
-+         double dxyz1 = 0.0;
-+         double dxyz2 = 0.0;
-+         double dxyz3 = 0.0;
-+
-+         MFEM_FOREACH_THREAD(dz,z,D1D)
-+         {
-+            const double wcz = sBc[dz][qz];
-+            const double wcDz = sGc[dz][qz];
-+            const double wz = (dz < D1D-1) ? sBo[dz][qz] : 0.0;
-+
-+            MFEM_FOREACH_THREAD(dy,y,D1D)
-+            {
-+               MFEM_FOREACH_THREAD(dx,x,D1D)
-+               {
-+                  for (int qy = 0; qy < Q1D; ++qy)
-+                  {
-+                     const double wcy = sBc[dy][qy];
-+                     const double wcDy = sGc[dy][qy];
-+                     const double wy = (dy < D1D-1) ? sBo[dy][qy] : 0.0;
-+
-+                     for (int qx = 0; qx < Q1D; ++qx)
-+                     {
-+                        const double O11 = sop[0][qx][qy];
-+                        const double O12 = sop[1][qx][qy];
-+                        const double O13 = sop[2][qx][qy];
-+                        const double O21 = symmetric ? O12 : sop[3][qx][qy];
-+                        const double O22 = symmetric ? sop[3][qx][qy] : sop[4][qx][qy];
-+                        const double O23 = symmetric ? sop[4][qx][qy] : sop[5][qx][qy];
-+                        const double O31 = symmetric ? O13 : sop[6][qx][qy];
-+                        const double O32 = symmetric ? O23 : sop[7][qx][qy];
-+                        const double O33 = symmetric ? sop[5][qx][qy] : sop[8][qx][qy];
-+
-+                        const double c1 = (O11 * curl[qy][qx][0]) + (O12 * curl[qy][qx][1]) +
-+                                          (O13 * curl[qy][qx][2]);
-+                        const double c2 = (O21 * curl[qy][qx][0]) + (O22 * curl[qy][qx][1]) +
-+                                          (O23 * curl[qy][qx][2]);
-+                        const double c3 = (O31 * curl[qy][qx][0]) + (O32 * curl[qy][qx][1]) +
-+                                          (O33 * curl[qy][qx][2]);
-+
-+                        const double wcx = sBc[dx][qx];
-+                        const double wDx = sGc[dx][qx];
-+
-+                        if (dx < D1D-1)
-+                        {
-+                           // \hat{\nabla}\times\hat{u} is [0, (u_0)_{x_2}, -(u_0)_{x_1}]
-+                           // (u_0)_{x_2} * (op * curl)_1 - (u_0)_{x_1} * (op * curl)_2
-+                           const double wx = sBo[dx][qx];
-+                           dxyz1 += (wx * c2 * wcy * wcDz) - (wx * c3 * wcDy * wcz);
-+                        }
-+
-+                        // \hat{\nabla}\times\hat{u} is [-(u_1)_{x_2}, 0, (u_1)_{x_0}]
-+                        // -(u_1)_{x_2} * (op * curl)_0 + (u_1)_{x_0} * (op * curl)_2
-+                        dxyz2 += (-wy * c1 * wcx * wcDz) + (wy * c3 * wDx * wcz);
-+
-+                        // \hat{\nabla}\times\hat{u} is [(u_2)_{x_1}, -(u_2)_{x_0}, 0]
-+                        // (u_2)_{x_1} * (op * curl)_0 - (u_2)_{x_0} * (op * curl)_1
-+                        dxyz3 += (wcDy * wz * c1 * wcx) - (wcy * wz * c2 * wDx);
-+                     } // qx
-+                  } // qy
-+               } // dx
-+            } // dy
-+         } // dz
-+
-+         MFEM_SYNC_THREAD;
-+
-+         MFEM_FOREACH_THREAD(dz,z,D1D)
-+         {
-+            MFEM_FOREACH_THREAD(dy,y,D1D)
-+            {
-+               MFEM_FOREACH_THREAD(dx,x,D1D)
-+               {
-+                  if (dx < D1D-1)
-+                  {
-+                     Y(dx + ((dy + (dz * D1D)) * (D1D-1)), e) += dxyz1;
-+                  }
-+                  if (dy < D1D-1)
-+                  {
-+                     Y(dx + ((dy + (dz * (D1D-1))) * D1D) + ((D1D-1)*D1D*D1D), e) += dxyz2;
-+                  }
-+                  if (dz < D1D-1)
-+                  {
-+                     Y(dx + ((dy + (dz * D1D)) * D1D) + (2*(D1D-1)*D1D*D1D), e) += dxyz3;
-+                  }
-+               }
-+            }
-+         }
-+      } // qz
-+   }; // end of element loop
-+
-+   auto host_kernel = [&] MFEM_LAMBDA (int)
-+   {
-+      MFEM_ABORT_KERNEL("This kernel should only be used on GPU.");
-+   };
-+
-+   ForallWrap<3>(true, NE, device_kernel, host_kernel, Q1D, Q1D, Q1D);
-+}
-+
-+// PA H(curl)-L2 assemble 2D kernel
-+MFEM_HOST_DEVICE inline
-+void PAHcurlL2Setup2D(const int Q1D,
-+                      const int NE,
-+                      const Array<double> &w,
-+                      Vector &coeff,
-+                      Vector &op)
-+{
-+   const int NQ = Q1D*Q1D;
-+   auto W = w.Read();
-+   auto C = Reshape(coeff.Read(), NQ, NE);
-+   auto y = Reshape(op.Write(), NQ, NE);
-+   mfem::forall(NE, [=] MFEM_HOST_DEVICE (int e)
-+   {
-+      for (int q = 0; q < NQ; ++q)
-+      {
-+         y(q,e) = W[q] * C(q,e);
-+      }
-+   });
-+}
-+
-+// PA H(curl) Mass Assemble 3D kernel
-+MFEM_HOST_DEVICE inline
-+void PAHcurlL2Setup3D(const int NQ,
-+                      const int coeffDim,
-+                      const int NE,
-+                      const Array<double> &w,
-+                      Vector &coeff,
-+                      Vector &op)
-+{
-+   auto W = w.Read();
-+   auto C = Reshape(coeff.Read(), coeffDim, NQ, NE);
-+   auto y = Reshape(op.Write(), coeffDim, NQ, NE);
-+
-+   mfem::forall(NE, [=] MFEM_HOST_DEVICE (int e)
-+   {
-+      for (int q = 0; q < NQ; ++q)
-+      {
-+         for (int c=0; c<coeffDim; ++c)
-+         {
-+            y(c,q,e) = W[q] * C(c,q,e);
-+         }
-+      }
-+   });
-+}
-+
-+MFEM_HOST_DEVICE inline
-+void PAHcurlL2Apply2D(const int D1D,
-+                      const int D1Dtest,
-+                      const int Q1D,
-+                      const int NE,
-+                      const Array<double> &bo,
-+                      const Array<double> &bot,
-+                      const Array<double> &bt,
-+                      const Array<double> &gc,
-+                      const Vector &pa_data,
-+                      const Vector &x, // trial = H(curl)
-+                      Vector &y)  // test = L2 or H1
-+{
-+   constexpr static int VDIM = 2;
-+   constexpr static int MAX_D1D = HCURL_MAX_D1D;
-+   constexpr static int MAX_Q1D = HCURL_MAX_Q1D;
-+   const int H1 = (D1Dtest == D1D);
-+
-+   MFEM_VERIFY(y.Size() == NE*D1Dtest*D1Dtest, "Test vector of wrong dimension");
-+
-+   auto Bo = Reshape(bo.Read(), Q1D, D1D-1);
-+   auto Bot = Reshape(bot.Read(), D1D-1, Q1D);
-+   auto Bt = Reshape(bt.Read(), D1D, Q1D);
-+   auto Gc = Reshape(gc.Read(), Q1D, D1D);
-+   auto op = Reshape(pa_data.Read(), Q1D, Q1D, NE);
-+   auto X = Reshape(x.Read(), 2*(D1D-1)*D1D, NE);
-+   auto Y = Reshape(y.ReadWrite(), D1Dtest, D1Dtest, NE);
-+
-+   mfem::forall(NE, [=] MFEM_HOST_DEVICE (int e)
-+   {
-+      double curl[MAX_Q1D][MAX_Q1D];
-+
-+      // curl[qy][qx] will be computed as du_y/dx - du_x/dy
-+
-+      for (int qy = 0; qy < Q1D; ++qy)
-+      {
-+         for (int qx = 0; qx < Q1D; ++qx)
-+         {
-+            curl[qy][qx] = 0.0;
-+         }
-+      }
-+
-+      int osc = 0;
-+
-+      for (int c = 0; c < VDIM; ++c)  // loop over x, y components
-+      {
-+         const int D1Dy = (c == 1) ? D1D - 1 : D1D;
-+         const int D1Dx = (c == 0) ? D1D - 1 : D1D;
-+
-+         for (int dy = 0; dy < D1Dy; ++dy)
-+         {
-+            double gradX[MAX_Q1D];
-+            for (int qx = 0; qx < Q1D; ++qx)
-+            {
-+               gradX[qx] = 0;
-+            }
-+
-+            for (int dx = 0; dx < D1Dx; ++dx)
-+            {
-+               const double t = X(dx + (dy * D1Dx) + osc, e);
-+               for (int qx = 0; qx < Q1D; ++qx)
-+               {
-+                  gradX[qx] += t * ((c == 0) ? Bo(qx,dx) : Gc(qx,dx));
-+               }
-+            }
-+
-+            for (int qy = 0; qy < Q1D; ++qy)
-+            {
-+               const double wy = (c == 0) ? -Gc(qy,dy) : Bo(qy,dy);
-+               for (int qx = 0; qx < Q1D; ++qx)
-+               {
-+                  curl[qy][qx] += gradX[qx] * wy;
-+               }
-+            }
-+         }
-+
-+         osc += D1Dx * D1Dy;
-+      }  // loop (c) over components
-+
-+      // Apply D operator.
-+      for (int qy = 0; qy < Q1D; ++qy)
-+      {
-+         for (int qx = 0; qx < Q1D; ++qx)
-+         {
-+            curl[qy][qx] *= op(qx,qy,e);
-+         }
-+      }
-+
-+      for (int qy = 0; qy < Q1D; ++qy)
-+      {
-+         double sol_x[MAX_D1D];
-+         for (int dx = 0; dx < D1Dtest; ++dx)
-+         {
-+            sol_x[dx] = 0.0;
-+         }
-+         for (int qx = 0; qx < Q1D; ++qx)
-+         {
-+            const double s = curl[qy][qx];
-+            for (int dx = 0; dx < D1Dtest; ++dx)
-+            {
-+               sol_x[dx] += s * ((H1 == 1) ? Bt(dx,qx) : Bot(dx,qx));
-+            }
-+         }
-+         for (int dy = 0; dy < D1Dtest; ++dy)
-+         {
-+            const double wy = (H1 == 1) ? Bt(dy,qy) : Bot(dy,qy);
-+
-+            for (int dx = 0; dx < D1Dtest; ++dx)
-+            {
-+               Y(dx,dy,e) += sol_x[dx] * wy;
-+            }
-+         }
-+      }  // loop qy
-+   }); // end of element loop
-+}
-+
-+MFEM_HOST_DEVICE inline
-+void PAHcurlL2ApplyTranspose2D(const int D1D,
-+                               const int D1Dtest,
-+                               const int Q1D,
-+                               const int NE,
-+                               const Array<double> &bo,
-+                               const Array<double> &bot,
-+                               const Array<double> &b,
-+                               const Array<double> &gct,
-+                               const Vector &pa_data,
-+                               const Vector &x, // trial = H(curl)
-+                               Vector &y)  // test = L2 or H1
-+{
-+   constexpr static int VDIM = 2;
-+   constexpr static int MAX_D1D = HCURL_MAX_D1D;
-+   constexpr static int MAX_Q1D = HCURL_MAX_Q1D;
-+   const int H1 = (D1Dtest == D1D);
-+
-+   MFEM_VERIFY(x.Size() == NE*D1Dtest*D1Dtest, "Test vector of wrong dimension");
-+
-+   auto Bo = Reshape(bo.Read(), Q1D, D1D-1);
-+   auto B = Reshape(b.Read(), Q1D, D1D);
-+   auto Bot = Reshape(bot.Read(), D1D-1, Q1D);
-+   auto Gct = Reshape(gct.Read(), D1D, Q1D);
-+   auto op = Reshape(pa_data.Read(), Q1D, Q1D, NE);
-+   auto X = Reshape(x.Read(), D1Dtest, D1Dtest, NE);
-+   auto Y = Reshape(y.ReadWrite(), 2*(D1D-1)*D1D, NE);
-+
-+   mfem::forall(NE, [=] MFEM_HOST_DEVICE (int e)
-+   {
-+      double mass[MAX_Q1D][MAX_Q1D];
-+
-+      // Zero-order term in L2 or H1 test space
-+
-+      for (int qy = 0; qy < Q1D; ++qy)
-+      {
-+         for (int qx = 0; qx < Q1D; ++qx)
-+         {
-+            mass[qy][qx] = 0.0;
-+         }
-+      }
-+
-+      for (int dy = 0; dy < D1Dtest; ++dy)
-+      {
-+         double sol_x[MAX_Q1D];
-+         for (int qy = 0; qy < Q1D; ++qy)
-+         {
-+            sol_x[qy] = 0.0;
-+         }
-+         for (int dx = 0; dx < D1Dtest; ++dx)
-+         {
-+            const double s = X(dx,dy,e);
-+            for (int qx = 0; qx < Q1D; ++qx)
-+            {
-+               sol_x[qx] += s * ((H1 == 1) ? B(qx,dx) : Bo(qx,dx));
-+            }
-+         }
-+         for (int qy = 0; qy < Q1D; ++qy)
-+         {
-+            const double d2q = (H1 == 1) ? B(qy,dy) : Bo(qy,dy);
-+            for (int qx = 0; qx < Q1D; ++qx)
-+            {
-+               mass[qy][qx] += d2q * sol_x[qx];
-+            }
-+         }
-+      }
-+
-+      // Apply D operator.
-+      for (int qy = 0; qy < Q1D; ++qy)
-+      {
-+         for (int qx = 0; qx < Q1D; ++qx)
-+         {
-+            mass[qy][qx] *= op(qx,qy,e);
-+         }
-+      }
-+
-+      for (int qy = 0; qy < Q1D; ++qy)
-+      {
-+         int osc = 0;
-+
-+         for (int c = 0; c < VDIM; ++c)  // loop over x, y components
-+         {
-+            const int D1Dy = (c == 1) ? D1D - 1 : D1D;
-+            const int D1Dx = (c == 0) ? D1D - 1 : D1D;
-+
-+            double gradX[MAX_D1D];
-+            for (int dx = 0; dx < D1Dx; ++dx)
-+            {
-+               gradX[dx] = 0.0;
-+            }
-+            for (int qx = 0; qx < Q1D; ++qx)
-+            {
-+               for (int dx = 0; dx < D1Dx; ++dx)
-+               {
-+                  gradX[dx] += mass[qy][qx] * ((c == 0) ? Bot(dx,qx) : Gct(dx,qx));
-+               }
-+            }
-+            for (int dy = 0; dy < D1Dy; ++dy)
-+            {
-+               const double wy = (c == 0) ? -Gct(dy,qy) : Bot(dy,qy);
-+
-+               for (int dx = 0; dx < D1Dx; ++dx)
-+               {
-+                  Y(dx + (dy * D1Dx) + osc, e) += gradX[dx] * wy;
-+               }
-+            }
-+
-+            osc += D1Dx * D1Dy;
-+         }  // loop c
-+      }  // loop qy
-+   }); // end of element loop
-+}
-+
-+// Apply to x corresponding to DOFs in H(curl) (trial), whose curl is
-+// integrated against H(curl) test functions corresponding to y.
-+template<int MAX_D1D = HCURL_MAX_D1D, int MAX_Q1D = HCURL_MAX_Q1D>
-+MFEM_HOST_DEVICE inline
-+void PAHcurlL2Apply3D(const int D1D,
-+                      const int Q1D,
-+                      const int coeffDim,
-+                      const int NE,
-+                      const Array<double> &bo,
-+                      const Array<double> &bc,
-+                      const Array<double> &bot,
-+                      const Array<double> &bct,
-+                      const Array<double> &gc,
-+                      const Vector &pa_data,
-+                      const Vector &x,
-+                      Vector &y)
-+{
-+   MFEM_VERIFY(D1D <= MAX_D1D, "Error: D1D > MAX_D1D");
-+   MFEM_VERIFY(Q1D <= MAX_Q1D, "Error: Q1D > MAX_Q1D");
-+   // Using u = dF^{-T} \hat{u} and (\nabla\times u) F =
-+   // 1/det(dF) dF \hat{\nabla}\times\hat{u} (p. 78 of Monk), we get:
-+   // (\nabla\times u) \cdot v
-+   //    = 1/det(dF) \hat{\nabla}\times\hat{u}^T dF^T dF^{-T} \hat{v}
-+   //    = 1/det(dF) \hat{\nabla}\times\hat{u}^T \hat{v}
-+   // If c = 0, \hat{\nabla}\times\hat{u} reduces to [0, (u_0)_{x_2}, -(u_0)_{x_1}]
-+   // If c = 1, \hat{\nabla}\times\hat{u} reduces to [-(u_1)_{x_2}, 0, (u_1)_{x_0}]
-+   // If c = 2, \hat{\nabla}\times\hat{u} reduces to [(u_2)_{x_1}, -(u_2)_{x_0}, 0]
-+
-+   constexpr static int VDIM = 3;
-+
-+   auto Bo = Reshape(bo.Read(), Q1D, D1D-1);
-+   auto Bc = Reshape(bc.Read(), Q1D, D1D);
-+   auto Bot = Reshape(bot.Read(), D1D-1, Q1D);
-+   auto Bct = Reshape(bct.Read(), D1D, Q1D);
-+   auto Gc = Reshape(gc.Read(), Q1D, D1D);
-+   auto op = Reshape(pa_data.Read(), coeffDim, Q1D, Q1D, Q1D, NE);
-+   auto X = Reshape(x.Read(), 3*(D1D-1)*D1D*D1D, NE);
-+   auto Y = Reshape(y.ReadWrite(), 3*(D1D-1)*D1D*D1D, NE);
-+
-+   mfem::forall(NE, [=] MFEM_HOST_DEVICE (int e)
-+   {
-+      double curl[MAX_Q1D][MAX_Q1D][MAX_Q1D][VDIM];
-+      // curl[qz][qy][qx] will be computed as the vector curl at each quadrature point.
-+
-+      for (int qz = 0; qz < Q1D; ++qz)
-+      {
-+         for (int qy = 0; qy < Q1D; ++qy)
-+         {
-+            for (int qx = 0; qx < Q1D; ++qx)
-+            {
-+               for (int c = 0; c < VDIM; ++c)
-+               {
-+                  curl[qz][qy][qx][c] = 0.0;
-+               }
-+            }
-+         }
-+      }
-+
-+      // We treat x, y, z components separately for optimization specific to each.
-+
-+      int osc = 0;
-+
-+      {
-+         // x component
-+         const int D1Dz = D1D;
-+         const int D1Dy = D1D;
-+         const int D1Dx = D1D - 1;
-+
-+         for (int dz = 0; dz < D1Dz; ++dz)
-+         {
-+            double gradXY[MAX_Q1D][MAX_Q1D][2];
-+            for (int qy = 0; qy < Q1D; ++qy)
-+            {
-+               for (int qx = 0; qx < Q1D; ++qx)
-+               {
-+                  for (int d = 0; d < 2; ++d)
-+                  {
-+                     gradXY[qy][qx][d] = 0.0;
-+                  }
-+               }
-+            }
-+
-+            for (int dy = 0; dy < D1Dy; ++dy)
-+            {
-+               double massX[MAX_Q1D];
-+               for (int qx = 0; qx < Q1D; ++qx)
-+               {
-+                  massX[qx] = 0.0;
-+               }
-+
-+               for (int dx = 0; dx < D1Dx; ++dx)
-+               {
-+                  const double t = X(dx + ((dy + (dz * D1Dy)) * D1Dx) + osc, e);
-+                  for (int qx = 0; qx < Q1D; ++qx)
-+                  {
-+                     massX[qx] += t * Bo(qx,dx);
-+                  }
-+               }
-+
-+               for (int qy = 0; qy < Q1D; ++qy)
-+               {
-+                  const double wy = Bc(qy,dy);
-+                  const double wDy = Gc(qy,dy);
-+                  for (int qx = 0; qx < Q1D; ++qx)
-+                  {
-+                     const double wx = massX[qx];
-+                     gradXY[qy][qx][0] += wx * wDy;
-+                     gradXY[qy][qx][1] += wx * wy;
-+                  }
-+               }
-+            }
-+
-+            for (int qz = 0; qz < Q1D; ++qz)
-+            {
-+               const double wz = Bc(qz,dz);
-+               const double wDz = Gc(qz,dz);
-+               for (int qy = 0; qy < Q1D; ++qy)
-+               {
-+                  for (int qx = 0; qx < Q1D; ++qx)
-+                  {
-+                     // \hat{\nabla}\times\hat{u} is [0, (u_0)_{x_2}, -(u_0)_{x_1}]
-+                     curl[qz][qy][qx][1] += gradXY[qy][qx][1] * wDz; // (u_0)_{x_2}
-+                     curl[qz][qy][qx][2] -= gradXY[qy][qx][0] * wz;  // -(u_0)_{x_1}
-+                  }
-+               }
-+            }
-+         }
-+
-+         osc += D1Dx * D1Dy * D1Dz;
-+      }
-+
-+      {
-+         // y component
-+         const int D1Dz = D1D;
-+         const int D1Dy = D1D - 1;
-+         const int D1Dx = D1D;
-+
-+         for (int dz = 0; dz < D1Dz; ++dz)
-+         {
-+            double gradXY[MAX_Q1D][MAX_Q1D][2];
-+            for (int qy = 0; qy < Q1D; ++qy)
-+            {
-+               for (int qx = 0; qx < Q1D; ++qx)
-+               {
-+                  for (int d = 0; d < 2; ++d)
-+                  {
-+                     gradXY[qy][qx][d] = 0.0;
-+                  }
-+               }
-+            }
-+
-+            for (int dx = 0; dx < D1Dx; ++dx)
-+            {
-+               double massY[MAX_Q1D];
-+               for (int qy = 0; qy < Q1D; ++qy)
-+               {
-+                  massY[qy] = 0.0;
-+               }
-+
-+               for (int dy = 0; dy < D1Dy; ++dy)
-+               {
-+                  const double t = X(dx + ((dy + (dz * D1Dy)) * D1Dx) + osc, e);
-+                  for (int qy = 0; qy < Q1D; ++qy)
-+                  {
-+                     massY[qy] += t * Bo(qy,dy);
-+                  }
-+               }
-+
-+               for (int qx = 0; qx < Q1D; ++qx)
-+               {
-+                  const double wx = Bc(qx,dx);
-+                  const double wDx = Gc(qx,dx);
-+                  for (int qy = 0; qy < Q1D; ++qy)
-+                  {
-+                     const double wy = massY[qy];
-+                     gradXY[qy][qx][0] += wDx * wy;
-+                     gradXY[qy][qx][1] += wx * wy;
-+                  }
-+               }
-+            }
-+
-+            for (int qz = 0; qz < Q1D; ++qz)
-+            {
-+               const double wz = Bc(qz,dz);
-+               const double wDz = Gc(qz,dz);
-+               for (int qy = 0; qy < Q1D; ++qy)
-+               {
-+                  for (int qx = 0; qx < Q1D; ++qx)
-+                  {
-+                     // \hat{\nabla}\times\hat{u} is [-(u_1)_{x_2}, 0, (u_1)_{x_0}]
-+                     curl[qz][qy][qx][0] -= gradXY[qy][qx][1] * wDz; // -(u_1)_{x_2}
-+                     curl[qz][qy][qx][2] += gradXY[qy][qx][0] * wz;  // (u_1)_{x_0}
-+                  }
-+               }
-+            }
-+         }
-+
-+         osc += D1Dx * D1Dy * D1Dz;
-+      }
-+
-+      {
-+         // z component
-+         const int D1Dz = D1D - 1;
-+         const int D1Dy = D1D;
-+         const int D1Dx = D1D;
-+
-+         for (int dx = 0; dx < D1Dx; ++dx)
-+         {
-+            double gradYZ[MAX_Q1D][MAX_Q1D][2];
-+            for (int qz = 0; qz < Q1D; ++qz)
-+            {
-+               for (int qy = 0; qy < Q1D; ++qy)
-+               {
-+                  for (int d = 0; d < 2; ++d)
-+                  {
-+                     gradYZ[qz][qy][d] = 0.0;
-+                  }
-+               }
-+            }
-+
-+            for (int dy = 0; dy < D1Dy; ++dy)
-+            {
-+               double massZ[MAX_Q1D];
-+               for (int qz = 0; qz < Q1D; ++qz)
-+               {
-+                  massZ[qz] = 0.0;
-+               }
-+
-+               for (int dz = 0; dz < D1Dz; ++dz)
-+               {
-+                  const double t = X(dx + ((dy + (dz * D1Dy)) * D1Dx) + osc, e);
-+                  for (int qz = 0; qz < Q1D; ++qz)
-+                  {
-+                     massZ[qz] += t * Bo(qz,dz);
-+                  }
-+               }
-+
-+               for (int qy = 0; qy < Q1D; ++qy)
-+               {
-+                  const double wy = Bc(qy,dy);
-+                  const double wDy = Gc(qy,dy);
-+                  for (int qz = 0; qz < Q1D; ++qz)
-+                  {
-+                     const double wz = massZ[qz];
-+                     gradYZ[qz][qy][0] += wz * wy;
-+                     gradYZ[qz][qy][1] += wz * wDy;
-+                  }
-+               }
-+            }
-+
-+            for (int qx = 0; qx < Q1D; ++qx)
-+            {
-+               const double wx = Bc(qx,dx);
-+               const double wDx = Gc(qx,dx);
-+
-+               for (int qy = 0; qy < Q1D; ++qy)
-+               {
-+                  for (int qz = 0; qz < Q1D; ++qz)
-+                  {
-+                     // \hat{\nabla}\times\hat{u} is [(u_2)_{x_1}, -(u_2)_{x_0}, 0]
-+                     curl[qz][qy][qx][0] += gradYZ[qz][qy][1] * wx;  // (u_2)_{x_1}
-+                     curl[qz][qy][qx][1] -= gradYZ[qz][qy][0] * wDx; // -(u_2)_{x_0}
-+                  }
-+               }
-+            }
-+         }
-+      }
-+
-+      // Apply D operator.
-+      for (int qz = 0; qz < Q1D; ++qz)
-+      {
-+         for (int qy = 0; qy < Q1D; ++qy)
-+         {
-+            for (int qx = 0; qx < Q1D; ++qx)
-+            {
-+               const double O11 = op(0,qx,qy,qz,e);
-+               if (coeffDim == 1)
-+               {
-+                  for (int c = 0; c < VDIM; ++c)
-+                  {
-+                     curl[qz][qy][qx][c] *= O11;
-+                  }
-+               }
-+               else
-+               {
-+                  const double O21 = op(1,qx,qy,qz,e);
-+                  const double O31 = op(2,qx,qy,qz,e);
-+                  const double O12 = op(3,qx,qy,qz,e);
-+                  const double O22 = op(4,qx,qy,qz,e);
-+                  const double O32 = op(5,qx,qy,qz,e);
-+                  const double O13 = op(6,qx,qy,qz,e);
-+                  const double O23 = op(7,qx,qy,qz,e);
-+                  const double O33 = op(8,qx,qy,qz,e);
-+                  const double curlX = curl[qz][qy][qx][0];
-+                  const double curlY = curl[qz][qy][qx][1];
-+                  const double curlZ = curl[qz][qy][qx][2];
-+                  curl[qz][qy][qx][0] = (O11*curlX)+(O12*curlY)+(O13*curlZ);
-+                  curl[qz][qy][qx][1] = (O21*curlX)+(O22*curlY)+(O23*curlZ);
-+                  curl[qz][qy][qx][2] = (O31*curlX)+(O32*curlY)+(O33*curlZ);
-+               }
-+            }
-+         }
-+      }
-+
-+      for (int qz = 0; qz < Q1D; ++qz)
-+      {
-+         double massXY[MAX_D1D][MAX_D1D];
-+
-+         osc = 0;
-+
-+         for (int c = 0; c < VDIM; ++c)  // loop over x, y, z components
-+         {
-+            const int D1Dz = (c == 2) ? D1D - 1 : D1D;
-+            const int D1Dy = (c == 1) ? D1D - 1 : D1D;
-+            const int D1Dx = (c == 0) ? D1D - 1 : D1D;
-+
-+            for (int dy = 0; dy < D1Dy; ++dy)
-+            {
-+               for (int dx = 0; dx < D1Dx; ++dx)
-+               {
-+                  massXY[dy][dx] = 0;
-+               }
-+            }
-+            for (int qy = 0; qy < Q1D; ++qy)
-+            {
-+               double massX[MAX_D1D];
-+               for (int dx = 0; dx < D1Dx; ++dx)
-+               {
-+                  massX[dx] = 0.0;
-+               }
-+               for (int qx = 0; qx < Q1D; ++qx)
-+               {
-+                  for (int dx = 0; dx < D1Dx; ++dx)
-+                  {
-+                     massX[dx] += curl[qz][qy][qx][c] * ((c == 0) ? Bot(dx,qx) : Bct(dx,qx));
-+                  }
-+               }
-+
-+               for (int dy = 0; dy < D1Dy; ++dy)
-+               {
-+                  const double wy = (c == 1) ? Bot(dy,qy) : Bct(dy,qy);
-+                  for (int dx = 0; dx < D1Dx; ++dx)
-+                  {
-+                     massXY[dy][dx] += massX[dx] * wy;
-+                  }
-+               }
-+            }
-+
-+            for (int dz = 0; dz < D1Dz; ++dz)
-+            {
-+               const double wz = (c == 2) ? Bot(dz,qz) : Bct(dz,qz);
-+               for (int dy = 0; dy < D1Dy; ++dy)
-+               {
-+                  for (int dx = 0; dx < D1Dx; ++dx)
-+                  {
-+                     Y(dx + ((dy + (dz * D1Dy)) * D1Dx) + osc, e) += massXY[dy][dx] * wz;
-+                  }
-+               }
-+            }
-+
-+            osc += D1Dx * D1Dy * D1Dz;
-+         }  // loop c
-+      }  // loop qz
-+   }); // end of element loop
-+}
-+
-+// Apply to x corresponding to DOFs in H(curl) (trial), whose curl is
-+// integrated against H(curl) test functions corresponding to y.
-+template<int MAX_D1D = HCURL_MAX_D1D, int MAX_Q1D = HCURL_MAX_Q1D>
-+MFEM_HOST_DEVICE inline
-+void SmemPAHcurlL2Apply3D(const int D1D,
-+                          const int Q1D,
-+                          const int coeffDim,
-+                          const int NE,
-+                          const Array<double> &bo,
-+                          const Array<double> &bc,
-+                          const Array<double> &gc,
-+                          const Vector &pa_data,
-+                          const Vector &x,
-+                          Vector &y)
-+{
-+   MFEM_VERIFY(D1D <= MAX_D1D, "Error: D1D > MAX_D1D");
-+   MFEM_VERIFY(Q1D <= MAX_Q1D, "Error: Q1D > MAX_Q1D");
-+
-+   auto Bo = Reshape(bo.Read(), Q1D, D1D-1);
-+   auto Bc = Reshape(bc.Read(), Q1D, D1D);
-+   auto Gc = Reshape(gc.Read(), Q1D, D1D);
-+   auto op = Reshape(pa_data.Read(), coeffDim, Q1D, Q1D, Q1D, NE);
-+   auto X = Reshape(x.Read(), 3*(D1D-1)*D1D*D1D, NE);
-+   auto Y = Reshape(y.ReadWrite(), 3*(D1D-1)*D1D*D1D, NE);
-+
-+   auto device_kernel = [=] MFEM_DEVICE (int e)
-+   {
-+      constexpr int VDIM = 3;
-+      constexpr int maxCoeffDim = 9;
-+
-+      MFEM_SHARED double sBo[MAX_D1D][MAX_Q1D];
-+      MFEM_SHARED double sBc[MAX_D1D][MAX_Q1D];
-+      MFEM_SHARED double sGc[MAX_D1D][MAX_Q1D];
-+
-+      double opc[maxCoeffDim];
-+      MFEM_SHARED double sop[maxCoeffDim][MAX_Q1D][MAX_Q1D];
-+      MFEM_SHARED double curl[MAX_Q1D][MAX_Q1D][3];
-+
-+      MFEM_SHARED double sX[MAX_D1D][MAX_D1D][MAX_D1D];
-+
-+      MFEM_FOREACH_THREAD(qx,x,Q1D)
-+      {
-+         MFEM_FOREACH_THREAD(qy,y,Q1D)
-+         {
-+            MFEM_FOREACH_THREAD(qz,z,Q1D)
-+            {
-+               for (int i=0; i<coeffDim; ++i)
-+               {
-+                  opc[i] = op(i,qx,qy,qz,e);
-+               }
-+            }
-+         }
-+      }
-+
-+      const int tidx = MFEM_THREAD_ID(x);
-+      const int tidy = MFEM_THREAD_ID(y);
-+      const int tidz = MFEM_THREAD_ID(z);
-+
-+      if (tidz == 0)
-+      {
-+         MFEM_FOREACH_THREAD(d,y,D1D)
-+         {
-+            MFEM_FOREACH_THREAD(q,x,Q1D)
-+            {
-+               sBc[d][q] = Bc(q,d);
-+               sGc[d][q] = Gc(q,d);
-+               if (d < D1D-1)
-+               {
-+                  sBo[d][q] = Bo(q,d);
-+               }
-+            }
-+         }
-+      }
-+      MFEM_SYNC_THREAD;
-+
-+      for (int qz=0; qz < Q1D; ++qz)
-+      {
-+         if (tidz == qz)
-+         {
-+            MFEM_FOREACH_THREAD(qy,y,Q1D)
-+            {
-+               MFEM_FOREACH_THREAD(qx,x,Q1D)
-+               {
-+                  for (int i=0; i<3; ++i)
-+                  {
-+                     curl[qy][qx][i] = 0.0;
-+                  }
-+               }
-+            }
-+         }
-+
-+         int osc = 0;
-+         for (int c = 0; c < VDIM; ++c)  // loop over x, y, z components
-+         {
-+            const int D1Dz = (c == 2) ? D1D - 1 : D1D;
-+            const int D1Dy = (c == 1) ? D1D - 1 : D1D;
-+            const int D1Dx = (c == 0) ? D1D - 1 : D1D;
-+
-+            MFEM_FOREACH_THREAD(dz,z,D1Dz)
-+            {
-+               MFEM_FOREACH_THREAD(dy,y,D1Dy)
-+               {
-+                  MFEM_FOREACH_THREAD(dx,x,D1Dx)
-+                  {
-+                     sX[dz][dy][dx] = X(dx + ((dy + (dz * D1Dy)) * D1Dx) + osc, e);
-+                  }
-+               }
-+            }
-+            MFEM_SYNC_THREAD;
-+
-+            if (tidz == qz)
-+            {
-+               if (c == 0)
-+               {
-+                  for (int i=0; i<coeffDim; ++i)
-+                  {
-+                     sop[i][tidx][tidy] = opc[i];
-+                  }
-+               }
-+
-+               MFEM_FOREACH_THREAD(qy,y,Q1D)
-+               {
-+                  MFEM_FOREACH_THREAD(qx,x,Q1D)
-+                  {
-+                     double u = 0.0;
-+                     double v = 0.0;
-+
-+                     // We treat x, y, z components separately for optimization specific to each.
-+                     if (c == 0) // x component
-+                     {
-+                        // \hat{\nabla}\times\hat{u} is [0, (u_0)_{x_2}, -(u_0)_{x_1}]
-+
-+                        for (int dz = 0; dz < D1Dz; ++dz)
-+                        {
-+                           const double wz = sBc[dz][qz];
-+                           const double wDz = sGc[dz][qz];
-+
-+                           for (int dy = 0; dy < D1Dy; ++dy)
-+                           {
-+                              const double wy = sBc[dy][qy];
-+                              const double wDy = sGc[dy][qy];
-+
-+                              for (int dx = 0; dx < D1Dx; ++dx)
-+                              {
-+                                 const double wx = sX[dz][dy][dx] * sBo[dx][qx];
-+                                 u += wx * wDy * wz;
-+                                 v += wx * wy * wDz;
-+                              }
-+                           }
-+                        }
-+
-+                        curl[qy][qx][1] += v; // (u_0)_{x_2}
-+                        curl[qy][qx][2] -= u;  // -(u_0)_{x_1}
-+                     }
-+                     else if (c == 1)  // y component
-+                     {
-+                        // \hat{\nabla}\times\hat{u} is [-(u_1)_{x_2}, 0, (u_1)_{x_0}]
-+
-+                        for (int dz = 0; dz < D1Dz; ++dz)
-+                        {
-+                           const double wz = sBc[dz][qz];
-+                           const double wDz = sGc[dz][qz];
-+
-+                           for (int dy = 0; dy < D1Dy; ++dy)
-+                           {
-+                              const double wy = sBo[dy][qy];
-+
-+                              for (int dx = 0; dx < D1Dx; ++dx)
-+                              {
-+                                 const double t = sX[dz][dy][dx];
-+                                 const double wx = t * sBc[dx][qx];
-+                                 const double wDx = t * sGc[dx][qx];
-+
-+                                 u += wDx * wy * wz;
-+                                 v += wx * wy * wDz;
-+                              }
-+                           }
-+                        }
-+
-+                        curl[qy][qx][0] -= v; // -(u_1)_{x_2}
-+                        curl[qy][qx][2] += u; // (u_1)_{x_0}
-+                     }
-+                     else // z component
-+                     {
-+                        // \hat{\nabla}\times\hat{u} is [(u_2)_{x_1}, -(u_2)_{x_0}, 0]
-+
-+                        for (int dz = 0; dz < D1Dz; ++dz)
-+                        {
-+                           const double wz = sBo[dz][qz];
-+
-+                           for (int dy = 0; dy < D1Dy; ++dy)
-+                           {
-+                              const double wy = sBc[dy][qy];
-+                              const double wDy = sGc[dy][qy];
-+
-+                              for (int dx = 0; dx < D1Dx; ++dx)
-+                              {
-+                                 const double t = sX[dz][dy][dx];
-+                                 const double wx = t * sBc[dx][qx];
-+                                 const double wDx = t * sGc[dx][qx];
-+
-+                                 u += wDx * wy * wz;
-+                                 v += wx * wDy * wz;
-+                              }
-+                           }
-+                        }
-+
-+                        curl[qy][qx][0] += v; // (u_2)_{x_1}
-+                        curl[qy][qx][1] -= u; // -(u_2)_{x_0}
-+                     }
-+                  } // qx
-+               } // qy
-+            } // tidz == qz
-+
-+            osc += D1Dx * D1Dy * D1Dz;
-+            MFEM_SYNC_THREAD;
-+         } // c
-+
-+         double dxyz1 = 0.0;
-+         double dxyz2 = 0.0;
-+         double dxyz3 = 0.0;
-+
-+         MFEM_FOREACH_THREAD(dz,z,D1D)
-+         {
-+            const double wcz = sBc[dz][qz];
-+            const double wz = (dz < D1D-1) ? sBo[dz][qz] : 0.0;
-+
-+            MFEM_FOREACH_THREAD(dy,y,D1D)
-+            {
-+               MFEM_FOREACH_THREAD(dx,x,D1D)
-+               {
-+                  for (int qy = 0; qy < Q1D; ++qy)
-+                  {
-+                     const double wcy = sBc[dy][qy];
-+                     const double wy = (dy < D1D-1) ? sBo[dy][qy] : 0.0;
-+
-+                     for (int qx = 0; qx < Q1D; ++qx)
-+                     {
-+                        const double O11 = sop[0][qx][qy];
-+                        double c1, c2, c3;
-+                        if (coeffDim == 1)
-+                        {
-+                           c1 = O11 * curl[qy][qx][0];
-+                           c2 = O11 * curl[qy][qx][1];
-+                           c3 = O11 * curl[qy][qx][2];
-+                        }
-+                        else
-+                        {
-+                           const double O21 = sop[1][qx][qy];
-+                           const double O31 = sop[2][qx][qy];
-+                           const double O12 = sop[3][qx][qy];
-+                           const double O22 = sop[4][qx][qy];
-+                           const double O32 = sop[5][qx][qy];
-+                           const double O13 = sop[6][qx][qy];
-+                           const double O23 = sop[7][qx][qy];
-+                           const double O33 = sop[8][qx][qy];
-+                           c1 = (O11*curl[qy][qx][0])+(O12*curl[qy][qx][1])+(O13*curl[qy][qx][2]);
-+                           c2 = (O21*curl[qy][qx][0])+(O22*curl[qy][qx][1])+(O23*curl[qy][qx][2]);
-+                           c3 = (O31*curl[qy][qx][0])+(O32*curl[qy][qx][1])+(O33*curl[qy][qx][2]);
-+                        }
-+
-+                        const double wcx = sBc[dx][qx];
-+
-+                        if (dx < D1D-1)
-+                        {
-+                           const double wx = sBo[dx][qx];
-+                           dxyz1 += c1 * wx * wcy * wcz;
-+                        }
-+
-+                        dxyz2 += c2 * wcx * wy * wcz;
-+                        dxyz3 += c3 * wcx * wcy * wz;
-+                     } // qx
-+                  } // qy
-+               } // dx
-+            } // dy
-+         } // dz
-+
-+         MFEM_SYNC_THREAD;
-+
-+         MFEM_FOREACH_THREAD(dz,z,D1D)
-+         {
-+            MFEM_FOREACH_THREAD(dy,y,D1D)
-+            {
-+               MFEM_FOREACH_THREAD(dx,x,D1D)
-+               {
-+                  if (dx < D1D-1)
-+                  {
-+                     Y(dx + ((dy + (dz * D1D)) * (D1D-1)), e) += dxyz1;
-+                  }
-+                  if (dy < D1D-1)
-+                  {
-+                     Y(dx + ((dy + (dz * (D1D-1))) * D1D) + ((D1D-1)*D1D*D1D), e) += dxyz2;
-+                  }
-+                  if (dz < D1D-1)
-+                  {
-+                     Y(dx + ((dy + (dz * D1D)) * D1D) + (2*(D1D-1)*D1D*D1D), e) += dxyz3;
-+                  }
-+               }
-+            }
-+         }
-+      } // qz
-+   }; // end of element loop
-+
-+   auto host_kernel = [&] MFEM_LAMBDA (int)
-+   {
-+      MFEM_ABORT_KERNEL("This kernel should only be used on GPU.");
-+   };
-+
-+   ForallWrap<3>(true, NE, device_kernel, host_kernel, Q1D, Q1D, Q1D);
-+}
-+
-+// Apply to x corresponding to DOFs in H(curl) (trial), integrated against curl
-+// of H(curl) test functions corresponding to y.
-+template<int MAX_D1D = HCURL_MAX_D1D, int MAX_Q1D = HCURL_MAX_Q1D>
-+MFEM_HOST_DEVICE inline
-+void PAHcurlL2Apply3DTranspose(const int D1D,
-+                               const int Q1D,
-+                               const int coeffDim,
-+                               const int NE,
-+                               const Array<double> &bo,
-+                               const Array<double> &bc,
-+                               const Array<double> &bot,
-+                               const Array<double> &bct,
-+                               const Array<double> &gct,
-+                               const Vector &pa_data,
-+                               const Vector &x,
-+                               Vector &y)
-+{
-+   // See PAHcurlL2Apply3D for comments.
-+
-+   MFEM_VERIFY(D1D <= MAX_D1D, "Error: D1D > MAX_D1D");
-+   MFEM_VERIFY(Q1D <= MAX_Q1D, "Error: Q1D > MAX_Q1D");
-+
-+   constexpr static int VDIM = 3;
-+
-+   auto Bo = Reshape(bo.Read(), Q1D, D1D-1);
-+   auto Bc = Reshape(bc.Read(), Q1D, D1D);
-+   auto Bot = Reshape(bot.Read(), D1D-1, Q1D);
-+   auto Bct = Reshape(bct.Read(), D1D, Q1D);
-+   auto Gct = Reshape(gct.Read(), D1D, Q1D);
-+   auto op = Reshape(pa_data.Read(), coeffDim, Q1D, Q1D, Q1D, NE);
-+   auto X = Reshape(x.Read(), 3*(D1D-1)*D1D*D1D, NE);
-+   auto Y = Reshape(y.ReadWrite(), 3*(D1D-1)*D1D*D1D, NE);
-+
-+   mfem::forall(NE, [=] MFEM_HOST_DEVICE (int e)
-+   {
-+      double mass[MAX_Q1D][MAX_Q1D][MAX_Q1D][VDIM];
-+
-+      for (int qz = 0; qz < Q1D; ++qz)
-+      {
-+         for (int qy = 0; qy < Q1D; ++qy)
-+         {
-+            for (int qx = 0; qx < Q1D; ++qx)
-+            {
-+               for (int c = 0; c < VDIM; ++c)
-+               {
-+                  mass[qz][qy][qx][c] = 0.0;
-+               }
-+            }
-+         }
-+      }
-+
-+      int osc = 0;
-+
-+      for (int c = 0; c < VDIM; ++c)  // loop over x, y, z components
-+      {
-+         const int D1Dz = (c == 2) ? D1D - 1 : D1D;
-+         const int D1Dy = (c == 1) ? D1D - 1 : D1D;
-+         const int D1Dx = (c == 0) ? D1D - 1 : D1D;
-+
-+         for (int dz = 0; dz < D1Dz; ++dz)
-+         {
-+            double massXY[MAX_Q1D][MAX_Q1D];
-+            for (int qy = 0; qy < Q1D; ++qy)
-+            {
-+               for (int qx = 0; qx < Q1D; ++qx)
-+               {
-+                  massXY[qy][qx] = 0.0;
-+               }
-+            }
-+
-+            for (int dy = 0; dy < D1Dy; ++dy)
-+            {
-+               double massX[MAX_Q1D];
-+               for (int qx = 0; qx < Q1D; ++qx)
-+               {
-+                  massX[qx] = 0.0;
-+               }
-+
-+               for (int dx = 0; dx < D1Dx; ++dx)
-+               {
-+                  const double t = X(dx + ((dy + (dz * D1Dy)) * D1Dx) + osc, e);
-+                  for (int qx = 0; qx < Q1D; ++qx)
-+                  {
-+                     massX[qx] += t * ((c == 0) ? Bo(qx,dx) : Bc(qx,dx));
-+                  }
-+               }
-+
-+               for (int qy = 0; qy < Q1D; ++qy)
-+               {
-+                  const double wy = (c == 1) ? Bo(qy,dy) : Bc(qy,dy);
-+                  for (int qx = 0; qx < Q1D; ++qx)
-+                  {
-+                     const double wx = massX[qx];
-+                     massXY[qy][qx] += wx * wy;
-+                  }
-+               }
-+            }
-+
-+            for (int qz = 0; qz < Q1D; ++qz)
-+            {
-+               const double wz = (c == 2) ? Bo(qz,dz) : Bc(qz,dz);
-+               for (int qy = 0; qy < Q1D; ++qy)
-+               {
-+                  for (int qx = 0; qx < Q1D; ++qx)
-+                  {
-+                     mass[qz][qy][qx][c] += massXY[qy][qx] * wz;
-+                  }
-+               }
-+            }
-+         }
-+
-+         osc += D1Dx * D1Dy * D1Dz;
-+      }  // loop (c) over components
-+
-+      // Apply D operator.
-+      for (int qz = 0; qz < Q1D; ++qz)
-+      {
-+         for (int qy = 0; qy < Q1D; ++qy)
-+         {
-+            for (int qx = 0; qx < Q1D; ++qx)
-+            {
-+               const double O11 = op(0,qx,qy,qz,e);
-+               if (coeffDim == 1)
-+               {
-+                  for (int c = 0; c < VDIM; ++c)
-+                  {
-+                     mass[qz][qy][qx][c] *= O11;
-+                  }
-+               }
-+               else
-+               {
-+                  const double O12 = op(1,qx,qy,qz,e);
-+                  const double O13 = op(2,qx,qy,qz,e);
-+                  const double O21 = op(3,qx,qy,qz,e);
-+                  const double O22 = op(4,qx,qy,qz,e);
-+                  const double O23 = op(5,qx,qy,qz,e);
-+                  const double O31 = op(6,qx,qy,qz,e);
-+                  const double O32 = op(7,qx,qy,qz,e);
-+                  const double O33 = op(8,qx,qy,qz,e);
-+                  const double massX = mass[qz][qy][qx][0];
-+                  const double massY = mass[qz][qy][qx][1];
-+                  const double massZ = mass[qz][qy][qx][2];
-+                  mass[qz][qy][qx][0] = (O11*massX)+(O12*massY)+(O13*massZ);
-+                  mass[qz][qy][qx][1] = (O21*massX)+(O22*massY)+(O23*massZ);
-+                  mass[qz][qy][qx][2] = (O31*massX)+(O32*massY)+(O33*massZ);
-+               }
-+            }
-+         }
-+      }
-+
-+      // x component
-+      osc = 0;
-+      {
-+         const int D1Dz = D1D;
-+         const int D1Dy = D1D;
-+         const int D1Dx = D1D - 1;
-+
-+         for (int qz = 0; qz < Q1D; ++qz)
-+         {
-+            double gradXY12[MAX_D1D][MAX_D1D];
-+            double gradXY21[MAX_D1D][MAX_D1D];
-+
-+            for (int dy = 0; dy < D1Dy; ++dy)
-+            {
-+               for (int dx = 0; dx < D1Dx; ++dx)
-+               {
-+                  gradXY12[dy][dx] = 0.0;
-+                  gradXY21[dy][dx] = 0.0;
-+               }
-+            }
-+            for (int qy = 0; qy < Q1D; ++qy)
-+            {
-+               double massX[MAX_D1D][2];
-+               for (int dx = 0; dx < D1Dx; ++dx)
-+               {
-+                  for (int n = 0; n < 2; ++n)
-+                  {
-+                     massX[dx][n] = 0.0;
-+                  }
-+               }
-+               for (int qx = 0; qx < Q1D; ++qx)
-+               {
-+                  for (int dx = 0; dx < D1Dx; ++dx)
-+                  {
-+                     const double wx = Bot(dx,qx);
-+
-+                     massX[dx][0] += wx * mass[qz][qy][qx][1];
-+                     massX[dx][1] += wx * mass[qz][qy][qx][2];
-+                  }
-+               }
-+               for (int dy = 0; dy < D1Dy; ++dy)
-+               {
-+                  const double wy = Bct(dy,qy);
-+                  const double wDy = Gct(dy,qy);
-+
-+                  for (int dx = 0; dx < D1Dx; ++dx)
-+                  {
-+                     gradXY21[dy][dx] += massX[dx][0] * wy;
-+                     gradXY12[dy][dx] += massX[dx][1] * wDy;
-+                  }
-+               }
-+            }
-+
-+            for (int dz = 0; dz < D1Dz; ++dz)
-+            {
-+               const double wz = Bct(dz,qz);
-+               const double wDz = Gct(dz,qz);
-+               for (int dy = 0; dy < D1Dy; ++dy)
-+               {
-+                  for (int dx = 0; dx < D1Dx; ++dx)
-+                  {
-+                     // \hat{\nabla}\times\hat{u} is [0, (u_0)_{x_2}, -(u_0)_{x_1}]
-+                     // (u_0)_{x_2} * (op * curl)_1 - (u_0)_{x_1} * (op * curl)_2
-+                     Y(dx + ((dy + (dz * D1Dy)) * D1Dx) + osc,
-+                       e) += (gradXY21[dy][dx] * wDz) - (gradXY12[dy][dx] * wz);
-+                  }
-+               }
-+            }
-+         }  // loop qz
-+
-+         osc += D1Dx * D1Dy * D1Dz;
-+      }
-+
-+      // y component
-+      {
-+         const int D1Dz = D1D;
-+         const int D1Dy = D1D - 1;
-+         const int D1Dx = D1D;
-+
-+         for (int qz = 0; qz < Q1D; ++qz)
-+         {
-+            double gradXY02[MAX_D1D][MAX_D1D];
-+            double gradXY20[MAX_D1D][MAX_D1D];
-+
-+            for (int dy = 0; dy < D1Dy; ++dy)
-+            {
-+               for (int dx = 0; dx < D1Dx; ++dx)
-+               {
-+                  gradXY02[dy][dx] = 0.0;
-+                  gradXY20[dy][dx] = 0.0;
-+               }
-+            }
-+            for (int qx = 0; qx < Q1D; ++qx)
-+            {
-+               double massY[MAX_D1D][2];
-+               for (int dy = 0; dy < D1Dy; ++dy)
-+               {
-+                  massY[dy][0] = 0.0;
-+                  massY[dy][1] = 0.0;
-+               }
-+               for (int qy = 0; qy < Q1D; ++qy)
-+               {
-+                  for (int dy = 0; dy < D1Dy; ++dy)
-+                  {
-+                     const double wy = Bot(dy,qy);
-+
-+                     massY[dy][0] += wy * mass[qz][qy][qx][2];
-+                     massY[dy][1] += wy * mass[qz][qy][qx][0];
-+                  }
-+               }
-+               for (int dx = 0; dx < D1Dx; ++dx)
-+               {
-+                  const double wx = Bct(dx,qx);
-+                  const double wDx = Gct(dx,qx);
-+
-+                  for (int dy = 0; dy < D1Dy; ++dy)
-+                  {
-+                     gradXY02[dy][dx] += massY[dy][0] * wDx;
-+                     gradXY20[dy][dx] += massY[dy][1] * wx;
-+                  }
-+               }
-+            }
-+
-+            for (int dz = 0; dz < D1Dz; ++dz)
-+            {
-+               const double wz = Bct(dz,qz);
-+               const double wDz = Gct(dz,qz);
-+               for (int dy = 0; dy < D1Dy; ++dy)
-+               {
-+                  for (int dx = 0; dx < D1Dx; ++dx)
-+                  {
-+                     // \hat{\nabla}\times\hat{u} is [-(u_1)_{x_2}, 0, (u_1)_{x_0}]
-+                     // -(u_1)_{x_2} * (op * curl)_0 + (u_1)_{x_0} * (op * curl)_2
-+                     Y(dx + ((dy + (dz * D1Dy)) * D1Dx) + osc,
-+                       e) += (-gradXY20[dy][dx] * wDz) + (gradXY02[dy][dx] * wz);
-+                  }
-+               }
-+            }
-+         }  // loop qz
-+
-+         osc += D1Dx * D1Dy * D1Dz;
-+      }
-+
-+      // z component
-+      {
-+         const int D1Dz = D1D - 1;
-+         const int D1Dy = D1D;
-+         const int D1Dx = D1D;
-+
-+         for (int qx = 0; qx < Q1D; ++qx)
-+         {
-+            double gradYZ01[MAX_D1D][MAX_D1D];
-+            double gradYZ10[MAX_D1D][MAX_D1D];
-+
-+            for (int dy = 0; dy < D1Dy; ++dy)
-+            {
-+               for (int dz = 0; dz < D1Dz; ++dz)
-+               {
-+                  gradYZ01[dz][dy] = 0.0;
-+                  gradYZ10[dz][dy] = 0.0;
-+               }
-+            }
-+            for (int qy = 0; qy < Q1D; ++qy)
-+            {
-+               double massZ[MAX_D1D][2];
-+               for (int dz = 0; dz < D1Dz; ++dz)
-+               {
-+                  for (int n = 0; n < 2; ++n)
-+                  {
-+                     massZ[dz][n] = 0.0;
-+                  }
-+               }
-+               for (int qz = 0; qz < Q1D; ++qz)
-+               {
-+                  for (int dz = 0; dz < D1Dz; ++dz)
-+                  {
-+                     const double wz = Bot(dz,qz);
-+
-+                     massZ[dz][0] += wz * mass[qz][qy][qx][0];
-+                     massZ[dz][1] += wz * mass[qz][qy][qx][1];
-+                  }
-+               }
-+               for (int dy = 0; dy < D1Dy; ++dy)
-+               {
-+                  const double wy = Bct(dy,qy);
-+                  const double wDy = Gct(dy,qy);
-+
-+                  for (int dz = 0; dz < D1Dz; ++dz)
-+                  {
-+                     gradYZ01[dz][dy] += wy * massZ[dz][1];
-+                     gradYZ10[dz][dy] += wDy * massZ[dz][0];
-+                  }
-+               }
-+            }
-+
-+            for (int dx = 0; dx < D1Dx; ++dx)
-+            {
-+               const double wx = Bct(dx,qx);
-+               const double wDx = Gct(dx,qx);
-+
-+               for (int dy = 0; dy < D1Dy; ++dy)
-+               {
-+                  for (int dz = 0; dz < D1Dz; ++dz)
-+                  {
-+                     // \hat{\nabla}\times\hat{u} is [(u_2)_{x_1}, -(u_2)_{x_0}, 0]
-+                     // (u_2)_{x_1} * (op * curl)_0 - (u_2)_{x_0} * (op * curl)_1
-+                     Y(dx + ((dy + (dz * D1Dy)) * D1Dx) + osc,
-+                       e) += (gradYZ10[dz][dy] * wx) - (gradYZ01[dz][dy] * wDx);
-+                  }
-+               }
-+            }
-+         }  // loop qx
-+      }
-+   });
-+}
-+
-+template<int MAX_D1D = HCURL_MAX_D1D, int MAX_Q1D = HCURL_MAX_Q1D>
-+MFEM_HOST_DEVICE inline
-+void SmemPAHcurlL2Apply3DTranspose(const int D1D,
-+                                   const int Q1D,
-+                                   const int coeffDim,
-+                                   const int NE,
-+                                   const Array<double> &bo,
-+                                   const Array<double> &bc,
-+                                   const Array<double> &gc,
-+                                   const Vector &pa_data,
-+                                   const Vector &x,
-+                                   Vector &y)
-+{
-+   MFEM_VERIFY(D1D <= MAX_D1D, "Error: D1D > MAX_D1D");
-+   MFEM_VERIFY(Q1D <= MAX_Q1D, "Error: Q1D > MAX_Q1D");
-+
-+   auto Bo = Reshape(bo.Read(), Q1D, D1D-1);
-+   auto Bc = Reshape(bc.Read(), Q1D, D1D);
-+   auto Gc = Reshape(gc.Read(), Q1D, D1D);
-+   auto op = Reshape(pa_data.Read(), coeffDim, Q1D, Q1D, Q1D, NE);
-+   auto X = Reshape(x.Read(), 3*(D1D-1)*D1D*D1D, NE);
-+   auto Y = Reshape(y.ReadWrite(), 3*(D1D-1)*D1D*D1D, NE);
-+
-+   auto device_kernel = [=] MFEM_DEVICE (int e)
-+   {
-+      constexpr int VDIM = 3;
-+      constexpr int maxCoeffDim = 9;
-+
-+      MFEM_SHARED double sBo[MAX_D1D][MAX_Q1D];
-+      MFEM_SHARED double sBc[MAX_D1D][MAX_Q1D];
-+      MFEM_SHARED double sGc[MAX_D1D][MAX_Q1D];
-+
-+      double opc[maxCoeffDim];
-+      MFEM_SHARED double sop[maxCoeffDim][MAX_Q1D][MAX_Q1D];
-+      MFEM_SHARED double mass[MAX_Q1D][MAX_Q1D][3];
-+
-+      MFEM_SHARED double sX[MAX_D1D][MAX_D1D][MAX_D1D];
-+
-+      MFEM_FOREACH_THREAD(qx,x,Q1D)
-+      {
-+         MFEM_FOREACH_THREAD(qy,y,Q1D)
-+         {
-+            MFEM_FOREACH_THREAD(qz,z,Q1D)
-+            {
-+               for (int i=0; i<coeffDim; ++i)
-+               {
-+                  opc[i] = op(i,qx,qy,qz,e);
-+               }
-+            }
-+         }
-+      }
-+
-+      const int tidx = MFEM_THREAD_ID(x);
-+      const int tidy = MFEM_THREAD_ID(y);
-+      const int tidz = MFEM_THREAD_ID(z);
-+
-+      if (tidz == 0)
-+      {
-+         MFEM_FOREACH_THREAD(d,y,D1D)
-+         {
-+            MFEM_FOREACH_THREAD(q,x,Q1D)
-+            {
-+               sBc[d][q] = Bc(q,d);
-+               sGc[d][q] = Gc(q,d);
-+               if (d < D1D-1)
-+               {
-+                  sBo[d][q] = Bo(q,d);
-+               }
-+            }
-+         }
-+      }
-+      MFEM_SYNC_THREAD;
-+
-+      for (int qz=0; qz < Q1D; ++qz)
-+      {
-+         if (tidz == qz)
-+         {
-+            MFEM_FOREACH_THREAD(qy,y,Q1D)
-+            {
-+               MFEM_FOREACH_THREAD(qx,x,Q1D)
-+               {
-+                  for (int i=0; i<3; ++i)
-+                  {
-+                     mass[qy][qx][i] = 0.0;
-+                  }
-+               }
-+            }
-+         }
-+
-+         int osc = 0;
-+         for (int c = 0; c < VDIM; ++c)  // loop over x, y, z components
-+         {
-+            const int D1Dz = (c == 2) ? D1D - 1 : D1D;
-+            const int D1Dy = (c == 1) ? D1D - 1 : D1D;
-+            const int D1Dx = (c == 0) ? D1D - 1 : D1D;
-+
-+            MFEM_FOREACH_THREAD(dz,z,D1Dz)
-+            {
-+               MFEM_FOREACH_THREAD(dy,y,D1Dy)
-+               {
-+                  MFEM_FOREACH_THREAD(dx,x,D1Dx)
-+                  {
-+                     sX[dz][dy][dx] = X(dx + ((dy + (dz * D1Dy)) * D1Dx) + osc, e);
-+                  }
-+               }
-+            }
-+            MFEM_SYNC_THREAD;
-+
-+            if (tidz == qz)
-+            {
-+               if (c == 0)
-+               {
-+                  for (int i=0; i<coeffDim; ++i)
-+                  {
-+                     sop[i][tidx][tidy] = opc[i];
-+                  }
-+               }
-+
-+               MFEM_FOREACH_THREAD(qy,y,Q1D)
-+               {
-+                  MFEM_FOREACH_THREAD(qx,x,Q1D)
-+                  {
-+                     double u = 0.0;
-+
-+                     for (int dz = 0; dz < D1Dz; ++dz)
-+                     {
-+                        const double wz = (c == 2) ? sBo[dz][qz] : sBc[dz][qz];
-+
-+                        for (int dy = 0; dy < D1Dy; ++dy)
-+                        {
-+                           const double wy = (c == 1) ? sBo[dy][qy] : sBc[dy][qy];
-+
-+                           for (int dx = 0; dx < D1Dx; ++dx)
-+                           {
-+                              const double wx = sX[dz][dy][dx] * ((c == 0) ? sBo[dx][qx] : sBc[dx][qx]);
-+                              u += wx * wy * wz;
-+                           }
-+                        }
-+                     }
-+
-+                     mass[qy][qx][c] += u;
-+                  } // qx
-+               } // qy
-+            } // tidz == qz
-+
-+            osc += D1Dx * D1Dy * D1Dz;
-+            MFEM_SYNC_THREAD;
-+         } // c
-+
-+         double dxyz1 = 0.0;
-+         double dxyz2 = 0.0;
-+         double dxyz3 = 0.0;
-+
-+         MFEM_FOREACH_THREAD(dz,z,D1D)
-+         {
-+            const double wcz = sBc[dz][qz];
-+            const double wcDz = sGc[dz][qz];
-+            const double wz = (dz < D1D-1) ? sBo[dz][qz] : 0.0;
-+
-+            MFEM_FOREACH_THREAD(dy,y,D1D)
-+            {
-+               MFEM_FOREACH_THREAD(dx,x,D1D)
-+               {
-+                  for (int qy = 0; qy < Q1D; ++qy)
-+                  {
-+                     const double wcy = sBc[dy][qy];
-+                     const double wcDy = sGc[dy][qy];
-+                     const double wy = (dy < D1D-1) ? sBo[dy][qy] : 0.0;
-+
-+                     for (int qx = 0; qx < Q1D; ++qx)
-+                     {
-+                        const double O11 = sop[0][qx][qy];
-+                        double c1, c2, c3;
-+                        if (coeffDim == 1)
-+                        {
-+                           c1 = O11 * mass[qy][qx][0];
-+                           c2 = O11 * mass[qy][qx][1];
-+                           c3 = O11 * mass[qy][qx][2];
-+                        }
-+                        else
-+                        {
-+                           const double O12 = sop[1][qx][qy];
-+                           const double O13 = sop[2][qx][qy];
-+                           const double O21 = sop[3][qx][qy];
-+                           const double O22 = sop[4][qx][qy];
-+                           const double O23 = sop[5][qx][qy];
-+                           const double O31 = sop[6][qx][qy];
-+                           const double O32 = sop[7][qx][qy];
-+                           const double O33 = sop[8][qx][qy];
-+
-+                           c1 = (O11*mass[qy][qx][0])+(O12*mass[qy][qx][1])+(O13*mass[qy][qx][2]);
-+                           c2 = (O21*mass[qy][qx][0])+(O22*mass[qy][qx][1])+(O23*mass[qy][qx][2]);
-+                           c3 = (O31*mass[qy][qx][0])+(O32*mass[qy][qx][1])+(O33*mass[qy][qx][2]);
-+                        }
-+
-+                        const double wcx = sBc[dx][qx];
-+                        const double wDx = sGc[dx][qx];
-+
-+                        if (dx < D1D-1)
-+                        {
-+                           const double wx = sBo[dx][qx];
-+                           dxyz1 += (wx * c2 * wcy * wcDz) - (wx * c3 * wcDy * wcz);
-+                        }
-+
-+                        dxyz2 += (-wy * c1 * wcx * wcDz) + (wy * c3 * wDx * wcz);
-+
-+                        dxyz3 += (wcDy * wz * c1 * wcx) - (wcy * wz * c2 * wDx);
-+                     } // qx
-+                  } // qy
-+               } // dx
-+            } // dy
-+         } // dz
-+
-+         MFEM_SYNC_THREAD;
-+
-+         MFEM_FOREACH_THREAD(dz,z,D1D)
-+         {
-+            MFEM_FOREACH_THREAD(dy,y,D1D)
-+            {
-+               MFEM_FOREACH_THREAD(dx,x,D1D)
-+               {
-+                  if (dx < D1D-1)
-+                  {
-+                     Y(dx + ((dy + (dz * D1D)) * (D1D-1)), e) += dxyz1;
-+                  }
-+                  if (dy < D1D-1)
-+                  {
-+                     Y(dx + ((dy + (dz * (D1D-1))) * D1D) + ((D1D-1)*D1D*D1D), e) += dxyz2;
-+                  }
-+                  if (dz < D1D-1)
-+                  {
-+                     Y(dx + ((dy + (dz * D1D)) * D1D) + (2*(D1D-1)*D1D*D1D), e) += dxyz3;
-+                  }
-+               }
-+            }
-+         }
-+      } // qz
-+   }; // end of element loop
-+
-+   auto host_kernel = [&] MFEM_LAMBDA (int)
-+   {
-+      MFEM_ABORT_KERNEL("This kernel should only be used on GPU.");
-+   };
-+
-+   ForallWrap<3>(true, NE, device_kernel, host_kernel, Q1D, Q1D, Q1D);
-+}
-+
-+} // namespace internal
-+
-+} // namespace mfem
-+
-+#endif
-diff --git a/fem/integ/bilininteg_hcurlhdiv_kernels.hpp b/fem/integ/bilininteg_hcurlhdiv_kernels.hpp
-new file mode 100644
-index 000000000..c7165654a
---- /dev/null
-+++ b/fem/integ/bilininteg_hcurlhdiv_kernels.hpp
-@@ -0,0 +1,1303 @@
-+// Copyright (c) 2010-2023, Lawrence Livermore National Security, LLC. Produced
-+// at the Lawrence Livermore National Laboratory. All Rights reserved. See files
-+// LICENSE and NOTICE for details. LLNL-CODE-806117.
-+//
-+// This file is part of the MFEM library. For more information and source code
-+// availability visit https://mfem.org.
-+//
-+// MFEM is free software; you can redistribute it and/or modify it under the
-+// terms of the BSD-3 license.  We welcome feedback and contributions, see file
-+// CONTRIBUTING.md for details.
-+
-+#ifndef MFEM_BILININTEG_HCURLHDIV_KERNELS_HPP
-+#define MFEM_BILININTEG_HCURLHDIV_KERNELS_HPP
-+
-+#include "../../config/config.hpp"
-+#include "../../general/forall.hpp"
-+#include "../../linalg/dtensor.hpp"
-+
-+namespace mfem
-+{
-+
-+namespace internal
-+{
-+
-+// PA H(curl) x H(div) mass assemble 2D kernel, with factor
-+// dF^{-1} C dF for a vector or matrix coefficient C.
-+// If transpose, use dF^T C dF^{-T} for H(div) x H(curl).
-+MFEM_HOST_DEVICE inline
-+void PAHcurlHdivMassSetup2D(const int Q1D,
-+                            const int coeffDim,
-+                            const int NE,
-+                            const bool transpose,
-+                            const Array<double> &w_,
-+                            const Vector &j,
-+                            Vector &coeff_,
-+                            Vector &op)
-+{
-+   const bool symmetric = (coeffDim != 4);
-+   auto W = Reshape(w_.Read(), Q1D, Q1D);
-+   auto J = Reshape(j.Read(), Q1D, Q1D, 2, 2, NE);
-+   auto coeff = Reshape(coeff_.Read(), coeffDim, Q1D, Q1D, NE);
-+   auto y = Reshape(op.Write(), 4, Q1D, Q1D, NE);
-+
-+   const int i11 = 0;
-+   const int i12 = transpose ? 2 : 1;
-+   const int i21 = transpose ? 1 : 2;
-+   const int i22 = 3;
-+
-+   mfem::forall_2D(NE, Q1D, Q1D, [=] MFEM_HOST_DEVICE (int e)
-+   {
-+      MFEM_FOREACH_THREAD(qx,x,Q1D)
-+      {
-+         MFEM_FOREACH_THREAD(qy,y,Q1D)
-+         {
-+            const double J11 = J(qx,qy,0,0,e);
-+            const double J21 = J(qx,qy,1,0,e);
-+            const double J12 = J(qx,qy,0,1,e);
-+            const double J22 = J(qx,qy,1,1,e);
-+            const double w_detJ = W(qx,qy) / ((J11*J22) - (J21*J12));
-+
-+            if (coeffDim == 3 || coeffDim == 4) // Matrix coefficient version
-+            {
-+               // First compute entries of R = MJ
-+               const double M11 = coeff(i11,qx,qy,e);
-+               const double M12 = (!symmetric) ? coeff(i12,qx,qy,e) : coeff(1,qx,qy,e);
-+               const double M21 = (!symmetric) ? coeff(i21,qx,qy,e) : M12;
-+               const double M22 = (!symmetric) ? coeff(i22,qx,qy,e) : coeff(2,qx,qy,e);
-+
-+               // J^{-1} M^T
-+               const double R11 = ( J22*M11 - J12*M12); // 1,1
-+               const double R12 = ( J22*M21 - J12*M22); // 1,2
-+               const double R21 = (-J21*M11 + J11*M12); // 2,1
-+               const double R22 = (-J21*M21 + J11*M22); // 2,2
-+
-+               // (RJ)^T
-+               y(i11,qx,qy,e) = w_detJ * (R11*J11 + R12*J21); // 1,1
-+               y(i21,qx,qy,e) = w_detJ * (R11*J12 + R12*J22); // 1,2 (transpose)
-+               y(i12,qx,qy,e) = w_detJ * (R21*J11 + R22*J21); // 2,1 (transpose)
-+               y(i22,qx,qy,e) = w_detJ * (R21*J12 + R22*J22); // 2,2
-+            }
-+            else if (coeffDim == 2) // Vector coefficient version
-+            {
-+               const double D1 = coeff(0,qx,qy,e);
-+               const double D2 = coeff(1,qx,qy,e);
-+               const double R11 = D1*J11;
-+               const double R12 = D1*J12;
-+               const double R21 = D2*J21;
-+               const double R22 = D2*J22;
-+               y(i11,qx,qy,e) = w_detJ * ( J22*R11 - J12*R21); // 1,1
-+               y(i21,qx,qy,e) = w_detJ * ( J22*R12 - J12*R22); // 1,2 (transpose)
-+               y(i12,qx,qy,e) = w_detJ * (-J21*R11 + J11*R21); // 2,1 (transpose)
-+               y(i22,qx,qy,e) = w_detJ * (-J21*R12 + J11*R22); // 2,2
-+            }
-+         }
-+      }
-+   });
-+}
-+
-+// PA H(curl) x H(div) mass assemble 3D kernel, with factor
-+// dF^{-1} C dF for a vector or matrix coefficient C.
-+// If transpose, use dF^T C dF^{-T} for H(div) x H(curl).
-+MFEM_HOST_DEVICE inline
-+void PAHcurlHdivMassSetup3D(const int Q1D,
-+                            const int coeffDim,
-+                            const int NE,
-+                            const bool transpose,
-+                            const Array<double> &w_,
-+                            const Vector &j,
-+                            Vector &coeff_,
-+                            Vector &op)
-+{
-+   const bool symmetric = (coeffDim != 9);
-+   auto W = Reshape(w_.Read(), Q1D, Q1D, Q1D);
-+   auto J = Reshape(j.Read(), Q1D, Q1D, Q1D, 3, 3, NE);
-+   auto coeff = Reshape(coeff_.Read(), coeffDim, Q1D, Q1D, Q1D, NE);
-+   auto y = Reshape(op.Write(), 9, Q1D, Q1D, Q1D, NE);
-+
-+   const int i11 = 0;
-+   const int i12 = transpose ? 3 : 1;
-+   const int i13 = transpose ? 6 : 2;
-+   const int i21 = transpose ? 1 : 3;
-+   const int i22 = 4;
-+   const int i23 = transpose ? 7 : 5;
-+   const int i31 = transpose ? 2 : 6;
-+   const int i32 = transpose ? 5 : 7;
-+   const int i33 = 8;
-+
-+   mfem::forall_3D(NE, Q1D, Q1D, Q1D, [=] MFEM_HOST_DEVICE (int e)
-+   {
-+      MFEM_FOREACH_THREAD(qx,x,Q1D)
-+      {
-+         MFEM_FOREACH_THREAD(qy,y,Q1D)
-+         {
-+            MFEM_FOREACH_THREAD(qz,z,Q1D)
-+            {
-+               const double J11 = J(qx,qy,qz,0,0,e);
-+               const double J21 = J(qx,qy,qz,1,0,e);
-+               const double J31 = J(qx,qy,qz,2,0,e);
-+               const double J12 = J(qx,qy,qz,0,1,e);
-+               const double J22 = J(qx,qy,qz,1,1,e);
-+               const double J32 = J(qx,qy,qz,2,1,e);
-+               const double J13 = J(qx,qy,qz,0,2,e);
-+               const double J23 = J(qx,qy,qz,1,2,e);
-+               const double J33 = J(qx,qy,qz,2,2,e);
-+               const double detJ = J11 * (J22 * J33 - J32 * J23) -
-+                                   J21 * (J12 * J33 - J32 * J13) +
-+                                   J31 * (J12 * J23 - J22 * J13);
-+               const double w_detJ = W(qx,qy,qz) / detJ;
-+               // adj(J)
-+               const double A11 = (J22 * J33) - (J23 * J32);
-+               const double A12 = (J32 * J13) - (J12 * J33);
-+               const double A13 = (J12 * J23) - (J22 * J13);
-+               const double A21 = (J31 * J23) - (J21 * J33);
-+               const double A22 = (J11 * J33) - (J13 * J31);
-+               const double A23 = (J21 * J13) - (J11 * J23);
-+               const double A31 = (J21 * J32) - (J31 * J22);
-+               const double A32 = (J31 * J12) - (J11 * J32);
-+               const double A33 = (J11 * J22) - (J12 * J21);
-+
-+               if (coeffDim == 6 || coeffDim == 9) // Matrix coefficient version
-+               {
-+                  // First compute entries of R = M^T J
-+                  const double M11 = (!symmetric) ? coeff(i11,qx,qy,qz,e) : coeff(0,qx,qy,qz,e);
-+                  const double M12 = (!symmetric) ? coeff(i12,qx,qy,qz,e) : coeff(1,qx,qy,qz,e);
-+                  const double M13 = (!symmetric) ? coeff(i13,qx,qy,qz,e) : coeff(2,qx,qy,qz,e);
-+                  const double M21 = (!symmetric) ? coeff(i21,qx,qy,qz,e) : M12;
-+                  const double M22 = (!symmetric) ? coeff(i22,qx,qy,qz,e) : coeff(3,qx,qy,qz,e);
-+                  const double M23 = (!symmetric) ? coeff(i23,qx,qy,qz,e) : coeff(4,qx,qy,qz,e);
-+                  const double M31 = (!symmetric) ? coeff(i31,qx,qy,qz,e) : M13;
-+                  const double M32 = (!symmetric) ? coeff(i32,qx,qy,qz,e) : M23;
-+                  const double M33 = (!symmetric) ? coeff(i33,qx,qy,qz,e) : coeff(5,qx,qy,qz,e);
-+
-+                  const double R11 = M11*J11 + M21*J21 + M31*J31;
-+                  const double R12 = M11*J12 + M21*J22 + M31*J32;
-+                  const double R13 = M11*J13 + M21*J23 + M31*J33;
-+                  const double R21 = M12*J11 + M22*J21 + M32*J31;
-+                  const double R22 = M12*J12 + M22*J22 + M32*J32;
-+                  const double R23 = M12*J13 + M22*J23 + M32*J33;
-+                  const double R31 = M13*J11 + M23*J21 + M33*J31;
-+                  const double R32 = M13*J12 + M23*J22 + M33*J32;
-+                  const double R33 = M13*J13 + M23*J23 + M33*J33;
-+
-+                  // y = (J^{-1} M^T J)^T
-+                  y(i11,qx,qy,qz,e) = w_detJ * (A11*R11 + A12*R21 + A13*R31); // 1,1
-+                  y(i21,qx,qy,qz,e) = w_detJ * (A11*R12 + A12*R22 + A13*R32); // 1,2
-+                  y(i31,qx,qy,qz,e) = w_detJ * (A11*R13 + A12*R23 + A13*R33); // 1,3
-+                  y(i12,qx,qy,qz,e) = w_detJ * (A21*R11 + A22*R21 + A23*R31); // 2,1
-+                  y(i22,qx,qy,qz,e) = w_detJ * (A21*R12 + A22*R22 + A23*R32); // 2,2
-+                  y(i32,qx,qy,qz,e) = w_detJ * (A21*R13 + A22*R23 + A23*R33); // 2,3
-+                  y(i13,qx,qy,qz,e) = w_detJ * (A31*R11 + A32*R21 + A33*R31); // 3,1
-+                  y(i23,qx,qy,qz,e) = w_detJ * (A31*R12 + A32*R22 + A33*R32); // 3,2
-+                  y(i33,qx,qy,qz,e) = w_detJ * (A31*R13 + A32*R23 + A33*R33); // 3,3
-+               }
-+               else if (coeffDim == 3)  // Vector coefficient version
-+               {
-+                  const double D1 = coeff(0,qx,qy,qz,e);
-+                  const double D2 = coeff(1,qx,qy,qz,e);
-+                  const double D3 = coeff(2,qx,qy,qz,e);
-+                  // detJ J^{-1} DJ = adj(J) DJ
-+                  // transpose
-+                  y(i11,qx,qy,qz,e) = w_detJ * (D1*A11*J11 + D2*A12*J21 + D3*A13*J31); // 1,1
-+                  y(i21,qx,qy,qz,e) = w_detJ * (D1*A11*J12 + D2*A12*J22 + D3*A13*J32); // 1,2
-+                  y(i31,qx,qy,qz,e) = w_detJ * (D1*A11*J13 + D2*A12*J23 + D3*A13*J33); // 1,3
-+                  y(i12,qx,qy,qz,e) = w_detJ * (D1*A21*J11 + D2*A22*J21 + D3*A23*J31); // 2,1
-+                  y(i22,qx,qy,qz,e) = w_detJ * (D1*A21*J12 + D2*A22*J22 + D3*A23*J32); // 2,2
-+                  y(i32,qx,qy,qz,e) = w_detJ * (D1*A21*J13 + D2*A22*J23 + D3*A23*J33); // 2,3
-+                  y(i13,qx,qy,qz,e) = w_detJ * (D1*A31*J11 + D2*A32*J21 + D3*A33*J31); // 3,1
-+                  y(i23,qx,qy,qz,e) = w_detJ * (D1*A31*J12 + D2*A32*J22 + D3*A33*J32); // 3,2
-+                  y(i33,qx,qy,qz,e) = w_detJ * (D1*A31*J13 + D2*A32*J23 + D3*A33*J33); // 3,3
-+               }
-+            }
-+         }
-+      }
-+   });
-+}
-+
-+// Mass operator for H(curl) and H(div) functions, using Piola transformations
-+// u = dF^{-T} \hat{u} in H(curl), v = (1 / det dF) dF \hat{v} in H(div).
-+MFEM_HOST_DEVICE inline
-+void PAHcurlHdivMassApply2D(const int D1D,
-+                            const int D1Dtest,
-+                            const int Q1D,
-+                            const int NE,
-+                            const bool scalarCoeff,
-+                            const bool trialHcurl,
-+                            const bool transpose,
-+                            const Array<double> &Bo_,
-+                            const Array<double> &Bc_,
-+                            const Array<double> &Bot_,
-+                            const Array<double> &Bct_,
-+                            const Vector &op_,
-+                            const Vector &x_,
-+                            Vector &y_)
-+{
-+   constexpr static int MAX_D1D = HCURL_MAX_D1D;
-+   constexpr static int MAX_Q1D = HCURL_MAX_Q1D;
-+
-+   MFEM_VERIFY(D1D <= MAX_D1D, "Error: D1D > MAX_D1D");
-+   MFEM_VERIFY(Q1D <= MAX_Q1D, "Error: Q1D > MAX_Q1D");
-+   constexpr static int VDIM = 2;
-+
-+   auto Bo = Reshape(Bo_.Read(), Q1D, D1D-1);
-+   auto Bc = Reshape(Bc_.Read(), Q1D, D1D);
-+   auto Bot = Reshape(Bot_.Read(), D1Dtest-1, Q1D);
-+   auto Bct = Reshape(Bct_.Read(), D1Dtest, Q1D);
-+   auto op = Reshape(op_.Read(), scalarCoeff ? 1 : 4, Q1D, Q1D, NE);
-+   auto x = Reshape(x_.Read(), 2*(D1D-1)*D1D, NE);
-+   auto y = Reshape(y_.ReadWrite(), 2*(D1Dtest-1)*D1Dtest, NE);
-+
-+   const int i12 = transpose ? 2 : 1;
-+   const int i21 = transpose ? 1 : 2;
-+
-+   mfem::forall(NE, [=] MFEM_HOST_DEVICE (int e)
-+   {
-+      double mass[MAX_Q1D][MAX_Q1D][VDIM];
-+
-+      for (int qy = 0; qy < Q1D; ++qy)
-+      {
-+         for (int qx = 0; qx < Q1D; ++qx)
-+         {
-+            for (int c = 0; c < VDIM; ++c)
-+            {
-+               mass[qy][qx][c] = 0.0;
-+            }
-+         }
-+      }
-+
-+      int osc = 0;
-+      for (int c = 0; c < VDIM; ++c)  // loop over x, y trial components
-+      {
-+         const int D1Dy = trialHcurl ? ((c == 1) ? D1D - 1 : D1D) :
-+                          ((c == 1) ? D1D : D1D - 1);
-+         const int D1Dx = trialHcurl ? ((c == 0) ? D1D - 1 : D1D) :
-+                          ((c == 0) ? D1D : D1D - 1);
-+
-+         for (int dy = 0; dy < D1Dy; ++dy)
-+         {
-+            double massX[MAX_Q1D];
-+            for (int qx = 0; qx < Q1D; ++qx)
-+            {
-+               massX[qx] = 0.0;
-+            }
-+
-+            for (int dx = 0; dx < D1Dx; ++dx)
-+            {
-+               const double t = x(dx + (dy * D1Dx) + osc, e);
-+               for (int qx = 0; qx < Q1D; ++qx)
-+               {
-+                  massX[qx] += t * (trialHcurl ? ((c == 0) ? Bo(qx,dx) : Bc(qx,dx)) :
-+                                    ((c == 0) ? Bc(qx,dx) : Bo(qx,dx)));
-+               }
-+            }
-+
-+            for (int qy = 0; qy < Q1D; ++qy)
-+            {
-+               const double wy = trialHcurl ? ((c == 1) ? Bo(qy,dy) : Bc(qy,dy)) :
-+                                 ((c == 1) ? Bc(qy,dy) : Bo(qy,dy));
-+               for (int qx = 0; qx < Q1D; ++qx)
-+               {
-+                  mass[qy][qx][c] += massX[qx] * wy;
-+               }
-+            }
-+         }
-+
-+         osc += D1Dx * D1Dy;
-+      }  // loop (c) over components
-+
-+      // Apply D operator.
-+      for (int qy = 0; qy < Q1D; ++qy)
-+      {
-+         for (int qx = 0; qx < Q1D; ++qx)
-+         {
-+            const double O11 = op(0,qx,qy,e);
-+            const double O12 = scalarCoeff ? 0.0 : op(i12,qx,qy,e);
-+            const double O21 = scalarCoeff ? 0.0 : op(i21,qx,qy,e);
-+            const double O22 = scalarCoeff ? O11 : op(3,qx,qy,e);
-+            const double massX = mass[qy][qx][0];
-+            const double massY = mass[qy][qx][1];
-+            mass[qy][qx][0] = (O11*massX)+(O12*massY);
-+            mass[qy][qx][1] = (O21*massX)+(O22*massY);
-+         }
-+      }
-+
-+      osc = 0;
-+      for (int c = 0; c < VDIM; ++c)  // loop over x, y test components
-+      {
-+         const int D1Dy = trialHcurl ? ((c == 1) ? D1Dtest : D1Dtest - 1) :
-+                          ((c == 1) ? D1Dtest - 1 : D1Dtest);
-+         const int D1Dx = trialHcurl ? ((c == 0) ? D1Dtest : D1Dtest - 1) :
-+                          ((c == 0) ? D1Dtest - 1 : D1Dtest);
-+
-+         for (int qy = 0; qy < Q1D; ++qy)
-+         {
-+            double massX[HDIV_MAX_D1D];
-+            for (int dx = 0; dx < D1Dx; ++dx)
-+            {
-+               massX[dx] = 0.0;
-+            }
-+            for (int qx = 0; qx < Q1D; ++qx)
-+            {
-+               for (int dx = 0; dx < D1Dx; ++dx)
-+               {
-+                  massX[dx] += mass[qy][qx][c] * (trialHcurl ?
-+                                                  ((c == 0) ? Bct(dx,qx) : Bot(dx,qx)) :
-+                                                  ((c == 0) ? Bot(dx,qx) : Bct(dx,qx)));
-+               }
-+            }
-+            for (int dy = 0; dy < D1Dy; ++dy)
-+            {
-+               const double wy = trialHcurl ? ((c == 1) ? Bct(dy,qy) : Bot(dy,qy)) :
-+                                 ((c == 1) ? Bot(dy,qy) : Bct(dy,qy));
-+               for (int dx = 0; dx < D1Dx; ++dx)
-+               {
-+                  y(dx + (dy * D1Dx) + osc, e) += massX[dx] * wy;
-+               }
-+            }
-+         }
-+
-+         osc += D1Dx * D1Dy;
-+      }  // loop c
-+   }); // end of element loop
-+}
-+
-+// Mass operator for H(curl) and H(div) functions, using Piola transformations
-+// u = dF^{-T} \hat{u} in H(curl), v = (1 / det dF) dF \hat{v} in H(div).
-+MFEM_HOST_DEVICE inline
-+void PAHcurlHdivMassApply3D(const int D1D,
-+                            const int D1Dtest,
-+                            const int Q1D,
-+                            const int NE,
-+                            const bool scalarCoeff,
-+                            const bool trialHcurl,
-+                            const bool transpose,
-+                            const Array<double> &Bo_,
-+                            const Array<double> &Bc_,
-+                            const Array<double> &Bot_,
-+                            const Array<double> &Bct_,
-+                            const Vector &op_,
-+                            const Vector &x_,
-+                            Vector &y_)
-+{
-+   constexpr static int MAX_D1D = HCURL_MAX_D1D;
-+   constexpr static int MAX_Q1D = HCURL_MAX_Q1D;
-+
-+   MFEM_VERIFY(D1D <= MAX_D1D, "Error: D1D > MAX_D1D");
-+   MFEM_VERIFY(Q1D <= MAX_Q1D, "Error: Q1D > MAX_Q1D");
-+   constexpr static int VDIM = 3;
-+
-+   auto Bo = Reshape(Bo_.Read(), Q1D, D1D-1);
-+   auto Bc = Reshape(Bc_.Read(), Q1D, D1D);
-+   auto Bot = Reshape(Bot_.Read(), D1Dtest-1, Q1D);
-+   auto Bct = Reshape(Bct_.Read(), D1Dtest, Q1D);
-+   auto op = Reshape(op_.Read(), scalarCoeff ? 1 : 9, Q1D, Q1D, Q1D, NE);
-+   auto x = Reshape(x_.Read(), 3*(D1D-1)*D1D*(trialHcurl ? D1D : D1D-1), NE);
-+   auto y = Reshape(y_.ReadWrite(), 3*(D1Dtest-1)*D1Dtest*
-+                    (trialHcurl ? D1Dtest-1 : D1Dtest), NE);
-+
-+   const int i12 = transpose ? 3 : 1;
-+   const int i13 = transpose ? 6 : 2;
-+   const int i21 = transpose ? 1 : 3;
-+   const int i23 = transpose ? 7 : 5;
-+   const int i31 = transpose ? 2 : 6;
-+   const int i32 = transpose ? 5 : 7;
-+
-+   mfem::forall(NE, [=] MFEM_HOST_DEVICE (int e)
-+   {
-+      double mass[MAX_Q1D][MAX_Q1D][MAX_Q1D][VDIM];
-+
-+      for (int qz = 0; qz < Q1D; ++qz)
-+      {
-+         for (int qy = 0; qy < Q1D; ++qy)
-+         {
-+            for (int qx = 0; qx < Q1D; ++qx)
-+            {
-+               for (int c = 0; c < VDIM; ++c)
-+               {
-+                  mass[qz][qy][qx][c] = 0.0;
-+               }
-+            }
-+         }
-+      }
-+
-+      int osc = 0;
-+      for (int c = 0; c < VDIM; ++c)  // loop over x, y, z trial components
-+      {
-+         const int D1Dz = trialHcurl ? ((c == 2) ? D1D - 1 : D1D) :
-+                          ((c == 2) ? D1D : D1D - 1);
-+         const int D1Dy = trialHcurl ? ((c == 1) ? D1D - 1 : D1D) :
-+                          ((c == 1) ? D1D : D1D - 1);
-+         const int D1Dx = trialHcurl ? ((c == 0) ? D1D - 1 : D1D) :
-+                          ((c == 0) ? D1D : D1D - 1);
-+
-+         for (int dz = 0; dz < D1Dz; ++dz)
-+         {
-+            double massXY[MAX_Q1D][MAX_Q1D];
-+            for (int qy = 0; qy < Q1D; ++qy)
-+            {
-+               for (int qx = 0; qx < Q1D; ++qx)
-+               {
-+                  massXY[qy][qx] = 0.0;
-+               }
-+            }
-+
-+            for (int dy = 0; dy < D1Dy; ++dy)
-+            {
-+               double massX[MAX_Q1D];
-+               for (int qx = 0; qx < Q1D; ++qx)
-+               {
-+                  massX[qx] = 0.0;
-+               }
-+
-+               for (int dx = 0; dx < D1Dx; ++dx)
-+               {
-+                  const double t = x(dx + ((dy + (dz * D1Dy)) * D1Dx) + osc, e);
-+                  for (int qx = 0; qx < Q1D; ++qx)
-+                  {
-+                     massX[qx] += t * (trialHcurl ? ((c == 0) ? Bo(qx,dx) : Bc(qx,dx)) :
-+                                       ((c == 0) ? Bc(qx,dx) : Bo(qx,dx)));
-+                  }
-+               }
-+
-+               for (int qy = 0; qy < Q1D; ++qy)
-+               {
-+                  const double wy = trialHcurl ? ((c == 1) ? Bo(qy,dy) : Bc(qy,dy)) :
-+                                    ((c == 1) ? Bc(qy,dy) : Bo(qy,dy));
-+                  for (int qx = 0; qx < Q1D; ++qx)
-+                  {
-+                     const double wx = massX[qx];
-+                     massXY[qy][qx] += wx * wy;
-+                  }
-+               }
-+            }
-+
-+            for (int qz = 0; qz < Q1D; ++qz)
-+            {
-+               const double wz = trialHcurl ? ((c == 2) ? Bo(qz,dz) : Bc(qz,dz)) :
-+                                 ((c == 2) ? Bc(qz,dz) : Bo(qz,dz));
-+               for (int qy = 0; qy < Q1D; ++qy)
-+               {
-+                  for (int qx = 0; qx < Q1D; ++qx)
-+                  {
-+                     mass[qz][qy][qx][c] += massXY[qy][qx] * wz;
-+                  }
-+               }
-+            }
-+         }
-+
-+         osc += D1Dx * D1Dy * D1Dz;
-+      }  // loop (c) over components
-+
-+      // Apply D operator.
-+      for (int qz = 0; qz < Q1D; ++qz)
-+      {
-+         for (int qy = 0; qy < Q1D; ++qy)
-+         {
-+            for (int qx = 0; qx < Q1D; ++qx)
-+            {
-+               const double O11 = op(0,qx,qy,qz,e);
-+               const double O12 = scalarCoeff ? 0.0 : op(i12,qx,qy,qz,e);
-+               const double O13 = scalarCoeff ? 0.0 : op(i13,qx,qy,qz,e);
-+               const double O21 = scalarCoeff ? 0.0 : op(i21,qx,qy,qz,e);
-+               const double O22 = scalarCoeff ? O11 : op(4,qx,qy,qz,e);
-+               const double O23 = scalarCoeff ? 0.0 : op(i23,qx,qy,qz,e);
-+               const double O31 = scalarCoeff ? 0.0 : op(i31,qx,qy,qz,e);
-+               const double O32 = scalarCoeff ? 0.0 : op(i32,qx,qy,qz,e);
-+               const double O33 = scalarCoeff ? O11 : op(8,qx,qy,qz,e);
-+               const double massX = mass[qz][qy][qx][0];
-+               const double massY = mass[qz][qy][qx][1];
-+               const double massZ = mass[qz][qy][qx][2];
-+               mass[qz][qy][qx][0] = (O11*massX)+(O12*massY)+(O13*massZ);
-+               mass[qz][qy][qx][1] = (O21*massX)+(O22*massY)+(O23*massZ);
-+               mass[qz][qy][qx][2] = (O31*massX)+(O32*massY)+(O33*massZ);
-+            }
-+         }
-+      }
-+
-+      for (int qz = 0; qz < Q1D; ++qz)
-+      {
-+         double massXY[HDIV_MAX_D1D][HDIV_MAX_D1D];
-+
-+         osc = 0;
-+         for (int c = 0; c < VDIM; ++c)  // loop over x, y, z test components
-+         {
-+            const int D1Dz = trialHcurl ? ((c == 2) ? D1Dtest : D1Dtest - 1) :
-+                             ((c == 2) ? D1Dtest - 1 : D1Dtest);
-+            const int D1Dy = trialHcurl ? ((c == 1) ? D1Dtest : D1Dtest - 1) :
-+                             ((c == 1) ? D1Dtest - 1 : D1Dtest);
-+            const int D1Dx = trialHcurl ? ((c == 0) ? D1Dtest : D1Dtest - 1) :
-+                             ((c == 0) ? D1Dtest - 1 : D1Dtest);
-+
-+            for (int dy = 0; dy < D1Dy; ++dy)
-+            {
-+               for (int dx = 0; dx < D1Dx; ++dx)
-+               {
-+                  massXY[dy][dx] = 0.0;
-+               }
-+            }
-+            for (int qy = 0; qy < Q1D; ++qy)
-+            {
-+               double massX[HDIV_MAX_D1D];
-+               for (int dx = 0; dx < D1Dx; ++dx)
-+               {
-+                  massX[dx] = 0.0;
-+               }
-+               for (int qx = 0; qx < Q1D; ++qx)
-+               {
-+                  for (int dx = 0; dx < D1Dx; ++dx)
-+                  {
-+                     massX[dx] += mass[qz][qy][qx][c] * (trialHcurl ?
-+                                                         ((c == 0) ? Bct(dx,qx) : Bot(dx,qx)) :
-+                                                         ((c == 0) ? Bot(dx,qx) : Bct(dx,qx)));
-+                  }
-+               }
-+               for (int dy = 0; dy < D1Dy; ++dy)
-+               {
-+                  const double wy = trialHcurl ? ((c == 1) ? Bct(dy,qy) : Bot(dy,qy)) :
-+                                    ((c == 1) ? Bot(dy,qy) : Bct(dy,qy));
-+                  for (int dx = 0; dx < D1Dx; ++dx)
-+                  {
-+                     massXY[dy][dx] += massX[dx] * wy;
-+                  }
-+               }
-+            }
-+
-+            for (int dz = 0; dz < D1Dz; ++dz)
-+            {
-+               const double wz = trialHcurl ? ((c == 2) ? Bct(dz,qz) : Bot(dz,qz)) :
-+                                 ((c == 2) ? Bot(dz,qz) : Bct(dz,qz));
-+               for (int dy = 0; dy < D1Dy; ++dy)
-+               {
-+                  for (int dx = 0; dx < D1Dx; ++dx)
-+                  {
-+                     y(dx + ((dy + (dz * D1Dy)) * D1Dx) + osc, e) +=
-+                        massXY[dy][dx] * wz;
-+                  }
-+               }
-+            }
-+
-+            osc += D1Dx * D1Dy * D1Dz;
-+         }  // loop c
-+      }  // loop qz
-+   }); // end of element loop
-+}
-+
-+// Apply to x corresponding to DOFs in H(curl) (trial), whose curl is
-+// integrated against H(div) test functions corresponding to y.
-+template<int MAX_D1D = HCURL_MAX_D1D, int MAX_Q1D = HCURL_MAX_Q1D>
-+MFEM_HOST_DEVICE inline
-+void PAHcurlHdivApply3D(const int D1D,
-+                        const int D1Dtest,
-+                        const int Q1D,
-+                        const int NE,
-+                        const Array<double> &bo,
-+                        const Array<double> &bc,
-+                        const Array<double> &bot,
-+                        const Array<double> &bct,
-+                        const Array<double> &gc,
-+                        const Vector &pa_data,
-+                        const Vector &x,
-+                        Vector &y)
-+{
-+   MFEM_VERIFY(D1D <= MAX_D1D, "Error: D1D > MAX_D1D");
-+   MFEM_VERIFY(Q1D <= MAX_Q1D, "Error: Q1D > MAX_Q1D");
-+   // Using Piola transformations (\nabla\times u) F = 1/det(dF) dF \hat{\nabla}\times\hat{u}
-+   // for u in H(curl) and w = (1 / det (dF)) dF \hat{w} for w in H(div), we get
-+   // (\nabla\times u) \cdot w = 1/det(dF)^2 \hat{\nabla}\times\hat{u}^T dF^T dF \hat{w}
-+   // If c = 0, \hat{\nabla}\times\hat{u} reduces to [0, (u_0)_{x_2}, -(u_0)_{x_1}]
-+   // If c = 1, \hat{\nabla}\times\hat{u} reduces to [-(u_1)_{x_2}, 0, (u_1)_{x_0}]
-+   // If c = 2, \hat{\nabla}\times\hat{u} reduces to [(u_2)_{x_1}, -(u_2)_{x_0}, 0]
-+
-+   constexpr static int VDIM = 3;
-+
-+   auto Bo = Reshape(bo.Read(), Q1D, D1D-1);
-+   auto Bc = Reshape(bc.Read(), Q1D, D1D);
-+   auto Bot = Reshape(bot.Read(), D1Dtest-1, Q1D);
-+   auto Bct = Reshape(bct.Read(), D1Dtest, Q1D);
-+   auto Gc = Reshape(gc.Read(), Q1D, D1D);
-+   auto op = Reshape(pa_data.Read(), Q1D, Q1D, Q1D, 6, NE);
-+   auto X = Reshape(x.Read(), 3*(D1D-1)*D1D*D1D, NE);
-+   auto Y = Reshape(y.ReadWrite(), 3*(D1Dtest-1)*(D1Dtest-1)*D1D, NE);
-+
-+   mfem::forall(NE, [=] MFEM_HOST_DEVICE (int e)
-+   {
-+      double curl[MAX_Q1D][MAX_Q1D][MAX_Q1D][VDIM];
-+      // curl[qz][qy][qx] will be computed as the vector curl at each quadrature point.
-+
-+      for (int qz = 0; qz < Q1D; ++qz)
-+      {
-+         for (int qy = 0; qy < Q1D; ++qy)
-+         {
-+            for (int qx = 0; qx < Q1D; ++qx)
-+            {
-+               for (int c = 0; c < VDIM; ++c)
-+               {
-+                  curl[qz][qy][qx][c] = 0.0;
-+               }
-+            }
-+         }
-+      }
-+
-+      // We treat x, y, z components separately for optimization specific to each.
-+
-+      int osc = 0;
-+
-+      {
-+         // x component
-+         const int D1Dz = D1D;
-+         const int D1Dy = D1D;
-+         const int D1Dx = D1D - 1;
-+
-+         for (int dz = 0; dz < D1Dz; ++dz)
-+         {
-+            double gradXY[MAX_Q1D][MAX_Q1D][2];
-+            for (int qy = 0; qy < Q1D; ++qy)
-+            {
-+               for (int qx = 0; qx < Q1D; ++qx)
-+               {
-+                  for (int d = 0; d < 2; ++d)
-+                  {
-+                     gradXY[qy][qx][d] = 0.0;
-+                  }
-+               }
-+            }
-+
-+            for (int dy = 0; dy < D1Dy; ++dy)
-+            {
-+               double massX[MAX_Q1D];
-+               for (int qx = 0; qx < Q1D; ++qx)
-+               {
-+                  massX[qx] = 0.0;
-+               }
-+
-+               for (int dx = 0; dx < D1Dx; ++dx)
-+               {
-+                  const double t = X(dx + ((dy + (dz * D1Dy)) * D1Dx) + osc, e);
-+                  for (int qx = 0; qx < Q1D; ++qx)
-+                  {
-+                     massX[qx] += t * Bo(qx,dx);
-+                  }
-+               }
-+
-+               for (int qy = 0; qy < Q1D; ++qy)
-+               {
-+                  const double wy = Bc(qy,dy);
-+                  const double wDy = Gc(qy,dy);
-+                  for (int qx = 0; qx < Q1D; ++qx)
-+                  {
-+                     const double wx = massX[qx];
-+                     gradXY[qy][qx][0] += wx * wDy;
-+                     gradXY[qy][qx][1] += wx * wy;
-+                  }
-+               }
-+            }
-+
-+            for (int qz = 0; qz < Q1D; ++qz)
-+            {
-+               const double wz = Bc(qz,dz);
-+               const double wDz = Gc(qz,dz);
-+               for (int qy = 0; qy < Q1D; ++qy)
-+               {
-+                  for (int qx = 0; qx < Q1D; ++qx)
-+                  {
-+                     // \hat{\nabla}\times\hat{u} is [0, (u_0)_{x_2}, -(u_0)_{x_1}]
-+                     curl[qz][qy][qx][1] += gradXY[qy][qx][1] * wDz; // (u_0)_{x_2}
-+                     curl[qz][qy][qx][2] -= gradXY[qy][qx][0] * wz;  // -(u_0)_{x_1}
-+                  }
-+               }
-+            }
-+         }
-+
-+         osc += D1Dx * D1Dy * D1Dz;
-+      }
-+
-+      {
-+         // y component
-+         const int D1Dz = D1D;
-+         const int D1Dy = D1D - 1;
-+         const int D1Dx = D1D;
-+
-+         for (int dz = 0; dz < D1Dz; ++dz)
-+         {
-+            double gradXY[MAX_Q1D][MAX_Q1D][2];
-+            for (int qy = 0; qy < Q1D; ++qy)
-+            {
-+               for (int qx = 0; qx < Q1D; ++qx)
-+               {
-+                  for (int d = 0; d < 2; ++d)
-+                  {
-+                     gradXY[qy][qx][d] = 0.0;
-+                  }
-+               }
-+            }
-+
-+            for (int dx = 0; dx < D1Dx; ++dx)
-+            {
-+               double massY[MAX_Q1D];
-+               for (int qy = 0; qy < Q1D; ++qy)
-+               {
-+                  massY[qy] = 0.0;
-+               }
-+
-+               for (int dy = 0; dy < D1Dy; ++dy)
-+               {
-+                  const double t = X(dx + ((dy + (dz * D1Dy)) * D1Dx) + osc, e);
-+                  for (int qy = 0; qy < Q1D; ++qy)
-+                  {
-+                     massY[qy] += t * Bo(qy,dy);
-+                  }
-+               }
-+
-+               for (int qx = 0; qx < Q1D; ++qx)
-+               {
-+                  const double wx = Bc(qx,dx);
-+                  const double wDx = Gc(qx,dx);
-+                  for (int qy = 0; qy < Q1D; ++qy)
-+                  {
-+                     const double wy = massY[qy];
-+                     gradXY[qy][qx][0] += wDx * wy;
-+                     gradXY[qy][qx][1] += wx * wy;
-+                  }
-+               }
-+            }
-+
-+            for (int qz = 0; qz < Q1D; ++qz)
-+            {
-+               const double wz = Bc(qz,dz);
-+               const double wDz = Gc(qz,dz);
-+               for (int qy = 0; qy < Q1D; ++qy)
-+               {
-+                  for (int qx = 0; qx < Q1D; ++qx)
-+                  {
-+                     // \hat{\nabla}\times\hat{u} is [-(u_1)_{x_2}, 0, (u_1)_{x_0}]
-+                     curl[qz][qy][qx][0] -= gradXY[qy][qx][1] * wDz; // -(u_1)_{x_2}
-+                     curl[qz][qy][qx][2] += gradXY[qy][qx][0] * wz;  // (u_1)_{x_0}
-+                  }
-+               }
-+            }
-+         }
-+
-+         osc += D1Dx * D1Dy * D1Dz;
-+      }
-+
-+      {
-+         // z component
-+         const int D1Dz = D1D - 1;
-+         const int D1Dy = D1D;
-+         const int D1Dx = D1D;
-+
-+         for (int dx = 0; dx < D1Dx; ++dx)
-+         {
-+            double gradYZ[MAX_Q1D][MAX_Q1D][2];
-+            for (int qz = 0; qz < Q1D; ++qz)
-+            {
-+               for (int qy = 0; qy < Q1D; ++qy)
-+               {
-+                  for (int d = 0; d < 2; ++d)
-+                  {
-+                     gradYZ[qz][qy][d] = 0.0;
-+                  }
-+               }
-+            }
-+
-+            for (int dy = 0; dy < D1Dy; ++dy)
-+            {
-+               double massZ[MAX_Q1D];
-+               for (int qz = 0; qz < Q1D; ++qz)
-+               {
-+                  massZ[qz] = 0.0;
-+               }
-+
-+               for (int dz = 0; dz < D1Dz; ++dz)
-+               {
-+                  const double t = X(dx + ((dy + (dz * D1Dy)) * D1Dx) + osc, e);
-+                  for (int qz = 0; qz < Q1D; ++qz)
-+                  {
-+                     massZ[qz] += t * Bo(qz,dz);
-+                  }
-+               }
-+
-+               for (int qy = 0; qy < Q1D; ++qy)
-+               {
-+                  const double wy = Bc(qy,dy);
-+                  const double wDy = Gc(qy,dy);
-+                  for (int qz = 0; qz < Q1D; ++qz)
-+                  {
-+                     const double wz = massZ[qz];
-+                     gradYZ[qz][qy][0] += wz * wy;
-+                     gradYZ[qz][qy][1] += wz * wDy;
-+                  }
-+               }
-+            }
-+
-+            for (int qx = 0; qx < Q1D; ++qx)
-+            {
-+               const double wx = Bc(qx,dx);
-+               const double wDx = Gc(qx,dx);
-+
-+               for (int qy = 0; qy < Q1D; ++qy)
-+               {
-+                  for (int qz = 0; qz < Q1D; ++qz)
-+                  {
-+                     // \hat{\nabla}\times\hat{u} is [(u_2)_{x_1}, -(u_2)_{x_0}, 0]
-+                     curl[qz][qy][qx][0] += gradYZ[qz][qy][1] * wx;  // (u_2)_{x_1}
-+                     curl[qz][qy][qx][1] -= gradYZ[qz][qy][0] * wDx; // -(u_2)_{x_0}
-+                  }
-+               }
-+            }
-+         }
-+      }
-+
-+      // Apply D operator.
-+      for (int qz = 0; qz < Q1D; ++qz)
-+      {
-+         for (int qy = 0; qy < Q1D; ++qy)
-+         {
-+            for (int qx = 0; qx < Q1D; ++qx)
-+            {
-+               const double O11 = op(qx,qy,qz,0,e);
-+               const double O12 = op(qx,qy,qz,1,e);
-+               const double O13 = op(qx,qy,qz,2,e);
-+               const double O22 = op(qx,qy,qz,3,e);
-+               const double O23 = op(qx,qy,qz,4,e);
-+               const double O33 = op(qx,qy,qz,5,e);
-+
-+               const double c1 = (O11 * curl[qz][qy][qx][0]) + (O12 * curl[qz][qy][qx][1]) +
-+                                 (O13 * curl[qz][qy][qx][2]);
-+               const double c2 = (O12 * curl[qz][qy][qx][0]) + (O22 * curl[qz][qy][qx][1]) +
-+                                 (O23 * curl[qz][qy][qx][2]);
-+               const double c3 = (O13 * curl[qz][qy][qx][0]) + (O23 * curl[qz][qy][qx][1]) +
-+                                 (O33 * curl[qz][qy][qx][2]);
-+
-+               curl[qz][qy][qx][0] = c1;
-+               curl[qz][qy][qx][1] = c2;
-+               curl[qz][qy][qx][2] = c3;
-+            }
-+         }
-+      }
-+
-+      for (int qz = 0; qz < Q1D; ++qz)
-+      {
-+         double massXY[HCURL_MAX_D1D][HCURL_MAX_D1D];  // Assuming HDIV_MAX_D1D <= HCURL_MAX_D1D
-+
-+         osc = 0;
-+
-+         for (int c = 0; c < VDIM; ++c)  // loop over x, y, z components
-+         {
-+            const int D1Dz = (c == 2) ? D1Dtest : D1Dtest - 1;
-+            const int D1Dy = (c == 1) ? D1Dtest : D1Dtest - 1;
-+            const int D1Dx = (c == 0) ? D1Dtest : D1Dtest - 1;
-+
-+            for (int dy = 0; dy < D1Dy; ++dy)
-+            {
-+               for (int dx = 0; dx < D1Dx; ++dx)
-+               {
-+                  massXY[dy][dx] = 0;
-+               }
-+            }
-+            for (int qy = 0; qy < Q1D; ++qy)
-+            {
-+               double massX[HCURL_MAX_D1D];
-+               for (int dx = 0; dx < D1Dx; ++dx)
-+               {
-+                  massX[dx] = 0;
-+               }
-+               for (int qx = 0; qx < Q1D; ++qx)
-+               {
-+                  for (int dx = 0; dx < D1Dx; ++dx)
-+                  {
-+                     massX[dx] += curl[qz][qy][qx][c] *
-+                                  ((c == 0) ? Bct(dx,qx) : Bot(dx,qx));
-+                  }
-+               }
-+               for (int dy = 0; dy < D1Dy; ++dy)
-+               {
-+                  const double wy = (c == 1) ? Bct(dy,qy) : Bot(dy,qy);
-+                  for (int dx = 0; dx < D1Dx; ++dx)
-+                  {
-+                     massXY[dy][dx] += massX[dx] * wy;
-+                  }
-+               }
-+            }
-+
-+            for (int dz = 0; dz < D1Dz; ++dz)
-+            {
-+               const double wz = (c == 2) ? Bct(dz,qz) : Bot(dz,qz);
-+               for (int dy = 0; dy < D1Dy; ++dy)
-+               {
-+                  for (int dx = 0; dx < D1Dx; ++dx)
-+                  {
-+                     Y(dx + ((dy + (dz * D1Dy)) * D1Dx) + osc, e) +=
-+                        massXY[dy][dx] * wz;
-+                  }
-+               }
-+            }
-+
-+            osc += D1Dx * D1Dy * D1Dz;
-+         }  // loop c
-+      }  // loop qz
-+   }); // end of element loop
-+}
-+
-+// Apply to x corresponding to DOFs in H(div) (test), integrated against the
-+// curl of H(curl) trial functions corresponding to y.
-+template<int MAX_D1D = HCURL_MAX_D1D, int MAX_Q1D = HCURL_MAX_Q1D>
-+MFEM_HOST_DEVICE inline
-+void PAHcurlHdivApply3DTranspose(const int D1D,
-+                                 const int D1Dtest,
-+                                 const int Q1D,
-+                                 const int NE,
-+                                 const Array<double> &bo,
-+                                 const Array<double> &bc,
-+                                 const Array<double> &bot,
-+                                 const Array<double> &bct,
-+                                 const Array<double> &gct,
-+                                 const Vector &pa_data,
-+                                 const Vector &x,
-+                                 Vector &y)
-+{
-+   MFEM_VERIFY(D1D <= MAX_D1D, "Error: D1D > MAX_D1D");
-+   MFEM_VERIFY(Q1D <= MAX_Q1D, "Error: Q1D > MAX_Q1D");
-+   // Using Piola transformations (\nabla\times u) F = 1/det(dF) dF \hat{\nabla}\times\hat{u}
-+   // for u in H(curl) and w = (1 / det (dF)) dF \hat{w} for w in H(div), we get
-+   // (\nabla\times u) \cdot w = 1/det(dF)^2 \hat{\nabla}\times\hat{u}^T dF^T dF \hat{w}
-+   // If c = 0, \hat{\nabla}\times\hat{u} reduces to [0, (u_0)_{x_2}, -(u_0)_{x_1}]
-+   // If c = 1, \hat{\nabla}\times\hat{u} reduces to [-(u_1)_{x_2}, 0, (u_1)_{x_0}]
-+   // If c = 2, \hat{\nabla}\times\hat{u} reduces to [(u_2)_{x_1}, -(u_2)_{x_0}, 0]
-+
-+   constexpr static int VDIM = 3;
-+
-+   auto Bo = Reshape(bo.Read(), Q1D, D1D-1);
-+   auto Bc = Reshape(bc.Read(), Q1D, D1D);
-+   auto Bot = Reshape(bot.Read(), D1Dtest-1, Q1D);
-+   auto Bct = Reshape(bct.Read(), D1Dtest, Q1D);
-+   auto Gct = Reshape(gct.Read(), D1D, Q1D);
-+   auto op = Reshape(pa_data.Read(), Q1D, Q1D, Q1D, 6, NE);
-+   auto X = Reshape(x.Read(), 3*(D1Dtest-1)*(D1Dtest-1)*D1D, NE);
-+   auto Y = Reshape(y.ReadWrite(), 3*(D1D-1)*D1D*D1D, NE);
-+
-+   mfem::forall(NE, [=] MFEM_HOST_DEVICE (int e)
-+   {
-+      double mass[MAX_Q1D][MAX_Q1D][MAX_Q1D][VDIM];  // Assuming HDIV_MAX_D1D <= HCURL_MAX_D1D
-+
-+      for (int qz = 0; qz < Q1D; ++qz)
-+      {
-+         for (int qy = 0; qy < Q1D; ++qy)
-+         {
-+            for (int qx = 0; qx < Q1D; ++qx)
-+            {
-+               for (int c = 0; c < VDIM; ++c)
-+               {
-+                  mass[qz][qy][qx][c] = 0.0;
-+               }
-+            }
-+         }
-+      }
-+
-+      int osc = 0;
-+
-+      for (int c = 0; c < VDIM; ++c)  // loop over x, y, z components
-+      {
-+         const int D1Dz = (c == 2) ? D1D : D1D - 1;
-+         const int D1Dy = (c == 1) ? D1D : D1D - 1;
-+         const int D1Dx = (c == 0) ? D1D : D1D - 1;
-+
-+         for (int dz = 0; dz < D1Dz; ++dz)
-+         {
-+            double massXY[HDIV_MAX_Q1D][HDIV_MAX_Q1D];
-+            for (int qy = 0; qy < Q1D; ++qy)
-+            {
-+               for (int qx = 0; qx < Q1D; ++qx)
-+               {
-+                  massXY[qy][qx] = 0.0;
-+               }
-+            }
-+
-+            for (int dy = 0; dy < D1Dy; ++dy)
-+            {
-+               double massX[HDIV_MAX_Q1D];
-+               for (int qx = 0; qx < Q1D; ++qx)
-+               {
-+                  massX[qx] = 0.0;
-+               }
-+
-+               for (int dx = 0; dx < D1Dx; ++dx)
-+               {
-+                  const double t = X(dx + ((dy + (dz * D1Dy)) * D1Dx) + osc, e);
-+                  for (int qx = 0; qx < Q1D; ++qx)
-+                  {
-+                     massX[qx] += t * ((c == 0) ? Bc(qx,dx) : Bo(qx,dx));
-+                  }
-+               }
-+
-+               for (int qy = 0; qy < Q1D; ++qy)
-+               {
-+                  const double wy = (c == 1) ? Bc(qy,dy) : Bo(qy,dy);
-+                  for (int qx = 0; qx < Q1D; ++qx)
-+                  {
-+                     const double wx = massX[qx];
-+                     massXY[qy][qx] += wx * wy;
-+                  }
-+               }
-+            }
-+
-+            for (int qz = 0; qz < Q1D; ++qz)
-+            {
-+               const double wz = (c == 2) ? Bc(qz,dz) : Bo(qz,dz);
-+               for (int qy = 0; qy < Q1D; ++qy)
-+               {
-+                  for (int qx = 0; qx < Q1D; ++qx)
-+                  {
-+                     mass[qz][qy][qx][c] += massXY[qy][qx] * wz;
-+                  }
-+               }
-+            }
-+         }
-+
-+         osc += D1Dx * D1Dy * D1Dz;
-+      }  // loop (c) over components
-+
-+      // Apply D operator.
-+      for (int qz = 0; qz < Q1D; ++qz)
-+      {
-+         for (int qy = 0; qy < Q1D; ++qy)
-+         {
-+            for (int qx = 0; qx < Q1D; ++qx)
-+            {
-+               const double O11 = op(qx,qy,qz,0,e);
-+               const double O12 = op(qx,qy,qz,1,e);
-+               const double O13 = op(qx,qy,qz,2,e);
-+               const double O22 = op(qx,qy,qz,3,e);
-+               const double O23 = op(qx,qy,qz,4,e);
-+               const double O33 = op(qx,qy,qz,5,e);
-+               const double massX = mass[qz][qy][qx][0];
-+               const double massY = mass[qz][qy][qx][1];
-+               const double massZ = mass[qz][qy][qx][2];
-+               mass[qz][qy][qx][0] = (O11*massX)+(O12*massY)+(O13*massZ);
-+               mass[qz][qy][qx][1] = (O12*massX)+(O22*massY)+(O23*massZ);
-+               mass[qz][qy][qx][2] = (O13*massX)+(O23*massY)+(O33*massZ);
-+            }
-+         }
-+      }
-+
-+      // x component
-+      osc = 0;
-+      {
-+         const int D1Dz = D1D;
-+         const int D1Dy = D1D;
-+         const int D1Dx = D1D - 1;
-+
-+         for (int qz = 0; qz < Q1D; ++qz)
-+         {
-+            double gradXY12[MAX_D1D][MAX_D1D];
-+            double gradXY21[MAX_D1D][MAX_D1D];
-+
-+            for (int dy = 0; dy < D1Dy; ++dy)
-+            {
-+               for (int dx = 0; dx < D1Dx; ++dx)
-+               {
-+                  gradXY12[dy][dx] = 0.0;
-+                  gradXY21[dy][dx] = 0.0;
-+               }
-+            }
-+            for (int qy = 0; qy < Q1D; ++qy)
-+            {
-+               double massX[MAX_D1D][2];
-+               for (int dx = 0; dx < D1Dx; ++dx)
-+               {
-+                  for (int n = 0; n < 2; ++n)
-+                  {
-+                     massX[dx][n] = 0.0;
-+                  }
-+               }
-+               for (int qx = 0; qx < Q1D; ++qx)
-+               {
-+                  for (int dx = 0; dx < D1Dx; ++dx)
-+                  {
-+                     const double wx = Bot(dx,qx);
-+
-+                     massX[dx][0] += wx * mass[qz][qy][qx][1];
-+                     massX[dx][1] += wx * mass[qz][qy][qx][2];
-+                  }
-+               }
-+               for (int dy = 0; dy < D1Dy; ++dy)
-+               {
-+                  const double wy = Bct(dy,qy);
-+                  const double wDy = Gct(dy,qy);
-+
-+                  for (int dx = 0; dx < D1Dx; ++dx)
-+                  {
-+                     gradXY21[dy][dx] += massX[dx][0] * wy;
-+                     gradXY12[dy][dx] += massX[dx][1] * wDy;
-+                  }
-+               }
-+            }
-+
-+            for (int dz = 0; dz < D1Dz; ++dz)
-+            {
-+               const double wz = Bct(dz,qz);
-+               const double wDz = Gct(dz,qz);
-+               for (int dy = 0; dy < D1Dy; ++dy)
-+               {
-+                  for (int dx = 0; dx < D1Dx; ++dx)
-+                  {
-+                     // \hat{\nabla}\times\hat{u} is [0, (u_0)_{x_2}, -(u_0)_{x_1}]
-+                     // (u_0)_{x_2} * (op * curl)_1 - (u_0)_{x_1} * (op * curl)_2
-+                     Y(dx + ((dy + (dz * D1Dy)) * D1Dx) + osc,
-+                       e) += (gradXY21[dy][dx] * wDz) - (gradXY12[dy][dx] * wz);
-+                  }
-+               }
-+            }
-+         }  // loop qz
-+
-+         osc += D1Dx * D1Dy * D1Dz;
-+      }
-+
-+      // y component
-+      {
-+         const int D1Dz = D1D;
-+         const int D1Dy = D1D - 1;
-+         const int D1Dx = D1D;
-+
-+         for (int qz = 0; qz < Q1D; ++qz)
-+         {
-+            double gradXY02[MAX_D1D][MAX_D1D];
-+            double gradXY20[MAX_D1D][MAX_D1D];
-+
-+            for (int dy = 0; dy < D1Dy; ++dy)
-+            {
-+               for (int dx = 0; dx < D1Dx; ++dx)
-+               {
-+                  gradXY02[dy][dx] = 0.0;
-+                  gradXY20[dy][dx] = 0.0;
-+               }
-+            }
-+            for (int qx = 0; qx < Q1D; ++qx)
-+            {
-+               double massY[MAX_D1D][2];
-+               for (int dy = 0; dy < D1Dy; ++dy)
-+               {
-+                  massY[dy][0] = 0.0;
-+                  massY[dy][1] = 0.0;
-+               }
-+               for (int qy = 0; qy < Q1D; ++qy)
-+               {
-+                  for (int dy = 0; dy < D1Dy; ++dy)
-+                  {
-+                     const double wy = Bot(dy,qy);
-+
-+                     massY[dy][0] += wy * mass[qz][qy][qx][2];
-+                     massY[dy][1] += wy * mass[qz][qy][qx][0];
-+                  }
-+               }
-+               for (int dx = 0; dx < D1Dx; ++dx)
-+               {
-+                  const double wx = Bct(dx,qx);
-+                  const double wDx = Gct(dx,qx);
-+
-+                  for (int dy = 0; dy < D1Dy; ++dy)
-+                  {
-+                     gradXY02[dy][dx] += massY[dy][0] * wDx;
-+                     gradXY20[dy][dx] += massY[dy][1] * wx;
-+                  }
-+               }
-+            }
-+
-+            for (int dz = 0; dz < D1Dz; ++dz)
-+            {
-+               const double wz = Bct(dz,qz);
-+               const double wDz = Gct(dz,qz);
-+               for (int dy = 0; dy < D1Dy; ++dy)
-+               {
-+                  for (int dx = 0; dx < D1Dx; ++dx)
-+                  {
-+                     // \hat{\nabla}\times\hat{u} is [-(u_1)_{x_2}, 0, (u_1)_{x_0}]
-+                     // -(u_1)_{x_2} * (op * curl)_0 + (u_1)_{x_0} * (op * curl)_2
-+                     Y(dx + ((dy + (dz * D1Dy)) * D1Dx) + osc,
-+                       e) += (-gradXY20[dy][dx] * wDz) + (gradXY02[dy][dx] * wz);
-+                  }
-+               }
-+            }
-+         }  // loop qz
-+
-+         osc += D1Dx * D1Dy * D1Dz;
-+      }
-+
-+      // z component
-+      {
-+         const int D1Dz = D1D - 1;
-+         const int D1Dy = D1D;
-+         const int D1Dx = D1D;
-+
-+         for (int qx = 0; qx < Q1D; ++qx)
-+         {
-+            double gradYZ01[MAX_D1D][MAX_D1D];
-+            double gradYZ10[MAX_D1D][MAX_D1D];
-+
-+            for (int dy = 0; dy < D1Dy; ++dy)
-+            {
-+               for (int dz = 0; dz < D1Dz; ++dz)
-+               {
-+                  gradYZ01[dz][dy] = 0.0;
-+                  gradYZ10[dz][dy] = 0.0;
-+               }
-+            }
-+            for (int qy = 0; qy < Q1D; ++qy)
-+            {
-+               double massZ[MAX_D1D][2];
-+               for (int dz = 0; dz < D1Dz; ++dz)
-+               {
-+                  for (int n = 0; n < 2; ++n)
-+                  {
-+                     massZ[dz][n] = 0.0;
-+                  }
-+               }
-+               for (int qz = 0; qz < Q1D; ++qz)
-+               {
-+                  for (int dz = 0; dz < D1Dz; ++dz)
-+                  {
-+                     const double wz = Bot(dz,qz);
-+
-+                     massZ[dz][0] += wz * mass[qz][qy][qx][0];
-+                     massZ[dz][1] += wz * mass[qz][qy][qx][1];
-+                  }
-+               }
-+               for (int dy = 0; dy < D1Dy; ++dy)
-+               {
-+                  const double wy = Bct(dy,qy);
-+                  const double wDy = Gct(dy,qy);
-+
-+                  for (int dz = 0; dz < D1Dz; ++dz)
-+                  {
-+                     gradYZ01[dz][dy] += wy * massZ[dz][1];
-+                     gradYZ10[dz][dy] += wDy * massZ[dz][0];
-+                  }
-+               }
-+            }
-+
-+            for (int dx = 0; dx < D1Dx; ++dx)
-+            {
-+               const double wx = Bct(dx,qx);
-+               const double wDx = Gct(dx,qx);
-+
-+               for (int dy = 0; dy < D1Dy; ++dy)
-+               {
-+                  for (int dz = 0; dz < D1Dz; ++dz)
-+                  {
-+                     // \hat{\nabla}\times\hat{u} is [(u_2)_{x_1}, -(u_2)_{x_0}, 0]
-+                     // (u_2)_{x_1} * (op * curl)_0 - (u_2)_{x_0} * (op * curl)_1
-+                     Y(dx + ((dy + (dz * D1Dy)) * D1Dx) + osc,
-+                       e) += (gradYZ10[dz][dy] * wx) - (gradYZ01[dz][dy] * wDx);
-+                  }
-+               }
-+            }
-+         }  // loop qx
-+      }
-+   }); // end of element loop
-+}
-+
-+} // namespace internal
-+
-+} // namespace mfem
-+
-+#endif
-diff --git a/fem/bilininteg_hdiv.cpp b/fem/integ/bilininteg_hdiv_kernels.hpp
-similarity index 84%
-rename from fem/bilininteg_hdiv.cpp
-rename to fem/integ/bilininteg_hdiv_kernels.hpp
-index 26e0ed973..cf083a2c6 100644
---- a/fem/bilininteg_hdiv.cpp
-+++ b/fem/integ/bilininteg_hdiv_kernels.hpp
-@@ -9,13 +9,12 @@
- // terms of the BSD-3 license.  We welcome feedback and contributions, see file
- // CONTRIBUTING.md for details.
- 
--#include "../general/forall.hpp"
--#include "bilininteg.hpp"
--#include "gridfunc.hpp"
--#include "qspace.hpp"
--
--using namespace std;
-+#ifndef MFEM_BILININTEG_HDIV_KERNELS_HPP
-+#define MFEM_BILININTEG_HDIV_KERNELS_HPP
- 
-+#include "../../config/config.hpp"
-+#include "../../general/forall.hpp"
-+#include "../../linalg/dtensor.hpp"
- 
- // Piola transformation in H(div): w = (1 / det (dF)) dF \hat{w}
- // div w = (1 / det (dF)) \hat{div} \hat{w}
-@@ -23,14 +22,18 @@ using namespace std;
- namespace mfem
- {
- 
-+namespace internal
-+{
-+
- // PA H(div) Mass Assemble 2D kernel
--void PAHdivSetup2D(const int Q1D,
--                   const int coeffDim,
--                   const int NE,
--                   const Array<double> &w,
--                   const Vector &j,
--                   Vector &coeff_,
--                   Vector &op)
-+MFEM_HOST_DEVICE inline
-+void PAHdivMassSetup2D(const int Q1D,
-+                       const int coeffDim,
-+                       const int NE,
-+                       const Array<double> &w,
-+                       const Vector &j,
-+                       Vector &coeff_,
-+                       Vector &op)
- {
-    const bool symmetric = (coeffDim != 4);
-    const int NQ = Q1D*Q1D;
-@@ -88,13 +91,14 @@ void PAHdivSetup2D(const int Q1D,
- }
- 
- // PA H(div) Mass Assemble 3D kernel
--void PAHdivSetup3D(const int Q1D,
--                   const int coeffDim,
--                   const int NE,
--                   const Array<double> &w,
--                   const Vector &j,
--                   Vector &coeff_,
--                   Vector &op)
-+MFEM_HOST_DEVICE inline
-+void PAHdivMassSetup3D(const int Q1D,
-+                       const int coeffDim,
-+                       const int NE,
-+                       const Array<double> &w,
-+                       const Vector &j,
-+                       Vector &coeff_,
-+                       Vector &op)
- {
-    const bool symmetric = (coeffDim != 9);
-    const int NQ = Q1D*Q1D*Q1D;
-@@ -175,6 +179,134 @@ void PAHdivSetup3D(const int Q1D,
-    });
- }
- 
-+MFEM_HOST_DEVICE inline
-+void PAHdivMassAssembleDiagonal2D(const int D1D,
-+                                  const int Q1D,
-+                                  const int NE,
-+                                  const bool symmetric,
-+                                  const Array<double> &Bo_,
-+                                  const Array<double> &Bc_,
-+                                  const Vector &op_,
-+                                  Vector &diag_)
-+{
-+   constexpr static int VDIM = 2;
-+   constexpr static int MAX_Q1D = HDIV_MAX_Q1D;
-+
-+   auto Bo = Reshape(Bo_.Read(), Q1D, D1D-1);
-+   auto Bc = Reshape(Bc_.Read(), Q1D, D1D);
-+   auto op = Reshape(op_.Read(), Q1D, Q1D, symmetric ? 3 : 4, NE);
-+   auto diag = Reshape(diag_.ReadWrite(), 2*(D1D-1)*D1D, NE);
-+
-+   mfem::forall(NE, [=] MFEM_HOST_DEVICE (int e)
-+   {
-+      int osc = 0;
-+
-+      for (int c = 0; c < VDIM; ++c)  // loop over x, y components
-+      {
-+         const int D1Dx = (c == 1) ? D1D - 1 : D1D;
-+         const int D1Dy = (c == 0) ? D1D - 1 : D1D;
-+
-+         for (int dy = 0; dy < D1Dy; ++dy)
-+         {
-+            double mass[MAX_Q1D];
-+            for (int qx = 0; qx < Q1D; ++qx)
-+            {
-+               mass[qx] = 0.0;
-+               for (int qy = 0; qy < Q1D; ++qy)
-+               {
-+                  const double wy = (c == 1) ? Bc(qy,dy) : Bo(qy,dy);
-+                  mass[qx] += wy*wy*((c == 0) ? op(qx,qy,0,e) : op(qx,qy,symmetric ? 2 : 3,e));
-+               }
-+            }
-+
-+            for (int dx = 0; dx < D1Dx; ++dx)
-+            {
-+               double val = 0.0;
-+               for (int qx = 0; qx < Q1D; ++qx)
-+               {
-+                  const double wx = (c == 0) ? Bc(qx,dx) : Bo(qx,dx);
-+                  val += mass[qx] * wx * wx;
-+               }
-+               diag(dx + (dy * D1Dx) + osc, e) += val;
-+            }
-+         }
-+
-+         osc += D1Dx * D1Dy;
-+      }  // loop (c) over components
-+   }); // end of element loop
-+}
-+
-+MFEM_HOST_DEVICE inline
-+void PAHdivMassAssembleDiagonal3D(const int D1D,
-+                                  const int Q1D,
-+                                  const int NE,
-+                                  const bool symmetric,
-+                                  const Array<double> &Bo_,
-+                                  const Array<double> &Bc_,
-+                                  const Vector &op_,
-+                                  Vector &diag_)
-+{
-+   MFEM_VERIFY(D1D <= HDIV_MAX_D1D, "Error: D1D > HDIV_MAX_D1D");
-+   MFEM_VERIFY(Q1D <= HDIV_MAX_Q1D, "Error: Q1D > HDIV_MAX_Q1D");
-+   constexpr static int VDIM = 3;
-+
-+   auto Bo = Reshape(Bo_.Read(), Q1D, D1D-1);
-+   auto Bc = Reshape(Bc_.Read(), Q1D, D1D);
-+   auto op = Reshape(op_.Read(), Q1D, Q1D, Q1D, symmetric ? 6 : 9, NE);
-+   auto diag = Reshape(diag_.ReadWrite(), 3*(D1D-1)*(D1D-1)*D1D, NE);
-+
-+   mfem::forall(NE, [=] MFEM_HOST_DEVICE (int e)
-+   {
-+      int osc = 0;
-+
-+      for (int c = 0; c < VDIM; ++c)  // loop over x, y, z components
-+      {
-+         const int D1Dz = (c == 2) ? D1D : D1D - 1;
-+         const int D1Dy = (c == 1) ? D1D : D1D - 1;
-+         const int D1Dx = (c == 0) ? D1D : D1D - 1;
-+
-+         const int opc = (c == 0) ? 0 : ((c == 1) ? (symmetric ? 3 : 4) :
-+                                         (symmetric ? 5 : 8));
-+
-+         double mass[HDIV_MAX_Q1D];
-+
-+         for (int dz = 0; dz < D1Dz; ++dz)
-+         {
-+            for (int dy = 0; dy < D1Dy; ++dy)
-+            {
-+               for (int qx = 0; qx < Q1D; ++qx)
-+               {
-+                  mass[qx] = 0.0;
-+                  for (int qy = 0; qy < Q1D; ++qy)
-+                  {
-+                     const double wy = (c == 1) ? Bc(qy,dy) : Bo(qy,dy);
-+                     for (int qz = 0; qz < Q1D; ++qz)
-+                     {
-+                        const double wz = (c == 2) ? Bc(qz,dz) : Bo(qz,dz);
-+                        mass[qx] += wy * wy * wz * wz * op(qx,qy,qz,opc,e);
-+                     }
-+                  }
-+               }
-+
-+               for (int dx = 0; dx < D1Dx; ++dx)
-+               {
-+                  double val = 0.0;
-+                  for (int qx = 0; qx < Q1D; ++qx)
-+                  {
-+                     const double wx = (c == 0) ? Bc(qx,dx) : Bo(qx,dx);
-+                     val += mass[qx] * wx * wx;
-+                  }
-+                  diag(dx + ((dy + (dz * D1Dy)) * D1Dx) + osc, e) += val;
-+               }
-+            }
-+         }
-+
-+         osc += D1Dx * D1Dy * D1Dz;
-+      }  // loop c
-+   }); // end of element loop
-+}
-+
-+MFEM_HOST_DEVICE inline
- void PAHdivMassApply2D(const int D1D,
-                        const int Q1D,
-                        const int NE,
-@@ -307,6 +439,7 @@ void PAHdivMassApply2D(const int D1D,
- }
- 
- template<int T_D1D = 0, int T_Q1D = 0>
-+MFEM_HOST_DEVICE inline
- void SmemPAHdivMassApply2D(const int NE,
-                            const bool symmetric,
-                            const Array<double> &Bo_,
-@@ -475,131 +608,7 @@ void SmemPAHdivMassApply2D(const int NE,
-    });
- }
- 
--void PAHdivMassAssembleDiagonal2D(const int D1D,
--                                  const int Q1D,
--                                  const int NE,
--                                  const bool symmetric,
--                                  const Array<double> &Bo_,
--                                  const Array<double> &Bc_,
--                                  const Vector &op_,
--                                  Vector &diag_)
--{
--   constexpr static int VDIM = 2;
--   constexpr static int MAX_Q1D = HDIV_MAX_Q1D;
--
--   auto Bo = Reshape(Bo_.Read(), Q1D, D1D-1);
--   auto Bc = Reshape(Bc_.Read(), Q1D, D1D);
--   auto op = Reshape(op_.Read(), Q1D, Q1D, symmetric ? 3 : 4, NE);
--   auto diag = Reshape(diag_.ReadWrite(), 2*(D1D-1)*D1D, NE);
--
--   mfem::forall(NE, [=] MFEM_HOST_DEVICE (int e)
--   {
--      int osc = 0;
--
--      for (int c = 0; c < VDIM; ++c)  // loop over x, y components
--      {
--         const int D1Dx = (c == 1) ? D1D - 1 : D1D;
--         const int D1Dy = (c == 0) ? D1D - 1 : D1D;
--
--         for (int dy = 0; dy < D1Dy; ++dy)
--         {
--            double mass[MAX_Q1D];
--            for (int qx = 0; qx < Q1D; ++qx)
--            {
--               mass[qx] = 0.0;
--               for (int qy = 0; qy < Q1D; ++qy)
--               {
--                  const double wy = (c == 1) ? Bc(qy,dy) : Bo(qy,dy);
--                  mass[qx] += wy*wy*((c == 0) ? op(qx,qy,0,e) : op(qx,qy,symmetric ? 2 : 3,e));
--               }
--            }
--
--            for (int dx = 0; dx < D1Dx; ++dx)
--            {
--               double val = 0.0;
--               for (int qx = 0; qx < Q1D; ++qx)
--               {
--                  const double wx = (c == 0) ? Bc(qx,dx) : Bo(qx,dx);
--                  val += mass[qx] * wx * wx;
--               }
--               diag(dx + (dy * D1Dx) + osc, e) += val;
--            }
--         }
--
--         osc += D1Dx * D1Dy;
--      }  // loop (c) over components
--   }); // end of element loop
--}
--
--void PAHdivMassAssembleDiagonal3D(const int D1D,
--                                  const int Q1D,
--                                  const int NE,
--                                  const bool symmetric,
--                                  const Array<double> &Bo_,
--                                  const Array<double> &Bc_,
--                                  const Vector &op_,
--                                  Vector &diag_)
--{
--   MFEM_VERIFY(D1D <= HDIV_MAX_D1D, "Error: D1D > HDIV_MAX_D1D");
--   MFEM_VERIFY(Q1D <= HDIV_MAX_Q1D, "Error: Q1D > HDIV_MAX_Q1D");
--   constexpr static int VDIM = 3;
--
--   auto Bo = Reshape(Bo_.Read(), Q1D, D1D-1);
--   auto Bc = Reshape(Bc_.Read(), Q1D, D1D);
--   auto op = Reshape(op_.Read(), Q1D, Q1D, Q1D, symmetric ? 6 : 9, NE);
--   auto diag = Reshape(diag_.ReadWrite(), 3*(D1D-1)*(D1D-1)*D1D, NE);
--
--   mfem::forall(NE, [=] MFEM_HOST_DEVICE (int e)
--   {
--      int osc = 0;
--
--      for (int c = 0; c < VDIM; ++c)  // loop over x, y, z components
--      {
--         const int D1Dz = (c == 2) ? D1D : D1D - 1;
--         const int D1Dy = (c == 1) ? D1D : D1D - 1;
--         const int D1Dx = (c == 0) ? D1D : D1D - 1;
--
--         const int opc = (c == 0) ? 0 : ((c == 1) ? (symmetric ? 3 : 4) :
--                                         (symmetric ? 5 : 8));
--
--         double mass[HDIV_MAX_Q1D];
--
--         for (int dz = 0; dz < D1Dz; ++dz)
--         {
--            for (int dy = 0; dy < D1Dy; ++dy)
--            {
--               for (int qx = 0; qx < Q1D; ++qx)
--               {
--                  mass[qx] = 0.0;
--                  for (int qy = 0; qy < Q1D; ++qy)
--                  {
--                     const double wy = (c == 1) ? Bc(qy,dy) : Bo(qy,dy);
--                     for (int qz = 0; qz < Q1D; ++qz)
--                     {
--                        const double wz = (c == 2) ? Bc(qz,dz) : Bo(qz,dz);
--                        mass[qx] += wy * wy * wz * wz * op(qx,qy,qz,opc,e);
--                     }
--                  }
--               }
--
--               for (int dx = 0; dx < D1Dx; ++dx)
--               {
--                  double val = 0.0;
--                  for (int qx = 0; qx < Q1D; ++qx)
--                  {
--                     const double wx = (c == 0) ? Bc(qx,dx) : Bo(qx,dx);
--                     val += mass[qx] * wx * wx;
--                  }
--                  diag(dx + ((dy + (dz * D1Dy)) * D1Dx) + osc, e) += val;
--               }
--            }
--         }
--
--         osc += D1Dx * D1Dy * D1Dz;
--      }  // loop c
--   }); // end of element loop
--}
--
-+MFEM_HOST_DEVICE inline
- void PAHdivMassApply3D(const int D1D,
-                        const int Q1D,
-                        const int NE,
-@@ -796,6 +805,7 @@ void PAHdivMassApply3D(const int D1D,
- }
- 
- template<int T_D1D = 0, int T_Q1D = 0>
-+MFEM_HOST_DEVICE inline
- void SmemPAHdivMassApply3D(const int NE,
-                            const bool symmetric,
-                            const Array<double> &Bo_,
-@@ -1083,6 +1093,7 @@ void SmemPAHdivMassApply3D(const int NE,
-    });
- }
- 
-+MFEM_HOST_DEVICE inline
- void PAHdivMassApply(const int dim,
-                      const int D1D,
-                      const int Q1D,
-@@ -1127,13 +1138,14 @@ void PAHdivMassApply(const int dim,
- }
- 
- // PA H(div) div-div assemble 2D kernel
--// NOTE: this is identical to PACurlCurlSetup3D
--static void PADivDivSetup2D(const int Q1D,
--                            const int NE,
--                            const Array<double> &w,
--                            const Vector &j,
--                            Vector &coeff_,
--                            Vector &op)
-+// NOTE: this is identical to PACurlCurlSetup2D
-+MFEM_HOST_DEVICE inline
-+void PADivDivSetup2D(const int Q1D,
-+                     const int NE,
-+                     const Array<double> &w,
-+                     const Vector &j,
-+                     Vector &coeff_,
-+                     Vector &op)
- {
-    const int NQ = Q1D*Q1D;
-    auto W = w.Read();
-@@ -1154,12 +1166,13 @@ static void PADivDivSetup2D(const int Q1D,
-    });
- }
- 
--static void PADivDivSetup3D(const int Q1D,
--                            const int NE,
--                            const Array<double> &w,
--                            const Vector &j,
--                            Vector &coeff_,
--                            Vector &op)
-+MFEM_HOST_DEVICE inline
-+void PADivDivSetup3D(const int Q1D,
-+                     const int NE,
-+                     const Array<double> &w,
-+                     const Vector &j,
-+                     Vector &coeff_,
-+                     Vector &op)
- {
-    const int NQ = Q1D*Q1D*Q1D;
-    auto W = w.Read();
-@@ -1188,16 +1201,141 @@ static void PADivDivSetup3D(const int Q1D,
-    });
- }
- 
--static void PADivDivApply2D(const int D1D,
--                            const int Q1D,
--                            const int NE,
--                            const Array<double> &Bo_,
--                            const Array<double> &Gc_,
--                            const Array<double> &Bot_,
--                            const Array<double> &Gct_,
--                            const Vector &op_,
--                            const Vector &x_,
--                            Vector &y_)
-+MFEM_HOST_DEVICE inline
-+void PADivDivAssembleDiagonal2D(const int D1D,
-+                                const int Q1D,
-+                                const int NE,
-+                                const Array<double> &Bo_,
-+                                const Array<double> &Gc_,
-+                                const Vector &op_,
-+                                Vector &diag_)
-+{
-+   constexpr static int VDIM = 2;
-+   constexpr static int MAX_Q1D = HDIV_MAX_Q1D;
-+
-+   auto Bo = Reshape(Bo_.Read(), Q1D, D1D-1);
-+   auto Gc = Reshape(Gc_.Read(), Q1D, D1D);
-+   auto op = Reshape(op_.Read(), Q1D, Q1D, NE);
-+   auto diag = Reshape(diag_.ReadWrite(), 2*(D1D-1)*D1D, NE);
-+
-+   mfem::forall(NE, [=] MFEM_HOST_DEVICE (int e)
-+   {
-+      int osc = 0;
-+
-+      for (int c = 0; c < VDIM; ++c)  // loop over x, y components
-+      {
-+         const int D1Dx = (c == 1) ? D1D - 1 : D1D;
-+         const int D1Dy = (c == 0) ? D1D - 1 : D1D;
-+
-+         double div[MAX_Q1D];
-+
-+         for (int dy = 0; dy < D1Dy; ++dy)
-+         {
-+            for (int qx = 0; qx < Q1D; ++qx)
-+            {
-+               div[qx] = 0.0;
-+               for (int qy = 0; qy < Q1D; ++qy)
-+               {
-+                  const double wy = (c == 0) ? Bo(qy,dy) : Gc(qy,dy);
-+                  div[qx] += wy * wy * op(qx,qy,e);
-+               }
-+            }
-+
-+            for (int dx = 0; dx < D1Dx; ++dx)
-+            {
-+               double val = 0.0;
-+               for (int qx = 0; qx < Q1D; ++qx)
-+               {
-+                  const double wx = (c == 0) ? Gc(qx,dx) : Bo(qx,dx);
-+                  val += div[qx] * wx * wx;
-+               }
-+               diag(dx + (dy * D1Dx) + osc, e) += val;
-+            }
-+         }
-+
-+         osc += D1Dx * D1Dy;
-+      }  // loop c
-+   });
-+}
-+
-+MFEM_HOST_DEVICE inline
-+void PADivDivAssembleDiagonal3D(const int D1D,
-+                                const int Q1D,
-+                                const int NE,
-+                                const Array<double> &Bo_,
-+                                const Array<double> &Gc_,
-+                                const Vector &op_,
-+                                Vector &diag_)
-+{
-+   MFEM_VERIFY(D1D <= HDIV_MAX_D1D, "Error: D1D > HDIV_MAX_D1D");
-+   MFEM_VERIFY(Q1D <= HDIV_MAX_Q1D, "Error: Q1D > HDIV_MAX_Q1D");
-+   constexpr static int VDIM = 3;
-+
-+   auto Bo = Reshape(Bo_.Read(), Q1D, D1D-1);
-+   auto Gc = Reshape(Gc_.Read(), Q1D, D1D);
-+   auto op = Reshape(op_.Read(), Q1D, Q1D, Q1D, NE);
-+   auto diag = Reshape(diag_.ReadWrite(), 3*(D1D-1)*(D1D-1)*D1D, NE);
-+
-+   mfem::forall(NE, [=] MFEM_HOST_DEVICE (int e)
-+   {
-+      int osc = 0;
-+
-+      for (int c = 0; c < VDIM; ++c)  // loop over x, y, z components
-+      {
-+         const int D1Dz = (c == 2) ? D1D : D1D - 1;
-+         const int D1Dy = (c == 1) ? D1D : D1D - 1;
-+         const int D1Dx = (c == 0) ? D1D : D1D - 1;
-+
-+         for (int dz = 0; dz < D1Dz; ++dz)
-+         {
-+            for (int dy = 0; dy < D1Dy; ++dy)
-+            {
-+               double a[HDIV_MAX_Q1D];
-+
-+               for (int qx = 0; qx < Q1D; ++qx)
-+               {
-+                  a[qx] = 0.0;
-+                  for (int qy = 0; qy < Q1D; ++qy)
-+                  {
-+                     const double wy = (c == 1) ? Gc(qy,dy) : Bo(qy,dy);
-+
-+                     for (int qz = 0; qz < Q1D; ++qz)
-+                     {
-+                        const double wz = (c == 2) ? Gc(qz,dz) : Bo(qz,dz);
-+                        a[qx] += wy * wy * wz * wz * op(qx,qy,qz,e);
-+                     }
-+                  }
-+               }
-+
-+               for (int dx = 0; dx < D1Dx; ++dx)
-+               {
-+                  double val = 0.0;
-+                  for (int qx = 0; qx < Q1D; ++qx)
-+                  {
-+                     const double wx = (c == 0) ? Gc(qx,dx) : Bo(qx,dx);
-+                     val += a[qx] * wx * wx;
-+                  }
-+                  diag(dx + ((dy + (dz * D1Dy)) * D1Dx) + osc, e) += val;
-+               }
-+            }
-+         }
-+
-+         osc += D1Dx * D1Dy * D1Dz;
-+      }  // loop c
-+   }); // end of element loop
-+}
-+
-+MFEM_HOST_DEVICE inline
-+void PADivDivApply2D(const int D1D,
-+                     const int Q1D,
-+                     const int NE,
-+                     const Array<double> &Bo_,
-+                     const Array<double> &Gc_,
-+                     const Array<double> &Bot_,
-+                     const Array<double> &Gct_,
-+                     const Vector &op_,
-+                     const Vector &x_,
-+                     Vector &y_)
- {
-    constexpr static int VDIM = 2;
-    constexpr static int MAX_D1D = HDIV_MAX_D1D;
-@@ -1307,16 +1445,17 @@ static void PADivDivApply2D(const int D1D,
-    }); // end of element loop
- }
- 
--static void PADivDivApply3D(const int D1D,
--                            const int Q1D,
--                            const int NE,
--                            const Array<double> &Bo_,
--                            const Array<double> &Gc_,
--                            const Array<double> &Bot_,
--                            const Array<double> &Gct_,
--                            const Vector &op_,
--                            const Vector &x_,
--                            Vector &y_)
-+MFEM_HOST_DEVICE inline
-+void PADivDivApply3D(const int D1D,
-+                     const int Q1D,
-+                     const int NE,
-+                     const Array<double> &Bo_,
-+                     const Array<double> &Gc_,
-+                     const Array<double> &Bot_,
-+                     const Array<double> &Gct_,
-+                     const Vector &op_,
-+                     const Vector &x_,
-+                     Vector &y_)
- {
-    MFEM_VERIFY(D1D <= HDIV_MAX_D1D, "Error: D1D > HDIV_MAX_D1D");
-    MFEM_VERIFY(Q1D <= HDIV_MAX_Q1D, "Error: Q1D > HDIV_MAX_Q1D");
-@@ -1483,332 +1622,280 @@ static void PADivDivApply3D(const int D1D,
-    }); // end of element loop
- }
- 
--void DivDivIntegrator::AssemblePA(const FiniteElementSpace &fes)
-+// PA H(div)-L2 (div u, p) assemble 2D kernel
-+MFEM_HOST_DEVICE inline
-+void PAHdivL2Setup2D(const int Q1D,
-+                     const int NE,
-+                     const Array<double> &w,
-+                     Vector &coeff_,
-+                     Vector &op)
- {
--   // Assumes tensor-product elements
--   Mesh *mesh = fes.GetMesh();
--   const FiniteElement *fel = fes.GetFE(0);
--
--   const VectorTensorFiniteElement *el =
--      dynamic_cast<const VectorTensorFiniteElement*>(fel);
--   MFEM_VERIFY(el != NULL, "Only VectorTensorFiniteElement is supported!");
--
--   const IntegrationRule *ir = IntRule ? IntRule : &MassIntegrator::GetRule
--                               (*el, *el, *mesh->GetElementTransformation(0));
--
--   const int dims = el->GetDim();
--   MFEM_VERIFY(dims == 2 || dims == 3, "");
--
--   const int nq = ir->GetNPoints();
--   dim = mesh->Dimension();
--   MFEM_VERIFY(dim == 2 || dim == 3, "");
--
--   ne = fes.GetNE();
--   geom = mesh->GetGeometricFactors(*ir, GeometricFactors::JACOBIANS);
--   mapsC = &el->GetDofToQuad(*ir, DofToQuad::TENSOR);
--   mapsO = &el->GetDofToQuadOpen(*ir, DofToQuad::TENSOR);
--   dofs1D = mapsC->ndof;
--   quad1D = mapsC->nqpt;
--
--   MFEM_VERIFY(dofs1D == mapsO->ndof + 1 && quad1D == mapsO->nqpt, "");
--
--   pa_data.SetSize(nq * ne, Device::GetMemoryType());
--
--   QuadratureSpace qs(*mesh, *ir);
--   CoefficientVector coeff(Q, qs, CoefficientStorage::FULL);
--
--   if (el->GetDerivType() == mfem::FiniteElement::DIV && dim == 3)
--   {
--      PADivDivSetup3D(quad1D, ne, ir->GetWeights(), geom->J, coeff, pa_data);
--   }
--   else if (el->GetDerivType() == mfem::FiniteElement::DIV && dim == 2)
--   {
--      PADivDivSetup2D(quad1D, ne, ir->GetWeights(), geom->J, coeff, pa_data);
--   }
--   else
-+   const int NQ = Q1D*Q1D;
-+   auto W = w.Read();
-+   auto coeff = Reshape(coeff_.Read(), NQ, NE);
-+   auto y = Reshape(op.Write(), NQ, NE);
-+   mfem::forall(NE, [=] MFEM_HOST_DEVICE (int e)
-    {
--      MFEM_ABORT("Unknown kernel.");
--   }
-+      for (int q = 0; q < NQ; ++q)
-+      {
-+         y(q,e) = W[q] * coeff(q,e);
-+      }
-+   });
- }
- 
--void DivDivIntegrator::AddMultPA(const Vector &x, Vector &y) const
-+MFEM_HOST_DEVICE inline
-+void PAHdivL2Setup3D(const int Q1D,
-+                     const int NE,
-+                     const Array<double> &w,
-+                     Vector &coeff_,
-+                     Vector &op)
- {
--   if (dim == 3)
--      PADivDivApply3D(dofs1D, quad1D, ne, mapsO->B, mapsC->G,
--                      mapsO->Bt, mapsC->Gt, pa_data, x, y);
--   else if (dim == 2)
--      PADivDivApply2D(dofs1D, quad1D, ne, mapsO->B, mapsC->G,
--                      mapsO->Bt, mapsC->Gt, pa_data, x, y);
--   else
-+   const int NQ = Q1D*Q1D*Q1D;
-+   auto W = w.Read();
-+   auto coeff = Reshape(coeff_.Read(), NQ, NE);
-+   auto y = Reshape(op.Write(), NQ, NE);
-+
-+   mfem::forall(NE, [=] MFEM_HOST_DEVICE (int e)
-    {
--      MFEM_ABORT("Unsupported dimension!");
--   }
-+      for (int q = 0; q < NQ; ++q)
-+      {
-+         y(q,e) = W[q] * coeff(q, e);
-+      }
-+   });
- }
- 
--static void PADivDivAssembleDiagonal2D(const int D1D,
--                                       const int Q1D,
--                                       const int NE,
--                                       const Array<double> &Bo_,
--                                       const Array<double> &Gc_,
--                                       const Vector &op_,
--                                       Vector &diag_)
-+MFEM_HOST_DEVICE inline
-+void PAHdivL2AssembleDiagonal_ADAt_2D(const int D1D,
-+                                      const int Q1D,
-+                                      const int L2D1D,
-+                                      const int NE,
-+                                      const Array<double> &L2Bo_,
-+                                      const Array<double> &Gct_,
-+                                      const Array<double> &Bot_,
-+                                      const Vector &op_,
-+                                      const Vector &D_,
-+                                      Vector &diag_)
- {
-    constexpr static int VDIM = 2;
--   constexpr static int MAX_Q1D = HDIV_MAX_Q1D;
- 
--   auto Bo = Reshape(Bo_.Read(), Q1D, D1D-1);
--   auto Gc = Reshape(Gc_.Read(), Q1D, D1D);
-+   auto L2Bo = Reshape(L2Bo_.Read(), Q1D, L2D1D);
-+   auto Gct = Reshape(Gct_.Read(), D1D, Q1D);
-+   auto Bot = Reshape(Bot_.Read(), D1D-1, Q1D);
-    auto op = Reshape(op_.Read(), Q1D, Q1D, NE);
--   auto diag = Reshape(diag_.ReadWrite(), 2*(D1D-1)*D1D, NE);
-+   auto D = Reshape(D_.Read(), 2*(D1D-1)*D1D, NE);
-+   auto diag = Reshape(diag_.ReadWrite(), L2D1D, L2D1D, NE);
- 
-    mfem::forall(NE, [=] MFEM_HOST_DEVICE (int e)
-    {
--      int osc = 0;
--
--      for (int c = 0; c < VDIM; ++c)  // loop over x, y components
-+      for (int ry = 0; ry < L2D1D; ++ry)
-       {
--         const int D1Dx = (c == 1) ? D1D - 1 : D1D;
--         const int D1Dy = (c == 0) ? D1D - 1 : D1D;
-+         for (int rx = 0; rx < L2D1D; ++rx)
-+         {
-+            // Compute row (rx,ry), assuming all contributions are from
-+            // a single element.
- 
--         double div[MAX_Q1D];
-+            double row[2*HDIV_MAX_D1D*(HDIV_MAX_D1D-1)];
-+            double div[HDIV_MAX_Q1D][HDIV_MAX_Q1D];
- 
--         for (int dy = 0; dy < D1Dy; ++dy)
--         {
--            for (int qx = 0; qx < Q1D; ++qx)
-+            for (int i=0; i<2*D1D*(D1D - 1); ++i)
-             {
--               div[qx] = 0.0;
--               for (int qy = 0; qy < Q1D; ++qy)
--               {
--                  const double wy = (c == 0) ? Bo(qy,dy) : Gc(qy,dy);
--                  div[qx] += wy * wy * op(qx,qy,e);
--               }
-+               row[i] = 0;
-             }
- 
--            for (int dx = 0; dx < D1Dx; ++dx)
-+            for (int qy = 0; qy < Q1D; ++qy)
-             {
--               double val = 0.0;
-                for (int qx = 0; qx < Q1D; ++qx)
-                {
--                  const double wx = (c == 0) ? Gc(qx,dx) : Bo(qx,dx);
--                  val += div[qx] * wx * wx;
-+                  div[qy][qx] = op(qx,qy,e) * L2Bo(qx,rx) * L2Bo(qy,ry);
-                }
--               diag(dx + (dy * D1Dx) + osc, e) += val;
-             }
--         }
- 
--         osc += D1Dx * D1Dy;
--      }  // loop c
--   });
-+            for (int qy = 0; qy < Q1D; ++qy)
-+            {
-+               int osc = 0;
-+               for (int c = 0; c < VDIM; ++c)  // loop over x, y, z components
-+               {
-+                  const int D1Dy = (c == 1) ? D1D : D1D - 1;
-+                  const int D1Dx = (c == 0) ? D1D : D1D - 1;
-+
-+                  double aX[HDIV_MAX_D1D];
-+                  for (int dx = 0; dx < D1Dx; ++dx)
-+                  {
-+                     aX[dx] = 0;
-+                  }
-+                  for (int qx = 0; qx < Q1D; ++qx)
-+                  {
-+                     for (int dx = 0; dx < D1Dx; ++dx)
-+                     {
-+                        aX[dx] += div[qy][qx] * ((c == 0) ? Gct(dx,qx) :
-+                                                 Bot(dx,qx));
-+                     }
-+                  }
-+
-+                  for (int dy = 0; dy < D1Dy; ++dy)
-+                  {
-+                     const double wy = (c == 1) ? Gct(dy,qy) : Bot(dy,qy);
-+
-+                     for (int dx = 0; dx < D1Dx; ++dx)
-+                     {
-+                        row[dx + (dy * D1Dx) + osc] += aX[dx] * wy;
-+                     }
-+                  }
-+
-+                  osc += D1Dx * D1Dy;
-+               }  // loop c
-+            }  // loop qy
-+
-+            double val = 0.0;
-+            for (int i=0; i<2*D1D*(D1D - 1); ++i)
-+            {
-+               val += row[i] * row[i] * D(i,e);
-+            }
-+            diag(rx,ry,e) += val;
-+         }  // loop rx
-+      }  // loop ry
-+   }); // end of element loop
- }
- 
--static void PADivDivAssembleDiagonal3D(const int D1D,
--                                       const int Q1D,
--                                       const int NE,
--                                       const Array<double> &Bo_,
--                                       const Array<double> &Gc_,
--                                       const Vector &op_,
--                                       Vector &diag_)
-+MFEM_HOST_DEVICE inline
-+void PAHdivL2AssembleDiagonal_ADAt_3D(const int D1D,
-+                                      const int Q1D,
-+                                      const int L2D1D,
-+                                      const int NE,
-+                                      const Array<double> &L2Bo_,
-+                                      const Array<double> &Gct_,
-+                                      const Array<double> &Bot_,
-+                                      const Vector &op_,
-+                                      const Vector &D_,
-+                                      Vector &diag_)
- {
-    MFEM_VERIFY(D1D <= HDIV_MAX_D1D, "Error: D1D > HDIV_MAX_D1D");
-    MFEM_VERIFY(Q1D <= HDIV_MAX_Q1D, "Error: Q1D > HDIV_MAX_Q1D");
-    constexpr static int VDIM = 3;
- 
--   auto Bo = Reshape(Bo_.Read(), Q1D, D1D-1);
--   auto Gc = Reshape(Gc_.Read(), Q1D, D1D);
-+   auto L2Bo = Reshape(L2Bo_.Read(), Q1D, L2D1D);
-+   auto Gct = Reshape(Gct_.Read(), D1D, Q1D);
-+   auto Bot = Reshape(Bot_.Read(), D1D-1, Q1D);
-    auto op = Reshape(op_.Read(), Q1D, Q1D, Q1D, NE);
--   auto diag = Reshape(diag_.ReadWrite(), 3*(D1D-1)*(D1D-1)*D1D, NE);
-+   auto D = Reshape(D_.Read(), 3*(D1D-1)*(D1D-1)*D1D, NE);
-+   auto diag = Reshape(diag_.ReadWrite(), L2D1D, L2D1D, L2D1D, NE);
- 
-    mfem::forall(NE, [=] MFEM_HOST_DEVICE (int e)
-    {
--      int osc = 0;
--
--      for (int c = 0; c < VDIM; ++c)  // loop over x, y, z components
-+      for (int rz = 0; rz < L2D1D; ++rz)
-       {
--         const int D1Dz = (c == 2) ? D1D : D1D - 1;
--         const int D1Dy = (c == 1) ? D1D : D1D - 1;
--         const int D1Dx = (c == 0) ? D1D : D1D - 1;
--
--         for (int dz = 0; dz < D1Dz; ++dz)
-+         for (int ry = 0; ry < L2D1D; ++ry)
-          {
--            for (int dy = 0; dy < D1Dy; ++dy)
-+            for (int rx = 0; rx < L2D1D; ++rx)
-             {
--               double a[HDIV_MAX_Q1D];
-+               // Compute row (rx,ry,rz), assuming all contributions are from
-+               // a single element.
- 
--               for (int qx = 0; qx < Q1D; ++qx)
-+               double row[3*HDIV_MAX_D1D*(HDIV_MAX_D1D-1)*(HDIV_MAX_D1D-1)];
-+               double div[HDIV_MAX_Q1D][HDIV_MAX_Q1D][HDIV_MAX_Q1D];
-+
-+               for (int i=0; i<3*D1D*(D1D - 1)*(D1D - 1); ++i)
-+               {
-+                  row[i] = 0;
-+               }
-+
-+               for (int qz = 0; qz < Q1D; ++qz)
-                {
--                  a[qx] = 0.0;
-                   for (int qy = 0; qy < Q1D; ++qy)
-                   {
--                     const double wy = (c == 1) ? Gc(qy,dy) : Bo(qy,dy);
--
--                     for (int qz = 0; qz < Q1D; ++qz)
-+                     for (int qx = 0; qx < Q1D; ++qx)
-                      {
--                        const double wz = (c == 2) ? Gc(qz,dz) : Bo(qz,dz);
--                        a[qx] += wy * wy * wz * wz * op(qx,qy,qz,e);
-+                        div[qz][qy][qx] = op(qx,qy,qz,e) * L2Bo(qx,rx) *
-+                                          L2Bo(qy,ry) * L2Bo(qz,rz);
-                      }
-                   }
-                }
- 
--               for (int dx = 0; dx < D1Dx; ++dx)
-+               for (int qz = 0; qz < Q1D; ++qz)
-                {
--                  double val = 0.0;
--                  for (int qx = 0; qx < Q1D; ++qx)
--                  {
--                     const double wx = (c == 0) ? Gc(qx,dx) : Bo(qx,dx);
--                     val += a[qx] * wx * wx;
--                  }
--                  diag(dx + ((dy + (dz * D1Dy)) * D1Dx) + osc, e) += val;
--               }
--            }
--         }
--
--         osc += D1Dx * D1Dy * D1Dz;
--      }  // loop c
--   }); // end of element loop
--}
--
--void DivDivIntegrator::AssembleDiagonalPA(Vector& diag)
--{
--   if (dim == 3)
--   {
--      PADivDivAssembleDiagonal3D(dofs1D, quad1D, ne,
--                                 mapsO->B, mapsC->G, pa_data, diag);
--   }
--   else
--   {
--      PADivDivAssembleDiagonal2D(dofs1D, quad1D, ne,
--                                 mapsO->B, mapsC->G, pa_data, diag);
--   }
--}
--
--// PA H(div)-L2 (div u, p) assemble 2D kernel
--static void PADivL2Setup2D(const int Q1D,
--                           const int NE,
--                           const Array<double> &w,
--                           Vector &coeff_,
--                           Vector &op)
--{
--   const int NQ = Q1D*Q1D;
--   auto W = w.Read();
--   auto coeff = Reshape(coeff_.Read(), NQ, NE);
--   auto y = Reshape(op.Write(), NQ, NE);
--   mfem::forall(NE, [=] MFEM_HOST_DEVICE (int e)
--   {
--      for (int q = 0; q < NQ; ++q)
--      {
--         y(q,e) = W[q] * coeff(q,e);
--      }
--   });
--}
--
--static void PADivL2Setup3D(const int Q1D,
--                           const int NE,
--                           const Array<double> &w,
--                           Vector &coeff_,
--                           Vector &op)
--{
--   const int NQ = Q1D*Q1D*Q1D;
--   auto W = w.Read();
--   auto coeff = Reshape(coeff_.Read(), NQ, NE);
--   auto y = Reshape(op.Write(), NQ, NE);
--
--   mfem::forall(NE, [=] MFEM_HOST_DEVICE (int e)
--   {
--      for (int q = 0; q < NQ; ++q)
--      {
--         y(q,e) = W[q] * coeff(q, e);
--      }
--   });
--}
--
--void
--VectorFEDivergenceIntegrator::AssemblePA(const FiniteElementSpace &trial_fes,
--                                         const FiniteElementSpace &test_fes)
--{
--   // Assumes tensor-product elements, with a vector test space and
--   // scalar trial space.
--   Mesh *mesh = trial_fes.GetMesh();
--   const FiniteElement *trial_fel = trial_fes.GetFE(0);
--   const FiniteElement *test_fel = test_fes.GetFE(0);
--
--   const VectorTensorFiniteElement *trial_el =
--      dynamic_cast<const VectorTensorFiniteElement*>(trial_fel);
--   MFEM_VERIFY(trial_el != NULL, "Only VectorTensorFiniteElement is supported!");
--
--   const NodalTensorFiniteElement *test_el =
--      dynamic_cast<const NodalTensorFiniteElement*>(test_fel);
--   MFEM_VERIFY(test_el != NULL, "Only NodalTensorFiniteElement is supported!");
--
--   const IntegrationRule *ir = IntRule ? IntRule : &MassIntegrator::GetRule(
--                                  *trial_el, *trial_el,
--                                  *mesh->GetElementTransformation(0));
--
--   const int dims = trial_el->GetDim();
--   MFEM_VERIFY(dims == 2 || dims == 3, "");
--
--   const int nq = ir->GetNPoints();
--   dim = mesh->Dimension();
--   MFEM_VERIFY(dim == 2 || dim == 3, "");
--
--   MFEM_VERIFY(trial_el->GetOrder() == test_el->GetOrder() + 1, "");
--
--   ne = trial_fes.GetNE();
--   mapsC = &trial_el->GetDofToQuad(*ir, DofToQuad::TENSOR);
--   mapsO = &trial_el->GetDofToQuadOpen(*ir, DofToQuad::TENSOR);
--   dofs1D = mapsC->ndof;
--   quad1D = mapsC->nqpt;
--
--   L2mapsO = &test_el->GetDofToQuad(*ir, DofToQuad::TENSOR);
--   L2dofs1D = L2mapsO->ndof;
-+                  double aXY[HDIV_MAX_D1D][HDIV_MAX_D1D];
- 
--   MFEM_VERIFY(dofs1D == mapsO->ndof + 1 && quad1D == mapsO->nqpt, "");
--   if (dim == 2)
--   {
--      MFEM_VERIFY(nq == quad1D * quad1D, "");
--   }
--   else
--   {
--      MFEM_VERIFY(nq == quad1D * quad1D * quad1D, "");
--   }
-+                  int osc = 0;
-+                  for (int c = 0; c < VDIM; ++c)  // loop over x, y, z components
-+                  {
-+                     const int D1Dz = (c == 2) ? D1D : D1D - 1;
-+                     const int D1Dy = (c == 1) ? D1D : D1D - 1;
-+                     const int D1Dx = (c == 0) ? D1D : D1D - 1;
- 
--   pa_data.SetSize(nq * ne, Device::GetMemoryType());
-+                     for (int dy = 0; dy < D1Dy; ++dy)
-+                     {
-+                        for (int dx = 0; dx < D1Dx; ++dx)
-+                        {
-+                           aXY[dy][dx] = 0;
-+                        }
-+                     }
-+                     for (int qy = 0; qy < Q1D; ++qy)
-+                     {
-+                        double aX[HDIV_MAX_D1D];
-+                        for (int dx = 0; dx < D1Dx; ++dx)
-+                        {
-+                           aX[dx] = 0;
-+                        }
-+                        for (int qx = 0; qx < Q1D; ++qx)
-+                        {
-+                           for (int dx = 0; dx < D1Dx; ++dx)
-+                           {
-+                              aX[dx] += div[qz][qy][qx] * ((c == 0) ? Gct(dx,qx)
-+                                                           : Bot(dx,qx));
-+                           }
-+                        }
-+                        for (int dy = 0; dy < D1Dy; ++dy)
-+                        {
-+                           const double wy = (c == 1) ? Gct(dy,qy) : Bot(dy,qy);
-+                           for (int dx = 0; dx < D1Dx; ++dx)
-+                           {
-+                              aXY[dy][dx] += aX[dx] * wy;
-+                           }
-+                        }
-+                     }
- 
--   QuadratureSpace qs(*mesh, *ir);
--   CoefficientVector coeff(Q, qs, CoefficientStorage::FULL);
-+                     for (int dz = 0; dz < D1Dz; ++dz)
-+                     {
-+                        const double wz = (c == 2) ? Gct(dz,qz) : Bot(dz,qz);
-+                        for (int dy = 0; dy < D1Dy; ++dy)
-+                        {
-+                           for (int dx = 0; dx < D1Dx; ++dx)
-+                           {
-+                              row[dx + ((dy + (dz * D1Dy)) * D1Dx) + osc] +=
-+                                 aXY[dy][dx] * wz;
-+                           }
-+                        }
-+                     }
- 
--   if (test_el->GetMapType() == FiniteElement::INTEGRAL)
--   {
--      const GeometricFactors *geom =
--         mesh->GetGeometricFactors(*ir, GeometricFactors::DETERMINANTS);
--      coeff /= geom->detJ;
--   }
-+                     osc += D1Dx * D1Dy * D1Dz;
-+                  }  // loop c
-+               }  // loop qz
- 
--   if (trial_el->GetDerivType() == mfem::FiniteElement::DIV && dim == 3)
--   {
--      PADivL2Setup3D(quad1D, ne, ir->GetWeights(), coeff, pa_data);
--   }
--   else if (trial_el->GetDerivType() == mfem::FiniteElement::DIV && dim == 2)
--   {
--      PADivL2Setup2D(quad1D, ne, ir->GetWeights(), coeff, pa_data);
--   }
--   else
--   {
--      MFEM_ABORT("Unknown kernel.");
--   }
-+               double val = 0.0;
-+               for (int i=0; i<3*D1D*(D1D - 1)*(D1D - 1); ++i)
-+               {
-+                  val += row[i] * row[i] * D(i,e);
-+               }
-+               diag(rx,ry,rz,e) += val;
-+            }  // loop rx
-+         }  // loop ry
-+      }  // loop rz
-+   }); // end of element loop
- }
- 
- // Apply to x corresponding to DOFs in H(div) (trial), whose divergence is
- // integrated against L_2 test functions corresponding to y.
--static void PAHdivL2Apply3D(const int D1D,
--                            const int Q1D,
--                            const int L2D1D,
--                            const int NE,
--                            const Array<double> &Bo_,
--                            const Array<double> &Gc_,
--                            const Array<double> &L2Bot_,
--                            const Vector &op_,
--                            const Vector &x_,
--                            Vector &y_)
-+MFEM_HOST_DEVICE inline
-+void PAHdivL2Apply3D(const int D1D,
-+                     const int Q1D,
-+                     const int L2D1D,
-+                     const int NE,
-+                     const Array<double> &Bo_,
-+                     const Array<double> &Gc_,
-+                     const Array<double> &L2Bot_,
-+                     const Vector &op_,
-+                     const Vector &x_,
-+                     Vector &y_)
- {
-    MFEM_VERIFY(D1D <= HDIV_MAX_D1D, "Error: D1D > HDIV_MAX_D1D");
-    MFEM_VERIFY(Q1D <= HDIV_MAX_Q1D, "Error: Q1D > HDIV_MAX_Q1D");
-@@ -1962,16 +2049,17 @@ static void PAHdivL2Apply3D(const int D1D,
- 
- // Apply to x corresponding to DOFs in H(div) (trial), whose divergence is
- // integrated against L_2 test functions corresponding to y.
--static void PAHdivL2Apply2D(const int D1D,
--                            const int Q1D,
--                            const int L2D1D,
--                            const int NE,
--                            const Array<double> &Bo_,
--                            const Array<double> &Gc_,
--                            const Array<double> &L2Bot_,
--                            const Vector &op_,
--                            const Vector &x_,
--                            Vector &y_)
-+MFEM_HOST_DEVICE inline
-+void PAHdivL2Apply2D(const int D1D,
-+                     const int Q1D,
-+                     const int L2D1D,
-+                     const int NE,
-+                     const Array<double> &Bo_,
-+                     const Array<double> &Gc_,
-+                     const Array<double> &L2Bot_,
-+                     const Vector &op_,
-+                     const Vector &x_,
-+                     Vector &y_)
- {
-    constexpr static int VDIM = 2;
-    constexpr static int MAX_D1D = HDIV_MAX_D1D;
-@@ -2068,16 +2156,17 @@ static void PAHdivL2Apply2D(const int D1D,
-    }); // end of element loop
- }
- 
--static void PAHdivL2ApplyTranspose3D(const int D1D,
--                                     const int Q1D,
--                                     const int L2D1D,
--                                     const int NE,
--                                     const Array<double> &L2Bo_,
--                                     const Array<double> &Gct_,
--                                     const Array<double> &Bot_,
--                                     const Vector &op_,
--                                     const Vector &x_,
--                                     Vector &y_)
-+MFEM_HOST_DEVICE inline
-+void PAHdivL2ApplyTranspose3D(const int D1D,
-+                              const int Q1D,
-+                              const int L2D1D,
-+                              const int NE,
-+                              const Array<double> &L2Bo_,
-+                              const Array<double> &Gct_,
-+                              const Array<double> &Bot_,
-+                              const Vector &op_,
-+                              const Vector &x_,
-+                              Vector &y_)
- {
-    MFEM_VERIFY(D1D <= HDIV_MAX_D1D, "Error: D1D > HDIV_MAX_D1D");
-    MFEM_VERIFY(Q1D <= HDIV_MAX_Q1D, "Error: Q1D > HDIV_MAX_Q1D");
-@@ -2230,16 +2319,17 @@ static void PAHdivL2ApplyTranspose3D(const int D1D,
-    }); // end of element loop
- }
- 
--static void PAHdivL2ApplyTranspose2D(const int D1D,
--                                     const int Q1D,
--                                     const int L2D1D,
--                                     const int NE,
--                                     const Array<double> &L2Bo_,
--                                     const Array<double> &Gct_,
--                                     const Array<double> &Bot_,
--                                     const Vector &op_,
--                                     const Vector &x_,
--                                     Vector &y_)
-+MFEM_HOST_DEVICE inline
-+void PAHdivL2ApplyTranspose2D(const int D1D,
-+                              const int Q1D,
-+                              const int L2D1D,
-+                              const int NE,
-+                              const Array<double> &L2Bo_,
-+                              const Array<double> &Gct_,
-+                              const Array<double> &Bot_,
-+                              const Vector &op_,
-+                              const Vector &x_,
-+                              Vector &y_)
- {
-    constexpr static int VDIM = 2;
-    constexpr static int MAX_D1D = HDIV_MAX_D1D;
-@@ -2336,265 +2426,8 @@ static void PAHdivL2ApplyTranspose2D(const int D1D,
-    }); // end of element loop
- }
- 
--void VectorFEDivergenceIntegrator::AddMultPA(const Vector &x, Vector &y) const
--{
--   if (dim == 3)
--      PAHdivL2Apply3D(dofs1D, quad1D, L2dofs1D, ne, mapsO->B, mapsC->G,
--                      L2mapsO->Bt, pa_data, x, y);
--   else if (dim == 2)
--      PAHdivL2Apply2D(dofs1D, quad1D, L2dofs1D, ne, mapsO->B, mapsC->G,
--                      L2mapsO->Bt, pa_data, x, y);
--   else
--   {
--      MFEM_ABORT("Unsupported dimension!");
--   }
--}
--
--void VectorFEDivergenceIntegrator::AddMultTransposePA(const Vector &x,
--                                                      Vector &y) const
--{
--   if (dim == 3)
--      PAHdivL2ApplyTranspose3D(dofs1D, quad1D, L2dofs1D, ne, L2mapsO->B,
--                               mapsC->Gt, mapsO->Bt, pa_data, x, y);
--   else if (dim == 2)
--      PAHdivL2ApplyTranspose2D(dofs1D, quad1D, L2dofs1D, ne, L2mapsO->B,
--                               mapsC->Gt, mapsO->Bt, pa_data, x, y);
--   else
--   {
--      MFEM_ABORT("Unsupported dimension!");
--   }
--}
--
--static void PAHdivL2AssembleDiagonal_ADAt_3D(const int D1D,
--                                             const int Q1D,
--                                             const int L2D1D,
--                                             const int NE,
--                                             const Array<double> &L2Bo_,
--                                             const Array<double> &Gct_,
--                                             const Array<double> &Bot_,
--                                             const Vector &op_,
--                                             const Vector &D_,
--                                             Vector &diag_)
--{
--   MFEM_VERIFY(D1D <= HDIV_MAX_D1D, "Error: D1D > HDIV_MAX_D1D");
--   MFEM_VERIFY(Q1D <= HDIV_MAX_Q1D, "Error: Q1D > HDIV_MAX_Q1D");
--   constexpr static int VDIM = 3;
--
--   auto L2Bo = Reshape(L2Bo_.Read(), Q1D, L2D1D);
--   auto Gct = Reshape(Gct_.Read(), D1D, Q1D);
--   auto Bot = Reshape(Bot_.Read(), D1D-1, Q1D);
--   auto op = Reshape(op_.Read(), Q1D, Q1D, Q1D, NE);
--   auto D = Reshape(D_.Read(), 3*(D1D-1)*(D1D-1)*D1D, NE);
--   auto diag = Reshape(diag_.ReadWrite(), L2D1D, L2D1D, L2D1D, NE);
--
--   mfem::forall(NE, [=] MFEM_HOST_DEVICE (int e)
--   {
--      for (int rz = 0; rz < L2D1D; ++rz)
--      {
--         for (int ry = 0; ry < L2D1D; ++ry)
--         {
--            for (int rx = 0; rx < L2D1D; ++rx)
--            {
--               // Compute row (rx,ry,rz), assuming all contributions are from
--               // a single element.
--
--               double row[3*HDIV_MAX_D1D*(HDIV_MAX_D1D-1)*(HDIV_MAX_D1D-1)];
--               double div[HDIV_MAX_Q1D][HDIV_MAX_Q1D][HDIV_MAX_Q1D];
--
--               for (int i=0; i<3*D1D*(D1D - 1)*(D1D - 1); ++i)
--               {
--                  row[i] = 0;
--               }
--
--               for (int qz = 0; qz < Q1D; ++qz)
--               {
--                  for (int qy = 0; qy < Q1D; ++qy)
--                  {
--                     for (int qx = 0; qx < Q1D; ++qx)
--                     {
--                        div[qz][qy][qx] = op(qx,qy,qz,e) * L2Bo(qx,rx) *
--                                          L2Bo(qy,ry) * L2Bo(qz,rz);
--                     }
--                  }
--               }
--
--               for (int qz = 0; qz < Q1D; ++qz)
--               {
--                  double aXY[HDIV_MAX_D1D][HDIV_MAX_D1D];
--
--                  int osc = 0;
--                  for (int c = 0; c < VDIM; ++c)  // loop over x, y, z components
--                  {
--                     const int D1Dz = (c == 2) ? D1D : D1D - 1;
--                     const int D1Dy = (c == 1) ? D1D : D1D - 1;
--                     const int D1Dx = (c == 0) ? D1D : D1D - 1;
--
--                     for (int dy = 0; dy < D1Dy; ++dy)
--                     {
--                        for (int dx = 0; dx < D1Dx; ++dx)
--                        {
--                           aXY[dy][dx] = 0;
--                        }
--                     }
--                     for (int qy = 0; qy < Q1D; ++qy)
--                     {
--                        double aX[HDIV_MAX_D1D];
--                        for (int dx = 0; dx < D1Dx; ++dx)
--                        {
--                           aX[dx] = 0;
--                        }
--                        for (int qx = 0; qx < Q1D; ++qx)
--                        {
--                           for (int dx = 0; dx < D1Dx; ++dx)
--                           {
--                              aX[dx] += div[qz][qy][qx] * ((c == 0) ? Gct(dx,qx)
--                                                           : Bot(dx,qx));
--                           }
--                        }
--                        for (int dy = 0; dy < D1Dy; ++dy)
--                        {
--                           const double wy = (c == 1) ? Gct(dy,qy) : Bot(dy,qy);
--                           for (int dx = 0; dx < D1Dx; ++dx)
--                           {
--                              aXY[dy][dx] += aX[dx] * wy;
--                           }
--                        }
--                     }
--
--                     for (int dz = 0; dz < D1Dz; ++dz)
--                     {
--                        const double wz = (c == 2) ? Gct(dz,qz) : Bot(dz,qz);
--                        for (int dy = 0; dy < D1Dy; ++dy)
--                        {
--                           for (int dx = 0; dx < D1Dx; ++dx)
--                           {
--                              row[dx + ((dy + (dz * D1Dy)) * D1Dx) + osc] +=
--                                 aXY[dy][dx] * wz;
--                           }
--                        }
--                     }
--
--                     osc += D1Dx * D1Dy * D1Dz;
--                  }  // loop c
--               }  // loop qz
--
--               double val = 0.0;
--               for (int i=0; i<3*D1D*(D1D - 1)*(D1D - 1); ++i)
--               {
--                  val += row[i] * row[i] * D(i,e);
--               }
--               diag(rx,ry,rz,e) += val;
--            }  // loop rx
--         }  // loop ry
--      }  // loop rz
--   }); // end of element loop
--}
--
--static void PAHdivL2AssembleDiagonal_ADAt_2D(const int D1D,
--                                             const int Q1D,
--                                             const int L2D1D,
--                                             const int NE,
--                                             const Array<double> &L2Bo_,
--                                             const Array<double> &Gct_,
--                                             const Array<double> &Bot_,
--                                             const Vector &op_,
--                                             const Vector &D_,
--                                             Vector &diag_)
--{
--   constexpr static int VDIM = 2;
--
--   auto L2Bo = Reshape(L2Bo_.Read(), Q1D, L2D1D);
--   auto Gct = Reshape(Gct_.Read(), D1D, Q1D);
--   auto Bot = Reshape(Bot_.Read(), D1D-1, Q1D);
--   auto op = Reshape(op_.Read(), Q1D, Q1D, NE);
--   auto D = Reshape(D_.Read(), 2*(D1D-1)*D1D, NE);
--   auto diag = Reshape(diag_.ReadWrite(), L2D1D, L2D1D, NE);
--
--   mfem::forall(NE, [=] MFEM_HOST_DEVICE (int e)
--   {
--      for (int ry = 0; ry < L2D1D; ++ry)
--      {
--         for (int rx = 0; rx < L2D1D; ++rx)
--         {
--            // Compute row (rx,ry), assuming all contributions are from
--            // a single element.
--
--            double row[2*HDIV_MAX_D1D*(HDIV_MAX_D1D-1)];
--            double div[HDIV_MAX_Q1D][HDIV_MAX_Q1D];
--
--            for (int i=0; i<2*D1D*(D1D - 1); ++i)
--            {
--               row[i] = 0;
--            }
--
--            for (int qy = 0; qy < Q1D; ++qy)
--            {
--               for (int qx = 0; qx < Q1D; ++qx)
--               {
--                  div[qy][qx] = op(qx,qy,e) * L2Bo(qx,rx) * L2Bo(qy,ry);
--               }
--            }
--
--            for (int qy = 0; qy < Q1D; ++qy)
--            {
--               int osc = 0;
--               for (int c = 0; c < VDIM; ++c)  // loop over x, y, z components
--               {
--                  const int D1Dy = (c == 1) ? D1D : D1D - 1;
--                  const int D1Dx = (c == 0) ? D1D : D1D - 1;
--
--                  double aX[HDIV_MAX_D1D];
--                  for (int dx = 0; dx < D1Dx; ++dx)
--                  {
--                     aX[dx] = 0;
--                  }
--                  for (int qx = 0; qx < Q1D; ++qx)
--                  {
--                     for (int dx = 0; dx < D1Dx; ++dx)
--                     {
--                        aX[dx] += div[qy][qx] * ((c == 0) ? Gct(dx,qx) :
--                                                 Bot(dx,qx));
--                     }
--                  }
--
--                  for (int dy = 0; dy < D1Dy; ++dy)
--                  {
--                     const double wy = (c == 1) ? Gct(dy,qy) : Bot(dy,qy);
--
--                     for (int dx = 0; dx < D1Dx; ++dx)
--                     {
--                        row[dx + (dy * D1Dx) + osc] += aX[dx] * wy;
--                     }
--                  }
--
--                  osc += D1Dx * D1Dy;
--               }  // loop c
--            }  // loop qy
--
--            double val = 0.0;
--            for (int i=0; i<2*D1D*(D1D - 1); ++i)
--            {
--               val += row[i] * row[i] * D(i,e);
--            }
--            diag(rx,ry,e) += val;
--         }  // loop rx
--      }  // loop ry
--   }); // end of element loop
--}
--
--void VectorFEDivergenceIntegrator::AssembleDiagonalPA_ADAt(const Vector &D,
--                                                           Vector &diag)
--{
--   if (dim == 3)
--      PAHdivL2AssembleDiagonal_ADAt_3D(dofs1D, quad1D, L2dofs1D, ne, L2mapsO->B,
--                                       mapsC->Gt, mapsO->Bt, pa_data, D, diag);
--   else if (dim == 2)
--      PAHdivL2AssembleDiagonal_ADAt_2D(dofs1D, quad1D, L2dofs1D, ne, L2mapsO->B,
--                                       mapsC->Gt, mapsO->Bt, pa_data, D, diag);
--   else
--   {
--      MFEM_ABORT("Unsupported dimension!");
--   }
--}
-+} // namespace internal
- 
- } // namespace mfem
-+
-+#endif
-diff --git a/fem/integ/bilininteg_interp_pa.cpp b/fem/integ/bilininteg_interp_pa.cpp
-new file mode 100644
-index 000000000..3cac18c65
---- /dev/null
-+++ b/fem/integ/bilininteg_interp_pa.cpp
-@@ -0,0 +1,1937 @@
-+// Copyright (c) 2010-2023, Lawrence Livermore National Security, LLC. Produced
-+// at the Lawrence Livermore National Laboratory. All Rights reserved. See files
-+// LICENSE and NOTICE for details. LLNL-CODE-806117.
-+//
-+// This file is part of the MFEM library. For more information and source code
-+// availability visit https://mfem.org.
-+//
-+// MFEM is free software; you can redistribute it and/or modify it under the
-+// terms of the BSD-3 license. We welcome feedback and contributions, see file
-+// CONTRIBUTING.md for details.
-+
-+#include "../../general/forall.hpp"
-+#include "../bilininteg.hpp"
-+#include "../gridfunc.hpp"
-+#include "../qfunction.hpp"
-+
-+namespace mfem
-+{
-+
-+// Apply to x corresponding to DOFs in H^1 (domain) the (topological) gradient
-+// to get a dof in H(curl) (range). You can think of the range as the "test" space
-+// and the domain as the "trial" space, but there's no integration.
-+static void PAHcurlApplyGradient2D(const int c_dofs1D,
-+                                   const int o_dofs1D,
-+                                   const int NE,
-+                                   const Array<double> &B_,
-+                                   const Array<double> &G_,
-+                                   const Vector &x_,
-+                                   Vector &y_)
-+{
-+   auto B = Reshape(B_.Read(), c_dofs1D, c_dofs1D);
-+   auto G = Reshape(G_.Read(), o_dofs1D, c_dofs1D);
-+
-+   auto x = Reshape(x_.Read(), c_dofs1D, c_dofs1D, NE);
-+   auto y = Reshape(y_.ReadWrite(), 2 * c_dofs1D * o_dofs1D, NE);
-+
-+   constexpr static int MAX_D1D = HCURL_MAX_D1D;
-+   MFEM_VERIFY(c_dofs1D <= MAX_D1D && o_dofs1D <= c_dofs1D, "");
-+
-+   mfem::forall(NE, [=] MFEM_HOST_DEVICE (int e)
-+   {
-+      double w[MAX_D1D][MAX_D1D];
-+
-+      // horizontal part
-+      for (int dx = 0; dx < c_dofs1D; ++dx)
-+      {
-+         for (int ey = 0; ey < c_dofs1D; ++ey)
-+         {
-+            w[dx][ey] = 0.0;
-+            for (int dy = 0; dy < c_dofs1D; ++dy)
-+            {
-+               w[dx][ey] += B(ey, dy) * x(dx, dy, e);
-+            }
-+         }
-+      }
-+
-+      for (int ey = 0; ey < c_dofs1D; ++ey)
-+      {
-+         for (int ex = 0; ex < o_dofs1D; ++ex)
-+         {
-+            double s = 0.0;
-+            for (int dx = 0; dx < c_dofs1D; ++dx)
-+            {
-+               s += G(ex, dx) * w[dx][ey];
-+            }
-+            const int local_index = ey*o_dofs1D + ex;
-+            y(local_index, e) += s;
-+         }
-+      }
-+
-+      // vertical part
-+      for (int dx = 0; dx < c_dofs1D; ++dx)
-+      {
-+         for (int ey = 0; ey < o_dofs1D; ++ey)
-+         {
-+            w[dx][ey] = 0.0;
-+            for (int dy = 0; dy < c_dofs1D; ++dy)
-+            {
-+               w[dx][ey] += G(ey, dy) * x(dx, dy, e);
-+            }
-+         }
-+      }
-+
-+      for (int ey = 0; ey < o_dofs1D; ++ey)
-+      {
-+         for (int ex = 0; ex < c_dofs1D; ++ex)
-+         {
-+            double s = 0.0;
-+            for (int dx = 0; dx < c_dofs1D; ++dx)
-+            {
-+               s += B(ex, dx) * w[dx][ey];
-+            }
-+            const int local_index = c_dofs1D * o_dofs1D + ey*c_dofs1D + ex;
-+            y(local_index, e) += s;
-+         }
-+      }
-+   });
-+}
-+
-+// Specialization of PAHcurlApplyGradient2D to the case where B is identity
-+static void PAHcurlApplyGradient2DBId(const int c_dofs1D,
-+                                      const int o_dofs1D,
-+                                      const int NE,
-+                                      const Array<double> &G_,
-+                                      const Vector &x_,
-+                                      Vector &y_)
-+{
-+   auto G = Reshape(G_.Read(), o_dofs1D, c_dofs1D);
-+
-+   auto x = Reshape(x_.Read(), c_dofs1D, c_dofs1D, NE);
-+   auto y = Reshape(y_.ReadWrite(), 2 * c_dofs1D * o_dofs1D, NE);
-+
-+   constexpr static int MAX_D1D = HCURL_MAX_D1D;
-+   MFEM_VERIFY(c_dofs1D <= MAX_D1D && o_dofs1D <= c_dofs1D, "");
-+
-+   mfem::forall(NE, [=] MFEM_HOST_DEVICE (int e)
-+   {
-+      double w[MAX_D1D][MAX_D1D];
-+
-+      // horizontal part
-+      for (int dx = 0; dx < c_dofs1D; ++dx)
-+      {
-+         for (int ey = 0; ey < c_dofs1D; ++ey)
-+         {
-+            const int dy = ey;
-+            w[dx][ey] = x(dx, dy, e);
-+         }
-+      }
-+
-+      for (int ey = 0; ey < c_dofs1D; ++ey)
-+      {
-+         for (int ex = 0; ex < o_dofs1D; ++ex)
-+         {
-+            double s = 0.0;
-+            for (int dx = 0; dx < c_dofs1D; ++dx)
-+            {
-+               s += G(ex, dx) * w[dx][ey];
-+            }
-+            const int local_index = ey*o_dofs1D + ex;
-+            y(local_index, e) += s;
-+         }
-+      }
-+
-+      // vertical part
-+      for (int dx = 0; dx < c_dofs1D; ++dx)
-+      {
-+         for (int ey = 0; ey < o_dofs1D; ++ey)
-+         {
-+            w[dx][ey] = 0.0;
-+            for (int dy = 0; dy < c_dofs1D; ++dy)
-+            {
-+               w[dx][ey] += G(ey, dy) * x(dx, dy, e);
-+            }
-+         }
-+      }
-+
-+      for (int ey = 0; ey < o_dofs1D; ++ey)
-+      {
-+         for (int ex = 0; ex < c_dofs1D; ++ex)
-+         {
-+            const int dx = ex;
-+            const double s = w[dx][ey];
-+            const int local_index = c_dofs1D * o_dofs1D + ey*c_dofs1D + ex;
-+            y(local_index, e) += s;
-+         }
-+      }
-+   });
-+}
-+
-+static void PAHcurlApplyGradientTranspose2D(
-+   const int c_dofs1D, const int o_dofs1D, const int NE,
-+   const Array<double> &B_, const Array<double> &G_,
-+   const Vector &x_, Vector &y_)
-+{
-+   auto B = Reshape(B_.Read(), c_dofs1D, c_dofs1D);
-+   auto G = Reshape(G_.Read(), o_dofs1D, c_dofs1D);
-+
-+   auto x = Reshape(x_.Read(), 2 * c_dofs1D * o_dofs1D, NE);
-+   auto y = Reshape(y_.ReadWrite(), c_dofs1D, c_dofs1D, NE);
-+
-+   constexpr static int MAX_D1D = HCURL_MAX_D1D;
-+   MFEM_VERIFY(c_dofs1D <= MAX_D1D && o_dofs1D <= c_dofs1D, "");
-+
-+   mfem::forall(NE, [=] MFEM_HOST_DEVICE (int e)
-+   {
-+      double w[MAX_D1D][MAX_D1D];
-+
-+      // horizontal part (open x, closed y)
-+      for (int dy = 0; dy < c_dofs1D; ++dy)
-+      {
-+         for (int ex = 0; ex < o_dofs1D; ++ex)
-+         {
-+            w[dy][ex] = 0.0;
-+            for (int ey = 0; ey < c_dofs1D; ++ey)
-+            {
-+               const int local_index = ey*o_dofs1D + ex;
-+               w[dy][ex] += B(ey, dy) * x(local_index, e);
-+            }
-+         }
-+      }
-+
-+      for (int dy = 0; dy < c_dofs1D; ++dy)
-+      {
-+         for (int dx = 0; dx < c_dofs1D; ++dx)
-+         {
-+            double s = 0.0;
-+            for (int ex = 0; ex < o_dofs1D; ++ex)
-+            {
-+               s += G(ex, dx) * w[dy][ex];
-+            }
-+            y(dx, dy, e) += s;
-+         }
-+      }
-+
-+      // vertical part (open y, closed x)
-+      for (int dy = 0; dy < c_dofs1D; ++dy)
-+      {
-+         for (int ex = 0; ex < c_dofs1D; ++ex)
-+         {
-+            w[dy][ex] = 0.0;
-+            for (int ey = 0; ey < o_dofs1D; ++ey)
-+            {
-+               const int local_index = c_dofs1D * o_dofs1D + ey*c_dofs1D + ex;
-+               w[dy][ex] += G(ey, dy) * x(local_index, e);
-+            }
-+         }
-+      }
-+
-+      for (int dy = 0; dy < c_dofs1D; ++dy)
-+      {
-+         for (int dx = 0; dx < c_dofs1D; ++dx)
-+         {
-+            double s = 0.0;
-+            for (int ex = 0; ex < c_dofs1D; ++ex)
-+            {
-+               s += B(ex, dx) * w[dy][ex];
-+            }
-+            y(dx, dy, e) += s;
-+         }
-+      }
-+   });
-+}
-+
-+// Specialization of PAHcurlApplyGradientTranspose2D to the case where
-+// B is identity
-+static void PAHcurlApplyGradientTranspose2DBId(
-+   const int c_dofs1D, const int o_dofs1D, const int NE,
-+   const Array<double> &G_,
-+   const Vector &x_, Vector &y_)
-+{
-+   auto G = Reshape(G_.Read(), o_dofs1D, c_dofs1D);
-+
-+   auto x = Reshape(x_.Read(), 2 * c_dofs1D * o_dofs1D, NE);
-+   auto y = Reshape(y_.ReadWrite(), c_dofs1D, c_dofs1D, NE);
-+
-+   constexpr static int MAX_D1D = HCURL_MAX_D1D;
-+   MFEM_VERIFY(c_dofs1D <= MAX_D1D && o_dofs1D <= c_dofs1D, "");
-+
-+   mfem::forall(NE, [=] MFEM_HOST_DEVICE (int e)
-+   {
-+      double w[MAX_D1D][MAX_D1D];
-+
-+      // horizontal part (open x, closed y)
-+      for (int dy = 0; dy < c_dofs1D; ++dy)
-+      {
-+         for (int ex = 0; ex < o_dofs1D; ++ex)
-+         {
-+            const int ey = dy;
-+            const int local_index = ey*o_dofs1D + ex;
-+            w[dy][ex] = x(local_index, e);
-+         }
-+      }
-+
-+      for (int dy = 0; dy < c_dofs1D; ++dy)
-+      {
-+         for (int dx = 0; dx < c_dofs1D; ++dx)
-+         {
-+            double s = 0.0;
-+            for (int ex = 0; ex < o_dofs1D; ++ex)
-+            {
-+               s += G(ex, dx) * w[dy][ex];
-+            }
-+            y(dx, dy, e) += s;
-+         }
-+      }
-+
-+      // vertical part (open y, closed x)
-+      for (int dy = 0; dy < c_dofs1D; ++dy)
-+      {
-+         for (int ex = 0; ex < c_dofs1D; ++ex)
-+         {
-+            w[dy][ex] = 0.0;
-+            for (int ey = 0; ey < o_dofs1D; ++ey)
-+            {
-+               const int local_index = c_dofs1D * o_dofs1D + ey*c_dofs1D + ex;
-+               w[dy][ex] += G(ey, dy) * x(local_index, e);
-+            }
-+         }
-+      }
-+
-+      for (int dy = 0; dy < c_dofs1D; ++dy)
-+      {
-+         for (int dx = 0; dx < c_dofs1D; ++dx)
-+         {
-+            const int ex = dx;
-+            const double s = w[dy][ex];
-+            y(dx, dy, e) += s;
-+         }
-+      }
-+   });
-+}
-+
-+static void PAHcurlApplyGradient3D(const int c_dofs1D,
-+                                   const int o_dofs1D,
-+                                   const int NE,
-+                                   const Array<double> &B_,
-+                                   const Array<double> &G_,
-+                                   const Vector &x_,
-+                                   Vector &y_)
-+{
-+   auto B = Reshape(B_.Read(), c_dofs1D, c_dofs1D);
-+   auto G = Reshape(G_.Read(), o_dofs1D, c_dofs1D);
-+
-+   auto x = Reshape(x_.Read(), c_dofs1D, c_dofs1D, c_dofs1D, NE);
-+   auto y = Reshape(y_.ReadWrite(), (3 * c_dofs1D * c_dofs1D * o_dofs1D), NE);
-+
-+   constexpr static int MAX_D1D = HCURL_MAX_D1D;
-+   MFEM_VERIFY(c_dofs1D <= MAX_D1D && o_dofs1D <= c_dofs1D, "");
-+
-+   mfem::forall(NE, [=] MFEM_HOST_DEVICE (int e)
-+   {
-+      double w1[MAX_D1D][MAX_D1D][MAX_D1D];
-+      double w2[MAX_D1D][MAX_D1D][MAX_D1D];
-+
-+      // ---
-+      // dofs that point parallel to x-axis (open in x, closed in y, z)
-+      // ---
-+
-+      // contract in z
-+      for (int ez = 0; ez < c_dofs1D; ++ez)
-+      {
-+         for (int dx = 0; dx < c_dofs1D; ++dx)
-+         {
-+            for (int dy = 0; dy < c_dofs1D; ++dy)
-+            {
-+               w1[dx][dy][ez] = 0.0;
-+               for (int dz = 0; dz < c_dofs1D; ++dz)
-+               {
-+                  w1[dx][dy][ez] += B(ez, dz) * x(dx, dy, dz, e);
-+               }
-+            }
-+         }
-+      }
-+
-+      // contract in y
-+      for (int ez = 0; ez < c_dofs1D; ++ez)
-+      {
-+         for (int ey = 0; ey < c_dofs1D; ++ey)
-+         {
-+            for (int dx = 0; dx < c_dofs1D; ++dx)
-+            {
-+               w2[dx][ey][ez] = 0.0;
-+               for (int dy = 0; dy < c_dofs1D; ++dy)
-+               {
-+                  w2[dx][ey][ez] += B(ey, dy) * w1[dx][dy][ez];
-+               }
-+            }
-+         }
-+      }
-+
-+      // contract in x
-+      for (int ez = 0; ez < c_dofs1D; ++ez)
-+      {
-+         for (int ey = 0; ey < c_dofs1D; ++ey)
-+         {
-+            for (int ex = 0; ex < o_dofs1D; ++ex)
-+            {
-+               double s = 0.0;
-+               for (int dx = 0; dx < c_dofs1D; ++dx)
-+               {
-+                  s += G(ex, dx) * w2[dx][ey][ez];
-+               }
-+               const int local_index = ez*c_dofs1D*o_dofs1D + ey*o_dofs1D + ex;
-+               y(local_index, e) += s;
-+            }
-+         }
-+      }
-+
-+      // ---
-+      // dofs that point parallel to y-axis (open in y, closed in x, z)
-+      // ---
-+
-+      // contract in z
-+      for (int ez = 0; ez < c_dofs1D; ++ez)
-+      {
-+         for (int dx = 0; dx < c_dofs1D; ++dx)
-+         {
-+            for (int dy = 0; dy < c_dofs1D; ++dy)
-+            {
-+               w1[dx][dy][ez] = 0.0;
-+               for (int dz = 0; dz < c_dofs1D; ++dz)
-+               {
-+                  w1[dx][dy][ez] += B(ez, dz) * x(dx, dy, dz, e);
-+               }
-+            }
-+         }
-+      }
-+
-+      // contract in y
-+      for (int ez = 0; ez < c_dofs1D; ++ez)
-+      {
-+         for (int ey = 0; ey < o_dofs1D; ++ey)
-+         {
-+            for (int dx = 0; dx < c_dofs1D; ++dx)
-+            {
-+               w2[dx][ey][ez] = 0.0;
-+               for (int dy = 0; dy < c_dofs1D; ++dy)
-+               {
-+                  w2[dx][ey][ez] += G(ey, dy) * w1[dx][dy][ez];
-+               }
-+            }
-+         }
-+      }
-+
-+      // contract in x
-+      for (int ez = 0; ez < c_dofs1D; ++ez)
-+      {
-+         for (int ey = 0; ey < o_dofs1D; ++ey)
-+         {
-+            for (int ex = 0; ex < c_dofs1D; ++ex)
-+            {
-+               double s = 0.0;
-+               for (int dx = 0; dx < c_dofs1D; ++dx)
-+               {
-+                  s += B(ex, dx) * w2[dx][ey][ez];
-+               }
-+               const int local_index = c_dofs1D*c_dofs1D*o_dofs1D +
-+                                       ez*c_dofs1D*o_dofs1D + ey*c_dofs1D + ex;
-+               y(local_index, e) += s;
-+            }
-+         }
-+      }
-+
-+      // ---
-+      // dofs that point parallel to z-axis (open in z, closed in x, y)
-+      // ---
-+
-+      // contract in z
-+      for (int ez = 0; ez < o_dofs1D; ++ez)
-+      {
-+         for (int dx = 0; dx < c_dofs1D; ++dx)
-+         {
-+            for (int dy = 0; dy < c_dofs1D; ++dy)
-+            {
-+               w1[dx][dy][ez] = 0.0;
-+               for (int dz = 0; dz < c_dofs1D; ++dz)
-+               {
-+                  w1[dx][dy][ez] += G(ez, dz) * x(dx, dy, dz, e);
-+               }
-+            }
-+         }
-+      }
-+
-+      // contract in y
-+      for (int ez = 0; ez < o_dofs1D; ++ez)
-+      {
-+         for (int ey = 0; ey < c_dofs1D; ++ey)
-+         {
-+            for (int dx = 0; dx < c_dofs1D; ++dx)
-+            {
-+               w2[dx][ey][ez] = 0.0;
-+               for (int dy = 0; dy < c_dofs1D; ++dy)
-+               {
-+                  w2[dx][ey][ez] += B(ey, dy) * w1[dx][dy][ez];
-+               }
-+            }
-+         }
-+      }
-+
-+      // contract in x
-+      for (int ez = 0; ez < o_dofs1D; ++ez)
-+      {
-+         for (int ey = 0; ey < c_dofs1D; ++ey)
-+         {
-+            for (int ex = 0; ex < c_dofs1D; ++ex)
-+            {
-+               double s = 0.0;
-+               for (int dx = 0; dx < c_dofs1D; ++dx)
-+               {
-+                  s += B(ex, dx) * w2[dx][ey][ez];
-+               }
-+               const int local_index = 2*c_dofs1D*c_dofs1D*o_dofs1D +
-+                                       ez*c_dofs1D*c_dofs1D + ey*c_dofs1D + ex;
-+               y(local_index, e) += s;
-+            }
-+         }
-+      }
-+   });
-+}
-+
-+// Specialization of PAHcurlApplyGradient3D to the case where
-+static void PAHcurlApplyGradient3DBId(const int c_dofs1D,
-+                                      const int o_dofs1D,
-+                                      const int NE,
-+                                      const Array<double> &G_,
-+                                      const Vector &x_,
-+                                      Vector &y_)
-+{
-+   auto G = Reshape(G_.Read(), o_dofs1D, c_dofs1D);
-+
-+   auto x = Reshape(x_.Read(), c_dofs1D, c_dofs1D, c_dofs1D, NE);
-+   auto y = Reshape(y_.ReadWrite(), (3 * c_dofs1D * c_dofs1D * o_dofs1D), NE);
-+
-+   constexpr static int MAX_D1D = HCURL_MAX_D1D;
-+   MFEM_VERIFY(c_dofs1D <= MAX_D1D && o_dofs1D <= c_dofs1D, "");
-+
-+   mfem::forall(NE, [=] MFEM_HOST_DEVICE (int e)
-+   {
-+      double w1[MAX_D1D][MAX_D1D][MAX_D1D];
-+      double w2[MAX_D1D][MAX_D1D][MAX_D1D];
-+
-+      // ---
-+      // dofs that point parallel to x-axis (open in x, closed in y, z)
-+      // ---
-+
-+      // contract in z
-+      for (int ez = 0; ez < c_dofs1D; ++ez)
-+      {
-+         for (int dx = 0; dx < c_dofs1D; ++dx)
-+         {
-+            for (int dy = 0; dy < c_dofs1D; ++dy)
-+            {
-+               const int dz = ez;
-+               w1[dx][dy][ez] = x(dx, dy, dz, e);
-+            }
-+         }
-+      }
-+
-+      // contract in y
-+      for (int ez = 0; ez < c_dofs1D; ++ez)
-+      {
-+         for (int ey = 0; ey < c_dofs1D; ++ey)
-+         {
-+            for (int dx = 0; dx < c_dofs1D; ++dx)
-+            {
-+               const int dy = ey;
-+               w2[dx][ey][ez] = w1[dx][dy][ez];
-+            }
-+         }
-+      }
-+
-+      // contract in x
-+      for (int ez = 0; ez < c_dofs1D; ++ez)
-+      {
-+         for (int ey = 0; ey < c_dofs1D; ++ey)
-+         {
-+            for (int ex = 0; ex < o_dofs1D; ++ex)
-+            {
-+               double s = 0.0;
-+               for (int dx = 0; dx < c_dofs1D; ++dx)
-+               {
-+                  s += G(ex, dx) * w2[dx][ey][ez];
-+               }
-+               const int local_index = ez*c_dofs1D*o_dofs1D + ey*o_dofs1D + ex;
-+               y(local_index, e) += s;
-+            }
-+         }
-+      }
-+
-+      // ---
-+      // dofs that point parallel to y-axis (open in y, closed in x, z)
-+      // ---
-+
-+      // contract in z
-+      for (int ez = 0; ez < c_dofs1D; ++ez)
-+      {
-+         for (int dx = 0; dx < c_dofs1D; ++dx)
-+         {
-+            for (int dy = 0; dy < c_dofs1D; ++dy)
-+            {
-+               const int dz = ez;
-+               w1[dx][dy][ez] = x(dx, dy, dz, e);
-+            }
-+         }
-+      }
-+
-+      // contract in y
-+      for (int ez = 0; ez < c_dofs1D; ++ez)
-+      {
-+         for (int ey = 0; ey < o_dofs1D; ++ey)
-+         {
-+            for (int dx = 0; dx < c_dofs1D; ++dx)
-+            {
-+               w2[dx][ey][ez] = 0.0;
-+               for (int dy = 0; dy < c_dofs1D; ++dy)
-+               {
-+                  w2[dx][ey][ez] += G(ey, dy) * w1[dx][dy][ez];
-+               }
-+            }
-+         }
-+      }
-+
-+      // contract in x
-+      for (int ez = 0; ez < c_dofs1D; ++ez)
-+      {
-+         for (int ey = 0; ey < o_dofs1D; ++ey)
-+         {
-+            for (int ex = 0; ex < c_dofs1D; ++ex)
-+            {
-+               const int dx = ex;
-+               const double s = w2[dx][ey][ez];
-+               const int local_index = c_dofs1D*c_dofs1D*o_dofs1D +
-+                                       ez*c_dofs1D*o_dofs1D + ey*c_dofs1D + ex;
-+               y(local_index, e) += s;
-+            }
-+         }
-+      }
-+
-+      // ---
-+      // dofs that point parallel to z-axis (open in z, closed in x, y)
-+      // ---
-+
-+      // contract in z
-+      for (int ez = 0; ez < o_dofs1D; ++ez)
-+      {
-+         for (int dx = 0; dx < c_dofs1D; ++dx)
-+         {
-+            for (int dy = 0; dy < c_dofs1D; ++dy)
-+            {
-+               w1[dx][dy][ez] = 0.0;
-+               for (int dz = 0; dz < c_dofs1D; ++dz)
-+               {
-+                  w1[dx][dy][ez] += G(ez, dz) * x(dx, dy, dz, e);
-+               }
-+            }
-+         }
-+      }
-+
-+      // contract in y
-+      for (int ez = 0; ez < o_dofs1D; ++ez)
-+      {
-+         for (int ey = 0; ey < c_dofs1D; ++ey)
-+         {
-+            for (int dx = 0; dx < c_dofs1D; ++dx)
-+            {
-+               const int dy = ey;
-+               w2[dx][ey][ez] = w1[dx][dy][ez];
-+            }
-+         }
-+      }
-+
-+      // contract in x
-+      for (int ez = 0; ez < o_dofs1D; ++ez)
-+      {
-+         for (int ey = 0; ey < c_dofs1D; ++ey)
-+         {
-+            for (int ex = 0; ex < c_dofs1D; ++ex)
-+            {
-+               const int dx = ex;
-+               const double s = w2[dx][ey][ez];
-+               const int local_index = 2*c_dofs1D*c_dofs1D*o_dofs1D +
-+                                       ez*c_dofs1D*c_dofs1D + ey*c_dofs1D + ex;
-+               y(local_index, e) += s;
-+            }
-+         }
-+      }
-+   });
-+}
-+
-+static void PAHcurlApplyGradientTranspose3D(
-+   const int c_dofs1D, const int o_dofs1D, const int NE,
-+   const Array<double> &B_, const Array<double> &G_,
-+   const Vector &x_, Vector &y_)
-+{
-+   auto B = Reshape(B_.Read(), c_dofs1D, c_dofs1D);
-+   auto G = Reshape(G_.Read(), o_dofs1D, c_dofs1D);
-+
-+   auto x = Reshape(x_.Read(), (3 * c_dofs1D * c_dofs1D * o_dofs1D), NE);
-+   auto y = Reshape(y_.ReadWrite(), c_dofs1D, c_dofs1D, c_dofs1D, NE);
-+
-+   constexpr static int MAX_D1D = HCURL_MAX_D1D;
-+   MFEM_VERIFY(c_dofs1D <= MAX_D1D && o_dofs1D <= c_dofs1D, "");
-+
-+   mfem::forall(NE, [=] MFEM_HOST_DEVICE (int e)
-+   {
-+      double w1[MAX_D1D][MAX_D1D][MAX_D1D];
-+      double w2[MAX_D1D][MAX_D1D][MAX_D1D];
-+      // ---
-+      // dofs that point parallel to x-axis (open in x, closed in y, z)
-+      // ---
-+
-+      // contract in z
-+      for (int dz = 0; dz < c_dofs1D; ++dz)
-+      {
-+         for (int ex = 0; ex < o_dofs1D; ++ex)
-+         {
-+            for (int ey = 0; ey < c_dofs1D; ++ey)
-+            {
-+               w1[ex][ey][dz] = 0.0;
-+               for (int ez = 0; ez < c_dofs1D; ++ez)
-+               {
-+                  const int local_index = ez*c_dofs1D*o_dofs1D + ey*o_dofs1D + ex;
-+                  w1[ex][ey][dz] += B(ez, dz) * x(local_index, e);
-+               }
-+            }
-+         }
-+      }
-+
-+      // contract in y
-+      for (int dz = 0; dz < c_dofs1D; ++dz)
-+      {
-+         for (int dy = 0; dy < c_dofs1D; ++dy)
-+         {
-+            for (int ex = 0; ex < o_dofs1D; ++ex)
-+            {
-+               w2[ex][dy][dz] = 0.0;
-+               for (int ey = 0; ey < c_dofs1D; ++ey)
-+               {
-+                  w2[ex][dy][dz] += B(ey, dy) * w1[ex][ey][dz];
-+               }
-+            }
-+         }
-+      }
-+
-+      // contract in x
-+      for (int dz = 0; dz < c_dofs1D; ++dz)
-+      {
-+         for (int dy = 0; dy < c_dofs1D; ++dy)
-+         {
-+            for (int dx = 0; dx < c_dofs1D; ++dx)
-+            {
-+               double s = 0.0;
-+               for (int ex = 0; ex < o_dofs1D; ++ex)
-+               {
-+                  s += G(ex, dx) * w2[ex][dy][dz];
-+               }
-+               y(dx, dy, dz, e) += s;
-+            }
-+         }
-+      }
-+
-+      // ---
-+      // dofs that point parallel to y-axis (open in y, closed in x, z)
-+      // ---
-+
-+      // contract in z
-+      for (int dz = 0; dz < c_dofs1D; ++dz)
-+      {
-+         for (int ex = 0; ex < c_dofs1D; ++ex)
-+         {
-+            for (int ey = 0; ey < o_dofs1D; ++ey)
-+            {
-+               w1[ex][ey][dz] = 0.0;
-+               for (int ez = 0; ez < c_dofs1D; ++ez)
-+               {
-+                  const int local_index = c_dofs1D*c_dofs1D*o_dofs1D +
-+                                          ez*c_dofs1D*o_dofs1D + ey*c_dofs1D + ex;
-+                  w1[ex][ey][dz] += B(ez, dz) * x(local_index, e);
-+               }
-+            }
-+         }
-+      }
-+
-+      // contract in y
-+      for (int dz = 0; dz < c_dofs1D; ++dz)
-+      {
-+         for (int dy = 0; dy < c_dofs1D; ++dy)
-+         {
-+            for (int ex = 0; ex < c_dofs1D; ++ex)
-+            {
-+               w2[ex][dy][dz] = 0.0;
-+               for (int ey = 0; ey < o_dofs1D; ++ey)
-+               {
-+                  w2[ex][dy][dz] += G(ey, dy) * w1[ex][ey][dz];
-+               }
-+            }
-+         }
-+      }
-+
-+      // contract in x
-+      for (int dz = 0; dz < c_dofs1D; ++dz)
-+      {
-+         for (int dy = 0; dy < c_dofs1D; ++dy)
-+         {
-+            for (int dx = 0; dx < c_dofs1D; ++dx)
-+            {
-+               double s = 0.0;
-+               for (int ex = 0; ex < c_dofs1D; ++ex)
-+               {
-+                  s += B(ex, dx) * w2[ex][dy][dz];
-+               }
-+               y(dx, dy, dz, e) += s;
-+            }
-+         }
-+      }
-+
-+      // ---
-+      // dofs that point parallel to z-axis (open in z, closed in x, y)
-+      // ---
-+
-+      // contract in z
-+      for (int dz = 0; dz < c_dofs1D; ++dz)
-+      {
-+         for (int ex = 0; ex < c_dofs1D; ++ex)
-+         {
-+            for (int ey = 0; ey < c_dofs1D; ++ey)
-+            {
-+               w1[ex][ey][dz] = 0.0;
-+               for (int ez = 0; ez < o_dofs1D; ++ez)
-+               {
-+                  const int local_index = 2*c_dofs1D*c_dofs1D*o_dofs1D +
-+                                          ez*c_dofs1D*c_dofs1D + ey*c_dofs1D + ex;
-+                  w1[ex][ey][dz] += G(ez, dz) * x(local_index, e);
-+               }
-+            }
-+         }
-+      }
-+
-+      // contract in y
-+      for (int dz = 0; dz < c_dofs1D; ++dz)
-+      {
-+         for (int dy = 0; dy < c_dofs1D; ++dy)
-+         {
-+            for (int ex = 0; ex < c_dofs1D; ++ex)
-+            {
-+               w2[ex][dy][dz] = 0.0;
-+               for (int ey = 0; ey < c_dofs1D; ++ey)
-+               {
-+                  w2[ex][dy][dz] += B(ey, dy) * w1[ex][ey][dz];
-+               }
-+            }
-+         }
-+      }
-+
-+      // contract in x
-+      for (int dz = 0; dz < c_dofs1D; ++dz)
-+      {
-+         for (int dy = 0; dy < c_dofs1D; ++dy)
-+         {
-+            for (int dx = 0; dx < c_dofs1D; ++dx)
-+            {
-+               double s = 0.0;
-+               for (int ex = 0; ex < c_dofs1D; ++ex)
-+               {
-+                  s += B(ex, dx) * w2[ex][dy][dz];
-+               }
-+               y(dx, dy, dz, e) += s;
-+            }
-+         }
-+      }
-+   });
-+}
-+
-+// Specialization of PAHcurlApplyGradientTranspose3D to the case where
-+static void PAHcurlApplyGradientTranspose3DBId(
-+   const int c_dofs1D, const int o_dofs1D, const int NE,
-+   const Array<double> &G_,
-+   const Vector &x_, Vector &y_)
-+{
-+   auto G = Reshape(G_.Read(), o_dofs1D, c_dofs1D);
-+
-+   auto x = Reshape(x_.Read(), (3 * c_dofs1D * c_dofs1D * o_dofs1D), NE);
-+   auto y = Reshape(y_.ReadWrite(), c_dofs1D, c_dofs1D, c_dofs1D, NE);
-+
-+   constexpr static int MAX_D1D = HCURL_MAX_D1D;
-+   MFEM_VERIFY(c_dofs1D <= MAX_D1D && o_dofs1D <= c_dofs1D, "");
-+
-+   mfem::forall(NE, [=] MFEM_HOST_DEVICE (int e)
-+   {
-+      double w1[MAX_D1D][MAX_D1D][MAX_D1D];
-+      double w2[MAX_D1D][MAX_D1D][MAX_D1D];
-+      // ---
-+      // dofs that point parallel to x-axis (open in x, closed in y, z)
-+      // ---
-+
-+      // contract in z
-+      for (int dz = 0; dz < c_dofs1D; ++dz)
-+      {
-+         for (int ex = 0; ex < o_dofs1D; ++ex)
-+         {
-+            for (int ey = 0; ey < c_dofs1D; ++ey)
-+            {
-+               const int ez = dz;
-+               const int local_index = ez*c_dofs1D*o_dofs1D + ey*o_dofs1D + ex;
-+               w1[ex][ey][dz] = x(local_index, e);
-+            }
-+         }
-+      }
-+
-+      // contract in y
-+      for (int dz = 0; dz < c_dofs1D; ++dz)
-+      {
-+         for (int dy = 0; dy < c_dofs1D; ++dy)
-+         {
-+            for (int ex = 0; ex < o_dofs1D; ++ex)
-+            {
-+               const int ey = dy;
-+               w2[ex][dy][dz] = w1[ex][ey][dz];
-+            }
-+         }
-+      }
-+
-+      // contract in x
-+      for (int dz = 0; dz < c_dofs1D; ++dz)
-+      {
-+         for (int dy = 0; dy < c_dofs1D; ++dy)
-+         {
-+            for (int dx = 0; dx < c_dofs1D; ++dx)
-+            {
-+               double s = 0.0;
-+               for (int ex = 0; ex < o_dofs1D; ++ex)
-+               {
-+                  s += G(ex, dx) * w2[ex][dy][dz];
-+               }
-+               y(dx, dy, dz, e) += s;
-+            }
-+         }
-+      }
-+
-+      // ---
-+      // dofs that point parallel to y-axis (open in y, closed in x, z)
-+      // ---
-+
-+      // contract in z
-+      for (int dz = 0; dz < c_dofs1D; ++dz)
-+      {
-+         for (int ex = 0; ex < c_dofs1D; ++ex)
-+         {
-+            for (int ey = 0; ey < o_dofs1D; ++ey)
-+            {
-+               const int ez = dz;
-+               const int local_index = c_dofs1D*c_dofs1D*o_dofs1D +
-+                                       ez*c_dofs1D*o_dofs1D + ey*c_dofs1D + ex;
-+               w1[ex][ey][dz] = x(local_index, e);
-+            }
-+         }
-+      }
-+
-+      // contract in y
-+      for (int dz = 0; dz < c_dofs1D; ++dz)
-+      {
-+         for (int dy = 0; dy < c_dofs1D; ++dy)
-+         {
-+            for (int ex = 0; ex < c_dofs1D; ++ex)
-+            {
-+               w2[ex][dy][dz] = 0.0;
-+               for (int ey = 0; ey < o_dofs1D; ++ey)
-+               {
-+                  w2[ex][dy][dz] += G(ey, dy) * w1[ex][ey][dz];
-+               }
-+            }
-+         }
-+      }
-+
-+      // contract in x
-+      for (int dz = 0; dz < c_dofs1D; ++dz)
-+      {
-+         for (int dy = 0; dy < c_dofs1D; ++dy)
-+         {
-+            for (int dx = 0; dx < c_dofs1D; ++dx)
-+            {
-+               const int ex = dx;
-+               double s = w2[ex][dy][dz];
-+               y(dx, dy, dz, e) += s;
-+            }
-+         }
-+      }
-+
-+      // ---
-+      // dofs that point parallel to z-axis (open in z, closed in x, y)
-+      // ---
-+
-+      // contract in z
-+      for (int dz = 0; dz < c_dofs1D; ++dz)
-+      {
-+         for (int ex = 0; ex < c_dofs1D; ++ex)
-+         {
-+            for (int ey = 0; ey < c_dofs1D; ++ey)
-+            {
-+               w1[ex][ey][dz] = 0.0;
-+               for (int ez = 0; ez < o_dofs1D; ++ez)
-+               {
-+                  const int local_index = 2*c_dofs1D*c_dofs1D*o_dofs1D +
-+                                          ez*c_dofs1D*c_dofs1D + ey*c_dofs1D + ex;
-+                  w1[ex][ey][dz] += G(ez, dz) * x(local_index, e);
-+               }
-+            }
-+         }
-+      }
-+
-+      // contract in y
-+      for (int dz = 0; dz < c_dofs1D; ++dz)
-+      {
-+         for (int dy = 0; dy < c_dofs1D; ++dy)
-+         {
-+            for (int ex = 0; ex < c_dofs1D; ++ex)
-+            {
-+               const int ey = dy;
-+               w2[ex][dy][dz] = w1[ex][ey][dz];
-+            }
-+         }
-+      }
-+
-+      // contract in x
-+      for (int dz = 0; dz < c_dofs1D; ++dz)
-+      {
-+         for (int dy = 0; dy < c_dofs1D; ++dy)
-+         {
-+            for (int dx = 0; dx < c_dofs1D; ++dx)
-+            {
-+               const int ex = dx;
-+               double s = w2[ex][dy][dz];
-+               y(dx, dy, dz, e) += s;
-+            }
-+         }
-+      }
-+   });
-+}
-+
-+void GradientInterpolator::AssemblePA(const FiniteElementSpace &trial_fes,
-+                                      const FiniteElementSpace &test_fes)
-+{
-+   // Assumes tensor-product elements, with a vector test space and H^1 trial space.
-+   Mesh *mesh = trial_fes.GetMesh();
-+   const FiniteElement *trial_fel = trial_fes.GetFE(0);
-+   const FiniteElement *test_fel = test_fes.GetFE(0);
-+
-+   const NodalTensorFiniteElement *trial_el =
-+      dynamic_cast<const NodalTensorFiniteElement*>(trial_fel);
-+   MFEM_VERIFY(trial_el != NULL, "Only NodalTensorFiniteElement is supported!");
-+
-+   const VectorTensorFiniteElement *test_el =
-+      dynamic_cast<const VectorTensorFiniteElement*>(test_fel);
-+   MFEM_VERIFY(test_el != NULL, "Only VectorTensorFiniteElement is supported!");
-+
-+   const int dims = trial_el->GetDim();
-+   MFEM_VERIFY(dims == 2 || dims == 3, "Bad dimension!");
-+   dim = mesh->Dimension();
-+   MFEM_VERIFY(dim == 2 || dim == 3, "Bad dimension!");
-+   MFEM_VERIFY(trial_el->GetOrder() == test_el->GetOrder(),
-+               "Orders do not match!");
-+   ne = trial_fes.GetNE();
-+
-+   const int order = trial_el->GetOrder();
-+   dofquad_fe = new H1_SegmentElement(order, trial_el->GetBasisType());
-+   mfem::QuadratureFunctions1D qf1d;
-+   mfem::IntegrationRule closed_ir;
-+   closed_ir.SetSize(order + 1);
-+   qf1d.GaussLobatto(order + 1, &closed_ir);
-+   mfem::IntegrationRule open_ir;
-+   open_ir.SetSize(order);
-+   qf1d.GaussLegendre(order, &open_ir);
-+
-+   maps_O_C = &dofquad_fe->GetDofToQuad(open_ir, DofToQuad::TENSOR);
-+   o_dofs1D = maps_O_C->nqpt;
-+   if (trial_el->GetBasisType() == BasisType::GaussLobatto)
-+   {
-+      B_id = true;
-+      c_dofs1D = maps_O_C->ndof;
-+   }
-+   else
-+   {
-+      B_id = false;
-+      maps_C_C = &dofquad_fe->GetDofToQuad(closed_ir, DofToQuad::TENSOR);
-+      c_dofs1D = maps_C_C->nqpt;
-+   }
-+}
-+
-+void GradientInterpolator::AddMultPA(const Vector &x, Vector &y) const
-+{
-+   if (dim == 3)
-+   {
-+      if (B_id)
-+      {
-+         PAHcurlApplyGradient3DBId(c_dofs1D, o_dofs1D, ne,
-+                                   maps_O_C->G, x, y);
-+      }
-+      else
-+      {
-+         PAHcurlApplyGradient3D(c_dofs1D, o_dofs1D, ne, maps_C_C->B,
-+                                maps_O_C->G, x, y);
-+      }
-+   }
-+   else if (dim == 2)
-+   {
-+      if (B_id)
-+      {
-+         PAHcurlApplyGradient2DBId(c_dofs1D, o_dofs1D, ne,
-+                                   maps_O_C->G, x, y);
-+      }
-+      else
-+      {
-+         PAHcurlApplyGradient2D(c_dofs1D, o_dofs1D, ne, maps_C_C->B, maps_O_C->G,
-+                                x, y);
-+      }
-+   }
-+   else
-+   {
-+      mfem_error("Bad dimension!");
-+   }
-+}
-+
-+void GradientInterpolator::AddMultTransposePA(const Vector &x, Vector &y) const
-+{
-+   if (dim == 3)
-+   {
-+      if (B_id)
-+      {
-+         PAHcurlApplyGradientTranspose3DBId(c_dofs1D, o_dofs1D, ne,
-+                                            maps_O_C->G, x, y);
-+      }
-+      else
-+      {
-+         PAHcurlApplyGradientTranspose3D(c_dofs1D, o_dofs1D, ne, maps_C_C->B,
-+                                         maps_O_C->G, x, y);
-+      }
-+   }
-+   else if (dim == 2)
-+   {
-+      if (B_id)
-+      {
-+         PAHcurlApplyGradientTranspose2DBId(c_dofs1D, o_dofs1D, ne,
-+                                            maps_O_C->G, x, y);
-+      }
-+      else
-+      {
-+         PAHcurlApplyGradientTranspose2D(c_dofs1D, o_dofs1D, ne, maps_C_C->B,
-+                                         maps_O_C->G, x, y);
-+      }
-+   }
-+   else
-+   {
-+      mfem_error("Bad dimension!");
-+   }
-+}
-+
-+static void PAHcurlVecH1IdentityApply2D(const int c_dofs1D,
-+                                        const int o_dofs1D,
-+                                        const int NE,
-+                                        const Array<double> &Bclosed,
-+                                        const Array<double> &Bopen,
-+                                        const Vector &pa_data,
-+                                        const Vector &x_,
-+                                        Vector &y_)
-+{
-+   auto Bc = Reshape(Bclosed.Read(), c_dofs1D, c_dofs1D);
-+   auto Bo = Reshape(Bopen.Read(), o_dofs1D, c_dofs1D);
-+
-+   auto x = Reshape(x_.Read(), c_dofs1D, c_dofs1D, 2, NE);
-+   auto y = Reshape(y_.ReadWrite(), (2 * c_dofs1D * o_dofs1D), NE);
-+
-+   auto vk = Reshape(pa_data.Read(), 2, (2 * c_dofs1D * o_dofs1D), NE);
-+
-+   constexpr static int MAX_D1D = HCURL_MAX_D1D;
-+
-+   MFEM_VERIFY(c_dofs1D <= MAX_D1D && o_dofs1D <= c_dofs1D, "");
-+
-+   mfem::forall(NE, [=] MFEM_HOST_DEVICE (int e)
-+   {
-+      double w[2][MAX_D1D][MAX_D1D];
-+
-+      // dofs that point parallel to x-axis (open in x, closed in y)
-+
-+      // contract in y
-+      for (int ey = 0; ey < c_dofs1D; ++ey)
-+      {
-+         for (int dx = 0; dx < c_dofs1D; ++dx)
-+         {
-+            for (int j=0; j<2; ++j)
-+            {
-+               w[j][dx][ey] = 0.0;
-+               for (int dy = 0; dy < c_dofs1D; ++dy)
-+               {
-+                  w[j][dx][ey] += Bc(ey, dy) * x(dx, dy, j, e);
-+               }
-+            }
-+         }
-+      }
-+
-+      // contract in x
-+      for (int ey = 0; ey < c_dofs1D; ++ey)
-+      {
-+         for (int ex = 0; ex < o_dofs1D; ++ex)
-+         {
-+            for (int j=0; j<2; ++j)
-+            {
-+               double s = 0.0;
-+               for (int dx = 0; dx < c_dofs1D; ++dx)
-+               {
-+                  s += Bo(ex, dx) * w[j][dx][ey];
-+               }
-+               const int local_index = ey*o_dofs1D + ex;
-+               y(local_index, e) += s * vk(j, local_index, e);
-+            }
-+         }
-+      }
-+
-+      // dofs that point parallel to y-axis (open in y, closed in x)
-+
-+      // contract in y
-+      for (int ey = 0; ey < o_dofs1D; ++ey)
-+      {
-+         for (int dx = 0; dx < c_dofs1D; ++dx)
-+         {
-+            for (int j=0; j<2; ++j)
-+            {
-+               w[j][dx][ey] = 0.0;
-+               for (int dy = 0; dy < c_dofs1D; ++dy)
-+               {
-+                  w[j][dx][ey] += Bo(ey, dy) * x(dx, dy, j, e);
-+               }
-+            }
-+         }
-+      }
-+
-+      // contract in x
-+      for (int ey = 0; ey < o_dofs1D; ++ey)
-+      {
-+         for (int ex = 0; ex < c_dofs1D; ++ex)
-+         {
-+            for (int j=0; j<2; ++j)
-+            {
-+               double s = 0.0;
-+               for (int dx = 0; dx < c_dofs1D; ++dx)
-+               {
-+                  s += Bc(ex, dx) * w[j][dx][ey];
-+               }
-+               const int local_index = c_dofs1D*o_dofs1D + ey*c_dofs1D + ex;
-+               y(local_index, e) += s * vk(j, local_index, e);
-+            }
-+         }
-+      }
-+   });
-+}
-+
-+static void PAHcurlVecH1IdentityApplyTranspose2D(const int c_dofs1D,
-+                                                 const int o_dofs1D,
-+                                                 const int NE,
-+                                                 const Array<double> &Bclosed,
-+                                                 const Array<double> &Bopen,
-+                                                 const Vector &pa_data,
-+                                                 const Vector &x_,
-+                                                 Vector &y_)
-+{
-+   auto Bc = Reshape(Bclosed.Read(), c_dofs1D, c_dofs1D);
-+   auto Bo = Reshape(Bopen.Read(), o_dofs1D, c_dofs1D);
-+
-+   auto x = Reshape(x_.Read(), (2 * c_dofs1D * o_dofs1D), NE);
-+   auto y = Reshape(y_.ReadWrite(), c_dofs1D, c_dofs1D, 2, NE);
-+
-+   auto vk = Reshape(pa_data.Read(), 2, (2 * c_dofs1D * o_dofs1D), NE);
-+
-+   constexpr static int MAX_D1D = HCURL_MAX_D1D;
-+   //constexpr static int MAX_Q1D = HCURL_MAX_Q1D;
-+
-+   MFEM_VERIFY(c_dofs1D <= MAX_D1D && o_dofs1D <= c_dofs1D, "");
-+
-+   mfem::forall(NE, [=] MFEM_HOST_DEVICE (int e)
-+   {
-+      double w[2][MAX_D1D][MAX_D1D];
-+
-+      // dofs that point parallel to x-axis (open in x, closed in y)
-+
-+      // contract in x
-+      for (int ey = 0; ey < c_dofs1D; ++ey)
-+      {
-+         for (int dx = 0; dx < c_dofs1D; ++dx)
-+         {
-+            for (int j=0; j<2; ++j) { w[j][dx][ey] = 0.0; }
-+         }
-+         for (int ex = 0; ex < o_dofs1D; ++ex)
-+         {
-+            const int local_index = ey*o_dofs1D + ex;
-+            const double xd = x(local_index, e);
-+
-+            for (int dx = 0; dx < c_dofs1D; ++dx)
-+            {
-+               for (int j=0; j<2; ++j)
-+               {
-+                  w[j][dx][ey] += Bo(ex, dx) * xd * vk(j, local_index, e);
-+               }
-+            }
-+         }
-+      }
-+
-+      // contract in y
-+      for (int dx = 0; dx < c_dofs1D; ++dx)
-+      {
-+         for (int dy = 0; dy < c_dofs1D; ++dy)
-+         {
-+            for (int j=0; j<2; ++j)
-+            {
-+               double s = 0.0;
-+               for (int ey = 0; ey < c_dofs1D; ++ey)
-+               {
-+                  s += w[j][dx][ey] * Bc(ey, dy);
-+               }
-+               y(dx, dy, j, e) += s;
-+            }
-+         }
-+      }
-+
-+      // dofs that point parallel to y-axis (open in y, closed in x)
-+
-+      // contract in x
-+      for (int ey = 0; ey < o_dofs1D; ++ey)
-+      {
-+         for (int dx = 0; dx < c_dofs1D; ++dx)
-+         {
-+            for (int j=0; j<2; ++j) { w[j][dx][ey] = 0.0; }
-+         }
-+         for (int ex = 0; ex < c_dofs1D; ++ex)
-+         {
-+            const int local_index = c_dofs1D*o_dofs1D + ey*c_dofs1D + ex;
-+            const double xd = x(local_index, e);
-+            for (int dx = 0; dx < c_dofs1D; ++dx)
-+            {
-+               for (int j=0; j<2; ++j)
-+               {
-+                  w[j][dx][ey] += Bc(ex, dx) * xd * vk(j, local_index, e);
-+               }
-+            }
-+         }
-+      }
-+
-+      // contract in y
-+      for (int dx = 0; dx < c_dofs1D; ++dx)
-+      {
-+         for (int dy = 0; dy < c_dofs1D; ++dy)
-+         {
-+            for (int j=0; j<2; ++j)
-+            {
-+               double s = 0.0;
-+               for (int ey = 0; ey < o_dofs1D; ++ey)
-+               {
-+                  s += w[j][dx][ey] * Bo(ey, dy);
-+               }
-+               y(dx, dy, j, e) += s;
-+            }
-+         }
-+      }
-+   });
-+}
-+
-+static void PAHcurlVecH1IdentityApply3D(const int c_dofs1D,
-+                                        const int o_dofs1D,
-+                                        const int NE,
-+                                        const Array<double> &Bclosed,
-+                                        const Array<double> &Bopen,
-+                                        const Vector &pa_data,
-+                                        const Vector &x_,
-+                                        Vector &y_)
-+{
-+   auto Bc = Reshape(Bclosed.Read(), c_dofs1D, c_dofs1D);
-+   auto Bo = Reshape(Bopen.Read(), o_dofs1D, c_dofs1D);
-+
-+   auto x = Reshape(x_.Read(), c_dofs1D, c_dofs1D, c_dofs1D, 3, NE);
-+   auto y = Reshape(y_.ReadWrite(), (3 * c_dofs1D * c_dofs1D * o_dofs1D), NE);
-+
-+   auto vk = Reshape(pa_data.Read(), 3, (3 * c_dofs1D * c_dofs1D * o_dofs1D),
-+                     NE);
-+
-+   constexpr static int MAX_D1D = HCURL_MAX_D1D;
-+   MFEM_VERIFY(c_dofs1D <= MAX_D1D && o_dofs1D <= c_dofs1D, "");
-+
-+   mfem::forall(NE, [=] MFEM_HOST_DEVICE (int e)
-+   {
-+      double w1[3][MAX_D1D][MAX_D1D][MAX_D1D];
-+      double w2[3][MAX_D1D][MAX_D1D][MAX_D1D];
-+
-+      // dofs that point parallel to x-axis (open in x, closed in y, z)
-+
-+      // contract in z
-+      for (int ez = 0; ez < c_dofs1D; ++ez)
-+      {
-+         for (int dx = 0; dx < c_dofs1D; ++dx)
-+         {
-+            for (int dy = 0; dy < c_dofs1D; ++dy)
-+            {
-+               for (int j=0; j<3; ++j)
-+               {
-+                  w1[j][dx][dy][ez] = 0.0;
-+                  for (int dz = 0; dz < c_dofs1D; ++dz)
-+                  {
-+                     w1[j][dx][dy][ez] += Bc(ez, dz) * x(dx, dy, dz, j, e);
-+                  }
-+               }
-+            }
-+         }
-+      }
-+
-+      // contract in y
-+      for (int ez = 0; ez < c_dofs1D; ++ez)
-+      {
-+         for (int ey = 0; ey < c_dofs1D; ++ey)
-+         {
-+            for (int dx = 0; dx < c_dofs1D; ++dx)
-+            {
-+               for (int j=0; j<3; ++j)
-+               {
-+                  w2[j][dx][ey][ez] = 0.0;
-+                  for (int dy = 0; dy < c_dofs1D; ++dy)
-+                  {
-+                     w2[j][dx][ey][ez] += Bc(ey, dy) * w1[j][dx][dy][ez];
-+                  }
-+               }
-+            }
-+         }
-+      }
-+
-+      // contract in x
-+      for (int ez = 0; ez < c_dofs1D; ++ez)
-+      {
-+         for (int ey = 0; ey < c_dofs1D; ++ey)
-+         {
-+            for (int ex = 0; ex < o_dofs1D; ++ex)
-+            {
-+               for (int j=0; j<3; ++j)
-+               {
-+                  double s = 0.0;
-+                  for (int dx = 0; dx < c_dofs1D; ++dx)
-+                  {
-+                     s += Bo(ex, dx) * w2[j][dx][ey][ez];
-+                  }
-+                  const int local_index = ez*c_dofs1D*o_dofs1D + ey*o_dofs1D + ex;
-+                  y(local_index, e) += s * vk(j, local_index, e);
-+               }
-+            }
-+         }
-+      }
-+
-+      // dofs that point parallel to y-axis (open in y, closed in x, z)
-+
-+      // contract in z
-+      for (int ez = 0; ez < c_dofs1D; ++ez)
-+      {
-+         for (int dx = 0; dx < c_dofs1D; ++dx)
-+         {
-+            for (int dy = 0; dy < c_dofs1D; ++dy)
-+            {
-+               for (int j=0; j<3; ++j)
-+               {
-+                  w1[j][dx][dy][ez] = 0.0;
-+                  for (int dz = 0; dz < c_dofs1D; ++dz)
-+                  {
-+                     w1[j][dx][dy][ez] += Bc(ez, dz) * x(dx, dy, dz, j, e);
-+                  }
-+               }
-+            }
-+         }
-+      }
-+
-+      // contract in y
-+      for (int ez = 0; ez < c_dofs1D; ++ez)
-+      {
-+         for (int ey = 0; ey < o_dofs1D; ++ey)
-+         {
-+            for (int dx = 0; dx < c_dofs1D; ++dx)
-+            {
-+               for (int j=0; j<3; ++j)
-+               {
-+                  w2[j][dx][ey][ez] = 0.0;
-+                  for (int dy = 0; dy < c_dofs1D; ++dy)
-+                  {
-+                     w2[j][dx][ey][ez] += Bo(ey, dy) * w1[j][dx][dy][ez];
-+                  }
-+               }
-+            }
-+         }
-+      }
-+
-+      // contract in x
-+      for (int ez = 0; ez < c_dofs1D; ++ez)
-+      {
-+         for (int ey = 0; ey < o_dofs1D; ++ey)
-+         {
-+            for (int ex = 0; ex < c_dofs1D; ++ex)
-+            {
-+               for (int j=0; j<3; ++j)
-+               {
-+                  double s = 0.0;
-+                  for (int dx = 0; dx < c_dofs1D; ++dx)
-+                  {
-+                     s += Bc(ex, dx) * w2[j][dx][ey][ez];
-+                  }
-+                  const int local_index = c_dofs1D*c_dofs1D*o_dofs1D +
-+                                          ez*c_dofs1D*o_dofs1D + ey*c_dofs1D + ex;
-+                  y(local_index, e) += s * vk(j, local_index, e);
-+               }
-+            }
-+         }
-+      }
-+
-+      // dofs that point parallel to z-axis (open in z, closed in x, y)
-+
-+      // contract in z
-+      for (int ez = 0; ez < o_dofs1D; ++ez)
-+      {
-+         for (int dx = 0; dx < c_dofs1D; ++dx)
-+         {
-+            for (int dy = 0; dy < c_dofs1D; ++dy)
-+            {
-+               for (int j=0; j<3; ++j)
-+               {
-+                  w1[j][dx][dy][ez] = 0.0;
-+                  for (int dz = 0; dz < c_dofs1D; ++dz)
-+                  {
-+                     w1[j][dx][dy][ez] += Bo(ez, dz) * x(dx, dy, dz, j, e);
-+                  }
-+               }
-+            }
-+         }
-+      }
-+
-+      // contract in y
-+      for (int ez = 0; ez < o_dofs1D; ++ez)
-+      {
-+         for (int ey = 0; ey < c_dofs1D; ++ey)
-+         {
-+            for (int dx = 0; dx < c_dofs1D; ++dx)
-+            {
-+               for (int j=0; j<3; ++j)
-+               {
-+                  w2[j][dx][ey][ez] = 0.0;
-+                  for (int dy = 0; dy < c_dofs1D; ++dy)
-+                  {
-+                     w2[j][dx][ey][ez] += Bc(ey, dy) * w1[j][dx][dy][ez];
-+                  }
-+               }
-+            }
-+         }
-+      }
-+
-+      // contract in x
-+      for (int ez = 0; ez < o_dofs1D; ++ez)
-+      {
-+         for (int ey = 0; ey < c_dofs1D; ++ey)
-+         {
-+            for (int ex = 0; ex < c_dofs1D; ++ex)
-+            {
-+               for (int j=0; j<3; ++j)
-+               {
-+                  double s = 0.0;
-+                  for (int dx = 0; dx < c_dofs1D; ++dx)
-+                  {
-+                     s += Bc(ex, dx) * w2[j][dx][ey][ez];
-+                  }
-+                  const int local_index = 2*c_dofs1D*c_dofs1D*o_dofs1D +
-+                                          ez*c_dofs1D*c_dofs1D + ey*c_dofs1D + ex;
-+                  y(local_index, e) += s * vk(j, local_index, e);
-+               }
-+            }
-+         }
-+      }
-+   });
-+}
-+
-+static void PAHcurlVecH1IdentityApplyTranspose3D(const int c_dofs1D,
-+                                                 const int o_dofs1D,
-+                                                 const int NE,
-+                                                 const Array<double> &Bclosed,
-+                                                 const Array<double> &Bopen,
-+                                                 const Vector &pa_data,
-+                                                 const Vector &x_,
-+                                                 Vector &y_)
-+{
-+   auto Bc = Reshape(Bclosed.Read(), c_dofs1D, c_dofs1D);
-+   auto Bo = Reshape(Bopen.Read(), o_dofs1D, c_dofs1D);
-+
-+   auto x = Reshape(x_.Read(), (3 * c_dofs1D * c_dofs1D * o_dofs1D), NE);
-+   auto y = Reshape(y_.ReadWrite(), c_dofs1D, c_dofs1D, c_dofs1D, 3, NE);
-+
-+   auto vk = Reshape(pa_data.Read(), 3, (3 * c_dofs1D * c_dofs1D * o_dofs1D),
-+                     NE);
-+
-+   constexpr static int MAX_D1D = HCURL_MAX_D1D;
-+
-+   MFEM_VERIFY(c_dofs1D <= MAX_D1D && o_dofs1D <= c_dofs1D, "");
-+
-+   mfem::forall(NE, [=] MFEM_HOST_DEVICE (int e)
-+   {
-+      double w1[3][MAX_D1D][MAX_D1D][MAX_D1D];
-+      double w2[3][MAX_D1D][MAX_D1D][MAX_D1D];
-+
-+      // dofs that point parallel to x-axis (open in x, closed in y, z)
-+
-+      // contract in x
-+      for (int ez = 0; ez < c_dofs1D; ++ez)
-+      {
-+         for (int ey = 0; ey < c_dofs1D; ++ey)
-+         {
-+            for (int j=0; j<3; ++j)
-+            {
-+               for (int dx = 0; dx < c_dofs1D; ++dx)
-+               {
-+                  w2[j][dx][ey][ez] = 0.0;
-+               }
-+               for (int ex = 0; ex < o_dofs1D; ++ex)
-+               {
-+                  const int local_index = ez*c_dofs1D*o_dofs1D + ey*o_dofs1D + ex;
-+                  const double xv = x(local_index, e) * vk(j, local_index, e);
-+                  for (int dx = 0; dx < c_dofs1D; ++dx)
-+                  {
-+                     w2[j][dx][ey][ez] += xv * Bo(ex, dx);
-+                  }
-+               }
-+            }
-+         }
-+      }
-+
-+      // contract in y
-+      for (int ez = 0; ez < c_dofs1D; ++ez)
-+      {
-+         for (int dx = 0; dx < c_dofs1D; ++dx)
-+         {
-+            for (int dy = 0; dy < c_dofs1D; ++dy)
-+            {
-+               for (int j=0; j<3; ++j)
-+               {
-+                  w1[j][dx][dy][ez] = 0.0;
-+                  for (int ey = 0; ey < c_dofs1D; ++ey)
-+                  {
-+                     w1[j][dx][dy][ez] += w2[j][dx][ey][ez] * Bc(ey, dy);
-+                  }
-+               }
-+            }
-+         }
-+      }
-+
-+      // contract in z
-+      for (int dx = 0; dx < c_dofs1D; ++dx)
-+      {
-+         for (int dy = 0; dy < c_dofs1D; ++dy)
-+         {
-+            for (int dz = 0; dz < c_dofs1D; ++dz)
-+            {
-+               for (int j=0; j<3; ++j)
-+               {
-+                  double s = 0.0;
-+                  for (int ez = 0; ez < c_dofs1D; ++ez)
-+                  {
-+                     s += w1[j][dx][dy][ez] * Bc(ez, dz);
-+                  }
-+                  y(dx, dy, dz, j, e) += s;
-+               }
-+            }
-+         }
-+      }
-+
-+      // dofs that point parallel to y-axis (open in y, closed in x, z)
-+
-+      // contract in x
-+      for (int ez = 0; ez < c_dofs1D; ++ez)
-+      {
-+         for (int ey = 0; ey < o_dofs1D; ++ey)
-+         {
-+            for (int j=0; j<3; ++j)
-+            {
-+               for (int dx = 0; dx < c_dofs1D; ++dx)
-+               {
-+                  w2[j][dx][ey][ez] = 0.0;
-+               }
-+               for (int ex = 0; ex < c_dofs1D; ++ex)
-+               {
-+                  const int local_index = c_dofs1D*c_dofs1D*o_dofs1D +
-+                                          ez*c_dofs1D*o_dofs1D + ey*c_dofs1D + ex;
-+                  const double xv = x(local_index, e) * vk(j, local_index, e);
-+                  for (int dx = 0; dx < c_dofs1D; ++dx)
-+                  {
-+                     w2[j][dx][ey][ez] += xv * Bc(ex, dx);
-+                  }
-+               }
-+            }
-+         }
-+      }
-+
-+      // contract in y
-+      for (int ez = 0; ez < c_dofs1D; ++ez)
-+      {
-+         for (int dx = 0; dx < c_dofs1D; ++dx)
-+         {
-+            for (int dy = 0; dy < c_dofs1D; ++dy)
-+            {
-+               for (int j=0; j<3; ++j)
-+               {
-+                  w1[j][dx][dy][ez] = 0.0;
-+                  for (int ey = 0; ey < o_dofs1D; ++ey)
-+                  {
-+                     w1[j][dx][dy][ez] += w2[j][dx][ey][ez] * Bo(ey, dy);
-+                  }
-+               }
-+            }
-+         }
-+      }
-+
-+      // contract in z
-+      for (int dx = 0; dx < c_dofs1D; ++dx)
-+      {
-+         for (int dy = 0; dy < c_dofs1D; ++dy)
-+         {
-+            for (int dz = 0; dz < c_dofs1D; ++dz)
-+            {
-+               for (int j=0; j<3; ++j)
-+               {
-+                  double s = 0.0;
-+                  for (int ez = 0; ez < c_dofs1D; ++ez)
-+                  {
-+                     s += w1[j][dx][dy][ez] * Bc(ez, dz);
-+                  }
-+                  y(dx, dy, dz, j, e) += s;
-+               }
-+            }
-+         }
-+      }
-+
-+      // dofs that point parallel to z-axis (open in z, closed in x, y)
-+
-+      // contract in x
-+      for (int ez = 0; ez < o_dofs1D; ++ez)
-+      {
-+         for (int ey = 0; ey < c_dofs1D; ++ey)
-+         {
-+            for (int j=0; j<3; ++j)
-+            {
-+               for (int dx = 0; dx < c_dofs1D; ++dx)
-+               {
-+                  w2[j][dx][ey][ez] = 0.0;
-+               }
-+               for (int ex = 0; ex < c_dofs1D; ++ex)
-+               {
-+                  const int local_index = 2*c_dofs1D*c_dofs1D*o_dofs1D +
-+                                          ez*c_dofs1D*c_dofs1D + ey*c_dofs1D + ex;
-+                  const double xv = x(local_index, e) * vk(j, local_index, e);
-+                  for (int dx = 0; dx < c_dofs1D; ++dx)
-+                  {
-+                     w2[j][dx][ey][ez] += xv * Bc(ex, dx);
-+                  }
-+               }
-+            }
-+         }
-+      }
-+
-+      // contract in y
-+      for (int ez = 0; ez < o_dofs1D; ++ez)
-+      {
-+         for (int dx = 0; dx < c_dofs1D; ++dx)
-+         {
-+            for (int dy = 0; dy < c_dofs1D; ++dy)
-+            {
-+               for (int j=0; j<3; ++j)
-+               {
-+                  w1[j][dx][dy][ez] = 0.0;
-+                  for (int ey = 0; ey < c_dofs1D; ++ey)
-+                  {
-+                     w1[j][dx][dy][ez] += w2[j][dx][ey][ez] * Bc(ey, dy);
-+                  }
-+               }
-+            }
-+         }
-+      }
-+
-+      // contract in z
-+      for (int dx = 0; dx < c_dofs1D; ++dx)
-+      {
-+         for (int dy = 0; dy < c_dofs1D; ++dy)
-+         {
-+            for (int dz = 0; dz < c_dofs1D; ++dz)
-+            {
-+               for (int j=0; j<3; ++j)
-+               {
-+                  double s = 0.0;
-+                  for (int ez = 0; ez < o_dofs1D; ++ez)
-+                  {
-+                     s += w1[j][dx][dy][ez] * Bo(ez, dz);
-+                  }
-+                  y(dx, dy, dz, j, e) += s;
-+               }
-+            }
-+         }
-+      }
-+   });
-+}
-+
-+void IdentityInterpolator::AssemblePA(const FiniteElementSpace &trial_fes,
-+                                      const FiniteElementSpace &test_fes)
-+{
-+   // Assumes tensor-product elements, with a vector test space and H^1 trial space.
-+   Mesh *mesh = trial_fes.GetMesh();
-+   const FiniteElement *trial_fel = trial_fes.GetFE(0);
-+   const FiniteElement *test_fel = test_fes.GetFE(0);
-+
-+   const NodalTensorFiniteElement *trial_el =
-+      dynamic_cast<const NodalTensorFiniteElement*>(trial_fel);
-+   MFEM_VERIFY(trial_el != NULL, "Only NodalTensorFiniteElement is supported!");
-+
-+   const VectorTensorFiniteElement *test_el =
-+      dynamic_cast<const VectorTensorFiniteElement*>(test_fel);
-+   MFEM_VERIFY(test_el != NULL, "Only VectorTensorFiniteElement is supported!");
-+
-+   const int dims = trial_el->GetDim();
-+   MFEM_VERIFY(dims == 2 || dims == 3, "");
-+
-+   dim = mesh->Dimension();
-+   MFEM_VERIFY(dim == 2 || dim == 3, "");
-+
-+   MFEM_VERIFY(trial_el->GetOrder() == test_el->GetOrder(), "");
-+
-+   ne = trial_fes.GetNE();
-+
-+   const int order = trial_el->GetOrder();
-+   dofquad_fe = new H1_SegmentElement(order);
-+   mfem::QuadratureFunctions1D qf1d;
-+   mfem::IntegrationRule closed_ir;
-+   closed_ir.SetSize(order + 1);
-+   qf1d.GaussLobatto(order + 1, &closed_ir);
-+   mfem::IntegrationRule open_ir;
-+   open_ir.SetSize(order);
-+   qf1d.GaussLegendre(order, &open_ir);
-+
-+   maps_C_C = &dofquad_fe->GetDofToQuad(closed_ir, DofToQuad::TENSOR);
-+   maps_O_C = &dofquad_fe->GetDofToQuad(open_ir, DofToQuad::TENSOR);
-+
-+   o_dofs1D = maps_O_C->nqpt;
-+   c_dofs1D = maps_C_C->nqpt;
-+   MFEM_VERIFY(maps_O_C->ndof == c_dofs1D &&
-+               maps_C_C->ndof == c_dofs1D, "Discrepancy in the number of DOFs");
-+
-+   const int ndof_test = (dim == 3) ? 3 * c_dofs1D * c_dofs1D * o_dofs1D
-+                         : 2 * c_dofs1D * o_dofs1D;
-+
-+   const IntegrationRule & Nodes = test_el->GetNodes();
-+
-+   pa_data.SetSize(dim * ndof_test * ne, Device::GetMemoryType());
-+   auto op = Reshape(pa_data.HostWrite(), dim, ndof_test, ne);
-+
-+   const Array<int> &dofmap = test_el->GetDofMap();
-+
-+   if (dim == 3)
-+   {
-+      // Note that ND_HexahedronElement uses 6 vectors in tk rather than 3, with
-+      // the last 3 having negative signs. Here the signs are all positive, as
-+      // signs are applied in ElementRestriction.
-+
-+      const double tk[9] = { 1.,0.,0.,  0.,1.,0.,  0.,0.,1. };
-+
-+      for (int c=0; c<3; ++c)
-+      {
-+         for (int i=0; i<ndof_test/3; ++i)
-+         {
-+            const int d = (c*ndof_test/3) + i;
-+            // ND_HexahedronElement sets dof2tk = (dofmap < 0) ? 3+c : c, but here
-+            // no signs should be applied due to ElementRestriction.
-+            const int dof2tk = c;
-+            const int id = (dofmap[d] >= 0) ? dofmap[d] : -1 - dofmap[d];
-+
-+            for (int e=0; e<ne; ++e)
-+            {
-+               double v[3];
-+               ElementTransformation *tr = mesh->GetElementTransformation(e);
-+               tr->SetIntPoint(&Nodes.IntPoint(id));
-+               tr->Jacobian().Mult(tk + dof2tk*dim, v);
-+
-+               for (int j=0; j<3; ++j)
-+               {
-+                  op(j,d,e) = v[j];
-+               }
-+            }
-+         }
-+      }
-+   }
-+   else // 2D case
-+   {
-+      const double tk[4] = { 1.,0.,  0.,1. };
-+      for (int c=0; c<2; ++c)
-+      {
-+         for (int i=0; i<ndof_test/2; ++i)
-+         {
-+            const int d = (c*ndof_test/2) + i;
-+            // ND_QuadrilateralElement sets dof2tk = (dofmap < 0) ? 2+c : c, but here
-+            // no signs should be applied due to ElementRestriction.
-+            const int dof2tk = c;
-+            const int id = (dofmap[d] >= 0) ? dofmap[d] : -1 - dofmap[d];
-+
-+            for (int e=0; e<ne; ++e)
-+            {
-+               double v[2];
-+               ElementTransformation *tr = mesh->GetElementTransformation(e);
-+               tr->SetIntPoint(&Nodes.IntPoint(id));
-+               tr->Jacobian().Mult(tk + dof2tk*dim, v);
-+
-+               for (int j=0; j<2; ++j)
-+               {
-+                  op(j,d,e) = v[j];
-+               }
-+            }
-+         }
-+      }
-+   }
-+}
-+
-+void IdentityInterpolator::AddMultPA(const Vector &x, Vector &y) const
-+{
-+   if (dim == 3)
-+   {
-+      PAHcurlVecH1IdentityApply3D(c_dofs1D, o_dofs1D, ne, maps_C_C->B, maps_O_C->B,
-+                                  pa_data, x, y);
-+   }
-+   else if (dim == 2)
-+   {
-+      PAHcurlVecH1IdentityApply2D(c_dofs1D, o_dofs1D, ne, maps_C_C->B, maps_O_C->B,
-+                                  pa_data, x, y);
-+   }
-+   else
-+   {
-+      mfem_error("Bad dimension!");
-+   }
-+}
-+
-+void IdentityInterpolator::AddMultTransposePA(const Vector &x, Vector &y) const
-+{
-+   if (dim == 3)
-+   {
-+      PAHcurlVecH1IdentityApplyTranspose3D(c_dofs1D, o_dofs1D, ne, maps_C_C->B,
-+                                           maps_O_C->B, pa_data, x, y);
-+   }
-+   else if (dim == 2)
-+   {
-+      PAHcurlVecH1IdentityApplyTranspose2D(c_dofs1D, o_dofs1D, ne, maps_C_C->B,
-+                                           maps_O_C->B, pa_data, x, y);
-+   }
-+   else
-+   {
-+      mfem_error("Bad dimension!");
-+   }
-+}
-+
-+} // namespace mfem
-diff --git a/fem/bilininteg_mass_ea.cpp b/fem/integ/bilininteg_mass_ea.cpp
-similarity index 88%
-rename from fem/bilininteg_mass_ea.cpp
-rename to fem/integ/bilininteg_mass_ea.cpp
-index cb1e7e064..315b9da8f 100644
---- a/fem/bilininteg_mass_ea.cpp
-+++ b/fem/integ/bilininteg_mass_ea.cpp
-@@ -9,9 +9,9 @@
- // terms of the BSD-3 license. We welcome feedback and contributions, see file
- // CONTRIBUTING.md for details.
- 
--#include "../general/forall.hpp"
--#include "bilininteg.hpp"
--#include "gridfunc.hpp"
-+#include "../../general/forall.hpp"
-+#include "../bilininteg.hpp"
-+#include "../gridfunc.hpp"
- 
- namespace mfem
- {
-@@ -21,7 +21,6 @@ static void EAMassAssemble1D(const int NE,
-                              const Array<double> &basis,
-                              const Vector &padata,
-                              Vector &eadata,
--                             const bool add,
-                              const int d1d = 0,
-                              const int q1d = 0)
- {
-@@ -53,14 +52,7 @@ static void EAMassAssemble1D(const int NE,
-             {
-                val += r_Bi[k1] * r_Bj[k1] * D(k1, e);
-             }
--            if (add)
--            {
--               M(i1, j1, e) += val;
--            }
--            else
--            {
--               M(i1, j1, e) = val;
--            }
-+            M(i1, j1, e) += val;
-          }
-       }
-    });
-@@ -71,7 +63,6 @@ static void EAMassAssemble2D(const int NE,
-                              const Array<double> &basis,
-                              const Vector &padata,
-                              Vector &eadata,
--                             const bool add,
-                              const int d1d = 0,
-                              const int q1d = 0)
- {
-@@ -123,14 +114,7 @@ static void EAMassAssemble2D(const int NE,
-                                * s_D[k1][k2];
-                      }
-                   }
--                  if (add)
--                  {
--                     M(i1, i2, j1, j2, e) += val;
--                  }
--                  else
--                  {
--                     M(i1, i2, j1, j2, e) = val;
--                  }
-+                  M(i1, i2, j1, j2, e) += val;
-                }
-             }
-          }
-@@ -143,7 +127,6 @@ static void EAMassAssemble3D(const int NE,
-                              const Array<double> &basis,
-                              const Vector &padata,
-                              Vector &eadata,
--                             const bool add,
-                              const int d1d = 0,
-                              const int q1d = 0)
- {
-@@ -237,14 +220,7 @@ static void EAMassAssemble3D(const int NE,
-                               }
-                            }
-                         }
--                        if (add)
--                        {
--                           M(i1, i2, i3, j1, j2, j3, e) += val;
--                        }
--                        else
--                        {
--                           M(i1, i2, i3, j1, j2, j3, e) = val;
--                        }
-+                        M(i1, i2, i3, j1, j2, j3, e) += val;
-                      }
-                   }
-                }
-@@ -255,8 +231,7 @@ static void EAMassAssemble3D(const int NE,
- }
- 
- void MassIntegrator::AssembleEA(const FiniteElementSpace &fes,
--                                Vector &ea_data,
--                                const bool add)
-+                                Vector &ea_data)
- {
-    AssemblePA(fes);
-    ne = fes.GetMesh()->GetNE();
-@@ -265,15 +240,15 @@ void MassIntegrator::AssembleEA(const FiniteElementSpace &fes,
-    {
-       switch ((dofs1D << 4 ) | quad1D)
-       {
--         case 0x22: return EAMassAssemble1D<2,2>(ne,B,pa_data,ea_data,add);
--         case 0x33: return EAMassAssemble1D<3,3>(ne,B,pa_data,ea_data,add);
--         case 0x44: return EAMassAssemble1D<4,4>(ne,B,pa_data,ea_data,add);
--         case 0x55: return EAMassAssemble1D<5,5>(ne,B,pa_data,ea_data,add);
--         case 0x66: return EAMassAssemble1D<6,6>(ne,B,pa_data,ea_data,add);
--         case 0x77: return EAMassAssemble1D<7,7>(ne,B,pa_data,ea_data,add);
--         case 0x88: return EAMassAssemble1D<8,8>(ne,B,pa_data,ea_data,add);
--         case 0x99: return EAMassAssemble1D<9,9>(ne,B,pa_data,ea_data,add);
--         default:   return EAMassAssemble1D(ne,B,pa_data,ea_data,add,
-+         case 0x22: return EAMassAssemble1D<2,2>(ne,B,pa_data,ea_data);
-+         case 0x33: return EAMassAssemble1D<3,3>(ne,B,pa_data,ea_data);
-+         case 0x44: return EAMassAssemble1D<4,4>(ne,B,pa_data,ea_data);
-+         case 0x55: return EAMassAssemble1D<5,5>(ne,B,pa_data,ea_data);
-+         case 0x66: return EAMassAssemble1D<6,6>(ne,B,pa_data,ea_data);
-+         case 0x77: return EAMassAssemble1D<7,7>(ne,B,pa_data,ea_data);
-+         case 0x88: return EAMassAssemble1D<8,8>(ne,B,pa_data,ea_data);
-+         case 0x99: return EAMassAssemble1D<9,9>(ne,B,pa_data,ea_data);
-+         default:   return EAMassAssemble1D(ne,B,pa_data,ea_data,
-                                                dofs1D,quad1D);
-       }
-    }
-@@ -281,15 +256,15 @@ void MassIntegrator::AssembleEA(const FiniteElementSpace &fes,
-    {
-       switch ((dofs1D << 4 ) | quad1D)
-       {
--         case 0x22: return EAMassAssemble2D<2,2>(ne,B,pa_data,ea_data,add);
--         case 0x33: return EAMassAssemble2D<3,3>(ne,B,pa_data,ea_data,add);
--         case 0x44: return EAMassAssemble2D<4,4>(ne,B,pa_data,ea_data,add);
--         case 0x55: return EAMassAssemble2D<5,5>(ne,B,pa_data,ea_data,add);
--         case 0x66: return EAMassAssemble2D<6,6>(ne,B,pa_data,ea_data,add);
--         case 0x77: return EAMassAssemble2D<7,7>(ne,B,pa_data,ea_data,add);
--         case 0x88: return EAMassAssemble2D<8,8>(ne,B,pa_data,ea_data,add);
--         case 0x99: return EAMassAssemble2D<9,9>(ne,B,pa_data,ea_data,add);
--         default:   return EAMassAssemble2D(ne,B,pa_data,ea_data,add,
-+         case 0x22: return EAMassAssemble2D<2,2>(ne,B,pa_data,ea_data);
-+         case 0x33: return EAMassAssemble2D<3,3>(ne,B,pa_data,ea_data);
-+         case 0x44: return EAMassAssemble2D<4,4>(ne,B,pa_data,ea_data);
-+         case 0x55: return EAMassAssemble2D<5,5>(ne,B,pa_data,ea_data);
-+         case 0x66: return EAMassAssemble2D<6,6>(ne,B,pa_data,ea_data);
-+         case 0x77: return EAMassAssemble2D<7,7>(ne,B,pa_data,ea_data);
-+         case 0x88: return EAMassAssemble2D<8,8>(ne,B,pa_data,ea_data);
-+         case 0x99: return EAMassAssemble2D<9,9>(ne,B,pa_data,ea_data);
-+         default:   return EAMassAssemble2D(ne,B,pa_data,ea_data,
-                                                dofs1D,quad1D);
-       }
-    }
-@@ -297,14 +272,14 @@ void MassIntegrator::AssembleEA(const FiniteElementSpace &fes,
-    {
-       switch ((dofs1D << 4 ) | quad1D)
-       {
--         case 0x23: return EAMassAssemble3D<2,3>(ne,B,pa_data,ea_data,add);
--         case 0x34: return EAMassAssemble3D<3,4>(ne,B,pa_data,ea_data,add);
--         case 0x45: return EAMassAssemble3D<4,5>(ne,B,pa_data,ea_data,add);
--         case 0x56: return EAMassAssemble3D<5,6>(ne,B,pa_data,ea_data,add);
--         case 0x67: return EAMassAssemble3D<6,7>(ne,B,pa_data,ea_data,add);
--         case 0x78: return EAMassAssemble3D<7,8>(ne,B,pa_data,ea_data,add);
--         case 0x89: return EAMassAssemble3D<8,9>(ne,B,pa_data,ea_data,add);
--         default:   return EAMassAssemble3D(ne,B,pa_data,ea_data,add,
-+         case 0x23: return EAMassAssemble3D<2,3>(ne,B,pa_data,ea_data);
-+         case 0x34: return EAMassAssemble3D<3,4>(ne,B,pa_data,ea_data);
-+         case 0x45: return EAMassAssemble3D<4,5>(ne,B,pa_data,ea_data);
-+         case 0x56: return EAMassAssemble3D<5,6>(ne,B,pa_data,ea_data);
-+         case 0x67: return EAMassAssemble3D<6,7>(ne,B,pa_data,ea_data);
-+         case 0x78: return EAMassAssemble3D<7,8>(ne,B,pa_data,ea_data);
-+         case 0x89: return EAMassAssemble3D<8,9>(ne,B,pa_data,ea_data);
-+         default:   return EAMassAssemble3D(ne,B,pa_data,ea_data,
-                                                dofs1D,quad1D);
-       }
-    }
-diff --git a/fem/integ/bilininteg_mass_kernels.hpp b/fem/integ/bilininteg_mass_kernels.hpp
-new file mode 100644
-index 000000000..c26f242ef
---- /dev/null
-+++ b/fem/integ/bilininteg_mass_kernels.hpp
-@@ -0,0 +1,1329 @@
-+// Copyright (c) 2010-2023, Lawrence Livermore National Security, LLC. Produced
-+// at the Lawrence Livermore National Laboratory. All Rights reserved. See files
-+// LICENSE and NOTICE for details. LLNL-CODE-806117.
-+//
-+// This file is part of the MFEM library. For more information and source code
-+// availability visit https://mfem.org.
-+//
-+// MFEM is free software; you can redistribute it and/or modify it under the
-+// terms of the BSD-3 license. We welcome feedback and contributions, see file
-+// CONTRIBUTING.md for details.
-+
-+#ifndef MFEM_BILININTEG_MASS_KERNELS_HPP
-+#define MFEM_BILININTEG_MASS_KERNELS_HPP
-+
-+#include "../../config/config.hpp"
-+#include "../../general/forall.hpp"
-+#include "../../linalg/dtensor.hpp"
-+
-+namespace mfem
-+{
-+
-+namespace internal
-+{
-+
-+MFEM_HOST_DEVICE inline
-+void PAMassAssembleDiagonal1D(const int NE,
-+                              const Array<double> &b,
-+                              const Vector &d,
-+                              Vector &y,
-+                              const int D1D,
-+                              const int Q1D)
-+{
-+   MFEM_VERIFY(D1D <= MAX_D1D, "");
-+   MFEM_VERIFY(Q1D <= MAX_Q1D, "");
-+   auto B = Reshape(b.Read(), Q1D, D1D);
-+   auto D = Reshape(d.Read(), Q1D, NE);
-+   auto Y = Reshape(y.ReadWrite(), D1D, NE);
-+   mfem::forall(NE, [=] MFEM_HOST_DEVICE (int e)
-+   {
-+      for (int dx = 0; dx < D1D; ++dx)
-+      {
-+         Y(dx, e) = 0.0;
-+         for (int qx = 0; qx < Q1D; ++qx)
-+         {
-+            Y(dx, e) += B(qx, dx) * B(qx, dx) * D(qx, e);
-+         }
-+      }
-+   });
-+}
-+
-+template<int T_D1D = 0, int T_Q1D = 0>
-+MFEM_HOST_DEVICE inline
-+void PAMassAssembleDiagonal2D(const int NE,
-+                              const Array<double> &b,
-+                              const Vector &d,
-+                              Vector &y,
-+                              const int d1d = 0,
-+                              const int q1d = 0)
-+{
-+   const int D1D = T_D1D ? T_D1D : d1d;
-+   const int Q1D = T_Q1D ? T_Q1D : q1d;
-+   MFEM_VERIFY(D1D <= MAX_D1D, "");
-+   MFEM_VERIFY(Q1D <= MAX_Q1D, "");
-+   auto B = Reshape(b.Read(), Q1D, D1D);
-+   auto D = Reshape(d.Read(), Q1D, Q1D, NE);
-+   auto Y = Reshape(y.ReadWrite(), D1D, D1D, NE);
-+   mfem::forall(NE, [=] MFEM_HOST_DEVICE (int e)
-+   {
-+      const int D1D = T_D1D ? T_D1D : d1d;
-+      const int Q1D = T_Q1D ? T_Q1D : q1d;
-+      constexpr int MD1 = T_D1D ? T_D1D : MAX_D1D;
-+      constexpr int MQ1 = T_Q1D ? T_Q1D : MAX_Q1D;
-+      double QD[MQ1][MD1];
-+      for (int qx = 0; qx < Q1D; ++qx)
-+      {
-+         for (int dy = 0; dy < D1D; ++dy)
-+         {
-+            QD[qx][dy] = 0.0;
-+            for (int qy = 0; qy < Q1D; ++qy)
-+            {
-+               QD[qx][dy] += B(qy, dy) * B(qy, dy) * D(qx, qy, e);
-+            }
-+         }
-+      }
-+      for (int dy = 0; dy < D1D; ++dy)
-+      {
-+         for (int dx = 0; dx < D1D; ++dx)
-+         {
-+            for (int qx = 0; qx < Q1D; ++qx)
-+            {
-+               Y(dx,dy,e) += B(qx, dx) * B(qx, dx) * QD[qx][dy];
-+            }
-+         }
-+      }
-+   });
-+}
-+
-+template<int T_D1D = 0, int T_Q1D = 0, int T_NBZ = 0>
-+MFEM_HOST_DEVICE inline
-+void SmemPAMassAssembleDiagonal2D(const int NE,
-+                                  const Array<double> &b_,
-+                                  const Vector &d_,
-+                                  Vector &y_,
-+                                  const int d1d = 0,
-+                                  const int q1d = 0)
-+{
-+   const int D1D = T_D1D ? T_D1D : d1d;
-+   const int Q1D = T_Q1D ? T_Q1D : q1d;
-+   constexpr int NBZ = T_NBZ ? T_NBZ : 1;
-+   constexpr int MQ1 = T_Q1D ? T_Q1D : MAX_Q1D;
-+   constexpr int MD1 = T_D1D ? T_D1D : MAX_D1D;
-+   MFEM_VERIFY(D1D <= MD1, "");
-+   MFEM_VERIFY(Q1D <= MQ1, "");
-+   auto b = Reshape(b_.Read(), Q1D, D1D);
-+   auto D = Reshape(d_.Read(), Q1D, Q1D, NE);
-+   auto Y = Reshape(y_.ReadWrite(), D1D, D1D, NE);
-+   mfem::forall_2D_batch(NE, Q1D, Q1D, NBZ, [=] MFEM_HOST_DEVICE (int e)
-+   {
-+      const int tidz = MFEM_THREAD_ID(z);
-+      const int D1D = T_D1D ? T_D1D : d1d;
-+      const int Q1D = T_Q1D ? T_Q1D : q1d;
-+      constexpr int NBZ = T_NBZ ? T_NBZ : 1;
-+      constexpr int MQ1 = T_Q1D ? T_Q1D : MAX_Q1D;
-+      constexpr int MD1 = T_D1D ? T_D1D : MAX_D1D;
-+      MFEM_SHARED double B[MQ1][MD1];
-+      MFEM_SHARED double QDZ[NBZ][MQ1][MD1];
-+      double (*QD)[MD1] = (double (*)[MD1])(QDZ + tidz);
-+      if (tidz == 0)
-+      {
-+         MFEM_FOREACH_THREAD(d,y,D1D)
-+         {
-+            MFEM_FOREACH_THREAD(q,x,Q1D)
-+            {
-+               B[q][d] = b(q,d);
-+            }
-+         }
-+      }
-+      MFEM_SYNC_THREAD;
-+      MFEM_FOREACH_THREAD(qx,x,Q1D)
-+      {
-+         MFEM_FOREACH_THREAD(dy,y,D1D)
-+         {
-+            QD[qx][dy] = 0.0;
-+            for (int qy = 0; qy < Q1D; ++qy)
-+            {
-+               QD[qx][dy] += B[qy][dy] * B[qy][dy] * D(qx, qy, e);
-+            }
-+         }
-+      }
-+      MFEM_SYNC_THREAD;
-+      MFEM_FOREACH_THREAD(dy,y,D1D)
-+      {
-+         MFEM_FOREACH_THREAD(dx,x,D1D)
-+         {
-+            for (int qx = 0; qx < Q1D; ++qx)
-+            {
-+               // might need absolute values on next line
-+               Y(dx,dy,e) += B[qx][dx] * B[qx][dx] * QD[qx][dy];
-+            }
-+         }
-+      }
-+   });
-+}
-+
-+template<int T_D1D = 0, int T_Q1D = 0>
-+MFEM_HOST_DEVICE inline
-+void PAMassAssembleDiagonal3D(const int NE,
-+                              const Array<double> &b,
-+                              const Vector &d,
-+                              Vector &y,
-+                              const int d1d = 0,
-+                              const int q1d = 0)
-+{
-+   const int D1D = T_D1D ? T_D1D : d1d;
-+   const int Q1D = T_Q1D ? T_Q1D : q1d;
-+   MFEM_VERIFY(D1D <= MAX_D1D, "");
-+   MFEM_VERIFY(Q1D <= MAX_Q1D, "");
-+   auto B = Reshape(b.Read(), Q1D, D1D);
-+   auto D = Reshape(d.Read(), Q1D, Q1D, Q1D, NE);
-+   auto Y = Reshape(y.ReadWrite(), D1D, D1D, D1D, NE);
-+   mfem::forall(NE, [=] MFEM_HOST_DEVICE (int e)
-+   {
-+      const int D1D = T_D1D ? T_D1D : d1d;
-+      const int Q1D = T_Q1D ? T_Q1D : q1d;
-+      constexpr int MD1 = T_D1D ? T_D1D : MAX_D1D;
-+      constexpr int MQ1 = T_Q1D ? T_Q1D : MAX_Q1D;
-+      double QQD[MQ1][MQ1][MD1];
-+      double QDD[MQ1][MD1][MD1];
-+      for (int qx = 0; qx < Q1D; ++qx)
-+      {
-+         for (int qy = 0; qy < Q1D; ++qy)
-+         {
-+            for (int dz = 0; dz < D1D; ++dz)
-+            {
-+               QQD[qx][qy][dz] = 0.0;
-+               for (int qz = 0; qz < Q1D; ++qz)
-+               {
-+                  QQD[qx][qy][dz] += B(qz, dz) * B(qz, dz) * D(qx, qy, qz, e);
-+               }
-+            }
-+         }
-+      }
-+      for (int qx = 0; qx < Q1D; ++qx)
-+      {
-+         for (int dz = 0; dz < D1D; ++dz)
-+         {
-+            for (int dy = 0; dy < D1D; ++dy)
-+            {
-+               QDD[qx][dy][dz] = 0.0;
-+               for (int qy = 0; qy < Q1D; ++qy)
-+               {
-+                  QDD[qx][dy][dz] += B(qy, dy) * B(qy, dy) * QQD[qx][qy][dz];
-+               }
-+            }
-+         }
-+      }
-+      for (int dz = 0; dz < D1D; ++dz)
-+      {
-+         for (int dy = 0; dy < D1D; ++dy)
-+         {
-+            for (int dx = 0; dx < D1D; ++dx)
-+            {
-+               double t = 0.0;
-+               for (int qx = 0; qx < Q1D; ++qx)
-+               {
-+                  t += B(qx, dx) * B(qx, dx) * QDD[qx][dy][dz];
-+               }
-+               Y(dx, dy, dz, e) += t;
-+            }
-+         }
-+      }
-+   });
-+}
-+
-+template<int T_D1D = 0, int T_Q1D = 0>
-+MFEM_HOST_DEVICE inline
-+void SmemPAMassAssembleDiagonal3D(const int NE,
-+                                  const Array<double> &b_,
-+                                  const Vector &d_,
-+                                  Vector &y_,
-+                                  const int d1d = 0,
-+                                  const int q1d = 0)
-+{
-+   const int D1D = T_D1D ? T_D1D : d1d;
-+   const int Q1D = T_Q1D ? T_Q1D : q1d;
-+   constexpr int MQ1 = T_Q1D ? T_Q1D : MAX_Q1D;
-+   constexpr int MD1 = T_D1D ? T_D1D : MAX_D1D;
-+   MFEM_VERIFY(D1D <= MD1, "");
-+   MFEM_VERIFY(Q1D <= MQ1, "");
-+   auto b = Reshape(b_.Read(), Q1D, D1D);
-+   auto D = Reshape(d_.Read(), Q1D, Q1D, Q1D, NE);
-+   auto Y = Reshape(y_.ReadWrite(), D1D, D1D, D1D, NE);
-+   mfem::forall_3D(NE, Q1D, Q1D, Q1D, [=] MFEM_HOST_DEVICE (int e)
-+   {
-+      const int tidz = MFEM_THREAD_ID(z);
-+      const int D1D = T_D1D ? T_D1D : d1d;
-+      const int Q1D = T_Q1D ? T_Q1D : q1d;
-+      constexpr int MD1 = T_D1D ? T_D1D : MAX_D1D;
-+      constexpr int MQ1 = T_Q1D ? T_Q1D : MAX_Q1D;
-+      MFEM_SHARED double B[MQ1][MD1];
-+      MFEM_SHARED double QQD[MQ1][MQ1][MD1];
-+      MFEM_SHARED double QDD[MQ1][MD1][MD1];
-+      if (tidz == 0)
-+      {
-+         MFEM_FOREACH_THREAD(d,y,D1D)
-+         {
-+            MFEM_FOREACH_THREAD(q,x,Q1D)
-+            {
-+               B[q][d] = b(q,d);
-+            }
-+         }
-+      }
-+      MFEM_SYNC_THREAD;
-+      MFEM_FOREACH_THREAD(qx,x,Q1D)
-+      {
-+         MFEM_FOREACH_THREAD(qy,y,Q1D)
-+         {
-+            MFEM_FOREACH_THREAD(dz,z,D1D)
-+            {
-+               QQD[qx][qy][dz] = 0.0;
-+               for (int qz = 0; qz < Q1D; ++qz)
-+               {
-+                  QQD[qx][qy][dz] += B[qz][dz] * B[qz][dz] * D(qx, qy, qz, e);
-+               }
-+            }
-+         }
-+      }
-+      MFEM_SYNC_THREAD;
-+      MFEM_FOREACH_THREAD(qx,x,Q1D)
-+      {
-+         MFEM_FOREACH_THREAD(dz,z,D1D)
-+         {
-+            MFEM_FOREACH_THREAD(dy,y,D1D)
-+            {
-+               QDD[qx][dy][dz] = 0.0;
-+               for (int qy = 0; qy < Q1D; ++qy)
-+               {
-+                  QDD[qx][dy][dz] += B[qy][dy] * B[qy][dy] * QQD[qx][qy][dz];
-+               }
-+            }
-+         }
-+      }
-+      MFEM_SYNC_THREAD;
-+      MFEM_FOREACH_THREAD(dz,z,D1D)
-+      {
-+         MFEM_FOREACH_THREAD(dy,y,D1D)
-+         {
-+            MFEM_FOREACH_THREAD(dx,x,D1D)
-+            {
-+               double t = 0.0;
-+               for (int qx = 0; qx < Q1D; ++qx)
-+               {
-+                  t += B[qx][dx] * B[qx][dx] * QDD[qx][dy][dz];
-+               }
-+               Y(dx, dy, dz, e) += t;
-+            }
-+         }
-+      }
-+   });
-+}
-+
-+MFEM_HOST_DEVICE inline
-+void PAMassAssembleDiagonal(const int dim, const int D1D,
-+                            const int Q1D, const int NE,
-+                            const Array<double> &B,
-+                            const Vector &D,
-+                            Vector &Y)
-+{
-+   if (dim == 1)
-+   {
-+      return PAMassAssembleDiagonal1D(NE,B,D,Y,D1D,Q1D);
-+   }
-+   else if (dim == 2)
-+   {
-+      switch ((D1D << 4 ) | Q1D)
-+      {
-+         case 0x22: return SmemPAMassAssembleDiagonal2D<2,2,16>(NE,B,D,Y);
-+         case 0x33: return SmemPAMassAssembleDiagonal2D<3,3,16>(NE,B,D,Y);
-+         case 0x44: return SmemPAMassAssembleDiagonal2D<4,4,8>(NE,B,D,Y);
-+         case 0x55: return SmemPAMassAssembleDiagonal2D<5,5,8>(NE,B,D,Y);
-+         case 0x66: return SmemPAMassAssembleDiagonal2D<6,6,4>(NE,B,D,Y);
-+         case 0x77: return SmemPAMassAssembleDiagonal2D<7,7,4>(NE,B,D,Y);
-+         case 0x88: return SmemPAMassAssembleDiagonal2D<8,8,2>(NE,B,D,Y);
-+         case 0x99: return SmemPAMassAssembleDiagonal2D<9,9,2>(NE,B,D,Y);
-+         default:   return PAMassAssembleDiagonal2D(NE,B,D,Y,D1D,Q1D);
-+      }
-+   }
-+   else if (dim == 3)
-+   {
-+      switch ((D1D << 4 ) | Q1D)
-+      {
-+         case 0x23: return SmemPAMassAssembleDiagonal3D<2,3>(NE,B,D,Y);
-+         case 0x24: return SmemPAMassAssembleDiagonal3D<2,4>(NE,B,D,Y);
-+         case 0x26: return SmemPAMassAssembleDiagonal3D<2,6>(NE,B,D,Y);
-+         case 0x34: return SmemPAMassAssembleDiagonal3D<3,4>(NE,B,D,Y);
-+         case 0x35: return SmemPAMassAssembleDiagonal3D<3,5>(NE,B,D,Y);
-+         case 0x45: return SmemPAMassAssembleDiagonal3D<4,5>(NE,B,D,Y);
-+         case 0x48: return SmemPAMassAssembleDiagonal3D<4,8>(NE,B,D,Y);
-+         case 0x56: return SmemPAMassAssembleDiagonal3D<5,6>(NE,B,D,Y);
-+         case 0x67: return SmemPAMassAssembleDiagonal3D<6,7>(NE,B,D,Y);
-+         case 0x78: return SmemPAMassAssembleDiagonal3D<7,8>(NE,B,D,Y);
-+         case 0x89: return SmemPAMassAssembleDiagonal3D<8,9>(NE,B,D,Y);
-+         default:   return PAMassAssembleDiagonal3D(NE,B,D,Y,D1D,Q1D);
-+      }
-+   }
-+   MFEM_ABORT("Unknown kernel.");
-+}
-+
-+#ifdef MFEM_USE_OCCA
-+// OCCA PA Mass Apply 2D kernel
-+MFEM_HOST_DEVICE inline
-+void OccaPAMassApply2D(const int D1D,
-+                       const int Q1D,
-+                       const int NE,
-+                       const Array<double> &B,
-+                       const Array<double> &Bt,
-+                       const Vector &D,
-+                       const Vector &X,
-+                       Vector &Y)
-+{
-+   occa::properties props;
-+   props["defines/D1D"] = D1D;
-+   props["defines/Q1D"] = Q1D;
-+   const occa::memory o_B = OccaMemoryRead(B.GetMemory(), B.Size());
-+   const occa::memory o_Bt = OccaMemoryRead(Bt.GetMemory(), Bt.Size());
-+   const occa::memory o_D = OccaMemoryRead(D.GetMemory(), D.Size());
-+   const occa::memory o_X = OccaMemoryRead(X.GetMemory(), X.Size());
-+   occa::memory o_Y = OccaMemoryReadWrite(Y.GetMemory(), Y.Size());
-+   const occa_id_t id = std::make_pair(D1D,Q1D);
-+   if (!Device::Allows(Backend::OCCA_CUDA))
-+   {
-+      static occa_kernel_t OccaMassApply2D_cpu;
-+      if (OccaMassApply2D_cpu.find(id) == OccaMassApply2D_cpu.end())
-+      {
-+         const occa::kernel MassApply2D_CPU =
-+            mfem::OccaDev().buildKernel("occa://mfem/fem/occa.okl",
-+                                        "MassApply2D_CPU", props);
-+         OccaMassApply2D_cpu.emplace(id, MassApply2D_CPU);
-+      }
-+      OccaMassApply2D_cpu.at(id)(NE, o_B, o_Bt, o_D, o_X, o_Y);
-+   }
-+   else
-+   {
-+      static occa_kernel_t OccaMassApply2D_gpu;
-+      if (OccaMassApply2D_gpu.find(id) == OccaMassApply2D_gpu.end())
-+      {
-+         const occa::kernel MassApply2D_GPU =
-+            mfem::OccaDev().buildKernel("occa://mfem/fem/occa.okl",
-+                                        "MassApply2D_GPU", props);
-+         OccaMassApply2D_gpu.emplace(id, MassApply2D_GPU);
-+      }
-+      OccaMassApply2D_gpu.at(id)(NE, o_B, o_Bt, o_D, o_X, o_Y);
-+   }
-+}
-+
-+// OCCA PA Mass Apply 3D kernel
-+MFEM_HOST_DEVICE inline
-+void OccaPAMassApply3D(const int D1D,
-+                       const int Q1D,
-+                       const int NE,
-+                       const Array<double> &B,
-+                       const Array<double> &Bt,
-+                       const Vector &D,
-+                       const Vector &X,
-+                       Vector &Y)
-+{
-+   occa::properties props;
-+   props["defines/D1D"] = D1D;
-+   props["defines/Q1D"] = Q1D;
-+   const occa::memory o_B = OccaMemoryRead(B.GetMemory(), B.Size());
-+   const occa::memory o_Bt = OccaMemoryRead(Bt.GetMemory(), Bt.Size());
-+   const occa::memory o_D = OccaMemoryRead(D.GetMemory(), D.Size());
-+   const occa::memory o_X = OccaMemoryRead(X.GetMemory(), X.Size());
-+   occa::memory o_Y = OccaMemoryReadWrite(Y.GetMemory(), Y.Size());
-+   const occa_id_t id = std::make_pair(D1D,Q1D);
-+   if (!Device::Allows(Backend::OCCA_CUDA))
-+   {
-+      static occa_kernel_t OccaMassApply3D_cpu;
-+      if (OccaMassApply3D_cpu.find(id) == OccaMassApply3D_cpu.end())
-+      {
-+         const occa::kernel MassApply3D_CPU =
-+            mfem::OccaDev().buildKernel("occa://mfem/fem/occa.okl",
-+                                        "MassApply3D_CPU", props);
-+         OccaMassApply3D_cpu.emplace(id, MassApply3D_CPU);
-+      }
-+      OccaMassApply3D_cpu.at(id)(NE, o_B, o_Bt, o_D, o_X, o_Y);
-+   }
-+   else
-+   {
-+      static occa_kernel_t OccaMassApply3D_gpu;
-+      if (OccaMassApply3D_gpu.find(id) == OccaMassApply3D_gpu.end())
-+      {
-+         const occa::kernel MassApply3D_GPU =
-+            mfem::OccaDev().buildKernel("occa://mfem/fem/occa.okl",
-+                                        "MassApply3D_GPU", props);
-+         OccaMassApply3D_gpu.emplace(id, MassApply3D_GPU);
-+      }
-+      OccaMassApply3D_gpu.at(id)(NE, o_B, o_Bt, o_D, o_X, o_Y);
-+   }
-+}
-+#endif // MFEM_USE_OCCA
-+
-+MFEM_HOST_DEVICE inline
-+void PAMassApply1D_Element(const int e,
-+                           const int NE,
-+                           const double *b_,
-+                           const double *bt_,
-+                           const double *d_,
-+                           const double *x_,
-+                           double *y_,
-+                           const int d1d = 0,
-+                           const int q1d = 0)
-+{
-+   const int D1D = d1d;
-+   const int Q1D = q1d;
-+   auto B = ConstDeviceMatrix(b_, Q1D, D1D);
-+   auto Bt = ConstDeviceMatrix(bt_, D1D, Q1D);
-+   auto D = ConstDeviceMatrix(d_, Q1D, NE);
-+   auto X = ConstDeviceMatrix(x_, D1D, NE);
-+   auto Y = DeviceMatrix(y_, D1D, NE);
-+
-+   constexpr int max_Q1D = MAX_Q1D;
-+   double XQ[max_Q1D];
-+   for (int qx = 0; qx < Q1D; ++qx)
-+   {
-+      XQ[qx] = 0.0;
-+   }
-+   for (int dx = 0; dx < D1D; ++dx)
-+   {
-+      const double s = X(dx,e);
-+      for (int qx = 0; qx < Q1D; ++qx)
-+      {
-+         XQ[qx] += B(qx,dx)*s;
-+      }
-+   }
-+   for (int qx = 0; qx < Q1D; ++qx)
-+   {
-+      const double q = XQ[qx]*D(qx,e);
-+      for (int dx = 0; dx < D1D; ++dx)
-+      {
-+         Y(dx,e) += Bt(dx,qx) * q;
-+      }
-+   }
-+}
-+
-+template <bool ACCUMULATE = true>
-+MFEM_HOST_DEVICE inline
-+void PAMassApply2D_Element(const int e,
-+                           const int NE,
-+                           const double *b_,
-+                           const double *bt_,
-+                           const double *d_,
-+                           const double *x_,
-+                           double *y_,
-+                           const int d1d = 0,
-+                           const int q1d = 0)
-+{
-+   const int D1D = d1d;
-+   const int Q1D = q1d;
-+   auto B = ConstDeviceMatrix(b_, Q1D, D1D);
-+   auto Bt = ConstDeviceMatrix(bt_, D1D, Q1D);
-+   auto D = ConstDeviceCube(d_, Q1D, Q1D, NE);
-+   auto X = ConstDeviceCube(x_, D1D, D1D, NE);
-+   auto Y = DeviceCube(y_, D1D, D1D, NE);
-+
-+   if (!ACCUMULATE)
-+   {
-+      for (int dy = 0; dy < D1D; ++dy)
-+      {
-+         for (int dx = 0; dx < D1D; ++dx)
-+         {
-+            Y(dx, dy, e) = 0.0;
-+         }
-+      }
-+   }
-+
-+   constexpr int max_D1D = MAX_D1D;
-+   constexpr int max_Q1D = MAX_Q1D;
-+   double sol_xy[max_Q1D][max_Q1D];
-+   for (int qy = 0; qy < Q1D; ++qy)
-+   {
-+      for (int qx = 0; qx < Q1D; ++qx)
-+      {
-+         sol_xy[qy][qx] = 0.0;
-+      }
-+   }
-+   for (int dy = 0; dy < D1D; ++dy)
-+   {
-+      double sol_x[max_Q1D];
-+      for (int qy = 0; qy < Q1D; ++qy)
-+      {
-+         sol_x[qy] = 0.0;
-+      }
-+      for (int dx = 0; dx < D1D; ++dx)
-+      {
-+         const double s = X(dx,dy,e);
-+         for (int qx = 0; qx < Q1D; ++qx)
-+         {
-+            sol_x[qx] += B(qx,dx)* s;
-+         }
-+      }
-+      for (int qy = 0; qy < Q1D; ++qy)
-+      {
-+         const double d2q = B(qy,dy);
-+         for (int qx = 0; qx < Q1D; ++qx)
-+         {
-+            sol_xy[qy][qx] += d2q * sol_x[qx];
-+         }
-+      }
-+   }
-+   for (int qy = 0; qy < Q1D; ++qy)
-+   {
-+      for (int qx = 0; qx < Q1D; ++qx)
-+      {
-+         sol_xy[qy][qx] *= D(qx,qy,e);
-+      }
-+   }
-+   for (int qy = 0; qy < Q1D; ++qy)
-+   {
-+      double sol_x[max_D1D];
-+      for (int dx = 0; dx < D1D; ++dx)
-+      {
-+         sol_x[dx] = 0.0;
-+      }
-+      for (int qx = 0; qx < Q1D; ++qx)
-+      {
-+         const double s = sol_xy[qy][qx];
-+         for (int dx = 0; dx < D1D; ++dx)
-+         {
-+            sol_x[dx] += Bt(dx,qx) * s;
-+         }
-+      }
-+      for (int dy = 0; dy < D1D; ++dy)
-+      {
-+         const double q2d = Bt(dy,qy);
-+         for (int dx = 0; dx < D1D; ++dx)
-+         {
-+            Y(dx,dy,e) += q2d * sol_x[dx];
-+         }
-+      }
-+   }
-+}
-+
-+template<int T_D1D, int T_Q1D, int T_NBZ, bool ACCUMULATE = true>
-+MFEM_HOST_DEVICE inline
-+void SmemPAMassApply2D_Element(const int e,
-+                               const int NE,
-+                               const double *b_,
-+                               const double *d_,
-+                               const double *x_,
-+                               double *y_,
-+                               int d1d = 0,
-+                               int q1d = 0)
-+{
-+   const int D1D = T_D1D ? T_D1D : d1d;
-+   const int Q1D = T_Q1D ? T_Q1D : q1d;
-+   constexpr int NBZ = T_NBZ ? T_NBZ : 1;
-+
-+   constexpr int MQ1 = T_Q1D ? T_Q1D : MAX_Q1D;
-+   constexpr int MD1 = T_D1D ? T_D1D : MAX_D1D;
-+   constexpr int MDQ = (MQ1 > MD1) ? MQ1 : MD1;
-+
-+   auto b = ConstDeviceMatrix(b_, Q1D, D1D);
-+   auto D = ConstDeviceCube(d_, Q1D, Q1D, NE);
-+   auto x = ConstDeviceCube(x_, D1D, D1D, NE);
-+   auto Y = DeviceCube(y_, D1D, D1D, NE);
-+
-+   const int tidz = MFEM_THREAD_ID(z);
-+
-+   MFEM_SHARED double BBt[MQ1*MD1];
-+   double (*B)[MD1] = (double (*)[MD1]) BBt;
-+   double (*Bt)[MQ1] = (double (*)[MQ1]) BBt;
-+   MFEM_SHARED double sm0[NBZ][MDQ*MDQ];
-+   MFEM_SHARED double sm1[NBZ][MDQ*MDQ];
-+   double (*X)[MD1] = (double (*)[MD1]) (sm0 + tidz);
-+   double (*DQ)[MQ1] = (double (*)[MQ1]) (sm1 + tidz);
-+   double (*QQ)[MQ1] = (double (*)[MQ1]) (sm0 + tidz);
-+   double (*QD)[MD1] = (double (*)[MD1]) (sm1 + tidz);
-+
-+   MFEM_FOREACH_THREAD(dy,y,D1D)
-+   {
-+      MFEM_FOREACH_THREAD(dx,x,D1D)
-+      {
-+         X[dy][dx] = x(dx,dy,e);
-+      }
-+   }
-+   if (tidz == 0)
-+   {
-+      MFEM_FOREACH_THREAD(dy,y,D1D)
-+      {
-+         MFEM_FOREACH_THREAD(q,x,Q1D)
-+         {
-+            B[q][dy] = b(q,dy);
-+         }
-+      }
-+   }
-+   MFEM_SYNC_THREAD;
-+   MFEM_FOREACH_THREAD(dy,y,D1D)
-+   {
-+      MFEM_FOREACH_THREAD(qx,x,Q1D)
-+      {
-+         double dq = 0.0;
-+         for (int dx = 0; dx < D1D; ++dx)
-+         {
-+            dq += X[dy][dx] * B[qx][dx];
-+         }
-+         DQ[dy][qx] = dq;
-+      }
-+   }
-+   MFEM_SYNC_THREAD;
-+   MFEM_FOREACH_THREAD(qy,y,Q1D)
-+   {
-+      MFEM_FOREACH_THREAD(qx,x,Q1D)
-+      {
-+         double qq = 0.0;
-+         for (int dy = 0; dy < D1D; ++dy)
-+         {
-+            qq += DQ[dy][qx] * B[qy][dy];
-+         }
-+         QQ[qy][qx] = qq * D(qx, qy, e);
-+      }
-+   }
-+   MFEM_SYNC_THREAD;
-+   if (tidz == 0)
-+   {
-+      MFEM_FOREACH_THREAD(dy,y,D1D)
-+      {
-+         MFEM_FOREACH_THREAD(q,x,Q1D)
-+         {
-+            Bt[dy][q] = b(q,dy);
-+         }
-+      }
-+   }
-+   MFEM_SYNC_THREAD;
-+   MFEM_FOREACH_THREAD(qy,y,Q1D)
-+   {
-+      MFEM_FOREACH_THREAD(dx,x,D1D)
-+      {
-+         double dq = 0.0;
-+         for (int qx = 0; qx < Q1D; ++qx)
-+         {
-+            dq += QQ[qy][qx] * Bt[dx][qx];
-+         }
-+         QD[qy][dx] = dq;
-+      }
-+   }
-+   MFEM_SYNC_THREAD;
-+   MFEM_FOREACH_THREAD(dy,y,D1D)
-+   {
-+      MFEM_FOREACH_THREAD(dx,x,D1D)
-+      {
-+         double dd = 0.0;
-+         for (int qy = 0; qy < Q1D; ++qy)
-+         {
-+            dd += (QD[qy][dx] * Bt[dy][qy]);
-+         }
-+         if (ACCUMULATE)
-+         {
-+            Y(dx, dy, e) += dd;
-+         }
-+         else
-+         {
-+            Y(dx, dy, e) = dd;
-+         }
-+      }
-+   }
-+}
-+
-+template <bool ACCUMULATE = true>
-+MFEM_HOST_DEVICE inline
-+void PAMassApply3D_Element(const int e,
-+                           const int NE,
-+                           const double *b_,
-+                           const double *bt_,
-+                           const double *d_,
-+                           const double *x_,
-+                           double *y_,
-+                           const int d1d,
-+                           const int q1d)
-+{
-+   const int D1D = d1d;
-+   const int Q1D = q1d;
-+   auto B = ConstDeviceMatrix(b_, Q1D, D1D);
-+   auto Bt = ConstDeviceMatrix(bt_, D1D, Q1D);
-+   auto D = DeviceTensor<4,const double>(d_, Q1D, Q1D, Q1D, NE);
-+   auto X = DeviceTensor<4,const double>(x_, D1D, D1D, D1D, NE);
-+   auto Y = DeviceTensor<4,double>(y_, D1D, D1D, D1D, NE);
-+
-+   if (!ACCUMULATE)
-+   {
-+      for (int dz = 0; dz < D1D; ++dz)
-+      {
-+         for (int dy = 0; dy < D1D; ++dy)
-+         {
-+            for (int dx = 0; dx < D1D; ++dx)
-+            {
-+               Y(dx, dy, dz, e) = 0.0;
-+            }
-+         }
-+      }
-+   }
-+
-+   constexpr int max_D1D = MAX_D1D;
-+   constexpr int max_Q1D = MAX_Q1D;
-+   double sol_xyz[max_Q1D][max_Q1D][max_Q1D];
-+   for (int qz = 0; qz < Q1D; ++qz)
-+   {
-+      for (int qy = 0; qy < Q1D; ++qy)
-+      {
-+         for (int qx = 0; qx < Q1D; ++qx)
-+         {
-+            sol_xyz[qz][qy][qx] = 0.0;
-+         }
-+      }
-+   }
-+   for (int dz = 0; dz < D1D; ++dz)
-+   {
-+      double sol_xy[max_Q1D][max_Q1D];
-+      for (int qy = 0; qy < Q1D; ++qy)
-+      {
-+         for (int qx = 0; qx < Q1D; ++qx)
-+         {
-+            sol_xy[qy][qx] = 0.0;
-+         }
-+      }
-+      for (int dy = 0; dy < D1D; ++dy)
-+      {
-+         double sol_x[max_Q1D];
-+         for (int qx = 0; qx < Q1D; ++qx)
-+         {
-+            sol_x[qx] = 0;
-+         }
-+         for (int dx = 0; dx < D1D; ++dx)
-+         {
-+            const double s = X(dx,dy,dz,e);
-+            for (int qx = 0; qx < Q1D; ++qx)
-+            {
-+               sol_x[qx] += B(qx,dx) * s;
-+            }
-+         }
-+         for (int qy = 0; qy < Q1D; ++qy)
-+         {
-+            const double wy = B(qy,dy);
-+            for (int qx = 0; qx < Q1D; ++qx)
-+            {
-+               sol_xy[qy][qx] += wy * sol_x[qx];
-+            }
-+         }
-+      }
-+      for (int qz = 0; qz < Q1D; ++qz)
-+      {
-+         const double wz = B(qz,dz);
-+         for (int qy = 0; qy < Q1D; ++qy)
-+         {
-+            for (int qx = 0; qx < Q1D; ++qx)
-+            {
-+               sol_xyz[qz][qy][qx] += wz * sol_xy[qy][qx];
-+            }
-+         }
-+      }
-+   }
-+   for (int qz = 0; qz < Q1D; ++qz)
-+   {
-+      for (int qy = 0; qy < Q1D; ++qy)
-+      {
-+         for (int qx = 0; qx < Q1D; ++qx)
-+         {
-+            sol_xyz[qz][qy][qx] *= D(qx,qy,qz,e);
-+         }
-+      }
-+   }
-+   for (int qz = 0; qz < Q1D; ++qz)
-+   {
-+      double sol_xy[max_D1D][max_D1D];
-+      for (int dy = 0; dy < D1D; ++dy)
-+      {
-+         for (int dx = 0; dx < D1D; ++dx)
-+         {
-+            sol_xy[dy][dx] = 0;
-+         }
-+      }
-+      for (int qy = 0; qy < Q1D; ++qy)
-+      {
-+         double sol_x[max_D1D];
-+         for (int dx = 0; dx < D1D; ++dx)
-+         {
-+            sol_x[dx] = 0;
-+         }
-+         for (int qx = 0; qx < Q1D; ++qx)
-+         {
-+            const double s = sol_xyz[qz][qy][qx];
-+            for (int dx = 0; dx < D1D; ++dx)
-+            {
-+               sol_x[dx] += Bt(dx,qx) * s;
-+            }
-+         }
-+         for (int dy = 0; dy < D1D; ++dy)
-+         {
-+            const double wy = Bt(dy,qy);
-+            for (int dx = 0; dx < D1D; ++dx)
-+            {
-+               sol_xy[dy][dx] += wy * sol_x[dx];
-+            }
-+         }
-+      }
-+      for (int dz = 0; dz < D1D; ++dz)
-+      {
-+         const double wz = Bt(dz,qz);
-+         for (int dy = 0; dy < D1D; ++dy)
-+         {
-+            for (int dx = 0; dx < D1D; ++dx)
-+            {
-+               Y(dx,dy,dz,e) += wz * sol_xy[dy][dx];
-+            }
-+         }
-+      }
-+   }
-+}
-+
-+template<int T_D1D, int T_Q1D, bool ACCUMULATE = true>
-+MFEM_HOST_DEVICE inline
-+void SmemPAMassApply3D_Element(const int e,
-+                               const int NE,
-+                               const double *b_,
-+                               const double *d_,
-+                               const double *x_,
-+                               double *y_,
-+                               const int d1d = 0,
-+                               const int q1d = 0)
-+{
-+   constexpr int D1D = T_D1D ? T_D1D : d1d;
-+   constexpr int Q1D = T_Q1D ? T_Q1D : q1d;
-+   constexpr int MQ1 = T_Q1D ? T_Q1D : MAX_Q1D;
-+   constexpr int MD1 = T_D1D ? T_D1D : MAX_D1D;
-+   constexpr int MDQ = (MQ1 > MD1) ? MQ1 : MD1;
-+
-+   auto b = ConstDeviceMatrix(b_, Q1D, D1D);
-+   auto d = DeviceTensor<4,const double>(d_, Q1D, Q1D, Q1D, NE);
-+   auto x = DeviceTensor<4,const double>(x_, D1D, D1D, D1D, NE);
-+   auto y = DeviceTensor<4,double>(y_, D1D, D1D, D1D, NE);
-+
-+   MFEM_SHARED double sDQ[MQ1*MD1];
-+   double (*B)[MD1] = (double (*)[MD1]) sDQ;
-+   double (*Bt)[MQ1] = (double (*)[MQ1]) sDQ;
-+   MFEM_SHARED double sm0[MDQ*MDQ*MDQ];
-+   MFEM_SHARED double sm1[MDQ*MDQ*MDQ];
-+   double (*X)[MD1][MD1]   = (double (*)[MD1][MD1]) sm0;
-+   double (*DDQ)[MD1][MQ1] = (double (*)[MD1][MQ1]) sm1;
-+   double (*DQQ)[MQ1][MQ1] = (double (*)[MQ1][MQ1]) sm0;
-+   double (*QQQ)[MQ1][MQ1] = (double (*)[MQ1][MQ1]) sm1;
-+   double (*QQD)[MQ1][MD1] = (double (*)[MQ1][MD1]) sm0;
-+   double (*QDD)[MD1][MD1] = (double (*)[MD1][MD1]) sm1;
-+   MFEM_FOREACH_THREAD(dy,y,D1D)
-+   {
-+      MFEM_FOREACH_THREAD(dx,x,D1D)
-+      {
-+         MFEM_UNROLL(MD1)
-+         for (int dz = 0; dz < D1D; ++dz)
-+         {
-+            X[dz][dy][dx] = x(dx,dy,dz,e);
-+         }
-+      }
-+      MFEM_FOREACH_THREAD(dx,x,Q1D)
-+      {
-+         B[dx][dy] = b(dx,dy);
-+      }
-+   }
-+   MFEM_SYNC_THREAD;
-+   MFEM_FOREACH_THREAD(dy,y,D1D)
-+   {
-+      MFEM_FOREACH_THREAD(qx,x,Q1D)
-+      {
-+         double u[D1D];
-+         MFEM_UNROLL(MD1)
-+         for (int dz = 0; dz < D1D; dz++)
-+         {
-+            u[dz] = 0;
-+         }
-+         MFEM_UNROLL(MD1)
-+         for (int dx = 0; dx < D1D; ++dx)
-+         {
-+            MFEM_UNROLL(MD1)
-+            for (int dz = 0; dz < D1D; ++dz)
-+            {
-+               u[dz] += X[dz][dy][dx] * B[qx][dx];
-+            }
-+         }
-+         MFEM_UNROLL(MD1)
-+         for (int dz = 0; dz < D1D; ++dz)
-+         {
-+            DDQ[dz][dy][qx] = u[dz];
-+         }
-+      }
-+   }
-+   MFEM_SYNC_THREAD;
-+   MFEM_FOREACH_THREAD(qy,y,Q1D)
-+   {
-+      MFEM_FOREACH_THREAD(qx,x,Q1D)
-+      {
-+         double u[D1D];
-+         MFEM_UNROLL(MD1)
-+         for (int dz = 0; dz < D1D; dz++)
-+         {
-+            u[dz] = 0;
-+         }
-+         MFEM_UNROLL(MD1)
-+         for (int dy = 0; dy < D1D; ++dy)
-+         {
-+            MFEM_UNROLL(MD1)
-+            for (int dz = 0; dz < D1D; dz++)
-+            {
-+               u[dz] += DDQ[dz][dy][qx] * B[qy][dy];
-+            }
-+         }
-+         MFEM_UNROLL(MD1)
-+         for (int dz = 0; dz < D1D; dz++)
-+         {
-+            DQQ[dz][qy][qx] = u[dz];
-+         }
-+      }
-+   }
-+   MFEM_SYNC_THREAD;
-+   MFEM_FOREACH_THREAD(qy,y,Q1D)
-+   {
-+      MFEM_FOREACH_THREAD(qx,x,Q1D)
-+      {
-+         double u[Q1D];
-+         MFEM_UNROLL(MQ1)
-+         for (int qz = 0; qz < Q1D; qz++)
-+         {
-+            u[qz] = 0;
-+         }
-+         MFEM_UNROLL(MD1)
-+         for (int dz = 0; dz < D1D; ++dz)
-+         {
-+            MFEM_UNROLL(MQ1)
-+            for (int qz = 0; qz < Q1D; qz++)
-+            {
-+               u[qz] += DQQ[dz][qy][qx] * B[qz][dz];
-+            }
-+         }
-+         MFEM_UNROLL(MQ1)
-+         for (int qz = 0; qz < Q1D; qz++)
-+         {
-+            QQQ[qz][qy][qx] = u[qz] * d(qx,qy,qz,e);
-+         }
-+      }
-+   }
-+   MFEM_SYNC_THREAD;
-+   MFEM_FOREACH_THREAD(di,y,D1D)
-+   {
-+      MFEM_FOREACH_THREAD(q,x,Q1D)
-+      {
-+         Bt[di][q] = b(q,di);
-+      }
-+   }
-+   MFEM_SYNC_THREAD;
-+   MFEM_FOREACH_THREAD(qy,y,Q1D)
-+   {
-+      MFEM_FOREACH_THREAD(dx,x,D1D)
-+      {
-+         double u[Q1D];
-+         MFEM_UNROLL(MQ1)
-+         for (int qz = 0; qz < Q1D; ++qz)
-+         {
-+            u[qz] = 0;
-+         }
-+         MFEM_UNROLL(MQ1)
-+         for (int qx = 0; qx < Q1D; ++qx)
-+         {
-+            MFEM_UNROLL(MQ1)
-+            for (int qz = 0; qz < Q1D; ++qz)
-+            {
-+               u[qz] += QQQ[qz][qy][qx] * Bt[dx][qx];
-+            }
-+         }
-+         MFEM_UNROLL(MQ1)
-+         for (int qz = 0; qz < Q1D; ++qz)
-+         {
-+            QQD[qz][qy][dx] = u[qz];
-+         }
-+      }
-+   }
-+   MFEM_SYNC_THREAD;
-+   MFEM_FOREACH_THREAD(dy,y,D1D)
-+   {
-+      MFEM_FOREACH_THREAD(dx,x,D1D)
-+      {
-+         double u[Q1D];
-+         MFEM_UNROLL(MQ1)
-+         for (int qz = 0; qz < Q1D; ++qz)
-+         {
-+            u[qz] = 0;
-+         }
-+         MFEM_UNROLL(MQ1)
-+         for (int qy = 0; qy < Q1D; ++qy)
-+         {
-+            MFEM_UNROLL(MQ1)
-+            for (int qz = 0; qz < Q1D; ++qz)
-+            {
-+               u[qz] += QQD[qz][qy][dx] * Bt[dy][qy];
-+            }
-+         }
-+         MFEM_UNROLL(MQ1)
-+         for (int qz = 0; qz < Q1D; ++qz)
-+         {
-+            QDD[qz][dy][dx] = u[qz];
-+         }
-+      }
-+   }
-+   MFEM_SYNC_THREAD;
-+   MFEM_FOREACH_THREAD(dy,y,D1D)
-+   {
-+      MFEM_FOREACH_THREAD(dx,x,D1D)
-+      {
-+         double u[D1D];
-+         MFEM_UNROLL(MD1)
-+         for (int dz = 0; dz < D1D; ++dz)
-+         {
-+            u[dz] = 0;
-+         }
-+         MFEM_UNROLL(MQ1)
-+         for (int qz = 0; qz < Q1D; ++qz)
-+         {
-+            MFEM_UNROLL(MD1)
-+            for (int dz = 0; dz < D1D; ++dz)
-+            {
-+               u[dz] += QDD[qz][dy][dx] * Bt[dz][qz];
-+            }
-+         }
-+         MFEM_UNROLL(MD1)
-+         for (int dz = 0; dz < D1D; ++dz)
-+         {
-+            if (ACCUMULATE)
-+            {
-+               y(dx,dy,dz,e) += u[dz];
-+            }
-+            else
-+            {
-+               y(dx,dy,dz,e) = u[dz];
-+            }
-+         }
-+      }
-+   }
-+   MFEM_SYNC_THREAD;
-+}
-+
-+MFEM_HOST_DEVICE inline
-+void PAMassApply1D(const int NE,
-+                   const Array<double> &b_,
-+                   const Array<double> &bt_,
-+                   const Vector &d_,
-+                   const Vector &x_,
-+                   Vector &y_,
-+                   const int d1d = 0,
-+                   const int q1d = 0)
-+{
-+   MFEM_VERIFY(d1d <= MAX_D1D, "");
-+   MFEM_VERIFY(q1d <= MAX_Q1D, "");
-+
-+   const auto B = b_.Read();
-+   const auto Bt = bt_.Read();
-+   const auto D = d_.Read();
-+   const auto X = x_.Read();
-+   auto Y = y_.ReadWrite();
-+
-+   mfem::forall(NE, [=] MFEM_HOST_DEVICE (int e)
-+   {
-+      internal::PAMassApply1D_Element(e, NE, B, Bt, D, X, Y, d1d, q1d);
-+   });
-+}
-+
-+template<int T_D1D = 0, int T_Q1D = 0>
-+MFEM_HOST_DEVICE inline
-+void PAMassApply2D(const int NE,
-+                   const Array<double> &b_,
-+                   const Array<double> &bt_,
-+                   const Vector &d_,
-+                   const Vector &x_,
-+                   Vector &y_,
-+                   const int d1d = 0,
-+                   const int q1d = 0)
-+{
-+   MFEM_VERIFY(T_D1D ? T_D1D : d1d <= MAX_D1D, "");
-+   MFEM_VERIFY(T_Q1D ? T_Q1D : q1d <= MAX_Q1D, "");
-+
-+   const auto B = b_.Read();
-+   const auto Bt = bt_.Read();
-+   const auto D = d_.Read();
-+   const auto X = x_.Read();
-+   auto Y = y_.ReadWrite();
-+
-+   mfem::forall(NE, [=] MFEM_HOST_DEVICE (int e)
-+   {
-+      internal::PAMassApply2D_Element(e, NE, B, Bt, D, X, Y, d1d, q1d);
-+   });
-+}
-+
-+template<int T_D1D = 0, int T_Q1D = 0, int T_NBZ = 0>
-+MFEM_HOST_DEVICE inline
-+void SmemPAMassApply2D(const int NE,
-+                       const Array<double> &b_,
-+                       const Array<double> &bt_,
-+                       const Vector &d_,
-+                       const Vector &x_,
-+                       Vector &y_,
-+                       const int d1d = 0,
-+                       const int q1d = 0)
-+{
-+   MFEM_CONTRACT_VAR(bt_);
-+   const int D1D = T_D1D ? T_D1D : d1d;
-+   const int Q1D = T_Q1D ? T_Q1D : q1d;
-+   constexpr int NBZ = T_NBZ ? T_NBZ : 1;
-+   constexpr int MQ1 = T_Q1D ? T_Q1D : MAX_Q1D;
-+   constexpr int MD1 = T_D1D ? T_D1D : MAX_D1D;
-+   MFEM_VERIFY(D1D <= MD1, "");
-+   MFEM_VERIFY(Q1D <= MQ1, "");
-+   const auto b = b_.Read();
-+   const auto D = d_.Read();
-+   const auto x = x_.Read();
-+   auto Y = y_.ReadWrite();
-+   mfem::forall_2D_batch(NE, Q1D, Q1D, NBZ, [=] MFEM_HOST_DEVICE (int e)
-+   {
-+      internal::SmemPAMassApply2D_Element<T_D1D,T_Q1D,T_NBZ>(e, NE, b, D, x, Y, d1d,
-+                                                             q1d);
-+   });
-+}
-+
-+template<int T_D1D = 0, int T_Q1D = 0>
-+MFEM_HOST_DEVICE inline
-+void PAMassApply3D(const int NE,
-+                   const Array<double> &b_,
-+                   const Array<double> &bt_,
-+                   const Vector &d_,
-+                   const Vector &x_,
-+                   Vector &y_,
-+                   const int d1d = 0,
-+                   const int q1d = 0)
-+{
-+   MFEM_VERIFY(T_D1D ? T_D1D : d1d <= MAX_D1D, "");
-+   MFEM_VERIFY(T_Q1D ? T_Q1D : q1d <= MAX_Q1D, "");
-+
-+   const auto B = b_.Read();
-+   const auto Bt = bt_.Read();
-+   const auto D = d_.Read();
-+   const auto X = x_.Read();
-+   auto Y = y_.ReadWrite();
-+
-+   mfem::forall(NE, [=] MFEM_HOST_DEVICE (int e)
-+   {
-+      internal::PAMassApply3D_Element(e, NE, B, Bt, D, X, Y, d1d, q1d);
-+   });
-+}
-+
-+template<int T_D1D = 0, int T_Q1D = 0>
-+MFEM_HOST_DEVICE inline
-+void SmemPAMassApply3D(const int NE,
-+                       const Array<double> &b_,
-+                       const Array<double> &bt_,
-+                       const Vector &d_,
-+                       const Vector &x_,
-+                       Vector &y_,
-+                       const int d1d = 0,
-+                       const int q1d = 0)
-+{
-+   MFEM_CONTRACT_VAR(bt_);
-+   const int D1D = T_D1D ? T_D1D : d1d;
-+   const int Q1D = T_Q1D ? T_Q1D : q1d;
-+   constexpr int M1Q = T_Q1D ? T_Q1D : MAX_Q1D;
-+   constexpr int M1D = T_D1D ? T_D1D : MAX_D1D;
-+   MFEM_VERIFY(D1D <= M1D, "");
-+   MFEM_VERIFY(Q1D <= M1Q, "");
-+   auto b = b_.Read();
-+   auto d = d_.Read();
-+   auto x = x_.Read();
-+   auto y = y_.ReadWrite();
-+   mfem::forall_2D(NE, Q1D, Q1D, [=] MFEM_HOST_DEVICE (int e)
-+   {
-+      internal::SmemPAMassApply3D_Element<T_D1D,T_Q1D>(e, NE, b, d, x, y, d1d, q1d);
-+   });
-+}
-+
-+MFEM_HOST_DEVICE inline
-+void PAMassApply(const int dim,
-+                 const int D1D,
-+                 const int Q1D,
-+                 const int NE,
-+                 const Array<double> &B,
-+                 const Array<double> &Bt,
-+                 const Vector &D,
-+                 const Vector &X,
-+                 Vector &Y)
-+{
-+#ifdef MFEM_USE_OCCA
-+   if (DeviceCanUseOcca())
-+   {
-+      if (dim == 2)
-+      {
-+         return OccaPAMassApply2D(D1D,Q1D,NE,B,Bt,D,X,Y);
-+      }
-+      if (dim == 3)
-+      {
-+         return OccaPAMassApply3D(D1D,Q1D,NE,B,Bt,D,X,Y);
-+      }
-+      MFEM_ABORT("OCCA PA Mass Apply unknown kernel!");
-+   }
-+#endif // MFEM_USE_OCCA
-+   const int id = (D1D << 4) | Q1D;
-+
-+   if (dim == 1)
-+   {
-+      return PAMassApply1D(NE,B,Bt,D,X,Y,D1D,Q1D);
-+   }
-+   else if (dim == 2)
-+   {
-+      switch (id)
-+      {
-+         case 0x22: return SmemPAMassApply2D<2,2,16>(NE,B,Bt,D,X,Y);
-+         case 0x24: return SmemPAMassApply2D<2,4,16>(NE,B,Bt,D,X,Y);
-+         case 0x33: return SmemPAMassApply2D<3,3,16>(NE,B,Bt,D,X,Y);
-+         case 0x34: return SmemPAMassApply2D<3,4,16>(NE,B,Bt,D,X,Y);
-+         case 0x35: return SmemPAMassApply2D<3,5,16>(NE,B,Bt,D,X,Y);
-+         case 0x36: return SmemPAMassApply2D<3,6,16>(NE,B,Bt,D,X,Y);
-+         case 0x44: return SmemPAMassApply2D<4,4,8>(NE,B,Bt,D,X,Y);
-+         case 0x46: return SmemPAMassApply2D<4,6,8>(NE,B,Bt,D,X,Y);
-+         case 0x48: return SmemPAMassApply2D<4,8,4>(NE,B,Bt,D,X,Y);
-+         case 0x55: return SmemPAMassApply2D<5,5,8>(NE,B,Bt,D,X,Y);
-+         case 0x57: return SmemPAMassApply2D<5,7,8>(NE,B,Bt,D,X,Y);
-+         case 0x58: return SmemPAMassApply2D<5,8,2>(NE,B,Bt,D,X,Y);
-+         case 0x66: return SmemPAMassApply2D<6,6,4>(NE,B,Bt,D,X,Y);
-+         case 0x77: return SmemPAMassApply2D<7,7,4>(NE,B,Bt,D,X,Y);
-+         case 0x88: return SmemPAMassApply2D<8,8,2>(NE,B,Bt,D,X,Y);
-+         case 0x99: return SmemPAMassApply2D<9,9,2>(NE,B,Bt,D,X,Y);
-+         default:   return PAMassApply2D(NE,B,Bt,D,X,Y,D1D,Q1D);
-+      }
-+   }
-+   else if (dim == 3)
-+   {
-+      switch (id)
-+      {
-+         case 0x22: return SmemPAMassApply3D<2,2>(NE,B,Bt,D,X,Y);
-+         case 0x23: return SmemPAMassApply3D<2,3>(NE,B,Bt,D,X,Y);
-+         case 0x24: return SmemPAMassApply3D<2,4>(NE,B,Bt,D,X,Y);
-+         case 0x26: return SmemPAMassApply3D<2,6>(NE,B,Bt,D,X,Y);
-+         case 0x34: return SmemPAMassApply3D<3,4>(NE,B,Bt,D,X,Y);
-+         case 0x35: return SmemPAMassApply3D<3,5>(NE,B,Bt,D,X,Y);
-+         case 0x36: return SmemPAMassApply3D<3,6>(NE,B,Bt,D,X,Y);
-+         case 0x37: return SmemPAMassApply3D<3,7>(NE,B,Bt,D,X,Y);
-+         case 0x45: return SmemPAMassApply3D<4,5>(NE,B,Bt,D,X,Y);
-+         case 0x46: return SmemPAMassApply3D<4,6>(NE,B,Bt,D,X,Y);
-+         case 0x48: return SmemPAMassApply3D<4,8>(NE,B,Bt,D,X,Y);
-+         case 0x56: return SmemPAMassApply3D<5,6>(NE,B,Bt,D,X,Y);
-+         case 0x58: return SmemPAMassApply3D<5,8>(NE,B,Bt,D,X,Y);
-+         case 0x67: return SmemPAMassApply3D<6,7>(NE,B,Bt,D,X,Y);
-+         case 0x78: return SmemPAMassApply3D<7,8>(NE,B,Bt,D,X,Y);
-+         case 0x89: return SmemPAMassApply3D<8,9>(NE,B,Bt,D,X,Y);
-+         case 0x9A: return SmemPAMassApply3D<9,10>(NE,B,Bt,D,X,Y);
-+         default:   return PAMassApply3D(NE,B,Bt,D,X,Y,D1D,Q1D);
-+      }
-+   }
-+   mfem::out << "Unknown kernel 0x" << std::hex << id << std::endl;
-+   MFEM_ABORT("Unknown kernel.");
-+}
-+
-+} // namespace internal
-+
-+} // namespace mfem
-+
-+#endif
-diff --git a/fem/bilininteg_mass_mf.cpp b/fem/integ/bilininteg_mass_mf.cpp
-similarity index 92%
-rename from fem/bilininteg_mass_mf.cpp
-rename to fem/integ/bilininteg_mass_mf.cpp
-index 2a89c0ce6..34a118b6d 100644
---- a/fem/bilininteg_mass_mf.cpp
-+++ b/fem/integ/bilininteg_mass_mf.cpp
-@@ -9,12 +9,10 @@
- // terms of the BSD-3 license. We welcome feedback and contributions, see file
- // CONTRIBUTING.md for details.
- 
--#include "../general/forall.hpp"
--#include "bilininteg.hpp"
--#include "gridfunc.hpp"
--#include "ceed/integrators/mass/mass.hpp"
--
--using namespace std;
-+#include "../../general/forall.hpp"
-+#include "../bilininteg.hpp"
-+#include "../gridfunc.hpp"
-+#include "../ceed/integrators/mass/mass.hpp"
- 
- namespace mfem
- {
-@@ -22,7 +20,6 @@ namespace mfem
- void MassIntegrator::AssembleMF(const FiniteElementSpace &fes)
- {
-    // Assuming the same element type
--   fespace = &fes;
-    Mesh *mesh = fes.GetMesh();
-    if (mesh->GetNE() == 0) { return; }
-    const FiniteElement &el = *fes.GetFE(0);
-diff --git a/fem/integ/bilininteg_mass_pa.cpp b/fem/integ/bilininteg_mass_pa.cpp
-new file mode 100644
-index 000000000..ffdec1edb
---- /dev/null
-+++ b/fem/integ/bilininteg_mass_pa.cpp
-@@ -0,0 +1,220 @@
-+// Copyright (c) 2010-2023, Lawrence Livermore National Security, LLC. Produced
-+// at the Lawrence Livermore National Laboratory. All Rights reserved. See files
-+// LICENSE and NOTICE for details. LLNL-CODE-806117.
-+//
-+// This file is part of the MFEM library. For more information and source code
-+// availability visit https://mfem.org.
-+//
-+// MFEM is free software; you can redistribute it and/or modify it under the
-+// terms of the BSD-3 license. We welcome feedback and contributions, see file
-+// CONTRIBUTING.md for details.
-+
-+#include "../../general/forall.hpp"
-+#include "../bilininteg.hpp"
-+#include "../gridfunc.hpp"
-+#include "../qfunction.hpp"
-+#include "../ceed/integrators/mass/mass.hpp"
-+#include "bilininteg_mass_kernels.hpp"
-+
-+namespace mfem
-+{
-+
-+void MassIntegrator::AssemblePA(const FiniteElementSpace &fes)
-+{
-+   const MemoryType mt = (pa_mt == MemoryType::DEFAULT) ?
-+                         Device::GetDeviceMemoryType() : pa_mt;
-+
-+   // Assuming the same element type
-+   Mesh *mesh = fes.GetMesh();
-+   if (mesh->GetNE() == 0) { return; }
-+   const FiniteElement &el = *fes.GetFE(0);
-+   ElementTransformation *T0 = mesh->GetElementTransformation(0);
-+   const IntegrationRule *ir = IntRule ? IntRule : &GetRule(el, el, *T0);
-+   if (DeviceCanUseCeed())
-+   {
-+      delete ceedOp;
-+      const bool mixed = mesh->GetNumGeometries(mesh->Dimension()) > 1 ||
-+                         fes.IsVariableOrder();
-+      if (mixed)
-+      {
-+         ceedOp = new ceed::MixedPAMassIntegrator(*this, fes, Q);
-+      }
-+      else
-+      {
-+         ceedOp = new ceed::PAMassIntegrator(fes, *ir, Q);
-+      }
-+      return;
-+   }
-+   int map_type = el.GetMapType();
-+   dim = mesh->Dimension();
-+   ne = fes.GetMesh()->GetNE();
-+   nq = ir->GetNPoints();
-+   geom = mesh->GetGeometricFactors(*ir, GeometricFactors::DETERMINANTS, mt);
-+   maps = &el.GetDofToQuad(*ir, DofToQuad::TENSOR);
-+   dofs1D = maps->ndof;
-+   quad1D = maps->nqpt;
-+   pa_data.SetSize(ne*nq, mt);
-+
-+   QuadratureSpace qs(*mesh, *ir);
-+   CoefficientVector coeff(Q, qs, CoefficientStorage::COMPRESSED);
-+
-+   if (dim==1) { MFEM_ABORT("Not supported yet... stay tuned!"); }
-+   if (dim==2)
-+   {
-+      const int NE = ne;
-+      const int Q1D = quad1D;
-+      const bool const_c = coeff.Size() == 1;
-+      const bool by_val = map_type == FiniteElement::VALUE;
-+      const auto W = Reshape(ir->GetWeights().Read(), Q1D,Q1D);
-+      const auto J = Reshape(geom->detJ.Read(), Q1D,Q1D,NE);
-+      const auto C = const_c ? Reshape(coeff.Read(), 1,1,1) :
-+                     Reshape(coeff.Read(), Q1D,Q1D,NE);
-+      auto v = Reshape(pa_data.Write(), Q1D,Q1D, NE);
-+      mfem::forall_2D(NE, Q1D, Q1D, [=] MFEM_HOST_DEVICE (int e)
-+      {
-+         MFEM_FOREACH_THREAD(qx,x,Q1D)
-+         {
-+            MFEM_FOREACH_THREAD(qy,y,Q1D)
-+            {
-+               const double detJ = J(qx,qy,e);
-+               const double coeff = const_c ? C(0,0,0) : C(qx,qy,e);
-+               v(qx,qy,e) = W(qx,qy) * coeff * (by_val ? detJ : 1.0/detJ);
-+            }
-+         }
-+      });
-+   }
-+   if (dim==3)
-+   {
-+      const int NE = ne;
-+      const int Q1D = quad1D;
-+      const bool const_c = coeff.Size() == 1;
-+      const bool by_val = map_type == FiniteElement::VALUE;
-+      const auto W = Reshape(ir->GetWeights().Read(), Q1D,Q1D,Q1D);
-+      const auto J = Reshape(geom->detJ.Read(), Q1D,Q1D,Q1D,NE);
-+      const auto C = const_c ? Reshape(coeff.Read(), 1,1,1,1) :
-+                     Reshape(coeff.Read(), Q1D,Q1D,Q1D,NE);
-+      auto v = Reshape(pa_data.Write(), Q1D,Q1D,Q1D,NE);
-+      mfem::forall_3D(NE, Q1D, Q1D, Q1D, [=] MFEM_HOST_DEVICE (int e)
-+      {
-+         MFEM_FOREACH_THREAD(qx,x,Q1D)
-+         {
-+            MFEM_FOREACH_THREAD(qy,y,Q1D)
-+            {
-+               MFEM_FOREACH_THREAD(qz,z,Q1D)
-+               {
-+                  const double detJ = J(qx,qy,qz,e);
-+                  const double coeff = const_c ? C(0,0,0,0) : C(qx,qy,qz,e);
-+                  v(qx,qy,qz,e) = W(qx,qy,qz) * coeff * (by_val ? detJ : 1.0/detJ);
-+               }
-+            }
-+         }
-+      });
-+   }
-+}
-+
-+void MassIntegrator::AssemblePABoundary(const FiniteElementSpace &fes)
-+{
-+   const MemoryType mt = (pa_mt == MemoryType::DEFAULT) ?
-+                         Device::GetDeviceMemoryType() : pa_mt;
-+
-+   // Assuming the same element type
-+   Mesh *mesh = fes.GetMesh();
-+   if (mesh->GetNBE() == 0) { return; }
-+   const FiniteElement &el = *fes.GetBE(0);
-+   ElementTransformation *T0 = mesh->GetBdrElementTransformation(0);
-+   const IntegrationRule *ir = IntRule ? IntRule : &GetRule(el, el, *T0);
-+
-+   int map_type = el.GetMapType();
-+   dim = el.GetDim(); // Dimension of the boundary element, *not* the mesh
-+   ne = fes.GetMesh()->GetNBE();
-+   nq = ir->GetNPoints();
-+   face_geom = mesh->GetFaceGeometricFactors(*ir, GeometricFactors::DETERMINANTS,
-+                                             FaceType::Boundary, mt);
-+   maps = &el.GetDofToQuad(*ir, DofToQuad::TENSOR);
-+   dofs1D = maps->ndof;
-+   quad1D = maps->nqpt;
-+   pa_data.SetSize(ne*nq, mt);
-+
-+   FaceQuadratureSpace qs(*mesh, *ir, FaceType::Boundary);
-+   CoefficientVector coeff(Q, qs, CoefficientStorage::COMPRESSED);
-+
-+   const int NE = ne;
-+   const int Q1D = quad1D;
-+   const bool const_c = coeff.Size() == 1;
-+   const bool by_val = map_type == FiniteElement::VALUE;
-+   if (dim==1)
-+   {
-+      const auto W = Reshape(ir->GetWeights().Read(), Q1D);
-+      const auto J = Reshape(face_geom->detJ.Read(), Q1D, NE);
-+      const auto C = const_c ? Reshape(coeff.Read(), 1, 1) :
-+                     Reshape(coeff.Read(), Q1D, NE);
-+      auto v = Reshape(pa_data.Write(), Q1D, NE);
-+      mfem::forall_2D(NE, Q1D, 1, [=] MFEM_HOST_DEVICE (int e)
-+      {
-+         MFEM_FOREACH_THREAD(qx,x,Q1D)
-+         {
-+            const double detJ = J(qx,e);
-+            const double coeff = const_c ? C(0,0) : C(qx,e);
-+            v(qx,e) = W(qx) * coeff * (by_val ? detJ : 1.0/detJ);
-+         }
-+      });
-+   }
-+   else if (dim==2)
-+   {
-+      const auto W = Reshape(ir->GetWeights().Read(), Q1D,Q1D);
-+      const auto J = Reshape(face_geom->detJ.Read(), Q1D,Q1D,NE);
-+      const auto C = const_c ? Reshape(coeff.Read(), 1,1,1) :
-+                     Reshape(coeff.Read(), Q1D,Q1D,NE);
-+      auto v = Reshape(pa_data.Write(), Q1D,Q1D, NE);
-+      mfem::forall_2D(NE, Q1D, Q1D, [=] MFEM_HOST_DEVICE (int e)
-+      {
-+         MFEM_FOREACH_THREAD(qx,x,Q1D)
-+         {
-+            MFEM_FOREACH_THREAD(qy,y,Q1D)
-+            {
-+               const double detJ = J(qx,qy,e);
-+               const double coeff = const_c ? C(0,0,0) : C(qx,qy,e);
-+               v(qx,qy,e) = W(qx,qy) * coeff * (by_val ? detJ : 1.0/detJ);
-+            }
-+         }
-+      });
-+   }
-+   else
-+   {
-+      MFEM_ABORT("Not supported.");
-+   }
-+}
-+
-+void MassIntegrator::AssembleDiagonalPA(Vector &diag)
-+{
-+   if (DeviceCanUseCeed())
-+   {
-+      ceedOp->GetDiagonal(diag);
-+   }
-+   else
-+   {
-+      internal::PAMassAssembleDiagonal(dim, dofs1D, quad1D, ne, maps->B, pa_data,
-+                                       diag);
-+   }
-+}
-+
-+void MassIntegrator::AddMultPA(const Vector &x, Vector &y) const
-+{
-+   if (DeviceCanUseCeed())
-+   {
-+      ceedOp->AddMult(x, y);
-+   }
-+   else
-+   {
-+      internal::PAMassApply(dim, dofs1D, quad1D, ne, maps->B, maps->Bt, pa_data, x,
-+                            y);
-+   }
-+}
-+
-+void MassIntegrator::AddMultTransposePA(const Vector &x, Vector &y) const
-+{
-+   // Mass integrator is symmetric
-+   AddMultPA(x, y);
-+}
-+
-+} // namespace mfem
-diff --git a/fem/integ/bilininteg_mixedcurl_pa.cpp b/fem/integ/bilininteg_mixedcurl_pa.cpp
-new file mode 100644
-index 000000000..3d70bc4c9
---- /dev/null
-+++ b/fem/integ/bilininteg_mixedcurl_pa.cpp
-@@ -0,0 +1,424 @@
-+// Copyright (c) 2010-2023, Lawrence Livermore National Security, LLC. Produced
-+// at the Lawrence Livermore National Laboratory. All Rights reserved. See files
-+// LICENSE and NOTICE for details. LLNL-CODE-806117.
-+//
-+// This file is part of the MFEM library. For more information and source code
-+// availability visit https://mfem.org.
-+//
-+// MFEM is free software; you can redistribute it and/or modify it under the
-+// terms of the BSD-3 license. We welcome feedback and contributions, see file
-+// CONTRIBUTING.md for details.
-+
-+#include "../../general/forall.hpp"
-+#include "../bilininteg.hpp"
-+#include "../gridfunc.hpp"
-+#include "../qfunction.hpp"
-+#include "bilininteg_hcurl_kernels.hpp"
-+#include "bilininteg_hcurlhdiv_kernels.hpp"
-+
-+namespace mfem
-+{
-+
-+void MixedScalarCurlIntegrator::AssemblePA(const FiniteElementSpace &trial_fes,
-+                                           const FiniteElementSpace &test_fes)
-+{
-+   // Assumes tensor-product elements
-+   Mesh *mesh = trial_fes.GetMesh();
-+   const FiniteElement *fel = trial_fes.GetFE(0); // In H(curl)
-+   const FiniteElement *eltest = test_fes.GetFE(0); // In scalar space
-+
-+   const VectorTensorFiniteElement *el =
-+      dynamic_cast<const VectorTensorFiniteElement*>(fel);
-+   MFEM_VERIFY(el != NULL, "Only VectorTensorFiniteElement is supported!");
-+
-+   if (el->GetDerivType() != mfem::FiniteElement::CURL)
-+   {
-+      MFEM_ABORT("Unknown kernel.");
-+   }
-+
-+   const IntegrationRule *ir
-+      = IntRule ? IntRule : &MassIntegrator::GetRule(*eltest, *eltest,
-+                                                     *mesh->GetElementTransformation(0));
-+
-+   const int dims = el->GetDim();
-+   MFEM_VERIFY(dims == 2, "");
-+
-+   const int nq = ir->GetNPoints();
-+   dim = mesh->Dimension();
-+   MFEM_VERIFY(dim == 2, "");
-+
-+   ne = test_fes.GetNE();
-+   mapsC = &el->GetDofToQuad(*ir, DofToQuad::TENSOR);
-+   mapsO = &el->GetDofToQuadOpen(*ir, DofToQuad::TENSOR);
-+   dofs1D = mapsC->ndof;
-+   quad1D = mapsC->nqpt;
-+
-+   MFEM_VERIFY(dofs1D == mapsO->ndof + 1 && quad1D == mapsO->nqpt, "");
-+
-+   if (el->GetOrder() == eltest->GetOrder())
-+   {
-+      dofs1Dtest = dofs1D;
-+   }
-+   else
-+   {
-+      dofs1Dtest = dofs1D - 1;
-+   }
-+
-+   pa_data.SetSize(nq * ne, Device::GetMemoryType());
-+
-+   QuadratureSpace qs(*mesh, *ir);
-+   CoefficientVector coeff(Q, qs, CoefficientStorage::FULL);
-+
-+   if (dim == 2)
-+   {
-+      internal::PAHcurlL2Setup2D(quad1D, ne, ir->GetWeights(), coeff, pa_data);
-+   }
-+   else
-+   {
-+      MFEM_ABORT("Unsupported dimension!");
-+   }
-+}
-+
-+void MixedScalarCurlIntegrator::AddMultPA(const Vector &x, Vector &y) const
-+{
-+   if (dim == 2)
-+   {
-+      internal::PAHcurlL2Apply2D(dofs1D, dofs1Dtest, quad1D, ne, mapsO->B,
-+                                 mapsO->Bt, mapsC->Bt, mapsC->G, pa_data,
-+                                 x, y);
-+   }
-+   else
-+   {
-+      MFEM_ABORT("Unsupported dimension!");
-+   }
-+}
-+
-+void MixedScalarCurlIntegrator::AddMultTransposePA(const Vector &x,
-+                                                   Vector &y) const
-+{
-+   if (dim == 2)
-+   {
-+      internal::PAHcurlL2ApplyTranspose2D(dofs1D, dofs1Dtest, quad1D, ne, mapsO->B,
-+                                          mapsO->Bt, mapsC->B, mapsC->Gt, pa_data,
-+                                          x, y);
-+   }
-+   else
-+   {
-+      MFEM_ABORT("Unsupported dimension!");
-+   }
-+}
-+
-+void MixedVectorCurlIntegrator::AssemblePA(const FiniteElementSpace &trial_fes,
-+                                           const FiniteElementSpace &test_fes)
-+{
-+   // Assumes tensor-product elements, with vector test and trial spaces.
-+   Mesh *mesh = trial_fes.GetMesh();
-+   const FiniteElement *trial_fel = trial_fes.GetFE(0);
-+   const FiniteElement *test_fel = test_fes.GetFE(0);
-+
-+   const VectorTensorFiniteElement *trial_el =
-+      dynamic_cast<const VectorTensorFiniteElement*>(trial_fel);
-+   MFEM_VERIFY(trial_el != NULL, "Only VectorTensorFiniteElement is supported!");
-+
-+   const VectorTensorFiniteElement *test_el =
-+      dynamic_cast<const VectorTensorFiniteElement*>(test_fel);
-+   MFEM_VERIFY(test_el != NULL, "Only VectorTensorFiniteElement is supported!");
-+
-+   const IntegrationRule *ir
-+      = IntRule ? IntRule : &MassIntegrator::GetRule(*trial_el, *trial_el,
-+                                                     *mesh->GetElementTransformation(0));
-+   const int dims = trial_el->GetDim();
-+   MFEM_VERIFY(dims == 3, "");
-+
-+   const int nq = ir->GetNPoints();
-+   dim = mesh->Dimension();
-+   MFEM_VERIFY(dim == 3, "");
-+
-+   MFEM_VERIFY(trial_el->GetOrder() == test_el->GetOrder(), "");
-+
-+   ne = trial_fes.GetNE();
-+   geom = mesh->GetGeometricFactors(*ir, GeometricFactors::JACOBIANS);
-+   mapsC = &trial_el->GetDofToQuad(*ir, DofToQuad::TENSOR);
-+   mapsO = &trial_el->GetDofToQuadOpen(*ir, DofToQuad::TENSOR);
-+   mapsCtest = &test_el->GetDofToQuad(*ir, DofToQuad::TENSOR);
-+   mapsOtest = &test_el->GetDofToQuadOpen(*ir, DofToQuad::TENSOR);
-+   dofs1D = mapsC->ndof;
-+   quad1D = mapsC->nqpt;
-+   dofs1Dtest = mapsCtest->ndof;
-+
-+   MFEM_VERIFY(dofs1D == mapsO->ndof + 1 && quad1D == mapsO->nqpt, "");
-+
-+   testType = test_el->GetDerivType();
-+   trialType = trial_el->GetDerivType();
-+
-+   const int symmDims = (dims * (dims + 1)) / 2; // 1x1: 1, 2x2: 3, 3x3: 6
-+   coeffDim = (DQ ? 3 : 1);
-+
-+   const bool curlSpaces = (testType == mfem::FiniteElement::CURL &&
-+                            trialType == mfem::FiniteElement::CURL);
-+
-+   const int ndata = curlSpaces ? (coeffDim == 1 ? 1 : 9) : symmDims;
-+   pa_data.SetSize(ndata * nq * ne, Device::GetMemoryType());
-+
-+   QuadratureSpace qs(*mesh, *ir);
-+   CoefficientVector coeff(qs, CoefficientStorage::FULL);
-+   if (Q) { coeff.Project(*Q); }
-+   else if (DQ) { coeff.Project(*DQ); }
-+   else { coeff.SetConstant(1.0); }
-+
-+   if (testType == mfem::FiniteElement::CURL &&
-+       trialType == mfem::FiniteElement::CURL && dim == 3)
-+   {
-+      if (coeffDim == 1)
-+      {
-+         internal::PAHcurlL2Setup3D(nq, coeffDim, ne, ir->GetWeights(), coeff, pa_data);
-+      }
-+      else
-+      {
-+         internal::PAHcurlHdivMassSetup3D(quad1D, coeffDim, ne, false, ir->GetWeights(),
-+                                          geom->J, coeff, pa_data);
-+      }
-+   }
-+   else if (testType == mfem::FiniteElement::DIV &&
-+            trialType == mfem::FiniteElement::CURL && dim == 3 &&
-+            test_fel->GetOrder() == trial_fel->GetOrder())
-+   {
-+      internal::PACurlCurlSetup3D(quad1D, coeffDim, ne, ir->GetWeights(), geom->J,
-+                                  coeff, pa_data);
-+   }
-+   else
-+   {
-+      MFEM_ABORT("Unknown kernel.");
-+   }
-+}
-+
-+void MixedVectorCurlIntegrator::AddMultPA(const Vector &x, Vector &y) const
-+{
-+   if (testType == mfem::FiniteElement::CURL &&
-+       trialType == mfem::FiniteElement::CURL && dim == 3)
-+   {
-+      const int ndata = coeffDim == 1 ? 1 : 9;
-+
-+      if (Device::Allows(Backend::DEVICE_MASK))
-+      {
-+         const int ID = (dofs1D << 4) | quad1D;
-+         switch (ID)
-+         {
-+            case 0x23:
-+               return internal::SmemPAHcurlL2Apply3D<2,3>(
-+                         dofs1D, quad1D, ndata, ne,
-+                         mapsO->B, mapsC->B, mapsC->G,
-+                         pa_data, x, y);
-+            case 0x34:
-+               return internal::SmemPAHcurlL2Apply3D<3,4>(
-+                         dofs1D, quad1D, ndata, ne,
-+                         mapsO->B, mapsC->B, mapsC->G,
-+                         pa_data, x, y);
-+            case 0x45:
-+               return internal::SmemPAHcurlL2Apply3D<4,5>(
-+                         dofs1D, quad1D, ndata, ne,
-+                         mapsO->B, mapsC->B, mapsC->G,
-+                         pa_data, x, y);
-+            case 0x56:
-+               return internal::SmemPAHcurlL2Apply3D<5,6>(
-+                         dofs1D, quad1D, ndata, ne,
-+                         mapsO->B, mapsC->B, mapsC->G,
-+                         pa_data, x, y);
-+            default:
-+               return internal::SmemPAHcurlL2Apply3D(
-+                         dofs1D, quad1D, ndata, ne,
-+                         mapsO->B, mapsC->B, mapsC->G,
-+                         pa_data, x, y);
-+         }
-+      }
-+      else
-+      {
-+         internal::PAHcurlL2Apply3D(dofs1D, quad1D, ndata, ne, mapsO->B, mapsC->B,
-+                                    mapsO->Bt, mapsC->Bt, mapsC->G, pa_data, x, y);
-+      }
-+   }
-+   else if (testType == mfem::FiniteElement::DIV &&
-+            trialType == mfem::FiniteElement::CURL && dim == 3)
-+   {
-+      internal::PAHcurlHdivApply3D(dofs1D, dofs1Dtest, quad1D, ne, mapsO->B,
-+                                   mapsC->B, mapsOtest->Bt, mapsCtest->Bt, mapsC->G,
-+                                   pa_data, x, y);
-+   }
-+   else
-+   {
-+      MFEM_ABORT("Unsupported dimension or space!");
-+   }
-+}
-+
-+void MixedVectorCurlIntegrator::AddMultTransposePA(const Vector &x,
-+                                                   Vector &y) const
-+{
-+   if (testType == mfem::FiniteElement::DIV &&
-+       trialType == mfem::FiniteElement::CURL && dim == 3)
-+   {
-+      internal::PAHcurlHdivApply3DTranspose(dofs1D, dofs1Dtest, quad1D, ne, mapsO->B,
-+                                            mapsC->B, mapsOtest->Bt, mapsCtest->Bt,
-+                                            mapsC->Gt, pa_data, x, y);
-+   }
-+   else
-+   {
-+      MFEM_ABORT("Unsupported dimension or space!");
-+   }
-+}
-+
-+void MixedVectorWeakCurlIntegrator::AssemblePA(
-+   const FiniteElementSpace &trial_fes,
-+   const FiniteElementSpace &test_fes)
-+{
-+   // Assumes tensor-product elements, with vector test and trial spaces.
-+   Mesh *mesh = trial_fes.GetMesh();
-+   const FiniteElement *trial_fel = trial_fes.GetFE(0);
-+   const FiniteElement *test_fel = test_fes.GetFE(0);
-+
-+   const VectorTensorFiniteElement *trial_el =
-+      dynamic_cast<const VectorTensorFiniteElement*>(trial_fel);
-+   MFEM_VERIFY(trial_el != NULL, "Only VectorTensorFiniteElement is supported!");
-+
-+   const VectorTensorFiniteElement *test_el =
-+      dynamic_cast<const VectorTensorFiniteElement*>(test_fel);
-+   MFEM_VERIFY(test_el != NULL, "Only VectorTensorFiniteElement is supported!");
-+
-+   const IntegrationRule *ir
-+      = IntRule ? IntRule : &MassIntegrator::GetRule(*trial_el, *trial_el,
-+                                                     *mesh->GetElementTransformation(0));
-+   const int dims = trial_el->GetDim();
-+   MFEM_VERIFY(dims == 3, "");
-+
-+   const int nq = ir->GetNPoints();
-+   dim = mesh->Dimension();
-+   MFEM_VERIFY(dim == 3, "");
-+
-+   MFEM_VERIFY(trial_el->GetOrder() == test_el->GetOrder(), "");
-+
-+   ne = trial_fes.GetNE();
-+   geom = mesh->GetGeometricFactors(*ir, GeometricFactors::JACOBIANS);
-+   mapsC = &test_el->GetDofToQuad(*ir, DofToQuad::TENSOR);
-+   mapsO = &test_el->GetDofToQuadOpen(*ir, DofToQuad::TENSOR);
-+   dofs1D = mapsC->ndof;
-+   quad1D = mapsC->nqpt;
-+
-+   MFEM_VERIFY(dofs1D == mapsO->ndof + 1 && quad1D == mapsO->nqpt, "");
-+
-+   testType = test_el->GetDerivType();
-+   trialType = trial_el->GetDerivType();
-+
-+   const bool curlSpaces = (testType == mfem::FiniteElement::CURL &&
-+                            trialType == mfem::FiniteElement::CURL);
-+
-+   const int symmDims = (dims * (dims + 1)) / 2; // 1x1: 1, 2x2: 3, 3x3: 6
-+
-+   coeffDim = DQ ? 3 : 1;
-+   const int ndata = curlSpaces ? (DQ ? 9 : 1) : symmDims;
-+
-+   pa_data.SetSize(ndata * nq * ne, Device::GetMemoryType());
-+
-+   QuadratureSpace qs(*mesh, *ir);
-+   CoefficientVector coeff(qs, CoefficientStorage::FULL);
-+   if (Q) { coeff.Project(*Q); }
-+   else if (DQ) { coeff.Project(*DQ); }
-+   else { coeff.SetConstant(1.0); }
-+
-+   if (trialType == mfem::FiniteElement::CURL && dim == 3)
-+   {
-+      if (coeffDim == 1)
-+      {
-+         internal::PAHcurlL2Setup3D(nq, coeffDim, ne, ir->GetWeights(), coeff, pa_data);
-+      }
-+      else
-+      {
-+         internal::PAHcurlHdivMassSetup3D(quad1D, coeffDim, ne, false, ir->GetWeights(),
-+                                          geom->J, coeff, pa_data);
-+      }
-+   }
-+   else if (trialType == mfem::FiniteElement::DIV && dim == 3 &&
-+            test_el->GetOrder() == trial_el->GetOrder())
-+   {
-+      internal::PACurlCurlSetup3D(quad1D, coeffDim, ne, ir->GetWeights(), geom->J,
-+                                  coeff, pa_data);
-+   }
-+   else
-+   {
-+      MFEM_ABORT("Unknown kernel.");
-+   }
-+}
-+
-+void MixedVectorWeakCurlIntegrator::AddMultPA(const Vector &x, Vector &y) const
-+{
-+   if (testType == mfem::FiniteElement::CURL &&
-+       trialType == mfem::FiniteElement::CURL && dim == 3)
-+   {
-+      const int ndata = coeffDim == 1 ? 1 : 9;
-+      if (Device::Allows(Backend::DEVICE_MASK))
-+      {
-+         const int ID = (dofs1D << 4) | quad1D;
-+         switch (ID)
-+         {
-+            case 0x23:
-+               return internal::SmemPAHcurlL2Apply3DTranspose<2,3>(
-+                         dofs1D, quad1D, ndata,
-+                         ne, mapsO->B, mapsC->B,
-+                         mapsC->G, pa_data, x, y);
-+            case 0x34:
-+               return internal::SmemPAHcurlL2Apply3DTranspose<3,4>(
-+                         dofs1D, quad1D, ndata,
-+                         ne, mapsO->B, mapsC->B,
-+                         mapsC->G, pa_data, x, y);
-+            case 0x45:
-+               return internal::SmemPAHcurlL2Apply3DTranspose<4,5>(
-+                         dofs1D, quad1D, ndata,
-+                         ne, mapsO->B, mapsC->B,
-+                         mapsC->G, pa_data, x, y);
-+            case 0x56:
-+               return internal::SmemPAHcurlL2Apply3DTranspose<5,6>(
-+                         dofs1D, quad1D, ndata,
-+                         ne, mapsO->B, mapsC->B,
-+                         mapsC->G, pa_data, x, y);
-+            default:
-+               return internal::SmemPAHcurlL2Apply3DTranspose(
-+                         dofs1D, quad1D, ndata, ne,
-+                         mapsO->B, mapsC->B,
-+                         mapsC->G, pa_data, x, y);
-+         }
-+      }
-+      else
-+      {
-+         internal::PAHcurlL2Apply3DTranspose(dofs1D, quad1D, ndata, ne, mapsO->B,
-+                                             mapsC->B, mapsO->Bt, mapsC->Bt, mapsC->Gt,
-+                                             pa_data, x, y);
-+      }
-+   }
-+   else if (testType == mfem::FiniteElement::CURL &&
-+            trialType == mfem::FiniteElement::DIV && dim == 3)
-+   {
-+      internal::PAHcurlHdivApply3DTranspose(dofs1D, dofs1D, quad1D, ne, mapsO->B,
-+                                            mapsC->B, mapsO->Bt, mapsC->Bt,
-+                                            mapsC->Gt, pa_data, x, y);
-+   }
-+   else
-+   {
-+      MFEM_ABORT("Unsupported dimension or space!");
-+   }
-+}
-+
-+void MixedVectorWeakCurlIntegrator::AddMultTransposePA(const Vector &x,
-+                                                       Vector &y) const
-+{
-+   if (testType == mfem::FiniteElement::CURL &&
-+       trialType == mfem::FiniteElement::DIV && dim == 3)
-+   {
-+      internal::PAHcurlHdivApply3D(dofs1D, dofs1D, quad1D, ne, mapsO->B,
-+                                   mapsC->B, mapsO->Bt, mapsC->Bt, mapsC->G,
-+                                   pa_data, x, y);
-+   }
-+   else
-+   {
-+      MFEM_ABORT("Unsupported dimension or space!");
-+   }
-+}
-+
-+} // namespace mfem
-diff --git a/fem/integ/bilininteg_mixedvecgrad_pa.cpp b/fem/integ/bilininteg_mixedvecgrad_pa.cpp
-new file mode 100644
-index 000000000..f9e6d3ee8
---- /dev/null
-+++ b/fem/integ/bilininteg_mixedvecgrad_pa.cpp
-@@ -0,0 +1,757 @@
-+// Copyright (c) 2010-2023, Lawrence Livermore National Security, LLC. Produced
-+// at the Lawrence Livermore National Laboratory. All Rights reserved. See files
-+// LICENSE and NOTICE for details. LLNL-CODE-806117.
-+//
-+// This file is part of the MFEM library. For more information and source code
-+// availability visit https://mfem.org.
-+//
-+// MFEM is free software; you can redistribute it and/or modify it under the
-+// terms of the BSD-3 license. We welcome feedback and contributions, see file
-+// CONTRIBUTING.md for details.
-+
-+#include "../../general/forall.hpp"
-+#include "../bilininteg.hpp"
-+#include "../gridfunc.hpp"
-+#include "../qfunction.hpp"
-+#include "bilininteg_diffusion_kernels.hpp"
-+
-+namespace mfem
-+{
-+
-+// Apply to x corresponding to DOFs in H^1 (trial), whose gradients are
-+// integrated against H(curl) test functions corresponding to y.
-+static void PAHcurlH1Apply2D(const int D1D,
-+                             const int Q1D,
-+                             const int NE,
-+                             const Array<double> &bc,
-+                             const Array<double> &gc,
-+                             const Array<double> &bot,
-+                             const Array<double> &bct,
-+                             const Vector &pa_data,
-+                             const Vector &x,
-+                             Vector &y)
-+{
-+   constexpr static int VDIM = 2;
-+   constexpr static int MAX_D1D = HCURL_MAX_D1D;
-+   constexpr static int MAX_Q1D = HCURL_MAX_Q1D;
-+
-+   auto Bc = Reshape(bc.Read(), Q1D, D1D);
-+   auto Gc = Reshape(gc.Read(), Q1D, D1D);
-+   auto Bot = Reshape(bot.Read(), D1D-1, Q1D);
-+   auto Bct = Reshape(bct.Read(), D1D, Q1D);
-+   auto op = Reshape(pa_data.Read(), Q1D, Q1D, 3, NE);
-+   auto X = Reshape(x.Read(), D1D, D1D, NE);
-+   auto Y = Reshape(y.ReadWrite(), 2*(D1D-1)*D1D, NE);
-+
-+   mfem::forall(NE, [=] MFEM_HOST_DEVICE (int e)
-+   {
-+      double mass[MAX_Q1D][MAX_Q1D][VDIM];
-+
-+      for (int qy = 0; qy < Q1D; ++qy)
-+      {
-+         for (int qx = 0; qx < Q1D; ++qx)
-+         {
-+            for (int c = 0; c < VDIM; ++c)
-+            {
-+               mass[qy][qx][c] = 0.0;
-+            }
-+         }
-+      }
-+
-+      for (int dy = 0; dy < D1D; ++dy)
-+      {
-+         double gradX[MAX_Q1D][2];
-+         for (int qx = 0; qx < Q1D; ++qx)
-+         {
-+            gradX[qx][0] = 0.0;
-+            gradX[qx][1] = 0.0;
-+         }
-+         for (int dx = 0; dx < D1D; ++dx)
-+         {
-+            const double s = X(dx,dy,e);
-+            for (int qx = 0; qx < Q1D; ++qx)
-+            {
-+               gradX[qx][0] += s * Bc(qx,dx);
-+               gradX[qx][1] += s * Gc(qx,dx);
-+            }
-+         }
-+         for (int qy = 0; qy < Q1D; ++qy)
-+         {
-+            const double wy  = Bc(qy,dy);
-+            const double wDy = Gc(qy,dy);
-+            for (int qx = 0; qx < Q1D; ++qx)
-+            {
-+               const double wx  = gradX[qx][0];
-+               const double wDx = gradX[qx][1];
-+               mass[qy][qx][0] += wDx * wy;
-+               mass[qy][qx][1] += wx * wDy;
-+            }
-+         }
-+      }
-+
-+      // Apply D operator.
-+      for (int qy = 0; qy < Q1D; ++qy)
-+      {
-+         for (int qx = 0; qx < Q1D; ++qx)
-+         {
-+            const double O11 = op(qx,qy,0,e);
-+            const double O12 = op(qx,qy,1,e);
-+            const double O22 = op(qx,qy,2,e);
-+            const double massX = mass[qy][qx][0];
-+            const double massY = mass[qy][qx][1];
-+            mass[qy][qx][0] = (O11*massX)+(O12*massY);
-+            mass[qy][qx][1] = (O12*massX)+(O22*massY);
-+         }
-+      }
-+
-+      for (int qy = 0; qy < Q1D; ++qy)
-+      {
-+         int osc = 0;
-+
-+         for (int c = 0; c < VDIM; ++c)  // loop over x, y components
-+         {
-+            const int D1Dy = (c == 1) ? D1D - 1 : D1D;
-+            const int D1Dx = (c == 0) ? D1D - 1 : D1D;
-+
-+            double massX[MAX_D1D];
-+            for (int dx = 0; dx < D1Dx; ++dx)
-+            {
-+               massX[dx] = 0;
-+            }
-+            for (int qx = 0; qx < Q1D; ++qx)
-+            {
-+               for (int dx = 0; dx < D1Dx; ++dx)
-+               {
-+                  massX[dx] += mass[qy][qx][c] * ((c == 0) ? Bot(dx,qx) : Bct(dx,qx));
-+               }
-+            }
-+
-+            for (int dy = 0; dy < D1Dy; ++dy)
-+            {
-+               const double wy = (c == 1) ? Bot(dy,qy) : Bct(dy,qy);
-+
-+               for (int dx = 0; dx < D1Dx; ++dx)
-+               {
-+                  Y(dx + (dy * D1Dx) + osc, e) += massX[dx] * wy;
-+               }
-+            }
-+
-+            osc += D1Dx * D1Dy;
-+         }  // loop c
-+      }
-+   }); // end of element loop
-+}
-+
-+// Apply to x corresponding to DOFs in H(curl), integrated
-+// against gradients of H^1 functions corresponding to y.
-+static void PAHcurlH1ApplyTranspose2D(const int D1D,
-+                                      const int Q1D,
-+                                      const int NE,
-+                                      const Array<double> &bc,
-+                                      const Array<double> &bo,
-+                                      const Array<double> &bct,
-+                                      const Array<double> &gct,
-+                                      const Vector &pa_data,
-+                                      const Vector &x,
-+                                      Vector &y)
-+{
-+   constexpr static int VDIM = 2;
-+   constexpr static int MAX_D1D = HCURL_MAX_D1D;
-+   constexpr static int MAX_Q1D = HCURL_MAX_Q1D;
-+
-+   auto Bc = Reshape(bc.Read(), Q1D, D1D);
-+   auto Bo = Reshape(bo.Read(), Q1D, D1D-1);
-+   auto Bt = Reshape(bct.Read(), D1D, Q1D);
-+   auto Gt = Reshape(gct.Read(), D1D, Q1D);
-+   auto op = Reshape(pa_data.Read(), Q1D, Q1D, 3, NE);
-+   auto X = Reshape(x.Read(), 2*(D1D-1)*D1D, NE);
-+   auto Y = Reshape(y.ReadWrite(), D1D, D1D, NE);
-+
-+   mfem::forall(NE, [=] MFEM_HOST_DEVICE (int e)
-+   {
-+      double mass[MAX_Q1D][MAX_Q1D][VDIM];
-+
-+      for (int qy = 0; qy < Q1D; ++qy)
-+      {
-+         for (int qx = 0; qx < Q1D; ++qx)
-+         {
-+            for (int c = 0; c < VDIM; ++c)
-+            {
-+               mass[qy][qx][c] = 0.0;
-+            }
-+         }
-+      }
-+
-+      int osc = 0;
-+
-+      for (int c = 0; c < VDIM; ++c)  // loop over x, y components
-+      {
-+         const int D1Dy = (c == 1) ? D1D - 1 : D1D;
-+         const int D1Dx = (c == 0) ? D1D - 1 : D1D;
-+
-+         for (int dy = 0; dy < D1Dy; ++dy)
-+         {
-+            double massX[MAX_Q1D];
-+            for (int qx = 0; qx < Q1D; ++qx)
-+            {
-+               massX[qx] = 0.0;
-+            }
-+
-+            for (int dx = 0; dx < D1Dx; ++dx)
-+            {
-+               const double t = X(dx + (dy * D1Dx) + osc, e);
-+               for (int qx = 0; qx < Q1D; ++qx)
-+               {
-+                  massX[qx] += t * ((c == 0) ? Bo(qx,dx) : Bc(qx,dx));
-+               }
-+            }
-+
-+            for (int qy = 0; qy < Q1D; ++qy)
-+            {
-+               const double wy = (c == 1) ? Bo(qy,dy) : Bc(qy,dy);
-+               for (int qx = 0; qx < Q1D; ++qx)
-+               {
-+                  mass[qy][qx][c] += massX[qx] * wy;
-+               }
-+            }
-+         }
-+
-+         osc += D1Dx * D1Dy;
-+      }  // loop (c) over components
-+
-+      // Apply D operator.
-+      for (int qy = 0; qy < Q1D; ++qy)
-+      {
-+         for (int qx = 0; qx < Q1D; ++qx)
-+         {
-+            const double O11 = op(qx,qy,0,e);
-+            const double O12 = op(qx,qy,1,e);
-+            const double O22 = op(qx,qy,2,e);
-+            const double massX = mass[qy][qx][0];
-+            const double massY = mass[qy][qx][1];
-+            mass[qy][qx][0] = (O11*massX)+(O12*massY);
-+            mass[qy][qx][1] = (O12*massX)+(O22*massY);
-+         }
-+      }
-+
-+      for (int qy = 0; qy < Q1D; ++qy)
-+      {
-+         double gradX[MAX_D1D][2];
-+         for (int dx = 0; dx < D1D; ++dx)
-+         {
-+            gradX[dx][0] = 0;
-+            gradX[dx][1] = 0;
-+         }
-+         for (int qx = 0; qx < Q1D; ++qx)
-+         {
-+            const double gX = mass[qy][qx][0];
-+            const double gY = mass[qy][qx][1];
-+            for (int dx = 0; dx < D1D; ++dx)
-+            {
-+               const double wx  = Bt(dx,qx);
-+               const double wDx = Gt(dx,qx);
-+               gradX[dx][0] += gX * wDx;
-+               gradX[dx][1] += gY * wx;
-+            }
-+         }
-+         for (int dy = 0; dy < D1D; ++dy)
-+         {
-+            const double wy  = Bt(dy,qy);
-+            const double wDy = Gt(dy,qy);
-+            for (int dx = 0; dx < D1D; ++dx)
-+            {
-+               Y(dx,dy,e) += ((gradX[dx][0] * wy) + (gradX[dx][1] * wDy));
-+            }
-+         }
-+      }
-+   }); // end of element loop
-+}
-+
-+// Apply to x corresponding to DOFs in H^1 (trial), whose gradients are
-+// integrated against H(curl) test functions corresponding to y.
-+static void PAHcurlH1Apply3D(const int D1D,
-+                             const int Q1D,
-+                             const int NE,
-+                             const Array<double> &bc,
-+                             const Array<double> &gc,
-+                             const Array<double> &bot,
-+                             const Array<double> &bct,
-+                             const Vector &pa_data,
-+                             const Vector &x,
-+                             Vector &y)
-+{
-+   constexpr static int MAX_D1D = HCURL_MAX_D1D;
-+   constexpr static int MAX_Q1D = HCURL_MAX_Q1D;
-+
-+   MFEM_VERIFY(D1D <= MAX_D1D, "Error: D1D > MAX_D1D");
-+   MFEM_VERIFY(Q1D <= MAX_Q1D, "Error: Q1D > MAX_Q1D");
-+
-+   constexpr static int VDIM = 3;
-+
-+   auto Bc = Reshape(bc.Read(), Q1D, D1D);
-+   auto Gc = Reshape(gc.Read(), Q1D, D1D);
-+   auto Bot = Reshape(bot.Read(), D1D-1, Q1D);
-+   auto Bct = Reshape(bct.Read(), D1D, Q1D);
-+   auto op = Reshape(pa_data.Read(), Q1D, Q1D, Q1D, 6, NE);
-+   auto X = Reshape(x.Read(), D1D, D1D, D1D, NE);
-+   auto Y = Reshape(y.ReadWrite(), 3*(D1D-1)*D1D*D1D, NE);
-+
-+   mfem::forall(NE, [=] MFEM_HOST_DEVICE (int e)
-+   {
-+      double mass[MAX_Q1D][MAX_Q1D][MAX_Q1D][VDIM];
-+
-+      for (int qz = 0; qz < Q1D; ++qz)
-+      {
-+         for (int qy = 0; qy < Q1D; ++qy)
-+         {
-+            for (int qx = 0; qx < Q1D; ++qx)
-+            {
-+               for (int c = 0; c < VDIM; ++c)
-+               {
-+                  mass[qz][qy][qx][c] = 0.0;
-+               }
-+            }
-+         }
-+      }
-+
-+      for (int dz = 0; dz < D1D; ++dz)
-+      {
-+         double gradXY[MAX_Q1D][MAX_Q1D][3];
-+         for (int qy = 0; qy < Q1D; ++qy)
-+         {
-+            for (int qx = 0; qx < Q1D; ++qx)
-+            {
-+               gradXY[qy][qx][0] = 0.0;
-+               gradXY[qy][qx][1] = 0.0;
-+               gradXY[qy][qx][2] = 0.0;
-+            }
-+         }
-+         for (int dy = 0; dy < D1D; ++dy)
-+         {
-+            double gradX[MAX_Q1D][2];
-+            for (int qx = 0; qx < Q1D; ++qx)
-+            {
-+               gradX[qx][0] = 0.0;
-+               gradX[qx][1] = 0.0;
-+            }
-+            for (int dx = 0; dx < D1D; ++dx)
-+            {
-+               const double s = X(dx,dy,dz,e);
-+               for (int qx = 0; qx < Q1D; ++qx)
-+               {
-+                  gradX[qx][0] += s * Bc(qx,dx);
-+                  gradX[qx][1] += s * Gc(qx,dx);
-+               }
-+            }
-+            for (int qy = 0; qy < Q1D; ++qy)
-+            {
-+               const double wy  = Bc(qy,dy);
-+               const double wDy = Gc(qy,dy);
-+               for (int qx = 0; qx < Q1D; ++qx)
-+               {
-+                  const double wx  = gradX[qx][0];
-+                  const double wDx = gradX[qx][1];
-+                  gradXY[qy][qx][0] += wDx * wy;
-+                  gradXY[qy][qx][1] += wx * wDy;
-+                  gradXY[qy][qx][2] += wx * wy;
-+               }
-+            }
-+         }
-+         for (int qz = 0; qz < Q1D; ++qz)
-+         {
-+            const double wz  = Bc(qz,dz);
-+            const double wDz = Gc(qz,dz);
-+            for (int qy = 0; qy < Q1D; ++qy)
-+            {
-+               for (int qx = 0; qx < Q1D; ++qx)
-+               {
-+                  mass[qz][qy][qx][0] += gradXY[qy][qx][0] * wz;
-+                  mass[qz][qy][qx][1] += gradXY[qy][qx][1] * wz;
-+                  mass[qz][qy][qx][2] += gradXY[qy][qx][2] * wDz;
-+               }
-+            }
-+         }
-+      }
-+
-+      // Apply D operator.
-+      for (int qz = 0; qz < Q1D; ++qz)
-+      {
-+         for (int qy = 0; qy < Q1D; ++qy)
-+         {
-+            for (int qx = 0; qx < Q1D; ++qx)
-+            {
-+               const double O11 = op(qx,qy,qz,0,e);
-+               const double O12 = op(qx,qy,qz,1,e);
-+               const double O13 = op(qx,qy,qz,2,e);
-+               const double O22 = op(qx,qy,qz,3,e);
-+               const double O23 = op(qx,qy,qz,4,e);
-+               const double O33 = op(qx,qy,qz,5,e);
-+               const double massX = mass[qz][qy][qx][0];
-+               const double massY = mass[qz][qy][qx][1];
-+               const double massZ = mass[qz][qy][qx][2];
-+               mass[qz][qy][qx][0] = (O11*massX)+(O12*massY)+(O13*massZ);
-+               mass[qz][qy][qx][1] = (O12*massX)+(O22*massY)+(O23*massZ);
-+               mass[qz][qy][qx][2] = (O13*massX)+(O23*massY)+(O33*massZ);
-+            }
-+         }
-+      }
-+
-+      for (int qz = 0; qz < Q1D; ++qz)
-+      {
-+         double massXY[MAX_D1D][MAX_D1D];
-+
-+         int osc = 0;
-+
-+         for (int c = 0; c < VDIM; ++c)  // loop over x, y, z components
-+         {
-+            const int D1Dz = (c == 2) ? D1D - 1 : D1D;
-+            const int D1Dy = (c == 1) ? D1D - 1 : D1D;
-+            const int D1Dx = (c == 0) ? D1D - 1 : D1D;
-+
-+            for (int dy = 0; dy < D1Dy; ++dy)
-+            {
-+               for (int dx = 0; dx < D1Dx; ++dx)
-+               {
-+                  massXY[dy][dx] = 0.0;
-+               }
-+            }
-+            for (int qy = 0; qy < Q1D; ++qy)
-+            {
-+               double massX[MAX_D1D];
-+               for (int dx = 0; dx < D1Dx; ++dx)
-+               {
-+                  massX[dx] = 0;
-+               }
-+               for (int qx = 0; qx < Q1D; ++qx)
-+               {
-+                  for (int dx = 0; dx < D1Dx; ++dx)
-+                  {
-+                     massX[dx] += mass[qz][qy][qx][c] * ((c == 0) ? Bot(dx,qx) : Bct(dx,qx));
-+                  }
-+               }
-+               for (int dy = 0; dy < D1Dy; ++dy)
-+               {
-+                  const double wy = (c == 1) ? Bot(dy,qy) : Bct(dy,qy);
-+                  for (int dx = 0; dx < D1Dx; ++dx)
-+                  {
-+                     massXY[dy][dx] += massX[dx] * wy;
-+                  }
-+               }
-+            }
-+
-+            for (int dz = 0; dz < D1Dz; ++dz)
-+            {
-+               const double wz = (c == 2) ? Bot(dz,qz) : Bct(dz,qz);
-+               for (int dy = 0; dy < D1Dy; ++dy)
-+               {
-+                  for (int dx = 0; dx < D1Dx; ++dx)
-+                  {
-+                     Y(dx + ((dy + (dz * D1Dy)) * D1Dx) + osc, e) += massXY[dy][dx] * wz;
-+                  }
-+               }
-+            }
-+
-+            osc += D1Dx * D1Dy * D1Dz;
-+         }  // loop c
-+      }  // loop qz
-+   }); // end of element loop
-+}
-+
-+// Apply to x corresponding to DOFs in H(curl), integrated
-+// against gradients of H^1 functions corresponding to y.
-+static void PAHcurlH1ApplyTranspose3D(const int D1D,
-+                                      const int Q1D,
-+                                      const int NE,
-+                                      const Array<double> &bc,
-+                                      const Array<double> &bo,
-+                                      const Array<double> &bct,
-+                                      const Array<double> &gct,
-+                                      const Vector &pa_data,
-+                                      const Vector &x,
-+                                      Vector &y)
-+{
-+   constexpr static int MAX_D1D = HCURL_MAX_D1D;
-+   constexpr static int MAX_Q1D = HCURL_MAX_Q1D;
-+
-+   MFEM_VERIFY(D1D <= MAX_D1D, "Error: D1D > MAX_D1D");
-+   MFEM_VERIFY(Q1D <= MAX_Q1D, "Error: Q1D > MAX_Q1D");
-+
-+   constexpr static int VDIM = 3;
-+
-+   auto Bc = Reshape(bc.Read(), Q1D, D1D);
-+   auto Bo = Reshape(bo.Read(), Q1D, D1D-1);
-+   auto Bt = Reshape(bct.Read(), D1D, Q1D);
-+   auto Gt = Reshape(gct.Read(), D1D, Q1D);
-+   auto op = Reshape(pa_data.Read(), Q1D, Q1D, Q1D, 6, NE);
-+   auto X = Reshape(x.Read(), 3*(D1D-1)*D1D*D1D, NE);
-+   auto Y = Reshape(y.ReadWrite(), D1D, D1D, D1D, NE);
-+
-+   mfem::forall(NE, [=] MFEM_HOST_DEVICE (int e)
-+   {
-+      double mass[MAX_Q1D][MAX_Q1D][MAX_Q1D][VDIM];
-+
-+      for (int qz = 0; qz < Q1D; ++qz)
-+      {
-+         for (int qy = 0; qy < Q1D; ++qy)
-+         {
-+            for (int qx = 0; qx < Q1D; ++qx)
-+            {
-+               for (int c = 0; c < VDIM; ++c)
-+               {
-+                  mass[qz][qy][qx][c] = 0.0;
-+               }
-+            }
-+         }
-+      }
-+
-+      int osc = 0;
-+
-+      for (int c = 0; c < VDIM; ++c)  // loop over x, y, z components
-+      {
-+         const int D1Dz = (c == 2) ? D1D - 1 : D1D;
-+         const int D1Dy = (c == 1) ? D1D - 1 : D1D;
-+         const int D1Dx = (c == 0) ? D1D - 1 : D1D;
-+
-+         for (int dz = 0; dz < D1Dz; ++dz)
-+         {
-+            double massXY[MAX_Q1D][MAX_Q1D];
-+            for (int qy = 0; qy < Q1D; ++qy)
-+            {
-+               for (int qx = 0; qx < Q1D; ++qx)
-+               {
-+                  massXY[qy][qx] = 0.0;
-+               }
-+            }
-+
-+            for (int dy = 0; dy < D1Dy; ++dy)
-+            {
-+               double massX[MAX_Q1D];
-+               for (int qx = 0; qx < Q1D; ++qx)
-+               {
-+                  massX[qx] = 0.0;
-+               }
-+
-+               for (int dx = 0; dx < D1Dx; ++dx)
-+               {
-+                  const double t = X(dx + ((dy + (dz * D1Dy)) * D1Dx) + osc, e);
-+                  for (int qx = 0; qx < Q1D; ++qx)
-+                  {
-+                     massX[qx] += t * ((c == 0) ? Bo(qx,dx) : Bc(qx,dx));
-+                  }
-+               }
-+
-+               for (int qy = 0; qy < Q1D; ++qy)
-+               {
-+                  const double wy = (c == 1) ? Bo(qy,dy) : Bc(qy,dy);
-+                  for (int qx = 0; qx < Q1D; ++qx)
-+                  {
-+                     const double wx = massX[qx];
-+                     massXY[qy][qx] += wx * wy;
-+                  }
-+               }
-+            }
-+
-+            for (int qz = 0; qz < Q1D; ++qz)
-+            {
-+               const double wz = (c == 2) ? Bo(qz,dz) : Bc(qz,dz);
-+               for (int qy = 0; qy < Q1D; ++qy)
-+               {
-+                  for (int qx = 0; qx < Q1D; ++qx)
-+                  {
-+                     mass[qz][qy][qx][c] += massXY[qy][qx] * wz;
-+                  }
-+               }
-+            }
-+         }
-+
-+         osc += D1Dx * D1Dy * D1Dz;
-+      }  // loop (c) over components
-+
-+      // Apply D operator.
-+      for (int qz = 0; qz < Q1D; ++qz)
-+      {
-+         for (int qy = 0; qy < Q1D; ++qy)
-+         {
-+            for (int qx = 0; qx < Q1D; ++qx)
-+            {
-+               const double O11 = op(qx,qy,qz,0,e);
-+               const double O12 = op(qx,qy,qz,1,e);
-+               const double O13 = op(qx,qy,qz,2,e);
-+               const double O22 = op(qx,qy,qz,3,e);
-+               const double O23 = op(qx,qy,qz,4,e);
-+               const double O33 = op(qx,qy,qz,5,e);
-+               const double massX = mass[qz][qy][qx][0];
-+               const double massY = mass[qz][qy][qx][1];
-+               const double massZ = mass[qz][qy][qx][2];
-+               mass[qz][qy][qx][0] = (O11*massX)+(O12*massY)+(O13*massZ);
-+               mass[qz][qy][qx][1] = (O12*massX)+(O22*massY)+(O23*massZ);
-+               mass[qz][qy][qx][2] = (O13*massX)+(O23*massY)+(O33*massZ);
-+            }
-+         }
-+      }
-+
-+      for (int qz = 0; qz < Q1D; ++qz)
-+      {
-+         double gradXY[MAX_D1D][MAX_D1D][3];
-+         for (int dy = 0; dy < D1D; ++dy)
-+         {
-+            for (int dx = 0; dx < D1D; ++dx)
-+            {
-+               gradXY[dy][dx][0] = 0;
-+               gradXY[dy][dx][1] = 0;
-+               gradXY[dy][dx][2] = 0;
-+            }
-+         }
-+         for (int qy = 0; qy < Q1D; ++qy)
-+         {
-+            double gradX[MAX_D1D][3];
-+            for (int dx = 0; dx < D1D; ++dx)
-+            {
-+               gradX[dx][0] = 0;
-+               gradX[dx][1] = 0;
-+               gradX[dx][2] = 0;
-+            }
-+            for (int qx = 0; qx < Q1D; ++qx)
-+            {
-+               const double gX = mass[qz][qy][qx][0];
-+               const double gY = mass[qz][qy][qx][1];
-+               const double gZ = mass[qz][qy][qx][2];
-+               for (int dx = 0; dx < D1D; ++dx)
-+               {
-+                  const double wx  = Bt(dx,qx);
-+                  const double wDx = Gt(dx,qx);
-+                  gradX[dx][0] += gX * wDx;
-+                  gradX[dx][1] += gY * wx;
-+                  gradX[dx][2] += gZ * wx;
-+               }
-+            }
-+            for (int dy = 0; dy < D1D; ++dy)
-+            {
-+               const double wy  = Bt(dy,qy);
-+               const double wDy = Gt(dy,qy);
-+               for (int dx = 0; dx < D1D; ++dx)
-+               {
-+                  gradXY[dy][dx][0] += gradX[dx][0] * wy;
-+                  gradXY[dy][dx][1] += gradX[dx][1] * wDy;
-+                  gradXY[dy][dx][2] += gradX[dx][2] * wy;
-+               }
-+            }
-+         }
-+         for (int dz = 0; dz < D1D; ++dz)
-+         {
-+            const double wz  = Bt(dz,qz);
-+            const double wDz = Gt(dz,qz);
-+            for (int dy = 0; dy < D1D; ++dy)
-+            {
-+               for (int dx = 0; dx < D1D; ++dx)
-+               {
-+                  Y(dx,dy,dz,e) +=
-+                     ((gradXY[dy][dx][0] * wz) +
-+                      (gradXY[dy][dx][1] * wz) +
-+                      (gradXY[dy][dx][2] * wDz));
-+               }
-+            }
-+         }
-+      }  // loop qz
-+   }); // end of element loop
-+}
-+
-+void MixedVectorGradientIntegrator::AssemblePA(
-+   const FiniteElementSpace &trial_fes,
-+   const FiniteElementSpace &test_fes)
-+{
-+   // Assumes tensor-product elements, with a vector test space and H^1 trial space.
-+   Mesh *mesh = trial_fes.GetMesh();
-+   const FiniteElement *trial_fel = trial_fes.GetFE(0);
-+   const FiniteElement *test_fel = test_fes.GetFE(0);
-+
-+   const NodalTensorFiniteElement *trial_el =
-+      dynamic_cast<const NodalTensorFiniteElement*>(trial_fel);
-+   MFEM_VERIFY(trial_el != NULL, "Only NodalTensorFiniteElement is supported!");
-+
-+   const VectorTensorFiniteElement *test_el =
-+      dynamic_cast<const VectorTensorFiniteElement*>(test_fel);
-+   MFEM_VERIFY(test_el != NULL, "Only VectorTensorFiniteElement is supported!");
-+
-+   const IntegrationRule *ir
-+      = IntRule ? IntRule : &MassIntegrator::GetRule(*trial_el, *trial_el,
-+                                                     *mesh->GetElementTransformation(0));
-+   const int dims = trial_el->GetDim();
-+   MFEM_VERIFY(dims == 2 || dims == 3, "");
-+
-+   const int symmDims = (dims * (dims + 1)) / 2; // 1x1: 1, 2x2: 3, 3x3: 6
-+   const int nq = ir->GetNPoints();
-+   dim = mesh->Dimension();
-+   MFEM_VERIFY(dim == 2 || dim == 3, "");
-+
-+   MFEM_VERIFY(trial_el->GetOrder() == test_el->GetOrder(), "");
-+
-+   ne = trial_fes.GetNE();
-+   geom = mesh->GetGeometricFactors(*ir, GeometricFactors::JACOBIANS);
-+   mapsC = &test_el->GetDofToQuad(*ir, DofToQuad::TENSOR);
-+   mapsO = &test_el->GetDofToQuadOpen(*ir, DofToQuad::TENSOR);
-+   dofs1D = mapsC->ndof;
-+   quad1D = mapsC->nqpt;
-+
-+   MFEM_VERIFY(dofs1D == mapsO->ndof + 1 && quad1D == mapsO->nqpt, "");
-+
-+   pa_data.SetSize(symmDims * nq * ne, Device::GetMemoryType());
-+
-+   QuadratureSpace qs(*mesh, *ir);
-+   CoefficientVector coeff(Q, qs, CoefficientStorage::FULL);
-+
-+   // Use the same setup functions as VectorFEMassIntegrator.
-+   if (test_el->GetDerivType() == mfem::FiniteElement::CURL && dim == 3)
-+   {
-+      internal::PADiffusionSetup3D(quad1D, 1, ne, ir->GetWeights(), geom->J,
-+                                   coeff, pa_data);
-+   }
-+   else if (test_el->GetDerivType() == mfem::FiniteElement::CURL && dim == 2)
-+   {
-+      internal::PADiffusionSetup2D<2>(quad1D, 1, ne, ir->GetWeights(), geom->J,
-+                                      coeff, pa_data);
-+   }
-+   else
-+   {
-+      MFEM_ABORT("Unknown kernel.");
-+   }
-+}
-+
-+void MixedVectorGradientIntegrator::AddMultPA(const Vector &x, Vector &y) const
-+{
-+   if (dim == 3)
-+   {
-+      PAHcurlH1Apply3D(dofs1D, quad1D, ne, mapsC->B, mapsC->G,
-+                       mapsO->Bt, mapsC->Bt, pa_data, x, y);
-+   }
-+   else if (dim == 2)
-+   {
-+      PAHcurlH1Apply2D(dofs1D, quad1D, ne, mapsC->B, mapsC->G,
-+                       mapsO->Bt, mapsC->Bt, pa_data, x, y);
-+   }
-+   else
-+   {
-+      MFEM_ABORT("Unsupported dimension!");
-+   }
-+}
-+
-+void MixedVectorGradientIntegrator::AddMultTransposePA(const Vector &x,
-+                                                       Vector &y) const
-+{
-+   if (dim == 3)
-+   {
-+      PAHcurlH1ApplyTranspose3D(dofs1D, quad1D, ne, mapsC->B, mapsO->B,
-+                                mapsC->Bt, mapsC->Gt, pa_data, x, y);
-+   }
-+   else if (dim == 2)
-+   {
-+      PAHcurlH1ApplyTranspose2D(dofs1D, quad1D, ne, mapsC->B, mapsO->B,
-+                                mapsC->Bt, mapsC->Gt, pa_data, x, y);
-+   }
-+   else
-+   {
-+      MFEM_ABORT("Unsupported dimension!");
-+   }
-+}
-+
-+} // namespace mfem
-diff --git a/fem/integ/bilininteg_transpose_ea.cpp b/fem/integ/bilininteg_transpose_ea.cpp
-new file mode 100644
-index 000000000..e1ac154fc
---- /dev/null
-+++ b/fem/integ/bilininteg_transpose_ea.cpp
-@@ -0,0 +1,106 @@
-+// Copyright (c) 2010-2023, Lawrence Livermore National Security, LLC. Produced
-+// at the Lawrence Livermore National Laboratory. All Rights reserved. See files
-+// LICENSE and NOTICE for details. LLNL-CODE-806117.
-+//
-+// This file is part of the MFEM library. For more information and source code
-+// availability visit https://mfem.org.
-+//
-+// MFEM is free software; you can redistribute it and/or modify it under the
-+// terms of the BSD-3 license. We welcome feedback and contributions, see file
-+// CONTRIBUTING.md for details.
-+
-+#include "../../general/forall.hpp"
-+#include "../bilininteg.hpp"
-+
-+namespace mfem
-+{
-+
-+void TransposeIntegrator::AssembleEA(const FiniteElementSpace &fes,
-+                                     Vector &ea_data)
-+{
-+   const int ne = fes.GetNE();
-+   if (ne == 0) { return; }
-+
-+   const int dofs = fes.GetFE(0)->GetDof();
-+   Vector ea_data_tmp(ea_data.Size());
-+   ea_data_tmp = 0.0;
-+   bfi->AssembleEA(fes, ea_data_tmp);
-+   auto A = Reshape(ea_data_tmp.Read(), dofs, dofs, ne);
-+   auto AT = Reshape(ea_data.ReadWrite(), dofs, dofs, ne);
-+   mfem::forall(ne, [=] MFEM_HOST_DEVICE (int e)
-+   {
-+      for (int i = 0; i < dofs; i++)
-+      {
-+         for (int j = 0; j < dofs; j++)
-+         {
-+            const double a = A(i, j, e);
-+            AT(j, i, e) += a;
-+         }
-+      }
-+   });
-+}
-+
-+void TransposeIntegrator::AssembleEAInteriorFaces(const FiniteElementSpace &fes,
-+                                                  Vector &ea_data_int,
-+                                                  Vector &ea_data_ext)
-+{
-+   const int nf = fes.GetNFbyType(FaceType::Interior);
-+   if (nf == 0) { return; }
-+
-+   const int face_dofs = fes.GetTraceElement(0,
-+                                             fes.GetMesh()->GetFaceGeometry(0))->GetDof();
-+   Vector ea_data_int_tmp(ea_data_int.Size());
-+   Vector ea_data_ext_tmp(ea_data_ext.Size());
-+   ea_data_int_tmp = 0.0;
-+   ea_data_ext_tmp = 0.0;
-+   bfi->AssembleEAInteriorFaces(fes, ea_data_int_tmp, ea_data_ext_tmp);
-+   auto A_int = Reshape(ea_data_int_tmp.Read(), face_dofs, face_dofs, 2, nf);
-+   auto A_ext = Reshape(ea_data_ext_tmp.Read(), face_dofs, face_dofs, 2, nf);
-+   auto AT_int = Reshape(ea_data_int.ReadWrite(), face_dofs, face_dofs, 2, nf);
-+   auto AT_ext = Reshape(ea_data_ext.ReadWrite(), face_dofs, face_dofs, 2, nf);
-+   mfem::forall(nf, [=] MFEM_HOST_DEVICE (int f)
-+   {
-+      for (int i = 0; i < face_dofs; i++)
-+      {
-+         for (int j = 0; j < face_dofs; j++)
-+         {
-+            const double a_int0 = A_int(i, j, 0, f);
-+            const double a_int1 = A_int(i, j, 1, f);
-+            const double a_ext0 = A_ext(i, j, 0, f);
-+            const double a_ext1 = A_ext(i, j, 1, f);
-+            AT_int(j, i, 0, f) += a_int0;
-+            AT_int(j, i, 1, f) += a_int1;
-+            AT_ext(j, i, 0, f) += a_ext1;
-+            AT_ext(j, i, 1, f) += a_ext0;
-+         }
-+      }
-+   });
-+}
-+
-+void TransposeIntegrator::AssembleEABoundaryFaces(const FiniteElementSpace &fes,
-+                                                  Vector &ea_data_bdr)
-+{
-+   const int nf = fes.GetNFbyType(FaceType::Boundary);
-+   if (nf == 0) { return; }
-+
-+   const int face_dofs = fes.GetTraceElement(0,
-+                                             fes.GetMesh()->GetFaceGeometry(0))->GetDof();
-+   Vector ea_data_bdr_tmp(ea_data_bdr.Size());
-+   ea_data_bdr_tmp = 0.0;
-+   bfi->AssembleEABoundaryFaces(fes, ea_data_bdr_tmp);
-+   auto A_bdr = Reshape(ea_data_bdr_tmp.Read(), face_dofs, face_dofs, nf);
-+   auto AT_bdr = Reshape(ea_data_bdr.ReadWrite(), face_dofs, face_dofs, nf);
-+   mfem::forall(nf, [=] MFEM_HOST_DEVICE (int f)
-+   {
-+      for (int i = 0; i < face_dofs; i++)
-+      {
-+         for (int j = 0; j < face_dofs; j++)
-+         {
-+            const double a_bdr = A_bdr(i, j, f);
-+            AT_bdr(j, i, f) += a_bdr;
-+         }
-+      }
-+   });
-+}
-+
-+}
-diff --git a/fem/bilininteg_vecdiffusion_mf.cpp b/fem/integ/bilininteg_vecdiffusion_mf.cpp
-similarity index 93%
-rename from fem/bilininteg_vecdiffusion_mf.cpp
-rename to fem/integ/bilininteg_vecdiffusion_mf.cpp
-index dae344544..7cad61496 100644
---- a/fem/bilininteg_vecdiffusion_mf.cpp
-+++ b/fem/integ/bilininteg_vecdiffusion_mf.cpp
-@@ -9,12 +9,10 @@
- // terms of the BSD-3 license. We welcome feedback and contributions, see file
- // CONTRIBUTING.md for details.
- 
--#include "../general/forall.hpp"
--#include "bilininteg.hpp"
--#include "gridfunc.hpp"
--#include "ceed/integrators/diffusion/diffusion.hpp"
--
--using namespace std;
-+#include "../../general/forall.hpp"
-+#include "../bilininteg.hpp"
-+#include "../gridfunc.hpp"
-+#include "../ceed/integrators/diffusion/diffusion.hpp"
- 
- namespace mfem
- {
-diff --git a/fem/bilininteg_vecdiffusion.cpp b/fem/integ/bilininteg_vecdiffusion_pa.cpp
-similarity index 88%
-rename from fem/bilininteg_vecdiffusion.cpp
-rename to fem/integ/bilininteg_vecdiffusion_pa.cpp
-index 1915fca37..84e4d5b2a 100644
---- a/fem/bilininteg_vecdiffusion.cpp
-+++ b/fem/integ/bilininteg_vecdiffusion_pa.cpp
-@@ -9,19 +9,15 @@
- // terms of the BSD-3 license. We welcome feedback and contributions, see file
- // CONTRIBUTING.md for details.
- 
--#include "../general/forall.hpp"
--#include "bilininteg.hpp"
--#include "gridfunc.hpp"
--#include "qfunction.hpp"
--#include "ceed/integrators/diffusion/diffusion.hpp"
--
--using namespace std;
-+#include "../../general/forall.hpp"
-+#include "../bilininteg.hpp"
-+#include "../gridfunc.hpp"
-+#include "../qfunction.hpp"
-+#include "../ceed/integrators/diffusion/diffusion.hpp"
- 
- namespace mfem
- {
- 
--// PA Vector Diffusion Integrator
--
- // PA Diffusion Assemble 2D kernel
- static void PAVectorDiffusionSetup2D(const int Q1D,
-                                      const int NE,
-@@ -40,7 +36,6 @@ static void PAVectorDiffusionSetup2D(const int Q1D,
-    const auto C = const_c ? Reshape(c.Read(), 1,1) :
-                   Reshape(c.Read(), NQ, NE);
- 
--
-    mfem::forall(NE, [=] MFEM_HOST_DEVICE (int e)
-    {
-       for (int q = 0; q < NQ; ++q)
-@@ -76,7 +71,6 @@ static void PAVectorDiffusionSetup3D(const int Q1D,
-    const auto C = const_c ? Reshape(c.Read(), 1,1) :
-                   Reshape(c.Read(), NQ,NE);
- 
--
-    mfem::forall(NE, [=] MFEM_HOST_DEVICE (int e)
-    {
-       for (int q = 0; q < NQ; ++q)
-@@ -118,28 +112,6 @@ static void PAVectorDiffusionSetup3D(const int Q1D,
-    });
- }
- 
--static void PAVectorDiffusionSetup(const int dim,
--                                   const int Q1D,
--                                   const int NE,
--                                   const Array<double> &W,
--                                   const Vector &J,
--                                   const Vector &C,
--                                   Vector &op)
--{
--   if (!(dim == 2 || dim == 3))
--   {
--      MFEM_ABORT("Dimension not supported.");
--   }
--   if (dim == 2)
--   {
--      PAVectorDiffusionSetup2D(Q1D, NE, W, J, C, op);
--   }
--   if (dim == 3)
--   {
--      PAVectorDiffusionSetup3D(Q1D, NE, W, J, C, op);
--   }
--}
--
- void VectorDiffusionIntegrator::AssemblePA(const FiniteElementSpace &fes)
- {
-    // Assumes tensor-product elements
-@@ -183,7 +155,10 @@ void VectorDiffusionIntegrator::AssemblePA(const FiniteElementSpace &fes)
-    const Array<double> &w = ir->GetWeights();
-    const Vector &j = geom->J;
-    Vector &d = pa_data;
--   if (dim == 1) { MFEM_ABORT("dim==1 not supported in PAVectorDiffusionSetup"); }
-+   if (dim == 1)
-+   {
-+      MFEM_ABORT("dim==1 not supported in VectorDiffusionIntegrator::AssemblePA");
-+   }
-    if (dim == 2 && sdim == 3)
-    {
-       constexpr int DIM = 2;
-@@ -222,23 +197,224 @@ void VectorDiffusionIntegrator::AssemblePA(const FiniteElementSpace &fes)
-    }
-    else
-    {
--      PAVectorDiffusionSetup(dim, quad1D, ne, w, j, coeff, d);
-+      if (dim == 2)
-+      {
-+         return PAVectorDiffusionSetup2D(quad1D, ne, w, j, coeff, d);
-+      }
-+      if (dim == 3)
-+      {
-+         return PAVectorDiffusionSetup3D(quad1D, ne, w, j, coeff, d);
-+      }
-+      MFEM_ABORT("Dimension not supported.");
-+   }
-+}
-+
-+template<int T_D1D = 0, int T_Q1D = 0>
-+static void PAVectorDiffusionDiagonal2D(const int NE,
-+                                        const Array<double> &b,
-+                                        const Array<double> &g,
-+                                        const Vector &d,
-+                                        Vector &y,
-+                                        const int d1d = 0,
-+                                        const int q1d = 0)
-+{
-+   const int D1D = T_D1D ? T_D1D : d1d;
-+   const int Q1D = T_Q1D ? T_Q1D : q1d;
-+   MFEM_VERIFY(D1D <= MAX_D1D, "");
-+   MFEM_VERIFY(Q1D <= MAX_Q1D, "");
-+   auto B = Reshape(b.Read(), Q1D, D1D);
-+   auto G = Reshape(g.Read(), Q1D, D1D);
-+   // note the different shape for D, this is a (symmetric) matrix so we only
-+   // store necessary entries
-+   auto D = Reshape(d.Read(), Q1D*Q1D, 3, NE);
-+   auto Y = Reshape(y.ReadWrite(), D1D, D1D, 2, NE);
-+   mfem::forall(NE, [=] MFEM_HOST_DEVICE (int e)
-+   {
-+      const int D1D = T_D1D ? T_D1D : d1d;
-+      const int Q1D = T_Q1D ? T_Q1D : q1d;
-+      constexpr int MD1 = T_D1D ? T_D1D : MAX_D1D;
-+      constexpr int MQ1 = T_Q1D ? T_Q1D : MAX_Q1D;
-+      // gradphi \cdot Q \gradphi has four terms
-+      double QD0[MQ1][MD1];
-+      double QD1[MQ1][MD1];
-+      double QD2[MQ1][MD1];
-+      for (int qx = 0; qx < Q1D; ++qx)
-+      {
-+         for (int dy = 0; dy < D1D; ++dy)
-+         {
-+            QD0[qx][dy] = 0.0;
-+            QD1[qx][dy] = 0.0;
-+            QD2[qx][dy] = 0.0;
-+            for (int qy = 0; qy < Q1D; ++qy)
-+            {
-+               const int q = qx + qy * Q1D;
-+               const double D0 = D(q,0,e);
-+               const double D1 = D(q,1,e);
-+               const double D2 = D(q,2,e);
-+               QD0[qx][dy] += B(qy, dy) * B(qy, dy) * D0;
-+               QD1[qx][dy] += B(qy, dy) * G(qy, dy) * D1;
-+               QD2[qx][dy] += G(qy, dy) * G(qy, dy) * D2;
-+            }
-+         }
-+      }
-+      for (int dy = 0; dy < D1D; ++dy)
-+      {
-+         for (int dx = 0; dx < D1D; ++dx)
-+         {
-+            double temp = 0.0;
-+            for (int qx = 0; qx < Q1D; ++qx)
-+            {
-+               temp += G(qx, dx) * G(qx, dx) * QD0[qx][dy];
-+               temp += G(qx, dx) * B(qx, dx) * QD1[qx][dy];
-+               temp += B(qx, dx) * G(qx, dx) * QD1[qx][dy];
-+               temp += B(qx, dx) * B(qx, dx) * QD2[qx][dy];
-+            }
-+            Y(dx,dy,0,e) += temp;
-+            Y(dx,dy,1,e) += temp;
-+         }
-+      }
-+   });
-+}
-+
-+template<int T_D1D = 0, int T_Q1D = 0>
-+static void PAVectorDiffusionDiagonal3D(const int NE,
-+                                        const Array<double> &b,
-+                                        const Array<double> &g,
-+                                        const Vector &d,
-+                                        Vector &y,
-+                                        const int d1d = 0,
-+                                        const int q1d = 0)
-+{
-+   constexpr int DIM = 3;
-+   const int D1D = T_D1D ? T_D1D : d1d;
-+   const int Q1D = T_Q1D ? T_Q1D : q1d;
-+   constexpr int MQ1 = T_Q1D ? T_Q1D : MAX_Q1D;
-+   constexpr int MD1 = T_D1D ? T_D1D : MAX_D1D;
-+   MFEM_VERIFY(D1D <= MD1, "");
-+   MFEM_VERIFY(Q1D <= MQ1, "");
-+   auto B = Reshape(b.Read(), Q1D, D1D);
-+   auto G = Reshape(g.Read(), Q1D, D1D);
-+   auto Q = Reshape(d.Read(), Q1D*Q1D*Q1D, 6, NE);
-+   auto Y = Reshape(y.ReadWrite(), D1D, D1D, D1D, 3, NE);
-+   mfem::forall(NE, [=] MFEM_HOST_DEVICE (int e)
-+   {
-+      const int D1D = T_D1D ? T_D1D : d1d;
-+      const int Q1D = T_Q1D ? T_Q1D : q1d;
-+      constexpr int MD1 = T_D1D ? T_D1D : MAX_D1D;
-+      constexpr int MQ1 = T_Q1D ? T_Q1D : MAX_Q1D;
-+      double QQD[MQ1][MQ1][MD1];
-+      double QDD[MQ1][MD1][MD1];
-+      for (int i = 0; i < DIM; ++i)
-+      {
-+         for (int j = 0; j < DIM; ++j)
-+         {
-+            // first tensor contraction, along z direction
-+            for (int qx = 0; qx < Q1D; ++qx)
-+            {
-+               for (int qy = 0; qy < Q1D; ++qy)
-+               {
-+                  for (int dz = 0; dz < D1D; ++dz)
-+                  {
-+                     QQD[qx][qy][dz] = 0.0;
-+                     for (int qz = 0; qz < Q1D; ++qz)
-+                     {
-+                        const int q = qx + (qy + qz * Q1D) * Q1D;
-+                        const int k = j >= i ?
-+                                      3 - (3-i)*(2-i)/2 + j:
-+                                      3 - (3-j)*(2-j)/2 + i;
-+                        const double O = Q(q,k,e);
-+                        const double Bz = B(qz,dz);
-+                        const double Gz = G(qz,dz);
-+                        const double L = i==2 ? Gz : Bz;
-+                        const double R = j==2 ? Gz : Bz;
-+                        QQD[qx][qy][dz] += L * O * R;
-+                     }
-+                  }
-+               }
-+            }
-+            // second tensor contraction, along y direction
-+            for (int qx = 0; qx < Q1D; ++qx)
-+            {
-+               for (int dz = 0; dz < D1D; ++dz)
-+               {
-+                  for (int dy = 0; dy < D1D; ++dy)
-+                  {
-+                     QDD[qx][dy][dz] = 0.0;
-+                     for (int qy = 0; qy < Q1D; ++qy)
-+                     {
-+                        const double By = B(qy,dy);
-+                        const double Gy = G(qy,dy);
-+                        const double L = i==1 ? Gy : By;
-+                        const double R = j==1 ? Gy : By;
-+                        QDD[qx][dy][dz] += L * QQD[qx][qy][dz] * R;
-+                     }
-+                  }
-+               }
-+            }
-+            // third tensor contraction, along x direction
-+            for (int dz = 0; dz < D1D; ++dz)
-+            {
-+               for (int dy = 0; dy < D1D; ++dy)
-+               {
-+                  for (int dx = 0; dx < D1D; ++dx)
-+                  {
-+                     double temp = 0.0;
-+                     for (int qx = 0; qx < Q1D; ++qx)
-+                     {
-+                        const double Bx = B(qx,dx);
-+                        const double Gx = G(qx,dx);
-+                        const double L = i==0 ? Gx : Bx;
-+                        const double R = j==0 ? Gx : Bx;
-+                        temp += L * QDD[qx][dy][dz] * R;
-+                     }
-+                     Y(dx, dy, dz, 0, e) += temp;
-+                     Y(dx, dy, dz, 1, e) += temp;
-+                     Y(dx, dy, dz, 2, e) += temp;
-+                  }
-+               }
-+            }
-+         }
-+      }
-+   });
-+}
-+
-+void VectorDiffusionIntegrator::AssembleDiagonalPA(Vector &diag)
-+{
-+   if (DeviceCanUseCeed())
-+   {
-+      ceedOp->GetDiagonal(diag);
-+   }
-+   else
-+   {
-+      if (dim == 2)
-+      {
-+         return PAVectorDiffusionDiagonal2D(ne, maps->B, maps->G,
-+                                            pa_data, diag,
-+                                            dofs1D, quad1D);
-+      }
-+      else if (dim == 3)
-+      {
-+         return PAVectorDiffusionDiagonal3D(ne, maps->B, maps->G,
-+                                            pa_data, diag,
-+                                            dofs1D, quad1D);
-+      }
-+      MFEM_ABORT("Dimension not implemented.");
-    }
- }
- 
- // PA Diffusion Apply 2D kernel
--template<int T_D1D = 0, int T_Q1D = 0, int T_VDIM = 0> static
--void PAVectorDiffusionApply2D(const int NE,
--                              const Array<double> &b,
--                              const Array<double> &g,
--                              const Array<double> &bt,
--                              const Array<double> &gt,
--                              const Vector &d_,
--                              const Vector &x_,
--                              Vector &y_,
--                              const int d1d = 0,
--                              const int q1d = 0,
--                              const int vdim = 0)
-+template<int T_D1D = 0, int T_Q1D = 0, int T_VDIM = 0>
-+static void PAVectorDiffusionApply2D(const int NE,
-+                                     const Array<double> &b,
-+                                     const Array<double> &g,
-+                                     const Array<double> &bt,
-+                                     const Array<double> &gt,
-+                                     const Vector &d_,
-+                                     const Vector &x_,
-+                                     Vector &y_,
-+                                     const int d1d = 0,
-+                                     const int q1d = 0,
-+                                     const int vdim = 0)
- {
-    const int D1D = T_D1D ? T_D1D : d1d;
-    const int Q1D = T_Q1D ? T_Q1D : q1d;
-@@ -349,17 +525,16 @@ void PAVectorDiffusionApply2D(const int NE,
- }
- 
- // PA Diffusion Apply 3D kernel
--template<const int T_D1D = 0,
--         const int T_Q1D = 0> static
--void PAVectorDiffusionApply3D(const int NE,
--                              const Array<double> &b,
--                              const Array<double> &g,
--                              const Array<double> &bt,
--                              const Array<double> &gt,
--                              const Vector &op_,
--                              const Vector &x_,
--                              Vector &y_,
--                              int d1d = 0, int q1d = 0)
-+template<const int T_D1D = 0, const int T_Q1D = 0>
-+static void PAVectorDiffusionApply3D(const int NE,
-+                                     const Array<double> &b,
-+                                     const Array<double> &g,
-+                                     const Array<double> &bt,
-+                                     const Array<double> &gt,
-+                                     const Vector &op_,
-+                                     const Vector &x_,
-+                                     Vector &y_,
-+                                     int d1d = 0, int q1d = 0)
- {
-    const int D1D = T_D1D ? T_D1D : d1d;
-    const int Q1D = T_Q1D ? T_Q1D : q1d;
-@@ -542,7 +717,6 @@ void PAVectorDiffusionApply3D(const int NE,
-    });
- }
- 
--// PA Diffusion Apply kernel
- void VectorDiffusionIntegrator::AddMultPA(const Vector &x, Vector &y) const
- {
-    if (DeviceCanUseCeed())
-@@ -572,220 +746,14 @@ void VectorDiffusionIntegrator::AddMultPA(const Vector &x, Vector &y) const
-          }
-       }
-       if (dim == 2 && sdim == 2)
--      { return PAVectorDiffusionApply2D(ne,B,G,Bt,Gt,D,x,y,D1D,Q1D,sdim); }
--
--      if (dim == 3 && sdim == 3)
--      { return PAVectorDiffusionApply3D(ne,B,G,Bt,Gt,D,x,y,D1D,Q1D); }
--
--      MFEM_ABORT("Unknown kernel.");
--   }
--}
--
--template<int T_D1D = 0, int T_Q1D = 0>
--static void PAVectorDiffusionDiagonal2D(const int NE,
--                                        const Array<double> &b,
--                                        const Array<double> &g,
--                                        const Vector &d,
--                                        Vector &y,
--                                        const int d1d = 0,
--                                        const int q1d = 0)
--{
--   const int D1D = T_D1D ? T_D1D : d1d;
--   const int Q1D = T_Q1D ? T_Q1D : q1d;
--   MFEM_VERIFY(D1D <= MAX_D1D, "");
--   MFEM_VERIFY(Q1D <= MAX_Q1D, "");
--   auto B = Reshape(b.Read(), Q1D, D1D);
--   auto G = Reshape(g.Read(), Q1D, D1D);
--   // note the different shape for D, this is a (symmetric) matrix so we only
--   // store necessary entries
--   auto D = Reshape(d.Read(), Q1D*Q1D, 3, NE);
--   auto Y = Reshape(y.ReadWrite(), D1D, D1D, 2, NE);
--   mfem::forall(NE, [=] MFEM_HOST_DEVICE (int e)
--   {
--      const int D1D = T_D1D ? T_D1D : d1d;
--      const int Q1D = T_Q1D ? T_Q1D : q1d;
--      constexpr int MD1 = T_D1D ? T_D1D : MAX_D1D;
--      constexpr int MQ1 = T_Q1D ? T_Q1D : MAX_Q1D;
--      // gradphi \cdot Q \gradphi has four terms
--      double QD0[MQ1][MD1];
--      double QD1[MQ1][MD1];
--      double QD2[MQ1][MD1];
--      for (int qx = 0; qx < Q1D; ++qx)
-       {
--         for (int dy = 0; dy < D1D; ++dy)
--         {
--            QD0[qx][dy] = 0.0;
--            QD1[qx][dy] = 0.0;
--            QD2[qx][dy] = 0.0;
--            for (int qy = 0; qy < Q1D; ++qy)
--            {
--               const int q = qx + qy * Q1D;
--               const double D0 = D(q,0,e);
--               const double D1 = D(q,1,e);
--               const double D2 = D(q,2,e);
--               QD0[qx][dy] += B(qy, dy) * B(qy, dy) * D0;
--               QD1[qx][dy] += B(qy, dy) * G(qy, dy) * D1;
--               QD2[qx][dy] += G(qy, dy) * G(qy, dy) * D2;
--            }
--         }
-+         return PAVectorDiffusionApply2D(ne,B,G,Bt,Gt,D,x,y,D1D,Q1D,sdim);
-       }
--      for (int dy = 0; dy < D1D; ++dy)
--      {
--         for (int dx = 0; dx < D1D; ++dx)
--         {
--            double temp = 0.0;
--            for (int qx = 0; qx < Q1D; ++qx)
--            {
--               temp += G(qx, dx) * G(qx, dx) * QD0[qx][dy];
--               temp += G(qx, dx) * B(qx, dx) * QD1[qx][dy];
--               temp += B(qx, dx) * G(qx, dx) * QD1[qx][dy];
--               temp += B(qx, dx) * B(qx, dx) * QD2[qx][dy];
--            }
--            Y(dx,dy,0,e) += temp;
--            Y(dx,dy,1,e) += temp;
--         }
--      }
--   });
--}
--
--template<int T_D1D = 0, int T_Q1D = 0>
--static void PAVectorDiffusionDiagonal3D(const int NE,
--                                        const Array<double> &b,
--                                        const Array<double> &g,
--                                        const Vector &d,
--                                        Vector &y,
--                                        const int d1d = 0,
--                                        const int q1d = 0)
--{
--   constexpr int DIM = 3;
--   const int D1D = T_D1D ? T_D1D : d1d;
--   const int Q1D = T_Q1D ? T_Q1D : q1d;
--   constexpr int MQ1 = T_Q1D ? T_Q1D : MAX_Q1D;
--   constexpr int MD1 = T_D1D ? T_D1D : MAX_D1D;
--   MFEM_VERIFY(D1D <= MD1, "");
--   MFEM_VERIFY(Q1D <= MQ1, "");
--   auto B = Reshape(b.Read(), Q1D, D1D);
--   auto G = Reshape(g.Read(), Q1D, D1D);
--   auto Q = Reshape(d.Read(), Q1D*Q1D*Q1D, 6, NE);
--   auto Y = Reshape(y.ReadWrite(), D1D, D1D, D1D, 3, NE);
--   mfem::forall(NE, [=] MFEM_HOST_DEVICE (int e)
--   {
--      const int D1D = T_D1D ? T_D1D : d1d;
--      const int Q1D = T_Q1D ? T_Q1D : q1d;
--      constexpr int MD1 = T_D1D ? T_D1D : MAX_D1D;
--      constexpr int MQ1 = T_Q1D ? T_Q1D : MAX_Q1D;
--      double QQD[MQ1][MQ1][MD1];
--      double QDD[MQ1][MD1][MD1];
--      for (int i = 0; i < DIM; ++i)
-+      if (dim == 3 && sdim == 3)
-       {
--         for (int j = 0; j < DIM; ++j)
--         {
--            // first tensor contraction, along z direction
--            for (int qx = 0; qx < Q1D; ++qx)
--            {
--               for (int qy = 0; qy < Q1D; ++qy)
--               {
--                  for (int dz = 0; dz < D1D; ++dz)
--                  {
--                     QQD[qx][qy][dz] = 0.0;
--                     for (int qz = 0; qz < Q1D; ++qz)
--                     {
--                        const int q = qx + (qy + qz * Q1D) * Q1D;
--                        const int k = j >= i ?
--                                      3 - (3-i)*(2-i)/2 + j:
--                                      3 - (3-j)*(2-j)/2 + i;
--                        const double O = Q(q,k,e);
--                        const double Bz = B(qz,dz);
--                        const double Gz = G(qz,dz);
--                        const double L = i==2 ? Gz : Bz;
--                        const double R = j==2 ? Gz : Bz;
--                        QQD[qx][qy][dz] += L * O * R;
--                     }
--                  }
--               }
--            }
--            // second tensor contraction, along y direction
--            for (int qx = 0; qx < Q1D; ++qx)
--            {
--               for (int dz = 0; dz < D1D; ++dz)
--               {
--                  for (int dy = 0; dy < D1D; ++dy)
--                  {
--                     QDD[qx][dy][dz] = 0.0;
--                     for (int qy = 0; qy < Q1D; ++qy)
--                     {
--                        const double By = B(qy,dy);
--                        const double Gy = G(qy,dy);
--                        const double L = i==1 ? Gy : By;
--                        const double R = j==1 ? Gy : By;
--                        QDD[qx][dy][dz] += L * QQD[qx][qy][dz] * R;
--                     }
--                  }
--               }
--            }
--            // third tensor contraction, along x direction
--            for (int dz = 0; dz < D1D; ++dz)
--            {
--               for (int dy = 0; dy < D1D; ++dy)
--               {
--                  for (int dx = 0; dx < D1D; ++dx)
--                  {
--                     double temp = 0.0;
--                     for (int qx = 0; qx < Q1D; ++qx)
--                     {
--                        const double Bx = B(qx,dx);
--                        const double Gx = G(qx,dx);
--                        const double L = i==0 ? Gx : Bx;
--                        const double R = j==0 ? Gx : Bx;
--                        temp += L * QDD[qx][dy][dz] * R;
--                     }
--                     Y(dx, dy, dz, 0, e) += temp;
--                     Y(dx, dy, dz, 1, e) += temp;
--                     Y(dx, dy, dz, 2, e) += temp;
--                  }
--               }
--            }
--         }
-+         return PAVectorDiffusionApply3D(ne,B,G,Bt,Gt,D,x,y,D1D,Q1D);
-       }
--   });
--}
--
--static void PAVectorDiffusionAssembleDiagonal(const int dim,
--                                              const int D1D,
--                                              const int Q1D,
--                                              const int NE,
--                                              const Array<double> &B,
--                                              const Array<double> &G,
--                                              const Vector &op,
--                                              Vector &y)
--{
--   if (dim == 2)
--   {
--      return PAVectorDiffusionDiagonal2D(NE, B, G, op, y, D1D, Q1D);
--   }
--   else if (dim == 3)
--   {
--      return PAVectorDiffusionDiagonal3D(NE, B, G, op, y, D1D, Q1D);
--   }
--   MFEM_ABORT("Dimension not implemented.");
--}
--
--void VectorDiffusionIntegrator::AssembleDiagonalPA(Vector &diag)
--{
--   if (DeviceCanUseCeed())
--   {
--      ceedOp->GetDiagonal(diag);
--   }
--   else
--   {
--      PAVectorDiffusionAssembleDiagonal(dim,
--                                        dofs1D,
--                                        quad1D,
--                                        ne,
--                                        maps->B,
--                                        maps->G,
--                                        pa_data,
--                                        diag);
-+      MFEM_ABORT("Unknown kernel.");
-    }
- }
- 
-diff --git a/fem/bilininteg_divergence.cpp b/fem/integ/bilininteg_vecdiv_pa.cpp
-similarity index 94%
-rename from fem/bilininteg_divergence.cpp
-rename to fem/integ/bilininteg_vecdiv_pa.cpp
-index c0102ff5e..63f7a3308 100644
---- a/fem/bilininteg_divergence.cpp
-+++ b/fem/integ/bilininteg_vecdiv_pa.cpp
-@@ -9,17 +9,13 @@
- // terms of the BSD-3 license. We welcome feedback and contributions, see file
- // CONTRIBUTING.md for details.
- 
--#include "../general/forall.hpp"
--#include "bilininteg.hpp"
--#include "gridfunc.hpp"
--
--using namespace std;
-+#include "../../general/forall.hpp"
-+#include "../bilininteg.hpp"
-+#include "../gridfunc.hpp"
- 
- namespace mfem
- {
- 
--// PA Divergence Integrator
--
- // PA Divergence Assemble 2D kernel
- static void PADivergenceSetup2D(const int Q1D,
-                                 const int NE,
-@@ -100,27 +96,6 @@ static void PADivergenceSetup3D(const int Q1D,
-    });
- }
- 
--static void PADivergenceSetup(const int dim,
--                              const int TR_D1D,
--                              const int TE_D1D,
--                              const int Q1D,
--                              const int NE,
--                              const Array<double> &W,
--                              const Vector &J,
--                              const double COEFF,
--                              Vector &op)
--{
--   if (dim == 1) { MFEM_ABORT("dim==1 not supported in PADivergenceSetup"); }
--   if (dim == 2)
--   {
--      PADivergenceSetup2D(Q1D, NE, W, J, COEFF, op);
--   }
--   if (dim == 3)
--   {
--      PADivergenceSetup3D(Q1D, NE, W, J, COEFF, op);
--   }
--}
--
- void VectorDivergenceIntegrator::AssemblePA(const FiniteElementSpace &trial_fes,
-                                             const FiniteElementSpace &test_fes)
- {
-@@ -147,6 +122,7 @@ void VectorDivergenceIntegrator::AssemblePA(const FiniteElementSpace &trial_fes,
-    MFEM_ASSERT(quad1D == test_maps->nqpt,
-                "PA requires test and trial space to have same number of quadrature points!");
-    pa_data.SetSize(nq * dimsToStore * ne, Device::GetMemoryType());
-+
-    double coeff = 1.0;
-    if (Q)
-    {
-@@ -154,8 +130,19 @@ void VectorDivergenceIntegrator::AssemblePA(const FiniteElementSpace &trial_fes,
-       MFEM_VERIFY(cQ != NULL, "only ConstantCoefficient is supported!");
-       coeff = cQ->constant;
-    }
--   PADivergenceSetup(dim, trial_dofs1D, test_dofs1D, quad1D,
--                     ne, ir->GetWeights(), geom->J, coeff, pa_data);
-+
-+   if (dim == 1)
-+   {
-+      MFEM_ABORT("dim==1 not supported in VectorDivergenceIntegrator::AssemblePA");
-+   }
-+   else if (dim == 2)
-+   {
-+      PADivergenceSetup2D(quad1D, ne, ir->GetWeights(), geom->J, coeff, pa_data);
-+   }
-+   else if (dim == 3)
-+   {
-+      PADivergenceSetup3D(quad1D, ne, ir->GetWeights(), geom->J, coeff, pa_data);
-+   }
- }
- 
- // PA Divergence Apply 2D kernel
-@@ -1025,45 +1012,37 @@ static void SmemPADivergenceApply3D(const int NE,
-    });
- }
- 
--static void PADivergenceApply(const int dim,
--                              const int TR_D1D,
--                              const int TE_D1D,
--                              const int Q1D,
--                              const int NE,
--                              const Array<double> &B,
--                              const Array<double> &G,
--                              const Array<double> &Bt,
--                              const Vector &op,
--                              const Vector &x,
--                              Vector &y,
--                              bool transpose=false)
-+void VectorDivergenceIntegrator::AddMultPA(const Vector &x, Vector &y) const
- {
-    if (dim == 2)
-    {
--      return PADivergenceApply2D(NE,B,G,Bt,op,x,y,TR_D1D,TE_D1D,Q1D);
-+      return PADivergenceApply2D(ne, trial_maps->B, trial_maps->G, test_maps->Bt,
-+                                 pa_data, x, y, trial_dofs1D, test_dofs1D, quad1D);
-    }
-    if (dim == 3)
-    {
--      return PADivergenceApply3D(NE,B,G,Bt,op,x,y,TR_D1D,TE_D1D,Q1D);
-+      return PADivergenceApply3D(ne, trial_maps->B, trial_maps->G, test_maps->Bt,
-+                                 pa_data, x, y, trial_dofs1D, test_dofs1D, quad1D);
-    }
-    MFEM_ABORT("Unknown kernel.");
- }
- 
--// PA Divergence Apply kernel
--void VectorDivergenceIntegrator::AddMultPA(const Vector &x, Vector &y) const
--{
--   PADivergenceApply(dim, trial_dofs1D, test_dofs1D, quad1D, ne,
--                     trial_maps->B, trial_maps->G, test_maps->Bt, pa_data, x, y,
--                     false);
--}
--
--// PA Divergence Apply kernel
- void VectorDivergenceIntegrator::AddMultTransposePA(const Vector &x,
-                                                     Vector &y) const
- {
--   PADivergenceApply(dim, trial_dofs1D, test_dofs1D, quad1D, ne,
--                     trial_maps->Bt, trial_maps->Gt, test_maps->B, pa_data, x, y,
--                     true);
-+   if (dim == 2)
-+   {
-+      return PADivergenceApplyTranspose2D(ne, trial_maps->Bt, trial_maps->Gt,
-+                                          test_maps->B, pa_data, x, y,
-+                                          trial_dofs1D, test_dofs1D, quad1D);
-+   }
-+   if (dim == 3)
-+   {
-+      return PADivergenceApplyTranspose3D(ne, trial_maps->Bt, trial_maps->Gt,
-+                                          test_maps->B, pa_data, x, y,
-+                                          trial_dofs1D, test_dofs1D, quad1D);
-+   }
-+   MFEM_ABORT("Unknown kernel.");
- }
- 
- } // namespace mfem
-diff --git a/fem/bilininteg_vecmass_mf.cpp b/fem/integ/bilininteg_vecmass_mf.cpp
-similarity index 90%
-rename from fem/bilininteg_vecmass_mf.cpp
-rename to fem/integ/bilininteg_vecmass_mf.cpp
-index 2e8d74491..cc2eb0174 100644
---- a/fem/bilininteg_vecmass_mf.cpp
-+++ b/fem/integ/bilininteg_vecmass_mf.cpp
-@@ -9,19 +9,14 @@
- // terms of the BSD-3 license. We welcome feedback and contributions, see file
- // CONTRIBUTING.md for details.
- 
--#include "../general/forall.hpp"
--#include "bilininteg.hpp"
--#include "gridfunc.hpp"
--#include "ceed/integrators/mass/mass.hpp"
--
--using namespace std;
-+#include "../../general/forall.hpp"
-+#include "../bilininteg.hpp"
-+#include "../gridfunc.hpp"
-+#include "../ceed/integrators/mass/mass.hpp"
- 
- namespace mfem
- {
- 
--// MF Mass Integrator
--
--// MF Mass Assemble kernel
- void VectorMassIntegrator::AssembleMF(const FiniteElementSpace &fes)
- {
-    // Assuming the same element type
-diff --git a/fem/bilininteg_vecmass.cpp b/fem/integ/bilininteg_vecmass_pa.cpp
-similarity index 88%
-rename from fem/bilininteg_vecmass.cpp
-rename to fem/integ/bilininteg_vecmass_pa.cpp
-index 512cd42d5..b1c20b4c4 100644
---- a/fem/bilininteg_vecmass.cpp
-+++ b/fem/integ/bilininteg_vecmass_pa.cpp
-@@ -9,19 +9,14 @@
- // terms of the BSD-3 license. We welcome feedback and contributions, see file
- // CONTRIBUTING.md for details.
- 
--#include "../general/forall.hpp"
--#include "bilininteg.hpp"
--#include "gridfunc.hpp"
--#include "ceed/integrators/mass/mass.hpp"
--
--using namespace std;
-+#include "../../general/forall.hpp"
-+#include "../bilininteg.hpp"
-+#include "../gridfunc.hpp"
-+#include "../ceed/integrators/mass/mass.hpp"
- 
- namespace mfem
- {
- 
--// PA Mass Integrator
--
--// PA Mass Assemble kernel
- void VectorMassIntegrator::AssemblePA(const FiniteElementSpace &fes)
- {
-    // Assuming the same element type
-@@ -83,7 +78,7 @@ void VectorMassIntegrator::AssemblePA(const FiniteElementSpace &fes)
-             const double J21 = J(q,0,1,e);
-             const double J22 = J(q,1,1,e);
-             const double detJ = (J11*J22)-(J21*J12);
--            v(q,e) =  w[q] * constant * detJ;
-+            v(q,e) = w[q] * constant * detJ;
-          }
-       });
-    }
-@@ -111,8 +106,159 @@ void VectorMassIntegrator::AssemblePA(const FiniteElementSpace &fes)
-    }
- }
- 
--template<const int T_D1D = 0,
--         const int T_Q1D = 0>
-+template<const int T_D1D = 0, const int T_Q1D = 0>
-+static void PAVectorMassAssembleDiagonal2D(const int NE,
-+                                           const Array<double> &B_,
-+                                           const Array<double> &Bt_,
-+                                           const Vector &op_,
-+                                           Vector &diag_,
-+                                           const int d1d = 0,
-+                                           const int q1d = 0)
-+{
-+   const int D1D = T_D1D ? T_D1D : d1d;
-+   const int Q1D = T_Q1D ? T_Q1D : q1d;
-+   constexpr int VDIM = 2;
-+   MFEM_VERIFY(D1D <= MAX_D1D, "");
-+   MFEM_VERIFY(Q1D <= MAX_Q1D, "");
-+   auto B = Reshape(B_.Read(), Q1D, D1D);
-+   auto op = Reshape(op_.Read(), Q1D, Q1D, NE);
-+   auto y = Reshape(diag_.ReadWrite(), D1D, D1D, VDIM, NE);
-+   mfem::forall(NE, [=] MFEM_HOST_DEVICE (int e)
-+   {
-+      const int D1D = T_D1D ? T_D1D : d1d;
-+      const int Q1D = T_Q1D ? T_Q1D : q1d;
-+      constexpr int max_D1D = T_D1D ? T_D1D : MAX_D1D;
-+      constexpr int max_Q1D = T_Q1D ? T_Q1D : MAX_Q1D;
-+
-+      double temp[max_Q1D][max_D1D];
-+      for (int qx = 0; qx < Q1D; ++qx)
-+      {
-+         for (int dy = 0; dy < D1D; ++dy)
-+         {
-+            temp[qx][dy] = 0.0;
-+            for (int qy = 0; qy < Q1D; ++qy)
-+            {
-+               temp[qx][dy] += B(qy, dy) * B(qy, dy) * op(qx, qy, e);
-+            }
-+         }
-+      }
-+      for (int dy = 0; dy < D1D; ++dy)
-+      {
-+         for (int dx = 0; dx < D1D; ++dx)
-+         {
-+            double temp1 = 0.0;
-+            for (int qx = 0; qx < Q1D; ++qx)
-+            {
-+               temp1 += B(qx, dx) * B(qx, dx) * temp[qx][dy];
-+            }
-+            y(dx, dy, 0, e) = temp1;
-+            y(dx, dy, 1, e) = temp1;
-+         }
-+      }
-+   });
-+}
-+
-+template<const int T_D1D = 0, const int T_Q1D = 0>
-+static void PAVectorMassAssembleDiagonal3D(const int NE,
-+                                           const Array<double> &B_,
-+                                           const Array<double> &Bt_,
-+                                           const Vector &op_,
-+                                           Vector &diag_,
-+                                           const int d1d = 0,
-+                                           const int q1d = 0)
-+{
-+   const int D1D = T_D1D ? T_D1D : d1d;
-+   const int Q1D = T_Q1D ? T_Q1D : q1d;
-+   constexpr int VDIM = 3;
-+   MFEM_VERIFY(D1D <= MAX_D1D, "");
-+   MFEM_VERIFY(Q1D <= MAX_Q1D, "");
-+   auto B = Reshape(B_.Read(), Q1D, D1D);
-+   auto op = Reshape(op_.Read(), Q1D, Q1D, Q1D, NE);
-+   auto y = Reshape(diag_.ReadWrite(), D1D, D1D, D1D, VDIM, NE);
-+   mfem::forall(NE, [=] MFEM_HOST_DEVICE (int e)
-+   {
-+      const int D1D = T_D1D ? T_D1D : d1d; // nvcc workaround
-+      const int Q1D = T_Q1D ? T_Q1D : q1d;
-+      // the following variables are evaluated at compile time
-+      constexpr int max_D1D = T_D1D ? T_D1D : MAX_D1D;
-+      constexpr int max_Q1D = T_Q1D ? T_Q1D : MAX_Q1D;
-+
-+      double temp[max_Q1D][max_Q1D][max_D1D];
-+      for (int qx = 0; qx < Q1D; ++qx)
-+      {
-+         for (int qy = 0; qy < Q1D; ++qy)
-+         {
-+            for (int dz = 0; dz < D1D; ++dz)
-+            {
-+               temp[qx][qy][dz] = 0.0;
-+               for (int qz = 0; qz < Q1D; ++qz)
-+               {
-+                  temp[qx][qy][dz] += B(qz, dz) * B(qz, dz) * op(qx, qy, qz, e);
-+               }
-+            }
-+         }
-+      }
-+      double temp2[max_Q1D][max_D1D][max_D1D];
-+      for (int qx = 0; qx < Q1D; ++qx)
-+      {
-+         for (int dz = 0; dz < D1D; ++dz)
-+         {
-+            for (int dy = 0; dy < D1D; ++dy)
-+            {
-+               temp2[qx][dy][dz] = 0.0;
-+               for (int qy = 0; qy < Q1D; ++qy)
-+               {
-+                  temp2[qx][dy][dz] += B(qy, dy) * B(qy, dy) * temp[qx][qy][dz];
-+               }
-+            }
-+         }
-+      }
-+      for (int dz = 0; dz < D1D; ++dz)
-+      {
-+         for (int dy = 0; dy < D1D; ++dy)
-+         {
-+            for (int dx = 0; dx < D1D; ++dx)
-+            {
-+               double temp3 = 0.0;
-+               for (int qx = 0; qx < Q1D; ++qx)
-+               {
-+                  temp3 += B(qx, dx) * B(qx, dx)
-+                           * temp2[qx][dy][dz];
-+               }
-+               y(dx, dy, dz, 0, e) = temp3;
-+               y(dx, dy, dz, 1, e) = temp3;
-+               y(dx, dy, dz, 2, e) = temp3;
-+            }
-+         }
-+      }
-+   });
-+}
-+
-+void VectorMassIntegrator::AssembleDiagonalPA(Vector &diag)
-+{
-+   if (DeviceCanUseCeed())
-+   {
-+      ceedOp->GetDiagonal(diag);
-+   }
-+   else
-+   {
-+      if (dim == 2)
-+      {
-+         return PAVectorMassAssembleDiagonal2D(ne, maps->B, maps->Bt,
-+                                               pa_data, diag,
-+                                               dofs1D, quad1D);
-+      }
-+      else if (dim == 3)
-+      {
-+         return PAVectorMassAssembleDiagonal3D(ne, maps->B, maps->Bt,
-+                                               pa_data, diag,
-+                                               dofs1D, quad1D);
-+      }
-+      MFEM_ABORT("Dimension not implemented.");
-+   }
-+}
-+
-+template<const int T_D1D = 0, const int T_Q1D = 0>
- static void PAVectorMassApply2D(const int NE,
-                                 const Array<double> &B_,
-                                 const Array<double> &Bt_,
-@@ -208,8 +354,7 @@ static void PAVectorMassApply2D(const int NE,
-    });
- }
- 
--template<const int T_D1D = 0,
--         const int T_Q1D = 0>
-+template<const int T_D1D = 0, const int T_Q1D = 0>
- static void PAVectorMassApply3D(const int NE,
-                                 const Array<double> &B_,
-                                 const Array<double> &Bt_,
-@@ -354,27 +499,6 @@ static void PAVectorMassApply3D(const int NE,
-    });
- }
- 
--static void PAVectorMassApply(const int dim,
--                              const int D1D,
--                              const int Q1D,
--                              const int NE,
--                              const Array<double> &B,
--                              const Array<double> &Bt,
--                              const Vector &op,
--                              const Vector &x,
--                              Vector &y)
--{
--   if (dim == 2)
--   {
--      return PAVectorMassApply2D(NE, B, Bt, op, x, y, D1D, Q1D);
--   }
--   if (dim == 3)
--   {
--      return PAVectorMassApply3D(NE, B, Bt, op, x, y, D1D, Q1D);
--   }
--   MFEM_ABORT("Unknown kernel.");
--}
--
- void VectorMassIntegrator::AddMultPA(const Vector &x, Vector &y) const
- {
-    if (DeviceCanUseCeed())
-@@ -383,174 +507,17 @@ void VectorMassIntegrator::AddMultPA(const Vector &x, Vector &y) const
-    }
-    else
-    {
--      PAVectorMassApply(dim, dofs1D, quad1D, ne, maps->B, maps->Bt, pa_data, x, y);
--   }
--}
--
--template<const int T_D1D = 0, const int T_Q1D = 0>
--static void PAVectorMassAssembleDiagonal2D(const int NE,
--                                           const Array<double> &B_,
--                                           const Array<double> &Bt_,
--                                           const Vector &op_,
--                                           Vector &diag_,
--                                           const int d1d = 0,
--                                           const int q1d = 0)
--{
--   const int D1D = T_D1D ? T_D1D : d1d;
--   const int Q1D = T_Q1D ? T_Q1D : q1d;
--   constexpr int VDIM = 2;
--   MFEM_VERIFY(D1D <= MAX_D1D, "");
--   MFEM_VERIFY(Q1D <= MAX_Q1D, "");
--   auto B = Reshape(B_.Read(), Q1D, D1D);
--   auto op = Reshape(op_.Read(), Q1D, Q1D, NE);
--   auto y = Reshape(diag_.ReadWrite(), D1D, D1D, VDIM, NE);
--   mfem::forall(NE, [=] MFEM_HOST_DEVICE (int e)
--   {
--      const int D1D = T_D1D ? T_D1D : d1d;
--      const int Q1D = T_Q1D ? T_Q1D : q1d;
--      constexpr int max_D1D = T_D1D ? T_D1D : MAX_D1D;
--      constexpr int max_Q1D = T_Q1D ? T_Q1D : MAX_Q1D;
--
--      double temp[max_Q1D][max_D1D];
--      for (int qx = 0; qx < Q1D; ++qx)
-+      if (dim == 2)
-       {
--         for (int dy = 0; dy < D1D; ++dy)
--         {
--            temp[qx][dy] = 0.0;
--            for (int qy = 0; qy < Q1D; ++qy)
--            {
--               temp[qx][dy] += B(qy, dy) * B(qy, dy) * op(qx, qy, e);
--            }
--         }
-+         return PAVectorMassApply2D(ne, maps->B, maps->Bt, pa_data, x, y,
-+                                    dofs1D, quad1D);
-       }
--      for (int dy = 0; dy < D1D; ++dy)
-+      if (dim == 3)
-       {
--         for (int dx = 0; dx < D1D; ++dx)
--         {
--            double temp1 = 0.0;
--            for (int qx = 0; qx < Q1D; ++qx)
--            {
--               temp1 += B(qx, dx) * B(qx, dx) * temp[qx][dy];
--            }
--            y(dx, dy, 0, e) = temp1;
--            y(dx, dy, 1, e) = temp1;
--         }
-+         return PAVectorMassApply3D(ne, maps->B, maps->Bt, pa_data, x, y,
-+                                    dofs1D, quad1D);
-       }
--   });
--}
--
--template<const int T_D1D = 0, const int T_Q1D = 0>
--static void PAVectorMassAssembleDiagonal3D(const int NE,
--                                           const Array<double> &B_,
--                                           const Array<double> &Bt_,
--                                           const Vector &op_,
--                                           Vector &diag_,
--                                           const int d1d = 0,
--                                           const int q1d = 0)
--{
--   const int D1D = T_D1D ? T_D1D : d1d;
--   const int Q1D = T_Q1D ? T_Q1D : q1d;
--   constexpr int VDIM = 3;
--   MFEM_VERIFY(D1D <= MAX_D1D, "");
--   MFEM_VERIFY(Q1D <= MAX_Q1D, "");
--   auto B = Reshape(B_.Read(), Q1D, D1D);
--   auto op = Reshape(op_.Read(), Q1D, Q1D, Q1D, NE);
--   auto y = Reshape(diag_.ReadWrite(), D1D, D1D, D1D, VDIM, NE);
--   mfem::forall(NE, [=] MFEM_HOST_DEVICE (int e)
--   {
--      const int D1D = T_D1D ? T_D1D : d1d; // nvcc workaround
--      const int Q1D = T_Q1D ? T_Q1D : q1d;
--      // the following variables are evaluated at compile time
--      constexpr int max_D1D = T_D1D ? T_D1D : MAX_D1D;
--      constexpr int max_Q1D = T_Q1D ? T_Q1D : MAX_Q1D;
--
--      double temp[max_Q1D][max_Q1D][max_D1D];
--      for (int qx = 0; qx < Q1D; ++qx)
--      {
--         for (int qy = 0; qy < Q1D; ++qy)
--         {
--            for (int dz = 0; dz < D1D; ++dz)
--            {
--               temp[qx][qy][dz] = 0.0;
--               for (int qz = 0; qz < Q1D; ++qz)
--               {
--                  temp[qx][qy][dz] += B(qz, dz) * B(qz, dz) * op(qx, qy, qz, e);
--               }
--            }
--         }
--      }
--      double temp2[max_Q1D][max_D1D][max_D1D];
--      for (int qx = 0; qx < Q1D; ++qx)
--      {
--         for (int dz = 0; dz < D1D; ++dz)
--         {
--            for (int dy = 0; dy < D1D; ++dy)
--            {
--               temp2[qx][dy][dz] = 0.0;
--               for (int qy = 0; qy < Q1D; ++qy)
--               {
--                  temp2[qx][dy][dz] += B(qy, dy) * B(qy, dy) * temp[qx][qy][dz];
--               }
--            }
--         }
--      }
--      for (int dz = 0; dz < D1D; ++dz)
--      {
--         for (int dy = 0; dy < D1D; ++dy)
--         {
--            for (int dx = 0; dx < D1D; ++dx)
--            {
--               double temp3 = 0.0;
--               for (int qx = 0; qx < Q1D; ++qx)
--               {
--                  temp3 += B(qx, dx) * B(qx, dx)
--                           * temp2[qx][dy][dz];
--               }
--               y(dx, dy, dz, 0, e) = temp3;
--               y(dx, dy, dz, 1, e) = temp3;
--               y(dx, dy, dz, 2, e) = temp3;
--            }
--         }
--      }
--   });
--}
--
--static void PAVectorMassAssembleDiagonal(const int dim,
--                                         const int D1D,
--                                         const int Q1D,
--                                         const int NE,
--                                         const Array<double> &B,
--                                         const Array<double> &Bt,
--                                         const Vector &op,
--                                         Vector &y)
--{
--   if (dim == 2)
--   {
--      return PAVectorMassAssembleDiagonal2D(NE, B, Bt, op, y, D1D, Q1D);
--   }
--   else if (dim == 3)
--   {
--      return PAVectorMassAssembleDiagonal3D(NE, B, Bt, op, y, D1D, Q1D);
--   }
--   MFEM_ABORT("Dimension not implemented.");
--}
--
--void VectorMassIntegrator::AssembleDiagonalPA(Vector &diag)
--{
--   if (DeviceCanUseCeed())
--   {
--      ceedOp->GetDiagonal(diag);
--   }
--   else
--   {
--      PAVectorMassAssembleDiagonal(dim,
--                                   dofs1D,
--                                   quad1D,
--                                   ne,
--                                   maps->B,
--                                   maps->Bt,
--                                   pa_data,
--                                   diag);
-+      MFEM_ABORT("Unknown kernel.");
-    }
- }
- 
-diff --git a/fem/integ/bilininteg_vectorfediv_pa.cpp b/fem/integ/bilininteg_vectorfediv_pa.cpp
-new file mode 100644
-index 000000000..2915a253b
---- /dev/null
-+++ b/fem/integ/bilininteg_vectorfediv_pa.cpp
-@@ -0,0 +1,157 @@
-+// Copyright (c) 2010-2023, Lawrence Livermore National Security, LLC. Produced
-+// at the Lawrence Livermore National Laboratory. All Rights reserved. See files
-+// LICENSE and NOTICE for details. LLNL-CODE-806117.
-+//
-+// This file is part of the MFEM library. For more information and source code
-+// availability visit https://mfem.org.
-+//
-+// MFEM is free software; you can redistribute it and/or modify it under the
-+// terms of the BSD-3 license.  We welcome feedback and contributions, see file
-+// CONTRIBUTING.md for details.
-+
-+#include "../../general/forall.hpp"
-+#include "../bilininteg.hpp"
-+#include "../gridfunc.hpp"
-+#include "../qfunction.hpp"
-+#include "bilininteg_hdiv_kernels.hpp"
-+
-+using namespace std;
-+
-+namespace mfem
-+{
-+
-+void
-+VectorFEDivergenceIntegrator::AssemblePA(const FiniteElementSpace &trial_fes,
-+                                         const FiniteElementSpace &test_fes)
-+{
-+   // Assumes tensor-product elements, with a vector test space and
-+   // scalar trial space.
-+   Mesh *mesh = trial_fes.GetMesh();
-+   const FiniteElement *trial_fel = trial_fes.GetFE(0);
-+   const FiniteElement *test_fel = test_fes.GetFE(0);
-+
-+   const VectorTensorFiniteElement *trial_el =
-+      dynamic_cast<const VectorTensorFiniteElement*>(trial_fel);
-+   MFEM_VERIFY(trial_el != NULL, "Only VectorTensorFiniteElement is supported!");
-+
-+   const NodalTensorFiniteElement *test_el =
-+      dynamic_cast<const NodalTensorFiniteElement*>(test_fel);
-+   MFEM_VERIFY(test_el != NULL, "Only NodalTensorFiniteElement is supported!");
-+
-+   const IntegrationRule *ir = IntRule ? IntRule : &MassIntegrator::GetRule(
-+                                  *trial_el, *trial_el,
-+                                  *mesh->GetElementTransformation(0));
-+
-+   const int dims = trial_el->GetDim();
-+   MFEM_VERIFY(dims == 2 || dims == 3, "");
-+
-+   const int nq = ir->GetNPoints();
-+   dim = mesh->Dimension();
-+   MFEM_VERIFY(dim == 2 || dim == 3, "");
-+
-+   MFEM_VERIFY(trial_el->GetOrder() == test_el->GetOrder() + 1, "");
-+
-+   ne = trial_fes.GetNE();
-+   mapsC = &trial_el->GetDofToQuad(*ir, DofToQuad::TENSOR);
-+   mapsO = &trial_el->GetDofToQuadOpen(*ir, DofToQuad::TENSOR);
-+   dofs1D = mapsC->ndof;
-+   quad1D = mapsC->nqpt;
-+
-+   L2mapsO = &test_el->GetDofToQuad(*ir, DofToQuad::TENSOR);
-+   L2dofs1D = L2mapsO->ndof;
-+
-+   MFEM_VERIFY(dofs1D == mapsO->ndof + 1 && quad1D == mapsO->nqpt, "");
-+   if (dim == 2)
-+   {
-+      MFEM_VERIFY(nq == quad1D * quad1D, "");
-+   }
-+   else
-+   {
-+      MFEM_VERIFY(nq == quad1D * quad1D * quad1D, "");
-+   }
-+
-+   pa_data.SetSize(nq * ne, Device::GetMemoryType());
-+
-+   QuadratureSpace qs(*mesh, *ir);
-+   CoefficientVector coeff(Q, qs, CoefficientStorage::FULL);
-+
-+   if (test_el->GetMapType() == FiniteElement::INTEGRAL)
-+   {
-+      const GeometricFactors *geom =
-+         mesh->GetGeometricFactors(*ir, GeometricFactors::DETERMINANTS);
-+      coeff /= geom->detJ;
-+   }
-+
-+   if (trial_el->GetDerivType() == mfem::FiniteElement::DIV && dim == 3)
-+   {
-+      internal::PAHdivL2Setup3D(quad1D, ne, ir->GetWeights(), coeff, pa_data);
-+   }
-+   else if (trial_el->GetDerivType() == mfem::FiniteElement::DIV && dim == 2)
-+   {
-+      internal::PAHdivL2Setup2D(quad1D, ne, ir->GetWeights(), coeff, pa_data);
-+   }
-+   else
-+   {
-+      MFEM_ABORT("Unknown kernel.");
-+   }
-+}
-+
-+void VectorFEDivergenceIntegrator::AssembleDiagonalPA_ADAt(const Vector &D,
-+                                                           Vector &diag)
-+{
-+   if (dim == 3)
-+   {
-+      internal::PAHdivL2AssembleDiagonal_ADAt_3D(dofs1D, quad1D, L2dofs1D, ne,
-+                                                 L2mapsO->B,
-+                                                 mapsC->Gt, mapsO->Bt, pa_data, D, diag);
-+   }
-+   else if (dim == 2)
-+   {
-+      internal::PAHdivL2AssembleDiagonal_ADAt_2D(dofs1D, quad1D, L2dofs1D, ne,
-+                                                 L2mapsO->B,
-+                                                 mapsC->Gt, mapsO->Bt, pa_data, D, diag);
-+   }
-+   else
-+   {
-+      MFEM_ABORT("Unsupported dimension!");
-+   }
-+}
-+
-+void VectorFEDivergenceIntegrator::AddMultPA(const Vector &x, Vector &y) const
-+{
-+   if (dim == 3)
-+   {
-+      internal::PAHdivL2Apply3D(dofs1D, quad1D, L2dofs1D, ne, mapsO->B, mapsC->G,
-+                                L2mapsO->Bt, pa_data, x, y);
-+   }
-+   else if (dim == 2)
-+   {
-+      internal::PAHdivL2Apply2D(dofs1D, quad1D, L2dofs1D, ne, mapsO->B, mapsC->G,
-+                                L2mapsO->Bt, pa_data, x, y);
-+   }
-+   else
-+   {
-+      MFEM_ABORT("Unsupported dimension!");
-+   }
-+}
-+
-+void VectorFEDivergenceIntegrator::AddMultTransposePA(const Vector &x,
-+                                                      Vector &y) const
-+{
-+   if (dim == 3)
-+   {
-+      internal::PAHdivL2ApplyTranspose3D(dofs1D, quad1D, L2dofs1D, ne, L2mapsO->B,
-+                                         mapsC->Gt, mapsO->Bt, pa_data, x, y);
-+   }
-+   else if (dim == 2)
-+   {
-+      internal::PAHdivL2ApplyTranspose2D(dofs1D, quad1D, L2dofs1D, ne, L2mapsO->B,
-+                                         mapsC->Gt, mapsO->Bt, pa_data, x, y);
-+   }
-+   else
-+   {
-+      MFEM_ABORT("Unsupported dimension!");
-+   }
-+}
-+
-+} // namespace mfem
-diff --git a/fem/integ/bilininteg_vectorfemass_pa.cpp b/fem/integ/bilininteg_vectorfemass_pa.cpp
-new file mode 100644
-index 000000000..c07e9f816
---- /dev/null
-+++ b/fem/integ/bilininteg_vectorfemass_pa.cpp
-@@ -0,0 +1,346 @@
-+// Copyright (c) 2010-2023, Lawrence Livermore National Security, LLC. Produced
-+// at the Lawrence Livermore National Laboratory. All Rights reserved. See files
-+// LICENSE and NOTICE for details. LLNL-CODE-806117.
-+//
-+// This file is part of the MFEM library. For more information and source code
-+// availability visit https://mfem.org.
-+//
-+// MFEM is free software; you can redistribute it and/or modify it under the
-+// terms of the BSD-3 license. We welcome feedback and contributions, see file
-+// CONTRIBUTING.md for details.
-+
-+#include "../bilininteg.hpp"
-+#include "../gridfunc.hpp"
-+#include "../qfunction.hpp"
-+#include "bilininteg_diffusion_kernels.hpp"
-+#include "bilininteg_hcurl_kernels.hpp"
-+#include "bilininteg_hdiv_kernels.hpp"
-+#include "bilininteg_hcurlhdiv_kernels.hpp"
-+
-+namespace mfem
-+{
-+
-+void VectorFEMassIntegrator::AssemblePA(const FiniteElementSpace &trial_fes,
-+                                        const FiniteElementSpace &test_fes)
-+{
-+   // Assumes tensor-product elements
-+   Mesh *mesh = trial_fes.GetMesh();
-+
-+   const FiniteElement *trial_fel = trial_fes.GetFE(0);
-+   const VectorTensorFiniteElement *trial_el =
-+      dynamic_cast<const VectorTensorFiniteElement*>(trial_fel);
-+   MFEM_VERIFY(trial_el != NULL, "Only VectorTensorFiniteElement is supported!");
-+
-+   const FiniteElement *test_fel = test_fes.GetFE(0);
-+   const VectorTensorFiniteElement *test_el =
-+      dynamic_cast<const VectorTensorFiniteElement*>(test_fel);
-+   MFEM_VERIFY(test_el != NULL, "Only VectorTensorFiniteElement is supported!");
-+
-+   const IntegrationRule *ir
-+      = IntRule ? IntRule : &MassIntegrator::GetRule(*trial_el, *trial_el,
-+                                                     *mesh->GetElementTransformation(0));
-+   const int dims = trial_el->GetDim();
-+   MFEM_VERIFY(dims == 2 || dims == 3, "");
-+
-+   const int symmDims = (dims * (dims + 1)) / 2; // 1x1: 1, 2x2: 3, 3x3: 6
-+   nq = ir->GetNPoints();
-+   dim = mesh->Dimension();
-+   MFEM_VERIFY(dim == 2 || dim == 3, "");
-+
-+   ne = trial_fes.GetNE();
-+   MFEM_VERIFY(ne == test_fes.GetNE(),
-+               "Different meshes for test and trial spaces");
-+   geom = mesh->GetGeometricFactors(*ir, GeometricFactors::JACOBIANS);
-+   mapsC = &trial_el->GetDofToQuad(*ir, DofToQuad::TENSOR);
-+   mapsO = &trial_el->GetDofToQuadOpen(*ir, DofToQuad::TENSOR);
-+   dofs1D = mapsC->ndof;
-+   quad1D = mapsC->nqpt;
-+
-+   mapsCtest = &test_el->GetDofToQuad(*ir, DofToQuad::TENSOR);
-+   mapsOtest = &test_el->GetDofToQuadOpen(*ir, DofToQuad::TENSOR);
-+   dofs1Dtest = mapsCtest->ndof;
-+
-+   MFEM_VERIFY(dofs1D == mapsO->ndof + 1 && quad1D == mapsO->nqpt, "");
-+
-+   trial_fetype = trial_el->GetDerivType();
-+   test_fetype = test_el->GetDerivType();
-+
-+   const bool trial_curl = (trial_fetype == mfem::FiniteElement::CURL);
-+   const bool trial_div = (trial_fetype == mfem::FiniteElement::DIV);
-+   const bool test_curl = (test_fetype == mfem::FiniteElement::CURL);
-+   const bool test_div = (test_fetype == mfem::FiniteElement::DIV);
-+
-+   QuadratureSpace qs(*mesh, *ir);
-+   CoefficientVector coeff(qs, CoefficientStorage::SYMMETRIC);
-+   if (Q) { coeff.Project(*Q); }
-+   else if (MQ) { coeff.ProjectTranspose(*MQ); }
-+   else if (DQ) { coeff.Project(*DQ); }
-+   else { coeff.SetConstant(1.0); }
-+
-+   const int coeff_dim = coeff.GetVDim();
-+   symmetric = (coeff_dim != dim*dim);
-+
-+   if ((trial_curl && test_div) || (trial_div && test_curl))
-+   {
-+      pa_data.SetSize((coeff_dim == 1 ? 1 : dim*dim) * nq * ne,
-+                      Device::GetMemoryType());
-+   }
-+   else
-+   {
-+      pa_data.SetSize((symmetric ? symmDims : dims*dims) * nq * ne,
-+                      Device::GetMemoryType());
-+   }
-+   if (trial_curl && test_curl && dim == 3)
-+   {
-+      internal::PADiffusionSetup3D(quad1D, coeff_dim, ne, ir->GetWeights(), geom->J,
-+                                   coeff, pa_data);
-+   }
-+   else if (trial_curl && test_curl && dim == 2)
-+   {
-+      internal::PADiffusionSetup2D<2>(quad1D, coeff_dim, ne, ir->GetWeights(),
-+                                      geom->J, coeff, pa_data);
-+   }
-+   else if (trial_div && test_div && dim == 3)
-+   {
-+      internal::PAHdivMassSetup3D(quad1D, coeff_dim, ne, ir->GetWeights(), geom->J,
-+                                  coeff, pa_data);
-+   }
-+   else if (trial_div && test_div && dim == 2)
-+   {
-+      internal::PAHdivMassSetup2D(quad1D, coeff_dim, ne, ir->GetWeights(), geom->J,
-+                                  coeff, pa_data);
-+   }
-+   else if (((trial_curl && test_div) || (trial_div && test_curl)) &&
-+            test_fel->GetOrder() == trial_fel->GetOrder())
-+   {
-+      if (coeff_dim == 1)
-+      {
-+         internal::PAHcurlL2Setup3D(nq, coeff_dim, ne, ir->GetWeights(), coeff, pa_data);
-+      }
-+      else
-+      {
-+         const bool tr = (trial_div && test_curl);
-+         if (dim == 3)
-+         {
-+            internal::PAHcurlHdivMassSetup3D(quad1D, coeff_dim, ne, tr, ir->GetWeights(),
-+                                             geom->J, coeff, pa_data);
-+         }
-+         else
-+         {
-+            internal::PAHcurlHdivMassSetup2D(quad1D, coeff_dim, ne, tr, ir->GetWeights(),
-+                                             geom->J, coeff, pa_data);
-+         }
-+      }
-+   }
-+   else
-+   {
-+      MFEM_ABORT("Unknown kernel.");
-+   }
-+}
-+
-+void VectorFEMassIntegrator::AssembleDiagonalPA(Vector& diag)
-+{
-+   if (dim == 3)
-+   {
-+      if (trial_fetype == mfem::FiniteElement::CURL && test_fetype == trial_fetype)
-+      {
-+         if (Device::Allows(Backend::DEVICE_MASK))
-+         {
-+            const int ID = (dofs1D << 4) | quad1D;
-+            switch (ID)
-+            {
-+               case 0x23:
-+                  return internal::SmemPAHcurlMassAssembleDiagonal3D<2,3>(
-+                            dofs1D, quad1D, ne, symmetric,
-+                            mapsO->B, mapsC->B, pa_data, diag);
-+               case 0x34:
-+                  return internal::SmemPAHcurlMassAssembleDiagonal3D<3,4>(
-+                            dofs1D, quad1D, ne, symmetric,
-+                            mapsO->B, mapsC->B, pa_data, diag);
-+               case 0x45:
-+                  return internal::SmemPAHcurlMassAssembleDiagonal3D<4,5>(
-+                            dofs1D, quad1D, ne, symmetric,
-+                            mapsO->B, mapsC->B, pa_data, diag);
-+               case 0x56:
-+                  return internal::SmemPAHcurlMassAssembleDiagonal3D<5,6>(
-+                            dofs1D, quad1D, ne, symmetric,
-+                            mapsO->B, mapsC->B, pa_data, diag);
-+               default:
-+                  return internal::SmemPAHcurlMassAssembleDiagonal3D(
-+                            dofs1D, quad1D, ne, symmetric,
-+                            mapsO->B, mapsC->B, pa_data, diag);
-+            }
-+         }
-+         else
-+         {
-+            internal::PAHcurlMassAssembleDiagonal3D(dofs1D, quad1D, ne, symmetric,
-+                                                    mapsO->B, mapsC->B, pa_data, diag);
-+         }
-+      }
-+      else if (trial_fetype == mfem::FiniteElement::DIV &&
-+               test_fetype == trial_fetype)
-+      {
-+         internal::PAHdivMassAssembleDiagonal3D(dofs1D, quad1D, ne, symmetric,
-+                                                mapsO->B, mapsC->B, pa_data, diag);
-+      }
-+      else
-+      {
-+         MFEM_ABORT("Unknown kernel.");
-+      }
-+   }
-+   else // 2D
-+   {
-+      if (trial_fetype == mfem::FiniteElement::CURL && test_fetype == trial_fetype)
-+      {
-+         internal::PAHcurlMassAssembleDiagonal2D(dofs1D, quad1D, ne, symmetric,
-+                                                 mapsO->B, mapsC->B, pa_data, diag);
-+      }
-+      else if (trial_fetype == mfem::FiniteElement::DIV &&
-+               test_fetype == trial_fetype)
-+      {
-+         internal::PAHdivMassAssembleDiagonal2D(dofs1D, quad1D, ne, symmetric,
-+                                                mapsO->B, mapsC->B, pa_data, diag);
-+      }
-+      else
-+      {
-+         MFEM_ABORT("Unknown kernel.");
-+      }
-+   }
-+}
-+
-+void VectorFEMassIntegrator::AddMultPA(const Vector &x, Vector &y) const
-+{
-+   const bool trial_curl = (trial_fetype == mfem::FiniteElement::CURL);
-+   const bool trial_div = (trial_fetype == mfem::FiniteElement::DIV);
-+   const bool test_curl = (test_fetype == mfem::FiniteElement::CURL);
-+   const bool test_div = (test_fetype == mfem::FiniteElement::DIV);
-+
-+   if (dim == 3)
-+   {
-+      if (trial_curl && test_curl)
-+      {
-+         if (Device::Allows(Backend::DEVICE_MASK))
-+         {
-+            const int ID = (dofs1D << 4) | quad1D;
-+            switch (ID)
-+            {
-+               case 0x23:
-+                  return internal::SmemPAHcurlMassApply3D<2,3>(
-+                            dofs1D, quad1D, ne, symmetric,
-+                            mapsO->B, mapsC->B, mapsO->Bt,
-+                            mapsC->Bt, pa_data, x, y);
-+               case 0x34:
-+                  return internal::SmemPAHcurlMassApply3D<3,4>(
-+                            dofs1D, quad1D, ne, symmetric,
-+                            mapsO->B, mapsC->B, mapsO->Bt,
-+                            mapsC->Bt, pa_data, x, y);
-+               case 0x45:
-+                  return internal::SmemPAHcurlMassApply3D<4,5>(
-+                            dofs1D, quad1D, ne, symmetric,
-+                            mapsO->B, mapsC->B, mapsO->Bt,
-+                            mapsC->Bt, pa_data, x, y);
-+               case 0x56:
-+                  return internal::SmemPAHcurlMassApply3D<5,6>(
-+                            dofs1D, quad1D, ne, symmetric,
-+                            mapsO->B, mapsC->B, mapsO->Bt,
-+                            mapsC->Bt, pa_data, x, y);
-+               default:
-+                  return internal::SmemPAHcurlMassApply3D(
-+                            dofs1D, quad1D, ne, symmetric,
-+                            mapsO->B, mapsC->B, mapsO->Bt,
-+                            mapsC->Bt, pa_data, x, y);
-+            }
-+         }
-+         else
-+         {
-+            internal::PAHcurlMassApply3D(dofs1D, quad1D, ne, symmetric, mapsO->B, mapsC->B,
-+                                         mapsO->Bt, mapsC->Bt, pa_data, x, y);
-+         }
-+      }
-+      else if (trial_div && test_div)
-+      {
-+         internal::PAHdivMassApply(3, dofs1D, quad1D, ne, symmetric, mapsO->B, mapsC->B,
-+                                   mapsO->Bt, mapsC->Bt, pa_data, x, y);
-+      }
-+      else if (trial_curl && test_div)
-+      {
-+         const bool scalarCoeff = !(DQ || MQ);
-+         internal::PAHcurlHdivMassApply3D(dofs1D, dofs1Dtest, quad1D, ne, scalarCoeff,
-+                                          true, false, mapsO->B, mapsC->B, mapsOtest->Bt,
-+                                          mapsCtest->Bt, pa_data, x, y);
-+      }
-+      else if (trial_div && test_curl)
-+      {
-+         const bool scalarCoeff = !(DQ || MQ);
-+         internal::PAHcurlHdivMassApply3D(dofs1D, dofs1Dtest, quad1D, ne, scalarCoeff,
-+                                          false, false, mapsO->B, mapsC->B, mapsOtest->Bt,
-+                                          mapsCtest->Bt, pa_data, x, y);
-+      }
-+      else
-+      {
-+         MFEM_ABORT("Unknown kernel.");
-+      }
-+   }
-+   else // 2D
-+   {
-+      if (trial_curl && test_curl)
-+      {
-+         internal::PAHcurlMassApply2D(dofs1D, quad1D, ne, symmetric, mapsO->B, mapsC->B,
-+                                      mapsO->Bt, mapsC->Bt, pa_data, x, y);
-+      }
-+      else if (trial_div && test_div)
-+      {
-+         internal::PAHdivMassApply(2, dofs1D, quad1D, ne, symmetric, mapsO->B, mapsC->B,
-+                                   mapsO->Bt,
-+                                   mapsC->Bt, pa_data, x, y);
-+      }
-+      else if ((trial_curl && test_div) || (trial_div && test_curl))
-+      {
-+         const bool scalarCoeff = !(DQ || MQ);
-+         internal::PAHcurlHdivMassApply2D(dofs1D, dofs1Dtest, quad1D, ne, scalarCoeff,
-+                                          trial_curl, false, mapsO->B, mapsC->B,
-+                                          mapsOtest->Bt, mapsCtest->Bt, pa_data, x, y);
-+      }
-+      else
-+      {
-+         MFEM_ABORT("Unknown kernel.");
-+      }
-+   }
-+}
-+
-+void VectorFEMassIntegrator::AddMultTransposePA(const Vector &x,
-+                                                Vector &y) const
-+{
-+   const bool trial_curl = (trial_fetype == mfem::FiniteElement::CURL);
-+   const bool trial_div = (trial_fetype == mfem::FiniteElement::DIV);
-+   const bool test_curl = (test_fetype == mfem::FiniteElement::CURL);
-+   const bool test_div = (test_fetype == mfem::FiniteElement::DIV);
-+
-+   bool symmetricSpaces = true;
-+   if (dim == 3 && ((trial_div && test_curl) || (trial_curl && test_div)))
-+   {
-+      const bool scalarCoeff = !(DQ || MQ);
-+      internal::PAHcurlHdivMassApply3D(dofs1D, dofs1Dtest, quad1D, ne, scalarCoeff,
-+                                       trial_div, true, mapsO->B, mapsC->B,
-+                                       mapsOtest->Bt, mapsCtest->Bt, pa_data, x, y);
-+      symmetricSpaces = false;
-+   }
-+   else if (dim == 2 && ((trial_curl && test_div) || (trial_div && test_curl)))
-+   {
-+      const bool scalarCoeff = !(DQ || MQ);
-+      internal::PAHcurlHdivMassApply2D(dofs1D, dofs1Dtest, quad1D, ne, scalarCoeff,
-+                                       !trial_curl, true, mapsO->B, mapsC->B,
-+                                       mapsOtest->Bt, mapsCtest->Bt, pa_data, x, y);
-+      symmetricSpaces = false;
-+   }
-+   if (symmetricSpaces)
-+   {
-+      if (MQ && dynamic_cast<SymmetricMatrixCoefficient*>(MQ) == NULL)
-+      {
-+         MFEM_ABORT("VectorFEMassIntegrator transpose not implemented for asymmetric MatrixCoefficient");
-+      }
-+      AddMultPA(x, y);
-+   }
-+}
-+
-+} // namespace mfem
-diff --git a/fem/lininteg_boundary.cpp b/fem/integ/lininteg_boundary.cpp
-similarity index 89%
-rename from fem/lininteg_boundary.cpp
-rename to fem/integ/lininteg_boundary.cpp
-index 68e54dd1b..9b785335c 100644
---- a/fem/lininteg_boundary.cpp
-+++ b/fem/integ/lininteg_boundary.cpp
-@@ -9,18 +9,19 @@
- // terms of the BSD-3 license. We welcome feedback and contributions, see file
- // CONTRIBUTING.md for details.
- 
--#include "fem.hpp"
--#include "../fem/kernels.hpp"
--#include "../general/forall.hpp"
-+#include "../../general/forall.hpp"
-+#include "../../fem/kernels.hpp"
-+#include "../fem.hpp"
- 
- namespace mfem
- {
- 
--template<int T_D1D = 0, int T_Q1D = 0> static
--void BLFEvalAssemble2D(const int vdim, const int nbe, const int d, const int q,
--                       const bool normals, const int *markers, const double *b,
--                       const double *detj, const double *n, const double *weights,
--                       const Vector &coeff, double *y)
-+template<int T_D1D = 0, int T_Q1D = 0>
-+static void BLFEvalAssemble2D(const int vdim, const int nbe, const int d,
-+                              const int q,
-+                              const bool normals, const int *markers, const double *b,
-+                              const double *detj, const double *n, const double *weights,
-+                              const Vector &coeff, double *y)
- {
-    const auto F = coeff.Read();
-    const auto M = Reshape(markers, nbe);
-@@ -69,11 +70,12 @@ void BLFEvalAssemble2D(const int vdim, const int nbe, const int d, const int q,
-    });
- }
- 
--template<int T_D1D = 0, int T_Q1D = 0> static
--void BLFEvalAssemble3D(const int vdim, const int nbe, const int d, const int q,
--                       const bool normals, const int *markers, const double *b,
--                       const double *detj, const double *n, const double *weights,
--                       const Vector &coeff, double *y)
-+template<int T_D1D = 0, int T_Q1D = 0>
-+static void BLFEvalAssemble3D(const int vdim, const int nbe, const int d,
-+                              const int q,
-+                              const bool normals, const int *markers, const double *b,
-+                              const double *detj, const double *n, const double *weights,
-+                              const Vector &coeff, double *y)
- {
-    const auto F = coeff.Read();
-    const auto M = Reshape(markers, nbe);
-diff --git a/fem/lininteg_boundary_flux.cpp b/fem/integ/lininteg_boundary_flux.cpp
-similarity index 89%
-rename from fem/lininteg_boundary_flux.cpp
-rename to fem/integ/lininteg_boundary_flux.cpp
-index 411ba0314..b9f047817 100644
---- a/fem/lininteg_boundary_flux.cpp
-+++ b/fem/integ/lininteg_boundary_flux.cpp
-@@ -9,17 +9,17 @@
- // terms of the BSD-3 license. We welcome feedback and contributions, see file
- // CONTRIBUTING.md for details.
- 
--#include "fem.hpp"
--#include "../fem/kernels.hpp"
--#include "../general/forall.hpp"
-+#include "../../general/forall.hpp"
-+#include "../../fem/kernels.hpp"
-+#include "../fem.hpp"
- 
- namespace mfem
- {
- 
--template<int T_D1D = 0, int T_Q1D = 0> static
--void BFLFEvalAssemble2D(const int nbe, const int d, const int q,
--                        const int *markers, const double *b,
--                        const double *weights, const Vector &coeff, double *y)
-+template<int T_D1D = 0, int T_Q1D = 0>
-+static void BFLFEvalAssemble2D(const int nbe, const int d, const int q,
-+                               const int *markers, const double *b,
-+                               const double *weights, const Vector &coeff, double *y)
- {
-    const auto F = coeff.Read();
-    const auto M = Reshape(markers, nbe);
-@@ -50,10 +50,10 @@ void BFLFEvalAssemble2D(const int nbe, const int d, const int q,
-    });
- }
- 
--template<int T_D1D = 0, int T_Q1D = 0> static
--void BFLFEvalAssemble3D(const int nbe, const int d, const int q,
--                        const int *markers, const double *b,
--                        const double *weights, const Vector &coeff, double *y)
-+template<int T_D1D = 0, int T_Q1D = 0>
-+static void BFLFEvalAssemble3D(const int nbe, const int d, const int q,
-+                               const int *markers, const double *b,
-+                               const double *weights, const Vector &coeff, double *y)
- {
-    const auto F = coeff.Read();
-    const auto M = Reshape(markers, nbe);
-diff --git a/fem/lininteg_domain.cpp b/fem/integ/lininteg_domain.cpp
-similarity index 91%
-rename from fem/lininteg_domain.cpp
-rename to fem/integ/lininteg_domain.cpp
-index 6fa0ec82b..6ff7b090d 100644
---- a/fem/lininteg_domain.cpp
-+++ b/fem/integ/lininteg_domain.cpp
-@@ -9,18 +9,19 @@
- // terms of the BSD-3 license. We welcome feedback and contributions, see file
- // CONTRIBUTING.md for details.
- 
--#include "fem.hpp"
--#include "../fem/kernels.hpp"
--#include "../general/forall.hpp"
-+#include "../../general/forall.hpp"
-+#include "../../fem/kernels.hpp"
-+#include "../fem.hpp"
- 
- namespace mfem
- {
- 
--template<int T_D1D = 0, int T_Q1D = 0> static
--void DLFEvalAssemble2D(const int vdim, const int ne, const int d, const int q,
--                       const int map_type, const int *markers, const double *b,
--                       const double *detj, const double *weights,
--                       const Vector &coeff, double *y)
-+template<int T_D1D = 0, int T_Q1D = 0>
-+static void DLFEvalAssemble2D(const int vdim, const int ne, const int d,
-+                              const int q,
-+                              const int map_type, const int *markers, const double *b,
-+                              const double *detj, const double *weights,
-+                              const Vector &coeff, double *y)
- {
-    const auto F = coeff.Read();
-    const auto M = Reshape(markers, ne);
-@@ -85,11 +86,12 @@ void DLFEvalAssemble2D(const int vdim, const int ne, const int d, const int q,
-    });
- }
- 
--template<int T_D1D = 0, int T_Q1D = 0> static
--void DLFEvalAssemble3D(const int vdim, const int ne, const int d, const int q,
--                       const int map_type, const int *markers, const double *b,
--                       const double *detj, const double *weights,
--                       const Vector &coeff, double *y)
-+template<int T_D1D = 0, int T_Q1D = 0>
-+static void DLFEvalAssemble3D(const int vdim, const int ne, const int d,
-+                              const int q,
-+                              const int map_type, const int *markers, const double *b,
-+                              const double *detj, const double *weights,
-+                              const Vector &coeff, double *y)
- {
-    const auto F = coeff.Read();
-    const auto M = Reshape(markers, ne);
-diff --git a/fem/lininteg_domain_grad.cpp b/fem/integ/lininteg_domain_grad.cpp
-similarity index 93%
-rename from fem/lininteg_domain_grad.cpp
-rename to fem/integ/lininteg_domain_grad.cpp
-index 16131e11c..5cca01a1d 100644
---- a/fem/lininteg_domain_grad.cpp
-+++ b/fem/integ/lininteg_domain_grad.cpp
-@@ -9,18 +9,19 @@
- // terms of the BSD-3 license. We welcome feedback and contributions, see file
- // CONTRIBUTING.md for details.
- 
--#include "fem.hpp"
--#include "../fem/kernels.hpp"
--#include "../general/forall.hpp"
-+#include "../../general/forall.hpp"
-+#include "../../fem/kernels.hpp"
-+#include "../fem.hpp"
- 
- namespace mfem
- {
- 
--template<int T_D1D = 0, int T_Q1D = 0> static
--void DLFGradAssemble2D(const int vdim, const int ne, const int d, const int q,
--                       const int *markers, const double *b, const double *g,
--                       const double *jacobians,
--                       const double *weights, const Vector &coeff, double *y)
-+template<int T_D1D = 0, int T_Q1D = 0>
-+static void DLFGradAssemble2D(const int vdim, const int ne, const int d,
-+                              const int q,
-+                              const int *markers, const double *b, const double *g,
-+                              const double *jacobians,
-+                              const double *weights, const Vector &coeff, double *y)
- {
-    const auto F = coeff.Read();
-    const auto M = Reshape(markers, ne);
-@@ -108,12 +109,13 @@ void DLFGradAssemble2D(const int vdim, const int ne, const int d, const int q,
-    });
- }
- 
--template<int T_D1D = 0, int T_Q1D = 0> static
--void DLFGradAssemble3D(const int vdim, const int ne, const int d, const int q,
--                       const int *markers, const double *b, const double *g,
--                       const double *jacobians,
--                       const double *weights, const Vector &coeff,
--                       double *output)
-+template<int T_D1D = 0, int T_Q1D = 0>
-+static void DLFGradAssemble3D(const int vdim, const int ne, const int d,
-+                              const int q,
-+                              const int *markers, const double *b, const double *g,
-+                              const double *jacobians,
-+                              const double *weights, const Vector &coeff,
-+                              double *output)
- {
-    const auto F = coeff.Read();
-    const auto M = Reshape(markers, ne);
-diff --git a/fem/lininteg_vectorfe_domain.cpp b/fem/integ/lininteg_domain_vectorfe.cpp
-similarity index 99%
-rename from fem/lininteg_vectorfe_domain.cpp
-rename to fem/integ/lininteg_domain_vectorfe.cpp
-index 55a3dda7e..16d9e866c 100644
---- a/fem/lininteg_vectorfe_domain.cpp
-+++ b/fem/integ/lininteg_domain_vectorfe.cpp
-@@ -9,9 +9,9 @@
- // terms of the BSD-3 license. We welcome feedback and contributions, see file
- // CONTRIBUTING.md for details.
- 
--#include "fem.hpp"
--#include "../fem/kernels.hpp"
--#include "../general/forall.hpp"
-+#include "../../general/forall.hpp"
-+#include "../../fem/kernels.hpp"
-+#include "../fem.hpp"
- 
- namespace mfem
- {
-diff --git a/fem/nonlininteg_vectorconvection_mf.cpp b/fem/integ/nonlininteg_vecconvection_mf.cpp
-similarity index 92%
-rename from fem/nonlininteg_vectorconvection_mf.cpp
-rename to fem/integ/nonlininteg_vecconvection_mf.cpp
-index c29f4e920..4005d6836 100644
---- a/fem/nonlininteg_vectorconvection_mf.cpp
-+++ b/fem/integ/nonlininteg_vecconvection_mf.cpp
-@@ -9,14 +9,13 @@
- // terms of the BSD-3 license. We welcome feedback and contributions, see file
- // CONTRIBUTING.md for details.
- 
--#include "../general/forall.hpp"
--#include "nonlininteg.hpp"
--#include "ceed/integrators/nlconvection/nlconvection.hpp"
--
--using namespace std;
-+#include "../../general/forall.hpp"
-+#include "../nonlininteg.hpp"
-+#include "../ceed/integrators/nlconvection/nlconvection.hpp"
- 
- namespace mfem
- {
-+
- void VectorConvectionNLFIntegrator::AssembleMF(const FiniteElementSpace &fes)
- {
-    MFEM_ASSERT(fes.GetOrdering() == Ordering::byNODES,
-diff --git a/fem/nonlininteg_vectorconvection.cpp b/fem/integ/nonlininteg_vecconvection_pa.cpp
-similarity index 99%
-rename from fem/nonlininteg_vectorconvection.cpp
-rename to fem/integ/nonlininteg_vecconvection_pa.cpp
-index efa7a10a3..7bed31800 100644
---- a/fem/nonlininteg_vectorconvection.cpp
-+++ b/fem/integ/nonlininteg_vecconvection_pa.cpp
-@@ -9,14 +9,13 @@
- // terms of the BSD-3 license. We welcome feedback and contributions, see file
- // CONTRIBUTING.md for details.
- 
--#include "../general/forall.hpp"
--#include "nonlininteg.hpp"
--#include "ceed/integrators/nlconvection/nlconvection.hpp"
--
--using namespace std;
-+#include "../../general/forall.hpp"
-+#include "../nonlininteg.hpp"
-+#include "../ceed/integrators/nlconvection/nlconvection.hpp"
- 
- namespace mfem
- {
-+
- void VectorConvectionNLFIntegrator::AssemblePA(const FiniteElementSpace &fes)
- {
-    MFEM_ASSERT(fes.GetOrdering() == Ordering::byNODES,
-diff --git a/fem/intrules.cpp b/fem/intrules.cpp
-index 67ab66320..1494043d6 100644
---- a/fem/intrules.cpp
-+++ b/fem/intrules.cpp
-@@ -946,22 +946,14 @@ const IntegrationRule &IntegrationRules::Get(int GeomType, int Order)
- 
-    if (!HaveIntRule(*ir_array, Order))
-    {
--#ifdef MFEM_USE_LEGACY_OPENMP
--      #pragma omp critical
--#endif
-+      IntegrationRule *ir = GenerateIntegrationRule(GeomType, Order);
-+      int RealOrder = Order;
-+      while (RealOrder+1 < ir_array->Size() &&
-+             (*ir_array)[RealOrder+1] == ir)
-       {
--         if (!HaveIntRule(*ir_array, Order))
--         {
--            IntegrationRule *ir = GenerateIntegrationRule(GeomType, Order);
--            int RealOrder = Order;
--            while (RealOrder+1 < ir_array->Size() &&
--                   (*ir_array)[RealOrder+1] == ir)
--            {
--               RealOrder++;
--            }
--            ir->SetOrder(RealOrder);
--         }
-+         RealOrder++;
-       }
-+      ir->SetOrder(RealOrder);
-    }
- 
-    return *(*ir_array)[Order];
-diff --git a/fem/linearform.hpp b/fem/linearform.hpp
-index c24118426..29f816db1 100644
---- a/fem/linearform.hpp
-+++ b/fem/linearform.hpp
-@@ -121,10 +121,6 @@ public:
-    LinearForm &operator=(const LinearForm &rhs)
-    { return operator=((const Vector &)rhs); }
- 
--   /// (DEPRECATED) Return the FE space associated with the LinearForm.
--   /** @deprecated Use FESpace() instead. */
--   MFEM_DEPRECATED FiniteElementSpace *GetFES() { return fes; }
--
-    /// Read+write access to the associated FiniteElementSpace.
-    FiniteElementSpace *FESpace() { return fes; }
-    /// Read-only access to the associated FiniteElementSpace.
-diff --git a/fem/linearform_ext.cpp b/fem/linearform_ext.cpp
-index f70a7b6e3..3475574fd 100644
---- a/fem/linearform_ext.cpp
-+++ b/fem/linearform_ext.cpp
-@@ -164,10 +164,8 @@ void LinearFormExtension::Update()
-          }
-       }
- 
--      bdr_restrict_lex =
--         dynamic_cast<const FaceRestriction*>(
--            fes.GetFaceRestriction(ordering, FaceType::Boundary,
--                                   L2FaceValues::SingleValued));
-+      bdr_restrict_lex = fes.GetFaceRestriction(ordering, FaceType::Boundary,
-+                                                L2FaceValues::SingleValued);
-       MFEM_VERIFY(bdr_restrict_lex, "Face restriction not available");
-       bdr_b.SetSize(bdr_restrict_lex->Height(), Device::GetMemoryType());
-       bdr_b.UseDevice(true);
-diff --git a/fem/linearform_ext.hpp b/fem/linearform_ext.hpp
-index 2cc861cea..46acf637d 100644
---- a/fem/linearform_ext.hpp
-+++ b/fem/linearform_ext.hpp
-@@ -34,7 +34,7 @@ class LinearFormExtension
-    LinearForm *lf;
- 
-    /// Operator that converts FiniteElementSpace L-vectors to E-vectors.
--   const ElementRestrictionOperator *elem_restrict_lex; // Not owned
-+   const ElementRestriction *elem_restrict_lex; // Not owned
- 
-    /// Operator that converts L-vectors to boundary E-vectors.
-    const FaceRestriction *bdr_restrict_lex; // Not owned
-diff --git a/fem/lininteg.cpp b/fem/lininteg.cpp
-index 52abc3cad..c9b6b4699 100644
---- a/fem/lininteg.cpp
-+++ b/fem/lininteg.cpp
-@@ -9,7 +9,6 @@
- // terms of the BSD-3 license. We welcome feedback and contributions, see file
- // CONTRIBUTING.md for details.
- 
--
- #include "fem.hpp"
- #include <cmath>
- 
-diff --git a/fem/lor/lor_ads.cpp b/fem/lor/lor_ads.cpp
-index 3ba4816ee..f962f8469 100644
---- a/fem/lor/lor_ads.cpp
-+++ b/fem/lor/lor_ads.cpp
-@@ -103,9 +103,9 @@ void BatchedLOR_ADS::FormCurlMatrix()
-    Form3DFaceToEdge(face2edge);
- 
-    ElementDofOrdering ordering = ElementDofOrdering::LEXICOGRAPHIC;
--   const auto *R_f = dynamic_cast<const ElementRestriction*>(
-+   const auto *R_f = dynamic_cast<const ConformingElementRestriction*>(
-                         face_fes.GetElementRestriction(ordering));
--   const auto *R_e = dynamic_cast<const ElementRestriction*>(
-+   const auto *R_e = dynamic_cast<const ConformingElementRestriction*>(
-                         edge_fes.GetElementRestriction(ordering));
-    MFEM_VERIFY(R_f != NULL && R_e != NULL, "");
- 
-diff --git a/fem/lor/lor_ams.cpp b/fem/lor/lor_ams.cpp
-index 1c37f165b..0d7ac222f 100644
---- a/fem/lor/lor_ams.cpp
-+++ b/fem/lor/lor_ams.cpp
-@@ -163,9 +163,9 @@ void BatchedLOR_AMS::FormGradientMatrix()
-    else { Form3DEdgeToVertex(edge2vertex); }
- 
-    ElementDofOrdering ordering = ElementDofOrdering::LEXICOGRAPHIC;
--   const auto *R_v = dynamic_cast<const ElementRestriction*>(
-+   const auto *R_v = dynamic_cast<const ConformingElementRestriction*>(
-                         vert_fes.GetElementRestriction(ordering));
--   const auto *R_e = dynamic_cast<const ElementRestriction*>(
-+   const auto *R_e = dynamic_cast<const ConformingElementRestriction*>(
-                         edge_fes.GetElementRestriction(ordering));
-    MFEM_VERIFY(R_v != NULL && R_e != NULL, "");
- 
-@@ -268,7 +268,7 @@ void BatchedLOR_AMS::FormCoordinateVectors(const Vector &X_vert)
-    // Create the H1 vertex space and get the element restriction
-    ElementDofOrdering ordering = ElementDofOrdering::LEXICOGRAPHIC;
-    const Operator *op = vert_fes.GetElementRestriction(ordering);
--   const auto *el_restr = dynamic_cast<const ElementRestriction*>(op);
-+   const auto *el_restr = dynamic_cast<const ConformingElementRestriction*>(op);
-    MFEM_VERIFY(el_restr != NULL, "");
-    const SparseMatrix *R = vert_fes.GetRestrictionMatrix();
- 
-diff --git a/fem/lor/lor_batched.cpp b/fem/lor/lor_batched.cpp
-index fe0494880..0050c3de5 100644
---- a/fem/lor/lor_batched.cpp
-+++ b/fem/lor/lor_batched.cpp
-@@ -145,8 +145,8 @@ int BatchedLORAssembly::FillI(SparseMatrix &A) const
- 
-    const ElementDofOrdering ordering = ElementDofOrdering::LEXICOGRAPHIC;
-    const Operator *op = fes_ho.GetElementRestriction(ordering);
--   const ElementRestriction *el_restr =
--      dynamic_cast<const ElementRestriction*>(op);
-+   const auto *el_restr =
-+      dynamic_cast<const ConformingElementRestriction*>(op);
-    MFEM_VERIFY(el_restr != nullptr, "Bad element restriction");
- 
-    const Array<int> &el_dof_lex_ = el_restr->GatherMap();
-@@ -235,8 +235,8 @@ void BatchedLORAssembly::FillJAndData(SparseMatrix &A) const
- 
-    const ElementDofOrdering ordering = ElementDofOrdering::LEXICOGRAPHIC;
-    const Operator *op = fes_ho.GetElementRestriction(ordering);
--   const ElementRestriction *el_restr =
--      dynamic_cast<const ElementRestriction*>(op);
-+   const auto *el_restr =
-+      dynamic_cast<const ConformingElementRestriction*>(op);
-    MFEM_VERIFY(el_restr != nullptr, "Bad element restriction");
- 
-    const Array<int> &el_dof_lex_ = el_restr->GatherMap();
-diff --git a/fem/nonlinearform.hpp b/fem/nonlinearform.hpp
-index d15d09e04..60cae2055 100644
---- a/fem/nonlinearform.hpp
-+++ b/fem/nonlinearform.hpp
-@@ -330,7 +330,6 @@ public:
-    virtual ~BlockNonlinearForm();
- };
- 
--
- }
- 
- #endif
-diff --git a/fem/nonlininteg.cpp b/fem/nonlininteg.cpp
-index e1558fda4..5ee1febea 100644
---- a/fem/nonlininteg.cpp
-+++ b/fem/nonlininteg.cpp
-@@ -15,68 +15,69 @@
- namespace mfem
- {
- 
--double NonlinearFormIntegrator::GetLocalStateEnergyPA(const Vector &x) const
--{
--   mfem_error ("NonlinearFormIntegrator::GetLocalStateEnergyPA(...)\n"
--               "   is not implemented for this class.");
--   return 0.0;
--}
--
- void NonlinearFormIntegrator::AssemblePA(const FiniteElementSpace&)
- {
--   mfem_error ("NonlinearFormIntegrator::AssemblePA(...)\n"
--               "   is not implemented for this class.");
-+   MFEM_ABORT("NonlinearFormIntegrator::AssemblePA(...)\n"
-+              "   is not implemented for this class.");
- }
- 
--void NonlinearFormIntegrator::AssemblePA(const FiniteElementSpace &,
--                                         const FiniteElementSpace &)
-+void NonlinearFormIntegrator::AssembleGradPA(const Vector &x,
-+                                             const FiniteElementSpace &fes)
- {
--   mfem_error ("NonlinearFormIntegrator::AssemblePA(...)\n"
--               "   is not implemented for this class.");
-+   MFEM_ABORT("NonlinearFormIntegrator::AssembleGradPA(...)\n"
-+              "   is not implemented for this class.");
- }
- 
--void NonlinearFormIntegrator::AssembleGradPA(const Vector &x,
--                                             const FiniteElementSpace &fes)
-+void NonlinearFormIntegrator::AssembleGradDiagonalPA(Vector &diag) const
- {
--   mfem_error ("NonlinearFormIntegrator::AssembleGradPA(...)\n"
--               "   is not implemented for this class.");
-+   MFEM_ABORT("NonlinearFormIntegrator::AssembleGradDiagonalPA(...)\n"
-+              "   is not implemented for this class.");
- }
- 
- void NonlinearFormIntegrator::AddMultPA(const Vector &, Vector &) const
- {
--   mfem_error ("NonlinearFormIntegrator::AddMultPA(...)\n"
--               "   is not implemented for this class.");
-+   MFEM_ABORT("NonlinearFormIntegrator::AddMultPA(...)\n"
-+              "   is not implemented for this class.");
- }
- 
- void NonlinearFormIntegrator::AddMultGradPA(const Vector&, Vector&) const
- {
--   mfem_error ("NonlinearFormIntegrator::AddMultGradPA(...)\n"
--               "   is not implemented for this class.");
-+   MFEM_ABORT("NonlinearFormIntegrator::AddMultGradPA(...)\n"
-+              "   is not implemented for this class.");
- }
- 
--void NonlinearFormIntegrator::AssembleGradDiagonalPA(Vector &diag) const
-+double NonlinearFormIntegrator::GetLocalStateEnergyPA(const Vector &x) const
- {
--   mfem_error ("NonlinearFormIntegrator::AssembleGradDiagonalPA(...)\n"
--               "   is not implemented for this class.");
-+   MFEM_ABORT("NonlinearFormIntegrator::GetLocalStateEnergyPA(...)\n"
-+              "   is not implemented for this class.");
-+   return 0.0;
- }
- 
- void NonlinearFormIntegrator::AssembleMF(const FiniteElementSpace &fes)
- {
--   mfem_error ("NonlinearFormIntegrator::AssembleMF(...)\n"
--               "   is not implemented for this class.");
-+   MFEM_ABORT("NonlinearFormIntegrator::AssembleMF(...)\n"
-+              "   is not implemented for this class.");
- }
- 
- void NonlinearFormIntegrator::AddMultMF(const Vector &, Vector &) const
- {
--   mfem_error ("NonlinearFormIntegrator::AddMultMF(...)\n"
--               "   is not implemented for this class.");
-+   MFEM_ABORT("NonlinearFormIntegrator::AddMultMF(...)\n"
-+              "   is not implemented for this class.");
-+}
-+
-+double NonlinearFormIntegrator::GetElementEnergy(
-+   const FiniteElement &el, ElementTransformation &Tr, const Vector &elfun)
-+{
-+   MFEM_ABORT("NonlinearFormIntegrator::GetElementEnergy"
-+              " is not overloaded!");
-+   return 0.0;
- }
- 
- void NonlinearFormIntegrator::AssembleElementVector(
-    const FiniteElement &el, ElementTransformation &Tr,
-    const Vector &elfun, Vector &elvect)
- {
--   mfem_error("NonlinearFormIntegrator::AssembleElementVector"
-+   MFEM_ABORT("NonlinearFormIntegrator::AssembleElementVector"
-               " is not overloaded!");
- }
- 
-@@ -84,7 +85,7 @@ void NonlinearFormIntegrator::AssembleFaceVector(
-    const FiniteElement &el1, const FiniteElement &el2,
-    FaceElementTransformations &Tr, const Vector &elfun, Vector &elvect)
- {
--   mfem_error("NonlinearFormIntegrator::AssembleFaceVector"
-+   MFEM_ABORT("NonlinearFormIntegrator::AssembleFaceVector"
-               " is not overloaded!");
- }
- 
-@@ -92,7 +93,7 @@ void NonlinearFormIntegrator::AssembleElementGrad(
-    const FiniteElement &el, ElementTransformation &Tr, const Vector &elfun,
-    DenseMatrix &elmat)
- {
--   mfem_error("NonlinearFormIntegrator::AssembleElementGrad"
-+   MFEM_ABORT("NonlinearFormIntegrator::AssembleElementGrad"
-               " is not overloaded!");
- }
- 
-@@ -101,18 +102,10 @@ void NonlinearFormIntegrator::AssembleFaceGrad(
-    FaceElementTransformations &Tr, const Vector &elfun,
-    DenseMatrix &elmat)
- {
--   mfem_error("NonlinearFormIntegrator::AssembleFaceGrad"
-+   MFEM_ABORT("NonlinearFormIntegrator::AssembleFaceGrad"
-               " is not overloaded!");
- }
- 
--double NonlinearFormIntegrator::GetElementEnergy(
--   const FiniteElement &el, ElementTransformation &Tr, const Vector &elfun)
--{
--   mfem_error("NonlinearFormIntegrator::GetElementEnergy"
--              " is not overloaded!");
--   return 0.0;
--}
--
- 
- void BlockNonlinearFormIntegrator::AssembleElementVector(
-    const Array<const FiniteElement *> &el,
-@@ -120,7 +113,7 @@ void BlockNonlinearFormIntegrator::AssembleElementVector(
-    const Array<const Vector *> &elfun,
-    const Array<Vector *> &elvec)
- {
--   mfem_error("BlockNonlinearFormIntegrator::AssembleElementVector"
-+   MFEM_ABORT("BlockNonlinearFormIntegrator::AssembleElementVector"
-               " is not overloaded!");
- }
- 
-@@ -131,7 +124,7 @@ void BlockNonlinearFormIntegrator::AssembleFaceVector(
-    const Array<const Vector *> &elfun,
-    const Array<Vector *> &elvect)
- {
--   mfem_error("BlockNonlinearFormIntegrator::AssembleFaceVector"
-+   MFEM_ABORT("BlockNonlinearFormIntegrator::AssembleFaceVector"
-               " is not overloaded!");
- }
- 
-@@ -141,7 +134,7 @@ void BlockNonlinearFormIntegrator::AssembleElementGrad(
-    const Array<const Vector *> &elfun,
-    const Array2D<DenseMatrix *> &elmats)
- {
--   mfem_error("BlockNonlinearFormIntegrator::AssembleElementGrad"
-+   MFEM_ABORT("BlockNonlinearFormIntegrator::AssembleElementGrad"
-               " is not overloaded!");
- }
- 
-@@ -152,7 +145,7 @@ void BlockNonlinearFormIntegrator::AssembleFaceGrad(
-    const Array<const Vector *> &elfun,
-    const Array2D<DenseMatrix *> &elmats)
- {
--   mfem_error("BlockNonlinearFormIntegrator::AssembleFaceGrad"
-+   MFEM_ABORT("BlockNonlinearFormIntegrator::AssembleFaceGrad"
-               " is not overloaded!");
- }
- 
-@@ -161,7 +154,7 @@ double BlockNonlinearFormIntegrator::GetElementEnergy(
-    ElementTransformation &Tr,
-    const Array<const Vector *>&elfun)
- {
--   mfem_error("BlockNonlinearFormIntegrator::GetElementEnergy"
-+   MFEM_ABORT("BlockNonlinearFormIntegrator::GetElementEnergy"
-               " is not overloaded!");
-    return 0.0;
- }
-@@ -497,6 +490,7 @@ void HyperelasticNLFIntegrator::AssembleElementGrad(const FiniteElement &el,
-    }
- }
- 
-+
- double IncompressibleNeoHookeanIntegrator::GetElementEnergy(
-    const Array<const FiniteElement *>&el,
-    ElementTransformation &Tr,
-@@ -504,7 +498,7 @@ double IncompressibleNeoHookeanIntegrator::GetElementEnergy(
- {
-    if (el.Size() != 2)
-    {
--      mfem_error("IncompressibleNeoHookeanIntegrator::GetElementEnergy"
-+      MFEM_ABORT("IncompressibleNeoHookeanIntegrator::GetElementEnergy"
-                  " has incorrect block finite element space size!");
-    }
- 
-@@ -549,7 +543,7 @@ void IncompressibleNeoHookeanIntegrator::AssembleElementVector(
- {
-    if (el.Size() != 2)
-    {
--      mfem_error("IncompressibleNeoHookeanIntegrator::AssembleElementVector"
-+      MFEM_ABORT("IncompressibleNeoHookeanIntegrator::AssembleElementVector"
-                  " has finite element space of incorrect block number");
-    }
- 
-@@ -561,11 +555,10 @@ void IncompressibleNeoHookeanIntegrator::AssembleElementVector(
- 
-    if (dim != spaceDim)
-    {
--      mfem_error("IncompressibleNeoHookeanIntegrator::AssembleElementVector"
-+      MFEM_ABORT("IncompressibleNeoHookeanIntegrator::AssembleElementVector"
-                  " is not defined on manifold meshes");
-    }
- 
--
-    DSh_u.SetSize(dof_u, dim);
-    DS_u.SetSize(dof_u, dim);
-    J0i.SetSize(dim);
-@@ -731,6 +724,7 @@ void IncompressibleNeoHookeanIntegrator::AssembleElementGrad(
- 
- }
- 
-+
- const IntegrationRule&
- VectorConvectionNLFIntegrator::GetRule(const FiniteElement &fe,
-                                        ElementTransformation &T)
-diff --git a/fem/nonlininteg.hpp b/fem/nonlininteg.hpp
-index 54f342b85..38b133244 100644
---- a/fem/nonlininteg.hpp
-+++ b/fem/nonlininteg.hpp
-@@ -29,13 +29,12 @@ class NonlinearFormIntegrator
- protected:
-    const IntegrationRule *IntRule;
- 
--   // CEED extension
--   ceed::Operator* ceedOp;
-+   ceed::Operator *ceedOp;  // libCEED extension
- 
-    MemoryType pa_mt = MemoryType::DEFAULT;
- 
-    NonlinearFormIntegrator(const IntegrationRule *ir = NULL)
--      : IntRule(ir), ceedOp(NULL) { }
-+      : IntRule(ir), ceedOp(NULL) {}
- 
- public:
-    /** @brief Prescribe a fixed IntegrationRule to use (when @a ir != NULL) or
-@@ -52,46 +51,11 @@ public:
-    /// Get the integration rule of the integrator (possibly NULL).
-    const IntegrationRule *GetIntegrationRule() const { return IntRule; }
- 
--   /// Perform the local action of the NonlinearFormIntegrator
--   virtual void AssembleElementVector(const FiniteElement &el,
--                                      ElementTransformation &Tr,
--                                      const Vector &elfun, Vector &elvect);
--
--   /// @brief Perform the local action of the NonlinearFormIntegrator resulting
--   /// from a face integral term.
--   virtual void AssembleFaceVector(const FiniteElement &el1,
--                                   const FiniteElement &el2,
--                                   FaceElementTransformations &Tr,
--                                   const Vector &elfun, Vector &elvect);
--
--   /// Assemble the local gradient matrix
--   virtual void AssembleElementGrad(const FiniteElement &el,
--                                    ElementTransformation &Tr,
--                                    const Vector &elfun, DenseMatrix &elmat);
--
--   /// @brief Assemble the local action of the gradient of the
--   /// NonlinearFormIntegrator resulting from a face integral term.
--   virtual void AssembleFaceGrad(const FiniteElement &el1,
--                                 const FiniteElement &el2,
--                                 FaceElementTransformations &Tr,
--                                 const Vector &elfun, DenseMatrix &elmat);
--
--   /// Compute the local energy
--   virtual double GetElementEnergy(const FiniteElement &el,
--                                   ElementTransformation &Tr,
--                                   const Vector &elfun);
--
-    /// Method defining partial assembly.
-    /** The result of the partial assembly is stored internally so that it can be
-        used later in the methods AddMultPA(). */
-    virtual void AssemblePA(const FiniteElementSpace &fes);
- 
--   /** The result of the partial assembly is stored internally so that it can be
--       used later in the methods AddMultPA().
--       Used with BilinearFormIntegrators that have different spaces. */
--   virtual void AssemblePA(const FiniteElementSpace &trial_fes,
--                           const FiniteElementSpace &test_fes);
--
-    /** @brief Prepare the integrator for partial assembly (PA) gradient
-        evaluations on the given FE space @a fes at the state @a x. */
-    /** The result of the partial assembly is stored internally so that it can be
-@@ -99,10 +63,12 @@ public:
-        The state Vector @a x is an E-vector. */
-    virtual void AssembleGradPA(const Vector &x, const FiniteElementSpace &fes);
- 
--   /// Compute the local (to the MPI rank) energy with partial assembly.
--   /** Here the state @a x is an E-vector. This method can be called only after
--       the method AssemblePA() has been called. */
--   virtual double GetLocalStateEnergyPA(const Vector &x) const;
-+   /// Method for computing the diagonal of the gradient with partial assembly.
-+   /** The result Vector @a diag is an E-Vector. This method can be called only
-+       after the method AssembleGradPA() has been called.
-+
-+       @param[in,out] diag  The result Vector: @f$ diag += diag(G) @f$. */
-+   virtual void AssembleGradDiagonalPA(Vector &diag) const;
- 
-    /// Method for partially assembled action.
-    /** Perform the action of integrator on the input @a x and add the result to
-@@ -121,15 +87,10 @@ public:
-        @param[in,out] y  The result Vector: @f$ y += G x @f$. */
-    virtual void AddMultGradPA(const Vector &x, Vector &y) const;
- 
--   /// Method for computing the diagonal of the gradient with partial assembly.
--   /** The result Vector @a diag is an E-Vector. This method can be called only
--       after the method AssembleGradPA() has been called.
--
--       @param[in,out] diag  The result Vector: @f$ diag += diag(G) @f$. */
--   virtual void AssembleGradDiagonalPA(Vector &diag) const;
--
--   /// Indicates whether this integrator can use a Ceed backend.
--   virtual bool SupportsCeed() const { return false; }
-+   /// Compute the local (to the MPI rank) energy with partial assembly.
-+   /** Here the state @a x is an E-vector. This method can be called only after
-+       the method AssemblePA() has been called. */
-+   virtual double GetLocalStateEnergyPA(const Vector &x) const;
- 
-    /// Method defining fully unassembled operator.
-    virtual void AssembleMF(const FiniteElementSpace &fes);
-@@ -142,7 +103,39 @@ public:
-        called. */
-    virtual void AddMultMF(const Vector &x, Vector &y) const;
- 
--   ceed::Operator& GetCeedOp() { return *ceedOp; }
-+   /// Compute the local energy
-+   virtual double GetElementEnergy(const FiniteElement &el,
-+                                   ElementTransformation &Tr,
-+                                   const Vector &elfun);
-+
-+   /// Perform the local action of the NonlinearFormIntegrator
-+   virtual void AssembleElementVector(const FiniteElement &el,
-+                                      ElementTransformation &Tr,
-+                                      const Vector &elfun, Vector &elvect);
-+
-+   /// @brief Perform the local action of the NonlinearFormIntegrator resulting
-+   /// from a face integral term.
-+   virtual void AssembleFaceVector(const FiniteElement &el1,
-+                                   const FiniteElement &el2,
-+                                   FaceElementTransformations &Tr,
-+                                   const Vector &elfun, Vector &elvect);
-+
-+   /// Assemble the local gradient matrix
-+   virtual void AssembleElementGrad(const FiniteElement &el,
-+                                    ElementTransformation &Tr,
-+                                    const Vector &elfun, DenseMatrix &elmat);
-+
-+   /// @brief Assemble the local action of the gradient of the
-+   /// NonlinearFormIntegrator resulting from a face integral term.
-+   virtual void AssembleFaceGrad(const FiniteElement &el1,
-+                                 const FiniteElement &el2,
-+                                 FaceElementTransformations &Tr,
-+                                 const Vector &elfun, DenseMatrix &elmat);
-+
-+   /// Indicates whether this integrator can use a Ceed backend.
-+   virtual bool SupportsCeed() const { return false; }
-+
-+   ceed::Operator &GetCeedOp() { return *ceedOp; }
- 
-    virtual ~NonlinearFormIntegrator()
-    {
-@@ -150,6 +143,7 @@ public:
-    }
- };
- 
-+
- /** The abstract base class BlockNonlinearFormIntegrator is
-     a generalization of the NonlinearFormIntegrator class suitable
-     for block state vectors. */
-@@ -185,7 +179,7 @@ public:
-                                  const Array<const Vector *> &elfun,
-                                  const Array2D<DenseMatrix *> &elmats);
- 
--   virtual ~BlockNonlinearFormIntegrator() { }
-+   virtual ~BlockNonlinearFormIntegrator() {}
- };
- 
- 
-@@ -197,8 +191,8 @@ protected:
-                                     transformation. */
- 
- public:
--   HyperelasticModel() : Ttr(NULL) { }
--   virtual ~HyperelasticModel() { }
-+   HyperelasticModel() : Ttr(NULL) {}
-+   virtual ~HyperelasticModel() {}
- 
-    /// A reference-element to target-element transformation that can be used to
-    /// evaluate Coefficient%s.
-@@ -277,7 +271,7 @@ public:
- 
-    NeoHookeanModel(Coefficient &mu_, Coefficient &K_, Coefficient *g_ = NULL)
-       : mu(0.0), K(0.0), g(1.0), c_mu(&mu_), c_K(&K_), c_g(g_),
--        have_coeffs(true) { }
-+        have_coeffs(true) {}
- 
-    virtual double EvalW(const DenseMatrix &J) const;
- 
-@@ -314,7 +308,7 @@ private:
- 
- public:
-    /** @param[in] m  HyperelasticModel that will be integrated. */
--   HyperelasticNLFIntegrator(HyperelasticModel *m) : model(m) { }
-+   HyperelasticNLFIntegrator(HyperelasticModel *m) : model(m) {}
- 
-    /** @brief Computes the integral of W(Jacobian(Trt)) over a target zone
-        @param[in] el     Type of FiniteElement.
-@@ -333,6 +327,7 @@ public:
-                                     const Vector &elfun, DenseMatrix &elmat);
- };
- 
-+
- /** Hyperelastic incompressible Neo-Hookean integrator with the PK1 stress
-     \f$P = \mu F - p F^{-T}\f$ where \f$\mu\f$ is the shear modulus,
-     \f$p\f$ is the pressure, and \f$F\f$ is the deformation gradient */
-@@ -345,7 +340,7 @@ private:
-    Vector Sh_p;
- 
- public:
--   IncompressibleNeoHookeanIntegrator(Coefficient &mu_) : c_mu(&mu_) { }
-+   IncompressibleNeoHookeanIntegrator(Coefficient &mu_) : c_mu(&mu_) {}
- 
-    virtual double GetElementEnergy(const Array<const FiniteElement *>&el,
-                                    ElementTransformation &Tr,
-@@ -371,6 +366,7 @@ private:
-    Coefficient *Q{};
-    DenseMatrix dshape, dshapex, EF, gradEF, ELV, elmat_comp;
-    Vector shape;
-+
-    // PA extension
-    Vector pa_data;
-    const DofToQuad *maps;         ///< Not owned
-@@ -378,7 +374,7 @@ private:
-    int dim, ne, nq;
- 
- public:
--   VectorConvectionNLFIntegrator(Coefficient &q): Q(&q) { }
-+   VectorConvectionNLFIntegrator(Coefficient &q): Q(&q) {}
- 
-    VectorConvectionNLFIntegrator() = default;
- 
-@@ -395,14 +391,12 @@ public:
-                                     const Vector &elfun,
-                                     DenseMatrix &elmat);
- 
--   using NonlinearFormIntegrator::AssemblePA;
--
-    virtual void AssemblePA(const FiniteElementSpace &fes);
- 
--   virtual void AssembleMF(const FiniteElementSpace &fes);
--
-    virtual void AddMultPA(const Vector &x, Vector &y) const;
- 
-+   virtual void AssembleMF(const FiniteElementSpace &fes);
-+
-    virtual void AddMultMF(const Vector &x, Vector &y) const;
- };
- 
-@@ -418,7 +412,7 @@ private:
-    Vector shape;
- 
- public:
--   ConvectiveVectorConvectionNLFIntegrator(Coefficient &q): Q(&q) { }
-+   ConvectiveVectorConvectionNLFIntegrator(Coefficient &q): Q(&q) {}
- 
-    ConvectiveVectorConvectionNLFIntegrator() = default;
- 
-@@ -441,7 +435,7 @@ private:
-    Vector shape;
- 
- public:
--   SkewSymmetricVectorConvectionNLFIntegrator(Coefficient &q): Q(&q) { }
-+   SkewSymmetricVectorConvectionNLFIntegrator(Coefficient &q): Q(&q) {}
- 
-    SkewSymmetricVectorConvectionNLFIntegrator() = default;
- 
-diff --git a/fem/pbilinearform.cpp b/fem/pbilinearform.cpp
-index ee1030c48..707473f87 100644
---- a/fem/pbilinearform.cpp
-+++ b/fem/pbilinearform.cpp
-@@ -19,107 +19,109 @@
- namespace mfem
- {
- 
--void ParBilinearForm::pAllocMat()
--{
--   int nbr_size = pfes->GetFaceNbrVSize();
- 
--   if (precompute_sparsity == 0 || fes->GetVDim() > 1)
--   {
--      if (keep_nbr_block)
--      {
--         mat = new SparseMatrix(height + nbr_size, width + nbr_size);
--      }
--      else
--      {
--         mat = new SparseMatrix(height, width + nbr_size);
--      }
--      return;
--   }
--
--   // the sparsity pattern is defined from the map: face->element->dof
--   const Table &lelem_ldof = fes->GetElementToDofTable(); // <-- dofs
--   const Table &nelem_ndof = pfes->face_nbr_element_dof; // <-- vdofs
--   Table elem_dof; // element + nbr-element <---> dof
--   if (nbr_size > 0)
--   {
--      // merge lelem_ldof and nelem_ndof into elem_dof
--      int s1 = lelem_ldof.Size(), s2 = nelem_ndof.Size();
--      const int *I1 = lelem_ldof.GetI(), *J1 = lelem_ldof.GetJ();
--      const int *I2 = nelem_ndof.GetI(), *J2 = nelem_ndof.GetJ();
--      const int nnz1 = I1[s1], nnz2 = I2[s2];
--
--      elem_dof.SetDims(s1 + s2, nnz1 + nnz2);
--
--      int *I = elem_dof.GetI(), *J = elem_dof.GetJ();
--      for (int i = 0; i <= s1; i++)
--      {
--         I[i] = I1[i];
--      }
--      for (int j = 0; j < nnz1; j++)
--      {
--         J[j] = J1[j];
--      }
--      for (int i = 0; i <= s2; i++)
--      {
--         I[s1+i] = I2[i] + nnz1;
--      }
--      for (int j = 0; j < nnz2; j++)
--      {
--         J[nnz1+j] = J2[j] + height;
--      }
--   }
--   //   dof_elem x  elem_face x face_elem x elem_dof  (keep_nbr_block = true)
--   // ldof_lelem x lelem_face x face_elem x elem_dof  (keep_nbr_block = false)
--   Table dof_dof;
--   {
--      Table face_dof; // face_elem x elem_dof
--      {
--         Table *face_elem = pfes->GetParMesh()->GetFaceToAllElementTable();
--         if (nbr_size > 0)
--         {
--            mfem::Mult(*face_elem, elem_dof, face_dof);
--         }
--         else
--         {
--            mfem::Mult(*face_elem, lelem_ldof, face_dof);
--         }
--         delete face_elem;
--         if (nbr_size > 0)
--         {
--            elem_dof.Clear();
--         }
--      }
--
--      if (keep_nbr_block)
--      {
--         Table dof_face;
--         Transpose(face_dof, dof_face, height + nbr_size);
--         mfem::Mult(dof_face, face_dof, dof_dof);
--      }
--      else
--      {
--         Table ldof_face;
--         {
--            Table face_ldof;
--            Table *face_lelem = fes->GetMesh()->GetFaceToElementTable();
--            mfem::Mult(*face_lelem, lelem_ldof, face_ldof);
--            delete face_lelem;
--            Transpose(face_ldof, ldof_face, height);
--         }
--         mfem::Mult(ldof_face, face_dof, dof_dof);
--      }
--   }
--
--   int *I = dof_dof.GetI();
--   int *J = dof_dof.GetJ();
--   int nrows = dof_dof.Size();
--   double *data = Memory<double>(I[nrows]);
--
--   mat = new SparseMatrix(I, J, data, nrows, height + nbr_size);
--   *mat = 0.0;
--
--   dof_dof.LoseData();
--}
-+//XX TODO
-+// void ParBilinearForm::pAllocMat()
-+// {
-+//    int nbr_size = pfes->GetFaceNbrVSize();
-+
-+//    if (precompute_sparsity == 0 || fes->GetVDim() > 1)
-+//    {
-+//       if (keep_nbr_block)
-+//       {
-+//          mat = new SparseMatrix(height + nbr_size, width + nbr_size);
-+//       }
-+//       else
-+//       {
-+//          mat = new SparseMatrix(height, width + nbr_size);
-+//       }
-+//       return;
-+//    }
-+
-+//    // the sparsity pattern is defined from the map: face->element->dof
-+//    const Table &lelem_ldof = fes->GetElementToDofTable(); // <-- dofs
-+//    const Table &nelem_ndof = pfes->face_nbr_element_dof; // <-- vdofs
-+//    Table elem_dof; // element + nbr-element <---> dof
-+//    if (nbr_size > 0)
-+//    {
-+//       // merge lelem_ldof and nelem_ndof into elem_dof
-+//       int s1 = lelem_ldof.Size(), s2 = nelem_ndof.Size();
-+//       const int *I1 = lelem_ldof.GetI(), *J1 = lelem_ldof.GetJ();
-+//       const int *I2 = nelem_ndof.GetI(), *J2 = nelem_ndof.GetJ();
-+//       const int nnz1 = I1[s1], nnz2 = I2[s2];
-+
-+//       elem_dof.SetDims(s1 + s2, nnz1 + nnz2);
-+
-+//       int *I = elem_dof.GetI(), *J = elem_dof.GetJ();
-+//       for (int i = 0; i <= s1; i++)
-+//       {
-+//          I[i] = I1[i];
-+//       }
-+//       for (int j = 0; j < nnz1; j++)
-+//       {
-+//          J[j] = J1[j];
-+//       }
-+//       for (int i = 0; i <= s2; i++)
-+//       {
-+//          I[s1+i] = I2[i] + nnz1;
-+//       }
-+//       for (int j = 0; j < nnz2; j++)
-+//       {
-+//          J[nnz1+j] = J2[j] + height;
-+//       }
-+//    }
-+//    //   dof_elem x  elem_face x face_elem x elem_dof  (keep_nbr_block = true)
-+//    // ldof_lelem x lelem_face x face_elem x elem_dof  (keep_nbr_block = false)
-+//    Table dof_dof;
-+//    {
-+//       Table face_dof; // face_elem x elem_dof
-+//       {
-+//          Table *face_elem = pfes->GetParMesh()->GetFaceToAllElementTable();
-+//          if (nbr_size > 0)
-+//          {
-+//             mfem::Mult(*face_elem, elem_dof, face_dof);
-+//          }
-+//          else
-+//          {
-+//             mfem::Mult(*face_elem, lelem_ldof, face_dof);
-+//          }
-+//          delete face_elem;
-+//          if (nbr_size > 0)
-+//          {
-+//             elem_dof.Clear();
-+//          }
-+//       }
-+
-+//       if (keep_nbr_block)
-+//       {
-+//          Table dof_face;
-+//          Transpose(face_dof, dof_face, height + nbr_size);
-+//          mfem::Mult(dof_face, face_dof, dof_dof);
-+//       }
-+//       else
-+//       {
-+//          Table ldof_face;
-+//          {
-+//             Table face_ldof;
-+//             Table *face_lelem = fes->GetMesh()->GetFaceToElementTable();
-+//             mfem::Mult(*face_lelem, lelem_ldof, face_ldof);
-+//             delete face_lelem;
-+//             Transpose(face_ldof, ldof_face, height);
-+//          }
-+//          mfem::Mult(ldof_face, face_dof, dof_dof);
-+//       }
-+//    }
-+
-+//    int *I = dof_dof.GetI();
-+//    int *J = dof_dof.GetJ();
-+//    int nrows = dof_dof.Size();
-+//    double *data = Memory<double>(I[nrows]);
-+
-+//    mat = new SparseMatrix(I, J, data, nrows, height + nbr_size);
-+//    *mat = 0.0;
-+
-+//    dof_dof.LoseData();
-+// }
- 
- void ParBilinearForm::ParallelRAP(SparseMatrix &loc_A, OperatorHandle &A,
-                                   bool steal_loc_A)
-@@ -151,7 +153,8 @@ void ParBilinearForm::ParallelRAP(SparseMatrix &loc_A, OperatorHandle &A,
-    }
- }
- 
--void ParBilinearForm::ParallelAssemble(OperatorHandle &A, SparseMatrix *A_local)
-+void ParBilinearForm::ParallelAssemble(OperatorHandle &A,
-+                                       SparseMatrix *A_local) const
- {
-    A.Clear();
- 
-@@ -201,6 +204,7 @@ void ParBilinearForm::ParallelAssemble(OperatorHandle &A, SparseMatrix *A_local)
- 
-    // TODO - assemble the Dof_TrueDof_Matrix directly in the required format?
-    Ph.ConvertFrom(pfes->Dof_TrueDof_Matrix());
-+
-    // TODO: When Ph.Type() == Operator::ANY_TYPE we want to use the Operator
-    // returned by pfes->GetProlongationMatrix(), however that Operator is a
-    // const Operator, so we cannot store it in OperatorHandle. We need a const
-@@ -209,7 +213,7 @@ void ParBilinearForm::ParallelAssemble(OperatorHandle &A, SparseMatrix *A_local)
-    A.MakePtAP(dA, Ph);
- }
- 
--HypreParMatrix *ParBilinearForm::ParallelAssemble(SparseMatrix *m)
-+HypreParMatrix *ParBilinearForm::ParallelAssemble(SparseMatrix *m) const
- {
-    OperatorHandle Mh(Operator::Hypre_ParCSR);
-    ParallelAssemble(Mh, m);
-@@ -246,10 +250,9 @@ void ParBilinearForm::AssembleSharedFaces(int skip_zeros)
-       vdofs_all.Append(vdofs2);
-       for (int k = 0; k < interior_face_integs.Size(); k++)
-       {
--         interior_face_integs[k]->
--         AssembleFaceMatrix(*pfes->GetFE(T->Elem1No),
--                            *pfes->GetFaceNbrFE(Elem2NbrNo),
--                            *T, elemmat);
-+         interior_face_integs[k]->AssembleFaceMatrix(*pfes->GetFE(T->Elem1No),
-+                                                     *pfes->GetFaceNbrFE(Elem2NbrNo),
-+                                                     *T, elemmat);
-          if (keep_nbr_block)
-          {
-             mat->AddSubMatrix(vdofs_all, vdofs_all, elemmat, skip_zeros);
-@@ -269,7 +272,15 @@ void ParBilinearForm::Assemble(int skip_zeros)
-       pfes->ExchangeFaceNbrData();
-       if (!ext && mat == NULL)
-       {
--         pAllocMat();
-+         int nbr_size = pfes->GetFaceNbrVSize();
-+         if (keep_nbr_block)
-+         {
-+            mat = new SparseMatrix(height + nbr_size, width + nbr_size);
-+         }
-+         else
-+         {
-+            mat = new SparseMatrix(height, width + nbr_size);
-+         }
-       }
-    }
- 
-@@ -301,29 +312,23 @@ void ParBilinearForm::AssembleDiagonal(Vector &diag) const
-    // Here, we have extension, ext, and parallel/conforming prolongation, P.
-    Vector local_diag(P->Height());
-    ext->AssembleDiagonal(local_diag);
--   if (fes->Conforming())
-+   const HypreParMatrix *HP = dynamic_cast<const HypreParMatrix*>(P);
-+   if (!HP)
-    {
-+      // This is a parallel prolongation
-       P->MultTranspose(local_diag, diag);
-       return;
-    }
-    // For an AMR mesh, a convergent diagonal is assembled with |P^T| d_l,
-    // where |P^T| has the entry-wise absolute values of the conforming
-    // prolongation transpose operator.
--   const HypreParMatrix *HP = dynamic_cast<const HypreParMatrix*>(P);
--   if (HP)
--   {
--      HP->AbsMultTranspose(1.0, local_diag, 0.0, diag);
--   }
--   else
--   {
--      MFEM_ABORT("unsupported prolongation matrix type.");
--   }
-+   HP->AbsMultTranspose(1.0, local_diag, 0.0, diag);
- }
- 
--void ParBilinearForm
--::ParallelEliminateEssentialBC(const Array<int> &bdr_attr_is_ess,
--                               HypreParMatrix &A, const HypreParVector &X,
--                               HypreParVector &B) const
-+void ParBilinearForm::ParallelEliminateEssentialBC(
-+   const Array<int> &bdr_attr_is_ess,
-+   HypreParMatrix &A, const HypreParVector &X,
-+   HypreParVector &B) const
- {
-    Array<int> dof_list;
- 
-@@ -333,9 +338,9 @@ void ParBilinearForm
-    A.EliminateRowsCols(dof_list, X, B);
- }
- 
--HypreParMatrix *ParBilinearForm::
--ParallelEliminateEssentialBC(const Array<int> &bdr_attr_is_ess,
--                             HypreParMatrix &A) const
-+HypreParMatrix *ParBilinearForm::ParallelEliminateEssentialBC(
-+   const Array<int> &bdr_attr_is_ess,
-+   HypreParMatrix &A) const
- {
-    Array<int> dof_list;
- 
-@@ -374,7 +379,17 @@ void ParBilinearForm::FormLinearSystem(
- {
-    if (ext)
-    {
--      ext->FormLinearSystem(ess_tdof_list, x, b, A, X, B, copy_interior);
-+      Operator *oper;
-+      ext->FormLinearSystem(ess_tdof_list, x, b, oper, X, B, copy_interior);
-+      if (assembly == AssemblyLevel::FULL)
-+      {
-+         delete oper;
-+         FormSystemMatrix(ess_tdof_list, A);
-+      }
-+      else
-+      {
-+         A.Reset(oper);
-+      }
-       return;
-    }
- 
-@@ -418,18 +433,28 @@ void ParBilinearForm::FormLinearSystem(
-    }
- }
- 
--void ParBilinearForm::EliminateVDofsInRHS(
--   const Array<int> &vdofs, const Vector &x, Vector &b)
--{
--   p_mat.EliminateBC(p_mat_e, vdofs, x, b);
--}
--
- void ParBilinearForm::FormSystemMatrix(const Array<int> &ess_tdof_list,
-                                        OperatorHandle &A)
- {
-    if (ext)
-    {
--      ext->FormSystemMatrix(ess_tdof_list, A);
-+      if (assembly == AssemblyLevel::FULL)
-+      {
-+         // Always does `DIAG_ONE` policy to be consistent with
-+         // `Operator::FormConstrainedSystemOperator`.
-+         MFEM_VERIFY(diag_policy == DiagonalPolicy::DIAG_ONE,
-+                     "Only DiagonalPolicy::DIAG_ONE supported with"
-+                     " FABilinearFormExtension.");
-+         ParallelRAP(*mat, A);
-+         A.As<HypreParMatrix>()->EliminateBC(ess_tdof_list,
-+                                             DiagonalPolicy::DIAG_ONE);
-+      }
-+      else
-+      {
-+         Operator *oper;
-+         ext->FormSystemOperator(ess_tdof_list, oper);
-+         A.Reset(oper);
-+      }
-       return;
-    }
- 
-@@ -460,6 +485,7 @@ void ParBilinearForm::FormSystemMatrix(const Array<int> &ess_tdof_list,
-          delete mat_e;
-          mat_e = NULL;
-          p_mat_e.EliminateRowsCols(p_mat, ess_tdof_list);
-+         A = p_mat;
-       }
-       if (hybridization)
-       {
-@@ -521,30 +547,22 @@ void ParBilinearForm::Update(FiniteElementSpace *nfes)
-    p_mat_e.Clear();
- }
- 
--
--HypreParMatrix *ParMixedBilinearForm::ParallelAssemble()
-+void ParBilinearForm::EliminateVDofsInRHS(
-+   const Array<int> &vdofs, const Vector &x, Vector &b)
- {
--   // construct the block-diagonal matrix A
--   HypreParMatrix *A =
--      new HypreParMatrix(trial_pfes->GetComm(),
--                         test_pfes->GlobalVSize(),
--                         trial_pfes->GlobalVSize(),
--                         test_pfes->GetDofOffsets(),
--                         trial_pfes->GetDofOffsets(),
--                         mat);
-+   p_mat.EliminateBC(p_mat_e, vdofs, x, b);
-+}
- 
--   HypreParMatrix *rap = RAP(test_pfes->Dof_TrueDof_Matrix(), A,
--                             trial_pfes->Dof_TrueDof_Matrix());
-+void ParMixedBilinearForm::ParallelAssemble(OperatorHandle &A) const
-+{
-+   A.Clear();
- 
--   delete A;
-+   if (mat == NULL) { return; }
-+   MFEM_VERIFY(mat->Finalized(), "the local matrix must be finalized");
- 
--   return rap;
--}
-+   OperatorHandle dA(A.Type()), P_test(A.Type()), P_trial(A.Type());
- 
--void ParMixedBilinearForm::ParallelAssemble(OperatorHandle &A)
--{
-    // construct the rectangular block-diagonal matrix dA
--   OperatorHandle dA(A.Type());
-    dA.MakeRectangularBlockDiag(trial_pfes->GetComm(),
-                                test_pfes->GlobalVSize(),
-                                trial_pfes->GlobalVSize(),
-@@ -552,8 +570,6 @@ void ParMixedBilinearForm::ParallelAssemble(OperatorHandle &A)
-                                trial_pfes->GetDofOffsets(),
-                                mat);
- 
--   OperatorHandle P_test(A.Type()), P_trial(A.Type());
--
-    // TODO - construct the Dof_TrueDof_Matrix directly in the required format.
-    P_test.ConvertFrom(test_pfes->Dof_TrueDof_Matrix());
-    P_trial.ConvertFrom(trial_pfes->Dof_TrueDof_Matrix());
-@@ -561,6 +577,14 @@ void ParMixedBilinearForm::ParallelAssemble(OperatorHandle &A)
-    A.MakeRAP(P_test, dA, P_trial);
- }
- 
-+HypreParMatrix *ParMixedBilinearForm::ParallelAssemble() const
-+{
-+   OperatorHandle Mh(Operator::Hypre_ParCSR);
-+   ParallelAssemble(Mh);
-+   Mh.SetOperatorOwner(false);
-+   return Mh.As<HypreParMatrix>();
-+}
-+
- /// Compute y += a (P^t A P) x, where x and y are vectors on the true dofs
- void ParMixedBilinearForm::TrueAddMult(const Vector &x, Vector &y,
-                                        const double a) const
-@@ -576,21 +600,55 @@ void ParMixedBilinearForm::TrueAddMult(const Vector &x, Vector &y,
-    test_pfes->Dof_TrueDof_Matrix()->MultTranspose(a, Yaux, 1.0, y);
- }
- 
-+void ParMixedBilinearForm::FormRectangularLinearSystem(
-+   const Array<int> &trial_tdof_list,
-+   const Array<int> &test_tdof_list, Vector &x,
-+   Vector &b, OperatorHandle &A, Vector &X,
-+   Vector &B)
-+{
-+   if (ext)
-+   {
-+      Operator *oper;
-+      ext->FormRectangularLinearSystem(trial_tdof_list, test_tdof_list,
-+                                       x, b, oper, X, B);
-+      A.Reset(oper);
-+      return;
-+   }
-+
-+   FormRectangularSystemMatrix(trial_tdof_list, test_tdof_list, A);
-+
-+   const Operator *test_P = test_pfes->GetProlongationMatrix();
-+   const SparseMatrix *trial_R = trial_pfes->GetRestrictionMatrix();
-+
-+   X.SetSize(trial_pfes->TrueVSize());
-+   B.SetSize(test_pfes->TrueVSize());
-+   test_P->MultTranspose(b, B);
-+   trial_R->Mult(x, X);
-+
-+   p_mat_e.As<HypreParMatrix>()->Mult(-1.0, X, 1.0, B);
-+   B.SetSubVector(test_tdof_list, 0.0);
-+}
-+
- void ParMixedBilinearForm::FormRectangularSystemMatrix(
--   const Array<int>
--   &trial_tdof_list,
-+   const Array<int> &trial_tdof_list,
-    const Array<int> &test_tdof_list,
-    OperatorHandle &A)
- {
-    if (ext)
-    {
--      ext->FormRectangularSystemOperator(trial_tdof_list, test_tdof_list, A);
-+      Operator *oper;
-+      ext->FormRectangularSystemOperator(trial_tdof_list, test_tdof_list, oper);
-+      A.Reset(oper);
-       return;
-    }
- 
-    if (mat)
-    {
--      Finalize();
-+      const int remove_zeros = 0;
-+      Finalize(remove_zeros);
-+      MFEM_VERIFY(p_mat.Ptr() == NULL && p_mat_e.Ptr() == NULL,
-+                  "The ParMixedBilinearFormBilinearForm must be updated "
-+                  "with Update() before re-assembling the ParMixedBilinearFormBilinearForm.");
-       ParallelAssemble(p_mat);
-       delete mat;
-       mat = NULL;
-@@ -600,97 +658,89 @@ void ParMixedBilinearForm::FormRectangularSystemMatrix(
-          p_mat.As<HypreParMatrix>()->EliminateCols(trial_tdof_list);
-       p_mat.As<HypreParMatrix>()->EliminateRows(test_tdof_list);
-       p_mat_e.Reset(temp, true);
-+      A = p_mat;
-    }
--
--   A = p_mat;
- }
- 
--void ParMixedBilinearForm::FormRectangularLinearSystem(
--   const Array<int>
--   &trial_tdof_list,
--   const Array<int> &test_tdof_list, Vector &x,
--   Vector &b, OperatorHandle &A, Vector &X,
--   Vector &B)
-+void ParDiscreteLinearOperator::ParallelAssemble(OperatorHandle &A) const
- {
--   if (ext)
-+   A.Clear();
-+
-+   if (mat == NULL) { return; }
-+   MFEM_VERIFY(mat->Finalized(), "the local matrix must be finalized");
-+
-+   if (A.Type() == Operator::Hypre_ParCSR)
-    {
--      ext->FormRectangularLinearSystem(trial_tdof_list, test_tdof_list,
--                                       x, b, A, X, B);
--      return;
-+      const SparseMatrix *R = range_fes->GetRestrictionMatrix();
-+      const HypreParMatrix *P = domain_fes->Dof_TrueDof_Matrix();
-+      SparseMatrix *RA = mfem::Mult(*R, *mat);
-+      A.Reset(P->LeftDiagMult(*RA, range_fes->GetTrueDofOffsets()));
-+      delete RA;
-    }
-+   else
-+   {
-+      OperatorHandle dA(A.Type()), P_trial(A.Type()), Rt_test(A.Type());
- 
--   FormRectangularSystemMatrix(trial_tdof_list, test_tdof_list, A);
--
--   const Operator *test_P = test_pfes->GetProlongationMatrix();
--   const SparseMatrix *trial_R = trial_pfes->GetRestrictionMatrix();
-+      // construct the rectangular block-diagonal matrix dA
-+      dA.MakeRectangularBlockDiag(domain_fes->GetComm(),
-+                                  range_fes->GlobalVSize(),
-+                                  domain_fes->GlobalVSize(),
-+                                  range_fes->GetDofOffsets(),
-+                                  domain_fes->GetDofOffsets(),
-+                                  mat);
- 
--   X.SetSize(trial_pfes->TrueVSize());
--   B.SetSize(test_pfes->TrueVSize());
--   test_P->MultTranspose(b, B);
--   trial_R->Mult(x, X);
-+      SparseMatrix *Rt = Transpose(*range_fes->GetRestrictionMatrix());
-+      Rt_test.MakeRectangularBlockDiag(range_fes->GetComm(),
-+                                       range_fes->GlobalVSize(),
-+                                       range_fes->GlobalTrueVSize(),
-+                                       range_fes->GetDofOffsets(),
-+                                       range_fes->GetTrueDofOffsets(),
-+                                       Rt);
- 
--   p_mat_e.As<HypreParMatrix>()->Mult(-1.0, X, 1.0, B);
--   B.SetSubVector(test_tdof_list, 0.0);
--}
-+      // TODO - construct the Dof_TrueDof_Matrix directly in the required format.
-+      P_trial.ConvertFrom(domain_fes->Dof_TrueDof_Matrix());
- 
--HypreParMatrix* ParDiscreteLinearOperator::ParallelAssemble() const
--{
--   MFEM_ASSERT(mat, "Matrix is not assembled");
--   MFEM_ASSERT(mat->Finalized(), "Matrix is not finalized");
--   SparseMatrix* RA = mfem::Mult(*range_fes->GetRestrictionMatrix(), *mat);
--   HypreParMatrix* P = domain_fes->Dof_TrueDof_Matrix();
--   HypreParMatrix* RAP = P->LeftDiagMult(*RA, range_fes->GetTrueDofOffsets());
--   delete RA;
--   return RAP;
-+      A.MakeRAP(Rt_test, dA, P_trial);
-+      delete Rt;
-+   }
- }
- 
--void ParDiscreteLinearOperator::ParallelAssemble(OperatorHandle &A)
-+HypreParMatrix *ParDiscreteLinearOperator::ParallelAssemble() const
- {
--   // construct the rectangular block-diagonal matrix dA
--   OperatorHandle dA(A.Type());
--   dA.MakeRectangularBlockDiag(domain_fes->GetComm(),
--                               range_fes->GlobalVSize(),
--                               domain_fes->GlobalVSize(),
--                               range_fes->GetDofOffsets(),
--                               domain_fes->GetDofOffsets(),
--                               mat);
--
--   SparseMatrix *Rt = Transpose(*range_fes->GetRestrictionMatrix());
--   OperatorHandle R_test_transpose(A.Type());
--   R_test_transpose.MakeRectangularBlockDiag(range_fes->GetComm(),
--                                             range_fes->GlobalVSize(),
--                                             range_fes->GlobalTrueVSize(),
--                                             range_fes->GetDofOffsets(),
--                                             range_fes->GetTrueDofOffsets(),
--                                             Rt);
--
--   // TODO - construct the Dof_TrueDof_Matrix directly in the required format.
--   OperatorHandle P_trial(A.Type());
--   P_trial.ConvertFrom(domain_fes->Dof_TrueDof_Matrix());
--
--   A.MakeRAP(R_test_transpose, dA, P_trial);
--   delete Rt;
-+   OperatorHandle Mh(Operator::Hypre_ParCSR);
-+   ParallelAssemble(Mh);
-+   Mh.SetOperatorOwner(false);
-+   return Mh.As<HypreParMatrix>();
- }
- 
--void ParDiscreteLinearOperator::FormRectangularSystemMatrix(OperatorHandle &A)
-+void ParDiscreteLinearOperator::FormDiscreteOperatorMatrix(OperatorHandle &A)
- {
-    if (ext)
-    {
--      Array<int> empty;
--      ext->FormRectangularSystemOperator(empty, empty, A);
-+      Operator *oper;
-+      ext->FormDiscreteOperator(oper);
-+      A.Reset(oper);
-       return;
-    }
- 
--   mfem_error("not implemented!");
-+   if (mat)
-+   {
-+      Finalize();
-+      ParallelAssemble(A);
-+      delete mat;
-+      mat = NULL;
-+      delete mat_e;
-+      mat_e = NULL;
-+   }
- }
- 
--void ParDiscreteLinearOperator::GetParBlocks(Array2D<HypreParMatrix *> &blocks)
--const
-+void ParDiscreteLinearOperator::GetParBlocks(
-+   Array2D<HypreParMatrix *> &blocks)
- {
-    MFEM_VERIFY(mat->Finalized(), "Local matrix needs to be finalized for "
-                "GetParBlocks");
- 
--   HypreParMatrix* RLP = ParallelAssemble();
-+   HypreParMatrix *RLP = ParallelAssemble();
- 
-    blocks.SetSize(range_fes->GetVDim(), domain_fes->GetVDim());
- 
-diff --git a/fem/pbilinearform.hpp b/fem/pbilinearform.hpp
-index c8fef567b..be8e50ca3 100644
---- a/fem/pbilinearform.hpp
-+++ b/fem/pbilinearform.hpp
-@@ -28,27 +28,31 @@ namespace mfem
- class ParBilinearForm : public BilinearForm
- {
-    friend FABilinearFormExtension;
-+
- protected:
--   ParFiniteElementSpace *pfes; ///< Points to the same object as #fes
-+   ///< Points to the same object as #fes
-+   ParFiniteElementSpace *pfes;
- 
-    /// Auxiliary vectors used in TrueAddMult(): L-, L-, and T-vector, resp.
-    mutable Vector Xaux, Yaux, Ytmp;
- 
-+   /// Matrix and eliminated matrix
-    OperatorHandle p_mat, p_mat_e;
- 
-    bool keep_nbr_block;
- 
--   // Allocate mat - called when (mat == NULL && fbfi.Size() > 0)
--   void pAllocMat();
-+   //XX TODO
-+   // // Allocate mat - called when (mat == NULL && fbfi.Size() > 0)
-+   // void pAllocMat();
- 
-    void AssembleSharedFaces(int skip_zeros = 1);
- 
- private:
--   /// Copy construction is not supported; body is undefined.
--   ParBilinearForm(const ParBilinearForm &);
-+   /// Copy construction is not supported.
-+   ParBilinearForm(const ParBilinearForm &) = delete;
- 
--   /// Copy assignment is not supported; body is undefined.
--   ParBilinearForm &operator=(const ParBilinearForm &);
-+   /// Copy assignment is not supported.
-+   ParBilinearForm &operator=(const ParBilinearForm &) = delete;
- 
- public:
-    /// Creates parallel bilinear form associated with the FE space @a *pf.
-@@ -103,40 +107,40 @@ public:
- 
-    /// Returns the matrix assembled on the true dofs, i.e. P^t A P.
-    /** The returned matrix has to be deleted by the caller. */
--   HypreParMatrix *ParallelAssemble() { return ParallelAssemble(mat); }
-+   HypreParMatrix *ParallelAssemble() const { return ParallelAssemble(mat); }
- 
-    /// Returns the eliminated matrix assembled on the true dofs, i.e. P^t A_e P.
-    /** The returned matrix has to be deleted by the caller. */
--   HypreParMatrix *ParallelAssembleElim() { return ParallelAssemble(mat_e); }
-+   HypreParMatrix *ParallelAssembleElim() const { return ParallelAssemble(mat_e); }
- 
-    /// Return the matrix @a m assembled on the true dofs, i.e. P^t A P.
-    /** The returned matrix has to be deleted by the caller. */
--   HypreParMatrix *ParallelAssemble(SparseMatrix *m);
--
--   /** @brief Compute parallel RAP operator and store it in @a A as a HypreParMatrix.
--
--       @param[in] loc_A The rank-local `SparseMatrix`.
--       @param[out] A The `OperatorHandle` containing the global `HypreParMatrix`.
--       @param[in] steal_loc_A Have the `HypreParMatrix` in @a A take ownership of
--                              the memory objects in @a loc_A.
--       */
--   void ParallelRAP(SparseMatrix &loc_A,
--                    OperatorHandle &A,
--                    bool steal_loc_A = false);
-+   HypreParMatrix *ParallelAssemble(SparseMatrix *m) const;
- 
-    /** @brief Returns the matrix assembled on the true dofs, i.e.
-        @a A = P^t A_local P, in the format (type id) specified by @a A. */
--   void ParallelAssemble(OperatorHandle &A) { ParallelAssemble(A, mat); }
-+   void ParallelAssemble(OperatorHandle &A) const { ParallelAssemble(A, mat); }
- 
-    /** Returns the eliminated matrix assembled on the true dofs, i.e.
-        @a A_elim = P^t A_elim_local P in the format (type id) specified by @a A.
-     */
--   void ParallelAssembleElim(OperatorHandle &A_elim)
-+   void ParallelAssembleElim(OperatorHandle &A_elim) const
-    { ParallelAssemble(A_elim, mat_e); }
- 
-    /** Returns the matrix @a A_local assembled on the true dofs, i.e.
-        @a A = P^t A_local P in the format (type id) specified by @a A. */
--   void ParallelAssemble(OperatorHandle &A, SparseMatrix *A_local);
-+   void ParallelAssemble(OperatorHandle &A, SparseMatrix *A_local) const;
-+
-+   /** @brief Compute parallel RAP operator and store it in @a A as a HypreParMatrix.
-+
-+       @param[in] loc_A The rank-local `SparseMatrix`.
-+       @param[out] A The `OperatorHandle` containing the global `HypreParMatrix`.
-+       @param[in] steal_loc_A Have the `HypreParMatrix` in @a A take ownership of
-+                              the memory objects in @a loc_A.
-+       */
-+   void ParallelRAP(SparseMatrix &loc_A,
-+                    OperatorHandle &A,
-+                    bool steal_loc_A = false);
- 
-    /// Eliminate essential boundary DOFs from a parallel assembled system.
-    /** The array @a bdr_attr_is_ess marks boundary attributes that constitute
-@@ -183,9 +187,7 @@ public:
-    /// Get the parallel finite element space prolongation matrix
-    virtual const Operator *GetProlongation() const
-    { return pfes->GetProlongationMatrix(); }
--   /// Get the transpose of GetRestriction, useful for matrix-free RAP
--   virtual const Operator *GetRestrictionTranspose() const
--   { return pfes->GetRestrictionTransposeOperator(); }
-+
-    /// Get the parallel finite element space restriction matrix
-    virtual const Operator *GetRestriction() const
-    { return pfes->GetRestrictionMatrix(); }
-@@ -209,7 +211,7 @@ public:
- 
-    void EliminateVDofsInRHS(const Array<int> &vdofs, const Vector &x, Vector &b);
- 
--   virtual ~ParBilinearForm() { }
-+   virtual ~ParBilinearForm() {}
- };
- 
- /// Class for parallel bilinear form using different test and trial FE spaces.
-@@ -220,6 +222,7 @@ protected:
-    ParFiniteElementSpace *trial_pfes;
-    /// Points to the same object as #test_fes
-    ParFiniteElementSpace *test_pfes;
-+
-    /// Auxiliary objects used in TrueAddMult().
-    mutable ParGridFunction Xaux, Yaux;
- 
-@@ -227,11 +230,11 @@ protected:
-    OperatorHandle p_mat, p_mat_e;
- 
- private:
--   /// Copy construction is not supported; body is undefined.
--   ParMixedBilinearForm(const ParMixedBilinearForm &);
-+   /// Copy construction is not supported.
-+   ParMixedBilinearForm(const ParMixedBilinearForm &) = delete;
- 
--   /// Copy assignment is not supported; body is undefined.
--   ParMixedBilinearForm &operator=(const ParMixedBilinearForm &);
-+   /// Copy assignment is not supported.
-+   ParMixedBilinearForm &operator=(const ParMixedBilinearForm &) = delete;
- 
- public:
-    /** @brief Construct a ParMixedBilinearForm on the given FiniteElementSpace%s
-@@ -258,7 +261,7 @@ public:
-        by the newly constructed ParMixedBilinearForm. */
-    ParMixedBilinearForm(ParFiniteElementSpace *trial_fes,
-                         ParFiniteElementSpace *test_fes,
--                        ParMixedBilinearForm * mbf)
-+                        ParMixedBilinearForm *mbf)
-       : MixedBilinearForm(trial_fes, test_fes, mbf),
-         p_mat(Operator::Hypre_ParCSR), p_mat_e(Operator::Hypre_ParCSR)
-    {
-@@ -267,24 +270,16 @@ public:
-    }
- 
-    /// Returns the matrix assembled on the true dofs, i.e. P_test^t A P_trial.
--   HypreParMatrix *ParallelAssemble();
-+   HypreParMatrix *ParallelAssemble() const;
- 
-    /** @brief Returns the matrix assembled on the true dofs, i.e.
-        @a A = P_test^t A_local P_trial, in the format (type id) specified by
-        @a A. */
--   void ParallelAssemble(OperatorHandle &A);
-+   void ParallelAssemble(OperatorHandle &A) const;
- 
-    using MixedBilinearForm::FormRectangularSystemMatrix;
-    using MixedBilinearForm::FormRectangularLinearSystem;
- 
--   /** @brief Return in @a A a parallel (on truedofs) version of this operator.
--
--       This returns the same operator as FormRectangularLinearSystem(), but does
--       without the transformations of the right-hand side. */
--   virtual void FormRectangularSystemMatrix(const Array<int> &trial_tdof_list,
--                                            const Array<int> &test_tdof_list,
--                                            OperatorHandle &A);
--
-    /** @brief Form the parallel linear system A X = B, corresponding to this mixed
-        bilinear form and the linear form @a b(.).
- 
-@@ -296,10 +291,18 @@ public:
-                                             Vector &b, OperatorHandle &A, Vector &X,
-                                             Vector &B);
- 
-+   /** @brief Return in @a A a parallel (on truedofs) version of this operator.
-+
-+       This returns the same operator as FormRectangularLinearSystem(), but does
-+       without the transformations of the right-hand side. */
-+   virtual void FormRectangularSystemMatrix(const Array<int> &trial_tdof_list,
-+                                            const Array<int> &test_tdof_list,
-+                                            OperatorHandle &A);
-+
-    /// Compute y += a (P^t A P) x, where x and y are vectors on the true dofs
-    void TrueAddMult(const Vector &x, Vector &y, const double a = 1.0) const;
- 
--   virtual ~ParMixedBilinearForm() { }
-+   virtual ~ParMixedBilinearForm() {}
- };
- 
- /** The parallel matrix representation a linear operator between parallel finite
-@@ -313,11 +316,12 @@ protected:
-    ParFiniteElementSpace *range_fes;
- 
- private:
--   /// Copy construction is not supported; body is undefined.
--   ParDiscreteLinearOperator(const ParDiscreteLinearOperator &);
-+   /// Copy construction is not supported.
-+   ParDiscreteLinearOperator(const ParDiscreteLinearOperator &) = delete;
- 
--   /// Copy assignment is not supported; body is undefined.
--   ParDiscreteLinearOperator &operator=(const ParDiscreteLinearOperator &);
-+   /// Copy assignment is not supported.
-+   ParDiscreteLinearOperator &operator=(const ParDiscreteLinearOperator &) =
-+      delete;
- 
- public:
-    /** @brief Construct a ParDiscreteLinearOperator on the given
-@@ -327,7 +331,11 @@ public:
-        object. */
-    ParDiscreteLinearOperator(ParFiniteElementSpace *dfes,
-                              ParFiniteElementSpace *rfes)
--      : DiscreteLinearOperator(dfes, rfes) { domain_fes=dfes; range_fes=rfes; }
-+      : DiscreteLinearOperator(dfes, rfes)
-+   {
-+      domain_fes = dfes;
-+      range_fes = rfes;
-+   }
- 
-    /// Returns the matrix "assembled" on the true dofs
-    HypreParMatrix *ParallelAssemble() const;
-@@ -335,18 +343,18 @@ public:
-    /** @brief Returns the matrix assembled on the true dofs, i.e.
-        @a A = R_test A_local P_trial, in the format (type id) specified by
-        @a A. */
--   void ParallelAssemble(OperatorHandle &A);
-+   void ParallelAssemble(OperatorHandle &A) const;
- 
--   /** Extract the parallel blocks corresponding to the vector dimensions of the
--       domain and range parallel finite element spaces */
--   void GetParBlocks(Array2D<HypreParMatrix *> &blocks) const;
--
--   using MixedBilinearForm::FormRectangularSystemMatrix;
-+   using DiscreteLinearOperator::FormDiscreteOperatorMatrix;
- 
-    /** @brief Return in @a A a parallel (on truedofs) version of this operator. */
--   virtual void FormRectangularSystemMatrix(OperatorHandle &A);
-+   virtual void FormDiscreteOperatorMatrix(OperatorHandle &A);
-+
-+   /** Extract the parallel blocks corresponding to the vector dimensions of the
-+       domain and range parallel finite element spaces */
-+   void GetParBlocks(Array2D<HypreParMatrix *> &blocks);
- 
--   virtual ~ParDiscreteLinearOperator() { }
-+   virtual ~ParDiscreteLinearOperator() {}
- };
- 
- }
-diff --git a/fem/pfespace.cpp b/fem/pfespace.cpp
-index 6f0af11c6..d74e08580 100644
---- a/fem/pfespace.cpp
-+++ b/fem/pfespace.cpp
-@@ -101,8 +101,6 @@ void ParFiniteElementSpace::ParInit(ParMesh *pm)
-    P = NULL;
-    Pconf = NULL;
-    nonconf_P = false;
--   Rconf = NULL;
--   R_transpose = NULL;
-    R = NULL;
- 
-    num_face_nbr_dofs = -1;
-@@ -961,6 +959,34 @@ void ParFiniteElementSpace::Build_Dof_TrueDof_Matrix() const // matrix P
-    R = Transpose(Pdiag);
- }
- 
-+const Operator *ParFiniteElementSpace::GetProlongationMatrix() const
-+{
-+   if (Conforming() && !nd_strias)
-+   {
-+      if (Pconf) { return Pconf; }
-+      if (NRanks == 1)
-+      {
-+         Pconf = new IdentityOperator(GetTrueVSize());
-+      }
-+      else
-+      {
-+         if (!Device::Allows(Backend::DEVICE_MASK))
-+         {
-+            Pconf = new ConformingProlongationOperator(*this);
-+         }
-+         else
-+         {
-+            Pconf = new DeviceConformingProlongationOperator(*this);
-+         }
-+      }
-+      return Pconf;
-+   }
-+   else
-+   {
-+      return Dof_TrueDof_Matrix();
-+   }
-+}
-+
- HypreParMatrix *ParFiniteElementSpace::GetPartialConformingInterpolation()
- {
-    HypreParMatrix *P_pc;
-@@ -1150,76 +1176,6 @@ HYPRE_BigInt ParFiniteElementSpace::GetMyTDofOffset() const
-    return HYPRE_AssumedPartitionCheck()? tdof_offsets[0] : tdof_offsets[MyRank];
- }
- 
--const Operator *ParFiniteElementSpace::GetProlongationMatrix() const
--{
--   if (Conforming())
--   {
--      if (Pconf) { return Pconf; }
--
--      if (nd_strias) { return Dof_TrueDof_Matrix(); }
--
--      if (NRanks == 1)
--      {
--         Pconf = new IdentityOperator(GetTrueVSize());
--      }
--      else
--      {
--         if (!Device::Allows(Backend::DEVICE_MASK))
--         {
--            Pconf = new ConformingProlongationOperator(*this);
--         }
--         else
--         {
--            Pconf = new DeviceConformingProlongationOperator(*this);
--         }
--      }
--      return Pconf;
--   }
--   else
--   {
--      return Dof_TrueDof_Matrix();
--   }
--}
--
--const Operator *ParFiniteElementSpace::GetRestrictionOperator() const
--{
--   if (Conforming())
--   {
--      if (Rconf) { return Rconf; }
--
--      if (NRanks == 1)
--      {
--         R_transpose = new IdentityOperator(GetTrueVSize());
--      }
--      else
--      {
--         if (!Device::Allows(Backend::DEVICE_MASK))
--         {
--            R_transpose = new ConformingProlongationOperator(*this, true);
--         }
--         else
--         {
--            R_transpose =
--               new DeviceConformingProlongationOperator(*this, true);
--         }
--      }
--      Rconf = new TransposeOperator(R_transpose);
--      return Rconf;
--   }
--   else
--   {
--      Dof_TrueDof_Matrix();
--      R_transpose = new TransposeOperator(R);
--      return R;
--   }
--}
--
--const Operator *ParFiniteElementSpace::GetRestrictionTransposeOperator() const
--{
--   GetRestrictionOperator();
--   return R_transpose;
--}
--
- void ParFiniteElementSpace::ExchangeFaceNbrData()
- {
-    if (num_face_nbr_dofs >= 0) { return; }
-@@ -3186,8 +3142,6 @@ void ParFiniteElementSpace::Destroy()
- 
-    delete P; P = NULL;
-    delete Pconf; Pconf = NULL;
--   delete Rconf; Rconf = NULL;
--   delete R_transpose; R_transpose = NULL;
-    delete R; R = NULL;
- 
-    delete gcomm; gcomm = NULL;
-@@ -3393,8 +3347,8 @@ void ParFiniteElementSpace::UpdateMeshPointer(Mesh *new_mesh)
- }
- 
- ConformingProlongationOperator::ConformingProlongationOperator(
--   int lsize, const GroupCommunicator &gc_, bool local_)
--   : gc(gc_), local(local_)
-+   int lsize, const GroupCommunicator &gc_)
-+   : gc(gc_)
- {
-    const Table &group_ldof = gc.GroupLDofTable();
- 
-@@ -3429,11 +3383,10 @@ const
- }
- 
- ConformingProlongationOperator::ConformingProlongationOperator(
--   const ParFiniteElementSpace &pfes, bool local_)
-+   const ParFiniteElementSpace &pfes)
-    : Operator(pfes.GetVSize(), pfes.GetTrueVSize()),
-      external_ldofs(),
--     gc(pfes.GroupComm()),
--     local(local_)
-+     gc(pfes.GroupComm())
- {
-    MFEM_VERIFY(pfes.Conforming(), "");
-    const Table &group_ldof = gc.GroupLDofTable();
-@@ -3482,14 +3435,7 @@ void ConformingProlongationOperator::Mult(const Vector &x, Vector &y) const
-    const int m = external_ldofs.Size();
- 
-    const int in_layout = 2; // 2 - input is ltdofs array
--   if (local)
--   {
--      y = 0.0;
--   }
--   else
--   {
--      gc.BcastBegin(const_cast<double*>(xdata), in_layout);
--   }
-+   gc.BcastBegin(const_cast<double*>(xdata), in_layout);
- 
-    int j = 0;
-    for (int i = 0; i < m; i++)
-@@ -3501,10 +3447,7 @@ void ConformingProlongationOperator::Mult(const Vector &x, Vector &y) const
-    std::copy(xdata+j-m, xdata+Width(), ydata+j);
- 
-    const int out_layout = 0; // 0 - output is ldofs array
--   if (!local)
--   {
--      gc.BcastEnd(ydata, out_layout);
--   }
-+   gc.BcastEnd(ydata, out_layout);
- }
- 
- void ConformingProlongationOperator::MultTranspose(
-@@ -3517,10 +3460,7 @@ void ConformingProlongationOperator::MultTranspose(
-    double *ydata = y.HostWrite();
-    const int m = external_ldofs.Size();
- 
--   if (!local)
--   {
--      gc.ReduceBegin(xdata);
--   }
-+   gc.ReduceBegin(xdata);
- 
-    int j = 0;
-    for (int i = 0; i < m; i++)
-@@ -3532,15 +3472,12 @@ void ConformingProlongationOperator::MultTranspose(
-    std::copy(xdata+j, xdata+Height(), ydata+j-m);
- 
-    const int out_layout = 2; // 2 - output is an array on all ltdofs
--   if (!local)
--   {
--      gc.ReduceEnd<double>(ydata, out_layout, GroupCommunicator::Sum);
--   }
-+   gc.ReduceEnd<double>(ydata, out_layout, GroupCommunicator::Sum);
- }
- 
- DeviceConformingProlongationOperator::DeviceConformingProlongationOperator(
--   const GroupCommunicator &gc_, const SparseMatrix *R, bool local_)
--   : ConformingProlongationOperator(R->Width(), gc_, local_),
-+   const GroupCommunicator &gc_, const SparseMatrix *R)
-+   : ConformingProlongationOperator(R->Width(), gc_),
-      mpi_gpu_aware(Device::GetGPUAwareMPI())
- {
-    MFEM_ASSERT(R->Finalized(), "");
-@@ -3605,10 +3542,9 @@ DeviceConformingProlongationOperator::DeviceConformingProlongationOperator(
- }
- 
- DeviceConformingProlongationOperator::DeviceConformingProlongationOperator(
--   const ParFiniteElementSpace &pfes, bool local_)
-+   const ParFiniteElementSpace &pfes)
-    : DeviceConformingProlongationOperator(pfes.GroupComm(),
--                                          pfes.GetRestrictionMatrix(),
--                                          local_)
-+                                          pfes.GetRestrictionMatrix())
- {
-    MFEM_ASSERT(pfes.Conforming(), "internal error");
-    MFEM_ASSERT(pfes.GetRestrictionMatrix()->Height() == pfes.GetTrueVSize(), "");
-@@ -3672,48 +3608,36 @@ void DeviceConformingProlongationOperator::Mult(const Vector &x,
-                                                 Vector &y) const
- {
-    const GroupTopology &gtopo = gc.GetGroupTopology();
--   int req_counter = 0;
-    // Make sure 'y' is marked as valid on device and for use on device.
-    // This ensures that there is no unnecessary host to device copy when the
--   // input 'y' is valid on host (in 'y.SetSubVector(ext_ldof, 0.0)' when local
--   // is true) or BcastLocalCopy (when local is false).
-+   // input 'y' is valid on host.
-    y.Write();
--   if (local)
--   {
--      // done on device since we've marked ext_ldof for use on device:
--      y.SetSubVector(ext_ldof, 0.0);
--   }
--   else
-+   BcastBeginCopy(x); // copy to 'shr_buf'
-+   int req_counter = 0;
-+   for (int nbr = 1; nbr < gtopo.GetNumNeighbors(); nbr++)
-    {
--      BcastBeginCopy(x); // copy to 'shr_buf'
--      for (int nbr = 1; nbr < gtopo.GetNumNeighbors(); nbr++)
-+      const int send_offset = shr_buf_offsets[nbr];
-+      const int send_size = shr_buf_offsets[nbr+1] - send_offset;
-+      if (send_size > 0)
-       {
--         const int send_offset = shr_buf_offsets[nbr];
--         const int send_size = shr_buf_offsets[nbr+1] - send_offset;
--         if (send_size > 0)
--         {
--            auto send_buf = mpi_gpu_aware ? shr_buf.Read() : shr_buf.HostRead();
--            MPI_Isend(send_buf + send_offset, send_size, MPI_DOUBLE,
--                      gtopo.GetNeighborRank(nbr), 41822,
--                      gtopo.GetComm(), &requests[req_counter++]);
--         }
--         const int recv_offset = ext_buf_offsets[nbr];
--         const int recv_size = ext_buf_offsets[nbr+1] - recv_offset;
--         if (recv_size > 0)
--         {
--            auto recv_buf = mpi_gpu_aware ? ext_buf.Write() : ext_buf.HostWrite();
--            MPI_Irecv(recv_buf + recv_offset, recv_size, MPI_DOUBLE,
--                      gtopo.GetNeighborRank(nbr), 41822,
--                      gtopo.GetComm(), &requests[req_counter++]);
--         }
-+         auto send_buf = mpi_gpu_aware ? shr_buf.Read() : shr_buf.HostRead();
-+         MPI_Isend(send_buf + send_offset, send_size, MPI_DOUBLE,
-+                   gtopo.GetNeighborRank(nbr), 41822,
-+                   gtopo.GetComm(), &requests[req_counter++]);
-+      }
-+      const int recv_offset = ext_buf_offsets[nbr];
-+      const int recv_size = ext_buf_offsets[nbr+1] - recv_offset;
-+      if (recv_size > 0)
-+      {
-+         auto recv_buf = mpi_gpu_aware ? ext_buf.Write() : ext_buf.HostWrite();
-+         MPI_Irecv(recv_buf + recv_offset, recv_size, MPI_DOUBLE,
-+                   gtopo.GetNeighborRank(nbr), 41822,
-+                   gtopo.GetComm(), &requests[req_counter++]);
-       }
-    }
-    BcastLocalCopy(x, y);
--   if (!local)
--   {
--      MPI_Waitall(req_counter, requests, MPI_STATUSES_IGNORE);
--      BcastEndCopy(y); // copy from 'ext_buf'
--   }
-+   MPI_Waitall(req_counter, requests, MPI_STATUSES_IGNORE);
-+   BcastEndCopy(y); // copy from 'ext_buf'
- }
- 
- DeviceConformingProlongationOperator::~DeviceConformingProlongationOperator()
-@@ -3774,38 +3698,32 @@ void DeviceConformingProlongationOperator::MultTranspose(const Vector &x,
-                                                          Vector &y) const
- {
-    const GroupTopology &gtopo = gc.GetGroupTopology();
-+   ReduceBeginCopy(x); // copy to 'ext_buf'
-    int req_counter = 0;
--   if (!local)
-+   for (int nbr = 1; nbr < gtopo.GetNumNeighbors(); nbr++)
-    {
--      ReduceBeginCopy(x); // copy to 'ext_buf'
--      for (int nbr = 1; nbr < gtopo.GetNumNeighbors(); nbr++)
-+      const int send_offset = ext_buf_offsets[nbr];
-+      const int send_size = ext_buf_offsets[nbr+1] - send_offset;
-+      if (send_size > 0)
-       {
--         const int send_offset = ext_buf_offsets[nbr];
--         const int send_size = ext_buf_offsets[nbr+1] - send_offset;
--         if (send_size > 0)
--         {
--            auto send_buf = mpi_gpu_aware ? ext_buf.Read() : ext_buf.HostRead();
--            MPI_Isend(send_buf + send_offset, send_size, MPI_DOUBLE,
--                      gtopo.GetNeighborRank(nbr), 41823,
--                      gtopo.GetComm(), &requests[req_counter++]);
--         }
--         const int recv_offset = shr_buf_offsets[nbr];
--         const int recv_size = shr_buf_offsets[nbr+1] - recv_offset;
--         if (recv_size > 0)
--         {
--            auto recv_buf = mpi_gpu_aware ? shr_buf.Write() : shr_buf.HostWrite();
--            MPI_Irecv(recv_buf + recv_offset, recv_size, MPI_DOUBLE,
--                      gtopo.GetNeighborRank(nbr), 41823,
--                      gtopo.GetComm(), &requests[req_counter++]);
--         }
-+         auto send_buf = mpi_gpu_aware ? ext_buf.Read() : ext_buf.HostRead();
-+         MPI_Isend(send_buf + send_offset, send_size, MPI_DOUBLE,
-+                   gtopo.GetNeighborRank(nbr), 41823,
-+                   gtopo.GetComm(), &requests[req_counter++]);
-+      }
-+      const int recv_offset = shr_buf_offsets[nbr];
-+      const int recv_size = shr_buf_offsets[nbr+1] - recv_offset;
-+      if (recv_size > 0)
-+      {
-+         auto recv_buf = mpi_gpu_aware ? shr_buf.Write() : shr_buf.HostWrite();
-+         MPI_Irecv(recv_buf + recv_offset, recv_size, MPI_DOUBLE,
-+                   gtopo.GetNeighborRank(nbr), 41823,
-+                   gtopo.GetComm(), &requests[req_counter++]);
-       }
-    }
-    ReduceLocalCopy(x, y);
--   if (!local)
--   {
--      MPI_Waitall(req_counter, requests, MPI_STATUSES_IGNORE);
--      ReduceEndAssemble(y); // assemble from 'shr_buf'
--   }
-+   MPI_Waitall(req_counter, requests, MPI_STATUSES_IGNORE);
-+   ReduceEndAssemble(y); // assemble from 'shr_buf'
- }
- 
- } // namespace mfem
-diff --git a/fem/pfespace.hpp b/fem/pfespace.hpp
-index 8f574670b..c4a95a365 100644
---- a/fem/pfespace.hpp
-+++ b/fem/pfespace.hpp
-@@ -70,6 +70,7 @@ private:
- 
-    /// The matrix P (interpolation from true dof to dof). Owned.
-    mutable HypreParMatrix *P;
-+
-    /// Optimized action-only prolongation operator for conforming meshes. Owned.
-    mutable Operator *Pconf;
- 
-@@ -80,12 +81,6 @@ private:
- 
-    /// The (block-diagonal) matrix R (restriction of dof to true dof). Owned.
-    mutable SparseMatrix *R;
--   /// Optimized action-only restriction operator for conforming meshes. Owned.
--   mutable Operator *Rconf;
--   /** Transpose of R or Rconf. For conforming mesh, this is a matrix-free
--       (Device)ConformingProlongationOperator, for a non-conforming mesh
--       this is a TransposeOperator wrapping R. */
--   mutable Operator *R_transpose;
- 
-    /// Flag indicating the existence of shared triangles with interior ND dofs
-    bool nd_strias;
-@@ -321,6 +316,13 @@ public:
-    HypreParMatrix *Dof_TrueDof_Matrix() const
-    { if (!P) { Build_Dof_TrueDof_Matrix(); } return P; }
- 
-+   /// Get the P matrix which prolongates a true dof vector to local dof vector.
-+   virtual const Operator *GetProlongationMatrix() const;
-+
-+   /// Get the R matrix which restricts a local dof vector to true dof vector.
-+   virtual const SparseMatrix *GetRestrictionMatrix() const
-+   { Dof_TrueDof_Matrix(); return R; }
-+
-    /** @brief For a non-conforming mesh, construct and return the interpolation
-        matrix from the partially conforming true dofs to the local dofs. */
-    /** @note The returned pointer must be deleted by the caller. */
-@@ -374,21 +376,6 @@ public:
-    HYPRE_BigInt GetMyDofOffset() const;
-    HYPRE_BigInt GetMyTDofOffset() const;
- 
--   virtual const Operator *GetProlongationMatrix() const;
--   /** @brief Return logical transpose of restriction matrix, but in
--       non-assembled optimized matrix-free form.
--
--       The implementation is like GetProlongationMatrix, but it sets local
--       DOFs to the true DOF values if owned locally, otherwise zero. */
--   virtual const Operator *GetRestrictionTransposeOperator() const;
--   /** Get an Operator that performs the action of GetRestrictionMatrix(),
--       but potentially with a non-assembled optimized matrix-free
--       implementation. */
--   virtual const Operator *GetRestrictionOperator() const;
--   /// Get the R matrix which restricts a local dof vector to true dof vector.
--   virtual const SparseMatrix *GetRestrictionMatrix() const
--   { Dof_TrueDof_Matrix(); return R; }
--
-    // Face-neighbor functions
-    void ExchangeFaceNbrData();
-    int GetFaceNbrVSize() const { return num_face_nbr_dofs; }
-@@ -434,21 +421,17 @@ public:
-    int TrueVSize() const { return ltdof_size; }
- };
- 
--
- /// Auxiliary class used by ParFiniteElementSpace.
- class ConformingProlongationOperator : public Operator
- {
- protected:
-    Array<int> external_ldofs;
-    const GroupCommunicator &gc;
--   bool local;
- 
- public:
--   ConformingProlongationOperator(int lsize, const GroupCommunicator &gc_,
--                                  bool local_=false);
-+   ConformingProlongationOperator(int lsize, const GroupCommunicator &gc_);
- 
--   ConformingProlongationOperator(const ParFiniteElementSpace &pfes,
--                                  bool local_=false);
-+   ConformingProlongationOperator(const ParFiniteElementSpace &pfes);
- 
-    const GroupCommunicator &GetGroupCommunicator() const;
- 
-@@ -458,8 +441,8 @@ public:
- };
- 
- /// Auxiliary device class used by ParFiniteElementSpace.
--class DeviceConformingProlongationOperator: public
--   ConformingProlongationOperator
-+class DeviceConformingProlongationOperator :
-+   public ConformingProlongationOperator
- {
- protected:
-    bool mpi_gpu_aware;
-@@ -495,11 +478,10 @@ protected:
-    void ReduceEndAssemble(Vector &dst) const;
- 
- public:
--   DeviceConformingProlongationOperator(
--      const GroupCommunicator &gc_, const SparseMatrix *R, bool local_=false);
-+   DeviceConformingProlongationOperator(const GroupCommunicator &gc_,
-+                                        const SparseMatrix *R);
- 
--   DeviceConformingProlongationOperator(const ParFiniteElementSpace &pfes,
--                                        bool local_=false);
-+   DeviceConformingProlongationOperator(const ParFiniteElementSpace &pfes);
- 
-    virtual ~DeviceConformingProlongationOperator();
- 
-diff --git a/fem/prestriction.cpp b/fem/prestriction.cpp
-index 37dee3c8d..89ce189db 100644
---- a/fem/prestriction.cpp
-+++ b/fem/prestriction.cpp
-@@ -325,7 +325,7 @@ void ParL2FaceRestriction::DoubleValuedConformingMult(
-    auto d_x_shared = Reshape(x_gf.FaceNbrData().Read(),
-                              t?vd:nsdofs, t?nsdofs:vd);
-    auto d_y = Reshape(y.Write(), nface_dofs, vd, 2, nf);
--   mfem::forall(nfdofs, [=] MFEM_HOST_DEVICE (int i)
-+   mfem::forall(face_dofs*nf, [=] MFEM_HOST_DEVICE (int i)
-    {
-       const int dof = i % nface_dofs;
-       const int face = i / nface_dofs;
-diff --git a/fem/restriction.cpp b/fem/restriction.cpp
-index c7343d7c7..e0877606b 100644
---- a/fem/restriction.cpp
-+++ b/fem/restriction.cpp
-@@ -12,27 +12,24 @@
- #include "restriction.hpp"
- #include "gridfunc.hpp"
- #include "fespace.hpp"
--#include "../general/forall.hpp"
--#include <climits>
--
- #ifdef MFEM_USE_MPI
--
- #include "pfespace.hpp"
--
- #endif
-+#include "../general/forall.hpp"
-+#include <climits>
- 
- namespace mfem
- {
- 
--ElementRestriction::ElementRestriction(const FiniteElementSpace &f,
--                                       ElementDofOrdering e_ordering)
-+ConformingElementRestriction::ConformingElementRestriction(
-+   const FiniteElementSpace &f,
-+   ElementDofOrdering e_ordering)
-    : fes(f),
-      ne(fes.GetNE()),
-      vdim(fes.GetVDim()),
-      byvdim(fes.GetOrdering() == Ordering::byVDIM),
-      ndofs(fes.GetNDofs()),
-      dof(ne > 0 ? fes.GetFE(0)->GetDof() : 0),
--     nedofs(ne*dof),
-      offsets(ndofs+1),
-      indices(ne*dof),
-      gather_map(ne*dof)
-@@ -104,7 +101,7 @@ ElementRestriction::ElementRestriction(const FiniteElementSpace &f,
-    offsets[0] = 0;
- }
- 
--void ElementRestriction::Mult(const Vector& x, Vector& y) const
-+void ConformingElementRestriction::Mult(const Vector& x, Vector& y) const
- {
-    // Assumes all elements have the same number of dofs
-    const int nd = dof;
-@@ -126,7 +123,8 @@ void ElementRestriction::Mult(const Vector& x, Vector& y) const
-    });
- }
- 
--void ElementRestriction::MultUnsigned(const Vector& x, Vector& y) const
-+void ConformingElementRestriction::MultUnsigned(const Vector& x,
-+                                                Vector& y) const
- {
-    // Assumes all elements have the same number of dofs
-    const int nd = dof;
-@@ -148,12 +146,13 @@ void ElementRestriction::MultUnsigned(const Vector& x, Vector& y) const
- }
- 
- template <bool ADD>
--void ElementRestriction::TAddMultTranspose(const Vector& x, Vector& y) const
-+static void TAddMultTranspose(const int nd, const int vd, const bool t,
-+                              const int ndofs, const int ne,
-+                              const Array<int>& offsets,
-+                              const Array<int>& indices,
-+                              const Vector& x, Vector& y)
- {
-    // Assumes all elements have the same number of dofs
--   const int nd = dof;
--   const int vd = vdim;
--   const bool t = byvdim;
-    auto d_offsets = offsets.Read();
-    auto d_indices = indices.Read();
-    auto d_x = Reshape(x.Read(), nd, vd, ne);
-@@ -177,21 +176,23 @@ void ElementRestriction::TAddMultTranspose(const Vector& x, Vector& y) const
-    });
- }
- 
--void ElementRestriction::MultTranspose(const Vector& x, Vector& y) const
-+void ConformingElementRestriction::MultTranspose(const Vector& x,
-+                                                 Vector& y) const
- {
-    constexpr bool ADD = false;
--   TAddMultTranspose<ADD>(x, y);
-+   TAddMultTranspose<ADD>(dof, vdim, byvdim, ndofs, ne, offsets, indices, x, y);
- }
- 
--void ElementRestriction::AddMultTranspose(const Vector& x, Vector& y,
--                                          const double a) const
-+void ConformingElementRestriction::AddMultTranspose(const Vector& x, Vector& y,
-+                                                    const double a) const
- {
-    MFEM_VERIFY(a == 1.0, "General coefficient case is not yet supported!");
-    constexpr bool ADD = true;
--   TAddMultTranspose<ADD>(x, y);
-+   TAddMultTranspose<ADD>(dof, vdim, byvdim, ndofs, ne, offsets, indices, x, y);
- }
- 
--void ElementRestriction::MultTransposeUnsigned(const Vector& x, Vector& y) const
-+void ConformingElementRestriction::MultTransposeUnsigned(const Vector& x,
-+                                                         Vector& y) const
- {
-    // Assumes all elements have the same number of dofs
-    const int nd = dof;
-@@ -218,32 +219,7 @@ void ElementRestriction::MultTransposeUnsigned(const Vector& x, Vector& y) const
-    });
- }
- 
--void ElementRestriction::MultLeftInverse(const Vector& x, Vector& y) const
--{
--   // Assumes all elements have the same number of dofs
--   const int nd = dof;
--   const int vd = vdim;
--   const bool t = byvdim;
--   auto d_offsets = offsets.Read();
--   auto d_indices = indices.Read();
--   auto d_x = Reshape(x.Read(), nd, vd, ne);
--   auto d_y = Reshape(y.Write(), t?vd:ndofs, t?ndofs:vd);
--   mfem::forall(ndofs, [=] MFEM_HOST_DEVICE (int i)
--   {
--      const int next_offset = d_offsets[i + 1];
--      for (int c = 0; c < vd; ++c)
--      {
--         double dof_value = 0;
--         const int j = next_offset - 1;
--         const int idx_j = (d_indices[j] >= 0) ? d_indices[j] : -1 - d_indices[j];
--         dof_value = (d_indices[j] >= 0) ? d_x(idx_j % nd, c, idx_j / nd) :
--                     -d_x(idx_j % nd, c, idx_j / nd);
--         d_y(t?c:i,t?i:c) = dof_value;
--      }
--   });
--}
--
--void ElementRestriction::BooleanMask(Vector& y) const
-+void ConformingElementRestriction::BooleanMask(Vector& y) const
- {
-    // Assumes all elements have the same number of dofs
-    const int nd = dof;
-@@ -280,8 +256,8 @@ void ElementRestriction::BooleanMask(Vector& y) const
-    }
- }
- 
--void ElementRestriction::FillSparseMatrix(const Vector &mat_ea,
--                                          SparseMatrix &mat) const
-+void ConformingElementRestriction::FillSparseMatrix(const Vector &mat_ea,
-+                                                    SparseMatrix &mat) const
- {
-    mat.GetMemoryI().New(mat.Height()+1, mat.GetMemoryI().GetMemoryType());
-    const int nnz = FillI(mat);
-@@ -319,7 +295,7 @@ static MFEM_HOST_DEVICE int GetAndIncrementNnzIndex(const int i_L, int* I)
-    return ind;
- }
- 
--int ElementRestriction::FillI(SparseMatrix &mat) const
-+int ConformingElementRestriction::FillI(SparseMatrix &mat) const
- {
-    static constexpr int Max = MaxNbNbr;
-    const int all_dofs = ndofs;
-@@ -396,8 +372,8 @@ int ElementRestriction::FillI(SparseMatrix &mat) const
-    return h_I[nTdofs];
- }
- 
--void ElementRestriction::FillJAndData(const Vector &ea_data,
--                                      SparseMatrix &mat) const
-+void ConformingElementRestriction::FillJAndData(const Vector &ea_data,
-+                                                SparseMatrix &mat) const
- {
-    static constexpr int Max = MaxNbNbr;
-    const int all_dofs = ndofs;
-@@ -523,11 +499,10 @@ void L2ElementRestriction::Mult(const Vector &x, Vector &y) const
- }
- 
- template <bool ADD>
--void L2ElementRestriction::TAddMultTranspose(const Vector &x, Vector &y) const
-+static void L2TAddMultTranspose(const int nd, const int vd, const bool t,
-+                                const int ndofs, const int ne,
-+                                const Vector &x, Vector &y)
- {
--   const int nd = ndof;
--   const int vd = vdim;
--   const bool t = byvdim;
-    auto d_x = Reshape(x.Read(), nd, vd, ne);
-    auto d_y = Reshape(ADD ? y.ReadWrite() : y.Write(), t?vd:ndofs, t?ndofs:vd);
-    mfem::forall(ndofs, [=] MFEM_HOST_DEVICE (int i)
-@@ -546,7 +521,7 @@ void L2ElementRestriction::TAddMultTranspose(const Vector &x, Vector &y) const
- void L2ElementRestriction::MultTranspose(const Vector &x, Vector &y) const
- {
-    constexpr bool ADD = false;
--   TAddMultTranspose<ADD>(x, y);
-+   L2TAddMultTranspose<ADD>(ndof, vdim, byvdim, ndofs, ne, x, y);
- }
- 
- void L2ElementRestriction::AddMultTranspose(const Vector &x, Vector &y,
-@@ -554,7 +529,7 @@ void L2ElementRestriction::AddMultTranspose(const Vector &x, Vector &y,
- {
-    MFEM_VERIFY(a == 1.0, "General coefficient case is not yet supported!");
-    constexpr bool ADD = true;
--   TAddMultTranspose<ADD>(x, y);
-+   L2TAddMultTranspose<ADD>(ndof, vdim, byvdim, ndofs, ne, x, y);
- }
- 
- void L2ElementRestriction::FillI(SparseMatrix &mat) const
-@@ -609,7 +584,6 @@ ConformingFaceRestriction::ConformingFaceRestriction(
-      byvdim(fes.GetOrdering() == Ordering::byVDIM),
-      face_dofs(nf > 0 ? fes.GetFaceElement(0)->GetDof() : 0),
-      elem_dofs(fes.GetFE(0)->GetDof()),
--     nfdofs(nf*face_dofs),
-      ndofs(fes.GetNDofs()),
-      scatter_indices(nf*face_dofs),
-      gather_offsets(ndofs+1),
-@@ -651,62 +625,102 @@ ConformingFaceRestriction::ConformingFaceRestriction(
-    : ConformingFaceRestriction(fes, f_ordering, type, true)
- { }
- 
--void ConformingFaceRestriction::Mult(const Vector& x, Vector& y) const
-+static void ConformingFaceRestriction_Mult(
-+   const int ndofs,
-+   const int face_dofs,
-+   const int nf,
-+   const int vdim,
-+   const bool by_vdim,
-+   const Array<int> &scatter_indices,
-+   const Vector &x,
-+   Vector &y,
-+   bool use_signs)
- {
-    if (nf==0) { return; }
-    // Assumes all elements have the same number of dofs
--   const int nface_dofs = face_dofs;
--   const int vd = vdim;
--   const bool t = byvdim;
-    auto d_indices = scatter_indices.Read();
--   auto d_x = Reshape(x.Read(), t?vd:ndofs, t?ndofs:vd);
--   auto d_y = Reshape(y.Write(), nface_dofs, vd, nf);
--   mfem::forall(nfdofs, [=] MFEM_HOST_DEVICE (int i)
--   {
--      const int s_idx = d_indices[i];
--      const int sgn = (s_idx >= 0) ? 1 : -1;
--      const int idx = (s_idx >= 0) ? s_idx : -1 - s_idx;
--      const int dof = i % nface_dofs;
--      const int face = i / nface_dofs;
--      for (int c = 0; c < vd; ++c)
-+   auto d_x = Reshape(x.Read(), by_vdim?vdim:ndofs, by_vdim?ndofs:vdim);
-+   auto d_y = Reshape(y.Write(), face_dofs, vdim, nf);
-+   mfem::forall(face_dofs*nf, [=] MFEM_HOST_DEVICE (int i)
-+   {
-+      const int s_idx_j = d_indices[i];
-+      const double sgn = (s_idx_j >= 0 || !use_signs) ? 1.0 : -1.0;
-+      const int idx_j = (s_idx_j >= 0) ? s_idx_j : -1 - s_idx_j;
-+      for (int c = 0; c < vdim; ++c)
-       {
--         d_y(dof, c, face) = sgn*d_x(t?c:idx, t?idx:c);
-+         d_y(i % face_dofs, c, i / face_dofs) =
-+            sgn*d_x(by_vdim?c:idx_j, by_vdim?idx_j:c);
-       }
-    });
- }
- 
--void ConformingFaceRestriction::AddMultTranspose(
--   const Vector& x, Vector& y, const double a) const
-+void ConformingFaceRestriction::Mult(const Vector& x, Vector& y) const
-+{
-+   ConformingFaceRestriction_Mult(
-+      ndofs, face_dofs, nf, vdim, byvdim, scatter_indices, x, y, true);
-+}
-+
-+void ConformingFaceRestriction::MultUnsigned(const Vector& x, Vector& y) const
-+{
-+   ConformingFaceRestriction_Mult(
-+      ndofs, face_dofs, nf, vdim, byvdim, scatter_indices, x, y, false);
-+}
-+
-+static void ConformingFaceRestriction_AddMultTranspose(
-+   const int ndofs,
-+   const int face_dofs,
-+   const int nf,
-+   const int vdim,
-+   const bool by_vdim,
-+   const Array<int> &gather_offsets,
-+   const Array<int> &gather_indices,
-+   const Vector &x,
-+   Vector &y,
-+   bool use_signs,
-+   const double a)
- {
-    MFEM_VERIFY(a == 1.0, "General coefficient case is not yet supported!");
-    if (nf==0) { return; }
-    // Assumes all elements have the same number of dofs
--   const int nface_dofs = face_dofs;
--   const int vd = vdim;
--   const bool t = byvdim;
-    auto d_offsets = gather_offsets.Read();
-    auto d_indices = gather_indices.Read();
--   auto d_x = Reshape(x.Read(), nface_dofs, vd, nf);
--   auto d_y = Reshape(y.ReadWrite(), t?vd:ndofs, t?ndofs:vd);
-+   auto d_x = Reshape(x.Read(), face_dofs, vdim, nf);
-+   auto d_y = Reshape(y.ReadWrite(), by_vdim?vdim:ndofs, by_vdim?ndofs:vdim);
-    mfem::forall(ndofs, [=] MFEM_HOST_DEVICE (int i)
-    {
-       const int offset = d_offsets[i];
-       const int next_offset = d_offsets[i + 1];
--      for (int c = 0; c < vd; ++c)
-+      for (int c = 0; c < vdim; ++c)
-       {
-          double dof_value = 0;
-          for (int j = offset; j < next_offset; ++j)
-          {
-             const int s_idx_j = d_indices[j];
--            const int sgn = (s_idx_j >= 0) ? 1 : -1;
-+            const double sgn = (s_idx_j >= 0 || !use_signs) ? 1.0 : -1.0;
-             const int idx_j = (s_idx_j >= 0) ? s_idx_j : -1 - s_idx_j;
--            dof_value += sgn*d_x(idx_j % nface_dofs, c, idx_j / nface_dofs);
-+            dof_value += sgn*d_x(idx_j % face_dofs, c, idx_j / face_dofs);
-          }
--         d_y(t?c:i,t?i:c) += dof_value;
-+         d_y(by_vdim?c:i,by_vdim?i:c) += dof_value;
-       }
-    });
- }
- 
-+void ConformingFaceRestriction::AddMultTranspose(
-+   const Vector& x, Vector& y, const double a) const
-+{
-+   ConformingFaceRestriction_AddMultTranspose(
-+      ndofs, face_dofs, nf, vdim, byvdim, gather_offsets, gather_indices, x, y,
-+      true, a);
-+}
-+
-+void ConformingFaceRestriction::AddMultTransposeUnsigned(
-+   const Vector& x, Vector& y, const double a) const
-+{
-+   ConformingFaceRestriction_AddMultTranspose(
-+      ndofs, face_dofs, nf, vdim, byvdim, gather_offsets, gather_indices, x, y,
-+      false, a);
-+}
-+
- void ConformingFaceRestriction::CheckFESpace(const ElementDofOrdering
-                                              f_ordering)
- {
-@@ -1019,7 +1033,6 @@ L2FaceRestriction::L2FaceRestriction(const FiniteElementSpace &fes,
-                fes.GetTraceElement(0, fes.GetMesh()->GetFaceGeometry(0))->GetDof()
-                : 0),
-      elem_dofs(fes.GetFE(0)->GetDof()),
--     nfdofs(nf*face_dofs),
-      ndofs(fes.GetNDofs()),
-      type(type),
-      m(m),
-@@ -1060,7 +1073,7 @@ void L2FaceRestriction::SingleValuedConformingMult(const Vector& x,
-    auto d_indices1 = scatter_indices1.Read();
-    auto d_x = Reshape(x.Read(), t?vd:ndofs, t?ndofs:vd);
-    auto d_y = Reshape(y.Write(), nface_dofs, vd, nf);
--   mfem::forall(nfdofs, [=] MFEM_HOST_DEVICE (int i)
-+   mfem::forall(face_dofs*nf, [=] MFEM_HOST_DEVICE (int i)
-    {
-       const int dof = i % nface_dofs;
-       const int face = i / nface_dofs;
-@@ -1086,7 +1099,7 @@ void L2FaceRestriction::DoubleValuedConformingMult(const Vector& x,
-    auto d_indices2 = scatter_indices2.Read();
-    auto d_x = Reshape(x.Read(), t?vd:ndofs, t?ndofs:vd);
-    auto d_y = Reshape(y.Write(), nface_dofs, vd, 2, nf);
--   mfem::forall(nfdofs, [=] MFEM_HOST_DEVICE (int i)
-+   mfem::forall(face_dofs*nf, [=] MFEM_HOST_DEVICE (int i)
-    {
-       const int dof = i % nface_dofs;
-       const int face = i / nface_dofs;
-@@ -1137,7 +1150,7 @@ void L2FaceRestriction::SingleValuedConformingAddMultTranspose(
-          for (int j = offset; j < next_offset; ++j)
-          {
-             int idx_j = d_indices[j];
--            dof_value +=  d_x(idx_j % nface_dofs, c, idx_j / nface_dofs);
-+            dof_value += d_x(idx_j % nface_dofs, c, idx_j / nface_dofs);
-          }
-          d_y(t?c:i,t?i:c) += dof_value;
-       }
-@@ -1151,7 +1164,7 @@ void L2FaceRestriction::DoubleValuedConformingAddMultTranspose(
-    const int nface_dofs = face_dofs;
-    const int vd = vdim;
-    const bool t = byvdim;
--   const int dofs = nfdofs;
-+   const int dofs = face_dofs*nf;
-    auto d_offsets = gather_offsets.Read();
-    auto d_indices = gather_indices.Read();
-    auto d_x = Reshape(x.Read(), nface_dofs, vd, 2, nf);
-@@ -1168,9 +1181,8 @@ void L2FaceRestriction::DoubleValuedConformingAddMultTranspose(
-             int idx_j = d_indices[j];
-             bool isE1 = idx_j < dofs;
-             idx_j = isE1 ? idx_j : idx_j - dofs;
--            dof_value +=  isE1 ?
--                          d_x(idx_j % nface_dofs, c, 0, idx_j / nface_dofs)
--                          :d_x(idx_j % nface_dofs, c, 1, idx_j / nface_dofs);
-+            dof_value += (isE1 ? d_x(idx_j % nface_dofs, c, 0, idx_j / nface_dofs)
-+                          : d_x(idx_j % nface_dofs, c, 1, idx_j / nface_dofs));
-          }
-          d_y(t?c:i,t?i:c) += dof_value;
-       }
-@@ -1565,7 +1577,7 @@ void L2FaceRestriction::PermuteAndSetFaceDofsGatherIndices2(
-       const int global_dof_elem2 = elem_map[elem_index*elem_dofs + volume_dof_elem2];
-       const int restriction_dof_elem2 = face_dofs*face_index + face_dof_elem1;
-       // We shift restriction_dof_elem2 to express that it's elem2 of the face
--      gather_indices[gather_offsets[global_dof_elem2]++] = nfdofs +
-+      gather_indices[gather_offsets[global_dof_elem2]++] = face_dofs*nf +
-                                                            restriction_dof_elem2;
-    }
- }
-diff --git a/fem/restriction.hpp b/fem/restriction.hpp
-index 617305541..7eb6eaede 100644
---- a/fem/restriction.hpp
-+++ b/fem/restriction.hpp
-@@ -22,19 +22,44 @@ class FiniteElementSpace;
- enum class ElementDofOrdering;
- 
- /// Abstract base class that defines an interface for element restrictions.
--class ElementRestrictionOperator : public Operator
-+class ElementRestriction : public Operator
- {
- public:
--   /// @brief Add the E-vector degrees of freedom @a x to the L-vector degrees
--   /// of freedom @a y.
-+   /** @brief Extract the degrees of freedom from @a x into @a y. */
-+   void Mult(const Vector &x, Vector &y) const override = 0;
-+
-+   /** @brief Set the degrees of freedom in the element degrees of freedom
-+       @a y to the values given in @a x. */
-+   void MultTranspose(const Vector &x, Vector &y) const override
-+   {
-+      y = 0.0;
-+      AddMultTranspose(x, y);
-+   }
-+
-+   /** @brief Add the degrees of freedom @a x to the element degrees of
-+       freedom @a y. */
-    void AddMultTranspose(const Vector &x, Vector &y,
-                          const double a = 1.0) const override = 0;
-+
-+   /** @brief Add the degrees of freedom @a x to the element degrees of
-+       freedom @a y ignoring the signs from DOF orientation. */
-+   virtual void MultUnsigned(const Vector &x, Vector &y) const
-+   {
-+      Mult(x, y);
-+   }
-+
-+   /** @brief Add the degrees of freedom @a x to the element degrees of
-+       freedom @a y ignoring the signs from DOF orientation. */
-+   virtual void MultTransposeUnsigned(const Vector &x, Vector &y) const
-+   {
-+      MultTranspose(x, y);
-+   }
- };
- 
- /// Operator that converts FiniteElementSpace L-vectors to E-vectors.
- /** Objects of this type are typically created and owned by FiniteElementSpace
-     objects, see FiniteElementSpace::GetElementRestriction(). */
--class ElementRestriction : public ElementRestrictionOperator
-+class ConformingElementRestriction : public ElementRestriction
- {
- private:
-    /** This number defines the maximum number of elements any dof can belong to
-@@ -48,7 +73,6 @@ protected:
-    const bool byvdim;
-    const int ndofs;
-    const int dof;
--   const int nedofs;
-    Array<int> offsets;
-    Array<int> indices;
-    Array<int> gather_map;
-@@ -65,20 +89,18 @@ protected:
-    ///@}
- 
- public:
--   ElementRestriction(const FiniteElementSpace&, ElementDofOrdering);
-+   ConformingElementRestriction(const FiniteElementSpace&, ElementDofOrdering);
-+
-    void Mult(const Vector &x, Vector &y) const override;
-+
-    void MultTranspose(const Vector &x, Vector &y) const override;
-+
-    void AddMultTranspose(const Vector &x, Vector &y,
-                          const double a = 1.0) const override;
- 
--   /// Compute Mult without applying signs based on DOF orientations.
--   void MultUnsigned(const Vector &x, Vector &y) const;
--   /// Compute MultTranspose without applying signs based on DOF orientations.
--   void MultTransposeUnsigned(const Vector &x, Vector &y) const;
-+   void MultUnsigned(const Vector &x, Vector &y) const override;
- 
--   /// Compute MultTranspose by setting (rather than adding) element
--   /// contributions; this is a left inverse of the Mult() operation
--   void MultLeftInverse(const Vector &x, Vector &y) const;
-+   void MultTransposeUnsigned(const Vector &x, Vector &y) const override;
- 
-    /// @brief Fills the E-vector y with `boolean` values 0.0 and 1.0 such that each
-    /// each entry of the L-vector is uniquely represented in `y`.
-@@ -92,16 +114,13 @@ public:
-    void FillSparseMatrix(const Vector &mat_ea, SparseMatrix &mat) const;
- 
-    /** Fill the I array of SparseMatrix corresponding to the sparsity pattern
--       given by this ElementRestriction. */
-+       given by this ConformingElementRestriction. */
-    int FillI(SparseMatrix &mat) const;
-+
-    /** Fill the J and Data arrays of SparseMatrix corresponding to the sparsity
--       pattern given by this ElementRestriction, and the values of ea_data. */
-+       pattern given by this ConformingElementRestriction, and the values of
-+       ea_data. */
-    void FillJAndData(const Vector &ea_data, SparseMatrix &mat) const;
--   /// @private Not part of the public interface (device kernel limitation).
--   ///
--   /// Performs either MultTranspose or AddMultTranspose depending on the
--   /// boolean template parameter @a ADD.
--   template <bool ADD> void TAddMultTranspose(const Vector &x, Vector &y) const;
- };
- 
- /// Operator that converts L2 FiniteElementSpace L-vectors to E-vectors.
-@@ -109,37 +128,39 @@ public:
-     objects, see FiniteElementSpace::GetElementRestriction(). L-vectors
-     corresponding to grid functions in L2 finite element spaces differ from
-     E-vectors only in the ordering of the degrees of freedom. */
--class L2ElementRestriction : public ElementRestrictionOperator
-+class L2ElementRestriction : public ElementRestriction
- {
-+private:
-    const int ne;
-    const int vdim;
-    const bool byvdim;
-    const int ndof;
-    const int ndofs;
-+
- public:
-    L2ElementRestriction(const FiniteElementSpace&);
-+
-    void Mult(const Vector &x, Vector &y) const override;
-+
-    void MultTranspose(const Vector &x, Vector &y) const override;
-+
-    void AddMultTranspose(const Vector &x, Vector &y,
-                          const double a = 1.0) const override;
-+
-    /** Fill the I array of SparseMatrix corresponding to the sparsity pattern
-        given by this ElementRestriction. */
-    void FillI(SparseMatrix &mat) const;
-+
-    /** Fill the J and Data arrays of SparseMatrix corresponding to the sparsity
-        pattern given by this L2FaceRestriction, and the values of ea_data. */
-    void FillJAndData(const Vector &ea_data, SparseMatrix &mat) const;
--   /// @private Not part of the public interface (device kernel limitation).
--   ///
--   /// Performs either MultTranspose or AddMultTranspose depending on the
--   /// boolean template parameter @a ADD.
--   template <bool ADD> void TAddMultTranspose(const Vector &x, Vector &y) const;
- };
- 
- /** An enum type to specify if only e1 value is requested (SingleValued) or both
-     e1 and e2 (DoubleValued). */
- enum class L2FaceValues : bool {SingleValued, DoubleValued};
- 
--/** @brief Base class for operators that extracts Face degrees of freedom.
-+/** @brief Abstract base class for operators that extracts Face degrees of freedom.
- 
-     In order to compute quantities on the faces of a mesh, it is often useful to
-     extract the degrees of freedom on the faces of the elements. This class
-@@ -177,6 +198,19 @@ public:
-    */
-    void Mult(const Vector &x, Vector &y) const override = 0;
- 
-+   /** @brief Set the face degrees of freedom in the element degrees of freedom
-+       @a y to the values given in @a x.
-+
-+       @param[in]     x The face degrees of freedom on the face.
-+       @param[in,out] y The L-vector of degrees of freedom to which we add the
-+                        face degrees of freedom.
-+   */
-+   void MultTranspose(const Vector &x, Vector &y) const override
-+   {
-+      y = 0.0;
-+      AddMultTranspose(x, y);
-+   }
-+
-    /** @brief Add the face degrees of freedom @a x to the element degrees of
-        freedom @a y.
- 
-@@ -185,14 +219,29 @@ public:
-                         face degrees of freedom.
-        @param[in]     a Scalar coefficient for addition.
-    */
--   virtual void AddMultTranspose(const Vector &x, Vector &y,
--                                 const double a = 1.0) const override = 0;
-+   void AddMultTranspose(const Vector &x, Vector &y,
-+                         const double a = 1.0) const override = 0;
-+
-+   /** @brief Extract the face degrees of freedom from @a x into @a y ignoring
-+       the signs from DOF orientation. */
-+   virtual void MultUnsigned(const Vector &x, Vector &y) const
-+   {
-+      Mult(x, y);
-+   }
-+
-+   /** @brief Add the face degrees of freedom @a x to the element degrees of
-+       freedom @a y ignoring the signs from DOF orientation. */
-+   virtual void AddMultTransposeUnsigned(const Vector &x, Vector &y,
-+                                         const double a = 1.0) const
-+   {
-+      AddMultTranspose(x, y, a);
-+   }
- 
-    /** @brief Add the face degrees of freedom @a x to the element degrees of
-        freedom @a y. Perform the same computation as AddMultTranspose, but
-        @a x is invalid after calling this method.
- 
--       @param[in,out]     x The face degrees of freedom on the face.
-+       @param[in,out] x The face degrees of freedom on the face.
-        @param[in,out] y The L-vector of degrees of freedom to which we add the
-                         face degrees of freedom.
- 
-@@ -203,19 +252,6 @@ public:
-    {
-       AddMultTranspose(x, y);
-    }
--
--   /** @brief Set the face degrees of freedom in the element degrees of freedom
--       @a y to the values given in @a x.
--
--       @param[in]     x The face degrees of freedom on the face.
--       @param[in,out] y The L-vector of degrees of freedom to which we add the
--                        face degrees of freedom.
--   */
--   void MultTranspose(const Vector &x, Vector &y) const override
--   {
--      y = 0.0;
--      AddMultTranspose(x, y);
--   }
- };
- 
- /// @brief Operator that extracts face degrees of freedom for H1, ND, or RT
-@@ -232,7 +268,6 @@ protected:
-    const bool byvdim;
-    const int face_dofs; // Number of dofs on each face
-    const int elem_dofs; // Number of dofs in each element
--   const int nfdofs; // Total number of face E-vector dofs
-    const int ndofs; // Total number of dofs
-    Array<int> scatter_indices; // Scattering indices for element 1 on each face
-    Array<int> gather_offsets; // offsets for the gathering indices of each dof
-@@ -252,6 +287,7 @@ protected:
-                              const ElementDofOrdering f_ordering,
-                              const FaceType type,
-                              bool build);
-+
- public:
-    /** @brief Construct a ConformingFaceRestriction.
- 
-@@ -274,22 +310,33 @@ public:
-                      ElementDofOrdering. */
-    void Mult(const Vector &x, Vector &y) const override;
- 
--   using FaceRestriction::AddMultTransposeInPlace;
-+   /** @brief Extract the face degrees of freedom from @a x into @a y ignoring
-+       the signs from DOF orientation.
-+
-+       @sa Mult(). */
-+   void MultUnsigned(const Vector &x, Vector &y) const override;
- 
-    /** @brief Gather the degrees of freedom, i.e. goes from face E-Vector to
-        L-Vector.
- 
--       @param[in]  x The face E-Vector degrees of freedom with the given format:
--                     face_dofs x vdim x nf
--                     where nf is the number of interior or boundary faces
--                     requested by @a type in the constructor.
--                     The face_dofs should be ordered according to the given
--                     ElementDofOrdering
-+       @param[in]     x The face E-Vector degrees of freedom with the given format:
-+                        face_dofs x vdim x nf
-+                        where nf is the number of interior or boundary faces
-+                        requested by @a type in the constructor.
-+                        The face_dofs should be ordered according to the given
-+                        ElementDofOrdering
-        @param[in,out] y The L-vector degrees of freedom.
--       @param[in]  a Scalar coefficient for addition. */
-+       @param[in]     a Scalar coefficient for addition. */
-    void AddMultTranspose(const Vector &x, Vector &y,
-                          const double a = 1.0) const override;
- 
-+   /** @brief Gather the degrees of freedom, i.e. goes from face E-Vector to
-+       L-Vector @b not taking into account signs from DOF orientations.
-+
-+       @sa AddMultTranspose(). */
-+   void AddMultTransposeUnsigned(const Vector &x, Vector &y,
-+                                 const double a = 1.0) const override;
-+
- private:
-    /** @brief Compute the scatter indices: L-vector to E-vector, and the offsets
-        for the gathering: E-vector to L-vector.
-@@ -360,7 +407,6 @@ protected:
-    const bool byvdim;
-    const int face_dofs; // Number of dofs on each face
-    const int elem_dofs; // Number of dofs in each element
--   const int nfdofs; // Total number of dofs on the faces
-    const int ndofs; // Total number of dofs
-    const FaceType type;
-    const L2FaceValues m;
-@@ -412,8 +458,6 @@ public:
-                      ElementDofOrdering. */
-    void Mult(const Vector &x, Vector &y) const override;
- 
--   using FaceRestriction::AddMultTranspose;
--
-    /** @brief Gather the degrees of freedom, i.e. goes from face E-Vector to
-        L-Vector.
- 
-@@ -810,6 +854,7 @@ protected:
-                        const FaceType type,
-                        const L2FaceValues m,
-                        bool build);
-+
- public:
-    /** @brief Constructs an NCL2FaceRestriction, this is a specialization of a
-        L2FaceRestriction for nonconforming meshes.
-diff --git a/fem/transfer.cpp b/fem/transfer.cpp
-index 68dab72d5..7f95ca9fe 100644
---- a/fem/transfer.cpp
-+++ b/fem/transfer.cpp
-@@ -92,9 +92,9 @@ const Operator &GridTransfer::MakeTrueOperator(
-    else // Parallel() == true
-    {
- #ifdef MFEM_USE_MPI
-+      const SparseMatrix *out_R = fes_out.GetRestrictionMatrix();
-       if (oper_type == Operator::Hypre_ParCSR)
-       {
--         const SparseMatrix *out_R = fes_out.GetRestrictionMatrix();
-          const ParFiniteElementSpace *pfes_in =
-             dynamic_cast<const ParFiniteElementSpace *>(&fes_in);
-          const ParFiniteElementSpace *pfes_out =
-@@ -122,7 +122,6 @@ const Operator &GridTransfer::MakeTrueOperator(
-       }
-       else if (oper_type == Operator::ANY_TYPE)
-       {
--         const Operator *out_R = fes_out.GetRestrictionOperator();
-          t_oper.Reset(new TripleProductOperator(
-                          out_R, &oper, fes_in.GetProlongationMatrix(),
-                          false, false, false));
-@@ -1159,12 +1158,12 @@ TensorProductPRefinementTransferOperator(
-    localL.UseDevice(true);
-    localH.UseDevice(true);
- 
--   MFEM_VERIFY(dynamic_cast<const ElementRestriction*>(elem_restrict_lex_h),
-+   const auto *elem_restrict =
-+      dynamic_cast<const ConformingElementRestriction*>(elem_restrict_lex_h);
-+   MFEM_VERIFY(elem_restrict,
-                "High order element restriction is of unsupported type");
--
-    mask.SetSize(localH.Size(), Device::GetMemoryType());
--   static_cast<const ElementRestriction*>(elem_restrict_lex_h)
--   ->BooleanMask(mask);
-+   elem_restrict->BooleanMask(mask);
-    mask.UseDevice(true);
- }
- 
-diff --git a/general/communication.hpp b/general/communication.hpp
-index 474486f1b..c7d00f1e4 100644
---- a/general/communication.hpp
-+++ b/general/communication.hpp
-@@ -217,7 +217,6 @@ protected:
-    int group_buf_size;
-    mutable Array<char> group_buf;
-    MPI_Request *requests;
--   // MPI_Status  *statuses;
-    // comm_lock: 0 - no lock, 1 - locked for Bcast, 2 - locked for Reduce
-    mutable int comm_lock;
-    mutable int num_requests;
-diff --git a/general/version.cpp b/general/version.cpp
-index d2b05f8ed..f4c402d51 100644
---- a/general/version.cpp
-+++ b/general/version.cpp
-@@ -103,9 +103,6 @@ const char *GetConfigStr()
- #ifdef MFEM_USE_LAPACK
-       "MFEM_USE_LAPACK\n"
- #endif
--#ifdef MFEM_USE_LEGACY_OPENMP
--      "MFEM_USE_LEGACY_OPENMP\n"
--#endif
- #ifdef MFEM_USE_LIBUNWIND
-       "MFEM_USE_LIBUNWIND\n"
- #endif
-diff --git a/linalg/auxiliary.cpp b/linalg/auxiliary.cpp
-index 003e52695..bcb55ca3e 100644
---- a/linalg/auxiliary.cpp
-+++ b/linalg/auxiliary.cpp
-@@ -171,7 +171,6 @@ MatrixFreeAuxiliarySpace::MatrixFreeAuxiliarySpace(
-    {
-       a_lor.AddDomainIntegrator(new VectorMassIntegrator);
-    }
--   a_lor.UsePrecomputedSparsity();
-    a_lor.Assemble();
-    a_lor.EliminateEssentialBC(ess_bdr, policy);
-    a_lor.Finalize();
-@@ -244,7 +243,6 @@ MatrixFreeAuxiliarySpace::MatrixFreeAuxiliarySpace(
-    {
-       a_lor.AddDomainIntegrator(new DiffusionIntegrator);
-    }
--   a_lor.UsePrecomputedSparsity();
-    a_lor.Assemble();
-    if (ess_bdr.Size())
-    {
-@@ -460,14 +458,14 @@ MatrixFreeAMS::MatrixFreeAMS(
-    pa_grad->SetAssemblyLevel(AssemblyLevel::PARTIAL);
-    pa_grad->AddDomainInterpolator(new GradientInterpolator);
-    pa_grad->Assemble();
--   pa_grad->FormRectangularSystemMatrix(Gradient);
-+   pa_grad->FormDiscreteOperatorMatrix(Gradient);
- 
-    // build Pi operator
-    pa_interp = new ParDiscreteLinearOperator(h1_fespace_d, &nd_fespace);
-    pa_interp->SetAssemblyLevel(AssemblyLevel::PARTIAL);
-    pa_interp->AddDomainInterpolator(new IdentityInterpolator);
-    pa_interp->Assemble();
--   pa_interp->FormRectangularSystemMatrix(Pi);
-+   pa_interp->FormDiscreteOperatorMatrix(Pi);
- 
-    // build LOR space
-    ParMesh mesh_lor = ParMesh::MakeRefined(*mesh, order, BasisType::GaussLobatto);
-diff --git a/linalg/handle.hpp b/linalg/handle.hpp
-index 818294985..96f658d6a 100644
---- a/linalg/handle.hpp
-+++ b/linalg/handle.hpp
-@@ -207,7 +207,6 @@ public:
-                     const Vector &X, Vector &B) const;
- };
- 
--
- /// Add an alternative name for OperatorHandle -- OperatorPtr.
- typedef OperatorHandle OperatorPtr;
- 
-diff --git a/linalg/hypre.hpp b/linalg/hypre.hpp
-index 913bdb2d0..3d5b4b2fe 100644
---- a/linalg/hypre.hpp
-+++ b/linalg/hypre.hpp
-@@ -720,7 +720,7 @@ public:
- 
-    /** @brief The "Boolean" analog of y = alpha * A * x + beta * y, where
-        elements in the sparsity pattern of the matrix are treated as "true". */
--   void BooleanMult(int alpha, const int *x, int beta, int *y)
-+   void BooleanMult(int alpha, const int *x, int beta, int *y) const
-    {
-       HostRead();
-       internal::hypre_ParCSRMatrixBooleanMatvec(A, alpha, const_cast<int*>(x),
-@@ -730,7 +730,7 @@ public:
- 
-    /** @brief The "Boolean" analog of y = alpha * A^T * x + beta * y, where
-        elements in the sparsity pattern of the matrix are treated as "true". */
--   void BooleanMultTranspose(int alpha, const int *x, int beta, int *y)
-+   void BooleanMultTranspose(int alpha, const int *x, int beta, int *y) const
-    {
-       HostRead();
-       internal::hypre_ParCSRMatrixBooleanMatvecT(A, alpha, const_cast<int*>(x),
-diff --git a/linalg/operator.cpp b/linalg/operator.cpp
-index 1f214ece7..64f75c8a4 100644
---- a/linalg/operator.cpp
-+++ b/linalg/operator.cpp
-@@ -165,7 +165,7 @@ void Operator::RecoverFEMSolution(const Vector &X, const Vector &b, Vector &x)
-    }
- }
- 
--Operator * Operator::SetupRAP(const Operator *Pi, const Operator *Po)
-+Operator *Operator::SetupRAP(const Operator *Pi, const Operator *Po)
- {
-    Operator *rap;
-    if (!IsIdentityProlongation(Pi))
-@@ -176,15 +176,15 @@ Operator * Operator::SetupRAP(const Operator *Pi, const Operator *Po)
-       }
-       else
-       {
--         rap = new ProductOperator(this, Pi, false,false);
-+         rap = new ProductOperator(this, Pi, false, false);
-       }
-    }
-    else
-    {
-       if (!IsIdentityProlongation(Po))
-       {
--         TransposeOperator * PoT = new TransposeOperator(Po);
--         rap = new ProductOperator(PoT, this, true,false);
-+         TransposeOperator *PoT = new TransposeOperator(Po);
-+         rap = new ProductOperator(PoT, this, true, false);
-       }
-       else
-       {
-@@ -245,10 +245,10 @@ void Operator::FormDiscreteOperator(Operator* &Aout)
- {
-    const Operator *Pin  = this->GetProlongation();
-    const Operator *Rout = this->GetOutputRestriction();
--   Aout = new TripleProductOperator(Rout, this, Pin,false, false, false);
-+   Aout = new TripleProductOperator(Rout, this, Pin, false, false, false);
- }
- 
--void Operator::PrintMatlab(std::ostream & os, int n, int m) const
-+void Operator::PrintMatlab(std::ostream &os, int n, int m) const
- {
-    using namespace std;
-    if (n == 0) { n = width; }
-diff --git a/linalg/operator.hpp b/linalg/operator.hpp
-index baa9bf767..cdf700e17 100644
---- a/linalg/operator.hpp
-+++ b/linalg/operator.hpp
-@@ -29,8 +29,8 @@ protected:
- 
-    /// see FormSystemOperator()
-    /** @note Uses DiagonalPolicy::DIAG_ONE. */
--   void FormConstrainedSystemOperator(
--      const Array<int> &ess_tdof_list, ConstrainedOperator* &Aout);
-+   void FormConstrainedSystemOperator(const Array<int> &ess_tdof_list,
-+                                      ConstrainedOperator* &Aout);
- 
-    /// see FormRectangularSystemOperator()
-    void FormRectangularConstrainedSystemOperator(
-@@ -38,10 +38,6 @@ protected:
-       const Array<int> &test_tdof_list,
-       RectangularConstrainedOperator* &Aout);
- 
--   /** @brief Returns RAP Operator of this, using input/output Prolongation matrices
--       @a Pi corresponds to "P", @a Po corresponds to "Rt" */
--   Operator *SetupRAP(const Operator *Pi, const Operator *Po);
--
- public:
-    /// Defines operator diagonal policy upon elimination of rows and/or columns.
-    enum DiagonalPolicy
-@@ -149,12 +145,6 @@ public:
-       return GetProlongation(); // Assume square unless specialized
-    }
- 
--   /** @brief Transpose of GetOutputRestriction, directly available in this
--       form to facilitate matrix-free RAP-type operators.
--
--       `NULL` means identity. */
--   virtual const Operator *GetOutputRestrictionTranspose() const { return NULL; }
--
-    /** @brief Restriction operator from output vectors for the operator to linear
-        algebra (linear system) vectors. `NULL` means identity. */
-    virtual const Operator *GetOutputRestriction() const
-@@ -239,6 +229,10 @@ public:
-        forms, though currently @a b is not used in the implementation. */
-    virtual void RecoverFEMSolution(const Vector &X, const Vector &b, Vector &x);
- 
-+   /** @brief Returns RAP Operator of this, using input/output Prolongation matrices
-+       @a Pi corresponds to "P", @a Po corresponds to "Rt" */
-+   Operator *SetupRAP(const Operator *Pi, const Operator *Po);
-+
-    /** @brief Return in @a A a parallel (on truedofs) version of this square
-        operator.
- 
-@@ -270,10 +264,10 @@ public:
-    void FormDiscreteOperator(Operator* &A);
- 
-    /// Prints operator with input size n and output size m in Matlab format.
--   void PrintMatlab(std::ostream & out, int n, int m = 0) const;
-+   void PrintMatlab(std::ostream &out, int n, int m = 0) const;
- 
-    /// Prints operator in Matlab format.
--   virtual void PrintMatlab(std::ostream & out) const;
-+   virtual void PrintMatlab(std::ostream &out) const;
- 
-    /// Virtual destructor.
-    virtual ~Operator() { }
-@@ -722,6 +716,7 @@ inline bool IsIdentityProlongation(const Operator *P)
-    return !P || dynamic_cast<const IdentityOperator*>(P);
- }
- 
-+
- /// Scaled Operator B: x -> a A(x).
- class ScaledOperator : public Operator
- {
-@@ -928,6 +923,7 @@ public:
-    virtual ~ConstrainedOperator() { if (own_A) { delete A; } }
- };
- 
-+
- /** @brief Rectangular Operator for imposing essential boundary conditions on
-     the input space using only the action, Mult(), of a given unconstrained
-     Operator.
-@@ -981,6 +977,7 @@ public:
-    virtual ~RectangularConstrainedOperator() { if (own_A) { delete A; } }
- };
- 
-+
- /** @brief PowerMethod helper class to estimate the largest eigenvalue of an
-            operator using the iterative power method. */
- class PowerMethod
-diff --git a/linalg/solvers.hpp b/linalg/solvers.hpp
-index 085cea616..434864042 100644
---- a/linalg/solvers.hpp
-+++ b/linalg/solvers.hpp
-@@ -276,8 +276,11 @@ public:
- 
-    ///@}
- 
--   /// This should be called before SetOperator
-+   /// This should be called before SetOperator if you want SetOperator to
-+   /// set both the solver and preconditioner operators together
-    virtual void SetPreconditioner(Solver &pr);
-+   void SetPreconditioner(Solver *pr)
-+   { if (pr) { SetPreconditioner(*pr); } else { prec = nullptr; } }
- 
-    /// Also calls SetOperator for the preconditioner if there is one
-    virtual void SetOperator(const Operator &op) override;
-diff --git a/linalg/sparsemat.cpp b/linalg/sparsemat.cpp
-index 145379c00..e55ea47b4 100644
---- a/linalg/sparsemat.cpp
-+++ b/linalg/sparsemat.cpp
-@@ -764,7 +764,6 @@ void SparseMatrix::AddMult(const Vector &x, Vector &y, const double a) const
-       return;
-    }
- 
--#ifndef MFEM_USE_LEGACY_OPENMP
-    const int height = this->height;
-    const int nnz = J.Capacity();
-    auto d_I = Read(I, height+1);
-@@ -885,24 +884,6 @@ void SparseMatrix::AddMult(const Vector &x, Vector &y, const double a) const
-       });
- 
-    }
--
--#else // MFEM_USE_LEGACY_OPENMP
--   const double *Ap = A, *xp = x.GetData();
--   double *yp = y.GetData();
--   const int *Jp = J, *Ip = I;
--
--   #pragma omp parallel for
--   for (int i = 0; i < height; i++)
--   {
--      double d = 0.0;
--      const int end = Ip[i+1];
--      for (int j = Ip[i]; j < end; j++)
--      {
--         d += Ap[j] * xp[Jp[j]];
--      }
--      yp[i] += a * d;
--   }
--#endif // MFEM_USE_LEGACY_OPENMP
- }
- 
- void SparseMatrix::MultTranspose(const Vector &x, Vector &y) const
-diff --git a/linalg/vector.cpp b/linalg/vector.cpp
-index 4951ed914..db1a93666 100644
---- a/linalg/vector.cpp
-+++ b/linalg/vector.cpp
-@@ -113,9 +113,6 @@ const double &Vector::Elem(int i) const
- double Vector::operator*(const double *v) const
- {
-    double dot = 0.0;
--#ifdef MFEM_USE_LEGACY_OPENMP
--   #pragma omp parallel for reduction(+:dot)
--#endif
-    for (int i = 0; i < size; i++)
-    {
-       dot += data[i] * v[i];
-@@ -313,12 +310,19 @@ void Vector::Neg()
-    mfem::forall_switch(use_dev, N, [=] MFEM_HOST_DEVICE (int i) { y[i] = -y[i]; });
- }
- 
-+void Vector::Reciprocal()
-+{
-+   const bool use_dev = UseDevice();
-+   const int N = size;
-+   auto y = ReadWrite(use_dev);
-+   mfem::forall_switch(use_dev, N, [=] MFEM_HOST_DEVICE (int i) { y[i] = 1.0/y[i]; });
-+}
-+
- void add(const Vector &v1, const Vector &v2, Vector &v)
- {
-    MFEM_ASSERT(v.size == v1.size && v.size == v2.size,
-                "incompatible Vectors!");
- 
--#if !defined(MFEM_USE_LEGACY_OPENMP)
-    const bool use_dev = v1.UseDevice() || v2.UseDevice() || v.UseDevice();
-    const int N = v.size;
-    // Note: get read access first, in case v is the same as v1/v2.
-@@ -326,13 +330,6 @@ void add(const Vector &v1, const Vector &v2, Vector &v)
-    auto x2 = v2.Read(use_dev);
-    auto y = v.Write(use_dev);
-    mfem::forall_switch(use_dev, N, [=] MFEM_HOST_DEVICE (int i) { y[i] = x1[i] + x2[i]; });
--#else
--   #pragma omp parallel for
--   for (int i = 0; i < v.size; i++)
--   {
--      v.data[i] = v1.data[i] + v2.data[i];
--   }
--#endif
- }
- 
- void add(const Vector &v1, double alpha, const Vector &v2, Vector &v)
-@@ -350,7 +347,6 @@ void add(const Vector &v1, double alpha, const Vector &v2, Vector &v)
-    }
-    else
-    {
--#if !defined(MFEM_USE_LEGACY_OPENMP)
-       const bool use_dev = v1.UseDevice() || v2.UseDevice() || v.UseDevice();
-       const int N = v.size;
-       // Note: get read access first, in case v is the same as v1/v2.
-@@ -361,16 +357,6 @@ void add(const Vector &v1, double alpha, const Vector &v2, Vector &v)
-       {
-          d_z[i] = d_x[i] + alpha * d_y[i];
-       });
--#else
--      const double *v1p = v1.data, *v2p = v2.data;
--      double *vp = v.data;
--      const int s = v.size;
--      #pragma omp parallel for
--      for (int i = 0; i < s; i++)
--      {
--         vp[i] = v1p[i] + alpha*v2p[i];
--      }
--#endif
-    }
- }
- 
-@@ -389,7 +375,6 @@ void add(const double a, const Vector &x, const Vector &y, Vector &z)
-    }
-    else
-    {
--#if !defined(MFEM_USE_LEGACY_OPENMP)
-       const bool use_dev = x.UseDevice() || y.UseDevice() || z.UseDevice();
-       const int N = x.size;
-       // Note: get read access first, in case z is the same as x/y.
-@@ -400,17 +385,6 @@ void add(const double a, const Vector &x, const Vector &y, Vector &z)
-       {
-          zd[i] = a * (xd[i] + yd[i]);
-       });
--#else
--      const double *xp = x.data;
--      const double *yp = y.data;
--      double       *zp = z.data;
--      const int      s = x.size;
--      #pragma omp parallel for
--      for (int i = 0; i < s; i++)
--      {
--         zp[i] = a * (xp[i] + yp[i]);
--      }
--#endif
-    }
- }
- 
-@@ -444,7 +418,6 @@ void add(const double a, const Vector &x,
- #endif
-    else
-    {
--#if !defined(MFEM_USE_LEGACY_OPENMP)
-       const bool use_dev = x.UseDevice() || y.UseDevice() || z.UseDevice();
-       const int N = x.size;
-       // Note: get read access first, in case z is the same as x/y.
-@@ -455,17 +428,6 @@ void add(const double a, const Vector &x,
-       {
-          zd[i] = a * xd[i] + b * yd[i];
-       });
--#else
--      const double *xp = x.data;
--      const double *yp = y.data;
--      double       *zp = z.data;
--      const int      s = x.size;
--      #pragma omp parallel for
--      for (int i = 0; i < s; i++)
--      {
--         zp[i] = a * xp[i] + b * yp[i];
--      }
--#endif
-    }
- }
- 
-@@ -474,7 +436,6 @@ void subtract(const Vector &x, const Vector &y, Vector &z)
-    MFEM_ASSERT(x.size == y.size && x.size == z.size,
-                "incompatible Vectors!");
- 
--#if !defined(MFEM_USE_LEGACY_OPENMP)
-    const bool use_dev = x.UseDevice() || y.UseDevice() || z.UseDevice();
-    const int N = x.size;
-    // Note: get read access first, in case z is the same as x/y.
-@@ -485,17 +446,6 @@ void subtract(const Vector &x, const Vector &y, Vector &z)
-    {
-       zd[i] = xd[i] - yd[i];
-    });
--#else
--   const double *xp = x.data;
--   const double *yp = y.data;
--   double       *zp = z.data;
--   const int     s = x.size;
--   #pragma omp parallel for
--   for (int i = 0; i < s; i++)
--   {
--      zp[i] = xp[i] - yp[i];
--   }
--#endif
- }
- 
- void subtract(const double a, const Vector &x, const Vector &y, Vector &z)
-@@ -513,7 +463,6 @@ void subtract(const double a, const Vector &x, const Vector &y, Vector &z)
-    }
-    else
-    {
--#if !defined(MFEM_USE_LEGACY_OPENMP)
-       const bool use_dev = x.UseDevice() || y.UseDevice() || z.UseDevice();
-       const int N = x.size;
-       // Note: get read access first, in case z is the same as x/y.
-@@ -524,17 +473,6 @@ void subtract(const double a, const Vector &x, const Vector &y, Vector &z)
-       {
-          zd[i] = a * (xd[i] - yd[i]);
-       });
--#else
--      const double *xp = x.data;
--      const double *yp = y.data;
--      double       *zp = z.data;
--      const int      s = x.size;
--      #pragma omp parallel for
--      for (int i = 0; i < s; i++)
--      {
--         zp[i] = a * (xp[i] - yp[i]);
--      }
--#endif
-    }
- }
- 
-diff --git a/linalg/vector.hpp b/linalg/vector.hpp
-index 4d2dda36d..04605d2f9 100644
---- a/linalg/vector.hpp
-+++ b/linalg/vector.hpp
-@@ -323,6 +323,9 @@ public:
-    /// (*this) = -(*this)
-    void Neg();
- 
-+   /// (*this)(i) = 1.0 / (*this)(i)
-+   void Reciprocal();
-+
-    /// Swap the contents of two Vectors
-    inline void Swap(Vector &other);
- 
-diff --git a/makefile b/makefile
-index 00d139b28..a606f6dfe 100644
---- a/makefile
-+++ b/makefile
-@@ -265,16 +265,6 @@ endif
- 
- DEP_CXX ?= $(MFEM_CXX)
- 
--# Check legacy OpenMP configuration
--ifeq ($(MFEM_USE_LEGACY_OPENMP),YES)
--   MFEM_THREAD_SAFE ?= YES
--   ifneq ($(MFEM_THREAD_SAFE),YES)
--      $(error Incompatible config: MFEM_USE_LEGACY_OPENMP requires MFEM_THREAD_SAFE)
--   endif
--   # NOTE: MFEM_USE_LEGACY_OPENMP cannot be combined with any of:
--   # MFEM_USE_OPENMP, MFEM_USE_CUDA, MFEM_USE_RAJA, MFEM_USE_OCCA
--endif
--
- # List of MFEM dependencies, that require the *_LIB variable to be non-empty
- MFEM_REQ_LIB_DEPS = ENZYME SUPERLU MUMPS METIS FMS CONDUIT SIDRE LAPACK SUNDIALS\
-  SUITESPARSE STRUMPACK GINKGO GNUTLS NETCDF PETSC SLEPC MPFR PUMI HIOP\
-@@ -339,16 +329,16 @@ endif
- MFEM_DEFINES = MFEM_VERSION MFEM_VERSION_STRING MFEM_GIT_STRING MFEM_USE_MPI\
-  MFEM_USE_METIS MFEM_USE_METIS_5 MFEM_DEBUG MFEM_USE_EXCEPTIONS MFEM_USE_ZLIB\
-  MFEM_USE_LIBUNWIND MFEM_USE_LAPACK MFEM_THREAD_SAFE MFEM_USE_OPENMP\
-- MFEM_USE_LEGACY_OPENMP MFEM_USE_MEMALLOC MFEM_TIMER_TYPE MFEM_USE_SUNDIALS\
-- MFEM_USE_SUITESPARSE MFEM_USE_GINKGO MFEM_USE_SUPERLU MFEM_USE_SUPERLU5\
-- MFEM_USE_STRUMPACK MFEM_USE_GNUTLS MFEM_USE_NETCDF MFEM_USE_PETSC\
-- MFEM_USE_SLEPC MFEM_USE_MPFR MFEM_USE_SIDRE MFEM_USE_FMS MFEM_USE_CONDUIT\
-- MFEM_USE_PUMI MFEM_USE_HIOP MFEM_USE_GSLIB MFEM_USE_CUDA MFEM_USE_HIP\
-- MFEM_USE_OCCA MFEM_USE_MOONOLITH MFEM_USE_CEED MFEM_USE_RAJA MFEM_USE_UMPIRE\
-- MFEM_USE_SIMD MFEM_USE_ADIOS2 MFEM_USE_MKL_CPARDISO MFEM_USE_AMGX\
-- MFEM_USE_MUMPS MFEM_USE_ADFORWARD MFEM_USE_CODIPACK MFEM_USE_CALIPER\
-- MFEM_USE_BENCHMARK MFEM_USE_PARELAG MFEM_USE_ALGOIM MFEM_USE_ENZYME\
-- MFEM_SOURCE_DIR MFEM_INSTALL_DIR MFEM_SHARED_BUILD
-+ MFEM_USE_MEMALLOC MFEM_TIMER_TYPE MFEM_USE_SUNDIALS MFEM_USE_SUITESPARSE\
-+ MFEM_USE_GINKGO MFEM_USE_SUPERLU MFEM_USE_SUPERLU5 MFEM_USE_STRUMPACK\
-+ MFEM_USE_GNUTLS MFEM_USE_NETCDF MFEM_USE_PETSC MFEM_USE_SLEPC MFEM_USE_MPFR\
-+ MFEM_USE_SIDRE MFEM_USE_FMS MFEM_USE_CONDUIT MFEM_USE_PUMI MFEM_USE_HIOP\
-+ MFEM_USE_GSLIB MFEM_USE_CUDA MFEM_USE_HIP MFEM_USE_OCCA MFEM_USE_MOONOLITH\
-+ MFEM_USE_CEED MFEM_USE_RAJA MFEM_USE_UMPIRE MFEM_USE_SIMD MFEM_USE_ADIOS2\
-+ MFEM_USE_MKL_CPARDISO MFEM_USE_AMGX MFEM_USE_MUMPS MFEM_USE_ADFORWARD\
-+ MFEM_USE_CODIPACK MFEM_USE_CALIPER MFEM_USE_BENCHMARK MFEM_USE_PARELAG\
-+ MFEM_USE_ALGOIM MFEM_USE_ENZYME MFEM_SOURCE_DIR MFEM_INSTALL_DIR\
-+ MFEM_SHARED_BUILD
- 
- # List of makefile variables that will be written to config.mk:
- MFEM_CONFIG_VARS = MFEM_CXX MFEM_HOST_CXX MFEM_CPPFLAGS MFEM_CXXFLAGS\
-@@ -419,7 +409,7 @@ endif
- DIRS = general linalg linalg/simd mesh mesh/submesh fem fem/ceed/interface \
-        fem/ceed/integrators/mass fem/ceed/integrators/convection \
-        fem/ceed/integrators/diffusion fem/ceed/integrators/nlconvection \
--       fem/ceed/solvers fem/fe fem/lor fem/qinterp fem/tmop
-+       fem/ceed/solvers fem/fe fem/lor fem/qinterp fem/integ fem/tmop
- 
- ifeq ($(MFEM_USE_MOONOLITH),YES)
-    MFEM_CXXFLAGS += $(MOONOLITH_CXX_FLAGS)
-@@ -672,7 +662,6 @@ status info:
- 	$(info MFEM_USE_LAPACK        = $(MFEM_USE_LAPACK))
- 	$(info MFEM_THREAD_SAFE       = $(MFEM_THREAD_SAFE))
- 	$(info MFEM_USE_OPENMP        = $(MFEM_USE_OPENMP))
--	$(info MFEM_USE_LEGACY_OPENMP = $(MFEM_USE_LEGACY_OPENMP))
- 	$(info MFEM_USE_MEMALLOC      = $(MFEM_USE_MEMALLOC))
- 	$(info MFEM_TIMER_TYPE        = $(MFEM_TIMER_TYPE))
- 	$(info MFEM_USE_SUNDIALS      = $(MFEM_USE_SUNDIALS))
-@@ -756,10 +745,6 @@ deprecation-warnings:
- 	@if [ -t 1 ]; then\
- 	   red="\033[0;31m"; yellow="\033[0;33m"; end="\033[0m";\
- 	 fi;\
--	if [ $(MFEM_USE_LEGACY_OPENMP) = YES ]; then\
--	  printf $$red"[MFEM_USE_LEGACY_OPENMP]"$$end": "$$yellow"%s"$$end"\n"\
--	  $(DEPRECATION_WARNING);\
--	fi
- 
- # $(call mfem_check_command, command-to-execute, success_msg, failed_msg)
- mfem_check_command = \
-diff --git a/miniapps/performance/ex1.cpp b/miniapps/performance/ex1.cpp
-index e2271585c..916e7022e 100644
---- a/miniapps/performance/ex1.cpp
-+++ b/miniapps/performance/ex1.cpp
-@@ -317,8 +317,6 @@ int ex1_t<dim>::run(Mesh *mesh, int ref_levels, int order, int basis,
-    cout << "Assembling the bilinear form ..." << flush;
-    tic_toc.Clear();
-    tic_toc.Start();
--   // Pre-allocate sparsity assuming dense element matrices
--   a->UsePrecomputedSparsity();
- 
-    HPCBilinearForm *a_hpc = NULL;
-    Operator *a_oper = NULL;
-@@ -373,7 +371,6 @@ int ex1_t<dim>::run(Mesh *mesh, int ref_levels, int order, int basis,
-    {
-       // TODO: assemble the LOR matrix using the performance code
-       a_pc->AddDomainIntegrator(new DiffusionIntegrator(one));
--      a_pc->UsePrecomputedSparsity();
-       a_pc->Assemble();
-       a_pc->FormSystemMatrix(ess_tdof_list, A_pc);
-    }
-@@ -385,7 +382,6 @@ int ex1_t<dim>::run(Mesh *mesh, int ref_levels, int order, int basis,
-       }
-       else
-       {
--         a_pc->UsePrecomputedSparsity();
-          a_hpc->AssembleBilinearForm(*a_pc);
-          a_pc->FormSystemMatrix(ess_tdof_list, A_pc);
-       }
-diff --git a/miniapps/performance/ex1p.cpp b/miniapps/performance/ex1p.cpp
-index 79735c7ff..1e80576bb 100644
---- a/miniapps/performance/ex1p.cpp
-+++ b/miniapps/performance/ex1p.cpp
-@@ -390,8 +390,6 @@ int ex1_t<dim>::run(Mesh *mesh, int ser_ref_levels, int par_ref_levels,
-    }
-    tic_toc.Clear();
-    tic_toc.Start();
--   // Pre-allocate sparsity assuming dense element matrices
--   a->UsePrecomputedSparsity();
- 
-    HPCBilinearForm *a_hpc = NULL;
-    Operator *a_oper = NULL;
-@@ -460,7 +458,6 @@ int ex1_t<dim>::run(Mesh *mesh, int ser_ref_levels, int par_ref_levels,
-    {
-       // TODO: assemble the LOR matrix using the performance code
-       a_pc->AddDomainIntegrator(new DiffusionIntegrator(one));
--      a_pc->UsePrecomputedSparsity();
-       a_pc->Assemble();
-       a_pc->FormSystemMatrix(ess_tdof_list, A_pc);
-    }
-@@ -472,7 +469,6 @@ int ex1_t<dim>::run(Mesh *mesh, int ser_ref_levels, int par_ref_levels,
-       }
-       else
-       {
--         a_pc->UsePrecomputedSparsity();
-          a_hpc->AssembleBilinearForm(*a_pc);
-          a_pc->FormSystemMatrix(ess_tdof_list, A_pc);
-       }
-diff --git a/tests/unit/fem/test_assemblediagonalpa.cpp b/tests/unit/fem/test_assemblediagonalpa.cpp
-index ebbb8e224..050561e1d 100644
---- a/tests/unit/fem/test_assemblediagonalpa.cpp
-+++ b/tests/unit/fem/test_assemblediagonalpa.cpp
-@@ -17,11 +17,9 @@ using namespace mfem;
- namespace assemblediagonalpa
- {
- 
--int dimension;
--
- double coeffFunction(const Vector& x)
- {
--   if (dimension == 2)
-+   if (x.Size() == 2)
-    {
-       return sin(8.0 * M_PI * x[0]) * cos(6.0 * M_PI * x[1]) + 2.0;
-    }
-@@ -36,12 +34,12 @@ double coeffFunction(const Vector& x)
- void vectorCoeffFunction(const Vector & x, Vector & f)
- {
-    f = 0.0;
--   if (dimension > 1)
-+   if (x.Size() > 1)
-    {
-       f[0] = sin(M_PI * x[1]);
-       f[1] = sin(2.5 * M_PI * x[0]);
-    }
--   if (dimension == 3)
-+   if (x.Size() == 3)
-    {
-       f[2] = sin(6.1 * M_PI * x[2]);
-    }
-@@ -50,14 +48,14 @@ void vectorCoeffFunction(const Vector & x, Vector & f)
- void asymmetricMatrixCoeffFunction(const Vector & x, DenseMatrix & f)
- {
-    f = 0.0;
--   if (dimension == 2)
-+   if (x.Size() == 2)
-    {
-       f(0,0) = 1.1 + sin(M_PI * x[1]);  // 1,1
-       f(1,0) = cos(1.3 * M_PI * x[1]);  // 2,1
-       f(0,1) = cos(2.5 * M_PI * x[0]);  // 1,2
-       f(1,1) = 1.1 + sin(4.9 * M_PI * x[0]);  // 2,2
-    }
--   else if (dimension == 3)
-+   else if (x.Size() == 3)
-    {
-       f(0,0) = 1.1 + sin(M_PI * x[1]);  // 1,1
-       f(0,1) = cos(2.5 * M_PI * x[0]);  // 1,2
-@@ -74,13 +72,13 @@ void asymmetricMatrixCoeffFunction(const Vector & x, DenseMatrix & f)
- void symmetricMatrixCoeffFunction(const Vector & x, DenseSymmetricMatrix & f)
- {
-    f = 0.0;
--   if (dimension == 2)
-+   if (x.Size() == 2)
-    {
-       f(0,0) = 1.1 + sin(M_PI * x[1]);  // 1,1
-       f(0,1) = cos(2.5 * M_PI * x[0]);  // 1,2
-       f(1,1) = 1.1 + sin(4.9 * M_PI * x[0]);  // 2,2
-    }
--   else if (dimension == 3)
-+   else if (x.Size() == 3)
-    {
-       f(0,0) = sin(M_PI * x[1]);  // 1,1
-       f(0,1) = cos(2.5 * M_PI * x[0]);  // 1,2
-@@ -93,7 +91,7 @@ void symmetricMatrixCoeffFunction(const Vector & x, DenseSymmetricMatrix & f)
- 
- TEST_CASE("Mass Diagonal PA", "[PartialAssembly][AssembleDiagonal]")
- {
--   for (dimension = 2; dimension < 4; ++dimension)
-+   for (int dimension = 2; dimension < 4; ++dimension)
-    {
-       for (int ne = 1; ne < 3; ++ne)
-       {
-@@ -140,9 +138,45 @@ TEST_CASE("Mass Diagonal PA", "[PartialAssembly][AssembleDiagonal]")
-    }
- }
- 
-+TEST_CASE("Mass Boundary Diagonal PA", "[PartialAssembly][AssembleDiagonal]")
-+{
-+   const bool all_tests = launch_all_non_regression_tests;
-+
-+   auto fname = GENERATE("../../data/star.mesh", "../../data/star-q3.mesh",
-+                         "../../data/fichera.mesh", "../../data/fichera-q3.mesh");
-+   auto order = !all_tests ? 2 : GENERATE(1, 2, 3);
-+
-+   CAPTURE(fname, order);
-+
-+   Mesh mesh(fname);
-+   int dim = mesh.Dimension();
-+   RT_FECollection fec(order, dim);
-+   FiniteElementSpace fes(&mesh, &fec);
-+
-+   FunctionCoefficient coeff(coeffFunction);
-+
-+   Vector diag_fa(fes.GetTrueVSize()), diag_pa(fes.GetTrueVSize());
-+
-+   BilinearForm blf_fa(&fes);
-+   blf_fa.AddBoundaryIntegrator(new MassIntegrator(coeff));
-+   blf_fa.Assemble();
-+   blf_fa.Finalize();
-+   blf_fa.SpMat().GetDiag(diag_fa);
-+
-+   BilinearForm blf_pa(&fes);
-+   blf_pa.SetAssemblyLevel(AssemblyLevel::PARTIAL);
-+   blf_pa.AddBoundaryIntegrator(new MassIntegrator(coeff));
-+   blf_pa.Assemble();
-+   blf_pa.AssembleDiagonal(diag_pa);
-+
-+   diag_pa -= diag_fa;
-+
-+   REQUIRE(diag_pa.Normlinf() == MFEM_Approx(0.0));
-+}
-+
- TEST_CASE("Diffusion Diagonal PA", "[PartialAssembly][AssembleDiagonal]")
- {
--   for (dimension = 2; dimension < 4; ++dimension)
-+   for (int dimension = 2; dimension < 4; ++dimension)
-    {
-       for (int ne = 1; ne < 3; ++ne)
-       {
-@@ -322,7 +356,7 @@ TEST_CASE("Vector Diffusion Diagonal PA",
- TEST_CASE("Hcurl/Hdiv diagonal PA",
-           "[CUDA][PartialAssembly][AssembleDiagonal]")
- {
--   for (dimension = 2; dimension < 4; ++dimension)
-+   for (int dimension = 2; dimension < 4; ++dimension)
-    {
-       for (int coeffType = 0; coeffType < 5; ++coeffType)
-       {
-diff --git a/tests/unit/fem/test_bilinearform.cpp b/tests/unit/fem/test_bilinearform.cpp
-index 5fd00b3e1..647ae38f2 100644
---- a/tests/unit/fem/test_bilinearform.cpp
-+++ b/tests/unit/fem/test_bilinearform.cpp
-@@ -127,7 +127,7 @@ TEST_CASE("FormLinearSystem/SolutionScope",
-    // Legacy full assembly
-    {
-       GridFunction sol(&fes);
--      SolvePDE(AssemblyLevel::LEGACYFULL, sol);
-+      SolvePDE(AssemblyLevel::LEGACY, sol);
-       // Make sure the solution is still accessible after 'X' is destroyed
-       sol.HostRead();
-       REQUIRE(AsConst(sol)(bdr_dof) == 0.0);
-diff --git a/tests/unit/fem/test_pa_grad.cpp b/tests/unit/fem/test_pa_grad.cpp
-index a42d7c83c..af0038cee 100644
---- a/tests/unit/fem/test_pa_grad.cpp
-+++ b/tests/unit/fem/test_pa_grad.cpp
-@@ -154,7 +154,7 @@ double par_compare_pa_assembly(int dim, int num_elements, int order,
-    pa_grad.AddDomainInterpolator(new GradientInterpolator);
-    pa_grad.Assemble();
-    OperatorPtr pa_grad_oper;
--   pa_grad.FormRectangularSystemMatrix(pa_grad_oper);
-+   pa_grad.FormDiscreteOperatorMatrix(pa_grad_oper);
- 
-    int insize, outsize;
-    if (transpose)
-diff --git a/tests/unit/fem/test_pa_kernels.cpp b/tests/unit/fem/test_pa_kernels.cpp
-index 2277efdfa..b50a69886 100644
---- a/tests/unit/fem/test_pa_kernels.cpp
-+++ b/tests/unit/fem/test_pa_kernels.cpp
-@@ -9,11 +9,6 @@
- // terms of the BSD-3 license. We welcome feedback and contributions, see file
- // CONTRIBUTING.md for details.
- 
--#ifdef _WIN32
--#define _USE_MATH_DEFINES
--#include <cmath>
--#endif
--
- #include "unit_tests.hpp"
- #include "mfem.hpp"
- 
-@@ -520,17 +515,17 @@ static void test_pa_integrator()
-    GridFunction x(&fes), y_fa(&fes), y_pa(&fes);
-    x.Randomize(1);
- 
--   ConstantCoefficient pi(M_PI);
-+   FunctionCoefficient coeff(f1);
- 
-    BilinearForm blf_fa(&fes);
--   blf_fa.AddDomainIntegrator(new INTEGRATOR(pi,ir));
-+   blf_fa.AddDomainIntegrator(new INTEGRATOR(coeff,ir));
-    blf_fa.Assemble();
-    blf_fa.Finalize();
-    blf_fa.Mult(x, y_fa);
- 
-    BilinearForm blf_pa(&fes);
-    blf_pa.SetAssemblyLevel(AssemblyLevel::PARTIAL);
--   blf_pa.AddDomainIntegrator(new INTEGRATOR(pi,ir));
-+   blf_pa.AddDomainIntegrator(new INTEGRATOR(coeff,ir));
-    blf_pa.Assemble();
-    blf_pa.Mult(x, y_pa);
- 
-@@ -549,4 +544,39 @@ TEST_CASE("PA Diffusion", "[PartialAssembly], [CUDA]")
-    test_pa_integrator<DiffusionIntegrator>();
- } // PA Diffusion test case
- 
-+TEST_CASE("PA Boundary Mass", "[PartialAssembly], [CUDA]")
-+{
-+   const bool all_tests = launch_all_non_regression_tests;
-+
-+   auto fname = GENERATE("../../data/star.mesh", "../../data/star-q3.mesh",
-+                         "../../data/fichera.mesh", "../../data/fichera-q3.mesh");
-+   auto order = !all_tests ? 2 : GENERATE(1, 2, 3);
-+
-+   Mesh mesh(fname);
-+   int dim = mesh.Dimension();
-+   RT_FECollection fec(order, dim);
-+   FiniteElementSpace fes(&mesh, &fec);
-+
-+   GridFunction x(&fes), y_fa(&fes), y_pa(&fes);
-+   x.Randomize(1);
-+
-+   FunctionCoefficient coeff(f1);
-+
-+   BilinearForm blf_fa(&fes);
-+   blf_fa.AddBoundaryIntegrator(new MassIntegrator(coeff));
-+   blf_fa.Assemble();
-+   blf_fa.Finalize();
-+   blf_fa.Mult(x, y_fa);
-+
-+   BilinearForm blf_pa(&fes);
-+   blf_pa.SetAssemblyLevel(AssemblyLevel::PARTIAL);
-+   blf_pa.AddBoundaryIntegrator(new MassIntegrator(coeff));
-+   blf_pa.Assemble();
-+   blf_pa.Mult(x, y_pa);
-+
-+   y_fa -= y_pa;
-+
-+   REQUIRE(y_fa.Normlinf() == MFEM_Approx(0.0));
-+}
-+
- } // namespace pa_kernels
diff --git a/palace/deps/patch/mfem/patch_submesh.diff b/palace/deps/patch/mfem/patch_submesh.diff
index 273e307f9..4274d3707 100644
--- a/palace/deps/patch/mfem/patch_submesh.diff
+++ b/palace/deps/patch/mfem/patch_submesh.diff
@@ -1,3 +1,27 @@
+diff --git a/CHANGELOG b/CHANGELOG
+index 1d0a1c166..aa3b60cdf 100644
+--- a/CHANGELOG
++++ b/CHANGELOG
+@@ -26,6 +26,9 @@ New and updated examples and miniapps
+   integrators are added in support of DPG systems: TraceIntegrator,
+   NormalTraceIntegrator and TangentTraceIntegrator.
+ 
++- Added new SubMesh examples demonstrating source terms and boundary conditions
++  transferred from SubMesh objects.
++
+ - Added a new H(div) solvers miniapp in miniapps/hdiv-linear-solver,
+   demonstrating the use of a matrix-free saddle-point solver methodology,
+   suitable for high-order discretizations and for GPU acceleration. Examples
+@@ -47,6 +50,9 @@ Discretization improvements
+ - Face restriction operators for Nedelec and Raviart-Thomas finite element
+   spaces are now supported through the ConformingFaceRestriction class.
+ 
++- SubMesh and ParSubMesh have been extended to support the transfer of
++  Nedelec and Raviart-Thomas finite element spaces.
++
+ - VectorFEBoundaryFluxLFIntegrator is now supported on device/GPU.
+ 
+ - Added support for p-refined meshes in FindPointsGSLIB.
 diff --git a/data/fichera-quad-mixed.mesh b/data/fichera-quad-mixed.mesh
 new file mode 100644
 index 000000000..a3458665a
@@ -431,7 +455,7 @@ index 000000000..cd82b4bf1
 +0.6608093135547 0.8704406864453
 +0.8704406864453 0.8704406864453
 diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt
-index afa98324c..16c3d0552 100644
+index 7d9c835c9..a89df6220 100644
 --- a/examples/CMakeLists.txt
 +++ b/examples/CMakeLists.txt
 @@ -77,6 +77,8 @@ if (MFEM_USE_MPI)
@@ -1839,14 +1863,23 @@ index 000000000..4abc2ed3a
 +   }
 +}
 diff --git a/examples/makefile b/examples/makefile
-index 85a22f832..7982a9f3e 100644
+index 85a22f832..1e3088f1a 100644
 --- a/examples/makefile
 +++ b/examples/makefile
+@@ -26,7 +26,7 @@ SEQ_EXAMPLES = ex0 ex1 ex2 ex3 ex4 ex5 ex6 ex7 ex8 ex9 ex10 ex14 ex15 ex16 \
+   ex31 ex33
+ PAR_EXAMPLES = ex0p ex1p ex2p ex3p ex4p ex5p ex6p ex7p ex8p ex9p ex10p ex11p \
+   ex12p ex13p ex14p ex15p ex16p ex17p ex18p ex19p ex20p ex21p ex22p ex24p \
+-  ex25p ex26p ex27p ex28p ex29p ex30p ex31p ex32p ex33p
++  ex25p ex26p ex27p ex28p ex29p ex30p ex31p ex32p ex33p ex34p ex35p
+ SEQ_DEVICE_EXAMPLES = ex1 ex3 ex4 ex5 ex6 ex9 ex22 ex24 ex25 ex26
+ PAR_DEVICE_EXAMPLES = ex1p ex2p ex3p ex4p ex5p ex6p ex7p ex9p ex13p ex22p \
+   ex24p ex25p ex26p
 @@ -183,3 +183,4 @@ clean-exec:
- 	@rm -f ex23.mesh ex23-*.gf
- 	@rm -f ex25.mesh ex25-*.gf ex25p-*.*
- 	@rm -rf ex28_* ex28p_*
-+	@rm -rf cond_mesh.* cond_j.* dsol.* port_mesh.* port_mode.*
+ 	@rm -f ex23.mesh ex23-*.gf
+ 	@rm -f ex25.mesh ex25-*.gf ex25p-*.*
+ 	@rm -rf ex28_* ex28p_*
++	@rm -rf cond_mesh.* cond_j.* dsol.* port_mesh.* port_mode.*
 diff --git a/fem/doftrans.cpp b/fem/doftrans.cpp
 index 95da3859d..06355ce75 100644
 --- a/fem/doftrans.cpp
@@ -2406,10 +2439,10 @@ index 95da3859d..06355ce75 100644
     }
  }
 diff --git a/fem/doftrans.hpp b/fem/doftrans.hpp
-index 9375246b5..fbe57bb2b 100644
+index 9375246b5..a1ddb3399 100644
 --- a/fem/doftrans.hpp
 +++ b/fem/doftrans.hpp
-@@ -15,19 +15,19 @@
+@@ -15,19 +15,31 @@
  #include "../config/config.hpp"
  #include "../linalg/linalg.hpp"
  #include "intrules.hpp"
@@ -2421,11 +2454,25 @@ index 9375246b5..fbe57bb2b 100644
 -/** The DofTransformation class is an abstract base class for a family of
 -    transformations that map local degrees of freedom (DoFs), contained within
 -    individual elements, to global degrees of freedom, stored within
+-    GridFunction objects. These transformations are necessary to ensure that
+-    basis functions in neighboring elements align correctly. Closely related but
 +/** The StatelessDofTransformation class is an abstract base class for a family
 +    of transformations that map local degrees of freedom (DoFs), contained
 +    within individual elements, to global degrees of freedom, stored within
-     GridFunction objects. These transformations are necessary to ensure that
-     basis functions in neighboring elements align correctly. Closely related but
++    GridFunction objects.
++
++    In this context "stateless" means that the concrete classes derived from
++    StatelessDofTransformation do not store information about the relative
++    orientations of the faces with respect to their neighboring elements. In
++    other words there is no information specific to a particular element (aside
++    from the element type e.g. tetrahedron, wedge, or pyramid). The
++    StatelessDofTransformation provides access to the transformation operators
++    for specific relative face orientations. These are useful, for example, when
++    relating DoFs associated with distinct overlapping meshes such as parent and
++    sub-meshes.
++
++    These transformations are necessary to ensure that basis functions in
++    neighboring (or overlapping) elements align correctly. Closely related but
      complementary transformations are required for the entries stored in
 -    LinearForm and BilinearForm objects. The DofTransformation class is designed
 -    to apply the action of both of these types of DoF transformations.
@@ -2435,7 +2482,7 @@ index 9375246b5..fbe57bb2b 100644
  
      Let the "primal transformation" be given by the operator T. This means that
      given a local element vector v the data that must be placed into a
-@@ -53,24 +53,87 @@ namespace mfem
+@@ -53,24 +65,87 @@ namespace mfem
      D_t = T * D * T^{-1}. This can be accomplished by using a primal
      transformation on the columns of D and a dual transformation on its rows.
  */
@@ -2528,7 +2575,7 @@ index 9375246b5..fbe57bb2b 100644
     /** @brief Configure the transformation using face orientations for the
         current element. */
     /// The face_orientation array can be obtained from Mesh::GetElementFaces.
-@@ -79,42 +142,82 @@ public:
+@@ -79,42 +154,82 @@ public:
  
     inline const Array<int> & GetFaceOrientations() const { return Fo; }
  
@@ -2625,7 +2672,7 @@ index 9375246b5..fbe57bb2b 100644
  };
  
  /** Transform a matrix of DoFs entries from different finite element spaces as
-@@ -133,66 +236,145 @@ void TransformDual(const DofTransformation *ran_dof_trans,
+@@ -133,66 +248,145 @@ void TransformDual(const DofTransformation *ran_dof_trans,
                     const DofTransformation *dom_dof_trans,
                     DenseMatrix &elmat);
  
@@ -2797,7 +2844,7 @@ index 9375246b5..fbe57bb2b 100644
  };
  
  /** Abstract base class for high-order Nedelec spaces on elements with
-@@ -207,17 +389,22 @@ public:
+@@ -207,17 +401,22 @@ public:
      be accessed as DenseMatrices using the GetFaceTransform() and
      GetFaceInverseTransform() methods.
  */
@@ -2826,7 +2873,7 @@ index 9375246b5..fbe57bb2b 100644
  
  public:
     // Return the 2x2 transformation operator for the given face orientation
-@@ -226,67 +413,119 @@ public:
+@@ -226,67 +425,119 @@ public:
     // Return the 2x2 inverse transformation operator
     static const DenseMatrix & GetFaceInverseTransform(int ori)
     { return TInv(ori); }
@@ -3151,7 +3198,7 @@ index 6b05c5f31..86c1f4e66 100644
     virtual int GetContType() const { return TANGENTIAL; }
     FiniteElementCollection *GetTraceCollection() const;
 diff --git a/fem/fespace.hpp b/fem/fespace.hpp
-index f777bf871..47ddbf438 100644
+index c30299bfe..e2c495506 100644
 --- a/fem/fespace.hpp
 +++ b/fem/fespace.hpp
 @@ -377,17 +377,6 @@ protected:
@@ -3205,10 +3252,10 @@ index 6f0af11c6..51f0df2f6 100644
              j_offd[i_offd[i] + 1] = j_offd[i_offd[i]] + 1;
              d_offd[i_offd[i]] = T[0]; d_offd[i_offd[i] + 1] = T[2];
 diff --git a/linalg/hypre.cpp b/linalg/hypre.cpp
-index d7069d4e6..04072c420 100644
+index 689ce675e..7c1a463be 100644
 --- a/linalg/hypre.cpp
 +++ b/linalg/hypre.cpp
-@@ -5349,20 +5349,8 @@ void HypreAMS::MakeGradientAndInterpolation(
+@@ -5340,20 +5340,8 @@ void HypreAMS::MakeGradientAndInterpolation(
     rt_trace_space = dynamic_cast<const RT_Trace_FECollection*>(edge_fec);
     trace_space = trace_space || rt_trace_space;
  
@@ -3231,7 +3278,7 @@ index d7069d4e6..04072c420 100644
  
     ParMesh *pmesh = edge_fespace->GetParMesh();
     if (rt_trace_space)
-@@ -5751,19 +5739,9 @@ void HypreADS::MakeDiscreteMatrices(ParFiniteElementSpace *face_fespace)
+@@ -5742,19 +5730,9 @@ void HypreADS::MakeDiscreteMatrices(ParFiniteElementSpace *face_fespace)
     const FiniteElementCollection *face_fec = face_fespace->FEColl();
     bool trace_space =
        (dynamic_cast<const RT_Trace_FECollection*>(face_fec) != NULL);
@@ -3255,10 +3302,10 @@ index d7069d4e6..04072c420 100644
     // define the nodal and edge finite element spaces associated with face_fespace
     ParMesh *pmesh = (ParMesh *) face_fespace->GetMesh();
 diff --git a/mesh/mesh.cpp b/mesh/mesh.cpp
-index de2d80f15..851a61744 100644
+index 87e606510..296accf53 100644
 --- a/mesh/mesh.cpp
 +++ b/mesh/mesh.cpp
-@@ -3013,6 +3013,10 @@ void Mesh::FinalizeTopology(bool generate_bdr)
+@@ -3020,6 +3020,10 @@ void Mesh::FinalizeTopology(bool generate_bdr)
     if (Dim == 1)
     {
        GenerateFaces();
@@ -3269,7 +3316,7 @@ index de2d80f15..851a61744 100644
     }
  
     if (ncmesh)
-@@ -5653,13 +5657,54 @@ int Mesh::GetTriOrientation(const int *base, const int *test)
+@@ -5660,13 +5664,54 @@ int Mesh::GetTriOrientation(const int *base, const int *test)
     for (int j = 0; j < 3; j++)
        if (test[aor[j]] != base[j])
        {
@@ -3325,7 +3372,7 @@ index de2d80f15..851a61744 100644
  int Mesh::GetQuadOrientation(const int *base, const int *test)
  {
     int i;
-@@ -5708,6 +5753,37 @@ int Mesh::GetQuadOrientation(const int *base, const int *test)
+@@ -5715,6 +5760,37 @@ int Mesh::GetQuadOrientation(const int *base, const int *test)
     return 2*i+1;
  }
  
@@ -3363,7 +3410,7 @@ index de2d80f15..851a61744 100644
  int Mesh::GetTetOrientation(const int *base, const int *test)
  {
     // Static method.
-@@ -6523,9 +6599,9 @@ const Table & Mesh::ElementToEdgeTable() const
+@@ -6530,9 +6606,9 @@ const Table & Mesh::ElementToEdgeTable() const
  
  void Mesh::AddPointFaceElement(int lf, int gf, int el)
  {
@@ -3376,10 +3423,10 @@ index de2d80f15..851a61744 100644
        faces_info[gf].Elem1Inf = 64 * lf; // face lf with orientation 0
        faces_info[gf].Elem2No  = -1; // in case there's no other side
 diff --git a/mesh/mesh.hpp b/mesh/mesh.hpp
-index 3e0590067..64bb85efc 100644
+index 64cf55ae4..06f4356a7 100644
 --- a/mesh/mesh.hpp
 +++ b/mesh/mesh.hpp
-@@ -472,8 +472,30 @@ protected:
+@@ -468,8 +468,30 @@ protected:
  
     /// Returns the orientation of "test" relative to "base"
     static int GetTriOrientation (const int * base, const int * test);
@@ -3411,10 +3458,23 @@ index 3e0590067..64bb85efc 100644
     static int GetTetOrientation (const int * base, const int * test);
  
 diff --git a/mesh/submesh/psubmesh.cpp b/mesh/submesh/psubmesh.cpp
-index b316d8b41..cf3e23d7d 100644
+index b316d8b41..1de148a76 100644
 --- a/mesh/submesh/psubmesh.cpp
 +++ b/mesh/submesh/psubmesh.cpp
-@@ -84,7 +84,8 @@ ParSubMesh::ParSubMesh(const ParMesh &parent, SubMesh::From from,
+@@ -38,10 +38,8 @@ ParSubMesh ParSubMesh::CreateFromBoundary(const ParMesh &parent,
+ ParSubMesh::ParSubMesh(const ParMesh &parent, SubMesh::From from,
+                        Array<int> &attributes) : parent_(parent), from_(from), attributes_(attributes)
+ {
+-   if (Nonconforming())
+-   {
+-      MFEM_ABORT("SubMesh does not support non-conforming meshes");
+-   }
++   MFEM_VERIFY(from == SubMesh::From::Boundary || !Nonconforming(),
++               "ParSubMesh does not support non-conforming meshes with From::Domain");
+ 
+    MyComm = parent.GetComm();
+    NRanks = parent.GetNRanks();
+@@ -84,7 +82,8 @@ ParSubMesh::ParSubMesh(const ParMesh &parent, SubMesh::From from,
        GetEdgeVertices(i, lv);
  
        // Find vertices/edge in parent mesh
@@ -3424,7 +3484,7 @@ index b316d8b41..cf3e23d7d 100644
        parent_edge_ids_.Append(parent_edge_id);
     }
  
-@@ -106,6 +107,72 @@ ParSubMesh::ParSubMesh(const ParMesh &parent, SubMesh::From from,
+@@ -106,6 +105,72 @@ ParSubMesh::ParSubMesh(const ParMesh &parent, SubMesh::From from,
        {
           parent_to_submesh_face_ids_[parent_face_ids_[i]] = i;
        }
@@ -3497,7 +3557,7 @@ index b316d8b41..cf3e23d7d 100644
     }
  
     ListOfIntegerSets groups;
-@@ -145,7 +212,7 @@ ParSubMesh::ParSubMesh(const ParMesh &parent, SubMesh::From from,
+@@ -145,7 +210,7 @@ ParSubMesh::ParSubMesh(const ParMesh &parent, SubMesh::From from,
     {
        BuildFaceGroup(ngroups, rht, nstrias, rhq, nsquads);
     }
@@ -3506,7 +3566,7 @@ index b316d8b41..cf3e23d7d 100644
     {
        group_stria.MakeI(ngroups);
        group_stria.MakeJ();
-@@ -167,7 +234,9 @@ ParSubMesh::ParSubMesh(const ParMesh &parent, SubMesh::From from,
+@@ -167,7 +232,9 @@ ParSubMesh::ParSubMesh(const ParMesh &parent, SubMesh::From from,
  
     // Add boundaries
     {
@@ -3517,7 +3577,7 @@ index b316d8b41..cf3e23d7d 100644
        Array<int> &be2face = (Dim == 2) ? be_to_edge : be_to_face;
  
        if (Dim == 3)
-@@ -190,9 +259,11 @@ ParSubMesh::ParSubMesh(const ParMesh &parent, SubMesh::From from,
+@@ -190,9 +257,11 @@ ParSubMesh::ParSubMesh(const ParMesh &parent, SubMesh::From from,
        boundary.SetSize(NumOfBdrElements);
        be2face.SetSize(NumOfBdrElements);
        Array<int> parent_face_to_be;
@@ -3529,7 +3589,7 @@ index b316d8b41..cf3e23d7d 100644
        }
        for (int i = 0, j = 0; i < num_of_faces_or_edges; i++)
        {
-@@ -209,7 +280,7 @@ ParSubMesh::ParSubMesh(const ParMesh &parent, SubMesh::From from,
+@@ -209,7 +278,7 @@ ParSubMesh::ParSubMesh(const ParMesh &parent, SubMesh::From from,
                 }
                 else
                 {
@@ -3538,7 +3598,7 @@ index b316d8b41..cf3e23d7d 100644
                 }
              }
              else
-@@ -743,9 +814,14 @@ void ParSubMesh::BuildSharedEdgesMapping(const int sedges_ct,
+@@ -743,9 +812,14 @@ void ParSubMesh::BuildSharedEdgesMapping(const int sedges_ct,
           else
           {
              Array<int> vert;
@@ -3556,7 +3616,7 @@ index b316d8b41..cf3e23d7d 100644
              sedge_ledge.Append(submesh_edge_id);
           }
        }
-@@ -760,36 +836,61 @@ void ParSubMesh::BuildSharedFacesMapping(const int nstrias,
+@@ -760,36 +834,61 @@ void ParSubMesh::BuildSharedFacesMapping(const int nstrias,
     shared_quads.Reserve(nsquads);
     sface_lface.Reserve(nstrias + nsquads);
  
@@ -3631,7 +3691,7 @@ index b316d8b41..cf3e23d7d 100644
           {
              // parent shared face is not in SubMesh or is not shared
           }
-@@ -798,7 +899,7 @@ void ParSubMesh::BuildSharedFacesMapping(const int nstrias,
+@@ -798,7 +897,7 @@ void ParSubMesh::BuildSharedFacesMapping(const int nstrias,
              Array<int> vert;
              GetFaceVertices(submesh_face_id, vert);
  
@@ -3641,7 +3701,7 @@ index b316d8b41..cf3e23d7d 100644
           }
        }
 diff --git a/mesh/submesh/psubmesh.hpp b/mesh/submesh/psubmesh.hpp
-index 58acd1d4f..8d35bc451 100644
+index 8c71c7181..651be6243 100644
 --- a/mesh/submesh/psubmesh.hpp
 +++ b/mesh/submesh/psubmesh.hpp
 @@ -128,6 +128,16 @@ public:
@@ -3944,10 +4004,23 @@ index bd7a7bd6c..683cc5bed 100644
  
     /// Temporary vector
 diff --git a/mesh/submesh/submesh.cpp b/mesh/submesh/submesh.cpp
-index ac2058c57..c9f0b8508 100644
+index ac2058c57..76d4456c5 100644
 --- a/mesh/submesh/submesh.cpp
 +++ b/mesh/submesh/submesh.cpp
-@@ -61,6 +61,7 @@ SubMesh::SubMesh(const Mesh &parent, From from,
+@@ -31,10 +31,8 @@ SubMesh SubMesh::CreateFromBoundary(const Mesh &parent,
+ SubMesh::SubMesh(const Mesh &parent, From from,
+                  Array<int> attributes) : parent_(parent), from_(from), attributes_(attributes)
+ {
+-   if (Nonconforming())
+-   {
+-      MFEM_ABORT("SubMesh does not support non-conforming meshes");
+-   }
++   MFEM_VERIFY(from == From::Boundary || !Nonconforming(),
++               "SubMesh does not support non-conforming meshes with From::Domain");
+ 
+    if (from == From::Domain)
+    {
+@@ -61,6 +59,7 @@ SubMesh::SubMesh(const Mesh &parent, From from,
                                                      parent_element_ids_);
  
        Array<int> parent_face_to_be = parent.GetFaceToBdrElMap();
@@ -3955,7 +4028,7 @@ index ac2058c57..c9f0b8508 100644
  
        for (int i = 0; i < NumOfBdrElements; i++)
        {
-@@ -75,7 +76,73 @@ SubMesh::SubMesh(const Mesh &parent, From from,
+@@ -75,7 +74,73 @@ SubMesh::SubMesh(const Mesh &parent, From from,
              // This case happens when a domain is extracted, but the root parent
              // mesh didn't have a boundary element on the surface that defined
              // it's boundary. It still creates a valid mesh, so we allow it.
@@ -5120,7 +5193,7 @@ index 02a98f628..c5057338a 100644
     multidomain_test_3d(fec_type);
  }
 diff --git a/tests/unit/mesh/test_submesh.cpp b/tests/unit/mesh/test_submesh.cpp
-index 590f706d2..325f51977 100644
+index 590f706d2..05751376c 100644
 --- a/tests/unit/mesh/test_submesh.cpp
 +++ b/tests/unit/mesh/test_submesh.cpp
 @@ -19,6 +19,7 @@ using namespace mfem;
@@ -5141,17 +5214,28 @@ index 590f706d2..325f51977 100644
        case L2:
           return new L2_FECollection(p, dim, BasisType::GaussLobatto);
           break;
-@@ -56,7 +60,8 @@ void test_2d(Element::Type element_type,
+@@ -56,12 +60,19 @@ void test_2d(Element::Type element_type,
               SubMesh::From from)
  {
     constexpr int dim = 2;
 -   const int vdim = (field_type == FieldType::SCALAR) ? 1 : dim;
 +   const int vdim = (field_type == FieldType::SCALAR ||
 +                     fec_type == ND) ? 1 : dim;
++   const bool nonconforming = true;
     double Hy = 1.0;
     Mesh mesh = Mesh::MakeCartesian2D(5, 5, element_type, true, 1.0, Hy, false);
  
-@@ -176,7 +181,7 @@ void test_2d(Element::Type element_type,
+    if (from == SubMesh::From::Boundary)
+    {
++      if (nonconforming)
++      {
++         mesh.EnsureNCMesh();
++         mesh.RandomRefinement(0.5);
++      }
+       for (int i = 0; i < mesh.GetNBE(); i++)
+       {
+          Element *el = mesh.GetBdrElement(i);
+@@ -176,7 +187,7 @@ void test_2d(Element::Type element_type,
     {
        GridFunction sub_ex_gf(&sub_fes);
  
@@ -5160,7 +5244,7 @@ index 590f706d2..325f51977 100644
        {
           parent_gf.ProjectCoefficient(coeff);
           sub_ex_gf.ProjectCoefficient(coeff);
-@@ -188,6 +193,8 @@ void test_2d(Element::Type element_type,
+@@ -188,6 +199,8 @@ void test_2d(Element::Type element_type,
        }
        SubMesh::Transfer(parent_gf, sub_gf);
  
@@ -5169,7 +5253,7 @@ index 590f706d2..325f51977 100644
        sub_gf -= sub_ex_gf;
        REQUIRE(sub_gf.Norml2() < 1e-10);
     }
-@@ -195,7 +202,7 @@ void test_2d(Element::Type element_type,
+@@ -195,7 +208,7 @@ void test_2d(Element::Type element_type,
     {
        GridFunction parent_ex_gf(&parent_fes);
  
@@ -5178,7 +5262,7 @@ index 590f706d2..325f51977 100644
        {
           parent_gf.ProjectCoefficient(coeff);
           sub_gf.ProjectCoefficient(coeff);
-@@ -210,6 +217,8 @@ void test_2d(Element::Type element_type,
+@@ -210,6 +223,8 @@ void test_2d(Element::Type element_type,
  
        SubMesh::Transfer(sub_gf, parent_gf);
  
@@ -5187,17 +5271,28 @@ index 590f706d2..325f51977 100644
        parent_gf -= parent_ex_gf;
        REQUIRE(parent_gf.Norml2() < 1e-10);
     }
-@@ -227,7 +236,8 @@ void test_3d(Element::Type element_type,
+@@ -227,12 +242,19 @@ void test_3d(Element::Type element_type,
               SubMesh::From from)
  {
     constexpr int dim = 3;
 -   const int vdim = (field_type == FieldType::SCALAR) ? 1 : dim;
 +   const int vdim = (field_type == FieldType::SCALAR ||
 +                     fec_type == ND) ? 1 : dim;
++   const bool nonconforming = true;
     double Hy = 1.0;
     Mesh mesh = Mesh::MakeCartesian3D(5, 5, 5, element_type, 1.0, Hy, 1.0, false);
  
-@@ -351,7 +361,7 @@ void test_3d(Element::Type element_type,
+    if (from == SubMesh::From::Boundary)
+    {
++      if (nonconforming)
++      {
++         mesh.EnsureNCMesh();
++         mesh.RandomRefinement(0.5);
++      }
+       for (int i = 0; i < mesh.GetNBE(); i++)
+       {
+          Element *el = mesh.GetBdrElement(i);
+@@ -351,7 +373,7 @@ void test_3d(Element::Type element_type,
     {
        GridFunction sub_ex_gf(&sub_fes);
  
@@ -5206,7 +5301,7 @@ index 590f706d2..325f51977 100644
        {
           parent_gf.ProjectCoefficient(coeff);
           sub_ex_gf.ProjectCoefficient(coeff);
-@@ -372,7 +382,7 @@ void test_3d(Element::Type element_type,
+@@ -372,7 +394,7 @@ void test_3d(Element::Type element_type,
     {
        GridFunction parent_ex_gf(&parent_fes);
  
@@ -5215,7 +5310,7 @@ index 590f706d2..325f51977 100644
        {
           parent_gf.ProjectCoefficient(coeff);
           sub_gf.ProjectCoefficient(coeff);
-@@ -401,13 +411,17 @@ TEST_CASE("SubMesh", "[SubMesh]")
+@@ -401,13 +423,17 @@ TEST_CASE("SubMesh", "[SubMesh]")
  {
     int polynomial_order = 4;
     int mesh_polynomial_order = 2;
@@ -5234,7 +5329,7 @@ index 590f706d2..325f51977 100644
     SECTION("2D")
     {
        auto element = GENERATE(Element::QUADRILATERAL, Element::TRIANGLE);
-@@ -421,7 +435,8 @@ TEST_CASE("SubMesh", "[SubMesh]")
+@@ -421,7 +447,8 @@ TEST_CASE("SubMesh", "[SubMesh]")
  
     SECTION("3D")
     {