Merge branch 'main' into inclusion_map
schnellerhase authored Jan 14, 2025
2 parents 8291322 + 477076e commit 7261e23
Showing 18 changed files with 1,219 additions and 660 deletions.
4 changes: 3 additions & 1 deletion cpp/dolfinx/fem/CoordinateElement.h
@@ -78,7 +78,9 @@ class CoordinateElement
/// compute. Use 0 for the basis functions only
/// @param[in] num_points Number of points at which to evaluate the
/// basis functions.
/// @return Shape of the array to be filled by `tabulate`, where (0)
/// is the derivative index, (1) is the point index, (2) is the basis
/// function index and (3) is the basis function component.
std::array<std::size_t, 4> tabulate_shape(std::size_t nd,
std::size_t num_points) const;
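
For orientation, a minimal sketch of how the returned shape is consumed, assuming the `tabulate(nd, X, shape, basis)` companion method declared in this header; the free function and its names are illustrative:

// Sketch: size the buffer filled by tabulate() from tabulate_shape().
// Assumes the tabulate(nd, X, shape, basis) method declared in this
// header; the helper function and its names are illustrative.
#include <array>
#include <cstddef>
#include <span>
#include <vector>
#include <dolfinx/fem/CoordinateElement.h>

std::vector<double>
tabulate_first_derivatives(const dolfinx::fem::CoordinateElement<double>& cmap,
                           std::span<const double> X, std::size_t tdim)
{
  std::size_t num_points = X.size() / tdim; // X: (num_points, tdim), row-major
  std::array<std::size_t, 4> shape = cmap.tabulate_shape(1, num_points);
  std::vector<double> phi(shape[0] * shape[1] * shape[2] * shape[3]);
  cmap.tabulate(1, X, {num_points, tdim}, phi);
  // phi is indexed [derivative][point][basis function][component],
  // matching (0)-(3) in the documentation above
  return phi;
}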

379 changes: 306 additions & 73 deletions cpp/dolfinx/fem/discreteoperators.h

Large diffs are not rendered by default.

274 changes: 122 additions & 152 deletions cpp/dolfinx/fem/interpolate.h

Large diffs are not rendered by default.

286 changes: 183 additions & 103 deletions cpp/dolfinx/la/MatrixCSR.h
@@ -7,6 +7,7 @@
#pragma once

#include "SparsityPattern.h"
#include "Vector.h"
#include "matrix_csr_impl.h"
#include <algorithm>
#include <dolfinx/common/IndexMap.h>
@@ -288,7 +289,31 @@ class MatrixCSR
/// expanded.
/// @return Dense copy of the part of the matrix on the calling rank.
/// Storage is row-major.
std::vector<value_type> to_dense() const
{
const std::size_t nrows = num_all_rows();
const std::size_t ncols = _index_maps[1]->size_global();
std::vector<value_type> A(nrows * ncols * _bs[0] * _bs[1], 0.0);
for (std::size_t r = 0; r < nrows; ++r)
{
for (std::int32_t j = _row_ptr[r]; j < _row_ptr[r + 1]; ++j)
{
for (int i0 = 0; i0 < _bs[0]; ++i0)
{
for (int i1 = 0; i1 < _bs[1]; ++i1)
{
std::array<std::int32_t, 1> local_col{_cols[j]};
std::array<std::int64_t, 1> global_col{0};
_index_maps[1]->local_to_global(local_col, global_col);
// Row-major dense layout: row (r * _bs[0] + i0), column
// (global_col[0] * _bs[1] + i1), row stride ncols * _bs[1]
A[(r * _bs[0] + i0) * ncols * _bs[1] + global_col[0] * _bs[1] + i1]
    = _data[j * _bs[0] * _bs[1] + i0 * _bs[1] + i1];
}
}
}
}

return A;
}
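
Since the dense copy costs O(nrows x ncols) storage, `to_dense` is best reserved for small test problems. A sketch of inspecting the result, assuming block size 1; the helper below is illustrative:

// Sketch: print the local dense copy of a small matrix (block size 1
// assumed); the helper is illustrative.
#include <cstddef>
#include <iostream>

template <typename Mat> // e.g. la::MatrixCSR<double>
void print_dense(const Mat& A)
{
  auto dense = A.to_dense();
  std::size_t nrows = A.num_all_rows();
  std::size_t ncols = dense.size() / nrows; // storage is row-major
  for (std::size_t i = 0; i < nrows; ++i)
  {
    for (std::size_t j = 0; j < ncols; ++j)
      std::cout << dense[i * ncols + j] << " ";
    std::cout << "\n";
  }
}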

/// @brief Transfer ghost row data to the owning ranks accumulating
/// received values on the owned rows, and zeroing any existing data
@@ -309,18 +334,102 @@
/// must not be changed.
/// @note This function does not change the matrix data. Data update
/// only occurs with `scatter_rev_end()`.
void scatter_rev_begin()
{
const std::int32_t local_size0 = _index_maps[0]->size_local();
const std::int32_t num_ghosts0 = _index_maps[0]->num_ghosts();
const int bs2 = _bs[0] * _bs[1];

// For each ghost row, pack values to send to the neighbourhood
std::vector<int> insert_pos = _val_send_disp;
_ghost_value_data.resize(_val_send_disp.back());
for (int i = 0; i < num_ghosts0; ++i)
{
const int rank = _ghost_row_to_rank[i];

// Get position in send buffer to place data to send to this
// neighbour
const std::int32_t val_pos = insert_pos[rank];
std::copy(std::next(_data.data(), _row_ptr[local_size0 + i] * bs2),
std::next(_data.data(), _row_ptr[local_size0 + i + 1] * bs2),
std::next(_ghost_value_data.begin(), val_pos));
insert_pos[rank]
+= bs2 * (_row_ptr[local_size0 + i + 1] - _row_ptr[local_size0 + i]);
}

_ghost_value_data_in.resize(_val_recv_disp.back());

// Compute data sizes for send and receive from displacements
std::vector<int> val_send_count(_val_send_disp.size() - 1);
std::adjacent_difference(std::next(_val_send_disp.begin()),
_val_send_disp.end(), val_send_count.begin());

std::vector<int> val_recv_count(_val_recv_disp.size() - 1);
std::adjacent_difference(std::next(_val_recv_disp.begin()),
_val_recv_disp.end(), val_recv_count.begin());

int status = MPI_Ineighbor_alltoallv(
_ghost_value_data.data(), val_send_count.data(), _val_send_disp.data(),
dolfinx::MPI::mpi_t<value_type>, _ghost_value_data_in.data(),
val_recv_count.data(), _val_recv_disp.data(),
dolfinx::MPI::mpi_t<value_type>, _comm.comm(), &_request);
assert(status == MPI_SUCCESS);
}
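
The two `std::adjacent_difference` calls above recover per-rank counts from the exclusive-scan displacement arrays. The idiom in isolation, as a self-contained illustration:

// Standalone illustration of the displacement -> count idiom used above.
#include <cassert>
#include <iterator>
#include <numeric>
#include <vector>

int main()
{
  // disp[r] is where rank r's data starts; disp.back() is the total size,
  // so rank 0 holds 3 values, rank 1 holds 4 and rank 2 holds 5.
  std::vector<int> disp = {0, 3, 7, 12};
  std::vector<int> count(disp.size() - 1);
  std::adjacent_difference(std::next(disp.begin()), disp.end(), count.begin());
  assert((count == std::vector<int>{3, 4, 5}));
  return 0;
}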

/// @brief End transfer of ghost row data to owning ranks.
/// @note Must be preceded by MatrixCSR::scatter_rev_begin().
/// @note Matrix data received from other processes will be
/// accumulated into locally owned rows, and ghost rows will be
/// zeroed.
void scatter_rev_end()
{
int status = MPI_Wait(&_request, MPI_STATUS_IGNORE);
assert(status == MPI_SUCCESS);

_ghost_value_data.clear();
_ghost_value_data.shrink_to_fit();

// Add to local rows
const int bs2 = _bs[0] * _bs[1];
assert(_ghost_value_data_in.size() == _unpack_pos.size() * bs2);
for (std::size_t i = 0; i < _unpack_pos.size(); ++i)
for (int j = 0; j < bs2; ++j)
_data[_unpack_pos[i] * bs2 + j] += _ghost_value_data_in[i * bs2 + j];

_ghost_value_data_in.clear();
_ghost_value_data_in.shrink_to_fit();

// Set ghost row data to zero
const std::int32_t local_size0 = _index_maps[0]->size_local();
std::fill(std::next(_data.begin(), _row_ptr[local_size0] * bs2),
_data.end(), 0);
}
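
Together, the begin/end pair lets communication overlap with unrelated work at the end of assembly. A sketch of the calling pattern; the helper name is illustrative:

// Sketch: reverse-scatter pattern after inserting into owned and ghost
// rows during assembly (helper name is illustrative).
template <typename Mat> // e.g. la::MatrixCSR<double>
void finalize_assembly(Mat& A)
{
  A.scatter_rev_begin(); // start the non-blocking neighbourhood exchange
  // ... work not touching A's data can overlap with communication here ...
  A.scatter_rev_end(); // accumulate into owned rows, zero ghost rows
}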

/// @brief Compute the Frobenius norm squared across all processes.
/// @note MPI Collective
double squared_norm() const
{
const std::size_t num_owned_rows = _index_maps[0]->size_local();
const int bs2 = _bs[0] * _bs[1];
assert(num_owned_rows < _row_ptr.size());
double norm_sq_local = std::accumulate(
_data.cbegin(),
std::next(_data.cbegin(), _row_ptr[num_owned_rows] * bs2), double(0),
[](auto norm, value_type y) { return norm + std::norm(y); });
double norm_sq;
MPI_Allreduce(&norm_sq_local, &norm_sq, 1, MPI_DOUBLE, MPI_SUM,
_comm.comm());
return norm_sq;
}
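
The Frobenius norm itself is the square root of the returned value; since the reduction is collective, every rank must make the call. A minimal sketch, assuming an assembled matrix `A`:

// Sketch: Frobenius norm from the collective squared norm. All ranks
// must call, and all receive the same value. Requires <cmath>.
double frob_norm = std::sqrt(A.squared_norm());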

/// @brief Compute the product `y += Ax`.
///
/// The vectors `x` and `y` must have parallel layouts that are
/// compatible with `A`.
///
/// @param[in] x Vector to apply `A` to.
/// @param[in,out] y Vector to accumulate the result into.
void mult(Vector<value_type>& x, Vector<value_type>& y);
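
A usage sketch, building the vectors from the matrix's own index maps and block sizes so the layouts are compatible. This assumes the `index_maps()` and `block_size()` accessors of this class and a `double` scalar type; the helper is illustrative:

// Sketch: y += A x with vector layouts taken from the matrix itself.
// Assumes the index_maps() and block_size() accessors of this class.
void apply_once(la::MatrixCSR<double>& A)
{
  la::Vector<double> x(A.index_maps()[1], A.block_size()[1]);
  la::Vector<double> y(A.index_maps()[0], A.block_size()[0]);
  x.set(1.0); // illustrative input
  y.set(0.0);
  A.mult(x, y); // updates the ghost values of x internally, accumulates into y
}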

/// @brief Index maps for the row and column space.
///
@@ -424,7 +533,7 @@ MatrixCSR<U, V, W, X>::MatrixCSR(const SparsityPattern& p, BlockMode mode)
for (int i = 0; i < 2; ++i)
{
const auto im = _index_maps[i];
const std::int32_t size_local = im->size_local() * _bs[i];
std::span ghost_i = im->ghosts();
std::vector<std::int64_t> ghosts;
const std::vector<int> ghost_owner_i(im->owners().begin(),
@@ -438,7 +547,8 @@ MatrixCSR<U, V, W, X>::MatrixCSR(const SparsityPattern& p, BlockMode mode)
src_rank.push_back(ghost_owner_i[j]);
}
}

std::array<std::vector<int>, 2> src_dest0
= {std::vector(_index_maps[i]->src().begin(),
_index_maps[i]->src().end()),
std::vector(_index_maps[i]->dest().begin(),
@@ -510,7 +620,7 @@ MatrixCSR<U, V, W, X>::MatrixCSR(const SparsityPattern& p, BlockMode mode)
{
auto it = std::ranges::lower_bound(src_ranks, r);
assert(it != src_ranks.end() and *it == r);
std::size_t pos = std::distance(src_ranks.begin(), it);
_ghost_row_to_rank.push_back(pos);
}

@@ -535,7 +645,7 @@ MatrixCSR<U, V, W, X>::MatrixCSR(const SparsityPattern& p, BlockMode mode)
for (std::size_t i = 0; i < _ghost_row_to_rank.size(); ++i)
{
const int rank = _ghost_row_to_rank[i];
std::int32_t row_id = local_size[0] + i;
for (int j = _row_ptr[row_id]; j < _row_ptr[row_id + 1]; ++j)
{
// Get position in send buffer
@@ -629,109 +739,79 @@ MatrixCSR<U, V, W, X>::MatrixCSR(const SparsityPattern& p, BlockMode mode)
}
}
//-----------------------------------------------------------------------------
// The matrix A is distributed across P processes by blocks of rows:
//  A = |   A_0  |
//      |   A_1  |
//      |   ...  |
//      |  A_P-1 |
//
// Each submatrix A_i is owned by a single process "i" and can be further
// decomposed into diagonal (Ai[0]) and off-diagonal (Ai[1]) blocks:
//  Ai = |Ai[0] Ai[1]|
//
// If A is square, the diagonal block Ai[0] is also square and contains
// only owned columns and rows. The block Ai[1] contains ghost columns
// (unowned dofs).

// Likewise, a local vector x can be decomposed into owned and ghost blocks:
// xi = | x[0] |
//      | x[1] |
//
// So the product y = Ax can be computed in two separate steps:
//  y[0] = |Ai[0] Ai[1]| | x[0] | = Ai[0] x[0] + Ai[1] x[1]
//                       | x[1] |
//
/// Computes y += A*x for a parallel CSR matrix A and parallel dense vectors
/// x, y
template <typename Scalar, typename V, typename W, typename X>
void MatrixCSR<Scalar, V, W, X>::mult(la::Vector<Scalar>& x,
                                      la::Vector<Scalar>& y)
{
  // start communication (update ghosts)
  x.scatter_fwd_begin();

  const std::int32_t nrowslocal = num_owned_rows();
  std::span<const std::int64_t> Arow_ptr(row_ptr().data(), nrowslocal + 1);
  std::span<const std::int32_t> Acols(cols().data(), Arow_ptr[nrowslocal]);
  std::span<const std::int64_t> Aoff_diag_offset(off_diag_offset().data(),
                                                 nrowslocal);
  std::span<const Scalar> Avalues(values().data(), Arow_ptr[nrowslocal]);

  std::span<const Scalar> _x = x.array();
  std::span<Scalar> _y = y.mutable_array();

  std::span<const std::int64_t> Arow_begin(Arow_ptr.data(), nrowslocal);
  std::span<const std::int64_t> Arow_end(Arow_ptr.data() + 1, nrowslocal);

  // First stage: spmv - diagonal
  // yi[0] += Ai[0] * xi[0]
  if (_bs[1] == 1)
  {
    impl::spmv<Scalar, 1>(Avalues, Arow_begin, Aoff_diag_offset, Acols, _x, _y,
                          _bs[0], 1);
  }
  else
  {
    impl::spmv<Scalar, -1>(Avalues, Arow_begin, Aoff_diag_offset, Acols, _x, _y,
                           _bs[0], _bs[1]);
  }

  // finalize ghost update
  x.scatter_fwd_end();

  // Second stage: spmv - off-diagonal
  // yi[0] += Ai[1] * xi[1]
  if (_bs[1] == 1)
  {
    impl::spmv<Scalar, 1>(Avalues, Aoff_diag_offset, Arow_end, Acols, _x, _y,
                          _bs[0], 1);
  }
  else
  {
    impl::spmv<Scalar, -1>(Avalues, Aoff_diag_offset, Arow_end, Acols, _x, _y,
                           _bs[0], _bs[1]);
  }
}
//-----------------------------------------------------------------------------
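
`impl::spmv` is defined in `matrix_csr_impl.h` and is not part of this diff. For orientation, a standalone sketch of the kernel it generalises, for block size 1 and serial execution; illustrative, not the actual implementation:

// Standalone sketch of the CSR kernel specialised by impl::spmv
// (serial, block size 1; illustrative only).
#include <cstdint>
#include <vector>

void csr_spmv(const std::vector<double>& values,
              const std::vector<std::int64_t>& row_begin,
              const std::vector<std::int64_t>& row_end,
              const std::vector<std::int32_t>& cols,
              const std::vector<double>& x, std::vector<double>& y)
{
  // Passing (row start, off-diagonal offset) visits only diagonal-block
  // columns, while (off-diagonal offset, row end) visits only ghost
  // columns, mirroring the two stages in mult() above.
  for (std::size_t r = 0; r < row_begin.size(); ++r)
  {
    double yr = y[r];
    for (std::int64_t j = row_begin[r]; j < row_end[r]; ++j)
      yr += values[j] * x[cols[j]];
    y[r] = yr;
  }
}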

} // namespace dolfinx::la