From 3725b84aa26d36ee86ddf1f7ab5677eeb5375684 Mon Sep 17 00:00:00 2001
From: Alberto Invernizzi <invernizzi@cscs.ch>
Date: Mon, 1 Jul 2024 15:44:43 +0200
Subject: [PATCH 01/30] file and folder structure for new algorithm

---
 include/dlaf/eigensolver/reduction_to_band.h  | 14 ++++
 .../dlaf/eigensolver/reduction_to_band/api.h  | 17 ++++-
 .../eigensolver/reduction_to_band/ca-impl.h   | 65 +++++++++++++++++++
 src/eigensolver/reduction_to_band/gpu.cpp     |  1 +
 src/eigensolver/reduction_to_band/mc.cpp      |  1 +
 5 files changed, 97 insertions(+), 1 deletion(-)
 create mode 100644 include/dlaf/eigensolver/reduction_to_band/ca-impl.h
diff --git a/include/dlaf/eigensolver/reduction_to_band.h b/include/dlaf/eigensolver/reduction_to_band.h
index 9a22ee168d..1f7dc13a3b 100644
--- a/include/dlaf/eigensolver/reduction_to_band.h
+++ b/include/dlaf/eigensolver/reduction_to_band.h
@@ -119,4 +119,18 @@ Matrix<T, Device::CPU> reduction_to_band(comm::CommunicatorGrid& grid, Matrix<T,
 
   return ReductionToBand<B, D, T>::call(grid, mat_a, band_size);
 }
+
+template <Backend B, Device D, class T>
+internal::CARed2BandResult<T, D> ca_reduction_to_band(comm::CommunicatorGrid& grid, Matrix<T, D>& mat_a,
+                                                      const SizeType band_size) {
+  DLAF_ASSERT(matrix::square_size(mat_a), mat_a);
+  DLAF_ASSERT(matrix::square_blocksize(mat_a), mat_a);
+  DLAF_ASSERT(matrix::single_tile_per_block(mat_a), mat_a);
+  DLAF_ASSERT(matrix::equal_process_grid(mat_a, grid), mat_a, grid);
+
+  DLAF_ASSERT(band_size >= 2, band_size);
+  DLAF_ASSERT(mat_a.blockSize().rows() % band_size == 0, mat_a.blockSize().rows(), band_size);
+
+  return CAReductionToBand<B, D, T>::call(grid, mat_a, band_size);
+}
 }
diff --git a/include/dlaf/eigensolver/reduction_to_band/api.h b/include/dlaf/eigensolver/reduction_to_band/api.h
index 5ead6f0325..a107ee282c 100644
--- a/include/dlaf/eigensolver/reduction_to_band/api.h
+++ b/include/dlaf/eigensolver/reduction_to_band/api.h
@@ -24,9 +24,24 @@ struct ReductionToBand {
                                      const SizeType band_size);
 };
 
+template <class T, Device D>
+struct CARed2BandResult {
+  Matrix<T, Device::CPU> taus_1st;
+  // hh_1st are stored in-place
+  Matrix<T, Device::CPU> taus_2nd;
+  Matrix<T, D> hh_2nd;
+};
+
+template <Backend B, Device D, class T>
+struct CAReductionToBand {
+  static CARed2BandResult<T, D> call(comm::CommunicatorGrid& grid, Matrix<T, D>& mat_a,
+                                     const SizeType band_size);
+};
+
 // ETI
 #define DLAF_EIGENSOLVER_REDUCTION_TO_BAND_ETI(KWORD, BACKEND, DEVICE, DATATYPE) \
-  KWORD template struct ReductionToBand<BACKEND, DEVICE, DATATYPE>;
+  KWORD template struct ReductionToBand<BACKEND, DEVICE, DATATYPE>;              \
+  KWORD template struct CAReductionToBand<BACKEND, DEVICE, DATATYPE>;
 
 DLAF_EIGENSOLVER_REDUCTION_TO_BAND_ETI(extern, Backend::MC, Device::CPU, float)
 DLAF_EIGENSOLVER_REDUCTION_TO_BAND_ETI(extern, Backend::MC, Device::CPU, double)
diff --git a/include/dlaf/eigensolver/reduction_to_band/ca-impl.h b/include/dlaf/eigensolver/reduction_to_band/ca-impl.h
new file mode 100644
index 0000000000..af6ceb36c4
--- /dev/null
+++ b/include/dlaf/eigensolver/reduction_to_band/ca-impl.h
@@ -0,0 +1,65 @@
+//
+// Distributed Linear Algebra with Future (DLAF)
+//
+// Copyright (c) 2018-2024, ETH Zurich
+// All rights reserved.
+//
+// Please, refer to the LICENSE file in the root directory.
+// SPDX-License-Identifier: BSD-3-Clause
+//
+
+#pragma once
+
+#include <pika/execution.hpp>
+
+#include <dlaf/communication/communicator_grid.h>
+#include <dlaf/eigensolver/reduction_to_band/api.h>
+#include <dlaf/matrix/matrix.h>
+
+namespace dlaf::eigensolver::internal {
+
+namespace ca_red2band {}
+
+// Distributed implementation of reduction to band
+template <Backend B, Device D, class T>
+CARed2BandResult<T, D> CAReductionToBand<B, D, T>::call(comm::CommunicatorGrid& grid,
+                                                        Matrix<T, D>& mat_a, const SizeType band_size) {
+  namespace ex = pika::execution::experimental;
+  namespace di = dlaf::internal;
+
+  const auto& dist = mat_a.distribution();
+  const comm::Index2D rank = dist.rank_index();
+
+  const SizeType nrefls = std::max<SizeType>(0, dist.size().cols() - band_size - 1);
+
+  // Note:
+  // Reflector of size = 1 is not considered whatever T is (i.e. neither real
+  // nor complex)
+  DLAF_ASSERT(dist.block_size().cols() % band_size == 0, dist.block_size().cols(), band_size);
+
+  // Note:
+  // row-vector that is distributed over columns, but replicated over rows.
+  // for historical reason it is stored and accessed as a column-vector.
+  DLAF_ASSERT(dist.block_size().cols() % band_size == 0, dist.block_size().cols(), band_size);
+  const matrix::Distribution dist_taus(GlobalElementSize(nrefls, 1),
+                                       TileElementSize(dist.block_size().cols(), 1),
+                                       comm::Size2D(dist.grid_size().cols(), 1),
+                                       comm::Index2D(rank.col(), 0),
+                                       comm::Index2D(dist.source_rank_index().col(), 0));
+  Matrix<T, Device::CPU> mat_taus_1st(dist_taus);
+  Matrix<T, Device::CPU> mat_taus_2nd(dist_taus);
+
+  // Note:
+  // row-panel distributed over columns, but replicated over rows
+  const matrix::Distribution dist_hh_2nd(GlobalElementSize(dist.block_size().rows(), dist.size().cols()),
+                                         dist.block_size(), comm::Size2D(1, dist.grid_size().cols()),
+                                         comm::Index2D(0, rank.col()),
+                                         comm::Index2D(0, dist.source_rank_index().col()));
+  Matrix<T, D> mat_hh_2nd(dist_hh_2nd);
+
+  if (nrefls == 0)
+    return {std::move(mat_taus_1st), std::move(mat_taus_2nd), std::move(mat_hh_2nd)};
+
+  return {std::move(mat_taus_1st), std::move(mat_taus_2nd), std::move(mat_hh_2nd)};
+}
+}  // namespace dlaf::eigensolver::internal
diff --git a/src/eigensolver/reduction_to_band/gpu.cpp b/src/eigensolver/reduction_to_band/gpu.cpp
index 7e0cce7e77..e079f22fe1 100644
--- a/src/eigensolver/reduction_to_band/gpu.cpp
+++ b/src/eigensolver/reduction_to_band/gpu.cpp
@@ -8,6 +8,7 @@
 // SPDX-License-Identifier: BSD-3-Clause
 //
 
+#include <dlaf/eigensolver/reduction_to_band/ca-impl.h>
 #include <dlaf/eigensolver/reduction_to_band/impl.h>
 
 namespace dlaf::eigensolver::internal {
diff --git a/src/eigensolver/reduction_to_band/mc.cpp b/src/eigensolver/reduction_to_band/mc.cpp
index 93c8be8806..5b3a041356 100644
--- a/src/eigensolver/reduction_to_band/mc.cpp
+++ b/src/eigensolver/reduction_to_band/mc.cpp
@@ -8,6 +8,7 @@
 // SPDX-License-Identifier: BSD-3-Clause
 //
 
+#include <dlaf/eigensolver/reduction_to_band/ca-impl.h>
 #include <dlaf/eigensolver/reduction_to_band/impl.h>
 
 namespace dlaf::eigensolver::internal {

From df2f1dec809bf9827e25163b03aa2c6595e48b09 Mon Sep 17 00:00:00 2001
From: Alberto Invernizzi <invernizzi@cscs.ch>
Date: Mon, 1 Jul 2024 16:02:01 +0200
Subject: [PATCH 02/30] factor out red2band functions so they can be reused in
 new implementation variant

---
 .../eigensolver/reduction_to_band/common.h    | 284 ++++++++++++++++++
 .../dlaf/eigensolver/reduction_to_band/impl.h | 248 +--------------
 2 files changed, 285 insertions(+), 247 deletions(-)
 create mode 100644 include/dlaf/eigensolver/reduction_to_band/common.h

diff --git a/include/dlaf/eigensolver/reduction_to_band/common.h b/include/dlaf/eigensolver/reduction_to_band/common.h
new file mode 100644
index 0000000000..c15c749aaa
--- /dev/null
+++ b/include/dlaf/eigensolver/reduction_to_band/common.h
@@ -0,0 +1,284 @@
+//
+// Distributed Linear Algebra with Future (DLAF)
+//
+// Copyright (c) 2018-2024, ETH Zurich
+// All rights reserved.
+//
+// Please, refer to the LICENSE file in the root directory.
+// SPDX-License-Identifier: BSD-3-Clause
+//
+
+#pragma once
+
+#include <vector>
+
+#include <pika/barrier.hpp>
+
+#include <dlaf/common/index2d.h>
+#include <dlaf/common/single_threaded_blas.h>
+#include <dlaf/common/vector.h>
+#include <dlaf/eigensolver/internal/get_red2band_barrier_busy_wait.h>
+#include <dlaf/eigensolver/internal/get_red2band_panel_nworkers.h>
+#include <dlaf/matrix/matrix.h>
+#include <dlaf/matrix/views.h>
+#include <dlaf/schedulers.h>
+
+namespace dlaf::eigensolver::internal {
+
+// Given a vector of vectors, reduce all vectors in the first one using sum operation
+template <class T>
+void reduceColumnVectors(std::vector<common::internal::vector<T>>& columnVectors) {
+  for (std::size_t i = 1; i < columnVectors.size(); ++i) {
+    DLAF_ASSERT_HEAVY(columnVectors[0].size() == columnVectors[i].size(), columnVectors[0].size(),
+                      columnVectors[i].size());
+    for (SizeType j = 0; j < columnVectors[0].size(); ++j)
+      columnVectors[0][j] += columnVectors[i][j];
+  }
+}
+
+namespace red2band {
+
+// Extract x0 and compute local cumulative sum of squares of the reflector column
+template <Device D, class T>
+std::array<T, 2> computeX0AndSquares(const bool has_head, const std::vector<matrix::Tile<T, D>>& panel,
+                                     SizeType j) {
+  std::array<T, 2> x0_and_squares{0, 0};
+  auto it_begin = panel.begin();
+  auto it_end = panel.end();
+
+  common::internal::SingleThreadedBlasScope single;
+
+  if (has_head) {
+    auto& tile_v0 = *it_begin++;
+
+    const TileElementIndex idx_x0(j, j);
+    x0_and_squares[0] = tile_v0(idx_x0);
+
+    T* reflector_ptr = tile_v0.ptr(idx_x0);
+    x0_and_squares[1] =
+        blas::dot(tile_v0.size().rows() - idx_x0.row(), reflector_ptr, 1, reflector_ptr, 1);
+  }
+
+  for (auto it = it_begin; it != it_end; ++it) {
+    const auto& tile = *it;
+
+    T* reflector_ptr = tile.ptr({0, j});
+    x0_and_squares[1] += blas::dot(tile.size().rows(), reflector_ptr, 1, reflector_ptr, 1);
+  }
+  return x0_and_squares;
+}
+
+template <Device D, class T>
+T computeReflectorAndTau(const bool has_head, const std::vector<matrix::Tile<T, D>>& panel,
+                         const SizeType j, std::array<T, 2> x0_and_squares) {
+  if (x0_and_squares[1] == T(0))
+    return T(0);
+
+  const T norm = std::sqrt(x0_and_squares[1]);
+  const T x0 = x0_and_squares[0];
+  const T y = std::signbit(std::real(x0_and_squares[0])) ? norm : -norm;
+  const T tau = (y - x0) / y;
+
+  auto it_begin = panel.begin();
+  auto it_end = panel.end();
+
+  common::internal::SingleThreadedBlasScope single;
+
+  if (has_head) {
+    const auto& tile_v0 = *it_begin++;
+
+    const TileElementIndex idx_x0(j, j);
+    tile_v0(idx_x0) = y;
+
+    if (j + 1 < tile_v0.size().rows()) {
+      T* v = tile_v0.ptr({j + 1, j});
+      blas::scal(tile_v0.size().rows() - (j + 1), T(1) / (x0 - y), v, 1);
+    }
+  }
+
+  for (auto it = it_begin; it != it_end; ++it) {
+    auto& tile_v = *it;
+    T* v = tile_v.ptr({0, j});
+    blas::scal(tile_v.size().rows(), T(1) / (x0 - y), v, 1);
+  }
+
+  return tau;
+}
+
+template <Device D, class T>
+void computeWTrailingPanel(const bool has_head, const std::vector<matrix::Tile<T, D>>& panel,
+                           common::internal::vector<T>& w, SizeType j, const SizeType pt_cols,
+                           const std::size_t begin, const std::size_t end) {
+  // for each tile in the panel, consider just the trailing panel
+  // i.e. all rows (height = reflector), just columns to the right of the current reflector
+  if (!(pt_cols > 0))
+    return;
+
+  const TileElementIndex index_el_x0(j, j);
+  bool has_first_component = has_head;
+
+  common::internal::SingleThreadedBlasScope single;
+
+  // W = Pt* . V
+  for (auto index = begin; index < end; ++index) {
+    const matrix::Tile<const T, D>& tile_a = panel[index];
+    const SizeType first_element = has_first_component ? index_el_x0.row() : 0;
+
+    TileElementIndex pt_start{first_element, index_el_x0.col() + 1};
+    TileElementSize pt_size{tile_a.size().rows() - pt_start.row(), pt_cols};
+    TileElementIndex v_start{first_element, index_el_x0.col()};
+
+    if (has_first_component) {
+      const TileElementSize offset{1, 0};
+
+      const T fake_v = 1;
+      blas::gemv(blas::Layout::ColMajor, blas::Op::ConjTrans, offset.rows(), pt_size.cols(), T(1),
+                 tile_a.ptr(pt_start), tile_a.ld(), &fake_v, 1, T(0), w.data(), 1);
+
+      pt_start = pt_start + offset;
+      v_start = v_start + offset;
+      pt_size = pt_size - offset;
+
+      has_first_component = false;
+    }
+
+    if (pt_start.isIn(tile_a.size())) {
+      // W += 1 . A* . V
+      blas::gemv(blas::Layout::ColMajor, blas::Op::ConjTrans, pt_size.rows(), pt_size.cols(), T(1),
+                 tile_a.ptr(pt_start), tile_a.ld(), tile_a.ptr(v_start), 1, T(1), w.data(), 1);
+    }
+  }
+}
+
+template <Device D, class T>
+void updateTrailingPanel(const bool has_head, const std::vector<matrix::Tile<T, D>>& panel, SizeType j,
+                         const std::vector<T>& w, const T tau, const std::size_t begin,
+                         const std::size_t end) {
+  const TileElementIndex index_el_x0(j, j);
+
+  bool has_first_component = has_head;
+
+  common::internal::SingleThreadedBlasScope single;
+
+  // GER Pt = Pt - tau . v . w*
+  for (auto index = begin; index < end; ++index) {
+    const matrix::Tile<T, D>& tile_a = panel[index];
+    const SizeType first_element = has_first_component ? index_el_x0.row() : 0;
+
+    TileElementIndex pt_start{first_element, index_el_x0.col() + 1};
+    TileElementSize pt_size{tile_a.size().rows() - pt_start.row(),
+                            tile_a.size().cols() - pt_start.col()};
+    TileElementIndex v_start{first_element, index_el_x0.col()};
+
+    if (has_first_component) {
+      const TileElementSize offset{1, 0};
+
+      // Pt = Pt - tau * v[0] * w*
+      const T fake_v = 1;
+      blas::ger(blas::Layout::ColMajor, 1, pt_size.cols(), -dlaf::conj(tau), &fake_v, 1, w.data(), 1,
+                tile_a.ptr(pt_start), tile_a.ld());
+
+      pt_start = pt_start + offset;
+      v_start = v_start + offset;
+      pt_size = pt_size - offset;
+
+      has_first_component = false;
+    }
+
+    if (pt_start.isIn(tile_a.size())) {
+      // Pt = Pt - tau * v * w*
+      blas::ger(blas::Layout::ColMajor, pt_size.rows(), pt_size.cols(), -dlaf::conj(tau),
+                tile_a.ptr(v_start), 1, w.data(), 1, tile_a.ptr(pt_start), tile_a.ld());
+    }
+  }
+}
+
+}
+
+namespace red2band::local {
+
+template <Device D, class T>
+T computeReflector(const std::vector<matrix::Tile<T, D>>& panel, SizeType j) {
+  constexpr bool has_head = true;
+
+  std::array<T, 2> x0_and_squares = computeX0AndSquares(has_head, panel, j);
+
+  auto tau = computeReflectorAndTau(has_head, panel, j, std::move(x0_and_squares));
+
+  return tau;
+}
+
+template <class MatrixLikeA, class MatrixLikeTaus>
+void computePanelReflectors(MatrixLikeA& mat_a, MatrixLikeTaus& mat_taus, const SizeType j_sub,
+                            const matrix::SubPanelView& panel_view) {
+  static Device constexpr D = MatrixLikeA::device;
+  using T = typename MatrixLikeA::ElementType;
+  using pika::execution::thread_priority;
+  namespace ex = pika::execution::experimental;
+  namespace di = dlaf::internal;
+
+  std::vector<matrix::ReadWriteTileSender<T, D>> panel_tiles;
+  const auto panel_range = panel_view.iteratorLocal();
+  const std::size_t panel_ntiles = to_sizet(std::distance(panel_range.begin(), panel_range.end()));
+
+  // if (panel_ntiles == 0) {
+  //   return;
+  // }
+
+  panel_tiles.reserve(panel_ntiles);
+  for (const auto& i : panel_range) {
+    const matrix::SubTileSpec& spec = panel_view(i);
+    panel_tiles.emplace_back(matrix::splitTile(mat_a.readwrite(i), spec));
+  }
+
+  const std::size_t nthreads = getReductionToBandPanelNWorkers();
+  ex::start_detached(
+      ex::when_all(ex::just(std::make_unique<pika::barrier<>>(nthreads),
+                            std::vector<common::internal::vector<T>>{}),  // w (internally required)
+                   mat_taus.readwrite(LocalTileIndex(j_sub, 0)),
+                   ex::when_all_vector(std::move(panel_tiles))) |
+      ex::transfer(di::getBackendScheduler<Backend::MC>(thread_priority::high)) |
+      ex::bulk(nthreads, [nthreads, cols = panel_view.cols()](const std::size_t index, auto& barrier_ptr,
+                                                              auto& w, auto& taus, auto& tiles) {
+        const auto barrier_busy_wait = getReductionToBandBarrierBusyWait();
+        const std::size_t batch_size = util::ceilDiv(tiles.size(), nthreads);
+        const std::size_t begin = index * batch_size;
+        const std::size_t end = std::min(index * batch_size + batch_size, tiles.size());
+        const SizeType nrefls = taus.size().rows();
+
+        if (index == 0) {
+          w.resize(nthreads);
+        }
+
+        for (SizeType j = 0; j < nrefls; ++j) {
+          // STEP1: compute tau and reflector (single-thread)
+          if (index == 0) {
+            taus({j, 0}) = computeReflector(tiles, j);
+          }
+
+          barrier_ptr->arrive_and_wait(barrier_busy_wait);
+
+          // STEP2a: compute w (multi-threaded)
+          const SizeType pt_cols = cols - (j + 1);
+          if (pt_cols == 0)
+            break;
+          const bool has_head = (index == 0);
+
+          w[index] = common::internal::vector<T>(pt_cols, 0);
+          computeWTrailingPanel(has_head, tiles, w[index], j, pt_cols, begin, end);
+          barrier_ptr->arrive_and_wait(barrier_busy_wait);
+
+          // STEP2b: reduce w results (single-threaded)
+          if (index == 0)
+            dlaf::eigensolver::internal::reduceColumnVectors(w);
+          barrier_ptr->arrive_and_wait(barrier_busy_wait);
+
+          // STEP3: update trailing panel (multi-threaded)
+          updateTrailingPanel(has_head, tiles, j, w[0], taus({j, 0}), begin, end);
+          barrier_ptr->arrive_and_wait(barrier_busy_wait);
+        }
+      }));
+}
+
+}
+}
diff --git a/include/dlaf/eigensolver/reduction_to_band/impl.h b/include/dlaf/eigensolver/reduction_to_band/impl.h
index 2a7882fd82..0899be139a 100644
--- a/include/dlaf/eigensolver/reduction_to_band/impl.h
+++ b/include/dlaf/eigensolver/reduction_to_band/impl.h
@@ -38,9 +38,8 @@
 #include <dlaf/communication/kernels/all_reduce.h>
 #include <dlaf/communication/kernels/reduce.h>
 #include <dlaf/communication/rdma.h>
-#include <dlaf/eigensolver/internal/get_red2band_barrier_busy_wait.h>
-#include <dlaf/eigensolver/internal/get_red2band_panel_nworkers.h>
 #include <dlaf/eigensolver/reduction_to_band/api.h>
+#include <dlaf/eigensolver/reduction_to_band/common.h>
 #include <dlaf/factorization/qr.h>
 #include <dlaf/lapack/tile.h>
 #include <dlaf/matrix/copy_tile.h>
@@ -59,174 +58,8 @@
 
 namespace dlaf::eigensolver::internal {
 
-// Given a vector of vectors, reduce all vectors in the first one using sum operation
-template <class T>
-void reduceColumnVectors(std::vector<common::internal::vector<T>>& columnVectors) {
-  for (std::size_t i = 1; i < columnVectors.size(); ++i) {
-    DLAF_ASSERT_HEAVY(columnVectors[0].size() == columnVectors[i].size(), columnVectors[0].size(),
-                      columnVectors[i].size());
-    for (SizeType j = 0; j < columnVectors[0].size(); ++j)
-      columnVectors[0][j] += columnVectors[i][j];
-  }
-}
-
 namespace red2band {
 
-// Extract x0 and compute local cumulative sum of squares of the reflector column
-template <Device D, class T>
-std::array<T, 2> computeX0AndSquares(const bool has_head, const std::vector<matrix::Tile<T, D>>& panel,
-                                     SizeType j) {
-  std::array<T, 2> x0_and_squares{0, 0};
-  auto it_begin = panel.begin();
-  auto it_end = panel.end();
-
-  common::internal::SingleThreadedBlasScope single;
-
-  if (has_head) {
-    auto& tile_v0 = *it_begin++;
-
-    const TileElementIndex idx_x0(j, j);
-    x0_and_squares[0] = tile_v0(idx_x0);
-
-    T* reflector_ptr = tile_v0.ptr({idx_x0});
-    x0_and_squares[1] =
-        blas::dot(tile_v0.size().rows() - idx_x0.row(), reflector_ptr, 1, reflector_ptr, 1);
-  }
-
-  for (auto it = it_begin; it != it_end; ++it) {
-    const auto& tile = *it;
-
-    T* reflector_ptr = tile.ptr({0, j});
-    x0_and_squares[1] += blas::dot(tile.size().rows(), reflector_ptr, 1, reflector_ptr, 1);
-  }
-  return x0_and_squares;
-}
-
-template <Device D, class T>
-T computeReflectorAndTau(const bool has_head, const std::vector<matrix::Tile<T, D>>& panel,
-                         const SizeType j, std::array<T, 2> x0_and_squares) {
-  if (x0_and_squares[1] == T(0))
-    return T(0);
-
-  const T norm = std::sqrt(x0_and_squares[1]);
-  const T x0 = x0_and_squares[0];
-  const T y = std::signbit(std::real(x0_and_squares[0])) ? norm : -norm;
-  const T tau = (y - x0) / y;
-
-  auto it_begin = panel.begin();
-  auto it_end = panel.end();
-
-  common::internal::SingleThreadedBlasScope single;
-
-  if (has_head) {
-    const auto& tile_v0 = *it_begin++;
-
-    const TileElementIndex idx_x0(j, j);
-    tile_v0(idx_x0) = y;
-
-    if (j + 1 < tile_v0.size().rows()) {
-      T* v = tile_v0.ptr({j + 1, j});
-      blas::scal(tile_v0.size().rows() - (j + 1), T(1) / (x0 - y), v, 1);
-    }
-  }
-
-  for (auto it = it_begin; it != it_end; ++it) {
-    auto& tile_v = *it;
-    T* v = tile_v.ptr({0, j});
-    blas::scal(tile_v.size().rows(), T(1) / (x0 - y), v, 1);
-  }
-
-  return tau;
-}
-
-template <Device D, class T>
-void computeWTrailingPanel(const bool has_head, const std::vector<matrix::Tile<T, D>>& panel,
-                           common::internal::vector<T>& w, SizeType j, const SizeType pt_cols,
-                           const std::size_t begin, const std::size_t end) {
-  // for each tile in the panel, consider just the trailing panel
-  // i.e. all rows (height = reflector), just columns to the right of the current reflector
-  if (!(pt_cols > 0))
-    return;
-
-  const TileElementIndex index_el_x0(j, j);
-  bool has_first_component = has_head;
-
-  common::internal::SingleThreadedBlasScope single;
-
-  // W = Pt* . V
-  for (auto index = begin; index < end; ++index) {
-    const matrix::Tile<const T, D>& tile_a = panel[index];
-    const SizeType first_element = has_first_component ? index_el_x0.row() : 0;
-
-    TileElementIndex pt_start{first_element, index_el_x0.col() + 1};
-    TileElementSize pt_size{tile_a.size().rows() - pt_start.row(), pt_cols};
-    TileElementIndex v_start{first_element, index_el_x0.col()};
-
-    if (has_first_component) {
-      const TileElementSize offset{1, 0};
-
-      const T fake_v = 1;
-      blas::gemv(blas::Layout::ColMajor, blas::Op::ConjTrans, offset.rows(), pt_size.cols(), T(1),
-                 tile_a.ptr(pt_start), tile_a.ld(), &fake_v, 1, T(0), w.data(), 1);
-
-      pt_start = pt_start + offset;
-      v_start = v_start + offset;
-      pt_size = pt_size - offset;
-
-      has_first_component = false;
-    }
-
-    if (pt_start.isIn(tile_a.size())) {
-      // W += 1 . A* . V
-      blas::gemv(blas::Layout::ColMajor, blas::Op::ConjTrans, pt_size.rows(), pt_size.cols(), T(1),
-                 tile_a.ptr(pt_start), tile_a.ld(), tile_a.ptr(v_start), 1, T(1), w.data(), 1);
-    }
-  }
-}
-
-template <Device D, class T>
-void updateTrailingPanel(const bool has_head, const std::vector<matrix::Tile<T, D>>& panel, SizeType j,
-                         const std::vector<T>& w, const T tau, const std::size_t begin,
-                         const std::size_t end) {
-  const TileElementIndex index_el_x0(j, j);
-
-  bool has_first_component = has_head;
-
-  common::internal::SingleThreadedBlasScope single;
-
-  // GER Pt = Pt - tau . v . w*
-  for (auto index = begin; index < end; ++index) {
-    const matrix::Tile<T, D>& tile_a = panel[index];
-    const SizeType first_element = has_first_component ? index_el_x0.row() : 0;
-
-    TileElementIndex pt_start{first_element, index_el_x0.col() + 1};
-    TileElementSize pt_size{tile_a.size().rows() - pt_start.row(),
-                            tile_a.size().cols() - pt_start.col()};
-    TileElementIndex v_start{first_element, index_el_x0.col()};
-
-    if (has_first_component) {
-      const TileElementSize offset{1, 0};
-
-      // Pt = Pt - tau * v[0] * w*
-      const T fake_v = 1;
-      blas::ger(blas::Layout::ColMajor, 1, pt_size.cols(), -dlaf::conj(tau), &fake_v, 1, w.data(), 1,
-                tile_a.ptr(pt_start), tile_a.ld());
-
-      pt_start = pt_start + offset;
-      v_start = v_start + offset;
-      pt_size = pt_size - offset;
-
-      has_first_component = false;
-    }
-
-    if (pt_start.isIn(tile_a.size())) {
-      // Pt = Pt - tau * v * w*
-      blas::ger(blas::Layout::ColMajor, pt_size.rows(), pt_size.cols(), -dlaf::conj(tau),
-                tile_a.ptr(v_start), 1, w.data(), 1, tile_a.ptr(pt_start), tile_a.ld());
-    }
-  }
-}
-
 template <Backend B, typename ASender, typename WSender, typename XSender>
 void hemmDiag(pika::execution::thread_priority priority, ASender&& tile_a, WSender&& tile_w,
               XSender&& tile_x) {
@@ -281,85 +114,6 @@ void her2kOffDiag(pika::execution::thread_priority priority, ASender&& tile_a, B
 }
 
 namespace local {
-
-template <Device D, class T>
-T computeReflector(const std::vector<matrix::Tile<T, D>>& panel, SizeType j) {
-  constexpr bool has_head = true;
-
-  std::array<T, 2> x0_and_squares = computeX0AndSquares(has_head, panel, j);
-
-  auto tau = computeReflectorAndTau(has_head, panel, j, std::move(x0_and_squares));
-
-  return tau;
-}
-
-template <class MatrixLikeA, class MatrixLikeTaus>
-void computePanelReflectors(MatrixLikeA& mat_a, MatrixLikeTaus& mat_taus, const SizeType j_sub,
-                            const matrix::SubPanelView& panel_view) {
-  static Device constexpr D = MatrixLikeA::device;
-  using T = typename MatrixLikeA::ElementType;
-  using pika::execution::thread_priority;
-  namespace ex = pika::execution::experimental;
-  namespace di = dlaf::internal;
-
-  std::vector<matrix::ReadWriteTileSender<T, D>> panel_tiles;
-  panel_tiles.reserve(to_sizet(std::distance(panel_view.iteratorLocal().begin(),
-                                             panel_view.iteratorLocal().end())));
-  for (const auto& i : panel_view.iteratorLocal()) {
-    const matrix::SubTileSpec& spec = panel_view(i);
-    panel_tiles.emplace_back(matrix::splitTile(mat_a.readwrite(i), spec));
-  }
-
-  const std::size_t nthreads = getReductionToBandPanelNWorkers();
-  auto s =
-      ex::when_all(ex::just(std::make_unique<pika::barrier<>>(nthreads),
-                            std::vector<common::internal::vector<T>>{}),  // w (internally required)
-                   mat_taus.readwrite(LocalTileIndex(j_sub, 0)),
-                   ex::when_all_vector(std::move(panel_tiles))) |
-      ex::transfer(di::getBackendScheduler<Backend::MC>(thread_priority::high)) |
-      ex::bulk(nthreads, [nthreads, cols = panel_view.cols()](const std::size_t index, auto& barrier_ptr,
-                                                              auto& w, auto& taus, auto& tiles) {
-        const auto barrier_busy_wait = getReductionToBandBarrierBusyWait();
-        const std::size_t batch_size = util::ceilDiv(tiles.size(), nthreads);
-        const std::size_t begin = index * batch_size;
-        const std::size_t end = std::min(index * batch_size + batch_size, tiles.size());
-        const SizeType nrefls = taus.size().rows();
-
-        if (index == 0) {
-          w.resize(nthreads);
-        }
-
-        for (SizeType j = 0; j < nrefls; ++j) {
-          // STEP1: compute tau and reflector (single-thread)
-          if (index == 0) {
-            taus({j, 0}) = computeReflector(tiles, j);
-          }
-
-          barrier_ptr->arrive_and_wait(barrier_busy_wait);
-
-          // STEP2a: compute w (multi-threaded)
-          const SizeType pt_cols = cols - (j + 1);
-          if (pt_cols == 0)
-            break;
-          const bool has_head = (index == 0);
-
-          w[index] = common::internal::vector<T>(pt_cols, 0);
-          computeWTrailingPanel(has_head, tiles, w[index], j, pt_cols, begin, end);
-          barrier_ptr->arrive_and_wait(barrier_busy_wait);
-
-          // STEP2b: reduce w results (single-threaded)
-          if (index == 0)
-            dlaf::eigensolver::internal::reduceColumnVectors(w);
-          barrier_ptr->arrive_and_wait(barrier_busy_wait);
-
-          // STEP3: update trailing panel (multi-threaded)
-          updateTrailingPanel(has_head, tiles, j, w[0], taus({j, 0}), begin, end);
-          barrier_ptr->arrive_and_wait(barrier_busy_wait);
-        }
-      });
-  ex::start_detached(std::move(s));
-}
-
 template <Backend B, Device D, class T>
 void setupReflectorPanelV(bool has_head, const matrix::SubPanelView& panel_view, const SizeType nrefls,
                           matrix::Panel<Coord::Col, T, D>& v, matrix::Matrix<const T, D>& mat_a,

From b9ce9964c43ce2f8f6e0524ecbfd37dd8696411b Mon Sep 17 00:00:00 2001
From: Alberto Invernizzi <invernizzi@cscs.ch>
Date: Mon, 1 Jul 2024 16:15:02 +0200
Subject: [PATCH 03/30] basic test structure

---
 .../eigensolver/test_reduction_to_band.cpp    | 68 +++++++++++++++++++
 1 file changed, 68 insertions(+)

diff --git a/test/unit/eigensolver/test_reduction_to_band.cpp b/test/unit/eigensolver/test_reduction_to_band.cpp
index c4bee9db09..99dcc5a247 100644
--- a/test/unit/eigensolver/test_reduction_to_band.cpp
+++ b/test/unit/eigensolver/test_reduction_to_band.cpp
@@ -489,3 +489,71 @@ TYPED_TEST(ReductionToBandTestGPU, CorrectnessDistributedSubBand) {
   }
 }
 #endif
+
+template <class T>
+struct CAReductionToBandTest : public TestWithCommGrids {};
+
+template <class T>
+using CAReductionToBandTestMC = ReductionToBandTest<T>;
+
+using JustThisType = ::testing::Types<float>;
+TYPED_TEST_SUITE(CAReductionToBandTestMC, JustThisType);
+
+template <class T, Device D, Backend B>
+void testCAReductionToBand(comm::CommunicatorGrid& grid, const LocalElementSize size,
+                           const TileElementSize block_size, const SizeType band_size,
+                           const InputMatrixStructure input_matrix_structure) {
+  const SizeType k_reflectors = std::max(SizeType(0), size.rows() - band_size - 1);
+  DLAF_ASSERT(block_size.rows() % band_size == 0, block_size.rows(), band_size);
+
+  Distribution distribution({size.rows(), size.cols()}, block_size, grid.size(), grid.rank(), {0, 0});
+
+  // setup the reference input matrix
+  Matrix<const T, Device::CPU> reference = [&]() {
+    Matrix<T, Device::CPU> reference(distribution);
+    if (input_matrix_structure == InputMatrixStructure::banded)
+      // Matrix already in band form, with band smaller than band_size
+      matrix::util::set_random_hermitian_banded(reference, band_size - 1);
+    else
+      matrix::util::set_random_hermitian(reference);
+    return reference;
+  }();
+
+  Matrix<T, Device::CPU> matrix_a_h(distribution);
+  copy(reference, matrix_a_h);
+
+  eigensolver::internal::CARed2BandResult<T, D> red2band_result = [&]() {
+    MatrixMirror<T, D, Device::CPU> matrix_a(matrix_a_h);
+    return eigensolver::internal::ca_reduction_to_band<B>(grid, matrix_a.get(), band_size);
+  }();
+
+  Matrix<T, D>& mat_local_taus = red2band_result.taus_1st;
+
+  // what?!
+  // ASSERT_EQ(mat_local_taus.block_size().rows(), block_size.rows());
+
+  checkUpperPartUnchanged(reference, matrix_a_h);
+
+  // Wait for all work to finish before doing blocking communication
+  pika::wait();
+
+  auto mat_v = allGather(blas::Uplo::Lower, matrix_a_h, grid);
+  auto mat_b = makeLocal(matrix_a_h);
+  splitReflectorsAndBand(mat_v, mat_b, band_size);
+
+  auto taus = allGatherTaus(k_reflectors, mat_local_taus, grid);
+  ASSERT_EQ(taus.size(), k_reflectors);
+
+  // checkResult(k_reflectors, band_size, reference, mat_v, mat_b, taus);
+}
+
+TYPED_TEST(CAReductionToBandTestMC, CorrectnessDistributed) {
+  for (auto&& comm_grid : this->commGrids()) {
+    for (const auto& [size, block_size, band_size] : configs) {
+      for (auto input_matrix_structure : {InputMatrixStructure::full}) {
+        testCAReductionToBand<TypeParam, Device::CPU, Backend::MC>(comm_grid, size, block_size,
+                                                                   band_size, input_matrix_structure);
+      }
+    }
+  }
+}

From 52971b2d1c722e0de3df92ef908e5fe6e86631ad Mon Sep 17 00:00:00 2001
From: Alberto Invernizzi <invernizzi@cscs.ch>
Date: Mon, 1 Jul 2024 16:40:33 +0200
Subject: [PATCH 04/30] WIP: qr for both 1st and 2nd passes

---
 .../eigensolver/reduction_to_band/ca-impl.h   | 172 +++++++++++++++++-
 1 file changed, 171 insertions(+), 1 deletion(-)

diff --git a/include/dlaf/eigensolver/reduction_to_band/ca-impl.h b/include/dlaf/eigensolver/reduction_to_band/ca-impl.h
index af6ceb36c4..5f3c28efd6 100644
--- a/include/dlaf/eigensolver/reduction_to_band/ca-impl.h
+++ b/include/dlaf/eigensolver/reduction_to_band/ca-impl.h
@@ -13,8 +13,21 @@
 #include <pika/execution.hpp>
 
 #include <dlaf/communication/communicator_grid.h>
+#include <dlaf/communication/kernels/broadcast.h>
 #include <dlaf/eigensolver/reduction_to_band/api.h>
+#include <dlaf/eigensolver/reduction_to_band/common.h>
+#include <dlaf/lapack/tile.h>
+#include <dlaf/matrix/copy_tile.h>
 #include <dlaf/matrix/matrix.h>
+#include <dlaf/matrix/panel.h>
+#include <dlaf/schedulers.h>
+#include <dlaf/sender/transform.h>
+#include <dlaf/sender/transform_mpi.h>
+
+//
+#include <iostream>
+
+#include <dlaf/matrix/print_numpy.h>
 
 namespace dlaf::eigensolver::internal {
 
@@ -60,6 +73,163 @@ CARed2BandResult<T, D> CAReductionToBand<B, D, T>::call(comm::CommunicatorGrid&
   if (nrefls == 0)
     return {std::move(mat_taus_1st), std::move(mat_taus_2nd), std::move(mat_hh_2nd)};
 
+  auto mpi_col_chain = grid.col_communicator_pipeline();
+
+  constexpr std::size_t n_workspaces = 2;
+
+  // TODO HEADS workspace
+  // - column vector
+  // - has to be fully local
+  // - no more than grid_size.rows() tiles (1 tile per rank in the column)
+  // - we use panel just because it offers the ability to shrink width/height
+  const matrix::Distribution dist_heads(
+      LocalElementSize(dist.grid_size().rows() * dist.block_size().rows(), dist.block_size().cols()),
+      dist.block_size());
+  common::RoundRobin<matrix::Panel<Coord::Col, T, D>> panels_heads(n_workspaces, dist_heads);
+
+  DLAF_ASSERT(mat_a.block_size().cols() == band_size, mat_a.block_size().cols(), band_size);
+  const SizeType ntiles = (nrefls - 1) / band_size + 1;
+
+  for (SizeType j = 0; j < ntiles; ++j) {
+    const SizeType i = j + 1;
+    // const SizeType j_lc = dist.template next_local_tile_from_global_tile<Coord::Col>(j);
+
+    // panel
+    const GlobalTileIndex panel_offset(i, j);
+    const GlobalElementIndex panel_offset_el(panel_offset.row() * band_size,
+                                             panel_offset.col() * band_size);
+    matrix::SubPanelView panel_view(dist, panel_offset_el, band_size);
+    const comm::IndexT_MPI rank_panel = dist.template rank_global_tile<Coord::Col>(panel_offset.col());
+
+    // trailing
+    const GlobalTileIndex trailing_offset(i, j + 1);
+    const GlobalElementIndex trailing_offset_el(trailing_offset.row() * band_size,
+                                                trailing_offset.col() * band_size);
+    matrix::SubMatrixView(dist, trailing_offset_el);
+
+    // PANEL: just ranks in the current column
+    // QR local (HH reflectors stored in-place)
+    if (rank_panel == rank.col()) {
+      // Note:  SubPanelView is (at most) band_size wide, but it may contain a smaller number of
+      //        reflectors (i.e. at the end when last reflector size is 1)
+      red2band::local::computePanelReflectors(mat_a, mat_taus_1st, panel_offset.col(), panel_view);
+    }
+
+    // TRAILING 1st pass
+    {}
+
+    // PANEL: just ranks in the current column
+    // QR local with just heads (HH reflectors have to be computed elsewhere, 1st-pass in-place)
+    if (rank_panel == rank.col()) {
+      auto&& panel_heads = panels_heads.nextResource();
+
+      const SizeType i_begin_head = i;
+      const SizeType i_end_head =
+          std::min((i_begin_head + dist.grid_size().rows() + 1), dist.nr_tiles().rows());
+
+      const SizeType nheads = i_end_head - i_begin_head;
+      panel_heads.setRangeEnd({nheads, 0});
+
+      for (SizeType i_head = i_begin_head, idx = 0; i_head < i_end_head; ++i_head, ++idx) {
+        const LocalTileIndex idx_panel_head(idx, 0);
+        const GlobalTileIndex ij_head(i_head, j);
+
+        const comm::IndexT_MPI rank_owner = dist.template rank_global_tile<Coord::Row>(i_head);
+
+        if (rank.row() == rank_owner) {
+          // copy - set - send
+          ex::start_detached(ex::when_all(mat_a.read(ij_head), panel_heads.readwrite(idx_panel_head)) |
+                             di::transform(di::Policy<B>(), [=](const auto& head_in, auto&& head) {
+                               matrix::internal::copy(head_in, head);
+                               lapack::laset(blas::Uplo::Lower, head.size().rows() - 1,
+                                             head.size().cols(), T(0), T(0), head.ptr({1, 0}),
+                                             head.ld());
+                             }));
+
+          ex::start_detached(dlaf::comm::schedule_bcast_send(mpi_col_chain.exclusive(),
+                                                             panel_heads.read(idx_panel_head)));
+        }
+        else {
+          // receive
+          ex::start_detached(dlaf::comm::schedule_bcast_recv(mpi_col_chain.exclusive(), rank_owner,
+                                                             panel_heads.readwrite(idx_panel_head)));
+        }
+      }
+
+      // for (auto ij : panel_heads.iteratorLocal()) {
+      //   const auto& tile = pika::this_thread::experimental::sync_wait(panel_heads.read(ij)).get();
+      //   std::cout << "heads(" << ij << ") = ";
+      //   print(format::numpy{}, tile);
+      // }
+
+      // QR local on heads
+      // TODO check if some head tiles are not used and limit (or set to zero as a starting point)
+      matrix::SubPanelView panel_view(dist_heads, {0, 0}, band_size);
+      red2band::local::computePanelReflectors(panel_heads, mat_taus_2nd, 0, panel_view);
+
+      for (auto ij : panel_heads.iteratorLocal()) {
+        const auto& tile = pika::this_thread::experimental::sync_wait(panel_heads.read(ij)).get();
+        std::cout << "heads_u(" << ij << ") = ";
+        print(format::numpy{}, tile);
+      }
+
+      // TODO copy back data
+      // - just head of heads upper to mat_a
+      // - reflectors to hh_2nd
+
+      const GlobalTileIndex ij_hoh(i_begin_head, j);
+      if (rank.row() == dist.template rank_global_tile<Coord::Row>(ij_hoh.row()))
+        ex::start_detached(ex::when_all(panel_heads.read({0, 0}), mat_a.readwrite(ij_hoh)) |
+                           di::transform(di::Policy<B>(), [](const auto& hoh, auto&& hoh_a) {
+                             common::internal::SingleThreadedBlasScope single;
+                             lapack::lacpy(blas::Uplo::Upper, hoh.size().rows(), hoh.size().cols(),
+                                           hoh.ptr(), hoh.ld(), hoh_a.ptr(), hoh_a.ld());
+                             // std::cout << "hoh_a = ";
+                             // print(format::numpy{}, hoh_a);
+                           }));
+
+      for (SizeType i_head = i_begin_head, idx = 0; i_head < i_end_head; ++i_head, ++idx) {
+        const comm::IndexT_MPI rank_owner = dist.template rank_global_tile<Coord::Row>(i_head);
+
+        if (rank.row() == rank_owner) {
+          const LocalTileIndex idx_panel_head(idx, 0);
+          const GlobalTileIndex ij_hh_2nd(0, j);
+          const bool is_hoh = (idx == 0);
+
+          auto sender_heads =
+              ex::when_all(panel_heads.read(idx_panel_head), mat_hh_2nd.readwrite(ij_hh_2nd));
+
+          if (is_hoh)
+            ex::start_detached(std::move(sender_heads) |
+                               di::transform(di::Policy<B>(), [](const auto& head, auto&& head_a) {
+                                 common::internal::SingleThreadedBlasScope single;
+                                 lapack::laset(blas::Uplo::Upper, head_a.size().rows(),
+                                               head_a.size().cols(), T(0), T(1), head_a.ptr(),
+                                               head_a.ld());
+                                 lapack::lacpy(blas::Uplo::Lower, head.size().rows() - 1,
+                                               head.size().cols() - 1, head.ptr({1, 0}), head.ld(),
+                                               head_a.ptr({1, 0}), head_a.ld());
+                               }));
+          else
+            ex::start_detached(std::move(sender_heads) |
+                               di::transform(di::Policy<B>(), [](const auto& head, auto&& head_a) {
+                                 common::internal::SingleThreadedBlasScope single;
+                                 lapack::lacpy(blas::Uplo::General, head.size().rows(),
+                                               head.size().cols(), head.ptr(), head.ld(), head_a.ptr(),
+                                               head_a.ld());
+                               }));
+        }
+      }
+
+      panel_heads.reset();
+    }
+
+    // TRAILING 2nd pass
+    {}
+
+    break;
+  }
+
   return {std::move(mat_taus_1st), std::move(mat_taus_2nd), std::move(mat_hh_2nd)};
 }
-}  // namespace dlaf::eigensolver::internal
+}

From 7be419987293e88cb0520bfbea67647df4d1f1d7 Mon Sep 17 00:00:00 2001
From: Alberto Invernizzi <invernizzi@cscs.ch>
Date: Thu, 11 Jul 2024 16:25:32 +0200
Subject: [PATCH 05/30] move some basic building block that can be shared
 between implementations

---
 .../eigensolver/reduction_to_band/common.h    | 187 ++++++++++++++++++
 .../dlaf/eigensolver/reduction_to_band/impl.h | 179 -----------------
 2 files changed, 187 insertions(+), 179 deletions(-)

diff --git a/include/dlaf/eigensolver/reduction_to_band/common.h b/include/dlaf/eigensolver/reduction_to_band/common.h
index c15c749aaa..8a6314bea1 100644
--- a/include/dlaf/eigensolver/reduction_to_band/common.h
+++ b/include/dlaf/eigensolver/reduction_to_band/common.h
@@ -14,14 +14,21 @@
 
 #include <pika/barrier.hpp>
 
+#include <dlaf/blas/tile.h>
 #include <dlaf/common/index2d.h>
 #include <dlaf/common/single_threaded_blas.h>
 #include <dlaf/common/vector.h>
+#include <dlaf/communication/kernels/reduce.h>
 #include <dlaf/eigensolver/internal/get_red2band_barrier_busy_wait.h>
 #include <dlaf/eigensolver/internal/get_red2band_panel_nworkers.h>
+#include <dlaf/matrix/copy.h>
 #include <dlaf/matrix/matrix.h>
+#include <dlaf/matrix/panel.h>
+#include <dlaf/matrix/tile.h>
 #include <dlaf/matrix/views.h>
 #include <dlaf/schedulers.h>
+#include <dlaf/sender/traits.h>
+#include <dlaf/sender/transform.h>
 
 namespace dlaf::eigensolver::internal {
 
@@ -193,6 +200,59 @@ void updateTrailingPanel(const bool has_head, const std::vector<matrix::Tile<T,
   }
 }
 
+template <Backend B, typename ASender, typename WSender, typename XSender>
+void hemmDiag(pika::execution::thread_priority priority, ASender&& tile_a, WSender&& tile_w,
+              XSender&& tile_x) {
+  using T = dlaf::internal::SenderElementType<ASender>;
+  using pika::execution::thread_stacksize;
+
+  pika::execution::experimental::start_detached(
+      dlaf::internal::whenAllLift(blas::Side::Left, blas::Uplo::Lower, T(1),
+                                  std::forward<ASender>(tile_a), std::forward<WSender>(tile_w), T(1),
+                                  std::forward<XSender>(tile_x)) |
+      tile::hemm(dlaf::internal::Policy<B>(priority, thread_stacksize::nostack)));
+}
+
+// X += op(A) * W
+template <Backend B, typename ASender, typename WSender, typename XSender>
+void hemmOffDiag(pika::execution::thread_priority priority, blas::Op op, ASender&& tile_a,
+                 WSender&& tile_w, XSender&& tile_x) {
+  using T = dlaf::internal::SenderElementType<ASender>;
+  using pika::execution::thread_stacksize;
+
+  pika::execution::experimental::start_detached(
+      dlaf::internal::whenAllLift(op, blas::Op::NoTrans, T(1), std::forward<ASender>(tile_a),
+                                  std::forward<WSender>(tile_w), T(1), std::forward<XSender>(tile_x)) |
+      tile::gemm(dlaf::internal::Policy<B>(priority, thread_stacksize::nostack)));
+}
+
+template <Backend B, typename VSender, typename XSender, typename ASender>
+void her2kDiag(pika::execution::thread_priority priority, VSender&& tile_v, XSender&& tile_x,
+               ASender&& tile_a) {
+  using T = dlaf::internal::SenderElementType<VSender>;
+  using pika::execution::thread_stacksize;
+
+  pika::execution::experimental::start_detached(
+      dlaf::internal::whenAllLift(blas::Uplo::Lower, blas::Op::NoTrans, T(-1),
+                                  std::forward<VSender>(tile_v), std::forward<XSender>(tile_x),
+                                  BaseType<T>(1), std::forward<ASender>(tile_a)) |
+      tile::her2k(dlaf::internal::Policy<B>(priority, thread_stacksize::nostack)));
+}
+
+// C -= A . B*
+template <Backend B, typename ASender, typename BSender, typename CSender>
+void her2kOffDiag(pika::execution::thread_priority priority, ASender&& tile_a, BSender&& tile_b,
+                  CSender&& tile_c) {
+  using T = dlaf::internal::SenderElementType<ASender>;
+  using pika::execution::thread_stacksize;
+
+  pika::execution::experimental::start_detached(
+      dlaf::internal::whenAllLift(blas::Op::NoTrans, blas::Op::ConjTrans, T(-1),
+                                  std::forward<ASender>(tile_a), std::forward<BSender>(tile_b), T(1),
+                                  std::forward<CSender>(tile_c)) |
+      tile::gemm(dlaf::internal::Policy<B>(priority, thread_stacksize::nostack)));
+}
+
 }
 
 namespace red2band::local {
@@ -280,5 +340,132 @@ void computePanelReflectors(MatrixLikeA& mat_a, MatrixLikeTaus& mat_taus, const
       }));
 }
 
+template <Backend B, Device D, class T>
+void setupReflectorPanelV(bool has_head, const matrix::SubPanelView& panel_view, const SizeType nrefls,
+                          matrix::Panel<Coord::Col, T, D>& v, matrix::Matrix<const T, D>& mat_a,
+                          bool force_copy = false) {
+  namespace ex = pika::execution::experimental;
+
+  using pika::execution::thread_priority;
+  using pika::execution::thread_stacksize;
+
+  // Note:
+  // Reflectors are stored in the lower triangular part of the A matrix leading to sharing memory
+  // between reflectors and results, which are in the upper triangular part. The problem exists only
+  // for the first tile (of the V, i.e. band excluded). Since refelectors will be used in next
+  // computations, they should be well-formed, i.e. a unit lower trapezoidal matrix. For this reason,
+  // a support tile is used, where just the reflectors values are copied, the diagonal is set to 1
+  // and the rest is zeroed out.
+  auto it_begin = panel_view.iteratorLocal().begin();
+  auto it_end = panel_view.iteratorLocal().end();
+
+  if (has_head) {
+    const LocalTileIndex i = *it_begin;
+    matrix::SubTileSpec spec = panel_view(i);
+
+    // Note:
+    // If the number of reflectors are limited by height (|reflector| > 1), the panel is narrower than
+    // the blocksize, leading to just using a part of A (first full nrefls columns)
+    spec.size = {spec.size.rows(), std::min(nrefls, spec.size.cols())};
+
+    // Note:
+    // copy + laset is done in two independent tasks, but it could be theoretically merged to into a
+    // single task doing both.
+    const auto p = dlaf::internal::Policy<B>(thread_priority::high, thread_stacksize::nostack);
+    ex::start_detached(dlaf::internal::whenAllLift(splitTile(mat_a.read(i), spec), v.readwrite(i)) |
+                       matrix::copy(p));
+    ex::start_detached(dlaf::internal::whenAllLift(blas::Uplo::Upper, T(0), T(1), v.readwrite(i)) |
+                       tile::laset(p));
+
+    ++it_begin;
+  }
+
+  // The rest of the V panel of reflectors can just point to the values in A, since they are
+  // well formed in-place.
+  for (auto it = it_begin; it < it_end; ++it) {
+    const LocalTileIndex idx = *it;
+    const matrix::SubTileSpec& spec = panel_view(idx);
+
+    // Note:  This is a workaround for the deadlock problem with sub-tiles.
+    //        Without this copy, during matrix update the same tile would get accessed at the same
+    //        time both in readonly mode (for reflectors) and in readwrite mode (for updating the
+    //        matrix). This would result in a deadlock, so instead of linking the panel to an external
+    //        tile, memory provided internally by the panel is used as support. In this way, the two
+    //        subtiles used in the operation belong to different tiles.
+    if (force_copy)
+      ex::start_detached(ex::when_all(matrix::splitTile(mat_a.read(idx), spec), v.readwrite(idx)) |
+                         matrix::copy(dlaf::internal::Policy<B>(thread_priority::high,
+                                                                thread_stacksize::nostack)));
+    else
+      v.setTile(idx, matrix::splitTile(mat_a.read(idx), spec));
+  }
+}
+
+template <Backend B, Device D, class T>
+void trmmComputeW(matrix::Panel<Coord::Col, T, D>& w, matrix::Panel<Coord::Col, T, D>& v,
+                  matrix::ReadOnlyTileSender<T, D> tile_t) {
+  namespace ex = pika::execution::experimental;
+
+  using pika::execution::thread_priority;
+  using pika::execution::thread_stacksize;
+  using namespace blas;
+
+  auto it = w.iteratorLocal();
+
+  for (const auto& index_i : it) {
+    ex::start_detached(dlaf::internal::whenAllLift(Side::Right, Uplo::Upper, Op::NoTrans, Diag::NonUnit,
+                                                   T(1), tile_t, v.read(index_i), w.readwrite(index_i)) |
+                       tile::trmm3(dlaf::internal::Policy<B>(thread_priority::high,
+                                                             thread_stacksize::nostack)));
+  }
+
+  if (it.empty()) {
+    ex::start_detached(std::move(tile_t));
+  }
+}
+
+template <Backend B, Device D, class T>
+void gemmComputeW2(matrix::Matrix<T, D>& w2, matrix::Panel<Coord::Col, const T, D>& w,
+                   matrix::Panel<Coord::Col, const T, D>& x) {
+  using pika::execution::thread_priority;
+  using pika::execution::thread_stacksize;
+
+  namespace ex = pika::execution::experimental;
+
+  // Note:
+  // Not all ranks in the column always hold at least a tile in the panel Ai, but all ranks in
+  // the column are going to participate to the reduce. For them, it is important to set the
+  // partial result W2 to zero.
+  ex::start_detached(w2.readwrite(LocalTileIndex(0, 0)) |
+                     tile::set0(dlaf::internal::Policy<B>(thread_priority::high,
+                                                          thread_stacksize::nostack)));
+
+  using namespace blas;
+  // GEMM W2 = W* . X
+  for (const auto& index_tile : w.iteratorLocal())
+    ex::start_detached(
+        dlaf::internal::whenAllLift(Op::ConjTrans, Op::NoTrans, T(1), w.read(index_tile),
+                                    x.read(index_tile), T(1), w2.readwrite(LocalTileIndex(0, 0))) |
+        tile::gemm(dlaf::internal::Policy<B>(thread_priority::high, thread_stacksize::nostack)));
+}
+
+template <Backend B, Device D, class T>
+void gemmUpdateX(matrix::Panel<Coord::Col, T, D>& x, matrix::Matrix<const T, D>& w2,
+                 matrix::Panel<Coord::Col, const T, D>& v) {
+  namespace ex = pika::execution::experimental;
+
+  using pika::execution::thread_priority;
+  using pika::execution::thread_stacksize;
+  using namespace blas;
+
+  // GEMM X = X - 0.5 . V . W2
+  for (const auto& index_i : v.iteratorLocal())
+    ex::start_detached(
+        dlaf::internal::whenAllLift(Op::NoTrans, Op::NoTrans, T(-0.5), v.read(index_i),
+                                    w2.read(LocalTileIndex(0, 0)), T(1), x.readwrite(index_i)) |
+        tile::gemm(dlaf::internal::Policy<B>(thread_priority::high, thread_stacksize::nostack)));
+}
+
 }
+
 }
diff --git a/include/dlaf/eigensolver/reduction_to_band/impl.h b/include/dlaf/eigensolver/reduction_to_band/impl.h
index 0899be139a..7842a95254 100644
--- a/include/dlaf/eigensolver/reduction_to_band/impl.h
+++ b/include/dlaf/eigensolver/reduction_to_band/impl.h
@@ -60,161 +60,7 @@ namespace dlaf::eigensolver::internal {
 
 namespace red2band {
 
-template <Backend B, typename ASender, typename WSender, typename XSender>
-void hemmDiag(pika::execution::thread_priority priority, ASender&& tile_a, WSender&& tile_w,
-              XSender&& tile_x) {
-  using T = dlaf::internal::SenderElementType<ASender>;
-  using pika::execution::thread_stacksize;
-
-  pika::execution::experimental::start_detached(
-      dlaf::internal::whenAllLift(blas::Side::Left, blas::Uplo::Lower, T(1),
-                                  std::forward<ASender>(tile_a), std::forward<WSender>(tile_w), T(1),
-                                  std::forward<XSender>(tile_x)) |
-      tile::hemm(dlaf::internal::Policy<B>(priority, thread_stacksize::nostack)));
-}
-
-// X += op(A) * W
-template <Backend B, typename ASender, typename WSender, typename XSender>
-void hemmOffDiag(pika::execution::thread_priority priority, blas::Op op, ASender&& tile_a,
-                 WSender&& tile_w, XSender&& tile_x) {
-  using T = dlaf::internal::SenderElementType<ASender>;
-  using pika::execution::thread_stacksize;
-
-  pika::execution::experimental::start_detached(
-      dlaf::internal::whenAllLift(op, blas::Op::NoTrans, T(1), std::forward<ASender>(tile_a),
-                                  std::forward<WSender>(tile_w), T(1), std::forward<XSender>(tile_x)) |
-      tile::gemm(dlaf::internal::Policy<B>(priority, thread_stacksize::nostack)));
-}
-
-template <Backend B, typename VSender, typename XSender, typename ASender>
-void her2kDiag(pika::execution::thread_priority priority, VSender&& tile_v, XSender&& tile_x,
-               ASender&& tile_a) {
-  using T = dlaf::internal::SenderElementType<VSender>;
-  using pika::execution::thread_stacksize;
-
-  pika::execution::experimental::start_detached(
-      dlaf::internal::whenAllLift(blas::Uplo::Lower, blas::Op::NoTrans, T(-1),
-                                  std::forward<VSender>(tile_v), std::forward<XSender>(tile_x),
-                                  BaseType<T>(1), std::forward<ASender>(tile_a)) |
-      tile::her2k(dlaf::internal::Policy<B>(priority, thread_stacksize::nostack)));
-}
-
-// C -= A . B*
-template <Backend B, typename ASender, typename BSender, typename CSender>
-void her2kOffDiag(pika::execution::thread_priority priority, ASender&& tile_a, BSender&& tile_b,
-                  CSender&& tile_c) {
-  using T = dlaf::internal::SenderElementType<ASender>;
-  using pika::execution::thread_stacksize;
-
-  pika::execution::experimental::start_detached(
-      dlaf::internal::whenAllLift(blas::Op::NoTrans, blas::Op::ConjTrans, T(-1),
-                                  std::forward<ASender>(tile_a), std::forward<BSender>(tile_b), T(1),
-                                  std::forward<CSender>(tile_c)) |
-      tile::gemm(dlaf::internal::Policy<B>(priority, thread_stacksize::nostack)));
-}
-
 namespace local {
-template <Backend B, Device D, class T>
-void setupReflectorPanelV(bool has_head, const matrix::SubPanelView& panel_view, const SizeType nrefls,
-                          matrix::Panel<Coord::Col, T, D>& v, matrix::Matrix<const T, D>& mat_a,
-                          bool force_copy = false) {
-  namespace ex = pika::execution::experimental;
-
-  using pika::execution::thread_priority;
-  using pika::execution::thread_stacksize;
-
-  // Note:
-  // Reflectors are stored in the lower triangular part of the A matrix leading to sharing memory
-  // between reflectors and results, which are in the upper triangular part. The problem exists only
-  // for the first tile (of the V, i.e. band excluded). Since refelectors will be used in next
-  // computations, they should be well-formed, i.e. a unit lower trapezoidal matrix. For this reason,
-  // a support tile is used, where just the reflectors values are copied, the diagonal is set to 1
-  // and the rest is zeroed out.
-  auto it_begin = panel_view.iteratorLocal().begin();
-  auto it_end = panel_view.iteratorLocal().end();
-
-  if (has_head) {
-    const LocalTileIndex i = *it_begin;
-    matrix::SubTileSpec spec = panel_view(i);
-
-    // Note:
-    // If the number of reflectors are limited by height (|reflector| > 1), the panel is narrower than
-    // the blocksize, leading to just using a part of A (first full nrefls columns)
-    spec.size = {spec.size.rows(), std::min(nrefls, spec.size.cols())};
-
-    // Note:
-    // copy + laset is done in two independent tasks, but it could be theoretically merged to into a
-    // single task doing both.
-    const auto p = dlaf::internal::Policy<B>(thread_priority::high, thread_stacksize::nostack);
-    ex::start_detached(dlaf::internal::whenAllLift(splitTile(mat_a.read(i), spec), v.readwrite(i)) |
-                       matrix::copy(p));
-    ex::start_detached(dlaf::internal::whenAllLift(blas::Uplo::Upper, T(0), T(1), v.readwrite(i)) |
-                       tile::laset(p));
-
-    ++it_begin;
-  }
-
-  // The rest of the V panel of reflectors can just point to the values in A, since they are
-  // well formed in-place.
-  for (auto it = it_begin; it < it_end; ++it) {
-    const LocalTileIndex idx = *it;
-    const matrix::SubTileSpec& spec = panel_view(idx);
-
-    // Note:  This is a workaround for the deadlock problem with sub-tiles.
-    //        Without this copy, during matrix update the same tile would get accessed at the same
-    //        time both in readonly mode (for reflectors) and in readwrite mode (for updating the
-    //        matrix). This would result in a deadlock, so instead of linking the panel to an external
-    //        tile, memory provided internally by the panel is used as support. In this way, the two
-    //        subtiles used in the operation belong to different tiles.
-    if (force_copy)
-      ex::start_detached(ex::when_all(matrix::splitTile(mat_a.read(idx), spec), v.readwrite(idx)) |
-                         matrix::copy(dlaf::internal::Policy<B>(thread_priority::high,
-                                                                thread_stacksize::nostack)));
-    else
-      v.setTile(idx, matrix::splitTile(mat_a.read(idx), spec));
-  }
-}
-
-template <Backend B, Device D, class T>
-void trmmComputeW(matrix::Panel<Coord::Col, T, D>& w, matrix::Panel<Coord::Col, T, D>& v,
-                  matrix::ReadOnlyTileSender<T, D> tile_t) {
-  namespace ex = pika::execution::experimental;
-
-  using pika::execution::thread_priority;
-  using pika::execution::thread_stacksize;
-  using namespace blas;
-
-  auto it = w.iteratorLocal();
-
-  for (const auto& index_i : it) {
-    ex::start_detached(dlaf::internal::whenAllLift(Side::Right, Uplo::Upper, Op::NoTrans, Diag::NonUnit,
-                                                   T(1), tile_t, v.read(index_i), w.readwrite(index_i)) |
-                       tile::trmm3(dlaf::internal::Policy<B>(thread_priority::high,
-                                                             thread_stacksize::nostack)));
-  }
-
-  if (it.empty()) {
-    ex::start_detached(std::move(tile_t));
-  }
-}
-
-template <Backend B, Device D, class T>
-void gemmUpdateX(matrix::Panel<Coord::Col, T, D>& x, matrix::Matrix<const T, D>& w2,
-                 matrix::Panel<Coord::Col, const T, D>& v) {
-  namespace ex = pika::execution::experimental;
-
-  using pika::execution::thread_priority;
-  using pika::execution::thread_stacksize;
-  using namespace blas;
-
-  // GEMM X = X - 0.5 . V . W2
-  for (const auto& index_i : v.iteratorLocal())
-    ex::start_detached(
-        dlaf::internal::whenAllLift(Op::NoTrans, Op::NoTrans, T(-0.5), v.read(index_i),
-                                    w2.read(LocalTileIndex(0, 0)), T(1), x.readwrite(index_i)) |
-        tile::gemm(dlaf::internal::Policy<B>(thread_priority::high, thread_stacksize::nostack)));
-}
-
 template <Backend B, Device D, class T>
 void hemmComputeX(matrix::Panel<Coord::Col, T, D>& x, const matrix::SubMatrixView& view,
                   matrix::Matrix<const T, D>& a, matrix::Panel<Coord::Col, const T, D>& w) {
@@ -270,31 +116,6 @@ void hemmComputeX(matrix::Panel<Coord::Col, T, D>& x, const matrix::SubMatrixVie
   }
 }
 
-template <Backend B, Device D, class T>
-void gemmComputeW2(matrix::Matrix<T, D>& w2, matrix::Panel<Coord::Col, const T, D>& w,
-                   matrix::Panel<Coord::Col, const T, D>& x) {
-  using pika::execution::thread_priority;
-  using pika::execution::thread_stacksize;
-
-  namespace ex = pika::execution::experimental;
-
-  // Note:
-  // Not all ranks in the column always hold at least a tile in the panel Ai, but all ranks in
-  // the column are going to participate to the reduce. For them, it is important to set the
-  // partial result W2 to zero.
-  ex::start_detached(w2.readwrite(LocalTileIndex(0, 0)) |
-                     tile::set0(dlaf::internal::Policy<B>(thread_priority::high,
-                                                          thread_stacksize::nostack)));
-
-  using namespace blas;
-  // GEMM W2 = W* . X
-  for (const auto& index_tile : w.iteratorLocal())
-    ex::start_detached(
-        dlaf::internal::whenAllLift(Op::ConjTrans, Op::NoTrans, T(1), w.read(index_tile),
-                                    x.read(index_tile), T(1), w2.readwrite(LocalTileIndex(0, 0))) |
-        tile::gemm(dlaf::internal::Policy<B>(thread_priority::high, thread_stacksize::nostack)));
-}
-
 template <Backend B, Device D, class T>
 void her2kUpdateTrailingMatrix(const matrix::SubMatrixView& view, matrix::Matrix<T, D>& a,
                                matrix::Panel<Coord::Col, const T, D>& x,

From 12015bd7ebbae51dcbbcc76333687882741dfd25 Mon Sep 17 00:00:00 2001
From: Alberto Invernizzi <invernizzi@cscs.ch>
Date: Thu, 11 Jul 2024 17:43:30 +0200
Subject: [PATCH 06/30] first implementation of trailing matrix update (1st
 pass)

---
 .../eigensolver/reduction_to_band/ca-impl.h   | 408 +++++++++++++++++-
 1 file changed, 393 insertions(+), 15 deletions(-)

diff --git a/include/dlaf/eigensolver/reduction_to_band/ca-impl.h b/include/dlaf/eigensolver/reduction_to_band/ca-impl.h
index 5f3c28efd6..7acf5e5726 100644
--- a/include/dlaf/eigensolver/reduction_to_band/ca-impl.h
+++ b/include/dlaf/eigensolver/reduction_to_band/ca-impl.h
@@ -12,10 +12,13 @@
 
 #include <pika/execution.hpp>
 
+#include <dlaf/communication/broadcast_panel.h>
 #include <dlaf/communication/communicator_grid.h>
+#include <dlaf/communication/kernels/all_reduce.h>
 #include <dlaf/communication/kernels/broadcast.h>
 #include <dlaf/eigensolver/reduction_to_band/api.h>
 #include <dlaf/eigensolver/reduction_to_band/common.h>
+#include <dlaf/factorization/qr.h>
 #include <dlaf/lapack/tile.h>
 #include <dlaf/matrix/copy_tile.h>
 #include <dlaf/matrix/matrix.h>
@@ -25,13 +28,198 @@
 #include <dlaf/sender/transform_mpi.h>
 
 //
-#include <iostream>
-
 #include <dlaf/matrix/print_numpy.h>
 
 namespace dlaf::eigensolver::internal {
 
-namespace ca_red2band {}
+namespace ca_red2band {
+template <Backend B, Device D, class T>
+void hemm(comm::Index2D rank_qr, matrix::Panel<Coord::Col, T, D>& W1,
+          matrix::Panel<Coord::Row, T, D, matrix::StoreTransposed::Yes>& W1T,
+          const matrix::SubMatrixView& at_view, matrix::Matrix<const T, D>& A,
+          matrix::Panel<Coord::Col, const T, D>& W0,
+          matrix::Panel<Coord::Row, T, D, matrix::StoreTransposed::Yes>& W0T,
+          comm::CommunicatorPipeline<comm::CommunicatorType::Row>& mpi_row_chain,
+          comm::CommunicatorPipeline<comm::CommunicatorType::Col>& mpi_col_chain) {
+  namespace ex = pika::execution::experimental;
+
+  using red2band::hemmDiag;
+  using red2band::hemmOffDiag;
+
+  using pika::execution::thread_priority;
+
+  const auto dist = A.distribution();
+  const auto rank = dist.rankIndex();
+
+  // Note:
+  // They have to be set to zero, because all tiles are going to be reduced, and some tiles may not get
+  // "initialized" during computation, so they should not contribute with any spurious value to final result.
+  matrix::util::set0<B>(thread_priority::high, W1);
+  matrix::util::set0<B>(thread_priority::high, W1T);
+
+  const LocalTileIndex at_offset = at_view.begin();
+
+  for (SizeType i_lc = at_offset.row(); i_lc < dist.local_nr_tiles().rows(); ++i_lc) {
+    // Note:
+    // diagonal included: get where the first upper tile is in local coordinates
+    const SizeType i = dist.template global_tile_from_local_tile<Coord::Row>(i_lc);
+    const auto j_end_lc = dist.template next_local_tile_from_global_tile<Coord::Col>(i + 1);
+
+    for (SizeType j_lc = j_end_lc - 1; j_lc >= at_offset.col(); --j_lc) {
+      const LocalTileIndex ij_lc{i_lc, j_lc};
+      const GlobalTileIndex ij = dist.global_tile_index(ij_lc);
+
+      const bool is_diagonal_tile = (ij.row() == ij.col());
+
+      auto getSubA = [&A, &at_view, ij_lc]() { return splitTile(A.read(ij_lc), at_view(ij_lc)); };
+
+      if (is_diagonal_tile) {
+        const comm::IndexT_MPI id_qr_R = dist.template rank_global_tile<Coord::Row>(ij.col());
+
+        // Note:
+        // Use W0 just if the tile belongs to the current local transformation.
+        if (id_qr_R != rank_qr.row())
+          continue;
+
+        hemmDiag<B>(thread_priority::high, getSubA(), W0.read(ij_lc), W1.readwrite(ij_lc));
+      }
+      else {
+        const GlobalTileIndex ijL = ij;
+        const comm::IndexT_MPI id_qr_lower_R = dist.template rank_global_tile<Coord::Row>(ijL.col());
+        if (id_qr_lower_R == rank_qr.row()) {
+          // Note:
+          // Since it is not a diagonal tile, otherwise it would have been managed in the previous
+          // branch, the second operand might not be available in W but it is accessible through the
+          // support panel W1T.
+          // However, since we are still computing the "straight" part, the result can be stored
+          // in the "local" panel W1.
+          hemmOffDiag<B>(thread_priority::high, blas::Op::NoTrans, getSubA(), W0T.read(ij_lc),
+                         W1.readwrite(ij_lc));
+        }
+
+        const GlobalTileIndex ijU = transposed(ij);
+        const comm::IndexT_MPI id_qr_upper_R = dist.template rank_global_tile<Coord::Row>(ijU.col());
+        if (id_qr_upper_R == rank_qr.row()) {
+          // Note:
+          // Here we are considering the hermitian part of A, so pretend to deal with transposed coordinate.
+          // Check if the result still belongs to the same rank, otherwise store it in the support panel.
+          const comm::IndexT_MPI owner_row = dist.template rank_global_tile<Coord::Row>(ijU.row());
+          const SizeType iU_lc = dist.template local_tile_from_global_tile<Coord::Row>(ij.col());
+          const LocalTileIndex i_w1_lc(iU_lc, 0);
+          const LocalTileIndex i_w1t_lc(0, ij_lc.col());
+          auto tile_w1 = (rank.row() == owner_row) ? W1.readwrite(i_w1_lc) : W1T.readwrite(i_w1t_lc);
+
+          hemmOffDiag<B>(thread_priority::high, blas::Op::ConjTrans, getSubA(), W0.read(ij_lc),
+                         std::move(tile_w1));
+        }
+      }
+    }
+  }
+
+  // Note:
+  // At this point, partial results of W1 are available in the panels, and they have to be reduced,
+  // both row-wise and col-wise. The final W1 result will be available just on Ai panel column.
+
+  // Note:
+  // The first step in reducing partial results distributed over W1 and W1T, it is to reduce the row
+  // panel W1T col-wise, by collecting all W1T results on the rank which can "mirror" the result on its
+  // rows (i.e. diagonal). So, for each tile of the row panel, select who is the "diagonal" rank that can
+  // mirror and reduce on it.
+  if (mpi_col_chain.size() > 1) {
+    for (const auto& i_wt_lc : W1T.iteratorLocal()) {
+      const auto i_diag = dist.template global_tile_from_local_tile<Coord::Col>(i_wt_lc.col());
+      const auto rank_owner_row = dist.template rank_global_tile<Coord::Row>(i_diag);
+
+      if (rank_owner_row == rank.row()) {
+        // Note:
+        // Since it is the owner, it has to perform the "mirroring" of the results from columns to
+        // rows.
+        // Moreover, it reduces in place because the owner of the diagonal stores the partial result
+        // directly in W1 (without using W1T)
+        const auto i_w1_lc = dist.template local_tile_from_global_tile<Coord::Row>(i_diag);
+        ex::start_detached(comm::schedule_reduce_recv_in_place(mpi_col_chain.exclusive(), MPI_SUM,
+                                                               W1.readwrite({i_w1_lc, 0})));
+      }
+      else {
+        ex::start_detached(comm::schedule_reduce_send(mpi_col_chain.exclusive(), rank_owner_row, MPI_SUM,
+                                                      W1T.read(i_wt_lc)));
+      }
+    }
+  }
+
+  // Note:
+  // At this point partial results are all collected in X (Xt has been embedded in previous step),
+  // so the last step needed is to reduce these last partial results in the final results.
+  if (mpi_row_chain.size() > 1) {
+    for (const auto& i_w1_lc : W1.iteratorLocal()) {
+      if (rank_qr.col() == rank.col())
+        ex::start_detached(comm::schedule_reduce_recv_in_place(mpi_row_chain.exclusive(), MPI_SUM,
+                                                               W1.readwrite(i_w1_lc)));
+      else
+        ex::start_detached(comm::schedule_reduce_send(mpi_row_chain.exclusive(), rank_qr.col(), MPI_SUM,
+                                                      W1.read(i_w1_lc)));
+    }
+  }
+}
+
+template <Backend B, Device D, class T>
+void her2kUpdateTrailingMatrix(comm::Index2D rank_qr, const matrix::SubMatrixView& at_view,
+                               matrix::Matrix<T, D>& a, matrix::Panel<Coord::Col, const T, D>& W3,
+                               matrix::Panel<Coord::Col, const T, D>& V) {
+  static_assert(std::is_signed_v<BaseType<T>>, "alpha in computations requires to be -1");
+
+  using pika::execution::thread_priority;
+  using red2band::her2kDiag;
+  using red2band::her2kOffDiag;
+
+  const auto dist = a.distribution();
+  const comm::Index2D rank = dist.rank_index();
+
+  const LocalTileIndex at_offset = at_view.begin();
+
+  if (rank_qr.row() != rank.row())
+    return;
+
+  for (SizeType i_lc = at_offset.row(); i_lc < dist.local_nr_tiles().rows(); ++i_lc) {
+    // Note:
+    // diagonal included: get where the first upper tile is in local coordinates
+    const SizeType i = dist.template global_tile_from_local_tile<Coord::Row>(i_lc);
+    const auto j_end_lc = dist.template next_local_tile_from_global_tile<Coord::Col>(i + 1);
+
+    for (SizeType j_lc = j_end_lc - 1; j_lc >= at_offset.col(); --j_lc) {
+      const LocalTileIndex ij_lc{i_lc, j_lc};
+      const GlobalTileIndex ij = dist.global_tile_index(ij_lc);
+
+      const comm::IndexT_MPI id_qr_L = dist.template rank_global_tile<Coord::Row>(ij.row());
+      const comm::IndexT_MPI id_qr_R = dist.template rank_global_tile<Coord::Row>(ij.col());
+
+      // Note: this computation applies just to tiles where transformation applies both from L and R
+      if (id_qr_L != id_qr_R)
+        continue;
+
+      const bool is_diagonal_tile = (ij.row() == ij.col());
+
+      auto getSubA = [&a, &at_view, ij_lc]() { return splitTile(a.readwrite(ij_lc), at_view(ij_lc)); };
+
+      // The first column of the trailing matrix (except for the very first global tile) has to be
+      // updated first, in order to unlock the next iteration as soon as possible.
+      const auto priority = (j_lc == at_offset.col()) ? thread_priority::high : thread_priority::normal;
+
+      if (is_diagonal_tile) {
+        her2kDiag<B>(priority, V.read(ij_lc), W3.read(ij_lc), getSubA());
+      }
+      else {
+        // TODO check and document why tranposed operand can be accessed with same index locally
+        // A -= W3 . V*
+        her2kOffDiag<B>(priority, W3.read(ij_lc), V.read(ij_lc), getSubA());
+        // A -= V . W3*
+        her2kOffDiag<B>(priority, V.read(ij_lc), W3.read(ij_lc), getSubA());
+      }
+    }
+  }
+}
+
+}
 
 // Distributed implementation of reduction to band
 template <Backend B, Device D, class T>
@@ -40,6 +228,10 @@ CARed2BandResult<T, D> CAReductionToBand<B, D, T>::call(comm::CommunicatorGrid&
   namespace ex = pika::execution::experimental;
   namespace di = dlaf::internal;
 
+  using common::RoundRobin;
+  using matrix::Panel;
+  using matrix::StoreTransposed;
+
   const auto& dist = mat_a.distribution();
   const comm::Index2D rank = dist.rank_index();
 
@@ -74,6 +266,7 @@ CARed2BandResult<T, D> CAReductionToBand<B, D, T>::call(comm::CommunicatorGrid&
     return {std::move(mat_taus_1st), std::move(mat_taus_2nd), std::move(mat_hh_2nd)};
 
   auto mpi_col_chain = grid.col_communicator_pipeline();
+  auto mpi_row_chain = grid.row_communicator_pipeline();
 
   constexpr std::size_t n_workspaces = 2;
 
@@ -85,38 +278,223 @@ CARed2BandResult<T, D> CAReductionToBand<B, D, T>::call(comm::CommunicatorGrid&
   const matrix::Distribution dist_heads(
       LocalElementSize(dist.grid_size().rows() * dist.block_size().rows(), dist.block_size().cols()),
       dist.block_size());
-  common::RoundRobin<matrix::Panel<Coord::Col, T, D>> panels_heads(n_workspaces, dist_heads);
+
+  RoundRobin<Panel<Coord::Col, T, D>> panels_heads(n_workspaces, dist_heads);
+
+  // update trailing matrix workspaces
+  RoundRobin<Panel<Coord::Col, T, D>> panels_v(n_workspaces, dist);
+  RoundRobin<Panel<Coord::Row, T, D, StoreTransposed::Yes>> panels_vt(n_workspaces, dist);
+
+  RoundRobin<Panel<Coord::Col, T, D>> panels_w0(n_workspaces, dist);
+  RoundRobin<Panel<Coord::Row, T, D, StoreTransposed::Yes>> panels_w0t(n_workspaces, dist);
+
+  RoundRobin<Panel<Coord::Col, T, D>> panels_w1(n_workspaces, dist);
+  RoundRobin<Panel<Coord::Row, T, D, StoreTransposed::Yes>> panels_w1t(n_workspaces, dist);
+
+  RoundRobin<Panel<Coord::Col, T, D>> panels_w3(n_workspaces, dist);
 
   DLAF_ASSERT(mat_a.block_size().cols() == band_size, mat_a.block_size().cols(), band_size);
   const SizeType ntiles = (nrefls - 1) / band_size + 1;
 
+  const bool is_full_band = (band_size == dist.blockSize().cols());
+
   for (SizeType j = 0; j < ntiles; ++j) {
     const SizeType i = j + 1;
-    // const SizeType j_lc = dist.template next_local_tile_from_global_tile<Coord::Col>(j);
+
+    const SizeType nrefls_step = dist_taus.tile_size_of({0, j}).rows();
 
     // panel
     const GlobalTileIndex panel_offset(i, j);
     const GlobalElementIndex panel_offset_el(panel_offset.row() * band_size,
                                              panel_offset.col() * band_size);
     matrix::SubPanelView panel_view(dist, panel_offset_el, band_size);
-    const comm::IndexT_MPI rank_panel = dist.template rank_global_tile<Coord::Col>(panel_offset.col());
+
+    const comm::IndexT_MPI rank_panel(dist.template rank_global_tile<Coord::Col>(panel_offset.col()));
+
+    const SizeType n_qr_heads =
+        std::min<SizeType>(panel_view.offset().row() + grid.size().rows(), dist.nr_tiles().rows()) -
+        panel_view.offset().row();
 
     // trailing
-    const GlobalTileIndex trailing_offset(i, j + 1);
-    const GlobalElementIndex trailing_offset_el(trailing_offset.row() * band_size,
-                                                trailing_offset.col() * band_size);
-    matrix::SubMatrixView(dist, trailing_offset_el);
+    const GlobalTileIndex at_offset(i, j + 1);
+    const GlobalElementIndex at_offset_el(at_offset.row() * band_size, at_offset.col() * band_size);
+    const LocalTileIndex at_offset_lc(
+        dist.template next_local_tile_from_global_tile<Coord::Row>(at_offset.row()),
+        dist.template next_local_tile_from_global_tile<Coord::Col>(at_offset.col()));
+    matrix::SubMatrixView at_view(dist, at_offset_el);
 
     // PANEL: just ranks in the current column
     // QR local (HH reflectors stored in-place)
-    if (rank_panel == rank.col()) {
-      // Note:  SubPanelView is (at most) band_size wide, but it may contain a smaller number of
-      //        reflectors (i.e. at the end when last reflector size is 1)
+    if (rank_panel == rank.col())
       red2band::local::computePanelReflectors(mat_a, mat_taus_1st, panel_offset.col(), panel_view);
-    }
 
     // TRAILING 1st pass
-    {}
+    if (at_offset_el.isIn(mat_a.size())) {
+      auto& ws_V = panels_v.nextResource();
+      auto& ws_VT = panels_vt.nextResource();
+
+      ws_V.setRangeStart(at_offset);
+      ws_V.setWidth(nrefls_step);
+
+      const LocalTileIndex zero_lc(0, 0);
+      matrix::Matrix<T, D> ws_T({nrefls_step, nrefls_step}, dist.block_size());
+
+      if (rank_panel == rank.col()) {
+        using factorization::internal::computeTFactor;
+        using red2band::local::setupReflectorPanelV;
+
+        constexpr bool has_head = true;  // Note: they are local, so every rank has its own head
+        setupReflectorPanelV<B, D, T>(has_head, panel_view, nrefls_step, ws_V, mat_a, !is_full_band);
+
+        const GlobalTileIndex j_tau(j, 0);
+        computeTFactor<B>(ws_V, mat_taus_1st.read(j_tau), ws_T.readwrite(zero_lc));
+      }
+
+      ws_VT.setRangeStart(at_offset);
+      ws_VT.setHeight(nrefls_step);
+
+      comm::broadcast(rank_panel, ws_V, ws_VT, mpi_row_chain, mpi_col_chain);
+
+      // W = V T
+      auto& ws_W0 = panels_w0.nextResource();
+      auto& ws_W0T = panels_w0t.nextResource();
+
+      ws_W0.setRangeStart(at_offset);
+      ws_W0T.setRangeStart(at_offset);
+
+      ws_W0.setWidth(nrefls_step);
+      ws_W0T.setHeight(nrefls_step);
+
+      if (rank_panel == rank.col())
+        red2band::local::trmmComputeW<B, D>(ws_W0, ws_V, ws_T.read(zero_lc));
+
+      comm::broadcast(rank_panel, ws_W0, ws_W0T, mpi_row_chain, mpi_col_chain);
+
+      // Note: apply local transformations, one after the other
+      auto& ws_W1 = panels_w1.nextResource();
+      auto& ws_W1T = panels_w1t.nextResource();
+      matrix::Matrix<T, D> ws_W2 = std::move(ws_T);
+
+      for (int idx_qr_head = 0; idx_qr_head < n_qr_heads; ++idx_qr_head) {
+        const SizeType head_qr = at_view.offset().row() + idx_qr_head;
+        const comm::Index2D rank_qr(dist.template rank_global_tile<Coord::Row>(head_qr), rank_panel);
+
+        ws_W1.setRangeStart(at_offset);
+        ws_W1.setWidth(nrefls_step);
+
+        ws_W1T.setRangeStart(at_offset);
+        ws_W1T.setHeight(nrefls_step);
+
+        // W1 = A W0
+        // Note:
+        // it will fill up all W1 (distributed) and it will read just part of A, i.e. columns where this
+        // local transformation should be applied from the right.
+        using ca_red2band::hemm;
+        hemm<B>(rank_qr, ws_W1, ws_W1T, at_view, mat_a, ws_W0, ws_W0T, mpi_row_chain, mpi_col_chain);
+
+        // Note:
+        // W1T has been used as support panel, so reset it again.
+        ws_W1T.reset();
+        ws_W1T.setRangeStart(at_offset);
+        ws_W1T.setHeight(nrefls_step);
+
+        comm::broadcast(rank_panel, ws_W1, ws_W1T, mpi_row_chain, mpi_col_chain);
+
+        // LR
+        // A -= V W1* + W1 V* - V W0* W1 V*
+        if (rank.row() == rank_qr.row()) {
+          // W2 = W0.T W1
+          red2band::local::gemmComputeW2<B, D>(ws_W2, ws_W0, ws_W1);
+
+          // Note:
+          // Next steps for L and R need W1, so we create a copy that we are going to update for this step.
+          auto& ws_W3 = panels_w3.nextResource();
+          ws_W3.setRangeStart(at_offset);
+          ws_W3.setWidth(nrefls_step);
+
+          for (const auto& idx : ws_W1.iteratorLocal())
+            ex::start_detached(ex::when_all(ws_W1.read(idx), ws_W3.readwrite(idx)) |
+                               dlaf::matrix::copy(di::Policy<Backend::MC>{}));
+
+          // W1 -= 0.5 V W2
+          red2band::local::gemmUpdateX<B, D>(ws_W3, ws_W2, ws_V);
+          // A -= W1 V.T + V W1.T
+          ca_red2band::her2kUpdateTrailingMatrix<B>(rank_qr, at_view, mat_a, ws_W3, ws_V);
+
+          ws_W3.reset();
+        }
+
+        // R (exclusively)
+        // A -= W1 V*
+        // Note: all rows, but just the columns that are in the local transformation rank
+        for (SizeType j_lc = at_offset_lc.col(); j_lc < dist.local_nr_tiles().cols(); ++j_lc) {
+          const SizeType j = dist.template global_tile_from_local_tile<Coord::Col>(j_lc);
+          const comm::IndexT_MPI id_qr_R = dist.template rank_global_tile<Coord::Row>(j);
+
+          if (rank_qr.row() != id_qr_R)
+            continue;
+
+          auto&& tile_vt = ws_VT.read({0, j_lc});
+
+          for (SizeType i_lc = at_offset_lc.row(); i_lc < dist.local_nr_tiles().rows(); ++i_lc) {
+            const LocalTileIndex ij_lc(i_lc, j_lc);
+            const GlobalTileIndex ij = dist.global_tile_index(ij_lc);
+
+            // TODO just lower part of trailing matrix
+            if (ij.row() < ij.col())
+              continue;
+
+            const comm::IndexT_MPI id_qr_L = dist.template rank_global_tile<Coord::Row>(ij.row());
+
+            // Note: exclusively from R, if it is an LR tile, it is computed elsewhere
+            if (id_qr_L == id_qr_R)
+              continue;
+
+            ex::start_detached(di::whenAllLift(blas::Op::NoTrans, blas::Op::ConjTrans, T(-1),
+                                               ws_W1.read(ij_lc), tile_vt, T(1),
+                                               mat_a.readwrite(ij_lc)) |
+                               tile::gemm(di::Policy<B>()));
+          }
+        }
+
+        // L (exclusively)
+        // A -= V W1*
+        // Note: all cols, but just the rows of current transformation
+        if (rank_qr.row() == rank.row()) {
+          for (SizeType i_lc = at_offset_lc.row(); i_lc < dist.local_nr_tiles().rows(); ++i_lc) {
+            const comm::IndexT_MPI id_qr_L = rank_qr.row();
+
+            for (SizeType j_lc = at_offset_lc.col(); j_lc < dist.local_nr_tiles().cols(); ++j_lc) {
+              const LocalTileIndex ij_lc(i_lc, j_lc);
+              const GlobalTileIndex ij = dist.global_tile_index(ij_lc);
+
+              // TODO just lower part of trailing matrix
+              if (ij.row() < ij.col())
+                continue;
+
+              const comm::IndexT_MPI id_qr_R = dist.template rank_global_tile<Coord::Row>(ij.col());
+
+              // Note: exclusively from L, if it is an LR tile, it is computed elsewhere
+              if (id_qr_L == id_qr_R)
+                continue;
+
+              ex::start_detached(di::whenAllLift(blas::Op::NoTrans, blas::Op::ConjTrans, T(-1),
+                                                 ws_V.read(ij_lc), ws_W1T.read(ij_lc), T(1),
+                                                 mat_a.readwrite(ij_lc)) |
+                                 tile::gemm(di::Policy<B>()));
+            }
+          }
+        }
+
+        ws_W1T.reset();
+        ws_W1.reset();
+      }
+
+      ws_W0T.reset();
+      ws_W0.reset();
+      ws_VT.reset();
+      ws_V.reset();
+    }
 
     // PANEL: just ranks in the current column
     // QR local with just heads (HH reflectors have to be computed elsewhere, 1st-pass in-place)

From 255b5e1e89d6752345491850ed750c48046db804 Mon Sep 17 00:00:00 2001
From: Alberto Invernizzi <invernizzi@cscs.ch>
Date: Thu, 11 Jul 2024 17:44:01 +0200
Subject: [PATCH 07/30] fixes for qr 2nd pass

---
 .../eigensolver/reduction_to_band/ca-impl.h   | 64 ++++++++-----------
 1 file changed, 26 insertions(+), 38 deletions(-)

diff --git a/include/dlaf/eigensolver/reduction_to_band/ca-impl.h b/include/dlaf/eigensolver/reduction_to_band/ca-impl.h
index 7acf5e5726..d3f5e12ce5 100644
--- a/include/dlaf/eigensolver/reduction_to_band/ca-impl.h
+++ b/include/dlaf/eigensolver/reduction_to_band/ca-impl.h
@@ -500,21 +500,20 @@ CARed2BandResult<T, D> CAReductionToBand<B, D, T>::call(comm::CommunicatorGrid&
     // QR local with just heads (HH reflectors have to be computed elsewhere, 1st-pass in-place)
     if (rank_panel == rank.col()) {
       auto&& panel_heads = panels_heads.nextResource();
+      panel_heads.setRangeEnd({n_qr_heads, 0});
+      panel_heads.setWidth(nrefls_step);
 
-      const SizeType i_begin_head = i;
-      const SizeType i_end_head =
-          std::min((i_begin_head + dist.grid_size().rows() + 1), dist.nr_tiles().rows());
+      for (int idx_head = 0; idx_head < n_qr_heads; ++idx_head) {
+        using dlaf::comm::schedule_bcast_recv;
+        using dlaf::comm::schedule_bcast_send;
 
-      const SizeType nheads = i_end_head - i_begin_head;
-      panel_heads.setRangeEnd({nheads, 0});
+        const SizeType i_head = panel_view.offset().row() + idx_head;
+        const comm::Index2D rank_head(dist.template rank_global_tile<Coord::Row>(i_head), rank_panel);
 
-      for (SizeType i_head = i_begin_head, idx = 0; i_head < i_end_head; ++i_head, ++idx) {
-        const LocalTileIndex idx_panel_head(idx, 0);
         const GlobalTileIndex ij_head(i_head, j);
+        const LocalTileIndex idx_panel_head(idx_head, 0);
 
-        const comm::IndexT_MPI rank_owner = dist.template rank_global_tile<Coord::Row>(i_head);
-
-        if (rank.row() == rank_owner) {
+        if (rank.row() == rank_head.row()) {
           // copy - set - send
           ex::start_detached(ex::when_all(mat_a.read(ij_head), panel_heads.readwrite(idx_panel_head)) |
                              di::transform(di::Policy<B>(), [=](const auto& head_in, auto&& head) {
@@ -523,59 +522,48 @@ CARed2BandResult<T, D> CAReductionToBand<B, D, T>::call(comm::CommunicatorGrid&
                                              head.size().cols(), T(0), T(0), head.ptr({1, 0}),
                                              head.ld());
                              }));
-
-          ex::start_detached(dlaf::comm::schedule_bcast_send(mpi_col_chain.exclusive(),
-                                                             panel_heads.read(idx_panel_head)));
+          ex::start_detached(schedule_bcast_send(mpi_col_chain.exclusive(),
+                                                 panel_heads.read(idx_panel_head)));
         }
         else {
           // receive
-          ex::start_detached(dlaf::comm::schedule_bcast_recv(mpi_col_chain.exclusive(), rank_owner,
-                                                             panel_heads.readwrite(idx_panel_head)));
+          ex::start_detached(schedule_bcast_recv(mpi_col_chain.exclusive(), rank_head.row(),
+                                                 panel_heads.readwrite(idx_panel_head)));
         }
       }
 
-      // for (auto ij : panel_heads.iteratorLocal()) {
-      //   const auto& tile = pika::this_thread::experimental::sync_wait(panel_heads.read(ij)).get();
-      //   std::cout << "heads(" << ij << ") = ";
-      //   print(format::numpy{}, tile);
-      // }
-
       // QR local on heads
       // TODO check if some head tiles are not used and limit (or set to zero as a starting point)
-      matrix::SubPanelView panel_view(dist_heads, {0, 0}, band_size);
-      red2band::local::computePanelReflectors(panel_heads, mat_taus_2nd, 0, panel_view);
-
-      for (auto ij : panel_heads.iteratorLocal()) {
-        const auto& tile = pika::this_thread::experimental::sync_wait(panel_heads.read(ij)).get();
-        std::cout << "heads_u(" << ij << ") = ";
-        print(format::numpy{}, tile);
+      {
+        matrix::SubPanelView panel_heads_view(dist_heads, {0, 0}, band_size);
+        red2band::local::computePanelReflectors(panel_heads, mat_taus_2nd, 0, panel_heads_view);
       }
 
       // TODO copy back data
       // - just head of heads upper to mat_a
       // - reflectors to hh_2nd
 
-      const GlobalTileIndex ij_hoh(i_begin_head, j);
+      const GlobalTileIndex ij_hoh(panel_view.offset().row(), j);
       if (rank.row() == dist.template rank_global_tile<Coord::Row>(ij_hoh.row()))
         ex::start_detached(ex::when_all(panel_heads.read({0, 0}), mat_a.readwrite(ij_hoh)) |
                            di::transform(di::Policy<B>(), [](const auto& hoh, auto&& hoh_a) {
                              common::internal::SingleThreadedBlasScope single;
                              lapack::lacpy(blas::Uplo::Upper, hoh.size().rows(), hoh.size().cols(),
                                            hoh.ptr(), hoh.ld(), hoh_a.ptr(), hoh_a.ld());
-                             // std::cout << "hoh_a = ";
-                             // print(format::numpy{}, hoh_a);
                            }));
 
-      for (SizeType i_head = i_begin_head, idx = 0; i_head < i_end_head; ++i_head, ++idx) {
-        const comm::IndexT_MPI rank_owner = dist.template rank_global_tile<Coord::Row>(i_head);
+      for (int idx_qr_head = 0; idx_qr_head < n_qr_heads; ++idx_qr_head) {
+        const SizeType i_head = panel_view.offset().row() + idx_qr_head;
+        const comm::Index2D rank_head(dist.template rank_global_tile<Coord::Row>(i_head), rank_panel);
+
+        if (rank.row() == rank_head.row()) {
+          const LocalTileIndex idx_panel_head(idx_qr_head, 0);
+          const GlobalTileIndex ij_head(i_head, j);
 
-        if (rank.row() == rank_owner) {
-          const LocalTileIndex idx_panel_head(idx, 0);
-          const GlobalTileIndex ij_hh_2nd(0, j);
-          const bool is_hoh = (idx == 0);
+          const bool is_hoh = (i_head == panel_view.offset().row());
 
           auto sender_heads =
-              ex::when_all(panel_heads.read(idx_panel_head), mat_hh_2nd.readwrite(ij_hh_2nd));
+              ex::when_all(panel_heads.read(idx_panel_head), mat_hh_2nd.readwrite(ij_head));
 
           if (is_hoh)
             ex::start_detached(std::move(sender_heads) |

From 382bee68da48bc6729905b56918eb0f2cf59debb Mon Sep 17 00:00:00 2001
From: Alberto Invernizzi <invernizzi@cscs.ch>
Date: Thu, 18 Jul 2024 15:08:06 +0200
Subject: [PATCH 08/30] implementation 2nd pass update trailing + fixes

---
 .../eigensolver/reduction_to_band/ca-impl.h   | 398 ++++++++++++++++--
 .../eigensolver/reduction_to_band/common.h    |   6 +-
 include/dlaf/factorization/qr/t_factor_impl.h |   3 +-
 3 files changed, 356 insertions(+), 51 deletions(-)

diff --git a/include/dlaf/eigensolver/reduction_to_band/ca-impl.h b/include/dlaf/eigensolver/reduction_to_band/ca-impl.h
index d3f5e12ce5..53339fb072 100644
--- a/include/dlaf/eigensolver/reduction_to_band/ca-impl.h
+++ b/include/dlaf/eigensolver/reduction_to_band/ca-impl.h
@@ -219,6 +219,175 @@ void her2kUpdateTrailingMatrix(comm::Index2D rank_qr, const matrix::SubMatrixVie
   }
 }
 
+template <Backend B, Device D, class T>
+void hemm2nd(comm::IndexT_MPI rank_panel, matrix::Panel<Coord::Col, T, D>& W1,
+             matrix::Panel<Coord::Row, T, D, matrix::StoreTransposed::Yes>& W1T,
+             const matrix::SubMatrixView& at_view, const SizeType j_end, matrix::Matrix<const T, D>& A,
+             matrix::Panel<Coord::Col, const T, D>& W0,
+             matrix::Panel<Coord::Row, T, D, matrix::StoreTransposed::Yes>& W0T,
+             comm::CommunicatorPipeline<comm::CommunicatorType::Row>& mpi_row_chain,
+             comm::CommunicatorPipeline<comm::CommunicatorType::Col>& mpi_col_chain) {
+  namespace ex = pika::execution::experimental;
+
+  using red2band::hemmDiag;
+  using red2band::hemmOffDiag;
+
+  using pika::execution::thread_priority;
+
+  const auto dist = A.distribution();
+  const auto rank = dist.rankIndex();
+
+  // Note:
+  // They have to be set to zero, because all tiles are going to be reduced, and some tiles may not get
+  // "initialized" during computation, so they should not contribute with any spurious value to final result.
+  matrix::util::set0<B>(thread_priority::high, W1);
+  matrix::util::set0<B>(thread_priority::high, W1T);
+
+  const LocalTileIndex at_offset = at_view.begin();
+
+  const SizeType jR_end_lc = dist.template next_local_tile_from_global_tile<Coord::Col>(j_end);
+  for (SizeType i_lc = at_offset.row(); i_lc < dist.localNrTiles().rows(); ++i_lc) {
+    const auto j_end_lc =
+        std::min(jR_end_lc, dist.template next_local_tile_from_global_tile<Coord::Col>(
+                                dist.template global_tile_from_local_tile<Coord::Row>(i_lc) + 1));
+    for (SizeType j_lc = at_offset.col(); j_lc < j_end_lc; ++j_lc) {
+      const LocalTileIndex ij_lc(i_lc, j_lc);
+      const GlobalTileIndex ij = dist.global_tile_index(ij_lc);
+      std::cout << "hemm2nd @ " << ij << "\n";
+
+      // skip upper
+      if (ij.row() < ij.col()) {
+        std::cout << "hemm2nd-skipping " << ij << "\n";
+        continue;
+      }
+
+      const bool is_diag = (ij.row() == ij.col());
+
+      if (is_diag) {
+        std::cout << "hemm2nd-diag " << ij << "\n";
+        hemmDiag<B>(thread_priority::high, A.read(ij_lc), W0.read(ij_lc), W1.readwrite(ij_lc));
+      }
+      else {
+        std::cout << "hemm2nd-odL " << ij << "\n";
+        // Lower
+        hemmOffDiag<B>(thread_priority::high, blas::Op::NoTrans, A.read(ij_lc), W0T.read(ij_lc),
+                       W1.readwrite(ij_lc));
+
+        // Upper
+        const GlobalTileIndex ijU = transposed(ij);
+
+        // Note: if it is out of the "sub-matrix"
+        if (ijU.col() >= j_end)
+          continue;
+
+        std::cout << "hemm2nd-odU " << ij << "\n";
+
+        const comm::IndexT_MPI owner_row = dist.template rank_global_tile<Coord::Row>(ijU.row());
+        const SizeType iU_lc = dist.template local_tile_from_global_tile<Coord::Row>(ij.col());
+        const LocalTileIndex i_w1_lc(iU_lc, 0);
+        const LocalTileIndex i_w1t_lc(0, ij_lc.col());
+        auto tile_w1 = (rank.row() == owner_row) ? W1.readwrite(i_w1_lc) : W1T.readwrite(i_w1t_lc);
+
+        hemmOffDiag<B>(thread_priority::high, blas::Op::ConjTrans, A.read(ij_lc), W0.read(ij_lc),
+                       std::move(tile_w1));
+      }
+    }
+  }
+
+  // Note:
+  // At this point, partial results of W1 are available in the panels, and they have to be reduced,
+  // both row-wise and col-wise. The final W1 result will be available just on Ai panel column.
+
+  // Note:
+  // The first step in reducing partial results distributed over W1 and W1T, it is to reduce the row
+  // panel W1T col-wise, by collecting all W1T results on the rank which can "mirror" the result on its
+  // rows (i.e. diagonal). So, for each tile of the row panel, select who is the "diagonal" rank that can
+  // mirror and reduce on it.
+  if (mpi_col_chain.size() > 1) {
+    for (const auto& i_wt_lc : W1T.iteratorLocal()) {
+      const auto i_diag = dist.template global_tile_from_local_tile<Coord::Col>(i_wt_lc.col());
+      const auto rank_owner_row = dist.template rank_global_tile<Coord::Row>(i_diag);
+
+      if (rank_owner_row == rank.row()) {
+        // Note:
+        // Since it is the owner, it has to perform the "mirroring" of the results from columns to
+        // rows.
+        // Moreover, it reduces in place because the owner of the diagonal stores the partial result
+        // directly in W1 (without using W1T)
+        const auto i_w1_lc = dist.template local_tile_from_global_tile<Coord::Row>(i_diag);
+        ex::start_detached(comm::schedule_reduce_recv_in_place(mpi_col_chain.exclusive(), MPI_SUM,
+                                                               W1.readwrite({i_w1_lc, 0})));
+      }
+      else {
+        ex::start_detached(comm::schedule_reduce_send(mpi_col_chain.exclusive(), rank_owner_row, MPI_SUM,
+                                                      W1T.read(i_wt_lc)));
+      }
+    }
+  }
+
+  // Note:
+  // At this point partial results are all collected in X (Xt has been embedded in previous step),
+  // so the last step needed is to reduce these last partial results in the final results.
+  if (mpi_row_chain.size() > 1) {
+    for (const auto& i_w1_lc : W1.iteratorLocal()) {
+      if (rank_panel == rank.col())
+        ex::start_detached(comm::schedule_reduce_recv_in_place(mpi_row_chain.exclusive(), MPI_SUM,
+                                                               W1.readwrite(i_w1_lc)));
+      else
+        ex::start_detached(comm::schedule_reduce_send(mpi_row_chain.exclusive(), rank_panel, MPI_SUM,
+                                                      W1.read(i_w1_lc)));
+    }
+  }
+}
+
+template <Backend B, Device D, class T>
+void her2k_2nd(const SizeType j_end, const matrix::SubMatrixView& at_view, matrix::Matrix<T, D>& a,
+               matrix::Panel<Coord::Col, const T, D>& W1,
+               matrix::Panel<Coord::Row, T, D, matrix::StoreTransposed::Yes>& W1T,
+               matrix::Panel<Coord::Col, const T, D>& V,
+               matrix::Panel<Coord::Row, T, D, matrix::StoreTransposed::Yes>& VT) {
+  static_assert(std::is_signed_v<BaseType<T>>, "alpha in computations requires to be -1");
+
+  using pika::execution::thread_priority;
+  using red2band::her2kDiag;
+  using red2band::her2kOffDiag;
+
+  const auto dist = a.distribution();
+
+  const LocalTileIndex at_offset = at_view.begin();
+
+  const SizeType jR_end_lc = dist.template next_local_tile_from_global_tile<Coord::Col>(j_end);
+  for (SizeType i_lc = at_offset.row(); i_lc < dist.localNrTiles().rows(); ++i_lc) {
+    const auto j_end_lc =
+        std::min(jR_end_lc, dist.template next_local_tile_from_global_tile<Coord::Col>(
+                                dist.template global_tile_from_local_tile<Coord::Row>(i_lc) + 1));
+    for (SizeType j_lc = at_offset.col(); j_lc < j_end_lc; ++j_lc) {
+      const LocalTileIndex ij_local{i_lc, j_lc};
+      const GlobalTileIndex ij = dist.globalTileIndex(ij_local);
+
+      const bool is_diagonal_tile = (ij.row() == ij.col());
+
+      auto getSubA = [&a, &at_view, ij_local]() {
+        return splitTile(a.readwrite(ij_local), at_view(ij_local));
+      };
+
+      // The first column of the trailing matrix (except for the very first global tile) has to be
+      // updated first, in order to unlock the next iteration as soon as possible.
+      const auto priority = (j_lc == at_offset.col()) ? thread_priority::high : thread_priority::normal;
+
+      if (is_diagonal_tile) {
+        her2kDiag<B>(priority, V.read(ij_local), W1.read(ij_local), getSubA());
+      }
+      else {
+        // A -= X . V*
+        her2kOffDiag<B>(priority, W1.read(ij_local), VT.read(ij_local), getSubA());
+
+        // A -= V . X*
+        her2kOffDiag<B>(priority, V.read(ij_local), W1T.read(ij_local), getSubA());
+      }
+    }
+  }
+}
 }
 
 // Distributed implementation of reduction to band
@@ -301,7 +470,7 @@ CARed2BandResult<T, D> CAReductionToBand<B, D, T>::call(comm::CommunicatorGrid&
   for (SizeType j = 0; j < ntiles; ++j) {
     const SizeType i = j + 1;
 
-    const SizeType nrefls_step = dist_taus.tile_size_of({0, j}).rows();
+    const SizeType nrefls_step = dist_taus.tile_size_of({j, 0}).rows();
 
     // panel
     const GlobalTileIndex panel_offset(i, j);
@@ -331,7 +500,6 @@ CARed2BandResult<T, D> CAReductionToBand<B, D, T>::call(comm::CommunicatorGrid&
     // TRAILING 1st pass
     if (at_offset_el.isIn(mat_a.size())) {
       auto& ws_V = panels_v.nextResource();
-      auto& ws_VT = panels_vt.nextResource();
 
       ws_V.setRangeStart(at_offset);
       ws_V.setWidth(nrefls_step);
@@ -343,13 +511,14 @@ CARed2BandResult<T, D> CAReductionToBand<B, D, T>::call(comm::CommunicatorGrid&
         using factorization::internal::computeTFactor;
         using red2band::local::setupReflectorPanelV;
 
-        constexpr bool has_head = true;  // Note: they are local, so every rank has its own head
+        const bool has_head = !panel_view.iteratorLocal().empty();
         setupReflectorPanelV<B, D, T>(has_head, panel_view, nrefls_step, ws_V, mat_a, !is_full_band);
 
         const GlobalTileIndex j_tau(j, 0);
         computeTFactor<B>(ws_V, mat_taus_1st.read(j_tau), ws_T.readwrite(zero_lc));
       }
 
+      auto& ws_VT = panels_vt.nextResource();
       ws_VT.setRangeStart(at_offset);
       ws_VT.setHeight(nrefls_step);
 
@@ -414,7 +583,7 @@ CARed2BandResult<T, D> CAReductionToBand<B, D, T>::call(comm::CommunicatorGrid&
 
           for (const auto& idx : ws_W1.iteratorLocal())
             ex::start_detached(ex::when_all(ws_W1.read(idx), ws_W3.readwrite(idx)) |
-                               dlaf::matrix::copy(di::Policy<Backend::MC>{}));
+                               matrix::copy(di::Policy<Backend::MC>{}));
 
           // W1 -= 0.5 V W2
           red2band::local::gemmUpdateX<B, D>(ws_W3, ws_W2, ws_V);
@@ -496,28 +665,39 @@ CARed2BandResult<T, D> CAReductionToBand<B, D, T>::call(comm::CommunicatorGrid&
       ws_V.reset();
     }
 
+    // ===== 2nd pass
+
     // PANEL: just ranks in the current column
     // QR local with just heads (HH reflectors have to be computed elsewhere, 1st-pass in-place)
+    auto&& panel_heads = panels_heads.nextResource();
+    panel_heads.setRangeEnd({n_qr_heads, 0});
+    panel_heads.setWidth(nrefls_step);
+
+    const matrix::Distribution dist_heads_current(LocalElementSize(n_qr_heads * dist.block_size().rows(),
+                                                                   dist.block_size().cols()),
+                                                  dist.block_size());
+    const matrix::SubPanelView panel_heads_view(dist_heads_current, {0, 0}, nrefls_step);
+
     if (rank_panel == rank.col()) {
-      auto&& panel_heads = panels_heads.nextResource();
-      panel_heads.setRangeEnd({n_qr_heads, 0});
-      panel_heads.setWidth(nrefls_step);
+      const comm::IndexT_MPI rank_hoh = dist.template rank_global_tile<Coord::Row>(panel_offset.row());
 
       for (int idx_head = 0; idx_head < n_qr_heads; ++idx_head) {
         using dlaf::comm::schedule_bcast_recv;
         using dlaf::comm::schedule_bcast_send;
 
-        const SizeType i_head = panel_view.offset().row() + idx_head;
-        const comm::Index2D rank_head(dist.template rank_global_tile<Coord::Row>(i_head), rank_panel);
-
-        const GlobalTileIndex ij_head(i_head, j);
         const LocalTileIndex idx_panel_head(idx_head, 0);
 
-        if (rank.row() == rank_head.row()) {
+        const GlobalTileIndex ij_head(panel_view.offset().row() + idx_head, j);
+        const comm::IndexT_MPI rank_head = dist.template rank_global_tile<Coord::Row>(ij_head.row());
+
+        if (rank.row() == rank_head) {
           // copy - set - send
           ex::start_detached(ex::when_all(mat_a.read(ij_head), panel_heads.readwrite(idx_panel_head)) |
                              di::transform(di::Policy<B>(), [=](const auto& head_in, auto&& head) {
-                               matrix::internal::copy(head_in, head);
+                               // TODO FIXME change copy and if possible just lower
+                               // matrix::internal::copy(head_in, head);
+                               lapack::lacpy(blas::Uplo::General, head.size().rows(), head.size().cols(),
+                                             head_in.ptr(), head_in.ld(), head.ptr(), head.ld());
                                lapack::laset(blas::Uplo::Lower, head.size().rows() - 1,
                                              head.size().cols(), T(0), T(0), head.ptr({1, 0}),
                                              head.ld());
@@ -527,45 +707,40 @@ CARed2BandResult<T, D> CAReductionToBand<B, D, T>::call(comm::CommunicatorGrid&
         }
         else {
           // receive
-          ex::start_detached(schedule_bcast_recv(mpi_col_chain.exclusive(), rank_head.row(),
+          ex::start_detached(schedule_bcast_recv(mpi_col_chain.exclusive(), rank_head,
                                                  panel_heads.readwrite(idx_panel_head)));
         }
       }
 
       // QR local on heads
-      // TODO check if some head tiles are not used and limit (or set to zero as a starting point)
-      {
-        matrix::SubPanelView panel_heads_view(dist_heads, {0, 0}, band_size);
-        red2band::local::computePanelReflectors(panel_heads, mat_taus_2nd, 0, panel_heads_view);
-      }
-
-      // TODO copy back data
-      // - just head of heads upper to mat_a
-      // - reflectors to hh_2nd
+      red2band::local::computePanelReflectors(panel_heads, mat_taus_2nd, j, panel_heads_view);
 
-      const GlobalTileIndex ij_hoh(panel_view.offset().row(), j);
-      if (rank.row() == dist.template rank_global_tile<Coord::Row>(ij_hoh.row()))
-        ex::start_detached(ex::when_all(panel_heads.read({0, 0}), mat_a.readwrite(ij_hoh)) |
-                           di::transform(di::Policy<B>(), [](const auto& hoh, auto&& hoh_a) {
-                             common::internal::SingleThreadedBlasScope single;
-                             lapack::lacpy(blas::Uplo::Upper, hoh.size().rows(), hoh.size().cols(),
-                                           hoh.ptr(), hoh.ld(), hoh_a.ptr(), hoh_a.ld());
-                           }));
-
-      for (int idx_qr_head = 0; idx_qr_head < n_qr_heads; ++idx_qr_head) {
-        const SizeType i_head = panel_view.offset().row() + idx_qr_head;
-        const comm::Index2D rank_head(dist.template rank_global_tile<Coord::Row>(i_head), rank_panel);
-
-        if (rank.row() == rank_head.row()) {
-          const LocalTileIndex idx_panel_head(idx_qr_head, 0);
-          const GlobalTileIndex ij_head(i_head, j);
+      // copy back data
+      {
+        // - just head of heads upper to mat_a
+        // - reflectors to hh_2nd
+        const GlobalTileIndex ij_hoh(panel_view.offset().row(), j);
+        if (rank.row() == dist.template rank_global_tile<Coord::Row>(ij_hoh.row()))
+          ex::start_detached(ex::when_all(panel_heads.read({0, 0}), mat_a.readwrite(ij_hoh)) |
+                             di::transform(di::Policy<B>(), [](const auto& hoh, auto&& hoh_a) {
+                               common::internal::SingleThreadedBlasScope single;
+                               lapack::lacpy(blas::Uplo::Upper, hoh.size().rows(), hoh.size().cols(),
+                                             hoh.ptr(), hoh.ld(), hoh_a.ptr(), hoh_a.ld());
+                             }));
 
-          const bool is_hoh = (i_head == panel_view.offset().row());
+        // Note: not all ranks might have an head
+        if (!panel_view.iteratorLocal().empty()) {
+          const auto i_head_lc =
+              dist.template next_local_tile_from_global_tile<Coord::Row>(panel_view.offset().row());
+          const auto i_head = dist.template global_tile_from_local_tile<Coord::Row>(i_head_lc);
+          const auto idx_head = i_head - panel_view.offset().row();
+          const LocalTileIndex idx_panel_head(idx_head, 0);
+          const GlobalTileIndex ij_head(0, j);
 
           auto sender_heads =
               ex::when_all(panel_heads.read(idx_panel_head), mat_hh_2nd.readwrite(ij_head));
 
-          if (is_hoh)
+          if (rank.row() == rank_hoh) {
             ex::start_detached(std::move(sender_heads) |
                                di::transform(di::Policy<B>(), [](const auto& head, auto&& head_a) {
                                  common::internal::SingleThreadedBlasScope single;
@@ -576,7 +751,8 @@ CARed2BandResult<T, D> CAReductionToBand<B, D, T>::call(comm::CommunicatorGrid&
                                                head.size().cols() - 1, head.ptr({1, 0}), head.ld(),
                                                head_a.ptr({1, 0}), head_a.ld());
                                }));
-          else
+          }
+          else {
             ex::start_detached(std::move(sender_heads) |
                                di::transform(di::Policy<B>(), [](const auto& head, auto&& head_a) {
                                  common::internal::SingleThreadedBlasScope single;
@@ -584,16 +760,146 @@ CARed2BandResult<T, D> CAReductionToBand<B, D, T>::call(comm::CommunicatorGrid&
                                                head.size().cols(), head.ptr(), head.ld(), head_a.ptr(),
                                                head_a.ld());
                                }));
+          }
         }
       }
-
-      panel_heads.reset();
     }
 
     // TRAILING 2nd pass
-    {}
+    {
+      const GlobalTileIndex at_end_L(at_offset.row() + n_qr_heads, 0);
+      const GlobalTileIndex at_end_R(0, at_offset.col() + n_qr_heads);
+
+      const LocalTileIndex zero_lc(0, 0);
+      matrix::Matrix<T, D> ws_T({nrefls_step, nrefls_step}, dist.block_size());
+
+      auto& ws_V = panels_v.nextResource();
+      ws_V.setRange(at_offset, at_end_L);
+      ws_V.setWidth(nrefls_step);
+
+      if (rank_panel == rank.col()) {
+        // setup reflector panel
+        const GlobalTileIndex ij_head(0, j);
+        const LocalTileIndex ij_vhh_lc(ws_V.rangeStartLocal(), 0);
+
+        if (!ws_V.iteratorLocal().empty()) {
+          ex::start_detached(ex::when_all(mat_hh_2nd.read(ij_head), ws_V.readwrite(ij_vhh_lc)) |
+                             di::transform(di::Policy<B>(), [=](const auto& head_in, auto&& head) {
+                               lapack::lacpy(blas::Uplo::General, head.size().rows(), head.size().cols(),
+                                             head_in.ptr(), head_in.ld(), head.ptr(), head.ld());
+                             }));
+        }
+
+        for (const auto& i_lc : ws_V.iteratorLocal()) {
+          std::ostringstream ss;
+          ss << "V2nd(" << dist.global_tile_index(i_lc) << ")";
+          print_sync(ss.str(), ws_V.read(i_lc));
+        }
+
+        using factorization::internal::computeTFactor;
+        const GlobalTileIndex j_tau(j, 0);
+        computeTFactor<B>(ws_V, mat_taus_2nd.read(j_tau), ws_T.readwrite(zero_lc), mpi_col_chain);
+
+        print_sync("T2nd", ws_T.read(zero_lc));
+      }
+
+      auto& ws_VT = panels_vt.nextResource();
+      ws_VT.setRange(at_offset, at_end_R);
+      ws_VT.setHeight(nrefls_step);
+
+      comm::broadcast(rank_panel, ws_V, ws_VT, mpi_row_chain, mpi_col_chain);
+
+      for (const auto& i_lc : ws_VT.iteratorLocal()) {
+        std::ostringstream ss;
+        ss << "VT2nd(" << dist.global_tile_index(i_lc) << ")";
+        print_sync(ss.str(), ws_VT.read(i_lc));
+      }
+
+      // Note:
+      // Differently from 1st pass, where transformations are independent one from the other,
+      // this 2nd pass is a single QR transformation that has to be applied from L and R.
+
+      // W0 = V T
+      auto& ws_W0 = panels_w0.nextResource();
+      ws_W0.setRange(at_offset, at_end_L);
+      ws_W0.setWidth(nrefls_step);
+
+      if (rank.col() == rank_panel)
+        red2band::local::trmmComputeW<B, D>(ws_W0, ws_V, ws_T.read(zero_lc));
+
+      // distribute W0 -> W0T
+      auto& ws_W0T = panels_w0t.nextResource();
+      ws_W0T.setRange(at_offset, at_end_R);
+      ws_W0T.setHeight(nrefls_step);
+
+      comm::broadcast(rank_panel, ws_W0, ws_W0T, mpi_row_chain, mpi_col_chain);
+
+      for (const auto& idx : ws_W0.iteratorLocal()) {
+        std::ostringstream ss;
+        ss << "W0(" << dist.global_tile_index(idx) << ")";
+        print_sync(ss.str(), ws_W0.read(idx));
+      }
+
+      for (const auto& idx : ws_W0T.iteratorLocal()) {
+        std::ostringstream ss;
+        ss << "W0T(" << dist.global_tile_index(idx) << ")";
+        print_sync(ss.str(), ws_W0T.read(idx));
+      }
+
+      // W1 = A W0
+      auto& ws_W1 = panels_w1.nextResource();
+      ws_W1.setRange(at_offset, at_end_L);
+      ws_W1.setWidth(nrefls_step);
+
+      auto& ws_W1T = panels_w1t.nextResource();
+      ws_W1T.setRange(at_offset, at_end_R);
+      ws_W1T.setHeight(nrefls_step);
+
+      ca_red2band::hemm2nd<B, D>(rank_panel, ws_W1, ws_W1T, at_view, at_end_R.col(), mat_a, ws_W0,
+                                 ws_W0T, mpi_row_chain, mpi_col_chain);
+
+      for (const auto& idx : ws_W1.iteratorLocal()) {
+        std::ostringstream ss;
+        ss << "W1(" << dist.global_tile_index(idx) << ")";
+        print_sync(ss.str(), ws_W1.read(idx));
+      }
+
+      // W1 = W1 - 0.5 V W0* W1
+      if (rank.col() == rank_panel) {
+        matrix::Matrix<T, D> ws_W2 = std::move(ws_T);
+
+        // W2 = W0T W1
+        red2band::local::gemmComputeW2<B, D>(ws_W2, ws_W0, ws_W1);
+        if (mpi_col_chain.size() > 1) {
+          ex::start_detached(comm::schedule_all_reduce_in_place(mpi_col_chain.exclusive(), MPI_SUM,
+                                                                ws_W2.readwrite(zero_lc)));
+        }
+
+        print_sync("W2", ws_W2.read(zero_lc));
+
+        // W1 = W1 - 0.5 V W2
+        red2band::local::gemmUpdateX<B, D>(ws_W1, ws_W2, ws_V);
+      }
+
+      // distribute W1 -> W1T
+      ws_W1T.reset();
+      ws_W1T.setRange(at_offset, at_end_R);
+      ws_W1T.setHeight(nrefls_step);
+
+      comm::broadcast(rank_panel, ws_W1, ws_W1T, mpi_row_chain, mpi_col_chain);
+
+      // A -= W1 VT + V W1T
+      ca_red2band::her2k_2nd<B>(at_end_R.col(), at_view, mat_a, ws_W1, ws_W1T, ws_V, ws_VT);
+
+      ws_W1T.reset();
+      ws_W1.reset();
+      ws_W0T.reset();
+      ws_W0.reset();
+      ws_VT.reset();
+      ws_V.reset();
+    }
 
-    break;
+    panel_heads.reset();
   }
 
   return {std::move(mat_taus_1st), std::move(mat_taus_2nd), std::move(mat_hh_2nd)};
diff --git a/include/dlaf/eigensolver/reduction_to_band/common.h b/include/dlaf/eigensolver/reduction_to_band/common.h
index 8a6314bea1..73588cf284 100644
--- a/include/dlaf/eigensolver/reduction_to_band/common.h
+++ b/include/dlaf/eigensolver/reduction_to_band/common.h
@@ -281,9 +281,9 @@ void computePanelReflectors(MatrixLikeA& mat_a, MatrixLikeTaus& mat_taus, const
   const auto panel_range = panel_view.iteratorLocal();
   const std::size_t panel_ntiles = to_sizet(std::distance(panel_range.begin(), panel_range.end()));
 
-  // if (panel_ntiles == 0) {
-  //   return;
-  // }
+  if (panel_ntiles == 0) {
+    return;
+  }
 
   panel_tiles.reserve(panel_ntiles);
   for (const auto& i : panel_range) {
diff --git a/include/dlaf/factorization/qr/t_factor_impl.h b/include/dlaf/factorization/qr/t_factor_impl.h
index 87ed14e845..b297175178 100644
--- a/include/dlaf/factorization/qr/t_factor_impl.h
+++ b/include/dlaf/factorization/qr/t_factor_impl.h
@@ -105,8 +105,6 @@ struct Helpers<Backend::MC, Device::CPU, T> {
 
   template <typename TSender>
   static auto trmvUpdateColumn(TSender&& tile_t) noexcept {
-    namespace ex = pika::execution::experimental;
-
     // Update each column (in order) t = T . t
     // remember that T is upper triangular, so it is possible to use TRMV
     auto trmv_func = [](matrix::Tile<T, Device::CPU>&& tile_t) {
@@ -121,6 +119,7 @@ struct Helpers<Backend::MC, Device::CPU, T> {
       // TODO: Why return if the tile is unused?
       return std::move(tile_t);
     };
+
     return dlaf::internal::transform(
         dlaf::internal::Policy<Backend::MC>(pika::execution::thread_priority::high),
         std::move(trmv_func), std::forward<TSender>(tile_t));

From e4647f01ce097c08020a5dafe89fe99ac7e8badc Mon Sep 17 00:00:00 2001
From: Alberto Invernizzi <invernizzi@cscs.ch>
Date: Thu, 18 Jul 2024 16:21:52 +0200
Subject: [PATCH 09/30] switch to local tfactor for 2nd pass + minor changes

---
 .../dlaf/eigensolver/reduction_to_band/ca-impl.h    | 13 ++++---------
 1 file changed, 4 insertions(+), 9 deletions(-)

diff --git a/include/dlaf/eigensolver/reduction_to_band/ca-impl.h b/include/dlaf/eigensolver/reduction_to_band/ca-impl.h
index 53339fb072..d2e3fabca4 100644
--- a/include/dlaf/eigensolver/reduction_to_band/ca-impl.h
+++ b/include/dlaf/eigensolver/reduction_to_band/ca-impl.h
@@ -253,22 +253,18 @@ void hemm2nd(comm::IndexT_MPI rank_panel, matrix::Panel<Coord::Col, T, D>& W1,
     for (SizeType j_lc = at_offset.col(); j_lc < j_end_lc; ++j_lc) {
       const LocalTileIndex ij_lc(i_lc, j_lc);
       const GlobalTileIndex ij = dist.global_tile_index(ij_lc);
-      std::cout << "hemm2nd @ " << ij << "\n";
 
       // skip upper
       if (ij.row() < ij.col()) {
-        std::cout << "hemm2nd-skipping " << ij << "\n";
         continue;
       }
 
       const bool is_diag = (ij.row() == ij.col());
 
       if (is_diag) {
-        std::cout << "hemm2nd-diag " << ij << "\n";
         hemmDiag<B>(thread_priority::high, A.read(ij_lc), W0.read(ij_lc), W1.readwrite(ij_lc));
       }
       else {
-        std::cout << "hemm2nd-odL " << ij << "\n";
         // Lower
         hemmOffDiag<B>(thread_priority::high, blas::Op::NoTrans, A.read(ij_lc), W0T.read(ij_lc),
                        W1.readwrite(ij_lc));
@@ -280,8 +276,6 @@ void hemm2nd(comm::IndexT_MPI rank_panel, matrix::Panel<Coord::Col, T, D>& W1,
         if (ijU.col() >= j_end)
           continue;
 
-        std::cout << "hemm2nd-odU " << ij << "\n";
-
         const comm::IndexT_MPI owner_row = dist.template rank_global_tile<Coord::Row>(ijU.row());
         const SizeType iU_lc = dist.template local_tile_from_global_tile<Coord::Row>(ij.col());
         const LocalTileIndex i_w1_lc(iU_lc, 0);
@@ -678,6 +672,7 @@ CARed2BandResult<T, D> CAReductionToBand<B, D, T>::call(comm::CommunicatorGrid&
                                                   dist.block_size());
     const matrix::SubPanelView panel_heads_view(dist_heads_current, {0, 0}, nrefls_step);
 
+    const bool rank_has_head_row = !panel_view.iteratorLocal().empty();
     if (rank_panel == rank.col()) {
       const comm::IndexT_MPI rank_hoh = dist.template rank_global_tile<Coord::Row>(panel_offset.row());
 
@@ -729,7 +724,7 @@ CARed2BandResult<T, D> CAReductionToBand<B, D, T>::call(comm::CommunicatorGrid&
                              }));
 
         // Note: not all ranks might have an head
-        if (!panel_view.iteratorLocal().empty()) {
+        if (rank_has_head_row) {
           const auto i_head_lc =
               dist.template next_local_tile_from_global_tile<Coord::Row>(panel_view.offset().row());
           const auto i_head = dist.template global_tile_from_local_tile<Coord::Row>(i_head_lc);
@@ -782,7 +777,7 @@ CARed2BandResult<T, D> CAReductionToBand<B, D, T>::call(comm::CommunicatorGrid&
         const GlobalTileIndex ij_head(0, j);
         const LocalTileIndex ij_vhh_lc(ws_V.rangeStartLocal(), 0);
 
-        if (!ws_V.iteratorLocal().empty()) {
+        if (rank_has_head_row) {
           ex::start_detached(ex::when_all(mat_hh_2nd.read(ij_head), ws_V.readwrite(ij_vhh_lc)) |
                              di::transform(di::Policy<B>(), [=](const auto& head_in, auto&& head) {
                                lapack::lacpy(blas::Uplo::General, head.size().rows(), head.size().cols(),
@@ -798,7 +793,7 @@ CARed2BandResult<T, D> CAReductionToBand<B, D, T>::call(comm::CommunicatorGrid&
 
         using factorization::internal::computeTFactor;
         const GlobalTileIndex j_tau(j, 0);
-        computeTFactor<B>(ws_V, mat_taus_2nd.read(j_tau), ws_T.readwrite(zero_lc), mpi_col_chain);
+        computeTFactor<B>(panel_heads, mat_taus_2nd.read(j_tau), ws_T.readwrite(zero_lc));
 
         print_sync("T2nd", ws_T.read(zero_lc));
       }

From 9c6a4173c65d7669401d61d55168d494f568496a Mon Sep 17 00:00:00 2001
From: Alberto Invernizzi <invernizzi@cscs.ch>
Date: Thu, 18 Jul 2024 18:13:34 +0200
Subject: [PATCH 10/30] change mat_hh_2nd distribution to an equivalent one

it is easier to manage on the test side and it does not change much on
the algorithmic side
---
 .../dlaf/eigensolver/reduction_to_band/ca-impl.h   | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/include/dlaf/eigensolver/reduction_to_band/ca-impl.h b/include/dlaf/eigensolver/reduction_to_band/ca-impl.h
index d2e3fabca4..fd2be689e6 100644
--- a/include/dlaf/eigensolver/reduction_to_band/ca-impl.h
+++ b/include/dlaf/eigensolver/reduction_to_band/ca-impl.h
@@ -418,11 +418,10 @@ CARed2BandResult<T, D> CAReductionToBand<B, D, T>::call(comm::CommunicatorGrid&
   Matrix<T, Device::CPU> mat_taus_2nd(dist_taus);
 
   // Note:
-  // row-panel distributed over columns, but replicated over rows
-  const matrix::Distribution dist_hh_2nd(GlobalElementSize(dist.block_size().rows(), dist.size().cols()),
-                                         dist.block_size(), comm::Size2D(1, dist.grid_size().cols()),
-                                         comm::Index2D(0, rank.col()),
-                                         comm::Index2D(0, dist.source_rank_index().col()));
+  // Matrix distributed as the input one, but it has room for storing just one row per rank.
+  const matrix::Distribution dist_hh_2nd(
+      GlobalElementSize(dist.grid_size().rows() * dist.block_size().rows(), dist.size().cols()),
+      dist.block_size(), dist.grid_size(), rank, dist.source_rank_index());
   Matrix<T, D> mat_hh_2nd(dist_hh_2nd);
 
   if (nrefls == 0)
@@ -463,6 +462,7 @@ CARed2BandResult<T, D> CAReductionToBand<B, D, T>::call(comm::CommunicatorGrid&
 
   for (SizeType j = 0; j < ntiles; ++j) {
     const SizeType i = j + 1;
+    const SizeType j_lc = dist.template local_tile_from_global_tile<Coord::Col>(j);
 
     const SizeType nrefls_step = dist_taus.tile_size_of({j, 0}).rows();
 
@@ -730,7 +730,7 @@ CARed2BandResult<T, D> CAReductionToBand<B, D, T>::call(comm::CommunicatorGrid&
           const auto i_head = dist.template global_tile_from_local_tile<Coord::Row>(i_head_lc);
           const auto idx_head = i_head - panel_view.offset().row();
           const LocalTileIndex idx_panel_head(idx_head, 0);
-          const GlobalTileIndex ij_head(0, j);
+          const LocalTileIndex ij_head(0, j_lc);
 
           auto sender_heads =
               ex::when_all(panel_heads.read(idx_panel_head), mat_hh_2nd.readwrite(ij_head));
@@ -774,7 +774,7 @@ CARed2BandResult<T, D> CAReductionToBand<B, D, T>::call(comm::CommunicatorGrid&
 
       if (rank_panel == rank.col()) {
         // setup reflector panel
-        const GlobalTileIndex ij_head(0, j);
+        const LocalTileIndex ij_head(0, j_lc);
         const LocalTileIndex ij_vhh_lc(ws_V.rangeStartLocal(), 0);
 
         if (rank_has_head_row) {

From a76d9ebfbeb645e5871bb96dec7c56e76867ce4c Mon Sep 17 00:00:00 2001
From: Alberto Invernizzi <invernizzi@cscs.ch>
Date: Thu, 18 Jul 2024 18:45:02 +0200
Subject: [PATCH 11/30] adapt computePanelReflectors to deal with a tile for
 taus

---
 .../dlaf/eigensolver/reduction_to_band/ca-impl.h    | 10 +++++++---
 include/dlaf/eigensolver/reduction_to_band/common.h | 13 +++++++------
 include/dlaf/eigensolver/reduction_to_band/impl.h   |  8 ++++----
 3 files changed, 18 insertions(+), 13 deletions(-)

diff --git a/include/dlaf/eigensolver/reduction_to_band/ca-impl.h b/include/dlaf/eigensolver/reduction_to_band/ca-impl.h
index fd2be689e6..d6570b40e6 100644
--- a/include/dlaf/eigensolver/reduction_to_band/ca-impl.h
+++ b/include/dlaf/eigensolver/reduction_to_band/ca-impl.h
@@ -488,8 +488,10 @@ CARed2BandResult<T, D> CAReductionToBand<B, D, T>::call(comm::CommunicatorGrid&
 
     // PANEL: just ranks in the current column
     // QR local (HH reflectors stored in-place)
-    if (rank_panel == rank.col())
-      red2band::local::computePanelReflectors(mat_a, mat_taus_1st, panel_offset.col(), panel_view);
+    if (rank_panel == rank.col()) {
+      using red2band::local::computePanelReflectors;
+      computePanelReflectors(mat_a, mat_taus_1st.readwrite(LocalTileIndex(j_lc, 0)), panel_view);
+    }
 
     // TRAILING 1st pass
     if (at_offset_el.isIn(mat_a.size())) {
@@ -708,7 +710,9 @@ CARed2BandResult<T, D> CAReductionToBand<B, D, T>::call(comm::CommunicatorGrid&
       }
 
       // QR local on heads
-      red2band::local::computePanelReflectors(panel_heads, mat_taus_2nd, j, panel_heads_view);
+      using red2band::local::computePanelReflectors;
+      computePanelReflectors(panel_heads, mat_taus_2nd.readwrite(LocalTileIndex(j_lc, 0)),
+                             panel_heads_view);
 
       // copy back data
       {
diff --git a/include/dlaf/eigensolver/reduction_to_band/common.h b/include/dlaf/eigensolver/reduction_to_band/common.h
index 73588cf284..7506faf745 100644
--- a/include/dlaf/eigensolver/reduction_to_band/common.h
+++ b/include/dlaf/eigensolver/reduction_to_band/common.h
@@ -10,6 +10,7 @@
 
 #pragma once
 
+#include <type_traits>
 #include <vector>
 
 #include <pika/barrier.hpp>
@@ -268,11 +269,12 @@ T computeReflector(const std::vector<matrix::Tile<T, D>>& panel, SizeType j) {
   return tau;
 }
 
-template <class MatrixLikeA, class MatrixLikeTaus>
-void computePanelReflectors(MatrixLikeA& mat_a, MatrixLikeTaus& mat_taus, const SizeType j_sub,
+template <class T, Device D, class MatrixLikeA>
+void computePanelReflectors(MatrixLikeA& mat_a, matrix::ReadWriteTileSender<T, D> tile_tau,
                             const matrix::SubPanelView& panel_view) {
-  static Device constexpr D = MatrixLikeA::device;
-  using T = typename MatrixLikeA::ElementType;
+  static_assert(D == MatrixLikeA::device);
+  static_assert(std::is_same_v<T, typename MatrixLikeA::ElementType>);
+
   using pika::execution::thread_priority;
   namespace ex = pika::execution::experimental;
   namespace di = dlaf::internal;
@@ -295,8 +297,7 @@ void computePanelReflectors(MatrixLikeA& mat_a, MatrixLikeTaus& mat_taus, const
   ex::start_detached(
       ex::when_all(ex::just(std::make_unique<pika::barrier<>>(nthreads),
                             std::vector<common::internal::vector<T>>{}),  // w (internally required)
-                   mat_taus.readwrite(LocalTileIndex(j_sub, 0)),
-                   ex::when_all_vector(std::move(panel_tiles))) |
+                   std::move(tile_tau), ex::when_all_vector(std::move(panel_tiles))) |
       ex::transfer(di::getBackendScheduler<Backend::MC>(thread_priority::high)) |
       ex::bulk(nthreads, [nthreads, cols = panel_view.cols()](const std::size_t index, auto& barrier_ptr,
                                                               auto& w, auto& taus, auto& tiles) {
diff --git a/include/dlaf/eigensolver/reduction_to_band/impl.h b/include/dlaf/eigensolver/reduction_to_band/impl.h
index 7842a95254..3dbbdd7b9c 100644
--- a/include/dlaf/eigensolver/reduction_to_band/impl.h
+++ b/include/dlaf/eigensolver/reduction_to_band/impl.h
@@ -434,10 +434,10 @@ template <class T>
 struct ComputePanelHelper<Backend::MC, Device::CPU, T> {
   ComputePanelHelper(const std::size_t, matrix::Distribution) {}
 
-  void call(Matrix<T, Device::CPU>& mat_a, Matrix<T, Device::CPU>& mat_taus, const SizeType j_sub,
+  void call(Matrix<T, Device::CPU>& mat_a, matrix::ReadWriteTileSender<T, Device::CPU> tile_tau,
             const matrix::SubPanelView& panel_view) {
     using red2band::local::computePanelReflectors;
-    computePanelReflectors(mat_a, mat_taus, j_sub, panel_view);
+    computePanelReflectors(mat_a, std::move(tile_tau), panel_view);
   }
 
   template <Device D, class CommSender, class TriggerSender>
@@ -471,7 +471,7 @@ struct ComputePanelHelper<Backend::GPU, Device::GPU, T> {
     auto& v = panels_v.nextResource();
 
     copyToCPU(panel_view, mat_a, v);
-    computePanelReflectors(v, mat_taus, j_sub, panel_view);
+    computePanelReflectors(v, mat_taus.readwrite(GlobalTileIndex(j_sub, 0)), panel_view);
     copyFromCPU(panel_view, v, mat_a);
   }
 
@@ -614,7 +614,7 @@ Matrix<T, Device::CPU> ReductionToBand<B, D, T>::call(Matrix<T, D>& mat_a, const
       v.setWidth(nrefls_tile);
 
     // PANEL
-    compute_panel_helper.call(mat_a, mat_taus_retiled, j_sub, panel_view);
+    compute_panel_helper.call(mat_a, mat_taus_retiled.readwrite(GlobalTileIndex(j_sub, 0)), panel_view);
 
     // Note:
     // - has_reflector_head tells if this rank owns the first tile of the panel (being local, always true)

From a2d563892ada36c32919bd27dc3c6da31c16f9bb Mon Sep 17 00:00:00 2001
From: Alberto Invernizzi <invernizzi@cscs.ch>
Date: Fri, 19 Jul 2024 15:13:08 +0200
Subject: [PATCH 12/30] fix distribution for both taus_1st and taus_2nd

---
 .../eigensolver/reduction_to_band/ca-impl.h   | 37 ++++++++++---------
 1 file changed, 20 insertions(+), 17 deletions(-)

diff --git a/include/dlaf/eigensolver/reduction_to_band/ca-impl.h b/include/dlaf/eigensolver/reduction_to_band/ca-impl.h
index d6570b40e6..3c54d02de9 100644
--- a/include/dlaf/eigensolver/reduction_to_band/ca-impl.h
+++ b/include/dlaf/eigensolver/reduction_to_band/ca-impl.h
@@ -398,32 +398,36 @@ CARed2BandResult<T, D> CAReductionToBand<B, D, T>::call(comm::CommunicatorGrid&
   const auto& dist = mat_a.distribution();
   const comm::Index2D rank = dist.rank_index();
 
+  // Note:
+  // Reflector of size = 1 is not considered whatever T is (i.e. neither real nor complex)
   const SizeType nrefls = std::max<SizeType>(0, dist.size().cols() - band_size - 1);
 
   // Note:
-  // Reflector of size = 1 is not considered whatever T is (i.e. neither real
-  // nor complex)
-  DLAF_ASSERT(dist.block_size().cols() % band_size == 0, dist.block_size().cols(), band_size);
-
+  // Each rank has space for storing taus for one tile. It is distributed as the input matrix (i.e. 2D)
   // Note:
-  // row-vector that is distributed over columns, but replicated over rows.
-  // for historical reason it is stored and accessed as a column-vector.
+  // It is distributed "transposed" because of implicit assumptions in functions, i.e.
+  // computePanelReflectors and computeTFactor, which expect a column vector.
   DLAF_ASSERT(dist.block_size().cols() % band_size == 0, dist.block_size().cols(), band_size);
-  const matrix::Distribution dist_taus(GlobalElementSize(nrefls, 1),
-                                       TileElementSize(dist.block_size().cols(), 1),
-                                       comm::Size2D(dist.grid_size().cols(), 1),
-                                       comm::Index2D(rank.col(), 0),
-                                       comm::Index2D(dist.source_rank_index().col(), 0));
-  Matrix<T, Device::CPU> mat_taus_1st(dist_taus);
-  Matrix<T, Device::CPU> mat_taus_2nd(dist_taus);
+  Matrix<T, Device::CPU> mat_taus_1st(matrix::Distribution(
+      GlobalElementSize(nrefls, dist.grid_size().rows()), TileElementSize(dist.block_size().cols(), 1),
+      transposed(dist.grid_size()), transposed(rank), transposed(dist.source_rank_index())));
 
   // Note:
-  // Matrix distributed as the input one, but it has room for storing just one row per rank.
+  // It has room for storing one tile per rank and it is distributed as the input matrix (i.e. 2D)
   const matrix::Distribution dist_hh_2nd(
       GlobalElementSize(dist.grid_size().rows() * dist.block_size().rows(), dist.size().cols()),
       dist.block_size(), dist.grid_size(), rank, dist.source_rank_index());
   Matrix<T, D> mat_hh_2nd(dist_hh_2nd);
 
+  // Note:
+  // Is stored as a column but it acts as a row vector. It is replicated over rows and it is distributed
+  // over columns (i.e. 1D distributed)
+  DLAF_ASSERT(dist.block_size().cols() % band_size == 0, dist.block_size().cols(), band_size);
+  Matrix<T, Device::CPU> mat_taus_2nd(
+      matrix::Distribution(GlobalElementSize(nrefls, 1), TileElementSize(dist.block_size().cols(), 1),
+                           comm::Size2D(dist.grid_size().cols(), 1), comm::Index2D(rank.col(), 0),
+                           comm::Index2D(dist.source_rank_index().col(), 0)));
+
   if (nrefls == 0)
     return {std::move(mat_taus_1st), std::move(mat_taus_2nd), std::move(mat_hh_2nd)};
 
@@ -464,7 +468,7 @@ CARed2BandResult<T, D> CAReductionToBand<B, D, T>::call(comm::CommunicatorGrid&
     const SizeType i = j + 1;
     const SizeType j_lc = dist.template local_tile_from_global_tile<Coord::Col>(j);
 
-    const SizeType nrefls_step = dist_taus.tile_size_of({j, 0}).rows();
+    const SizeType nrefls_step = mat_taus_1st.tile_size_of(GlobalTileIndex(j, rank.row())).rows();
 
     // panel
     const GlobalTileIndex panel_offset(i, j);
@@ -510,8 +514,7 @@ CARed2BandResult<T, D> CAReductionToBand<B, D, T>::call(comm::CommunicatorGrid&
         const bool has_head = !panel_view.iteratorLocal().empty();
         setupReflectorPanelV<B, D, T>(has_head, panel_view, nrefls_step, ws_V, mat_a, !is_full_band);
 
-        const GlobalTileIndex j_tau(j, 0);
-        computeTFactor<B>(ws_V, mat_taus_1st.read(j_tau), ws_T.readwrite(zero_lc));
+        computeTFactor<B>(ws_V, mat_taus_1st.read(LocalTileIndex{j_lc, 0}), ws_T.readwrite(zero_lc));
       }
 
       auto& ws_VT = panels_vt.nextResource();

From 5c146f85c3be61960c9f6ffc67b8bc0c49b0ae82 Mon Sep 17 00:00:00 2001
From: Alberto Invernizzi <invernizzi@cscs.ch>
Date: Fri, 19 Jul 2024 15:13:36 +0200
Subject: [PATCH 13/30] update test structure

---
 .../eigensolver/test_reduction_to_band.cpp    | 126 ++++++++++++++++--
 1 file changed, 117 insertions(+), 9 deletions(-)

diff --git a/test/unit/eigensolver/test_reduction_to_band.cpp b/test/unit/eigensolver/test_reduction_to_band.cpp
index 99dcc5a247..c7846d0b90 100644
--- a/test/unit/eigensolver/test_reduction_to_band.cpp
+++ b/test/unit/eigensolver/test_reduction_to_band.cpp
@@ -499,6 +499,95 @@ using CAReductionToBandTestMC = ReductionToBandTest<T>;
 using JustThisType = ::testing::Types<float>;
 TYPED_TEST_SUITE(CAReductionToBandTestMC, JustThisType);
 
+template <class T>
+MatrixLocal<T> allGatherT(Matrix<const T, Device::CPU>& source, comm::CommunicatorGrid& comm_grid) {
+  // TODO tranposed distribution
+  // DLAF_ASSERT(matrix::equal_process_grid(source, comm_grid), source, comm_grid);
+
+  namespace tt = pika::this_thread::experimental;
+
+  MatrixLocal<std::remove_const_t<T>> dest(source.size(), source.baseTileSize());
+
+  const auto& dist_source = source.distribution();
+  const auto rank = transposed(dist_source.rank_index());
+
+  for (const auto& ij : iterate_range2d(dist_source.nr_tiles())) {
+    const comm::Index2D owner = transposed(dist_source.rank_global_tile(ij));
+
+    auto& dest_tile = dest.tile(ij);
+
+    if (owner == rank) {
+      const auto source_tile_holder = tt::sync_wait(source.read(ij));
+      const auto& source_tile = source_tile_holder.get();
+      comm::sync::broadcast::send(comm_grid.fullCommunicator(), source_tile);
+      matrix::internal::copy(source_tile, dest_tile);
+    }
+    else {
+      comm::sync::broadcast::receive_from(comm_grid.rankFullCommunicator(owner),
+                                          comm_grid.fullCommunicator(), dest_tile);
+    }
+  }
+
+  return MatrixLocal<T>(std::move(dest));
+}
+
+template <class T>
+auto checkResult(const SizeType k, const SizeType band_size, Matrix<const T, Device::CPU>& reference,
+                 const MatrixLocal<T>& mat_b, const MatrixLocal<T>& mat_hh_1st,
+                 const MatrixLocal<T>& taus_1st, const MatrixLocal<T>& mat_hh_2nd,
+                 const std::vector<T>& taus_2nd) {
+  dlaf::internal::silenceUnusedWarningFor(k, band_size, reference, mat_b);
+
+  const GlobalElementIndex offset(band_size, 0);
+  // Now that all input are collected locally, it's time to apply the transformation,
+  // ...but just if there is any
+  if (offset.isIn(mat_hh_1st.size())) {
+    // Reduction to band returns a sequence of transformations applied from left and right to A
+    // allowing to reduce the matrix A to a band matrix B
+    //
+    // Hn* ... H2* H1* A H1 H2 ... Hn
+    // Q* A Q = B
+    //
+    // Applying the inverse of the same transformations, we can go from B to A
+    // Q B Q* = A
+    // Q = H1 H2 ... Hn
+    // H1 H2 ... Hn B Hn* ... H2* H1*
+
+    dlaf::common::internal::SingleThreadedBlasScope single;
+
+    // apply from left...
+    // const GlobalElementIndex left_offset = offset;
+    // const GlobalElementSize left_size{mat_b.size().rows() - band_size, mat_b.size().cols()};
+    // lapack::unmqr(lapack::Side::Left, lapack::Op::NoTrans, left_size.rows(), left_size.cols(), k,
+    //               mat_hh_1st.ptr(offset), mat_hh_1st.ld(), taus_1st.data(), mat_b.ptr(left_offset),
+    //               mat_b.ld());
+
+    // ... and from right
+    // const GlobalElementIndex right_offset = common::transposed(left_offset);
+    // const GlobalElementSize right_size = common::transposed(left_size);
+
+    // lapack::unmqr(lapack::Side::Right, lapack::Op::ConjTrans, right_size.rows(), right_size.cols(), k,
+    //               mat_hh_1st.ptr(offset), mat_hh_1st.ld(), taus_1st.data(), mat_b.ptr(right_offset),
+    //               mat_b.ld());
+  }
+
+  dlaf::internal::silenceUnusedWarningFor(mat_hh_1st, taus_1st);
+
+  // Eventually, check the result obtained by applying the inverse transformation equals the original matrix
+  auto result = [&dist = reference.distribution(),
+                 &mat_local = mat_b](const GlobalElementIndex& element) {
+    const auto tile_index = dist.globalTileIndex(element);
+    const auto tile_element = dist.tileElementIndex(element);
+    return mat_local.tile_read(tile_index)(tile_element);
+  };
+  // TODO FIXME
+  dlaf::internal::silenceUnusedWarningFor(result);
+  // CHECK_MATRIX_NEAR(result, reference, 0,
+  //                   std::max<SizeType>(1, mat_b.size().linear_size()) * TypeUtilities<T>::error);
+
+  dlaf::internal::silenceUnusedWarningFor(mat_hh_2nd, taus_2nd);
+}
+
 template <class T, Device D, Backend B>
 void testCAReductionToBand(comm::CommunicatorGrid& grid, const LocalElementSize size,
                            const TileElementSize block_size, const SizeType band_size,
@@ -522,29 +611,48 @@ void testCAReductionToBand(comm::CommunicatorGrid& grid, const LocalElementSize
   Matrix<T, Device::CPU> matrix_a_h(distribution);
   copy(reference, matrix_a_h);
 
+  print(format::numpy{}, "A", matrix_a_h);
+
   eigensolver::internal::CARed2BandResult<T, D> red2band_result = [&]() {
     MatrixMirror<T, D, Device::CPU> matrix_a(matrix_a_h);
     return eigensolver::internal::ca_reduction_to_band<B>(grid, matrix_a.get(), band_size);
   }();
 
-  Matrix<T, D>& mat_local_taus = red2band_result.taus_1st;
+  // print(format::numpy{}, "R", matrix_a_h);
+  // print(format::numpy{}, "taus_1st", red2band_result.taus_1st);
+  // print(format::numpy{}, "mat_hh_2nd", red2band_result.hh_2nd);
+  // print(format::numpy{}, "taus_2nd", red2band_result.taus_2nd);
 
-  // what?!
-  // ASSERT_EQ(mat_local_taus.block_size().rows(), block_size.rows());
+  ASSERT_EQ(red2band_result.taus_1st.block_size().rows(), block_size.rows());
+  ASSERT_EQ(red2band_result.taus_2nd.block_size().rows(), block_size.rows());
 
   checkUpperPartUnchanged(reference, matrix_a_h);
 
   // Wait for all work to finish before doing blocking communication
   pika::wait();
 
-  auto mat_v = allGather(blas::Uplo::Lower, matrix_a_h, grid);
-  auto mat_b = makeLocal(matrix_a_h);
-  splitReflectorsAndBand(mat_v, mat_b, band_size);
+  auto mat_hh_1st = allGather(blas::Uplo::Lower, matrix_a_h, grid);
 
-  auto taus = allGatherTaus(k_reflectors, mat_local_taus, grid);
-  ASSERT_EQ(taus.size(), k_reflectors);
+  auto taus_1st = allGatherT(red2band_result.taus_1st, grid);
+  ASSERT_EQ(taus_1st.size().rows(), k_reflectors);
+  ASSERT_EQ(taus_1st.size().cols(), grid.size().rows());
+
+  auto mat_hh_2nd = allGather(blas::Uplo::General, red2band_result.hh_2nd, grid);
+
+  auto taus_2nd = allGatherTaus(k_reflectors, red2band_result.taus_2nd, grid);
+  ASSERT_EQ(taus_2nd.size(), k_reflectors);
+
+  print(format::numpy{}, "mat_hh_2nd", mat_hh_2nd);
+  print(format::numpy{}, "mat_taus_1st", taus_1st);
+  std::cout << "taus_2nd = ";
+  for (auto tau : taus_2nd)
+    std::cout << tau << ", ";
+  std::cout << std::endl;
+
+  auto mat_band = makeLocal(matrix_a_h);
+  splitReflectorsAndBand(mat_hh_1st, mat_band, band_size);
 
-  // checkResult(k_reflectors, band_size, reference, mat_v, mat_b, taus);
+  checkResult(k_reflectors, band_size, reference, mat_band, mat_hh_1st, taus_1st, mat_hh_2nd, taus_2nd);
 }
 
 TYPED_TEST(CAReductionToBandTestMC, CorrectnessDistributed) {

From ad6ade00e3967765356e30a2650f20ea62cec32e Mon Sep 17 00:00:00 2001
From: Alberto Invernizzi <invernizzi@cscs.ch>
Date: Fri, 19 Jul 2024 15:15:56 +0200
Subject: [PATCH 14/30] minor changes

---
 test/unit/matrix/test_layout_info.cpp | 22 ++++------------------
 1 file changed, 4 insertions(+), 18 deletions(-)

diff --git a/test/unit/matrix/test_layout_info.cpp b/test/unit/matrix/test_layout_info.cpp
index 935507853b..e160ca80dc 100644
--- a/test/unit/matrix/test_layout_info.cpp
+++ b/test/unit/matrix/test_layout_info.cpp
@@ -20,6 +20,7 @@
 using namespace dlaf;
 using namespace testing;
 
+// size, block_size, ld, row_offset, col_offset, min_memory
 const std::vector<std::tuple<LocalElementSize, TileElementSize, SizeType, SizeType, SizeType, SizeType>>
     values({{{31, 17}, {7, 11}, 31, 7, 341, 527},      // Scalapack like layout
             {{31, 17}, {32, 11}, 31, 31, 341, 527},    // only one row of tiles
@@ -53,12 +54,7 @@ const std::vector<std::tuple<LocalElementSize, TileElementSize, SizeType, SizeTy
 
 TEST(LayoutInfoTest, Constructor) {
   for (const auto& v : values) {
-    auto size = std::get<0>(v);
-    auto block_size = std::get<1>(v);
-    auto ld = std::get<2>(v);
-    auto row_offset = std::get<3>(v);
-    auto col_offset = std::get<4>(v);
-    auto min_memory = std::get<5>(v);
+    auto [size, block_size, ld, row_offset, col_offset, min_memory] = v;
 
     matrix::LayoutInfo layout(size, block_size, ld, row_offset, col_offset);
 
@@ -102,12 +98,7 @@ TEST(LayoutInfoTest, ComparisonOperator) {
   matrix::LayoutInfo layout0({25, 25}, {5, 5}, 50, 8, 1000);
 
   for (const auto& v : comp_values) {
-    auto size = std::get<0>(v);
-    auto block_size = std::get<1>(v);
-    auto ld = std::get<2>(v);
-    auto row_offset = std::get<3>(v);
-    auto col_offset = std::get<4>(v);
-    auto is_equal = std::get<5>(v);
+    auto [size, block_size, ld, row_offset, col_offset, is_equal] = v;
 
     matrix::LayoutInfo layout(size, block_size, ld, row_offset, col_offset);
 
@@ -135,12 +126,7 @@ const std::vector<std::tuple<LocalElementSize, TileElementSize, SizeType, SizeTy
 
 TEST(LayoutInfoTest, ColMajorLayout) {
   for (const auto& v : col_major_values) {
-    auto size = std::get<0>(v);
-    auto block_size = std::get<1>(v);
-    auto ld = std::get<2>(v);
-    auto row_offset = std::get<3>(v);
-    auto col_offset = std::get<4>(v);
-    auto min_memory = std::get<5>(v);
+    auto [size, block_size, ld, row_offset, col_offset, min_memory] = v;
 
     matrix::LayoutInfo exp_layout(size, block_size, ld, row_offset, col_offset);
     matrix::LayoutInfo layout = colMajorLayout(size, block_size, ld);

From b1b61852b5c388f7b05c1564bdfd6714a5de860b Mon Sep 17 00:00:00 2001
From: Alberto Invernizzi <invernizzi@cscs.ch>
Date: Wed, 14 Aug 2024 17:23:59 +0200
Subject: [PATCH 15/30] CHECK: broadcast panel now communicate also last tile

---
 include/dlaf/communication/broadcast_panel.h  | 12 +---------
 .../dlaf/eigensolver/reduction_to_band/impl.h | 24 -------------------
 2 files changed, 1 insertion(+), 35 deletions(-)

diff --git a/include/dlaf/communication/broadcast_panel.h b/include/dlaf/communication/broadcast_panel.h
index e83e35a98d..2fa15e335b 100644
--- a/include/dlaf/communication/broadcast_panel.h
+++ b/include/dlaf/communication/broadcast_panel.h
@@ -105,9 +105,6 @@ auto& get_taskchain(comm::CommunicatorPipeline<comm::CommunicatorType::Row>& row
 /// - linking as external tile, if the tile is already available locally for the rank
 /// - receiving the tile from the owning rank (via a broadcast)
 ///
-/// Be aware that the last tile will just be available on @p panel, but it won't be transposed to
-/// @p panelT.
-///
 /// @param rank_root specifies on which rank the @p panel is the source of the data
 /// @param panel
 ///   on rank_root it is the source panel (a)
@@ -183,14 +180,7 @@ void broadcast(comm::IndexT_MPI rank_root, matrix::Panel<axis, T, D, storage>& p
 
   auto& chain_step2 = internal::get_taskchain<comm_dir_step2>(row_task_chain, col_task_chain);
 
-  const SizeType last_tile = std::max(panelT.rangeStart(), panelT.rangeEnd() - 1);
-  const auto owner = dist.template rankGlobalTile<coordT>(last_tile);
-  const auto range = dist.rankIndex().get(coordT) == owner
-                         ? common::iterate_range2d(*panelT.iteratorLocal().begin(),
-                                                   LocalTileIndex(coordT, panelT.rangeEndLocal() - 1, 1))
-                         : panelT.iteratorLocal();
-
-  for (const auto& indexT : range) {
+  for (const auto& indexT : panelT.iteratorLocal()) {
     auto [index_diag, owner_diag] = internal::transposedOwner<coordT>(dist, indexT);
 
     namespace ex = pika::execution::experimental;
diff --git a/include/dlaf/eigensolver/reduction_to_band/impl.h b/include/dlaf/eigensolver/reduction_to_band/impl.h
index 3dbbdd7b9c..75acd0a06f 100644
--- a/include/dlaf/eigensolver/reduction_to_band/impl.h
+++ b/include/dlaf/eigensolver/reduction_to_band/impl.h
@@ -953,30 +953,6 @@ Matrix<T, Device::CPU> ReductionToBand<B, D, T>::call(comm::CommunicatorGrid& gr
           dist.template nextLocalTileFromGlobalElement<Coord::Col>(at_offset.col()),
       };
 
-      // Note:
-      // This additional communication of the last tile is a workaround for supporting following trigger
-      // when b < mb.
-      // Indeed, if b < mb the last column have (at least) a panel to compute, but differently from
-      // other columns, broadcast transposed doesn't communicate the last tile, which is an assumption
-      // needed to make the following trigger work correctly.
-      const SizeType at_tile_col =
-          dist.template globalTileFromGlobalElement<Coord::Col>(at_offset.col());
-
-      if (at_tile_col == dist.nrTiles().cols() - 1) {
-        const comm::IndexT_MPI owner = rank_v0.row();
-        if (rank.row() == owner) {
-          xt.setTile(at, x.read(at));
-
-          if (dist.commGridSize().rows() > 1)
-            ex::start_detached(comm::schedule_bcast_send(mpi_col_chain.exclusive(), xt.read(at)));
-        }
-        else {
-          if (dist.commGridSize().rows() > 1)
-            ex::start_detached(comm::schedule_bcast_recv(mpi_col_chain.exclusive(), owner,
-                                                         xt.readwrite(at)));
-        }
-      }
-
       if constexpr (dlaf::comm::CommunicationDevice_v<D> == D) {
         // Note:
         // if there is no need for additional buffers, we can just wait that xt[0] is ready for

From 8f8b19958a54fddfe5b6e844b8af9a0372365445 Mon Sep 17 00:00:00 2001
From: Alberto Invernizzi <invernizzi@cscs.ch>
Date: Wed, 14 Aug 2024 17:26:47 +0200
Subject: [PATCH 16/30] hh2 fix panel size problem for reflectors

---
 .../eigensolver/reduction_to_band/ca-impl.h   | 35 ++++++-------------
 1 file changed, 10 insertions(+), 25 deletions(-)

diff --git a/include/dlaf/eigensolver/reduction_to_band/ca-impl.h b/include/dlaf/eigensolver/reduction_to_band/ca-impl.h
index 3c54d02de9..f933a0f6ae 100644
--- a/include/dlaf/eigensolver/reduction_to_band/ca-impl.h
+++ b/include/dlaf/eigensolver/reduction_to_band/ca-impl.h
@@ -525,17 +525,16 @@ CARed2BandResult<T, D> CAReductionToBand<B, D, T>::call(comm::CommunicatorGrid&
 
       // W = V T
       auto& ws_W0 = panels_w0.nextResource();
-      auto& ws_W0T = panels_w0t.nextResource();
-
       ws_W0.setRangeStart(at_offset);
-      ws_W0T.setRangeStart(at_offset);
-
       ws_W0.setWidth(nrefls_step);
-      ws_W0T.setHeight(nrefls_step);
 
       if (rank_panel == rank.col())
         red2band::local::trmmComputeW<B, D>(ws_W0, ws_V, ws_T.read(zero_lc));
 
+      auto& ws_W0T = panels_w0t.nextResource();
+      ws_W0T.setRangeStart(at_offset);
+      ws_W0T.setHeight(nrefls_step);
+
       comm::broadcast(rank_panel, ws_W0, ws_W0T, mpi_row_chain, mpi_col_chain);
 
       // Note: apply local transformations, one after the other
@@ -670,12 +669,11 @@ CARed2BandResult<T, D> CAReductionToBand<B, D, T>::call(comm::CommunicatorGrid&
     // QR local with just heads (HH reflectors have to be computed elsewhere, 1st-pass in-place)
     auto&& panel_heads = panels_heads.nextResource();
     panel_heads.setRangeEnd({n_qr_heads, 0});
-    panel_heads.setWidth(nrefls_step);
 
     const matrix::Distribution dist_heads_current(LocalElementSize(n_qr_heads * dist.block_size().rows(),
                                                                    dist.block_size().cols()),
                                                   dist.block_size());
-    const matrix::SubPanelView panel_heads_view(dist_heads_current, {0, 0}, nrefls_step);
+    const matrix::SubPanelView panel_heads_view(dist_heads_current, {0, 0}, band_size);
 
     const bool rank_has_head_row = !panel_view.iteratorLocal().empty();
     if (rank_panel == rank.col()) {
@@ -765,10 +763,15 @@ CARed2BandResult<T, D> CAReductionToBand<B, D, T>::call(comm::CommunicatorGrid&
           }
         }
       }
+
+      panel_heads.reset();
     }
 
     // TRAILING 2nd pass
     {
+      panel_heads.setRangeEnd({n_qr_heads, 0});
+      panel_heads.setWidth(nrefls_step);
+
       const GlobalTileIndex at_end_L(at_offset.row() + n_qr_heads, 0);
       const GlobalTileIndex at_end_R(0, at_offset.col() + n_qr_heads);
 
@@ -811,12 +814,6 @@ CARed2BandResult<T, D> CAReductionToBand<B, D, T>::call(comm::CommunicatorGrid&
 
       comm::broadcast(rank_panel, ws_V, ws_VT, mpi_row_chain, mpi_col_chain);
 
-      for (const auto& i_lc : ws_VT.iteratorLocal()) {
-        std::ostringstream ss;
-        ss << "VT2nd(" << dist.global_tile_index(i_lc) << ")";
-        print_sync(ss.str(), ws_VT.read(i_lc));
-      }
-
       // Note:
       // Differently from 1st pass, where transformations are independent one from the other,
       // this 2nd pass is a single QR transformation that has to be applied from L and R.
@@ -836,18 +833,6 @@ CARed2BandResult<T, D> CAReductionToBand<B, D, T>::call(comm::CommunicatorGrid&
 
       comm::broadcast(rank_panel, ws_W0, ws_W0T, mpi_row_chain, mpi_col_chain);
 
-      for (const auto& idx : ws_W0.iteratorLocal()) {
-        std::ostringstream ss;
-        ss << "W0(" << dist.global_tile_index(idx) << ")";
-        print_sync(ss.str(), ws_W0.read(idx));
-      }
-
-      for (const auto& idx : ws_W0T.iteratorLocal()) {
-        std::ostringstream ss;
-        ss << "W0T(" << dist.global_tile_index(idx) << ")";
-        print_sync(ss.str(), ws_W0T.read(idx));
-      }
-
       // W1 = A W0
       auto& ws_W1 = panels_w1.nextResource();
       ws_W1.setRange(at_offset, at_end_L);

From 3883430bd85c9f2ecb1c0876dc6316a7cdd0b6d2 Mon Sep 17 00:00:00 2001
From: Alberto Invernizzi <invernizzi@cscs.ch>
Date: Wed, 14 Aug 2024 17:28:03 +0200
Subject: [PATCH 17/30] hh2 fix W1 panel, which has a different size wrt others

---
 include/dlaf/eigensolver/reduction_to_band/ca-impl.h | 12 +++---------
 1 file changed, 3 insertions(+), 9 deletions(-)

diff --git a/include/dlaf/eigensolver/reduction_to_band/ca-impl.h b/include/dlaf/eigensolver/reduction_to_band/ca-impl.h
index f933a0f6ae..799c743727 100644
--- a/include/dlaf/eigensolver/reduction_to_band/ca-impl.h
+++ b/include/dlaf/eigensolver/reduction_to_band/ca-impl.h
@@ -835,22 +835,16 @@ CARed2BandResult<T, D> CAReductionToBand<B, D, T>::call(comm::CommunicatorGrid&
 
       // W1 = A W0
       auto& ws_W1 = panels_w1.nextResource();
-      ws_W1.setRange(at_offset, at_end_L);
+      ws_W1.setRangeStart(at_offset);
       ws_W1.setWidth(nrefls_step);
 
       auto& ws_W1T = panels_w1t.nextResource();
-      ws_W1T.setRange(at_offset, at_end_R);
+      ws_W1T.setRangeStart(at_offset);
       ws_W1T.setHeight(nrefls_step);
 
       ca_red2band::hemm2nd<B, D>(rank_panel, ws_W1, ws_W1T, at_view, at_end_R.col(), mat_a, ws_W0,
                                  ws_W0T, mpi_row_chain, mpi_col_chain);
 
-      for (const auto& idx : ws_W1.iteratorLocal()) {
-        std::ostringstream ss;
-        ss << "W1(" << dist.global_tile_index(idx) << ")";
-        print_sync(ss.str(), ws_W1.read(idx));
-      }
-
       // W1 = W1 - 0.5 V W0* W1
       if (rank.col() == rank_panel) {
         matrix::Matrix<T, D> ws_W2 = std::move(ws_T);
@@ -870,7 +864,7 @@ CARed2BandResult<T, D> CAReductionToBand<B, D, T>::call(comm::CommunicatorGrid&
 
       // distribute W1 -> W1T
       ws_W1T.reset();
-      ws_W1T.setRange(at_offset, at_end_R);
+      ws_W1T.setRangeStart(at_offset);
       ws_W1T.setHeight(nrefls_step);
 
       comm::broadcast(rank_panel, ws_W1, ws_W1T, mpi_row_chain, mpi_col_chain);

From 783956410222485f19e1b57b4037bf11355d8640 Mon Sep 17 00:00:00 2001
From: Alberto Invernizzi <invernizzi@cscs.ch>
Date: Wed, 14 Aug 2024 17:28:43 +0200
Subject: [PATCH 18/30] fix deadlock: potentially unused tile

---
 include/dlaf/eigensolver/reduction_to_band/ca-impl.h | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/include/dlaf/eigensolver/reduction_to_band/ca-impl.h b/include/dlaf/eigensolver/reduction_to_band/ca-impl.h
index 799c743727..28c91e928e 100644
--- a/include/dlaf/eigensolver/reduction_to_band/ca-impl.h
+++ b/include/dlaf/eigensolver/reduction_to_band/ca-impl.h
@@ -601,8 +601,6 @@ CARed2BandResult<T, D> CAReductionToBand<B, D, T>::call(comm::CommunicatorGrid&
           if (rank_qr.row() != id_qr_R)
             continue;
 
-          auto&& tile_vt = ws_VT.read({0, j_lc});
-
           for (SizeType i_lc = at_offset_lc.row(); i_lc < dist.local_nr_tiles().rows(); ++i_lc) {
             const LocalTileIndex ij_lc(i_lc, j_lc);
             const GlobalTileIndex ij = dist.global_tile_index(ij_lc);
@@ -618,7 +616,7 @@ CARed2BandResult<T, D> CAReductionToBand<B, D, T>::call(comm::CommunicatorGrid&
               continue;
 
             ex::start_detached(di::whenAllLift(blas::Op::NoTrans, blas::Op::ConjTrans, T(-1),
-                                               ws_W1.read(ij_lc), tile_vt, T(1),
+                                               ws_W1.read(ij_lc), ws_VT.read(ij_lc), T(1),
                                                mat_a.readwrite(ij_lc)) |
                                tile::gemm(di::Policy<B>()));
           }

From e26812b4179a3aa1b711e59474888f8ab70e4296 Mon Sep 17 00:00:00 2001
From: Alberto Invernizzi <invernizzi@cscs.ch>
Date: Wed, 14 Aug 2024 17:29:16 +0200
Subject: [PATCH 19/30] hh1: fix her2k transposed indices usage

---
 include/dlaf/eigensolver/reduction_to_band/ca-impl.h | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/include/dlaf/eigensolver/reduction_to_band/ca-impl.h b/include/dlaf/eigensolver/reduction_to_band/ca-impl.h
index 28c91e928e..5d586a6819 100644
--- a/include/dlaf/eigensolver/reduction_to_band/ca-impl.h
+++ b/include/dlaf/eigensolver/reduction_to_band/ca-impl.h
@@ -210,10 +210,14 @@ void her2kUpdateTrailingMatrix(comm::Index2D rank_qr, const matrix::SubMatrixVie
       }
       else {
         // TODO check and document why tranposed operand can be accessed with same index locally
+
+        const GlobalTileIndex ijT = transposed(ij);
+        const LocalTileIndex ijT_lc = dist.local_tile_index(ijT);
+
         // A -= W3 . V*
-        her2kOffDiag<B>(priority, W3.read(ij_lc), V.read(ij_lc), getSubA());
+        her2kOffDiag<B>(priority, W3.read(ij_lc), V.read(ijT_lc), getSubA());
         // A -= V . W3*
-        her2kOffDiag<B>(priority, V.read(ij_lc), W3.read(ij_lc), getSubA());
+        her2kOffDiag<B>(priority, V.read(ij_lc), W3.read(ijT_lc), getSubA());
       }
     }
   }
@@ -246,6 +250,7 @@ void hemm2nd(comm::IndexT_MPI rank_panel, matrix::Panel<Coord::Col, T, D>& W1,
   const LocalTileIndex at_offset = at_view.begin();
 
   const SizeType jR_end_lc = dist.template next_local_tile_from_global_tile<Coord::Col>(j_end);
+
   for (SizeType i_lc = at_offset.row(); i_lc < dist.localNrTiles().rows(); ++i_lc) {
     const auto j_end_lc =
         std::min(jR_end_lc, dist.template next_local_tile_from_global_tile<Coord::Col>(

From 9d8d648dbf3f6e52c5847d3eef70f530b2d9ba9f Mon Sep 17 00:00:00 2001
From: Alberto Invernizzi <invernizzi@cscs.ch>
Date: Wed, 14 Aug 2024 17:29:41 +0200
Subject: [PATCH 20/30] hh2: fix her2k computation LR + R

---
 .../eigensolver/reduction_to_band/ca-impl.h   | 43 +++++++++++++++----
 1 file changed, 35 insertions(+), 8 deletions(-)

diff --git a/include/dlaf/eigensolver/reduction_to_band/ca-impl.h b/include/dlaf/eigensolver/reduction_to_band/ca-impl.h
index 5d586a6819..8ac12c297f 100644
--- a/include/dlaf/eigensolver/reduction_to_band/ca-impl.h
+++ b/include/dlaf/eigensolver/reduction_to_band/ca-impl.h
@@ -340,8 +340,8 @@ void hemm2nd(comm::IndexT_MPI rank_panel, matrix::Panel<Coord::Col, T, D>& W1,
 }
 
 template <Backend B, Device D, class T>
-void her2k_2nd(const SizeType j_end, const matrix::SubMatrixView& at_view, matrix::Matrix<T, D>& a,
-               matrix::Panel<Coord::Col, const T, D>& W1,
+void her2k_2nd(const SizeType i_end, const SizeType j_end, const matrix::SubMatrixView& at_view,
+               matrix::Matrix<T, D>& a, matrix::Panel<Coord::Col, const T, D>& W1,
                matrix::Panel<Coord::Row, T, D, matrix::StoreTransposed::Yes>& W1T,
                matrix::Panel<Coord::Col, const T, D>& V,
                matrix::Panel<Coord::Row, T, D, matrix::StoreTransposed::Yes>& VT) {
@@ -353,14 +353,15 @@ void her2k_2nd(const SizeType j_end, const matrix::SubMatrixView& at_view, matri
 
   const auto dist = a.distribution();
 
-  const LocalTileIndex at_offset = at_view.begin();
+  const LocalTileIndex at_offset_lc = at_view.begin();
 
+  const SizeType iL_end_lc = dist.template next_local_tile_from_global_tile<Coord::Row>(i_end);
   const SizeType jR_end_lc = dist.template next_local_tile_from_global_tile<Coord::Col>(j_end);
-  for (SizeType i_lc = at_offset.row(); i_lc < dist.localNrTiles().rows(); ++i_lc) {
+  for (SizeType i_lc = at_offset_lc.row(); i_lc < iL_end_lc; ++i_lc) {
     const auto j_end_lc =
         std::min(jR_end_lc, dist.template next_local_tile_from_global_tile<Coord::Col>(
                                 dist.template global_tile_from_local_tile<Coord::Row>(i_lc) + 1));
-    for (SizeType j_lc = at_offset.col(); j_lc < j_end_lc; ++j_lc) {
+    for (SizeType j_lc = at_offset_lc.col(); j_lc < j_end_lc; ++j_lc) {
       const LocalTileIndex ij_local{i_lc, j_lc};
       const GlobalTileIndex ij = dist.globalTileIndex(ij_local);
 
@@ -372,7 +373,8 @@ void her2k_2nd(const SizeType j_end, const matrix::SubMatrixView& at_view, matri
 
       // The first column of the trailing matrix (except for the very first global tile) has to be
       // updated first, in order to unlock the next iteration as soon as possible.
-      const auto priority = (j_lc == at_offset.col()) ? thread_priority::high : thread_priority::normal;
+      const auto priority =
+          (j_lc == at_offset_lc.col()) ? thread_priority::high : thread_priority::normal;
 
       if (is_diagonal_tile) {
         her2kDiag<B>(priority, V.read(ij_local), W1.read(ij_local), getSubA());
@@ -386,6 +388,29 @@ void her2k_2nd(const SizeType j_end, const matrix::SubMatrixView& at_view, matri
       }
     }
   }
+
+  // This is just going to update rows that are going to be updated just from right.
+  for (SizeType i_lc = iL_end_lc; i_lc < dist.local_nr_tiles().rows(); ++i_lc) {
+    const auto j_end_lc =
+        std::min(jR_end_lc, dist.template next_local_tile_from_global_tile<Coord::Col>(
+                                dist.template global_tile_from_local_tile<Coord::Row>(i_lc) + 1));
+    for (SizeType j_lc = at_offset_lc.col(); j_lc < j_end_lc; ++j_lc) {
+      const LocalTileIndex ij_lc{i_lc, j_lc};
+
+      auto getSubA = [&a, &at_view, ij_lc]() { return splitTile(a.readwrite(ij_lc), at_view(ij_lc)); };
+
+      // The first column of the trailing matrix (except for the very first global tile) has to be
+      // updated first, in order to unlock the next iteration as soon as possible.
+      const auto priority =
+          (j_lc == at_offset_lc.col()) ? thread_priority::high : thread_priority::normal;
+
+      std::cout << "HER2K-EXTRA" << dist.global_tile_index(ij_lc) << std::endl;
+      print_sync("tile_w1", W1.read(ij_lc));
+      print_sync("tile_v", VT.read(ij_lc));
+      // A -= X . V*
+      her2kOffDiag<B>(priority, W1.read(ij_lc), VT.read(ij_lc), getSubA());
+    }
+  }
 }
 }
 
@@ -872,8 +897,10 @@ CARed2BandResult<T, D> CAReductionToBand<B, D, T>::call(comm::CommunicatorGrid&
 
       comm::broadcast(rank_panel, ws_W1, ws_W1T, mpi_row_chain, mpi_col_chain);
 
-      // A -= W1 VT + V W1T
-      ca_red2band::her2k_2nd<B>(at_end_R.col(), at_view, mat_a, ws_W1, ws_W1T, ws_V, ws_VT);
+      // LR: A -= W1 VT + V W1T
+      // R : [at_end_L.row():, :at_endR_col()] A = A - W1 V.T
+      ca_red2band::her2k_2nd<B>(at_end_L.row(), at_end_R.col(), at_view, mat_a, ws_W1, ws_W1T, ws_V,
+                                ws_VT);
 
       ws_W1T.reset();
       ws_W1.reset();

From 155fd59e9915733412958ba55e47790d6affc3b4 Mon Sep 17 00:00:00 2001
From: Alberto Invernizzi <invernizzi@cscs.ch>
Date: Wed, 14 Aug 2024 17:30:26 +0200
Subject: [PATCH 21/30] t-factor minor changes

---
 include/dlaf/factorization/qr/t_factor_impl.h | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

diff --git a/include/dlaf/factorization/qr/t_factor_impl.h b/include/dlaf/factorization/qr/t_factor_impl.h
index b297175178..63f362b96e 100644
--- a/include/dlaf/factorization/qr/t_factor_impl.h
+++ b/include/dlaf/factorization/qr/t_factor_impl.h
@@ -58,9 +58,9 @@ struct Helpers<Backend::MC, Device::CPU, T> {
         std::forward<TSender>(t));
   }
 
-  template <class VISender, class TSender>
-  static auto gemvColumnT(SizeType first_row_tile, VISender tile_vi,
-                          matrix::ReadOnlyTileSender<T, Device::CPU> taus, TSender&& tile_t) {
+  static auto gemvColumnT(SizeType first_row_tile, matrix::ReadOnlyTileSender<T, Device::CPU> tile_vi,
+                          matrix::ReadOnlyTileSender<T, Device::CPU> taus,
+                          matrix::ReadWriteTileSender<T, Device::CPU>&& tile_t) {
     namespace ex = pika::execution::experimental;
 
     auto gemv_func = [first_row_tile](const auto& tile_v, const auto& taus, auto&& tile_t) noexcept {
@@ -76,12 +76,13 @@ struct Helpers<Backend::MC, Device::CPU, T> {
 
         // Position of the 1 in the diagonal in the current column.
         SizeType i_diag = j - first_row_tile;
-        const SizeType first_element_in_col = std::max<SizeType>(0, i_diag);
 
         // Break if the reflector starts in the next tile.
         if (i_diag >= tile_v.size().rows())
           break;
 
+        const SizeType first_element_in_col = std::max<SizeType>(0, i_diag);
+
         // T(0:j, j) = -tau . V(j:, 0:j)* . V(j:, j)
         // [j x 1] = [(n-j) x j]* . [(n-j) x 1]
         TileElementIndex va_start{first_element_in_col, 0};
@@ -93,14 +94,14 @@ struct Helpers<Backend::MC, Device::CPU, T> {
         }
 
         blas::gemv(blas::Layout::ColMajor, blas::Op::ConjTrans, va_size.rows(), va_size.cols(), -tau,
-                   tile_v.ptr(va_start), tile_v.ld(), tile_v.ptr(vb_start), 1, 1, tile_t.ptr(t_start),
+                   tile_v.ptr(va_start), tile_v.ld(), tile_v.ptr(vb_start), 1, T(1), tile_t.ptr(t_start),
                    1);
       }
       return std::move(tile_t);
     };
     return dlaf::internal::transform(
         dlaf::internal::Policy<Backend::MC>(pika::execution::thread_priority::high),
-        std::move(gemv_func), ex::when_all(tile_vi, std::move(taus), std::forward<TSender>(tile_t)));
+        std::move(gemv_func), ex::when_all(tile_vi, std::move(taus), std::move(tile_t)));
   }
 
   template <typename TSender>

From 7c3dc6079b7568e855dea3c3bb1751fed798b34e Mon Sep 17 00:00:00 2001
From: Alberto Invernizzi <invernizzi@cscs.ch>
Date: Wed, 14 Aug 2024 17:46:36 +0200
Subject: [PATCH 22/30] fix t-factor for local usage on distributed matrix

---
 include/dlaf/factorization/qr/t_factor_impl.h | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

diff --git a/include/dlaf/factorization/qr/t_factor_impl.h b/include/dlaf/factorization/qr/t_factor_impl.h
index 63f362b96e..0bf1a2616c 100644
--- a/include/dlaf/factorization/qr/t_factor_impl.h
+++ b/include/dlaf/factorization/qr/t_factor_impl.h
@@ -237,8 +237,6 @@ void QR_Tfactor<backend, device, T>::call(matrix::Panel<Coord::Col, T, device>&
   if (hh_panel.getWidth() == 0)
     return;
 
-  const auto v_start = hh_panel.offsetElement();
-
   matrix::ReadWriteTileSender<T, device> t_local = Helpers::set0(std::move(t));
 
   // Note:
@@ -259,15 +257,15 @@ void QR_Tfactor<backend, device, T>::call(matrix::Panel<Coord::Col, T, device>&
   // 1st step: compute the column partial result `t`
   // First we compute the matrix vector multiplication for each column
   // -tau(j) . V(j:, 0:j)* . V(j:, j)
-  for (const auto& v_i : hh_panel.iteratorLocal()) {
+  for (const auto& i_lc : hh_panel.iteratorLocal()) {
     const SizeType first_row_tile =
-        std::max<SizeType>(0, v_i.row() * hh_panel.parentDistribution().blockSize().rows() - v_start);
+        (i_lc.row() - hh_panel.rangeStartLocal()) * hh_panel.parentDistribution().tile_size().rows();
 
     // Note:
     // Since we are writing always on the same t, the gemv are serialized
     // A possible solution to this would be to have multiple places where to store partial
     // results, and then locally reduce them just before the reduce over ranks
-    t_local = Helpers::gemvColumnT(first_row_tile, hh_panel.read(v_i), taus, std::move(t_local));
+    t_local = Helpers::gemvColumnT(first_row_tile, hh_panel.read(i_lc), taus, std::move(t_local));
   }
 
   // 2nd step: compute the T factor, by performing the last step on each column

From 2b81a930c4e629956670108fb9763496d496b83c Mon Sep 17 00:00:00 2001
From: Alberto Invernizzi <invernizzi@cscs.ch>
Date: Fri, 16 Aug 2024 14:34:37 +0200
Subject: [PATCH 23/30] fix her2k HH1 + remove console output

---
 .../eigensolver/reduction_to_band/ca-impl.h   | 37 ++++++-------------
 1 file changed, 12 insertions(+), 25 deletions(-)

diff --git a/include/dlaf/eigensolver/reduction_to_band/ca-impl.h b/include/dlaf/eigensolver/reduction_to_band/ca-impl.h
index 8ac12c297f..035e60a340 100644
--- a/include/dlaf/eigensolver/reduction_to_band/ca-impl.h
+++ b/include/dlaf/eigensolver/reduction_to_band/ca-impl.h
@@ -27,9 +27,6 @@
 #include <dlaf/sender/transform.h>
 #include <dlaf/sender/transform_mpi.h>
 
-//
-#include <dlaf/matrix/print_numpy.h>
-
 namespace dlaf::eigensolver::internal {
 
 namespace ca_red2band {
@@ -209,15 +206,19 @@ void her2kUpdateTrailingMatrix(comm::Index2D rank_qr, const matrix::SubMatrixVie
         her2kDiag<B>(priority, V.read(ij_lc), W3.read(ij_lc), getSubA());
       }
       else {
-        // TODO check and document why tranposed operand can be accessed with same index locally
-
-        const GlobalTileIndex ijT = transposed(ij);
-        const LocalTileIndex ijT_lc = dist.local_tile_index(ijT);
+        // TODO fix doc
+        // Note:
+        // - We are updating from both L and R.
+        // - We are computing all combinations of W3 and V (and viceversa), and putting results in A
+        // - By looping on position of A that will contain the result
+        // - We use the same row for the first operand
+        // - We use the col as the row for the second operand
+        const SizeType iT_lc = dist.template local_tile_from_global_tile<Coord::Row>(ij.col());
 
         // A -= W3 . V*
-        her2kOffDiag<B>(priority, W3.read(ij_lc), V.read(ijT_lc), getSubA());
+        her2kOffDiag<B>(priority, W3.read(ij_lc), V.read({iT_lc, 0}), getSubA());
         // A -= V . W3*
-        her2kOffDiag<B>(priority, V.read(ij_lc), W3.read(ijT_lc), getSubA());
+        her2kOffDiag<B>(priority, V.read(ij_lc), W3.read({iT_lc, 0}), getSubA());
       }
     }
   }
@@ -404,9 +405,6 @@ void her2k_2nd(const SizeType i_end, const SizeType j_end, const matrix::SubMatr
       const auto priority =
           (j_lc == at_offset_lc.col()) ? thread_priority::high : thread_priority::normal;
 
-      std::cout << "HER2K-EXTRA" << dist.global_tile_index(ij_lc) << std::endl;
-      print_sync("tile_w1", W1.read(ij_lc));
-      print_sync("tile_v", VT.read(ij_lc));
       // A -= X . V*
       her2kOffDiag<B>(priority, W1.read(ij_lc), VT.read(ij_lc), getSubA());
     }
@@ -698,9 +696,8 @@ CARed2BandResult<T, D> CAReductionToBand<B, D, T>::call(comm::CommunicatorGrid&
     auto&& panel_heads = panels_heads.nextResource();
     panel_heads.setRangeEnd({n_qr_heads, 0});
 
-    const matrix::Distribution dist_heads_current(LocalElementSize(n_qr_heads * dist.block_size().rows(),
-                                                                   dist.block_size().cols()),
-                                                  dist.block_size());
+    const matrix::Distribution dist_heads_current(
+        LocalElementSize(n_qr_heads * dist.block_size().rows(), band_size), dist.block_size());
     const matrix::SubPanelView panel_heads_view(dist_heads_current, {0, 0}, band_size);
 
     const bool rank_has_head_row = !panel_view.iteratorLocal().empty();
@@ -823,17 +820,9 @@ CARed2BandResult<T, D> CAReductionToBand<B, D, T>::call(comm::CommunicatorGrid&
                              }));
         }
 
-        for (const auto& i_lc : ws_V.iteratorLocal()) {
-          std::ostringstream ss;
-          ss << "V2nd(" << dist.global_tile_index(i_lc) << ")";
-          print_sync(ss.str(), ws_V.read(i_lc));
-        }
-
         using factorization::internal::computeTFactor;
         const GlobalTileIndex j_tau(j, 0);
         computeTFactor<B>(panel_heads, mat_taus_2nd.read(j_tau), ws_T.readwrite(zero_lc));
-
-        print_sync("T2nd", ws_T.read(zero_lc));
       }
 
       auto& ws_VT = panels_vt.nextResource();
@@ -884,8 +873,6 @@ CARed2BandResult<T, D> CAReductionToBand<B, D, T>::call(comm::CommunicatorGrid&
                                                                 ws_W2.readwrite(zero_lc)));
         }
 
-        print_sync("W2", ws_W2.read(zero_lc));
-
         // W1 = W1 - 0.5 V W2
         red2band::local::gemmUpdateX<B, D>(ws_W1, ws_W2, ws_V);
       }

From 41fc7ab7f965b9ef0e0048830f30c0b80182cd0d Mon Sep 17 00:00:00 2001
From: Alberto Invernizzi <invernizzi@cscs.ch>
Date: Fri, 16 Aug 2024 14:42:48 +0200
Subject: [PATCH 24/30] basic test implementation

---
 .../eigensolver/test_reduction_to_band.cpp    | 189 ++++++++++++------
 1 file changed, 133 insertions(+), 56 deletions(-)

diff --git a/test/unit/eigensolver/test_reduction_to_band.cpp b/test/unit/eigensolver/test_reduction_to_band.cpp
index c7846d0b90..df327dc789 100644
--- a/test/unit/eigensolver/test_reduction_to_band.cpp
+++ b/test/unit/eigensolver/test_reduction_to_band.cpp
@@ -8,7 +8,9 @@
 // SPDX-License-Identifier: BSD-3-Clause
 //
 
+#include <algorithm>
 #include <cmath>
+#include <functional>
 #include <vector>
 
 #include <pika/execution.hpp>
@@ -24,6 +26,7 @@
 #include <dlaf/eigensolver/reduction_to_band.h>
 #include <dlaf/lapack/tile.h>
 #include <dlaf/matrix/copy.h>
+#include <dlaf/matrix/copy_tile.h>
 #include <dlaf/matrix/index.h>
 #include <dlaf/matrix/matrix.h>
 #include <dlaf/matrix/matrix_mirror.h>
@@ -496,8 +499,7 @@ struct CAReductionToBandTest : public TestWithCommGrids {};
 template <class T>
 using CAReductionToBandTestMC = ReductionToBandTest<T>;
 
-using JustThisType = ::testing::Types<float>;
-TYPED_TEST_SUITE(CAReductionToBandTestMC, JustThisType);
+TYPED_TEST_SUITE(CAReductionToBandTestMC, MatrixElementTypes);
 
 template <class T>
 MatrixLocal<T> allGatherT(Matrix<const T, Device::CPU>& source, comm::CommunicatorGrid& comm_grid) {
@@ -532,46 +534,138 @@ MatrixLocal<T> allGatherT(Matrix<const T, Device::CPU>& source, comm::Communicat
 }
 
 template <class T>
-auto checkResult(const SizeType k, const SizeType band_size, Matrix<const T, Device::CPU>& reference,
-                 const MatrixLocal<T>& mat_b, const MatrixLocal<T>& mat_hh_1st,
-                 const MatrixLocal<T>& taus_1st, const MatrixLocal<T>& mat_hh_2nd,
-                 const std::vector<T>& taus_2nd) {
-  dlaf::internal::silenceUnusedWarningFor(k, band_size, reference, mat_b);
-
+auto checkResult(const Distribution dist, const SizeType band_size,
+                 Matrix<const T, Device::CPU>& reference, const MatrixLocal<T>& mat_b,
+                 const MatrixLocal<T>& mat_hh_1st, const MatrixLocal<T>& taus_1st,
+                 const MatrixLocal<T>& mat_hh_2nd, const std::vector<T>& taus_2nd) {
   const GlobalElementIndex offset(band_size, 0);
   // Now that all input are collected locally, it's time to apply the transformation,
   // ...but just if there is any
   if (offset.isIn(mat_hh_1st.size())) {
-    // Reduction to band returns a sequence of transformations applied from left and right to A
-    // allowing to reduce the matrix A to a band matrix B
-    //
-    // Hn* ... H2* H1* A H1 H2 ... Hn
-    // Q* A Q = B
-    //
-    // Applying the inverse of the same transformations, we can go from B to A
-    // Q B Q* = A
-    // Q = H1 H2 ... Hn
-    // H1 H2 ... Hn B Hn* ... H2* H1*
-
     dlaf::common::internal::SingleThreadedBlasScope single;
 
-    // apply from left...
-    // const GlobalElementIndex left_offset = offset;
-    // const GlobalElementSize left_size{mat_b.size().rows() - band_size, mat_b.size().cols()};
-    // lapack::unmqr(lapack::Side::Left, lapack::Op::NoTrans, left_size.rows(), left_size.cols(), k,
-    //               mat_hh_1st.ptr(offset), mat_hh_1st.ld(), taus_1st.data(), mat_b.ptr(left_offset),
-    //               mat_b.ld());
+    const SizeType ntiles = mat_b.nrTiles().cols() - 1;
+
+    // Apply in reverse order (blocked algorithm), which means both from last to first, inverting
+    // intra-step too, i.e. 2nd first and 1st last.
+    for (SizeType j = ntiles - 1; j >= 0; --j) {
+      const SizeType i = j + 1;
+      const SizeType i_el =
+          dist.template global_element_from_global_tile_and_tile_element<Coord::Row>(i, 0);
+
+      const std::size_t nranks_with_data =
+          to_sizet(std::min<SizeType>(mat_b.nrTiles().rows() - i, dist.grid_size().rows()));
+
+      // === 2nd pass
+
+      // prepare workspace (height = max(nranks)) + reorder heads
+      std::vector<comm::IndexT_MPI> col_rank_order(nranks_with_data, -1);
+      const comm::IndexT_MPI first_rank = dist.template rank_global_tile<Coord::Row>(i);
+      std::iota(col_rank_order.begin(), col_rank_order.end(), first_rank);
+      std::transform(col_rank_order.begin(), col_rank_order.end(), col_rank_order.begin(),
+                     [size = dist.grid_size().rows()](const comm::IndexT_MPI& value) {
+                       return std::modulus<comm::IndexT_MPI>{}(value, size);
+                     });
+
+      // HH2
+      // TODO it does not deal with incomplete tiles yet
+      MatrixLocal<T> hh_2nd({to_SizeType(nranks_with_data) * mat_b.blockSize().rows(),
+                             mat_b.blockSize().cols()},
+                            mat_b.blockSize());
+
+      for (SizeType i = 0; i < to_SizeType(col_rank_order.size()); ++i) {
+        const SizeType ii = to_SizeType(col_rank_order[to_sizet(i)]);
+        matrix::internal::copy(mat_hh_2nd.tile({ii, j}), hh_2nd.tile({i, 0}));
+      }
 
-    // ... and from right
-    // const GlobalElementIndex right_offset = common::transposed(left_offset);
-    // const GlobalElementSize right_size = common::transposed(left_size);
+      const SizeType nrefls = [&]() {
+        // TODO workaround (j == ntiles - 1 ? 1 : 0)
+        return std::min(dist.template tile_size_of<Coord::Col>(j),
+                        hh_2nd.size().rows() - (j == ntiles - 1 ? 1 : 0));
+      }();
+
+      // T2
+      const SizeType j_el =
+          dist.template global_element_from_global_tile_and_tile_element<Coord::Col>(j, 0);
+      MatrixLocal<T> T_2nd({nrefls, nrefls}, mat_b.blockSize());
+      lapack::larft(lapack::Direction::Forward, lapack::StoreV::Columnwise, hh_2nd.size().rows(), nrefls,
+                    hh_2nd.ptr(), hh_2nd.ld(), taus_2nd.data() + j_el, T_2nd.ptr(), T_2nd.ld());
+
+      // Apply HH2 from L and R
+      lapack::larfb(lapack::Side::Left, lapack::Op::NoTrans, lapack::Direction::Forward,
+                    lapack::StoreV::Columnwise, hh_2nd.size().rows(), mat_b.size().cols() - j_el, nrefls,
+                    hh_2nd.ptr(), hh_2nd.ld(), T_2nd.ptr(), T_2nd.ld(), mat_b.ptr({i_el, j_el}),
+                    mat_b.ld());
+      lapack::larfb(lapack::Side::Right, lapack::Op::ConjTrans, lapack::Direction::Forward,
+                    lapack::StoreV::Columnwise, mat_b.size().rows() - j_el, hh_2nd.size().rows(), nrefls,
+                    hh_2nd.ptr(), hh_2nd.ld(), T_2nd.ptr(), T_2nd.ld(), mat_b.ptr({j_el, i_el}),
+                    mat_b.ld());
+
+      // === 1st pass
+
+      // HH1 (for all ranks)
+      // prepare workspace (height = local matrix for each rank) with zeros to fill voids
+      // Note: HH_1st workspaces is stored as a matrix where each column of tiles is for a specific rank.
+      MatrixLocal<T> hh_1st({dist.size().rows() - i_el,
+                             to_SizeType(nranks_with_data) * dist.tile_size().cols()},
+                            dist.tile_size());
+
+      std::size_t col_rank_current = 0;
+      for (SizeType i_a = i; i_a < dist.nr_tiles().rows(); ++i_a, ++col_rank_current) {
+        col_rank_current %= col_rank_order.size();
+
+        const SizeType i_hh = i_a - i;
+
+        for (SizeType j_hh = 0; j_hh < hh_1st.nrTiles().cols(); ++j_hh) {
+          const auto& tile_hh = hh_1st.tile({i_hh, j_hh});
+
+          if (j_hh == to_SizeType(col_rank_current)) {
+            dlaf::matrix::internal::copy(mat_hh_1st.tile({i_a, j}), tile_hh);
+          }
+          else {
+            dlaf::tile::internal::set0(tile_hh);
+          }
+        }
+      }
 
-    // lapack::unmqr(lapack::Side::Right, lapack::Op::ConjTrans, right_size.rows(), right_size.cols(), k,
-    //               mat_hh_1st.ptr(offset), mat_hh_1st.ld(), taus_1st.data(), mat_b.ptr(right_offset),
-    //               mat_b.ld());
-  }
+      // Note: well-formed heads
+      for (SizeType j = 0; j < hh_1st.nrTiles().cols(); ++j) {
+        const auto& tile_hh = hh_1st.tile({j, j});
+        dlaf::tile::internal::laset(blas::Uplo::Upper, T(0), T(1), tile_hh);
+      }
 
-  dlaf::internal::silenceUnusedWarningFor(mat_hh_1st, taus_1st);
+      // Note: apply one HH1 per time, independently, order not relevant
+      for (SizeType col_rank = 0; col_rank < to_SizeType(col_rank_order.size()); ++col_rank) {
+        const SizeType rank = to_SizeType(col_rank_order[to_sizet(col_rank)]);
+
+        // Compute T1
+        const auto& tile_taus = taus_1st.tile({j, rank});
+        const SizeType nrefls = tile_taus.size().rows();
+        const auto& hh_1st_head = hh_1st.tile({col_rank, col_rank});
+
+        MatrixLocal<T> T_1st({nrefls, nrefls}, mat_b.blockSize());
+        lapack::larft(lapack::Direction::Forward, lapack::StoreV::Columnwise,
+                      hh_1st.size().rows() - col_rank * dist.tile_size().rows(), nrefls,
+                      hh_1st_head.ptr(), hh_1st.ld(), tile_taus.ptr(), T_1st.ptr(), T_1st.ld());
+
+        // Apply HH1 (of a rank) from L and R
+        {
+          const SizeType m = dist.size().rows() - (i + col_rank) * dist.tile_size().rows();
+          const SizeType n = dist.size().cols() - j * dist.tile_size().cols();
+          lapack::larfb(lapack::Side::Left, lapack::Op::NoTrans, lapack::Direction::Forward,
+                        lapack::StoreV::Columnwise, m, n, nrefls, hh_1st_head.ptr(), hh_1st.ld(),
+                        T_1st.ptr(), T_1st.ld(), mat_b.tile({i + col_rank, j}).ptr(), mat_b.ld());
+        }
+        {
+          const SizeType m = dist.size().rows() - j * dist.tile_size().rows();
+          const SizeType n = dist.size().cols() - (i + col_rank) * dist.tile_size().cols();
+          lapack::larfb(lapack::Side::Right, lapack::Op::ConjTrans, lapack::Direction::Forward,
+                        lapack::StoreV::Columnwise, m, n, nrefls, hh_1st_head.ptr(), hh_1st.ld(),
+                        T_1st.ptr(), T_1st.ld(), mat_b.tile({j, i + col_rank}).ptr(), mat_b.ld());
+        }
+      }
+    }
+  }
 
   // Eventually, check the result obtained by applying the inverse transformation equals the original matrix
   auto result = [&dist = reference.distribution(),
@@ -580,12 +674,9 @@ auto checkResult(const SizeType k, const SizeType band_size, Matrix<const T, Dev
     const auto tile_element = dist.tileElementIndex(element);
     return mat_local.tile_read(tile_index)(tile_element);
   };
-  // TODO FIXME
-  dlaf::internal::silenceUnusedWarningFor(result);
-  // CHECK_MATRIX_NEAR(result, reference, 0,
-  //                   std::max<SizeType>(1, mat_b.size().linear_size()) * TypeUtilities<T>::error);
 
-  dlaf::internal::silenceUnusedWarningFor(mat_hh_2nd, taus_2nd);
+  CHECK_MATRIX_NEAR(result, reference, 0,
+                    std::max<SizeType>(1, mat_b.size().linear_size()) * TypeUtilities<T>::error);
 }
 
 template <class T, Device D, Backend B>
@@ -595,11 +686,11 @@ void testCAReductionToBand(comm::CommunicatorGrid& grid, const LocalElementSize
   const SizeType k_reflectors = std::max(SizeType(0), size.rows() - band_size - 1);
   DLAF_ASSERT(block_size.rows() % band_size == 0, block_size.rows(), band_size);
 
-  Distribution distribution({size.rows(), size.cols()}, block_size, grid.size(), grid.rank(), {0, 0});
+  const Distribution dist({size.rows(), size.cols()}, block_size, grid.size(), grid.rank(), {0, 0});
 
   // setup the reference input matrix
   Matrix<const T, Device::CPU> reference = [&]() {
-    Matrix<T, Device::CPU> reference(distribution);
+    Matrix<T, Device::CPU> reference(dist);
     if (input_matrix_structure == InputMatrixStructure::banded)
       // Matrix already in band form, with band smaller than band_size
       matrix::util::set_random_hermitian_banded(reference, band_size - 1);
@@ -608,21 +699,14 @@ void testCAReductionToBand(comm::CommunicatorGrid& grid, const LocalElementSize
     return reference;
   }();
 
-  Matrix<T, Device::CPU> matrix_a_h(distribution);
+  Matrix<T, Device::CPU> matrix_a_h(dist);
   copy(reference, matrix_a_h);
 
-  print(format::numpy{}, "A", matrix_a_h);
-
   eigensolver::internal::CARed2BandResult<T, D> red2band_result = [&]() {
     MatrixMirror<T, D, Device::CPU> matrix_a(matrix_a_h);
     return eigensolver::internal::ca_reduction_to_band<B>(grid, matrix_a.get(), band_size);
   }();
 
-  // print(format::numpy{}, "R", matrix_a_h);
-  // print(format::numpy{}, "taus_1st", red2band_result.taus_1st);
-  // print(format::numpy{}, "mat_hh_2nd", red2band_result.hh_2nd);
-  // print(format::numpy{}, "taus_2nd", red2band_result.taus_2nd);
-
   ASSERT_EQ(red2band_result.taus_1st.block_size().rows(), block_size.rows());
   ASSERT_EQ(red2band_result.taus_2nd.block_size().rows(), block_size.rows());
 
@@ -642,17 +726,10 @@ void testCAReductionToBand(comm::CommunicatorGrid& grid, const LocalElementSize
   auto taus_2nd = allGatherTaus(k_reflectors, red2band_result.taus_2nd, grid);
   ASSERT_EQ(taus_2nd.size(), k_reflectors);
 
-  print(format::numpy{}, "mat_hh_2nd", mat_hh_2nd);
-  print(format::numpy{}, "mat_taus_1st", taus_1st);
-  std::cout << "taus_2nd = ";
-  for (auto tau : taus_2nd)
-    std::cout << tau << ", ";
-  std::cout << std::endl;
-
   auto mat_band = makeLocal(matrix_a_h);
   splitReflectorsAndBand(mat_hh_1st, mat_band, band_size);
 
-  checkResult(k_reflectors, band_size, reference, mat_band, mat_hh_1st, taus_1st, mat_hh_2nd, taus_2nd);
+  checkResult(dist, band_size, reference, mat_band, mat_hh_1st, taus_1st, mat_hh_2nd, taus_2nd);
 }
 
 TYPED_TEST(CAReductionToBandTestMC, CorrectnessDistributed) {

From 185293eac539c36427ec3e628d8cfe33e7acfc16 Mon Sep 17 00:00:00 2001
From: Alberto Invernizzi <alberto.invernizzi@cscs.ch>
Date: Wed, 11 Sep 2024 16:20:17 +0200
Subject: [PATCH 25/30] WIP: just a quick switch for testing performances of
 both algo variants

---
 miniapp/miniapp_reduction_to_band.cpp | 29 ++++++++++++++++-----------
 1 file changed, 17 insertions(+), 12 deletions(-)

diff --git a/miniapp/miniapp_reduction_to_band.cpp b/miniapp/miniapp_reduction_to_band.cpp
index 299af230b0..147aa073bd 100644
--- a/miniapp/miniapp_reduction_to_band.cpp
+++ b/miniapp/miniapp_reduction_to_band.cpp
@@ -47,6 +47,7 @@ struct Options
   SizeType m;
   SizeType mb;
   SizeType b;
+  bool use_ca_algo;
 #ifdef DLAF_WITH_HDF5
   std::filesystem::path input_file;
   std::string input_dataset;
@@ -55,7 +56,7 @@ struct Options
 
   Options(const pika::program_options::variables_map& vm)
       : MiniappOptions(vm), m(vm["matrix-size"].as<SizeType>()), mb(vm["block-size"].as<SizeType>()),
-        b(vm["band-size"].as<SizeType>()) {
+        b(vm["band-size"].as<SizeType>()), use_ca_algo(vm["ca"].as<bool>()) {
     DLAF_ASSERT(m > 0, m);
     DLAF_ASSERT(mb > 0, mb);
 
@@ -148,17 +149,20 @@ struct reductionToBandMiniapp {
         DLAF_MPI_CHECK_ERROR(MPI_Barrier(world));
 
         dlaf::common::Timer<> timeit;
-        auto bench = [&]() {
-          if (opts.local)
-            return dlaf::eigensolver::internal::reduction_to_band<backend>(matrix, opts.b);
-          else
-            return dlaf::eigensolver::internal::reduction_to_band<backend>(comm_grid, matrix, opts.b);
-        };
-        auto taus = bench();
+        if (opts.use_ca_algo) {
+          auto res =
+              dlaf::eigensolver::internal::ca_reduction_to_band<backend>(comm_grid, matrix, opts.b);
+          res.taus_1st.waitLocalTiles();
+          res.taus_2nd.waitLocalTiles();
+          res.hh_2nd.waitLocalTiles();
+        }
+        else {
+          auto taus = dlaf::eigensolver::internal::reduction_to_band<backend>(comm_grid, matrix, opts.b);
+          taus.waitLocalTiles();
+        }
 
         // wait and barrier for all ranks
         matrix.waitLocalTiles();
-        taus.waitLocalTiles();
         DLAF_MPI_CHECK_ERROR(MPI_Barrier(world));
 
         elapsed_time = timeit.elapsed();
@@ -244,9 +248,10 @@ int main(int argc, char** argv) {
 
   // clang-format off
   desc_commandline.add_options()
-    ("matrix-size", value<SizeType>()   ->default_value(4096), "Matrix rows")
-    ("block-size",  value<SizeType>()   ->default_value( 256), "Block cyclic distribution size")
-    ("band-size",   value<SizeType>()   ->default_value(  -1), "Band size (a negative value implies band-size=block-size")
+    ("matrix-size", value<SizeType>()   ->default_value(4096),  "Matrix rows")
+    ("block-size",  value<SizeType>()   ->default_value( 256),  "Block cyclic distribution size")
+    ("band-size",   value<SizeType>()   ->default_value(  -1),  "Band size (a negative value implies band-size=block-size")
+    ("ca",          bool_switch()       ->default_value(false), "Use communication avoiding")
 #ifdef DLAF_WITH_HDF5
     ("input-file",    value<std::filesystem::path>()                 , "Load matrix from given HDF5 file")
     ("output-file",   value<std::filesystem::path>()                 , "Save band matrix to given HDF5 file")

From 9326fa8330a032955479c029243593142d906cb9 Mon Sep 17 00:00:00 2001
From: Alberto Invernizzi <alberto.invernizzi@cscs.ch>
Date: Fri, 20 Sep 2024 10:39:04 +0200
Subject: [PATCH 26/30] WIP: refactor algo for incomplete tile code needs some
 cleanup but it passes tests for incomplete tiles too

---
 .../eigensolver/reduction_to_band/ca-impl.h   | 220 ++++++++++++++----
 .../eigensolver/test_reduction_to_band.cpp    | 113 +++++----
 2 files changed, 249 insertions(+), 84 deletions(-)

diff --git a/include/dlaf/eigensolver/reduction_to_band/ca-impl.h b/include/dlaf/eigensolver/reduction_to_band/ca-impl.h
index 035e60a340..8ab7c3331d 100644
--- a/include/dlaf/eigensolver/reduction_to_band/ca-impl.h
+++ b/include/dlaf/eigensolver/reduction_to_band/ca-impl.h
@@ -12,8 +12,10 @@
 
 #include <pika/execution.hpp>
 
+#include <dlaf/common/index2d.h>
 #include <dlaf/communication/broadcast_panel.h>
 #include <dlaf/communication/communicator_grid.h>
+#include <dlaf/communication/index.h>
 #include <dlaf/communication/kernels/all_reduce.h>
 #include <dlaf/communication/kernels/broadcast.h>
 #include <dlaf/eigensolver/reduction_to_band/api.h>
@@ -21,6 +23,9 @@
 #include <dlaf/factorization/qr.h>
 #include <dlaf/lapack/tile.h>
 #include <dlaf/matrix/copy_tile.h>
+#include <dlaf/matrix/distribution.h>
+#include <dlaf/matrix/distribution_extensions.h>
+#include <dlaf/matrix/index.h>
 #include <dlaf/matrix/matrix.h>
 #include <dlaf/matrix/panel.h>
 #include <dlaf/schedulers.h>
@@ -491,12 +496,39 @@ CARed2BandResult<T, D> CAReductionToBand<B, D, T>::call(comm::CommunicatorGrid&
   const SizeType ntiles = (nrefls - 1) / band_size + 1;
 
   const bool is_full_band = (band_size == dist.blockSize().cols());
+  DLAF_ASSERT(is_full_band, is_full_band);
 
   for (SizeType j = 0; j < ntiles; ++j) {
     const SizeType i = j + 1;
     const SizeType j_lc = dist.template local_tile_from_global_tile<Coord::Col>(j);
 
-    const SizeType nrefls_step = mat_taus_1st.tile_size_of(GlobalTileIndex(j, rank.row())).rows();
+    const SizeType nrefls_1st = [&]() -> SizeType {
+      const SizeType i_head_lc = dist.template next_local_tile_from_global_tile<Coord::Row>(i);
+
+      if (i_head_lc >= dist.local_nr_tiles().rows())
+        return 0;
+
+      const SizeType i_head = dist.template global_tile_from_local_tile<Coord::Row>(i_head_lc);
+      const GlobalTileIndex ij_head(i_head, j);
+      const TileElementSize head_size = mat_a.tile_size_of(ij_head);
+
+      if (i_head_lc == dist.local_nr_tiles().rows() - 1)
+        return head_size.rows() - 1;
+      else
+        return head_size.rows();
+    }();
+
+    auto get_tile_tau = [&]() {
+      if (nrefls_1st == band_size)
+        return mat_taus_1st.readwrite(LocalTileIndex(j_lc, 0));
+      return splitTile(mat_taus_1st.readwrite(LocalTileIndex(j_lc, 0)), {{0, 0}, {nrefls_1st, 1}});
+    };
+
+    auto get_tile_tau_ro = [&]() {
+      if (nrefls_1st == band_size)
+        return mat_taus_1st.read(LocalTileIndex(j_lc, 0));
+      return splitTile(mat_taus_1st.read(LocalTileIndex(j_lc, 0)), {{0, 0}, {nrefls_1st, 1}});
+    };
 
     // panel
     const GlobalTileIndex panel_offset(i, j);
@@ -522,68 +554,141 @@ CARed2BandResult<T, D> CAReductionToBand<B, D, T>::call(comm::CommunicatorGrid&
     // QR local (HH reflectors stored in-place)
     if (rank_panel == rank.col()) {
       using red2band::local::computePanelReflectors;
-      computePanelReflectors(mat_a, mat_taus_1st.readwrite(LocalTileIndex(j_lc, 0)), panel_view);
+      computePanelReflectors(mat_a, get_tile_tau(), panel_view);
     }
 
     // TRAILING 1st pass
     if (at_offset_el.isIn(mat_a.size())) {
-      auto& ws_V = panels_v.nextResource();
+      // TODO FIXME workaround for possibly non-participating ranks (they cannot have width = 0)
+      const SizeType nrefls_1st_min = std::max<SizeType>(nrefls_1st, 1);
+
+      const LocalTileIndex zero_lc(0, 0);
+      matrix::Matrix<T, D> ws_T({nrefls_1st_min, nrefls_1st_min}, dist.block_size());
 
+      auto& ws_V = panels_v.nextResource();
       ws_V.setRangeStart(at_offset);
-      ws_V.setWidth(nrefls_step);
+      ws_V.setWidth(nrefls_1st_min);
 
-      const LocalTileIndex zero_lc(0, 0);
-      matrix::Matrix<T, D> ws_T({nrefls_step, nrefls_step}, dist.block_size());
+      auto& ws_W0 = panels_w0.nextResource();
+      ws_W0.setRangeStart(at_offset);
+      ws_W0.setWidth(nrefls_1st_min);
 
-      if (rank_panel == rank.col()) {
+      if (rank_panel == rank.col() && nrefls_1st != 0) {
         using factorization::internal::computeTFactor;
         using red2band::local::setupReflectorPanelV;
 
         const bool has_head = !panel_view.iteratorLocal().empty();
-        setupReflectorPanelV<B, D, T>(has_head, panel_view, nrefls_step, ws_V, mat_a, !is_full_band);
+        setupReflectorPanelV<B, D, T>(has_head, panel_view, nrefls_1st_min, ws_V, mat_a, !is_full_band);
 
-        computeTFactor<B>(ws_V, mat_taus_1st.read(LocalTileIndex{j_lc, 0}), ws_T.readwrite(zero_lc));
+        computeTFactor<B>(ws_V, get_tile_tau_ro(), ws_T.readwrite(zero_lc));
+
+        // W = V T
+        red2band::local::trmmComputeW<B, D>(ws_W0, ws_V, ws_T.read(zero_lc));
       }
 
-      auto& ws_VT = panels_vt.nextResource();
-      ws_VT.setRangeStart(at_offset);
-      ws_VT.setHeight(nrefls_step);
+      // Note: apply local transformations, one after the other
+      // matrix::Matrix<T, D> ws_W2 = std::move(ws_T);
 
-      comm::broadcast(rank_panel, ws_V, ws_VT, mpi_row_chain, mpi_col_chain);
+      for (int idx_qr_head = 0; idx_qr_head < n_qr_heads; ++idx_qr_head) {
+        const SizeType head_qr = at_view.offset().row() + idx_qr_head;
+        const comm::Index2D rank_qr(dist.template rank_global_tile<Coord::Row>(head_qr), rank_panel);
 
-      // W = V T
-      auto& ws_W0 = panels_w0.nextResource();
-      ws_W0.setRangeStart(at_offset);
-      ws_W0.setWidth(nrefls_step);
+        const bool is_row_involved = rank_qr.row() == rank.row();
+        const bool is_col_involved = [head_qr, dist, rank]() {
+          for (SizeType k = head_qr; k < dist.nr_tiles().rows(); k += dist.grid_size().rows()) {
+            const comm::IndexT_MPI rank_owner_col = dist.template rank_global_tile<Coord::Col>(k);
+            if (rank_owner_col == rank.col())
+              return true;
+          }
+          return false;
+        }();
+
+        const SizeType nrtiles_transf =
+            util::ceilDiv<SizeType>(dist.nr_tiles().rows() - head_qr, dist.grid_size().rows());
+
+        // TODO FIXME cannot skip because otherwise no W1 reduction if any rank skip?!
+        // // this rank is not involved at all
+        // if (!is_row_involved && !is_col_involved)
+        //   continue;
+
+        // number of reflectors of this "local" transformation
+        const SizeType nrefls_this = [&]() {
+          using matrix::internal::distribution::global_tile_from_local_tile_on_rank;
+          const SizeType i_head = head_qr;
+          if (nrtiles_transf == 1) {
+            const TileElementSize tile_size = dist.tile_size_of({i_head, j});
+            return std::min<SizeType>(tile_size.rows() - 1, tile_size.cols());
+          }
+          return dist.block_size().cols();
+        }();
 
-      if (rank_panel == rank.col())
-        red2band::local::trmmComputeW<B, D>(ws_W0, ws_V, ws_T.read(zero_lc));
+        if (nrefls_this == 0)
+          continue;
 
-      auto& ws_W0T = panels_w0t.nextResource();
-      ws_W0T.setRangeStart(at_offset);
-      ws_W0T.setHeight(nrefls_step);
+        // "local" broadcast along rows involved in this local transformation
+        if (is_row_involved) {
+          comm::broadcast(rank_panel, ws_V, mpi_row_chain);
+          comm::broadcast(rank_panel, ws_W0, mpi_row_chain);
+        }
 
-      comm::broadcast(rank_panel, ws_W0, ws_W0T, mpi_row_chain, mpi_col_chain);
+        auto& ws_VT = panels_vt.nextResource();
+        // TODO FIXME workaround for panel problem on reset about range
+        ws_VT.setRange(at_offset, common::indexFromOrigin(dist.nr_tiles()));
+        ws_VT.setHeight(nrefls_this);
 
-      // Note: apply local transformations, one after the other
-      auto& ws_W1 = panels_w1.nextResource();
-      auto& ws_W1T = panels_w1t.nextResource();
-      matrix::Matrix<T, D> ws_W2 = std::move(ws_T);
+        auto& ws_W0T = panels_w0t.nextResource();
+        // TODO FIXME workaround for panel problem on reset about range
+        ws_W0T.setRange(at_offset, common::indexFromOrigin(dist.nr_tiles()));
+        ws_W0T.setHeight(nrefls_this);
 
-      for (int idx_qr_head = 0; idx_qr_head < n_qr_heads; ++idx_qr_head) {
-        const SizeType head_qr = at_view.offset().row() + idx_qr_head;
-        const comm::Index2D rank_qr(dist.template rank_global_tile<Coord::Row>(head_qr), rank_panel);
+        // broadcast along cols involved in this local transformation
+        if (is_col_involved) {  // diagonal
+          // set diagonal tiles
+          for (const auto ij_lc : ws_VT.iteratorLocal()) {
+            const SizeType k = dist.template global_tile_from_local_tile<Coord::Col>(ij_lc.col());
+            const comm::IndexT_MPI rank_src = dist.template rank_global_tile<Coord::Row>(k);
 
-        ws_W1.setRangeStart(at_offset);
-        ws_W1.setWidth(nrefls_step);
+            if (rank_qr.row() != rank_src)
+              continue;
 
-        ws_W1T.setRangeStart(at_offset);
-        ws_W1T.setHeight(nrefls_step);
+            using comm::schedule_bcast_recv;
+            using comm::schedule_bcast_send;
+
+            if (rank_src == rank.row()) {
+              const SizeType i_lc = dist.template local_tile_from_global_tile<Coord::Row>(k);
+
+              ws_VT.setTile(ij_lc, ws_V.read({i_lc, 0}));
+              ws_W0T.setTile(ij_lc, ws_W0.read({i_lc, 0}));
+
+              // if (nrtiles_transf > 1) {
+              ex::start_detached(schedule_bcast_send(mpi_col_chain.exclusive(), ws_V.read({i_lc, 0})));
+              ex::start_detached(schedule_bcast_send(mpi_col_chain.exclusive(), ws_W0.read({i_lc, 0})));
+              // }
+            }
+            else {
+              // if (nrtiles_transf > 1) {
+              ex::start_detached(schedule_bcast_recv(mpi_col_chain.exclusive(), rank_src,
+                                                     ws_VT.readwrite(ij_lc)));
+              ex::start_detached(schedule_bcast_recv(mpi_col_chain.exclusive(), rank_src,
+                                                     ws_W0T.readwrite(ij_lc)));
+              // }
+            }
+          }
+        }
 
         // W1 = A W0
         // Note:
-        // it will fill up all W1 (distributed) and it will read just part of A, i.e. columns where this
-        // local transformation should be applied from the right.
+        // it will fill up all W1 (distributed) and it will read just part of A, i.e. columns where
+        // this local transformation should be applied from the right.
+        auto& ws_W1 = panels_w1.nextResource();
+        auto& ws_W1T = panels_w1t.nextResource();
+        ws_W1.setRangeStart(at_offset);
+        ws_W1.setWidth(nrefls_this);
+
+        ws_W1T.setRangeStart(at_offset);
+        ws_W1T.setHeight(nrefls_this);
+
+        // TODO FIXME restrict communication to just interested ones
         using ca_red2band::hemm;
         hemm<B>(rank_qr, ws_W1, ws_W1T, at_view, mat_a, ws_W0, ws_W0T, mpi_row_chain, mpi_col_chain);
 
@@ -591,13 +696,15 @@ CARed2BandResult<T, D> CAReductionToBand<B, D, T>::call(comm::CommunicatorGrid&
         // W1T has been used as support panel, so reset it again.
         ws_W1T.reset();
         ws_W1T.setRangeStart(at_offset);
-        ws_W1T.setHeight(nrefls_step);
+        ws_W1T.setHeight(nrefls_this);
 
+        // TODO FIXME restrict communication to just interested ones
         comm::broadcast(rank_panel, ws_W1, ws_W1T, mpi_row_chain, mpi_col_chain);
 
         // LR
         // A -= V W1* + W1 V* - V W0* W1 V*
         if (rank.row() == rank_qr.row()) {
+          matrix::Matrix<T, D> ws_W2({nrefls_this, nrefls_this}, dist.block_size());
           // W2 = W0.T W1
           red2band::local::gemmComputeW2<B, D>(ws_W2, ws_W0, ws_W1);
 
@@ -605,7 +712,7 @@ CARed2BandResult<T, D> CAReductionToBand<B, D, T>::call(comm::CommunicatorGrid&
           // Next steps for L and R need W1, so we create a copy that we are going to update for this step.
           auto& ws_W3 = panels_w3.nextResource();
           ws_W3.setRangeStart(at_offset);
-          ws_W3.setWidth(nrefls_step);
+          ws_W3.setWidth(nrefls_this);
 
           for (const auto& idx : ws_W1.iteratorLocal())
             ex::start_detached(ex::when_all(ws_W1.read(idx), ws_W3.readwrite(idx)) |
@@ -681,23 +788,46 @@ CARed2BandResult<T, D> CAReductionToBand<B, D, T>::call(comm::CommunicatorGrid&
 
         ws_W1T.reset();
         ws_W1.reset();
+
+        ws_W0T.reset();
+        ws_VT.reset();
       }
 
-      ws_W0T.reset();
       ws_W0.reset();
-      ws_VT.reset();
       ws_V.reset();
     }
 
     // ===== 2nd pass
+    const matrix::Distribution dist_heads_current = [&]() {
+      using matrix::internal::distribution::global_tile_element_distance;
+      const SizeType i_begin = i;
+      const SizeType i_end = std::min<SizeType>(i + dist.grid_size().rows(), dist.nr_tiles().rows());
+      const SizeType nrows = global_tile_element_distance<Coord::Row>(dist, i_begin, i_end);
+      return matrix::Distribution{LocalElementSize(nrows, band_size), dist.block_size()};
+    }();
+
+    const SizeType nrefls_step = [&]() {
+      const SizeType reflector_size = dist_heads_current.size().rows();
+      return std::min(dist_heads_current.size().cols(), reflector_size - 1);
+    }();
+
+    auto get_tile_tau2 = [&]() {
+      if (nrefls_step == band_size)
+        return mat_taus_2nd.readwrite(LocalTileIndex(j_lc, 0));
+      return splitTile(mat_taus_2nd.readwrite(LocalTileIndex(j_lc, 0)), {{0, 0}, {nrefls_step, 1}});
+    };
+
+    auto get_tile_tau2_ro = [&]() {
+      if (nrefls_step == band_size)
+        return mat_taus_2nd.read(LocalTileIndex(j_lc, 0));
+      return splitTile(mat_taus_2nd.read(LocalTileIndex(j_lc, 0)), {{0, 0}, {nrefls_step, 1}});
+    };
 
     // PANEL: just ranks in the current column
     // QR local with just heads (HH reflectors have to be computed elsewhere, 1st-pass in-place)
     auto&& panel_heads = panels_heads.nextResource();
     panel_heads.setRangeEnd({n_qr_heads, 0});
 
-    const matrix::Distribution dist_heads_current(
-        LocalElementSize(n_qr_heads * dist.block_size().rows(), band_size), dist.block_size());
     const matrix::SubPanelView panel_heads_view(dist_heads_current, {0, 0}, band_size);
 
     const bool rank_has_head_row = !panel_view.iteratorLocal().empty();
@@ -737,8 +867,7 @@ CARed2BandResult<T, D> CAReductionToBand<B, D, T>::call(comm::CommunicatorGrid&
 
       // QR local on heads
       using red2band::local::computePanelReflectors;
-      computePanelReflectors(panel_heads, mat_taus_2nd.readwrite(LocalTileIndex(j_lc, 0)),
-                             panel_heads_view);
+      computePanelReflectors(panel_heads, get_tile_tau2(), panel_heads_view);
 
       // copy back data
       {
@@ -822,10 +951,11 @@ CARed2BandResult<T, D> CAReductionToBand<B, D, T>::call(comm::CommunicatorGrid&
 
         using factorization::internal::computeTFactor;
         const GlobalTileIndex j_tau(j, 0);
-        computeTFactor<B>(panel_heads, mat_taus_2nd.read(j_tau), ws_T.readwrite(zero_lc));
+        computeTFactor<B>(panel_heads, get_tile_tau2_ro(), ws_T.readwrite(zero_lc));
       }
 
       auto& ws_VT = panels_vt.nextResource();
+
       ws_VT.setRange(at_offset, at_end_R);
       ws_VT.setHeight(nrefls_step);
 
diff --git a/test/unit/eigensolver/test_reduction_to_band.cpp b/test/unit/eigensolver/test_reduction_to_band.cpp
index df327dc789..34382467d6 100644
--- a/test/unit/eigensolver/test_reduction_to_band.cpp
+++ b/test/unit/eigensolver/test_reduction_to_band.cpp
@@ -16,6 +16,7 @@
 #include <pika/execution.hpp>
 #include <pika/init.hpp>
 
+#include <dlaf/common/assert.h>
 #include <dlaf/common/index2d.h>
 #include <dlaf/common/pipeline.h>
 #include <dlaf/common/single_threaded_blas.h>
@@ -87,10 +88,14 @@ std::vector<config_t> configs{
     {{0, 0}, {3, 3}},
     // full-tile band
     {{3, 3}, {3, 3}},    // single tile (nothing to do)
+    {{6, 6}, {3, 3}},    // tile always full size (less room for distribution over ranks)
+    {{9, 9}, {3, 3}},    // tile always full size (less room for distribution over ranks)
     {{12, 12}, {3, 3}},  // tile always full size (less room for distribution over ranks)
-    {{13, 13}, {3, 3}},  // tile incomplete
     {{24, 24}, {3, 3}},  // tile always full size (more room for distribution)
     {{40, 40}, {5, 5}},
+    // tile incomplete
+    {{8, 8}, {3, 3}},
+    {{13, 13}, {3, 3}},
 };
 
 std::vector<config_t> configs_subband{
@@ -557,7 +562,6 @@ auto checkResult(const Distribution dist, const SizeType band_size,
           to_sizet(std::min<SizeType>(mat_b.nrTiles().rows() - i, dist.grid_size().rows()));
 
       // === 2nd pass
-
       // prepare workspace (height = max(nranks)) + reorder heads
       std::vector<comm::IndexT_MPI> col_rank_order(nranks_with_data, -1);
       const comm::IndexT_MPI first_rank = dist.template rank_global_tile<Coord::Row>(i);
@@ -568,47 +572,64 @@ auto checkResult(const Distribution dist, const SizeType band_size,
                      });
 
       // HH2
-      // TODO it does not deal with incomplete tiles yet
-      MatrixLocal<T> hh_2nd({to_SizeType(nranks_with_data) * mat_b.blockSize().rows(),
-                             mat_b.blockSize().cols()},
-                            mat_b.blockSize());
-
-      for (SizeType i = 0; i < to_SizeType(col_rank_order.size()); ++i) {
-        const SizeType ii = to_SizeType(col_rank_order[to_sizet(i)]);
-        matrix::internal::copy(mat_hh_2nd.tile({ii, j}), hh_2nd.tile({i, 0}));
-      }
+      const matrix::Distribution dist_hh_2nd = [&]() {
+        using matrix::internal::distribution::global_tile_element_distance;
+        const SizeType i_begin = i;
+        const SizeType i_end = std::min<SizeType>(i + dist.grid_size().rows(), dist.nr_tiles().rows());
+        const SizeType nrows = global_tile_element_distance<Coord::Row>(dist, i_begin, i_end);
+        return matrix::Distribution({nrows, mat_b.blockSize().cols()}, mat_b.blockSize());
+      }();
+      MatrixLocal<T> hh_2nd(dist_hh_2nd.size(), dist_hh_2nd.block_size());
 
       const SizeType nrefls = [&]() {
-        // TODO workaround (j == ntiles - 1 ? 1 : 0)
-        return std::min(dist.template tile_size_of<Coord::Col>(j),
-                        hh_2nd.size().rows() - (j == ntiles - 1 ? 1 : 0));
+        const SizeType reflector_size = hh_2nd.size().rows();
+        return std::min(hh_2nd.size().cols(), reflector_size - 1);
       }();
 
-      // T2
-      const SizeType j_el =
-          dist.template global_element_from_global_tile_and_tile_element<Coord::Col>(j, 0);
-      MatrixLocal<T> T_2nd({nrefls, nrefls}, mat_b.blockSize());
-      lapack::larft(lapack::Direction::Forward, lapack::StoreV::Columnwise, hh_2nd.size().rows(), nrefls,
-                    hh_2nd.ptr(), hh_2nd.ld(), taus_2nd.data() + j_el, T_2nd.ptr(), T_2nd.ld());
-
-      // Apply HH2 from L and R
-      lapack::larfb(lapack::Side::Left, lapack::Op::NoTrans, lapack::Direction::Forward,
-                    lapack::StoreV::Columnwise, hh_2nd.size().rows(), mat_b.size().cols() - j_el, nrefls,
-                    hh_2nd.ptr(), hh_2nd.ld(), T_2nd.ptr(), T_2nd.ld(), mat_b.ptr({i_el, j_el}),
-                    mat_b.ld());
-      lapack::larfb(lapack::Side::Right, lapack::Op::ConjTrans, lapack::Direction::Forward,
-                    lapack::StoreV::Columnwise, mat_b.size().rows() - j_el, hh_2nd.size().rows(), nrefls,
-                    hh_2nd.ptr(), hh_2nd.ld(), T_2nd.ptr(), T_2nd.ld(), mat_b.ptr({j_el, i_el}),
-                    mat_b.ld());
+      if (nrefls > 0) {
+        for (SizeType i = 0; i < to_SizeType(col_rank_order.size()); ++i) {
+          const SizeType ii = to_SizeType(col_rank_order[to_sizet(i)]);
 
-      // === 1st pass
+          const bool is_last = i == (to_SizeType(col_rank_order.size()) - 1);
+          const SizeType last_rows = hh_2nd.size().rows() % dist.block_size().rows();
+          if (!is_last || last_rows == 0) {
+            matrix::internal::copy(mat_hh_2nd.tile({ii, j}), hh_2nd.tile({i, 0}));
+          }
+          else {
+            const auto& tile = mat_hh_2nd.tile({ii, j}).subTileReference(
+                {{0, 0}, {last_rows, dist.block_size().cols()}});
+            matrix::internal::copy(tile, hh_2nd.tile({i, 0}));
+          }
+        }
+
+        // T2
+        const SizeType j_el =
+            dist.template global_element_from_global_tile_and_tile_element<Coord::Col>(j, 0);
+
+        MatrixLocal<T> T_2nd({nrefls, nrefls}, mat_b.blockSize());
+        lapack::larft(lapack::Direction::Forward, lapack::StoreV::Columnwise, hh_2nd.size().rows(),
+                      nrefls, hh_2nd.ptr(), hh_2nd.ld(), taus_2nd.data() + j_el, T_2nd.ptr(),
+                      T_2nd.ld());
+
+        // Apply HH2 from L and R
+        lapack::larfb(lapack::Side::Left, lapack::Op::NoTrans, lapack::Direction::Forward,
+                      lapack::StoreV::Columnwise, hh_2nd.size().rows(), mat_b.size().cols() - j_el,
+                      nrefls, hh_2nd.ptr(), hh_2nd.ld(), T_2nd.ptr(), T_2nd.ld(),
+                      mat_b.ptr({i_el, j_el}), mat_b.ld());
+        lapack::larfb(lapack::Side::Right, lapack::Op::ConjTrans, lapack::Direction::Forward,
+                      lapack::StoreV::Columnwise, mat_b.size().rows() - j_el, hh_2nd.size().rows(),
+                      nrefls, hh_2nd.ptr(), hh_2nd.ld(), T_2nd.ptr(), T_2nd.ld(),
+                      mat_b.ptr({j_el, i_el}), mat_b.ld());
+      }
 
+      // === 1st pass
       // HH1 (for all ranks)
       // prepare workspace (height = local matrix for each rank) with zeros to fill voids
       // Note: HH_1st workspaces is stored as a matrix where each column of tiles is for a specific rank.
-      MatrixLocal<T> hh_1st({dist.size().rows() - i_el,
-                             to_SizeType(nranks_with_data) * dist.tile_size().cols()},
-                            dist.tile_size());
+      const matrix::Distribution dist_hh_1st({dist.size().rows() - i_el,
+                                              to_SizeType(nranks_with_data) * dist.tile_size().cols()},
+                                             dist.tile_size());
+      MatrixLocal<T> hh_1st(dist_hh_1st.size(), dist_hh_1st.tile_size());
 
       std::size_t col_rank_current = 0;
       for (SizeType i_a = i; i_a < dist.nr_tiles().rows(); ++i_a, ++col_rank_current) {
@@ -638,15 +659,29 @@ auto checkResult(const Distribution dist, const SizeType band_size,
       for (SizeType col_rank = 0; col_rank < to_SizeType(col_rank_order.size()); ++col_rank) {
         const SizeType rank = to_SizeType(col_rank_order[to_sizet(col_rank)]);
 
+        const SizeType i_begin = col_rank;
+        const SizeType i_end_gap = dist_hh_1st.nr_tiles().rows();
+        const SizeType i_end = i_begin + dlaf::util::ceilDiv(dist_hh_1st.nr_tiles().rows() - i_begin,
+                                                             to_SizeType(col_rank_order.size()));
+        using matrix::internal::distribution::global_tile_element_distance;
+
+        const SizeType refl_size = global_tile_element_distance<Coord::Row>(dist_hh_1st, i_begin, i_end);
+
+        const auto& hh_1st_head = hh_1st.tile({col_rank, col_rank});
+        const SizeType nrefls = std::min(refl_size - 1, hh_1st_head.size().cols());
+
+        if (nrefls <= 0)
+          continue;
+
         // Compute T1
         const auto& tile_taus = taus_1st.tile({j, rank});
-        const SizeType nrefls = tile_taus.size().rows();
-        const auto& hh_1st_head = hh_1st.tile({col_rank, col_rank});
+
+        const SizeType refl_size_gap =
+            global_tile_element_distance<Coord::Row>(dist_hh_1st, i_begin, i_end_gap);
 
         MatrixLocal<T> T_1st({nrefls, nrefls}, mat_b.blockSize());
-        lapack::larft(lapack::Direction::Forward, lapack::StoreV::Columnwise,
-                      hh_1st.size().rows() - col_rank * dist.tile_size().rows(), nrefls,
-                      hh_1st_head.ptr(), hh_1st.ld(), tile_taus.ptr(), T_1st.ptr(), T_1st.ld());
+        lapack::larft(lapack::Direction::Forward, lapack::StoreV::Columnwise, refl_size_gap, nrefls,
+                      hh_1st_head.ptr(), hh_1st_head.ld(), tile_taus.ptr(), T_1st.ptr(), T_1st.ld());
 
         // Apply HH1 (of a rank) from L and R
         {

From 045c5bfb289476947e8d84a403866e7b48e54d52 Mon Sep 17 00:00:00 2001
From: Alberto Invernizzi <alberto.invernizzi@cscs.ch>
Date: Fri, 20 Sep 2024 10:44:48 +0200
Subject: [PATCH 27/30] fix missing headers

---
 include/dlaf/eigensolver/reduction_to_band/ca-impl.h | 3 +++
 include/dlaf/eigensolver/reduction_to_band/common.h  | 3 +++
 test/unit/eigensolver/test_reduction_to_band.cpp     | 2 ++
 3 files changed, 8 insertions(+)

diff --git a/include/dlaf/eigensolver/reduction_to_band/ca-impl.h b/include/dlaf/eigensolver/reduction_to_band/ca-impl.h
index 8ab7c3331d..3b32b254ad 100644
--- a/include/dlaf/eigensolver/reduction_to_band/ca-impl.h
+++ b/include/dlaf/eigensolver/reduction_to_band/ca-impl.h
@@ -10,6 +10,9 @@
 
 #pragma once
 
+#include <cstddef>
+#include <utility>
+
 #include <pika/execution.hpp>
 
 #include <dlaf/common/index2d.h>
diff --git a/include/dlaf/eigensolver/reduction_to_band/common.h b/include/dlaf/eigensolver/reduction_to_band/common.h
index 7506faf745..9c0b8ab240 100644
--- a/include/dlaf/eigensolver/reduction_to_band/common.h
+++ b/include/dlaf/eigensolver/reduction_to_band/common.h
@@ -10,7 +10,10 @@
 
 #pragma once
 
+#include <array>
+#include <cstddef>
 #include <type_traits>
+#include <utility>
 #include <vector>
 
 #include <pika/barrier.hpp>
diff --git a/test/unit/eigensolver/test_reduction_to_band.cpp b/test/unit/eigensolver/test_reduction_to_band.cpp
index 34382467d6..0a9dbae12b 100644
--- a/test/unit/eigensolver/test_reduction_to_band.cpp
+++ b/test/unit/eigensolver/test_reduction_to_band.cpp
@@ -10,7 +10,9 @@
 
 #include <algorithm>
 #include <cmath>
+#include <cstddef>
 #include <functional>
+#include <utility>
 #include <vector>
 
 #include <pika/execution.hpp>

From 9f3aa79cf32b6e22a842fbab8436814ab6b6df99 Mon Sep 17 00:00:00 2001
From: Alberto Invernizzi <alberto.invernizzi@cscs.ch>
Date: Wed, 25 Sep 2024 11:20:14 +0200
Subject: [PATCH 28/30] WIP: workaround for t-factor local

---
 include/dlaf/factorization/qr/t_factor_impl.h | 9 ++++++---
 include/dlaf/matrix/panel.h                   | 4 ++++
 2 files changed, 10 insertions(+), 3 deletions(-)

diff --git a/include/dlaf/factorization/qr/t_factor_impl.h b/include/dlaf/factorization/qr/t_factor_impl.h
index 0bf1a2616c..49a9ac73b4 100644
--- a/include/dlaf/factorization/qr/t_factor_impl.h
+++ b/include/dlaf/factorization/qr/t_factor_impl.h
@@ -237,6 +237,9 @@ void QR_Tfactor<backend, device, T>::call(matrix::Panel<Coord::Col, T, device>&
   if (hh_panel.getWidth() == 0)
     return;
 
+  const SizeType bs = hh_panel.parentDistribution().blockSize().rows();
+  const SizeType offset_lc = (bs - hh_panel.tile_size_of_local_head().rows());
+
   matrix::ReadWriteTileSender<T, device> t_local = Helpers::set0(std::move(t));
 
   // Note:
@@ -257,15 +260,15 @@ void QR_Tfactor<backend, device, T>::call(matrix::Panel<Coord::Col, T, device>&
   // 1st step: compute the column partial result `t`
   // First we compute the matrix vector multiplication for each column
   // -tau(j) . V(j:, 0:j)* . V(j:, j)
-  for (const auto& i_lc : hh_panel.iteratorLocal()) {
+  for (const auto& v_i : hh_panel.iteratorLocal()) {
     const SizeType first_row_tile =
-        (i_lc.row() - hh_panel.rangeStartLocal()) * hh_panel.parentDistribution().tile_size().rows();
+        std::max<SizeType>(0, (v_i.row() - hh_panel.rangeStartLocal()) * bs - offset_lc);
 
     // Note:
     // Since we are writing always on the same t, the gemv are serialized
     // A possible solution to this would be to have multiple places where to store partial
     // results, and then locally reduce them just before the reduce over ranks
-    t_local = Helpers::gemvColumnT(first_row_tile, hh_panel.read(i_lc), taus, std::move(t_local));
+    t_local = Helpers::gemvColumnT(first_row_tile, hh_panel.read(v_i), taus, std::move(t_local));
   }
 
   // 2nd step: compute the T factor, by performing the last step on each column
diff --git a/include/dlaf/matrix/panel.h b/include/dlaf/matrix/panel.h
index 498ed60697..16b79476d0 100644
--- a/include/dlaf/matrix/panel.h
+++ b/include/dlaf/matrix/panel.h
@@ -304,6 +304,10 @@ struct Panel<axis, const T, D, StoreTransposed::No> {
     has_been_used_ = false;
   }
 
+  TileElementSize tile_size_of_local_head() const {
+    return tileSize(LocalTileIndex(coord, rangeStartLocal()));
+  }
+
 protected:
   using ReadWriteSenderType = typename BaseT::ReadWriteSenderType;
 

From 350de054f2be2198eb43189df747c01a5f8540ab Mon Sep 17 00:00:00 2001
From: Alberto Invernizzi <alberto.invernizzi@cscs.ch>
Date: Wed, 25 Sep 2024 11:30:32 +0200
Subject: [PATCH 29/30] WIP: workaround problem of last tile not communicated
 in panel bcast transpose

---
 include/dlaf/communication/broadcast_panel.h  | 41 +++++++++++++++++--
 .../eigensolver/reduction_to_band/ca-impl.h   |  4 +-
 .../dlaf/eigensolver/reduction_to_band/impl.h |  2 +-
 3 files changed, 41 insertions(+), 6 deletions(-)

diff --git a/include/dlaf/communication/broadcast_panel.h b/include/dlaf/communication/broadcast_panel.h
index 2fa15e335b..c3cf7e0725 100644
--- a/include/dlaf/communication/broadcast_panel.h
+++ b/include/dlaf/communication/broadcast_panel.h
@@ -18,11 +18,13 @@
 #include <pika/execution.hpp>
 
 #include <dlaf/common/index2d.h>
+#include <dlaf/common/range2d.h>
 #include <dlaf/communication/communicator_pipeline.h>
 #include <dlaf/communication/index.h>
 #include <dlaf/communication/kernels/broadcast.h>
 #include <dlaf/communication/message.h>
 #include <dlaf/matrix/copy_tile.h>
+#include <dlaf/matrix/index.h>
 #include <dlaf/matrix/panel.h>
 #include <dlaf/matrix/tile.h>
 #include <dlaf/types.h>
@@ -89,7 +91,6 @@ auto& get_taskchain(comm::CommunicatorPipeline<comm::CommunicatorType::Row>& row
     return col_task_chain;
   }
 }
-}  // namespace internal
 
 /// Broadcast
 ///
@@ -122,7 +123,8 @@ template <class T, Device D, Coord axis, matrix::StoreTransposed storage,
 void broadcast(comm::IndexT_MPI rank_root, matrix::Panel<axis, T, D, storage>& panel,
                matrix::Panel<orthogonal(axis), T, D, storageT>& panelT,
                comm::CommunicatorPipeline<comm::CommunicatorType::Row>& row_task_chain,
-               comm::CommunicatorPipeline<comm::CommunicatorType::Col>& col_task_chain) {
+               comm::CommunicatorPipeline<comm::CommunicatorType::Col>& col_task_chain,
+               common::IterableRange2D<SizeType, matrix::LocalTile_TAG> range) {
   constexpr Coord axisT = orthogonal(axis);
 
   constexpr Coord coord = std::decay_t<decltype(panel)>::coord;
@@ -180,7 +182,7 @@ void broadcast(comm::IndexT_MPI rank_root, matrix::Panel<axis, T, D, storage>& p
 
   auto& chain_step2 = internal::get_taskchain<comm_dir_step2>(row_task_chain, col_task_chain);
 
-  for (const auto& indexT : panelT.iteratorLocal()) {
+  for (const auto& indexT : range) {
     auto [index_diag, owner_diag] = internal::transposedOwner<coordT>(dist, indexT);
 
     namespace ex = pika::execution::experimental;
@@ -198,6 +200,39 @@ void broadcast(comm::IndexT_MPI rank_root, matrix::Panel<axis, T, D, storage>& p
     }
   }
 }
+}  // namespace internal
+
+template <class T, Device D, Coord axis, matrix::StoreTransposed storage,
+          matrix::StoreTransposed storageT, class = std::enable_if_t<!std::is_const_v<T>>>
+void broadcast(comm::IndexT_MPI rank_root, matrix::Panel<axis, T, D, storage>& panel,
+               matrix::Panel<orthogonal(axis), T, D, storageT>& panelT,
+               comm::CommunicatorPipeline<comm::CommunicatorType::Row>& row_task_chain,
+               comm::CommunicatorPipeline<comm::CommunicatorType::Col>& col_task_chain) {
+  constexpr Coord coordT = std::decay_t<decltype(panelT)>::coord;
 
+  const auto& dist = panel.parentDistribution();
+
+  const SizeType last_tile = std::max(panelT.rangeStart(), panelT.rangeEnd() - 1);
+
+  if (panel.rangeStart() == panel.rangeEnd())
+    return;
+
+  const auto owner = dist.template rankGlobalTile<coordT>(last_tile);
+  const auto range = dist.rankIndex().get(coordT) == owner
+                         ? common::iterate_range2d(*panelT.iteratorLocal().begin(),
+                                                   LocalTileIndex(coordT, panelT.rangeEndLocal() - 1, 1))
+                         : panelT.iteratorLocal();
+
+  internal::broadcast(rank_root, panel, panelT, row_task_chain, col_task_chain, range);
+}
+
+template <class T, Device D, Coord axis, matrix::StoreTransposed storage,
+          matrix::StoreTransposed storageT, class = std::enable_if_t<!std::is_const_v<T>>>
+void broadcast_all(comm::IndexT_MPI rank_root, matrix::Panel<axis, T, D, storage>& panel,
+                   matrix::Panel<orthogonal(axis), T, D, storageT>& panelT,
+                   comm::CommunicatorPipeline<comm::CommunicatorType::Row>& row_task_chain,
+                   comm::CommunicatorPipeline<comm::CommunicatorType::Col>& col_task_chain) {
+  internal::broadcast(rank_root, panel, panelT, row_task_chain, col_task_chain, panelT.iteratorLocal());
+}
 }
 }
diff --git a/include/dlaf/eigensolver/reduction_to_band/ca-impl.h b/include/dlaf/eigensolver/reduction_to_band/ca-impl.h
index 3b32b254ad..4a2b525234 100644
--- a/include/dlaf/eigensolver/reduction_to_band/ca-impl.h
+++ b/include/dlaf/eigensolver/reduction_to_band/ca-impl.h
@@ -962,7 +962,7 @@ CARed2BandResult<T, D> CAReductionToBand<B, D, T>::call(comm::CommunicatorGrid&
       ws_VT.setRange(at_offset, at_end_R);
       ws_VT.setHeight(nrefls_step);
 
-      comm::broadcast(rank_panel, ws_V, ws_VT, mpi_row_chain, mpi_col_chain);
+      comm::broadcast_all(rank_panel, ws_V, ws_VT, mpi_row_chain, mpi_col_chain);
 
       // Note:
       // Differently from 1st pass, where transformations are independent one from the other,
@@ -981,7 +981,7 @@ CARed2BandResult<T, D> CAReductionToBand<B, D, T>::call(comm::CommunicatorGrid&
       ws_W0T.setRange(at_offset, at_end_R);
       ws_W0T.setHeight(nrefls_step);
 
-      comm::broadcast(rank_panel, ws_W0, ws_W0T, mpi_row_chain, mpi_col_chain);
+      comm::broadcast_all(rank_panel, ws_W0, ws_W0T, mpi_row_chain, mpi_col_chain);
 
       // W1 = A W0
       auto& ws_W1 = panels_w1.nextResource();
diff --git a/include/dlaf/eigensolver/reduction_to_band/impl.h b/include/dlaf/eigensolver/reduction_to_band/impl.h
index 75acd0a06f..165b87c546 100644
--- a/include/dlaf/eigensolver/reduction_to_band/impl.h
+++ b/include/dlaf/eigensolver/reduction_to_band/impl.h
@@ -896,7 +896,7 @@ Matrix<T, Device::CPU> ReductionToBand<B, D, T>::call(comm::CommunicatorGrid& gr
     xt.setRangeStart(at_offset);
     xt.setHeight(nrefls_tile);
 
-    comm::broadcast(rank_v0.col(), x, xt, mpi_row_chain, mpi_col_chain);
+    comm::broadcast_all(rank_v0.col(), x, xt, mpi_row_chain, mpi_col_chain);
 
     // TRAILING MATRIX UPDATE
 

From 8e820a1ed481c986f975ce64daaff9d3077f0939 Mon Sep 17 00:00:00 2001
From: Alberto Invernizzi <alberto.invernizzi@cscs.ch>
Date: Wed, 25 Sep 2024 11:31:19 +0200
Subject: [PATCH 30/30] WIP: workaround for oversized 2nd step panel workspace

---
 include/dlaf/eigensolver/reduction_to_band/ca-impl.h | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

diff --git a/include/dlaf/eigensolver/reduction_to_band/ca-impl.h b/include/dlaf/eigensolver/reduction_to_band/ca-impl.h
index 4a2b525234..5031af338b 100644
--- a/include/dlaf/eigensolver/reduction_to_band/ca-impl.h
+++ b/include/dlaf/eigensolver/reduction_to_band/ca-impl.h
@@ -850,10 +850,15 @@ CARed2BandResult<T, D> CAReductionToBand<B, D, T>::call(comm::CommunicatorGrid&
           // copy - set - send
           ex::start_detached(ex::when_all(mat_a.read(ij_head), panel_heads.readwrite(idx_panel_head)) |
                              di::transform(di::Policy<B>(), [=](const auto& head_in, auto&& head) {
-                               // TODO FIXME change copy and if possible just lower
+                               // TODO FIXME workaround for over-sized panel
+                               if (head_in.size() != head.size())
+                                 tile::internal::set0(head);
+
+                               // TODO FIXME change copy and if possible just upper
                                // matrix::internal::copy(head_in, head);
-                               lapack::lacpy(blas::Uplo::General, head.size().rows(), head.size().cols(),
-                                             head_in.ptr(), head_in.ld(), head.ptr(), head.ld());
+                               lapack::lacpy(blas::Uplo::General, head_in.size().rows(),
+                                             head_in.size().cols(), head_in.ptr(), head_in.ld(),
+                                             head.ptr(), head.ld());
                                lapack::laset(blas::Uplo::Lower, head.size().rows() - 1,
                                              head.size().cols(), T(0), T(0), head.ptr({1, 0}),
                                              head.ld());