Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

CA-Red2Band: Communication-Avoiding Reduction To Band #1177

Draft
wants to merge 30 commits into
base: master
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
30 commits
Select commit Hold shift + click to select a range
3725b84
file and folder structure for new algorithm
albestro Jul 1, 2024
df2f1de
factor out red2band functions so they can be reused in new implementa…
albestro Jul 1, 2024
b9ce996
basic test structure
albestro Jul 1, 2024
52971b2
WIP: qr for both 1st and 2nd passes
albestro Jul 1, 2024
7be4199
move some basic building block that can be shared between
albestro Jul 11, 2024
12015bd
first implementation of trailing matrix update (1st pass)
albestro Jul 11, 2024
255b5e1
fixes for qr 2nd pass
albestro Jul 11, 2024
382bee6
implementation 2nd pass update trailing + fixes
albestro Jul 18, 2024
e4647f0
switch to local tfactor for 2nd pass + minor changes
albestro Jul 18, 2024
9c6a417
change mat_hh_2nd distribution to an equivalent one
albestro Jul 18, 2024
a76d9eb
adapt computePanelReflectors to deal with a tile for taus
albestro Jul 18, 2024
a2d5638
fix distribution for both taus_1st and taus_2nd
albestro Jul 19, 2024
5c146f8
update test structure
albestro Jul 19, 2024
ad6ade0
minor changes
albestro Jul 19, 2024
b1b6185
CHECK: broadcast panel now communicate also last tile
albestro Aug 14, 2024
8f8b199
hh2 fix panel size problem for reflectors
albestro Aug 14, 2024
3883430
hh2 fix W1 panel, which has a different size wrt others
albestro Aug 14, 2024
7839564
fix deadlock: potentially unused tile
albestro Aug 14, 2024
e26812b
hh1: fix her2k transposed indices usage
albestro Aug 14, 2024
9d8d648
hh2: fix her2k computation LR + R
albestro Aug 14, 2024
155fd59
t-factor minor changes
albestro Aug 14, 2024
7c3dc60
fix t-factor for local usage on distributed matrix
albestro Aug 14, 2024
2b81a93
fix her2k HH1 + remove console output
albestro Aug 16, 2024
41fc7ab
basic test implementation
albestro Aug 16, 2024
185293e
WIP: just a quick switch for testing performances of both algo variants
albestro Sep 11, 2024
9326fa8
WIP: refactor algo for incomplete tile
albestro Sep 20, 2024
045c5bf
fix missing headers
albestro Sep 20, 2024
9f3aa79
WIP: workaround for t-factor local
albestro Sep 25, 2024
350de05
WIP: workaround problem of last tile not communicated in panel bcast …
albestro Sep 25, 2024
8e820a1
WIP: workaround for oversized 2nd step panel workspace
albestro Sep 25, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
49 changes: 37 additions & 12 deletions include/dlaf/communication/broadcast_panel.h
Original file line number Diff line number Diff line change
Expand Up @@ -18,11 +18,13 @@
#include <pika/execution.hpp>

#include <dlaf/common/index2d.h>
#include <dlaf/common/range2d.h>
#include <dlaf/communication/communicator_pipeline.h>
#include <dlaf/communication/index.h>
#include <dlaf/communication/kernels/broadcast.h>
#include <dlaf/communication/message.h>
#include <dlaf/matrix/copy_tile.h>
#include <dlaf/matrix/index.h>
#include <dlaf/matrix/panel.h>
#include <dlaf/matrix/tile.h>
#include <dlaf/types.h>
Expand Down Expand Up @@ -89,7 +91,6 @@ auto& get_taskchain(comm::CommunicatorPipeline<comm::CommunicatorType::Row>& row
return col_task_chain;
}
}
} // namespace internal

/// Broadcast
///
Expand All @@ -105,9 +106,6 @@ auto& get_taskchain(comm::CommunicatorPipeline<comm::CommunicatorType::Row>& row
/// - linking as external tile, if the tile is already available locally for the rank
/// - receiving the tile from the owning rank (via a broadcast)
///
/// Be aware that the last tile will just be available on @p panel, but it won't be transposed to
/// @p panelT.
///
/// @param rank_root specifies on which rank the @p panel is the source of the data
/// @param panel
/// on rank_root it is the source panel (a)
Expand All @@ -125,7 +123,8 @@ template <class T, Device D, Coord axis, matrix::StoreTransposed storage,
void broadcast(comm::IndexT_MPI rank_root, matrix::Panel<axis, T, D, storage>& panel,
matrix::Panel<orthogonal(axis), T, D, storageT>& panelT,
comm::CommunicatorPipeline<comm::CommunicatorType::Row>& row_task_chain,
comm::CommunicatorPipeline<comm::CommunicatorType::Col>& col_task_chain) {
comm::CommunicatorPipeline<comm::CommunicatorType::Col>& col_task_chain,
common::IterableRange2D<SizeType, matrix::LocalTile_TAG> range) {
constexpr Coord axisT = orthogonal(axis);

constexpr Coord coord = std::decay_t<decltype(panel)>::coord;
Expand Down Expand Up @@ -183,13 +182,6 @@ void broadcast(comm::IndexT_MPI rank_root, matrix::Panel<axis, T, D, storage>& p

auto& chain_step2 = internal::get_taskchain<comm_dir_step2>(row_task_chain, col_task_chain);

const SizeType last_tile = std::max(panelT.rangeStart(), panelT.rangeEnd() - 1);
const auto owner = dist.template rankGlobalTile<coordT>(last_tile);
const auto range = dist.rankIndex().get(coordT) == owner
? common::iterate_range2d(*panelT.iteratorLocal().begin(),
LocalTileIndex(coordT, panelT.rangeEndLocal() - 1, 1))
: panelT.iteratorLocal();

for (const auto& indexT : range) {
auto [index_diag, owner_diag] = internal::transposedOwner<coordT>(dist, indexT);

Expand All @@ -208,6 +200,39 @@ void broadcast(comm::IndexT_MPI rank_root, matrix::Panel<axis, T, D, storage>& p
}
}
}
} // namespace internal

/// Broadcast @p panel and make its tiles available on @p panelT, leaving out the last tile.
///
/// Thin front-end over internal::broadcast: it only selects which local tiles of @p panelT are
/// forwarded. On the rank owning the last tile of @p panelT (along the panelT coordinate), the
/// range is built with iterate_range2d from the first local tile up to local index
/// `rangeEndLocal() - 1`; on every other rank the full local range is forwarded.
/// Be aware that, as a consequence, the last tile is not transposed to @p panelT
/// (NOTE(review): relies on iterate_range2d treating its second argument as exclusive end - confirm).
///
/// Nothing is done if the range of @p panel is empty.
///
/// @param rank_root specifies on which rank the @p panel is the source of the data
/// @param panel source/destination panel, forwarded unchanged to internal::broadcast
/// @param panelT panel along the orthogonal axis, forwarded unchanged to internal::broadcast
/// @param row_task_chain row communicator pipeline
/// @param col_task_chain column communicator pipeline
template <class T, Device D, Coord axis, matrix::StoreTransposed storage,
          matrix::StoreTransposed storageT, class = std::enable_if_t<!std::is_const_v<T>>>
void broadcast(comm::IndexT_MPI rank_root, matrix::Panel<axis, T, D, storage>& panel,
               matrix::Panel<orthogonal(axis), T, D, storageT>& panelT,
               comm::CommunicatorPipeline<comm::CommunicatorType::Row>& row_task_chain,
               comm::CommunicatorPipeline<comm::CommunicatorType::Col>& col_task_chain) {
  // Empty panel: there is nothing to communicate.
  if (panel.rangeStart() == panel.rangeEnd())
    return;

  constexpr Coord coordT = std::decay_t<decltype(panelT)>::coord;
  const auto& dist = panel.parentDistribution();

  // Global index of the last tile of panelT (clamped to rangeStart).
  const SizeType last_tile = std::max(panelT.rangeStart(), panelT.rangeEnd() - 1);
  const bool owns_last_tile =
      dist.rankIndex().get(coordT) == dist.template rankGlobalTile<coordT>(last_tile);

  // The shortened range is built only on the owning rank, so begin() is dereferenced only there.
  const auto tiles =
      owns_last_tile ? common::iterate_range2d(*panelT.iteratorLocal().begin(),
                                               LocalTileIndex(coordT, panelT.rangeEndLocal() - 1, 1))
                     : panelT.iteratorLocal();

  internal::broadcast(rank_root, panel, panelT, row_task_chain, col_task_chain, tiles);
}

/// Broadcast @p panel and make its tiles available on @p panelT, for all local tiles.
///
/// Thin front-end over internal::broadcast that forwards the complete local range of @p panelT
/// (`panelT.iteratorLocal()`), without any special handling of the last tile.
///
/// @param rank_root specifies on which rank the @p panel is the source of the data
/// @param panel source/destination panel, forwarded unchanged to internal::broadcast
/// @param panelT panel along the orthogonal axis, forwarded unchanged to internal::broadcast
/// @param row_task_chain row communicator pipeline
/// @param col_task_chain column communicator pipeline
template <class T, Device D, Coord axis, matrix::StoreTransposed storage,
          matrix::StoreTransposed storageT, class = std::enable_if_t<!std::is_const_v<T>>>
void broadcast_all(comm::IndexT_MPI rank_root, matrix::Panel<axis, T, D, storage>& panel,
                   matrix::Panel<orthogonal(axis), T, D, storageT>& panelT,
                   comm::CommunicatorPipeline<comm::CommunicatorType::Row>& row_task_chain,
                   comm::CommunicatorPipeline<comm::CommunicatorType::Col>& col_task_chain) {
  const auto all_local_tiles = panelT.iteratorLocal();
  internal::broadcast(rank_root, panel, panelT, row_task_chain, col_task_chain, all_local_tiles);
}
}
}
14 changes: 14 additions & 0 deletions include/dlaf/eigensolver/reduction_to_band.h
Original file line number Diff line number Diff line change
Expand Up @@ -119,4 +119,18 @@ Matrix<T, Device::CPU> reduction_to_band(comm::CommunicatorGrid& grid, Matrix<T,

return ReductionToBand<B, D, T>::call(grid, mat_a, band_size);
}

/// Communication-avoiding reduction to band of a distributed matrix.
///
/// Checks the preconditions below and then dispatches to CAReductionToBand<B, D, T>::call.
///
/// @param grid communicator grid on which @p mat_a is distributed
/// @param mat_a matrix to reduce (passed as non-const reference to the implementation)
/// @param band_size requested band size
/// @return the internal::CARed2BandResult produced by the implementation
///
/// @pre matrix::square_size(mat_a)
/// @pre matrix::square_blocksize(mat_a)
/// @pre matrix::single_tile_per_block(mat_a)
/// @pre matrix::equal_process_grid(mat_a, grid)
/// @pre band_size >= 2
/// @pre the number of rows of the block size of @p mat_a is a multiple of @p band_size
template <Backend B, Device D, class T>
internal::CARed2BandResult<T, D> ca_reduction_to_band(comm::CommunicatorGrid& grid, Matrix<T, D>& mat_a,
const SizeType band_size) {
DLAF_ASSERT(matrix::square_size(mat_a), mat_a);
DLAF_ASSERT(matrix::square_blocksize(mat_a), mat_a);
DLAF_ASSERT(matrix::single_tile_per_block(mat_a), mat_a);
DLAF_ASSERT(matrix::equal_process_grid(mat_a, grid), mat_a, grid);

DLAF_ASSERT(band_size >= 2, band_size);
DLAF_ASSERT(mat_a.blockSize().rows() % band_size == 0, mat_a.blockSize().rows(), band_size);

return CAReductionToBand<B, D, T>::call(grid, mat_a, band_size);
}
}
17 changes: 16 additions & 1 deletion include/dlaf/eigensolver/reduction_to_band/api.h
Original file line number Diff line number Diff line change
Expand Up @@ -24,9 +24,24 @@ struct ReductionToBand {
const SizeType band_size);
};

/// Bundle of matrices returned by CAReductionToBand::call.
///
/// NOTE(review): "1st"/"2nd" presumably refer to the two passes of the algorithm, and "taus"/"hh"
/// to the Householder scalar factors/reflectors of each pass - confirm against the implementation.
template <class T, Device D>
struct CARed2BandResult {
// taus of the 1st set of reflectors (always a CPU matrix, independently of D)
Matrix<T, Device::CPU> taus_1st;
// hh_1st are stored in-place
// (hence there is no hh_1st member; presumably "in-place" means in the input matrix - confirm)
// taus of the 2nd set of reflectors (always a CPU matrix, independently of D)
Matrix<T, Device::CPU> taus_2nd;
// 2nd set of reflectors, stored in a separate matrix on device D
Matrix<T, D> hh_2nd;
};

/// Implementation entry point for the CA (communication-avoiding) reduction to band, selected by
/// backend @p B, device @p D and element type @p T.
///
/// Only the declaration of call() is visible here; the definition lives elsewhere.
template <Backend B, Device D, class T>
struct CAReductionToBand {
/// Distributed variant: takes the communicator grid, the matrix (non-const reference) and the
/// band size, and returns a CARed2BandResult<T, D>.
static CARed2BandResult<T, D> call(comm::CommunicatorGrid& grid, Matrix<T, D>& mat_a,
const SizeType band_size);
};

// ETI (explicit template instantiation)
// KWORD is placed in front of each `template struct ...` line; the uses that follow pass `extern`,
// which turns each line into an explicit instantiation declaration for the given
// BACKEND/DEVICE/DATATYPE combination.
#define DLAF_EIGENSOLVER_REDUCTION_TO_BAND_ETI(KWORD, BACKEND, DEVICE, DATATYPE) \
KWORD template struct ReductionToBand<BACKEND, DEVICE, DATATYPE>;
KWORD template struct ReductionToBand<BACKEND, DEVICE, DATATYPE>; \
KWORD template struct CAReductionToBand<BACKEND, DEVICE, DATATYPE>;

DLAF_EIGENSOLVER_REDUCTION_TO_BAND_ETI(extern, Backend::MC, Device::CPU, float)
DLAF_EIGENSOLVER_REDUCTION_TO_BAND_ETI(extern, Backend::MC, Device::CPU, double)
Expand Down
Loading
Loading