Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

ExtraBuffers: revamping of the idea #802

Draft
wants to merge 4 commits into
base: master
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
25 changes: 15 additions & 10 deletions include/dlaf/eigensolver/reduction_to_band/impl.h
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@
#include "dlaf/lapack/tile.h"
#include "dlaf/matrix/copy_tile.h"
#include "dlaf/matrix/distribution.h"
#include "dlaf/matrix/extra_buffers.h"
#include "dlaf/matrix/index.h"
#include "dlaf/matrix/matrix.h"
#include "dlaf/matrix/panel.h"
Expand Down Expand Up @@ -455,20 +456,24 @@ void gemmComputeW2(matrix::Matrix<T, D>& w2, matrix::Panel<Coord::Col, const T,

namespace ex = pika::execution::experimental;

// Note:
// Not all ranks in the column always hold at least a tile in the panel Ai, but all ranks in
// the column are going to participate to the reduce. For them, it is important to set the
// partial result W2 to zero.
ex::start_detached(w2.readwrite_sender(LocalTileIndex(0, 0)) |
tile::set0(dlaf::internal::Policy<B>(thread_priority::high)));
ExtraBuffers<T, D> buffers(w2.blockSize(), 6);

//// Note:
//// Not all ranks in the column always hold at least a tile in the panel Ai, but all ranks in
//// the column are going to participate to the reduce. For them, it is important to set the
//// partial result W2 to zero.

using namespace blas;
// GEMM W2 = W* . X
for (const auto& index_tile : w.iteratorLocal())
for (const auto& index_tile : w.iteratorLocal()) {
ex::start_detached(dlaf::internal::whenAllLift(Op::ConjTrans, Op::NoTrans, T(1),
w.read_sender(index_tile), x.read_sender(index_tile),
T(1), w2.readwrite_sender(LocalTileIndex(0, 0))) |
T(1), buffers.readwrite_sender(index_tile.row())) |
tile::gemm(dlaf::internal::Policy<B>(thread_priority::high)));
}

ex::start_detached(tile::set0(dlaf::internal::Policy<B>(), w2.readwrite_sender(LocalTileIndex(0, 0))));
ex::start_detached(buffers.reduce(w2.readwrite_sender(LocalTileIndex(0, 0))));
Comment on lines +475 to +476
Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Probably better to create a continuation with a single task?

}

template <Backend B, Device D, class T>
Expand Down Expand Up @@ -959,7 +964,7 @@ common::internal::vector<pika::shared_future<common::internal::vector<T>>> Reduc
const LocalTileIndex t_idx(0, 0);
// TODO used just by the column, maybe we can re-use a panel tile?
// TODO probably the first one in any panel is ok?
Matrix<T, D> t({nrefls_block, nrefls_block}, dist.blockSize());
Matrix<T, D> t({nrefls_block, nrefls_block}, {nrefls_block, nrefls_block});

computeTFactor<B>(v, taus.back(), t.readwrite_sender(t_idx));

Expand Down Expand Up @@ -1107,7 +1112,7 @@ common::internal::vector<pika::shared_future<common::internal::vector<T>>> Reduc
const LocalTileIndex t_idx(0, 0);
// TODO used just by the column, maybe we can re-use a panel tile?
// TODO or we can keep just the sh_future and allocate just inside if (is_panel_rank_col)
matrix::Matrix<T, D> t({nrefls_block, nrefls_block}, dist.blockSize());
matrix::Matrix<T, D> t({nrefls_block, nrefls_block}, {nrefls_block, nrefls_block});

// PANEL
const matrix::SubPanelView panel_view(dist, ij_offset, band_size);
Expand Down
66 changes: 66 additions & 0 deletions include/dlaf/matrix/extra_buffers.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
//
// Distributed Linear Algebra with Future (DLAF)
//
// Copyright (c) 2018-2023, ETH Zurich
// All rights reserved.
//
// Please, refer to the LICENSE file in the root directory.
// SPDX-License-Identifier: BSD-3-Clause
//

#pragma once

#include <vector>

#include "dlaf/blas/tile_extensions.h"
#include "dlaf/matrix/matrix.h"
#include "dlaf/types.h"

namespace dlaf {

template <class T, Device D>
struct ExtraBuffers : protected Matrix<T, D> {
ExtraBuffers(const TileElementSize bs, const SizeType size)
: Matrix<T, D>{{bs.rows() * size, bs.cols()}, bs}, nbuffers_(size) {
namespace ex = pika::execution::experimental;
for (const auto& i : common::iterate_range2d(Matrix<T, D>::distribution().localNrTiles()))
ex::start_detached(Matrix<T, D>::readwrite_sender(i) |
tile::set0(dlaf::internal::Policy<dlaf::DefaultBackend_v<D>>(
pika::execution::thread_priority::high)));
}

auto read_sender(SizeType index) {
return Matrix<T, D>::read_sender(internalIndex(index));
}

auto readwrite_sender(SizeType index) {
return Matrix<T, D>::readwrite_sender(internalIndex(index));
}

template <class TileSender>
[[nodiscard]] auto reduce(TileSender tile) {
namespace di = dlaf::internal;
namespace ex = pika::execution::experimental;

std::vector<ex::any_sender<pika::shared_future<matrix::Tile<const T, D>>>> buffers;
Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The type is so cumbersome, probably @msimberg is aware of this and is going to say that it is something we will manage when we will make the tile sender mechanism pika::future free.

I don't know if there is any workaround available, but AFAIK currently our Unwrapping facilities does not unwrap vector, so we just get it sent as it is even using our dlaf::internal::transform.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I see your complaint, but typedef exists 😉 I see that complex eventually being a helper typedef inside e.g. Matrix, or at least in the dlaf::matrix namespace and then you'll be able to write std::vector<ReadOnlyTileSender> or something like that. Would that partially or completely take care of your concerns here? This is what I'm planning on doing in #766 (because currently it's also dealing with long unwieldy types).

for (SizeType index = 0; index < nbuffers_; ++index)
buffers.emplace_back(read_sender(index));

return ex::when_all(std::move(tile), ex::when_all_vector(std::move(buffers))) |
di::transform(di::Policy<DefaultBackend_v<D>>(),
[](const matrix::Tile<T, D>& tile,
const std::vector<pika::shared_future<matrix::Tile<const T, D>>>& buffers,
auto&&... ts) {
for (const auto& buffer : buffers)
dlaf::tile::internal::add(T(1), buffer.get(), tile, ts...);
});
}

protected:
LocalTileIndex internalIndex(SizeType index) const noexcept {
return LocalTileIndex{index % nbuffers_, 0};
}

SizeType nbuffers_;
};
}
7 changes: 7 additions & 0 deletions test/unit/matrix/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -99,3 +99,10 @@ DLAF_addTest(
USE_MAIN MPIPIKA
MPIRANKS 6
)

DLAF_addTest(
test_extra_buffers
SOURCES test_extra_buffers.cpp
LIBRARIES dlaf.core
USE_MAIN PIKA
)
42 changes: 42 additions & 0 deletions test/unit/matrix/test_extra_buffers.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
//
// Distributed Linear Algebra with Future (DLAF)
//
// Copyright (c) 2018-2023, ETH Zurich
// All rights reserved.
//
// Please, refer to the LICENSE file in the root directory.
// SPDX-License-Identifier: BSD-3-Clause
//

#include "dlaf/matrix/extra_buffers.h"

#include <gtest/gtest.h>

#include "dlaf/common/range2d.h"
#include "dlaf/matrix/print_numpy.h"

#include "dlaf_test/matrix/util_tile.h"

using namespace dlaf;

TEST(ExtraBuffersTest, Basic) {
using T = float;
constexpr auto D = Device::CPU;

namespace ex = pika::execution::experimental;
namespace tt = pika::this_thread::experimental;

TileElementSize tile_size(2, 2);
Matrix<T, D> tile({tile_size.rows(), tile_size.cols()}, tile_size);
constexpr SizeType nbuffers = 10;
ExtraBuffers<T, D> buffers(tile_size, nbuffers);

for (SizeType i = 0; i < nbuffers; ++i) {
tt::sync_wait(ex::when_all(buffers.readwrite_sender(i), ex::just(T(1))) |
ex::then([](const auto& tile, const T value) { matrix::test::set(tile, value); }));
}

ex::start_detached(buffers.reduce(tile.readwrite_sender(LocalTileIndex{0, 0})));

print(format::numpy{}, tile.read(LocalTileIndex(0, 0)).get());
}