TFactor: Separate larft_gemv kernel and add a faster option #1219

Merged · 11 commits · Nov 25, 2024

Changes from 3 commits
123 changes: 1 addition & 122 deletions include/dlaf/blas/tile.h
@@ -28,130 +28,9 @@
#include <dlaf/util_blas.h>

#ifdef DLAF_WITH_GPU
#include <whip.hpp>

#include <dlaf/gpu/blas/api.h>
#include <dlaf/gpu/blas/error.h>
#include <dlaf/gpu/blas/gpublas.h>
#include <dlaf/util_cublas.h>

#ifdef DLAF_WITH_HIP

#define DLAF_GET_ROCBLAS_WORKSPACE(f) \
[&]() { \
std::size_t workspace_size; \
DLAF_GPUBLAS_CHECK_ERROR( \
rocblas_start_device_memory_size_query(static_cast<rocblas_handle>(handle))); \
DLAF_ROCBLAS_WORKSPACE_CHECK_ERROR(rocblas_##f(handle, std::forward<Args>(args)...)); \
DLAF_GPUBLAS_CHECK_ERROR(rocblas_stop_device_memory_size_query(static_cast<rocblas_handle>(handle), \
&workspace_size)); \
return ::dlaf::memory::MemoryView<std::byte, Device::GPU>(to_int(workspace_size)); \
}();

namespace dlaf::tile::internal {
inline void extendROCBlasWorkspace(cublasHandle_t handle,
::dlaf::memory::MemoryView<std::byte, Device::GPU>&& workspace) {
whip::stream_t stream;
DLAF_GPUBLAS_CHECK_ERROR(cublasGetStream(handle, &stream));
auto f = [workspace = std::move(workspace)](whip::error_t status) { whip::check_error(status); };
pika::cuda::experimental::detail::add_event_callback(std::move(f), stream);
}
}

#define DLAF_DEFINE_GPUBLAS_OP(Name, Type, f) \
template <> \
struct Name<Type> { \
template <typename... Args> \
static void call(cublasHandle_t handle, Args&&... args) { \
auto workspace = DLAF_GET_ROCBLAS_WORKSPACE(f); \
DLAF_GPUBLAS_CHECK_ERROR(rocblas_set_workspace(static_cast<rocblas_handle>(handle), workspace(), \
to_sizet(workspace.size()))); \
DLAF_GPUBLAS_CHECK_ERROR(rocblas_##f(handle, std::forward<Args>(args)...)); \
DLAF_GPUBLAS_CHECK_ERROR(rocblas_set_workspace(static_cast<rocblas_handle>(handle), nullptr, 0)); \
::dlaf::tile::internal::extendROCBlasWorkspace(handle, std::move(workspace)); \
} \
}

#elif defined(DLAF_WITH_CUDA)

#define DLAF_DEFINE_GPUBLAS_OP(Name, Type, f) \
template <> \
struct Name<Type> { \
template <typename... Args> \
static void call(Args&&... args) { \
DLAF_GPUBLAS_CHECK_ERROR(cublas##f##_v2(std::forward<Args>(args)...)); \
} \
}

#endif

#define DLAF_DECLARE_GPUBLAS_OP(Name) \
template <typename T> \
struct Name

#ifdef DLAF_WITH_HIP
#define DLAF_MAKE_GPUBLAS_OP(Name, f) \
DLAF_DECLARE_GPUBLAS_OP(Name); \
DLAF_DEFINE_GPUBLAS_OP(Name, float, s##f); \
DLAF_DEFINE_GPUBLAS_OP(Name, double, d##f); \
DLAF_DEFINE_GPUBLAS_OP(Name, std::complex<float>, c##f); \
DLAF_DEFINE_GPUBLAS_OP(Name, std::complex<double>, z##f)

#define DLAF_MAKE_GPUBLAS_SYHE_OP(Name, f) \
DLAF_DECLARE_GPUBLAS_OP(Name); \
DLAF_DEFINE_GPUBLAS_OP(Name, float, ssy##f); \
DLAF_DEFINE_GPUBLAS_OP(Name, double, dsy##f); \
DLAF_DEFINE_GPUBLAS_OP(Name, std::complex<float>, che##f); \
DLAF_DEFINE_GPUBLAS_OP(Name, std::complex<double>, zhe##f)

#elif defined(DLAF_WITH_CUDA)
#define DLAF_MAKE_GPUBLAS_OP(Name, f) \
DLAF_DECLARE_GPUBLAS_OP(Name); \
DLAF_DEFINE_GPUBLAS_OP(Name, float, S##f); \
DLAF_DEFINE_GPUBLAS_OP(Name, double, D##f); \
DLAF_DEFINE_GPUBLAS_OP(Name, std::complex<float>, C##f); \
DLAF_DEFINE_GPUBLAS_OP(Name, std::complex<double>, Z##f)

#define DLAF_MAKE_GPUBLAS_SYHE_OP(Name, f) \
DLAF_DECLARE_GPUBLAS_OP(Name); \
DLAF_DEFINE_GPUBLAS_OP(Name, float, Ssy##f); \
DLAF_DEFINE_GPUBLAS_OP(Name, double, Dsy##f); \
DLAF_DEFINE_GPUBLAS_OP(Name, std::complex<float>, Che##f); \
DLAF_DEFINE_GPUBLAS_OP(Name, std::complex<double>, Zhe##f)
#endif

namespace dlaf::gpublas::internal {

// Level 1
DLAF_MAKE_GPUBLAS_OP(Axpy, axpy);

// Level 2
DLAF_MAKE_GPUBLAS_OP(Gemv, gemv);

DLAF_MAKE_GPUBLAS_OP(Trmv, trmv);

// Level 3
DLAF_MAKE_GPUBLAS_OP(Gemm, gemm);

DLAF_MAKE_GPUBLAS_SYHE_OP(Hemm, mm);

DLAF_MAKE_GPUBLAS_SYHE_OP(Her2k, r2k);

DLAF_MAKE_GPUBLAS_SYHE_OP(Herk, rk);

#if defined(DLAF_WITH_CUDA)
DLAF_MAKE_GPUBLAS_OP(Trmm, trmm);
#elif defined(DLAF_WITH_HIP)

#if ROCBLAS_VERSION_MAJOR >= 3 && defined(ROCBLAS_V3)
DLAF_MAKE_GPUBLAS_OP(Trmm, trmm);
#else
DLAF_MAKE_GPUBLAS_OP(Trmm, trmm_outofplace);
#endif

#endif

DLAF_MAKE_GPUBLAS_OP(Trsm, trsm);
}
#endif

namespace dlaf {
22 changes: 5 additions & 17 deletions include/dlaf/factorization/qr/t_factor_impl.h
@@ -24,6 +24,7 @@
#include <dlaf/communication/communicator_pipeline.h>
#include <dlaf/communication/kernels/all_reduce.h>
#include <dlaf/factorization/qr/api.h>
#include <dlaf/lapack/gpu/larft.h>
#include <dlaf/lapack/tile.h>
#include <dlaf/matrix/matrix.h>
#include <dlaf/matrix/tile.h>
@@ -172,28 +173,15 @@ struct Helpers<Backend::GPU, Device::GPU, T> {
auto gemv_func = [](cublasHandle_t handle, const matrix::Tile<const T, Device::GPU>& tile_v,
const matrix::Tile<const T, Device::CPU>& taus,
matrix::Tile<T, Device::GPU>& tile_t) noexcept {
const SizeType m = tile_v.size().rows();
const SizeType k = tile_t.size().cols();
DLAF_ASSERT(tile_v.size().cols() == k, tile_v.size().cols(), k);
DLAF_ASSERT(taus.size().rows() == k, taus.size().rows(), k);
DLAF_ASSERT(taus.size().cols() == 1, taus.size().cols());

for (SizeType j = 0; j < k; ++j) {
// T(0:j, j) = -tau . V(j:, 0:j)* . V(j:, j)
// [j x 1] = [(n-j) x j]* . [(n-j) x 1]
const TileElementIndex va_start{0, 0};
const TileElementIndex vb_start{0, j};
const TileElementSize va_size{tile_v.size().rows(), j};
const TileElementIndex t_start{0, j};
const auto neg_tau = util::blasToCublasCast(-taus({j, 0}));
const auto one = util::blasToCublasCast(T{1});

gpublas::internal::Gemv<T>::call(handle, CUBLAS_OP_C, to_int(va_size.rows()),
to_int(va_size.cols()), &neg_tau,
util::blasToCublasCast(tile_v.ptr(va_start)),
to_int(tile_v.ld()),
util::blasToCublasCast(tile_v.ptr(vb_start)), 1, &one,
util::blasToCublasCast(tile_t.ptr(t_start)), 1);
}
gpulapack::larft_gemv0(handle, m, k, tile_v.ptr(), tile_v.ld(), taus.ptr(), tile_t.ptr(),
tile_t.ld());

return std::move(tile_t);
};

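For reference, the per-column cuBLAS gemv loop shown above computes, for every column j, T(0:j, j) += -tau[j] · V(:, 0:j)^H · V(:, j); the new larft_gemv0 call performs the same update in a single invocation. Below is a minimal CPU sketch of that recurrence — illustrative only, not part of the PR; `larft_gemv0_reference` and the `conj_or_id` helper are made up here to document the math.

```cpp
// Illustrative CPU reference for the update performed by larft_gemv0
// (column-major layout, as in the tiles above). The real implementation is a
// GPU kernel; this sketch only documents the math.
#include <complex>

template <class T>
T conj_or_id(T x) { return x; }
template <class T>
std::complex<T> conj_or_id(std::complex<T> x) { return std::conj(x); }

template <class T>
void larft_gemv0_reference(long m, long k, const T* v, long ldv, const T* tau, T* t, long ldt) {
  for (long j = 0; j < k; ++j) {
    // T(0:j, j) += -tau[j] * V(:, 0:j)^H * V(:, j)   (beta = 1, as in the removed gemv loop)
    for (long i = 0; i < j; ++i) {
      T acc{0};
      for (long r = 0; r < m; ++r)
        acc += conj_or_id(v[r + i * ldv]) * v[r + j * ldv];
      t[i + j * ldt] += -tau[j] * acc;
    }
  }
}
```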
141 changes: 141 additions & 0 deletions include/dlaf/gpu/blas/gpublas.h
@@ -0,0 +1,141 @@
//
// Distributed Linear Algebra with Future (DLAF)
//
// Copyright (c) 2018-2024, ETH Zurich
// All rights reserved.
//
// Please, refer to the LICENSE file in the root directory.
// SPDX-License-Identifier: BSD-3-Clause
//
#pragma once

/// @file
/// Provides gpublas wrappers for BLAS operations.

#include <cstddef>
#include <utility>

#include <whip.hpp>

#include <dlaf/gpu/blas/api.h>
#include <dlaf/gpu/blas/error.h>
#include <dlaf/util_cublas.h>

#ifdef DLAF_WITH_HIP

#define DLAF_GET_ROCBLAS_WORKSPACE(f) \
[&]() { \
std::size_t workspace_size; \
DLAF_GPUBLAS_CHECK_ERROR( \
rocblas_start_device_memory_size_query(static_cast<rocblas_handle>(handle))); \
DLAF_ROCBLAS_WORKSPACE_CHECK_ERROR(rocblas_##f(handle, std::forward<Args>(args)...)); \
DLAF_GPUBLAS_CHECK_ERROR(rocblas_stop_device_memory_size_query(static_cast<rocblas_handle>(handle), \
&workspace_size)); \
return ::dlaf::memory::MemoryView<std::byte, Device::GPU>(to_int(workspace_size)); \
}();

namespace dlaf::tile::internal {
inline void extendROCBlasWorkspace(cublasHandle_t handle,
::dlaf::memory::MemoryView<std::byte, Device::GPU>&& workspace) {
whip::stream_t stream;
DLAF_GPUBLAS_CHECK_ERROR(cublasGetStream(handle, &stream));
auto f = [workspace = std::move(workspace)](whip::error_t status) { whip::check_error(status); };
pika::cuda::experimental::detail::add_event_callback(std::move(f), stream);
}
}

#define DLAF_DEFINE_GPUBLAS_OP(Name, Type, f) \
template <> \
struct Name<Type> { \
template <typename... Args> \
static void call(cublasHandle_t handle, Args&&... args) { \
auto workspace = DLAF_GET_ROCBLAS_WORKSPACE(f); \
DLAF_GPUBLAS_CHECK_ERROR(rocblas_set_workspace(static_cast<rocblas_handle>(handle), workspace(), \
to_sizet(workspace.size()))); \
DLAF_GPUBLAS_CHECK_ERROR(rocblas_##f(handle, std::forward<Args>(args)...)); \
DLAF_GPUBLAS_CHECK_ERROR(rocblas_set_workspace(static_cast<rocblas_handle>(handle), nullptr, 0)); \
::dlaf::tile::internal::extendROCBlasWorkspace(handle, std::move(workspace)); \
} \
}

#elif defined(DLAF_WITH_CUDA)

#define DLAF_DEFINE_GPUBLAS_OP(Name, Type, f) \
template <> \
struct Name<Type> { \
template <typename... Args> \
static void call(Args&&... args) { \
DLAF_GPUBLAS_CHECK_ERROR(cublas##f##_v2(std::forward<Args>(args)...)); \
} \
}

#endif

#define DLAF_DECLARE_GPUBLAS_OP(Name) \
template <typename T> \
struct Name

#ifdef DLAF_WITH_HIP
#define DLAF_MAKE_GPUBLAS_OP(Name, f) \
DLAF_DECLARE_GPUBLAS_OP(Name); \
DLAF_DEFINE_GPUBLAS_OP(Name, float, s##f); \
DLAF_DEFINE_GPUBLAS_OP(Name, double, d##f); \
DLAF_DEFINE_GPUBLAS_OP(Name, std::complex<float>, c##f); \
DLAF_DEFINE_GPUBLAS_OP(Name, std::complex<double>, z##f)

#define DLAF_MAKE_GPUBLAS_SYHE_OP(Name, f) \
DLAF_DECLARE_GPUBLAS_OP(Name); \
DLAF_DEFINE_GPUBLAS_OP(Name, float, ssy##f); \
DLAF_DEFINE_GPUBLAS_OP(Name, double, dsy##f); \
DLAF_DEFINE_GPUBLAS_OP(Name, std::complex<float>, che##f); \
DLAF_DEFINE_GPUBLAS_OP(Name, std::complex<double>, zhe##f)

#elif defined(DLAF_WITH_CUDA)
#define DLAF_MAKE_GPUBLAS_OP(Name, f) \
DLAF_DECLARE_GPUBLAS_OP(Name); \
DLAF_DEFINE_GPUBLAS_OP(Name, float, S##f); \
DLAF_DEFINE_GPUBLAS_OP(Name, double, D##f); \
DLAF_DEFINE_GPUBLAS_OP(Name, std::complex<float>, C##f); \
DLAF_DEFINE_GPUBLAS_OP(Name, std::complex<double>, Z##f)

#define DLAF_MAKE_GPUBLAS_SYHE_OP(Name, f) \
DLAF_DECLARE_GPUBLAS_OP(Name); \
DLAF_DEFINE_GPUBLAS_OP(Name, float, Ssy##f); \
DLAF_DEFINE_GPUBLAS_OP(Name, double, Dsy##f); \
DLAF_DEFINE_GPUBLAS_OP(Name, std::complex<float>, Che##f); \
DLAF_DEFINE_GPUBLAS_OP(Name, std::complex<double>, Zhe##f)
#endif

namespace dlaf::gpublas::internal {

// Level 1
DLAF_MAKE_GPUBLAS_OP(Axpy, axpy);

// Level 2
DLAF_MAKE_GPUBLAS_OP(Gemv, gemv);

DLAF_MAKE_GPUBLAS_OP(Trmv, trmv);

// Level 3
DLAF_MAKE_GPUBLAS_OP(Gemm, gemm);

DLAF_MAKE_GPUBLAS_SYHE_OP(Hemm, mm);

DLAF_MAKE_GPUBLAS_SYHE_OP(Her2k, r2k);

DLAF_MAKE_GPUBLAS_SYHE_OP(Herk, rk);

#if defined(DLAF_WITH_CUDA)
DLAF_MAKE_GPUBLAS_OP(Trmm, trmm);
#elif defined(DLAF_WITH_HIP)

#if ROCBLAS_VERSION_MAJOR >= 3 && defined(ROCBLAS_V3)
DLAF_MAKE_GPUBLAS_OP(Trmm, trmm);
#else
DLAF_MAKE_GPUBLAS_OP(Trmm, trmm_outofplace);
#endif

#endif

DLAF_MAKE_GPUBLAS_OP(Trsm, trsm);
}
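To make the macro layering above concrete, here is a hand-expanded view of what DLAF_MAKE_GPUBLAS_OP(Gemv, gemv) produces for float — illustrative only, not part of the PR. Under DLAF_WITH_CUDA the call forwards straight to cublasSgemv_v2; under DLAF_WITH_HIP it first dry-runs the call in rocBLAS's size-query mode, binds a workspace for the real call, and then keeps that workspace alive until the stream has drained via extendROCBlasWorkspace.

```cpp
// Illustrative expansion of DLAF_MAKE_GPUBLAS_OP(Gemv, gemv), float specialization only.
template <typename T>
struct Gemv;

#if defined(DLAF_WITH_CUDA)
template <>
struct Gemv<float> {
  template <typename... Args>
  static void call(Args&&... args) {
    // The handle and BLAS arguments are forwarded directly to cuBLAS.
    DLAF_GPUBLAS_CHECK_ERROR(cublasSgemv_v2(std::forward<Args>(args)...));
  }
};
#elif defined(DLAF_WITH_HIP)
template <>
struct Gemv<float> {
  template <typename... Args>
  static void call(cublasHandle_t handle, Args&&... args) {
    // 1. Dry-run the call in size-query mode and allocate the reported workspace.
    auto workspace = DLAF_GET_ROCBLAS_WORKSPACE(sgemv);
    // 2. Bind the workspace, run the real call, then unbind it.
    DLAF_GPUBLAS_CHECK_ERROR(rocblas_set_workspace(static_cast<rocblas_handle>(handle), workspace(),
                                                   to_sizet(workspace.size())));
    DLAF_GPUBLAS_CHECK_ERROR(rocblas_sgemv(handle, std::forward<Args>(args)...));
    DLAF_GPUBLAS_CHECK_ERROR(rocblas_set_workspace(static_cast<rocblas_handle>(handle), nullptr, 0));
    // 3. Keep the workspace alive until all work queued on the handle's stream has completed.
    ::dlaf::tile::internal::extendROCBlasWorkspace(handle, std::move(workspace));
  }
};
#endif
```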
51 changes: 51 additions & 0 deletions include/dlaf/lapack/gpu/larft.h
@@ -0,0 +1,51 @@
//
// Distributed Linear Algebra with Future (DLAF)
//
// Copyright (c) 2018-2024, ETH Zurich
// All rights reserved.
//
// Please, refer to the LICENSE file in the root directory.
// SPDX-License-Identifier: BSD-3-Clause
//

#pragma once

#ifdef DLAF_WITH_GPU

#include <blas.hh>
#include <whip.hpp>

#include <dlaf/gpu/blas/api.h>
#include <dlaf/types.h>

namespace dlaf::gpulapack {

template <class T>
void larft_gemv0(cublasHandle_t handle, const SizeType m, SizeType k, const T* v, const SizeType ldv,
const T* tau, T* t, const SizeType ldt);

template <class T>
void larft_gemv1_notau(cublasHandle_t handle, const SizeType m, const SizeType k, const T* v,
const SizeType ldv, T* t, const SizeType ldt);

template <class T>
void larft_gemv1_fixtau(const SizeType k, const T* tau, const SizeType inctau, T* t, const SizeType ldt,
whip::stream_t stream);

#define DLAF_CUBLAS_LARFT_GEMV_ETI(kword, Type) \
kword template void larft_gemv0(cublasHandle_t handle, const SizeType n, SizeType k, const Type* v, \
const SizeType ldv, const Type* tau, Type* t, const SizeType ldt); \
kword template void larft_gemv1_notau(cublasHandle_t handle, const SizeType m, const SizeType k, \
const Type* v, const SizeType ldv, Type* t, \
const SizeType ldt); \
kword template void larft_gemv1_fixtau(const SizeType k, const Type* tau, const SizeType inctau, \
Type* t, const SizeType ldt, whip::stream_t stream)

DLAF_CUBLAS_LARFT_GEMV_ETI(extern, float);
DLAF_CUBLAS_LARFT_GEMV_ETI(extern, double);
DLAF_CUBLAS_LARFT_GEMV_ETI(extern, std::complex<float>);
DLAF_CUBLAS_LARFT_GEMV_ETI(extern, std::complex<double>);

}

#endif
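A hedged sketch of how these entry points could be driven from host code. `form_t_factor` is a made-up wrapper for illustration, and the split into larft_gemv1_notau (accumulating the V^H V products via the BLAS handle) followed by larft_gemv1_fixtau (applying the tau scaling on a plain stream) is an assumption based on the names; the exact semantics live in the .cu implementation, which is not shown in this hunk.

```cpp
#include <dlaf/lapack/gpu/larft.h>

// Sketch only: form the T factor for one tile on the GPU.
// Assumes v, tau and t are device pointers with the layouts implied above.
template <class T>
void form_t_factor(cublasHandle_t handle, whip::stream_t stream, dlaf::SizeType m,
                   dlaf::SizeType k, const T* v, dlaf::SizeType ldv, const T* tau, T* t,
                   dlaf::SizeType ldt) {
  namespace gl = dlaf::gpulapack;

  // One-shot variant, mirroring the previous per-column gemv loop:
  gl::larft_gemv0(handle, m, k, v, ldv, tau, t, ldt);

  // Alternative two-step variant declared above (commented out so only one path runs):
  // gl::larft_gemv1_notau(handle, m, k, v, ldv, t, ldt);
  // gl::larft_gemv1_fixtau(k, tau, /*inctau=*/1, t, ldt, stream);
}
```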
4 changes: 4 additions & 0 deletions miniapp/kernel/CMakeLists.txt
@@ -10,5 +10,9 @@

if(DLAF_BUILD_TESTING)
# TODO they depend on DLAF_TEST exclusively for the createTile method.
DLAF_addMiniapp(
miniapp_larft_gemv SOURCES miniapp_larft_gemv.cpp LIBRARIES dlaf.core DLAF_test DLAF_miniapp
)

DLAF_addMiniapp(miniapp_laset SOURCES miniapp_laset.cpp LIBRARIES dlaf.core DLAF_test)
endif()