From 0be8cc98565d43ad6929da9c2007228b0794a417 Mon Sep 17 00:00:00 2001
From: Raffaele Solcà
Date: Mon, 25 Nov 2024 19:53:01 +0100
Subject: [PATCH] TFactor: Separate larft_gemv kernel and add a faster option
 (#1219)

Co-authored-by: Alberto Invernizzi
---
 include/dlaf/blas/tile.h                      | 123 +---------
 include/dlaf/factorization/qr/t_factor_impl.h |  22 +-
 include/dlaf/gpu/blas/gpublas.h               | 147 ++++++++++++
 include/dlaf/lapack/gpu/larft.h               |  51 ++++
 miniapp/include/dlaf/miniapp/kernel_runner.h  |   1 +
 miniapp/kernel/CMakeLists.txt                 |   4 +
 miniapp/kernel/miniapp_larft_gemv.cpp         | 227 ++++++++++++++++++
 miniapp/kernel/miniapp_laset.cpp              |   1 -
 src/CMakeLists.txt                            |   1 +
 src/lapack/gpu/larft.cu                       | 130 ++++++++++
 src/lapack/gpu/laset.cu                       |   1 +
 11 files changed, 568 insertions(+), 140 deletions(-)
 create mode 100644 include/dlaf/gpu/blas/gpublas.h
 create mode 100644 include/dlaf/lapack/gpu/larft.h
 create mode 100644 miniapp/kernel/miniapp_larft_gemv.cpp
 create mode 100644 src/lapack/gpu/larft.cu
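For context (standard LAPACK xLARFT semantics, forward/columnwise storage, not
spelled out in the patch itself): with Householder reflectors
H_j = I - \tau_j v_j v_j^H stored as columns of V, the T factor of the
compact-WY representation satisfies

    H_1 H_2 \cdots H_k = I - V\,T\,V^H, \qquad
    T(0{:}j{-}1,\, j) = -\tau_j\, V(:,\, 0{:}j{-}1)^H\, v_j .

The kernels split out below produce only this GEMV part of each column (plus
the tau diagonal); the triangular (TRMV) accumulation remains with the caller,
as the loop removed from t_factor_impl.h shows.
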
diff --git a/include/dlaf/blas/tile.h b/include/dlaf/blas/tile.h
index d534f93d5d..1f1f8ee58c 100644
--- a/include/dlaf/blas/tile.h
+++ b/include/dlaf/blas/tile.h
@@ -28,130 +28,9 @@
 #include <blas.hh>
 
 #ifdef DLAF_WITH_GPU
-#include <pika/cuda.hpp>
-
 #include <dlaf/gpu/blas/api.h>
-#include <dlaf/gpu/blas/error.h>
+#include <dlaf/gpu/blas/gpublas.h>
 #include <dlaf/util_cublas.h>
-
-#ifdef DLAF_WITH_HIP
-
-#define DLAF_GET_ROCBLAS_WORKSPACE(f) \
-  [&]() { \
-    std::size_t workspace_size; \
-    DLAF_GPUBLAS_CHECK_ERROR( \
-        rocblas_start_device_memory_size_query(static_cast<rocblas_handle>(handle))); \
-    DLAF_ROCBLAS_WORKSPACE_CHECK_ERROR(rocblas_##f(handle, std::forward<Args>(args)...)); \
-    DLAF_GPUBLAS_CHECK_ERROR(rocblas_stop_device_memory_size_query(static_cast<rocblas_handle>(handle), \
-                                                                   &workspace_size)); \
-    return ::dlaf::memory::MemoryView<std::byte, Device::GPU>(to_int(workspace_size)); \
-  }();
-
-namespace dlaf::tile::internal {
-inline void extendROCBlasWorkspace(cublasHandle_t handle,
-                                   ::dlaf::memory::MemoryView<std::byte, Device::GPU>&& workspace) {
-  whip::stream_t stream;
-  DLAF_GPUBLAS_CHECK_ERROR(cublasGetStream(handle, &stream));
-  auto f = [workspace = std::move(workspace)](whip::error_t status) { whip::check_error(status); };
-  pika::cuda::experimental::detail::add_event_callback(std::move(f), stream);
-}
-}
-
-#define DLAF_DEFINE_GPUBLAS_OP(Name, Type, f) \
-  template <> \
-  struct Name<Type> { \
-    template <class... Args> \
-    static void call(cublasHandle_t handle, Args&&... args) { \
-      auto workspace = DLAF_GET_ROCBLAS_WORKSPACE(f); \
-      DLAF_GPUBLAS_CHECK_ERROR(rocblas_set_workspace(static_cast<rocblas_handle>(handle), workspace(), \
-                                                     to_sizet(workspace.size()))); \
-      DLAF_GPUBLAS_CHECK_ERROR(rocblas_##f(handle, std::forward<Args>(args)...)); \
-      DLAF_GPUBLAS_CHECK_ERROR(rocblas_set_workspace(static_cast<rocblas_handle>(handle), nullptr, 0)); \
-      ::dlaf::tile::internal::extendROCBlasWorkspace(handle, std::move(workspace)); \
-    } \
-  }
-
-#elif defined(DLAF_WITH_CUDA)
-
-#define DLAF_DEFINE_GPUBLAS_OP(Name, Type, f) \
-  template <> \
-  struct Name<Type> { \
-    template <class... Args> \
-    static void call(Args&&... args) { \
-      DLAF_GPUBLAS_CHECK_ERROR(cublas##f##_v2(std::forward<Args>(args)...)); \
-    } \
-  }
-
-#endif
-
-#define DLAF_DECLARE_GPUBLAS_OP(Name) \
-  template <class T> \
-  struct Name
-
-#ifdef DLAF_WITH_HIP
-#define DLAF_MAKE_GPUBLAS_OP(Name, f) \
-  DLAF_DECLARE_GPUBLAS_OP(Name); \
-  DLAF_DEFINE_GPUBLAS_OP(Name, float, s##f); \
-  DLAF_DEFINE_GPUBLAS_OP(Name, double, d##f); \
-  DLAF_DEFINE_GPUBLAS_OP(Name, std::complex<float>, c##f); \
-  DLAF_DEFINE_GPUBLAS_OP(Name, std::complex<double>, z##f)
-
-#define DLAF_MAKE_GPUBLAS_SYHE_OP(Name, f) \
-  DLAF_DECLARE_GPUBLAS_OP(Name); \
-  DLAF_DEFINE_GPUBLAS_OP(Name, float, ssy##f); \
-  DLAF_DEFINE_GPUBLAS_OP(Name, double, dsy##f); \
-  DLAF_DEFINE_GPUBLAS_OP(Name, std::complex<float>, che##f); \
-  DLAF_DEFINE_GPUBLAS_OP(Name, std::complex<double>, zhe##f)
-
-#elif defined(DLAF_WITH_CUDA)
-#define DLAF_MAKE_GPUBLAS_OP(Name, f) \
-  DLAF_DECLARE_GPUBLAS_OP(Name); \
-  DLAF_DEFINE_GPUBLAS_OP(Name, float, S##f); \
-  DLAF_DEFINE_GPUBLAS_OP(Name, double, D##f); \
-  DLAF_DEFINE_GPUBLAS_OP(Name, std::complex<float>, C##f); \
-  DLAF_DEFINE_GPUBLAS_OP(Name, std::complex<double>, Z##f)
-
-#define DLAF_MAKE_GPUBLAS_SYHE_OP(Name, f) \
-  DLAF_DECLARE_GPUBLAS_OP(Name); \
-  DLAF_DEFINE_GPUBLAS_OP(Name, float, Ssy##f); \
-  DLAF_DEFINE_GPUBLAS_OP(Name, double, Dsy##f); \
-  DLAF_DEFINE_GPUBLAS_OP(Name, std::complex<float>, Che##f); \
-  DLAF_DEFINE_GPUBLAS_OP(Name, std::complex<double>, Zhe##f)
-#endif
-
-namespace dlaf::gpublas::internal {
-
-// Level 1
-DLAF_MAKE_GPUBLAS_OP(Axpy, axpy);
-
-// Level 2
-DLAF_MAKE_GPUBLAS_OP(Gemv, gemv);
-
-DLAF_MAKE_GPUBLAS_OP(Trmv, trmv);
-
-// Level 3
-DLAF_MAKE_GPUBLAS_OP(Gemm, gemm);
-
-DLAF_MAKE_GPUBLAS_SYHE_OP(Hemm, mm);
-
-DLAF_MAKE_GPUBLAS_SYHE_OP(Her2k, r2k);
-
-DLAF_MAKE_GPUBLAS_SYHE_OP(Herk, rk);
-
-#if defined(DLAF_WITH_CUDA)
-DLAF_MAKE_GPUBLAS_OP(Trmm, trmm);
-#elif defined(DLAF_WITH_HIP)
-
-#if ROCBLAS_VERSION_MAJOR >= 3 && defined(ROCBLAS_V3)
-DLAF_MAKE_GPUBLAS_OP(Trmm, trmm);
-#else
-DLAF_MAKE_GPUBLAS_OP(Trmm, trmm_outofplace);
-#endif
-
-#endif
-
-DLAF_MAKE_GPUBLAS_OP(Trsm, trsm);
-}
 #endif
 
 namespace dlaf {
diff --git a/include/dlaf/factorization/qr/t_factor_impl.h b/include/dlaf/factorization/qr/t_factor_impl.h
index a4dfdd615b..af3222f0af 100644
--- a/include/dlaf/factorization/qr/t_factor_impl.h
+++ b/include/dlaf/factorization/qr/t_factor_impl.h
@@ -24,6 +24,7 @@
 #include
 #include
 #include
+#include <dlaf/lapack/gpu/larft.h>
 #include
 #include
 #include
@@ -172,28 +173,15 @@
     auto gemv_func = [](cublasHandle_t handle, const matrix::Tile<const T, Device::GPU>& tile_v,
                         const matrix::Tile<const T, Device::GPU>& taus,
                         matrix::Tile<T, Device::GPU>& tile_t) noexcept {
+      const SizeType m = tile_v.size().rows();
       const SizeType k = tile_t.size().cols();
       DLAF_ASSERT(tile_v.size().cols() == k, tile_v.size().cols(), k);
       DLAF_ASSERT(taus.size().rows() == k, taus.size().rows(), k);
       DLAF_ASSERT(taus.size().cols() == 1, taus.size().cols());
 
-      for (SizeType j = 0; j < k; ++j) {
-        // T(0:j, j) = -tau . V(j:, 0:j)* . V(j:, j)
-        // [j x 1]   =      [(n-j) x j]*  . [(n-j) x 1]
-        const TileElementIndex va_start{0, 0};
-        const TileElementIndex vb_start{0, j};
-        const TileElementSize va_size{tile_v.size().rows(), j};
-        const TileElementIndex t_start{0, j};
-        const auto neg_tau = util::blasToCublasCast(-taus({j, 0}));
-        const auto one = util::blasToCublasCast(T{1});
-
-        gpublas::internal::Gemv<T>::call(handle, CUBLAS_OP_C, to_int(va_size.rows()),
-                                         to_int(va_size.cols()), &neg_tau,
-                                         util::blasToCublasCast(tile_v.ptr(va_start)),
-                                         to_int(tile_v.ld()),
-                                         util::blasToCublasCast(tile_v.ptr(vb_start)), 1, &one,
-                                         util::blasToCublasCast(tile_t.ptr(t_start)), 1);
-      }
+      gpulapack::larft_gemv0(handle, m, k, tile_v.ptr(), tile_v.ld(), taus.ptr(), tile_t.ptr(),
+                             tile_t.ld());
 
       return std::move(tile_t);
     };
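For reference, a minimal scalar version of the operation that gemv_func now
delegates to larft_gemv0 (the names larft_gemv_ref and conj_val are
illustrative, not part of the patch; t is assumed zero-initialized,
column-major, with ldt >= k; the tau diagonal is filled in a separate step,
as in the kernels below):

    #include <complex>

    // Real values are their own conjugates; complex values go through std::conj.
    template <class T>
    T conj_val(T x) { return x; }
    template <class T>
    std::complex<T> conj_val(std::complex<T> x) { return std::conj(x); }

    // Strictly upper triangle of T: T(i, j) -= tau_j * conj(V(l, i)) * V(l, j).
    template <class T>
    void larft_gemv_ref(int m, int k, const T* v, int ldv, const T* tau, T* t, int ldt) {
      for (int j = 1; j < k; ++j)    // column 0 has no entries above the diagonal
        for (int i = 0; i < j; ++i)
          for (int l = 0; l < m; ++l)
            t[i + j * ldt] -= tau[j] * conj_val(v[l + i * ldv]) * v[l + j * ldv];
    }
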
diff --git a/include/dlaf/gpu/blas/gpublas.h b/include/dlaf/gpu/blas/gpublas.h
new file mode 100644
index 0000000000..18bb0dfa31
--- /dev/null
+++ b/include/dlaf/gpu/blas/gpublas.h
@@ -0,0 +1,147 @@
+//
+// Distributed Linear Algebra with Future (DLAF)
+//
+// Copyright (c) 2018-2024, ETH Zurich
+// All rights reserved.
+//
+// Please, refer to the LICENSE file in the root directory.
+// SPDX-License-Identifier: BSD-3-Clause
+//
+#pragma once
+
+/// @file
+/// Provides gpublas wrappers for BLAS operations.
+
+#ifdef DLAF_WITH_GPU
+#include <complex>
+#include <cstddef>
+
+#include <whip.hpp>
+
+#include <dlaf/gpu/blas/api.h>
+#include <dlaf/gpu/blas/error.h>
+#include <dlaf/util_math.h>
+
+#ifdef DLAF_WITH_HIP
+
+#include <pika/cuda.hpp>
+
+#include <dlaf/memory/memory_view.h>
+
+#define DLAF_GET_ROCBLAS_WORKSPACE(f) \
+  [&]() { \
+    std::size_t workspace_size; \
+    DLAF_GPUBLAS_CHECK_ERROR( \
+        rocblas_start_device_memory_size_query(static_cast<rocblas_handle>(handle))); \
+    DLAF_ROCBLAS_WORKSPACE_CHECK_ERROR(rocblas_##f(handle, std::forward<Args>(args)...)); \
+    DLAF_GPUBLAS_CHECK_ERROR(rocblas_stop_device_memory_size_query(static_cast<rocblas_handle>(handle), \
+                                                                   &workspace_size)); \
+    return ::dlaf::memory::MemoryView<std::byte, Device::GPU>(to_int(workspace_size)); \
+  }();
+
+namespace dlaf::tile::internal {
+inline void extendROCBlasWorkspace(cublasHandle_t handle,
+                                   ::dlaf::memory::MemoryView<std::byte, Device::GPU>&& workspace) {
+  whip::stream_t stream;
+  DLAF_GPUBLAS_CHECK_ERROR(cublasGetStream(handle, &stream));
+  auto f = [workspace = std::move(workspace)](whip::error_t status) { whip::check_error(status); };
+  pika::cuda::experimental::detail::add_event_callback(std::move(f), stream);
+}
+}
+
+#define DLAF_DEFINE_GPUBLAS_OP(Name, Type, f) \
+  template <> \
+  struct Name<Type> { \
+    template <class... Args> \
+    static void call(cublasHandle_t handle, Args&&... args) { \
+      auto workspace = DLAF_GET_ROCBLAS_WORKSPACE(f); \
+      DLAF_GPUBLAS_CHECK_ERROR(rocblas_set_workspace(static_cast<rocblas_handle>(handle), workspace(), \
+                                                     to_sizet(workspace.size()))); \
+      DLAF_GPUBLAS_CHECK_ERROR(rocblas_##f(handle, std::forward<Args>(args)...)); \
+      DLAF_GPUBLAS_CHECK_ERROR(rocblas_set_workspace(static_cast<rocblas_handle>(handle), nullptr, 0)); \
+      ::dlaf::tile::internal::extendROCBlasWorkspace(handle, std::move(workspace)); \
+    } \
+  }
+
+#elif defined(DLAF_WITH_CUDA)
+
+#define DLAF_DEFINE_GPUBLAS_OP(Name, Type, f) \
+  template <> \
+  struct Name<Type> { \
+    template <class... Args> \
+    static void call(Args&&... args) { \
+      DLAF_GPUBLAS_CHECK_ERROR(cublas##f##_v2(std::forward<Args>(args)...)); \
+    } \
+  }
+
+#endif
+
+#define DLAF_DECLARE_GPUBLAS_OP(Name) \
+  template <class T> \
+  struct Name
+
+#ifdef DLAF_WITH_HIP
+#define DLAF_MAKE_GPUBLAS_OP(Name, f) \
+  DLAF_DECLARE_GPUBLAS_OP(Name); \
+  DLAF_DEFINE_GPUBLAS_OP(Name, float, s##f); \
+  DLAF_DEFINE_GPUBLAS_OP(Name, double, d##f); \
+  DLAF_DEFINE_GPUBLAS_OP(Name, std::complex<float>, c##f); \
+  DLAF_DEFINE_GPUBLAS_OP(Name, std::complex<double>, z##f)
+
+#define DLAF_MAKE_GPUBLAS_SYHE_OP(Name, f) \
+  DLAF_DECLARE_GPUBLAS_OP(Name); \
+  DLAF_DEFINE_GPUBLAS_OP(Name, float, ssy##f); \
+  DLAF_DEFINE_GPUBLAS_OP(Name, double, dsy##f); \
+  DLAF_DEFINE_GPUBLAS_OP(Name, std::complex<float>, che##f); \
+  DLAF_DEFINE_GPUBLAS_OP(Name, std::complex<double>, zhe##f)
+
+#elif defined(DLAF_WITH_CUDA)
+#define DLAF_MAKE_GPUBLAS_OP(Name, f) \
+  DLAF_DECLARE_GPUBLAS_OP(Name); \
+  DLAF_DEFINE_GPUBLAS_OP(Name, float, S##f); \
+  DLAF_DEFINE_GPUBLAS_OP(Name, double, D##f); \
+  DLAF_DEFINE_GPUBLAS_OP(Name, std::complex<float>, C##f); \
+  DLAF_DEFINE_GPUBLAS_OP(Name, std::complex<double>, Z##f)
+
+#define DLAF_MAKE_GPUBLAS_SYHE_OP(Name, f) \
+  DLAF_DECLARE_GPUBLAS_OP(Name); \
+  DLAF_DEFINE_GPUBLAS_OP(Name, float, Ssy##f); \
+  DLAF_DEFINE_GPUBLAS_OP(Name, double, Dsy##f); \
+  DLAF_DEFINE_GPUBLAS_OP(Name, std::complex<float>, Che##f); \
+  DLAF_DEFINE_GPUBLAS_OP(Name, std::complex<double>, Zhe##f)
+#endif
+
+namespace dlaf::gpublas::internal {
+
+// Level 1
+DLAF_MAKE_GPUBLAS_OP(Axpy, axpy);
+
+// Level 2
+DLAF_MAKE_GPUBLAS_OP(Gemv, gemv);
+
+DLAF_MAKE_GPUBLAS_OP(Trmv, trmv);
+
+// Level 3
+DLAF_MAKE_GPUBLAS_OP(Gemm, gemm);
+
+DLAF_MAKE_GPUBLAS_SYHE_OP(Hemm, mm);
+
+DLAF_MAKE_GPUBLAS_SYHE_OP(Her2k, r2k);
+
+DLAF_MAKE_GPUBLAS_SYHE_OP(Herk, rk);
+
+#if defined(DLAF_WITH_CUDA)
+DLAF_MAKE_GPUBLAS_OP(Trmm, trmm);
+#elif defined(DLAF_WITH_HIP)
+
+#if ROCBLAS_VERSION_MAJOR >= 3 && defined(ROCBLAS_V3)
+DLAF_MAKE_GPUBLAS_OP(Trmm, trmm);
+#else
+DLAF_MAKE_GPUBLAS_OP(Trmm, trmm_outofplace);
+#endif
+
+#endif
+
+DLAF_MAKE_GPUBLAS_OP(Trsm, trsm);
+}
+#endif
diff --git a/include/dlaf/lapack/gpu/larft.h b/include/dlaf/lapack/gpu/larft.h
new file mode 100644
index 0000000000..5e08f59683
--- /dev/null
+++ b/include/dlaf/lapack/gpu/larft.h
@@ -0,0 +1,51 @@
+//
+// Distributed Linear Algebra with Future (DLAF)
+//
+// Copyright (c) 2018-2024, ETH Zurich
+// All rights reserved.
+//
+// Please, refer to the LICENSE file in the root directory.
+// SPDX-License-Identifier: BSD-3-Clause
+//
+
+#pragma once
+
+#ifdef DLAF_WITH_GPU
+
+#include <complex>
+#include <whip.hpp>
+
+#include <dlaf/gpu/blas/api.h>
+#include <dlaf/types.h>
+
+namespace dlaf::gpulapack {
+
+template <class T>
+void larft_gemv0(cublasHandle_t handle, const SizeType m, const SizeType k, const T* v,
+                 const SizeType ldv, const T* tau, T* t, const SizeType ldt);
+
+template <class T>
+void larft_gemv1_notau(cublasHandle_t handle, const SizeType m, const SizeType k, const T* v,
+                       const SizeType ldv, T* t, const SizeType ldt);
+
+template <class T>
+void larft_gemv1_fixtau(const SizeType k, const T* tau, const SizeType inctau, T* t,
+                        const SizeType ldt, whip::stream_t stream);
+
+#define DLAF_CUBLAS_LARFT_GEMV_ETI(kword, Type) \
+  kword template void larft_gemv0(cublasHandle_t handle, const SizeType m, const SizeType k, \
+                                  const Type* v, const SizeType ldv, const Type* tau, Type* t, \
+                                  const SizeType ldt); \
+  kword template void larft_gemv1_notau(cublasHandle_t handle, const SizeType m, const SizeType k, \
+                                        const Type* v, const SizeType ldv, Type* t, \
+                                        const SizeType ldt); \
+  kword template void larft_gemv1_fixtau(const SizeType k, const Type* tau, const SizeType inctau, \
+                                         Type* t, const SizeType ldt, whip::stream_t stream)
+
+DLAF_CUBLAS_LARFT_GEMV_ETI(extern, float);
+DLAF_CUBLAS_LARFT_GEMV_ETI(extern, double);
+DLAF_CUBLAS_LARFT_GEMV_ETI(extern, std::complex<float>);
+DLAF_CUBLAS_LARFT_GEMV_ETI(extern, std::complex<double>);
+
+}
+
+#endif
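A sketch of how the two device variants declared above are meant to be driven,
following the miniapp further down (the wrapper name t_factor_gemv_part is
illustrative; handle creation and allocation are omitted, t is assumed
zero-initialized, and stream is assumed to be the stream bound to handle):

    #include <whip.hpp>

    #include <dlaf/gpu/blas/api.h>
    #include <dlaf/lapack/gpu/larft.h>
    #include <dlaf/types.h>

    // Note the tau placement: variant 0 dereferences tau on the host, while
    // variant 1's fix-up kernel reads it on the device, so the miniapp keeps
    // a host copy (tau_h) and a device copy (tau_d).
    template <class T>
    void t_factor_gemv_part(cublasHandle_t handle, whip::stream_t stream, bool fast,
                            dlaf::SizeType m, dlaf::SizeType k, const T* v, dlaf::SizeType ldv,
                            const T* tau_h, const T* tau_d, T* t, dlaf::SizeType ldt) {
      if (!fast) {
        // Variant 0: k dependent GEMVs; tau still has to be copied onto the
        // diagonal of t afterwards (the miniapp uses a strided lacpy for that).
        dlaf::gpulapack::larft_gemv0(handle, m, k, v, ldv, tau_h, t, ldt);
      }
      else {
        // Variant 1: one large GEMM forming V^H V, then a fix-up kernel that
        // zeroes the lower triangle, writes tau on the diagonal, and scales
        // the strictly upper triangle by -tau_j.
        dlaf::gpulapack::larft_gemv1_notau(handle, m, k, v, ldv, t, ldt);
        dlaf::gpulapack::larft_gemv1_fixtau(k, tau_d, 1, t, ldt, stream);
      }
    }
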
diff --git a/miniapp/include/dlaf/miniapp/kernel_runner.h b/miniapp/include/dlaf/miniapp/kernel_runner.h
index 1223f344ea..9ed19007ae 100644
--- a/miniapp/include/dlaf/miniapp/kernel_runner.h
+++ b/miniapp/include/dlaf/miniapp/kernel_runner.h
@@ -10,6 +10,7 @@
 
 #pragma once
 
+#include
 #include
 
 #ifdef DLAF_WITH_GPU
diff --git a/miniapp/kernel/CMakeLists.txt b/miniapp/kernel/CMakeLists.txt
index d573ff32bd..9e0c8b39c9 100644
--- a/miniapp/kernel/CMakeLists.txt
+++ b/miniapp/kernel/CMakeLists.txt
@@ -10,5 +10,9 @@
 if(DLAF_BUILD_TESTING)
   # TODO they depend on DLAF_TEST exclusively for the createTile method.
+  DLAF_addMiniapp(
+    miniapp_larft_gemv SOURCES miniapp_larft_gemv.cpp LIBRARIES dlaf.core DLAF_test DLAF_miniapp
+  )
+
   DLAF_addMiniapp(miniapp_laset SOURCES miniapp_laset.cpp LIBRARIES dlaf.core DLAF_test)
 endif()
diff --git a/miniapp/kernel/miniapp_larft_gemv.cpp b/miniapp/kernel/miniapp_larft_gemv.cpp
new file mode 100644
index 0000000000..0a1a919503
--- /dev/null
+++ b/miniapp/kernel/miniapp_larft_gemv.cpp
@@ -0,0 +1,227 @@
+//
+// Distributed Linear Algebra with Future (DLAF)
+//
+// Copyright (c) 2018-2024, ETH Zurich
+// All rights reserved.
+//
+// Please, refer to the LICENSE file in the root directory.
+// SPDX-License-Identifier: BSD-3-Clause
+//
+
+#include <iostream>
+
+#include <blas.hh>
+#include <pika/program_options.hpp>
+#include <whip.hpp>
+#include <dlaf/common/format_short.h>
+#include <dlaf/lapack/gpu/lacpy.h>
+#include <dlaf/lapack/gpu/larft.h>
+#include <dlaf/matrix/util_matrix.h>
+#include <dlaf/miniapp/dispatch.h>
+#include <dlaf/miniapp/kernel_runner.h>
+#include <dlaf/miniapp/options.h>
+#include <dlaf/types.h>
+#include <dlaf/util_math.h>
+
+#include <dlaf_test/matrix/util_tile.h>
+
+using namespace dlaf;
+using namespace dlaf::miniapp;
+using dlaf::matrix::test::createTile;
+using dlaf::matrix::util::internal::getter_random;
+
+struct Options : MiniappKernelOptions<SupportReal::Yes, SupportComplex::Yes> {
+  SizeType m;
+  SizeType n;
+  SizeType k;
+  SizeType ldv;
+  SizeType ldt;
+  SizeType kernel_id;
+
+  Options(const pika::program_options::variables_map& vm)
+      : MiniappKernelOptions(vm), m(vm["m"].as<SizeType>()), n(vm["n"].as<SizeType>()),
+        k(vm["k"].as<SizeType>()), ldv(vm["ldv"].as<SizeType>()), ldt(vm["ldt"].as<SizeType>()),
+        kernel_id(vm["kernel_id"].as<SizeType>()) {
+    DLAF_ASSERT(m > 0, m);
+    DLAF_ASSERT(n > 0, n);
+    DLAF_ASSERT(k > 0, k);
+    // Limit the benchmarks to relevant cases.
+    DLAF_ASSERT(k <= n, k, n);
+    DLAF_ASSERT(ldv >= m && ldv > 0, ldv, m);
+    DLAF_ASSERT(ldt >= k && ldt > 0, ldt, k);
+  }
+
+  Options(Options&&) = default;
+  Options(const Options&) = default;
+  Options& operator=(Options&&) = default;
+  Options& operator=(const Options&) = default;
+};
+
+template <class T>
+double ops(const double m, const double k) {
+  if (m == 0)
+    return 0;
+  double add_mul = (m + 1) * k * (k - 1) / 2;
+  return dlaf::total_ops<T>(add_mul, add_mul);
+}
+
+template <class T>
+void MC_reference(const SizeType m, const SizeType k, const T* v, const SizeType ldv, const T* tau,
+                  T* t, const SizeType ldt) {
+  for (int j = 1; j < k; ++j) {
+    auto v_ = [v, ldv](SizeType i, SizeType j) { return v + i + j * ldv; };
+    auto t_ = [t, ldt](SizeType i, SizeType j) { return t + i + j * ldt; };
+    blas::gemv(blas::Layout::ColMajor, blas::Op::ConjTrans, m, j, -tau[j], v_(0, 0), ldv, v_(0, j),
+               1, T{1}, t_(0, j), 1);
+  }
+  for (int j = 0; j < k; ++j) {
+    t[j * (ldt + 1)] = tau[j];
+  }
+}
+struct Test {
+  template <Backend backend, class T>
+  static void run(const Options& opts) {
+    constexpr Device device = DefaultDevice_v<backend>;
+
+    const SizeType m = opts.m;
+    const SizeType n = opts.n;
+    const SizeType k = opts.k;
+    const SizeType ldv = opts.ldv;
+    const SizeType ldt = opts.ldt;
+
+    getter_random<T> random_value(25698);
+    auto rnd = [&random_value](const TileElementIndex&) { return random_value(); };
+    auto tau = createTile<T>(rnd, {k, 1}, k);
+    auto v = createTile<T>(rnd, {m, k}, ldv);
+
+    // As the kernels need the T-factor tile to be zeroed, we start from there.
+    auto el_t = [](const TileElementIndex&) { return T{0.}; };
+
+    WorkTiles<T, device> vs(opts.count, m, k, ldv);
+    WorkTiles<T, device> ts(opts.count, k, k, ldt);
+    WorkTiles<T, device> taus(opts.count, k, 1, k);
+
+    vs.setElementsFromTile(v);
+    ts.setElements(el_t);
+
+    [[maybe_unused]] auto kernel_MC = [m, k, &vs, &tau, &ts](SizeType i) {
+      MC_reference(m, k, vs(i).ptr(), vs(i).ld(), tau.ptr(), ts(i).ptr(), ts(i).ld());
+    };
+#ifdef DLAF_WITH_CUDA
+    [[maybe_unused]] auto kernel_GPU0 = [m, k, &vs, &tau, &ts](SizeType i, cublasHandle_t handle) {
+      gpulapack::larft_gemv0(handle, m, k, vs(i).ptr(), vs(i).ld(), tau.ptr(), ts(i).ptr(),
+                             ts(i).ld());
+    };
+
+    [[maybe_unused]] auto copy_tau_in_t = [k, &tau, &ts](SizeType i, whip::stream_t stream) {
+      gpulapack::lacpy(blas::Uplo::General, 1, k, tau.ptr(), 1, ts(i).ptr(), ts(i).ld() + 1, stream);
+    };
+
+    [[maybe_unused]] auto copy_tau = [k, &tau, &taus](SizeType i, whip::stream_t stream) {
+      gpulapack::lacpy(blas::Uplo::General, k, 1, tau.ptr(), tau.ld(), taus(i).ptr(), taus(i).ld(),
+                       stream);
+    };
+
+    [[maybe_unused]] auto kernel_GPU1 = [m, k, &vs, &ts](SizeType i, cublasHandle_t handle) {
+      gpulapack::larft_gemv1_notau(handle, m, k, vs(i).ptr(), vs(i).ld(), ts(i).ptr(), ts(i).ld());
+    };
+
+    [[maybe_unused]] auto post_kernel_GPU1 = [m, k, &taus, &ts](SizeType i, whip::stream_t stream) {
+      gpulapack::larft_gemv1_fixtau(k, taus(i).ptr(), 1, ts(i).ptr(), ts(i).ld(), stream);
+    };
+#endif
+    const double flop = ops<T>(n, k);
+
+    KernelRunner<backend> runner(opts.count, opts.nparallel);
+
+    for (SizeType run_index = 0; run_index < opts.nruns; ++run_index) {
+      double elapsed_time_pre = 0;
+      double elapsed_time_kernel = 0;
+      double elapsed_time_post = 0;
+      ts.setElements(el_t);
+
+      if constexpr (backend == Backend::MC) {
+        elapsed_time_kernel = runner.run(kernel_MC);
+      }
+#ifdef DLAF_WITH_CUDA
+      if constexpr (backend == Backend::GPU) {
+        switch (opts.kernel_id) {
+          case 0:
+            elapsed_time_kernel = runner.runHandle(kernel_GPU0);
+            elapsed_time_post = runner.runStream(copy_tau_in_t);
+            break;
+          case 1:
+            elapsed_time_pre = runner.runStream(copy_tau);
+            elapsed_time_kernel = runner.runHandle(kernel_GPU1);
+            elapsed_time_post = runner.runStream(post_kernel_GPU1);
+            break;
+          default:
+            std::cout << "Error: Nonexistent kernel id " << opts.kernel_id << std::endl;
+            DLAF_UNREACHABLE_PLAIN;
+        }
+      }
+#endif
+
+      double elapsed_time = elapsed_time_pre + elapsed_time_kernel + elapsed_time_post;
+      const double gflops = flop / elapsed_time / 1e9;
+      const double gflops_kernel = flop / elapsed_time_kernel / 1e9;
+
+      std::cout << "[" << run_index << "]"
+                << " " << elapsed_time << "s"
+                << " " << gflops << "GFlop/s"
+                << " " << dlaf::internal::FormatShort{opts.type} << " ";
+      std::cout << m << " " << k << " " << ldv << " " << ldt << " " << opts.nparallel << " "
+                << backend;
+      if (backend == Backend::GPU)
+        std::cout << " " << opts.kernel_id;
+
+      std::cout << " |"
+                << " PRE: " << elapsed_time_pre << "s"
+                << " KERNEL: " << elapsed_time_kernel << "s " << gflops_kernel << "GFlop/s"
+                << " POST: " << elapsed_time_post << "s" << std::endl;
+
+      if ((opts.do_check == dlaf::miniapp::CheckIterFreq::Last && run_index == (opts.nruns - 1)) ||
+          opts.do_check == dlaf::miniapp::CheckIterFreq::All) {
+        auto t = createTile<T>(el_t, {k, k}, k);
+        MC_reference(m, k, v.ptr(), v.ld(), tau.ptr(), t.ptr(), t.ld());
+
+        auto error = ts.check(t);
+        if (error > k)
+          std::cout << "CHECK FAILED!!!: ";
+
+        std::cout << "| res - ref | / | res | / eps: " << error << std::endl;
+      }
+    }
+  }
+};
+
+int main(int argc, char** argv) {
+  // options
+  using namespace pika::program_options;
+  options_description desc_commandline("Usage: miniapp_larft_gemv [options]");
+  desc_commandline.add(getMiniappKernelOptionsDescription());
+
+  // clang-format off
+  desc_commandline.add_options()
+    ("m",         value<SizeType>() ->default_value(512), "Tile row size")
+    ("n",         value<SizeType>() ->default_value(256), "Tile row size")
+    ("k",         value<SizeType>() ->default_value(256), "Tile col size")
+    ("ldv",       value<SizeType>() ->default_value(512), "Tile leading dimension")
+    ("ldt",       value<SizeType>() ->default_value(512), "Tile leading dimension")
+    ("kernel_id", value<SizeType>() ->default_value(1),   "GPU kernel id")
+  ;
+  // clang-format on
+  addUploOption(desc_commandline);
+
+  variables_map vm;
+  store(parse_command_line(argc, argv, desc_commandline), vm);
+  notify(vm);
+  if (vm.count("help")) {
+    std::cout << desc_commandline << "\n";
+    return 1;
+  }
+  Options options(vm);
+
+  dispatchMiniapp<Test>(options);
+
+  return 0;
+}
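A typical invocation of the new benchmark would look like

    miniapp_larft_gemv --m 512 --n 256 --k 256 --ldv 512 --ldt 512 --kernel_id 1

where kernel_id selects the GEMV-based variant (0) or the GEMM-plus-fix-up
variant (1). The shared options from MiniappKernelOptions (run count,
parallelism, backend, check frequency) keep their usual names; those flag
names are not spelled out in this patch.
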
diff --git a/miniapp/kernel/miniapp_laset.cpp b/miniapp/kernel/miniapp_laset.cpp
index e640d1e5e5..7e66df7935 100644
--- a/miniapp/kernel/miniapp_laset.cpp
+++ b/miniapp/kernel/miniapp_laset.cpp
@@ -8,7 +8,6 @@
 // SPDX-License-Identifier: BSD-3-Clause
 //
 
-#include
 #include
 #include
 #include
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 4af59a4e25..e26a290731 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -214,6 +214,7 @@ DLAF_addSublibrary(
     memory/memory_chunk.cpp
     tune.cpp
   GPU_SOURCES cusolver/assert_info.cu lapack/gpu/add.cu lapack/gpu/lacpy.cu lapack/gpu/laset.cu
+    lapack/gpu/larft.cu
   COMPILE_OPTIONS
     $<$<COMPILE_LANGUAGE:CUDA>:--extended-lambda>
 )
diff --git a/src/lapack/gpu/larft.cu b/src/lapack/gpu/larft.cu
new file mode 100644
index 0000000000..aa34d7e9c6
--- /dev/null
+++ b/src/lapack/gpu/larft.cu
@@ -0,0 +1,130 @@
+//
+// Distributed Linear Algebra with Future (DLAF)
+//
+// Copyright (c) 2018-2024, ETH Zurich
+// All rights reserved.
+//
+// Please, refer to the LICENSE file in the root directory.
+// SPDX-License-Identifier: BSD-3-Clause
+//
+
+#include <whip.hpp>
+
+#include <dlaf/gpu/assert.cu.h>
+#include <dlaf/gpu/blas/api.h>
+#include <dlaf/gpu/blas/gpublas.h>
+#include <dlaf/lapack/gpu/larft.h>
+#include <dlaf/types.h>
+#include <dlaf/util_cublas.h>
+#include <dlaf/util_cuda.h>
+#include <dlaf/util_math.h>
+
+namespace dlaf::gpulapack {
+namespace kernels {
+
+using namespace dlaf::util::cuda_operators;
+
+template <unsigned kts_rows_t, unsigned kts_cols_t, class T>
+__global__ void fix_tau(const unsigned k, const T* tau, unsigned inctau, T* t, unsigned ldt) {
+  DLAF_GPU_ASSERT_HEAVY(kts_rows_t == blockDim.x);
+  DLAF_GPU_ASSERT_HEAVY(kts_cols_t == blockDim.y);
+  DLAF_GPU_ASSERT_HEAVY(1 == blockDim.z);
+  DLAF_GPU_ASSERT_HEAVY(ceilDiv(k, kts_rows_t) == gridDim.x);
+  DLAF_GPU_ASSERT_HEAVY(ceilDiv(k, kts_cols_t) == gridDim.y);
+  DLAF_GPU_ASSERT_HEAVY(1 == gridDim.z);
+
+  const unsigned i = blockIdx.x * kts_rows_t + threadIdx.x;
+  const unsigned j = blockIdx.y * kts_cols_t + threadIdx.y;
+
+  // quick return if outside of t.
+  if (i >= k || j >= k)
+    return;
+
+  T& t_ij = t[i + j * ldt];
+  T tau_j = tau[j * inctau];
+  if (i > j)
+    t_ij = {};
+  else if (i == j)
+    t_ij = tau_j;
+  else
+    t_ij = -tau_j * t_ij;
+}
+}
+
+template <class T>
+void larft_gemv0(cublasHandle_t handle, const SizeType m, const SizeType k, const T* v,
+                 const SizeType ldv, const T* tau, T* t, const SizeType ldt) {
+  DLAF_ASSERT(m >= 0, m);
+  DLAF_ASSERT(k >= 0, k);
+  DLAF_ASSERT(ldv >= m, ldv, m);
+  DLAF_ASSERT(ldt >= k, ldt, k);
+
+  const int m_ = to_int(m);
+  const int k_ = to_int(k);
+  const int ldv_ = to_int(ldv);
+  const int ldt_ = to_int(ldt);
+
+  auto v_ = [v, ldv_](int i, int j) { return util::blasToCublasCast(v + i + j * ldv_); };
+  auto t_ = [t, ldt_](int i, int j) { return util::blasToCublasCast(t + i + j * ldt_); };
+
+  for (int j = 1; j < k_; ++j) {
+    const auto mtau = util::blasToCublasCast(-tau[j]);
+    const auto one = util::blasToCublasCast(T{1});
+    gpublas::internal::Gemv<T>::call(handle, CUBLAS_OP_C, m_, j, &mtau, v_(0, 0), ldv_, v_(0, j), 1,
+                                     &one, t_(0, j), 1);
+  }
+}
+
+template <class T>
+void larft_gemv1_notau(cublasHandle_t handle, const SizeType m, const SizeType k, const T* v,
+                       const SizeType ldv, T* t, const SizeType ldt) {
+  DLAF_ASSERT(m >= 0, m);
+  DLAF_ASSERT(k >= 0, k);
+  DLAF_ASSERT(ldv >= m, ldv, m);
+  DLAF_ASSERT(ldt >= k, ldt, k);
+
+  const int m_ = to_int(m);
+  const int k_ = to_int(k);
+  const int ldv_ = to_int(ldv);
+  const int ldt_ = to_int(ldt);
+
+  auto v_ = util::blasToCublasCast(v);
+  auto t_ = util::blasToCublasCast(t);
+
+  const auto one = util::blasToCublasCast(T{1});
+  gpublas::internal::Gemm<T>::call(handle, CUBLAS_OP_C, CUBLAS_OP_N, k_, k_, m_, &one, v_, ldv_, v_,
+                                   ldv_, &one, t_, ldt_);
+}
+
+template <class T>
+void larft_gemv1_fixtau(const SizeType k, const T* tau, const SizeType inctau, T* t,
+                        const SizeType ldt, whip::stream_t stream) {
+  constexpr unsigned kts_rows_t = 32;
+  constexpr unsigned kts_cols_t = 32;
+
+  DLAF_ASSERT(k >= 0, k);
+  DLAF_ASSERT(ldt >= k, ldt, k);
+  DLAF_ASSERT(inctau >= 1, inctau);
+
+  if (k == 0)
+    return;
+
+  const unsigned uk = to_uint(k);
+  const unsigned uinctau = to_uint(inctau);
+  const unsigned uldt = to_uint(ldt);
+
+  dim3 nr_threads(kts_rows_t, kts_cols_t);
+  dim3 nr_blocks(util::ceilDiv(uk, kts_rows_t), util::ceilDiv(uk, kts_cols_t));
+
+  auto tau_ = util::blasToCublasCast(tau);
+  auto t_ = util::blasToCublasCast(t);
+  kernels::fix_tau<kts_rows_t, kts_cols_t>
+      <<<nr_blocks, nr_threads, 0, stream>>>(uk, tau_, uinctau, t_, uldt);
+}
+
+DLAF_CUBLAS_LARFT_GEMV_ETI(, float);
+DLAF_CUBLAS_LARFT_GEMV_ETI(, double);
+DLAF_CUBLAS_LARFT_GEMV_ETI(, std::complex<float>);
+DLAF_CUBLAS_LARFT_GEMV_ETI(, std::complex<double>);
+
+}
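The trade-off implemented above, counted in multiply-add pairs per tile (the
same count as the miniapp's ops(), up to the (m+1) cross term):

    W_{\mathrm{gemv0}} = \sum_{j=1}^{k-1} m\,j \approx \tfrac{1}{2}\, m k^2
        \quad\text{(k dependent GEMV calls)},
    \qquad
    W_{\mathrm{gemv1}} = m k^2
        \quad\text{(one GEMM forming } V^H V\text{)}.

larft_gemv1 performs roughly twice the arithmetic, but as a single large GEMM
instead of k skinny GEMVs, which is the "faster option" of the title. fix_tau
then repairs the k x k GEMM result in O(k^2) work:

    t_{ij} \leftarrow
    \begin{cases}
      0                  & i > j, \\
      \tau_j             & i = j, \\
      -\tau_j\, t_{ij}   & i < j.
    \end{cases}
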
diff --git a/src/lapack/gpu/laset.cu b/src/lapack/gpu/laset.cu
index dea96572d9..c0e68ca1d3 100644
--- a/src/lapack/gpu/laset.cu
+++ b/src/lapack/gpu/laset.cu
@@ -12,6 +12,7 @@
 #include
 #include
+#include
 #include
 #include