From 66649125b8192c2b4c83064fd4adb9ce6b384987 Mon Sep 17 00:00:00 2001 From: Doc CI Action Date: Mon, 25 Nov 2024 18:53:32 +0000 Subject: [PATCH] Doc: TFactor: Separate larft_gemv kernel and add a faster option (#1219) --- master/blas_2tile_8h.html | 56 +- master/blas_2tile_8h_source.html | 689 ++++++++---------- .../dir_2e3e3bc658385778082583f320919a4c.html | 2 + .../dir_5ca20fc8e29b0f8739133582ef745158.html | 2 + master/factorization_2qr_2api_8h_source.html | 2 +- master/files.html | 4 +- master/gpublas_8h.html | 152 ++++ master/gpublas_8h_source.html | 237 ++++++ master/larft_8h_source.html | 142 ++++ master/qr_8h_source.html | 2 +- master/search/all_6.js | 11 +- master/search/files_6.js | 3 +- master/t__factor__impl_8h_source.html | 660 ++++++++--------- 13 files changed, 1158 insertions(+), 804 deletions(-) create mode 100644 master/gpublas_8h.html create mode 100644 master/gpublas_8h_source.html create mode 100644 master/larft_8h_source.html diff --git a/master/blas_2tile_8h.html b/master/blas_2tile_8h.html index ddf21eabce..2c4bf0c428 100644 --- a/master/blas_2tile_8h.html +++ b/master/blas_2tile_8h.html @@ -74,7 +74,6 @@
tile.h File Reference
@@ -91,44 +90,14 @@ #include <dlaf/sender/transform.h>
#include <dlaf/types.h>
#include <dlaf/util_blas.h>
-#include <whip.hpp>
#include <dlaf/gpu/blas/api.h>
-#include <dlaf/gpu/blas/error.h>
+#include <dlaf/gpu/blas/gpublas.h>
#include <dlaf/util_cublas.h>

Go to the source code of this file.

- - - -

-Macros

#define DLAF_DECLARE_GPUBLAS_OP(Name)
 
- - - - - - - - - - - - - - - - @@ -195,28 +164,7 @@

Functions

dlaf::gpublas::internal::DLAF_MAKE_GPUBLAS_OP (Axpy, axpy)
 
dlaf::gpublas::internal::DLAF_MAKE_GPUBLAS_OP (Gemv, gemv)
 
dlaf::gpublas::internal::DLAF_MAKE_GPUBLAS_OP (Trmv, trmv)
 
dlaf::gpublas::internal::DLAF_MAKE_GPUBLAS_OP (Gemm, gemm)
 
dlaf::gpublas::internal::DLAF_MAKE_GPUBLAS_SYHE_OP (Hemm, mm)
 
dlaf::gpublas::internal::DLAF_MAKE_GPUBLAS_SYHE_OP (Her2k, r2k)
 
dlaf::gpublas::internal::DLAF_MAKE_GPUBLAS_SYHE_OP (Herk, rk)
 
dlaf::gpublas::internal::DLAF_MAKE_GPUBLAS_OP (Trsm, trsm)
 
template<Backend B, class T , Device D>
void dlaf::tile::gemm (const blas::Op op_a, const blas::Op op_b, const T alpha, const Tile< const T, D > &a, const Tile< const T, D > &b, const T beta, const Tile< T, D > &c)
 

Detailed Description

Provides Tile wrappers for BLAS operations.

-

Macro Definition Documentation

- -

◆ DLAF_DECLARE_GPUBLAS_OP

- -
-
- - - - - - - - -
#define DLAF_DECLARE_GPUBLAS_OP( Name)
-
-Value:
template <typename T> \
-
struct Name
-
-
-
-

Function Documentation

+

Function Documentation

◆ gemm() [1/3]

diff --git a/master/blas_2tile_8h_source.html b/master/blas_2tile_8h_source.html index d02fe35bed..24944242af 100644 --- a/master/blas_2tile_8h_source.html +++ b/master/blas_2tile_8h_source.html @@ -109,420 +109,299 @@
28#include <dlaf/util_blas.h>
29
30#ifdef DLAF_WITH_GPU
-
31#include <whip.hpp>
-
32
-
33#include <dlaf/gpu/blas/api.h>
-
34#include <dlaf/gpu/blas/error.h>
-
35#include <dlaf/util_cublas.h>
-
36
-
37#ifdef DLAF_WITH_HIP
-
38
-
39#define DLAF_GET_ROCBLAS_WORKSPACE(f) \
-
40 [&]() { \
-
41 std::size_t workspace_size; \
-
42 DLAF_GPUBLAS_CHECK_ERROR( \
-
43 rocblas_start_device_memory_size_query(static_cast<rocblas_handle>(handle))); \
-
44 DLAF_ROCBLAS_WORKSPACE_CHECK_ERROR(rocblas_##f(handle, std::forward<Args>(args)...)); \
-
45 DLAF_GPUBLAS_CHECK_ERROR(rocblas_stop_device_memory_size_query(static_cast<rocblas_handle>(handle), \
-
46 &workspace_size)); \
-
47 return ::dlaf::memory::MemoryView<std::byte, Device::GPU>(to_int(workspace_size)); \
-
48 }();
-
49
-
50namespace dlaf::tile::internal {
-
51inline void extendROCBlasWorkspace(cublasHandle_t handle,
-
52 ::dlaf::memory::MemoryView<std::byte, Device::GPU>&& workspace) {
-
53 whip::stream_t stream;
-
54 DLAF_GPUBLAS_CHECK_ERROR(cublasGetStream(handle, &stream));
-
55 auto f = [workspace = std::move(workspace)](whip::error_t status) { whip::check_error(status); };
-
56 pika::cuda::experimental::detail::add_event_callback(std::move(f), stream);
-
57}
-
58}
+
31#include <dlaf/gpu/blas/api.h>
+
32#include <dlaf/gpu/blas/gpublas.h>
+
33#include <dlaf/util_cublas.h>
+
34#endif
+
35
+
36namespace dlaf {
+
37namespace tile {
+
38using matrix::Tile;
+
39
+
40#ifdef DLAF_DOXYGEN
+
41
+
45template <Backend B, class T, Device D>
+
46void gemm(const blas::Op op_a, const blas::Op op_b, const T alpha, const Tile<const T, D>& a,
+
47 const Tile<const T, D>& b, const T beta, const Tile<T, D>& c);
+
48
+
51template <Backend B, typename Sender,
+
52 typename = std::enable_if_t<pika::execution::experimental::is_sender_v<Sender>>>
+
53auto gemm(const dlaf::internal::Policy<B>& p, Sender&& s);
+
54
+
57template <Backend B>
+
58auto gemm(const dlaf::internal::Policy<B>& p);
59
-
60#define DLAF_DEFINE_GPUBLAS_OP(Name, Type, f) \
-
61 template <> \
-
62 struct Name<Type> { \
-
63 template <typename... Args> \
-
64 static void call(cublasHandle_t handle, Args&&... args) { \
-
65 auto workspace = DLAF_GET_ROCBLAS_WORKSPACE(f); \
-
66 DLAF_GPUBLAS_CHECK_ERROR(rocblas_set_workspace(static_cast<rocblas_handle>(handle), workspace(), \
-
67 to_sizet(workspace.size()))); \
-
68 DLAF_GPUBLAS_CHECK_ERROR(rocblas_##f(handle, std::forward<Args>(args)...)); \
-
69 DLAF_GPUBLAS_CHECK_ERROR(rocblas_set_workspace(static_cast<rocblas_handle>(handle), nullptr, 0)); \
-
70 ::dlaf::tile::internal::extendROCBlasWorkspace(handle, std::move(workspace)); \
-
71 } \
-
72 }
-
73
-
74#elif defined(DLAF_WITH_CUDA)
-
75
-
76#define DLAF_DEFINE_GPUBLAS_OP(Name, Type, f) \
-
77 template <> \
-
78 struct Name<Type> { \
-
79 template <typename... Args> \
-
80 static void call(Args&&... args) { \
-
81 DLAF_GPUBLAS_CHECK_ERROR(cublas##f##_v2(std::forward<Args>(args)...)); \
-
82 } \
-
83 }
+
63template <Backend B, class T, Device D>
+
64void hemm(const blas::Side side, const blas::Uplo uplo, const T alpha, const Tile<const T, D>& a,
+
65 const Tile<const T, D>& b, const T beta, const Tile<T, D>& c);
+
66
+
69template <Backend B, typename Sender,
+
70 typename = std::enable_if_t<pika::execution::experimental::is_sender_v<Sender>>>
+
71auto hemm(const dlaf::internal::Policy<B>& p, Sender&& s);
+
72
+
75template <Backend B>
+
76auto hemm(const dlaf::internal::Policy<B>& p);
+
77
+
81template <Backend B, class T, Device D>
+
82void her2k(const blas::Uplo uplo, const blas::Op op, const T alpha, const Tile<const T, D>& a,
+
83 const Tile<const T, D>& b, const BaseType<T> beta, const Tile<T, D>& c);
84
-
85#endif
-
86
-
87#define DLAF_DECLARE_GPUBLAS_OP(Name) \
-
88 template <typename T> \
-
89 struct Name
+
87template <Backend B, typename Sender,
+
88 typename = std::enable_if_t<pika::execution::experimental::is_sender_v<Sender>>>
+
89auto her2k(const dlaf::internal::Policy<B>& p, Sender&& s);
90
-
91#ifdef DLAF_WITH_HIP
-
92#define DLAF_MAKE_GPUBLAS_OP(Name, f) \
-
93 DLAF_DECLARE_GPUBLAS_OP(Name); \
-
94 DLAF_DEFINE_GPUBLAS_OP(Name, float, s##f); \
-
95 DLAF_DEFINE_GPUBLAS_OP(Name, double, d##f); \
-
96 DLAF_DEFINE_GPUBLAS_OP(Name, std::complex<float>, c##f); \
-
97 DLAF_DEFINE_GPUBLAS_OP(Name, std::complex<double>, z##f)
-
98
-
99#define DLAF_MAKE_GPUBLAS_SYHE_OP(Name, f) \
-
100 DLAF_DECLARE_GPUBLAS_OP(Name); \
-
101 DLAF_DEFINE_GPUBLAS_OP(Name, float, ssy##f); \
-
102 DLAF_DEFINE_GPUBLAS_OP(Name, double, dsy##f); \
-
103 DLAF_DEFINE_GPUBLAS_OP(Name, std::complex<float>, che##f); \
-
104 DLAF_DEFINE_GPUBLAS_OP(Name, std::complex<double>, zhe##f)
-
105
-
106#elif defined(DLAF_WITH_CUDA)
-
107#define DLAF_MAKE_GPUBLAS_OP(Name, f) \
-
108 DLAF_DECLARE_GPUBLAS_OP(Name); \
-
109 DLAF_DEFINE_GPUBLAS_OP(Name, float, S##f); \
-
110 DLAF_DEFINE_GPUBLAS_OP(Name, double, D##f); \
-
111 DLAF_DEFINE_GPUBLAS_OP(Name, std::complex<float>, C##f); \
-
112 DLAF_DEFINE_GPUBLAS_OP(Name, std::complex<double>, Z##f)
+
93template <Backend B>
+
94auto her2k(const dlaf::internal::Policy<B>& p);
+
95
+
99template <Backend B, class T, Device D>
+
100void herk(const blas::Uplo uplo, const blas::Op op, const BaseType<T> alpha, const Tile<const T, D>& a,
+
101 const BaseType<T> beta, const Tile<T, D>& c);
+
102
+
105template <Backend B, typename Sender,
+
106 typename = std::enable_if_t<pika::execution::experimental::is_sender_v<Sender>>>
+
107auto herk(const dlaf::internal::Policy<B>& p, Sender&& s);
+
108
+
111template <Backend B>
+
112auto herk(const dlaf::internal::Policy<B>& p);
113
-
114#define DLAF_MAKE_GPUBLAS_SYHE_OP(Name, f) \
-
115 DLAF_DECLARE_GPUBLAS_OP(Name); \
-
116 DLAF_DEFINE_GPUBLAS_OP(Name, float, Ssy##f); \
-
117 DLAF_DEFINE_GPUBLAS_OP(Name, double, Dsy##f); \
-
118 DLAF_DEFINE_GPUBLAS_OP(Name, std::complex<float>, Che##f); \
-
119 DLAF_DEFINE_GPUBLAS_OP(Name, std::complex<double>, Zhe##f)
-
120#endif
+
117template <Backend B, class T, Device D>
+
118void trmm(const dlaf::internal::Policy<B>& policy, const blas::Side side, const blas::Uplo uplo,
+
119 const blas::Op op, const blas::Diag diag, const T alpha, const Tile<const T, D>& a,
+
120 const Tile<T, D>& b);
121
-
122namespace dlaf::gpublas::internal {
-
123
-
124// Level 1
-
125DLAF_MAKE_GPUBLAS_OP(Axpy, axpy);
-
126
-
127// Level 2
-
128DLAF_MAKE_GPUBLAS_OP(Gemv, gemv);
-
129
-
130DLAF_MAKE_GPUBLAS_OP(Trmv, trmv);
-
131
-
132// Level 3
-
133DLAF_MAKE_GPUBLAS_OP(Gemm, gemm);
-
134
-
135DLAF_MAKE_GPUBLAS_SYHE_OP(Hemm, mm);
-
136
-
137DLAF_MAKE_GPUBLAS_SYHE_OP(Her2k, r2k);
-
138
-
139DLAF_MAKE_GPUBLAS_SYHE_OP(Herk, rk);
-
140
-
141#if defined(DLAF_WITH_CUDA)
-
142DLAF_MAKE_GPUBLAS_OP(Trmm, trmm);
-
143#elif defined(DLAF_WITH_HIP)
-
144
-
145#if ROCBLAS_VERSION_MAJOR >= 3 && defined(ROCBLAS_V3)
-
146DLAF_MAKE_GPUBLAS_OP(Trmm, trmm);
-
147#else
-
148DLAF_MAKE_GPUBLAS_OP(Trmm, trmm_outofplace);
-
149#endif
-
150
-
151#endif
+
124template <Backend B, typename Sender,
+
125 typename = std::enable_if_t<pika::execution::experimental::is_sender_v<Sender>>>
+
126auto trmm(const dlaf::internal::Policy<B>& p, Sender&& s);
+
127
+
130template <Backend B>
+
131auto trmm(const dlaf::internal::Policy<B>& p);
+
132
+
137template <Backend B, class T, Device D>
+
138void trmm3(const dlaf::internal::Policy<B>& policy, const blas::Side side, const blas::Uplo uplo,
+
139 const blas::Op op, const blas::Diag diag, const T alpha, const Tile<const T, D>& a,
+
140 const Tile<const T, D>& b, const Tile<T, D>& c);
+
141
+
144template <Backend B, typename Sender,
+
145 typename = std::enable_if_t<pika::execution::experimental::is_sender_v<Sender>>>
+
146auto trmm3(const dlaf::internal::Policy<B>& p, Sender&& s);
+
147
+
150template <Backend B>
+
151auto trmm3(const dlaf::internal::Policy<B>& p);
152
-
153DLAF_MAKE_GPUBLAS_OP(Trsm, trsm);
-
154}
-
155#endif
-
156
-
157namespace dlaf {
-
158namespace tile {
-
159using matrix::Tile;
+
156template <Backend B, class T, Device D>
+
157void trsm(const dlaf::internal::Policy<B>& policy, const blas::Side side, const blas::Uplo uplo,
+
158 const blas::Op op, const blas::Diag diag, const T alpha, const Tile<const T, D>& a,
+
159 const Tile<T, D>& b);
160
-
161#ifdef DLAF_DOXYGEN
-
162
-
166template <Backend B, class T, Device D>
-
167void gemm(const blas::Op op_a, const blas::Op op_b, const T alpha, const Tile<const T, D>& a,
-
168 const Tile<const T, D>& b, const T beta, const Tile<T, D>& c);
-
169
-
172template <Backend B, typename Sender,
-
173 typename = std::enable_if_t<pika::execution::experimental::is_sender_v<Sender>>>
-
174auto gemm(const dlaf::internal::Policy<B>& p, Sender&& s);
-
175
-
178template <Backend B>
-
179auto gemm(const dlaf::internal::Policy<B>& p);
-
180
-
184template <Backend B, class T, Device D>
-
185void hemm(const blas::Side side, const blas::Uplo uplo, const T alpha, const Tile<const T, D>& a,
-
186 const Tile<const T, D>& b, const T beta, const Tile<T, D>& c);
-
187
-
190template <Backend B, typename Sender,
-
191 typename = std::enable_if_t<pika::execution::experimental::is_sender_v<Sender>>>
-
192auto hemm(const dlaf::internal::Policy<B>& p, Sender&& s);
+
163template <Backend B, typename Sender,
+
164 typename = std::enable_if_t<pika::execution::experimental::is_sender_v<Sender>>>
+
165auto trsm(const dlaf::internal::Policy<B>& p, Sender&& s);
+
166
+
169template <Backend B>
+
170auto trsm(const dlaf::internal::Policy<B>& p);
+
171#else
+
172
+
173namespace internal {
+
174
+
175template <class T>
+
176void gemm(const blas::Op op_a, const blas::Op op_b, const T alpha, const Tile<const T, Device::CPU>& a,
+
177 const Tile<const T, Device::CPU>& b, const T beta, const Tile<T, Device::CPU>& c) noexcept {
+
178 auto s = tile::internal::getGemmSizes(op_a, op_b, a, b, c);
+
179 common::internal::SingleThreadedBlasScope single;
+
180 blas::gemm(blas::Layout::ColMajor, op_a, op_b, s.m, s.n, s.k, alpha, a.ptr(), a.ld(), b.ptr(), b.ld(),
+
181 beta, c.ptr(), c.ld());
+
182}
+
183
+
184template <class T>
+
185void hemm(const blas::Side side, const blas::Uplo uplo, const T alpha,
+
186 const Tile<const T, Device::CPU>& a, const Tile<const T, Device::CPU>& b, const T beta,
+
187 const Tile<T, Device::CPU>& c) {
+
188 auto s = tile::internal::getHemmSizes(side, a, b, c);
+
189 common::internal::SingleThreadedBlasScope single;
+
190 blas::hemm(blas::Layout::ColMajor, side, uplo, s.m, s.n, alpha, a.ptr(), a.ld(), b.ptr(), b.ld(), beta,
+
191 c.ptr(), c.ld());
+
192}
193
-
196template <Backend B>
-
197auto hemm(const dlaf::internal::Policy<B>& p);
-
198
-
202template <Backend B, class T, Device D>
-
203void her2k(const blas::Uplo uplo, const blas::Op op, const T alpha, const Tile<const T, D>& a,
-
204 const Tile<const T, D>& b, const BaseType<T> beta, const Tile<T, D>& c);
-
205
-
208template <Backend B, typename Sender,
-
209 typename = std::enable_if_t<pika::execution::experimental::is_sender_v<Sender>>>
-
210auto her2k(const dlaf::internal::Policy<B>& p, Sender&& s);
-
211
-
214template <Backend B>
-
215auto her2k(const dlaf::internal::Policy<B>& p);
-
216
-
220template <Backend B, class T, Device D>
-
221void herk(const blas::Uplo uplo, const blas::Op op, const BaseType<T> alpha, const Tile<const T, D>& a,
-
222 const BaseType<T> beta, const Tile<T, D>& c);
-
223
-
226template <Backend B, typename Sender,
-
227 typename = std::enable_if_t<pika::execution::experimental::is_sender_v<Sender>>>
-
228auto herk(const dlaf::internal::Policy<B>& p, Sender&& s);
-
229
-
232template <Backend B>
-
233auto herk(const dlaf::internal::Policy<B>& p);
-
234
-
238template <Backend B, class T, Device D>
-
239void trmm(const dlaf::internal::Policy<B>& policy, const blas::Side side, const blas::Uplo uplo,
-
240 const blas::Op op, const blas::Diag diag, const T alpha, const Tile<const T, D>& a,
-
241 const Tile<T, D>& b);
-
242
-
245template <Backend B, typename Sender,
-
246 typename = std::enable_if_t<pika::execution::experimental::is_sender_v<Sender>>>
-
247auto trmm(const dlaf::internal::Policy<B>& p, Sender&& s);
+
194template <class T>
+
195void her2k(const blas::Uplo uplo, const blas::Op op, const T alpha, const Tile<const T, Device::CPU>& a,
+
196 const Tile<const T, Device::CPU>& b, const BaseType<T> beta,
+
197 const Tile<T, Device::CPU>& c) noexcept {
+
198 auto s = tile::internal::getHer2kSizes(op, a, b, c);
+
199 common::internal::SingleThreadedBlasScope single;
+
200 blas::her2k(blas::Layout::ColMajor, uplo, op, s.n, s.k, alpha, a.ptr(), a.ld(), b.ptr(), b.ld(), beta,
+
201 c.ptr(), c.ld());
+
202}
+
203
+
204template <class T>
+
205void herk(const blas::Uplo uplo, const blas::Op op, const BaseType<T> alpha,
+
206 const Tile<const T, Device::CPU>& a, const BaseType<T> beta,
+
207 const Tile<T, Device::CPU>& c) noexcept {
+
208 auto s = tile::internal::getHerkSizes(op, a, c);
+
209 common::internal::SingleThreadedBlasScope single;
+
210 blas::herk(blas::Layout::ColMajor, uplo, op, s.n, s.k, alpha, a.ptr(), a.ld(), beta, c.ptr(), c.ld());
+
211}
+
212
+
213// Triangular matrix-matrix multiplication.
+
214template <class T>
+
215void trmm(const blas::Side side, const blas::Uplo uplo, const blas::Op op, const blas::Diag diag,
+
216 const T alpha, const Tile<const T, Device::CPU>& a, const Tile<T, Device::CPU>& b) noexcept {
+
217 auto s = tile::internal::getTrmmSizes(side, a, b);
+
218 common::internal::SingleThreadedBlasScope single;
+
219 blas::trmm(blas::Layout::ColMajor, side, uplo, op, diag, s.m, s.n, alpha, a.ptr(), a.ld(), b.ptr(),
+
220 b.ld());
+
221}
+
222
+
223// Triangular matrix-matrix multiplication.
+
224// Version with 3 tile arguments (different output tile).
+
225template <class T>
+
226void trmm3(const blas::Side side, const blas::Uplo uplo, const blas::Op op, const blas::Diag diag,
+
227 const T alpha, const Tile<const T, Device::CPU>& a, const Tile<const T, Device::CPU>& b,
+
228 const Tile<T, Device::CPU>& c) noexcept {
+
229 auto s = tile::internal::getTrmm3Sizes(side, a, b, c);
+
230 DLAF_ASSERT(b.ptr() == nullptr || b.ptr() != c.ptr(), b.ptr(), c.ptr());
+
231
+
232 matrix::internal::copy(b, c);
+
233 common::internal::SingleThreadedBlasScope single;
+
234 blas::trmm(blas::Layout::ColMajor, side, uplo, op, diag, s.m, s.n, alpha, a.ptr(), a.ld(), c.ptr(),
+
235 c.ld());
+
236}
+
237
+
238template <class T>
+
239void trsm(const blas::Side side, const blas::Uplo uplo, const blas::Op op, const blas::Diag diag,
+
240 const T alpha, const Tile<const T, Device::CPU>& a, const Tile<T, Device::CPU>& b) noexcept {
+
241 auto s = tile::internal::getTrsmSizes(side, a, b);
+
242 common::internal::SingleThreadedBlasScope single;
+
243 blas::trsm(blas::Layout::ColMajor, side, uplo, op, diag, s.m, s.n, alpha, a.ptr(), a.ld(), b.ptr(),
+
244 b.ld());
+
245}
+
246
+
247#ifdef DLAF_WITH_GPU
248
-
251template <Backend B>
-
252auto trmm(const dlaf::internal::Policy<B>& p);
-
253
-
258template <Backend B, class T, Device D>
-
259void trmm3(const dlaf::internal::Policy<B>& policy, const blas::Side side, const blas::Uplo uplo,
-
260 const blas::Op op, const blas::Diag diag, const T alpha, const Tile<const T, D>& a,
-
261 const Tile<const T, D>& b, const Tile<T, D>& c);
+
249template <class T>
+
250void gemm(cublasHandle_t handle, const blas::Op op_a, const blas::Op op_b, const T alpha,
+
251 const matrix::Tile<const T, Device::GPU>& a, const matrix::Tile<const T, Device::GPU>& b,
+
252 const T beta, const matrix::Tile<T, Device::GPU>& c) {
+
253 using util::blasToCublas;
+
254 using util::blasToCublasCast;
+
255 auto s = getGemmSizes(op_a, op_b, a, b, c);
+
256 gpublas::internal::Gemm<T>::call(handle, blasToCublas(op_a), blasToCublas(op_b), to_int(s.m),
+
257 to_int(s.n), to_int(s.k), blasToCublasCast(&alpha),
+
258 blasToCublasCast(a.ptr()), to_int(a.ld()), blasToCublasCast(b.ptr()),
+
259 to_int(b.ld()), blasToCublasCast(&beta), blasToCublasCast(c.ptr()),
+
260 to_int(c.ld()));
+
261}
262
-
265template <Backend B, typename Sender,
-
266 typename = std::enable_if_t<pika::execution::experimental::is_sender_v<Sender>>>
-
267auto trmm3(const dlaf::internal::Policy<B>& p, Sender&& s);
-
268
-
271template <Backend B>
-
272auto trmm3(const dlaf::internal::Policy<B>& p);
-
273
-
277template <Backend B, class T, Device D>
-
278void trsm(const dlaf::internal::Policy<B>& policy, const blas::Side side, const blas::Uplo uplo,
-
279 const blas::Op op, const blas::Diag diag, const T alpha, const Tile<const T, D>& a,
-
280 const Tile<T, D>& b);
-
281
-
284template <Backend B, typename Sender,
-
285 typename = std::enable_if_t<pika::execution::experimental::is_sender_v<Sender>>>
-
286auto trsm(const dlaf::internal::Policy<B>& p, Sender&& s);
-
287
-
290template <Backend B>
-
291auto trsm(const dlaf::internal::Policy<B>& p);
-
292#else
-
293
-
294namespace internal {
-
295
-
296template <class T>
-
297void gemm(const blas::Op op_a, const blas::Op op_b, const T alpha, const Tile<const T, Device::CPU>& a,
-
298 const Tile<const T, Device::CPU>& b, const T beta, const Tile<T, Device::CPU>& c) noexcept {
-
299 auto s = tile::internal::getGemmSizes(op_a, op_b, a, b, c);
-
300 common::internal::SingleThreadedBlasScope single;
-
301 blas::gemm(blas::Layout::ColMajor, op_a, op_b, s.m, s.n, s.k, alpha, a.ptr(), a.ld(), b.ptr(), b.ld(),
-
302 beta, c.ptr(), c.ld());
-
303}
-
304
-
305template <class T>
-
306void hemm(const blas::Side side, const blas::Uplo uplo, const T alpha,
-
307 const Tile<const T, Device::CPU>& a, const Tile<const T, Device::CPU>& b, const T beta,
-
308 const Tile<T, Device::CPU>& c) {
-
309 auto s = tile::internal::getHemmSizes(side, a, b, c);
-
310 common::internal::SingleThreadedBlasScope single;
-
311 blas::hemm(blas::Layout::ColMajor, side, uplo, s.m, s.n, alpha, a.ptr(), a.ld(), b.ptr(), b.ld(), beta,
-
312 c.ptr(), c.ld());
-
313}
-
314
-
315template <class T>
-
316void her2k(const blas::Uplo uplo, const blas::Op op, const T alpha, const Tile<const T, Device::CPU>& a,
-
317 const Tile<const T, Device::CPU>& b, const BaseType<T> beta,
-
318 const Tile<T, Device::CPU>& c) noexcept {
-
319 auto s = tile::internal::getHer2kSizes(op, a, b, c);
-
320 common::internal::SingleThreadedBlasScope single;
-
321 blas::her2k(blas::Layout::ColMajor, uplo, op, s.n, s.k, alpha, a.ptr(), a.ld(), b.ptr(), b.ld(), beta,
-
322 c.ptr(), c.ld());
-
323}
-
324
-
325template <class T>
-
326void herk(const blas::Uplo uplo, const blas::Op op, const BaseType<T> alpha,
-
327 const Tile<const T, Device::CPU>& a, const BaseType<T> beta,
-
328 const Tile<T, Device::CPU>& c) noexcept {
-
329 auto s = tile::internal::getHerkSizes(op, a, c);
-
330 common::internal::SingleThreadedBlasScope single;
-
331 blas::herk(blas::Layout::ColMajor, uplo, op, s.n, s.k, alpha, a.ptr(), a.ld(), beta, c.ptr(), c.ld());
-
332}
-
333
-
334// Triangular matrix-matrix multiplication.
-
335template <class T>
-
336void trmm(const blas::Side side, const blas::Uplo uplo, const blas::Op op, const blas::Diag diag,
-
337 const T alpha, const Tile<const T, Device::CPU>& a, const Tile<T, Device::CPU>& b) noexcept {
-
338 auto s = tile::internal::getTrmmSizes(side, a, b);
-
339 common::internal::SingleThreadedBlasScope single;
-
340 blas::trmm(blas::Layout::ColMajor, side, uplo, op, diag, s.m, s.n, alpha, a.ptr(), a.ld(), b.ptr(),
-
341 b.ld());
-
342}
-
343
-
344// Triangular matrix-matrix multiplication.
-
345// Version with 3 tile arguments (different output tile).
-
346template <class T>
-
347void trmm3(const blas::Side side, const blas::Uplo uplo, const blas::Op op, const blas::Diag diag,
-
348 const T alpha, const Tile<const T, Device::CPU>& a, const Tile<const T, Device::CPU>& b,
-
349 const Tile<T, Device::CPU>& c) noexcept {
-
350 auto s = tile::internal::getTrmm3Sizes(side, a, b, c);
-
351 DLAF_ASSERT(b.ptr() == nullptr || b.ptr() != c.ptr(), b.ptr(), c.ptr());
-
352
-
353 matrix::internal::copy(b, c);
-
354 common::internal::SingleThreadedBlasScope single;
-
355 blas::trmm(blas::Layout::ColMajor, side, uplo, op, diag, s.m, s.n, alpha, a.ptr(), a.ld(), c.ptr(),
-
356 c.ld());
-
357}
-
358
-
359template <class T>
-
360void trsm(const blas::Side side, const blas::Uplo uplo, const blas::Op op, const blas::Diag diag,
-
361 const T alpha, const Tile<const T, Device::CPU>& a, const Tile<T, Device::CPU>& b) noexcept {
-
362 auto s = tile::internal::getTrsmSizes(side, a, b);
-
363 common::internal::SingleThreadedBlasScope single;
-
364 blas::trsm(blas::Layout::ColMajor, side, uplo, op, diag, s.m, s.n, alpha, a.ptr(), a.ld(), b.ptr(),
-
365 b.ld());
-
366}
-
367
-
368#ifdef DLAF_WITH_GPU
-
369
-
370template <class T>
-
371void gemm(cublasHandle_t handle, const blas::Op op_a, const blas::Op op_b, const T alpha,
-
372 const matrix::Tile<const T, Device::GPU>& a, const matrix::Tile<const T, Device::GPU>& b,
-
373 const T beta, const matrix::Tile<T, Device::GPU>& c) {
-
374 using util::blasToCublas;
-
375 using util::blasToCublasCast;
-
376 auto s = getGemmSizes(op_a, op_b, a, b, c);
-
377 gpublas::internal::Gemm<T>::call(handle, blasToCublas(op_a), blasToCublas(op_b), to_int(s.m),
-
378 to_int(s.n), to_int(s.k), blasToCublasCast(&alpha),
-
379 blasToCublasCast(a.ptr()), to_int(a.ld()), blasToCublasCast(b.ptr()),
-
380 to_int(b.ld()), blasToCublasCast(&beta), blasToCublasCast(c.ptr()),
-
381 to_int(c.ld()));
-
382}
-
383
-
384template <class T>
-
385void hemm(cublasHandle_t handle, const blas::Side side, const blas::Uplo uplo, const T alpha,
-
386 const Tile<const T, Device::GPU>& a, const Tile<const T, Device::GPU>& b, const T beta,
-
387 const Tile<T, Device::GPU>& c) {
-
388 using util::blasToCublas;
-
389 using util::blasToCublasCast;
-
390 auto s = getHemmSizes(side, a, b, c);
-
391 gpublas::internal::Hemm<T>::call(handle, blasToCublas(side), blasToCublas(uplo), to_int(s.m),
-
392 to_int(s.n), blasToCublasCast(&alpha), blasToCublasCast(a.ptr()),
-
393 to_int(a.ld()), blasToCublasCast(b.ptr()), to_int(b.ld()),
-
394 blasToCublasCast(&beta), blasToCublasCast(c.ptr()), to_int(c.ld()));
-
395}
-
396
-
397template <class T>
-
398void her2k(cublasHandle_t handle, const blas::Uplo uplo, blas::Op op, const T alpha,
-
399 const matrix::Tile<const T, Device::GPU>& a, const Tile<const T, Device::GPU>& b,
-
400 const BaseType<T> beta, const matrix::Tile<T, Device::GPU>& c) {
-
401 using util::blasToCublas;
-
402 using util::blasToCublasCast;
-
403 auto s = getHer2kSizes(op, a, b, c);
-
404#if defined(DLAF_WITH_HIP) && HIP_VERSION < 50200000
-
405 if (!isComplex_v<T> && op == blas::Op::ConjTrans)
-
406 op = blas::Op::Trans;
-
407#endif
-
408 gpublas::internal::Her2k<T>::call(handle, blasToCublas(uplo), blasToCublas(op), to_int(s.n),
-
409 to_int(s.k), blasToCublasCast(&alpha), blasToCublasCast(a.ptr()),
-
410 to_int(a.ld()), blasToCublasCast(b.ptr()), to_int(b.ld()),
-
411 blasToCublasCast(&beta), blasToCublasCast(c.ptr()), to_int(c.ld()));
-
412}
-
413
-
414template <class T>
-
415void herk(cublasHandle_t handle, const blas::Uplo uplo, const blas::Op op, const BaseType<T> alpha,
-
416 const matrix::Tile<const T, Device::GPU>& a, const BaseType<T> beta,
-
417 const matrix::Tile<T, Device::GPU>& c) {
-
418 using util::blasToCublas;
-
419 using util::blasToCublasCast;
-
420 auto s = getHerkSizes(op, a, c);
-
421 gpublas::internal::Herk<T>::call(handle, blasToCublas(uplo), blasToCublas(op), to_int(s.n),
-
422 to_int(s.k), blasToCublasCast(&alpha), blasToCublasCast(a.ptr()),
-
423 to_int(a.ld()), blasToCublasCast(&beta), blasToCublasCast(c.ptr()),
-
424 to_int(c.ld()));
-
425}
-
426
-
427template <class T>
-
428void trmm(cublasHandle_t handle, const blas::Side side, const blas::Uplo uplo, const blas::Op op,
-
429 const blas::Diag diag, const T alpha, const matrix::Tile<const T, Device::GPU>& a,
-
430 const matrix::Tile<T, Device::GPU>& b) {
-
431 using util::blasToCublas;
-
432 using util::blasToCublasCast;
-
433 auto s = tile::internal::getTrmmSizes(side, a, b);
-
434
-
435 gpublas::internal::Trmm<T>::call(handle, blasToCublas(side), blasToCublas(uplo), blasToCublas(op),
-
436 blasToCublas(diag), to_int(s.m), to_int(s.n),
-
437 blasToCublasCast(&alpha), blasToCublasCast(a.ptr()), to_int(a.ld()),
-
438 blasToCublasCast(b.ptr()), to_int(b.ld()), blasToCublasCast(b.ptr()),
-
439 to_int(b.ld()));
-
440}
-
441
-
442template <class T>
-
443void trmm3(cublasHandle_t handle, const blas::Side side, const blas::Uplo uplo, const blas::Op op,
-
444 const blas::Diag diag, const T alpha, const matrix::Tile<const T, Device::GPU>& a,
-
445 const matrix::Tile<const T, Device::GPU>& b, const matrix::Tile<T, Device::GPU>& c) {
-
446 using util::blasToCublas;
-
447 using util::blasToCublasCast;
-
448 auto s = tile::internal::getTrmm3Sizes(side, a, b, c);
-
449 DLAF_ASSERT(b.ptr() == nullptr || b.ptr() != c.ptr(), b.ptr(), c.ptr());
-
450
-
451 gpublas::internal::Trmm<T>::call(handle, blasToCublas(side), blasToCublas(uplo), blasToCublas(op),
-
452 blasToCublas(diag), to_int(s.m), to_int(s.n),
-
453 blasToCublasCast(&alpha), blasToCublasCast(a.ptr()), to_int(a.ld()),
-
454 blasToCublasCast(b.ptr()), to_int(b.ld()), blasToCublasCast(c.ptr()),
-
455 to_int(c.ld()));
-
456}
-
457
-
458template <class T>
-
459void trsm(cublasHandle_t handle, const blas::Side side, const blas::Uplo uplo, const blas::Op op,
-
460 const blas::Diag diag, const T alpha, const matrix::Tile<const T, Device::GPU>& a,
-
461 const matrix::Tile<T, Device::GPU>& b) {
-
462 using util::blasToCublas;
-
463 using util::blasToCublasCast;
-
464 auto s = getTrsmSizes(side, a, b);
-
465 auto a_ptr = blasToCublasCast(a.ptr());
-
466 gpublas::internal::Trsm<T>::call(handle, blasToCublas(side), blasToCublas(uplo), blasToCublas(op),
-
467 blasToCublas(diag), to_int(s.m), to_int(s.n),
-
468 blasToCublasCast(&alpha), a_ptr, to_int(a.ld()),
-
469 blasToCublasCast(b.ptr()), to_int(b.ld()));
-
470}
-
471#endif // defined(DLAF_WITH_GPU)
-
472
-
473DLAF_MAKE_CALLABLE_OBJECT(gemm);
-
474DLAF_MAKE_CALLABLE_OBJECT(hemm);
-
475DLAF_MAKE_CALLABLE_OBJECT(her2k);
-
476DLAF_MAKE_CALLABLE_OBJECT(herk);
-
477DLAF_MAKE_CALLABLE_OBJECT(trmm);
-
478DLAF_MAKE_CALLABLE_OBJECT(trmm3);
-
479DLAF_MAKE_CALLABLE_OBJECT(trsm);
-
480}
-
481
-
482DLAF_MAKE_SENDER_ALGORITHM_OVERLOADS(dlaf::internal::TransformDispatchType::Blas, gemm, internal::gemm_o)
-
483DLAF_MAKE_SENDER_ALGORITHM_OVERLOADS(dlaf::internal::TransformDispatchType::Blas, hemm, internal::hemm_o)
-
484DLAF_MAKE_SENDER_ALGORITHM_OVERLOADS(dlaf::internal::TransformDispatchType::Blas, her2k,
-
485 internal::her2k_o)
-
486DLAF_MAKE_SENDER_ALGORITHM_OVERLOADS(dlaf::internal::TransformDispatchType::Blas, herk, internal::herk_o)
-
487DLAF_MAKE_SENDER_ALGORITHM_OVERLOADS(dlaf::internal::TransformDispatchType::Blas, trmm, internal::trmm_o)
-
488DLAF_MAKE_SENDER_ALGORITHM_OVERLOADS(dlaf::internal::TransformDispatchType::Blas, trmm3,
-
489 internal::trmm3_o)
-
490DLAF_MAKE_SENDER_ALGORITHM_OVERLOADS(dlaf::internal::TransformDispatchType::Blas, trsm, internal::trsm_o)
-
491
-
492#endif
-
493}
-
494}
+
263template <class T>
+
264void hemm(cublasHandle_t handle, const blas::Side side, const blas::Uplo uplo, const T alpha,
+
265 const Tile<const T, Device::GPU>& a, const Tile<const T, Device::GPU>& b, const T beta,
+
266 const Tile<T, Device::GPU>& c) {
+
267 using util::blasToCublas;
+
268 using util::blasToCublasCast;
+
269 auto s = getHemmSizes(side, a, b, c);
+
270 gpublas::internal::Hemm<T>::call(handle, blasToCublas(side), blasToCublas(uplo), to_int(s.m),
+
271 to_int(s.n), blasToCublasCast(&alpha), blasToCublasCast(a.ptr()),
+
272 to_int(a.ld()), blasToCublasCast(b.ptr()), to_int(b.ld()),
+
273 blasToCublasCast(&beta), blasToCublasCast(c.ptr()), to_int(c.ld()));
+
274}
+
275
+
276template <class T>
+
277void her2k(cublasHandle_t handle, const blas::Uplo uplo, blas::Op op, const T alpha,
+
278 const matrix::Tile<const T, Device::GPU>& a, const Tile<const T, Device::GPU>& b,
+
279 const BaseType<T> beta, const matrix::Tile<T, Device::GPU>& c) {
+
280 using util::blasToCublas;
+
281 using util::blasToCublasCast;
+
282 auto s = getHer2kSizes(op, a, b, c);
+
283#if defined(DLAF_WITH_HIP) && HIP_VERSION < 50200000
+
284 if (!isComplex_v<T> && op == blas::Op::ConjTrans)
+
285 op = blas::Op::Trans;
+
286#endif
+
287 gpublas::internal::Her2k<T>::call(handle, blasToCublas(uplo), blasToCublas(op), to_int(s.n),
+
288 to_int(s.k), blasToCublasCast(&alpha), blasToCublasCast(a.ptr()),
+
289 to_int(a.ld()), blasToCublasCast(b.ptr()), to_int(b.ld()),
+
290 blasToCublasCast(&beta), blasToCublasCast(c.ptr()), to_int(c.ld()));
+
291}
+
292
+
293template <class T>
+
294void herk(cublasHandle_t handle, const blas::Uplo uplo, const blas::Op op, const BaseType<T> alpha,
+
295 const matrix::Tile<const T, Device::GPU>& a, const BaseType<T> beta,
+
296 const matrix::Tile<T, Device::GPU>& c) {
+
297 using util::blasToCublas;
+
298 using util::blasToCublasCast;
+
299 auto s = getHerkSizes(op, a, c);
+
300 gpublas::internal::Herk<T>::call(handle, blasToCublas(uplo), blasToCublas(op), to_int(s.n),
+
301 to_int(s.k), blasToCublasCast(&alpha), blasToCublasCast(a.ptr()),
+
302 to_int(a.ld()), blasToCublasCast(&beta), blasToCublasCast(c.ptr()),
+
303 to_int(c.ld()));
+
304}
+
305
+
306template <class T>
+
307void trmm(cublasHandle_t handle, const blas::Side side, const blas::Uplo uplo, const blas::Op op,
+
308 const blas::Diag diag, const T alpha, const matrix::Tile<const T, Device::GPU>& a,
+
309 const matrix::Tile<T, Device::GPU>& b) {
+
310 using util::blasToCublas;
+
311 using util::blasToCublasCast;
+
312 auto s = tile::internal::getTrmmSizes(side, a, b);
+
313
+
314 gpublas::internal::Trmm<T>::call(handle, blasToCublas(side), blasToCublas(uplo), blasToCublas(op),
+
315 blasToCublas(diag), to_int(s.m), to_int(s.n),
+
316 blasToCublasCast(&alpha), blasToCublasCast(a.ptr()), to_int(a.ld()),
+
317 blasToCublasCast(b.ptr()), to_int(b.ld()), blasToCublasCast(b.ptr()),
+
318 to_int(b.ld()));
+
319}
+
320
+
321template <class T>
+
322void trmm3(cublasHandle_t handle, const blas::Side side, const blas::Uplo uplo, const blas::Op op,
+
323 const blas::Diag diag, const T alpha, const matrix::Tile<const T, Device::GPU>& a,
+
324 const matrix::Tile<const T, Device::GPU>& b, const matrix::Tile<T, Device::GPU>& c) {
+
325 using util::blasToCublas;
+
326 using util::blasToCublasCast;
+
327 auto s = tile::internal::getTrmm3Sizes(side, a, b, c);
+
328 DLAF_ASSERT(b.ptr() == nullptr || b.ptr() != c.ptr(), b.ptr(), c.ptr());
+
329
+
330 gpublas::internal::Trmm<T>::call(handle, blasToCublas(side), blasToCublas(uplo), blasToCublas(op),
+
331 blasToCublas(diag), to_int(s.m), to_int(s.n),
+
332 blasToCublasCast(&alpha), blasToCublasCast(a.ptr()), to_int(a.ld()),
+
333 blasToCublasCast(b.ptr()), to_int(b.ld()), blasToCublasCast(c.ptr()),
+
334 to_int(c.ld()));
+
335}
+
336
+
337template <class T>
+
338void trsm(cublasHandle_t handle, const blas::Side side, const blas::Uplo uplo, const blas::Op op,
+
339 const blas::Diag diag, const T alpha, const matrix::Tile<const T, Device::GPU>& a,
+
340 const matrix::Tile<T, Device::GPU>& b) {
+
341 using util::blasToCublas;
+
342 using util::blasToCublasCast;
+
343 auto s = getTrsmSizes(side, a, b);
+
344 auto a_ptr = blasToCublasCast(a.ptr());
+
345 gpublas::internal::Trsm<T>::call(handle, blasToCublas(side), blasToCublas(uplo), blasToCublas(op),
+
346 blasToCublas(diag), to_int(s.m), to_int(s.n),
+
347 blasToCublasCast(&alpha), a_ptr, to_int(a.ld()),
+
348 blasToCublasCast(b.ptr()), to_int(b.ld()));
+
349}
+
350#endif // defined(DLAF_WITH_GPU)
+
351
+
352DLAF_MAKE_CALLABLE_OBJECT(gemm);
+
353DLAF_MAKE_CALLABLE_OBJECT(hemm);
+
354DLAF_MAKE_CALLABLE_OBJECT(her2k);
+
355DLAF_MAKE_CALLABLE_OBJECT(herk);
+
356DLAF_MAKE_CALLABLE_OBJECT(trmm);
+
357DLAF_MAKE_CALLABLE_OBJECT(trmm3);
+
358DLAF_MAKE_CALLABLE_OBJECT(trsm);
+
359}
+
360
+
361DLAF_MAKE_SENDER_ALGORITHM_OVERLOADS(dlaf::internal::TransformDispatchType::Blas, gemm, internal::gemm_o)
+
362DLAF_MAKE_SENDER_ALGORITHM_OVERLOADS(dlaf::internal::TransformDispatchType::Blas, hemm, internal::hemm_o)
+
363DLAF_MAKE_SENDER_ALGORITHM_OVERLOADS(dlaf::internal::TransformDispatchType::Blas, her2k,
+
364 internal::her2k_o)
+
365DLAF_MAKE_SENDER_ALGORITHM_OVERLOADS(dlaf::internal::TransformDispatchType::Blas, herk, internal::herk_o)
+
366DLAF_MAKE_SENDER_ALGORITHM_OVERLOADS(dlaf::internal::TransformDispatchType::Blas, trmm, internal::trmm_o)
+
367DLAF_MAKE_SENDER_ALGORITHM_OVERLOADS(dlaf::internal::TransformDispatchType::Blas, trmm3,
+
368 internal::trmm3_o)
+
369DLAF_MAKE_SENDER_ALGORITHM_OVERLOADS(dlaf::internal::TransformDispatchType::Blas, trsm, internal::trsm_o)
+
370
+
371#endif
+
372}
+
373}
void gemm(const blas::Op op_a, const blas::Op op_b, const T alpha, const Tile< const T, D > &a, const Tile< const T, D > &b, const T beta, const Tile< T, D > &c)
void trsm(const dlaf::internal::Policy< B > &policy, const blas::Side side, const blas::Uplo uplo, const blas::Op op, const blas::Diag diag, const T alpha, const Tile< const T, D > &a, const Tile< T, D > &b)
void her2k(const blas::Uplo uplo, const blas::Op op, const T alpha, const Tile< const T, D > &a, const Tile< const T, D > &b, const BaseType< T > beta, const Tile< T, D > &c)
@@ -535,8 +414,8 @@
Definition single_threaded_blas.h:20
Definition policy.h:24
Definition tile.h:332
-
Definition memory_view.h:32
+
diff --git a/master/dir_2e3e3bc658385778082583f320919a4c.html b/master/dir_2e3e3bc658385778082583f320919a4c.html index 4160fe3484..174779ed7d 100644 --- a/master/dir_2e3e3bc658385778082583f320919a4c.html +++ b/master/dir_2e3e3bc658385778082583f320919a4c.html @@ -83,6 +83,8 @@    lacpy.h   + larft.h laset.h   diff --git a/master/dir_5ca20fc8e29b0f8739133582ef745158.html b/master/dir_5ca20fc8e29b0f8739133582ef745158.html index 496865bca8..411ec93637 100644 --- a/master/dir_5ca20fc8e29b0f8739133582ef745158.html +++ b/master/dir_5ca20fc8e29b0f8739133582ef745158.html @@ -83,6 +83,8 @@    error.h   + gpublas.h diff --git a/master/factorization_2qr_2api_8h_source.html b/master/factorization_2qr_2api_8h_source.html index b698df830c..2fc65e3e6c 100644 --- a/master/factorization_2qr_2api_8h_source.html +++ b/master/factorization_2qr_2api_8h_source.html @@ -142,7 +142,7 @@
Definition api.h:27
-
static void call(matrix::Panel< Coord::Col, T, device > &panel_view, matrix::ReadOnlyTileSender< T, Device::CPU > taus, matrix::ReadWriteTileSender< T, device > t)
Definition t_factor_impl.h:260
+
static void call(matrix::Panel< Coord::Col, T, device > &panel_view, matrix::ReadOnlyTileSender< T, Device::CPU > taus, matrix::ReadWriteTileSender< T, device > t)
Definition t_factor_impl.h:248
Definition api.h:24
diff --git a/master/files.html b/master/files.html index 156026ba95..40ec64de49 100644 --- a/master/files.html +++ b/master/files.html @@ -199,6 +199,7 @@   blas  api.h  error.h + gpublas.h   cublas  error.h   cusolver @@ -217,7 +218,8 @@   gpu  add.h  lacpy.h - laset.h + larft.h + laset.h  enum_output.h  tile.h   matrix diff --git a/master/gpublas_8h.html b/master/gpublas_8h.html new file mode 100644 index 0000000000..620d9e1337 --- /dev/null +++ b/master/gpublas_8h.html @@ -0,0 +1,152 @@ + + + + + + + +DLAF: /home/runner/work/DLA-Future/DLA-Future/include/dlaf/gpu/blas/gpublas.h File Reference + + + + + + + + + +
+
+ + + + + + +
+
DLAF +
+
+
+ + + + + + + + +
+
+ + +
+
+
+
+
+
Loading...
+
Searching...
+
No Matches
+
+
+
+
+ + +
+
+ +
gpublas.h File Reference
+
+
+
#include <cstddef>
+#include <utility>
+#include <whip.hpp>
+#include <dlaf/gpu/blas/api.h>
+#include <dlaf/gpu/blas/error.h>
+#include <dlaf/util_cublas.h>
+
+

Go to the source code of this file.

+ + + + +

+Macros

#define DLAF_DECLARE_GPUBLAS_OP(Name)
 
+ + + + + + + + + + + + + + + + + +

+Functions

dlaf::gpublas::internal::DLAF_MAKE_GPUBLAS_OP (Axpy, axpy)
 
dlaf::gpublas::internal::DLAF_MAKE_GPUBLAS_OP (Gemv, gemv)
 
dlaf::gpublas::internal::DLAF_MAKE_GPUBLAS_OP (Trmv, trmv)
 
dlaf::gpublas::internal::DLAF_MAKE_GPUBLAS_OP (Gemm, gemm)
 
dlaf::gpublas::internal::DLAF_MAKE_GPUBLAS_SYHE_OP (Hemm, mm)
 
dlaf::gpublas::internal::DLAF_MAKE_GPUBLAS_SYHE_OP (Her2k, r2k)
 
dlaf::gpublas::internal::DLAF_MAKE_GPUBLAS_SYHE_OP (Herk, rk)
 
dlaf::gpublas::internal::DLAF_MAKE_GPUBLAS_OP (Trsm, trsm)
 
+

Detailed Description

+

Provides gpublas wrappers for BLAS operations.

+

Macro Definition Documentation

+ +

◆ DLAF_DECLARE_GPUBLAS_OP

+ +
+
+ + + + + + + + +
#define DLAF_DECLARE_GPUBLAS_OP( Name)
+
+Value:
template <typename T> \
+
struct Name
+
+
+
+
+ + + + diff --git a/master/gpublas_8h_source.html b/master/gpublas_8h_source.html new file mode 100644 index 0000000000..42128c0ac5 --- /dev/null +++ b/master/gpublas_8h_source.html @@ -0,0 +1,237 @@ + + + + + + + +DLAF: /home/runner/work/DLA-Future/DLA-Future/include/dlaf/gpu/blas/gpublas.h Source File + + + + + + + + + +
+
+ + + + + + +
+
DLAF +
+
+
+ + + + + + + + + +
+
+ + +
+
+
+
+
+
Loading...
+
Searching...
+
No Matches
+
+
+
+
+ + +
+
+
gpublas.h
+
+
+Go to the documentation of this file.
1//
+
2// Distributed Linear Algebra with Future (DLAF)
+
3//
+
4// Copyright (c) 2018-2024, ETH Zurich
+
5// All rights reserved.
+
6//
+
7// Please, refer to the LICENSE file in the root directory.
+
8// SPDX-License-Identifier: BSD-3-Clause
+
9//
+
10#pragma once
+
11
+
14
+
15#ifdef DLAF_WITH_GPU
+
16#include <cstddef>
+
17#include <utility>
+
18
+
19#include <whip.hpp>
+
20
+
21#include <dlaf/gpu/blas/api.h>
+
22#include <dlaf/gpu/blas/error.h>
+
23#include <dlaf/util_cublas.h>
+
24
+
25#ifdef DLAF_WITH_HIP
+
26
+
27#include <pika/async_cuda/detail/cuda_event_callback.hpp>
+
28
+ +
30
+
31#define DLAF_GET_ROCBLAS_WORKSPACE(f) \
+
32 [&]() { \
+
33 std::size_t workspace_size; \
+
34 DLAF_GPUBLAS_CHECK_ERROR( \
+
35 rocblas_start_device_memory_size_query(static_cast<rocblas_handle>(handle))); \
+
36 DLAF_ROCBLAS_WORKSPACE_CHECK_ERROR(rocblas_##f(handle, std::forward<Args>(args)...)); \
+
37 DLAF_GPUBLAS_CHECK_ERROR(rocblas_stop_device_memory_size_query(static_cast<rocblas_handle>(handle), \
+
38 &workspace_size)); \
+
39 return ::dlaf::memory::MemoryView<std::byte, Device::GPU>(to_int(workspace_size)); \
+
40 }();
+
41
+
42namespace dlaf::tile::internal {
+
43inline void extendROCBlasWorkspace(cublasHandle_t handle,
+ +
45 whip::stream_t stream;
+
46 DLAF_GPUBLAS_CHECK_ERROR(cublasGetStream(handle, &stream));
+
47 auto f = [workspace = std::move(workspace)](whip::error_t status) { whip::check_error(status); };
+
48 pika::cuda::experimental::detail::add_event_callback(std::move(f), stream);
+
49}
+
50}
+
51
+
52#define DLAF_DEFINE_GPUBLAS_OP(Name, Type, f) \
+
53 template <> \
+
54 struct Name<Type> { \
+
55 template <typename... Args> \
+
56 static void call(cublasHandle_t handle, Args&&... args) { \
+
57 auto workspace = DLAF_GET_ROCBLAS_WORKSPACE(f); \
+
58 DLAF_GPUBLAS_CHECK_ERROR(rocblas_set_workspace(static_cast<rocblas_handle>(handle), workspace(), \
+
59 to_sizet(workspace.size()))); \
+
60 DLAF_GPUBLAS_CHECK_ERROR(rocblas_##f(handle, std::forward<Args>(args)...)); \
+
61 DLAF_GPUBLAS_CHECK_ERROR(rocblas_set_workspace(static_cast<rocblas_handle>(handle), nullptr, 0)); \
+
62 ::dlaf::tile::internal::extendROCBlasWorkspace(handle, std::move(workspace)); \
+
63 } \
+
64 }
+
65
+
66#elif defined(DLAF_WITH_CUDA)
+
67
+
68#define DLAF_DEFINE_GPUBLAS_OP(Name, Type, f) \
+
69 template <> \
+
70 struct Name<Type> { \
+
71 template <typename... Args> \
+
72 static void call(Args&&... args) { \
+
73 DLAF_GPUBLAS_CHECK_ERROR(cublas##f##_v2(std::forward<Args>(args)...)); \
+
74 } \
+
75 }
+
76
+
77#endif
+
78
+
79#define DLAF_DECLARE_GPUBLAS_OP(Name) \
+
80 template <typename T> \
+
81 struct Name
+
82
+
83#ifdef DLAF_WITH_HIP
+
84#define DLAF_MAKE_GPUBLAS_OP(Name, f) \
+
85 DLAF_DECLARE_GPUBLAS_OP(Name); \
+
86 DLAF_DEFINE_GPUBLAS_OP(Name, float, s##f); \
+
87 DLAF_DEFINE_GPUBLAS_OP(Name, double, d##f); \
+
88 DLAF_DEFINE_GPUBLAS_OP(Name, std::complex<float>, c##f); \
+
89 DLAF_DEFINE_GPUBLAS_OP(Name, std::complex<double>, z##f)
+
90
+
91#define DLAF_MAKE_GPUBLAS_SYHE_OP(Name, f) \
+
92 DLAF_DECLARE_GPUBLAS_OP(Name); \
+
93 DLAF_DEFINE_GPUBLAS_OP(Name, float, ssy##f); \
+
94 DLAF_DEFINE_GPUBLAS_OP(Name, double, dsy##f); \
+
95 DLAF_DEFINE_GPUBLAS_OP(Name, std::complex<float>, che##f); \
+
96 DLAF_DEFINE_GPUBLAS_OP(Name, std::complex<double>, zhe##f)
+
97
+
98#elif defined(DLAF_WITH_CUDA)
+
99#define DLAF_MAKE_GPUBLAS_OP(Name, f) \
+
100 DLAF_DECLARE_GPUBLAS_OP(Name); \
+
101 DLAF_DEFINE_GPUBLAS_OP(Name, float, S##f); \
+
102 DLAF_DEFINE_GPUBLAS_OP(Name, double, D##f); \
+
103 DLAF_DEFINE_GPUBLAS_OP(Name, std::complex<float>, C##f); \
+
104 DLAF_DEFINE_GPUBLAS_OP(Name, std::complex<double>, Z##f)
+
105
+
106#define DLAF_MAKE_GPUBLAS_SYHE_OP(Name, f) \
+
107 DLAF_DECLARE_GPUBLAS_OP(Name); \
+
108 DLAF_DEFINE_GPUBLAS_OP(Name, float, Ssy##f); \
+
109 DLAF_DEFINE_GPUBLAS_OP(Name, double, Dsy##f); \
+
110 DLAF_DEFINE_GPUBLAS_OP(Name, std::complex<float>, Che##f); \
+
111 DLAF_DEFINE_GPUBLAS_OP(Name, std::complex<double>, Zhe##f)
+
112#endif
+
113
+
114namespace dlaf::gpublas::internal {
+
115
+
116// Level 1
+
117DLAF_MAKE_GPUBLAS_OP(Axpy, axpy);
+
118
+
119// Level 2
+
120DLAF_MAKE_GPUBLAS_OP(Gemv, gemv);
+
121
+
122DLAF_MAKE_GPUBLAS_OP(Trmv, trmv);
+
123
+
124// Level 3
+
125DLAF_MAKE_GPUBLAS_OP(Gemm, gemm);
+
126
+
127DLAF_MAKE_GPUBLAS_SYHE_OP(Hemm, mm);
+
128
+
129DLAF_MAKE_GPUBLAS_SYHE_OP(Her2k, r2k);
+
130
+
131DLAF_MAKE_GPUBLAS_SYHE_OP(Herk, rk);
+
132
+
133#if defined(DLAF_WITH_CUDA)
+
134DLAF_MAKE_GPUBLAS_OP(Trmm, trmm);
+
135#elif defined(DLAF_WITH_HIP)
+
136
+
137#if ROCBLAS_VERSION_MAJOR >= 3 && defined(ROCBLAS_V3)
+
138DLAF_MAKE_GPUBLAS_OP(Trmm, trmm);
+
139#else
+
140DLAF_MAKE_GPUBLAS_OP(Trmm, trmm_outofplace);
+
141#endif
+
142
+
143#endif
+
144
+
145DLAF_MAKE_GPUBLAS_OP(Trsm, trsm);
+
146}
+
147#endif
+
Definition memory_view.h:32
+ +
+ + + + diff --git a/master/larft_8h_source.html b/master/larft_8h_source.html new file mode 100644 index 0000000000..bf7e585559 --- /dev/null +++ b/master/larft_8h_source.html @@ -0,0 +1,142 @@ + + + + + + + +DLAF: /home/runner/work/DLA-Future/DLA-Future/include/dlaf/lapack/gpu/larft.h Source File + + + + + + + + + +
+
+ + + + + + +
+
DLAF +
+
+
+ + + + + + + + + +
+
+ + +
+
+
+
+
+
Loading...
+
Searching...
+
No Matches
+
+
+
+
+ + +
+
+
larft.h
+
+
+
1//
+
2// Distributed Linear Algebra with Future (DLAF)
+
3//
+
4// Copyright (c) 2018-2024, ETH Zurich
+
5// All rights reserved.
+
6//
+
7// Please, refer to the LICENSE file in the root directory.
+
8// SPDX-License-Identifier: BSD-3-Clause
+
9//
+
10
+
11#pragma once
+
12
+
13#ifdef DLAF_WITH_GPU
+
14
+
15#include <blas.hh>
+
16#include <whip.hpp>
+
17
+
18#include <dlaf/gpu/blas/api.h>
+
19#include <dlaf/types.h>
+
20
+
21namespace dlaf::gpulapack {
+
22
+
23template <class T>
+
24void larft_gemv0(cublasHandle_t handle, const SizeType m, SizeType k, const T* v, const SizeType ldv,
+
25 const T* tau, T* t, const SizeType ldt);
+
26
+
27template <class T>
+
28void larft_gemv1_notau(cublasHandle_t handle, const SizeType m, const SizeType k, const T* v,
+
29 const SizeType ldv, T* t, const SizeType ldt);
+
30
+
31template <class T>
+
32void larft_gemv1_fixtau(const SizeType k, const T* tau, const SizeType inctau, T* t, const SizeType ldt,
+
33 whip::stream_t stream);
+
34
+
35#define DLAF_CUBLAS_LARFT_GEMV_ETI(kword, Type) \
+
36 kword template void larft_gemv0(cublasHandle_t handle, const SizeType n, SizeType k, const Type* v, \
+
37 const SizeType ldv, const Type* tau, Type* t, const SizeType ldt); \
+
38 kword template void larft_gemv1_notau(cublasHandle_t handle, const SizeType m, const SizeType k, \
+
39 const Type* v, const SizeType ldv, Type* t, \
+
40 const SizeType ldt); \
+
41 kword template void larft_gemv1_fixtau(const SizeType k, const Type* tau, const SizeType inctau, \
+
42 Type* t, const SizeType ldt, whip::stream_t stream)
+
43
+
44DLAF_CUBLAS_LARFT_GEMV_ETI(extern, float);
+
45DLAF_CUBLAS_LARFT_GEMV_ETI(extern, double);
+
46DLAF_CUBLAS_LARFT_GEMV_ETI(extern, std::complex<float>);
+
47DLAF_CUBLAS_LARFT_GEMV_ETI(extern, std::complex<double>);
+
48
+
49}
+
50
+
51#endif
+ +
+ + + + diff --git a/master/qr_8h_source.html b/master/qr_8h_source.html index 1575836033..354051df01 100644 --- a/master/qr_8h_source.html +++ b/master/qr_8h_source.html @@ -126,7 +126,7 @@
-
static void call(matrix::Panel< Coord::Col, T, device > &panel_view, matrix::ReadOnlyTileSender< T, Device::CPU > taus, matrix::ReadWriteTileSender< T, device > t)
Definition t_factor_impl.h:260
+
static void call(matrix::Panel< Coord::Col, T, device > &panel_view, matrix::ReadOnlyTileSender< T, Device::CPU > taus, matrix::ReadWriteTileSender< T, device > t)
Definition t_factor_impl.h:248