From fb2a10d9fafa3326e820ef38c6bfeacd67884649 Mon Sep 17 00:00:00 2001
From: rolandschulz
Date: Wed, 17 Apr 2024 19:07:31 -0700
Subject: [PATCH] Use cute::bfloat16_t

---
 examples/cute/tutorial/pvc_sycl/pvc_sycl.cpp | 26 +++++++++++---------
 include/cutlass/bfloat16.h                   |  4 +++
 2 files changed, 18 insertions(+), 12 deletions(-)

diff --git a/examples/cute/tutorial/pvc_sycl/pvc_sycl.cpp b/examples/cute/tutorial/pvc_sycl/pvc_sycl.cpp
index 56f60b8cd4..105d86fbac 100644
--- a/examples/cute/tutorial/pvc_sycl/pvc_sycl.cpp
+++ b/examples/cute/tutorial/pvc_sycl/pvc_sycl.cpp
@@ -20,11 +20,11 @@
 
 using test_clock = std::chrono::high_resolution_clock;
 
-using sycl::ext::oneapi::bfloat16;
+using namespace cute;
 
-using dtype_a = bfloat16;
-using dtype_b = bfloat16;
-using dtype_c = bfloat16;
+using dtype_a = bfloat16_t;
+using dtype_b = bfloat16_t;
+using dtype_c = float;
 using dtype_acc = float;
 
 bool identityData = false;
@@ -50,18 +50,19 @@ std::string makeTestName(const std::string &func, int tM, int tN, int tK,
 template <typename T>
 static void fill_matrix(std::vector<T> &M, size_t numRows, size_t numCols) {
   if (identityData) {
-    std::generate(std::begin(M), std::end(M), [&] { return 1.0f; });
+    std::generate(std::begin(M), std::end(M), [&] { return 1.0_bf16; });
   } else if (fixedData) {
     for (size_t r = 0; r < numRows; r++) {
       for (size_t c = 0; c < numCols; c++) {
-        M[r * numCols + c] = static_cast<float>(r + c);
+        M[r * numCols + c] = bfloat16_t(float(r + c));
       }
     }
   } else {
     std::random_device dev;
     std::mt19937 rng(dev());
     std::uniform_real_distribution<float> dist(-1.0, 1.0);
-    std::generate(std::begin(M), std::end(M), [&] { return dist(rng); });
+    std::generate(std::begin(M), std::end(M),
+                  [&] { return bfloat16_t(dist(rng)); });
   }
 }
 
@@ -179,16 +180,17 @@ go_dpas_blockread_vnni_tiled(sycl::queue queue, std::vector &c_vec,
   auto B = accB.get_multi_ptr().get();
   auto C = accC.get_multi_ptr().get();
 
-  using namespace cute;
-
   Tensor tAr = make_tensor(Shape<_8, Int>{});
   Tensor tBr = make_tensor(Shape<_8, Int>{});
   Tensor tCr = make_tensor(Shape<_8, Int, Int>{});
 
-  auto A_copy = make_xe_2d_copy(make_tensor(make_gmem_ptr(A), make_shape(M, K)));
-  auto B_copy = make_xe_2d_copy(make_tensor(make_gmem_ptr(B), make_shape(K, N)));
-  auto C_copy = make_xe_2d_copy(make_tensor(make_gmem_ptr(C), make_shape(M, N)));
+  auto A_copy = make_xe_2d_copy(
+      make_tensor(make_gmem_ptr(A), make_shape(M, K)));
+  auto B_copy = make_xe_2d_copy(
+      make_tensor(make_gmem_ptr(B), make_shape(K, N)));
+  auto C_copy = make_xe_2d_copy(
+      make_tensor(make_gmem_ptr(C), make_shape(M, N)));
 
   // TODO: - decide on how to deal with vector types
   //       - create layouts with tiling/partitioning
diff --git a/include/cutlass/bfloat16.h b/include/cutlass/bfloat16.h
index 75cadbfa43..eabbd820db 100644
--- a/include/cutlass/bfloat16.h
+++ b/include/cutlass/bfloat16.h
@@ -118,6 +118,10 @@ struct alignas(2) bfloat16_t {
 
     asm("cvt.rn.bf16.f32 %0, %1;\n" : "=h"(storage) : "f"(x));
 
+    #elif defined(CUTLASS_ENABLE_SYCL)
+
+    storage = sycl::ext::oneapi::detail::bfloat16ToBits(sycl::ext::oneapi::bfloat16(x));
+
     #else
 
     uint32_t bits;
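
For reference, the conversion the patched fill_matrix relies on, constructing cute::bfloat16_t explicitly from float, can be exercised on its own. Below is a minimal host-only sketch; it is not part of the patch, it assumes cute::bfloat16_t aliases cutlass::bfloat16_t in this tree and that the CUTLASS headers are on the include path, and the buffer name and size are illustrative. The new CUTLASS_ENABLE_SYCL branch in bfloat16.h performs the equivalent float to bfloat16 conversion on SYCL targets by reusing sycl::ext::oneapi::bfloat16 and storing its bit pattern.

// Illustrative sketch only (not part of the patch): fill a host buffer of
// cutlass::bfloat16_t the same way the patched fill_matrix does, converting
// each float sample explicitly. Assumes the CUTLASS headers are on the
// include path; no SYCL device code is involved.
#include "cutlass/bfloat16.h"

#include <algorithm>
#include <cstdio>
#include <random>
#include <vector>

int main() {
  // Buffer name and size are illustrative.
  std::vector<cutlass::bfloat16_t> M(8 * 8);

  std::random_device dev;
  std::mt19937 rng(dev());
  std::uniform_real_distribution<float> dist(-1.0f, 1.0f);

  // Explicit float -> bfloat16_t conversion, mirroring the patched generator
  // lambda; the constructor narrows the value to bfloat16 precision.
  std::generate(std::begin(M), std::end(M),
                [&] { return cutlass::bfloat16_t(dist(rng)); });

  // Round-trip one element back to float to show the narrowing effect.
  std::printf("M[0] as float: %f\n", float(M[0]));
  return 0;
}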