diff --git a/sycl/test-e2e/Matrix/Inputs/common.hpp b/sycl/test-e2e/Matrix/Inputs/common.hpp index 58937722642df..dca215ae574d2 100644 --- a/sycl/test-e2e/Matrix/Inputs/common.hpp +++ b/sycl/test-e2e/Matrix/Inputs/common.hpp @@ -183,10 +183,10 @@ bool matrix_compare(unsigned int rows, unsigned int cols, T1 *src, T2 *ref) { } } else if constexpr (exact || std::is_integral_v) { if (src[i * cols + j] != ref[i * cols + j]) { - std::cerr << "Incorrect result in matrix." + std::cerr << "Incorrect result in matrix. " << "i: " << i << ", j: " << j - << ", Ref: " << ref[i * cols + j] - << ", Val: " << src[i * cols + j] << "\n"; + << ", Ref: " << (int)ref[i * cols + j] + << ", Val: " << (int)src[i * cols + j] << "\n"; return false; } } else { @@ -221,3 +221,16 @@ template size_t get_sg_size(queue q) { .template get_info( q.get_device()); } + +template +void matrix_print(unsigned int rows, unsigned int cols, T *mat) { + for (unsigned int i = 0; i < rows; i++) { + for (unsigned int j = 0; j < cols; j++) { + if constexpr (std::is_integral_v) + std::cout << (int)mat[i * cols + j] << " "; + else + std::cout << (float)mat[i * cols + j] << " "; + } + std::cout << "\n"; + } +} diff --git a/sycl/test-e2e/Matrix/joint_matrix_transposeAB.cpp b/sycl/test-e2e/Matrix/joint_matrix_transposeAB.cpp new file mode 100644 index 0000000000000..7a838d97a8336 --- /dev/null +++ b/sycl/test-e2e/Matrix/joint_matrix_transposeAB.cpp @@ -0,0 +1,133 @@ +//===---joint_matrix_transposeAB.cpp - DPC++ joint_matrix--------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// REQUIRES: aspect-ext_intel_matrix + +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out + +// SG size = 32 is not currently supported for SYCL Joint Matrix by IGC on DG2 +// RUN: %if !arch-intel_gpu_dg2 %{ %{build} -o %t_sg32.out -DSG_SZ=32 %} +// RUN: %if !arch-intel_gpu_dg2 %{ %{run} %t_sg32.out %} + +// XFAIL: gpu +// XFAIL-TRACKER: GSD-5768 + +// XFAIL: cpu +// XFAIL-TRACKER: CMPLRLLVM-52693 + +#include "common.hpp" +#include + +template class MT; + +template +void matrix_transpose(T *in, T *out, queue q) { + static_assert((NR % TR) == 0); + static_assert((NC % TC) == 0); + size_t sg_size = get_sg_size>(q); + std::cout << "SG size " << sg_size << " "; + + q.submit([&](handler &cgh) { + cgh.parallel_for>( + nd_range<2>({NR / TR, NC / TC * sg_size}, {1, 1 * sg_size}), + [=](nd_item<2> spmd_item) +#ifdef SG_SZ + [[sycl::reqd_sub_group_size(SG_SZ)]] +#endif + { + auto in_ptr = + address_space_cast(in); + auto out_ptr = + address_space_cast(out); + + const auto global_idx = spmd_item.get_global_id(0); + const auto global_idy = spmd_item.get_global_id(1); + const auto sg_startx = global_idx - spmd_item.get_local_id(0); + const auto sg_starty = global_idy - spmd_item.get_local_id(1); + + sub_group sg = spmd_item.get_sub_group(); + joint_matrix + matrix_row_major; + joint_matrix + matrix_col_major; + + auto row_major_offset = + (sg_startx * TR) * NC + sg_starty / sg_size * TC; + auto col_major_offset = + (sg_startx * TR) + (sg_starty / sg_size * TC) * NR; + + joint_matrix_load(sg, matrix_row_major, in_ptr + row_major_offset, + NC); + joint_matrix_copy(sg, matrix_row_major, matrix_col_major); + ext::intel::experimental::matrix::joint_matrix_store( + sg, matrix_col_major, out_ptr + col_major_offset, NR); + }); // parallel for + }).wait(); +} + +template void test() { + std::cout << "Test " << TR << " x " << TC << " "; + static constexpr size_t SCALE = 2; + static constexpr size_t MATRIX_R = TR * SCALE; + static constexpr size_t MATRIX_C = TC * SCALE; + + queue q; + T *in = malloc_shared(MATRIX_R * MATRIX_C, q); + T *col_major = malloc_shared(MATRIX_C * MATRIX_R, q); + T *ref_col_major = malloc_shared(MATRIX_C * MATRIX_R, q); + + matrix_rand(MATRIX_R, MATRIX_C, in, (T)5); + matrix_transpose(in, col_major, q); + matrix_transpose(MATRIX_R, MATRIX_C, ref_col_major, in); + assert((matrix_compare(MATRIX_C, MATRIX_R, col_major, + ref_col_major))); + std::cout << "PASSED\n"; + + free(in, q); + free(col_major, q); + free(ref_col_major, q); +} + +int main() { + queue q; + std::vector combinations = + q.get_device().get_info(); + bool bf16_run = false; + bool half_run = false; + bool int8_run = false; + + for (auto &combination : combinations) { + if (!bf16_run && combination.atype == matrix_type::bf16) { + std::cout << "bf16:\n"; + test(); + test(); + bf16_run = true; + } + + if (!half_run && combination.atype == matrix_type::fp16) { + std::cout << "half:\n"; + test(); + test(); + half_run = true; + } + + if (!int8_run && combination.atype == matrix_type::sint8) { + std::cout << "int8:\n"; + test(); + test(); + int8_run = true; + } + + if (bf16_run && half_run && int8_run) + break; + } + + return 0; +}