From 26d4b4a1303c8c52ee1628e68351f3fcde566ef9 Mon Sep 17 00:00:00 2001 From: Alejandro Acosta Date: Mon, 20 May 2024 14:34:24 +0100 Subject: [PATCH] Run sycl ampere example on cuda --- benchmarks/CMakeLists.txt | 20 +- benchmarks/ampere/CMakeLists.txt | 4 +- ...ere_gemm_bf16_bf16_fp32_tensor_op_fp32.cu} | 0 ...ere_gemm_fp16_fp16_fp32_tensor_op_fp32.cu} | 0 examples/sycl/CMakeLists.txt | 2 - examples/sycl/ampere/CMakeLists.txt | 38 -- ...ere_gemm_bf16_bf16_fp32_tensor_op_fp32.cpp | 0 ...ere_gemm_fp16_fp16_fp32_tensor_op_fp32.cpp | 153 ------- examples/sycl/ampere/gemm_configuration.hpp | 197 --------- examples/sycl/common/example_runner.hpp | 387 ------------------ 10 files changed, 13 insertions(+), 788 deletions(-) rename benchmarks/ampere/{bench_ampere_gemm_bf16_bf16_fp32_tensor_op_fp32.cpp => bench_ampere_gemm_bf16_bf16_fp32_tensor_op_fp32.cu} (100%) rename benchmarks/ampere/{bench_ampere_gemm_fp16_fp16_fp32_tensor_op_fp32.cpp => bench_ampere_gemm_fp16_fp16_fp32_tensor_op_fp32.cu} (100%) delete mode 100644 examples/sycl/ampere/CMakeLists.txt delete mode 100644 examples/sycl/ampere/ampere_gemm_bf16_bf16_fp32_tensor_op_fp32.cpp delete mode 100644 examples/sycl/ampere/ampere_gemm_fp16_fp16_fp32_tensor_op_fp32.cpp delete mode 100644 examples/sycl/ampere/gemm_configuration.hpp delete mode 100644 examples/sycl/common/example_runner.hpp diff --git a/benchmarks/CMakeLists.txt b/benchmarks/CMakeLists.txt index 395af2dc2..083106728 100644 --- a/benchmarks/CMakeLists.txt +++ b/benchmarks/CMakeLists.txt @@ -43,20 +43,22 @@ function(cutlass_benchmark_add_executable NAME) add_dependencies(cutlass_benchmarks ${NAME}) + if (NOT CUTLASS_ENABLE_SYCL) + SET(ADD_CUDA ON) + endif() + target_link_libraries( ${NAME} PRIVATE CUTLASS cutlass_tools_util_includes - ) - - target_include_directories( - ${NAME} - PRIVATE - ${CUTLASS_BENCHMARKS_COMMON_SOURCE_DIR} - ) + $<$:nvidia::cublas> + $<$:cuda> + ) - add_sycl_to_target(TARGET ${NAME}) + if (CUTLASS_ENABLE_SYCL) + add_sycl_to_target(TARGET ${NAME}) + endif() install( TARGETS ${NAME} @@ -66,6 +68,6 @@ endfunction() if(SYCL_INTEL_TARGET) add_subdirectory(pvc) -else(SYCL_NVIDIA_TARGET) +else(SYCL_NVIDIA_TARGET OR NOT CUTLASS_ENABLE_SYCL) add_subdirectory(ampere) endif() diff --git a/benchmarks/ampere/CMakeLists.txt b/benchmarks/ampere/CMakeLists.txt index 666d9cac6..a77901594 100644 --- a/benchmarks/ampere/CMakeLists.txt +++ b/benchmarks/ampere/CMakeLists.txt @@ -29,10 +29,10 @@ cutlass_benchmark_add_executable( bench_ampere_gemm_fp16_fp16_fp32_tensor_op_fp32 - bench_ampere_gemm_fp16_fp16_fp32_tensor_op_fp32.cpp + bench_ampere_gemm_fp16_fp16_fp32_tensor_op_fp32.cu ) cutlass_benchmark_add_executable( bench_ampere_gemm_bf16_bf16_fp32_tensor_op_fp32 - bench_ampere_gemm_bf16_bf16_fp32_tensor_op_fp32.cpp + bench_ampere_gemm_bf16_bf16_fp32_tensor_op_fp32.cu ) diff --git a/benchmarks/ampere/bench_ampere_gemm_bf16_bf16_fp32_tensor_op_fp32.cpp b/benchmarks/ampere/bench_ampere_gemm_bf16_bf16_fp32_tensor_op_fp32.cu similarity index 100% rename from benchmarks/ampere/bench_ampere_gemm_bf16_bf16_fp32_tensor_op_fp32.cpp rename to benchmarks/ampere/bench_ampere_gemm_bf16_bf16_fp32_tensor_op_fp32.cu diff --git a/benchmarks/ampere/bench_ampere_gemm_fp16_fp16_fp32_tensor_op_fp32.cpp b/benchmarks/ampere/bench_ampere_gemm_fp16_fp16_fp32_tensor_op_fp32.cu similarity index 100% rename from benchmarks/ampere/bench_ampere_gemm_fp16_fp16_fp32_tensor_op_fp32.cpp rename to benchmarks/ampere/bench_ampere_gemm_fp16_fp16_fp32_tensor_op_fp32.cu diff --git a/examples/sycl/CMakeLists.txt b/examples/sycl/CMakeLists.txt index 8dac4f6c5..b736ce35e 100644 --- a/examples/sycl/CMakeLists.txt +++ b/examples/sycl/CMakeLists.txt @@ -29,6 +29,4 @@ if(SYCL_INTEL_TARGET) add_subdirectory(pvc) -else(SYCL_NVIDIA_TARGET) - add_subdirectory(ampere) endif() diff --git a/examples/sycl/ampere/CMakeLists.txt b/examples/sycl/ampere/CMakeLists.txt deleted file mode 100644 index 81c8545ec..000000000 --- a/examples/sycl/ampere/CMakeLists.txt +++ /dev/null @@ -1,38 +0,0 @@ -# Copyright (c) 2024 - 2024 Codeplay Software Ltd. All rights reserved. -# SPDX-License-Identifier: BSD-3-Clause -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are met: -# -# 1. Redistributions of source code must retain the above copyright notice, this -# list of conditions and the following disclaimer. -# -# 2. Redistributions in binary form must reproduce the above copyright notice, -# this list of conditions and the following disclaimer in the documentation -# and/or other materials provided with the distribution. -# -# 3. Neither the name of the copyright holder nor the names of its -# contributors may be used to endorse or promote products derived from -# this software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE -# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - - -cutlass_example_add_executable( - ampere_gemm_fp16_fp16_fp32_tensor_op_fp32 - ampere_gemm_fp16_fp16_fp32_tensor_op_fp32.cpp -) - -cutlass_example_add_executable( - ampere_gemm_bf16_bf16_fp32_tensor_op_fp32 - ampere_gemm_bf16_bf16_fp32_tensor_op_fp32.cpp -) diff --git a/examples/sycl/ampere/ampere_gemm_bf16_bf16_fp32_tensor_op_fp32.cpp b/examples/sycl/ampere/ampere_gemm_bf16_bf16_fp32_tensor_op_fp32.cpp deleted file mode 100644 index e69de29bb..000000000 diff --git a/examples/sycl/ampere/ampere_gemm_fp16_fp16_fp32_tensor_op_fp32.cpp b/examples/sycl/ampere/ampere_gemm_fp16_fp16_fp32_tensor_op_fp32.cpp deleted file mode 100644 index a7e212fe8..000000000 --- a/examples/sycl/ampere/ampere_gemm_fp16_fp16_fp32_tensor_op_fp32.cpp +++ /dev/null @@ -1,153 +0,0 @@ -/*************************************************************************************************** - * Copyright (c) 2024 - 2024 Codeplay Software Ltd. All rights reserved. - * SPDX-License-Identifier: BSD-3-Clause - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * 1. Redistributions of source code must retain the above copyright notice, this - * list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above copyright notice, - * this list of conditions and the following disclaimer in the documentation - * and/or other materials provided with the distribution. - * - * 3. Neither the name of the copyright holder nor the names of its - * contributors may be used to endorse or promote products derived from - * this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR - * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER - * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, - * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - **************************************************************************************************/ - -#include "../common/example_runner.hpp" -#include "gemm_configuration.hpp" - -int main(int argc, const char** argv) -{ - // - // Parse options - // - - Options options; - - options.parse(argc, argv); - - if (options.help) { - options.print_usage(std::cout) << std::endl; - return 0; - } - - if (options.error) { - std::cerr << "Aborting execution." << std::endl; - return -1; - } - - // - // Run examples - // - - // The KernelHardwareInfo struct holds the number of EUs on the GPU with a given device ID. This - // information is used by the underlying kernel. - cutlass::KernelHardwareInfo hw_info; - - // Change device_id to another value if you are running on a machine with multiple GPUs and wish - // to use a GPU other than that with device ID 0. - hw_info.sm_count = cutlass::KernelHardwareInfo::query_device_multiprocessor_count(hw_info.device_id); - -// The code section below describes datatype for input, output matrices and computation between -// elements in input matrices. - using ElementAccumulator = float; // <- data type of accumulator - using ElementComputeEpilogue = float; // <- data type of epilogue operations - using ElementInputA = half_t; // <- data type of elements in input matrix A - using ElementInputB = half_t; // <- data type of elements in input matrix B - using ElementOutput = float; // <- data type of elements in output matrix D - - using LayoutA = cutlass::layout::ColumnMajor; - using LayoutB = cutlass::layout::RowMajor; - using LayoutC = cutlass::layout::ColumnMajor; - using LayoutD = cutlass::layout::ColumnMajor; - - using TileShape = Shape<_128, _128, _32>; - - using TiledMma = TiledMMA< - MMA_Atom, - Layout>, // 2x2x1 thread group - Tile<_32,_32,_16>>; // 32x32x8 MMA for LDSM, 1x2x1 value group - - static constexpr int kAlignmentA = 8; - using DefaultOperandA = DefaultGemm_TensorOpSm80_OperandA< - ElementInputA, LayoutA, kAlignmentA, 32>; - using SmemLayoutAtomA = typename DefaultOperandA::SmemLayoutAtom; // M, K - using SmemCopyAtomA = typename DefaultOperandA::SmemCopyAtom; - using GmemTiledCopyA = typename DefaultOperandA::GmemTiledCopy; - - static constexpr int kAlignmentB = 8; - using DefaultOperandB = DefaultGemm_TensorOpSm80_OperandB< - ElementInputB, LayoutB, kAlignmentB, 32>; - using SmemLayoutAtomB = typename DefaultOperandB::SmemLayoutAtom; // N, K - using SmemCopyAtomB = typename DefaultOperandB::SmemCopyAtom; - using GmemTiledCopyB = typename DefaultOperandB::GmemTiledCopy; - - using Stages = Int<3>; - - // This code section describes the epilogue part of the kernel - using EpilogueOp = cutlass::epilogue::thread::LinearCombination< - ElementOutput, // <- data type of output matrix - 128 / cutlass::sizeof_bits::value, // <- the number of elements per vectorized - // memory access. For a byte, it's 16 - // elements. This becomes the vector width of - // math instructions in the epilogue too - ElementAccumulator, // <- data type of accumulator - ElementComputeEpilogue>; // <- data type for alpha/beta in linear combination function - - using DispatchPolicy = cutlass::gemm::MainloopSm80CpAsync; - - // Define strides (mixed) - using StrideA = cutlass::detail::TagToStrideA_t; - using StrideB = cutlass::detail::TagToStrideB_t; - using StrideC = cutlass::detail::TagToStrideC_t; - using StrideD = cutlass::detail::TagToStrideC_t; - - using CollectiveEpilogue = cutlass::epilogue::collective::DefaultEpilogue< - StrideC, - StrideD, - EpilogueOp, - cutlass::gemm::EpilogueDefault>; - - // Mainloop - using CollectiveMainloop = cutlass::gemm::collective::CollectiveMma< - DispatchPolicy, - TileShape, - ElementInputA, - StrideA, - ElementInputB, - StrideB, - TiledMma, - GmemTiledCopyA, SmemLayoutAtomA, SmemCopyAtomA, cute::identity, // A - GmemTiledCopyB, SmemLayoutAtomB, SmemCopyAtomB, cute::identity // B - >; - - using GemmKernel = cutlass::gemm::kernel::GemmUniversal< - Shape, - CollectiveMainloop, - CollectiveEpilogue - >; - - using Gemm = cutlass::gemm::device::GemmUniversalAdapter; - - ExampleRunner runner; - - runner.run(options, hw_info); - - return 0; -} diff --git a/examples/sycl/ampere/gemm_configuration.hpp b/examples/sycl/ampere/gemm_configuration.hpp deleted file mode 100644 index d0740af47..000000000 --- a/examples/sycl/ampere/gemm_configuration.hpp +++ /dev/null @@ -1,197 +0,0 @@ -/*************************************************************************************************** - * Copyright (c) 2024 - 2024 Codeplay Software Ltd. All rights reserved. - * SPDX-License-Identifier: BSD-3-Clause - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * 1. Redistributions of source code must retain the above copyright notice, this - * list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above copyright notice, - * this list of conditions and the following disclaimer in the documentation - * and/or other materials provided with the distribution. - * - * 3. Neither the name of the copyright holder nor the names of its - * contributors may be used to endorse or promote products derived from - * this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR - * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER - * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, - * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - **************************************************************************************************/ - -#include "cutlass/half.h" -#include "cutlass/layout/layout.h" - -#include "cute/swizzle.hpp" -#include "cute/layout.hpp" -#include "cute/arch/copy_sm75.hpp" -#include "cute/arch/copy_sm80.hpp" -#include "cute/atom/copy_atom.hpp" - -using namespace cute; - -template -struct DefaultGemm_TensorOpSm80_OperandA; - -template -struct DefaultGemm_TensorOpSm80_OperandB; - -///////////////////////////////////////////////////////////////////////// - -// half - -/// Operand A - Row-major (K-Major) -template <> -struct DefaultGemm_TensorOpSm80_OperandA -{ - // Smem - using SmemLayoutAtom = decltype( - composition(Swizzle<3,3,3>{}, - Layout, - Stride<_64, _1>>{})); - using SmemCopyAtom = Copy_Atom; - - // Gmem - using GmemTiledCopy = decltype( - make_tiled_copy(Copy_Atom, half_t>{}, - Layout, - Stride< _8,_1>>{}, - Layout>{})); -}; - -/// Operand A - Column-major (M-major) -template -struct DefaultGemm_TensorOpSm80_OperandA -{ - // Smem - using SmemLayoutAtom = decltype( - composition(Swizzle<3,3,3>{}, - Layout, - Stride< _1,_64>>{})); - using SmemCopyAtom = Copy_Atom; - - // Gmem - using GmemTiledCopy = decltype( - make_tiled_copy(Copy_Atom, half_t>{}, - Layout, - Stride< _1,_16>>{}, - Layout>{})); -}; - -/// Operand A - Row-major (K-Major) -template <> -struct DefaultGemm_TensorOpSm80_OperandA -{ - // Smem - using SmemLayoutAtom = decltype( - composition(Swizzle<2,3,3>{}, - Layout, - Stride<_32, _1>>{})); - using SmemCopyAtom = Copy_Atom; - - // Gmem - using GmemTiledCopy = decltype( - make_tiled_copy(Copy_Atom, half_t>{}, - Layout, - Stride< _4,_1>>{}, - Layout>{})); -}; - -// Because the F32F16 TiledMMA is A-B symmetric, we can reuse the DefaultOperands - -// Operand B - Column-Major (K-major) -template -struct DefaultGemm_TensorOpSm80_OperandB - : DefaultGemm_TensorOpSm80_OperandA -{}; - -// Operand B - Row-Major (N-major) -template -struct DefaultGemm_TensorOpSm80_OperandB - : DefaultGemm_TensorOpSm80_OperandA -{}; - -///////////////////////////////////////////////////////////////////////// - -// Bfloat - -/// Operand A - Row-major (K-Major) -template <> -struct DefaultGemm_TensorOpSm80_OperandA -{ - // Smem - using SmemLayoutAtom = decltype( - composition(Swizzle<3,3,3>{}, - Layout, - Stride<_64, _1>>{})); - using SmemCopyAtom = Copy_Atom; - - // Gmem - using GmemTiledCopy = decltype( - make_tiled_copy(Copy_Atom, bfloat16_t>{}, - Layout, - Stride< _8,_1>>{}, - Layout>{})); -}; - -/// Operand A - Column-major (M-major) -template -struct DefaultGemm_TensorOpSm80_OperandA -{ - // Smem - using SmemLayoutAtom = decltype( - composition(Swizzle<3,3,3>{}, - Layout, - Stride< _1,_64>>{})); - using SmemCopyAtom = Copy_Atom; - - // Gmem - using GmemTiledCopy = decltype( - make_tiled_copy(Copy_Atom, bfloat16_t>{}, - Layout, - Stride< _1,_16>>{}, - Layout>{})); -}; - -/// Operand A - Row-major (K-Major) -template <> -struct DefaultGemm_TensorOpSm80_OperandA -{ - // Smem - using SmemLayoutAtom = decltype( - composition(Swizzle<2,3,3>{}, - Layout, - Stride<_32, _1>>{})); - using SmemCopyAtom = Copy_Atom; - - // Gmem - using GmemTiledCopy = decltype( - make_tiled_copy(Copy_Atom, bfloat16_t>{}, - Layout, - Stride< _4,_1>>{}, - Layout>{})); -}; - -// Because the F32F16 TiledMMA is A-B symmetric, we can reuse the DefaultOperands - -// Operand B - Column-Major (K-major) -template -struct DefaultGemm_TensorOpSm80_OperandB - : DefaultGemm_TensorOpSm80_OperandA -{}; - -// Operand B - Row-Major (N-major) -template -struct DefaultGemm_TensorOpSm80_OperandB - : DefaultGemm_TensorOpSm80_OperandA -{}; diff --git a/examples/sycl/common/example_runner.hpp b/examples/sycl/common/example_runner.hpp deleted file mode 100644 index f300012ca..000000000 --- a/examples/sycl/common/example_runner.hpp +++ /dev/null @@ -1,387 +0,0 @@ -/*************************************************************************************************** - * Copyright (c) 2024 - 2024 Codeplay Software Ltd. All rights reserved. - * SPDX-License-Identifier: BSD-3-Clause - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * 1. Redistributions of source code must retain the above copyright notice, this - * list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above copyright notice, - * this list of conditions and the following disclaimer in the documentation - * and/or other materials provided with the distribution. - * - * 3. Neither the name of the copyright holder nor the names of its - * contributors may be used to endorse or promote products derived from - * this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR - * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER - * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, - * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - **************************************************************************************************/ - -#include "cutlass/gemm/device/gemm.h" -#include "cutlass/epilogue/collective/default_epilogue.hpp" -#include "cutlass/gemm/device/gemm_universal.h" -#include "cutlass/gemm/device/gemm_universal_adapter.h" -#include "cutlass/gemm/collective/collective_mma.hpp" -#include "cutlass/util/GPU_Clock.hpp" - -#include "cutlass/util/host_tensor.h" -#include "cutlass/util/reference/host/tensor_compare.h" -#include "cutlass/util/reference/host/tensor_copy.h" -#include "cutlass/util/reference/host/tensor_fill.h" -#include "cute/tensor.hpp" - -#include "cutlass/util/command_line.h" -#include "cutlass/util/device_memory.h" -#include "cutlass/util/packed_stride.hpp" -#include "cutlass/util/reference/device/gemm_complex.h" -#include "cutlass/util/reference/device/tensor_compare.h" -#include "cutlass/util/print_error.hpp" - -template -static void fill_matrix(std::vector &M) -{ - std::generate(std::begin(M), std::end(M), [&] - { return static_cast( 2*(rand() / double(RAND_MAX)) - 1 ); }); -} - -using namespace cute; - -/////////////////////////////////////////////////////////////////////////////////////////////////// - -// Command line options parsing -struct Options { - - bool help; - bool error; - - int m, n, k, l, iterations; - float alpha, beta; - - Options(): - help(false), - error(false), - m(4096), n(4096), k(4096), l(1), iterations(100), - alpha(1.f), beta(0.f) - { } - - // Parses the command line - void parse(int argc, char const **args) { - cutlass::CommandLine cmd(argc, args); - - if (cmd.check_cmd_line_flag("help")) { - help = true; - return; - } - - cmd.get_cmd_line_argument("m", m, 4096); - cmd.get_cmd_line_argument("n", n, 4096); - cmd.get_cmd_line_argument("k", k, 4096); - cmd.get_cmd_line_argument("l", l, 1); - cmd.get_cmd_line_argument("alpha", alpha, 1.f); - cmd.get_cmd_line_argument("beta", beta, 0.f); - cmd.get_cmd_line_argument("iterations", iterations, 100); - } - - /// Prints the usage statement. - std::ostream & print_usage(std::ostream &out) const { - - out << "PVC GEMM Example\n\n" - << "Options:\n\n" - << " --help If specified, displays this usage statement\n\n" - << " --m= Sets the M extent of the GEMM\n" - << " --n= Sets the N extent of the GEMM\n" - << " --k= Sets the K extent of the GEMM\n" - << " --l= Sets the L extent (batch count) of the GEMM\n" - << " --alpha= Epilogue scalar alpha\n" - << " --beta= Epilogue scalar beta\n\n" - << " --iterations= Iterations\n\n"; - - return out; - } -}; - -/////////////////////////////////////////////////////////////////////////////////////////////////// - -template -struct ExampleRunner { - - using StrideA = typename Gemm::GemmKernel::StrideA; - using StrideB = typename Gemm::GemmKernel::StrideB; - using StrideC = typename Gemm::GemmKernel::StrideC; - using StrideD = typename Gemm::GemmKernel::StrideD; - - using LayoutA = typename Gemm::LayoutA; - using LayoutB = typename Gemm::LayoutB; - using LayoutC = typename Gemm::LayoutC; - using LayoutD = typename Gemm::LayoutD; - - using ElementA = typename Gemm::ElementA; - using ElementB = typename Gemm::ElementB; - using ElementAcc = typename Gemm::ElementAccumulator; - - using CollectiveEpilogue = typename Gemm::CollectiveEpilogue; - using ElementC = typename Gemm::ElementC; - using ElementOutput = typename CollectiveEpilogue::ElementOutput; - using ElementCompute = typename CollectiveEpilogue::ElementCompute; - using ElementAccumulator = typename CollectiveEpilogue::ElementAccumulator; - - using ProblemShapeType = typename Gemm::GemmKernel::ProblemShape; - - // - // Data members - // - - /// Initialization - StrideA stride_A; - StrideB stride_B; - StrideC stride_C; - StrideD stride_D; - - cutlass::DeviceAllocation block_A; - cutlass::DeviceAllocation block_B; - cutlass::DeviceAllocation block_C; - cutlass::DeviceAllocation block_D; - cutlass::DeviceAllocation block_ref_D; - - // - // Methods - // - - bool verify(const ProblemShapeType& problem_size, ElementCompute alpha, ElementCompute beta) { - auto [M, N, K, L] = problem_size; - - cutlass::TensorRef ref_A(block_A.get(), LayoutA::packed({M, K})); - cutlass::TensorRef ref_B(block_B.get(), LayoutB::packed({K, N})); - cutlass::TensorRef ref_C(block_C.get(), LayoutC::packed({M, N})); - cutlass::TensorRef ref_D(block_ref_D.get(), LayoutD::packed({M, N})); - - cutlass::reference::device::GemmComplex( - {M, N, K}, - alpha, - ref_A, - cutlass::ComplexTransform::kNone, - ref_B, - cutlass::ComplexTransform::kNone, - beta, - ref_C, - ref_D, - ElementAccumulator(0), - L, // batch_count - M * K, // batch_stride_A - K * N, // batch_stride_B - M * N, // batch_stride_C - M * N // batch_stride_D - ); - -#if defined(CUTLASS_ENABLE_SYCL) - syclcompat::wait(); -#else - cudaDeviceSynchronize(); -#endif - - // Check if output from CUTLASS kernel and reference kernel are relatively equal or not - // need to set a larger error margin for comparison to succeed - auto epsilon = static_cast(0.1f); - auto nonzero_floor = static_cast(0.1f); - - bool passed = cutlass::reference::device::BlockCompareRelativelyEqual( - block_ref_D.get(), block_D.get(), block_D.size(), - epsilon, nonzero_floor); - - return passed; - } - - /// Initialize operands to be used in the GEMM and reference GEMM - virtual void initialize(const ProblemShapeType& problem_size) { - auto problem_shape_MNKL = cute::append<4>(problem_size, 1); - auto [M, N, K, L] = problem_shape_MNKL; - - stride_A = cutlass::make_cute_packed_stride(StrideA{}, cute::make_shape(M, K, L)); - stride_B = cutlass::make_cute_packed_stride(StrideB{}, cute::make_shape(N, K, L)); - stride_C = cutlass::make_cute_packed_stride(StrideC{}, cute::make_shape(M, N, L)); - stride_D = cutlass::make_cute_packed_stride(StrideD{}, cute::make_shape(M, N, L)); - - block_A.reset(M * K * L); - block_B.reset(K * N * L); - block_C.reset(M * N * L); - block_D.reset(M * N * L); - block_ref_D.reset(M * N * L); - - // TODO: Enable initialization on device directly once RNG is - // available through SYCL. - std::vector a(K * M * L); - std::vector b(K * N * L); - std::vector c(M * N * L); - std::vector d(M * N * L, ElementC{0}); - - fill_matrix(a); - fill_matrix(b); - fill_matrix(c); - - block_A.copy_from_host(a.data(), a.size()); - block_B.copy_from_host(b.data(), b.size()); - block_C.copy_from_host(c.data(), c.size()); - block_D.copy_from_host(d.data(), d.size()); - block_ref_D.copy_from_host(d.data(), d.size()); - } - - virtual void run(const Options& options, const cutlass::KernelHardwareInfo& hw_info) { - ProblemShapeType problem_size = ProblemShapeType{options.m, options.n, options.k, options.l}; - - initialize(problem_size); - - typename Gemm::GemmKernel::Arguments arguments{ - cutlass::gemm::GemmUniversalMode::kGemm, - problem_size, - {block_A.get(), stride_A, block_B.get(), stride_B}, - {{options.alpha, options.beta}, block_C.get(), stride_C, block_D.get(), stride_D}, - hw_info - }; - - Gemm gemm_op; - - size_t workspace_size = Gemm::get_workspace_size(arguments); - cutlass::device_memory::allocation workspace(workspace_size); - - gemm_op.can_implement(arguments); - - gemm_op.initialize(arguments, workspace.get()); - - // Run the GEMM - gemm_op.run(); - -#if defined(CUTLASS_ENABLE_SYCL) - syclcompat::wait(); -#else - cudaDeviceSynchronize(); -#endif - - // Verify that the result is correct - bool passed = verify(problem_size, options.alpha, options.beta); - std::cout << "Disposition: " << (passed ? "Passed" : "Failed") << std::endl; - - if (passed && options.iterations > 0) { - GPU_Clock timer; - timer.start(); - for (int i = 0; i < options.iterations; ++i) { - gemm_op.run(); - } - - float cute_time = timer.seconds() / options.iterations; - double tflops = (2.0 * options.m * options.n * options.k * options.l) * 1e-12; - std::cout << "Problem Size: " << options.m << 'x' << options.n << 'x' << options.k << 'x' << options.l - << std::endl; - printf("Cutlass GEMM Performance: [%4.3f]TFlop/s (%6.4f)ms\n", tflops / cute_time, cute_time * 1000); - } - } -}; - -template -struct PvcExampleRunner : ExampleRunner { - using Base = ExampleRunner; - - using ElementB = typename Base::ElementB; - - using ProblemShapeType = typename Base::ProblemShapeType; - - cutlass::DeviceAllocation block_B_vnni; - - template - void vnni_matrix( - T* dst, const T* src, - int batch, int numRows, int numCols, int factor) - { - for (int b = 0; b < batch; b++) { - for (int r = 0; r < numRows / factor; r++) { - for (int c = 0; c < numCols; c++) { - for (int k = 0; k < factor; k++) { - dst[((b * (numRows / factor) + r) * numCols + c) * factor + k] = - src[((b * (numRows / factor) + r) * factor + k) * numCols + c]; - } - } - } - } - } - - void initialize(const ProblemShapeType& problem_size) override { - Base::initialize(problem_size); - - auto problem_shape_MNKL = cute::append<4>(problem_size, 1); - auto [M, N, K, L] = problem_shape_MNKL; - - block_B_vnni.reset(Base::block_B.size()); - - std::vector b(K * N * L); - std::vector b_vnni(b.size()); - - Base::block_B.copy_to_host(b.data()); - vnni_matrix(b_vnni.data(), b.data(), L, K, N, 2); - - block_B_vnni.copy_from_host(b_vnni.data()); - } - - void run(const Options& options, const cutlass::KernelHardwareInfo& hw_info) override { - ProblemShapeType problem_size = ProblemShapeType{options.m, options.n, options.k, options.l}; - - initialize(problem_size); - - typename Gemm::GemmKernel::Arguments arguments{ - cutlass::gemm::GemmUniversalMode::kGemm, - problem_size, - {Base::block_A.get(), Base::stride_A, block_B_vnni.get(), Base::stride_B}, - { - {options.alpha, options.beta}, - Base::block_C.get(), Base::stride_C, Base::block_D.get(), Base::stride_D - }, - hw_info - }; - - Gemm gemm_op; - - size_t workspace_size = Gemm::get_workspace_size(arguments); - cutlass::device_memory::allocation workspace(workspace_size); - - gemm_op.can_implement(arguments); - - gemm_op.initialize(arguments, workspace.get()); - - // Run the GEMM - gemm_op.run(); - -#if defined(CUTLASS_ENABLE_SYCL) - syclcompat::wait(); -#else - cudaDeviceSynchronize(); -#endif - - // Verify that the result is correct - bool passed = Base::verify(problem_size, options.alpha, options.beta); - std::cout << "Disposition: " << (passed ? "Passed" : "Failed") << std::endl; - - if (passed && options.iterations > 0) { - GPU_Clock timer; - timer.start(); - for (int i = 0; i < options.iterations; ++i) { - gemm_op.run(); - } - - float cute_time = timer.seconds() / options.iterations; - double tflops = (2.0 * options.m * options.n * options.k * options.l) * 1e-12; - std::cout << "Problem Size: " << options.m << 'x' << options.n << 'x' << options.k << 'x' << options.l << std::endl; - printf("Cutlass GEMM Performance: [%4.3f]TFlop/s (%6.4f)ms\n", tflops / cute_time, cute_time*1000); - } - } -}; -