From dfc0c886a887916983f9ddb277d5cc02326a8dea Mon Sep 17 00:00:00 2001 From: Alejandro Acosta Date: Mon, 20 May 2024 13:58:18 +0100 Subject: [PATCH 01/10] Add generic example runner --- examples/sycl/common/example_runner.hpp | 387 ++++++++++++++++++++++++ 1 file changed, 387 insertions(+) create mode 100644 examples/sycl/common/example_runner.hpp diff --git a/examples/sycl/common/example_runner.hpp b/examples/sycl/common/example_runner.hpp new file mode 100644 index 0000000000..f300012caa --- /dev/null +++ b/examples/sycl/common/example_runner.hpp @@ -0,0 +1,387 @@ +/*************************************************************************************************** + * Copyright (c) 2024 - 2024 Codeplay Software Ltd. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * + **************************************************************************************************/ + +#include "cutlass/gemm/device/gemm.h" +#include "cutlass/epilogue/collective/default_epilogue.hpp" +#include "cutlass/gemm/device/gemm_universal.h" +#include "cutlass/gemm/device/gemm_universal_adapter.h" +#include "cutlass/gemm/collective/collective_mma.hpp" +#include "cutlass/util/GPU_Clock.hpp" + +#include "cutlass/util/host_tensor.h" +#include "cutlass/util/reference/host/tensor_compare.h" +#include "cutlass/util/reference/host/tensor_copy.h" +#include "cutlass/util/reference/host/tensor_fill.h" +#include "cute/tensor.hpp" + +#include "cutlass/util/command_line.h" +#include "cutlass/util/device_memory.h" +#include "cutlass/util/packed_stride.hpp" +#include "cutlass/util/reference/device/gemm_complex.h" +#include "cutlass/util/reference/device/tensor_compare.h" +#include "cutlass/util/print_error.hpp" + +template +static void fill_matrix(std::vector &M) +{ + std::generate(std::begin(M), std::end(M), [&] + { return static_cast( 2*(rand() / double(RAND_MAX)) - 1 ); }); +} + +using namespace cute; + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +// Command line options parsing +struct Options { + + bool help; + bool error; + + int m, n, k, l, iterations; + float alpha, beta; + + Options(): + help(false), + error(false), + m(4096), n(4096), k(4096), l(1), iterations(100), + alpha(1.f), beta(0.f) + { } + + // Parses the command line + void parse(int argc, char const **args) { + cutlass::CommandLine cmd(argc, args); + + if (cmd.check_cmd_line_flag("help")) { + help = true; + return; + } + + cmd.get_cmd_line_argument("m", m, 4096); + cmd.get_cmd_line_argument("n", n, 4096); + cmd.get_cmd_line_argument("k", k, 4096); + cmd.get_cmd_line_argument("l", l, 1); + cmd.get_cmd_line_argument("alpha", alpha, 1.f); + cmd.get_cmd_line_argument("beta", beta, 0.f); + cmd.get_cmd_line_argument("iterations", iterations, 100); + } + + /// Prints the usage statement. 
+ std::ostream & print_usage(std::ostream &out) const { + + out << "PVC GEMM Example\n\n" + << "Options:\n\n" + << " --help If specified, displays this usage statement\n\n" + << " --m= Sets the M extent of the GEMM\n" + << " --n= Sets the N extent of the GEMM\n" + << " --k= Sets the K extent of the GEMM\n" + << " --l= Sets the L extent (batch count) of the GEMM\n" + << " --alpha= Epilogue scalar alpha\n" + << " --beta= Epilogue scalar beta\n\n" + << " --iterations= Iterations\n\n"; + + return out; + } +}; + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct ExampleRunner { + + using StrideA = typename Gemm::GemmKernel::StrideA; + using StrideB = typename Gemm::GemmKernel::StrideB; + using StrideC = typename Gemm::GemmKernel::StrideC; + using StrideD = typename Gemm::GemmKernel::StrideD; + + using LayoutA = typename Gemm::LayoutA; + using LayoutB = typename Gemm::LayoutB; + using LayoutC = typename Gemm::LayoutC; + using LayoutD = typename Gemm::LayoutD; + + using ElementA = typename Gemm::ElementA; + using ElementB = typename Gemm::ElementB; + using ElementAcc = typename Gemm::ElementAccumulator; + + using CollectiveEpilogue = typename Gemm::CollectiveEpilogue; + using ElementC = typename Gemm::ElementC; + using ElementOutput = typename CollectiveEpilogue::ElementOutput; + using ElementCompute = typename CollectiveEpilogue::ElementCompute; + using ElementAccumulator = typename CollectiveEpilogue::ElementAccumulator; + + using ProblemShapeType = typename Gemm::GemmKernel::ProblemShape; + + // + // Data members + // + + /// Initialization + StrideA stride_A; + StrideB stride_B; + StrideC stride_C; + StrideD stride_D; + + cutlass::DeviceAllocation block_A; + cutlass::DeviceAllocation block_B; + cutlass::DeviceAllocation block_C; + cutlass::DeviceAllocation block_D; + cutlass::DeviceAllocation block_ref_D; + + // + // Methods + // + + bool verify(const ProblemShapeType& problem_size, ElementCompute alpha, ElementCompute beta) { + auto [M, N, K, L] = problem_size; + + cutlass::TensorRef ref_A(block_A.get(), LayoutA::packed({M, K})); + cutlass::TensorRef ref_B(block_B.get(), LayoutB::packed({K, N})); + cutlass::TensorRef ref_C(block_C.get(), LayoutC::packed({M, N})); + cutlass::TensorRef ref_D(block_ref_D.get(), LayoutD::packed({M, N})); + + cutlass::reference::device::GemmComplex( + {M, N, K}, + alpha, + ref_A, + cutlass::ComplexTransform::kNone, + ref_B, + cutlass::ComplexTransform::kNone, + beta, + ref_C, + ref_D, + ElementAccumulator(0), + L, // batch_count + M * K, // batch_stride_A + K * N, // batch_stride_B + M * N, // batch_stride_C + M * N // batch_stride_D + ); + +#if defined(CUTLASS_ENABLE_SYCL) + syclcompat::wait(); +#else + cudaDeviceSynchronize(); +#endif + + // Check if output from CUTLASS kernel and reference kernel are relatively equal or not + // need to set a larger error margin for comparison to succeed + auto epsilon = static_cast(0.1f); + auto nonzero_floor = static_cast(0.1f); + + bool passed = cutlass::reference::device::BlockCompareRelativelyEqual( + block_ref_D.get(), block_D.get(), block_D.size(), + epsilon, nonzero_floor); + + return passed; + } + + /// Initialize operands to be used in the GEMM and reference GEMM + virtual void initialize(const ProblemShapeType& problem_size) { + auto problem_shape_MNKL = cute::append<4>(problem_size, 1); + auto [M, N, K, L] = problem_shape_MNKL; + + stride_A = cutlass::make_cute_packed_stride(StrideA{}, cute::make_shape(M, K, L)); + stride_B = 
cutlass::make_cute_packed_stride(StrideB{}, cute::make_shape(N, K, L)); + stride_C = cutlass::make_cute_packed_stride(StrideC{}, cute::make_shape(M, N, L)); + stride_D = cutlass::make_cute_packed_stride(StrideD{}, cute::make_shape(M, N, L)); + + block_A.reset(M * K * L); + block_B.reset(K * N * L); + block_C.reset(M * N * L); + block_D.reset(M * N * L); + block_ref_D.reset(M * N * L); + + // TODO: Enable initialization on device directly once RNG is + // available through SYCL. + std::vector a(K * M * L); + std::vector b(K * N * L); + std::vector c(M * N * L); + std::vector d(M * N * L, ElementC{0}); + + fill_matrix(a); + fill_matrix(b); + fill_matrix(c); + + block_A.copy_from_host(a.data(), a.size()); + block_B.copy_from_host(b.data(), b.size()); + block_C.copy_from_host(c.data(), c.size()); + block_D.copy_from_host(d.data(), d.size()); + block_ref_D.copy_from_host(d.data(), d.size()); + } + + virtual void run(const Options& options, const cutlass::KernelHardwareInfo& hw_info) { + ProblemShapeType problem_size = ProblemShapeType{options.m, options.n, options.k, options.l}; + + initialize(problem_size); + + typename Gemm::GemmKernel::Arguments arguments{ + cutlass::gemm::GemmUniversalMode::kGemm, + problem_size, + {block_A.get(), stride_A, block_B.get(), stride_B}, + {{options.alpha, options.beta}, block_C.get(), stride_C, block_D.get(), stride_D}, + hw_info + }; + + Gemm gemm_op; + + size_t workspace_size = Gemm::get_workspace_size(arguments); + cutlass::device_memory::allocation workspace(workspace_size); + + gemm_op.can_implement(arguments); + + gemm_op.initialize(arguments, workspace.get()); + + // Run the GEMM + gemm_op.run(); + +#if defined(CUTLASS_ENABLE_SYCL) + syclcompat::wait(); +#else + cudaDeviceSynchronize(); +#endif + + // Verify that the result is correct + bool passed = verify(problem_size, options.alpha, options.beta); + std::cout << "Disposition: " << (passed ? 
"Passed" : "Failed") << std::endl; + + if (passed && options.iterations > 0) { + GPU_Clock timer; + timer.start(); + for (int i = 0; i < options.iterations; ++i) { + gemm_op.run(); + } + + float cute_time = timer.seconds() / options.iterations; + double tflops = (2.0 * options.m * options.n * options.k * options.l) * 1e-12; + std::cout << "Problem Size: " << options.m << 'x' << options.n << 'x' << options.k << 'x' << options.l + << std::endl; + printf("Cutlass GEMM Performance: [%4.3f]TFlop/s (%6.4f)ms\n", tflops / cute_time, cute_time * 1000); + } + } +}; + +template +struct PvcExampleRunner : ExampleRunner { + using Base = ExampleRunner; + + using ElementB = typename Base::ElementB; + + using ProblemShapeType = typename Base::ProblemShapeType; + + cutlass::DeviceAllocation block_B_vnni; + + template + void vnni_matrix( + T* dst, const T* src, + int batch, int numRows, int numCols, int factor) + { + for (int b = 0; b < batch; b++) { + for (int r = 0; r < numRows / factor; r++) { + for (int c = 0; c < numCols; c++) { + for (int k = 0; k < factor; k++) { + dst[((b * (numRows / factor) + r) * numCols + c) * factor + k] = + src[((b * (numRows / factor) + r) * factor + k) * numCols + c]; + } + } + } + } + } + + void initialize(const ProblemShapeType& problem_size) override { + Base::initialize(problem_size); + + auto problem_shape_MNKL = cute::append<4>(problem_size, 1); + auto [M, N, K, L] = problem_shape_MNKL; + + block_B_vnni.reset(Base::block_B.size()); + + std::vector b(K * N * L); + std::vector b_vnni(b.size()); + + Base::block_B.copy_to_host(b.data()); + vnni_matrix(b_vnni.data(), b.data(), L, K, N, 2); + + block_B_vnni.copy_from_host(b_vnni.data()); + } + + void run(const Options& options, const cutlass::KernelHardwareInfo& hw_info) override { + ProblemShapeType problem_size = ProblemShapeType{options.m, options.n, options.k, options.l}; + + initialize(problem_size); + + typename Gemm::GemmKernel::Arguments arguments{ + cutlass::gemm::GemmUniversalMode::kGemm, + problem_size, + {Base::block_A.get(), Base::stride_A, block_B_vnni.get(), Base::stride_B}, + { + {options.alpha, options.beta}, + Base::block_C.get(), Base::stride_C, Base::block_D.get(), Base::stride_D + }, + hw_info + }; + + Gemm gemm_op; + + size_t workspace_size = Gemm::get_workspace_size(arguments); + cutlass::device_memory::allocation workspace(workspace_size); + + gemm_op.can_implement(arguments); + + gemm_op.initialize(arguments, workspace.get()); + + // Run the GEMM + gemm_op.run(); + +#if defined(CUTLASS_ENABLE_SYCL) + syclcompat::wait(); +#else + cudaDeviceSynchronize(); +#endif + + // Verify that the result is correct + bool passed = Base::verify(problem_size, options.alpha, options.beta); + std::cout << "Disposition: " << (passed ? 
"Passed" : "Failed") << std::endl; + + if (passed && options.iterations > 0) { + GPU_Clock timer; + timer.start(); + for (int i = 0; i < options.iterations; ++i) { + gemm_op.run(); + } + + float cute_time = timer.seconds() / options.iterations; + double tflops = (2.0 * options.m * options.n * options.k * options.l) * 1e-12; + std::cout << "Problem Size: " << options.m << 'x' << options.n << 'x' << options.k << 'x' << options.l << std::endl; + printf("Cutlass GEMM Performance: [%4.3f]TFlop/s (%6.4f)ms\n", tflops / cute_time, cute_time*1000); + } + } +}; + From fd35e3ab71e14795602adf635e7409d0cbf346cc Mon Sep 17 00:00:00 2001 From: Alejandro Acosta Date: Mon, 20 May 2024 16:24:54 +0100 Subject: [PATCH 02/10] Init d and ref_d with different values --- examples/sycl/common/example_runner.hpp | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/examples/sycl/common/example_runner.hpp b/examples/sycl/common/example_runner.hpp index f300012caa..2cc14556fe 100644 --- a/examples/sycl/common/example_runner.hpp +++ b/examples/sycl/common/example_runner.hpp @@ -224,7 +224,8 @@ struct ExampleRunner { std::vector a(K * M * L); std::vector b(K * N * L); std::vector c(M * N * L); - std::vector d(M * N * L, ElementC{0}); + std::vector d(M * N * L, ElementC{-1}); + std::vector ref_d(M * N * L, ElementC{-2}); fill_matrix(a); fill_matrix(b); @@ -234,7 +235,7 @@ struct ExampleRunner { block_B.copy_from_host(b.data(), b.size()); block_C.copy_from_host(c.data(), c.size()); block_D.copy_from_host(d.data(), d.size()); - block_ref_D.copy_from_host(d.data(), d.size()); + block_ref_D.copy_from_host(ref_d.data(), d.size()); } virtual void run(const Options& options, const cutlass::KernelHardwareInfo& hw_info) { From 45d47e4895766067d6aeb21c6fdc49f4c0c7d669 Mon Sep 17 00:00:00 2001 From: Alejandro Acosta Date: Mon, 27 May 2024 16:03:41 +0100 Subject: [PATCH 03/10] Move runner to benchmark folder --- examples/sycl/common/example_runner.hpp | 388 ------------------------ 1 file changed, 388 deletions(-) delete mode 100644 examples/sycl/common/example_runner.hpp diff --git a/examples/sycl/common/example_runner.hpp b/examples/sycl/common/example_runner.hpp deleted file mode 100644 index 2cc14556fe..0000000000 --- a/examples/sycl/common/example_runner.hpp +++ /dev/null @@ -1,388 +0,0 @@ -/*************************************************************************************************** - * Copyright (c) 2024 - 2024 Codeplay Software Ltd. All rights reserved. - * SPDX-License-Identifier: BSD-3-Clause - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * 1. Redistributions of source code must retain the above copyright notice, this - * list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above copyright notice, - * this list of conditions and the following disclaimer in the documentation - * and/or other materials provided with the distribution. - * - * 3. Neither the name of the copyright holder nor the names of its - * contributors may be used to endorse or promote products derived from - * this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR - * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER - * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, - * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - **************************************************************************************************/ - -#include "cutlass/gemm/device/gemm.h" -#include "cutlass/epilogue/collective/default_epilogue.hpp" -#include "cutlass/gemm/device/gemm_universal.h" -#include "cutlass/gemm/device/gemm_universal_adapter.h" -#include "cutlass/gemm/collective/collective_mma.hpp" -#include "cutlass/util/GPU_Clock.hpp" - -#include "cutlass/util/host_tensor.h" -#include "cutlass/util/reference/host/tensor_compare.h" -#include "cutlass/util/reference/host/tensor_copy.h" -#include "cutlass/util/reference/host/tensor_fill.h" -#include "cute/tensor.hpp" - -#include "cutlass/util/command_line.h" -#include "cutlass/util/device_memory.h" -#include "cutlass/util/packed_stride.hpp" -#include "cutlass/util/reference/device/gemm_complex.h" -#include "cutlass/util/reference/device/tensor_compare.h" -#include "cutlass/util/print_error.hpp" - -template -static void fill_matrix(std::vector &M) -{ - std::generate(std::begin(M), std::end(M), [&] - { return static_cast( 2*(rand() / double(RAND_MAX)) - 1 ); }); -} - -using namespace cute; - -/////////////////////////////////////////////////////////////////////////////////////////////////// - -// Command line options parsing -struct Options { - - bool help; - bool error; - - int m, n, k, l, iterations; - float alpha, beta; - - Options(): - help(false), - error(false), - m(4096), n(4096), k(4096), l(1), iterations(100), - alpha(1.f), beta(0.f) - { } - - // Parses the command line - void parse(int argc, char const **args) { - cutlass::CommandLine cmd(argc, args); - - if (cmd.check_cmd_line_flag("help")) { - help = true; - return; - } - - cmd.get_cmd_line_argument("m", m, 4096); - cmd.get_cmd_line_argument("n", n, 4096); - cmd.get_cmd_line_argument("k", k, 4096); - cmd.get_cmd_line_argument("l", l, 1); - cmd.get_cmd_line_argument("alpha", alpha, 1.f); - cmd.get_cmd_line_argument("beta", beta, 0.f); - cmd.get_cmd_line_argument("iterations", iterations, 100); - } - - /// Prints the usage statement. 
- std::ostream & print_usage(std::ostream &out) const { - - out << "PVC GEMM Example\n\n" - << "Options:\n\n" - << " --help If specified, displays this usage statement\n\n" - << " --m= Sets the M extent of the GEMM\n" - << " --n= Sets the N extent of the GEMM\n" - << " --k= Sets the K extent of the GEMM\n" - << " --l= Sets the L extent (batch count) of the GEMM\n" - << " --alpha= Epilogue scalar alpha\n" - << " --beta= Epilogue scalar beta\n\n" - << " --iterations= Iterations\n\n"; - - return out; - } -}; - -/////////////////////////////////////////////////////////////////////////////////////////////////// - -template -struct ExampleRunner { - - using StrideA = typename Gemm::GemmKernel::StrideA; - using StrideB = typename Gemm::GemmKernel::StrideB; - using StrideC = typename Gemm::GemmKernel::StrideC; - using StrideD = typename Gemm::GemmKernel::StrideD; - - using LayoutA = typename Gemm::LayoutA; - using LayoutB = typename Gemm::LayoutB; - using LayoutC = typename Gemm::LayoutC; - using LayoutD = typename Gemm::LayoutD; - - using ElementA = typename Gemm::ElementA; - using ElementB = typename Gemm::ElementB; - using ElementAcc = typename Gemm::ElementAccumulator; - - using CollectiveEpilogue = typename Gemm::CollectiveEpilogue; - using ElementC = typename Gemm::ElementC; - using ElementOutput = typename CollectiveEpilogue::ElementOutput; - using ElementCompute = typename CollectiveEpilogue::ElementCompute; - using ElementAccumulator = typename CollectiveEpilogue::ElementAccumulator; - - using ProblemShapeType = typename Gemm::GemmKernel::ProblemShape; - - // - // Data members - // - - /// Initialization - StrideA stride_A; - StrideB stride_B; - StrideC stride_C; - StrideD stride_D; - - cutlass::DeviceAllocation block_A; - cutlass::DeviceAllocation block_B; - cutlass::DeviceAllocation block_C; - cutlass::DeviceAllocation block_D; - cutlass::DeviceAllocation block_ref_D; - - // - // Methods - // - - bool verify(const ProblemShapeType& problem_size, ElementCompute alpha, ElementCompute beta) { - auto [M, N, K, L] = problem_size; - - cutlass::TensorRef ref_A(block_A.get(), LayoutA::packed({M, K})); - cutlass::TensorRef ref_B(block_B.get(), LayoutB::packed({K, N})); - cutlass::TensorRef ref_C(block_C.get(), LayoutC::packed({M, N})); - cutlass::TensorRef ref_D(block_ref_D.get(), LayoutD::packed({M, N})); - - cutlass::reference::device::GemmComplex( - {M, N, K}, - alpha, - ref_A, - cutlass::ComplexTransform::kNone, - ref_B, - cutlass::ComplexTransform::kNone, - beta, - ref_C, - ref_D, - ElementAccumulator(0), - L, // batch_count - M * K, // batch_stride_A - K * N, // batch_stride_B - M * N, // batch_stride_C - M * N // batch_stride_D - ); - -#if defined(CUTLASS_ENABLE_SYCL) - syclcompat::wait(); -#else - cudaDeviceSynchronize(); -#endif - - // Check if output from CUTLASS kernel and reference kernel are relatively equal or not - // need to set a larger error margin for comparison to succeed - auto epsilon = static_cast(0.1f); - auto nonzero_floor = static_cast(0.1f); - - bool passed = cutlass::reference::device::BlockCompareRelativelyEqual( - block_ref_D.get(), block_D.get(), block_D.size(), - epsilon, nonzero_floor); - - return passed; - } - - /// Initialize operands to be used in the GEMM and reference GEMM - virtual void initialize(const ProblemShapeType& problem_size) { - auto problem_shape_MNKL = cute::append<4>(problem_size, 1); - auto [M, N, K, L] = problem_shape_MNKL; - - stride_A = cutlass::make_cute_packed_stride(StrideA{}, cute::make_shape(M, K, L)); - stride_B = 
cutlass::make_cute_packed_stride(StrideB{}, cute::make_shape(N, K, L)); - stride_C = cutlass::make_cute_packed_stride(StrideC{}, cute::make_shape(M, N, L)); - stride_D = cutlass::make_cute_packed_stride(StrideD{}, cute::make_shape(M, N, L)); - - block_A.reset(M * K * L); - block_B.reset(K * N * L); - block_C.reset(M * N * L); - block_D.reset(M * N * L); - block_ref_D.reset(M * N * L); - - // TODO: Enable initialization on device directly once RNG is - // available through SYCL. - std::vector a(K * M * L); - std::vector b(K * N * L); - std::vector c(M * N * L); - std::vector d(M * N * L, ElementC{-1}); - std::vector ref_d(M * N * L, ElementC{-2}); - - fill_matrix(a); - fill_matrix(b); - fill_matrix(c); - - block_A.copy_from_host(a.data(), a.size()); - block_B.copy_from_host(b.data(), b.size()); - block_C.copy_from_host(c.data(), c.size()); - block_D.copy_from_host(d.data(), d.size()); - block_ref_D.copy_from_host(ref_d.data(), d.size()); - } - - virtual void run(const Options& options, const cutlass::KernelHardwareInfo& hw_info) { - ProblemShapeType problem_size = ProblemShapeType{options.m, options.n, options.k, options.l}; - - initialize(problem_size); - - typename Gemm::GemmKernel::Arguments arguments{ - cutlass::gemm::GemmUniversalMode::kGemm, - problem_size, - {block_A.get(), stride_A, block_B.get(), stride_B}, - {{options.alpha, options.beta}, block_C.get(), stride_C, block_D.get(), stride_D}, - hw_info - }; - - Gemm gemm_op; - - size_t workspace_size = Gemm::get_workspace_size(arguments); - cutlass::device_memory::allocation workspace(workspace_size); - - gemm_op.can_implement(arguments); - - gemm_op.initialize(arguments, workspace.get()); - - // Run the GEMM - gemm_op.run(); - -#if defined(CUTLASS_ENABLE_SYCL) - syclcompat::wait(); -#else - cudaDeviceSynchronize(); -#endif - - // Verify that the result is correct - bool passed = verify(problem_size, options.alpha, options.beta); - std::cout << "Disposition: " << (passed ? 
"Passed" : "Failed") << std::endl; - - if (passed && options.iterations > 0) { - GPU_Clock timer; - timer.start(); - for (int i = 0; i < options.iterations; ++i) { - gemm_op.run(); - } - - float cute_time = timer.seconds() / options.iterations; - double tflops = (2.0 * options.m * options.n * options.k * options.l) * 1e-12; - std::cout << "Problem Size: " << options.m << 'x' << options.n << 'x' << options.k << 'x' << options.l - << std::endl; - printf("Cutlass GEMM Performance: [%4.3f]TFlop/s (%6.4f)ms\n", tflops / cute_time, cute_time * 1000); - } - } -}; - -template -struct PvcExampleRunner : ExampleRunner { - using Base = ExampleRunner; - - using ElementB = typename Base::ElementB; - - using ProblemShapeType = typename Base::ProblemShapeType; - - cutlass::DeviceAllocation block_B_vnni; - - template - void vnni_matrix( - T* dst, const T* src, - int batch, int numRows, int numCols, int factor) - { - for (int b = 0; b < batch; b++) { - for (int r = 0; r < numRows / factor; r++) { - for (int c = 0; c < numCols; c++) { - for (int k = 0; k < factor; k++) { - dst[((b * (numRows / factor) + r) * numCols + c) * factor + k] = - src[((b * (numRows / factor) + r) * factor + k) * numCols + c]; - } - } - } - } - } - - void initialize(const ProblemShapeType& problem_size) override { - Base::initialize(problem_size); - - auto problem_shape_MNKL = cute::append<4>(problem_size, 1); - auto [M, N, K, L] = problem_shape_MNKL; - - block_B_vnni.reset(Base::block_B.size()); - - std::vector b(K * N * L); - std::vector b_vnni(b.size()); - - Base::block_B.copy_to_host(b.data()); - vnni_matrix(b_vnni.data(), b.data(), L, K, N, 2); - - block_B_vnni.copy_from_host(b_vnni.data()); - } - - void run(const Options& options, const cutlass::KernelHardwareInfo& hw_info) override { - ProblemShapeType problem_size = ProblemShapeType{options.m, options.n, options.k, options.l}; - - initialize(problem_size); - - typename Gemm::GemmKernel::Arguments arguments{ - cutlass::gemm::GemmUniversalMode::kGemm, - problem_size, - {Base::block_A.get(), Base::stride_A, block_B_vnni.get(), Base::stride_B}, - { - {options.alpha, options.beta}, - Base::block_C.get(), Base::stride_C, Base::block_D.get(), Base::stride_D - }, - hw_info - }; - - Gemm gemm_op; - - size_t workspace_size = Gemm::get_workspace_size(arguments); - cutlass::device_memory::allocation workspace(workspace_size); - - gemm_op.can_implement(arguments); - - gemm_op.initialize(arguments, workspace.get()); - - // Run the GEMM - gemm_op.run(); - -#if defined(CUTLASS_ENABLE_SYCL) - syclcompat::wait(); -#else - cudaDeviceSynchronize(); -#endif - - // Verify that the result is correct - bool passed = Base::verify(problem_size, options.alpha, options.beta); - std::cout << "Disposition: " << (passed ? 
"Passed" : "Failed") << std::endl; - - if (passed && options.iterations > 0) { - GPU_Clock timer; - timer.start(); - for (int i = 0; i < options.iterations; ++i) { - gemm_op.run(); - } - - float cute_time = timer.seconds() / options.iterations; - double tflops = (2.0 * options.m * options.n * options.k * options.l) * 1e-12; - std::cout << "Problem Size: " << options.m << 'x' << options.n << 'x' << options.k << 'x' << options.l << std::endl; - printf("Cutlass GEMM Performance: [%4.3f]TFlop/s (%6.4f)ms\n", tflops / cute_time, cute_time*1000); - } - } -}; - From c763cf04998e63546d3126bddcdcf5156b390af9 Mon Sep 17 00:00:00 2001 From: Alejandro Acosta Date: Mon, 20 May 2024 13:58:18 +0100 Subject: [PATCH 04/10] Add generic example runner --- examples/sycl/common/example_runner.hpp | 387 ++++++++++++++++++++++++ 1 file changed, 387 insertions(+) create mode 100644 examples/sycl/common/example_runner.hpp diff --git a/examples/sycl/common/example_runner.hpp b/examples/sycl/common/example_runner.hpp new file mode 100644 index 0000000000..f300012caa --- /dev/null +++ b/examples/sycl/common/example_runner.hpp @@ -0,0 +1,387 @@ +/*************************************************************************************************** + * Copyright (c) 2024 - 2024 Codeplay Software Ltd. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * + **************************************************************************************************/ + +#include "cutlass/gemm/device/gemm.h" +#include "cutlass/epilogue/collective/default_epilogue.hpp" +#include "cutlass/gemm/device/gemm_universal.h" +#include "cutlass/gemm/device/gemm_universal_adapter.h" +#include "cutlass/gemm/collective/collective_mma.hpp" +#include "cutlass/util/GPU_Clock.hpp" + +#include "cutlass/util/host_tensor.h" +#include "cutlass/util/reference/host/tensor_compare.h" +#include "cutlass/util/reference/host/tensor_copy.h" +#include "cutlass/util/reference/host/tensor_fill.h" +#include "cute/tensor.hpp" + +#include "cutlass/util/command_line.h" +#include "cutlass/util/device_memory.h" +#include "cutlass/util/packed_stride.hpp" +#include "cutlass/util/reference/device/gemm_complex.h" +#include "cutlass/util/reference/device/tensor_compare.h" +#include "cutlass/util/print_error.hpp" + +template +static void fill_matrix(std::vector &M) +{ + std::generate(std::begin(M), std::end(M), [&] + { return static_cast( 2*(rand() / double(RAND_MAX)) - 1 ); }); +} + +using namespace cute; + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +// Command line options parsing +struct Options { + + bool help; + bool error; + + int m, n, k, l, iterations; + float alpha, beta; + + Options(): + help(false), + error(false), + m(4096), n(4096), k(4096), l(1), iterations(100), + alpha(1.f), beta(0.f) + { } + + // Parses the command line + void parse(int argc, char const **args) { + cutlass::CommandLine cmd(argc, args); + + if (cmd.check_cmd_line_flag("help")) { + help = true; + return; + } + + cmd.get_cmd_line_argument("m", m, 4096); + cmd.get_cmd_line_argument("n", n, 4096); + cmd.get_cmd_line_argument("k", k, 4096); + cmd.get_cmd_line_argument("l", l, 1); + cmd.get_cmd_line_argument("alpha", alpha, 1.f); + cmd.get_cmd_line_argument("beta", beta, 0.f); + cmd.get_cmd_line_argument("iterations", iterations, 100); + } + + /// Prints the usage statement. 
+ std::ostream & print_usage(std::ostream &out) const { + + out << "PVC GEMM Example\n\n" + << "Options:\n\n" + << " --help If specified, displays this usage statement\n\n" + << " --m= Sets the M extent of the GEMM\n" + << " --n= Sets the N extent of the GEMM\n" + << " --k= Sets the K extent of the GEMM\n" + << " --l= Sets the L extent (batch count) of the GEMM\n" + << " --alpha= Epilogue scalar alpha\n" + << " --beta= Epilogue scalar beta\n\n" + << " --iterations= Iterations\n\n"; + + return out; + } +}; + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct ExampleRunner { + + using StrideA = typename Gemm::GemmKernel::StrideA; + using StrideB = typename Gemm::GemmKernel::StrideB; + using StrideC = typename Gemm::GemmKernel::StrideC; + using StrideD = typename Gemm::GemmKernel::StrideD; + + using LayoutA = typename Gemm::LayoutA; + using LayoutB = typename Gemm::LayoutB; + using LayoutC = typename Gemm::LayoutC; + using LayoutD = typename Gemm::LayoutD; + + using ElementA = typename Gemm::ElementA; + using ElementB = typename Gemm::ElementB; + using ElementAcc = typename Gemm::ElementAccumulator; + + using CollectiveEpilogue = typename Gemm::CollectiveEpilogue; + using ElementC = typename Gemm::ElementC; + using ElementOutput = typename CollectiveEpilogue::ElementOutput; + using ElementCompute = typename CollectiveEpilogue::ElementCompute; + using ElementAccumulator = typename CollectiveEpilogue::ElementAccumulator; + + using ProblemShapeType = typename Gemm::GemmKernel::ProblemShape; + + // + // Data members + // + + /// Initialization + StrideA stride_A; + StrideB stride_B; + StrideC stride_C; + StrideD stride_D; + + cutlass::DeviceAllocation block_A; + cutlass::DeviceAllocation block_B; + cutlass::DeviceAllocation block_C; + cutlass::DeviceAllocation block_D; + cutlass::DeviceAllocation block_ref_D; + + // + // Methods + // + + bool verify(const ProblemShapeType& problem_size, ElementCompute alpha, ElementCompute beta) { + auto [M, N, K, L] = problem_size; + + cutlass::TensorRef ref_A(block_A.get(), LayoutA::packed({M, K})); + cutlass::TensorRef ref_B(block_B.get(), LayoutB::packed({K, N})); + cutlass::TensorRef ref_C(block_C.get(), LayoutC::packed({M, N})); + cutlass::TensorRef ref_D(block_ref_D.get(), LayoutD::packed({M, N})); + + cutlass::reference::device::GemmComplex( + {M, N, K}, + alpha, + ref_A, + cutlass::ComplexTransform::kNone, + ref_B, + cutlass::ComplexTransform::kNone, + beta, + ref_C, + ref_D, + ElementAccumulator(0), + L, // batch_count + M * K, // batch_stride_A + K * N, // batch_stride_B + M * N, // batch_stride_C + M * N // batch_stride_D + ); + +#if defined(CUTLASS_ENABLE_SYCL) + syclcompat::wait(); +#else + cudaDeviceSynchronize(); +#endif + + // Check if output from CUTLASS kernel and reference kernel are relatively equal or not + // need to set a larger error margin for comparison to succeed + auto epsilon = static_cast(0.1f); + auto nonzero_floor = static_cast(0.1f); + + bool passed = cutlass::reference::device::BlockCompareRelativelyEqual( + block_ref_D.get(), block_D.get(), block_D.size(), + epsilon, nonzero_floor); + + return passed; + } + + /// Initialize operands to be used in the GEMM and reference GEMM + virtual void initialize(const ProblemShapeType& problem_size) { + auto problem_shape_MNKL = cute::append<4>(problem_size, 1); + auto [M, N, K, L] = problem_shape_MNKL; + + stride_A = cutlass::make_cute_packed_stride(StrideA{}, cute::make_shape(M, K, L)); + stride_B = 
cutlass::make_cute_packed_stride(StrideB{}, cute::make_shape(N, K, L)); + stride_C = cutlass::make_cute_packed_stride(StrideC{}, cute::make_shape(M, N, L)); + stride_D = cutlass::make_cute_packed_stride(StrideD{}, cute::make_shape(M, N, L)); + + block_A.reset(M * K * L); + block_B.reset(K * N * L); + block_C.reset(M * N * L); + block_D.reset(M * N * L); + block_ref_D.reset(M * N * L); + + // TODO: Enable initialization on device directly once RNG is + // available through SYCL. + std::vector a(K * M * L); + std::vector b(K * N * L); + std::vector c(M * N * L); + std::vector d(M * N * L, ElementC{0}); + + fill_matrix(a); + fill_matrix(b); + fill_matrix(c); + + block_A.copy_from_host(a.data(), a.size()); + block_B.copy_from_host(b.data(), b.size()); + block_C.copy_from_host(c.data(), c.size()); + block_D.copy_from_host(d.data(), d.size()); + block_ref_D.copy_from_host(d.data(), d.size()); + } + + virtual void run(const Options& options, const cutlass::KernelHardwareInfo& hw_info) { + ProblemShapeType problem_size = ProblemShapeType{options.m, options.n, options.k, options.l}; + + initialize(problem_size); + + typename Gemm::GemmKernel::Arguments arguments{ + cutlass::gemm::GemmUniversalMode::kGemm, + problem_size, + {block_A.get(), stride_A, block_B.get(), stride_B}, + {{options.alpha, options.beta}, block_C.get(), stride_C, block_D.get(), stride_D}, + hw_info + }; + + Gemm gemm_op; + + size_t workspace_size = Gemm::get_workspace_size(arguments); + cutlass::device_memory::allocation workspace(workspace_size); + + gemm_op.can_implement(arguments); + + gemm_op.initialize(arguments, workspace.get()); + + // Run the GEMM + gemm_op.run(); + +#if defined(CUTLASS_ENABLE_SYCL) + syclcompat::wait(); +#else + cudaDeviceSynchronize(); +#endif + + // Verify that the result is correct + bool passed = verify(problem_size, options.alpha, options.beta); + std::cout << "Disposition: " << (passed ? 
"Passed" : "Failed") << std::endl; + + if (passed && options.iterations > 0) { + GPU_Clock timer; + timer.start(); + for (int i = 0; i < options.iterations; ++i) { + gemm_op.run(); + } + + float cute_time = timer.seconds() / options.iterations; + double tflops = (2.0 * options.m * options.n * options.k * options.l) * 1e-12; + std::cout << "Problem Size: " << options.m << 'x' << options.n << 'x' << options.k << 'x' << options.l + << std::endl; + printf("Cutlass GEMM Performance: [%4.3f]TFlop/s (%6.4f)ms\n", tflops / cute_time, cute_time * 1000); + } + } +}; + +template +struct PvcExampleRunner : ExampleRunner { + using Base = ExampleRunner; + + using ElementB = typename Base::ElementB; + + using ProblemShapeType = typename Base::ProblemShapeType; + + cutlass::DeviceAllocation block_B_vnni; + + template + void vnni_matrix( + T* dst, const T* src, + int batch, int numRows, int numCols, int factor) + { + for (int b = 0; b < batch; b++) { + for (int r = 0; r < numRows / factor; r++) { + for (int c = 0; c < numCols; c++) { + for (int k = 0; k < factor; k++) { + dst[((b * (numRows / factor) + r) * numCols + c) * factor + k] = + src[((b * (numRows / factor) + r) * factor + k) * numCols + c]; + } + } + } + } + } + + void initialize(const ProblemShapeType& problem_size) override { + Base::initialize(problem_size); + + auto problem_shape_MNKL = cute::append<4>(problem_size, 1); + auto [M, N, K, L] = problem_shape_MNKL; + + block_B_vnni.reset(Base::block_B.size()); + + std::vector b(K * N * L); + std::vector b_vnni(b.size()); + + Base::block_B.copy_to_host(b.data()); + vnni_matrix(b_vnni.data(), b.data(), L, K, N, 2); + + block_B_vnni.copy_from_host(b_vnni.data()); + } + + void run(const Options& options, const cutlass::KernelHardwareInfo& hw_info) override { + ProblemShapeType problem_size = ProblemShapeType{options.m, options.n, options.k, options.l}; + + initialize(problem_size); + + typename Gemm::GemmKernel::Arguments arguments{ + cutlass::gemm::GemmUniversalMode::kGemm, + problem_size, + {Base::block_A.get(), Base::stride_A, block_B_vnni.get(), Base::stride_B}, + { + {options.alpha, options.beta}, + Base::block_C.get(), Base::stride_C, Base::block_D.get(), Base::stride_D + }, + hw_info + }; + + Gemm gemm_op; + + size_t workspace_size = Gemm::get_workspace_size(arguments); + cutlass::device_memory::allocation workspace(workspace_size); + + gemm_op.can_implement(arguments); + + gemm_op.initialize(arguments, workspace.get()); + + // Run the GEMM + gemm_op.run(); + +#if defined(CUTLASS_ENABLE_SYCL) + syclcompat::wait(); +#else + cudaDeviceSynchronize(); +#endif + + // Verify that the result is correct + bool passed = Base::verify(problem_size, options.alpha, options.beta); + std::cout << "Disposition: " << (passed ? 
"Passed" : "Failed") << std::endl; + + if (passed && options.iterations > 0) { + GPU_Clock timer; + timer.start(); + for (int i = 0; i < options.iterations; ++i) { + gemm_op.run(); + } + + float cute_time = timer.seconds() / options.iterations; + double tflops = (2.0 * options.m * options.n * options.k * options.l) * 1e-12; + std::cout << "Problem Size: " << options.m << 'x' << options.n << 'x' << options.k << 'x' << options.l << std::endl; + printf("Cutlass GEMM Performance: [%4.3f]TFlop/s (%6.4f)ms\n", tflops / cute_time, cute_time*1000); + } + } +}; + From a0dc044ba85a3db85c77ad9f3bb8c673b72e853f Mon Sep 17 00:00:00 2001 From: Alejandro Acosta Date: Mon, 20 May 2024 14:23:37 +0100 Subject: [PATCH 05/10] Add Ampere half-float example --- benchmarks/CMakeLists.txt | 2 + ...ere_gemm_fp16_fp16_fp32_tensor_op_fp32.cpp | 2 +- examples/sycl/common/example_runner.hpp | 387 ------------------ 3 files changed, 3 insertions(+), 388 deletions(-) delete mode 100644 examples/sycl/common/example_runner.hpp diff --git a/benchmarks/CMakeLists.txt b/benchmarks/CMakeLists.txt index d499f839a9..446f8fd8bb 100644 --- a/benchmarks/CMakeLists.txt +++ b/benchmarks/CMakeLists.txt @@ -66,6 +66,8 @@ endfunction() if(SYCL_INTEL_TARGET) add_subdirectory(pvc) +else(SYCL_NVIDIA_TARGET) + add_subdirectory(ampere) endif() if (SYCL_NVIDIA_TARGET) add_subdirectory(ampere) diff --git a/benchmarks/ampere/bench_ampere_gemm_fp16_fp16_fp32_tensor_op_fp32.cpp b/benchmarks/ampere/bench_ampere_gemm_fp16_fp16_fp32_tensor_op_fp32.cpp index 69b6159f76..69bc482f12 100644 --- a/benchmarks/ampere/bench_ampere_gemm_fp16_fp16_fp32_tensor_op_fp32.cpp +++ b/benchmarks/ampere/bench_ampere_gemm_fp16_fp16_fp32_tensor_op_fp32.cpp @@ -53,7 +53,7 @@ int main(int argc, const char** argv) } // - // Run Benchmark + // Run Benchmark // // The KernelHardwareInfo struct holds the number of EUs on the GPU with a given device ID. This diff --git a/examples/sycl/common/example_runner.hpp b/examples/sycl/common/example_runner.hpp deleted file mode 100644 index f300012caa..0000000000 --- a/examples/sycl/common/example_runner.hpp +++ /dev/null @@ -1,387 +0,0 @@ -/*************************************************************************************************** - * Copyright (c) 2024 - 2024 Codeplay Software Ltd. All rights reserved. - * SPDX-License-Identifier: BSD-3-Clause - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * 1. Redistributions of source code must retain the above copyright notice, this - * list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above copyright notice, - * this list of conditions and the following disclaimer in the documentation - * and/or other materials provided with the distribution. - * - * 3. Neither the name of the copyright holder nor the names of its - * contributors may be used to endorse or promote products derived from - * this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR - * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER - * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, - * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - **************************************************************************************************/ - -#include "cutlass/gemm/device/gemm.h" -#include "cutlass/epilogue/collective/default_epilogue.hpp" -#include "cutlass/gemm/device/gemm_universal.h" -#include "cutlass/gemm/device/gemm_universal_adapter.h" -#include "cutlass/gemm/collective/collective_mma.hpp" -#include "cutlass/util/GPU_Clock.hpp" - -#include "cutlass/util/host_tensor.h" -#include "cutlass/util/reference/host/tensor_compare.h" -#include "cutlass/util/reference/host/tensor_copy.h" -#include "cutlass/util/reference/host/tensor_fill.h" -#include "cute/tensor.hpp" - -#include "cutlass/util/command_line.h" -#include "cutlass/util/device_memory.h" -#include "cutlass/util/packed_stride.hpp" -#include "cutlass/util/reference/device/gemm_complex.h" -#include "cutlass/util/reference/device/tensor_compare.h" -#include "cutlass/util/print_error.hpp" - -template -static void fill_matrix(std::vector &M) -{ - std::generate(std::begin(M), std::end(M), [&] - { return static_cast( 2*(rand() / double(RAND_MAX)) - 1 ); }); -} - -using namespace cute; - -/////////////////////////////////////////////////////////////////////////////////////////////////// - -// Command line options parsing -struct Options { - - bool help; - bool error; - - int m, n, k, l, iterations; - float alpha, beta; - - Options(): - help(false), - error(false), - m(4096), n(4096), k(4096), l(1), iterations(100), - alpha(1.f), beta(0.f) - { } - - // Parses the command line - void parse(int argc, char const **args) { - cutlass::CommandLine cmd(argc, args); - - if (cmd.check_cmd_line_flag("help")) { - help = true; - return; - } - - cmd.get_cmd_line_argument("m", m, 4096); - cmd.get_cmd_line_argument("n", n, 4096); - cmd.get_cmd_line_argument("k", k, 4096); - cmd.get_cmd_line_argument("l", l, 1); - cmd.get_cmd_line_argument("alpha", alpha, 1.f); - cmd.get_cmd_line_argument("beta", beta, 0.f); - cmd.get_cmd_line_argument("iterations", iterations, 100); - } - - /// Prints the usage statement. 
- std::ostream & print_usage(std::ostream &out) const { - - out << "PVC GEMM Example\n\n" - << "Options:\n\n" - << " --help If specified, displays this usage statement\n\n" - << " --m= Sets the M extent of the GEMM\n" - << " --n= Sets the N extent of the GEMM\n" - << " --k= Sets the K extent of the GEMM\n" - << " --l= Sets the L extent (batch count) of the GEMM\n" - << " --alpha= Epilogue scalar alpha\n" - << " --beta= Epilogue scalar beta\n\n" - << " --iterations= Iterations\n\n"; - - return out; - } -}; - -/////////////////////////////////////////////////////////////////////////////////////////////////// - -template -struct ExampleRunner { - - using StrideA = typename Gemm::GemmKernel::StrideA; - using StrideB = typename Gemm::GemmKernel::StrideB; - using StrideC = typename Gemm::GemmKernel::StrideC; - using StrideD = typename Gemm::GemmKernel::StrideD; - - using LayoutA = typename Gemm::LayoutA; - using LayoutB = typename Gemm::LayoutB; - using LayoutC = typename Gemm::LayoutC; - using LayoutD = typename Gemm::LayoutD; - - using ElementA = typename Gemm::ElementA; - using ElementB = typename Gemm::ElementB; - using ElementAcc = typename Gemm::ElementAccumulator; - - using CollectiveEpilogue = typename Gemm::CollectiveEpilogue; - using ElementC = typename Gemm::ElementC; - using ElementOutput = typename CollectiveEpilogue::ElementOutput; - using ElementCompute = typename CollectiveEpilogue::ElementCompute; - using ElementAccumulator = typename CollectiveEpilogue::ElementAccumulator; - - using ProblemShapeType = typename Gemm::GemmKernel::ProblemShape; - - // - // Data members - // - - /// Initialization - StrideA stride_A; - StrideB stride_B; - StrideC stride_C; - StrideD stride_D; - - cutlass::DeviceAllocation block_A; - cutlass::DeviceAllocation block_B; - cutlass::DeviceAllocation block_C; - cutlass::DeviceAllocation block_D; - cutlass::DeviceAllocation block_ref_D; - - // - // Methods - // - - bool verify(const ProblemShapeType& problem_size, ElementCompute alpha, ElementCompute beta) { - auto [M, N, K, L] = problem_size; - - cutlass::TensorRef ref_A(block_A.get(), LayoutA::packed({M, K})); - cutlass::TensorRef ref_B(block_B.get(), LayoutB::packed({K, N})); - cutlass::TensorRef ref_C(block_C.get(), LayoutC::packed({M, N})); - cutlass::TensorRef ref_D(block_ref_D.get(), LayoutD::packed({M, N})); - - cutlass::reference::device::GemmComplex( - {M, N, K}, - alpha, - ref_A, - cutlass::ComplexTransform::kNone, - ref_B, - cutlass::ComplexTransform::kNone, - beta, - ref_C, - ref_D, - ElementAccumulator(0), - L, // batch_count - M * K, // batch_stride_A - K * N, // batch_stride_B - M * N, // batch_stride_C - M * N // batch_stride_D - ); - -#if defined(CUTLASS_ENABLE_SYCL) - syclcompat::wait(); -#else - cudaDeviceSynchronize(); -#endif - - // Check if output from CUTLASS kernel and reference kernel are relatively equal or not - // need to set a larger error margin for comparison to succeed - auto epsilon = static_cast(0.1f); - auto nonzero_floor = static_cast(0.1f); - - bool passed = cutlass::reference::device::BlockCompareRelativelyEqual( - block_ref_D.get(), block_D.get(), block_D.size(), - epsilon, nonzero_floor); - - return passed; - } - - /// Initialize operands to be used in the GEMM and reference GEMM - virtual void initialize(const ProblemShapeType& problem_size) { - auto problem_shape_MNKL = cute::append<4>(problem_size, 1); - auto [M, N, K, L] = problem_shape_MNKL; - - stride_A = cutlass::make_cute_packed_stride(StrideA{}, cute::make_shape(M, K, L)); - stride_B = 
cutlass::make_cute_packed_stride(StrideB{}, cute::make_shape(N, K, L)); - stride_C = cutlass::make_cute_packed_stride(StrideC{}, cute::make_shape(M, N, L)); - stride_D = cutlass::make_cute_packed_stride(StrideD{}, cute::make_shape(M, N, L)); - - block_A.reset(M * K * L); - block_B.reset(K * N * L); - block_C.reset(M * N * L); - block_D.reset(M * N * L); - block_ref_D.reset(M * N * L); - - // TODO: Enable initialization on device directly once RNG is - // available through SYCL. - std::vector a(K * M * L); - std::vector b(K * N * L); - std::vector c(M * N * L); - std::vector d(M * N * L, ElementC{0}); - - fill_matrix(a); - fill_matrix(b); - fill_matrix(c); - - block_A.copy_from_host(a.data(), a.size()); - block_B.copy_from_host(b.data(), b.size()); - block_C.copy_from_host(c.data(), c.size()); - block_D.copy_from_host(d.data(), d.size()); - block_ref_D.copy_from_host(d.data(), d.size()); - } - - virtual void run(const Options& options, const cutlass::KernelHardwareInfo& hw_info) { - ProblemShapeType problem_size = ProblemShapeType{options.m, options.n, options.k, options.l}; - - initialize(problem_size); - - typename Gemm::GemmKernel::Arguments arguments{ - cutlass::gemm::GemmUniversalMode::kGemm, - problem_size, - {block_A.get(), stride_A, block_B.get(), stride_B}, - {{options.alpha, options.beta}, block_C.get(), stride_C, block_D.get(), stride_D}, - hw_info - }; - - Gemm gemm_op; - - size_t workspace_size = Gemm::get_workspace_size(arguments); - cutlass::device_memory::allocation workspace(workspace_size); - - gemm_op.can_implement(arguments); - - gemm_op.initialize(arguments, workspace.get()); - - // Run the GEMM - gemm_op.run(); - -#if defined(CUTLASS_ENABLE_SYCL) - syclcompat::wait(); -#else - cudaDeviceSynchronize(); -#endif - - // Verify that the result is correct - bool passed = verify(problem_size, options.alpha, options.beta); - std::cout << "Disposition: " << (passed ? 
"Passed" : "Failed") << std::endl; - - if (passed && options.iterations > 0) { - GPU_Clock timer; - timer.start(); - for (int i = 0; i < options.iterations; ++i) { - gemm_op.run(); - } - - float cute_time = timer.seconds() / options.iterations; - double tflops = (2.0 * options.m * options.n * options.k * options.l) * 1e-12; - std::cout << "Problem Size: " << options.m << 'x' << options.n << 'x' << options.k << 'x' << options.l - << std::endl; - printf("Cutlass GEMM Performance: [%4.3f]TFlop/s (%6.4f)ms\n", tflops / cute_time, cute_time * 1000); - } - } -}; - -template -struct PvcExampleRunner : ExampleRunner { - using Base = ExampleRunner; - - using ElementB = typename Base::ElementB; - - using ProblemShapeType = typename Base::ProblemShapeType; - - cutlass::DeviceAllocation block_B_vnni; - - template - void vnni_matrix( - T* dst, const T* src, - int batch, int numRows, int numCols, int factor) - { - for (int b = 0; b < batch; b++) { - for (int r = 0; r < numRows / factor; r++) { - for (int c = 0; c < numCols; c++) { - for (int k = 0; k < factor; k++) { - dst[((b * (numRows / factor) + r) * numCols + c) * factor + k] = - src[((b * (numRows / factor) + r) * factor + k) * numCols + c]; - } - } - } - } - } - - void initialize(const ProblemShapeType& problem_size) override { - Base::initialize(problem_size); - - auto problem_shape_MNKL = cute::append<4>(problem_size, 1); - auto [M, N, K, L] = problem_shape_MNKL; - - block_B_vnni.reset(Base::block_B.size()); - - std::vector b(K * N * L); - std::vector b_vnni(b.size()); - - Base::block_B.copy_to_host(b.data()); - vnni_matrix(b_vnni.data(), b.data(), L, K, N, 2); - - block_B_vnni.copy_from_host(b_vnni.data()); - } - - void run(const Options& options, const cutlass::KernelHardwareInfo& hw_info) override { - ProblemShapeType problem_size = ProblemShapeType{options.m, options.n, options.k, options.l}; - - initialize(problem_size); - - typename Gemm::GemmKernel::Arguments arguments{ - cutlass::gemm::GemmUniversalMode::kGemm, - problem_size, - {Base::block_A.get(), Base::stride_A, block_B_vnni.get(), Base::stride_B}, - { - {options.alpha, options.beta}, - Base::block_C.get(), Base::stride_C, Base::block_D.get(), Base::stride_D - }, - hw_info - }; - - Gemm gemm_op; - - size_t workspace_size = Gemm::get_workspace_size(arguments); - cutlass::device_memory::allocation workspace(workspace_size); - - gemm_op.can_implement(arguments); - - gemm_op.initialize(arguments, workspace.get()); - - // Run the GEMM - gemm_op.run(); - -#if defined(CUTLASS_ENABLE_SYCL) - syclcompat::wait(); -#else - cudaDeviceSynchronize(); -#endif - - // Verify that the result is correct - bool passed = Base::verify(problem_size, options.alpha, options.beta); - std::cout << "Disposition: " << (passed ? 
"Passed" : "Failed") << std::endl; - - if (passed && options.iterations > 0) { - GPU_Clock timer; - timer.start(); - for (int i = 0; i < options.iterations; ++i) { - gemm_op.run(); - } - - float cute_time = timer.seconds() / options.iterations; - double tflops = (2.0 * options.m * options.n * options.k * options.l) * 1e-12; - std::cout << "Problem Size: " << options.m << 'x' << options.n << 'x' << options.k << 'x' << options.l << std::endl; - printf("Cutlass GEMM Performance: [%4.3f]TFlop/s (%6.4f)ms\n", tflops / cute_time, cute_time*1000); - } - } -}; - From cf4a32a809c8737a90d98b892f435bfba16bd51d Mon Sep 17 00:00:00 2001 From: Alejandro Acosta Date: Wed, 29 May 2024 13:35:05 +0100 Subject: [PATCH 06/10] Update benchmarks/CMakeLists.txt Co-authored-by: Mehdi Goli --- benchmarks/CMakeLists.txt | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/benchmarks/CMakeLists.txt b/benchmarks/CMakeLists.txt index 446f8fd8bb..8b3aed1b82 100644 --- a/benchmarks/CMakeLists.txt +++ b/benchmarks/CMakeLists.txt @@ -66,7 +66,8 @@ endfunction() if(SYCL_INTEL_TARGET) add_subdirectory(pvc) -else(SYCL_NVIDIA_TARGET) +endif() +if (SYCL_NVIDIA_TARGET) add_subdirectory(ampere) endif() if (SYCL_NVIDIA_TARGET) From 69d066a5e3821e1602ab9632f68b62fac1cc1fdf Mon Sep 17 00:00:00 2001 From: Alejandro Acosta Date: Mon, 20 May 2024 14:23:37 +0100 Subject: [PATCH 07/10] Add Ampere half-float example --- benchmarks/CMakeLists.txt | 2 ++ 1 file changed, 2 insertions(+) diff --git a/benchmarks/CMakeLists.txt b/benchmarks/CMakeLists.txt index 8b3aed1b82..79d88583fa 100644 --- a/benchmarks/CMakeLists.txt +++ b/benchmarks/CMakeLists.txt @@ -66,6 +66,8 @@ endfunction() if(SYCL_INTEL_TARGET) add_subdirectory(pvc) +else(SYCL_NVIDIA_TARGET) + add_subdirectory(ampere) endif() if (SYCL_NVIDIA_TARGET) add_subdirectory(ampere) From 1d190bc056367b15ba1b43f3348093aeddc88e25 Mon Sep 17 00:00:00 2001 From: Alejandro Acosta Date: Mon, 20 May 2024 14:23:37 +0100 Subject: [PATCH 08/10] Add Ampere half-float example --- examples/sycl/CMakeLists.txt | 2 + examples/sycl/ampere/CMakeLists.txt | 33 ++++ ...ere_gemm_fp16_fp16_fp32_tensor_op_fp32.cpp | 153 ++++++++++++++++++ examples/sycl/ampere/gemm_configuration.hpp | 122 ++++++++++++++ 4 files changed, 310 insertions(+) create mode 100644 examples/sycl/ampere/CMakeLists.txt create mode 100644 examples/sycl/ampere/ampere_gemm_fp16_fp16_fp32_tensor_op_fp32.cpp create mode 100644 examples/sycl/ampere/gemm_configuration.hpp diff --git a/examples/sycl/CMakeLists.txt b/examples/sycl/CMakeLists.txt index b736ce35e8..8dac4f6c55 100644 --- a/examples/sycl/CMakeLists.txt +++ b/examples/sycl/CMakeLists.txt @@ -29,4 +29,6 @@ if(SYCL_INTEL_TARGET) add_subdirectory(pvc) +else(SYCL_NVIDIA_TARGET) + add_subdirectory(ampere) endif() diff --git a/examples/sycl/ampere/CMakeLists.txt b/examples/sycl/ampere/CMakeLists.txt new file mode 100644 index 0000000000..d25c28fc1d --- /dev/null +++ b/examples/sycl/ampere/CMakeLists.txt @@ -0,0 +1,33 @@ +# Copyright (c) 2024 - 2024 Codeplay Software Ltd. All rights reserved. +# SPDX-License-Identifier: BSD-3-Clause +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. 
Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# 3. Neither the name of the copyright holder nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + +cutlass_example_add_executable( + ampere_gemm_fp16_fp16_fp32_tensor_op_fp32 + ampere_gemm_fp16_fp16_fp32_tensor_op_fp32.cpp +) diff --git a/examples/sycl/ampere/ampere_gemm_fp16_fp16_fp32_tensor_op_fp32.cpp b/examples/sycl/ampere/ampere_gemm_fp16_fp16_fp32_tensor_op_fp32.cpp new file mode 100644 index 0000000000..a7e212fe80 --- /dev/null +++ b/examples/sycl/ampere/ampere_gemm_fp16_fp16_fp32_tensor_op_fp32.cpp @@ -0,0 +1,153 @@ +/*************************************************************************************************** + * Copyright (c) 2024 - 2024 Codeplay Software Ltd. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * + **************************************************************************************************/ + +#include "../common/example_runner.hpp" +#include "gemm_configuration.hpp" + +int main(int argc, const char** argv) +{ + // + // Parse options + // + + Options options; + + options.parse(argc, argv); + + if (options.help) { + options.print_usage(std::cout) << std::endl; + return 0; + } + + if (options.error) { + std::cerr << "Aborting execution." << std::endl; + return -1; + } + + // + // Run examples + // + + // The KernelHardwareInfo struct holds the number of EUs on the GPU with a given device ID. This + // information is used by the underlying kernel. + cutlass::KernelHardwareInfo hw_info; + + // Change device_id to another value if you are running on a machine with multiple GPUs and wish + // to use a GPU other than that with device ID 0. + hw_info.sm_count = cutlass::KernelHardwareInfo::query_device_multiprocessor_count(hw_info.device_id); + +// The code section below describes datatype for input, output matrices and computation between +// elements in input matrices. + using ElementAccumulator = float; // <- data type of accumulator + using ElementComputeEpilogue = float; // <- data type of epilogue operations + using ElementInputA = half_t; // <- data type of elements in input matrix A + using ElementInputB = half_t; // <- data type of elements in input matrix B + using ElementOutput = float; // <- data type of elements in output matrix D + + using LayoutA = cutlass::layout::ColumnMajor; + using LayoutB = cutlass::layout::RowMajor; + using LayoutC = cutlass::layout::ColumnMajor; + using LayoutD = cutlass::layout::ColumnMajor; + + using TileShape = Shape<_128, _128, _32>; + + using TiledMma = TiledMMA< + MMA_Atom, + Layout>, // 2x2x1 thread group + Tile<_32,_32,_16>>; // 32x32x8 MMA for LDSM, 1x2x1 value group + + static constexpr int kAlignmentA = 8; + using DefaultOperandA = DefaultGemm_TensorOpSm80_OperandA< + ElementInputA, LayoutA, kAlignmentA, 32>; + using SmemLayoutAtomA = typename DefaultOperandA::SmemLayoutAtom; // M, K + using SmemCopyAtomA = typename DefaultOperandA::SmemCopyAtom; + using GmemTiledCopyA = typename DefaultOperandA::GmemTiledCopy; + + static constexpr int kAlignmentB = 8; + using DefaultOperandB = DefaultGemm_TensorOpSm80_OperandB< + ElementInputB, LayoutB, kAlignmentB, 32>; + using SmemLayoutAtomB = typename DefaultOperandB::SmemLayoutAtom; // N, K + using SmemCopyAtomB = typename DefaultOperandB::SmemCopyAtom; + using GmemTiledCopyB = typename DefaultOperandB::GmemTiledCopy; + + using Stages = Int<3>; + + // This code section describes the epilogue part of the kernel + using EpilogueOp = cutlass::epilogue::thread::LinearCombination< + ElementOutput, // <- data type of output matrix + 128 / cutlass::sizeof_bits::value, // <- the number of elements per vectorized + // memory access. For a byte, it's 16 + // elements. 
This becomes the vector width of + // math instructions in the epilogue too + ElementAccumulator, // <- data type of accumulator + ElementComputeEpilogue>; // <- data type for alpha/beta in linear combination function + + using DispatchPolicy = cutlass::gemm::MainloopSm80CpAsync; + + // Define strides (mixed) + using StrideA = cutlass::detail::TagToStrideA_t; + using StrideB = cutlass::detail::TagToStrideB_t; + using StrideC = cutlass::detail::TagToStrideC_t; + using StrideD = cutlass::detail::TagToStrideC_t; + + using CollectiveEpilogue = cutlass::epilogue::collective::DefaultEpilogue< + StrideC, + StrideD, + EpilogueOp, + cutlass::gemm::EpilogueDefault>; + + // Mainloop + using CollectiveMainloop = cutlass::gemm::collective::CollectiveMma< + DispatchPolicy, + TileShape, + ElementInputA, + StrideA, + ElementInputB, + StrideB, + TiledMma, + GmemTiledCopyA, SmemLayoutAtomA, SmemCopyAtomA, cute::identity, // A + GmemTiledCopyB, SmemLayoutAtomB, SmemCopyAtomB, cute::identity // B + >; + + using GemmKernel = cutlass::gemm::kernel::GemmUniversal< + Shape, + CollectiveMainloop, + CollectiveEpilogue + >; + + using Gemm = cutlass::gemm::device::GemmUniversalAdapter; + + ExampleRunner runner; + + runner.run(options, hw_info); + + return 0; +} diff --git a/examples/sycl/ampere/gemm_configuration.hpp b/examples/sycl/ampere/gemm_configuration.hpp new file mode 100644 index 0000000000..8a32e77e7f --- /dev/null +++ b/examples/sycl/ampere/gemm_configuration.hpp @@ -0,0 +1,122 @@ +/*************************************************************************************************** + * Copyright (c) 2024 - 2024 Codeplay Software Ltd. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * + **************************************************************************************************/ + +#include "cutlass/half.h" +#include "cutlass/layout/layout.h" + +#include "cute/swizzle.hpp" +#include "cute/layout.hpp" +#include "cute/arch/copy_sm75.hpp" +#include "cute/arch/copy_sm80.hpp" +#include "cute/atom/copy_atom.hpp" + +using namespace cute; + +template +struct DefaultGemm_TensorOpSm80_OperandA; + +template +struct DefaultGemm_TensorOpSm80_OperandB; + +///////////////////////////////////////////////////////////////////////// + +// half + +/// Operand A - Row-major (K-Major) +template <> +struct DefaultGemm_TensorOpSm80_OperandA +{ + // Smem + using SmemLayoutAtom = decltype( + composition(Swizzle<3,3,3>{}, + Layout, + Stride<_64, _1>>{})); + using SmemCopyAtom = Copy_Atom; + + // Gmem + using GmemTiledCopy = decltype( + make_tiled_copy(Copy_Atom, half_t>{}, + Layout, + Stride< _8,_1>>{}, + Layout>{})); +}; + +/// Operand A - Column-major (M-major) +template +struct DefaultGemm_TensorOpSm80_OperandA +{ + // Smem + using SmemLayoutAtom = decltype( + composition(Swizzle<3,3,3>{}, + Layout, + Stride< _1,_64>>{})); + using SmemCopyAtom = Copy_Atom; + + // Gmem + using GmemTiledCopy = decltype( + make_tiled_copy(Copy_Atom, half_t>{}, + Layout, + Stride< _1,_16>>{}, + Layout>{})); +}; + +/// Operand A - Row-major (K-Major) +template <> +struct DefaultGemm_TensorOpSm80_OperandA +{ + // Smem + using SmemLayoutAtom = decltype( + composition(Swizzle<2,3,3>{}, + Layout, + Stride<_32, _1>>{})); + using SmemCopyAtom = Copy_Atom; + + // Gmem + using GmemTiledCopy = decltype( + make_tiled_copy(Copy_Atom, half_t>{}, + Layout, + Stride< _4,_1>>{}, + Layout>{})); +}; + +// Because the F32F16 TiledMMA is A-B symmetric, we can reuse the DefaultOperands + +// Operand B - Column-Major (K-major) +template +struct DefaultGemm_TensorOpSm80_OperandB + : DefaultGemm_TensorOpSm80_OperandA +{}; + +// Operand B - Row-Major (N-major) +template +struct DefaultGemm_TensorOpSm80_OperandB + : DefaultGemm_TensorOpSm80_OperandA +{}; From 3b8108c56b7c029eea72eba23b1938c038626bc8 Mon Sep 17 00:00:00 2001 From: Alejandro Acosta Date: Mon, 20 May 2024 14:23:37 +0100 Subject: [PATCH 09/10] Add Ampere half-float example --- ...pere_gemm_fp16_fp16_fp32_tensor_op_fp32.cu | 155 ++++++++++++++++++ 1 file changed, 155 insertions(+) create mode 100644 examples/sycl/ampere/ampere_gemm_fp16_fp16_fp32_tensor_op_fp32.cu diff --git a/examples/sycl/ampere/ampere_gemm_fp16_fp16_fp32_tensor_op_fp32.cu b/examples/sycl/ampere/ampere_gemm_fp16_fp16_fp32_tensor_op_fp32.cu new file mode 100644 index 0000000000..67aabf76d3 --- /dev/null +++ b/examples/sycl/ampere/ampere_gemm_fp16_fp16_fp32_tensor_op_fp32.cu @@ -0,0 +1,155 @@ +/*************************************************************************************************** + * Copyright (c) 2024 - 2024 Codeplay Software Ltd. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. 
Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ + +#include "../common/example_runner.hpp" +#include "gemm_configuration.hpp" + +int main(int argc, const char** argv) +{ + // + // Parse options + // + + Options options; + + options.parse(argc, argv); + + if (options.help) { + options.print_usage(std::cout) << std::endl; + return 0; + } + + if (options.error) { + std::cerr << "Aborting execution." << std::endl; + return -1; + } + + // + // Run examples + // + + // The KernelHardwareInfo struct holds the number of EUs on the GPU with a given device ID. This + // information is used by the underlying kernel. + cutlass::KernelHardwareInfo hw_info; + + // Change device_id to another value if you are running on a machine with multiple GPUs and wish + // to use a GPU other than that with device ID 0. + hw_info.sm_count = cutlass::KernelHardwareInfo::query_device_multiprocessor_count(hw_info.device_id); + + bool passed; + +// The code section below describes datatype for input, output matrices and computation between +// elements in input matrices. 
+ using ElementAccumulator = float; // <- data type of accumulator + using ElementComputeEpilogue = float; // <- data type of epilogue operations + using ElementInputA = half_t; // <- data type of elements in input matrix A + using ElementInputB = half_t; // <- data type of elements in input matrix B + using ElementOutput = float; // <- data type of elements in output matrix D + + using LayoutA = cutlass::layout::ColumnMajor; + using LayoutB = cutlass::layout::RowMajor; + using LayoutC = cutlass::layout::ColumnMajor; + using LayoutD = cutlass::layout::ColumnMajor; + + using TileShape = Shape<_128, _128, _32>; + + using TiledMma = TiledMMA< + MMA_Atom, + Layout>, // 2x2x1 thread group + Tile<_32,_32,_16>>; // 32x32x8 MMA for LDSM, 1x2x1 value group + + static constexpr int kAlignmentA = 8; + using DefaultOperandA = DefaultGemm_TensorOpSm80_OperandA< + ElementInputA, LayoutA, kAlignmentA, 32>; + using SmemLayoutAtomA = typename DefaultOperandA::SmemLayoutAtom; // M, K + using SmemCopyAtomA = typename DefaultOperandA::SmemCopyAtom; + using GmemTiledCopyA = typename DefaultOperandA::GmemTiledCopy; + + static constexpr int kAlignmentB = 8; + using DefaultOperandB = DefaultGemm_TensorOpSm80_OperandB< + ElementInputB, LayoutB, kAlignmentB, 32>; + using SmemLayoutAtomB = typename DefaultOperandB::SmemLayoutAtom; // N, K + using SmemCopyAtomB = typename DefaultOperandB::SmemCopyAtom; + using GmemTiledCopyB = typename DefaultOperandB::GmemTiledCopy; + + using Stages = Int<3>; + + // This code section describes the epilogue part of the kernel + using EpilogueOp = cutlass::epilogue::thread::LinearCombination< + ElementOutput, // <- data type of output matrix + 128 / cutlass::sizeof_bits::value, // <- the number of elements per vectorized + // memory access. For a byte, it's 16 + // elements. 
This becomes the vector width of + // math instructions in the epilogue too + ElementAccumulator, // <- data type of accumulator + ElementComputeEpilogue>; // <- data type for alpha/beta in linear combination function + + using DispatchPolicy = cutlass::gemm::MainloopSm80CpAsync; + + // Define strides (mixed) + using StrideA = cutlass::detail::TagToStrideA_t; + using StrideB = cutlass::detail::TagToStrideB_t; + using StrideC = cutlass::detail::TagToStrideC_t; + using StrideD = cutlass::detail::TagToStrideC_t; + + using CollectiveEpilogue = cutlass::epilogue::collective::DefaultEpilogue< + StrideC, + StrideD, + EpilogueOp, + cutlass::gemm::EpilogueDefault>; + + // Mainloop + using CollectiveMainloop = cutlass::gemm::collective::CollectiveMma< + DispatchPolicy, + TileShape, + ElementInputA, + StrideA, + ElementInputB, + StrideB, + TiledMma, + GmemTiledCopyA, SmemLayoutAtomA, SmemCopyAtomA, cute::identity, // A + GmemTiledCopyB, SmemLayoutAtomB, SmemCopyAtomB, cute::identity // B + >; + + using GemmKernel = cutlass::gemm::kernel::GemmUniversal< + Shape, + CollectiveMainloop, + CollectiveEpilogue + >; + + using Gemm = cutlass::gemm::device::GemmUniversalAdapter; + + ExampleRunner runner; + + runner.run(options, hw_info); + + return 0; +} From 74cbb6d74dbf00f5464dfc9d5c2711d946ec7fd4 Mon Sep 17 00:00:00 2001 From: Alejandro Acosta Date: Mon, 20 May 2024 14:29:48 +0100 Subject: [PATCH 10/10] Add Ampere bfloat-float example --- benchmarks/CMakeLists.txt | 5 - benchmarks/ampere/CMakeLists.txt | 5 + ...ere_gemm_bf16_bf16_fp32_tensor_op_fp32.cpp | 14 +- benchmarks/ampere/gemm_configuration.hpp | 87 +++++++++- benchmarks/common/benchmark_runner.hpp | 2 +- ...ench_pvc_gemm_bf16_bf16_fp32_dpas_fp32.cpp | 2 +- examples/sycl/CMakeLists.txt | 2 - examples/sycl/ampere/CMakeLists.txt | 33 ---- ...pere_gemm_fp16_fp16_fp32_tensor_op_fp32.cu | 155 ------------------ examples/sycl/ampere/gemm_configuration.hpp | 122 -------------- 10 files changed, 95 insertions(+), 332 deletions(-) rename examples/sycl/ampere/ampere_gemm_fp16_fp16_fp32_tensor_op_fp32.cpp => benchmarks/ampere/bench_ampere_gemm_bf16_bf16_fp32_tensor_op_fp32.cpp (93%) delete mode 100644 examples/sycl/ampere/CMakeLists.txt delete mode 100644 examples/sycl/ampere/ampere_gemm_fp16_fp16_fp32_tensor_op_fp32.cu delete mode 100644 examples/sycl/ampere/gemm_configuration.hpp diff --git a/benchmarks/CMakeLists.txt b/benchmarks/CMakeLists.txt index 79d88583fa..d499f839a9 100644 --- a/benchmarks/CMakeLists.txt +++ b/benchmarks/CMakeLists.txt @@ -66,11 +66,6 @@ endfunction() if(SYCL_INTEL_TARGET) add_subdirectory(pvc) -else(SYCL_NVIDIA_TARGET) - add_subdirectory(ampere) -endif() -if (SYCL_NVIDIA_TARGET) - add_subdirectory(ampere) endif() if (SYCL_NVIDIA_TARGET) add_subdirectory(ampere) diff --git a/benchmarks/ampere/CMakeLists.txt b/benchmarks/ampere/CMakeLists.txt index 70c2bdc990..666d9cac60 100644 --- a/benchmarks/ampere/CMakeLists.txt +++ b/benchmarks/ampere/CMakeLists.txt @@ -31,3 +31,8 @@ cutlass_benchmark_add_executable( bench_ampere_gemm_fp16_fp16_fp32_tensor_op_fp32 bench_ampere_gemm_fp16_fp16_fp32_tensor_op_fp32.cpp ) + +cutlass_benchmark_add_executable( + bench_ampere_gemm_bf16_bf16_fp32_tensor_op_fp32 + bench_ampere_gemm_bf16_bf16_fp32_tensor_op_fp32.cpp +) diff --git a/examples/sycl/ampere/ampere_gemm_fp16_fp16_fp32_tensor_op_fp32.cpp b/benchmarks/ampere/bench_ampere_gemm_bf16_bf16_fp32_tensor_op_fp32.cpp similarity index 93% rename from examples/sycl/ampere/ampere_gemm_fp16_fp16_fp32_tensor_op_fp32.cpp rename to 
benchmarks/ampere/bench_ampere_gemm_bf16_bf16_fp32_tensor_op_fp32.cpp index a7e212fe80..8dad127417 100644 --- a/examples/sycl/ampere/ampere_gemm_fp16_fp16_fp32_tensor_op_fp32.cpp +++ b/benchmarks/ampere/bench_ampere_gemm_bf16_bf16_fp32_tensor_op_fp32.cpp @@ -29,7 +29,7 @@ * **************************************************************************************************/ -#include "../common/example_runner.hpp" +#include "../common/benchmark_runner.hpp" #include "gemm_configuration.hpp" int main(int argc, const char** argv) @@ -53,7 +53,7 @@ int main(int argc, const char** argv) } // - // Run examples + // Run benchmark // // The KernelHardwareInfo struct holds the number of EUs on the GPU with a given device ID. This @@ -68,19 +68,19 @@ int main(int argc, const char** argv) // elements in input matrices. using ElementAccumulator = float; // <- data type of accumulator using ElementComputeEpilogue = float; // <- data type of epilogue operations - using ElementInputA = half_t; // <- data type of elements in input matrix A - using ElementInputB = half_t; // <- data type of elements in input matrix B + using ElementInputA = bfloat16_t; // <- data type of elements in input matrix A + using ElementInputB = bfloat16_t; // <- data type of elements in input matrix B using ElementOutput = float; // <- data type of elements in output matrix D using LayoutA = cutlass::layout::ColumnMajor; - using LayoutB = cutlass::layout::RowMajor; + using LayoutB = cutlass::layout::ColumnMajor; using LayoutC = cutlass::layout::ColumnMajor; using LayoutD = cutlass::layout::ColumnMajor; using TileShape = Shape<_128, _128, _32>; using TiledMma = TiledMMA< - MMA_Atom, + MMA_Atom, Layout>, // 2x2x1 thread group Tile<_32,_32,_16>>; // 32x32x8 MMA for LDSM, 1x2x1 value group @@ -145,7 +145,7 @@ int main(int argc, const char** argv) using Gemm = cutlass::gemm::device::GemmUniversalAdapter; - ExampleRunner runner; + BenchmarkRunner runner; runner.run(options, hw_info); diff --git a/benchmarks/ampere/gemm_configuration.hpp b/benchmarks/ampere/gemm_configuration.hpp index 8a32e77e7f..484786567f 100644 --- a/benchmarks/ampere/gemm_configuration.hpp +++ b/benchmarks/ampere/gemm_configuration.hpp @@ -58,14 +58,14 @@ struct DefaultGemm_TensorOpSm80_OperandA{}, Layout, - Stride<_64, _1>>{})); + Stride<_64, _1>>{})); using SmemCopyAtom = Copy_Atom; // Gmem using GmemTiledCopy = decltype( make_tiled_copy(Copy_Atom, half_t>{}, Layout, - Stride< _8,_1>>{}, + Stride< _8,_1>>{}, Layout>{})); }; @@ -77,14 +77,14 @@ struct DefaultGemm_TensorOpSm80_OperandA{}, Layout, - Stride< _1,_64>>{})); + Stride< _1,_64>>{})); using SmemCopyAtom = Copy_Atom; // Gmem using GmemTiledCopy = decltype( make_tiled_copy(Copy_Atom, half_t>{}, Layout, - Stride< _1,_16>>{}, + Stride< _1,_16>>{}, Layout>{})); }; @@ -96,14 +96,14 @@ struct DefaultGemm_TensorOpSm80_OperandA{}, Layout, - Stride<_32, _1>>{})); + Stride<_32, _1>>{})); using SmemCopyAtom = Copy_Atom; // Gmem using GmemTiledCopy = decltype( make_tiled_copy(Copy_Atom, half_t>{}, Layout, - Stride< _4,_1>>{}, + Stride< _4,_1>>{}, Layout>{})); }; @@ -120,3 +120,78 @@ template struct DefaultGemm_TensorOpSm80_OperandB : DefaultGemm_TensorOpSm80_OperandA {}; + +///////////////////////////////////////////////////////////////////////// + +// Bfloat + +/// Operand A - Row-major (K-Major) +template <> +struct DefaultGemm_TensorOpSm80_OperandA +{ + // Smem + using SmemLayoutAtom = decltype( + composition(Swizzle<3,3,3>{}, + Layout, + Stride<_64, _1>>{})); + using SmemCopyAtom = Copy_Atom; + + // Gmem + using 
GmemTiledCopy = decltype( + make_tiled_copy(Copy_Atom, bfloat16_t>{}, + Layout, + Stride< _8,_1>>{}, + Layout>{})); +}; + +/// Operand A - Column-major (M-major) +template +struct DefaultGemm_TensorOpSm80_OperandA +{ + // Smem + using SmemLayoutAtom = decltype( + composition(Swizzle<3,3,3>{}, + Layout, + Stride< _1,_64>>{})); + using SmemCopyAtom = Copy_Atom; + + // Gmem + using GmemTiledCopy = decltype( + make_tiled_copy(Copy_Atom, bfloat16_t>{}, + Layout, + Stride< _1,_16>>{}, + Layout>{})); +}; + +/// Operand A - Row-major (K-Major) +template <> +struct DefaultGemm_TensorOpSm80_OperandA +{ + // Smem + using SmemLayoutAtom = decltype( + composition(Swizzle<2,3,3>{}, + Layout, + Stride<_32, _1>>{})); + using SmemCopyAtom = Copy_Atom; + + // Gmem + using GmemTiledCopy = decltype( + make_tiled_copy(Copy_Atom, bfloat16_t>{}, + Layout, + Stride< _4,_1>>{}, + Layout>{})); +}; + +// Because the F32F16 TiledMMA is A-B symmetric, we can reuse the DefaultOperands + +// Operand B - Column-Major (K-major) +template +struct DefaultGemm_TensorOpSm80_OperandB + : DefaultGemm_TensorOpSm80_OperandA +{}; + +// Operand B - Row-Major (N-major) +template +struct DefaultGemm_TensorOpSm80_OperandB + : DefaultGemm_TensorOpSm80_OperandA +{}; diff --git a/benchmarks/common/benchmark_runner.hpp b/benchmarks/common/benchmark_runner.hpp index e3d5d8f3a5..5eb2ade3eb 100644 --- a/benchmarks/common/benchmark_runner.hpp +++ b/benchmarks/common/benchmark_runner.hpp @@ -97,7 +97,7 @@ struct Options { /// Prints the usage statement. std::ostream & print_usage(std::ostream &out) const { - out << "PVC GEMM Example\n\n" + out << "PVC GEMM Benchmark\n\n" << "Options:\n\n" << " --help If specified, displays this usage statement\n\n" << " --m= Sets the M extent of the GEMM\n" diff --git a/benchmarks/pvc/bench_pvc_gemm_bf16_bf16_fp32_dpas_fp32.cpp b/benchmarks/pvc/bench_pvc_gemm_bf16_bf16_fp32_dpas_fp32.cpp index 2ff30d7a79..67b76929db 100644 --- a/benchmarks/pvc/bench_pvc_gemm_bf16_bf16_fp32_dpas_fp32.cpp +++ b/benchmarks/pvc/bench_pvc_gemm_bf16_bf16_fp32_dpas_fp32.cpp @@ -56,7 +56,7 @@ int main(int argc, const char** argv) } // - // Run examples + // Run benchmark // // The KernelHardwareInfo struct holds the number of EUs on the GPU with a given device ID. This diff --git a/examples/sycl/CMakeLists.txt b/examples/sycl/CMakeLists.txt index 8dac4f6c55..b736ce35e8 100644 --- a/examples/sycl/CMakeLists.txt +++ b/examples/sycl/CMakeLists.txt @@ -29,6 +29,4 @@ if(SYCL_INTEL_TARGET) add_subdirectory(pvc) -else(SYCL_NVIDIA_TARGET) - add_subdirectory(ampere) endif() diff --git a/examples/sycl/ampere/CMakeLists.txt b/examples/sycl/ampere/CMakeLists.txt deleted file mode 100644 index d25c28fc1d..0000000000 --- a/examples/sycl/ampere/CMakeLists.txt +++ /dev/null @@ -1,33 +0,0 @@ -# Copyright (c) 2024 - 2024 Codeplay Software Ltd. All rights reserved. -# SPDX-License-Identifier: BSD-3-Clause -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are met: -# -# 1. Redistributions of source code must retain the above copyright notice, this -# list of conditions and the following disclaimer. -# -# 2. Redistributions in binary form must reproduce the above copyright notice, -# this list of conditions and the following disclaimer in the documentation -# and/or other materials provided with the distribution. -# -# 3. 
Neither the name of the copyright holder nor the names of its -# contributors may be used to endorse or promote products derived from -# this software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE -# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - - -cutlass_example_add_executable( - ampere_gemm_fp16_fp16_fp32_tensor_op_fp32 - ampere_gemm_fp16_fp16_fp32_tensor_op_fp32.cpp -) diff --git a/examples/sycl/ampere/ampere_gemm_fp16_fp16_fp32_tensor_op_fp32.cu b/examples/sycl/ampere/ampere_gemm_fp16_fp16_fp32_tensor_op_fp32.cu deleted file mode 100644 index 67aabf76d3..0000000000 --- a/examples/sycl/ampere/ampere_gemm_fp16_fp16_fp32_tensor_op_fp32.cu +++ /dev/null @@ -1,155 +0,0 @@ -/*************************************************************************************************** - * Copyright (c) 2024 - 2024 Codeplay Software Ltd. All rights reserved. - * SPDX-License-Identifier: BSD-3-Clause - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * 1. Redistributions of source code must retain the above copyright notice, this - * list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above copyright notice, - * this list of conditions and the following disclaimer in the documentation - * and/or other materials provided with the distribution. - * - * 3. Neither the name of the copyright holder nor the names of its - * contributors may be used to endorse or promote products derived from - * this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR - * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER - * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, - * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- * - **************************************************************************************************/ - -#include "../common/example_runner.hpp" -#include "gemm_configuration.hpp" - -int main(int argc, const char** argv) -{ - // - // Parse options - // - - Options options; - - options.parse(argc, argv); - - if (options.help) { - options.print_usage(std::cout) << std::endl; - return 0; - } - - if (options.error) { - std::cerr << "Aborting execution." << std::endl; - return -1; - } - - // - // Run examples - // - - // The KernelHardwareInfo struct holds the number of EUs on the GPU with a given device ID. This - // information is used by the underlying kernel. - cutlass::KernelHardwareInfo hw_info; - - // Change device_id to another value if you are running on a machine with multiple GPUs and wish - // to use a GPU other than that with device ID 0. - hw_info.sm_count = cutlass::KernelHardwareInfo::query_device_multiprocessor_count(hw_info.device_id); - - bool passed; - -// The code section below describes datatype for input, output matrices and computation between -// elements in input matrices. - using ElementAccumulator = float; // <- data type of accumulator - using ElementComputeEpilogue = float; // <- data type of epilogue operations - using ElementInputA = half_t; // <- data type of elements in input matrix A - using ElementInputB = half_t; // <- data type of elements in input matrix B - using ElementOutput = float; // <- data type of elements in output matrix D - - using LayoutA = cutlass::layout::ColumnMajor; - using LayoutB = cutlass::layout::RowMajor; - using LayoutC = cutlass::layout::ColumnMajor; - using LayoutD = cutlass::layout::ColumnMajor; - - using TileShape = Shape<_128, _128, _32>; - - using TiledMma = TiledMMA< - MMA_Atom, - Layout>, // 2x2x1 thread group - Tile<_32,_32,_16>>; // 32x32x8 MMA for LDSM, 1x2x1 value group - - static constexpr int kAlignmentA = 8; - using DefaultOperandA = DefaultGemm_TensorOpSm80_OperandA< - ElementInputA, LayoutA, kAlignmentA, 32>; - using SmemLayoutAtomA = typename DefaultOperandA::SmemLayoutAtom; // M, K - using SmemCopyAtomA = typename DefaultOperandA::SmemCopyAtom; - using GmemTiledCopyA = typename DefaultOperandA::GmemTiledCopy; - - static constexpr int kAlignmentB = 8; - using DefaultOperandB = DefaultGemm_TensorOpSm80_OperandB< - ElementInputB, LayoutB, kAlignmentB, 32>; - using SmemLayoutAtomB = typename DefaultOperandB::SmemLayoutAtom; // N, K - using SmemCopyAtomB = typename DefaultOperandB::SmemCopyAtom; - using GmemTiledCopyB = typename DefaultOperandB::GmemTiledCopy; - - using Stages = Int<3>; - - // This code section describes the epilogue part of the kernel - using EpilogueOp = cutlass::epilogue::thread::LinearCombination< - ElementOutput, // <- data type of output matrix - 128 / cutlass::sizeof_bits::value, // <- the number of elements per vectorized - // memory access. For a byte, it's 16 - // elements. 
This becomes the vector width of - // math instructions in the epilogue too - ElementAccumulator, // <- data type of accumulator - ElementComputeEpilogue>; // <- data type for alpha/beta in linear combination function - - using DispatchPolicy = cutlass::gemm::MainloopSm80CpAsync; - - // Define strides (mixed) - using StrideA = cutlass::detail::TagToStrideA_t; - using StrideB = cutlass::detail::TagToStrideB_t; - using StrideC = cutlass::detail::TagToStrideC_t; - using StrideD = cutlass::detail::TagToStrideC_t; - - using CollectiveEpilogue = cutlass::epilogue::collective::DefaultEpilogue< - StrideC, - StrideD, - EpilogueOp, - cutlass::gemm::EpilogueDefault>; - - // Mainloop - using CollectiveMainloop = cutlass::gemm::collective::CollectiveMma< - DispatchPolicy, - TileShape, - ElementInputA, - StrideA, - ElementInputB, - StrideB, - TiledMma, - GmemTiledCopyA, SmemLayoutAtomA, SmemCopyAtomA, cute::identity, // A - GmemTiledCopyB, SmemLayoutAtomB, SmemCopyAtomB, cute::identity // B - >; - - using GemmKernel = cutlass::gemm::kernel::GemmUniversal< - Shape, - CollectiveMainloop, - CollectiveEpilogue - >; - - using Gemm = cutlass::gemm::device::GemmUniversalAdapter; - - ExampleRunner runner; - - runner.run(options, hw_info); - - return 0; -} diff --git a/examples/sycl/ampere/gemm_configuration.hpp b/examples/sycl/ampere/gemm_configuration.hpp deleted file mode 100644 index 8a32e77e7f..0000000000 --- a/examples/sycl/ampere/gemm_configuration.hpp +++ /dev/null @@ -1,122 +0,0 @@ -/*************************************************************************************************** - * Copyright (c) 2024 - 2024 Codeplay Software Ltd. All rights reserved. - * SPDX-License-Identifier: BSD-3-Clause - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * 1. Redistributions of source code must retain the above copyright notice, this - * list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above copyright notice, - * this list of conditions and the following disclaimer in the documentation - * and/or other materials provided with the distribution. - * - * 3. Neither the name of the copyright holder nor the names of its - * contributors may be used to endorse or promote products derived from - * this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR - * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER - * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, - * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- * - **************************************************************************************************/ - -#include "cutlass/half.h" -#include "cutlass/layout/layout.h" - -#include "cute/swizzle.hpp" -#include "cute/layout.hpp" -#include "cute/arch/copy_sm75.hpp" -#include "cute/arch/copy_sm80.hpp" -#include "cute/atom/copy_atom.hpp" - -using namespace cute; - -template -struct DefaultGemm_TensorOpSm80_OperandA; - -template -struct DefaultGemm_TensorOpSm80_OperandB; - -///////////////////////////////////////////////////////////////////////// - -// half - -/// Operand A - Row-major (K-Major) -template <> -struct DefaultGemm_TensorOpSm80_OperandA -{ - // Smem - using SmemLayoutAtom = decltype( - composition(Swizzle<3,3,3>{}, - Layout, - Stride<_64, _1>>{})); - using SmemCopyAtom = Copy_Atom; - - // Gmem - using GmemTiledCopy = decltype( - make_tiled_copy(Copy_Atom, half_t>{}, - Layout, - Stride< _8,_1>>{}, - Layout>{})); -}; - -/// Operand A - Column-major (M-major) -template -struct DefaultGemm_TensorOpSm80_OperandA -{ - // Smem - using SmemLayoutAtom = decltype( - composition(Swizzle<3,3,3>{}, - Layout, - Stride< _1,_64>>{})); - using SmemCopyAtom = Copy_Atom; - - // Gmem - using GmemTiledCopy = decltype( - make_tiled_copy(Copy_Atom, half_t>{}, - Layout, - Stride< _1,_16>>{}, - Layout>{})); -}; - -/// Operand A - Row-major (K-Major) -template <> -struct DefaultGemm_TensorOpSm80_OperandA -{ - // Smem - using SmemLayoutAtom = decltype( - composition(Swizzle<2,3,3>{}, - Layout, - Stride<_32, _1>>{})); - using SmemCopyAtom = Copy_Atom; - - // Gmem - using GmemTiledCopy = decltype( - make_tiled_copy(Copy_Atom, half_t>{}, - Layout, - Stride< _4,_1>>{}, - Layout>{})); -}; - -// Because the F32F16 TiledMMA is A-B symmetric, we can reuse the DefaultOperands - -// Operand B - Column-Major (K-major) -template -struct DefaultGemm_TensorOpSm80_OperandB - : DefaultGemm_TensorOpSm80_OperandA -{}; - -// Operand B - Row-Major (N-major) -template -struct DefaultGemm_TensorOpSm80_OperandB - : DefaultGemm_TensorOpSm80_OperandA -{};
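
Note on the VNNI packing used by the PVC runner: the vnni_matrix helper added to PvcExampleRunner earlier in this series packs each group of `factor` consecutive K-rows of operand B so that the `factor` values belonging to one column become adjacent in memory. The following is a minimal standalone sketch of that same loop nest; the tiny problem size, the function name vnni_interleave, and main() are illustrative assumptions only and are not part of any patch above.

#include <cstdio>
#include <vector>

// Standalone illustration of the VNNI interleave performed by
// PvcExampleRunner::vnni_matrix (single batch, row-major [rows x cols]):
// every group of `factor` consecutive rows is packed so that the `factor`
// values of one column become adjacent in the destination.
template <typename T>
void vnni_interleave(T* dst, const T* src, int rows, int cols, int factor) {
  for (int r = 0; r < rows / factor; ++r)
    for (int c = 0; c < cols; ++c)
      for (int k = 0; k < factor; ++k)
        dst[(r * cols + c) * factor + k] = src[(r * factor + k) * cols + c];
}

int main() {
  constexpr int rows = 4, cols = 3, factor = 2;
  std::vector<int> src(rows * cols), dst(rows * cols);
  for (int i = 0; i < rows * cols; ++i) src[i] = i;   // 0..11, row-major

  vnni_interleave(dst.data(), src.data(), rows, cols, factor);

  // src rows: {0 1 2} {3 4 5} {6 7 8} {9 10 11}
  // dst:       0 3 1 4 2 5   6 9 7 10 8 11
  for (int v : dst) std::printf("%d ", v);
  std::printf("\n");
  return 0;
}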