diff --git a/examples/14_ampere_tf32_tensorop_gemm/CMakeLists.txt b/examples/14_ampere_tf32_tensorop_gemm/CMakeLists.txt
index adcd6bc1d3..e6bb308b47 100644
--- a/examples/14_ampere_tf32_tensorop_gemm/CMakeLists.txt
+++ b/examples/14_ampere_tf32_tensorop_gemm/CMakeLists.txt
@@ -27,22 +27,19 @@
 # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
 if (CUTLASS_ENABLE_SYCL)
-  if (SYCL_NVIDIA_TARGET)
-    cutlass_example_add_executable(
-      14_ampere_tf32_tensorop_gemm_cute
-      ampere_tf32_tensorop_gemm_cute.cpp
+  cutlass_example_add_executable(
+    14_ampere_tf32_tensorop_gemm_cute
+    ampere_tf32_tensorop_gemm_cute.cu
   )
-  endif()
 else()
   cutlass_example_add_executable(
     14_ampere_tf32_tensorop_gemm
     ampere_tf32_tensorop_gemm.cu
-    )
+  )
   cutlass_example_add_executable(
     14_ampere_tf32_tensorop_gemm_cute
     ampere_tf32_tensorop_gemm_cute.cu
-    )
+  )
 endif()

diff --git a/examples/14_ampere_tf32_tensorop_gemm/ampere_tf32_tensorop_gemm_cute.cu b/examples/14_ampere_tf32_tensorop_gemm/ampere_tf32_tensorop_gemm_cute.cu
index f576f27ecf..0916300a85 100644
--- a/examples/14_ampere_tf32_tensorop_gemm/ampere_tf32_tensorop_gemm_cute.cu
+++ b/examples/14_ampere_tf32_tensorop_gemm/ampere_tf32_tensorop_gemm_cute.cu
@@ -1,4 +1,5 @@
 /***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 * Copyright (c) 2024 - 2024 Codeplay Software Ltd. All rights reserved.
 * SPDX-License-Identifier: BSD-3-Clause
 *
@@ -29,308 +30,482 @@
 *
 **************************************************************************************************/
-#include
-#include
+/*! \file
+    \brief Simple GEMM example using Cute and CUTLASS 3.x APIs for NVIDIA Ampere architecture
-#include
-#include
+    This example demonstrates how to instantiate and run a TF32 GEMM using the Cute and
+    CUTLASS 3.x APIs on NVIDIA Ampere architecture. Please check examples 07 and 08 for
+    the basics of tensor op gemm kernels. On NVIDIA Ampere architecture, most concepts
+    still hold. The two main differences are:
+    (1) NVIDIA Ampere architecture introduces a new series of tensor core instructions
+        (see include/cute/arch/mma_sm80.hpp) which are more efficient on Ampere.
+    (2) NVIDIA Ampere architecture uses CP_ASYNC (see include/cute/arch/copy_sm80.hpp)
+        to build a multistage software pipeline to better hide latency (see
+        include/cutlass/gemm/collective/sm80_mma_multistage.hpp).
+
+    Moreover, NVIDIA Ampere architecture starts supporting tfloat32 (see include/cutlass/tfloat32.h)
+    data types in tensor cores. One big advantage is that we can load fp32 data and convert
+    it implicitly to tf32 inside the GEMM kernel, which means no change is needed to accelerate
+    traditional fp32 data by using NVIDIA Ampere architecture.
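+
+    As a rough illustration of that implicit conversion (a host-side sketch, not code used
+    by this example), cutlass::tfloat32_t from include/cutlass/tfloat32.h gives a host
+    approximation of the rounding applied to fp32 operands before the tensor core MMA:
+
+        float x = 1.0000610f;                           // more mantissa bits than tf32 keeps
+        cutlass::tfloat32_t x_tf32(x);                  // round to tf32 (8 exponent, 10 mantissa bits)
+        float x_rounded = static_cast<float>(x_tf32);   // value the MMA effectively consumes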
+ + Examples: + + $ ./examples/14_ampere_tf32_tensorop_gemm/14_ampere_tf32_tensorop_gemm_cute + +*/ + +#include + +#include "cutlass/cutlass.h" #include "cutlass/gemm/device/gemm.h" -#include "cutlass/epilogue/collective/default_epilogue.hpp" #include "cutlass/gemm/device/gemm_universal.h" #include "cutlass/gemm/device/gemm_universal_adapter.h" #include "cutlass/gemm/collective/collective_mma.hpp" +#include "cutlass/epilogue/collective/default_epilogue.hpp" -#include - -#include "cutlass/util/print_error.hpp" -#include "cutlass/util/GPU_Clock.hpp" -#if defined(CUTLASS_ENABLE_CUBLAS) && CUTLASS_ENABLE_CUBLAS != 0 -# include "cutlass/util/cublas_wrappers.hpp" -#endif -#include "cutlass/util/helper_cuda.hpp" +#include "cutlass/util/command_line.h" +#include "cutlass/util/distribution.h" +#include "cutlass/util/host_tensor.h" +#include "cutlass/util/packed_stride.hpp" +#include "cutlass/util/reference/device/gemm_complex.h" +#include "cutlass/util/reference/device/tensor_compare.h" +#include "cutlass/util/reference/device/tensor_fill.h" +#include "helper.h" -#include "cutlass/detail/dependent_false.hpp" -#include "cutlass/util/device_memory.h" +///////////////////////////////////////////////////////////////////////////////////////////////// using namespace cute; -using TileShape = Shape<_128, _128, _32>; - -using TiledMma = TiledMMA< - MMA_Atom, - Layout, Stride<_2, _1, _1>>, // 2x2x1 thread group - Tile<_32,_32,_8>>; // 32x32x8 MMA for LDSM, 1x2x1 value group - -// Smem -using SmemLayoutAtomA = decltype( -composition(Swizzle <2,3,2> {}, - Layout, - Stride<_1, _32>>{})); -using SmemCopyAtomA = Copy_Atom, float>; -// Gmem -using GmemTiledCopyA = decltype( -make_tiled_copy(Copy_Atom, float>{}, - Layout, - Stride<_1, _16>>{}, - Layout>{})); - -// Smem -using SmemLayoutAtomB = decltype( -composition(Swizzle <2,3,2> {}, - Layout, - Stride<_1, _32>>{})); -using SmemCopyAtomB = Copy_Atom, float>; -// Gmem -using GmemTiledCopyB = decltype( -make_tiled_copy(Copy_Atom, float>{}, - Layout, - Stride<_1, _16>>{}, - Layout>{})); - -using Stages = Int<3>; - -using SmemLayoutA = decltype(tile_to_shape( - SmemLayoutAtomA{}, - make_shape(shape<0>(TileShape{}), shape<2>(TileShape{}), Stages{}))); -using SmemLayoutB = decltype(tile_to_shape( - SmemLayoutAtomB{}, - make_shape(shape<1>(TileShape{}), shape<2>(TileShape{}), Stages{}))); - -// The code section below describes datatype for input, output matrices and computation between -// elements in input matrices. -using ElementAccumulator = float; // <- data type of accumulator -using ElementComputeEpilogue = float; // <- data type of epilogue operations -using ElementInputA = float; // <- data type of elements in input matrix A -using ElementInputB = float; // <- data type of elements in input matrix B -using ElementOutput = float; // <- data type of elements in output matrix D - -// This code section describes whether you want to use tensor cores or regular SIMT cores on GPU SM -using MMAOp = cutlass::arch::OpClassTensorOp; - -// This code section describes CUDA SM architecture number -using SmArch = cutlass::arch::Sm80; - -//// This code section describes the epilogue part of the kernel -using EpilogueOp = cutlass::epilogue::thread::LinearCombination< - ElementOutput, // <- data type of output matrix - 128 / cutlass::sizeof_bits::value, // <- the number of elements per vectorized - // memory access. For a byte, it's 16 - // elements. 
This becomes the vector width of - // math instructions in the epilogue too - ElementAccumulator, // <- data type of accumulator - ElementComputeEpilogue>; // <- data type for alpha/beta in linear combination function - -using DispatchPolicy = cutlass::gemm::MainloopSm80CpAsync; - -template -void -run(Gemm_Op gemm_op) -{ - gemm_op(); +/// Result structure +struct Result { + + double avg_runtime_ms; + double gflops; + bool passed; + + // + // Methods + // + + Result( + double avg_runtime_ms = 0, + double gflops = 0) + : + avg_runtime_ms(avg_runtime_ms), gflops(gflops), passed(false) + {} +}; + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +// Command line options parsing +struct Options { + + bool help; + + int m, n, k, l; + float alpha, beta; + int iterations; + + Options(): + help(false), + m(5120), n(4096), k(4096), l(1), + alpha(1), beta(0), + iterations(100) + { } + + // Parses the command line + void parse(int argc, char const **args) { + cutlass::CommandLine cmd(argc, args); + + if (cmd.check_cmd_line_flag("help")) { + help = true; + return; + } + + cmd.get_cmd_line_argument("m", m, 5120); + cmd.get_cmd_line_argument("n", n, 4096); + cmd.get_cmd_line_argument("k", k, 4096); + cmd.get_cmd_line_argument("l", l, 1); + cmd.get_cmd_line_argument("alpha", alpha, 1.f); + cmd.get_cmd_line_argument("beta", beta, 0.f); + cmd.get_cmd_line_argument("iterations", iterations); + + } + + /// Prints the usage statement. + std::ostream & print_usage(std::ostream &out) const { + + out << "14_ampere_tf32_tensorop_gemm_cute example\n\n" + << " This example uses the CUTLASS Library to execute TF32 tensorop GEMM computations.\n\n" + << "Options:\n\n" + << " --help If specified, displays this usage statement.\n\n" + << " --m= Sets the M extent of the GEMM\n" + << " --n= Sets the N extent of the GEMM\n" + << " --k= Sets the K extent of the GEMM\n" + << " --l= Sets the L extent (batch count) of the GEMM\n" + << " --alpha= Epilogue scalar alpha\n" + << " --beta= Epilogue scalar beta\n\n" + << " --iterations= Number of profiling iterations to perform.\n\n"; + + return out; + } + + /// Compute performance in GFLOP/s + double gflops(double runtime_s) const + { + // Two flops per multiply-add + uint64_t flop = uint64_t(2) * m * n * k * l; + double gflop = double(flop) / double(1.0e9); + return gflop / runtime_s; + } +}; + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +/// Helper to initialize a block of device data +template +bool initialize_block( + cutlass::DeviceAllocation& block, + uint64_t seed=2023) { + + Element scope_max, scope_min; + int bits_input = cutlass::sizeof_bits::value; + + if (bits_input == 1) { + scope_max = 2; + scope_min = 0; + } else if (bits_input <= 8) { + scope_max = 2; + scope_min = -2; + } else { + scope_max = 8; + scope_min = -8; + } + + cutlass::reference::device::BlockFillRandomUniform( + block.get(), block.size(), seed, scope_max, scope_min, 0); + + return true; } -void test_gemm(int m, int n, int k) -{ - cute::device_init(0); +/////////////////////////////////////////////////////////////////////////////////////////////////// + +/// Wrapper to run and verify a GEMM. 
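///
/// A minimal usage sketch (this mirrors what main() below does; `Gemm` stands for the
/// cutlass::gemm::device::GemmUniversalAdapter type assembled in main()):
///
///     ExampleRunner<Gemm> runner;
///     runner.run(options, hw_info);   // initialize(), launch, verify(), then profile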
+template < + class Gemm +> +struct ExampleRunner { + + using StrideA = typename Gemm::GemmKernel::StrideA; + using StrideB = typename Gemm::GemmKernel::StrideB; + using StrideC = typename Gemm::GemmKernel::StrideC; + using StrideD = typename Gemm::GemmKernel::StrideD; + + using LayoutA = typename Gemm::LayoutA; + using LayoutB = typename Gemm::LayoutB; + using LayoutC = typename Gemm::LayoutC; + using LayoutD = typename Gemm::LayoutD; + + using ElementA = typename Gemm::ElementA; + using ElementB = typename Gemm::ElementB; + using ElementAcc = typename Gemm::ElementAccumulator; + + using CollectiveEpilogue = typename Gemm::CollectiveEpilogue; + using ElementC = typename Gemm::ElementC; + using ElementOutput = typename CollectiveEpilogue::ElementOutput; + using ElementCompute = typename CollectiveEpilogue::ElementCompute; + using ElementAccumulator = typename CollectiveEpilogue::ElementAccumulator; + + using ProblemShapeType = typename Gemm::GemmKernel::ProblemShape; + + // + // Data members + // + + /// Initialization + StrideA stride_A; + StrideB stride_B; + StrideC stride_C; + StrideD stride_D; + uint64_t seed = 0; + + cutlass::DeviceAllocation block_A; + cutlass::DeviceAllocation block_B; + cutlass::DeviceAllocation block_C; + cutlass::DeviceAllocation block_D; + cutlass::DeviceAllocation block_ref_D; + + // + // Methods + // + + bool verify(const ProblemShapeType& problem_size, ElementOutput alpha, ElementOutput beta) { + auto [M, N, K, L] = problem_size; + + cutlass::TensorRef ref_A(block_A.get(), LayoutA::packed({M, K})); + cutlass::TensorRef ref_B(block_B.get(), LayoutB::packed({K, N})); + cutlass::TensorRef ref_C(block_C.get(), LayoutC::packed({M, N})); + cutlass::TensorRef ref_D(block_ref_D.get(), LayoutD::packed({M, N})); + + cutlass::reference::device::GemmComplex( + {M, N, K}, + ElementCompute(alpha), + ref_A, + cutlass::ComplexTransform::kNone, + ref_B, + cutlass::ComplexTransform::kNone, + ElementCompute(beta), + ref_C, + ref_D, + ElementAccumulator(0), + L, // batch_count + M * K, // batch_stride_A + K * N, // batch_stride_B + M * N, // batch_stride_C + M * N // batch_stride_D + ); + + cudaError_t result = cudaDeviceSynchronize(); + if (result != cudaSuccess) { + std::cerr << "Reference kernel failed. 
Last CUDA error: " + << cudaGetErrorString(result) << std::endl; + return false; + } + + // Check if output from CUTLASS kernel and reference kernel are equal or not + bool passed = cutlass::reference::device::BlockCompareEqual(block_ref_D.get(), block_D.get(), block_D.size()); + + return passed; + } + + /// Initialize operands to be used in the GEMM and reference GEMM + void initialize(const ProblemShapeType& problem_size) { + auto problem_shape_MNKL = cute::append<4>(problem_size, 1); + auto [M, N, K, L] = problem_shape_MNKL; + + stride_A = cutlass::make_cute_packed_stride(StrideA{}, cute::make_shape(M, K, L)); + stride_B = cutlass::make_cute_packed_stride(StrideB{}, cute::make_shape(N, K, L)); + stride_C = cutlass::make_cute_packed_stride(StrideC{}, cute::make_shape(M, N, L)); + stride_D = cutlass::make_cute_packed_stride(StrideD{}, cute::make_shape(M, N, L)); + + block_A.reset(M * K * L); + block_B.reset(K * N * L); + block_C.reset(M * N * L); + block_D.reset(M * N * L); + block_ref_D.reset(M * N * L); + + initialize_block(block_A, seed + 2023); + initialize_block(block_B, seed + 2022); + initialize_block(block_C, seed + 2021); + } + + void run(const Options& options, const cutlass::KernelHardwareInfo& hw_info) { + ProblemShapeType problem_size = ProblemShapeType{options.m, options.n, options.k, options.l}; + + initialize(problem_size); + + typename Gemm::GemmKernel::Arguments arguments{ + cutlass::gemm::GemmUniversalMode::kGemm, + problem_size, + {block_A.get(), stride_A, block_B.get(), stride_B}, + {{options.alpha, options.beta}, block_C.get(), stride_C, block_D.get(), stride_D}, + hw_info + }; + + Gemm gemm_op; - std::cout << "M = " << m << std::endl; - std::cout << "N = " << n << std::endl; - std::cout << "K = " << k << std::endl; + size_t workspace_size = Gemm::get_workspace_size(arguments); + cutlass::device_memory::allocation workspace(workspace_size); - using TA = float; - using TB = float; - using TC = float; - using TI = float; + CUTLASS_CHECK(gemm_op.can_implement(arguments)); - thrust::host_vector h_A(m*k); - thrust::host_vector h_B(n*k); - thrust::host_vector h_C(m*n); + CUTLASS_CHECK(gemm_op.initialize(arguments, workspace.get())); - for (int j = 0; j < m*k; ++j) h_A[j] = static_cast( 2*(rand() / double(RAND_MAX)) - 1 ); - for (int j = 0; j < n*k; ++j) h_B[j] = static_cast( 2*(rand() / double(RAND_MAX)) - 1 ); - for (int j = 0; j < m*n; ++j) h_C[j] = static_cast(-1); + // Run the GEMM + CUTLASS_CHECK(gemm_op.run()); - thrust::device_vector d_A = h_A; - thrust::device_vector d_B = h_B; - thrust::device_vector d_C = h_C; + // Check if output from CUTLASS kernel and reference kernel are equal or not + Result result; + result.passed = verify(problem_size, options.alpha, options.beta); - TI alpha = 1.0; - TI beta = 0.0; + std::cout << " Disposition: " << (result.passed ? "Passed" : "Failed") << std::endl; - double tflops = (2.0*m*n*k) * 1e-12; + if (!result.passed) { + exit(-1); + } - const int timing_iterations = 100; - GPU_Clock timer; + // Run profiling loop + if (options.iterations > 0) + { + GpuTimer timer; + timer.start(); + for (int iter = 0; iter < options.iterations; ++iter) { + CUTLASS_CHECK(gemm_op.run()); + } + timer.stop(); + + // Compute average runtime and GFLOPs. 
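      // As a sanity check on the formula used below (2 * M * N * K * L flops), the default
      // 5120x4096x4096x1 problem is 2 * 5120 * 4096 * 4096 ≈ 171.8 GFLOP, so an average
      // runtime of 10 ms would be reported as roughly 17,180 GFLOPS (~17.2 TFLOP/s).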
+ float elapsed_ms = timer.elapsed_millis(); + result.avg_runtime_ms = double(elapsed_ms) / double(options.iterations); + result.gflops = options.gflops(result.avg_runtime_ms / 1000.0); + + std::cout << " Problem Size: " << options.m << 'x' << options.n << 'x' + << options.k << 'x' << options.l << std::endl; + std::cout << " Avg runtime: " << result.avg_runtime_ms << " ms" << std::endl; + std::cout << " GFLOPS: " << result.gflops << std::endl; + } + } +}; + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +int main(int argc, char const **args) { + + // Ampere Tensor Core operations exposed with mma.sync and ldmatrix are first available + // in CUDA 11.0. + // + // CUTLASS must be compiled with CUDA 11.0 Toolkit to run these examples. + if (!(__CUDACC_VER_MAJOR__ >= 11)) { + std::cerr << "Ampere Tensor Core operations must be compiled with CUDA 11.0 Toolkit or later." << std::endl; + return 0; + } + + cudaDeviceProp props; + + cudaError_t error = cudaGetDeviceProperties(&props, 0); + if (error != cudaSuccess) { + std::cerr << "cudaGetDeviceProperties() returned an error: " << cudaGetErrorString(error) << std::endl; + return -1; + } + + if (!((props.major * 10 + props.minor) >= 80)) { + std::cerr << "Ampere Tensor Core operations must be run on a machine with compute capability at least 80." + << std::endl; + return 0; + } -#if defined(CUTLASS_ENABLE_CUBLAS) && CUTLASS_ENABLE_CUBLAS != 0 // - // cuBLas + // Parse options // - cublasHandle_t handle; - cublasCreate(&handle); - - // Run once - d_C = h_C; - blam::cublas::gemm(handle, CUBLAS_OP_N, CUBLAS_OP_T, - m, n, k, - &alpha, - d_A.data().get(), m, - d_B.data().get(), n, - &beta, - d_C.data().get(), m); - CUTE_CHECK_LAST(); - thrust::host_vector cublas_result = d_C; - - // Timing iterations - timer.start(); - for (int i = 0; i < timing_iterations; ++i) { - blam::cublas::gemm(handle, CUBLAS_OP_N, CUBLAS_OP_T, - m, n, k, - &alpha, - d_A.data().get(), m, - d_B.data().get(), n, - &beta, - d_C.data().get(), m); + Options options; + + options.parse(argc, args); + + if (options.help) { + options.print_usage(std::cout) << std::endl; + return 0; } - double cublas_time = timer.seconds() / timing_iterations; - CUTE_CHECK_LAST(); - printf("CUBLAS_GEMM: [%4.3f]TFlop/s (%6.4f)ms\n", tflops / cublas_time, cublas_time*1000); -#else + // + // Run examples + // + + // The KernelHardwareInfo struct holds the number of SMs on the GPU with a given device ID. + // This information is used by the underlying kernel. + cutlass::KernelHardwareInfo hw_info; - std::cout << "Verification by comparison with cuBLAS is disabled, " - "either because the CMake option CUTLASS_ENABLE_CUBLAS " - "was explicitly set to OFF, or because CMake could not find cuBLAS. " - "If you would like to enable verification with cuBLAS, " - "please set the CMake option CUTLASS_ENABLE_CUBLAS to ON, " - "rerun CMake, and recompile this example.\n"; + // Change device_id to another value if you are running on a machine with multiple GPUs and wish + // to use a GPU other than that with device ID 0. 
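  // For example, one could (hypothetically) target the last visible device instead:
  //
  //   int device_count = 0;
  //   cudaGetDeviceCount(&device_count);
  //   hw_info.device_id = device_count - 1;
  //
  // This example simply uses device 0.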
+ hw_info.device_id = 0; + hw_info.sm_count = cutlass::KernelHardwareInfo::query_device_multiprocessor_count(hw_info.device_id); -#endif // CUTLASS_ENABLE_CUBLAS + // Problem configuration + using ElementA = float; + using ElementB = float; + using ElementAcc = float; + using ElementOutput = float; + + using LayoutA = cutlass::layout::ColumnMajor; + using LayoutB = cutlass::layout::ColumnMajor; + using LayoutC = cutlass::layout::ColumnMajor; + using LayoutD = cutlass::layout::ColumnMajor; + + // Tiling configuration selection + using TileShape = Shape<_128,_128,_32>; // - // CuTe + // Assembling the CollectiveMainloop type // + // Number of pipelines you want to use + constexpr int PipelineStages = 4; - d_C = h_C; + using DispatchPolicy = cutlass::gemm::MainloopSm80CpAsync; - // Define strides (mixed) - auto dA = make_stride(Int<1>{}, m, Int<1>{}); - auto dB = make_stride(Int<1>{}, n, Int<1>{}); - auto dC = make_stride(Int<1>{}, m, Int<1>{}); + // This code section describes the MMA op and the tile size a warp will compute + using TiledMma = TiledMMA< + MMA_Atom, + Layout, Stride<_2,_1,_1>>, // 2x2x1 thread group + Tile<_32,_32,_8>>; // 32x32x8 MMA for LDSM, 1x2x1 value group - using CollectiveEpilogue = cutlass::epilogue::collective::DefaultEpilogue< - decltype(dC), - decltype(dC), - EpilogueOp, - cutlass::gemm::EpilogueDefault>; + // Define the copy layout and atom for device memory copy. + using GmemTiledCopyA = decltype( + make_tiled_copy(Copy_Atom, float>{}, + Layout, Stride<_1,_16>>{}, + Layout>{})); + + using GmemTiledCopyB = decltype( + make_tiled_copy(Copy_Atom, float>{}, + Layout, Stride<_8,_1>>{}, + Layout>{})); -// Mainloop + // Define the copy layout and atom for shared memory copy. + using SmemLayoutAtomA = decltype(composition(Swizzle<2,3,2>{}, Layout, Stride< _1,_32>>{})); + using SmemCopyAtomA = Copy_Atom, float>; + + using SmemLayoutAtomB = decltype(composition(Swizzle<3,2,3>{}, Layout, Stride<_32, _1>>{})); + using SmemCopyAtomB = Copy_Atom; + + // Mainloop using CollectiveMainloop = cutlass::gemm::collective::CollectiveMma< DispatchPolicy, TileShape, - ElementInputA, - decltype(dA), - ElementInputB, - decltype(dB), + ElementA, + cutlass::detail::TagToStrideA_t, + ElementB, + cutlass::detail::TagToStrideB_t, TiledMma, GmemTiledCopyA, SmemLayoutAtomA, SmemCopyAtomA, cute::identity, // A GmemTiledCopyB, SmemLayoutAtomB, SmemCopyAtomB, cute::identity // B >; + // + // Assembling the Collective Epilogue Type + // + + using EpilogueOp = cutlass::epilogue::thread::LinearCombination< + ElementOutput, // <- data type of output matrix + 128 / cutlass::sizeof_bits::value, // <- the number of elements per vectorized + // memory access. For a byte, it's 16 + // elements. This becomes the vector width of + // math instructions in the epilogue too + ElementAcc, // <- data type of accumulator + ElementOutput>; // <- data type for alpha/beta in linear combination function + + using CollectiveEpilogue = cutlass::epilogue::collective::DefaultEpilogue< + cutlass::detail::TagToStrideC_t, + cutlass::detail::TagToStrideC_t, + EpilogueOp, + cutlass::gemm::EpilogueDefault>; + + // + // Assembling the GemmKernel + // + using GemmKernel = cutlass::gemm::kernel::GemmUniversal< - Shape, + Shape, CollectiveMainloop, CollectiveEpilogue >; using Gemm = cutlass::gemm::device::GemmUniversalAdapter; - using ProblemShapeType = typename Gemm::GemmKernel::ProblemShape; - - ProblemShapeType cute_problem_size = ProblemShapeType{m, n, k, 1}; - - // Create a tuple of gemm kernel arguments. 
This is later passed as arguments to launch - // instantiated CUTLASS kernel - typename Gemm::Arguments arguments{ - cutlass::gemm::GemmUniversalMode::kGemm, - cute_problem_size, // <- problem size of matrix multiplication - { d_A.data().get(), dA, d_B.data().get(), dB }, - { - { alpha, beta }, - d_C.data().get(), dC, d_C.data().get(), dC - } - }; - - // Using the arguments, query for extra workspace required for matrix multiplication computation - size_t workspace_size = Gemm::get_workspace_size(arguments); - - // Allocate workspace memory - cutlass::device_memory::allocation workspace(workspace_size); - - // Instantiate CUTLASS kernel depending on templates - Gemm gemm_op; - - // Check the problem size is supported or not - gemm_op.can_implement(arguments); - CUTE_CHECK_LAST(); - - // Initialize CUTLASS kernel with arguments and workspace pointer - gemm_op.initialize(arguments, workspace.get()); - CUTE_CHECK_LAST(); - - // Run once (and check) - run(gemm_op); - CUTE_CHECK_LAST(); - thrust::host_vector cute_result = d_C; - - // Timing iterations - timer.start(); - for (int i = 0; i < timing_iterations; ++i) { - run(gemm_op); - } - CUTE_CHECK_LAST(); - double cute_time = timer.seconds() / timing_iterations; - printf("CUTLASS_GEMM: [%4.3f]TFlop/s (%6.4f)ms\n", tflops / cute_time, cute_time*1000); - -#if defined(CUTLASS_ENABLE_CUBLAS) && CUTLASS_ENABLE_CUBLAS != 0 - printf("Empirical Perf: %.1f%%\n", (cublas_time / cute_time) * 100); - - auto host_matrix_to_const_column_major_cute_tensor = - [](const auto& X, int num_rows, int num_cols, int LDX) { - const auto shape = cute::Shape{num_rows, num_cols}; - const auto strides = cute::Stride{1, LDX}; - return cute::make_tensor(X.data(), cute::make_layout(shape, strides)); - }; - - const auto A_view = host_matrix_to_const_column_major_cute_tensor(h_A, m, k, m); - // B^T is k x n, so B is n x k. - const auto B_view = host_matrix_to_const_column_major_cute_tensor(h_B, n, k, n); - const auto C_computed_view = host_matrix_to_const_column_major_cute_tensor(cute_result, m, n, m); - const auto C_expected_view = host_matrix_to_const_column_major_cute_tensor(cublas_result, m, n, m); - print_matrix_multiply_mollified_relative_error("float", A_view, B_view, C_computed_view, C_expected_view); - -#endif // CUTLASS_ENABLE_CUBLAS -} - -int main(int argc, char** argv) -{ - int m = 5120; - if (argc >= 2) - sscanf(argv[1], "%d", &m); - - int n = 5120; - if (argc >= 3) - sscanf(argv[2], "%d", &n); - - int k = 4096; - if (argc >= 4) - sscanf(argv[3], "%d", &k); + ExampleRunner runner; - test_gemm(m, n, k); + runner.run(options, hw_info); return 0; }
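// ---------------------------------------------------------------------------------------------
// A back-of-the-envelope note on the mainloop configuration above (a sketch, not something this
// example checks): with TileShape = 128x128x32 and PipelineStages = 4, each pipeline stage
// stages one 128x32 tile of A and one 128x32 tile of B in shared memory, i.e.
// (128*32 + 128*32) * sizeof(float) = 32 KB per stage, or about 128 KB across the 4-stage
// cp.async pipeline. That fits within the ~163 KB of shared memory a thread block can opt into
// on A100 (sm_80); parts with less shared memory would need fewer stages or a smaller tile.
//
// Example invocation (see also the file-level comment and the Options struct):
//
//   $ ./examples/14_ampere_tf32_tensorop_gemm/14_ampere_tf32_tensorop_gemm_cute \
//         --m=5120 --n=4096 --k=4096 --l=1 --alpha=1 --beta=0 --iterations=100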