From df71ab73c39ef95590fc6424913f6b1a9a357464 Mon Sep 17 00:00:00 2001
From: Alejandro Acosta <alejandro.acosta@codeplay.com>
Date: Mon, 27 May 2024 16:03:41 +0100
Subject: [PATCH] Move runner to benchmark folder

---
 .../common/benchmark_runner.hpp               |   2 +-
 examples/sycl/CMakeLists.txt                  |   2 +-
 .../sycl/pvc/pvc_bfloat_dpas_gemm_cute.cpp    | 265 +++++++++++++++++-
 3 files changed, 262 insertions(+), 7 deletions(-)
 rename examples/sycl/common/example_runner.hpp => benchmarks/common/benchmark_runner.hpp (99%)
diff --git a/examples/sycl/common/example_runner.hpp b/benchmarks/common/benchmark_runner.hpp
similarity index 99%
rename from examples/sycl/common/example_runner.hpp
rename to benchmarks/common/benchmark_runner.hpp
index 2cc14556fe..ccb18e1c35 100644
--- a/examples/sycl/common/example_runner.hpp
+++ b/benchmarks/common/benchmark_runner.hpp
@@ -53,7 +53,7 @@ template <typename T>
 static void fill_matrix(std::vector<T> &M)
 {
   std::generate(std::begin(M), std::end(M), [&]
-  { return static_cast<T>( 2*(rand() / double(RAND_MAX)) - 1 ); });
+  { return static_cast<T>( (rand() / double(RAND_MAX)) ); });
 }
 
 using namespace cute;
diff --git a/examples/sycl/CMakeLists.txt b/examples/sycl/CMakeLists.txt
index ef0449f902..b736ce35e8 100644
--- a/examples/sycl/CMakeLists.txt
+++ b/examples/sycl/CMakeLists.txt
@@ -27,6 +27,6 @@
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 
-if("${DPCPP_SYCL_TARGET}" STREQUAL "intel_gpu_pvc")
+if(SYCL_INTEL_TARGET)  # add the pvc examples whenever SYCL_INTEL_TARGET evaluates to true
   add_subdirectory(pvc)
 endif()
diff --git a/examples/sycl/pvc/pvc_bfloat_dpas_gemm_cute.cpp b/examples/sycl/pvc/pvc_bfloat_dpas_gemm_cute.cpp
index 0879034f16..120e6a60f0 100644
--- a/examples/sycl/pvc/pvc_bfloat_dpas_gemm_cute.cpp
+++ b/examples/sycl/pvc/pvc_bfloat_dpas_gemm_cute.cpp
@@ -37,17 +37,272 @@
 #include "cutlass/util/GPU_Clock.hpp"
 
 #include <cute/tensor.hpp>
+#include <random>
 
+#include "cutlass/util/command_line.h"
 #include "cutlass/util/device_memory.h"
 #include "cutlass/util/packed_stride.hpp"
 #include "cutlass/util/reference/device/gemm_complex.h"
+#include "cutlass/util/reference/device/tensor_compare.h"
 
-#include "../common/example_runner.hpp"
+// Overwrites every element of `vector` with rand() / RAND_MAX, a value in [0, 1], cast to T.
+template <typename T>
+static void fill_matrix(std::vector<T> &vector)
+{
+  for (auto &&element : vector)
+    element = static_cast<T>(rand() / double(RAND_MAX));
+}
+
+// Repacks `batch` row-major numRows x numCols matrices from `src` into `dst`: within each
+// group of `factor` consecutive rows, the `factor` values of one column are stored adjacently.
+template <typename T>
+static void vnni_matrix(T* dst, const T* src, int batch, int numRows, int numCols, int factor)
+{
+  for (int b = 0; b < batch; b++) {
+    for (int r = 0; r < numRows / factor; r++) {
+      const int group = b * (numRows / factor) + r;  // flat index of this row group
+      const T* in = src + group * factor * numCols;
+      T* out = dst + group * numCols * factor;
+      for (int c = 0; c < numCols; c++)
+        for (int k = 0; k < factor; k++)
+          out[c * factor + k] = in[k * numCols + c];
+    }
+  }
+}
 
 using namespace cute;
 
 ///////////////////////////////////////////////////////////////////////////////////////////////////
 
+// Command-line options of the example; parse() fills them from argv, print_usage() documents them.
+struct Options {
+
+    bool help;   // true once parse() has seen --help
+    bool error;  // starts out false; nothing in this struct ever sets it
+
+    int m, n, k, l, iterations;  // GEMM extents, batch count and iteration count
+    float alpha, beta;           // epilogue scalars
+
+    Options():
+            help(false),
+            error(false),
+            m(4096), n(4096), k(4096), l(1), iterations(100),
+            alpha(1.f), beta(0.f)
+    { }
+
+    // Parses the command line; when --help is given only `help` is set and parsing stops.
+    void parse(int argc, char const **args) {
+      cutlass::CommandLine cmd(argc, args);
+
+      if (cmd.check_cmd_line_flag("help")) {
+        help = true;
+        return;
+      }
+
+      cmd.get_cmd_line_argument("m", m, 4096);
+      cmd.get_cmd_line_argument("n", n, 4096);
+      cmd.get_cmd_line_argument("k", k, 4096);
+      cmd.get_cmd_line_argument("l", l, 1);
+      cmd.get_cmd_line_argument("alpha", alpha, 1.f);
+      cmd.get_cmd_line_argument("beta", beta, 0.f);
+      cmd.get_cmd_line_argument("iterations", iterations, 100);
+    }
+
+    /// Writes the usage statement to `out` and returns `out`.
+    std::ostream & print_usage(std::ostream &out) const {
+
+      out << "PVC GEMM Example\n\n"
+          << "Options:\n\n"
+          << "  --help                      If specified, displays this usage statement\n\n"
+          << "  --m=<int>                   Sets the M extent of the GEMM\n"
+          << "  --n=<int>                   Sets the N extent of the GEMM\n"
+          << "  --k=<int>                   Sets the K extent of the GEMM\n"
+          << "  --l=<int>                   Sets the L extent (batch count) of the GEMM\n"
+          << "  --alpha=<f32>               Epilogue scalar alpha\n"
+          << "  --beta=<f32>                Epilogue scalar beta\n\n"
+          << "  --iterations=<int>          Iterations\n\n";
+
+      return out;
+    }
+};
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Drives one GEMM configuration: fills the operands, runs Gemm once, checks the result against
+// the reference device GEMM and, if it matches, times options.iterations further launches.
+template <class Gemm>
+struct ExampleRunner {
+
+    using StrideA = typename Gemm::GemmKernel::StrideA;
+    using StrideB = typename Gemm::GemmKernel::StrideB;
+    using StrideC = typename Gemm::GemmKernel::StrideC;
+    using StrideD = typename Gemm::GemmKernel::StrideD;
+
+    using LayoutA = typename Gemm::LayoutA;
+    using LayoutB = typename Gemm::LayoutB;
+    using LayoutC = typename Gemm::LayoutC;
+    using LayoutD = typename Gemm::LayoutD;
+
+    using ElementA = typename Gemm::ElementA;
+    using ElementB = typename Gemm::ElementB;
+    using ElementAcc = typename Gemm::ElementAccumulator;
+
+    using CollectiveEpilogue = typename Gemm::CollectiveEpilogue;
+    using ElementC = typename Gemm::ElementC;
+    using ElementOutput = typename CollectiveEpilogue::ElementOutput;
+    using ElementCompute = typename CollectiveEpilogue::ElementCompute;
+    using ElementAccumulator = typename CollectiveEpilogue::ElementAccumulator;
+
+    using ProblemShapeType = typename Gemm::GemmKernel::ProblemShape;
+
+    //
+    // Data members
+    //
+
+    /// Packed strides of the four operands, computed by initialize()
+    StrideA stride_A;
+    StrideB stride_B;
+    StrideC stride_C;
+    StrideD stride_D;
+
+    cutlass::DeviceAllocation<ElementA> block_A;
+    cutlass::DeviceAllocation<ElementB> block_B;
+    cutlass::DeviceAllocation<ElementB> block_B_vnni;  // B repacked by vnni_matrix; passed to the GEMM
+    cutlass::DeviceAllocation<ElementC> block_C;
+    cutlass::DeviceAllocation<ElementOutput> block_D;
+    cutlass::DeviceAllocation<ElementOutput> block_ref_D;  // written by the reference GEMM in verify()
+
+    //
+    // Methods
+    //
+    /// Computes the reference GEMM into block_ref_D and compares block_D against it.
+    bool verify(const ProblemShapeType& problem_size, ElementCompute alpha, ElementCompute beta) {
+      auto [M, N, K, L] = problem_size;
+
+      cutlass::TensorRef ref_A(block_A.get(), LayoutA::packed({M, K}));
+      cutlass::TensorRef ref_B(block_B.get(), LayoutB::packed({K, N}));
+      cutlass::TensorRef ref_C(block_C.get(), LayoutC::packed({M, N}));
+      cutlass::TensorRef ref_D(block_ref_D.get(), LayoutD::packed({M, N}));
+
+      cutlass::reference::device::GemmComplex(
+              {M, N, K},
+              alpha,
+              ref_A,
+              cutlass::ComplexTransform::kNone,
+              ref_B,
+              cutlass::ComplexTransform::kNone,
+              beta,
+              ref_C,
+              ref_D,
+              ElementAccumulator(0),
+              L,     // batch_count
+              M * K, // batch_stride_A
+              K * N, // batch_stride_B
+              M * N, // batch_stride_C
+              M * N  // batch_stride_D
+      );
+
+      syclcompat::wait();
+
+      // Compare the CUTLASS output with the reference output using a relative tolerance.
+      // A generous error margin (0.1) is needed for the comparison to succeed.
+      auto epsilon = static_cast<ElementOutput>(0.1f);
+      auto nonzero_floor = static_cast<ElementOutput>(0.1f);
+
+      bool passed = cutlass::reference::device::BlockCompareRelativelyEqual(
+              block_ref_D.get(), block_D.get(), block_D.size(),
+              epsilon, nonzero_floor);
+
+      return passed;
+    }
+
+    /// Sizes the device buffers for problem_size and uploads host-generated operand data.
+    void initialize(const ProblemShapeType& problem_size) {
+      auto problem_shape_MNKL = cute::append<4>(problem_size, 1);
+      auto [M, N, K, L] = problem_shape_MNKL;
+
+      stride_A = cutlass::make_cute_packed_stride(StrideA{}, cute::make_shape(M, K, L));
+      stride_B = cutlass::make_cute_packed_stride(StrideB{}, cute::make_shape(N, K, L));
+      stride_C = cutlass::make_cute_packed_stride(StrideC{}, cute::make_shape(M, N, L));
+      stride_D = cutlass::make_cute_packed_stride(StrideD{}, cute::make_shape(M, N, L));
+
+      block_A.reset(M * K * L);
+      block_B.reset(K * N * L);
+      block_B_vnni.reset(K * N * L);
+      block_C.reset(M * N * L);
+      block_D.reset(M * N * L);
+      block_ref_D.reset(M * N * L);
+
+      // TODO: Enable initialization on device directly once RNG is
+      // available through SYCL.
+      std::vector<ElementA> a(K * M * L);
+      std::vector<ElementB> b(K * N * L);
+      std::vector<ElementB> b_vnni(b.size());
+      std::vector<ElementC> c(M * N * L);
+      std::vector<ElementOutput> d(M * N * L, ElementOutput(0));  // same element type as block_D
+
+      fill_matrix(a);
+      fill_matrix(b);
+      fill_matrix(c);
+      vnni_matrix(b_vnni.data(), b.data(), L, K, N, 2);  // interleave pairs of K-rows of B
+
+      syclcompat::memcpy(block_A.get(), a.data(), a.size() * sizeof(ElementA));
+      syclcompat::memcpy(block_B.get(), b.data(), b.size() * sizeof(ElementB));
+      syclcompat::memcpy(block_B_vnni.get(), b_vnni.data(), b_vnni.size() * sizeof(ElementB));
+      syclcompat::memcpy(block_C.get(), c.data(), c.size() * sizeof(ElementC));
+      syclcompat::memcpy(block_D.get(), d.data(), d.size() * sizeof(ElementOutput));
+    }
+    /// Runs the GEMM once, verifies it, then reports the average time of options.iterations launches.
+    void run(const Options& options, const cutlass::KernelHardwareInfo& hw_info) {
+      ProblemShapeType problem_size = ProblemShapeType{options.m, options.n, options.k, options.l};
+
+      initialize(problem_size);
+
+      typename Gemm::GemmKernel::Arguments arguments{
+              cutlass::gemm::GemmUniversalMode::kGemm,
+              problem_size,
+              {block_A.get(), stride_A, block_B_vnni.get(), stride_B},
+              {{options.alpha, options.beta}, block_C.get(), stride_C, block_D.get(), stride_D},
+              hw_info
+      };
+
+      Gemm gemm_op;
+
+      size_t workspace_size = Gemm::get_workspace_size(arguments);
+      cutlass::device_memory::allocation<uint8_t> workspace(workspace_size);
+
+      // Bail out rather than launch a kernel that reports it cannot handle these arguments.
+      if (gemm_op.can_implement(arguments) != cutlass::Status::kSuccess) {
+        std::cout << "Disposition: Failed (arguments not supported by this GEMM)" << std::endl;
+        return;
+      }
+      gemm_op.initialize(arguments, workspace.get());
+
+      gemm_op.run();  // first launch; its output is what verify() checks
+      syclcompat::wait();
+
+      // Verify that the result is correct
+      bool passed = verify(problem_size, options.alpha, options.beta);
+      std::cout << "Disposition: " << (passed ? "Passed" : "Failed") << std::endl;
+
+      if (passed && options.iterations > 0) {
+        GPU_Clock timer;
+        timer.start();
+        for (int i = 0; i < options.iterations; ++i) {
+          gemm_op.run();
+        }
+        syclcompat::wait();
+
+        // Average time per launch: timer reading after the loop divided by the iteration count.
+        float cute_time = timer.seconds() / options.iterations;
+        double tflops = (2.0 * options.m * options.n * options.k * options.l) * 1e-12;  // work of one launch, in 1e12 flop
+        std::cout << "Problem Size: " << options.m << 'x' << options.n << 'x' << options.k << 'x' << options.l << std::endl;
+        printf("Cutlass GEMM Performance:     [%4.3f]TFlop/s  (%6.4f)ms\n", tflops / cute_time, cute_time*1000);
+      }
+    }
+
+};
+
 int main(int argc, const char** argv)
 {
   //
@@ -134,14 +389,14 @@ int main(int argc, const char** argv)
   >;
 
   using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
-  Shape<int, int, int, int>,
-  CollectiveMainloop,
-  CollectiveEpilogue
+          Shape<int, int, int, int>,
+          CollectiveMainloop,
+          CollectiveEpilogue
   >;
 
   using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
 
-  PvcExampleRunner<Gemm> runner;
+  ExampleRunner<Gemm> runner;
 
   runner.run(options, hw_info);