From c1a3cd7acba859d9df200e557bd3454dc93c1abf Mon Sep 17 00:00:00 2001 From: Alex Cockrean <84676155+ABenC377@users.noreply.github.com> Date: Fri, 11 Oct 2024 15:15:21 +0100 Subject: [PATCH 01/32] rebsing --- src/main.cc | 27 ++++++++++++++++++++++++++- 1 file changed, 26 insertions(+), 1 deletion(-) diff --git a/src/main.cc b/src/main.cc index 2d046e3..c61df37 100644 --- a/src/main.cc +++ b/src/main.cc @@ -1,7 +1,6 @@ #include "../include/main.hh" int iters = 10; -int startDim = 1; int upperLimit = 128; bool doCpu = CPU_ENABLED; @@ -141,6 +140,32 @@ void getParameters(int argc, char* argv[]) { doCpu = false; } else if (!strcmp(argv[i], "--no_gpu")) { doGpu = false; + } else if (!strcmp(argv[i], "--kernels") || !strcmp(argv[i], "-k")) { + sgemm = dgemm = sp_sgemm = sp_dgemm = false; + std::string kernelList = argv[++i]; + if (kernelList.find("sp-sgemm") != std::string::npos) { + sp_sgemm = true; + if (kernelList.find("sgemm") != std::string::npos && + kernelList.find("sgemm") != kernelList.find("sp-sgemm") + 3) { + sgemm = true; + } + } else if (kernelList.find("sgemm") != std::string::npos) { + sgemm = true; + } + if (kernelList.find("sp-dgemm") != std::string::npos) { + sp_dgemm = true; + if (kernelList.find("dgemm") != std::string::npos && + kernelList.find("dgemm") != kernelList.find("sp-dgemm") + 3) { + dgemm = true; + } + } else if (kernelList.find("dgemm") != std::string::npos) { + dgemm = true; + } + + if (!sgemm && !dgemm && !sp_sgemm && !sp_dgemm) { + std::cout << "ERROR - no implemented kernels in list" << std::endl; + exit(1); + } } else if (!strcmp(argv[i], "--output_dir") || !strcmp(argv[i], "-o")) { if (++i >= argc) { std::cout << "ERROR - Invalid output directory" << std::endl; From 21366b4359101379b640faf814173620f0635e4d Mon Sep 17 00:00:00 2001 From: Alex Cockrean <84676155+ABenC377@users.noreply.github.com> Date: Fri, 11 Oct 2024 15:22:26 +0100 Subject: [PATCH 02/32] rebsing --- DefaultCPU/sp_gemm.hh | 55 ++++++ DefaultGPU/sp_gemm.hh | 54 ++++++ cuBLAS/sp_gemm.hh | 295 +++++++++++++++++++++++++++++++++ include/doGemm.hh | 94 +++++++++-- include/kernels/CPU/sp_gemm.hh | 110 ++++++++++++ include/kernels/GPU/sp_gemm.hh | 27 +++ src/main.cc | 4 + 7 files changed, 626 insertions(+), 13 deletions(-) create mode 100644 DefaultCPU/sp_gemm.hh create mode 100644 DefaultGPU/sp_gemm.hh create mode 100644 cuBLAS/sp_gemm.hh create mode 100644 include/kernels/CPU/sp_gemm.hh create mode 100644 include/kernels/GPU/sp_gemm.hh diff --git a/DefaultCPU/sp_gemm.hh b/DefaultCPU/sp_gemm.hh new file mode 100644 index 0000000..d7ecb37 --- /dev/null +++ b/DefaultCPU/sp_gemm.hh @@ -0,0 +1,55 @@ +#pragma once + +#if defined CPU_DEFAULT + +#include "../include/kernels/CPU/sp_gemm.hh" +#include "../include/utilities.hh" + +namespace cpu { +/** A class for GEMM CPU BLAS kernels. */ +template +class sp_gemm_cpu : public sp_gemm { + public: + using sp_gemm::sp_gemm; + using sp_gemm::callConsume; + using sp_gemm::m_; + using sp_gemm::n_; + using sp_gemm::k_; + using sp_gemm::A_; + using sp_gemm::B_; + using sp_gemm::C_; + + private: + /** Perform the GEMM kernel. */ + void callGemm() override { + /** A naive implementation of a column-major GEMM. Alpha and Beta are always + * 1 and 0 respectively. + * Operation takes the form of C[M,N] = A[M,K] * B[K,N]. + * callConsume() is required to ensure that the compiler does not optimise + * away this function. 
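   * Indexing assumes column-major storage: element (i, j) of a matrix with
   * `rows` rows lives at M[j * rows + i], hence A_[z * m_ + x],
   * B_[y * k_ + z] and C_[y * m_ + x] in the loops below.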
*/ + int x, y, z; + T acc; + for (x = 0; x < m_; x++) { + for (y = 0; y < n_; y++) { + acc = 0.0; + for (z = 0; z < k_; z++) { + acc += A_[z * m_ + x] * B_[y * k_ + z]; + } + C_[y * m_ + x] = acc; + } + } + // Ensure compiler doesn't optimise away the work being done + callConsume(); + } + + /** Perform any required steps before calling the GEMM kernel that should + * be timed. */ + void preLoopRequirements() override {} + + /** Perform any required steps after calling the GEMM kernel that should + * be timed. */ + void postLoopRequirements() override {} +}; + +} // namespace cpu +#endif diff --git a/DefaultGPU/sp_gemm.hh b/DefaultGPU/sp_gemm.hh new file mode 100644 index 0000000..92d157c --- /dev/null +++ b/DefaultGPU/sp_gemm.hh @@ -0,0 +1,54 @@ +#pragma once + +#if defined GPU_DEFAULT + +#include + +#include "../include/kernels/GPU/sp_gemm.hh" +#include "../include/utilities.hh" + +namespace gpu { +/** A class for GEMM GPU BLAS kernels. */ +template +class sp_gemm_gpu : public sp_gemm { + public: + using sp_gemm::sp_gemm; + + /** Call the BLAS kernel n times, with 1 warmup run. + * Returns the time elapsed for n BLAS calls in seconds. */ + time_checksum_gflop compute() { + // Override function in base `kernel` class as DefaultGPU should do nothing. + return {INFINITY, INFINITY, 0.0}; + } + + /** Initialise the required data structures. */ + void initialise(gpuOffloadType offload, int m, int n, int k) override { + // Default GPU implementation - do nothing. + } + + private: + /** Make a call to the BLAS Library Kernel. */ + void callGemm() override { + // Default GPU implementation - do nothing. + } + + /** Perform any required steps before calling the GEMM kernel that should + * be timed. */ + void preLoopRequirements() override { + // Default GPU implementation - do nothing. + } + + /** Perform any required steps after calling the GEMM kernel that should + * be timed. */ + void postLoopRequirements() override { + // Default GPU implementation - do nothing. + } + + /** Do any necessary cleanup (free pointers, close library handles, etc.) + * after Kernel has been called. */ + void postCallKernelCleanup() override { + // Default GPU implementation - do nothing. + } +}; +} // namespace gpu +#endif \ No newline at end of file diff --git a/cuBLAS/sp_gemm.hh b/cuBLAS/sp_gemm.hh new file mode 100644 index 0000000..3a9cff0 --- /dev/null +++ b/cuBLAS/sp_gemm.hh @@ -0,0 +1,295 @@ +#pragma once + +#ifdef GPU_CUBLAS +#include +#include + +#include "../include/kernels/GPU/gemm.hh" +#include "../include/utilities.hh" +#include "common.hh" + +namespace gpu { +/** A class for GEMM GPU BLAS kernels. */ +template +class sp_gemm_gpu : public gemm { + public: + using gemm::gemm; + using gemm::m_; + using gemm::n_; + using gemm::k_; + using gemm::A_; + using gemm::B_; + using gemm::C_; + using gemm::offload_; + + /** Initialise the required data structures. 
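+   * `m`, `n` and `k` are the dense GEMM dimensions (C[M,N] = A[M,K] * B[K,N]).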
+ * `offload` refers to the data offload type: + * - Once: Move data from host to device before all iterations & move from + * device to host after all iterations + * - Always: Move data from host to device and device to host each iteration + * - Unified: Initialise data as unified memory; no data movement semantics + * required */ + void initialise(gpuOffloadType offload, int m, int n, int k) override { + offload_ = offload; + + m_ = m; + n_ = n; + k_ = k; + + // Create a handle for CUBLAS + cublasCreate(&handle_); + + // Get device identifier + cudaCheckError(cudaGetDevice(&gpuDevice_)); + + // Initialise 3 streams to asynchronously move data between host and device + cudaCheckError(cudaStreamCreate(&s1_)); + cudaCheckError(cudaStreamCreate(&s2_)); + cudaCheckError(cudaStreamCreate(&s3_)); + + if (offload_ == gpuOffloadType::unified) { + cudaCheckError(cudaMallocManaged(&A_, sizeof(T) * m_ * k_)); + cudaCheckError(cudaMallocManaged(&B_, sizeof(T) * k_ * n_)); + cudaCheckError(cudaMallocManaged(&C_, sizeof(T) * m_ * n_)); + } else { + // Allocate matrices on host + A_ = (T*)malloc(sizeof(T) * m_ * k_); + B_ = (T*)malloc(sizeof(T) * k_ * n_); + C_ = (T*)malloc(sizeof(T) * m_ * n_); + // Allocate matrices on device + cudaCheckError(cudaMalloc((void**)&A_device_, sizeof(T) * m_ * k_)); + cudaCheckError(cudaMalloc((void**)&B_device_, sizeof(T) * k_ * n_)); + cudaCheckError(cudaMalloc((void**)&C_device_, sizeof(T) * m_ * n_)); + } + + // Initialise the host matricies + srand(SEED); + for (int y = 0; y < m_; y++) { + for (int x = 0; x < k_; x++) { + A_[y * k_ + x] = (((T)(rand() % 10000) / 100.0) - 30.0); + } + } + for (int y = 0; y < k_; y++) { + for (int x = 0; x < n_; x++) { + B_[y * n_ + x] = (((T)(rand() % 10000) / 100.0) - 30.0); + } + } + } + + private: + /** Perform any required steps before calling the GEMM kernel that should + * be timed. */ + void preLoopRequirements() override { + switch (offload_) { + case gpuOffloadType::always: { + // Offload data each iteration - no requirements + break; + } + case gpuOffloadType::once: { + // Offload data from host to the device. + cudaCheckError(cudaMemcpyAsync(A_device_, A_, sizeof(T) * m_ * k_, + cudaMemcpyHostToDevice, s1_)); + cudaCheckError(cudaMemcpyAsync(B_device_, B_, sizeof(T) * k_ * n_, + cudaMemcpyHostToDevice, s2_)); + cudaCheckError(cudaMemcpyAsync(C_device_, C_, sizeof(T) * m_ * n_, + cudaMemcpyHostToDevice, s3_)); + break; + } + case gpuOffloadType::unified: { + // Prefetch memory to device + cudaCheckError( + cudaMemPrefetchAsync(A_, sizeof(T) * m_ * k_, gpuDevice_, s1_)); + cudaCheckError( + cudaMemPrefetchAsync(B_, sizeof(T) * k_ * n_, gpuDevice_, s2_)); + cudaCheckError( + cudaMemPrefetchAsync(C_, sizeof(T) * m_ * n_, gpuDevice_, s3_)); + break; + } + } + } + + /** Make a call to the BLAS Library Kernel. */ + void callGemm() override { + switch (offload_) { + case gpuOffloadType::always: { + // Offload data from host to the device. 
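+        // The three copies below are issued on streams s1_-s3_; the
+        // cudaDeviceSynchronize() at the end of this case guarantees that the
+        // copies and the GEMM have completed before the call returns.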
+ cudaCheckError(cudaMemcpyAsync(A_device_, A_, sizeof(T) * m_ * k_, + cudaMemcpyHostToDevice, s1_)); + cudaCheckError(cudaMemcpyAsync(B_device_, B_, sizeof(T) * k_ * n_, + cudaMemcpyHostToDevice, s2_)); + cudaCheckError(cudaMemcpyAsync(C_device_, C_, sizeof(T) * m_ * n_, + cudaMemcpyHostToDevice, s3_)); + // Call cuBLAS GEMM kernel + if constexpr (std::is_same_v) { + cublasStatus_t stat = + cublasSgemm(handle_, CUBLAS_OP_N, CUBLAS_OP_N, m_, n_, k_, &alpha, + A_device_, std::max(1, m_), B_device_, + std::max(1, k_), &beta, C_device_, std::max(1, m_)); + if (stat != CUBLAS_STATUS_SUCCESS) { + std::cout << "cuBLAS error:" << stat << std::endl; + exit(1); + } + } else if constexpr (std::is_same_v) { + cublasStatus_t stat = + cublasDgemm(handle_, CUBLAS_OP_N, CUBLAS_OP_N, m_, n_, k_, &alpha, + A_device_, std::max(1, m_), B_device_, + std::max(1, k_), &beta, C_device_, std::max(1, m_)); + if (stat != CUBLAS_STATUS_SUCCESS) { + std::cout << "cuBLAS error:" << stat << std::endl; + exit(1); + } + } + // Offload data from device to host + cudaCheckError(cudaMemcpyAsync(A_, A_device_, sizeof(T) * m_ * k_, + cudaMemcpyDeviceToHost, s1_)); + cudaCheckError(cudaMemcpyAsync(B_, B_device_, sizeof(T) * k_ * n_, + cudaMemcpyDeviceToHost, s2_)); + cudaCheckError(cudaMemcpyAsync(C_, C_device_, sizeof(T) * m_ * n_, + cudaMemcpyDeviceToHost, s3_)); + // Ensure device has finished all work. + cudaCheckError(cudaDeviceSynchronize()); + break; + } + case gpuOffloadType::once: { + // Call cuBLAS GEMM kernel + if constexpr (std::is_same_v) { + cublasStatus_t stat = + cublasSgemm(handle_, CUBLAS_OP_N, CUBLAS_OP_N, m_, n_, k_, &alpha, + A_device_, std::max(1, m_), B_device_, + std::max(1, k_), &beta, C_device_, std::max(1, m_)); + if (stat != CUBLAS_STATUS_SUCCESS) { + std::cout << "cuBLAS error:" << stat << std::endl; + exit(1); + } + } else if constexpr (std::is_same_v) { + cublasStatus_t stat = + cublasDgemm(handle_, CUBLAS_OP_N, CUBLAS_OP_N, m_, n_, k_, &alpha, + A_device_, std::max(1, m_), B_device_, + std::max(1, k_), &beta, C_device_, std::max(1, m_)); + if (stat != CUBLAS_STATUS_SUCCESS) { + std::cout << "cuBLAS error:" << stat << std::endl; + exit(1); + } + } + break; + } + case gpuOffloadType::unified: { + // Call cuBLAS GEMM kernel + if constexpr (std::is_same_v) { + cublasStatus_t stat = cublasSgemm( + handle_, CUBLAS_OP_N, CUBLAS_OP_N, m_, n_, k_, &alpha, A_, + std::max(1, m_), B_, std::max(1, k_), &beta, C_, std::max(1, m_)); + if (stat != CUBLAS_STATUS_SUCCESS) { + std::cout << "cuBLAS error:" << stat << std::endl; + exit(1); + } + } else if constexpr (std::is_same_v) { + cublasStatus_t stat = cublasDgemm( + handle_, CUBLAS_OP_N, CUBLAS_OP_N, m_, n_, k_, &alpha, A_, + std::max(1, m_), B_, std::max(1, k_), &beta, C_, std::max(1, m_)); + if (stat != CUBLAS_STATUS_SUCCESS) { + std::cout << "cuBLAS error:" << stat << std::endl; + exit(1); + } + } + break; + } + } + } + + /** Perform any required steps after calling the GEMM kernel that should + * be timed. 
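   * For the Once offload type this copies the results back from the device;
   * for Unified it prefetches A, B and C back to the host and synchronizes.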
*/ + void postLoopRequirements() override { + switch (offload_) { + case gpuOffloadType::always: { + // Offload data each iteration - no requirements + break; + } + case gpuOffloadType::once: { + // Offload data from device to host + cudaCheckError(cudaMemcpyAsync(A_, A_device_, sizeof(T) * m_ * k_, + cudaMemcpyDeviceToHost, s1_)); + cudaCheckError(cudaMemcpyAsync(B_, B_device_, sizeof(T) * k_ * n_, + cudaMemcpyDeviceToHost, s2_)); + cudaCheckError(cudaMemcpyAsync(C_, C_device_, sizeof(T) * m_ * n_, + cudaMemcpyDeviceToHost, s3_)); + // Ensure device has finished all work. + cudaCheckError(cudaDeviceSynchronize()); + break; + } + case gpuOffloadType::unified: { + // Ensure all data resides on host once work has completed + cudaCheckError(cudaMemPrefetchAsync(A_, sizeof(T) * m_ * k_, + cudaCpuDeviceId, s1_)); + cudaCheckError(cudaMemPrefetchAsync(B_, sizeof(T) * k_ * n_, + cudaCpuDeviceId, s2_)); + cudaCheckError(cudaMemPrefetchAsync(C_, sizeof(T) * m_ * n_, + cudaCpuDeviceId, s3_)); + // Ensure device has finished all work. + cudaCheckError(cudaDeviceSynchronize()); + break; + } + } + } + + /** Do any necessary cleanup (free pointers, close library handles, etc.) + * after Kernel has been called. */ + void postCallKernelCleanup() override { + // Destroy the handle + cublasDestroy(handle_); + + // Destroy streams after use + cudaCheckError(cudaStreamDestroy(s1_)); + cudaCheckError(cudaStreamDestroy(s2_)); + cudaCheckError(cudaStreamDestroy(s3_)); + + if (offload_ == gpuOffloadType::unified) { + cudaFree(A_); + cudaFree(B_); + cudaFree(C_); + } else { + // Free the memory held on host and device + free(A_); + free(B_); + free(C_); + cudaFree(A_device_); + cudaFree(B_device_); + cudaFree(C_device_); + } + } + + /** Handle used when calling cuBLAS. */ + cublasHandle_t handle_; + + /** CUDA Stream 1 - used to asynchronously move data between host and device. + */ + cudaStream_t s1_; + + /** CUDA Stream 1 - used to asynchronously move data between host and device. + */ + cudaStream_t s2_; + + /** CUDA Stream 1 - used to asynchronously move data between host and device. + */ + cudaStream_t s3_; + + /** The ID of the target GPU Device. */ + int gpuDevice_; + + /** Input matrix A, held on the device. */ + T* A_device_; + + /** Input matrix B, held on the device. */ + T* B_device_; + + /** Input matrix C, held on the device. */ + T* C_device_; + + /** The constant value Alpha. */ + const T alpha = ALPHA; + + /** The constant value Beta. 
*/ + const T beta = BETA; +}; +} // namespace gpu +#endif \ No newline at end of file diff --git a/include/doGemm.hh b/include/doGemm.hh index c1aa742..4a7c564 100644 --- a/include/doGemm.hh +++ b/include/doGemm.hh @@ -20,6 +20,7 @@ #if defined GPU_CUBLAS #include "../cuBLAS/gemm.hh" +#include "../cuBLAS/sp_gemm.hh" #elif defined GPU_ONEMKL #include "../oneMKL/GPU/gemm.hh" #elif defined GPU_ROCBLAS @@ -42,11 +43,13 @@ class doGemm { doGPU_(gpuEnabled) #if CPU_ENABLED , - gemmCpu_(iterations_) + gemmCpu_(iterations_), + spGemmCpu_(iterations_) #endif #if GPU_ENABLED , - gemmGpu_(iterations_) + gemmGpu_(iterations_), + spGemmGpu_(iterations_) #endif { static_assert((std::is_same_v || std::is_same_v) && @@ -68,7 +71,7 @@ class doGemm { "_square_square_M=N=K.csv"); for (int dim = startDimention_; dim <= upperLimit_; dim++) { // M = dim, N = dim, K = dim; - callKernels(csvFile, dim, dim, dim); + callDenseKernels(csvFile, dim, dim, dim); } // Close file csvFile.close(); @@ -94,7 +97,7 @@ class doGemm { int M = 16 * K; int N = 16 * K; while (M <= upperLimit_) { - callKernels(csvFile, M, N, K); + callDenseKernels(csvFile, M, N, K); M += 16; N += 16; K++; @@ -121,7 +124,7 @@ class doGemm { if (upperLimit_ >= 32) { for (int dim = startDimention_; dim <= upperLimit_; dim++) { // M = dim, N = dim, K = 32; - callKernels(csvFile, dim, dim, 32); + callDenseKernels(csvFile, dim, dim, 32); } } // Close file @@ -147,7 +150,7 @@ class doGemm { N = startDimention_; K = 16 * M; while (K <= upperLimit_) { - callKernels(csvFile, M, N, K); + callDenseKernels(csvFile, M, N, K); M++; N++; K += 16; @@ -174,7 +177,7 @@ class doGemm { if (upperLimit_ >= 32) { for (int dim = startDimention_; dim <= upperLimit_; dim++) { // M = 32, N = 32, K = dim; - callKernels(csvFile, 32, 32, dim); + callDenseKernels(csvFile, 32, 32, dim); } } // Close file @@ -200,7 +203,7 @@ class doGemm { N = startDimention_; M = 16 * K; while (M <= upperLimit_) { - callKernels(csvFile, M, N, K); + callDenseKernels(csvFile, M, N, K); M += 16; N++; K++; @@ -227,7 +230,7 @@ class doGemm { if (upperLimit_ >= 32) { for (int dim = startDimention_; dim <= upperLimit_; dim++) { // M = dim, N = 32, K = 32; - callKernels(csvFile, dim, 32, 32); + callDenseKernels(csvFile, dim, 32, 32); } } // Close file @@ -253,7 +256,7 @@ class doGemm { K = startDimention_; N = 16 * K; while (N <= upperLimit_) { - callKernels(csvFile, M, N, K); + callDenseKernels(csvFile, M, N, K); M++; N += 16; K++; @@ -280,7 +283,7 @@ class doGemm { if (upperLimit_ >= 32) { for (int dim = startDimention_; dim <= upperLimit_; dim++) { // M = 32, N = dim, K = 32; - callKernels(csvFile, 32, dim, 32); + callDenseKernels(csvFile, 32, dim, 32); } } // Close file @@ -291,12 +294,27 @@ class doGemm { printOffloadThreshold("Square x Short-and-Wide (M=K=32, N)"); } #endif + + // Square sparse matrix - sparse matrix multiplication + cpuGpu_always_ = cpuGpu_offloadThreshold(); + cpuGpu_once_ = cpuGpu_offloadThreshold(); + cpuGpu_unified_ = cpuGpu_offloadThreshold(); + csvFile = initCSVFile(std::string(CSV_DIR) + "/" + getKernelName() + + "_sparse_square.csv"); + if (upperLimit_ >= 32) { + for (int dim = 1; dim <= upperLimit_; dim++) { + const int N = dim; + callSparseKernels(csvFile, N, 0.99); + } + } + // Close file + csvFile.close(); } private: /** Call the appropriate CPU and GPU GEMM kernels. 
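   * (Dense problems only; sparse-matrix problems are handled by
   * callSparseKernels below.)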
*/ - void callKernels(std::ofstream& csvFile, const int M, const int N, - const int K) { + void callDenseKernels(std::ofstream& csvFile, const int M, const int N, + const int K) { const double probSize = calcKib(M, N, K); const uint64_t flops = calcFlops(M, N, K); std::string kernelName = getKernelName(); @@ -488,6 +506,52 @@ class doGemm { } } + void callSparseKernels(std::ofstream& csvFile, const int N, const float + sparsity) { + const double probSize = calcKib(N, N, N); + const uint64_t flops = calcFlops(N, N, N); + std::string kernelName = getKernelName(); + + spGemmCpu_.initialise(N, sparsity); + time_checksum_gflop cpuResult = spGemmCpu_.compute(); + cpuResult.gflops = calcGflops(flops, iterations_, cpuResult.runtime); + + // Perform the GPU kernels + // - ONCE : Offload to/from GPU once before all iterations and once + // after + spGemmGpu_.initialise(gpuOffloadType::once, N, N, N); + time_checksum_gflop gpuResult_once = gemmGpu_.compute(); + gpuResult_once.gflops = + calcGflops(flops, iterations_, gpuResult_once.runtime); + + // - ALWAYS: Offload to/from GPU every iteration + spGemmGpu_.initialise(gpuOffloadType::always, N, N, N); + time_checksum_gflop gpuResult_always = gemmGpu_.compute(); + gpuResult_always.gflops = + calcGflops(flops, iterations_, gpuResult_always.runtime); + + // - UNIFIED : data passed from host to device (and device to host) as + // needed + spGemmGpu_.initialise(gpuOffloadType::unified, N, N, N); + time_checksum_gflop gpuResult_unified = gemmGpu_.compute(); + gpuResult_unified.gflops = + calcGflops(flops, iterations_, gpuResult_unified.runtime); + + // ToDo -- non-default GPU operations + + // Write lines to CSV file + writeLineToCsv(csvFile, "cpu", kernelName, N, N, N, probSize, iterations_, + cpuResult.runtime, cpuResult.gflops); + writeLineToCsv(csvFile, "gpu_offloadOnce", kernelName, N, N, N, probSize, + iterations_, gpuResult_once.runtime, gpuResult_once.gflops); + writeLineToCsv(csvFile, "gpu_offloadAlways", kernelName, N, N, N, probSize, + iterations_, gpuResult_always.runtime, + gpuResult_always.gflops); + writeLineToCsv(csvFile, "gpu_unified", kernelName, N, N, N, probSize, + iterations_, gpuResult_unified.runtime, + gpuResult_unified.gflops); + } + /** A function for calculating FLOPs performed by a GEMM. * C = alpha*AB + beta*C */ constexpr uint64_t calcFlops(const int M, const int N, const int K) const { @@ -623,11 +687,15 @@ class doGemm { cpu::gemm_cpu gemmCpu_; #endif + cpu::sp_gemm_cpu spGemmCpu_; + #if GPU_ENABLED /** The GEMM GPU kernel. */ gpu::gemm_gpu gemmGpu_; #endif + gpu::sp_gemm_gpu spGemmGpu_; + /** The point at which offloading to GPU (offload once) becomes worthwhile. */ cpuGpu_offloadThreshold cpuGpu_once_; diff --git a/include/kernels/CPU/sp_gemm.hh b/include/kernels/CPU/sp_gemm.hh new file mode 100644 index 0000000..3de5ea5 --- /dev/null +++ b/include/kernels/CPU/sp_gemm.hh @@ -0,0 +1,110 @@ +#pragma once + +#include "../gemm.hh" + +#include + +namespace cpu { + +/** An abstract class for GEMM BLAS kernels. */ + template + class sp_gemm : public ::gemm { + public: + using ::gemm::gemm; + using ::gemm::m_; + using ::gemm::n_; + using ::gemm::k_; + using ::gemm::A_; + using ::gemm::B_; + using ::gemm::C_; + + public: + /** Initialise the required data structures. 
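   * `n` is the dimension of the square n x n operands, `sparsity` is the
   * target fraction of zero entries, and `binary` selects 0/1 edge weights
   * instead of random reals; e.g. initialise(1024, 0.99) builds two
   * 1024 x 1024 matrices with roughly 1% non-zeros.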
*/ + virtual void initialise(int n, double sparsity, bool binary = false) { + n_ = n; + + A_ = (T*)malloc(sizeof(T) * n_ * n_); + B_ = (T*)malloc(sizeof(T) * n_ * n_); + C_ = (T*)malloc(sizeof(T) * n_ * n_); + + // Set initial values to 0 + for (int i = 0; i < (n_ * n_); i++) { + A_[i] = 0.0; + B_[i] = 0.0; + } + + // Random number generator objects for use in descent + std::default_random_engine gen; + gen.seed(std::chrono::system_clock::now() + .time_since_epoch().count()); + std::uniform_real_distribution dist(0.0, 1.0); + + // Work out number of edges needed to achieve target sparsity + int edges = 1 + (int) (n * n * (1 - sparsity)); + + // Initialise the matrices + // Using a=0.45 and b=c=0.22 as default probabilities + for (int i = 0; i < edges; i++) { + while (!rMat(A_, n, 0, n - 1, 0, n - 1, + 0.45, 0.22, 0.22, + &gen, dist, false)) {} + while (!rMat(B_, n, 0, n - 1, 0, n - 1, + 0.45, 0.22, 0.22, + &gen, dist, false)) {} + } + } + + private: + bool rMat(T* M, int n, int x1, int x2, int y1, int y2, + float a, float b, float c, std::default_random_engine* gen, + std::uniform_real_distribution dist, bool bin) { + // If a 1x1 submatrix, then add an edge and return out + if (x1 >= x2 && y1 >= y2) { + if (abs(M[(y1 * n) + x1]) > 0.1) { + return false; + } else { + // Add 1.0 if this is a binary graph, and a random real number otherwise + M[(int) (y1 * n) + x1] = (bin) ? 1.0 : (((rand() % 10000) / + 100.0) - 50.0); + return true; + } + } else { + // Divide up the matrix + int xMidPoint = x1 + floor((x2 - x1) / 2); + int yMidPoint = y1 + floor((y2 - y1) / 2); + + // ToDo -- add some noise to these values between iterations + float newA = a; + float newB = b; + float newC = c; + + // Work out which quarter to recurse into + // There are some ugly ternary operators here to avoid going out of bounds in the edge case + // that we are already at 1 width or 1 height + float randomNum = dist(*gen); + if (randomNum < a) { + return rMat(M, n, x1, xMidPoint, y1, yMidPoint, + newA, newB, newC, gen, dist, bin); + } else if (randomNum < (a + b)) { + return rMat(M, n, ((xMidPoint < x2) ? xMidPoint + 1 : xMidPoint), x2, y1, yMidPoint, + newA, newB, newC, gen, dist, bin); + } else if (randomNum < (a + b + c)) { + return rMat(M, n, x1, xMidPoint, ((yMidPoint < y2) ? yMidPoint + 1 : yMidPoint), y2, + newA, newB, newC, gen, dist, bin); + } else { + return rMat(M, n, ((xMidPoint < x2) ? xMidPoint + 1 : xMidPoint), x2, + ((yMidPoint < y2) ? yMidPoint + 1 : yMidPoint), y2, newA, newB, newC, + gen, dist, bin); + } + } + return true; + } + /** Do any necessary cleanup (free pointers, close library handles, etc.) + * after Kernel has been called. */ + void postCallKernelCleanup() { + free(A_); + free(B_); + free(C_); + } + }; +} // namespace cpu \ No newline at end of file diff --git a/include/kernels/GPU/sp_gemm.hh b/include/kernels/GPU/sp_gemm.hh new file mode 100644 index 0000000..684c166 --- /dev/null +++ b/include/kernels/GPU/sp_gemm.hh @@ -0,0 +1,27 @@ +#pragma once + +#include "../gemm.hh" + +namespace gpu { + +/** An abstract class for GEMM BLAS kernels. */ + template + class sp_gemm : public ::gemm { + public: + using ::gemm::gemm; + + /** Initialise the required data structures. 
+ * `offload` refers to the data offload type: + * - Once: Move data from host to device before all iterations & move from + * device to host after all iterations + * - Always: Move data from host to device and device to host each iteration + * - Unified: Initialise data as unified memory; no data movement semantics + * required */ + virtual void initialise(gpuOffloadType offload, int m, int n, int k) = 0; + + protected: + /** Whether data should be offloaded to/from the GPU each iteration, or just + * before & after. */ + gpuOffloadType offload_ = gpuOffloadType::always; + }; +} // namespace gpu \ No newline at end of file diff --git a/src/main.cc b/src/main.cc index c61df37..38e2b5a 100644 --- a/src/main.cc +++ b/src/main.cc @@ -2,6 +2,10 @@ int iters = 10; int upperLimit = 128; +bool sgemm = true; +bool dgemm = true; +bool sp_sgemm = true; +bool sp_dgemm = true; bool doCpu = CPU_ENABLED; bool doGpu = GPU_ENABLED; From f2ed11f5325e2e063d0f92e07d09b13db6b356d7 Mon Sep 17 00:00:00 2001 From: Alex Cockrean <84676155+ABenC377@users.noreply.github.com> Date: Wed, 13 Mar 2024 13:43:05 +0000 Subject: [PATCH 03/32] Implementing cuSPARSE kernel --- cuBLAS/sp_gemm.hh | 208 +++++++++++++++++++++++++--------------------- 1 file changed, 111 insertions(+), 97 deletions(-) diff --git a/cuBLAS/sp_gemm.hh b/cuBLAS/sp_gemm.hh index 3a9cff0..67d030c 100644 --- a/cuBLAS/sp_gemm.hh +++ b/cuBLAS/sp_gemm.hh @@ -1,7 +1,7 @@ #pragma once #ifdef GPU_CUBLAS -#include +#include "cusparse.h" #include #include "../include/kernels/GPU/gemm.hh" @@ -14,9 +14,7 @@ template class sp_gemm_gpu : public gemm { public: using gemm::gemm; - using gemm::m_; using gemm::n_; - using gemm::k_; using gemm::A_; using gemm::B_; using gemm::C_; @@ -29,15 +27,28 @@ class sp_gemm_gpu : public gemm { * - Always: Move data from host to device and device to host each iteration * - Unified: Initialise data as unified memory; no data movement semantics * required */ - void initialise(gpuOffloadType offload, int m, int n, int k) override { + void initialise(gpuOffloadType offload, int n, float sparsity) override { offload_ = offload; - m_ = m; + // Create a handle for cuSPARSE + cusparseCreate(&handle_); + n_ = n; - k_ = k; - // Create a handle for CUBLAS - cublasCreate(&handle_); + // Create descriptors for matrices A->C + cusparseMatDescr_t descrA, descrB, descrC; + + cusparseCreateMatDescr(&descrA); + cusparseCreateMatDescr(&descrB); + cusparseCreateMatDescr(&descrC); + + cusparseSetMatType(descrA, CUSPARSE_MATRIX_TYPE_GENERAL); + cusparseSetMatType(descrB, CUSPARSE_MATRIX_TYPE_GENERAL); + cusparseSetMatType(descrC, CUSPARSE_MATRIX_TYPE_GENERAL); + + cusparseSetMatIndexBase(descrA, CUSPARSE_INDEX_BASE_ZERO); + cusparseSetMatIndexBase(descrB, CUSPARSE_INDEX_BASE_ZERO); + cusparseSetMatIndexBase(descrC, CUSPARSE_INDEX_BASE_ZERO); // Get device identifier cudaCheckError(cudaGetDevice(&gpuDevice_)); @@ -47,38 +58,96 @@ class sp_gemm_gpu : public gemm { cudaCheckError(cudaStreamCreate(&s2_)); cudaCheckError(cudaStreamCreate(&s3_)); + + // Work out number of edges needed to achieve target sparsity + int edges = 1 + (int) (n_ * n_ * (1 - sparsity)); + if (offload_ == gpuOffloadType::unified) { - cudaCheckError(cudaMallocManaged(&A_, sizeof(T) * m_ * k_)); - cudaCheckError(cudaMallocManaged(&B_, sizeof(T) * k_ * n_)); - cudaCheckError(cudaMallocManaged(&C_, sizeof(T) * m_ * n_)); + cudaCheckError(cudaMallocManaged(&A_, sizeof(T) * n_ * n_)); + cudaCheckError(cudaMallocManaged(&B_, sizeof(T) * n_ * n_)); + cudaCheckError(cudaMallocManaged(&C_, 
sizeof(T) * n_ * n_)); + cudaCheckError(cudaMallocManaged(&DANnzPerRow, sizeof(int) * n_)); } else { // Allocate matrices on host - A_ = (T*)malloc(sizeof(T) * m_ * k_); - B_ = (T*)malloc(sizeof(T) * k_ * n_); - C_ = (T*)malloc(sizeof(T) * m_ * n_); + A_ = (T*)malloc(sizeof(T) * n_ * n_); + B_ = (T*)malloc(sizeof(T) * n_ * n_); + C_ = (T*)malloc(sizeof(T) * n_ * n_); + // Allocate matrices on device - cudaCheckError(cudaMalloc((void**)&A_device_, sizeof(T) * m_ * k_)); - cudaCheckError(cudaMalloc((void**)&B_device_, sizeof(T) * k_ * n_)); - cudaCheckError(cudaMalloc((void**)&C_device_, sizeof(T) * m_ * n_)); + cudaCheckError(cudaMalloc((void**)&A_device_, sizeof(T) * n_ * n_)); + cudaCheckError(cudaMalloc((void**)&B_device_, sizeof(T) * n_ * n_)); + cudaCheckError(cudaMalloc((void**)&C_device_, sizeof(T) * n_ * n_)); + // Alloce non-zero vector for A + cudaCheckError(cudaMalloc((void**)&dANnzPerRow, sizeof(int) * n_)); } - // Initialise the host matricies - srand(SEED); - for (int y = 0; y < m_; y++) { - for (int x = 0; x < k_; x++) { - A_[y * k_ + x] = (((T)(rand() % 10000) / 100.0) - 30.0); - } - } - for (int y = 0; y < k_; y++) { - for (int x = 0; x < n_; x++) { - B_[y * n_ + x] = (((T)(rand() % 10000) / 100.0) - 30.0); - } - } + // Initialise the host matricies + // cusparseSpGEMM() works on CSR format only. This helpfully makes our + // sparse matrix format decision for us! + // ToDo -- do the RMAT instantiation of A_ and B_. Need to think about + // how this can be done in the context of CSR. + + // Initialise the matrices + // Using a=0.45 and b=c=0.22 as default probabilities + for (int i = 0; i < edges; i++) { + while (!rMat(A_, n, 0, n - 1, 0, n - 1, + 0.45, 0.22, 0.22, + &gen, dist, false)) {} + while (!rMat(B_, n, 0, n - 1, 0, n - 1, + 0.45, 0.22, 0.22, + &gen, dist, false)) {} + } } private: + bool rMat(T* M, int n, int x1, int x2, int y1, int y2, + float a, float b, float c, std::default_random_engine* gen, + std::uniform_real_distribution dist, bool bin) { + // If a 1x1 submatrix, then add an edge and return out + if (x1 >= x2 && y1 >= y2) { + if (abs(M[(y1 * n) + x1]) > 0.1) { + return false; + } else { + // Add 1.0 if this is a binary graph, and a random real number otherwise + M[(int) (y1 * n) + x1] = (bin) ? 1.0 : (((rand() % 10000) / + 100.0) - 50.0); + return true; + } + } else { + // Divide up the matrix + int xMidPoint = x1 + floor((x2 - x1) / 2); + int yMidPoint = y1 + floor((y2 - y1) / 2); + + // ToDo -- add some noise to these values between iterations + float newA = a; + float newB = b; + float newC = c; + + // Work out which quarter to recurse into + // There are some ugly ternary operators here to avoid going out of bounds in the edge case + // that we are already at 1 width or 1 height + float randomNum = dist(*gen); + if (randomNum < a) { + return rMat(M, n, x1, xMidPoint, y1, yMidPoint, + newA, newB, newC, gen, dist, bin); + } else if (randomNum < (a + b)) { + return rMat(M, n, ((xMidPoint < x2) ? xMidPoint + 1 : xMidPoint), x2, y1, yMidPoint, + newA, newB, newC, gen, dist, bin); + } else if (randomNum < (a + b + c)) { + return rMat(M, n, x1, xMidPoint, ((yMidPoint < y2) ? yMidPoint + 1 : yMidPoint), y2, + newA, newB, newC, gen, dist, bin); + } else { + return rMat(M, n, ((xMidPoint < x2) ? xMidPoint + 1 : xMidPoint), x2, + ((yMidPoint < y2) ? yMidPoint + 1 : yMidPoint), y2, newA, newB, newC, + gen, dist, bin); + } + } + return true; + } + /** Perform any required steps before calling the GEMM kernel that should * be timed. 
*/ + // ToDo -- update this to apply to CSR format void preLoopRequirements() override { switch (offload_) { case gpuOffloadType::always: { @@ -119,79 +188,20 @@ class sp_gemm_gpu : public gemm { cudaMemcpyHostToDevice, s2_)); cudaCheckError(cudaMemcpyAsync(C_device_, C_, sizeof(T) * m_ * n_, cudaMemcpyHostToDevice, s3_)); - // Call cuBLAS GEMM kernel - if constexpr (std::is_same_v) { - cublasStatus_t stat = - cublasSgemm(handle_, CUBLAS_OP_N, CUBLAS_OP_N, m_, n_, k_, &alpha, - A_device_, std::max(1, m_), B_device_, - std::max(1, k_), &beta, C_device_, std::max(1, m_)); - if (stat != CUBLAS_STATUS_SUCCESS) { - std::cout << "cuBLAS error:" << stat << std::endl; - exit(1); - } - } else if constexpr (std::is_same_v) { - cublasStatus_t stat = - cublasDgemm(handle_, CUBLAS_OP_N, CUBLAS_OP_N, m_, n_, k_, &alpha, - A_device_, std::max(1, m_), B_device_, - std::max(1, k_), &beta, C_device_, std::max(1, m_)); - if (stat != CUBLAS_STATUS_SUCCESS) { - std::cout << "cuBLAS error:" << stat << std::endl; - exit(1); - } - } - // Offload data from device to host - cudaCheckError(cudaMemcpyAsync(A_, A_device_, sizeof(T) * m_ * k_, - cudaMemcpyDeviceToHost, s1_)); - cudaCheckError(cudaMemcpyAsync(B_, B_device_, sizeof(T) * k_ * n_, - cudaMemcpyDeviceToHost, s2_)); - cudaCheckError(cudaMemcpyAsync(C_, C_device_, sizeof(T) * m_ * n_, - cudaMemcpyDeviceToHost, s3_)); - // Ensure device has finished all work. - cudaCheckError(cudaDeviceSynchronize()); + // Call cuSPARSE SpGEMM kernel + // ToDo -- implement break; } case gpuOffloadType::once: { - // Call cuBLAS GEMM kernel - if constexpr (std::is_same_v) { - cublasStatus_t stat = - cublasSgemm(handle_, CUBLAS_OP_N, CUBLAS_OP_N, m_, n_, k_, &alpha, - A_device_, std::max(1, m_), B_device_, - std::max(1, k_), &beta, C_device_, std::max(1, m_)); - if (stat != CUBLAS_STATUS_SUCCESS) { - std::cout << "cuBLAS error:" << stat << std::endl; - exit(1); - } - } else if constexpr (std::is_same_v) { - cublasStatus_t stat = - cublasDgemm(handle_, CUBLAS_OP_N, CUBLAS_OP_N, m_, n_, k_, &alpha, - A_device_, std::max(1, m_), B_device_, - std::max(1, k_), &beta, C_device_, std::max(1, m_)); - if (stat != CUBLAS_STATUS_SUCCESS) { - std::cout << "cuBLAS error:" << stat << std::endl; - exit(1); - } - } + // Call cuSPRASE SpGEMM kernel + // ToDo -- implement + break; } case gpuOffloadType::unified: { - // Call cuBLAS GEMM kernel - if constexpr (std::is_same_v) { - cublasStatus_t stat = cublasSgemm( - handle_, CUBLAS_OP_N, CUBLAS_OP_N, m_, n_, k_, &alpha, A_, - std::max(1, m_), B_, std::max(1, k_), &beta, C_, std::max(1, m_)); - if (stat != CUBLAS_STATUS_SUCCESS) { - std::cout << "cuBLAS error:" << stat << std::endl; - exit(1); - } - } else if constexpr (std::is_same_v) { - cublasStatus_t stat = cublasDgemm( - handle_, CUBLAS_OP_N, CUBLAS_OP_N, m_, n_, k_, &alpha, A_, - std::max(1, m_), B_, std::max(1, k_), &beta, C_, std::max(1, m_)); - if (stat != CUBLAS_STATUS_SUCCESS) { - std::cout << "cuBLAS error:" << stat << std::endl; - exit(1); - } - } + // Call cuSPARSE SpGEMM kernel + // ToDo -- implement + break; } } @@ -199,6 +209,7 @@ class sp_gemm_gpu : public gemm { /** Perform any required steps after calling the GEMM kernel that should * be timed. */ + // ToDo -- check that this all still works void postLoopRequirements() override { switch (offload_) { case gpuOffloadType::always: { @@ -236,7 +247,7 @@ class sp_gemm_gpu : public gemm { * after Kernel has been called. 
*/ void postCallKernelCleanup() override { // Destroy the handle - cublasDestroy(handle_); + cusparseDestroy(handle_); // Destroy streams after use cudaCheckError(cudaStreamDestroy(s1_)); @@ -285,6 +296,9 @@ class sp_gemm_gpu : public gemm { /** Input matrix C, held on the device. */ T* C_device_; + /** Vector for number non-zeros, held on the device */ + int* dANnzPerRow; + /** The constant value Alpha. */ const T alpha = ALPHA; From c208246927e738615a94c0308e845cf42c198f98 Mon Sep 17 00:00:00 2001 From: Alex Cockrean <84676155+ABenC377@users.noreply.github.com> Date: Wed, 13 Mar 2024 14:05:20 +0000 Subject: [PATCH 04/32] Trying to work out CSR malloc bug --- cuBLAS/sp_gemm.hh | 126 ++++++++++++++++++++++++++++------------------ 1 file changed, 76 insertions(+), 50 deletions(-) diff --git a/cuBLAS/sp_gemm.hh b/cuBLAS/sp_gemm.hh index 67d030c..3232293 100644 --- a/cuBLAS/sp_gemm.hh +++ b/cuBLAS/sp_gemm.hh @@ -66,7 +66,19 @@ class sp_gemm_gpu : public gemm { cudaCheckError(cudaMallocManaged(&A_, sizeof(T) * n_ * n_)); cudaCheckError(cudaMallocManaged(&B_, sizeof(T) * n_ * n_)); cudaCheckError(cudaMallocManaged(&C_, sizeof(T) * n_ * n_)); - cudaCheckError(cudaMallocManaged(&DANnzPerRow, sizeof(int) * n_)); + + cudaCheckError(cudaMallocManaged(&A_val_, sizeof(T) * edges)); + cudaCheckError(cudaMallocManaged(&A_col_, sizeof(int) * edges)); + cudaCheckError(cudaMallocManaged(&A_row_, sizeof(int) * edges)); + + cudaCheckError(cudaMallocManaged(&B_val_, sizeof(T) * edges)); + cudaCheckError(cudaMallocManaged(&B_col_, sizeof(int) * edges)); + cudaCheckError(cudaMallocManaged(&B_row_, sizeof(int) * edges)); + + cudaCheckError(cudaMallocManaged(&C_val_, sizeof(T) * edges)); + cudaCheckError(cudaMallocManaged(&C_col_, sizeof(int) * edges)); + cudaCheckError(cudaMallocManaged(&C_row_, sizeof(int) * edges)); +// cudaCheckError(cudaMallocManaged(&DANnzPerRow, sizeof(int) * n_)); } else { // Allocate matrices on host A_ = (T*)malloc(sizeof(T) * n_ * n_); @@ -78,7 +90,7 @@ class sp_gemm_gpu : public gemm { cudaCheckError(cudaMalloc((void**)&B_device_, sizeof(T) * n_ * n_)); cudaCheckError(cudaMalloc((void**)&C_device_, sizeof(T) * n_ * n_)); // Alloce non-zero vector for A - cudaCheckError(cudaMalloc((void**)&dANnzPerRow, sizeof(int) * n_)); +// cudaCheckError(cudaMalloc((void**)&dANnzPerRow, sizeof(int) * n_)); } // Initialise the host matricies @@ -88,6 +100,11 @@ class sp_gemm_gpu : public gemm { // how this can be done in the context of CSR. // Initialise the matrices + // Set initial values to 0 + for (int i = 0; i < (n_ * n_); i++) { + A_[i] = 0.0; + B_[i] = 0.0; + } // Using a=0.45 and b=c=0.22 as default probabilities for (int i = 0; i < edges; i++) { while (!rMat(A_, n, 0, n - 1, 0, n - 1, @@ -97,57 +114,17 @@ class sp_gemm_gpu : public gemm { 0.45, 0.22, 0.22, &gen, dist, false)) {} } + +// for (int i = 0; i < (n_ * n_); i++) { +// C_[i] = 0.0; +// } } private: - bool rMat(T* M, int n, int x1, int x2, int y1, int y2, - float a, float b, float c, std::default_random_engine* gen, - std::uniform_real_distribution dist, bool bin) { - // If a 1x1 submatrix, then add an edge and return out - if (x1 >= x2 && y1 >= y2) { - if (abs(M[(y1 * n) + x1]) > 0.1) { - return false; - } else { - // Add 1.0 if this is a binary graph, and a random real number otherwise - M[(int) (y1 * n) + x1] = (bin) ? 
1.0 : (((rand() % 10000) / - 100.0) - 50.0); - return true; - } - } else { - // Divide up the matrix - int xMidPoint = x1 + floor((x2 - x1) / 2); - int yMidPoint = y1 + floor((y2 - y1) / 2); - - // ToDo -- add some noise to these values between iterations - float newA = a; - float newB = b; - float newC = c; - - // Work out which quarter to recurse into - // There are some ugly ternary operators here to avoid going out of bounds in the edge case - // that we are already at 1 width or 1 height - float randomNum = dist(*gen); - if (randomNum < a) { - return rMat(M, n, x1, xMidPoint, y1, yMidPoint, - newA, newB, newC, gen, dist, bin); - } else if (randomNum < (a + b)) { - return rMat(M, n, ((xMidPoint < x2) ? xMidPoint + 1 : xMidPoint), x2, y1, yMidPoint, - newA, newB, newC, gen, dist, bin); - } else if (randomNum < (a + b + c)) { - return rMat(M, n, x1, xMidPoint, ((yMidPoint < y2) ? yMidPoint + 1 : yMidPoint), y2, - newA, newB, newC, gen, dist, bin); - } else { - return rMat(M, n, ((xMidPoint < x2) ? xMidPoint + 1 : xMidPoint), x2, - ((yMidPoint < y2) ? yMidPoint + 1 : yMidPoint), y2, newA, newB, newC, - gen, dist, bin); - } - } - return true; - } + /** Perform any required steps before calling the GEMM kernel that should * be timed. */ - // ToDo -- update this to apply to CSR format void preLoopRequirements() override { switch (offload_) { case gpuOffloadType::always: { @@ -188,8 +165,8 @@ class sp_gemm_gpu : public gemm { cudaMemcpyHostToDevice, s2_)); cudaCheckError(cudaMemcpyAsync(C_device_, C_, sizeof(T) * m_ * n_, cudaMemcpyHostToDevice, s3_)); - // Call cuSPARSE SpGEMM kernel - // ToDo -- implement + + break; } case gpuOffloadType::once: { @@ -269,6 +246,51 @@ class sp_gemm_gpu : public gemm { } } + bool rMat(T* M, int n, int x1, int x2, int y1, int y2, + float a, float b, float c, std::default_random_engine* gen, + std::uniform_real_distribution dist, bool bin) { + // If a 1x1 submatrix, then add an edge and return out + if (x1 >= x2 && y1 >= y2) { + if (abs(M[(y1 * n) + x1]) > 0.1) { + return false; + } else { + // Add 1.0 if this is a binary graph, and a random real number otherwise + M[(int) (y1 * n) + x1] = (bin) ? 1.0 : (((rand() % 10000) / + 100.0) - 50.0); + return true; + } + } else { + // Divide up the matrix + int xMidPoint = x1 + floor((x2 - x1) / 2); + int yMidPoint = y1 + floor((y2 - y1) / 2); + + // ToDo -- add some noise to these values between iterations + float newA = a; + float newB = b; + float newC = c; + + // Work out which quarter to recurse into + // There are some ugly ternary operators here to avoid going out of bounds in the edge case + // that we are already at 1 width or 1 height + float randomNum = dist(*gen); + if (randomNum < a) { + return rMat(M, n, x1, xMidPoint, y1, yMidPoint, + newA, newB, newC, gen, dist, bin); + } else if (randomNum < (a + b)) { + return rMat(M, n, ((xMidPoint < x2) ? xMidPoint + 1 : xMidPoint), x2, y1, yMidPoint, + newA, newB, newC, gen, dist, bin); + } else if (randomNum < (a + b + c)) { + return rMat(M, n, x1, xMidPoint, ((yMidPoint < y2) ? yMidPoint + 1 : yMidPoint), y2, + newA, newB, newC, gen, dist, bin); + } else { + return rMat(M, n, ((xMidPoint < x2) ? xMidPoint + 1 : xMidPoint), x2, + ((yMidPoint < y2) ? yMidPoint + 1 : yMidPoint), y2, newA, newB, newC, + gen, dist, bin); + } + } + return true; + } + /** Handle used when calling cuBLAS. 
*/ cublasHandle_t handle_; @@ -297,7 +319,11 @@ class sp_gemm_gpu : public gemm { T* C_device_; /** Vector for number non-zeros, held on the device */ - int* dANnzPerRow; +// int* dANnzPerRow; + + /** CSR format vectors for matrices A, B and C on the device */ + T* A_val_, B_val_, C_val_; + int* A_col_, A_row_, B_col_, B_row_, C_col_, C_row_; /** The constant value Alpha. */ const T alpha = ALPHA; From de14a5682aae00ab582f87a396eaf3da5b66b99f Mon Sep 17 00:00:00 2001 From: Alex Cockrean <84676155+ABenC377@users.noreply.github.com> Date: Wed, 13 Mar 2024 14:07:46 +0000 Subject: [PATCH 05/32] Trying to work out CSR malloc bug --- cuBLAS/sp_gemm.hh | 2 -- 1 file changed, 2 deletions(-) diff --git a/cuBLAS/sp_gemm.hh b/cuBLAS/sp_gemm.hh index 3232293..0765adb 100644 --- a/cuBLAS/sp_gemm.hh +++ b/cuBLAS/sp_gemm.hh @@ -96,8 +96,6 @@ class sp_gemm_gpu : public gemm { // Initialise the host matricies // cusparseSpGEMM() works on CSR format only. This helpfully makes our // sparse matrix format decision for us! - // ToDo -- do the RMAT instantiation of A_ and B_. Need to think about - // how this can be done in the context of CSR. // Initialise the matrices // Set initial values to 0 From 49cddf02f8a50571d2eaa5b653bdf8fb49198d91 Mon Sep 17 00:00:00 2001 From: Alex Cockrean <84676155+ABenC377@users.noreply.github.com> Date: Tue, 19 Mar 2024 13:05:58 +0000 Subject: [PATCH 06/32] cuSPARSE unified memory implementation --- cuBLAS/sp_gemm.hh | 433 ++++++++++++++++++++++++++-------------------- 1 file changed, 250 insertions(+), 183 deletions(-) diff --git a/cuBLAS/sp_gemm.hh b/cuBLAS/sp_gemm.hh index 0765adb..68e3b84 100644 --- a/cuBLAS/sp_gemm.hh +++ b/cuBLAS/sp_gemm.hh @@ -3,6 +3,7 @@ #ifdef GPU_CUBLAS #include "cusparse.h" #include +#include #include "../include/kernels/GPU/gemm.hh" #include "../include/utilities.hh" @@ -20,6 +21,8 @@ class sp_gemm_gpu : public gemm { using gemm::C_; using gemm::offload_; + // ToDo -- just unified implemented so far. Fill in Always and Once later + /** Initialise the required data structures. * `offload` refers to the data offload type: * - Once: Move data from host to device before all iterations & move from @@ -33,10 +36,10 @@ class sp_gemm_gpu : public gemm { // Create a handle for cuSPARSE cusparseCreate(&handle_); - n_ = n; + cudaDataType_ = (std::is_same_v) ? CUDA_R_32F : + CUDA_R_64F; - // Create descriptors for matrices A->C - cusparseMatDescr_t descrA, descrB, descrC; + n_ = n; cusparseCreateMatDescr(&descrA); cusparseCreateMatDescr(&descrB); @@ -61,37 +64,30 @@ class sp_gemm_gpu : public gemm { // Work out number of edges needed to achieve target sparsity int edges = 1 + (int) (n_ * n_ * (1 - sparsity)); + A_nnz_ = B_nnz_ = edges + + // ToDo -- for all of this mallocing, bear in mind that row will probably + // have fewer than 'edges' values (thats the whole point). 
May need to + // reorganise + + cudaCheckError(cudaMallocManaged(A_num_rows_, sizeof(int))); + cudaCheckError(cudaMallocManaged(A_num_cols_, sizeof(int))); + cudaCheckError(cudaMallocManaged(A_nnz_, sizeof(int))); + cudaCheckError(cudaMallocManaged(&A_val_, sizeof(T) * edges)); + cudaCheckError(cudaMallocManaged(&A_col_, sizeof(int) * edges)); + cudaCheckError(cudaMallocManaged(&A_row_, sizeof(int) * (n_ + 1))); + + cudaCheckError(cudaMallocManaged(B_num_rows_, sizeof(int))); + cudaCheckError(cudaMallocManaged(B_num_cols_, sizeof(int))); + cudaCheckError(cudaMallocManaged(B_nnz_, sizeof(int))); + cudaCheckError(cudaMallocManaged(&B_val_, sizeof(T) * edges)); + cudaCheckError(cudaMallocManaged(&B_col_, sizeof(int) * edges)); + cudaCheckError(cudaMallocManaged(&B_row_, sizeof(int) * (n_ + 1))); + + C_val_ = NULL; + C_col_ = NULL; + C_row_ = NULL; - if (offload_ == gpuOffloadType::unified) { - cudaCheckError(cudaMallocManaged(&A_, sizeof(T) * n_ * n_)); - cudaCheckError(cudaMallocManaged(&B_, sizeof(T) * n_ * n_)); - cudaCheckError(cudaMallocManaged(&C_, sizeof(T) * n_ * n_)); - - cudaCheckError(cudaMallocManaged(&A_val_, sizeof(T) * edges)); - cudaCheckError(cudaMallocManaged(&A_col_, sizeof(int) * edges)); - cudaCheckError(cudaMallocManaged(&A_row_, sizeof(int) * edges)); - - cudaCheckError(cudaMallocManaged(&B_val_, sizeof(T) * edges)); - cudaCheckError(cudaMallocManaged(&B_col_, sizeof(int) * edges)); - cudaCheckError(cudaMallocManaged(&B_row_, sizeof(int) * edges)); - - cudaCheckError(cudaMallocManaged(&C_val_, sizeof(T) * edges)); - cudaCheckError(cudaMallocManaged(&C_col_, sizeof(int) * edges)); - cudaCheckError(cudaMallocManaged(&C_row_, sizeof(int) * edges)); -// cudaCheckError(cudaMallocManaged(&DANnzPerRow, sizeof(int) * n_)); - } else { - // Allocate matrices on host - A_ = (T*)malloc(sizeof(T) * n_ * n_); - B_ = (T*)malloc(sizeof(T) * n_ * n_); - C_ = (T*)malloc(sizeof(T) * n_ * n_); - - // Allocate matrices on device - cudaCheckError(cudaMalloc((void**)&A_device_, sizeof(T) * n_ * n_)); - cudaCheckError(cudaMalloc((void**)&B_device_, sizeof(T) * n_ * n_)); - cudaCheckError(cudaMalloc((void**)&C_device_, sizeof(T) * n_ * n_)); - // Alloce non-zero vector for A -// cudaCheckError(cudaMalloc((void**)&dANnzPerRow, sizeof(int) * n_)); - } // Initialise the host matricies // cusparseSpGEMM() works on CSR format only. This helpfully makes our @@ -113,109 +109,160 @@ class sp_gemm_gpu : public gemm { &gen, dist, false)) {} } -// for (int i = 0; i < (n_ * n_); i++) { -// C_[i] = 0.0; -// } + toCSR(A_, n, n, edges, A_val_, A_col_, A_row_); + toCSR(B_, n, n, edges, B_val_, B_col_, B_row_); + } + + private: /** Perform any required steps before calling the GEMM kernel that should * be timed. */ void preLoopRequirements() override { - switch (offload_) { - case gpuOffloadType::always: { - // Offload data each iteration - no requirements - break; - } - case gpuOffloadType::once: { - // Offload data from host to the device. 
- cudaCheckError(cudaMemcpyAsync(A_device_, A_, sizeof(T) * m_ * k_, - cudaMemcpyHostToDevice, s1_)); - cudaCheckError(cudaMemcpyAsync(B_device_, B_, sizeof(T) * k_ * n_, - cudaMemcpyHostToDevice, s2_)); - cudaCheckError(cudaMemcpyAsync(C_device_, C_, sizeof(T) * m_ * n_, - cudaMemcpyHostToDevice, s3_)); - break; - } - case gpuOffloadType::unified: { - // Prefetch memory to device - cudaCheckError( - cudaMemPrefetchAsync(A_, sizeof(T) * m_ * k_, gpuDevice_, s1_)); - cudaCheckError( - cudaMemPrefetchAsync(B_, sizeof(T) * k_ * n_, gpuDevice_, s2_)); - cudaCheckError( - cudaMemPrefetchAsync(C_, sizeof(T) * m_ * n_, gpuDevice_, s3_)); - break; - } - } + // Prefetch memory to device + cudaCheckError(cudaMemPrefetchAsync(A_num_rows_, sizeof(int), gpuDevice_, + s1_)); + cudaCheckError(cudaMemPrefetchAsync(A_num_cols_, sizeof(int), gpuDevice_, + s1_)); + cudaCheckError(cudaMemPrefetchAsync(A_nnz_, sizeof(int), gpuDevice_, + s1_)); + cudaCheckError(cudaMemPrefetchAsync(&A_val_, sizeof(T) * edges, gpuDevice_, + s1_)); + cudaCheckError(cudaMemPrefetchAsync(&A_col_, sizeof(int) * edges, + gpuDevice_, s1_)); + cudaCheckError(cudaMemPrefetchAsync(&A_row_, sizeof(int) * (n_ + 1), + gpuDevice_, s1_)); + + cudaCheckError(cudaMemPrefetchAsync(B_num_rows_, sizeof(int), gpuDevice_, + s2_)); + cudaCheckError(cudaMemPrefetchAsync(B_num_cols_, sizeof(int), gpuDevice_, + s2_)); + cudaCheckError(cudaMemPrefetchAsync(B_nnz_, sizeof(int), gpuDevice_, + s2_)); + cudaCheckError(cudaMemPrefetchAsync(&B_val_, sizeof(T) * edges, gpuDevice_, + s2_)); + cudaCheckError(cudaMemPrefetchAsync(&B_col_, sizeof(int) * edges, + gpuDevice_, s2_)); + cudaCheckError(cudaMemPrefetchAsync(&B_row_, sizeof(int) * (n_ + 1), + gpuDevice_, s2_)); +// +// cudaCheckError(cudaMemPrefetchAsync(C_num_rows_, sizeof(int), gpuDevice_, +// s3_)); +// cudaCheckError(cudaMemPrefetchAsync(C_num_cols_, sizeof(int), gpuDevice_, +// s3_)); +// cudaCheckError(cudaMemPrefetchAsync(C_nnz_, sizeof(int), gpuDevice_, +// s3_)); +// cudaCheckError(cudaMemPrefetchAsync(&C_val_, sizeof(T) * edges, gpuDevice_, +// s3_)); +// cudaCheckError(cudaMemPrefetchAsync(&C_col_, sizeof(int) * edges, +// gpuDevice_, s3_)); +// cudaCheckError(cudaMemPrefetchAsync(&C_row_, sizeof(int) * edges, +// gpuDevice_, s3_)); + + // Create the CSR matrices on the device + cusparseCreateCsr(descrA_, n_, n_, A_nnz_, A_row_, A_col_, A_val_, + CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, + CUSPARSE_INDEX_BASE_ZERO, cudaDateType_); + cusparseCreateCsr(descrB_, n_, n_, B_nnz_, B_row_, B_col_, B_val_, + CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, + CUSPARSE_INDEX_BASE_ZERO, cudaDateType_); + cusparseCreateCsr(descrC_, n_, n_, 0, NULL, NULL, NULL, + CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, + CUSPARSE_INDEX_BASE_ZERO, cudaDataType_); + + cusparseSpGEMM_createDescr(&spgemmDesc_); } /** Make a call to the BLAS Library Kernel. */ void callGemm() override { - switch (offload_) { - case gpuOffloadType::always: { - // Offload data from host to the device. 
- cudaCheckError(cudaMemcpyAsync(A_device_, A_, sizeof(T) * m_ * k_, - cudaMemcpyHostToDevice, s1_)); - cudaCheckError(cudaMemcpyAsync(B_device_, B_, sizeof(T) * k_ * n_, - cudaMemcpyHostToDevice, s2_)); - cudaCheckError(cudaMemcpyAsync(C_device_, C_, sizeof(T) * m_ * n_, - cudaMemcpyHostToDevice, s3_)); - - - break; - } - case gpuOffloadType::once: { - // Call cuSPRASE SpGEMM kernel - // ToDo -- implement - - break; - } - case gpuOffloadType::unified: { - // Call cuSPARSE SpGEMM kernel - // ToDo -- implement - - break; - } - } - } + cusparseSpGEMM_workEstimation(handle_, CUSPARSE_OPERATION_NON_TRANSPOSE, + CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, + descrA_, descrB_, &beta, descrC_, + CUSPARSE_SPGEMM_DEFAULT, cudaDataType_, + spgemmDesc_, buffer_size1_, NULL); + cudaCheckError(cudaMallocManaged(&buffer1_, buffer_size1_)); + cusparseSpGEMM_workEstimation(handle_, CUSPARSE_OPERATION_NON_TRANSPOSE, + CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, + descrA_, descrB_, &beta, descrC_, + CUSPARSE_SPGEMM_DEFAULT, cudaDataType_, + spgemmDesc_, buffer_size1_, buffer1_); + cusparseSpGEMM_cmopute(handle_, CUSPARSE_OPERATION_NON_TRANSPOSE, + CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, descrA_, + descrB_, &beta, descrC_, CUSPARSE_SPGEMM_DEFAULT, + cudaDataType_, spgemmDesc_, buffer_size2_, NULL); + cudaCheckError(cudaMallocManaged(&buffer2_, buffer_size2)); + + if (cusparseSpGEMM_cmopute(handle_, CUSPARSE_OPERATION_NON_TRANSPOSE, + CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, descrA_, + descrB_, &beta, descrC_, CUSPARSE_SPGEMM_DEFAULT, + cudaDataType_, spgemmDesc_, buffer_size2_, buffer2_) + == CUSPARSE_SATUS_INSUFFICIENT_RESOURCES) { + std::cout << "Insufficient resources" << std::endl; + exit(1); + } + + int rows, cols, nnz; + + cusparseSpMatGetSize(descrC_, &rows, &cols, &nnz_); + C_nnz_ = nnz; + cudaCheckError(cudaMallocManaged(C_val_), sizeof(T) * nnz); + cudaCheckError(cudaMallocManaged(C_col_), sizeof(int) * nnz); + cudaCheckError(cudaMallocManaged(C_row_), sizeof(int) * (n_ + 1)); + + cusparseCstSetPointers(descrC_, *C_row, *C_colind, *C_val); + cusparseSpGEMM_copy(handle_, CUSPARSE_OPERATION_NON_TRANSPOSE, + CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, descrA_, + descrB_, &beta, descrC_, CUDA_R_32F, + CUSPARSE_SPGEMM_DEFAULT, spgemmDesc_); + } /** Perform any required steps after calling the GEMM kernel that should * be timed. */ - // ToDo -- check that this all still works void postLoopRequirements() override { - switch (offload_) { - case gpuOffloadType::always: { - // Offload data each iteration - no requirements - break; - } - case gpuOffloadType::once: { - // Offload data from device to host - cudaCheckError(cudaMemcpyAsync(A_, A_device_, sizeof(T) * m_ * k_, - cudaMemcpyDeviceToHost, s1_)); - cudaCheckError(cudaMemcpyAsync(B_, B_device_, sizeof(T) * k_ * n_, - cudaMemcpyDeviceToHost, s2_)); - cudaCheckError(cudaMemcpyAsync(C_, C_device_, sizeof(T) * m_ * n_, - cudaMemcpyDeviceToHost, s3_)); - // Ensure device has finished all work. - cudaCheckError(cudaDeviceSynchronize()); - break; - } - case gpuOffloadType::unified: { - // Ensure all data resides on host once work has completed - cudaCheckError(cudaMemPrefetchAsync(A_, sizeof(T) * m_ * k_, - cudaCpuDeviceId, s1_)); - cudaCheckError(cudaMemPrefetchAsync(B_, sizeof(T) * k_ * n_, - cudaCpuDeviceId, s2_)); - cudaCheckError(cudaMemPrefetchAsync(C_, sizeof(T) * m_ * n_, - cudaCpuDeviceId, s3_)); - // Ensure device has finished all work. 
- cudaCheckError(cudaDeviceSynchronize()); - break; - } - } + // Ensure all data resides on host once work has completed + cudaCheckError(cudaMemPrefetchAsync(A_num_rows_, sizeof(int), + cudaCpuDeviceId_, s1_)); + cudaCheckError(cudaMemPrefetchAsync(A_num_cols_, sizeof(int), + cudaCpuDeviceId_, s1_)); + cudaCheckError(cudaMemPrefetchAsync(A_nnz_, sizeof(int), + cudaCpuDeviceId_, s1_)); + cudaCheckError(cudaMemPrefetchAsync(&A_val_, sizeof(T) * edges, + cudaCpuDeviceId_, s1_)); + cudaCheckError(cudaMemPrefetchAsync(&A_col_, sizeof(int) * edges, + cudaCpuDeviceId_, s1_)); + cudaCheckError(cudaMemPrefetchAsync(&A_row_, sizeof(int) * (n_ + 1), + cudaCpuDeviceId_, s1_)); + + cudaCheckError(cudaMemPrefetchAsync(B_num_rows_, sizeof(int), + cudaCpuDeviceId_, s2_)); + cudaCheckError(cudaMemPrefetchAsync(B_num_cols_, sizeof(int), + cudaCpuDeviceId_, s2_)); + cudaCheckError(cudaMemPrefetchAsync(B_nnz_, sizeof(int), + cudaCpuDeviceId_, s2_)); + cudaCheckError(cudaMemPrefetchAsync(&B_val_, sizeof(T) * edges, + cudaCpuDeviceId_, s2_)); + cudaCheckError(cudaMemPrefetchAsync(&B_col_, sizeof(int) * edges, + cudaCpuDeviceId_, s2_)); + cudaCheckError(cudaMemPrefetchAsync(&B_row_, sizeof(int) * (n_ + 1), + cudaCpuDeviceId_, s2_)); + + cudaCheckError(cudaMemPrefetchAsync(C_num_rows_, sizeof(int), + cudaCpuDeviceId_, s3_)); + cudaCheckError(cudaMemPrefetchAsync(C_num_cols_, sizeof(int), + cudaCpuDeviceId_, s3_)); + cudaCheckError(cudaMemPrefetchAsync(C_nnz_, sizeof(int), + cudaCpuDeviceId_, s3_)); + cudaCheckError(cudaMemPrefetchAsync(&C_val_, sizeof(T) * C_nnz_, + cudaCpuDeviceId_, s3_)); + cudaCheckError(cudaMemPrefetchAsync(&C_col_, sizeof(int) * C_nnz_, + cudaCpuDeviceId_, s3_)); + cudaCheckError(cudaMemPrefetchAsync(&C_row_, sizeof(int) * (n_ + 1), + cudaCpuDeviceId_, s3_)); + // Ensure device has finished all work. + cudaCheckError(cudaDeviceSynchronize()); } /** Do any necessary cleanup (free pointers, close library handles, etc.) @@ -229,65 +276,76 @@ class sp_gemm_gpu : public gemm { cudaCheckError(cudaStreamDestroy(s2_)); cudaCheckError(cudaStreamDestroy(s3_)); - if (offload_ == gpuOffloadType::unified) { - cudaFree(A_); - cudaFree(B_); - cudaFree(C_); - } else { - // Free the memory held on host and device - free(A_); - free(B_); - free(C_); - cudaFree(A_device_); - cudaFree(B_device_); - cudaFree(C_device_); - } + cudaFree(A_); + cudaFree(B_); + cudaFree(C_); } bool rMat(T* M, int n, int x1, int x2, int y1, int y2, float a, float b, float c, std::default_random_engine* gen, std::uniform_real_distribution dist, bool bin) { - // If a 1x1 submatrix, then add an edge and return out - if (x1 >= x2 && y1 >= y2) { - if (abs(M[(y1 * n) + x1]) > 0.1) { - return false; - } else { - // Add 1.0 if this is a binary graph, and a random real number otherwise - M[(int) (y1 * n) + x1] = (bin) ? 1.0 : (((rand() % 10000) / - 100.0) - 50.0); - return true; - } + // If a 1x1 submatrix, then add an edge and return out + if (x1 >= x2 && y1 >= y2) { + if (abs(M[(y1 * n) + x1]) > 0.1) { + return false; + } else { + // Add 1.0 if this is a binary graph, and a random real number otherwise + M[(int) (y1 * n) + x1] = (bin) ? 
1.0 : (((rand() % 10000) / + 100.0) - 50.0); + return true; + } + } else { + // Divide up the matrix + int xMidPoint = x1 + floor((x2 - x1) / 2); + int yMidPoint = y1 + floor((y2 - y1) / 2); + + // ToDo -- add some noise to these values between iterations + float newA = a; + float newB = b; + float newC = c; + + // Work out which quarter to recurse into + // There are some ugly ternary operators here to avoid going out of bounds in the edge case + // that we are already at 1 width or 1 height + float randomNum = dist(*gen); + if (randomNum < a) { + return rMat(M, n, x1, xMidPoint, y1, yMidPoint, + newA, newB, newC, gen, dist, bin); + } else if (randomNum < (a + b)) { + return rMat(M, n, ((xMidPoint < x2) ? xMidPoint + 1 : xMidPoint), x2, y1, yMidPoint, + newA, newB, newC, gen, dist, bin); + } else if (randomNum < (a + b + c)) { + return rMat(M, n, x1, xMidPoint, ((yMidPoint < y2) ? yMidPoint + 1 : yMidPoint), y2, + newA, newB, newC, gen, dist, bin); } else { - // Divide up the matrix - int xMidPoint = x1 + floor((x2 - x1) / 2); - int yMidPoint = y1 + floor((y2 - y1) / 2); - - // ToDo -- add some noise to these values between iterations - float newA = a; - float newB = b; - float newC = c; - - // Work out which quarter to recurse into - // There are some ugly ternary operators here to avoid going out of bounds in the edge case - // that we are already at 1 width or 1 height - float randomNum = dist(*gen); - if (randomNum < a) { - return rMat(M, n, x1, xMidPoint, y1, yMidPoint, - newA, newB, newC, gen, dist, bin); - } else if (randomNum < (a + b)) { - return rMat(M, n, ((xMidPoint < x2) ? xMidPoint + 1 : xMidPoint), x2, y1, yMidPoint, - newA, newB, newC, gen, dist, bin); - } else if (randomNum < (a + b + c)) { - return rMat(M, n, x1, xMidPoint, ((yMidPoint < y2) ? yMidPoint + 1 : yMidPoint), y2, - newA, newB, newC, gen, dist, bin); - } else { - return rMat(M, n, ((xMidPoint < x2) ? xMidPoint + 1 : xMidPoint), x2, - ((yMidPoint < y2) ? yMidPoint + 1 : yMidPoint), y2, newA, newB, newC, - gen, dist, bin); + return rMat(M, n, ((xMidPoint < x2) ? xMidPoint + 1 : xMidPoint), x2, + ((yMidPoint < y2) ? yMidPoint + 1 : yMidPoint), y2, newA, newB, newC, + gen, dist, bin); + } + } + return true; + } + + void toCSR(T* dense, int n_col, int n_row, int nnz, T* vals, int* col_index, + int* row_ptr) { + int nnz_encountered = 0; + int prev_row_ptr = 0; + for (int row = 0; row < n_row; row++) { + if (nnz_encountered >= nnz) break; + row_ptr[row] = prev_row_ptr; + int nnz_row = 0; + for (int col = 0; col < n_col; col++) { + if (nnz_encountered >= nnz) break; + if (dense[(row * n_col) + col] != 0.0) { + nnz_row++; + col_index[nnz_encountered] = col; + vals[nnz_encountered] = dense[(row * n_col) + col]; + nnz_encountered++; } } - return true; + prev_row_ptr += nnz_row; } + } /** Handle used when calling cuBLAS. */ cublasHandle_t handle_; @@ -307,27 +365,36 @@ class sp_gemm_gpu : public gemm { /** The ID of the target GPU Device. */ int gpuDevice_; - /** Input matrix A, held on the device. */ - T* A_device_; - - /** Input matrix B, held on the device. */ - T* B_device_; - - /** Input matrix C, held on the device. */ - T* C_device_; - - /** Vector for number non-zeros, held on the device */ -// int* dANnzPerRow; - - /** CSR format vectors for matrices A, B and C on the device */ + /** CSR format vectors for matrices A, B and C on the host */ + int A_nnz_, B_nnz_, C_nnz_; T* A_val_, B_val_, C_val_; int* A_col_, A_row_, B_col_, B_row_, C_col_, C_row_; + /** CSR format vectors for matrices A, B and C on the device. 
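The rMat generator above follows the usual R-MAT recursion: each call descends through the adjacency matrix, choosing the top-left, top-right, bottom-left or bottom-right quadrant with probabilities a, b, c and 1 - a - b - c (0.45, 0.22, 0.22 and 0.11 with the defaults used by this benchmark) until a single cell remains; if that cell is already occupied the call returns false and the caller retries, so each successful call adds exactly one nonzero. A compact iterative sketch of the same descent, with hypothetical names:

    #include <algorithm>
    #include <random>
    #include <utility>

    // One R-MAT descent over an n x n grid; a, b and c pick the top-left,
    // top-right and bottom-left quadrants, the remainder picks bottom-right.
    std::pair<int, int> rmatCell(int n, float a, float b, float c,
                                 std::default_random_engine& gen) {
      std::uniform_real_distribution<float> dist(0.0f, 1.0f);
      int x1 = 0, x2 = n - 1, y1 = 0, y2 = n - 1;
      while (x1 < x2 || y1 < y2) {
        int xMid = x1 + (x2 - x1) / 2;
        int yMid = y1 + (y2 - y1) / 2;
        float r = dist(gen);
        if (r < a) {                              // top-left
          x2 = xMid;  y2 = yMid;
        } else if (r < a + b) {                   // top-right
          x1 = std::min(xMid + 1, x2);  y2 = yMid;
        } else if (r < a + b + c) {               // bottom-left
          x2 = xMid;  y1 = std::min(yMid + 1, y2);
        } else {                                  // bottom-right
          x1 = std::min(xMid + 1, x2);  y1 = std::min(yMid + 1, y2);
        }
      }
      return {x1, y1};  // (column, row) of the surviving cell
    }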
*/ + int A_num_rows_dev_, A_num_cols_dev_, A_nnz_dev_, B_num_rows_dev_, + B_num_cols_dev_, B_nnz_dev_, C_num_rows_dev_, C_num_cols_dev_, C_nnz_dev_; + T* A_val_dev_, B_val_dev_, C_val_dev_; + int* A_col_dev_, A_row_dev_, B_col_dev_, B_row_dev_, C_col_dev_, C_row_dev_; + /** The constant value Alpha. */ const T alpha = ALPHA; /** The constant value Beta. */ const T beta = BETA; + + + // Create descriptors for matrices A->C + cusparseMatDescr_t descrA_, descrB_, descrC_; + + // index type depends on kernel being run + cusparseIndexType_t cudaDataType_; + + cusparceSpGEMMDescr_t spgemmDesc_; + + size_t buffer_size1_ = 0; + size_t buffer_size2_ = 0; + void* buffer1_ = NULL; + void* buffer2_ = NULL; }; } // namespace gpu #endif \ No newline at end of file From 37ce8b4c32b7b04caae5a4dbc697b21086447c9f Mon Sep 17 00:00:00 2001 From: Alex Cockrean <84676155+ABenC377@users.noreply.github.com> Date: Thu, 21 Mar 2024 13:08:49 +0000 Subject: [PATCH 07/32] Now compiles --- DefaultGPU/sp_gemm.hh | 2 +- Makefile | 2 +- cuBLAS/sp_gemm.hh | 228 +++++++++++++++------------------ include/doGemm.hh | 7 +- include/kernels/GPU/sp_gemm.hh | 2 +- 5 files changed, 112 insertions(+), 129 deletions(-) diff --git a/DefaultGPU/sp_gemm.hh b/DefaultGPU/sp_gemm.hh index 92d157c..2a9f478 100644 --- a/DefaultGPU/sp_gemm.hh +++ b/DefaultGPU/sp_gemm.hh @@ -22,7 +22,7 @@ class sp_gemm_gpu : public sp_gemm { } /** Initialise the required data structures. */ - void initialise(gpuOffloadType offload, int m, int n, int k) override { + void initialise(gpuOffloadType offload, int n, float sparsity) override { // Default GPU implementation - do nothing. } diff --git a/Makefile b/Makefile index 5dd2fc5..bff0add 100644 --- a/Makefile +++ b/Makefile @@ -177,7 +177,7 @@ $(info $(TAB)$(TAB)Add `CXXFLAGS=-L/.../math_libs/lib64 -L $(info $(TAB)$(TAB)Add `CXXFLAGS=-I/.../math_libs/include -I/.../cuda/include` to make command) $(info $(TAB)$(TAB)Add `CXXFLAGS=-Wl,-rpath,/.../math_libs/lib64 -Wl,-rpath,/.../cuda/lib64` to make command) $(info ) -override CXXFLAGS += -lcublas -lcudart +override CXXFLAGS += -lcublas -lcudart -lcusparse endif HEADER_FILES += $(wildcard cuBLAS/*.hh) diff --git a/cuBLAS/sp_gemm.hh b/cuBLAS/sp_gemm.hh index 68e3b84..c0bfb8e 100644 --- a/cuBLAS/sp_gemm.hh +++ b/cuBLAS/sp_gemm.hh @@ -2,24 +2,27 @@ #ifdef GPU_CUBLAS #include "cusparse.h" +#include #include #include +#include +#include -#include "../include/kernels/GPU/gemm.hh" +#include "../include/kernels/GPU/sp_gemm.hh" #include "../include/utilities.hh" #include "common.hh" namespace gpu { /** A class for GEMM GPU BLAS kernels. */ template -class sp_gemm_gpu : public gemm { +class sp_gemm_gpu : public sp_gemm { public: - using gemm::gemm; - using gemm::n_; - using gemm::A_; - using gemm::B_; - using gemm::C_; - using gemm::offload_; + using sp_gemm::sp_gemm; + using sp_gemm::n_; + using sp_gemm::A_; + using sp_gemm::B_; + using sp_gemm::C_; + using sp_gemm::offload_; // ToDo -- just unified implemented so far. Fill in Always and Once later @@ -31,63 +34,50 @@ class sp_gemm_gpu : public gemm { * - Unified: Initialise data as unified memory; no data movement semantics * required */ void initialise(gpuOffloadType offload, int n, float sparsity) override { + std::cout << "Initialising" << std::endl; offload_ = offload; // Create a handle for cuSPARSE cusparseCreate(&handle_); + std::cout << "Handle created" << std::endl; - cudaDataType_ = (std::is_same_v) ? 
CUDA_R_32F : - CUDA_R_64F; + if (std::is_same_v) cudaDataType_ = CUDA_R_32F; + else if (std::is_same_v) cudaDataType_ = CUDA_R_64F; + else { + std::cout << "INVALID DATA TYPE PASSED TO cuSPARSE" << std::endl; + exit(1); + } n_ = n; - cusparseCreateMatDescr(&descrA); - cusparseCreateMatDescr(&descrB); - cusparseCreateMatDescr(&descrC); - - cusparseSetMatType(descrA, CUSPARSE_MATRIX_TYPE_GENERAL); - cusparseSetMatType(descrB, CUSPARSE_MATRIX_TYPE_GENERAL); - cusparseSetMatType(descrC, CUSPARSE_MATRIX_TYPE_GENERAL); - - cusparseSetMatIndexBase(descrA, CUSPARSE_INDEX_BASE_ZERO); - cusparseSetMatIndexBase(descrB, CUSPARSE_INDEX_BASE_ZERO); - cusparseSetMatIndexBase(descrC, CUSPARSE_INDEX_BASE_ZERO); - // Get device identifier cudaCheckError(cudaGetDevice(&gpuDevice_)); + std::cout << "GPU device got" << std::endl; // Initialise 3 streams to asynchronously move data between host and device cudaCheckError(cudaStreamCreate(&s1_)); cudaCheckError(cudaStreamCreate(&s2_)); cudaCheckError(cudaStreamCreate(&s3_)); + std::cout << "Streams created" << std::endl; // Work out number of edges needed to achieve target sparsity int edges = 1 + (int) (n_ * n_ * (1 - sparsity)); - A_nnz_ = B_nnz_ = edges + (*A_nnz_) = (*B_nnz_) = edges; // ToDo -- for all of this mallocing, bear in mind that row will probably // have fewer than 'edges' values (thats the whole point). May need to // reorganise - cudaCheckError(cudaMallocManaged(A_num_rows_, sizeof(int))); - cudaCheckError(cudaMallocManaged(A_num_cols_, sizeof(int))); - cudaCheckError(cudaMallocManaged(A_nnz_, sizeof(int))); cudaCheckError(cudaMallocManaged(&A_val_, sizeof(T) * edges)); cudaCheckError(cudaMallocManaged(&A_col_, sizeof(int) * edges)); cudaCheckError(cudaMallocManaged(&A_row_, sizeof(int) * (n_ + 1))); + std::cout << "A CSR vectors malloced" << std::endl; - cudaCheckError(cudaMallocManaged(B_num_rows_, sizeof(int))); - cudaCheckError(cudaMallocManaged(B_num_cols_, sizeof(int))); - cudaCheckError(cudaMallocManaged(B_nnz_, sizeof(int))); cudaCheckError(cudaMallocManaged(&B_val_, sizeof(T) * edges)); cudaCheckError(cudaMallocManaged(&B_col_, sizeof(int) * edges)); cudaCheckError(cudaMallocManaged(&B_row_, sizeof(int) * (n_ + 1))); - - C_val_ = NULL; - C_col_ = NULL; - C_row_ = NULL; - + std::cout << "B CSR vectors malloced" << std::endl; // Initialise the host matricies // cusparseSpGEMM() works on CSR format only. This helpfully makes our @@ -99,6 +89,13 @@ class sp_gemm_gpu : public gemm { A_[i] = 0.0; B_[i] = 0.0; } + + // Random number generator objects for use in descent + std::default_random_engine gen; + gen.seed(std::chrono::system_clock::now() + .time_since_epoch().count()); + std::uniform_real_distribution dist(0.0, 1.0); + // Using a=0.45 and b=c=0.22 as default probabilities for (int i = 0; i < edges; i++) { while (!rMat(A_, n, 0, n - 1, 0, n - 1, @@ -117,34 +114,20 @@ class sp_gemm_gpu : public gemm { private: - - /** Perform any required steps before calling the GEMM kernel that should * be timed. 
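Two small points about the initialisation above. First, the nonzero budget follows directly from the target sparsity: for n = 128 and sparsity = 0.99, edges = 1 + (int)(128 * 128 * (1 - 0.99)) = 1 + 163 = 164, and both A and B are given that many nonzeros, so their CSR value and column-index arrays hold 164 entries each while the row-pointer arrays always hold n + 1 entries. Second, the element-type dispatch appears here without its template arguments; the presumed form is the standard std::is_same_v check mapping the kernel's element type T to the matching cuSPARSE value type:

    // Presumed template arguments for the dispatch above (sketch only).
    if (std::is_same_v<T, float>)       cudaDataType_ = CUDA_R_32F;
    else if (std::is_same_v<T, double>) cudaDataType_ = CUDA_R_64F;
    else {
      std::cout << "INVALID DATA TYPE PASSED TO cuSPARSE" << std::endl;
      exit(1);
    }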
*/ void preLoopRequirements() override { // Prefetch memory to device - cudaCheckError(cudaMemPrefetchAsync(A_num_rows_, sizeof(int), gpuDevice_, - s1_)); - cudaCheckError(cudaMemPrefetchAsync(A_num_cols_, sizeof(int), gpuDevice_, - s1_)); - cudaCheckError(cudaMemPrefetchAsync(A_nnz_, sizeof(int), gpuDevice_, - s1_)); - cudaCheckError(cudaMemPrefetchAsync(&A_val_, sizeof(T) * edges, gpuDevice_, - s1_)); - cudaCheckError(cudaMemPrefetchAsync(&A_col_, sizeof(int) * edges, + cudaCheckError(cudaMemPrefetchAsync(&A_val_, sizeof(T) * (*A_nnz_), + gpuDevice_, s1_)); + cudaCheckError(cudaMemPrefetchAsync(&A_col_, sizeof(int) * (*A_nnz_), gpuDevice_, s1_)); cudaCheckError(cudaMemPrefetchAsync(&A_row_, sizeof(int) * (n_ + 1), gpuDevice_, s1_)); - cudaCheckError(cudaMemPrefetchAsync(B_num_rows_, sizeof(int), gpuDevice_, - s2_)); - cudaCheckError(cudaMemPrefetchAsync(B_num_cols_, sizeof(int), gpuDevice_, - s2_)); - cudaCheckError(cudaMemPrefetchAsync(B_nnz_, sizeof(int), gpuDevice_, - s2_)); - cudaCheckError(cudaMemPrefetchAsync(&B_val_, sizeof(T) * edges, gpuDevice_, - s2_)); - cudaCheckError(cudaMemPrefetchAsync(&B_col_, sizeof(int) * edges, + cudaCheckError(cudaMemPrefetchAsync(&B_val_, sizeof(T) * (*B_nnz_), + gpuDevice_, s2_)); + cudaCheckError(cudaMemPrefetchAsync(&B_col_, sizeof(int) * (*B_nnz_), gpuDevice_, s2_)); cudaCheckError(cudaMemPrefetchAsync(&B_row_, sizeof(int) * (n_ + 1), gpuDevice_, s2_)); @@ -163,13 +146,13 @@ class sp_gemm_gpu : public gemm { // gpuDevice_, s3_)); // Create the CSR matrices on the device - cusparseCreateCsr(descrA_, n_, n_, A_nnz_, A_row_, A_col_, A_val_, + cusparseCreateCsr(&descrA_, n_, n_, (*A_nnz_), A_row_, A_col_, A_val_, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, - CUSPARSE_INDEX_BASE_ZERO, cudaDateType_); - cusparseCreateCsr(descrB_, n_, n_, B_nnz_, B_row_, B_col_, B_val_, + CUSPARSE_INDEX_BASE_ZERO, cudaDataType_); + cusparseCreateCsr(&descrB_, n_, n_, (*B_nnz_), B_row_, B_col_, B_val_, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, - CUSPARSE_INDEX_BASE_ZERO, cudaDateType_); - cusparseCreateCsr(descrC_, n_, n_, 0, NULL, NULL, NULL, + CUSPARSE_INDEX_BASE_ZERO, cudaDataType_); + cusparseCreateCsr(&descrC_, n_, n_, 0, NULL, NULL, NULL, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cudaDataType_); @@ -181,38 +164,40 @@ class sp_gemm_gpu : public gemm { cusparseSpGEMM_workEstimation(handle_, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, descrA_, descrB_, &beta, descrC_, - CUSPARSE_SPGEMM_DEFAULT, cudaDataType_, - spgemmDesc_, buffer_size1_, NULL); + cudaDataType_, CUSPARSE_SPGEMM_DEFAULT, + spgemmDesc_, &buffer_size1_, NULL); cudaCheckError(cudaMallocManaged(&buffer1_, buffer_size1_)); cusparseSpGEMM_workEstimation(handle_, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, descrA_, descrB_, &beta, descrC_, - CUSPARSE_SPGEMM_DEFAULT, cudaDataType_, - spgemmDesc_, buffer_size1_, buffer1_); - cusparseSpGEMM_cmopute(handle_, CUSPARSE_OPERATION_NON_TRANSPOSE, + cudaDataType_, CUSPARSE_SPGEMM_DEFAULT, + spgemmDesc_, &buffer_size1_, buffer1_); + cusparseSpGEMM_compute(handle_, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, descrA_, - descrB_, &beta, descrC_, CUSPARSE_SPGEMM_DEFAULT, - cudaDataType_, spgemmDesc_, buffer_size2_, NULL); - cudaCheckError(cudaMallocManaged(&buffer2_, buffer_size2)); + descrB_, &beta, descrC_, cudaDataType_, + CUSPARSE_SPGEMM_DEFAULT, spgemmDesc_, + &buffer_size2_, NULL); + cudaCheckError(cudaMallocManaged(&buffer2_, buffer_size2_)); - if 
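For reference, the corrected cusparseCreateCsr calls above take the address of the descriptor, the matrix shape (rows, columns, nnz), the three CSR arrays, the index type of the row offsets, the index type of the column indices, the index base, and the value type. A hedged single-precision helper with hypothetical names:

    #include <cusparse.h>

    // Wrap an existing device-resident CSR matrix (32-bit, zero-based indices)
    // in a cuSPARSE generic-API descriptor.
    cusparseSpMatDescr_t makeCsrDescr(int64_t n, int64_t nnz,
                                      int* dRowOffsets, int* dColInd, float* dVals) {
      cusparseSpMatDescr_t descr;
      cusparseCreateCsr(&descr, n, n, nnz,
                        dRowOffsets,               // length n + 1
                        dColInd,                   // length nnz
                        dVals,                     // length nnz
                        CUSPARSE_INDEX_32I,        // row-offset index type
                        CUSPARSE_INDEX_32I,        // column-index type
                        CUSPARSE_INDEX_BASE_ZERO,
                        CUDA_R_32F);
      return descr;
    }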
(cusparseSpGEMM_cmopute(handle_, CUSPARSE_OPERATION_NON_TRANSPOSE, + if (cusparseSpGEMM_compute(handle_, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, descrA_, - descrB_, &beta, descrC_, CUSPARSE_SPGEMM_DEFAULT, - cudaDataType_, spgemmDesc_, buffer_size2_, buffer2_) - == CUSPARSE_SATUS_INSUFFICIENT_RESOURCES) { + descrB_, &beta, descrC_, cudaDataType_, + CUSPARSE_SPGEMM_DEFAULT, spgemmDesc_, + &buffer_size2_, buffer2_) + == CUSPARSE_STATUS_INSUFFICIENT_RESOURCES) { std::cout << "Insufficient resources" << std::endl; exit(1); } - int rows, cols, nnz; + int64_t rows, cols, nnz; - cusparseSpMatGetSize(descrC_, &rows, &cols, &nnz_); - C_nnz_ = nnz; - cudaCheckError(cudaMallocManaged(C_val_), sizeof(T) * nnz); - cudaCheckError(cudaMallocManaged(C_col_), sizeof(int) * nnz); - cudaCheckError(cudaMallocManaged(C_row_), sizeof(int) * (n_ + 1)); + cusparseSpMatGetSize(descrC_, &rows, &cols, &nnz); + (*C_nnz_) = nnz; + cudaCheckError(cudaMallocManaged(&C_val_, sizeof(T) * nnz)); + cudaCheckError(cudaMallocManaged(&C_col_, sizeof(int) * nnz)); + cudaCheckError(cudaMallocManaged(&C_row_, sizeof(int) * (n_ + 1))); - cusparseCstSetPointers(descrC_, *C_row, *C_colind, *C_val); + cusparseCsrSetPointers(descrC_, C_row_, C_col_, C_val_); cusparseSpGEMM_copy(handle_, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, descrA_, descrB_, &beta, descrC_, CUDA_R_32F, @@ -223,44 +208,26 @@ class sp_gemm_gpu : public gemm { * be timed. */ void postLoopRequirements() override { // Ensure all data resides on host once work has completed - cudaCheckError(cudaMemPrefetchAsync(A_num_rows_, sizeof(int), - cudaCpuDeviceId_, s1_)); - cudaCheckError(cudaMemPrefetchAsync(A_num_cols_, sizeof(int), - cudaCpuDeviceId_, s1_)); - cudaCheckError(cudaMemPrefetchAsync(A_nnz_, sizeof(int), - cudaCpuDeviceId_, s1_)); - cudaCheckError(cudaMemPrefetchAsync(&A_val_, sizeof(T) * edges, - cudaCpuDeviceId_, s1_)); - cudaCheckError(cudaMemPrefetchAsync(&A_col_, sizeof(int) * edges, - cudaCpuDeviceId_, s1_)); + cudaCheckError(cudaMemPrefetchAsync(&A_val_, sizeof(T) * (*A_nnz_), + cudaCpuDeviceId, s1_)); + cudaCheckError(cudaMemPrefetchAsync(&A_col_, sizeof(int) * (*A_nnz_), + cudaCpuDeviceId, s1_)); cudaCheckError(cudaMemPrefetchAsync(&A_row_, sizeof(int) * (n_ + 1), - cudaCpuDeviceId_, s1_)); - - cudaCheckError(cudaMemPrefetchAsync(B_num_rows_, sizeof(int), - cudaCpuDeviceId_, s2_)); - cudaCheckError(cudaMemPrefetchAsync(B_num_cols_, sizeof(int), - cudaCpuDeviceId_, s2_)); - cudaCheckError(cudaMemPrefetchAsync(B_nnz_, sizeof(int), - cudaCpuDeviceId_, s2_)); - cudaCheckError(cudaMemPrefetchAsync(&B_val_, sizeof(T) * edges, - cudaCpuDeviceId_, s2_)); - cudaCheckError(cudaMemPrefetchAsync(&B_col_, sizeof(int) * edges, - cudaCpuDeviceId_, s2_)); + cudaCpuDeviceId, s1_)); + + cudaCheckError(cudaMemPrefetchAsync(&B_val_, sizeof(T) * (*B_nnz_), + cudaCpuDeviceId, s2_)); + cudaCheckError(cudaMemPrefetchAsync(&B_col_, sizeof(int) * (*B_nnz_), + cudaCpuDeviceId, s2_)); cudaCheckError(cudaMemPrefetchAsync(&B_row_, sizeof(int) * (n_ + 1), - cudaCpuDeviceId_, s2_)); - - cudaCheckError(cudaMemPrefetchAsync(C_num_rows_, sizeof(int), - cudaCpuDeviceId_, s3_)); - cudaCheckError(cudaMemPrefetchAsync(C_num_cols_, sizeof(int), - cudaCpuDeviceId_, s3_)); - cudaCheckError(cudaMemPrefetchAsync(C_nnz_, sizeof(int), - cudaCpuDeviceId_, s3_)); - cudaCheckError(cudaMemPrefetchAsync(&C_val_, sizeof(T) * C_nnz_, - cudaCpuDeviceId_, s3_)); - cudaCheckError(cudaMemPrefetchAsync(&C_col_, sizeof(int) * C_nnz_, - 
cudaCpuDeviceId_, s3_)); + cudaCpuDeviceId, s2_)); + + cudaCheckError(cudaMemPrefetchAsync(&C_val_, sizeof(T) * (*C_nnz_), + cudaCpuDeviceId, s3_)); + cudaCheckError(cudaMemPrefetchAsync(&C_col_, sizeof(int) * (*C_nnz_), + cudaCpuDeviceId, s3_)); cudaCheckError(cudaMemPrefetchAsync(&C_row_, sizeof(int) * (n_ + 1), - cudaCpuDeviceId_, s3_)); + cudaCpuDeviceId, s3_)); // Ensure device has finished all work. cudaCheckError(cudaDeviceSynchronize()); } @@ -348,7 +315,7 @@ class sp_gemm_gpu : public gemm { } /** Handle used when calling cuBLAS. */ - cublasHandle_t handle_; + cusparseHandle_t handle_; /** CUDA Stream 1 - used to asynchronously move data between host and device. */ @@ -366,12 +333,29 @@ class sp_gemm_gpu : public gemm { int gpuDevice_; /** CSR format vectors for matrices A, B and C on the host */ - int A_nnz_, B_nnz_, C_nnz_; - T* A_val_, B_val_, C_val_; - int* A_col_, A_row_, B_col_, B_row_, C_col_, C_row_; + T* A_val_; + int* A_col_; + int* A_row_; + int* A_num_rows_; + int* A_num_cols_; + int* A_nnz_; + + T* B_val_; + int* B_col_; + int* B_row_; + int* B_num_rows_; + int* B_num_cols_; + int* B_nnz_; + + T* C_val_; + int* C_col_; + int* C_row_; + int* C_num_rows_; + int* C_num_cols_; + int*C_nnz_; /** CSR format vectors for matrices A, B and C on the device. */ - int A_num_rows_dev_, A_num_cols_dev_, A_nnz_dev_, B_num_rows_dev_, + int* A_num_rows_dev_, A_num_cols_dev_, A_nnz_dev_, B_num_rows_dev_, B_num_cols_dev_, B_nnz_dev_, C_num_rows_dev_, C_num_cols_dev_, C_nnz_dev_; T* A_val_dev_, B_val_dev_, C_val_dev_; int* A_col_dev_, A_row_dev_, B_col_dev_, B_row_dev_, C_col_dev_, C_row_dev_; @@ -384,12 +368,12 @@ class sp_gemm_gpu : public gemm { // Create descriptors for matrices A->C - cusparseMatDescr_t descrA_, descrB_, descrC_; + cusparseSpMatDescr_t descrA_, descrB_, descrC_; - // index type depends on kernel being run - cusparseIndexType_t cudaDataType_; + // Data type depends on kernel being run + cudaDataType_t cudaDataType_; - cusparceSpGEMMDescr_t spgemmDesc_; + cusparseSpGEMMDescr_t spgemmDesc_; size_t buffer_size1_ = 0; size_t buffer_size2_ = 0; diff --git a/include/doGemm.hh b/include/doGemm.hh index 4a7c564..5565fb2 100644 --- a/include/doGemm.hh +++ b/include/doGemm.hh @@ -519,20 +519,19 @@ class doGemm { // Perform the GPU kernels // - ONCE : Offload to/from GPU once before all iterations and once // after - spGemmGpu_.initialise(gpuOffloadType::once, N, N, N); + spGemmGpu_.initialise(gpuOffloadType::once, N, sparsity); time_checksum_gflop gpuResult_once = gemmGpu_.compute(); gpuResult_once.gflops = calcGflops(flops, iterations_, gpuResult_once.runtime); // - ALWAYS: Offload to/from GPU every iteration - spGemmGpu_.initialise(gpuOffloadType::always, N, N, N); + spGemmGpu_.initialise(gpuOffloadType::always, N, sparsity); time_checksum_gflop gpuResult_always = gemmGpu_.compute(); gpuResult_always.gflops = calcGflops(flops, iterations_, gpuResult_always.runtime); - // - UNIFIED : data passed from host to device (and device to host) as // needed - spGemmGpu_.initialise(gpuOffloadType::unified, N, N, N); + spGemmGpu_.initialise(gpuOffloadType::unified, N, sparsity); time_checksum_gflop gpuResult_unified = gemmGpu_.compute(); gpuResult_unified.gflops = calcGflops(flops, iterations_, gpuResult_unified.runtime); diff --git a/include/kernels/GPU/sp_gemm.hh b/include/kernels/GPU/sp_gemm.hh index 684c166..dbfba87 100644 --- a/include/kernels/GPU/sp_gemm.hh +++ b/include/kernels/GPU/sp_gemm.hh @@ -17,7 +17,7 @@ namespace gpu { * - Always: Move data from host to device and device 
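One C++ detail behind the reshuffled member declarations in this hunk: in a comma-separated declaration the * binds to each declarator rather than to the type, so splitting the host-side members onto one declaration per line is not just cosmetic, while the combined device-side lines that remain declare only their first name as a pointer. For example:

    int* a, b;     // a has type int*, b has type int
    int *c, *d;    // c and d are both int*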
to host each iteration * - Unified: Initialise data as unified memory; no data movement semantics * required */ - virtual void initialise(gpuOffloadType offload, int m, int n, int k) = 0; + virtual void initialise(gpuOffloadType offload, int n, float sparsity) = 0; protected: /** Whether data should be offloaded to/from the GPU each iteration, or just From 143c1c041d7da2afda07b27c5c3dbb8b273fab1c Mon Sep 17 00:00:00 2001 From: Alex Cockrean <84676155+ABenC377@users.noreply.github.com> Date: Mon, 25 Mar 2024 10:11:51 +0000 Subject: [PATCH 08/32] Now compiles with fewer runtime errors --- cuBLAS/sp_gemm.hh | 352 +++++++++++++++++++++++++++------------------- include/doGemm.hh | 42 +++--- 2 files changed, 227 insertions(+), 167 deletions(-) diff --git a/cuBLAS/sp_gemm.hh b/cuBLAS/sp_gemm.hh index c0bfb8e..fa0e39d 100644 --- a/cuBLAS/sp_gemm.hh +++ b/cuBLAS/sp_gemm.hh @@ -37,12 +37,12 @@ class sp_gemm_gpu : public sp_gemm { std::cout << "Initialising" << std::endl; offload_ = offload; - // Create a handle for cuSPARSE + // Create a handle for cuSPARSE cusparseCreate(&handle_); std::cout << "Handle created" << std::endl; - if (std::is_same_v) cudaDataType_ = CUDA_R_32F; + if (std::is_same_v) cudaDataType_ = CUDA_R_32F; else if (std::is_same_v) cudaDataType_ = CUDA_R_64F; else { std::cout << "INVALID DATA TYPE PASSED TO cuSPARSE" << std::endl; @@ -60,24 +60,38 @@ class sp_gemm_gpu : public sp_gemm { cudaCheckError(cudaStreamCreate(&s3_)); std::cout << "Streams created" << std::endl; + if (offload_ == gpuOffloadType::unified) { + std::cout << "Into unified if statement" << std::endl; + A_num_rows_ = (int*)malloc(sizeof(int)); + A_num_cols_ = (int*)malloc(sizeof(int)); + A_nnz_ = (int*)malloc(sizeof(int)); + B_num_rows_ = (int*)malloc(sizeof(int)); + B_num_cols_ = (int*)malloc(sizeof(int)); + B_nnz_ = (int*)malloc(sizeof(int)); + C_num_rows_ = (int*)malloc(sizeof(int)); + C_num_cols_ = (int*)malloc(sizeof(int)); + C_nnz_ = (int*)malloc(sizeof(int)); + } - // Work out number of edges needed to achieve target sparsity - int edges = 1 + (int) (n_ * n_ * (1 - sparsity)); - (*A_nnz_) = (*B_nnz_) = edges; - // ToDo -- for all of this mallocing, bear in mind that row will probably - // have fewer than 'edges' values (thats the whole point). 
May need to - // reorganise + // Work out number of edges needed to achieve target sparsity + int edges = 1 + (int) (n_ * n_ * (1 - sparsity)); + (*A_nnz_) = (*B_nnz_) = edges; - cudaCheckError(cudaMallocManaged(&A_val_, sizeof(T) * edges)); - cudaCheckError(cudaMallocManaged(&A_col_, sizeof(int) * edges)); - cudaCheckError(cudaMallocManaged(&A_row_, sizeof(int) * (n_ + 1))); - std::cout << "A CSR vectors malloced" << std::endl; + if (offload_ == gpuOffloadType::unified) { + std::cout << "beginning mallocs" << std::endl; + cudaCheckError(cudaMallocManaged(&A_val_, sizeof(T) * (*A_nnz_))); + std::cout << "A vals vectors malloced" << std::endl; + cudaCheckError(cudaMallocManaged(&A_col_, sizeof(int) * (*A_nnz_))); + std::cout << "A cols vectors malloced" << std::endl; + cudaCheckError(cudaMallocManaged(&A_row_, sizeof(int) * (n_ + 1))); + std::cout << "A CSR vectors malloced" << std::endl; - cudaCheckError(cudaMallocManaged(&B_val_, sizeof(T) * edges)); - cudaCheckError(cudaMallocManaged(&B_col_, sizeof(int) * edges)); - cudaCheckError(cudaMallocManaged(&B_row_, sizeof(int) * (n_ + 1))); - std::cout << "B CSR vectors malloced" << std::endl; + cudaCheckError(cudaMallocManaged(&B_val_, sizeof(T) * (*B_nnz_))); + cudaCheckError(cudaMallocManaged(&B_col_, sizeof(int) * (*B_nnz_))); + cudaCheckError(cudaMallocManaged(&B_row_, sizeof(int) * (n_ + 1))); + std::cout << "B CSR vectors malloced" << std::endl; + } // Initialise the host matricies // cusparseSpGEMM() works on CSR format only. This helpfully makes our @@ -85,10 +99,12 @@ class sp_gemm_gpu : public sp_gemm { // Initialise the matrices // Set initial values to 0 - for (int i = 0; i < (n_ * n_); i++) { - A_[i] = 0.0; - B_[i] = 0.0; - } + A_ = (T*)malloc(sizeof(T) * n_ * n_); + B_ = (T*)malloc(sizeof(T) * n_ * n_); + for (int i = 0; i < (n_ * n_); i++) { + A_[i] = 0.0; + B_[i] = 0.0; + } // Random number generator objects for use in descent std::default_random_engine gen; @@ -96,19 +112,20 @@ class sp_gemm_gpu : public sp_gemm { .time_since_epoch().count()); std::uniform_real_distribution dist(0.0, 1.0); - // Using a=0.45 and b=c=0.22 as default probabilities - for (int i = 0; i < edges; i++) { - while (!rMat(A_, n, 0, n - 1, 0, n - 1, - 0.45, 0.22, 0.22, - &gen, dist, false)) {} - while (!rMat(B_, n, 0, n - 1, 0, n - 1, - 0.45, 0.22, 0.22, - &gen, dist, false)) {} - } - - toCSR(A_, n, n, edges, A_val_, A_col_, A_row_); - toCSR(B_, n, n, edges, B_val_, B_col_, B_row_); + // Using a=0.45 and b=c=0.22 as default probabilities + for (int i = 0; i < (*A_nnz_); i++) { + while (!rMat(A_, n, 0, n - 1, 0, n - 1, + 0.45, 0.22, 0.22, + &gen, dist, false)) {} + } + for (int i = 0; i < (*B_nnz_); i++) { + while (!rMat(B_, n, 0, n - 1, 0, n - 1, + 0.45, 0.22, 0.22, + &gen, dist, false)) {} + } + toCSR(A_, n, n, (*A_nnz_), A_val_, A_col_, A_row_); + toCSR(B_, n, n, (*B_nnz_), B_val_, B_col_, B_row_); } @@ -117,135 +134,178 @@ class sp_gemm_gpu : public sp_gemm { /** Perform any required steps before calling the GEMM kernel that should * be timed. 
*/ void preLoopRequirements() override { - // Prefetch memory to device - cudaCheckError(cudaMemPrefetchAsync(&A_val_, sizeof(T) * (*A_nnz_), - gpuDevice_, s1_)); - cudaCheckError(cudaMemPrefetchAsync(&A_col_, sizeof(int) * (*A_nnz_), - gpuDevice_, s1_)); - cudaCheckError(cudaMemPrefetchAsync(&A_row_, sizeof(int) * (n_ + 1), - gpuDevice_, s1_)); - - cudaCheckError(cudaMemPrefetchAsync(&B_val_, sizeof(T) * (*B_nnz_), - gpuDevice_, s2_)); - cudaCheckError(cudaMemPrefetchAsync(&B_col_, sizeof(int) * (*B_nnz_), - gpuDevice_, s2_)); - cudaCheckError(cudaMemPrefetchAsync(&B_row_, sizeof(int) * (n_ + 1), - gpuDevice_, s2_)); -// -// cudaCheckError(cudaMemPrefetchAsync(C_num_rows_, sizeof(int), gpuDevice_, -// s3_)); -// cudaCheckError(cudaMemPrefetchAsync(C_num_cols_, sizeof(int), gpuDevice_, -// s3_)); -// cudaCheckError(cudaMemPrefetchAsync(C_nnz_, sizeof(int), gpuDevice_, -// s3_)); -// cudaCheckError(cudaMemPrefetchAsync(&C_val_, sizeof(T) * edges, gpuDevice_, -// s3_)); -// cudaCheckError(cudaMemPrefetchAsync(&C_col_, sizeof(int) * edges, -// gpuDevice_, s3_)); -// cudaCheckError(cudaMemPrefetchAsync(&C_row_, sizeof(int) * edges, -// gpuDevice_, s3_)); - - // Create the CSR matrices on the device - cusparseCreateCsr(&descrA_, n_, n_, (*A_nnz_), A_row_, A_col_, A_val_, - CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, - CUSPARSE_INDEX_BASE_ZERO, cudaDataType_); - cusparseCreateCsr(&descrB_, n_, n_, (*B_nnz_), B_row_, B_col_, B_val_, - CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, - CUSPARSE_INDEX_BASE_ZERO, cudaDataType_); - cusparseCreateCsr(&descrC_, n_, n_, 0, NULL, NULL, NULL, - CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, - CUSPARSE_INDEX_BASE_ZERO, cudaDataType_); - - cusparseSpGEMM_createDescr(&spgemmDesc_); + switch(offload_) { + case gpuOffloadType::always: { + break; + } + case gpuOffloadType::once: { + break; + } + case gpuOffloadType::unified: { + // Prefetch memory to device + cudaCheckError(cudaMemPrefetchAsync(A_val_, sizeof(T) * (*A_nnz_), + gpuDevice_, s1_)); + cudaCheckError(cudaMemPrefetchAsync(A_col_, sizeof(int) * (*A_nnz_), + gpuDevice_, s1_)); + cudaCheckError(cudaMemPrefetchAsync(A_row_, sizeof(int) * (n_ + 1), + gpuDevice_, s1_)); + + cudaCheckError(cudaMemPrefetchAsync(B_val_, sizeof(T) * (*B_nnz_), + gpuDevice_, s2_)); + cudaCheckError(cudaMemPrefetchAsync(B_col_, sizeof(int) * (*B_nnz_), + gpuDevice_, s2_)); + cudaCheckError(cudaMemPrefetchAsync(B_row_, sizeof(int) * (n_ + 1), + gpuDevice_, s2_)); + // + // cudaCheckError(cudaMemPrefetchAsync(C_num_rows_, sizeof(int), gpuDevice_, + // s3_)); + // cudaCheckError(cudaMemPrefetchAsync(C_num_cols_, sizeof(int), gpuDevice_, + // s3_)); + // cudaCheckError(cudaMemPrefetchAsync(C_nnz_, sizeof(int), gpuDevice_, + // s3_)); + // cudaCheckError(cudaMemPrefetchAsync(&C_val_, sizeof(T) * edges, gpuDevice_, + // s3_)); + // cudaCheckError(cudaMemPrefetchAsync(&C_col_, sizeof(int) * edges, + // gpuDevice_, s3_)); + // cudaCheckError(cudaMemPrefetchAsync(&C_row_, sizeof(int) * edges, + // gpuDevice_, s3_)); + + // Create the CSR matrices on the device + cusparseCreateCsr(&descrA_, n_, n_, (*A_nnz_), A_row_, A_col_, A_val_, + CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, + CUSPARSE_INDEX_BASE_ZERO, cudaDataType_); + cusparseCreateCsr(&descrB_, n_, n_, (*B_nnz_), B_row_, B_col_, B_val_, + CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, + CUSPARSE_INDEX_BASE_ZERO, cudaDataType_); + cusparseCreateCsr(&descrC_, n_, n_, 0, NULL, NULL, NULL, + CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, + CUSPARSE_INDEX_BASE_ZERO, cudaDataType_); + + 
cusparseSpGEMM_createDescr(&spgemmDesc_); + break; + } + } } /** Make a call to the BLAS Library Kernel. */ void callGemm() override { - cusparseSpGEMM_workEstimation(handle_, CUSPARSE_OPERATION_NON_TRANSPOSE, - CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, - descrA_, descrB_, &beta, descrC_, - cudaDataType_, CUSPARSE_SPGEMM_DEFAULT, - spgemmDesc_, &buffer_size1_, NULL); - cudaCheckError(cudaMallocManaged(&buffer1_, buffer_size1_)); - cusparseSpGEMM_workEstimation(handle_, CUSPARSE_OPERATION_NON_TRANSPOSE, - CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, - descrA_, descrB_, &beta, descrC_, - cudaDataType_, CUSPARSE_SPGEMM_DEFAULT, - spgemmDesc_, &buffer_size1_, buffer1_); - cusparseSpGEMM_compute(handle_, CUSPARSE_OPERATION_NON_TRANSPOSE, - CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, descrA_, - descrB_, &beta, descrC_, cudaDataType_, - CUSPARSE_SPGEMM_DEFAULT, spgemmDesc_, - &buffer_size2_, NULL); - cudaCheckError(cudaMallocManaged(&buffer2_, buffer_size2_)); - - if (cusparseSpGEMM_compute(handle_, CUSPARSE_OPERATION_NON_TRANSPOSE, - CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, descrA_, - descrB_, &beta, descrC_, cudaDataType_, - CUSPARSE_SPGEMM_DEFAULT, spgemmDesc_, - &buffer_size2_, buffer2_) - == CUSPARSE_STATUS_INSUFFICIENT_RESOURCES) { - std::cout << "Insufficient resources" << std::endl; - exit(1); - } - - int64_t rows, cols, nnz; - - cusparseSpMatGetSize(descrC_, &rows, &cols, &nnz); - (*C_nnz_) = nnz; - cudaCheckError(cudaMallocManaged(&C_val_, sizeof(T) * nnz)); - cudaCheckError(cudaMallocManaged(&C_col_, sizeof(int) * nnz)); - cudaCheckError(cudaMallocManaged(&C_row_, sizeof(int) * (n_ + 1))); - - cusparseCsrSetPointers(descrC_, C_row_, C_col_, C_val_); - cusparseSpGEMM_copy(handle_, CUSPARSE_OPERATION_NON_TRANSPOSE, - CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, descrA_, - descrB_, &beta, descrC_, CUDA_R_32F, - CUSPARSE_SPGEMM_DEFAULT, spgemmDesc_); + switch(offload_) { + case gpuOffloadType::always: { + break; + } + case gpuOffloadType::once: { + break; + } + case gpuOffloadType::unified: { + cusparseSpGEMM_workEstimation(handle_, CUSPARSE_OPERATION_NON_TRANSPOSE, + CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, + descrA_, descrB_, &beta, descrC_, + cudaDataType_, CUSPARSE_SPGEMM_DEFAULT, + spgemmDesc_, &buffer_size1_, NULL); + cudaCheckError(cudaMallocManaged(&buffer1_, buffer_size1_)); + cusparseSpGEMM_workEstimation(handle_, CUSPARSE_OPERATION_NON_TRANSPOSE, + CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, + descrA_, descrB_, &beta, descrC_, + cudaDataType_, CUSPARSE_SPGEMM_DEFAULT, + spgemmDesc_, &buffer_size1_, buffer1_); + cusparseSpGEMM_compute(handle_, CUSPARSE_OPERATION_NON_TRANSPOSE, + CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, descrA_, + descrB_, &beta, descrC_, cudaDataType_, + CUSPARSE_SPGEMM_DEFAULT, spgemmDesc_, + &buffer_size2_, NULL); + cudaCheckError(cudaMallocManaged(&buffer2_, buffer_size2_)); + + if (cusparseSpGEMM_compute(handle_, CUSPARSE_OPERATION_NON_TRANSPOSE, + CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, descrA_, + descrB_, &beta, descrC_, cudaDataType_, + CUSPARSE_SPGEMM_DEFAULT, spgemmDesc_, + &buffer_size2_, buffer2_) + == CUSPARSE_STATUS_INSUFFICIENT_RESOURCES) { + std::cout << "Insufficient resources" << std::endl; + exit(1); + } + + int64_t rows, cols, nnz; + + cusparseSpMatGetSize(descrC_, &rows, &cols, &nnz); + (*C_nnz_) = nnz; + cudaCheckError(cudaMallocManaged(&C_val_, sizeof(T) * nnz)); + cudaCheckError(cudaMallocManaged(&C_col_, sizeof(int) * nnz)); + cudaCheckError(cudaMallocManaged(&C_row_, sizeof(int) * (n_ + 1))); + + cusparseCsrSetPointers(descrC_, C_row_, C_col_, 
C_val_); + cusparseSpGEMM_copy(handle_, CUSPARSE_OPERATION_NON_TRANSPOSE, + CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, descrA_, + descrB_, &beta, descrC_, CUDA_R_32F, + CUSPARSE_SPGEMM_DEFAULT, spgemmDesc_); + break; + } + } } /** Perform any required steps after calling the GEMM kernel that should * be timed. */ void postLoopRequirements() override { - // Ensure all data resides on host once work has completed - cudaCheckError(cudaMemPrefetchAsync(&A_val_, sizeof(T) * (*A_nnz_), - cudaCpuDeviceId, s1_)); - cudaCheckError(cudaMemPrefetchAsync(&A_col_, sizeof(int) * (*A_nnz_), - cudaCpuDeviceId, s1_)); - cudaCheckError(cudaMemPrefetchAsync(&A_row_, sizeof(int) * (n_ + 1), - cudaCpuDeviceId, s1_)); - - cudaCheckError(cudaMemPrefetchAsync(&B_val_, sizeof(T) * (*B_nnz_), - cudaCpuDeviceId, s2_)); - cudaCheckError(cudaMemPrefetchAsync(&B_col_, sizeof(int) * (*B_nnz_), - cudaCpuDeviceId, s2_)); - cudaCheckError(cudaMemPrefetchAsync(&B_row_, sizeof(int) * (n_ + 1), - cudaCpuDeviceId, s2_)); - - cudaCheckError(cudaMemPrefetchAsync(&C_val_, sizeof(T) * (*C_nnz_), - cudaCpuDeviceId, s3_)); - cudaCheckError(cudaMemPrefetchAsync(&C_col_, sizeof(int) * (*C_nnz_), - cudaCpuDeviceId, s3_)); - cudaCheckError(cudaMemPrefetchAsync(&C_row_, sizeof(int) * (n_ + 1), - cudaCpuDeviceId, s3_)); - // Ensure device has finished all work. - cudaCheckError(cudaDeviceSynchronize()); + switch(offload_) { + case gpuOffloadType::always: { + break; + } + case gpuOffloadType::once: { + break; + } + case gpuOffloadType::unified: { + // Ensure all data resides on host once work has completed + cudaCheckError(cudaMemPrefetchAsync(A_val_, sizeof(T) * (*A_nnz_), + cudaCpuDeviceId, s1_)); + cudaCheckError(cudaMemPrefetchAsync(A_col_, sizeof(int) * (*A_nnz_), + cudaCpuDeviceId, s1_)); + cudaCheckError(cudaMemPrefetchAsync(A_row_, sizeof(int) * (n_ + 1), + cudaCpuDeviceId, s1_)); + + cudaCheckError(cudaMemPrefetchAsync(B_val_, sizeof(T) * (*B_nnz_), + cudaCpuDeviceId, s2_)); + cudaCheckError(cudaMemPrefetchAsync(B_col_, sizeof(int) * (*B_nnz_), + cudaCpuDeviceId, s2_)); + cudaCheckError(cudaMemPrefetchAsync(B_row_, sizeof(int) * (n_ + 1), + cudaCpuDeviceId, s2_)); + + cudaCheckError(cudaMemPrefetchAsync(C_val_, sizeof(T) * (*C_nnz_), + cudaCpuDeviceId, s3_)); + cudaCheckError(cudaMemPrefetchAsync(C_col_, sizeof(int) * (*C_nnz_), + cudaCpuDeviceId, s3_)); + cudaCheckError(cudaMemPrefetchAsync(C_row_, sizeof(int) * (n_ + 1), + cudaCpuDeviceId, s3_)); + // Ensure device has finished all work. + cudaCheckError(cudaDeviceSynchronize()); + break; + } + } } /** Do any necessary cleanup (free pointers, close library handles, etc.) * after Kernel has been called. 
*/ void postCallKernelCleanup() override { - // Destroy the handle - cusparseDestroy(handle_); - - // Destroy streams after use - cudaCheckError(cudaStreamDestroy(s1_)); - cudaCheckError(cudaStreamDestroy(s2_)); - cudaCheckError(cudaStreamDestroy(s3_)); + if (offload_ == gpuOffloadType::unified) { + // Destroy the handle + cusparseDestroy(handle_); + + // Destroy streams after use + cudaCheckError(cudaStreamDestroy(s1_)); + cudaCheckError(cudaStreamDestroy(s2_)); + cudaCheckError(cudaStreamDestroy(s3_)); + } - cudaFree(A_); - cudaFree(B_); - cudaFree(C_); + if (offload_ == gpuOffloadType::unified) { + cudaFree(A_val_); + cudaFree(A_col_); + cudaFree(A_row_); + cudaFree(B_val_); + cudaFree(B_col_); + cudaFree(B_row_); + cudaFree(C_val_); + cudaFree(C_col_); + cudaFree(C_row_); + } } bool rMat(T* M, int n, int x1, int x2, int y1, int y2, diff --git a/include/doGemm.hh b/include/doGemm.hh index 5565fb2..0e4dcc0 100644 --- a/include/doGemm.hh +++ b/include/doGemm.hh @@ -516,23 +516,23 @@ class doGemm { time_checksum_gflop cpuResult = spGemmCpu_.compute(); cpuResult.gflops = calcGflops(flops, iterations_, cpuResult.runtime); - // Perform the GPU kernels - // - ONCE : Offload to/from GPU once before all iterations and once - // after - spGemmGpu_.initialise(gpuOffloadType::once, N, sparsity); - time_checksum_gflop gpuResult_once = gemmGpu_.compute(); - gpuResult_once.gflops = - calcGflops(flops, iterations_, gpuResult_once.runtime); - - // - ALWAYS: Offload to/from GPU every iteration - spGemmGpu_.initialise(gpuOffloadType::always, N, sparsity); - time_checksum_gflop gpuResult_always = gemmGpu_.compute(); - gpuResult_always.gflops = - calcGflops(flops, iterations_, gpuResult_always.runtime); - // - UNIFIED : data passed from host to device (and device to host) as - // needed +// // Perform the GPU kernels +// // - ONCE : Offload to/from GPU once before all iterations and once +// // after +// spGemmGpu_.initialise(gpuOffloadType::once, N, sparsity); +// time_checksum_gflop gpuResult_once = spGemmGpu_.compute(); +// gpuResult_once.gflops = +// calcGflops(flops, iterations_, gpuResult_once.runtime); +// +// // - ALWAYS: Offload to/from GPU every iteration +// spGemmGpu_.initialise(gpuOffloadType::always, N, sparsity); +// time_checksum_gflop gpuResult_always = spGemmGpu_.compute(); +// gpuResult_always.gflops = +// calcGflops(flops, iterations_, gpuResult_always.runtime); +// // - UNIFIED : data passed from host to device (and device to host) as +// // needed spGemmGpu_.initialise(gpuOffloadType::unified, N, sparsity); - time_checksum_gflop gpuResult_unified = gemmGpu_.compute(); + time_checksum_gflop gpuResult_unified = spGemmGpu_.compute(); gpuResult_unified.gflops = calcGflops(flops, iterations_, gpuResult_unified.runtime); @@ -541,11 +541,11 @@ class doGemm { // Write lines to CSV file writeLineToCsv(csvFile, "cpu", kernelName, N, N, N, probSize, iterations_, cpuResult.runtime, cpuResult.gflops); - writeLineToCsv(csvFile, "gpu_offloadOnce", kernelName, N, N, N, probSize, - iterations_, gpuResult_once.runtime, gpuResult_once.gflops); - writeLineToCsv(csvFile, "gpu_offloadAlways", kernelName, N, N, N, probSize, - iterations_, gpuResult_always.runtime, - gpuResult_always.gflops); +// writeLineToCsv(csvFile, "gpu_offloadOnce", kernelName, N, N, N, probSize, +// iterations_, gpuResult_once.runtime, gpuResult_once.gflops); +// writeLineToCsv(csvFile, "gpu_offloadAlways", kernelName, N, N, N, probSize, +// iterations_, gpuResult_always.runtime, +// gpuResult_always.gflops); writeLineToCsv(csvFile, 
"gpu_unified", kernelName, N, N, N, probSize, iterations_, gpuResult_unified.runtime, gpuResult_unified.gflops); From bcd7ae88a01ec199951162c3fdba2d41817edff9 Mon Sep 17 00:00:00 2001 From: Alex Cockrean <84676155+ABenC377@users.noreply.github.com> Date: Fri, 11 Oct 2024 15:23:02 +0100 Subject: [PATCH 09/32] rebasing --- cuBLAS/common.hh | 13 ++ cuBLAS/sp_gemm.hh | 576 ++++++++++++++++++++++++++++++++++------------ include/doGemm.hh | 34 +-- 3 files changed, 458 insertions(+), 165 deletions(-) diff --git a/cuBLAS/common.hh b/cuBLAS/common.hh index 78d0270..70d58fb 100644 --- a/cuBLAS/common.hh +++ b/cuBLAS/common.hh @@ -2,6 +2,9 @@ #if defined GPU_CUBLAS +#include "cusparse.h" + +/** Macro function to check if error occurred when calling cuBLAS. */ /** Macro function to check if error occurred when calling CUDA. */ #define cudaCheckError(f) \ do { \ @@ -22,4 +25,14 @@ } \ } while (false) +#define cusparseCheckError(f) \ + do { \ + cusparseStatus_t status = (f); \ + if (status != CUSPARSE_STATUS_SUCCESS) { \ + std::cout << "CUSPARSE error: " << __FILE__ << ":" << __LINE__ << ": " \ + << cusparseGetErrorString(status) << std::endl; \ + exit(1); \ + } \ + } while (false) \ + #endif \ No newline at end of file diff --git a/cuBLAS/sp_gemm.hh b/cuBLAS/sp_gemm.hh index fa0e39d..0879966 100644 --- a/cuBLAS/sp_gemm.hh +++ b/cuBLAS/sp_gemm.hh @@ -34,12 +34,9 @@ class sp_gemm_gpu : public sp_gemm { * - Unified: Initialise data as unified memory; no data movement semantics * required */ void initialise(gpuOffloadType offload, int n, float sparsity) override { - std::cout << "Initialising" << std::endl; - offload_ = offload; + std::cout << "_/_/_/_/ Initialising for problem size: " << n << std::endl; - // Create a handle for cuSPARSE - cusparseCreate(&handle_); - std::cout << "Handle created" << std::endl; + offload_ = offload; if (std::is_same_v) cudaDataType_ = CUDA_R_32F; @@ -52,45 +49,51 @@ class sp_gemm_gpu : public sp_gemm { // Get device identifier cudaCheckError(cudaGetDevice(&gpuDevice_)); - std::cout << "GPU device got" << std::endl; // Initialise 3 streams to asynchronously move data between host and device cudaCheckError(cudaStreamCreate(&s1_)); cudaCheckError(cudaStreamCreate(&s2_)); cudaCheckError(cudaStreamCreate(&s3_)); - std::cout << "Streams created" << std::endl; - if (offload_ == gpuOffloadType::unified) { - std::cout << "Into unified if statement" << std::endl; - A_num_rows_ = (int*)malloc(sizeof(int)); - A_num_cols_ = (int*)malloc(sizeof(int)); - A_nnz_ = (int*)malloc(sizeof(int)); - B_num_rows_ = (int*)malloc(sizeof(int)); - B_num_cols_ = (int*)malloc(sizeof(int)); - B_nnz_ = (int*)malloc(sizeof(int)); - C_num_rows_ = (int*)malloc(sizeof(int)); - C_num_cols_ = (int*)malloc(sizeof(int)); - C_nnz_ = (int*)malloc(sizeof(int)); - } // Work out number of edges needed to achieve target sparsity int edges = 1 + (int) (n_ * n_ * (1 - sparsity)); - (*A_nnz_) = (*B_nnz_) = edges; + A_nnz_ = B_nnz_ = edges; if (offload_ == gpuOffloadType::unified) { - std::cout << "beginning mallocs" << std::endl; - cudaCheckError(cudaMallocManaged(&A_val_, sizeof(T) * (*A_nnz_))); - std::cout << "A vals vectors malloced" << std::endl; - cudaCheckError(cudaMallocManaged(&A_col_, sizeof(int) * (*A_nnz_))); - std::cout << "A cols vectors malloced" << std::endl; + cudaCheckError(cudaMallocManaged(&A_val_, sizeof(T) * A_nnz_)); + cudaCheckError(cudaMallocManaged(&A_col_, sizeof(int) * A_nnz_)); cudaCheckError(cudaMallocManaged(&A_row_, sizeof(int) * (n_ + 1))); - std::cout << "A CSR vectors malloced" << 
std::endl; - cudaCheckError(cudaMallocManaged(&B_val_, sizeof(T) * (*B_nnz_))); - cudaCheckError(cudaMallocManaged(&B_col_, sizeof(int) * (*B_nnz_))); + cudaCheckError(cudaMallocManaged(&B_val_, sizeof(T) * B_nnz_)); + cudaCheckError(cudaMallocManaged(&B_col_, sizeof(int) * B_nnz_)); cudaCheckError(cudaMallocManaged(&B_row_, sizeof(int) * (n_ + 1))); - std::cout << "B CSR vectors malloced" << std::endl; + + cudaCheckError(cudaMallocManaged(&C_row_, sizeof(int) * (n_ + 1))); + C_val_ = NULL; + C_col_ = NULL; + } else { + A_val_ = (T*)malloc(sizeof(T) * A_nnz_); + A_col_ = (int*)malloc(sizeof(int) * A_nnz_); + A_row_ = (int*)malloc(sizeof(int) * (n_ + 1)); + + B_val_ = (T*)malloc(sizeof(T) * B_nnz_); + B_col_ = (int*)malloc(sizeof(int) * B_nnz_); + B_row_ = (int*)malloc(sizeof(int) * (n_ + 1)); + + C_row_ = (int*)malloc(sizeof(int) * (n_ + 1)); + + + cudaCheckError(cudaMalloc((void**)&A_val_dev_, sizeof(T) * A_nnz_)); + cudaCheckError(cudaMalloc((void**)&A_col_dev_, sizeof(int) * A_nnz_)); + cudaCheckError(cudaMalloc((void**)&A_row_dev_, sizeof(int) * (n_ + 1))); + + cudaCheckError(cudaMalloc((void**)&B_val_dev_, sizeof(T) * B_nnz_)); + cudaCheckError(cudaMalloc((void**)&B_col_dev_, sizeof(int) * B_nnz_)); + cudaCheckError(cudaMalloc((void**)&B_row_dev_, sizeof(int) * (n_ + 1))); + + cudaCheckError(cudaMalloc((void**)&C_row_dev_, sizeof(int) * (n_ + 1))); } // Initialise the host matricies @@ -113,75 +116,116 @@ class sp_gemm_gpu : public sp_gemm { std::uniform_real_distribution dist(0.0, 1.0); // Using a=0.45 and b=c=0.22 as default probabilities - for (int i = 0; i < (*A_nnz_); i++) { - while (!rMat(A_, n, 0, n - 1, 0, n - 1, + for (int i = 0; i < A_nnz_; i++) { + while (!rMat(A_, n_, 0, n_ - 1, 0, n_ - 1, 0.45, 0.22, 0.22, &gen, dist, false)) {} } - for (int i = 0; i < (*B_nnz_); i++) { - while (!rMat(B_, n, 0, n - 1, 0, n - 1, + for (int i = 0; i < B_nnz_; i++) { + while (!rMat(B_, n_, 0, n_ - 1, 0, n_ - 1, 0.45, 0.22, 0.22, &gen, dist, false)) {} } - toCSR(A_, n, n, (*A_nnz_), A_val_, A_col_, A_row_); - toCSR(B_, n, n, (*B_nnz_), B_val_, B_col_, B_row_); - } + toCSR(A_, n_, n_, A_nnz_, A_val_, A_col_, A_row_); + + toCSR(B_, n_, n_, B_nnz_, B_val_, B_col_, B_row_); + +// std::cout << "_____Matrix A_____" << std::endl; +// printDenseMatrix(A_, n_, n_); +// std::cout << std::endl << std::endl; +// printCSR(A_val_, A_col_, A_row_, A_nnz_, n_, n_); +// +// +// std::cout << "_____Matrix B_____" << std::endl; +// printDenseMatrix(B_, n_, n_); +// std::cout << std::endl << std::endl; +// printCSR(B_val_, B_col_, B_row_, B_nnz_, n_, n_); + // Create a handle for cuSPARSE + cusparseCheckError(cusparseCreate(&handle_)); + } private: /** Perform any required steps before calling the GEMM kernel that should * be timed. 
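The allocation above now splits by offload mode: unified runs place each CSR array in managed memory (a single allocation that the driver migrates on demand or via prefetch hints), while the always and once paths keep a pageable host copy plus an explicit device buffer that is filled with cudaMemcpyAsync; in both cases C's value and column arrays stay unallocated until the product's nnz is known. A minimal sketch of the two strategies for one value array, with hypothetical names:

    #include <cstdlib>
    #include <cuda_runtime.h>

    // One CSR value array under the two allocation strategies (sketch only).
    void allocateValues(bool unified, size_t nnz,
                        float** managed, float** host, float** device) {
      if (unified) {
        // Single managed allocation, addressable from host and device.
        cudaMallocManaged(managed, nnz * sizeof(float));
      } else {
        // Separate host staging buffer and device buffer; the data moves with
        // explicit cudaMemcpyAsync calls around the timed region.
        *host = (float*)std::malloc(nnz * sizeof(float));
        cudaMalloc(device, nnz * sizeof(float));
      }
    }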
*/ void preLoopRequirements() override { + std::cout << "\t\tPreLoop" << std::endl; + cusparseCheckError(cusparseSpGEMM_createDescr(&spgemmDesc_)); switch(offload_) { case gpuOffloadType::always: { + // Make matrix descriptors + cusparseCheckError( + cusparseCreateCsr(&descrA_, n_, n_, A_nnz_, A_row_dev_, + A_col_dev_, A_val_dev_, rType_, cType_, + indType_, cudaDataType_)); + cusparseCheckError( + cusparseCreateCsr(&descrB_, n_, n_, B_nnz_, B_row_dev_, + B_col_dev_, B_val_dev_, rType_, cType_, + indType_, cudaDataType_)); + cusparseCheckError( + cusparseCreateCsr(&descrC_, n_, n_, 0, C_row_dev_, NULL, NULL, + rType_, cType_, indType_, cudaDataType_)); break; } case gpuOffloadType::once: { + cudaCheckError(cudaMemcpyAsync(A_val_dev_, A_val_, sizeof(T) * + A_nnz_, cudaMemcpyHostToDevice, s1_)); + cudaCheckError(cudaMemcpyAsync(A_col_dev_, A_col_, sizeof(int) * + A_nnz_, cudaMemcpyHostToDevice, s1_)); + cudaCheckError(cudaMemcpyAsync(A_row_dev_, A_row_, sizeof(int) * (n_ + + 1), cudaMemcpyHostToDevice, s1_)); + + cudaCheckError(cudaMemcpyAsync(B_val_dev_, B_val_, sizeof(T) * + B_nnz_, cudaMemcpyHostToDevice, s1_)); + cudaCheckError(cudaMemcpyAsync(B_col_dev_, B_col_, sizeof(int) * + B_nnz_, cudaMemcpyHostToDevice, s1_)); + cudaCheckError(cudaMemcpyAsync(B_row_dev_, B_row_, sizeof(int) * (n_ + + 1), cudaMemcpyHostToDevice, s1_)); + + // Craete matrix descriptors + cusparseCheckError( + cusparseCreateCsr(&descrA_, n_, n_, A_nnz_, A_row_dev_, + A_col_dev_, A_val_dev_, rType_, cType_, + indType_, cudaDataType_)); + cusparseCheckError( + cusparseCreateCsr(&descrB_, n_, n_, B_nnz_, B_row_dev_, + B_col_dev_, B_val_dev_, rType_, cType_, + indType_, cudaDataType_)); + cusparseCheckError( + cusparseCreateCsr(&descrC_, n_, n_, 0, C_row_dev_, NULL, NULL, + rType_, cType_, indType_, cudaDataType_)); break; } case gpuOffloadType::unified: { // Prefetch memory to device - cudaCheckError(cudaMemPrefetchAsync(A_val_, sizeof(T) * (*A_nnz_), + cudaCheckError(cudaMemPrefetchAsync(A_val_, sizeof(T) * A_nnz_, gpuDevice_, s1_)); - cudaCheckError(cudaMemPrefetchAsync(A_col_, sizeof(int) * (*A_nnz_), + cudaCheckError(cudaMemPrefetchAsync(A_col_, sizeof(int) * A_nnz_, gpuDevice_, s1_)); cudaCheckError(cudaMemPrefetchAsync(A_row_, sizeof(int) * (n_ + 1), gpuDevice_, s1_)); - cudaCheckError(cudaMemPrefetchAsync(B_val_, sizeof(T) * (*B_nnz_), + cudaCheckError(cudaMemPrefetchAsync(B_val_, sizeof(T) * B_nnz_, gpuDevice_, s2_)); - cudaCheckError(cudaMemPrefetchAsync(B_col_, sizeof(int) * (*B_nnz_), + cudaCheckError(cudaMemPrefetchAsync(B_col_, sizeof(int) * B_nnz_, gpuDevice_, s2_)); cudaCheckError(cudaMemPrefetchAsync(B_row_, sizeof(int) * (n_ + 1), gpuDevice_, s2_)); - // - // cudaCheckError(cudaMemPrefetchAsync(C_num_rows_, sizeof(int), gpuDevice_, - // s3_)); - // cudaCheckError(cudaMemPrefetchAsync(C_num_cols_, sizeof(int), gpuDevice_, - // s3_)); - // cudaCheckError(cudaMemPrefetchAsync(C_nnz_, sizeof(int), gpuDevice_, - // s3_)); - // cudaCheckError(cudaMemPrefetchAsync(&C_val_, sizeof(T) * edges, gpuDevice_, - // s3_)); - // cudaCheckError(cudaMemPrefetchAsync(&C_col_, sizeof(int) * edges, - // gpuDevice_, s3_)); - // cudaCheckError(cudaMemPrefetchAsync(&C_row_, sizeof(int) * edges, - // gpuDevice_, s3_)); - - // Create the CSR matrices on the device - cusparseCreateCsr(&descrA_, n_, n_, (*A_nnz_), A_row_, A_col_, A_val_, - CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, - CUSPARSE_INDEX_BASE_ZERO, cudaDataType_); - cusparseCreateCsr(&descrB_, n_, n_, (*B_nnz_), B_row_, B_col_, B_val_, - CUSPARSE_INDEX_32I, 
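A caveat about the host-to-device staging used in the once and always paths: cudaMemcpyAsync only overlaps with host work when the host buffer is page-locked, and the CSR arrays here come from plain malloc, so these copies are staged by the driver and complete largely synchronously. That is fine for correctness; if overlap ever matters for the offload comparison, the usual alternative is a pinned allocation, sketched here with assumed names:

    #include <algorithm>
    #include <cuda_runtime.h>

    // Stage nnz floats to the device through a pinned host buffer so the
    // asynchronous copy can genuinely overlap with other host work.
    void stageThroughPinned(const float* src, float* dev, size_t nnz,
                            cudaStream_t stream) {
      float* pinned = nullptr;
      cudaMallocHost(&pinned, nnz * sizeof(float));   // page-locked host memory
      std::copy(src, src + nnz, pinned);
      cudaMemcpyAsync(dev, pinned, nnz * sizeof(float),
                      cudaMemcpyHostToDevice, stream);
      cudaStreamSynchronize(stream);                  // pinned buffer must outlive the copy
      cudaFreeHost(pinned);
    }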
CUSPARSE_INDEX_32I, - CUSPARSE_INDEX_BASE_ZERO, cudaDataType_); - cusparseCreateCsr(&descrC_, n_, n_, 0, NULL, NULL, NULL, - CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, - CUSPARSE_INDEX_BASE_ZERO, cudaDataType_); - - cusparseSpGEMM_createDescr(&spgemmDesc_); + + // Make matrix descriptors + cusparseCheckError( + cusparseCreateCsr(&descrA_, n_, n_, A_nnz_, A_row_, A_col_, + A_val_, rType_, cType_, indType_, + cudaDataType_)); + cusparseCheckError( + cusparseCreateCsr(&descrB_, n_, n_, B_nnz_, B_row_, B_col_, + B_val_, rType_, cType_, indType_, + cudaDataType_)); + cusparseCheckError( + cusparseCreateCsr(&descrC_, n_, n_, 0, C_row_, NULL, NULL, + rType_, cType_, indType_, cudaDataType_)); break; } } @@ -189,55 +233,208 @@ class sp_gemm_gpu : public sp_gemm { /** Make a call to the BLAS Library Kernel. */ void callGemm() override { + std::cout << "\t\tcallGemm" << std::endl; switch(offload_) { case gpuOffloadType::always: { + cudaCheckError(cudaMemcpyAsync(A_val_dev_, A_val_, sizeof(T) * + A_nnz_, cudaMemcpyHostToDevice, s1_)); + cudaCheckError(cudaMemcpyAsync(A_col_dev_, A_col_, sizeof(int) * + A_nnz_, cudaMemcpyHostToDevice, s1_)); + cudaCheckError(cudaMemcpyAsync(A_row_dev_, A_row_, sizeof(int) * (n_ + + 1), cudaMemcpyHostToDevice, s1_)); + + cudaCheckError(cudaMemcpyAsync(B_val_dev_, B_val_, sizeof(T) * + B_nnz_, cudaMemcpyHostToDevice, s1_)); + cudaCheckError(cudaMemcpyAsync(B_col_dev_, B_col_, sizeof(int) * + B_nnz_, cudaMemcpyHostToDevice, s1_)); + cudaCheckError(cudaMemcpyAsync(B_row_dev_, B_row_, sizeof(int) * (n_ + + 1), cudaMemcpyHostToDevice, s1_)); + + cusparseCheckError( + cusparseSpGEMM_copy(handle_, opA_, opB_, &alpha, descrA_, + descrB_, &beta, descrC_, cudaDataType_, + alg_, spgemmDesc_)); + + cusparseCheckError( + cusparseSpGEMM_workEstimation(handle_, opA_, opB_, &alpha, + descrA_, descrB_, &beta, + descrC_, cudaDataType_, alg_, + spgemmDesc_, &buffer_size1_, + NULL)); + cudaCheckError(cudaMalloc((void**)&buffer1_, buffer_size1_)); + cusparseCheckError( + cusparseSpGEMM_workEstimation(handle_, opA_, opB_, &alpha, + descrA_, descrB_, &beta, + descrC_, cudaDataType_, alg_, + spgemmDesc_, &buffer_size1_, + buffer1_)); + cusparseCheckError( + cusparseSpGEMM_compute(handle_, opA_, opB_, &alpha, descrA_, + descrB_, &beta, descrC_, cudaDataType_, + alg_, spgemmDesc_, &buffer_size2_, + NULL)); + cudaCheckError(cudaMalloc((void**)&buffer2_, buffer_size2_)); + + cusparseCheckError( + cusparseSpGEMM_compute(handle_, opA_, opB_, &alpha, descrA_, + descrB_, &beta, descrC_, + cudaDataType_, alg_, spgemmDesc_, + &buffer_size2_, buffer2_)); + + cusparseCheckError( + cusparseSpMatGetSize(descrC_, &C_num_rows_, &C_num_cols_, + &C_nnz_)); + + cusparseCheckError( + cusparseSpMatGetSize(descrC_, &C_num_rows_, &C_num_cols_, + &C_nnz_)); + + cudaCheckError(cudaMalloc(&C_val_dev_, sizeof(T) * C_nnz_)); + cudaCheckError(cudaMalloc(&C_col_dev_, sizeof(int) * C_nnz_)); + + cusparseCheckError( + cusparseCsrSetPointers(descrC_, C_row_dev_, C_col_dev_, + C_val_dev_)); + cusparseCheckError( + cusparseSpGEMM_copy(handle_, opA_, opB_, &alpha, descrA_, + descrB_, &beta, descrC_, cudaDataType_, + alg_, spgemmDesc_)); + + cudaCheckError(cudaMemcpyAsync(A_val_, A_val_dev_, sizeof(T) * + A_nnz_, cudaMemcpyDeviceToHost, s1_)); + cudaCheckError(cudaMemcpyAsync(A_col_, A_col_dev_, sizeof(int) * + A_nnz_, cudaMemcpyDeviceToHost, s1_)); + cudaCheckError(cudaMemcpyAsync(A_row_, A_row_dev_, sizeof(int) * + (n_ + 1), cudaMemcpyDeviceToHost, s1_)); + + cudaCheckError(cudaMemcpyAsync(B_val_, B_val_dev_, sizeof(T) * + 
B_nnz_, cudaMemcpyDeviceToHost, s2_)); + cudaCheckError(cudaMemcpyAsync(B_col_, B_col_dev_, sizeof(int) * + B_nnz_, cudaMemcpyDeviceToHost, s2_)); + cudaCheckError(cudaMemcpyAsync(B_row_, B_row_dev_, sizeof(int) * + (n_ + 1), cudaMemcpyDeviceToHost, s2_)); + + C_val_ = (T*)malloc(sizeof(T) * C_nnz_); + C_col_ = (int*)malloc(sizeof(int) * C_nnz_); + cudaCheckError(cudaMemcpyAsync(C_val_, C_val_dev_, sizeof(T) * + C_nnz_, cudaMemcpyDeviceToHost, s3_)); + cudaCheckError(cudaMemcpyAsync(C_col_, C_col_dev_, sizeof(int) * + C_nnz_, cudaMemcpyDeviceToHost, s3_)); + cudaCheckError(cudaMemcpyAsync(C_row_, C_row_dev_, sizeof(int) * + (n_ + 1), cudaMemcpyDeviceToHost, s3_)); + cudaCheckError(cudaDeviceSynchronize()); + + // Freeing memory + cudaCheckError(cudaFree(buffer1_)); + cudaCheckError(cudaFree(buffer2_)); + cudaCheckError(cudaFree(C_val_dev_)); + cudaCheckError(cudaFree(C_col_dev_)); + free(C_val_); + free(C_col_); break; } case gpuOffloadType::once: { + cusparseCheckError( + cusparseSpGEMM_copy(handle_, opA_, opB_, &alpha, descrA_, + descrB_, &beta, descrC_, cudaDataType_, + alg_, spgemmDesc_)); + + cusparseCheckError( + cusparseSpGEMM_workEstimation(handle_, opA_, opB_, &alpha, + descrA_, descrB_, &beta, + descrC_, cudaDataType_, alg_, + spgemmDesc_, &buffer_size1_, + NULL)); + cudaCheckError(cudaMalloc((void**)&buffer1_, buffer_size1_)); + cusparseCheckError( + cusparseSpGEMM_workEstimation(handle_, opA_, opB_, &alpha, + descrA_, descrB_, &beta, + descrC_, cudaDataType_, alg_, + spgemmDesc_, &buffer_size1_, + buffer1_)); + cusparseCheckError( + cusparseSpGEMM_compute(handle_, opA_, opB_, &alpha, descrA_, + descrB_, &beta, descrC_, cudaDataType_, + alg_, spgemmDesc_, &buffer_size2_, + NULL)); + cudaCheckError(cudaMalloc((void**)&buffer2_, buffer_size2_)); + + cusparseCheckError( + cusparseSpGEMM_compute(handle_, opA_, opB_, &alpha, descrA_, + descrB_, &beta, descrC_, cudaDataType_, + alg_, spgemmDesc_, &buffer_size2_, buffer2_)); + + cusparseCheckError( + cusparseSpMatGetSize(descrC_, &C_num_rows_, &C_num_cols_, + &C_nnz_)); + + cudaCheckError(cudaMalloc(&C_val_dev_, sizeof(T) * C_nnz_)); + cudaCheckError(cudaMalloc(&C_col_dev_, sizeof(int) * C_nnz_)); + + cusparseCheckError( + cusparseCsrSetPointers(descrC_, C_row_dev_, C_col_dev_, + C_val_dev_)); + cusparseCheckError( + cusparseSpGEMM_copy(handle_, opA_, opB_, &alpha, + descrA_, descrB_, &beta, descrC_, + cudaDataType_, alg_, spgemmDesc_)); + + // Freeing memory + cudaCheckError(cudaFree(buffer1_)); + cudaCheckError(cudaFree(buffer2_)); break; } case gpuOffloadType::unified: { - cusparseSpGEMM_workEstimation(handle_, CUSPARSE_OPERATION_NON_TRANSPOSE, - CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, - descrA_, descrB_, &beta, descrC_, - cudaDataType_, CUSPARSE_SPGEMM_DEFAULT, - spgemmDesc_, &buffer_size1_, NULL); - cudaCheckError(cudaMallocManaged(&buffer1_, buffer_size1_)); - cusparseSpGEMM_workEstimation(handle_, CUSPARSE_OPERATION_NON_TRANSPOSE, - CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, - descrA_, descrB_, &beta, descrC_, - cudaDataType_, CUSPARSE_SPGEMM_DEFAULT, - spgemmDesc_, &buffer_size1_, buffer1_); - cusparseSpGEMM_compute(handle_, CUSPARSE_OPERATION_NON_TRANSPOSE, - CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, descrA_, + cusparseCheckError( + cusparseSpGEMM_workEstimation(handle_, opA_, opB_, &alpha, + descrA_, descrB_, &beta, + descrC_, cudaDataType_, + alg_, spgemmDesc_, &buffer_size1_, + NULL)); + cudaCheckError(cudaMallocManaged((void**)&buffer1_, buffer_size1_)); + cusparseCheckError( + cusparseSpGEMM_workEstimation(handle_, opA_, 
opB_, &alpha, + descrA_, descrB_, &beta, + descrC_, cudaDataType_, + alg_, spgemmDesc_, &buffer_size1_, + buffer1_)); + cusparseCheckError( + cusparseSpGEMM_compute(handle_, opA_, opB_, &alpha, descrA_, + descrB_, &beta, descrC_, cudaDataType_, + alg_, spgemmDesc_, &buffer_size2_, + NULL)); + cudaCheckError(cudaMallocManaged((void**)&buffer2_, buffer_size2_)); + + cusparseCheckError( + cusparseSpGEMM_compute(handle_, opA_, opB_, &alpha, descrA_, descrB_, &beta, descrC_, cudaDataType_, - CUSPARSE_SPGEMM_DEFAULT, spgemmDesc_, - &buffer_size2_, NULL); - cudaCheckError(cudaMallocManaged(&buffer2_, buffer_size2_)); + alg_, spgemmDesc_, &buffer_size2_, buffer2_)); - if (cusparseSpGEMM_compute(handle_, CUSPARSE_OPERATION_NON_TRANSPOSE, - CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, descrA_, - descrB_, &beta, descrC_, cudaDataType_, - CUSPARSE_SPGEMM_DEFAULT, spgemmDesc_, - &buffer_size2_, buffer2_) - == CUSPARSE_STATUS_INSUFFICIENT_RESOURCES) { - std::cout << "Insufficient resources" << std::endl; - exit(1); - } - - int64_t rows, cols, nnz; - - cusparseSpMatGetSize(descrC_, &rows, &cols, &nnz); - (*C_nnz_) = nnz; - cudaCheckError(cudaMallocManaged(&C_val_, sizeof(T) * nnz)); - cudaCheckError(cudaMallocManaged(&C_col_, sizeof(int) * nnz)); - cudaCheckError(cudaMallocManaged(&C_row_, sizeof(int) * (n_ + 1))); - - cusparseCsrSetPointers(descrC_, C_row_, C_col_, C_val_); - cusparseSpGEMM_copy(handle_, CUSPARSE_OPERATION_NON_TRANSPOSE, - CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, descrA_, - descrB_, &beta, descrC_, CUDA_R_32F, - CUSPARSE_SPGEMM_DEFAULT, spgemmDesc_); + cusparseCheckError( + cusparseSpMatGetSize(descrC_, &C_num_rows_, &C_num_cols_, + &C_nnz_)); + + cudaCheckError(cudaMallocManaged(&C_val_, sizeof(T) * C_nnz_)); + cudaCheckError(cudaMallocManaged(&C_col_, sizeof(int) * C_nnz_)); + + cusparseCheckError( + cusparseCsrSetPointers(descrC_, C_row_, C_col_, C_val_)); + cusparseCheckError( + cusparseSpGEMM_copy(handle_, opA_, opB_, &alpha, descrA_, + descrB_, &beta, descrC_, cudaDataType_, + alg_, spgemmDesc_)); + + + cudaCheckError(cudaMemPrefetchAsync(C_val_, sizeof(T) * C_nnz_, + cudaCpuDeviceId, s3_)); + cudaCheckError(cudaMemPrefetchAsync(C_col_, sizeof(int) * C_nnz_, + cudaCpuDeviceId, s3_)); + + // Freeing memory + cudaCheckError(cudaFree(buffer1_)); + cudaCheckError(cudaFree(buffer2_)); + cudaCheckError(cudaFree(C_val_)); + cudaCheckError(cudaFree(C_col_)); break; } } @@ -246,33 +443,63 @@ class sp_gemm_gpu : public sp_gemm { /** Perform any required steps after calling the GEMM kernel that should * be timed. 
*/ void postLoopRequirements() override { + std::cout << "\t\tPostLoop" << std::endl; + cusparseCheckError(cusparseSpGEMM_destroyDescr(spgemmDesc_)); + // Destroying descriptors + cusparseCheckError(cusparseDestroySpMat(descrA_)); + cusparseCheckError(cusparseDestroySpMat(descrB_)); + cusparseCheckError(cusparseDestroySpMat(descrC_)); switch(offload_) { case gpuOffloadType::always: { break; } case gpuOffloadType::once: { + cudaCheckError(cudaMemcpyAsync(A_val_, A_val_dev_, sizeof(T) * + A_nnz_, cudaMemcpyDeviceToHost, s1_)); + cudaCheckError(cudaMemcpyAsync(A_col_, A_col_dev_, sizeof(int) * + A_nnz_, cudaMemcpyDeviceToHost, s1_)); + cudaCheckError(cudaMemcpyAsync(A_row_, A_row_dev_, sizeof(int) * + (n_ + 1), cudaMemcpyDeviceToHost, s1_)); + + cudaCheckError(cudaMemcpyAsync(B_val_, B_val_dev_, sizeof(T) * + B_nnz_, cudaMemcpyDeviceToHost, s2_)); + cudaCheckError(cudaMemcpyAsync(B_col_, B_col_dev_, sizeof(int) * + B_nnz_, cudaMemcpyDeviceToHost, s2_)); + cudaCheckError(cudaMemcpyAsync(B_row_, B_row_dev_, sizeof(int) * + (n_ + 1), cudaMemcpyDeviceToHost, s2_)); + + C_val_ = (T*)malloc(sizeof(T) * C_nnz_); + C_col_ = (int*)malloc(sizeof(int) * C_nnz_); + cudaCheckError(cudaMemcpyAsync(C_val_, C_val_dev_, sizeof(T) * + C_nnz_, cudaMemcpyDeviceToHost, s3_)); + cudaCheckError(cudaMemcpyAsync(C_col_, C_col_dev_, sizeof(int) * + C_nnz_, cudaMemcpyDeviceToHost, s3_)); + cudaCheckError(cudaMemcpyAsync(C_row_, C_row_dev_, sizeof(int) * + (n_ + 1), cudaMemcpyDeviceToHost, s3_)); + cudaCheckError(cudaDeviceSynchronize()); + + cudaCheckError(cudaFree(C_val_dev_)); + cudaCheckError(cudaFree(C_col_dev_)); + free(C_val_); + free(C_col_); break; } case gpuOffloadType::unified: { // Ensure all data resides on host once work has completed - cudaCheckError(cudaMemPrefetchAsync(A_val_, sizeof(T) * (*A_nnz_), + cudaCheckError(cudaMemPrefetchAsync(A_val_, sizeof(T) * A_nnz_, cudaCpuDeviceId, s1_)); - cudaCheckError(cudaMemPrefetchAsync(A_col_, sizeof(int) * (*A_nnz_), + cudaCheckError(cudaMemPrefetchAsync(A_col_, sizeof(int) * A_nnz_, cudaCpuDeviceId, s1_)); cudaCheckError(cudaMemPrefetchAsync(A_row_, sizeof(int) * (n_ + 1), cudaCpuDeviceId, s1_)); - cudaCheckError(cudaMemPrefetchAsync(B_val_, sizeof(T) * (*B_nnz_), + cudaCheckError(cudaMemPrefetchAsync(B_val_, sizeof(T) * B_nnz_, cudaCpuDeviceId, s2_)); - cudaCheckError(cudaMemPrefetchAsync(B_col_, sizeof(int) * (*B_nnz_), + cudaCheckError(cudaMemPrefetchAsync(B_col_, sizeof(int) * B_nnz_, cudaCpuDeviceId, s2_)); cudaCheckError(cudaMemPrefetchAsync(B_row_, sizeof(int) * (n_ + 1), cudaCpuDeviceId, s2_)); - cudaCheckError(cudaMemPrefetchAsync(C_val_, sizeof(T) * (*C_nnz_), - cudaCpuDeviceId, s3_)); - cudaCheckError(cudaMemPrefetchAsync(C_col_, sizeof(int) * (*C_nnz_), - cudaCpuDeviceId, s3_)); cudaCheckError(cudaMemPrefetchAsync(C_row_, sizeof(int) * (n_ + 1), cudaCpuDeviceId, s3_)); // Ensure device has finished all work. @@ -285,26 +512,39 @@ class sp_gemm_gpu : public sp_gemm { /** Do any necessary cleanup (free pointers, close library handles, etc.) * after Kernel has been called. 
*/ void postCallKernelCleanup() override { - if (offload_ == gpuOffloadType::unified) { - // Destroy the handle - cusparseDestroy(handle_); + std::cout << "\t\tPostCall" << std::endl << std::endl; + // Destroy the handle + cusparseCheckError(cusparseDestroy(handle_)); + + // Destroy streams after use + cudaCheckError(cudaStreamDestroy(s1_)); + cudaCheckError(cudaStreamDestroy(s2_)); + cudaCheckError(cudaStreamDestroy(s3_)); - // Destroy streams after use - cudaCheckError(cudaStreamDestroy(s1_)); - cudaCheckError(cudaStreamDestroy(s2_)); - cudaCheckError(cudaStreamDestroy(s3_)); - } if (offload_ == gpuOffloadType::unified) { - cudaFree(A_val_); - cudaFree(A_col_); - cudaFree(A_row_); - cudaFree(B_val_); - cudaFree(B_col_); - cudaFree(B_row_); - cudaFree(C_val_); - cudaFree(C_col_); - cudaFree(C_row_); + cudaCheckError(cudaFree(A_val_)); + cudaCheckError(cudaFree(A_col_)); + cudaCheckError(cudaFree(A_row_)); + cudaCheckError(cudaFree(B_val_)); + cudaCheckError(cudaFree(B_col_)); + cudaCheckError(cudaFree(B_row_)); + cudaCheckError(cudaFree(C_row_)); + } else { + free(A_val_); + free(A_col_); + free(A_row_); + free(B_val_); + free(B_col_); + free(B_row_); + free(C_row_); + cudaCheckError(cudaFree(A_val_dev_)); + cudaCheckError(cudaFree(A_col_dev_)); + cudaCheckError(cudaFree(A_row_dev_)); + cudaCheckError(cudaFree(B_val_dev_)); + cudaCheckError(cudaFree(B_col_dev_)); + cudaCheckError(cudaFree(B_row_dev_)); + cudaCheckError(cudaFree(C_row_dev_)); } } @@ -356,13 +596,10 @@ class sp_gemm_gpu : public sp_gemm { void toCSR(T* dense, int n_col, int n_row, int nnz, T* vals, int* col_index, int* row_ptr) { int nnz_encountered = 0; - int prev_row_ptr = 0; for (int row = 0; row < n_row; row++) { - if (nnz_encountered >= nnz) break; - row_ptr[row] = prev_row_ptr; + row_ptr[row] = nnz_encountered; int nnz_row = 0; for (int col = 0; col < n_col; col++) { - if (nnz_encountered >= nnz) break; if (dense[(row * n_col) + col] != 0.0) { nnz_row++; col_index[nnz_encountered] = col; @@ -370,10 +607,41 @@ class sp_gemm_gpu : public sp_gemm { nnz_encountered++; } } - prev_row_ptr += nnz_row; } + row_ptr[n_row] = nnz_encountered; } + + // ToDo -- the two following functons are useful for debugging. I'm + // keeping them in to that end, though they are not used by the benchmark + // itself + void printDenseMatrix(T* M, int rows, int cols) { + for (int row = 0; row < rows; row++) { + std::cout << "| "; + for (int col = 0; col < cols; col++) { + std::cout << M[(row * cols) + col] << " | "; + } + std::cout << std::endl; + } + } + + void printCSR(T* values, int* col_indices, int* row_pointers, int nnz, + int rows, int cols) { + std::cout << "\tRow pointers__" << std::endl; + for (int p = 0; p < (rows + 1); p++) { + std::cout << row_pointers[p] << ", "; + } + std::cout << std::endl << "\tColumn Indices__" << std::endl; + for (int i = 0; i < nnz; i++) { + std::cout << col_indices[i] << ", "; + } + std::cout << std::endl << "\tValues__" << std::endl; + for (int v = 0; v < nnz; v++) { + std::cout << values[v] << ", "; + } + std::cout << std::endl; + } + /** Handle used when calling cuBLAS. 
*/ cusparseHandle_t handle_; @@ -396,29 +664,34 @@ class sp_gemm_gpu : public sp_gemm { T* A_val_; int* A_col_; int* A_row_; - int* A_num_rows_; - int* A_num_cols_; - int* A_nnz_; + int64_t A_num_rows_; + int64_t A_num_cols_; + int64_t A_nnz_; T* B_val_; int* B_col_; int* B_row_; - int* B_num_rows_; - int* B_num_cols_; - int* B_nnz_; + int64_t B_num_rows_; + int64_t B_num_cols_; + int64_t B_nnz_; T* C_val_; int* C_col_; int* C_row_; - int* C_num_rows_; - int* C_num_cols_; - int*C_nnz_; + int64_t C_num_rows_; + int64_t C_num_cols_; + int64_t C_nnz_; /** CSR format vectors for matrices A, B and C on the device. */ - int* A_num_rows_dev_, A_num_cols_dev_, A_nnz_dev_, B_num_rows_dev_, - B_num_cols_dev_, B_nnz_dev_, C_num_rows_dev_, C_num_cols_dev_, C_nnz_dev_; - T* A_val_dev_, B_val_dev_, C_val_dev_; - int* A_col_dev_, A_row_dev_, B_col_dev_, B_row_dev_, C_col_dev_, C_row_dev_; + T* A_val_dev_; + T* B_val_dev_; + T* C_val_dev_; + int* A_col_dev_; + int* A_row_dev_; + int* B_col_dev_; + int* B_row_dev_; + int* C_col_dev_; + int* C_row_dev_; /** The constant value Alpha. */ const T alpha = ALPHA; @@ -439,6 +712,13 @@ class sp_gemm_gpu : public sp_gemm { size_t buffer_size2_ = 0; void* buffer1_ = NULL; void* buffer2_ = NULL; + + cusparseOperation_t opA_ = CUSPARSE_OPERATION_NON_TRANSPOSE; + cusparseOperation_t opB_ = CUSPARSE_OPERATION_NON_TRANSPOSE; + cusparseSpGEMMAlg_t alg_ = CUSPARSE_SPGEMM_DEFAULT; + cusparseIndexType_t rType_ = CUSPARSE_INDEX_32I; + cusparseIndexType_t cType_ = CUSPARSE_INDEX_32I; + cusparseIndexBase_t indType_ = CUSPARSE_INDEX_BASE_ZERO; }; } // namespace gpu #endif \ No newline at end of file diff --git a/include/doGemm.hh b/include/doGemm.hh index 0e4dcc0..9a66329 100644 --- a/include/doGemm.hh +++ b/include/doGemm.hh @@ -517,20 +517,20 @@ class doGemm { cpuResult.gflops = calcGflops(flops, iterations_, cpuResult.runtime); // // Perform the GPU kernels + // - ALWAYS: Offload to/from GPU every iteration + spGemmGpu_.initialise(gpuOffloadType::always, N, sparsity); + time_checksum_gflop gpuResult_always = spGemmGpu_.compute(); + gpuResult_always.gflops = + calcGflops(flops, iterations_, gpuResult_always.runtime); // // - ONCE : Offload to/from GPU once before all iterations and once // // after -// spGemmGpu_.initialise(gpuOffloadType::once, N, sparsity); -// time_checksum_gflop gpuResult_once = spGemmGpu_.compute(); -// gpuResult_once.gflops = -// calcGflops(flops, iterations_, gpuResult_once.runtime); -// -// // - ALWAYS: Offload to/from GPU every iteration -// spGemmGpu_.initialise(gpuOffloadType::always, N, sparsity); -// time_checksum_gflop gpuResult_always = spGemmGpu_.compute(); -// gpuResult_always.gflops = -// calcGflops(flops, iterations_, gpuResult_always.runtime); -// // - UNIFIED : data passed from host to device (and device to host) as -// // needed + spGemmGpu_.initialise(gpuOffloadType::once, N, sparsity); + time_checksum_gflop gpuResult_once = spGemmGpu_.compute(); + gpuResult_once.gflops = + calcGflops(flops, iterations_, gpuResult_once.runtime); + + // - UNIFIED : data passed from host to device (and device to host) as + // needed spGemmGpu_.initialise(gpuOffloadType::unified, N, sparsity); time_checksum_gflop gpuResult_unified = spGemmGpu_.compute(); gpuResult_unified.gflops = @@ -541,11 +541,11 @@ class doGemm { // Write lines to CSV file writeLineToCsv(csvFile, "cpu", kernelName, N, N, N, probSize, iterations_, cpuResult.runtime, cpuResult.gflops); -// writeLineToCsv(csvFile, "gpu_offloadOnce", kernelName, N, N, N, probSize, -// iterations_, 
gpuResult_once.runtime, gpuResult_once.gflops); -// writeLineToCsv(csvFile, "gpu_offloadAlways", kernelName, N, N, N, probSize, -// iterations_, gpuResult_always.runtime, -// gpuResult_always.gflops); + writeLineToCsv(csvFile, "gpu_offloadOnce", kernelName, N, N, N, probSize, + iterations_, gpuResult_once.runtime, gpuResult_once.gflops); + writeLineToCsv(csvFile, "gpu_offloadAlways", kernelName, N, N, N, probSize, + iterations_, gpuResult_always.runtime, + gpuResult_always.gflops); writeLineToCsv(csvFile, "gpu_unified", kernelName, N, N, N, probSize, iterations_, gpuResult_unified.runtime, gpuResult_unified.gflops); From 2ffee16635466c3315f7c1cf075846c190041581 Mon Sep 17 00:00:00 2001 From: Alex Cockrean <84676155+ABenC377@users.noreply.github.com> Date: Tue, 26 Mar 2024 12:55:10 +0000 Subject: [PATCH 10/32] All implemented and running. No checksum at the end --- cuBLAS/sp_gemm.hh | 36 ++++++++++++++---------------------- 1 file changed, 14 insertions(+), 22 deletions(-) diff --git a/cuBLAS/sp_gemm.hh b/cuBLAS/sp_gemm.hh index 0879966..fbd08fd 100644 --- a/cuBLAS/sp_gemm.hh +++ b/cuBLAS/sp_gemm.hh @@ -325,10 +325,12 @@ class sp_gemm_gpu : public sp_gemm { cudaCheckError(cudaDeviceSynchronize()); // Freeing memory - cudaCheckError(cudaFree(buffer1_)); - cudaCheckError(cudaFree(buffer2_)); cudaCheckError(cudaFree(C_val_dev_)); cudaCheckError(cudaFree(C_col_dev_)); + cudaCheckError(cudaFree(buffer1_)); + cudaCheckError(cudaFree(buffer2_)); + buffer_size1_ = 0; + buffer_size2_ = 0; free(C_val_); free(C_col_); break; @@ -380,8 +382,12 @@ class sp_gemm_gpu : public sp_gemm { cudaDataType_, alg_, spgemmDesc_)); // Freeing memory + cudaCheckError(cudaFree(C_val_dev_)); + cudaCheckError(cudaFree(C_col_dev_)); cudaCheckError(cudaFree(buffer1_)); cudaCheckError(cudaFree(buffer2_)); + buffer_size1_ = 0; + buffer_size2_ = 0; break; } case gpuOffloadType::unified: { @@ -414,6 +420,8 @@ class sp_gemm_gpu : public sp_gemm { cusparseSpMatGetSize(descrC_, &C_num_rows_, &C_num_cols_, &C_nnz_)); + if (C_val_ != NULL) cudaCheckError(cudaFree(C_val_)); + if (C_val_ != NULL) cudaCheckError(cudaFree(C_col_)); cudaCheckError(cudaMallocManaged(&C_val_, sizeof(T) * C_nnz_)); cudaCheckError(cudaMallocManaged(&C_col_, sizeof(int) * C_nnz_)); @@ -425,16 +433,11 @@ class sp_gemm_gpu : public sp_gemm { alg_, spgemmDesc_)); - cudaCheckError(cudaMemPrefetchAsync(C_val_, sizeof(T) * C_nnz_, - cudaCpuDeviceId, s3_)); - cudaCheckError(cudaMemPrefetchAsync(C_col_, sizeof(int) * C_nnz_, - cudaCpuDeviceId, s3_)); - // Freeing memory cudaCheckError(cudaFree(buffer1_)); cudaCheckError(cudaFree(buffer2_)); - cudaCheckError(cudaFree(C_val_)); - cudaCheckError(cudaFree(C_col_)); + buffer_size1_ = 0; + buffer_size2_ = 0; break; } } @@ -468,20 +471,9 @@ class sp_gemm_gpu : public sp_gemm { cudaCheckError(cudaMemcpyAsync(B_row_, B_row_dev_, sizeof(int) * (n_ + 1), cudaMemcpyDeviceToHost, s2_)); - C_val_ = (T*)malloc(sizeof(T) * C_nnz_); - C_col_ = (int*)malloc(sizeof(int) * C_nnz_); - cudaCheckError(cudaMemcpyAsync(C_val_, C_val_dev_, sizeof(T) * - C_nnz_, cudaMemcpyDeviceToHost, s3_)); - cudaCheckError(cudaMemcpyAsync(C_col_, C_col_dev_, sizeof(int) * - C_nnz_, cudaMemcpyDeviceToHost, s3_)); cudaCheckError(cudaMemcpyAsync(C_row_, C_row_dev_, sizeof(int) * (n_ + 1), cudaMemcpyDeviceToHost, s3_)); cudaCheckError(cudaDeviceSynchronize()); - - cudaCheckError(cudaFree(C_val_dev_)); - cudaCheckError(cudaFree(C_col_dev_)); - free(C_val_); - free(C_col_); break; } case gpuOffloadType::unified: { @@ -675,8 +667,8 @@ class sp_gemm_gpu : public 
sp_gemm { int64_t B_num_cols_; int64_t B_nnz_; - T* C_val_; - int* C_col_; + T* C_val_ = NULL; + int* C_col_ = NULL; int* C_row_; int64_t C_num_rows_; int64_t C_num_cols_; From 064ec5756f4b524d45e8bc2f94dbdf82412375d5 Mon Sep 17 00:00:00 2001 From: Alex Cockrean <84676155+ABenC377@users.noreply.github.com> Date: Tue, 26 Mar 2024 12:57:45 +0000 Subject: [PATCH 11/32] Removing print statements --- cuBLAS/sp_gemm.hh | 7 ------- 1 file changed, 7 deletions(-) diff --git a/cuBLAS/sp_gemm.hh b/cuBLAS/sp_gemm.hh index fbd08fd..01c6edb 100644 --- a/cuBLAS/sp_gemm.hh +++ b/cuBLAS/sp_gemm.hh @@ -34,11 +34,8 @@ class sp_gemm_gpu : public sp_gemm { * - Unified: Initialise data as unified memory; no data movement semantics * required */ void initialise(gpuOffloadType offload, int n, float sparsity) override { - std::cout << "_/_/_/_/ Initialising for problem size: " << n << std::endl; - offload_ = offload; - if (std::is_same_v) cudaDataType_ = CUDA_R_32F; else if (std::is_same_v) cudaDataType_ = CUDA_R_64F; else { @@ -151,7 +148,6 @@ class sp_gemm_gpu : public sp_gemm { /** Perform any required steps before calling the GEMM kernel that should * be timed. */ void preLoopRequirements() override { - std::cout << "\t\tPreLoop" << std::endl; cusparseCheckError(cusparseSpGEMM_createDescr(&spgemmDesc_)); switch(offload_) { case gpuOffloadType::always: { @@ -233,7 +229,6 @@ class sp_gemm_gpu : public sp_gemm { /** Make a call to the BLAS Library Kernel. */ void callGemm() override { - std::cout << "\t\tcallGemm" << std::endl; switch(offload_) { case gpuOffloadType::always: { cudaCheckError(cudaMemcpyAsync(A_val_dev_, A_val_, sizeof(T) * @@ -446,7 +441,6 @@ class sp_gemm_gpu : public sp_gemm { /** Perform any required steps after calling the GEMM kernel that should * be timed. */ void postLoopRequirements() override { - std::cout << "\t\tPostLoop" << std::endl; cusparseCheckError(cusparseSpGEMM_destroyDescr(spgemmDesc_)); // Destroying descriptors cusparseCheckError(cusparseDestroySpMat(descrA_)); @@ -504,7 +498,6 @@ class sp_gemm_gpu : public sp_gemm { /** Do any necessary cleanup (free pointers, close library handles, etc.) * after Kernel has been called. */ void postCallKernelCleanup() override { - std::cout << "\t\tPostCall" << std::endl << std::endl; // Destroy the handle cusparseCheckError(cusparseDestroy(handle_)); From 88a053f2ea565e1753d671c4ddcee9ba45a80c3b Mon Sep 17 00:00:00 2001 From: Alex Cockrean <84676155+ABenC377@users.noreply.github.com> Date: Fri, 29 Mar 2024 12:35:53 +0000 Subject: [PATCH 12/32] Removing print statements --- cuBLAS/sp_gemm.hh | 116 +++++++++++++++++++++++++++++----------------- include/doGemm.hh | 20 ++++---- 2 files changed, 84 insertions(+), 52 deletions(-) diff --git a/cuBLAS/sp_gemm.hh b/cuBLAS/sp_gemm.hh index 01c6edb..db9cf29 100644 --- a/cuBLAS/sp_gemm.hh +++ b/cuBLAS/sp_gemm.hh @@ -24,7 +24,7 @@ class sp_gemm_gpu : public sp_gemm { using sp_gemm::C_; using sp_gemm::offload_; - // ToDo -- just unified implemented so far. Fill in Always and Once later + // ToDo -- No checksum for sparse yet. Nedd to do /** Initialise the required data structures. 
* `offload` refers to the data offload type: @@ -42,7 +42,7 @@ class sp_gemm_gpu : public sp_gemm { std::cout << "INVALID DATA TYPE PASSED TO cuSPARSE" << std::endl; exit(1); } - n_ = n; + n_ = n * 20; // Get device identifier cudaCheckError(cudaGetDevice(&gpuDevice_)); @@ -93,6 +93,10 @@ class sp_gemm_gpu : public sp_gemm { cudaCheckError(cudaMalloc((void**)&C_row_dev_, sizeof(int) * (n_ + 1))); } + C_mem_allocated_always_ = false; + C_mem_allocated_once_ = false; + C_mem_allocated_unified_ = false; + // Initialise the host matricies // cusparseSpGEMM() works on CSR format only. This helpfully makes our // sparse matrix format decision for us! @@ -148,21 +152,9 @@ class sp_gemm_gpu : public sp_gemm { /** Perform any required steps before calling the GEMM kernel that should * be timed. */ void preLoopRequirements() override { - cusparseCheckError(cusparseSpGEMM_createDescr(&spgemmDesc_)); + switch(offload_) { case gpuOffloadType::always: { - // Make matrix descriptors - cusparseCheckError( - cusparseCreateCsr(&descrA_, n_, n_, A_nnz_, A_row_dev_, - A_col_dev_, A_val_dev_, rType_, cType_, - indType_, cudaDataType_)); - cusparseCheckError( - cusparseCreateCsr(&descrB_, n_, n_, B_nnz_, B_row_dev_, - B_col_dev_, B_val_dev_, rType_, cType_, - indType_, cudaDataType_)); - cusparseCheckError( - cusparseCreateCsr(&descrC_, n_, n_, 0, C_row_dev_, NULL, NULL, - rType_, cType_, indType_, cudaDataType_)); break; } case gpuOffloadType::once: { @@ -174,11 +166,14 @@ class sp_gemm_gpu : public sp_gemm { + 1), cudaMemcpyHostToDevice, s1_)); cudaCheckError(cudaMemcpyAsync(B_val_dev_, B_val_, sizeof(T) * - B_nnz_, cudaMemcpyHostToDevice, s1_)); + B_nnz_, cudaMemcpyHostToDevice, s2_)); cudaCheckError(cudaMemcpyAsync(B_col_dev_, B_col_, sizeof(int) * - B_nnz_, cudaMemcpyHostToDevice, s1_)); + B_nnz_, cudaMemcpyHostToDevice, s2_)); cudaCheckError(cudaMemcpyAsync(B_row_dev_, B_row_, sizeof(int) * (n_ - + 1), cudaMemcpyHostToDevice, s1_)); + + 1), cudaMemcpyHostToDevice, s2_)); + + cudaCheckError(cudaMemcpyAsync(C_row_dev_, C_row_, sizeof(int) * (n_ + + 1), cudaMemcpyHostToDevice, s3_)); // Craete matrix descriptors cusparseCheckError( @@ -225,6 +220,7 @@ class sp_gemm_gpu : public sp_gemm { break; } } + cusparseCheckError(cusparseSpGEMM_createDescr(&spgemmDesc_)); } /** Make a call to the BLAS Library Kernel. 
*/ @@ -239,16 +235,27 @@ class sp_gemm_gpu : public sp_gemm { + 1), cudaMemcpyHostToDevice, s1_)); cudaCheckError(cudaMemcpyAsync(B_val_dev_, B_val_, sizeof(T) * - B_nnz_, cudaMemcpyHostToDevice, s1_)); + B_nnz_, cudaMemcpyHostToDevice, s2_)); cudaCheckError(cudaMemcpyAsync(B_col_dev_, B_col_, sizeof(int) * - B_nnz_, cudaMemcpyHostToDevice, s1_)); + B_nnz_, cudaMemcpyHostToDevice, s2_)); cudaCheckError(cudaMemcpyAsync(B_row_dev_, B_row_, sizeof(int) * (n_ - + 1), cudaMemcpyHostToDevice, s1_)); + + 1), cudaMemcpyHostToDevice, s2_)); + + cudaCheckError(cudaMemcpyAsync(C_row_dev_, C_row_, sizeof(int) * (n_ + + 1), cudaMemcpyHostToDevice, s3_)); + // Make matrix descriptors cusparseCheckError( - cusparseSpGEMM_copy(handle_, opA_, opB_, &alpha, descrA_, - descrB_, &beta, descrC_, cudaDataType_, - alg_, spgemmDesc_)); + cusparseCreateCsr(&descrA_, n_, n_, A_nnz_, A_row_dev_, + A_col_dev_, A_val_dev_, rType_, cType_, + indType_, cudaDataType_)); + cusparseCheckError( + cusparseCreateCsr(&descrB_, n_, n_, B_nnz_, B_row_dev_, + B_col_dev_, B_val_dev_, rType_, cType_, + indType_, cudaDataType_)); + cusparseCheckError( + cusparseCreateCsr(&descrC_, n_, n_, 0, C_row_dev_, NULL, NULL, + rType_, cType_, indType_, cudaDataType_)); cusparseCheckError( cusparseSpGEMM_workEstimation(handle_, opA_, opB_, &alpha, @@ -280,10 +287,10 @@ class sp_gemm_gpu : public sp_gemm { cusparseSpMatGetSize(descrC_, &C_num_rows_, &C_num_cols_, &C_nnz_)); - cusparseCheckError( - cusparseSpMatGetSize(descrC_, &C_num_rows_, &C_num_cols_, - &C_nnz_)); - + if (C_mem_allocated_always_) { + cudaCheckError(cudaFree(C_val_dev_)); + cudaCheckError(cudaFree(C_col_dev_)); + } cudaCheckError(cudaMalloc(&C_val_dev_, sizeof(T) * C_nnz_)); cudaCheckError(cudaMalloc(&C_col_dev_, sizeof(int) * C_nnz_)); @@ -309,8 +316,14 @@ class sp_gemm_gpu : public sp_gemm { cudaCheckError(cudaMemcpyAsync(B_row_, B_row_dev_, sizeof(int) * (n_ + 1), cudaMemcpyDeviceToHost, s2_)); + if (C_mem_allocated_always_) { + free(C_val_); + free(C_col_); + } C_val_ = (T*)malloc(sizeof(T) * C_nnz_); C_col_ = (int*)malloc(sizeof(int) * C_nnz_); + C_mem_allocated_always_ = true; + cudaCheckError(cudaMemcpyAsync(C_val_, C_val_dev_, sizeof(T) * C_nnz_, cudaMemcpyDeviceToHost, s3_)); cudaCheckError(cudaMemcpyAsync(C_col_, C_col_dev_, sizeof(int) * @@ -320,22 +333,13 @@ class sp_gemm_gpu : public sp_gemm { cudaCheckError(cudaDeviceSynchronize()); // Freeing memory - cudaCheckError(cudaFree(C_val_dev_)); - cudaCheckError(cudaFree(C_col_dev_)); cudaCheckError(cudaFree(buffer1_)); cudaCheckError(cudaFree(buffer2_)); buffer_size1_ = 0; buffer_size2_ = 0; - free(C_val_); - free(C_col_); break; } case gpuOffloadType::once: { - cusparseCheckError( - cusparseSpGEMM_copy(handle_, opA_, opB_, &alpha, descrA_, - descrB_, &beta, descrC_, cudaDataType_, - alg_, spgemmDesc_)); - cusparseCheckError( cusparseSpGEMM_workEstimation(handle_, opA_, opB_, &alpha, descrA_, descrB_, &beta, @@ -365,8 +369,13 @@ class sp_gemm_gpu : public sp_gemm { cusparseSpMatGetSize(descrC_, &C_num_rows_, &C_num_cols_, &C_nnz_)); + if (C_mem_allocated_once_) { + cudaCheckError(cudaFree(C_val_dev_)); + cudaCheckError(cudaFree(C_col_dev_)); + } cudaCheckError(cudaMalloc(&C_val_dev_, sizeof(T) * C_nnz_)); cudaCheckError(cudaMalloc(&C_col_dev_, sizeof(int) * C_nnz_)); + C_mem_allocated_once_ = true; cusparseCheckError( cusparseCsrSetPointers(descrC_, C_row_dev_, C_col_dev_, @@ -377,8 +386,6 @@ class sp_gemm_gpu : public sp_gemm { cudaDataType_, alg_, spgemmDesc_)); // Freeing memory - 
cudaCheckError(cudaFree(C_val_dev_)); - cudaCheckError(cudaFree(C_col_dev_)); cudaCheckError(cudaFree(buffer1_)); cudaCheckError(cudaFree(buffer2_)); buffer_size1_ = 0; @@ -415,10 +422,14 @@ class sp_gemm_gpu : public sp_gemm { cusparseSpMatGetSize(descrC_, &C_num_rows_, &C_num_cols_, &C_nnz_)); - if (C_val_ != NULL) cudaCheckError(cudaFree(C_val_)); - if (C_val_ != NULL) cudaCheckError(cudaFree(C_col_)); + if (C_mem_allocated_unified_) { + cudaCheckError(cudaFree(C_val_)); + cudaCheckError(cudaFree(C_col_)); + } + cudaCheckError(cudaMallocManaged(&C_val_, sizeof(T) * C_nnz_)); cudaCheckError(cudaMallocManaged(&C_col_, sizeof(int) * C_nnz_)); + C_mem_allocated_unified_ = true; cusparseCheckError( cusparseCsrSetPointers(descrC_, C_row_, C_col_, C_val_)); @@ -445,7 +456,6 @@ class sp_gemm_gpu : public sp_gemm { // Destroying descriptors cusparseCheckError(cusparseDestroySpMat(descrA_)); cusparseCheckError(cusparseDestroySpMat(descrB_)); - cusparseCheckError(cusparseDestroySpMat(descrC_)); switch(offload_) { case gpuOffloadType::always: { break; @@ -465,12 +475,19 @@ class sp_gemm_gpu : public sp_gemm { cudaCheckError(cudaMemcpyAsync(B_row_, B_row_dev_, sizeof(int) * (n_ + 1), cudaMemcpyDeviceToHost, s2_)); + C_val_ = (T*)malloc(sizeof(T) * C_nnz_); + C_col_ = (int*)malloc(sizeof(int) * C_nnz_); + cudaCheckError(cudaMemcpyAsync(C_val_, C_val_dev_, sizeof(T) * + C_nnz_, cudaMemcpyDeviceToHost, s3_)); + cudaCheckError(cudaMemcpyAsync(C_col_, C_col_dev_, sizeof(int) * + C_nnz_, cudaMemcpyDeviceToHost, s3_)); cudaCheckError(cudaMemcpyAsync(C_row_, C_row_dev_, sizeof(int) * (n_ + 1), cudaMemcpyDeviceToHost, s3_)); cudaCheckError(cudaDeviceSynchronize()); break; } case gpuOffloadType::unified: { + cusparseCheckError(cusparseDestroySpMat(descrC_)); // Ensure all data resides on host once work has completed cudaCheckError(cudaMemPrefetchAsync(A_val_, sizeof(T) * A_nnz_, cudaCpuDeviceId, s1_)); @@ -486,6 +503,10 @@ class sp_gemm_gpu : public sp_gemm { cudaCheckError(cudaMemPrefetchAsync(B_row_, sizeof(int) * (n_ + 1), cudaCpuDeviceId, s2_)); +// cudaCheckError(cudaMemPrefetchAsync(C_val_, sizeof(T) * C_nnz_, +// cudaCpuDeviceId, s3_)); +// cudaCheckError(cudaMemPrefetchAsync(C_col_, sizeof(int) * C_nnz_, +// cudaCpuDeviceId, s3_)); cudaCheckError(cudaMemPrefetchAsync(C_row_, sizeof(int) * (n_ + 1), cudaCpuDeviceId, s3_)); // Ensure device has finished all work. 
@@ -506,7 +527,6 @@ class sp_gemm_gpu : public sp_gemm { cudaCheckError(cudaStreamDestroy(s2_)); cudaCheckError(cudaStreamDestroy(s3_)); - if (offload_ == gpuOffloadType::unified) { cudaCheckError(cudaFree(A_val_)); cudaCheckError(cudaFree(A_col_)); @@ -514,6 +534,8 @@ class sp_gemm_gpu : public sp_gemm { cudaCheckError(cudaFree(B_val_)); cudaCheckError(cudaFree(B_col_)); cudaCheckError(cudaFree(B_row_)); + cudaCheckError(cudaFree(C_val_)); + cudaCheckError(cudaFree(C_col_)); cudaCheckError(cudaFree(C_row_)); } else { free(A_val_); @@ -522,6 +544,8 @@ class sp_gemm_gpu : public sp_gemm { free(B_val_); free(B_col_); free(B_row_); + free(C_val_); + free(C_col_); free(C_row_); cudaCheckError(cudaFree(A_val_dev_)); cudaCheckError(cudaFree(A_col_dev_)); @@ -529,6 +553,8 @@ class sp_gemm_gpu : public sp_gemm { cudaCheckError(cudaFree(B_val_dev_)); cudaCheckError(cudaFree(B_col_dev_)); cudaCheckError(cudaFree(B_row_dev_)); + cudaCheckError(cudaFree(C_val_dev_)); + cudaCheckError(cudaFree(C_col_dev_)); cudaCheckError(cudaFree(C_row_dev_)); } } @@ -678,6 +704,10 @@ class sp_gemm_gpu : public sp_gemm { int* C_col_dev_; int* C_row_dev_; + bool C_mem_allocated_always_; + bool C_mem_allocated_once_; + bool C_mem_allocated_unified_; + /** The constant value Alpha. */ const T alpha = ALPHA; diff --git a/include/doGemm.hh b/include/doGemm.hh index 9a66329..8743314 100644 --- a/include/doGemm.hh +++ b/include/doGemm.hh @@ -516,25 +516,27 @@ class doGemm { time_checksum_gflop cpuResult = spGemmCpu_.compute(); cpuResult.gflops = calcGflops(flops, iterations_, cpuResult.runtime); -// // Perform the GPU kernels + // Perform the GPU kernels + + // - UNIFIED : data passed from host to device (and device to host) as + // needed + spGemmGpu_.initialise(gpuOffloadType::unified, N, sparsity); + time_checksum_gflop gpuResult_unified = spGemmGpu_.compute(); + gpuResult_unified.gflops = + calcGflops(flops, iterations_, gpuResult_unified.runtime); + // - ALWAYS: Offload to/from GPU every iteration spGemmGpu_.initialise(gpuOffloadType::always, N, sparsity); time_checksum_gflop gpuResult_always = spGemmGpu_.compute(); gpuResult_always.gflops = calcGflops(flops, iterations_, gpuResult_always.runtime); -// // - ONCE : Offload to/from GPU once before all iterations and once -// // after + // - ONCE : Offload to/from GPU once before all iterations and once + // after spGemmGpu_.initialise(gpuOffloadType::once, N, sparsity); time_checksum_gflop gpuResult_once = spGemmGpu_.compute(); gpuResult_once.gflops = calcGflops(flops, iterations_, gpuResult_once.runtime); - // - UNIFIED : data passed from host to device (and device to host) as - // needed - spGemmGpu_.initialise(gpuOffloadType::unified, N, sparsity); - time_checksum_gflop gpuResult_unified = spGemmGpu_.compute(); - gpuResult_unified.gflops = - calcGflops(flops, iterations_, gpuResult_unified.runtime); // ToDo -- non-default GPU operations From 5b04a2c93e88ff4438770cfb9828ce681e364c92 Mon Sep 17 00:00:00 2001 From: Alex Cockrean <84676155+ABenC377@users.noreply.github.com> Date: Mon, 1 Apr 2024 09:59:01 +0100 Subject: [PATCH 13/32] rebasing --- cuBLAS/sp_gemm.hh | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/cuBLAS/sp_gemm.hh b/cuBLAS/sp_gemm.hh index db9cf29..0848bb6 100644 --- a/cuBLAS/sp_gemm.hh +++ b/cuBLAS/sp_gemm.hh @@ -155,6 +155,18 @@ class sp_gemm_gpu : public sp_gemm { switch(offload_) { case gpuOffloadType::always: { + // Make matrix descriptors + cusparseCheckError( + cusparseCreateCsr(&descrA_, n_, n_, A_nnz_, A_row_dev_, + A_col_dev_, 
A_val_dev_, rType_, cType_, + indType_, cudaDataType_)); + cusparseCheckError( + cusparseCreateCsr(&descrB_, n_, n_, B_nnz_, B_row_dev_, + B_col_dev_, B_val_dev_, rType_, cType_, + indType_, cudaDataType_)); + cusparseCheckError( + cusparseCreateCsr(&descrC_, n_, n_, 0, C_row_dev_, NULL, NULL, + rType_, cType_, indType_, cudaDataType_)); break; } case gpuOffloadType::once: { From 23d318b7e066902bae676bf438f4141746fe79dc Mon Sep 17 00:00:00 2001 From: Alex Cockrean <84676155+ABenC377@users.noreply.github.com> Date: Fri, 11 Oct 2024 15:26:37 +0100 Subject: [PATCH 14/32] rebasing --- include/doGemm.hh | 44 ++++++++++++++---------- include/main.hh | 2 +- oneMKL/CPU/sp_gemm.hh | 79 +++++++++++++++++++++++++++++++++++++++++++ src/main.cc | 3 +- 4 files changed, 108 insertions(+), 20 deletions(-) create mode 100644 oneMKL/CPU/sp_gemm.hh diff --git a/include/doGemm.hh b/include/doGemm.hh index 8743314..8153651 100644 --- a/include/doGemm.hh +++ b/include/doGemm.hh @@ -267,9 +267,7 @@ class doGemm { if (doCPU_ && doGPU_) { // Print offload results to stdout printOffloadThreshold("Square x Short-and-Wide (M=K, N=16K)"); - } #endif - // Square x Short and Wide // Re-initialise offload threshold structures & previous results cpuGpu_always_ = cpuGpu_offloadThreshold(); @@ -295,7 +293,7 @@ class doGemm { } #endif - // Square sparse matrix - sparse matrix multiplication +// Square sparse matrix - sparse matrix multiplication cpuGpu_always_ = cpuGpu_offloadThreshold(); cpuGpu_once_ = cpuGpu_offloadThreshold(); cpuGpu_unified_ = cpuGpu_offloadThreshold(); @@ -309,6 +307,12 @@ class doGemm { } // Close file csvFile.close(); +#if CPU_ENABLED && GPU_ENABLED + if (doCPU_ && dpGPU_) { + // Print offload results to stdout + printOffloadThreshold("Sparse Square"); + } +#endif } private: @@ -512,14 +516,20 @@ class doGemm { const uint64_t flops = calcFlops(N, N, N); std::string kernelName = getKernelName(); - spGemmCpu_.initialise(N, sparsity); - time_checksum_gflop cpuResult = spGemmCpu_.compute(); - cpuResult.gflops = calcGflops(flops, iterations_, cpuResult.runtime); - - // Perform the GPU kernels - +#if CPU_ENABLED + if (doCPU_) { + spGemmCpu_.initialise(N, sparsity); + time_checksum_gflop cpuResult = spGemmCpu_.compute(); + cpuResult.gflops = calcGflops(flops, iterations_, cpuResult.runtime); + writeLineToCsv(csvFile, "cpu", kernelName, N, N, N, probSize, iterations_, + cpuResult.runtime, cpuResult.gflops); + } +#endif +#if GPU_ENABLED + // Perform the GPU kernels // - UNIFIED : data passed from host to device (and device to host) as // needed + if (doGPU_) { spGemmGpu_.initialise(gpuOffloadType::unified, N, sparsity); time_checksum_gflop gpuResult_unified = spGemmGpu_.compute(); gpuResult_unified.gflops = @@ -536,13 +546,9 @@ class doGemm { time_checksum_gflop gpuResult_once = spGemmGpu_.compute(); gpuResult_once.gflops = calcGflops(flops, iterations_, gpuResult_once.runtime); - - // ToDo -- non-default GPU operations // Write lines to CSV file - writeLineToCsv(csvFile, "cpu", kernelName, N, N, N, probSize, iterations_, - cpuResult.runtime, cpuResult.gflops); writeLineToCsv(csvFile, "gpu_offloadOnce", kernelName, N, N, N, probSize, iterations_, gpuResult_once.runtime, gpuResult_once.gflops); writeLineToCsv(csvFile, "gpu_offloadAlways", kernelName, N, N, N, probSize, @@ -551,6 +557,10 @@ class doGemm { writeLineToCsv(csvFile, "gpu_unified", kernelName, N, N, N, probSize, iterations_, gpuResult_unified.runtime, gpuResult_unified.gflops); + + } +#endif + } /** A function for calculating FLOPs performed by a 
GEMM. @@ -589,7 +599,7 @@ class doGemm { } /** Print to stdout the offload thresholds. */ - void printOffloadThreshold(std::string problemName) const { + void printOffloadThreshold(const std::string& problemName) const { std::vector header = { "Device", "M", "N", "K", "Total Prob. Size (KiB)", "GFLOP/s", "CPU GFLOP/s"}; @@ -686,16 +696,14 @@ class doGemm { #if CPU_ENABLED /** The GEMM CPU kernel. */ cpu::gemm_cpu gemmCpu_; + cpu::sp_gemm_cpu spGemmCpu_; #endif - cpu::sp_gemm_cpu spGemmCpu_; - #if GPU_ENABLED /** The GEMM GPU kernel. */ gpu::gemm_gpu gemmGpu_; -#endif - gpu::sp_gemm_gpu spGemmGpu_; +#endif /** The point at which offloading to GPU (offload once) becomes worthwhile. */ cpuGpu_offloadThreshold cpuGpu_once_; diff --git a/include/main.hh b/include/main.hh index cc0bb8f..f12ebcb 100644 --- a/include/main.hh +++ b/include/main.hh @@ -15,4 +15,4 @@ void printBenchmarkConfig(const int iters, const int upperLimit); int parseInt(const char* str); /** A function which parsen the runtime arguments. */ -void getParameters(int argc, char* argv[]); \ No newline at end of file +void getParameters(int argc, char** argv); \ No newline at end of file diff --git a/oneMKL/CPU/sp_gemm.hh b/oneMKL/CPU/sp_gemm.hh new file mode 100644 index 0000000..847006b --- /dev/null +++ b/oneMKL/CPU/sp_gemm.hh @@ -0,0 +1,79 @@ +#pragma once + +#ifdef CPU_ONEMKL +#include + +#include + +#include "../../include/kernels/CPU/sp_gemm.hh" +#include "../../include/utilities.hh" + +namespace cpu { +/** A class for GEMM CPU BLAS kernels. */ +template +class sp_gemm_cpu : public sp_gemm { + public: + using sp_gemm::sp_gemm; + using sp_gemm::initInputMatrices; + using sp_gemm::callConsume; + using sp_gemm::m_; + using sp_gemm::n_; + using sp_gemm::k_; + using sp_gemm::A_; + using sp_gemm::B_; + using sp_gemm::C_; + + /** Initialise the required data structures. */ + void initialise(int m, int n, int k) { + m_ = m; + n_ = n; + k_ = k; + + A_ = (T*)mkl_malloc(sizeof(T) * m_ * k_, 64); + B_ = (T*)mkl_malloc(sizeof(T) * k_ * n_, 64); + C_ = (T*)mkl_malloc(sizeof(T) * m_ * n_, 64); + + // Initialise the matricies + initInputMatrices(); + } + + private: + /** Make call to the GEMM kernel. */ + void callGemm() override { + if constexpr (std::is_same_v) { + cblas_sgemm(CblasColMajor, CblasNoTrans, CblasNoTrans, m_, n_, k_, + (float)ALPHA, A_, std::max(1, m_), B_, std::max(1, k_), + (float)BETA, C_, std::max(1, m_)); + } else if constexpr (std::is_same_v) { + cblas_dgemm(CblasColMajor, CblasNoTrans, CblasNoTrans, m_, n_, k_, + (double)ALPHA, A_, std::max(1, m_), B_, std::max(1, k_), + (double)BETA, C_, std::max(1, m_)); + } else { + // Un-specialised class will not do any work - print error and exit. + std::cout << "ERROR - Datatype for OneMKL CPU GEMM kernel not supported." + << std::endl; + exit(1); + } + // Ensure compiler doesn't optimise away the work being done + callConsume(); + } + + /** Perform any required steps before calling the GEMM kernel that should + * be timed. */ + void preLoopRequirements() override {} + + /** Perform any required steps after calling the GEMM kernel that should + * be timed. */ + void postLoopRequirements() override {} + + /** Do any necessary cleanup (free pointers, close library handles, etc.) + * after Kernel has been called. 
*/ + void postCallKernelCleanup() override { + mkl_free_buffers(); + mkl_free(A_); + mkl_free(B_); + mkl_free(C_); + } +}; +} // namespace cpu +#endif \ No newline at end of file diff --git a/src/main.cc b/src/main.cc index 38e2b5a..a4eb55b 100644 --- a/src/main.cc +++ b/src/main.cc @@ -1,6 +1,7 @@ #include "../include/main.hh" int iters = 10; +int startDim = 1; int upperLimit = 128; bool sgemm = true; bool dgemm = true; @@ -115,7 +116,7 @@ int parseInt(const char* str) { return strlen(next) ? -1 : value; } -void getParameters(int argc, char* argv[]) { +void getParameters(int argc, char** argv) { for (int i = 1; i < argc; i++) { if (!strcmp(argv[i], "--iterations") || !strcmp(argv[i], "-i")) { if (++i >= argc || (iters = parseInt(argv[i])) < 0) { From be9094c3c28399ac44658d92941b4923323850f5 Mon Sep 17 00:00:00 2001 From: Alex Cockrean <84676155+ABenC377@users.noreply.github.com> Date: Fri, 11 Oct 2024 15:32:57 +0100 Subject: [PATCH 15/32] rebasing --- createGflopsGraphs.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/createGflopsGraphs.py b/createGflopsGraphs.py index 0ed7772..d323162 100644 --- a/createGflopsGraphs.py +++ b/createGflopsGraphs.py @@ -199,7 +199,7 @@ plt.margins(x=0.01, y=0.01) leg = plt.legend(loc='upper left', fancybox=True, ncol = 2, fontsize=18) - for obj in leg.legendHandles: + for obj in leg.legend_handles: obj.set_linewidth(3.0) obj.set_markersize(15.0) obj.set_markeredgewidth(3.0) From 7cfa7be9e278995be6d50a1ad00b9146b3996f79 Mon Sep 17 00:00:00 2001 From: Alex Cockrean <84676155+ABenC377@users.noreply.github.com> Date: Wed, 3 Apr 2024 10:22:51 +0100 Subject: [PATCH 16/32] Tidying up spGEMM classes to remove duplicated code --- cuBLAS/sp_gemm.hh | 90 ++------------------------------- include/kernels/CPU/sp_gemm.hh | 72 ++------------------------ include/kernels/gemm.hh | 92 ++++++++++++++++++++++++++++++++++ oneMKL/CPU/sp_gemm.hh | 9 ++-- 4 files changed, 102 insertions(+), 161 deletions(-) diff --git a/cuBLAS/sp_gemm.hh b/cuBLAS/sp_gemm.hh index 0848bb6..992b018 100644 --- a/cuBLAS/sp_gemm.hh +++ b/cuBLAS/sp_gemm.hh @@ -18,6 +18,8 @@ template class sp_gemm_gpu : public sp_gemm { public: using sp_gemm::sp_gemm; + using sp_gemm::initInputMatricesSparse; + using sp_gemm::toCSR; using sp_gemm::n_; using sp_gemm::A_; using sp_gemm::B_; @@ -55,8 +57,7 @@ class sp_gemm_gpu : public sp_gemm { // Work out number of edges needed to achieve target sparsity - int edges = 1 + (int) (n_ * n_ * (1 - sparsity)); - A_nnz_ = B_nnz_ = edges; + A_nnz_ = B_nnz_ = 1 + (int) (n_ * n_ * (1 - sparsity)); if (offload_ == gpuOffloadType::unified) { cudaCheckError(cudaMallocManaged(&A_val_, sizeof(T) * A_nnz_)); @@ -105,28 +106,7 @@ class sp_gemm_gpu : public sp_gemm { // Set initial values to 0 A_ = (T*)malloc(sizeof(T) * n_ * n_); B_ = (T*)malloc(sizeof(T) * n_ * n_); - for (int i = 0; i < (n_ * n_); i++) { - A_[i] = 0.0; - B_[i] = 0.0; - } - - // Random number generator objects for use in descent - std::default_random_engine gen; - gen.seed(std::chrono::system_clock::now() - .time_since_epoch().count()); - std::uniform_real_distribution dist(0.0, 1.0); - - // Using a=0.45 and b=c=0.22 as default probabilities - for (int i = 0; i < A_nnz_; i++) { - while (!rMat(A_, n_, 0, n_ - 1, 0, n_ - 1, - 0.45, 0.22, 0.22, - &gen, dist, false)) {} - } - for (int i = 0; i < B_nnz_; i++) { - while (!rMat(B_, n_, 0, n_ - 1, 0, n_ - 1, - 0.45, 0.22, 0.22, - &gen, dist, false)) {} - } + initInputMatricesSparse(sparsity); toCSR(A_, n_, n_, A_nnz_, A_val_, A_col_, A_row_); @@ -571,68 
+551,6 @@ class sp_gemm_gpu : public sp_gemm { } } - bool rMat(T* M, int n, int x1, int x2, int y1, int y2, - float a, float b, float c, std::default_random_engine* gen, - std::uniform_real_distribution dist, bool bin) { - // If a 1x1 submatrix, then add an edge and return out - if (x1 >= x2 && y1 >= y2) { - if (abs(M[(y1 * n) + x1]) > 0.1) { - return false; - } else { - // Add 1.0 if this is a binary graph, and a random real number otherwise - M[(int) (y1 * n) + x1] = (bin) ? 1.0 : (((rand() % 10000) / - 100.0) - 50.0); - return true; - } - } else { - // Divide up the matrix - int xMidPoint = x1 + floor((x2 - x1) / 2); - int yMidPoint = y1 + floor((y2 - y1) / 2); - - // ToDo -- add some noise to these values between iterations - float newA = a; - float newB = b; - float newC = c; - - // Work out which quarter to recurse into - // There are some ugly ternary operators here to avoid going out of bounds in the edge case - // that we are already at 1 width or 1 height - float randomNum = dist(*gen); - if (randomNum < a) { - return rMat(M, n, x1, xMidPoint, y1, yMidPoint, - newA, newB, newC, gen, dist, bin); - } else if (randomNum < (a + b)) { - return rMat(M, n, ((xMidPoint < x2) ? xMidPoint + 1 : xMidPoint), x2, y1, yMidPoint, - newA, newB, newC, gen, dist, bin); - } else if (randomNum < (a + b + c)) { - return rMat(M, n, x1, xMidPoint, ((yMidPoint < y2) ? yMidPoint + 1 : yMidPoint), y2, - newA, newB, newC, gen, dist, bin); - } else { - return rMat(M, n, ((xMidPoint < x2) ? xMidPoint + 1 : xMidPoint), x2, - ((yMidPoint < y2) ? yMidPoint + 1 : yMidPoint), y2, newA, newB, newC, - gen, dist, bin); - } - } - return true; - } - - void toCSR(T* dense, int n_col, int n_row, int nnz, T* vals, int* col_index, - int* row_ptr) { - int nnz_encountered = 0; - for (int row = 0; row < n_row; row++) { - row_ptr[row] = nnz_encountered; - int nnz_row = 0; - for (int col = 0; col < n_col; col++) { - if (dense[(row * n_col) + col] != 0.0) { - nnz_row++; - col_index[nnz_encountered] = col; - vals[nnz_encountered] = dense[(row * n_col) + col]; - nnz_encountered++; - } - } - } - row_ptr[n_row] = nnz_encountered; - } // ToDo -- the two following functons are useful for debugging. 
I'm diff --git a/include/kernels/CPU/sp_gemm.hh b/include/kernels/CPU/sp_gemm.hh index 3de5ea5..6d9d011 100644 --- a/include/kernels/CPU/sp_gemm.hh +++ b/include/kernels/CPU/sp_gemm.hh @@ -11,6 +11,8 @@ namespace cpu { class sp_gemm : public ::gemm { public: using ::gemm::gemm; + using ::gemm::initInputMatricesSparse; + using ::gemm::toCSR; using ::gemm::m_; using ::gemm::n_; using ::gemm::k_; @@ -27,78 +29,10 @@ namespace cpu { B_ = (T*)malloc(sizeof(T) * n_ * n_); C_ = (T*)malloc(sizeof(T) * n_ * n_); - // Set initial values to 0 - for (int i = 0; i < (n_ * n_); i++) { - A_[i] = 0.0; - B_[i] = 0.0; - } - - // Random number generator objects for use in descent - std::default_random_engine gen; - gen.seed(std::chrono::system_clock::now() - .time_since_epoch().count()); - std::uniform_real_distribution dist(0.0, 1.0); - - // Work out number of edges needed to achieve target sparsity - int edges = 1 + (int) (n * n * (1 - sparsity)); - - // Initialise the matrices - // Using a=0.45 and b=c=0.22 as default probabilities - for (int i = 0; i < edges; i++) { - while (!rMat(A_, n, 0, n - 1, 0, n - 1, - 0.45, 0.22, 0.22, - &gen, dist, false)) {} - while (!rMat(B_, n, 0, n - 1, 0, n - 1, - 0.45, 0.22, 0.22, - &gen, dist, false)) {} - } + initInputMatricesSparse(sparsity); } private: - bool rMat(T* M, int n, int x1, int x2, int y1, int y2, - float a, float b, float c, std::default_random_engine* gen, - std::uniform_real_distribution dist, bool bin) { - // If a 1x1 submatrix, then add an edge and return out - if (x1 >= x2 && y1 >= y2) { - if (abs(M[(y1 * n) + x1]) > 0.1) { - return false; - } else { - // Add 1.0 if this is a binary graph, and a random real number otherwise - M[(int) (y1 * n) + x1] = (bin) ? 1.0 : (((rand() % 10000) / - 100.0) - 50.0); - return true; - } - } else { - // Divide up the matrix - int xMidPoint = x1 + floor((x2 - x1) / 2); - int yMidPoint = y1 + floor((y2 - y1) / 2); - - // ToDo -- add some noise to these values between iterations - float newA = a; - float newB = b; - float newC = c; - - // Work out which quarter to recurse into - // There are some ugly ternary operators here to avoid going out of bounds in the edge case - // that we are already at 1 width or 1 height - float randomNum = dist(*gen); - if (randomNum < a) { - return rMat(M, n, x1, xMidPoint, y1, yMidPoint, - newA, newB, newC, gen, dist, bin); - } else if (randomNum < (a + b)) { - return rMat(M, n, ((xMidPoint < x2) ? xMidPoint + 1 : xMidPoint), x2, y1, yMidPoint, - newA, newB, newC, gen, dist, bin); - } else if (randomNum < (a + b + c)) { - return rMat(M, n, x1, xMidPoint, ((yMidPoint < y2) ? yMidPoint + 1 : yMidPoint), y2, - newA, newB, newC, gen, dist, bin); - } else { - return rMat(M, n, ((xMidPoint < x2) ? xMidPoint + 1 : xMidPoint), x2, - ((yMidPoint < y2) ? yMidPoint + 1 : yMidPoint), y2, newA, newB, newC, - gen, dist, bin); - } - } - return true; - } /** Do any necessary cleanup (free pointers, close library handles, etc.) * after Kernel has been called. 
*/ void postCallKernelCleanup() { diff --git a/include/kernels/gemm.hh b/include/kernels/gemm.hh index 4eda90f..59a9898 100644 --- a/include/kernels/gemm.hh +++ b/include/kernels/gemm.hh @@ -4,6 +4,7 @@ #include #include #include +#include #include "../utilities.hh" @@ -86,9 +87,100 @@ class gemm { } } + void initInputMatricesSparse(float sparsity) { + for (int i = 0; i < (n_ * n_); i++) { + A_[i] = 0.0; + B_[i] = 0.0; + } + + // Random number generator objects for use in descent + std::default_random_engine gen; + gen.seed(std::chrono::system_clock::now() + .time_since_epoch().count()); + std::uniform_real_distribution dist(0.0, 1.0); + + int edges = 1 + (int) (n_ * n_ * (1 - sparsity)); + + // Using a=0.45 and b=c=0.22 as default probabilities + for (int i = 0; i < edges; i++) { + while (!rMat(A_, n_, 0, n_ - 1, 0, n_ - 1, + 0.45, 0.22, 0.22, + &gen, dist, false)) {} + } + for (int i = 0; i < edges; i++) { + while (!rMat(B_, n_, 0, n_ - 1, 0, n_ - 1, + 0.45, 0.22, 0.22, + &gen, dist, false)) {} + } + } + /** Call the extern consume() function. */ void callConsume() { consume((void*)A_, (void*)B_, (void*)C_); } + /** Recursive function to populate sparse matrices */ + bool rMat(T* M, int n, int x1, int x2, int y1, int y2, + float a, float b, float c, std::default_random_engine* gen, + std::uniform_real_distribution dist, bool bin) { + // If a 1x1 submatrix, then add an edge and return out + if (x1 >= x2 && y1 >= y2) { + if (abs(M[(y1 * n) + x1]) > 0.1) { + return false; + } else { + // Add 1.0 if this is a binary graph, and a random real number otherwise + M[(int) (y1 * n) + x1] = (bin) ? 1.0 : (((rand() % 10000) / + 100.0) - 50.0); + return true; + } + } else { + // Divide up the matrix + int xMidPoint = x1 + floor((x2 - x1) / 2); + int yMidPoint = y1 + floor((y2 - y1) / 2); + + // ToDo -- add some noise to these values between iterations + float newA = a; + float newB = b; + float newC = c; + + // Work out which quarter to recurse into + // There are some ugly ternary operators here to avoid going out of bounds in the edge case + // that we are already at 1 width or 1 height + float randomNum = dist(*gen); + if (randomNum < a) { + return rMat(M, n, x1, xMidPoint, y1, yMidPoint, + newA, newB, newC, gen, dist, bin); + } else if (randomNum < (a + b)) { + return rMat(M, n, ((xMidPoint < x2) ? xMidPoint + 1 : xMidPoint), x2, y1, yMidPoint, + newA, newB, newC, gen, dist, bin); + } else if (randomNum < (a + b + c)) { + return rMat(M, n, x1, xMidPoint, ((yMidPoint < y2) ? yMidPoint + 1 : yMidPoint), y2, + newA, newB, newC, gen, dist, bin); + } else { + return rMat(M, n, ((xMidPoint < x2) ? xMidPoint + 1 : xMidPoint), x2, + ((yMidPoint < y2) ? yMidPoint + 1 : yMidPoint), y2, newA, newB, newC, + gen, dist, bin); + } + } + return true; + } + + void toCSR(T* dense, int n_col, int n_row, int nnz, T* vals, int* col_index, + int* row_ptr) { + int nnz_encountered = 0; + for (int row = 0; row < n_row; row++) { + row_ptr[row] = nnz_encountered; + int nnz_row = 0; + for (int col = 0; col < n_col; col++) { + if (dense[(row * n_col) + col] != 0.0) { + nnz_row++; + col_index[nnz_encountered] = col; + vals[nnz_encountered] = dense[(row * n_col) + col]; + nnz_encountered++; + } + } + } + row_ptr[n_row] = nnz_encountered; + } + /** The number of iterations to perform per problem size. 
*/ const int iterations_; diff --git a/oneMKL/CPU/sp_gemm.hh b/oneMKL/CPU/sp_gemm.hh index 847006b..5ac6a70 100644 --- a/oneMKL/CPU/sp_gemm.hh +++ b/oneMKL/CPU/sp_gemm.hh @@ -14,20 +14,17 @@ template class sp_gemm_cpu : public sp_gemm { public: using sp_gemm::sp_gemm; - using sp_gemm::initInputMatrices; + using sp_gemm::initInputMatricesSparse; + using sp_gemm::toCSR; using sp_gemm::callConsume; - using sp_gemm::m_; using sp_gemm::n_; - using sp_gemm::k_; using sp_gemm::A_; using sp_gemm::B_; using sp_gemm::C_; /** Initialise the required data structures. */ - void initialise(int m, int n, int k) { - m_ = m; + void initialise(int n, float sparsity) { n_ = n; - k_ = k; A_ = (T*)mkl_malloc(sizeof(T) * m_ * k_, 64); B_ = (T*)mkl_malloc(sizeof(T) * k_ * n_, 64); From 30d384e22573067f0b32ee7aeb30811a44b39781 Mon Sep 17 00:00:00 2001 From: Alex Cockrean <84676155+ABenC377@users.noreply.github.com> Date: Fri, 11 Oct 2024 15:39:46 +0100 Subject: [PATCH 17/32] rebasing --- cuBLAS/sp_gemm.hh | 17 +++++++-- include/doGemm.hh | 82 +++++++++++++++++++++++------------------ include/kernels/gemm.hh | 49 +++++++++--------------- src/main.cc | 4 +- 4 files changed, 80 insertions(+), 72 deletions(-) diff --git a/cuBLAS/sp_gemm.hh b/cuBLAS/sp_gemm.hh index 992b018..aa095f8 100644 --- a/cuBLAS/sp_gemm.hh +++ b/cuBLAS/sp_gemm.hh @@ -36,6 +36,7 @@ class sp_gemm_gpu : public sp_gemm { * - Unified: Initialise data as unified memory; no data movement semantics * required */ void initialise(gpuOffloadType offload, int n, float sparsity) override { + std::cout << "___________Initialising, problem size = " << n << std::endl; offload_ = offload; if (std::is_same_v) cudaDataType_ = CUDA_R_32F; @@ -46,9 +47,11 @@ class sp_gemm_gpu : public sp_gemm { } n_ = n * 20; + std::cout << "\tGetting device" << std::endl; // Get device identifier cudaCheckError(cudaGetDevice(&gpuDevice_)); + std::cout << "\tMaking streams" << std::endl; // Initialise 3 streams to asynchronously move data between host and device cudaCheckError(cudaStreamCreate(&s1_)); cudaCheckError(cudaStreamCreate(&s2_)); @@ -59,6 +62,7 @@ class sp_gemm_gpu : public sp_gemm { // Work out number of edges needed to achieve target sparsity A_nnz_ = B_nnz_ = 1 + (int) (n_ * n_ * (1 - sparsity)); + std::cout << "\tMallocing" << std::endl; if (offload_ == gpuOffloadType::unified) { cudaCheckError(cudaMallocManaged(&A_val_, sizeof(T) * A_nnz_)); cudaCheckError(cudaMallocManaged(&A_col_, sizeof(int) * A_nnz_)); @@ -106,8 +110,11 @@ class sp_gemm_gpu : public sp_gemm { // Set initial values to 0 A_ = (T*)malloc(sizeof(T) * n_ * n_); B_ = (T*)malloc(sizeof(T) * n_ * n_); + + std::cout << "\tInitialising start matrices" << std::endl; initInputMatricesSparse(sparsity); + std::cout << "\tConverting to CSR" << std::endl; toCSR(A_, n_, n_, A_nnz_, A_val_, A_col_, A_row_); toCSR(B_, n_, n_, B_nnz_, B_val_, B_col_, B_row_); @@ -132,7 +139,7 @@ class sp_gemm_gpu : public sp_gemm { /** Perform any required steps before calling the GEMM kernel that should * be timed. */ void preLoopRequirements() override { - + std::cout << "\t\tpre loop" << std::endl; switch(offload_) { case gpuOffloadType::always: { // Make matrix descriptors @@ -217,6 +224,7 @@ class sp_gemm_gpu : public sp_gemm { /** Make a call to the BLAS Library Kernel. 
*/ void callGemm() override { + std::cout << "\t\tGEMM" << std::endl; switch(offload_) { case gpuOffloadType::always: { cudaCheckError(cudaMemcpyAsync(A_val_dev_, A_val_, sizeof(T) * @@ -444,6 +452,7 @@ class sp_gemm_gpu : public sp_gemm { /** Perform any required steps after calling the GEMM kernel that should * be timed. */ void postLoopRequirements() override { + std::cout << "\t\tpost loop" << std::endl; cusparseCheckError(cusparseSpGEMM_destroyDescr(spgemmDesc_)); // Destroying descriptors cusparseCheckError(cusparseDestroySpMat(descrA_)); @@ -511,6 +520,7 @@ class sp_gemm_gpu : public sp_gemm { /** Do any necessary cleanup (free pointers, close library handles, etc.) * after Kernel has been called. */ void postCallKernelCleanup() override { + std::cout << "\t\tcleaning up" << std::endl; // Destroy the handle cusparseCheckError(cusparseDestroy(handle_)); @@ -519,6 +529,9 @@ class sp_gemm_gpu : public sp_gemm { cudaCheckError(cudaStreamDestroy(s2_)); cudaCheckError(cudaStreamDestroy(s3_)); + free(A_); + free(B_); + if (offload_ == gpuOffloadType::unified) { cudaCheckError(cudaFree(A_val_)); cudaCheckError(cudaFree(A_col_)); @@ -551,8 +564,6 @@ class sp_gemm_gpu : public sp_gemm { } } - - // ToDo -- the two following functons are useful for debugging. I'm // keeping them in to that end, though they are not used by the benchmark // itself diff --git a/include/doGemm.hh b/include/doGemm.hh index 8153651..f4ec053 100644 --- a/include/doGemm.hh +++ b/include/doGemm.hh @@ -34,13 +34,16 @@ class doGemm { public: doGemm(const std::string csvDir, const int iters, const int startDim, const int upperLimit, const bool cpuEnabled = true, - const bool gpuEnabled = true) + const bool gpuEnabled = true, const bool doDense = true, + const bool doSparse = true) : CSV_DIR(csvDir), iterations_(iters), startDimention_(startDim), upperLimit_(upperLimit), doCPU_(cpuEnabled), - doGPU_(gpuEnabled) + doGPU_(gpuEnabled), + doDense_(dense), + doSparse_(sparse), #if CPU_ENABLED , gemmCpu_(iterations_), @@ -59,27 +62,28 @@ class doGemm { /** Run all problem types and write data to CSV files. */ void collectData() { - // Square Problem Sizes... - // Re-initialise offload threshold structures & previous results - cpuGpu_always_ = cpuGpu_offloadThreshold(); - cpuGpu_once_ = cpuGpu_offloadThreshold(); - cpuGpu_unified_ = cpuGpu_offloadThreshold(); - prev_gpuResult_always = time_checksum_gflop(); - prev_gpuResult_once = time_checksum_gflop(); - prev_gpuResult_unified = time_checksum_gflop(); - std::ofstream csvFile = initCSVFile(CSV_DIR + "/" + getKernelName() + - "_square_square_M=N=K.csv"); - for (int dim = startDimention_; dim <= upperLimit_; dim++) { - // M = dim, N = dim, K = dim; - callDenseKernels(csvFile, dim, dim, dim); - } - // Close file - csvFile.close(); + if (doDense_) { + // Square Problem Sizes... 
+ // Re-initialise offload threshold structures + cpuGpu_always_ = cpuGpu_offloadThreshold(); + cpuGpu_once_ = cpuGpu_offloadThreshold(); + cpuGpu_unified_ = cpuGpu_offloadThreshold(); + prev_gpuResult_always = time_checksum_gflop(); + prev_gpuResult_once = time_checksum_gflop(); + prev_gpuResult_unified = time_checksum_gflop(); + std::ofstream csvFile = initCSVFile(CSV_DIR + "/" + getKernelName() + + "_square_square_M=N=K.csv"); + for (int dim = startDimention_; dim <= upperLimit_; dim++) { + // M = dim, N = dim, K = dim; + callDenseKernels(csvFile, dim, dim, dim); + } + // Close file + csvFile.close(); #if CPU_ENABLED && GPU_ENABLED - if (doCPU_ && doGPU_) { - // Print offload results to stdout - printOffloadThreshold("Square x Square (M=N=K)"); - } + if (doCPU_ && doGPU_) { + // Print offload results to stdout + printOffloadThreshold("Square x Square (M=N=K)"); + } #endif // Rectangular Problem Sizes: @@ -267,6 +271,7 @@ class doGemm { if (doCPU_ && doGPU_) { // Print offload results to stdout printOffloadThreshold("Square x Short-and-Wide (M=K, N=16K)"); + } #endif // Square x Short and Wide // Re-initialise offload threshold structures & previous results @@ -292,27 +297,28 @@ class doGemm { printOffloadThreshold("Square x Short-and-Wide (M=K=32, N)"); } #endif + } -// Square sparse matrix - sparse matrix multiplication - cpuGpu_always_ = cpuGpu_offloadThreshold(); - cpuGpu_once_ = cpuGpu_offloadThreshold(); - cpuGpu_unified_ = cpuGpu_offloadThreshold(); - csvFile = initCSVFile(std::string(CSV_DIR) + "/" + getKernelName() + - "_sparse_square.csv"); - if (upperLimit_ >= 32) { - for (int dim = 1; dim <= upperLimit_; dim++) { - const int N = dim; - callSparseKernels(csvFile, N, 0.99); + if (doSparse_) { // Square sparse matrix - sparse matrix multiplication + cpuGpu_always_ = cpuGpu_offloadThreshold(); + cpuGpu_once_ = cpuGpu_offloadThreshold(); + cpuGpu_unified_ = cpuGpu_offloadThreshold(); + csvFile = initCSVFile(std::string(CSV_DIR) + "/" + getKernelName() + + "_sparse_square.csv"); + if (upperLimit_ >= 32) { + for (int dim = startDimention_; dim <= upperLimit_; dim++) { + callSparseKernels(csvFile, dim, 0.99); + } } - } - // Close file - csvFile.close(); + // Close file + csvFile.close(); #if CPU_ENABLED && GPU_ENABLED - if (doCPU_ && dpGPU_) { + if (doCPU_ && doGPU_) { // Print offload results to stdout printOffloadThreshold("Sparse Square"); } #endif + } } private: @@ -693,6 +699,10 @@ class doGemm { /** Whether the GPU kernels should be run. */ const bool doGPU_ = true; + /** Whether we should run dense and or sparse kernels */ + const bool doDense_; + const bool doSparse_; + #if CPU_ENABLED /** The GEMM CPU kernel. 
*/ cpu::gemm_cpu gemmCpu_; diff --git a/include/kernels/gemm.hh b/include/kernels/gemm.hh index 59a9898..3ffc0d7 100644 --- a/include/kernels/gemm.hh +++ b/include/kernels/gemm.hh @@ -103,14 +103,8 @@ class gemm { // Using a=0.45 and b=c=0.22 as default probabilities for (int i = 0; i < edges; i++) { - while (!rMat(A_, n_, 0, n_ - 1, 0, n_ - 1, - 0.45, 0.22, 0.22, - &gen, dist, false)) {} - } - for (int i = 0; i < edges; i++) { - while (!rMat(B_, n_, 0, n_ - 1, 0, n_ - 1, - 0.45, 0.22, 0.22, - &gen, dist, false)) {} + rMat(A_, n_, 0, n_ - 1, 0, n_ - 1, 0.45, 0.22, 0.22, &gen, dist, false); + rMat(B_, n_, 0, n_ - 1, 0, n_ - 1, 0.45, 0.22, 0.22, &gen, dist, false); } } @@ -118,23 +112,18 @@ class gemm { void callConsume() { consume((void*)A_, (void*)B_, (void*)C_); } /** Recursive function to populate sparse matrices */ - bool rMat(T* M, int n, int x1, int x2, int y1, int y2, - float a, float b, float c, std::default_random_engine* gen, + void rMat(T* M, int n, int x1, int x2, int y1, int y2, float a, float b, + float c, std::default_random_engine* gen, std::uniform_real_distribution dist, bool bin) { // If a 1x1 submatrix, then add an edge and return out if (x1 >= x2 && y1 >= y2) { - if (abs(M[(y1 * n) + x1]) > 0.1) { - return false; - } else { - // Add 1.0 if this is a binary graph, and a random real number otherwise - M[(int) (y1 * n) + x1] = (bin) ? 1.0 : (((rand() % 10000) / + M[(int) (y1 * n) + x1] = (bin) ? 1.0 : (((rand() % 10000) / 100.0) - 50.0); - return true; - } + return; } else { // Divide up the matrix - int xMidPoint = x1 + floor((x2 - x1) / 2); - int yMidPoint = y1 + floor((y2 - y1) / 2); + int xMidPoint = (x1 == x2) ? x1 : x1 + floor((x2 - x1) / 2); + int yMidPoint = (y1 == y2) ? y1 : y1 + floor((y2 - y1) / 2); // ToDo -- add some noise to these values between iterations float newA = a; @@ -142,25 +131,23 @@ class gemm { float newC = c; // Work out which quarter to recurse into - // There are some ugly ternary operators here to avoid going out of bounds in the edge case - // that we are already at 1 width or 1 height + // There are some ugly ternary operators here to avoid going out of + // bounds in the edge case that we are already at 1 width or 1 height float randomNum = dist(*gen); if (randomNum < a) { - return rMat(M, n, x1, xMidPoint, y1, yMidPoint, - newA, newB, newC, gen, dist, bin); + rMat(M, n, x1, xMidPoint, y1, yMidPoint, newA, newB, newC, gen, dist, + bin); } else if (randomNum < (a + b)) { - return rMat(M, n, ((xMidPoint < x2) ? xMidPoint + 1 : xMidPoint), x2, y1, yMidPoint, - newA, newB, newC, gen, dist, bin); + rMat(M, n, xMidPoint, x2, y1, yMidPoint, newA, newB, newC, gen, dist, + bin); } else if (randomNum < (a + b + c)) { - return rMat(M, n, x1, xMidPoint, ((yMidPoint < y2) ? yMidPoint + 1 : yMidPoint), y2, - newA, newB, newC, gen, dist, bin); + rMat(M, n, x1, xMidPoint, yMidPoint, y2, newA, newB, newC, gen, + dist, bin); } else { - return rMat(M, n, ((xMidPoint < x2) ? xMidPoint + 1 : xMidPoint), x2, - ((yMidPoint < y2) ? 
yMidPoint + 1 : yMidPoint), y2, newA, newB, newC, - gen, dist, bin); + rMat(M, n, xMidPoint, x2, yMidPoint, y2, newA, newB, newC, gen, + dist, bin); } } - return true; } void toCSR(T* dense, int n_col, int n_row, int nnz, T* vals, int* col_index, diff --git a/src/main.cc b/src/main.cc index a4eb55b..268b628 100644 --- a/src/main.cc +++ b/src/main.cc @@ -37,14 +37,14 @@ int main(int argc, char** argv) { // SGEMM Comparison std::cout << std::endl << "Comparing SGEMM Kernels:" << std::endl; doGemm sgemm(std::string(absPath), iters, startDim, upperLimit, doCpu, - doGpu); + doGpu, sgemm, sp_sgemm); sgemm.collectData(); std::cout << "Finished!" << std::endl; // DGEMM Comparison std::cout << std::endl << "Comparing DGEMM Kernels:" << std::endl; doGemm dgemm(std::string(absPath), iters, startDim, upperLimit, doCpu, - doGpu); + doGpu, dgemm, sp_dgemm); dgemm.collectData(); std::cout << "Finished!" << std::endl; From cc8e2a86347ca35b598b462724b5c3c71fb9a659 Mon Sep 17 00:00:00 2001 From: Alex Cockrean <84676155+ABenC377@users.noreply.github.com> Date: Fri, 11 Oct 2024 15:43:02 +0100 Subject: [PATCH 18/32] rebasing --- cuBLAS/sp_gemm.hh | 16 +++------------- include/doGemm.hh | 4 ++-- include/kernels/gemm.hh | 34 ++++++++++++++++++++-------------- src/main.cc | 32 ++++++++++++++++++-------------- 4 files changed, 43 insertions(+), 43 deletions(-) diff --git a/cuBLAS/sp_gemm.hh b/cuBLAS/sp_gemm.hh index aa095f8..2c787d9 100644 --- a/cuBLAS/sp_gemm.hh +++ b/cuBLAS/sp_gemm.hh @@ -36,7 +36,6 @@ class sp_gemm_gpu : public sp_gemm { * - Unified: Initialise data as unified memory; no data movement semantics * required */ void initialise(gpuOffloadType offload, int n, float sparsity) override { - std::cout << "___________Initialising, problem size = " << n << std::endl; offload_ = offload; if (std::is_same_v) cudaDataType_ = CUDA_R_32F; @@ -45,13 +44,11 @@ class sp_gemm_gpu : public sp_gemm { std::cout << "INVALID DATA TYPE PASSED TO cuSPARSE" << std::endl; exit(1); } - n_ = n * 20; + n_ = n; - std::cout << "\tGetting device" << std::endl; // Get device identifier cudaCheckError(cudaGetDevice(&gpuDevice_)); - std::cout << "\tMaking streams" << std::endl; // Initialise 3 streams to asynchronously move data between host and device cudaCheckError(cudaStreamCreate(&s1_)); cudaCheckError(cudaStreamCreate(&s2_)); @@ -62,7 +59,6 @@ class sp_gemm_gpu : public sp_gemm { // Work out number of edges needed to achieve target sparsity A_nnz_ = B_nnz_ = 1 + (int) (n_ * n_ * (1 - sparsity)); - std::cout << "\tMallocing" << std::endl; if (offload_ == gpuOffloadType::unified) { cudaCheckError(cudaMallocManaged(&A_val_, sizeof(T) * A_nnz_)); cudaCheckError(cudaMallocManaged(&A_col_, sizeof(int) * A_nnz_)); @@ -111,13 +107,11 @@ class sp_gemm_gpu : public sp_gemm { A_ = (T*)malloc(sizeof(T) * n_ * n_); B_ = (T*)malloc(sizeof(T) * n_ * n_); - std::cout << "\tInitialising start matrices" << std::endl; initInputMatricesSparse(sparsity); - std::cout << "\tConverting to CSR" << std::endl; - toCSR(A_, n_, n_, A_nnz_, A_val_, A_col_, A_row_); + toCSR(A_, n_, n_, A_val_, A_col_, A_row_); - toCSR(B_, n_, n_, B_nnz_, B_val_, B_col_, B_row_); + toCSR(B_, n_, n_, B_val_, B_col_, B_row_); // std::cout << "_____Matrix A_____" << std::endl; @@ -139,7 +133,6 @@ class sp_gemm_gpu : public sp_gemm { /** Perform any required steps before calling the GEMM kernel that should * be timed. 
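   * For the cuSPARSE path this is where the CSR matrix descriptors are set up
   * for whichever offload strategy is in use.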
*/ void preLoopRequirements() override { - std::cout << "\t\tpre loop" << std::endl; switch(offload_) { case gpuOffloadType::always: { // Make matrix descriptors @@ -224,7 +217,6 @@ class sp_gemm_gpu : public sp_gemm { /** Make a call to the BLAS Library Kernel. */ void callGemm() override { - std::cout << "\t\tGEMM" << std::endl; switch(offload_) { case gpuOffloadType::always: { cudaCheckError(cudaMemcpyAsync(A_val_dev_, A_val_, sizeof(T) * @@ -452,7 +444,6 @@ class sp_gemm_gpu : public sp_gemm { /** Perform any required steps after calling the GEMM kernel that should * be timed. */ void postLoopRequirements() override { - std::cout << "\t\tpost loop" << std::endl; cusparseCheckError(cusparseSpGEMM_destroyDescr(spgemmDesc_)); // Destroying descriptors cusparseCheckError(cusparseDestroySpMat(descrA_)); @@ -520,7 +511,6 @@ class sp_gemm_gpu : public sp_gemm { /** Do any necessary cleanup (free pointers, close library handles, etc.) * after Kernel has been called. */ void postCallKernelCleanup() override { - std::cout << "\t\tcleaning up" << std::endl; // Destroy the handle cusparseCheckError(cusparseDestroy(handle_)); diff --git a/include/doGemm.hh b/include/doGemm.hh index f4ec053..53bbb54 100644 --- a/include/doGemm.hh +++ b/include/doGemm.hh @@ -42,8 +42,8 @@ class doGemm { upperLimit_(upperLimit), doCPU_(cpuEnabled), doGPU_(gpuEnabled), - doDense_(dense), - doSparse_(sparse), + doDense_(doDense), + doSparse_(doSparse) #if CPU_ENABLED , gemmCpu_(iterations_), diff --git a/include/kernels/gemm.hh b/include/kernels/gemm.hh index 3ffc0d7..230c7d3 100644 --- a/include/kernels/gemm.hh +++ b/include/kernels/gemm.hh @@ -103,8 +103,10 @@ class gemm { // Using a=0.45 and b=c=0.22 as default probabilities for (int i = 0; i < edges; i++) { - rMat(A_, n_, 0, n_ - 1, 0, n_ - 1, 0.45, 0.22, 0.22, &gen, dist, false); - rMat(B_, n_, 0, n_ - 1, 0, n_ - 1, 0.45, 0.22, 0.22, &gen, dist, false); + while (!rMat(A_, n_, 0, n_ - 1, 0, n_ - 1, 0.45, 0.22, 0.22, &gen, dist, + false)) {} + while (!rMat(B_, n_, 0, n_ - 1, 0, n_ - 1, 0.45, 0.22, 0.22, &gen, dist, + false)){} } } @@ -112,14 +114,18 @@ class gemm { void callConsume() { consume((void*)A_, (void*)B_, (void*)C_); } /** Recursive function to populate sparse matrices */ - void rMat(T* M, int n, int x1, int x2, int y1, int y2, float a, float b, + bool rMat(T* M, int n, int x1, int x2, int y1, int y2, float a, float b, float c, std::default_random_engine* gen, std::uniform_real_distribution dist, bool bin) { // If a 1x1 submatrix, then add an edge and return out if (x1 >= x2 && y1 >= y2) { - M[(int) (y1 * n) + x1] = (bin) ? 1.0 : (((rand() % 10000) / + if (M[(int) (y1 * n) + x1] == 0) { + M[(int) (y1 * n) + x1] = (bin) ? 1.0 : (((rand() % 10000) / 100.0) - 50.0); - return; + return true; + } else { + return false; + } } else { // Divide up the matrix int xMidPoint = (x1 == x2) ? 
x1 : x1 + floor((x2 - x1) / 2); @@ -135,22 +141,22 @@ class gemm { // bounds in the edge case that we are already at 1 width or 1 height float randomNum = dist(*gen); if (randomNum < a) { - rMat(M, n, x1, xMidPoint, y1, yMidPoint, newA, newB, newC, gen, dist, - bin); + return rMat(M, n, x1, xMidPoint, y1, yMidPoint, newA, newB, newC, + gen, dist, bin); } else if (randomNum < (a + b)) { - rMat(M, n, xMidPoint, x2, y1, yMidPoint, newA, newB, newC, gen, dist, - bin); + return rMat(M, n, xMidPoint, x2, y1, yMidPoint, newA, newB, newC, + gen, dist, bin); } else if (randomNum < (a + b + c)) { - rMat(M, n, x1, xMidPoint, yMidPoint, y2, newA, newB, newC, gen, - dist, bin); + return rMat(M, n, x1, xMidPoint, yMidPoint, y2, newA, newB, newC, gen, + dist, bin); } else { - rMat(M, n, xMidPoint, x2, yMidPoint, y2, newA, newB, newC, gen, - dist, bin); + return rMat(M, n, xMidPoint, x2, yMidPoint, y2, newA, newB, newC, + gen, dist, bin); } } } - void toCSR(T* dense, int n_col, int n_row, int nnz, T* vals, int* col_index, + void toCSR(T* dense, int n_col, int n_row, T* vals, int* col_index, int* row_ptr) { int nnz_encountered = 0; for (int row = 0; row < n_row; row++) { diff --git a/src/main.cc b/src/main.cc index 268b628..06fd48e 100644 --- a/src/main.cc +++ b/src/main.cc @@ -3,10 +3,10 @@ int iters = 10; int startDim = 1; int upperLimit = 128; -bool sgemm = true; -bool dgemm = true; -bool sp_sgemm = true; -bool sp_dgemm = true; +bool doSgemm = true; +bool doDgemm = true; +bool doSp_sgemm = true; +bool doSp_dgemm = true; bool doCpu = CPU_ENABLED; bool doGpu = GPU_ENABLED; @@ -37,14 +37,14 @@ int main(int argc, char** argv) { // SGEMM Comparison std::cout << std::endl << "Comparing SGEMM Kernels:" << std::endl; doGemm sgemm(std::string(absPath), iters, startDim, upperLimit, doCpu, - doGpu, sgemm, sp_sgemm); + doGpu, doSgemm, doSp_sgemm); sgemm.collectData(); std::cout << "Finished!" << std::endl; // DGEMM Comparison std::cout << std::endl << "Comparing DGEMM Kernels:" << std::endl; doGemm dgemm(std::string(absPath), iters, startDim, upperLimit, doCpu, - doGpu, dgemm, sp_dgemm); + doGpu, doDgemm, doSp_dgemm); dgemm.collectData(); std::cout << "Finished!" 
<< std::endl; @@ -146,28 +146,28 @@ void getParameters(int argc, char** argv) { } else if (!strcmp(argv[i], "--no_gpu")) { doGpu = false; } else if (!strcmp(argv[i], "--kernels") || !strcmp(argv[i], "-k")) { - sgemm = dgemm = sp_sgemm = sp_dgemm = false; + doSgemm = doDgemm = doSp_sgemm = doSp_dgemm = false; std::string kernelList = argv[++i]; if (kernelList.find("sp-sgemm") != std::string::npos) { - sp_sgemm = true; + doSp_sgemm = true; if (kernelList.find("sgemm") != std::string::npos && kernelList.find("sgemm") != kernelList.find("sp-sgemm") + 3) { - sgemm = true; + doSgemm = true; } } else if (kernelList.find("sgemm") != std::string::npos) { - sgemm = true; + doSgemm = true; } if (kernelList.find("sp-dgemm") != std::string::npos) { - sp_dgemm = true; + doSp_dgemm = true; if (kernelList.find("dgemm") != std::string::npos && kernelList.find("dgemm") != kernelList.find("sp-dgemm") + 3) { - dgemm = true; + doDgemm = true; } } else if (kernelList.find("dgemm") != std::string::npos) { - dgemm = true; + doDgemm = true; } - if (!sgemm && !dgemm && !sp_sgemm && !sp_dgemm) { + if (!doSgemm && !doDgemm && !doSp_sgemm && !doSp_dgemm) { std::cout << "ERROR - no implemented kernels in list" << std::endl; exit(1); } @@ -200,6 +200,10 @@ void getParameters(int argc, char** argv) { std::cout << " -d --dimension_limit D Max value of M, N, K is D " "(default: " << upperLimit << ")" << std::endl; + std::cout << " -k --kernels Comma-separated list of " + "kernels to be run. Options are sgemm, dgemm, sp-sgemm, " + "sp-dgemm (default: sgemm,dgemm,sp-gemm,sp-dgemm)" << + std::endl; std::cout << std::endl; exit(0); } else { From de56ae19b2934221195fdd4b020f0d33f97879a5 Mon Sep 17 00:00:00 2001 From: Alex Cockrean <84676155+ABenC377@users.noreply.github.com> Date: Fri, 11 Oct 2024 15:44:12 +0100 Subject: [PATCH 19/32] rebasing --- cuBLAS/sp_gemm.hh | 27 +++++++++++++++++++-------- include/doGemm.hh | 2 +- include/kernels/gemm.hh | 38 +++++++++++++++++++++----------------- 3 files changed, 41 insertions(+), 26 deletions(-) diff --git a/cuBLAS/sp_gemm.hh b/cuBLAS/sp_gemm.hh index 2c787d9..8bed12b 100644 --- a/cuBLAS/sp_gemm.hh +++ b/cuBLAS/sp_gemm.hh @@ -26,7 +26,7 @@ class sp_gemm_gpu : public sp_gemm { using sp_gemm::C_; using sp_gemm::offload_; - // ToDo -- No checksum for sparse yet. Nedd to do + // ToDo -- No checksum for sparse yet. Need to do /** Initialise the required data structures. * `offload` refers to the data offload type: @@ -44,7 +44,7 @@ class sp_gemm_gpu : public sp_gemm { std::cout << "INVALID DATA TYPE PASSED TO cuSPARSE" << std::endl; exit(1); } - n_ = n; + n_ = 100 * n; // Get device identifier cudaCheckError(cudaGetDevice(&gpuDevice_)); @@ -133,6 +133,7 @@ class sp_gemm_gpu : public sp_gemm { /** Perform any required steps before calling the GEMM kernel that should * be timed. */ void preLoopRequirements() override { + cusparseCheckError(cusparseSpGEMM_createDescr(&spgemmDesc_)); switch(offload_) { case gpuOffloadType::always: { // Make matrix descriptors @@ -212,13 +213,17 @@ class sp_gemm_gpu : public sp_gemm { break; } } - cusparseCheckError(cusparseSpGEMM_createDescr(&spgemmDesc_)); } /** Make a call to the BLAS Library Kernel. 
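   * In the always-offload case any matrix descriptors surviving from the
   * previous call are destroyed before fresh host-to-device copies of the CSR
   * buffers are issued on the three streams.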
*/ void callGemm() override { switch(offload_) { case gpuOffloadType::always: { + if (C_mem_allocated_always_) { + cusparseCheckError(cusparseDestroySpMat(descrA_)); + cusparseCheckError(cusparseDestroySpMat(descrB_)); + cusparseCheckError(cusparseDestroySpMat(descrC_)); + } cudaCheckError(cudaMemcpyAsync(A_val_dev_, A_val_, sizeof(T) * A_nnz_, cudaMemcpyHostToDevice, s1_)); cudaCheckError(cudaMemcpyAsync(A_col_dev_, A_col_, sizeof(int) * @@ -235,6 +240,7 @@ class sp_gemm_gpu : public sp_gemm { cudaCheckError(cudaMemcpyAsync(C_row_dev_, C_row_, sizeof(int) * (n_ + 1), cudaMemcpyHostToDevice, s3_)); + cudaCheckError(cudaDeviceSynchronize()); // Make matrix descriptors cusparseCheckError( @@ -444,10 +450,6 @@ class sp_gemm_gpu : public sp_gemm { /** Perform any required steps after calling the GEMM kernel that should * be timed. */ void postLoopRequirements() override { - cusparseCheckError(cusparseSpGEMM_destroyDescr(spgemmDesc_)); - // Destroying descriptors - cusparseCheckError(cusparseDestroySpMat(descrA_)); - cusparseCheckError(cusparseDestroySpMat(descrB_)); switch(offload_) { case gpuOffloadType::always: { break; @@ -476,10 +478,14 @@ class sp_gemm_gpu : public sp_gemm { cudaCheckError(cudaMemcpyAsync(C_row_, C_row_dev_, sizeof(int) * (n_ + 1), cudaMemcpyDeviceToHost, s3_)); cudaCheckError(cudaDeviceSynchronize()); + + cusparseCheckError(cusparseDestroySpMat(descrA_)); + cusparseCheckError(cusparseDestroySpMat(descrB_)); + cusparseCheckError(cusparseDestroySpMat(descrC_)); + break; } case gpuOffloadType::unified: { - cusparseCheckError(cusparseDestroySpMat(descrC_)); // Ensure all data resides on host once work has completed cudaCheckError(cudaMemPrefetchAsync(A_val_, sizeof(T) * A_nnz_, cudaCpuDeviceId, s1_)); @@ -503,9 +509,14 @@ class sp_gemm_gpu : public sp_gemm { cudaCpuDeviceId, s3_)); // Ensure device has finished all work. cudaCheckError(cudaDeviceSynchronize()); + + cusparseCheckError(cusparseDestroySpMat(descrA_)); + cusparseCheckError(cusparseDestroySpMat(descrB_)); + cusparseCheckError(cusparseDestroySpMat(descrC_)); break; } } + cusparseCheckError(cusparseSpGEMM_destroyDescr(spgemmDesc_)); } /** Do any necessary cleanup (free pointers, close library handles, etc.) diff --git a/include/doGemm.hh b/include/doGemm.hh index 53bbb54..b89abee 100644 --- a/include/doGemm.hh +++ b/include/doGemm.hh @@ -307,7 +307,7 @@ class doGemm { "_sparse_square.csv"); if (upperLimit_ >= 32) { for (int dim = startDimention_; dim <= upperLimit_; dim++) { - callSparseKernels(csvFile, dim, 0.99); + callSparseKernels(csvFile, dim, 0.9999); } } // Close file diff --git a/include/kernels/gemm.hh b/include/kernels/gemm.hh index 230c7d3..2a971a0 100644 --- a/include/kernels/gemm.hh +++ b/include/kernels/gemm.hh @@ -106,7 +106,7 @@ class gemm { while (!rMat(A_, n_, 0, n_ - 1, 0, n_ - 1, 0.45, 0.22, 0.22, &gen, dist, false)) {} while (!rMat(B_, n_, 0, n_ - 1, 0, n_ - 1, 0.45, 0.22, 0.22, &gen, dist, - false)){} + false)) {} } } @@ -119,17 +119,19 @@ class gemm { std::uniform_real_distribution dist, bool bin) { // If a 1x1 submatrix, then add an edge and return out if (x1 >= x2 && y1 >= y2) { - if (M[(int) (y1 * n) + x1] == 0) { - M[(int) (y1 * n) + x1] = (bin) ? 
1.0 : (((rand() % 10000) / - 100.0) - 50.0); - return true; - } else { + // Needed to avoid overfloe segfaults with large problem sizes + uint64_t index = (((uint64_t)y1 * (uint64_t)n) + (uint64_t)x1); + if (abs(M[index]) > 0.1) { return false; + } else { + // Add 1.0 if this is a binary graph, and a random real number otherwise + M[index] = (bin) ? 1.0 : (((rand() % 10000) / 100.0) - 50.0); + return true; } } else { // Divide up the matrix - int xMidPoint = (x1 == x2) ? x1 : x1 + floor((x2 - x1) / 2); - int yMidPoint = (y1 == y2) ? y1 : y1 + floor((y2 - y1) / 2); + int xMidPoint = x1 + floor((x2 - x1) / 2); + int yMidPoint = y1 + floor((y2 - y1) / 2); // ToDo -- add some noise to these values between iterations float newA = a; @@ -137,23 +139,25 @@ class gemm { float newC = c; // Work out which quarter to recurse into - // There are some ugly ternary operators here to avoid going out of - // bounds in the edge case that we are already at 1 width or 1 height + // There are some ugly ternary operators here to avoid going out of bounds in the edge case + // that we are already at 1 width or 1 height float randomNum = dist(*gen); if (randomNum < a) { - return rMat(M, n, x1, xMidPoint, y1, yMidPoint, newA, newB, newC, - gen, dist, bin); + return rMat(M, n, x1, xMidPoint, y1, yMidPoint, + newA, newB, newC, gen, dist, bin); } else if (randomNum < (a + b)) { - return rMat(M, n, xMidPoint, x2, y1, yMidPoint, newA, newB, newC, - gen, dist, bin); + return rMat(M, n, ((xMidPoint < x2) ? xMidPoint + 1 : xMidPoint), x2, y1, yMidPoint, + newA, newB, newC, gen, dist, bin); } else if (randomNum < (a + b + c)) { - return rMat(M, n, x1, xMidPoint, yMidPoint, y2, newA, newB, newC, gen, - dist, bin); + return rMat(M, n, x1, xMidPoint, ((yMidPoint < y2) ? yMidPoint + 1 : yMidPoint), y2, + newA, newB, newC, gen, dist, bin); } else { - return rMat(M, n, xMidPoint, x2, yMidPoint, y2, newA, newB, newC, + return rMat(M, n, ((xMidPoint < x2) ? xMidPoint + 1 : xMidPoint), x2, + ((yMidPoint < y2) ? 
yMidPoint + 1 : yMidPoint), y2, newA, newB, newC, gen, dist, bin); } } + return true; } void toCSR(T* dense, int n_col, int n_row, T* vals, int* col_index, From b972c23e4058c5d5e541b6d3f3e3424dc185f7b0 Mon Sep 17 00:00:00 2001 From: Alex Cockrean <84676155+ABenC377@users.noreply.github.com> Date: Fri, 11 Oct 2024 15:49:45 +0100 Subject: [PATCH 20/32] rebasing --- src/main.cc | 40 ++++++++++++++++++++-------------------- 1 file changed, 20 insertions(+), 20 deletions(-) diff --git a/src/main.cc b/src/main.cc index 06fd48e..51d1cf1 100644 --- a/src/main.cc +++ b/src/main.cc @@ -146,26 +146,26 @@ void getParameters(int argc, char** argv) { } else if (!strcmp(argv[i], "--no_gpu")) { doGpu = false; } else if (!strcmp(argv[i], "--kernels") || !strcmp(argv[i], "-k")) { - doSgemm = doDgemm = doSp_sgemm = doSp_dgemm = false; - std::string kernelList = argv[++i]; - if (kernelList.find("sp-sgemm") != std::string::npos) { - doSp_sgemm = true; - if (kernelList.find("sgemm") != std::string::npos && - kernelList.find("sgemm") != kernelList.find("sp-sgemm") + 3) { - doSgemm = true; - } - } else if (kernelList.find("sgemm") != std::string::npos) { - doSgemm = true; - } - if (kernelList.find("sp-dgemm") != std::string::npos) { - doSp_dgemm = true; - if (kernelList.find("dgemm") != std::string::npos && - kernelList.find("dgemm") != kernelList.find("sp-dgemm") + 3) { - doDgemm = true; - } - } else if (kernelList.find("dgemm") != std::string::npos) { - doDgemm = true; - } + doSgemm = doDgemm = doSp_sgemm = doSp_dgemm = false; + std::string kernelList = argv[++i]; + if (kernelList.find("sp-sgemm") != std::string::npos) { + doSp_sgemm = true; + if (kernelList.find("sgemm") != std::string::npos && + kernelList.find("sgemm") != kernelList.find("sp-sgemm") + 3) { + doSgemm = true; + } + } else if (kernelList.find("sgemm") != std::string::npos) { + doSgemm = true; + } + if (kernelList.find("sp-dgemm") != std::string::npos) { + doSp_dgemm = true; + if (kernelList.find("dgemm") != std::string::npos && + kernelList.find("dgemm") != kernelList.find("sp-dgemm") + 3) { + doDgemm = true; + } + } else if (kernelList.find("dgemm") != std::string::npos) { + doDgemm = true; + } if (!doSgemm && !doDgemm && !doSp_sgemm && !doSp_dgemm) { std::cout << "ERROR - no implemented kernels in list" << std::endl; From 1f5f2ddebf774b9bd35b52ab29ef02cca6065ff3 Mon Sep 17 00:00:00 2001 From: Alex Cockrean <84676155+ABenC377@users.noreply.github.com> Date: Fri, 11 Oct 2024 15:50:03 +0100 Subject: [PATCH 21/32] rebasing --- calculateOffloadThreshold.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/calculateOffloadThreshold.py b/calculateOffloadThreshold.py index 38c2646..43028c0 100644 --- a/calculateOffloadThreshold.py +++ b/calculateOffloadThreshold.py @@ -165,7 +165,7 @@ def printResults(once:offloadThreshold, always:offloadThreshold, unified:offload gpuAlways.M = 0 gpuAlways.N = 0 gpuAlways.K = 0 - if(gpuUnified.M != 0 and float(cpu[8]) >= float(gpuU[8])): + if("gemm" in kernel and gpuUnified.M != 0 and float(cpu[8]) >= float(gpuU[8])): # Do check to see if this is a momentary drop that we should ignore if (prevGpuUgflops <= float(cpu[8])) and (float(gpuLines[2].split(',')[8]) <= float(cpu[8])): gpuUnified.cpuGflops = 0.0 From b06250c0ca7a8d14c2904d69a70da24f89824e5d Mon Sep 17 00:00:00 2001 From: Alex Cockrean <84676155+ABenC377@users.noreply.github.com> Date: Fri, 11 Oct 2024 15:50:37 +0100 Subject: [PATCH 22/32] rebasing --- AOCL/sp_gemm.hh | 62 ++++++++++ cuBLAS/common.hh | 53 +++++++-- cuBLAS/sp_gemm.hh | 4 +- 
include/doGemm.hh | 4 +- include/kernels/CPU/sp_gemm.hh | 3 +- include/kernels/gemm.hh | 25 +++- oneMKL/CPU/sp_gemm.hh | 201 +++++++++++++++++++++++++++++---- 7 files changed, 320 insertions(+), 32 deletions(-) create mode 100644 AOCL/sp_gemm.hh diff --git a/AOCL/sp_gemm.hh b/AOCL/sp_gemm.hh new file mode 100644 index 0000000..3c6b5c0 --- /dev/null +++ b/AOCL/sp_gemm.hh @@ -0,0 +1,62 @@ +#pragma once + +#ifdef CPU_AOCL +#include + +#include "../include/kernels/CPU/gemm.hh" +#include "../include/utilities.hh" + +namespace cpu { +/** A class for GEMM CPU BLAS kernels. */ +template +class gemm_cpu : public gemm { + public: + using gemm::gemm; + using gemm::callConsume; + using gemm::m_; + using gemm::n_; + using gemm::k_; + using gemm::A_; + using gemm::B_; + using gemm::C_; + + private: + /** Make call to the GEMM kernel. */ + void callGemm() override { + if constexpr (std::is_same_v) { + bli_sgemm(BLIS_NO_TRANSPOSE, BLIS_NO_TRANSPOSE, m_, n_, k_, &alpha, A_, + rowStride, std::max(1, m_), B_, rowStride, std::max(1, k_), + &beta, C_, rowStride, std::max(1, m_)); + } else if constexpr (std::is_same_v) { + bli_dgemm(BLIS_NO_TRANSPOSE, BLIS_NO_TRANSPOSE, m_, n_, k_, &alpha, A_, + rowStride, std::max(1, m_), B_, rowStride, std::max(1, k_), + &beta, C_, rowStride, std::max(1, m_)); + } else { + // Un-specialised class will not do any work - print error and exit. + std::cout << "ERROR - Datatype for AOCL CPU GEMM kernel not supported." + << std::endl; + exit(1); + } + // Ensure compiler doesn't optimise away the work being done + callConsume(); + } + + /** Perform any required steps before calling the GEMM kernel that should + * be timed. */ + void preLoopRequirements() override {} + + /** Perform any required steps after calling the GEMM kernel that should + * be timed. */ + void postLoopRequirements() override {} + + /** The constant value Alpha. */ + T alpha = ALPHA; + + /** The constant value Beta. */ + T beta = BETA; + + /** The distance in elements to the next column. */ + const int rowStride = 1; +}; +} // namespace cpu +#endif \ No newline at end of file diff --git a/cuBLAS/common.hh b/cuBLAS/common.hh index 70d58fb..c8086db 100644 --- a/cuBLAS/common.hh +++ b/cuBLAS/common.hh @@ -16,13 +16,52 @@ } while (false) /** Macro function to check if error occurred when calling cuBLAS. 
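 * The replacement below expands each cublasStatus_t value into its own case
 * (plus a default) so the exact status name is printed alongside the file and
 * line before exiting.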
*/ -#define cublasCheckError(f) \ - do { \ - if (cublasStatus_t e = (f); e != CUBLAS_STATUS_SUCCESS) { \ - std::cout << "CUBLAS error: " << __FILE__ << ":" << __LINE__ << ": " \ - << cublasGetStatusString(e) << std::endl; \ - exit(1); \ - } \ +#define cublasCheckError(f) \ + do { \ + switch (f) { \ + case CUBLAS_STATUS_SUCCESS: \ + break; \ + case CUBLAS_STATUS_NOT_INITIALIZED: \ + std::cout << "CUBLAS error: " << __FILE__ << ": " << __LINE__ \ + << ": CUBLAS_STATUS_NOT_INITIALIZED" << std::endl; \ + exit(1); \ + case CUBLAS_STATUS_ALLOC_FAILED: \ + std::cout << "CUBLAS error: " << __FILE__ << ": " << __LINE__ \ + << ": CUBLAS_STATUS_ALLOC_FAILED" << std::endl; \ + exit(1); \ + case CUBLAS_STATUS_INVALID_VALUE: \ + std::cout << "CUBLAS error: " << __FILE__ << ": " << __LINE__ \ + << ": CUBLAS_STATUS_INVALID_VALUE" << std::endl; \ + exit(1); \ + case CUBLAS_STATUS_ARCH_MISMATCH: \ + std::cout << "CUBLAS error: " << __FILE__ << ": " << __LINE__ \ + << ": CUBLAS_STATUS_ARCH_MISMATCH" << std::endl; \ + exit(1); \ + case CUBLAS_STATUS_MAPPING_ERROR: \ + std::cout << "CUBLAS error: " << __FILE__ << ": " << __LINE__ \ + << ": CUBLAS_STATUS_MAPPING_ERROR" << std::endl; \ + exit(1); \ + case CUBLAS_STATUS_EXECUTION_FAILED: \ + std::cout << "CUBLAS error: " << __FILE__ << ": " << __LINE__ \ + << ": CUBLAS_STATUS_EXECUTION_FAILED" << std::endl; \ + exit(1); \ + case CUBLAS_STATUS_INTERNAL_ERROR: \ + std::cout << "CUBLAS error: " << __FILE__ << ": " << __LINE__ \ + << ": CUBLAS_STATUS_INTERNAL_ERROR" << std::endl; \ + exit(1); \ + case CUBLAS_STATUS_NOT_SUPPORTED: \ + std::cout << "CUBLAS error: " << __FILE__ << ": " << __LINE__ \ + << ": CUBLAS_STATUS_NOT_SUPPORTED" << std::endl; \ + exit(1); \ + case CUBLAS_STATUS_LICENSE_ERROR: \ + std::cout << "CUBLAS error: " << __FILE__ << ": " << __LINE__ \ + << ": CUBLAS_STATUS_LICENSE_ERROR" << std::endl; \ + exit(1); \ + default: \ + std::cout << "CUBLAS error: " << __FILE__ << ": " << __LINE__ \ + << ": other error not in switch statement" << std::endl; \ + exit(1); \ + } \ } while (false) #define cusparseCheckError(f) \ diff --git a/cuBLAS/sp_gemm.hh b/cuBLAS/sp_gemm.hh index 8bed12b..d849d22 100644 --- a/cuBLAS/sp_gemm.hh +++ b/cuBLAS/sp_gemm.hh @@ -109,9 +109,9 @@ class sp_gemm_gpu : public sp_gemm { initInputMatricesSparse(sparsity); - toCSR(A_, n_, n_, A_val_, A_col_, A_row_); + toCSR_int(A_, n_, n_, A_val_, A_col_, A_row_); - toCSR(B_, n_, n_, B_val_, B_col_, B_row_); + toCSR_int(B_, n_, n_, B_val_, B_col_, B_row_); // std::cout << "_____Matrix A_____" << std::endl; diff --git a/include/doGemm.hh b/include/doGemm.hh index b89abee..e264273 100644 --- a/include/doGemm.hh +++ b/include/doGemm.hh @@ -303,8 +303,8 @@ class doGemm { cpuGpu_always_ = cpuGpu_offloadThreshold(); cpuGpu_once_ = cpuGpu_offloadThreshold(); cpuGpu_unified_ = cpuGpu_offloadThreshold(); - csvFile = initCSVFile(std::string(CSV_DIR) + "/" + getKernelName() + - "_sparse_square.csv"); + std::ofstream csvFile = initCSVFile(std::string(CSV_DIR) + "/" + + getKernelName() + "_sparse_square.csv"); if (upperLimit_ >= 32) { for (int dim = startDimention_; dim <= upperLimit_; dim++) { callSparseKernels(csvFile, dim, 0.9999); diff --git a/include/kernels/CPU/sp_gemm.hh b/include/kernels/CPU/sp_gemm.hh index 6d9d011..60778e7 100644 --- a/include/kernels/CPU/sp_gemm.hh +++ b/include/kernels/CPU/sp_gemm.hh @@ -1,5 +1,6 @@ #pragma once +#ifdef CPU_ONEMKL #include "../gemm.hh" #include @@ -41,4 +42,4 @@ namespace cpu { free(C_); } }; -} // namespace cpu \ No newline at end of file +} // namespace cpu 
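
The toCSR_int helper (called above from the cuBLAS class and defined in the
gemm.hh changes that follow) and the new toCSR_mkl variant both perform the
same row-major dense-to-CSR scan. A minimal standalone sketch of that
conversion, assuming zero-based indexing (the denseToCSR name and the
std::vector buffers are illustrative only; the benchmark fills pre-allocated
raw arrays sized to the non-zero count):

#include <cstdio>
#include <vector>

// Convert a row-major dense matrix into the three standard CSR arrays.
void denseToCSR(const double* dense, int nRows, int nCols,
                std::vector<double>& vals, std::vector<int>& colIndex,
                std::vector<int>& rowPtr) {
  vals.clear();
  colIndex.clear();
  rowPtr.assign(nRows + 1, 0);
  for (int row = 0; row < nRows; ++row) {
    rowPtr[row] = static_cast<int>(vals.size());
    for (int col = 0; col < nCols; ++col) {
      const double v = dense[(row * nCols) + col];
      if (v != 0.0) {
        colIndex.push_back(col);
        vals.push_back(v);
      }
    }
  }
  rowPtr[nRows] = static_cast<int>(vals.size());
}

int main() {
  const double dense[9] = {1.0, 0.0, 2.0,
                           0.0, 0.0, 3.0,
                           4.0, 5.0, 0.0};
  std::vector<double> vals;
  std::vector<int> colIndex, rowPtr;
  denseToCSR(dense, 3, 3, vals, colIndex, rowPtr);
  // Expect vals = {1,2,3,4,5}, colIndex = {0,2,2,0,1}, rowPtr = {0,2,3,5}.
  for (int i = 0; i < static_cast<int>(vals.size()); ++i)
    std::printf("(%d, %.1f) ", colIndex[i], vals[i]);
  std::printf("\nrowPtr:");
  for (int p : rowPtr) std::printf(" %d", p);
  std::printf("\n");
  return 0;
}

The toCSR_mkl and toCSR_aocl overloads introduced in this series differ only in
the integer type used for the index arrays (MKL_INT and aoclsparse_int).
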
diff --git a/include/kernels/gemm.hh b/include/kernels/gemm.hh index 2a971a0..d97fc8c 100644 --- a/include/kernels/gemm.hh +++ b/include/kernels/gemm.hh @@ -1,5 +1,9 @@ #pragma once +#ifdef CPU_ONEMKL +#include +#endif + #include #include #include @@ -160,7 +164,7 @@ class gemm { return true; } - void toCSR(T* dense, int n_col, int n_row, T* vals, int* col_index, + void toCSR_int(T* dense, int n_col, int n_row, T* vals, int* col_index, int* row_ptr) { int nnz_encountered = 0; for (int row = 0; row < n_row; row++) { @@ -178,6 +182,25 @@ class gemm { row_ptr[n_row] = nnz_encountered; } +#ifdef CPU_ONEMKL + void toCSR_mkl(T* dense, int n_col, int n_row, T* vals, MKL_INT* col_index, + MKL_INT* row_ptr) { + int nnz_encountered = 0; + for (int row = 0; row < n_row; row++) { + row_ptr[row] = (MKL_INT)nnz_encountered; + int nnz_row = 0; + for (int col = 0; col < n_col; col++) { + if (dense[(row * n_col) + col] != 0.0) { + nnz_row++; + col_index[nnz_encountered] = (MKL_INT)col; + vals[nnz_encountered] = dense[(row * n_col) + col]; + nnz_encountered++; + } + } + } + row_ptr[n_row] = (MKL_INT)nnz_encountered; + } +#endif /** The number of iterations to perform per problem size. */ const int iterations_; diff --git a/oneMKL/CPU/sp_gemm.hh b/oneMKL/CPU/sp_gemm.hh index 5ac6a70..0b4e32b 100644 --- a/oneMKL/CPU/sp_gemm.hh +++ b/oneMKL/CPU/sp_gemm.hh @@ -24,33 +24,146 @@ class sp_gemm_cpu : public sp_gemm { /** Initialise the required data structures. */ void initialise(int n, float sparsity) { - n_ = n; - A_ = (T*)mkl_malloc(sizeof(T) * m_ * k_, 64); B_ = (T*)mkl_malloc(sizeof(T) * k_ * n_, 64); C_ = (T*)mkl_malloc(sizeof(T) * m_ * n_, 64); + n_ = n * 100; + nnz_ = (1 + (int)(n_ * n_ * (1 - sparsity))); + + values_A_ = (T*)mkl_malloc(sizeof(T) * nnz_, ALIGN); + columns_A_ = (MKL_INT*)mkl_malloc(sizeof(MKL_INT) * nnz_, ALIGN); + rowIndex_A_ = (MKL_INT*)mkl_malloc(sizeof(MKL_INT) * (n_ + 1), ALIGN); + + values_B_ = (T*)mkl_malloc(sizeof(T) * nnz_, ALIGN); + columns_B_ = (MKL_INT*)mkl_malloc(sizeof(MKL_INT) * nnz_, ALIGN); + rowIndex_B_ = (MKL_INT*)mkl_malloc(sizeof(MKL_INT) * (n_ + 1), ALIGN); + + x_ = (T*)mkl_malloc(sizeof(T) * n_, ALIGN); + y_ = (T*)mkl_malloc(sizeof(T) * n_, ALIGN); + rslt_mv_ = (T*)mkl_malloc(sizeof(T) * n_, ALIGN); + rslt_mv_trans_ = (T*)mkl_malloc(sizeof(T) * n_, ALIGN); + // Initialise the matricies - initInputMatrices(); + initInputMatricesSparse(sparsity); + + descr_type_gen.type = SPARSE_MATRIX_TYPE_GENERAL; + + // Transfer from dense to CSR format + toCSR_mkl(A_, n_, n_, values_A_, columns_A_, rowIndex_A_); + toCSR_mkl(B_, n_, n_, values_B_, columns_B_, rowIndex_B_); + + // ToDo -- Set values for x and y (which are vectors of length n_?) 
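+    // The calls below follow the usual oneMKL inspector-executor sparse flow:
+    // mkl_sparse_[sd]_create_csr wraps the CSR arrays for A and B in matrix
+    // handles, mkl_sparse_spmm multiplies them into csrC_, and the mv-hint and
+    // optimize calls prepare the handles for the sparse matrix-vector products
+    // that callGemm() later uses to sanity-check the product (comparing
+    // y.(C*x) against (A^T*y).(B*x)).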
+ + if constexpr (std::is_same_v) { + CALL_AND_CHECK_STATUS(mkl_sparse_s_create_csr(&csrA_, + SPARSE_INDEX_BASE_ZERO, n_, + n_, rowIndex_A_, + rowIndex_A_+1, columns_A_, + values_A_), + "Error after MKL_SPARSE_D_CREATE_CSR for csrA\n"); + CALL_AND_CHECK_STATUS(mkl_sparse_s_create_csr(&csrB_, + SPARSE_INDEX_BASE_ZERO, n_, + n_, rowIndex_B_, + rowIndex_B_+1, columns_B_, + values_B_), + "Error after MKL_SPARSE_D_CREATE_CSR for csrB\n"); + } else if constexpr (std::is_same_v) { + CALL_AND_CHECK_STATUS(mkl_sparse_d_create_csr(&csrA_, + SPARSE_INDEX_BASE_ZERO, n_, + n_, rowIndex_A_, + rowIndex_A_+1, columns_A_, + values_A_), + "Error after MKL_SPARSE_D_CREATE_CSR for csrA\n"); + CALL_AND_CHECK_STATUS(mkl_sparse_d_create_csr(&csrB_, + SPARSE_INDEX_BASE_ZERO, n_, + n_, rowIndex_B_, + rowIndex_B_+1, columns_B_, + values_B_), + "Error after MKL_SPARSE_D_CREATE_CSR for csrB\n"); + } else { + std::cout << "ERROR - Datatype for OneMKL CPU spGEMM kernel not " + "supported." << std::endl; + exit(1) + }; + + CALL_AND_CHECK_STATUS(mkl_sparse_spmm(SPARSE_OPERATION_NON_TRANSPOSE, + csrA_, csrB_, &csrC_), + "Error after MKL_SPARSE_SPMM\n"); + + // ToDo -- check that transpose is what I want here + CALL_AND_CHECK_STATUS(mkl_sparse_set_mv_hint(csrA_, + SPARSE_OPERATION_TRANSPOSE, + descr_type_gen_, 1), + "Error after MKL_SPARSE_SET_MV_HINT with csrA_\n"); + CALL_AND_CHECK_STATUS(mkl_sparse_set_mv_hint(csrB_, + SPARSE_OPERATION_NON_TRANSPOSE, + descr_type_gen_, 1), + "Error after MKL_SPARSE_SET_MV_HINT with csrB_\n"); + CALL_AND_CHECK_STATUS(mkl_sparse_set_mv_hint(csrC_, + SPARSE_OPERATION_NON_TRANSPOSE, + descr_type_gen_, 1), + "Error after MKL_SPARSE_SET_MV_HINT with csrC_\n"); + + CALL_AND_CHECK_STATUS(mkl_sparse_optimize(csrA_), + "Error after MKL_SPARSE_OPTIMIZE with csrA_\n"); + CALL_AND_CHECK_STATUS(mkl_sparse_optimize(csrB_), + "Error after MKL_SPARSE_OPTIMIZE with csrB_\n"); + CALL_AND_CHECK_STATUS(mkl_sparse_optimize(csrC_), + "Error after MKL_SPARSE_OPTIMIZE with csrC_\n"); } private: /** Make call to the GEMM kernel. */ void callGemm() override { if constexpr (std::is_same_v) { - cblas_sgemm(CblasColMajor, CblasNoTrans, CblasNoTrans, m_, n_, k_, - (float)ALPHA, A_, std::max(1, m_), B_, std::max(1, k_), - (float)BETA, C_, std::max(1, m_)); - } else if constexpr (std::is_same_v) { - cblas_dgemm(CblasColMajor, CblasNoTrans, CblasNoTrans, m_, n_, k_, - (double)ALPHA, A_, std::max(1, m_), B_, std::max(1, k_), - (double)BETA, C_, std::max(1, m_)); - } else { - // Un-specialised class will not do any work - print error and exit. - std::cout << "ERROR - Datatype for OneMKL CPU GEMM kernel not supported." 
- << std::endl; - exit(1); + CALL_AND_CHECK_STATUS(mkl_sparse_s_mv(SPARSE_OPERATION_NON_TRASPOSE, 1 + .0, csrC_, descr_type_gen_, x_, 0.0, rslt_mv_), + "Error after MKL_SPARSE_S_MV for csrC_ * x_\n"); + left_ = cblas_sdot(n_, rstl_mv_, 1, y_, 1); + + CALL_AND_CHECK_STATUS(mkl_sparse_s_mv(SPARSE_OPERATION_NON_TRANSPOSE, 1 + .0, csrB_, descr_type_gen_, x, 0.0, trslt_mv_), + "Error adter MKL_SPARSE_S_MV for csrB_ * x_\n"); + CALL_AND_CHECK_STATUS(mkl_sparse_s_mv(SPARSE_OPERATION_TRANSPOSE, 1.0, + csrA_, descr_type_gen_, y_, 0.0, + rslt_mv_trans_), + "Error adter MKL_SPARSE_S_MV for csrA_ * y_\n"); + right_ = cblas_sdot(n_, rslt_mv_, 1, rslt_mv_trans_, 1); + + residual = fabs(left - right)/(fabs(left) + 1); + + CALL_AND_CHECK_STATUS(mkl_sparse_s_export_csr(csrC_, &indexing_, + &rows_, &cols_, + &pointerB_C_, + &pointerE_C_, + &columns_C_, &values_C_), + "Error after MKL_SPARSE_S_EXPORT_CSR\n"); + } else if constexpr (std::is_same_v { /** Do any necessary cleanup (free pointers, close library handles, etc.) * after Kernel has been called. */ void postCallKernelCleanup() override { - mkl_free_buffers(); - mkl_free(A_); - mkl_free(B_); - mkl_free(C_); + if (mkl_sparse_destroy(csrC_) != SPARSE_STATUS_SUCCESS) { + printf(" Error after MKL_SPARSE_DESTROY, csrC_\n"); + fflush(0); + status = 1; + } + + //Deallocate arrays for which we allocate memory ourselves. + mkl_free(rslt_mv_trans_); + mkl_free(rslt_mv-); + mkl_free(x_); + mkl_free(y_); + + //Release matrix handle and deallocate arrays for which we allocate memory ourselves. + if (mkl_sparse_destroy(csrA_) != SPARSE_STATUS_SUCCESS) { + printf("Error after MKL_SPARSE_DESTROY, csrA_\n"); + fflush(0); + status = 1; + } + + mkl_free(values_A_); + mkl_free(columns_A_); + mkl_free(rowIndex_A_); + + if (mkl_sparse_destroy(csrB_) != SPARSE_STATUS_SUCCESS) { + printf("Error after MKL_SPARSE_DESTROY, csrB_\n"); + fflush(0); + status = 1; + } + + mkl_free(values_B_); + mkl_free(columns_B_); + mkl_free(rowIndex_B_); } + + int nnz_; + + MKL_INT* columns_A_; + MKL_INT* columns_B_; + MKL_INT* columns_C_; + MKL_INT* rowIndex_A_; + MKL_INT* rowIndex_B_; + MKL_INT* pointerB_C_; + MKL_INT* pointerE_C_; + + T* rslt_mv_; + T* rslt_mv_trans_; + T* x_; + T* y_; + + T left_, right_, residual_; + MKL_INT rows_, cols_, i_, j_, ii_, status_; + + sparse_index_base_t indexing_; + struct matrix_descr descr_type_gen_; + sparse_matrix_t csrA_, csrB_, csrC_; }; } // namespace cpu #endif \ No newline at end of file From 42bdc5846d6a5bac4f3270d62b258e0d021757aa Mon Sep 17 00:00:00 2001 From: Alex Cockrean <84676155+ABenC377@users.noreply.github.com> Date: Wed, 21 Aug 2024 11:05:52 +0100 Subject: [PATCH 23/32] Adding AOCL files --- AOCL/gemm.hh | 1 + AOCL/sp_gemm.hh | 32 ++++- ArmPL/sp_gemm.hh | 231 +++++++++++++++++++++++++++++++++ NVPL/sp_gemv.hh | 117 +++++++++++++++++ include/kernels/CPU/sp_gemm.hh | 71 +++++++++- include/kernels/gemm.hh | 22 ++++ 6 files changed, 464 insertions(+), 10 deletions(-) create mode 100644 ArmPL/sp_gemm.hh create mode 100644 NVPL/sp_gemv.hh diff --git a/AOCL/gemm.hh b/AOCL/gemm.hh index 3c6b5c0..f418bdc 100644 --- a/AOCL/gemm.hh +++ b/AOCL/gemm.hh @@ -23,6 +23,7 @@ class gemm_cpu : public gemm { private: /** Make call to the GEMM kernel. 
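   * Dispatches to bli_sgemm or bli_dgemm according to the template type T;
   * unsupported types print an error and exit.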
*/ void callGemm() override { + if constexpr (std::is_same_v) { bli_sgemm(BLIS_NO_TRANSPOSE, BLIS_NO_TRANSPOSE, m_, n_, k_, &alpha, A_, rowStride, std::max(1, m_), B_, rowStride, std::max(1, k_), diff --git a/AOCL/sp_gemm.hh b/AOCL/sp_gemm.hh index 3c6b5c0..4fc178b 100644 --- a/AOCL/sp_gemm.hh +++ b/AOCL/sp_gemm.hh @@ -28,9 +28,16 @@ class gemm_cpu : public gemm { rowStride, std::max(1, m_), B_, rowStride, std::max(1, k_), &beta, C_, rowStride, std::max(1, m_)); } else if constexpr (std::is_same_v) { - bli_dgemm(BLIS_NO_TRANSPOSE, BLIS_NO_TRANSPOSE, m_, n_, k_, &alpha, A_, - rowStride, std::max(1, m_), B_, rowStride, std::max(1, k_), - &beta, C_, rowStride, std::max(1, m_)); + // Todo -- base? + aoclsparse_create_dscr(&A_csr_, base, n_, n_, nnz_, cst_row_ptr_A_.data + (), csr_col_ind_A_.data(), csr_val_A_.data()); + aoclsparse_create_dscr(&B_csr_, base, n_, n_, nnz_, cst_row_ptr_B_.data + (), csr_col_ind_B_.data(), csr_val_B_.data()); + + aoclsparse_spmm(aoclsparse_operation_none, A_csr_, B_csr_, &C_csr_); + aoclsparse_export_dcsr(C_csr_, &base, &C_M_, &C_N_, &nnz_C_, + &csr_row_ptr_C_, &csr_col_ind_C_, (void**) + &csr_val_C_); } else { // Un-specialised class will not do any work - print error and exit. std::cout << "ERROR - Datatype for AOCL CPU GEMM kernel not supported." @@ -57,6 +64,25 @@ class gemm_cpu : public gemm { /** The distance in elements to the next column. */ const int rowStride = 1; + + aoclsparse_matrix A_csr_; + aoclsparse_int* csr_row_ptr_A_; + aoclsparse_int* csr_col_ind_A_; + T* csr_val_A_; + + aoclsparse_matrix B_csr_; + aoclsparse_int* csr_row_ptr_B_; + aoclsparse_int* csr_col_ind_B_; + T* csr_val_B_; + + aoclsparse_matrix C_csr_; + aoclsparse_int* csr_row_ptr_C_; + aoclsparse_int* csr_col_ind_C_; + T* csr_val_C_; + aoclsparse_int C_M_; + aoclsparse_int C_N_; + + aoclsparse_status status; }; } // namespace cpu #endif \ No newline at end of file diff --git a/ArmPL/sp_gemm.hh b/ArmPL/sp_gemm.hh new file mode 100644 index 0000000..aba5814 --- /dev/null +++ b/ArmPL/sp_gemm.hh @@ -0,0 +1,231 @@ +#pragma once + +#ifdef CPU_ARMPL +#include +#include +#include +#include + +#include + +#include "../include/kernels/CPU/sp_gemm.hh" +#include "../include/utilities.hh" + +namespace cpu { +/** A class for GEMM CPU BLAS kernels. */ +template +class sp_gemm_cpu : public sp_gemm { + public: + using sp_gemm::gemm; + using sp_gemm::callConsume; + using sp_gemm::m_; + using sp_gemm::n_; + using sp_gemm::k_; + using sp_gemm::A_; + using sp_gemm::B_; + using sp_gemm::C_; + + private: + /** Make call to the GEMM kernel. */ + void callGemm() override { + + /** + * Flow of ARMPL Sparse LA: + * + * 1. Create sparse matrix objects: armpl_spmat_create_csr[sdcz]() + * + * 2. Supply hints on usage: armpl_spmat_hint() + * + * 3. Optimise for SpMV: armpl_spmv_optimize() + * + * 4. Solve SpMV case: armpl_spmv_exec_[sdcz]() + * + * 5. Destroy sparse matrix object: armpl_spmat_destroy() + * + * In addiion, users can choose to update a set of non-zero values using + * armpl_spmat_update_[sdcz]() + */ + + // Todo -- See if using armpl_spmat_hint can improve performance here. + // If so, follow with optimisation functions + + + + + if (std::is_same_v) { + status_ = armpl_spmm_exec_s(transA, + transB, + alpha, + A_armpl_, + B_armpl, + beta, + C_armpl_); + } else if constexpr (std::is_same_v) { + status_ = armpl_spmm_exec_d(transA, + transB, + alpha, + A_armpl_, + B_armpl, + beta, + C_armpl_); + } else { + // Un-specialised class will not do any work - print error and exit. 
+ std::cout << "ERROR - Datatype for ArmPL CPU GEMM kernel not supported." + << std::endl; + exit(1); + } + + if (status_ != ARMPL_STATUS_SUCCESS) { + std::cout << "ERROR " << status_ << std::endl; + exit(1); + } + // Ensure compiler doesn't optimise away the work being done + callConsume(); + } + + /** Perform any required steps before calling the GEMM kernel that should + * be timed. */ + void preLoopRequirements() override {} + + /** Perform any required steps after calling the GEMM kernel that should + * be timed. */ + void postLoopRequirements() override { + status_ = armpl_spmat_destroy(A_armpl_); + if (status_ != ARMPL_STATUS_SUCCESS) { + std::cout << "ERROR " << status_ << std::endl; + exit(1); + } + status_ = armpl_spmat_destroy(B_armpl_); + if (status_ != ARMPL_STATUS_SUCCESS) { + std::cout << "ERROR " << status_ << std::endl; + exit(1); + } + status_ = armpl_spmat_destroy(C_armpl_); + if (status_ != ARMPL_STATUS_SUCCESS) { + std::cout << "ERROR " << status_ << std::endl; + exit(1); + } + } + + /** The constant value Alpha. */ + const T alpha = ALPHA; + + /** The constant value Beta. */ + const T beta = BETA; + + armpl_status_t status_; + + armpl_spmat_t armpl_A, armpl_B, armpl_C; + + @override + void toCSR() { + n_armpl_ = n_; + // ToDo -- check whether flags_ is correct! + flags_ = 0; + + // Move A to CSR + A_armpl_row_ptr_ = new armpl_int_t[n_ + 1]; + A_armpl_col_index_ = new armpl_int_t[nnz_]; + A_vals_ = new T[nnz_]; + int nnz_encountered = 0; + for (int row = 0; row < n_; row++) { + A_armpl_row_ptr_[row] = nnz_encountered; + for (int col = 0; col < n_; col++) { + if (A_[(row * n_) + col] != 0.0) { + A_armpl_col_index_[nnz_encountered] = col; + A_vals_[nnz_encountered] = A_[(row * n_) + col]; + nnz_encountered++; + } + } + } + + // Move B to CSR + B_armpl_row_ptr_ = new armpl_int_t[n_ + 1]; + B_armpl_col_index_ = new armpl_int_t[nnz_]; + B_vals_ = new T[nnz_]; + nnz_encountered = 0; + for (int row = 0; row < n_; row++) { + B_armpl_row_ptr_[row] = nnz_encountered; + for (int col = 0; col < n_; col++) { + if (B_[(row * n_) + col] != 0.0) { + B_armpl_col_index_[nnz_encountered] = col; + B_vals_[nnz_encountered] = B_[(row * n_) + col]; + nnz_encountered++; + } + } + } + + if (std::is_sam_v) { + status_ = armpl_spmat_create_csr_s(A_armpl_, + n_armpl_, + n_armpl_, + A_armpl_row_ptr_, + A_armpl_col_index_, + A_vals_, + flags); + if (status_ != ARMPL_STATUS_SUCCESS) { + std::cout << "ERROR " << status_ << std::endl; + exit(1); + } + + status_ = armpl_spmat_create_csr_s(B_armpl_, + n_armpl_, + n_armpl_, + B_armpl_row_ptr_, + B_armpl_col_index_, + B_vals_, + flags); + if (status_ != ARMPL_STATUS_SUCCESS) { + std::cout << "ERROR " << status_ << std::endl; + exit(1); + } + } else if (std::is_same_v) { + status_ = armpl_spmat_create_csr_d(A_armpl_, + n_armpl_, + n_armpl_, + A_armpl_row_ptr_, + A_armpl_col_index_, + A_vals_, + flags); + if (status_ != ARMPL_STATUS_SUCCESS) { + std::cout << "ERROR " << status_ << std::endl; + exit(1); + } + + status_ = armpl_spmat_create_csr_d(B_armpl_, + n_armpl_, + n_armpl_, + B_armpl_row_ptr_, + B_armpl_col_index_, + B_vals_, + flags); + if (status_ != ARMPL_STATUS_SUCCESS) { + std::cout << "ERROR " << status_ << std::endl; + exit(1); + } + } + + + } + + armpl_int_t flags_; + + armpl_int_t n_armpl_; + + armpl_int_t* A_armpl_row_ptr_; + armpl_int_t* A_armpl_col_index_; + armpl_int_t* B_armpl_row_ptr_; + armpl_int_t* B_armpl_col_index_; + armpl_int_t* C_armpl_row_ptr_; + armpl_int_t* C_armpl_col_index_; + + armpl_spmat_t* A_armpl_; + armpl_spmat_t* 
B_armpl_; + armpl_spmat_t* C_armpl_; + + sparse_hint_value transA = ARMPL_SPARSE_OPERATION_NOTRANS; + sparse_hint_value transB = ARMPL_SPARSE_OPERATION_NOTRANS; + +}; +} // namespace cpu +#endif \ No newline at end of file diff --git a/NVPL/sp_gemv.hh b/NVPL/sp_gemv.hh new file mode 100644 index 0000000..d04f6b8 --- /dev/null +++ b/NVPL/sp_gemv.hh @@ -0,0 +1,117 @@ +/** + * ToDo -- This is all currently written for GEMM, but NVPL does not support + * GEMM, so this needs to be adjusted to spmv -- which is supported + */ + + + + + +#pragma once + +#ifdef CPU_NVPL +#include + +#include "../include/kernels/CPU/gemm.hh" +#include "../include/utilities.hh" + +namespace cpu { +/** A class for GEMM CPU BLAS kernels. */ +template +class sp_gemm_cpu : public sp_gemm { + public: + using sp_gemm::gemm; + using sp_gemm::callConsume; + using sp_gemm::m_; + using sp_gemm::n_; + using sp_gemm::k_; + using sp_gemm::A_; + using sp_gemm::B_; + using sp_gemm::C_; + + private: + /** Make call to the GEMM kernel. */ + void callGemm() override { + + // Ensure compiler doesn't optimise away the work being done + callConsume(); + } + + /** Perform any required steps before calling the GEMM kernel that should + * be timed. */ + void preLoopRequirements() override { + // Set type enum + if constexpr (std::is_same_v) { + type_ = NVPL_SPARSE_R_32F; + } else if constexpr (std::is_same_v) { + type_ = NVPL_SPARSE_R_64F; + } else { + // Un-specialised class will not do any work - print error and exit. + std::cout << "ERROR - Datatype for NVPL sparse GEMM kernel not supported." + << std::endl; + exit(1); + } + status_ = nvpl_sparse_create(&handle_); + // Todo -- error check + + // Todo -- Make const? + status_ = nvpl_sparse_create_csr(A_nvpl_, n_, n_, nnz_, A_row_ptr_nvpl_, + A_col_index_nvpl_, A_vals_nvpl_, + index_type_, index_type_, base_, type_); + + status_ = nvpl_sparse_create_csr(B_nvpl_, n_, n_, nnz_, B_row_ptr_nvpl_, + B_col_index_nvpl_, B_vals_nvpl_, + index_type_, index_type_, base_, type_); + // Todo -- error check + + + } + + /** Perform any required steps after calling the GEMM kernel that should + * be timed. */ + void postLoopRequirements() override { + status_ = nvpl_sparse_destroy(handle_); + // Todo -- error check + status_ = nvpl_sparse_destroy_sp_mat(A_nvpl_); + status_ = nvpl_sparse_destroy_sp_mat(B_nvpl_); + status_ = nvpl_sparse_destroy_sp_mat(C_nvpl_); + } + + /** The constant value Alpha. */ + T alpha = ALPHA; + + /** The constant value Beta. 
*/ + T beta = BETA; + + /** + * Sparse metadata + */ + nvpl_sparse_status_t status_; + nvpl_sparse_handle_t handle_; + nvpl_sparse_data_type_t type_; + + nvpl_sparse_operation_t op_ = NVPL_SPARSE_OPERATION_NON_TRANSPOSE; + nvpl_sparse_index_base_t base_ = NVPL_SPARSE_INDEX_BASE_ZERO; + nvpl_sparse_format_t format_ = NVPL_SPARSE_FORMAT_CSR; + nvpl_sparse_order_t order_ = NVPL_SPARSE_ORDER_COL; + nvpl_sparse_index_type_t index_type_ = NVPL_SPARSE_INDEX_64I; + + /** + * Sparse matrix descriptors + */ + nvpl_sparse_sp_mat_descr_t* A_nvpl_; + nvpl_sparse_sp_mat_descr_t* B_nvpl_; + nvpl_sparse_sp_mat_descr_t* C_nvpl_; + + void* A_row_ptr_nvpl_; + void* B_row_ptr_nvpl_; + void* C_row_ptr_nvpl_; + void* A_col_idnex_nvpl_; + void* B_col_idnex_nvpl_; + void* C_col_idnex_nvpl_; + void* A_vals_nvpl_; + void* B_vals_nvpl_; + void* C_vals_nvpl_; +}; +} // namespace cpu +#endif \ No newline at end of file diff --git a/include/kernels/CPU/sp_gemm.hh b/include/kernels/CPU/sp_gemm.hh index 60778e7..72fd5dc 100644 --- a/include/kernels/CPU/sp_gemm.hh +++ b/include/kernels/CPU/sp_gemm.hh @@ -1,9 +1,9 @@ #pragma once -#ifdef CPU_ONEMKL #include "../gemm.hh" #include +#include namespace cpu { @@ -25,21 +25,78 @@ namespace cpu { /** Initialise the required data structures. */ virtual void initialise(int n, double sparsity, bool binary = false) { n_ = n; + sparsity_ = sparsity; + + // Note that the below should be the same as the edges calculation + // used in the initInputMatricesSparse function. If changed here, + // change there + nnz_ = 1 + (int) (n_ * n_ * (1 - sparsity_)); A_ = (T*)malloc(sizeof(T) * n_ * n_); B_ = (T*)malloc(sizeof(T) * n_ * n_); C_ = (T*)malloc(sizeof(T) * n_ * n_); - initInputMatricesSparse(sparsity); + initInputMatricesSparse(sparsity_); + + toCSR(); } private: /** Do any necessary cleanup (free pointers, close library handles, etc.) * after Kernel has been called. */ - void postCallKernelCleanup() { - free(A_); - free(B_); - free(C_); - } + void postCallKernelCleanup() { + free(A_); + free(B_); + free(C_); + } + + void toCSR() { + // Move A to CSR + A_row_ptr_ = new int[n_ + 1]; + A_col_index_ = new int[nnz_]; + A_vals_ = new T[nnz_]; + int nnz_encountered = 0; + for (int row = 0; row < n_; row++) { + A_row_ptr_[row] = nnz_encountered; + for (int col = 0; col < n_; col++) { + if (A_[(row * n_) + col] != 0.0) { + A_col_index_[nnz_encountered] = col; + A_vals_[nnz_encountered] = A_[(row * n_) + col]; + nnz_encountered++; + } + } + } + + // Move B to CSR + B_row_ptr_ = new int[n_ + 1]; + B_col_index_ = new int[nnz_]; + B_vals_ = new T[nnz_]; + nnz_encountered = 0; + for (int row = 0; row < n_; row++) { + B_row_ptr_[row] = nnz_encountered; + for (int col = 0; col < n_; col++) { + if (B_[(row * n_) + col] != 0.0) { + B_col_index_[nnz_encountered] = col; + B_vals_[nnz_encountered] = B_[(row * n_) + col]; + nnz_encountered++; + } + } + } + } + + double sparsity_; + + int nnz_; + + int* A_row_ptr_; + int* A_col_index_; + int* B_row_ptr_; + int* B_col_index_; + int* C_row_ptr_; + int* C_col_index_; + T* A_vals_; + T* B_vals_; + T* C_vals; + }; } // namespace cpu diff --git a/include/kernels/gemm.hh b/include/kernels/gemm.hh index d97fc8c..d357734 100644 --- a/include/kernels/gemm.hh +++ b/include/kernels/gemm.hh @@ -91,6 +91,9 @@ class gemm { } } + // Note that the below should be the same as the nnz calculation + // used in the cpu initialise functions. 
If changed here, + // change there void initInputMatricesSparse(float sparsity) { for (int i = 0; i < (n_ * n_); i++) { A_[i] = 0.0; @@ -200,6 +203,25 @@ class gemm { } row_ptr[n_row] = (MKL_INT)nnz_encountered; } +#endif +#ifdef CPU_AOCL + void toCSR_aocl(T* dense, int n_col, int n_row, T* vals, aoclsparse_int* + col_index, aoclsparse_int* row_ptr) { + int nnz_encountered = 0; + for (int row = 0; row < n_row; row++) { + row_ptr[row] = (aoclsparse_int)nnz_encountered; + int nnz_row = 0; + for (int col = 0; col < n_col; col++) { + if (dense[(row * n_col) + col] != 0.0) { + nnz_row++; + col_index[nnz_encountered] = (aoclsparse_int)col; + vals[nnz_encountered] = dense[(row * n_col) + col]; + nnz_encountered++; + } + } + } + row_ptr[n_row] = (MKL_INT)nnz_encountered; + } #endif /** The number of iterations to perform per problem size. */ const int iterations_; From 521cbf3d1f4f5369813732e46be11fd019a09241 Mon Sep 17 00:00:00 2001 From: Alex Cockrean <84676155+ABenC377@users.noreply.github.com> Date: Tue, 1 Oct 2024 12:00:19 +0100 Subject: [PATCH 24/32] Working changes --- .DS_Store | Bin 0 -> 8196 bytes .idea/GPU-BLAS-Offload-Benchmark.iml | 2 + .idea/codeStyles/codeStyleConfig.xml | 5 + .idea/misc.xml | 6 + .idea/modules.xml | 8 + .idea/vcs.xml | 6 + .idea/workspace.xml | 541 +++++++++++++++++++++++++++ ArmPL/sp_gemm.hh | 271 ++++++++++++-- DefaultCPU/sp_gemm.hh | 55 --- DefaultGPU/sp_gemm.hh | 54 --- Makefile | 2 +- NVPL/sp_gemv.hh | 117 ------ createGflopsGraphs.py | 5 + cuBLAS/sp_gemm.hh | 9 +- cuBLAS/sp_gemv.hh | 261 +++++++++++++ include/.DS_Store | Bin 0 -> 6148 bytes include/doGemm.hh | 46 ++- include/kernels/.DS_Store | Bin 0 -> 6148 bytes include/kernels/CPU/sp_gemm.hh | 23 +- include/kernels/CPU/sp_gemv.hh | 47 +++ include/kernels/GPU/sp_gemm.hh | 3 +- include/kernels/GPU/sp_gemv.hh | 28 ++ include/kernels/gemm.hh | 4 + include/kernels/gemv.hh | 79 ++++ 24 files changed, 1278 insertions(+), 294 deletions(-) create mode 100644 .DS_Store create mode 100644 .idea/GPU-BLAS-Offload-Benchmark.iml create mode 100644 .idea/codeStyles/codeStyleConfig.xml create mode 100644 .idea/misc.xml create mode 100644 .idea/modules.xml create mode 100644 .idea/vcs.xml create mode 100644 .idea/workspace.xml delete mode 100644 DefaultCPU/sp_gemm.hh delete mode 100644 DefaultGPU/sp_gemm.hh delete mode 100644 NVPL/sp_gemv.hh create mode 100644 cuBLAS/sp_gemv.hh create mode 100644 include/.DS_Store create mode 100644 include/kernels/.DS_Store create mode 100644 include/kernels/CPU/sp_gemv.hh create mode 100644 include/kernels/GPU/sp_gemv.hh diff --git a/.DS_Store b/.DS_Store new file mode 100644 index 0000000000000000000000000000000000000000..5e3f9bcf14470d249e0f7fdd3125325b2078e4a9 GIT binary patch literal 8196 zcmeHMO>7fK6n5{vbYWjjq# z6v^tL67*83J#*!VfCCrQLnUrVRV9ilZorKzhpMOY%}<<}wVfjZF%#{)*?Hf4Gw;ot zH<>jNvEq6-NiAF`iZ|fgS6o*+4>%9JmmU!L!N((LLDP<+GIgMmR{+oqx@AFoR5U<+O$(ZK z6a@!`DN#@*%Jdb3DRK1s8duP?qo@)mrY|2%kIeK9g~`#O-j80h z&(H*|QjOZy{XN^dWAf^}R0<*FjWy%jz=wH=(jJUkqmZgp zu|}pZAKP4W?5W(s++S*JL%z;;Mt?b-`giJyoSlKN#;0Gz_!*j^i!@8;&qjPj+lKVP z{sV8~e^~?!^PHh3)oCt?ME^jfZPCz@t;e+J*6HxtuYcW{E3l6miATA>O> zsMk?fs14s{e5x*s&6TC1JUKVhkKX3tR8%X%Z;x8*gy zQEpe->#bs?`Hgs6;5-Vp+m&FkR^3=0-9O9YcBK|qn^K?_RsmW1x)z6gqsZ6euq9>7 zis21=!^@)wH#d(==KQ1i>8+f}0qk7D*WBMpepHTFH zdhgaZ(6Y?8L!>EAFpF;nN$}&P6E9TQConsKzd!%cfv0^icA&`6w{(18ZpIOh#iEP3 zdq@Ti1khm$WY`4uGRdI7X>5-yHgSw)jUa=~Y@^vH(6|fQ_QBm(KqvH>UU>HW^xof< zg*~VpKWy?7xuYrpBv6(oSRRAX2tx5JlE5kYipr=buxWmvwrxe~Zy?Q-;L!zy{Z(v< zE3iK5v08+(X>|tL7kd*(dNzR@!lsO&^#YwsCL5WSOr0LKb_3X$`fjJRNZ%%YnC4;M 
z43(f=*jcAAVWo%wQzDDa&9Sn5^=A$x&}pQACau^yMYOPeMzm;@z3!K9L6_#3>;2Pj z-IeTech + \ No newline at end of file diff --git a/.idea/codeStyles/codeStyleConfig.xml b/.idea/codeStyles/codeStyleConfig.xml new file mode 100644 index 0000000..a55e7a1 --- /dev/null +++ b/.idea/codeStyles/codeStyleConfig.xml @@ -0,0 +1,5 @@ + + + + \ No newline at end of file diff --git a/.idea/misc.xml b/.idea/misc.xml new file mode 100644 index 0000000..830d3c8 --- /dev/null +++ b/.idea/misc.xml @@ -0,0 +1,6 @@ + + + + + + \ No newline at end of file diff --git a/.idea/modules.xml b/.idea/modules.xml new file mode 100644 index 0000000..eff3984 --- /dev/null +++ b/.idea/modules.xml @@ -0,0 +1,8 @@ + + + + + + + + \ No newline at end of file diff --git a/.idea/vcs.xml b/.idea/vcs.xml new file mode 100644 index 0000000..35eb1dd --- /dev/null +++ b/.idea/vcs.xml @@ -0,0 +1,6 @@ + + + + + + \ No newline at end of file diff --git a/.idea/workspace.xml b/.idea/workspace.xml new file mode 100644 index 0000000..b954508 --- /dev/null +++ b/.idea/workspace.xml @@ -0,0 +1,541 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + { + "associatedIndex": 2 +} + + + + { + "keyToString": { + "C/C++ File.main.cc.executor": "Run", + "RunOnceActivity.OpenProjectViewOnStart": "true", + "RunOnceActivity.ShowReadmeOnStart": "true", + "RunOnceActivity.cidr.known.project.marker": "true", + "RunOnceActivity.readMode.enableVisualFormatting": "true", + "cf.advertisement.text.has.clang-format": "true", + "cf.first.check.clang-format": "false", + "cidr.known.project.marker": "true", + "git-widget-placeholder": "sparse", + "last_opened_file_path": "/Users/no22498/Documents/GPU-BLAS-Offload-Benchmark", + "node.js.detected.package.eslint": "true", + "node.js.detected.package.tslint": "true", + "node.js.selected.package.eslint": "(autodetect)", + "node.js.selected.package.tslint": "(autodetect)", + "nodejs_package_manager_path": "npm", + "settings.editor.selected.configurable": "preferences.lookFeel", + "structure.view.defaults.are.configured": "true", + "vue.rearranger.settings.migration": "true" + } +} + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 1705671236426 + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/ArmPL/sp_gemm.hh b/ArmPL/sp_gemm.hh index aba5814..47b0bf9 100644 --- a/ArmPL/sp_gemm.hh +++ b/ArmPL/sp_gemm.hh @@ -16,7 +16,7 @@ namespace cpu { template class sp_gemm_cpu : public sp_gemm { public: - using sp_gemm::gemm; + using sp_gemm::sp_gemm; using sp_gemm::callConsume; using sp_gemm::m_; using sp_gemm::n_; @@ -24,6 +24,7 @@ class sp_gemm_cpu : public sp_gemm { using sp_gemm::A_; using sp_gemm::B_; using sp_gemm::C_; + using sp_gemm::nnz_; private: /** Make call to the GEMM kernel. 
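   * Runs armpl_spmm_exec_s or armpl_spmm_exec_d on the CSR handles that
   * preLoopRequirements() builds via toCSR_armpl().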
*/ @@ -52,22 +53,23 @@ class sp_gemm_cpu : public sp_gemm { - if (std::is_same_v) { - status_ = armpl_spmm_exec_s(transA, - transB, + if constexpr (std::is_same_v) { + status_ = armpl_spmm_exec_s(transA_, + transB_, alpha, - A_armpl_, - B_armpl, + *A_armpl_, + *B_armpl_, beta, - C_armpl_); + *B_armpl_); } else if constexpr (std::is_same_v) { - status_ = armpl_spmm_exec_d(transA, - transB, + std::cout << "About to execute dgemm" << std::endl; + status_ = armpl_spmm_exec_d(transA_, + transB_, alpha, - A_armpl_, - B_armpl, + *A_armpl_, + *B_armpl_, beta, - C_armpl_); + *B_armpl_); } else { // Un-specialised class will not do any work - print error and exit. std::cout << "ERROR - Datatype for ArmPL CPU GEMM kernel not supported." @@ -85,26 +87,42 @@ class sp_gemm_cpu : public sp_gemm { /** Perform any required steps before calling the GEMM kernel that should * be timed. */ - void preLoopRequirements() override {} + void preLoopRequirements() override { + // Need to put A_ and B_ into A_armpl_ and B_armpl_ + // ToDo -- Error catching + toCSR_armpl(); +// std::cout << "toCSR_armpl() wrapped up without a problem" << std::endl; + } /** Perform any required steps after calling the GEMM kernel that should * be timed. */ void postLoopRequirements() override { - status_ = armpl_spmat_destroy(A_armpl_); - if (status_ != ARMPL_STATUS_SUCCESS) { - std::cout << "ERROR " << status_ << std::endl; - exit(1); - } - status_ = armpl_spmat_destroy(B_armpl_); + status_ = armpl_spmat_destroy(*A_armpl_); if (status_ != ARMPL_STATUS_SUCCESS) { std::cout << "ERROR " << status_ << std::endl; exit(1); } - status_ = armpl_spmat_destroy(C_armpl_); + status_ = armpl_spmat_destroy(*B_armpl_); if (status_ != ARMPL_STATUS_SUCCESS) { std::cout << "ERROR " << status_ << std::endl; exit(1); } +// status_ = armpl_spmat_destroy(*C_armpl_); +// if (status_ != ARMPL_STATUS_SUCCESS) { +// std::cout << "ERROR " << status_ << std::endl; +// exit(1); +// } + +// delete [] A_armpl_row_ptr_; +// delete [] A_armpl_col_index_; +// delete [] A_vals_; +// delete [] B_armpl_row_ptr_; +// delete [] B_armpl_col_index_; +// delete [] B_vals_; +// delete [] C_armpl_row_ptr_; +// delete [] C_armpl_col_index_; +// delete [] C_vals_; + } /** The constant value Alpha. */ @@ -117,8 +135,7 @@ class sp_gemm_cpu : public sp_gemm { armpl_spmat_t armpl_A, armpl_B, armpl_C; - @override - void toCSR() { + void toCSR_armpl() { n_armpl_ = n_; // ToDo -- check whether flags_ is correct! 
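    // For reference, armpl_spmat_create_csr_[sd]() expects the standard
    // three-array CSR layout: row_ptr has n_ + 1 entries, row_ptr[0] == 0,
    // row_ptr[i + 1] holds the running non-zero count once row i has been
    // scanned, and row_ptr[n_] == nnz_.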
flags_ = 0; @@ -127,85 +144,265 @@ class sp_gemm_cpu : public sp_gemm { A_armpl_row_ptr_ = new armpl_int_t[n_ + 1]; A_armpl_col_index_ = new armpl_int_t[nnz_]; A_vals_ = new T[nnz_]; + A_armpl_row_ptr_[0] = 0; + int nnz_encountered = 0; +// std::cout << "About to load A into csr" << std::endl; for (int row = 0; row < n_; row++) { - A_armpl_row_ptr_[row] = nnz_encountered; +// std::cout << "\tRow " << (row + 1) << " = " << nnz_encountered << std::endl; + A_armpl_row_ptr_[row + 1] = nnz_encountered; for (int col = 0; col < n_; col++) { if (A_[(row * n_) + col] != 0.0) { +// std::cout << "\t\tCol " << col << " = " << A_[(row * n_) + col] << +// std::endl; A_armpl_col_index_[nnz_encountered] = col; - A_vals_[nnz_encountered] = A_[(row * n_) + col]; + A_vals_[nnz_encountered] = static_cast(A_[(row * n_) + col]); nnz_encountered++; +// std::cout << "\tnnz_encountered = " << nnz_encountered << std::endl; } } } +// std::cout << "___A =" << std::endl << "\t\t["; +// for (int i = 0; i < (n_ + 1); i++) { +// if (i != 0) { +// std::cout << ", "; +// } +// std::cout << A_armpl_row_ptr_[i]; +// } +// std::cout << "]" << std::endl << "\t\t["; +// for (int i = 0; i < nnz_; i++) { +// if (i != 0) { +// std::cout << ", "; +// } +// std::cout << A_armpl_col_index_[i]; +// } +// std::cout << "]" << std::endl << "\t\t["; +// for (int i = 0; i < nnz_; i++) { +// if (i != 0) { +// std::cout << ", "; +// } +// std::cout << A_vals_[i]; +// } +// std::cout << "]" << std::endl; + + +// std::cout << "About to load B into csr" << std::endl; + // Move B to CSR B_armpl_row_ptr_ = new armpl_int_t[n_ + 1]; B_armpl_col_index_ = new armpl_int_t[nnz_]; B_vals_ = new T[nnz_]; + B_armpl_row_ptr_[0] = 0; + nnz_encountered = 0; for (int row = 0; row < n_; row++) { - B_armpl_row_ptr_[row] = nnz_encountered; +// std::cout << "\tRow " << (row + 1) << " = " << nnz_encountered << +// std::endl; + B_armpl_row_ptr_[row + 1] = nnz_encountered; for (int col = 0; col < n_; col++) { if (B_[(row * n_) + col] != 0.0) { +// std::cout << "\t\tCol " << col << " = " << B_[(row * n_) + col] << std::endl; B_armpl_col_index_[nnz_encountered] = col; - B_vals_[nnz_encountered] = B_[(row * n_) + col]; + B_vals_[nnz_encountered] = static_cast(B_[(row * n_) + col]); nnz_encountered++; +// std::cout << "\tnnz_encountered = " << nnz_encountered << std::endl; } } } +// std::cout << "___B =" << std::endl << "\t\t["; +// for (int i = 0; i < (n_ + 1); i++) { +// if (i != 0) { +// std::cout << ", "; +// } +// std::cout << B_armpl_row_ptr_[i]; +// } +// std::cout << "]" << std::endl << "\t\t["; +// for (int i = 0; i < nnz_; i++) { +// if (i != 0) { +// std::cout << ", "; +// } +// std::cout << B_armpl_col_index_[i]; +// } +// std::cout << "]" << std::endl << "\t\t["; +// for (int i = 0; i < nnz_; i++) { +// if (i != 0) { +// std::cout << ", "; +// } +// std::cout << B_vals_[i]; +// } +// std::cout << "]" << std::endl; + + +// // Move B to CSR +// C_armpl_row_ptr_ = new armpl_int_t[n_ + 1]; +// C_armpl_col_index_ = new armpl_int_t[nnz_]; +// C_vals_ = new T[nnz_]; +// C_armpl_row_ptr_[0] = 0; +// +// nnz_encountered = 0; +//// std::cout << "About to load C into csr" << std::endl; +// for (int row = 0; row < n_; row++) { +//// std::cout << "\tRow " << (row + 1) << " = " << nnz_encountered << std::endl; +// C_armpl_row_ptr_[row + 1] = nnz_encountered; +// for (int col = 0; col < n_; col++) { +// if (A_[(row * n_) + col] != 0.0) { +// C_armpl_col_index_[nnz_encountered] = col; +// C_vals_[nnz_encountered] = A_[(row * n_) + col]; +// nnz_encountered++; +//// 
std::cout << "\t\tCol " << col << " = " << C_vals_[nnz_encountered] << +//// std::endl; +//// std::cout << "\tnnz_encountered = " << nnz_encountered << std::endl; +// } +// } +// } + +// std::cout << "___C =" << std::endl << "\t\t["; +// for (int i = 0; i < (n_ + 1); i++) { +// if (i != 0) { +// std::cout << ", "; +// } +// std::cout << C_armpl_row_ptr_[i]; +// } +// std::cout << "]" << std::endl << "\t\t["; +// for (int i = 0; i < nnz_; i++) { +// if (i != 0) { +// std::cout << ", "; +// } +// std::cout << C_armpl_col_index_[i]; +// } +// std::cout << "]" << std::endl << "\t\t["; +// for (int i = 0; i < nnz_; i++) { +// if (i != 0) { +// std::cout << ", "; +// } +// std::cout << C_vals_[i]; +// } +// std::cout << "]" << std::endl; + + + +// std::cout << "Loading csr A into armpl storage formats" << std::endl; + if constexpr (std::is_same_v) { + std::cout << "\tn_armpl_ = " << n_armpl_ << std::endl; + std::cout << "\tA_armpl_row_ptr_ (size = " << sizeof + (A_armpl_row_ptr_[0]) << ") = [" << A_armpl_row_ptr_[0]; + for (int i = 1; i < (n_ + 1); i++) { + std::cout << ", " << A_armpl_row_ptr_[i]; + } + std::cout << "]" << std::endl << "\tA_armpl_col_index_ (size = " << + sizeof(A_armpl_col_index_[0]) << ") = [" << + A_armpl_col_index_[0]; + for (int i = 1; i < nnz_; i++) { + std::cout << ", " << A_armpl_col_index_[i]; + } + std::cout << "]" << std::endl << "\tA_vals_ (size = " << sizeof + (A_vals_[0]) << ") = [" << A_vals_[0]; + for (int i = 1; i < nnz_; i++) { + std::cout << ", " << A_vals_[i]; + } + std::cout << "]" << std::endl << "flags: " << flags_ << std::endl; - if (std::is_sam_v) { status_ = armpl_spmat_create_csr_s(A_armpl_, n_armpl_, n_armpl_, A_armpl_row_ptr_, A_armpl_col_index_, A_vals_, - flags); + flags_); if (status_ != ARMPL_STATUS_SUCCESS) { std::cout << "ERROR " << status_ << std::endl; exit(1); } +// std::cout << "Loading csr C into armpl storage formats" << std::endl; +// status_ = armpl_spmat_create_csr_s(C_armpl_, +// n_armpl_, +// n_armpl_, +// C_armpl_row_ptr_, +// C_armpl_col_index_, +// C_vals_, +// flags_); +// if (status_ != ARMPL_STATUS_SUCCESS) { +// std::cout << "ERROR " << status_ << std::endl; +// exit(1); +// } + +// std::cout << "Loading csr B into armpl storage formats" << std::endl; status_ = armpl_spmat_create_csr_s(B_armpl_, n_armpl_, n_armpl_, B_armpl_row_ptr_, B_armpl_col_index_, B_vals_, - flags); + flags_); if (status_ != ARMPL_STATUS_SUCCESS) { std::cout << "ERROR " << status_ << std::endl; exit(1); } - } else if (std::is_same_v) { + } else if constexpr (std::is_same_v) { + std::cout << "\tn_armpl_ = " << n_armpl_ << std::endl; + std::cout << "\tA_armpl_row_ptr_ (size = " << sizeof + (A_armpl_row_ptr_[0]) << ") = [" << A_armpl_row_ptr_[0]; + for (int i = 1; i < (n_ + 1); i++) { + std::cout << ", " << A_armpl_row_ptr_[i]; + } + std::cout << "]" << std::endl << "\tA_armpl_col_index_ (size = " << + sizeof(A_armpl_col_index_[0]) << ") = [" << + A_armpl_col_index_[0]; + for (int i = 1; i < nnz_; i++) { + std::cout << ", " << A_armpl_col_index_[i]; + } + std::cout << "]" << std::endl << "\tA_vals_ (size = " << sizeof + (A_vals_[0]) << ") = [" << A_vals_[0]; + for (int i = 1; i < nnz_; i++) { + std::cout << ", " << A_vals_[i]; + } + std::cout << "]" << std::endl << "flags: " << flags_ << std::endl; + + + std::cout << "About to create CSR A (double)" << std::endl; status_ = armpl_spmat_create_csr_d(A_armpl_, n_armpl_, n_armpl_, A_armpl_row_ptr_, A_armpl_col_index_, A_vals_, - flags); + flags_); if (status_ != ARMPL_STATUS_SUCCESS) { std::cout << "ERROR " << 
status_ << std::endl; exit(1); } +// std::cout << "Loading csr C into armpl storage formats" << std::endl; +// status_ = armpl_spmat_create_csr_d(C_armpl_, +// n_armpl_, +// n_armpl_, +// C_armpl_row_ptr_, +// C_armpl_col_index_, +// C_vals_, +// flags_); +// if (status_ != ARMPL_STATUS_SUCCESS) { +// std::cout << "ERROR " << status_ << std::endl; +// exit(1); +// } + +// std::cout << "Loading csr B into armpl storage formats" << std::endl; + std::cout << "About to create CSR B (double)" << std::endl; status_ = armpl_spmat_create_csr_d(B_armpl_, n_armpl_, n_armpl_, B_armpl_row_ptr_, B_armpl_col_index_, B_vals_, - flags); + flags_); if (status_ != ARMPL_STATUS_SUCCESS) { std::cout << "ERROR " << status_ << std::endl; exit(1); } } - +// std::cout << "Okay, all matrices made!!" << std::endl; } armpl_int_t flags_; @@ -219,12 +416,16 @@ class sp_gemm_cpu : public sp_gemm { armpl_int_t* C_armpl_row_ptr_; armpl_int_t* C_armpl_col_index_; + T* A_vals_; + T* B_vals_; + T* C_vals_; + armpl_spmat_t* A_armpl_; armpl_spmat_t* B_armpl_; armpl_spmat_t* C_armpl_; - sparse_hint_value transA = ARMPL_SPARSE_OPERATION_NOTRANS; - sparse_hint_value transB = ARMPL_SPARSE_OPERATION_NOTRANS; + armpl_sparse_hint_value transA_ = ARMPL_SPARSE_OPERATION_NOTRANS; + armpl_sparse_hint_value transB_ = ARMPL_SPARSE_OPERATION_NOTRANS; }; } // namespace cpu diff --git a/DefaultCPU/sp_gemm.hh b/DefaultCPU/sp_gemm.hh deleted file mode 100644 index d7ecb37..0000000 --- a/DefaultCPU/sp_gemm.hh +++ /dev/null @@ -1,55 +0,0 @@ -#pragma once - -#if defined CPU_DEFAULT - -#include "../include/kernels/CPU/sp_gemm.hh" -#include "../include/utilities.hh" - -namespace cpu { -/** A class for GEMM CPU BLAS kernels. */ -template -class sp_gemm_cpu : public sp_gemm { - public: - using sp_gemm::sp_gemm; - using sp_gemm::callConsume; - using sp_gemm::m_; - using sp_gemm::n_; - using sp_gemm::k_; - using sp_gemm::A_; - using sp_gemm::B_; - using sp_gemm::C_; - - private: - /** Perform the GEMM kernel. */ - void callGemm() override { - /** A naive implementation of a column-major GEMM. Alpha and Beta are always - * 1 and 0 respectively. - * Operation takes the form of C[M,N] = A[M,K] * B[K,N]. - * callConsume() is required to ensure that the compiler does not optimise - * away this function. */ - int x, y, z; - T acc; - for (x = 0; x < m_; x++) { - for (y = 0; y < n_; y++) { - acc = 0.0; - for (z = 0; z < k_; z++) { - acc += A_[z * m_ + x] * B_[y * k_ + z]; - } - C_[y * m_ + x] = acc; - } - } - // Ensure compiler doesn't optimise away the work being done - callConsume(); - } - - /** Perform any required steps before calling the GEMM kernel that should - * be timed. */ - void preLoopRequirements() override {} - - /** Perform any required steps after calling the GEMM kernel that should - * be timed. */ - void postLoopRequirements() override {} -}; - -} // namespace cpu -#endif diff --git a/DefaultGPU/sp_gemm.hh b/DefaultGPU/sp_gemm.hh deleted file mode 100644 index 2a9f478..0000000 --- a/DefaultGPU/sp_gemm.hh +++ /dev/null @@ -1,54 +0,0 @@ -#pragma once - -#if defined GPU_DEFAULT - -#include - -#include "../include/kernels/GPU/sp_gemm.hh" -#include "../include/utilities.hh" - -namespace gpu { -/** A class for GEMM GPU BLAS kernels. */ -template -class sp_gemm_gpu : public sp_gemm { - public: - using sp_gemm::sp_gemm; - - /** Call the BLAS kernel n times, with 1 warmup run. - * Returns the time elapsed for n BLAS calls in seconds. */ - time_checksum_gflop compute() { - // Override function in base `kernel` class as DefaultGPU should do nothing. 
- return {INFINITY, INFINITY, 0.0}; - } - - /** Initialise the required data structures. */ - void initialise(gpuOffloadType offload, int n, float sparsity) override { - // Default GPU implementation - do nothing. - } - - private: - /** Make a call to the BLAS Library Kernel. */ - void callGemm() override { - // Default GPU implementation - do nothing. - } - - /** Perform any required steps before calling the GEMM kernel that should - * be timed. */ - void preLoopRequirements() override { - // Default GPU implementation - do nothing. - } - - /** Perform any required steps after calling the GEMM kernel that should - * be timed. */ - void postLoopRequirements() override { - // Default GPU implementation - do nothing. - } - - /** Do any necessary cleanup (free pointers, close library handles, etc.) - * after Kernel has been called. */ - void postCallKernelCleanup() override { - // Default GPU implementation - do nothing. - } -}; -} // namespace gpu -#endif \ No newline at end of file diff --git a/Makefile b/Makefile index bff0add..e5091e0 100644 --- a/Makefile +++ b/Makefile @@ -170,7 +170,7 @@ $(warning GPU_LIB not set (use CUBLAS, ONEMKL, ROCBLAS). No GPU kernels will be else ifeq ($(GPU_LIB), CUBLAS) # Do cuBLAS stuff ifeq ($(COMPILER), NVIDIA) -override CXXFLAGS += -cudalib=cublas +override CXXFLAGS += -cudalib=cublas -lcusparse_static else $(warning Users may be required to do the following to use $(COMPILER) with $(GPU_LIB):) $(info $(TAB)$(TAB)Add `CXXFLAGS=-L/.../math_libs/lib64 -L/.../cuda/lib64` to make command) diff --git a/NVPL/sp_gemv.hh b/NVPL/sp_gemv.hh deleted file mode 100644 index d04f6b8..0000000 --- a/NVPL/sp_gemv.hh +++ /dev/null @@ -1,117 +0,0 @@ -/** - * ToDo -- This is all currently written for GEMM, but NVPL does not support - * GEMM, so this needs to be adjusted to spmv -- which is supported - */ - - - - - -#pragma once - -#ifdef CPU_NVPL -#include - -#include "../include/kernels/CPU/gemm.hh" -#include "../include/utilities.hh" - -namespace cpu { -/** A class for GEMM CPU BLAS kernels. */ -template -class sp_gemm_cpu : public sp_gemm { - public: - using sp_gemm::gemm; - using sp_gemm::callConsume; - using sp_gemm::m_; - using sp_gemm::n_; - using sp_gemm::k_; - using sp_gemm::A_; - using sp_gemm::B_; - using sp_gemm::C_; - - private: - /** Make call to the GEMM kernel. */ - void callGemm() override { - - // Ensure compiler doesn't optimise away the work being done - callConsume(); - } - - /** Perform any required steps before calling the GEMM kernel that should - * be timed. */ - void preLoopRequirements() override { - // Set type enum - if constexpr (std::is_same_v) { - type_ = NVPL_SPARSE_R_32F; - } else if constexpr (std::is_same_v) { - type_ = NVPL_SPARSE_R_64F; - } else { - // Un-specialised class will not do any work - print error and exit. - std::cout << "ERROR - Datatype for NVPL sparse GEMM kernel not supported." - << std::endl; - exit(1); - } - status_ = nvpl_sparse_create(&handle_); - // Todo -- error check - - // Todo -- Make const? - status_ = nvpl_sparse_create_csr(A_nvpl_, n_, n_, nnz_, A_row_ptr_nvpl_, - A_col_index_nvpl_, A_vals_nvpl_, - index_type_, index_type_, base_, type_); - - status_ = nvpl_sparse_create_csr(B_nvpl_, n_, n_, nnz_, B_row_ptr_nvpl_, - B_col_index_nvpl_, B_vals_nvpl_, - index_type_, index_type_, base_, type_); - // Todo -- error check - - - } - - /** Perform any required steps after calling the GEMM kernel that should - * be timed. 
*/ - void postLoopRequirements() override { - status_ = nvpl_sparse_destroy(handle_); - // Todo -- error check - status_ = nvpl_sparse_destroy_sp_mat(A_nvpl_); - status_ = nvpl_sparse_destroy_sp_mat(B_nvpl_); - status_ = nvpl_sparse_destroy_sp_mat(C_nvpl_); - } - - /** The constant value Alpha. */ - T alpha = ALPHA; - - /** The constant value Beta. */ - T beta = BETA; - - /** - * Sparse metadata - */ - nvpl_sparse_status_t status_; - nvpl_sparse_handle_t handle_; - nvpl_sparse_data_type_t type_; - - nvpl_sparse_operation_t op_ = NVPL_SPARSE_OPERATION_NON_TRANSPOSE; - nvpl_sparse_index_base_t base_ = NVPL_SPARSE_INDEX_BASE_ZERO; - nvpl_sparse_format_t format_ = NVPL_SPARSE_FORMAT_CSR; - nvpl_sparse_order_t order_ = NVPL_SPARSE_ORDER_COL; - nvpl_sparse_index_type_t index_type_ = NVPL_SPARSE_INDEX_64I; - - /** - * Sparse matrix descriptors - */ - nvpl_sparse_sp_mat_descr_t* A_nvpl_; - nvpl_sparse_sp_mat_descr_t* B_nvpl_; - nvpl_sparse_sp_mat_descr_t* C_nvpl_; - - void* A_row_ptr_nvpl_; - void* B_row_ptr_nvpl_; - void* C_row_ptr_nvpl_; - void* A_col_idnex_nvpl_; - void* B_col_idnex_nvpl_; - void* C_col_idnex_nvpl_; - void* A_vals_nvpl_; - void* B_vals_nvpl_; - void* C_vals_nvpl_; -}; -} // namespace cpu -#endif \ No newline at end of file diff --git a/createGflopsGraphs.py b/createGflopsGraphs.py index d323162..07ac243 100644 --- a/createGflopsGraphs.py +++ b/createGflopsGraphs.py @@ -123,6 +123,11 @@ inputTypeStr = "Square x Short-Wide (M=K=32, N)" for j in range(0, len(mnk)): xVals.append(mnk[j][1]) + elif "_sparse_square" in gemmFilenames[i]: + x_name = "Value of M, N, K" + inputTypeStr = "Sparse square matrices" + for j in range(0, len(mnk)): + xVals.append(mnk[j][0]) else: # File not supported so go to next file continue diff --git a/cuBLAS/sp_gemm.hh b/cuBLAS/sp_gemm.hh index d849d22..b5e8d93 100644 --- a/cuBLAS/sp_gemm.hh +++ b/cuBLAS/sp_gemm.hh @@ -1,8 +1,7 @@ #pragma once #ifdef GPU_CUBLAS -#include "cusparse.h" -#include +#include #include #include #include @@ -13,13 +12,13 @@ #include "common.hh" namespace gpu { -/** A class for GEMM GPU BLAS kernels. */ +/** A class for sparse GEMM GPU BLAS kernels. */ template class sp_gemm_gpu : public sp_gemm { public: using sp_gemm::sp_gemm; using sp_gemm::initInputMatricesSparse; - using sp_gemm::toCSR; + using sp_gemm::toCSR_int; using sp_gemm::n_; using sp_gemm::A_; using sp_gemm::B_; @@ -44,7 +43,7 @@ class sp_gemm_gpu : public sp_gemm { std::cout << "INVALID DATA TYPE PASSED TO cuSPARSE" << std::endl; exit(1); } - n_ = 100 * n; + n_ = n; // Get device identifier cudaCheckError(cudaGetDevice(&gpuDevice_)); diff --git a/cuBLAS/sp_gemv.hh b/cuBLAS/sp_gemv.hh new file mode 100644 index 0000000..8027746 --- /dev/null +++ b/cuBLAS/sp_gemv.hh @@ -0,0 +1,261 @@ +//#pragma once +// +//#ifdef GPU_CUBLAS +//#include +//#include +//#include +//#include +//#include +//#include +//#include +// +//#include "../include/kernels/GPU/sp_gemv.hh" +//#include "../include/utilities.hh" +//#include "common.hh" +// +//namespace gpu { +///** A class for sparse GEMV GPU BLAS kernels. 
*/ +//template +//class gemv_gpu : public gemv { +// public: +// using gemv::gemv; +// using gemv::initInputMatrixVector; +// using gemv::m_; +// using gemv::n_; +// using gemv::A_; +// using gemv::x_; +// using gemv::y_; +// using gemv::offload_; +// using gemv::vecIncrement_; +// +// ~gemv_gpu() { +// if (alreadyInitialised_) { +// // Destroy the handle +// cublasCheckError(cublasDestroy(handle_)); +// +// // Destroy streams after use +// cudaCheckError(cudaStreamDestroy(s1_)); +// cudaCheckError(cudaStreamDestroy(s2_)); +// cudaCheckError(cudaStreamDestroy(s3_)); +// } +// } +// +// /** Initialise the required data structures. +// * `offload` refers to the data offload type: +// * - Once: Move data from host to device before all iterations & move from +// * device to host after all iterations +// * - Always: Move data from host to device and device to host each iteration +// * - Unified: Initialise data as unified memory; no data movement semantics +// * required */ +// void initialise(gpuOffloadType offload, int m, int n) override { +// if (!alreadyInitialised_) { +// alreadyInitialised_ = true; +// // Perform set-up which doesn't need to happen every problem size change. +// // Create a handle for CUBLAS +// cublasCheckError(cublasCreate(&handle_)); +// +// // Get device identifier +// cudaCheckError(cudaGetDevice(&gpuDevice_)); +// +// // Initialise 3 streams to asynchronously move data between host and +// // device +// cudaCheckError(cudaStreamCreate(&s1_)); +// cudaCheckError(cudaStreamCreate(&s2_)); +// cudaCheckError(cudaStreamCreate(&s3_)); +// } +// +// offload_ = offload; +// m_ = m; +// n_ = n; +// +// if (offload_ == gpuOffloadType::unified) { +// cudaCheckError(cudaMallocManaged(&A_, sizeof(T) * m_ * n_)); +// cudaCheckError(cudaMallocManaged(&x_, sizeof(T) * n_)); +// cudaCheckError(cudaMallocManaged(&y_, sizeof(T) * m_)); +// } else { +// // Allocate matrices on host +// cudaCheckError(cudaMallocHost((void**)&A_, sizeof(T) * m_ * n_)); +// cudaCheckError(cudaMallocHost((void**)&x_, sizeof(T) * n_)); +// cudaCheckError(cudaMallocHost((void**)&y_, sizeof(T) * m_)); +// // Allocate matrices on device +// cudaCheckError(cudaMalloc((void**)&A_device_, sizeof(T) * m_ * n_)); +// cudaCheckError(cudaMalloc((void**)&x_device_, sizeof(T) * n_)); +// cudaCheckError(cudaMalloc((void**)&y_device_, sizeof(T) * m_)); +// } +// +// // Initialise the host data structures +// initInputMatrixVector(); +// } +// +// private: +// /** Perform any required steps before calling the GEMV kernel that should +// * be timed. */ +// void preLoopRequirements() override { +// switch (offload_) { +// case gpuOffloadType::always: { +// // Offload data each iteration - no requirements +// break; +// } +// case gpuOffloadType::once: { +// // Offload input data from host to the device. 
+// cudaCheckError(cudaMemcpyAsync(A_device_, A_, sizeof(T) * m_ * n_, +// cudaMemcpyHostToDevice, s1_)); +// cudaCheckError(cudaMemcpyAsync(x_device_, x_, sizeof(T) * n_, +// cudaMemcpyHostToDevice, s2_)); +// cudaCheckError(cudaMemcpyAsync(y_device_, y_, sizeof(T) * m_, +// cudaMemcpyHostToDevice, s3_)); +// break; +// } +// case gpuOffloadType::unified: { +// // Prefetch input data to device +// cudaCheckError( +// cudaMemPrefetchAsync(A_, sizeof(T) * m_ * n_, gpuDevice_, s1_)); +// cudaCheckError( +// cudaMemPrefetchAsync(x_, sizeof(T) * n_, gpuDevice_, s2_)); +// cudaCheckError( +// cudaMemPrefetchAsync(y_, sizeof(T) * m_, gpuDevice_, s3_)); +// break; +// } +// } +// } +// +// /** Make a call to the BLAS Library Kernel. */ +// void callGemv() override { +// switch (offload_) { +// case gpuOffloadType::always: { +// // Offload input data from host to the device. +// cudaCheckError(cudaMemcpyAsync(A_device_, A_, sizeof(T) * m_ * n_, +// cudaMemcpyHostToDevice, s1_)); +// cudaCheckError(cudaMemcpyAsync(x_device_, x_, sizeof(T) * n_, +// cudaMemcpyHostToDevice, s2_)); +// cudaCheckError(cudaMemcpyAsync(y_device_, y_, sizeof(T) * m_, +// cudaMemcpyHostToDevice, s3_)); +// // Call cuBLAS GEMV kernel +// if constexpr (std::is_same_v) { +// cublasCheckError(cublasSgemv( +// handle_, CUBLAS_OP_N, m_, n_, &alpha, A_device_, std::max(1, m_), +// x_device_, vecIncrement_, &beta, y_device_, vecIncrement_)); +// } else if constexpr (std::is_same_v) { +// cublasCheckError(cublasDgemv( +// handle_, CUBLAS_OP_N, m_, n_, &alpha, A_device_, std::max(1, m_), +// x_device_, vecIncrement_, &beta, y_device_, vecIncrement_)); +// } +// // Offload output data from device to host +// cudaCheckError(cudaMemcpyAsync(y_, y_device_, sizeof(T) * m_, +// cudaMemcpyDeviceToHost, s3_)); +// // Ensure device has finished all work. +// cudaCheckError(cudaDeviceSynchronize()); +// break; +// } +// case gpuOffloadType::once: { +// // Call cuBLAS GEMV kernel +// if constexpr (std::is_same_v) { +// cublasCheckError(cublasSgemv( +// handle_, CUBLAS_OP_N, m_, n_, &alpha, A_device_, std::max(1, m_), +// x_device_, vecIncrement_, &beta, y_device_, vecIncrement_)); +// } else if constexpr (std::is_same_v) { +// cublasCheckError(cublasDgemv( +// handle_, CUBLAS_OP_N, m_, n_, &alpha, A_device_, std::max(1, m_), +// x_device_, vecIncrement_, &beta, y_device_, vecIncrement_)); +// } +// break; +// } +// case gpuOffloadType::unified: { +// // Call cuBLAS GEMV kernel +// if constexpr (std::is_same_v) { +// cublasCheckError(cublasSgemv(handle_, CUBLAS_OP_N, m_, n_, &alpha, A_, +// std::max(1, m_), x_, vecIncrement_, +// &beta, y_, vecIncrement_)); +// } else if constexpr (std::is_same_v) { +// cublasCheckError(cublasDgemv(handle_, CUBLAS_OP_N, m_, n_, &alpha, A_, +// std::max(1, m_), x_, vecIncrement_, +// &beta, y_, vecIncrement_)); +// } +// break; +// } +// } +// } +// +// /** Perform any required steps after calling the GEMV kernel that should +// * be timed. */ +// void postLoopRequirements() override { +// switch (offload_) { +// case gpuOffloadType::always: { +// // Offload data each iteration - no requirements +// break; +// } +// case gpuOffloadType::once: { +// // Offload output data from device to host +// cudaCheckError(cudaMemcpyAsync(y_, y_device_, sizeof(T) * m_, +// cudaMemcpyDeviceToHost, s3_)); +// // Ensure device has finished all work. 
+// cudaCheckError(cudaDeviceSynchronize()); +// break; +// } +// case gpuOffloadType::unified: { +// // Ensure all output data resides on host once work has completed +// cudaCheckError( +// cudaMemPrefetchAsync(y_, sizeof(T) * m_, cudaCpuDeviceId, s3_)); +// // Ensure device has finished all work. +// cudaCheckError(cudaDeviceSynchronize()); +// break; +// } +// } +// } +// +// /** Do any necessary cleanup (free pointers, close library handles, etc.) +// * after Kernel has been called. */ +// void postCallKernelCleanup() override { +// if (offload_ == gpuOffloadType::unified) { +// cudaFree(A_); +// cudaFree(x_); +// cudaFree(y_); +// } else { +// // Free the memory held on host and device +// cudaFreeHost((void*)A_); +// cudaFreeHost((void*)x_); +// cudaFreeHost((void*)y_); +// cudaFree(A_device_); +// cudaFree(x_device_); +// cudaFree(y_device_); +// } +// } +// +// /** Whether the initialise function has been called before. */ +// bool alreadyInitialised_ = false; +// +// /** Handle used when calling cuBLAS. */ +// cublasHandle_t handle_; +// +// /** CUDA Stream 1 - used to asynchronously move data between host and device. +// */ +// cudaStream_t s1_; +// +// /** CUDA Stream 2 - used to asynchronously move data between host and device. +// */ +// cudaStream_t s2_; +// +// /** CUDA Stream 3 - used to asynchronously move data between host and device. +// */ +// cudaStream_t s3_; +// +// /** The ID of the target GPU Device. */ +// int gpuDevice_; +// +// /** Input matrix A, held on the device. */ +// T* A_device_; +// +// /** Input vector x, held on the device. */ +// T* x_device_; +// +// /** Input vector y, held on the device. */ +// T* y_device_; +// +// /** The constant value Alpha. */ +// const T alpha = ALPHA; +// +// /** The constant value Beta. */ +// const T beta = BETA; +//}; +//} // namespace gpu +//#endif \ No newline at end of file diff --git a/include/.DS_Store b/include/.DS_Store new file mode 100644 index 0000000000000000000000000000000000000000..869e02c3a673dee3916dd63df65263ee873d8adc GIT binary patch literal 6148 zcmeHKOG*Pl5UtjL18%a@WnUp%S8W(ikPApmP;lY|#r@#cyLb!16L=oqtEvQs^umoI zQUzVFy1J^X=fU(xMAH0uH4~YNNP|X9G%7-Ob?C^0C%~k0tfiBu?sm4g=_?ccMHkn8 zBKNYEM|ptWuYa?(Ag|FXu=$o?PIfGggCRyB z$x?xqn*528EFJ#ram8^kv~)>Y8S{AM-Qy)`b@;P}ODcw;gMnaR%)qgAr#%0!@XJ&m z`Qw!61p~prKVu+G+C@9ZNBP-$@OeCIGuky8g>eH72<*`%03Gfl=Q?QPnKt5z<6y{H S=+|^$Tm+PmP{F`2Fz^NPk}u={ literal 0 HcmV?d00001 diff --git a/include/doGemm.hh b/include/doGemm.hh index e264273..a33ef7e 100644 --- a/include/doGemm.hh +++ b/include/doGemm.hh @@ -8,6 +8,7 @@ #if defined CPU_ARMPL #include "../ArmPL/gemm.hh" +#include "../ArmPL/sp_gemm.hh" #elif defined CPU_ONEMKL #include "../oneMKL/CPU/gemm.hh" #elif defined CPU_AOCL @@ -62,7 +63,9 @@ class doGemm { /** Run all problem types and write data to CSV files. */ void collectData() { - if (doDense_) { + // ToDo -- I've hard coded false here as kernel selection was not working + // . Needs to be fixed + if (false) { // Square Problem Sizes... 
// Re-initialise offload threshold structures cpuGpu_always_ = cpuGpu_offloadThreshold(); @@ -299,7 +302,7 @@ class doGemm { #endif } - if (doSparse_) { // Square sparse matrix - sparse matrix multiplication + if (true) { // Square sparse matrix - sparse matrix multiplication cpuGpu_always_ = cpuGpu_offloadThreshold(); cpuGpu_once_ = cpuGpu_offloadThreshold(); cpuGpu_unified_ = cpuGpu_offloadThreshold(); @@ -307,7 +310,7 @@ class doGemm { getKernelName() + "_sparse_square.csv"); if (upperLimit_ >= 32) { for (int dim = startDimention_; dim <= upperLimit_; dim++) { - callSparseKernels(csvFile, dim, 0.9999); + callSparseKernels(csvFile, dim, 0.99); } } // Close file @@ -524,8 +527,12 @@ class doGemm { #if CPU_ENABLED if (doCPU_) { +// std::cout << "about to initialise matrices with size = " << N << +// std::endl; spGemmCpu_.initialise(N, sparsity); +// std::cout << "about to run spGEMM" << std::endl; time_checksum_gflop cpuResult = spGemmCpu_.compute(); +// std::cout << "about to calculate flops" << std::endl; cpuResult.gflops = calcGflops(flops, iterations_, cpuResult.runtime); writeLineToCsv(csvFile, "cpu", kernelName, N, N, N, probSize, iterations_, cpuResult.runtime, cpuResult.gflops); @@ -536,31 +543,38 @@ class doGemm { // - UNIFIED : data passed from host to device (and device to host) as // needed if (doGPU_) { - spGemmGpu_.initialise(gpuOffloadType::unified, N, sparsity); - time_checksum_gflop gpuResult_unified = spGemmGpu_.compute(); - gpuResult_unified.gflops = - calcGflops(flops, iterations_, gpuResult_unified.runtime); + std::cout << "Starting with matrix of size " << N << std::endl; + std::cout << "\t\tUnified"; + spGemmGpu_.initialise(gpuOffloadType::unified, N, sparsity); + std::cout << "\tInitialised" << std::endl; + time_checksum_gflop gpuResult_unified = spGemmGpu_.compute(); + gpuResult_unified.gflops = + calcGflops(flops, iterations_, gpuResult_unified.runtime); // - ALWAYS: Offload to/from GPU every iteration - spGemmGpu_.initialise(gpuOffloadType::always, N, sparsity); - time_checksum_gflop gpuResult_always = spGemmGpu_.compute(); - gpuResult_always.gflops = + std::cout << "\t\tAlways"; + spGemmGpu_.initialise(gpuOffloadType::always, N, sparsity); + std::cout << "\tInitialised" << std::endl; + time_checksum_gflop gpuResult_always = spGemmGpu_.compute(); + gpuResult_always.gflops = calcGflops(flops, iterations_, gpuResult_always.runtime); // - ONCE : Offload to/from GPU once before all iterations and once // after - spGemmGpu_.initialise(gpuOffloadType::once, N, sparsity); - time_checksum_gflop gpuResult_once = spGemmGpu_.compute(); - gpuResult_once.gflops = + std::cout << "\t\tOnce"; + spGemmGpu_.initialise(gpuOffloadType::once, N, sparsity); + std::cout << "\tInitialised" << std::endl; + time_checksum_gflop gpuResult_once = spGemmGpu_.compute(); + gpuResult_once.gflops = calcGflops(flops, iterations_, gpuResult_once.runtime); // ToDo -- non-default GPU operations // Write lines to CSV file - writeLineToCsv(csvFile, "gpu_offloadOnce", kernelName, N, N, N, probSize, + writeLineToCsv(csvFile, "gpu_offloadOnce", kernelName, N, N, N, probSize, iterations_, gpuResult_once.runtime, gpuResult_once.gflops); - writeLineToCsv(csvFile, "gpu_offloadAlways", kernelName, N, N, N, probSize, + writeLineToCsv(csvFile, "gpu_offloadAlways", kernelName, N, N, N, probSize, iterations_, gpuResult_always.runtime, gpuResult_always.gflops); - writeLineToCsv(csvFile, "gpu_unified", kernelName, N, N, N, probSize, + writeLineToCsv(csvFile, "gpu_unified", kernelName, N, N, N, probSize, iterations_, 
gpuResult_unified.runtime, gpuResult_unified.gflops); diff --git a/include/kernels/.DS_Store b/include/kernels/.DS_Store new file mode 100644 index 0000000000000000000000000000000000000000..9cc84b2a4ce0fb9e6849637c24a43195d7749e28 GIT binary patch literal 6148 zcmeHKy-EW?5S}qX4s3#z<(9SqTVahCY=vMiF<^=u7ZCdeU&q(*6?_C=!p3iQCf*&l zSc=FD?0&oRd-uWZ-VhNlo;P!%84*<&f-H-Ih`MMxGG{Te+Gbx!@po17>=U}C zTe{ml_MiXswX-yBU9WfT8k*|q^5x^={q3r6-TYwPZ~IvT!cgyKT<`d^v-InoFMIWJ zT+?>-#@0eTsp;YjI0MdrGvEvy7{Hw^Qk^LJ> zz>tB7ZfA1;FY(C~oBUyj@0 #include +#include namespace cpu { @@ -11,10 +12,11 @@ namespace cpu { template class sp_gemm : public ::gemm { public: - using ::gemm::gemm; + using ::gemm::gemm; using ::gemm::initInputMatricesSparse; - using ::gemm::toCSR; - using ::gemm::m_; + using ::gemm::toCSR_int; + using ::gemm::iterations_; + using ::gemm::m_; using ::gemm::n_; using ::gemm::k_; using ::gemm::A_; @@ -30,7 +32,8 @@ namespace cpu { // Note that the below should be the same as the edges calculation // used in the initInputMatricesSparse function. If changed here, // change there - nnz_ = 1 + (int) (n_ * n_ * (1 - sparsity_)); + nnz_ = 1 + (int) ((double)n_ * (double)n_ * (1.0 - sparsity_)); +// std::cout << "nnz_ = " << nnz_ << std::endl; A_ = (T*)malloc(sizeof(T) * n_ * n_); B_ = (T*)malloc(sizeof(T) * n_ * n_); @@ -38,10 +41,12 @@ namespace cpu { initInputMatricesSparse(sparsity_); - toCSR(); + toCSR_int(); } - private: + int nnz_; + + private: /** Do any necessary cleanup (free pointers, close library handles, etc.) * after Kernel has been called. */ void postCallKernelCleanup() { @@ -50,7 +55,7 @@ namespace cpu { free(C_); } - void toCSR() { + void toCSR_int() { // Move A to CSR A_row_ptr_ = new int[n_ + 1]; A_col_index_ = new int[nnz_]; @@ -86,8 +91,6 @@ namespace cpu { double sparsity_; - int nnz_; - int* A_row_ptr_; int* A_col_index_; int* B_row_ptr_; @@ -96,7 +99,7 @@ namespace cpu { int* C_col_index_; T* A_vals_; T* B_vals_; - T* C_vals; + T* C_vals_; }; } // namespace cpu diff --git a/include/kernels/CPU/sp_gemv.hh b/include/kernels/CPU/sp_gemv.hh new file mode 100644 index 0000000..0c84cb0 --- /dev/null +++ b/include/kernels/CPU/sp_gemv.hh @@ -0,0 +1,47 @@ +#pragma once + +#include "../gemv.hh" + +#include +#include + +namespace cpu { + +/** An abstract class for GEMV BLAS kernels. */ + template + class sp_gemv : public ::gemv { + public: + using ::gemv::gemv; + using ::gemv::initInputMatrixVectorSparse; + using ::gemv::m_; + using ::gemv::n_; + using ::gemv::A_; + using ::gemv::x_; + using ::gemv::y_; + using ::gemv::sparsity_; + + public: + /** Initialise the required data structures. */ + void initialise(int n, double sparsity) { + m_ = n; + n_ = n; + sparsity_ = sparsity; + + A_ = (T*)malloc(sizeof(T) * m_ * n_); + x_ = (T*)malloc(sizeof(T) * n_); + y_ = (T*)malloc(sizeof(T) * m_); + + // Initialise the matrix and vectors + initInputMatrixVectorSparse(); + } + + private: + /** Do any necessary cleanup (free pointers, close library handles, etc.) + * after Kernel has been called. 
*/ + void postCallKernelCleanup() override { + free(A_); + free(x_); + free(y_); + } + }; +} // namespace cpu \ No newline at end of file diff --git a/include/kernels/GPU/sp_gemm.hh b/include/kernels/GPU/sp_gemm.hh index dbfba87..52a5494 100644 --- a/include/kernels/GPU/sp_gemm.hh +++ b/include/kernels/GPU/sp_gemm.hh @@ -17,7 +17,8 @@ namespace gpu { * - Always: Move data from host to device and device to host each iteration * - Unified: Initialise data as unified memory; no data movement semantics * required */ - virtual void initialise(gpuOffloadType offload, int n, float sparsity) = 0; + virtual void initialise(gpuOffloadType offload, int n, float sparsity) + = 0; protected: /** Whether data should be offloaded to/from the GPU each iteration, or just diff --git a/include/kernels/GPU/sp_gemv.hh b/include/kernels/GPU/sp_gemv.hh new file mode 100644 index 0000000..75fd126 --- /dev/null +++ b/include/kernels/GPU/sp_gemv.hh @@ -0,0 +1,28 @@ +#pragma once + +#include "../gemv.hh" + +namespace gpu { + +/** An abstract class for GEMV BLAS kernels. */ + template + class sp_gemv : public ::gemv { + public: + using ::gemv::gemv; + + /** Initialise the required data structures. + * `offload` refers to the data offload type: + * - Once: Move data from host to device before all iterations & move from + * device to host after all iterations + * - Always: Move data from host to device and device to host each iteration + * - Unified: Initialise data as unified memory; no data movement semantics + * required */ + virtual void initialise(gpuOffloadType offload, int n, float sparsity) + = 0; + + protected: + /** Whether data should be offloaded to/from the GPU each iteration, or just + * before & after. */ + gpuOffloadType offload_ = gpuOffloadType::always; + }; +} // namespace gpu \ No newline at end of file diff --git a/include/kernels/gemm.hh b/include/kernels/gemm.hh index d357734..6d75554 100644 --- a/include/kernels/gemm.hh +++ b/include/kernels/gemm.hh @@ -9,6 +9,7 @@ #include #include #include +#include #include "../utilities.hh" @@ -27,10 +28,13 @@ class gemm { std::chrono::high_resolution_clock::now(); // Perform all GEMM calls +// std::cout << "about to do pre-loop requirements" << std::endl; preLoopRequirements(); for (int i = 0; i < iterations_; i++) { +// std::cout << "entering loop " << i << std::endl; callGemm(); } +// std::cout << "about to do post-loop requirements" << std::endl; postLoopRequirements(); // Stop Timer diff --git a/include/kernels/gemv.hh b/include/kernels/gemv.hh index ba12d02..665fe59 100644 --- a/include/kernels/gemv.hh +++ b/include/kernels/gemv.hh @@ -4,6 +4,7 @@ #include #include #include +#include #include "../utilities.hh" @@ -82,6 +83,82 @@ class gemv { } } + void initInputMatrixVectorSparse() { + // Initialise sparse matrix + for (int i = 0; i < (n_ * n_); i++) { + A_[i] = 0.0; + } + + // Random number generator objects for use in descent + std::default_random_engine gen; + gen.seed(std::chrono::system_clock::now() + .time_since_epoch().count()); + std::uniform_real_distribution dist(0.0, 1.0); + + int edges = 1 + (int) (n_ * n_ * (1 - sparsity_)); + + // Using a=0.45 and b=c=0.22 as default probabilities + for (int i = 0; i < edges; i++) { + while (!rMat(A_, n_, 0, n_ - 1, 0, n_ - 1, 0.45, 0.22, 0.22, &gen, dist, + false)) {} + } + + // Initialise the input and output vectors + for (int y = 0; y < n_; y++) { + x_[y] = (T)((double)(rand() % 100) / 3.0); + } + for (int y = 0; y < m_; y++) { + y_[y] = (T)0.0; + } + } + + /** Recursive function to populate sparse 
matrices */ + bool rMat(T* M, int n, int x1, int x2, int y1, int y2, float a, float b, + float c, std::default_random_engine* gen, + std::uniform_real_distribution dist, bool bin) { + // If a 1x1 submatrix, then add an edge and return out + if (x1 >= x2 && y1 >= y2) { + // Needed to avoid overfloe segfaults with large problem sizes + uint64_t index = (((uint64_t)y1 * (uint64_t)n) + (uint64_t)x1); + if (abs(M[index]) > 0.1) { + return false; + } else { + // Add 1.0 if this is a binary graph, and a random real number otherwise + M[index] = (bin) ? 1.0 : (((rand() % 10000) / 100.0) - 50.0); + return true; + } + } else { + // Divide up the matrix + int xMidPoint = x1 + floor((x2 - x1) / 2); + int yMidPoint = y1 + floor((y2 - y1) / 2); + + // ToDo -- add some noise to these values between iterations + float newA = a; + float newB = b; + float newC = c; + + // Work out which quarter to recurse into + // There are some ugly ternary operators here to avoid going out of bounds in the edge case + // that we are already at 1 width or 1 height + float randomNum = dist(*gen); + if (randomNum < a) { + return rMat(M, n, x1, xMidPoint, y1, yMidPoint, + newA, newB, newC, gen, dist, bin); + } else if (randomNum < (a + b)) { + return rMat(M, n, ((xMidPoint < x2) ? xMidPoint + 1 : xMidPoint), x2, y1, yMidPoint, + newA, newB, newC, gen, dist, bin); + } else if (randomNum < (a + b + c)) { + return rMat(M, n, x1, xMidPoint, ((yMidPoint < y2) ? yMidPoint + 1 : yMidPoint), y2, + newA, newB, newC, gen, dist, bin); + } else { + return rMat(M, n, ((xMidPoint < x2) ? xMidPoint + 1 : xMidPoint), x2, + ((yMidPoint < y2) ? yMidPoint + 1 : yMidPoint), y2, newA, newB, newC, + gen, dist, bin); + } + } + return true; + } + /** Call the extern consume() function. */ void callConsume() { consume((void*)A_, (void*)x_, (void*)y_); } @@ -105,4 +182,6 @@ class gemv { /** The distance between two vector elements. */ const int vecIncrement_ = 1; + + double sparsity_ = 0.0; }; From a8e5c4690238832761286e2cde7ab7f2170acf26 Mon Sep 17 00:00:00 2001 From: Alex Cockrean <84676155+ABenC377@users.noreply.github.com> Date: Fri, 11 Oct 2024 15:53:08 +0100 Subject: [PATCH 25/32] Adding AOCL files --- .idea/workspace.xml | 6 +- ArmPL/sp_gemm.hh | 266 +++++++-------------------------- createGflopsGraphs.py | 2 +- cuBLAS/common.hh | 2 +- include/doGemm.hh | 11 -- include/kernels/CPU/sp_gemm.hh | 10 +- include/kernels/gemm.hh | 3 - src/main.cc | 24 +-- 8 files changed, 80 insertions(+), 244 deletions(-) diff --git a/.idea/workspace.xml b/.idea/workspace.xml index b954508..e9a4d65 100644 --- a/.idea/workspace.xml +++ b/.idea/workspace.xml @@ -125,9 +125,9 @@ - + @@ -171,7 +171,9 @@ - + + + - - @@ -538,6 +549,7 @@ - \ No newline at end of file diff --git a/ArmPL/sp_gemm.hh b/ArmPL/sp_gemm.hh index cb6b443..28a2ca3 100644 --- a/ArmPL/sp_gemm.hh +++ b/ArmPL/sp_gemm.hh @@ -53,9 +53,6 @@ class sp_gemm_cpu : public sp_gemm { // Todo -- See if using armpl_spmat_hint can improve performance here. // If so, follow with optimisation functions - - - if constexpr (std::is_same_v) { status_ = armpl_spmm_exec_s(transA_, transB_, @@ -63,7 +60,7 @@ class sp_gemm_cpu : public sp_gemm { A_armpl_, B_armpl_, beta, - B_armpl_); + C_armpl_); } else if constexpr (std::is_same_v) { status_ = armpl_spmm_exec_d(transA_, transB_, @@ -71,7 +68,7 @@ class sp_gemm_cpu : public sp_gemm { A_armpl_, B_armpl_, beta, - B_armpl_); + C_armpl_); } else { // Un-specialised class will not do any work - print error and exit. 
std::cout << "ERROR - Datatype for ArmPL CPU GEMM kernel not supported." @@ -107,11 +104,11 @@ class sp_gemm_cpu : public sp_gemm { std::cout << "ERROR " << status_ << std::endl; exit(1); } -// status_ = armpl_spmat_destroy(*C_armpl_); -// if (status_ != ARMPL_STATUS_SUCCESS) { -// std::cout << "ERROR " << status_ << std::endl; -// exit(1); -// } + status_ = armpl_spmat_destroy(C_armpl_); + if (status_ != ARMPL_STATUS_SUCCESS) { + std::cout << "ERROR " << status_ << std::endl; + exit(1); + } delete [] A_armpl_row_ptr_; delete [] A_armpl_col_index_; @@ -119,9 +116,9 @@ class sp_gemm_cpu : public sp_gemm { delete [] B_armpl_row_ptr_; delete [] B_armpl_col_index_; delete [] B_vals_; -// delete [] C_armpl_row_ptr_; -// delete [] C_armpl_col_index_; -// delete [] C_vals_; + delete [] C_armpl_row_ptr_; + delete [] C_armpl_col_index_; + delete [] C_vals_; } @@ -172,6 +169,24 @@ class sp_gemm_cpu : public sp_gemm { } } + // Move C to CSR + C_armpl_row_ptr_ = new armpl_int_t[n_ + 1]; + C_armpl_col_index_ = new armpl_int_t[nnz_]; + C_vals_ = new T[nnz_]; + C_armpl_row_ptr_[0] = 0; + + nnz_encountered = 0; + for (int row = 0; row < n_; row++) { + C_armpl_row_ptr_[row + 1] = nnz_encountered; + for (int col = 0; col < n_; col++) { + if (B_[(row * n_) + col] != 0.0) { + C_armpl_col_index_[nnz_encountered] = col; + C_vals_[nnz_encountered] = static_cast(B_[(row * n_) + col]); + nnz_encountered++; + } + } + } + if constexpr (std::is_same_v) { // printCSR(n_armpl_, A_armpl_row_ptr_, A_armpl_col_index_, A_vals_, // nnz_, flags_); @@ -200,6 +215,20 @@ class sp_gemm_cpu : public sp_gemm { std::cout << "ERROR " << status_ << std::endl; exit(1); } + +// printCSR(n_armpl_, C_armpl_row_ptr_, C_armpl_col_index_, C_vals_, +// nnz_, flags_); + status_ = armpl_spmat_create_csr_s(&C_armpl_, + n_armpl_, + n_armpl_, + C_armpl_row_ptr_, + C_armpl_col_index_, + C_vals_, + flags_); + if (status_ != ARMPL_STATUS_SUCCESS) { + std::cout << "ERROR " << status_ << std::endl; + exit(1); + } } else if constexpr (std::is_same_v) { // printCSR(n_armpl_, A_armpl_row_ptr_, A_armpl_col_index_, A_vals_, // nnz_, flags_ @@ -228,6 +257,20 @@ class sp_gemm_cpu : public sp_gemm { std::cout << "ERROR " << status_ << std::endl; exit(1); } + +// printCSR(n_armpl_, C_armpl_row_ptr_, C_armpl_col_index_, C_vals_, +// nnz_, flags_); + status_ = armpl_spmat_create_csr_d(&C_armpl_, + n_armpl_, + n_armpl_, + C_armpl_row_ptr_, + C_armpl_col_index_, + C_vals_, + flags_); + if (status_ != ARMPL_STATUS_SUCCESS) { + std::cout << "ERROR " << status_ << std::endl; + exit(1); + } // std::cout << "Okay, all matrices made!!" 
<< std::endl; } From 7f82b7d52f0ab2420774159d9099fb40aef00ce2 Mon Sep 17 00:00:00 2001 From: Alex Cockrean <84676155+ABenC377@users.noreply.github.com> Date: Fri, 11 Oct 2024 15:56:42 +0100 Subject: [PATCH 27/32] Adding AOCL files --- .idea/workspace.xml | 25 +++++++++---- include/doGemm.hh | 66 +++++++++++++++++++++++++++++----- include/doGemv.hh | 57 ++++++++++++++++------------- include/kernels/CPU/sp_gemm.hh | 7 ++-- include/kernels/gemm.hh | 7 ++-- include/kernels/gemv.hh | 5 +-- src/main.cc | 62 +++++++++++++++++++++----------- 7 files changed, 160 insertions(+), 69 deletions(-) diff --git a/.idea/workspace.xml b/.idea/workspace.xml index cb692bc..a5afad2 100644 --- a/.idea/workspace.xml +++ b/.idea/workspace.xml @@ -15,9 +15,14 @@ - + - + + + + + + @@ -525,7 +538,6 @@ - @@ -550,6 +562,7 @@ - \ No newline at end of file diff --git a/include/doGemm.hh b/include/doGemm.hh index c71684f..a3e5e77 100644 --- a/include/doGemm.hh +++ b/include/doGemm.hh @@ -65,7 +65,7 @@ class doGemm { void collectData() { // ToDo -- I've hard coded false here as kernel selection was not working // . Needs to be fixed - if (false) { + if (doDense_) { // Square Problem Sizes... // Re-initialise offload threshold structures cpuGpu_always_ = cpuGpu_offloadThreshold(); @@ -301,13 +301,12 @@ class doGemm { } #endif } - - if (true) { // Square sparse matrix - sparse matrix multiplication + if (doSparse_) { // Square sparse matrix - sparse matrix multiplication cpuGpu_always_ = cpuGpu_offloadThreshold(); cpuGpu_once_ = cpuGpu_offloadThreshold(); cpuGpu_unified_ = cpuGpu_offloadThreshold(); std::ofstream csvFile = initCSVFile(std::string(CSV_DIR) + "/" + - getKernelName() + "_sparse_square.csv"); + getKernelName() + "_sparse_square_99.csv"); if (upperLimit_ >= 32) { for (int dim = startDimention_; dim <= upperLimit_; dim++) { callSparseKernels(csvFile, dim, 0.99); @@ -316,10 +315,59 @@ class doGemm { // Close file csvFile.close(); #if CPU_ENABLED && GPU_ENABLED - if (doCPU_ && doGPU_) { - // Print offload results to stdout - printOffloadThreshold("Sparse Square"); - } + if (doCPU_ && doGPU_) { + // Print offload results to stdout + printOffloadThreshold("Sparse Square 0.99"); + } +#endif + cpuGpu_always_ = cpuGpu_offloadThreshold(); + cpuGpu_once_ = cpuGpu_offloadThreshold(); + cpuGpu_unified_ = cpuGpu_offloadThreshold(); + csvFile = initCSVFile(std::string(CSV_DIR) + "/" + + getKernelName() + "_sparse_square_999.csv"); + if (upperLimit_ >= 32) { + for (int dim = startDimention_; dim <= upperLimit_; dim++) { + callSparseKernels(csvFile, dim, 0.999); + } + } +#if CPU_ENABLED && GPU_ENABLED + if (doCPU_ && doGPU_) { + // Print offload results to stdout + printOffloadThreshold("Sparse Square 0.999"); + } +#endif + cpuGpu_always_ = cpuGpu_offloadThreshold(); + cpuGpu_once_ = cpuGpu_offloadThreshold(); + cpuGpu_unified_ = cpuGpu_offloadThreshold(); + csvFile = initCSVFile(std::string(CSV_DIR) + "/" + + getKernelName() + "_sparse_square_9999.csv"); + if (upperLimit_ >= 32) { + for (int dim = startDimention_; dim <= upperLimit_; dim++) { + callSparseKernels(csvFile, dim, 0.9999); + } + } +#if CPU_ENABLED && GPU_ENABLED + if (doCPU_ && doGPU_) { + // Print offload results to stdout + printOffloadThreshold("Sparse Square 0.9999"); + } +#endif + cpuGpu_always_ = cpuGpu_offloadThreshold(); + cpuGpu_once_ = cpuGpu_offloadThreshold(); + cpuGpu_unified_ = cpuGpu_offloadThreshold(); + csvFile = initCSVFile(std::string(CSV_DIR) + "/" + + getKernelName() + + "_sparse_square_99999.csv"); + if (upperLimit_ >= 32) { + for (int dim = 
startDimention_; dim <= upperLimit_; dim++) { + callSparseKernels(csvFile, dim, 0.99999); + } + } +#if CPU_ENABLED && GPU_ENABLED + if (doCPU_ && doGPU_) { + // Print offload results to stdout + printOffloadThreshold("Sparse Square 0.99999"); + } #endif } } @@ -530,7 +578,7 @@ class doGemm { spGemmCpu_.initialise(N, sparsity); time_checksum_gflop cpuResult = spGemmCpu_.compute(); cpuResult.gflops = calcGflops(flops, iterations_, cpuResult.runtime); - writeLineToCsv(csvFile, "cpu", kernelName, N, N, N, probSize, iterations_, + writeLineToCsv(csvFile, "cpu", kernelName, N, N, N, probSize, iterations_, cpuResult.runtime, cpuResult.gflops); } #endif diff --git a/include/doGemv.hh b/include/doGemv.hh index b86aad6..12cd097 100644 --- a/include/doGemv.hh +++ b/include/doGemv.hh @@ -33,13 +33,16 @@ class doGemv { public: doGemv(const std::string csvDir, const int iters, const int startDim, const int upperLimit, const bool cpuEnabled = true, - const bool gpuEnabled = true) + const bool gpuEnabled = true, const bool doDense = true, const bool + doSparse = true) : CSV_DIR(csvDir), iterations_(iters), startDimention_(startDim), upperLimit_(upperLimit), doCPU_(cpuEnabled), - doGPU_(gpuEnabled) + doGPU_(gpuEnabled), + doDense_(doDense), + doSparse_(doSparse) #if CPU_ENABLED , gemvCpu_(iterations_) @@ -56,28 +59,29 @@ class doGemv { /** Run all problem types and write data to CSV files. */ void collectData() { - // Square Problem Sizes... - // Re-initialise offload threshold structures & previous results - cpuGpu_always_ = cpuGpu_offloadThreshold(); - cpuGpu_once_ = cpuGpu_offloadThreshold(); - cpuGpu_unified_ = cpuGpu_offloadThreshold(); - prev_gpuResult_always = time_checksum_gflop(); - prev_gpuResult_once = time_checksum_gflop(); - prev_gpuResult_unified = time_checksum_gflop(); - std::ofstream csvFile = - initCSVFile(CSV_DIR + "/" + getKernelName() + "_square_vector_M=N.csv"); - for (int dim = startDimention_; dim <= upperLimit_; dim++) { - // M = dim, N = dim; - callKernels(csvFile, dim, dim); - } - // Close file - csvFile.close(); -#if CPU_ENABLED && GPU_ENABLED - if (doCPU_ && doGPU_) { - // Print offload results to stdout - printOffloadThreshold("Square x Vector (M=N)"); - } -#endif + if (doDense_) { + // Square Problem Sizes... + // Re-initialise offload threshold structures & previous results + cpuGpu_always_ = cpuGpu_offloadThreshold(); + cpuGpu_once_ = cpuGpu_offloadThreshold(); + cpuGpu_unified_ = cpuGpu_offloadThreshold(); + prev_gpuResult_always = time_checksum_gflop(); + prev_gpuResult_once = time_checksum_gflop(); + prev_gpuResult_unified = time_checksum_gflop(); + std::ofstream csvFile = + initCSVFile(CSV_DIR + "/" + getKernelName() + "_square_vector_M=N.csv"); + for (int dim = startDimention_; dim <= upperLimit_; dim++) { + // M = dim, N = dim; + callKernels(csvFile, dim, dim); + } + // Close file + csvFile.close(); + #if CPU_ENABLED && GPU_ENABLED + if (doCPU_ && doGPU_) { + // Print offload results to stdout + printOffloadThreshold("Square x Vector (M=N)"); + } + #endif // Rectangular Problem Sizes: // Tall and thin x Vector @@ -182,6 +186,7 @@ class doGemv { } #endif } + } private: /** Call the appropriate CPU and GPU GEMV kernels. */ @@ -494,6 +499,10 @@ class doGemv { /** Whether the GPU kernels should be run. */ const bool doGPU_ = true; + /** Whether sparse and or dense kernels should be run. */ + const bool doSparse_; + const bool doDense_; + #if CPU_ENABLED /** The GEMV CPU kernel. 
*/ cpu::gemv_cpu gemvCpu_; diff --git a/include/kernels/CPU/sp_gemm.hh b/include/kernels/CPU/sp_gemm.hh index a11dcd0..c431d4d 100644 --- a/include/kernels/CPU/sp_gemm.hh +++ b/include/kernels/CPU/sp_gemm.hh @@ -32,18 +32,19 @@ namespace cpu { // Note that the below should be the same as the edges calculation // used in the initInputMatricesSparse function. If changed here, // change there - nnz_ = 1 + (int) ((double)n_ * (double)n_ * (1.0 - sparsity_)); + nnz_ = 1 + (uint64_t)((double)n_ * (double)n_ * (1.0 - sparsity_)); +// std::cout << "\t____About to malloc()____" << std::endl; A_ = (T*)malloc(sizeof(T) * n_ * n_); B_ = (T*)malloc(sizeof(T) * n_ * n_); C_ = (T*)malloc(sizeof(T) * n_ * n_); - initInputMatricesSparse(sparsity_); + initInputMatricesSparse(sparsity); toCSR_int(); } - int nnz_; + uint64_t nnz_; protected: diff --git a/include/kernels/gemm.hh b/include/kernels/gemm.hh index bbd17cb..6e1328e 100644 --- a/include/kernels/gemm.hh +++ b/include/kernels/gemm.hh @@ -107,14 +107,14 @@ class gemm { .time_since_epoch().count()); std::uniform_real_distribution dist(0.0, 1.0); - int edges = 1 + (int) (n_ * n_ * (1 - sparsity)); + int edges = 1 + (uint64_t)((double)n_ * (double)n_ * (1.0 - sparsity)); // Using a=0.45 and b=c=0.22 as default probabilities for (int i = 0; i < edges; i++) { while (!rMat(A_, n_, 0, n_ - 1, 0, n_ - 1, 0.45, 0.22, 0.22, &gen, dist, - false)) {} + false)) {} while (!rMat(B_, n_, 0, n_ - 1, 0, n_ - 1, 0.45, 0.22, 0.22, &gen, dist, - false)) {} + false)) {} } } @@ -165,7 +165,6 @@ class gemm { gen, dist, bin); } } - return true; } void toCSR_int(T* dense, int n_col, int n_row, T* vals, int* col_index, diff --git a/include/kernels/gemv.hh b/include/kernels/gemv.hh index 665fe59..a64b19c 100644 --- a/include/kernels/gemv.hh +++ b/include/kernels/gemv.hh @@ -95,10 +95,11 @@ class gemv { .time_since_epoch().count()); std::uniform_real_distribution dist(0.0, 1.0); - int edges = 1 + (int) (n_ * n_ * (1 - sparsity_)); + uint64_t edges = 1 + (uint64_t)((double)n_ * (double)n_ * (1.0 - + sparsity_)); // Using a=0.45 and b=c=0.22 as default probabilities - for (int i = 0; i < edges; i++) { + for (uint64_t i = 0; i < edges; i++) { while (!rMat(A_, n_, 0, n_ - 1, 0, n_ - 1, 0.45, 0.22, 0.22, &gen, dist, false)) {} } diff --git a/src/main.cc b/src/main.cc index e508b5b..bdc1db2 100644 --- a/src/main.cc +++ b/src/main.cc @@ -7,6 +7,10 @@ bool doSgemm = true; bool doDgemm = true; bool doSp_sgemm = true; bool doSp_dgemm = true; +bool doSgemv = true; +bool doDgemv = true; +bool doSp_sgemv = true; +bool doSp_dgemv = true; bool doCpu = CPU_ENABLED; bool doGpu = GPU_ENABLED; @@ -50,18 +54,18 @@ int main(int argc, char** argv) { // -------- GEMV -------- // SGEMV Comparison -// std::cout << std::endl << "Comparing SGEMV Kernels:" << std::endl; -// doGemv sgemv(std::string(absPath), iters, startDim, upperLimit, doCpu, -// doGpu); -// sgemv.collectData(); -// std::cout << "Finished!" << std::endl; -// -// // DGEMV Comparison -// std::cout << std::endl << "Comparing DGEMV Kernels:" << std::endl; -// doGemv dgemv(std::string(absPath), iters, startDim, upperLimit, doCpu, -// doGpu); -// dgemv.collectData(); -// std::cout << "Finished!" << std::endl; + std::cout << std::endl << "Comparing SGEMV Kernels:" << std::endl; + doGemv sgemv(std::string(absPath), iters, startDim, upperLimit, doCpu, + doGpu, doSgemv, doSp_sgemv); + sgemv.collectData(); + std::cout << "Finished!" 
<< std::endl; + + // DGEMV Comparison + std::cout << std::endl << "Comparing DGEMV Kernels:" << std::endl; + doGemv dgemv(std::string(absPath), iters, startDim, upperLimit, doCpu, + doGpu, doDgemv, doSp_dgemv); + dgemv.collectData(); + std::cout << "Finished!" << std::endl; free(absPath); return 0; @@ -146,7 +150,8 @@ void getParameters(int argc, char** argv) { } else if (!strcmp(argv[i], "--no_gpu")) { doGpu = false; } else if (!strcmp(argv[i], "--kernels") || !strcmp(argv[i], "-k")) { - doSgemm = doDgemm = doSp_sgemm = doSp_dgemm = false; + doSgemm = doDgemm = doSp_sgemm = doSp_dgemm = + doSgemv = doDgemv = doSp_sgemv = doSp_dgemv = false; std::string kernelList = argv[++i]; if (kernelList.find("sp-sgemm") != std::string::npos) { doSp_sgemm = true; @@ -167,13 +172,28 @@ void getParameters(int argc, char** argv) { doDgemm = true; } - if (!doSgemm && !doDgemm && !doSp_sgemm && !doSp_dgemm) { - std::cout << "ERROR - no implemented kernels in list" << std::endl; - exit(1); - } - } else if (!strcmp(argv[i], "--output_dir") || !strcmp(argv[i], "-o")) { - if (++i >= argc) { - std::cout << "ERROR - Invalid output directory" << std::endl; + + if (kernelList.find("sp-sgemv") != std::string::npos) { + doSp_sgemv = true; + if (kernelList.find("sgemv") != std::string::npos && + kernelList.find("sgemv") != kernelList.find("sp-sgemv") + 3) { + doSgemv = true; + } + } else if (kernelList.find("sgemv") != std::string::npos) { + doSgemv = true; + } + if (kernelList.find("sp-dgemv") != std::string::npos) { + doSp_dgemv = true; + if (kernelList.find("dgemv") != std::string::npos && + kernelList.find("dgemv") != kernelList.find("sp-dgemv") + 3) { + doDgemv = true; + } + } else if (kernelList.find("dgemv") != std::string::npos) { + doDgemv = true; + } + if (!doSgemm && !doDgemm && !doSp_sgemm && !doSp_dgemm && + !doSgemv && !doDgemv && !doSp_sgemv && !doSp_dgemv) { + std::cout << "ERROR - no implemented kernels in list" << std::endl; exit(1); } else { CSV_DIR = argv[i]; @@ -212,4 +232,4 @@ void getParameters(int argc, char** argv) { exit(1); } } -} \ No newline at end of file +} From 0130b81655b1fa04b433c4d22f9288df723cefd2 Mon Sep 17 00:00:00 2001 From: Alex Cockrean <84676155+ABenC377@users.noreply.github.com> Date: Fri, 11 Oct 2024 15:58:16 +0100 Subject: [PATCH 28/32] Adding AOCL files --- .idea/workspace.xml | 23 ++++++++----- ArmPL/sp_gemm.hh | 84 +++++++++++++++++++++++++++++++++++++++++++++ Makefile | 2 +- include/doGemm.hh | 26 +++++++------- include/doGemv.hh | 12 +++---- include/helpers.hh | 12 ++++--- 6 files changed, 127 insertions(+), 32 deletions(-) diff --git a/.idea/workspace.xml b/.idea/workspace.xml index a5afad2..2bb35d8 100644 --- a/.idea/workspace.xml +++ b/.idea/workspace.xml @@ -15,14 +15,13 @@ - + + + - - - - + @@ -538,7 +545,6 @@ - @@ -563,6 +569,7 @@ - \ No newline at end of file diff --git a/ArmPL/sp_gemm.hh b/ArmPL/sp_gemm.hh index 28a2ca3..612f4f1 100644 --- a/ArmPL/sp_gemm.hh +++ b/ArmPL/sp_gemm.hh @@ -89,6 +89,90 @@ class sp_gemm_cpu : public sp_gemm { void preLoopRequirements() override { // Need to put A_ and B_ into A_armpl_ and B_armpl_ toCSR_armpl(); + + /** providing hints to ARMPL and optimizing the matrix datastructures */ + // TODO -- is noallocs best here? 
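    // What the hint calls below do (as I understand the ArmPL sparse API;
    // the library documentation is authoritative): each armpl_spmat_hint()
    // records expected usage on a matrix handle -- memory behaviour,
    // structure, how many SpMM invocations to expect, the operation and an
    // execution strategy -- so the library can choose a tuned path when the
    // multiplication is executed. Hints are advisory: an inaccurate hint
    // should only affect performance, not the numerical result.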
+ status_ = armpl_spmat_hint(A_armpl_, ARMPL_SPARSE_HINT_MEMORY, + ARMPL_SPARSE_MEMORY_NOALLOCS); + if (status_ != ARMPL_STATUS_SUCCESS) { + std::cout << "ERROR " << status_ << std::endl; + exit(1); + } + status_ = armpl_spmat_hint(B_armpl_, ARMPL_SPARSE_HINT_MEMORY, + ARMPL_SPARSE_MEMORY_NOALLOCS); + if (status_ != ARMPL_STATUS_SUCCESS) { + std::cout << "ERROR " << status_ << std::endl; + exit(1); + } + + status_ = armpl_spmat_hint(A_armpl_, ARMPL_SPARSE_HINT_STRUCTURE, + ARMPL_SPARSE_STRUCTURE_UNSTRUCTURED); + if (status_ != ARMPL_STATUS_SUCCESS) { + std::cout << "ERROR " << status_ << std::endl; + exit(1); + } + status_ = armpl_spmat_hint(B_armpl_, ARMPL_SPARSE_HINT_STRUCTURE, + ARMPL_SPARSE_STRUCTURE_UNSTRUCTURED); + if (status_ != ARMPL_STATUS_SUCCESS) { + std::cout << "ERROR " << status_ << std::endl; + exit(1); + } + + // TODO -- will this be FEW? + status_ = armpl_spmat_hint(A_armpl_, ARMPL_SPARSE_HINT_SPMM_INVOCATIONS, + ARMPL_SPARSE_INVOCATIONS_MANY); + if (status_ != ARMPL_STATUS_SUCCESS) { + std::cout << "ERROR " << status_ << std::endl; + exit(1); + } + status_ = armpl_spmat_hint(B_armpl_, ARMPL_SPARSE_HINT_SPMM_INVOCATIONS, + ARMPL_SPARSE_INVOCATIONS_MANY); + if (status_ != ARMPL_STATUS_SUCCESS) { + std::cout << "ERROR " << status_ << std::endl; + exit(1); + } + + status_ = armpl_spmat_hint(A_armpl_, ARMPL_SPARSE_HINT_SPMM_OPERATION, + ARMPL_SPARSE_OPERATION_NOTRANS); + if (status_ != ARMPL_STATUS_SUCCESS) { + std::cout << "ERROR " << status_ << std::endl; + exit(1); + } + status_ = armpl_spmat_hint(B_armpl_, ARMPL_SPARSE_HINT_SPMM_OPERATION, + ARMPL_SPARSE_OPERATION_NOTRANS); + if (status_ != ARMPL_STATUS_SUCCESS) { + std::cout << "ERROR " << status_ << std::endl; + exit(1); + } + + // TODO -- investigate whch is better here + status_ = armpl_spmat_hint(A_armpl_, ARMPL_SPARSE_HINT_SPMM_STRATEGY, + ARMPL_SPARSE_SPMM_STRAT_OPT_PART_STRUCT); + if (status_ != ARMPL_STATUS_SUCCESS) { + std::cout << "ERROR " << status_ << std::endl; + exit(1); + } + status_ = armpl_spmat_hint(B_armpl_, ARMPL_SPARSE_HINT_SPMM_STRATEGY, + ARMPL_SPARSE_SPMM_STRAT_OPT_PART_STRUCT); + if (status_ != ARMPL_STATUS_SUCCESS) { + std::cout << "ERROR " << status_ << std::endl; + exit(1); + } + +// TODO -- this is thorwing an error -- couldn't immediately fix so come +// back to + +// /** provide hints for the optimisation of the spmm execution */ +// status_ = armpl_spmm_optimize(ARMPL_SPARSE_OPERATION_NOTRANS, +// ARMPL_SPARSE_OPERATION_NOTRANS, +// ARMPL_SPARSE_SCALAR_ONE, +// A_armpl_, B_armpl_, +// ARMPL_SPARSE_SCALAR_ZERO, +// C_armpl_); +// if (status_ != ARMPL_STATUS_SUCCESS) { +// std::cout << "ERROR " << status_ << std::endl; +// exit(1); +// } } /** Perform any required steps after calling the GEMM kernel that should diff --git a/Makefile b/Makefile index e5091e0..22d080c 100644 --- a/Makefile +++ b/Makefile @@ -51,7 +51,7 @@ CXX = $(CXX_$(COMPILER)) CXXFLAGS_ARM = -std=c++17 -Wall -Ofast -$(ARCHFLAG)=native CXXFLAGS_CLANG = -std=c++17 -Wall -Ofast -$(ARCHFLAG)=native -CXXFLAGS_GNU = -std=c++17 -Wall -Ofast -$(ARCHFLAG)=native +CXXFLAGS_GNU = -std=c++17 -Wall -Wno-deprecated-declarations -Ofast -$(ARCHFLAG)=native CXXFLAGS_INTEL = -std=c++17 -Wall -Ofast -$(ARCHFLAG)=native -Wno-tautological-constant-compare CXXFLAGS_NVIDIA = -std=c++17 -Wall -O3 -fast -$(ARCHFLAG)=native CXXFLAGS_HIP = -std=c++17 -Wall -Ofast -$(ARCHFLAG)=native diff --git a/include/doGemm.hh b/include/doGemm.hh index a3e5e77..93cc058 100644 --- a/include/doGemm.hh +++ b/include/doGemm.hh @@ -392,8 +392,8 @@ class doGemm { 
cpuResult = gemmCpu_.compute(); cpuResult.gflops = calcGflops(flops, iterations_, cpuResult.runtime); // Write result to CSV file - writeLineToCsv(csvFile, "cpu", kernelName, M, N, K, probSize, iterations_, - cpuResult.runtime, cpuResult.gflops); + writeLineToCsv(csvFile, "cpu", kernelName, M, N, K, probSize, + 0.0, iterations_, cpuResult.runtime, cpuResult.gflops); } #endif @@ -422,13 +422,13 @@ class doGemm { // Write results to CSV file writeLineToCsv(csvFile, "gpu_offloadOnce", kernelName, M, N, K, probSize, - iterations_, gpuResult_once.runtime, + 0.0, iterations_, gpuResult_once.runtime, gpuResult_once.gflops); writeLineToCsv(csvFile, "gpu_offloadAlways", kernelName, M, N, K, - probSize, iterations_, gpuResult_always.runtime, + probSize, 0.0, iterations_, gpuResult_always.runtime, gpuResult_always.gflops); writeLineToCsv(csvFile, "gpu_unified", kernelName, M, N, K, probSize, - iterations_, gpuResult_unified.runtime, + 0.0, iterations_, gpuResult_unified.runtime, gpuResult_unified.gflops); } #endif @@ -578,8 +578,9 @@ class doGemm { spGemmCpu_.initialise(N, sparsity); time_checksum_gflop cpuResult = spGemmCpu_.compute(); cpuResult.gflops = calcGflops(flops, iterations_, cpuResult.runtime); - writeLineToCsv(csvFile, "cpu", kernelName, N, N, N, probSize, iterations_, - cpuResult.runtime, cpuResult.gflops); + writeLineToCsv(csvFile, "cpu", kernelName, N, N, N, probSize, + sparsity, iterations_, cpuResult.runtime, + cpuResult.gflops); } #endif #if GPU_ENABLED @@ -607,13 +608,14 @@ class doGemm { // Write lines to CSV file writeLineToCsv(csvFile, "gpu_offloadOnce", kernelName, N, N, N, probSize, - iterations_, gpuResult_once.runtime, gpuResult_once.gflops); + sparsity, iterations_, gpuResult_once.runtime, + gpuResult_once.gflops); writeLineToCsv(csvFile, "gpu_offloadAlways", kernelName, N, N, N, probSize, - iterations_, gpuResult_always.runtime, - gpuResult_always.gflops); + sparsity, iterations_, gpuResult_always.runtime, + gpuResult_always.gflops); writeLineToCsv(csvFile, "gpu_unified", kernelName, N, N, N, probSize, - iterations_, gpuResult_unified.runtime, - gpuResult_unified.gflops); + sparsity, iterations_, gpuResult_unified.runtime, + gpuResult_unified.gflops); } #endif diff --git a/include/doGemv.hh b/include/doGemv.hh index 12cd097..2ab5fb1 100644 --- a/include/doGemv.hh +++ b/include/doGemv.hh @@ -207,8 +207,8 @@ class doGemv { cpuResult = gemvCpu_.compute(); cpuResult.gflops = calcGflops(flops, iterations_, cpuResult.runtime); // Write result to CSV file - writeLineToCsv(csvFile, "cpu", kernelName, M, N, 0, probSize, iterations_, - cpuResult.runtime, cpuResult.gflops); + writeLineToCsv(csvFile, "cpu", kernelName, M, N, 0, probSize, 0.0, + iterations_, cpuResult.runtime, cpuResult.gflops); } #endif @@ -237,13 +237,13 @@ class doGemv { // Write results to CSV file writeLineToCsv(csvFile, "gpu_offloadOnce", kernelName, M, N, 0, probSize, - iterations_, gpuResult_once.runtime, + 0.0, iterations_, gpuResult_once.runtime, gpuResult_once.gflops); writeLineToCsv(csvFile, "gpu_offloadAlways", kernelName, M, N, 0, - probSize, iterations_, gpuResult_always.runtime, + probSize, 0.0, iterations_, gpuResult_always.runtime, gpuResult_always.gflops); writeLineToCsv(csvFile, "gpu_unified", kernelName, M, N, 0, probSize, - iterations_, gpuResult_unified.runtime, + 0.0, iterations_, gpuResult_unified.runtime, gpuResult_unified.gflops); } #endif @@ -500,8 +500,8 @@ class doGemv { const bool doGPU_ = true; /** Whether sparse and or dense kernels should be run. 
*/ - const bool doSparse_; const bool doDense_; + const bool doSparse_; #if CPU_ENABLED /** The GEMV CPU kernel. */ diff --git a/include/helpers.hh b/include/helpers.hh index 5618557..d760cd7 100644 --- a/include/helpers.hh +++ b/include/helpers.hh @@ -17,8 +17,8 @@ std::ofstream initCSVFile(const std::string filename) { std::ofstream newFile(filename); - newFile << "Device,Kernel,M,N,K,Total Problem Size (KiB),Iterations,Total " - "Seconds,GFLOP/s" + newFile << "Device,Kernel,M,N,K,Total Problem Size (KiB),sparsity,Iterations," + "Total Seconds,GFLOP/s" << std::endl; return newFile; @@ -28,15 +28,17 @@ std::ofstream initCSVFile(const std::string filename) { * Function does not close the file. */ void writeLineToCsv(std::ofstream& file, const std::string device, const std::string kernel, const int M, const int N, - const int K, const double totalProbSize, const int iters, - const double totalTime, const double gflops) { + const int K, const double totalProbSize, const float + sparsity, const int iters, const double totalTime, + const double gflops) { if (!file.is_open()) { std::cout << "ERROR - Attempted to write line to a closed CSV file." << std::endl; exit(1); } file << device << "," << kernel << "," << M << "," << N << "," << K << "," - << std::fixed << std::setprecision(3) << totalProbSize << "," << iters + << std::fixed << std::setprecision(3) << totalProbSize << "," + << std::fixed << std::setprecision(8) << sparsity << "," << iters << "," << std::fixed << std::setprecision(5) << totalTime << "," << std::fixed << std::setprecision(3) << gflops << std::endl; } From 4581637b57e14c92b4b4ca40c200565aae9e3d91 Mon Sep 17 00:00:00 2001 From: Alex Cockrean <84676155+ABenC377@users.noreply.github.com> Date: Fri, 11 Oct 2024 15:12:42 +0100 Subject: [PATCH 29/32] Providing armpl with hints --- .idea/workspace.xml | 21 ++++++++++++--------- ArmPL/sp_gemm.hh | 1 + 2 files changed, 13 insertions(+), 9 deletions(-) diff --git a/.idea/workspace.xml b/.idea/workspace.xml index 2bb35d8..d791fa3 100644 --- a/.idea/workspace.xml +++ b/.idea/workspace.xml @@ -15,13 +15,8 @@ - - + - - - - @@ -545,7 +548,6 @@ - @@ -570,6 +572,7 @@ - \ No newline at end of file diff --git a/ArmPL/sp_gemm.hh b/ArmPL/sp_gemm.hh index 612f4f1..e8e28a5 100644 --- a/ArmPL/sp_gemm.hh +++ b/ArmPL/sp_gemm.hh @@ -355,6 +355,7 @@ class sp_gemm_cpu : public sp_gemm { std::cout << "ERROR " << status_ << std::endl; exit(1); } + // std::cout << "Okay, all matrices made!!" << std::endl; } From 477b7a0a050caeeb86ff4776ab75cbe4982cf883 Mon Sep 17 00:00:00 2001 From: Alex Cockrean <84676155+ABenC377@users.noreply.github.com> Date: Mon, 21 Oct 2024 15:14:42 +0100 Subject: [PATCH 30/32] Updating createGflopsGraphs.py to show sparsity --- .idea/workspace.xml | 6 ++++-- createGflopsGraphs.py | 7 +++++-- 2 files changed, 9 insertions(+), 4 deletions(-) diff --git a/.idea/workspace.xml b/.idea/workspace.xml index d791fa3..d27d844 100644 --- a/.idea/workspace.xml +++ b/.idea/workspace.xml @@ -15,8 +15,9 @@ - - + + + - - + - @@ -575,6 +573,7 @@ - \ No newline at end of file
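
A minimal, self-contained sketch of the dense-to-CSR conversion that the patches above rely on, useful for sanity-checking the array shapes outside the harness. The names (denseToCsr, rowPtr, colIdx, vals) and the 4x4 test matrix are illustrative only; this is not a drop-in replacement for the benchmark's toCSR_int() or toCSR_armpl().

#include <cstdio>
#include <vector>

// Convert a dense row-major n x n matrix into the three CSR arrays.
// rowPtr receives n + 1 entries; colIdx and vals receive one entry per
// non-zero element.
template <typename T>
void denseToCsr(const std::vector<T>& dense, int n,
                std::vector<int>& rowPtr, std::vector<int>& colIdx,
                std::vector<T>& vals) {
  rowPtr.assign(n + 1, 0);
  colIdx.clear();
  vals.clear();
  for (int row = 0; row < n; row++) {
    for (int col = 0; col < n; col++) {
      T value = dense[(row * n) + col];
      if (value != T(0)) {
        colIdx.push_back(col);
        vals.push_back(value);
      }
    }
    // Running non-zero count once this row is finished.
    rowPtr[row + 1] = static_cast<int>(vals.size());
  }
}

int main() {
  // Illustrative 4x4 matrix with four non-zeros (75% sparse).
  std::vector<double> dense = {5, 0, 0, 0,
                               0, 0, 8, 0,
                               0, 3, 0, 0,
                               0, 0, 0, 6};
  std::vector<int> rowPtr, colIdx;
  std::vector<double> vals;
  denseToCsr(dense, 4, rowPtr, colIdx, vals);

  // Expected: rowPtr = [0, 1, 2, 3, 4], colIdx = [0, 2, 1, 3],
  //           vals = [5, 8, 3, 6].
  for (int row = 0; row < 4; row++) {
    for (int i = rowPtr[row]; i < rowPtr[row + 1]; i++) {
      std::printf("row %d col %d val %g\n", row, colIdx[i], vals[i]);
    }
  }
  return 0;
}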