From c1a3cd7acba859d9df200e557bd3454dc93c1abf Mon Sep 17 00:00:00 2001 From: Alex Cockrean <84676155+ABenC377@users.noreply.github.com> Date: Fri, 11 Oct 2024 15:15:21 +0100 Subject: [PATCH 01/32] rebsing --- src/main.cc | 27 ++++++++++++++++++++++++++- 1 file changed, 26 insertions(+), 1 deletion(-) diff --git a/src/main.cc b/src/main.cc index 2d046e3..c61df37 100644 --- a/src/main.cc +++ b/src/main.cc @@ -1,7 +1,6 @@ #include "../include/main.hh" int iters = 10; -int startDim = 1; int upperLimit = 128; bool doCpu = CPU_ENABLED; @@ -141,6 +140,32 @@ void getParameters(int argc, char* argv[]) { doCpu = false; } else if (!strcmp(argv[i], "--no_gpu")) { doGpu = false; + } else if (!strcmp(argv[i], "--kernels") || !strcmp(argv[i], "-k")) { + sgemm = dgemm = sp_sgemm = sp_dgemm = false; + std::string kernelList = argv[++i]; + if (kernelList.find("sp-sgemm") != std::string::npos) { + sp_sgemm = true; + if (kernelList.find("sgemm") != std::string::npos && + kernelList.find("sgemm") != kernelList.find("sp-sgemm") + 3) { + sgemm = true; + } + } else if (kernelList.find("sgemm") != std::string::npos) { + sgemm = true; + } + if (kernelList.find("sp-dgemm") != std::string::npos) { + sp_dgemm = true; + if (kernelList.find("dgemm") != std::string::npos && + kernelList.find("dgemm") != kernelList.find("sp-dgemm") + 3) { + dgemm = true; + } + } else if (kernelList.find("dgemm") != std::string::npos) { + dgemm = true; + } + + if (!sgemm && !dgemm && !sp_sgemm && !sp_dgemm) { + std::cout << "ERROR - no implemented kernels in list" << std::endl; + exit(1); + } } else if (!strcmp(argv[i], "--output_dir") || !strcmp(argv[i], "-o")) { if (++i >= argc) { std::cout << "ERROR - Invalid output directory" << std::endl; From 21366b4359101379b640faf814173620f0635e4d Mon Sep 17 00:00:00 2001 From: Alex Cockrean <84676155+ABenC377@users.noreply.github.com> Date: Fri, 11 Oct 2024 15:22:26 +0100 Subject: [PATCH 02/32] rebsing --- DefaultCPU/sp_gemm.hh | 55 ++++++ DefaultGPU/sp_gemm.hh | 54 ++++++ cuBLAS/sp_gemm.hh | 295 +++++++++++++++++++++++++++++++++ include/doGemm.hh | 94 +++++++++-- include/kernels/CPU/sp_gemm.hh | 110 ++++++++++++ include/kernels/GPU/sp_gemm.hh | 27 +++ src/main.cc | 4 + 7 files changed, 626 insertions(+), 13 deletions(-) create mode 100644 DefaultCPU/sp_gemm.hh create mode 100644 DefaultGPU/sp_gemm.hh create mode 100644 cuBLAS/sp_gemm.hh create mode 100644 include/kernels/CPU/sp_gemm.hh create mode 100644 include/kernels/GPU/sp_gemm.hh diff --git a/DefaultCPU/sp_gemm.hh b/DefaultCPU/sp_gemm.hh new file mode 100644 index 0000000..d7ecb37 --- /dev/null +++ b/DefaultCPU/sp_gemm.hh @@ -0,0 +1,55 @@ +#pragma once + +#if defined CPU_DEFAULT + +#include "../include/kernels/CPU/sp_gemm.hh" +#include "../include/utilities.hh" + +namespace cpu { +/** A class for GEMM CPU BLAS kernels. */ +template +class sp_gemm_cpu : public sp_gemm { + public: + using sp_gemm::sp_gemm; + using sp_gemm::callConsume; + using sp_gemm::m_; + using sp_gemm::n_; + using sp_gemm::k_; + using sp_gemm::A_; + using sp_gemm::B_; + using sp_gemm::C_; + + private: + /** Perform the GEMM kernel. */ + void callGemm() override { + /** A naive implementation of a column-major GEMM. Alpha and Beta are always + * 1 and 0 respectively. + * Operation takes the form of C[M,N] = A[M,K] * B[K,N]. + * callConsume() is required to ensure that the compiler does not optimise + * away this function. 
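   * Indexing assumes column-major storage: element (i, j) of a matrix with
   * `rows` rows lives at M[j * rows + i], hence A_[z * m_ + x],
   * B_[y * k_ + z] and C_[y * m_ + x] in the loops below.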
*/ + int x, y, z; + T acc; + for (x = 0; x < m_; x++) { + for (y = 0; y < n_; y++) { + acc = 0.0; + for (z = 0; z < k_; z++) { + acc += A_[z * m_ + x] * B_[y * k_ + z]; + } + C_[y * m_ + x] = acc; + } + } + // Ensure compiler doesn't optimise away the work being done + callConsume(); + } + + /** Perform any required steps before calling the GEMM kernel that should + * be timed. */ + void preLoopRequirements() override {} + + /** Perform any required steps after calling the GEMM kernel that should + * be timed. */ + void postLoopRequirements() override {} +}; + +} // namespace cpu +#endif diff --git a/DefaultGPU/sp_gemm.hh b/DefaultGPU/sp_gemm.hh new file mode 100644 index 0000000..92d157c --- /dev/null +++ b/DefaultGPU/sp_gemm.hh @@ -0,0 +1,54 @@ +#pragma once + +#if defined GPU_DEFAULT + +#include + +#include "../include/kernels/GPU/sp_gemm.hh" +#include "../include/utilities.hh" + +namespace gpu { +/** A class for GEMM GPU BLAS kernels. */ +template +class sp_gemm_gpu : public sp_gemm { + public: + using sp_gemm::sp_gemm; + + /** Call the BLAS kernel n times, with 1 warmup run. + * Returns the time elapsed for n BLAS calls in seconds. */ + time_checksum_gflop compute() { + // Override function in base `kernel` class as DefaultGPU should do nothing. + return {INFINITY, INFINITY, 0.0}; + } + + /** Initialise the required data structures. */ + void initialise(gpuOffloadType offload, int m, int n, int k) override { + // Default GPU implementation - do nothing. + } + + private: + /** Make a call to the BLAS Library Kernel. */ + void callGemm() override { + // Default GPU implementation - do nothing. + } + + /** Perform any required steps before calling the GEMM kernel that should + * be timed. */ + void preLoopRequirements() override { + // Default GPU implementation - do nothing. + } + + /** Perform any required steps after calling the GEMM kernel that should + * be timed. */ + void postLoopRequirements() override { + // Default GPU implementation - do nothing. + } + + /** Do any necessary cleanup (free pointers, close library handles, etc.) + * after Kernel has been called. */ + void postCallKernelCleanup() override { + // Default GPU implementation - do nothing. + } +}; +} // namespace gpu +#endif \ No newline at end of file diff --git a/cuBLAS/sp_gemm.hh b/cuBLAS/sp_gemm.hh new file mode 100644 index 0000000..3a9cff0 --- /dev/null +++ b/cuBLAS/sp_gemm.hh @@ -0,0 +1,295 @@ +#pragma once + +#ifdef GPU_CUBLAS +#include +#include + +#include "../include/kernels/GPU/gemm.hh" +#include "../include/utilities.hh" +#include "common.hh" + +namespace gpu { +/** A class for GEMM GPU BLAS kernels. */ +template +class sp_gemm_gpu : public gemm { + public: + using gemm::gemm; + using gemm::m_; + using gemm::n_; + using gemm::k_; + using gemm::A_; + using gemm::B_; + using gemm::C_; + using gemm::offload_; + + /** Initialise the required data structures. 
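+   * `m`, `n` and `k` are the dense GEMM dimensions (C[M,N] = A[M,K] * B[K,N]).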
+ * `offload` refers to the data offload type: + * - Once: Move data from host to device before all iterations & move from + * device to host after all iterations + * - Always: Move data from host to device and device to host each iteration + * - Unified: Initialise data as unified memory; no data movement semantics + * required */ + void initialise(gpuOffloadType offload, int m, int n, int k) override { + offload_ = offload; + + m_ = m; + n_ = n; + k_ = k; + + // Create a handle for CUBLAS + cublasCreate(&handle_); + + // Get device identifier + cudaCheckError(cudaGetDevice(&gpuDevice_)); + + // Initialise 3 streams to asynchronously move data between host and device + cudaCheckError(cudaStreamCreate(&s1_)); + cudaCheckError(cudaStreamCreate(&s2_)); + cudaCheckError(cudaStreamCreate(&s3_)); + + if (offload_ == gpuOffloadType::unified) { + cudaCheckError(cudaMallocManaged(&A_, sizeof(T) * m_ * k_)); + cudaCheckError(cudaMallocManaged(&B_, sizeof(T) * k_ * n_)); + cudaCheckError(cudaMallocManaged(&C_, sizeof(T) * m_ * n_)); + } else { + // Allocate matrices on host + A_ = (T*)malloc(sizeof(T) * m_ * k_); + B_ = (T*)malloc(sizeof(T) * k_ * n_); + C_ = (T*)malloc(sizeof(T) * m_ * n_); + // Allocate matrices on device + cudaCheckError(cudaMalloc((void**)&A_device_, sizeof(T) * m_ * k_)); + cudaCheckError(cudaMalloc((void**)&B_device_, sizeof(T) * k_ * n_)); + cudaCheckError(cudaMalloc((void**)&C_device_, sizeof(T) * m_ * n_)); + } + + // Initialise the host matricies + srand(SEED); + for (int y = 0; y < m_; y++) { + for (int x = 0; x < k_; x++) { + A_[y * k_ + x] = (((T)(rand() % 10000) / 100.0) - 30.0); + } + } + for (int y = 0; y < k_; y++) { + for (int x = 0; x < n_; x++) { + B_[y * n_ + x] = (((T)(rand() % 10000) / 100.0) - 30.0); + } + } + } + + private: + /** Perform any required steps before calling the GEMM kernel that should + * be timed. */ + void preLoopRequirements() override { + switch (offload_) { + case gpuOffloadType::always: { + // Offload data each iteration - no requirements + break; + } + case gpuOffloadType::once: { + // Offload data from host to the device. + cudaCheckError(cudaMemcpyAsync(A_device_, A_, sizeof(T) * m_ * k_, + cudaMemcpyHostToDevice, s1_)); + cudaCheckError(cudaMemcpyAsync(B_device_, B_, sizeof(T) * k_ * n_, + cudaMemcpyHostToDevice, s2_)); + cudaCheckError(cudaMemcpyAsync(C_device_, C_, sizeof(T) * m_ * n_, + cudaMemcpyHostToDevice, s3_)); + break; + } + case gpuOffloadType::unified: { + // Prefetch memory to device + cudaCheckError( + cudaMemPrefetchAsync(A_, sizeof(T) * m_ * k_, gpuDevice_, s1_)); + cudaCheckError( + cudaMemPrefetchAsync(B_, sizeof(T) * k_ * n_, gpuDevice_, s2_)); + cudaCheckError( + cudaMemPrefetchAsync(C_, sizeof(T) * m_ * n_, gpuDevice_, s3_)); + break; + } + } + } + + /** Make a call to the BLAS Library Kernel. */ + void callGemm() override { + switch (offload_) { + case gpuOffloadType::always: { + // Offload data from host to the device. 
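+        // The three copies below are issued on streams s1_-s3_; the
+        // cudaDeviceSynchronize() at the end of this case guarantees that the
+        // copies and the GEMM have completed before the call returns.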
+ cudaCheckError(cudaMemcpyAsync(A_device_, A_, sizeof(T) * m_ * k_, + cudaMemcpyHostToDevice, s1_)); + cudaCheckError(cudaMemcpyAsync(B_device_, B_, sizeof(T) * k_ * n_, + cudaMemcpyHostToDevice, s2_)); + cudaCheckError(cudaMemcpyAsync(C_device_, C_, sizeof(T) * m_ * n_, + cudaMemcpyHostToDevice, s3_)); + // Call cuBLAS GEMM kernel + if constexpr (std::is_same_v) { + cublasStatus_t stat = + cublasSgemm(handle_, CUBLAS_OP_N, CUBLAS_OP_N, m_, n_, k_, &alpha, + A_device_, std::max(1, m_), B_device_, + std::max(1, k_), &beta, C_device_, std::max(1, m_)); + if (stat != CUBLAS_STATUS_SUCCESS) { + std::cout << "cuBLAS error:" << stat << std::endl; + exit(1); + } + } else if constexpr (std::is_same_v) { + cublasStatus_t stat = + cublasDgemm(handle_, CUBLAS_OP_N, CUBLAS_OP_N, m_, n_, k_, &alpha, + A_device_, std::max(1, m_), B_device_, + std::max(1, k_), &beta, C_device_, std::max(1, m_)); + if (stat != CUBLAS_STATUS_SUCCESS) { + std::cout << "cuBLAS error:" << stat << std::endl; + exit(1); + } + } + // Offload data from device to host + cudaCheckError(cudaMemcpyAsync(A_, A_device_, sizeof(T) * m_ * k_, + cudaMemcpyDeviceToHost, s1_)); + cudaCheckError(cudaMemcpyAsync(B_, B_device_, sizeof(T) * k_ * n_, + cudaMemcpyDeviceToHost, s2_)); + cudaCheckError(cudaMemcpyAsync(C_, C_device_, sizeof(T) * m_ * n_, + cudaMemcpyDeviceToHost, s3_)); + // Ensure device has finished all work. + cudaCheckError(cudaDeviceSynchronize()); + break; + } + case gpuOffloadType::once: { + // Call cuBLAS GEMM kernel + if constexpr (std::is_same_v) { + cublasStatus_t stat = + cublasSgemm(handle_, CUBLAS_OP_N, CUBLAS_OP_N, m_, n_, k_, &alpha, + A_device_, std::max(1, m_), B_device_, + std::max(1, k_), &beta, C_device_, std::max(1, m_)); + if (stat != CUBLAS_STATUS_SUCCESS) { + std::cout << "cuBLAS error:" << stat << std::endl; + exit(1); + } + } else if constexpr (std::is_same_v) { + cublasStatus_t stat = + cublasDgemm(handle_, CUBLAS_OP_N, CUBLAS_OP_N, m_, n_, k_, &alpha, + A_device_, std::max(1, m_), B_device_, + std::max(1, k_), &beta, C_device_, std::max(1, m_)); + if (stat != CUBLAS_STATUS_SUCCESS) { + std::cout << "cuBLAS error:" << stat << std::endl; + exit(1); + } + } + break; + } + case gpuOffloadType::unified: { + // Call cuBLAS GEMM kernel + if constexpr (std::is_same_v) { + cublasStatus_t stat = cublasSgemm( + handle_, CUBLAS_OP_N, CUBLAS_OP_N, m_, n_, k_, &alpha, A_, + std::max(1, m_), B_, std::max(1, k_), &beta, C_, std::max(1, m_)); + if (stat != CUBLAS_STATUS_SUCCESS) { + std::cout << "cuBLAS error:" << stat << std::endl; + exit(1); + } + } else if constexpr (std::is_same_v) { + cublasStatus_t stat = cublasDgemm( + handle_, CUBLAS_OP_N, CUBLAS_OP_N, m_, n_, k_, &alpha, A_, + std::max(1, m_), B_, std::max(1, k_), &beta, C_, std::max(1, m_)); + if (stat != CUBLAS_STATUS_SUCCESS) { + std::cout << "cuBLAS error:" << stat << std::endl; + exit(1); + } + } + break; + } + } + } + + /** Perform any required steps after calling the GEMM kernel that should + * be timed. 
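   * For the Once offload type this copies the results back from the device;
   * for Unified it prefetches A, B and C back to the host and synchronizes.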
*/ + void postLoopRequirements() override { + switch (offload_) { + case gpuOffloadType::always: { + // Offload data each iteration - no requirements + break; + } + case gpuOffloadType::once: { + // Offload data from device to host + cudaCheckError(cudaMemcpyAsync(A_, A_device_, sizeof(T) * m_ * k_, + cudaMemcpyDeviceToHost, s1_)); + cudaCheckError(cudaMemcpyAsync(B_, B_device_, sizeof(T) * k_ * n_, + cudaMemcpyDeviceToHost, s2_)); + cudaCheckError(cudaMemcpyAsync(C_, C_device_, sizeof(T) * m_ * n_, + cudaMemcpyDeviceToHost, s3_)); + // Ensure device has finished all work. + cudaCheckError(cudaDeviceSynchronize()); + break; + } + case gpuOffloadType::unified: { + // Ensure all data resides on host once work has completed + cudaCheckError(cudaMemPrefetchAsync(A_, sizeof(T) * m_ * k_, + cudaCpuDeviceId, s1_)); + cudaCheckError(cudaMemPrefetchAsync(B_, sizeof(T) * k_ * n_, + cudaCpuDeviceId, s2_)); + cudaCheckError(cudaMemPrefetchAsync(C_, sizeof(T) * m_ * n_, + cudaCpuDeviceId, s3_)); + // Ensure device has finished all work. + cudaCheckError(cudaDeviceSynchronize()); + break; + } + } + } + + /** Do any necessary cleanup (free pointers, close library handles, etc.) + * after Kernel has been called. */ + void postCallKernelCleanup() override { + // Destroy the handle + cublasDestroy(handle_); + + // Destroy streams after use + cudaCheckError(cudaStreamDestroy(s1_)); + cudaCheckError(cudaStreamDestroy(s2_)); + cudaCheckError(cudaStreamDestroy(s3_)); + + if (offload_ == gpuOffloadType::unified) { + cudaFree(A_); + cudaFree(B_); + cudaFree(C_); + } else { + // Free the memory held on host and device + free(A_); + free(B_); + free(C_); + cudaFree(A_device_); + cudaFree(B_device_); + cudaFree(C_device_); + } + } + + /** Handle used when calling cuBLAS. */ + cublasHandle_t handle_; + + /** CUDA Stream 1 - used to asynchronously move data between host and device. + */ + cudaStream_t s1_; + + /** CUDA Stream 1 - used to asynchronously move data between host and device. + */ + cudaStream_t s2_; + + /** CUDA Stream 1 - used to asynchronously move data between host and device. + */ + cudaStream_t s3_; + + /** The ID of the target GPU Device. */ + int gpuDevice_; + + /** Input matrix A, held on the device. */ + T* A_device_; + + /** Input matrix B, held on the device. */ + T* B_device_; + + /** Input matrix C, held on the device. */ + T* C_device_; + + /** The constant value Alpha. */ + const T alpha = ALPHA; + + /** The constant value Beta. 
*/ + const T beta = BETA; +}; +} // namespace gpu +#endif \ No newline at end of file diff --git a/include/doGemm.hh b/include/doGemm.hh index c1aa742..4a7c564 100644 --- a/include/doGemm.hh +++ b/include/doGemm.hh @@ -20,6 +20,7 @@ #if defined GPU_CUBLAS #include "../cuBLAS/gemm.hh" +#include "../cuBLAS/sp_gemm.hh" #elif defined GPU_ONEMKL #include "../oneMKL/GPU/gemm.hh" #elif defined GPU_ROCBLAS @@ -42,11 +43,13 @@ class doGemm { doGPU_(gpuEnabled) #if CPU_ENABLED , - gemmCpu_(iterations_) + gemmCpu_(iterations_), + spGemmCpu_(iterations_) #endif #if GPU_ENABLED , - gemmGpu_(iterations_) + gemmGpu_(iterations_), + spGemmGpu_(iterations_) #endif { static_assert((std::is_same_v || std::is_same_v) && @@ -68,7 +71,7 @@ class doGemm { "_square_square_M=N=K.csv"); for (int dim = startDimention_; dim <= upperLimit_; dim++) { // M = dim, N = dim, K = dim; - callKernels(csvFile, dim, dim, dim); + callDenseKernels(csvFile, dim, dim, dim); } // Close file csvFile.close(); @@ -94,7 +97,7 @@ class doGemm { int M = 16 * K; int N = 16 * K; while (M <= upperLimit_) { - callKernels(csvFile, M, N, K); + callDenseKernels(csvFile, M, N, K); M += 16; N += 16; K++; @@ -121,7 +124,7 @@ class doGemm { if (upperLimit_ >= 32) { for (int dim = startDimention_; dim <= upperLimit_; dim++) { // M = dim, N = dim, K = 32; - callKernels(csvFile, dim, dim, 32); + callDenseKernels(csvFile, dim, dim, 32); } } // Close file @@ -147,7 +150,7 @@ class doGemm { N = startDimention_; K = 16 * M; while (K <= upperLimit_) { - callKernels(csvFile, M, N, K); + callDenseKernels(csvFile, M, N, K); M++; N++; K += 16; @@ -174,7 +177,7 @@ class doGemm { if (upperLimit_ >= 32) { for (int dim = startDimention_; dim <= upperLimit_; dim++) { // M = 32, N = 32, K = dim; - callKernels(csvFile, 32, 32, dim); + callDenseKernels(csvFile, 32, 32, dim); } } // Close file @@ -200,7 +203,7 @@ class doGemm { N = startDimention_; M = 16 * K; while (M <= upperLimit_) { - callKernels(csvFile, M, N, K); + callDenseKernels(csvFile, M, N, K); M += 16; N++; K++; @@ -227,7 +230,7 @@ class doGemm { if (upperLimit_ >= 32) { for (int dim = startDimention_; dim <= upperLimit_; dim++) { // M = dim, N = 32, K = 32; - callKernels(csvFile, dim, 32, 32); + callDenseKernels(csvFile, dim, 32, 32); } } // Close file @@ -253,7 +256,7 @@ class doGemm { K = startDimention_; N = 16 * K; while (N <= upperLimit_) { - callKernels(csvFile, M, N, K); + callDenseKernels(csvFile, M, N, K); M++; N += 16; K++; @@ -280,7 +283,7 @@ class doGemm { if (upperLimit_ >= 32) { for (int dim = startDimention_; dim <= upperLimit_; dim++) { // M = 32, N = dim, K = 32; - callKernels(csvFile, 32, dim, 32); + callDenseKernels(csvFile, 32, dim, 32); } } // Close file @@ -291,12 +294,27 @@ class doGemm { printOffloadThreshold("Square x Short-and-Wide (M=K=32, N)"); } #endif + + // Square sparse matrix - sparse matrix multiplication + cpuGpu_always_ = cpuGpu_offloadThreshold(); + cpuGpu_once_ = cpuGpu_offloadThreshold(); + cpuGpu_unified_ = cpuGpu_offloadThreshold(); + csvFile = initCSVFile(std::string(CSV_DIR) + "/" + getKernelName() + + "_sparse_square.csv"); + if (upperLimit_ >= 32) { + for (int dim = 1; dim <= upperLimit_; dim++) { + const int N = dim; + callSparseKernels(csvFile, N, 0.99); + } + } + // Close file + csvFile.close(); } private: /** Call the appropriate CPU and GPU GEMM kernels. 
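   * (Dense problems only; sparse-matrix problems are handled by
   * callSparseKernels below.)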
*/ - void callKernels(std::ofstream& csvFile, const int M, const int N, - const int K) { + void callDenseKernels(std::ofstream& csvFile, const int M, const int N, + const int K) { const double probSize = calcKib(M, N, K); const uint64_t flops = calcFlops(M, N, K); std::string kernelName = getKernelName(); @@ -488,6 +506,52 @@ class doGemm { } } + void callSparseKernels(std::ofstream& csvFile, const int N, const float + sparsity) { + const double probSize = calcKib(N, N, N); + const uint64_t flops = calcFlops(N, N, N); + std::string kernelName = getKernelName(); + + spGemmCpu_.initialise(N, sparsity); + time_checksum_gflop cpuResult = spGemmCpu_.compute(); + cpuResult.gflops = calcGflops(flops, iterations_, cpuResult.runtime); + + // Perform the GPU kernels + // - ONCE : Offload to/from GPU once before all iterations and once + // after + spGemmGpu_.initialise(gpuOffloadType::once, N, N, N); + time_checksum_gflop gpuResult_once = gemmGpu_.compute(); + gpuResult_once.gflops = + calcGflops(flops, iterations_, gpuResult_once.runtime); + + // - ALWAYS: Offload to/from GPU every iteration + spGemmGpu_.initialise(gpuOffloadType::always, N, N, N); + time_checksum_gflop gpuResult_always = gemmGpu_.compute(); + gpuResult_always.gflops = + calcGflops(flops, iterations_, gpuResult_always.runtime); + + // - UNIFIED : data passed from host to device (and device to host) as + // needed + spGemmGpu_.initialise(gpuOffloadType::unified, N, N, N); + time_checksum_gflop gpuResult_unified = gemmGpu_.compute(); + gpuResult_unified.gflops = + calcGflops(flops, iterations_, gpuResult_unified.runtime); + + // ToDo -- non-default GPU operations + + // Write lines to CSV file + writeLineToCsv(csvFile, "cpu", kernelName, N, N, N, probSize, iterations_, + cpuResult.runtime, cpuResult.gflops); + writeLineToCsv(csvFile, "gpu_offloadOnce", kernelName, N, N, N, probSize, + iterations_, gpuResult_once.runtime, gpuResult_once.gflops); + writeLineToCsv(csvFile, "gpu_offloadAlways", kernelName, N, N, N, probSize, + iterations_, gpuResult_always.runtime, + gpuResult_always.gflops); + writeLineToCsv(csvFile, "gpu_unified", kernelName, N, N, N, probSize, + iterations_, gpuResult_unified.runtime, + gpuResult_unified.gflops); + } + /** A function for calculating FLOPs performed by a GEMM. * C = alpha*AB + beta*C */ constexpr uint64_t calcFlops(const int M, const int N, const int K) const { @@ -623,11 +687,15 @@ class doGemm { cpu::gemm_cpu gemmCpu_; #endif + cpu::sp_gemm_cpu spGemmCpu_; + #if GPU_ENABLED /** The GEMM GPU kernel. */ gpu::gemm_gpu gemmGpu_; #endif + gpu::sp_gemm_gpu spGemmGpu_; + /** The point at which offloading to GPU (offload once) becomes worthwhile. */ cpuGpu_offloadThreshold cpuGpu_once_; diff --git a/include/kernels/CPU/sp_gemm.hh b/include/kernels/CPU/sp_gemm.hh new file mode 100644 index 0000000..3de5ea5 --- /dev/null +++ b/include/kernels/CPU/sp_gemm.hh @@ -0,0 +1,110 @@ +#pragma once + +#include "../gemm.hh" + +#include + +namespace cpu { + +/** An abstract class for GEMM BLAS kernels. */ + template + class sp_gemm : public ::gemm { + public: + using ::gemm::gemm; + using ::gemm::m_; + using ::gemm::n_; + using ::gemm::k_; + using ::gemm::A_; + using ::gemm::B_; + using ::gemm::C_; + + public: + /** Initialise the required data structures. 
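   * `n` is the dimension of the square n x n operands, `sparsity` is the
   * target fraction of zero entries, and `binary` selects 0/1 edge weights
   * instead of random reals; e.g. initialise(1024, 0.99) builds two
   * 1024 x 1024 matrices with roughly 1% non-zeros.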
*/ + virtual void initialise(int n, double sparsity, bool binary = false) { + n_ = n; + + A_ = (T*)malloc(sizeof(T) * n_ * n_); + B_ = (T*)malloc(sizeof(T) * n_ * n_); + C_ = (T*)malloc(sizeof(T) * n_ * n_); + + // Set initial values to 0 + for (int i = 0; i < (n_ * n_); i++) { + A_[i] = 0.0; + B_[i] = 0.0; + } + + // Random number generator objects for use in descent + std::default_random_engine gen; + gen.seed(std::chrono::system_clock::now() + .time_since_epoch().count()); + std::uniform_real_distribution dist(0.0, 1.0); + + // Work out number of edges needed to achieve target sparsity + int edges = 1 + (int) (n * n * (1 - sparsity)); + + // Initialise the matrices + // Using a=0.45 and b=c=0.22 as default probabilities + for (int i = 0; i < edges; i++) { + while (!rMat(A_, n, 0, n - 1, 0, n - 1, + 0.45, 0.22, 0.22, + &gen, dist, false)) {} + while (!rMat(B_, n, 0, n - 1, 0, n - 1, + 0.45, 0.22, 0.22, + &gen, dist, false)) {} + } + } + + private: + bool rMat(T* M, int n, int x1, int x2, int y1, int y2, + float a, float b, float c, std::default_random_engine* gen, + std::uniform_real_distribution dist, bool bin) { + // If a 1x1 submatrix, then add an edge and return out + if (x1 >= x2 && y1 >= y2) { + if (abs(M[(y1 * n) + x1]) > 0.1) { + return false; + } else { + // Add 1.0 if this is a binary graph, and a random real number otherwise + M[(int) (y1 * n) + x1] = (bin) ? 1.0 : (((rand() % 10000) / + 100.0) - 50.0); + return true; + } + } else { + // Divide up the matrix + int xMidPoint = x1 + floor((x2 - x1) / 2); + int yMidPoint = y1 + floor((y2 - y1) / 2); + + // ToDo -- add some noise to these values between iterations + float newA = a; + float newB = b; + float newC = c; + + // Work out which quarter to recurse into + // There are some ugly ternary operators here to avoid going out of bounds in the edge case + // that we are already at 1 width or 1 height + float randomNum = dist(*gen); + if (randomNum < a) { + return rMat(M, n, x1, xMidPoint, y1, yMidPoint, + newA, newB, newC, gen, dist, bin); + } else if (randomNum < (a + b)) { + return rMat(M, n, ((xMidPoint < x2) ? xMidPoint + 1 : xMidPoint), x2, y1, yMidPoint, + newA, newB, newC, gen, dist, bin); + } else if (randomNum < (a + b + c)) { + return rMat(M, n, x1, xMidPoint, ((yMidPoint < y2) ? yMidPoint + 1 : yMidPoint), y2, + newA, newB, newC, gen, dist, bin); + } else { + return rMat(M, n, ((xMidPoint < x2) ? xMidPoint + 1 : xMidPoint), x2, + ((yMidPoint < y2) ? yMidPoint + 1 : yMidPoint), y2, newA, newB, newC, + gen, dist, bin); + } + } + return true; + } + /** Do any necessary cleanup (free pointers, close library handles, etc.) + * after Kernel has been called. */ + void postCallKernelCleanup() { + free(A_); + free(B_); + free(C_); + } + }; +} // namespace cpu \ No newline at end of file diff --git a/include/kernels/GPU/sp_gemm.hh b/include/kernels/GPU/sp_gemm.hh new file mode 100644 index 0000000..684c166 --- /dev/null +++ b/include/kernels/GPU/sp_gemm.hh @@ -0,0 +1,27 @@ +#pragma once + +#include "../gemm.hh" + +namespace gpu { + +/** An abstract class for GEMM BLAS kernels. */ + template + class sp_gemm : public ::gemm { + public: + using ::gemm::gemm; + + /** Initialise the required data structures. 
+ * `offload` refers to the data offload type: + * - Once: Move data from host to device before all iterations & move from + * device to host after all iterations + * - Always: Move data from host to device and device to host each iteration + * - Unified: Initialise data as unified memory; no data movement semantics + * required */ + virtual void initialise(gpuOffloadType offload, int m, int n, int k) = 0; + + protected: + /** Whether data should be offloaded to/from the GPU each iteration, or just + * before & after. */ + gpuOffloadType offload_ = gpuOffloadType::always; + }; +} // namespace gpu \ No newline at end of file diff --git a/src/main.cc b/src/main.cc index c61df37..38e2b5a 100644 --- a/src/main.cc +++ b/src/main.cc @@ -2,6 +2,10 @@ int iters = 10; int upperLimit = 128; +bool sgemm = true; +bool dgemm = true; +bool sp_sgemm = true; +bool sp_dgemm = true; bool doCpu = CPU_ENABLED; bool doGpu = GPU_ENABLED; From f2ed11f5325e2e063d0f92e07d09b13db6b356d7 Mon Sep 17 00:00:00 2001 From: Alex Cockrean <84676155+ABenC377@users.noreply.github.com> Date: Wed, 13 Mar 2024 13:43:05 +0000 Subject: [PATCH 03/32] Implementing cuSPARSE kernel --- cuBLAS/sp_gemm.hh | 208 +++++++++++++++++++++++++--------------------- 1 file changed, 111 insertions(+), 97 deletions(-) diff --git a/cuBLAS/sp_gemm.hh b/cuBLAS/sp_gemm.hh index 3a9cff0..67d030c 100644 --- a/cuBLAS/sp_gemm.hh +++ b/cuBLAS/sp_gemm.hh @@ -1,7 +1,7 @@ #pragma once #ifdef GPU_CUBLAS -#include +#include "cusparse.h" #include #include "../include/kernels/GPU/gemm.hh" @@ -14,9 +14,7 @@ template class sp_gemm_gpu : public gemm { public: using gemm::gemm; - using gemm::m_; using gemm::n_; - using gemm::k_; using gemm::A_; using gemm::B_; using gemm::C_; @@ -29,15 +27,28 @@ class sp_gemm_gpu : public gemm { * - Always: Move data from host to device and device to host each iteration * - Unified: Initialise data as unified memory; no data movement semantics * required */ - void initialise(gpuOffloadType offload, int m, int n, int k) override { + void initialise(gpuOffloadType offload, int n, float sparsity) override { offload_ = offload; - m_ = m; + // Create a handle for cuSPARSE + cusparseCreate(&handle_); + n_ = n; - k_ = k; - // Create a handle for CUBLAS - cublasCreate(&handle_); + // Create descriptors for matrices A->C + cusparseMatDescr_t descrA, descrB, descrC; + + cusparseCreateMatDescr(&descrA); + cusparseCreateMatDescr(&descrB); + cusparseCreateMatDescr(&descrC); + + cusparseSetMatType(descrA, CUSPARSE_MATRIX_TYPE_GENERAL); + cusparseSetMatType(descrB, CUSPARSE_MATRIX_TYPE_GENERAL); + cusparseSetMatType(descrC, CUSPARSE_MATRIX_TYPE_GENERAL); + + cusparseSetMatIndexBase(descrA, CUSPARSE_INDEX_BASE_ZERO); + cusparseSetMatIndexBase(descrB, CUSPARSE_INDEX_BASE_ZERO); + cusparseSetMatIndexBase(descrC, CUSPARSE_INDEX_BASE_ZERO); // Get device identifier cudaCheckError(cudaGetDevice(&gpuDevice_)); @@ -47,38 +58,96 @@ class sp_gemm_gpu : public gemm { cudaCheckError(cudaStreamCreate(&s2_)); cudaCheckError(cudaStreamCreate(&s3_)); + + // Work out number of edges needed to achieve target sparsity + int edges = 1 + (int) (n_ * n_ * (1 - sparsity)); + if (offload_ == gpuOffloadType::unified) { - cudaCheckError(cudaMallocManaged(&A_, sizeof(T) * m_ * k_)); - cudaCheckError(cudaMallocManaged(&B_, sizeof(T) * k_ * n_)); - cudaCheckError(cudaMallocManaged(&C_, sizeof(T) * m_ * n_)); + cudaCheckError(cudaMallocManaged(&A_, sizeof(T) * n_ * n_)); + cudaCheckError(cudaMallocManaged(&B_, sizeof(T) * n_ * n_)); + cudaCheckError(cudaMallocManaged(&C_, 
sizeof(T) * n_ * n_)); + cudaCheckError(cudaMallocManaged(&DANnzPerRow, sizeof(int) * n_)); } else { // Allocate matrices on host - A_ = (T*)malloc(sizeof(T) * m_ * k_); - B_ = (T*)malloc(sizeof(T) * k_ * n_); - C_ = (T*)malloc(sizeof(T) * m_ * n_); + A_ = (T*)malloc(sizeof(T) * n_ * n_); + B_ = (T*)malloc(sizeof(T) * n_ * n_); + C_ = (T*)malloc(sizeof(T) * n_ * n_); + // Allocate matrices on device - cudaCheckError(cudaMalloc((void**)&A_device_, sizeof(T) * m_ * k_)); - cudaCheckError(cudaMalloc((void**)&B_device_, sizeof(T) * k_ * n_)); - cudaCheckError(cudaMalloc((void**)&C_device_, sizeof(T) * m_ * n_)); + cudaCheckError(cudaMalloc((void**)&A_device_, sizeof(T) * n_ * n_)); + cudaCheckError(cudaMalloc((void**)&B_device_, sizeof(T) * n_ * n_)); + cudaCheckError(cudaMalloc((void**)&C_device_, sizeof(T) * n_ * n_)); + // Alloce non-zero vector for A + cudaCheckError(cudaMalloc((void**)&dANnzPerRow, sizeof(int) * n_)); } - // Initialise the host matricies - srand(SEED); - for (int y = 0; y < m_; y++) { - for (int x = 0; x < k_; x++) { - A_[y * k_ + x] = (((T)(rand() % 10000) / 100.0) - 30.0); - } - } - for (int y = 0; y < k_; y++) { - for (int x = 0; x < n_; x++) { - B_[y * n_ + x] = (((T)(rand() % 10000) / 100.0) - 30.0); - } - } + // Initialise the host matricies + // cusparseSpGEMM() works on CSR format only. This helpfully makes our + // sparse matrix format decision for us! + // ToDo -- do the RMAT instantiation of A_ and B_. Need to think about + // how this can be done in the context of CSR. + + // Initialise the matrices + // Using a=0.45 and b=c=0.22 as default probabilities + for (int i = 0; i < edges; i++) { + while (!rMat(A_, n, 0, n - 1, 0, n - 1, + 0.45, 0.22, 0.22, + &gen, dist, false)) {} + while (!rMat(B_, n, 0, n - 1, 0, n - 1, + 0.45, 0.22, 0.22, + &gen, dist, false)) {} + } } private: + bool rMat(T* M, int n, int x1, int x2, int y1, int y2, + float a, float b, float c, std::default_random_engine* gen, + std::uniform_real_distribution dist, bool bin) { + // If a 1x1 submatrix, then add an edge and return out + if (x1 >= x2 && y1 >= y2) { + if (abs(M[(y1 * n) + x1]) > 0.1) { + return false; + } else { + // Add 1.0 if this is a binary graph, and a random real number otherwise + M[(int) (y1 * n) + x1] = (bin) ? 1.0 : (((rand() % 10000) / + 100.0) - 50.0); + return true; + } + } else { + // Divide up the matrix + int xMidPoint = x1 + floor((x2 - x1) / 2); + int yMidPoint = y1 + floor((y2 - y1) / 2); + + // ToDo -- add some noise to these values between iterations + float newA = a; + float newB = b; + float newC = c; + + // Work out which quarter to recurse into + // There are some ugly ternary operators here to avoid going out of bounds in the edge case + // that we are already at 1 width or 1 height + float randomNum = dist(*gen); + if (randomNum < a) { + return rMat(M, n, x1, xMidPoint, y1, yMidPoint, + newA, newB, newC, gen, dist, bin); + } else if (randomNum < (a + b)) { + return rMat(M, n, ((xMidPoint < x2) ? xMidPoint + 1 : xMidPoint), x2, y1, yMidPoint, + newA, newB, newC, gen, dist, bin); + } else if (randomNum < (a + b + c)) { + return rMat(M, n, x1, xMidPoint, ((yMidPoint < y2) ? yMidPoint + 1 : yMidPoint), y2, + newA, newB, newC, gen, dist, bin); + } else { + return rMat(M, n, ((xMidPoint < x2) ? xMidPoint + 1 : xMidPoint), x2, + ((yMidPoint < y2) ? yMidPoint + 1 : yMidPoint), y2, newA, newB, newC, + gen, dist, bin); + } + } + return true; + } + /** Perform any required steps before calling the GEMM kernel that should * be timed. 
*/ + // ToDo -- update this to apply to CSR format void preLoopRequirements() override { switch (offload_) { case gpuOffloadType::always: { @@ -119,79 +188,20 @@ class sp_gemm_gpu : public gemm { cudaMemcpyHostToDevice, s2_)); cudaCheckError(cudaMemcpyAsync(C_device_, C_, sizeof(T) * m_ * n_, cudaMemcpyHostToDevice, s3_)); - // Call cuBLAS GEMM kernel - if constexpr (std::is_same_v) { - cublasStatus_t stat = - cublasSgemm(handle_, CUBLAS_OP_N, CUBLAS_OP_N, m_, n_, k_, &alpha, - A_device_, std::max(1, m_), B_device_, - std::max(1, k_), &beta, C_device_, std::max(1, m_)); - if (stat != CUBLAS_STATUS_SUCCESS) { - std::cout << "cuBLAS error:" << stat << std::endl; - exit(1); - } - } else if constexpr (std::is_same_v) { - cublasStatus_t stat = - cublasDgemm(handle_, CUBLAS_OP_N, CUBLAS_OP_N, m_, n_, k_, &alpha, - A_device_, std::max(1, m_), B_device_, - std::max(1, k_), &beta, C_device_, std::max(1, m_)); - if (stat != CUBLAS_STATUS_SUCCESS) { - std::cout << "cuBLAS error:" << stat << std::endl; - exit(1); - } - } - // Offload data from device to host - cudaCheckError(cudaMemcpyAsync(A_, A_device_, sizeof(T) * m_ * k_, - cudaMemcpyDeviceToHost, s1_)); - cudaCheckError(cudaMemcpyAsync(B_, B_device_, sizeof(T) * k_ * n_, - cudaMemcpyDeviceToHost, s2_)); - cudaCheckError(cudaMemcpyAsync(C_, C_device_, sizeof(T) * m_ * n_, - cudaMemcpyDeviceToHost, s3_)); - // Ensure device has finished all work. - cudaCheckError(cudaDeviceSynchronize()); + // Call cuSPARSE SpGEMM kernel + // ToDo -- implement break; } case gpuOffloadType::once: { - // Call cuBLAS GEMM kernel - if constexpr (std::is_same_v) { - cublasStatus_t stat = - cublasSgemm(handle_, CUBLAS_OP_N, CUBLAS_OP_N, m_, n_, k_, &alpha, - A_device_, std::max(1, m_), B_device_, - std::max(1, k_), &beta, C_device_, std::max(1, m_)); - if (stat != CUBLAS_STATUS_SUCCESS) { - std::cout << "cuBLAS error:" << stat << std::endl; - exit(1); - } - } else if constexpr (std::is_same_v) { - cublasStatus_t stat = - cublasDgemm(handle_, CUBLAS_OP_N, CUBLAS_OP_N, m_, n_, k_, &alpha, - A_device_, std::max(1, m_), B_device_, - std::max(1, k_), &beta, C_device_, std::max(1, m_)); - if (stat != CUBLAS_STATUS_SUCCESS) { - std::cout << "cuBLAS error:" << stat << std::endl; - exit(1); - } - } + // Call cuSPRASE SpGEMM kernel + // ToDo -- implement + break; } case gpuOffloadType::unified: { - // Call cuBLAS GEMM kernel - if constexpr (std::is_same_v) { - cublasStatus_t stat = cublasSgemm( - handle_, CUBLAS_OP_N, CUBLAS_OP_N, m_, n_, k_, &alpha, A_, - std::max(1, m_), B_, std::max(1, k_), &beta, C_, std::max(1, m_)); - if (stat != CUBLAS_STATUS_SUCCESS) { - std::cout << "cuBLAS error:" << stat << std::endl; - exit(1); - } - } else if constexpr (std::is_same_v) { - cublasStatus_t stat = cublasDgemm( - handle_, CUBLAS_OP_N, CUBLAS_OP_N, m_, n_, k_, &alpha, A_, - std::max(1, m_), B_, std::max(1, k_), &beta, C_, std::max(1, m_)); - if (stat != CUBLAS_STATUS_SUCCESS) { - std::cout << "cuBLAS error:" << stat << std::endl; - exit(1); - } - } + // Call cuSPARSE SpGEMM kernel + // ToDo -- implement + break; } } @@ -199,6 +209,7 @@ class sp_gemm_gpu : public gemm { /** Perform any required steps after calling the GEMM kernel that should * be timed. */ + // ToDo -- check that this all still works void postLoopRequirements() override { switch (offload_) { case gpuOffloadType::always: { @@ -236,7 +247,7 @@ class sp_gemm_gpu : public gemm { * after Kernel has been called. 
*/ void postCallKernelCleanup() override { // Destroy the handle - cublasDestroy(handle_); + cusparseDestroy(handle_); // Destroy streams after use cudaCheckError(cudaStreamDestroy(s1_)); @@ -285,6 +296,9 @@ class sp_gemm_gpu : public gemm { /** Input matrix C, held on the device. */ T* C_device_; + /** Vector for number non-zeros, held on the device */ + int* dANnzPerRow; + /** The constant value Alpha. */ const T alpha = ALPHA; From c208246927e738615a94c0308e845cf42c198f98 Mon Sep 17 00:00:00 2001 From: Alex Cockrean <84676155+ABenC377@users.noreply.github.com> Date: Wed, 13 Mar 2024 14:05:20 +0000 Subject: [PATCH 04/32] Trying to work out CSR malloc bug --- cuBLAS/sp_gemm.hh | 126 ++++++++++++++++++++++++++++------------------ 1 file changed, 76 insertions(+), 50 deletions(-) diff --git a/cuBLAS/sp_gemm.hh b/cuBLAS/sp_gemm.hh index 67d030c..3232293 100644 --- a/cuBLAS/sp_gemm.hh +++ b/cuBLAS/sp_gemm.hh @@ -66,7 +66,19 @@ class sp_gemm_gpu : public gemm { cudaCheckError(cudaMallocManaged(&A_, sizeof(T) * n_ * n_)); cudaCheckError(cudaMallocManaged(&B_, sizeof(T) * n_ * n_)); cudaCheckError(cudaMallocManaged(&C_, sizeof(T) * n_ * n_)); - cudaCheckError(cudaMallocManaged(&DANnzPerRow, sizeof(int) * n_)); + + cudaCheckError(cudaMallocManaged(&A_val_, sizeof(T) * edges)); + cudaCheckError(cudaMallocManaged(&A_col_, sizeof(int) * edges)); + cudaCheckError(cudaMallocManaged(&A_row_, sizeof(int) * edges)); + + cudaCheckError(cudaMallocManaged(&B_val_, sizeof(T) * edges)); + cudaCheckError(cudaMallocManaged(&B_col_, sizeof(int) * edges)); + cudaCheckError(cudaMallocManaged(&B_row_, sizeof(int) * edges)); + + cudaCheckError(cudaMallocManaged(&C_val_, sizeof(T) * edges)); + cudaCheckError(cudaMallocManaged(&C_col_, sizeof(int) * edges)); + cudaCheckError(cudaMallocManaged(&C_row_, sizeof(int) * edges)); +// cudaCheckError(cudaMallocManaged(&DANnzPerRow, sizeof(int) * n_)); } else { // Allocate matrices on host A_ = (T*)malloc(sizeof(T) * n_ * n_); @@ -78,7 +90,7 @@ class sp_gemm_gpu : public gemm { cudaCheckError(cudaMalloc((void**)&B_device_, sizeof(T) * n_ * n_)); cudaCheckError(cudaMalloc((void**)&C_device_, sizeof(T) * n_ * n_)); // Alloce non-zero vector for A - cudaCheckError(cudaMalloc((void**)&dANnzPerRow, sizeof(int) * n_)); +// cudaCheckError(cudaMalloc((void**)&dANnzPerRow, sizeof(int) * n_)); } // Initialise the host matricies @@ -88,6 +100,11 @@ class sp_gemm_gpu : public gemm { // how this can be done in the context of CSR. // Initialise the matrices + // Set initial values to 0 + for (int i = 0; i < (n_ * n_); i++) { + A_[i] = 0.0; + B_[i] = 0.0; + } // Using a=0.45 and b=c=0.22 as default probabilities for (int i = 0; i < edges; i++) { while (!rMat(A_, n, 0, n - 1, 0, n - 1, @@ -97,57 +114,17 @@ class sp_gemm_gpu : public gemm { 0.45, 0.22, 0.22, &gen, dist, false)) {} } + +// for (int i = 0; i < (n_ * n_); i++) { +// C_[i] = 0.0; +// } } private: - bool rMat(T* M, int n, int x1, int x2, int y1, int y2, - float a, float b, float c, std::default_random_engine* gen, - std::uniform_real_distribution dist, bool bin) { - // If a 1x1 submatrix, then add an edge and return out - if (x1 >= x2 && y1 >= y2) { - if (abs(M[(y1 * n) + x1]) > 0.1) { - return false; - } else { - // Add 1.0 if this is a binary graph, and a random real number otherwise - M[(int) (y1 * n) + x1] = (bin) ? 
1.0 : (((rand() % 10000) / - 100.0) - 50.0); - return true; - } - } else { - // Divide up the matrix - int xMidPoint = x1 + floor((x2 - x1) / 2); - int yMidPoint = y1 + floor((y2 - y1) / 2); - - // ToDo -- add some noise to these values between iterations - float newA = a; - float newB = b; - float newC = c; - - // Work out which quarter to recurse into - // There are some ugly ternary operators here to avoid going out of bounds in the edge case - // that we are already at 1 width or 1 height - float randomNum = dist(*gen); - if (randomNum < a) { - return rMat(M, n, x1, xMidPoint, y1, yMidPoint, - newA, newB, newC, gen, dist, bin); - } else if (randomNum < (a + b)) { - return rMat(M, n, ((xMidPoint < x2) ? xMidPoint + 1 : xMidPoint), x2, y1, yMidPoint, - newA, newB, newC, gen, dist, bin); - } else if (randomNum < (a + b + c)) { - return rMat(M, n, x1, xMidPoint, ((yMidPoint < y2) ? yMidPoint + 1 : yMidPoint), y2, - newA, newB, newC, gen, dist, bin); - } else { - return rMat(M, n, ((xMidPoint < x2) ? xMidPoint + 1 : xMidPoint), x2, - ((yMidPoint < y2) ? yMidPoint + 1 : yMidPoint), y2, newA, newB, newC, - gen, dist, bin); - } - } - return true; - } + /** Perform any required steps before calling the GEMM kernel that should * be timed. */ - // ToDo -- update this to apply to CSR format void preLoopRequirements() override { switch (offload_) { case gpuOffloadType::always: { @@ -188,8 +165,8 @@ class sp_gemm_gpu : public gemm { cudaMemcpyHostToDevice, s2_)); cudaCheckError(cudaMemcpyAsync(C_device_, C_, sizeof(T) * m_ * n_, cudaMemcpyHostToDevice, s3_)); - // Call cuSPARSE SpGEMM kernel - // ToDo -- implement + + break; } case gpuOffloadType::once: { @@ -269,6 +246,51 @@ class sp_gemm_gpu : public gemm { } } + bool rMat(T* M, int n, int x1, int x2, int y1, int y2, + float a, float b, float c, std::default_random_engine* gen, + std::uniform_real_distribution dist, bool bin) { + // If a 1x1 submatrix, then add an edge and return out + if (x1 >= x2 && y1 >= y2) { + if (abs(M[(y1 * n) + x1]) > 0.1) { + return false; + } else { + // Add 1.0 if this is a binary graph, and a random real number otherwise + M[(int) (y1 * n) + x1] = (bin) ? 1.0 : (((rand() % 10000) / + 100.0) - 50.0); + return true; + } + } else { + // Divide up the matrix + int xMidPoint = x1 + floor((x2 - x1) / 2); + int yMidPoint = y1 + floor((y2 - y1) / 2); + + // ToDo -- add some noise to these values between iterations + float newA = a; + float newB = b; + float newC = c; + + // Work out which quarter to recurse into + // There are some ugly ternary operators here to avoid going out of bounds in the edge case + // that we are already at 1 width or 1 height + float randomNum = dist(*gen); + if (randomNum < a) { + return rMat(M, n, x1, xMidPoint, y1, yMidPoint, + newA, newB, newC, gen, dist, bin); + } else if (randomNum < (a + b)) { + return rMat(M, n, ((xMidPoint < x2) ? xMidPoint + 1 : xMidPoint), x2, y1, yMidPoint, + newA, newB, newC, gen, dist, bin); + } else if (randomNum < (a + b + c)) { + return rMat(M, n, x1, xMidPoint, ((yMidPoint < y2) ? yMidPoint + 1 : yMidPoint), y2, + newA, newB, newC, gen, dist, bin); + } else { + return rMat(M, n, ((xMidPoint < x2) ? xMidPoint + 1 : xMidPoint), x2, + ((yMidPoint < y2) ? yMidPoint + 1 : yMidPoint), y2, newA, newB, newC, + gen, dist, bin); + } + } + return true; + } + /** Handle used when calling cuBLAS. 
*/ cublasHandle_t handle_; @@ -297,7 +319,11 @@ class sp_gemm_gpu : public gemm { T* C_device_; /** Vector for number non-zeros, held on the device */ - int* dANnzPerRow; +// int* dANnzPerRow; + + /** CSR format vectors for matrices A, B and C on the device */ + T* A_val_, B_val_, C_val_; + int* A_col_, A_row_, B_col_, B_row_, C_col_, C_row_; /** The constant value Alpha. */ const T alpha = ALPHA; From de14a5682aae00ab582f87a396eaf3da5b66b99f Mon Sep 17 00:00:00 2001 From: Alex Cockrean <84676155+ABenC377@users.noreply.github.com> Date: Wed, 13 Mar 2024 14:07:46 +0000 Subject: [PATCH 05/32] Trying to work out CSR malloc bug --- cuBLAS/sp_gemm.hh | 2 -- 1 file changed, 2 deletions(-) diff --git a/cuBLAS/sp_gemm.hh b/cuBLAS/sp_gemm.hh index 3232293..0765adb 100644 --- a/cuBLAS/sp_gemm.hh +++ b/cuBLAS/sp_gemm.hh @@ -96,8 +96,6 @@ class sp_gemm_gpu : public gemm { // Initialise the host matricies // cusparseSpGEMM() works on CSR format only. This helpfully makes our // sparse matrix format decision for us! - // ToDo -- do the RMAT instantiation of A_ and B_. Need to think about - // how this can be done in the context of CSR. // Initialise the matrices // Set initial values to 0 From 49cddf02f8a50571d2eaa5b653bdf8fb49198d91 Mon Sep 17 00:00:00 2001 From: Alex Cockrean <84676155+ABenC377@users.noreply.github.com> Date: Tue, 19 Mar 2024 13:05:58 +0000 Subject: [PATCH 06/32] cuSPARSE unified memory implementation --- cuBLAS/sp_gemm.hh | 433 ++++++++++++++++++++++++++-------------------- 1 file changed, 250 insertions(+), 183 deletions(-) diff --git a/cuBLAS/sp_gemm.hh b/cuBLAS/sp_gemm.hh index 0765adb..68e3b84 100644 --- a/cuBLAS/sp_gemm.hh +++ b/cuBLAS/sp_gemm.hh @@ -3,6 +3,7 @@ #ifdef GPU_CUBLAS #include "cusparse.h" #include +#include #include "../include/kernels/GPU/gemm.hh" #include "../include/utilities.hh" @@ -20,6 +21,8 @@ class sp_gemm_gpu : public gemm { using gemm::C_; using gemm::offload_; + // ToDo -- just unified implemented so far. Fill in Always and Once later + /** Initialise the required data structures. * `offload` refers to the data offload type: * - Once: Move data from host to device before all iterations & move from @@ -33,10 +36,10 @@ class sp_gemm_gpu : public gemm { // Create a handle for cuSPARSE cusparseCreate(&handle_); - n_ = n; + cudaDataType_ = (std::is_same_v) ? CUDA_R_32F : + CUDA_R_64F; - // Create descriptors for matrices A->C - cusparseMatDescr_t descrA, descrB, descrC; + n_ = n; cusparseCreateMatDescr(&descrA); cusparseCreateMatDescr(&descrB); @@ -61,37 +64,30 @@ class sp_gemm_gpu : public gemm { // Work out number of edges needed to achieve target sparsity int edges = 1 + (int) (n_ * n_ * (1 - sparsity)); + A_nnz_ = B_nnz_ = edges + + // ToDo -- for all of this mallocing, bear in mind that row will probably + // have fewer than 'edges' values (thats the whole point). 
May need to + // reorganise + + cudaCheckError(cudaMallocManaged(A_num_rows_, sizeof(int))); + cudaCheckError(cudaMallocManaged(A_num_cols_, sizeof(int))); + cudaCheckError(cudaMallocManaged(A_nnz_, sizeof(int))); + cudaCheckError(cudaMallocManaged(&A_val_, sizeof(T) * edges)); + cudaCheckError(cudaMallocManaged(&A_col_, sizeof(int) * edges)); + cudaCheckError(cudaMallocManaged(&A_row_, sizeof(int) * (n_ + 1))); + + cudaCheckError(cudaMallocManaged(B_num_rows_, sizeof(int))); + cudaCheckError(cudaMallocManaged(B_num_cols_, sizeof(int))); + cudaCheckError(cudaMallocManaged(B_nnz_, sizeof(int))); + cudaCheckError(cudaMallocManaged(&B_val_, sizeof(T) * edges)); + cudaCheckError(cudaMallocManaged(&B_col_, sizeof(int) * edges)); + cudaCheckError(cudaMallocManaged(&B_row_, sizeof(int) * (n_ + 1))); + + C_val_ = NULL; + C_col_ = NULL; + C_row_ = NULL; - if (offload_ == gpuOffloadType::unified) { - cudaCheckError(cudaMallocManaged(&A_, sizeof(T) * n_ * n_)); - cudaCheckError(cudaMallocManaged(&B_, sizeof(T) * n_ * n_)); - cudaCheckError(cudaMallocManaged(&C_, sizeof(T) * n_ * n_)); - - cudaCheckError(cudaMallocManaged(&A_val_, sizeof(T) * edges)); - cudaCheckError(cudaMallocManaged(&A_col_, sizeof(int) * edges)); - cudaCheckError(cudaMallocManaged(&A_row_, sizeof(int) * edges)); - - cudaCheckError(cudaMallocManaged(&B_val_, sizeof(T) * edges)); - cudaCheckError(cudaMallocManaged(&B_col_, sizeof(int) * edges)); - cudaCheckError(cudaMallocManaged(&B_row_, sizeof(int) * edges)); - - cudaCheckError(cudaMallocManaged(&C_val_, sizeof(T) * edges)); - cudaCheckError(cudaMallocManaged(&C_col_, sizeof(int) * edges)); - cudaCheckError(cudaMallocManaged(&C_row_, sizeof(int) * edges)); -// cudaCheckError(cudaMallocManaged(&DANnzPerRow, sizeof(int) * n_)); - } else { - // Allocate matrices on host - A_ = (T*)malloc(sizeof(T) * n_ * n_); - B_ = (T*)malloc(sizeof(T) * n_ * n_); - C_ = (T*)malloc(sizeof(T) * n_ * n_); - - // Allocate matrices on device - cudaCheckError(cudaMalloc((void**)&A_device_, sizeof(T) * n_ * n_)); - cudaCheckError(cudaMalloc((void**)&B_device_, sizeof(T) * n_ * n_)); - cudaCheckError(cudaMalloc((void**)&C_device_, sizeof(T) * n_ * n_)); - // Alloce non-zero vector for A -// cudaCheckError(cudaMalloc((void**)&dANnzPerRow, sizeof(int) * n_)); - } // Initialise the host matricies // cusparseSpGEMM() works on CSR format only. This helpfully makes our @@ -113,109 +109,160 @@ class sp_gemm_gpu : public gemm { &gen, dist, false)) {} } -// for (int i = 0; i < (n_ * n_); i++) { -// C_[i] = 0.0; -// } + toCSR(A_, n, n, edges, A_val_, A_col_, A_row_); + toCSR(B_, n, n, edges, B_val_, B_col_, B_row_); + } + + private: /** Perform any required steps before calling the GEMM kernel that should * be timed. */ void preLoopRequirements() override { - switch (offload_) { - case gpuOffloadType::always: { - // Offload data each iteration - no requirements - break; - } - case gpuOffloadType::once: { - // Offload data from host to the device. 
- cudaCheckError(cudaMemcpyAsync(A_device_, A_, sizeof(T) * m_ * k_, - cudaMemcpyHostToDevice, s1_)); - cudaCheckError(cudaMemcpyAsync(B_device_, B_, sizeof(T) * k_ * n_, - cudaMemcpyHostToDevice, s2_)); - cudaCheckError(cudaMemcpyAsync(C_device_, C_, sizeof(T) * m_ * n_, - cudaMemcpyHostToDevice, s3_)); - break; - } - case gpuOffloadType::unified: { - // Prefetch memory to device - cudaCheckError( - cudaMemPrefetchAsync(A_, sizeof(T) * m_ * k_, gpuDevice_, s1_)); - cudaCheckError( - cudaMemPrefetchAsync(B_, sizeof(T) * k_ * n_, gpuDevice_, s2_)); - cudaCheckError( - cudaMemPrefetchAsync(C_, sizeof(T) * m_ * n_, gpuDevice_, s3_)); - break; - } - } + // Prefetch memory to device + cudaCheckError(cudaMemPrefetchAsync(A_num_rows_, sizeof(int), gpuDevice_, + s1_)); + cudaCheckError(cudaMemPrefetchAsync(A_num_cols_, sizeof(int), gpuDevice_, + s1_)); + cudaCheckError(cudaMemPrefetchAsync(A_nnz_, sizeof(int), gpuDevice_, + s1_)); + cudaCheckError(cudaMemPrefetchAsync(&A_val_, sizeof(T) * edges, gpuDevice_, + s1_)); + cudaCheckError(cudaMemPrefetchAsync(&A_col_, sizeof(int) * edges, + gpuDevice_, s1_)); + cudaCheckError(cudaMemPrefetchAsync(&A_row_, sizeof(int) * (n_ + 1), + gpuDevice_, s1_)); + + cudaCheckError(cudaMemPrefetchAsync(B_num_rows_, sizeof(int), gpuDevice_, + s2_)); + cudaCheckError(cudaMemPrefetchAsync(B_num_cols_, sizeof(int), gpuDevice_, + s2_)); + cudaCheckError(cudaMemPrefetchAsync(B_nnz_, sizeof(int), gpuDevice_, + s2_)); + cudaCheckError(cudaMemPrefetchAsync(&B_val_, sizeof(T) * edges, gpuDevice_, + s2_)); + cudaCheckError(cudaMemPrefetchAsync(&B_col_, sizeof(int) * edges, + gpuDevice_, s2_)); + cudaCheckError(cudaMemPrefetchAsync(&B_row_, sizeof(int) * (n_ + 1), + gpuDevice_, s2_)); +// +// cudaCheckError(cudaMemPrefetchAsync(C_num_rows_, sizeof(int), gpuDevice_, +// s3_)); +// cudaCheckError(cudaMemPrefetchAsync(C_num_cols_, sizeof(int), gpuDevice_, +// s3_)); +// cudaCheckError(cudaMemPrefetchAsync(C_nnz_, sizeof(int), gpuDevice_, +// s3_)); +// cudaCheckError(cudaMemPrefetchAsync(&C_val_, sizeof(T) * edges, gpuDevice_, +// s3_)); +// cudaCheckError(cudaMemPrefetchAsync(&C_col_, sizeof(int) * edges, +// gpuDevice_, s3_)); +// cudaCheckError(cudaMemPrefetchAsync(&C_row_, sizeof(int) * edges, +// gpuDevice_, s3_)); + + // Create the CSR matrices on the device + cusparseCreateCsr(descrA_, n_, n_, A_nnz_, A_row_, A_col_, A_val_, + CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, + CUSPARSE_INDEX_BASE_ZERO, cudaDateType_); + cusparseCreateCsr(descrB_, n_, n_, B_nnz_, B_row_, B_col_, B_val_, + CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, + CUSPARSE_INDEX_BASE_ZERO, cudaDateType_); + cusparseCreateCsr(descrC_, n_, n_, 0, NULL, NULL, NULL, + CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, + CUSPARSE_INDEX_BASE_ZERO, cudaDataType_); + + cusparseSpGEMM_createDescr(&spgemmDesc_); } /** Make a call to the BLAS Library Kernel. */ void callGemm() override { - switch (offload_) { - case gpuOffloadType::always: { - // Offload data from host to the device. 
- cudaCheckError(cudaMemcpyAsync(A_device_, A_, sizeof(T) * m_ * k_, - cudaMemcpyHostToDevice, s1_)); - cudaCheckError(cudaMemcpyAsync(B_device_, B_, sizeof(T) * k_ * n_, - cudaMemcpyHostToDevice, s2_)); - cudaCheckError(cudaMemcpyAsync(C_device_, C_, sizeof(T) * m_ * n_, - cudaMemcpyHostToDevice, s3_)); - - - break; - } - case gpuOffloadType::once: { - // Call cuSPRASE SpGEMM kernel - // ToDo -- implement - - break; - } - case gpuOffloadType::unified: { - // Call cuSPARSE SpGEMM kernel - // ToDo -- implement - - break; - } - } - } + cusparseSpGEMM_workEstimation(handle_, CUSPARSE_OPERATION_NON_TRANSPOSE, + CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, + descrA_, descrB_, &beta, descrC_, + CUSPARSE_SPGEMM_DEFAULT, cudaDataType_, + spgemmDesc_, buffer_size1_, NULL); + cudaCheckError(cudaMallocManaged(&buffer1_, buffer_size1_)); + cusparseSpGEMM_workEstimation(handle_, CUSPARSE_OPERATION_NON_TRANSPOSE, + CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, + descrA_, descrB_, &beta, descrC_, + CUSPARSE_SPGEMM_DEFAULT, cudaDataType_, + spgemmDesc_, buffer_size1_, buffer1_); + cusparseSpGEMM_cmopute(handle_, CUSPARSE_OPERATION_NON_TRANSPOSE, + CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, descrA_, + descrB_, &beta, descrC_, CUSPARSE_SPGEMM_DEFAULT, + cudaDataType_, spgemmDesc_, buffer_size2_, NULL); + cudaCheckError(cudaMallocManaged(&buffer2_, buffer_size2)); + + if (cusparseSpGEMM_cmopute(handle_, CUSPARSE_OPERATION_NON_TRANSPOSE, + CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, descrA_, + descrB_, &beta, descrC_, CUSPARSE_SPGEMM_DEFAULT, + cudaDataType_, spgemmDesc_, buffer_size2_, buffer2_) + == CUSPARSE_SATUS_INSUFFICIENT_RESOURCES) { + std::cout << "Insufficient resources" << std::endl; + exit(1); + } + + int rows, cols, nnz; + + cusparseSpMatGetSize(descrC_, &rows, &cols, &nnz_); + C_nnz_ = nnz; + cudaCheckError(cudaMallocManaged(C_val_), sizeof(T) * nnz); + cudaCheckError(cudaMallocManaged(C_col_), sizeof(int) * nnz); + cudaCheckError(cudaMallocManaged(C_row_), sizeof(int) * (n_ + 1)); + + cusparseCstSetPointers(descrC_, *C_row, *C_colind, *C_val); + cusparseSpGEMM_copy(handle_, CUSPARSE_OPERATION_NON_TRANSPOSE, + CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, descrA_, + descrB_, &beta, descrC_, CUDA_R_32F, + CUSPARSE_SPGEMM_DEFAULT, spgemmDesc_); + } /** Perform any required steps after calling the GEMM kernel that should * be timed. */ - // ToDo -- check that this all still works void postLoopRequirements() override { - switch (offload_) { - case gpuOffloadType::always: { - // Offload data each iteration - no requirements - break; - } - case gpuOffloadType::once: { - // Offload data from device to host - cudaCheckError(cudaMemcpyAsync(A_, A_device_, sizeof(T) * m_ * k_, - cudaMemcpyDeviceToHost, s1_)); - cudaCheckError(cudaMemcpyAsync(B_, B_device_, sizeof(T) * k_ * n_, - cudaMemcpyDeviceToHost, s2_)); - cudaCheckError(cudaMemcpyAsync(C_, C_device_, sizeof(T) * m_ * n_, - cudaMemcpyDeviceToHost, s3_)); - // Ensure device has finished all work. - cudaCheckError(cudaDeviceSynchronize()); - break; - } - case gpuOffloadType::unified: { - // Ensure all data resides on host once work has completed - cudaCheckError(cudaMemPrefetchAsync(A_, sizeof(T) * m_ * k_, - cudaCpuDeviceId, s1_)); - cudaCheckError(cudaMemPrefetchAsync(B_, sizeof(T) * k_ * n_, - cudaCpuDeviceId, s2_)); - cudaCheckError(cudaMemPrefetchAsync(C_, sizeof(T) * m_ * n_, - cudaCpuDeviceId, s3_)); - // Ensure device has finished all work. 
- cudaCheckError(cudaDeviceSynchronize()); - break; - } - } + // Ensure all data resides on host once work has completed + cudaCheckError(cudaMemPrefetchAsync(A_num_rows_, sizeof(int), + cudaCpuDeviceId_, s1_)); + cudaCheckError(cudaMemPrefetchAsync(A_num_cols_, sizeof(int), + cudaCpuDeviceId_, s1_)); + cudaCheckError(cudaMemPrefetchAsync(A_nnz_, sizeof(int), + cudaCpuDeviceId_, s1_)); + cudaCheckError(cudaMemPrefetchAsync(&A_val_, sizeof(T) * edges, + cudaCpuDeviceId_, s1_)); + cudaCheckError(cudaMemPrefetchAsync(&A_col_, sizeof(int) * edges, + cudaCpuDeviceId_, s1_)); + cudaCheckError(cudaMemPrefetchAsync(&A_row_, sizeof(int) * (n_ + 1), + cudaCpuDeviceId_, s1_)); + + cudaCheckError(cudaMemPrefetchAsync(B_num_rows_, sizeof(int), + cudaCpuDeviceId_, s2_)); + cudaCheckError(cudaMemPrefetchAsync(B_num_cols_, sizeof(int), + cudaCpuDeviceId_, s2_)); + cudaCheckError(cudaMemPrefetchAsync(B_nnz_, sizeof(int), + cudaCpuDeviceId_, s2_)); + cudaCheckError(cudaMemPrefetchAsync(&B_val_, sizeof(T) * edges, + cudaCpuDeviceId_, s2_)); + cudaCheckError(cudaMemPrefetchAsync(&B_col_, sizeof(int) * edges, + cudaCpuDeviceId_, s2_)); + cudaCheckError(cudaMemPrefetchAsync(&B_row_, sizeof(int) * (n_ + 1), + cudaCpuDeviceId_, s2_)); + + cudaCheckError(cudaMemPrefetchAsync(C_num_rows_, sizeof(int), + cudaCpuDeviceId_, s3_)); + cudaCheckError(cudaMemPrefetchAsync(C_num_cols_, sizeof(int), + cudaCpuDeviceId_, s3_)); + cudaCheckError(cudaMemPrefetchAsync(C_nnz_, sizeof(int), + cudaCpuDeviceId_, s3_)); + cudaCheckError(cudaMemPrefetchAsync(&C_val_, sizeof(T) * C_nnz_, + cudaCpuDeviceId_, s3_)); + cudaCheckError(cudaMemPrefetchAsync(&C_col_, sizeof(int) * C_nnz_, + cudaCpuDeviceId_, s3_)); + cudaCheckError(cudaMemPrefetchAsync(&C_row_, sizeof(int) * (n_ + 1), + cudaCpuDeviceId_, s3_)); + // Ensure device has finished all work. + cudaCheckError(cudaDeviceSynchronize()); } /** Do any necessary cleanup (free pointers, close library handles, etc.) @@ -229,65 +276,76 @@ class sp_gemm_gpu : public gemm { cudaCheckError(cudaStreamDestroy(s2_)); cudaCheckError(cudaStreamDestroy(s3_)); - if (offload_ == gpuOffloadType::unified) { - cudaFree(A_); - cudaFree(B_); - cudaFree(C_); - } else { - // Free the memory held on host and device - free(A_); - free(B_); - free(C_); - cudaFree(A_device_); - cudaFree(B_device_); - cudaFree(C_device_); - } + cudaFree(A_); + cudaFree(B_); + cudaFree(C_); } bool rMat(T* M, int n, int x1, int x2, int y1, int y2, float a, float b, float c, std::default_random_engine* gen, std::uniform_real_distribution dist, bool bin) { - // If a 1x1 submatrix, then add an edge and return out - if (x1 >= x2 && y1 >= y2) { - if (abs(M[(y1 * n) + x1]) > 0.1) { - return false; - } else { - // Add 1.0 if this is a binary graph, and a random real number otherwise - M[(int) (y1 * n) + x1] = (bin) ? 1.0 : (((rand() % 10000) / - 100.0) - 50.0); - return true; - } + // If a 1x1 submatrix, then add an edge and return out + if (x1 >= x2 && y1 >= y2) { + if (abs(M[(y1 * n) + x1]) > 0.1) { + return false; + } else { + // Add 1.0 if this is a binary graph, and a random real number otherwise + M[(int) (y1 * n) + x1] = (bin) ? 
1.0 : (((rand() % 10000) / + 100.0) - 50.0); + return true; + } + } else { + // Divide up the matrix + int xMidPoint = x1 + floor((x2 - x1) / 2); + int yMidPoint = y1 + floor((y2 - y1) / 2); + + // ToDo -- add some noise to these values between iterations + float newA = a; + float newB = b; + float newC = c; + + // Work out which quarter to recurse into + // There are some ugly ternary operators here to avoid going out of bounds in the edge case + // that we are already at 1 width or 1 height + float randomNum = dist(*gen); + if (randomNum < a) { + return rMat(M, n, x1, xMidPoint, y1, yMidPoint, + newA, newB, newC, gen, dist, bin); + } else if (randomNum < (a + b)) { + return rMat(M, n, ((xMidPoint < x2) ? xMidPoint + 1 : xMidPoint), x2, y1, yMidPoint, + newA, newB, newC, gen, dist, bin); + } else if (randomNum < (a + b + c)) { + return rMat(M, n, x1, xMidPoint, ((yMidPoint < y2) ? yMidPoint + 1 : yMidPoint), y2, + newA, newB, newC, gen, dist, bin); } else { - // Divide up the matrix - int xMidPoint = x1 + floor((x2 - x1) / 2); - int yMidPoint = y1 + floor((y2 - y1) / 2); - - // ToDo -- add some noise to these values between iterations - float newA = a; - float newB = b; - float newC = c; - - // Work out which quarter to recurse into - // There are some ugly ternary operators here to avoid going out of bounds in the edge case - // that we are already at 1 width or 1 height - float randomNum = dist(*gen); - if (randomNum < a) { - return rMat(M, n, x1, xMidPoint, y1, yMidPoint, - newA, newB, newC, gen, dist, bin); - } else if (randomNum < (a + b)) { - return rMat(M, n, ((xMidPoint < x2) ? xMidPoint + 1 : xMidPoint), x2, y1, yMidPoint, - newA, newB, newC, gen, dist, bin); - } else if (randomNum < (a + b + c)) { - return rMat(M, n, x1, xMidPoint, ((yMidPoint < y2) ? yMidPoint + 1 : yMidPoint), y2, - newA, newB, newC, gen, dist, bin); - } else { - return rMat(M, n, ((xMidPoint < x2) ? xMidPoint + 1 : xMidPoint), x2, - ((yMidPoint < y2) ? yMidPoint + 1 : yMidPoint), y2, newA, newB, newC, - gen, dist, bin); + return rMat(M, n, ((xMidPoint < x2) ? xMidPoint + 1 : xMidPoint), x2, + ((yMidPoint < y2) ? yMidPoint + 1 : yMidPoint), y2, newA, newB, newC, + gen, dist, bin); + } + } + return true; + } + + void toCSR(T* dense, int n_col, int n_row, int nnz, T* vals, int* col_index, + int* row_ptr) { + int nnz_encountered = 0; + int prev_row_ptr = 0; + for (int row = 0; row < n_row; row++) { + if (nnz_encountered >= nnz) break; + row_ptr[row] = prev_row_ptr; + int nnz_row = 0; + for (int col = 0; col < n_col; col++) { + if (nnz_encountered >= nnz) break; + if (dense[(row * n_col) + col] != 0.0) { + nnz_row++; + col_index[nnz_encountered] = col; + vals[nnz_encountered] = dense[(row * n_col) + col]; + nnz_encountered++; } } - return true; + prev_row_ptr += nnz_row; } + } /** Handle used when calling cuBLAS. */ cublasHandle_t handle_; @@ -307,27 +365,36 @@ class sp_gemm_gpu : public gemm { /** The ID of the target GPU Device. */ int gpuDevice_; - /** Input matrix A, held on the device. */ - T* A_device_; - - /** Input matrix B, held on the device. */ - T* B_device_; - - /** Input matrix C, held on the device. */ - T* C_device_; - - /** Vector for number non-zeros, held on the device */ -// int* dANnzPerRow; - - /** CSR format vectors for matrices A, B and C on the device */ + /** CSR format vectors for matrices A, B and C on the host */ + int A_nnz_, B_nnz_, C_nnz_; T* A_val_, B_val_, C_val_; int* A_col_, A_row_, B_col_, B_row_, C_col_, C_row_; + /** CSR format vectors for matrices A, B and C on the device. 
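The rMat generator above follows the usual R-MAT recursion: each call descends through the adjacency matrix, choosing the top-left, top-right, bottom-left or bottom-right quadrant with probabilities a, b, c and 1 - a - b - c (0.45, 0.22, 0.22 and 0.11 with the defaults used by this benchmark) until a single cell remains; if that cell is already occupied the call returns false and the caller retries, so each successful call adds exactly one nonzero. A compact iterative sketch of the same descent, with hypothetical names:

    #include <algorithm>
    #include <random>
    #include <utility>

    // One R-MAT descent over an n x n grid; a, b and c pick the top-left,
    // top-right and bottom-left quadrants, the remainder picks bottom-right.
    std::pair<int, int> rmatCell(int n, float a, float b, float c,
                                 std::default_random_engine& gen) {
      std::uniform_real_distribution<float> dist(0.0f, 1.0f);
      int x1 = 0, x2 = n - 1, y1 = 0, y2 = n - 1;
      while (x1 < x2 || y1 < y2) {
        int xMid = x1 + (x2 - x1) / 2;
        int yMid = y1 + (y2 - y1) / 2;
        float r = dist(gen);
        if (r < a) {                              // top-left
          x2 = xMid;  y2 = yMid;
        } else if (r < a + b) {                   // top-right
          x1 = std::min(xMid + 1, x2);  y2 = yMid;
        } else if (r < a + b + c) {               // bottom-left
          x2 = xMid;  y1 = std::min(yMid + 1, y2);
        } else {                                  // bottom-right
          x1 = std::min(xMid + 1, x2);  y1 = std::min(yMid + 1, y2);
        }
      }
      return {x1, y1};  // (column, row) of the surviving cell
    }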
*/ + int A_num_rows_dev_, A_num_cols_dev_, A_nnz_dev_, B_num_rows_dev_, + B_num_cols_dev_, B_nnz_dev_, C_num_rows_dev_, C_num_cols_dev_, C_nnz_dev_; + T* A_val_dev_, B_val_dev_, C_val_dev_; + int* A_col_dev_, A_row_dev_, B_col_dev_, B_row_dev_, C_col_dev_, C_row_dev_; + /** The constant value Alpha. */ const T alpha = ALPHA; /** The constant value Beta. */ const T beta = BETA; + + + // Create descriptors for matrices A->C + cusparseMatDescr_t descrA_, descrB_, descrC_; + + // index type depends on kernel being run + cusparseIndexType_t cudaDataType_; + + cusparceSpGEMMDescr_t spgemmDesc_; + + size_t buffer_size1_ = 0; + size_t buffer_size2_ = 0; + void* buffer1_ = NULL; + void* buffer2_ = NULL; }; } // namespace gpu #endif \ No newline at end of file From 37ce8b4c32b7b04caae5a4dbc697b21086447c9f Mon Sep 17 00:00:00 2001 From: Alex Cockrean <84676155+ABenC377@users.noreply.github.com> Date: Thu, 21 Mar 2024 13:08:49 +0000 Subject: [PATCH 07/32] Now compiles --- DefaultGPU/sp_gemm.hh | 2 +- Makefile | 2 +- cuBLAS/sp_gemm.hh | 228 +++++++++++++++------------------ include/doGemm.hh | 7 +- include/kernels/GPU/sp_gemm.hh | 2 +- 5 files changed, 112 insertions(+), 129 deletions(-) diff --git a/DefaultGPU/sp_gemm.hh b/DefaultGPU/sp_gemm.hh index 92d157c..2a9f478 100644 --- a/DefaultGPU/sp_gemm.hh +++ b/DefaultGPU/sp_gemm.hh @@ -22,7 +22,7 @@ class sp_gemm_gpu : public sp_gemm { } /** Initialise the required data structures. */ - void initialise(gpuOffloadType offload, int m, int n, int k) override { + void initialise(gpuOffloadType offload, int n, float sparsity) override { // Default GPU implementation - do nothing. } diff --git a/Makefile b/Makefile index 5dd2fc5..bff0add 100644 --- a/Makefile +++ b/Makefile @@ -177,7 +177,7 @@ $(info $(TAB)$(TAB)Add `CXXFLAGS=-L/.../math_libs/lib64 -L $(info $(TAB)$(TAB)Add `CXXFLAGS=-I/.../math_libs/include -I/.../cuda/include` to make command) $(info $(TAB)$(TAB)Add `CXXFLAGS=-Wl,-rpath,/.../math_libs/lib64 -Wl,-rpath,/.../cuda/lib64` to make command) $(info ) -override CXXFLAGS += -lcublas -lcudart +override CXXFLAGS += -lcublas -lcudart -lcusparse endif HEADER_FILES += $(wildcard cuBLAS/*.hh) diff --git a/cuBLAS/sp_gemm.hh b/cuBLAS/sp_gemm.hh index 68e3b84..c0bfb8e 100644 --- a/cuBLAS/sp_gemm.hh +++ b/cuBLAS/sp_gemm.hh @@ -2,24 +2,27 @@ #ifdef GPU_CUBLAS #include "cusparse.h" +#include #include #include +#include +#include -#include "../include/kernels/GPU/gemm.hh" +#include "../include/kernels/GPU/sp_gemm.hh" #include "../include/utilities.hh" #include "common.hh" namespace gpu { /** A class for GEMM GPU BLAS kernels. */ template -class sp_gemm_gpu : public gemm { +class sp_gemm_gpu : public sp_gemm { public: - using gemm::gemm; - using gemm::n_; - using gemm::A_; - using gemm::B_; - using gemm::C_; - using gemm::offload_; + using sp_gemm::sp_gemm; + using sp_gemm::n_; + using sp_gemm::A_; + using sp_gemm::B_; + using sp_gemm::C_; + using sp_gemm::offload_; // ToDo -- just unified implemented so far. Fill in Always and Once later @@ -31,63 +34,50 @@ class sp_gemm_gpu : public gemm { * - Unified: Initialise data as unified memory; no data movement semantics * required */ void initialise(gpuOffloadType offload, int n, float sparsity) override { + std::cout << "Initialising" << std::endl; offload_ = offload; // Create a handle for cuSPARSE cusparseCreate(&handle_); + std::cout << "Handle created" << std::endl; - cudaDataType_ = (std::is_same_v) ? 
CUDA_R_32F : - CUDA_R_64F; + if (std::is_same_v) cudaDataType_ = CUDA_R_32F; + else if (std::is_same_v) cudaDataType_ = CUDA_R_64F; + else { + std::cout << "INVALID DATA TYPE PASSED TO cuSPARSE" << std::endl; + exit(1); + } n_ = n; - cusparseCreateMatDescr(&descrA); - cusparseCreateMatDescr(&descrB); - cusparseCreateMatDescr(&descrC); - - cusparseSetMatType(descrA, CUSPARSE_MATRIX_TYPE_GENERAL); - cusparseSetMatType(descrB, CUSPARSE_MATRIX_TYPE_GENERAL); - cusparseSetMatType(descrC, CUSPARSE_MATRIX_TYPE_GENERAL); - - cusparseSetMatIndexBase(descrA, CUSPARSE_INDEX_BASE_ZERO); - cusparseSetMatIndexBase(descrB, CUSPARSE_INDEX_BASE_ZERO); - cusparseSetMatIndexBase(descrC, CUSPARSE_INDEX_BASE_ZERO); - // Get device identifier cudaCheckError(cudaGetDevice(&gpuDevice_)); + std::cout << "GPU device got" << std::endl; // Initialise 3 streams to asynchronously move data between host and device cudaCheckError(cudaStreamCreate(&s1_)); cudaCheckError(cudaStreamCreate(&s2_)); cudaCheckError(cudaStreamCreate(&s3_)); + std::cout << "Streams created" << std::endl; // Work out number of edges needed to achieve target sparsity int edges = 1 + (int) (n_ * n_ * (1 - sparsity)); - A_nnz_ = B_nnz_ = edges + (*A_nnz_) = (*B_nnz_) = edges; // ToDo -- for all of this mallocing, bear in mind that row will probably // have fewer than 'edges' values (thats the whole point). May need to // reorganise - cudaCheckError(cudaMallocManaged(A_num_rows_, sizeof(int))); - cudaCheckError(cudaMallocManaged(A_num_cols_, sizeof(int))); - cudaCheckError(cudaMallocManaged(A_nnz_, sizeof(int))); cudaCheckError(cudaMallocManaged(&A_val_, sizeof(T) * edges)); cudaCheckError(cudaMallocManaged(&A_col_, sizeof(int) * edges)); cudaCheckError(cudaMallocManaged(&A_row_, sizeof(int) * (n_ + 1))); + std::cout << "A CSR vectors malloced" << std::endl; - cudaCheckError(cudaMallocManaged(B_num_rows_, sizeof(int))); - cudaCheckError(cudaMallocManaged(B_num_cols_, sizeof(int))); - cudaCheckError(cudaMallocManaged(B_nnz_, sizeof(int))); cudaCheckError(cudaMallocManaged(&B_val_, sizeof(T) * edges)); cudaCheckError(cudaMallocManaged(&B_col_, sizeof(int) * edges)); cudaCheckError(cudaMallocManaged(&B_row_, sizeof(int) * (n_ + 1))); - - C_val_ = NULL; - C_col_ = NULL; - C_row_ = NULL; - + std::cout << "B CSR vectors malloced" << std::endl; // Initialise the host matricies // cusparseSpGEMM() works on CSR format only. This helpfully makes our @@ -99,6 +89,13 @@ class sp_gemm_gpu : public gemm { A_[i] = 0.0; B_[i] = 0.0; } + + // Random number generator objects for use in descent + std::default_random_engine gen; + gen.seed(std::chrono::system_clock::now() + .time_since_epoch().count()); + std::uniform_real_distribution dist(0.0, 1.0); + // Using a=0.45 and b=c=0.22 as default probabilities for (int i = 0; i < edges; i++) { while (!rMat(A_, n, 0, n - 1, 0, n - 1, @@ -117,34 +114,20 @@ class sp_gemm_gpu : public gemm { private: - - /** Perform any required steps before calling the GEMM kernel that should * be timed. 
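Two small points about the initialisation above. First, the nonzero budget follows directly from the target sparsity: for n = 128 and sparsity = 0.99, edges = 1 + (int)(128 * 128 * (1 - 0.99)) = 1 + 163 = 164, and both A and B are given that many nonzeros, so their CSR value and column-index arrays hold 164 entries each while the row-pointer arrays always hold n + 1 entries. Second, the element-type dispatch appears here without its template arguments; the presumed form is the standard std::is_same_v check mapping the kernel's element type T to the matching cuSPARSE value type:

    // Presumed template arguments for the dispatch above (sketch only).
    if (std::is_same_v<T, float>)       cudaDataType_ = CUDA_R_32F;
    else if (std::is_same_v<T, double>) cudaDataType_ = CUDA_R_64F;
    else {
      std::cout << "INVALID DATA TYPE PASSED TO cuSPARSE" << std::endl;
      exit(1);
    }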
*/ void preLoopRequirements() override { // Prefetch memory to device - cudaCheckError(cudaMemPrefetchAsync(A_num_rows_, sizeof(int), gpuDevice_, - s1_)); - cudaCheckError(cudaMemPrefetchAsync(A_num_cols_, sizeof(int), gpuDevice_, - s1_)); - cudaCheckError(cudaMemPrefetchAsync(A_nnz_, sizeof(int), gpuDevice_, - s1_)); - cudaCheckError(cudaMemPrefetchAsync(&A_val_, sizeof(T) * edges, gpuDevice_, - s1_)); - cudaCheckError(cudaMemPrefetchAsync(&A_col_, sizeof(int) * edges, + cudaCheckError(cudaMemPrefetchAsync(&A_val_, sizeof(T) * (*A_nnz_), + gpuDevice_, s1_)); + cudaCheckError(cudaMemPrefetchAsync(&A_col_, sizeof(int) * (*A_nnz_), gpuDevice_, s1_)); cudaCheckError(cudaMemPrefetchAsync(&A_row_, sizeof(int) * (n_ + 1), gpuDevice_, s1_)); - cudaCheckError(cudaMemPrefetchAsync(B_num_rows_, sizeof(int), gpuDevice_, - s2_)); - cudaCheckError(cudaMemPrefetchAsync(B_num_cols_, sizeof(int), gpuDevice_, - s2_)); - cudaCheckError(cudaMemPrefetchAsync(B_nnz_, sizeof(int), gpuDevice_, - s2_)); - cudaCheckError(cudaMemPrefetchAsync(&B_val_, sizeof(T) * edges, gpuDevice_, - s2_)); - cudaCheckError(cudaMemPrefetchAsync(&B_col_, sizeof(int) * edges, + cudaCheckError(cudaMemPrefetchAsync(&B_val_, sizeof(T) * (*B_nnz_), + gpuDevice_, s2_)); + cudaCheckError(cudaMemPrefetchAsync(&B_col_, sizeof(int) * (*B_nnz_), gpuDevice_, s2_)); cudaCheckError(cudaMemPrefetchAsync(&B_row_, sizeof(int) * (n_ + 1), gpuDevice_, s2_)); @@ -163,13 +146,13 @@ class sp_gemm_gpu : public gemm { // gpuDevice_, s3_)); // Create the CSR matrices on the device - cusparseCreateCsr(descrA_, n_, n_, A_nnz_, A_row_, A_col_, A_val_, + cusparseCreateCsr(&descrA_, n_, n_, (*A_nnz_), A_row_, A_col_, A_val_, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, - CUSPARSE_INDEX_BASE_ZERO, cudaDateType_); - cusparseCreateCsr(descrB_, n_, n_, B_nnz_, B_row_, B_col_, B_val_, + CUSPARSE_INDEX_BASE_ZERO, cudaDataType_); + cusparseCreateCsr(&descrB_, n_, n_, (*B_nnz_), B_row_, B_col_, B_val_, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, - CUSPARSE_INDEX_BASE_ZERO, cudaDateType_); - cusparseCreateCsr(descrC_, n_, n_, 0, NULL, NULL, NULL, + CUSPARSE_INDEX_BASE_ZERO, cudaDataType_); + cusparseCreateCsr(&descrC_, n_, n_, 0, NULL, NULL, NULL, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cudaDataType_); @@ -181,38 +164,40 @@ class sp_gemm_gpu : public gemm { cusparseSpGEMM_workEstimation(handle_, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, descrA_, descrB_, &beta, descrC_, - CUSPARSE_SPGEMM_DEFAULT, cudaDataType_, - spgemmDesc_, buffer_size1_, NULL); + cudaDataType_, CUSPARSE_SPGEMM_DEFAULT, + spgemmDesc_, &buffer_size1_, NULL); cudaCheckError(cudaMallocManaged(&buffer1_, buffer_size1_)); cusparseSpGEMM_workEstimation(handle_, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, descrA_, descrB_, &beta, descrC_, - CUSPARSE_SPGEMM_DEFAULT, cudaDataType_, - spgemmDesc_, buffer_size1_, buffer1_); - cusparseSpGEMM_cmopute(handle_, CUSPARSE_OPERATION_NON_TRANSPOSE, + cudaDataType_, CUSPARSE_SPGEMM_DEFAULT, + spgemmDesc_, &buffer_size1_, buffer1_); + cusparseSpGEMM_compute(handle_, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, descrA_, - descrB_, &beta, descrC_, CUSPARSE_SPGEMM_DEFAULT, - cudaDataType_, spgemmDesc_, buffer_size2_, NULL); - cudaCheckError(cudaMallocManaged(&buffer2_, buffer_size2)); + descrB_, &beta, descrC_, cudaDataType_, + CUSPARSE_SPGEMM_DEFAULT, spgemmDesc_, + &buffer_size2_, NULL); + cudaCheckError(cudaMallocManaged(&buffer2_, buffer_size2_)); - if 
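For reference, the corrected cusparseCreateCsr calls above take the address of the descriptor, the matrix shape (rows, columns, nnz), the three CSR arrays, the index type of the row offsets, the index type of the column indices, the index base, and the value type. A hedged single-precision helper with hypothetical names:

    #include <cusparse.h>

    // Wrap an existing device-resident CSR matrix (32-bit, zero-based indices)
    // in a cuSPARSE generic-API descriptor.
    cusparseSpMatDescr_t makeCsrDescr(int64_t n, int64_t nnz,
                                      int* dRowOffsets, int* dColInd, float* dVals) {
      cusparseSpMatDescr_t descr;
      cusparseCreateCsr(&descr, n, n, nnz,
                        dRowOffsets,               // length n + 1
                        dColInd,                   // length nnz
                        dVals,                     // length nnz
                        CUSPARSE_INDEX_32I,        // row-offset index type
                        CUSPARSE_INDEX_32I,        // column-index type
                        CUSPARSE_INDEX_BASE_ZERO,
                        CUDA_R_32F);
      return descr;
    }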
(cusparseSpGEMM_cmopute(handle_, CUSPARSE_OPERATION_NON_TRANSPOSE, + if (cusparseSpGEMM_compute(handle_, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, descrA_, - descrB_, &beta, descrC_, CUSPARSE_SPGEMM_DEFAULT, - cudaDataType_, spgemmDesc_, buffer_size2_, buffer2_) - == CUSPARSE_SATUS_INSUFFICIENT_RESOURCES) { + descrB_, &beta, descrC_, cudaDataType_, + CUSPARSE_SPGEMM_DEFAULT, spgemmDesc_, + &buffer_size2_, buffer2_) + == CUSPARSE_STATUS_INSUFFICIENT_RESOURCES) { std::cout << "Insufficient resources" << std::endl; exit(1); } - int rows, cols, nnz; + int64_t rows, cols, nnz; - cusparseSpMatGetSize(descrC_, &rows, &cols, &nnz_); - C_nnz_ = nnz; - cudaCheckError(cudaMallocManaged(C_val_), sizeof(T) * nnz); - cudaCheckError(cudaMallocManaged(C_col_), sizeof(int) * nnz); - cudaCheckError(cudaMallocManaged(C_row_), sizeof(int) * (n_ + 1)); + cusparseSpMatGetSize(descrC_, &rows, &cols, &nnz); + (*C_nnz_) = nnz; + cudaCheckError(cudaMallocManaged(&C_val_, sizeof(T) * nnz)); + cudaCheckError(cudaMallocManaged(&C_col_, sizeof(int) * nnz)); + cudaCheckError(cudaMallocManaged(&C_row_, sizeof(int) * (n_ + 1))); - cusparseCstSetPointers(descrC_, *C_row, *C_colind, *C_val); + cusparseCsrSetPointers(descrC_, C_row_, C_col_, C_val_); cusparseSpGEMM_copy(handle_, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, descrA_, descrB_, &beta, descrC_, CUDA_R_32F, @@ -223,44 +208,26 @@ class sp_gemm_gpu : public gemm { * be timed. */ void postLoopRequirements() override { // Ensure all data resides on host once work has completed - cudaCheckError(cudaMemPrefetchAsync(A_num_rows_, sizeof(int), - cudaCpuDeviceId_, s1_)); - cudaCheckError(cudaMemPrefetchAsync(A_num_cols_, sizeof(int), - cudaCpuDeviceId_, s1_)); - cudaCheckError(cudaMemPrefetchAsync(A_nnz_, sizeof(int), - cudaCpuDeviceId_, s1_)); - cudaCheckError(cudaMemPrefetchAsync(&A_val_, sizeof(T) * edges, - cudaCpuDeviceId_, s1_)); - cudaCheckError(cudaMemPrefetchAsync(&A_col_, sizeof(int) * edges, - cudaCpuDeviceId_, s1_)); + cudaCheckError(cudaMemPrefetchAsync(&A_val_, sizeof(T) * (*A_nnz_), + cudaCpuDeviceId, s1_)); + cudaCheckError(cudaMemPrefetchAsync(&A_col_, sizeof(int) * (*A_nnz_), + cudaCpuDeviceId, s1_)); cudaCheckError(cudaMemPrefetchAsync(&A_row_, sizeof(int) * (n_ + 1), - cudaCpuDeviceId_, s1_)); - - cudaCheckError(cudaMemPrefetchAsync(B_num_rows_, sizeof(int), - cudaCpuDeviceId_, s2_)); - cudaCheckError(cudaMemPrefetchAsync(B_num_cols_, sizeof(int), - cudaCpuDeviceId_, s2_)); - cudaCheckError(cudaMemPrefetchAsync(B_nnz_, sizeof(int), - cudaCpuDeviceId_, s2_)); - cudaCheckError(cudaMemPrefetchAsync(&B_val_, sizeof(T) * edges, - cudaCpuDeviceId_, s2_)); - cudaCheckError(cudaMemPrefetchAsync(&B_col_, sizeof(int) * edges, - cudaCpuDeviceId_, s2_)); + cudaCpuDeviceId, s1_)); + + cudaCheckError(cudaMemPrefetchAsync(&B_val_, sizeof(T) * (*B_nnz_), + cudaCpuDeviceId, s2_)); + cudaCheckError(cudaMemPrefetchAsync(&B_col_, sizeof(int) * (*B_nnz_), + cudaCpuDeviceId, s2_)); cudaCheckError(cudaMemPrefetchAsync(&B_row_, sizeof(int) * (n_ + 1), - cudaCpuDeviceId_, s2_)); - - cudaCheckError(cudaMemPrefetchAsync(C_num_rows_, sizeof(int), - cudaCpuDeviceId_, s3_)); - cudaCheckError(cudaMemPrefetchAsync(C_num_cols_, sizeof(int), - cudaCpuDeviceId_, s3_)); - cudaCheckError(cudaMemPrefetchAsync(C_nnz_, sizeof(int), - cudaCpuDeviceId_, s3_)); - cudaCheckError(cudaMemPrefetchAsync(&C_val_, sizeof(T) * C_nnz_, - cudaCpuDeviceId_, s3_)); - cudaCheckError(cudaMemPrefetchAsync(&C_col_, sizeof(int) * C_nnz_, - 
cudaCpuDeviceId_, s3_)); + cudaCpuDeviceId, s2_)); + + cudaCheckError(cudaMemPrefetchAsync(&C_val_, sizeof(T) * (*C_nnz_), + cudaCpuDeviceId, s3_)); + cudaCheckError(cudaMemPrefetchAsync(&C_col_, sizeof(int) * (*C_nnz_), + cudaCpuDeviceId, s3_)); cudaCheckError(cudaMemPrefetchAsync(&C_row_, sizeof(int) * (n_ + 1), - cudaCpuDeviceId_, s3_)); + cudaCpuDeviceId, s3_)); // Ensure device has finished all work. cudaCheckError(cudaDeviceSynchronize()); } @@ -348,7 +315,7 @@ class sp_gemm_gpu : public gemm { } /** Handle used when calling cuBLAS. */ - cublasHandle_t handle_; + cusparseHandle_t handle_; /** CUDA Stream 1 - used to asynchronously move data between host and device. */ @@ -366,12 +333,29 @@ class sp_gemm_gpu : public gemm { int gpuDevice_; /** CSR format vectors for matrices A, B and C on the host */ - int A_nnz_, B_nnz_, C_nnz_; - T* A_val_, B_val_, C_val_; - int* A_col_, A_row_, B_col_, B_row_, C_col_, C_row_; + T* A_val_; + int* A_col_; + int* A_row_; + int* A_num_rows_; + int* A_num_cols_; + int* A_nnz_; + + T* B_val_; + int* B_col_; + int* B_row_; + int* B_num_rows_; + int* B_num_cols_; + int* B_nnz_; + + T* C_val_; + int* C_col_; + int* C_row_; + int* C_num_rows_; + int* C_num_cols_; + int*C_nnz_; /** CSR format vectors for matrices A, B and C on the device. */ - int A_num_rows_dev_, A_num_cols_dev_, A_nnz_dev_, B_num_rows_dev_, + int* A_num_rows_dev_, A_num_cols_dev_, A_nnz_dev_, B_num_rows_dev_, B_num_cols_dev_, B_nnz_dev_, C_num_rows_dev_, C_num_cols_dev_, C_nnz_dev_; T* A_val_dev_, B_val_dev_, C_val_dev_; int* A_col_dev_, A_row_dev_, B_col_dev_, B_row_dev_, C_col_dev_, C_row_dev_; @@ -384,12 +368,12 @@ class sp_gemm_gpu : public gemm { // Create descriptors for matrices A->C - cusparseMatDescr_t descrA_, descrB_, descrC_; + cusparseSpMatDescr_t descrA_, descrB_, descrC_; - // index type depends on kernel being run - cusparseIndexType_t cudaDataType_; + // Data type depends on kernel being run + cudaDataType_t cudaDataType_; - cusparceSpGEMMDescr_t spgemmDesc_; + cusparseSpGEMMDescr_t spgemmDesc_; size_t buffer_size1_ = 0; size_t buffer_size2_ = 0; diff --git a/include/doGemm.hh b/include/doGemm.hh index 4a7c564..5565fb2 100644 --- a/include/doGemm.hh +++ b/include/doGemm.hh @@ -519,20 +519,19 @@ class doGemm { // Perform the GPU kernels // - ONCE : Offload to/from GPU once before all iterations and once // after - spGemmGpu_.initialise(gpuOffloadType::once, N, N, N); + spGemmGpu_.initialise(gpuOffloadType::once, N, sparsity); time_checksum_gflop gpuResult_once = gemmGpu_.compute(); gpuResult_once.gflops = calcGflops(flops, iterations_, gpuResult_once.runtime); // - ALWAYS: Offload to/from GPU every iteration - spGemmGpu_.initialise(gpuOffloadType::always, N, N, N); + spGemmGpu_.initialise(gpuOffloadType::always, N, sparsity); time_checksum_gflop gpuResult_always = gemmGpu_.compute(); gpuResult_always.gflops = calcGflops(flops, iterations_, gpuResult_always.runtime); - // - UNIFIED : data passed from host to device (and device to host) as // needed - spGemmGpu_.initialise(gpuOffloadType::unified, N, N, N); + spGemmGpu_.initialise(gpuOffloadType::unified, N, sparsity); time_checksum_gflop gpuResult_unified = gemmGpu_.compute(); gpuResult_unified.gflops = calcGflops(flops, iterations_, gpuResult_unified.runtime); diff --git a/include/kernels/GPU/sp_gemm.hh b/include/kernels/GPU/sp_gemm.hh index 684c166..dbfba87 100644 --- a/include/kernels/GPU/sp_gemm.hh +++ b/include/kernels/GPU/sp_gemm.hh @@ -17,7 +17,7 @@ namespace gpu { * - Always: Move data from host to device and device 
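One C++ detail behind the reshuffled member declarations in this hunk: in a comma-separated declaration the * binds to each declarator rather than to the type, so splitting the host-side members onto one declaration per line is not just cosmetic, while the combined device-side lines that remain declare only their first name as a pointer. For example:

    int* a, b;     // a has type int*, b has type int
    int *c, *d;    // c and d are both int*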
to host each iteration * - Unified: Initialise data as unified memory; no data movement semantics * required */ - virtual void initialise(gpuOffloadType offload, int m, int n, int k) = 0; + virtual void initialise(gpuOffloadType offload, int n, float sparsity) = 0; protected: /** Whether data should be offloaded to/from the GPU each iteration, or just From 143c1c041d7da2afda07b27c5c3dbb8b273fab1c Mon Sep 17 00:00:00 2001 From: Alex Cockrean <84676155+ABenC377@users.noreply.github.com> Date: Mon, 25 Mar 2024 10:11:51 +0000 Subject: [PATCH 08/32] Now compiles with fewer runtime errors --- cuBLAS/sp_gemm.hh | 352 +++++++++++++++++++++++++++------------------- include/doGemm.hh | 42 +++--- 2 files changed, 227 insertions(+), 167 deletions(-) diff --git a/cuBLAS/sp_gemm.hh b/cuBLAS/sp_gemm.hh index c0bfb8e..fa0e39d 100644 --- a/cuBLAS/sp_gemm.hh +++ b/cuBLAS/sp_gemm.hh @@ -37,12 +37,12 @@ class sp_gemm_gpu : public sp_gemm { std::cout << "Initialising" << std::endl; offload_ = offload; - // Create a handle for cuSPARSE + // Create a handle for cuSPARSE cusparseCreate(&handle_); std::cout << "Handle created" << std::endl; - if (std::is_same_v) cudaDataType_ = CUDA_R_32F; + if (std::is_same_v) cudaDataType_ = CUDA_R_32F; else if (std::is_same_v) cudaDataType_ = CUDA_R_64F; else { std::cout << "INVALID DATA TYPE PASSED TO cuSPARSE" << std::endl; @@ -60,24 +60,38 @@ class sp_gemm_gpu : public sp_gemm { cudaCheckError(cudaStreamCreate(&s3_)); std::cout << "Streams created" << std::endl; + if (offload_ == gpuOffloadType::unified) { + std::cout << "Into unified if statement" << std::endl; + A_num_rows_ = (int*)malloc(sizeof(int)); + A_num_cols_ = (int*)malloc(sizeof(int)); + A_nnz_ = (int*)malloc(sizeof(int)); + B_num_rows_ = (int*)malloc(sizeof(int)); + B_num_cols_ = (int*)malloc(sizeof(int)); + B_nnz_ = (int*)malloc(sizeof(int)); + C_num_rows_ = (int*)malloc(sizeof(int)); + C_num_cols_ = (int*)malloc(sizeof(int)); + C_nnz_ = (int*)malloc(sizeof(int)); + } - // Work out number of edges needed to achieve target sparsity - int edges = 1 + (int) (n_ * n_ * (1 - sparsity)); - (*A_nnz_) = (*B_nnz_) = edges; - // ToDo -- for all of this mallocing, bear in mind that row will probably - // have fewer than 'edges' values (thats the whole point). 
May need to - // reorganise + // Work out number of edges needed to achieve target sparsity + int edges = 1 + (int) (n_ * n_ * (1 - sparsity)); + (*A_nnz_) = (*B_nnz_) = edges; - cudaCheckError(cudaMallocManaged(&A_val_, sizeof(T) * edges)); - cudaCheckError(cudaMallocManaged(&A_col_, sizeof(int) * edges)); - cudaCheckError(cudaMallocManaged(&A_row_, sizeof(int) * (n_ + 1))); - std::cout << "A CSR vectors malloced" << std::endl; + if (offload_ == gpuOffloadType::unified) { + std::cout << "beginning mallocs" << std::endl; + cudaCheckError(cudaMallocManaged(&A_val_, sizeof(T) * (*A_nnz_))); + std::cout << "A vals vectors malloced" << std::endl; + cudaCheckError(cudaMallocManaged(&A_col_, sizeof(int) * (*A_nnz_))); + std::cout << "A cols vectors malloced" << std::endl; + cudaCheckError(cudaMallocManaged(&A_row_, sizeof(int) * (n_ + 1))); + std::cout << "A CSR vectors malloced" << std::endl; - cudaCheckError(cudaMallocManaged(&B_val_, sizeof(T) * edges)); - cudaCheckError(cudaMallocManaged(&B_col_, sizeof(int) * edges)); - cudaCheckError(cudaMallocManaged(&B_row_, sizeof(int) * (n_ + 1))); - std::cout << "B CSR vectors malloced" << std::endl; + cudaCheckError(cudaMallocManaged(&B_val_, sizeof(T) * (*B_nnz_))); + cudaCheckError(cudaMallocManaged(&B_col_, sizeof(int) * (*B_nnz_))); + cudaCheckError(cudaMallocManaged(&B_row_, sizeof(int) * (n_ + 1))); + std::cout << "B CSR vectors malloced" << std::endl; + } // Initialise the host matricies // cusparseSpGEMM() works on CSR format only. This helpfully makes our @@ -85,10 +99,12 @@ class sp_gemm_gpu : public sp_gemm { // Initialise the matrices // Set initial values to 0 - for (int i = 0; i < (n_ * n_); i++) { - A_[i] = 0.0; - B_[i] = 0.0; - } + A_ = (T*)malloc(sizeof(T) * n_ * n_); + B_ = (T*)malloc(sizeof(T) * n_ * n_); + for (int i = 0; i < (n_ * n_); i++) { + A_[i] = 0.0; + B_[i] = 0.0; + } // Random number generator objects for use in descent std::default_random_engine gen; @@ -96,19 +112,20 @@ class sp_gemm_gpu : public sp_gemm { .time_since_epoch().count()); std::uniform_real_distribution dist(0.0, 1.0); - // Using a=0.45 and b=c=0.22 as default probabilities - for (int i = 0; i < edges; i++) { - while (!rMat(A_, n, 0, n - 1, 0, n - 1, - 0.45, 0.22, 0.22, - &gen, dist, false)) {} - while (!rMat(B_, n, 0, n - 1, 0, n - 1, - 0.45, 0.22, 0.22, - &gen, dist, false)) {} - } - - toCSR(A_, n, n, edges, A_val_, A_col_, A_row_); - toCSR(B_, n, n, edges, B_val_, B_col_, B_row_); + // Using a=0.45 and b=c=0.22 as default probabilities + for (int i = 0; i < (*A_nnz_); i++) { + while (!rMat(A_, n, 0, n - 1, 0, n - 1, + 0.45, 0.22, 0.22, + &gen, dist, false)) {} + } + for (int i = 0; i < (*B_nnz_); i++) { + while (!rMat(B_, n, 0, n - 1, 0, n - 1, + 0.45, 0.22, 0.22, + &gen, dist, false)) {} + } + toCSR(A_, n, n, (*A_nnz_), A_val_, A_col_, A_row_); + toCSR(B_, n, n, (*B_nnz_), B_val_, B_col_, B_row_); } @@ -117,135 +134,178 @@ class sp_gemm_gpu : public sp_gemm { /** Perform any required steps before calling the GEMM kernel that should * be timed. 
*/ void preLoopRequirements() override { - // Prefetch memory to device - cudaCheckError(cudaMemPrefetchAsync(&A_val_, sizeof(T) * (*A_nnz_), - gpuDevice_, s1_)); - cudaCheckError(cudaMemPrefetchAsync(&A_col_, sizeof(int) * (*A_nnz_), - gpuDevice_, s1_)); - cudaCheckError(cudaMemPrefetchAsync(&A_row_, sizeof(int) * (n_ + 1), - gpuDevice_, s1_)); - - cudaCheckError(cudaMemPrefetchAsync(&B_val_, sizeof(T) * (*B_nnz_), - gpuDevice_, s2_)); - cudaCheckError(cudaMemPrefetchAsync(&B_col_, sizeof(int) * (*B_nnz_), - gpuDevice_, s2_)); - cudaCheckError(cudaMemPrefetchAsync(&B_row_, sizeof(int) * (n_ + 1), - gpuDevice_, s2_)); -// -// cudaCheckError(cudaMemPrefetchAsync(C_num_rows_, sizeof(int), gpuDevice_, -// s3_)); -// cudaCheckError(cudaMemPrefetchAsync(C_num_cols_, sizeof(int), gpuDevice_, -// s3_)); -// cudaCheckError(cudaMemPrefetchAsync(C_nnz_, sizeof(int), gpuDevice_, -// s3_)); -// cudaCheckError(cudaMemPrefetchAsync(&C_val_, sizeof(T) * edges, gpuDevice_, -// s3_)); -// cudaCheckError(cudaMemPrefetchAsync(&C_col_, sizeof(int) * edges, -// gpuDevice_, s3_)); -// cudaCheckError(cudaMemPrefetchAsync(&C_row_, sizeof(int) * edges, -// gpuDevice_, s3_)); - - // Create the CSR matrices on the device - cusparseCreateCsr(&descrA_, n_, n_, (*A_nnz_), A_row_, A_col_, A_val_, - CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, - CUSPARSE_INDEX_BASE_ZERO, cudaDataType_); - cusparseCreateCsr(&descrB_, n_, n_, (*B_nnz_), B_row_, B_col_, B_val_, - CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, - CUSPARSE_INDEX_BASE_ZERO, cudaDataType_); - cusparseCreateCsr(&descrC_, n_, n_, 0, NULL, NULL, NULL, - CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, - CUSPARSE_INDEX_BASE_ZERO, cudaDataType_); - - cusparseSpGEMM_createDescr(&spgemmDesc_); + switch(offload_) { + case gpuOffloadType::always: { + break; + } + case gpuOffloadType::once: { + break; + } + case gpuOffloadType::unified: { + // Prefetch memory to device + cudaCheckError(cudaMemPrefetchAsync(A_val_, sizeof(T) * (*A_nnz_), + gpuDevice_, s1_)); + cudaCheckError(cudaMemPrefetchAsync(A_col_, sizeof(int) * (*A_nnz_), + gpuDevice_, s1_)); + cudaCheckError(cudaMemPrefetchAsync(A_row_, sizeof(int) * (n_ + 1), + gpuDevice_, s1_)); + + cudaCheckError(cudaMemPrefetchAsync(B_val_, sizeof(T) * (*B_nnz_), + gpuDevice_, s2_)); + cudaCheckError(cudaMemPrefetchAsync(B_col_, sizeof(int) * (*B_nnz_), + gpuDevice_, s2_)); + cudaCheckError(cudaMemPrefetchAsync(B_row_, sizeof(int) * (n_ + 1), + gpuDevice_, s2_)); + // + // cudaCheckError(cudaMemPrefetchAsync(C_num_rows_, sizeof(int), gpuDevice_, + // s3_)); + // cudaCheckError(cudaMemPrefetchAsync(C_num_cols_, sizeof(int), gpuDevice_, + // s3_)); + // cudaCheckError(cudaMemPrefetchAsync(C_nnz_, sizeof(int), gpuDevice_, + // s3_)); + // cudaCheckError(cudaMemPrefetchAsync(&C_val_, sizeof(T) * edges, gpuDevice_, + // s3_)); + // cudaCheckError(cudaMemPrefetchAsync(&C_col_, sizeof(int) * edges, + // gpuDevice_, s3_)); + // cudaCheckError(cudaMemPrefetchAsync(&C_row_, sizeof(int) * edges, + // gpuDevice_, s3_)); + + // Create the CSR matrices on the device + cusparseCreateCsr(&descrA_, n_, n_, (*A_nnz_), A_row_, A_col_, A_val_, + CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, + CUSPARSE_INDEX_BASE_ZERO, cudaDataType_); + cusparseCreateCsr(&descrB_, n_, n_, (*B_nnz_), B_row_, B_col_, B_val_, + CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, + CUSPARSE_INDEX_BASE_ZERO, cudaDataType_); + cusparseCreateCsr(&descrC_, n_, n_, 0, NULL, NULL, NULL, + CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, + CUSPARSE_INDEX_BASE_ZERO, cudaDataType_); + + 
cusparseSpGEMM_createDescr(&spgemmDesc_); + break; + } + } } /** Make a call to the BLAS Library Kernel. */ void callGemm() override { - cusparseSpGEMM_workEstimation(handle_, CUSPARSE_OPERATION_NON_TRANSPOSE, - CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, - descrA_, descrB_, &beta, descrC_, - cudaDataType_, CUSPARSE_SPGEMM_DEFAULT, - spgemmDesc_, &buffer_size1_, NULL); - cudaCheckError(cudaMallocManaged(&buffer1_, buffer_size1_)); - cusparseSpGEMM_workEstimation(handle_, CUSPARSE_OPERATION_NON_TRANSPOSE, - CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, - descrA_, descrB_, &beta, descrC_, - cudaDataType_, CUSPARSE_SPGEMM_DEFAULT, - spgemmDesc_, &buffer_size1_, buffer1_); - cusparseSpGEMM_compute(handle_, CUSPARSE_OPERATION_NON_TRANSPOSE, - CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, descrA_, - descrB_, &beta, descrC_, cudaDataType_, - CUSPARSE_SPGEMM_DEFAULT, spgemmDesc_, - &buffer_size2_, NULL); - cudaCheckError(cudaMallocManaged(&buffer2_, buffer_size2_)); - - if (cusparseSpGEMM_compute(handle_, CUSPARSE_OPERATION_NON_TRANSPOSE, - CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, descrA_, - descrB_, &beta, descrC_, cudaDataType_, - CUSPARSE_SPGEMM_DEFAULT, spgemmDesc_, - &buffer_size2_, buffer2_) - == CUSPARSE_STATUS_INSUFFICIENT_RESOURCES) { - std::cout << "Insufficient resources" << std::endl; - exit(1); - } - - int64_t rows, cols, nnz; - - cusparseSpMatGetSize(descrC_, &rows, &cols, &nnz); - (*C_nnz_) = nnz; - cudaCheckError(cudaMallocManaged(&C_val_, sizeof(T) * nnz)); - cudaCheckError(cudaMallocManaged(&C_col_, sizeof(int) * nnz)); - cudaCheckError(cudaMallocManaged(&C_row_, sizeof(int) * (n_ + 1))); - - cusparseCsrSetPointers(descrC_, C_row_, C_col_, C_val_); - cusparseSpGEMM_copy(handle_, CUSPARSE_OPERATION_NON_TRANSPOSE, - CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, descrA_, - descrB_, &beta, descrC_, CUDA_R_32F, - CUSPARSE_SPGEMM_DEFAULT, spgemmDesc_); + switch(offload_) { + case gpuOffloadType::always: { + break; + } + case gpuOffloadType::once: { + break; + } + case gpuOffloadType::unified: { + cusparseSpGEMM_workEstimation(handle_, CUSPARSE_OPERATION_NON_TRANSPOSE, + CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, + descrA_, descrB_, &beta, descrC_, + cudaDataType_, CUSPARSE_SPGEMM_DEFAULT, + spgemmDesc_, &buffer_size1_, NULL); + cudaCheckError(cudaMallocManaged(&buffer1_, buffer_size1_)); + cusparseSpGEMM_workEstimation(handle_, CUSPARSE_OPERATION_NON_TRANSPOSE, + CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, + descrA_, descrB_, &beta, descrC_, + cudaDataType_, CUSPARSE_SPGEMM_DEFAULT, + spgemmDesc_, &buffer_size1_, buffer1_); + cusparseSpGEMM_compute(handle_, CUSPARSE_OPERATION_NON_TRANSPOSE, + CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, descrA_, + descrB_, &beta, descrC_, cudaDataType_, + CUSPARSE_SPGEMM_DEFAULT, spgemmDesc_, + &buffer_size2_, NULL); + cudaCheckError(cudaMallocManaged(&buffer2_, buffer_size2_)); + + if (cusparseSpGEMM_compute(handle_, CUSPARSE_OPERATION_NON_TRANSPOSE, + CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, descrA_, + descrB_, &beta, descrC_, cudaDataType_, + CUSPARSE_SPGEMM_DEFAULT, spgemmDesc_, + &buffer_size2_, buffer2_) + == CUSPARSE_STATUS_INSUFFICIENT_RESOURCES) { + std::cout << "Insufficient resources" << std::endl; + exit(1); + } + + int64_t rows, cols, nnz; + + cusparseSpMatGetSize(descrC_, &rows, &cols, &nnz); + (*C_nnz_) = nnz; + cudaCheckError(cudaMallocManaged(&C_val_, sizeof(T) * nnz)); + cudaCheckError(cudaMallocManaged(&C_col_, sizeof(int) * nnz)); + cudaCheckError(cudaMallocManaged(&C_row_, sizeof(int) * (n_ + 1))); + + cusparseCsrSetPointers(descrC_, C_row_, C_col_, 
C_val_); + cusparseSpGEMM_copy(handle_, CUSPARSE_OPERATION_NON_TRANSPOSE, + CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, descrA_, + descrB_, &beta, descrC_, CUDA_R_32F, + CUSPARSE_SPGEMM_DEFAULT, spgemmDesc_); + break; + } + } } /** Perform any required steps after calling the GEMM kernel that should * be timed. */ void postLoopRequirements() override { - // Ensure all data resides on host once work has completed - cudaCheckError(cudaMemPrefetchAsync(&A_val_, sizeof(T) * (*A_nnz_), - cudaCpuDeviceId, s1_)); - cudaCheckError(cudaMemPrefetchAsync(&A_col_, sizeof(int) * (*A_nnz_), - cudaCpuDeviceId, s1_)); - cudaCheckError(cudaMemPrefetchAsync(&A_row_, sizeof(int) * (n_ + 1), - cudaCpuDeviceId, s1_)); - - cudaCheckError(cudaMemPrefetchAsync(&B_val_, sizeof(T) * (*B_nnz_), - cudaCpuDeviceId, s2_)); - cudaCheckError(cudaMemPrefetchAsync(&B_col_, sizeof(int) * (*B_nnz_), - cudaCpuDeviceId, s2_)); - cudaCheckError(cudaMemPrefetchAsync(&B_row_, sizeof(int) * (n_ + 1), - cudaCpuDeviceId, s2_)); - - cudaCheckError(cudaMemPrefetchAsync(&C_val_, sizeof(T) * (*C_nnz_), - cudaCpuDeviceId, s3_)); - cudaCheckError(cudaMemPrefetchAsync(&C_col_, sizeof(int) * (*C_nnz_), - cudaCpuDeviceId, s3_)); - cudaCheckError(cudaMemPrefetchAsync(&C_row_, sizeof(int) * (n_ + 1), - cudaCpuDeviceId, s3_)); - // Ensure device has finished all work. - cudaCheckError(cudaDeviceSynchronize()); + switch(offload_) { + case gpuOffloadType::always: { + break; + } + case gpuOffloadType::once: { + break; + } + case gpuOffloadType::unified: { + // Ensure all data resides on host once work has completed + cudaCheckError(cudaMemPrefetchAsync(A_val_, sizeof(T) * (*A_nnz_), + cudaCpuDeviceId, s1_)); + cudaCheckError(cudaMemPrefetchAsync(A_col_, sizeof(int) * (*A_nnz_), + cudaCpuDeviceId, s1_)); + cudaCheckError(cudaMemPrefetchAsync(A_row_, sizeof(int) * (n_ + 1), + cudaCpuDeviceId, s1_)); + + cudaCheckError(cudaMemPrefetchAsync(B_val_, sizeof(T) * (*B_nnz_), + cudaCpuDeviceId, s2_)); + cudaCheckError(cudaMemPrefetchAsync(B_col_, sizeof(int) * (*B_nnz_), + cudaCpuDeviceId, s2_)); + cudaCheckError(cudaMemPrefetchAsync(B_row_, sizeof(int) * (n_ + 1), + cudaCpuDeviceId, s2_)); + + cudaCheckError(cudaMemPrefetchAsync(C_val_, sizeof(T) * (*C_nnz_), + cudaCpuDeviceId, s3_)); + cudaCheckError(cudaMemPrefetchAsync(C_col_, sizeof(int) * (*C_nnz_), + cudaCpuDeviceId, s3_)); + cudaCheckError(cudaMemPrefetchAsync(C_row_, sizeof(int) * (n_ + 1), + cudaCpuDeviceId, s3_)); + // Ensure device has finished all work. + cudaCheckError(cudaDeviceSynchronize()); + break; + } + } } /** Do any necessary cleanup (free pointers, close library handles, etc.) * after Kernel has been called. 
*/ void postCallKernelCleanup() override { - // Destroy the handle - cusparseDestroy(handle_); - - // Destroy streams after use - cudaCheckError(cudaStreamDestroy(s1_)); - cudaCheckError(cudaStreamDestroy(s2_)); - cudaCheckError(cudaStreamDestroy(s3_)); + if (offload_ == gpuOffloadType::unified) { + // Destroy the handle + cusparseDestroy(handle_); + + // Destroy streams after use + cudaCheckError(cudaStreamDestroy(s1_)); + cudaCheckError(cudaStreamDestroy(s2_)); + cudaCheckError(cudaStreamDestroy(s3_)); + } - cudaFree(A_); - cudaFree(B_); - cudaFree(C_); + if (offload_ == gpuOffloadType::unified) { + cudaFree(A_val_); + cudaFree(A_col_); + cudaFree(A_row_); + cudaFree(B_val_); + cudaFree(B_col_); + cudaFree(B_row_); + cudaFree(C_val_); + cudaFree(C_col_); + cudaFree(C_row_); + } } bool rMat(T* M, int n, int x1, int x2, int y1, int y2, diff --git a/include/doGemm.hh b/include/doGemm.hh index 5565fb2..0e4dcc0 100644 --- a/include/doGemm.hh +++ b/include/doGemm.hh @@ -516,23 +516,23 @@ class doGemm { time_checksum_gflop cpuResult = spGemmCpu_.compute(); cpuResult.gflops = calcGflops(flops, iterations_, cpuResult.runtime); - // Perform the GPU kernels - // - ONCE : Offload to/from GPU once before all iterations and once - // after - spGemmGpu_.initialise(gpuOffloadType::once, N, sparsity); - time_checksum_gflop gpuResult_once = gemmGpu_.compute(); - gpuResult_once.gflops = - calcGflops(flops, iterations_, gpuResult_once.runtime); - - // - ALWAYS: Offload to/from GPU every iteration - spGemmGpu_.initialise(gpuOffloadType::always, N, sparsity); - time_checksum_gflop gpuResult_always = gemmGpu_.compute(); - gpuResult_always.gflops = - calcGflops(flops, iterations_, gpuResult_always.runtime); - // - UNIFIED : data passed from host to device (and device to host) as - // needed +// // Perform the GPU kernels +// // - ONCE : Offload to/from GPU once before all iterations and once +// // after +// spGemmGpu_.initialise(gpuOffloadType::once, N, sparsity); +// time_checksum_gflop gpuResult_once = spGemmGpu_.compute(); +// gpuResult_once.gflops = +// calcGflops(flops, iterations_, gpuResult_once.runtime); +// +// // - ALWAYS: Offload to/from GPU every iteration +// spGemmGpu_.initialise(gpuOffloadType::always, N, sparsity); +// time_checksum_gflop gpuResult_always = spGemmGpu_.compute(); +// gpuResult_always.gflops = +// calcGflops(flops, iterations_, gpuResult_always.runtime); +// // - UNIFIED : data passed from host to device (and device to host) as +// // needed spGemmGpu_.initialise(gpuOffloadType::unified, N, sparsity); - time_checksum_gflop gpuResult_unified = gemmGpu_.compute(); + time_checksum_gflop gpuResult_unified = spGemmGpu_.compute(); gpuResult_unified.gflops = calcGflops(flops, iterations_, gpuResult_unified.runtime); @@ -541,11 +541,11 @@ class doGemm { // Write lines to CSV file writeLineToCsv(csvFile, "cpu", kernelName, N, N, N, probSize, iterations_, cpuResult.runtime, cpuResult.gflops); - writeLineToCsv(csvFile, "gpu_offloadOnce", kernelName, N, N, N, probSize, - iterations_, gpuResult_once.runtime, gpuResult_once.gflops); - writeLineToCsv(csvFile, "gpu_offloadAlways", kernelName, N, N, N, probSize, - iterations_, gpuResult_always.runtime, - gpuResult_always.gflops); +// writeLineToCsv(csvFile, "gpu_offloadOnce", kernelName, N, N, N, probSize, +// iterations_, gpuResult_once.runtime, gpuResult_once.gflops); +// writeLineToCsv(csvFile, "gpu_offloadAlways", kernelName, N, N, N, probSize, +// iterations_, gpuResult_always.runtime, +// gpuResult_always.gflops); writeLineToCsv(csvFile, 
"gpu_unified", kernelName, N, N, N, probSize, iterations_, gpuResult_unified.runtime, gpuResult_unified.gflops); From bcd7ae88a01ec199951162c3fdba2d41817edff9 Mon Sep 17 00:00:00 2001 From: Alex Cockrean <84676155+ABenC377@users.noreply.github.com> Date: Fri, 11 Oct 2024 15:23:02 +0100 Subject: [PATCH 09/32] rebasing --- cuBLAS/common.hh | 13 ++ cuBLAS/sp_gemm.hh | 576 ++++++++++++++++++++++++++++++++++------------ include/doGemm.hh | 34 +-- 3 files changed, 458 insertions(+), 165 deletions(-) diff --git a/cuBLAS/common.hh b/cuBLAS/common.hh index 78d0270..70d58fb 100644 --- a/cuBLAS/common.hh +++ b/cuBLAS/common.hh @@ -2,6 +2,9 @@ #if defined GPU_CUBLAS +#include "cusparse.h" + +/** Macro function to check if error occurred when calling cuBLAS. */ /** Macro function to check if error occurred when calling CUDA. */ #define cudaCheckError(f) \ do { \ @@ -22,4 +25,14 @@ } \ } while (false) +#define cusparseCheckError(f) \ + do { \ + cusparseStatus_t status = (f); \ + if (status != CUSPARSE_STATUS_SUCCESS) { \ + std::cout << "CUSPARSE error: " << __FILE__ << ":" << __LINE__ << ": " \ + << cusparseGetErrorString(status) << std::endl; \ + exit(1); \ + } \ + } while (false) \ + #endif \ No newline at end of file diff --git a/cuBLAS/sp_gemm.hh b/cuBLAS/sp_gemm.hh index fa0e39d..0879966 100644 --- a/cuBLAS/sp_gemm.hh +++ b/cuBLAS/sp_gemm.hh @@ -34,12 +34,9 @@ class sp_gemm_gpu : public sp_gemm { * - Unified: Initialise data as unified memory; no data movement semantics * required */ void initialise(gpuOffloadType offload, int n, float sparsity) override { - std::cout << "Initialising" << std::endl; - offload_ = offload; + std::cout << "_/_/_/_/ Initialising for problem size: " << n << std::endl; - // Create a handle for cuSPARSE - cusparseCreate(&handle_); - std::cout << "Handle created" << std::endl; + offload_ = offload; if (std::is_same_v) cudaDataType_ = CUDA_R_32F; @@ -52,45 +49,51 @@ class sp_gemm_gpu : public sp_gemm { // Get device identifier cudaCheckError(cudaGetDevice(&gpuDevice_)); - std::cout << "GPU device got" << std::endl; // Initialise 3 streams to asynchronously move data between host and device cudaCheckError(cudaStreamCreate(&s1_)); cudaCheckError(cudaStreamCreate(&s2_)); cudaCheckError(cudaStreamCreate(&s3_)); - std::cout << "Streams created" << std::endl; - if (offload_ == gpuOffloadType::unified) { - std::cout << "Into unified if statement" << std::endl; - A_num_rows_ = (int*)malloc(sizeof(int)); - A_num_cols_ = (int*)malloc(sizeof(int)); - A_nnz_ = (int*)malloc(sizeof(int)); - B_num_rows_ = (int*)malloc(sizeof(int)); - B_num_cols_ = (int*)malloc(sizeof(int)); - B_nnz_ = (int*)malloc(sizeof(int)); - C_num_rows_ = (int*)malloc(sizeof(int)); - C_num_cols_ = (int*)malloc(sizeof(int)); - C_nnz_ = (int*)malloc(sizeof(int)); - } // Work out number of edges needed to achieve target sparsity int edges = 1 + (int) (n_ * n_ * (1 - sparsity)); - (*A_nnz_) = (*B_nnz_) = edges; + A_nnz_ = B_nnz_ = edges; if (offload_ == gpuOffloadType::unified) { - std::cout << "beginning mallocs" << std::endl; - cudaCheckError(cudaMallocManaged(&A_val_, sizeof(T) * (*A_nnz_))); - std::cout << "A vals vectors malloced" << std::endl; - cudaCheckError(cudaMallocManaged(&A_col_, sizeof(int) * (*A_nnz_))); - std::cout << "A cols vectors malloced" << std::endl; + cudaCheckError(cudaMallocManaged(&A_val_, sizeof(T) * A_nnz_)); + cudaCheckError(cudaMallocManaged(&A_col_, sizeof(int) * A_nnz_)); cudaCheckError(cudaMallocManaged(&A_row_, sizeof(int) * (n_ + 1))); - std::cout << "A CSR vectors malloced" << 
std::endl; - cudaCheckError(cudaMallocManaged(&B_val_, sizeof(T) * (*B_nnz_))); - cudaCheckError(cudaMallocManaged(&B_col_, sizeof(int) * (*B_nnz_))); + cudaCheckError(cudaMallocManaged(&B_val_, sizeof(T) * B_nnz_)); + cudaCheckError(cudaMallocManaged(&B_col_, sizeof(int) * B_nnz_)); cudaCheckError(cudaMallocManaged(&B_row_, sizeof(int) * (n_ + 1))); - std::cout << "B CSR vectors malloced" << std::endl; + + cudaCheckError(cudaMallocManaged(&C_row_, sizeof(int) * (n_ + 1))); + C_val_ = NULL; + C_col_ = NULL; + } else { + A_val_ = (T*)malloc(sizeof(T) * A_nnz_); + A_col_ = (int*)malloc(sizeof(int) * A_nnz_); + A_row_ = (int*)malloc(sizeof(int) * (n_ + 1)); + + B_val_ = (T*)malloc(sizeof(T) * B_nnz_); + B_col_ = (int*)malloc(sizeof(int) * B_nnz_); + B_row_ = (int*)malloc(sizeof(int) * (n_ + 1)); + + C_row_ = (int*)malloc(sizeof(int) * (n_ + 1)); + + + cudaCheckError(cudaMalloc((void**)&A_val_dev_, sizeof(T) * A_nnz_)); + cudaCheckError(cudaMalloc((void**)&A_col_dev_, sizeof(int) * A_nnz_)); + cudaCheckError(cudaMalloc((void**)&A_row_dev_, sizeof(int) * (n_ + 1))); + + cudaCheckError(cudaMalloc((void**)&B_val_dev_, sizeof(T) * B_nnz_)); + cudaCheckError(cudaMalloc((void**)&B_col_dev_, sizeof(int) * B_nnz_)); + cudaCheckError(cudaMalloc((void**)&B_row_dev_, sizeof(int) * (n_ + 1))); + + cudaCheckError(cudaMalloc((void**)&C_row_dev_, sizeof(int) * (n_ + 1))); } // Initialise the host matricies @@ -113,75 +116,116 @@ class sp_gemm_gpu : public sp_gemm { std::uniform_real_distribution dist(0.0, 1.0); // Using a=0.45 and b=c=0.22 as default probabilities - for (int i = 0; i < (*A_nnz_); i++) { - while (!rMat(A_, n, 0, n - 1, 0, n - 1, + for (int i = 0; i < A_nnz_; i++) { + while (!rMat(A_, n_, 0, n_ - 1, 0, n_ - 1, 0.45, 0.22, 0.22, &gen, dist, false)) {} } - for (int i = 0; i < (*B_nnz_); i++) { - while (!rMat(B_, n, 0, n - 1, 0, n - 1, + for (int i = 0; i < B_nnz_; i++) { + while (!rMat(B_, n_, 0, n_ - 1, 0, n_ - 1, 0.45, 0.22, 0.22, &gen, dist, false)) {} } - toCSR(A_, n, n, (*A_nnz_), A_val_, A_col_, A_row_); - toCSR(B_, n, n, (*B_nnz_), B_val_, B_col_, B_row_); - } + toCSR(A_, n_, n_, A_nnz_, A_val_, A_col_, A_row_); + + toCSR(B_, n_, n_, B_nnz_, B_val_, B_col_, B_row_); + +// std::cout << "_____Matrix A_____" << std::endl; +// printDenseMatrix(A_, n_, n_); +// std::cout << std::endl << std::endl; +// printCSR(A_val_, A_col_, A_row_, A_nnz_, n_, n_); +// +// +// std::cout << "_____Matrix B_____" << std::endl; +// printDenseMatrix(B_, n_, n_); +// std::cout << std::endl << std::endl; +// printCSR(B_val_, B_col_, B_row_, B_nnz_, n_, n_); + // Create a handle for cuSPARSE + cusparseCheckError(cusparseCreate(&handle_)); + } private: /** Perform any required steps before calling the GEMM kernel that should * be timed. 
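The allocation above now splits by offload mode: unified runs place each CSR array in managed memory (a single allocation that the driver migrates on demand or via prefetch hints), while the always and once paths keep a pageable host copy plus an explicit device buffer that is filled with cudaMemcpyAsync; in both cases C's value and column arrays stay unallocated until the product's nnz is known. A minimal sketch of the two strategies for one value array, with hypothetical names:

    #include <cstdlib>
    #include <cuda_runtime.h>

    // One CSR value array under the two allocation strategies (sketch only).
    void allocateValues(bool unified, size_t nnz,
                        float** managed, float** host, float** device) {
      if (unified) {
        // Single managed allocation, addressable from host and device.
        cudaMallocManaged(managed, nnz * sizeof(float));
      } else {
        // Separate host staging buffer and device buffer; the data moves with
        // explicit cudaMemcpyAsync calls around the timed region.
        *host = (float*)std::malloc(nnz * sizeof(float));
        cudaMalloc(device, nnz * sizeof(float));
      }
    }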
*/ void preLoopRequirements() override { + std::cout << "\t\tPreLoop" << std::endl; + cusparseCheckError(cusparseSpGEMM_createDescr(&spgemmDesc_)); switch(offload_) { case gpuOffloadType::always: { + // Make matrix descriptors + cusparseCheckError( + cusparseCreateCsr(&descrA_, n_, n_, A_nnz_, A_row_dev_, + A_col_dev_, A_val_dev_, rType_, cType_, + indType_, cudaDataType_)); + cusparseCheckError( + cusparseCreateCsr(&descrB_, n_, n_, B_nnz_, B_row_dev_, + B_col_dev_, B_val_dev_, rType_, cType_, + indType_, cudaDataType_)); + cusparseCheckError( + cusparseCreateCsr(&descrC_, n_, n_, 0, C_row_dev_, NULL, NULL, + rType_, cType_, indType_, cudaDataType_)); break; } case gpuOffloadType::once: { + cudaCheckError(cudaMemcpyAsync(A_val_dev_, A_val_, sizeof(T) * + A_nnz_, cudaMemcpyHostToDevice, s1_)); + cudaCheckError(cudaMemcpyAsync(A_col_dev_, A_col_, sizeof(int) * + A_nnz_, cudaMemcpyHostToDevice, s1_)); + cudaCheckError(cudaMemcpyAsync(A_row_dev_, A_row_, sizeof(int) * (n_ + + 1), cudaMemcpyHostToDevice, s1_)); + + cudaCheckError(cudaMemcpyAsync(B_val_dev_, B_val_, sizeof(T) * + B_nnz_, cudaMemcpyHostToDevice, s1_)); + cudaCheckError(cudaMemcpyAsync(B_col_dev_, B_col_, sizeof(int) * + B_nnz_, cudaMemcpyHostToDevice, s1_)); + cudaCheckError(cudaMemcpyAsync(B_row_dev_, B_row_, sizeof(int) * (n_ + + 1), cudaMemcpyHostToDevice, s1_)); + + // Craete matrix descriptors + cusparseCheckError( + cusparseCreateCsr(&descrA_, n_, n_, A_nnz_, A_row_dev_, + A_col_dev_, A_val_dev_, rType_, cType_, + indType_, cudaDataType_)); + cusparseCheckError( + cusparseCreateCsr(&descrB_, n_, n_, B_nnz_, B_row_dev_, + B_col_dev_, B_val_dev_, rType_, cType_, + indType_, cudaDataType_)); + cusparseCheckError( + cusparseCreateCsr(&descrC_, n_, n_, 0, C_row_dev_, NULL, NULL, + rType_, cType_, indType_, cudaDataType_)); break; } case gpuOffloadType::unified: { // Prefetch memory to device - cudaCheckError(cudaMemPrefetchAsync(A_val_, sizeof(T) * (*A_nnz_), + cudaCheckError(cudaMemPrefetchAsync(A_val_, sizeof(T) * A_nnz_, gpuDevice_, s1_)); - cudaCheckError(cudaMemPrefetchAsync(A_col_, sizeof(int) * (*A_nnz_), + cudaCheckError(cudaMemPrefetchAsync(A_col_, sizeof(int) * A_nnz_, gpuDevice_, s1_)); cudaCheckError(cudaMemPrefetchAsync(A_row_, sizeof(int) * (n_ + 1), gpuDevice_, s1_)); - cudaCheckError(cudaMemPrefetchAsync(B_val_, sizeof(T) * (*B_nnz_), + cudaCheckError(cudaMemPrefetchAsync(B_val_, sizeof(T) * B_nnz_, gpuDevice_, s2_)); - cudaCheckError(cudaMemPrefetchAsync(B_col_, sizeof(int) * (*B_nnz_), + cudaCheckError(cudaMemPrefetchAsync(B_col_, sizeof(int) * B_nnz_, gpuDevice_, s2_)); cudaCheckError(cudaMemPrefetchAsync(B_row_, sizeof(int) * (n_ + 1), gpuDevice_, s2_)); - // - // cudaCheckError(cudaMemPrefetchAsync(C_num_rows_, sizeof(int), gpuDevice_, - // s3_)); - // cudaCheckError(cudaMemPrefetchAsync(C_num_cols_, sizeof(int), gpuDevice_, - // s3_)); - // cudaCheckError(cudaMemPrefetchAsync(C_nnz_, sizeof(int), gpuDevice_, - // s3_)); - // cudaCheckError(cudaMemPrefetchAsync(&C_val_, sizeof(T) * edges, gpuDevice_, - // s3_)); - // cudaCheckError(cudaMemPrefetchAsync(&C_col_, sizeof(int) * edges, - // gpuDevice_, s3_)); - // cudaCheckError(cudaMemPrefetchAsync(&C_row_, sizeof(int) * edges, - // gpuDevice_, s3_)); - - // Create the CSR matrices on the device - cusparseCreateCsr(&descrA_, n_, n_, (*A_nnz_), A_row_, A_col_, A_val_, - CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, - CUSPARSE_INDEX_BASE_ZERO, cudaDataType_); - cusparseCreateCsr(&descrB_, n_, n_, (*B_nnz_), B_row_, B_col_, B_val_, - CUSPARSE_INDEX_32I, 
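A caveat about the host-to-device staging used in the once and always paths: cudaMemcpyAsync only overlaps with host work when the host buffer is page-locked, and the CSR arrays here come from plain malloc, so these copies are staged by the driver and complete largely synchronously. That is fine for correctness; if overlap ever matters for the offload comparison, the usual alternative is a pinned allocation, sketched here with assumed names:

    #include <algorithm>
    #include <cuda_runtime.h>

    // Stage nnz floats to the device through a pinned host buffer so the
    // asynchronous copy can genuinely overlap with other host work.
    void stageThroughPinned(const float* src, float* dev, size_t nnz,
                            cudaStream_t stream) {
      float* pinned = nullptr;
      cudaMallocHost(&pinned, nnz * sizeof(float));   // page-locked host memory
      std::copy(src, src + nnz, pinned);
      cudaMemcpyAsync(dev, pinned, nnz * sizeof(float),
                      cudaMemcpyHostToDevice, stream);
      cudaStreamSynchronize(stream);                  // pinned buffer must outlive the copy
      cudaFreeHost(pinned);
    }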
CUSPARSE_INDEX_32I, - CUSPARSE_INDEX_BASE_ZERO, cudaDataType_); - cusparseCreateCsr(&descrC_, n_, n_, 0, NULL, NULL, NULL, - CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, - CUSPARSE_INDEX_BASE_ZERO, cudaDataType_); - - cusparseSpGEMM_createDescr(&spgemmDesc_); + + // Make matrix descriptors + cusparseCheckError( + cusparseCreateCsr(&descrA_, n_, n_, A_nnz_, A_row_, A_col_, + A_val_, rType_, cType_, indType_, + cudaDataType_)); + cusparseCheckError( + cusparseCreateCsr(&descrB_, n_, n_, B_nnz_, B_row_, B_col_, + B_val_, rType_, cType_, indType_, + cudaDataType_)); + cusparseCheckError( + cusparseCreateCsr(&descrC_, n_, n_, 0, C_row_, NULL, NULL, + rType_, cType_, indType_, cudaDataType_)); break; } } @@ -189,55 +233,208 @@ class sp_gemm_gpu : public sp_gemm { /** Make a call to the BLAS Library Kernel. */ void callGemm() override { + std::cout << "\t\tcallGemm" << std::endl; switch(offload_) { case gpuOffloadType::always: { + cudaCheckError(cudaMemcpyAsync(A_val_dev_, A_val_, sizeof(T) * + A_nnz_, cudaMemcpyHostToDevice, s1_)); + cudaCheckError(cudaMemcpyAsync(A_col_dev_, A_col_, sizeof(int) * + A_nnz_, cudaMemcpyHostToDevice, s1_)); + cudaCheckError(cudaMemcpyAsync(A_row_dev_, A_row_, sizeof(int) * (n_ + + 1), cudaMemcpyHostToDevice, s1_)); + + cudaCheckError(cudaMemcpyAsync(B_val_dev_, B_val_, sizeof(T) * + B_nnz_, cudaMemcpyHostToDevice, s1_)); + cudaCheckError(cudaMemcpyAsync(B_col_dev_, B_col_, sizeof(int) * + B_nnz_, cudaMemcpyHostToDevice, s1_)); + cudaCheckError(cudaMemcpyAsync(B_row_dev_, B_row_, sizeof(int) * (n_ + + 1), cudaMemcpyHostToDevice, s1_)); + + cusparseCheckError( + cusparseSpGEMM_copy(handle_, opA_, opB_, &alpha, descrA_, + descrB_, &beta, descrC_, cudaDataType_, + alg_, spgemmDesc_)); + + cusparseCheckError( + cusparseSpGEMM_workEstimation(handle_, opA_, opB_, &alpha, + descrA_, descrB_, &beta, + descrC_, cudaDataType_, alg_, + spgemmDesc_, &buffer_size1_, + NULL)); + cudaCheckError(cudaMalloc((void**)&buffer1_, buffer_size1_)); + cusparseCheckError( + cusparseSpGEMM_workEstimation(handle_, opA_, opB_, &alpha, + descrA_, descrB_, &beta, + descrC_, cudaDataType_, alg_, + spgemmDesc_, &buffer_size1_, + buffer1_)); + cusparseCheckError( + cusparseSpGEMM_compute(handle_, opA_, opB_, &alpha, descrA_, + descrB_, &beta, descrC_, cudaDataType_, + alg_, spgemmDesc_, &buffer_size2_, + NULL)); + cudaCheckError(cudaMalloc((void**)&buffer2_, buffer_size2_)); + + cusparseCheckError( + cusparseSpGEMM_compute(handle_, opA_, opB_, &alpha, descrA_, + descrB_, &beta, descrC_, + cudaDataType_, alg_, spgemmDesc_, + &buffer_size2_, buffer2_)); + + cusparseCheckError( + cusparseSpMatGetSize(descrC_, &C_num_rows_, &C_num_cols_, + &C_nnz_)); + + cusparseCheckError( + cusparseSpMatGetSize(descrC_, &C_num_rows_, &C_num_cols_, + &C_nnz_)); + + cudaCheckError(cudaMalloc(&C_val_dev_, sizeof(T) * C_nnz_)); + cudaCheckError(cudaMalloc(&C_col_dev_, sizeof(int) * C_nnz_)); + + cusparseCheckError( + cusparseCsrSetPointers(descrC_, C_row_dev_, C_col_dev_, + C_val_dev_)); + cusparseCheckError( + cusparseSpGEMM_copy(handle_, opA_, opB_, &alpha, descrA_, + descrB_, &beta, descrC_, cudaDataType_, + alg_, spgemmDesc_)); + + cudaCheckError(cudaMemcpyAsync(A_val_, A_val_dev_, sizeof(T) * + A_nnz_, cudaMemcpyDeviceToHost, s1_)); + cudaCheckError(cudaMemcpyAsync(A_col_, A_col_dev_, sizeof(int) * + A_nnz_, cudaMemcpyDeviceToHost, s1_)); + cudaCheckError(cudaMemcpyAsync(A_row_, A_row_dev_, sizeof(int) * + (n_ + 1), cudaMemcpyDeviceToHost, s1_)); + + cudaCheckError(cudaMemcpyAsync(B_val_, B_val_dev_, sizeof(T) * + 
B_nnz_, cudaMemcpyDeviceToHost, s2_)); + cudaCheckError(cudaMemcpyAsync(B_col_, B_col_dev_, sizeof(int) * + B_nnz_, cudaMemcpyDeviceToHost, s2_)); + cudaCheckError(cudaMemcpyAsync(B_row_, B_row_dev_, sizeof(int) * + (n_ + 1), cudaMemcpyDeviceToHost, s2_)); + + C_val_ = (T*)malloc(sizeof(T) * C_nnz_); + C_col_ = (int*)malloc(sizeof(int) * C_nnz_); + cudaCheckError(cudaMemcpyAsync(C_val_, C_val_dev_, sizeof(T) * + C_nnz_, cudaMemcpyDeviceToHost, s3_)); + cudaCheckError(cudaMemcpyAsync(C_col_, C_col_dev_, sizeof(int) * + C_nnz_, cudaMemcpyDeviceToHost, s3_)); + cudaCheckError(cudaMemcpyAsync(C_row_, C_row_dev_, sizeof(int) * + (n_ + 1), cudaMemcpyDeviceToHost, s3_)); + cudaCheckError(cudaDeviceSynchronize()); + + // Freeing memory + cudaCheckError(cudaFree(buffer1_)); + cudaCheckError(cudaFree(buffer2_)); + cudaCheckError(cudaFree(C_val_dev_)); + cudaCheckError(cudaFree(C_col_dev_)); + free(C_val_); + free(C_col_); break; } case gpuOffloadType::once: { + cusparseCheckError( + cusparseSpGEMM_copy(handle_, opA_, opB_, &alpha, descrA_, + descrB_, &beta, descrC_, cudaDataType_, + alg_, spgemmDesc_)); + + cusparseCheckError( + cusparseSpGEMM_workEstimation(handle_, opA_, opB_, &alpha, + descrA_, descrB_, &beta, + descrC_, cudaDataType_, alg_, + spgemmDesc_, &buffer_size1_, + NULL)); + cudaCheckError(cudaMalloc((void**)&buffer1_, buffer_size1_)); + cusparseCheckError( + cusparseSpGEMM_workEstimation(handle_, opA_, opB_, &alpha, + descrA_, descrB_, &beta, + descrC_, cudaDataType_, alg_, + spgemmDesc_, &buffer_size1_, + buffer1_)); + cusparseCheckError( + cusparseSpGEMM_compute(handle_, opA_, opB_, &alpha, descrA_, + descrB_, &beta, descrC_, cudaDataType_, + alg_, spgemmDesc_, &buffer_size2_, + NULL)); + cudaCheckError(cudaMalloc((void**)&buffer2_, buffer_size2_)); + + cusparseCheckError( + cusparseSpGEMM_compute(handle_, opA_, opB_, &alpha, descrA_, + descrB_, &beta, descrC_, cudaDataType_, + alg_, spgemmDesc_, &buffer_size2_, buffer2_)); + + cusparseCheckError( + cusparseSpMatGetSize(descrC_, &C_num_rows_, &C_num_cols_, + &C_nnz_)); + + cudaCheckError(cudaMalloc(&C_val_dev_, sizeof(T) * C_nnz_)); + cudaCheckError(cudaMalloc(&C_col_dev_, sizeof(int) * C_nnz_)); + + cusparseCheckError( + cusparseCsrSetPointers(descrC_, C_row_dev_, C_col_dev_, + C_val_dev_)); + cusparseCheckError( + cusparseSpGEMM_copy(handle_, opA_, opB_, &alpha, + descrA_, descrB_, &beta, descrC_, + cudaDataType_, alg_, spgemmDesc_)); + + // Freeing memory + cudaCheckError(cudaFree(buffer1_)); + cudaCheckError(cudaFree(buffer2_)); break; } case gpuOffloadType::unified: { - cusparseSpGEMM_workEstimation(handle_, CUSPARSE_OPERATION_NON_TRANSPOSE, - CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, - descrA_, descrB_, &beta, descrC_, - cudaDataType_, CUSPARSE_SPGEMM_DEFAULT, - spgemmDesc_, &buffer_size1_, NULL); - cudaCheckError(cudaMallocManaged(&buffer1_, buffer_size1_)); - cusparseSpGEMM_workEstimation(handle_, CUSPARSE_OPERATION_NON_TRANSPOSE, - CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, - descrA_, descrB_, &beta, descrC_, - cudaDataType_, CUSPARSE_SPGEMM_DEFAULT, - spgemmDesc_, &buffer_size1_, buffer1_); - cusparseSpGEMM_compute(handle_, CUSPARSE_OPERATION_NON_TRANSPOSE, - CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, descrA_, + cusparseCheckError( + cusparseSpGEMM_workEstimation(handle_, opA_, opB_, &alpha, + descrA_, descrB_, &beta, + descrC_, cudaDataType_, + alg_, spgemmDesc_, &buffer_size1_, + NULL)); + cudaCheckError(cudaMallocManaged((void**)&buffer1_, buffer_size1_)); + cusparseCheckError( + cusparseSpGEMM_workEstimation(handle_, opA_, 
opB_, &alpha, + descrA_, descrB_, &beta, + descrC_, cudaDataType_, + alg_, spgemmDesc_, &buffer_size1_, + buffer1_)); + cusparseCheckError( + cusparseSpGEMM_compute(handle_, opA_, opB_, &alpha, descrA_, + descrB_, &beta, descrC_, cudaDataType_, + alg_, spgemmDesc_, &buffer_size2_, + NULL)); + cudaCheckError(cudaMallocManaged((void**)&buffer2_, buffer_size2_)); + + cusparseCheckError( + cusparseSpGEMM_compute(handle_, opA_, opB_, &alpha, descrA_, descrB_, &beta, descrC_, cudaDataType_, - CUSPARSE_SPGEMM_DEFAULT, spgemmDesc_, - &buffer_size2_, NULL); - cudaCheckError(cudaMallocManaged(&buffer2_, buffer_size2_)); + alg_, spgemmDesc_, &buffer_size2_, buffer2_)); - if (cusparseSpGEMM_compute(handle_, CUSPARSE_OPERATION_NON_TRANSPOSE, - CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, descrA_, - descrB_, &beta, descrC_, cudaDataType_, - CUSPARSE_SPGEMM_DEFAULT, spgemmDesc_, - &buffer_size2_, buffer2_) - == CUSPARSE_STATUS_INSUFFICIENT_RESOURCES) { - std::cout << "Insufficient resources" << std::endl; - exit(1); - } - - int64_t rows, cols, nnz; - - cusparseSpMatGetSize(descrC_, &rows, &cols, &nnz); - (*C_nnz_) = nnz; - cudaCheckError(cudaMallocManaged(&C_val_, sizeof(T) * nnz)); - cudaCheckError(cudaMallocManaged(&C_col_, sizeof(int) * nnz)); - cudaCheckError(cudaMallocManaged(&C_row_, sizeof(int) * (n_ + 1))); - - cusparseCsrSetPointers(descrC_, C_row_, C_col_, C_val_); - cusparseSpGEMM_copy(handle_, CUSPARSE_OPERATION_NON_TRANSPOSE, - CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, descrA_, - descrB_, &beta, descrC_, CUDA_R_32F, - CUSPARSE_SPGEMM_DEFAULT, spgemmDesc_); + cusparseCheckError( + cusparseSpMatGetSize(descrC_, &C_num_rows_, &C_num_cols_, + &C_nnz_)); + + cudaCheckError(cudaMallocManaged(&C_val_, sizeof(T) * C_nnz_)); + cudaCheckError(cudaMallocManaged(&C_col_, sizeof(int) * C_nnz_)); + + cusparseCheckError( + cusparseCsrSetPointers(descrC_, C_row_, C_col_, C_val_)); + cusparseCheckError( + cusparseSpGEMM_copy(handle_, opA_, opB_, &alpha, descrA_, + descrB_, &beta, descrC_, cudaDataType_, + alg_, spgemmDesc_)); + + + cudaCheckError(cudaMemPrefetchAsync(C_val_, sizeof(T) * C_nnz_, + cudaCpuDeviceId, s3_)); + cudaCheckError(cudaMemPrefetchAsync(C_col_, sizeof(int) * C_nnz_, + cudaCpuDeviceId, s3_)); + + // Freeing memory + cudaCheckError(cudaFree(buffer1_)); + cudaCheckError(cudaFree(buffer2_)); + cudaCheckError(cudaFree(C_val_)); + cudaCheckError(cudaFree(C_col_)); break; } } @@ -246,33 +443,63 @@ class sp_gemm_gpu : public sp_gemm { /** Perform any required steps after calling the GEMM kernel that should * be timed. 
*/ void postLoopRequirements() override { + std::cout << "\t\tPostLoop" << std::endl; + cusparseCheckError(cusparseSpGEMM_destroyDescr(spgemmDesc_)); + // Destroying descriptors + cusparseCheckError(cusparseDestroySpMat(descrA_)); + cusparseCheckError(cusparseDestroySpMat(descrB_)); + cusparseCheckError(cusparseDestroySpMat(descrC_)); switch(offload_) { case gpuOffloadType::always: { break; } case gpuOffloadType::once: { + cudaCheckError(cudaMemcpyAsync(A_val_, A_val_dev_, sizeof(T) * + A_nnz_, cudaMemcpyDeviceToHost, s1_)); + cudaCheckError(cudaMemcpyAsync(A_col_, A_col_dev_, sizeof(int) * + A_nnz_, cudaMemcpyDeviceToHost, s1_)); + cudaCheckError(cudaMemcpyAsync(A_row_, A_row_dev_, sizeof(int) * + (n_ + 1), cudaMemcpyDeviceToHost, s1_)); + + cudaCheckError(cudaMemcpyAsync(B_val_, B_val_dev_, sizeof(T) * + B_nnz_, cudaMemcpyDeviceToHost, s2_)); + cudaCheckError(cudaMemcpyAsync(B_col_, B_col_dev_, sizeof(int) * + B_nnz_, cudaMemcpyDeviceToHost, s2_)); + cudaCheckError(cudaMemcpyAsync(B_row_, B_row_dev_, sizeof(int) * + (n_ + 1), cudaMemcpyDeviceToHost, s2_)); + + C_val_ = (T*)malloc(sizeof(T) * C_nnz_); + C_col_ = (int*)malloc(sizeof(int) * C_nnz_); + cudaCheckError(cudaMemcpyAsync(C_val_, C_val_dev_, sizeof(T) * + C_nnz_, cudaMemcpyDeviceToHost, s3_)); + cudaCheckError(cudaMemcpyAsync(C_col_, C_col_dev_, sizeof(int) * + C_nnz_, cudaMemcpyDeviceToHost, s3_)); + cudaCheckError(cudaMemcpyAsync(C_row_, C_row_dev_, sizeof(int) * + (n_ + 1), cudaMemcpyDeviceToHost, s3_)); + cudaCheckError(cudaDeviceSynchronize()); + + cudaCheckError(cudaFree(C_val_dev_)); + cudaCheckError(cudaFree(C_col_dev_)); + free(C_val_); + free(C_col_); break; } case gpuOffloadType::unified: { // Ensure all data resides on host once work has completed - cudaCheckError(cudaMemPrefetchAsync(A_val_, sizeof(T) * (*A_nnz_), + cudaCheckError(cudaMemPrefetchAsync(A_val_, sizeof(T) * A_nnz_, cudaCpuDeviceId, s1_)); - cudaCheckError(cudaMemPrefetchAsync(A_col_, sizeof(int) * (*A_nnz_), + cudaCheckError(cudaMemPrefetchAsync(A_col_, sizeof(int) * A_nnz_, cudaCpuDeviceId, s1_)); cudaCheckError(cudaMemPrefetchAsync(A_row_, sizeof(int) * (n_ + 1), cudaCpuDeviceId, s1_)); - cudaCheckError(cudaMemPrefetchAsync(B_val_, sizeof(T) * (*B_nnz_), + cudaCheckError(cudaMemPrefetchAsync(B_val_, sizeof(T) * B_nnz_, cudaCpuDeviceId, s2_)); - cudaCheckError(cudaMemPrefetchAsync(B_col_, sizeof(int) * (*B_nnz_), + cudaCheckError(cudaMemPrefetchAsync(B_col_, sizeof(int) * B_nnz_, cudaCpuDeviceId, s2_)); cudaCheckError(cudaMemPrefetchAsync(B_row_, sizeof(int) * (n_ + 1), cudaCpuDeviceId, s2_)); - cudaCheckError(cudaMemPrefetchAsync(C_val_, sizeof(T) * (*C_nnz_), - cudaCpuDeviceId, s3_)); - cudaCheckError(cudaMemPrefetchAsync(C_col_, sizeof(int) * (*C_nnz_), - cudaCpuDeviceId, s3_)); cudaCheckError(cudaMemPrefetchAsync(C_row_, sizeof(int) * (n_ + 1), cudaCpuDeviceId, s3_)); // Ensure device has finished all work. @@ -285,26 +512,39 @@ class sp_gemm_gpu : public sp_gemm { /** Do any necessary cleanup (free pointers, close library handles, etc.) * after Kernel has been called. 
*/ void postCallKernelCleanup() override { - if (offload_ == gpuOffloadType::unified) { - // Destroy the handle - cusparseDestroy(handle_); + std::cout << "\t\tPostCall" << std::endl << std::endl; + // Destroy the handle + cusparseCheckError(cusparseDestroy(handle_)); + + // Destroy streams after use + cudaCheckError(cudaStreamDestroy(s1_)); + cudaCheckError(cudaStreamDestroy(s2_)); + cudaCheckError(cudaStreamDestroy(s3_)); - // Destroy streams after use - cudaCheckError(cudaStreamDestroy(s1_)); - cudaCheckError(cudaStreamDestroy(s2_)); - cudaCheckError(cudaStreamDestroy(s3_)); - } if (offload_ == gpuOffloadType::unified) { - cudaFree(A_val_); - cudaFree(A_col_); - cudaFree(A_row_); - cudaFree(B_val_); - cudaFree(B_col_); - cudaFree(B_row_); - cudaFree(C_val_); - cudaFree(C_col_); - cudaFree(C_row_); + cudaCheckError(cudaFree(A_val_)); + cudaCheckError(cudaFree(A_col_)); + cudaCheckError(cudaFree(A_row_)); + cudaCheckError(cudaFree(B_val_)); + cudaCheckError(cudaFree(B_col_)); + cudaCheckError(cudaFree(B_row_)); + cudaCheckError(cudaFree(C_row_)); + } else { + free(A_val_); + free(A_col_); + free(A_row_); + free(B_val_); + free(B_col_); + free(B_row_); + free(C_row_); + cudaCheckError(cudaFree(A_val_dev_)); + cudaCheckError(cudaFree(A_col_dev_)); + cudaCheckError(cudaFree(A_row_dev_)); + cudaCheckError(cudaFree(B_val_dev_)); + cudaCheckError(cudaFree(B_col_dev_)); + cudaCheckError(cudaFree(B_row_dev_)); + cudaCheckError(cudaFree(C_row_dev_)); } } @@ -356,13 +596,10 @@ class sp_gemm_gpu : public sp_gemm { void toCSR(T* dense, int n_col, int n_row, int nnz, T* vals, int* col_index, int* row_ptr) { int nnz_encountered = 0; - int prev_row_ptr = 0; for (int row = 0; row < n_row; row++) { - if (nnz_encountered >= nnz) break; - row_ptr[row] = prev_row_ptr; + row_ptr[row] = nnz_encountered; int nnz_row = 0; for (int col = 0; col < n_col; col++) { - if (nnz_encountered >= nnz) break; if (dense[(row * n_col) + col] != 0.0) { nnz_row++; col_index[nnz_encountered] = col; @@ -370,10 +607,41 @@ class sp_gemm_gpu : public sp_gemm { nnz_encountered++; } } - prev_row_ptr += nnz_row; } + row_ptr[n_row] = nnz_encountered; } + + // ToDo -- the two following functons are useful for debugging. I'm + // keeping them in to that end, though they are not used by the benchmark + // itself + void printDenseMatrix(T* M, int rows, int cols) { + for (int row = 0; row < rows; row++) { + std::cout << "| "; + for (int col = 0; col < cols; col++) { + std::cout << M[(row * cols) + col] << " | "; + } + std::cout << std::endl; + } + } + + void printCSR(T* values, int* col_indices, int* row_pointers, int nnz, + int rows, int cols) { + std::cout << "\tRow pointers__" << std::endl; + for (int p = 0; p < (rows + 1); p++) { + std::cout << row_pointers[p] << ", "; + } + std::cout << std::endl << "\tColumn Indices__" << std::endl; + for (int i = 0; i < nnz; i++) { + std::cout << col_indices[i] << ", "; + } + std::cout << std::endl << "\tValues__" << std::endl; + for (int v = 0; v < nnz; v++) { + std::cout << values[v] << ", "; + } + std::cout << std::endl; + } + /** Handle used when calling cuBLAS. 
*/ cusparseHandle_t handle_; @@ -396,29 +664,34 @@ class sp_gemm_gpu : public sp_gemm { T* A_val_; int* A_col_; int* A_row_; - int* A_num_rows_; - int* A_num_cols_; - int* A_nnz_; + int64_t A_num_rows_; + int64_t A_num_cols_; + int64_t A_nnz_; T* B_val_; int* B_col_; int* B_row_; - int* B_num_rows_; - int* B_num_cols_; - int* B_nnz_; + int64_t B_num_rows_; + int64_t B_num_cols_; + int64_t B_nnz_; T* C_val_; int* C_col_; int* C_row_; - int* C_num_rows_; - int* C_num_cols_; - int*C_nnz_; + int64_t C_num_rows_; + int64_t C_num_cols_; + int64_t C_nnz_; /** CSR format vectors for matrices A, B and C on the device. */ - int* A_num_rows_dev_, A_num_cols_dev_, A_nnz_dev_, B_num_rows_dev_, - B_num_cols_dev_, B_nnz_dev_, C_num_rows_dev_, C_num_cols_dev_, C_nnz_dev_; - T* A_val_dev_, B_val_dev_, C_val_dev_; - int* A_col_dev_, A_row_dev_, B_col_dev_, B_row_dev_, C_col_dev_, C_row_dev_; + T* A_val_dev_; + T* B_val_dev_; + T* C_val_dev_; + int* A_col_dev_; + int* A_row_dev_; + int* B_col_dev_; + int* B_row_dev_; + int* C_col_dev_; + int* C_row_dev_; /** The constant value Alpha. */ const T alpha = ALPHA; @@ -439,6 +712,13 @@ class sp_gemm_gpu : public sp_gemm { size_t buffer_size2_ = 0; void* buffer1_ = NULL; void* buffer2_ = NULL; + + cusparseOperation_t opA_ = CUSPARSE_OPERATION_NON_TRANSPOSE; + cusparseOperation_t opB_ = CUSPARSE_OPERATION_NON_TRANSPOSE; + cusparseSpGEMMAlg_t alg_ = CUSPARSE_SPGEMM_DEFAULT; + cusparseIndexType_t rType_ = CUSPARSE_INDEX_32I; + cusparseIndexType_t cType_ = CUSPARSE_INDEX_32I; + cusparseIndexBase_t indType_ = CUSPARSE_INDEX_BASE_ZERO; }; } // namespace gpu #endif \ No newline at end of file diff --git a/include/doGemm.hh b/include/doGemm.hh index 0e4dcc0..9a66329 100644 --- a/include/doGemm.hh +++ b/include/doGemm.hh @@ -517,20 +517,20 @@ class doGemm { cpuResult.gflops = calcGflops(flops, iterations_, cpuResult.runtime); // // Perform the GPU kernels + // - ALWAYS: Offload to/from GPU every iteration + spGemmGpu_.initialise(gpuOffloadType::always, N, sparsity); + time_checksum_gflop gpuResult_always = spGemmGpu_.compute(); + gpuResult_always.gflops = + calcGflops(flops, iterations_, gpuResult_always.runtime); // // - ONCE : Offload to/from GPU once before all iterations and once // // after -// spGemmGpu_.initialise(gpuOffloadType::once, N, sparsity); -// time_checksum_gflop gpuResult_once = spGemmGpu_.compute(); -// gpuResult_once.gflops = -// calcGflops(flops, iterations_, gpuResult_once.runtime); -// -// // - ALWAYS: Offload to/from GPU every iteration -// spGemmGpu_.initialise(gpuOffloadType::always, N, sparsity); -// time_checksum_gflop gpuResult_always = spGemmGpu_.compute(); -// gpuResult_always.gflops = -// calcGflops(flops, iterations_, gpuResult_always.runtime); -// // - UNIFIED : data passed from host to device (and device to host) as -// // needed + spGemmGpu_.initialise(gpuOffloadType::once, N, sparsity); + time_checksum_gflop gpuResult_once = spGemmGpu_.compute(); + gpuResult_once.gflops = + calcGflops(flops, iterations_, gpuResult_once.runtime); + + // - UNIFIED : data passed from host to device (and device to host) as + // needed spGemmGpu_.initialise(gpuOffloadType::unified, N, sparsity); time_checksum_gflop gpuResult_unified = spGemmGpu_.compute(); gpuResult_unified.gflops = @@ -541,11 +541,11 @@ class doGemm { // Write lines to CSV file writeLineToCsv(csvFile, "cpu", kernelName, N, N, N, probSize, iterations_, cpuResult.runtime, cpuResult.gflops); -// writeLineToCsv(csvFile, "gpu_offloadOnce", kernelName, N, N, N, probSize, -// iterations_, 
gpuResult_once.runtime, gpuResult_once.gflops); -// writeLineToCsv(csvFile, "gpu_offloadAlways", kernelName, N, N, N, probSize, -// iterations_, gpuResult_always.runtime, -// gpuResult_always.gflops); + writeLineToCsv(csvFile, "gpu_offloadOnce", kernelName, N, N, N, probSize, + iterations_, gpuResult_once.runtime, gpuResult_once.gflops); + writeLineToCsv(csvFile, "gpu_offloadAlways", kernelName, N, N, N, probSize, + iterations_, gpuResult_always.runtime, + gpuResult_always.gflops); writeLineToCsv(csvFile, "gpu_unified", kernelName, N, N, N, probSize, iterations_, gpuResult_unified.runtime, gpuResult_unified.gflops); From 2ffee16635466c3315f7c1cf075846c190041581 Mon Sep 17 00:00:00 2001 From: Alex Cockrean <84676155+ABenC377@users.noreply.github.com> Date: Tue, 26 Mar 2024 12:55:10 +0000 Subject: [PATCH 10/32] All implemented and running. No checksum at the end --- cuBLAS/sp_gemm.hh | 36 ++++++++++++++---------------------- 1 file changed, 14 insertions(+), 22 deletions(-) diff --git a/cuBLAS/sp_gemm.hh b/cuBLAS/sp_gemm.hh index 0879966..fbd08fd 100644 --- a/cuBLAS/sp_gemm.hh +++ b/cuBLAS/sp_gemm.hh @@ -325,10 +325,12 @@ class sp_gemm_gpu : public sp_gemm { cudaCheckError(cudaDeviceSynchronize()); // Freeing memory - cudaCheckError(cudaFree(buffer1_)); - cudaCheckError(cudaFree(buffer2_)); cudaCheckError(cudaFree(C_val_dev_)); cudaCheckError(cudaFree(C_col_dev_)); + cudaCheckError(cudaFree(buffer1_)); + cudaCheckError(cudaFree(buffer2_)); + buffer_size1_ = 0; + buffer_size2_ = 0; free(C_val_); free(C_col_); break; @@ -380,8 +382,12 @@ class sp_gemm_gpu : public sp_gemm { cudaDataType_, alg_, spgemmDesc_)); // Freeing memory + cudaCheckError(cudaFree(C_val_dev_)); + cudaCheckError(cudaFree(C_col_dev_)); cudaCheckError(cudaFree(buffer1_)); cudaCheckError(cudaFree(buffer2_)); + buffer_size1_ = 0; + buffer_size2_ = 0; break; } case gpuOffloadType::unified: { @@ -414,6 +420,8 @@ class sp_gemm_gpu : public sp_gemm { cusparseSpMatGetSize(descrC_, &C_num_rows_, &C_num_cols_, &C_nnz_)); + if (C_val_ != NULL) cudaCheckError(cudaFree(C_val_)); + if (C_val_ != NULL) cudaCheckError(cudaFree(C_col_)); cudaCheckError(cudaMallocManaged(&C_val_, sizeof(T) * C_nnz_)); cudaCheckError(cudaMallocManaged(&C_col_, sizeof(int) * C_nnz_)); @@ -425,16 +433,11 @@ class sp_gemm_gpu : public sp_gemm { alg_, spgemmDesc_)); - cudaCheckError(cudaMemPrefetchAsync(C_val_, sizeof(T) * C_nnz_, - cudaCpuDeviceId, s3_)); - cudaCheckError(cudaMemPrefetchAsync(C_col_, sizeof(int) * C_nnz_, - cudaCpuDeviceId, s3_)); - // Freeing memory cudaCheckError(cudaFree(buffer1_)); cudaCheckError(cudaFree(buffer2_)); - cudaCheckError(cudaFree(C_val_)); - cudaCheckError(cudaFree(C_col_)); + buffer_size1_ = 0; + buffer_size2_ = 0; break; } } @@ -468,20 +471,9 @@ class sp_gemm_gpu : public sp_gemm { cudaCheckError(cudaMemcpyAsync(B_row_, B_row_dev_, sizeof(int) * (n_ + 1), cudaMemcpyDeviceToHost, s2_)); - C_val_ = (T*)malloc(sizeof(T) * C_nnz_); - C_col_ = (int*)malloc(sizeof(int) * C_nnz_); - cudaCheckError(cudaMemcpyAsync(C_val_, C_val_dev_, sizeof(T) * - C_nnz_, cudaMemcpyDeviceToHost, s3_)); - cudaCheckError(cudaMemcpyAsync(C_col_, C_col_dev_, sizeof(int) * - C_nnz_, cudaMemcpyDeviceToHost, s3_)); cudaCheckError(cudaMemcpyAsync(C_row_, C_row_dev_, sizeof(int) * (n_ + 1), cudaMemcpyDeviceToHost, s3_)); cudaCheckError(cudaDeviceSynchronize()); - - cudaCheckError(cudaFree(C_val_dev_)); - cudaCheckError(cudaFree(C_col_dev_)); - free(C_val_); - free(C_col_); break; } case gpuOffloadType::unified: { @@ -675,8 +667,8 @@ class sp_gemm_gpu : public 
sp_gemm { int64_t B_num_cols_; int64_t B_nnz_; - T* C_val_; - int* C_col_; + T* C_val_ = NULL; + int* C_col_ = NULL; int* C_row_; int64_t C_num_rows_; int64_t C_num_cols_; From 064ec5756f4b524d45e8bc2f94dbdf82412375d5 Mon Sep 17 00:00:00 2001 From: Alex Cockrean <84676155+ABenC377@users.noreply.github.com> Date: Tue, 26 Mar 2024 12:57:45 +0000 Subject: [PATCH 11/32] Removing print statements --- cuBLAS/sp_gemm.hh | 7 ------- 1 file changed, 7 deletions(-) diff --git a/cuBLAS/sp_gemm.hh b/cuBLAS/sp_gemm.hh index fbd08fd..01c6edb 100644 --- a/cuBLAS/sp_gemm.hh +++ b/cuBLAS/sp_gemm.hh @@ -34,11 +34,8 @@ class sp_gemm_gpu : public sp_gemm { * - Unified: Initialise data as unified memory; no data movement semantics * required */ void initialise(gpuOffloadType offload, int n, float sparsity) override { - std::cout << "_/_/_/_/ Initialising for problem size: " << n << std::endl; - offload_ = offload; - if (std::is_same_v) cudaDataType_ = CUDA_R_32F; else if (std::is_same_v) cudaDataType_ = CUDA_R_64F; else { @@ -151,7 +148,6 @@ class sp_gemm_gpu : public sp_gemm { /** Perform any required steps before calling the GEMM kernel that should * be timed. */ void preLoopRequirements() override { - std::cout << "\t\tPreLoop" << std::endl; cusparseCheckError(cusparseSpGEMM_createDescr(&spgemmDesc_)); switch(offload_) { case gpuOffloadType::always: { @@ -233,7 +229,6 @@ class sp_gemm_gpu : public sp_gemm { /** Make a call to the BLAS Library Kernel. */ void callGemm() override { - std::cout << "\t\tcallGemm" << std::endl; switch(offload_) { case gpuOffloadType::always: { cudaCheckError(cudaMemcpyAsync(A_val_dev_, A_val_, sizeof(T) * @@ -446,7 +441,6 @@ class sp_gemm_gpu : public sp_gemm { /** Perform any required steps after calling the GEMM kernel that should * be timed. */ void postLoopRequirements() override { - std::cout << "\t\tPostLoop" << std::endl; cusparseCheckError(cusparseSpGEMM_destroyDescr(spgemmDesc_)); // Destroying descriptors cusparseCheckError(cusparseDestroySpMat(descrA_)); @@ -504,7 +498,6 @@ class sp_gemm_gpu : public sp_gemm { /** Do any necessary cleanup (free pointers, close library handles, etc.) * after Kernel has been called. */ void postCallKernelCleanup() override { - std::cout << "\t\tPostCall" << std::endl << std::endl; // Destroy the handle cusparseCheckError(cusparseDestroy(handle_)); From 88a053f2ea565e1753d671c4ddcee9ba45a80c3b Mon Sep 17 00:00:00 2001 From: Alex Cockrean <84676155+ABenC377@users.noreply.github.com> Date: Fri, 29 Mar 2024 12:35:53 +0000 Subject: [PATCH 12/32] Removing print statements --- cuBLAS/sp_gemm.hh | 116 +++++++++++++++++++++++++++++----------------- include/doGemm.hh | 20 ++++---- 2 files changed, 84 insertions(+), 52 deletions(-) diff --git a/cuBLAS/sp_gemm.hh b/cuBLAS/sp_gemm.hh index 01c6edb..db9cf29 100644 --- a/cuBLAS/sp_gemm.hh +++ b/cuBLAS/sp_gemm.hh @@ -24,7 +24,7 @@ class sp_gemm_gpu : public sp_gemm { using sp_gemm::C_; using sp_gemm::offload_; - // ToDo -- just unified implemented so far. Fill in Always and Once later + // ToDo -- No checksum for sparse yet. Nedd to do /** Initialise the required data structures. 
* `offload` refers to the data offload type: @@ -42,7 +42,7 @@ class sp_gemm_gpu : public sp_gemm { std::cout << "INVALID DATA TYPE PASSED TO cuSPARSE" << std::endl; exit(1); } - n_ = n; + n_ = n * 20; // Get device identifier cudaCheckError(cudaGetDevice(&gpuDevice_)); @@ -93,6 +93,10 @@ class sp_gemm_gpu : public sp_gemm { cudaCheckError(cudaMalloc((void**)&C_row_dev_, sizeof(int) * (n_ + 1))); } + C_mem_allocated_always_ = false; + C_mem_allocated_once_ = false; + C_mem_allocated_unified_ = false; + // Initialise the host matricies // cusparseSpGEMM() works on CSR format only. This helpfully makes our // sparse matrix format decision for us! @@ -148,21 +152,9 @@ class sp_gemm_gpu : public sp_gemm { /** Perform any required steps before calling the GEMM kernel that should * be timed. */ void preLoopRequirements() override { - cusparseCheckError(cusparseSpGEMM_createDescr(&spgemmDesc_)); + switch(offload_) { case gpuOffloadType::always: { - // Make matrix descriptors - cusparseCheckError( - cusparseCreateCsr(&descrA_, n_, n_, A_nnz_, A_row_dev_, - A_col_dev_, A_val_dev_, rType_, cType_, - indType_, cudaDataType_)); - cusparseCheckError( - cusparseCreateCsr(&descrB_, n_, n_, B_nnz_, B_row_dev_, - B_col_dev_, B_val_dev_, rType_, cType_, - indType_, cudaDataType_)); - cusparseCheckError( - cusparseCreateCsr(&descrC_, n_, n_, 0, C_row_dev_, NULL, NULL, - rType_, cType_, indType_, cudaDataType_)); break; } case gpuOffloadType::once: { @@ -174,11 +166,14 @@ class sp_gemm_gpu : public sp_gemm { + 1), cudaMemcpyHostToDevice, s1_)); cudaCheckError(cudaMemcpyAsync(B_val_dev_, B_val_, sizeof(T) * - B_nnz_, cudaMemcpyHostToDevice, s1_)); + B_nnz_, cudaMemcpyHostToDevice, s2_)); cudaCheckError(cudaMemcpyAsync(B_col_dev_, B_col_, sizeof(int) * - B_nnz_, cudaMemcpyHostToDevice, s1_)); + B_nnz_, cudaMemcpyHostToDevice, s2_)); cudaCheckError(cudaMemcpyAsync(B_row_dev_, B_row_, sizeof(int) * (n_ - + 1), cudaMemcpyHostToDevice, s1_)); + + 1), cudaMemcpyHostToDevice, s2_)); + + cudaCheckError(cudaMemcpyAsync(C_row_dev_, C_row_, sizeof(int) * (n_ + + 1), cudaMemcpyHostToDevice, s3_)); // Craete matrix descriptors cusparseCheckError( @@ -225,6 +220,7 @@ class sp_gemm_gpu : public sp_gemm { break; } } + cusparseCheckError(cusparseSpGEMM_createDescr(&spgemmDesc_)); } /** Make a call to the BLAS Library Kernel. 
*/ @@ -239,16 +235,27 @@ class sp_gemm_gpu : public sp_gemm { + 1), cudaMemcpyHostToDevice, s1_)); cudaCheckError(cudaMemcpyAsync(B_val_dev_, B_val_, sizeof(T) * - B_nnz_, cudaMemcpyHostToDevice, s1_)); + B_nnz_, cudaMemcpyHostToDevice, s2_)); cudaCheckError(cudaMemcpyAsync(B_col_dev_, B_col_, sizeof(int) * - B_nnz_, cudaMemcpyHostToDevice, s1_)); + B_nnz_, cudaMemcpyHostToDevice, s2_)); cudaCheckError(cudaMemcpyAsync(B_row_dev_, B_row_, sizeof(int) * (n_ - + 1), cudaMemcpyHostToDevice, s1_)); + + 1), cudaMemcpyHostToDevice, s2_)); + + cudaCheckError(cudaMemcpyAsync(C_row_dev_, C_row_, sizeof(int) * (n_ + + 1), cudaMemcpyHostToDevice, s3_)); + // Make matrix descriptors cusparseCheckError( - cusparseSpGEMM_copy(handle_, opA_, opB_, &alpha, descrA_, - descrB_, &beta, descrC_, cudaDataType_, - alg_, spgemmDesc_)); + cusparseCreateCsr(&descrA_, n_, n_, A_nnz_, A_row_dev_, + A_col_dev_, A_val_dev_, rType_, cType_, + indType_, cudaDataType_)); + cusparseCheckError( + cusparseCreateCsr(&descrB_, n_, n_, B_nnz_, B_row_dev_, + B_col_dev_, B_val_dev_, rType_, cType_, + indType_, cudaDataType_)); + cusparseCheckError( + cusparseCreateCsr(&descrC_, n_, n_, 0, C_row_dev_, NULL, NULL, + rType_, cType_, indType_, cudaDataType_)); cusparseCheckError( cusparseSpGEMM_workEstimation(handle_, opA_, opB_, &alpha, @@ -280,10 +287,10 @@ class sp_gemm_gpu : public sp_gemm { cusparseSpMatGetSize(descrC_, &C_num_rows_, &C_num_cols_, &C_nnz_)); - cusparseCheckError( - cusparseSpMatGetSize(descrC_, &C_num_rows_, &C_num_cols_, - &C_nnz_)); - + if (C_mem_allocated_always_) { + cudaCheckError(cudaFree(C_val_dev_)); + cudaCheckError(cudaFree(C_col_dev_)); + } cudaCheckError(cudaMalloc(&C_val_dev_, sizeof(T) * C_nnz_)); cudaCheckError(cudaMalloc(&C_col_dev_, sizeof(int) * C_nnz_)); @@ -309,8 +316,14 @@ class sp_gemm_gpu : public sp_gemm { cudaCheckError(cudaMemcpyAsync(B_row_, B_row_dev_, sizeof(int) * (n_ + 1), cudaMemcpyDeviceToHost, s2_)); + if (C_mem_allocated_always_) { + free(C_val_); + free(C_col_); + } C_val_ = (T*)malloc(sizeof(T) * C_nnz_); C_col_ = (int*)malloc(sizeof(int) * C_nnz_); + C_mem_allocated_always_ = true; + cudaCheckError(cudaMemcpyAsync(C_val_, C_val_dev_, sizeof(T) * C_nnz_, cudaMemcpyDeviceToHost, s3_)); cudaCheckError(cudaMemcpyAsync(C_col_, C_col_dev_, sizeof(int) * @@ -320,22 +333,13 @@ class sp_gemm_gpu : public sp_gemm { cudaCheckError(cudaDeviceSynchronize()); // Freeing memory - cudaCheckError(cudaFree(C_val_dev_)); - cudaCheckError(cudaFree(C_col_dev_)); cudaCheckError(cudaFree(buffer1_)); cudaCheckError(cudaFree(buffer2_)); buffer_size1_ = 0; buffer_size2_ = 0; - free(C_val_); - free(C_col_); break; } case gpuOffloadType::once: { - cusparseCheckError( - cusparseSpGEMM_copy(handle_, opA_, opB_, &alpha, descrA_, - descrB_, &beta, descrC_, cudaDataType_, - alg_, spgemmDesc_)); - cusparseCheckError( cusparseSpGEMM_workEstimation(handle_, opA_, opB_, &alpha, descrA_, descrB_, &beta, @@ -365,8 +369,13 @@ class sp_gemm_gpu : public sp_gemm { cusparseSpMatGetSize(descrC_, &C_num_rows_, &C_num_cols_, &C_nnz_)); + if (C_mem_allocated_once_) { + cudaCheckError(cudaFree(C_val_dev_)); + cudaCheckError(cudaFree(C_col_dev_)); + } cudaCheckError(cudaMalloc(&C_val_dev_, sizeof(T) * C_nnz_)); cudaCheckError(cudaMalloc(&C_col_dev_, sizeof(int) * C_nnz_)); + C_mem_allocated_once_ = true; cusparseCheckError( cusparseCsrSetPointers(descrC_, C_row_dev_, C_col_dev_, @@ -377,8 +386,6 @@ class sp_gemm_gpu : public sp_gemm { cudaDataType_, alg_, spgemmDesc_)); // Freeing memory - 
cudaCheckError(cudaFree(C_val_dev_)); - cudaCheckError(cudaFree(C_col_dev_)); cudaCheckError(cudaFree(buffer1_)); cudaCheckError(cudaFree(buffer2_)); buffer_size1_ = 0; @@ -415,10 +422,14 @@ class sp_gemm_gpu : public sp_gemm { cusparseSpMatGetSize(descrC_, &C_num_rows_, &C_num_cols_, &C_nnz_)); - if (C_val_ != NULL) cudaCheckError(cudaFree(C_val_)); - if (C_val_ != NULL) cudaCheckError(cudaFree(C_col_)); + if (C_mem_allocated_unified_) { + cudaCheckError(cudaFree(C_val_)); + cudaCheckError(cudaFree(C_col_)); + } + cudaCheckError(cudaMallocManaged(&C_val_, sizeof(T) * C_nnz_)); cudaCheckError(cudaMallocManaged(&C_col_, sizeof(int) * C_nnz_)); + C_mem_allocated_unified_ = true; cusparseCheckError( cusparseCsrSetPointers(descrC_, C_row_, C_col_, C_val_)); @@ -445,7 +456,6 @@ class sp_gemm_gpu : public sp_gemm { // Destroying descriptors cusparseCheckError(cusparseDestroySpMat(descrA_)); cusparseCheckError(cusparseDestroySpMat(descrB_)); - cusparseCheckError(cusparseDestroySpMat(descrC_)); switch(offload_) { case gpuOffloadType::always: { break; @@ -465,12 +475,19 @@ class sp_gemm_gpu : public sp_gemm { cudaCheckError(cudaMemcpyAsync(B_row_, B_row_dev_, sizeof(int) * (n_ + 1), cudaMemcpyDeviceToHost, s2_)); + C_val_ = (T*)malloc(sizeof(T) * C_nnz_); + C_col_ = (int*)malloc(sizeof(int) * C_nnz_); + cudaCheckError(cudaMemcpyAsync(C_val_, C_val_dev_, sizeof(T) * + C_nnz_, cudaMemcpyDeviceToHost, s3_)); + cudaCheckError(cudaMemcpyAsync(C_col_, C_col_dev_, sizeof(int) * + C_nnz_, cudaMemcpyDeviceToHost, s3_)); cudaCheckError(cudaMemcpyAsync(C_row_, C_row_dev_, sizeof(int) * (n_ + 1), cudaMemcpyDeviceToHost, s3_)); cudaCheckError(cudaDeviceSynchronize()); break; } case gpuOffloadType::unified: { + cusparseCheckError(cusparseDestroySpMat(descrC_)); // Ensure all data resides on host once work has completed cudaCheckError(cudaMemPrefetchAsync(A_val_, sizeof(T) * A_nnz_, cudaCpuDeviceId, s1_)); @@ -486,6 +503,10 @@ class sp_gemm_gpu : public sp_gemm { cudaCheckError(cudaMemPrefetchAsync(B_row_, sizeof(int) * (n_ + 1), cudaCpuDeviceId, s2_)); +// cudaCheckError(cudaMemPrefetchAsync(C_val_, sizeof(T) * C_nnz_, +// cudaCpuDeviceId, s3_)); +// cudaCheckError(cudaMemPrefetchAsync(C_col_, sizeof(int) * C_nnz_, +// cudaCpuDeviceId, s3_)); cudaCheckError(cudaMemPrefetchAsync(C_row_, sizeof(int) * (n_ + 1), cudaCpuDeviceId, s3_)); // Ensure device has finished all work. 
@@ -506,7 +527,6 @@ class sp_gemm_gpu : public sp_gemm { cudaCheckError(cudaStreamDestroy(s2_)); cudaCheckError(cudaStreamDestroy(s3_)); - if (offload_ == gpuOffloadType::unified) { cudaCheckError(cudaFree(A_val_)); cudaCheckError(cudaFree(A_col_)); @@ -514,6 +534,8 @@ class sp_gemm_gpu : public sp_gemm { cudaCheckError(cudaFree(B_val_)); cudaCheckError(cudaFree(B_col_)); cudaCheckError(cudaFree(B_row_)); + cudaCheckError(cudaFree(C_val_)); + cudaCheckError(cudaFree(C_col_)); cudaCheckError(cudaFree(C_row_)); } else { free(A_val_); @@ -522,6 +544,8 @@ class sp_gemm_gpu : public sp_gemm { free(B_val_); free(B_col_); free(B_row_); + free(C_val_); + free(C_col_); free(C_row_); cudaCheckError(cudaFree(A_val_dev_)); cudaCheckError(cudaFree(A_col_dev_)); @@ -529,6 +553,8 @@ class sp_gemm_gpu : public sp_gemm { cudaCheckError(cudaFree(B_val_dev_)); cudaCheckError(cudaFree(B_col_dev_)); cudaCheckError(cudaFree(B_row_dev_)); + cudaCheckError(cudaFree(C_val_dev_)); + cudaCheckError(cudaFree(C_col_dev_)); cudaCheckError(cudaFree(C_row_dev_)); } } @@ -678,6 +704,10 @@ class sp_gemm_gpu : public sp_gemm { int* C_col_dev_; int* C_row_dev_; + bool C_mem_allocated_always_; + bool C_mem_allocated_once_; + bool C_mem_allocated_unified_; + /** The constant value Alpha. */ const T alpha = ALPHA; diff --git a/include/doGemm.hh b/include/doGemm.hh index 9a66329..8743314 100644 --- a/include/doGemm.hh +++ b/include/doGemm.hh @@ -516,25 +516,27 @@ class doGemm { time_checksum_gflop cpuResult = spGemmCpu_.compute(); cpuResult.gflops = calcGflops(flops, iterations_, cpuResult.runtime); -// // Perform the GPU kernels + // Perform the GPU kernels + + // - UNIFIED : data passed from host to device (and device to host) as + // needed + spGemmGpu_.initialise(gpuOffloadType::unified, N, sparsity); + time_checksum_gflop gpuResult_unified = spGemmGpu_.compute(); + gpuResult_unified.gflops = + calcGflops(flops, iterations_, gpuResult_unified.runtime); + // - ALWAYS: Offload to/from GPU every iteration spGemmGpu_.initialise(gpuOffloadType::always, N, sparsity); time_checksum_gflop gpuResult_always = spGemmGpu_.compute(); gpuResult_always.gflops = calcGflops(flops, iterations_, gpuResult_always.runtime); -// // - ONCE : Offload to/from GPU once before all iterations and once -// // after + // - ONCE : Offload to/from GPU once before all iterations and once + // after spGemmGpu_.initialise(gpuOffloadType::once, N, sparsity); time_checksum_gflop gpuResult_once = spGemmGpu_.compute(); gpuResult_once.gflops = calcGflops(flops, iterations_, gpuResult_once.runtime); - // - UNIFIED : data passed from host to device (and device to host) as - // needed - spGemmGpu_.initialise(gpuOffloadType::unified, N, sparsity); - time_checksum_gflop gpuResult_unified = spGemmGpu_.compute(); - gpuResult_unified.gflops = - calcGflops(flops, iterations_, gpuResult_unified.runtime); // ToDo -- non-default GPU operations From 5b04a2c93e88ff4438770cfb9828ce681e364c92 Mon Sep 17 00:00:00 2001 From: Alex Cockrean <84676155+ABenC377@users.noreply.github.com> Date: Mon, 1 Apr 2024 09:59:01 +0100 Subject: [PATCH 13/32] rebasing --- cuBLAS/sp_gemm.hh | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/cuBLAS/sp_gemm.hh b/cuBLAS/sp_gemm.hh index db9cf29..0848bb6 100644 --- a/cuBLAS/sp_gemm.hh +++ b/cuBLAS/sp_gemm.hh @@ -155,6 +155,18 @@ class sp_gemm_gpu : public sp_gemm { switch(offload_) { case gpuOffloadType::always: { + // Make matrix descriptors + cusparseCheckError( + cusparseCreateCsr(&descrA_, n_, n_, A_nnz_, A_row_dev_, + A_col_dev_, 
A_val_dev_, rType_, cType_, + indType_, cudaDataType_)); + cusparseCheckError( + cusparseCreateCsr(&descrB_, n_, n_, B_nnz_, B_row_dev_, + B_col_dev_, B_val_dev_, rType_, cType_, + indType_, cudaDataType_)); + cusparseCheckError( + cusparseCreateCsr(&descrC_, n_, n_, 0, C_row_dev_, NULL, NULL, + rType_, cType_, indType_, cudaDataType_)); break; } case gpuOffloadType::once: { From 23d318b7e066902bae676bf438f4141746fe79dc Mon Sep 17 00:00:00 2001 From: Alex Cockrean <84676155+ABenC377@users.noreply.github.com> Date: Fri, 11 Oct 2024 15:26:37 +0100 Subject: [PATCH 14/32] rebasing --- include/doGemm.hh | 44 ++++++++++++++---------- include/main.hh | 2 +- oneMKL/CPU/sp_gemm.hh | 79 +++++++++++++++++++++++++++++++++++++++++++ src/main.cc | 3 +- 4 files changed, 108 insertions(+), 20 deletions(-) create mode 100644 oneMKL/CPU/sp_gemm.hh diff --git a/include/doGemm.hh b/include/doGemm.hh index 8743314..8153651 100644 --- a/include/doGemm.hh +++ b/include/doGemm.hh @@ -267,9 +267,7 @@ class doGemm { if (doCPU_ && doGPU_) { // Print offload results to stdout printOffloadThreshold("Square x Short-and-Wide (M=K, N=16K)"); - } #endif - // Square x Short and Wide // Re-initialise offload threshold structures & previous results cpuGpu_always_ = cpuGpu_offloadThreshold(); @@ -295,7 +293,7 @@ class doGemm { } #endif - // Square sparse matrix - sparse matrix multiplication +// Square sparse matrix - sparse matrix multiplication cpuGpu_always_ = cpuGpu_offloadThreshold(); cpuGpu_once_ = cpuGpu_offloadThreshold(); cpuGpu_unified_ = cpuGpu_offloadThreshold(); @@ -309,6 +307,12 @@ class doGemm { } // Close file csvFile.close(); +#if CPU_ENABLED && GPU_ENABLED + if (doCPU_ && dpGPU_) { + // Print offload results to stdout + printOffloadThreshold("Sparse Square"); + } +#endif } private: @@ -512,14 +516,20 @@ class doGemm { const uint64_t flops = calcFlops(N, N, N); std::string kernelName = getKernelName(); - spGemmCpu_.initialise(N, sparsity); - time_checksum_gflop cpuResult = spGemmCpu_.compute(); - cpuResult.gflops = calcGflops(flops, iterations_, cpuResult.runtime); - - // Perform the GPU kernels - +#if CPU_ENABLED + if (doCPU_) { + spGemmCpu_.initialise(N, sparsity); + time_checksum_gflop cpuResult = spGemmCpu_.compute(); + cpuResult.gflops = calcGflops(flops, iterations_, cpuResult.runtime); + writeLineToCsv(csvFile, "cpu", kernelName, N, N, N, probSize, iterations_, + cpuResult.runtime, cpuResult.gflops); + } +#endif +#if GPU_ENABLED + // Perform the GPU kernels // - UNIFIED : data passed from host to device (and device to host) as // needed + if (doGPU_) { spGemmGpu_.initialise(gpuOffloadType::unified, N, sparsity); time_checksum_gflop gpuResult_unified = spGemmGpu_.compute(); gpuResult_unified.gflops = @@ -536,13 +546,9 @@ class doGemm { time_checksum_gflop gpuResult_once = spGemmGpu_.compute(); gpuResult_once.gflops = calcGflops(flops, iterations_, gpuResult_once.runtime); - - // ToDo -- non-default GPU operations // Write lines to CSV file - writeLineToCsv(csvFile, "cpu", kernelName, N, N, N, probSize, iterations_, - cpuResult.runtime, cpuResult.gflops); writeLineToCsv(csvFile, "gpu_offloadOnce", kernelName, N, N, N, probSize, iterations_, gpuResult_once.runtime, gpuResult_once.gflops); writeLineToCsv(csvFile, "gpu_offloadAlways", kernelName, N, N, N, probSize, @@ -551,6 +557,10 @@ class doGemm { writeLineToCsv(csvFile, "gpu_unified", kernelName, N, N, N, probSize, iterations_, gpuResult_unified.runtime, gpuResult_unified.gflops); + + } +#endif + } /** A function for calculating FLOPs performed by a 
GEMM. @@ -589,7 +599,7 @@ class doGemm { } /** Print to stdout the offload thresholds. */ - void printOffloadThreshold(std::string problemName) const { + void printOffloadThreshold(const std::string& problemName) const { std::vector header = { "Device", "M", "N", "K", "Total Prob. Size (KiB)", "GFLOP/s", "CPU GFLOP/s"}; @@ -686,16 +696,14 @@ class doGemm { #if CPU_ENABLED /** The GEMM CPU kernel. */ cpu::gemm_cpu gemmCpu_; + cpu::sp_gemm_cpu spGemmCpu_; #endif - cpu::sp_gemm_cpu spGemmCpu_; - #if GPU_ENABLED /** The GEMM GPU kernel. */ gpu::gemm_gpu gemmGpu_; -#endif - gpu::sp_gemm_gpu spGemmGpu_; +#endif /** The point at which offloading to GPU (offload once) becomes worthwhile. */ cpuGpu_offloadThreshold cpuGpu_once_; diff --git a/include/main.hh b/include/main.hh index cc0bb8f..f12ebcb 100644 --- a/include/main.hh +++ b/include/main.hh @@ -15,4 +15,4 @@ void printBenchmarkConfig(const int iters, const int upperLimit); int parseInt(const char* str); /** A function which parsen the runtime arguments. */ -void getParameters(int argc, char* argv[]); \ No newline at end of file +void getParameters(int argc, char** argv); \ No newline at end of file diff --git a/oneMKL/CPU/sp_gemm.hh b/oneMKL/CPU/sp_gemm.hh new file mode 100644 index 0000000..847006b --- /dev/null +++ b/oneMKL/CPU/sp_gemm.hh @@ -0,0 +1,79 @@ +#pragma once + +#ifdef CPU_ONEMKL +#include + +#include + +#include "../../include/kernels/CPU/sp_gemm.hh" +#include "../../include/utilities.hh" + +namespace cpu { +/** A class for GEMM CPU BLAS kernels. */ +template +class sp_gemm_cpu : public sp_gemm { + public: + using sp_gemm::sp_gemm; + using sp_gemm::initInputMatrices; + using sp_gemm::callConsume; + using sp_gemm::m_; + using sp_gemm::n_; + using sp_gemm::k_; + using sp_gemm::A_; + using sp_gemm::B_; + using sp_gemm::C_; + + /** Initialise the required data structures. */ + void initialise(int m, int n, int k) { + m_ = m; + n_ = n; + k_ = k; + + A_ = (T*)mkl_malloc(sizeof(T) * m_ * k_, 64); + B_ = (T*)mkl_malloc(sizeof(T) * k_ * n_, 64); + C_ = (T*)mkl_malloc(sizeof(T) * m_ * n_, 64); + + // Initialise the matricies + initInputMatrices(); + } + + private: + /** Make call to the GEMM kernel. */ + void callGemm() override { + if constexpr (std::is_same_v) { + cblas_sgemm(CblasColMajor, CblasNoTrans, CblasNoTrans, m_, n_, k_, + (float)ALPHA, A_, std::max(1, m_), B_, std::max(1, k_), + (float)BETA, C_, std::max(1, m_)); + } else if constexpr (std::is_same_v) { + cblas_dgemm(CblasColMajor, CblasNoTrans, CblasNoTrans, m_, n_, k_, + (double)ALPHA, A_, std::max(1, m_), B_, std::max(1, k_), + (double)BETA, C_, std::max(1, m_)); + } else { + // Un-specialised class will not do any work - print error and exit. + std::cout << "ERROR - Datatype for OneMKL CPU GEMM kernel not supported." + << std::endl; + exit(1); + } + // Ensure compiler doesn't optimise away the work being done + callConsume(); + } + + /** Perform any required steps before calling the GEMM kernel that should + * be timed. */ + void preLoopRequirements() override {} + + /** Perform any required steps after calling the GEMM kernel that should + * be timed. */ + void postLoopRequirements() override {} + + /** Do any necessary cleanup (free pointers, close library handles, etc.) + * after Kernel has been called. 
*/ + void postCallKernelCleanup() override { + mkl_free_buffers(); + mkl_free(A_); + mkl_free(B_); + mkl_free(C_); + } +}; +} // namespace cpu +#endif \ No newline at end of file diff --git a/src/main.cc b/src/main.cc index 38e2b5a..a4eb55b 100644 --- a/src/main.cc +++ b/src/main.cc @@ -1,6 +1,7 @@ #include "../include/main.hh" int iters = 10; +int startDim = 1; int upperLimit = 128; bool sgemm = true; bool dgemm = true; @@ -115,7 +116,7 @@ int parseInt(const char* str) { return strlen(next) ? -1 : value; } -void getParameters(int argc, char* argv[]) { +void getParameters(int argc, char** argv) { for (int i = 1; i < argc; i++) { if (!strcmp(argv[i], "--iterations") || !strcmp(argv[i], "-i")) { if (++i >= argc || (iters = parseInt(argv[i])) < 0) { From be9094c3c28399ac44658d92941b4923323850f5 Mon Sep 17 00:00:00 2001 From: Alex Cockrean <84676155+ABenC377@users.noreply.github.com> Date: Fri, 11 Oct 2024 15:32:57 +0100 Subject: [PATCH 15/32] rebasing --- createGflopsGraphs.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/createGflopsGraphs.py b/createGflopsGraphs.py index 0ed7772..d323162 100644 --- a/createGflopsGraphs.py +++ b/createGflopsGraphs.py @@ -199,7 +199,7 @@ plt.margins(x=0.01, y=0.01) leg = plt.legend(loc='upper left', fancybox=True, ncol = 2, fontsize=18) - for obj in leg.legendHandles: + for obj in leg.legend_handles: obj.set_linewidth(3.0) obj.set_markersize(15.0) obj.set_markeredgewidth(3.0) From 7cfa7be9e278995be6d50a1ad00b9146b3996f79 Mon Sep 17 00:00:00 2001 From: Alex Cockrean <84676155+ABenC377@users.noreply.github.com> Date: Wed, 3 Apr 2024 10:22:51 +0100 Subject: [PATCH 16/32] Tidying up spGEMM classes to remove duplicated code --- cuBLAS/sp_gemm.hh | 90 ++------------------------------- include/kernels/CPU/sp_gemm.hh | 72 ++------------------------ include/kernels/gemm.hh | 92 ++++++++++++++++++++++++++++++++++ oneMKL/CPU/sp_gemm.hh | 9 ++-- 4 files changed, 102 insertions(+), 161 deletions(-) diff --git a/cuBLAS/sp_gemm.hh b/cuBLAS/sp_gemm.hh index 0848bb6..992b018 100644 --- a/cuBLAS/sp_gemm.hh +++ b/cuBLAS/sp_gemm.hh @@ -18,6 +18,8 @@ template class sp_gemm_gpu : public sp_gemm { public: using sp_gemm::sp_gemm; + using sp_gemm::initInputMatricesSparse; + using sp_gemm::toCSR; using sp_gemm::n_; using sp_gemm::A_; using sp_gemm::B_; @@ -55,8 +57,7 @@ class sp_gemm_gpu : public sp_gemm { // Work out number of edges needed to achieve target sparsity - int edges = 1 + (int) (n_ * n_ * (1 - sparsity)); - A_nnz_ = B_nnz_ = edges; + A_nnz_ = B_nnz_ = 1 + (int) (n_ * n_ * (1 - sparsity)); if (offload_ == gpuOffloadType::unified) { cudaCheckError(cudaMallocManaged(&A_val_, sizeof(T) * A_nnz_)); @@ -105,28 +106,7 @@ class sp_gemm_gpu : public sp_gemm { // Set initial values to 0 A_ = (T*)malloc(sizeof(T) * n_ * n_); B_ = (T*)malloc(sizeof(T) * n_ * n_); - for (int i = 0; i < (n_ * n_); i++) { - A_[i] = 0.0; - B_[i] = 0.0; - } - - // Random number generator objects for use in descent - std::default_random_engine gen; - gen.seed(std::chrono::system_clock::now() - .time_since_epoch().count()); - std::uniform_real_distribution dist(0.0, 1.0); - - // Using a=0.45 and b=c=0.22 as default probabilities - for (int i = 0; i < A_nnz_; i++) { - while (!rMat(A_, n_, 0, n_ - 1, 0, n_ - 1, - 0.45, 0.22, 0.22, - &gen, dist, false)) {} - } - for (int i = 0; i < B_nnz_; i++) { - while (!rMat(B_, n_, 0, n_ - 1, 0, n_ - 1, - 0.45, 0.22, 0.22, - &gen, dist, false)) {} - } + initInputMatricesSparse(sparsity); toCSR(A_, n_, n_, A_nnz_, A_val_, A_col_, A_row_); @@ -571,68 
+551,6 @@ class sp_gemm_gpu : public sp_gemm { } } - bool rMat(T* M, int n, int x1, int x2, int y1, int y2, - float a, float b, float c, std::default_random_engine* gen, - std::uniform_real_distribution dist, bool bin) { - // If a 1x1 submatrix, then add an edge and return out - if (x1 >= x2 && y1 >= y2) { - if (abs(M[(y1 * n) + x1]) > 0.1) { - return false; - } else { - // Add 1.0 if this is a binary graph, and a random real number otherwise - M[(int) (y1 * n) + x1] = (bin) ? 1.0 : (((rand() % 10000) / - 100.0) - 50.0); - return true; - } - } else { - // Divide up the matrix - int xMidPoint = x1 + floor((x2 - x1) / 2); - int yMidPoint = y1 + floor((y2 - y1) / 2); - - // ToDo -- add some noise to these values between iterations - float newA = a; - float newB = b; - float newC = c; - - // Work out which quarter to recurse into - // There are some ugly ternary operators here to avoid going out of bounds in the edge case - // that we are already at 1 width or 1 height - float randomNum = dist(*gen); - if (randomNum < a) { - return rMat(M, n, x1, xMidPoint, y1, yMidPoint, - newA, newB, newC, gen, dist, bin); - } else if (randomNum < (a + b)) { - return rMat(M, n, ((xMidPoint < x2) ? xMidPoint + 1 : xMidPoint), x2, y1, yMidPoint, - newA, newB, newC, gen, dist, bin); - } else if (randomNum < (a + b + c)) { - return rMat(M, n, x1, xMidPoint, ((yMidPoint < y2) ? yMidPoint + 1 : yMidPoint), y2, - newA, newB, newC, gen, dist, bin); - } else { - return rMat(M, n, ((xMidPoint < x2) ? xMidPoint + 1 : xMidPoint), x2, - ((yMidPoint < y2) ? yMidPoint + 1 : yMidPoint), y2, newA, newB, newC, - gen, dist, bin); - } - } - return true; - } - - void toCSR(T* dense, int n_col, int n_row, int nnz, T* vals, int* col_index, - int* row_ptr) { - int nnz_encountered = 0; - for (int row = 0; row < n_row; row++) { - row_ptr[row] = nnz_encountered; - int nnz_row = 0; - for (int col = 0; col < n_col; col++) { - if (dense[(row * n_col) + col] != 0.0) { - nnz_row++; - col_index[nnz_encountered] = col; - vals[nnz_encountered] = dense[(row * n_col) + col]; - nnz_encountered++; - } - } - } - row_ptr[n_row] = nnz_encountered; - } // ToDo -- the two following functons are useful for debugging. 
I'm diff --git a/include/kernels/CPU/sp_gemm.hh b/include/kernels/CPU/sp_gemm.hh index 3de5ea5..6d9d011 100644 --- a/include/kernels/CPU/sp_gemm.hh +++ b/include/kernels/CPU/sp_gemm.hh @@ -11,6 +11,8 @@ namespace cpu { class sp_gemm : public ::gemm { public: using ::gemm::gemm; + using ::gemm::initInputMatricesSparse; + using ::gemm::toCSR; using ::gemm::m_; using ::gemm::n_; using ::gemm::k_; @@ -27,78 +29,10 @@ namespace cpu { B_ = (T*)malloc(sizeof(T) * n_ * n_); C_ = (T*)malloc(sizeof(T) * n_ * n_); - // Set initial values to 0 - for (int i = 0; i < (n_ * n_); i++) { - A_[i] = 0.0; - B_[i] = 0.0; - } - - // Random number generator objects for use in descent - std::default_random_engine gen; - gen.seed(std::chrono::system_clock::now() - .time_since_epoch().count()); - std::uniform_real_distribution dist(0.0, 1.0); - - // Work out number of edges needed to achieve target sparsity - int edges = 1 + (int) (n * n * (1 - sparsity)); - - // Initialise the matrices - // Using a=0.45 and b=c=0.22 as default probabilities - for (int i = 0; i < edges; i++) { - while (!rMat(A_, n, 0, n - 1, 0, n - 1, - 0.45, 0.22, 0.22, - &gen, dist, false)) {} - while (!rMat(B_, n, 0, n - 1, 0, n - 1, - 0.45, 0.22, 0.22, - &gen, dist, false)) {} - } + initInputMatricesSparse(sparsity); } private: - bool rMat(T* M, int n, int x1, int x2, int y1, int y2, - float a, float b, float c, std::default_random_engine* gen, - std::uniform_real_distribution dist, bool bin) { - // If a 1x1 submatrix, then add an edge and return out - if (x1 >= x2 && y1 >= y2) { - if (abs(M[(y1 * n) + x1]) > 0.1) { - return false; - } else { - // Add 1.0 if this is a binary graph, and a random real number otherwise - M[(int) (y1 * n) + x1] = (bin) ? 1.0 : (((rand() % 10000) / - 100.0) - 50.0); - return true; - } - } else { - // Divide up the matrix - int xMidPoint = x1 + floor((x2 - x1) / 2); - int yMidPoint = y1 + floor((y2 - y1) / 2); - - // ToDo -- add some noise to these values between iterations - float newA = a; - float newB = b; - float newC = c; - - // Work out which quarter to recurse into - // There are some ugly ternary operators here to avoid going out of bounds in the edge case - // that we are already at 1 width or 1 height - float randomNum = dist(*gen); - if (randomNum < a) { - return rMat(M, n, x1, xMidPoint, y1, yMidPoint, - newA, newB, newC, gen, dist, bin); - } else if (randomNum < (a + b)) { - return rMat(M, n, ((xMidPoint < x2) ? xMidPoint + 1 : xMidPoint), x2, y1, yMidPoint, - newA, newB, newC, gen, dist, bin); - } else if (randomNum < (a + b + c)) { - return rMat(M, n, x1, xMidPoint, ((yMidPoint < y2) ? yMidPoint + 1 : yMidPoint), y2, - newA, newB, newC, gen, dist, bin); - } else { - return rMat(M, n, ((xMidPoint < x2) ? xMidPoint + 1 : xMidPoint), x2, - ((yMidPoint < y2) ? yMidPoint + 1 : yMidPoint), y2, newA, newB, newC, - gen, dist, bin); - } - } - return true; - } /** Do any necessary cleanup (free pointers, close library handles, etc.) * after Kernel has been called. 
*/ void postCallKernelCleanup() { diff --git a/include/kernels/gemm.hh b/include/kernels/gemm.hh index 4eda90f..59a9898 100644 --- a/include/kernels/gemm.hh +++ b/include/kernels/gemm.hh @@ -4,6 +4,7 @@ #include #include #include +#include #include "../utilities.hh" @@ -86,9 +87,100 @@ class gemm { } } + void initInputMatricesSparse(float sparsity) { + for (int i = 0; i < (n_ * n_); i++) { + A_[i] = 0.0; + B_[i] = 0.0; + } + + // Random number generator objects for use in descent + std::default_random_engine gen; + gen.seed(std::chrono::system_clock::now() + .time_since_epoch().count()); + std::uniform_real_distribution dist(0.0, 1.0); + + int edges = 1 + (int) (n_ * n_ * (1 - sparsity)); + + // Using a=0.45 and b=c=0.22 as default probabilities + for (int i = 0; i < edges; i++) { + while (!rMat(A_, n_, 0, n_ - 1, 0, n_ - 1, + 0.45, 0.22, 0.22, + &gen, dist, false)) {} + } + for (int i = 0; i < edges; i++) { + while (!rMat(B_, n_, 0, n_ - 1, 0, n_ - 1, + 0.45, 0.22, 0.22, + &gen, dist, false)) {} + } + } + /** Call the extern consume() function. */ void callConsume() { consume((void*)A_, (void*)B_, (void*)C_); } + /** Recursive function to populate sparse matrices */ + bool rMat(T* M, int n, int x1, int x2, int y1, int y2, + float a, float b, float c, std::default_random_engine* gen, + std::uniform_real_distribution dist, bool bin) { + // If a 1x1 submatrix, then add an edge and return out + if (x1 >= x2 && y1 >= y2) { + if (abs(M[(y1 * n) + x1]) > 0.1) { + return false; + } else { + // Add 1.0 if this is a binary graph, and a random real number otherwise + M[(int) (y1 * n) + x1] = (bin) ? 1.0 : (((rand() % 10000) / + 100.0) - 50.0); + return true; + } + } else { + // Divide up the matrix + int xMidPoint = x1 + floor((x2 - x1) / 2); + int yMidPoint = y1 + floor((y2 - y1) / 2); + + // ToDo -- add some noise to these values between iterations + float newA = a; + float newB = b; + float newC = c; + + // Work out which quarter to recurse into + // There are some ugly ternary operators here to avoid going out of bounds in the edge case + // that we are already at 1 width or 1 height + float randomNum = dist(*gen); + if (randomNum < a) { + return rMat(M, n, x1, xMidPoint, y1, yMidPoint, + newA, newB, newC, gen, dist, bin); + } else if (randomNum < (a + b)) { + return rMat(M, n, ((xMidPoint < x2) ? xMidPoint + 1 : xMidPoint), x2, y1, yMidPoint, + newA, newB, newC, gen, dist, bin); + } else if (randomNum < (a + b + c)) { + return rMat(M, n, x1, xMidPoint, ((yMidPoint < y2) ? yMidPoint + 1 : yMidPoint), y2, + newA, newB, newC, gen, dist, bin); + } else { + return rMat(M, n, ((xMidPoint < x2) ? xMidPoint + 1 : xMidPoint), x2, + ((yMidPoint < y2) ? yMidPoint + 1 : yMidPoint), y2, newA, newB, newC, + gen, dist, bin); + } + } + return true; + } + + void toCSR(T* dense, int n_col, int n_row, int nnz, T* vals, int* col_index, + int* row_ptr) { + int nnz_encountered = 0; + for (int row = 0; row < n_row; row++) { + row_ptr[row] = nnz_encountered; + int nnz_row = 0; + for (int col = 0; col < n_col; col++) { + if (dense[(row * n_col) + col] != 0.0) { + nnz_row++; + col_index[nnz_encountered] = col; + vals[nnz_encountered] = dense[(row * n_col) + col]; + nnz_encountered++; + } + } + } + row_ptr[n_row] = nnz_encountered; + } + /** The number of iterations to perform per problem size. 
*/ const int iterations_; diff --git a/oneMKL/CPU/sp_gemm.hh b/oneMKL/CPU/sp_gemm.hh index 847006b..5ac6a70 100644 --- a/oneMKL/CPU/sp_gemm.hh +++ b/oneMKL/CPU/sp_gemm.hh @@ -14,20 +14,17 @@ template class sp_gemm_cpu : public sp_gemm { public: using sp_gemm::sp_gemm; - using sp_gemm::initInputMatrices; + using sp_gemm::initInputMatricesSparse; + using sp_gemm::toCSR; using sp_gemm::callConsume; - using sp_gemm::m_; using sp_gemm::n_; - using sp_gemm::k_; using sp_gemm::A_; using sp_gemm::B_; using sp_gemm::C_; /** Initialise the required data structures. */ - void initialise(int m, int n, int k) { - m_ = m; + void initialise(int n, float sparsity) { n_ = n; - k_ = k; A_ = (T*)mkl_malloc(sizeof(T) * m_ * k_, 64); B_ = (T*)mkl_malloc(sizeof(T) * k_ * n_, 64); From 30d384e22573067f0b32ee7aeb30811a44b39781 Mon Sep 17 00:00:00 2001 From: Alex Cockrean <84676155+ABenC377@users.noreply.github.com> Date: Fri, 11 Oct 2024 15:39:46 +0100 Subject: [PATCH 17/32] rebasing --- cuBLAS/sp_gemm.hh | 17 +++++++-- include/doGemm.hh | 82 +++++++++++++++++++++++------------------ include/kernels/gemm.hh | 49 +++++++++--------------- src/main.cc | 4 +- 4 files changed, 80 insertions(+), 72 deletions(-) diff --git a/cuBLAS/sp_gemm.hh b/cuBLAS/sp_gemm.hh index 992b018..aa095f8 100644 --- a/cuBLAS/sp_gemm.hh +++ b/cuBLAS/sp_gemm.hh @@ -36,6 +36,7 @@ class sp_gemm_gpu : public sp_gemm { * - Unified: Initialise data as unified memory; no data movement semantics * required */ void initialise(gpuOffloadType offload, int n, float sparsity) override { + std::cout << "___________Initialising, problem size = " << n << std::endl; offload_ = offload; if (std::is_same_v) cudaDataType_ = CUDA_R_32F; @@ -46,9 +47,11 @@ class sp_gemm_gpu : public sp_gemm { } n_ = n * 20; + std::cout << "\tGetting device" << std::endl; // Get device identifier cudaCheckError(cudaGetDevice(&gpuDevice_)); + std::cout << "\tMaking streams" << std::endl; // Initialise 3 streams to asynchronously move data between host and device cudaCheckError(cudaStreamCreate(&s1_)); cudaCheckError(cudaStreamCreate(&s2_)); @@ -59,6 +62,7 @@ class sp_gemm_gpu : public sp_gemm { // Work out number of edges needed to achieve target sparsity A_nnz_ = B_nnz_ = 1 + (int) (n_ * n_ * (1 - sparsity)); + std::cout << "\tMallocing" << std::endl; if (offload_ == gpuOffloadType::unified) { cudaCheckError(cudaMallocManaged(&A_val_, sizeof(T) * A_nnz_)); cudaCheckError(cudaMallocManaged(&A_col_, sizeof(int) * A_nnz_)); @@ -106,8 +110,11 @@ class sp_gemm_gpu : public sp_gemm { // Set initial values to 0 A_ = (T*)malloc(sizeof(T) * n_ * n_); B_ = (T*)malloc(sizeof(T) * n_ * n_); + + std::cout << "\tInitialising start matrices" << std::endl; initInputMatricesSparse(sparsity); + std::cout << "\tConverting to CSR" << std::endl; toCSR(A_, n_, n_, A_nnz_, A_val_, A_col_, A_row_); toCSR(B_, n_, n_, B_nnz_, B_val_, B_col_, B_row_); @@ -132,7 +139,7 @@ class sp_gemm_gpu : public sp_gemm { /** Perform any required steps before calling the GEMM kernel that should * be timed. */ void preLoopRequirements() override { - + std::cout << "\t\tpre loop" << std::endl; switch(offload_) { case gpuOffloadType::always: { // Make matrix descriptors @@ -217,6 +224,7 @@ class sp_gemm_gpu : public sp_gemm { /** Make a call to the BLAS Library Kernel. 
*/ void callGemm() override { + std::cout << "\t\tGEMM" << std::endl; switch(offload_) { case gpuOffloadType::always: { cudaCheckError(cudaMemcpyAsync(A_val_dev_, A_val_, sizeof(T) * @@ -444,6 +452,7 @@ class sp_gemm_gpu : public sp_gemm { /** Perform any required steps after calling the GEMM kernel that should * be timed. */ void postLoopRequirements() override { + std::cout << "\t\tpost loop" << std::endl; cusparseCheckError(cusparseSpGEMM_destroyDescr(spgemmDesc_)); // Destroying descriptors cusparseCheckError(cusparseDestroySpMat(descrA_)); @@ -511,6 +520,7 @@ class sp_gemm_gpu : public sp_gemm { /** Do any necessary cleanup (free pointers, close library handles, etc.) * after Kernel has been called. */ void postCallKernelCleanup() override { + std::cout << "\t\tcleaning up" << std::endl; // Destroy the handle cusparseCheckError(cusparseDestroy(handle_)); @@ -519,6 +529,9 @@ class sp_gemm_gpu : public sp_gemm { cudaCheckError(cudaStreamDestroy(s2_)); cudaCheckError(cudaStreamDestroy(s3_)); + free(A_); + free(B_); + if (offload_ == gpuOffloadType::unified) { cudaCheckError(cudaFree(A_val_)); cudaCheckError(cudaFree(A_col_)); @@ -551,8 +564,6 @@ class sp_gemm_gpu : public sp_gemm { } } - - // ToDo -- the two following functons are useful for debugging. I'm // keeping them in to that end, though they are not used by the benchmark // itself diff --git a/include/doGemm.hh b/include/doGemm.hh index 8153651..f4ec053 100644 --- a/include/doGemm.hh +++ b/include/doGemm.hh @@ -34,13 +34,16 @@ class doGemm { public: doGemm(const std::string csvDir, const int iters, const int startDim, const int upperLimit, const bool cpuEnabled = true, - const bool gpuEnabled = true) + const bool gpuEnabled = true, const bool doDense = true, + const bool doSparse = true) : CSV_DIR(csvDir), iterations_(iters), startDimention_(startDim), upperLimit_(upperLimit), doCPU_(cpuEnabled), - doGPU_(gpuEnabled) + doGPU_(gpuEnabled), + doDense_(dense), + doSparse_(sparse), #if CPU_ENABLED , gemmCpu_(iterations_), @@ -59,27 +62,28 @@ class doGemm { /** Run all problem types and write data to CSV files. */ void collectData() { - // Square Problem Sizes... - // Re-initialise offload threshold structures & previous results - cpuGpu_always_ = cpuGpu_offloadThreshold(); - cpuGpu_once_ = cpuGpu_offloadThreshold(); - cpuGpu_unified_ = cpuGpu_offloadThreshold(); - prev_gpuResult_always = time_checksum_gflop(); - prev_gpuResult_once = time_checksum_gflop(); - prev_gpuResult_unified = time_checksum_gflop(); - std::ofstream csvFile = initCSVFile(CSV_DIR + "/" + getKernelName() + - "_square_square_M=N=K.csv"); - for (int dim = startDimention_; dim <= upperLimit_; dim++) { - // M = dim, N = dim, K = dim; - callDenseKernels(csvFile, dim, dim, dim); - } - // Close file - csvFile.close(); + if (doDense_) { + // Square Problem Sizes... 
+ // Re-initialise offload threshold structures + cpuGpu_always_ = cpuGpu_offloadThreshold(); + cpuGpu_once_ = cpuGpu_offloadThreshold(); + cpuGpu_unified_ = cpuGpu_offloadThreshold(); + prev_gpuResult_always = time_checksum_gflop(); + prev_gpuResult_once = time_checksum_gflop(); + prev_gpuResult_unified = time_checksum_gflop(); + std::ofstream csvFile = initCSVFile(CSV_DIR + "/" + getKernelName() + + "_square_square_M=N=K.csv"); + for (int dim = startDimention_; dim <= upperLimit_; dim++) { + // M = dim, N = dim, K = dim; + callDenseKernels(csvFile, dim, dim, dim); + } + // Close file + csvFile.close(); #if CPU_ENABLED && GPU_ENABLED - if (doCPU_ && doGPU_) { - // Print offload results to stdout - printOffloadThreshold("Square x Square (M=N=K)"); - } + if (doCPU_ && doGPU_) { + // Print offload results to stdout + printOffloadThreshold("Square x Square (M=N=K)"); + } #endif // Rectangular Problem Sizes: @@ -267,6 +271,7 @@ class doGemm { if (doCPU_ && doGPU_) { // Print offload results to stdout printOffloadThreshold("Square x Short-and-Wide (M=K, N=16K)"); + } #endif // Square x Short and Wide // Re-initialise offload threshold structures & previous results @@ -292,27 +297,28 @@ class doGemm { printOffloadThreshold("Square x Short-and-Wide (M=K=32, N)"); } #endif + } -// Square sparse matrix - sparse matrix multiplication - cpuGpu_always_ = cpuGpu_offloadThreshold(); - cpuGpu_once_ = cpuGpu_offloadThreshold(); - cpuGpu_unified_ = cpuGpu_offloadThreshold(); - csvFile = initCSVFile(std::string(CSV_DIR) + "/" + getKernelName() + - "_sparse_square.csv"); - if (upperLimit_ >= 32) { - for (int dim = 1; dim <= upperLimit_; dim++) { - const int N = dim; - callSparseKernels(csvFile, N, 0.99); + if (doSparse_) { // Square sparse matrix - sparse matrix multiplication + cpuGpu_always_ = cpuGpu_offloadThreshold(); + cpuGpu_once_ = cpuGpu_offloadThreshold(); + cpuGpu_unified_ = cpuGpu_offloadThreshold(); + csvFile = initCSVFile(std::string(CSV_DIR) + "/" + getKernelName() + + "_sparse_square.csv"); + if (upperLimit_ >= 32) { + for (int dim = startDimention_; dim <= upperLimit_; dim++) { + callSparseKernels(csvFile, dim, 0.99); + } } - } - // Close file - csvFile.close(); + // Close file + csvFile.close(); #if CPU_ENABLED && GPU_ENABLED - if (doCPU_ && dpGPU_) { + if (doCPU_ && doGPU_) { // Print offload results to stdout printOffloadThreshold("Sparse Square"); } #endif + } } private: @@ -693,6 +699,10 @@ class doGemm { /** Whether the GPU kernels should be run. */ const bool doGPU_ = true; + /** Whether we should run dense and or sparse kernels */ + const bool doDense_; + const bool doSparse_; + #if CPU_ENABLED /** The GEMM CPU kernel. 
*/ cpu::gemm_cpu gemmCpu_; diff --git a/include/kernels/gemm.hh b/include/kernels/gemm.hh index 59a9898..3ffc0d7 100644 --- a/include/kernels/gemm.hh +++ b/include/kernels/gemm.hh @@ -103,14 +103,8 @@ class gemm { // Using a=0.45 and b=c=0.22 as default probabilities for (int i = 0; i < edges; i++) { - while (!rMat(A_, n_, 0, n_ - 1, 0, n_ - 1, - 0.45, 0.22, 0.22, - &gen, dist, false)) {} - } - for (int i = 0; i < edges; i++) { - while (!rMat(B_, n_, 0, n_ - 1, 0, n_ - 1, - 0.45, 0.22, 0.22, - &gen, dist, false)) {} + rMat(A_, n_, 0, n_ - 1, 0, n_ - 1, 0.45, 0.22, 0.22, &gen, dist, false); + rMat(B_, n_, 0, n_ - 1, 0, n_ - 1, 0.45, 0.22, 0.22, &gen, dist, false); } } @@ -118,23 +112,18 @@ class gemm { void callConsume() { consume((void*)A_, (void*)B_, (void*)C_); } /** Recursive function to populate sparse matrices */ - bool rMat(T* M, int n, int x1, int x2, int y1, int y2, - float a, float b, float c, std::default_random_engine* gen, + void rMat(T* M, int n, int x1, int x2, int y1, int y2, float a, float b, + float c, std::default_random_engine* gen, std::uniform_real_distribution dist, bool bin) { // If a 1x1 submatrix, then add an edge and return out if (x1 >= x2 && y1 >= y2) { - if (abs(M[(y1 * n) + x1]) > 0.1) { - return false; - } else { - // Add 1.0 if this is a binary graph, and a random real number otherwise - M[(int) (y1 * n) + x1] = (bin) ? 1.0 : (((rand() % 10000) / + M[(int) (y1 * n) + x1] = (bin) ? 1.0 : (((rand() % 10000) / 100.0) - 50.0); - return true; - } + return; } else { // Divide up the matrix - int xMidPoint = x1 + floor((x2 - x1) / 2); - int yMidPoint = y1 + floor((y2 - y1) / 2); + int xMidPoint = (x1 == x2) ? x1 : x1 + floor((x2 - x1) / 2); + int yMidPoint = (y1 == y2) ? y1 : y1 + floor((y2 - y1) / 2); // ToDo -- add some noise to these values between iterations float newA = a; @@ -142,25 +131,23 @@ class gemm { float newC = c; // Work out which quarter to recurse into - // There are some ugly ternary operators here to avoid going out of bounds in the edge case - // that we are already at 1 width or 1 height + // There are some ugly ternary operators here to avoid going out of + // bounds in the edge case that we are already at 1 width or 1 height float randomNum = dist(*gen); if (randomNum < a) { - return rMat(M, n, x1, xMidPoint, y1, yMidPoint, - newA, newB, newC, gen, dist, bin); + rMat(M, n, x1, xMidPoint, y1, yMidPoint, newA, newB, newC, gen, dist, + bin); } else if (randomNum < (a + b)) { - return rMat(M, n, ((xMidPoint < x2) ? xMidPoint + 1 : xMidPoint), x2, y1, yMidPoint, - newA, newB, newC, gen, dist, bin); + rMat(M, n, xMidPoint, x2, y1, yMidPoint, newA, newB, newC, gen, dist, + bin); } else if (randomNum < (a + b + c)) { - return rMat(M, n, x1, xMidPoint, ((yMidPoint < y2) ? yMidPoint + 1 : yMidPoint), y2, - newA, newB, newC, gen, dist, bin); + rMat(M, n, x1, xMidPoint, yMidPoint, y2, newA, newB, newC, gen, + dist, bin); } else { - return rMat(M, n, ((xMidPoint < x2) ? xMidPoint + 1 : xMidPoint), x2, - ((yMidPoint < y2) ? 
yMidPoint + 1 : yMidPoint), y2, newA, newB, newC, - gen, dist, bin); + rMat(M, n, xMidPoint, x2, yMidPoint, y2, newA, newB, newC, gen, + dist, bin); } } - return true; } void toCSR(T* dense, int n_col, int n_row, int nnz, T* vals, int* col_index, diff --git a/src/main.cc b/src/main.cc index a4eb55b..268b628 100644 --- a/src/main.cc +++ b/src/main.cc @@ -37,14 +37,14 @@ int main(int argc, char** argv) { // SGEMM Comparison std::cout << std::endl << "Comparing SGEMM Kernels:" << std::endl; doGemm sgemm(std::string(absPath), iters, startDim, upperLimit, doCpu, - doGpu); + doGpu, sgemm, sp_sgemm); sgemm.collectData(); std::cout << "Finished!" << std::endl; // DGEMM Comparison std::cout << std::endl << "Comparing DGEMM Kernels:" << std::endl; doGemm dgemm(std::string(absPath), iters, startDim, upperLimit, doCpu, - doGpu); + doGpu, dgemm, sp_dgemm); dgemm.collectData(); std::cout << "Finished!" << std::endl; From cc8e2a86347ca35b598b462724b5c3c71fb9a659 Mon Sep 17 00:00:00 2001 From: Alex Cockrean <84676155+ABenC377@users.noreply.github.com> Date: Fri, 11 Oct 2024 15:43:02 +0100 Subject: [PATCH 18/32] rebasing --- cuBLAS/sp_gemm.hh | 16 +++------------- include/doGemm.hh | 4 ++-- include/kernels/gemm.hh | 34 ++++++++++++++++++++-------------- src/main.cc | 32 ++++++++++++++++++-------------- 4 files changed, 43 insertions(+), 43 deletions(-) diff --git a/cuBLAS/sp_gemm.hh b/cuBLAS/sp_gemm.hh index aa095f8..2c787d9 100644 --- a/cuBLAS/sp_gemm.hh +++ b/cuBLAS/sp_gemm.hh @@ -36,7 +36,6 @@ class sp_gemm_gpu : public sp_gemm { * - Unified: Initialise data as unified memory; no data movement semantics * required */ void initialise(gpuOffloadType offload, int n, float sparsity) override { - std::cout << "___________Initialising, problem size = " << n << std::endl; offload_ = offload; if (std::is_same_v) cudaDataType_ = CUDA_R_32F; @@ -45,13 +44,11 @@ class sp_gemm_gpu : public sp_gemm { std::cout << "INVALID DATA TYPE PASSED TO cuSPARSE" << std::endl; exit(1); } - n_ = n * 20; + n_ = n; - std::cout << "\tGetting device" << std::endl; // Get device identifier cudaCheckError(cudaGetDevice(&gpuDevice_)); - std::cout << "\tMaking streams" << std::endl; // Initialise 3 streams to asynchronously move data between host and device cudaCheckError(cudaStreamCreate(&s1_)); cudaCheckError(cudaStreamCreate(&s2_)); @@ -62,7 +59,6 @@ class sp_gemm_gpu : public sp_gemm { // Work out number of edges needed to achieve target sparsity A_nnz_ = B_nnz_ = 1 + (int) (n_ * n_ * (1 - sparsity)); - std::cout << "\tMallocing" << std::endl; if (offload_ == gpuOffloadType::unified) { cudaCheckError(cudaMallocManaged(&A_val_, sizeof(T) * A_nnz_)); cudaCheckError(cudaMallocManaged(&A_col_, sizeof(int) * A_nnz_)); @@ -111,13 +107,11 @@ class sp_gemm_gpu : public sp_gemm { A_ = (T*)malloc(sizeof(T) * n_ * n_); B_ = (T*)malloc(sizeof(T) * n_ * n_); - std::cout << "\tInitialising start matrices" << std::endl; initInputMatricesSparse(sparsity); - std::cout << "\tConverting to CSR" << std::endl; - toCSR(A_, n_, n_, A_nnz_, A_val_, A_col_, A_row_); + toCSR(A_, n_, n_, A_val_, A_col_, A_row_); - toCSR(B_, n_, n_, B_nnz_, B_val_, B_col_, B_row_); + toCSR(B_, n_, n_, B_val_, B_col_, B_row_); // std::cout << "_____Matrix A_____" << std::endl; @@ -139,7 +133,6 @@ class sp_gemm_gpu : public sp_gemm { /** Perform any required steps before calling the GEMM kernel that should * be timed. 
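   * For the cuSPARSE path this is where the CSR matrix descriptors are set up
   * for whichever offload strategy is in use.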
*/ void preLoopRequirements() override { - std::cout << "\t\tpre loop" << std::endl; switch(offload_) { case gpuOffloadType::always: { // Make matrix descriptors @@ -224,7 +217,6 @@ class sp_gemm_gpu : public sp_gemm { /** Make a call to the BLAS Library Kernel. */ void callGemm() override { - std::cout << "\t\tGEMM" << std::endl; switch(offload_) { case gpuOffloadType::always: { cudaCheckError(cudaMemcpyAsync(A_val_dev_, A_val_, sizeof(T) * @@ -452,7 +444,6 @@ class sp_gemm_gpu : public sp_gemm { /** Perform any required steps after calling the GEMM kernel that should * be timed. */ void postLoopRequirements() override { - std::cout << "\t\tpost loop" << std::endl; cusparseCheckError(cusparseSpGEMM_destroyDescr(spgemmDesc_)); // Destroying descriptors cusparseCheckError(cusparseDestroySpMat(descrA_)); @@ -520,7 +511,6 @@ class sp_gemm_gpu : public sp_gemm { /** Do any necessary cleanup (free pointers, close library handles, etc.) * after Kernel has been called. */ void postCallKernelCleanup() override { - std::cout << "\t\tcleaning up" << std::endl; // Destroy the handle cusparseCheckError(cusparseDestroy(handle_)); diff --git a/include/doGemm.hh b/include/doGemm.hh index f4ec053..53bbb54 100644 --- a/include/doGemm.hh +++ b/include/doGemm.hh @@ -42,8 +42,8 @@ class doGemm { upperLimit_(upperLimit), doCPU_(cpuEnabled), doGPU_(gpuEnabled), - doDense_(dense), - doSparse_(sparse), + doDense_(doDense), + doSparse_(doSparse) #if CPU_ENABLED , gemmCpu_(iterations_), diff --git a/include/kernels/gemm.hh b/include/kernels/gemm.hh index 3ffc0d7..230c7d3 100644 --- a/include/kernels/gemm.hh +++ b/include/kernels/gemm.hh @@ -103,8 +103,10 @@ class gemm { // Using a=0.45 and b=c=0.22 as default probabilities for (int i = 0; i < edges; i++) { - rMat(A_, n_, 0, n_ - 1, 0, n_ - 1, 0.45, 0.22, 0.22, &gen, dist, false); - rMat(B_, n_, 0, n_ - 1, 0, n_ - 1, 0.45, 0.22, 0.22, &gen, dist, false); + while (!rMat(A_, n_, 0, n_ - 1, 0, n_ - 1, 0.45, 0.22, 0.22, &gen, dist, + false)) {} + while (!rMat(B_, n_, 0, n_ - 1, 0, n_ - 1, 0.45, 0.22, 0.22, &gen, dist, + false)){} } } @@ -112,14 +114,18 @@ class gemm { void callConsume() { consume((void*)A_, (void*)B_, (void*)C_); } /** Recursive function to populate sparse matrices */ - void rMat(T* M, int n, int x1, int x2, int y1, int y2, float a, float b, + bool rMat(T* M, int n, int x1, int x2, int y1, int y2, float a, float b, float c, std::default_random_engine* gen, std::uniform_real_distribution dist, bool bin) { // If a 1x1 submatrix, then add an edge and return out if (x1 >= x2 && y1 >= y2) { - M[(int) (y1 * n) + x1] = (bin) ? 1.0 : (((rand() % 10000) / + if (M[(int) (y1 * n) + x1] == 0) { + M[(int) (y1 * n) + x1] = (bin) ? 1.0 : (((rand() % 10000) / 100.0) - 50.0); - return; + return true; + } else { + return false; + } } else { // Divide up the matrix int xMidPoint = (x1 == x2) ? 
x1 : x1 + floor((x2 - x1) / 2); @@ -135,22 +141,22 @@ class gemm { // bounds in the edge case that we are already at 1 width or 1 height float randomNum = dist(*gen); if (randomNum < a) { - rMat(M, n, x1, xMidPoint, y1, yMidPoint, newA, newB, newC, gen, dist, - bin); + return rMat(M, n, x1, xMidPoint, y1, yMidPoint, newA, newB, newC, + gen, dist, bin); } else if (randomNum < (a + b)) { - rMat(M, n, xMidPoint, x2, y1, yMidPoint, newA, newB, newC, gen, dist, - bin); + return rMat(M, n, xMidPoint, x2, y1, yMidPoint, newA, newB, newC, + gen, dist, bin); } else if (randomNum < (a + b + c)) { - rMat(M, n, x1, xMidPoint, yMidPoint, y2, newA, newB, newC, gen, - dist, bin); + return rMat(M, n, x1, xMidPoint, yMidPoint, y2, newA, newB, newC, gen, + dist, bin); } else { - rMat(M, n, xMidPoint, x2, yMidPoint, y2, newA, newB, newC, gen, - dist, bin); + return rMat(M, n, xMidPoint, x2, yMidPoint, y2, newA, newB, newC, + gen, dist, bin); } } } - void toCSR(T* dense, int n_col, int n_row, int nnz, T* vals, int* col_index, + void toCSR(T* dense, int n_col, int n_row, T* vals, int* col_index, int* row_ptr) { int nnz_encountered = 0; for (int row = 0; row < n_row; row++) { diff --git a/src/main.cc b/src/main.cc index 268b628..06fd48e 100644 --- a/src/main.cc +++ b/src/main.cc @@ -3,10 +3,10 @@ int iters = 10; int startDim = 1; int upperLimit = 128; -bool sgemm = true; -bool dgemm = true; -bool sp_sgemm = true; -bool sp_dgemm = true; +bool doSgemm = true; +bool doDgemm = true; +bool doSp_sgemm = true; +bool doSp_dgemm = true; bool doCpu = CPU_ENABLED; bool doGpu = GPU_ENABLED; @@ -37,14 +37,14 @@ int main(int argc, char** argv) { // SGEMM Comparison std::cout << std::endl << "Comparing SGEMM Kernels:" << std::endl; doGemm sgemm(std::string(absPath), iters, startDim, upperLimit, doCpu, - doGpu, sgemm, sp_sgemm); + doGpu, doSgemm, doSp_sgemm); sgemm.collectData(); std::cout << "Finished!" << std::endl; // DGEMM Comparison std::cout << std::endl << "Comparing DGEMM Kernels:" << std::endl; doGemm dgemm(std::string(absPath), iters, startDim, upperLimit, doCpu, - doGpu, dgemm, sp_dgemm); + doGpu, doDgemm, doSp_dgemm); dgemm.collectData(); std::cout << "Finished!" 
<< std::endl; @@ -146,28 +146,28 @@ void getParameters(int argc, char** argv) { } else if (!strcmp(argv[i], "--no_gpu")) { doGpu = false; } else if (!strcmp(argv[i], "--kernels") || !strcmp(argv[i], "-k")) { - sgemm = dgemm = sp_sgemm = sp_dgemm = false; + doSgemm = doDgemm = doSp_sgemm = doSp_dgemm = false; std::string kernelList = argv[++i]; if (kernelList.find("sp-sgemm") != std::string::npos) { - sp_sgemm = true; + doSp_sgemm = true; if (kernelList.find("sgemm") != std::string::npos && kernelList.find("sgemm") != kernelList.find("sp-sgemm") + 3) { - sgemm = true; + doSgemm = true; } } else if (kernelList.find("sgemm") != std::string::npos) { - sgemm = true; + doSgemm = true; } if (kernelList.find("sp-dgemm") != std::string::npos) { - sp_dgemm = true; + doSp_dgemm = true; if (kernelList.find("dgemm") != std::string::npos && kernelList.find("dgemm") != kernelList.find("sp-dgemm") + 3) { - dgemm = true; + doDgemm = true; } } else if (kernelList.find("dgemm") != std::string::npos) { - dgemm = true; + doDgemm = true; } - if (!sgemm && !dgemm && !sp_sgemm && !sp_dgemm) { + if (!doSgemm && !doDgemm && !doSp_sgemm && !doSp_dgemm) { std::cout << "ERROR - no implemented kernels in list" << std::endl; exit(1); } @@ -200,6 +200,10 @@ void getParameters(int argc, char** argv) { std::cout << " -d --dimension_limit D Max value of M, N, K is D " "(default: " << upperLimit << ")" << std::endl; + std::cout << " -k --kernels Comma-separated list of " + "kernels to be run. Options are sgemm, dgemm, sp-sgemm, " + "sp-dgemm (default: sgemm,dgemm,sp-gemm,sp-dgemm)" << + std::endl; std::cout << std::endl; exit(0); } else { From de56ae19b2934221195fdd4b020f0d33f97879a5 Mon Sep 17 00:00:00 2001 From: Alex Cockrean <84676155+ABenC377@users.noreply.github.com> Date: Fri, 11 Oct 2024 15:44:12 +0100 Subject: [PATCH 19/32] rebasing --- cuBLAS/sp_gemm.hh | 27 +++++++++++++++++++-------- include/doGemm.hh | 2 +- include/kernels/gemm.hh | 38 +++++++++++++++++++++----------------- 3 files changed, 41 insertions(+), 26 deletions(-) diff --git a/cuBLAS/sp_gemm.hh b/cuBLAS/sp_gemm.hh index 2c787d9..8bed12b 100644 --- a/cuBLAS/sp_gemm.hh +++ b/cuBLAS/sp_gemm.hh @@ -26,7 +26,7 @@ class sp_gemm_gpu : public sp_gemm { using sp_gemm::C_; using sp_gemm::offload_; - // ToDo -- No checksum for sparse yet. Nedd to do + // ToDo -- No checksum for sparse yet. Need to do /** Initialise the required data structures. * `offload` refers to the data offload type: @@ -44,7 +44,7 @@ class sp_gemm_gpu : public sp_gemm { std::cout << "INVALID DATA TYPE PASSED TO cuSPARSE" << std::endl; exit(1); } - n_ = n; + n_ = 100 * n; // Get device identifier cudaCheckError(cudaGetDevice(&gpuDevice_)); @@ -133,6 +133,7 @@ class sp_gemm_gpu : public sp_gemm { /** Perform any required steps before calling the GEMM kernel that should * be timed. */ void preLoopRequirements() override { + cusparseCheckError(cusparseSpGEMM_createDescr(&spgemmDesc_)); switch(offload_) { case gpuOffloadType::always: { // Make matrix descriptors @@ -212,13 +213,17 @@ class sp_gemm_gpu : public sp_gemm { break; } } - cusparseCheckError(cusparseSpGEMM_createDescr(&spgemmDesc_)); } /** Make a call to the BLAS Library Kernel. 
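   * In the always-offload case any matrix descriptors surviving from the
   * previous call are destroyed before fresh host-to-device copies of the CSR
   * buffers are issued on the three streams.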
*/ void callGemm() override { switch(offload_) { case gpuOffloadType::always: { + if (C_mem_allocated_always_) { + cusparseCheckError(cusparseDestroySpMat(descrA_)); + cusparseCheckError(cusparseDestroySpMat(descrB_)); + cusparseCheckError(cusparseDestroySpMat(descrC_)); + } cudaCheckError(cudaMemcpyAsync(A_val_dev_, A_val_, sizeof(T) * A_nnz_, cudaMemcpyHostToDevice, s1_)); cudaCheckError(cudaMemcpyAsync(A_col_dev_, A_col_, sizeof(int) * @@ -235,6 +240,7 @@ class sp_gemm_gpu : public sp_gemm { cudaCheckError(cudaMemcpyAsync(C_row_dev_, C_row_, sizeof(int) * (n_ + 1), cudaMemcpyHostToDevice, s3_)); + cudaCheckError(cudaDeviceSynchronize()); // Make matrix descriptors cusparseCheckError( @@ -444,10 +450,6 @@ class sp_gemm_gpu : public sp_gemm { /** Perform any required steps after calling the GEMM kernel that should * be timed. */ void postLoopRequirements() override { - cusparseCheckError(cusparseSpGEMM_destroyDescr(spgemmDesc_)); - // Destroying descriptors - cusparseCheckError(cusparseDestroySpMat(descrA_)); - cusparseCheckError(cusparseDestroySpMat(descrB_)); switch(offload_) { case gpuOffloadType::always: { break; @@ -476,10 +478,14 @@ class sp_gemm_gpu : public sp_gemm { cudaCheckError(cudaMemcpyAsync(C_row_, C_row_dev_, sizeof(int) * (n_ + 1), cudaMemcpyDeviceToHost, s3_)); cudaCheckError(cudaDeviceSynchronize()); + + cusparseCheckError(cusparseDestroySpMat(descrA_)); + cusparseCheckError(cusparseDestroySpMat(descrB_)); + cusparseCheckError(cusparseDestroySpMat(descrC_)); + break; } case gpuOffloadType::unified: { - cusparseCheckError(cusparseDestroySpMat(descrC_)); // Ensure all data resides on host once work has completed cudaCheckError(cudaMemPrefetchAsync(A_val_, sizeof(T) * A_nnz_, cudaCpuDeviceId, s1_)); @@ -503,9 +509,14 @@ class sp_gemm_gpu : public sp_gemm { cudaCpuDeviceId, s3_)); // Ensure device has finished all work. cudaCheckError(cudaDeviceSynchronize()); + + cusparseCheckError(cusparseDestroySpMat(descrA_)); + cusparseCheckError(cusparseDestroySpMat(descrB_)); + cusparseCheckError(cusparseDestroySpMat(descrC_)); break; } } + cusparseCheckError(cusparseSpGEMM_destroyDescr(spgemmDesc_)); } /** Do any necessary cleanup (free pointers, close library handles, etc.) diff --git a/include/doGemm.hh b/include/doGemm.hh index 53bbb54..b89abee 100644 --- a/include/doGemm.hh +++ b/include/doGemm.hh @@ -307,7 +307,7 @@ class doGemm { "_sparse_square.csv"); if (upperLimit_ >= 32) { for (int dim = startDimention_; dim <= upperLimit_; dim++) { - callSparseKernels(csvFile, dim, 0.99); + callSparseKernels(csvFile, dim, 0.9999); } } // Close file diff --git a/include/kernels/gemm.hh b/include/kernels/gemm.hh index 230c7d3..2a971a0 100644 --- a/include/kernels/gemm.hh +++ b/include/kernels/gemm.hh @@ -106,7 +106,7 @@ class gemm { while (!rMat(A_, n_, 0, n_ - 1, 0, n_ - 1, 0.45, 0.22, 0.22, &gen, dist, false)) {} while (!rMat(B_, n_, 0, n_ - 1, 0, n_ - 1, 0.45, 0.22, 0.22, &gen, dist, - false)){} + false)) {} } } @@ -119,17 +119,19 @@ class gemm { std::uniform_real_distribution dist, bool bin) { // If a 1x1 submatrix, then add an edge and return out if (x1 >= x2 && y1 >= y2) { - if (M[(int) (y1 * n) + x1] == 0) { - M[(int) (y1 * n) + x1] = (bin) ? 
1.0 : (((rand() % 10000) / - 100.0) - 50.0); - return true; - } else { + // Needed to avoid overfloe segfaults with large problem sizes + uint64_t index = (((uint64_t)y1 * (uint64_t)n) + (uint64_t)x1); + if (abs(M[index]) > 0.1) { return false; + } else { + // Add 1.0 if this is a binary graph, and a random real number otherwise + M[index] = (bin) ? 1.0 : (((rand() % 10000) / 100.0) - 50.0); + return true; } } else { // Divide up the matrix - int xMidPoint = (x1 == x2) ? x1 : x1 + floor((x2 - x1) / 2); - int yMidPoint = (y1 == y2) ? y1 : y1 + floor((y2 - y1) / 2); + int xMidPoint = x1 + floor((x2 - x1) / 2); + int yMidPoint = y1 + floor((y2 - y1) / 2); // ToDo -- add some noise to these values between iterations float newA = a; @@ -137,23 +139,25 @@ class gemm { float newC = c; // Work out which quarter to recurse into - // There are some ugly ternary operators here to avoid going out of - // bounds in the edge case that we are already at 1 width or 1 height + // There are some ugly ternary operators here to avoid going out of bounds in the edge case + // that we are already at 1 width or 1 height float randomNum = dist(*gen); if (randomNum < a) { - return rMat(M, n, x1, xMidPoint, y1, yMidPoint, newA, newB, newC, - gen, dist, bin); + return rMat(M, n, x1, xMidPoint, y1, yMidPoint, + newA, newB, newC, gen, dist, bin); } else if (randomNum < (a + b)) { - return rMat(M, n, xMidPoint, x2, y1, yMidPoint, newA, newB, newC, - gen, dist, bin); + return rMat(M, n, ((xMidPoint < x2) ? xMidPoint + 1 : xMidPoint), x2, y1, yMidPoint, + newA, newB, newC, gen, dist, bin); } else if (randomNum < (a + b + c)) { - return rMat(M, n, x1, xMidPoint, yMidPoint, y2, newA, newB, newC, gen, - dist, bin); + return rMat(M, n, x1, xMidPoint, ((yMidPoint < y2) ? yMidPoint + 1 : yMidPoint), y2, + newA, newB, newC, gen, dist, bin); } else { - return rMat(M, n, xMidPoint, x2, yMidPoint, y2, newA, newB, newC, + return rMat(M, n, ((xMidPoint < x2) ? xMidPoint + 1 : xMidPoint), x2, + ((yMidPoint < y2) ? 
yMidPoint + 1 : yMidPoint), y2, newA, newB, newC, gen, dist, bin); } } + return true; } void toCSR(T* dense, int n_col, int n_row, T* vals, int* col_index, From b972c23e4058c5d5e541b6d3f3e3424dc185f7b0 Mon Sep 17 00:00:00 2001 From: Alex Cockrean <84676155+ABenC377@users.noreply.github.com> Date: Fri, 11 Oct 2024 15:49:45 +0100 Subject: [PATCH 20/32] rebasing --- src/main.cc | 40 ++++++++++++++++++++-------------------- 1 file changed, 20 insertions(+), 20 deletions(-) diff --git a/src/main.cc b/src/main.cc index 06fd48e..51d1cf1 100644 --- a/src/main.cc +++ b/src/main.cc @@ -146,26 +146,26 @@ void getParameters(int argc, char** argv) { } else if (!strcmp(argv[i], "--no_gpu")) { doGpu = false; } else if (!strcmp(argv[i], "--kernels") || !strcmp(argv[i], "-k")) { - doSgemm = doDgemm = doSp_sgemm = doSp_dgemm = false; - std::string kernelList = argv[++i]; - if (kernelList.find("sp-sgemm") != std::string::npos) { - doSp_sgemm = true; - if (kernelList.find("sgemm") != std::string::npos && - kernelList.find("sgemm") != kernelList.find("sp-sgemm") + 3) { - doSgemm = true; - } - } else if (kernelList.find("sgemm") != std::string::npos) { - doSgemm = true; - } - if (kernelList.find("sp-dgemm") != std::string::npos) { - doSp_dgemm = true; - if (kernelList.find("dgemm") != std::string::npos && - kernelList.find("dgemm") != kernelList.find("sp-dgemm") + 3) { - doDgemm = true; - } - } else if (kernelList.find("dgemm") != std::string::npos) { - doDgemm = true; - } + doSgemm = doDgemm = doSp_sgemm = doSp_dgemm = false; + std::string kernelList = argv[++i]; + if (kernelList.find("sp-sgemm") != std::string::npos) { + doSp_sgemm = true; + if (kernelList.find("sgemm") != std::string::npos && + kernelList.find("sgemm") != kernelList.find("sp-sgemm") + 3) { + doSgemm = true; + } + } else if (kernelList.find("sgemm") != std::string::npos) { + doSgemm = true; + } + if (kernelList.find("sp-dgemm") != std::string::npos) { + doSp_dgemm = true; + if (kernelList.find("dgemm") != std::string::npos && + kernelList.find("dgemm") != kernelList.find("sp-dgemm") + 3) { + doDgemm = true; + } + } else if (kernelList.find("dgemm") != std::string::npos) { + doDgemm = true; + } if (!doSgemm && !doDgemm && !doSp_sgemm && !doSp_dgemm) { std::cout << "ERROR - no implemented kernels in list" << std::endl; From 1f5f2ddebf774b9bd35b52ab29ef02cca6065ff3 Mon Sep 17 00:00:00 2001 From: Alex Cockrean <84676155+ABenC377@users.noreply.github.com> Date: Fri, 11 Oct 2024 15:50:03 +0100 Subject: [PATCH 21/32] rebasing --- calculateOffloadThreshold.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/calculateOffloadThreshold.py b/calculateOffloadThreshold.py index 38c2646..43028c0 100644 --- a/calculateOffloadThreshold.py +++ b/calculateOffloadThreshold.py @@ -165,7 +165,7 @@ def printResults(once:offloadThreshold, always:offloadThreshold, unified:offload gpuAlways.M = 0 gpuAlways.N = 0 gpuAlways.K = 0 - if(gpuUnified.M != 0 and float(cpu[8]) >= float(gpuU[8])): + if("gemm" in kernel and gpuUnified.M != 0 and float(cpu[8]) >= float(gpuU[8])): # Do check to see if this is a momentary drop that we should ignore if (prevGpuUgflops <= float(cpu[8])) and (float(gpuLines[2].split(',')[8]) <= float(cpu[8])): gpuUnified.cpuGflops = 0.0 From b06250c0ca7a8d14c2904d69a70da24f89824e5d Mon Sep 17 00:00:00 2001 From: Alex Cockrean <84676155+ABenC377@users.noreply.github.com> Date: Fri, 11 Oct 2024 15:50:37 +0100 Subject: [PATCH 22/32] rebasing --- AOCL/sp_gemm.hh | 62 ++++++++++ cuBLAS/common.hh | 53 +++++++-- cuBLAS/sp_gemm.hh | 4 +- 
include/doGemm.hh | 4 +- include/kernels/CPU/sp_gemm.hh | 3 +- include/kernels/gemm.hh | 25 +++- oneMKL/CPU/sp_gemm.hh | 201 +++++++++++++++++++++++++++++---- 7 files changed, 320 insertions(+), 32 deletions(-) create mode 100644 AOCL/sp_gemm.hh diff --git a/AOCL/sp_gemm.hh b/AOCL/sp_gemm.hh new file mode 100644 index 0000000..3c6b5c0 --- /dev/null +++ b/AOCL/sp_gemm.hh @@ -0,0 +1,62 @@ +#pragma once + +#ifdef CPU_AOCL +#include + +#include "../include/kernels/CPU/gemm.hh" +#include "../include/utilities.hh" + +namespace cpu { +/** A class for GEMM CPU BLAS kernels. */ +template +class gemm_cpu : public gemm { + public: + using gemm::gemm; + using gemm::callConsume; + using gemm::m_; + using gemm::n_; + using gemm::k_; + using gemm::A_; + using gemm::B_; + using gemm::C_; + + private: + /** Make call to the GEMM kernel. */ + void callGemm() override { + if constexpr (std::is_same_v) { + bli_sgemm(BLIS_NO_TRANSPOSE, BLIS_NO_TRANSPOSE, m_, n_, k_, &alpha, A_, + rowStride, std::max(1, m_), B_, rowStride, std::max(1, k_), + &beta, C_, rowStride, std::max(1, m_)); + } else if constexpr (std::is_same_v) { + bli_dgemm(BLIS_NO_TRANSPOSE, BLIS_NO_TRANSPOSE, m_, n_, k_, &alpha, A_, + rowStride, std::max(1, m_), B_, rowStride, std::max(1, k_), + &beta, C_, rowStride, std::max(1, m_)); + } else { + // Un-specialised class will not do any work - print error and exit. + std::cout << "ERROR - Datatype for AOCL CPU GEMM kernel not supported." + << std::endl; + exit(1); + } + // Ensure compiler doesn't optimise away the work being done + callConsume(); + } + + /** Perform any required steps before calling the GEMM kernel that should + * be timed. */ + void preLoopRequirements() override {} + + /** Perform any required steps after calling the GEMM kernel that should + * be timed. */ + void postLoopRequirements() override {} + + /** The constant value Alpha. */ + T alpha = ALPHA; + + /** The constant value Beta. */ + T beta = BETA; + + /** The distance in elements to the next column. */ + const int rowStride = 1; +}; +} // namespace cpu +#endif \ No newline at end of file diff --git a/cuBLAS/common.hh b/cuBLAS/common.hh index 70d58fb..c8086db 100644 --- a/cuBLAS/common.hh +++ b/cuBLAS/common.hh @@ -16,13 +16,52 @@ } while (false) /** Macro function to check if error occurred when calling cuBLAS. 
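 * The replacement below expands each cublasStatus_t value into its own case
 * (plus a default) so the exact status name is printed alongside the file and
 * line before exiting.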
*/ -#define cublasCheckError(f) \ - do { \ - if (cublasStatus_t e = (f); e != CUBLAS_STATUS_SUCCESS) { \ - std::cout << "CUBLAS error: " << __FILE__ << ":" << __LINE__ << ": " \ - << cublasGetStatusString(e) << std::endl; \ - exit(1); \ - } \ +#define cublasCheckError(f) \ + do { \ + switch (f) { \ + case CUBLAS_STATUS_SUCCESS: \ + break; \ + case CUBLAS_STATUS_NOT_INITIALIZED: \ + std::cout << "CUBLAS error: " << __FILE__ << ": " << __LINE__ \ + << ": CUBLAS_STATUS_NOT_INITIALIZED" << std::endl; \ + exit(1); \ + case CUBLAS_STATUS_ALLOC_FAILED: \ + std::cout << "CUBLAS error: " << __FILE__ << ": " << __LINE__ \ + << ": CUBLAS_STATUS_ALLOC_FAILED" << std::endl; \ + exit(1); \ + case CUBLAS_STATUS_INVALID_VALUE: \ + std::cout << "CUBLAS error: " << __FILE__ << ": " << __LINE__ \ + << ": CUBLAS_STATUS_INVALID_VALUE" << std::endl; \ + exit(1); \ + case CUBLAS_STATUS_ARCH_MISMATCH: \ + std::cout << "CUBLAS error: " << __FILE__ << ": " << __LINE__ \ + << ": CUBLAS_STATUS_ARCH_MISMATCH" << std::endl; \ + exit(1); \ + case CUBLAS_STATUS_MAPPING_ERROR: \ + std::cout << "CUBLAS error: " << __FILE__ << ": " << __LINE__ \ + << ": CUBLAS_STATUS_MAPPING_ERROR" << std::endl; \ + exit(1); \ + case CUBLAS_STATUS_EXECUTION_FAILED: \ + std::cout << "CUBLAS error: " << __FILE__ << ": " << __LINE__ \ + << ": CUBLAS_STATUS_EXECUTION_FAILED" << std::endl; \ + exit(1); \ + case CUBLAS_STATUS_INTERNAL_ERROR: \ + std::cout << "CUBLAS error: " << __FILE__ << ": " << __LINE__ \ + << ": CUBLAS_STATUS_INTERNAL_ERROR" << std::endl; \ + exit(1); \ + case CUBLAS_STATUS_NOT_SUPPORTED: \ + std::cout << "CUBLAS error: " << __FILE__ << ": " << __LINE__ \ + << ": CUBLAS_STATUS_NOT_SUPPORTED" << std::endl; \ + exit(1); \ + case CUBLAS_STATUS_LICENSE_ERROR: \ + std::cout << "CUBLAS error: " << __FILE__ << ": " << __LINE__ \ + << ": CUBLAS_STATUS_LICENSE_ERROR" << std::endl; \ + exit(1); \ + default: \ + std::cout << "CUBLAS error: " << __FILE__ << ": " << __LINE__ \ + << ": other error not in switch statement" << std::endl; \ + exit(1); \ + } \ } while (false) #define cusparseCheckError(f) \ diff --git a/cuBLAS/sp_gemm.hh b/cuBLAS/sp_gemm.hh index 8bed12b..d849d22 100644 --- a/cuBLAS/sp_gemm.hh +++ b/cuBLAS/sp_gemm.hh @@ -109,9 +109,9 @@ class sp_gemm_gpu : public sp_gemm { initInputMatricesSparse(sparsity); - toCSR(A_, n_, n_, A_val_, A_col_, A_row_); + toCSR_int(A_, n_, n_, A_val_, A_col_, A_row_); - toCSR(B_, n_, n_, B_val_, B_col_, B_row_); + toCSR_int(B_, n_, n_, B_val_, B_col_, B_row_); // std::cout << "_____Matrix A_____" << std::endl; diff --git a/include/doGemm.hh b/include/doGemm.hh index b89abee..e264273 100644 --- a/include/doGemm.hh +++ b/include/doGemm.hh @@ -303,8 +303,8 @@ class doGemm { cpuGpu_always_ = cpuGpu_offloadThreshold(); cpuGpu_once_ = cpuGpu_offloadThreshold(); cpuGpu_unified_ = cpuGpu_offloadThreshold(); - csvFile = initCSVFile(std::string(CSV_DIR) + "/" + getKernelName() + - "_sparse_square.csv"); + std::ofstream csvFile = initCSVFile(std::string(CSV_DIR) + "/" + + getKernelName() + "_sparse_square.csv"); if (upperLimit_ >= 32) { for (int dim = startDimention_; dim <= upperLimit_; dim++) { callSparseKernels(csvFile, dim, 0.9999); diff --git a/include/kernels/CPU/sp_gemm.hh b/include/kernels/CPU/sp_gemm.hh index 6d9d011..60778e7 100644 --- a/include/kernels/CPU/sp_gemm.hh +++ b/include/kernels/CPU/sp_gemm.hh @@ -1,5 +1,6 @@ #pragma once +#ifdef CPU_ONEMKL #include "../gemm.hh" #include @@ -41,4 +42,4 @@ namespace cpu { free(C_); } }; -} // namespace cpu \ No newline at end of file +} // namespace cpu 
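
The toCSR_int helper (called above from the cuBLAS class and defined in the
gemm.hh changes that follow) and the new toCSR_mkl variant both perform the
same row-major dense-to-CSR scan. A minimal standalone sketch of that
conversion, assuming zero-based indexing (the denseToCSR name and the
std::vector buffers are illustrative only; the benchmark fills pre-allocated
raw arrays sized to the non-zero count):

#include <cstdio>
#include <vector>

// Convert a row-major dense matrix into the three standard CSR arrays.
void denseToCSR(const double* dense, int nRows, int nCols,
                std::vector<double>& vals, std::vector<int>& colIndex,
                std::vector<int>& rowPtr) {
  vals.clear();
  colIndex.clear();
  rowPtr.assign(nRows + 1, 0);
  for (int row = 0; row < nRows; ++row) {
    rowPtr[row] = static_cast<int>(vals.size());
    for (int col = 0; col < nCols; ++col) {
      const double v = dense[(row * nCols) + col];
      if (v != 0.0) {
        colIndex.push_back(col);
        vals.push_back(v);
      }
    }
  }
  rowPtr[nRows] = static_cast<int>(vals.size());
}

int main() {
  const double dense[9] = {1.0, 0.0, 2.0,
                           0.0, 0.0, 3.0,
                           4.0, 5.0, 0.0};
  std::vector<double> vals;
  std::vector<int> colIndex, rowPtr;
  denseToCSR(dense, 3, 3, vals, colIndex, rowPtr);
  // Expect vals = {1,2,3,4,5}, colIndex = {0,2,2,0,1}, rowPtr = {0,2,3,5}.
  for (int i = 0; i < static_cast<int>(vals.size()); ++i)
    std::printf("(%d, %.1f) ", colIndex[i], vals[i]);
  std::printf("\nrowPtr:");
  for (int p : rowPtr) std::printf(" %d", p);
  std::printf("\n");
  return 0;
}

The toCSR_mkl and toCSR_aocl overloads introduced in this series differ only in
the integer type used for the index arrays (MKL_INT and aoclsparse_int).
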
diff --git a/include/kernels/gemm.hh b/include/kernels/gemm.hh index 2a971a0..d97fc8c 100644 --- a/include/kernels/gemm.hh +++ b/include/kernels/gemm.hh @@ -1,5 +1,9 @@ #pragma once +#ifdef CPU_ONEMKL +#include +#endif + #include #include #include @@ -160,7 +164,7 @@ class gemm { return true; } - void toCSR(T* dense, int n_col, int n_row, T* vals, int* col_index, + void toCSR_int(T* dense, int n_col, int n_row, T* vals, int* col_index, int* row_ptr) { int nnz_encountered = 0; for (int row = 0; row < n_row; row++) { @@ -178,6 +182,25 @@ class gemm { row_ptr[n_row] = nnz_encountered; } +#ifdef CPU_ONEMKL + void toCSR_mkl(T* dense, int n_col, int n_row, T* vals, MKL_INT* col_index, + MKL_INT* row_ptr) { + int nnz_encountered = 0; + for (int row = 0; row < n_row; row++) { + row_ptr[row] = (MKL_INT)nnz_encountered; + int nnz_row = 0; + for (int col = 0; col < n_col; col++) { + if (dense[(row * n_col) + col] != 0.0) { + nnz_row++; + col_index[nnz_encountered] = (MKL_INT)col; + vals[nnz_encountered] = dense[(row * n_col) + col]; + nnz_encountered++; + } + } + } + row_ptr[n_row] = (MKL_INT)nnz_encountered; + } +#endif /** The number of iterations to perform per problem size. */ const int iterations_; diff --git a/oneMKL/CPU/sp_gemm.hh b/oneMKL/CPU/sp_gemm.hh index 5ac6a70..0b4e32b 100644 --- a/oneMKL/CPU/sp_gemm.hh +++ b/oneMKL/CPU/sp_gemm.hh @@ -24,33 +24,146 @@ class sp_gemm_cpu : public sp_gemm { /** Initialise the required data structures. */ void initialise(int n, float sparsity) { - n_ = n; - A_ = (T*)mkl_malloc(sizeof(T) * m_ * k_, 64); B_ = (T*)mkl_malloc(sizeof(T) * k_ * n_, 64); C_ = (T*)mkl_malloc(sizeof(T) * m_ * n_, 64); + n_ = n * 100; + nnz_ = (1 + (int)(n_ * n_ * (1 - sparsity))); + + values_A_ = (T*)mkl_malloc(sizeof(T) * nnz_, ALIGN); + columns_A_ = (MKL_INT*)mkl_malloc(sizeof(MKL_INT) * nnz_, ALIGN); + rowIndex_A_ = (MKL_INT*)mkl_malloc(sizeof(MKL_INT) * (n_ + 1), ALIGN); + + values_B_ = (T*)mkl_malloc(sizeof(T) * nnz_, ALIGN); + columns_B_ = (MKL_INT*)mkl_malloc(sizeof(MKL_INT) * nnz_, ALIGN); + rowIndex_B_ = (MKL_INT*)mkl_malloc(sizeof(MKL_INT) * (n_ + 1), ALIGN); + + x_ = (T*)mkl_malloc(sizeof(T) * n_, ALIGN); + y_ = (T*)mkl_malloc(sizeof(T) * n_, ALIGN); + rslt_mv_ = (T*)mkl_malloc(sizeof(T) * n_, ALIGN); + rslt_mv_trans_ = (T*)mkl_malloc(sizeof(T) * n_, ALIGN); + // Initialise the matricies - initInputMatrices(); + initInputMatricesSparse(sparsity); + + descr_type_gen.type = SPARSE_MATRIX_TYPE_GENERAL; + + // Transfer from dense to CSR format + toCSR_mkl(A_, n_, n_, values_A_, columns_A_, rowIndex_A_); + toCSR_mkl(B_, n_, n_, values_B_, columns_B_, rowIndex_B_); + + // ToDo -- Set values for x and y (which are vectors of length n_?) 
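+    // The calls below follow the usual oneMKL inspector-executor sparse flow:
+    // mkl_sparse_[sd]_create_csr wraps the CSR arrays for A and B in matrix
+    // handles, mkl_sparse_spmm multiplies them into csrC_, and the mv-hint and
+    // optimize calls prepare the handles for the sparse matrix-vector products
+    // that callGemm() later uses to sanity-check the product (comparing
+    // y.(C*x) against (A^T*y).(B*x)).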
+ + if constexpr (std::is_same_v) { + CALL_AND_CHECK_STATUS(mkl_sparse_s_create_csr(&csrA_, + SPARSE_INDEX_BASE_ZERO, n_, + n_, rowIndex_A_, + rowIndex_A_+1, columns_A_, + values_A_), + "Error after MKL_SPARSE_D_CREATE_CSR for csrA\n"); + CALL_AND_CHECK_STATUS(mkl_sparse_s_create_csr(&csrB_, + SPARSE_INDEX_BASE_ZERO, n_, + n_, rowIndex_B_, + rowIndex_B_+1, columns_B_, + values_B_), + "Error after MKL_SPARSE_D_CREATE_CSR for csrB\n"); + } else if constexpr (std::is_same_v) { + CALL_AND_CHECK_STATUS(mkl_sparse_d_create_csr(&csrA_, + SPARSE_INDEX_BASE_ZERO, n_, + n_, rowIndex_A_, + rowIndex_A_+1, columns_A_, + values_A_), + "Error after MKL_SPARSE_D_CREATE_CSR for csrA\n"); + CALL_AND_CHECK_STATUS(mkl_sparse_d_create_csr(&csrB_, + SPARSE_INDEX_BASE_ZERO, n_, + n_, rowIndex_B_, + rowIndex_B_+1, columns_B_, + values_B_), + "Error after MKL_SPARSE_D_CREATE_CSR for csrB\n"); + } else { + std::cout << "ERROR - Datatype for OneMKL CPU spGEMM kernel not " + "supported." << std::endl; + exit(1) + }; + + CALL_AND_CHECK_STATUS(mkl_sparse_spmm(SPARSE_OPERATION_NON_TRANSPOSE, + csrA_, csrB_, &csrC_), + "Error after MKL_SPARSE_SPMM\n"); + + // ToDo -- check that transpose is what I want here + CALL_AND_CHECK_STATUS(mkl_sparse_set_mv_hint(csrA_, + SPARSE_OPERATION_TRANSPOSE, + descr_type_gen_, 1), + "Error after MKL_SPARSE_SET_MV_HINT with csrA_\n"); + CALL_AND_CHECK_STATUS(mkl_sparse_set_mv_hint(csrB_, + SPARSE_OPERATION_NON_TRANSPOSE, + descr_type_gen_, 1), + "Error after MKL_SPARSE_SET_MV_HINT with csrB_\n"); + CALL_AND_CHECK_STATUS(mkl_sparse_set_mv_hint(csrC_, + SPARSE_OPERATION_NON_TRANSPOSE, + descr_type_gen_, 1), + "Error after MKL_SPARSE_SET_MV_HINT with csrC_\n"); + + CALL_AND_CHECK_STATUS(mkl_sparse_optimize(csrA_), + "Error after MKL_SPARSE_OPTIMIZE with csrA_\n"); + CALL_AND_CHECK_STATUS(mkl_sparse_optimize(csrB_), + "Error after MKL_SPARSE_OPTIMIZE with csrB_\n"); + CALL_AND_CHECK_STATUS(mkl_sparse_optimize(csrC_), + "Error after MKL_SPARSE_OPTIMIZE with csrC_\n"); } private: /** Make call to the GEMM kernel. */ void callGemm() override { if constexpr (std::is_same_v) { - cblas_sgemm(CblasColMajor, CblasNoTrans, CblasNoTrans, m_, n_, k_, - (float)ALPHA, A_, std::max(1, m_), B_, std::max(1, k_), - (float)BETA, C_, std::max(1, m_)); - } else if constexpr (std::is_same_v) { - cblas_dgemm(CblasColMajor, CblasNoTrans, CblasNoTrans, m_, n_, k_, - (double)ALPHA, A_, std::max(1, m_), B_, std::max(1, k_), - (double)BETA, C_, std::max(1, m_)); - } else { - // Un-specialised class will not do any work - print error and exit. - std::cout << "ERROR - Datatype for OneMKL CPU GEMM kernel not supported." 
- << std::endl; - exit(1); + CALL_AND_CHECK_STATUS(mkl_sparse_s_mv(SPARSE_OPERATION_NON_TRASPOSE, 1 + .0, csrC_, descr_type_gen_, x_, 0.0, rslt_mv_), + "Error after MKL_SPARSE_S_MV for csrC_ * x_\n"); + left_ = cblas_sdot(n_, rstl_mv_, 1, y_, 1); + + CALL_AND_CHECK_STATUS(mkl_sparse_s_mv(SPARSE_OPERATION_NON_TRANSPOSE, 1 + .0, csrB_, descr_type_gen_, x, 0.0, trslt_mv_), + "Error adter MKL_SPARSE_S_MV for csrB_ * x_\n"); + CALL_AND_CHECK_STATUS(mkl_sparse_s_mv(SPARSE_OPERATION_TRANSPOSE, 1.0, + csrA_, descr_type_gen_, y_, 0.0, + rslt_mv_trans_), + "Error adter MKL_SPARSE_S_MV for csrA_ * y_\n"); + right_ = cblas_sdot(n_, rslt_mv_, 1, rslt_mv_trans_, 1); + + residual = fabs(left - right)/(fabs(left) + 1); + + CALL_AND_CHECK_STATUS(mkl_sparse_s_export_csr(csrC_, &indexing_, + &rows_, &cols_, + &pointerB_C_, + &pointerE_C_, + &columns_C_, &values_C_), + "Error after MKL_SPARSE_S_EXPORT_CSR\n"); + } else if constexpr (std::is_same_v { /** Do any necessary cleanup (free pointers, close library handles, etc.) * after Kernel has been called. */ void postCallKernelCleanup() override { - mkl_free_buffers(); - mkl_free(A_); - mkl_free(B_); - mkl_free(C_); + if (mkl_sparse_destroy(csrC_) != SPARSE_STATUS_SUCCESS) { + printf(" Error after MKL_SPARSE_DESTROY, csrC_\n"); + fflush(0); + status = 1; + } + + //Deallocate arrays for which we allocate memory ourselves. + mkl_free(rslt_mv_trans_); + mkl_free(rslt_mv-); + mkl_free(x_); + mkl_free(y_); + + //Release matrix handle and deallocate arrays for which we allocate memory ourselves. + if (mkl_sparse_destroy(csrA_) != SPARSE_STATUS_SUCCESS) { + printf("Error after MKL_SPARSE_DESTROY, csrA_\n"); + fflush(0); + status = 1; + } + + mkl_free(values_A_); + mkl_free(columns_A_); + mkl_free(rowIndex_A_); + + if (mkl_sparse_destroy(csrB_) != SPARSE_STATUS_SUCCESS) { + printf("Error after MKL_SPARSE_DESTROY, csrB_\n"); + fflush(0); + status = 1; + } + + mkl_free(values_B_); + mkl_free(columns_B_); + mkl_free(rowIndex_B_); } + + int nnz_; + + MKL_INT* columns_A_; + MKL_INT* columns_B_; + MKL_INT* columns_C_; + MKL_INT* rowIndex_A_; + MKL_INT* rowIndex_B_; + MKL_INT* pointerB_C_; + MKL_INT* pointerE_C_; + + T* rslt_mv_; + T* rslt_mv_trans_; + T* x_; + T* y_; + + T left_, right_, residual_; + MKL_INT rows_, cols_, i_, j_, ii_, status_; + + sparse_index_base_t indexing_; + struct matrix_descr descr_type_gen_; + sparse_matrix_t csrA_, csrB_, csrC_; }; } // namespace cpu #endif \ No newline at end of file From 42bdc5846d6a5bac4f3270d62b258e0d021757aa Mon Sep 17 00:00:00 2001 From: Alex Cockrean <84676155+ABenC377@users.noreply.github.com> Date: Wed, 21 Aug 2024 11:05:52 +0100 Subject: [PATCH 23/32] Adding AOCL files --- AOCL/gemm.hh | 1 + AOCL/sp_gemm.hh | 32 ++++- ArmPL/sp_gemm.hh | 231 +++++++++++++++++++++++++++++++++ NVPL/sp_gemv.hh | 117 +++++++++++++++++ include/kernels/CPU/sp_gemm.hh | 71 +++++++++- include/kernels/gemm.hh | 22 ++++ 6 files changed, 464 insertions(+), 10 deletions(-) create mode 100644 ArmPL/sp_gemm.hh create mode 100644 NVPL/sp_gemv.hh diff --git a/AOCL/gemm.hh b/AOCL/gemm.hh index 3c6b5c0..f418bdc 100644 --- a/AOCL/gemm.hh +++ b/AOCL/gemm.hh @@ -23,6 +23,7 @@ class gemm_cpu : public gemm { private: /** Make call to the GEMM kernel. 
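   * Dispatches to bli_sgemm or bli_dgemm according to the template type T;
   * unsupported types print an error and exit.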
*/ void callGemm() override { + if constexpr (std::is_same_v) { bli_sgemm(BLIS_NO_TRANSPOSE, BLIS_NO_TRANSPOSE, m_, n_, k_, &alpha, A_, rowStride, std::max(1, m_), B_, rowStride, std::max(1, k_), diff --git a/AOCL/sp_gemm.hh b/AOCL/sp_gemm.hh index 3c6b5c0..4fc178b 100644 --- a/AOCL/sp_gemm.hh +++ b/AOCL/sp_gemm.hh @@ -28,9 +28,16 @@ class gemm_cpu : public gemm { rowStride, std::max(1, m_), B_, rowStride, std::max(1, k_), &beta, C_, rowStride, std::max(1, m_)); } else if constexpr (std::is_same_v) { - bli_dgemm(BLIS_NO_TRANSPOSE, BLIS_NO_TRANSPOSE, m_, n_, k_, &alpha, A_, - rowStride, std::max(1, m_), B_, rowStride, std::max(1, k_), - &beta, C_, rowStride, std::max(1, m_)); + // Todo -- base? + aoclsparse_create_dscr(&A_csr_, base, n_, n_, nnz_, cst_row_ptr_A_.data + (), csr_col_ind_A_.data(), csr_val_A_.data()); + aoclsparse_create_dscr(&B_csr_, base, n_, n_, nnz_, cst_row_ptr_B_.data + (), csr_col_ind_B_.data(), csr_val_B_.data()); + + aoclsparse_spmm(aoclsparse_operation_none, A_csr_, B_csr_, &C_csr_); + aoclsparse_export_dcsr(C_csr_, &base, &C_M_, &C_N_, &nnz_C_, + &csr_row_ptr_C_, &csr_col_ind_C_, (void**) + &csr_val_C_); } else { // Un-specialised class will not do any work - print error and exit. std::cout << "ERROR - Datatype for AOCL CPU GEMM kernel not supported." @@ -57,6 +64,25 @@ class gemm_cpu : public gemm { /** The distance in elements to the next column. */ const int rowStride = 1; + + aoclsparse_matrix A_csr_; + aoclsparse_int* csr_row_ptr_A_; + aoclsparse_int* csr_col_ind_A_; + T* csr_val_A_; + + aoclsparse_matrix B_csr_; + aoclsparse_int* csr_row_ptr_B_; + aoclsparse_int* csr_col_ind_B_; + T* csr_val_B_; + + aoclsparse_matrix C_csr_; + aoclsparse_int* csr_row_ptr_C_; + aoclsparse_int* csr_col_ind_C_; + T* csr_val_C_; + aoclsparse_int C_M_; + aoclsparse_int C_N_; + + aoclsparse_status status; }; } // namespace cpu #endif \ No newline at end of file diff --git a/ArmPL/sp_gemm.hh b/ArmPL/sp_gemm.hh new file mode 100644 index 0000000..aba5814 --- /dev/null +++ b/ArmPL/sp_gemm.hh @@ -0,0 +1,231 @@ +#pragma once + +#ifdef CPU_ARMPL +#include +#include +#include +#include + +#include + +#include "../include/kernels/CPU/sp_gemm.hh" +#include "../include/utilities.hh" + +namespace cpu { +/** A class for GEMM CPU BLAS kernels. */ +template +class sp_gemm_cpu : public sp_gemm { + public: + using sp_gemm::gemm; + using sp_gemm::callConsume; + using sp_gemm::m_; + using sp_gemm::n_; + using sp_gemm::k_; + using sp_gemm::A_; + using sp_gemm::B_; + using sp_gemm::C_; + + private: + /** Make call to the GEMM kernel. */ + void callGemm() override { + + /** + * Flow of ARMPL Sparse LA: + * + * 1. Create sparse matrix objects: armpl_spmat_create_csr[sdcz]() + * + * 2. Supply hints on usage: armpl_spmat_hint() + * + * 3. Optimise for SpMV: armpl_spmv_optimize() + * + * 4. Solve SpMV case: armpl_spmv_exec_[sdcz]() + * + * 5. Destroy sparse matrix object: armpl_spmat_destroy() + * + * In addiion, users can choose to update a set of non-zero values using + * armpl_spmat_update_[sdcz]() + */ + + // Todo -- See if using armpl_spmat_hint can improve performance here. + // If so, follow with optimisation functions + + + + + if (std::is_same_v) { + status_ = armpl_spmm_exec_s(transA, + transB, + alpha, + A_armpl_, + B_armpl, + beta, + C_armpl_); + } else if constexpr (std::is_same_v) { + status_ = armpl_spmm_exec_d(transA, + transB, + alpha, + A_armpl_, + B_armpl, + beta, + C_armpl_); + } else { + // Un-specialised class will not do any work - print error and exit. 
+ std::cout << "ERROR - Datatype for ArmPL CPU GEMM kernel not supported." + << std::endl; + exit(1); + } + + if (status_ != ARMPL_STATUS_SUCCESS) { + std::cout << "ERROR " << status_ << std::endl; + exit(1); + } + // Ensure compiler doesn't optimise away the work being done + callConsume(); + } + + /** Perform any required steps before calling the GEMM kernel that should + * be timed. */ + void preLoopRequirements() override {} + + /** Perform any required steps after calling the GEMM kernel that should + * be timed. */ + void postLoopRequirements() override { + status_ = armpl_spmat_destroy(A_armpl_); + if (status_ != ARMPL_STATUS_SUCCESS) { + std::cout << "ERROR " << status_ << std::endl; + exit(1); + } + status_ = armpl_spmat_destroy(B_armpl_); + if (status_ != ARMPL_STATUS_SUCCESS) { + std::cout << "ERROR " << status_ << std::endl; + exit(1); + } + status_ = armpl_spmat_destroy(C_armpl_); + if (status_ != ARMPL_STATUS_SUCCESS) { + std::cout << "ERROR " << status_ << std::endl; + exit(1); + } + } + + /** The constant value Alpha. */ + const T alpha = ALPHA; + + /** The constant value Beta. */ + const T beta = BETA; + + armpl_status_t status_; + + armpl_spmat_t armpl_A, armpl_B, armpl_C; + + @override + void toCSR() { + n_armpl_ = n_; + // ToDo -- check whether flags_ is correct! + flags_ = 0; + + // Move A to CSR + A_armpl_row_ptr_ = new armpl_int_t[n_ + 1]; + A_armpl_col_index_ = new armpl_int_t[nnz_]; + A_vals_ = new T[nnz_]; + int nnz_encountered = 0; + for (int row = 0; row < n_; row++) { + A_armpl_row_ptr_[row] = nnz_encountered; + for (int col = 0; col < n_; col++) { + if (A_[(row * n_) + col] != 0.0) { + A_armpl_col_index_[nnz_encountered] = col; + A_vals_[nnz_encountered] = A_[(row * n_) + col]; + nnz_encountered++; + } + } + } + + // Move B to CSR + B_armpl_row_ptr_ = new armpl_int_t[n_ + 1]; + B_armpl_col_index_ = new armpl_int_t[nnz_]; + B_vals_ = new T[nnz_]; + nnz_encountered = 0; + for (int row = 0; row < n_; row++) { + B_armpl_row_ptr_[row] = nnz_encountered; + for (int col = 0; col < n_; col++) { + if (B_[(row * n_) + col] != 0.0) { + B_armpl_col_index_[nnz_encountered] = col; + B_vals_[nnz_encountered] = B_[(row * n_) + col]; + nnz_encountered++; + } + } + } + + if (std::is_sam_v) { + status_ = armpl_spmat_create_csr_s(A_armpl_, + n_armpl_, + n_armpl_, + A_armpl_row_ptr_, + A_armpl_col_index_, + A_vals_, + flags); + if (status_ != ARMPL_STATUS_SUCCESS) { + std::cout << "ERROR " << status_ << std::endl; + exit(1); + } + + status_ = armpl_spmat_create_csr_s(B_armpl_, + n_armpl_, + n_armpl_, + B_armpl_row_ptr_, + B_armpl_col_index_, + B_vals_, + flags); + if (status_ != ARMPL_STATUS_SUCCESS) { + std::cout << "ERROR " << status_ << std::endl; + exit(1); + } + } else if (std::is_same_v) { + status_ = armpl_spmat_create_csr_d(A_armpl_, + n_armpl_, + n_armpl_, + A_armpl_row_ptr_, + A_armpl_col_index_, + A_vals_, + flags); + if (status_ != ARMPL_STATUS_SUCCESS) { + std::cout << "ERROR " << status_ << std::endl; + exit(1); + } + + status_ = armpl_spmat_create_csr_d(B_armpl_, + n_armpl_, + n_armpl_, + B_armpl_row_ptr_, + B_armpl_col_index_, + B_vals_, + flags); + if (status_ != ARMPL_STATUS_SUCCESS) { + std::cout << "ERROR " << status_ << std::endl; + exit(1); + } + } + + + } + + armpl_int_t flags_; + + armpl_int_t n_armpl_; + + armpl_int_t* A_armpl_row_ptr_; + armpl_int_t* A_armpl_col_index_; + armpl_int_t* B_armpl_row_ptr_; + armpl_int_t* B_armpl_col_index_; + armpl_int_t* C_armpl_row_ptr_; + armpl_int_t* C_armpl_col_index_; + + armpl_spmat_t* A_armpl_; + armpl_spmat_t* 
B_armpl_; + armpl_spmat_t* C_armpl_; + + sparse_hint_value transA = ARMPL_SPARSE_OPERATION_NOTRANS; + sparse_hint_value transB = ARMPL_SPARSE_OPERATION_NOTRANS; + +}; +} // namespace cpu +#endif \ No newline at end of file diff --git a/NVPL/sp_gemv.hh b/NVPL/sp_gemv.hh new file mode 100644 index 0000000..d04f6b8 --- /dev/null +++ b/NVPL/sp_gemv.hh @@ -0,0 +1,117 @@ +/** + * ToDo -- This is all currently written for GEMM, but NVPL does not support + * GEMM, so this needs to be adjusted to spmv -- which is supported + */ + + + + + +#pragma once + +#ifdef CPU_NVPL +#include + +#include "../include/kernels/CPU/gemm.hh" +#include "../include/utilities.hh" + +namespace cpu { +/** A class for GEMM CPU BLAS kernels. */ +template +class sp_gemm_cpu : public sp_gemm { + public: + using sp_gemm::gemm; + using sp_gemm::callConsume; + using sp_gemm::m_; + using sp_gemm::n_; + using sp_gemm::k_; + using sp_gemm::A_; + using sp_gemm::B_; + using sp_gemm::C_; + + private: + /** Make call to the GEMM kernel. */ + void callGemm() override { + + // Ensure compiler doesn't optimise away the work being done + callConsume(); + } + + /** Perform any required steps before calling the GEMM kernel that should + * be timed. */ + void preLoopRequirements() override { + // Set type enum + if constexpr (std::is_same_v) { + type_ = NVPL_SPARSE_R_32F; + } else if constexpr (std::is_same_v) { + type_ = NVPL_SPARSE_R_64F; + } else { + // Un-specialised class will not do any work - print error and exit. + std::cout << "ERROR - Datatype for NVPL sparse GEMM kernel not supported." + << std::endl; + exit(1); + } + status_ = nvpl_sparse_create(&handle_); + // Todo -- error check + + // Todo -- Make const? + status_ = nvpl_sparse_create_csr(A_nvpl_, n_, n_, nnz_, A_row_ptr_nvpl_, + A_col_index_nvpl_, A_vals_nvpl_, + index_type_, index_type_, base_, type_); + + status_ = nvpl_sparse_create_csr(B_nvpl_, n_, n_, nnz_, B_row_ptr_nvpl_, + B_col_index_nvpl_, B_vals_nvpl_, + index_type_, index_type_, base_, type_); + // Todo -- error check + + + } + + /** Perform any required steps after calling the GEMM kernel that should + * be timed. */ + void postLoopRequirements() override { + status_ = nvpl_sparse_destroy(handle_); + // Todo -- error check + status_ = nvpl_sparse_destroy_sp_mat(A_nvpl_); + status_ = nvpl_sparse_destroy_sp_mat(B_nvpl_); + status_ = nvpl_sparse_destroy_sp_mat(C_nvpl_); + } + + /** The constant value Alpha. */ + T alpha = ALPHA; + + /** The constant value Beta. 
*/ + T beta = BETA; + + /** + * Sparse metadata + */ + nvpl_sparse_status_t status_; + nvpl_sparse_handle_t handle_; + nvpl_sparse_data_type_t type_; + + nvpl_sparse_operation_t op_ = NVPL_SPARSE_OPERATION_NON_TRANSPOSE; + nvpl_sparse_index_base_t base_ = NVPL_SPARSE_INDEX_BASE_ZERO; + nvpl_sparse_format_t format_ = NVPL_SPARSE_FORMAT_CSR; + nvpl_sparse_order_t order_ = NVPL_SPARSE_ORDER_COL; + nvpl_sparse_index_type_t index_type_ = NVPL_SPARSE_INDEX_64I; + + /** + * Sparse matrix descriptors + */ + nvpl_sparse_sp_mat_descr_t* A_nvpl_; + nvpl_sparse_sp_mat_descr_t* B_nvpl_; + nvpl_sparse_sp_mat_descr_t* C_nvpl_; + + void* A_row_ptr_nvpl_; + void* B_row_ptr_nvpl_; + void* C_row_ptr_nvpl_; + void* A_col_idnex_nvpl_; + void* B_col_idnex_nvpl_; + void* C_col_idnex_nvpl_; + void* A_vals_nvpl_; + void* B_vals_nvpl_; + void* C_vals_nvpl_; +}; +} // namespace cpu +#endif \ No newline at end of file diff --git a/include/kernels/CPU/sp_gemm.hh b/include/kernels/CPU/sp_gemm.hh index 60778e7..72fd5dc 100644 --- a/include/kernels/CPU/sp_gemm.hh +++ b/include/kernels/CPU/sp_gemm.hh @@ -1,9 +1,9 @@ #pragma once -#ifdef CPU_ONEMKL #include "../gemm.hh" #include +#include namespace cpu { @@ -25,21 +25,78 @@ namespace cpu { /** Initialise the required data structures. */ virtual void initialise(int n, double sparsity, bool binary = false) { n_ = n; + sparsity_ = sparsity; + + // Note that the below should be the same as the edges calculation + // used in the initInputMatricesSparse function. If changed here, + // change there + nnz_ = 1 + (int) (n_ * n_ * (1 - sparsity_)); A_ = (T*)malloc(sizeof(T) * n_ * n_); B_ = (T*)malloc(sizeof(T) * n_ * n_); C_ = (T*)malloc(sizeof(T) * n_ * n_); - initInputMatricesSparse(sparsity); + initInputMatricesSparse(sparsity_); + + toCSR(); } private: /** Do any necessary cleanup (free pointers, close library handles, etc.) * after Kernel has been called. */ - void postCallKernelCleanup() { - free(A_); - free(B_); - free(C_); - } + void postCallKernelCleanup() { + free(A_); + free(B_); + free(C_); + } + + void toCSR() { + // Move A to CSR + A_row_ptr_ = new int[n_ + 1]; + A_col_index_ = new int[nnz_]; + A_vals_ = new T[nnz_]; + int nnz_encountered = 0; + for (int row = 0; row < n_; row++) { + A_row_ptr_[row] = nnz_encountered; + for (int col = 0; col < n_; col++) { + if (A_[(row * n_) + col] != 0.0) { + A_col_index_[nnz_encountered] = col; + A_vals_[nnz_encountered] = A_[(row * n_) + col]; + nnz_encountered++; + } + } + } + + // Move B to CSR + B_row_ptr_ = new int[n_ + 1]; + B_col_index_ = new int[nnz_]; + B_vals_ = new T[nnz_]; + nnz_encountered = 0; + for (int row = 0; row < n_; row++) { + B_row_ptr_[row] = nnz_encountered; + for (int col = 0; col < n_; col++) { + if (B_[(row * n_) + col] != 0.0) { + B_col_index_[nnz_encountered] = col; + B_vals_[nnz_encountered] = B_[(row * n_) + col]; + nnz_encountered++; + } + } + } + } + + double sparsity_; + + int nnz_; + + int* A_row_ptr_; + int* A_col_index_; + int* B_row_ptr_; + int* B_col_index_; + int* C_row_ptr_; + int* C_col_index_; + T* A_vals_; + T* B_vals_; + T* C_vals; + }; } // namespace cpu diff --git a/include/kernels/gemm.hh b/include/kernels/gemm.hh index d97fc8c..d357734 100644 --- a/include/kernels/gemm.hh +++ b/include/kernels/gemm.hh @@ -91,6 +91,9 @@ class gemm { } } + // Note that the below should be the same as the nnz calculation + // used in the cpu initialise functions. 
If changed here, + // change there void initInputMatricesSparse(float sparsity) { for (int i = 0; i < (n_ * n_); i++) { A_[i] = 0.0; @@ -200,6 +203,25 @@ class gemm { } row_ptr[n_row] = (MKL_INT)nnz_encountered; } +#endif +#ifdef CPU_AOCL + void toCSR_aocl(T* dense, int n_col, int n_row, T* vals, aoclsparse_int* + col_index, aoclsparse_int* row_ptr) { + int nnz_encountered = 0; + for (int row = 0; row < n_row; row++) { + row_ptr[row] = (aoclsparse_int)nnz_encountered; + int nnz_row = 0; + for (int col = 0; col < n_col; col++) { + if (dense[(row * n_col) + col] != 0.0) { + nnz_row++; + col_index[nnz_encountered] = (aoclsparse_int)col; + vals[nnz_encountered] = dense[(row * n_col) + col]; + nnz_encountered++; + } + } + } + row_ptr[n_row] = (MKL_INT)nnz_encountered; + } #endif /** The number of iterations to perform per problem size. */ const int iterations_; From 521cbf3d1f4f5369813732e46be11fd019a09241 Mon Sep 17 00:00:00 2001 From: Alex Cockrean <84676155+ABenC377@users.noreply.github.com> Date: Tue, 1 Oct 2024 12:00:19 +0100 Subject: [PATCH 24/32] Working changes --- .DS_Store | Bin 0 -> 8196 bytes .idea/GPU-BLAS-Offload-Benchmark.iml | 2 + .idea/codeStyles/codeStyleConfig.xml | 5 + .idea/misc.xml | 6 + .idea/modules.xml | 8 + .idea/vcs.xml | 6 + .idea/workspace.xml | 541 +++++++++++++++++++++++++++ ArmPL/sp_gemm.hh | 271 ++++++++++++-- DefaultCPU/sp_gemm.hh | 55 --- DefaultGPU/sp_gemm.hh | 54 --- Makefile | 2 +- NVPL/sp_gemv.hh | 117 ------ createGflopsGraphs.py | 5 + cuBLAS/sp_gemm.hh | 9 +- cuBLAS/sp_gemv.hh | 261 +++++++++++++ include/.DS_Store | Bin 0 -> 6148 bytes include/doGemm.hh | 46 ++- include/kernels/.DS_Store | Bin 0 -> 6148 bytes include/kernels/CPU/sp_gemm.hh | 23 +- include/kernels/CPU/sp_gemv.hh | 47 +++ include/kernels/GPU/sp_gemm.hh | 3 +- include/kernels/GPU/sp_gemv.hh | 28 ++ include/kernels/gemm.hh | 4 + include/kernels/gemv.hh | 79 ++++ 24 files changed, 1278 insertions(+), 294 deletions(-) create mode 100644 .DS_Store create mode 100644 .idea/GPU-BLAS-Offload-Benchmark.iml create mode 100644 .idea/codeStyles/codeStyleConfig.xml create mode 100644 .idea/misc.xml create mode 100644 .idea/modules.xml create mode 100644 .idea/vcs.xml create mode 100644 .idea/workspace.xml delete mode 100644 DefaultCPU/sp_gemm.hh delete mode 100644 DefaultGPU/sp_gemm.hh delete mode 100644 NVPL/sp_gemv.hh create mode 100644 cuBLAS/sp_gemv.hh create mode 100644 include/.DS_Store create mode 100644 include/kernels/.DS_Store create mode 100644 include/kernels/CPU/sp_gemv.hh create mode 100644 include/kernels/GPU/sp_gemv.hh diff --git a/.DS_Store b/.DS_Store new file mode 100644 index 0000000000000000000000000000000000000000..5e3f9bcf14470d249e0f7fdd3125325b2078e4a9 GIT binary patch literal 8196 zcmeHMO>7fK6n5{vbYWjjq# z6v^tL67*83J#*!VfCCrQLnUrVRV9ilZorKzhpMOY%}<<}wVfjZF%#{)*?Hf4Gw;ot zH<>jNvEq6-NiAF`iZ|fgS6o*+4>%9JmmU!L!N((LLDP<+GIgMmR{+oqx@AFoR5U<+O$(ZK z6a@!`DN#@*%Jdb3DRK1s8duP?qo@)mrY|2%kIeK9g~`#O-j80h z&(H*|QjOZy{XN^dWAf^}R0<*FjWy%jz=wH=(jJUkqmZgp zu|}pZAKP4W?5W(s++S*JL%z;;Mt?b-`giJyoSlKN#;0Gz_!*j^i!@8;&qjPj+lKVP z{sV8~e^~?!^PHh3)oCt?ME^jfZPCz@t;e+J*6HxtuYcW{E3l6miATA>O> zsMk?fs14s{e5x*s&6TC1JUKVhkKX3tR8%X%Z;x8*gy zQEpe->#bs?`Hgs6;5-Vp+m&FkR^3=0-9O9YcBK|qn^K?_RsmW1x)z6gqsZ6euq9>7 zis21=!^@)wH#d(==KQ1i>8+f}0qk7D*WBMpepHTFH zdhgaZ(6Y?8L!>EAFpF;nN$}&P6E9TQConsKzd!%cfv0^icA&`6w{(18ZpIOh#iEP3 zdq@Ti1khm$WY`4uGRdI7X>5-yHgSw)jUa=~Y@^vH(6|fQ_QBm(KqvH>UU>HW^xof< zg*~VpKWy?7xuYrpBv6(oSRRAX2tx5JlE5kYipr=buxWmvwrxe~Zy?Q-;L!zy{Z(v< zE3iK5v08+(X>|tL7kd*(dNzR@!lsO&^#YwsCL5WSOr0LKb_3X$`fjJRNZ%%YnC4;M 
z43(f=*jcAAVWo%wQzDDa&9Sn5^=A$x&}pQACau^yMYOPeMzm;@z3!K9L6_#3>;2Pj z-IeTech + \ No newline at end of file diff --git a/.idea/codeStyles/codeStyleConfig.xml b/.idea/codeStyles/codeStyleConfig.xml new file mode 100644 index 0000000..a55e7a1 --- /dev/null +++ b/.idea/codeStyles/codeStyleConfig.xml @@ -0,0 +1,5 @@ + + + + \ No newline at end of file diff --git a/.idea/misc.xml b/.idea/misc.xml new file mode 100644 index 0000000..830d3c8 --- /dev/null +++ b/.idea/misc.xml @@ -0,0 +1,6 @@ + + + + + + \ No newline at end of file diff --git a/.idea/modules.xml b/.idea/modules.xml new file mode 100644 index 0000000..eff3984 --- /dev/null +++ b/.idea/modules.xml @@ -0,0 +1,8 @@ + + + + + + + + \ No newline at end of file diff --git a/.idea/vcs.xml b/.idea/vcs.xml new file mode 100644 index 0000000..35eb1dd --- /dev/null +++ b/.idea/vcs.xml @@ -0,0 +1,6 @@ + + + + + + \ No newline at end of file diff --git a/.idea/workspace.xml b/.idea/workspace.xml new file mode 100644 index 0000000..b954508 --- /dev/null +++ b/.idea/workspace.xml @@ -0,0 +1,541 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + { + "associatedIndex": 2 +} + + + + { + "keyToString": { + "C/C++ File.main.cc.executor": "Run", + "RunOnceActivity.OpenProjectViewOnStart": "true", + "RunOnceActivity.ShowReadmeOnStart": "true", + "RunOnceActivity.cidr.known.project.marker": "true", + "RunOnceActivity.readMode.enableVisualFormatting": "true", + "cf.advertisement.text.has.clang-format": "true", + "cf.first.check.clang-format": "false", + "cidr.known.project.marker": "true", + "git-widget-placeholder": "sparse", + "last_opened_file_path": "/Users/no22498/Documents/GPU-BLAS-Offload-Benchmark", + "node.js.detected.package.eslint": "true", + "node.js.detected.package.tslint": "true", + "node.js.selected.package.eslint": "(autodetect)", + "node.js.selected.package.tslint": "(autodetect)", + "nodejs_package_manager_path": "npm", + "settings.editor.selected.configurable": "preferences.lookFeel", + "structure.view.defaults.are.configured": "true", + "vue.rearranger.settings.migration": "true" + } +} + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 1705671236426 + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/ArmPL/sp_gemm.hh b/ArmPL/sp_gemm.hh index aba5814..47b0bf9 100644 --- a/ArmPL/sp_gemm.hh +++ b/ArmPL/sp_gemm.hh @@ -16,7 +16,7 @@ namespace cpu { template class sp_gemm_cpu : public sp_gemm { public: - using sp_gemm::gemm; + using sp_gemm::sp_gemm; using sp_gemm::callConsume; using sp_gemm::m_; using sp_gemm::n_; @@ -24,6 +24,7 @@ class sp_gemm_cpu : public sp_gemm { using sp_gemm::A_; using sp_gemm::B_; using sp_gemm::C_; + using sp_gemm::nnz_; private: /** Make call to the GEMM kernel. 
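   * Runs armpl_spmm_exec_s or armpl_spmm_exec_d on the CSR handles that
   * preLoopRequirements() builds via toCSR_armpl().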
*/ @@ -52,22 +53,23 @@ class sp_gemm_cpu : public sp_gemm { - if (std::is_same_v) { - status_ = armpl_spmm_exec_s(transA, - transB, + if constexpr (std::is_same_v) { + status_ = armpl_spmm_exec_s(transA_, + transB_, alpha, - A_armpl_, - B_armpl, + *A_armpl_, + *B_armpl_, beta, - C_armpl_); + *B_armpl_); } else if constexpr (std::is_same_v) { - status_ = armpl_spmm_exec_d(transA, - transB, + std::cout << "About to execute dgemm" << std::endl; + status_ = armpl_spmm_exec_d(transA_, + transB_, alpha, - A_armpl_, - B_armpl, + *A_armpl_, + *B_armpl_, beta, - C_armpl_); + *B_armpl_); } else { // Un-specialised class will not do any work - print error and exit. std::cout << "ERROR - Datatype for ArmPL CPU GEMM kernel not supported." @@ -85,26 +87,42 @@ class sp_gemm_cpu : public sp_gemm { /** Perform any required steps before calling the GEMM kernel that should * be timed. */ - void preLoopRequirements() override {} + void preLoopRequirements() override { + // Need to put A_ and B_ into A_armpl_ and B_armpl_ + // ToDo -- Error catching + toCSR_armpl(); +// std::cout << "toCSR_armpl() wrapped up without a problem" << std::endl; + } /** Perform any required steps after calling the GEMM kernel that should * be timed. */ void postLoopRequirements() override { - status_ = armpl_spmat_destroy(A_armpl_); - if (status_ != ARMPL_STATUS_SUCCESS) { - std::cout << "ERROR " << status_ << std::endl; - exit(1); - } - status_ = armpl_spmat_destroy(B_armpl_); + status_ = armpl_spmat_destroy(*A_armpl_); if (status_ != ARMPL_STATUS_SUCCESS) { std::cout << "ERROR " << status_ << std::endl; exit(1); } - status_ = armpl_spmat_destroy(C_armpl_); + status_ = armpl_spmat_destroy(*B_armpl_); if (status_ != ARMPL_STATUS_SUCCESS) { std::cout << "ERROR " << status_ << std::endl; exit(1); } +// status_ = armpl_spmat_destroy(*C_armpl_); +// if (status_ != ARMPL_STATUS_SUCCESS) { +// std::cout << "ERROR " << status_ << std::endl; +// exit(1); +// } + +// delete [] A_armpl_row_ptr_; +// delete [] A_armpl_col_index_; +// delete [] A_vals_; +// delete [] B_armpl_row_ptr_; +// delete [] B_armpl_col_index_; +// delete [] B_vals_; +// delete [] C_armpl_row_ptr_; +// delete [] C_armpl_col_index_; +// delete [] C_vals_; + } /** The constant value Alpha. */ @@ -117,8 +135,7 @@ class sp_gemm_cpu : public sp_gemm { armpl_spmat_t armpl_A, armpl_B, armpl_C; - @override - void toCSR() { + void toCSR_armpl() { n_armpl_ = n_; // ToDo -- check whether flags_ is correct! 
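    // For reference, armpl_spmat_create_csr_[sd]() expects the standard
    // three-array CSR layout: row_ptr has n_ + 1 entries, row_ptr[0] == 0,
    // row_ptr[i + 1] holds the running non-zero count once row i has been
    // scanned, and row_ptr[n_] == nnz_.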
flags_ = 0; @@ -127,85 +144,265 @@ class sp_gemm_cpu : public sp_gemm { A_armpl_row_ptr_ = new armpl_int_t[n_ + 1]; A_armpl_col_index_ = new armpl_int_t[nnz_]; A_vals_ = new T[nnz_]; + A_armpl_row_ptr_[0] = 0; + int nnz_encountered = 0; +// std::cout << "About to load A into csr" << std::endl; for (int row = 0; row < n_; row++) { - A_armpl_row_ptr_[row] = nnz_encountered; +// std::cout << "\tRow " << (row + 1) << " = " << nnz_encountered << std::endl; + A_armpl_row_ptr_[row + 1] = nnz_encountered; for (int col = 0; col < n_; col++) { if (A_[(row * n_) + col] != 0.0) { +// std::cout << "\t\tCol " << col << " = " << A_[(row * n_) + col] << +// std::endl; A_armpl_col_index_[nnz_encountered] = col; - A_vals_[nnz_encountered] = A_[(row * n_) + col]; + A_vals_[nnz_encountered] = static_cast(A_[(row * n_) + col]); nnz_encountered++; +// std::cout << "\tnnz_encountered = " << nnz_encountered << std::endl; } } } +// std::cout << "___A =" << std::endl << "\t\t["; +// for (int i = 0; i < (n_ + 1); i++) { +// if (i != 0) { +// std::cout << ", "; +// } +// std::cout << A_armpl_row_ptr_[i]; +// } +// std::cout << "]" << std::endl << "\t\t["; +// for (int i = 0; i < nnz_; i++) { +// if (i != 0) { +// std::cout << ", "; +// } +// std::cout << A_armpl_col_index_[i]; +// } +// std::cout << "]" << std::endl << "\t\t["; +// for (int i = 0; i < nnz_; i++) { +// if (i != 0) { +// std::cout << ", "; +// } +// std::cout << A_vals_[i]; +// } +// std::cout << "]" << std::endl; + + +// std::cout << "About to load B into csr" << std::endl; + // Move B to CSR B_armpl_row_ptr_ = new armpl_int_t[n_ + 1]; B_armpl_col_index_ = new armpl_int_t[nnz_]; B_vals_ = new T[nnz_]; + B_armpl_row_ptr_[0] = 0; + nnz_encountered = 0; for (int row = 0; row < n_; row++) { - B_armpl_row_ptr_[row] = nnz_encountered; +// std::cout << "\tRow " << (row + 1) << " = " << nnz_encountered << +// std::endl; + B_armpl_row_ptr_[row + 1] = nnz_encountered; for (int col = 0; col < n_; col++) { if (B_[(row * n_) + col] != 0.0) { +// std::cout << "\t\tCol " << col << " = " << B_[(row * n_) + col] << std::endl; B_armpl_col_index_[nnz_encountered] = col; - B_vals_[nnz_encountered] = B_[(row * n_) + col]; + B_vals_[nnz_encountered] = static_cast(B_[(row * n_) + col]); nnz_encountered++; +// std::cout << "\tnnz_encountered = " << nnz_encountered << std::endl; } } } +// std::cout << "___B =" << std::endl << "\t\t["; +// for (int i = 0; i < (n_ + 1); i++) { +// if (i != 0) { +// std::cout << ", "; +// } +// std::cout << B_armpl_row_ptr_[i]; +// } +// std::cout << "]" << std::endl << "\t\t["; +// for (int i = 0; i < nnz_; i++) { +// if (i != 0) { +// std::cout << ", "; +// } +// std::cout << B_armpl_col_index_[i]; +// } +// std::cout << "]" << std::endl << "\t\t["; +// for (int i = 0; i < nnz_; i++) { +// if (i != 0) { +// std::cout << ", "; +// } +// std::cout << B_vals_[i]; +// } +// std::cout << "]" << std::endl; + + +// // Move B to CSR +// C_armpl_row_ptr_ = new armpl_int_t[n_ + 1]; +// C_armpl_col_index_ = new armpl_int_t[nnz_]; +// C_vals_ = new T[nnz_]; +// C_armpl_row_ptr_[0] = 0; +// +// nnz_encountered = 0; +//// std::cout << "About to load C into csr" << std::endl; +// for (int row = 0; row < n_; row++) { +//// std::cout << "\tRow " << (row + 1) << " = " << nnz_encountered << std::endl; +// C_armpl_row_ptr_[row + 1] = nnz_encountered; +// for (int col = 0; col < n_; col++) { +// if (A_[(row * n_) + col] != 0.0) { +// C_armpl_col_index_[nnz_encountered] = col; +// C_vals_[nnz_encountered] = A_[(row * n_) + col]; +// nnz_encountered++; +//// 
std::cout << "\t\tCol " << col << " = " << C_vals_[nnz_encountered] << +//// std::endl; +//// std::cout << "\tnnz_encountered = " << nnz_encountered << std::endl; +// } +// } +// } + +// std::cout << "___C =" << std::endl << "\t\t["; +// for (int i = 0; i < (n_ + 1); i++) { +// if (i != 0) { +// std::cout << ", "; +// } +// std::cout << C_armpl_row_ptr_[i]; +// } +// std::cout << "]" << std::endl << "\t\t["; +// for (int i = 0; i < nnz_; i++) { +// if (i != 0) { +// std::cout << ", "; +// } +// std::cout << C_armpl_col_index_[i]; +// } +// std::cout << "]" << std::endl << "\t\t["; +// for (int i = 0; i < nnz_; i++) { +// if (i != 0) { +// std::cout << ", "; +// } +// std::cout << C_vals_[i]; +// } +// std::cout << "]" << std::endl; + + + +// std::cout << "Loading csr A into armpl storage formats" << std::endl; + if constexpr (std::is_same_v) { + std::cout << "\tn_armpl_ = " << n_armpl_ << std::endl; + std::cout << "\tA_armpl_row_ptr_ (size = " << sizeof + (A_armpl_row_ptr_[0]) << ") = [" << A_armpl_row_ptr_[0]; + for (int i = 1; i < (n_ + 1); i++) { + std::cout << ", " << A_armpl_row_ptr_[i]; + } + std::cout << "]" << std::endl << "\tA_armpl_col_index_ (size = " << + sizeof(A_armpl_col_index_[0]) << ") = [" << + A_armpl_col_index_[0]; + for (int i = 1; i < nnz_; i++) { + std::cout << ", " << A_armpl_col_index_[i]; + } + std::cout << "]" << std::endl << "\tA_vals_ (size = " << sizeof + (A_vals_[0]) << ") = [" << A_vals_[0]; + for (int i = 1; i < nnz_; i++) { + std::cout << ", " << A_vals_[i]; + } + std::cout << "]" << std::endl << "flags: " << flags_ << std::endl; - if (std::is_sam_v) { status_ = armpl_spmat_create_csr_s(A_armpl_, n_armpl_, n_armpl_, A_armpl_row_ptr_, A_armpl_col_index_, A_vals_, - flags); + flags_); if (status_ != ARMPL_STATUS_SUCCESS) { std::cout << "ERROR " << status_ << std::endl; exit(1); } +// std::cout << "Loading csr C into armpl storage formats" << std::endl; +// status_ = armpl_spmat_create_csr_s(C_armpl_, +// n_armpl_, +// n_armpl_, +// C_armpl_row_ptr_, +// C_armpl_col_index_, +// C_vals_, +// flags_); +// if (status_ != ARMPL_STATUS_SUCCESS) { +// std::cout << "ERROR " << status_ << std::endl; +// exit(1); +// } + +// std::cout << "Loading csr B into armpl storage formats" << std::endl; status_ = armpl_spmat_create_csr_s(B_armpl_, n_armpl_, n_armpl_, B_armpl_row_ptr_, B_armpl_col_index_, B_vals_, - flags); + flags_); if (status_ != ARMPL_STATUS_SUCCESS) { std::cout << "ERROR " << status_ << std::endl; exit(1); } - } else if (std::is_same_v) { + } else if constexpr (std::is_same_v) { + std::cout << "\tn_armpl_ = " << n_armpl_ << std::endl; + std::cout << "\tA_armpl_row_ptr_ (size = " << sizeof + (A_armpl_row_ptr_[0]) << ") = [" << A_armpl_row_ptr_[0]; + for (int i = 1; i < (n_ + 1); i++) { + std::cout << ", " << A_armpl_row_ptr_[i]; + } + std::cout << "]" << std::endl << "\tA_armpl_col_index_ (size = " << + sizeof(A_armpl_col_index_[0]) << ") = [" << + A_armpl_col_index_[0]; + for (int i = 1; i < nnz_; i++) { + std::cout << ", " << A_armpl_col_index_[i]; + } + std::cout << "]" << std::endl << "\tA_vals_ (size = " << sizeof + (A_vals_[0]) << ") = [" << A_vals_[0]; + for (int i = 1; i < nnz_; i++) { + std::cout << ", " << A_vals_[i]; + } + std::cout << "]" << std::endl << "flags: " << flags_ << std::endl; + + + std::cout << "About to create CSR A (double)" << std::endl; status_ = armpl_spmat_create_csr_d(A_armpl_, n_armpl_, n_armpl_, A_armpl_row_ptr_, A_armpl_col_index_, A_vals_, - flags); + flags_); if (status_ != ARMPL_STATUS_SUCCESS) { std::cout << "ERROR " << 
status_ << std::endl; exit(1); } +// std::cout << "Loading csr C into armpl storage formats" << std::endl; +// status_ = armpl_spmat_create_csr_d(C_armpl_, +// n_armpl_, +// n_armpl_, +// C_armpl_row_ptr_, +// C_armpl_col_index_, +// C_vals_, +// flags_); +// if (status_ != ARMPL_STATUS_SUCCESS) { +// std::cout << "ERROR " << status_ << std::endl; +// exit(1); +// } + +// std::cout << "Loading csr B into armpl storage formats" << std::endl; + std::cout << "About to create CSR B (double)" << std::endl; status_ = armpl_spmat_create_csr_d(B_armpl_, n_armpl_, n_armpl_, B_armpl_row_ptr_, B_armpl_col_index_, B_vals_, - flags); + flags_); if (status_ != ARMPL_STATUS_SUCCESS) { std::cout << "ERROR " << status_ << std::endl; exit(1); } } - +// std::cout << "Okay, all matrices made!!" << std::endl; } armpl_int_t flags_; @@ -219,12 +416,16 @@ class sp_gemm_cpu : public sp_gemm { armpl_int_t* C_armpl_row_ptr_; armpl_int_t* C_armpl_col_index_; + T* A_vals_; + T* B_vals_; + T* C_vals_; + armpl_spmat_t* A_armpl_; armpl_spmat_t* B_armpl_; armpl_spmat_t* C_armpl_; - sparse_hint_value transA = ARMPL_SPARSE_OPERATION_NOTRANS; - sparse_hint_value transB = ARMPL_SPARSE_OPERATION_NOTRANS; + armpl_sparse_hint_value transA_ = ARMPL_SPARSE_OPERATION_NOTRANS; + armpl_sparse_hint_value transB_ = ARMPL_SPARSE_OPERATION_NOTRANS; }; } // namespace cpu diff --git a/DefaultCPU/sp_gemm.hh b/DefaultCPU/sp_gemm.hh deleted file mode 100644 index d7ecb37..0000000 --- a/DefaultCPU/sp_gemm.hh +++ /dev/null @@ -1,55 +0,0 @@ -#pragma once - -#if defined CPU_DEFAULT - -#include "../include/kernels/CPU/sp_gemm.hh" -#include "../include/utilities.hh" - -namespace cpu { -/** A class for GEMM CPU BLAS kernels. */ -template -class sp_gemm_cpu : public sp_gemm { - public: - using sp_gemm::sp_gemm; - using sp_gemm::callConsume; - using sp_gemm::m_; - using sp_gemm::n_; - using sp_gemm::k_; - using sp_gemm::A_; - using sp_gemm::B_; - using sp_gemm::C_; - - private: - /** Perform the GEMM kernel. */ - void callGemm() override { - /** A naive implementation of a column-major GEMM. Alpha and Beta are always - * 1 and 0 respectively. - * Operation takes the form of C[M,N] = A[M,K] * B[K,N]. - * callConsume() is required to ensure that the compiler does not optimise - * away this function. */ - int x, y, z; - T acc; - for (x = 0; x < m_; x++) { - for (y = 0; y < n_; y++) { - acc = 0.0; - for (z = 0; z < k_; z++) { - acc += A_[z * m_ + x] * B_[y * k_ + z]; - } - C_[y * m_ + x] = acc; - } - } - // Ensure compiler doesn't optimise away the work being done - callConsume(); - } - - /** Perform any required steps before calling the GEMM kernel that should - * be timed. */ - void preLoopRequirements() override {} - - /** Perform any required steps after calling the GEMM kernel that should - * be timed. */ - void postLoopRequirements() override {} -}; - -} // namespace cpu -#endif diff --git a/DefaultGPU/sp_gemm.hh b/DefaultGPU/sp_gemm.hh deleted file mode 100644 index 2a9f478..0000000 --- a/DefaultGPU/sp_gemm.hh +++ /dev/null @@ -1,54 +0,0 @@ -#pragma once - -#if defined GPU_DEFAULT - -#include - -#include "../include/kernels/GPU/sp_gemm.hh" -#include "../include/utilities.hh" - -namespace gpu { -/** A class for GEMM GPU BLAS kernels. */ -template -class sp_gemm_gpu : public sp_gemm { - public: - using sp_gemm::sp_gemm; - - /** Call the BLAS kernel n times, with 1 warmup run. - * Returns the time elapsed for n BLAS calls in seconds. */ - time_checksum_gflop compute() { - // Override function in base `kernel` class as DefaultGPU should do nothing. 
- return {INFINITY, INFINITY, 0.0}; - } - - /** Initialise the required data structures. */ - void initialise(gpuOffloadType offload, int n, float sparsity) override { - // Default GPU implementation - do nothing. - } - - private: - /** Make a call to the BLAS Library Kernel. */ - void callGemm() override { - // Default GPU implementation - do nothing. - } - - /** Perform any required steps before calling the GEMM kernel that should - * be timed. */ - void preLoopRequirements() override { - // Default GPU implementation - do nothing. - } - - /** Perform any required steps after calling the GEMM kernel that should - * be timed. */ - void postLoopRequirements() override { - // Default GPU implementation - do nothing. - } - - /** Do any necessary cleanup (free pointers, close library handles, etc.) - * after Kernel has been called. */ - void postCallKernelCleanup() override { - // Default GPU implementation - do nothing. - } -}; -} // namespace gpu -#endif \ No newline at end of file diff --git a/Makefile b/Makefile index bff0add..e5091e0 100644 --- a/Makefile +++ b/Makefile @@ -170,7 +170,7 @@ $(warning GPU_LIB not set (use CUBLAS, ONEMKL, ROCBLAS). No GPU kernels will be else ifeq ($(GPU_LIB), CUBLAS) # Do cuBLAS stuff ifeq ($(COMPILER), NVIDIA) -override CXXFLAGS += -cudalib=cublas +override CXXFLAGS += -cudalib=cublas -lcusparse_static else $(warning Users may be required to do the following to use $(COMPILER) with $(GPU_LIB):) $(info $(TAB)$(TAB)Add `CXXFLAGS=-L/.../math_libs/lib64 -L/.../cuda/lib64` to make command) diff --git a/NVPL/sp_gemv.hh b/NVPL/sp_gemv.hh deleted file mode 100644 index d04f6b8..0000000 --- a/NVPL/sp_gemv.hh +++ /dev/null @@ -1,117 +0,0 @@ -/** - * ToDo -- This is all currently written for GEMM, but NVPL does not support - * GEMM, so this needs to be adjusted to spmv -- which is supported - */ - - - - - -#pragma once - -#ifdef CPU_NVPL -#include - -#include "../include/kernels/CPU/gemm.hh" -#include "../include/utilities.hh" - -namespace cpu { -/** A class for GEMM CPU BLAS kernels. */ -template -class sp_gemm_cpu : public sp_gemm { - public: - using sp_gemm::gemm; - using sp_gemm::callConsume; - using sp_gemm::m_; - using sp_gemm::n_; - using sp_gemm::k_; - using sp_gemm::A_; - using sp_gemm::B_; - using sp_gemm::C_; - - private: - /** Make call to the GEMM kernel. */ - void callGemm() override { - - // Ensure compiler doesn't optimise away the work being done - callConsume(); - } - - /** Perform any required steps before calling the GEMM kernel that should - * be timed. */ - void preLoopRequirements() override { - // Set type enum - if constexpr (std::is_same_v) { - type_ = NVPL_SPARSE_R_32F; - } else if constexpr (std::is_same_v) { - type_ = NVPL_SPARSE_R_64F; - } else { - // Un-specialised class will not do any work - print error and exit. - std::cout << "ERROR - Datatype for NVPL sparse GEMM kernel not supported." - << std::endl; - exit(1); - } - status_ = nvpl_sparse_create(&handle_); - // Todo -- error check - - // Todo -- Make const? - status_ = nvpl_sparse_create_csr(A_nvpl_, n_, n_, nnz_, A_row_ptr_nvpl_, - A_col_index_nvpl_, A_vals_nvpl_, - index_type_, index_type_, base_, type_); - - status_ = nvpl_sparse_create_csr(B_nvpl_, n_, n_, nnz_, B_row_ptr_nvpl_, - B_col_index_nvpl_, B_vals_nvpl_, - index_type_, index_type_, base_, type_); - // Todo -- error check - - - } - - /** Perform any required steps after calling the GEMM kernel that should - * be timed. 
*/ - void postLoopRequirements() override { - status_ = nvpl_sparse_destroy(handle_); - // Todo -- error check - status_ = nvpl_sparse_destroy_sp_mat(A_nvpl_); - status_ = nvpl_sparse_destroy_sp_mat(B_nvpl_); - status_ = nvpl_sparse_destroy_sp_mat(C_nvpl_); - } - - /** The constant value Alpha. */ - T alpha = ALPHA; - - /** The constant value Beta. */ - T beta = BETA; - - /** - * Sparse metadata - */ - nvpl_sparse_status_t status_; - nvpl_sparse_handle_t handle_; - nvpl_sparse_data_type_t type_; - - nvpl_sparse_operation_t op_ = NVPL_SPARSE_OPERATION_NON_TRANSPOSE; - nvpl_sparse_index_base_t base_ = NVPL_SPARSE_INDEX_BASE_ZERO; - nvpl_sparse_format_t format_ = NVPL_SPARSE_FORMAT_CSR; - nvpl_sparse_order_t order_ = NVPL_SPARSE_ORDER_COL; - nvpl_sparse_index_type_t index_type_ = NVPL_SPARSE_INDEX_64I; - - /** - * Sparse matrix descriptors - */ - nvpl_sparse_sp_mat_descr_t* A_nvpl_; - nvpl_sparse_sp_mat_descr_t* B_nvpl_; - nvpl_sparse_sp_mat_descr_t* C_nvpl_; - - void* A_row_ptr_nvpl_; - void* B_row_ptr_nvpl_; - void* C_row_ptr_nvpl_; - void* A_col_idnex_nvpl_; - void* B_col_idnex_nvpl_; - void* C_col_idnex_nvpl_; - void* A_vals_nvpl_; - void* B_vals_nvpl_; - void* C_vals_nvpl_; -}; -} // namespace cpu -#endif \ No newline at end of file diff --git a/createGflopsGraphs.py b/createGflopsGraphs.py index d323162..07ac243 100644 --- a/createGflopsGraphs.py +++ b/createGflopsGraphs.py @@ -123,6 +123,11 @@ inputTypeStr = "Square x Short-Wide (M=K=32, N)" for j in range(0, len(mnk)): xVals.append(mnk[j][1]) + elif "_sparse_square" in gemmFilenames[i]: + x_name = "Value of M, N, K" + inputTypeStr = "Sparse square matrices" + for j in range(0, len(mnk)): + xVals.append(mnk[j][0]) else: # File not supported so go to next file continue diff --git a/cuBLAS/sp_gemm.hh b/cuBLAS/sp_gemm.hh index d849d22..b5e8d93 100644 --- a/cuBLAS/sp_gemm.hh +++ b/cuBLAS/sp_gemm.hh @@ -1,8 +1,7 @@ #pragma once #ifdef GPU_CUBLAS -#include "cusparse.h" -#include +#include #include #include #include @@ -13,13 +12,13 @@ #include "common.hh" namespace gpu { -/** A class for GEMM GPU BLAS kernels. */ +/** A class for sparse GEMM GPU BLAS kernels. */ template class sp_gemm_gpu : public sp_gemm { public: using sp_gemm::sp_gemm; using sp_gemm::initInputMatricesSparse; - using sp_gemm::toCSR; + using sp_gemm::toCSR_int; using sp_gemm::n_; using sp_gemm::A_; using sp_gemm::B_; @@ -44,7 +43,7 @@ class sp_gemm_gpu : public sp_gemm { std::cout << "INVALID DATA TYPE PASSED TO cuSPARSE" << std::endl; exit(1); } - n_ = 100 * n; + n_ = n; // Get device identifier cudaCheckError(cudaGetDevice(&gpuDevice_)); diff --git a/cuBLAS/sp_gemv.hh b/cuBLAS/sp_gemv.hh new file mode 100644 index 0000000..8027746 --- /dev/null +++ b/cuBLAS/sp_gemv.hh @@ -0,0 +1,261 @@ +//#pragma once +// +//#ifdef GPU_CUBLAS +//#include +//#include +//#include +//#include +//#include +//#include +//#include +// +//#include "../include/kernels/GPU/sp_gemv.hh" +//#include "../include/utilities.hh" +//#include "common.hh" +// +//namespace gpu { +///** A class for sparse GEMV GPU BLAS kernels. 
*/ +//template +//class gemv_gpu : public gemv { +// public: +// using gemv::gemv; +// using gemv::initInputMatrixVector; +// using gemv::m_; +// using gemv::n_; +// using gemv::A_; +// using gemv::x_; +// using gemv::y_; +// using gemv::offload_; +// using gemv::vecIncrement_; +// +// ~gemv_gpu() { +// if (alreadyInitialised_) { +// // Destroy the handle +// cublasCheckError(cublasDestroy(handle_)); +// +// // Destroy streams after use +// cudaCheckError(cudaStreamDestroy(s1_)); +// cudaCheckError(cudaStreamDestroy(s2_)); +// cudaCheckError(cudaStreamDestroy(s3_)); +// } +// } +// +// /** Initialise the required data structures. +// * `offload` refers to the data offload type: +// * - Once: Move data from host to device before all iterations & move from +// * device to host after all iterations +// * - Always: Move data from host to device and device to host each iteration +// * - Unified: Initialise data as unified memory; no data movement semantics +// * required */ +// void initialise(gpuOffloadType offload, int m, int n) override { +// if (!alreadyInitialised_) { +// alreadyInitialised_ = true; +// // Perform set-up which doesn't need to happen every problem size change. +// // Create a handle for CUBLAS +// cublasCheckError(cublasCreate(&handle_)); +// +// // Get device identifier +// cudaCheckError(cudaGetDevice(&gpuDevice_)); +// +// // Initialise 3 streams to asynchronously move data between host and +// // device +// cudaCheckError(cudaStreamCreate(&s1_)); +// cudaCheckError(cudaStreamCreate(&s2_)); +// cudaCheckError(cudaStreamCreate(&s3_)); +// } +// +// offload_ = offload; +// m_ = m; +// n_ = n; +// +// if (offload_ == gpuOffloadType::unified) { +// cudaCheckError(cudaMallocManaged(&A_, sizeof(T) * m_ * n_)); +// cudaCheckError(cudaMallocManaged(&x_, sizeof(T) * n_)); +// cudaCheckError(cudaMallocManaged(&y_, sizeof(T) * m_)); +// } else { +// // Allocate matrices on host +// cudaCheckError(cudaMallocHost((void**)&A_, sizeof(T) * m_ * n_)); +// cudaCheckError(cudaMallocHost((void**)&x_, sizeof(T) * n_)); +// cudaCheckError(cudaMallocHost((void**)&y_, sizeof(T) * m_)); +// // Allocate matrices on device +// cudaCheckError(cudaMalloc((void**)&A_device_, sizeof(T) * m_ * n_)); +// cudaCheckError(cudaMalloc((void**)&x_device_, sizeof(T) * n_)); +// cudaCheckError(cudaMalloc((void**)&y_device_, sizeof(T) * m_)); +// } +// +// // Initialise the host data structures +// initInputMatrixVector(); +// } +// +// private: +// /** Perform any required steps before calling the GEMV kernel that should +// * be timed. */ +// void preLoopRequirements() override { +// switch (offload_) { +// case gpuOffloadType::always: { +// // Offload data each iteration - no requirements +// break; +// } +// case gpuOffloadType::once: { +// // Offload input data from host to the device. 
+// cudaCheckError(cudaMemcpyAsync(A_device_, A_, sizeof(T) * m_ * n_, +// cudaMemcpyHostToDevice, s1_)); +// cudaCheckError(cudaMemcpyAsync(x_device_, x_, sizeof(T) * n_, +// cudaMemcpyHostToDevice, s2_)); +// cudaCheckError(cudaMemcpyAsync(y_device_, y_, sizeof(T) * m_, +// cudaMemcpyHostToDevice, s3_)); +// break; +// } +// case gpuOffloadType::unified: { +// // Prefetch input data to device +// cudaCheckError( +// cudaMemPrefetchAsync(A_, sizeof(T) * m_ * n_, gpuDevice_, s1_)); +// cudaCheckError( +// cudaMemPrefetchAsync(x_, sizeof(T) * n_, gpuDevice_, s2_)); +// cudaCheckError( +// cudaMemPrefetchAsync(y_, sizeof(T) * m_, gpuDevice_, s3_)); +// break; +// } +// } +// } +// +// /** Make a call to the BLAS Library Kernel. */ +// void callGemv() override { +// switch (offload_) { +// case gpuOffloadType::always: { +// // Offload input data from host to the device. +// cudaCheckError(cudaMemcpyAsync(A_device_, A_, sizeof(T) * m_ * n_, +// cudaMemcpyHostToDevice, s1_)); +// cudaCheckError(cudaMemcpyAsync(x_device_, x_, sizeof(T) * n_, +// cudaMemcpyHostToDevice, s2_)); +// cudaCheckError(cudaMemcpyAsync(y_device_, y_, sizeof(T) * m_, +// cudaMemcpyHostToDevice, s3_)); +// // Call cuBLAS GEMV kernel +// if constexpr (std::is_same_v) { +// cublasCheckError(cublasSgemv( +// handle_, CUBLAS_OP_N, m_, n_, &alpha, A_device_, std::max(1, m_), +// x_device_, vecIncrement_, &beta, y_device_, vecIncrement_)); +// } else if constexpr (std::is_same_v) { +// cublasCheckError(cublasDgemv( +// handle_, CUBLAS_OP_N, m_, n_, &alpha, A_device_, std::max(1, m_), +// x_device_, vecIncrement_, &beta, y_device_, vecIncrement_)); +// } +// // Offload output data from device to host +// cudaCheckError(cudaMemcpyAsync(y_, y_device_, sizeof(T) * m_, +// cudaMemcpyDeviceToHost, s3_)); +// // Ensure device has finished all work. +// cudaCheckError(cudaDeviceSynchronize()); +// break; +// } +// case gpuOffloadType::once: { +// // Call cuBLAS GEMV kernel +// if constexpr (std::is_same_v) { +// cublasCheckError(cublasSgemv( +// handle_, CUBLAS_OP_N, m_, n_, &alpha, A_device_, std::max(1, m_), +// x_device_, vecIncrement_, &beta, y_device_, vecIncrement_)); +// } else if constexpr (std::is_same_v) { +// cublasCheckError(cublasDgemv( +// handle_, CUBLAS_OP_N, m_, n_, &alpha, A_device_, std::max(1, m_), +// x_device_, vecIncrement_, &beta, y_device_, vecIncrement_)); +// } +// break; +// } +// case gpuOffloadType::unified: { +// // Call cuBLAS GEMV kernel +// if constexpr (std::is_same_v) { +// cublasCheckError(cublasSgemv(handle_, CUBLAS_OP_N, m_, n_, &alpha, A_, +// std::max(1, m_), x_, vecIncrement_, +// &beta, y_, vecIncrement_)); +// } else if constexpr (std::is_same_v) { +// cublasCheckError(cublasDgemv(handle_, CUBLAS_OP_N, m_, n_, &alpha, A_, +// std::max(1, m_), x_, vecIncrement_, +// &beta, y_, vecIncrement_)); +// } +// break; +// } +// } +// } +// +// /** Perform any required steps after calling the GEMV kernel that should +// * be timed. */ +// void postLoopRequirements() override { +// switch (offload_) { +// case gpuOffloadType::always: { +// // Offload data each iteration - no requirements +// break; +// } +// case gpuOffloadType::once: { +// // Offload output data from device to host +// cudaCheckError(cudaMemcpyAsync(y_, y_device_, sizeof(T) * m_, +// cudaMemcpyDeviceToHost, s3_)); +// // Ensure device has finished all work. 
+// cudaCheckError(cudaDeviceSynchronize()); +// break; +// } +// case gpuOffloadType::unified: { +// // Ensure all output data resides on host once work has completed +// cudaCheckError( +// cudaMemPrefetchAsync(y_, sizeof(T) * m_, cudaCpuDeviceId, s3_)); +// // Ensure device has finished all work. +// cudaCheckError(cudaDeviceSynchronize()); +// break; +// } +// } +// } +// +// /** Do any necessary cleanup (free pointers, close library handles, etc.) +// * after Kernel has been called. */ +// void postCallKernelCleanup() override { +// if (offload_ == gpuOffloadType::unified) { +// cudaFree(A_); +// cudaFree(x_); +// cudaFree(y_); +// } else { +// // Free the memory held on host and device +// cudaFreeHost((void*)A_); +// cudaFreeHost((void*)x_); +// cudaFreeHost((void*)y_); +// cudaFree(A_device_); +// cudaFree(x_device_); +// cudaFree(y_device_); +// } +// } +// +// /** Whether the initialise function has been called before. */ +// bool alreadyInitialised_ = false; +// +// /** Handle used when calling cuBLAS. */ +// cublasHandle_t handle_; +// +// /** CUDA Stream 1 - used to asynchronously move data between host and device. +// */ +// cudaStream_t s1_; +// +// /** CUDA Stream 2 - used to asynchronously move data between host and device. +// */ +// cudaStream_t s2_; +// +// /** CUDA Stream 3 - used to asynchronously move data between host and device. +// */ +// cudaStream_t s3_; +// +// /** The ID of the target GPU Device. */ +// int gpuDevice_; +// +// /** Input matrix A, held on the device. */ +// T* A_device_; +// +// /** Input vector x, held on the device. */ +// T* x_device_; +// +// /** Input vector y, held on the device. */ +// T* y_device_; +// +// /** The constant value Alpha. */ +// const T alpha = ALPHA; +// +// /** The constant value Beta. */ +// const T beta = BETA; +//}; +//} // namespace gpu +//#endif \ No newline at end of file diff --git a/include/.DS_Store b/include/.DS_Store new file mode 100644 index 0000000000000000000000000000000000000000..869e02c3a673dee3916dd63df65263ee873d8adc GIT binary patch literal 6148 zcmeHKOG*Pl5UtjL18%a@WnUp%S8W(ikPApmP;lY|#r@#cyLb!16L=oqtEvQs^umoI zQUzVFy1J^X=fU(xMAH0uH4~YNNP|X9G%7-Ob?C^0C%~k0tfiBu?sm4g=_?ccMHkn8 zBKNYEM|ptWuYa?(Ag|FXu=$o?PIfGggCRyB z$x?xqn*528EFJ#ram8^kv~)>Y8S{AM-Qy)`b@;P}ODcw;gMnaR%)qgAr#%0!@XJ&m z`Qw!61p~prKVu+G+C@9ZNBP-$@OeCIGuky8g>eH72<*`%03Gfl=Q?QPnKt5z<6y{H S=+|^$Tm+PmP{F`2Fz^NPk}u={ literal 0 HcmV?d00001 diff --git a/include/doGemm.hh b/include/doGemm.hh index e264273..a33ef7e 100644 --- a/include/doGemm.hh +++ b/include/doGemm.hh @@ -8,6 +8,7 @@ #if defined CPU_ARMPL #include "../ArmPL/gemm.hh" +#include "../ArmPL/sp_gemm.hh" #elif defined CPU_ONEMKL #include "../oneMKL/CPU/gemm.hh" #elif defined CPU_AOCL @@ -62,7 +63,9 @@ class doGemm { /** Run all problem types and write data to CSV files. */ void collectData() { - if (doDense_) { + // ToDo -- I've hard coded false here as kernel selection was not working + // . Needs to be fixed + if (false) { // Square Problem Sizes... 
// Re-initialise offload threshold structures cpuGpu_always_ = cpuGpu_offloadThreshold(); @@ -299,7 +302,7 @@ class doGemm { #endif } - if (doSparse_) { // Square sparse matrix - sparse matrix multiplication + if (true) { // Square sparse matrix - sparse matrix multiplication cpuGpu_always_ = cpuGpu_offloadThreshold(); cpuGpu_once_ = cpuGpu_offloadThreshold(); cpuGpu_unified_ = cpuGpu_offloadThreshold(); @@ -307,7 +310,7 @@ class doGemm { getKernelName() + "_sparse_square.csv"); if (upperLimit_ >= 32) { for (int dim = startDimention_; dim <= upperLimit_; dim++) { - callSparseKernels(csvFile, dim, 0.9999); + callSparseKernels(csvFile, dim, 0.99); } } // Close file @@ -524,8 +527,12 @@ class doGemm { #if CPU_ENABLED if (doCPU_) { +// std::cout << "about to initialise matrices with size = " << N << +// std::endl; spGemmCpu_.initialise(N, sparsity); +// std::cout << "about to run spGEMM" << std::endl; time_checksum_gflop cpuResult = spGemmCpu_.compute(); +// std::cout << "about to calculate flops" << std::endl; cpuResult.gflops = calcGflops(flops, iterations_, cpuResult.runtime); writeLineToCsv(csvFile, "cpu", kernelName, N, N, N, probSize, iterations_, cpuResult.runtime, cpuResult.gflops); @@ -536,31 +543,38 @@ class doGemm { // - UNIFIED : data passed from host to device (and device to host) as // needed if (doGPU_) { - spGemmGpu_.initialise(gpuOffloadType::unified, N, sparsity); - time_checksum_gflop gpuResult_unified = spGemmGpu_.compute(); - gpuResult_unified.gflops = - calcGflops(flops, iterations_, gpuResult_unified.runtime); + std::cout << "Starting with matrix of size " << N << std::endl; + std::cout << "\t\tUnified"; + spGemmGpu_.initialise(gpuOffloadType::unified, N, sparsity); + std::cout << "\tInitialised" << std::endl; + time_checksum_gflop gpuResult_unified = spGemmGpu_.compute(); + gpuResult_unified.gflops = + calcGflops(flops, iterations_, gpuResult_unified.runtime); // - ALWAYS: Offload to/from GPU every iteration - spGemmGpu_.initialise(gpuOffloadType::always, N, sparsity); - time_checksum_gflop gpuResult_always = spGemmGpu_.compute(); - gpuResult_always.gflops = + std::cout << "\t\tAlways"; + spGemmGpu_.initialise(gpuOffloadType::always, N, sparsity); + std::cout << "\tInitialised" << std::endl; + time_checksum_gflop gpuResult_always = spGemmGpu_.compute(); + gpuResult_always.gflops = calcGflops(flops, iterations_, gpuResult_always.runtime); // - ONCE : Offload to/from GPU once before all iterations and once // after - spGemmGpu_.initialise(gpuOffloadType::once, N, sparsity); - time_checksum_gflop gpuResult_once = spGemmGpu_.compute(); - gpuResult_once.gflops = + std::cout << "\t\tOnce"; + spGemmGpu_.initialise(gpuOffloadType::once, N, sparsity); + std::cout << "\tInitialised" << std::endl; + time_checksum_gflop gpuResult_once = spGemmGpu_.compute(); + gpuResult_once.gflops = calcGflops(flops, iterations_, gpuResult_once.runtime); // ToDo -- non-default GPU operations // Write lines to CSV file - writeLineToCsv(csvFile, "gpu_offloadOnce", kernelName, N, N, N, probSize, + writeLineToCsv(csvFile, "gpu_offloadOnce", kernelName, N, N, N, probSize, iterations_, gpuResult_once.runtime, gpuResult_once.gflops); - writeLineToCsv(csvFile, "gpu_offloadAlways", kernelName, N, N, N, probSize, + writeLineToCsv(csvFile, "gpu_offloadAlways", kernelName, N, N, N, probSize, iterations_, gpuResult_always.runtime, gpuResult_always.gflops); - writeLineToCsv(csvFile, "gpu_unified", kernelName, N, N, N, probSize, + writeLineToCsv(csvFile, "gpu_unified", kernelName, N, N, N, probSize, iterations_, 
gpuResult_unified.runtime, gpuResult_unified.gflops); diff --git a/include/kernels/.DS_Store b/include/kernels/.DS_Store new file mode 100644 index 0000000000000000000000000000000000000000..9cc84b2a4ce0fb9e6849637c24a43195d7749e28 GIT binary patch literal 6148 zcmeHKy-EW?5S}qX4s3#z<(9SqTVahCY=vMiF<^=u7ZCdeU&q(*6?_C=!p3iQCf*&l zSc=FD?0&oRd-uWZ-VhNlo;P!%84*<&f-H-Ih`MMxGG{Te+Gbx!@po17>=U}C zTe{ml_MiXswX-yBU9WfT8k*|q^5x^={q3r6-TYwPZ~IvT!cgyKT<`d^v-InoFMIWJ zT+?>-#@0eTsp;YjI0MdrGvEvy7{Hw^Qk^LJ> zz>tB7ZfA1;FY(C~oBUyj@0 #include +#include namespace cpu { @@ -11,10 +12,11 @@ namespace cpu { template class sp_gemm : public ::gemm { public: - using ::gemm::gemm; + using ::gemm::gemm; using ::gemm::initInputMatricesSparse; - using ::gemm::toCSR; - using ::gemm::m_; + using ::gemm::toCSR_int; + using ::gemm::iterations_; + using ::gemm::m_; using ::gemm::n_; using ::gemm::k_; using ::gemm::A_; @@ -30,7 +32,8 @@ namespace cpu { // Note that the below should be the same as the edges calculation // used in the initInputMatricesSparse function. If changed here, // change there - nnz_ = 1 + (int) (n_ * n_ * (1 - sparsity_)); + nnz_ = 1 + (int) ((double)n_ * (double)n_ * (1.0 - sparsity_)); +// std::cout << "nnz_ = " << nnz_ << std::endl; A_ = (T*)malloc(sizeof(T) * n_ * n_); B_ = (T*)malloc(sizeof(T) * n_ * n_); @@ -38,10 +41,12 @@ namespace cpu { initInputMatricesSparse(sparsity_); - toCSR(); + toCSR_int(); } - private: + int nnz_; + + private: /** Do any necessary cleanup (free pointers, close library handles, etc.) * after Kernel has been called. */ void postCallKernelCleanup() { @@ -50,7 +55,7 @@ namespace cpu { free(C_); } - void toCSR() { + void toCSR_int() { // Move A to CSR A_row_ptr_ = new int[n_ + 1]; A_col_index_ = new int[nnz_]; @@ -86,8 +91,6 @@ namespace cpu { double sparsity_; - int nnz_; - int* A_row_ptr_; int* A_col_index_; int* B_row_ptr_; @@ -96,7 +99,7 @@ namespace cpu { int* C_col_index_; T* A_vals_; T* B_vals_; - T* C_vals; + T* C_vals_; }; } // namespace cpu diff --git a/include/kernels/CPU/sp_gemv.hh b/include/kernels/CPU/sp_gemv.hh new file mode 100644 index 0000000..0c84cb0 --- /dev/null +++ b/include/kernels/CPU/sp_gemv.hh @@ -0,0 +1,47 @@ +#pragma once + +#include "../gemv.hh" + +#include +#include + +namespace cpu { + +/** An abstract class for GEMV BLAS kernels. */ + template + class sp_gemv : public ::gemv { + public: + using ::gemv::gemv; + using ::gemv::initInputMatrixVectorSparse; + using ::gemv::m_; + using ::gemv::n_; + using ::gemv::A_; + using ::gemv::x_; + using ::gemv::y_; + using ::gemv::sparsity_; + + public: + /** Initialise the required data structures. */ + void initialise(int n, double sparsity) { + m_ = n; + n_ = n; + sparsity_ = sparsity; + + A_ = (T*)malloc(sizeof(T) * m_ * n_); + x_ = (T*)malloc(sizeof(T) * n_); + y_ = (T*)malloc(sizeof(T) * m_); + + // Initialise the matrix and vectors + initInputMatrixVectorSparse(); + } + + private: + /** Do any necessary cleanup (free pointers, close library handles, etc.) + * after Kernel has been called. 
*/ + void postCallKernelCleanup() override { + free(A_); + free(x_); + free(y_); + } + }; +} // namespace cpu \ No newline at end of file diff --git a/include/kernels/GPU/sp_gemm.hh b/include/kernels/GPU/sp_gemm.hh index dbfba87..52a5494 100644 --- a/include/kernels/GPU/sp_gemm.hh +++ b/include/kernels/GPU/sp_gemm.hh @@ -17,7 +17,8 @@ namespace gpu { * - Always: Move data from host to device and device to host each iteration * - Unified: Initialise data as unified memory; no data movement semantics * required */ - virtual void initialise(gpuOffloadType offload, int n, float sparsity) = 0; + virtual void initialise(gpuOffloadType offload, int n, float sparsity) + = 0; protected: /** Whether data should be offloaded to/from the GPU each iteration, or just diff --git a/include/kernels/GPU/sp_gemv.hh b/include/kernels/GPU/sp_gemv.hh new file mode 100644 index 0000000..75fd126 --- /dev/null +++ b/include/kernels/GPU/sp_gemv.hh @@ -0,0 +1,28 @@ +#pragma once + +#include "../gemv.hh" + +namespace gpu { + +/** An abstract class for GEMV BLAS kernels. */ + template + class sp_gemv : public ::gemv { + public: + using ::gemv::gemv; + + /** Initialise the required data structures. + * `offload` refers to the data offload type: + * - Once: Move data from host to device before all iterations & move from + * device to host after all iterations + * - Always: Move data from host to device and device to host each iteration + * - Unified: Initialise data as unified memory; no data movement semantics + * required */ + virtual void initialise(gpuOffloadType offload, int n, float sparsity) + = 0; + + protected: + /** Whether data should be offloaded to/from the GPU each iteration, or just + * before & after. */ + gpuOffloadType offload_ = gpuOffloadType::always; + }; +} // namespace gpu \ No newline at end of file diff --git a/include/kernels/gemm.hh b/include/kernels/gemm.hh index d357734..6d75554 100644 --- a/include/kernels/gemm.hh +++ b/include/kernels/gemm.hh @@ -9,6 +9,7 @@ #include #include #include +#include #include "../utilities.hh" @@ -27,10 +28,13 @@ class gemm { std::chrono::high_resolution_clock::now(); // Perform all GEMM calls +// std::cout << "about to do pre-loop requirements" << std::endl; preLoopRequirements(); for (int i = 0; i < iterations_; i++) { +// std::cout << "entering loop " << i << std::endl; callGemm(); } +// std::cout << "about to do post-loop requirements" << std::endl; postLoopRequirements(); // Stop Timer diff --git a/include/kernels/gemv.hh b/include/kernels/gemv.hh index ba12d02..665fe59 100644 --- a/include/kernels/gemv.hh +++ b/include/kernels/gemv.hh @@ -4,6 +4,7 @@ #include #include #include +#include #include "../utilities.hh" @@ -82,6 +83,82 @@ class gemv { } } + void initInputMatrixVectorSparse() { + // Initialise sparse matrix + for (int i = 0; i < (n_ * n_); i++) { + A_[i] = 0.0; + } + + // Random number generator objects for use in descent + std::default_random_engine gen; + gen.seed(std::chrono::system_clock::now() + .time_since_epoch().count()); + std::uniform_real_distribution dist(0.0, 1.0); + + int edges = 1 + (int) (n_ * n_ * (1 - sparsity_)); + + // Using a=0.45 and b=c=0.22 as default probabilities + for (int i = 0; i < edges; i++) { + while (!rMat(A_, n_, 0, n_ - 1, 0, n_ - 1, 0.45, 0.22, 0.22, &gen, dist, + false)) {} + } + + // Initialise the input and output vectors + for (int y = 0; y < n_; y++) { + x_[y] = (T)((double)(rand() % 100) / 3.0); + } + for (int y = 0; y < m_; y++) { + y_[y] = (T)0.0; + } + } + + /** Recursive function to populate sparse 
matrices */ + bool rMat(T* M, int n, int x1, int x2, int y1, int y2, float a, float b, + float c, std::default_random_engine* gen, + std::uniform_real_distribution dist, bool bin) { + // If a 1x1 submatrix, then add an edge and return out + if (x1 >= x2 && y1 >= y2) { + // Needed to avoid overfloe segfaults with large problem sizes + uint64_t index = (((uint64_t)y1 * (uint64_t)n) + (uint64_t)x1); + if (abs(M[index]) > 0.1) { + return false; + } else { + // Add 1.0 if this is a binary graph, and a random real number otherwise + M[index] = (bin) ? 1.0 : (((rand() % 10000) / 100.0) - 50.0); + return true; + } + } else { + // Divide up the matrix + int xMidPoint = x1 + floor((x2 - x1) / 2); + int yMidPoint = y1 + floor((y2 - y1) / 2); + + // ToDo -- add some noise to these values between iterations + float newA = a; + float newB = b; + float newC = c; + + // Work out which quarter to recurse into + // There are some ugly ternary operators here to avoid going out of bounds in the edge case + // that we are already at 1 width or 1 height + float randomNum = dist(*gen); + if (randomNum < a) { + return rMat(M, n, x1, xMidPoint, y1, yMidPoint, + newA, newB, newC, gen, dist, bin); + } else if (randomNum < (a + b)) { + return rMat(M, n, ((xMidPoint < x2) ? xMidPoint + 1 : xMidPoint), x2, y1, yMidPoint, + newA, newB, newC, gen, dist, bin); + } else if (randomNum < (a + b + c)) { + return rMat(M, n, x1, xMidPoint, ((yMidPoint < y2) ? yMidPoint + 1 : yMidPoint), y2, + newA, newB, newC, gen, dist, bin); + } else { + return rMat(M, n, ((xMidPoint < x2) ? xMidPoint + 1 : xMidPoint), x2, + ((yMidPoint < y2) ? yMidPoint + 1 : yMidPoint), y2, newA, newB, newC, + gen, dist, bin); + } + } + return true; + } + /** Call the extern consume() function. */ void callConsume() { consume((void*)A_, (void*)x_, (void*)y_); } @@ -105,4 +182,6 @@ class gemv { /** The distance between two vector elements. */ const int vecIncrement_ = 1; + + double sparsity_ = 0.0; }; From a8e5c4690238832761286e2cde7ab7f2170acf26 Mon Sep 17 00:00:00 2001 From: Alex Cockrean <84676155+ABenC377@users.noreply.github.com> Date: Fri, 11 Oct 2024 15:53:08 +0100 Subject: [PATCH 25/32] Adding AOCL files --- .idea/workspace.xml | 6 +- ArmPL/sp_gemm.hh | 266 +++++++-------------------------- createGflopsGraphs.py | 2 +- cuBLAS/common.hh | 2 +- include/doGemm.hh | 11 -- include/kernels/CPU/sp_gemm.hh | 10 +- include/kernels/gemm.hh | 3 - src/main.cc | 24 +-- 8 files changed, 80 insertions(+), 244 deletions(-) diff --git a/.idea/workspace.xml b/.idea/workspace.xml index b954508..e9a4d65 100644 --- a/.idea/workspace.xml +++ b/.idea/workspace.xml @@ -125,9 +125,9 @@ - + @@ -171,7 +171,9 @@ - + + + - - @@ -538,6 +549,7 @@ - \ No newline at end of file diff --git a/ArmPL/sp_gemm.hh b/ArmPL/sp_gemm.hh index cb6b443..28a2ca3 100644 --- a/ArmPL/sp_gemm.hh +++ b/ArmPL/sp_gemm.hh @@ -53,9 +53,6 @@ class sp_gemm_cpu : public sp_gemm { // Todo -- See if using armpl_spmat_hint can improve performance here. // If so, follow with optimisation functions - - - if constexpr (std::is_same_v) { status_ = armpl_spmm_exec_s(transA_, transB_, @@ -63,7 +60,7 @@ class sp_gemm_cpu : public sp_gemm { A_armpl_, B_armpl_, beta, - B_armpl_); + C_armpl_); } else if constexpr (std::is_same_v) { status_ = armpl_spmm_exec_d(transA_, transB_, @@ -71,7 +68,7 @@ class sp_gemm_cpu : public sp_gemm { A_armpl_, B_armpl_, beta, - B_armpl_); + C_armpl_); } else { // Un-specialised class will not do any work - print error and exit. 
std::cout << "ERROR - Datatype for ArmPL CPU GEMM kernel not supported." @@ -107,11 +104,11 @@ class sp_gemm_cpu : public sp_gemm { std::cout << "ERROR " << status_ << std::endl; exit(1); } -// status_ = armpl_spmat_destroy(*C_armpl_); -// if (status_ != ARMPL_STATUS_SUCCESS) { -// std::cout << "ERROR " << status_ << std::endl; -// exit(1); -// } + status_ = armpl_spmat_destroy(C_armpl_); + if (status_ != ARMPL_STATUS_SUCCESS) { + std::cout << "ERROR " << status_ << std::endl; + exit(1); + } delete [] A_armpl_row_ptr_; delete [] A_armpl_col_index_; @@ -119,9 +116,9 @@ class sp_gemm_cpu : public sp_gemm { delete [] B_armpl_row_ptr_; delete [] B_armpl_col_index_; delete [] B_vals_; -// delete [] C_armpl_row_ptr_; -// delete [] C_armpl_col_index_; -// delete [] C_vals_; + delete [] C_armpl_row_ptr_; + delete [] C_armpl_col_index_; + delete [] C_vals_; } @@ -172,6 +169,24 @@ class sp_gemm_cpu : public sp_gemm { } } + // Move C to CSR + C_armpl_row_ptr_ = new armpl_int_t[n_ + 1]; + C_armpl_col_index_ = new armpl_int_t[nnz_]; + C_vals_ = new T[nnz_]; + C_armpl_row_ptr_[0] = 0; + + nnz_encountered = 0; + for (int row = 0; row < n_; row++) { + C_armpl_row_ptr_[row + 1] = nnz_encountered; + for (int col = 0; col < n_; col++) { + if (B_[(row * n_) + col] != 0.0) { + C_armpl_col_index_[nnz_encountered] = col; + C_vals_[nnz_encountered] = static_cast(B_[(row * n_) + col]); + nnz_encountered++; + } + } + } + if constexpr (std::is_same_v) { // printCSR(n_armpl_, A_armpl_row_ptr_, A_armpl_col_index_, A_vals_, // nnz_, flags_); @@ -200,6 +215,20 @@ class sp_gemm_cpu : public sp_gemm { std::cout << "ERROR " << status_ << std::endl; exit(1); } + +// printCSR(n_armpl_, C_armpl_row_ptr_, C_armpl_col_index_, C_vals_, +// nnz_, flags_); + status_ = armpl_spmat_create_csr_s(&C_armpl_, + n_armpl_, + n_armpl_, + C_armpl_row_ptr_, + C_armpl_col_index_, + C_vals_, + flags_); + if (status_ != ARMPL_STATUS_SUCCESS) { + std::cout << "ERROR " << status_ << std::endl; + exit(1); + } } else if constexpr (std::is_same_v) { // printCSR(n_armpl_, A_armpl_row_ptr_, A_armpl_col_index_, A_vals_, // nnz_, flags_ @@ -228,6 +257,20 @@ class sp_gemm_cpu : public sp_gemm { std::cout << "ERROR " << status_ << std::endl; exit(1); } + +// printCSR(n_armpl_, C_armpl_row_ptr_, C_armpl_col_index_, C_vals_, +// nnz_, flags_); + status_ = armpl_spmat_create_csr_d(&C_armpl_, + n_armpl_, + n_armpl_, + C_armpl_row_ptr_, + C_armpl_col_index_, + C_vals_, + flags_); + if (status_ != ARMPL_STATUS_SUCCESS) { + std::cout << "ERROR " << status_ << std::endl; + exit(1); + } // std::cout << "Okay, all matrices made!!" 
<< std::endl; } From 7f82b7d52f0ab2420774159d9099fb40aef00ce2 Mon Sep 17 00:00:00 2001 From: Alex Cockrean <84676155+ABenC377@users.noreply.github.com> Date: Fri, 11 Oct 2024 15:56:42 +0100 Subject: [PATCH 27/32] Adding AOCL files --- .idea/workspace.xml | 25 +++++++++---- include/doGemm.hh | 66 +++++++++++++++++++++++++++++----- include/doGemv.hh | 57 ++++++++++++++++------------- include/kernels/CPU/sp_gemm.hh | 7 ++-- include/kernels/gemm.hh | 7 ++-- include/kernels/gemv.hh | 5 +-- src/main.cc | 62 +++++++++++++++++++++----------- 7 files changed, 160 insertions(+), 69 deletions(-) diff --git a/.idea/workspace.xml b/.idea/workspace.xml index cb692bc..a5afad2 100644 --- a/.idea/workspace.xml +++ b/.idea/workspace.xml @@ -15,9 +15,14 @@ - + - + + + + + + @@ -525,7 +538,6 @@ - @@ -550,6 +562,7 @@ - \ No newline at end of file diff --git a/include/doGemm.hh b/include/doGemm.hh index c71684f..a3e5e77 100644 --- a/include/doGemm.hh +++ b/include/doGemm.hh @@ -65,7 +65,7 @@ class doGemm { void collectData() { // ToDo -- I've hard coded false here as kernel selection was not working // . Needs to be fixed - if (false) { + if (doDense_) { // Square Problem Sizes... // Re-initialise offload threshold structures cpuGpu_always_ = cpuGpu_offloadThreshold(); @@ -301,13 +301,12 @@ class doGemm { } #endif } - - if (true) { // Square sparse matrix - sparse matrix multiplication + if (doSparse_) { // Square sparse matrix - sparse matrix multiplication cpuGpu_always_ = cpuGpu_offloadThreshold(); cpuGpu_once_ = cpuGpu_offloadThreshold(); cpuGpu_unified_ = cpuGpu_offloadThreshold(); std::ofstream csvFile = initCSVFile(std::string(CSV_DIR) + "/" + - getKernelName() + "_sparse_square.csv"); + getKernelName() + "_sparse_square_99.csv"); if (upperLimit_ >= 32) { for (int dim = startDimention_; dim <= upperLimit_; dim++) { callSparseKernels(csvFile, dim, 0.99); @@ -316,10 +315,59 @@ class doGemm { // Close file csvFile.close(); #if CPU_ENABLED && GPU_ENABLED - if (doCPU_ && doGPU_) { - // Print offload results to stdout - printOffloadThreshold("Sparse Square"); - } + if (doCPU_ && doGPU_) { + // Print offload results to stdout + printOffloadThreshold("Sparse Square 0.99"); + } +#endif + cpuGpu_always_ = cpuGpu_offloadThreshold(); + cpuGpu_once_ = cpuGpu_offloadThreshold(); + cpuGpu_unified_ = cpuGpu_offloadThreshold(); + csvFile = initCSVFile(std::string(CSV_DIR) + "/" + + getKernelName() + "_sparse_square_999.csv"); + if (upperLimit_ >= 32) { + for (int dim = startDimention_; dim <= upperLimit_; dim++) { + callSparseKernels(csvFile, dim, 0.999); + } + } +#if CPU_ENABLED && GPU_ENABLED + if (doCPU_ && doGPU_) { + // Print offload results to stdout + printOffloadThreshold("Sparse Square 0.999"); + } +#endif + cpuGpu_always_ = cpuGpu_offloadThreshold(); + cpuGpu_once_ = cpuGpu_offloadThreshold(); + cpuGpu_unified_ = cpuGpu_offloadThreshold(); + csvFile = initCSVFile(std::string(CSV_DIR) + "/" + + getKernelName() + "_sparse_square_9999.csv"); + if (upperLimit_ >= 32) { + for (int dim = startDimention_; dim <= upperLimit_; dim++) { + callSparseKernels(csvFile, dim, 0.9999); + } + } +#if CPU_ENABLED && GPU_ENABLED + if (doCPU_ && doGPU_) { + // Print offload results to stdout + printOffloadThreshold("Sparse Square 0.9999"); + } +#endif + cpuGpu_always_ = cpuGpu_offloadThreshold(); + cpuGpu_once_ = cpuGpu_offloadThreshold(); + cpuGpu_unified_ = cpuGpu_offloadThreshold(); + csvFile = initCSVFile(std::string(CSV_DIR) + "/" + + getKernelName() + + "_sparse_square_99999.csv"); + if (upperLimit_ >= 32) { + for (int dim = 
startDimention_; dim <= upperLimit_; dim++) { + callSparseKernels(csvFile, dim, 0.99999); + } + } +#if CPU_ENABLED && GPU_ENABLED + if (doCPU_ && doGPU_) { + // Print offload results to stdout + printOffloadThreshold("Sparse Square 0.99999"); + } #endif } } @@ -530,7 +578,7 @@ class doGemm { spGemmCpu_.initialise(N, sparsity); time_checksum_gflop cpuResult = spGemmCpu_.compute(); cpuResult.gflops = calcGflops(flops, iterations_, cpuResult.runtime); - writeLineToCsv(csvFile, "cpu", kernelName, N, N, N, probSize, iterations_, + writeLineToCsv(csvFile, "cpu", kernelName, N, N, N, probSize, iterations_, cpuResult.runtime, cpuResult.gflops); } #endif diff --git a/include/doGemv.hh b/include/doGemv.hh index b86aad6..12cd097 100644 --- a/include/doGemv.hh +++ b/include/doGemv.hh @@ -33,13 +33,16 @@ class doGemv { public: doGemv(const std::string csvDir, const int iters, const int startDim, const int upperLimit, const bool cpuEnabled = true, - const bool gpuEnabled = true) + const bool gpuEnabled = true, const bool doDense = true, const bool + doSparse = true) : CSV_DIR(csvDir), iterations_(iters), startDimention_(startDim), upperLimit_(upperLimit), doCPU_(cpuEnabled), - doGPU_(gpuEnabled) + doGPU_(gpuEnabled), + doDense_(doDense), + doSparse_(doSparse) #if CPU_ENABLED , gemvCpu_(iterations_) @@ -56,28 +59,29 @@ class doGemv { /** Run all problem types and write data to CSV files. */ void collectData() { - // Square Problem Sizes... - // Re-initialise offload threshold structures & previous results - cpuGpu_always_ = cpuGpu_offloadThreshold(); - cpuGpu_once_ = cpuGpu_offloadThreshold(); - cpuGpu_unified_ = cpuGpu_offloadThreshold(); - prev_gpuResult_always = time_checksum_gflop(); - prev_gpuResult_once = time_checksum_gflop(); - prev_gpuResult_unified = time_checksum_gflop(); - std::ofstream csvFile = - initCSVFile(CSV_DIR + "/" + getKernelName() + "_square_vector_M=N.csv"); - for (int dim = startDimention_; dim <= upperLimit_; dim++) { - // M = dim, N = dim; - callKernels(csvFile, dim, dim); - } - // Close file - csvFile.close(); -#if CPU_ENABLED && GPU_ENABLED - if (doCPU_ && doGPU_) { - // Print offload results to stdout - printOffloadThreshold("Square x Vector (M=N)"); - } -#endif + if (doDense_) { + // Square Problem Sizes... + // Re-initialise offload threshold structures & previous results + cpuGpu_always_ = cpuGpu_offloadThreshold(); + cpuGpu_once_ = cpuGpu_offloadThreshold(); + cpuGpu_unified_ = cpuGpu_offloadThreshold(); + prev_gpuResult_always = time_checksum_gflop(); + prev_gpuResult_once = time_checksum_gflop(); + prev_gpuResult_unified = time_checksum_gflop(); + std::ofstream csvFile = + initCSVFile(CSV_DIR + "/" + getKernelName() + "_square_vector_M=N.csv"); + for (int dim = startDimention_; dim <= upperLimit_; dim++) { + // M = dim, N = dim; + callKernels(csvFile, dim, dim); + } + // Close file + csvFile.close(); + #if CPU_ENABLED && GPU_ENABLED + if (doCPU_ && doGPU_) { + // Print offload results to stdout + printOffloadThreshold("Square x Vector (M=N)"); + } + #endif // Rectangular Problem Sizes: // Tall and thin x Vector @@ -182,6 +186,7 @@ class doGemv { } #endif } + } private: /** Call the appropriate CPU and GPU GEMV kernels. */ @@ -494,6 +499,10 @@ class doGemv { /** Whether the GPU kernels should be run. */ const bool doGPU_ = true; + /** Whether sparse and or dense kernels should be run. */ + const bool doSparse_; + const bool doDense_; + #if CPU_ENABLED /** The GEMV CPU kernel. 
*/ cpu::gemv_cpu gemvCpu_; diff --git a/include/kernels/CPU/sp_gemm.hh b/include/kernels/CPU/sp_gemm.hh index a11dcd0..c431d4d 100644 --- a/include/kernels/CPU/sp_gemm.hh +++ b/include/kernels/CPU/sp_gemm.hh @@ -32,18 +32,19 @@ namespace cpu { // Note that the below should be the same as the edges calculation // used in the initInputMatricesSparse function. If changed here, // change there - nnz_ = 1 + (int) ((double)n_ * (double)n_ * (1.0 - sparsity_)); + nnz_ = 1 + (uint64_t)((double)n_ * (double)n_ * (1.0 - sparsity_)); +// std::cout << "\t____About to malloc()____" << std::endl; A_ = (T*)malloc(sizeof(T) * n_ * n_); B_ = (T*)malloc(sizeof(T) * n_ * n_); C_ = (T*)malloc(sizeof(T) * n_ * n_); - initInputMatricesSparse(sparsity_); + initInputMatricesSparse(sparsity); toCSR_int(); } - int nnz_; + uint64_t nnz_; protected: diff --git a/include/kernels/gemm.hh b/include/kernels/gemm.hh index bbd17cb..6e1328e 100644 --- a/include/kernels/gemm.hh +++ b/include/kernels/gemm.hh @@ -107,14 +107,14 @@ class gemm { .time_since_epoch().count()); std::uniform_real_distribution dist(0.0, 1.0); - int edges = 1 + (int) (n_ * n_ * (1 - sparsity)); + int edges = 1 + (uint64_t)((double)n_ * (double)n_ * (1.0 - sparsity)); // Using a=0.45 and b=c=0.22 as default probabilities for (int i = 0; i < edges; i++) { while (!rMat(A_, n_, 0, n_ - 1, 0, n_ - 1, 0.45, 0.22, 0.22, &gen, dist, - false)) {} + false)) {} while (!rMat(B_, n_, 0, n_ - 1, 0, n_ - 1, 0.45, 0.22, 0.22, &gen, dist, - false)) {} + false)) {} } } @@ -165,7 +165,6 @@ class gemm { gen, dist, bin); } } - return true; } void toCSR_int(T* dense, int n_col, int n_row, T* vals, int* col_index, diff --git a/include/kernels/gemv.hh b/include/kernels/gemv.hh index 665fe59..a64b19c 100644 --- a/include/kernels/gemv.hh +++ b/include/kernels/gemv.hh @@ -95,10 +95,11 @@ class gemv { .time_since_epoch().count()); std::uniform_real_distribution dist(0.0, 1.0); - int edges = 1 + (int) (n_ * n_ * (1 - sparsity_)); + uint64_t edges = 1 + (uint64_t)((double)n_ * (double)n_ * (1.0 - + sparsity_)); // Using a=0.45 and b=c=0.22 as default probabilities - for (int i = 0; i < edges; i++) { + for (uint64_t i = 0; i < edges; i++) { while (!rMat(A_, n_, 0, n_ - 1, 0, n_ - 1, 0.45, 0.22, 0.22, &gen, dist, false)) {} } diff --git a/src/main.cc b/src/main.cc index e508b5b..bdc1db2 100644 --- a/src/main.cc +++ b/src/main.cc @@ -7,6 +7,10 @@ bool doSgemm = true; bool doDgemm = true; bool doSp_sgemm = true; bool doSp_dgemm = true; +bool doSgemv = true; +bool doDgemv = true; +bool doSp_sgemv = true; +bool doSp_dgemv = true; bool doCpu = CPU_ENABLED; bool doGpu = GPU_ENABLED; @@ -50,18 +54,18 @@ int main(int argc, char** argv) { // -------- GEMV -------- // SGEMV Comparison -// std::cout << std::endl << "Comparing SGEMV Kernels:" << std::endl; -// doGemv sgemv(std::string(absPath), iters, startDim, upperLimit, doCpu, -// doGpu); -// sgemv.collectData(); -// std::cout << "Finished!" << std::endl; -// -// // DGEMV Comparison -// std::cout << std::endl << "Comparing DGEMV Kernels:" << std::endl; -// doGemv dgemv(std::string(absPath), iters, startDim, upperLimit, doCpu, -// doGpu); -// dgemv.collectData(); -// std::cout << "Finished!" << std::endl; + std::cout << std::endl << "Comparing SGEMV Kernels:" << std::endl; + doGemv sgemv(std::string(absPath), iters, startDim, upperLimit, doCpu, + doGpu, doSgemv, doSp_sgemv); + sgemv.collectData(); + std::cout << "Finished!" 
<< std::endl; + + // DGEMV Comparison + std::cout << std::endl << "Comparing DGEMV Kernels:" << std::endl; + doGemv dgemv(std::string(absPath), iters, startDim, upperLimit, doCpu, + doGpu, doDgemv, doSp_dgemv); + dgemv.collectData(); + std::cout << "Finished!" << std::endl; free(absPath); return 0; @@ -146,7 +150,8 @@ void getParameters(int argc, char** argv) { } else if (!strcmp(argv[i], "--no_gpu")) { doGpu = false; } else if (!strcmp(argv[i], "--kernels") || !strcmp(argv[i], "-k")) { - doSgemm = doDgemm = doSp_sgemm = doSp_dgemm = false; + doSgemm = doDgemm = doSp_sgemm = doSp_dgemm = + doSgemv = doDgemv = doSp_sgemv = doSp_dgemv = false; std::string kernelList = argv[++i]; if (kernelList.find("sp-sgemm") != std::string::npos) { doSp_sgemm = true; @@ -167,13 +172,28 @@ void getParameters(int argc, char** argv) { doDgemm = true; } - if (!doSgemm && !doDgemm && !doSp_sgemm && !doSp_dgemm) { - std::cout << "ERROR - no implemented kernels in list" << std::endl; - exit(1); - } - } else if (!strcmp(argv[i], "--output_dir") || !strcmp(argv[i], "-o")) { - if (++i >= argc) { - std::cout << "ERROR - Invalid output directory" << std::endl; + + if (kernelList.find("sp-sgemv") != std::string::npos) { + doSp_sgemv = true; + if (kernelList.find("sgemv") != std::string::npos && + kernelList.find("sgemv") != kernelList.find("sp-sgemv") + 3) { + doSgemv = true; + } + } else if (kernelList.find("sgemv") != std::string::npos) { + doSgemv = true; + } + if (kernelList.find("sp-dgemv") != std::string::npos) { + doSp_dgemv = true; + if (kernelList.find("dgemv") != std::string::npos && + kernelList.find("dgemv") != kernelList.find("sp-dgemv") + 3) { + doDgemv = true; + } + } else if (kernelList.find("dgemv") != std::string::npos) { + doDgemv = true; + } + if (!doSgemm && !doDgemm && !doSp_sgemm && !doSp_dgemm && + !doSgemv && !doDgemv && !doSp_sgemv && !doSp_dgemv) { + std::cout << "ERROR - no implemented kernels in list" << std::endl; exit(1); } else { CSV_DIR = argv[i]; @@ -212,4 +232,4 @@ void getParameters(int argc, char** argv) { exit(1); } } -} \ No newline at end of file +} From 0130b81655b1fa04b433c4d22f9288df723cefd2 Mon Sep 17 00:00:00 2001 From: Alex Cockrean <84676155+ABenC377@users.noreply.github.com> Date: Fri, 11 Oct 2024 15:58:16 +0100 Subject: [PATCH 28/32] Adding AOCL files --- .idea/workspace.xml | 23 ++++++++----- ArmPL/sp_gemm.hh | 84 +++++++++++++++++++++++++++++++++++++++++++++ Makefile | 2 +- include/doGemm.hh | 26 +++++++------- include/doGemv.hh | 12 +++---- include/helpers.hh | 12 ++++--- 6 files changed, 127 insertions(+), 32 deletions(-) diff --git a/.idea/workspace.xml b/.idea/workspace.xml index a5afad2..2bb35d8 100644 --- a/.idea/workspace.xml +++ b/.idea/workspace.xml @@ -15,14 +15,13 @@ - + + + - - - - + @@ -538,7 +545,6 @@ - @@ -563,6 +569,7 @@ - \ No newline at end of file diff --git a/ArmPL/sp_gemm.hh b/ArmPL/sp_gemm.hh index 28a2ca3..612f4f1 100644 --- a/ArmPL/sp_gemm.hh +++ b/ArmPL/sp_gemm.hh @@ -89,6 +89,90 @@ class sp_gemm_cpu : public sp_gemm { void preLoopRequirements() override { // Need to put A_ and B_ into A_armpl_ and B_armpl_ toCSR_armpl(); + + /** providing hints to ARMPL and optimizing the matrix datastructures */ + // TODO -- is noallocs best here? 
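    // What the hint calls below do (as I understand the ArmPL sparse API;
    // the library documentation is authoritative): each armpl_spmat_hint()
    // records expected usage on a matrix handle -- memory behaviour,
    // structure, how many SpMM invocations to expect, the operation and an
    // execution strategy -- so the library can choose a tuned path when the
    // multiplication is executed. Hints are advisory: an inaccurate hint
    // should only affect performance, not the numerical result.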
+ status_ = armpl_spmat_hint(A_armpl_, ARMPL_SPARSE_HINT_MEMORY, + ARMPL_SPARSE_MEMORY_NOALLOCS); + if (status_ != ARMPL_STATUS_SUCCESS) { + std::cout << "ERROR " << status_ << std::endl; + exit(1); + } + status_ = armpl_spmat_hint(B_armpl_, ARMPL_SPARSE_HINT_MEMORY, + ARMPL_SPARSE_MEMORY_NOALLOCS); + if (status_ != ARMPL_STATUS_SUCCESS) { + std::cout << "ERROR " << status_ << std::endl; + exit(1); + } + + status_ = armpl_spmat_hint(A_armpl_, ARMPL_SPARSE_HINT_STRUCTURE, + ARMPL_SPARSE_STRUCTURE_UNSTRUCTURED); + if (status_ != ARMPL_STATUS_SUCCESS) { + std::cout << "ERROR " << status_ << std::endl; + exit(1); + } + status_ = armpl_spmat_hint(B_armpl_, ARMPL_SPARSE_HINT_STRUCTURE, + ARMPL_SPARSE_STRUCTURE_UNSTRUCTURED); + if (status_ != ARMPL_STATUS_SUCCESS) { + std::cout << "ERROR " << status_ << std::endl; + exit(1); + } + + // TODO -- will this be FEW? + status_ = armpl_spmat_hint(A_armpl_, ARMPL_SPARSE_HINT_SPMM_INVOCATIONS, + ARMPL_SPARSE_INVOCATIONS_MANY); + if (status_ != ARMPL_STATUS_SUCCESS) { + std::cout << "ERROR " << status_ << std::endl; + exit(1); + } + status_ = armpl_spmat_hint(B_armpl_, ARMPL_SPARSE_HINT_SPMM_INVOCATIONS, + ARMPL_SPARSE_INVOCATIONS_MANY); + if (status_ != ARMPL_STATUS_SUCCESS) { + std::cout << "ERROR " << status_ << std::endl; + exit(1); + } + + status_ = armpl_spmat_hint(A_armpl_, ARMPL_SPARSE_HINT_SPMM_OPERATION, + ARMPL_SPARSE_OPERATION_NOTRANS); + if (status_ != ARMPL_STATUS_SUCCESS) { + std::cout << "ERROR " << status_ << std::endl; + exit(1); + } + status_ = armpl_spmat_hint(B_armpl_, ARMPL_SPARSE_HINT_SPMM_OPERATION, + ARMPL_SPARSE_OPERATION_NOTRANS); + if (status_ != ARMPL_STATUS_SUCCESS) { + std::cout << "ERROR " << status_ << std::endl; + exit(1); + } + + // TODO -- investigate whch is better here + status_ = armpl_spmat_hint(A_armpl_, ARMPL_SPARSE_HINT_SPMM_STRATEGY, + ARMPL_SPARSE_SPMM_STRAT_OPT_PART_STRUCT); + if (status_ != ARMPL_STATUS_SUCCESS) { + std::cout << "ERROR " << status_ << std::endl; + exit(1); + } + status_ = armpl_spmat_hint(B_armpl_, ARMPL_SPARSE_HINT_SPMM_STRATEGY, + ARMPL_SPARSE_SPMM_STRAT_OPT_PART_STRUCT); + if (status_ != ARMPL_STATUS_SUCCESS) { + std::cout << "ERROR " << status_ << std::endl; + exit(1); + } + +// TODO -- this is thorwing an error -- couldn't immediately fix so come +// back to + +// /** provide hints for the optimisation of the spmm execution */ +// status_ = armpl_spmm_optimize(ARMPL_SPARSE_OPERATION_NOTRANS, +// ARMPL_SPARSE_OPERATION_NOTRANS, +// ARMPL_SPARSE_SCALAR_ONE, +// A_armpl_, B_armpl_, +// ARMPL_SPARSE_SCALAR_ZERO, +// C_armpl_); +// if (status_ != ARMPL_STATUS_SUCCESS) { +// std::cout << "ERROR " << status_ << std::endl; +// exit(1); +// } } /** Perform any required steps after calling the GEMM kernel that should diff --git a/Makefile b/Makefile index e5091e0..22d080c 100644 --- a/Makefile +++ b/Makefile @@ -51,7 +51,7 @@ CXX = $(CXX_$(COMPILER)) CXXFLAGS_ARM = -std=c++17 -Wall -Ofast -$(ARCHFLAG)=native CXXFLAGS_CLANG = -std=c++17 -Wall -Ofast -$(ARCHFLAG)=native -CXXFLAGS_GNU = -std=c++17 -Wall -Ofast -$(ARCHFLAG)=native +CXXFLAGS_GNU = -std=c++17 -Wall -Wno-deprecated-declarations -Ofast -$(ARCHFLAG)=native CXXFLAGS_INTEL = -std=c++17 -Wall -Ofast -$(ARCHFLAG)=native -Wno-tautological-constant-compare CXXFLAGS_NVIDIA = -std=c++17 -Wall -O3 -fast -$(ARCHFLAG)=native CXXFLAGS_HIP = -std=c++17 -Wall -Ofast -$(ARCHFLAG)=native diff --git a/include/doGemm.hh b/include/doGemm.hh index a3e5e77..93cc058 100644 --- a/include/doGemm.hh +++ b/include/doGemm.hh @@ -392,8 +392,8 @@ class doGemm { 
cpuResult = gemmCpu_.compute(); cpuResult.gflops = calcGflops(flops, iterations_, cpuResult.runtime); // Write result to CSV file - writeLineToCsv(csvFile, "cpu", kernelName, M, N, K, probSize, iterations_, - cpuResult.runtime, cpuResult.gflops); + writeLineToCsv(csvFile, "cpu", kernelName, M, N, K, probSize, + 0.0, iterations_, cpuResult.runtime, cpuResult.gflops); } #endif @@ -422,13 +422,13 @@ class doGemm { // Write results to CSV file writeLineToCsv(csvFile, "gpu_offloadOnce", kernelName, M, N, K, probSize, - iterations_, gpuResult_once.runtime, + 0.0, iterations_, gpuResult_once.runtime, gpuResult_once.gflops); writeLineToCsv(csvFile, "gpu_offloadAlways", kernelName, M, N, K, - probSize, iterations_, gpuResult_always.runtime, + probSize, 0.0, iterations_, gpuResult_always.runtime, gpuResult_always.gflops); writeLineToCsv(csvFile, "gpu_unified", kernelName, M, N, K, probSize, - iterations_, gpuResult_unified.runtime, + 0.0, iterations_, gpuResult_unified.runtime, gpuResult_unified.gflops); } #endif @@ -578,8 +578,9 @@ class doGemm { spGemmCpu_.initialise(N, sparsity); time_checksum_gflop cpuResult = spGemmCpu_.compute(); cpuResult.gflops = calcGflops(flops, iterations_, cpuResult.runtime); - writeLineToCsv(csvFile, "cpu", kernelName, N, N, N, probSize, iterations_, - cpuResult.runtime, cpuResult.gflops); + writeLineToCsv(csvFile, "cpu", kernelName, N, N, N, probSize, + sparsity, iterations_, cpuResult.runtime, + cpuResult.gflops); } #endif #if GPU_ENABLED @@ -607,13 +608,14 @@ class doGemm { // Write lines to CSV file writeLineToCsv(csvFile, "gpu_offloadOnce", kernelName, N, N, N, probSize, - iterations_, gpuResult_once.runtime, gpuResult_once.gflops); + sparsity, iterations_, gpuResult_once.runtime, + gpuResult_once.gflops); writeLineToCsv(csvFile, "gpu_offloadAlways", kernelName, N, N, N, probSize, - iterations_, gpuResult_always.runtime, - gpuResult_always.gflops); + sparsity, iterations_, gpuResult_always.runtime, + gpuResult_always.gflops); writeLineToCsv(csvFile, "gpu_unified", kernelName, N, N, N, probSize, - iterations_, gpuResult_unified.runtime, - gpuResult_unified.gflops); + sparsity, iterations_, gpuResult_unified.runtime, + gpuResult_unified.gflops); } #endif diff --git a/include/doGemv.hh b/include/doGemv.hh index 12cd097..2ab5fb1 100644 --- a/include/doGemv.hh +++ b/include/doGemv.hh @@ -207,8 +207,8 @@ class doGemv { cpuResult = gemvCpu_.compute(); cpuResult.gflops = calcGflops(flops, iterations_, cpuResult.runtime); // Write result to CSV file - writeLineToCsv(csvFile, "cpu", kernelName, M, N, 0, probSize, iterations_, - cpuResult.runtime, cpuResult.gflops); + writeLineToCsv(csvFile, "cpu", kernelName, M, N, 0, probSize, 0.0, + iterations_, cpuResult.runtime, cpuResult.gflops); } #endif @@ -237,13 +237,13 @@ class doGemv { // Write results to CSV file writeLineToCsv(csvFile, "gpu_offloadOnce", kernelName, M, N, 0, probSize, - iterations_, gpuResult_once.runtime, + 0.0, iterations_, gpuResult_once.runtime, gpuResult_once.gflops); writeLineToCsv(csvFile, "gpu_offloadAlways", kernelName, M, N, 0, - probSize, iterations_, gpuResult_always.runtime, + probSize, 0.0, iterations_, gpuResult_always.runtime, gpuResult_always.gflops); writeLineToCsv(csvFile, "gpu_unified", kernelName, M, N, 0, probSize, - iterations_, gpuResult_unified.runtime, + 0.0, iterations_, gpuResult_unified.runtime, gpuResult_unified.gflops); } #endif @@ -500,8 +500,8 @@ class doGemv { const bool doGPU_ = true; /** Whether sparse and or dense kernels should be run. 
*/ - const bool doSparse_; const bool doDense_; + const bool doSparse_; #if CPU_ENABLED /** The GEMV CPU kernel. */ diff --git a/include/helpers.hh b/include/helpers.hh index 5618557..d760cd7 100644 --- a/include/helpers.hh +++ b/include/helpers.hh @@ -17,8 +17,8 @@ std::ofstream initCSVFile(const std::string filename) { std::ofstream newFile(filename); - newFile << "Device,Kernel,M,N,K,Total Problem Size (KiB),Iterations,Total " - "Seconds,GFLOP/s" + newFile << "Device,Kernel,M,N,K,Total Problem Size (KiB),sparsity,Iterations," + "Total Seconds,GFLOP/s" << std::endl; return newFile; @@ -28,15 +28,17 @@ std::ofstream initCSVFile(const std::string filename) { * Function does not close the file. */ void writeLineToCsv(std::ofstream& file, const std::string device, const std::string kernel, const int M, const int N, - const int K, const double totalProbSize, const int iters, - const double totalTime, const double gflops) { + const int K, const double totalProbSize, const float + sparsity, const int iters, const double totalTime, + const double gflops) { if (!file.is_open()) { std::cout << "ERROR - Attempted to write line to a closed CSV file." << std::endl; exit(1); } file << device << "," << kernel << "," << M << "," << N << "," << K << "," - << std::fixed << std::setprecision(3) << totalProbSize << "," << iters + << std::fixed << std::setprecision(3) << totalProbSize << "," + << std::fixed << std::setprecision(8) << sparsity << "," << iters << "," << std::fixed << std::setprecision(5) << totalTime << "," << std::fixed << std::setprecision(3) << gflops << std::endl; } From 4581637b57e14c92b4b4ca40c200565aae9e3d91 Mon Sep 17 00:00:00 2001 From: Alex Cockrean <84676155+ABenC377@users.noreply.github.com> Date: Fri, 11 Oct 2024 15:12:42 +0100 Subject: [PATCH 29/32] Providing armpl with hints --- .idea/workspace.xml | 21 ++++++++++++--------- ArmPL/sp_gemm.hh | 1 + 2 files changed, 13 insertions(+), 9 deletions(-) diff --git a/.idea/workspace.xml b/.idea/workspace.xml index 2bb35d8..d791fa3 100644 --- a/.idea/workspace.xml +++ b/.idea/workspace.xml @@ -15,13 +15,8 @@ - - + - - - - @@ -545,7 +548,6 @@ - @@ -570,6 +572,7 @@ - \ No newline at end of file diff --git a/ArmPL/sp_gemm.hh b/ArmPL/sp_gemm.hh index 612f4f1..e8e28a5 100644 --- a/ArmPL/sp_gemm.hh +++ b/ArmPL/sp_gemm.hh @@ -355,6 +355,7 @@ class sp_gemm_cpu : public sp_gemm { std::cout << "ERROR " << status_ << std::endl; exit(1); } + // std::cout << "Okay, all matrices made!!" << std::endl; } From 477b7a0a050caeeb86ff4776ab75cbe4982cf883 Mon Sep 17 00:00:00 2001 From: Alex Cockrean <84676155+ABenC377@users.noreply.github.com> Date: Mon, 21 Oct 2024 15:14:42 +0100 Subject: [PATCH 30/32] Updating createGflopsGraphs.py to show sparsity --- .idea/workspace.xml | 6 ++++-- createGflopsGraphs.py | 7 +++++-- 2 files changed, 9 insertions(+), 4 deletions(-) diff --git a/.idea/workspace.xml b/.idea/workspace.xml index d791fa3..d27d844 100644 --- a/.idea/workspace.xml +++ b/.idea/workspace.xml @@ -15,8 +15,9 @@ - - + + + - - + - @@ -575,6 +573,7 @@ - \ No newline at end of file
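
A minimal, self-contained sketch of the dense-to-CSR conversion that the patches above rely on, useful for sanity-checking the array shapes outside the harness. The names (denseToCsr, rowPtr, colIdx, vals) and the 4x4 test matrix are illustrative only; this is not a drop-in replacement for the benchmark's toCSR_int() or toCSR_armpl().

#include <cstdio>
#include <vector>

// Convert a dense row-major n x n matrix into the three CSR arrays.
// rowPtr receives n + 1 entries; colIdx and vals receive one entry per
// non-zero element.
template <typename T>
void denseToCsr(const std::vector<T>& dense, int n,
                std::vector<int>& rowPtr, std::vector<int>& colIdx,
                std::vector<T>& vals) {
  rowPtr.assign(n + 1, 0);
  colIdx.clear();
  vals.clear();
  for (int row = 0; row < n; row++) {
    for (int col = 0; col < n; col++) {
      T value = dense[(row * n) + col];
      if (value != T(0)) {
        colIdx.push_back(col);
        vals.push_back(value);
      }
    }
    // Running non-zero count once this row is finished.
    rowPtr[row + 1] = static_cast<int>(vals.size());
  }
}

int main() {
  // Illustrative 4x4 matrix with four non-zeros (75% sparse).
  std::vector<double> dense = {5, 0, 0, 0,
                               0, 0, 8, 0,
                               0, 3, 0, 0,
                               0, 0, 0, 6};
  std::vector<int> rowPtr, colIdx;
  std::vector<double> vals;
  denseToCsr(dense, 4, rowPtr, colIdx, vals);

  // Expected: rowPtr = [0, 1, 2, 3, 4], colIdx = [0, 2, 1, 3],
  //           vals = [5, 8, 3, 6].
  for (int row = 0; row < 4; row++) {
    for (int i = rowPtr[row]; i < rowPtr[row + 1]; i++) {
      std::printf("row %d col %d val %g\n", row, colIdx[i], vals[i]);
    }
  }
  return 0;
}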