kachi-group · nhatdongdang · Jul 8, 2024 · Jun 28, 2024 · Jul 1, 2024 · Jul 1, 2024
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
@@ -1,34 +1,34 @@
 name: CI
 
 on:
-    push:
-        branches: main
-        paths: ['**.cu','**.c','**.cpp', '**.h', '**CMakeLists.txt'] 
-    pull_request:
-        branches: main
-        paths: ['**.cu','**.c','**.cpp', '**.h', '**CMakeLists.txt']
+  # Run CI only when build inputs change: C/C++/CUDA sources and headers
+  # (including CUDA .cuh headers), CMake files, and the Makefiles that the
+  # steps below invoke through `make`.
+  push:
+    branches: main
+    paths: ["**.cu", "**.cuh", "**.c", "**.cpp", "**.h", "**CMakeLists.txt", "**Makefile"]
+  pull_request:
+    branches: main
+    paths: ["**.cu", "**.cuh", "**.c", "**.cpp", "**.h", "**CMakeLists.txt", "**Makefile"]
 
 jobs:
-    build-and-test:
-        runs-on: ubuntu-latest
+  build-and-test:
+    runs-on: ubuntu-latest
 
-        steps:
-        - name: Checkout code
-          uses: actions/checkout@v4
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
 
-        - name: Setup python
-          uses: actions/setup-python@v5
-          with:
-            python-version: '3.10' 
+      - name: Setup python
+        uses: actions/setup-python@v5
+        with:
+          python-version: "3.10"
 
-        - name: Install dependencies
-          run: |
-            pip install pandas
+      - name: Install dependencies
+        run: |
+          pip install pandas
 
-        - name: Build project
-          run: |
-            make build
-            
-        - name: Run test suite
-          run: |
-            make test
+      # Configure and compile through the top-level Makefile's `build` target.
+      - name: Build project
+        run: |
+          make build
+
+      # The Makefile's `test_cpu` target lists `build` as a prerequisite, so
+      # this step builds again before running speed_cpu and
+      # test/verify_csv.py.
+      - name: Run test suite
+        run: |
+          make test_cpu
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -1,30 +1,36 @@
 cmake_minimum_required(VERSION 3.16)
 
-# Set the project name
-project(ichida-algo)
+project(ichida-algo LANGUAGES C CXX)
 
 set(CMAKE_C_FLAGS "-O3 -march=native -ffast-math -funroll-loops -fopenmp -Wall -Wextra")
-
 set(CMAKE_C_STANDARD 11)
 set(CMAKE_C_STANDARD_REQUIRED True)
-# set(CMAKE_VERBOSE_MAKEFILE ON)
+set(CMAKE_VERBOSE_MAKEFILE ON)
 
-set(SRC_DIR src)
 set(INC_DIR include)
-set(LIB_DIR lib)
-set(TEST_DIR test)
-set(BENCHMARK_DIR benchmark)
+set(SRC_DIR src)
+set(CUDA_SRC_DIR cudasrc)
 
-# Source files
-file(GLOB_RECURSE SOURCE_FILES ${SRC_DIR}/*.c)
+include_directories(${INC_DIR})
 
-include_directories(include)
+file(GLOB_RECURSE SOURCE_FILES ${SRC_DIR}/*.c)
 
 add_executable(speed_cpu ${SOURCE_FILES})
-# add_executable(benchmark ${SRC_DIR}/matrix.c ${BENCHMARK_DIR}/benchmark.c)
-
-target_link_libraries(speed_cpu m pthread)
-# target_link_libraries(benchmark m)
-
+target_link_libraries(speed_cpu m pthread gomp)
+
+# Optional GPU build. check_language() probes for a usable CUDA compiler
+# without aborting the configure step, and replaces the deprecated FindCUDA
+# module (find_package(CUDA)). It sets CMAKE_CUDA_COMPILER only on success.
+include(CheckLanguage)
+check_language(CUDA)
+
+if(CMAKE_CUDA_COMPILER)
+    enable_language(CUDA)
+    # NOTE(review): -arch=sm_80 hard-codes the target GPU architecture.
+    set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Xptxas -O3 --use_fast_math -Xcompiler -march=native -unroll-aggressive -arch=sm_80")
+    # MPI is mandatory for the GPU binary (the Makefile launches it via mpirun).
+    find_package(MPI REQUIRED)
+    include_directories(${MPI_INCLUDE_PATH})
+    file(GLOB_RECURSE CUDA_SOURCE_FILES ${CUDA_SRC_DIR}/*.cu)
+    add_executable(speed_gpu ${CUDA_SOURCE_FILES})
+    set_target_properties(speed_gpu PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
+    target_link_libraries(speed_gpu m ${MPI_LIBRARIES})
+else()
+    message(STATUS "CUDA not found, only CPU version will be built.")
+endif()
 
 
diff --git a/Makefile b/Makefile
@@ -1,37 +1,36 @@
-.PHONY: all test clean run build run_test
+# Command-style targets. `build` in particular must stay phony because cmake
+# creates a directory with the same name. Only targets that still have a rule
+# are listed, so a request for a removed target fails loudly.
+.PHONY: all clean build run_cpu run_gpu test_cpu test_gpu
 
-all: rebuild
+# Default iterations
+iterations ?= 1000
+
+all: build
 
 clean:
 	rm -f test/results.csv
 	rm -f results.csv
 	rm -rf build
-	rm -f speed_cpu
+	rm -f speed_cpu speed_gpu
 
 build: clean
-	cmake -Bbuild
-	$(MAKE) -C ./build
-	mv ./build/speed_cpu ./
-
-rebuild:
-	$(MAKE) -C ./build
-	mv ./build/speed_cpu ./
-
-run: build
-	./speed_demo_cpu.sh ./weights_and_biases.txt ./tensors
-
-run_test: build
-	./speed_cpu ./weights_and_biases.txt ./tensors
-
-test: build
-	./speed_cpu ./weights_and_biases.txt ./tensors 1
-	mv ./results.csv ./test
-	python3 ./test/verify_csv.py
+	cmake -S . -B build -DCMAKE_BUILD_TYPE=Release
+	$(MAKE) -C build
+	cp -u build/speed_cpu ./
+	if [ -f build/speed_gpu ]; then cp -u build/speed_gpu ./; fi
 
-bench: build
-	./build/benchmark
+# Rebuild, then run the CPU binary on ./weights_and_biases.txt and ./tensors.
+# The last argument is $(iterations) (default 1000; override with
+# `make run_cpu iterations=N`).
+run_cpu: build
+	./speed_cpu ./weights_and_biases.txt ./tensors $(iterations)
 
-stat: build
-	python3 ./benchmark/stat.py
+# Rebuild, then launch one MPI rank per GPU listed by nvidia-smi.
+# The GPU count is taken with shell command substitution while the recipe
+# runs (instead of make's $(shell ...) inside a recipe), and the recipe stops
+# with a clear message when no GPU is reported rather than calling
+# `mpirun -np 0`.
+run_gpu: build
+	n_gpus=$$(nvidia-smi --query-gpu=name --format=csv,noheader | wc -l); \
+	if [ "$$n_gpus" -lt 1 ]; then echo "run_gpu: no NVIDIA GPU detected" >&2; exit 1; fi; \
+	mpirun -np $$n_gpus ./speed_gpu ./weights_and_biases.txt ./tensors $(iterations)
 
+# Rebuild, run the CPU binary with $(iterations), move ./results.csv
+# (presumably written by speed_cpu; verify) into ./test, then run
+# test/verify_csv.py.
+test_cpu: build
+	./speed_cpu ./weights_and_biases.txt ./tensors $(iterations)
+	mv ./results.csv ./test
+	python3 ./test/verify_csv.py
 
+# Rebuild, run the GPU binary with one MPI rank per GPU listed by nvidia-smi,
+# move ./results.csv into ./test, then run test/verify_csv.py.
+# The GPU count is taken with shell command substitution while the recipe
+# runs (instead of make's $(shell ...) inside a recipe), and the recipe stops
+# with a clear message when no GPU is reported rather than calling
+# `mpirun -np 0`.
+test_gpu: build
+	n_gpus=$$(nvidia-smi --query-gpu=name --format=csv,noheader | wc -l); \
+	if [ "$$n_gpus" -lt 1 ]; then echo "test_gpu: no NVIDIA GPU detected" >&2; exit 1; fi; \
+	mpirun -np $$n_gpus ./speed_gpu ./weights_and_biases.txt ./tensors $(iterations)
+	mv ./results.csv ./test
+	python3 ./test/verify_csv.py
diff --git a/benchmark/benchmark.c b/benchmark/benchmark.c
diff --git a/benchmark/matrix_add/Makefile → benchmark/cpu/matrix_add/Makefile b/benchmark/matrix_add/Makefile → benchmark/cpu/matrix_add/Makefile
diff --git a/benchmark/matrix_add/benchmark.c → benchmark/cpu/matrix_add/benchmark.c b/benchmark/matrix_add/benchmark.c → benchmark/cpu/matrix_add/benchmark.c
diff --git a/benchmark/matrix_add/matrix_add.h → benchmark/cpu/matrix_add/matrix_add.h b/benchmark/matrix_add/matrix_add.h → benchmark/cpu/matrix_add/matrix_add.h
diff --git a/benchmark/matrix_add/plot.py → benchmark/cpu/matrix_add/plot.py b/benchmark/matrix_add/plot.py → benchmark/cpu/matrix_add/plot.py
diff --git a/...hmark/matrix_add/versions/matrix_add_v1.c → ...k/cpu/matrix_add/versions/matrix_add_v1.c b/...hmark/matrix_add/versions/matrix_add_v1.c → ...k/cpu/matrix_add/versions/matrix_add_v1.c
diff --git a/benchmark/matrix_mul/Makefile → benchmark/cpu/matrix_mul/Makefile b/benchmark/matrix_mul/Makefile → benchmark/cpu/matrix_mul/Makefile
diff --git a/benchmark/matrix_mul/benchmark.c → benchmark/cpu/matrix_mul/benchmark.c b/benchmark/matrix_mul/benchmark.c → benchmark/cpu/matrix_mul/benchmark.c
diff --git a/benchmark/matrix_mul/matrix_mul.h → benchmark/cpu/matrix_mul/matrix_mul.h b/benchmark/matrix_mul/matrix_mul.h → benchmark/cpu/matrix_mul/matrix_mul.h
diff --git a/benchmark/matrix_mul/plot.py → benchmark/cpu/matrix_mul/plot.py b/benchmark/matrix_mul/plot.py → benchmark/cpu/matrix_mul/plot.py
diff --git a/...hmark/matrix_mul/versions/matrix_mul_v1.c → ...k/cpu/matrix_mul/versions/matrix_mul_v1.c b/...hmark/matrix_mul/versions/matrix_mul_v1.c → ...k/cpu/matrix_mul/versions/matrix_mul_v1.c
diff --git a/...hmark/matrix_mul/versions/matrix_mul_v2.c → ...k/cpu/matrix_mul/versions/matrix_mul_v2.c b/...hmark/matrix_mul/versions/matrix_mul_v2.c → ...k/cpu/matrix_mul/versions/matrix_mul_v2.c
diff --git a/benchmark/multithreading/plot.py → benchmark/cpu/multithreading/plot.py b/benchmark/multithreading/plot.py → benchmark/cpu/multithreading/plot.py
diff --git a/benchmark/relu/Makefile → benchmark/cpu/relu/Makefile b/benchmark/relu/Makefile → benchmark/cpu/relu/Makefile
diff --git a/benchmark/relu/benchmark.c → benchmark/cpu/relu/benchmark.c b/benchmark/relu/benchmark.c → benchmark/cpu/relu/benchmark.c
diff --git a/benchmark/relu/plot.py → benchmark/cpu/relu/plot.py b/benchmark/relu/plot.py → benchmark/cpu/relu/plot.py
diff --git a/benchmark/relu/relu.h → benchmark/cpu/relu/relu.h b/benchmark/relu/relu.h → benchmark/cpu/relu/relu.h
diff --git a/benchmark/relu/versions/relu_v1.c → benchmark/cpu/relu/versions/relu_v1.c b/benchmark/relu/versions/relu_v1.c → benchmark/cpu/relu/versions/relu_v1.c
diff --git a/benchmark/relu/versions/relu_v2.c → benchmark/cpu/relu/versions/relu_v2.c b/benchmark/relu/versions/relu_v2.c → benchmark/cpu/relu/versions/relu_v2.c
diff --git a/benchmark/softmax/Makefile → benchmark/cpu/softmax/Makefile b/benchmark/softmax/Makefile → benchmark/cpu/softmax/Makefile
diff --git a/benchmark/softmax/benchmark.c → benchmark/cpu/softmax/benchmark.c b/benchmark/softmax/benchmark.c → benchmark/cpu/softmax/benchmark.c
diff --git a/benchmark/softmax/plot.py → benchmark/cpu/softmax/plot.py b/benchmark/softmax/plot.py → benchmark/cpu/softmax/plot.py
diff --git a/benchmark/softmax/softmax.h → benchmark/cpu/softmax/softmax.h b/benchmark/softmax/softmax.h → benchmark/cpu/softmax/softmax.h
diff --git a/benchmark/softmax/versions/softmax_v1.c → benchmark/cpu/softmax/versions/softmax_v1.c b/benchmark/softmax/versions/softmax_v1.c → benchmark/cpu/softmax/versions/softmax_v1.c
diff --git a/benchmark/gpu/matrix_add/Makefile b/benchmark/gpu/matrix_add/Makefile
@@ -0,0 +1,19 @@
+# Builds one benchmark executable per CUDA source in versions/, each compiled
+# together with the shared driver benchmark.cu, and places it in bin/.
+.PHONY: all clean plot
+
+compile := nvcc -O3 -arch=sm_75 --use_fast_math
+SRC_DIR := versions
+BIN_DIR := bin
+SRC_FILES := $(wildcard $(SRC_DIR)/*.cu)
+EXECUTABLES := $(patsubst $(SRC_DIR)/%.cu,$(BIN_DIR)/%,$(SRC_FILES))
+
+# `clean` is listed first so every run starts from an empty bin/. That order
+# is only guaranteed for serial builds, so do not run this Makefile with -j.
+all: clean $(EXECUTABLES)
+
+# Recreate an empty output directory.
+clean:
+	rm -rf $(BIN_DIR)
+	mkdir -p $(BIN_DIR)
+
+# Compile one version together with the driver. The executable is written to
+# the rule's own target path so that bin/<version> exists after the recipe.
+# NOTE(review): the output argument was garbled in the source text; $@ is the
+# reconstruction that matches this rule's target. Confirm against the repo.
+$(BIN_DIR)/%: $(SRC_DIR)/%.cu benchmark.cu template.cuh
+	$(compile) $< benchmark.cu -o $@
+
+# Rebuild everything, then run the plotting script.
+plot: all
+	python3 ./plot.py
+
+
diff --git a/benchmark/gpu/matrix_add/benchmark.cu b/benchmark/gpu/matrix_add/benchmark.cu
@@ -0,0 +1,13 @@
+#include "template.cuh"
+#include <limits.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <time.h>
+
+// Benchmark driver. Reads an optional iteration count from argv[1]
+// (default 100000), passes it to the time(int) function declared in
+// template.cuh, and prints the returned value with "%f" and no trailing
+// newline.
+// Returns 1 without running the benchmark when the count is negative or does
+// not fit in the int parameter of time().
+int main(int argc, char* argv[]) {
+    long n = 100000;
+    if (argc > 1) {
+        n = atol(argv[1]); // declared in <stdlib.h>
+    }
+    // time() takes an int; reject values that would be silently truncated.
+    if (n < 0 || n > INT_MAX) {
+        fprintf(stderr, "iteration count out of range: %ld\n", n);
+        return 1;
+    }
+    printf("%f", time((int)n));
+    return 0;
+}
diff --git a/benchmark/gpu/matrix_add/plot.py b/benchmark/gpu/matrix_add/plot.py
@@ -0,0 +1,50 @@
+import os
+import subprocess
+import matplotlib.pyplot as plt
+
+result = subprocess.run(['make'], capture_output=True, text=True)
+# Define the folder containing the executables
+folder_path = './bin'  # Change this to your bin folder path
+
+# Define the input sizes to test
+start=10000
+end=10000
+step=100000
+
+input_sizes = list(range(start, end+1, step))
+# Initialize a dictionary to store runtimes for each executable
+runtimes = {exe: [] for exe in os.listdir(folder_path) if os.path.isfile(os.path.join(folder_path, exe))}
+
+# Loop through each executable
+for exe in runtimes.keys():
+    exe_path = os.path.join(folder_path, exe)
+
+    # Loop through each input size
+    for n in range(start,end+1,step):
+        # Run the executable with the input size and capture its output
+        result = subprocess.run([exe_path, str(n)], capture_output=True, text=True)
+
+        # Parse the output to get the runtime
+        runtime = float(result.stdout.strip())
+        print(exe,runtime)
+
+        # Append the runtime to the corresponding executable list
+        runtimes[exe].append(runtime)
+
+# Plot the data
+plt.figure(figsize=(12, 6))
+
+# Loop through each executable and plot the runtimes
+for exe, times in runtimes.items():
+    plt.plot(input_sizes, times, marker='o', label=exe)
+
+plt.xlabel('Iterations')
+plt.ylabel('Runtime (s)')
+plt.title('Benchmark of Function Versions')
+plt.legend()
+plt.grid(True)
+plt.tight_layout()
+
+output_file = 'benchmark_plot.png'  # Specify your desired output file name and format
+plt.savefig(output_file)
+# Show the plot
diff --git a/benchmark/gpu/matrix_add/template.cuh b/benchmark/gpu/matrix_add/template.cuh
@@ -0,0 +1,10 @@
+#pragma once
+
+// Shared declarations for the matrix_add GPU benchmark. benchmark.cu calls
+// time(); versions/1.cu defines both functions declared below.
+
+// Matrix descriptor: dimensions plus a pointer to the element buffer.
+typedef struct {
+    int rows;
+    int cols;
+    float* data; // element buffer; allocated with cudaMalloc by new_matrix_d in versions/1.cu
+} matrix;
+
+// Benchmark entry point: takes the iteration count n and returns a double
+// (elapsed seconds in versions/1.cu).
+// NOTE(review): shares its name with the C library time() from <time.h>,
+// which benchmark.cu also includes; confirm this overload is intended.
+double time(int n);
+
+// Creates a matrix descriptor with the given dimensions (versions/1.cu
+// allocates its data buffer with cudaMalloc).
+matrix* new_matrix_d(int rows, int cols);
diff --git a/benchmark/gpu/matrix_add/versions/1.cu b/benchmark/gpu/matrix_add/versions/1.cu
@@ -0,0 +1,44 @@
+#include "../template.cuh"
+
+// Allocates a matrix descriptor and a rows * cols float buffer with malloc.
+// Returns NULL (freeing any partial allocation) when an allocation fails.
+// The caller owns both the descriptor and its data buffer.
+matrix* new_matrix(int rows, int cols) {
+    matrix* res = (matrix*)malloc(sizeof(matrix));
+    if (res == NULL) {
+        return NULL;
+    }
+    res->rows = rows;
+    res->cols = cols;
+    // Multiply in size_t so large shapes do not overflow int arithmetic.
+    size_t bytes = (size_t)rows * (size_t)cols * sizeof(float);
+    res->data = (float*)malloc(bytes);
+    // malloc(0) may legitimately return NULL, so only a non-empty request
+    // that yields NULL is treated as a failure.
+    if (res->data == NULL && bytes != 0) {
+        free(res);
+        return NULL;
+    }
+    return res;
+}
+
+// Allocates a matrix descriptor with malloc and a rows * cols float buffer
+// with cudaMalloc. Returns NULL (freeing the descriptor) when either
+// allocation fails. The caller releases data with cudaFree and the
+// descriptor with free.
+matrix* new_matrix_d(int rows, int cols) {
+    matrix* res = (matrix*)malloc(sizeof(matrix));
+    if (res == NULL) {
+        return NULL;
+    }
+    res->rows = rows;
+    res->cols = cols;
+    res->data = NULL;
+    // Multiply in size_t so large shapes do not overflow int arithmetic.
+    size_t bytes = (size_t)rows * (size_t)cols * sizeof(float);
+    if (cudaMalloc((void**)&(res->data), bytes) != cudaSuccess) {
+        free(res);
+        return NULL;
+    }
+    return res;
+}
+
+// Kernel: in-place element-wise add, a[i] += b[i], for indices below `rows`.
+// Each thread handles the one element at its global 1-D index; threads whose
+// index is at or beyond `rows` do nothing.
+__global__ void matrix_add(float* a, float* b, int rows) {
+    const int i = threadIdx.x + blockDim.x * blockIdx.x;
+    if (i >= rows) {
+        return;
+    }
+    a[i] = a[i] + b[i];
+}
+
+// Benchmarks matrix_add: launches the kernel n times on a dedicated stream
+// over two 100000-element device vectors and returns the elapsed clock()
+// time in seconds. Returns -1.0 if the device buffers cannot be created.
+double time(int n) {
+    const int row = 100000;
+    matrix* a = new_matrix_d(row, 1);
+    matrix* b = new_matrix_d(row, 1);
+    if (a == NULL || b == NULL) {
+        if (a != NULL) {
+            cudaFree(a->data);
+            free(a);
+        }
+        if (b != NULL) {
+            cudaFree(b->data);
+            free(b);
+        }
+        return -1.0;
+    }
+    // Give the operands defined contents; cudaMalloc does not initialise memory.
+    cudaMemset(a->data, 0, (size_t)row * sizeof(float));
+    cudaMemset(b->data, 0, (size_t)row * sizeof(float));
+
+    cudaStream_t stream1;
+    cudaStreamCreate(&stream1);
+
+    // One thread per element, rounded up to whole blocks.
+    const int thread = 1024;
+    const int block = (row + thread - 1) / thread;
+
+    clock_t start = clock();
+    for (int i = 0; i < n; i++) {
+        // Use the computed launch shape so all `row` elements are processed
+        // (a <<<1,1>>> launch would only ever touch element 0).
+        matrix_add<<<block, thread, 0, stream1>>>(a->data, b->data, row);
+    }
+    // Kernel launches are asynchronous: wait for the queued work to finish
+    // before stopping the clock, otherwise only launch overhead is measured.
+    cudaStreamSynchronize(stream1);
+    double seconds = (double)(clock() - start) / CLOCKS_PER_SEC;
+
+    // Release everything created above.
+    cudaStreamDestroy(stream1);
+    cudaFree(a->data);
+    cudaFree(b->data);
+    free(a);
+    free(b);
+    return seconds;
+}