diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index f81ecb0..f1b40b1 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -1,34 +1,34 @@ name: CI on: - push: - branches: main - paths: ['**.cu','**.c','**.cpp', '**.h', '**CMakeLists.txt'] - pull_request: - branches: main - paths: ['**.cu','**.c','**.cpp', '**.h', '**CMakeLists.txt'] + push: + branches: main + paths: ["**.cu", "**.c", "**.cpp", "**.h", "**CMakeLists.txt"] + pull_request: + branches: main + paths: ["**.cu", "**.c", "**.cpp", "**.h", "**CMakeLists.txt"] jobs: - build-and-test: - runs-on: ubuntu-latest + build-and-test: + runs-on: ubuntu-latest - steps: - - name: Checkout code - uses: actions/checkout@v4 + steps: + - name: Checkout code + uses: actions/checkout@v4 - - name: Setup python - uses: actions/setup-python@v5 - with: - python-version: '3.10' + - name: Setup python + uses: actions/setup-python@v5 + with: + python-version: "3.10" - - name: Install dependencies - run: | - pip install pandas + - name: Install dependencies + run: | + pip install pandas - - name: Build project - run: | - make build - - - name: Run test suite - run: | - make test \ No newline at end of file + - name: Build project + run: | + make build + + - name: Run test suite + run: | + make test_cpu diff --git a/CMakeLists.txt b/CMakeLists.txt index d2d5cbd..faea7b0 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,30 +1,36 @@ cmake_minimum_required(VERSION 3.16) -# Set the project name -project(ichida-algo) +project(ichida-algo LANGUAGES C CXX) set(CMAKE_C_FLAGS "-O3 -march=native -ffast-math -funroll-loops -fopenmp -Wall -Wextra") - set(CMAKE_C_STANDARD 11) set(CMAKE_C_STANDARD_REQUIRED True) -# set(CMAKE_VERBOSE_MAKEFILE ON) +set(CMAKE_VERBOSE_MAKEFILE ON) -set(SRC_DIR src) set(INC_DIR include) -set(LIB_DIR lib) -set(TEST_DIR test) -set(BENCHMARK_DIR benchmark) +set(SRC_DIR src) +set(CUDA_SRC_DIR cudasrc) -# Source files -file(GLOB_RECURSE SOURCE_FILES ${SRC_DIR}/*.c) +include_directories(${INC_DIR}) -include_directories(include) +file(GLOB_RECURSE SOURCE_FILES ${SRC_DIR}/*.c) add_executable(speed_cpu ${SOURCE_FILES}) -# add_executable(benchmark ${SRC_DIR}/matrix.c ${BENCHMARK_DIR}/benchmark.c) - -target_link_libraries(speed_cpu m pthread) -# target_link_libraries(benchmark m) - +target_link_libraries(speed_cpu m pthread gomp) + +find_package(CUDA) + +if(CUDA_FOUND) + enable_language(CUDA) + set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Xptxas -O3 --use_fast_math -Xcompiler -march=native -unroll-aggressive -arch=sm_80") + find_package(MPI REQUIRED) + include_directories(${MPI_INCLUDE_PATH}) + file(GLOB_RECURSE CUDA_SOURCE_FILES ${CUDA_SRC_DIR}/*.cu) + add_executable(speed_gpu ${CUDA_SOURCE_FILES}) + set_target_properties(speed_gpu PROPERTIES CUDA_SEPARABLE_COMPILATION ON) + target_link_libraries(speed_gpu m ${MPI_LIBRARIES}) +else() + message(STATUS "CUDA not found, only CPU version will be built.") +endif() diff --git a/Makefile b/Makefile index 09a0ea2..0253aee 100644 --- a/Makefile +++ b/Makefile @@ -1,37 +1,36 @@ -.PHONY: all test clean run build run_test +.PHONY: all clean build run_cpu run_gpu test_cpu test_gpu bench stat -all: rebuild +# Default iterations +iterations ?= 1000 + +all: build clean: rm -f test/results.csv rm -f results.csv rm -rf build - rm -f speed_cpu + rm -f speed_cpu speed_gpu build: clean - cmake -Bbuild - $(MAKE) -C ./build - mv ./build/speed_cpu ./ - -rebuild: - $(MAKE) -C ./build - mv ./build/speed_cpu ./ - -run: build - ./speed_demo_cpu.sh ./weights_and_biases.txt ./tensors - -run_test: build - ./speed_cpu ./weights_and_biases.txt ./tensors - -test: build - ./speed_cpu ./weights_and_biases.txt ./tensors 1 - mv ./results.csv ./test - python3 ./test/verify_csv.py + cmake -S . -B build -DCMAKE_BUILD_TYPE=Release + $(MAKE) -C build + cp -u build/speed_cpu ./ + if [ -f build/speed_gpu ]; then cp -u build/speed_gpu ./; fi -bench: build - ./build/benchmark +run_cpu: build + ./speed_cpu ./weights_and_biases.txt ./tensors $(iterations) -stat: build - python3 ./benchmark/stat.py +run_gpu: build + n_gpus=$(shell nvidia-smi --query-gpu=name --format=csv,noheader | wc -l); \ + mpirun -np $$n_gpus ./speed_gpu ./weights_and_biases.txt ./tensors $(iterations) +test_cpu: build + ./speed_cpu ./weights_and_biases.txt ./tensors $(iterations) + mv ./results.csv ./test + python3 ./test/verify_csv.py +test_gpu: build + n_gpus=$(shell nvidia-smi --query-gpu=name --format=csv,noheader | wc -l); \ + mpirun -np $$n_gpus ./speed_gpu ./weights_and_biases.txt ./tensors $(iterations) + mv ./results.csv ./test + python3 ./test/verify_csv.py \ No newline at end of file diff --git a/benchmark/benchmark.c b/benchmark/benchmark.c deleted file mode 100644 index 22c7eef..0000000 --- a/benchmark/benchmark.c +++ /dev/null @@ -1,62 +0,0 @@ -#include "../include/matrix.h" -#include "util.h" -#include -#include -#include - -double benchmark_matrix_mul(int iterations, matrix* a, matrix* b, matrix* c) { - double res = 0; - for (int i = 0; i < iterations; i++) { - uint64_t start = rdtsc(); - matrix_mul(a, b, c); - uint64_t end = rdtsc(); - res += (double)(end - start) / (a->rows * a->cols); - } - res /= iterations; - return res; -} - -double benchmark_matrix_add(int iterations, matrix* a, matrix* b) { - double res = 0; - for (int i = 0; i < iterations; i++) { - uint64_t start = rdtsc(); - matrix_add(a, b); - uint64_t end = rdtsc(); - res += (double)(end - start) / (a->rows); - } - res /= iterations; - return res; -} - -double benchmark_relu(int iterations, matrix* a) { - double res = 0; - for (int i = 0; i < iterations; i++) { - uint64_t start = rdtsc(); - relu(a); - uint64_t end = rdtsc(); - res += (double)(end - start) / (a->rows); - } - res /= iterations; - return res; -} - -double benchmark_softmax(int iterations, matrix* a) { - double res = 0; - for (int i = 0; i < iterations; i++) { - uint64_t start = rdtsc(); - softmax(a); - uint64_t end = rdtsc(); - res += (double)(end - start) / (a->rows * 2); - } - res /= iterations; - return res; -} - -int main() { - int iterations = 200000; - printf("- matrix_mul: %f CPE\n", - benchmark_matrix_mul(iterations, new_matrix(2000, 1000), new_matrix(2000, 1), new_matrix(2000, 1))); - printf("- matrix_add: %f CPE\n", benchmark_matrix_add(iterations, new_matrix(2000, 1), new_matrix(2000, 1))); - printf("- relu: %f CPE\n", benchmark_relu(iterations, new_matrix(2000, 1))); - printf("- softmax: %f CPE\n", benchmark_softmax(iterations, new_matrix(2000, 1))); -} \ No newline at end of file diff --git a/benchmark/matrix_add/Makefile b/benchmark/cpu/matrix_add/Makefile similarity index 100% rename from benchmark/matrix_add/Makefile rename to benchmark/cpu/matrix_add/Makefile diff --git a/benchmark/matrix_add/benchmark.c b/benchmark/cpu/matrix_add/benchmark.c similarity index 100% rename from benchmark/matrix_add/benchmark.c rename to benchmark/cpu/matrix_add/benchmark.c diff --git a/benchmark/matrix_add/matrix_add.h b/benchmark/cpu/matrix_add/matrix_add.h similarity index 100% rename from benchmark/matrix_add/matrix_add.h rename to benchmark/cpu/matrix_add/matrix_add.h diff --git a/benchmark/matrix_add/plot.py b/benchmark/cpu/matrix_add/plot.py similarity index 100% rename from benchmark/matrix_add/plot.py rename to benchmark/cpu/matrix_add/plot.py diff --git a/benchmark/matrix_add/versions/matrix_add_v1.c b/benchmark/cpu/matrix_add/versions/matrix_add_v1.c similarity index 100% rename from benchmark/matrix_add/versions/matrix_add_v1.c rename to benchmark/cpu/matrix_add/versions/matrix_add_v1.c diff --git a/benchmark/matrix_mul/Makefile b/benchmark/cpu/matrix_mul/Makefile similarity index 100% rename from benchmark/matrix_mul/Makefile rename to benchmark/cpu/matrix_mul/Makefile diff --git a/benchmark/matrix_mul/benchmark.c b/benchmark/cpu/matrix_mul/benchmark.c similarity index 100% rename from benchmark/matrix_mul/benchmark.c rename to benchmark/cpu/matrix_mul/benchmark.c diff --git a/benchmark/matrix_mul/matrix_mul.h b/benchmark/cpu/matrix_mul/matrix_mul.h similarity index 100% rename from benchmark/matrix_mul/matrix_mul.h rename to benchmark/cpu/matrix_mul/matrix_mul.h diff --git a/benchmark/matrix_mul/plot.py b/benchmark/cpu/matrix_mul/plot.py similarity index 100% rename from benchmark/matrix_mul/plot.py rename to benchmark/cpu/matrix_mul/plot.py diff --git a/benchmark/matrix_mul/versions/matrix_mul_v1.c b/benchmark/cpu/matrix_mul/versions/matrix_mul_v1.c similarity index 100% rename from benchmark/matrix_mul/versions/matrix_mul_v1.c rename to benchmark/cpu/matrix_mul/versions/matrix_mul_v1.c diff --git a/benchmark/matrix_mul/versions/matrix_mul_v2.c b/benchmark/cpu/matrix_mul/versions/matrix_mul_v2.c similarity index 100% rename from benchmark/matrix_mul/versions/matrix_mul_v2.c rename to benchmark/cpu/matrix_mul/versions/matrix_mul_v2.c diff --git a/benchmark/multithreading/plot.py b/benchmark/cpu/multithreading/plot.py similarity index 100% rename from benchmark/multithreading/plot.py rename to benchmark/cpu/multithreading/plot.py diff --git a/benchmark/relu/Makefile b/benchmark/cpu/relu/Makefile similarity index 100% rename from benchmark/relu/Makefile rename to benchmark/cpu/relu/Makefile diff --git a/benchmark/relu/benchmark.c b/benchmark/cpu/relu/benchmark.c similarity index 100% rename from benchmark/relu/benchmark.c rename to benchmark/cpu/relu/benchmark.c diff --git a/benchmark/relu/plot.py b/benchmark/cpu/relu/plot.py similarity index 100% rename from benchmark/relu/plot.py rename to benchmark/cpu/relu/plot.py diff --git a/benchmark/relu/relu.h b/benchmark/cpu/relu/relu.h similarity index 100% rename from benchmark/relu/relu.h rename to benchmark/cpu/relu/relu.h diff --git a/benchmark/relu/versions/relu_v1.c b/benchmark/cpu/relu/versions/relu_v1.c similarity index 100% rename from benchmark/relu/versions/relu_v1.c rename to benchmark/cpu/relu/versions/relu_v1.c diff --git a/benchmark/relu/versions/relu_v2.c b/benchmark/cpu/relu/versions/relu_v2.c similarity index 100% rename from benchmark/relu/versions/relu_v2.c rename to benchmark/cpu/relu/versions/relu_v2.c diff --git a/benchmark/softmax/Makefile b/benchmark/cpu/softmax/Makefile similarity index 100% rename from benchmark/softmax/Makefile rename to benchmark/cpu/softmax/Makefile diff --git a/benchmark/softmax/benchmark.c b/benchmark/cpu/softmax/benchmark.c similarity index 100% rename from benchmark/softmax/benchmark.c rename to benchmark/cpu/softmax/benchmark.c diff --git a/benchmark/softmax/plot.py b/benchmark/cpu/softmax/plot.py similarity index 100% rename from benchmark/softmax/plot.py rename to benchmark/cpu/softmax/plot.py diff --git a/benchmark/softmax/softmax.h b/benchmark/cpu/softmax/softmax.h similarity index 100% rename from benchmark/softmax/softmax.h rename to benchmark/cpu/softmax/softmax.h diff --git a/benchmark/softmax/versions/softmax_v1.c b/benchmark/cpu/softmax/versions/softmax_v1.c similarity index 100% rename from benchmark/softmax/versions/softmax_v1.c rename to benchmark/cpu/softmax/versions/softmax_v1.c diff --git a/benchmark/gpu/matrix_add/Makefile b/benchmark/gpu/matrix_add/Makefile new file mode 100644 index 0000000..c754193 --- /dev/null +++ b/benchmark/gpu/matrix_add/Makefile @@ -0,0 +1,19 @@ +compile = nvcc -O3 -arch=sm_75 --use_fast_math +SRC_DIR := versions +BIN_DIR := bin +SRC_FILES := $(wildcard $(SRC_DIR)/*.cu) +EXECUTABLES := $(patsubst $(SRC_DIR)/%.cu, $(BIN_DIR)/%, $(SRC_FILES)) + +all: clean $(EXECUTABLES) + +clean: + rm -f -r bin + mkdir bin + +$(BIN_DIR)/%: $(SRC_DIR)/%.cu + $(compile) $< benchmark.cu -o $@.exe + +plot: all + python3 ./plot.py + + diff --git a/benchmark/gpu/matrix_add/benchmark.cu b/benchmark/gpu/matrix_add/benchmark.cu new file mode 100644 index 0000000..cc2ce0d --- /dev/null +++ b/benchmark/gpu/matrix_add/benchmark.cu @@ -0,0 +1,13 @@ +#include "template.cuh" +#include +#include + +int main(int argc, char* argv[]) { + long n; + if (argc > 1) { + n = atol(argv[1]); + } else { + n = 100000; + } + printf("%f", time(n)); +} \ No newline at end of file diff --git a/benchmark/gpu/matrix_add/plot.py b/benchmark/gpu/matrix_add/plot.py new file mode 100644 index 0000000..703c9a8 --- /dev/null +++ b/benchmark/gpu/matrix_add/plot.py @@ -0,0 +1,50 @@ +import os +import subprocess +import matplotlib.pyplot as plt + +result = subprocess.run(['make'], capture_output=True, text=True) +# Define the folder containing the executables +folder_path = './bin' # Change this to your bin folder path + +# Define the input sizes to test +start=10000 +end=10000 +step=100000 + +input_sizes = list(range(start, end+1, step)) +# Initialize a dictionary to store runtimes for each executable +runtimes = {exe: [] for exe in os.listdir(folder_path) if os.path.isfile(os.path.join(folder_path, exe))} + +# Loop through each executable +for exe in runtimes.keys(): + exe_path = os.path.join(folder_path, exe) + + # Loop through each input size + for n in range(start,end+1,step): + # Run the executable with the input size and capture its output + result = subprocess.run([exe_path, str(n)], capture_output=True, text=True) + + # Parse the output to get the runtime + runtime = float(result.stdout.strip()) + print(exe,runtime) + + # Append the runtime to the corresponding executable list + runtimes[exe].append(runtime) + +# Plot the data +plt.figure(figsize=(12, 6)) + +# Loop through each executable and plot the runtimes +for exe, times in runtimes.items(): + plt.plot(input_sizes, times, marker='o', label=exe) + +plt.xlabel('Iterations') +plt.ylabel('Runtime (s)') +plt.title('Benchmark of Function Versions') +plt.legend() +plt.grid(True) +plt.tight_layout() + +output_file = 'benchmark_plot.png' # Specify your desired output file name and format +plt.savefig(output_file) +# Show the plot \ No newline at end of file diff --git a/benchmark/gpu/matrix_add/template.cuh b/benchmark/gpu/matrix_add/template.cuh new file mode 100644 index 0000000..baa94a5 --- /dev/null +++ b/benchmark/gpu/matrix_add/template.cuh @@ -0,0 +1,10 @@ +#pragma once + +typedef struct { + int rows; + int cols; + float* data; // array +} matrix; + +double time(int n); +matrix* new_matrix_d(int rows, int cols); \ No newline at end of file diff --git a/benchmark/gpu/matrix_add/versions/1.cu b/benchmark/gpu/matrix_add/versions/1.cu new file mode 100644 index 0000000..cd4b001 --- /dev/null +++ b/benchmark/gpu/matrix_add/versions/1.cu @@ -0,0 +1,44 @@ +#include "../template.cuh" + +matrix* new_matrix(int rows, int cols) { + matrix* res = (matrix*)malloc(sizeof(matrix)); + res->rows = rows; + res->cols = cols; + res->data = (float*)malloc((rows * cols) * sizeof(float)); + return res; +} + +matrix* new_matrix_d(int rows, int cols) { + matrix* res = (matrix*)malloc(sizeof(matrix)); + res->rows = rows; + res->cols = cols; + res->cols = cols; + cudaMalloc((void**)&(res->data), rows * cols * sizeof(float)); + return res; +} + +__global__ void matrix_add(float *a, float*b ,int rows) +{ + int idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx>>(a->data,b->data,row); + } + double seconds = (double)(clock() - (double)start) / CLOCKS_PER_SEC; + return seconds; +} \ No newline at end of file diff --git a/benchmark/gpu/matrix_add/versions/cpu.cu b/benchmark/gpu/matrix_add/versions/cpu.cu new file mode 100644 index 0000000..ce36a1c --- /dev/null +++ b/benchmark/gpu/matrix_add/versions/cpu.cu @@ -0,0 +1,37 @@ +#include "../template.cuh" + +matrix* new_matrix(int rows, int cols) { + matrix* res = (matrix*)malloc(sizeof(matrix)); + res->rows = rows; + res->cols = cols; + res->data = (float*)malloc((rows * cols) * sizeof(float)); + return res; +} + +matrix* new_matrix_d(int rows, int cols) { + matrix* res = (matrix*)malloc(sizeof(matrix)); + res->rows = rows; + res->cols = cols; + res->cols = cols; + cudaMalloc((void**)&(res->data), rows * cols * sizeof(float)); + return res; +} + +void matrix_add(float* a, float* b, int rows) { + for (int i = 0; i < rows; i++) { + a[i] += b[i]; + } +} + +double time(int n) { + int row=100000; + matrix* a = new_matrix(row, 1); + matrix* b = new_matrix(row, 1); + + clock_t start = clock(); + for (int i = 0; i < n; i++) { + matrix_add(a->data, b->data,row); + } + double seconds = (double)(clock() - (double)start) / CLOCKS_PER_SEC; + return seconds; +} \ No newline at end of file diff --git a/benchmark/multithreading/v1.png b/benchmark/multithreading/v1.png deleted file mode 100644 index 042bd8a..0000000 Binary files a/benchmark/multithreading/v1.png and /dev/null differ diff --git a/benchmark/util.h b/benchmark/util.h deleted file mode 100644 index 7d25c35..0000000 --- a/benchmark/util.h +++ /dev/null @@ -1,20 +0,0 @@ -#ifndef UTIL_H -#define UTIL_H - -#ifdef __ARM_ARCH_ISA_A64 -#include - -inline uint64_t rdtsc() { - uint64_t val; - __asm volatile("mrs %0, cntvct_el0" : "=r"(val)); - return val; -} - -#else -#include -#include - -inline uint64_t rdtsc() { return __rdtsc(); } -#endif - -#endif // UTIL_H diff --git a/cudasrc/main.cu b/cudasrc/main.cu new file mode 100644 index 0000000..732c948 --- /dev/null +++ b/cudasrc/main.cu @@ -0,0 +1,277 @@ +#include "matrix.cuh" +#include +#include +#include +#include +#include +#include + +#define NUM_LAYERS 7 + +#define CUDA_CHECK(call) \ + do { \ + cudaError_t err = call; \ + if (err != cudaSuccess) { \ + fprintf(stderr, "CUDA error in %s (%s:%d): %s\n", __func__, __FILE__, __LINE__, cudaGetErrorString(err)); \ + exit(EXIT_FAILURE); \ + } \ + } while (0) + +matrix* weights[NUM_LAYERS]; +matrix* biases[NUM_LAYERS]; + +// device weights and biases; +matrix** d_weights; +matrix** d_biases; + +float* inputs; +float* d_inputs; +int* results; +int* d_results; + +char letters[52] = {'A', 'a', 'B', 'b', 'C', 'c', 'D', 'd', 'E', 'e', 'F', 'f', 'G', 'g', 'H', 'h', 'I', 'i', + 'J', 'j', 'K', 'k', 'L', 'l', 'M', 'm', 'N', 'n', 'O', 'o', 'P', 'p', 'Q', 'q', 'R', 'r', + 'S', 's', 'T', 't', 'U', 'u', 'V', 'v', 'W', 'w', 'X', 'x', 'Y', 'y', 'Z', 'z'}; + +void process_weights_str(char* line, int layer) { + char* token; + float value; + const char* delimiter = ","; + + token = strtok(line, delimiter); + int n = (weights[layer]->rows) * (weights[layer]->cols); + for (int i = 0; i < n; i++) { + value = strtof(token, NULL); + (weights[layer]->data)[i] = value; + token = strtok(NULL, delimiter); + } +} + +void process_biases_str(char* line, int layer) { + char* token; + float value; + const char* delimiter = ","; + + token = strtok(line, delimiter); + + int n = biases[layer]->rows; + for (int i = 0; i < n; i++) { + value = strtof(token, NULL); + (biases[layer]->data)[i] = value; + token = strtok(NULL, delimiter); + } +} + +void read_model(const char* file_name) { + FILE* file = fopen(file_name, "r"); + + char* line = NULL; + size_t len = 0; + int line_number = 0; + int layer = 0; + + while ((getline(&line, &len, file)) != -1) { + if ((line_number - 1) % 4 == 0) { + process_weights_str(line, layer); + } else if ((line_number - 3) % 4 == 0) { + process_biases_str(line, layer); + layer++; + } + line_number++; + } + + free(line); + fclose(file); +} + +void read_tensor(float* a, const char* fileName) { + FILE* file = fopen(fileName, "r"); + char* line = NULL; + size_t len = 0; + + getline(&line, &len, file); + char* token; + float value; + const char* delimiter = ","; + token = strtok(line, delimiter); + + for (int i = 0; i < 225; i++) { + value = strtof(token, NULL); + a[i] = value; + token = strtok(NULL, delimiter); + } + free(line); + fclose(file); +} + +__device__ void propagate_fwd(matrix* weights, float* input_layer, float* output_layer, matrix* biases) { + matrix_mul(weights->data, input_layer, output_layer, weights->rows, weights->cols); + matrix_add(output_layer, biases->data, biases->rows); +} + +#define BLOCKS 108 +#define THREADS_PER_BLOCK 1024 + +__global__ void infer(float* d_inputs, int* d_results, matrix** d_weights, matrix** d_biases, int it_per_input, + int in_num) { + + __shared__ float sharedInput[225]; + float out1[98]; + float out2[65]; + + int num_threads = blockDim.x * gridDim.x; + int thread_idx = (blockIdx.x * blockDim.x + threadIdx.x); + + float* input = (float*)&d_inputs[in_num * 225]; + + if (threadIdx.x < 225) { + sharedInput[threadIdx.x] = input[threadIdx.x]; + } + __syncthreads(); + + for (int i = thread_idx; i < it_per_input; i += num_threads) { + propagate_fwd(d_weights[0], sharedInput, out1, d_biases[0]); + relu(out1, 98); + + propagate_fwd(d_weights[1], out1, out2, d_biases[1]); + relu(out2, 65); + + propagate_fwd(d_weights[2], out2, out1, d_biases[2]); + relu(out1, 50); + + propagate_fwd(d_weights[3], out1, out2, d_biases[3]); + relu(out2, 30); + + propagate_fwd(d_weights[4], out2, out1, d_biases[4]); + relu(out1, 25); + + propagate_fwd(d_weights[5], out1, out2, d_biases[5]); + relu(out2, 40); + + propagate_fwd(d_weights[6], out2, out1, d_biases[6]); + softmax(out1, 52); + + d_results[in_num] = argmax(out1, 52); + } +} +int main(int argc, char* argv[]) { + MPI_Init(&argc, &argv); + int totalProcess, processId; + MPI_Comm_size(MPI_COMM_WORLD, &totalProcess); // size + MPI_Comm_rank(MPI_COMM_WORLD, &processId); // gpuid + + if (argc < 4) { + printf("Not enough arguments. Usage: speed_cpu \n"); + MPI_Finalize(); + return EXIT_FAILURE; + } + + // get no of gpu + int deviceCount; + cudaGetDeviceCount(&deviceCount); + int deviceId = processId % deviceCount; + cudaSetDevice(deviceId); + + // Start timing + struct timeval stop, start; + gettimeofday(&start, NULL); + + weights[0] = new_matrix(98, 225); + weights[1] = new_matrix(65, 98); + weights[2] = new_matrix(50, 65); + weights[3] = new_matrix(30, 50); + weights[4] = new_matrix(25, 30); + weights[5] = new_matrix(40, 25); + weights[6] = new_matrix(52, 40); + + biases[0] = new_matrix(98, 1); + biases[1] = new_matrix(65, 1); + biases[2] = new_matrix(50, 1); + biases[3] = new_matrix(30, 1); + biases[4] = new_matrix(25, 1); + biases[5] = new_matrix(40, 1); + biases[6] = new_matrix(52, 1); + read_model(argv[1]); + + CUDA_CHECK(cudaMalloc(&d_weights, NUM_LAYERS * sizeof(matrix*))); + CUDA_CHECK(cudaMalloc(&d_biases, NUM_LAYERS * sizeof(matrix*))); + for (int i = 0; i < NUM_LAYERS; i++) { + matrix* a = copy_to_device(weights[i]); + matrix* b = copy_to_device(biases[i]); + CUDA_CHECK(cudaMemcpy(&(d_weights[i]), &a, sizeof(matrix*), cudaMemcpyHostToDevice)); + CUDA_CHECK(cudaMemcpy(&(d_biases[i]), &b, sizeof(matrix*), cudaMemcpyHostToDevice)); + } + + const char* directory_path = argv[2]; + struct dirent* entry; + DIR* dir = opendir(directory_path); + + // Read and process inputs + char* file_name = (char*)malloc((100) * sizeof(char)); + char* file_num_str = (char*)malloc((100) * sizeof(char)); + + int file_num; + int input_count = 0; + while ((entry = readdir(dir)) != NULL) { + if (entry->d_type == DT_REG) { + input_count++; + } + } + + results = (int*)malloc((input_count) * sizeof(int)); + inputs = (float*)malloc((input_count) * sizeof(float) * 225); + + cudaMalloc(&d_results, (input_count) * sizeof(int)); + cudaMalloc(&d_inputs, (input_count) * sizeof(float) * 225); + + dir = opendir(directory_path); + while ((entry = readdir(dir)) != NULL) { + if (entry->d_type == DT_REG) { + strcpy(file_num_str, entry->d_name); + file_num_str[strlen(entry->d_name) - 7] = '\0'; + file_num = atoi(entry->d_name); + strcpy(file_name, directory_path); + strcat(file_name, "/"); + strcat(file_name, entry->d_name); + read_tensor((float*)&inputs[(file_num - 1) * 225], file_name); + } + } + + free(file_name); + free(file_num_str); + closedir(dir); + + cudaMemcpy(d_inputs, inputs, sizeof(float) * 225 * input_count, cudaMemcpyHostToDevice); + + int it_num = atoi(argv[3]); + int gpu_it_num = it_num / totalProcess + (processId < (it_num % totalProcess) ? 1 : 0); + + struct timeval stop1, start1; + gettimeofday(&start1, NULL); + + cudaDeviceSynchronize(); + for (int i = 0; i < input_count; i++) { + infer<<>>(d_inputs, d_results, d_weights, d_biases, gpu_it_num, i); + CUDA_CHECK(cudaGetLastError()); + } + cudaDeviceSynchronize(); + + if (processId == 0) { + cudaMemcpy(results, d_results, (input_count) * (sizeof(int)), cudaMemcpyDeviceToHost); + gettimeofday(&stop1, NULL); + printf("Process %d - Inference: %lu us\n", processId, + (stop1.tv_sec - start1.tv_sec) * 1000000 + stop1.tv_usec - start1.tv_usec); + FILE* csv_file = fopen("results.csv", "w+"); + fprintf(csv_file, "image_number, guess\n"); + for (int i = 0; i < input_count; i++) { + fprintf(csv_file, "%d, %c\n", i + 1, letters[results[i]]); + } + fclose(csv_file); + } + // Time taken + gettimeofday(&stop, NULL); + printf("Process %d - Total: %lu us\n", processId, + (stop.tv_sec - start.tv_sec) * 1000000 + stop.tv_usec - start.tv_usec); + MPI_Finalize(); + return EXIT_SUCCESS; +} diff --git a/cudasrc/matrix.cu b/cudasrc/matrix.cu new file mode 100644 index 0000000..c223f6c --- /dev/null +++ b/cudasrc/matrix.cu @@ -0,0 +1,110 @@ +#include "matrix.cuh" +#include "util.cuh" +#include +#include +#include + +#define UNROLL_FACTOR 8 + +__host__ __device__ matrix* new_matrix(int rows, int cols) { + matrix* res = (matrix*)malloc(sizeof(matrix)); + res->rows = rows; + res->cols = cols; + res->data = (float*)malloc((rows * cols) * sizeof(float)); + return res; +} + +__global__ void alloc(matrix* res, float* data, int rows, int cols) { + res->rows = rows; + res->cols = cols; + res->data = data; +} + +matrix* new_matrix_d(int rows, int cols) { + matrix* res; + CUDA_CHECK(cudaMalloc(&res, sizeof(matrix))); + float* data; + cudaMalloc(&data, rows * cols * sizeof(float)); + alloc<<<1, 1>>>(res, data, rows, cols); + return res; +} + +matrix* copy_to_device(matrix* h_mat) { + matrix* res; + CUDA_CHECK(cudaMalloc(&res, sizeof(matrix))); + float* data; + cudaMalloc(&data, h_mat->rows * h_mat->cols * sizeof(float)); + cudaMemcpy(data, h_mat->data, h_mat->rows * h_mat->cols * sizeof(float), cudaMemcpyHostToDevice); + alloc<<<1, 1>>>(res, data, h_mat->rows, h_mat->cols); + return res; +} + +__device__ __host__ matrix* create_copy(matrix* mat) { + matrix* res = (matrix*)malloc(sizeof(matrix)); + res->rows = mat->rows; + res->cols = mat->cols; + res->data = (float*)malloc((res->rows * res->cols) * sizeof(float)); + memcpy(res->data, mat->data, res->rows * res->cols * sizeof(float)); + return res; +} + +__device__ void matrix_mul(float* weight, float* input, float* result, int w_rows, int w_cols) { + for (int i = 0; i < w_rows; i++) { + float sum = 0; + int j = 0; + + for (; j <= w_cols - 4; j += 4) { + sum += weight[i * w_cols + j] * input[j]; + sum += weight[i * w_cols + j + 1] * input[j + 1]; + sum += weight[i * w_cols + j + 2] * input[j + 2]; + sum += weight[i * w_cols + j + 3] * input[j + 3]; + } + for (; j < w_cols; j++) { + sum += weight[i * w_cols + j] * input[j]; + } + result[i] = sum; + } +} + +__device__ void matrix_add(float* a, float* b, int rows) { + for (int i = 0; i < rows; i++) { + a[i] += b[i]; + } +} + +__device__ void relu(float* a, int rows) { + for (int i = 0; i < rows; i++) { + a[i] = (a[i] > 0) ? a[i] : 0; + } +} + +// Hacky but fast and accurate for existing inputs +static __device__ inline float fastexp(float x) { + int tmp = (int)(1512775 * x + 1072632447); + float result; + memcpy(&result, &tmp, sizeof(result)); + return result; +} + +__device__ void softmax(float* a, int rows) { + float sum = 0.0; + for (size_t i = 0; i < rows; i++) { + sum += __expf(a[i]); + } + float t = __logf(sum); + for (size_t i = 0; i < rows; i++) { + a[i] = __expf(a[i] - t); + } +} + +__device__ int argmax(float* a, int rows) { + float res = a[0]; + int idx = 0; + for (int i = 0; i < rows; i++) { + if (res < a[i]) { + res = a[i]; + idx = i; + } + } + return idx; +} diff --git a/cudasrc/matrix.cuh b/cudasrc/matrix.cuh new file mode 100644 index 0000000..b2191cb --- /dev/null +++ b/cudasrc/matrix.cuh @@ -0,0 +1,25 @@ +#pragma once + +typedef struct { + int rows; + int cols; + float* data; // array +} matrix; + + __host__ __device__ matrix* new_matrix(int rows, int cols); + +matrix* copy_to_device(matrix* h_mat); + +matrix* new_matrix_d(int rows, int cols); + +__device__ void matrix_mul(float* a, float* b, float* c, int rows, int cols); + +__device__ void matrix_add(float* a, float* b, int rows); + +__device__ void relu(float* a, int rows); + +__device__ void softmax(float* a, int rows); + +__device__ int argmax(float* a, int rows); + +__device__ __host__ matrix* create_copy(matrix* mat); diff --git a/cudasrc/util.cuh b/cudasrc/util.cuh new file mode 100644 index 0000000..a6f988a --- /dev/null +++ b/cudasrc/util.cuh @@ -0,0 +1,8 @@ +#pragma once + +#define CUDA_CHECK(call) \ + cudaError_t err = call; \ + if (err != cudaSuccess) { \ + fprintf(stderr, "CUDA error in %s (%s:%d): %s\n", __func__, __FILE__, __LINE__, cudaGetErrorString(err)); \ + exit(EXIT_FAILURE); \ + } \ No newline at end of file