Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Main staging #45

Merged
merged 46 commits into from
Jul 8, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
46 commits
Select commit Hold shift + click to select a range
f57e89c
Setting up gpu environment
johnathanchann Jun 28, 2024
6274c26
Fixed file naming
johnathanchann Jul 1, 2024
ce3c5c5
Fix makefile
johnathanchann Jul 1, 2024
2096c57
Merge pull request #20 from Jinxto/feature/cuduck
nhatdongdang Jul 1, 2024
c1c1c48
Restructured
nhatdongdang Jul 4, 2024
1c8530b
Fix main
nhatdongdang Jul 4, 2024
89d09f5
Fix styling
nhatdongdang Jul 4, 2024
080492b
Remove unused functions
nhatdongdang Jul 4, 2024
d12d0ed
Optimize a little bit
nhatdongdang Jul 4, 2024
f0859f7
Update ci for cuda branch
nhatdongdang Jul 4, 2024
44c9b58
Remove CI since github action don't have a gpu
nhatdongdang Jul 4, 2024
83d9ff6
Fix styling
nhatdongdang Jul 4, 2024
a4b925b
Merge pull request #25 from nhatdongdang/feat/gpu-optimization
johnathanchann Jul 4, 2024
02e0600
Update
nhatdongdang Jul 5, 2024
2ab6689
Change optimal block size
nhatdongdang Jul 5, 2024
c1639bc
Tune block size and grid size
nhatdongdang Jul 5, 2024
e8db37a
Fix style
nhatdongdang Jul 5, 2024
2e2c1e5
Merge pull request #26 from nhatdongdang/feat/gpu-multithread
nhatdongdang Jul 5, 2024
4f43728
Performance gain fine tuning
johnathanchann Jul 5, 2024
c95569b
Update reel
nhatdongdang Jul 5, 2024
b3881fc
Remove unnecessary print
nhatdongdang Jul 5, 2024
4bfeb3d
Merge pull request #27 from Jinxto/cuda-styling
nhatdongdang Jul 5, 2024
3c339a6
Remove print
nhatdongdang Jul 5, 2024
31e4844
Merge pull request #28 from nhatdongdang/hotfix/remove-print
johnathanchann Jul 5, 2024
3adde2f
Add iter per input to exe arg
nhatdongdang Jul 5, 2024
9759b85
Merge pull request #29 from nhatdongdang/feat/add-arg
johnathanchann Jul 5, 2024
9402f01
Optimize param
nhatdongdang Jul 6, 2024
dc885f0
Optimize flag
nhatdongdang Jul 6, 2024
9d479e7
Fix softmax and argmax precision
nhatdongdang Jul 6, 2024
d4b590e
Further optimization using shared memory and local memory
nhatdongdang Jul 6, 2024
dd98f99
Fix compilation flag
nhatdongdang Jul 6, 2024
02dc834
Merge pull request #30 from nhatdongdang/feat/optimize-param
rozukke Jul 6, 2024
0459841
Merge pull request #1 from kachi-group/cuda-staging
johnathanchann Jul 6, 2024
f1de8af
mpi-base
johnathanchann Jul 6, 2024
42382e1
divide by inference
johnathanchann Jul 6, 2024
23ff410
Added mpi-optimization
johnathanchann Jul 6, 2024
70c358b
fine tuning
johnathanchann Jul 6, 2024
f273bb4
fix styling issues
johnathanchann Jul 6, 2024
cd400be
Merge pull request #35 from Jinxto/mpi-v2
nhatdongdang Jul 6, 2024
4468b9e
Merge pull request #40 from kachi-group/mpi-staging
nhatdongdang Jul 8, 2024
6ef3410
Change src to cudasrc
nhatdongdang Jul 8, 2024
5542177
Merge branch 'cuda-merge'
nhatdongdang Jul 8, 2024
27cc4b9
Merge pull request #43 from nhatdongdang/main-staging
johnathanchann Jul 8, 2024
d7ce11b
cmake cuda updates
johnathanchann Jul 8, 2024
f38c5b7
remove plot
johnathanchann Jul 8, 2024
cb46223
Merge pull request #44 from Jinxto/cmake
nhatdongdang Jul 8, 2024
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
50 changes: 25 additions & 25 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
@@ -1,34 +1,34 @@
name: CI

on:
push:
branches: main
paths: ['**.cu','**.c','**.cpp', '**.h', '**CMakeLists.txt']
pull_request:
branches: main
paths: ['**.cu','**.c','**.cpp', '**.h', '**CMakeLists.txt']
push:
branches: main
paths: ["**.cu", "**.c", "**.cpp", "**.h", "**CMakeLists.txt"]
pull_request:
branches: main
paths: ["**.cu", "**.c", "**.cpp", "**.h", "**CMakeLists.txt"]

jobs:
build-and-test:
runs-on: ubuntu-latest
build-and-test:
runs-on: ubuntu-latest

steps:
- name: Checkout code
uses: actions/checkout@v4
steps:
- name: Checkout code
uses: actions/checkout@v4

- name: Setup python
uses: actions/setup-python@v5
with:
python-version: '3.10'
- name: Setup python
uses: actions/setup-python@v5
with:
python-version: "3.10"

- name: Install dependencies
run: |
pip install pandas
- name: Install dependencies
run: |
pip install pandas
- name: Build project
run: |
make build
- name: Run test suite
run: |
make test
- name: Build project
run: |
make build
- name: Run test suite
run: |
make test_cpu
38 changes: 22 additions & 16 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,30 +1,36 @@
cmake_minimum_required(VERSION 3.16)

# Set the project name
project(ichida-algo)
project(ichida-algo LANGUAGES C CXX)

set(CMAKE_C_FLAGS "-O3 -march=native -ffast-math -funroll-loops -fopenmp -Wall -Wextra")

set(CMAKE_C_STANDARD 11)
set(CMAKE_C_STANDARD_REQUIRED True)
# set(CMAKE_VERBOSE_MAKEFILE ON)
set(CMAKE_VERBOSE_MAKEFILE ON)

set(SRC_DIR src)
set(INC_DIR include)
set(LIB_DIR lib)
set(TEST_DIR test)
set(BENCHMARK_DIR benchmark)
set(SRC_DIR src)
set(CUDA_SRC_DIR cudasrc)

# Source files
file(GLOB_RECURSE SOURCE_FILES ${SRC_DIR}/*.c)
include_directories(${INC_DIR})

include_directories(include)
file(GLOB_RECURSE SOURCE_FILES ${SRC_DIR}/*.c)

add_executable(speed_cpu ${SOURCE_FILES})
# add_executable(benchmark ${SRC_DIR}/matrix.c ${BENCHMARK_DIR}/benchmark.c)

target_link_libraries(speed_cpu m pthread)
# target_link_libraries(benchmark m)

target_link_libraries(speed_cpu m pthread gomp)

find_package(CUDA)

if(CUDA_FOUND)
enable_language(CUDA)
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Xptxas -O3 --use_fast_math -Xcompiler -march=native -unroll-aggressive -arch=sm_80")
find_package(MPI REQUIRED)
include_directories(${MPI_INCLUDE_PATH})
file(GLOB_RECURSE CUDA_SOURCE_FILES ${CUDA_SRC_DIR}/*.cu)
add_executable(speed_gpu ${CUDA_SOURCE_FILES})
set_target_properties(speed_gpu PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
target_link_libraries(speed_gpu m ${MPI_LIBRARIES})
else()
message(STATUS "CUDA not found, only CPU version will be built.")
endif()


49 changes: 24 additions & 25 deletions Makefile
Original file line number Diff line number Diff line change
@@ -1,37 +1,36 @@
.PHONY: all test clean run build run_test
.PHONY: all clean build run_cpu run_gpu test_cpu test_gpu bench stat

all: rebuild
# Default iterations
iterations ?= 1000

all: build

clean:
rm -f test/results.csv
rm -f results.csv
rm -rf build
rm -f speed_cpu
rm -f speed_cpu speed_gpu

build: clean
cmake -Bbuild
$(MAKE) -C ./build
mv ./build/speed_cpu ./

rebuild:
$(MAKE) -C ./build
mv ./build/speed_cpu ./

run: build
./speed_demo_cpu.sh ./weights_and_biases.txt ./tensors

run_test: build
./speed_cpu ./weights_and_biases.txt ./tensors

test: build
./speed_cpu ./weights_and_biases.txt ./tensors 1
mv ./results.csv ./test
python3 ./test/verify_csv.py
cmake -S . -B build -DCMAKE_BUILD_TYPE=Release
$(MAKE) -C build
cp -u build/speed_cpu ./
if [ -f build/speed_gpu ]; then cp -u build/speed_gpu ./; fi

bench: build
./build/benchmark
run_cpu: build
./speed_cpu ./weights_and_biases.txt ./tensors $(iterations)

stat: build
python3 ./benchmark/stat.py
run_gpu: build
n_gpus=$(shell nvidia-smi --query-gpu=name --format=csv,noheader | wc -l); \
mpirun -np $$n_gpus ./speed_gpu ./weights_and_biases.txt ./tensors $(iterations)

test_cpu: build
./speed_cpu ./weights_and_biases.txt ./tensors $(iterations)
mv ./results.csv ./test
python3 ./test/verify_csv.py

test_gpu: build
n_gpus=$(shell nvidia-smi --query-gpu=name --format=csv,noheader | wc -l); \
mpirun -np $$n_gpus ./speed_gpu ./weights_and_biases.txt ./tensors $(iterations)
mv ./results.csv ./test
python3 ./test/verify_csv.py
62 changes: 0 additions & 62 deletions benchmark/benchmark.c

This file was deleted.

File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
19 changes: 19 additions & 0 deletions benchmark/gpu/matrix_add/Makefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
# Builds one benchmark binary per CUDA source file in versions/, each linked
# with the shared driver benchmark.cu, and places it in bin/ under the source
# file's base name (versions/1.cu -> bin/1).
compile = nvcc -O3 -arch=sm_75 --use_fast_math
SRC_DIR := versions
BIN_DIR := bin
SRC_FILES := $(wildcard $(SRC_DIR)/*.cu)
EXECUTABLES := $(patsubst $(SRC_DIR)/%.cu, $(BIN_DIR)/%, $(SRC_FILES))

# None of these name a real file; keep make from being confused by a stray
# file called "all", "clean" or "plot".
.PHONY: all clean plot

# Always rebuilds from scratch: clean recreates an empty bin/ first.
all: clean $(EXECUTABLES)

clean:
	rm -f -r bin
	mkdir bin

# $< is the version source, $@ is the bin/<name> target being built.
$(BIN_DIR)/%: $(SRC_DIR)/%.cu
	$(compile) $< benchmark.cu -o $@

plot: all
	python3 ./plot.py


13 changes: 13 additions & 0 deletions benchmark/gpu/matrix_add/benchmark.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
#include "template.cuh"
#include <stdio.h>
#include <time.h>

// Benchmark driver: picks the iteration count, runs the version-specific
// time() routine and prints its result with "%f" (no trailing newline).
//
// argv[1], when present, is parsed with atol and used as the iteration
// count; otherwise 100000 is used.
//
// NOTE(review): time() is declared as taking an int, so the long value is
// narrowed at the call. atol is declared in <stdlib.h>, which this file does
// not include directly - presumably it arrives via the CUDA toolchain headers.
int main(int argc, char* argv[]) {
    const long n = (argc > 1) ? atol(argv[1]) : 100000;
    printf("%f", time(n));
}
50 changes: 50 additions & 0 deletions benchmark/gpu/matrix_add/plot.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
import os
import subprocess
import matplotlib.pyplot as plt

# Build the benchmark executables first. make's output is captured, so it has
# to be surfaced explicitly when the build fails; otherwise the script would
# carry on and plot whatever happens to be in ./bin (possibly nothing).
build = subprocess.run(['make'], capture_output=True, text=True)
if build.returncode != 0:
    raise SystemExit(f"make failed (exit {build.returncode}):\n{build.stderr}")

# Define the folder containing the executables
folder_path = './bin'  # Change this to your bin folder path

# Define the input sizes to test.
# NOTE(review): with end == start the range below yields a single size, so
# each executable contributes exactly one point to the plot.
start = 10000
end = 10000
step = 100000

input_sizes = list(range(start, end + 1, step))

# One runtime list per regular file found in the bin folder
runtimes = {
    exe: []
    for exe in os.listdir(folder_path)
    if os.path.isfile(os.path.join(folder_path, exe))
}

# Run every executable once per input size and record the reported runtime
for exe in runtimes:
    exe_path = os.path.join(folder_path, exe)

    for n in input_sizes:
        # Each benchmark is expected to print a single float on stdout
        completed = subprocess.run([exe_path, str(n)], capture_output=True, text=True)

        try:
            runtime = float(completed.stdout.strip())
        except ValueError:
            raise SystemExit(
                f"{exe_path} {n}: expected a runtime on stdout, got "
                f"{completed.stdout!r} (exit {completed.returncode})"
            )
        print(exe, runtime)

        runtimes[exe].append(runtime)

# Plot the data
plt.figure(figsize=(12, 6))

# One line per executable
for exe, times in runtimes.items():
    plt.plot(input_sizes, times, marker='o', label=exe)

plt.xlabel('Iterations')
plt.ylabel('Runtime (s)')
plt.title('Benchmark of Function Versions')
plt.legend()
plt.grid(True)
plt.tight_layout()

output_file = 'benchmark_plot.png'  # Specify your desired output file name and format
plt.savefig(output_file)
# The plot is only written to output_file; nothing is displayed interactively.
10 changes: 10 additions & 0 deletions benchmark/gpu/matrix_add/template.cuh
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
#pragma once

typedef struct {
int rows;
int cols;
float* data; // array
} matrix;

double time(int n);
matrix* new_matrix_d(int rows, int cols);
44 changes: 44 additions & 0 deletions benchmark/gpu/matrix_add/versions/1.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
#include "../template.cuh"

// Allocates a matrix descriptor and an uninitialised host buffer of
// rows * cols floats using malloc.
//
// Returns the new descriptor, or NULL if either allocation fails (nothing is
// leaked in that case). The caller owns both the descriptor and its data
// buffer and releases them with free().
matrix* new_matrix(int rows, int cols) {
    matrix* res = (matrix*)malloc(sizeof(matrix));
    if (res == NULL) {
        return NULL;
    }
    res->rows = rows;
    res->cols = cols;

    // Widen before multiplying so rows * cols cannot overflow int.
    const size_t bytes = (size_t)rows * (size_t)cols * sizeof(float);
    res->data = (float*)malloc(bytes);
    // malloc(0) may legitimately return NULL, so only a non-empty request
    // that comes back NULL counts as a failure.
    if (res->data == NULL && bytes != 0) {
        free(res);
        return NULL;
    }
    return res;
}

// Allocates a host-side matrix descriptor whose data buffer is obtained from
// cudaMalloc and sized as rows * cols floats. The buffer is not initialised.
//
// Returns the new descriptor, or NULL if the host allocation or cudaMalloc
// fails (the descriptor is freed in that case). The caller releases the
// buffer with cudaFree() and the descriptor with free().
matrix* new_matrix_d(int rows, int cols) {
    matrix* res = (matrix*)malloc(sizeof(matrix));
    if (res == NULL) {
        return NULL;
    }
    res->rows = rows;
    res->cols = cols;
    res->data = NULL;

    // Widen before multiplying so rows * cols cannot overflow int.
    const size_t bytes = (size_t)rows * (size_t)cols * sizeof(float);
    if (cudaMalloc((void**)&(res->data), bytes) != cudaSuccess) {
        free(res);
        return NULL;
    }
    return res;
}

// Kernel: in-place element-wise add, a[i] += b[i].
//
// Each thread derives one index from its block and thread x-coordinates and
// updates that single element; threads whose index is not below `rows` do
// nothing.
__global__ void matrix_add(float* a, float* b, int rows) {
    const int i = threadIdx.x + blockDim.x * blockIdx.x;
    if (i >= rows) {
        return;
    }
    a[i] = a[i] + b[i];
}

double time(int n) {
int row=100000;
matrix* a = new_matrix_d(row, 1);
matrix* b = new_matrix_d(row, 1);
cudaStream_t stream1;
cudaStreamCreate ( &stream1);

int thread=1024;
int block=((row+thread-1)/thread);

clock_t start = clock();
for(int i=0;i<n;i++){
matrix_add<<<1,1,0,stream1>>>(a->data,b->data,row);
}
double seconds = (double)(clock() - (double)start) / CLOCKS_PER_SEC;
return seconds;
}
Loading