Skip to content

Commit

Permalink
Merge pull request #14 from nhatdongdang/feat/benchmark
Browse files Browse the repository at this point in the history
Add benchmark and multithread processing (experimental)
  • Loading branch information
rozukke authored Jul 1, 2024
2 parents b50c3f0 + 26e6693 commit 154692e
Show file tree
Hide file tree
Showing 30 changed files with 652 additions and 40 deletions.
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -36,5 +36,7 @@ speed_gpu
# Misc
.vscode/
build/
bin/
tensors/
results.csv
tensors/
5 changes: 5 additions & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -4,22 +4,27 @@ cmake_minimum_required(VERSION 3.16)
project(ichida-algo)

# Project-wide optimisation and warning flags. They are appended so that flags
# supplied by the user (CFLAGS / -DCMAKE_C_FLAGS=...) or by a toolchain file
# are kept instead of being silently overwritten.
string(APPEND CMAKE_C_FLAGS " -O3 -march=native -ffast-math -funroll-loops -Wall -Wextra")

set(CMAKE_C_STANDARD 99)
set(CMAKE_C_STANDARD_REQUIRED True)
set(CMAKE_VERBOSE_MAKEFILE ON)

# Directory layout, relative to this CMakeLists.txt.
set(SRC_DIR src)
set(INC_DIR include)
set(LIB_DIR lib)
set(TEST_DIR test)
set(BENCHMARK_DIR benchmark)

# Source files
# CONFIGURE_DEPENDS makes the build re-run the glob, so newly added or removed
# .c files are picked up without a manual re-configure.
file(GLOB_RECURSE SOURCE_FILES CONFIGURE_DEPENDS
  "${CMAKE_CURRENT_SOURCE_DIR}/${SRC_DIR}/*.c"
)

# Main program: every C file found under src/.
add_executable(speed_cpu ${SOURCE_FILES})

# Micro-benchmark driver: the matrix kernels plus the benchmark entry point.
add_executable(benchmark ${SRC_DIR}/matrix.c ${BENCHMARK_DIR}/benchmark.c)

# Header search path, scoped to the targets that need it instead of the
# whole directory.
target_include_directories(speed_cpu PRIVATE "${CMAKE_CURRENT_SOURCE_DIR}/${INC_DIR}")
target_include_directories(benchmark PRIVATE "${CMAKE_CURRENT_SOURCE_DIR}/${INC_DIR}")

# Both executables link against libm.
target_link_libraries(speed_cpu PRIVATE m)
target_link_libraries(benchmark PRIVATE m)



6 changes: 6 additions & 0 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -24,4 +24,10 @@ test: build
mv ./results.csv ./test
python3 ./test/verify_csv.py

# Benchmark helpers. Declared phony so a stray file or directory named
# `bench` or `stat` can never make these targets look up to date.
.PHONY: bench stat

# Run the micro-benchmark executable from the build directory.
bench: build
	./build/benchmark

# Run the benchmark statistics script (requires python3 on PATH).
stat: build
	python3 ./benchmark/stat.py


62 changes: 62 additions & 0 deletions benchmark/benchmark.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
#include "../include/matrix.h"
#include "util.h"
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

/*
 * Times `iterations` calls of matrix_mul(a, b, c) using rdtsc() readings
 * taken immediately before and after each call.
 *
 * Each run's counter difference is divided by a->rows * a->cols, and the
 * mean of those per-run figures is returned (the caller prints it as CPE).
 */
double benchmark_matrix_mul(int iterations, matrix* a, matrix* b, matrix* c) {
    double total = 0;
    int run = 0;
    while (run < iterations) {
        const uint64_t tick_before = rdtsc();
        matrix_mul(a, b, c);
        const uint64_t tick_after = rdtsc();
        const uint64_t elapsed = tick_after - tick_before;
        total += (double)elapsed / (a->rows * a->cols);
        run++;
    }
    return total / iterations;
}

/*
 * Times `iterations` calls of matrix_add(a, b) using rdtsc() readings taken
 * immediately before and after each call.
 *
 * Each run's counter difference is divided by a->rows, and the mean of those
 * per-run figures is returned (the caller prints it as CPE).
 */
double benchmark_matrix_add(int iterations, matrix* a, matrix* b) {
    double total = 0;
    int run = 0;
    while (run < iterations) {
        const uint64_t tick_before = rdtsc();
        matrix_add(a, b);
        const uint64_t tick_after = rdtsc();
        const uint64_t elapsed = tick_after - tick_before;
        total += (double)elapsed / (a->rows);
        run++;
    }
    return total / iterations;
}

/*
 * Times `iterations` calls of relu(a) using rdtsc() readings taken
 * immediately before and after each call.
 *
 * Each run's counter difference is divided by a->rows, and the mean of those
 * per-run figures is returned (the caller prints it as CPE).
 */
double benchmark_relu(int iterations, matrix* a) {
    double total = 0;
    int run = 0;
    while (run < iterations) {
        const uint64_t tick_before = rdtsc();
        relu(a);
        const uint64_t tick_after = rdtsc();
        const uint64_t elapsed = tick_after - tick_before;
        total += (double)elapsed / (a->rows);
        run++;
    }
    return total / iterations;
}

/*
 * Times `iterations` calls of softmax(a) using rdtsc() readings taken
 * immediately before and after each call.
 *
 * Each run's counter difference is divided by a->rows * 2, and the mean of
 * those per-run figures is returned (the caller prints it as CPE).
 */
double benchmark_softmax(int iterations, matrix* a) {
    double total = 0;
    int run = 0;
    while (run < iterations) {
        const uint64_t tick_before = rdtsc();
        softmax(a);
        const uint64_t tick_after = rdtsc();
        const uint64_t elapsed = tick_after - tick_before;
        total += (double)elapsed / (a->rows * 2);
        run++;
    }
    return total / iterations;
}

int main() {
int iterations = 200000;
printf("- matrix_mul: %f CPE\n",
benchmark_matrix_mul(iterations, new_matrix(2000, 1000), new_matrix(2000, 1), new_matrix(2000, 1)));
printf("- matrix_add: %f CPE\n", benchmark_matrix_add(iterations, new_matrix(2000, 1), new_matrix(2000, 1)));
printf("- relu: %f CPE\n", benchmark_relu(iterations, new_matrix(2000, 1)));
printf("- softmax: %f CPE\n", benchmark_softmax(iterations, new_matrix(2000, 1)));
}
21 changes: 21 additions & 0 deletions benchmark/matrix_add/Makefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
# Compiler invocation shared by every matrix_add variant.
compile = gcc -O3 -march=native -ffast-math -funroll-loops -Wall -Wextra

# One executable is built per C file found in $(SRC_DIR).
SRC_DIR := versions
BIN_DIR := bin
SRC_FILES := $(wildcard $(SRC_DIR)/*.c)
EXECUTABLES := $(patsubst $(SRC_DIR)/%.c, $(BIN_DIR)/%, $(SRC_FILES))

# None of these names is a real file.
.PHONY: all clean plot

# `clean` wipes $(BIN_DIR) while the executables are written into it, so the
# prerequisites of `all` must never run concurrently under `make -j`.
.NOTPARALLEL:

# Rebuild everything from scratch so $(BIN_DIR) never holds stale variants.
all: clean $(EXECUTABLES)

# Remove all built executables and leave an empty output directory behind.
clean:
	rm -f -r $(BIN_DIR)
	mkdir -p $(BIN_DIR)

# Link one variant with the shared benchmark driver. The driver and header are
# prerequisites so a change to either triggers a rebuild; the output directory
# is created on demand so a single executable can be built on its own.
$(BIN_DIR)/%: $(SRC_DIR)/%.c benchmark.c matrix_add.h
	@mkdir -p $(@D)
	$(compile) $< benchmark.c -o $@

# Build every variant, then run the plotting script (requires python3).
plot: all
	python3 ./plot.py



37 changes: 37 additions & 0 deletions benchmark/matrix_add/benchmark.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
#include "matrix_add.h"
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>

/* Dense float matrix; `data` holds rows * cols elements. */
typedef struct {
    float* data;
    int rows;
    int cols;
} matrix;

/*
 * Allocates a rows x cols matrix whose elements are all zero.
 *
 * The caller owns the result and releases it with free(m->data); free(m);.
 * If either allocation fails, a message is written to stderr and the process
 * exits with EXIT_FAILURE, so the returned pointer is never NULL.
 */
matrix* new_matrix(int rows, int cols) {
    matrix* res = (matrix*)malloc(sizeof(matrix));
    if (res == NULL) {
        fprintf(stderr, "new_matrix: out of memory\n");
        exit(EXIT_FAILURE);
    }
    res->rows = rows;
    res->cols = cols;

    /* Multiply in size_t so large dimensions cannot overflow int. */
    size_t count = (size_t)rows * (size_t)cols;

    /* calloc zero-fills, so benchmarks never read indeterminate floats. */
    res->data = (float*)calloc(count, sizeof(float));
    if (res->data == NULL && count != 0) {
        fprintf(stderr, "new_matrix: out of memory\n");
        free(res);
        exit(EXIT_FAILURE);
    }
    return res;
}

int main(int argc, char* argv[]) {
long n = 0;
if (argc > 1) {
n = atol(argv[1]);
} else {
printf("Error!");
exit(1);
}
clock_t start = clock();
matrix* a = new_matrix(255, 1);
matrix* b = new_matrix(255, 1);
for (int i = 0; i < n; i++) {
matrix_add(a->data, b->data, a->rows);
}
float seconds = (float)(clock() - (float)start) / CLOCKS_PER_SEC;
printf("%f", seconds);
}
2 changes: 2 additions & 0 deletions benchmark/matrix_add/matrix_add.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
#pragma once
/*
 * Interface shared by the matrix_add variants under versions/.
 *
 * Adds src[i] into dest[i] for every i in [0, rows).
 *
 * src  - read-only input values, at least `rows` floats.
 * dest - values updated in place, at least `rows` floats; declared
 *        __restrict__, so it must not be reachable through src.
 * rows - number of elements to process.
 */
void matrix_add(const float* src, float* __restrict__ dest, int rows);
49 changes: 49 additions & 0 deletions benchmark/matrix_add/plot.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
import os
import subprocess
import matplotlib.pyplot as plt

# Resolve paths against this script's own directory so the benchmark can be
# launched from any working directory.
script_dir = os.path.dirname(os.path.abspath(__file__))

# Rebuild every variant first; stop with the build output if that fails
# instead of timing stale or missing executables.
build = subprocess.run(['make'], cwd=script_dir, capture_output=True, text=True)
if build.returncode != 0:
    raise SystemExit(build.stdout + build.stderr)

# Define the folder containing the executables
folder_path = os.path.join(script_dir, 'bin')

# Define the input sizes to test
start = 100000
end = 1000000
step = 100000

input_sizes = list(range(start, end + 1, step))

# Initialize a dictionary to store runtimes for each executable.
# Sorted so the run order and the plot legend are stable between runs.
runtimes = {
    exe: []
    for exe in sorted(os.listdir(folder_path))
    if os.path.isfile(os.path.join(folder_path, exe))
}

# Loop through each executable
for exe, times in runtimes.items():
    exe_path = os.path.join(folder_path, exe)

    # Loop through each input size
    for n in input_sizes:
        # Run the executable with the input size and capture its output.
        # check=True raises CalledProcessError if the benchmark exits non-zero.
        result = subprocess.run([exe_path, str(n)], capture_output=True, text=True, check=True)

        # Parse the output to get the runtime
        runtime = float(result.stdout.strip())
        print(exe, runtime)

        # Append the runtime to the corresponding executable list
        times.append(runtime)

# Plot the data
plt.figure(figsize=(12, 6))

# Loop through each executable and plot the runtimes
for exe, times in runtimes.items():
    plt.plot(input_sizes, times, marker='o', label=exe)

plt.xlabel('Input Size')
plt.ylabel('Runtime (s)')
plt.title('Benchmark of Function Versions')
plt.legend()
plt.grid(True)
plt.tight_layout()

# Show the plot
plt.show()
6 changes: 6 additions & 0 deletions benchmark/matrix_add/versions/matrix_add_v1.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
#include "../matrix_add.h"
/*
 * Adds each of the first `rows` values of src into the matching element of
 * dest. Does nothing when rows is zero or negative.
 */
void matrix_add(const float* src, float* __restrict__ dest, int rows) {
    const float* in = src;
    float* out = dest;
    for (int remaining = rows; remaining > 0; remaining--) {
        *out += *in;
        out++;
        in++;
    }
}
20 changes: 20 additions & 0 deletions benchmark/matrix_mul/Makefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
# Compiler invocation shared by every matrix_mul variant.
compile = gcc -O3 -march=native -ffast-math -funroll-loops -Wall -Wextra

# One executable is built per C file found in $(SRC_DIR).
SRC_DIR := versions
BIN_DIR := bin
SRC_FILES := $(wildcard $(SRC_DIR)/*.c)
EXECUTABLES := $(patsubst $(SRC_DIR)/%.c, $(BIN_DIR)/%, $(SRC_FILES))

# None of these names is a real file.
.PHONY: all clean plot

# `clean` wipes $(BIN_DIR) while the executables are written into it, so the
# prerequisites of `all` must never run concurrently under `make -j`.
.NOTPARALLEL:

# Rebuild everything from scratch so $(BIN_DIR) never holds stale variants.
all: clean $(EXECUTABLES)

# Remove all built executables and leave an empty output directory behind.
clean:
	rm -f -r $(BIN_DIR)
	mkdir -p $(BIN_DIR)

# Link one variant with the shared benchmark driver. The driver and header are
# prerequisites so a change to either triggers a rebuild; the output directory
# is created on demand so a single executable can be built on its own.
$(BIN_DIR)/%: $(SRC_DIR)/%.c benchmark.c matrix_mul.h
	@mkdir -p $(@D)
	$(compile) $< benchmark.c -o $@

# Build every variant, then run the plotting script (requires python3).
plot: all
	python3 ./plot.py



38 changes: 38 additions & 0 deletions benchmark/matrix_mul/benchmark.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
#include "matrix_mul.h"
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>

/* Dense float matrix; `data` holds rows * cols elements. */
typedef struct {
    float* data;
    int rows;
    int cols;
} matrix;

/*
 * Allocates a rows x cols matrix whose elements are all zero.
 *
 * The caller owns the result and releases it with free(m->data); free(m);.
 * If either allocation fails, a message is written to stderr and the process
 * exits with EXIT_FAILURE, so the returned pointer is never NULL.
 */
matrix* new_matrix(int rows, int cols) {
    matrix* res = (matrix*)malloc(sizeof(matrix));
    if (res == NULL) {
        fprintf(stderr, "new_matrix: out of memory\n");
        exit(EXIT_FAILURE);
    }
    res->rows = rows;
    res->cols = cols;

    /* Multiply in size_t so large dimensions cannot overflow int. */
    size_t count = (size_t)rows * (size_t)cols;

    /* calloc zero-fills, so benchmarks never read indeterminate floats. */
    res->data = (float*)calloc(count, sizeof(float));
    if (res->data == NULL && count != 0) {
        fprintf(stderr, "new_matrix: out of memory\n");
        free(res);
        exit(EXIT_FAILURE);
    }
    return res;
}

/*
 * Benchmark driver for one matrix_mul variant.
 *
 * argv[1] is the number of times matrix_mul is called with a 98 x 255 weight
 * matrix and a 255-element input vector. The CPU time spent in that loop, in
 * seconds, is printed to stdout with "%f" and no trailing newline. Exits with
 * status 1 when no iteration count is given or an allocation fails.
 */
int main(int argc, char* argv[]) {
    long n = 0;
    if (argc > 1) {
        n = atol(argv[1]);
    } else {
        printf("Error!");
        exit(1);
    }

    matrix* a = new_matrix(98, 255);
    matrix* b = new_matrix(255, 1);
    matrix* c = new_matrix(98, 1);
    if (a == NULL || b == NULL || c == NULL || a->data == NULL || b->data == NULL || c->data == NULL) {
        fprintf(stderr, "allocation failed\n");
        exit(1);
    }

    /* Give the inputs defined values so the timed loop never reads
     * indeterminate memory; c is fully written by matrix_mul itself. */
    for (int i = 0; i < a->rows * a->cols; i++) {
        a->data[i] = 1.0f;
    }
    for (int i = 0; i < b->rows * b->cols; i++) {
        b->data[i] = 1.0f;
    }

    /* Time only the kernel calls, not the setup above. */
    clock_t start = clock();
    /* The counter has the same type as n so large counts cannot overflow it. */
    for (long i = 0; i < n; i++) {
        matrix_mul(a->data, b->data, c->data, c->rows, a->cols);
    }
    clock_t stop = clock();

    /* Subtract in clock_t first; routing a clock value through float would
     * throw away low-order ticks. */
    double seconds = (double)(stop - start) / CLOCKS_PER_SEC;
    printf("%f", seconds);

    free(a->data);
    free(a);
    free(b->data);
    free(b);
    free(c->data);
    free(c);
    return 0;
}
2 changes: 2 additions & 0 deletions benchmark/matrix_mul/matrix_mul.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
#pragma once
/*
 * Interface shared by the matrix_mul variants under versions/.
 *
 * Matrix-vector product: for every row i in [0, res_rows),
 *   results[i] = sum over k in [0, w_cols) of weights[i * w_cols + k] * inputs[k].
 *
 * weights  - row-major matrix, at least res_rows * w_cols floats.
 * inputs   - input vector, at least w_cols floats.
 * results  - output vector, at least res_rows floats; declared __restrict__,
 *            so it must not be reachable through weights or inputs.
 * res_rows - number of output elements (rows of weights).
 * w_cols   - number of columns of weights (length of inputs).
 */
void matrix_mul(const float* weights, const float* inputs, float* __restrict__ results, int res_rows, int w_cols);
49 changes: 49 additions & 0 deletions benchmark/matrix_mul/plot.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
import os
import subprocess
import matplotlib.pyplot as plt

# Resolve paths against this script's own directory so the benchmark can be
# launched from any working directory.
script_dir = os.path.dirname(os.path.abspath(__file__))

# Rebuild every variant first; stop with the build output if that fails
# instead of timing stale or missing executables.
build = subprocess.run(['make'], cwd=script_dir, capture_output=True, text=True)
if build.returncode != 0:
    raise SystemExit(build.stdout + build.stderr)

# Define the folder containing the executables
folder_path = os.path.join(script_dir, 'bin')

# Define the input sizes to test
start = 100000
end = 1000000
step = 100000

input_sizes = list(range(start, end + 1, step))

# Initialize a dictionary to store runtimes for each executable.
# Sorted so the run order and the plot legend are stable between runs.
runtimes = {
    exe: []
    for exe in sorted(os.listdir(folder_path))
    if os.path.isfile(os.path.join(folder_path, exe))
}

# Loop through each executable
for exe, times in runtimes.items():
    exe_path = os.path.join(folder_path, exe)

    # Loop through each input size
    for n in input_sizes:
        # Run the executable with the input size and capture its output.
        # check=True raises CalledProcessError if the benchmark exits non-zero.
        result = subprocess.run([exe_path, str(n)], capture_output=True, text=True, check=True)

        # Parse the output to get the runtime
        runtime = float(result.stdout.strip())
        print(exe, runtime)

        # Append the runtime to the corresponding executable list
        times.append(runtime)

# Plot the data
plt.figure(figsize=(12, 6))

# Loop through each executable and plot the runtimes
for exe, times in runtimes.items():
    plt.plot(input_sizes, times, marker='o', label=exe)

plt.xlabel('Input Size')
plt.ylabel('Runtime (s)')
plt.title('Benchmark of Function Versions')
plt.legend()
plt.grid(True)
plt.tight_layout()

# Show the plot
plt.show()
12 changes: 12 additions & 0 deletions benchmark/matrix_mul/versions/matrix_mul_v1.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
#include "../matrix_mul.h"

/*
 * Matrix-vector product over a row-major weight matrix: results[i] receives
 * the dot product of row i of weights (w_cols values) with inputs.
 * Each dot product is accumulated in a local variable and stored once.
 */
void matrix_mul(const float* weights, const float* inputs, float* __restrict__ results, int res_rows, int w_cols) {
    int row_offset = 0; /* index of the first weight of the current row */
    int row = 0;
    while (row < res_rows) {
        float acc = 0;
        for (int col = 0; col < w_cols; col++) {
            acc += weights[row_offset + col] * inputs[col];
        }
        results[row] = acc;
        row_offset += w_cols;
        row++;
    }
}
10 changes: 10 additions & 0 deletions benchmark/matrix_mul/versions/matrix_mul_v2.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
#include "../matrix_mul.h"

/*
 * Matrix-vector product over a row-major weight matrix: results[i] receives
 * the dot product of row i of weights (w_cols values) with inputs.
 * This variant zeroes the output element and accumulates directly into it.
 */
void matrix_mul(const float* weights, const float* inputs, float* __restrict__ results, int res_rows, int w_cols) {
    int cur_row = 0;
    while (cur_row < res_rows) {
        float* out = &results[cur_row];
        *out = 0;
        int col = 0;
        while (col < w_cols) {
            *out += weights[cur_row * w_cols + col] * inputs[col];
            col++;
        }
        cur_row++;
    }
}
Loading

0 comments on commit 154692e

Please sign in to comment.