Skip to content

Commit

Permalink
Merge pull request #8 from rozukke/fix/speed-demon
Browse files Browse the repository at this point in the history
Fix remaining serious performance issues
  • Loading branch information
nhatdongdang authored Jun 26, 2024
2 parents adc90cc + 05063db commit 8090249
Show file tree
Hide file tree
Showing 3 changed files with 82 additions and 19 deletions.
3 changes: 2 additions & 1 deletion CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,10 @@ cmake_minimum_required(VERSION 3.16)
# Set the project name
project(ichida-algo)

set(CMAKE_CXX_FLAGS "-O3 -Wall -Wextra")
set(CMAKE_C_FLAGS "-O3 -march=native -ffast-math -funroll-loops -Wall -Wextra")
set(CMAKE_C_STANDARD 99)
set(CMAKE_C_STANDARD_REQUIRED True)
set(CMAKE_VERBOSE_MAKEFILE ON)

set(SRC_DIR src)
set(INC_DIR include)
Expand Down
27 changes: 21 additions & 6 deletions src/main.c
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,12 @@
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/time.h>

matrix* weights[7];
matrix* biases[7];
#define NUM_LAYERS 7

matrix* weights[NUM_LAYERS];
matrix* biases[NUM_LAYERS];

char letters[52] = {'A', 'a', 'B', 'b', 'C', 'c', 'D', 'd', 'E', 'e', 'F', 'f', 'G', 'g', 'H', 'h', 'I', 'i',
'J', 'j', 'K', 'k', 'L', 'l', 'M', 'm', 'N', 'n', 'O', 'o', 'P', 'p', 'Q', 'q', 'R', 'r',
Expand Down Expand Up @@ -66,15 +69,13 @@ void read_tensor(matrix* a, const char* fileName) {
FILE* file = fopen(fileName, "r");
char* line = NULL;
size_t len = 0;
ssize_t read;
int line_number = 0;

getline(&line, &len, file);
char* token;
float value;
const char* delimiter = ",";
token = strtok(line, delimiter);
int size = 0;

for (int i = 0; i < 225; i++) {
value = strtof(token, NULL);
(a->data)[i] = value;
Expand Down Expand Up @@ -103,7 +104,7 @@ int get_max(matrix* a) {
}

int infer(matrix* input) {
matrix* mdl_layers[7];
matrix* mdl_layers[NUM_LAYERS];
mdl_layers[0] = new_matrix(98, 1);
mdl_layers[1] = new_matrix(65, 1);
mdl_layers[2] = new_matrix(50, 1);
Expand Down Expand Up @@ -132,6 +133,15 @@ int infer(matrix* input) {
}

int main(int argc, char* argv[]) {
if (argc < 3) {
printf("Not enough arguments.");
return EXIT_FAILURE;
}

// Start timing
struct timeval stop, start;
gettimeofday(&start, NULL);

// TODO: find a way to load static weights and biases
// Load model (the memory for this data should be initialized at compile time to enhance speed)
weights[0] = new_matrix(98, 225);
Expand Down Expand Up @@ -197,5 +207,10 @@ int main(int argc, char* argv[]) {
fprintf(csv_file, "%d, %c\n", i, letters[results[i]]);
}
fclose(csv_file);

// Time taken
gettimeofday(&stop, NULL);
printf("took %lu us\n", (stop.tv_sec - start.tv_sec) * 1000000 + stop.tv_usec - start.tv_usec);

return EXIT_SUCCESS;
}
71 changes: 59 additions & 12 deletions src/matrix.c
Original file line number Diff line number Diff line change
@@ -1,7 +1,9 @@
#include "../include/matrix.h"
#include "math.h"
#include "stdio.h"
#include "stdlib.h"
#include <math.h>
#include <stdio.h>
#include <stdlib.h>

#define UNROLL_FACTOR 8

matrix* new_matrix(int rows, int cols) {
matrix* res = (matrix*)malloc(sizeof(matrix));
Expand All @@ -11,19 +13,64 @@ matrix* new_matrix(int rows, int cols) {
return res;
}

void matrix_mul(const matrix* __restrict__ a, const matrix* __restrict__ b, const matrix* __restrict__ result) {
int m = result->rows;
int p = a->cols;
for (int i = 0; i < m; i++) {
float sum = 0;
int h = i * p;
for (int k = 0; k < p; k++) {
sum += (a->data)[h + k] * ((b->data)[k]);
// Matrix-vector product: for every row r in [0, result->rows),
//   result->data[r] = sum over k of weights->data[r * weights->cols + k] * inputs->data[k]
//
// weights: matrix stored row-major; weights->cols is the length of each row.
// inputs:  read as a flat array of at least weights->cols floats.
// result:  destination; only result->rows and result->data are used. The struct
//          itself is const, but the buffer that result->data points to is written.
//
// The inner loop is manually unrolled by UNROLL_FACTOR (8), which the original
// author chose as enough to saturate a Zen3 core. Eight independent accumulators
// are used so the partial sums do not form one long dependency chain.
// NOTE: the accumulators are written out by hand, so changing UNROLL_FACTOR
// requires changing the number of accumulators and unrolled statements to match.
void matrix_mul(const matrix* weights, const matrix* inputs, const matrix* __restrict__ result) {
    const int res_rows = result->rows;
    const int w_width = weights->cols;
    const float* w_data = weights->data;
    const float* i_data = inputs->data;
    float* out = result->data;

    // Last starting index (exclusive) from which a full group of UNROLL_FACTOR
    // elements still fits in the row. It is <= 0 for rows shorter than
    // UNROLL_FACTOR, in which case only the remainder loop below runs.
    const int u_limit = w_width - (UNROLL_FACTOR - 1);

    for (int cur_row = 0; cur_row < res_rows; cur_row++) {
        const int row_offs = cur_row * w_width;

        float sum0 = 0;
        float sum1 = 0;
        float sum2 = 0;
        float sum3 = 0;
        float sum4 = 0;
        float sum5 = 0;
        float sum6 = 0;
        float sum7 = 0;

        // Main unrolled section: consumes UNROLL_FACTOR columns per iteration.
        int k = 0;
        for (; k < u_limit; k += UNROLL_FACTOR) {
            sum0 += w_data[row_offs + k] * i_data[k];
            sum1 += w_data[row_offs + k + 1] * i_data[k + 1];
            sum2 += w_data[row_offs + k + 2] * i_data[k + 2];
            sum3 += w_data[row_offs + k + 3] * i_data[k + 3];
            sum4 += w_data[row_offs + k + 4] * i_data[k + 4];
            sum5 += w_data[row_offs + k + 5] * i_data[k + 5];
            sum6 += w_data[row_offs + k + 6] * i_data[k + 6];
            sum7 += w_data[row_offs + k + 7] * i_data[k + 7];
        }

        // Remainder: the trailing (w_width % UNROLL_FACTOR) columns are folded
        // into the first accumulator.
        for (; k < w_width; k++) {
            sum0 += w_data[row_offs + k] * i_data[k];
        }

        out[cur_row] = sum0 + sum1 + sum2 + sum3 + sum4 + sum5 + sum6 + sum7;
    }
}

// // Old version with no specific optimisation
// void matrix_mul(const matrix* __restrict__ a, const matrix* __restrict__ b, const matrix* __restrict__ result) {
// int m = result->rows;
// int p = a->cols;
// for (int i = 0; i < m; i++) {
// float sum = 0;
// int h = i * p;
// for (int k = 0; k < p; k++) {
// sum += (a->data)[h + k] * ((b->data)[k]);
// }
// (result->data)[i] = sum;
// }
// }

void matrix_add(matrix* a, const matrix* b) {
for (int i = 0; i < a->rows; i++) {
(a->data)[i] += (b->data)[i];
Expand Down

0 comments on commit 8090249

Please sign in to comment.