diff --git a/CMakeLists.txt b/CMakeLists.txt index aeb85ab..5a84d2f 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -3,9 +3,10 @@ cmake_minimum_required(VERSION 3.16) # Set the project name project(ichida-algo) -set(CMAKE_CXX_FLAGS "-O3 -Wall -Wextra") +set(CMAKE_C_FLAGS "-O3 -march=native -ffast-math -funroll-loops -Wall -Wextra") set(CMAKE_C_STANDARD 99) set(CMAKE_C_STANDARD_REQUIRED True) +set(CMAKE_VERBOSE_MAKEFILE ON) set(SRC_DIR src) set(INC_DIR include) diff --git a/src/main.c b/src/main.c index a04b6a6..4ebfdcf 100644 --- a/src/main.c +++ b/src/main.c @@ -3,9 +3,12 @@ #include #include #include +#include -matrix* weights[7]; -matrix* biases[7]; +#define NUM_LAYERS 7 + +matrix* weights[NUM_LAYERS]; +matrix* biases[NUM_LAYERS]; char letters[52] = {'A', 'a', 'B', 'b', 'C', 'c', 'D', 'd', 'E', 'e', 'F', 'f', 'G', 'g', 'H', 'h', 'I', 'i', 'J', 'j', 'K', 'k', 'L', 'l', 'M', 'm', 'N', 'n', 'O', 'o', 'P', 'p', 'Q', 'q', 'R', 'r', @@ -66,15 +69,13 @@ void read_tensor(matrix* a, const char* fileName) { FILE* file = fopen(fileName, "r"); char* line = NULL; size_t len = 0; - ssize_t read; - int line_number = 0; getline(&line, &len, file); char* token; float value; const char* delimiter = ","; token = strtok(line, delimiter); - int size = 0; + for (int i = 0; i < 225; i++) { value = strtof(token, NULL); (a->data)[i] = value; @@ -103,7 +104,7 @@ int get_max(matrix* a) { } int infer(matrix* input) { - matrix* mdl_layers[7]; + matrix* mdl_layers[NUM_LAYERS]; mdl_layers[0] = new_matrix(98, 1); mdl_layers[1] = new_matrix(65, 1); mdl_layers[2] = new_matrix(50, 1); @@ -132,6 +133,15 @@ int infer(matrix* input) { } int main(int argc, char* argv[]) { + if (argc < 3) { + printf("Not enough arguments."); + return EXIT_FAILURE; + } + + // Start timing + struct timeval stop, start; + gettimeofday(&start, NULL); + // TODO: find a way to load static weights and biases // Load model (The memory of those code should be initialize during compile time to enchance the speed) weights[0] = new_matrix(98, 225); @@ -197,5 +207,10 @@ int main(int argc, char* argv[]) { fprintf(csv_file, "%d, %c\n", i, letters[results[i]]); } fclose(csv_file); + + // Time taken + gettimeofday(&stop, NULL); + printf("took %lu us\n", (stop.tv_sec - start.tv_sec) * 1000000 + stop.tv_usec - start.tv_usec); + return EXIT_SUCCESS; } diff --git a/src/matrix.c b/src/matrix.c index dec4960..54c2d2f 100644 --- a/src/matrix.c +++ b/src/matrix.c @@ -1,7 +1,9 @@ #include "../include/matrix.h" -#include "math.h" -#include "stdio.h" -#include "stdlib.h" +#include +#include +#include + +#define UNROLL_FACTOR 8 matrix* new_matrix(int rows, int cols) { matrix* res = (matrix*)malloc(sizeof(matrix)); @@ -11,19 +13,64 @@ matrix* new_matrix(int rows, int cols) { return res; } -void matrix_mul(const matrix* __restrict__ a, const matrix* __restrict__ b, const matrix* __restrict__ result) { - int m = result->rows; - int p = a->cols; - for (int i = 0; i < m; i++) { - float sum = 0; - int h = i * p; - for (int k = 0; k < p; k++) { - sum += (a->data)[h + k] * ((b->data)[k]); +// Loop unrolling optimisation with a factor of 8 which should be enough to saturate a Zen3 core +void matrix_mul(const matrix* weights, const matrix* inputs, const matrix* __restrict__ result) { + int res_rows = result->rows; + int w_width = weights->cols; + float* w_data = weights->data; + float* i_data = inputs->data; + + int u_limit = w_width - (UNROLL_FACTOR - 1); + + for (int cur_row = 0; cur_row < res_rows; cur_row++) { + float sum0 = 0; + float sum1 = 0; + float sum2 = 0; + float sum3 = 0; + float sum4 = 0; + float sum5 = 0; + float sum6 = 0; + float sum7 = 0; + // float sum8 = 0; + // float sum9 = 0; + int row_offs = cur_row * w_width; + + int k = 0; + for (; k < u_limit; k += UNROLL_FACTOR) { + sum0 += w_data[row_offs + k] * i_data[k]; + sum1 += w_data[row_offs + k + 1] * i_data[k + 1]; + sum2 += w_data[row_offs + k + 2] * i_data[k + 2]; + sum3 += w_data[row_offs + k + 3] * i_data[k + 3]; + sum4 += w_data[row_offs + k + 4] * i_data[k + 4]; + sum5 += w_data[row_offs + k + 5] * i_data[k + 5]; + sum6 += w_data[row_offs + k + 6] * i_data[k + 6]; + sum7 += w_data[row_offs + k + 7] * i_data[k + 7]; + // sum8 += w_data[row_offs + k + 8] * i_data[k + 8]; + // sum9 += w_data[row_offs + k + 9] * i_data[k + 9]; } - (result->data)[i] = sum; + + for (; k < w_width; k++) { + sum0 += w_data[row_offs + k] * i_data[k]; + } + + (result->data)[cur_row] = sum0 + sum1 + sum2 + sum3 + sum4 + sum5 + sum6 + sum7; // + sum8 + sum9; } } +// // Old version with no specific optimisation +// void matrix_mul(const matrix* __restrict__ a, const matrix* __restrict__ b, const matrix* __restrict__ result) { +// int m = result->rows; +// int p = a->cols; +// for (int i = 0; i < m; i++) { +// float sum = 0; +// int h = i * p; +// for (int k = 0; k < p; k++) { +// sum += (a->data)[h + k] * ((b->data)[k]); +// } +// (result->data)[i] = sum; +// } +// } + void matrix_add(matrix* a, const matrix* b) { for (int i = 0; i < a->rows; i++) { (a->data)[i] += (b->data)[i];