Merge pull request #8 from rozukke/fix/speed-demon

Fix remaining serious performance issues
kachi-group · Jun 26, 2024 · 8090249 · 8090249
2 parents adc90cc + 05063db
commit 8090249
Show file tree

Hide file tree

Showing 3 changed files with 82 additions and 19 deletions.
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -3,9 +3,10 @@ cmake_minimum_required(VERSION 3.16)
 # Set the project name
 project(ichida-algo)
 
-set(CMAKE_CXX_FLAGS "-O3 -Wall -Wextra")
+set(CMAKE_C_FLAGS "-O3 -march=native -ffast-math -funroll-loops -Wall -Wextra")
 set(CMAKE_C_STANDARD 99)
 set(CMAKE_C_STANDARD_REQUIRED True)
+set(CMAKE_VERBOSE_MAKEFILE ON)
 
 set(SRC_DIR src)
 set(INC_DIR include)

diff --git a/src/main.c b/src/main.c
@@ -3,9 +3,12 @@
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
+#include <sys/time.h>
 
-matrix* weights[7];
-matrix* biases[7];
+#define NUM_LAYERS 7
+
+matrix* weights[NUM_LAYERS];
+matrix* biases[NUM_LAYERS];
 
 char letters[52] = {'A', 'a', 'B', 'b', 'C', 'c', 'D', 'd', 'E', 'e', 'F', 'f', 'G', 'g', 'H', 'h', 'I', 'i',
                     'J', 'j', 'K', 'k', 'L', 'l', 'M', 'm', 'N', 'n', 'O', 'o', 'P', 'p', 'Q', 'q', 'R', 'r',
@@ -66,15 +69,13 @@ void read_tensor(matrix* a, const char* fileName) {
     FILE* file = fopen(fileName, "r");
     char* line = NULL;
     size_t len = 0;
-    ssize_t read;
-    int line_number = 0;
 
     getline(&line, &len, file);
     char* token;
     float value;
     const char* delimiter = ",";
     token = strtok(line, delimiter);
-    int size = 0;
+
     for (int i = 0; i < 225; i++) {
         value = strtof(token, NULL);
         (a->data)[i] = value;
@@ -103,7 +104,7 @@ int get_max(matrix* a) {
 }
 
 int infer(matrix* input) {
-    matrix* mdl_layers[7];
+    matrix* mdl_layers[NUM_LAYERS];
     mdl_layers[0] = new_matrix(98, 1);
     mdl_layers[1] = new_matrix(65, 1);
     mdl_layers[2] = new_matrix(50, 1);
@@ -132,6 +133,15 @@ int infer(matrix* input) {
 }
 
 int main(int argc, char* argv[]) {
+    if (argc < 3) {
+        printf("Not enough arguments.");
+        return EXIT_FAILURE;
+    }
+
+    // Start timing
+    struct timeval stop, start;
+    gettimeofday(&start, NULL);
+
     // TODO: find a way to load static weights and biases
     // Load model (ideally this memory would be initialized at compile time to enhance speed)
     weights[0] = new_matrix(98, 225);
@@ -197,5 +207,10 @@ int main(int argc, char* argv[]) {
         fprintf(csv_file, "%d, %c\n", i, letters[results[i]]);
     }
     fclose(csv_file);
+
+    // Time taken
+    gettimeofday(&stop, NULL);
+    printf("took %lu us\n", (stop.tv_sec - start.tv_sec) * 1000000 + stop.tv_usec - start.tv_usec);
+
     return EXIT_SUCCESS;
 }
diff --git a/src/matrix.c b/src/matrix.c
@@ -1,7 +1,9 @@
 #include "../include/matrix.h"
-#include "math.h"
-#include "stdio.h"
-#include "stdlib.h"
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+#define UNROLL_FACTOR 8
 
 matrix* new_matrix(int rows, int cols) {
     matrix* res = (matrix*)malloc(sizeof(matrix));
@@ -11,19 +13,64 @@ matrix* new_matrix(int rows, int cols) {
     return res;
 }
 
-void matrix_mul(const matrix* __restrict__ a, const matrix* __restrict__ b, const matrix* __restrict__ result) {
-    int m = result->rows;
-    int p = a->cols;
-    for (int i = 0; i < m; i++) {
-        float sum = 0;
-        int h = i * p;
-        for (int k = 0; k < p; k++) {
-            sum += (a->data)[h + k] * ((b->data)[k]);
+// Loop unrolled by a factor of 8 (UNROLL_FACTOR) using eight separate partial sums (sum0..sum7), which should be enough to saturate a Zen 3 core
+void matrix_mul(const matrix* weights, const matrix* inputs, const matrix* __restrict__ result) {
+    int res_rows = result->rows;
+    int w_width = weights->cols;
+    float* w_data = weights->data;
+    float* i_data = inputs->data;
+
+    int u_limit = w_width - (UNROLL_FACTOR - 1);
+
+    for (int cur_row = 0; cur_row < res_rows; cur_row++) {
+        float sum0 = 0;
+        float sum1 = 0;
+        float sum2 = 0;
+        float sum3 = 0;
+        float sum4 = 0;
+        float sum5 = 0;
+        float sum6 = 0;
+        float sum7 = 0;
+        // float sum8 = 0;
+        // float sum9 = 0;
+        int row_offs = cur_row * w_width;
+
+        int k = 0;
+        for (; k < u_limit; k += UNROLL_FACTOR) {
+            sum0 += w_data[row_offs + k] * i_data[k];
+            sum1 += w_data[row_offs + k + 1] * i_data[k + 1];
+            sum2 += w_data[row_offs + k + 2] * i_data[k + 2];
+            sum3 += w_data[row_offs + k + 3] * i_data[k + 3];
+            sum4 += w_data[row_offs + k + 4] * i_data[k + 4];
+            sum5 += w_data[row_offs + k + 5] * i_data[k + 5];
+            sum6 += w_data[row_offs + k + 6] * i_data[k + 6];
+            sum7 += w_data[row_offs + k + 7] * i_data[k + 7];
+            // sum8 += w_data[row_offs + k + 8] * i_data[k + 8];
+            // sum9 += w_data[row_offs + k + 9] * i_data[k + 9];
         }
-        (result->data)[i] = sum;
+
+        for (; k < w_width; k++) {
+            sum0 += w_data[row_offs + k] * i_data[k];
+        }
+
+        (result->data)[cur_row] = sum0 + sum1 + sum2 + sum3 + sum4 + sum5 + sum6 + sum7; // + sum8 + sum9;
     }
 }
 
+// // Old version with no specific optimisation
+// void matrix_mul(const matrix* __restrict__ a, const matrix* __restrict__ b, const matrix* __restrict__ result) {
+//     int m = result->rows;
+//     int p = a->cols;
+//     for (int i = 0; i < m; i++) {
+//         float sum = 0;
+//         int h = i * p;
+//         for (int k = 0; k < p; k++) {
+//             sum += (a->data)[h + k] * ((b->data)[k]);
+//         }
+//         (result->data)[i] = sum;
+//     }
+// }
+
 void matrix_add(matrix* a, const matrix* b) {
     for (int i = 0; i < a->rows; i++) {
         (a->data)[i] += (b->data)[i];