From 0c97bb12479a6a6af12eb254cf2cf737eae925f8 Mon Sep 17 00:00:00 2001
From: nhatdongdang <nhatdongdang7777@gmail.com>
Date: Sun, 7 Jul 2024 03:37:19 +0000
Subject: [PATCH 1/6] Optimize memory

---
 src/main.c   | 14 +++++++-------
 src/matrix.c |  6 +++---
 2 files changed, 10 insertions(+), 10 deletions(-)

diff --git a/src/main.c b/src/main.c
index 211703e..9721e2a 100644
--- a/src/main.c
+++ b/src/main.c
@@ -16,7 +16,7 @@ typedef unsigned char u8;
 #define NUM_LAYERS 7
 
 #define TENSOR_SIZE 225
-#define TSIZE_ALGN_BYTES (((TENSOR_SIZE + SIMD_ALGN - 1) / SIMD_ALGN * SIMD_ALGN) * sizeof(f32))
+#define TSIZE_ALIGN_BYTES (((TENSOR_SIZE + SIMD_ALIGN_F32 - 1) / SIMD_ALIGN_F32 * SIMD_ALIGN_F32) * sizeof(f32))
 
 matrix* weights[NUM_LAYERS];
 vector* biases[NUM_LAYERS];
@@ -169,7 +169,7 @@ int main(int argc, char* argv[]) {
     printf("Number of input tensors: %d\n", input_count);
     printf("Iterations per input: %d\n", iter_per_in);
 
-    f32* tensors = (f32*)aligned_alloc(SIMD_ALGN, TSIZE_ALGN_BYTES * input_count);
+    f32* tensors = (f32*)aligned_alloc(SIMD_ALIGN, TSIZE_ALIGN_BYTES * input_count);
 
     // Read and process inputs
     char* file_path = (char*)malloc((256) * sizeof(char));
@@ -185,7 +185,7 @@ int main(int argc, char* argv[]) {
             strcpy(file_path, directory_path);
             strcat(file_path, "/");
             strcat(file_path, entry->d_name);
-            read_tensor((f32*)&tensors[TSIZE_ALGN_BYTES / sizeof(f32) * (file_num - 1)], file_path);
+            read_tensor((f32*)&tensors[TSIZE_ALIGN_BYTES / sizeof(f32) * (file_num - 1)], file_path);
         }
     }
     closedir(dir);
@@ -200,7 +200,7 @@ int main(int argc, char* argv[]) {
     // int NUM_THREADS = sysconf(_SC_NPROCESSORS_ONLN);
 
     if (iter_per_in > 1)
-#pragma omp parallel
+    #pragma omp parallel
     {
         int force = 0;
         u8* results_local = (u8*)malloc(input_count * sizeof(u8));
@@ -209,9 +209,9 @@ int main(int argc, char* argv[]) {
             // printf("Thread %d: Processing input %d\n", omp_get_thread_num(), i);
 
             vector* input = new_vec_aligned(TENSOR_SIZE);
-            memcpy(input->data, (f32*)&tensors[TSIZE_ALGN_BYTES / sizeof(f32) * i], TENSOR_SIZE * sizeof(f32));
+            memcpy(input->data, (f32*)&tensors[TSIZE_ALIGN_BYTES / sizeof(f32) * i], TENSOR_SIZE * sizeof(f32));
 
-#pragma omp for
+            #pragma omp for
             for (int j = 0; j < iter_per_in - 1; j++) {
                 // Using global memory for model seems to be faster
                 results_local[i] = infer_reuse_layers_thread(input, weights, biases);
@@ -230,7 +230,7 @@ int main(int argc, char* argv[]) {
     vector* input = new_vec_aligned(TENSOR_SIZE);
     u8* results = (u8*)malloc(input_count * sizeof(u8));
     for (int i = 0; i < input_count; i++) {
-        input->data = (f32*)&tensors[TSIZE_ALGN_BYTES / sizeof(f32) * i];
+        input->data = (f32*)&tensors[TSIZE_ALIGN_BYTES / sizeof(f32) * i];
         results[i] = infer_reuse_layers_thread(input, weights, biases);
     }
 
diff --git a/src/matrix.c b/src/matrix.c
index 3287146..aff2472 100644
--- a/src/matrix.c
+++ b/src/matrix.c
@@ -16,7 +16,7 @@ matrix* new_matrix_aligned(int rows, int cols) {
 
     // Align entire array for simd access and better cache line utilisation
     new_mat->data =
-        (f32*)aligned_alloc(SIMD_ALGN, (((kern_align_f32 * sizeof(f32)) + SIMD_ALGN - 1) / SIMD_ALGN * SIMD_ALGN));
+        (f32*)aligned_alloc(SIMD_ALIGN, (((kern_align_f32 * sizeof(f32)) + SIMD_ALIGN_F32 - 1) / SIMD_ALIGN_F32 * SIMD_ALIGN_F32));
 
     return new_mat;
 }
@@ -31,7 +31,7 @@ vector* new_vec_aligned(int len) {
 
     // Align entire array for simd access and better cache line utilisation
     new_vec->data =
-        (f32*)aligned_alloc(SIMD_ALGN, (((kern_align_f32 * sizeof(f32)) + SIMD_ALGN - 1) / SIMD_ALGN * SIMD_ALGN));
+        (f32*)aligned_alloc(SIMD_ALIGN, (((kern_align_f32 * sizeof(f32)) + SIMD_ALIGN_F32 - 1) / SIMD_ALIGN_F32 * SIMD_ALIGN_F32));
 
     memset(new_vec->data, 0, kern_align_f32 * sizeof(f32));
 
@@ -115,7 +115,7 @@ void transpose_mat_inplace(matrix* in) {
     int pad_w_rows = (cols_before + KERN_ROWS - 1) / KERN_ROWS * KERN_ROWS;
     int pad_w_width = (rows_before + KERN_COLS - 1) / KERN_COLS * KERN_COLS;
     f32* transposed = (f32*)aligned_alloc(
-        SIMD_ALGN, (((pad_w_rows * pad_w_width * sizeof(f32)) + SIMD_ALGN - 1) / SIMD_ALGN * SIMD_ALGN));
+        SIMD_ALIGN, (((pad_w_rows * pad_w_width * sizeof(f32)) + SIMD_ALIGN_F32 - 1) / SIMD_ALIGN_F32 * SIMD_ALIGN_F32));
     memset(transposed, 0, pad_w_rows * pad_w_width * sizeof(f32));
 
     for (int row = 0; row < rows_before; row++) {

From 195e8978f03c6ae245ff6217817742eeeaeaf67e Mon Sep 17 00:00:00 2001
From: nhatdongdang <nhatdongdang7777@gmail.com>
Date: Sun, 7 Jul 2024 03:37:51 +0000
Subject: [PATCH 2/6] Optimize kernel size and SIMD_ALIGN size

---
 src/matrix.h | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/src/matrix.h b/src/matrix.h
index 42d7b0a..4bc699c 100644
--- a/src/matrix.h
+++ b/src/matrix.h
@@ -5,8 +5,9 @@ typedef unsigned char u8;
 typedef signed long i64;
 
 #define KERN_COLS 8
-#define KERN_ROWS 2
-#define SIMD_ALGN 64
+#define KERN_ROWS 4
+#define SIMD_ALIGN 32
+#define SIMD_ALIGN_F32 (SIMD_ALIGN / 4) // f32 is 4 bytes
 
 typedef struct vector {
     int len;

From fba79e01a8724467ca77320bd8d37e9d7f08c4b9 Mon Sep 17 00:00:00 2001
From: nhatdongdang <nhatdongdang7777@gmail.com>
Date: Sun, 7 Jul 2024 03:46:29 +0000
Subject: [PATCH 3/6] Remove unnecessary memory assignment

---
 src/main.c | 7 +------
 1 file changed, 1 insertion(+), 6 deletions(-)

diff --git a/src/main.c b/src/main.c
index 9721e2a..7cfd2d3 100644
--- a/src/main.c
+++ b/src/main.c
@@ -28,7 +28,7 @@ char letters[52] = {'A', 'a', 'B', 'b', 'C', 'c', 'D', 'd', 'E', 'e', 'F', 'f',
 void propagate_fwd(const matrix* weights, const vector* inputs, vector* results, const vector* biases) {
     sgemv_t_tuned(weights->data, inputs->data, results->data, weights->cols, weights->rows);
     // Add biases onto results
-    vector_add_inplace(results->len, biases->data, results->data);
+    vector_add_inplace(biases->len, biases->data, results->data);
 }
 
 // Basic version, too many aligned_alloc
@@ -89,31 +89,26 @@ u8 infer_reuse_layers_thread(vector* input, matrix** weights, vector** biases) {
     propagate_fwd(weights[1], outputs[0], outputs[1], biases[1]);
     relu_inplace(outputs[1]->data, 65);
 
-    outputs[0]->len = 50;
     memset(outputs[0]->data, 0, 50 * sizeof(f32));
 
     propagate_fwd(weights[2], outputs[1], outputs[0], biases[2]);
     relu_inplace(outputs[0]->data, 50);
 
-    outputs[1]->len = 30;
     memset(outputs[1]->data, 0, 30 * sizeof(f32));
 
     propagate_fwd(weights[3], outputs[0], outputs[1], biases[3]);
     relu_inplace(outputs[1]->data, 30);
 
-    outputs[0]->len = 25;
     memset(outputs[0]->data, 0, 25 * sizeof(f32));
 
     propagate_fwd(weights[4], outputs[1], outputs[0], biases[4]);
     relu_inplace(outputs[0]->data, 25);
 
-    outputs[1]->len = 40;
     memset(outputs[1]->data, 0, 40 * sizeof(f32));
 
     propagate_fwd(weights[5], outputs[0], outputs[1], biases[5]);
     relu_inplace(outputs[1]->data, 40);
 
-    outputs[0]->len = 52;
     memset(outputs[0]->data, 0, 52 * sizeof(f32));
 
     propagate_fwd(weights[6], outputs[1], outputs[0], biases[6]);

From 30e65358b3b807fd9deb6f7269c2cea0246bf825 Mon Sep 17 00:00:00 2001
From: nhatdongdang <nhatdongdang7777@gmail.com>
Date: Sun, 7 Jul 2024 03:58:59 +0000
Subject: [PATCH 4/6] Fix code formatting

---
 src/main.c   |  4 ++--
 src/matrix.c | 12 ++++++------
 2 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/src/main.c b/src/main.c
index 7cfd2d3..47728f8 100644
--- a/src/main.c
+++ b/src/main.c
@@ -195,7 +195,7 @@ int main(int argc, char* argv[]) {
     // int NUM_THREADS = sysconf(_SC_NPROCESSORS_ONLN);
 
     if (iter_per_in > 1)
-    #pragma omp parallel
+#pragma omp parallel
     {
         int force = 0;
         u8* results_local = (u8*)malloc(input_count * sizeof(u8));
@@ -206,7 +206,7 @@ int main(int argc, char* argv[]) {
             vector* input = new_vec_aligned(TENSOR_SIZE);
             memcpy(input->data, (f32*)&tensors[TSIZE_ALIGN_BYTES / sizeof(f32) * i], TENSOR_SIZE * sizeof(f32));
 
-            #pragma omp for
+#pragma omp for
             for (int j = 0; j < iter_per_in - 1; j++) {
                 // Using global memory for model seems to be faster
                 results_local[i] = infer_reuse_layers_thread(input, weights, biases);
diff --git a/src/matrix.c b/src/matrix.c
index aff2472..39f818e 100644
--- a/src/matrix.c
+++ b/src/matrix.c
@@ -15,8 +15,8 @@ matrix* new_matrix_aligned(int rows, int cols) {
     new_mat->cols = cols;
 
     // Align entire array for simd access and better cache line utilisation
-    new_mat->data =
-        (f32*)aligned_alloc(SIMD_ALIGN, (((kern_align_f32 * sizeof(f32)) + SIMD_ALIGN_F32 - 1) / SIMD_ALIGN_F32 * SIMD_ALIGN_F32));
+    new_mat->data = (f32*)aligned_alloc(
+        SIMD_ALIGN, (((kern_align_f32 * sizeof(f32)) + SIMD_ALIGN_F32 - 1) / SIMD_ALIGN_F32 * SIMD_ALIGN_F32));
 
     return new_mat;
 }
@@ -30,8 +30,8 @@ vector* new_vec_aligned(int len) {
     new_vec->len = len;
 
     // Align entire array for simd access and better cache line utilisation
-    new_vec->data =
-        (f32*)aligned_alloc(SIMD_ALIGN, (((kern_align_f32 * sizeof(f32)) + SIMD_ALIGN_F32 - 1) / SIMD_ALIGN_F32 * SIMD_ALIGN_F32));
+    new_vec->data = (f32*)aligned_alloc(
+        SIMD_ALIGN, (((kern_align_f32 * sizeof(f32)) + SIMD_ALIGN_F32 - 1) / SIMD_ALIGN_F32 * SIMD_ALIGN_F32));
 
     memset(new_vec->data, 0, kern_align_f32 * sizeof(f32));
 
@@ -114,8 +114,8 @@ void transpose_mat_inplace(matrix* in) {
     // Swapped for transpose
     int pad_w_rows = (cols_before + KERN_ROWS - 1) / KERN_ROWS * KERN_ROWS;
     int pad_w_width = (rows_before + KERN_COLS - 1) / KERN_COLS * KERN_COLS;
-    f32* transposed = (f32*)aligned_alloc(
-        SIMD_ALIGN, (((pad_w_rows * pad_w_width * sizeof(f32)) + SIMD_ALIGN_F32 - 1) / SIMD_ALIGN_F32 * SIMD_ALIGN_F32));
+    f32* transposed = (f32*)aligned_alloc(SIMD_ALIGN, (((pad_w_rows * pad_w_width * sizeof(f32)) + SIMD_ALIGN_F32 - 1) /
+                                                       SIMD_ALIGN_F32 * SIMD_ALIGN_F32));
     memset(transposed, 0, pad_w_rows * pad_w_width * sizeof(f32));
 
     for (int row = 0; row < rows_before; row++) {

From 9ed1a7108c1eb0a0c02110dc49855f2c392a0323 Mon Sep 17 00:00:00 2001
From: nhatdongdang <nhatdongdang7777@gmail.com>
Date: Sun, 7 Jul 2024 04:02:00 +0000
Subject: [PATCH 5/6] Fix styling and variable naming

---
 src/matrix.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/matrix.h b/src/matrix.h
index 4bc699c..a6cef19 100644
--- a/src/matrix.h
+++ b/src/matrix.h
@@ -7,7 +7,7 @@ typedef signed long i64;
 #define KERN_COLS 8
 #define KERN_ROWS 4
 #define SIMD_ALIGN 32
-#define SIMD_ALIGN_F32 (SIMD_ALIGN / 4) // f32 is 4 bytes
+#define SIMD_ALIGN_F32 (SIMD_ALIGN / sizeof(f32)) // f32 is 4 bytes
 
 typedef struct vector {
     int len;

From 6c69bfe53619b675c3de58e95d171006511eda32 Mon Sep 17 00:00:00 2001
From: nhatdongdang <nhatdongdang7777@gmail.com>
Date: Sun, 7 Jul 2024 05:23:20 +0000
Subject: [PATCH 6/6] Remove unnecessary comment

---
 src/matrix.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/matrix.h b/src/matrix.h
index a6cef19..9a3d0ed 100644
--- a/src/matrix.h
+++ b/src/matrix.h
@@ -7,7 +7,7 @@ typedef signed long i64;
 #define KERN_COLS 8
 #define KERN_ROWS 4
 #define SIMD_ALIGN 32
-#define SIMD_ALIGN_F32 (SIMD_ALIGN / sizeof(f32)) // f32 is 4 bytes
+#define SIMD_ALIGN_F32 (SIMD_ALIGN / sizeof(f32))
 
 typedef struct vector {
     int len;