From 0c97bb12479a6a6af12eb254cf2cf737eae925f8 Mon Sep 17 00:00:00 2001 From: nhatdongdang Date: Sun, 7 Jul 2024 03:37:19 +0000 Subject: [PATCH 1/6] Optimize memory --- src/main.c | 14 +++++++------- src/matrix.c | 6 +++--- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/src/main.c b/src/main.c index 211703e..9721e2a 100644 --- a/src/main.c +++ b/src/main.c @@ -16,7 +16,7 @@ typedef unsigned char u8; #define NUM_LAYERS 7 #define TENSOR_SIZE 225 -#define TSIZE_ALGN_BYTES (((TENSOR_SIZE + SIMD_ALGN - 1) / SIMD_ALGN * SIMD_ALGN) * sizeof(f32)) +#define TSIZE_ALIGN_BYTES (((TENSOR_SIZE + SIMD_ALIGN_F32 - 1) / SIMD_ALIGN_F32 * SIMD_ALIGN_F32) * sizeof(f32)) matrix* weights[NUM_LAYERS]; vector* biases[NUM_LAYERS]; @@ -169,7 +169,7 @@ int main(int argc, char* argv[]) { printf("Number of input tensors: %d\n", input_count); printf("Iterations per input: %d\n", iter_per_in); - f32* tensors = (f32*)aligned_alloc(SIMD_ALGN, TSIZE_ALGN_BYTES * input_count); + f32* tensors = (f32*)aligned_alloc(SIMD_ALIGN, TSIZE_ALIGN_BYTES * input_count); // Read and process inputs char* file_path = (char*)malloc((256) * sizeof(char)); @@ -185,7 +185,7 @@ int main(int argc, char* argv[]) { strcpy(file_path, directory_path); strcat(file_path, "/"); strcat(file_path, entry->d_name); - read_tensor((f32*)&tensors[TSIZE_ALGN_BYTES / sizeof(f32) * (file_num - 1)], file_path); + read_tensor((f32*)&tensors[TSIZE_ALIGN_BYTES / sizeof(f32) * (file_num - 1)], file_path); } } closedir(dir); @@ -200,7 +200,7 @@ int main(int argc, char* argv[]) { // int NUM_THREADS = sysconf(_SC_NPROCESSORS_ONLN); if (iter_per_in > 1) -#pragma omp parallel + #pragma omp parallel { int force = 0; u8* results_local = (u8*)malloc(input_count * sizeof(u8)); @@ -209,9 +209,9 @@ int main(int argc, char* argv[]) { // printf("Thread %d: Processing input %d\n", omp_get_thread_num(), i); vector* input = new_vec_aligned(TENSOR_SIZE); - memcpy(input->data, (f32*)&tensors[TSIZE_ALGN_BYTES / sizeof(f32) * i], TENSOR_SIZE * sizeof(f32)); + memcpy(input->data, (f32*)&tensors[TSIZE_ALIGN_BYTES / sizeof(f32) * i], TENSOR_SIZE * sizeof(f32)); -#pragma omp for + #pragma omp for for (int j = 0; j < iter_per_in - 1; j++) { // Using global memory for model seems to be faster results_local[i] = infer_reuse_layers_thread(input, weights, biases); @@ -230,7 +230,7 @@ int main(int argc, char* argv[]) { vector* input = new_vec_aligned(TENSOR_SIZE); u8* results = (u8*)malloc(input_count * sizeof(u8)); for (int i = 0; i < input_count; i++) { - input->data = (f32*)&tensors[TSIZE_ALGN_BYTES / sizeof(f32) * i]; + input->data = (f32*)&tensors[TSIZE_ALIGN_BYTES / sizeof(f32) * i]; results[i] = infer_reuse_layers_thread(input, weights, biases); } diff --git a/src/matrix.c b/src/matrix.c index 3287146..aff2472 100644 --- a/src/matrix.c +++ b/src/matrix.c @@ -16,7 +16,7 @@ matrix* new_matrix_aligned(int rows, int cols) { // Align entire array for simd access and better cache line utilisation new_mat->data = - (f32*)aligned_alloc(SIMD_ALGN, (((kern_align_f32 * sizeof(f32)) + SIMD_ALGN - 1) / SIMD_ALGN * SIMD_ALGN)); + (f32*)aligned_alloc(SIMD_ALIGN, (((kern_align_f32 * sizeof(f32)) + SIMD_ALIGN_F32 - 1) / SIMD_ALIGN_F32 * SIMD_ALIGN_F32)); return new_mat; } @@ -31,7 +31,7 @@ vector* new_vec_aligned(int len) { // Align entire array for simd access and better cache line utilisation new_vec->data = - (f32*)aligned_alloc(SIMD_ALGN, (((kern_align_f32 * sizeof(f32)) + SIMD_ALGN - 1) / SIMD_ALGN * SIMD_ALGN)); + (f32*)aligned_alloc(SIMD_ALIGN, (((kern_align_f32 * sizeof(f32)) + SIMD_ALIGN_F32 - 1) / SIMD_ALIGN_F32 * SIMD_ALIGN_F32)); memset(new_vec->data, 0, kern_align_f32 * sizeof(f32)); @@ -115,7 +115,7 @@ void transpose_mat_inplace(matrix* in) { int pad_w_rows = (cols_before + KERN_ROWS - 1) / KERN_ROWS * KERN_ROWS; int pad_w_width = (rows_before + KERN_COLS - 1) / KERN_COLS * KERN_COLS; f32* transposed = (f32*)aligned_alloc( - SIMD_ALGN, (((pad_w_rows * pad_w_width * sizeof(f32)) + SIMD_ALGN - 1) / SIMD_ALGN * SIMD_ALGN)); + SIMD_ALIGN, (((pad_w_rows * pad_w_width * sizeof(f32)) + SIMD_ALIGN_F32 - 1) / SIMD_ALIGN_F32 * SIMD_ALIGN_F32)); memset(transposed, 0, pad_w_rows * pad_w_width * sizeof(f32)); for (int row = 0; row < rows_before; row++) { From 195e8978f03c6ae245ff6217817742eeeaeaf67e Mon Sep 17 00:00:00 2001 From: nhatdongdang Date: Sun, 7 Jul 2024 03:37:51 +0000 Subject: [PATCH 2/6] Optimize kernel size and SIMD_ALIGN size --- src/matrix.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/matrix.h b/src/matrix.h index 42d7b0a..4bc699c 100644 --- a/src/matrix.h +++ b/src/matrix.h @@ -5,8 +5,9 @@ typedef unsigned char u8; typedef signed long i64; #define KERN_COLS 8 -#define KERN_ROWS 2 -#define SIMD_ALGN 64 +#define KERN_ROWS 4 +#define SIMD_ALIGN 32 +#define SIMD_ALIGN_F32 (SIMD_ALIGN / 4) // f32 is 4 bytes typedef struct vector { int len; From fba79e01a8724467ca77320bd8d37e9d7f08c4b9 Mon Sep 17 00:00:00 2001 From: nhatdongdang Date: Sun, 7 Jul 2024 03:46:29 +0000 Subject: [PATCH 3/6] Remove unnecessary memory assignment --- src/main.c | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/src/main.c b/src/main.c index 9721e2a..7cfd2d3 100644 --- a/src/main.c +++ b/src/main.c @@ -28,7 +28,7 @@ char letters[52] = {'A', 'a', 'B', 'b', 'C', 'c', 'D', 'd', 'E', 'e', 'F', 'f', void propagate_fwd(const matrix* weights, const vector* inputs, vector* results, const vector* biases) { sgemv_t_tuned(weights->data, inputs->data, results->data, weights->cols, weights->rows); // Add biases onto results - vector_add_inplace(results->len, biases->data, results->data); + vector_add_inplace(biases->len, biases->data, results->data); } // Basic version, too many aligned_alloc @@ -89,31 +89,26 @@ u8 infer_reuse_layers_thread(vector* input, matrix** weights, vector** biases) { propagate_fwd(weights[1], outputs[0], outputs[1], biases[1]); relu_inplace(outputs[1]->data, 65); - outputs[0]->len = 50; memset(outputs[0]->data, 0, 50 * sizeof(f32)); propagate_fwd(weights[2], outputs[1], outputs[0], biases[2]); relu_inplace(outputs[0]->data, 50); - outputs[1]->len = 30; memset(outputs[1]->data, 0, 30 * sizeof(f32)); propagate_fwd(weights[3], outputs[0], outputs[1], biases[3]); relu_inplace(outputs[1]->data, 30); - outputs[0]->len = 25; memset(outputs[0]->data, 0, 25 * sizeof(f32)); propagate_fwd(weights[4], outputs[1], outputs[0], biases[4]); relu_inplace(outputs[0]->data, 25); - outputs[1]->len = 40; memset(outputs[1]->data, 0, 40 * sizeof(f32)); propagate_fwd(weights[5], outputs[0], outputs[1], biases[5]); relu_inplace(outputs[1]->data, 40); - outputs[0]->len = 52; memset(outputs[0]->data, 0, 52 * sizeof(f32)); propagate_fwd(weights[6], outputs[1], outputs[0], biases[6]); From 30e65358b3b807fd9deb6f7269c2cea0246bf825 Mon Sep 17 00:00:00 2001 From: nhatdongdang Date: Sun, 7 Jul 2024 03:58:59 +0000 Subject: [PATCH 4/6] Fix code formatting --- src/main.c | 4 ++-- src/matrix.c | 12 ++++++------ 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/src/main.c b/src/main.c index 7cfd2d3..47728f8 100644 --- a/src/main.c +++ b/src/main.c @@ -195,7 +195,7 @@ int main(int argc, char* argv[]) { // int NUM_THREADS = sysconf(_SC_NPROCESSORS_ONLN); if (iter_per_in > 1) - #pragma omp parallel +#pragma omp parallel { int force = 0; u8* results_local = (u8*)malloc(input_count * sizeof(u8)); @@ -206,7 +206,7 @@ int main(int argc, char* argv[]) { vector* input = new_vec_aligned(TENSOR_SIZE); memcpy(input->data, (f32*)&tensors[TSIZE_ALIGN_BYTES / sizeof(f32) * i], TENSOR_SIZE * sizeof(f32)); - #pragma omp for +#pragma omp for for (int j = 0; j < iter_per_in - 1; j++) { // Using global memory for model seems to be faster results_local[i] = infer_reuse_layers_thread(input, weights, biases); diff --git a/src/matrix.c b/src/matrix.c index aff2472..39f818e 100644 --- a/src/matrix.c +++ b/src/matrix.c @@ -15,8 +15,8 @@ matrix* new_matrix_aligned(int rows, int cols) { new_mat->cols = cols; // Align entire array for simd access and better cache line utilisation - new_mat->data = - (f32*)aligned_alloc(SIMD_ALIGN, (((kern_align_f32 * sizeof(f32)) + SIMD_ALIGN_F32 - 1) / SIMD_ALIGN_F32 * SIMD_ALIGN_F32)); + new_mat->data = (f32*)aligned_alloc( + SIMD_ALIGN, (((kern_align_f32 * sizeof(f32)) + SIMD_ALIGN_F32 - 1) / SIMD_ALIGN_F32 * SIMD_ALIGN_F32)); return new_mat; } @@ -30,8 +30,8 @@ vector* new_vec_aligned(int len) { new_vec->len = len; // Align entire array for simd access and better cache line utilisation - new_vec->data = - (f32*)aligned_alloc(SIMD_ALIGN, (((kern_align_f32 * sizeof(f32)) + SIMD_ALIGN_F32 - 1) / SIMD_ALIGN_F32 * SIMD_ALIGN_F32)); + new_vec->data = (f32*)aligned_alloc( + SIMD_ALIGN, (((kern_align_f32 * sizeof(f32)) + SIMD_ALIGN_F32 - 1) / SIMD_ALIGN_F32 * SIMD_ALIGN_F32)); memset(new_vec->data, 0, kern_align_f32 * sizeof(f32)); @@ -114,8 +114,8 @@ void transpose_mat_inplace(matrix* in) { // Swapped for transpose int pad_w_rows = (cols_before + KERN_ROWS - 1) / KERN_ROWS * KERN_ROWS; int pad_w_width = (rows_before + KERN_COLS - 1) / KERN_COLS * KERN_COLS; - f32* transposed = (f32*)aligned_alloc( - SIMD_ALIGN, (((pad_w_rows * pad_w_width * sizeof(f32)) + SIMD_ALIGN_F32 - 1) / SIMD_ALIGN_F32 * SIMD_ALIGN_F32)); + f32* transposed = (f32*)aligned_alloc(SIMD_ALIGN, (((pad_w_rows * pad_w_width * sizeof(f32)) + SIMD_ALIGN_F32 - 1) / + SIMD_ALIGN_F32 * SIMD_ALIGN_F32)); memset(transposed, 0, pad_w_rows * pad_w_width * sizeof(f32)); for (int row = 0; row < rows_before; row++) { From 9ed1a7108c1eb0a0c02110dc49855f2c392a0323 Mon Sep 17 00:00:00 2001 From: nhatdongdang Date: Sun, 7 Jul 2024 04:02:00 +0000 Subject: [PATCH 5/6] Fix styling and variable naming --- src/matrix.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/matrix.h b/src/matrix.h index 4bc699c..a6cef19 100644 --- a/src/matrix.h +++ b/src/matrix.h @@ -7,7 +7,7 @@ typedef signed long i64; #define KERN_COLS 8 #define KERN_ROWS 4 #define SIMD_ALIGN 32 -#define SIMD_ALIGN_F32 (SIMD_ALIGN / 4) // f32 is 4 bytes +#define SIMD_ALIGN_F32 (SIMD_ALIGN / sizeof(f32)) // f32 is 4 bytes typedef struct vector { int len; From 6c69bfe53619b675c3de58e95d171006511eda32 Mon Sep 17 00:00:00 2001 From: nhatdongdang Date: Sun, 7 Jul 2024 05:23:20 +0000 Subject: [PATCH 6/6] Remove unnecessary comment --- src/matrix.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/matrix.h b/src/matrix.h index a6cef19..9a3d0ed 100644 --- a/src/matrix.h +++ b/src/matrix.h @@ -7,7 +7,7 @@ typedef signed long i64; #define KERN_COLS 8 #define KERN_ROWS 4 #define SIMD_ALIGN 32 -#define SIMD_ALIGN_F32 (SIMD_ALIGN / sizeof(f32)) // f32 is 4 bytes +#define SIMD_ALIGN_F32 (SIMD_ALIGN / sizeof(f32)) typedef struct vector { int len;