From 6e0527f06e8d9d64aaf288a5fa87e798c9d99f7d Mon Sep 17 00:00:00 2001 From: Artemis Rosman <73006620+rozukke@users.noreply.github.com> Date: Thu, 4 Jul 2024 01:31:02 +1000 Subject: [PATCH 1/2] Improve softmax perf 4x --- src/main.c | 4 ++-- src/matrix.c | 36 ++++++++++++++++++------------------ 2 files changed, 20 insertions(+), 20 deletions(-) diff --git a/src/main.c b/src/main.c index 841e8ca..3163ab8 100644 --- a/src/main.c +++ b/src/main.c @@ -80,7 +80,7 @@ u8 infer(vector* input) { // Somewhat experimental, minumum number of alligned_alloc without breaking things. // This code fucking sucks but its fast so uhhhh u8 infer_reuse_layers_thread(vector* input, matrix** weights, vector** biases) { - vector* outputs[NUM_LAYERS]; + vector* outputs[2]; outputs[0] = new_vec_aligned(98); outputs[1] = new_vec_aligned(65); @@ -223,7 +223,7 @@ int main(int argc, char* argv[]) { } free(results_local); - printf("Thread %d: %d\n", omp_get_thread_num(), force); + printf("Thread %2d: %d\n", omp_get_thread_num(), force); } // Output for csv diff --git a/src/matrix.c b/src/matrix.c index 872dd2f..1624f82 100644 --- a/src/matrix.c +++ b/src/matrix.c @@ -64,7 +64,7 @@ void sgemv_t_tuned(const float* weights, const float* inputs, float* __restrict_ } // TODO: SIMD tuned versions if these are a noticeable impact -void vector_add_inplace(int len, const f32* src, f32* dest) { +void vector_add_inplace(int len, const f32* src, f32* __restrict__ dest) { for (int i = 0; i < len; i++) { dest[i] += src[i]; } @@ -77,10 +77,9 @@ void relu_inplace(f32* dest, int len) { } // Hacky but fast and accurate for existing inputs -static double fastexp(double x) { - i64 tmp = (i64)(1512775 * x + 1072632447); - tmp <<= 32; - double result; +static inline float fastexp(float x) { + int tmp = (int)(1512775 * x + 1072632447); + float result; memcpy(&result, &tmp, sizeof(result)); return result; } @@ -95,6 +94,20 @@ void softmax_inplace(f32* dest, int len) { } } +// Get result from output layer +u8 argmax(f32* in, int len) { + int idx = 0; + float res = in[0]; + for (int i = 0; i < len; i++) { + if (res < in[i]) { + res = in[i]; + idx = i; + } + } + return idx; +} + + void transpose_mat_inplace(matrix* in) { int cols_before = in->cols; int rows_before = in->rows; @@ -118,16 +131,3 @@ void transpose_mat_inplace(matrix* in) { in->cols = pad_w_width; in->rows = cols_before; } - -// Get result from output layer -u8 argmax(f32* in, int len) { - int idx = 0; - float res = in[0]; - for (int i = 0; i < len; i++) { - if (res < in[i]) { - res = in[i]; - idx = i; - } - } - return idx; -} From fae646bc11bf0ae769374a23956c74f6bdd14e77 Mon Sep 17 00:00:00 2001 From: Artemis Rosman <73006620+rozukke@users.noreply.github.com> Date: Thu, 4 Jul 2024 01:51:59 +1000 Subject: [PATCH 2/2] Fix format --- src/main.c | 4 ++-- src/matrix.c | 1 - 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/src/main.c b/src/main.c index 3163ab8..211703e 100644 --- a/src/main.c +++ b/src/main.c @@ -208,8 +208,8 @@ int main(int argc, char* argv[]) { for (int i = 0; i < input_count; i++) { // printf("Thread %d: Processing input %d\n", omp_get_thread_num(), i); - vector* input = new_vec_aligned(TSIZE_ALGN_BYTES / sizeof(f32)); - memcpy(input->data, (f32*)&tensors[TSIZE_ALGN_BYTES / sizeof(f32) * i], TSIZE_ALGN_BYTES); + vector* input = new_vec_aligned(TENSOR_SIZE); + memcpy(input->data, (f32*)&tensors[TSIZE_ALGN_BYTES / sizeof(f32) * i], TENSOR_SIZE * sizeof(f32)); #pragma omp for for (int j = 0; j < iter_per_in - 1; j++) { diff --git a/src/matrix.c b/src/matrix.c index 1624f82..3287146 100644 --- a/src/matrix.c +++ b/src/matrix.c @@ -107,7 +107,6 @@ u8 argmax(f32* in, int len) { return idx; } - void transpose_mat_inplace(matrix* in) { int cols_before = in->cols; int rows_before = in->rows;