diff --git a/Makefile b/Makefile
index 09a0ea2..c4a8c0c 100644
--- a/Makefile
+++ b/Makefile
@@ -24,7 +24,7 @@ run_test: build
 	./speed_cpu ./weights_and_biases.txt ./tensors
 
 test: build
-	./speed_cpu ./weights_and_biases.txt ./tensors 1
+	./speed_cpu ./weights_and_biases.txt ./tensors 1000000
 	mv ./results.csv ./test
 	python3 ./test/verify_csv.py
diff --git a/benchmark/matrix_mul/benchmark.c b/benchmark/matrix_mul/benchmark.c
index 767135d..bceca02 100644
--- a/benchmark/matrix_mul/benchmark.c
+++ b/benchmark/matrix_mul/benchmark.c
@@ -10,14 +10,22 @@ typedef struct {
     int cols;
 } matrix;
 
-matrix* new_matrix(int rows, int cols) {
-    matrix* res = (matrix*)malloc(sizeof(matrix));
-    res->rows = rows;
-    res->cols = cols;
-    res->data = (float*)malloc((rows * cols) * sizeof(float));
-    return res;
+matrix* new_matrix_aligned(int rows, int cols) {
+    // Pad width to fit kernel
+    int kern_align_f32 = (rows * cols + KERN_COLS - 1) / KERN_COLS * KERN_COLS;
+
+    matrix* new_mat = (matrix*)malloc(sizeof(matrix));
+    new_mat->rows = rows;
+    new_mat->cols = cols;
+
+    // Align entire array for simd access and better cache line utilisation
+    new_mat->data =
+        (f32*)aligned_alloc(SIMD_ALIGN, (((kern_align_f32 * sizeof(f32)) + SIMD_ALIGN_F32 - 1) / SIMD_ALIGN_F32 * SIMD_ALIGN_F32));
+
+    return new_mat;
 }
+
 
 int main(int argc, char* argv[]) {
     long n = 0;
     if (argc > 1) {
diff --git a/benchmark/matrix_mul/benchmark.png b/benchmark/matrix_mul/benchmark.png
new file mode 100644
index 0000000..2701a5a
Binary files /dev/null and b/benchmark/matrix_mul/benchmark.png differ
diff --git a/benchmark/matrix_mul/versions/2x8.c b/benchmark/matrix_mul/versions/2x8.c
new file mode 100644
index 0000000..b340385
--- /dev/null
+++ b/benchmark/matrix_mul/versions/2x8.c
@@ -0,0 +1,76 @@
+#include "../matrix_mul.h"
+#include <immintrin.h>
+#include <string.h>
+
+#define KERN_COLS 8
+#define VEC_IN_KERN (KERN_COLS / 8)
+#define KERN_ROWS 2
+#define SIMD_ALGN 32
+typedef float f32;
+typedef unsigned char u8;
+
+void transpose_mat_inplace(matrix* in) {
+    int cols_before = in->cols;
+    int rows_before = in->rows;
+
+    // Swapped for transpose
+    int pad_w_rows = (cols_before + KERN_ROWS - 1) / KERN_ROWS * KERN_ROWS;
+    int pad_w_width = (rows_before + KERN_COLS - 1) / KERN_COLS * KERN_COLS;
+    f32* transposed = (f32*)aligned_alloc(
+        SIMD_ALGN, (((pad_w_rows * pad_w_width * sizeof(f32)) + 8 - 1) / 8 * 8));
+    memset(transposed, 0, pad_w_rows * pad_w_width * sizeof(f32));
+
+    for (int row = 0; row < rows_before; row++) {
+        for (int col = 0; col < cols_before; col++) {
+            transposed[col * pad_w_width + row] = in->data[row * cols_before + col];
+        }
+    }
+
+    free(in->data);
+    in->data = transposed;
+    // Swap dims
+    in->cols = pad_w_width;
+    in->rows = cols_before;
+}
+
+matrix* new_matrix_aligned(int rows, int cols) {
+    // Pad width to fit kernel
+    int kern_align_f32 = (rows * cols + KERN_COLS - 1) / KERN_COLS * KERN_COLS;
+
+    matrix* new_mat = (matrix*)malloc(sizeof(matrix));
+    new_mat->rows = rows;
+    new_mat->cols = cols;
+
+    // Align entire array for simd access and better cache line utilisation
+    new_mat->data =
+        (f32*)aligned_alloc(SIMD_ALGN, (((kern_align_f32 * sizeof(f32)) + 8 - 1) / 8 * 8));
+
+    return new_mat;
+}
+
+
+void matrix_mul(const f32* weights, const f32* inputs, f32* __restrict__ results, int w_cols, int w_rows){
+    for (int row = 0; row < w_rows; row += KERN_ROWS) {
+        for (int col = 0; col < w_cols; col += KERN_COLS) {
+            int col_bound=col+KERN_COLS;
+            __m256 res[VEC_IN_KERN];
+            for (int z=col;z
+#include
+
+#define KERN_COLS 8
+#define KERN_ROWS 4
+#define SIMD_ALIGN 32
+#define SIMD_ALIGN_F32 (SIMD_ALIGN / 4) // f32 is 4 bytes
+
+typedef float f32;
+typedef unsigned char u8;
+
+void transpose_mat_inplace(matrix* in) {
+    int cols_before = in->cols;
+    int rows_before = in->rows;
+
+    // Swapped for transpose
+    int pad_w_rows = (cols_before + KERN_ROWS - 1) / KERN_ROWS * KERN_ROWS;
+    int pad_w_width = (rows_before + KERN_COLS - 1) / KERN_COLS * KERN_COLS;
+    f32* transposed = (f32*)aligned_alloc(
+        SIMD_ALIGN, (((pad_w_rows * pad_w_width * sizeof(f32)) + SIMD_ALIGN_F32 - 1) / SIMD_ALIGN_F32 * SIMD_ALIGN_F32));
+    memset(transposed, 0, pad_w_rows * pad_w_width * sizeof(f32));
+
+    for (int row = 0; row < rows_before; row++) {
+        for (int col = 0; col < cols_before; col++) {
+            transposed[col * pad_w_width + row] = in->data[row * cols_before + col];
+        }
+    }
+
+    free(in->data);
+    in->data = transposed;
+    // Swap dims
+    in->cols = pad_w_width;
+    in->rows = cols_before;
+}
+
+matrix* new_matrix_aligned(int rows, int cols) {
+    // Pad width to fit kernel
+    int kern_align_f32 = (rows * cols + KERN_COLS - 1) / KERN_COLS * KERN_COLS;
+
+    matrix* new_mat = (matrix*)malloc(sizeof(matrix));
+    new_mat->rows = rows;
+    new_mat->cols = cols;
+
+    // Align entire array for simd access and better cache line utilisation
+    new_mat->data =
+        (f32*)aligned_alloc(SIMD_ALIGN, (((kern_align_f32 * sizeof(f32)) + 8 - 1) / 8 * 8));
+
+    return new_mat;
+}
+
+
+void matrix_mul(const f32* weights, const f32* inputs, f32* __restrict__ results, int w_cols, int w_rows){
+    for (int row = 0; row < w_rows; row += KERN_ROWS) {
+        for (int col = 0; col < w_cols; col += KERN_COLS) {
+            int col_bound=col+KERN_COLS;
+            __m256 res[VEC_IN_KERN];
+            for (int z=col;z
 \n");
         return EXIT_FAILURE;
     }
-
+    // Start timing
     struct timeval stop, start, preinf;
     gettimeofday(&start, NULL);
@@ -195,7 +199,7 @@ int main(int argc, char* argv[]) {
     // int NUM_THREADS = sysconf(_SC_NPROCESSORS_ONLN);
 
     if (iter_per_in > 1)
-#pragma omp parallel
+// #pragma omp parallel
     {
        int force = 0;
        u8* results_local = (u8*)malloc(input_count * sizeof(u8));
@@ -206,7 +210,7 @@ int main(int argc, char* argv[]) {
        vector* input = new_vec_aligned(TENSOR_SIZE);
        memcpy(input->data, (f32*)&tensors[TSIZE_ALIGN_BYTES / sizeof(f32) * i], TENSOR_SIZE * sizeof(f32));
 
-#pragma omp for
+// #pragma omp for
        for (int j = 0; j < iter_per_in - 1; j++) {
            // Using global memory for model seems to be faster
            results_local[i] = infer_reuse_layers_thread(input, weights, biases);
diff --git a/src/matrix.c b/src/matrix.c
index 39f818e..9e66b2d 100644
--- a/src/matrix.c
+++ b/src/matrix.c
@@ -54,7 +54,7 @@ static void kernel(const float* in, const float* wg, float* rs, int start_row, i
 // Ver. Artemis Rosman
 // W rows and W width is expected to be for the column major matrix, i.e. len of
 // in vec = w_rows, len of out vec = w_cols
-void sgemv_t_tuned(const float* weights, const float* inputs, float* __restrict__ results, int w_width, int w_rows) {
+void sgemv_t_tuned(const float* __restrict__ weights, const float* __restrict__ inputs, float* __restrict__ results, int w_width, int w_rows) {
     // Perform mult using kernel
     for (int row = 0; row < w_rows; row += KERN_ROWS) {
         for (int col = 0; col < w_width; col += KERN_COLS) {
diff --git a/src/matrix.h b/src/matrix.h
index a6cef19..6ba3fb5 100644
--- a/src/matrix.h
+++ b/src/matrix.h
@@ -7,7 +7,7 @@ typedef signed long i64;
 
 #define KERN_COLS 8
 #define KERN_ROWS 4
 #define SIMD_ALIGN 32
-#define SIMD_ALIGN_F32 (SIMD_ALIGN / sizeof(f32)) // f32 is 4 bytes
+#define SIMD_ALIGN_F32 (SIMD_ALIGN / sizeof(f32))
 typedef struct vector {
     int len;