diff --git a/Makefile b/Makefile
index 09a0ea2..c4a8c0c 100644
--- a/Makefile
+++ b/Makefile
@@ -24,7 +24,7 @@ run_test: build
 	./speed_cpu ./weights_and_biases.txt ./tensors
 
 test: build
-	./speed_cpu ./weights_and_biases.txt ./tensors 1
+	./speed_cpu ./weights_and_biases.txt ./tensors 1000000
 	mv ./results.csv ./test
 	python3 ./test/verify_csv.py
diff --git a/benchmark/matrix_mul/benchmark.c b/benchmark/matrix_mul/benchmark.c
index 767135d..bceca02 100644
--- a/benchmark/matrix_mul/benchmark.c
+++ b/benchmark/matrix_mul/benchmark.c
@@ -10,14 +10,22 @@ typedef struct {
     int cols;
 } matrix;
 
-matrix* new_matrix(int rows, int cols) {
-    matrix* res = (matrix*)malloc(sizeof(matrix));
-    res->rows = rows;
-    res->cols = cols;
-    res->data = (float*)malloc((rows * cols) * sizeof(float));
-    return res;
+matrix* new_matrix_aligned(int rows, int cols) {
+    // Pad width to fit kernel
+    int kern_align_f32 = (rows * cols + KERN_COLS - 1) / KERN_COLS * KERN_COLS;
+
+    matrix* new_mat = (matrix*)malloc(sizeof(matrix));
+    new_mat->rows = rows;
+    new_mat->cols = cols;
+
+    // Align entire array for simd access and better cache line utilisation
+    new_mat->data =
+        (f32*)aligned_alloc(SIMD_ALIGN, (((kern_align_f32 * sizeof(f32)) + SIMD_ALIGN_F32 - 1) / SIMD_ALIGN_F32 * SIMD_ALIGN_F32));
+
+    return new_mat;
 }
+
 
 int main(int argc, char* argv[]) {
     long n = 0;
     if (argc > 1) {
diff --git a/benchmark/matrix_mul/benchmark.png b/benchmark/matrix_mul/benchmark.png
new file mode 100644
index 0000000..2701a5a
Binary files /dev/null and b/benchmark/matrix_mul/benchmark.png differ
diff --git a/benchmark/matrix_mul/versions/2x8.c b/benchmark/matrix_mul/versions/2x8.c
new file mode 100644
index 0000000..b340385
--- /dev/null
+++ b/benchmark/matrix_mul/versions/2x8.c
@@ -0,0 +1,76 @@
+#include "../matrix_mul.h"
+#include <immintrin.h>
+#include <string.h>
+
+#define KERN_COLS 8
+#define VEC_IN_KERN (KERN_COLS / 8)
+#define KERN_ROWS 2
+#define SIMD_ALGN 32
+typedef float f32;
+typedef unsigned char u8;
+
+void transpose_mat_inplace(matrix* in) {
+    int cols_before = in->cols;
+    int rows_before = in->rows;
+
+    // Swapped for transpose
+    int pad_w_rows = (cols_before + KERN_ROWS - 1) / KERN_ROWS * KERN_ROWS;
+    int pad_w_width = (rows_before + KERN_COLS - 1) / KERN_COLS * KERN_COLS;
+    f32* transposed = (f32*)aligned_alloc(
+        SIMD_ALGN, (((pad_w_rows * pad_w_width * sizeof(f32)) + 8 - 1) / 8 * 8));
+    memset(transposed, 0, pad_w_rows * pad_w_width * sizeof(f32));
+
+    for (int row = 0; row < rows_before; row++) {
+        for (int col = 0; col < cols_before; col++) {
+            transposed[col * pad_w_width + row] = in->data[row * cols_before + col];
+        }
+    }
+
+    free(in->data);
+    in->data = transposed;
+    // Swap dims
+    in->cols = pad_w_width;
+    in->rows = cols_before;
+}
+
+matrix* new_matrix_aligned(int rows, int cols) {
+    // Pad width to fit kernel
+    int kern_align_f32 = (rows * cols + KERN_COLS - 1) / KERN_COLS * KERN_COLS;
+
+    matrix* new_mat = (matrix*)malloc(sizeof(matrix));
+    new_mat->rows = rows;
+    new_mat->cols = cols;
+
+    // Align entire array for simd access and better cache line utilisation
+    new_mat->data =
+        (f32*)aligned_alloc(SIMD_ALGN, (((kern_align_f32 * sizeof(f32)) + 8 - 1) / 8 * 8));
+
+    return new_mat;
+}
+
+
+void matrix_mul(const f32* weights, const f32* inputs, f32* __restrict__ results, int w_cols, int w_rows){
+    for (int row = 0; row < w_rows; row += KERN_ROWS) {
+        for (int col = 0; col < w_cols; col += KERN_COLS) {
+            int col_bound=col+KERN_COLS;
+            __m256 res[VEC_IN_KERN];
+            for (int z=col;z
+#include
+
+#define KERN_COLS 8
+#define KERN_ROWS 4
+#define SIMD_ALIGN 32
+#define SIMD_ALIGN_F32 (SIMD_ALIGN / 4) // f32 is 4 bytes
+
+typedef float f32;
+typedef unsigned char u8;
+
+void transpose_mat_inplace(matrix* in) {
+    int cols_before = in->cols;
+    int rows_before = in->rows;
+
+    // Swapped for transpose
+    int pad_w_rows = (cols_before + KERN_ROWS - 1) / KERN_ROWS * KERN_ROWS;
+    int pad_w_width = (rows_before + KERN_COLS - 1) / KERN_COLS * KERN_COLS;
+    f32* transposed = (f32*)aligned_alloc(
+        SIMD_ALIGN, (((pad_w_rows * pad_w_width * sizeof(f32)) + SIMD_ALIGN_F32 - 1) / SIMD_ALIGN_F32 * SIMD_ALIGN_F32));
+    memset(transposed, 0, pad_w_rows * pad_w_width * sizeof(f32));
+
+    for (int row = 0; row < rows_before; row++) {
+        for (int col = 0; col < cols_before; col++) {
+            transposed[col * pad_w_width + row] = in->data[row * cols_before + col];
+        }
+    }
+
+    free(in->data);
+    in->data = transposed;
+    // Swap dims
+    in->cols = pad_w_width;
+    in->rows = cols_before;
+}
+
+matrix* new_matrix_aligned(int rows, int cols) {
+    // Pad width to fit kernel
+    int kern_align_f32 = (rows * cols + KERN_COLS - 1) / KERN_COLS * KERN_COLS;
+
+    matrix* new_mat = (matrix*)malloc(sizeof(matrix));
+    new_mat->rows = rows;
+    new_mat->cols = cols;
+
+    // Align entire array for simd access and better cache line utilisation
+    new_mat->data =
+        (f32*)aligned_alloc(SIMD_ALIGN, (((kern_align_f32 * sizeof(f32)) + 8 - 1) / 8 * 8));
+
+    return new_mat;
+}
+
+
+void matrix_mul(const f32* weights, const f32* inputs, f32* __restrict__ results, int w_cols, int w_rows){
+    for (int row = 0; row < w_rows; row += KERN_ROWS) {
+        for (int col = 0; col < w_cols; col += KERN_COLS) {
+            int col_bound=col+KERN_COLS;
+            __m256 res[VEC_IN_KERN];
+            for (int z=col;z
 \n");
         return EXIT_FAILURE;
     }
-
+    // Start timing
     struct timeval stop, start, preinf;
     gettimeofday(&start, NULL);
@@ -195,7 +199,7 @@ int main(int argc, char* argv[]) {
     // int NUM_THREADS = sysconf(_SC_NPROCESSORS_ONLN);
 
     if (iter_per_in > 1)
-#pragma omp parallel
+// #pragma omp parallel
     {
        int force = 0;
        u8* results_local = (u8*)malloc(input_count * sizeof(u8));
@@ -206,7 +210,7 @@ int main(int argc, char* argv[]) {
        vector* input = new_vec_aligned(TENSOR_SIZE);
        memcpy(input->data, (f32*)&tensors[TSIZE_ALIGN_BYTES / sizeof(f32) * i], TENSOR_SIZE * sizeof(f32));
 
-#pragma omp for
+// #pragma omp for
        for (int j = 0; j < iter_per_in - 1; j++) {
            // Using global memory for model seems to be faster
            results_local[i] = infer_reuse_layers_thread(input, weights, biases);
diff --git a/src/matrix.c b/src/matrix.c
index 39f818e..9e66b2d 100644
--- a/src/matrix.c
+++ b/src/matrix.c
@@ -54,7 +54,7 @@ static void kernel(const float* in, const float* wg, float* rs, int start_row, i
 // Ver. Artemis Rosman
 // W rows and W width is expected to be for the column major matrix, i.e. len of
 // in vec = w_rows, len of out vec = w_cols
-void sgemv_t_tuned(const float* weights, const float* inputs, float* __restrict__ results, int w_width, int w_rows) {
+void sgemv_t_tuned(const float* __restrict__ weights, const float* __restrict__ inputs, float* __restrict__ results, int w_width, int w_rows) {
     // Perform mult using kernel
     for (int row = 0; row < w_rows; row += KERN_ROWS) {
         for (int col = 0; col < w_width; col += KERN_COLS) {
diff --git a/src/matrix.h b/src/matrix.h
index a6cef19..6ba3fb5 100644
--- a/src/matrix.h
+++ b/src/matrix.h
@@ -7,7 +7,7 @@ typedef signed long i64;
 
 #define KERN_COLS 8
 #define KERN_ROWS 4
 #define SIMD_ALIGN 32
-#define SIMD_ALIGN_F32 (SIMD_ALIGN / sizeof(f32)) // f32 is 4 bytes
+#define SIMD_ALIGN_F32 (SIMD_ALIGN / sizeof(f32))
 typedef struct vector {
     int len;