diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index f81ecb0..f1b40b1 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -1,34 +1,34 @@
 name: CI
 
 on:
-    push:
-        branches: main
-        paths: ['**.cu','**.c','**.cpp', '**.h', '**CMakeLists.txt'] 
-    pull_request:
-        branches: main
-        paths: ['**.cu','**.c','**.cpp', '**.h', '**CMakeLists.txt']
+  push:
+    branches: main
+    paths: ["**.cu", "**.c", "**.cpp", "**.h", "**CMakeLists.txt"]
+  pull_request:
+    branches: main
+    paths: ["**.cu", "**.c", "**.cpp", "**.h", "**CMakeLists.txt"]
 
 jobs:
-    build-and-test:
-        runs-on: ubuntu-latest
+  build-and-test:
+    runs-on: ubuntu-latest
 
-        steps:
-        - name: Checkout code
-          uses: actions/checkout@v4
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
 
-        - name: Setup python
-          uses: actions/setup-python@v5
-          with:
-            python-version: '3.10' 
+      - name: Setup python
+        uses: actions/setup-python@v5
+        with:
+          python-version: "3.10"
 
-        - name: Install dependencies
-          run: |
-            pip install pandas
+      - name: Install dependencies
+        run: |
+          pip install pandas
 
-        - name: Build project
-          run: |
-            make build
-            
-        - name: Run test suite
-          run: |
-            make test
\ No newline at end of file
+      - name: Build project
+        run: |
+          make build
+
+      - name: Run test suite
+        run: |
+          make test_cpu
diff --git a/CMakeLists.txt b/CMakeLists.txt
index d2d5cbd..faea7b0 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1,30 +1,36 @@
 cmake_minimum_required(VERSION 3.16)
 
-# Set the project name
-project(ichida-algo)
+project(ichida-algo LANGUAGES C CXX)
 
 set(CMAKE_C_FLAGS "-O3 -march=native -ffast-math -funroll-loops -fopenmp -Wall -Wextra")
-
 set(CMAKE_C_STANDARD 11)
 set(CMAKE_C_STANDARD_REQUIRED True)
-# set(CMAKE_VERBOSE_MAKEFILE ON)
+set(CMAKE_VERBOSE_MAKEFILE ON)
 
-set(SRC_DIR src)
 set(INC_DIR include)
-set(LIB_DIR lib)
-set(TEST_DIR test)
-set(BENCHMARK_DIR benchmark)
+set(SRC_DIR src)
+set(CUDA_SRC_DIR cudasrc)
 
-# Source files
-file(GLOB_RECURSE SOURCE_FILES ${SRC_DIR}/*.c)
+include_directories(${INC_DIR})
 
-include_directories(include)
+file(GLOB_RECURSE SOURCE_FILES ${SRC_DIR}/*.c)
 
 add_executable(speed_cpu ${SOURCE_FILES})
-# add_executable(benchmark ${SRC_DIR}/matrix.c ${BENCHMARK_DIR}/benchmark.c)
-
-target_link_libraries(speed_cpu m pthread)
-# target_link_libraries(benchmark m)
-
+target_link_libraries(speed_cpu m pthread gomp)
+
+find_package(CUDA)
+
+if(CUDA_FOUND)
+    enable_language(CUDA)
+    set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Xptxas -O3 --use_fast_math -Xcompiler -march=native -unroll-aggressive -arch=sm_80")
+    find_package(MPI REQUIRED)
+    include_directories(${MPI_INCLUDE_PATH})
+    file(GLOB_RECURSE CUDA_SOURCE_FILES ${CUDA_SRC_DIR}/*.cu)
+    add_executable(speed_gpu ${CUDA_SOURCE_FILES})
+    set_target_properties(speed_gpu PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
+    target_link_libraries(speed_gpu m ${MPI_LIBRARIES})
+else()
+    message(STATUS "CUDA not found, only CPU version will be built.")
+endif()
 
 
diff --git a/Makefile b/Makefile
index 09a0ea2..0253aee 100644
--- a/Makefile
+++ b/Makefile
@@ -1,37 +1,36 @@
-.PHONY: all test clean run build run_test
+.PHONY: all clean build run_cpu run_gpu test_cpu test_gpu bench stat
 
-all: rebuild
+# Default iterations
+iterations ?= 1000
+
+all: build
 
 clean:
 	rm -f test/results.csv
 	rm -f results.csv
 	rm -rf build
-	rm -f speed_cpu
+	rm -f speed_cpu speed_gpu
 
 build: clean
-	cmake -Bbuild
-	$(MAKE) -C ./build
-	mv ./build/speed_cpu ./
-    
-rebuild:
-	$(MAKE) -C ./build
-	mv ./build/speed_cpu ./
-
-run: build
-	./speed_demo_cpu.sh ./weights_and_biases.txt ./tensors
-
-run_test: build
-	./speed_cpu ./weights_and_biases.txt ./tensors
-
-test: build
-	./speed_cpu ./weights_and_biases.txt ./tensors 1
-	mv ./results.csv ./test
-	python3 ./test/verify_csv.py
+	cmake -S . -B build -DCMAKE_BUILD_TYPE=Release
+	$(MAKE) -C build
+	cp -u build/speed_cpu ./
+	if [ -f build/speed_gpu ]; then cp -u build/speed_gpu ./; fi
 
-bench: build
-	./build/benchmark
+run_cpu: build
+	./speed_cpu ./weights_and_biases.txt ./tensors $(iterations)
 
-stat: build
-	python3 ./benchmark/stat.py
+run_gpu: build
+	n_gpus=$(shell nvidia-smi --query-gpu=name --format=csv,noheader | wc -l); \
+	mpirun -np $$n_gpus ./speed_gpu ./weights_and_biases.txt ./tensors $(iterations)
 
+test_cpu: build
+	./speed_cpu ./weights_and_biases.txt ./tensors $(iterations)
+	mv ./results.csv ./test
+	python3 ./test/verify_csv.py
 
+test_gpu: build
+	n_gpus=$(shell nvidia-smi --query-gpu=name --format=csv,noheader | wc -l); \
+	mpirun -np $$n_gpus ./speed_gpu ./weights_and_biases.txt ./tensors $(iterations)
+	mv ./results.csv ./test
+	python3 ./test/verify_csv.py
\ No newline at end of file
diff --git a/benchmark/benchmark.c b/benchmark/benchmark.c
deleted file mode 100644
index 22c7eef..0000000
--- a/benchmark/benchmark.c
+++ /dev/null
@@ -1,62 +0,0 @@
-#include "../include/matrix.h"
-#include "util.h"
-#include <stdint.h>
-#include <stdio.h>
-#include <stdlib.h>
-
-double benchmark_matrix_mul(int iterations, matrix* a, matrix* b, matrix* c) {
-    double res = 0;
-    for (int i = 0; i < iterations; i++) {
-        uint64_t start = rdtsc();
-        matrix_mul(a, b, c);
-        uint64_t end = rdtsc();
-        res += (double)(end - start) / (a->rows * a->cols);
-    }
-    res /= iterations;
-    return res;
-}
-
-double benchmark_matrix_add(int iterations, matrix* a, matrix* b) {
-    double res = 0;
-    for (int i = 0; i < iterations; i++) {
-        uint64_t start = rdtsc();
-        matrix_add(a, b);
-        uint64_t end = rdtsc();
-        res += (double)(end - start) / (a->rows);
-    }
-    res /= iterations;
-    return res;
-}
-
-double benchmark_relu(int iterations, matrix* a) {
-    double res = 0;
-    for (int i = 0; i < iterations; i++) {
-        uint64_t start = rdtsc();
-        relu(a);
-        uint64_t end = rdtsc();
-        res += (double)(end - start) / (a->rows);
-    }
-    res /= iterations;
-    return res;
-}
-
-double benchmark_softmax(int iterations, matrix* a) {
-    double res = 0;
-    for (int i = 0; i < iterations; i++) {
-        uint64_t start = rdtsc();
-        softmax(a);
-        uint64_t end = rdtsc();
-        res += (double)(end - start) / (a->rows * 2);
-    }
-    res /= iterations;
-    return res;
-}
-
-int main() {
-    int iterations = 200000;
-    printf("- matrix_mul: %f CPE\n",
-           benchmark_matrix_mul(iterations, new_matrix(2000, 1000), new_matrix(2000, 1), new_matrix(2000, 1)));
-    printf("- matrix_add: %f CPE\n", benchmark_matrix_add(iterations, new_matrix(2000, 1), new_matrix(2000, 1)));
-    printf("- relu: %f CPE\n", benchmark_relu(iterations, new_matrix(2000, 1)));
-    printf("- softmax: %f CPE\n", benchmark_softmax(iterations, new_matrix(2000, 1)));
-}
\ No newline at end of file
diff --git a/benchmark/matrix_add/Makefile b/benchmark/cpu/matrix_add/Makefile
similarity index 100%
rename from benchmark/matrix_add/Makefile
rename to benchmark/cpu/matrix_add/Makefile
diff --git a/benchmark/matrix_add/benchmark.c b/benchmark/cpu/matrix_add/benchmark.c
similarity index 100%
rename from benchmark/matrix_add/benchmark.c
rename to benchmark/cpu/matrix_add/benchmark.c
diff --git a/benchmark/matrix_add/matrix_add.h b/benchmark/cpu/matrix_add/matrix_add.h
similarity index 100%
rename from benchmark/matrix_add/matrix_add.h
rename to benchmark/cpu/matrix_add/matrix_add.h
diff --git a/benchmark/matrix_add/plot.py b/benchmark/cpu/matrix_add/plot.py
similarity index 100%
rename from benchmark/matrix_add/plot.py
rename to benchmark/cpu/matrix_add/plot.py
diff --git a/benchmark/matrix_add/versions/matrix_add_v1.c b/benchmark/cpu/matrix_add/versions/matrix_add_v1.c
similarity index 100%
rename from benchmark/matrix_add/versions/matrix_add_v1.c
rename to benchmark/cpu/matrix_add/versions/matrix_add_v1.c
diff --git a/benchmark/matrix_mul/Makefile b/benchmark/cpu/matrix_mul/Makefile
similarity index 100%
rename from benchmark/matrix_mul/Makefile
rename to benchmark/cpu/matrix_mul/Makefile
diff --git a/benchmark/matrix_mul/benchmark.c b/benchmark/cpu/matrix_mul/benchmark.c
similarity index 100%
rename from benchmark/matrix_mul/benchmark.c
rename to benchmark/cpu/matrix_mul/benchmark.c
diff --git a/benchmark/matrix_mul/matrix_mul.h b/benchmark/cpu/matrix_mul/matrix_mul.h
similarity index 100%
rename from benchmark/matrix_mul/matrix_mul.h
rename to benchmark/cpu/matrix_mul/matrix_mul.h
diff --git a/benchmark/matrix_mul/plot.py b/benchmark/cpu/matrix_mul/plot.py
similarity index 100%
rename from benchmark/matrix_mul/plot.py
rename to benchmark/cpu/matrix_mul/plot.py
diff --git a/benchmark/matrix_mul/versions/matrix_mul_v1.c b/benchmark/cpu/matrix_mul/versions/matrix_mul_v1.c
similarity index 100%
rename from benchmark/matrix_mul/versions/matrix_mul_v1.c
rename to benchmark/cpu/matrix_mul/versions/matrix_mul_v1.c
diff --git a/benchmark/matrix_mul/versions/matrix_mul_v2.c b/benchmark/cpu/matrix_mul/versions/matrix_mul_v2.c
similarity index 100%
rename from benchmark/matrix_mul/versions/matrix_mul_v2.c
rename to benchmark/cpu/matrix_mul/versions/matrix_mul_v2.c
diff --git a/benchmark/multithreading/plot.py b/benchmark/cpu/multithreading/plot.py
similarity index 100%
rename from benchmark/multithreading/plot.py
rename to benchmark/cpu/multithreading/plot.py
diff --git a/benchmark/relu/Makefile b/benchmark/cpu/relu/Makefile
similarity index 100%
rename from benchmark/relu/Makefile
rename to benchmark/cpu/relu/Makefile
diff --git a/benchmark/relu/benchmark.c b/benchmark/cpu/relu/benchmark.c
similarity index 100%
rename from benchmark/relu/benchmark.c
rename to benchmark/cpu/relu/benchmark.c
diff --git a/benchmark/relu/plot.py b/benchmark/cpu/relu/plot.py
similarity index 100%
rename from benchmark/relu/plot.py
rename to benchmark/cpu/relu/plot.py
diff --git a/benchmark/relu/relu.h b/benchmark/cpu/relu/relu.h
similarity index 100%
rename from benchmark/relu/relu.h
rename to benchmark/cpu/relu/relu.h
diff --git a/benchmark/relu/versions/relu_v1.c b/benchmark/cpu/relu/versions/relu_v1.c
similarity index 100%
rename from benchmark/relu/versions/relu_v1.c
rename to benchmark/cpu/relu/versions/relu_v1.c
diff --git a/benchmark/relu/versions/relu_v2.c b/benchmark/cpu/relu/versions/relu_v2.c
similarity index 100%
rename from benchmark/relu/versions/relu_v2.c
rename to benchmark/cpu/relu/versions/relu_v2.c
diff --git a/benchmark/softmax/Makefile b/benchmark/cpu/softmax/Makefile
similarity index 100%
rename from benchmark/softmax/Makefile
rename to benchmark/cpu/softmax/Makefile
diff --git a/benchmark/softmax/benchmark.c b/benchmark/cpu/softmax/benchmark.c
similarity index 100%
rename from benchmark/softmax/benchmark.c
rename to benchmark/cpu/softmax/benchmark.c
diff --git a/benchmark/softmax/plot.py b/benchmark/cpu/softmax/plot.py
similarity index 100%
rename from benchmark/softmax/plot.py
rename to benchmark/cpu/softmax/plot.py
diff --git a/benchmark/softmax/softmax.h b/benchmark/cpu/softmax/softmax.h
similarity index 100%
rename from benchmark/softmax/softmax.h
rename to benchmark/cpu/softmax/softmax.h
diff --git a/benchmark/softmax/versions/softmax_v1.c b/benchmark/cpu/softmax/versions/softmax_v1.c
similarity index 100%
rename from benchmark/softmax/versions/softmax_v1.c
rename to benchmark/cpu/softmax/versions/softmax_v1.c
diff --git a/benchmark/gpu/matrix_add/Makefile b/benchmark/gpu/matrix_add/Makefile
new file mode 100644
index 0000000..c754193
--- /dev/null
+++ b/benchmark/gpu/matrix_add/Makefile
@@ -0,0 +1,19 @@
+compile = nvcc -O3 -arch=sm_75 --use_fast_math 
+SRC_DIR := versions
+BIN_DIR := bin
+SRC_FILES := $(wildcard $(SRC_DIR)/*.cu)
+EXECUTABLES := $(patsubst $(SRC_DIR)/%.cu, $(BIN_DIR)/%, $(SRC_FILES))
+
+all: clean $(EXECUTABLES) 
+
+clean:
+	rm -f -r bin
+	mkdir bin
+
+$(BIN_DIR)/%: $(SRC_DIR)/%.cu
+	$(compile) $< benchmark.cu -o $@.exe 
+
+plot: all
+	python3 ./plot.py
+
+
diff --git a/benchmark/gpu/matrix_add/benchmark.cu b/benchmark/gpu/matrix_add/benchmark.cu
new file mode 100644
index 0000000..cc2ce0d
--- /dev/null
+++ b/benchmark/gpu/matrix_add/benchmark.cu
@@ -0,0 +1,13 @@
+#include "template.cuh"
+#include <stdio.h>
+#include <time.h>
+
+int main(int argc, char* argv[]) {
+    long n;
+    if (argc > 1) {
+        n = atol(argv[1]);
+    } else {
+        n = 100000;
+    }
+    printf("%f", time(n));
+}
\ No newline at end of file
diff --git a/benchmark/gpu/matrix_add/plot.py b/benchmark/gpu/matrix_add/plot.py
new file mode 100644
index 0000000..703c9a8
--- /dev/null
+++ b/benchmark/gpu/matrix_add/plot.py
@@ -0,0 +1,50 @@
+import os
+import subprocess
+import matplotlib.pyplot as plt
+
+result = subprocess.run(['make'], capture_output=True, text=True)
+# Define the folder containing the executables
+folder_path = './bin'  # Change this to your bin folder path
+
+# Define the input sizes to test
+start=10000
+end=10000
+step=100000
+
+input_sizes = list(range(start, end+1, step))
+# Initialize a dictionary to store runtimes for each executable
+runtimes = {exe: [] for exe in os.listdir(folder_path) if os.path.isfile(os.path.join(folder_path, exe))}
+
+# Loop through each executable
+for exe in runtimes.keys():
+    exe_path = os.path.join(folder_path, exe)
+    
+    # Loop through each input size
+    for n in range(start,end+1,step):
+        # Run the executable with the input size and capture its output
+        result = subprocess.run([exe_path, str(n)], capture_output=True, text=True)
+        
+        # Parse the output to get the runtime
+        runtime = float(result.stdout.strip())
+        print(exe,runtime)
+        
+        # Append the runtime to the corresponding executable list
+        runtimes[exe].append(runtime)
+
+# Plot the data
+plt.figure(figsize=(12, 6))
+
+# Loop through each executable and plot the runtimes
+for exe, times in runtimes.items():
+    plt.plot(input_sizes, times, marker='o', label=exe)
+
+plt.xlabel('Iterations')
+plt.ylabel('Runtime (s)')
+plt.title('Benchmark of Function Versions')
+plt.legend()
+plt.grid(True)
+plt.tight_layout()
+
+output_file = 'benchmark_plot.png'  # Specify your desired output file name and format
+plt.savefig(output_file)
+# Show the plot
\ No newline at end of file
diff --git a/benchmark/gpu/matrix_add/template.cuh b/benchmark/gpu/matrix_add/template.cuh
new file mode 100644
index 0000000..baa94a5
--- /dev/null
+++ b/benchmark/gpu/matrix_add/template.cuh
@@ -0,0 +1,10 @@
+#pragma once
+
+typedef struct {
+    int rows;
+    int cols;
+    float* data; // array
+} matrix;
+
+double time(int n);
+matrix* new_matrix_d(int rows, int cols);
\ No newline at end of file
diff --git a/benchmark/gpu/matrix_add/versions/1.cu b/benchmark/gpu/matrix_add/versions/1.cu
new file mode 100644
index 0000000..cd4b001
--- /dev/null
+++ b/benchmark/gpu/matrix_add/versions/1.cu
@@ -0,0 +1,44 @@
+#include "../template.cuh"
+
+matrix* new_matrix(int rows, int cols) {
+    matrix* res = (matrix*)malloc(sizeof(matrix));
+    res->rows = rows;
+    res->cols = cols;
+    res->data = (float*)malloc((rows * cols) * sizeof(float));
+    return res;
+}
+
+matrix* new_matrix_d(int rows, int cols) {
+    matrix* res = (matrix*)malloc(sizeof(matrix));
+    res->rows = rows;
+    res->cols = cols;
+    res->cols = cols;
+    cudaMalloc((void**)&(res->data), rows * cols * sizeof(float));
+    return res;
+}
+
+__global__ void matrix_add(float *a, float*b ,int rows)
+{
+    int idx = blockIdx.x * blockDim.x + threadIdx.x;
+    if (idx<rows){
+        a[idx]+=b[idx];
+    }
+}
+
+double time(int n) {
+    int row=100000;
+    matrix* a = new_matrix_d(row, 1);
+    matrix* b = new_matrix_d(row, 1);
+    cudaStream_t stream1;
+    cudaStreamCreate ( &stream1);
+    
+    int thread=1024;
+    int block=((row+thread-1)/thread);
+
+    clock_t start = clock();
+    for(int i=0;i<n;i++){
+        matrix_add<<<1,1,0,stream1>>>(a->data,b->data,row);
+    }
+    double seconds = (double)(clock() - (double)start) / CLOCKS_PER_SEC;
+    return seconds;
+}
\ No newline at end of file
diff --git a/benchmark/gpu/matrix_add/versions/cpu.cu b/benchmark/gpu/matrix_add/versions/cpu.cu
new file mode 100644
index 0000000..ce36a1c
--- /dev/null
+++ b/benchmark/gpu/matrix_add/versions/cpu.cu
@@ -0,0 +1,37 @@
+#include "../template.cuh"
+
+matrix* new_matrix(int rows, int cols) {
+    matrix* res = (matrix*)malloc(sizeof(matrix));
+    res->rows = rows;
+    res->cols = cols;
+    res->data = (float*)malloc((rows * cols) * sizeof(float));
+    return res;
+}
+
+matrix* new_matrix_d(int rows, int cols) {
+    matrix* res = (matrix*)malloc(sizeof(matrix));
+    res->rows = rows;
+    res->cols = cols;
+    res->cols = cols;
+    cudaMalloc((void**)&(res->data), rows * cols * sizeof(float));
+    return res;
+}
+
+void matrix_add(float* a, float* b, int rows) {
+    for (int i = 0; i < rows; i++) {
+        a[i] += b[i];
+    }
+}
+
+double time(int n) {
+    int row=100000;
+    matrix* a = new_matrix(row, 1);
+    matrix* b = new_matrix(row, 1);
+
+    clock_t start = clock();
+    for (int i = 0; i < n; i++) {
+        matrix_add(a->data, b->data,row);
+    }
+    double seconds = (double)(clock() - (double)start) / CLOCKS_PER_SEC;
+    return seconds;
+}
\ No newline at end of file
diff --git a/benchmark/multithreading/v1.png b/benchmark/multithreading/v1.png
deleted file mode 100644
index 042bd8a..0000000
Binary files a/benchmark/multithreading/v1.png and /dev/null differ
diff --git a/benchmark/util.h b/benchmark/util.h
deleted file mode 100644
index 7d25c35..0000000
--- a/benchmark/util.h
+++ /dev/null
@@ -1,20 +0,0 @@
-#ifndef UTIL_H
-#define UTIL_H
-
-#ifdef __ARM_ARCH_ISA_A64
-#include <stdint.h>
-
-inline uint64_t rdtsc() {
-    uint64_t val;
-    __asm volatile("mrs %0, cntvct_el0" : "=r"(val));
-    return val;
-}
-
-#else
-#include <stdint.h>
-#include <x86intrin.h>
-
-inline uint64_t rdtsc() { return __rdtsc(); }
-#endif
-
-#endif // UTIL_H
diff --git a/cudasrc/main.cu b/cudasrc/main.cu
new file mode 100644
index 0000000..732c948
--- /dev/null
+++ b/cudasrc/main.cu
@@ -0,0 +1,277 @@
+#include "matrix.cuh"
+#include <dirent.h>
+#include <mpi.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/time.h>
+
+#define NUM_LAYERS 7
+
+#define CUDA_CHECK(call)                                                                                               \
+    do {                                                                                                               \
+        cudaError_t err = call;                                                                                        \
+        if (err != cudaSuccess) {                                                                                      \
+            fprintf(stderr, "CUDA error in %s (%s:%d): %s\n", __func__, __FILE__, __LINE__, cudaGetErrorString(err));  \
+            exit(EXIT_FAILURE);                                                                                        \
+        }                                                                                                              \
+    } while (0)
+
+matrix* weights[NUM_LAYERS];
+matrix* biases[NUM_LAYERS];
+
+// device weights and biases;
+matrix** d_weights;
+matrix** d_biases;
+
+float* inputs;
+float* d_inputs;
+int* results;
+int* d_results;
+
+char letters[52] = {'A', 'a', 'B', 'b', 'C', 'c', 'D', 'd', 'E', 'e', 'F', 'f', 'G', 'g', 'H', 'h', 'I', 'i',
+                    'J', 'j', 'K', 'k', 'L', 'l', 'M', 'm', 'N', 'n', 'O', 'o', 'P', 'p', 'Q', 'q', 'R', 'r',
+                    'S', 's', 'T', 't', 'U', 'u', 'V', 'v', 'W', 'w', 'X', 'x', 'Y', 'y', 'Z', 'z'};
+
+void process_weights_str(char* line, int layer) {
+    char* token;
+    float value;
+    const char* delimiter = ",";
+
+    token = strtok(line, delimiter);
+    int n = (weights[layer]->rows) * (weights[layer]->cols);
+    for (int i = 0; i < n; i++) {
+        value = strtof(token, NULL);
+        (weights[layer]->data)[i] = value;
+        token = strtok(NULL, delimiter);
+    }
+}
+
+void process_biases_str(char* line, int layer) {
+    char* token;
+    float value;
+    const char* delimiter = ",";
+
+    token = strtok(line, delimiter);
+
+    int n = biases[layer]->rows;
+    for (int i = 0; i < n; i++) {
+        value = strtof(token, NULL);
+        (biases[layer]->data)[i] = value;
+        token = strtok(NULL, delimiter);
+    }
+}
+
+void read_model(const char* file_name) {
+    FILE* file = fopen(file_name, "r");
+
+    char* line = NULL;
+    size_t len = 0;
+    int line_number = 0;
+    int layer = 0;
+
+    while ((getline(&line, &len, file)) != -1) {
+        if ((line_number - 1) % 4 == 0) {
+            process_weights_str(line, layer);
+        } else if ((line_number - 3) % 4 == 0) {
+            process_biases_str(line, layer);
+            layer++;
+        }
+        line_number++;
+    }
+
+    free(line);
+    fclose(file);
+}
+
+void read_tensor(float* a, const char* fileName) {
+    FILE* file = fopen(fileName, "r");
+    char* line = NULL;
+    size_t len = 0;
+
+    getline(&line, &len, file);
+    char* token;
+    float value;
+    const char* delimiter = ",";
+    token = strtok(line, delimiter);
+
+    for (int i = 0; i < 225; i++) {
+        value = strtof(token, NULL);
+        a[i] = value;
+        token = strtok(NULL, delimiter);
+    }
+    free(line);
+    fclose(file);
+}
+
+__device__ void propagate_fwd(matrix* weights, float* input_layer, float* output_layer, matrix* biases) {
+    matrix_mul(weights->data, input_layer, output_layer, weights->rows, weights->cols);
+    matrix_add(output_layer, biases->data, biases->rows);
+}
+
+#define BLOCKS 108
+#define THREADS_PER_BLOCK 1024
+
+__global__ void infer(float* d_inputs, int* d_results, matrix** d_weights, matrix** d_biases, int it_per_input,
+                      int in_num) {
+
+    __shared__ float sharedInput[225];
+    float out1[98];
+    float out2[65];
+
+    int num_threads = blockDim.x * gridDim.x;
+    int thread_idx = (blockIdx.x * blockDim.x + threadIdx.x);
+
+    float* input = (float*)&d_inputs[in_num * 225];
+
+    if (threadIdx.x < 225) {
+        sharedInput[threadIdx.x] = input[threadIdx.x];
+    }
+    __syncthreads();
+
+    for (int i = thread_idx; i < it_per_input; i += num_threads) {
+        propagate_fwd(d_weights[0], sharedInput, out1, d_biases[0]);
+        relu(out1, 98);
+
+        propagate_fwd(d_weights[1], out1, out2, d_biases[1]);
+        relu(out2, 65);
+
+        propagate_fwd(d_weights[2], out2, out1, d_biases[2]);
+        relu(out1, 50);
+
+        propagate_fwd(d_weights[3], out1, out2, d_biases[3]);
+        relu(out2, 30);
+
+        propagate_fwd(d_weights[4], out2, out1, d_biases[4]);
+        relu(out1, 25);
+
+        propagate_fwd(d_weights[5], out1, out2, d_biases[5]);
+        relu(out2, 40);
+
+        propagate_fwd(d_weights[6], out2, out1, d_biases[6]);
+        softmax(out1, 52);
+
+        d_results[in_num] = argmax(out1, 52);
+    }
+}
+int main(int argc, char* argv[]) {
+    MPI_Init(&argc, &argv);
+    int totalProcess, processId;
+    MPI_Comm_size(MPI_COMM_WORLD, &totalProcess); // size
+    MPI_Comm_rank(MPI_COMM_WORLD, &processId);    // gpuid
+
+    if (argc < 4) {
+        printf("Not enough arguments. Usage: speed_cpu <path_to_model.txt> <tensors_dir/> <number_of_inferences>\n");
+        MPI_Finalize();
+        return EXIT_FAILURE;
+    }
+
+    // get no of gpu
+    int deviceCount;
+    cudaGetDeviceCount(&deviceCount);
+    int deviceId = processId % deviceCount;
+    cudaSetDevice(deviceId);
+
+    // Start timing
+    struct timeval stop, start;
+    gettimeofday(&start, NULL);
+
+    weights[0] = new_matrix(98, 225);
+    weights[1] = new_matrix(65, 98);
+    weights[2] = new_matrix(50, 65);
+    weights[3] = new_matrix(30, 50);
+    weights[4] = new_matrix(25, 30);
+    weights[5] = new_matrix(40, 25);
+    weights[6] = new_matrix(52, 40);
+
+    biases[0] = new_matrix(98, 1);
+    biases[1] = new_matrix(65, 1);
+    biases[2] = new_matrix(50, 1);
+    biases[3] = new_matrix(30, 1);
+    biases[4] = new_matrix(25, 1);
+    biases[5] = new_matrix(40, 1);
+    biases[6] = new_matrix(52, 1);
+    read_model(argv[1]);
+
+    CUDA_CHECK(cudaMalloc(&d_weights, NUM_LAYERS * sizeof(matrix*)));
+    CUDA_CHECK(cudaMalloc(&d_biases, NUM_LAYERS * sizeof(matrix*)));
+    for (int i = 0; i < NUM_LAYERS; i++) {
+        matrix* a = copy_to_device(weights[i]);
+        matrix* b = copy_to_device(biases[i]);
+        CUDA_CHECK(cudaMemcpy(&(d_weights[i]), &a, sizeof(matrix*), cudaMemcpyHostToDevice));
+        CUDA_CHECK(cudaMemcpy(&(d_biases[i]), &b, sizeof(matrix*), cudaMemcpyHostToDevice));
+    }
+
+    const char* directory_path = argv[2];
+    struct dirent* entry;
+    DIR* dir = opendir(directory_path);
+
+    // Read and process inputs
+    char* file_name = (char*)malloc((100) * sizeof(char));
+    char* file_num_str = (char*)malloc((100) * sizeof(char));
+
+    int file_num;
+    int input_count = 0;
+    while ((entry = readdir(dir)) != NULL) {
+        if (entry->d_type == DT_REG) {
+            input_count++;
+        }
+    }
+
+    results = (int*)malloc((input_count) * sizeof(int));
+    inputs = (float*)malloc((input_count) * sizeof(float) * 225);
+
+    cudaMalloc(&d_results, (input_count) * sizeof(int));
+    cudaMalloc(&d_inputs, (input_count) * sizeof(float) * 225);
+
+    dir = opendir(directory_path);
+    while ((entry = readdir(dir)) != NULL) {
+        if (entry->d_type == DT_REG) {
+            strcpy(file_num_str, entry->d_name);
+            file_num_str[strlen(entry->d_name) - 7] = '\0';
+            file_num = atoi(entry->d_name);
+            strcpy(file_name, directory_path);
+            strcat(file_name, "/");
+            strcat(file_name, entry->d_name);
+            read_tensor((float*)&inputs[(file_num - 1) * 225], file_name);
+        }
+    }
+
+    free(file_name);
+    free(file_num_str);
+    closedir(dir);
+
+    cudaMemcpy(d_inputs, inputs, sizeof(float) * 225 * input_count, cudaMemcpyHostToDevice);
+
+    int it_num = atoi(argv[3]);
+    int gpu_it_num = it_num / totalProcess + (processId < (it_num % totalProcess) ? 1 : 0);
+
+    struct timeval stop1, start1;
+    gettimeofday(&start1, NULL);
+
+    cudaDeviceSynchronize();
+    for (int i = 0; i < input_count; i++) {
+        infer<<<BLOCKS, THREADS_PER_BLOCK>>>(d_inputs, d_results, d_weights, d_biases, gpu_it_num, i);
+        CUDA_CHECK(cudaGetLastError());
+    }
+    cudaDeviceSynchronize();
+
+    if (processId == 0) {
+        cudaMemcpy(results, d_results, (input_count) * (sizeof(int)), cudaMemcpyDeviceToHost);
+        gettimeofday(&stop1, NULL);
+        printf("Process %d - Inference: %lu us\n", processId,
+               (stop1.tv_sec - start1.tv_sec) * 1000000 + stop1.tv_usec - start1.tv_usec);
+        FILE* csv_file = fopen("results.csv", "w+");
+        fprintf(csv_file, "image_number, guess\n");
+        for (int i = 0; i < input_count; i++) {
+            fprintf(csv_file, "%d, %c\n", i + 1, letters[results[i]]);
+        }
+        fclose(csv_file);
+    }
+    // Time taken
+    gettimeofday(&stop, NULL);
+    printf("Process %d - Total: %lu us\n", processId,
+           (stop.tv_sec - start.tv_sec) * 1000000 + stop.tv_usec - start.tv_usec);
+    MPI_Finalize();
+    return EXIT_SUCCESS;
+}
diff --git a/cudasrc/matrix.cu b/cudasrc/matrix.cu
new file mode 100644
index 0000000..c223f6c
--- /dev/null
+++ b/cudasrc/matrix.cu
@@ -0,0 +1,110 @@
+#include "matrix.cuh"
+#include "util.cuh"
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+#define UNROLL_FACTOR 8
+
+__host__ __device__ matrix* new_matrix(int rows, int cols) {
+    matrix* res = (matrix*)malloc(sizeof(matrix));
+    res->rows = rows;
+    res->cols = cols;
+    res->data = (float*)malloc((rows * cols) * sizeof(float));
+    return res;
+}
+
+__global__ void alloc(matrix* res, float* data, int rows, int cols) {
+    res->rows = rows;
+    res->cols = cols;
+    res->data = data;
+}
+
+matrix* new_matrix_d(int rows, int cols) {
+    matrix* res;
+    CUDA_CHECK(cudaMalloc(&res, sizeof(matrix)));
+    float* data;
+    cudaMalloc(&data, rows * cols * sizeof(float));
+    alloc<<<1, 1>>>(res, data, rows, cols);
+    return res;
+}
+
+matrix* copy_to_device(matrix* h_mat) {
+    matrix* res;
+    CUDA_CHECK(cudaMalloc(&res, sizeof(matrix)));
+    float* data;
+    cudaMalloc(&data, h_mat->rows * h_mat->cols * sizeof(float));
+    cudaMemcpy(data, h_mat->data, h_mat->rows * h_mat->cols * sizeof(float), cudaMemcpyHostToDevice);
+    alloc<<<1, 1>>>(res, data, h_mat->rows, h_mat->cols);
+    return res;
+}
+
+__device__ __host__ matrix* create_copy(matrix* mat) {
+    matrix* res = (matrix*)malloc(sizeof(matrix));
+    res->rows = mat->rows;
+    res->cols = mat->cols;
+    res->data = (float*)malloc((res->rows * res->cols) * sizeof(float));
+    memcpy(res->data, mat->data, res->rows * res->cols * sizeof(float));
+    return res;
+}
+
+__device__ void matrix_mul(float* weight, float* input, float* result, int w_rows, int w_cols) {
+    for (int i = 0; i < w_rows; i++) {
+        float sum = 0;
+        int j = 0;
+
+        for (; j <= w_cols - 4; j += 4) {
+            sum += weight[i * w_cols + j] * input[j];
+            sum += weight[i * w_cols + j + 1] * input[j + 1];
+            sum += weight[i * w_cols + j + 2] * input[j + 2];
+            sum += weight[i * w_cols + j + 3] * input[j + 3];
+        }
+        for (; j < w_cols; j++) {
+            sum += weight[i * w_cols + j] * input[j];
+        }
+        result[i] = sum;
+    }
+}
+
+__device__ void matrix_add(float* a, float* b, int rows) {
+    for (int i = 0; i < rows; i++) {
+        a[i] += b[i];
+    }
+}
+
+__device__ void relu(float* a, int rows) {
+    for (int i = 0; i < rows; i++) {
+        a[i] = (a[i] > 0) ? a[i] : 0;
+    }
+}
+
+// Hacky but fast and accurate for existing inputs
+static __device__ inline float fastexp(float x) {
+    int tmp = (int)(1512775 * x + 1072632447);
+    float result;
+    memcpy(&result, &tmp, sizeof(result));
+    return result;
+}
+
+__device__ void softmax(float* a, int rows) {
+    float sum = 0.0;
+    for (size_t i = 0; i < rows; i++) {
+        sum += __expf(a[i]);
+    }
+    float t = __logf(sum);
+    for (size_t i = 0; i < rows; i++) {
+        a[i] = __expf(a[i] - t);
+    }
+}
+
+__device__ int argmax(float* a, int rows) {
+    float res = a[0];
+    int idx = 0;
+    for (int i = 0; i < rows; i++) {
+        if (res < a[i]) {
+            res = a[i];
+            idx = i;
+        }
+    }
+    return idx;
+}
diff --git a/cudasrc/matrix.cuh b/cudasrc/matrix.cuh
new file mode 100644
index 0000000..b2191cb
--- /dev/null
+++ b/cudasrc/matrix.cuh
@@ -0,0 +1,25 @@
+#pragma once
+
+typedef struct {
+    int rows;
+    int cols;
+    float* data; // array
+} matrix;
+
+ __host__ __device__ matrix* new_matrix(int rows, int cols);
+
+matrix* copy_to_device(matrix* h_mat);
+
+matrix* new_matrix_d(int rows, int cols);
+
+__device__ void matrix_mul(float* a, float* b, float* c, int rows, int cols);
+
+__device__ void matrix_add(float* a, float* b, int rows);
+
+__device__ void relu(float* a, int rows);
+
+__device__ void softmax(float* a, int rows);
+
+__device__ int argmax(float* a, int rows);
+
+__device__ __host__ matrix* create_copy(matrix* mat);
diff --git a/cudasrc/util.cuh b/cudasrc/util.cuh
new file mode 100644
index 0000000..a6f988a
--- /dev/null
+++ b/cudasrc/util.cuh
@@ -0,0 +1,8 @@
+#pragma once
+
+#define CUDA_CHECK(call)                                                                                               \
+    cudaError_t err = call;                                                                                            \
+    if (err != cudaSuccess) {                                                                                          \
+        fprintf(stderr, "CUDA error in %s (%s:%d): %s\n", __func__, __FILE__, __LINE__, cudaGetErrorString(err));      \
+        exit(EXIT_FAILURE);                                                                                            \
+    }
\ No newline at end of file