Commit 327d37e: Type naming consistency

rozukke committed Jul 8, 2024
1 parent bae32dc

Showing 6 changed files with 48 additions and 48 deletions.
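
Note: every change in this commit swaps the bare float keyword for the repository's f32 alias. The alias already exists in cudasrc/matrix.cuh (it is visible as hunk context in that file's diff below), so the rename is purely cosmetic. A minimal sketch, for illustration only:

    typedef float f32;   /* alias declared in cudasrc/matrix.cuh */

    f32 value = 0.0f;    /* spelling after this commit; previously: float value = 0.0f; */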
22 changes: 11 additions & 11 deletions cudasrc/main.cu
@@ -31,7 +31,7 @@ char letters[52] = {'A', 'a', 'B', 'b', 'C', 'c', 'D', 'd', 'E', 'e', 'F', 'f',

 void process_weights_str(char* line, int layer) {
     char* token;
-    float value;
+    f32 value;
     const char* delimiter = ",";
 
     token = strtok(line, delimiter);
@@ -45,7 +45,7 @@ void process_weights_str(char* line, int layer) {

 void process_biases_str(char* line, int layer) {
     char* token;
-    float value;
+    f32 value;
     const char* delimiter = ",";
 
     token = strtok(line, delimiter);
@@ -80,7 +80,7 @@ void read_model(const char* file_name) {
     fclose(file);
 }
 
-void read_tensor(float* out, const char* fileName) {
+void read_tensor(f32* out, const char* fileName) {
     FILE* file = fopen(fileName, "r");
     char* line = NULL;
     size_t len = 0;
@@ -91,7 +91,7 @@ void read_tensor(float* out, const char* fileName) {
     }
 
     char* token;
-    float value;
+    f32 value;
     const char* delimiter = ",";
     token = strtok(line, delimiter);
 
@@ -123,17 +123,17 @@ __device__ void propagate_fwd(matrix* weights, f32* input_layer, f32* output_lay
     matrix_add(output_layer, biases->data, biases->rows);
 }
 
-__global__ void infer(float* d_inputs, int* d_results, matrix** d_weights, matrix** d_biases, int it_per_input,
+__global__ void infer(f32* d_inputs, int* d_results, matrix** d_weights, matrix** d_biases, int it_per_input,
                       int in_num) {
 
-    __shared__ float shared_input[TENSOR_LENGTH];
-    float out1[98];
-    float out2[65];
+    __shared__ f32 shared_input[TENSOR_LENGTH];
+    f32 out1[98];
+    f32 out2[65];
 
     int num_threads = blockDim.x * gridDim.x;
     int thread_idx = (blockIdx.x * blockDim.x + threadIdx.x);
 
-    float* input = (float*)&d_inputs[in_num * TENSOR_LENGTH];
+    f32* input = (f32*)&d_inputs[in_num * TENSOR_LENGTH];
 
     if (threadIdx.x < TENSOR_LENGTH) {
         shared_input[threadIdx.x] = input[threadIdx.x];
@@ -228,7 +228,7 @@ int main(int argc, char* argv[]) {
     results = (int*)malloc((input_count) * sizeof(int));
     inputs = (f32*)malloc((input_count) * sizeof(f32) * TENSOR_LENGTH);
     cudaMalloc(&d_results, (input_count) * sizeof(int));
-    cudaMalloc(&d_inputs, (input_count) * sizeof(float) * TENSOR_LENGTH);
+    cudaMalloc(&d_inputs, (input_count) * sizeof(f32) * TENSOR_LENGTH);
 
     // Read and process inputs
     char* file_name = (char*)malloc((100) * sizeof(char));
@@ -253,7 +253,7 @@ int main(int argc, char* argv[]) {
     closedir(dir);
 
     // Move input array to GPU memory
-    cudaMemcpy(d_inputs, inputs, sizeof(float) * 225 * input_count, cudaMemcpyHostToDevice);
+    cudaMemcpy(d_inputs, inputs, sizeof(f32) * 225 * input_count, cudaMemcpyHostToDevice);
 
 # ifdef USE_MPI
     int it_per_gpu = num_its / num_proccesses + (process_id < (num_its % num_proccesses) ? 1 : 0);
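
Note: the USE_MPI line closing the hunk above splits num_its iterations across ranks, giving each of the first num_its % num_proccesses ranks one extra iteration (the identifier spelling is the repository's own). A worked example with hypothetical values, not taken from the commit:

    #include <stdio.h>

    /* Sketch of the USE_MPI work split above: 10 iterations over 4 ranks. */
    int main(void) {
        int num_its = 10, num_proccesses = 4;
        for (int process_id = 0; process_id < num_proccesses; process_id++) {
            int it_per_gpu = num_its / num_proccesses + (process_id < (num_its % num_proccesses) ? 1 : 0);
            printf("rank %d runs %d iterations\n", process_id, it_per_gpu); /* prints 3, 3, 2, 2 */
        }
        return 0;
    }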
36 changes: 18 additions & 18 deletions cudasrc/matrix.cu
@@ -9,11 +9,11 @@ __host__ __device__ matrix* new_matrix(int rows, int cols) {
     matrix* res = (matrix*)malloc(sizeof(matrix));
     res->rows = rows;
     res->cols = cols;
-    res->data = (float*)malloc((rows * cols) * sizeof(float));
+    res->data = (f32*)malloc((rows * cols) * sizeof(f32));
     return res;
 }
 
-__global__ void alloc(matrix* res, float* data, int rows, int cols) {
+__global__ void alloc(matrix* res, f32* data, int rows, int cols) {
     res->rows = rows;
     res->cols = cols;
     res->data = data;
@@ -22,18 +22,18 @@ __global__ void alloc(matrix* res, float* data, int rows, int cols) {
 matrix* new_matrix_d(int rows, int cols) {
     matrix* res;
     cudaMalloc(&res, sizeof(matrix));
-    float* data;
-    cudaMalloc(&data, rows * cols * sizeof(float));
+    f32* data;
+    cudaMalloc(&data, rows * cols * sizeof(f32));
     alloc<<<1, 1>>>(res, data, rows, cols);
     return res;
 }
 
 matrix* copy_to_device(matrix* h_mat) {
     matrix* res;
     cudaMalloc(&res, sizeof(matrix));
-    float* data;
-    cudaMalloc(&data, h_mat->rows * h_mat->cols * sizeof(float));
-    cudaMemcpy(data, h_mat->data, h_mat->rows * h_mat->cols * sizeof(float), cudaMemcpyHostToDevice);
+    f32* data;
+    cudaMalloc(&data, h_mat->rows * h_mat->cols * sizeof(f32));
+    cudaMemcpy(data, h_mat->data, h_mat->rows * h_mat->cols * sizeof(f32), cudaMemcpyHostToDevice);
     alloc<<<1, 1>>>(res, data, h_mat->rows, h_mat->cols);
     return res;
 }
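
Note between hunks: new_matrix_d and copy_to_device both end with a one-thread alloc<<<1, 1>>> launch. A plausible reading, not spelled out in the commit: the matrix struct itself lives in device memory, so the host cannot store into res->rows, res->cols, or res->data directly, and the tiny kernel performs those stores on the device. An alternative sketch that stages the struct on the host instead (illustrative, not what this code does):

    matrix tmp = { rows, cols, data };                              /* host-side staging */
    cudaMemcpy(res, &tmp, sizeof(matrix), cudaMemcpyHostToDevice);  /* one copy, no kernel */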
@@ -42,14 +42,14 @@ __device__ __host__ matrix* create_copy(matrix* mat) {
     matrix* res = (matrix*)malloc(sizeof(matrix));
     res->rows = mat->rows;
     res->cols = mat->cols;
-    res->data = (float*)malloc((res->rows * res->cols) * sizeof(float));
-    memcpy(res->data, mat->data, res->rows * res->cols * sizeof(float));
+    res->data = (f32*)malloc((res->rows * res->cols) * sizeof(f32));
+    memcpy(res->data, mat->data, res->rows * res->cols * sizeof(f32));
     return res;
 }
 
-__device__ void matrix_mul(float* weight, float* input, float* result, int w_rows, int w_cols) {
+__device__ void matrix_mul(f32* weight, f32* input, f32* result, int w_rows, int w_cols) {
     for (int i = 0; i < w_rows; i++) {
-        float sum = 0;
+        f32 sum = 0;
         int j = 0;
 
         for (; j <= w_cols - 4; j += 4) {
@@ -65,31 +65,31 @@ __device__ void matrix_mul(float* weight, float* input, float* result, int w_row
     }
 }
 
-__device__ void matrix_add(float* a, float* b, int rows) {
+__device__ void matrix_add(f32* a, f32* b, int rows) {
     for (int i = 0; i < rows; i++) {
         a[i] += b[i];
     }
 }
 
-__device__ void relu(float* a, int rows) {
+__device__ void relu(f32* a, int rows) {
     for (int i = 0; i < rows; i++) {
         a[i] = (a[i] > 0) ? a[i] : 0;
     }
 }
 
-__device__ void softmax(float* a, int rows) {
-    float sum = 0.0;
+__device__ void softmax(f32* a, int rows) {
+    f32 sum = 0.0;
     for (size_t i = 0; i < rows; i++) {
         sum += __expf(a[i]);
     }
-    float t = __logf(sum);
+    f32 t = __logf(sum);
     for (size_t i = 0; i < rows; i++) {
         a[i] = __expf(a[i] - t);
     }
 }
 
-__device__ int argmax(float* a, int rows) {
-    float res = a[0];
+__device__ int argmax(f32* a, int rows) {
+    f32 res = a[0];
     int idx = 0;
     for (int i = 0; i < rows; i++) {
         if (res < a[i]) {
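
Note on softmax above (only its types changed in this commit): subtracting t = __logf(sum) inside __expf is the normalization itself, since

    exp(a_i - log S) = exp(a_i) / S,   where S = sum_j exp(a_j)

so the first loop accumulates S and the second rescales every entry in place.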
12 changes: 6 additions & 6 deletions cudasrc/matrix.cuh
@@ -5,7 +5,7 @@ typedef float f32;
 typedef struct {
     int rows;
     int cols;
-    float* data; // array
+    f32* data; // array
 } matrix;
 
 __host__ __device__ matrix* new_matrix(int rows, int cols);
@@ -14,14 +14,14 @@ matrix* copy_to_device(matrix* h_mat);

 matrix* new_matrix_d(int rows, int cols);
 
-__device__ void matrix_mul(float* a, float* b, float* c, int rows, int cols);
+__device__ void matrix_mul(f32* a, f32* b, f32* c, int rows, int cols);
 
-__device__ void matrix_add(float* a, float* b, int rows);
+__device__ void matrix_add(f32* a, f32* b, int rows);
 
-__device__ void relu(float* a, int rows);
+__device__ void relu(f32* a, int rows);
 
-__device__ void softmax(float* a, int rows);
+__device__ void softmax(f32* a, int rows);
 
-__device__ int argmax(float* a, int rows);
+__device__ int argmax(f32* a, int rows);
 
 __device__ __host__ matrix* create_copy(matrix* mat);
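
For orientation, a minimal host-side usage sketch of this header, assuming only the declarations above (the dimensions are hypothetical):

    matrix* h_w = new_matrix(98, 225);   /* host allocation; data is f32 */
    /* ... fill h_w->data, e.g. from the model file ... */
    matrix* d_w = copy_to_device(h_w);   /* device mirror: cudaMalloc + cudaMemcpy */
    matrix* d_s = new_matrix_d(98, 1);   /* scratch matrix allocated directly on the GPU */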
8 changes: 4 additions & 4 deletions src/file_io.c
@@ -20,7 +20,7 @@ int file_count(const char* dir_path) {

 void process_weights_str(matrix** weights, char* line, int layer) {
     char* token;
-    float value;
+    f32 value;
     const char* delimiter = ",";
 
     token = strtok(line, delimiter);
@@ -34,7 +34,7 @@ void process_weights_str(matrix** weights, char* line, int layer) {

 void process_biases_str(vector** biases, char* line, int layer) {
     char* token;
-    float value;
+    f32 value;
     const char* delimiter = ",";
 
     token = strtok(line, delimiter);
@@ -78,9 +78,9 @@ void read_tensor(f32* a, const char* file_name) {
         perror("Could not read tensor file. Exiting.");
         exit(EXIT_FAILURE);
     }
 
     char* token;
-    float value;
+    f32 value;
     const char* delimiter = ",";
     token = strtok(line, delimiter);
 
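
The hunks in this file show only the declarations around each change; the loop that consumes the tokens is collapsed out of the diff. A hypothetical sketch of its likely shape, for context only:

    token = strtok(line, delimiter);
    for (int i = 0; token != NULL; i++) {
        value = strtof(token, NULL);       /* parse one comma-separated field as f32 */
        a[i] = value;                      /* e.g. into read_tensor's output buffer */
        token = strtok(NULL, delimiter);
    }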
6 changes: 3 additions & 3 deletions src/main.c
@@ -25,7 +25,7 @@ char letters[52] = {'A', 'a', 'B', 'b', 'C', 'c', 'D', 'd', 'E', 'e', 'F', 'f',
                     'J', 'j', 'K', 'k', 'L', 'l', 'M', 'm', 'N', 'n', 'O', 'o', 'P', 'p', 'Q', 'q', 'R', 'r',
                     'S', 's', 'T', 't', 'U', 'u', 'V', 'v', 'W', 'w', 'X', 'x', 'Y', 'y', 'Z', 'z'};
 
-void propagate_fwd(const matrix* weights, const float* inputs, float* results, const vector* biases) {
+void propagate_fwd(const matrix* weights, const f32* inputs, f32* results, const vector* biases) {
     sgemv_t_tuned(weights->data, inputs, results, weights->cols, weights->rows);
     // Add biases onto results
     vector_add_inplace(biases->len, biases->data, results);
@@ -35,8 +35,8 @@ void propagate_fwd(const matrix* weights, const float* inputs, float* results, c
 // This code f***ing sucks but its fast so uhhhh
 u8 infer_reuse_layers_thread(vector* input, matrix** weights, vector** biases) {
     // Slightly larger than required for padding
-    float out0[104] __attribute__((aligned(SIMD_ALIGN))) = {0};
-    float out1[72] __attribute__((aligned(SIMD_ALIGN))) = {0};
+    f32 out0[104] __attribute__((aligned(SIMD_ALIGN))) = {0};
+    f32 out1[72] __attribute__((aligned(SIMD_ALIGN))) = {0};
 
     propagate_fwd(weights[0], input->data, out0, biases[0]);
     relu_inplace(out0, 98);
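
A reading of the "slightly larger than required" comment above, inferred from the code rather than stated in the commit: the layer widths used here are 98 and 65, and each buffer rounds up to the next multiple of 8, the width of one __m256 vector in src/matrix.c, so the SIMD kernel can safely load and store full vectors past the logical end:

    (98 + 7) / 8 * 8 = 104   /* out0 */
    (65 + 7) / 8 * 8 = 72    /* out1 */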
12 changes: 6 additions & 6 deletions src/matrix.c
@@ -39,7 +39,7 @@ vector* new_vec_aligned(int len) {
 }
 
 // ver. Artemis Rosman simd_intrin 2x8
-static void kernel(const float* in, const float* wg, float* rs, int start_row, int start_col, int w_width) {
+static void kernel(const f32* in, const f32* wg, f32* rs, int start_row, int start_col, int w_width) {
     // printf("Kernel at row %d col %d\n", start_row, start_col);
     __m256 res = _mm256_load_ps(&rs[start_col]);
 
@@ -54,7 +54,7 @@ static void kernel(const float* in, const float* wg, float* rs, int start_row, i
 // Ver. Artemis Rosman
 // W rows and W width is expected to be for the column major matrix, i.e. len of
 // in vec = w_rows, len of out vec = w_cols
-void sgemv_t_tuned(const float* weights, const float* inputs, float* __restrict__ results, int w_width, int w_rows) {
+void sgemv_t_tuned(const f32* weights, const f32* inputs, f32* __restrict__ results, int w_width, int w_rows) {
     // Perform mult using kernel
     for (int row = 0; row < w_rows; row += KERN_ROWS) {
         for (int col = 0; col < w_width; col += KERN_COLS) {
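
Between hunks: KERN_ROWS and KERN_COLS are defined outside this diff. The "simd_intrin 2x8" comment and the single __m256 accumulator in kernel suggest a 2-row by 8-column tile, which would also explain why callers pad output widths to multiples of 8, but that is an inference, not something this commit states.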
@@ -77,15 +77,15 @@ void relu_inplace(f32* dest, int len) {
 }
 
 // Hacky but fast and accurate for existing inputs
-static inline float fastexp(float x) {
+static inline f32 fastexp(f32 x) {
     int tmp = (int)(1512775 * x + 1072632447);
-    float result;
+    f32 result;
     memcpy(&result, &tmp, sizeof(result));
     return result;
 }
 
 void softmax_inplace(f32* dest, int len) {
-    float res = 0.0f;
+    f32 res = 0.0f;
     for (int i = 0; i < len; i++) {
         res += fastexp(dest[i]);
     }
@@ -97,7 +97,7 @@ void softmax_inplace(f32* dest, int len) {
 // Get result from output layer
 u8 argmax(f32* in, int len) {
     int idx = 0;
-    float res = in[0];
+    f32 res = in[0];
     for (int i = 0; i < len; i++) {
         if (res < in[i]) {
             res = in[i];
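
A note on fastexp above (only its types changed here): building a float's bit pattern from a linear function of x is the Schraudolph exponential trick. The constants in this file are not the textbook single-precision pair; the in-code comment suggests they were judged accurate enough for this network's inputs, and since softmax_inplace feeds argmax, any monotone increasing approximation preserves the predicted class. For comparison, a commonly cited single-precision variant, illustrative only and not what this repository uses:

    #include <string.h>

    /* Schraudolph-style approximate expf: 12102203 ~= 2^23 / ln 2, and
       1064866805 = 127 * 2^23 - 486411 (exponent bias minus an error-tuning shift). */
    static inline float schraudolph_expf(float x) {
        int tmp = (int)(12102203.0f * x + 1064866805.0f);
        float result;
        memcpy(&result, &tmp, sizeof(result));
        return result;
    }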
