Commit 327d37e: Type naming consistency

rozukke committed Jul 8, 2024
1 parent bae32dc

Showing 6 changed files with 48 additions and 48 deletions.
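
Note: every change in this commit swaps the bare float keyword for the repository's f32 alias. The alias already exists in cudasrc/matrix.cuh (it is visible as hunk context in that file's diff below), so the rename is purely cosmetic. A minimal sketch, for illustration only:

    typedef float f32;   /* alias declared in cudasrc/matrix.cuh */

    f32 value = 0.0f;    /* spelling after this commit; previously: float value = 0.0f; */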
22 changes: 11 additions & 11 deletions cudasrc/main.cu
@@ -31,7 +31,7 @@ char letters[52] = {'A', 'a', 'B', 'b', 'C', 'c', 'D', 'd', 'E', 'e', 'F', 'f',

 void process_weights_str(char* line, int layer) {
     char* token;
-    float value;
+    f32 value;
     const char* delimiter = ",";
 
     token = strtok(line, delimiter);
@@ -45,7 +45,7 @@ void process_weights_str(char* line, int layer) {

 void process_biases_str(char* line, int layer) {
     char* token;
-    float value;
+    f32 value;
     const char* delimiter = ",";
 
     token = strtok(line, delimiter);
@@ -80,7 +80,7 @@ void read_model(const char* file_name) {
     fclose(file);
 }
 
-void read_tensor(float* out, const char* fileName) {
+void read_tensor(f32* out, const char* fileName) {
     FILE* file = fopen(fileName, "r");
     char* line = NULL;
     size_t len = 0;
@@ -91,7 +91,7 @@ void read_tensor(float* out, const char* fileName) {
     }
 
     char* token;
-    float value;
+    f32 value;
     const char* delimiter = ",";
     token = strtok(line, delimiter);
 
@@ -123,17 +123,17 @@ __device__ void propagate_fwd(matrix* weights, f32* input_layer, f32* output_lay
     matrix_add(output_layer, biases->data, biases->rows);
 }
 
-__global__ void infer(float* d_inputs, int* d_results, matrix** d_weights, matrix** d_biases, int it_per_input,
+__global__ void infer(f32* d_inputs, int* d_results, matrix** d_weights, matrix** d_biases, int it_per_input,
                       int in_num) {
 
-    __shared__ float shared_input[TENSOR_LENGTH];
-    float out1[98];
-    float out2[65];
+    __shared__ f32 shared_input[TENSOR_LENGTH];
+    f32 out1[98];
+    f32 out2[65];
 
     int num_threads = blockDim.x * gridDim.x;
     int thread_idx = (blockIdx.x * blockDim.x + threadIdx.x);
 
-    float* input = (float*)&d_inputs[in_num * TENSOR_LENGTH];
+    f32* input = (f32*)&d_inputs[in_num * TENSOR_LENGTH];
 
     if (threadIdx.x < TENSOR_LENGTH) {
         shared_input[threadIdx.x] = input[threadIdx.x];
@@ -228,7 +228,7 @@ int main(int argc, char* argv[]) {
     results = (int*)malloc((input_count) * sizeof(int));
     inputs = (f32*)malloc((input_count) * sizeof(f32) * TENSOR_LENGTH);
     cudaMalloc(&d_results, (input_count) * sizeof(int));
-    cudaMalloc(&d_inputs, (input_count) * sizeof(float) * TENSOR_LENGTH);
+    cudaMalloc(&d_inputs, (input_count) * sizeof(f32) * TENSOR_LENGTH);
 
     // Read and process inputs
     char* file_name = (char*)malloc((100) * sizeof(char));
@@ -253,7 +253,7 @@ int main(int argc, char* argv[]) {
     closedir(dir);
 
     // Move input array to GPU memory
-    cudaMemcpy(d_inputs, inputs, sizeof(float) * 225 * input_count, cudaMemcpyHostToDevice);
+    cudaMemcpy(d_inputs, inputs, sizeof(f32) * 225 * input_count, cudaMemcpyHostToDevice);
 
 # ifdef USE_MPI
     int it_per_gpu = num_its / num_proccesses + (process_id < (num_its % num_proccesses) ? 1 : 0);
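
Note: the USE_MPI line closing the hunk above splits num_its iterations across ranks, giving each of the first num_its % num_proccesses ranks one extra iteration (the identifier spelling is the repository's own). A worked example with hypothetical values, not taken from the commit:

    #include <stdio.h>

    /* Sketch of the USE_MPI work split above: 10 iterations over 4 ranks. */
    int main(void) {
        int num_its = 10, num_proccesses = 4;
        for (int process_id = 0; process_id < num_proccesses; process_id++) {
            int it_per_gpu = num_its / num_proccesses + (process_id < (num_its % num_proccesses) ? 1 : 0);
            printf("rank %d runs %d iterations\n", process_id, it_per_gpu); /* prints 3, 3, 2, 2 */
        }
        return 0;
    }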
36 changes: 18 additions & 18 deletions cudasrc/matrix.cu
@@ -9,11 +9,11 @@ __host__ __device__ matrix* new_matrix(int rows, int cols) {
     matrix* res = (matrix*)malloc(sizeof(matrix));
     res->rows = rows;
     res->cols = cols;
-    res->data = (float*)malloc((rows * cols) * sizeof(float));
+    res->data = (f32*)malloc((rows * cols) * sizeof(f32));
     return res;
 }
 
-__global__ void alloc(matrix* res, float* data, int rows, int cols) {
+__global__ void alloc(matrix* res, f32* data, int rows, int cols) {
     res->rows = rows;
     res->cols = cols;
     res->data = data;
@@ -22,18 +22,18 @@ __global__ void alloc(matrix* res, float* data, int rows, int cols) {
 matrix* new_matrix_d(int rows, int cols) {
     matrix* res;
     cudaMalloc(&res, sizeof(matrix));
-    float* data;
-    cudaMalloc(&data, rows * cols * sizeof(float));
+    f32* data;
+    cudaMalloc(&data, rows * cols * sizeof(f32));
     alloc<<<1, 1>>>(res, data, rows, cols);
     return res;
 }
 
 matrix* copy_to_device(matrix* h_mat) {
     matrix* res;
     cudaMalloc(&res, sizeof(matrix));
-    float* data;
-    cudaMalloc(&data, h_mat->rows * h_mat->cols * sizeof(float));
-    cudaMemcpy(data, h_mat->data, h_mat->rows * h_mat->cols * sizeof(float), cudaMemcpyHostToDevice);
+    f32* data;
+    cudaMalloc(&data, h_mat->rows * h_mat->cols * sizeof(f32));
+    cudaMemcpy(data, h_mat->data, h_mat->rows * h_mat->cols * sizeof(f32), cudaMemcpyHostToDevice);
     alloc<<<1, 1>>>(res, data, h_mat->rows, h_mat->cols);
     return res;
 }
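
Note between hunks: new_matrix_d and copy_to_device both end with a one-thread alloc<<<1, 1>>> launch. A plausible reading, not spelled out in the commit: the matrix struct itself lives in device memory, so the host cannot store into res->rows, res->cols, or res->data directly, and the tiny kernel performs those stores on the device. An alternative sketch that stages the struct on the host instead (illustrative, not what this code does):

    matrix tmp = { rows, cols, data };                              /* host-side staging */
    cudaMemcpy(res, &tmp, sizeof(matrix), cudaMemcpyHostToDevice);  /* one copy, no kernel */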
@@ -42,14 +42,14 @@ __device__ __host__ matrix* create_copy(matrix* mat) {
     matrix* res = (matrix*)malloc(sizeof(matrix));
     res->rows = mat->rows;
     res->cols = mat->cols;
-    res->data = (float*)malloc((res->rows * res->cols) * sizeof(float));
-    memcpy(res->data, mat->data, res->rows * res->cols * sizeof(float));
+    res->data = (f32*)malloc((res->rows * res->cols) * sizeof(f32));
+    memcpy(res->data, mat->data, res->rows * res->cols * sizeof(f32));
     return res;
 }
 
-__device__ void matrix_mul(float* weight, float* input, float* result, int w_rows, int w_cols) {
+__device__ void matrix_mul(f32* weight, f32* input, f32* result, int w_rows, int w_cols) {
     for (int i = 0; i < w_rows; i++) {
-        float sum = 0;
+        f32 sum = 0;
         int j = 0;
 
         for (; j <= w_cols - 4; j += 4) {
@@ -65,31 +65,31 @@ __device__ void matrix_mul(float* weight, float* input, float* result, int w_row
     }
 }
 
-__device__ void matrix_add(float* a, float* b, int rows) {
+__device__ void matrix_add(f32* a, f32* b, int rows) {
     for (int i = 0; i < rows; i++) {
         a[i] += b[i];
     }
 }
 
-__device__ void relu(float* a, int rows) {
+__device__ void relu(f32* a, int rows) {
     for (int i = 0; i < rows; i++) {
         a[i] = (a[i] > 0) ? a[i] : 0;
     }
 }
 
-__device__ void softmax(float* a, int rows) {
-    float sum = 0.0;
+__device__ void softmax(f32* a, int rows) {
+    f32 sum = 0.0;
     for (size_t i = 0; i < rows; i++) {
         sum += __expf(a[i]);
     }
-    float t = __logf(sum);
+    f32 t = __logf(sum);
     for (size_t i = 0; i < rows; i++) {
         a[i] = __expf(a[i] - t);
     }
 }
 
-__device__ int argmax(float* a, int rows) {
-    float res = a[0];
+__device__ int argmax(f32* a, int rows) {
+    f32 res = a[0];
     int idx = 0;
     for (int i = 0; i < rows; i++) {
         if (res < a[i]) {
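
Note on softmax above (only its types changed in this commit): subtracting t = __logf(sum) inside __expf is the normalization itself, since

    exp(a_i - log S) = exp(a_i) / S,   where S = sum_j exp(a_j)

so the first loop accumulates S and the second rescales every entry in place.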
12 changes: 6 additions & 6 deletions cudasrc/matrix.cuh
@@ -5,7 +5,7 @@ typedef float f32;
 typedef struct {
     int rows;
     int cols;
-    float* data; // array
+    f32* data; // array
 } matrix;
 
 __host__ __device__ matrix* new_matrix(int rows, int cols);
@@ -14,14 +14,14 @@ matrix* copy_to_device(matrix* h_mat);

 matrix* new_matrix_d(int rows, int cols);
 
-__device__ void matrix_mul(float* a, float* b, float* c, int rows, int cols);
+__device__ void matrix_mul(f32* a, f32* b, f32* c, int rows, int cols);
 
-__device__ void matrix_add(float* a, float* b, int rows);
+__device__ void matrix_add(f32* a, f32* b, int rows);
 
-__device__ void relu(float* a, int rows);
+__device__ void relu(f32* a, int rows);
 
-__device__ void softmax(float* a, int rows);
+__device__ void softmax(f32* a, int rows);
 
-__device__ int argmax(float* a, int rows);
+__device__ int argmax(f32* a, int rows);
 
 __device__ __host__ matrix* create_copy(matrix* mat);
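
For orientation, a minimal host-side usage sketch of this header, assuming only the declarations above (the dimensions are hypothetical):

    matrix* h_w = new_matrix(98, 225);   /* host allocation; data is f32 */
    /* ... fill h_w->data, e.g. from the model file ... */
    matrix* d_w = copy_to_device(h_w);   /* device mirror: cudaMalloc + cudaMemcpy */
    matrix* d_s = new_matrix_d(98, 1);   /* scratch matrix allocated directly on the GPU */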
8 changes: 4 additions & 4 deletions src/file_io.c
@@ -20,7 +20,7 @@ int file_count(const char* dir_path) {

 void process_weights_str(matrix** weights, char* line, int layer) {
     char* token;
-    float value;
+    f32 value;
     const char* delimiter = ",";
 
     token = strtok(line, delimiter);
@@ -34,7 +34,7 @@ void process_weights_str(matrix** weights, char* line, int layer) {

 void process_biases_str(vector** biases, char* line, int layer) {
     char* token;
-    float value;
+    f32 value;
     const char* delimiter = ",";
 
     token = strtok(line, delimiter);
@@ -78,9 +78,9 @@ void read_tensor(f32* a, const char* file_name) {
         perror("Could not read tensor file. Exiting.");
         exit(EXIT_FAILURE);
     }
 
     char* token;
-    float value;
+    f32 value;
     const char* delimiter = ",";
     token = strtok(line, delimiter);
 
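
The hunks in this file show only the declarations around each change; the loop that consumes the tokens is collapsed out of the diff. A hypothetical sketch of its likely shape, for context only:

    token = strtok(line, delimiter);
    for (int i = 0; token != NULL; i++) {
        value = strtof(token, NULL);       /* parse one comma-separated field as f32 */
        a[i] = value;                      /* e.g. into read_tensor's output buffer */
        token = strtok(NULL, delimiter);
    }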
6 changes: 3 additions & 3 deletions src/main.c
@@ -25,7 +25,7 @@ char letters[52] = {'A', 'a', 'B', 'b', 'C', 'c', 'D', 'd', 'E', 'e', 'F', 'f',
                     'J', 'j', 'K', 'k', 'L', 'l', 'M', 'm', 'N', 'n', 'O', 'o', 'P', 'p', 'Q', 'q', 'R', 'r',
                     'S', 's', 'T', 't', 'U', 'u', 'V', 'v', 'W', 'w', 'X', 'x', 'Y', 'y', 'Z', 'z'};
 
-void propagate_fwd(const matrix* weights, const float* inputs, float* results, const vector* biases) {
+void propagate_fwd(const matrix* weights, const f32* inputs, f32* results, const vector* biases) {
     sgemv_t_tuned(weights->data, inputs, results, weights->cols, weights->rows);
     // Add biases onto results
     vector_add_inplace(biases->len, biases->data, results);
@@ -35,8 +35,8 @@ void propagate_fwd(const matrix* weights, const float* inputs, float* results, c
 // This code f***ing sucks but its fast so uhhhh
 u8 infer_reuse_layers_thread(vector* input, matrix** weights, vector** biases) {
     // Slightly larger than required for padding
-    float out0[104] __attribute__((aligned(SIMD_ALIGN))) = {0};
-    float out1[72] __attribute__((aligned(SIMD_ALIGN))) = {0};
+    f32 out0[104] __attribute__((aligned(SIMD_ALIGN))) = {0};
+    f32 out1[72] __attribute__((aligned(SIMD_ALIGN))) = {0};
 
     propagate_fwd(weights[0], input->data, out0, biases[0]);
     relu_inplace(out0, 98);
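
A reading of the "slightly larger than required" comment above, inferred from the code rather than stated in the commit: the layer widths used here are 98 and 65, and each buffer rounds up to the next multiple of 8, the width of one __m256 vector in src/matrix.c, so the SIMD kernel can safely load and store full vectors past the logical end:

    (98 + 7) / 8 * 8 = 104   /* out0 */
    (65 + 7) / 8 * 8 = 72    /* out1 */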
12 changes: 6 additions & 6 deletions src/matrix.c
@@ -39,7 +39,7 @@ vector* new_vec_aligned(int len) {
 }
 
 // ver. Artemis Rosman simd_intrin 2x8
-static void kernel(const float* in, const float* wg, float* rs, int start_row, int start_col, int w_width) {
+static void kernel(const f32* in, const f32* wg, f32* rs, int start_row, int start_col, int w_width) {
     // printf("Kernel at row %d col %d\n", start_row, start_col);
     __m256 res = _mm256_load_ps(&rs[start_col]);
 
@@ -54,7 +54,7 @@ static void kernel(const float* in, const float* wg, float* rs, int start_row, i
 // Ver. Artemis Rosman
 // W rows and W width is expected to be for the column major matrix, i.e. len of
 // in vec = w_rows, len of out vec = w_cols
-void sgemv_t_tuned(const float* weights, const float* inputs, float* __restrict__ results, int w_width, int w_rows) {
+void sgemv_t_tuned(const f32* weights, const f32* inputs, f32* __restrict__ results, int w_width, int w_rows) {
     // Perform mult using kernel
     for (int row = 0; row < w_rows; row += KERN_ROWS) {
         for (int col = 0; col < w_width; col += KERN_COLS) {
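
Between hunks: KERN_ROWS and KERN_COLS are defined outside this diff. The "simd_intrin 2x8" comment and the single __m256 accumulator in kernel suggest a 2-row by 8-column tile, which would also explain why callers pad output widths to multiples of 8, but that is an inference, not something this commit states.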
@@ -77,15 +77,15 @@ void relu_inplace(f32* dest, int len) {
 }
 
 // Hacky but fast and accurate for existing inputs
-static inline float fastexp(float x) {
+static inline f32 fastexp(f32 x) {
     int tmp = (int)(1512775 * x + 1072632447);
-    float result;
+    f32 result;
     memcpy(&result, &tmp, sizeof(result));
     return result;
 }
 
 void softmax_inplace(f32* dest, int len) {
-    float res = 0.0f;
+    f32 res = 0.0f;
     for (int i = 0; i < len; i++) {
         res += fastexp(dest[i]);
     }
@@ -97,7 +97,7 @@ void softmax_inplace(f32* dest, int len) {
 // Get result from output layer
 u8 argmax(f32* in, int len) {
     int idx = 0;
-    float res = in[0];
+    f32 res = in[0];
     for (int i = 0; i < len; i++) {
         if (res < in[i]) {
             res = in[i];
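
A note on fastexp above (only its types changed here): building a float's bit pattern from a linear function of x is the Schraudolph exponential trick. The constants in this file are not the textbook single-precision pair; the in-code comment suggests they were judged accurate enough for this network's inputs, and since softmax_inplace feeds argmax, any monotone increasing approximation preserves the predicted class. For comparison, a commonly cited single-precision variant, illustrative only and not what this repository uses:

    #include <string.h>

    /* Schraudolph-style approximate expf: 12102203 ~= 2^23 / ln 2, and
       1064866805 = 127 * 2^23 - 486411 (exponent bias minus an error-tuning shift). */
    static inline float schraudolph_expf(float x) {
        int tmp = (int)(12102203.0f * x + 1064866805.0f);
        float result;
        memcpy(&result, &tmp, sizeof(result));
        return result;
    }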
