From c1c1c4811ce9449eb9fd6110f0bc783caa9ff82e Mon Sep 17 00:00:00 2001 From: nhatdongdang <144138246+nhatdongdang@users.noreply.github.com> Date: Thu, 4 Jul 2024 05:01:43 +0000 Subject: [PATCH 1/8] Restructured --- cuda-keyring_1.1-1_all.deb | Bin 0 -> 4332 bytes include/matrix.h | 19 ----- src/main.cu | 166 +++++++++++++++++-------------------- src/matrix.cu | 99 +++++++++++----------- src/matrix.cuh | 21 +++++ src/util.cuh | 8 ++ 6 files changed, 153 insertions(+), 160 deletions(-) create mode 100644 cuda-keyring_1.1-1_all.deb delete mode 100644 include/matrix.h create mode 100644 src/matrix.cuh create mode 100644 src/util.cuh diff --git a/cuda-keyring_1.1-1_all.deb b/cuda-keyring_1.1-1_all.deb new file mode 100644 index 0000000000000000000000000000000000000000..d02294184b32b91eb1a7d348406ebaa0f3218bdd GIT binary patch literal 4332 zcmbtXcT`i^`b9y7h>8vexk+wFNC>1tG9nHrqc|2+EP#%v zSg>FPMG+jTf(ljy5gZ$cjvWLAewp%{HSevr*8Agq>#V!Zz31F>);ZtW-`?9R1klTr zp%6JJRRP*XQk53aTcA)V0*glwFmM7E35CLK@BjZqz~NXl8Va?wzd`~*2Eie+WUW!3 ztO+s#`k*xHfB&BV|8rF+R00{sS!2X@b`CG@p9^=Wt{K0Y>osP&-Jemr>wS5*@2LIa zIJ+4WFV9IIlUF^L8ESF+rFp@D6>f`quaMUU4VO*4D(HM*x(FxL`n$iI(j?p_Nlwar za#>0jHhmj%6O=*KmBhG1YuYZ&IW{e5G+w*wb<%liwu^7?gB>vYI|JUYuD^S=_EKoP zJ*GCwX>|!YP@EoGofc(1ktouOG3zJP+*(tRQsv||uz#Lf@sY3&zlCfcInF_UpYon@ zez5gr#Yi5m`1+2_4)0YHs*WeL*TgfHXO%qu&ETd*-h@cnhAUHtJ4kQg1)gzFnoF;K z?TW_|cL>%x>kmY&KUElaCbekkZ#N%$6HeT?1}=->HmNqwV*wd^mf<4-H3bT{FFdLKI>l2&o?rx)$WKZ--@FQN)XO!cRSOIw(Ff84H-4EiE9l5GqceK(=bV*c(sr)BeM4AI&5`HS=1&tdvNk2nI%Duelj`$P zCk|{U?Gz4fo#){UGU;O!Xo6H&yQ`Coo!-I=fw(YeQE??sX(VnViHns4O> zY0_d|K)e3~W!Lr>HYnvu1Rtih*MN3ox@`EevvZ!YE9i7>!q>6C!^|O8EC-Ch-{7?& z`@i8IK|t6l!|(C$_&q;)^}aEG130TT>{>Txbi$(l0l%WDJ3`O?+FSR!xovjDg3p$p z`>%W-I`_NzVEDPAll!3A9?Py=Sl=Xmv*Soo-<<9r*PYi4!w2D;$1kty@sxd;UQy!O zSl3V!IJlp2tNX)!Wtztql_W0vH2CpcS~@tFdG~#_``azW@wkP|{LPUOohS1$tWOZ= z=W$s-j69v(+W5WOq;nqZubwv+3PV4??XHnynJrymTi`j4`X(NA`gqnY`+T3;+pi?W zL}$K=d$^10bre`#c=6~92ai$~(?5LA6ZGDZyodFKUlw$oY1IiH-yD9CDxhY%L*457 z#F9A|9fd~870d1?{qtdYg@$;)v-zg6$1+jH@O;iCeb~Fvh2C~!u{SPU%g1EV#&*2O z861Ad_+C1r>{PHi8;-P#J;h=>Ka<*+D8&LW{v9M+2iF$mira>J9nLnxc0{75b+p)#f_t*N(BT6!gauX?YXA9rzHmAV|knRxxeJNtf0&{uJXyl{|# z^iqeFcFlCAH`8Wr)Q7cfo?!_3l)B_n-NKJ%r!}QlTTG732?f|qf*g{qXhkeI6F4om zV?6H|Yd7iq{FWhm$HG&)cs^p{+Mo78`vjZ5P&Gs3+=Fi^uLqh+)&}_0n=0l`_dRw> zsV751+@CGp>dMc5mw0KbtJiYLiWQYZ&w2O1b4D(IEIPQ(KOa9c;9E}MmXFmTL9wrP zm2C@3e$Dw{=d!l8!-}mr0={y-Wxi0>DTCbc(7}E!>z;9~?s?6n?vMIR&Pc=SocI?5 zpMTNfysrxHHjZf*InGYGQm`-9SJU2VSa9vfj6EbmjFTt#Q?V~klHHkmQnBKl?C|EH zmBn2-hu0hvza4t1+un3?K4JftBfyHJR-dq}qrTnn%baCf>gJ4G6>rI!M(+DL!d$SX zNlNH%qo#S?|2Cne#(lQ>>$chFGrbjGFYiAcPj3^1UU!_B6?nvWJiCb(*^hax0m8yqugm_Jyb`5lj(my4B7vY)ZE z%XOQhOGrKJ1-T{%e0pzG9)F>I@sjYMo@_@gJvBGyRPN)(68fU|8{_UmmHMgoiv06N z$Kv=;M+Wy-*&B0kUTM+OT(VxoZn(laZHIpOwb9a&H9BSLPp=`y_v@C)W=;FC1=Ht1 zf7;Ug-m$&DFO>T_Oj)w0`pvWas0V&`9&I$$?BhnL!-S!;i<_2F`)74DMjh~&B)C3u zW8+NfXTtm%VZHNy zc`O-TFIeh<9?hIJb?+PQtW~6-u7wSlN4`lPm5VAvPaW_)fQzq)V=Z6V57mBU1)rTd zA@;}yW_64xW0&BN%k~X!+g`UVT~hL=_PstZh#D*Eq}z>Y{hd?o0yCDZ`zL3|C!{1u zO)8BX)c?W6e{wbgfwPJ8|D@RwNG$G;i8jt&^m~w~GzJ^WrE#Hr28~UnKv`5ipGc$r zF$kdp24ygmPbXp!NC-p(>J6%7Z7>uK;j0p~pgtHXGuc!_aS707ftvDo~oB#1uUg35UYr!6-s73T^8nQIM!)jcxd^6F|aM8qg37g*c&P3IYO; z#32<503!!&HUx)3%5f+KN)Dhv3<$*$Fa#L^O90^nqyh@< zMd8ta+-6e##nwQa05lwq#)EP^+%^M+B48AFg%l+NaZ(Im+Xo8;0Xz;)z<~rg8f*Kg zK!9i|h{j;0C^-s+kP>hhg;ar)5um^Omw)v<5GN^s1wasx%K;=xDgzZLDPAVU+0H>g zqTmWp29?55fNfXMwmc4x02KrfVe?ZMBo4sf&PY4A_|02 ze`V_L+|aZpE)8n78L}WG66!}|3t^EettrhP0#Wf8G7^zWA}0`OBpy@EV`ynaVltB& z!G{ClSX=;&%fjfAc|-vSSb$IUj_@ zV)ap?WTc5LX0drBGb)o`;(rVbAf&~6B$g2D58KFpkd<0Ff2L1%tgndm{tfJ#$;+ZEP{rF5HZtK;c^2( zLAIDlG))S}f=AF35kioz7R$+KeJloJRe`o##)#4o@JLmR5rCs15ojF?BL-}>hop=Z c6GS{fkA*BDC(3ny;)Kek{9V-kffC670c3a*V*mgE literal 0 HcmV?d00001 diff --git a/include/matrix.h b/include/matrix.h deleted file mode 100644 index 7176922..0000000 --- a/include/matrix.h +++ /dev/null @@ -1,19 +0,0 @@ -#pragma once - -typedef struct { - int rows; - int cols; - float* data; // array -} matrix; -void initmalloc(matrix* d_mat, matrix* h_mat, int rows, int cols); -void dealloc(matrix* d_mat); - -matrix* new_matrix(int rows, int cols); - -__global__ void matrix_mul(matrix* a, matrix* b, matrix* result); - -__global__ void matrix_add(matrix* a, matrix* b); - -__global__ void relu(matrix* a); - -__global__ void softmax(matrix* a); diff --git a/src/main.cu b/src/main.cu index 5079299..d8dcb19 100644 --- a/src/main.cu +++ b/src/main.cu @@ -1,4 +1,4 @@ -#include "../include/matrix.h" +#include "matrix.cuh" #include #include #include @@ -21,9 +21,11 @@ matrix* weights[NUM_LAYERS]; matrix* biases[NUM_LAYERS]; // device weights and biases; -matrix* d_weights; -matrix* d_biases; -matrix* d_input; +matrix* d_weights[7]; +matrix* d_biases[7]; +matrix** d_inputs; + +int* results; char letters[52] = {'A', 'a', 'B', 'b', 'C', 'c', 'D', 'd', 'E', 'e', 'F', 'f', 'G', 'g', 'H', 'h', 'I', 'i', 'J', 'j', 'K', 'k', 'L', 'l', 'M', 'm', 'N', 'n', 'O', 'o', 'P', 'p', 'Q', 'q', 'R', 'r', @@ -101,91 +103,76 @@ void read_tensor(matrix* a, const char* fileName) { } void propagate_fwd(matrix* weights, matrix* input_layer, matrix* output_layer, matrix* biases) { - matrix_mul<<<1, 1>>>(weights, input_layer, output_layer); + matrix_mul<<<1, 1>>>(weights->data, input_layer->data, output_layer->data, weights->rows, weights->cols); cudaDeviceSynchronize(); - matrix_add<<<1, 1>>>(output_layer, biases); + matrix_add<<<1, 1>>>(output_layer->data, biases->data, biases->rows); cudaDeviceSynchronize(); } -__global__ void get_max(matrix* a, int* d_int) { - int idx = 0; - float res = a->data[0]; - for (int i = 0; i < a->rows; i++) { - if (res < a->data[i]) { - res = a->data[i]; - idx = i; - } +__global__ void pp(float* a, int rows) { + for (int i = 0; i < rows; i++) { + printf("%f\n", a[i]); } - *d_int = idx; } int infer(matrix* d_input) { - matrix* mdl_layers[NUM_LAYERS]; - matrix* d_mdl_layers; - - mdl_layers[0] = new_matrix(98, 1); - mdl_layers[1] = new_matrix(65, 1); - mdl_layers[2] = new_matrix(50, 1); - mdl_layers[3] = new_matrix(30, 1); - mdl_layers[4] = new_matrix(25, 1); - mdl_layers[5] = new_matrix(40, 1); - mdl_layers[6] = new_matrix(52, 1); - - CUDA_CHECK(cudaMalloc(&d_mdl_layers, NUM_LAYERS * sizeof(matrix))); - - initmalloc(&d_mdl_layers[0], mdl_layers[0], 98, 1); - initmalloc(&d_mdl_layers[1], mdl_layers[1], 65, 1); - initmalloc(&d_mdl_layers[2], mdl_layers[2], 50, 1); - initmalloc(&d_mdl_layers[3], mdl_layers[3], 30, 1); - initmalloc(&d_mdl_layers[4], mdl_layers[4], 25, 1); - initmalloc(&d_mdl_layers[5], mdl_layers[5], 40, 1); - initmalloc(&d_mdl_layers[6], mdl_layers[6], 52, 1); - - propagate_fwd(&d_weights[0], d_input, &d_mdl_layers[0], &d_biases[0]); - relu<<<1, 1>>>(&d_mdl_layers[0]); + matrix* outputs[2]; + outputs[0] = new_matrix_d(98, 1); + outputs[1] = new_matrix_d(65, 1); + + propagate_fwd(d_weights[0], d_input, outputs[0], d_biases[0]); + relu<<<1, 1>>>(outputs[0]->data, 98); cudaDeviceSynchronize(); - propagate_fwd(&d_weights[1], &d_mdl_layers[0], &d_mdl_layers[1], &d_biases[1]); - relu<<<1, 1>>>(&d_mdl_layers[1]); + propagate_fwd(d_weights[1], outputs[0], outputs[1], d_biases[1]); + cudaMemsetAsync(outputs[0], 0, 50 * sizeof(float)); + relu<<<1, 1>>>(outputs[1]->data, 65); cudaDeviceSynchronize(); - propagate_fwd(&d_weights[2], &d_mdl_layers[1], &d_mdl_layers[2], &d_biases[2]); - relu<<<1, 1>>>(&d_mdl_layers[2]); + propagate_fwd(d_weights[2], outputs[1], outputs[0], d_biases[2]); + cudaMemsetAsync(outputs[1], 0, 30 * sizeof(float)); + relu<<<1, 1>>>(outputs[0]->data, 50); cudaDeviceSynchronize(); - propagate_fwd(&d_weights[3], &d_mdl_layers[2], &d_mdl_layers[3], &d_biases[3]); - relu<<<1, 1>>>(&d_mdl_layers[3]); + propagate_fwd(d_weights[3], outputs[0], outputs[1], d_biases[3]); + cudaMemsetAsync(outputs[0], 0, 25 * sizeof(float)); + relu<<<1, 1>>>(outputs[1]->data, 30); cudaDeviceSynchronize(); - propagate_fwd(&d_weights[4], &d_mdl_layers[3], &d_mdl_layers[4], &d_biases[4]); - relu<<<1, 1>>>(&d_mdl_layers[4]); + propagate_fwd(d_weights[4], outputs[1], outputs[0], d_biases[4]); + cudaMemsetAsync(outputs[1], 0, 40 * sizeof(float)); + relu<<<1, 1>>>(outputs[0]->data, 25); cudaDeviceSynchronize(); - propagate_fwd(&d_weights[5], &d_mdl_layers[4], &d_mdl_layers[5], &d_biases[5]); - relu<<<1, 1>>>(&d_mdl_layers[5]); + propagate_fwd(d_weights[5], outputs[0], outputs[1], d_biases[5]); + cudaMemsetAsync(outputs[0], 0, 52 * sizeof(float)); + relu<<<1, 1>>>(outputs[1]->data, 40); cudaDeviceSynchronize(); - propagate_fwd(&d_weights[6], &d_mdl_layers[5], &d_mdl_layers[6], &d_biases[6]); - softmax<<<1, 1>>>(&d_mdl_layers[6]); + propagate_fwd(d_weights[6], outputs[1], outputs[0], d_biases[6]); + softmax<<<1, 1>>>(outputs[0]->data, 52); cudaDeviceSynchronize(); - int* d_int; - int h_int = 0; + int* res_d; + cudaMalloc(&res_d, sizeof(int)); - CUDA_CHECK(cudaMalloc((void**)&d_int, sizeof(int))); - get_max<<<1, 1>>>(&d_mdl_layers[6], d_int); + argmax<<<1, 1>>>(outputs[0]->data, 52, res_d); cudaDeviceSynchronize(); - CUDA_CHECK(cudaMemcpy(&h_int, d_int, sizeof(int), cudaMemcpyDeviceToHost)); - dealloc(&d_mdl_layers[0]); - dealloc(&d_mdl_layers[1]); - dealloc(&d_mdl_layers[2]); - dealloc(&d_mdl_layers[3]); - dealloc(&d_mdl_layers[4]); - dealloc(&d_mdl_layers[5]); - dealloc(&d_mdl_layers[6]); + cudaFree(outputs[0]->data); + free(outputs[0]); + cudaFree(outputs[1]->data); + free(outputs[1]); - return h_int; + int res_h; + cudaMemcpy(&res_h, res_d, sizeof(int), cudaMemcpyDeviceToHost); + return res_h; +} + +void process(int input_size) { + for (int i = 1; i <= input_size; i++) { + results[i] = infer(d_inputs[i]); + } } int main(int argc, char* argv[]) { @@ -218,26 +205,24 @@ int main(int argc, char* argv[]) { read_model(argv[1]); - // initialize d_weights struct matrix arr; - CUDA_CHECK(cudaMalloc(&d_weights, NUM_LAYERS * sizeof(matrix))); - CUDA_CHECK(cudaMalloc(&d_biases, NUM_LAYERS * sizeof(matrix))); - - initmalloc(&d_weights[0], weights[0], 98, 225); - initmalloc(&d_weights[1], weights[1], 65, 98); - initmalloc(&d_weights[2], weights[2], 50, 65); - initmalloc(&d_weights[3], weights[3], 30, 50); - initmalloc(&d_weights[4], weights[4], 25, 30); - initmalloc(&d_weights[5], weights[5], 40, 25); - initmalloc(&d_weights[6], weights[6], 52, 40); - initmalloc(&d_biases[0], biases[0], 98, 1); - initmalloc(&d_biases[1], biases[1], 65, 1); - initmalloc(&d_biases[2], biases[2], 50, 1); - initmalloc(&d_biases[3], biases[3], 30, 1); - initmalloc(&d_biases[4], biases[4], 25, 1); - initmalloc(&d_biases[5], biases[5], 40, 1); - initmalloc(&d_biases[6], biases[6], 52, 1); - - // Run program + d_weights[0] = get_copy(weights[0]); + d_weights[1] = get_copy(weights[1]); + d_weights[2] = get_copy(weights[2]); + d_weights[3] = get_copy(weights[3]); + d_weights[4] = get_copy(weights[4]); + d_weights[5] = get_copy(weights[5]); + d_weights[6] = get_copy(weights[6]); + + d_biases[0] = get_copy(biases[0]); + d_biases[1] = get_copy(biases[1]); + d_biases[2] = get_copy(biases[2]); + d_biases[3] = get_copy(biases[3]); + d_biases[4] = get_copy(biases[4]); + d_biases[5] = get_copy(biases[5]); + d_biases[6] = get_copy(biases[6]); + + // ------------------------------------------------------------ + const char* directory_path = argv[2]; struct dirent* entry; DIR* dir = opendir(directory_path); @@ -253,7 +238,11 @@ int main(int argc, char* argv[]) { size++; } } - int* results = (int*)malloc((size + 1) * sizeof(int)); + + results = (int*)malloc((size + 1) * sizeof(int)); + memset(results, 0, (size + 1) * sizeof(int)); + d_inputs = (matrix**)malloc((size + 1) * sizeof(matrix*)); + dir = opendir(directory_path); matrix* d_input; @@ -267,11 +256,7 @@ int main(int argc, char* argv[]) { strcat(file_name, "/"); strcat(file_name, entry->d_name); read_tensor(input, file_name); - CUDA_CHECK(cudaMalloc(&d_input, 255 * sizeof(matrix))); - initmalloc(d_input, input, 1, 225); - results[file_num] = infer(d_input); - dealloc(d_input); - + d_inputs[file_num] = get_copy(input); free(input); } } @@ -280,6 +265,9 @@ int main(int argc, char* argv[]) { free(file_num_str); closedir(dir); + // Process + process(size); + // Write to csv file FILE* csv_file = fopen("results.csv", "w+"); fprintf(csv_file, "image_number, guess\n"); diff --git a/src/matrix.cu b/src/matrix.cu index 3069f38..7185242 100644 --- a/src/matrix.cu +++ b/src/matrix.cu @@ -1,4 +1,5 @@ -#include "../include/matrix.h" +#include "matrix.cuh" +#include "util.cuh" #include #include #include @@ -13,6 +14,21 @@ matrix* new_matrix(int rows, int cols) { return res; } +matrix* new_matrix_d(int rows, int cols) { + matrix* res = (matrix*)malloc(sizeof(matrix)); + res->rows = rows; + res->cols = cols; + res->cols = cols; + cudaMalloc((void**)&(res->data), rows * cols * sizeof(float)); + return res; +} + +matrix* get_copy(matrix* h_mat) { + matrix* res = new_matrix_d(h_mat->rows, h_mat->cols); + CUDA_CHECK(cudaMemcpy(res->data, h_mat->data, h_mat->rows * h_mat->cols * sizeof(float), cudaMemcpyHostToDevice)); + return res; +} + __global__ void ptref(matrix* d_mat, float* d_res, int* d_cols, int* d_rows) { d_mat->data = d_res; d_mat->cols = *d_cols; @@ -46,68 +62,47 @@ void dealloc(matrix* d_mat) { cudaFree(d_mat); } -// Loop unrolling optimisation with a factor of 8 which should be enough to saturate a Zen3 core -__global__ void matrix_mul(matrix* weights, matrix* inputs, matrix* __restrict__ result) { - - int res_rows = result->rows; - int w_width = weights->cols; - float* w_data = weights->data; - float* i_data = inputs->data; - - int u_limit = w_width - (UNROLL_FACTOR - 1); - - for (int cur_row = 0; cur_row < res_rows; cur_row++) { - float sum0 = 0; - float sum1 = 0; - float sum2 = 0; - float sum3 = 0; - float sum4 = 0; - float sum5 = 0; - float sum6 = 0; - float sum7 = 0; - - int row_offs = cur_row * w_width; - - int k = 0; - for (; k < u_limit; k += UNROLL_FACTOR) { - sum0 += w_data[row_offs + k] * i_data[k]; - sum1 += w_data[row_offs + k + 1] * i_data[k + 1]; - sum2 += w_data[row_offs + k + 2] * i_data[k + 2]; - sum3 += w_data[row_offs + k + 3] * i_data[k + 3]; - sum4 += w_data[row_offs + k + 4] * i_data[k + 4]; - sum5 += w_data[row_offs + k + 5] * i_data[k + 5]; - sum6 += w_data[row_offs + k + 6] * i_data[k + 6]; - sum7 += w_data[row_offs + k + 7] * i_data[k + 7]; - } - - for (; k < w_width; k++) { - sum0 += w_data[row_offs + k] * i_data[k]; +__global__ void matrix_mul(float* weight, float* input, float* result, int w_rows, int w_cols) { + for (int i = 0; i < w_rows; i++) { + float sum = 0; + for (int j = 0; j < w_cols; j++) { + sum += weight[i * w_cols + j] * input[j]; } - - (result->data)[cur_row] = sum0 + sum1 + sum2 + sum3 + sum4 + sum5 + sum6 + sum7; // + sum8 + sum9; + result[i] = sum; } } -__global__ void matrix_add(matrix* a, matrix* b) { - - for (int i = 0; i < a->rows; i++) { - (a->data)[i] += (b->data)[i]; +__global__ void matrix_add(float* a, float* b, int rows) { + for (int i = 0; i < rows; i++) { + a[i] += b[i]; } } -__global__ void relu(matrix* a) { - for (int i = 0; i < a->rows; i++) { - if ((a->data)[i] < (float)0) - (a->data)[i] = (float)0; +__global__ void relu(float* a, int rows) { + for (int i = 0; i < rows; i++) { + if ((a)[i] < (float)0) + (a)[i] = (float)0; } } -__global__ void softmax(matrix* a) { +__global__ void softmax(float* a, int rows) { float res = (float)0; - for (int i = 0; i < a->rows; i++) { - res += exp((a->data)[i]); + for (int i = 0; i < rows; i++) { + res += exp(a[i]); } - for (int i = 0; i < a->rows; i++) { - (a->data)[i] /= res; + for (int i = 0; i < rows; i++) { + a[i] /= res; } } + +__global__ void argmax(float* a, int rows, int* des) { + int res = a[0]; + int idx = 0; + for (int i = 0; i < rows; i++) { + if (res < a[i]) { + res = a[i]; + idx = i; + } + } + *des = idx; +} \ No newline at end of file diff --git a/src/matrix.cuh b/src/matrix.cuh new file mode 100644 index 0000000..81ced97 --- /dev/null +++ b/src/matrix.cuh @@ -0,0 +1,21 @@ +#pragma once + +typedef struct { + int rows; + int cols; + float* data; // array +} matrix; + +matrix* new_matrix(int rows, int cols); +matrix* get_copy(matrix* h_mat); +matrix* new_matrix_d(int rows, int cols); + +__global__ void matrix_mul(float* a, float* b, float* c, int rows, int cols); + +__global__ void matrix_add(float* a, float* b, int rows); + +__global__ void relu(float* a, int rows); + +__global__ void softmax(float* a, int rows); + +__global__ void argmax(float* a, int rows, int* res); \ No newline at end of file diff --git a/src/util.cuh b/src/util.cuh new file mode 100644 index 0000000..a6f988a --- /dev/null +++ b/src/util.cuh @@ -0,0 +1,8 @@ +#pragma once + +#define CUDA_CHECK(call) \ + cudaError_t err = call; \ + if (err != cudaSuccess) { \ + fprintf(stderr, "CUDA error in %s (%s:%d): %s\n", __func__, __FILE__, __LINE__, cudaGetErrorString(err)); \ + exit(EXIT_FAILURE); \ + } \ No newline at end of file From 1c8530bcb5fc385d77d89686ad35bda4e4cf0e84 Mon Sep 17 00:00:00 2001 From: nhatdongdang <144138246+nhatdongdang@users.noreply.github.com> Date: Thu, 4 Jul 2024 05:01:57 +0000 Subject: [PATCH 2/8] Fix main --- src/main.cu | 1 - 1 file changed, 1 deletion(-) diff --git a/src/main.cu b/src/main.cu index d8dcb19..9245894 100644 --- a/src/main.cu +++ b/src/main.cu @@ -202,7 +202,6 @@ int main(int argc, char* argv[]) { biases[4] = new_matrix(25, 1); biases[5] = new_matrix(40, 1); biases[6] = new_matrix(52, 1); - read_model(argv[1]); d_weights[0] = get_copy(weights[0]); From 89d09f53b281a25c743d7fba06e704d7331399bd Mon Sep 17 00:00:00 2001 From: nhatdongdang <144138246+nhatdongdang@users.noreply.github.com> Date: Thu, 4 Jul 2024 05:05:50 +0000 Subject: [PATCH 3/8] Fix styling --- cuda-keyring_1.1-1_all.deb | Bin 4332 -> 0 bytes src/main.cu | 6 ------ src/matrix.cuh | 2 ++ 3 files changed, 2 insertions(+), 6 deletions(-) delete mode 100644 cuda-keyring_1.1-1_all.deb diff --git a/cuda-keyring_1.1-1_all.deb b/cuda-keyring_1.1-1_all.deb deleted file mode 100644 index d02294184b32b91eb1a7d348406ebaa0f3218bdd..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 4332 zcmbtXcT`i^`b9y7h>8vexk+wFNC>1tG9nHrqc|2+EP#%v zSg>FPMG+jTf(ljy5gZ$cjvWLAewp%{HSevr*8Agq>#V!Zz31F>);ZtW-`?9R1klTr zp%6JJRRP*XQk53aTcA)V0*glwFmM7E35CLK@BjZqz~NXl8Va?wzd`~*2Eie+WUW!3 ztO+s#`k*xHfB&BV|8rF+R00{sS!2X@b`CG@p9^=Wt{K0Y>osP&-Jemr>wS5*@2LIa zIJ+4WFV9IIlUF^L8ESF+rFp@D6>f`quaMUU4VO*4D(HM*x(FxL`n$iI(j?p_Nlwar za#>0jHhmj%6O=*KmBhG1YuYZ&IW{e5G+w*wb<%liwu^7?gB>vYI|JUYuD^S=_EKoP zJ*GCwX>|!YP@EoGofc(1ktouOG3zJP+*(tRQsv||uz#Lf@sY3&zlCfcInF_UpYon@ zez5gr#Yi5m`1+2_4)0YHs*WeL*TgfHXO%qu&ETd*-h@cnhAUHtJ4kQg1)gzFnoF;K z?TW_|cL>%x>kmY&KUElaCbekkZ#N%$6HeT?1}=->HmNqwV*wd^mf<4-H3bT{FFdLKI>l2&o?rx)$WKZ--@FQN)XO!cRSOIw(Ff84H-4EiE9l5GqceK(=bV*c(sr)BeM4AI&5`HS=1&tdvNk2nI%Duelj`$P zCk|{U?Gz4fo#){UGU;O!Xo6H&yQ`Coo!-I=fw(YeQE??sX(VnViHns4O> zY0_d|K)e3~W!Lr>HYnvu1Rtih*MN3ox@`EevvZ!YE9i7>!q>6C!^|O8EC-Ch-{7?& z`@i8IK|t6l!|(C$_&q;)^}aEG130TT>{>Txbi$(l0l%WDJ3`O?+FSR!xovjDg3p$p z`>%W-I`_NzVEDPAll!3A9?Py=Sl=Xmv*Soo-<<9r*PYi4!w2D;$1kty@sxd;UQy!O zSl3V!IJlp2tNX)!Wtztql_W0vH2CpcS~@tFdG~#_``azW@wkP|{LPUOohS1$tWOZ= z=W$s-j69v(+W5WOq;nqZubwv+3PV4??XHnynJrymTi`j4`X(NA`gqnY`+T3;+pi?W zL}$K=d$^10bre`#c=6~92ai$~(?5LA6ZGDZyodFKUlw$oY1IiH-yD9CDxhY%L*457 z#F9A|9fd~870d1?{qtdYg@$;)v-zg6$1+jH@O;iCeb~Fvh2C~!u{SPU%g1EV#&*2O z861Ad_+C1r>{PHi8;-P#J;h=>Ka<*+D8&LW{v9M+2iF$mira>J9nLnxc0{75b+p)#f_t*N(BT6!gauX?YXA9rzHmAV|knRxxeJNtf0&{uJXyl{|# z^iqeFcFlCAH`8Wr)Q7cfo?!_3l)B_n-NKJ%r!}QlTTG732?f|qf*g{qXhkeI6F4om zV?6H|Yd7iq{FWhm$HG&)cs^p{+Mo78`vjZ5P&Gs3+=Fi^uLqh+)&}_0n=0l`_dRw> zsV751+@CGp>dMc5mw0KbtJiYLiWQYZ&w2O1b4D(IEIPQ(KOa9c;9E}MmXFmTL9wrP zm2C@3e$Dw{=d!l8!-}mr0={y-Wxi0>DTCbc(7}E!>z;9~?s?6n?vMIR&Pc=SocI?5 zpMTNfysrxHHjZf*InGYGQm`-9SJU2VSa9vfj6EbmjFTt#Q?V~klHHkmQnBKl?C|EH zmBn2-hu0hvza4t1+un3?K4JftBfyHJR-dq}qrTnn%baCf>gJ4G6>rI!M(+DL!d$SX zNlNH%qo#S?|2Cne#(lQ>>$chFGrbjGFYiAcPj3^1UU!_B6?nvWJiCb(*^hax0m8yqugm_Jyb`5lj(my4B7vY)ZE z%XOQhOGrKJ1-T{%e0pzG9)F>I@sjYMo@_@gJvBGyRPN)(68fU|8{_UmmHMgoiv06N z$Kv=;M+Wy-*&B0kUTM+OT(VxoZn(laZHIpOwb9a&H9BSLPp=`y_v@C)W=;FC1=Ht1 zf7;Ug-m$&DFO>T_Oj)w0`pvWas0V&`9&I$$?BhnL!-S!;i<_2F`)74DMjh~&B)C3u zW8+NfXTtm%VZHNy zc`O-TFIeh<9?hIJb?+PQtW~6-u7wSlN4`lPm5VAvPaW_)fQzq)V=Z6V57mBU1)rTd zA@;}yW_64xW0&BN%k~X!+g`UVT~hL=_PstZh#D*Eq}z>Y{hd?o0yCDZ`zL3|C!{1u zO)8BX)c?W6e{wbgfwPJ8|D@RwNG$G;i8jt&^m~w~GzJ^WrE#Hr28~UnKv`5ipGc$r zF$kdp24ygmPbXp!NC-p(>J6%7Z7>uK;j0p~pgtHXGuc!_aS707ftvDo~oB#1uUg35UYr!6-s73T^8nQIM!)jcxd^6F|aM8qg37g*c&P3IYO; z#32<503!!&HUx)3%5f+KN)Dhv3<$*$Fa#L^O90^nqyh@< zMd8ta+-6e##nwQa05lwq#)EP^+%^M+B48AFg%l+NaZ(Im+Xo8;0Xz;)z<~rg8f*Kg zK!9i|h{j;0C^-s+kP>hhg;ar)5um^Omw)v<5GN^s1wasx%K;=xDgzZLDPAVU+0H>g zqTmWp29?55fNfXMwmc4x02KrfVe?ZMBo4sf&PY4A_|02 ze`V_L+|aZpE)8n78L}WG66!}|3t^EettrhP0#Wf8G7^zWA}0`OBpy@EV`ynaVltB& z!G{ClSX=;&%fjfAc|-vSSb$IUj_@ zV)ap?WTc5LX0drBGb)o`;(rVbAf&~6B$g2D58KFpkd<0Ff2L1%tgndm{tfJ#$;+ZEP{rF5HZtK;c^2( zLAIDlG))S}f=AF35kioz7R$+KeJloJRe`o##)#4o@JLmR5rCs15ojF?BL-}>hop=Z c6GS{fkA*BDC(3ny;)Kek{9V-kffC670c3a*V*mgE diff --git a/src/main.cu b/src/main.cu index 9245894..a5cfe20 100644 --- a/src/main.cu +++ b/src/main.cu @@ -109,12 +109,6 @@ void propagate_fwd(matrix* weights, matrix* input_layer, matrix* output_layer, m cudaDeviceSynchronize(); } -__global__ void pp(float* a, int rows) { - for (int i = 0; i < rows; i++) { - printf("%f\n", a[i]); - } -} - int infer(matrix* d_input) { matrix* outputs[2]; outputs[0] = new_matrix_d(98, 1); diff --git a/src/matrix.cuh b/src/matrix.cuh index 81ced97..6b47911 100644 --- a/src/matrix.cuh +++ b/src/matrix.cuh @@ -7,7 +7,9 @@ typedef struct { } matrix; matrix* new_matrix(int rows, int cols); + matrix* get_copy(matrix* h_mat); + matrix* new_matrix_d(int rows, int cols); __global__ void matrix_mul(float* a, float* b, float* c, int rows, int cols); From 080492bb2f8361b20e3b524380f961516fd3d355 Mon Sep 17 00:00:00 2001 From: nhatdongdang <144138246+nhatdongdang@users.noreply.github.com> Date: Thu, 4 Jul 2024 05:08:20 +0000 Subject: [PATCH 4/8] Remove unused functions --- src/matrix.cu | 33 --------------------------------- 1 file changed, 33 deletions(-) diff --git a/src/matrix.cu b/src/matrix.cu index 7185242..fc5bf24 100644 --- a/src/matrix.cu +++ b/src/matrix.cu @@ -29,39 +29,6 @@ matrix* get_copy(matrix* h_mat) { return res; } -__global__ void ptref(matrix* d_mat, float* d_res, int* d_cols, int* d_rows) { - d_mat->data = d_res; - d_mat->cols = *d_cols; - d_mat->rows = *d_rows; -} - -// Allocate device memory for matrix dimensions and data -void initmalloc(matrix* d_mat, matrix* h_mat, int rows, int cols) { - int* d_cols; - int* d_rows; - float* d_res; - cudaMalloc(&d_cols, sizeof(int)); - cudaMalloc(&d_rows, sizeof(int)); - cudaMalloc(&d_res, rows * cols * sizeof(float)); - - cudaMemcpy(d_rows, &rows, sizeof(int), cudaMemcpyHostToDevice); - cudaMemcpy(d_cols, &cols, sizeof(int), cudaMemcpyHostToDevice); - - cudaMemcpy(d_res, h_mat->data, (rows * cols * sizeof(float)), cudaMemcpyHostToDevice); - - // Call kernel to initialize the matrix structure on the device - ptref<<<1, 1>>>(d_mat, d_res, d_cols, d_rows); - cudaDeviceSynchronize(); -} - -void dealloc(matrix* d_mat) { - cudaFree(&d_mat->data); - cudaFree(&d_mat->cols); - cudaFree(&d_mat->rows); - - cudaFree(d_mat); -} - __global__ void matrix_mul(float* weight, float* input, float* result, int w_rows, int w_cols) { for (int i = 0; i < w_rows; i++) { float sum = 0; From d12d0ed06bd4113bbd40135b964abee8aa6e6ccc Mon Sep 17 00:00:00 2001 From: nhatdongdang <144138246+nhatdongdang@users.noreply.github.com> Date: Thu, 4 Jul 2024 05:09:36 +0000 Subject: [PATCH 5/8] Optimize a little bit --- src/main.cu | 1 - 1 file changed, 1 deletion(-) diff --git a/src/main.cu b/src/main.cu index a5cfe20..d1146e3 100644 --- a/src/main.cu +++ b/src/main.cu @@ -233,7 +233,6 @@ int main(int argc, char* argv[]) { } results = (int*)malloc((size + 1) * sizeof(int)); - memset(results, 0, (size + 1) * sizeof(int)); d_inputs = (matrix**)malloc((size + 1) * sizeof(matrix*)); dir = opendir(directory_path); From f0859f730c0d222baa8d70917f90c2337e2080e9 Mon Sep 17 00:00:00 2001 From: nhatdongdang <144138246+nhatdongdang@users.noreply.github.com> Date: Thu, 4 Jul 2024 05:13:52 +0000 Subject: [PATCH 6/8] Update ci for cuda branch --- .github/workflows/ci.yml | 4 ++-- .github/workflows/cpp-linter.yml | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 2f8120a..b27b858 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -2,10 +2,10 @@ name: CI on: push: - branches: main + branches: cuda-staging paths: ['**.cu','**.c','**.cpp', '**.h', '**CMakeLists.txt'] pull_request: - branches: main + branches: cuda-staging paths: ['**.cu','**.c','**.cpp', '**.h', '**CMakeLists.txt'] jobs: diff --git a/.github/workflows/cpp-linter.yml b/.github/workflows/cpp-linter.yml index 3ec0fe6..8aec3f9 100644 --- a/.github/workflows/cpp-linter.yml +++ b/.github/workflows/cpp-linter.yml @@ -1,10 +1,10 @@ name: cpp-linter on: pull_request: - branches: main + branches: cuda-staging paths: ['**.cu','**.cpp','**.c', '**.h', '**CMakeLists.txt'] push: - branches: main + branches: cuda-staging paths: ['**.cu','**.cpp','**.c', '**.h', '**CMakeLists.txt'] permissions: From 44c9b5869c0f8225b214b73eae2875a78c12712c Mon Sep 17 00:00:00 2001 From: nhatdongdang <144138246+nhatdongdang@users.noreply.github.com> Date: Thu, 4 Jul 2024 05:25:11 +0000 Subject: [PATCH 7/8] Remove CI since github action don't have a gpu --- .github/workflows/ci.yml | 34 ---------------------------------- 1 file changed, 34 deletions(-) delete mode 100644 .github/workflows/ci.yml diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml deleted file mode 100644 index b27b858..0000000 --- a/.github/workflows/ci.yml +++ /dev/null @@ -1,34 +0,0 @@ -name: CI - -on: - push: - branches: cuda-staging - paths: ['**.cu','**.c','**.cpp', '**.h', '**CMakeLists.txt'] - pull_request: - branches: cuda-staging - paths: ['**.cu','**.c','**.cpp', '**.h', '**CMakeLists.txt'] - -jobs: - build-and-test: - runs-on: ubuntu-latest - - steps: - - name: Checkout code - uses: actions/checkout@v4 - - - name: Setup python - uses: actions/setup-python@v5 - with: - python-version: '3.10' - - - name: Install dependencies - run: | - pip install pandas - - - name: Build project - run: | - make build - - - name: Run test suite - run: | - make test \ No newline at end of file From 83d9ff6c3c8746d18478c9bbb7e2ff3eea2cd7ab Mon Sep 17 00:00:00 2001 From: nhatdongdang <144138246+nhatdongdang@users.noreply.github.com> Date: Thu, 4 Jul 2024 05:42:08 +0000 Subject: [PATCH 8/8] Fix styling --- src/main.cu | 47 ++++++++++++++++++++++------------------------- src/matrix.cu | 2 +- src/matrix.cuh | 2 +- 3 files changed, 24 insertions(+), 27 deletions(-) diff --git a/src/main.cu b/src/main.cu index d1146e3..1a12d40 100644 --- a/src/main.cu +++ b/src/main.cu @@ -147,10 +147,10 @@ int infer(matrix* d_input) { softmax<<<1, 1>>>(outputs[0]->data, 52); cudaDeviceSynchronize(); - int* res_d; - cudaMalloc(&res_d, sizeof(int)); + int* d_res; + cudaMalloc(&d_res, sizeof(int)); - argmax<<<1, 1>>>(outputs[0]->data, 52, res_d); + argmax<<<1, 1>>>(outputs[0]->data, 52, d_res); cudaDeviceSynchronize(); cudaFree(outputs[0]->data); @@ -158,9 +158,9 @@ int infer(matrix* d_input) { cudaFree(outputs[1]->data); free(outputs[1]); - int res_h; - cudaMemcpy(&res_h, res_d, sizeof(int), cudaMemcpyDeviceToHost); - return res_h; + int h_res; + cudaMemcpy(&h_res, d_res, sizeof(int), cudaMemcpyDeviceToHost); + return h_res; } void process(int input_size) { @@ -198,23 +198,21 @@ int main(int argc, char* argv[]) { biases[6] = new_matrix(52, 1); read_model(argv[1]); - d_weights[0] = get_copy(weights[0]); - d_weights[1] = get_copy(weights[1]); - d_weights[2] = get_copy(weights[2]); - d_weights[3] = get_copy(weights[3]); - d_weights[4] = get_copy(weights[4]); - d_weights[5] = get_copy(weights[5]); - d_weights[6] = get_copy(weights[6]); - - d_biases[0] = get_copy(biases[0]); - d_biases[1] = get_copy(biases[1]); - d_biases[2] = get_copy(biases[2]); - d_biases[3] = get_copy(biases[3]); - d_biases[4] = get_copy(biases[4]); - d_biases[5] = get_copy(biases[5]); - d_biases[6] = get_copy(biases[6]); - - // ------------------------------------------------------------ + d_weights[0] = copy_to_device(weights[0]); + d_weights[1] = copy_to_device(weights[1]); + d_weights[2] = copy_to_device(weights[2]); + d_weights[3] = copy_to_device(weights[3]); + d_weights[4] = copy_to_device(weights[4]); + d_weights[5] = copy_to_device(weights[5]); + d_weights[6] = copy_to_device(weights[6]); + + d_biases[0] = copy_to_device(biases[0]); + d_biases[1] = copy_to_device(biases[1]); + d_biases[2] = copy_to_device(biases[2]); + d_biases[3] = copy_to_device(biases[3]); + d_biases[4] = copy_to_device(biases[4]); + d_biases[5] = copy_to_device(biases[5]); + d_biases[6] = copy_to_device(biases[6]); const char* directory_path = argv[2]; struct dirent* entry; @@ -236,7 +234,6 @@ int main(int argc, char* argv[]) { d_inputs = (matrix**)malloc((size + 1) * sizeof(matrix*)); dir = opendir(directory_path); - matrix* d_input; while ((entry = readdir(dir)) != NULL) { if (entry->d_type == DT_REG) { @@ -248,7 +245,7 @@ int main(int argc, char* argv[]) { strcat(file_name, "/"); strcat(file_name, entry->d_name); read_tensor(input, file_name); - d_inputs[file_num] = get_copy(input); + d_inputs[file_num] = copy_to_device(input); free(input); } } diff --git a/src/matrix.cu b/src/matrix.cu index fc5bf24..e8ed37d 100644 --- a/src/matrix.cu +++ b/src/matrix.cu @@ -23,7 +23,7 @@ matrix* new_matrix_d(int rows, int cols) { return res; } -matrix* get_copy(matrix* h_mat) { +matrix* copy_to_device(matrix* h_mat) { matrix* res = new_matrix_d(h_mat->rows, h_mat->cols); CUDA_CHECK(cudaMemcpy(res->data, h_mat->data, h_mat->rows * h_mat->cols * sizeof(float), cudaMemcpyHostToDevice)); return res; diff --git a/src/matrix.cuh b/src/matrix.cuh index 6b47911..83005cf 100644 --- a/src/matrix.cuh +++ b/src/matrix.cuh @@ -8,7 +8,7 @@ typedef struct { matrix* new_matrix(int rows, int cols); -matrix* get_copy(matrix* h_mat); +matrix* copy_to_device(matrix* h_mat); matrix* new_matrix_d(int rows, int cols);