diff --git a/.gitignore b/.gitignore index ff55af43..d5134332 100644 --- a/.gitignore +++ b/.gitignore @@ -6,3 +6,4 @@ riscv_elf_build/ x86_build/ install_nn2/ tools/ +.vscode/ diff --git a/source/c906_opt/convolution_3x3_fp16.c b/source/c906_opt/convolution_3x3_fp16.c index 4b640794..b81ed12f 100644 --- a/source/c906_opt/convolution_3x3_fp16.c +++ b/source/c906_opt/convolution_3x3_fp16.c @@ -675,22 +675,11 @@ int csi_c906_conv3x3s1_winograd64_pack8_fp16(struct csi_tensor *input, /*********************************** dot ***************************************/ // reorder input_tm1_buf - int size_input_tm2 = 0; - if (tiles >= 8) { - size_input_tm2 = 64 * (tiles / 8 + (tiles % 8) / 4 + (tiles % 4) / 2 + tiles % 2) * in_c * 8; - } else if (tiles >= 4) { - size_input_tm2 = 64 * (tiles / 4 + (tiles % 4) / 2 + tiles % 2) * in_c * 4; - } else if (tiles >= 2) { - size_input_tm2 = 64 * (tiles / 2 + tiles % 2) * in_c * 2; - } else { - size_input_tm2 = 64 * tiles * in_c; - } - __fp16 *input_tm2_buf = (__fp16 *)csi_mem_alloc(size_input_tm2 * sizeof(__fp16)); + __fp16 *input_tm2_buf = (__fp16 *)csi_mem_alloc(64 * tiles * in_c * sizeof(__fp16)); - #pragma omp parallel for num_threads(1) +#pragma omp parallel for num_threads(1) for (int r = 0; r < 64; r++) { - - __fp16 *img_tm2 = input_tm2_buf + r * size_input_tm2 / 64; // input_tm2 r channel data + __fp16 *img_tm2 = input_tm2_buf + r * tiles * in_c; // input_tm2 r channel data int t = 0; for (; t + 7 < tiles; t += 8) { @@ -762,7 +751,7 @@ int csi_c906_conv3x3s1_winograd64_pack8_fp16(struct csi_tensor *input, ); } for (; t + 3 < tiles; t += 4) { - __fp16 *tm2 = img_tm2 + (t / 8 + (t % 8) / 4) * in_c * 8; // img_tm2 row data + __fp16 *tm2 = img_tm2 + t * in_c; // img_tm2 row data __fp16 *tm1 = input_tm1_buf; tm1 += (r * tiles + t) * 8; @@ -817,7 +806,7 @@ int csi_c906_conv3x3s1_winograd64_pack8_fp16(struct csi_tensor *input, } for (; t + 1 < tiles; t += 2) { - __fp16 *tm2 = img_tm2 + (t / 8 + (t % 8) / 4 + (t % 4) / 2) * in_c * 8; // img_tm2 row data + __fp16 *tm2 = img_tm2 + t * in_c; // img_tm2 row data __fp16 *tm1 = input_tm1_buf; tm1 += (r * tiles + t) * 8; @@ -865,7 +854,7 @@ int csi_c906_conv3x3s1_winograd64_pack8_fp16(struct csi_tensor *input, } for (; t < tiles; t++) { - __fp16 *tm2 = img_tm2 + (t / 8 + (t % 8) / 4 + (t % 4) / 2 + t % 2) * in_c * 8; // img_tm2 row data + __fp16 *tm2 = img_tm2 + t * in_c; // img_tm2 row data __fp16 *tm1 = input_tm1_buf; tm1 += (r * tiles + t) * 8; @@ -923,12 +912,10 @@ int csi_c906_conv3x3s1_winograd64_pack8_fp16(struct csi_tensor *input, __fp16 *kernel0_tm = kernel_data + p * 64 * in_c * 8; for (int r = 0; r < 64; r++) { - - __fp16 *img_tm2 = input_tm2_buf + r * size_input_tm2 / 64; // img_tm2 第r个channel + __fp16 *img_tm2 = input_tm2_buf + r * tiles * in_c; // img_tm2 第r个channel int t = 0; for (; t + 7 < tiles; t += 8) { - __fp16 *r0 = img_tm2 + t * in_c; __fp16 *k0 = kernel0_tm + r * in_c * 8; @@ -1004,7 +991,7 @@ int csi_c906_conv3x3s1_winograd64_pack8_fp16(struct csi_tensor *input, ); } for (; t + 3 < tiles; t += 4) { - __fp16 *r0 = img_tm2 + (t / 8 + (t % 8) / 4) * in_c * 8; + __fp16 *r0 = img_tm2 + t * in_c; __fp16 *k0 = kernel0_tm + r * in_c * 8; asm volatile( @@ -1055,7 +1042,7 @@ int csi_c906_conv3x3s1_winograd64_pack8_fp16(struct csi_tensor *input, ); } for (; t + 1 < tiles; t += 2) { - __fp16 *r0 = img_tm2 + (t / 8 + (t % 8) / 4 + (t % 4) / 2) * in_c * 8; + __fp16 *r0 = img_tm2 + t * in_c; __fp16 *k0 = kernel0_tm + r * in_c * 8; asm volatile( @@ -1096,8 +1083,7 @@ int csi_c906_conv3x3s1_winograd64_pack8_fp16(struct csi_tensor *input, ); } for (; t < tiles; t++) { - - __fp16 *r0 = img_tm2 + (t / 8 + (t % 8) / 4 + (t % 4) / 2 + t % 2) * in_c * 8; + __fp16 *r0 = img_tm2 + t * in_c; __fp16 *k0 = kernel0_tm + r * in_c * 8; asm volatile( @@ -1789,26 +1775,15 @@ int csi_c906_conv3x3s1_winograd43_pack8_fp16(struct csi_tensor *input, /*********************************** dot ***************************************/ // reorder input_tm1_buf - int size_input_tm2 = 0; - if (tiles >= 8) { - size_input_tm2 = 36 * (tiles / 8 + (tiles % 8) / 4 + (tiles % 4) / 2 + tiles % 2) * in_c * 8; - } else if (tiles >= 4) { - size_input_tm2 = 36 * (tiles / 4 + (tiles % 4) / 2 + tiles % 2) * in_c * 4; - } else if (tiles >= 2) { - size_input_tm2 = 36 * (tiles / 2 + tiles % 2) * in_c * 2; - } else { - size_input_tm2 = 36 * tiles * in_c; - } - __fp16 *input_tm2_buf = (__fp16 *)csi_mem_alloc(size_input_tm2 * sizeof(__fp16)); + __fp16 *input_tm2_buf = (__fp16 *)csi_mem_alloc(36 * tiles * in_c * sizeof(__fp16)); #pragma omp parallel for num_threads(1) for (int r = 0; r < 36; r++) { - - __fp16 *img_tm2 = input_tm2_buf + r * size_input_tm2 / 36; // input_tm2 r channel data + __fp16 *img_tm2 = input_tm2_buf + r * tiles * in_c; // input_tm2 r channel data int t = 0; for (; t + 7 < tiles; t += 8) { - __fp16 *tm2 = img_tm2 + t * in_c; // img_tm2 row data + __fp16 *tm2 = img_tm2 + t * in_c; // img_tm2 row data __fp16 *tm1 = input_tm1_buf; tm1 += (r * tiles + t) * 8; @@ -1830,7 +1805,7 @@ int csi_c906_conv3x3s1_winograd43_pack8_fp16(struct csi_tensor *input, } } for (; t + 3 < tiles; t += 4) { - __fp16 *tm2 = img_tm2 + (t / 8 + (t % 8) / 4) * in_c * 8; // img_tm2 row data + __fp16 *tm2 = img_tm2 + t * in_c; // img_tm2 row data __fp16 *tm1 = input_tm1_buf; tm1 += (r * tiles + t) * 8; @@ -1847,7 +1822,7 @@ int csi_c906_conv3x3s1_winograd43_pack8_fp16(struct csi_tensor *input, } } for (; t + 1 < tiles; t += 2) { - __fp16 *tm2 = img_tm2 + (t / 8 + (t % 8) / 4 + (t % 4) / 2) * in_c * 8; // img_tm2 row data + __fp16 *tm2 = img_tm2 + t * in_c; // img_tm2 row data __fp16 *tm1 = input_tm1_buf; tm1 += (r * tiles + t) * 8; @@ -1862,7 +1837,7 @@ int csi_c906_conv3x3s1_winograd43_pack8_fp16(struct csi_tensor *input, } for (; t < tiles; t++) { - __fp16 *tm2 = img_tm2 + (t / 8 + (t % 8) / 4 + (t % 4) / 2 + t % 2) * in_c * 8; // img_tm2 row data + __fp16 *tm2 = img_tm2 + t * in_c; // img_tm2 row data __fp16 *tm1 = input_tm1_buf; tm1 += (r * tiles + t) * 8; @@ -1888,12 +1863,10 @@ int csi_c906_conv3x3s1_winograd43_pack8_fp16(struct csi_tensor *input, __fp16 *kernel0_tm = kernel_data + p * 36 * in_c * 8; // 8 channel kernel for (int r = 0; r < 36; r++) { - - __fp16 *img_tm2 = input_tm2_buf + r * size_input_tm2 / 36; // img_tm2 第r个channel + __fp16 *img_tm2 = input_tm2_buf + r * tiles * in_c; // img_tm2 第r个channel int t = 0; for (; t + 7 < tiles; t += 8) { - __fp16 *r0 = img_tm2 + t * in_c; __fp16 *k0 = kernel0_tm + r * in_c * 8; @@ -1969,7 +1942,7 @@ int csi_c906_conv3x3s1_winograd43_pack8_fp16(struct csi_tensor *input, ); } for (; t + 3 < tiles; t += 4) { - __fp16 *r0 = img_tm2 + (t / 8 + (t % 8) / 4) * in_c * 8; + __fp16 *r0 = img_tm2 + t * in_c; __fp16 *k0 = kernel0_tm + r * in_c * 8; asm volatile( @@ -2020,7 +1993,7 @@ int csi_c906_conv3x3s1_winograd43_pack8_fp16(struct csi_tensor *input, ); } for (; t + 1 < tiles; t += 2) { - __fp16 *r0 = img_tm2 + (t / 8 + (t % 8) / 4 + (t % 4) / 2) * in_c * 8; + __fp16 *r0 = img_tm2 + t * in_c; __fp16 *k0 = kernel0_tm + r * in_c * 8; asm volatile( @@ -2061,8 +2034,7 @@ int csi_c906_conv3x3s1_winograd43_pack8_fp16(struct csi_tensor *input, ); } for (; t < tiles; t++) { - - __fp16 *r0 = img_tm2 + (t / 8 + (t % 8) / 4 + (t % 4) / 2 + t % 2) * in_c * 8; + __fp16 *r0 = img_tm2 + t * in_c; __fp16 *k0 = kernel0_tm + r * in_c * 8; asm volatile( diff --git a/source/c906_opt/convolution_3x3.c b/source/c906_opt/convolution_3x3_fp32.c similarity index 98% rename from source/c906_opt/convolution_3x3.c rename to source/c906_opt/convolution_3x3_fp32.c index 00dae52f..56218cd4 100644 --- a/source/c906_opt/convolution_3x3.c +++ b/source/c906_opt/convolution_3x3_fp32.c @@ -2220,22 +2220,12 @@ int csi_c906_conv3x3s1_winograd64_pack4(struct csi_tensor *input, /*********************************** dot ***************************************/ // reorder input_tm1_buf - int size_input_tm2 = 0; - if (tiles >= 8) { - size_input_tm2 = 64 * (tiles / 8 + (tiles % 8) / 4 + (tiles % 4) / 2 + tiles % 2) * in_c * 8; - } else if (tiles >= 4) { - size_input_tm2 = 64 * (tiles / 4 + (tiles % 4) / 2 + tiles % 2) * in_c * 4; - } else if (tiles >= 2) { - size_input_tm2 = 64 * (tiles / 2 + tiles % 2) * in_c * 2; - } else { - size_input_tm2 = 64 * tiles * in_c; - } - float *input_tm2_buf = (float *)csi_mem_alloc(size_input_tm2 * sizeof(float)); + float *input_tm2_buf = (float *)csi_mem_alloc(64 * tiles * in_c * sizeof(float)); #pragma omp parallel for num_threads(1) for (int r = 0; r < 64; r++) { - float *img_tm2 = input_tm2_buf + r * size_input_tm2 / 64; // input_tm2 r channel data + float *img_tm2 = input_tm2_buf + r * tiles * in_c; // input_tm2 r channel data int t = 0; for (; t + 7 < tiles; t += 8) { @@ -2305,7 +2295,7 @@ int csi_c906_conv3x3s1_winograd64_pack4(struct csi_tensor *input, ); } for (; t + 3 < tiles; t += 4) { - float *tm2 = img_tm2 + (t / 8 + (t % 8) / 4) * in_c * 8; // img_tm2 row data + float *tm2 = img_tm2 + t * in_c; // img_tm2 row data float *tm1 = input_tm1_buf; tm1 += (r * tiles + t) * 4; @@ -2358,7 +2348,7 @@ int csi_c906_conv3x3s1_winograd64_pack4(struct csi_tensor *input, ); } for (; t + 1 < tiles; t += 2) { - float *tm2 = img_tm2 + (t / 8 + (t % 8) / 4 + (t % 4) / 2) * in_c * 8; // img_tm2 row data + float *tm2 = img_tm2 + t * in_c; // img_tm2 row data float *tm1 = input_tm1_buf; tm1 += (r * tiles + t) * 4; @@ -2406,7 +2396,7 @@ int csi_c906_conv3x3s1_winograd64_pack4(struct csi_tensor *input, } for (; t < tiles; t++) { - float *tm2 = img_tm2 + (t / 8 + (t % 8) / 4 + (t % 4) / 2 + t % 2) * in_c * 8; // img_tm2 row data + float *tm2 = img_tm2 + t * in_c; // img_tm2 row data float *tm1 = input_tm1_buf; tm1 += (r * tiles + t) * 4; @@ -2466,7 +2456,7 @@ int csi_c906_conv3x3s1_winograd64_pack4(struct csi_tensor *input, for (int r = 0; r < 64; r++) { - float *img_tm2 = input_tm2_buf + r * size_input_tm2 / 64; // img_tm2 第r个channel + float *img_tm2 = input_tm2_buf + r * tiles * in_c; // img_tm2 第r个channel int t = 0; for (; t + 7 < tiles; t += 8) { @@ -2546,7 +2536,7 @@ int csi_c906_conv3x3s1_winograd64_pack4(struct csi_tensor *input, ); } for (; t + 3 < tiles; t += 4) { - float *r0 = img_tm2 + (t / 8 + (t % 8) / 4) * in_c * 8; + float *r0 = img_tm2 + t * in_c; float *k0 = kernel0_tm + r * in_c * 4; asm volatile( @@ -2597,7 +2587,7 @@ int csi_c906_conv3x3s1_winograd64_pack4(struct csi_tensor *input, ); } for (; t + 1 < tiles; t += 2) { - float *r0 = img_tm2 + (t / 8 + (t % 8) / 4 + (t % 4) / 2) * in_c * 8; + float *r0 = img_tm2 + t * in_c; float *k0 = kernel0_tm + r * in_c * 4; asm volatile( @@ -2639,7 +2629,7 @@ int csi_c906_conv3x3s1_winograd64_pack4(struct csi_tensor *input, } for (; t < tiles; t++) { - float *r0 = img_tm2 + (t / 8 + (t % 8) / 4 + (t % 4) / 2 + t % 2) * in_c * 8; + float *r0 = img_tm2 + t * in_c; float *k0 = kernel0_tm + r * in_c * 4; asm volatile( @@ -3320,22 +3310,12 @@ int csi_c906_conv3x3s1_winograd43_pack4(struct csi_tensor *input, /*********************************** dot ***************************************/ // reorder input_tm1_buf - int size_input_tm2 = 0; - if (tiles >= 8) { - size_input_tm2 = 36 * (tiles / 8 + (tiles % 8) / 4 + (tiles % 4) / 2 + tiles % 2) * in_c * 8; - } else if (tiles >= 4) { - size_input_tm2 = 36 * (tiles / 4 + (tiles % 4) / 2 + tiles % 2) * in_c * 4; - } else if (tiles >= 2) { - size_input_tm2 = 36 * (tiles / 2 + tiles % 2) * in_c * 2; - } else { - size_input_tm2 = 36 * tiles * in_c; - } - float *input_tm2_buf = (float *)csi_mem_alloc(size_input_tm2 * sizeof(float)); + float *input_tm2_buf = (float *)csi_mem_alloc(36 * tiles * in_c * sizeof(float)); #pragma omp parallel for num_threads(1) for (int r = 0; r < 36; r++) { - float *img_tm2 = input_tm2_buf + r * size_input_tm2 / 36; // input_tm2 r channel data + float *img_tm2 = input_tm2_buf + r * tiles * in_c; // input_tm2 r channel data int t = 0; for (; t + 7 < tiles; t += 8) { @@ -3361,7 +3341,7 @@ int csi_c906_conv3x3s1_winograd43_pack4(struct csi_tensor *input, } } for (; t + 3 < tiles; t += 4) { - float *tm2 = img_tm2 + (t / 8 + (t % 8) / 4) * in_c * 8; // img_tm2 row data + float *tm2 = img_tm2 + t * in_c; // img_tm2 row data float *tm1 = input_tm1_buf; tm1 += (r * tiles + t) * 4; @@ -3378,7 +3358,7 @@ int csi_c906_conv3x3s1_winograd43_pack4(struct csi_tensor *input, } } for (; t + 1 < tiles; t += 2) { - float *tm2 = img_tm2 + (t / 8 + (t % 8) / 4 + (t % 4) / 2) * in_c * 8; // img_tm2 row data + float *tm2 = img_tm2 + t * in_c; // img_tm2 row data float *tm1 = input_tm1_buf; tm1 += (r * tiles + t) * 4; @@ -3393,7 +3373,7 @@ int csi_c906_conv3x3s1_winograd43_pack4(struct csi_tensor *input, } for (; t < tiles; t++) { - float *tm2 = img_tm2 + (t / 8 + (t % 8) / 4 + (t % 4) / 2 + t % 2) * in_c * 8; // img_tm2 row data + float *tm2 = img_tm2 + t * in_c; // img_tm2 row data float *tm1 = input_tm1_buf; tm1 += (r * tiles + t) * 4; @@ -3420,7 +3400,7 @@ int csi_c906_conv3x3s1_winograd43_pack4(struct csi_tensor *input, for (int r = 0; r < 36; r++) { - float *img_tm2 = input_tm2_buf + r * size_input_tm2 / 36; // img_tm2 第r个channel + float *img_tm2 = input_tm2_buf + r * tiles * in_c; // img_tm2 第r个channel int t = 0; for (; t + 7 < tiles; t += 8) { @@ -3500,7 +3480,7 @@ int csi_c906_conv3x3s1_winograd43_pack4(struct csi_tensor *input, ); } for (; t + 3 < tiles; t += 4) { - float *r0 = img_tm2 + (t / 8 + (t % 8) / 4) * in_c * 8; + float *r0 = img_tm2 + t * in_c; float *k0 = kernel0_tm + r * in_c * 4; asm volatile( @@ -3551,7 +3531,7 @@ int csi_c906_conv3x3s1_winograd43_pack4(struct csi_tensor *input, ); } for (; t + 1 < tiles; t += 2) { - float *r0 = img_tm2 + (t / 8 + (t % 8) / 4 + (t % 4) / 2) * in_c * 8; + float *r0 = img_tm2 + t * in_c; float *k0 = kernel0_tm + r * in_c * 4; asm volatile( @@ -3593,7 +3573,7 @@ int csi_c906_conv3x3s1_winograd43_pack4(struct csi_tensor *input, } for (; t < tiles; t++) { - float *r0 = img_tm2 + (t / 8 + (t % 8) / 4 + (t % 4) / 2 + t % 2) * in_c * 8; + float *r0 = img_tm2 + t * in_c; float *k0 = kernel0_tm + r * in_c * 4; asm volatile(