diff --git a/include/small/abstract_layer.hpp b/include/small/abstract_layer.hpp index d018834..4442a90 100644 --- a/include/small/abstract_layer.hpp +++ b/include/small/abstract_layer.hpp @@ -23,6 +23,12 @@ #define DEBUG 0 + +#define ELEMENTAL 1 +#define BLOCK 2 + +#define PARALLEL_DIST ELEMENTAL + namespace small { namespace detail @@ -67,7 +73,7 @@ namespace detail //**************************************************************************** /// @todo add parameter: step -#define FLOAT_ABSTRACT_OP_END(op_type, op_class, a_cur, b_cur, c_cur, W_elements, _C_ob) \ +#define FLOAT_ABSTRACT_OP_END(op_type, op_class, step, a_cur, b_cur, c_cur, W_elements, _C_ob) \ if constexpr (op_type == OP_CONV) \ { \ if constexpr (op_class == 1) \ @@ -103,6 +109,7 @@ namespace detail //**************************************************************************** +//@todo: See if these different functions can be merged. Might require in-assembly microkernels template +void inline compute_with_padding(dim_t H_lb, dim_t H_ub, + dim_t W_lb, dim_t W_ub, + dim_t F_w, + dim_t W_elements, + dim_t input_col_stride, + ScalarT const *F, + ScalarT const *I, + c_tile_t *c_cur, + const dim_t F_c_left) +{ + #if DEBUG + printf("F_c_left %d\n", F_c_left); + + #endif + constexpr dim_t _C_ob = _G_b * _K_b; + const dim_t _C_ib = _G_b * F_c_left; + const dim_t step = _stride * _C_ib; + + for (uint32_t n = H_lb; n < H_ub; n++) + { + int filter_offset_h = n * F_w * F_c_left * _G_b * _K_b; + int input_stencil_h = (n - H_lb) * input_col_stride; /*+ input_col_offset + input_row_offset*/ + + for (uint32_t m = W_lb; m < W_ub; m++) + { + int filter_offset_w = m * F_c_left * _G_b * _K_b + filter_offset_h; + /* This is C_ib because the microkernel stretches across groups*/ + int input_stencil_w = (m - W_lb) * _C_ib + input_stencil_h; + + ScalarT const *b = F + filter_offset_w; + ScalarT const *a = I + input_stencil_w; + + // TODO: reintroduce convolution + for (uint32_t ii = 0; ii < F_c_left / _UNROLL; ii++) + { + + /// @note using platform C_ob + ScalarT const *b_cur = b + ii * _UNROLL * FLOAT_C_ob; + ScalarT const *a_cur = a + ii * _UNROLL; + +#if DEBUG + printf("\t compute w padding_in channel %d\n", ii); + printf("input values: %f \n", a_cur[0]); + printf("filter values for output channel 0 : %f %f %f %f \n", b_cur[0 ], b_cur[1 ], b_cur[2 ], b_cur[3 ]); + printf("input col stride %d\n", input_col_stride); + +#endif + FLOAT_ABSTRACT_OP_END(op_type, op_class, step, a_cur, b_cur, c_cur, W_elements, _C_ob); + } + } + } + +} + +// Edge case to handle remainder input and output channels +// When streamlined, this is the only function that should remain +template +void inline compute_with_padding(dim_t H_lb, dim_t H_ub, + dim_t W_lb, dim_t W_ub, + dim_t F_w, + dim_t W_elements, + dim_t input_col_stride, + ScalarT const *F, + ScalarT const *I, + c_tile_t *c_cur, + const dim_t F_c_left, + const dim_t G_left, + const dim_t K_left) +{ +#if DEBUG ==1 + printf("_F_cb %d _G_b %d _K_b %d\n", F_c_left, _G_b, _K_b); + +#endif + + const dim_t _C_ob = G_left * K_left; + const dim_t _C_ib = G_left * F_c_left; + const dim_t step = _stride * _C_ib; + + for (uint32_t n = H_lb; n < H_ub; n++) + { + int filter_offset_h = n * F_w * F_c_left * G_left * K_left; + int input_stencil_h = (n - H_lb) * input_col_stride; /*+ input_col_offset + input_row_offset*/ + + for (uint32_t m = W_lb; m < W_ub; m++) + { + int filter_offset_w = m * F_c_left * G_left * K_left + filter_offset_h; + /* This is C_ib because the microkernel stretches across groups*/ + int 
input_stencil_w = (m - W_lb) * _C_ib + input_stencil_h; + + ScalarT const *b = F + filter_offset_w; + ScalarT const *a = I + input_stencil_w; + + // TODO: reintroduce convolution + for (uint32_t ii = 0; ii < F_c_left / _UNROLL; ii++) + { + + /// @note using platform C_ob + ScalarT const *b_cur = b + ii * _UNROLL * _C_ob; + ScalarT const *a_cur = a + ii * _UNROLL; + +#if DEBUG == 1 + printf("\t compute w padding_in channel %d\n", ii); + printf("input values: %f \n", a_cur[0]); + if (op_type == OP_CONV ) + printf("filter values for output channel 0 : %f %f %f %f \n", b_cur[0], b_cur[1], b_cur[2], b_cur[3]); + printf("input col stride %d\n", input_col_stride); + +#endif + FLOAT_ABSTRACT_OP_END(op_type, op_class, step, a_cur, b_cur, c_cur, W_elements, _C_ob); } } } } //**************************************************************************** +// @todo: is kernel left buggy? Needs Load_C_strided for left padding output elements in pooling template AccumT dim_t H_lb = 0, - dim_t H_ub = 0) -{ - constexpr dim_t _C_ob = _G_b * _K_b; - // constexpr dim_t _C_ib = _G_b * _F_cb; - // constexpr dim_t step = _stride * _C_ib; + dim_t H_ub = 0, + AccumT k_zero = (AccumT)0, + AccumT I_offset = 0, + AccumT F_offset = 0) + { + constexpr dim_t _C_ob = _G_b * _K_b;; const dim_t H_UPPER = ((!H_ub) * (F_h)) + (H_ub); FLOAT_DEF_END_C(_O_wb, _C_ob); @@ -203,17 +353,20 @@ void inline kernel_left( // dim_t c_cur = 0; for (uint32_t k_p = 0; k_p < l_pad_el; k_p++) { - compute_with_padding( - H_lb, H_UPPER, - W_i_valid, F_w, - F_w, - 1, - input_col_stride, - F, - I_ptr, - c_cur); + + compute_with_padding( + H_lb, H_UPPER, + W_i_valid, F_w, + F_w, + 1, + input_col_stride, + F, + I_ptr, + c_cur); + + c_cur += (_K_b * _G_b) / (FLOAT_SIMD_EPILOGUE); // c_cur += 1; @@ -231,7 +384,6 @@ void inline kernel_left( O_ptr += _G_b * _K_b; } -//**************************************************************************** template -void inline kernel( +void inline rem_kernel_left( bool first, dim_t F_h, dim_t F_w, dim_t input_col_stride, + dim_t l_pad_el, + dim_t l_pad, ScalarT const *I, ScalarT const *F, AccumT *O, // ScalarT -> AccumT + const dim_t G_left, + const dim_t K_left, dim_t H_lb = 0, - dim_t H_ub = 0, - dim_t W_lb = 0, - dim_t W_ub = 0) + dim_t H_ub = 0 + ) { - constexpr dim_t _C_ob = _G_b * _K_b; - constexpr dim_t _C_ib = _G_b * _F_cb; - constexpr dim_t step = _stride * _C_ib; - + const dim_t _C_ob = G_left * K_left; const dim_t H_UPPER = ((!H_ub) * (F_h)) + (H_ub); - // const dim_t W_UPPER = ((!W_ub) * (F_w)) + (W_ub); + FLOAT_DEF_END_C(_O_wb, _C_ob); + + // left padding elements + AccumT *O_ptr = O; // ScalarT -> AccumT + ScalarT const *I_ptr = I; + + int W_i_valid = l_pad; - FLOAT_DEF_TILE_C(_O_wb, _C_ob); if (first) { - FLOAT_ZERO_TILE_C(_O_wb, _C_ob); - if (op_type == OP_MAX_POOL || op_type == OP_MUL) - { - /// @note using platform C_ob - FLOAT_LOAD_TILE_C_strided(I, step, _O_wb, FLOAT_C_ob); - } - else if (op_type == OP_UPSAMPLE) - { - FLOAT_LOAD_TILE_C_upsample(I, _stride, _C_ib, _O_wb, _C_ob); - } + FLOAT_ZERO_END_C(l_pad_el, _C_ob); } else { - FLOAT_LOAD_TILE_C(O, _O_wb, _C_ob); - if constexpr (op_type == OP_UPSAMPLE) - { - FLOAT_ACCUM_TILE_C_upsample(I, _stride, _C_ib, _O_wb, _C_ob); - } - //@todo support reduction tree-like kernel (for global reductions) + FLOAT_LOAD_END_C(O_ptr, l_pad_el, _C_ob); } - for (uint32_t n = H_lb; n < H_UPPER; n++) + c_tile_t *c_cur = c_tile; + // dim_t c_cur = 0; + for (uint32_t k_p = 0; k_p < l_pad_el; k_p++) { - int filter_offset_h = n * F_w * _F_cb * _G_b * _K_b; - int 
input_stencil_h = (n - H_lb) * input_col_stride; /*+ input_col_offset + input_row_offset*/ - - for (uint32_t m = 0; m < F_w; m++) - { - int filter_offset_w = m * _F_cb * _G_b * _K_b + filter_offset_h; - // This is C_ob because the microkernel stretches across groups - int input_stencil_w = (m - W_lb) * _C_ib + input_stencil_h; - - ScalarT const *b = F + filter_offset_w; - ScalarT const *a = I + input_stencil_w; - for (uint32_t ii = 0; ii < _F_cb / _UNROLL; ii++) - { - /// @note using platform C_ob - ScalarT const *b_cur = b + ii * _UNROLL * FLOAT_C_ob; - ScalarT const *a_cur = a + ii * _UNROLL; - FLOAT_ABSTRACT_OP(op_type, op_class, a_cur, b_cur, _O_wb, _C_ob); /// @todo pass _C_ob - } - } + // UNROLL has to be 1 for this to work (FLOAT_UNROLL == 1 for ZEN2) + compute_with_padding( + H_lb, H_UPPER, + W_i_valid, F_w, + F_w, + 1, + input_col_stride, + F, + I_ptr, + c_cur, + _F_cb, + G_left, + K_left); + + c_cur += (G_left * K_left) / (FLOAT_SIMD_EPILOGUE); + // c_cur += 1; + W_i_valid -= _stride; + // I_ptr += ()*(_stride * _F_cb * _G_b); } - + // Fusion Slot # 1 + // Include division for Average Pooling if (op_type == OP_AVERAGE_POOL) { float norm = 1.0 / (1.0 * F_h * F_w); - FLOAT_DIV_TILE_C(norm, _O_wb, _C_ob); + FLOAT_DIV_END_C(c_tile, norm, l_pad_el, _C_ob); } - FLOAT_STORE_TILE_C(O, _O_wb, _C_ob); - //@todo support reduction-tree like store for global reductions + FLOAT_STORE_END_C(O_ptr, l_pad_el, _C_ob); + O_ptr += G_left * K_left; } -//**************************************************************************** -// TODO: Explain the difference between kernel and kernel_pad +// Edge case to handle remainder input channels template -void inline kernel_pad( +void inline kernel_left_rem( bool first, dim_t F_h, dim_t F_w, dim_t input_col_stride, + dim_t l_pad_el, + dim_t l_pad, ScalarT const *I, ScalarT const *F, AccumT *O, // ScalarT -> AccumT + const dim_t F_c_left, dim_t H_lb = 0, - dim_t H_ub = 0, - dim_t W_lb = 0, - dim_t W_ub = 0) + dim_t H_ub = 0) { constexpr dim_t _C_ob = _G_b * _K_b; - constexpr dim_t _C_ib = _G_b * _F_cb; - constexpr dim_t step = _stride * _C_ib; + ; const dim_t H_UPPER = ((!H_ub) * (F_h)) + (H_ub); - // const dim_t W_UPPER = ((!W_ub) * (F_w)) + (W_ub); + FLOAT_DEF_END_C(_O_wb, _C_ob); + + // left padding elements + AccumT *O_ptr = O; // ScalarT -> AccumT + ScalarT const *I_ptr = I; + + int W_i_valid = l_pad; - FLOAT_DEF_TILE_C(_O_wb, _C_ob); if (first) { - FLOAT_ZERO_TILE_C(_O_wb, _C_ob); - - //@note padding should always be 'v' for pointwise operations, so this code path should not be used - if(op_type == OP_MUL) - { - FLOAT_LOAD_TILE_C_strided(I, step, _O_wb, _C_ob); - } + FLOAT_ZERO_END_C(l_pad_el, _C_ob); } else { - FLOAT_LOAD_TILE_C(O, _O_wb, _C_ob); + FLOAT_LOAD_END_C(O_ptr, l_pad_el, _C_ob); } - - for (uint32_t n = H_lb; n < H_UPPER; n++) + c_tile_t *c_cur = c_tile; + // dim_t c_cur = 0; + for (uint32_t k_p = 0; k_p < l_pad_el; k_p++) { - int filter_offset_h = n * F_w * _F_cb * _G_b * _K_b; - int input_stencil_h = /*input_col_offset + input_row_offset +*/ - (n - H_lb) * input_col_stride; - - for (uint32_t m = 0; m < F_w; m++) - { - int filter_offset_w = m * _F_cb * _G_b * _K_b + filter_offset_h; - // This is C_ob because the microkernel stretches across groups - int input_stencil_w = (m - W_lb) * _C_ib + input_stencil_h; - ScalarT const *b = F + filter_offset_w; - ScalarT const *a = I + input_stencil_w; - - for (uint32_t ii = 0; ii < _F_cb / _UNROLL; ii++) - { - /// @note using platform C_ob - ScalarT const *b_cur = b + ii * _UNROLL * FLOAT_C_ob; - 
ScalarT const *a_cur = a + ii * _UNROLL; + compute_with_padding( + H_lb, H_UPPER, + W_i_valid, F_w, + F_w, + 1, + input_col_stride, + F, + I_ptr, + c_cur, + F_c_left); - FLOAT_ABSTRACT_OP(op_type, op_class, a_cur, b_cur, _O_wb, _C_ob); - } - } + c_cur += (_K_b * _G_b) / (FLOAT_SIMD_EPILOGUE); + // c_cur += 1; + W_i_valid -= _stride; + // I_ptr += ()*(_stride * _F_cb * _G_b); } + // Fusion Slot # 1 + // Include division for Average Pooling if (op_type == OP_AVERAGE_POOL) { float norm = 1.0 / (1.0 * F_h * F_w); - FLOAT_DIV_TILE_C(norm, _O_wb, _C_ob); + FLOAT_DIV_END_C(c_tile, norm, l_pad_el, _C_ob); } - FLOAT_STORE_TILE_C(O, _O_wb, _C_ob); + FLOAT_STORE_END_C(O_ptr, l_pad_el, _C_ob); + O_ptr += _G_b * _K_b; } -//**************************************************************************** +// Edge case to handle remainder input channels template -void inline kernel_right( +void inline rem_kernel_left_rem( bool first, dim_t F_h, dim_t F_w, dim_t input_col_stride, - dim_t O_w_left, - dim_t r_pad_el, - dim_t r_pad, + dim_t l_pad_el, + dim_t l_pad, ScalarT const *I, ScalarT const *F, AccumT *O, // ScalarT -> AccumT + const dim_t F_c_left, + const dim_t G_left, + const dim_t K_left, dim_t H_lb = 0, dim_t H_ub = 0) { - constexpr dim_t _C_ob = _G_b * _K_b; - constexpr dim_t _C_ib = _G_b * _F_cb; - constexpr dim_t step = _stride * _C_ib; + const dim_t _C_ob = G_left * K_left; + ; + const dim_t H_UPPER = ((!H_ub) * (F_h)) + (H_ub); FLOAT_DEF_END_C(_O_wb, _C_ob); -#if DEBUG + + // left padding elements + AccumT *O_ptr = O; // ScalarT -> AccumT + ScalarT const *I_ptr = I; + + int W_i_valid = l_pad; + + if (first) + { + FLOAT_ZERO_END_C(l_pad_el, _C_ob); + } + else + { + FLOAT_LOAD_END_C(O_ptr, l_pad_el, _C_ob); + } + + c_tile_t *c_cur = c_tile; + // dim_t c_cur = 0; + for (uint32_t k_p = 0; k_p < l_pad_el; k_p++) + { + + compute_with_padding( + H_lb, H_UPPER, + W_i_valid, F_w, + F_w, + 1, + input_col_stride, + F, + I_ptr, + c_cur, + F_c_left, + G_left, + K_left); + + c_cur += (K_left * G_left) / (FLOAT_SIMD_EPILOGUE); + // c_cur += 1; + W_i_valid -= _stride; + // I_ptr += ()*(_stride * _F_cb * _G_b); + } + // Fusion Slot # 1 + // Include division for Average Pooling + if (op_type == OP_AVERAGE_POOL) + { + float norm = 1.0 / (1.0 * F_h * F_w); + FLOAT_DIV_END_C(c_tile, norm, l_pad_el, _C_ob); + } + FLOAT_STORE_END_C(O_ptr, l_pad_el, _C_ob); + O_ptr += G_left * K_left; +} + +//**************************************************************************** +template +void inline kernel( + bool first, + dim_t F_h, + dim_t F_w, + dim_t input_col_stride, + ScalarT const *I, + ScalarT const *F, + AccumT *O, // ScalarT -> AccumT + dim_t H_lb = 0, + dim_t H_ub = 0, + dim_t W_lb = 0, + dim_t W_ub = 0) +{ + constexpr dim_t _C_ob = _G_b * _K_b; + constexpr dim_t _C_ib = _G_b * _F_cb; + constexpr dim_t step = _stride * _C_ib; + + + const dim_t H_UPPER = ((!H_ub) * (F_h)) + (H_ub); + // const dim_t W_UPPER = ((!W_ub) * (F_w)) + (W_ub); + + FLOAT_DEF_TILE_C(_O_wb, _C_ob); + if (first) + { + FLOAT_ZERO_TILE_C(_O_wb, _C_ob); + if (op_type == OP_MAX_POOL || op_type == OP_MUL) + { + /// @note using platform C_ob + FLOAT_LOAD_TILE_C_strided(I, step, _O_wb, _C_ob); + } + else if (op_type == OP_UPSAMPLE) + { + FLOAT_LOAD_TILE_C_upsample(I, _stride, _C_ib, _O_wb, _C_ob); + } + } + else + { + FLOAT_LOAD_TILE_C(O, _O_wb, _C_ob); + if constexpr (op_type == OP_UPSAMPLE) + { + FLOAT_ACCUM_TILE_C_upsample(I, _stride, _C_ib, _O_wb, _C_ob); + } + //@todo support reduction tree-like kernel (for global reductions) + } + + for (uint32_t n 
= H_lb; n < H_UPPER; n++) + { + int filter_offset_h = n * F_w * _F_cb * _G_b * _K_b; + int input_stencil_h = (n - H_lb) * input_col_stride; /*+ input_col_offset + input_row_offset*/ + + for (uint32_t m = 0; m < F_w; m++) + { + int filter_offset_w = m * _F_cb * _G_b * _K_b + filter_offset_h; + // This is C_ob because the microkernel stretches across groups + int input_stencil_w = (m - W_lb) * _C_ib + input_stencil_h; + + ScalarT const *b = F + filter_offset_w; + ScalarT const *a = I + input_stencil_w; + for (uint32_t ii = 0; ii < _F_cb / _UNROLL; ii++) + { + /// @note using platform C_ob + ScalarT const *b_cur = b + ii * _UNROLL * _C_ob; + ScalarT const *a_cur = a + ii * _UNROLL; + FLOAT_ABSTRACT_OP(op_type, op_class, a_cur, b_cur, _O_wb, _C_ob); /// @todo pass _C_ob + } + } + } + + if (op_type == OP_AVERAGE_POOL) + { + float norm = 1.0 / (1.0 * F_h * F_w); + FLOAT_DIV_TILE_C(norm, _O_wb, _C_ob); + } + FLOAT_STORE_TILE_C(O, _O_wb, _C_ob); + //@todo support reduction-tree like store for global reductions +} + +//Edge case to handle remainder output channels + +template +void inline rem_kernel( + bool first, + dim_t F_h, + dim_t F_w, + dim_t input_col_stride, + ScalarT const *I, + ScalarT const *F, + AccumT *O, // ScalarT -> AccumT + const dim_t G_left, + const dim_t K_left, + dim_t H_lb = 0, + dim_t H_ub = 0, + dim_t W_lb = 0, + dim_t W_ub = 0) +{ + const dim_t _C_ob = G_left * K_left; + const dim_t _C_ib = G_left * _F_cb; + const dim_t step = _stride * _C_ib; + + // printf(" _C_ob %d\n", _C_ob); + // printf(" _C_ib %d\n", _C_ib); + const dim_t H_UPPER = ((!H_ub) * (F_h)) + (H_ub); + // const dim_t W_UPPER = ((!W_ub) * (F_w)) + (W_ub); + + FLOAT_DEF_END_C(_O_wb, _C_ob); + if (first) + { + FLOAT_ZERO_END_C(_O_wb, _C_ob); + if (op_type == OP_MAX_POOL || op_type == OP_MUL) + { + /// @note using platform C_ob + FLOAT_LOAD_END_C_strided(I, step, _O_wb, _C_ob); + } + else if (op_type == OP_UPSAMPLE) + { + FLOAT_LOAD_END_C_upsample(I, _stride, _C_ib, _O_wb, _C_ob); + } + } + else + { + FLOAT_LOAD_END_C(O, _O_wb, _C_ob); + if constexpr (op_type == OP_UPSAMPLE) + { + FLOAT_ACCUM_END_C_upsample(I, _stride, _C_ib, _O_wb, _C_ob); + } + //@todo support reduction tree-like kernel (for global reductions) + } + + for (uint32_t n = H_lb; n < H_UPPER; n++) + { + int filter_offset_h = n * F_w * _F_cb * G_left * K_left; + int input_stencil_h = (n - H_lb) * input_col_stride; /*+ input_col_offset + input_row_offset*/ + + for (uint32_t m = 0; m < F_w; m++) + { + int filter_offset_w = m * _F_cb * G_left * K_left + filter_offset_h; + // This is C_ob because the microkernel stretches across groups + int input_stencil_w = (m - W_lb) * _C_ib + input_stencil_h; + + ScalarT const *b = F + filter_offset_w; + ScalarT const *a = I + input_stencil_w; + for (uint32_t ii = 0; ii < _F_cb / _UNROLL; ii++) + { + /// @note using platform C_ob + ScalarT const *b_cur = b + ii * _UNROLL * _C_ob; + ScalarT const *a_cur = a + ii * _UNROLL; + FLOAT_ABSTRACT_OP_END(op_type, op_class, step, a_cur, b_cur, c_tile, _O_wb, _C_ob); /// @todo pass _C_ob + } + } + } + + if (op_type == OP_AVERAGE_POOL) + { + float norm = 1.0 / (1.0 * F_h * F_w); + FLOAT_DIV_END_C(c_tile, norm, _O_wb, _C_ob); + } + + FLOAT_STORE_END_C(O, _O_wb, _C_ob); + //@todo support reduction-tree like store for global reductions +} + +//Edge case to handle remainder input channels +template +void inline kernel_rem( + bool first, + dim_t F_h, + dim_t F_w, + dim_t input_col_stride, + ScalarT const *I, + ScalarT const *F, + AccumT *O, // ScalarT -> AccumT + const dim_t F_c_left, 
+ dim_t H_lb = 0, + dim_t H_ub = 0, + dim_t W_lb = 0, + dim_t W_ub = 0) +{ + constexpr dim_t _C_ob = _G_b * _K_b; + dim_t _C_ib = _G_b * F_c_left; + dim_t step = _stride * _C_ib; + + const dim_t H_UPPER = ((!H_ub) * (F_h)) + (H_ub); + // const dim_t W_UPPER = ((!W_ub) * (F_w)) + (W_ub); + + FLOAT_DEF_TILE_C(_O_wb, _C_ob); + if (first) + { + FLOAT_ZERO_TILE_C(_O_wb, _C_ob); + if (op_type == OP_MAX_POOL || op_type == OP_MUL) + { + /// @note using platform C_ob + FLOAT_LOAD_TILE_C_strided(I, step, _O_wb, FLOAT_C_ob); + } + else if (op_type == OP_UPSAMPLE) + { + FLOAT_LOAD_TILE_C_upsample(I, _stride, _C_ib, _O_wb, _C_ob); + } + } + else + { + FLOAT_LOAD_TILE_C(O, _O_wb, _C_ob); + if constexpr (op_type == OP_UPSAMPLE) + { + FLOAT_ACCUM_TILE_C_upsample(I, _stride, _C_ib, _O_wb, _C_ob); + } + //@todo support reduction tree-like kernel (for global reductions) + } + + for (uint32_t n = H_lb; n < H_UPPER; n++) + { + int filter_offset_h = n * F_w * F_c_left * _G_b * _K_b; + int input_stencil_h = (n - H_lb) * input_col_stride; /*+ input_col_offset + input_row_offset*/ + + for (uint32_t m = 0; m < F_w; m++) + { + int filter_offset_w = m * F_c_left * _G_b * _K_b + filter_offset_h; + // This is C_ob because the microkernel stretches across groups + int input_stencil_w = (m - W_lb) * _C_ib + input_stencil_h; + + ScalarT const *b = F + filter_offset_w; + ScalarT const *a = I + input_stencil_w; + for (uint32_t ii = 0; ii < F_c_left / _UNROLL; ii++) + { + /// @note using platform C_ob + ScalarT const *b_cur = b + ii * _UNROLL * FLOAT_C_ob; + ScalarT const *a_cur = a + ii * _UNROLL; + FLOAT_ABSTRACT_OP(op_type, op_class, a_cur, b_cur, _O_wb, _C_ob); /// @todo pass _C_ob + } + } + } + + if (op_type == OP_AVERAGE_POOL) + { + float norm = 1.0 / (1.0 * F_h * F_w); + FLOAT_DIV_TILE_C(norm, _O_wb, _C_ob); + } + FLOAT_STORE_TILE_C(O, _O_wb, _C_ob); + //@todo support reduction-tree like store for global reductions +} + +// Edge case to handle remainder input and output channels +// Edge case to handle remainder input channels +template +void inline rem_kernel_rem( + bool first, + dim_t F_h, + dim_t F_w, + dim_t input_col_stride, + ScalarT const *I, + ScalarT const *F, + AccumT *O, // ScalarT -> AccumT + const dim_t F_c_left, + const dim_t G_left, + const dim_t K_left, + dim_t H_lb = 0, + dim_t H_ub = 0, + dim_t W_lb = 0, + dim_t W_ub = 0) +{ + const dim_t _C_ob = G_left * K_left; + dim_t _C_ib = G_left * F_c_left; + dim_t step = _stride * _C_ib; + + const dim_t H_UPPER = ((!H_ub) * (F_h)) + (H_ub); + // const dim_t W_UPPER = ((!W_ub) * (F_w)) + (W_ub); + + FLOAT_DEF_END_C(_O_wb, _C_ob); + if (first) + { + FLOAT_ZERO_END_C(_O_wb, _C_ob); + if (op_type == OP_MAX_POOL || op_type == OP_MUL) + { + /// @note using platform C_ob + FLOAT_LOAD_END_C_strided(I, step, _O_wb, FLOAT_C_ob); + } + else if (op_type == OP_UPSAMPLE) + { + FLOAT_LOAD_END_C_upsample(I, _stride, _C_ib, _O_wb, _C_ob); + } + } + else + { + FLOAT_LOAD_END_C(O, _O_wb, _C_ob); + if constexpr (op_type == OP_UPSAMPLE) + { + FLOAT_ACCUM_END_C_upsample(I, _stride, _C_ib, _O_wb, _C_ob); + } + //@todo support reduction tree-like kernel (for global reductions) + } + + for (uint32_t n = H_lb; n < H_UPPER; n++) + { + int filter_offset_h = n * F_w * F_c_left * G_left * K_left; + int input_stencil_h = (n - H_lb) * input_col_stride; /*+ input_col_offset + input_row_offset*/ + + for (uint32_t m = 0; m < F_w; m++) + { + int filter_offset_w = m * F_c_left * G_left * K_left + filter_offset_h; + // This is C_ob because the microkernel stretches across groups + int 
input_stencil_w = (m - W_lb) * _C_ib + input_stencil_h; + + ScalarT const *b = F + filter_offset_w; + ScalarT const *a = I + input_stencil_w; + for (uint32_t ii = 0; ii < F_c_left / _UNROLL; ii++) + { + /// @note using platform C_ob + ScalarT const *b_cur = b + ii * _UNROLL * _C_ob; + ScalarT const *a_cur = a + ii * _UNROLL; + FLOAT_ABSTRACT_OP_END(op_type, op_class, step, a_cur, b_cur, c_tile, _O_wb, _C_ob); /// @todo pass _C_ob + } + } + } + + if (op_type == OP_AVERAGE_POOL) + { + float norm = 1.0 / (1.0 * F_h * F_w); + FLOAT_DIV_END_C(c_tile, norm, _O_wb, _C_ob); + } + FLOAT_STORE_END_C(O, _O_wb, _C_ob); + //@todo support reduction-tree like store for global reductions +} + +//**************************************************************************** +//The kernel pad function allows for padding in the height and width of the filter +// If the compiler complies, kernel_pad and kernel could be combined +template +void inline kernel_pad( + bool first, + dim_t F_h, + dim_t F_w, + dim_t input_col_stride, + ScalarT const *I, + ScalarT const *F, + AccumT *O, // ScalarT -> AccumT + dim_t H_lb = 0, + dim_t H_ub = 0, + dim_t W_lb = 0, + dim_t W_ub = 0) +{ + constexpr dim_t _C_ob = _G_b * _K_b; + constexpr dim_t _C_ib = _G_b * _F_cb; + constexpr dim_t step = _stride * _C_ib; + + const dim_t H_UPPER = ((!H_ub) * (F_h)) + (H_ub); + // const dim_t W_UPPER = ((!W_ub) * (F_w)) + (W_ub); + + FLOAT_DEF_TILE_C(_O_wb, _C_ob); + if (first) + { + FLOAT_ZERO_TILE_C(_O_wb, _C_ob); + + //@note padding should always be 'v' for pointwise operations, so this code path should not be used + if(op_type == OP_MUL) + { + FLOAT_LOAD_TILE_C_strided(I, step, _O_wb, _C_ob); + } + + } + else + { + FLOAT_LOAD_TILE_C(O, _O_wb, _C_ob); + } + + + for (uint32_t n = H_lb; n < H_UPPER; n++) + { + int filter_offset_h = n * F_w * _F_cb * _G_b * _K_b; + int input_stencil_h = /*input_col_offset + input_row_offset +*/ + (n - H_lb) * input_col_stride; + + for (uint32_t m = 0; m < F_w; m++) + { + int filter_offset_w = m * _F_cb * _G_b * _K_b + filter_offset_h; + // This is C_ob because the microkernel stretches across groups + int input_stencil_w = (m - W_lb) * _C_ib + input_stencil_h; + + ScalarT const *b = F + filter_offset_w; + ScalarT const *a = I + input_stencil_w; + + for (uint32_t ii = 0; ii < _F_cb / _UNROLL; ii++) + { + /// @note using platform C_ob + ScalarT const *b_cur = b + ii * _UNROLL * FLOAT_C_ob; + ScalarT const *a_cur = a + ii * _UNROLL; + + FLOAT_ABSTRACT_OP(op_type, op_class, a_cur, b_cur, _O_wb, _C_ob); + } + } + } + if (op_type == OP_AVERAGE_POOL) + { + float norm = 1.0 / (1.0 * F_h * F_w); + FLOAT_DIV_TILE_C(norm, _O_wb, _C_ob); + } + FLOAT_STORE_TILE_C(O, _O_wb, _C_ob); +} + +//Edge case to handle remainder output channels +template +void inline rem_kernel_pad( + bool first, + dim_t F_h, + dim_t F_w, + dim_t input_col_stride, + ScalarT const *I, + ScalarT const *F, + AccumT *O, // ScalarT -> AccumT + const dim_t G_left, + const dim_t K_left, + dim_t H_lb = 0, + dim_t H_ub = 0, + dim_t W_lb = 0, + dim_t W_ub = 0) +{ + const dim_t _C_ob = G_left * K_left; + const dim_t _C_ib = G_left * _F_cb; + const dim_t step = _stride * _C_ib; + + const dim_t H_UPPER = ((!H_ub) * (F_h)) + (H_ub); + // const dim_t W_UPPER = ((!W_ub) * (F_w)) + (W_ub); + + FLOAT_DEF_END_C(_O_wb, _C_ob); + if (first) + { + FLOAT_ZERO_END_C(_O_wb, _C_ob); + + //@note padding should always be 'v' for pointwise operations, so this code path should not be used + if(op_type == OP_MUL) + { + FLOAT_LOAD_END_C_strided(I, step, _O_wb, _C_ob); + } + } + 
else + { + FLOAT_LOAD_END_C(O, _O_wb, _C_ob); + } + + + for (uint32_t n = H_lb; n < H_UPPER; n++) + { + int filter_offset_h = n * F_w * _F_cb * _C_ob; + int input_stencil_h = /*input_col_offset + input_row_offset +*/ + (n - H_lb) * input_col_stride; + + for (uint32_t m = 0; m < F_w; m++) + { + int filter_offset_w = m * _F_cb * _C_ob + filter_offset_h; + // This is C_ob because the microkernel stretches across groups + int input_stencil_w = (m - W_lb) * _C_ib + input_stencil_h; + + ScalarT const *b = F + filter_offset_w; + ScalarT const *a = I + input_stencil_w; + + for (uint32_t ii = 0; ii < _F_cb / _UNROLL; ii++) + { + /// @note using platform C_ob + ScalarT const *b_cur = b + ii * _UNROLL * _C_ob; + ScalarT const *a_cur = a + ii * _UNROLL; + + FLOAT_ABSTRACT_OP_END(op_type, op_class, step, a_cur, b_cur, c_tile, _O_wb, _C_ob); + } + } + } + if (op_type == OP_AVERAGE_POOL) + { + float norm = 1.0 / (1.0 * F_h * F_w); + FLOAT_DIV_END_C(c_tile, norm, _O_wb, _C_ob); + } + FLOAT_STORE_END_C(O, _O_wb, _C_ob); +} + + +//Edge case to handle remainder input channels +template +void inline kernel_pad_rem( + bool first, + dim_t F_h, + dim_t F_w, + dim_t input_col_stride, + ScalarT const *I, + ScalarT const *F, + AccumT *O, // ScalarT -> AccumT + const dim_t F_c_left, + dim_t H_lb = 0, + dim_t H_ub = 0, + dim_t W_lb = 0, + dim_t W_ub = 0 + ) +{ + constexpr dim_t _C_ob = _G_b * _K_b; + const dim_t _C_ib = _G_b * F_c_left; + const dim_t step = _stride * _C_ib; + + const dim_t H_UPPER = ((!H_ub) * (F_h)) + (H_ub); + // const dim_t W_UPPER = ((!W_ub) * (F_w)) + (W_ub); + + FLOAT_DEF_TILE_C(_O_wb, _C_ob); + if (first) + { + FLOAT_ZERO_TILE_C(_O_wb, _C_ob); + + //@note padding should always be 'v' for pointwise operations, so this code path should not be used + if (op_type == OP_MUL) + { + FLOAT_LOAD_TILE_C_strided(I, step, _O_wb, _C_ob); + } + } + else + { + FLOAT_LOAD_TILE_C(O, _O_wb, _C_ob); + } + + for (uint32_t n = H_lb; n < H_UPPER; n++) + { + int filter_offset_h = n * F_w * F_c_left * _G_b * _K_b; + int input_stencil_h = /*input_col_offset + input_row_offset +*/ + (n - H_lb) * input_col_stride; + + for (uint32_t m = 0; m < F_w; m++) + { + int filter_offset_w = m * F_c_left * _G_b * _K_b + filter_offset_h; + // This is C_ob because the microkernel stretches across groups + int input_stencil_w = (m - W_lb) * _C_ib + input_stencil_h; + + ScalarT const *b = F + filter_offset_w; + ScalarT const *a = I + input_stencil_w; + + for (uint32_t ii = 0; ii < F_c_left / _UNROLL; ii++) + { + /// @note using platform C_ob + ScalarT const *b_cur = b + ii * _UNROLL * _C_ob; + ScalarT const *a_cur = a + ii * _UNROLL; + + FLOAT_ABSTRACT_OP(op_type, op_class, a_cur, b_cur, _O_wb, _C_ob); + } + } + } + if (op_type == OP_AVERAGE_POOL) + { + float norm = 1.0 / (1.0 * F_h * F_w); + FLOAT_DIV_TILE_C(norm, _O_wb, _C_ob); + } + FLOAT_STORE_TILE_C(O, _O_wb, _C_ob); +} + +//Edge case to handle remainder input and output channels +template +void inline rem_kernel_pad_rem( + bool first, + dim_t F_h, + dim_t F_w, + dim_t input_col_stride, + ScalarT const *I, + ScalarT const *F, + AccumT *O, // ScalarT -> AccumT + const dim_t F_c_left, + const dim_t G_left, + const dim_t K_left, + dim_t H_lb = 0, + dim_t H_ub = 0, + dim_t W_lb = 0, + dim_t W_ub = 0) +{ + const dim_t _C_ob = G_left * K_left; + const dim_t _C_ib = G_left * F_c_left; + const dim_t step = _stride * _C_ib; + + const dim_t H_UPPER = ((!H_ub) * (F_h)) + (H_ub); + // const dim_t W_UPPER = ((!W_ub) * (F_w)) + (W_ub); + + FLOAT_DEF_END_C(_O_wb, _C_ob); + if (first) + { + 
FLOAT_ZERO_END_C(_O_wb, _C_ob); + + //@note padding should always be 'v' for pointwise operations, so this code path should not be used + if (op_type == OP_MUL) + { + FLOAT_LOAD_END_C_strided(I, step, _O_wb, _C_ob); + } + } + else + { + FLOAT_LOAD_END_C(O, _O_wb, _C_ob); + } + + for (uint32_t n = H_lb; n < H_UPPER; n++) + { + int filter_offset_h = n * F_w * F_c_left * G_left * K_left; + int input_stencil_h = /*input_col_offset + input_row_offset +*/ + (n - H_lb) * input_col_stride; + + for (uint32_t m = 0; m < F_w; m++) + { + int filter_offset_w = m * F_c_left * G_left * K_left + filter_offset_h; + // This is C_ob because the microkernel stretches across groups + int input_stencil_w = (m - W_lb) * _C_ib + input_stencil_h; + + ScalarT const *b = F + filter_offset_w; + ScalarT const *a = I + input_stencil_w; + + for (uint32_t ii = 0; ii < F_c_left / _UNROLL; ii++) + { + /// @note using platform C_ob + ScalarT const *b_cur = b + ii * _UNROLL * _C_ob; + ScalarT const *a_cur = a + ii * _UNROLL; + + FLOAT_ABSTRACT_OP_END(op_type, op_class, step, a_cur, b_cur, c_tile, _O_wb, _C_ob); + } + } + } + if (op_type == OP_AVERAGE_POOL) + { + float norm = 1.0 / (1.0 * F_h * F_w); + FLOAT_DIV_END_C(c_tile, norm, _O_wb, _C_ob); + } + FLOAT_STORE_END_C(O, _O_wb, _C_ob); + //prinf the stored _O_wb x _C_ob output tile + +} + +//**************************************************************************** +template +void inline kernel_right( + bool first, + dim_t F_h, + dim_t F_w, + dim_t input_col_stride, + dim_t O_w_left, + dim_t r_pad_el, + dim_t r_pad, + ScalarT const *I, + ScalarT const *F, + AccumT *O, // ScalarT -> AccumT + dim_t H_lb = 0, + dim_t H_ub = 0) +{ + constexpr dim_t _C_ob = _G_b * _K_b; + constexpr dim_t _C_ib = _G_b * _F_cb; + constexpr dim_t step = _stride * _C_ib; + const dim_t H_UPPER = ((!H_ub) * (F_h)) + (H_ub); + FLOAT_DEF_END_C(_O_wb, _C_ob); + +#if DEBUG + printf("O_W_left %d r_pad_el %d\n", O_w_left, r_pad_el); +#endif + if (O_w_left) + { + if (first) + { + FLOAT_ZERO_END_C(O_w_left, _C_ob); + + if ( (op_type == OP_MUL)|| (op_type == OP_MAX_POOL && H_lb == 0 && H_ub == 0)) + { + FLOAT_LOAD_END_C_strided(I, step, O_w_left, _C_ob); + } + else if (op_type == OP_UPSAMPLE) + { + FLOAT_LOAD_END_C_upsample(I, _stride, _C_ib, O_w_left, _C_ob); + } + } + else + { + if constexpr(op_type == OP_ADD && op_class == 3) + { + FLOAT_ZERO_END_C(O_w_left, _C_ob); + } + FLOAT_LOAD_END_C(O, O_w_left, _C_ob); + if constexpr (op_type == OP_UPSAMPLE) + { + FLOAT_ACCUM_END_C_upsample(I, _stride, _C_ib, O_w_left, _C_ob); + } + } + + compute_with_padding( + H_lb, H_UPPER, + 0, F_w, + F_w, + O_w_left, + input_col_stride, + F, + I, + c_tile); + + if (op_type == OP_AVERAGE_POOL) + { + float norm = 1.0 / (1.0 * F_h * F_w); + FLOAT_DIV_END_C(c_tile, norm, O_w_left, _C_ob); + } + if constexpr(op_type == OP_ADD && op_class == 3 && _C_ob == 1) + { + /* If the operation reduces the channel dimension, reduce across channel dimension of simd tile*/ + FLOAT_REDUCE_CHANNEL_END_C(O_w_left, _C_ob) + } + + FLOAT_STORE_END_C(O, O_w_left, _C_ob); + } + + // right padding elements + AccumT *O_ptr = O + O_w_left * _C_ob; // ScalarT --> AccumT + ScalarT const *I_ptr = I + O_w_left * step; + int W_i_valid = F_w - 1; + + if (first) + { + FLOAT_ZERO_END_C(r_pad_el, _C_ob); + + // Initialize with 0 for the padding elements + + //@note padding should always be 'v' for pointwise operations, so this code path should not be used + if (op_type == OP_MUL) + { + FLOAT_LOAD_END_C_strided(I_ptr, step, r_pad_el, _C_ob); + } + } + else + { + 
FLOAT_LOAD_END_C(O_ptr, r_pad_el, _C_ob); + } + + c_tile_t *c_cur = c_tile; + // dim_t c_cur = 0; + for (uint32_t k_p = 0; k_p < r_pad_el; k_p++) + { + compute_with_padding( + H_lb, H_UPPER, + 0, W_i_valid, + F_w, + 1, + input_col_stride, + F, + I_ptr, + c_cur); + + c_cur += (_K_b * _G_b) / (FLOAT_SIMD_EPILOGUE); + W_i_valid -= _stride; + I_ptr += _stride * _F_cb * _G_b; + } + + if (op_type == OP_AVERAGE_POOL) + { + float norm = 1.0 / (1.0 * F_h * F_w); + FLOAT_DIV_END_C(c_tile, norm, r_pad_el, _C_ob); + } + + FLOAT_STORE_END_C(O_ptr, r_pad_el, _C_ob); +} + +//Edge case to handle remainder output channels +template +void inline rem_kernel_right( + bool first, + dim_t F_h, + dim_t F_w, + dim_t input_col_stride, + dim_t O_w_left, + dim_t r_pad_el, + dim_t r_pad, + ScalarT const *I, + ScalarT const *F, + AccumT *O, // ScalarT -> AccumT + const dim_t G_left, + const dim_t K_left, + dim_t H_lb = 0, + dim_t H_ub = 0) +{ + const dim_t _C_ob = G_left * K_left; + const dim_t _C_ib = G_left * _F_cb; + const dim_t step = _stride * _C_ib; + const dim_t H_UPPER = ((!H_ub) * (F_h)) + (H_ub); + FLOAT_DEF_END_C(_O_wb, _C_ob); + +#if DEBUG == 1 + printf("G_left %d K_left %d\n", G_left, K_left); + printf("kernel_right_rem\n"); + printf("First 5 input values: %f %f %f %f %f\n", I[0], I[1], I[2], I[3], I[4]); + if(op_type == OP_CONV) + printf("First 5 Filter values for output channel 0 : %f %f %f %f %f \n", F[0], F[1], F[2], F[3], F[4]); + printf("O_W_left %d r_pad_el %d\n", O_w_left, r_pad_el); + printf("input col stride %d\n", input_col_stride); +#endif + if (O_w_left) + { + if (first) + { + FLOAT_ZERO_END_C(O_w_left, _C_ob); + + if ((op_type == OP_MUL) || (op_type == OP_MAX_POOL && H_lb == 0 && H_ub == 0)) + { + FLOAT_LOAD_END_C_strided(I, step, O_w_left, _C_ob); + } + else if (op_type == OP_UPSAMPLE) + { + FLOAT_LOAD_END_C_upsample(I, _stride, _C_ib, O_w_left, _C_ob); + } + } + else + { + if constexpr (op_type == OP_ADD && op_class == 3) + { + FLOAT_ZERO_END_C(O_w_left, _C_ob); + } + FLOAT_LOAD_END_C(O, O_w_left, _C_ob); + if constexpr (op_type == OP_UPSAMPLE) + { + FLOAT_ACCUM_END_C_upsample(I, _stride, _C_ib, O_w_left, _C_ob); + } + } + compute_with_padding( + H_lb, H_UPPER, + 0, F_w, + F_w, + O_w_left, + input_col_stride, + F, + I, + c_tile, + _F_cb, + G_left, + K_left); + + if (op_type == OP_AVERAGE_POOL) + { + float norm = 1.0 / (1.0 * F_h * F_w); + FLOAT_DIV_END_C(c_tile, norm, O_w_left, _C_ob); + } + if (op_type == OP_ADD && op_class == 3 && _C_ob == 1) + { + /* If the operation reduces the channel dimension, reduce across channel dimension of simd tile*/ + FLOAT_REDUCE_REM_CHANNEL_END_C(O_w_left, _C_ob) + } + + FLOAT_STORE_END_C(O, O_w_left, _C_ob); +#if DEBUG == 1 + printf("First output value: %f %f %f %f \n", O[0], O[1], O[2], O[3]); +#endif + } + + // right padding elements + AccumT *O_ptr = O + O_w_left * _C_ob; // ScalarT --> AccumT + ScalarT const *I_ptr = I + O_w_left * step; + int W_i_valid = F_w - 1; + + if (first) + { + FLOAT_ZERO_END_C(r_pad_el, _C_ob); + + // Initialize with 0 for the padding elements + + //@note padding should always be 'v' for pointwise operations, so this code path should not be used + if (op_type == OP_MUL) + { + FLOAT_LOAD_END_C_strided(I_ptr, step, r_pad_el, _C_ob); + } + } + else + { + FLOAT_LOAD_END_C(O_ptr, r_pad_el, _C_ob); + } + + c_tile_t *c_cur = c_tile; + // dim_t c_cur = 0; + for (uint32_t k_p = 0; k_p < r_pad_el; k_p++) + { + compute_with_padding( + H_lb, H_UPPER, + 0, W_i_valid, + F_w, + 1, + input_col_stride, + F, + I_ptr, + c_cur, + _F_cb, + G_left, 
+ K_left); + + c_cur += (K_left * G_left) / (FLOAT_SIMD_EPILOGUE); + W_i_valid -= _stride; + I_ptr += _stride * _C_ib; + } + + if (op_type == OP_AVERAGE_POOL) + { + float norm = 1.0 / (1.0 * F_h * F_w); + FLOAT_DIV_END_C(c_tile, norm, r_pad_el, _C_ob); + } + + FLOAT_STORE_END_C(O_ptr, r_pad_el, _C_ob); +} + +//Edge case to handle remainder input channels +template +void inline kernel_right_rem( + bool first, + dim_t F_h, + dim_t F_w, + dim_t input_col_stride, + dim_t O_w_left, + dim_t r_pad_el, + dim_t r_pad, + ScalarT const *I, + ScalarT const *F, + AccumT *O, // ScalarT -> AccumT + const dim_t F_c_left, + dim_t H_lb = 0, + dim_t H_ub = 0) +{ + constexpr dim_t _C_ob = _G_b * _K_b; + const dim_t _C_ib = _G_b * F_c_left; + const dim_t step = _stride * _C_ib; + const dim_t H_UPPER = ((!H_ub) * (F_h)) + (H_ub); + FLOAT_DEF_END_C(_O_wb, _C_ob); + +#if DEBUG + printf("kernel_right_rem\n"); + printf("First 5 input values: %f %f %f %f %f\n", I[0], I[1], I[2], I[3], I[4]); + printf("First 5 Filter values for output channel 0 : %f %f %f %f %f \n", F[0*_C_ob], F[1*_C_ob], F[2*_C_ob], F[3*_C_ob], F[4*_C_ob]); + printf("O_W_left %d r_pad_el %d\n", O_w_left, r_pad_el); + printf("input col stride %d\n", input_col_stride); +#endif + if (O_w_left) + { + if (first) + { + FLOAT_ZERO_END_C(O_w_left, _C_ob); + + if ((op_type == OP_MUL) || (op_type == OP_MAX_POOL && H_lb == 0 && H_ub == 0)) + { + FLOAT_LOAD_END_C_strided(I, step, O_w_left, _C_ob); + } + else if (op_type == OP_UPSAMPLE) + { + FLOAT_LOAD_END_C_upsample(I, _stride, _C_ib, O_w_left, _C_ob); + } + } + else + { + if constexpr (op_type == OP_ADD && op_class == 3) + { + FLOAT_ZERO_END_C(O_w_left, _C_ob); + } + FLOAT_LOAD_END_C(O, O_w_left, _C_ob); + if constexpr (op_type == OP_UPSAMPLE) + { + FLOAT_ACCUM_END_C_upsample(I, _stride, _C_ib, O_w_left, _C_ob); + } + } + + + compute_with_padding( + H_lb, H_UPPER, + 0, F_w, + F_w, + O_w_left, + input_col_stride, + F, + I, + c_tile, + F_c_left); + + + + if (op_type == OP_AVERAGE_POOL) + { + float norm = 1.0 / (1.0 * F_h * F_w); + FLOAT_DIV_END_C(c_tile, norm, O_w_left, _C_ob); + } + if constexpr (op_type == OP_ADD && op_class == 3 && _C_ob == 1) + { + /* If the operation reduces the channel dimension, reduce across channel dimension of simd tile*/ + FLOAT_REDUCE_CHANNEL_END_C(O_w_left, _C_ob) + } + + FLOAT_STORE_END_C(O, O_w_left, _C_ob); + } + + // right padding elements + AccumT *O_ptr = O + O_w_left * _C_ob; // ScalarT --> AccumT + ScalarT const *I_ptr = I + O_w_left * step; + int W_i_valid = F_w - 1; + + if (first) + { + FLOAT_ZERO_END_C(r_pad_el, _C_ob); + + // Initialize with 0 for the padding elements + + //@note padding should always be 'v' for pointwise operations, so this code path should not be used + if (op_type == OP_MUL) + { + FLOAT_LOAD_END_C_strided(I_ptr, step, r_pad_el, _C_ob); + } + } + else + { + FLOAT_LOAD_END_C(O_ptr, r_pad_el, _C_ob); + } + + c_tile_t *c_cur = c_tile; + // dim_t c_cur = 0; + + for (uint32_t k_p = 0; k_p < r_pad_el; k_p++) + { + compute_with_padding( + H_lb, H_UPPER, + 0, W_i_valid, + F_w, + 1, + input_col_stride, + F, + I_ptr, + c_cur, + F_c_left); + + c_cur += (_K_b * _G_b) / (FLOAT_SIMD_EPILOGUE); + W_i_valid -= _stride; + I_ptr += step; + } + + if (op_type == OP_AVERAGE_POOL) + { + float norm = 1.0 / (1.0 * F_h * F_w); + FLOAT_DIV_END_C(c_tile, norm, r_pad_el, _C_ob); + } + + FLOAT_STORE_END_C(O_ptr, r_pad_el, _C_ob); + + #if DEBUG + printf("First output value: %f %f %f %f \n", O[0], O[1], O[2], O[3]); + #endif +} + +//Edge case to handle remainder input and output 
channels +template +void inline rem_kernel_right_rem( + bool first, + dim_t F_h, + dim_t F_w, + dim_t input_col_stride, + dim_t O_w_left, + dim_t r_pad_el, + dim_t r_pad, + ScalarT const *I, + ScalarT const *F, + AccumT *O, // ScalarT -> AccumT + const dim_t F_c_left, + const dim_t G_left, + const dim_t K_left, + dim_t H_lb = 0, + dim_t H_ub = 0) +{ + const dim_t _C_ob = G_left * K_left; + const dim_t _C_ib = G_left * F_c_left; + const dim_t step = _stride * _C_ib; + const dim_t H_UPPER = ((!H_ub) * (F_h)) + (H_ub); + FLOAT_DEF_END_C(_O_wb, _C_ob); + +#if DEBUG + printf("kernel_right_rem\n"); + printf("First 5 input values: %f %f %f %f %f\n", I[0], I[1], I[2], I[3], I[4]); + printf("First 5 Filter values for output channel 0 : %f %f %f %f %f \n", F[0 * _C_ob], F[1 * _C_ob], F[2 * _C_ob], F[3 * _C_ob], F[4 * _C_ob]); printf("O_W_left %d r_pad_el %d\n", O_w_left, r_pad_el); + printf("input col stride %d\n", input_col_stride); #endif if (O_w_left) { @@ -440,7 +1823,7 @@ void inline kernel_right( { FLOAT_ZERO_END_C(O_w_left, _C_ob); - if ( (op_type == OP_MUL)|| (op_type == OP_MAX_POOL && H_lb == 0 && H_ub == 0)) + if ((op_type == OP_MUL) || (op_type == OP_MAX_POOL && H_lb == 0 && H_ub == 0)) { FLOAT_LOAD_END_C_strided(I, step, O_w_left, _C_ob); } @@ -451,38 +1834,41 @@ void inline kernel_right( } else { - if constexpr(op_type == OP_ADD && op_class == 3) + if constexpr (op_type == OP_ADD && op_class == 3) { FLOAT_ZERO_END_C(O_w_left, _C_ob); } FLOAT_LOAD_END_C(O, O_w_left, _C_ob); if constexpr (op_type == OP_UPSAMPLE) - { - FLOAT_ACCUM_END_C_upsample(I, _stride, _C_ib, O_w_left, _C_ob); - } + { + FLOAT_ACCUM_END_C_upsample(I, _stride, _C_ib, O_w_left, _C_ob); + } } compute_with_padding( - H_lb, H_UPPER, - 0, F_w, - F_w, - O_w_left, - input_col_stride, - F, - I, - c_tile); + H_lb, H_UPPER, + 0, F_w, + F_w, + O_w_left, + input_col_stride, + F, + I, + c_tile, + F_c_left, + G_left, + K_left); if (op_type == OP_AVERAGE_POOL) { float norm = 1.0 / (1.0 * F_h * F_w); FLOAT_DIV_END_C(c_tile, norm, O_w_left, _C_ob); } - if constexpr(op_type == OP_ADD && op_class == 3 && _C_ob == 1) + if (op_type == OP_ADD && op_class == 3 && _C_ob == 1) { /* If the operation reduces the channel dimension, reduce across channel dimension of simd tile*/ - FLOAT_REDUCE_CHANNEL_END_C(O_w_left, _C_ob) + FLOAT_REDUCE_REM_CHANNEL_END_C(O_w_left, _C_ob) } FLOAT_STORE_END_C(O, O_w_left, _C_ob); @@ -510,37 +1896,379 @@ void inline kernel_right( FLOAT_LOAD_END_C(O_ptr, r_pad_el, _C_ob); } - c_tile_t *c_cur = c_tile; - // dim_t c_cur = 0; - for (uint32_t k_p = 0; k_p < r_pad_el; k_p++) - { - compute_with_padding( - H_lb, H_UPPER, - 0, W_i_valid, - F_w, - 1, - input_col_stride, - F, - I_ptr, - c_cur); + c_tile_t *c_cur = c_tile; + // dim_t c_cur = 0; + + for (uint32_t k_p = 0; k_p < r_pad_el; k_p++) + { + compute_with_padding( + H_lb, H_UPPER, + 0, W_i_valid, + F_w, + 1, + input_col_stride, + F, + I_ptr, + c_cur, + F_c_left, + G_left, + K_left); + + c_cur += (G_left * K_left) / (FLOAT_SIMD_EPILOGUE); + W_i_valid -= _stride; + I_ptr += step; + } + + if (op_type == OP_AVERAGE_POOL) + { + float norm = 1.0 / (1.0 * F_h * F_w); + FLOAT_DIV_END_C(c_tile, norm, r_pad_el, _C_ob); + } + + FLOAT_STORE_END_C(O_ptr, r_pad_el, _C_ob); + +#if DEBUG + printf("First output value: %f %f %f %f \n", O[0], O[1], O[2], O[3]); +#endif +} + +//**************************************************************************** +template +void inline kernel_bottom( + bool first, + dim_t F_h, + dim_t F_w, + dim_t input_col_stride, + dim_t b_pad_el, + dim_t b_pad, + 
dim_t W_full_index, + dim_t l_pad_el, + dim_t l_pad, + dim_t O_w_w_pad, + dim_t O_w_full, + dim_t O_w_left, + dim_t r_pad_el, + dim_t r_pad, + ScalarT const *I, + ScalarT const *F, + AccumT *O) // ScalarT -> AccumT +{ + ScalarT const *I_ptr = I; + AccumT *O_ptr = O; // ScalarT -> AccumT + + int H_i_valid = F_h - 1; + + for (uint32_t j_p = 0; j_p < b_pad_el; j_p++) + { + // Prologue with left padding + kernel_left( + first, + F_h, + F_w, + input_col_stride, + l_pad_el, + l_pad, + I_ptr, + F, + O_ptr, + 0, + H_i_valid); + + ScalarT const *I_row_full = I + W_full_index * (_F_cb * _G_b); + AccumT *O_row_full = O + l_pad_el * (_G_b * _K_b); // ScalarT -> AccumT + // Steady State with microkernel + for (index_t l = 0; l < O_w_full; l += _O_wb) + { + ScalarT const *I_col = I_row_full + (l * _stride) * (_F_cb * _G_b); + ScalarT const *F_col = F + 0; + AccumT *O_col = O_row_full + l * (_G_b * _K_b); // ScalarT -> AccumT + + kernel_pad( + first, + F_h, + F_w, + input_col_stride, + I_col, + F_col, + O_col, + 0, + H_i_valid, + 0, /// @todo This was added, W_lb. Is it right? + 0); /// @todo This was added, W_ub. Is it right?); + } + + // Epilogue for microkernel + right padding elements + ScalarT const *I_col_left = + I_row_full + (O_w_full * _stride) * (_F_cb * _G_b); + ScalarT const *F_col_left = F + 0; + AccumT *O_col_left = O_row_full + O_w_full * (_G_b * _K_b); // ScalarT -> AccumT + + kernel_right( + first, + F_h, + F_w, + input_col_stride, + O_w_left, + r_pad_el, + r_pad, + I_col_left, + F_col_left, + O_col_left, + 0, /// @todo confirm this, H_lb + H_i_valid); /// @todo confirm this, H_ub + + O_ptr += O_w_w_pad * _K_b * _G_b; + + H_i_valid -= _stride; + I_ptr += _stride * _F_cb * _G_b; + } +} + +//Edge case to handle remainder output channels +template +void inline rem_kernel_bottom( + bool first, + dim_t F_h, + dim_t F_w, + dim_t input_col_stride, + dim_t b_pad_el, + dim_t b_pad, + dim_t W_full_index, + dim_t l_pad_el, + dim_t l_pad, + dim_t O_w_w_pad, + dim_t O_w_full, + dim_t O_w_left, + dim_t r_pad_el, + dim_t r_pad, + ScalarT const *I, + ScalarT const *F, + AccumT *O, + const dim_t G_left, + const dim_t K_left) // ScalarT -> AccumT +{ + ScalarT const *I_ptr = I; + AccumT *O_ptr = O; // ScalarT -> AccumT + + int H_i_valid = F_h - 1; + + for (uint32_t j_p = 0; j_p < b_pad_el; j_p++) + { + // Prologue with left padding + rem_kernel_left( + first, + F_h, + F_w, + input_col_stride, + l_pad_el, + l_pad, + I_ptr, + F, + O_ptr, + G_left, + K_left, + 0, + H_i_valid); + + ScalarT const *I_row_full = I + W_full_index * (_F_cb * G_left); + AccumT *O_row_full = O + l_pad_el * (G_left * K_left); // ScalarT -> AccumT + // Steady State with microkernel + for (index_t l = 0; l < O_w_full; l += _O_wb) + { + ScalarT const *I_col = I_row_full + (l * _stride) * (_F_cb * G_left); + ScalarT const *F_col = F + 0; + AccumT *O_col = O_row_full + l * (G_left * K_left); // ScalarT -> AccumT + + rem_kernel_pad( + first, + F_h, + F_w, + input_col_stride, + I_col, + F_col, + O_col, + G_left, + K_left, + 0, + H_i_valid, + 0, /// @todo This was added, W_lb. Is it right? + 0); /// @todo This was added, W_ub. 
Is it right?); + } + + // Epilogue for microkernel + right padding elements + ScalarT const *I_col_left = + I_row_full + (O_w_full * _stride) * (_F_cb * G_left); + ScalarT const *F_col_left = F + 0; + AccumT *O_col_left = O_row_full + O_w_full * (G_left * K_left); // ScalarT -> AccumT + + rem_kernel_right( + first, + F_h, + F_w, + input_col_stride, + O_w_left, + r_pad_el, + r_pad, + I_col_left, + F_col_left, + O_col_left, + G_left, + K_left, + 0, /// @todo confirm this, H_lb + H_i_valid); /// @todo confirm this, H_ub + + O_ptr += O_w_w_pad * G_left * K_left; + + H_i_valid -= _stride; + I_ptr += _stride * _F_cb * G_left; + } +} + +// Edge case to handle remainder input channels +template +void inline kernel_bottom_rem( + bool first, + dim_t F_h, + dim_t F_w, + dim_t input_col_stride, + dim_t b_pad_el, + dim_t b_pad, + dim_t W_full_index, + dim_t l_pad_el, + dim_t l_pad, + dim_t O_w_w_pad, + dim_t O_w_full, + dim_t O_w_left, + dim_t r_pad_el, + dim_t r_pad, + ScalarT const *I, + ScalarT const *F, + AccumT *O, + const dim_t F_c_left) // ScalarT -> AccumT +{ + ScalarT const *I_ptr = I; + AccumT *O_ptr = O; // ScalarT -> AccumT + + int H_i_valid = F_h - 1; + + for (uint32_t j_p = 0; j_p < b_pad_el; j_p++) + { + // Prologue with left padding + kernel_left_rem( + first, + F_h, + F_w, + input_col_stride, + l_pad_el, + l_pad, + I_ptr, + F, + O_ptr, + F_c_left, + 0, + H_i_valid); + + ScalarT const *I_row_full = I + W_full_index * (F_c_left * _G_b); + AccumT *O_row_full = O + l_pad_el * (_G_b * _K_b); // ScalarT -> AccumT + // Steady State with microkernel + for (index_t l = 0; l < O_w_full; l += _O_wb) + { + ScalarT const *I_col = I_row_full + (l * _stride) * (F_c_left * _G_b); + ScalarT const *F_col = F + 0; + AccumT *O_col = O_row_full + l * (_G_b * _K_b); // ScalarT -> AccumT + + kernel_pad_rem( + first, + F_h, + F_w, + input_col_stride, + I_col, + F_col, + O_col, + F_c_left, + 0, + H_i_valid, + 0, /// @todo This was added, W_lb. Is it right? + 0); /// @todo This was added, W_ub. 
Is it right?); + } + + // Epilogue for microkernel + right padding elements + ScalarT const *I_col_left = + I_row_full + (O_w_full * _stride) * (F_c_left * _G_b); + ScalarT const *F_col_left = F + 0; + AccumT *O_col_left = O_row_full + O_w_full * (_G_b * _K_b); // ScalarT -> AccumT + + kernel_right_rem( + first, + F_h, + F_w, + input_col_stride, + O_w_left, + r_pad_el, + r_pad, + I_col_left, + F_col_left, + O_col_left, + F_c_left, + 0, /// @todo confirm this, H_lb + H_i_valid); /// @todo confirm this, H_ub - c_cur += (_K_b * _G_b) / (FLOAT_SIMD_EPILOGUE); - W_i_valid -= _stride; - I_ptr += _stride * _F_cb * _G_b; - } + O_ptr += O_w_w_pad * _K_b * _G_b; - if (op_type == OP_AVERAGE_POOL) - { - float norm = 1.0 / (1.0 * F_h * F_w); - FLOAT_DIV_END_C(c_tile, norm, r_pad_el, _C_ob); + H_i_valid -= _stride; + I_ptr += _stride * F_c_left * _G_b; } - - FLOAT_STORE_END_C(O_ptr, r_pad_el, _C_ob); } -//**************************************************************************** +// Edge case to handle remainder input and output channels +// Edge case to handle remainder input channels template -void inline kernel_bottom( +void inline rem_kernel_bottom_rem( bool first, dim_t F_h, dim_t F_w, @@ -568,8 +2296,15 @@ void inline kernel_bottom( dim_t r_pad, ScalarT const *I, ScalarT const *F, - AccumT *O) // ScalarT -> AccumT + AccumT *O, + const dim_t F_c_left, + const dim_t G_left, + const dim_t K_left) // ScalarT -> AccumT { + const dim_t _C_ob = G_left * K_left; + const dim_t _C_ib = G_left * F_c_left; + const dim_t step = _stride * _C_ib; + ScalarT const *I_ptr = I; AccumT *O_ptr = O; // ScalarT -> AccumT @@ -578,72 +2313,81 @@ void inline kernel_bottom( for (uint32_t j_p = 0; j_p < b_pad_el; j_p++) { // Prologue with left padding - kernel_left( - first, - F_h, - F_w, - input_col_stride, - l_pad_el, - l_pad, - I_ptr, - F, - O_ptr, - 0, - H_i_valid); - - ScalarT const *I_row_full = I + W_full_index * (_F_cb * _G_b); - AccumT *O_row_full = O + l_pad_el * (_G_b * _K_b); // ScalarT -> AccumT + rem_kernel_left_rem( + first, + F_h, + F_w, + input_col_stride, + l_pad_el, + l_pad, + I_ptr, + F, + O_ptr, + F_c_left, + G_left, + K_left, + 0, + H_i_valid); + + ScalarT const *I_row_full = I + W_full_index * (_C_ib); + AccumT *O_row_full = O + l_pad_el * (_C_ob); // ScalarT -> AccumT // Steady State with microkernel for (index_t l = 0; l < O_w_full; l += _O_wb) { - ScalarT const *I_col = I_row_full + (l * _stride) * (_F_cb * _G_b); + ScalarT const *I_col = I_row_full + (l * _stride) * (_C_ib); ScalarT const *F_col = F + 0; - AccumT *O_col = O_row_full + l * (_G_b * _K_b); // ScalarT -> AccumT - - kernel_pad( - first, - F_h, - F_w, - input_col_stride, - I_col, - F_col, - O_col, - 0, - H_i_valid, - 0, /// @todo This was added, W_lb. Is it right? - 0); /// @todo This was added, W_ub. Is it right?); + AccumT *O_col = O_row_full + l * (_C_ob); // ScalarT -> AccumT + + rem_kernel_pad_rem( + first, + F_h, + F_w, + input_col_stride, + I_col, + F_col, + O_col, + F_c_left, + G_left, + K_left, + 0, + H_i_valid, + 0, /// @todo This was added, W_lb. Is it right? + 0); /// @todo This was added, W_ub. 
Is it right?); } // Epilogue for microkernel + right padding elements ScalarT const *I_col_left = - I_row_full + (O_w_full * _stride) * (_F_cb * _G_b); + I_row_full + (O_w_full * _stride) * (_C_ib); ScalarT const *F_col_left = F + 0; - AccumT *O_col_left = O_row_full + O_w_full * (_G_b * _K_b); // ScalarT -> AccumT - - kernel_right( - first, - F_h, - F_w, - input_col_stride, - O_w_left, - r_pad_el, - r_pad, - I_col_left, - F_col_left, - O_col_left, - 0, /// @todo confirm this, H_lb - H_i_valid); /// @todo confirm this, H_ub - - O_ptr += O_w_w_pad * _K_b * _G_b; + AccumT *O_col_left = O_row_full + O_w_full * (_C_ob); // ScalarT -> AccumT + + rem_kernel_right_rem( + first, + F_h, + F_w, + input_col_stride, + O_w_left, + r_pad_el, + r_pad, + I_col_left, + F_col_left, + O_col_left, + F_c_left, + G_left, + K_left, + 0, /// @todo confirm this, H_lb + H_i_valid); /// @todo confirm this, H_ub + + O_ptr += O_w_w_pad * (_C_ob); H_i_valid -= _stride; - I_ptr += _stride * _F_cb * _G_b; + I_ptr += _stride * (_C_ib); } } @@ -750,7 +2494,366 @@ void inline kernel_top( H_i_valid, F_h); - O_ptr += O_w_w_pad * _K_b * _G_b; + O_ptr += O_w_w_pad * _K_b * _G_b; + H_i_valid += _stride; + // I_ptr += _stride * _F_cb * _G_b; + } +} + +//Edge case to handle remainder output channels +template +void inline rem_kernel_top( + bool first, + dim_t F_h, + dim_t F_w, + dim_t input_col_stride, + dim_t t_pad_el, + dim_t t_pad, + dim_t W_full_index, + dim_t l_pad_el, + dim_t l_pad, + dim_t O_w_w_pad, + dim_t O_w_full, + dim_t O_w_left, + dim_t r_pad_el, + dim_t r_pad, + ScalarT const *I, + ScalarT const *F, + AccumT *O, + const dim_t G_left, + const dim_t K_left) // ScalarT --> AccumT +{ + const dim_t _C_ob = G_left * K_left; + const dim_t _C_ib = G_left * _F_cb; + const dim_t step = _stride * _C_ib; + + ScalarT const *I_ptr = I; + AccumT *O_ptr = O; // ScalarT --> AccumT + + int H_i_valid = t_pad; + + for (uint32_t j_p = 0; j_p < t_pad_el; j_p++) + { + // Prologue with left padding + rem_kernel_left( + first, + F_h, + F_w, + input_col_stride, + l_pad_el, + l_pad, + I_ptr, + F, + O_ptr, + G_left, + K_left, + H_i_valid, + F_h); + + ScalarT const *I_row_full = I + W_full_index * (_C_ib); + AccumT *O_row_full = O + l_pad_el * (_C_ob); // ScalarT --> AccumT + + // Steady State with microkernel + for (index_t l = 0; l < O_w_full; l += _O_wb) + { + ScalarT const *I_col = + I_row_full + (l * _stride) * (_C_ib); + ScalarT const *F_col = F + 0; + AccumT *O_col = O_row_full + l * (_C_ob); // ScalarT --> AccumT + + rem_kernel_pad( + first, + F_h, + F_w, + input_col_stride, + I_col, + F_col, + O_col, + G_left, + K_left, + H_i_valid, // H_lb + F_h, // H_ub + 0, // W_lb + 0); /// @todo Confirm this, W_ub. Is it right? 
q_abstract_layer has F_w + } + + // Epilogue for microkernel + right padding elements + ScalarT const *I_col_left = + I_row_full + (O_w_full * _stride) * (_C_ib); + ScalarT const *F_col_left = F + 0; + AccumT *O_col_left = + O_row_full + O_w_full * (_C_ob); // ScalarT --> AccumT + + rem_kernel_right( + first, + F_h, + F_w, + input_col_stride, + O_w_left, + r_pad_el, + r_pad, + I_col_left, + F_col_left, + O_col_left, + G_left, + K_left, + H_i_valid, + F_h); + + O_ptr += O_w_w_pad * _C_ob; + H_i_valid += _stride; + } + +} + +// Edge case to handle remainder input channels +template +void inline kernel_top_rem( + bool first, + dim_t F_h, + dim_t F_w, + dim_t input_col_stride, + dim_t t_pad_el, + dim_t t_pad, + dim_t W_full_index, + dim_t l_pad_el, + dim_t l_pad, + dim_t O_w_w_pad, + dim_t O_w_full, + dim_t O_w_left, + dim_t r_pad_el, + dim_t r_pad, + ScalarT const *I, + ScalarT const *F, + AccumT *O, + const dim_t F_c_left) // ScalarT --> AccumT +{ + ScalarT const *I_ptr = I; + AccumT *O_ptr = O; // ScalarT --> AccumT + + int H_i_valid = t_pad; + + for (uint32_t j_p = 0; j_p < t_pad_el; j_p++) + { + // Prologue with left padding + kernel_left_rem( + first, + F_h, + F_w, + input_col_stride, + l_pad_el, + l_pad, + I_ptr, + F, + O_ptr, + F_c_left, + H_i_valid, + F_h); + + ScalarT const *I_row_full = I + W_full_index * (F_c_left * _G_b); + AccumT *O_row_full = O + l_pad_el * (_G_b * _K_b); // ScalarT --> AccumT + + // Steady State with microkernel + for (index_t l = 0; l < O_w_full; l += _O_wb) + { + ScalarT const *I_col = + I_row_full + (l * _stride) * (F_c_left * _G_b); + ScalarT const *F_col = F + 0; + AccumT *O_col = O_row_full + l * (_G_b * _K_b); // ScalarT --> AccumT + + kernel_pad_rem( + first, + F_h, + F_w, + input_col_stride, + I_col, + F_col, + O_col, + F_c_left, + H_i_valid, // H_lb + F_h, // H_ub + 0, // W_lb + 0); /// @todo Confirm this, W_ub. Is it right? 
q_abstract_layer has F_w + } + + // Epilogue for microkernel + right padding elements + ScalarT const *I_col_left = + I_row_full + (O_w_full * _stride) * (F_c_left * _G_b); + ScalarT const *F_col_left = F + 0; + AccumT *O_col_left = + O_row_full + O_w_full * (_G_b * _K_b); // ScalarT --> AccumT + + kernel_right_rem( + first, + F_h, + F_w, + input_col_stride, + O_w_left, + r_pad_el, + r_pad, + I_col_left, + F_col_left, + O_col_left, + F_c_left, + H_i_valid, + F_h); + + O_ptr += O_w_w_pad * _K_b * _G_b; + H_i_valid += _stride; + // I_ptr += _stride * _F_cb * _G_b; + } +} + +// Edge case to handle remainder input and output channels +template +void inline rem_kernel_top_rem( + bool first, + dim_t F_h, + dim_t F_w, + dim_t input_col_stride, + dim_t t_pad_el, + dim_t t_pad, + dim_t W_full_index, + dim_t l_pad_el, + dim_t l_pad, + dim_t O_w_w_pad, + dim_t O_w_full, + dim_t O_w_left, + dim_t r_pad_el, + dim_t r_pad, + ScalarT const *I, + ScalarT const *F, + AccumT *O, + const dim_t F_c_left, + const dim_t G_left, + const dim_t K_left) // ScalarT --> AccumT +{ + const dim_t _C_ob = G_left * K_left; + const dim_t _C_ib = G_left * F_c_left; + const dim_t step = _stride * _C_ib; + + ScalarT const *I_ptr = I; + AccumT *O_ptr = O; // ScalarT --> AccumT + + int H_i_valid = t_pad; + + for (uint32_t j_p = 0; j_p < t_pad_el; j_p++) + { + // Prologue with left padding + rem_kernel_left_rem( + first, + F_h, + F_w, + input_col_stride, + l_pad_el, + l_pad, + I_ptr, + F, + O_ptr, + F_c_left, + G_left, + K_left, + H_i_valid, + F_h); + + ScalarT const *I_row_full = I + W_full_index * (_C_ib); + AccumT *O_row_full = O + l_pad_el * (_C_ob); // ScalarT --> AccumT + + // Steady State with microkernel + for (index_t l = 0; l < O_w_full; l += _O_wb) + { + ScalarT const *I_col = + I_row_full + (l * _stride) * (_C_ib); + ScalarT const *F_col = F + 0; + AccumT *O_col = O_row_full + l * (_C_ob); // ScalarT --> AccumT + + rem_kernel_pad_rem( + first, + F_h, + F_w, + input_col_stride, + I_col, + F_col, + O_col, + F_c_left, + G_left, + K_left, + H_i_valid, // H_lb + F_h, // H_ub + 0, // W_lb + 0); /// @todo Confirm this, W_ub. Is it right? 
q_abstract_layer has F_w + } + + // Epilogue for microkernel + right padding elements + ScalarT const *I_col_left = + I_row_full + (O_w_full * _stride) * (_C_ib); + ScalarT const *F_col_left = F + 0; + AccumT *O_col_left = + O_row_full + O_w_full * (_C_ob); // ScalarT --> AccumT + + rem_kernel_right_rem( + first, + F_h, + F_w, + input_col_stride, + O_w_left, + r_pad_el, + r_pad, + I_col_left, + F_col_left, + O_col_left, + F_c_left, + G_left, + K_left, + H_i_valid, + F_h); + + O_ptr += O_w_w_pad * _C_ob; H_i_valid += _stride; // I_ptr += _stride * _F_cb * _G_b; } @@ -834,6 +2937,11 @@ void abstract_layer( * I: [G/G_b, F_c/F_cb, I_h, I_w, F_cb, G_b ] * F: [G/G_b, K/K_b, F_c/F_cb, F_h, F_w, F_cb, G_b, K_b] * O: [G/G_b, K/K_b, O_h, O_w, G_b, K_b] + * + * For the case where the number of channels is not a multiple of the blocking size, + * I: + * F: + * O: */ //************************************************************************ @@ -911,9 +3019,37 @@ void abstract_layer( const dim_t O_w_left = O_w - O_w_full; const dim_t O_hxO_w = O_h_w_pad * O_w_w_pad; + + // Number of input channels in reduction is not a multiple of blocking size + const dim_t F_c_full = (F_c / _F_cb) * _F_cb; + // When the number of channels is not a multiple of blocking size - // const dim_t K_full = (K / _K_b) * _K_b; - // const dim_t K_left = K - K_full; + const dim_t K_full = (K / _K_b) * _K_b; + + //When the number of groups is not a multiple of blocking size + const dim_t G_full = (G / _G_b) * _G_b; + + + //We need to do in the edge case if either K_left or G_left is non-zero + // HACK: THIS CALCULATE IS IMPRECISE (group convs won't work) + + const dim_t F_c_full_idx = (F_c_full == 1)? 0: F_c_full; + + const dim_t K_full_idx = (K_full == 1)? 0: K_full; + const dim_t K_left = K - K_full_idx; + + // When the number of groups is not a multiple of blocking size + const dim_t G_full_idx = (G_full == 1)? 
0: G_full; + const dim_t G_left = G - G_full_idx; + + const dim_t F_c_left = F_c - F_c_full_idx; + const dim_t _C_ib_left = F_c_left * _G_b; + const dim_t _C_ib_left_group = G_left * _F_cb; + const dim_t _C_ib_left_group_channels = G_left * F_c_left; + + //Only handling the case where G_left*K_left < FLOAT_C_ob + //There could be a case where G_left*K_left == FLOAT_C_ob + //todo: handle remaining cases #if DEBUG == 1 printf("\t\t I_h %d I_w %d F_C %d G %d \n", I_h, I_w, F_c, G); @@ -932,6 +3068,7 @@ void abstract_layer( printf("params: F_Cb %d G_b %d K_b %d\n", _F_cb, _G_b, _K_b); printf("rewrite output?: %d, op type/class: %d/%d\n", rewrite_output, op_type, op_class); + printf("F_c_full: %d F_c_left: %d \n", F_c_full, F_c_left); #endif // Set up parallelism for the channel loops @@ -970,7 +3107,7 @@ void abstract_layer( auto group_tid = ((t_id / (T_channel * T_height))) % T_group; // loops over output channels - for (index_t g = group_tid; g < G / _G_b; g += T_group) + for (index_t g = group_tid; g < G_full / _G_b; g += T_group) { ScalarT const *I_group; if constexpr (op_type == OP_UPSAMPLE && _stride == std::numeric_limits::max()) @@ -994,8 +3131,9 @@ void abstract_layer( F_group = F_buf + g * (K * F_c * F_h * F_w * _G_b); } - // resuse O_group as a uint32_t array - for (index_t k = channel_tid; k < K / _K_b; k += T_channel) + // reuse O_group as a uint32_t array + // k'th block over K_full + for (index_t k = channel_tid; k < K_full / _K_b; k += T_channel) { ScalarT const *I_channel_block_output = I_group + 0; @@ -1005,8 +3143,8 @@ void abstract_layer( O_group + k * (O_hxO_w * _G_b * _K_b); //************************************************************ - // Loop over input channel reduction - for (index_t i = 0; i < (F_c / _F_cb); i++) + // Loop over input channel reduction (multiple of blocking size) + for (index_t i = 0; i < (F_c_full / _F_cb); i++) { bool first = rewrite_output && (i == 0); @@ -1196,6 +3334,681 @@ void abstract_layer( F_row_bot, O_row_bot); } + + + // Loop over remaining channels + // _UNROLL defaults to 1 + //This loop should have 1 iteration + for (index_t i = F_c_full; i < F_c; i+=F_c_left) + { + // printf("F_c_full: %d F_c: %d iter: %i \n", F_c_full, F_c, i); + bool first = rewrite_output && (i == 0); + + ScalarT const *I_channel_block_input = + I_channel_block_output + (i/_F_cb) * (I_h * I_w * _F_cb * _G_b); + ScalarT const *F_channel_block_input = + F_channel_block_output + (i/_F_cb) * (F_h * F_w * _F_cb * _G_b * _K_b); + ScalarT *O_channel_block_input = + O_channel_block_output + 0; + + // Loops over spatial dimensions of output + + // Prologue with top padding + ScalarT const *I_row_top = I_channel_block_input; + ScalarT const *F_row_top = F_channel_block_input + 0; + AccumT *O_row_top = O_channel_block_input; // ScalarT --> AccumT + + kernel_top_rem( + first, + F_h, + F_w, + I_w * _C_ib_left, + t_pad_el, + pad_top, + W_full_index, + l_pad_el, + pad_left, + O_w_w_pad, + O_w_full, + O_w_left, + r_pad_el, + pad_right, + I_row_top, + F_row_top, + O_row_top, + F_c_left); + + ScalarT const *I_row_full = + I_row_top + H_full_index * I_w * _C_ib_left; //(_F_cb * _G_b); + AccumT *O_row_full = + O_row_top + t_pad_el * O_w_w_pad * (_G_b * _K_b); // ScalarT --> AccumT + + // Steady State over rows + // The stride over input channels is the number of remaining channels + for (index_t j = height_tid; j < O_h; j += T_height) + { + ScalarT const *I_row; + // @todo cast index calculation as int and make stride a float value. 
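(Editor's note, illustrative only, not part of the patch.) The remainder bookkeeping above splits each channel dimension into a fully blocked part and a leftover part. A minimal worked example with hypothetical sizes F_c = 35, K = 20, G = 1 and the usual blocking _F_cb = 16, _K_b = 16, _G_b = 1:

// Hypothetical sizes; the formulas are the ones computed above.
// F_c = 35, _F_cb = 16:  F_c_full = (35 / 16) * 16 = 32,  F_c_full_idx = 32,  F_c_left = 3
// K   = 20, _K_b  = 16:  K_full   = (20 / 16) * 16 = 16,  K_full_idx   = 16,  K_left   = 4
// G   =  1, _G_b  =  1:  G_full   = 1,  G_full_idx = 0 (via the G_full == 1 hack),  G_left = 1
//
// The main loops then cover g < G_full / _G_b, k < K_full / _K_b, i < F_c_full / _F_cb;
// each remainder loop below runs a single iteration (i = 32, g = 0, k = 16), and this
// example satisfies the stated restriction G_left * K_left = 4 < FLOAT_C_ob = 16.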
+ // I_x = I_x + (int)(j * _stride) * () + if constexpr (op_type == OP_UPSAMPLE) + { + I_row = I_row_full + (j / _stride) * (I_w * _F_cb * _G_b); + } + else + { + I_row = I_row_full + (j * _stride) * (I_w * /*_F_cb * _G_b*/ _C_ib_left); + } + ScalarT const *F_row = F_channel_block_input + 0; + AccumT *O_row = + O_row_full + j * (O_w_w_pad * _G_b * _K_b); // ScalarT --> AccumT + // Prologue with left padding + kernel_left_rem( + first, + F_h, + F_w, + I_w * _C_ib_left, + l_pad_el, + pad_left, + I_row, + F_row, + O_row, + F_c_left, + 0, + 0); + + ScalarT const *I_col_full = + I_row + W_full_index * /*(_F_cb * _G_b)*/ _C_ib_left; + AccumT *O_col_full = O_row + l_pad_el * (_G_b * _K_b); // ScalarT --> AccumT + // Steady State with microkernel + for (index_t l = 0; l < O_w_full; l += _O_wb) + { + ScalarT const *I_col; + // @todo cast index calculation as int and make stride a float value. + // I_x = I_x + (int)(j * _stride) * () + if constexpr (op_type == OP_UPSAMPLE) + { + I_col = I_col_full + (l / _stride) * (_F_cb * _G_b); + } + else + { + I_col = I_col_full + (l * _stride) * _C_ib_left /*(_F_cb * _G_b)*/; + } + ScalarT const *F_col = F_row + 0; + AccumT *O_col = O_col_full + l * (_G_b * _K_b); // ScalarT --> AccumT + + kernel_rem( + first, + F_h, + F_w, + I_w * _C_ib_left, + I_col, + F_col, + O_col, + F_c_left, + 0, + 0, + 0, + 0); + } + +#if DEBUG + printf(" end kernel\n"); +#endif + + // Epilogue for microkernel + right padding elements + ScalarT const *I_col_left; + if constexpr (op_type == OP_UPSAMPLE) + { + I_col_left = + I_col_full + (O_w_full / _stride) * (_F_cb * _G_b); + } + else + { + I_col_left = + I_col_full + (O_w_full * _stride) * _C_ib_left/*(_F_cb * _G_b)*/; + } + + ScalarT const *F_col_left = F_row + 0; + AccumT *O_col_left = O_col_full + O_w_full * (_G_b * _K_b); // ScalarT --> AccumT + +#if DEBUG + printf("calling right\n"); +#endif + kernel_right_rem( + first, + F_h, + F_w, + I_w * _C_ib_left, + O_w_left, + r_pad_el, + pad_right, + I_col_left, + F_col_left, + O_col_left, + F_c_left, + 0, + 0); + } + // Epilogue with bottom padding + ScalarT const *I_row_bot; + // @todo cast index calculation as int and make stride a float value. 
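(Editor's note, illustrative only, not part of the patch.) In this leftover-input-channel path the packed input slab is only F_c_left channels deep, so every input stride shrinks accordingly: the kernels are called with input_col_stride = I_w * _C_ib_left, and each output row advances the input pointer by (j * _stride) * (I_w * _C_ib_left) instead of the full-block I_w * _F_cb * _G_b. A small numeric sketch under assumed sizes:

// Assumed: I_w = 10, _G_b = 1, _stride = 1, _F_cb = 16, F_c_left = 3
// _C_ib_left        = F_c_left * _G_b   = 3
// input_col_stride  = I_w * _C_ib_left  = 30   (floats per input row of the leftover slab)
// full-block stride = I_w * _F_cb * _G_b = 160 (what the blocked loop above uses)
// row j of the leftover slab therefore starts at I_row_full + j * 30 for stride 1.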
+ // I_x = I_x + (int)(j * _stride) * () + if constexpr (op_type == OP_UPSAMPLE) + { + I_row_bot = + I_row_full + (O_h * _stride) * (I_w * _F_cb * _G_b); + } + else + { + I_row_bot = + I_row_full + (O_h * _stride) * (I_w * _C_ib_left /*_F_cb * _G_b*/); + } + ScalarT const *F_row_bot = F_channel_block_input + 0; + AccumT *O_row_bot = O_row_full + O_h * (O_w_w_pad * _G_b * _K_b); // ScalarT --> AccumT + + kernel_bottom_rem( + first, + F_h, + F_w, + I_w * _C_ib_left, + b_pad_el, + pad_bottom, + W_full_index, + l_pad_el, + pad_left, + O_w_w_pad, + O_w_full, + O_w_left, + r_pad_el, + pad_right, + I_row_bot, + F_row_bot, + O_row_bot, + F_c_left); + } + } + } + + + } + + //@todo: add this back to the parallel loop + // loop over remaining groups and output channels + // assumes that only 1 will have a remainder + for (index_t g = G_full_idx; g < G ; g += G_left) + { + #if DEBUG == 1 + printf("G_left %d K_left %d \n", G_left, K_left); + printf("G_full_idx %d K_full_idx %d \n", G_full_idx, K_full_idx); + #endif + ScalarT const *I_group; + if constexpr (op_type == OP_UPSAMPLE && _stride == std::numeric_limits::max()) + { + I_group = I_buf + g * (F_c * 1 * 1 ); + // I_group = I_buf + g * (F_c * 1 * 1 * _G_b); + } + else + { + I_group = I_buf + g * (F_c * I_h * I_w); + // I_group = I_buf + g * (F_c * I_h * I_w * _G_b); + } + ScalarT *O_group = O_buf + g * (K * O_hxO_w); + // ScalarT *O_group = O_buf + g * (K * O_hxO_w * _G_b); + // if leaky relu, the weight pointer does not change with the group id + + ScalarT const *F_group; + if constexpr ((op_type == OP_LEAKY_RELU) || (op_type == OP_MUL)) + { + F_group = F_buf; + } + else + { + F_group = F_buf + g * (K * F_c * F_h * F_w); + // F_group = F_buf + g * (K * F_c * F_h * F_w * _G_b); + } + + // resuse O_group as a uint32_t array + //This loop has 1 iteration + // k'th element in K_full + for (index_t k = K_full_idx; k < K ; k += K_left) + { + ScalarT const *I_channel_block_output = + I_group + 0; + ScalarT const *F_channel_block_output = + // F_group + k * (F_c * F_h * F_w * _G_b * _K_b); + F_group + k *(F_c * F_h * F_w * _G_b); + ScalarT *O_channel_block_output = + // O_group + k * (O_hxO_w * _G_b * _K_b); + O_group + k *(O_hxO_w * _G_b); + + //************************************************************ + // Loop over input channel reduction (multiple of blocking size) + for (index_t i = 0; i < (F_c_full / _F_cb); i++) + { + bool first = rewrite_output && (i == 0); + + + ScalarT const *I_channel_block_input = + I_channel_block_output + i * (I_h * I_w * _F_cb * G_left); + ScalarT const *F_channel_block_input = + F_channel_block_output + i * (F_h * F_w * _F_cb * G_left * K_left); + ScalarT *O_channel_block_input = + O_channel_block_output + 0; + + // Loops over spatial dimensions of output + + // Prologue with top padding + ScalarT const *I_row_top = I_channel_block_input; + ScalarT const *F_row_top = F_channel_block_input + 0; + AccumT *O_row_top = O_channel_block_input; // ScalarT --> AccumT + + rem_kernel_top( + first, + F_h, + F_w, + // I_w * _C_ib, + I_w * _C_ib_left_group, + t_pad_el, + pad_top, + W_full_index, + l_pad_el, + pad_left, + O_w_w_pad, + O_w_full, + O_w_left, + r_pad_el, + pad_right, + I_row_top, + F_row_top, + O_row_top, + G_left, + K_left); + + ScalarT const *I_row_full = + I_row_top + H_full_index * I_w * (_F_cb * G_left); + AccumT *O_row_full = + O_row_top + t_pad_el * O_w_w_pad * (G_left * K_left); // ScalarT --> AccumT + + // Steady State over rows + for (index_t j = 0; j < O_h; j += T_height) + { + ScalarT const *I_row; + // 
@todo cast index calculation as int and make stride a float value. + // I_x = I_x + (int)(j * _stride) * () + if constexpr (op_type == OP_UPSAMPLE) + { + I_row = I_row_full + (j / _stride) * (I_w * _F_cb * G_left); + } + else + { + I_row = I_row_full + (j * _stride) * (I_w * _F_cb * G_left); + } + ScalarT const *F_row = F_channel_block_input + 0; + AccumT *O_row = + O_row_full + j * (O_w_w_pad * G_left * K_left); // ScalarT --> AccumT + // Prologue with left padding + rem_kernel_left( + first, + F_h, + F_w, + I_w * _C_ib_left_group, + // I_w * _C_ib, + l_pad_el, + pad_left, + I_row, + F_row, + O_row, + G_left, + K_left, + 0, + 0); + + ScalarT const *I_col_full = + I_row + W_full_index * (_F_cb * G_left); + AccumT *O_col_full = O_row + l_pad_el * (G_left * K_left); // ScalarT --> AccumT + // Steady State with microkernel + for (index_t l = 0; l < O_w_full; l += _O_wb) + { + ScalarT const *I_col; + // @todo cast index calculation as int and make stride a float value. + // I_x = I_x + (int)(j * _stride) * () + if constexpr (op_type == OP_UPSAMPLE) + { + I_col = I_col_full + (l / _stride) * (_F_cb * G_left); + } + else + { + I_col = I_col_full + (l * _stride) * (_F_cb * G_left); + } + ScalarT const *F_col = F_row + 0; + AccumT *O_col = O_col_full + l * (G_left * K_left); // ScalarT --> AccumT + + rem_kernel( + first, + F_h, + F_w, + I_w * _C_ib_left_group, + // I_w * _C_ib, + I_col, + F_col, + O_col, + G_left, + K_left, + 0, + 0, + 0, + 0); + } + +#if DEBUG + printf(" end kernel\n"); +#endif + + // Epilogue for microkernel + right padding elements + ScalarT const *I_col_left; + if constexpr (op_type == OP_UPSAMPLE) + { + I_col_left = + I_col_full + (O_w_full / _stride) * (_F_cb * G_left); + } + else + { + I_col_left = + I_col_full + (O_w_full * _stride) * (_F_cb * G_left); + } + + ScalarT const *F_col_left = F_row + 0; + AccumT *O_col_left = O_col_full + O_w_full * (G_left * K_left); // ScalarT --> AccumT + +#if DEBUG + printf(" calling right output channel\n"); +#endif + rem_kernel_right( + first, + F_h, + F_w, + I_w * _C_ib_left_group, + // I_w * _C_ib, + O_w_left, + r_pad_el, + pad_right, + I_col_left, + F_col_left, + O_col_left, + G_left, + K_left, + 0, + 0); + } + // Epilogue with bottom padding + ScalarT const *I_row_bot; + // @todo cast index calculation as int and make stride a float value. 
+ // I_x = I_x + (int)(j * _stride) * () + if constexpr (op_type == OP_UPSAMPLE) + { + I_row_bot = + I_row_full + (O_h * _stride) * (I_w * _F_cb * G_left); + } + else + { + I_row_bot = + I_row_full + (O_h * _stride) * (I_w * _F_cb * G_left); + } + ScalarT const *F_row_bot = F_channel_block_input + 0; + AccumT *O_row_bot = O_row_full + O_h * (O_w_w_pad * G_left * K_left); // ScalarT --> AccumT + + rem_kernel_bottom( + first, + F_h, + F_w, + I_w * _C_ib_left_group, + // I_w * _C_ib, + b_pad_el, + pad_bottom, + W_full_index, + l_pad_el, + pad_left, + O_w_w_pad, + O_w_full, + O_w_left, + r_pad_el, + pad_right, + I_row_bot, + F_row_bot, + O_row_bot, + G_left, + K_left); + } + + // Loop over remaining channels + // _UNROLL defaults to 1 + // This loop should have 1 iteration + for (index_t i = F_c_full; i < F_c; i += F_c_left) + { + // printf("F_c_full: %d F_c: %d iter: %i \n", F_c_full, F_c, i); + bool first = rewrite_output && (i == 0); + + ScalarT const *I_channel_block_input = + I_channel_block_output + (i / _F_cb) * (I_h * I_w * _F_cb * G_left); + ScalarT const *F_channel_block_input = + F_channel_block_output + (i / _F_cb) * (F_h * F_w * _F_cb * G_left * K_left); + ScalarT *O_channel_block_input = + O_channel_block_output + 0; + + // Loops over spatial dimensions of output + + // Prologue with top padding + ScalarT const *I_row_top = I_channel_block_input; + ScalarT const *F_row_top = F_channel_block_input + 0; + AccumT *O_row_top = O_channel_block_input; // ScalarT --> AccumT + + rem_kernel_top_rem( + first, + F_h, + F_w, + I_w * F_c_left*G_left, + t_pad_el, + pad_top, + W_full_index, + l_pad_el, + pad_left, + O_w_w_pad, + O_w_full, + O_w_left, + r_pad_el, + pad_right, + I_row_top, + F_row_top, + O_row_top, + F_c_left, + G_left, + K_left); + + ScalarT const *I_row_full = + I_row_top + H_full_index * I_w * F_c_left * G_left; //(_F_cb * _G_b); + AccumT *O_row_full = + O_row_top + t_pad_el * O_w_w_pad * (G_left * K_left); // ScalarT --> AccumT + + // Steady State over rows + // The stride over input channels is the number of remaining channels + for (index_t j = 0; j < O_h; j += T_height) + { + ScalarT const *I_row; + // @todo cast index calculation as int and make stride a float value. + // I_x = I_x + (int)(j * _stride) * () + if constexpr (op_type == OP_UPSAMPLE) + { + I_row = I_row_full + (j / _stride) * (I_w * _F_cb * G_left); + } + else + { + I_row = I_row_full + (j * _stride) * (I_w * F_c_left * G_left /*_F_cb * _G_b*/); + } + ScalarT const *F_row = F_channel_block_input + 0; + AccumT *O_row = + O_row_full + j * (O_w_w_pad * G_left * K_left); // ScalarT --> AccumT + // Prologue with left padding + rem_kernel_left_rem( + first, + F_h, + F_w, + I_w * _C_ib_left, + l_pad_el, + pad_left, + I_row, + F_row, + O_row, + F_c_left, + G_left, + K_left, + 0, + 0); + + ScalarT const *I_col_full = + I_row + W_full_index * /*(_F_cb * _G_b)*/ F_c_left * G_left; + AccumT *O_col_full = O_row + l_pad_el * (G_left * K_left); // ScalarT --> AccumT + // Steady State with microkernel + for (index_t l = 0; l < O_w_full; l += _O_wb) + { + ScalarT const *I_col; + // @todo cast index calculation as int and make stride a float value. 
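(Editor's note, illustrative only, not part of the patch.) The leftover output-channel loop entered above (k starting at K_full_idx, stepping by K_left) runs a single iteration and addresses the filter and output with per-channel strides (F_c * F_h * F_w * _G_b and O_hxO_w * _G_b) rather than per-block strides. If I read the packed output layout correctly, that places the K_left channels directly after the fully blocked data; a numeric sketch under assumed sizes:

// Assumed: K = 20, _K_b = 16, _G_b = 1, O_hxO_w = 100
// K_full_idx = 16, K_left = 4  ->  one iteration with k = 16
// O_channel_block_output = O_group + k * (O_hxO_w * _G_b) = O_group + 1600,
// which equals the size of the one full K block (O_hxO_w * _G_b * _K_b = 100 * 16 = 1600),
// so the 4 leftover output channels are written immediately after the blocked output.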
+ // I_x = I_x + (int)(j * _stride) * () + if constexpr (op_type == OP_UPSAMPLE) + { + I_col = I_col_full + (l / _stride) * (_F_cb * G_left); + } + else + { + I_col = I_col_full + (l * _stride) * F_c_left * G_left /*(_F_cb * _G_b)*/; + } + ScalarT const *F_col = F_row + 0; + AccumT *O_col = O_col_full + l * (G_left * K_left); // ScalarT --> AccumT + + rem_kernel_rem( + first, + F_h, + F_w, + I_w * _C_ib_left, + I_col, + F_col, + O_col, + F_c_left, + G_left, + K_left, + 0, + 0, + 0, + 0); + } + +#if DEBUG + printf(" end kernel\n"); +#endif + + // Epilogue for microkernel + right padding elements + ScalarT const *I_col_left; + if constexpr (op_type == OP_UPSAMPLE) + { + I_col_left = + I_col_full + (O_w_full / _stride) * (_F_cb * G_left); + } + else + { + I_col_left = + I_col_full + (O_w_full * _stride) * F_c_left * G_left /*(_F_cb * _G_b)*/; + } + + ScalarT const *F_col_left = F_row + 0; + AccumT *O_col_left = O_col_full + O_w_full * (G_left * K_left); // ScalarT --> AccumT + +#if DEBUG + printf("calling right\n"); +#endif + rem_kernel_right_rem( + first, + F_h, + F_w, + I_w * _C_ib_left, + O_w_left, + r_pad_el, + pad_right, + I_col_left, + F_col_left, + O_col_left, + F_c_left, + G_left, + K_left, + 0, + 0); + } + // Epilogue with bottom padding + ScalarT const *I_row_bot; + // @todo cast index calculation as int and make stride a float value. + // I_x = I_x + (int)(j * _stride) * () + if constexpr (op_type == OP_UPSAMPLE) + { + I_row_bot = + I_row_full + (O_h * _stride) * (I_w * _F_cb * G_left); + } + else + { + I_row_bot = + I_row_full + (O_h * _stride) * (I_w * F_c_left * G_left /*_F_cb * _G_b*/); + } + ScalarT const *F_row_bot = F_channel_block_input + 0; + AccumT *O_row_bot = O_row_full + O_h * (O_w_w_pad * G_left * K_left); // ScalarT --> AccumT + + rem_kernel_bottom_rem( + first, + F_h, + F_w, + I_w * _C_ib_left, + b_pad_el, + pad_bottom, + W_full_index, + l_pad_el, + pad_left, + O_w_w_pad, + O_w_full, + O_w_left, + r_pad_el, + pad_right, + I_row_bot, + F_row_bot, + O_row_bot, + F_c_left, + G_left, + K_left); } } } diff --git a/include/small/gen_float_intrinsics_platform.py b/include/small/gen_float_intrinsics_platform.py new file mode 100644 index 0000000..d912027 --- /dev/null +++ b/include/small/gen_float_intrinsics_platform.py @@ -0,0 +1,234 @@ +# SMaLL, Software for Machine Learning Libraries +# Copyright 2023 by The SMaLL Contributors, All Rights Reserved. +# SPDX-License-Identifier: BSD-3-Clause +# +# For additional details (including references to third party source code and +# other files) see the LICENSE file or contact permission@sei.cmu.edu. See +# Contributors.txt for a full list of contributors. Created, in part, with +# funding and support from the U.S. Government (see Acknowledgments.txt file). +# DM23-0126 + +# Platforrm specific parameters +#make this is its own file, import based on runtime platform parameters +platform_name="arm" +W_ob = 6 +C_ob2 = 1 +C_ob1 = 16 +C_ob = C_ob2 * C_ob1 +SIMD = 8 +UNROLL = 1 # only affects conv kernel. +# UNROLL = SIMD # only affects conv kernel. 
+ +NUM_FMA = 2 +NUM_MAX = 1 +NUM_LOAD = 2 +NUM_STORE = 1 + +#ISA setup +simd_intrin_lib_path="arm_neon.h" +simd_reg_typename="float32x4_t" +simd_load_func="vld1q_f32" +simd_store_func="vst1q_f32" + +simd_fma_func="vfmaq_f32" +simd_max_func="vmaxq_f32" +simd_div_func="vdivq_f32" +simd_add_func="vaddq_f32" +#end ISA setup + + +#path to put the generated code +#get current directory +import os +cur_dir=os.getcwd() +path_to_gen_code=cur_dir+"/platforms/"+platform_name +print(path_to_gen_code) +#end platform specific parameters + +#There should be a generalization of the kernel generation based on the op_class, whether it's binary, unary, etc. +#Ideally, given the instruction sequence for a single operation and the op_class, the kernel can be generated. +matmul_alg="in register broadcast" + +with open('{:}/params_temp.h'.format(path_to_gen_code), 'w') as f: + f.write( + ''' +//**************************************************************************** +// SMaLL, Software for Machine Learning Libraries +// Copyright 2023 by The SMaLL Contributors, All Rights Reserved. +// SPDX-License-Identifier: BSD-3-Clause +// +// For additional details (including references to third party source code and +// other files) see the LICENSE file or contact permission@sei.cmu.edu. See +// Contributors.txt for a full list of contributors. Created, in part, with +// funding and support from the U.S. Government (see Acknowledgments.txt file). +// DM23-0126 +//**************************************************************************** + +#pragma once + +#define SMALL_HAS_FLOAT_SUPPORT 1 + +#define FLOAT_W_ob {W_ob} +#define FLOAT_C_ob {C_ob} +#define FLOAT_SIMD {SIMD} +#define FLOAT_UNROLL {UNROLL} +#define FLOAT_C_ib FLOAT_C_ob + +// not used for kernels, but used in throughput calculation. +#define NUM_FMA {NUM_FMA} +#define NUM_MAX {NUM_MAX} +#define NUM_LOAD {NUM_LOAD} +#define NUM_STORE {NUM_STORE} + '''.format(**locals()) + ) + +def redefine(name): + return ['#ifdef {n}\n#undef {n}\n#endif\n'.format(n=name)] + +with open('{:}/intrinsics-gen_temp.h'.format(path_to_gen_code), 'w') as f: + s = [''' +//**************************************************************************** +// SMaLL, Software for Machine Learning Libraries +// Copyright 2023 by The SMaLL Contributors, All Rights Reserved. +// SPDX-License-Identifier: BSD-3-Clause +// +// For additional details (including references to third party source code and +// other files) see the LICENSE file or contact permission@sei.cmu.edu. See +// Contributors.txt for a full list of contributors. Created, in part, with +// funding and support from the U.S. Government (see Acknowledgments.txt file). 
+// DM23-0126 +//**************************************************************************** + +#pragma once + +#define SMALL_HAS_FLOAT_SUPPORT 1 +'''] + s += ['#include <{}>'.format(simd_intrin_lib_path)] + + # define tile + # names of variables + s += redefine('FLOAT_DEF_TILE_C') + s += ['#define FLOAT_DEF_TILE_C(W_ob, C_ob)\\'] + c_tile = [["c_{}_{}".format(kk, jj) for jj in range(C_ob//SIMD)] for kk in range(W_ob)] + s += ['float c_tile[W_ob * C_ob];\\'] + for kk in range(W_ob): + for jj in range(C_ob//SIMD): + s += ['{} {};\\'.format(simd_reg_typename, c_tile[kk][jj])] + s += [''] + + + # zero tile + s += redefine('FLOAT_ZERO_TILE_C') + s += ['#define FLOAT_ZERO_TILE_C(W_ob, C_ob)\\'] + for kk in range(W_ob): + for jj in range(C_ob//SIMD): + s += ['{} = vdupq_n_f32(0);\\'.format(c_tile[kk][jj])] + s += [''] + + # load tile + s += redefine('FLOAT_LOAD_TILE_C') + s += ['#define FLOAT_LOAD_TILE_C(O, W_ob, C_ob)\\'] + for kk in range(W_ob): + for jj in range(C_ob//SIMD): + s += ['{c} = vld1q_f32(O + {k} * C_ob + {j} * SIMD);\\'.format(c=c_tile[kk][jj], k=kk, j=jj)] + s += [''] + + # load tile strided + s += redefine('FLOAT_LOAD_TILE_C_strided') + s += ['#define FLOAT_LOAD_TILE_C_strided(O, step, W_ob, C_ob)\\'] + for kk in range(W_ob): + for jj in range(C_ob//SIMD): + s += ['{c} = vld1q_f32(O + {k} * step + {j} * SIMD);\\'.format(c=c_tile[kk][jj], k=kk, j=jj)] + s += [''] + + # store tile + s += redefine('STORE_TILE_C') + s += ['#define STORE_TILE_C(O, W_ob, C_ob)\\'] + for kk in range(W_ob): + for jj in range(C_ob//SIMD): + s += ['vst1q_f32(O + {k} * C_ob + {j} * SIMD, {c});\\'.format(c=c_tile[kk][jj], k=kk, j=jj)] + s += [''] + + # convolution + s += redefine('CONV_TILE_C') + s += ['#define CONV_TILE_C(step, a, b, W_ob, C_ob)\\'] + s += ['float *aa = a;\\'] + s += ['float *bb = b;\\'] + # define a + for kk in range(W_ob): + s += ['{} a_{kk};\\'.format(simd_reg_typename, kk=kk)] + # define b [half as many] + for jj in range(C_ob1//SIMD): + s += ['{} b_{jj};\\'.format(simd_reg_typename, jj=jj)] + + for i in range(UNROLL//SIMD): + # load a SIMD width of a + # for kk in range(W_ob): + # s += ['a_{kk} = vld1q_f32(a + {kk} * step + {i} * SIMD);\\'.format(kk=kk, i=i)] + + for j in range(C_ob2): + + + for ii in range(SIMD): + # load B + # for jj in range(C_ob1//SIMD): + # # s += ['b_{jj} = vld1q_f32(b + ({i} * SIMD + {ii})*C_ob + ({j} * {C_ob1} + {jj})*SIMD);\\'.format(i=i, ii=ii, j=j, C_ob1=C_ob1//SIMD, jj=jj)] + # s += ['b_{jj} = vld1q_f32(bb + {ii}*C_ob + ({j} * {C_ob1} + {jj})*SIMD);\\'.format(ii=ii, j=j, C_ob1=C_ob1//SIMD, jj=jj)] + + # compute + for kk in range(W_ob): + if j == 0: # load a just before use + s += ['a_{kk} = vld1q_f32(a + {kk} * step + {i} * SIMD);\\'.format(kk=kk, i=i)] + + for jj in range(C_ob1//SIMD): + + if kk == 0: # load b just before use + # s += ['b_{jj} = vld1q_f32(b + ({i} * SIMD + {ii})*C_ob + ({j} * {C_ob1} + {jj})*SIMD);\\'.format(i=i, ii=ii, j=j, C_ob1=C_ob1//SIMD, jj=jj)] + s += ['b_{jj} = vld1q_f32(bb + {ii}*C_ob + ({j} * {C_ob1} + {jj})*SIMD);\\'.format(ii=ii, j=j, C_ob1=C_ob1//SIMD, jj=jj)] + + # s += ['{c} = vfmaq_laneq_f32({c}, b_{jj}, a_{kk}, {ii});\\'.format(c=c_tile[kk][j * (C_ob1//SIMD) + jj], kk=kk, jj=jj, ii=ii)] + # s += ['__asm__ volatile("fmla %[c].4s, %[b].4s, %[a].s[{ii}]\\n\\t" : [c] "+w"({c}) : [b] "w"(b_{jj}), [a] "w"(a_{kk}));'.format( + s += ['__asm__ volatile ("fmla %0.4s, %1.4s, %2.s[{ii}]" : "+w"({c}) : "w"(b_{jj}), "w"(a_{kk}));\\'.format( + # s += ['__asm__ ("fmla %0.4s, %1.4s, %2.s[{ii}]" : "+w"({c}) : "w"(b_{jj}), 
"w"(a_{kk}));\\'.format( + c=c_tile[kk][j * (C_ob1//SIMD) + jj], kk=kk, jj=jj, ii=ii + )] + + # s += ['{c} = fma_reg_broadcast({c}, b_{jj}, a_{kk}, {ii});\\'.format(c=c_tile[kk][j * (C_ob1//SIMD) + jj], kk=kk, jj=jj, ii=ii)] + s += ['bb += {};\\'.format(SIMD * C_ob)] + # s += ['aa += \\'] + + s += [''] + + # + + # max pooling / relu + s += redefine('MAX_TILE_C') + s += ['#define MAX_TILE_C(step, a, W_ob, C_ob)\\'] + # compute + s += ['{} av; \\'.format(simd_reg_typename)] + for kk in range(W_ob): + for jj in range(C_ob//SIMD): + s += ['av = vld1q_f32(a + {k} * step + {j} * SIMD);\\'.format(k=kk, j=jj)] + s += ['{c} = vmaxq_f32({c}, av);\\'.format(c=c_tile[kk][jj], k=kk, j=jj)] + s += [''] + + # depthwise + s += redefine('DW_TILE_C') + s += ['#define DW_TILE_C(step, a, b, W_ob, C_ob)\\'] + s += ['{} av; \\'.format(simd_reg_typename)] + # load B + for jj in range(C_ob//SIMD): + s += ['{} b_{j} = vld1q_f32(b + {j}*SIMD);\\'.format(simd_reg_typename, j=jj)] + # compute + for kk in range(W_ob): + for jj in range(C_ob//SIMD): + s += ['av = vld1q_f32(a + {k} * step + {j} * SIMD);\\'.format(k=kk, j=jj)] + s += ['{c} = vfmaq_f32({c}, av, b_{j});\\'.format(c=c_tile[kk][jj], j=jj)] + s += [''] + + + + # to fix backslash at end of file warning + s += [''] + + f.write('\n'.join(s)) diff --git a/include/small/interface_abstract.hpp b/include/small/interface_abstract.hpp index dbd1185..7db09bf 100644 --- a/include/small/interface_abstract.hpp +++ b/include/small/interface_abstract.hpp @@ -17,6 +17,8 @@ #include #include +#define SMALL_HAS_FLOAT_SUPPORT 1 + #if defined(SMALL_HAS_FLOAT_SUPPORT) #include /// @todo abstract_layer_float.hpp #endif @@ -155,11 +157,49 @@ void Conv2D( "Conv2D ERROR: stride unsupported."); } } + + //Generic handling for input channels that are not a multiple else { - throw std::invalid_argument( - "Conv2D ERROR: in_channels unsupported."); + // printf("Generic handling for input channels that are not a multiple\n"); + // printf("call parameters: "); + if (stride == 1) + { + detail::abstract_layer< + FloatBuffer, 1, FLOAT_C_ob, FLOAT_C_ib, + FLOAT_W_ob, 1, FLOAT_UNROLL, OP_CONV, 2, 1>( + 1, // Output Channel Grouping + output_channels, // Output Channels per group + input_channels, + input_height, input_width, + kernel_height, kernel_width, + t_pad, l_pad, r_pad, b_pad, + &input_buf, &filter_buf, &output_buf); + } + else if (stride == 2) + { + detail::abstract_layer< + FloatBuffer, 1, FLOAT_C_ob, FLOAT_C_ib, + FLOAT_W_ob, 2, FLOAT_UNROLL, OP_CONV, 2, 1>( + 1, // Output Channel Grouping + output_channels, // Output Channels per group + input_channels, + input_height, input_width, + kernel_height, kernel_width, + t_pad, l_pad, r_pad, b_pad, + &input_buf, &filter_buf, &output_buf); + } + else + { + throw std::invalid_argument( + "Conv2D ERROR: stride unsupported."); + } } + // else + // { + // throw std::invalid_argument( + // "Conv2D ERROR: in_channels unsupported."); + // } } #endif @@ -379,11 +419,11 @@ void PartialConv2D( /// @todo We need another specific case for input_channels==1 (maybe more) - else - { - throw std::invalid_argument( - "PartialConv2D ERROR: in_channels unsupported."); - } + // else + // { + // throw std::invalid_argument( + // "PartialConv2D ERROR: in_channels unsupported."); + // } } #endif @@ -525,8 +565,8 @@ void MaxPool2D( << ",I,O)\n"; #endif - if (input_channels % FLOAT_C_ib == 0) - { + // if (input_channels % FLOAT_C_ib == 0) + // { if (stride == 1) { detail::abstract_layer< @@ -556,12 +596,12 @@ void MaxPool2D( throw std::invalid_argument( "MaxPool2D 
ERROR: stride unsupported."); } - } - else - { - throw std::invalid_argument( - "MaxPool2D ERROR: in_channels unsupported."); - } + // } + // else + // { + // throw std::invalid_argument( + // "MaxPool2D ERROR: in_channels unsupported."); + // } } #endif @@ -707,11 +747,11 @@ void AveragePool2D( "AveragePool2D ERROR: stride unsupported."); } } - else - { - throw std::invalid_argument( - "AveragePool2D ERROR: in_channels unsupported."); - } + // else + // { + // throw std::invalid_argument( + // "AveragePool2D ERROR: in_channels unsupported."); + // } } #endif @@ -754,8 +794,8 @@ void DepthwiseConv2D( << ",img:" << input_height << "x" << input_width << ",I,F,O)\n"; #endif - if (input_channels % FLOAT_C_ib == 0) - { + // if (input_channels % FLOAT_C_ib == 0) + // { if (stride == 1) { detail::abstract_layer< @@ -786,12 +826,12 @@ void DepthwiseConv2D( throw std::invalid_argument( "DepthwiseConv2D ERROR: stride unsupported."); } - } - else - { - throw std::invalid_argument( - "DepthwiseConv2D ERROR: in_channels unsupported."); - } + // } + // else + // { + // throw std::invalid_argument( + // "DepthwiseConv2D ERROR: in_channels unsupported."); + // } } #endif @@ -1029,8 +1069,8 @@ void ReLUActivation(int input_channels, << ",I,O)\n"; #endif - if (input_channels % FLOAT_C_ib == 0) - { + // if (input_channels % FLOAT_C_ob == 0) + // { detail::abstract_layer< FloatBuffer, FLOAT_C_ob, 1, 1, FLOAT_W_ob, 1, 1, OP_RELU, 0, 1>( input_channels, // Output Channel Grouping @@ -1040,12 +1080,12 @@ void ReLUActivation(int input_channels, 1, 1, 0, 0, 0, 0, &input_buf, (FloatBuffer *)nullptr, &output_buf); - } - else - { - throw std::invalid_argument( - "ReLUActivation ERROR: in_channels unsupported."); - } + // } + // else + // { + // throw std::invalid_argument( + // "ReLUActivation ERROR: in_channels unsupported."); + // } } #endif diff --git a/include/small/intrinsics-gen_temp.h b/include/small/intrinsics-gen_temp.h new file mode 100644 index 0000000..8dfbe13 --- /dev/null +++ b/include/small/intrinsics-gen_temp.h @@ -0,0 +1,189 @@ + +//**************************************************************************** +// SMaLL, Software for Machine Learning Libraries +// Copyright 2023 by The SMaLL Contributors, All Rights Reserved. +// SPDX-License-Identifier: BSD-3-Clause +// +// For additional details (including references to third party source code and +// other files) see the LICENSE file or contact permission@sei.cmu.edu. See +// Contributors.txt for a full list of contributors. Created, in part, with +// funding and support from the U.S. Government (see Acknowledgments.txt file). 
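(Editor's note, illustrative only, not part of the patch.) In the CONV_TILE_C body emitted by gen_float_intrinsics_platform.py earlier in this patch, the FMA is written as inline assembly; it performs the same lane-broadcast multiply-accumulate as the vfmaq_laneq_f32 call that is left commented out in the script. A minimal, self-contained sketch of the two equivalent forms (AArch64; the lane is fixed to 0 here because the intrinsic requires a compile-time constant, whereas the generator hard-codes a different lane per unrolled step):

#include <arm_neon.h>

// c += b * a[0]: broadcast one lane of a, then fused multiply-accumulate into c.
static inline float32x4_t fma_lane0_intrinsic(float32x4_t c, float32x4_t b, float32x4_t a)
{
    return vfmaq_laneq_f32(c, b, a, 0);   // intrinsic form (commented out in the generator)
}

static inline float32x4_t fma_lane0_asm(float32x4_t c, float32x4_t b, float32x4_t a)
{
    // inline-asm form the generator actually emits, one instruction per lane index
    __asm__ volatile("fmla %0.4s, %1.4s, %2.s[0]" : "+w"(c) : "w"(b), "w"(a));
    return c;
}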
+// DM23-0126 +//**************************************************************************** + +#pragma once + +#define SMALL_HAS_FLOAT_SUPPORT 1 + +#include +#ifdef FLOAT_DEF_TILE_C +#undef FLOAT_DEF_TILE_C +#endif + +#define FLOAT_DEF_TILE_C(W_ob, C_ob)\ +float c_tile[W_ob * C_ob];\ +float32x4_t c_0_0;\ +float32x4_t c_0_1;\ +float32x4_t c_1_0;\ +float32x4_t c_1_1;\ +float32x4_t c_2_0;\ +float32x4_t c_2_1;\ +float32x4_t c_3_0;\ +float32x4_t c_3_1;\ +float32x4_t c_4_0;\ +float32x4_t c_4_1;\ +float32x4_t c_5_0;\ +float32x4_t c_5_1;\ + +#ifdef FLOAT_ZERO_TILE_C +#undef FLOAT_ZERO_TILE_C +#endif + +#define FLOAT_ZERO_TILE_C(W_ob, C_ob)\ +c_0_0 = vdupq_n_f32(0);\ +c_0_1 = vdupq_n_f32(0);\ +c_1_0 = vdupq_n_f32(0);\ +c_1_1 = vdupq_n_f32(0);\ +c_2_0 = vdupq_n_f32(0);\ +c_2_1 = vdupq_n_f32(0);\ +c_3_0 = vdupq_n_f32(0);\ +c_3_1 = vdupq_n_f32(0);\ +c_4_0 = vdupq_n_f32(0);\ +c_4_1 = vdupq_n_f32(0);\ +c_5_0 = vdupq_n_f32(0);\ +c_5_1 = vdupq_n_f32(0);\ + +#ifdef FLOAT_LOAD_TILE_C +#undef FLOAT_LOAD_TILE_C +#endif + +#define FLOAT_LOAD_TILE_C(O, W_ob, C_ob)\ +c_0_0 = vld1q_f32(O + 0 * C_ob + 0 * SIMD);\ +c_0_1 = vld1q_f32(O + 0 * C_ob + 1 * SIMD);\ +c_1_0 = vld1q_f32(O + 1 * C_ob + 0 * SIMD);\ +c_1_1 = vld1q_f32(O + 1 * C_ob + 1 * SIMD);\ +c_2_0 = vld1q_f32(O + 2 * C_ob + 0 * SIMD);\ +c_2_1 = vld1q_f32(O + 2 * C_ob + 1 * SIMD);\ +c_3_0 = vld1q_f32(O + 3 * C_ob + 0 * SIMD);\ +c_3_1 = vld1q_f32(O + 3 * C_ob + 1 * SIMD);\ +c_4_0 = vld1q_f32(O + 4 * C_ob + 0 * SIMD);\ +c_4_1 = vld1q_f32(O + 4 * C_ob + 1 * SIMD);\ +c_5_0 = vld1q_f32(O + 5 * C_ob + 0 * SIMD);\ +c_5_1 = vld1q_f32(O + 5 * C_ob + 1 * SIMD);\ + +#ifdef FLOAT_LOAD_TILE_C_strided +#undef FLOAT_LOAD_TILE_C_strided +#endif + +#define FLOAT_LOAD_TILE_C_strided(O, step, W_ob, C_ob)\ +c_0_0 = vld1q_f32(O + 0 * step + 0 * SIMD);\ +c_0_1 = vld1q_f32(O + 0 * step + 1 * SIMD);\ +c_1_0 = vld1q_f32(O + 1 * step + 0 * SIMD);\ +c_1_1 = vld1q_f32(O + 1 * step + 1 * SIMD);\ +c_2_0 = vld1q_f32(O + 2 * step + 0 * SIMD);\ +c_2_1 = vld1q_f32(O + 2 * step + 1 * SIMD);\ +c_3_0 = vld1q_f32(O + 3 * step + 0 * SIMD);\ +c_3_1 = vld1q_f32(O + 3 * step + 1 * SIMD);\ +c_4_0 = vld1q_f32(O + 4 * step + 0 * SIMD);\ +c_4_1 = vld1q_f32(O + 4 * step + 1 * SIMD);\ +c_5_0 = vld1q_f32(O + 5 * step + 0 * SIMD);\ +c_5_1 = vld1q_f32(O + 5 * step + 1 * SIMD);\ + +#ifdef STORE_TILE_C +#undef STORE_TILE_C +#endif + +#define STORE_TILE_C(O, W_ob, C_ob)\ +vst1q_f32(O + 0 * C_ob + 0 * SIMD, c_0_0);\ +vst1q_f32(O + 0 * C_ob + 1 * SIMD, c_0_1);\ +vst1q_f32(O + 1 * C_ob + 0 * SIMD, c_1_0);\ +vst1q_f32(O + 1 * C_ob + 1 * SIMD, c_1_1);\ +vst1q_f32(O + 2 * C_ob + 0 * SIMD, c_2_0);\ +vst1q_f32(O + 2 * C_ob + 1 * SIMD, c_2_1);\ +vst1q_f32(O + 3 * C_ob + 0 * SIMD, c_3_0);\ +vst1q_f32(O + 3 * C_ob + 1 * SIMD, c_3_1);\ +vst1q_f32(O + 4 * C_ob + 0 * SIMD, c_4_0);\ +vst1q_f32(O + 4 * C_ob + 1 * SIMD, c_4_1);\ +vst1q_f32(O + 5 * C_ob + 0 * SIMD, c_5_0);\ +vst1q_f32(O + 5 * C_ob + 1 * SIMD, c_5_1);\ + +#ifdef CONV_TILE_C +#undef CONV_TILE_C +#endif + +#define CONV_TILE_C(step, a, b, W_ob, C_ob)\ +float *aa = a;\ +float *bb = b;\ +float32x4_t a_0;\ +float32x4_t a_1;\ +float32x4_t a_2;\ +float32x4_t a_3;\ +float32x4_t a_4;\ +float32x4_t a_5;\ +float32x4_t b_0;\ +float32x4_t b_1;\ + +#ifdef MAX_TILE_C +#undef MAX_TILE_C +#endif + +#define MAX_TILE_C(step, a, W_ob, C_ob)\ +float32x4_t av; \ +av = vld1q_f32(a + 0 * step + 0 * SIMD);\ +c_0_0 = vmaxq_f32(c_0_0, av);\ +av = vld1q_f32(a + 0 * step + 1 * SIMD);\ +c_0_1 = vmaxq_f32(c_0_1, av);\ +av = vld1q_f32(a + 1 * step + 0 * SIMD);\ +c_1_0 = vmaxq_f32(c_1_0, av);\ 
+av = vld1q_f32(a + 1 * step + 1 * SIMD);\ +c_1_1 = vmaxq_f32(c_1_1, av);\ +av = vld1q_f32(a + 2 * step + 0 * SIMD);\ +c_2_0 = vmaxq_f32(c_2_0, av);\ +av = vld1q_f32(a + 2 * step + 1 * SIMD);\ +c_2_1 = vmaxq_f32(c_2_1, av);\ +av = vld1q_f32(a + 3 * step + 0 * SIMD);\ +c_3_0 = vmaxq_f32(c_3_0, av);\ +av = vld1q_f32(a + 3 * step + 1 * SIMD);\ +c_3_1 = vmaxq_f32(c_3_1, av);\ +av = vld1q_f32(a + 4 * step + 0 * SIMD);\ +c_4_0 = vmaxq_f32(c_4_0, av);\ +av = vld1q_f32(a + 4 * step + 1 * SIMD);\ +c_4_1 = vmaxq_f32(c_4_1, av);\ +av = vld1q_f32(a + 5 * step + 0 * SIMD);\ +c_5_0 = vmaxq_f32(c_5_0, av);\ +av = vld1q_f32(a + 5 * step + 1 * SIMD);\ +c_5_1 = vmaxq_f32(c_5_1, av);\ + +#ifdef DW_TILE_C +#undef DW_TILE_C +#endif + +#define DW_TILE_C(step, a, b, W_ob, C_ob)\ +float32x4_t av; \ +float32x4_t b_0 = vld1q_f32(b + 0*SIMD);\ +float32x4_t b_1 = vld1q_f32(b + 1*SIMD);\ +av = vld1q_f32(a + 0 * step + 0 * SIMD);\ +c_0_0 = vfmaq_f32(c_0_0, av, b_0);\ +av = vld1q_f32(a + 0 * step + 1 * SIMD);\ +c_0_1 = vfmaq_f32(c_0_1, av, b_1);\ +av = vld1q_f32(a + 1 * step + 0 * SIMD);\ +c_1_0 = vfmaq_f32(c_1_0, av, b_0);\ +av = vld1q_f32(a + 1 * step + 1 * SIMD);\ +c_1_1 = vfmaq_f32(c_1_1, av, b_1);\ +av = vld1q_f32(a + 2 * step + 0 * SIMD);\ +c_2_0 = vfmaq_f32(c_2_0, av, b_0);\ +av = vld1q_f32(a + 2 * step + 1 * SIMD);\ +c_2_1 = vfmaq_f32(c_2_1, av, b_1);\ +av = vld1q_f32(a + 3 * step + 0 * SIMD);\ +c_3_0 = vfmaq_f32(c_3_0, av, b_0);\ +av = vld1q_f32(a + 3 * step + 1 * SIMD);\ +c_3_1 = vfmaq_f32(c_3_1, av, b_1);\ +av = vld1q_f32(a + 4 * step + 0 * SIMD);\ +c_4_0 = vfmaq_f32(c_4_0, av, b_0);\ +av = vld1q_f32(a + 4 * step + 1 * SIMD);\ +c_4_1 = vfmaq_f32(c_4_1, av, b_1);\ +av = vld1q_f32(a + 5 * step + 0 * SIMD);\ +c_5_0 = vfmaq_f32(c_5_0, av, b_0);\ +av = vld1q_f32(a + 5 * step + 1 * SIMD);\ +c_5_1 = vfmaq_f32(c_5_1, av, b_1);\ + diff --git a/include/small/params_temp.h b/include/small/params_temp.h new file mode 100644 index 0000000..bc7f745 --- /dev/null +++ b/include/small/params_temp.h @@ -0,0 +1,29 @@ + +//**************************************************************************** +// SMaLL, Software for Machine Learning Libraries +// Copyright 2023 by The SMaLL Contributors, All Rights Reserved. +// SPDX-License-Identifier: BSD-3-Clause +// +// For additional details (including references to third party source code and +// other files) see the LICENSE file or contact permission@sei.cmu.edu. See +// Contributors.txt for a full list of contributors. Created, in part, with +// funding and support from the U.S. Government (see Acknowledgments.txt file). +// DM23-0126 +//**************************************************************************** + +#pragma once + +#define SMALL_HAS_FLOAT_SUPPORT 1 + +#define FLOAT_W_ob 6 +#define FLOAT_C_ob 16 +#define FLOAT_SIMD 8 +#define FLOAT_UNROLL 1 +#define FLOAT_C_ib FLOAT_C_ob + +// not used for kernels, but used in throughput calculation. +#define NUM_FMA 2 +#define NUM_MAX 1 +#define NUM_LOAD 2 +#define NUM_STORE 1 + \ No newline at end of file diff --git a/include/small/platforms/arm/intrinsics-gen_temp.h b/include/small/platforms/arm/intrinsics-gen_temp.h new file mode 100644 index 0000000..8dfbe13 --- /dev/null +++ b/include/small/platforms/arm/intrinsics-gen_temp.h @@ -0,0 +1,189 @@ + +//**************************************************************************** +// SMaLL, Software for Machine Learning Libraries +// Copyright 2023 by The SMaLL Contributors, All Rights Reserved. 
+// SPDX-License-Identifier: BSD-3-Clause +// +// For additional details (including references to third party source code and +// other files) see the LICENSE file or contact permission@sei.cmu.edu. See +// Contributors.txt for a full list of contributors. Created, in part, with +// funding and support from the U.S. Government (see Acknowledgments.txt file). +// DM23-0126 +//**************************************************************************** + +#pragma once + +#define SMALL_HAS_FLOAT_SUPPORT 1 + +#include +#ifdef FLOAT_DEF_TILE_C +#undef FLOAT_DEF_TILE_C +#endif + +#define FLOAT_DEF_TILE_C(W_ob, C_ob)\ +float c_tile[W_ob * C_ob];\ +float32x4_t c_0_0;\ +float32x4_t c_0_1;\ +float32x4_t c_1_0;\ +float32x4_t c_1_1;\ +float32x4_t c_2_0;\ +float32x4_t c_2_1;\ +float32x4_t c_3_0;\ +float32x4_t c_3_1;\ +float32x4_t c_4_0;\ +float32x4_t c_4_1;\ +float32x4_t c_5_0;\ +float32x4_t c_5_1;\ + +#ifdef FLOAT_ZERO_TILE_C +#undef FLOAT_ZERO_TILE_C +#endif + +#define FLOAT_ZERO_TILE_C(W_ob, C_ob)\ +c_0_0 = vdupq_n_f32(0);\ +c_0_1 = vdupq_n_f32(0);\ +c_1_0 = vdupq_n_f32(0);\ +c_1_1 = vdupq_n_f32(0);\ +c_2_0 = vdupq_n_f32(0);\ +c_2_1 = vdupq_n_f32(0);\ +c_3_0 = vdupq_n_f32(0);\ +c_3_1 = vdupq_n_f32(0);\ +c_4_0 = vdupq_n_f32(0);\ +c_4_1 = vdupq_n_f32(0);\ +c_5_0 = vdupq_n_f32(0);\ +c_5_1 = vdupq_n_f32(0);\ + +#ifdef FLOAT_LOAD_TILE_C +#undef FLOAT_LOAD_TILE_C +#endif + +#define FLOAT_LOAD_TILE_C(O, W_ob, C_ob)\ +c_0_0 = vld1q_f32(O + 0 * C_ob + 0 * SIMD);\ +c_0_1 = vld1q_f32(O + 0 * C_ob + 1 * SIMD);\ +c_1_0 = vld1q_f32(O + 1 * C_ob + 0 * SIMD);\ +c_1_1 = vld1q_f32(O + 1 * C_ob + 1 * SIMD);\ +c_2_0 = vld1q_f32(O + 2 * C_ob + 0 * SIMD);\ +c_2_1 = vld1q_f32(O + 2 * C_ob + 1 * SIMD);\ +c_3_0 = vld1q_f32(O + 3 * C_ob + 0 * SIMD);\ +c_3_1 = vld1q_f32(O + 3 * C_ob + 1 * SIMD);\ +c_4_0 = vld1q_f32(O + 4 * C_ob + 0 * SIMD);\ +c_4_1 = vld1q_f32(O + 4 * C_ob + 1 * SIMD);\ +c_5_0 = vld1q_f32(O + 5 * C_ob + 0 * SIMD);\ +c_5_1 = vld1q_f32(O + 5 * C_ob + 1 * SIMD);\ + +#ifdef FLOAT_LOAD_TILE_C_strided +#undef FLOAT_LOAD_TILE_C_strided +#endif + +#define FLOAT_LOAD_TILE_C_strided(O, step, W_ob, C_ob)\ +c_0_0 = vld1q_f32(O + 0 * step + 0 * SIMD);\ +c_0_1 = vld1q_f32(O + 0 * step + 1 * SIMD);\ +c_1_0 = vld1q_f32(O + 1 * step + 0 * SIMD);\ +c_1_1 = vld1q_f32(O + 1 * step + 1 * SIMD);\ +c_2_0 = vld1q_f32(O + 2 * step + 0 * SIMD);\ +c_2_1 = vld1q_f32(O + 2 * step + 1 * SIMD);\ +c_3_0 = vld1q_f32(O + 3 * step + 0 * SIMD);\ +c_3_1 = vld1q_f32(O + 3 * step + 1 * SIMD);\ +c_4_0 = vld1q_f32(O + 4 * step + 0 * SIMD);\ +c_4_1 = vld1q_f32(O + 4 * step + 1 * SIMD);\ +c_5_0 = vld1q_f32(O + 5 * step + 0 * SIMD);\ +c_5_1 = vld1q_f32(O + 5 * step + 1 * SIMD);\ + +#ifdef STORE_TILE_C +#undef STORE_TILE_C +#endif + +#define STORE_TILE_C(O, W_ob, C_ob)\ +vst1q_f32(O + 0 * C_ob + 0 * SIMD, c_0_0);\ +vst1q_f32(O + 0 * C_ob + 1 * SIMD, c_0_1);\ +vst1q_f32(O + 1 * C_ob + 0 * SIMD, c_1_0);\ +vst1q_f32(O + 1 * C_ob + 1 * SIMD, c_1_1);\ +vst1q_f32(O + 2 * C_ob + 0 * SIMD, c_2_0);\ +vst1q_f32(O + 2 * C_ob + 1 * SIMD, c_2_1);\ +vst1q_f32(O + 3 * C_ob + 0 * SIMD, c_3_0);\ +vst1q_f32(O + 3 * C_ob + 1 * SIMD, c_3_1);\ +vst1q_f32(O + 4 * C_ob + 0 * SIMD, c_4_0);\ +vst1q_f32(O + 4 * C_ob + 1 * SIMD, c_4_1);\ +vst1q_f32(O + 5 * C_ob + 0 * SIMD, c_5_0);\ +vst1q_f32(O + 5 * C_ob + 1 * SIMD, c_5_1);\ + +#ifdef CONV_TILE_C +#undef CONV_TILE_C +#endif + +#define CONV_TILE_C(step, a, b, W_ob, C_ob)\ +float *aa = a;\ +float *bb = b;\ +float32x4_t a_0;\ +float32x4_t a_1;\ +float32x4_t a_2;\ +float32x4_t a_3;\ +float32x4_t a_4;\ +float32x4_t a_5;\ +float32x4_t 
b_0;\ +float32x4_t b_1;\ + +#ifdef MAX_TILE_C +#undef MAX_TILE_C +#endif + +#define MAX_TILE_C(step, a, W_ob, C_ob)\ +float32x4_t av; \ +av = vld1q_f32(a + 0 * step + 0 * SIMD);\ +c_0_0 = vmaxq_f32(c_0_0, av);\ +av = vld1q_f32(a + 0 * step + 1 * SIMD);\ +c_0_1 = vmaxq_f32(c_0_1, av);\ +av = vld1q_f32(a + 1 * step + 0 * SIMD);\ +c_1_0 = vmaxq_f32(c_1_0, av);\ +av = vld1q_f32(a + 1 * step + 1 * SIMD);\ +c_1_1 = vmaxq_f32(c_1_1, av);\ +av = vld1q_f32(a + 2 * step + 0 * SIMD);\ +c_2_0 = vmaxq_f32(c_2_0, av);\ +av = vld1q_f32(a + 2 * step + 1 * SIMD);\ +c_2_1 = vmaxq_f32(c_2_1, av);\ +av = vld1q_f32(a + 3 * step + 0 * SIMD);\ +c_3_0 = vmaxq_f32(c_3_0, av);\ +av = vld1q_f32(a + 3 * step + 1 * SIMD);\ +c_3_1 = vmaxq_f32(c_3_1, av);\ +av = vld1q_f32(a + 4 * step + 0 * SIMD);\ +c_4_0 = vmaxq_f32(c_4_0, av);\ +av = vld1q_f32(a + 4 * step + 1 * SIMD);\ +c_4_1 = vmaxq_f32(c_4_1, av);\ +av = vld1q_f32(a + 5 * step + 0 * SIMD);\ +c_5_0 = vmaxq_f32(c_5_0, av);\ +av = vld1q_f32(a + 5 * step + 1 * SIMD);\ +c_5_1 = vmaxq_f32(c_5_1, av);\ + +#ifdef DW_TILE_C +#undef DW_TILE_C +#endif + +#define DW_TILE_C(step, a, b, W_ob, C_ob)\ +float32x4_t av; \ +float32x4_t b_0 = vld1q_f32(b + 0*SIMD);\ +float32x4_t b_1 = vld1q_f32(b + 1*SIMD);\ +av = vld1q_f32(a + 0 * step + 0 * SIMD);\ +c_0_0 = vfmaq_f32(c_0_0, av, b_0);\ +av = vld1q_f32(a + 0 * step + 1 * SIMD);\ +c_0_1 = vfmaq_f32(c_0_1, av, b_1);\ +av = vld1q_f32(a + 1 * step + 0 * SIMD);\ +c_1_0 = vfmaq_f32(c_1_0, av, b_0);\ +av = vld1q_f32(a + 1 * step + 1 * SIMD);\ +c_1_1 = vfmaq_f32(c_1_1, av, b_1);\ +av = vld1q_f32(a + 2 * step + 0 * SIMD);\ +c_2_0 = vfmaq_f32(c_2_0, av, b_0);\ +av = vld1q_f32(a + 2 * step + 1 * SIMD);\ +c_2_1 = vfmaq_f32(c_2_1, av, b_1);\ +av = vld1q_f32(a + 3 * step + 0 * SIMD);\ +c_3_0 = vfmaq_f32(c_3_0, av, b_0);\ +av = vld1q_f32(a + 3 * step + 1 * SIMD);\ +c_3_1 = vfmaq_f32(c_3_1, av, b_1);\ +av = vld1q_f32(a + 4 * step + 0 * SIMD);\ +c_4_0 = vfmaq_f32(c_4_0, av, b_0);\ +av = vld1q_f32(a + 4 * step + 1 * SIMD);\ +c_4_1 = vfmaq_f32(c_4_1, av, b_1);\ +av = vld1q_f32(a + 5 * step + 0 * SIMD);\ +c_5_0 = vfmaq_f32(c_5_0, av, b_0);\ +av = vld1q_f32(a + 5 * step + 1 * SIMD);\ +c_5_1 = vfmaq_f32(c_5_1, av, b_1);\ + diff --git a/include/small/platforms/arm/intrinsics.h b/include/small/platforms/arm/intrinsics.h index e01a4ba..2145daf 100644 --- a/include/small/platforms/arm/intrinsics.h +++ b/include/small/platforms/arm/intrinsics.h @@ -930,6 +930,31 @@ if constexpr(_C_ob == 1 && _C_ob != FLOAT_SIMD_EPILOGUE)\ }\ } +#define FLOAT_REDUCE_REM_CHANNEL_END_C(O_w_left, _C_ob) \ + if (_C_ob == 1 && _C_ob != FLOAT_SIMD_EPILOGUE) \ + { \ + float c_tile_array[FLOAT_C_ob]; \ + for (uint32_t kk = 0; kk < O_w_left; kk++) \ + { \ + float32x4_t *c_channel_v = c_tile + kk * (FLOAT_C_ob / FLOAT_SIMD); \ + c_channel_v[0] = vaddq_f32(c_channel_v[0], c_channel_v[1]); \ + c_channel_v[2] = vaddq_f32(c_channel_v[2], c_channel_v[3]); \ + c_channel_v[0] = vaddq_f32(c_channel_v[0], c_channel_v[2]); \ + \ + vst1q_f32(c_tile_array, c_channel_v[0]); \ + for (uint32_t jj = 1; jj < FLOAT_SIMD; jj++) \ + { \ + c_tile_array[0] += c_tile_array[jj]; \ + c_tile_array[jj] = 0; \ + } \ + \ + c_channel_v[0] = vld1q_f32(c_tile_array); \ + c_channel_v[1] = vdupq_n_f32(0.0); \ + c_channel_v[2] = vdupq_n_f32(0.0); \ + c_channel_v[3] = vdupq_n_f32(0.0); \ + } \ + } + //**************************************************************************** // AVG Pooling //**************************************************************************** diff --git a/include/small/platforms/arm/params.h 
b/include/small/platforms/arm/params.h index cdaf6b5..d63c9d5 100644 --- a/include/small/platforms/arm/params.h +++ b/include/small/platforms/arm/params.h @@ -15,16 +15,16 @@ /// @todo move this include to intrinsics.hpp? #include -#define SMALL_HAS_FLOAT_SUPPORT 1 +#define SMALL_HAS_FLOAT_SUPPORT 1 -#define FLOAT_W_ob 6 -#define FLOAT_C_ob 16 -#define FLOAT_SIMD 4 +#define FLOAT_W_ob 6 +#define FLOAT_C_ob 16 +#define FLOAT_SIMD 4 #define FLOAT_UNROLL 4 -#define FLOAT_C_ib FLOAT_C_ob +#define FLOAT_C_ib FLOAT_C_ob // not used for kernels, but used in throughput calculation. #define FLOAT_NUM_FMA 2 #define FLOAT_NUM_MAX 1 #define FLOAT_NUM_LOAD 2 -#define FLOAT_NUM_STORE 1 +#define FLOAT_NUM_STORE 1 \ No newline at end of file diff --git a/include/small/platforms/arm/params_temp.h b/include/small/platforms/arm/params_temp.h new file mode 100644 index 0000000..bc7f745 --- /dev/null +++ b/include/small/platforms/arm/params_temp.h @@ -0,0 +1,29 @@ + +//**************************************************************************** +// SMaLL, Software for Machine Learning Libraries +// Copyright 2023 by The SMaLL Contributors, All Rights Reserved. +// SPDX-License-Identifier: BSD-3-Clause +// +// For additional details (including references to third party source code and +// other files) see the LICENSE file or contact permission@sei.cmu.edu. See +// Contributors.txt for a full list of contributors. Created, in part, with +// funding and support from the U.S. Government (see Acknowledgments.txt file). +// DM23-0126 +//**************************************************************************** + +#pragma once + +#define SMALL_HAS_FLOAT_SUPPORT 1 + +#define FLOAT_W_ob 6 +#define FLOAT_C_ob 16 +#define FLOAT_SIMD 8 +#define FLOAT_UNROLL 1 +#define FLOAT_C_ib FLOAT_C_ob + +// not used for kernels, but used in throughput calculation. +#define NUM_FMA 2 +#define NUM_MAX 1 +#define NUM_LOAD 2 +#define NUM_STORE 1 + \ No newline at end of file diff --git a/include/small/platforms/reference/intrinsics_float.h b/include/small/platforms/reference/intrinsics_float.h index fe03cef..5bf83f4 100644 --- a/include/small/platforms/reference/intrinsics_float.h +++ b/include/small/platforms/reference/intrinsics_float.h @@ -441,6 +441,21 @@ for(uint32_t u =0 ; u < _UNROLL; u++)\ } \ } +#define FLOAT_REDUCE_REM_CHANNEL_END_C(O_w_left, _C_ob) \ + if(_C_ob == 1 && _C_ob != FLOAT_SIMD_EPILOGUE) \ + { \ + float c_tile_array[FLOAT_C_ob]; \ + for (uint32_t kk = 0; kk < O_w_left; kk++) \ + { \ + float *c_channel_v = c_tile + kk * (FLOAT_C_ob); \ + for (uint32_t jj = 1; jj < FLOAT_C_ob; jj++) \ + { \ + c_channel_v[0] += c_channel_v[jj]; \ + c_channel_v[jj] = 0; \ + } \ + } \ + } + //**************************************************************************** // Reduce kernels?? 
//**************************************************************************** diff --git a/include/small/platforms/zen2/intrinsics.h b/include/small/platforms/zen2/intrinsics.h index 23c678d..101ea55 100644 --- a/include/small/platforms/zen2/intrinsics.h +++ b/include/small/platforms/zen2/intrinsics.h @@ -617,6 +617,22 @@ for(uint32_t u =0 ; u < _UNROLL; u++)\ } \ } \ } + +#define FLOAT_REDUCE_REM_CHANNEL_END_C(O_w_left, _C_ob) \ + if (_C_ob == 1 && _C_ob != FLOAT_SIMD_EPILOGUE) \ + { \ + float c_tile_array[FLOAT_C_ob]; \ + for (uint32_t kk = 0; kk < O_w_left; kk++) \ + { \ + float *c_channel_v = c_tile + kk * (FLOAT_C_ob); \ + for (uint32_t jj = 1; jj < FLOAT_C_ob; jj++) \ + { \ + c_channel_v[0] += c_channel_v[jj]; \ + c_channel_v[jj] = 0; \ + } \ + } \ + } + //**************************************************************************** // FMA unused? //****************************************************************************
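(Editor's note, illustrative only, not part of the patch.) The scalar FLOAT_REDUCE_REM_CHANNEL_END_C variants added here for the reference and zen2 platforms fold each output pixel's FLOAT_C_ob partial sums into channel 0 and zero the remaining entries; the guard means it only fires when the remainder path runs with a single output channel (_C_ob == 1). A minimal standalone sketch of that reduction written as a plain function (names and sizes are hypothetical):

// Hedged sketch of the same reduction the scalar macro body performs on c_tile.
void reduce_rem_channel(float *c_tile, unsigned O_w_left, unsigned C_ob)
{
    for (unsigned kk = 0; kk < O_w_left; kk++)      // one output pixel at a time
    {
        float *c_channel = c_tile + kk * C_ob;
        for (unsigned jj = 1; jj < C_ob; jj++)
        {
            c_channel[0] += c_channel[jj];          // accumulate every partial into lane 0
            c_channel[jj] = 0;                      // clear the consumed partial
        }
    }
}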