diff --git a/include/small/abstract_layer.hpp b/include/small/abstract_layer.hpp index d018834..4442a90 100644 --- a/include/small/abstract_layer.hpp +++ b/include/small/abstract_layer.hpp @@ -23,6 +23,12 @@ #define DEBUG 0 + +#define ELEMENTAL 1 +#define BLOCK 2 + +#define PARALLEL_DIST ELEMENTAL + namespace small { namespace detail @@ -67,7 +73,7 @@ namespace detail //**************************************************************************** /// @todo add parameter: step -#define FLOAT_ABSTRACT_OP_END(op_type, op_class, a_cur, b_cur, c_cur, W_elements, _C_ob) \ +#define FLOAT_ABSTRACT_OP_END(op_type, op_class, step, a_cur, b_cur, c_cur, W_elements, _C_ob) \ if constexpr (op_type == OP_CONV) \ { \ if constexpr (op_class == 1) \ @@ -103,6 +109,7 @@ namespace detail //**************************************************************************** +//@todo: See if these different functions can be merged. Might require in-assembly microkernels template +void inline compute_with_padding(dim_t H_lb, dim_t H_ub, + dim_t W_lb, dim_t W_ub, + dim_t F_w, + dim_t W_elements, + dim_t input_col_stride, + ScalarT const *F, + ScalarT const *I, + c_tile_t *c_cur, + const dim_t F_c_left) +{ + #if DEBUG + printf("F_c_left %d\n", F_c_left); + + #endif + constexpr dim_t _C_ob = _G_b * _K_b; + const dim_t _C_ib = _G_b * F_c_left; + const dim_t step = _stride * _C_ib; + + for (uint32_t n = H_lb; n < H_ub; n++) + { + int filter_offset_h = n * F_w * F_c_left * _G_b * _K_b; + int input_stencil_h = (n - H_lb) * input_col_stride; /*+ input_col_offset + input_row_offset*/ + + for (uint32_t m = W_lb; m < W_ub; m++) + { + int filter_offset_w = m * F_c_left * _G_b * _K_b + filter_offset_h; + /* This is C_ib because the microkernel stretches across groups*/ + int input_stencil_w = (m - W_lb) * _C_ib + input_stencil_h; + + ScalarT const *b = F + filter_offset_w; + ScalarT const *a = I + input_stencil_w; + + // TODO: reintroduce convolution + for (uint32_t ii = 0; ii < F_c_left / _UNROLL; ii++) + { + + /// @note using platform C_ob + ScalarT const *b_cur = b + ii * _UNROLL * FLOAT_C_ob; + ScalarT const *a_cur = a + ii * _UNROLL; + +#if DEBUG + printf("\t compute w padding_in channel %d\n", ii); + printf("input values: %f \n", a_cur[0]); + printf("filter values for output channel 0 : %f %f %f %f \n", b_cur[0 ], b_cur[1 ], b_cur[2 ], b_cur[3 ]); + printf("input col stride %d\n", input_col_stride); + +#endif + FLOAT_ABSTRACT_OP_END(op_type, op_class, step, a_cur, b_cur, c_cur, W_elements, _C_ob); + } + } + } + +} + +// Edge case to handle remainder input and output channels +// When streamlined, this is the only function that should remain +template +void inline compute_with_padding(dim_t H_lb, dim_t H_ub, + dim_t W_lb, dim_t W_ub, + dim_t F_w, + dim_t W_elements, + dim_t input_col_stride, + ScalarT const *F, + ScalarT const *I, + c_tile_t *c_cur, + const dim_t F_c_left, + const dim_t G_left, + const dim_t K_left) +{ +#if DEBUG ==1 + printf("_F_cb %d _G_b %d _K_b %d\n", F_c_left, _G_b, _K_b); + +#endif + + const dim_t _C_ob = G_left * K_left; + const dim_t _C_ib = G_left * F_c_left; + const dim_t step = _stride * _C_ib; + + for (uint32_t n = H_lb; n < H_ub; n++) + { + int filter_offset_h = n * F_w * F_c_left * G_left * K_left; + int input_stencil_h = (n - H_lb) * input_col_stride; /*+ input_col_offset + input_row_offset*/ + + for (uint32_t m = W_lb; m < W_ub; m++) + { + int filter_offset_w = m * F_c_left * G_left * K_left + filter_offset_h; + /* This is C_ib because the microkernel stretches across groups*/ + int 
input_stencil_w = (m - W_lb) * _C_ib + input_stencil_h; + + ScalarT const *b = F + filter_offset_w; + ScalarT const *a = I + input_stencil_w; + + // TODO: reintroduce convolution + for (uint32_t ii = 0; ii < F_c_left / _UNROLL; ii++) + { + + /// @note using platform C_ob + ScalarT const *b_cur = b + ii * _UNROLL * _C_ob; + ScalarT const *a_cur = a + ii * _UNROLL; + +#if DEBUG == 1 + printf("\t compute w padding_in channel %d\n", ii); + printf("input values: %f \n", a_cur[0]); + if (op_type == OP_CONV ) + printf("filter values for output channel 0 : %f %f %f %f \n", b_cur[0], b_cur[1], b_cur[2], b_cur[3]); + printf("input col stride %d\n", input_col_stride); + +#endif + FLOAT_ABSTRACT_OP_END(op_type, op_class, step, a_cur, b_cur, c_cur, W_elements, _C_ob); } } } } //**************************************************************************** +// @todo: is kernel left buggy? Needs Load_C_strided for left padding output elements in pooling template AccumT dim_t H_lb = 0, - dim_t H_ub = 0) -{ - constexpr dim_t _C_ob = _G_b * _K_b; - // constexpr dim_t _C_ib = _G_b * _F_cb; - // constexpr dim_t step = _stride * _C_ib; + dim_t H_ub = 0, + AccumT k_zero = (AccumT)0, + AccumT I_offset = 0, + AccumT F_offset = 0) + { + constexpr dim_t _C_ob = _G_b * _K_b;; const dim_t H_UPPER = ((!H_ub) * (F_h)) + (H_ub); FLOAT_DEF_END_C(_O_wb, _C_ob); @@ -203,17 +353,20 @@ void inline kernel_left( // dim_t c_cur = 0; for (uint32_t k_p = 0; k_p < l_pad_el; k_p++) { - compute_with_padding( - H_lb, H_UPPER, - W_i_valid, F_w, - F_w, - 1, - input_col_stride, - F, - I_ptr, - c_cur); + + compute_with_padding( + H_lb, H_UPPER, + W_i_valid, F_w, + F_w, + 1, + input_col_stride, + F, + I_ptr, + c_cur); + + c_cur += (_K_b * _G_b) / (FLOAT_SIMD_EPILOGUE); // c_cur += 1; @@ -231,7 +384,6 @@ void inline kernel_left( O_ptr += _G_b * _K_b; } -//**************************************************************************** template -void inline kernel( +void inline rem_kernel_left( bool first, dim_t F_h, dim_t F_w, dim_t input_col_stride, + dim_t l_pad_el, + dim_t l_pad, ScalarT const *I, ScalarT const *F, AccumT *O, // ScalarT -> AccumT + const dim_t G_left, + const dim_t K_left, dim_t H_lb = 0, - dim_t H_ub = 0, - dim_t W_lb = 0, - dim_t W_ub = 0) + dim_t H_ub = 0 + ) { - constexpr dim_t _C_ob = _G_b * _K_b; - constexpr dim_t _C_ib = _G_b * _F_cb; - constexpr dim_t step = _stride * _C_ib; - + const dim_t _C_ob = G_left * K_left; const dim_t H_UPPER = ((!H_ub) * (F_h)) + (H_ub); - // const dim_t W_UPPER = ((!W_ub) * (F_w)) + (W_ub); + FLOAT_DEF_END_C(_O_wb, _C_ob); + + // left padding elements + AccumT *O_ptr = O; // ScalarT -> AccumT + ScalarT const *I_ptr = I; + + int W_i_valid = l_pad; - FLOAT_DEF_TILE_C(_O_wb, _C_ob); if (first) { - FLOAT_ZERO_TILE_C(_O_wb, _C_ob); - if (op_type == OP_MAX_POOL || op_type == OP_MUL) - { - /// @note using platform C_ob - FLOAT_LOAD_TILE_C_strided(I, step, _O_wb, FLOAT_C_ob); - } - else if (op_type == OP_UPSAMPLE) - { - FLOAT_LOAD_TILE_C_upsample(I, _stride, _C_ib, _O_wb, _C_ob); - } + FLOAT_ZERO_END_C(l_pad_el, _C_ob); } else { - FLOAT_LOAD_TILE_C(O, _O_wb, _C_ob); - if constexpr (op_type == OP_UPSAMPLE) - { - FLOAT_ACCUM_TILE_C_upsample(I, _stride, _C_ib, _O_wb, _C_ob); - } - //@todo support reduction tree-like kernel (for global reductions) + FLOAT_LOAD_END_C(O_ptr, l_pad_el, _C_ob); } - for (uint32_t n = H_lb; n < H_UPPER; n++) + c_tile_t *c_cur = c_tile; + // dim_t c_cur = 0; + for (uint32_t k_p = 0; k_p < l_pad_el; k_p++) { - int filter_offset_h = n * F_w * _F_cb * _G_b * _K_b; - int 
input_stencil_h = (n - H_lb) * input_col_stride; /*+ input_col_offset + input_row_offset*/ - - for (uint32_t m = 0; m < F_w; m++) - { - int filter_offset_w = m * _F_cb * _G_b * _K_b + filter_offset_h; - // This is C_ob because the microkernel stretches across groups - int input_stencil_w = (m - W_lb) * _C_ib + input_stencil_h; - - ScalarT const *b = F + filter_offset_w; - ScalarT const *a = I + input_stencil_w; - for (uint32_t ii = 0; ii < _F_cb / _UNROLL; ii++) - { - /// @note using platform C_ob - ScalarT const *b_cur = b + ii * _UNROLL * FLOAT_C_ob; - ScalarT const *a_cur = a + ii * _UNROLL; - FLOAT_ABSTRACT_OP(op_type, op_class, a_cur, b_cur, _O_wb, _C_ob); /// @todo pass _C_ob - } - } + // UNROLL has to be 1 for this to work (FLOAT_UNROLL == 1 for ZEN2) + compute_with_padding( + H_lb, H_UPPER, + W_i_valid, F_w, + F_w, + 1, + input_col_stride, + F, + I_ptr, + c_cur, + _F_cb, + G_left, + K_left); + + c_cur += (G_left * K_left) / (FLOAT_SIMD_EPILOGUE); + // c_cur += 1; + W_i_valid -= _stride; + // I_ptr += ()*(_stride * _F_cb * _G_b); } - + // Fusion Slot # 1 + // Include division for Average Pooling if (op_type == OP_AVERAGE_POOL) { float norm = 1.0 / (1.0 * F_h * F_w); - FLOAT_DIV_TILE_C(norm, _O_wb, _C_ob); + FLOAT_DIV_END_C(c_tile, norm, l_pad_el, _C_ob); } - FLOAT_STORE_TILE_C(O, _O_wb, _C_ob); - //@todo support reduction-tree like store for global reductions + FLOAT_STORE_END_C(O_ptr, l_pad_el, _C_ob); + O_ptr += G_left * K_left; } -//**************************************************************************** -// TODO: Explain the difference between kernel and kernel_pad +// Edge case to handle remainder input channels template -void inline kernel_pad( +void inline kernel_left_rem( bool first, dim_t F_h, dim_t F_w, dim_t input_col_stride, + dim_t l_pad_el, + dim_t l_pad, ScalarT const *I, ScalarT const *F, AccumT *O, // ScalarT -> AccumT + const dim_t F_c_left, dim_t H_lb = 0, - dim_t H_ub = 0, - dim_t W_lb = 0, - dim_t W_ub = 0) + dim_t H_ub = 0) { constexpr dim_t _C_ob = _G_b * _K_b; - constexpr dim_t _C_ib = _G_b * _F_cb; - constexpr dim_t step = _stride * _C_ib; + ; const dim_t H_UPPER = ((!H_ub) * (F_h)) + (H_ub); - // const dim_t W_UPPER = ((!W_ub) * (F_w)) + (W_ub); + FLOAT_DEF_END_C(_O_wb, _C_ob); + + // left padding elements + AccumT *O_ptr = O; // ScalarT -> AccumT + ScalarT const *I_ptr = I; + + int W_i_valid = l_pad; - FLOAT_DEF_TILE_C(_O_wb, _C_ob); if (first) { - FLOAT_ZERO_TILE_C(_O_wb, _C_ob); - - //@note padding should always be 'v' for pointwise operations, so this code path should not be used - if(op_type == OP_MUL) - { - FLOAT_LOAD_TILE_C_strided(I, step, _O_wb, _C_ob); - } + FLOAT_ZERO_END_C(l_pad_el, _C_ob); } else { - FLOAT_LOAD_TILE_C(O, _O_wb, _C_ob); + FLOAT_LOAD_END_C(O_ptr, l_pad_el, _C_ob); } - - for (uint32_t n = H_lb; n < H_UPPER; n++) + c_tile_t *c_cur = c_tile; + // dim_t c_cur = 0; + for (uint32_t k_p = 0; k_p < l_pad_el; k_p++) { - int filter_offset_h = n * F_w * _F_cb * _G_b * _K_b; - int input_stencil_h = /*input_col_offset + input_row_offset +*/ - (n - H_lb) * input_col_stride; - - for (uint32_t m = 0; m < F_w; m++) - { - int filter_offset_w = m * _F_cb * _G_b * _K_b + filter_offset_h; - // This is C_ob because the microkernel stretches across groups - int input_stencil_w = (m - W_lb) * _C_ib + input_stencil_h; - ScalarT const *b = F + filter_offset_w; - ScalarT const *a = I + input_stencil_w; - - for (uint32_t ii = 0; ii < _F_cb / _UNROLL; ii++) - { - /// @note using platform C_ob - ScalarT const *b_cur = b + ii * _UNROLL * FLOAT_C_ob; - 
ScalarT const *a_cur = a + ii * _UNROLL; + compute_with_padding( + H_lb, H_UPPER, + W_i_valid, F_w, + F_w, + 1, + input_col_stride, + F, + I_ptr, + c_cur, + F_c_left); - FLOAT_ABSTRACT_OP(op_type, op_class, a_cur, b_cur, _O_wb, _C_ob); - } - } + c_cur += (_K_b * _G_b) / (FLOAT_SIMD_EPILOGUE); + // c_cur += 1; + W_i_valid -= _stride; + // I_ptr += ()*(_stride * _F_cb * _G_b); } + // Fusion Slot # 1 + // Include division for Average Pooling if (op_type == OP_AVERAGE_POOL) { float norm = 1.0 / (1.0 * F_h * F_w); - FLOAT_DIV_TILE_C(norm, _O_wb, _C_ob); + FLOAT_DIV_END_C(c_tile, norm, l_pad_el, _C_ob); } - FLOAT_STORE_TILE_C(O, _O_wb, _C_ob); + FLOAT_STORE_END_C(O_ptr, l_pad_el, _C_ob); + O_ptr += _G_b * _K_b; } -//**************************************************************************** +// Edge case to handle remainder input channels template -void inline kernel_right( +void inline rem_kernel_left_rem( bool first, dim_t F_h, dim_t F_w, dim_t input_col_stride, - dim_t O_w_left, - dim_t r_pad_el, - dim_t r_pad, + dim_t l_pad_el, + dim_t l_pad, ScalarT const *I, ScalarT const *F, AccumT *O, // ScalarT -> AccumT + const dim_t F_c_left, + const dim_t G_left, + const dim_t K_left, dim_t H_lb = 0, dim_t H_ub = 0) { - constexpr dim_t _C_ob = _G_b * _K_b; - constexpr dim_t _C_ib = _G_b * _F_cb; - constexpr dim_t step = _stride * _C_ib; + const dim_t _C_ob = G_left * K_left; + ; + const dim_t H_UPPER = ((!H_ub) * (F_h)) + (H_ub); FLOAT_DEF_END_C(_O_wb, _C_ob); -#if DEBUG + + // left padding elements + AccumT *O_ptr = O; // ScalarT -> AccumT + ScalarT const *I_ptr = I; + + int W_i_valid = l_pad; + + if (first) + { + FLOAT_ZERO_END_C(l_pad_el, _C_ob); + } + else + { + FLOAT_LOAD_END_C(O_ptr, l_pad_el, _C_ob); + } + + c_tile_t *c_cur = c_tile; + // dim_t c_cur = 0; + for (uint32_t k_p = 0; k_p < l_pad_el; k_p++) + { + + compute_with_padding( + H_lb, H_UPPER, + W_i_valid, F_w, + F_w, + 1, + input_col_stride, + F, + I_ptr, + c_cur, + F_c_left, + G_left, + K_left); + + c_cur += (K_left * G_left) / (FLOAT_SIMD_EPILOGUE); + // c_cur += 1; + W_i_valid -= _stride; + // I_ptr += ()*(_stride * _F_cb * _G_b); + } + // Fusion Slot # 1 + // Include division for Average Pooling + if (op_type == OP_AVERAGE_POOL) + { + float norm = 1.0 / (1.0 * F_h * F_w); + FLOAT_DIV_END_C(c_tile, norm, l_pad_el, _C_ob); + } + FLOAT_STORE_END_C(O_ptr, l_pad_el, _C_ob); + O_ptr += G_left * K_left; +} + +//**************************************************************************** +template +void inline kernel( + bool first, + dim_t F_h, + dim_t F_w, + dim_t input_col_stride, + ScalarT const *I, + ScalarT const *F, + AccumT *O, // ScalarT -> AccumT + dim_t H_lb = 0, + dim_t H_ub = 0, + dim_t W_lb = 0, + dim_t W_ub = 0) +{ + constexpr dim_t _C_ob = _G_b * _K_b; + constexpr dim_t _C_ib = _G_b * _F_cb; + constexpr dim_t step = _stride * _C_ib; + + + const dim_t H_UPPER = ((!H_ub) * (F_h)) + (H_ub); + // const dim_t W_UPPER = ((!W_ub) * (F_w)) + (W_ub); + + FLOAT_DEF_TILE_C(_O_wb, _C_ob); + if (first) + { + FLOAT_ZERO_TILE_C(_O_wb, _C_ob); + if (op_type == OP_MAX_POOL || op_type == OP_MUL) + { + /// @note using platform C_ob + FLOAT_LOAD_TILE_C_strided(I, step, _O_wb, _C_ob); + } + else if (op_type == OP_UPSAMPLE) + { + FLOAT_LOAD_TILE_C_upsample(I, _stride, _C_ib, _O_wb, _C_ob); + } + } + else + { + FLOAT_LOAD_TILE_C(O, _O_wb, _C_ob); + if constexpr (op_type == OP_UPSAMPLE) + { + FLOAT_ACCUM_TILE_C_upsample(I, _stride, _C_ib, _O_wb, _C_ob); + } + //@todo support reduction tree-like kernel (for global reductions) + } + + for (uint32_t n 
= H_lb; n < H_UPPER; n++) + { + int filter_offset_h = n * F_w * _F_cb * _G_b * _K_b; + int input_stencil_h = (n - H_lb) * input_col_stride; /*+ input_col_offset + input_row_offset*/ + + for (uint32_t m = 0; m < F_w; m++) + { + int filter_offset_w = m * _F_cb * _G_b * _K_b + filter_offset_h; + // This is C_ob because the microkernel stretches across groups + int input_stencil_w = (m - W_lb) * _C_ib + input_stencil_h; + + ScalarT const *b = F + filter_offset_w; + ScalarT const *a = I + input_stencil_w; + for (uint32_t ii = 0; ii < _F_cb / _UNROLL; ii++) + { + /// @note using platform C_ob + ScalarT const *b_cur = b + ii * _UNROLL * _C_ob; + ScalarT const *a_cur = a + ii * _UNROLL; + FLOAT_ABSTRACT_OP(op_type, op_class, a_cur, b_cur, _O_wb, _C_ob); /// @todo pass _C_ob + } + } + } + + if (op_type == OP_AVERAGE_POOL) + { + float norm = 1.0 / (1.0 * F_h * F_w); + FLOAT_DIV_TILE_C(norm, _O_wb, _C_ob); + } + FLOAT_STORE_TILE_C(O, _O_wb, _C_ob); + //@todo support reduction-tree like store for global reductions +} + +//Edge case to handle remainder output channels + +template +void inline rem_kernel( + bool first, + dim_t F_h, + dim_t F_w, + dim_t input_col_stride, + ScalarT const *I, + ScalarT const *F, + AccumT *O, // ScalarT -> AccumT + const dim_t G_left, + const dim_t K_left, + dim_t H_lb = 0, + dim_t H_ub = 0, + dim_t W_lb = 0, + dim_t W_ub = 0) +{ + const dim_t _C_ob = G_left * K_left; + const dim_t _C_ib = G_left * _F_cb; + const dim_t step = _stride * _C_ib; + + // printf(" _C_ob %d\n", _C_ob); + // printf(" _C_ib %d\n", _C_ib); + const dim_t H_UPPER = ((!H_ub) * (F_h)) + (H_ub); + // const dim_t W_UPPER = ((!W_ub) * (F_w)) + (W_ub); + + FLOAT_DEF_END_C(_O_wb, _C_ob); + if (first) + { + FLOAT_ZERO_END_C(_O_wb, _C_ob); + if (op_type == OP_MAX_POOL || op_type == OP_MUL) + { + /// @note using platform C_ob + FLOAT_LOAD_END_C_strided(I, step, _O_wb, _C_ob); + } + else if (op_type == OP_UPSAMPLE) + { + FLOAT_LOAD_END_C_upsample(I, _stride, _C_ib, _O_wb, _C_ob); + } + } + else + { + FLOAT_LOAD_END_C(O, _O_wb, _C_ob); + if constexpr (op_type == OP_UPSAMPLE) + { + FLOAT_ACCUM_END_C_upsample(I, _stride, _C_ib, _O_wb, _C_ob); + } + //@todo support reduction tree-like kernel (for global reductions) + } + + for (uint32_t n = H_lb; n < H_UPPER; n++) + { + int filter_offset_h = n * F_w * _F_cb * G_left * K_left; + int input_stencil_h = (n - H_lb) * input_col_stride; /*+ input_col_offset + input_row_offset*/ + + for (uint32_t m = 0; m < F_w; m++) + { + int filter_offset_w = m * _F_cb * G_left * K_left + filter_offset_h; + // This is C_ob because the microkernel stretches across groups + int input_stencil_w = (m - W_lb) * _C_ib + input_stencil_h; + + ScalarT const *b = F + filter_offset_w; + ScalarT const *a = I + input_stencil_w; + for (uint32_t ii = 0; ii < _F_cb / _UNROLL; ii++) + { + /// @note using platform C_ob + ScalarT const *b_cur = b + ii * _UNROLL * _C_ob; + ScalarT const *a_cur = a + ii * _UNROLL; + FLOAT_ABSTRACT_OP_END(op_type, op_class, step, a_cur, b_cur, c_tile, _O_wb, _C_ob); /// @todo pass _C_ob + } + } + } + + if (op_type == OP_AVERAGE_POOL) + { + float norm = 1.0 / (1.0 * F_h * F_w); + FLOAT_DIV_END_C(c_tile, norm, _O_wb, _C_ob); + } + + FLOAT_STORE_END_C(O, _O_wb, _C_ob); + //@todo support reduction-tree like store for global reductions +} + +//Edge case to handle remainder input channels +template +void inline kernel_rem( + bool first, + dim_t F_h, + dim_t F_w, + dim_t input_col_stride, + ScalarT const *I, + ScalarT const *F, + AccumT *O, // ScalarT -> AccumT + const dim_t F_c_left, 
+ dim_t H_lb = 0, + dim_t H_ub = 0, + dim_t W_lb = 0, + dim_t W_ub = 0) +{ + constexpr dim_t _C_ob = _G_b * _K_b; + dim_t _C_ib = _G_b * F_c_left; + dim_t step = _stride * _C_ib; + + const dim_t H_UPPER = ((!H_ub) * (F_h)) + (H_ub); + // const dim_t W_UPPER = ((!W_ub) * (F_w)) + (W_ub); + + FLOAT_DEF_TILE_C(_O_wb, _C_ob); + if (first) + { + FLOAT_ZERO_TILE_C(_O_wb, _C_ob); + if (op_type == OP_MAX_POOL || op_type == OP_MUL) + { + /// @note using platform C_ob + FLOAT_LOAD_TILE_C_strided(I, step, _O_wb, FLOAT_C_ob); + } + else if (op_type == OP_UPSAMPLE) + { + FLOAT_LOAD_TILE_C_upsample(I, _stride, _C_ib, _O_wb, _C_ob); + } + } + else + { + FLOAT_LOAD_TILE_C(O, _O_wb, _C_ob); + if constexpr (op_type == OP_UPSAMPLE) + { + FLOAT_ACCUM_TILE_C_upsample(I, _stride, _C_ib, _O_wb, _C_ob); + } + //@todo support reduction tree-like kernel (for global reductions) + } + + for (uint32_t n = H_lb; n < H_UPPER; n++) + { + int filter_offset_h = n * F_w * F_c_left * _G_b * _K_b; + int input_stencil_h = (n - H_lb) * input_col_stride; /*+ input_col_offset + input_row_offset*/ + + for (uint32_t m = 0; m < F_w; m++) + { + int filter_offset_w = m * F_c_left * _G_b * _K_b + filter_offset_h; + // This is C_ob because the microkernel stretches across groups + int input_stencil_w = (m - W_lb) * _C_ib + input_stencil_h; + + ScalarT const *b = F + filter_offset_w; + ScalarT const *a = I + input_stencil_w; + for (uint32_t ii = 0; ii < F_c_left / _UNROLL; ii++) + { + /// @note using platform C_ob + ScalarT const *b_cur = b + ii * _UNROLL * FLOAT_C_ob; + ScalarT const *a_cur = a + ii * _UNROLL; + FLOAT_ABSTRACT_OP(op_type, op_class, a_cur, b_cur, _O_wb, _C_ob); /// @todo pass _C_ob + } + } + } + + if (op_type == OP_AVERAGE_POOL) + { + float norm = 1.0 / (1.0 * F_h * F_w); + FLOAT_DIV_TILE_C(norm, _O_wb, _C_ob); + } + FLOAT_STORE_TILE_C(O, _O_wb, _C_ob); + //@todo support reduction-tree like store for global reductions +} + +// Edge case to handle remainder input and output channels +// Edge case to handle remainder input channels +template +void inline rem_kernel_rem( + bool first, + dim_t F_h, + dim_t F_w, + dim_t input_col_stride, + ScalarT const *I, + ScalarT const *F, + AccumT *O, // ScalarT -> AccumT + const dim_t F_c_left, + const dim_t G_left, + const dim_t K_left, + dim_t H_lb = 0, + dim_t H_ub = 0, + dim_t W_lb = 0, + dim_t W_ub = 0) +{ + const dim_t _C_ob = G_left * K_left; + dim_t _C_ib = G_left * F_c_left; + dim_t step = _stride * _C_ib; + + const dim_t H_UPPER = ((!H_ub) * (F_h)) + (H_ub); + // const dim_t W_UPPER = ((!W_ub) * (F_w)) + (W_ub); + + FLOAT_DEF_END_C(_O_wb, _C_ob); + if (first) + { + FLOAT_ZERO_END_C(_O_wb, _C_ob); + if (op_type == OP_MAX_POOL || op_type == OP_MUL) + { + /// @note using platform C_ob + FLOAT_LOAD_END_C_strided(I, step, _O_wb, FLOAT_C_ob); + } + else if (op_type == OP_UPSAMPLE) + { + FLOAT_LOAD_END_C_upsample(I, _stride, _C_ib, _O_wb, _C_ob); + } + } + else + { + FLOAT_LOAD_END_C(O, _O_wb, _C_ob); + if constexpr (op_type == OP_UPSAMPLE) + { + FLOAT_ACCUM_END_C_upsample(I, _stride, _C_ib, _O_wb, _C_ob); + } + //@todo support reduction tree-like kernel (for global reductions) + } + + for (uint32_t n = H_lb; n < H_UPPER; n++) + { + int filter_offset_h = n * F_w * F_c_left * G_left * K_left; + int input_stencil_h = (n - H_lb) * input_col_stride; /*+ input_col_offset + input_row_offset*/ + + for (uint32_t m = 0; m < F_w; m++) + { + int filter_offset_w = m * F_c_left * G_left * K_left + filter_offset_h; + // This is C_ob because the microkernel stretches across groups + int 
input_stencil_w = (m - W_lb) * _C_ib + input_stencil_h; + + ScalarT const *b = F + filter_offset_w; + ScalarT const *a = I + input_stencil_w; + for (uint32_t ii = 0; ii < F_c_left / _UNROLL; ii++) + { + /// @note using platform C_ob + ScalarT const *b_cur = b + ii * _UNROLL * _C_ob; + ScalarT const *a_cur = a + ii * _UNROLL; + FLOAT_ABSTRACT_OP_END(op_type, op_class, step, a_cur, b_cur, c_tile, _O_wb, _C_ob); /// @todo pass _C_ob + } + } + } + + if (op_type == OP_AVERAGE_POOL) + { + float norm = 1.0 / (1.0 * F_h * F_w); + FLOAT_DIV_END_C(c_tile, norm, _O_wb, _C_ob); + } + FLOAT_STORE_END_C(O, _O_wb, _C_ob); + //@todo support reduction-tree like store for global reductions +} + +//**************************************************************************** +//The kernel pad function allows for padding in the height and width of the filter +// If the compiler complies, kernel_pad and kernel could be combined +template +void inline kernel_pad( + bool first, + dim_t F_h, + dim_t F_w, + dim_t input_col_stride, + ScalarT const *I, + ScalarT const *F, + AccumT *O, // ScalarT -> AccumT + dim_t H_lb = 0, + dim_t H_ub = 0, + dim_t W_lb = 0, + dim_t W_ub = 0) +{ + constexpr dim_t _C_ob = _G_b * _K_b; + constexpr dim_t _C_ib = _G_b * _F_cb; + constexpr dim_t step = _stride * _C_ib; + + const dim_t H_UPPER = ((!H_ub) * (F_h)) + (H_ub); + // const dim_t W_UPPER = ((!W_ub) * (F_w)) + (W_ub); + + FLOAT_DEF_TILE_C(_O_wb, _C_ob); + if (first) + { + FLOAT_ZERO_TILE_C(_O_wb, _C_ob); + + //@note padding should always be 'v' for pointwise operations, so this code path should not be used + if(op_type == OP_MUL) + { + FLOAT_LOAD_TILE_C_strided(I, step, _O_wb, _C_ob); + } + + } + else + { + FLOAT_LOAD_TILE_C(O, _O_wb, _C_ob); + } + + + for (uint32_t n = H_lb; n < H_UPPER; n++) + { + int filter_offset_h = n * F_w * _F_cb * _G_b * _K_b; + int input_stencil_h = /*input_col_offset + input_row_offset +*/ + (n - H_lb) * input_col_stride; + + for (uint32_t m = 0; m < F_w; m++) + { + int filter_offset_w = m * _F_cb * _G_b * _K_b + filter_offset_h; + // This is C_ob because the microkernel stretches across groups + int input_stencil_w = (m - W_lb) * _C_ib + input_stencil_h; + + ScalarT const *b = F + filter_offset_w; + ScalarT const *a = I + input_stencil_w; + + for (uint32_t ii = 0; ii < _F_cb / _UNROLL; ii++) + { + /// @note using platform C_ob + ScalarT const *b_cur = b + ii * _UNROLL * FLOAT_C_ob; + ScalarT const *a_cur = a + ii * _UNROLL; + + FLOAT_ABSTRACT_OP(op_type, op_class, a_cur, b_cur, _O_wb, _C_ob); + } + } + } + if (op_type == OP_AVERAGE_POOL) + { + float norm = 1.0 / (1.0 * F_h * F_w); + FLOAT_DIV_TILE_C(norm, _O_wb, _C_ob); + } + FLOAT_STORE_TILE_C(O, _O_wb, _C_ob); +} + +//Edge case to handle remainder output channels +template +void inline rem_kernel_pad( + bool first, + dim_t F_h, + dim_t F_w, + dim_t input_col_stride, + ScalarT const *I, + ScalarT const *F, + AccumT *O, // ScalarT -> AccumT + const dim_t G_left, + const dim_t K_left, + dim_t H_lb = 0, + dim_t H_ub = 0, + dim_t W_lb = 0, + dim_t W_ub = 0) +{ + const dim_t _C_ob = G_left * K_left; + const dim_t _C_ib = G_left * _F_cb; + const dim_t step = _stride * _C_ib; + + const dim_t H_UPPER = ((!H_ub) * (F_h)) + (H_ub); + // const dim_t W_UPPER = ((!W_ub) * (F_w)) + (W_ub); + + FLOAT_DEF_END_C(_O_wb, _C_ob); + if (first) + { + FLOAT_ZERO_END_C(_O_wb, _C_ob); + + //@note padding should always be 'v' for pointwise operations, so this code path should not be used + if(op_type == OP_MUL) + { + FLOAT_LOAD_END_C_strided(I, step, _O_wb, _C_ob); + } + } + 
else + { + FLOAT_LOAD_END_C(O, _O_wb, _C_ob); + } + + + for (uint32_t n = H_lb; n < H_UPPER; n++) + { + int filter_offset_h = n * F_w * _F_cb * _C_ob; + int input_stencil_h = /*input_col_offset + input_row_offset +*/ + (n - H_lb) * input_col_stride; + + for (uint32_t m = 0; m < F_w; m++) + { + int filter_offset_w = m * _F_cb * _C_ob + filter_offset_h; + // This is C_ob because the microkernel stretches across groups + int input_stencil_w = (m - W_lb) * _C_ib + input_stencil_h; + + ScalarT const *b = F + filter_offset_w; + ScalarT const *a = I + input_stencil_w; + + for (uint32_t ii = 0; ii < _F_cb / _UNROLL; ii++) + { + /// @note using platform C_ob + ScalarT const *b_cur = b + ii * _UNROLL * _C_ob; + ScalarT const *a_cur = a + ii * _UNROLL; + + FLOAT_ABSTRACT_OP_END(op_type, op_class, step, a_cur, b_cur, c_tile, _O_wb, _C_ob); + } + } + } + if (op_type == OP_AVERAGE_POOL) + { + float norm = 1.0 / (1.0 * F_h * F_w); + FLOAT_DIV_END_C(c_tile, norm, _O_wb, _C_ob); + } + FLOAT_STORE_END_C(O, _O_wb, _C_ob); +} + + +//Edge case to handle remainder input channels +template +void inline kernel_pad_rem( + bool first, + dim_t F_h, + dim_t F_w, + dim_t input_col_stride, + ScalarT const *I, + ScalarT const *F, + AccumT *O, // ScalarT -> AccumT + const dim_t F_c_left, + dim_t H_lb = 0, + dim_t H_ub = 0, + dim_t W_lb = 0, + dim_t W_ub = 0 + ) +{ + constexpr dim_t _C_ob = _G_b * _K_b; + const dim_t _C_ib = _G_b * F_c_left; + const dim_t step = _stride * _C_ib; + + const dim_t H_UPPER = ((!H_ub) * (F_h)) + (H_ub); + // const dim_t W_UPPER = ((!W_ub) * (F_w)) + (W_ub); + + FLOAT_DEF_TILE_C(_O_wb, _C_ob); + if (first) + { + FLOAT_ZERO_TILE_C(_O_wb, _C_ob); + + //@note padding should always be 'v' for pointwise operations, so this code path should not be used + if (op_type == OP_MUL) + { + FLOAT_LOAD_TILE_C_strided(I, step, _O_wb, _C_ob); + } + } + else + { + FLOAT_LOAD_TILE_C(O, _O_wb, _C_ob); + } + + for (uint32_t n = H_lb; n < H_UPPER; n++) + { + int filter_offset_h = n * F_w * F_c_left * _G_b * _K_b; + int input_stencil_h = /*input_col_offset + input_row_offset +*/ + (n - H_lb) * input_col_stride; + + for (uint32_t m = 0; m < F_w; m++) + { + int filter_offset_w = m * F_c_left * _G_b * _K_b + filter_offset_h; + // This is C_ob because the microkernel stretches across groups + int input_stencil_w = (m - W_lb) * _C_ib + input_stencil_h; + + ScalarT const *b = F + filter_offset_w; + ScalarT const *a = I + input_stencil_w; + + for (uint32_t ii = 0; ii < F_c_left / _UNROLL; ii++) + { + /// @note using platform C_ob + ScalarT const *b_cur = b + ii * _UNROLL * _C_ob; + ScalarT const *a_cur = a + ii * _UNROLL; + + FLOAT_ABSTRACT_OP(op_type, op_class, a_cur, b_cur, _O_wb, _C_ob); + } + } + } + if (op_type == OP_AVERAGE_POOL) + { + float norm = 1.0 / (1.0 * F_h * F_w); + FLOAT_DIV_TILE_C(norm, _O_wb, _C_ob); + } + FLOAT_STORE_TILE_C(O, _O_wb, _C_ob); +} + +//Edge case to handle remainder input and output channels +template +void inline rem_kernel_pad_rem( + bool first, + dim_t F_h, + dim_t F_w, + dim_t input_col_stride, + ScalarT const *I, + ScalarT const *F, + AccumT *O, // ScalarT -> AccumT + const dim_t F_c_left, + const dim_t G_left, + const dim_t K_left, + dim_t H_lb = 0, + dim_t H_ub = 0, + dim_t W_lb = 0, + dim_t W_ub = 0) +{ + const dim_t _C_ob = G_left * K_left; + const dim_t _C_ib = G_left * F_c_left; + const dim_t step = _stride * _C_ib; + + const dim_t H_UPPER = ((!H_ub) * (F_h)) + (H_ub); + // const dim_t W_UPPER = ((!W_ub) * (F_w)) + (W_ub); + + FLOAT_DEF_END_C(_O_wb, _C_ob); + if (first) + { + 
FLOAT_ZERO_END_C(_O_wb, _C_ob); + + //@note padding should always be 'v' for pointwise operations, so this code path should not be used + if (op_type == OP_MUL) + { + FLOAT_LOAD_END_C_strided(I, step, _O_wb, _C_ob); + } + } + else + { + FLOAT_LOAD_END_C(O, _O_wb, _C_ob); + } + + for (uint32_t n = H_lb; n < H_UPPER; n++) + { + int filter_offset_h = n * F_w * F_c_left * G_left * K_left; + int input_stencil_h = /*input_col_offset + input_row_offset +*/ + (n - H_lb) * input_col_stride; + + for (uint32_t m = 0; m < F_w; m++) + { + int filter_offset_w = m * F_c_left * G_left * K_left + filter_offset_h; + // This is C_ob because the microkernel stretches across groups + int input_stencil_w = (m - W_lb) * _C_ib + input_stencil_h; + + ScalarT const *b = F + filter_offset_w; + ScalarT const *a = I + input_stencil_w; + + for (uint32_t ii = 0; ii < F_c_left / _UNROLL; ii++) + { + /// @note using platform C_ob + ScalarT const *b_cur = b + ii * _UNROLL * _C_ob; + ScalarT const *a_cur = a + ii * _UNROLL; + + FLOAT_ABSTRACT_OP_END(op_type, op_class, step, a_cur, b_cur, c_tile, _O_wb, _C_ob); + } + } + } + if (op_type == OP_AVERAGE_POOL) + { + float norm = 1.0 / (1.0 * F_h * F_w); + FLOAT_DIV_END_C(c_tile, norm, _O_wb, _C_ob); + } + FLOAT_STORE_END_C(O, _O_wb, _C_ob); + //prinf the stored _O_wb x _C_ob output tile + +} + +//**************************************************************************** +template +void inline kernel_right( + bool first, + dim_t F_h, + dim_t F_w, + dim_t input_col_stride, + dim_t O_w_left, + dim_t r_pad_el, + dim_t r_pad, + ScalarT const *I, + ScalarT const *F, + AccumT *O, // ScalarT -> AccumT + dim_t H_lb = 0, + dim_t H_ub = 0) +{ + constexpr dim_t _C_ob = _G_b * _K_b; + constexpr dim_t _C_ib = _G_b * _F_cb; + constexpr dim_t step = _stride * _C_ib; + const dim_t H_UPPER = ((!H_ub) * (F_h)) + (H_ub); + FLOAT_DEF_END_C(_O_wb, _C_ob); + +#if DEBUG + printf("O_W_left %d r_pad_el %d\n", O_w_left, r_pad_el); +#endif + if (O_w_left) + { + if (first) + { + FLOAT_ZERO_END_C(O_w_left, _C_ob); + + if ( (op_type == OP_MUL)|| (op_type == OP_MAX_POOL && H_lb == 0 && H_ub == 0)) + { + FLOAT_LOAD_END_C_strided(I, step, O_w_left, _C_ob); + } + else if (op_type == OP_UPSAMPLE) + { + FLOAT_LOAD_END_C_upsample(I, _stride, _C_ib, O_w_left, _C_ob); + } + } + else + { + if constexpr(op_type == OP_ADD && op_class == 3) + { + FLOAT_ZERO_END_C(O_w_left, _C_ob); + } + FLOAT_LOAD_END_C(O, O_w_left, _C_ob); + if constexpr (op_type == OP_UPSAMPLE) + { + FLOAT_ACCUM_END_C_upsample(I, _stride, _C_ib, O_w_left, _C_ob); + } + } + + compute_with_padding( + H_lb, H_UPPER, + 0, F_w, + F_w, + O_w_left, + input_col_stride, + F, + I, + c_tile); + + if (op_type == OP_AVERAGE_POOL) + { + float norm = 1.0 / (1.0 * F_h * F_w); + FLOAT_DIV_END_C(c_tile, norm, O_w_left, _C_ob); + } + if constexpr(op_type == OP_ADD && op_class == 3 && _C_ob == 1) + { + /* If the operation reduces the channel dimension, reduce across channel dimension of simd tile*/ + FLOAT_REDUCE_CHANNEL_END_C(O_w_left, _C_ob) + } + + FLOAT_STORE_END_C(O, O_w_left, _C_ob); + } + + // right padding elements + AccumT *O_ptr = O + O_w_left * _C_ob; // ScalarT --> AccumT + ScalarT const *I_ptr = I + O_w_left * step; + int W_i_valid = F_w - 1; + + if (first) + { + FLOAT_ZERO_END_C(r_pad_el, _C_ob); + + // Initialize with 0 for the padding elements + + //@note padding should always be 'v' for pointwise operations, so this code path should not be used + if (op_type == OP_MUL) + { + FLOAT_LOAD_END_C_strided(I_ptr, step, r_pad_el, _C_ob); + } + } + else + { + 
FLOAT_LOAD_END_C(O_ptr, r_pad_el, _C_ob); + } + + c_tile_t *c_cur = c_tile; + // dim_t c_cur = 0; + for (uint32_t k_p = 0; k_p < r_pad_el; k_p++) + { + compute_with_padding( + H_lb, H_UPPER, + 0, W_i_valid, + F_w, + 1, + input_col_stride, + F, + I_ptr, + c_cur); + + c_cur += (_K_b * _G_b) / (FLOAT_SIMD_EPILOGUE); + W_i_valid -= _stride; + I_ptr += _stride * _F_cb * _G_b; + } + + if (op_type == OP_AVERAGE_POOL) + { + float norm = 1.0 / (1.0 * F_h * F_w); + FLOAT_DIV_END_C(c_tile, norm, r_pad_el, _C_ob); + } + + FLOAT_STORE_END_C(O_ptr, r_pad_el, _C_ob); +} + +//Edge case to handle remainder output channels +template +void inline rem_kernel_right( + bool first, + dim_t F_h, + dim_t F_w, + dim_t input_col_stride, + dim_t O_w_left, + dim_t r_pad_el, + dim_t r_pad, + ScalarT const *I, + ScalarT const *F, + AccumT *O, // ScalarT -> AccumT + const dim_t G_left, + const dim_t K_left, + dim_t H_lb = 0, + dim_t H_ub = 0) +{ + const dim_t _C_ob = G_left * K_left; + const dim_t _C_ib = G_left * _F_cb; + const dim_t step = _stride * _C_ib; + const dim_t H_UPPER = ((!H_ub) * (F_h)) + (H_ub); + FLOAT_DEF_END_C(_O_wb, _C_ob); + +#if DEBUG == 1 + printf("G_left %d K_left %d\n", G_left, K_left); + printf("kernel_right_rem\n"); + printf("First 5 input values: %f %f %f %f %f\n", I[0], I[1], I[2], I[3], I[4]); + if(op_type == OP_CONV) + printf("First 5 Filter values for output channel 0 : %f %f %f %f %f \n", F[0], F[1], F[2], F[3], F[4]); + printf("O_W_left %d r_pad_el %d\n", O_w_left, r_pad_el); + printf("input col stride %d\n", input_col_stride); +#endif + if (O_w_left) + { + if (first) + { + FLOAT_ZERO_END_C(O_w_left, _C_ob); + + if ((op_type == OP_MUL) || (op_type == OP_MAX_POOL && H_lb == 0 && H_ub == 0)) + { + FLOAT_LOAD_END_C_strided(I, step, O_w_left, _C_ob); + } + else if (op_type == OP_UPSAMPLE) + { + FLOAT_LOAD_END_C_upsample(I, _stride, _C_ib, O_w_left, _C_ob); + } + } + else + { + if constexpr (op_type == OP_ADD && op_class == 3) + { + FLOAT_ZERO_END_C(O_w_left, _C_ob); + } + FLOAT_LOAD_END_C(O, O_w_left, _C_ob); + if constexpr (op_type == OP_UPSAMPLE) + { + FLOAT_ACCUM_END_C_upsample(I, _stride, _C_ib, O_w_left, _C_ob); + } + } + compute_with_padding( + H_lb, H_UPPER, + 0, F_w, + F_w, + O_w_left, + input_col_stride, + F, + I, + c_tile, + _F_cb, + G_left, + K_left); + + if (op_type == OP_AVERAGE_POOL) + { + float norm = 1.0 / (1.0 * F_h * F_w); + FLOAT_DIV_END_C(c_tile, norm, O_w_left, _C_ob); + } + if (op_type == OP_ADD && op_class == 3 && _C_ob == 1) + { + /* If the operation reduces the channel dimension, reduce across channel dimension of simd tile*/ + FLOAT_REDUCE_REM_CHANNEL_END_C(O_w_left, _C_ob) + } + + FLOAT_STORE_END_C(O, O_w_left, _C_ob); +#if DEBUG == 1 + printf("First output value: %f %f %f %f \n", O[0], O[1], O[2], O[3]); +#endif + } + + // right padding elements + AccumT *O_ptr = O + O_w_left * _C_ob; // ScalarT --> AccumT + ScalarT const *I_ptr = I + O_w_left * step; + int W_i_valid = F_w - 1; + + if (first) + { + FLOAT_ZERO_END_C(r_pad_el, _C_ob); + + // Initialize with 0 for the padding elements + + //@note padding should always be 'v' for pointwise operations, so this code path should not be used + if (op_type == OP_MUL) + { + FLOAT_LOAD_END_C_strided(I_ptr, step, r_pad_el, _C_ob); + } + } + else + { + FLOAT_LOAD_END_C(O_ptr, r_pad_el, _C_ob); + } + + c_tile_t *c_cur = c_tile; + // dim_t c_cur = 0; + for (uint32_t k_p = 0; k_p < r_pad_el; k_p++) + { + compute_with_padding( + H_lb, H_UPPER, + 0, W_i_valid, + F_w, + 1, + input_col_stride, + F, + I_ptr, + c_cur, + _F_cb, + G_left, 
+ K_left); + + c_cur += (K_left * G_left) / (FLOAT_SIMD_EPILOGUE); + W_i_valid -= _stride; + I_ptr += _stride * _C_ib; + } + + if (op_type == OP_AVERAGE_POOL) + { + float norm = 1.0 / (1.0 * F_h * F_w); + FLOAT_DIV_END_C(c_tile, norm, r_pad_el, _C_ob); + } + + FLOAT_STORE_END_C(O_ptr, r_pad_el, _C_ob); +} + +//Edge case to handle remainder input channels +template +void inline kernel_right_rem( + bool first, + dim_t F_h, + dim_t F_w, + dim_t input_col_stride, + dim_t O_w_left, + dim_t r_pad_el, + dim_t r_pad, + ScalarT const *I, + ScalarT const *F, + AccumT *O, // ScalarT -> AccumT + const dim_t F_c_left, + dim_t H_lb = 0, + dim_t H_ub = 0) +{ + constexpr dim_t _C_ob = _G_b * _K_b; + const dim_t _C_ib = _G_b * F_c_left; + const dim_t step = _stride * _C_ib; + const dim_t H_UPPER = ((!H_ub) * (F_h)) + (H_ub); + FLOAT_DEF_END_C(_O_wb, _C_ob); + +#if DEBUG + printf("kernel_right_rem\n"); + printf("First 5 input values: %f %f %f %f %f\n", I[0], I[1], I[2], I[3], I[4]); + printf("First 5 Filter values for output channel 0 : %f %f %f %f %f \n", F[0*_C_ob], F[1*_C_ob], F[2*_C_ob], F[3*_C_ob], F[4*_C_ob]); + printf("O_W_left %d r_pad_el %d\n", O_w_left, r_pad_el); + printf("input col stride %d\n", input_col_stride); +#endif + if (O_w_left) + { + if (first) + { + FLOAT_ZERO_END_C(O_w_left, _C_ob); + + if ((op_type == OP_MUL) || (op_type == OP_MAX_POOL && H_lb == 0 && H_ub == 0)) + { + FLOAT_LOAD_END_C_strided(I, step, O_w_left, _C_ob); + } + else if (op_type == OP_UPSAMPLE) + { + FLOAT_LOAD_END_C_upsample(I, _stride, _C_ib, O_w_left, _C_ob); + } + } + else + { + if constexpr (op_type == OP_ADD && op_class == 3) + { + FLOAT_ZERO_END_C(O_w_left, _C_ob); + } + FLOAT_LOAD_END_C(O, O_w_left, _C_ob); + if constexpr (op_type == OP_UPSAMPLE) + { + FLOAT_ACCUM_END_C_upsample(I, _stride, _C_ib, O_w_left, _C_ob); + } + } + + + compute_with_padding( + H_lb, H_UPPER, + 0, F_w, + F_w, + O_w_left, + input_col_stride, + F, + I, + c_tile, + F_c_left); + + + + if (op_type == OP_AVERAGE_POOL) + { + float norm = 1.0 / (1.0 * F_h * F_w); + FLOAT_DIV_END_C(c_tile, norm, O_w_left, _C_ob); + } + if constexpr (op_type == OP_ADD && op_class == 3 && _C_ob == 1) + { + /* If the operation reduces the channel dimension, reduce across channel dimension of simd tile*/ + FLOAT_REDUCE_CHANNEL_END_C(O_w_left, _C_ob) + } + + FLOAT_STORE_END_C(O, O_w_left, _C_ob); + } + + // right padding elements + AccumT *O_ptr = O + O_w_left * _C_ob; // ScalarT --> AccumT + ScalarT const *I_ptr = I + O_w_left * step; + int W_i_valid = F_w - 1; + + if (first) + { + FLOAT_ZERO_END_C(r_pad_el, _C_ob); + + // Initialize with 0 for the padding elements + + //@note padding should always be 'v' for pointwise operations, so this code path should not be used + if (op_type == OP_MUL) + { + FLOAT_LOAD_END_C_strided(I_ptr, step, r_pad_el, _C_ob); + } + } + else + { + FLOAT_LOAD_END_C(O_ptr, r_pad_el, _C_ob); + } + + c_tile_t *c_cur = c_tile; + // dim_t c_cur = 0; + + for (uint32_t k_p = 0; k_p < r_pad_el; k_p++) + { + compute_with_padding( + H_lb, H_UPPER, + 0, W_i_valid, + F_w, + 1, + input_col_stride, + F, + I_ptr, + c_cur, + F_c_left); + + c_cur += (_K_b * _G_b) / (FLOAT_SIMD_EPILOGUE); + W_i_valid -= _stride; + I_ptr += step; + } + + if (op_type == OP_AVERAGE_POOL) + { + float norm = 1.0 / (1.0 * F_h * F_w); + FLOAT_DIV_END_C(c_tile, norm, r_pad_el, _C_ob); + } + + FLOAT_STORE_END_C(O_ptr, r_pad_el, _C_ob); + + #if DEBUG + printf("First output value: %f %f %f %f \n", O[0], O[1], O[2], O[3]); + #endif +} + +//Edge case to handle remainder input and output 
channels +template +void inline rem_kernel_right_rem( + bool first, + dim_t F_h, + dim_t F_w, + dim_t input_col_stride, + dim_t O_w_left, + dim_t r_pad_el, + dim_t r_pad, + ScalarT const *I, + ScalarT const *F, + AccumT *O, // ScalarT -> AccumT + const dim_t F_c_left, + const dim_t G_left, + const dim_t K_left, + dim_t H_lb = 0, + dim_t H_ub = 0) +{ + const dim_t _C_ob = G_left * K_left; + const dim_t _C_ib = G_left * F_c_left; + const dim_t step = _stride * _C_ib; + const dim_t H_UPPER = ((!H_ub) * (F_h)) + (H_ub); + FLOAT_DEF_END_C(_O_wb, _C_ob); + +#if DEBUG + printf("kernel_right_rem\n"); + printf("First 5 input values: %f %f %f %f %f\n", I[0], I[1], I[2], I[3], I[4]); + printf("First 5 Filter values for output channel 0 : %f %f %f %f %f \n", F[0 * _C_ob], F[1 * _C_ob], F[2 * _C_ob], F[3 * _C_ob], F[4 * _C_ob]); printf("O_W_left %d r_pad_el %d\n", O_w_left, r_pad_el); + printf("input col stride %d\n", input_col_stride); #endif if (O_w_left) { @@ -440,7 +1823,7 @@ void inline kernel_right( { FLOAT_ZERO_END_C(O_w_left, _C_ob); - if ( (op_type == OP_MUL)|| (op_type == OP_MAX_POOL && H_lb == 0 && H_ub == 0)) + if ((op_type == OP_MUL) || (op_type == OP_MAX_POOL && H_lb == 0 && H_ub == 0)) { FLOAT_LOAD_END_C_strided(I, step, O_w_left, _C_ob); } @@ -451,38 +1834,41 @@ void inline kernel_right( } else { - if constexpr(op_type == OP_ADD && op_class == 3) + if constexpr (op_type == OP_ADD && op_class == 3) { FLOAT_ZERO_END_C(O_w_left, _C_ob); } FLOAT_LOAD_END_C(O, O_w_left, _C_ob); if constexpr (op_type == OP_UPSAMPLE) - { - FLOAT_ACCUM_END_C_upsample(I, _stride, _C_ib, O_w_left, _C_ob); - } + { + FLOAT_ACCUM_END_C_upsample(I, _stride, _C_ib, O_w_left, _C_ob); + } } compute_with_padding( - H_lb, H_UPPER, - 0, F_w, - F_w, - O_w_left, - input_col_stride, - F, - I, - c_tile); + H_lb, H_UPPER, + 0, F_w, + F_w, + O_w_left, + input_col_stride, + F, + I, + c_tile, + F_c_left, + G_left, + K_left); if (op_type == OP_AVERAGE_POOL) { float norm = 1.0 / (1.0 * F_h * F_w); FLOAT_DIV_END_C(c_tile, norm, O_w_left, _C_ob); } - if constexpr(op_type == OP_ADD && op_class == 3 && _C_ob == 1) + if (op_type == OP_ADD && op_class == 3 && _C_ob == 1) { /* If the operation reduces the channel dimension, reduce across channel dimension of simd tile*/ - FLOAT_REDUCE_CHANNEL_END_C(O_w_left, _C_ob) + FLOAT_REDUCE_REM_CHANNEL_END_C(O_w_left, _C_ob) } FLOAT_STORE_END_C(O, O_w_left, _C_ob); @@ -510,37 +1896,379 @@ void inline kernel_right( FLOAT_LOAD_END_C(O_ptr, r_pad_el, _C_ob); } - c_tile_t *c_cur = c_tile; - // dim_t c_cur = 0; - for (uint32_t k_p = 0; k_p < r_pad_el; k_p++) - { - compute_with_padding( - H_lb, H_UPPER, - 0, W_i_valid, - F_w, - 1, - input_col_stride, - F, - I_ptr, - c_cur); + c_tile_t *c_cur = c_tile; + // dim_t c_cur = 0; + + for (uint32_t k_p = 0; k_p < r_pad_el; k_p++) + { + compute_with_padding( + H_lb, H_UPPER, + 0, W_i_valid, + F_w, + 1, + input_col_stride, + F, + I_ptr, + c_cur, + F_c_left, + G_left, + K_left); + + c_cur += (G_left * K_left) / (FLOAT_SIMD_EPILOGUE); + W_i_valid -= _stride; + I_ptr += step; + } + + if (op_type == OP_AVERAGE_POOL) + { + float norm = 1.0 / (1.0 * F_h * F_w); + FLOAT_DIV_END_C(c_tile, norm, r_pad_el, _C_ob); + } + + FLOAT_STORE_END_C(O_ptr, r_pad_el, _C_ob); + +#if DEBUG + printf("First output value: %f %f %f %f \n", O[0], O[1], O[2], O[3]); +#endif +} + +//**************************************************************************** +template +void inline kernel_bottom( + bool first, + dim_t F_h, + dim_t F_w, + dim_t input_col_stride, + dim_t b_pad_el, + dim_t b_pad, + 
dim_t W_full_index, + dim_t l_pad_el, + dim_t l_pad, + dim_t O_w_w_pad, + dim_t O_w_full, + dim_t O_w_left, + dim_t r_pad_el, + dim_t r_pad, + ScalarT const *I, + ScalarT const *F, + AccumT *O) // ScalarT -> AccumT +{ + ScalarT const *I_ptr = I; + AccumT *O_ptr = O; // ScalarT -> AccumT + + int H_i_valid = F_h - 1; + + for (uint32_t j_p = 0; j_p < b_pad_el; j_p++) + { + // Prologue with left padding + kernel_left( + first, + F_h, + F_w, + input_col_stride, + l_pad_el, + l_pad, + I_ptr, + F, + O_ptr, + 0, + H_i_valid); + + ScalarT const *I_row_full = I + W_full_index * (_F_cb * _G_b); + AccumT *O_row_full = O + l_pad_el * (_G_b * _K_b); // ScalarT -> AccumT + // Steady State with microkernel + for (index_t l = 0; l < O_w_full; l += _O_wb) + { + ScalarT const *I_col = I_row_full + (l * _stride) * (_F_cb * _G_b); + ScalarT const *F_col = F + 0; + AccumT *O_col = O_row_full + l * (_G_b * _K_b); // ScalarT -> AccumT + + kernel_pad( + first, + F_h, + F_w, + input_col_stride, + I_col, + F_col, + O_col, + 0, + H_i_valid, + 0, /// @todo This was added, W_lb. Is it right? + 0); /// @todo This was added, W_ub. Is it right?); + } + + // Epilogue for microkernel + right padding elements + ScalarT const *I_col_left = + I_row_full + (O_w_full * _stride) * (_F_cb * _G_b); + ScalarT const *F_col_left = F + 0; + AccumT *O_col_left = O_row_full + O_w_full * (_G_b * _K_b); // ScalarT -> AccumT + + kernel_right( + first, + F_h, + F_w, + input_col_stride, + O_w_left, + r_pad_el, + r_pad, + I_col_left, + F_col_left, + O_col_left, + 0, /// @todo confirm this, H_lb + H_i_valid); /// @todo confirm this, H_ub + + O_ptr += O_w_w_pad * _K_b * _G_b; + + H_i_valid -= _stride; + I_ptr += _stride * _F_cb * _G_b; + } +} + +//Edge case to handle remainder output channels +template +void inline rem_kernel_bottom( + bool first, + dim_t F_h, + dim_t F_w, + dim_t input_col_stride, + dim_t b_pad_el, + dim_t b_pad, + dim_t W_full_index, + dim_t l_pad_el, + dim_t l_pad, + dim_t O_w_w_pad, + dim_t O_w_full, + dim_t O_w_left, + dim_t r_pad_el, + dim_t r_pad, + ScalarT const *I, + ScalarT const *F, + AccumT *O, + const dim_t G_left, + const dim_t K_left) // ScalarT -> AccumT +{ + ScalarT const *I_ptr = I; + AccumT *O_ptr = O; // ScalarT -> AccumT + + int H_i_valid = F_h - 1; + + for (uint32_t j_p = 0; j_p < b_pad_el; j_p++) + { + // Prologue with left padding + rem_kernel_left( + first, + F_h, + F_w, + input_col_stride, + l_pad_el, + l_pad, + I_ptr, + F, + O_ptr, + G_left, + K_left, + 0, + H_i_valid); + + ScalarT const *I_row_full = I + W_full_index * (_F_cb * G_left); + AccumT *O_row_full = O + l_pad_el * (G_left * K_left); // ScalarT -> AccumT + // Steady State with microkernel + for (index_t l = 0; l < O_w_full; l += _O_wb) + { + ScalarT const *I_col = I_row_full + (l * _stride) * (_F_cb * G_left); + ScalarT const *F_col = F + 0; + AccumT *O_col = O_row_full + l * (G_left * K_left); // ScalarT -> AccumT + + rem_kernel_pad( + first, + F_h, + F_w, + input_col_stride, + I_col, + F_col, + O_col, + G_left, + K_left, + 0, + H_i_valid, + 0, /// @todo This was added, W_lb. Is it right? + 0); /// @todo This was added, W_ub. 
Is it right?); + } + + // Epilogue for microkernel + right padding elements + ScalarT const *I_col_left = + I_row_full + (O_w_full * _stride) * (_F_cb * G_left); + ScalarT const *F_col_left = F + 0; + AccumT *O_col_left = O_row_full + O_w_full * (G_left * K_left); // ScalarT -> AccumT + + rem_kernel_right( + first, + F_h, + F_w, + input_col_stride, + O_w_left, + r_pad_el, + r_pad, + I_col_left, + F_col_left, + O_col_left, + G_left, + K_left, + 0, /// @todo confirm this, H_lb + H_i_valid); /// @todo confirm this, H_ub + + O_ptr += O_w_w_pad * G_left * K_left; + + H_i_valid -= _stride; + I_ptr += _stride * _F_cb * G_left; + } +} + +// Edge case to handle remainder input channels +template +void inline kernel_bottom_rem( + bool first, + dim_t F_h, + dim_t F_w, + dim_t input_col_stride, + dim_t b_pad_el, + dim_t b_pad, + dim_t W_full_index, + dim_t l_pad_el, + dim_t l_pad, + dim_t O_w_w_pad, + dim_t O_w_full, + dim_t O_w_left, + dim_t r_pad_el, + dim_t r_pad, + ScalarT const *I, + ScalarT const *F, + AccumT *O, + const dim_t F_c_left) // ScalarT -> AccumT +{ + ScalarT const *I_ptr = I; + AccumT *O_ptr = O; // ScalarT -> AccumT + + int H_i_valid = F_h - 1; + + for (uint32_t j_p = 0; j_p < b_pad_el; j_p++) + { + // Prologue with left padding + kernel_left_rem( + first, + F_h, + F_w, + input_col_stride, + l_pad_el, + l_pad, + I_ptr, + F, + O_ptr, + F_c_left, + 0, + H_i_valid); + + ScalarT const *I_row_full = I + W_full_index * (F_c_left * _G_b); + AccumT *O_row_full = O + l_pad_el * (_G_b * _K_b); // ScalarT -> AccumT + // Steady State with microkernel + for (index_t l = 0; l < O_w_full; l += _O_wb) + { + ScalarT const *I_col = I_row_full + (l * _stride) * (F_c_left * _G_b); + ScalarT const *F_col = F + 0; + AccumT *O_col = O_row_full + l * (_G_b * _K_b); // ScalarT -> AccumT + + kernel_pad_rem( + first, + F_h, + F_w, + input_col_stride, + I_col, + F_col, + O_col, + F_c_left, + 0, + H_i_valid, + 0, /// @todo This was added, W_lb. Is it right? + 0); /// @todo This was added, W_ub. 
Is it right?); + } + + // Epilogue for microkernel + right padding elements + ScalarT const *I_col_left = + I_row_full + (O_w_full * _stride) * (F_c_left * _G_b); + ScalarT const *F_col_left = F + 0; + AccumT *O_col_left = O_row_full + O_w_full * (_G_b * _K_b); // ScalarT -> AccumT + + kernel_right_rem( + first, + F_h, + F_w, + input_col_stride, + O_w_left, + r_pad_el, + r_pad, + I_col_left, + F_col_left, + O_col_left, + F_c_left, + 0, /// @todo confirm this, H_lb + H_i_valid); /// @todo confirm this, H_ub - c_cur += (_K_b * _G_b) / (FLOAT_SIMD_EPILOGUE); - W_i_valid -= _stride; - I_ptr += _stride * _F_cb * _G_b; - } + O_ptr += O_w_w_pad * _K_b * _G_b; - if (op_type == OP_AVERAGE_POOL) - { - float norm = 1.0 / (1.0 * F_h * F_w); - FLOAT_DIV_END_C(c_tile, norm, r_pad_el, _C_ob); + H_i_valid -= _stride; + I_ptr += _stride * F_c_left * _G_b; } - - FLOAT_STORE_END_C(O_ptr, r_pad_el, _C_ob); } -//**************************************************************************** +// Edge case to handle remainder input and output channels +// Edge case to handle remainder input channels template -void inline kernel_bottom( +void inline rem_kernel_bottom_rem( bool first, dim_t F_h, dim_t F_w, @@ -568,8 +2296,15 @@ void inline kernel_bottom( dim_t r_pad, ScalarT const *I, ScalarT const *F, - AccumT *O) // ScalarT -> AccumT + AccumT *O, + const dim_t F_c_left, + const dim_t G_left, + const dim_t K_left) // ScalarT -> AccumT { + const dim_t _C_ob = G_left * K_left; + const dim_t _C_ib = G_left * F_c_left; + const dim_t step = _stride * _C_ib; + ScalarT const *I_ptr = I; AccumT *O_ptr = O; // ScalarT -> AccumT @@ -578,72 +2313,81 @@ void inline kernel_bottom( for (uint32_t j_p = 0; j_p < b_pad_el; j_p++) { // Prologue with left padding - kernel_left( - first, - F_h, - F_w, - input_col_stride, - l_pad_el, - l_pad, - I_ptr, - F, - O_ptr, - 0, - H_i_valid); - - ScalarT const *I_row_full = I + W_full_index * (_F_cb * _G_b); - AccumT *O_row_full = O + l_pad_el * (_G_b * _K_b); // ScalarT -> AccumT + rem_kernel_left_rem( + first, + F_h, + F_w, + input_col_stride, + l_pad_el, + l_pad, + I_ptr, + F, + O_ptr, + F_c_left, + G_left, + K_left, + 0, + H_i_valid); + + ScalarT const *I_row_full = I + W_full_index * (_C_ib); + AccumT *O_row_full = O + l_pad_el * (_C_ob); // ScalarT -> AccumT // Steady State with microkernel for (index_t l = 0; l < O_w_full; l += _O_wb) { - ScalarT const *I_col = I_row_full + (l * _stride) * (_F_cb * _G_b); + ScalarT const *I_col = I_row_full + (l * _stride) * (_C_ib); ScalarT const *F_col = F + 0; - AccumT *O_col = O_row_full + l * (_G_b * _K_b); // ScalarT -> AccumT - - kernel_pad( - first, - F_h, - F_w, - input_col_stride, - I_col, - F_col, - O_col, - 0, - H_i_valid, - 0, /// @todo This was added, W_lb. Is it right? - 0); /// @todo This was added, W_ub. Is it right?); + AccumT *O_col = O_row_full + l * (_C_ob); // ScalarT -> AccumT + + rem_kernel_pad_rem( + first, + F_h, + F_w, + input_col_stride, + I_col, + F_col, + O_col, + F_c_left, + G_left, + K_left, + 0, + H_i_valid, + 0, /// @todo This was added, W_lb. Is it right? + 0); /// @todo This was added, W_ub. 
Is it right?); } // Epilogue for microkernel + right padding elements ScalarT const *I_col_left = - I_row_full + (O_w_full * _stride) * (_F_cb * _G_b); + I_row_full + (O_w_full * _stride) * (_C_ib); ScalarT const *F_col_left = F + 0; - AccumT *O_col_left = O_row_full + O_w_full * (_G_b * _K_b); // ScalarT -> AccumT - - kernel_right( - first, - F_h, - F_w, - input_col_stride, - O_w_left, - r_pad_el, - r_pad, - I_col_left, - F_col_left, - O_col_left, - 0, /// @todo confirm this, H_lb - H_i_valid); /// @todo confirm this, H_ub - - O_ptr += O_w_w_pad * _K_b * _G_b; + AccumT *O_col_left = O_row_full + O_w_full * (_C_ob); // ScalarT -> AccumT + + rem_kernel_right_rem( + first, + F_h, + F_w, + input_col_stride, + O_w_left, + r_pad_el, + r_pad, + I_col_left, + F_col_left, + O_col_left, + F_c_left, + G_left, + K_left, + 0, /// @todo confirm this, H_lb + H_i_valid); /// @todo confirm this, H_ub + + O_ptr += O_w_w_pad * (_C_ob); H_i_valid -= _stride; - I_ptr += _stride * _F_cb * _G_b; + I_ptr += _stride * (_C_ib); } } @@ -750,7 +2494,366 @@ void inline kernel_top( H_i_valid, F_h); - O_ptr += O_w_w_pad * _K_b * _G_b; + O_ptr += O_w_w_pad * _K_b * _G_b; + H_i_valid += _stride; + // I_ptr += _stride * _F_cb * _G_b; + } +} + +//Edge case to handle remainder output channels +template +void inline rem_kernel_top( + bool first, + dim_t F_h, + dim_t F_w, + dim_t input_col_stride, + dim_t t_pad_el, + dim_t t_pad, + dim_t W_full_index, + dim_t l_pad_el, + dim_t l_pad, + dim_t O_w_w_pad, + dim_t O_w_full, + dim_t O_w_left, + dim_t r_pad_el, + dim_t r_pad, + ScalarT const *I, + ScalarT const *F, + AccumT *O, + const dim_t G_left, + const dim_t K_left) // ScalarT --> AccumT +{ + const dim_t _C_ob = G_left * K_left; + const dim_t _C_ib = G_left * _F_cb; + const dim_t step = _stride * _C_ib; + + ScalarT const *I_ptr = I; + AccumT *O_ptr = O; // ScalarT --> AccumT + + int H_i_valid = t_pad; + + for (uint32_t j_p = 0; j_p < t_pad_el; j_p++) + { + // Prologue with left padding + rem_kernel_left( + first, + F_h, + F_w, + input_col_stride, + l_pad_el, + l_pad, + I_ptr, + F, + O_ptr, + G_left, + K_left, + H_i_valid, + F_h); + + ScalarT const *I_row_full = I + W_full_index * (_C_ib); + AccumT *O_row_full = O + l_pad_el * (_C_ob); // ScalarT --> AccumT + + // Steady State with microkernel + for (index_t l = 0; l < O_w_full; l += _O_wb) + { + ScalarT const *I_col = + I_row_full + (l * _stride) * (_C_ib); + ScalarT const *F_col = F + 0; + AccumT *O_col = O_row_full + l * (_C_ob); // ScalarT --> AccumT + + rem_kernel_pad( + first, + F_h, + F_w, + input_col_stride, + I_col, + F_col, + O_col, + G_left, + K_left, + H_i_valid, // H_lb + F_h, // H_ub + 0, // W_lb + 0); /// @todo Confirm this, W_ub. Is it right? 
q_abstract_layer has F_w + } + + // Epilogue for microkernel + right padding elements + ScalarT const *I_col_left = + I_row_full + (O_w_full * _stride) * (_C_ib); + ScalarT const *F_col_left = F + 0; + AccumT *O_col_left = + O_row_full + O_w_full * (_C_ob); // ScalarT --> AccumT + + rem_kernel_right( + first, + F_h, + F_w, + input_col_stride, + O_w_left, + r_pad_el, + r_pad, + I_col_left, + F_col_left, + O_col_left, + G_left, + K_left, + H_i_valid, + F_h); + + O_ptr += O_w_w_pad * _C_ob; + H_i_valid += _stride; + } + +} + +// Edge case to handle remainder input channels +template +void inline kernel_top_rem( + bool first, + dim_t F_h, + dim_t F_w, + dim_t input_col_stride, + dim_t t_pad_el, + dim_t t_pad, + dim_t W_full_index, + dim_t l_pad_el, + dim_t l_pad, + dim_t O_w_w_pad, + dim_t O_w_full, + dim_t O_w_left, + dim_t r_pad_el, + dim_t r_pad, + ScalarT const *I, + ScalarT const *F, + AccumT *O, + const dim_t F_c_left) // ScalarT --> AccumT +{ + ScalarT const *I_ptr = I; + AccumT *O_ptr = O; // ScalarT --> AccumT + + int H_i_valid = t_pad; + + for (uint32_t j_p = 0; j_p < t_pad_el; j_p++) + { + // Prologue with left padding + kernel_left_rem( + first, + F_h, + F_w, + input_col_stride, + l_pad_el, + l_pad, + I_ptr, + F, + O_ptr, + F_c_left, + H_i_valid, + F_h); + + ScalarT const *I_row_full = I + W_full_index * (F_c_left * _G_b); + AccumT *O_row_full = O + l_pad_el * (_G_b * _K_b); // ScalarT --> AccumT + + // Steady State with microkernel + for (index_t l = 0; l < O_w_full; l += _O_wb) + { + ScalarT const *I_col = + I_row_full + (l * _stride) * (F_c_left * _G_b); + ScalarT const *F_col = F + 0; + AccumT *O_col = O_row_full + l * (_G_b * _K_b); // ScalarT --> AccumT + + kernel_pad_rem( + first, + F_h, + F_w, + input_col_stride, + I_col, + F_col, + O_col, + F_c_left, + H_i_valid, // H_lb + F_h, // H_ub + 0, // W_lb + 0); /// @todo Confirm this, W_ub. Is it right? 
q_abstract_layer has F_w + } + + // Epilogue for microkernel + right padding elements + ScalarT const *I_col_left = + I_row_full + (O_w_full * _stride) * (F_c_left * _G_b); + ScalarT const *F_col_left = F + 0; + AccumT *O_col_left = + O_row_full + O_w_full * (_G_b * _K_b); // ScalarT --> AccumT + + kernel_right_rem( + first, + F_h, + F_w, + input_col_stride, + O_w_left, + r_pad_el, + r_pad, + I_col_left, + F_col_left, + O_col_left, + F_c_left, + H_i_valid, + F_h); + + O_ptr += O_w_w_pad * _K_b * _G_b; + H_i_valid += _stride; + // I_ptr += _stride * _F_cb * _G_b; + } +} + +// Edge case to handle remainder input and output channels +template +void inline rem_kernel_top_rem( + bool first, + dim_t F_h, + dim_t F_w, + dim_t input_col_stride, + dim_t t_pad_el, + dim_t t_pad, + dim_t W_full_index, + dim_t l_pad_el, + dim_t l_pad, + dim_t O_w_w_pad, + dim_t O_w_full, + dim_t O_w_left, + dim_t r_pad_el, + dim_t r_pad, + ScalarT const *I, + ScalarT const *F, + AccumT *O, + const dim_t F_c_left, + const dim_t G_left, + const dim_t K_left) // ScalarT --> AccumT +{ + const dim_t _C_ob = G_left * K_left; + const dim_t _C_ib = G_left * F_c_left; + const dim_t step = _stride * _C_ib; + + ScalarT const *I_ptr = I; + AccumT *O_ptr = O; // ScalarT --> AccumT + + int H_i_valid = t_pad; + + for (uint32_t j_p = 0; j_p < t_pad_el; j_p++) + { + // Prologue with left padding + rem_kernel_left_rem( + first, + F_h, + F_w, + input_col_stride, + l_pad_el, + l_pad, + I_ptr, + F, + O_ptr, + F_c_left, + G_left, + K_left, + H_i_valid, + F_h); + + ScalarT const *I_row_full = I + W_full_index * (_C_ib); + AccumT *O_row_full = O + l_pad_el * (_C_ob); // ScalarT --> AccumT + + // Steady State with microkernel + for (index_t l = 0; l < O_w_full; l += _O_wb) + { + ScalarT const *I_col = + I_row_full + (l * _stride) * (_C_ib); + ScalarT const *F_col = F + 0; + AccumT *O_col = O_row_full + l * (_C_ob); // ScalarT --> AccumT + + rem_kernel_pad_rem( + first, + F_h, + F_w, + input_col_stride, + I_col, + F_col, + O_col, + F_c_left, + G_left, + K_left, + H_i_valid, // H_lb + F_h, // H_ub + 0, // W_lb + 0); /// @todo Confirm this, W_ub. Is it right? 
q_abstract_layer has F_w + } + + // Epilogue for microkernel + right padding elements + ScalarT const *I_col_left = + I_row_full + (O_w_full * _stride) * (_C_ib); + ScalarT const *F_col_left = F + 0; + AccumT *O_col_left = + O_row_full + O_w_full * (_C_ob); // ScalarT --> AccumT + + rem_kernel_right_rem( + first, + F_h, + F_w, + input_col_stride, + O_w_left, + r_pad_el, + r_pad, + I_col_left, + F_col_left, + O_col_left, + F_c_left, + G_left, + K_left, + H_i_valid, + F_h); + + O_ptr += O_w_w_pad * _C_ob; H_i_valid += _stride; // I_ptr += _stride * _F_cb * _G_b; } @@ -834,6 +2937,11 @@ void abstract_layer( * I: [G/G_b, F_c/F_cb, I_h, I_w, F_cb, G_b ] * F: [G/G_b, K/K_b, F_c/F_cb, F_h, F_w, F_cb, G_b, K_b] * O: [G/G_b, K/K_b, O_h, O_w, G_b, K_b] + * + * For the case where the number of channels is not a multiple of the blocking size, + * I: + * F: + * O: */ //************************************************************************ @@ -911,9 +3019,37 @@ void abstract_layer( const dim_t O_w_left = O_w - O_w_full; const dim_t O_hxO_w = O_h_w_pad * O_w_w_pad; + + // Number of input channels in reduction is not a multiple of blocking size + const dim_t F_c_full = (F_c / _F_cb) * _F_cb; + // When the number of channels is not a multiple of blocking size - // const dim_t K_full = (K / _K_b) * _K_b; - // const dim_t K_left = K - K_full; + const dim_t K_full = (K / _K_b) * _K_b; + + //When the number of groups is not a multiple of blocking size + const dim_t G_full = (G / _G_b) * _G_b; + + + //We need to do in the edge case if either K_left or G_left is non-zero + // HACK: THIS CALCULATE IS IMPRECISE (group convs won't work) + + const dim_t F_c_full_idx = (F_c_full == 1)? 0: F_c_full; + + const dim_t K_full_idx = (K_full == 1)? 0: K_full; + const dim_t K_left = K - K_full_idx; + + // When the number of groups is not a multiple of blocking size + const dim_t G_full_idx = (G_full == 1)? 
0: G_full; + const dim_t G_left = G - G_full_idx; + + const dim_t F_c_left = F_c - F_c_full_idx; + const dim_t _C_ib_left = F_c_left * _G_b; + const dim_t _C_ib_left_group = G_left * _F_cb; + const dim_t _C_ib_left_group_channels = G_left * F_c_left; + + //Only handling the case where G_left*K_left < FLOAT_C_ob + //There could be a case where G_left*K_left == FLOAT_C_ob + //todo: handle remaining cases #if DEBUG == 1 printf("\t\t I_h %d I_w %d F_C %d G %d \n", I_h, I_w, F_c, G); @@ -932,6 +3068,7 @@ void abstract_layer( printf("params: F_Cb %d G_b %d K_b %d\n", _F_cb, _G_b, _K_b); printf("rewrite output?: %d, op type/class: %d/%d\n", rewrite_output, op_type, op_class); + printf("F_c_full: %d F_c_left: %d \n", F_c_full, F_c_left); #endif // Set up parallelism for the channel loops @@ -970,7 +3107,7 @@ void abstract_layer( auto group_tid = ((t_id / (T_channel * T_height))) % T_group; // loops over output channels - for (index_t g = group_tid; g < G / _G_b; g += T_group) + for (index_t g = group_tid; g < G_full / _G_b; g += T_group) { ScalarT const *I_group; if constexpr (op_type == OP_UPSAMPLE && _stride == std::numeric_limits::max()) @@ -994,8 +3131,9 @@ void abstract_layer( F_group = F_buf + g * (K * F_c * F_h * F_w * _G_b); } - // resuse O_group as a uint32_t array - for (index_t k = channel_tid; k < K / _K_b; k += T_channel) + // reuse O_group as a uint32_t array + // k'th block over K_full + for (index_t k = channel_tid; k < K_full / _K_b; k += T_channel) { ScalarT const *I_channel_block_output = I_group + 0; @@ -1005,8 +3143,8 @@ void abstract_layer( O_group + k * (O_hxO_w * _G_b * _K_b); //************************************************************ - // Loop over input channel reduction - for (index_t i = 0; i < (F_c / _F_cb); i++) + // Loop over input channel reduction (multiple of blocking size) + for (index_t i = 0; i < (F_c_full / _F_cb); i++) { bool first = rewrite_output && (i == 0); @@ -1196,6 +3334,681 @@ void abstract_layer( F_row_bot, O_row_bot); } + + + // Loop over remaining channels + // _UNROLL defaults to 1 + //This loop should have 1 iteration + for (index_t i = F_c_full; i < F_c; i+=F_c_left) + { + // printf("F_c_full: %d F_c: %d iter: %i \n", F_c_full, F_c, i); + bool first = rewrite_output && (i == 0); + + ScalarT const *I_channel_block_input = + I_channel_block_output + (i/_F_cb) * (I_h * I_w * _F_cb * _G_b); + ScalarT const *F_channel_block_input = + F_channel_block_output + (i/_F_cb) * (F_h * F_w * _F_cb * _G_b * _K_b); + ScalarT *O_channel_block_input = + O_channel_block_output + 0; + + // Loops over spatial dimensions of output + + // Prologue with top padding + ScalarT const *I_row_top = I_channel_block_input; + ScalarT const *F_row_top = F_channel_block_input + 0; + AccumT *O_row_top = O_channel_block_input; // ScalarT --> AccumT + + kernel_top_rem( + first, + F_h, + F_w, + I_w * _C_ib_left, + t_pad_el, + pad_top, + W_full_index, + l_pad_el, + pad_left, + O_w_w_pad, + O_w_full, + O_w_left, + r_pad_el, + pad_right, + I_row_top, + F_row_top, + O_row_top, + F_c_left); + + ScalarT const *I_row_full = + I_row_top + H_full_index * I_w * _C_ib_left; //(_F_cb * _G_b); + AccumT *O_row_full = + O_row_top + t_pad_el * O_w_w_pad * (_G_b * _K_b); // ScalarT --> AccumT + + // Steady State over rows + // The stride over input channels is the number of remaining channels + for (index_t j = height_tid; j < O_h; j += T_height) + { + ScalarT const *I_row; + // @todo cast index calculation as int and make stride a float value. 
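(Editor's note, illustrative only, not part of the patch.) The remainder bookkeeping above splits each channel dimension into a fully blocked part and a leftover part. A minimal worked example with hypothetical sizes F_c = 35, K = 20, G = 1 and the usual blocking _F_cb = 16, _K_b = 16, _G_b = 1:

// Hypothetical sizes; the formulas are the ones computed above.
// F_c = 35, _F_cb = 16:  F_c_full = (35 / 16) * 16 = 32,  F_c_full_idx = 32,  F_c_left = 3
// K   = 20, _K_b  = 16:  K_full   = (20 / 16) * 16 = 16,  K_full_idx   = 16,  K_left   = 4
// G   =  1, _G_b  =  1:  G_full   = 1,  G_full_idx = 0 (via the G_full == 1 hack),  G_left = 1
//
// The main loops then cover g < G_full / _G_b, k < K_full / _K_b, i < F_c_full / _F_cb;
// each remainder loop below runs a single iteration (i = 32, g = 0, k = 16), and this
// example satisfies the stated restriction G_left * K_left = 4 < FLOAT_C_ob = 16.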
+ // I_x = I_x + (int)(j * _stride) * () + if constexpr (op_type == OP_UPSAMPLE) + { + I_row = I_row_full + (j / _stride) * (I_w * _F_cb * _G_b); + } + else + { + I_row = I_row_full + (j * _stride) * (I_w * /*_F_cb * _G_b*/ _C_ib_left); + } + ScalarT const *F_row = F_channel_block_input + 0; + AccumT *O_row = + O_row_full + j * (O_w_w_pad * _G_b * _K_b); // ScalarT --> AccumT + // Prologue with left padding + kernel_left_rem( + first, + F_h, + F_w, + I_w * _C_ib_left, + l_pad_el, + pad_left, + I_row, + F_row, + O_row, + F_c_left, + 0, + 0); + + ScalarT const *I_col_full = + I_row + W_full_index * /*(_F_cb * _G_b)*/ _C_ib_left; + AccumT *O_col_full = O_row + l_pad_el * (_G_b * _K_b); // ScalarT --> AccumT + // Steady State with microkernel + for (index_t l = 0; l < O_w_full; l += _O_wb) + { + ScalarT const *I_col; + // @todo cast index calculation as int and make stride a float value. + // I_x = I_x + (int)(j * _stride) * () + if constexpr (op_type == OP_UPSAMPLE) + { + I_col = I_col_full + (l / _stride) * (_F_cb * _G_b); + } + else + { + I_col = I_col_full + (l * _stride) * _C_ib_left /*(_F_cb * _G_b)*/; + } + ScalarT const *F_col = F_row + 0; + AccumT *O_col = O_col_full + l * (_G_b * _K_b); // ScalarT --> AccumT + + kernel_rem( + first, + F_h, + F_w, + I_w * _C_ib_left, + I_col, + F_col, + O_col, + F_c_left, + 0, + 0, + 0, + 0); + } + +#if DEBUG + printf(" end kernel\n"); +#endif + + // Epilogue for microkernel + right padding elements + ScalarT const *I_col_left; + if constexpr (op_type == OP_UPSAMPLE) + { + I_col_left = + I_col_full + (O_w_full / _stride) * (_F_cb * _G_b); + } + else + { + I_col_left = + I_col_full + (O_w_full * _stride) * _C_ib_left/*(_F_cb * _G_b)*/; + } + + ScalarT const *F_col_left = F_row + 0; + AccumT *O_col_left = O_col_full + O_w_full * (_G_b * _K_b); // ScalarT --> AccumT + +#if DEBUG + printf("calling right\n"); +#endif + kernel_right_rem( + first, + F_h, + F_w, + I_w * _C_ib_left, + O_w_left, + r_pad_el, + pad_right, + I_col_left, + F_col_left, + O_col_left, + F_c_left, + 0, + 0); + } + // Epilogue with bottom padding + ScalarT const *I_row_bot; + // @todo cast index calculation as int and make stride a float value. 
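(Editor's note, illustrative only, not part of the patch.) In this leftover-input-channel path the packed input slab is only F_c_left channels deep, so every input stride shrinks accordingly: the kernels are called with input_col_stride = I_w * _C_ib_left, and each output row advances the input pointer by (j * _stride) * (I_w * _C_ib_left) instead of the full-block I_w * _F_cb * _G_b. A small numeric sketch under assumed sizes:

// Assumed: I_w = 10, _G_b = 1, _stride = 1, _F_cb = 16, F_c_left = 3
// _C_ib_left        = F_c_left * _G_b   = 3
// input_col_stride  = I_w * _C_ib_left  = 30   (floats per input row of the leftover slab)
// full-block stride = I_w * _F_cb * _G_b = 160 (what the blocked loop above uses)
// row j of the leftover slab therefore starts at I_row_full + j * 30 for stride 1.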
+ // I_x = I_x + (int)(j * _stride) * () + if constexpr (op_type == OP_UPSAMPLE) + { + I_row_bot = + I_row_full + (O_h * _stride) * (I_w * _F_cb * _G_b); + } + else + { + I_row_bot = + I_row_full + (O_h * _stride) * (I_w * _C_ib_left /*_F_cb * _G_b*/); + } + ScalarT const *F_row_bot = F_channel_block_input + 0; + AccumT *O_row_bot = O_row_full + O_h * (O_w_w_pad * _G_b * _K_b); // ScalarT --> AccumT + + kernel_bottom_rem( + first, + F_h, + F_w, + I_w * _C_ib_left, + b_pad_el, + pad_bottom, + W_full_index, + l_pad_el, + pad_left, + O_w_w_pad, + O_w_full, + O_w_left, + r_pad_el, + pad_right, + I_row_bot, + F_row_bot, + O_row_bot, + F_c_left); + } + } + } + + + } + + //@todo: add this back to the parallel loop + // loop over remaining groups and output channels + // assumes that only 1 will have a remainder + for (index_t g = G_full_idx; g < G ; g += G_left) + { + #if DEBUG == 1 + printf("G_left %d K_left %d \n", G_left, K_left); + printf("G_full_idx %d K_full_idx %d \n", G_full_idx, K_full_idx); + #endif + ScalarT const *I_group; + if constexpr (op_type == OP_UPSAMPLE && _stride == std::numeric_limits::max()) + { + I_group = I_buf + g * (F_c * 1 * 1 ); + // I_group = I_buf + g * (F_c * 1 * 1 * _G_b); + } + else + { + I_group = I_buf + g * (F_c * I_h * I_w); + // I_group = I_buf + g * (F_c * I_h * I_w * _G_b); + } + ScalarT *O_group = O_buf + g * (K * O_hxO_w); + // ScalarT *O_group = O_buf + g * (K * O_hxO_w * _G_b); + // if leaky relu, the weight pointer does not change with the group id + + ScalarT const *F_group; + if constexpr ((op_type == OP_LEAKY_RELU) || (op_type == OP_MUL)) + { + F_group = F_buf; + } + else + { + F_group = F_buf + g * (K * F_c * F_h * F_w); + // F_group = F_buf + g * (K * F_c * F_h * F_w * _G_b); + } + + // resuse O_group as a uint32_t array + //This loop has 1 iteration + // k'th element in K_full + for (index_t k = K_full_idx; k < K ; k += K_left) + { + ScalarT const *I_channel_block_output = + I_group + 0; + ScalarT const *F_channel_block_output = + // F_group + k * (F_c * F_h * F_w * _G_b * _K_b); + F_group + k *(F_c * F_h * F_w * _G_b); + ScalarT *O_channel_block_output = + // O_group + k * (O_hxO_w * _G_b * _K_b); + O_group + k *(O_hxO_w * _G_b); + + //************************************************************ + // Loop over input channel reduction (multiple of blocking size) + for (index_t i = 0; i < (F_c_full / _F_cb); i++) + { + bool first = rewrite_output && (i == 0); + + + ScalarT const *I_channel_block_input = + I_channel_block_output + i * (I_h * I_w * _F_cb * G_left); + ScalarT const *F_channel_block_input = + F_channel_block_output + i * (F_h * F_w * _F_cb * G_left * K_left); + ScalarT *O_channel_block_input = + O_channel_block_output + 0; + + // Loops over spatial dimensions of output + + // Prologue with top padding + ScalarT const *I_row_top = I_channel_block_input; + ScalarT const *F_row_top = F_channel_block_input + 0; + AccumT *O_row_top = O_channel_block_input; // ScalarT --> AccumT + + rem_kernel_top( + first, + F_h, + F_w, + // I_w * _C_ib, + I_w * _C_ib_left_group, + t_pad_el, + pad_top, + W_full_index, + l_pad_el, + pad_left, + O_w_w_pad, + O_w_full, + O_w_left, + r_pad_el, + pad_right, + I_row_top, + F_row_top, + O_row_top, + G_left, + K_left); + + ScalarT const *I_row_full = + I_row_top + H_full_index * I_w * (_F_cb * G_left); + AccumT *O_row_full = + O_row_top + t_pad_el * O_w_w_pad * (G_left * K_left); // ScalarT --> AccumT + + // Steady State over rows + for (index_t j = 0; j < O_h; j += T_height) + { + ScalarT const *I_row; + // 
@todo cast index calculation as int and make stride a float value. + // I_x = I_x + (int)(j * _stride) * () + if constexpr (op_type == OP_UPSAMPLE) + { + I_row = I_row_full + (j / _stride) * (I_w * _F_cb * G_left); + } + else + { + I_row = I_row_full + (j * _stride) * (I_w * _F_cb * G_left); + } + ScalarT const *F_row = F_channel_block_input + 0; + AccumT *O_row = + O_row_full + j * (O_w_w_pad * G_left * K_left); // ScalarT --> AccumT + // Prologue with left padding + rem_kernel_left( + first, + F_h, + F_w, + I_w * _C_ib_left_group, + // I_w * _C_ib, + l_pad_el, + pad_left, + I_row, + F_row, + O_row, + G_left, + K_left, + 0, + 0); + + ScalarT const *I_col_full = + I_row + W_full_index * (_F_cb * G_left); + AccumT *O_col_full = O_row + l_pad_el * (G_left * K_left); // ScalarT --> AccumT + // Steady State with microkernel + for (index_t l = 0; l < O_w_full; l += _O_wb) + { + ScalarT const *I_col; + // @todo cast index calculation as int and make stride a float value. + // I_x = I_x + (int)(j * _stride) * () + if constexpr (op_type == OP_UPSAMPLE) + { + I_col = I_col_full + (l / _stride) * (_F_cb * G_left); + } + else + { + I_col = I_col_full + (l * _stride) * (_F_cb * G_left); + } + ScalarT const *F_col = F_row + 0; + AccumT *O_col = O_col_full + l * (G_left * K_left); // ScalarT --> AccumT + + rem_kernel( + first, + F_h, + F_w, + I_w * _C_ib_left_group, + // I_w * _C_ib, + I_col, + F_col, + O_col, + G_left, + K_left, + 0, + 0, + 0, + 0); + } + +#if DEBUG + printf(" end kernel\n"); +#endif + + // Epilogue for microkernel + right padding elements + ScalarT const *I_col_left; + if constexpr (op_type == OP_UPSAMPLE) + { + I_col_left = + I_col_full + (O_w_full / _stride) * (_F_cb * G_left); + } + else + { + I_col_left = + I_col_full + (O_w_full * _stride) * (_F_cb * G_left); + } + + ScalarT const *F_col_left = F_row + 0; + AccumT *O_col_left = O_col_full + O_w_full * (G_left * K_left); // ScalarT --> AccumT + +#if DEBUG + printf(" calling right output channel\n"); +#endif + rem_kernel_right( + first, + F_h, + F_w, + I_w * _C_ib_left_group, + // I_w * _C_ib, + O_w_left, + r_pad_el, + pad_right, + I_col_left, + F_col_left, + O_col_left, + G_left, + K_left, + 0, + 0); + } + // Epilogue with bottom padding + ScalarT const *I_row_bot; + // @todo cast index calculation as int and make stride a float value. 
+ // I_x = I_x + (int)(j * _stride) * () + if constexpr (op_type == OP_UPSAMPLE) + { + I_row_bot = + I_row_full + (O_h * _stride) * (I_w * _F_cb * G_left); + } + else + { + I_row_bot = + I_row_full + (O_h * _stride) * (I_w * _F_cb * G_left); + } + ScalarT const *F_row_bot = F_channel_block_input + 0; + AccumT *O_row_bot = O_row_full + O_h * (O_w_w_pad * G_left * K_left); // ScalarT --> AccumT + + rem_kernel_bottom( + first, + F_h, + F_w, + I_w * _C_ib_left_group, + // I_w * _C_ib, + b_pad_el, + pad_bottom, + W_full_index, + l_pad_el, + pad_left, + O_w_w_pad, + O_w_full, + O_w_left, + r_pad_el, + pad_right, + I_row_bot, + F_row_bot, + O_row_bot, + G_left, + K_left); + } + + // Loop over remaining channels + // _UNROLL defaults to 1 + // This loop should have 1 iteration + for (index_t i = F_c_full; i < F_c; i += F_c_left) + { + // printf("F_c_full: %d F_c: %d iter: %i \n", F_c_full, F_c, i); + bool first = rewrite_output && (i == 0); + + ScalarT const *I_channel_block_input = + I_channel_block_output + (i / _F_cb) * (I_h * I_w * _F_cb * G_left); + ScalarT const *F_channel_block_input = + F_channel_block_output + (i / _F_cb) * (F_h * F_w * _F_cb * G_left * K_left); + ScalarT *O_channel_block_input = + O_channel_block_output + 0; + + // Loops over spatial dimensions of output + + // Prologue with top padding + ScalarT const *I_row_top = I_channel_block_input; + ScalarT const *F_row_top = F_channel_block_input + 0; + AccumT *O_row_top = O_channel_block_input; // ScalarT --> AccumT + + rem_kernel_top_rem( + first, + F_h, + F_w, + I_w * F_c_left*G_left, + t_pad_el, + pad_top, + W_full_index, + l_pad_el, + pad_left, + O_w_w_pad, + O_w_full, + O_w_left, + r_pad_el, + pad_right, + I_row_top, + F_row_top, + O_row_top, + F_c_left, + G_left, + K_left); + + ScalarT const *I_row_full = + I_row_top + H_full_index * I_w * F_c_left * G_left; //(_F_cb * _G_b); + AccumT *O_row_full = + O_row_top + t_pad_el * O_w_w_pad * (G_left * K_left); // ScalarT --> AccumT + + // Steady State over rows + // The stride over input channels is the number of remaining channels + for (index_t j = 0; j < O_h; j += T_height) + { + ScalarT const *I_row; + // @todo cast index calculation as int and make stride a float value. + // I_x = I_x + (int)(j * _stride) * () + if constexpr (op_type == OP_UPSAMPLE) + { + I_row = I_row_full + (j / _stride) * (I_w * _F_cb * G_left); + } + else + { + I_row = I_row_full + (j * _stride) * (I_w * F_c_left * G_left /*_F_cb * _G_b*/); + } + ScalarT const *F_row = F_channel_block_input + 0; + AccumT *O_row = + O_row_full + j * (O_w_w_pad * G_left * K_left); // ScalarT --> AccumT + // Prologue with left padding + rem_kernel_left_rem( + first, + F_h, + F_w, + I_w * _C_ib_left, + l_pad_el, + pad_left, + I_row, + F_row, + O_row, + F_c_left, + G_left, + K_left, + 0, + 0); + + ScalarT const *I_col_full = + I_row + W_full_index * /*(_F_cb * _G_b)*/ F_c_left * G_left; + AccumT *O_col_full = O_row + l_pad_el * (G_left * K_left); // ScalarT --> AccumT + // Steady State with microkernel + for (index_t l = 0; l < O_w_full; l += _O_wb) + { + ScalarT const *I_col; + // @todo cast index calculation as int and make stride a float value. 
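(Editor's note, illustrative only, not part of the patch.) The leftover output-channel loop entered above (k starting at K_full_idx, stepping by K_left) runs a single iteration and addresses the filter and output with per-channel strides (F_c * F_h * F_w * _G_b and O_hxO_w * _G_b) rather than per-block strides. If I read the packed output layout correctly, that places the K_left channels directly after the fully blocked data; a numeric sketch under assumed sizes:

// Assumed: K = 20, _K_b = 16, _G_b = 1, O_hxO_w = 100
// K_full_idx = 16, K_left = 4  ->  one iteration with k = 16
// O_channel_block_output = O_group + k * (O_hxO_w * _G_b) = O_group + 1600,
// which equals the size of the one full K block (O_hxO_w * _G_b * _K_b = 100 * 16 = 1600),
// so the 4 leftover output channels are written immediately after the blocked output.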
+ // I_x = I_x + (int)(j * _stride) * () + if constexpr (op_type == OP_UPSAMPLE) + { + I_col = I_col_full + (l / _stride) * (_F_cb * G_left); + } + else + { + I_col = I_col_full + (l * _stride) * F_c_left * G_left /*(_F_cb * _G_b)*/; + } + ScalarT const *F_col = F_row + 0; + AccumT *O_col = O_col_full + l * (G_left * K_left); // ScalarT --> AccumT + + rem_kernel_rem( + first, + F_h, + F_w, + I_w * _C_ib_left, + I_col, + F_col, + O_col, + F_c_left, + G_left, + K_left, + 0, + 0, + 0, + 0); + } + +#if DEBUG + printf(" end kernel\n"); +#endif + + // Epilogue for microkernel + right padding elements + ScalarT const *I_col_left; + if constexpr (op_type == OP_UPSAMPLE) + { + I_col_left = + I_col_full + (O_w_full / _stride) * (_F_cb * G_left); + } + else + { + I_col_left = + I_col_full + (O_w_full * _stride) * F_c_left * G_left /*(_F_cb * _G_b)*/; + } + + ScalarT const *F_col_left = F_row + 0; + AccumT *O_col_left = O_col_full + O_w_full * (G_left * K_left); // ScalarT --> AccumT + +#if DEBUG + printf("calling right\n"); +#endif + rem_kernel_right_rem( + first, + F_h, + F_w, + I_w * _C_ib_left, + O_w_left, + r_pad_el, + pad_right, + I_col_left, + F_col_left, + O_col_left, + F_c_left, + G_left, + K_left, + 0, + 0); + } + // Epilogue with bottom padding + ScalarT const *I_row_bot; + // @todo cast index calculation as int and make stride a float value. + // I_x = I_x + (int)(j * _stride) * () + if constexpr (op_type == OP_UPSAMPLE) + { + I_row_bot = + I_row_full + (O_h * _stride) * (I_w * _F_cb * G_left); + } + else + { + I_row_bot = + I_row_full + (O_h * _stride) * (I_w * F_c_left * G_left /*_F_cb * _G_b*/); + } + ScalarT const *F_row_bot = F_channel_block_input + 0; + AccumT *O_row_bot = O_row_full + O_h * (O_w_w_pad * G_left * K_left); // ScalarT --> AccumT + + rem_kernel_bottom_rem( + first, + F_h, + F_w, + I_w * _C_ib_left, + b_pad_el, + pad_bottom, + W_full_index, + l_pad_el, + pad_left, + O_w_w_pad, + O_w_full, + O_w_left, + r_pad_el, + pad_right, + I_row_bot, + F_row_bot, + O_row_bot, + F_c_left, + G_left, + K_left); } } } diff --git a/include/small/gen_float_intrinsics_platform.py b/include/small/gen_float_intrinsics_platform.py new file mode 100644 index 0000000..d912027 --- /dev/null +++ b/include/small/gen_float_intrinsics_platform.py @@ -0,0 +1,234 @@ +# SMaLL, Software for Machine Learning Libraries +# Copyright 2023 by The SMaLL Contributors, All Rights Reserved. +# SPDX-License-Identifier: BSD-3-Clause +# +# For additional details (including references to third party source code and +# other files) see the LICENSE file or contact permission@sei.cmu.edu. See +# Contributors.txt for a full list of contributors. Created, in part, with +# funding and support from the U.S. Government (see Acknowledgments.txt file). +# DM23-0126 + +# Platforrm specific parameters +#make this is its own file, import based on runtime platform parameters +platform_name="arm" +W_ob = 6 +C_ob2 = 1 +C_ob1 = 16 +C_ob = C_ob2 * C_ob1 +SIMD = 8 +UNROLL = 1 # only affects conv kernel. +# UNROLL = SIMD # only affects conv kernel. 
+ +NUM_FMA = 2 +NUM_MAX = 1 +NUM_LOAD = 2 +NUM_STORE = 1 + +#ISA setup +simd_intrin_lib_path="arm_neon.h" +simd_reg_typename="float32x4_t" +simd_load_func="vld1q_f32" +simd_store_func="vst1q_f32" + +simd_fma_func="vfmaq_f32" +simd_max_func="vmaxq_f32" +simd_div_func="vdivq_f32" +simd_add_func="vaddq_f32" +#end ISA setup + + +#path to put the generated code +#get current directory +import os +cur_dir=os.getcwd() +path_to_gen_code=cur_dir+"/platforms/"+platform_name +print(path_to_gen_code) +#end platform specific parameters + +#There should be a generalization of the kernel generation based on the op_class, whether it's binary, unary, etc. +#Ideally, given the instruction sequence for a single operation and the op_class, the kernel can be generated. +matmul_alg="in register broadcast" + +with open('{:}/params_temp.h'.format(path_to_gen_code), 'w') as f: + f.write( + ''' +//**************************************************************************** +// SMaLL, Software for Machine Learning Libraries +// Copyright 2023 by The SMaLL Contributors, All Rights Reserved. +// SPDX-License-Identifier: BSD-3-Clause +// +// For additional details (including references to third party source code and +// other files) see the LICENSE file or contact permission@sei.cmu.edu. See +// Contributors.txt for a full list of contributors. Created, in part, with +// funding and support from the U.S. Government (see Acknowledgments.txt file). +// DM23-0126 +//**************************************************************************** + +#pragma once + +#define SMALL_HAS_FLOAT_SUPPORT 1 + +#define FLOAT_W_ob {W_ob} +#define FLOAT_C_ob {C_ob} +#define FLOAT_SIMD {SIMD} +#define FLOAT_UNROLL {UNROLL} +#define FLOAT_C_ib FLOAT_C_ob + +// not used for kernels, but used in throughput calculation. +#define NUM_FMA {NUM_FMA} +#define NUM_MAX {NUM_MAX} +#define NUM_LOAD {NUM_LOAD} +#define NUM_STORE {NUM_STORE} + '''.format(**locals()) + ) + +def redefine(name): + return ['#ifdef {n}\n#undef {n}\n#endif\n'.format(n=name)] + +with open('{:}/intrinsics-gen_temp.h'.format(path_to_gen_code), 'w') as f: + s = [''' +//**************************************************************************** +// SMaLL, Software for Machine Learning Libraries +// Copyright 2023 by The SMaLL Contributors, All Rights Reserved. +// SPDX-License-Identifier: BSD-3-Clause +// +// For additional details (including references to third party source code and +// other files) see the LICENSE file or contact permission@sei.cmu.edu. See +// Contributors.txt for a full list of contributors. Created, in part, with +// funding and support from the U.S. Government (see Acknowledgments.txt file). 
+// DM23-0126 +//**************************************************************************** + +#pragma once + +#define SMALL_HAS_FLOAT_SUPPORT 1 +'''] + s += ['#include <{}>'.format(simd_intrin_lib_path)] + + # define tile + # names of variables + s += redefine('FLOAT_DEF_TILE_C') + s += ['#define FLOAT_DEF_TILE_C(W_ob, C_ob)\\'] + c_tile = [["c_{}_{}".format(kk, jj) for jj in range(C_ob//SIMD)] for kk in range(W_ob)] + s += ['float c_tile[W_ob * C_ob];\\'] + for kk in range(W_ob): + for jj in range(C_ob//SIMD): + s += ['{} {};\\'.format(simd_reg_typename, c_tile[kk][jj])] + s += [''] + + + # zero tile + s += redefine('FLOAT_ZERO_TILE_C') + s += ['#define FLOAT_ZERO_TILE_C(W_ob, C_ob)\\'] + for kk in range(W_ob): + for jj in range(C_ob//SIMD): + s += ['{} = vdupq_n_f32(0);\\'.format(c_tile[kk][jj])] + s += [''] + + # load tile + s += redefine('FLOAT_LOAD_TILE_C') + s += ['#define FLOAT_LOAD_TILE_C(O, W_ob, C_ob)\\'] + for kk in range(W_ob): + for jj in range(C_ob//SIMD): + s += ['{c} = vld1q_f32(O + {k} * C_ob + {j} * SIMD);\\'.format(c=c_tile[kk][jj], k=kk, j=jj)] + s += [''] + + # load tile strided + s += redefine('FLOAT_LOAD_TILE_C_strided') + s += ['#define FLOAT_LOAD_TILE_C_strided(O, step, W_ob, C_ob)\\'] + for kk in range(W_ob): + for jj in range(C_ob//SIMD): + s += ['{c} = vld1q_f32(O + {k} * step + {j} * SIMD);\\'.format(c=c_tile[kk][jj], k=kk, j=jj)] + s += [''] + + # store tile + s += redefine('STORE_TILE_C') + s += ['#define STORE_TILE_C(O, W_ob, C_ob)\\'] + for kk in range(W_ob): + for jj in range(C_ob//SIMD): + s += ['vst1q_f32(O + {k} * C_ob + {j} * SIMD, {c});\\'.format(c=c_tile[kk][jj], k=kk, j=jj)] + s += [''] + + # convolution + s += redefine('CONV_TILE_C') + s += ['#define CONV_TILE_C(step, a, b, W_ob, C_ob)\\'] + s += ['float *aa = a;\\'] + s += ['float *bb = b;\\'] + # define a + for kk in range(W_ob): + s += ['{} a_{kk};\\'.format(simd_reg_typename, kk=kk)] + # define b [half as many] + for jj in range(C_ob1//SIMD): + s += ['{} b_{jj};\\'.format(simd_reg_typename, jj=jj)] + + for i in range(UNROLL//SIMD): + # load a SIMD width of a + # for kk in range(W_ob): + # s += ['a_{kk} = vld1q_f32(a + {kk} * step + {i} * SIMD);\\'.format(kk=kk, i=i)] + + for j in range(C_ob2): + + + for ii in range(SIMD): + # load B + # for jj in range(C_ob1//SIMD): + # # s += ['b_{jj} = vld1q_f32(b + ({i} * SIMD + {ii})*C_ob + ({j} * {C_ob1} + {jj})*SIMD);\\'.format(i=i, ii=ii, j=j, C_ob1=C_ob1//SIMD, jj=jj)] + # s += ['b_{jj} = vld1q_f32(bb + {ii}*C_ob + ({j} * {C_ob1} + {jj})*SIMD);\\'.format(ii=ii, j=j, C_ob1=C_ob1//SIMD, jj=jj)] + + # compute + for kk in range(W_ob): + if j == 0: # load a just before use + s += ['a_{kk} = vld1q_f32(a + {kk} * step + {i} * SIMD);\\'.format(kk=kk, i=i)] + + for jj in range(C_ob1//SIMD): + + if kk == 0: # load b just before use + # s += ['b_{jj} = vld1q_f32(b + ({i} * SIMD + {ii})*C_ob + ({j} * {C_ob1} + {jj})*SIMD);\\'.format(i=i, ii=ii, j=j, C_ob1=C_ob1//SIMD, jj=jj)] + s += ['b_{jj} = vld1q_f32(bb + {ii}*C_ob + ({j} * {C_ob1} + {jj})*SIMD);\\'.format(ii=ii, j=j, C_ob1=C_ob1//SIMD, jj=jj)] + + # s += ['{c} = vfmaq_laneq_f32({c}, b_{jj}, a_{kk}, {ii});\\'.format(c=c_tile[kk][j * (C_ob1//SIMD) + jj], kk=kk, jj=jj, ii=ii)] + # s += ['__asm__ volatile("fmla %[c].4s, %[b].4s, %[a].s[{ii}]\\n\\t" : [c] "+w"({c}) : [b] "w"(b_{jj}), [a] "w"(a_{kk}));'.format( + s += ['__asm__ volatile ("fmla %0.4s, %1.4s, %2.s[{ii}]" : "+w"({c}) : "w"(b_{jj}), "w"(a_{kk}));\\'.format( + # s += ['__asm__ ("fmla %0.4s, %1.4s, %2.s[{ii}]" : "+w"({c}) : "w"(b_{jj}), 
"w"(a_{kk}));\\'.format( + c=c_tile[kk][j * (C_ob1//SIMD) + jj], kk=kk, jj=jj, ii=ii + )] + + # s += ['{c} = fma_reg_broadcast({c}, b_{jj}, a_{kk}, {ii});\\'.format(c=c_tile[kk][j * (C_ob1//SIMD) + jj], kk=kk, jj=jj, ii=ii)] + s += ['bb += {};\\'.format(SIMD * C_ob)] + # s += ['aa += \\'] + + s += [''] + + # + + # max pooling / relu + s += redefine('MAX_TILE_C') + s += ['#define MAX_TILE_C(step, a, W_ob, C_ob)\\'] + # compute + s += ['{} av; \\'.format(simd_reg_typename)] + for kk in range(W_ob): + for jj in range(C_ob//SIMD): + s += ['av = vld1q_f32(a + {k} * step + {j} * SIMD);\\'.format(k=kk, j=jj)] + s += ['{c} = vmaxq_f32({c}, av);\\'.format(c=c_tile[kk][jj], k=kk, j=jj)] + s += [''] + + # depthwise + s += redefine('DW_TILE_C') + s += ['#define DW_TILE_C(step, a, b, W_ob, C_ob)\\'] + s += ['{} av; \\'.format(simd_reg_typename)] + # load B + for jj in range(C_ob//SIMD): + s += ['{} b_{j} = vld1q_f32(b + {j}*SIMD);\\'.format(simd_reg_typename, j=jj)] + # compute + for kk in range(W_ob): + for jj in range(C_ob//SIMD): + s += ['av = vld1q_f32(a + {k} * step + {j} * SIMD);\\'.format(k=kk, j=jj)] + s += ['{c} = vfmaq_f32({c}, av, b_{j});\\'.format(c=c_tile[kk][jj], j=jj)] + s += [''] + + + + # to fix backslash at end of file warning + s += [''] + + f.write('\n'.join(s)) diff --git a/include/small/interface_abstract.hpp b/include/small/interface_abstract.hpp index dbd1185..7db09bf 100644 --- a/include/small/interface_abstract.hpp +++ b/include/small/interface_abstract.hpp @@ -17,6 +17,8 @@ #include #include +#define SMALL_HAS_FLOAT_SUPPORT 1 + #if defined(SMALL_HAS_FLOAT_SUPPORT) #include /// @todo abstract_layer_float.hpp #endif @@ -155,11 +157,49 @@ void Conv2D( "Conv2D ERROR: stride unsupported."); } } + + //Generic handling for input channels that are not a multiple else { - throw std::invalid_argument( - "Conv2D ERROR: in_channels unsupported."); + // printf("Generic handling for input channels that are not a multiple\n"); + // printf("call parameters: "); + if (stride == 1) + { + detail::abstract_layer< + FloatBuffer, 1, FLOAT_C_ob, FLOAT_C_ib, + FLOAT_W_ob, 1, FLOAT_UNROLL, OP_CONV, 2, 1>( + 1, // Output Channel Grouping + output_channels, // Output Channels per group + input_channels, + input_height, input_width, + kernel_height, kernel_width, + t_pad, l_pad, r_pad, b_pad, + &input_buf, &filter_buf, &output_buf); + } + else if (stride == 2) + { + detail::abstract_layer< + FloatBuffer, 1, FLOAT_C_ob, FLOAT_C_ib, + FLOAT_W_ob, 2, FLOAT_UNROLL, OP_CONV, 2, 1>( + 1, // Output Channel Grouping + output_channels, // Output Channels per group + input_channels, + input_height, input_width, + kernel_height, kernel_width, + t_pad, l_pad, r_pad, b_pad, + &input_buf, &filter_buf, &output_buf); + } + else + { + throw std::invalid_argument( + "Conv2D ERROR: stride unsupported."); + } } + // else + // { + // throw std::invalid_argument( + // "Conv2D ERROR: in_channels unsupported."); + // } } #endif @@ -379,11 +419,11 @@ void PartialConv2D( /// @todo We need another specific case for input_channels==1 (maybe more) - else - { - throw std::invalid_argument( - "PartialConv2D ERROR: in_channels unsupported."); - } + // else + // { + // throw std::invalid_argument( + // "PartialConv2D ERROR: in_channels unsupported."); + // } } #endif @@ -525,8 +565,8 @@ void MaxPool2D( << ",I,O)\n"; #endif - if (input_channels % FLOAT_C_ib == 0) - { + // if (input_channels % FLOAT_C_ib == 0) + // { if (stride == 1) { detail::abstract_layer< @@ -556,12 +596,12 @@ void MaxPool2D( throw std::invalid_argument( "MaxPool2D 
ERROR: stride unsupported."); } - } - else - { - throw std::invalid_argument( - "MaxPool2D ERROR: in_channels unsupported."); - } + // } + // else + // { + // throw std::invalid_argument( + // "MaxPool2D ERROR: in_channels unsupported."); + // } } #endif @@ -707,11 +747,11 @@ void AveragePool2D( "AveragePool2D ERROR: stride unsupported."); } } - else - { - throw std::invalid_argument( - "AveragePool2D ERROR: in_channels unsupported."); - } + // else + // { + // throw std::invalid_argument( + // "AveragePool2D ERROR: in_channels unsupported."); + // } } #endif @@ -754,8 +794,8 @@ void DepthwiseConv2D( << ",img:" << input_height << "x" << input_width << ",I,F,O)\n"; #endif - if (input_channels % FLOAT_C_ib == 0) - { + // if (input_channels % FLOAT_C_ib == 0) + // { if (stride == 1) { detail::abstract_layer< @@ -786,12 +826,12 @@ void DepthwiseConv2D( throw std::invalid_argument( "DepthwiseConv2D ERROR: stride unsupported."); } - } - else - { - throw std::invalid_argument( - "DepthwiseConv2D ERROR: in_channels unsupported."); - } + // } + // else + // { + // throw std::invalid_argument( + // "DepthwiseConv2D ERROR: in_channels unsupported."); + // } } #endif @@ -1029,8 +1069,8 @@ void ReLUActivation(int input_channels, << ",I,O)\n"; #endif - if (input_channels % FLOAT_C_ib == 0) - { + // if (input_channels % FLOAT_C_ob == 0) + // { detail::abstract_layer< FloatBuffer, FLOAT_C_ob, 1, 1, FLOAT_W_ob, 1, 1, OP_RELU, 0, 1>( input_channels, // Output Channel Grouping @@ -1040,12 +1080,12 @@ void ReLUActivation(int input_channels, 1, 1, 0, 0, 0, 0, &input_buf, (FloatBuffer *)nullptr, &output_buf); - } - else - { - throw std::invalid_argument( - "ReLUActivation ERROR: in_channels unsupported."); - } + // } + // else + // { + // throw std::invalid_argument( + // "ReLUActivation ERROR: in_channels unsupported."); + // } } #endif diff --git a/include/small/intrinsics-gen_temp.h b/include/small/intrinsics-gen_temp.h new file mode 100644 index 0000000..8dfbe13 --- /dev/null +++ b/include/small/intrinsics-gen_temp.h @@ -0,0 +1,189 @@ + +//**************************************************************************** +// SMaLL, Software for Machine Learning Libraries +// Copyright 2023 by The SMaLL Contributors, All Rights Reserved. +// SPDX-License-Identifier: BSD-3-Clause +// +// For additional details (including references to third party source code and +// other files) see the LICENSE file or contact permission@sei.cmu.edu. See +// Contributors.txt for a full list of contributors. Created, in part, with +// funding and support from the U.S. Government (see Acknowledgments.txt file). 
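(Editor's note, illustrative only, not part of the patch.) In the CONV_TILE_C body emitted by gen_float_intrinsics_platform.py earlier in this patch, the FMA is written as inline assembly; it performs the same lane-broadcast multiply-accumulate as the vfmaq_laneq_f32 call that is left commented out in the script. A minimal, self-contained sketch of the two equivalent forms (AArch64; the lane is fixed to 0 here because the intrinsic requires a compile-time constant, whereas the generator hard-codes a different lane per unrolled step):

#include <arm_neon.h>

// c += b * a[0]: broadcast one lane of a, then fused multiply-accumulate into c.
static inline float32x4_t fma_lane0_intrinsic(float32x4_t c, float32x4_t b, float32x4_t a)
{
    return vfmaq_laneq_f32(c, b, a, 0);   // intrinsic form (commented out in the generator)
}

static inline float32x4_t fma_lane0_asm(float32x4_t c, float32x4_t b, float32x4_t a)
{
    // inline-asm form the generator actually emits, one instruction per lane index
    __asm__ volatile("fmla %0.4s, %1.4s, %2.s[0]" : "+w"(c) : "w"(b), "w"(a));
    return c;
}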
+// DM23-0126 +//**************************************************************************** + +#pragma once + +#define SMALL_HAS_FLOAT_SUPPORT 1 + +#include +#ifdef FLOAT_DEF_TILE_C +#undef FLOAT_DEF_TILE_C +#endif + +#define FLOAT_DEF_TILE_C(W_ob, C_ob)\ +float c_tile[W_ob * C_ob];\ +float32x4_t c_0_0;\ +float32x4_t c_0_1;\ +float32x4_t c_1_0;\ +float32x4_t c_1_1;\ +float32x4_t c_2_0;\ +float32x4_t c_2_1;\ +float32x4_t c_3_0;\ +float32x4_t c_3_1;\ +float32x4_t c_4_0;\ +float32x4_t c_4_1;\ +float32x4_t c_5_0;\ +float32x4_t c_5_1;\ + +#ifdef FLOAT_ZERO_TILE_C +#undef FLOAT_ZERO_TILE_C +#endif + +#define FLOAT_ZERO_TILE_C(W_ob, C_ob)\ +c_0_0 = vdupq_n_f32(0);\ +c_0_1 = vdupq_n_f32(0);\ +c_1_0 = vdupq_n_f32(0);\ +c_1_1 = vdupq_n_f32(0);\ +c_2_0 = vdupq_n_f32(0);\ +c_2_1 = vdupq_n_f32(0);\ +c_3_0 = vdupq_n_f32(0);\ +c_3_1 = vdupq_n_f32(0);\ +c_4_0 = vdupq_n_f32(0);\ +c_4_1 = vdupq_n_f32(0);\ +c_5_0 = vdupq_n_f32(0);\ +c_5_1 = vdupq_n_f32(0);\ + +#ifdef FLOAT_LOAD_TILE_C +#undef FLOAT_LOAD_TILE_C +#endif + +#define FLOAT_LOAD_TILE_C(O, W_ob, C_ob)\ +c_0_0 = vld1q_f32(O + 0 * C_ob + 0 * SIMD);\ +c_0_1 = vld1q_f32(O + 0 * C_ob + 1 * SIMD);\ +c_1_0 = vld1q_f32(O + 1 * C_ob + 0 * SIMD);\ +c_1_1 = vld1q_f32(O + 1 * C_ob + 1 * SIMD);\ +c_2_0 = vld1q_f32(O + 2 * C_ob + 0 * SIMD);\ +c_2_1 = vld1q_f32(O + 2 * C_ob + 1 * SIMD);\ +c_3_0 = vld1q_f32(O + 3 * C_ob + 0 * SIMD);\ +c_3_1 = vld1q_f32(O + 3 * C_ob + 1 * SIMD);\ +c_4_0 = vld1q_f32(O + 4 * C_ob + 0 * SIMD);\ +c_4_1 = vld1q_f32(O + 4 * C_ob + 1 * SIMD);\ +c_5_0 = vld1q_f32(O + 5 * C_ob + 0 * SIMD);\ +c_5_1 = vld1q_f32(O + 5 * C_ob + 1 * SIMD);\ + +#ifdef FLOAT_LOAD_TILE_C_strided +#undef FLOAT_LOAD_TILE_C_strided +#endif + +#define FLOAT_LOAD_TILE_C_strided(O, step, W_ob, C_ob)\ +c_0_0 = vld1q_f32(O + 0 * step + 0 * SIMD);\ +c_0_1 = vld1q_f32(O + 0 * step + 1 * SIMD);\ +c_1_0 = vld1q_f32(O + 1 * step + 0 * SIMD);\ +c_1_1 = vld1q_f32(O + 1 * step + 1 * SIMD);\ +c_2_0 = vld1q_f32(O + 2 * step + 0 * SIMD);\ +c_2_1 = vld1q_f32(O + 2 * step + 1 * SIMD);\ +c_3_0 = vld1q_f32(O + 3 * step + 0 * SIMD);\ +c_3_1 = vld1q_f32(O + 3 * step + 1 * SIMD);\ +c_4_0 = vld1q_f32(O + 4 * step + 0 * SIMD);\ +c_4_1 = vld1q_f32(O + 4 * step + 1 * SIMD);\ +c_5_0 = vld1q_f32(O + 5 * step + 0 * SIMD);\ +c_5_1 = vld1q_f32(O + 5 * step + 1 * SIMD);\ + +#ifdef STORE_TILE_C +#undef STORE_TILE_C +#endif + +#define STORE_TILE_C(O, W_ob, C_ob)\ +vst1q_f32(O + 0 * C_ob + 0 * SIMD, c_0_0);\ +vst1q_f32(O + 0 * C_ob + 1 * SIMD, c_0_1);\ +vst1q_f32(O + 1 * C_ob + 0 * SIMD, c_1_0);\ +vst1q_f32(O + 1 * C_ob + 1 * SIMD, c_1_1);\ +vst1q_f32(O + 2 * C_ob + 0 * SIMD, c_2_0);\ +vst1q_f32(O + 2 * C_ob + 1 * SIMD, c_2_1);\ +vst1q_f32(O + 3 * C_ob + 0 * SIMD, c_3_0);\ +vst1q_f32(O + 3 * C_ob + 1 * SIMD, c_3_1);\ +vst1q_f32(O + 4 * C_ob + 0 * SIMD, c_4_0);\ +vst1q_f32(O + 4 * C_ob + 1 * SIMD, c_4_1);\ +vst1q_f32(O + 5 * C_ob + 0 * SIMD, c_5_0);\ +vst1q_f32(O + 5 * C_ob + 1 * SIMD, c_5_1);\ + +#ifdef CONV_TILE_C +#undef CONV_TILE_C +#endif + +#define CONV_TILE_C(step, a, b, W_ob, C_ob)\ +float *aa = a;\ +float *bb = b;\ +float32x4_t a_0;\ +float32x4_t a_1;\ +float32x4_t a_2;\ +float32x4_t a_3;\ +float32x4_t a_4;\ +float32x4_t a_5;\ +float32x4_t b_0;\ +float32x4_t b_1;\ + +#ifdef MAX_TILE_C +#undef MAX_TILE_C +#endif + +#define MAX_TILE_C(step, a, W_ob, C_ob)\ +float32x4_t av; \ +av = vld1q_f32(a + 0 * step + 0 * SIMD);\ +c_0_0 = vmaxq_f32(c_0_0, av);\ +av = vld1q_f32(a + 0 * step + 1 * SIMD);\ +c_0_1 = vmaxq_f32(c_0_1, av);\ +av = vld1q_f32(a + 1 * step + 0 * SIMD);\ +c_1_0 = vmaxq_f32(c_1_0, av);\ 
+av = vld1q_f32(a + 1 * step + 1 * SIMD);\ +c_1_1 = vmaxq_f32(c_1_1, av);\ +av = vld1q_f32(a + 2 * step + 0 * SIMD);\ +c_2_0 = vmaxq_f32(c_2_0, av);\ +av = vld1q_f32(a + 2 * step + 1 * SIMD);\ +c_2_1 = vmaxq_f32(c_2_1, av);\ +av = vld1q_f32(a + 3 * step + 0 * SIMD);\ +c_3_0 = vmaxq_f32(c_3_0, av);\ +av = vld1q_f32(a + 3 * step + 1 * SIMD);\ +c_3_1 = vmaxq_f32(c_3_1, av);\ +av = vld1q_f32(a + 4 * step + 0 * SIMD);\ +c_4_0 = vmaxq_f32(c_4_0, av);\ +av = vld1q_f32(a + 4 * step + 1 * SIMD);\ +c_4_1 = vmaxq_f32(c_4_1, av);\ +av = vld1q_f32(a + 5 * step + 0 * SIMD);\ +c_5_0 = vmaxq_f32(c_5_0, av);\ +av = vld1q_f32(a + 5 * step + 1 * SIMD);\ +c_5_1 = vmaxq_f32(c_5_1, av);\ + +#ifdef DW_TILE_C +#undef DW_TILE_C +#endif + +#define DW_TILE_C(step, a, b, W_ob, C_ob)\ +float32x4_t av; \ +float32x4_t b_0 = vld1q_f32(b + 0*SIMD);\ +float32x4_t b_1 = vld1q_f32(b + 1*SIMD);\ +av = vld1q_f32(a + 0 * step + 0 * SIMD);\ +c_0_0 = vfmaq_f32(c_0_0, av, b_0);\ +av = vld1q_f32(a + 0 * step + 1 * SIMD);\ +c_0_1 = vfmaq_f32(c_0_1, av, b_1);\ +av = vld1q_f32(a + 1 * step + 0 * SIMD);\ +c_1_0 = vfmaq_f32(c_1_0, av, b_0);\ +av = vld1q_f32(a + 1 * step + 1 * SIMD);\ +c_1_1 = vfmaq_f32(c_1_1, av, b_1);\ +av = vld1q_f32(a + 2 * step + 0 * SIMD);\ +c_2_0 = vfmaq_f32(c_2_0, av, b_0);\ +av = vld1q_f32(a + 2 * step + 1 * SIMD);\ +c_2_1 = vfmaq_f32(c_2_1, av, b_1);\ +av = vld1q_f32(a + 3 * step + 0 * SIMD);\ +c_3_0 = vfmaq_f32(c_3_0, av, b_0);\ +av = vld1q_f32(a + 3 * step + 1 * SIMD);\ +c_3_1 = vfmaq_f32(c_3_1, av, b_1);\ +av = vld1q_f32(a + 4 * step + 0 * SIMD);\ +c_4_0 = vfmaq_f32(c_4_0, av, b_0);\ +av = vld1q_f32(a + 4 * step + 1 * SIMD);\ +c_4_1 = vfmaq_f32(c_4_1, av, b_1);\ +av = vld1q_f32(a + 5 * step + 0 * SIMD);\ +c_5_0 = vfmaq_f32(c_5_0, av, b_0);\ +av = vld1q_f32(a + 5 * step + 1 * SIMD);\ +c_5_1 = vfmaq_f32(c_5_1, av, b_1);\ + diff --git a/include/small/params_temp.h b/include/small/params_temp.h new file mode 100644 index 0000000..bc7f745 --- /dev/null +++ b/include/small/params_temp.h @@ -0,0 +1,29 @@ + +//**************************************************************************** +// SMaLL, Software for Machine Learning Libraries +// Copyright 2023 by The SMaLL Contributors, All Rights Reserved. +// SPDX-License-Identifier: BSD-3-Clause +// +// For additional details (including references to third party source code and +// other files) see the LICENSE file or contact permission@sei.cmu.edu. See +// Contributors.txt for a full list of contributors. Created, in part, with +// funding and support from the U.S. Government (see Acknowledgments.txt file). +// DM23-0126 +//**************************************************************************** + +#pragma once + +#define SMALL_HAS_FLOAT_SUPPORT 1 + +#define FLOAT_W_ob 6 +#define FLOAT_C_ob 16 +#define FLOAT_SIMD 8 +#define FLOAT_UNROLL 1 +#define FLOAT_C_ib FLOAT_C_ob + +// not used for kernels, but used in throughput calculation. +#define NUM_FMA 2 +#define NUM_MAX 1 +#define NUM_LOAD 2 +#define NUM_STORE 1 + \ No newline at end of file diff --git a/include/small/platforms/arm/intrinsics-gen_temp.h b/include/small/platforms/arm/intrinsics-gen_temp.h new file mode 100644 index 0000000..8dfbe13 --- /dev/null +++ b/include/small/platforms/arm/intrinsics-gen_temp.h @@ -0,0 +1,189 @@ + +//**************************************************************************** +// SMaLL, Software for Machine Learning Libraries +// Copyright 2023 by The SMaLL Contributors, All Rights Reserved. 
+// SPDX-License-Identifier: BSD-3-Clause +// +// For additional details (including references to third party source code and +// other files) see the LICENSE file or contact permission@sei.cmu.edu. See +// Contributors.txt for a full list of contributors. Created, in part, with +// funding and support from the U.S. Government (see Acknowledgments.txt file). +// DM23-0126 +//**************************************************************************** + +#pragma once + +#define SMALL_HAS_FLOAT_SUPPORT 1 + +#include +#ifdef FLOAT_DEF_TILE_C +#undef FLOAT_DEF_TILE_C +#endif + +#define FLOAT_DEF_TILE_C(W_ob, C_ob)\ +float c_tile[W_ob * C_ob];\ +float32x4_t c_0_0;\ +float32x4_t c_0_1;\ +float32x4_t c_1_0;\ +float32x4_t c_1_1;\ +float32x4_t c_2_0;\ +float32x4_t c_2_1;\ +float32x4_t c_3_0;\ +float32x4_t c_3_1;\ +float32x4_t c_4_0;\ +float32x4_t c_4_1;\ +float32x4_t c_5_0;\ +float32x4_t c_5_1;\ + +#ifdef FLOAT_ZERO_TILE_C +#undef FLOAT_ZERO_TILE_C +#endif + +#define FLOAT_ZERO_TILE_C(W_ob, C_ob)\ +c_0_0 = vdupq_n_f32(0);\ +c_0_1 = vdupq_n_f32(0);\ +c_1_0 = vdupq_n_f32(0);\ +c_1_1 = vdupq_n_f32(0);\ +c_2_0 = vdupq_n_f32(0);\ +c_2_1 = vdupq_n_f32(0);\ +c_3_0 = vdupq_n_f32(0);\ +c_3_1 = vdupq_n_f32(0);\ +c_4_0 = vdupq_n_f32(0);\ +c_4_1 = vdupq_n_f32(0);\ +c_5_0 = vdupq_n_f32(0);\ +c_5_1 = vdupq_n_f32(0);\ + +#ifdef FLOAT_LOAD_TILE_C +#undef FLOAT_LOAD_TILE_C +#endif + +#define FLOAT_LOAD_TILE_C(O, W_ob, C_ob)\ +c_0_0 = vld1q_f32(O + 0 * C_ob + 0 * SIMD);\ +c_0_1 = vld1q_f32(O + 0 * C_ob + 1 * SIMD);\ +c_1_0 = vld1q_f32(O + 1 * C_ob + 0 * SIMD);\ +c_1_1 = vld1q_f32(O + 1 * C_ob + 1 * SIMD);\ +c_2_0 = vld1q_f32(O + 2 * C_ob + 0 * SIMD);\ +c_2_1 = vld1q_f32(O + 2 * C_ob + 1 * SIMD);\ +c_3_0 = vld1q_f32(O + 3 * C_ob + 0 * SIMD);\ +c_3_1 = vld1q_f32(O + 3 * C_ob + 1 * SIMD);\ +c_4_0 = vld1q_f32(O + 4 * C_ob + 0 * SIMD);\ +c_4_1 = vld1q_f32(O + 4 * C_ob + 1 * SIMD);\ +c_5_0 = vld1q_f32(O + 5 * C_ob + 0 * SIMD);\ +c_5_1 = vld1q_f32(O + 5 * C_ob + 1 * SIMD);\ + +#ifdef FLOAT_LOAD_TILE_C_strided +#undef FLOAT_LOAD_TILE_C_strided +#endif + +#define FLOAT_LOAD_TILE_C_strided(O, step, W_ob, C_ob)\ +c_0_0 = vld1q_f32(O + 0 * step + 0 * SIMD);\ +c_0_1 = vld1q_f32(O + 0 * step + 1 * SIMD);\ +c_1_0 = vld1q_f32(O + 1 * step + 0 * SIMD);\ +c_1_1 = vld1q_f32(O + 1 * step + 1 * SIMD);\ +c_2_0 = vld1q_f32(O + 2 * step + 0 * SIMD);\ +c_2_1 = vld1q_f32(O + 2 * step + 1 * SIMD);\ +c_3_0 = vld1q_f32(O + 3 * step + 0 * SIMD);\ +c_3_1 = vld1q_f32(O + 3 * step + 1 * SIMD);\ +c_4_0 = vld1q_f32(O + 4 * step + 0 * SIMD);\ +c_4_1 = vld1q_f32(O + 4 * step + 1 * SIMD);\ +c_5_0 = vld1q_f32(O + 5 * step + 0 * SIMD);\ +c_5_1 = vld1q_f32(O + 5 * step + 1 * SIMD);\ + +#ifdef STORE_TILE_C +#undef STORE_TILE_C +#endif + +#define STORE_TILE_C(O, W_ob, C_ob)\ +vst1q_f32(O + 0 * C_ob + 0 * SIMD, c_0_0);\ +vst1q_f32(O + 0 * C_ob + 1 * SIMD, c_0_1);\ +vst1q_f32(O + 1 * C_ob + 0 * SIMD, c_1_0);\ +vst1q_f32(O + 1 * C_ob + 1 * SIMD, c_1_1);\ +vst1q_f32(O + 2 * C_ob + 0 * SIMD, c_2_0);\ +vst1q_f32(O + 2 * C_ob + 1 * SIMD, c_2_1);\ +vst1q_f32(O + 3 * C_ob + 0 * SIMD, c_3_0);\ +vst1q_f32(O + 3 * C_ob + 1 * SIMD, c_3_1);\ +vst1q_f32(O + 4 * C_ob + 0 * SIMD, c_4_0);\ +vst1q_f32(O + 4 * C_ob + 1 * SIMD, c_4_1);\ +vst1q_f32(O + 5 * C_ob + 0 * SIMD, c_5_0);\ +vst1q_f32(O + 5 * C_ob + 1 * SIMD, c_5_1);\ + +#ifdef CONV_TILE_C +#undef CONV_TILE_C +#endif + +#define CONV_TILE_C(step, a, b, W_ob, C_ob)\ +float *aa = a;\ +float *bb = b;\ +float32x4_t a_0;\ +float32x4_t a_1;\ +float32x4_t a_2;\ +float32x4_t a_3;\ +float32x4_t a_4;\ +float32x4_t a_5;\ +float32x4_t 
b_0;\ +float32x4_t b_1;\ + +#ifdef MAX_TILE_C +#undef MAX_TILE_C +#endif + +#define MAX_TILE_C(step, a, W_ob, C_ob)\ +float32x4_t av; \ +av = vld1q_f32(a + 0 * step + 0 * SIMD);\ +c_0_0 = vmaxq_f32(c_0_0, av);\ +av = vld1q_f32(a + 0 * step + 1 * SIMD);\ +c_0_1 = vmaxq_f32(c_0_1, av);\ +av = vld1q_f32(a + 1 * step + 0 * SIMD);\ +c_1_0 = vmaxq_f32(c_1_0, av);\ +av = vld1q_f32(a + 1 * step + 1 * SIMD);\ +c_1_1 = vmaxq_f32(c_1_1, av);\ +av = vld1q_f32(a + 2 * step + 0 * SIMD);\ +c_2_0 = vmaxq_f32(c_2_0, av);\ +av = vld1q_f32(a + 2 * step + 1 * SIMD);\ +c_2_1 = vmaxq_f32(c_2_1, av);\ +av = vld1q_f32(a + 3 * step + 0 * SIMD);\ +c_3_0 = vmaxq_f32(c_3_0, av);\ +av = vld1q_f32(a + 3 * step + 1 * SIMD);\ +c_3_1 = vmaxq_f32(c_3_1, av);\ +av = vld1q_f32(a + 4 * step + 0 * SIMD);\ +c_4_0 = vmaxq_f32(c_4_0, av);\ +av = vld1q_f32(a + 4 * step + 1 * SIMD);\ +c_4_1 = vmaxq_f32(c_4_1, av);\ +av = vld1q_f32(a + 5 * step + 0 * SIMD);\ +c_5_0 = vmaxq_f32(c_5_0, av);\ +av = vld1q_f32(a + 5 * step + 1 * SIMD);\ +c_5_1 = vmaxq_f32(c_5_1, av);\ + +#ifdef DW_TILE_C +#undef DW_TILE_C +#endif + +#define DW_TILE_C(step, a, b, W_ob, C_ob)\ +float32x4_t av; \ +float32x4_t b_0 = vld1q_f32(b + 0*SIMD);\ +float32x4_t b_1 = vld1q_f32(b + 1*SIMD);\ +av = vld1q_f32(a + 0 * step + 0 * SIMD);\ +c_0_0 = vfmaq_f32(c_0_0, av, b_0);\ +av = vld1q_f32(a + 0 * step + 1 * SIMD);\ +c_0_1 = vfmaq_f32(c_0_1, av, b_1);\ +av = vld1q_f32(a + 1 * step + 0 * SIMD);\ +c_1_0 = vfmaq_f32(c_1_0, av, b_0);\ +av = vld1q_f32(a + 1 * step + 1 * SIMD);\ +c_1_1 = vfmaq_f32(c_1_1, av, b_1);\ +av = vld1q_f32(a + 2 * step + 0 * SIMD);\ +c_2_0 = vfmaq_f32(c_2_0, av, b_0);\ +av = vld1q_f32(a + 2 * step + 1 * SIMD);\ +c_2_1 = vfmaq_f32(c_2_1, av, b_1);\ +av = vld1q_f32(a + 3 * step + 0 * SIMD);\ +c_3_0 = vfmaq_f32(c_3_0, av, b_0);\ +av = vld1q_f32(a + 3 * step + 1 * SIMD);\ +c_3_1 = vfmaq_f32(c_3_1, av, b_1);\ +av = vld1q_f32(a + 4 * step + 0 * SIMD);\ +c_4_0 = vfmaq_f32(c_4_0, av, b_0);\ +av = vld1q_f32(a + 4 * step + 1 * SIMD);\ +c_4_1 = vfmaq_f32(c_4_1, av, b_1);\ +av = vld1q_f32(a + 5 * step + 0 * SIMD);\ +c_5_0 = vfmaq_f32(c_5_0, av, b_0);\ +av = vld1q_f32(a + 5 * step + 1 * SIMD);\ +c_5_1 = vfmaq_f32(c_5_1, av, b_1);\ + diff --git a/include/small/platforms/arm/intrinsics.h b/include/small/platforms/arm/intrinsics.h index e01a4ba..2145daf 100644 --- a/include/small/platforms/arm/intrinsics.h +++ b/include/small/platforms/arm/intrinsics.h @@ -930,6 +930,31 @@ if constexpr(_C_ob == 1 && _C_ob != FLOAT_SIMD_EPILOGUE)\ }\ } +#define FLOAT_REDUCE_REM_CHANNEL_END_C(O_w_left, _C_ob) \ + if (_C_ob == 1 && _C_ob != FLOAT_SIMD_EPILOGUE) \ + { \ + float c_tile_array[FLOAT_C_ob]; \ + for (uint32_t kk = 0; kk < O_w_left; kk++) \ + { \ + float32x4_t *c_channel_v = c_tile + kk * (FLOAT_C_ob / FLOAT_SIMD); \ + c_channel_v[0] = vaddq_f32(c_channel_v[0], c_channel_v[1]); \ + c_channel_v[2] = vaddq_f32(c_channel_v[2], c_channel_v[3]); \ + c_channel_v[0] = vaddq_f32(c_channel_v[0], c_channel_v[2]); \ + \ + vst1q_f32(c_tile_array, c_channel_v[0]); \ + for (uint32_t jj = 1; jj < FLOAT_SIMD; jj++) \ + { \ + c_tile_array[0] += c_tile_array[jj]; \ + c_tile_array[jj] = 0; \ + } \ + \ + c_channel_v[0] = vld1q_f32(c_tile_array); \ + c_channel_v[1] = vdupq_n_f32(0.0); \ + c_channel_v[2] = vdupq_n_f32(0.0); \ + c_channel_v[3] = vdupq_n_f32(0.0); \ + } \ + } + //**************************************************************************** // AVG Pooling //**************************************************************************** diff --git a/include/small/platforms/arm/params.h 
b/include/small/platforms/arm/params.h index cdaf6b5..d63c9d5 100644 --- a/include/small/platforms/arm/params.h +++ b/include/small/platforms/arm/params.h @@ -15,16 +15,16 @@ /// @todo move this include to intrinsics.hpp? #include -#define SMALL_HAS_FLOAT_SUPPORT 1 +#define SMALL_HAS_FLOAT_SUPPORT 1 -#define FLOAT_W_ob 6 -#define FLOAT_C_ob 16 -#define FLOAT_SIMD 4 +#define FLOAT_W_ob 6 +#define FLOAT_C_ob 16 +#define FLOAT_SIMD 4 #define FLOAT_UNROLL 4 -#define FLOAT_C_ib FLOAT_C_ob +#define FLOAT_C_ib FLOAT_C_ob // not used for kernels, but used in throughput calculation. #define FLOAT_NUM_FMA 2 #define FLOAT_NUM_MAX 1 #define FLOAT_NUM_LOAD 2 -#define FLOAT_NUM_STORE 1 +#define FLOAT_NUM_STORE 1 \ No newline at end of file diff --git a/include/small/platforms/arm/params_temp.h b/include/small/platforms/arm/params_temp.h new file mode 100644 index 0000000..bc7f745 --- /dev/null +++ b/include/small/platforms/arm/params_temp.h @@ -0,0 +1,29 @@ + +//**************************************************************************** +// SMaLL, Software for Machine Learning Libraries +// Copyright 2023 by The SMaLL Contributors, All Rights Reserved. +// SPDX-License-Identifier: BSD-3-Clause +// +// For additional details (including references to third party source code and +// other files) see the LICENSE file or contact permission@sei.cmu.edu. See +// Contributors.txt for a full list of contributors. Created, in part, with +// funding and support from the U.S. Government (see Acknowledgments.txt file). +// DM23-0126 +//**************************************************************************** + +#pragma once + +#define SMALL_HAS_FLOAT_SUPPORT 1 + +#define FLOAT_W_ob 6 +#define FLOAT_C_ob 16 +#define FLOAT_SIMD 8 +#define FLOAT_UNROLL 1 +#define FLOAT_C_ib FLOAT_C_ob + +// not used for kernels, but used in throughput calculation. +#define NUM_FMA 2 +#define NUM_MAX 1 +#define NUM_LOAD 2 +#define NUM_STORE 1 + \ No newline at end of file diff --git a/include/small/platforms/reference/intrinsics_float.h b/include/small/platforms/reference/intrinsics_float.h index fe03cef..5bf83f4 100644 --- a/include/small/platforms/reference/intrinsics_float.h +++ b/include/small/platforms/reference/intrinsics_float.h @@ -441,6 +441,21 @@ for(uint32_t u =0 ; u < _UNROLL; u++)\ } \ } +#define FLOAT_REDUCE_REM_CHANNEL_END_C(O_w_left, _C_ob) \ + if(_C_ob == 1 && _C_ob != FLOAT_SIMD_EPILOGUE) \ + { \ + float c_tile_array[FLOAT_C_ob]; \ + for (uint32_t kk = 0; kk < O_w_left; kk++) \ + { \ + float *c_channel_v = c_tile + kk * (FLOAT_C_ob); \ + for (uint32_t jj = 1; jj < FLOAT_C_ob; jj++) \ + { \ + c_channel_v[0] += c_channel_v[jj]; \ + c_channel_v[jj] = 0; \ + } \ + } \ + } + //**************************************************************************** // Reduce kernels?? 
//**************************************************************************** diff --git a/include/small/platforms/zen2/intrinsics.h b/include/small/platforms/zen2/intrinsics.h index 23c678d..101ea55 100644 --- a/include/small/platforms/zen2/intrinsics.h +++ b/include/small/platforms/zen2/intrinsics.h @@ -617,6 +617,22 @@ for(uint32_t u =0 ; u < _UNROLL; u++)\ } \ } \ } + +#define FLOAT_REDUCE_REM_CHANNEL_END_C(O_w_left, _C_ob) \ + if (_C_ob == 1 && _C_ob != FLOAT_SIMD_EPILOGUE) \ + { \ + float c_tile_array[FLOAT_C_ob]; \ + for (uint32_t kk = 0; kk < O_w_left; kk++) \ + { \ + float *c_channel_v = c_tile + kk * (FLOAT_C_ob); \ + for (uint32_t jj = 1; jj < FLOAT_C_ob; jj++) \ + { \ + c_channel_v[0] += c_channel_v[jj]; \ + c_channel_v[jj] = 0; \ + } \ + } \ + } + //**************************************************************************** // FMA unused? //****************************************************************************
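(Editor's note, illustrative only, not part of the patch.) The scalar FLOAT_REDUCE_REM_CHANNEL_END_C variants added here for the reference and zen2 platforms fold each output pixel's FLOAT_C_ob partial sums into channel 0 and zero the remaining entries; the guard means it only fires when the remainder path runs with a single output channel (_C_ob == 1). A minimal standalone sketch of that reduction written as a plain function (names and sizes are hypothetical):

// Hedged sketch of the same reduction the scalar macro body performs on c_tile.
void reduce_rem_channel(float *c_tile, unsigned O_w_left, unsigned C_ob)
{
    for (unsigned kk = 0; kk < O_w_left; kk++)      // one output pixel at a time
    {
        float *c_channel = c_tile + kk * C_ob;
        for (unsigned jj = 1; jj < C_ob; jj++)
        {
            c_channel[0] += c_channel[jj];          // accumulate every partial into lane 0
            c_channel[jj] = 0;                      // clear the consumed partial
        }
    }
}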