[onert-micro] Add cmsis-nn Pooling kernels (#11570)
This commit adds cmsis-nn pooling kernels: AveragePool2D and MaxPool2D.

ONE-DCO-1.0-Signed-off-by: Artem Balyshev <[email protected]>

Co-authored-by: Artem Balyshev <[email protected]>
BalyshevArtem and Artem Balyshev authored Sep 26, 2023
1 parent 53195e7 commit c3fb7e8
Showing 8 changed files with 256 additions and 77 deletions.
2 changes: 2 additions & 0 deletions onert-micro/luci-interpreter/pal/cmsisnn/KernelsToBuild.lst
@@ -1,5 +1,6 @@
REGISTER_KERNEL(ABS, Abs)
REGISTER_KERNEL(ADD, Add)
REGISTER_KERNEL(AVERAGE_POOL_2D, AveragePool2D)
REGISTER_KERNEL(ARG_MAX, ArgMax)
REGISTER_KERNEL(ARG_MIN, ArgMin)
REGISTER_KERNEL(DIV, Div)
@@ -29,6 +30,7 @@ REGISTER_KERNEL(LOGICAL_AND, LogicalAnd)
REGISTER_KERNEL(LOGICAL_OR, LogicalOr)
REGISTER_KERNEL(LEAKY_RELU, LeakyRelu)
REGISTER_KERNEL(MUL, Mul)
REGISTER_KERNEL(MAX_POOL_2D, MaxPool2D)
REGISTER_KERNEL(CONCATENATION, Concatenation)
REGISTER_KERNEL(SHAPE, Shape)
REGISTER_KERNEL(NOT_EQUAL, NotEqual)
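Registering AVERAGE_POOL_2D and MAX_POOL_2D in KernelsToBuild.lst is what typically pulls the new cmsis-nn implementations into the build: a list like this is usually consumed as an X-macro, where the interpreter defines REGISTER_KERNEL and then includes the file. The self-contained sketch below only illustrates that pattern; the KERNEL_LIST macro, the KernelEntry struct and the kKernels table are stand-ins, not the real onert-micro code.

#include <cstdio>

// Illustrative stand-in for "#include KernelsToBuild.lst"; the real list file
// supplies one REGISTER_KERNEL(...) line per enabled kernel.
#define KERNEL_LIST                               \
  REGISTER_KERNEL(AVERAGE_POOL_2D, AveragePool2D) \
  REGISTER_KERNEL(MAX_POOL_2D, MaxPool2D)

struct KernelEntry
{
  const char *op;
  const char *impl;
};

// Expand the list into a table by giving REGISTER_KERNEL a concrete definition.
#define REGISTER_KERNEL(builtin_operator, name) {#builtin_operator, #name},
static const KernelEntry kKernels[] = {KERNEL_LIST};
#undef REGISTER_KERNEL

int main()
{
  for (const auto &k : kKernels)
    std::printf("%s -> %s\n", k.op, k.impl);
  return 0;
}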
93 changes: 93 additions & 0 deletions onert-micro/luci-interpreter/pal/cmsisnn/PALAveragePool2D.h
@@ -0,0 +1,93 @@
/*
* Copyright (c) 2023 Samsung Electronics Co., Ltd. All Rights Reserved
* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#ifndef LUCI_INTERPRETER_PAL_CMSIS_NN_AVERAGE_POOL_2D_H
#define LUCI_INTERPRETER_PAL_CMSIS_NN_AVERAGE_POOL_2D_H

#include "PALAveragePool2DCommon.h"

#include <arm_nnfunctions.h>

namespace luci_interpreter_pal
{
inline void AveragePool(const PoolParams &params, const luci_interpreter::RuntimeShape &input_shape,
                        const uint8_t *input_data,
                        const luci_interpreter::RuntimeShape &output_shape, uint8_t *output_data,
                        luci_interpreter::DataType data_type)
{
  cmsis_nn_dims input_dims;
  cmsis_nn_dims output_dims;
  cmsis_nn_pool_params pool_params;
  cmsis_nn_dims filter_dims;
  cmsis_nn_context ctx;

  const int depth = input_shape.dims(3);
  const int output_width = output_shape.dims(2);

  input_dims.n = 1;
  input_dims.h = input_shape.dims(1);
  input_dims.w = input_shape.dims(2);
  input_dims.c = depth;

  output_dims.n = 1;
  output_dims.h = output_shape.dims(1);
  output_dims.w = output_width;
  output_dims.c = depth;

  pool_params.stride.h = params.stride_height;
  pool_params.stride.w = params.stride_width;
  pool_params.padding.h = params.padding_values.height;
  pool_params.padding.w = params.padding_values.width;
  pool_params.activation.min = params.quantized_activation_min;
  pool_params.activation.max = params.quantized_activation_max;

  filter_dims.n = 1;
  filter_dims.h = params.filter_height;
  filter_dims.w = params.filter_width;
  filter_dims.c = 1;

  const int32_t buffer_size = data_type == luci_interpreter::DataType::S16
                                ? arm_avgpool_s16_get_buffer_size(output_width, depth)
                                : arm_avgpool_s8_get_buffer_size(output_width, depth);
  int8_t *buffer = nullptr;
  if (buffer_size > 0)
  {
    buffer = new int8_t[buffer_size];
  }

  ctx.buf = buffer;
  ctx.size = buffer_size;

  if (data_type == luci_interpreter::DataType::S8)
  {
    arm_avgpool_s8(&ctx, &pool_params, &input_dims,
                   luci_interpreter::kernels::getTensorData<int8_t>(input_data), &filter_dims,
                   &output_dims, luci_interpreter::kernels::getTensorData<int8_t>(output_data));
  }
  else
  {
    arm_avgpool_s16(&ctx, &pool_params, &input_dims,
                    luci_interpreter::kernels::getTensorData<int16_t>(input_data), &filter_dims,
                    &output_dims, luci_interpreter::kernels::getTensorData<int16_t>(output_data));
  }

  if (buffer_size > 0)
    delete[] buffer;
}
} // namespace luci_interpreter_pal

#endif // LUCI_INTERPRETER_PAL_CMSIS_NN_AVERAGE_POOL_2D_H
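The CMSIS-NN average-pool kernels can require a scratch buffer whose size depends only on the output width and the channel count, which is why the PAL above queries arm_avgpool_s8_get_buffer_size / arm_avgpool_s16_get_buffer_size before the call and frees the buffer afterwards. Below is a small sketch of the same sizing step written with std::unique_ptr instead of raw new/delete; it is an illustration, not code from this commit, and the helper name makeAvgPoolScratch is made up.

#include <cstdint>
#include <memory>

#include <arm_nnfunctions.h>

// Illustrative: size the s8 scratch buffer once and keep ownership in a smart
// pointer; ctx.buf and ctx.size are what arm_avgpool_s8 actually consumes.
inline std::unique_ptr<int8_t[]> makeAvgPoolScratch(int output_width, int depth,
                                                    cmsis_nn_context &ctx)
{
  const int32_t size = arm_avgpool_s8_get_buffer_size(output_width, depth);
  std::unique_ptr<int8_t[]> buffer(size > 0 ? new int8_t[size] : nullptr);
  ctx.buf = buffer.get();
  ctx.size = size;
  return buffer; // must outlive the arm_avgpool_s8 call
}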
78 changes: 78 additions & 0 deletions onert-micro/luci-interpreter/pal/cmsisnn/PALMaxPool2D.h
@@ -0,0 +1,78 @@
/*
* Copyright (c) 2023 Samsung Electronics Co., Ltd. All Rights Reserved
* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#ifndef LUCI_INTERPRETER_PAL_CMSIS_NN_MAX_POOL_2D_H
#define LUCI_INTERPRETER_PAL_CMSIS_NN_MAX_POOL_2D_H

#include "PALMaxPool2DCommon.h"

#include <arm_nnfunctions.h>

namespace luci_interpreter_pal
{

inline void MaxPool(const PoolParams &params, const luci_interpreter::RuntimeShape &input_shape,
                    const uint8_t *input_data, const luci_interpreter::RuntimeShape &output_shape,
                    uint8_t *output_data, luci_interpreter::DataType data_type)
{
  cmsis_nn_dims input_dims;
  cmsis_nn_dims output_dims;
  cmsis_nn_pool_params pool_params;
  cmsis_nn_dims filter_dims;
  cmsis_nn_context ctx;

  const int depth = input_shape.dims(3);
  const int output_width = output_shape.dims(2);

  input_dims.n = 1;
  input_dims.h = input_shape.dims(1);
  input_dims.w = input_shape.dims(2);
  input_dims.c = depth;

  output_dims.n = 1;
  output_dims.h = output_shape.dims(1);
  output_dims.w = output_width;
  output_dims.c = depth;

  pool_params.stride.h = params.stride_height;
  pool_params.stride.w = params.stride_width;
  pool_params.padding.h = params.padding_values.height;
  pool_params.padding.w = params.padding_values.width;
  pool_params.activation.min = params.quantized_activation_min;
  pool_params.activation.max = params.quantized_activation_max;

  filter_dims.n = 1;
  filter_dims.h = params.filter_height;
  filter_dims.w = params.filter_width;
  filter_dims.c = 1;

  if (data_type == luci_interpreter::DataType::S8)
  {
    arm_max_pool_s8(&ctx, &pool_params, &input_dims,
                    luci_interpreter::kernels::getTensorData<int8_t>(input_data), &filter_dims,
                    &output_dims, luci_interpreter::kernels::getTensorData<int8_t>(output_data));
  }
  else
  {
    arm_max_pool_s16(&ctx, &pool_params, &input_dims,
                     luci_interpreter::kernels::getTensorData<int16_t>(input_data), &filter_dims,
                     &output_dims, luci_interpreter::kernels::getTensorData<int16_t>(output_data));
  }
}
} // namespace luci_interpreter_pal

#endif // LUCI_INTERPRETER_PAL_CMSIS_NN_MAX_POOL_2D_H
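Unlike the average-pool path, CMSIS-NN exposes no buffer-size query for max pooling, and the cmsis_nn_context is passed through here without a scratch buffer. The following standalone sketch shows how the same CMSIS-NN call is wired up outside the interpreter; the 1x4x4x2 shapes, stride and activation limits are invented for illustration, and maxPoolExample is not part of the commit.

#include <cstdint>

#include <arm_nnfunctions.h>

// Illustrative only: max-pool a 1x4x4x2 int8 tensor with a 2x2 window, stride 2,
// no padding, producing a 1x2x2x2 output.
void maxPoolExample(const int8_t *input, int8_t *output)
{
  cmsis_nn_context ctx{nullptr, 0};     // no scratch buffer needed for max pooling
  cmsis_nn_pool_params pool_params{};
  pool_params.stride = {2, 2};          // {w, h}
  pool_params.padding = {0, 0};         // {w, h}
  pool_params.activation = {-128, 127}; // {min, max}, i.e. no fused activation

  cmsis_nn_dims input_dims{1, 4, 4, 2};  // {n, h, w, c}
  cmsis_nn_dims filter_dims{1, 2, 2, 1}; // pooling window height/width in h/w
  cmsis_nn_dims output_dims{1, 2, 2, 2};

  arm_max_pool_s8(&ctx, &pool_params, &input_dims, input, &filter_dims, &output_dims, output);
}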
60 changes: 0 additions & 60 deletions onert-micro/luci-interpreter/pal/common/PALMaxPool2DCommon.h
@@ -81,66 +81,6 @@ inline void MaxPool(const PoolParams &params, const luci_interpreter::RuntimeSha
}
}

template <typename T>
inline void MaxPool(const PoolParams &params, const luci_interpreter::RuntimeShape &input_shape,
const T *input_data, const luci_interpreter::RuntimeShape &output_shape,
T *output_data)
{
const int batches = input_shape.dims(0);
const int depth = output_shape.dims(3);
const int input_height = input_shape.dims(1);
const int input_width = input_shape.dims(2);
const int output_height = output_shape.dims(1);
const int output_width = output_shape.dims(2);
const int stride_height = params.stride_height;
const int stride_width = params.stride_width;
for (int batch = 0; batch < batches; ++batch)
{
for (int out_y = 0; out_y < output_height; ++out_y)
{
for (int out_x = 0; out_x < output_width; ++out_x)
{
for (int channel = 0; channel < depth; ++channel)
{
const int in_x_origin = (out_x * stride_width) - params.padding_values.width;
const int in_y_origin = (out_y * stride_height) - params.padding_values.height;
// Compute the boundaries of the filter region clamped so as to
// ensure that the filter window fits in the input array.
const int filter_x_start = std::max(0, -in_x_origin);
const int filter_x_end = std::min(params.filter_width, input_width - in_x_origin);
const int filter_y_start = std::max(0, -in_y_origin);
const int filter_y_end = std::min(params.filter_height, input_height - in_y_origin);
T max = std::numeric_limits<T>::lowest();
for (int filter_y = filter_y_start; filter_y < filter_y_end; ++filter_y)
{
for (int filter_x = filter_x_start; filter_x < filter_x_end; ++filter_x)
{
const int in_x = in_x_origin + filter_x;
const int in_y = in_y_origin + filter_y;

const int input_data_offset =
((batch * input_shape.dims(1) + in_y) * input_shape.dims(2) + in_x) *
input_shape.dims(3) +
channel;

max = std::max(max, input_data[input_data_offset]);
}
}
max = std::max<T>(max, params.quantized_activation_min);
max = std::min<T>(max, params.quantized_activation_max);

const int output_data_offset =
((batch * output_shape.dims(1) + out_y) * output_shape.dims(2) + out_x) *
output_shape.dims(3) +
channel;

output_data[output_data_offset] = static_cast<T>(max);
}
}
}
}
}

} // namespace luci_interpreter_pal

#endif // LUCI_INTERPRETER_PAL_MAX_POOL_2D_COMMON_H
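The scalar fallback removed above walks the NHWC layout directly, flattening (batch, y, x, channel) with the offset ((batch * H + y) * W + x) * C + channel. A tiny self-contained check of that index arithmetic, with values invented purely for illustration:

#include <cassert>

// For a 1x4x4x2 NHWC tensor, element (batch=0, y=2, x=1, c=1) lives at
// ((0*4 + 2)*4 + 1)*2 + 1 = 19.
int main()
{
  const int H = 4, W = 4, C = 2;
  const int batch = 0, y = 2, x = 1, c = 1;
  const int offset = ((batch * H + y) * W + x) * C + c;
  assert(offset == 19);
  return 0;
}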
9 changes: 8 additions & 1 deletion onert-micro/luci-interpreter/pal/mcu/PALAveragePool2D.h
@@ -22,7 +22,14 @@

namespace luci_interpreter_pal
{
// TODO: add S8 and S16 kernel

inline void AveragePool(const PoolParams &, const luci_interpreter::RuntimeShape &, const uint8_t *,
                        const luci_interpreter::RuntimeShape &, uint8_t *,
                        luci_interpreter::DataType)
{
  assert(false && "Not impl yet");
}

} // namespace luci_interpreter_pal

#endif // LUCI_INTERPRETER_PAL_AVERAGE_POOL_2D_H
6 changes: 5 additions & 1 deletion onert-micro/luci-interpreter/pal/mcu/PALMaxPool2D.h
@@ -22,7 +22,11 @@

namespace luci_interpreter_pal
{
// TODO: Add INT8, INT16 kernels
inline void MaxPool(const PoolParams &, const luci_interpreter::RuntimeShape &, const uint8_t *,
                    const luci_interpreter::RuntimeShape &, uint8_t *, luci_interpreter::DataType)
{
  assert(false && "Not impl yet");
}
} // namespace luci_interpreter_pal

#endif // LUCI_INTERPRETER_PAL_MAX_POOL_2D_H
42 changes: 38 additions & 4 deletions onert-micro/luci-interpreter/src/kernels/AveragePool2D.cpp
@@ -36,7 +36,7 @@ void configure_kernel_CircleAveragePool2D(const circle::Operator *cur_op,
const auto output = runtime_graph->getCircleTensorByIndex(output_index);

LUCI_INTERPRETER_CHECK(Tensor::element_type(input) == Tensor::element_type(output));
assert(Tensor::num_dims(input) == 4);
LUCI_INTERPRETER_CHECK(Tensor::num_dims(input) == 4);
}

void execute_kernel_CircleAveragePool2D(const circle::Operator *cur_op,
@@ -69,10 +69,34 @@ void execute_kernel_CircleAveragePool2D(const circle::Operator *cur_op,
const auto *input_data = runtime_graph->getDataByTensor(input);
auto *output_data = runtime_graph->getDataByTensor(output);

const DataType input_type = Tensor::element_type(input);

float activation_min{};
float activation_max{};
kernels::calculateActivationRange(luci_actfunc(options->fused_activation_function()),
&activation_min, &activation_max);

int32_t quantized_activation_min{};
int32_t quantized_activation_max{};

if (input_type == DataType::S8 or input_type == DataType::S16)
{
#ifndef DIS_QUANT
kernels::calculateActivationRangeQuantized(luci_actfunc(options->fused_activation_function()),
output, &quantized_activation_min,
&quantized_activation_max);
#endif // DIS_QUANT
}
else if (input_type == DataType::FLOAT32)
{
#ifndef DIS_FLOAT
kernels::calculateActivationRange(luci_actfunc(options->fused_activation_function()),
&activation_min, &activation_max);
#endif // DIS_FLOAT
}
else
{
assert(false && "Not supported type");
}

luci_interpreter_pal::PoolParams params{};
params.padding_values.height = padding_height;
params.padding_values.width = padding_width;
@@ -82,8 +106,10 @@ void execute_kernel_CircleAveragePool2D(const circle::Operator *cur_op,
params.filter_width = options->filter_width();
params.float_activation_min = activation_min;
params.float_activation_max = activation_max;
params.quantized_activation_max = quantized_activation_max;
params.quantized_activation_min = quantized_activation_min;

switch (Tensor::element_type(input))
switch (input_type)
{
#ifndef DIS_FLOAT
case DataType::FLOAT32:
Expand All @@ -92,6 +118,14 @@ void execute_kernel_CircleAveragePool2D(const circle::Operator *cur_op,
kernels::getTensorShape(output), kernels::getTensorData<float>(output_data));
break;
#endif // DIS_FLOAT
#ifndef DIS_QUANT
case DataType::S8:
case DataType::S16:
luci_interpreter_pal::AveragePool(
params, kernels::getTensorShape(input), kernels::getTensorData<uint8_t>(input_data),
kernels::getTensorShape(output), kernels::getTensorData<uint8_t>(output_data), input_type);
break;
#endif // DIS_QUANT
default:
assert(false && "Unsupported type.");
}
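In the S8/S16 branch above, params.quantized_activation_min/max are filled by kernels::calculateActivationRangeQuantized, which maps the fused activation's float bounds into the output tensor's quantized domain. The sketch below shows the usual shape of that computation for int8; it is an assumption modeled on the common TFLite-style helper, not a copy of the onert-micro implementation.

#include <algorithm>
#include <cmath>
#include <cstdint>
#include <limits>

// Illustrative: clamp a float activation range [act_min, act_max] into the
// int8 quantized domain defined by the output's scale and zero point.
inline void activationRangeQuantizedS8(float act_min, float act_max, float scale,
                                       int32_t zero_point, int32_t *q_min, int32_t *q_max)
{
  const int32_t type_min = std::numeric_limits<int8_t>::min(); // -128
  const int32_t type_max = std::numeric_limits<int8_t>::max(); // 127
  const auto quantize = [&](float v) {
    return zero_point + static_cast<int32_t>(std::round(v / scale));
  };
  *q_min = std::max(type_min, quantize(act_min));
  *q_max = std::min(type_max, quantize(act_max));
}

// Example: RELU6 (act_min = 0, act_max = 6) with scale 0.05 and zero point -10
// yields q_min = -10 and q_max = std::min(127, -10 + 120) = 110.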