From 9d1783b632bd64c4b94b41c4d76649e89b5eddc5 Mon Sep 17 00:00:00 2001 From: Artem Balyshev Date: Wed, 4 Oct 2023 14:44:37 +0300 Subject: [PATCH] [onert-micro] Add float Mean kernels This commit adds float Mean kernels for onert-micro. ONE-DCO-1.0-Signed-off-by: Artem Balyshev --- .../test_models/mean/FloatMeanKernel.h | 132 +++++++ .../test_models/mean/NegMeanKernel.h | 92 +++++ .../test_models/mean/TestDataMeanBase.h | 60 +++ .../pal/cmsisnn/KernelsToBuild.lst | 1 + .../luci-interpreter/pal/common/PALMean.h | 205 ++++++++++ .../luci-interpreter/pal/common/Params.h | 6 + .../pal/mcu/KernelsToBuild.lst | 1 + .../luci-interpreter/src/kernels/Mean.cpp | 360 ++++-------------- .../luci-interpreter/src/kernels/Mean.h | 55 --- .../src/kernels/Mean.test.cpp | 233 ++---------- 10 files changed, 603 insertions(+), 542 deletions(-) create mode 100644 onert-micro/luci-interpreter/include/luci_interpreter/test_models/mean/FloatMeanKernel.h create mode 100644 onert-micro/luci-interpreter/include/luci_interpreter/test_models/mean/NegMeanKernel.h create mode 100644 onert-micro/luci-interpreter/include/luci_interpreter/test_models/mean/TestDataMeanBase.h create mode 100644 onert-micro/luci-interpreter/pal/common/PALMean.h delete mode 100644 onert-micro/luci-interpreter/src/kernels/Mean.h diff --git a/onert-micro/luci-interpreter/include/luci_interpreter/test_models/mean/FloatMeanKernel.h b/onert-micro/luci-interpreter/include/luci_interpreter/test_models/mean/FloatMeanKernel.h new file mode 100644 index 00000000000..909b7410cb1 --- /dev/null +++ b/onert-micro/luci-interpreter/include/luci_interpreter/test_models/mean/FloatMeanKernel.h @@ -0,0 +1,132 @@ +/* + * Copyright (c) 2023 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef LUCI_INTERPRETER_TEST_MODELS_FLOAT_MEAN_KERNEL_H +#define LUCI_INTERPRETER_TEST_MODELS_FLOAT_MEAN_KERNEL_H + +#include "TestDataMeanBase.h" + +namespace luci_interpreter +{ +namespace test_kernel +{ +namespace mean_float +{ +/* + * Mean Kernel: + * + * Input(1, 8, 8, 4) + * | + * Mean + * | + * Output(1, 8, 8, 1) + */ +const unsigned char test_kernel_model_circle[] = { + 0x18, 0x00, 0x00, 0x00, 0x43, 0x49, 0x52, 0x30, 0x00, 0x00, 0x0e, 0x00, 0x14, 0x00, 0x00, 0x00, + 0x0c, 0x00, 0x08, 0x00, 0x10, 0x00, 0x04, 0x00, 0x0e, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, + 0x48, 0x00, 0x00, 0x00, 0x94, 0x01, 0x00, 0x00, 0xb0, 0x01, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, + 0x34, 0x00, 0x00, 0x00, 0x28, 0x00, 0x00, 0x00, 0x20, 0x00, 0x00, 0x00, 0x0c, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x06, 0x00, 0x08, 0x00, 0x04, 0x00, 0x06, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, + 0x04, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0xf8, 0xff, 0xff, 0xff, 0xfc, 0xff, 0xff, 0xff, + 0x04, 0x00, 0x04, 0x00, 0x04, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x0e, 0x00, 0x18, 0x00, 0x14, 0x00, 0x10, 0x00, 0x0c, 0x00, 0x08, 0x00, 0x04, 0x00, + 0x0e, 0x00, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00, 0x1c, 0x00, 0x00, 0x00, 0x68, 0x00, 0x00, 0x00, + 0x6c, 0x00, 0x00, 0x00, 0x70, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x6d, 0x61, 0x69, 0x6e, + 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00, 0x00, 0x00, 0x0e, 0x00, + 0x16, 0x00, 0x00, 0x00, 0x10, 0x00, 0x0c, 0x00, 0x07, 0x00, 0x08, 0x00, 0x0e, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x1b, 0x14, 0x00, 0x00, 0x00, 0x18, 0x00, 0x00, 0x00, 0x1c, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x06, 0x00, 0x08, 0x00, 0x07, 0x00, 0x06, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, + 0x01, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x84, 0x00, 0x00, 0x00, 0x40, 0x00, 0x00, 0x00, + 0x04, 0x00, 0x00, 0x00, 0x94, 0xff, 0xff, 0xff, 0x0c, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, + 0x0c, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x6f, 0x66, 0x6d, 0x00, 0x04, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, + 0x0c, 0x00, 0x14, 0x00, 0x10, 0x00, 0x0f, 0x00, 0x08, 0x00, 0x04, 0x00, 0x0c, 0x00, 0x00, 0x00, + 0x10, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x02, 0x1c, 0x00, 0x00, 0x00, + 0x11, 0x00, 0x00, 0x00, 0x72, 0x65, 0x64, 0x75, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x5f, 0x69, 0x6e, + 0x64, 0x69, 0x63, 0x65, 0x73, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, + 0x0c, 0x00, 0x10, 0x00, 0x0c, 0x00, 0x00, 0x00, 0x08, 0x00, 0x04, 0x00, 0x0c, 0x00, 0x00, 0x00, + 0x0c, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x0c, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, + 0x69, 0x66, 0x6d, 0x00, 0x04, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00, + 0x08, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, + 0x0c, 0x00, 0x0c, 0x00, 0x0b, 0x00, 0x00, 0x00, 0x00, 0x00, 0x04, 0x00, 0x0c, 0x00, 0x00, 0x00, + 0x28, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x28, 0x11, 0x00, 0x00, 0x00, 0x4f, 0x4e, 0x45, 0x2d, + 0x74, 0x66, 0x6c, 0x69, 0x74, 0x65, 0x32, 0x63, 0x69, 0x72, 0x63, 0x6c, 0x65, 0x00, 0x00, 0x00}; + +const std::vector input_data = { + -73.19745, -62.66789, -15.868883, -69.99245, -86.77558, -47.07158, -59.42521, 5.4639907, 
+ -15.482954, 58.430527, 30.962307, -8.479264, 64.87171, 67.23879, 54.92413, -75.001656, + 4.095402, -11.012883, 1.7135352, -13.673498, 87.62411, 88.27154, 86.84994, 61.68961, + -67.81691, -36.073383, 54.346165, -83.79197, 35.099308, -23.05919, 26.401726, 20.99549, + -68.63421, -93.027596, 20.0895, -16.020033, 57.642673, 8.66057, 39.191364, 29.198711, + -5.9334397, 11.010835, 82.77485, -34.213863, -38.869553, 16.539444, 51.105484, 25.632273, + -55.436813, -26.42026, 77.96095, -59.019154, -82.52756, -94.416176, -83.77591, 46.43875, + 0.7686069, 57.346397, -89.24597, -8.594538, -98.168755, -33.18969, -41.993664, 13.660449, + 50.10378, 9.801906, -4.2520585, 27.210102, 48.8715, -19.44194, 38.652195, 23.77053, + -82.0674, -93.96652, 99.148094, 22.794533, 0.5715625, 0.84766275, 87.92019, 37.35077, + -32.265865, 67.46462, -24.098558, 87.36311, 90.409134, 33.023712, -15.923093, 40.05901, + -12.006578, 31.039108, -63.882004, -73.78517, -24.940235, 30.9098, 31.745, -89.77378, + -46.777866, 58.79768, -24.669464, 96.29413, 61.62126, 45.743416, 38.30191, 71.805405, + -31.20969, 33.56755, -1.926614, 72.13441, -22.292011, -16.355177, 21.689945, 87.95895, + -98.04168, 93.35264, -12.684541, -18.105795, 30.574284, 42.890903, -94.390366, -47.013157, + -98.465126, 28.63009, -83.54015, 86.82799, 0.6768988, 6.070787, 43.308678, 1.8557712, + -73.0521, -90.86948, 43.77232, 68.301056, 66.867775, 97.34002, -59.342876, -51.359367, + 17.27793, 52.223003, -3.9915564, 29.598532, 34.474148, -80.920456, -30.45005, -17.469683, + -67.02992, -34.23075, -35.53944, 61.557327, -66.91338, -94.03176, -45.88021, 97.36409, + 96.45681, -32.885677, 72.40823, -62.28857, 20.948895, 1.259363, -84.97583, 60.83626, + -94.692535, -15.315798, -99.92936, 40.56625, -8.6356325, -7.3984733, 56.255993, -31.700819, + 62.08311, 52.800938, 32.27374, -99.46793, -40.924038, 24.67266, -58.954403, 42.263252, + -72.13501, -58.40316, 14.619292, -43.400642, -82.13468, -47.54976, -42.642033, -8.409653, + 74.90983, 97.76474, -71.152916, 83.61312, -37.22972, 21.405357, -56.848846, 90.63024, + -70.21143, -29.522697, 94.9647, 74.74478, 37.564766, -40.22343, -63.337795, -65.86191, + -48.546135, -58.20052, 36.73888, 67.78194, -43.096832, 94.7046, 9.798892, -79.97487, + -15.868657, -84.753975, 4.8745494, -18.346195, 54.9818, 75.854, 41.797707, -5.673281, + -36.31264, -73.4931, -41.090492, 6.3805137, -73.66098, 85.20992, 91.28027, -73.26658, + -92.18044, 41.29011, 5.5041995, -73.70062, -16.678818, 30.614132, 92.100555, 11.274231, + -37.915485, 34.91591, 36.32971, -37.70164, -23.708878, 19.026278, -41.71216, 67.325356, + 78.23511, -43.154037, 22.667723, 30.742237, -6.086414, 17.191307, 65.828896, -40.83338, + -18.61725, 23.976517, 80.2347, -92.53064, 71.6477, -38.28841, -60.853157, 24.402542}; + +const std::vector reference_output_data = { + -55.431667, -46.952095, 16.357655, 28.008245, -4.7193613, 81.108795, -33.334023, 14.859333, + -39.398083, 33.673332, 13.409595, 13.601912, -15.728818, -53.57022, -9.9313755, -39.922916, + 20.71593, 22.963072, -13.522823, 31.672546, 24.615828, 36.89219, -29.65866, -13.014804, + 20.91112, 54.368, 18.141413, 17.750427, -8.869844, -16.984585, -16.636799, 12.978033, + -12.962048, 13.376387, 23.776978, -23.59151, -18.810696, -27.365314, 18.422699, -0.4828272, + -42.342857, 2.1302667, 11.922464, -8.235632, -39.82988, -45.184032, 46.28369, 4.489258, + 17.493837, -32.964592, -0.55646133, -4.6420527, -28.523571, 41.74006, -36.128933, 7.3906593, + -29.771688, 29.327526, -1.0928774, 5.232649, 22.122757, 9.025103, -1.7341671, -0.7728319}; 
+ +} // namespace mean_float + +class TestDataFloatMean : public TestDataMeanBase +{ +public: + TestDataFloatMean() + { + _input_data = mean_float::input_data; + _reference_output_data = mean_float::reference_output_data; + _test_kernel_model_circle = mean_float::test_kernel_model_circle; + } + + ~TestDataFloatMean() override = default; +}; + +} // namespace test_kernel +} // namespace luci_interpreter + +#endif // LUCI_INTERPRETER_TEST_MODELS_FLOAT_MEAN_KERNEL_H diff --git a/onert-micro/luci-interpreter/include/luci_interpreter/test_models/mean/NegMeanKernel.h b/onert-micro/luci-interpreter/include/luci_interpreter/test_models/mean/NegMeanKernel.h new file mode 100644 index 00000000000..708f05e802d --- /dev/null +++ b/onert-micro/luci-interpreter/include/luci_interpreter/test_models/mean/NegMeanKernel.h @@ -0,0 +1,92 @@ +/* + * Copyright (c) 2023 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LUCI_INTERPRETER_TEST_MODELS_NEG_MEAN_KERNEL_H +#define LUCI_INTERPRETER_TEST_MODELS_NEG_MEAN_KERNEL_H + +#include "luci_interpreter/test_models/TestDataBase.h" + +namespace luci_interpreter +{ +namespace test_kernel +{ +namespace neg_input_output_type_mismatch_mean_kernel +{ +/* + * Mean Kernel with input output type mismatch: + * + * Input(1, 8, 8, 4) - Float32 + * | + * Mean + * | + * Output(1, 8, 8, 1) - Int32 + */ +const unsigned char test_kernel_model_circle[] = { + 0x18, 0x00, 0x00, 0x00, 0x43, 0x49, 0x52, 0x30, 0x00, 0x00, 0x0e, 0x00, 0x14, 0x00, 0x00, 0x00, + 0x0c, 0x00, 0x08, 0x00, 0x10, 0x00, 0x04, 0x00, 0x0e, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, + 0x48, 0x00, 0x00, 0x00, 0x98, 0x01, 0x00, 0x00, 0xb4, 0x01, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, + 0x34, 0x00, 0x00, 0x00, 0x28, 0x00, 0x00, 0x00, 0x20, 0x00, 0x00, 0x00, 0x0c, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x06, 0x00, 0x08, 0x00, 0x04, 0x00, 0x06, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, + 0x04, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0xf8, 0xff, 0xff, 0xff, 0xfc, 0xff, 0xff, 0xff, + 0x04, 0x00, 0x04, 0x00, 0x04, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x0e, 0x00, 0x18, 0x00, 0x14, 0x00, 0x10, 0x00, 0x0c, 0x00, 0x08, 0x00, 0x04, 0x00, + 0x0e, 0x00, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00, 0x1c, 0x00, 0x00, 0x00, 0x68, 0x00, 0x00, 0x00, + 0x6c, 0x00, 0x00, 0x00, 0x70, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x6d, 0x61, 0x69, 0x6e, + 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00, 0x00, 0x00, 0x0e, 0x00, + 0x16, 0x00, 0x00, 0x00, 0x10, 0x00, 0x0c, 0x00, 0x07, 0x00, 0x08, 0x00, 0x0e, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x1b, 0x14, 0x00, 0x00, 0x00, 0x18, 0x00, 0x00, 0x00, 0x1c, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x06, 0x00, 0x08, 0x00, 0x07, 0x00, 0x06, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, + 0x01, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 
0x00, 0x03, 0x00, 0x00, 0x00, 0x88, 0x00, 0x00, 0x00, 0x44, 0x00, 0x00, 0x00,
+  0x04, 0x00, 0x00, 0x00, 0xd0, 0xff, 0xff, 0xff, 0x10, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00,
+  0x00, 0x00, 0x00, 0x02, 0x0c, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x6f, 0x66, 0x6d, 0x00,
+  0x04, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00,
+  0x01, 0x00, 0x00, 0x00, 0x0c, 0x00, 0x14, 0x00, 0x10, 0x00, 0x0f, 0x00, 0x08, 0x00, 0x04, 0x00,
+  0x0c, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x02,
+  0x1c, 0x00, 0x00, 0x00, 0x11, 0x00, 0x00, 0x00, 0x72, 0x65, 0x64, 0x75, 0x63, 0x74, 0x69, 0x6f,
+  0x6e, 0x5f, 0x69, 0x6e, 0x64, 0x69, 0x63, 0x65, 0x73, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00,
+  0x01, 0x00, 0x00, 0x00, 0x0c, 0x00, 0x10, 0x00, 0x0c, 0x00, 0x00, 0x00, 0x08, 0x00, 0x04, 0x00,
+  0x0c, 0x00, 0x00, 0x00, 0x0c, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x0c, 0x00, 0x00, 0x00,
+  0x03, 0x00, 0x00, 0x00, 0x69, 0x66, 0x6d, 0x00, 0x04, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00,
+  0x08, 0x00, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00,
+  0x10, 0x00, 0x00, 0x00, 0x0c, 0x00, 0x0c, 0x00, 0x0b, 0x00, 0x00, 0x00, 0x00, 0x00, 0x04, 0x00,
+  0x0c, 0x00, 0x00, 0x00, 0x28, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x28, 0x11, 0x00, 0x00, 0x00,
+  0x4f, 0x4e, 0x45, 0x2d, 0x74, 0x66, 0x6c, 0x69, 0x74, 0x65, 0x32, 0x63, 0x69, 0x72, 0x63, 0x6c,
+  0x65, 0x00, 0x00, 0x00};
+} // namespace neg_input_output_type_mismatch_mean_kernel
+
+class NegTestDataInputOutputTypeMismatchMeanKernel : public NegTestDataBase
+{
+public:
+  NegTestDataInputOutputTypeMismatchMeanKernel()
+  {
+    _test_kernel_model_circle =
+      neg_input_output_type_mismatch_mean_kernel::test_kernel_model_circle;
+  }
+
+  ~NegTestDataInputOutputTypeMismatchMeanKernel() override = default;
+
+  const unsigned char *get_model_ptr() override final { return _test_kernel_model_circle; }
+
+protected:
+  const unsigned char *_test_kernel_model_circle;
+};
+
+} // namespace test_kernel
+} // namespace luci_interpreter
+
+#endif // LUCI_INTERPRETER_TEST_MODELS_NEG_MEAN_KERNEL_H
diff --git a/onert-micro/luci-interpreter/include/luci_interpreter/test_models/mean/TestDataMeanBase.h b/onert-micro/luci-interpreter/include/luci_interpreter/test_models/mean/TestDataMeanBase.h
new file mode 100644
index 00000000000..88928992864
--- /dev/null
+++ b/onert-micro/luci-interpreter/include/luci_interpreter/test_models/mean/TestDataMeanBase.h
@@ -0,0 +1,60 @@
+/*
+ * Copyright (c) 2023 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */ + +#ifndef LUCI_INTERPRETER_TEST_MODELS_MEAN_KERNEL_BASE_H +#define LUCI_INTERPRETER_TEST_MODELS_MEAN_KERNEL_BASE_H + +#include "luci_interpreter/test_models/TestDataBase.h" + +namespace luci_interpreter +{ +namespace test_kernel +{ + +template class TestDataMeanBase : public TestDataBase +{ +public: + TestDataMeanBase() = default; + + const unsigned char *get_model_ptr() override final { return _test_kernel_model_circle; } + + const std::vector &get_input_data_by_index(int i) override final + { + switch (i) + { + case 0: + return _input_data; + default: + assert(false && "Wrong input index"); + } + } + + const std::vector &get_output_data_by_index(int i) override final + { + assert(i == 0); + return _reference_output_data; + } + +protected: + std::vector _input_data; + std::vector _reference_output_data; + const unsigned char *_test_kernel_model_circle; +}; + +} // namespace test_kernel +} // namespace luci_interpreter + +#endif // LUCI_INTERPRETER_TEST_MODELS_MEAN_KERNEL_BASE_H diff --git a/onert-micro/luci-interpreter/pal/cmsisnn/KernelsToBuild.lst b/onert-micro/luci-interpreter/pal/cmsisnn/KernelsToBuild.lst index e92ce5e85f1..0d135429f22 100644 --- a/onert-micro/luci-interpreter/pal/cmsisnn/KernelsToBuild.lst +++ b/onert-micro/luci-interpreter/pal/cmsisnn/KernelsToBuild.lst @@ -40,6 +40,7 @@ REGISTER_KERNEL(LEAKY_RELU, LeakyRelu) REGISTER_KERNEL(LOG_SOFTMAX, LogSoftmax) REGISTER_KERNEL(MUL, Mul) REGISTER_KERNEL(MAX_POOL_2D, MaxPool2D) +REGISTER_KERNEL(MEAN, Mean) REGISTER_KERNEL(CONCATENATION, Concatenation) REGISTER_KERNEL(SHAPE, Shape) REGISTER_KERNEL(NOT_EQUAL, NotEqual) diff --git a/onert-micro/luci-interpreter/pal/common/PALMean.h b/onert-micro/luci-interpreter/pal/common/PALMean.h new file mode 100644 index 00000000000..f2926af5522 --- /dev/null +++ b/onert-micro/luci-interpreter/pal/common/PALMean.h @@ -0,0 +1,205 @@ +/* + * Copyright (c) 2023 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright 2019 The TensorFlow Authors. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LUCI_INTERPRETER_PAL_MEAN_COMMON_H +#define LUCI_INTERPRETER_PAL_MEAN_COMMON_H + +#include "Params.h" +#include "PALUtils.h" + +namespace luci_interpreter_pal +{ +namespace +{ +// This method parses the input 'axis' to remove duplicates and handle negative +// values, and returns a valid 'out_axis' +inline bool resolveAxis(const int num_dims, const int *axis, const int64_t num_axis, int *out_axis, + int *out_num_axis) +{ + *out_num_axis = 0; // Just in case. + // Short-circuit axis resolution for scalars; the axis will go unused. + if (num_dims == 0) + { + return true; + } + // o(n^2) is fine since out_num_axis should be really small, mostly <= 4 + for (int64_t idx = 0; idx < num_axis; ++idx) + { + // Handle negative index. A positive index 'p_idx' can be represented as a + // negative index 'n_idx' as: n_idx = p_idx-num_dims + // eg: For num_dims=3, [0, 1, 2] is the same as [-3, -2, -1] */ + int current = axis[idx] < 0 ? 
(axis[idx] + num_dims) : axis[idx]; + if (current < 0 || current >= num_dims) + { + return false; + } + bool is_dup = false; + for (int j = 0; j < *out_num_axis; ++j) + { + if (out_axis[j] == current) + { + is_dup = true; + break; + } + } + if (!is_dup) + { + out_axis[*out_num_axis] = current; + *out_num_axis += 1; + } + } + return true; +} + +// A generic reduce method that can be used for reduce_sum, reduce_mean, etc. +// This method iterates through input data and reduce elements along the +// dimensions given in axis. +template +inline bool reduce(const In *input_data, const int *input_dims, const int *, + const int input_num_dims, const int, const int *axis, const int num_axis, + int *input_iter, Out reducer(Out, const In), Out *output_data) +{ + // Reset input iterator. + for (int idx = 0; idx < input_num_dims; ++idx) + { + input_iter[idx] = 0; + } + // Iterate through input_data. + do + { + size_t input_offset = reducedOutputOffset(input_num_dims, input_dims, input_iter, 0, nullptr); + size_t output_offset = + reducedOutputOffset(input_num_dims, input_dims, input_iter, num_axis, axis); + output_data[output_offset] = reducer(output_data[output_offset], input_data[input_offset]); + } while (nextIndex(input_num_dims, input_dims, input_iter)); + return true; +} + +// This method expects that output_data has been initialized. +template +inline bool reduceSumImpl(const In *input_data, const int *input_dims, const int *output_dims, + const int input_num_dims, const int output_num_dims, const int *axis, + const int num_axis, int *input_iter, Out *output_data) +{ + auto reducer = [](const Out current, const In in) -> Out { + const Out actual_in = static_cast(in); + return current + actual_in; + }; + return reduce(input_data, input_dims, output_dims, input_num_dims, output_num_dims, axis, + num_axis, input_iter, reducer, output_data); +} +} // namespace + +template +inline bool Mean(const T *input_data, const int *input_dims, const int input_num_dims, + T *output_data, const int *output_dims, const int output_num_dims, const int *axis, + const int num_axis_dimensions, bool, int *temp_index, int *resolved_axis, + U *temp_sum) +{ + // Reset output data. + size_t num_outputs = 1; + for (int idx = 0; idx < output_num_dims; ++idx) + { + size_t current = static_cast(output_dims[idx]); + // Overflow prevention. + if (num_outputs > std::numeric_limits::max() / current) + { + return false; + } + num_outputs *= current; + } + for (size_t idx = 0; idx < num_outputs; ++idx) + { + output_data[idx] = T(); + temp_sum[idx] = U(); + } + + // Resolve axis. + int num_resolved_axis = 0; + if (!resolveAxis(input_num_dims, axis, num_axis_dimensions, resolved_axis, &num_resolved_axis)) + { + return false; + } + + if (!reduceSumImpl(input_data, input_dims, output_dims, input_num_dims, output_num_dims, + resolved_axis, num_resolved_axis, temp_index, temp_sum)) + { + return false; + } + + // Calculate mean by dividing output_data by num of aggregated element. + size_t num_elements_in_axis = 1; + for (int idx = 0; idx < num_resolved_axis; ++idx) + { + size_t current = static_cast(input_dims[resolved_axis[idx]]); + // Overflow prevention. 
+ if (current > (std::numeric_limits::max() / num_elements_in_axis)) + { + return false; + } + num_elements_in_axis *= current; + } + + if (num_elements_in_axis > 0) + { + for (size_t idx = 0; idx < num_outputs; ++idx) + { + output_data[idx] = static_cast(temp_sum[idx] / static_cast(num_elements_in_axis)); + } + } + return true; +} + +inline void Mean(const MeanParams &op_params, + const luci_interpreter::RuntimeShape &unextended_input_shape, + const float *input_data, + const luci_interpreter::RuntimeShape &unextended_output_shape, float *output_data) +{ + // Current implementation only supports dimension equals 4 and simultaneous + // reduction over width and height. + const luci_interpreter::RuntimeShape input_shape = + luci_interpreter::RuntimeShape::extendedShape(4, unextended_input_shape); + const luci_interpreter::RuntimeShape output_shape = + luci_interpreter::RuntimeShape::extendedShape(4, unextended_output_shape); + + const int output_batch = output_shape.dims(0); + const int output_depth = output_shape.dims(3); + + const int input_height = input_shape.dims(1); + const int input_width = input_shape.dims(2); + + for (int out_b = 0; out_b < output_batch; ++out_b) + { + for (int out_d = 0; out_d < output_depth; ++out_d) + { + float value = 0; + for (int in_h = 0; in_h < input_height; ++in_h) + { + for (int in_w = 0; in_w < input_width; ++in_w) + { + value += input_data[offset(input_shape.dimsData(), out_b, in_h, in_w, out_d)]; + } + } + output_data[offset(output_shape.dimsData(), out_b, 0, 0, out_d)] = + value / (input_width * input_height); + } + } +} + +} // namespace luci_interpreter_pal + +#endif // LUCI_INTERPRETER_PAL_MEAN_COMMON_H diff --git a/onert-micro/luci-interpreter/pal/common/Params.h b/onert-micro/luci-interpreter/pal/common/Params.h index d641ab87bc1..a6b8e46bb45 100644 --- a/onert-micro/luci-interpreter/pal/common/Params.h +++ b/onert-micro/luci-interpreter/pal/common/Params.h @@ -21,6 +21,12 @@ namespace luci_interpreter_pal { +struct MeanParams +{ + int8_t axis_count; + int16_t axis[4]; +}; + struct PadParams { int8_t left_padding_count; diff --git a/onert-micro/luci-interpreter/pal/mcu/KernelsToBuild.lst b/onert-micro/luci-interpreter/pal/mcu/KernelsToBuild.lst index 04e28bb89c4..f8fc3f75568 100644 --- a/onert-micro/luci-interpreter/pal/mcu/KernelsToBuild.lst +++ b/onert-micro/luci-interpreter/pal/mcu/KernelsToBuild.lst @@ -45,6 +45,7 @@ REGISTER_KERNEL(LEAKY_RELU, LeakyRelu) REGISTER_KERNEL(LOG_SOFTMAX, LogSoftmax) REGISTER_KERNEL(MUL, Mul) REGISTER_KERNEL(MAXIMUM, Maximum) +REGISTER_KERNEL(MEAN, Mean) REGISTER_KERNEL(MAX_POOL_2D, MaxPool2D) REGISTER_KERNEL(MINIMUM, Minimum) REGISTER_KERNEL(CONCATENATION, Concatenation) diff --git a/onert-micro/luci-interpreter/src/kernels/Mean.cpp b/onert-micro/luci-interpreter/src/kernels/Mean.cpp index 4128aa68d1e..1b87336f55c 100644 --- a/onert-micro/luci-interpreter/src/kernels/Mean.cpp +++ b/onert-micro/luci-interpreter/src/kernels/Mean.cpp @@ -1,6 +1,5 @@ /* - * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved - * Copyright 2019 The TensorFlow Authors. All Rights Reserved. + * Copyright (c) 2023 Samsung Electronics Co., Ltd. All Rights Reserved * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -15,330 +14,103 @@ * limitations under the License. 
*/ -#include "kernels/Mean.h" - +#include "Builders.h" #include "kernels/Utils.h" +#include "TISOKernel.h" + +#include "PALMean.h" -#include +#include namespace luci_interpreter { -namespace kernels +namespace { +const int kMaxNumberOfAxis = 5; +const int kMaxNumberOfReducedAxis = 2; -static void resolveAxes(const int32_t *axes_data, int num_axes, tflite::MeanParams *params) +void ResolveAxis(const int *axis_data, int axis_count, luci_interpreter_pal::MeanParams *op_params) { - params->axis_count = num_axes; - for (int i = 0; i < num_axes; ++i) + int i = 0; + for (; i < axis_count; ++i) { - params->axis[i] = static_cast(axes_data[i]); + op_params->axis[i] = static_cast(axis_data[i]); } - for (int i = num_axes; i < 4; ++i) + for (; i < 4; ++i) { - params->axis[i] = 1; + op_params->axis[i] = 1; } + op_params->axis_count = axis_count; } -// Returns the number of axes that will be reduced. Removes duplicates. -static int getAxisReductionCount(const int32_t *axes_data, int num_axes, int input_num_dims) -{ - int reduction_count = num_axes; - for (int i = 0; i < num_axes; ++i) - { - int current = axes_data[i] >= 0 ? axes_data[i] : axes_data[i] + input_num_dims; - assert(current >= 0 && current < input_num_dims); - for (int j = 0; j < i; j++) - { - int previous = axes_data[j] >= 0 ? axes_data[j] : axes_data[j] + input_num_dims; - // This checks for duplicate axis - if (current == previous) - { - --reduction_count; - break; - } - } - } - return reduction_count; -} +} // namespace -static Shape getOutputShape(const Shape &input_shape, const int32_t *axes_data, int num_axes, - bool keep_dims) +void configure_kernel_CircleMean(const circle::Operator *cur_op, BaseRuntimeGraph *runtime_graph) { - int input_num_dims = input_shape.num_dims(); - if (input_num_dims == 0) - { - return Shape(0); - } + kernels::TISOKernel kernel(cur_op, runtime_graph); - if (keep_dims) - { - Shape output_shape(input_num_dims); - for (int idx = 0; idx < input_num_dims; ++idx) - { - bool is_axis = false; - for (int axis_idx = 0; axis_idx < num_axes; ++axis_idx) - { - if (axes_data[axis_idx] == idx || axes_data[axis_idx] + input_num_dims == idx) - { - is_axis = true; - break; - } - } - if (is_axis) - { - output_shape.dim(idx) = 1; - } - else - { - output_shape.dim(idx) = input_shape.dim(idx); - } - } - return output_shape; - } - else - { - int num_reduce_axes = getAxisReductionCount(axes_data, num_axes, input_num_dims); - Shape output_shape(input_num_dims - num_reduce_axes); - int num_skip_axes = 0; - for (int idx = 0; idx < input_num_dims; ++idx) - { - bool is_axis = false; - for (int axis_idx = 0; axis_idx < num_axes; ++axis_idx) - { - if (axes_data[axis_idx] == idx || axes_data[axis_idx] + input_num_dims == idx) - { - ++num_skip_axes; - is_axis = true; - break; - } - } - if (!is_axis) - { - output_shape.dim(idx - num_skip_axes) = input_shape.dim(idx); - } - } - return output_shape; - } -} + LUCI_INTERPRETER_CHECK(Tensor::element_type(kernel.input1()) == + Tensor::element_type(kernel.output())); + LUCI_INTERPRETER_CHECK(Tensor::element_type(kernel.input2()) == DataType::S32); -Mean::Mean(const Tensor *input, const Tensor *axes, Tensor *output, Tensor *temp_index, - Tensor *resolved_axes, Tensor *temp_sum, const ReducerParams ¶ms) - : KernelWithParams({input, axes}, {output, temp_index, resolved_axes, temp_sum}, - params) -{ + const int32_t axis_value = + kernels::getTensorData(runtime_graph->getConstDataByTensor(kernel.input2()))[0]; + LUCI_INTERPRETER_CHECK(axis_value >= 0); } -void Mean::configure() +void 
execute_kernel_CircleMean(const circle::Operator *cur_op, BaseRuntimeGraph *runtime_graph) { - LUCI_INTERPRETER_CHECK(input()->element_type() == output()->element_type()); - LUCI_INTERPRETER_CHECK(axes()->element_type() == DataType::S32); - if (input()->element_type() == DataType::S16) - { - LUCI_INTERPRETER_CHECK(input()->zero_point() == 0 && output()->zero_point() == 0); - } + kernels::TISOKernel kernel(cur_op, runtime_graph); + kernels::TISOData tiso_data = kernel.readData(); - const Shape &input_shape = input()->shape(); - int input_num_dims = input_shape.num_dims(); + const auto *input = kernel.input1(); + const auto *axis = kernel.input2(); + const auto *output = kernel.output(); - const auto *axes_data = getTensorData(axes()); - int num_axes = axes()->shape().num_elements(); - assert(num_axes <= 4); - // TODO: enable it only if kernel with dynamic shapes - Shape output_shape = getOutputShape(input_shape, axes_data, num_axes, _params.keep_dims); - output()->resize(output_shape); + const auto *options = cur_op->builtin_options_as_ReducerOptions(); - tflite::MeanParams params{}; - resolveAxes(axes_data, num_axes, ¶ms); - _need_temporaries = !( - _params.keep_dims && input_num_dims == 4 && params.axis_count == 2 && - ((params.axis[0] == 1 && params.axis[1] == 2) || (params.axis[0] == 2 && params.axis[1] == 1))); - if (_need_temporaries) - { - auto temp_index = getOutputTensors()[1]; - auto resolved_axes = getOutputTensors()[2]; - auto temp_sum = getOutputTensors()[3]; + int num_axis = static_cast(Tensor::num_elements(axis)); + int temp_index[kMaxNumberOfAxis]; + int resolved_axis[kMaxNumberOfReducedAxis]; - temp_index->resize(Shape(input_num_dims)); - resolved_axes->resize(Shape(num_axes)); - temp_sum->resize(output()->shape()); - } - else - { - auto temp_index = getOutputTensors()[1]; - auto resolved_axes = getOutputTensors()[2]; - auto temp_sum = getOutputTensors()[3]; - - temp_index->set_allocatable(false); - resolved_axes->set_allocatable(false); - temp_sum->set_allocatable(false); - } -} - -void Mean::execute() const -{ - switch (input()->element_type()) + switch (Tensor::element_type(kernel.input1())) { +#ifndef DIS_FLOAT case DataType::FLOAT32: - evalFloat(); - break; - case DataType::U8: - evalQuantized(); - break; - case DataType::S16: - evalQuantizedS16(); - break; - default: - assert(false && "Unsupported type."); - } -} - -void Mean::evalFloat() const -{ - const Shape &input_shape = input()->shape(); - int input_num_dims = input_shape.num_dims(); - const auto *axes_data = getTensorData(axes()); - int num_axes = axes()->shape().num_elements(); - - tflite::MeanParams params{}; - resolveAxes(axes_data, num_axes, ¶ms); - - auto temp_index = getOutputTensors()[1]; - auto resolved_axes = getOutputTensors()[2]; - auto temp_sum = getOutputTensors()[3]; - - // Defer to specialized implementation for 4D Mean across axes 1 & 2. 
- if (_params.keep_dims && input_num_dims == 4 && params.axis_count == 2 && - ((params.axis[0] == 1 && params.axis[1] == 2) || - (params.axis[0] == 2 && params.axis[1] == 1))) - { - tflite::reference_ops::Mean(params, getTensorShape(input()), getTensorData(input()), - getTensorShape(output()), getTensorData(output())); - } - else - { - tflite::reference_ops::Mean(getTensorData(input()), getTensorShape(input()).DimsData(), - input()->shape().num_dims(), getTensorData(output()), - getTensorShape(output()).DimsData(), output()->shape().num_dims(), - axes_data, num_axes, _params.keep_dims, - getTensorData(temp_index), getTensorData(resolved_axes), - getTensorData(temp_sum)); - } -} - -void Mean::evalQuantized() const -{ - const Shape &input_shape = input()->shape(); - int input_num_dims = input_shape.num_dims(); - const auto *axes_data = getTensorData(axes()); - int num_axes = axes()->shape().num_elements(); - - tflite::MeanParams params{}; - resolveAxes(axes_data, num_axes, ¶ms); - - auto temp_index = getOutputTensors()[1]; - auto resolved_axes = getOutputTensors()[2]; - auto temp_sum = getOutputTensors()[3]; - - // Defer to specialized implementation for 4D Mean across axes 1 & 2. - if (_params.keep_dims && input_num_dims == 4 && params.axis_count == 2 && - ((params.axis[0] == 1 && params.axis[1] == 2) || - (params.axis[0] == 2 && params.axis[1] == 1))) - { - tflite::reference_ops::Mean(params, getTensorShape(input()), getTensorData(input()), - input()->zero_point(), input()->scale(), getTensorShape(output()), - getTensorData(output()), output()->zero_point(), - output()->scale()); - } - else if (input()->zero_point() == output()->zero_point() && input()->scale() == output()->scale()) - { - tflite::reference_ops::Mean(getTensorData(input()), getTensorShape(input()).DimsData(), - input()->shape().num_dims(), getTensorData(output()), - getTensorShape(output()).DimsData(), output()->shape().num_dims(), - axes_data, num_axes, _params.keep_dims, - getTensorData(temp_index), getTensorData(resolved_axes), - getTensorData(temp_sum)); - } - else - { - tflite::reference_ops::QuantizedMeanOrSum<>( - getTensorData(input()), input()->zero_point(), input()->scale(), - getTensorShape(input()).DimsData(), input()->shape().num_dims(), - getTensorData(output()), output()->zero_point(), output()->scale(), - getTensorShape(output()).DimsData(), output()->shape().num_dims(), axes_data, num_axes, - _params.keep_dims, getTensorData(temp_index), getTensorData(resolved_axes), - getTensorData(temp_sum), - /*compute_sum=*/false); - } -} - -void Mean::evalQuantizedS16() const -{ - const auto *input_data = getTensorData(input()); - auto *output_data = getTensorData(output()); - - const Shape &input_shape = input()->shape(); - const Shape &output_shape = output()->shape(); - - const auto *axes_data = getTensorData(axes()); - const int num_axes = axes()->shape().num_elements(); - - constexpr int32_t output_min = -std::numeric_limits::max(); - constexpr int32_t output_max = std::numeric_limits::max(); - - // Defer to specialized implementation for 4D Mean across axes 1 & 2. 
- if (_params.keep_dims && input_shape.num_dims() == 4 && num_axes == 2 && - ((axes_data[0] == 1 && axes_data[1] == 2) || (axes_data[0] == 2 && axes_data[1] == 1))) - { - const int32_t batches = input_shape.dim(0); - const int32_t input_height = input_shape.dim(1); - const int32_t input_width = input_shape.dim(2); - const int32_t depth = input_shape.dim(3); - assert(output_shape.num_dims() == 4); - assert(output_shape.dim(0) == batches); - assert(output_shape.dim(1) == 1); - assert(output_shape.dim(2) == 1); - assert(output_shape.dim(3) == depth); - - const double real_multiplier = - static_cast(input()->scale()) / static_cast(output()->scale()); - - int32_t output_multiplier{}; - int output_shift{}; - quantizeMultiplier(real_multiplier, &output_multiplier, &output_shift); - - const int32_t num_elements_in_axes = input_height * input_width; - - for (int32_t batch = 0; batch < batches; ++batch) { - for (int32_t c = 0; c < depth; ++c) + luci_interpreter_pal::MeanParams op_params; + ResolveAxis(kernels::getTensorData(tiso_data.input2_data), num_axis, &op_params); + + // Special case mean implementation exists for 4D mean across axes 1 + // and 2. + bool special_case_4d_axes_1_and_2 = Tensor::num_dims(input) == 4 && + op_params.axis_count == 2 && + ((op_params.axis[0] == 1 && op_params.axis[1] == 2) || + (op_params.axis[0] == 2 && op_params.axis[1] == 1)); + + // Defer to specialized implementation for 4D Mean across axes 1 & 2. + if (options->keep_dims() && special_case_4d_axes_1_and_2) { - int32_t acc = 0; - for (int32_t in_y = 0; in_y < input_height; ++in_y) - { - for (int32_t in_x = 0; in_x < input_width; ++in_x) - { - acc += input_data[calcOffset(input_shape, batch, in_y, in_x, c)]; - } - } - int32_t scaled_acc = - tflite::MultiplyByQuantizedMultiplier(acc, output_multiplier, output_shift); - // Divide by the number of elements rounding to the nearest integer. - scaled_acc = scaled_acc > 0 - ? (scaled_acc + num_elements_in_axes / 2) / num_elements_in_axes - : (scaled_acc - num_elements_in_axes / 2) / num_elements_in_axes; - - scaled_acc = std::max(scaled_acc, output_min); - scaled_acc = std::min(scaled_acc, output_max); - - output_data[calcOffset(output_shape, batch, 0, 0, c)] = scaled_acc; + luci_interpreter_pal::Mean(op_params, kernels::getTensorShape(input), + kernels::getTensorData(tiso_data.input1_data), + kernels::getTensorShape(output), + kernels::getTensorData(tiso_data.output_data)); + } + else + { + luci_interpreter_pal::Mean( + kernels::getTensorData(tiso_data.input1_data), wrap(input->shape()).data(), + Tensor::num_dims(input), kernels::getTensorData(tiso_data.output_data), + wrap(output->shape()).data(), Tensor::num_dims(output), + kernels::getTensorData(tiso_data.input2_data), num_axis, options->keep_dims(), + temp_index, resolved_axis, kernels::getTensorData(tiso_data.output_data)); } } - } - else - { - assert(false && "Unsupported configuration."); + break; +#endif // DIS_FLOAT + default: + assert(false && "Unsupported type"); } } -} // namespace kernels } // namespace luci_interpreter diff --git a/onert-micro/luci-interpreter/src/kernels/Mean.h b/onert-micro/luci-interpreter/src/kernels/Mean.h deleted file mode 100644 index ed07ae56177..00000000000 --- a/onert-micro/luci-interpreter/src/kernels/Mean.h +++ /dev/null @@ -1,55 +0,0 @@ -/* - * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef LUCI_INTERPRETER_KERNELS_MEAN_H -#define LUCI_INTERPRETER_KERNELS_MEAN_H - -#include "core/Kernel.h" -#include "core/KernelParams.h" - -#include - -namespace luci_interpreter -{ -namespace kernels -{ - -class Mean : public KernelWithParams -{ -public: - Mean(const Tensor *input, const Tensor *axes, Tensor *output, Tensor *temp_index, - Tensor *resolved_axes, Tensor *temp_sum, const ReducerParams ¶ms); - - const Tensor *input() const { return _inputs[0]; } - const Tensor *axes() const { return _inputs[1]; } - Tensor *output() const { return _outputs[0]; } - - void configure() override; - void execute() const override; - -private: - void evalFloat() const; - void evalQuantized() const; - void evalQuantizedS16() const; - -private: - bool _need_temporaries = false; -}; - -} // namespace kernels -} // namespace luci_interpreter - -#endif // LUCI_INTERPRETER_KERNELS_MEAN_H diff --git a/onert-micro/luci-interpreter/src/kernels/Mean.test.cpp b/onert-micro/luci-interpreter/src/kernels/Mean.test.cpp index d2c00935ab0..bf03d4ac048 100644 --- a/onert-micro/luci-interpreter/src/kernels/Mean.test.cpp +++ b/onert-micro/luci-interpreter/src/kernels/Mean.test.cpp @@ -15,14 +15,14 @@ * limitations under the License. */ -#include "kernels/Mean.h" #include "kernels/TestUtils.h" -#include "luci_interpreter/TestMemoryManager.h" +#include "luci_interpreter/test_models/mean/FloatMeanKernel.h" +#include "luci_interpreter/test_models/mean/NegMeanKernel.h" + +#include "loader/ModuleLoader.h" namespace luci_interpreter { -namespace kernels -{ namespace { @@ -30,211 +30,58 @@ using namespace testing; class MeanTest : public ::testing::Test { -protected: - void SetUp() override { _memory_manager = std::make_unique(); } - - std::unique_ptr _memory_manager; + // Do nothing }; -TEST_F(MeanTest, FloatKeepDims) -{ - std::vector input_data = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, - 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, - 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0}; - - std::vector axis_data{0, 2}; - Tensor input_tensor = - makeInputTensor({4, 3, 2}, input_data, _memory_manager.get()); - Tensor axis_tensor = makeInputTensor({2}, axis_data, _memory_manager.get()); - Tensor temp_index(DataType::S32, Shape({}), {}, ""); - Tensor resolved_axes(DataType::S32, Shape({}), {}, ""); - Tensor temp_sum(DataType::FLOAT32, Shape({}), {}, ""); - Tensor output_tensor = makeOutputTensor(DataType::FLOAT32); - - ReducerParams params{}; - params.keep_dims = true; - - Mean kernel(&input_tensor, &axis_tensor, &output_tensor, &temp_index, &resolved_axes, &temp_sum, - params); - kernel.configure(); - _memory_manager->allocate_memory(temp_index); - _memory_manager->allocate_memory(resolved_axes); - _memory_manager->allocate_memory(temp_sum); - _memory_manager->allocate_memory(output_tensor); - kernel.execute(); - - std::vector ref_output_data{10.5, 12.5, 14.5}; - std::initializer_list ref_output_shape{1, 3, 1}; - EXPECT_THAT(extractTensorData(output_tensor), FloatArrayNear(ref_output_data)); - EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray(ref_output_shape)); -} - 
-TEST_F(MeanTest, FloatKeepDims4DMean) -{ - std::vector input_data = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, - 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, - 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0}; - - std::vector axis_data{1, 2}; - Tensor input_tensor = - makeInputTensor({2, 2, 3, 2}, input_data, _memory_manager.get()); - Tensor axis_tensor = makeInputTensor({2}, axis_data, _memory_manager.get()); - Tensor temp_index(DataType::S32, Shape({}), {}, ""); - Tensor resolved_axes(DataType::S32, Shape({}), {}, ""); - Tensor temp_sum(DataType::FLOAT32, Shape({}), {}, ""); - Tensor output_tensor = makeOutputTensor(DataType::FLOAT32); - - ReducerParams params{}; - params.keep_dims = true; - - Mean kernel(&input_tensor, &axis_tensor, &output_tensor, &temp_index, &resolved_axes, &temp_sum, - params); - kernel.configure(); - _memory_manager->allocate_memory(temp_index); - _memory_manager->allocate_memory(resolved_axes); - _memory_manager->allocate_memory(temp_sum); - _memory_manager->allocate_memory(output_tensor); - kernel.execute(); - - std::vector ref_output_data{6, 7, 18, 19}; - std::initializer_list ref_output_shape{2, 1, 1, 2}; - EXPECT_THAT(extractTensorData(output_tensor), FloatArrayNear(ref_output_data)); - EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray(ref_output_shape)); -} - -TEST_F(MeanTest, FloatNotKeepDims) +template std::vector checkMeanKernel(test_kernel::TestDataBase *test_data_base) { - std::vector input_data = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, - 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, - 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0}; - - std::vector axis_data{1, 0, -3, -3}; - Tensor input_tensor = - makeInputTensor({4, 3, 2}, input_data, _memory_manager.get()); - Tensor axis_tensor = makeInputTensor({4}, axis_data, _memory_manager.get()); - Tensor temp_index(DataType::S32, Shape({}), {}, ""); - Tensor resolved_axes(DataType::S32, Shape({}), {}, ""); - Tensor temp_sum(DataType::FLOAT32, Shape({}), {}, ""); - Tensor output_tensor = makeOutputTensor(DataType::FLOAT32); - - ReducerParams params{}; - params.keep_dims = false; + MemoryManager memory_manager{}; + RuntimeModule runtime_module{}; + bool dealloc_input = true; - Mean kernel(&input_tensor, &axis_tensor, &output_tensor, &temp_index, &resolved_axes, &temp_sum, - params); - kernel.configure(); - _memory_manager->allocate_memory(temp_index); - _memory_manager->allocate_memory(resolved_axes); - _memory_manager->allocate_memory(temp_sum); - _memory_manager->allocate_memory(output_tensor); - kernel.execute(); + // Load model with single op + auto *model_data_raw = reinterpret_cast(test_data_base->get_model_ptr()); + ModuleLoader::load(&runtime_module, &memory_manager, model_data_raw, dealloc_input); - std::vector ref_output_data{12, 13}; - std::initializer_list ref_output_shape{2}; - EXPECT_THAT(extractTensorData(output_tensor), FloatArrayNear(ref_output_data)); - EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray(ref_output_shape)); -} - -TEST_F(MeanTest, Uint8KeepDims) -{ - float kQuantizedTolerance = getTolerance(-1.0, 1.0, 255); - std::vector input_data = {0.4, 0.2, 0.3, 0.4, 0.5, 0.6}; - std::pair quant_param = quantizationParams(-1.0f, 1.0f); + auto *main_runtime_graph = runtime_module.getMainGraph(); + assert(main_runtime_graph->getNumOfInputTensors() == 1); - std::vector axis_data{1}; - Tensor input_tensor = makeInputTensor({3, 2}, quant_param.first, quant_param.second, - input_data, _memory_manager.get()); - Tensor axis_tensor = 
makeInputTensor({1}, axis_data, _memory_manager.get()); - Tensor temp_index(DataType::S32, Shape({}), {}, ""); - Tensor resolved_axes(DataType::S32, Shape({}), {}, ""); - Tensor temp_sum(DataType::U8, Shape({}), {}, ""); - Tensor output_tensor = makeOutputTensor(DataType::U8, quant_param.first, quant_param.second); + // Set input data + { + auto *input_tensor_data = reinterpret_cast(main_runtime_graph->configureGraphInput(0)); + std::copy(test_data_base->get_input_data_by_index(0).begin(), + test_data_base->get_input_data_by_index(0).end(), input_tensor_data); + } - ReducerParams params{}; - params.keep_dims = true; + runtime_module.execute(); - Mean kernel(&input_tensor, &axis_tensor, &output_tensor, &temp_index, &resolved_axes, &temp_sum, - params); - kernel.configure(); - _memory_manager->allocate_memory(temp_index); - _memory_manager->allocate_memory(resolved_axes); - _memory_manager->allocate_memory(temp_sum); - _memory_manager->allocate_memory(output_tensor); - kernel.execute(); + assert(main_runtime_graph->getNumOfOutputTensors() == 1); - std::vector ref_output_data{0.3, 0.35, 0.55}; - std::initializer_list ref_output_shape{3, 1}; - EXPECT_THAT(dequantizeTensorData(output_tensor), - FloatArrayNear(ref_output_data, kQuantizedTolerance)); - EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray(ref_output_shape)); + T *output_data = reinterpret_cast(main_runtime_graph->getOutputDataByIndex(0)); + const size_t num_elements = (main_runtime_graph->getOutputDataSizeByIndex(0) / sizeof(T)); + std::vector output_data_vector(output_data, output_data + num_elements); + return output_data_vector; } -TEST_F(MeanTest, Uint8NotKeepDims) +TEST_F(MeanTest, Float_P) { - float kQuantizedTolerance = getTolerance(-1.0, 1.0, 255); - std::vector input_data = {0.4, 0.2, 0.3, 0.4, 0.5, 0.6}; - std::pair quant_param = quantizationParams(-1.0f, 1.0f); - - std::vector axis_data{1}; - Tensor input_tensor = makeInputTensor( - {1, 3, 2}, quant_param.first, quant_param.second, input_data, _memory_manager.get()); - Tensor axis_tensor = makeInputTensor({1}, axis_data, _memory_manager.get()); - Tensor temp_index(DataType::S32, Shape({}), {}, ""); - Tensor resolved_axes(DataType::S32, Shape({}), {}, ""); - Tensor temp_sum(DataType::FLOAT32, Shape({}), {}, ""); - Tensor output_tensor = makeOutputTensor(DataType::U8, quant_param.first, quant_param.second); - - ReducerParams params{}; - params.keep_dims = false; - - Mean kernel(&input_tensor, &axis_tensor, &output_tensor, &temp_index, &resolved_axes, &temp_sum, - params); - kernel.configure(); - _memory_manager->allocate_memory(temp_index); - _memory_manager->allocate_memory(resolved_axes); - _memory_manager->allocate_memory(temp_sum); - _memory_manager->allocate_memory(output_tensor); - kernel.execute(); - - std::vector ref_output_data{0.4, 0.4}; - std::initializer_list ref_output_shape{1, 2}; - EXPECT_THAT(dequantizeTensorData(output_tensor), - FloatArrayNear(ref_output_data, kQuantizedTolerance)); - EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray(ref_output_shape)); + test_kernel::TestDataFloatMean test_data_kernel; + std::vector output_data_vector = checkMeanKernel(&test_data_kernel); + EXPECT_THAT(output_data_vector, kernels::testing::FloatArrayNear( + test_data_kernel.get_output_data_by_index(0), 0.0001f)); } -TEST_F(MeanTest, SInt16KeepDims4D) +TEST_F(MeanTest, Input_output_type_mismatch_NEG) { - std::vector input_data = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, - 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, - 17.0, 
18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0}; - std::vector axes_data{1, 2}; - std::vector ref_output_data{6, 7, 18, 19}; - - Tensor input_tensor = - makeInputTensor({2, 2, 3, 2}, 0.25, 0, input_data, _memory_manager.get()); - Tensor axes_tensor = makeInputTensor({2}, axes_data, _memory_manager.get()); - Tensor temp_index(DataType::S32, Shape({}), {}, ""); - Tensor resolved_axes(DataType::S32, Shape({}), {}, ""); - Tensor temp_sum(DataType::FLOAT32, Shape({}), {}, ""); - Tensor output_tensor = makeOutputTensor(DataType::S16, 0.2, 0); - - ReducerParams params{}; - params.keep_dims = true; - - Mean kernel(&input_tensor, &axes_tensor, &output_tensor, &temp_index, &resolved_axes, &temp_sum, - params); - kernel.configure(); - _memory_manager->allocate_memory(temp_index); - _memory_manager->allocate_memory(resolved_axes); - _memory_manager->allocate_memory(temp_sum); - _memory_manager->allocate_memory(output_tensor); - kernel.execute(); - - EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray({2, 1, 1, 2})); - EXPECT_THAT(dequantizeTensorData(output_tensor), FloatArrayNear(ref_output_data)); + test_kernel::NegTestDataInputOutputTypeMismatchMeanKernel test_data_kernel; + MemoryManager memory_manager{}; + RuntimeModule runtime_module{}; + bool dealloc_input = true; + // Load model with single op + auto *model_data_raw = reinterpret_cast(test_data_kernel.get_model_ptr()); + EXPECT_DEATH(ModuleLoader::load(&runtime_module, &memory_manager, model_data_raw, dealloc_input), + ""); } } // namespace -} // namespace kernels } // namespace luci_interpreter