diff --git a/compiler/fm-equalize/fm-equalize b/compiler/fm-equalize/fm-equalize index 4e4e5395e18..36b4f99a003 100644 --- a/compiler/fm-equalize/fm-equalize +++ b/compiler/fm-equalize/fm-equalize @@ -62,6 +62,18 @@ def _get_parser(): help="Allow to create duplicate operations when a feature map matches " "with multiple equalization patterns. This can increase the size of " "the model. Default is false.") + parser.add_argument("--fme_detect", + type=str, + help="Path to fme-detect driver.", + required=False) + parser.add_argument("--dalgona", + type=str, + help="Path to dalgona driver.", + required=False) + parser.add_argument("--fme_apply", + type=str, + help="Path to fme-apply driver.", + required=False) parser.add_argument('--verbose', action='store_true', help='Print logs') return parser @@ -78,12 +90,9 @@ def _run_cmd(cmd: str, verbose: bool): raise -def _run_dalgona(model: str, data: Optional[str], analysis: str, save_dir: str, - verbose: bool): - dir_path = os.getenv('ONE_BIN_PATH') - assert dir_path != None - dalgona_path = os.path.join(dir_path, 'dalgona') - cmd = [dalgona_path] +def _run_dalgona(driver_path: str, model: str, data: Optional[str], analysis: str, + save_dir: str, verbose: bool): + cmd = [driver_path] cmd += ['--input_model', model] cmd += ['--analysis', analysis] if data != None: @@ -94,11 +103,9 @@ def _run_dalgona(model: str, data: Optional[str], analysis: str, save_dir: str, _run_cmd(cmd, verbose) -def _run_fme_detect(input_model: str, fme_patterns: str, verbose: bool, +def _run_fme_detect(driver_path: str, input_model: str, fme_patterns: str, verbose: bool, allow_dup_op: bool): - dir_path = Path(__file__).parent.resolve() - fme_detect_path = os.path.join(dir_path, 'fme-detect') - cmd = [fme_detect_path] + cmd = [driver_path] cmd += ['--input', input_model] cmd += ['--output', fme_patterns] if allow_dup_op: @@ -107,10 +114,9 @@ def _run_fme_detect(input_model: str, fme_patterns: str, verbose: bool, _run_cmd(cmd, verbose) -def _run_fme_apply(input_model: str, fme_patterns: str, output_model: str, verbose: bool): - dir_path = Path(__file__).parent.resolve() - fme_apply_path = os.path.join(dir_path, 'fme-apply') - cmd = [fme_apply_path] +def _run_fme_apply(driver_path: str, input_model: str, fme_patterns: str, + output_model: str, verbose: bool): + cmd = [driver_path] cmd += ['--input', input_model] cmd += ['--fme_patterns', fme_patterns] cmd += ['--output', output_model] @@ -128,6 +134,25 @@ def main(): data = args.data verbose = args.verbose allow_dup_op = args.allow_dup_op + fme_detect_path = args.fme_detect + fme_apply_path = args.fme_apply + dalgona_path = args.dalgona + + curr_dir = Path(__file__).parent.resolve() + dump_fme_param_py = curr_dir / 'fmelib' / 'DumpFMEParams.py' + if dump_fme_param_py.exists() == False: + raise FileNotFoundError('Error: DumpFMEParams.py not found') + + if not fme_detect_path: + dir_path = Path(__file__).parent.resolve() + fme_detect_path = os.path.join(dir_path, 'fme-detect') + if not dalgona_path: + dir_path = os.getenv('ONE_BIN_PATH') + assert dir_path != None + dalgona_path = os.path.join(dir_path, 'dalgona') + if not fme_apply_path: + dir_path = Path(__file__).parent.resolve() + fme_apply_path = os.path.join(dir_path, 'fme-apply') with tempfile.TemporaryDirectory() as tmp_dir: fme_patterns = os.path.join( @@ -135,7 +160,8 @@ def main(): Path(output_model).with_suffix('.fme_patterns.json').name) # Step 1. 
Run fme-detect to find equalization patterns - _run_fme_detect(str(input_model), + _run_fme_detect(fme_detect_path, + str(input_model), str(fme_patterns), verbose=verbose, allow_dup_op=allow_dup_op) @@ -144,8 +170,13 @@ def main(): if args.fme_patterns != None: os.system(f'cp {fme_patterns} {args.fme_patterns}') - # TODO Step 2. Run dalgona - # _run_dalgona + # Step 2. Run dalgona + _run_dalgona(dalgona_path, + str(input_model), + data, + str(dump_fme_param_py), + str(fme_patterns), + verbose=verbose) # Copy fme_patterns to the given path # Why copy twice? To observe the result of fme-detect too @@ -153,7 +184,8 @@ def main(): os.system(f'cp {fme_patterns} {args.fme_patterns}') # Step 3. Run fme-apply - _run_fme_apply(str(input_model), + _run_fme_apply(fme_apply_path, + str(input_model), str(fme_patterns), str(output_model), verbose=verbose) diff --git a/compiler/fme-apply/src/FMEqualizer.cpp b/compiler/fme-apply/src/FMEqualizer.cpp index 3d8ca2b0fb8..a8be34592a5 100644 --- a/compiler/fme-apply/src/FMEqualizer.cpp +++ b/compiler/fme-apply/src/FMEqualizer.cpp @@ -17,12 +17,8 @@ #include "FMEqualizer.h" #include "InsertScaleShift.h" #include "EqualizePatternCheck.h" -#include "pass/ForwardPreScalePass.h" -#include "pass/ForwardPreShiftPass.h" #include "pass/FusePostScalePass.h" -#include "pass/FusePostShiftPass.h" #include "pass/FusePreScalePass.h" -#include "pass/FusePreShiftPass.h" #include "ProgressReporter.h" #include @@ -82,15 +78,9 @@ void FMEqualizer::equalize(loco::Graph *g, const std::vector &p phase.emplace_back(std::make_unique()); phase.emplace_back(std::make_unique()); - // Forward PreScale/PreShift - phase.emplace_back(std::make_unique()); - phase.emplace_back(std::make_unique()); - // Fuse Pre/Post Scale/Shift phase.emplace_back(std::make_unique()); phase.emplace_back(std::make_unique()); - phase.emplace_back(std::make_unique()); - phase.emplace_back(std::make_unique()); ProgressReporter prog(g, logo::PhaseStrategy::Restart); logo::PhaseRunner phase_runner{g}; diff --git a/compiler/fme-apply/src/RandomString.h b/compiler/fme-apply/src/RandomString.h index 0f3533459eb..c147b7c3e2c 100644 --- a/compiler/fme-apply/src/RandomString.h +++ b/compiler/fme-apply/src/RandomString.h @@ -17,6 +17,7 @@ #ifndef __FME_APPLY_RANDOM_STRING_H__ #define __FME_APPLY_RANDOM_STRING_H__ +#include #include namespace fme_apply diff --git a/compiler/fme-apply/src/pass/ForwardPreScalePass.cpp b/compiler/fme-apply/src/pass/ForwardPreScalePass.cpp deleted file mode 100644 index c9324bc6b7a..00000000000 --- a/compiler/fme-apply/src/pass/ForwardPreScalePass.cpp +++ /dev/null @@ -1,95 +0,0 @@ -/* - * Copyright (c) 2024 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
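For reference, the driver wiring added above boils down to: prefer the path passed on the command line, otherwise fall back to the conventional location (the script directory for fme-detect/fme-apply, ONE_BIN_PATH for dalgona). A minimal Python sketch of that resolution logic; resolve_driver is a hypothetical helper, not part of the script:

    import os
    from pathlib import Path

    def resolve_driver(cli_value, default_dir, name):
        # Prefer the path given on the command line; otherwise fall back
        # to the conventional location for this driver.
        if cli_value:
            return cli_value
        assert default_dir is not None, f'no default location for {name}'
        return str(Path(default_dir) / name)

    # fme-detect and fme-apply live next to the script; dalgona is looked
    # up under ONE_BIN_PATH.
    curr_dir = Path(__file__).parent.resolve()
    fme_detect_path = resolve_driver(None, curr_dir, 'fme-detect')
    dalgona_path = resolve_driver(None, os.getenv('ONE_BIN_PATH'), 'dalgona')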
- */ - -#include "ForwardPreScalePass.h" -#include "Support.Cast.h" -#include "Support.Misc.h" - -#include -#include - -using namespace fme_apply; - -namespace -{ - -class ForwardPreScale final : public luci::CircleNodeMutableVisitor -{ -protected: - bool visit(luci::CircleNode *node) { return false; } - - bool visit(luci::CirclePad *node) - { - auto pre_scale = to_pre_scale(node->input()); - if (not pre_scale) - return false; - - if (loco::succs(pre_scale).size() != 1) - return false; - - node->input(pre_scale->inputs(0)); - loco::replace(node).with(pre_scale); - pre_scale->inputs(0, node); - - // Shape should be copied, because - // shape inference does not work well for Custom Op (PreScale) - copy_shape(node, pre_scale); - - return true; - } - - bool visit(luci::CircleSlice *node) - { - auto pre_scale = to_pre_scale(node->input()); - if (not pre_scale) - return false; - - if (loco::succs(pre_scale).size() != 1) - return false; - - node->input(pre_scale->inputs(0)); - loco::replace(node).with(pre_scale); - pre_scale->inputs(0, node); - - // Shape should be copied, because - // shape inference does not work well for Custom Op (PreScale) - copy_shape(node, pre_scale); - - return true; - } -}; - -} // namespace - -namespace fme_apply -{ - -bool ForwardPreScalePass::run(loco::Graph *g) -{ - bool changed = false; - - ForwardPreScale fps; - for (auto node : loco::active_nodes(loco::output_nodes(g))) - { - auto cnode = loco::must_cast(node); - if (cnode->accept(&fps)) - changed = true; - } - - return changed; -} - -} // namespace fme_apply diff --git a/compiler/fme-apply/src/pass/ForwardPreScalePass.h b/compiler/fme-apply/src/pass/ForwardPreScalePass.h deleted file mode 100644 index bb3819669ba..00000000000 --- a/compiler/fme-apply/src/pass/ForwardPreScalePass.h +++ /dev/null @@ -1,53 +0,0 @@ -/* - * Copyright (c) 2024 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef __FME_APPLY_FORWARD_PRE_SCALE_PASS_H__ -#define __FME_APPLY_FORWARD_PRE_SCALE_PASS_H__ - -#include - -#include - -namespace fme_apply -{ - -/** - * @brief Pass to forward CircleCustom(PreScale) to succeeding Op - * - * BEFORE - * - * CircleCustom(PreScale) - * | - * Forwardable Op (ex: Pad, Slice) - * - * AFTER - * - * Forwardable Op (ex: Pad, Slice) - * | - * CircleCustom(PreScale) - */ -class ForwardPreScalePass : public logo::Pass -{ -public: - virtual const char *name(void) const { return "fme_apply::ForwardPreScalePass"; } - -public: - bool run(loco::Graph *graph); -}; - -} // namespace fme_apply - -#endif //__FME_APPLY_FORWARD_PRE_SCALE_PASS_H__ diff --git a/compiler/fme-apply/src/pass/ForwardPreScalePass.test.cpp b/compiler/fme-apply/src/pass/ForwardPreScalePass.test.cpp deleted file mode 100644 index e6e064b1a28..00000000000 --- a/compiler/fme-apply/src/pass/ForwardPreScalePass.test.cpp +++ /dev/null @@ -1,179 +0,0 @@ -/* - * Copyright (c) 2024 Samsung Electronics Co., Ltd. 
All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "ForwardPreScalePass.h" -#include "Support.Cast.h" - -#include - -#include - -using namespace fme_apply; - -namespace -{ - -/** - * PreScale-Pad graphlet - * - * [PreScale] - * | - * [Pad] - * - */ -class PreScalePadGraphlet -{ -public: - void init(loco::Graph *g) - { - _prescale = g->nodes()->create(2 /* arity */, 1 /* out */); - _prescale->dtype(loco::DataType::FLOAT32); - _prescale->shape({1, 4, 4, 16}); - _prescale->custom_code("PreScale"); - _prescale->name("prescale"); - - _pad = g->nodes()->create(); - _pad->input(_prescale); - _pad->dtype(loco::DataType::FLOAT32); - _pad->shape({1, 5, 5, 16}); - _pad->name("pad"); - } - -public: - luci::CircleCustom *_prescale = nullptr; - luci::CirclePad *_pad = nullptr; -}; - -class PreScalePadGraph : public luci::test::TestIOGraph, public PreScalePadGraphlet -{ -public: - void init(void) - { - luci::test::TestIOGraph::init({1, 4, 4, 16}, {1, 5, 5, 16}); - PreScalePadGraphlet::init(g()); - - _prescale->inputs(0, input()); - - output()->from(_pad); - } - - std::unique_ptr graph(void) { return std::move(_g); } -}; - -/** - * PreScale-Slice graphlet - * - * [PreScale] - * | - * [Slice] - * - */ -class PreScaleSliceGraphlet -{ -public: - void init(loco::Graph *g) - { - _prescale = g->nodes()->create(2 /* arity */, 1 /* out */); - _prescale->dtype(loco::DataType::FLOAT32); - _prescale->shape({1, 4, 4, 16}); - _prescale->custom_code("PreScale"); - _prescale->name("prescale"); - - _slice = g->nodes()->create(); - _slice->input(_prescale); - _slice->dtype(loco::DataType::FLOAT32); - _slice->shape({1, 2, 2, 16}); - _slice->name("slice"); - } - -public: - luci::CircleCustom *_prescale = nullptr; - luci::CircleSlice *_slice = nullptr; -}; - -class PreScaleSliceGraph : public luci::test::TestIOGraph, public PreScaleSliceGraphlet -{ -public: - void init(void) - { - luci::test::TestIOGraph::init({1, 4, 4, 16}, {1, 2, 2, 16}); - PreScaleSliceGraphlet::init(g()); - - _prescale->inputs(0, input()); - - output()->from(_slice); - } - - std::unique_ptr graph(void) { return std::move(_g); } -}; - -} // namespace - -TEST(ForwardPreScalePassTest, prescale_pad) -{ - PreScalePadGraph g; - g.init(); - - ForwardPreScalePass fpsp; - EXPECT_TRUE(fpsp.run(g.g())); - - auto pre = to_pre_scale(g.output()->from()); - EXPECT_NE(nullptr, pre); - - auto pad = dynamic_cast(pre->inputs(0)); - EXPECT_NE(nullptr, pad); - - EXPECT_EQ(4, pre->rank()); - EXPECT_EQ(1, pre->dim(0).value()); - EXPECT_EQ(5, pre->dim(1).value()); - EXPECT_EQ(5, pre->dim(2).value()); - EXPECT_EQ(16, pre->dim(3).value()); -} - -TEST(ForwardPreScalePassTest, prescale_slice) -{ - PreScaleSliceGraph g; - g.init(); - - ForwardPreScalePass fpsp; - EXPECT_TRUE(fpsp.run(g.g())); - - auto pre = to_pre_scale(g.output()->from()); - EXPECT_NE(nullptr, pre); - - auto slice = dynamic_cast(pre->inputs(0)); - EXPECT_NE(nullptr, slice); - - EXPECT_EQ(4, pre->rank()); - EXPECT_EQ(1, pre->dim(0).value()); - EXPECT_EQ(2, 
pre->dim(1).value()); - EXPECT_EQ(2, pre->dim(2).value()); - EXPECT_EQ(16, pre->dim(3).value()); -} - -TEST(ForwardPreScalePassTest, prescale_conv_NEG) -{ - PreScalePadGraph g; - g.init(); - - // Replace Pad with Conv2D - auto conv = g.g()->nodes()->create(); - conv->input(g._prescale); - g.output()->from(conv); - - ForwardPreScalePass fpsp; - EXPECT_FALSE(fpsp.run(g.g())); -} diff --git a/compiler/fme-apply/src/pass/ForwardPreShiftPass.cpp b/compiler/fme-apply/src/pass/ForwardPreShiftPass.cpp deleted file mode 100644 index 20357de8383..00000000000 --- a/compiler/fme-apply/src/pass/ForwardPreShiftPass.cpp +++ /dev/null @@ -1,75 +0,0 @@ -/* - * Copyright (c) 2024 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "ForwardPreShiftPass.h" -#include "Support.Cast.h" -#include "Support.Misc.h" - -#include -#include - -using namespace fme_apply; - -namespace -{ - -class ForwardPreShift final : public luci::CircleNodeMutableVisitor -{ -protected: - bool visit(luci::CircleNode *node) { return false; } - - bool visit(luci::CircleSlice *node) - { - auto pre_shift = to_pre_shift(node->input()); - if (not pre_shift) - return false; - - if (loco::succs(pre_shift).size() != 1) - return false; - - node->input(pre_shift->inputs(0)); - loco::replace(node).with(pre_shift); - pre_shift->inputs(0, node); - - // Shape should be copied, because - // shape inference does not work well for Custom Op (PreShift) - copy_shape(node, pre_shift); - - return true; - } -}; - -} // namespace - -namespace fme_apply -{ - -bool ForwardPreShiftPass::run(loco::Graph *g) -{ - bool changed = false; - - ForwardPreShift fps; - for (auto node : loco::active_nodes(loco::output_nodes(g))) - { - auto cnode = loco::must_cast(node); - if (cnode->accept(&fps)) - changed = true; - } - - return changed; -} - -} // namespace fme_apply diff --git a/compiler/fme-apply/src/pass/ForwardPreShiftPass.h b/compiler/fme-apply/src/pass/ForwardPreShiftPass.h deleted file mode 100644 index dbc70933842..00000000000 --- a/compiler/fme-apply/src/pass/ForwardPreShiftPass.h +++ /dev/null @@ -1,53 +0,0 @@ -/* - * Copyright (c) 2024 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
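The three-statement rewiring used by the deleted Forward* passes above is easy to misread, so here is a toy single-consumer version of the same swap in Python; Node and all names are hypothetical stand-ins for the loco graph API:

    class Node:
        def __init__(self, name, inp=None):
            self.name, self.inp = name, inp

    src = Node('src')
    pre = Node('prescale', src)
    pad = Node('pad', pre)
    output = pad                   # the graph output reads from `pad`

    pad.inp = pre.inp              # node->input(pre_scale->inputs(0))
    output = pre                   # loco::replace(node).with(pre_scale)
    pre.inp = pad                  # pre_scale->inputs(0, node)

    # The chain is now src -> pad -> prescale: PreScale moved forward.
    assert output.name == 'prescale' and output.inp.name == 'pad'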
- */ - -#ifndef __FME_APPLY_FORWARD_PRE_SHIFT_PASS_H__ -#define __FME_APPLY_FORWARD_PRE_SHIFT_PASS_H__ - -#include - -#include - -namespace fme_apply -{ - -/** - * @brief Pass to forward CircleCustom(PreShift) to succeeding Op - * - * BEFORE - * - * CircleCustom(PreShift) - * | - * Forwardable Op (ex: Slice) - * - * AFTER - * - * Forwardable Op (ex: Slice) - * | - * CircleCustom(PreShift) - */ -class ForwardPreShiftPass : public logo::Pass -{ -public: - virtual const char *name(void) const { return "fme_apply::ForwardPreShiftPass"; } - -public: - bool run(loco::Graph *graph); -}; - -} // namespace fme_apply - -#endif //__FME_APPLY_FORWARD_PRE_SHIFT_PASS_H__ diff --git a/compiler/fme-apply/src/pass/ForwardPreShiftPass.test.cpp b/compiler/fme-apply/src/pass/ForwardPreShiftPass.test.cpp deleted file mode 100644 index 2fdcf90ad9a..00000000000 --- a/compiler/fme-apply/src/pass/ForwardPreShiftPass.test.cpp +++ /dev/null @@ -1,111 +0,0 @@ -/* - * Copyright (c) 2024 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "ForwardPreShiftPass.h" -#include "Support.Cast.h" - -#include - -#include - -using namespace fme_apply; - -namespace -{ - -/** - * PreShift-Slice graphlet - * - * [PreShift] - * | - * [Slice] - * - */ -class PreShiftSliceGraphlet -{ -public: - void init(loco::Graph *g) - { - _preshift = g->nodes()->create(2 /* arity */, 1 /* out */); - _preshift->dtype(loco::DataType::FLOAT32); - _preshift->shape({1, 4, 4, 16}); - _preshift->custom_code("PreShift"); - _preshift->name("preshift"); - - _slice = g->nodes()->create(); - _slice->input(_preshift); - _slice->dtype(loco::DataType::FLOAT32); - _slice->shape({1, 2, 2, 16}); - _slice->name("slice"); - } - -public: - luci::CircleCustom *_preshift = nullptr; - luci::CircleSlice *_slice = nullptr; -}; - -class PreShiftSliceGraph : public luci::test::TestIOGraph, public PreShiftSliceGraphlet -{ -public: - void init(void) - { - luci::test::TestIOGraph::init({1, 4, 4, 16}, {1, 2, 2, 16}); - PreShiftSliceGraphlet::init(g()); - - _preshift->inputs(0, input()); - - output()->from(_slice); - } - - std::unique_ptr graph(void) { return std::move(_g); } -}; - -} // namespace - -TEST(ForwardPreShiftPassTest, preshift_slice) -{ - PreShiftSliceGraph g; - g.init(); - - ForwardPreShiftPass fpsp; - EXPECT_TRUE(fpsp.run(g.g())); - - auto pre = to_pre_shift(g.output()->from()); - EXPECT_NE(nullptr, pre); - - auto slice = dynamic_cast(pre->inputs(0)); - EXPECT_NE(nullptr, slice); - - EXPECT_EQ(4, pre->rank()); - EXPECT_EQ(1, pre->dim(0).value()); - EXPECT_EQ(2, pre->dim(1).value()); - EXPECT_EQ(2, pre->dim(2).value()); - EXPECT_EQ(16, pre->dim(3).value()); -} - -TEST(ForwardPreShiftPassTest, preshift_conv_NEG) -{ - PreShiftSliceGraph g; - g.init(); - - // Replace Pad with Conv2D - auto conv = g.g()->nodes()->create(); - conv->input(g._preshift); - g.output()->from(conv); - - ForwardPreShiftPass fpsp; - EXPECT_FALSE(fpsp.run(g.g())); -} diff --git 
a/compiler/fme-apply/src/pass/FusePostShiftPass.cpp b/compiler/fme-apply/src/pass/FusePostShiftPass.cpp deleted file mode 100644 index 744fede9ae2..00000000000 --- a/compiler/fme-apply/src/pass/FusePostShiftPass.cpp +++ /dev/null @@ -1,239 +0,0 @@ -/* - * Copyright (c) 2024 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "FusePostShiftPass.h" -#include "Support.Cast.h" -#include "RandomString.h" - -#include -#include -#include -#include -#include - -using namespace fme_apply; - -namespace -{ - -// Fuse Op + CircleCustom(PostShift) -struct FusePostShift final : public luci::CircleNodeMutableVisitor -{ - bool visit(luci::CircleNode *) { return false; } - - bool visit(luci::CircleConv2D *node) - { - if (node->fusedActivationFunction() != luci::FusedActFunc::NONE) - return false; - - bool changed = false; - for (auto succ : loco::succs(node)) - { - auto post_shift = to_post_shift(succ); - if (not post_shift) - continue; - - auto param = - loco::must_cast(post_shift->inputs(1)); // FIX_PostShift_UNLESS - auto bias = dynamic_cast(node->bias()); - if (not bias) - continue; - - uint32_t channel_size = bias->size(); - if (channel_size != param->size()) - { - assert(false); // FIX_PostScale_Unless - return false; - } - - auto cloned_conv = luci::clone_node(node, node->graph()); - assert(cloned_conv != nullptr); // FIX_CALLER_UNLESS - auto fused_conv = loco::must_cast(cloned_conv); - auto fused_bias = luci::clone(bias); - - fused_conv->name(node->name() + "_" + random_str()); - fused_bias->name(bias->name() + "_" + random_str()); - - add_origin(fused_conv, luci::get_origin(node)); - add_origin(fused_bias, luci::get_origin(bias)); - - // Add param to bias - for (uint32_t c = 0; c < channel_size; ++c) - { - float shift = param->at(c); - fused_bias->at(c) = - fused_bias->at(c) + shift; - } - - fused_conv->input(node->input()); - fused_conv->filter(node->filter()); - fused_conv->bias(fused_bias); - - loco::replace(post_shift).with(fused_conv); - changed = true; - } - - return changed; - } - - bool visit(luci::CircleDepthwiseConv2D *node) - { - if (node->fusedActivationFunction() != luci::FusedActFunc::NONE) - return false; - - bool changed = false; - for (auto succ : loco::succs(node)) - { - auto post_shift = to_post_shift(succ); - if (not post_shift) - continue; - - auto param = - loco::must_cast(post_shift->inputs(1)); // FIX_PostShift_UNLESS - auto bias = dynamic_cast(node->bias()); - if (not bias) - continue; - - uint32_t channel_size = bias->size(); - if (channel_size != param->size()) - { - assert(false); // FIX_PostScale_Unless - return false; - } - - auto cloned_dconv = luci::clone_node(node, node->graph()); - assert(cloned_dconv != nullptr); // FIX_CALLER_UNLESS - auto fused_dconv = loco::must_cast(cloned_dconv); - auto fused_bias = luci::clone(bias); - - fused_dconv->name(node->name() + "_" + random_str()); - fused_bias->name(bias->name() + "_" + random_str()); - - add_origin(fused_dconv, luci::get_origin(node)); - 
add_origin(fused_bias, luci::get_origin(bias)); - - // Add param to bias - for (uint32_t c = 0; c < channel_size; ++c) - { - float shift = param->at(c); - fused_bias->at(c) = - fused_bias->at(c) + shift; - } - - fused_dconv->input(node->input()); - fused_dconv->filter(node->filter()); - fused_dconv->bias(fused_bias); - - loco::replace(post_shift).with(fused_dconv); - changed = true; - } - - return changed; - } - - bool visit(luci::CircleTransposeConv *node) - { - bool changed = false; - for (auto succ : loco::succs(node)) - { - auto post_shift = to_post_shift(succ); - if (not post_shift) - continue; - - auto param = - loco::must_cast(post_shift->inputs(1)); // FIX_PostShift_UNLESS - - // TConv has bias. Update bias. - if (auto bias = dynamic_cast(node->bias())) - { - uint32_t channel_size = bias->size(); - if (channel_size != param->size()) - { - assert(false); // FIX_PostScale_Unless - return false; - } - - auto cloned_tconv = luci::clone_node(node, node->graph()); - assert(cloned_tconv != nullptr); // FIX_CALLER_UNLESS - auto fused_tconv = loco::must_cast(cloned_tconv); - auto fused_bias = luci::clone(bias); - - fused_tconv->name(node->name() + "_" + random_str()); - fused_bias->name(bias->name() + "_" + random_str()); - - add_origin(fused_tconv, luci::get_origin(node)); - add_origin(fused_bias, luci::get_origin(bias)); - - // Add param to bias - for (uint32_t c = 0; c < channel_size; ++c) - { - float shift = param->at(c); - fused_bias->at(c) = - fused_bias->at(c) + shift; - } - - fused_tconv->inputSizes(node->inputSizes()); - fused_tconv->outBackprop(node->outBackprop()); - fused_tconv->filter(node->filter()); - fused_tconv->bias(fused_bias); - - loco::replace(post_shift).with(fused_tconv); - changed = true; - continue; - } - - // TConv has no bias. Just use param - if (auto bias = dynamic_cast(node->bias())) - { - auto cloned_tconv = luci::clone_node(node, node->graph()); - assert(cloned_tconv != nullptr); // FIX_CALLER_UNLESS - auto fused_tconv = loco::must_cast(cloned_tconv); - - fused_tconv->inputSizes(node->inputSizes()); - fused_tconv->outBackprop(node->outBackprop()); - fused_tconv->filter(node->filter()); - fused_tconv->bias(param); - - loco::replace(post_shift).with(fused_tconv); - changed = true; - continue; - } - } - - return changed; - } -}; - -} // namespace - -namespace fme_apply -{ - -bool FusePostShiftPass::run(loco::Graph *g) -{ - bool changed = false; - for (auto node : loco::postorder_traversal(loco::output_nodes(g))) - { - FusePostShift fps; - auto cnode = loco::must_cast(node); - if (cnode->accept(&fps)) - changed = true; - } - - return changed; -} - -} // namespace fme_apply diff --git a/compiler/fme-apply/src/pass/FusePostShiftPass.h b/compiler/fme-apply/src/pass/FusePostShiftPass.h deleted file mode 100644 index 37c0b74a246..00000000000 --- a/compiler/fme-apply/src/pass/FusePostShiftPass.h +++ /dev/null @@ -1,59 +0,0 @@ -/* - * Copyright (c) 2024 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
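For the deleted fusion logic above, the arithmetic is an element-wise fold of the per-channel shift into the bias; the tests below rely on bias {2, 2, 2} plus shift {2, 2, 2} giving {4, 4, 4}. A short sketch of that fold:

    bias = [2.0, 2.0, 2.0]    # conv bias from the test graphs below
    shift = [2.0, 2.0, 2.0]   # PostShift parameter
    fused_bias = [b + s for b, s in zip(bias, shift)]
    assert fused_bias == [4.0, 4.0, 4.0]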
- */ - -#ifndef __FME_APPLY_FUSE_POST_SHIFT_PASS_H__ -#define __FME_APPLY_FUSE_POST_SHIFT_PASS_H__ - -#include - -#include - -namespace fme_apply -{ - -/** - * @brief Pass to fuse CircleCustom(PostShift) to preceding Ops - * - * BEFORE - * - * [Node1] - * | - * [Op] - * / \ - * [PostShift] [Node2] - * - * AFTER - * - * [Node1] - * / \ - * [Op'] [Op] - * | - * [Node2] - * - * NOTE Op' is clone of Op with updated bias. - */ -class FusePostShiftPass : public logo::Pass -{ -public: - virtual const char *name(void) const { return "fme::FusePostShiftPass"; } - -public: - bool run(loco::Graph *graph); -}; - -} // namespace fme_apply - -#endif //__FME_APPLY_FUSE_POST_SHIFT_PASS_H__ diff --git a/compiler/fme-apply/src/pass/FusePostShiftPass.test.cpp b/compiler/fme-apply/src/pass/FusePostShiftPass.test.cpp deleted file mode 100644 index f379623bb49..00000000000 --- a/compiler/fme-apply/src/pass/FusePostShiftPass.test.cpp +++ /dev/null @@ -1,346 +0,0 @@ -/* - * Copyright (c) 2024 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "FusePostShiftPass.h" -#include "Support.Cast.h" - -#include - -#include - -using namespace fme_apply; - -namespace -{ - -luci::CircleConst *create_const_node(loco::Graph *g, const loco::DataType dtype, - const std::vector &shape, - const std::vector &values) -{ - auto node = g->nodes()->create(); - node->dtype(dtype); - node->rank(shape.size()); - - uint32_t size = 1; - for (uint32_t i = 0; i < shape.size(); ++i) - { - node->dim(i) = shape[i]; - size *= shape[i]; - } - node->shape_status(luci::ShapeStatus::VALID); - - assert(values.size() == size); // FIX_CALLER_UNLESS - - node->size(size); - for (uint32_t i = 0; i < values.size(); ++i) - node->at(i) = values[i]; - - return node; -} - -/** - * PostShift-Conv graphlet - * - * [Conv] - * | - * [PostShift] - * - */ -class PostShiftConvGraphlet -{ -public: - void init(loco::Graph *g) - { - std::vector filter_val(3 * 3 * 3 * 3 /* size */, 1.0 /*value */); - - _conv = g->nodes()->create(); - _conv->filter( - create_const_node(g, loco::DataType::FLOAT32, {3, 3, 3, 3} /* shape */, filter_val)); - _conv->bias( - create_const_node(g, loco::DataType::FLOAT32, {3} /* shape */, {2, 2, 2} /* value*/)); - _conv->fusedActivationFunction(luci::FusedActFunc::NONE); - _conv->dtype(loco::DataType::FLOAT32); - _conv->shape({1, 4, 4, 3}); - _conv->padding(luci::Padding::SAME); - _conv->name("conv"); - - _postshift = g->nodes()->create(2 /* arity */, 1 /* out */); - _postshift->dtype(loco::DataType::FLOAT32); - _postshift->inputs(0, _conv); - _postshift->inputs( - 1, create_const_node(g, loco::DataType::FLOAT32, {3} /* shape */, {2, 2, 2} /* value */)); - _postshift->shape({1, 4, 4, 3}); - _postshift->custom_code("PostShift"); - _postshift->name("postshift"); - } - -public: - luci::CircleCustom *_postshift = nullptr; - luci::CircleConv2D *_conv = nullptr; -}; - -class PostShiftConvGraph : public luci::test::TestIOGraph, public PostShiftConvGraphlet -{ -public: - void 
init(void) - { - luci::test::TestIOGraph::init({1, 4, 4, 3}, {1, 4, 4, 3}); - PostShiftConvGraphlet::init(g()); - - _conv->input(input()); - - output()->from(_postshift); - } - - std::unique_ptr graph(void) { return std::move(_g); } -}; - -/** - * PostShift-DConv graphlet - * - * [DConv] - * | - * [PostShift] - * - */ -class PostShiftDConvGraphlet -{ -public: - void init(loco::Graph *g) - { - std::vector filter_val(1 * 3 * 3 * 3 /* size */, 1.0 /*value */); - - _dconv = g->nodes()->create(); - _dconv->filter( - create_const_node(g, loco::DataType::FLOAT32, {1, 3, 3, 3} /* shape */, filter_val)); - _dconv->bias( - create_const_node(g, loco::DataType::FLOAT32, {3} /* shape */, {2, 2, 2} /* value*/)); - _dconv->fusedActivationFunction(luci::FusedActFunc::NONE); - _dconv->dtype(loco::DataType::FLOAT32); - _dconv->shape({1, 4, 4, 3}); - _dconv->padding(luci::Padding::SAME); - _dconv->name("dconv"); - - _postshift = g->nodes()->create(2 /* arity */, 1 /* out */); - _postshift->dtype(loco::DataType::FLOAT32); - _postshift->inputs(0, _dconv); - _postshift->inputs( - 1, create_const_node(g, loco::DataType::FLOAT32, {3} /* shape */, {2, 2, 2} /* value */)); - _postshift->shape({1, 4, 4, 3}); - _postshift->custom_code("PostShift"); - _postshift->name("postshift"); - } - -public: - luci::CircleCustom *_postshift = nullptr; - luci::CircleDepthwiseConv2D *_dconv = nullptr; -}; - -class PostShiftDConvGraph : public luci::test::TestIOGraph, public PostShiftDConvGraphlet -{ -public: - void init(void) - { - luci::test::TestIOGraph::init({1, 4, 4, 3}, {1, 4, 4, 3}); - PostShiftDConvGraphlet::init(g()); - - _dconv->input(input()); - - output()->from(_postshift); - } - - std::unique_ptr graph(void) { return std::move(_g); } -}; - -/** - * PostShift-TConv graphlet - * - * [TConv] - * | - * [PostShift] - * - */ -class PostShiftTConvGraphlet -{ -public: - void init(loco::Graph *g) - { - std::vector filter_val(1 * 3 * 3 * 3 /* size */, 1.0 /*value */); - - _tconv = g->nodes()->create(); - _tconv->filter( - create_const_node(g, loco::DataType::FLOAT32, {1, 3, 3, 3} /* shape */, filter_val)); - _tconv->bias( - create_const_node(g, loco::DataType::FLOAT32, {3} /* shape */, {2, 2, 2} /* value*/)); - _tconv->dtype(loco::DataType::FLOAT32); - _tconv->shape({1, 4, 4, 3}); - _tconv->padding(luci::Padding::SAME); - _tconv->name("dconv"); - - _postshift = g->nodes()->create(2 /* arity */, 1 /* out */); - _postshift->dtype(loco::DataType::FLOAT32); - _postshift->inputs(0, _tconv); - _postshift->inputs( - 1, create_const_node(g, loco::DataType::FLOAT32, {3} /* shape */, {2, 2, 2} /* value */)); - _postshift->shape({1, 4, 4, 3}); - _postshift->custom_code("PostShift"); - _postshift->name("postshift"); - } - -public: - luci::CircleCustom *_postshift = nullptr; - luci::CircleTransposeConv *_tconv = nullptr; -}; - -class PostShiftTConvGraph : public luci::test::TestIOGraph, public PostShiftTConvGraphlet -{ -public: - void init(void) - { - luci::test::TestIOGraph::init({1, 4, 4, 3}, {1, 4, 4, 3}); - PostShiftTConvGraphlet::init(g()); - - _tconv->outBackprop(input()); - - output()->from(_postshift); - } - - std::unique_ptr graph(void) { return std::move(_g); } -}; - -} // namespace - -TEST(FusePostShiftPassTest, postshift_conv) -{ - PostShiftConvGraph g; - g.init(); - - FusePostShiftPass fpsp; - EXPECT_TRUE(fpsp.run(g.g())); - - auto conv = dynamic_cast(g.output()->from()); - EXPECT_NE(nullptr, conv); - - // Check bias - auto b = dynamic_cast(conv->bias()); - EXPECT_NE(nullptr, b); - EXPECT_EQ(loco::DataType::FLOAT32, b->dtype()); 
- EXPECT_EQ(3, b->size()); - for (uint32_t i = 0; i < 3; i++) - { - EXPECT_FLOAT_EQ(4.0, b->at(i)); - } -} - -TEST(FusePostShiftPassTest, postshift_conv_NEG) -{ - PostShiftConvGraph g; - g.init(); - g._conv->fusedActivationFunction(luci::FusedActFunc::RELU6); - - FusePostShiftPass fpsp; - EXPECT_FALSE(fpsp.run(g.g())); -} - -TEST(FusePostShiftPassTest, postshift_dconv) -{ - PostShiftDConvGraph g; - g.init(); - - FusePostShiftPass fpsp; - EXPECT_TRUE(fpsp.run(g.g())); - - auto dconv = dynamic_cast(g.output()->from()); - EXPECT_NE(nullptr, dconv); - - // Check bias - auto b = dynamic_cast(dconv->bias()); - EXPECT_NE(nullptr, b); - EXPECT_EQ(loco::DataType::FLOAT32, b->dtype()); - EXPECT_EQ(3, b->size()); - for (uint32_t i = 0; i < 3; i++) - { - EXPECT_FLOAT_EQ(4.0, b->at(i)); - } -} - -TEST(FusePostShiftPassTest, postshift_dconv_NEG) -{ - PostShiftDConvGraph g; - g.init(); - g._dconv->fusedActivationFunction(luci::FusedActFunc::RELU6); - - FusePostShiftPass fpsp; - EXPECT_FALSE(fpsp.run(g.g())); -} - -TEST(FusePostShiftPassTest, postshift_tconv) -{ - PostShiftTConvGraph g; - g.init(); - - FusePostShiftPass fpsp; - EXPECT_TRUE(fpsp.run(g.g())); - - auto tconv = dynamic_cast(g.output()->from()); - EXPECT_NE(nullptr, tconv); - - // Check bias - auto b = dynamic_cast(tconv->bias()); - EXPECT_NE(nullptr, b); - EXPECT_EQ(loco::DataType::FLOAT32, b->dtype()); - EXPECT_EQ(3, b->size()); - for (uint32_t i = 0; i < 3; i++) - { - EXPECT_FLOAT_EQ(4.0, b->at(i)); - } -} - -TEST(FusePostShiftPassTest, postshift_tconv_nobias) -{ - PostShiftTConvGraph g; - g.init(); - - auto no_bias = g.g()->nodes()->create(); - g._tconv->bias(no_bias); - - FusePostShiftPass fpsp; - EXPECT_TRUE(fpsp.run(g.g())); - - auto tconv = dynamic_cast(g.output()->from()); - EXPECT_NE(nullptr, tconv); - - // Check bias - auto b = dynamic_cast(tconv->bias()); - EXPECT_NE(nullptr, b); - EXPECT_EQ(loco::DataType::FLOAT32, b->dtype()); - EXPECT_EQ(3, b->size()); - for (uint32_t i = 0; i < 3; i++) - { - EXPECT_FLOAT_EQ(2.0, b->at(i)); - } -} - -TEST(FusePostShiftPassTest, postshift_tconv_NEG) -{ - PostShiftTConvGraph g; - g.init(); - g._postshift->inputs(0, g.input()); - g.output()->from(g._tconv); - - FusePostShiftPass fpsp; - EXPECT_FALSE(fpsp.run(g.g())); -} diff --git a/compiler/fme-apply/src/pass/FusePreShiftPass.cpp b/compiler/fme-apply/src/pass/FusePreShiftPass.cpp deleted file mode 100644 index 19cd9eb7e01..00000000000 --- a/compiler/fme-apply/src/pass/FusePreShiftPass.cpp +++ /dev/null @@ -1,76 +0,0 @@ -/* - * Copyright (c) 2024 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include "FusePreShiftPass.h" -#include "Support.Cast.h" - -#include -#include -#include -#include -#include - -using namespace fme_apply; - -namespace -{ - -// Fuse CircleCustom(PreShift) + Op -struct FusePreShift final : public luci::CircleNodeMutableVisitor -{ - bool visit(luci::CircleNode *) { return false; } - - bool visit(luci::CircleInstanceNorm *node) - { - auto pre_shift = to_pre_shift(node->input()); - if (not pre_shift) - return false; - - auto param = loco::must_cast(pre_shift->inputs(1)); // FIX_PreScale_UNLESS - auto channel = node->dim(node->rank() - 1).value(); - if (channel != param->size()) - { - assert(false); // FIX_PreShift_Unless - return false; - } - - // Output of InstanceNorm is not affected by PreShift - node->input(pre_shift->inputs(0)); - - return true; - } -}; - -} // namespace - -namespace fme_apply -{ - -bool FusePreShiftPass::run(loco::Graph *g) -{ - bool changed = false; - for (auto node : loco::postorder_traversal(loco::output_nodes(g))) - { - FusePreShift fps; - auto cnode = loco::must_cast(node); - if (cnode->accept(&fps)) - changed = true; - } - - return changed; -} - -} // namespace fme_apply diff --git a/compiler/fme-apply/src/pass/FusePreShiftPass.h b/compiler/fme-apply/src/pass/FusePreShiftPass.h deleted file mode 100644 index 31e3a4c0c98..00000000000 --- a/compiler/fme-apply/src/pass/FusePreShiftPass.h +++ /dev/null @@ -1,56 +0,0 @@ -/* - * Copyright (c) 2024 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef __FME_APPLY_FUSE_PRE_SHIFT_PASS_H__ -#define __FME_APPLY_FUSE_PRE_SHIFT_PASS_H__ - -#include - -#include - -namespace fme_apply -{ - -/** - * @brief Pass to fuse CircleCustom(PreShift) to succeeding Ops - * - * BEFORE - * - * [Node] - * | - * [PreShift] - * | - * [Op] - * - * AFTER - * - * [Node] - * | - * [Op'] - * - */ -class FusePreShiftPass : public logo::Pass -{ -public: - virtual const char *name(void) const { return "fme::FusePreShiftPass"; } - -public: - bool run(loco::Graph *graph); -}; - -} // namespace fme_apply - -#endif //__FME_APPLY_FUSE_PRE_SHIFT_PASS_H__ diff --git a/compiler/fme-apply/src/pass/FusePreShiftPass.test.cpp b/compiler/fme-apply/src/pass/FusePreShiftPass.test.cpp deleted file mode 100644 index f74f49c1cd8..00000000000 --- a/compiler/fme-apply/src/pass/FusePreShiftPass.test.cpp +++ /dev/null @@ -1,129 +0,0 @@ -/* - * Copyright (c) 2024 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
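The deleted FusePreShiftPass above rests on the identity that instance normalization is invariant to an additive constant: shifting the input shifts the mean by the same amount and leaves the variance unchanged. A small numeric sketch (simplified single-channel instnorm without scale/offset):

    def instnorm(xs, eps=1e-5):
        m = sum(xs) / len(xs)
        var = sum((x - m) ** 2 for x in xs) / len(xs)
        return [(x - m) / (var + eps) ** 0.5 for x in xs]

    xs = [1.0, 2.0, 3.0, 4.0]
    shifted = [x + 2.0 for x in xs]  # PreShift with a constant of 2
    assert all(abs(a - b) < 1e-9
               for a, b in zip(instnorm(xs), instnorm(shifted)))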
- */ - -#include "FusePreShiftPass.h" -#include "Support.Cast.h" - -#include - -#include - -using namespace fme_apply; - -namespace -{ - -luci::CircleConst *create_const_node(loco::Graph *g, const loco::DataType dtype, - const std::vector &shape, - const std::vector &values) -{ - auto node = g->nodes()->create(); - node->dtype(dtype); - node->rank(shape.size()); - - uint32_t size = 1; - for (uint32_t i = 0; i < shape.size(); ++i) - { - node->dim(i) = shape[i]; - size *= shape[i]; - } - node->shape_status(luci::ShapeStatus::VALID); - - assert(values.size() == size); // FIX_CALLER_UNLESS - - node->size(size); - for (uint32_t i = 0; i < values.size(); ++i) - node->at(i) = values[i]; - - return node; -} - -/** - * PreShift-Instnorm graphlet - * - * [PreShift] - * | - * [Instnorm] - * - */ -class PreShiftInstnormGraphlet -{ -public: - void init(loco::Graph *g) - { - _preshift = g->nodes()->create(2 /* arity */, 1 /* out */); - _preshift->dtype(loco::DataType::FLOAT32); - _preshift->inputs( - 1, create_const_node(g, loco::DataType::FLOAT32, {3} /* shape */, {2, 2, 2} /* value */)); - _preshift->shape({1, 4, 4, 3}); - _preshift->custom_code("PreShift"); - _preshift->name("prescale"); - - _instnorm = g->nodes()->create(); - _instnorm->input(_preshift); - _instnorm->fusedActivationFunction(luci::FusedActFunc::NONE); - _instnorm->dtype(loco::DataType::FLOAT32); - _instnorm->shape({1, 4, 4, 3}); - _instnorm->name("instnorm"); - } - -public: - luci::CircleCustom *_preshift = nullptr; - luci::CircleInstanceNorm *_instnorm = nullptr; -}; - -class PreShiftInstnormGraph : public luci::test::TestIOGraph, public PreShiftInstnormGraphlet -{ -public: - void init(void) - { - luci::test::TestIOGraph::init({1, 4, 4, 3}, {1, 4, 4, 3}); - PreShiftInstnormGraphlet::init(g()); - - _preshift->inputs(0, input()); - - output()->from(_instnorm); - } - - std::unique_ptr graph(void) { return std::move(_g); } -}; - -} // namespace - -TEST(FusePreShiftPassTest, preshift_instnorm) -{ - PreShiftInstnormGraph g; - g.init(); - - FusePreShiftPass fpsp; - EXPECT_TRUE(fpsp.run(g.g())); - - auto instnorm = dynamic_cast(g.output()->from()); - EXPECT_NE(nullptr, instnorm); - - auto pre_shift = to_pre_shift(instnorm->input()); - EXPECT_EQ(nullptr, pre_shift); // No pre_shift -} - -TEST(FusePreShiftPassTest, preshift_instnorm_NEG) -{ - PreShiftInstnormGraph g; - g.init(); - g._instnorm->input(g.input()); - - FusePreShiftPass fpsp; - EXPECT_FALSE(fpsp.run(g.g())); -} diff --git a/compiler/fme-detect/src/EqualizePatternFinder.cpp b/compiler/fme-detect/src/EqualizePatternFinder.cpp index 2d5c977ce8b..b856f03a34d 100644 --- a/compiler/fme-detect/src/EqualizePatternFinder.cpp +++ b/compiler/fme-detect/src/EqualizePatternFinder.cpp @@ -117,6 +117,27 @@ struct GetFusability final : public luci::CircleNodeMutableVisitor } return f; } + + Fusability visit(luci::CircleFullyConnected *node) + { + Fusability f; + { + f.pre_scale = true; + if (node->fusedActivationFunction() == luci::FusedActFunc::NONE) + { + f.post_scale = true; + f.post_shift = true; + } + // Negative scale is not fusable across ReLU, but fme-detect does not + // know the scale value. So, we assume that the scale is positive. 
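A quick numeric check of the positivity assumption stated above: relu(s * x) == s * relu(x) holds only for non-negative s, which is why only post_scale (and not post_shift) survives a fused RELU:

    def relu(x):
        return max(x, 0.0)

    for s in (2.0, -2.0):
        for x in (-3.0, 3.0):
            ok = relu(s * x) == s * relu(x)
            print(s, x, ok)  # True for s = 2.0, False for s = -2.0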
+ // NOTE If a pattern has negative scales, fm-equalize rejects the pattern + else if (node->fusedActivationFunction() == luci::FusedActFunc::RELU) + { + f.post_scale = true; + } + } + return f; + } }; Fusability fusability(luci::CircleNode *node) @@ -135,6 +156,7 @@ struct Forwardable }; // Return Forwardable of node +// Note that the degree of effect may vary from layer to layer. Forwardable forwardable(luci::CircleNode *node) { if (node == nullptr) @@ -145,10 +167,13 @@ Forwardable forwardable(luci::CircleNode *node) case luci::CircleOpcode::PAD: return {true, false}; case luci::CircleOpcode::MAX_POOL_2D: - // Assumption: all scale values are positive. - return {true, true}; - case luci::CircleOpcode::SLICE: return {true, true}; + case luci::CircleOpcode::RELU: + return {true, false}; + case luci::CircleOpcode::LEAKY_RELU: + return {true, false}; + case luci::CircleOpcode::GELU: + return {true, false}; default: return {false, false}; } @@ -168,13 +193,13 @@ void match(luci::CircleNode *front, std::vector &res) auto back = loco::must_cast(succ); auto back_fusability = fusability(back); - // If 'back' is not fusable with PreScale/Shift, we check if PreScale/Shift + // If 'back' is not fusable with PreScale, we check if PreScale // can forward across 'back' // TODO Generalize this code to support multiple forwardable Ops - if ((not back_fusability.pre_scale) and (not back_fusability.pre_shift)) + if (not back_fusability.pre_scale) { auto f = forwardable(back); - if (f.scale_forwardable or f.shift_forwardable) + if (f.scale_forwardable) { auto succ_succs = loco::succs(back); // Only support single successor for simplicity @@ -184,10 +209,17 @@ void match(luci::CircleNode *front, std::vector &res) auto next_back = loco::must_cast(next_succ); back_fusability = fusability(next_back); back_fusability.pre_scale &= f.scale_forwardable; - back_fusability.pre_shift &= f.shift_forwardable; + back = next_back; } } + if (front_fusability.post_scale and back_fusability.pre_scale) + { + res.emplace_back(front->name(), back->name(), EqualizePattern::Type::ScaleOnly); + } + + // TODO Let's consider "shift" when it is necessary. 
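Among the scale-forwardable ops listed above, MAX_POOL_2D is the interesting case: a positive per-channel scale commutes with max-pooling because multiplication by s > 0 preserves ordering. A toy 1-D check in Python:

    def maxpool(xs, k=2):
        return [max(xs[i:i + k]) for i in range(0, len(xs), k)]

    xs = [1.0, -2.0, 3.0, 0.5]
    s = 4.0  # positive scale (single channel for brevity)
    assert maxpool([s * x for x in xs]) == [s * y for y in maxpool(xs)]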
+#if 0 // Create EqualizePattern based on fusability // ScaleShift // front: fusable_post_shift and fusable_post_scale @@ -211,6 +243,7 @@ void match(luci::CircleNode *front, std::vector &res) { res.emplace_back(front->name(), back->name(), EqualizePattern::Type::ScaleOnly); } +#endif } } diff --git a/compiler/fme-detect/src/EqualizePatternFinder.test.cpp b/compiler/fme-detect/src/EqualizePatternFinder.test.cpp index 63a8ed01321..086225aa222 100644 --- a/compiler/fme-detect/src/EqualizePatternFinder.test.cpp +++ b/compiler/fme-detect/src/EqualizePatternFinder.test.cpp @@ -447,7 +447,7 @@ TEST(EqualizePatternFinderTest, conv_pad_conv) EXPECT_EQ(1, res.size()); EXPECT_EQ("conv1", res[0].front); - EXPECT_EQ("pad", res[0].back); + EXPECT_EQ("conv2", res[0].back); EXPECT_EQ(EqualizePattern::Type::ScaleOnly, res[0].type); } @@ -483,7 +483,7 @@ TEST(EqualizePatternFinderTest, conv_maxpool_conv) EXPECT_EQ(1, res.size()); EXPECT_EQ("conv1", res[0].front); - EXPECT_EQ("maxpool", res[0].back); + EXPECT_EQ("conv2", res[0].back); EXPECT_EQ(EqualizePattern::Type::ScaleOnly, res[0].type); } @@ -503,7 +503,7 @@ TEST(EqualizePatternFinderTest, conv_relu_pad_conv) EXPECT_EQ(1, res.size()); EXPECT_EQ("conv1", res[0].front); - EXPECT_EQ("pad", res[0].back); + EXPECT_EQ("conv2", res[0].back); EXPECT_EQ(EqualizePattern::Type::ScaleOnly, res[0].type); } @@ -541,25 +541,6 @@ TEST(EqualizePatternFinderTest, conv_tanh_pad_conv_NEG) EXPECT_EQ(0, res.size()); } -TEST(EqualizePatternFinderTest, tconv_slice_instnorm) -{ - EqualizePatternFinder::Context ctx; - { - ctx._allow_dup_op = true; - } - EqualizePatternFinder epf(ctx); - - TConvSliceInstnormGraph g; - g.init(); - - auto res = epf.find(g.g()); - - EXPECT_EQ(1, res.size()); - EXPECT_EQ("tconv", res[0].front); - EXPECT_EQ("slice", res[0].back); - EXPECT_EQ(EqualizePattern::Type::ShiftOnly, res[0].type); -} - TEST(EqualizePatternFinderTest, tconv_slice_NEG) { EqualizePatternFinder::Context ctx; @@ -577,28 +558,6 @@ TEST(EqualizePatternFinderTest, tconv_slice_NEG) EXPECT_EQ(0, res.size()); } -TEST(EqualizePatternFinderTest, dup_op) -{ - EqualizePatternFinder::Context ctx; - { - ctx._allow_dup_op = true; - } - EqualizePatternFinder epf(ctx); - - ConvConvINGraph g; - g.init(); - - auto res = epf.find(g.g()); - - EXPECT_EQ(2, res.size()); - EXPECT_EQ("conv1", res[0].front); - EXPECT_EQ("instnorm", res[0].back); - EXPECT_EQ(EqualizePattern::Type::ShiftOnly, res[0].type); - EXPECT_EQ("conv1", res[1].front); - EXPECT_EQ("conv2", res[1].back); - EXPECT_EQ(EqualizePattern::Type::ScaleOnly, res[1].type); -} - TEST(EqualizePatternFinderTest, dup_op_NEG) { EqualizePatternFinder::Context ctx; diff --git a/compiler/luci/service/include/luci/Service/CircleShapeInference.h b/compiler/luci/service/include/luci/Service/CircleShapeInference.h index 8906983576f..0f3bfdf7855 100644 --- a/compiler/luci/service/include/luci/Service/CircleShapeInference.h +++ b/compiler/luci/service/include/luci/Service/CircleShapeInference.h @@ -74,7 +74,7 @@ class Algorithm final : public luci::CircleNodeVisitor // loco::TensorShape visit(const luci::CircleFloor *node) final; // loco::TensorShape visit(const luci::CircleFloorDiv *node) final; // loco::TensorShape visit(const luci::CircleFloorMod *node) final; - // loco::TensorShape visit(const luci::CircleFullyConnected *node) final; + loco::TensorShape visit(const luci::CircleFullyConnected *node) final; // loco::TensorShape visit(const luci::CircleGather *node) final; // loco::TensorShape visit(const luci::CircleGatherNd *node) final; // 
loco::TensorShape visit(const luci::CircleGreater *node) final; @@ -112,7 +112,7 @@ class Algorithm final : public luci::CircleNodeVisitor // loco::TensorShape visit(const luci::CirclePow *node) final; // loco::TensorShape visit(const luci::CirclePRelu *node) final; loco::TensorShape visit(const luci::CircleQuantize *node) final; - // loco::TensorShape visit(const luci::CircleRange *node) final; + loco::TensorShape visit(const luci::CircleRange *node) final; // loco::TensorShape visit(const luci::CircleRank *node) final; // loco::TensorShape visit(const luci::CircleReduceAny *node) final; // loco::TensorShape visit(const luci::CircleReduceMax *node) final; @@ -146,7 +146,7 @@ class Algorithm final : public luci::CircleNodeVisitor // loco::TensorShape visit(const luci::CircleSquare *node) final; // loco::TensorShape visit(const luci::CircleSquaredDifference *node) final; // loco::TensorShape visit(const luci::CircleSqueeze *node) final; - // loco::TensorShape visit(const luci::CircleStridedSlice *node) final; + loco::TensorShape visit(const luci::CircleStridedSlice *node) final; // loco::TensorShape visit(const luci::CircleSub *node) final; // loco::TensorShape visit(const luci::CircleSum *node) final; // loco::TensorShape visit(const luci::CircleTanh *node) final; diff --git a/compiler/luci/service/src/CircleShapeInferenceHelper.cpp b/compiler/luci/service/src/CircleShapeInferenceHelper.cpp index 76867ccafc1..3d78a31a12e 100644 --- a/compiler/luci/service/src/CircleShapeInferenceHelper.cpp +++ b/compiler/luci/service/src/CircleShapeInferenceHelper.cpp @@ -161,7 +161,7 @@ loco::TensorShape broadcast_shape(const loco::TensorShape &x, const loco::Tensor return output_shape; } -loco::TensorShape pad_shape(const loco::TensorShape &input_shape, const luci::CircleConst *paddings) +loco::TensorShape pad_shape(const loco::TensorShape &input_shape, const luci::CircleNode *paddings) { const loco::DataType S32 = loco::DataType::S32; const loco::DataType S64 = loco::DataType::S64; @@ -180,6 +180,11 @@ loco::TensorShape pad_shape(const loco::TensorShape &input_shape, const luci::Ci loco::TensorShape output_shape; output_shape.rank(input_shape.rank()); + + auto const_padding = dynamic_cast(paddings); + if (const_padding == nullptr) + return output_shape; + for (int32_t ni = 0; ni < n; ++ni) { if (not input_shape.dim(ni).known()) @@ -189,15 +194,15 @@ loco::TensorShape pad_shape(const loco::TensorShape &input_shape, const luci::Ci } int32_t idx = ni * 2; int value = input_shape.dim(ni).value(); - if (paddings->dtype() == S32) + if (const_padding->dtype() == S32) { - value += paddings->at(idx + 0); // left - value += paddings->at(idx + 1); // right + value += const_padding->at(idx + 0); // left + value += const_padding->at(idx + 1); // right } else { - auto pl = paddings->at(idx + 0); - auto pr = paddings->at(idx + 1); + auto pl = const_padding->at(idx + 0); + auto pr = const_padding->at(idx + 1); auto max = static_cast(std::numeric_limits::max()); auto low = static_cast(std::numeric_limits::lowest()); LUCI_ASSERT(pl <= max, "paddings is over 32 bit limit"); diff --git a/compiler/luci/service/src/CircleShapeInferenceHelper.h b/compiler/luci/service/src/CircleShapeInferenceHelper.h index 4961e9c40de..1f99bb5b444 100644 --- a/compiler/luci/service/src/CircleShapeInferenceHelper.h +++ b/compiler/luci/service/src/CircleShapeInferenceHelper.h @@ -49,8 +49,8 @@ loco::TensorShape circle_shape(const luci::CircleNode *node); loco::TensorShape broadcast_shape(const loco::TensorShape &x, const loco::TensorShape &y); 
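Behaviorally, the relaxed pad_shape above now accepts any node as paddings: a CircleConst still adds the left/right pads per dimension, while a non-constant paddings node yields a shape whose dimensions are all unknown. A condensed Python model of the rule, with None marking an unknown dimension and plain lists standing in for the loco types:

    def pad_shape(input_shape, paddings):
        if paddings is None:  # stands in for "paddings is not a CircleConst"
            return [None] * len(input_shape)
        return [
            d + paddings[2 * i] + paddings[2 * i + 1] if d is not None else None
            for i, d in enumerate(input_shape)
        ]

    print(pad_shape([1, 4, 4, 3], [0, 0, 1, 1, 1, 1, 0, 0]))  # [1, 6, 6, 3]
    print(pad_shape([1, 4, 4, 3], None))  # [None, None, None, None]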
// Return shape of pad ops using paddings. -loco::TensorShape pad_shape(const loco::TensorShape &input_shape, - const luci::CircleConst *paddings); +// If paddings is not static, return the shape filled with unknown dimensions. +loco::TensorShape pad_shape(const loco::TensorShape &input_shape, const luci::CircleNode *paddings); /** * @brief Create a higher-rank TensorShape following NumPy broadcasting semantics diff --git a/compiler/luci/service/src/CircleShapeInferenceRule.cpp b/compiler/luci/service/src/CircleShapeInferenceRule.cpp index c6a57b5b723..b6c5d0d5bb7 100644 --- a/compiler/luci/service/src/CircleShapeInferenceRule.cpp +++ b/compiler/luci/service/src/CircleShapeInferenceRule.cpp @@ -19,7 +19,6 @@ #include "Check.h" #include "CircleShapeInferenceHelper.h" -#include "ShapeInfer_StridedSlice.h" #include #include @@ -637,47 +636,6 @@ loco::NodeShape infer_fill(const luci::CircleFill *node) return loco::NodeShape{shape}; } -loco::NodeShape infer_fully_connected(const luci::CircleFullyConnected *node) -{ - auto input_shape = luci::shape_get(node->input()).as(); - auto weights_shape = luci::shape_get(node->weights()).as(); - - loco::TensorShape out_shape; - - // NOTE Some recipes in some repositories are using rank 4 input for FullyConnected. - // Until they are all fixed, disable following assert. - // TODO Enable following assert after related fixes are applied - // https://github.com/tensorflow/tensorflow/blob/ea33c1e7a25d8025e8ee405ad8ab7be261798d76/tensorflow/lite/kernels/fully_connected.cc#L194 - // LUCI_ASSERT(input_shape.rank() == 2 || input_shape.rank() == 3, - // "Input rank of FullyConnected should be 2 or 3"); - - // https://github.com/tensorflow/tensorflow/blob/ea33c1e7a25d8025e8ee405ad8ab7be261798d76/tensorflow/lite/kernels/fully_connected.cc#L225 - LUCI_ASSERT(weights_shape.rank() == 2, "Weights of FullyConnected should be 2"); - - // https://github.com/tensorflow/tensorflow/blob/ea33c1e7a25d8025e8ee405ad8ab7be261798d76/tensorflow/lite/kernels/fully_connected.cc#L353-L367 - if (node->keep_num_dims()) - { - out_shape.rank(input_shape.rank()); - for (uint32_t i = 0; i < input_shape.rank(); ++i) - out_shape.dim(i) = input_shape.dim(i); - out_shape.dim(out_shape.rank() - 1) = weights_shape.dim(0); - } - else - { - uint32_t input_size = 1; - for (uint32_t i = 0; i < input_shape.rank(); i++) - { - input_size = input_size * input_shape.dim(i).value(); - } - const uint32_t batch_size = input_size / weights_shape.dim(1).value(); - out_shape.rank(2); - out_shape.dim(0) = batch_size; - out_shape.dim(1) = weights_shape.dim(0); - } - - return loco::NodeShape{out_shape}; -} - loco::NodeShape infer_gather(const luci::CircleGather *node) { loco::TensorShape output_shape; @@ -953,49 +911,6 @@ loco::NodeShape infer_p_relu(const luci::CirclePRelu *node) return loco::NodeShape{output_shape}; } -loco::NodeShape infer_range(const luci::CircleRange *node) -{ - loco::TensorShape output_shape; - output_shape.rank(1); - - auto start_node = dynamic_cast(node->start()); - auto limit_node = dynamic_cast(node->limit()); - auto delta_node = dynamic_cast(node->delta()); - - if (start_node == nullptr || limit_node == nullptr || delta_node == nullptr) - { - return use_own(node); - } - - double start = 0, limit = 0, delta = 0; - -#define GET_RANGE_PARAM(DT) \ - start = start_node->scalar
<DT>(); \
-  limit = limit_node->scalar<DT>(); \
-  delta = delta_node->scalar
(); - - switch (start_node->dtype()) - { - case loco::DataType::FLOAT32: - GET_RANGE_PARAM(loco::DataType::FLOAT32) - break; - case loco::DataType::S32: - GET_RANGE_PARAM(loco::DataType::S32) - break; - default: - INTERNAL_EXN("Range data type not supported"); - } - -#undef GET_RANGE_PARAM - - if (delta == 0) - INTERNAL_EXN("Delta can not be zero"); - - output_shape.dim(0) = ceil((limit - start) / delta); - - return loco::NodeShape{output_shape}; -} - template loco::NodeShape infer_resize_type(const CIRCLENODE *node) { auto input_shape = luci::shape_get(node->input()).template as(); @@ -1306,21 +1221,6 @@ loco::NodeShape infer_sparse_to_dense(const luci::CircleSparseToDense *node) return loco::NodeShape{shape}; } -loco::NodeShape infer_strided_slice(const luci::CircleStridedSlice *node) -{ - auto begin_node = dynamic_cast(node->begin()); - auto end_node = dynamic_cast(node->end()); - auto strides_node = dynamic_cast(node->strides()); - - if (begin_node == nullptr || end_node == nullptr || strides_node == nullptr) - { - return use_own(node); - } - - loco::TensorShape shape = infer_output_shape(node); - return loco::NodeShape{shape}; -} - loco::NodeShape infer_squeeze(const luci::CircleSqueeze *node) { auto input_shape = luci::shape_get(node->input()).as(); @@ -1967,11 +1867,6 @@ class ShapeInferenceAlgorithm final : public luci::CircleNodeVisitor::visit(const luci::CircleFullyConnected return cloned; } +namespace sinf +{ + +loco::TensorShape Algorithm::visit(const luci::CircleFullyConnected *node) +{ + auto input_shape = circle_shape(loco::must_cast(node->input())); + auto weights_shape = circle_shape(loco::must_cast(node->weights())); + + loco::TensorShape out_shape; + + // NOTE Some recipes in some repositories are using rank 4 input for FullyConnected. + // Until they are all fixed, disable following assert. + // TODO Enable following assert after related fixes are applied + // https://github.com/tensorflow/tensorflow/blob/ea33c1e7a25d8025e8ee405ad8ab7be261798d76/tensorflow/lite/kernels/fully_connected.cc#L194 + // LUCI_ASSERT(input_shape.rank() == 2 || input_shape.rank() == 3, + // "Input rank of FullyConnected should be 2 or 3"); + + // https://github.com/tensorflow/tensorflow/blob/ea33c1e7a25d8025e8ee405ad8ab7be261798d76/tensorflow/lite/kernels/fully_connected.cc#L225 + LUCI_ASSERT(weights_shape.rank() == 2, "Weights of FullyConnected should be 2"); + LUCI_ASSERT(weights_shape.dim(0).known() && weights_shape.dim(1).known(), + "Weights of FullyConnected should be known") + // https://github.com/tensorflow/tensorflow/blob/ea33c1e7a25d8025e8ee405ad8ab7be261798d76/tensorflow/lite/kernels/fully_connected.cc#L353-L367 + + /* + * **Pre-conditions:** + * input_shape.rank() <= 4 + * * remark: TFLite allows <=3 ranks, but there are rank 4 input recipes in ONE + * weights_shape.rank() == 2 and all dimensions are known. + * When runtime(input_shape[-1] and weights_shape[-1] are both known), it should be same value. + * + * **Shape Inference Rule:** + * **Input Shape:** + * input_shape : (A, B, C, D) + * weights_shape : (E, F) + * A, B, C, D are "positive numbers" or "unknown". + * E, F are always "positive numbers". + * + * **Output Shape:** + * If keep_dims = True : (A, B, C, E) + * If keep_dims = False : (G, E) + * * G = unknown (if any of A, B, or C is unknown.) + * * G = A * B * C (otherwise.) 
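+ *
+ * **Worked example (illustrative, derived from the rule above):**
+ * input_shape (2, 3, unknown, 20) with weights_shape (30, 20):
+ * keep_num_dims = True  -> (2, 3, unknown, 30)
+ * keep_num_dims = False -> (unknown, 30), since one of A, B, C is unknown
+ * With a fully known input (2, 3, 4, 20) and keep_num_dims = False, the
+ * output is (24, 30), where 24 = 2 * 3 * 4.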
+ */ + + if (node->keep_num_dims()) + { + out_shape.rank(input_shape.rank()); + for (uint32_t i = 0; i < input_shape.rank(); ++i) + out_shape.dim(i) = input_shape.dim(i); + out_shape.dim(out_shape.rank() - 1) = weights_shape.dim(0); + } + else + { + bool is_dynamic_shape = false; + + for (uint32_t i = 0; i < input_shape.rank() - 1; i++) + { + if (not input_shape.dim(i).known()) + { + is_dynamic_shape = true; + break; + } + } + + uint32_t batch_size = 1; + + for (uint32_t i = 0; i < input_shape.rank() - 1; i++) + { + batch_size *= input_shape.dim(i).value(); + } + + out_shape.rank(2); + if (is_dynamic_shape) + out_shape.dim(0).unset(); + else + out_shape.dim(0) = batch_size; + out_shape.dim(1) = weights_shape.dim(0); + } + + return out_shape; +} + +} // namespace sinf + } // namespace luci diff --git a/compiler/luci/service/src/Nodes/CircleFullyConnected.test.cpp b/compiler/luci/service/src/Nodes/CircleFullyConnected.test.cpp index 965b5913051..9fcab69d4ec 100644 --- a/compiler/luci/service/src/Nodes/CircleFullyConnected.test.cpp +++ b/compiler/luci/service/src/Nodes/CircleFullyConnected.test.cpp @@ -16,6 +16,8 @@ #include "luci/Service/CircleNodeClone.h" +#include "luci/Service/CircleShapeInference.h" + #include TEST(CloneNodeTest, clone_FullyConnected) @@ -59,3 +61,300 @@ TEST(CloneNodeTest, clone_FullyConnected_wf_NEG) auto cloned = luci::clone_node(node_fc, gc.get()); ASSERT_EQ(nullptr, cloned); } + +TEST(ShapeRuleTest, fully_connected_dynamic_shape_keep_dims) +{ + luci::CircleInput input; + luci::CircleConst weights; + luci::CircleConst bias; + luci::CircleFullyConnected fully_connected; + + input.shape({1, 10, 15, 20}); + input.shape_status(luci::ShapeStatus::VALID); + input.dim(1).unset(); + + weights.shape({30, 20}); + weights.shape_status(luci::ShapeStatus::VALID); + + bias.shape_status(luci::ShapeStatus::VALID); + + fully_connected.input(&input); + fully_connected.weights(&weights); + fully_connected.bias(&bias); + fully_connected.keep_num_dims(true); + + loco::TensorShape shape; + luci::sinf::Rule shape_inf_rule; + + ASSERT_TRUE(shape_inf_rule.infer(&fully_connected, shape)); + ASSERT_EQ(4, shape.rank()); + ASSERT_TRUE(shape.dim(0).known()); + ASSERT_FALSE(shape.dim(1).known()); + ASSERT_TRUE(shape.dim(2).known()); + ASSERT_TRUE(shape.dim(3).known()); + + ASSERT_EQ(1, shape.dim(0).value()); + ASSERT_EQ(0, shape.dim(1).value()); + ASSERT_EQ(15, shape.dim(2).value()); + ASSERT_EQ(30, shape.dim(3).value()); +} + +TEST(ShapeRuleTest, fully_connected_last_dim_dynamic_keep_dims) +{ + luci::CircleInput input; + luci::CircleConst weights; + luci::CircleConst bias; + luci::CircleFullyConnected fully_connected; + + input.shape({1, 10, 15, 20}); + input.shape_status(luci::ShapeStatus::VALID); + input.dim(3).unset(); + + weights.shape({30, 20}); + weights.shape_status(luci::ShapeStatus::VALID); + + bias.shape_status(luci::ShapeStatus::VALID); + + fully_connected.input(&input); + fully_connected.weights(&weights); + fully_connected.bias(&bias); + fully_connected.keep_num_dims(true); + + loco::TensorShape shape; + luci::sinf::Rule shape_inf_rule; + + ASSERT_TRUE(shape_inf_rule.infer(&fully_connected, shape)); + ASSERT_EQ(4, shape.rank()); + ASSERT_TRUE(shape.dim(0).known()); + ASSERT_TRUE(shape.dim(1).known()); + ASSERT_TRUE(shape.dim(2).known()); + ASSERT_TRUE(shape.dim(3).known()); + + ASSERT_EQ(1, shape.dim(0).value()); + ASSERT_EQ(10, shape.dim(1).value()); + ASSERT_EQ(15, shape.dim(2).value()); + ASSERT_EQ(30, shape.dim(3).value()); +} + +TEST(ShapeRuleTest, 
fully_connected_dynamic_shape_no_keep_dims) +{ + luci::CircleInput input; + luci::CircleConst weights; + luci::CircleConst bias; + luci::CircleFullyConnected fully_connected; + + input.shape({1, 10, 15, 20}); + input.shape_status(luci::ShapeStatus::VALID); + input.dim(2).unset(); + + weights.shape({30, 20}); + weights.shape_status(luci::ShapeStatus::VALID); + + bias.shape_status(luci::ShapeStatus::VALID); + + fully_connected.input(&input); + fully_connected.weights(&weights); + fully_connected.bias(&bias); + fully_connected.keep_num_dims(false); + + loco::TensorShape shape; + luci::sinf::Rule shape_inf_rule; + + ASSERT_TRUE(shape_inf_rule.infer(&fully_connected, shape)); + ASSERT_EQ(2, shape.rank()); + ASSERT_FALSE(shape.dim(0).known()); + ASSERT_TRUE(shape.dim(1).known()); + + ASSERT_EQ(0, shape.dim(0).value()); + ASSERT_EQ(30, shape.dim(1).value()); +} + +TEST(ShapeRuleTest, fully_connected_last_dim_dynamic_no_keep_dims) +{ + luci::CircleInput input; + luci::CircleConst weights; + luci::CircleConst bias; + luci::CircleFullyConnected fully_connected; + + input.shape({1, 10, 15, 20}); + input.shape_status(luci::ShapeStatus::VALID); + input.dim(3).unset(); + + weights.shape({30, 20}); + weights.shape_status(luci::ShapeStatus::VALID); + + bias.shape_status(luci::ShapeStatus::VALID); + + fully_connected.input(&input); + fully_connected.weights(&weights); + fully_connected.bias(&bias); + fully_connected.keep_num_dims(false); + + loco::TensorShape shape; + luci::sinf::Rule shape_inf_rule; + + ASSERT_TRUE(shape_inf_rule.infer(&fully_connected, shape)); + ASSERT_EQ(2, shape.rank()); + ASSERT_TRUE(shape.dim(0).known()); + ASSERT_TRUE(shape.dim(1).known()); + + ASSERT_EQ(150, shape.dim(0).value()); + ASSERT_EQ(30, shape.dim(1).value()); +} + +TEST(ShapeRuleTest, fully_connected_all_dim_dynamic_no_keep_dims) +{ + luci::CircleInput input; + luci::CircleConst weights; + luci::CircleConst bias; + luci::CircleFullyConnected fully_connected; + + input.shape({1, 10, 15, 20}); + input.shape_status(luci::ShapeStatus::VALID); + input.dim(0).unset(); + input.dim(1).unset(); + input.dim(2).unset(); + input.dim(3).unset(); + + weights.shape({30, 20}); + weights.shape_status(luci::ShapeStatus::VALID); + + bias.shape_status(luci::ShapeStatus::VALID); + + fully_connected.input(&input); + fully_connected.weights(&weights); + fully_connected.bias(&bias); + fully_connected.keep_num_dims(false); + + loco::TensorShape shape; + luci::sinf::Rule shape_inf_rule; + + ASSERT_TRUE(shape_inf_rule.infer(&fully_connected, shape)); + ASSERT_EQ(2, shape.rank()); + ASSERT_FALSE(shape.dim(0).known()); + ASSERT_TRUE(shape.dim(1).known()); + + ASSERT_EQ(0, shape.dim(0).value()); + ASSERT_EQ(30, shape.dim(1).value()); +} + +TEST(ShapeRuleTest, fully_connected_nullptr_weights_NEG) +{ + luci::CircleInput input; + luci::CircleConst weights; + luci::CircleConst bias; + luci::CircleFullyConnected fully_connected; + + input.shape({1, 10, 20}); + input.shape_status(luci::ShapeStatus::VALID); + + bias.shape_status(luci::ShapeStatus::VALID); + + fully_connected.input(&input); + fully_connected.weights(nullptr); + fully_connected.bias(&bias); + fully_connected.keep_num_dims(true); + + loco::TensorShape shape; + luci::sinf::Rule shape_inf_rule; + + ASSERT_ANY_THROW(shape_inf_rule.infer(&fully_connected, shape)); +} + +TEST(ShapeRuleTest, fully_connected_nullptr_input_NEG) +{ + luci::CircleInput input; + luci::CircleConst weights; + luci::CircleConst bias; + luci::CircleFullyConnected fully_connected; + + weights.shape({30, 20}); + 
weights.shape_status(luci::ShapeStatus::VALID); + + bias.shape_status(luci::ShapeStatus::VALID); + + fully_connected.input(nullptr); + fully_connected.weights(&weights); + fully_connected.bias(&bias); + fully_connected.keep_num_dims(true); + + loco::TensorShape shape; + luci::sinf::Rule shape_inf_rule; + + ASSERT_ANY_THROW(shape_inf_rule.infer(&fully_connected, shape)); +} + +TEST(ShapeRuleTest, fully_connected_nullptr_bias_NEG) +{ + luci::CircleInput input; + luci::CircleConst weights; + luci::CircleConst bias; + luci::CircleFullyConnected fully_connected; + + input.shape({1, 15, 20}); + input.shape_status(luci::ShapeStatus::VALID); + + weights.shape({30, 20}); + weights.shape_status(luci::ShapeStatus::VALID); + + fully_connected.input(&input); + fully_connected.weights(&weights); + fully_connected.bias(nullptr); + fully_connected.keep_num_dims(true); + + loco::TensorShape shape; + luci::sinf::Rule shape_inf_rule; + + ASSERT_ANY_THROW(shape_inf_rule.infer(&fully_connected, shape)); +} + +TEST(ShapeRuleTest, fully_connected_undefined_bias_NEG) +{ + luci::CircleInput input; + luci::CircleConst weights; + luci::CircleConst bias; + luci::CircleFullyConnected fully_connected; + + input.shape({1, 15, 20}); + input.shape_status(luci::ShapeStatus::VALID); + + weights.shape({30, 20}); + weights.shape_status(luci::ShapeStatus::VALID); + + bias.shape_status(luci::ShapeStatus::UNDEFINED); + + fully_connected.input(&input); + fully_connected.weights(&weights); + fully_connected.bias(&bias); + fully_connected.keep_num_dims(true); + + loco::TensorShape shape; + luci::sinf::Rule shape_inf_rule; + + ASSERT_FALSE(shape_inf_rule.infer(&fully_connected, shape)); +} + +TEST(ShapeRuleTest, fully_connected_undefined_input_NEG) +{ + luci::CircleInput input; + luci::CircleConst weights; + luci::CircleConst bias; + luci::CircleFullyConnected fully_connected; + + input.shape_status(luci::ShapeStatus::UNDEFINED); + + weights.shape({30, 20}); + weights.shape_status(luci::ShapeStatus::VALID); + + bias.shape_status(luci::ShapeStatus::VALID); + + fully_connected.input(&input); + fully_connected.weights(&weights); + fully_connected.bias(&bias); + fully_connected.keep_num_dims(true); + + loco::TensorShape shape; + luci::sinf::Rule shape_inf_rule; + + ASSERT_FALSE(shape_inf_rule.infer(&fully_connected, shape)); +} diff --git a/compiler/luci/service/src/Nodes/CirclePad.cpp b/compiler/luci/service/src/Nodes/CirclePad.cpp index 2f4f90140af..8dbaf399b54 100644 --- a/compiler/luci/service/src/Nodes/CirclePad.cpp +++ b/compiler/luci/service/src/Nodes/CirclePad.cpp @@ -32,8 +32,7 @@ namespace sinf loco::TensorShape Algorithm::visit(const luci::CirclePad *node) { - // TODO support non-const case - auto paddings = loco::must_cast(node->paddings()); + auto paddings = loco::must_cast(node->paddings()); auto circle_input = loco::must_cast(node->input()); auto input_shape = circle_shape(circle_input); return pad_shape(input_shape, paddings); diff --git a/compiler/luci/service/src/Nodes/CirclePad.test.cpp b/compiler/luci/service/src/Nodes/CirclePad.test.cpp index 996426164f3..070b9b31075 100644 --- a/compiler/luci/service/src/Nodes/CirclePad.test.cpp +++ b/compiler/luci/service/src/Nodes/CirclePad.test.cpp @@ -90,3 +90,37 @@ TEST(ShapeRuleTest, pad_without_padding_NEG) ASSERT_ANY_THROW(shape_inf_rule.infer(node_pad, shape)); } + +TEST(ShapeRuleTest, pad_non_const_paddings) +{ + auto g = loco::make_graph(); + auto node_pad = g->nodes()->create(); + + auto node_paddings = g->nodes()->create(); + auto node_input = g->nodes()->create(); + + 
loco::TensorShape shape; + luci::sinf::Rule shape_inf_rule; + + node_input->shape({1, 2, 3, 4}); + node_input->shape_status(luci::ShapeStatus::VALID); + + node_paddings->dtype(loco::DataType::S64); + node_paddings->shape({4, 2}); + node_paddings->shape_status(luci::ShapeStatus::VALID); + + node_pad->input(node_input); + node_pad->paddings(node_paddings); + + ASSERT_TRUE(shape_inf_rule.infer(node_pad, shape)); + ASSERT_EQ(shape.rank(), 4); + ASSERT_FALSE(shape.dim(0).known()); + ASSERT_FALSE(shape.dim(1).known()); + ASSERT_FALSE(shape.dim(2).known()); + ASSERT_FALSE(shape.dim(3).known()); + + ASSERT_EQ(0, shape.dim(0).value()); + ASSERT_EQ(0, shape.dim(1).value()); + ASSERT_EQ(0, shape.dim(2).value()); + ASSERT_EQ(0, shape.dim(3).value()); +} diff --git a/compiler/luci/service/src/Nodes/CircleRange.cpp b/compiler/luci/service/src/Nodes/CircleRange.cpp index ccb975d4ea0..b2c8be4d5f8 100644 --- a/compiler/luci/service/src/Nodes/CircleRange.cpp +++ b/compiler/luci/service/src/Nodes/CircleRange.cpp @@ -14,7 +14,12 @@ * limitations under the License. */ +#include "luci/Service/CircleShapeInference.h" + #include "CircleCloneNode.h" +#include "CircleShapeInferenceHelper.h" + +#include namespace luci { @@ -24,4 +29,66 @@ luci::CircleNode *CloneNodeLet::visit(const luci::CircleRange *) return _graph->nodes()->create(); } +namespace sinf +{ + +loco::TensorShape Algorithm::visit(const luci::CircleRange *node) +{ + loco::TensorShape output_shape; + output_shape.rank(1); + + auto start_node = dynamic_cast(node->start()); + auto limit_node = dynamic_cast(node->limit()); + auto delta_node = dynamic_cast(node->delta()); + + if (start_node == nullptr || limit_node == nullptr || delta_node == nullptr) + { + // We use shape from the node itself + loco::TensorShape shape; + shape.rank(node->rank()); + for (uint32_t r = 0; r < node->rank(); ++r) + { + // TODO remove this copy from `use_own(node);` + // Shape inference rules in this file did not consider unknown dimension. + // If some node has unknown dimension, 0 is inserted and wrong shape + // inference was done as a result. + // To fix this, new shape inference algorithm is being implemented. + // Until new inference algorithm is fully implemented, unknown dimension + // would be represented as 1 along with TFLite expression. + shape.dim(r) = node->dim(r).known() ? node->dim(r).value() : 1; + } + return shape; + } + + double start = 0, limit = 0, delta = 0; + +#define GET_RANGE_PARAM(DT) \ + start = start_node->scalar
<DT>(); \
+  limit = limit_node->scalar<DT>(); \
+  delta = delta_node->scalar
(); + + switch (start_node->dtype()) + { + case loco::DataType::FLOAT32: + GET_RANGE_PARAM(loco::DataType::FLOAT32) + break; + case loco::DataType::S32: + GET_RANGE_PARAM(loco::DataType::S32) + break; + default: + INTERNAL_EXN("Range data type not supported"); + } + +#undef GET_RANGE_PARAM + + if (delta == 0) + INTERNAL_EXN("Delta can not be zero"); + + output_shape.dim(0) = ceil((limit - start) / delta); + + return output_shape; +} + +} // namespace sinf + } // namespace luci diff --git a/compiler/luci/service/src/Nodes/CircleRange.test.cpp b/compiler/luci/service/src/Nodes/CircleRange.test.cpp index b2fb296177a..b67d287d1ab 100644 --- a/compiler/luci/service/src/Nodes/CircleRange.test.cpp +++ b/compiler/luci/service/src/Nodes/CircleRange.test.cpp @@ -15,6 +15,7 @@ */ #include "luci/Service/CircleNodeClone.h" +#include "luci/Service/CircleShapeInference.h" #include @@ -31,3 +32,66 @@ TEST(CloneNodeTest, clone_Range) auto cloned_range = dynamic_cast(cloned); ASSERT_NE(nullptr, cloned_range); } + +TEST(ShapeRuleTest, range_const_param) +{ + luci::CircleConst start, limit, delta; + luci::CircleRange range; + + start.dtype(loco::DataType::S32); + start.size(1); + start.at(0) = 0; + start.shape_status(luci::ShapeStatus::VALID); + + limit.dtype(loco::DataType::S32); + limit.size(1); + limit.at(0) = 10; + limit.shape_status(luci::ShapeStatus::VALID); + + delta.dtype(loco::DataType::S32); + delta.size(1); + delta.at(0) = 2; + delta.shape_status(luci::ShapeStatus::VALID); + + range.start(&start); + range.limit(&limit); + range.delta(&delta); + + loco::TensorShape shape; + luci::sinf::Rule shape_inf_rule; + + ASSERT_TRUE(shape_inf_rule.infer(&range, shape)); + ASSERT_EQ(1, shape.rank()); + ASSERT_TRUE(shape.dim(0).known()); + ASSERT_EQ(5, shape.dim(0).value()); +} + +TEST(ShapeRuleTest, range_zero_delta_NEG) +{ + luci::CircleConst start, limit, delta; + luci::CircleRange range; + + start.dtype(loco::DataType::S32); + start.size(1); + start.at(0) = 0; + start.shape_status(luci::ShapeStatus::VALID); + + limit.dtype(loco::DataType::S32); + limit.size(1); + limit.at(0) = 10; + limit.shape_status(luci::ShapeStatus::VALID); + + delta.dtype(loco::DataType::S32); + delta.size(1); + delta.at(0) = 0; + delta.shape_status(luci::ShapeStatus::VALID); + + range.start(&start); + range.limit(&limit); + range.delta(&delta); + + loco::TensorShape shape; + luci::sinf::Rule shape_inf_rule; + + ASSERT_ANY_THROW(shape_inf_rule.infer(&range, shape)); +} diff --git a/compiler/luci/service/src/Nodes/CircleStridedSlice.cpp b/compiler/luci/service/src/Nodes/CircleStridedSlice.cpp index 3298c92b53e..cc594f63ddc 100644 --- a/compiler/luci/service/src/Nodes/CircleStridedSlice.cpp +++ b/compiler/luci/service/src/Nodes/CircleStridedSlice.cpp @@ -1,5 +1,6 @@ /* * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright 2018 The TensorFlow Authors. All Rights Reserved. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -13,8 +14,21 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ +#include "luci/Service/CircleShapeInference.h" +#include "Check.h" #include "CircleCloneNode.h" +#include "CircleShapeInferenceHelper.h" + +#include +#include +#include +#include + +#include +#include +#include +#include namespace luci { @@ -33,4 +47,413 @@ luci::CircleNode *CloneNodeLet::visit(const luci::CircleStridedSlice * return cloned; } +// code referenced from +// https://github.com/tensorflow/tensorflow/blob/3f878cff5b698b82eea85db2b60d65a2e320850e/ +// tensorflow/lite/kernels/strided_slice.cc +// tensorflow/lite/kernels/internal/strided_slice_logic.h +namespace sinf +{ + +// This Op only supports 1-5D cases and since we use the reference 4D +// implementation, the 1-3D tensors are mapped to 4D. +const int kMaxDim = 5; + +const loco::DataType S32 = loco::DataType::S32; + +struct StridedSliceParams +{ + int8_t start_indices_count = 0; + int32_t start_indices[kMaxDim]; + int8_t stop_indices_count = 0; + int32_t stop_indices[kMaxDim]; + int8_t strides_count = 0; + int32_t strides[kMaxDim]; + + int16_t begin_mask = 0; + int16_t ellipsis_mask = 0; + int16_t end_mask = 0; + int16_t new_axis_mask = 0; + int16_t shrink_axis_mask = 0; +}; + +struct StridedSliceContext +{ + StridedSliceContext(const luci::CircleStridedSlice *node) + { + // check overflow issues + assert(static_cast(node->begin_mask()) == node->begin_mask()); + assert(static_cast(node->ellipsis_mask()) == node->ellipsis_mask()); + assert(static_cast(node->end_mask()) == node->end_mask()); + assert(static_cast(node->new_axis_mask()) == node->new_axis_mask()); + assert(static_cast(node->shrink_axis_mask()) == node->shrink_axis_mask()); + + params.begin_mask = node->begin_mask(); + params.ellipsis_mask = node->ellipsis_mask(); + params.end_mask = node->end_mask(); + params.new_axis_mask = node->new_axis_mask(); + params.shrink_axis_mask = node->shrink_axis_mask(); + + input = loco::must_cast(node->input()); + begin = loco::must_cast(node->begin()); + end = loco::must_cast(node->end()); + strides = loco::must_cast(node->strides()); + + loco::TensorShape input_shape = circle_shape(input); + input_dims = input_shape.rank(); + } + StridedSliceParams params; + luci::CircleNode *input = nullptr; + luci::CircleConst *begin = nullptr; + luci::CircleConst *end = nullptr; + luci::CircleConst *strides = nullptr; + + // Equivalent input shape after adding axis according to new_axis_mask. + loco::TensorShape effective_input_shape; + int64_t input_dims = 0; +}; + +// Use until std::clamp() is available from C++17. +inline int Clamp(const int32_t v, const int32_t lo, const int32_t hi) +{ + LUCI_ASSERT(!(hi < lo), "Clamp hi < lo"); + if (hi < v) + return hi; + if (v < lo) + return lo; + return v; +} + +// Return the index for the first element along that axis. This index will be a +// positive integer between [0, axis_size - 1] that can be used to index +// directly into the data. +inline int64_t StartForAxis(const StridedSliceParams ¶ms, const loco::TensorShape &input_shape, + int64_t axis) +{ + const auto begin_mask = params.begin_mask; + const auto *start_indices = params.start_indices; + const auto *strides = params.strides; + const int64_t axis_size = static_cast(input_shape.dim(axis).value()); + if (axis_size == 0) + { + return 0; + } + // Begin with the specified index. + int64_t start = start_indices[axis]; + + // begin_mask override + if (begin_mask & (1LL << axis)) + { + if (strides[axis] > 0) + { + // Forward iteration - use the first element. 
These values will get + // clamped below (Note: We could have set them to 0 and axis_size-1, but + // use lowest() and max() to maintain symmetry with StopForAxis()) + start = std::numeric_limits::lowest(); + } + else + { + // Backward iteration - use the last element. + start = std::numeric_limits::max(); + } + } + + // Handle negative indices + if (start < 0) + { + start += axis_size; + } + + // Clamping + if (strides[axis] > 0) + { + // Forward iteration + start = Clamp(start, 0, axis_size); + } + else + { + // Backward iteration + start = Clamp(start, -1, axis_size - 1); + } + + return start; +} + +// Return the "real" index for the end of iteration along that axis. This is an +// "end" in the traditional C sense, in that it points to one past the last +// element. ie. So if you were iterating through all elements of a 1D array of +// size 4, this function would return 4 as the stop, because it is one past the +// "real" indices of 0, 1, 2 & 3. +inline int64_t StopForAxis(const StridedSliceParams ¶ms, const loco::TensorShape &input_shape, + int64_t axis, int64_t start_for_axis) +{ + const auto end_mask = params.end_mask; + const auto shrink_axis_mask = params.shrink_axis_mask; + const auto *stop_indices = params.stop_indices; + const auto *strides = params.strides; + const int64_t axis_size = static_cast(input_shape.dim(axis).value()); + if (axis_size == 0) + { + return 0; + } + + // Begin with the specified index + const bool shrink_axis = shrink_axis_mask & (1LL << axis); + int64_t stop = stop_indices[axis]; + + // When shrinking an axis, the end position does not matter (and can be + // incorrect when negative indexing is used, see Issue #19260). Always use + // start_for_axis + 1 to generate a length 1 slice, since start_for_axis has + // already been adjusted for negative indices. + if (shrink_axis) + { + return start_for_axis + 1; + } + + // end_mask override + if (end_mask & (1LL << axis)) + { + if (strides[axis] > 0) + { + // Forward iteration - use the last element. These values will get + // clamped below + stop = std::numeric_limits::max(); + } + else + { + // Backward iteration - use the first element. + stop = std::numeric_limits::lowest(); + } + } + + // Handle negative indices + if (stop < 0) + { + stop += axis_size; + } + + // Clamping + // Because the end index points one past the last element, we need slightly + // different clamping ranges depending on the direction. + if (strides[axis] > 0) + { + // Forward iteration + stop = Clamp(stop, 0, axis_size); + } + else + { + // Backward iteration + stop = Clamp(stop, -1, axis_size - 1); + } + + return stop; +} + +StridedSliceParams BuildStridedSliceParams(StridedSliceContext *op_context) +{ + StridedSliceParams op_params; + + // The ellipsis_mask and new_axis_mask in op_params are not used. Those masks + // are processed here to update begin_mask, end_mask and the index range. + op_params.begin_mask = 0; + op_params.ellipsis_mask = 0; + op_params.end_mask = 0; + op_params.new_axis_mask = 0; + op_params.shrink_axis_mask = 0; + + // Count indexes where the new_axis_mask is set but the ellipsis_mask is not. + loco::TensorShape begin_shape = circle_shape(op_context->begin); + const int64_t begin_count = static_cast(begin_shape.dim(0).value()); + int64_t num_add_axis = 0; + for (int64_t i = 0; i < begin_count; ++i) + { + if (!((1LL << i) & op_context->params.ellipsis_mask) && + ((1LL << i) & op_context->params.new_axis_mask)) + { + num_add_axis++; + } + } + + // Calculate the dims of input after adding new axises. 
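+  // Example (sketch): slicing a rank-3 input as t[tf.newaxis, ...] sets one
+  // new_axis_mask bit outside the ellipsis, so num_add_axis = 1 and the
+  // effective rank computed below is 3 + 1 = 4.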
+ const int64_t effective_dims = op_context->input_dims + num_add_axis; + + // If begin, end and strides are not fully provided, it means Ellipsis should + // be expanded to multiple dimensions (Ex: for spec [Ellipsis, 2] on a 3D + // input, the Ellipsis should be applied for the first 2 dimensions). Besides, + // If the new_axis_mask and the ellipsis_mask are set at the same index, the + // new_axis_mask will have no effect. + int64_t effective_ellipsis_mask = 0, effective_new_axis_mask = 0; + int64_t ellipsis_start_idx = effective_dims, expanded_ellipsis = 0; + for (int64_t i = 0; i < effective_dims;) + { + if ((1LL << i) & op_context->params.ellipsis_mask) + { + ellipsis_start_idx = i; + int64_t ellipsis_end_idx = + std::max(i + 1, std::min(i + 1 + num_add_axis + op_context->input_dims - begin_count, + effective_dims)); + expanded_ellipsis = ellipsis_end_idx - ellipsis_start_idx - 1; + + // Set bit for effective_ellipsis_mask. + for (; i < ellipsis_end_idx; ++i) + { + effective_ellipsis_mask |= (1LL << i); + } + continue; + } + + if ((1LL << (i - expanded_ellipsis)) & op_context->params.new_axis_mask) + { + effective_new_axis_mask |= (1LL << i); + } + ++i; + } + + // Calculate effective_input_shape and its corresponding begin, end, strides. + loco::TensorShape input_shape = circle_shape(op_context->input); + int64_t added_ellipsis = 0, added_axises = 0; + op_context->effective_input_shape.rank(effective_dims); + + for (int64_t i = 0; i < effective_dims; ++i) + { + if ((1LL << i) & effective_ellipsis_mask) + { + // If ellipsis_mask, set the begin_mask and end_mask at that index. + added_ellipsis = std::max(int64_t(0), i - ellipsis_start_idx); + assert(i < 16); + op_params.begin_mask |= (1LL << i); + op_params.end_mask |= (1LL << i); + op_params.strides[i] = 1; + op_context->effective_input_shape.dim(i) = input_shape.dim(i - added_axises); + } + else if ((1LL << i) & effective_new_axis_mask) + { + // If new_axis_mask is set, it is equivalent to adding a new dim of 1 to + // input tensor. Store added shape to effective_input_shape. 
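+      // e.g. an input of shape (2, 3) with a new axis inserted at the front is
+      // treated as (1, 2, 3); the added dim is sliced as begin 0, stop 1,
+      // stride 1 by the parameters set below.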
+ op_params.start_indices[i] = 0; + op_params.stop_indices[i] = 1; + op_params.strides[i] = 1; + op_context->effective_input_shape.dim(i) = loco::Dimension(1); + added_axises++; + } + else if (i >= begin_count + expanded_ellipsis) + { + op_params.start_indices[i] = 0; + op_params.stop_indices[i] = 0; + op_params.strides[i] = 1; + assert(i < 16); + op_params.begin_mask |= (1LL << i); + op_params.end_mask |= (1LL << i); + op_context->effective_input_shape.dim(i) = input_shape.dim(i - added_axises); + } + else + { + const int64_t orig_idx = i - added_ellipsis; + op_params.start_indices[i] = op_context->begin->at(orig_idx); + op_params.stop_indices[i] = op_context->end->at(orig_idx); + op_params.strides[i] = op_context->strides->at(orig_idx); + if (op_context->params.begin_mask & (1LL << orig_idx)) + { + assert(i < 16); + op_params.begin_mask |= (1LL << i); + } + if (op_context->params.end_mask & (1LL << orig_idx)) + { + assert(i < 16); + op_params.end_mask |= (1LL << i); + } + if (op_context->params.shrink_axis_mask & (1LL << orig_idx)) + { + assert(i < 16); + op_params.shrink_axis_mask |= (1LL << i); + } + op_context->effective_input_shape.dim(i) = input_shape.dim(i - added_axises); + } + } + + // make sure no overflow + assert(static_cast(effective_dims) == static_cast(effective_dims)); + + op_params.start_indices_count = effective_dims; + op_params.stop_indices_count = effective_dims; + op_params.strides_count = effective_dims; + + return op_params; +} + +loco::TensorShape Algorithm::visit(const luci::CircleStridedSlice *node) +{ + loco::TensorShape output_shape; + + auto input_node = loco::must_cast(node->input()); + + auto begin_node = dynamic_cast(node->begin()); + auto end_node = dynamic_cast(node->end()); + auto strides_node = dynamic_cast(node->strides()); + // TODO support non-const case + if (begin_node == nullptr || end_node == nullptr || strides_node == nullptr) + { + INTERNAL_EXN("StridedSlice begin/end/strides nodes are not Constant"); + } + + LUCI_ASSERT(begin_node->dtype() == S32, "Only support S32 for begin_node"); + LUCI_ASSERT(end_node->dtype() == S32, "Only support S32 for end_node"); + LUCI_ASSERT(strides_node->dtype() == S32, "Only support S32 for strides_node"); + + LUCI_ASSERT(begin_node->rank() == 1, "Only support rank 1 for begin_node"); + LUCI_ASSERT(end_node->rank() == 1, "Only support rank 1 for end_node"); + LUCI_ASSERT(strides_node->rank() == 1, "Only support rank 1 for strides_node"); + + loco::TensorShape input_shape = circle_shape(input_node); + + assert(begin_node->size() <= input_shape.rank()); + assert(end_node->size() <= input_shape.rank()); + assert(strides_node->size() <= input_shape.rank()); + + StridedSliceContext op_context(node); + auto op_params = BuildStridedSliceParams(&op_context); + auto &effective_input_shape = op_context.effective_input_shape; + std::vector output_shape_vector; + + for (int32_t idx = effective_input_shape.rank() - 1; idx >= 0; --idx) + { + int32_t stride = op_params.strides[idx]; + LUCI_ASSERT(stride != 0, "stride value has to be non-zero"); + + int64_t begin = StartForAxis(op_params, effective_input_shape, idx); + int64_t end = StopForAxis(op_params, effective_input_shape, idx, begin); + + // When shrinking an axis, the end position does not matter (and can be + // incorrect when negative indexing is used, see Issue #19260). Always use + // begin + 1 to generate a length 1 slice, since begin has + // already been adjusted for negative indices by GetBeginValueAtIndex. 
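+    // e.g. for an input of shape (4, 5) with begin = {1} and
+    // shrink_axis_mask = 0b1, the first axis collapses to the single index 1
+    // and is dropped from the output, giving shape (5).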
+ const bool shrink_axis = op_params.shrink_axis_mask & (1 << idx); + if (shrink_axis) + { + end = begin + 1; + } + + // This is valid for both positive and negative strides + int64_t dim_shape = std::ceil((end - begin) / static_cast(stride)); + dim_shape = dim_shape < 0 ? 0 : dim_shape; + if (!shrink_axis) + { + output_shape_vector.push_back(dim_shape); + } + } + + auto shape_size = output_shape_vector.size(); + output_shape.rank(shape_size); + for (uint32_t idx = 0; idx < shape_size; ++idx) + { + int64_t dim = output_shape_vector.at(shape_size - 1u - idx); + LUCI_ASSERT(0 <= dim && dim < 0xfffffffL, "Dimension size exceeds limit"); + // reverse copy + output_shape.dim(idx) = static_cast(dim); + } + + return output_shape; +} + +} // namespace sinf + } // namespace luci diff --git a/compiler/luci/service/src/ShapeInfer_StridedSlice.cpp b/compiler/luci/service/src/ShapeInfer_StridedSlice.cpp deleted file mode 100644 index 5a22da3198c..00000000000 --- a/compiler/luci/service/src/ShapeInfer_StridedSlice.cpp +++ /dev/null @@ -1,444 +0,0 @@ -/* - * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved - * Copyright 2018 The TensorFlow Authors. All Rights Reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "ShapeInfer_StridedSlice.h" -#include "Check.h" -#include "CircleShapeInferenceHelper.h" - -#include -#include -#include -#include - -#include -#include -#include -#include - -// code referenced from -// https://github.com/tensorflow/tensorflow/blob/3f878cff5b698b82eea85db2b60d65a2e320850e/ -// tensorflow/lite/kernels/strided_slice.cc -// tensorflow/lite/kernels/internal/strided_slice_logic.h - -namespace -{ - -// This Op only supports 1-5D cases and since we use the reference 4D -// implementation, the 1-3D tensors are mapped to 4D. 
-const int kMaxDim = 5; - -const loco::DataType S32 = loco::DataType::S32; - -struct StridedSliceParams -{ - int8_t start_indices_count = 0; - int32_t start_indices[kMaxDim]; - int8_t stop_indices_count = 0; - int32_t stop_indices[kMaxDim]; - int8_t strides_count = 0; - int32_t strides[kMaxDim]; - - int16_t begin_mask = 0; - int16_t ellipsis_mask = 0; - int16_t end_mask = 0; - int16_t new_axis_mask = 0; - int16_t shrink_axis_mask = 0; -}; - -struct StridedSliceContext -{ - StridedSliceContext(const luci::CircleStridedSlice *node) - { - // check overflow issues - assert(static_cast(node->begin_mask()) == node->begin_mask()); - assert(static_cast(node->ellipsis_mask()) == node->ellipsis_mask()); - assert(static_cast(node->end_mask()) == node->end_mask()); - assert(static_cast(node->new_axis_mask()) == node->new_axis_mask()); - assert(static_cast(node->shrink_axis_mask()) == node->shrink_axis_mask()); - - params.begin_mask = node->begin_mask(); - params.ellipsis_mask = node->ellipsis_mask(); - params.end_mask = node->end_mask(); - params.new_axis_mask = node->new_axis_mask(); - params.shrink_axis_mask = node->shrink_axis_mask(); - - input = loco::must_cast(node->input()); - begin = loco::must_cast(node->begin()); - end = loco::must_cast(node->end()); - strides = loco::must_cast(node->strides()); - - loco::TensorShape input_shape = luci::shape_get(input).as(); - input_dims = input_shape.rank(); - } - StridedSliceParams params; - luci::CircleNode *input = nullptr; - luci::CircleConst *begin = nullptr; - luci::CircleConst *end = nullptr; - luci::CircleConst *strides = nullptr; - - // Equivalent input shape after adding axis according to new_axis_mask. - loco::TensorShape effective_input_shape; - int64_t input_dims = 0; -}; - -// Use until std::clamp() is available from C++17. -inline int Clamp(const int32_t v, const int32_t lo, const int32_t hi) -{ - LUCI_ASSERT(!(hi < lo), "Clamp hi < lo"); - if (hi < v) - return hi; - if (v < lo) - return lo; - return v; -} - -// Return the index for the first element along that axis. This index will be a -// positive integer between [0, axis_size - 1] that can be used to index -// directly into the data. -inline int64_t StartForAxis(const StridedSliceParams ¶ms, const loco::TensorShape &input_shape, - int64_t axis) -{ - const auto begin_mask = params.begin_mask; - const auto *start_indices = params.start_indices; - const auto *strides = params.strides; - const int64_t axis_size = static_cast(input_shape.dim(axis).value()); - if (axis_size == 0) - { - return 0; - } - // Begin with the specified index. - int64_t start = start_indices[axis]; - - // begin_mask override - if (begin_mask & (1LL << axis)) - { - if (strides[axis] > 0) - { - // Forward iteration - use the first element. These values will get - // clamped below (Note: We could have set them to 0 and axis_size-1, but - // use lowest() and max() to maintain symmetry with StopForAxis()) - start = std::numeric_limits::lowest(); - } - else - { - // Backward iteration - use the last element. - start = std::numeric_limits::max(); - } - } - - // Handle negative indices - if (start < 0) - { - start += axis_size; - } - - // Clamping - if (strides[axis] > 0) - { - // Forward iteration - start = Clamp(start, 0, axis_size); - } - else - { - // Backward iteration - start = Clamp(start, -1, axis_size - 1); - } - - return start; -} - -// Return the "real" index for the end of iteration along that axis. This is an -// "end" in the traditional C sense, in that it points to one past the last -// element. ie. 
So if you were iterating through all elements of a 1D array of -// size 4, this function would return 4 as the stop, because it is one past the -// "real" indices of 0, 1, 2 & 3. -inline int64_t StopForAxis(const StridedSliceParams ¶ms, const loco::TensorShape &input_shape, - int64_t axis, int64_t start_for_axis) -{ - const auto end_mask = params.end_mask; - const auto shrink_axis_mask = params.shrink_axis_mask; - const auto *stop_indices = params.stop_indices; - const auto *strides = params.strides; - const int64_t axis_size = static_cast(input_shape.dim(axis).value()); - if (axis_size == 0) - { - return 0; - } - - // Begin with the specified index - const bool shrink_axis = shrink_axis_mask & (1LL << axis); - int64_t stop = stop_indices[axis]; - - // When shrinking an axis, the end position does not matter (and can be - // incorrect when negative indexing is used, see Issue #19260). Always use - // start_for_axis + 1 to generate a length 1 slice, since start_for_axis has - // already been adjusted for negative indices. - if (shrink_axis) - { - return start_for_axis + 1; - } - - // end_mask override - if (end_mask & (1LL << axis)) - { - if (strides[axis] > 0) - { - // Forward iteration - use the last element. These values will get - // clamped below - stop = std::numeric_limits::max(); - } - else - { - // Backward iteration - use the first element. - stop = std::numeric_limits::lowest(); - } - } - - // Handle negative indices - if (stop < 0) - { - stop += axis_size; - } - - // Clamping - // Because the end index points one past the last element, we need slightly - // different clamping ranges depending on the direction. - if (strides[axis] > 0) - { - // Forward iteration - stop = Clamp(stop, 0, axis_size); - } - else - { - // Backward iteration - stop = Clamp(stop, -1, axis_size - 1); - } - - return stop; -} - -StridedSliceParams BuildStridedSliceParams(StridedSliceContext *op_context) -{ - StridedSliceParams op_params; - - // The ellipsis_mask and new_axis_mask in op_params are not used. Those masks - // are processed here to update begin_mask, end_mask and the index range. - op_params.begin_mask = 0; - op_params.ellipsis_mask = 0; - op_params.end_mask = 0; - op_params.new_axis_mask = 0; - op_params.shrink_axis_mask = 0; - - // Count indexes where the new_axis_mask is set but the ellipsis_mask is not. - loco::TensorShape begin_shape = luci::shape_get(op_context->begin).as(); - const int64_t begin_count = static_cast(begin_shape.dim(0).value()); - int64_t num_add_axis = 0; - for (int64_t i = 0; i < begin_count; ++i) - { - if (!((1LL << i) & op_context->params.ellipsis_mask) && - ((1LL << i) & op_context->params.new_axis_mask)) - { - num_add_axis++; - } - } - - // Calculate the dims of input after adding new axises. - const int64_t effective_dims = op_context->input_dims + num_add_axis; - - // If begin, end and strides are not fully provided, it means Ellipsis should - // be expanded to multiple dimensions (Ex: for spec [Ellipsis, 2] on a 3D - // input, the Ellipsis should be applied for the first 2 dimensions). Besides, - // If the new_axis_mask and the ellipsis_mask are set at the same index, the - // new_axis_mask will have no effect. 
- int64_t effective_ellipsis_mask = 0, effective_new_axis_mask = 0; - int64_t ellipsis_start_idx = effective_dims, expanded_ellipsis = 0; - for (int64_t i = 0; i < effective_dims;) - { - if ((1LL << i) & op_context->params.ellipsis_mask) - { - ellipsis_start_idx = i; - int64_t ellipsis_end_idx = - std::max(i + 1, std::min(i + 1 + num_add_axis + op_context->input_dims - begin_count, - effective_dims)); - expanded_ellipsis = ellipsis_end_idx - ellipsis_start_idx - 1; - - // Set bit for effective_ellipsis_mask. - for (; i < ellipsis_end_idx; ++i) - { - effective_ellipsis_mask |= (1LL << i); - } - continue; - } - - if ((1LL << (i - expanded_ellipsis)) & op_context->params.new_axis_mask) - { - effective_new_axis_mask |= (1LL << i); - } - ++i; - } - - // Calculate effective_input_shape and its corresponding begin, end, strides. - loco::TensorShape input_shape = luci::shape_get(op_context->input).as(); - int64_t added_ellipsis = 0, added_axises = 0; - op_context->effective_input_shape.rank(effective_dims); - - for (int64_t i = 0; i < effective_dims; ++i) - { - if ((1LL << i) & effective_ellipsis_mask) - { - // If ellipsis_mask, set the begin_mask and end_mask at that index. - added_ellipsis = std::max(int64_t(0), i - ellipsis_start_idx); - assert(i < 16); - op_params.begin_mask |= (1LL << i); - op_params.end_mask |= (1LL << i); - op_params.strides[i] = 1; - op_context->effective_input_shape.dim(i) = input_shape.dim(i - added_axises); - } - else if ((1LL << i) & effective_new_axis_mask) - { - // If new_axis_mask is set, it is equivalent to adding a new dim of 1 to - // input tensor. Store added shape to effective_input_shape. - op_params.start_indices[i] = 0; - op_params.stop_indices[i] = 1; - op_params.strides[i] = 1; - op_context->effective_input_shape.dim(i) = loco::Dimension(1); - added_axises++; - } - else if (i >= begin_count + expanded_ellipsis) - { - op_params.start_indices[i] = 0; - op_params.stop_indices[i] = 0; - op_params.strides[i] = 1; - assert(i < 16); - op_params.begin_mask |= (1LL << i); - op_params.end_mask |= (1LL << i); - op_context->effective_input_shape.dim(i) = input_shape.dim(i - added_axises); - } - else - { - const int64_t orig_idx = i - added_ellipsis; - op_params.start_indices[i] = op_context->begin->at(orig_idx); - op_params.stop_indices[i] = op_context->end->at(orig_idx); - op_params.strides[i] = op_context->strides->at(orig_idx); - if (op_context->params.begin_mask & (1LL << orig_idx)) - { - assert(i < 16); - op_params.begin_mask |= (1LL << i); - } - if (op_context->params.end_mask & (1LL << orig_idx)) - { - assert(i < 16); - op_params.end_mask |= (1LL << i); - } - if (op_context->params.shrink_axis_mask & (1LL << orig_idx)) - { - assert(i < 16); - op_params.shrink_axis_mask |= (1LL << i); - } - op_context->effective_input_shape.dim(i) = input_shape.dim(i - added_axises); - } - } - - // make sure no overflow - assert(static_cast(effective_dims) == static_cast(effective_dims)); - - op_params.start_indices_count = effective_dims; - op_params.stop_indices_count = effective_dims; - op_params.strides_count = effective_dims; - - return op_params; -} - -} // namespace - -namespace luci -{ - -loco::TensorShape infer_output_shape(const CircleStridedSlice *node) -{ - loco::TensorShape output_shape; - - auto input_node = loco::must_cast(node->input()); - - auto begin_node = dynamic_cast(node->begin()); - auto end_node = dynamic_cast(node->end()); - auto strides_node = dynamic_cast(node->strides()); - if (begin_node == nullptr || end_node == nullptr || strides_node == nullptr) - 
{ - INTERNAL_EXN("StridedSlice begin/end/strides nodes are not Constant"); - } - - LUCI_ASSERT(begin_node->dtype() == S32, "Only support S32 for begin_node"); - LUCI_ASSERT(end_node->dtype() == S32, "Only support S32 for end_node"); - LUCI_ASSERT(strides_node->dtype() == S32, "Only support S32 for strides_node"); - - LUCI_ASSERT(begin_node->rank() == 1, "Only support rank 1 for begin_node"); - LUCI_ASSERT(end_node->rank() == 1, "Only support rank 1 for end_node"); - LUCI_ASSERT(strides_node->rank() == 1, "Only support rank 1 for strides_node"); - - loco::TensorShape input_shape = luci::shape_get(input_node).as(); - - assert(begin_node->size() <= input_shape.rank()); - assert(end_node->size() <= input_shape.rank()); - assert(strides_node->size() <= input_shape.rank()); - - StridedSliceContext op_context(node); - auto op_params = BuildStridedSliceParams(&op_context); - auto &effective_input_shape = op_context.effective_input_shape; - std::vector output_shape_vector; - - for (int32_t idx = effective_input_shape.rank() - 1; idx >= 0; --idx) - { - int32_t stride = op_params.strides[idx]; - LUCI_ASSERT(stride != 0, "stride value has to be non-zero"); - - int64_t begin = StartForAxis(op_params, effective_input_shape, idx); - int64_t end = StopForAxis(op_params, effective_input_shape, idx, begin); - - // When shrinking an axis, the end position does not matter (and can be - // incorrect when negative indexing is used, see Issue #19260). Always use - // begin + 1 to generate a length 1 slice, since begin has - // already been adjusted for negative indices by GetBeginValueAtIndex. - const bool shrink_axis = op_params.shrink_axis_mask & (1 << idx); - if (shrink_axis) - { - end = begin + 1; - } - - // This is valid for both positive and negative strides - int64_t dim_shape = std::ceil((end - begin) / static_cast(stride)); - dim_shape = dim_shape < 0 ? 
0 : dim_shape; - if (!shrink_axis) - { - output_shape_vector.push_back(dim_shape); - } - } - - auto shape_size = output_shape_vector.size(); - output_shape.rank(shape_size); - for (uint32_t idx = 0; idx < shape_size; ++idx) - { - int64_t dim = output_shape_vector.at(shape_size - 1u - idx); - LUCI_ASSERT(0 <= dim && dim < 0xfffffffL, "Dimension size exceeds limit"); - // reverse copy - output_shape.dim(idx) = static_cast(dim); - } - - return output_shape; -} - -} // namespace luci diff --git a/compiler/mio-circle08/CMakeLists.txt b/compiler/mio-circle08/CMakeLists.txt index cee15c96993..345ada948bc 100644 --- a/compiler/mio-circle08/CMakeLists.txt +++ b/compiler/mio-circle08/CMakeLists.txt @@ -8,9 +8,7 @@ endif(NOT FlatBuffers_FOUND) message(STATUS "Build mio-circle08: TRUE") # TODO Find a better way -# TODO use nnpackage -# set(SCHEMA_FILE "${NNAS_PROJECT_SOURCE_DIR}/nnpackage/schema/circle_schema.fbs") -set(SCHEMA_FILE "${NNAS_PROJECT_SOURCE_DIR}/res/CircleSchema/0.8/circle_schema.fbs") +set(SCHEMA_FILE "${NNAS_PROJECT_SOURCE_DIR}/nnpackage/schema/circle_schema.fbs") # NOTE Copy circle_schema.fbs as schema.fbs to generate "schema_generated.fbs" instead of "circle_schema_generated.fbs" add_custom_command(OUTPUT "${CMAKE_CURRENT_BINARY_DIR}/schema.fbs" diff --git a/compiler/one-cmds/onelib/constant.py b/compiler/one-cmds/onelib/constant.py index 97b207488f3..b15ca653bdd 100644 --- a/compiler/one-cmds/onelib/constant.py +++ b/compiler/one-cmds/onelib/constant.py @@ -63,7 +63,7 @@ class CONSTANT: 'remove_redundant_reshape', 'remove_redundant_transpose', 'remove_unnecessary_add', - 'remove_unnecessary_cast' + 'remove_unnecessary_cast', 'remove_unnecessary_reshape', 'remove_unnecessary_slice', 'remove_unnecessary_strided_slice', diff --git a/compute/cker/include/cker/eigen/xent_op.h b/compute/cker/include/cker/eigen/xent_op.h new file mode 100644 index 00000000000..60d996cd94e --- /dev/null +++ b/compute/cker/include/cker/eigen/xent_op.h @@ -0,0 +1,151 @@ +/* + * Copyright (c) 2024 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright 2016 The TensorFlow Authors. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __NNFW_CKER_EIGEN_XENT_OPS_H__ +#define __NNFW_CKER_EIGEN_XENT_OPS_H__ + +// From tensorflow/core/kernels/xent_op.cc +#define EIGEN_USE_THREADS + +#include "unsupported/Eigen/CXX11/Tensor" +#include "cker/operation/Helper/Tensor.h" + +// From tensorflow/core/kernels/xent_op.h +namespace nnfw +{ +namespace cker +{ +namespace xent_ops +{ +namespace functor +{ + +// Functor used by XentOp to do the computations. +template struct XentFunctor +{ + // Computes Cross Entropy loss and backprop. + // + // logits: batch_size, num_classes. + // labels: batch_size, num_classes. + // scratch: temporary tensor, dims: batch_size, 1 + // loss: output tensor for the loss, dims: batch_size. + // backprop: output tensor for the backprop, dims: batch_size, num_classes. 
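+  //
+  // Example (illustrative): with batch_size = 2 and num_classes = 3, logits,
+  // labels and backprop are 2x3 matrices, scratch is 2x1 and loss has two
+  // entries; reduction_size scales the backprop (batch_size for
+  // SUM_OVER_BATCH_SIZE, 1 for SUM, as wired up by the caller in Loss.h).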
+ void operator()(const Device &d, const Eigen::DSizes &shape, + const Eigen::array &logits_bcast, + const Eigen::array &labels_bcast, + typename TTypes::ConstMatrix logits, typename TTypes::ConstMatrix labels, + typename TTypes::Matrix scratch, typename TTypes::Vec loss, + typename TTypes::Matrix backprop, T reduction_size); +}; + +} // namespace functor +} // namespace xent_ops +} // namespace cker +} // namespace nnfw + +// From tensorflow/core/kernels/xent_op.cc +namespace nnfw +{ +namespace cker +{ +namespace xent_ops +{ + +// Enable CPUDevice only for xent_ops +using CPUDevice = Eigen::ThreadPoolDevice; +using Index = Eigen::Index; + +// Partial specialization for a CPUDevice, that uses the Eigen implementation +// from XentEigenImpl. +namespace functor +{ +template struct XentFunctorBase +{ + void operator()(const Device &d, const Eigen::DSizes &shape, + const Eigen::array &logits_bcast, + const Eigen::array &labels_bcast, + typename TTypes::ConstMatrix logits, typename TTypes::ConstMatrix labels, + typename TTypes::Matrix scratch, typename TTypes::Vec loss, + typename TTypes::Matrix backprop, T reduction_size) + { + T *scratch_ptr = scratch.data(); + T *backprop_ptr = backprop.data(); + + T *loss_ptr = loss.data(); + + int row_size = shape[1]; + + if (shape[0] > 0) + { + backprop.device(d) = logits.broadcast(logits_bcast); + scratch.device(d) = labels.broadcast(labels_bcast); + auto reductionWorker = [&](int64_t begin, int64_t end) -> void { + for (int i = begin; i < end; i++) + { + T *this_backprop = backprop_ptr + (i * row_size); + T *this_logits = backprop_ptr + (i * row_size); + T *this_labels = scratch_ptr + (i * row_size); + T max_logits = this_logits[0]; + + // calculating max_logits + for (int j = 1; j < row_size; j++) + { + max_logits = std::max(max_logits, this_logits[j]); + } + + T sum = T(0); + T loss_sum = T(0); + + for (int j = 0; j < row_size; j++) + { + // Note that if input is reused than this_logits and this_backprop + // is same buffer, so after this calculation this_logits should no + // longer be trusted + this_backprop[j] = this_logits[j] - max_logits; + sum = sum + exp(this_backprop[j]); + } + + // loss calculation + T log_sum = log(sum); + for (int j = 0; j < row_size; j++) + { + loss_sum += this_labels[j] * (log_sum - this_backprop[j]); + this_backprop[j] = ((exp(this_backprop[j]) / sum) - this_labels[j]) / reduction_size; + } + loss_ptr[i] = loss_sum; + } + }; + const int64_t compute_cycles = 50 * row_size; + const int64_t input_bytes = sizeof(T) * row_size; + const int64_t output_bytes = sizeof(T) * row_size; + const Eigen::TensorOpCost cost(input_bytes, output_bytes, compute_cycles); + + d.parallelFor(shape[0], cost, reductionWorker); + } + } +}; + +template struct XentFunctor : XentFunctorBase +{ +}; + +} // namespace functor +} // namespace xent_ops +} // namespace cker +} // namespace nnfw + +#endif // __NNFW_CKER_EIGEN_XENT_OPS_H__ diff --git a/compute/cker/include/cker/operation/Helper/Tensor.h b/compute/cker/include/cker/operation/Helper/Tensor.h index ec29a15c3e2..5e670730431 100644 --- a/compute/cker/include/cker/operation/Helper/Tensor.h +++ b/compute/cker/include/cker/operation/Helper/Tensor.h @@ -157,6 +157,10 @@ struct Tensor { return typename TTypes::ConstScalar(base()); } + + template typename TTypes::Vec vec() { return shaped(); } + + template typename TTypes::Matrix matrix() { return shaped(); } }; // Tensor template Eigen::DSizes To32BitDims(const DSizes &in) diff --git a/compiler/luci/service/src/ShapeInfer_StridedSlice.h 
b/compute/cker/include/cker/train/Types.h similarity index 61% rename from compiler/luci/service/src/ShapeInfer_StridedSlice.h rename to compute/cker/include/cker/train/Types.h index fa800b72019..d873a900ab6 100644 --- a/compiler/luci/service/src/ShapeInfer_StridedSlice.h +++ b/compute/cker/include/cker/train/Types.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2024 Samsung Electronics Co., Ltd. All Rights Reserved * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -14,18 +14,24 @@ * limitations under the License. */ -#ifndef __SHAPE_INFER_STRIDED_SLICE_H__ -#define __SHAPE_INFER_STRIDED_SLICE_H__ +#ifndef __NNFW_CKER_TRAIN_TYPES_H__ +#define __NNFW_CKER_TRAIN_TYPES_H__ -#include - -#include - -namespace luci +namespace nnfw +{ +namespace cker +{ +namespace train { -loco::TensorShape infer_output_shape(const CircleStridedSlice *node); +enum class LossReductionType +{ + SUM_OVER_BATCH_SIZE, + SUM, +}; -} // namespace luci +} // namespace train +} // namespace cker +} // namespace nnfw -#endif // __SHAPE_INFER_STRIDED_SLICE_H__ +#endif // __NNFW_CKER_TYPES_H__ diff --git a/compute/cker/include/cker/train/operation/Loss.h b/compute/cker/include/cker/train/operation/Loss.h index 70f54ad59c0..fe9acb6d8a9 100644 --- a/compute/cker/include/cker/train/operation/Loss.h +++ b/compute/cker/include/cker/train/operation/Loss.h @@ -1,5 +1,6 @@ /* * Copyright (c) 2023 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright 2016 The TensorFlow Authors. All Rights Reserved. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -20,7 +21,11 @@ #include #include "cker/Shape.h" +#include "cker/eigen/EigenSupport.h" #include "cker/eigen/Utils.h" +#include "cker/eigen/xent_op.h" +#include "cker/operation/Helper/BCast.h" +#include "cker/train/Types.h" namespace nnfw { @@ -59,17 +64,38 @@ inline void MSE(const Shape &y_pred_shape, const T *y_pred_data, const Shape &y_ template inline void MSEGrad(const Shape &y_pred_shape, const T *y_pred_data, const Shape &y_true_shape, - const T *y_true_data, const Shape &grad_shape, T *grad_data) + const T *y_true_data, const Shape &grad_shape, T *grad_data, + LossReductionType reduction_type) { if (y_pred_shape != y_true_shape) throw std::runtime_error("cker::MSEGrad: y_pred_shape != y_true_shape"); if (y_pred_shape != grad_shape) throw std::runtime_error("cker::MSEGrad: y_pred_shape != grad_shape"); - const int size = grad_shape.FlatSize(); - for (int i = 0; i < size; ++i) + const int batch_size = grad_shape.Dims(0); + const auto flat_size = FlatSizeSkipDim(grad_shape, 0); + auto reduction_size = 1; + switch (reduction_type) { - grad_data[i] = static_cast(-2 * (y_true_data[i] - y_pred_data[i]) / size); + case LossReductionType::SUM_OVER_BATCH_SIZE: + reduction_size = batch_size * flat_size; + break; + case LossReductionType::SUM: + reduction_size = flat_size; + break; + default: + throw std::runtime_error("Unsupported reduction type"); + } + + for (int b = 0; b < batch_size; ++b) + { + for (int i = 0; i < flat_size; ++i) + { + const int offset = b * flat_size + i; + assert(offset >= 0); + grad_data[offset] = + static_cast(-2 * (y_true_data[offset] - y_pred_data[offset]) / reduction_size); + } } } @@ -97,7 +123,8 @@ inline void CategoricalCrossEntropy(const Shape &y_pred_shape, const T *y_pred_d template inline void 
@@ -97,7 +123,8 @@ inline void CategoricalCrossEntropy(const Shape &y_pred_shape, const T *y_pred_d

 template <typename T>
 inline void
 CategoricalCrossEntropyGrad(const Shape &y_pred_shape, const T *y_pred_data,
                             const Shape &y_true_shape, const T *y_true_data,
-                            const Shape &grad_shape, T *grad_data)
+                            const Shape &grad_shape, T *grad_data,
+                            LossReductionType reduction_type)
 {
   if (y_pred_shape != y_true_shape)
     throw std::runtime_error(
@@ -110,7 +137,103 @@ inline void CategoricalCrossEntropyGrad(const Shape &y_pred_shape, const T *y_pr
   const auto y_true = MapAsMatrixWithLastDimAsRows(y_true_data, y_true_shape);
   auto grad = MapAsMatrixWithLastDimAsRows(grad_data, grad_shape);

-  grad = -(y_true.array() / y_pred.array().cwiseMax(log_threshold<T>()));
+  const int32_t batch_size = grad_shape.Dims(0);
+  int32_t reduction_size = 1;
+  switch (reduction_type)
+  {
+    case LossReductionType::SUM_OVER_BATCH_SIZE:
+      reduction_size = batch_size;
+      break;
+    case LossReductionType::SUM:
+      reduction_size = 1;
+      break;
+    default:
+      throw std::runtime_error("Unsupported reduction type");
+  }
+  assert(reduction_size > 0);
+
+  grad = -(y_true.array() / y_pred.array().cwiseMax(log_threshold<T>())) /
+         static_cast<T>(reduction_size);
+}
+
+template <typename T>
+void CategoricalCrossEntropyWithLogits(const Shape &logits_shape, const T *logits_data,
+                                       const Shape &y_true_shape, const T *y_true_data,
+                                       const Shape &loss_out_shape, T *loss_out_data,
+                                       const Shape &grad_shape, T *grad_data,
+                                       LossReductionType reduction_type)
+{
+  // TODO Enable broadcast shapes
+  if (loss_out_shape.DimensionsCount() != 1)
+    throw std::runtime_error(
+      "cker::CategoricalCrossEntropyWithLogits: loss output dimension count should be 1");
+  if (logits_shape != y_true_shape)
+    throw std::runtime_error(
+      "cker::CategoricalCrossEntropyWithLogits: logits and y_true do not have the same shape");
+  if (loss_out_shape.Dims(0) != logits_shape.Dims(0))
+    throw std::runtime_error(
+      "cker::CategoricalCrossEntropyWithLogits: loss_out and logits do not have the same batch");
+  if (logits_shape != grad_shape)
+    throw std::runtime_error(
+      "cker::CategoricalCrossEntropyWithLogits: logits and grad do not have the same shape");
+
+  auto shape_in = logits_shape;
+
+  BCast bcast(BCast::FromShape(shape_in), BCast::FromShape(y_true_shape),
+              /*fewer_dims_optimization=*/false);
+
+  // loss is 1-D (one per example), and size is batch_size.
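+  // The five Tensor objects set up below are non-owning views: logits_in and
+  // labels_in alias the caller's input buffers, scratch is backed by the local
+  // scratch_vec, and loss_out/back_out write directly into the caller's
+  // loss/gradient output buffers.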
+
+  Tensor logits_in;
+  Tensor labels_in;
+  Tensor scratch;
+  Tensor loss_out;
+  Tensor back_out;
+
+  logits_in.shape.ReplaceWith(shape_in.DimensionsCount(), shape_in.DimsData());
+  logits_in.buffer = const_cast<T *>(logits_data);
+
+  labels_in.shape.ReplaceWith(y_true_shape.DimensionsCount(), y_true_shape.DimsData());
+  labels_in.buffer = const_cast<T *>(y_true_data);
+
+  scratch.shape.ReplaceWith(shape_in.DimensionsCount(), shape_in.DimsData());
+  std::vector<T> scratch_vec(shape_in.Dims(0) * shape_in.Dims(1), static_cast<T>(0));
+  scratch.buffer = scratch_vec.data();
+
+  Shape shape_loss_out{shape_in.Dims(0)};
+  loss_out.shape.ReplaceWith(shape_loss_out.DimensionsCount(), shape_loss_out.DimsData());
+  loss_out.buffer = loss_out_data;
+
+  back_out.shape.ReplaceWith(shape_in.DimensionsCount(), shape_in.DimsData());
+  back_out.buffer = grad_data;
+
+  if (shape_in.Dims(0) > 0)
+  {
+    const int32_t batch_size = grad_shape.Dims(0);
+    int32_t reduction_size = 1;
+    switch (reduction_type)
+    {
+      case LossReductionType::SUM_OVER_BATCH_SIZE:
+        reduction_size = batch_size;
+        break;
+      case LossReductionType::SUM:
+        reduction_size = 1;
+        break;
+      default:
+        throw std::runtime_error("Unsupported reduction type");
+    }
+    assert(reduction_size > 0);
+
+    const xent_ops::CPUDevice &device = *eigen_support::GetThreadPoolDevice();
+    xent_ops::functor::XentFunctor<xent_ops::CPUDevice, T> functor;
+    const Eigen::DSizes<Eigen::DenseIndex, 2> shape{shape_in.Dims(0), shape_in.Dims(1)};
+
+    functor(device, shape, BCast::ToIndexArray<2>(bcast.x_bcast()),
+            BCast::ToIndexArray<2>(bcast.y_bcast()),
+            logits_in.template shaped<const T, 2>(bcast.x_reshape()),
+            labels_in.template shaped<const T, 2>(bcast.y_reshape()), scratch.matrix<T>(),
+            loss_out.vec<T>(), back_out.matrix<T>(), static_cast<T>(reduction_size));
+  }
+}

 } // namespace train
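Note: a minimal call sketch for the new entry point (hypothetical data; the argument order follows the declaration above, and the tests below exercise the same path):

  // batch of 2 examples, 10 classes each; all buffers are caller-owned
  nnfw::cker::Shape in_shape{2, 10}; // logits / y_true / grad
  nnfw::cker::Shape out_shape{2};    // one loss value per example
  std::vector<float> logits(20), y_true(20), loss(2), grad(20);
  // ... fill logits and one-hot y_true ...
  nnfw::cker::train::CategoricalCrossEntropyWithLogits(
    in_shape, logits.data(), in_shape, y_true.data(), out_shape, loss.data(),
    in_shape, grad.data(), nnfw::cker::train::LossReductionType::SUM_OVER_BATCH_SIZE);

On return, loss holds the per-example cross entropy and grad holds (softmax(logits) - y_true) / reduction_size, with reduction_size equal to the batch size here.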
diff --git a/compute/cker/src/train/Loss.test.cc b/compute/cker/src/train/Loss.test.cc
index 034d07ce2af..ff894a5a74f 100644
--- a/compute/cker/src/train/Loss.test.cc
+++ b/compute/cker/src/train/Loss.test.cc
@@ -48,7 +48,7 @@ template <typename T> class LossCCEVerifier
   // because it involves calculations such as log or exp.
   for (int i = 0; i < output.size(); ++i)
   {
-    EXPECT_NEAR(output[i], expected[i], 1e-3f);
+    EXPECT_NEAR(output[i], expected[i], 1e-4f);
   }
 }

@@ -66,7 +66,8 @@ template <typename T> class LossCCEVerifier
   }

   void verifyBackward(const std::vector<T> &y_pred, const std::vector<T> &y_true,
-                      const std::vector<T> &expected)
+                      const std::vector<T> &expected,
+                      nnfw::cker::train::LossReductionType reduction)
   {
     assert(y_pred.size() == y_true.size());

@@ -74,20 +75,46 @@ template <typename T> class LossCCEVerifier
     const int N = _in_shape.Dims(0);
     const int D = _in_shape.FlatSize() / N;

-    nnfw::cker::train::CategoricalCrossEntropyGrad(_in_shape, y_pred.data(), _in_shape,
-                                                   y_true.data(), _out_shape, output.data());
+    nnfw::cker::train::CategoricalCrossEntropyGrad(
+      _in_shape, y_pred.data(), _in_shape, y_true.data(), _out_shape, output.data(), reduction);

     // Don't panic if it fails after the kernel implementation or input is changed.
     // The CrossEntropy gradient formula can be calculated slightly differently depending on the
     // environment because it involves calculations such as log or exp.
     for (int i = 0; i < output.size(); ++i)
     {
-      EXPECT_NEAR(output[i], expected[i], 1e-3f);
+      EXPECT_NEAR(output[i], expected[i], 1e-4f);
     }
   }
+
+  void verifyBackwardWithLogits(const std::vector<T> &logits, const std::vector<T> &y_true,
+                                const std::vector<T> &expected_loss_out,
+                                const std::vector<T> &expected_grad,
+                                nnfw::cker::train::LossReductionType reduction)
+  {
+    assert(logits.size() == y_true.size());
+    assert(logits.size() == expected_grad.size());
+
+    std::vector<T> loss_out(_out_shape.FlatSize());
+    std::vector<T> grad(_in_shape.FlatSize());
+
+    nnfw::cker::train::CategoricalCrossEntropyWithLogits(_in_shape, logits.data(), _in_shape,
+                                                         y_true.data(), _out_shape, loss_out.data(),
+                                                         _in_shape, grad.data(), reduction);
+
+    for (int i = 0; i < loss_out.size(); ++i)
+    {
+      EXPECT_NEAR(loss_out[i], expected_loss_out[i], 1e-4f);
+    }
+
+    for (int i = 0; i < grad.size(); ++i)
+    {
+      EXPECT_NEAR(grad[i], expected_grad[i], 1e-4f);
+    }
+  }

   void throwBackward(const std::vector<T> &y_pred, const std::vector<T> &y_true,
-                     const std::vector<T> &expected)
+                     const std::vector<T> &expected, nnfw::cker::train::LossReductionType reduction)
   {
     assert(y_pred.size() == y_true.size());

@@ -96,7 +123,23 @@ template <typename T> class LossCCEVerifier
     const int D = _in_shape.FlatSize() / N;

     EXPECT_ANY_THROW(nnfw::cker::train::CategoricalCrossEntropyGrad(
-      _in_shape, y_pred.data(), _in_shape, y_true.data(), _out_shape, output.data()));
+      _in_shape, y_pred.data(), _in_shape, y_true.data(), _out_shape, output.data(), reduction));
+  }
+
+  void throwBackwardWithLogits(const std::vector<T> &logits, const std::vector<T> &y_true,
+                               const std::vector<T> &expected_loss_out,
+                               const std::vector<T> &expected_grad,
+                               nnfw::cker::train::LossReductionType reduction)
+  {
+    assert(logits.size() == y_true.size());
+    assert(logits.size() == expected_grad.size());
+
+    std::vector<T> loss_out(_out_shape.FlatSize());
+    std::vector<T> grad(_in_shape.FlatSize());
+
+    EXPECT_ANY_THROW(nnfw::cker::train::CategoricalCrossEntropyWithLogits(
+      _in_shape, logits.data(), _in_shape, y_true.data(), _out_shape, loss_out.data(), _in_shape,
+      grad.data(), reduction));
   }

 private:
@@ -221,7 +264,8 @@ TEST(CKer_Operation, LossMSEGrad)
   std::vector<float> expected = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0};

   nnfw::cker::train::MSEGrad(nnfw::cker::Shape{1, 10}, y_pred.data(), nnfw::cker::Shape{1, 10},
-                             y_true.data(), nnfw::cker::Shape{1, 10}, deriv_y_pred.data());
+                             y_true.data(), nnfw::cker::Shape{1, 10}, deriv_y_pred.data(),
+                             nnfw::cker::train::LossReductionType::SUM_OVER_BATCH_SIZE);

   for (size_t i = 0; i < deriv_y_pred.size(); ++i)
     EXPECT_EQ(deriv_y_pred[i], expected[i]);

@@ -235,21 +279,38 @@ TEST(CKer_Operation, LossMSEGrad)
   std::vector<float> expected = {0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2};

   nnfw::cker::train::MSEGrad(nnfw::cker::Shape{1, 10}, y_pred.data(), nnfw::cker::Shape{1, 10},
-                             y_true.data(), nnfw::cker::Shape{1, 10}, deriv_y_pred.data());
+                             y_true.data(), nnfw::cker::Shape{1, 10}, deriv_y_pred.data(),
+                             nnfw::cker::train::LossReductionType::SUM_OVER_BATCH_SIZE);

   for (size_t i = 0; i < deriv_y_pred.size(); ++i)
     EXPECT_FLOAT_EQ(deriv_y_pred[i], expected[i]);
 }

 {
-  // Shape: {2, 3} -> m_rows:3, m_cols:2
+  // Shape: {2, 3} -> m_rows:3, m_cols:2, LossReductionType::SUM_OVER_BATCH_SIZE
   std::vector<float> y_pred = {27.2, 31.8, 51.9, 10.2, 34.2, 12.4};
   std::vector<float> y_true = {31.3, 40.3, 29.7, 12.9, 25.8, 11.9};
   std::vector<float> deriv_y_pred(6);
   std::vector<float> expected = {-1.3666667, -2.8333333, 7.4, -0.9, 2.8, 0.1666667};

   nnfw::cker::train::MSEGrad(nnfw::cker::Shape{2, 3}, y_pred.data(), nnfw::cker::Shape{2, 3},
-                             y_true.data(), nnfw::cker::Shape{2, 3}, deriv_y_pred.data());
+                             y_true.data(), nnfw::cker::Shape{2, 3}, deriv_y_pred.data(),
+                             nnfw::cker::train::LossReductionType::SUM_OVER_BATCH_SIZE);
+
+  for (size_t i = 0; i < deriv_y_pred.size(); ++i)
+    EXPECT_FLOAT_EQ(deriv_y_pred[i], expected[i]);
+}
+
+{
+  // Shape: {2, 3} -> m_rows:3, m_cols:2, LossReductionType::SUM
+  std::vector<float> y_pred = {27.2, 31.8, 51.9, 10.2, 34.2, 12.4};
+  std::vector<float> y_true = {31.3, 40.3, 29.7, 12.9, 25.8, 11.9};
+  std::vector<float> deriv_y_pred(6);
+  std::vector<float> expected = {-2.7333324, -5.6666665, 14.8, -1.7999998, 5.6, 0.33333334};
+
+  nnfw::cker::train::MSEGrad(nnfw::cker::Shape{2, 3}, y_pred.data(), nnfw::cker::Shape{2, 3},
+                             y_true.data(), nnfw::cker::Shape{2, 3}, deriv_y_pred.data(),
+                             nnfw::cker::train::LossReductionType::SUM);

   for (size_t i = 0; i < deriv_y_pred.size(); ++i)
     EXPECT_FLOAT_EQ(deriv_y_pred[i], expected[i]);
@@ -266,7 +327,8 @@ TEST(CKer_Operation, neg_LossMSEGrad)
   std::vector<float> expected = {1., 1., 1., 1., 1., 1.};

   nnfw::cker::train::MSEGrad(nnfw::cker::Shape{2, 3}, y_pred.data(), nnfw::cker::Shape{2, 3},
-                             y_true.data(), nnfw::cker::Shape{2, 3}, deriv_y_pred.data());
+                             y_true.data(), nnfw::cker::Shape{2, 3}, deriv_y_pred.data(),
+                             nnfw::cker::train::LossReductionType::SUM_OVER_BATCH_SIZE);

   for (size_t i = 0; i < deriv_y_pred.size(); ++i)
     EXPECT_NE(deriv_y_pred[i], expected[i]);
@@ -278,9 +340,10 @@ TEST(CKer_Operation, neg_LossMSEGrad)
   std::vector<float> y_true = {0., 1., 2., 3., 4., 5.};
   std::vector<float> deriv_y_pred(10);

-  EXPECT_ANY_THROW(nnfw::cker::train::MSEGrad(nnfw::cker::Shape{1, 10}, y_pred.data(),
-                                              nnfw::cker::Shape{2, 3}, y_true.data(),
-                                              nnfw::cker::Shape{1, 10}, deriv_y_pred.data()));
+  EXPECT_ANY_THROW(
+    nnfw::cker::train::MSEGrad(nnfw::cker::Shape{1, 10}, y_pred.data(), nnfw::cker::Shape{2, 3},
+                               y_true.data(), nnfw::cker::Shape{1, 10}, deriv_y_pred.data(),
+                               nnfw::cker::train::LossReductionType::SUM_OVER_BATCH_SIZE));
 }

 {
@@ -289,9 +352,10 @@ TEST(CKer_Operation, neg_LossMSEGrad)
   std::vector<float> y_true = {0., 1., 2., 3., 4., 5., 6., 7., 8., 9.};
   std::vector<float> deriv_y_pred(6);

-  EXPECT_ANY_THROW(nnfw::cker::train::MSEGrad(nnfw::cker::Shape{1, 10}, y_pred.data(),
-                                              nnfw::cker::Shape{1, 10}, y_true.data(),
-                                              nnfw::cker::Shape{2, 3}, deriv_y_pred.data()));
+  EXPECT_ANY_THROW(
+    nnfw::cker::train::MSEGrad(nnfw::cker::Shape{1, 10}, y_pred.data(), nnfw::cker::Shape{1, 10},
+                               y_true.data(), nnfw::cker::Shape{2, 3}, deriv_y_pred.data(),
+                               nnfw::cker::train::LossReductionType::SUM_OVER_BATCH_SIZE));
 }
}

@@ -354,7 +418,7 @@ TEST(CKer_Operation, LossCategoricalCrossEntropyGrad)
   std::vector<float> expected = {0, 0, 0, 0, 0, 0, 0, 0, 0, -16.66666667};

   LossCCEVerifier<float> verifier(in_shape, grad_shape);
-  verifier.verifyBackward(y_pred, y_true, expected);
+  verifier.verifyBackward(y_pred, y_true, expected, nnfw::cker::train::LossReductionType::SUM);
 }

 {
@@ -368,7 +432,65 @@ TEST(CKer_Operation, LossCategoricalCrossEntropyGrad)
   0, 0, 0, 0, 0, 0, 0, 0, 0};

   LossCCEVerifier<float> verifier(in_shape, grad_shape);
-  verifier.verifyBackward(y_pred, y_true, expected);
+  verifier.verifyBackward(y_pred, y_true, expected, nnfw::cker::train::LossReductionType::SUM);
+}
+
+{
+  nnfw::cker::Shape in_shape{2, 10};
+  nnfw::cker::Shape grad_shape{2, 10};
+  std::vector<float> y_pred = {0.01, 0.03, 0.05, 0.35, 0.04, 0.05, 0.28, 0.09, 0.04, 0.06,
+                               0.89, 0.03, 0.04, 0.005, 0.023, 0.001, 0.004, 0.005, 0.001, 0.001};
+  std::vector<float> y_true = {0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0};
+  std::vector<float> expected = {0, 0, 0, 0, 0, 0, 0, 0, 0, -8.333333, -0.561797738,
+                                 0, 0, 0, 0, 0, 0, 0, 0, 0};
+
+  LossCCEVerifier<float> verifier(in_shape, grad_shape);
+  verifier.verifyBackward(y_pred, y_true, expected,
+                          nnfw::cker::train::LossReductionType::SUM_OVER_BATCH_SIZE);
+}
+
+{
+  nnfw::cker::Shape in_shape{1, 10};
+  nnfw::cker::Shape out_shape{1};
+
+  std::vector<float> logits = {1, 3, 5, 35, 4, 5, 28, 9, 4, 6};
+  std::vector<float> y_true = {0, 0, 0, 0, 0, 0, 0, 0, 0, 1};
+  std::vector<float> expected_loss_out = {29.0009};
+  std::vector<float> expected_grad = {0, 0, 0, 0.9991, 0, 0, 0.0009, 0, 0, -1};
+
+  LossCCEVerifier<float> verifier(in_shape, out_shape);
+  verifier.verifyBackwardWithLogits(logits, y_true, expected_loss_out, expected_grad,
+                                    nnfw::cker::train::LossReductionType::SUM);
+}
+
+{
+  nnfw::cker::Shape in_shape{2, 10};
+  nnfw::cker::Shape out_shape{2};
+
+  std::vector<float> logits = {1, 3, 5, 35, 4, 5, 28, 9, 4, 6, 89, 3, 4, 5, 23, 1, 4, 5, 1, 101};
+  std::vector<float> y_true = {0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0};
+  std::vector<float> expected_loss_out = {29.0009, 12};
+  std::vector<float> expected_grad = {0, 0, 0, 0.9991, 0, 0, 0.0009, 0, 0, -1,
+                                      -1, 0, 0, 0, 0, 0, 0, 0, 0, 1};
+
+  LossCCEVerifier<float> verifier(in_shape, out_shape);
+  verifier.verifyBackwardWithLogits(logits, y_true, expected_loss_out, expected_grad,
+                                    nnfw::cker::train::LossReductionType::SUM);
+}
+
+{
+  nnfw::cker::Shape in_shape{2, 10};
+  nnfw::cker::Shape out_shape{2};
+
+  std::vector<float> logits = {1, 3, 5, 35, 4, 5, 28, 9, 4, 6, 89, 3, 4, 5, 23, 1, 4, 5, 1, 101};
+  std::vector<float> y_true = {0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0};
+  std::vector<float> expected_loss_out = {29.0009, 12};
+  std::vector<float> expected_grad = {0, 0, 0, 0.4995, 0, 0, 0.0005, 0, 0, -0.5,
+                                      -0.5, 0, 0, 0, 0, 0, 0, 0, 0, 0.5};
+
+  LossCCEVerifier<float> verifier(in_shape, out_shape);
+  verifier.verifyBackwardWithLogits(logits, y_true, expected_loss_out, expected_grad,
+                                    nnfw::cker::train::LossReductionType::SUM_OVER_BATCH_SIZE);
 }
}

@@ -384,6 +506,24 @@ TEST(CKer_Operation, neg_LossCategoricalCrossEntropyGrad)
   std::vector<float> expected = {0, 0, 0, 0, 0, 0, 0, 0, 0, -16.66666667};

   LossCCEVerifier<float> verifier(in_shape, grad_shape);
-  verifier.throwBackward(y_pred, y_true, expected);
+  verifier.throwBackward(y_pred, y_true, expected, nnfw::cker::train::LossReductionType::SUM);
+  }
+}
+
+TEST(CKer_Operation, neg_LossCategoricalCrossEntropyWithLogits)
+{
+  // Invalid out shape
+  {
+    nnfw::cker::Shape in_shape{1, 10};
+    nnfw::cker::Shape out_shape{1, 1};
+
+    std::vector<float> logits = {1, 3, 5, 35, 4, 5, 28, 9, 4, 6};
+    std::vector<float> y_true = {0, 0, 0, 0, 0, 0, 0, 0, 0, 1};
+    std::vector<float> expected_loss_out = {29.0009};
+    std::vector<float> expected_grad = {0, 0, 0, 0.9991, 0, 0, 0.0009, 0, 0, -1};
+
+    LossCCEVerifier<float> verifier(in_shape, out_shape);
+    verifier.throwBackwardWithLogits(logits, y_true, expected_loss_out, expected_grad,
+                                     nnfw::cker::train::LossReductionType::SUM);
  }
}
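Note: the expected values in the WithLogits cases can be checked by hand from the xent math. For the row {1, 3, 5, 35, 4, 5, 28, 9, 4, 6} with the true class at index 9: max = 35, sum = e^0 + e^-7 + (negligible terms) ≈ 1.000912, so loss = log(sum) - (6 - 35) ≈ 29.0009; the softmax is ≈ 0.9991 at index 3 and ≈ 0.0009 at index 6, so grad = softmax - y_true is -1 at index 9 under SUM. Under SUM_OVER_BATCH_SIZE with batch 2, every gradient entry is halved, which is exactly the difference between the last two cases above.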
"${FlatBuffersSource_DIR}/include") set_property(TARGET flatbuffers-23.5.26 PROPERTY POSITION_INDEPENDENT_CODE ON) target_compile_options(flatbuffers-23.5.26 PUBLIC $<$:-Wno-sign-compare>) diff --git a/nnpackage/schema/circle_schema.fbs b/nnpackage/schema/circle_schema.fbs index 515b314e280..0498318bfce 100644 --- a/nnpackage/schema/circle_schema.fbs +++ b/nnpackage/schema/circle_schema.fbs @@ -286,6 +286,7 @@ table Tensor { // set of acceptable options. // LINT.IfChange enum BuiltinOperator : int32 { + RMS_NORM = -6, GRU = -5, BCQ_GATHER = -4, BCQ_FULLY_CONNECTED = -3, @@ -635,6 +636,7 @@ union BuiltinOptions { BitcastOptions, BitwiseXorOptions, RightShiftOptions, + RmsNormOptions = 250, GRUOptions = 251, BCQGatherOptions = 252, BCQFullyConnectedOptions = 253, @@ -1519,6 +1521,10 @@ table InstanceNormOptions { fused_activation_function:ActivationFunctionType; } +table RmsNormOptions { + epsilon:float; +} + // An OperatorCode can be an enum value (BuiltinOperator) if the operator is a // builtin, or a string if the operator is custom. table OperatorCode { diff --git a/onert-micro/onert-micro/include/execute/OMUtils.h b/onert-micro/onert-micro/include/execute/OMUtils.h index b45feb08953..109dbea23fa 100644 --- a/onert-micro/onert-micro/include/execute/OMUtils.h +++ b/onert-micro/onert-micro/include/execute/OMUtils.h @@ -163,6 +163,10 @@ void calculateQuantParams(core::ArithmeticQuantParams ¶ms, const circle::Ten OMStatus SISOHeader(const OMExecuteArgs &execute_args, const circle::Tensor **input, const circle::Tensor **output, uint8_t **input_data, uint8_t **output_data); +OMStatus TISOHeader(const OMExecuteArgs &execute_args, const circle::Tensor **input1, + const circle::Tensor **input2, const circle::Tensor **output, + OMRuntimeKernel *runtime_kernel); + } // namespace execute } // namespace onert_micro diff --git a/onert-micro/onert-micro/src/execute/OMUtils.cpp b/onert-micro/onert-micro/src/execute/OMUtils.cpp index 9bda002018c..1fca5d95331 100644 --- a/onert-micro/onert-micro/src/execute/OMUtils.cpp +++ b/onert-micro/onert-micro/src/execute/OMUtils.cpp @@ -236,3 +236,32 @@ void onert_micro::execute::calculateQuantParams(core::ArithmeticQuantParams &par ¶ms.quantized_activation_min, ¶ms.quantized_activation_max); } + +OMStatus onert_micro::execute::TISOHeader(const OMExecuteArgs &execute_args, + const circle::Tensor **input1, + const circle::Tensor **input2, + const circle::Tensor **output, + OMRuntimeKernel *runtime_kernel) +{ + OMStatus status; + + core::OMRuntimeContext &runtime_context = execute_args.runtime_context; + core::OMRuntimeStorage &runtime_storage = execute_args.runtime_storage; + uint16_t op_index = execute_args.kernel_index; + + status = runtime_kernel->readKernel(op_index, runtime_context); + + *input1 = runtime_kernel->inputs[0]; + *input2 = runtime_kernel->inputs[1]; + *output = runtime_kernel->outputs[0]; + + assert(*input1 != nullptr); + assert(*input2 != nullptr); + assert(*output != nullptr); + + status = runtime_kernel->getDataFromStorage(op_index, runtime_storage, runtime_context); + if (status != Ok) + return status; + + return status; +} diff --git a/onert-micro/onert-micro/src/execute/kernels/Equal.cpp b/onert-micro/onert-micro/src/execute/kernels/Equal.cpp index 0ff8ca8443d..4c7a6177176 100644 --- a/onert-micro/onert-micro/src/execute/kernels/Equal.cpp +++ b/onert-micro/onert-micro/src/execute/kernels/Equal.cpp @@ -21,6 +21,8 @@ #include "execute/kernels/ComparisonCommon.h" #include "PALComparisons.h" +#include "execute/OMUtils.h" + using namespace 
onert_micro; using namespace onert_micro::core; using namespace onert_micro::execute; @@ -36,10 +38,6 @@ constexpr uint32_t outputTensorIdx = 0; OMStatus onert_micro::execute::execute_kernel_CircleEqual(const OMExecuteArgs &execute_args) { - core::OMRuntimeContext &runtime_context = execute_args.runtime_context; - core::OMRuntimeStorage &runtime_storage = execute_args.runtime_storage; - uint16_t op_index = execute_args.kernel_index; - OMStatus status = Ok; const circle::Tensor *input1 = nullptr; @@ -47,19 +45,8 @@ OMStatus onert_micro::execute::execute_kernel_CircleEqual(const OMExecuteArgs &e const circle::Tensor *output = nullptr; OMRuntimeKernel runtime_kernel; - runtime_kernel.readKernel(op_index, runtime_context); - - status = runtime_kernel.getDataFromStorage(op_index, runtime_storage, runtime_context); - if (status != Ok) - return status; - - input1 = runtime_kernel.inputs[input1TensorIdx]; - input2 = runtime_kernel.inputs[input2TensorIdx]; - output = runtime_kernel.outputs[outputTensorIdx]; - assert(input1 != nullptr); - assert(input2 != nullptr); - assert(output != nullptr); + TISOHeader(execute_args, &input1, &input2, &output, &runtime_kernel); switch (input1->type()) { diff --git a/onert-micro/onert-micro/src/execute/kernels/Greater.cpp b/onert-micro/onert-micro/src/execute/kernels/Greater.cpp index 5f95c0e0a79..82feefc148f 100644 --- a/onert-micro/onert-micro/src/execute/kernels/Greater.cpp +++ b/onert-micro/onert-micro/src/execute/kernels/Greater.cpp @@ -21,6 +21,8 @@ #include "execute/kernels/ComparisonCommon.h" #include "PALComparisons.h" +#include "execute/OMUtils.h" + using namespace onert_micro; using namespace onert_micro::core; using namespace onert_micro::execute; @@ -36,10 +38,6 @@ constexpr uint32_t outputTensorIdx = 0; OMStatus onert_micro::execute::execute_kernel_CircleGreater(const OMExecuteArgs &execute_args) { - core::OMRuntimeContext &runtime_context = execute_args.runtime_context; - core::OMRuntimeStorage &runtime_storage = execute_args.runtime_storage; - uint16_t op_index = execute_args.kernel_index; - OMStatus status = Ok; const circle::Tensor *input1 = nullptr; @@ -47,19 +45,8 @@ OMStatus onert_micro::execute::execute_kernel_CircleGreater(const OMExecuteArgs const circle::Tensor *output = nullptr; OMRuntimeKernel runtime_kernel; - runtime_kernel.readKernel(op_index, runtime_context); - - status = runtime_kernel.getDataFromStorage(op_index, runtime_storage, runtime_context); - if (status != Ok) - return status; - - input1 = runtime_kernel.inputs[input1TensorIdx]; - input2 = runtime_kernel.inputs[input2TensorIdx]; - output = runtime_kernel.outputs[outputTensorIdx]; - assert(input1 != nullptr); - assert(input2 != nullptr); - assert(output != nullptr); + TISOHeader(execute_args, &input1, &input2, &output, &runtime_kernel); switch (input1->type()) { diff --git a/onert-micro/onert-micro/src/execute/kernels/GreaterEqual.cpp b/onert-micro/onert-micro/src/execute/kernels/GreaterEqual.cpp index 325e332f838..70255d919f2 100644 --- a/onert-micro/onert-micro/src/execute/kernels/GreaterEqual.cpp +++ b/onert-micro/onert-micro/src/execute/kernels/GreaterEqual.cpp @@ -21,6 +21,8 @@ #include "execute/kernels/ComparisonCommon.h" #include "PALComparisons.h" +#include "execute/OMUtils.h" + using namespace onert_micro; using namespace onert_micro::core; using namespace onert_micro::execute; @@ -36,10 +38,6 @@ constexpr uint32_t outputTensorIdx = 0; OMStatus onert_micro::execute::execute_kernel_CircleGreaterEqual(const OMExecuteArgs &execute_args) { - core::OMRuntimeContext 
&runtime_context = execute_args.runtime_context; - core::OMRuntimeStorage &runtime_storage = execute_args.runtime_storage; - uint16_t op_index = execute_args.kernel_index; - OMStatus status = Ok; const circle::Tensor *input1 = nullptr; @@ -47,19 +45,8 @@ OMStatus onert_micro::execute::execute_kernel_CircleGreaterEqual(const OMExecute const circle::Tensor *output = nullptr; OMRuntimeKernel runtime_kernel; - runtime_kernel.readKernel(op_index, runtime_context); - - status = runtime_kernel.getDataFromStorage(op_index, runtime_storage, runtime_context); - if (status != Ok) - return status; - - input1 = runtime_kernel.inputs[input1TensorIdx]; - input2 = runtime_kernel.inputs[input2TensorIdx]; - output = runtime_kernel.outputs[outputTensorIdx]; - assert(input1 != nullptr); - assert(input2 != nullptr); - assert(output != nullptr); + TISOHeader(execute_args, &input1, &input2, &output, &runtime_kernel); switch (input1->type()) { diff --git a/onert-micro/onert-micro/src/execute/kernels/Less.cpp b/onert-micro/onert-micro/src/execute/kernels/Less.cpp index b815849cf6f..a7a3b2f9d9b 100644 --- a/onert-micro/onert-micro/src/execute/kernels/Less.cpp +++ b/onert-micro/onert-micro/src/execute/kernels/Less.cpp @@ -21,6 +21,8 @@ #include "execute/kernels/ComparisonCommon.h" #include "PALComparisons.h" +#include "execute/OMUtils.h" + using namespace onert_micro; using namespace onert_micro::core; using namespace onert_micro::execute; @@ -36,10 +38,6 @@ constexpr uint32_t outputTensorIdx = 0; OMStatus onert_micro::execute::execute_kernel_CircleLess(const OMExecuteArgs &execute_args) { - core::OMRuntimeContext &runtime_context = execute_args.runtime_context; - core::OMRuntimeStorage &runtime_storage = execute_args.runtime_storage; - uint16_t op_index = execute_args.kernel_index; - OMStatus status = Ok; const circle::Tensor *input1 = nullptr; @@ -47,19 +45,8 @@ OMStatus onert_micro::execute::execute_kernel_CircleLess(const OMExecuteArgs &ex const circle::Tensor *output = nullptr; OMRuntimeKernel runtime_kernel; - runtime_kernel.readKernel(op_index, runtime_context); - - status = runtime_kernel.getDataFromStorage(op_index, runtime_storage, runtime_context); - if (status != Ok) - return status; - - input1 = runtime_kernel.inputs[input1TensorIdx]; - input2 = runtime_kernel.inputs[input2TensorIdx]; - output = runtime_kernel.outputs[outputTensorIdx]; - assert(input1 != nullptr); - assert(input2 != nullptr); - assert(output != nullptr); + TISOHeader(execute_args, &input1, &input2, &output, &runtime_kernel); switch (input1->type()) { diff --git a/onert-micro/onert-micro/src/execute/kernels/NotEqual.cpp b/onert-micro/onert-micro/src/execute/kernels/NotEqual.cpp index 5591c3eb76c..b2c8bc97ad2 100644 --- a/onert-micro/onert-micro/src/execute/kernels/NotEqual.cpp +++ b/onert-micro/onert-micro/src/execute/kernels/NotEqual.cpp @@ -21,6 +21,8 @@ #include "execute/kernels/ComparisonCommon.h" #include "PALComparisons.h" +#include "execute/OMUtils.h" + using namespace onert_micro; using namespace onert_micro::core; using namespace onert_micro::execute; @@ -36,10 +38,6 @@ constexpr uint32_t outputTensorIdx = 0; OMStatus onert_micro::execute::execute_kernel_CircleNotEqual(const OMExecuteArgs &execute_args) { - core::OMRuntimeContext &runtime_context = execute_args.runtime_context; - core::OMRuntimeStorage &runtime_storage = execute_args.runtime_storage; - uint16_t op_index = execute_args.kernel_index; - OMStatus status = Ok; const circle::Tensor *input1 = nullptr; @@ -47,19 +45,8 @@ OMStatus 
onert_micro::execute::execute_kernel_CircleNotEqual(const OMExecuteArgs const circle::Tensor *output = nullptr; OMRuntimeKernel runtime_kernel; - runtime_kernel.readKernel(op_index, runtime_context); - - status = runtime_kernel.getDataFromStorage(op_index, runtime_storage, runtime_context); - if (status != Ok) - return status; - - input1 = runtime_kernel.inputs[input1TensorIdx]; - input2 = runtime_kernel.inputs[input2TensorIdx]; - output = runtime_kernel.outputs[outputTensorIdx]; - assert(input1 != nullptr); - assert(input2 != nullptr); - assert(output != nullptr); + TISOHeader(execute_args, &input1, &input2, &output, &runtime_kernel); switch (input1->type()) { diff --git a/onert-micro/training-configure-tool/src/TrainingDriverHandler.cpp b/onert-micro/training-configure-tool/src/TrainingDriverHandler.cpp index fe4a3ad8431..eea65336980 100644 --- a/onert-micro/training-configure-tool/src/TrainingDriverHandler.cpp +++ b/onert-micro/training-configure-tool/src/TrainingDriverHandler.cpp @@ -156,7 +156,7 @@ OMStatus training_configure_tool::runTrainProcessWithCurConfig( case onert_micro::CROSS_ENTROPY_METRICS: { float cross_entropy_metric = 0.f; - train_interpreter.evaluateMetric(onert_micro::CROSS_ENTROPY_METRICS, + train_interpreter.evaluateMetric(config, onert_micro::CROSS_ENTROPY_METRICS, reinterpret_cast(&cross_entropy_metric), cur_batch_size); cross_entropy_v.push_back(cross_entropy_metric); @@ -165,7 +165,7 @@ OMStatus training_configure_tool::runTrainProcessWithCurConfig( case onert_micro::ACCURACY: { float accuracy = 0.f; - train_interpreter.evaluateMetric(onert_micro::ACCURACY, + train_interpreter.evaluateMetric(config, onert_micro::ACCURACY, reinterpret_cast(&accuracy), cur_batch_size); accuracy_v.push_back(accuracy); } @@ -173,16 +173,16 @@ OMStatus training_configure_tool::runTrainProcessWithCurConfig( case onert_micro::MSE_METRICS: { float mse = 0.f; - train_interpreter.evaluateMetric(onert_micro::MSE_METRICS, reinterpret_cast(&mse), - cur_batch_size); + train_interpreter.evaluateMetric(config, onert_micro::MSE_METRICS, + reinterpret_cast(&mse), cur_batch_size); mse_v.push_back(mse); } break; case onert_micro::MAE_METRICS: { float mae = 0.f; - train_interpreter.evaluateMetric(onert_micro::MAE_METRICS, reinterpret_cast(&mae), - cur_batch_size); + train_interpreter.evaluateMetric(config, onert_micro::MAE_METRICS, + reinterpret_cast(&mae), cur_batch_size); mae_v.push_back(mae); } break; diff --git a/res/CircleSchema/0.9/circle_schema.fbs b/res/CircleSchema/0.9/circle_schema.fbs index 515b314e280..0498318bfce 100644 --- a/res/CircleSchema/0.9/circle_schema.fbs +++ b/res/CircleSchema/0.9/circle_schema.fbs @@ -286,6 +286,7 @@ table Tensor { // set of acceptable options. // LINT.IfChange enum BuiltinOperator : int32 { + RMS_NORM = -6, GRU = -5, BCQ_GATHER = -4, BCQ_FULLY_CONNECTED = -3, @@ -635,6 +636,7 @@ union BuiltinOptions { BitcastOptions, BitwiseXorOptions, RightShiftOptions, + RmsNormOptions = 250, GRUOptions = 251, BCQGatherOptions = 252, BCQFullyConnectedOptions = 253, @@ -1519,6 +1521,10 @@ table InstanceNormOptions { fused_activation_function:ActivationFunctionType; } +table RmsNormOptions { + epsilon:float; +} + // An OperatorCode can be an enum value (BuiltinOperator) if the operator is a // builtin, or a string if the operator is custom. 
table OperatorCode { diff --git a/res/TensorFlowLiteRecipes/Conv2D_007/test.recipe b/res/TensorFlowLiteRecipes/Conv2D_007/test.recipe new file mode 100644 index 00000000000..a0879c460db --- /dev/null +++ b/res/TensorFlowLiteRecipes/Conv2D_007/test.recipe @@ -0,0 +1,83 @@ +operand { + name: "ifm" + type: FLOAT32 + shape { dim: 1 dim: 8 dim: 8 dim: 1 } +} +operand { + name: "ker_1" + type: FLOAT32 + shape { dim: 1 dim: 1 dim: 1 dim: 1 } + filler { + tag: "gaussian" + arg: "0.0" + arg: "1.0" + } +} +operand { + name: "bias_1" + type: FLOAT32 + shape { dim: 1 } + filler { + tag: "gaussian" + arg: "0.0" + arg: "1.0" + } +} +operand { + name: "ofm_1" + type: FLOAT32 + shape { dim: 1 dim: 4 dim: 4 dim: 1 } +} +operation { + type: "Conv2D" + conv2d_options { + padding: SAME + stride_w: 2 + stride_h: 2 + activation: RELU + } + input: "ifm" + input: "ker_1" + input: "bias_1" + output: "ofm_1" +} +operand { + name: "ker_2" + type: FLOAT32 + shape { dim: 1 dim: 1 dim: 1 dim: 1 } + filler { + tag: "gaussian" + arg: "0.0" + arg: "1.0" + } +} +operand { + name: "bias_2" + type: FLOAT32 + shape { dim: 1 } + filler { + tag: "gaussian" + arg: "0.0" + arg: "1.0" + } +} +operand { + name: "ofm_2" + type: FLOAT32 + shape { dim: 1 dim: 2 dim: 2 dim: 1 } +} +operation { + type: "Conv2D" + conv2d_options { + padding: SAME + stride_w: 2 + stride_h: 2 + activation: RELU + } + input: "ofm_1" + input: "ker_2" + input: "bias_2" + output: "ofm_2" +} +input: "ifm" +output: "ofm_2" diff --git a/res/TensorFlowLiteRecipes/Conv2D_007/test.reverse b/res/TensorFlowLiteRecipes/Conv2D_007/test.reverse new file mode 100644 index 00000000000..e69de29bb2d diff --git a/res/TensorFlowLiteRecipes/FullyConnected_010/test.recipe b/res/TensorFlowLiteRecipes/FullyConnected_010/test.recipe new file mode 100644 index 00000000000..f7e086f95e0 --- /dev/null +++ b/res/TensorFlowLiteRecipes/FullyConnected_010/test.recipe @@ -0,0 +1,77 @@ +operand { + name: "in" + type: FLOAT32 + shape { dim: 1 dim: 16 } +} +operand { + name: "weight" + type: FLOAT32 + shape { dim: 16 dim: 16 } + filler { + tag: "gaussian" + arg: "0.0" + arg: "1.0" + } +} +operand { + name: "bias" + type: FLOAT32 + shape { dim: 16 } + filler { + tag: "gaussian" + arg: "0.0" + arg: "1.0" + } +} +operand { + name: "out" + type: FLOAT32 + shape { dim: 1 dim: 16 } +} +operation { + type: "FullyConnected" + fullyconnected_options { + activation: RELU + } + input: "in" + input: "weight" + input: "bias" + output: "out" +} +operand { + name: "weight_2" + type: FLOAT32 + shape { dim: 4 dim: 16 } + filler { + tag: "gaussian" + arg: "0.0" + arg: "1.0" + } +} +operand { + name: "bias_2" + type: FLOAT32 + shape { dim: 4 } + filler { + tag: "gaussian" + arg: "0.0" + arg: "1.0" + } +} +operand { + name: "out_2" + type: FLOAT32 + shape { dim: 1 dim: 4 } +} +operation { + type: "FullyConnected" + fullyconnected_options { + activation: RELU + } + input: "out" + input: "weight_2" + input: "bias_2" + output: "out_2" +} +input: "in" +output: "out_2" diff --git a/res/TensorFlowLiteRecipes/FullyConnected_010/test.reverse b/res/TensorFlowLiteRecipes/FullyConnected_010/test.reverse new file mode 100644 index 00000000000..e69de29bb2d diff --git a/res/TensorFlowLiteRecipes/Net_Conv_Gelu_000/test.recipe b/res/TensorFlowLiteRecipes/Net_Conv_Gelu_000/test.recipe new file mode 100644 index 00000000000..89314254f20 --- /dev/null +++ b/res/TensorFlowLiteRecipes/Net_Conv_Gelu_000/test.recipe @@ -0,0 +1,78 @@ +operand { + name: "Placeholder" + type: FLOAT32 + shape { dim: 1 dim: 16 dim: 16 dim: 3 } +} +operand { 
+ name: "Conv2D_1" + type: FLOAT32 + shape { dim: 3 dim: 3 dim: 3 dim: 3 } + filler { tag: "gaussian" arg: "0.0" arg: "0.1" } +} +operand { + name: "Conv2D_2" + type: FLOAT32 + shape { dim: 3 } + filler { tag: "gaussian" arg: "0.0" arg: "0.1" } +} +operand { + name: "Conv2D_21" + type: FLOAT32 + shape { dim: 3 dim: 3 dim: 3 dim: 3 } + filler { tag: "gaussian" arg: "0.0" arg: "0.1" } +} +operand { + name: "Conv2D_11" + type: FLOAT32 + shape { dim: 1 dim: 16 dim: 16 dim: 3 } +} +operand { + name: "Gelu" + type: FLOAT32 + shape { dim: 1 dim: 16 dim: 16 dim: 3 } +} +operand { + name: "Conv2D_22" + type: FLOAT32 + shape { dim: 1 dim: 16 dim: 16 dim: 3 } +} +operation { + type: "Conv2D" + input: "Placeholder" + input: "Conv2D_1" + input: "Conv2D_2" + output: "Conv2D_11" + conv2d_options { + padding: SAME + stride_w: 1 + stride_h: 1 + activation: NONE + dilation_w_factor: 1 + dilation_h_factor: 1 + } +} +operation { + type: "Gelu" + gelu_options { + approximate: false + } + input: "Conv2D_11" + output: "Gelu" +} +operation { + type: "Conv2D" + input: "Gelu" + input: "Conv2D_21" + input: "Conv2D_2" + output: "Conv2D_22" + conv2d_options { + padding: SAME + stride_w: 1 + stride_h: 1 + activation: NONE + dilation_w_factor: 1 + dilation_h_factor: 1 + } +} +input: "Placeholder" +output: "Conv2D_22" diff --git a/res/TensorFlowLiteRecipes/Net_Conv_Pad_000/test.recipe b/res/TensorFlowLiteRecipes/Net_Conv_Pad_000/test.recipe new file mode 100644 index 00000000000..c09793b160b --- /dev/null +++ b/res/TensorFlowLiteRecipes/Net_Conv_Pad_000/test.recipe @@ -0,0 +1,107 @@ +# Conv - Pad - Conv +operand { + name: "ifm" + type: FLOAT32 + shape { dim: 1 dim: 8 dim: 8 dim: 1 } +} +operand { + name: "ker_1" + type: FLOAT32 + shape { dim: 1 dim: 2 dim: 2 dim: 1 } + filler { + tag: "gaussian" + arg: "0.0" + arg: "1.0" + } +} +operand { + name: "bias_1" + type: FLOAT32 + shape { dim: 1 } + filler { + tag: "gaussian" + arg: "0.0" + arg: "1.0" + } +} +operand { + name: "ofm_1" + type: FLOAT32 + shape { dim: 1 dim: 4 dim: 4 dim: 1 } +} +operation { + type: "Conv2D" + conv2d_options { + padding: SAME + stride_w: 2 + stride_h: 2 + activation: RELU + } + input: "ifm" + input: "ker_1" + input: "bias_1" + output: "ofm_1" +} +operand { + name: "pad" + type: INT32 + shape { dim: 4 dim: 2 } + filler { + tag: "explicit" + arg: "0" arg: "0" + arg: "2" arg: "2" + arg: "2" arg: "2" + arg: "0" arg: "0" + } +} +operand { + name: "pad_ofm" + type: FLOAT32 + shape { dim: 1 dim: 8 dim: 8 dim: 1 } +} +operation { + type: "Pad" + input: "ofm_1" + input: "pad" + output: "pad_ofm" +} +operand { + name: "ker_2" + type: FLOAT32 + shape { dim: 1 dim: 1 dim: 1 dim: 1 } + filler { + tag: "gaussian" + arg: "0.0" + arg: "1.0" + } +} +operand { + name: "bias_2" + type: FLOAT32 + shape { dim: 1 } + filler { + tag: "gaussian" + arg: "0.0" + arg: "1.0" + } +} +operand { + name: "ofm_2" + type: FLOAT32 + shape { dim: 1 dim: 4 dim: 4 dim: 1 } +} +operation { + type: "Conv2D" + conv2d_options { + padding: SAME + stride_w: 2 + stride_h: 2 + activation: RELU + } + input: "pad_ofm" + input: "ker_2" + input: "bias_2" + output: "ofm_2" +} +input: "ifm" +output: "ofm_2" diff --git a/res/TensorFlowLiteRecipes/Net_Conv_TConv_000/test.recipe b/res/TensorFlowLiteRecipes/Net_Conv_TConv_000/test.recipe new file mode 100644 index 00000000000..d00455cfcd6 --- /dev/null +++ b/res/TensorFlowLiteRecipes/Net_Conv_TConv_000/test.recipe @@ -0,0 +1,83 @@ +operand { + name: "ifm" + type: FLOAT32 + shape { dim: 1 dim: 8 dim: 8 dim: 1 } +} +operand { + name: "ker_1" + type: 
FLOAT32 + shape { dim: 3 dim: 2 dim: 2 dim: 1 } + filler { + tag: "gaussian" + arg: "0.0" + arg: "1.0" + } +} +operand { + name: "bias_1" + type: FLOAT32 + shape { dim: 3 } + filler { + tag: "gaussian" + arg: "0.0" + arg: "1.0" + } +} +operand { + name: "ofm_1" + type: FLOAT32 + shape { dim: 1 dim: 4 dim: 4 dim: 3 } +} +operation { + type: "Conv2D" + conv2d_options { + padding: SAME + stride_w: 2 + stride_h: 2 + activation: RELU + } + input: "ifm" + input: "ker_1" + input: "bias_1" + output: "ofm_1" +} +operand { + name: "out_shape" + type: INT32 + shape { dim: 4 } + filler { + tag: "explicit" + arg: "1" arg: "4" arg: "4" arg: "3" + } +} +operand { + name: "ker_2" + type: FLOAT32 + shape { dim: 3 dim: 1 dim: 1 dim: 3 } + filler { + tag: "gaussian" + arg: "0.0" + arg: "1.0" + } +} +operand { + name: "ofm_2" + type: FLOAT32 + shape { dim: 1 dim: 4 dim: 4 dim: 3 } +} + +operation { + type: "TransposeConv" + transpose_conv_options { + padding: SAME + stride_w: 1 + stride_h: 1 + activation: NONE + } + input: "out_shape" + input: "ker_2" + input: "ofm_1" + output: "ofm_2" +} +input: "ifm" +output: "ofm_2" diff --git a/res/TensorFlowLiteRecipes/Net_Conv_TConv_000/test.reverse b/res/TensorFlowLiteRecipes/Net_Conv_TConv_000/test.reverse new file mode 100644 index 00000000000..e69de29bb2d diff --git a/res/TensorFlowLiteRecipes/Net_DConv_Conv_000/test.recipe b/res/TensorFlowLiteRecipes/Net_DConv_Conv_000/test.recipe new file mode 100644 index 00000000000..ad2e784231a --- /dev/null +++ b/res/TensorFlowLiteRecipes/Net_DConv_Conv_000/test.recipe @@ -0,0 +1,86 @@ +operand { + name: "ifm" + type: FLOAT32 + shape { dim: 1 dim: 8 dim: 8 dim: 2 } +} +operand { + name: "ker" + type: FLOAT32 + shape { dim: 1 dim: 2 dim: 2 dim: 4 } + filler { + tag: "gaussian" + arg: "0.0" + arg: "1.0" + } +} +operand { + name: "bias" + type: FLOAT32 + shape { dim: 4 } + filler { + tag: "gaussian" + arg: "0.0" + arg: "1.0" + } +} +operand { + name: "ofm" + type: FLOAT32 + shape { dim: 1 dim: 4 dim: 4 dim: 4 } +} +operation { + type: "DepthwiseConv2D" + depthwiseconv2d_options { + padding: VALID + stride_w: 2 + stride_h: 2 + dilation_w_factor: 1 + dilation_h_factor: 1 + depth_multiplier: 2 + activation : NONE + } + input: "ifm" + input: "ker" + input: "bias" + output: "ofm" +} +operand { + name: "ker_1" + type: FLOAT32 + shape { dim: 3 dim: 2 dim: 2 dim: 4 } + filler { + tag: "gaussian" + arg: "0.0" + arg: "1.0" + } +} +operand { + name: "bias_1" + type: FLOAT32 + shape { dim: 3 } + filler { + tag: "gaussian" + arg: "0.0" + arg: "1.0" + } +} +operand { + name: "ofm_1" + type: FLOAT32 + shape { dim: 1 dim: 2 dim: 2 dim: 3 } +} +operation { + type: "Conv2D" + conv2d_options { + padding: SAME + stride_w: 2 + stride_h: 2 + activation: NONE + } + input: "ofm" + input: "ker_1" + input: "bias_1" + output: "ofm_1" +} +input: "ifm" +output: "ofm_1" diff --git a/res/TensorFlowLiteRecipes/Net_DConv_Conv_000/test.reverse b/res/TensorFlowLiteRecipes/Net_DConv_Conv_000/test.reverse new file mode 100644 index 00000000000..e69de29bb2d diff --git a/res/TensorFlowLiteRecipes/Net_FullyConnected_Gelu_000/test.recipe b/res/TensorFlowLiteRecipes/Net_FullyConnected_Gelu_000/test.recipe new file mode 100644 index 00000000000..0e47be64b46 --- /dev/null +++ b/res/TensorFlowLiteRecipes/Net_FullyConnected_Gelu_000/test.recipe @@ -0,0 +1,90 @@ +operand { + name: "in" + type: FLOAT32 + shape { dim: 1 dim: 16 } +} +operand { + name: "weight" + type: FLOAT32 + shape { dim: 16 dim: 16 } + filler { + tag: "gaussian" + arg: "0.0" + arg: "1.0" + } +} +operand { + 
name: "bias" + type: FLOAT32 + shape { dim: 16 } + filler { + tag: "gaussian" + arg: "0.0" + arg: "1.0" + } +} +operand { + name: "out" + type: FLOAT32 + shape { dim: 1 dim: 16 } +} +operation { + type: "FullyConnected" + fullyconnected_options { + activation: RELU + } + input: "in" + input: "weight" + input: "bias" + output: "out" +} +operand { + name: "gelu_out" + type: FLOAT32 + shape { dim: 1 dim: 16 } +} +operation { + type: "Gelu" + gelu_options { + approximate: false + } + input: "out" + output: "gelu_out" +} +operand { + name: "weight_2" + type: FLOAT32 + shape { dim: 4 dim: 16 } + filler { + tag: "gaussian" + arg: "0.0" + arg: "1.0" + } +} +operand { + name: "bias_2" + type: FLOAT32 + shape { dim: 4 } + filler { + tag: "gaussian" + arg: "0.0" + arg: "1.0" + } +} +operand { + name: "out_2" + type: FLOAT32 + shape { dim: 1 dim: 4 } +} +operation { + type: "FullyConnected" + fullyconnected_options { + activation: RELU + } + input: "gelu_out" + input: "weight_2" + input: "bias_2" + output: "out_2" +} +input: "in" +output: "out_2" diff --git a/res/TensorFlowLiteRecipes/Net_FullyConnected_Gelu_000/test.reverse b/res/TensorFlowLiteRecipes/Net_FullyConnected_Gelu_000/test.reverse new file mode 100644 index 00000000000..e69de29bb2d diff --git a/res/TensorFlowLiteRecipes/Net_InstNorm_Conv_000/test.recipe b/res/TensorFlowLiteRecipes/Net_InstNorm_Conv_000/test.recipe new file mode 100644 index 00000000000..33864d2d815 --- /dev/null +++ b/res/TensorFlowLiteRecipes/Net_InstNorm_Conv_000/test.recipe @@ -0,0 +1,332 @@ +# +# InstanceNorm - LeakyRelu - Conv2D +# + +operand { + name: "Input" + type: FLOAT32 + shape { + dim: 1 + dim: 8 + dim: 6 + dim: 12 + } + quant { + min: 0 + max: 255 + } +} +operand { + name: "InstanceNorm/beta/read" + type: FLOAT32 + shape { + dim: 12 + } + filler { + tag: "explicit" + arg: "1.9714" + arg: "1.4517" + arg: "1.20315" + arg: "0.287979" + arg: "0.161815" + arg: "-0.281398" + arg: "2.70276" + arg: "-0.166962" + arg: "0.266389" + arg: "0.890943" + arg: "-0.279833" + arg: "1.82808" + } +} +operand { + name: "InstanceNorm/gamma/read" + type: FLOAT32 + shape { + dim: 12 + } + filler { + tag: "explicit" + arg: "0.574708" + arg: "0.387735" + arg: "0.8995" + arg: "0.484296" + arg: "2.35851" + arg: "1.06661" + arg: "0.343602" + arg: "2.27583" + arg: "1.14559" + arg: "0.690169" + arg: "1.2044" + arg: "0.350952" + } +} +operand { + name: "InstanceNorm/instancenorm/Rsqrt" + type: FLOAT32 + shape { + dim: 1 + dim: 1 + dim: 1 + dim: 12 + } +} +operand { + name: "InstanceNorm/instancenorm/add" + type: FLOAT32 + shape { + dim: 1 + dim: 1 + dim: 1 + dim: 12 + } +} +operand { + name: "InstanceNorm/instancenorm/add/y" + type: FLOAT32 + shape { + } + filler { + tag: "explicit" + arg: "0.001" + } +} +operand { + name: "InstanceNorm/instancenorm/mul" + type: FLOAT32 + shape { + dim: 1 + dim: 1 + dim: 1 + dim: 12 + } +} +operand { + name: "InstanceNorm/instancenorm/mul_1" + type: FLOAT32 + shape { + dim: 1 + dim: 8 + dim: 6 + dim: 12 + } +} +operand { + name: "InstanceNorm/instancenorm/mul_2" + type: FLOAT32 + shape { + dim: 1 + dim: 1 + dim: 1 + dim: 12 + } +} +operand { + name: "InstanceNorm/instancenorm/sub" + type: FLOAT32 + shape { + dim: 1 + dim: 1 + dim: 1 + dim: 12 + } +} +operand { + name: "InstanceNorm/moments/SquaredDifference" + type: FLOAT32 + shape { + dim: 1 + dim: 8 + dim: 6 + dim: 12 + } +} +operand { + name: "InstanceNorm/moments/mean" + type: FLOAT32 + shape { + dim: 1 + dim: 1 + dim: 1 + dim: 12 + } +} +operand { + name: "InstanceNorm/moments/mean/reduction_indices" + 
type: INT32 + shape { + dim: 2 + } + filler { + tag: "explicit" + arg: "1" + arg: "2" + } +} +operand { + name: "InstanceNorm/moments/variance" + type: FLOAT32 + shape { + dim: 1 + dim: 1 + dim: 1 + dim: 12 + } +} +operand { + name: "InstanceNorm/moments/variance/reduction_indices" + type: INT32 + shape { + dim: 2 + } + filler { + tag: "explicit" + arg: "1" + arg: "2" + } +} +operand { + name: "InstanceNorm_out" + type: FLOAT32 + shape { + dim: 1 + dim: 8 + dim: 6 + dim: 12 + } +} +operation { + type: "Mean" + input: "Input" + input: "InstanceNorm/moments/mean/reduction_indices" + output: "InstanceNorm/moments/mean" + mean_options { + keep_dims: true + } +} +operation { + type: "SquaredDifference" + input: "Input" + input: "InstanceNorm/moments/mean" + output: "InstanceNorm/moments/SquaredDifference" +} +operation { + type: "Mean" + input: "InstanceNorm/moments/SquaredDifference" + input: "InstanceNorm/moments/variance/reduction_indices" + output: "InstanceNorm/moments/variance" + mean_options { + keep_dims: true + } +} +operation { + type: "Add" + input: "InstanceNorm/moments/variance" + input: "InstanceNorm/instancenorm/add/y" + output: "InstanceNorm/instancenorm/add" + add_options { + activation: NONE + } +} +operation { + type: "Rsqrt" + input: "InstanceNorm/instancenorm/add" + output: "InstanceNorm/instancenorm/Rsqrt" +} +operation { + type: "Mul" + input: "InstanceNorm/instancenorm/Rsqrt" + input: "InstanceNorm/gamma/read" + output: "InstanceNorm/instancenorm/mul" + mul_options { + activation: NONE + } +} +operation { + type: "Mul" + input: "Input" + input: "InstanceNorm/instancenorm/mul" + output: "InstanceNorm/instancenorm/mul_1" + mul_options { + activation: NONE + } +} +operation { + type: "Mul" + input: "InstanceNorm/moments/mean" + input: "InstanceNorm/instancenorm/mul" + output: "InstanceNorm/instancenorm/mul_2" + mul_options { + activation: NONE + } +} +operation { + type: "Sub" + input: "InstanceNorm/beta/read" + input: "InstanceNorm/instancenorm/mul_2" + output: "InstanceNorm/instancenorm/sub" + sub_options { + activation: NONE + } +} +operation { + type: "Add" + input: "InstanceNorm/instancenorm/mul_1" + input: "InstanceNorm/instancenorm/sub" + output: "InstanceNorm_out" + add_options { + activation: NONE + } +} +operand { + name: "LeakyRelu" + type: FLOAT32 + shape { dim: 1 dim: 8 dim: 6 dim: 12 } +} +operation { + type: "LeakyRelu" + leaky_relu_options { + alpha: 2.0 + } + input: "InstanceNorm_out" + output: "LeakyRelu" +} +operand { + name: "Conv2D/ker" + type: FLOAT32 + shape { dim: 3 dim: 1 dim: 1 dim: 12 } + filler { + tag: "gaussian" + arg: "0.0" + arg: "1.0" + } +} +operand { + name: "Conv2D/bias" + type: FLOAT32 + shape { dim: 3 } + filler { + tag: "gaussian" + arg: "0.0" + arg: "1.0" + } +} +operand { + name: "Conv2D/ofm" + type: FLOAT32 + shape { dim: 1 dim: 8 dim: 6 dim: 3 } +} +operation { + type: "Conv2D" + conv2d_options { + padding: SAME + stride_w: 1 + stride_h: 1 + activation: NONE + } + input: "LeakyRelu" + input: "Conv2D/ker" + input: "Conv2D/bias" + output: "Conv2D/ofm" +} +input: "Input" +output: "Conv2D/ofm" diff --git a/runtime/contrib/android/api/Prebuilt.mk b/runtime/contrib/android/api/Prebuilt.mk index 9f0c0146a05..479dc34049c 100644 --- a/runtime/contrib/android/api/Prebuilt.mk +++ b/runtime/contrib/android/api/Prebuilt.mk @@ -18,7 +18,7 @@ include $(CLEAR_VARS) LOCAL_MODULE := onert_core PREBUILT_LIB += onert_core LOCAL_SRC_FILES := \ - $(ONERT_PREBUILT_LIB_DIR)/libonert_core.so + $(ONERT_PREBUILT_LIB_DIR)/nnfw/libonert_core.so include 
$(PREBUILT_SHARED_LIBRARY) # backend_cpu diff --git a/runtime/libs/circle-schema/include/circle_schema_generated.h b/runtime/libs/circle-schema/include/circle_schema_generated.h index 7a563042b81..b481875e9cd 100644 --- a/runtime/libs/circle-schema/include/circle_schema_generated.h +++ b/runtime/libs/circle-schema/include/circle_schema_generated.h @@ -701,10 +701,10 @@ struct ModelT; enum TensorType : int8_t { - TensorType_Q8_1 = -5, - TensorType_Q8_0 = -4, - TensorType_Q4_1 = -3, - TensorType_Q4_0 = -2, + TensorType_GGML_Q8_1 = -5, + TensorType_GGML_Q8_0 = -4, + TensorType_GGML_Q4_1 = -3, + TensorType_GGML_Q4_0 = -2, TensorType_UINT4 = -1, TensorType_FLOAT32 = 0, TensorType_FLOAT16 = 1, @@ -724,18 +724,18 @@ enum TensorType : int8_t TensorType_UINT32 = 15, TensorType_UINT16 = 16, TensorType_INT4 = 17, - TensorType_MIN = TensorType_Q8_1, + TensorType_MIN = TensorType_GGML_Q8_1, TensorType_MAX = TensorType_INT4 }; inline const TensorType (&EnumValuesTensorType())[23] { static const TensorType values[] = { - TensorType_Q8_1, TensorType_Q8_0, TensorType_Q4_1, TensorType_Q4_0, - TensorType_UINT4, TensorType_FLOAT32, TensorType_FLOAT16, TensorType_INT32, - TensorType_UINT8, TensorType_INT64, TensorType_STRING, TensorType_BOOL, - TensorType_INT16, TensorType_COMPLEX64, TensorType_INT8, TensorType_FLOAT64, - TensorType_COMPLEX128, TensorType_UINT64, TensorType_RESOURCE, TensorType_VARIANT, + TensorType_GGML_Q8_1, TensorType_GGML_Q8_0, TensorType_GGML_Q4_1, TensorType_GGML_Q4_0, + TensorType_UINT4, TensorType_FLOAT32, TensorType_FLOAT16, TensorType_INT32, + TensorType_UINT8, TensorType_INT64, TensorType_STRING, TensorType_BOOL, + TensorType_INT16, TensorType_COMPLEX64, TensorType_INT8, TensorType_FLOAT64, + TensorType_COMPLEX128, TensorType_UINT64, TensorType_RESOURCE, TensorType_VARIANT, TensorType_UINT32, TensorType_UINT16, TensorType_INT4}; return values; } @@ -743,17 +743,18 @@ inline const TensorType (&EnumValuesTensorType())[23] inline const char *const *EnumNamesTensorType() { static const char *const names[24] = { - "Q8_1", "Q8_0", "Q4_1", "Q4_0", "UINT4", "FLOAT32", "FLOAT16", "INT32", - "UINT8", "INT64", "STRING", "BOOL", "INT16", "COMPLEX64", "INT8", "FLOAT64", - "COMPLEX128", "UINT64", "RESOURCE", "VARIANT", "UINT32", "UINT16", "INT4", nullptr}; + "GGML_Q8_1", "GGML_Q8_0", "GGML_Q4_1", "GGML_Q4_0", "UINT4", "FLOAT32", + "FLOAT16", "INT32", "UINT8", "INT64", "STRING", "BOOL", + "INT16", "COMPLEX64", "INT8", "FLOAT64", "COMPLEX128", "UINT64", + "RESOURCE", "VARIANT", "UINT32", "UINT16", "INT4", nullptr}; return names; } inline const char *EnumNameTensorType(TensorType e) { - if (::flatbuffers::IsOutRange(e, TensorType_Q8_1, TensorType_INT4)) + if (::flatbuffers::IsOutRange(e, TensorType_GGML_Q8_1, TensorType_INT4)) return ""; - const size_t index = static_cast(e) - static_cast(TensorType_Q8_1); + const size_t index = static_cast(e) - static_cast(TensorType_GGML_Q8_1); return EnumNamesTensorType()[index]; } @@ -1066,6 +1067,34 @@ bool VerifySparseIndexVectorVector(::flatbuffers::Verifier &verifier, const ::flatbuffers::Vector<::flatbuffers::Offset> *values, const ::flatbuffers::Vector *types); +enum CompressionType : int8_t +{ + CompressionType_NONE = 0, + CompressionType_HUFFMAN = 1, + CompressionType_MIN = CompressionType_NONE, + CompressionType_MAX = CompressionType_HUFFMAN +}; + +inline const CompressionType (&EnumValuesCompressionType())[2] +{ + static const CompressionType values[] = {CompressionType_NONE, CompressionType_HUFFMAN}; + return values; +} + +inline const char *const 
*EnumNamesCompressionType() +{ + static const char *const names[3] = {"NONE", "HUFFMAN", nullptr}; + return names; +} + +inline const char *EnumNameCompressionType(CompressionType e) +{ + if (::flatbuffers::IsOutRange(e, CompressionType_NONE, CompressionType_HUFFMAN)) + return ""; + const size_t index = static_cast(e); + return EnumNamesCompressionType()[index]; +} + enum BuiltinOperator : int32_t { BuiltinOperator_GRU = -5, @@ -7061,6 +7090,7 @@ struct TensorT : public ::flatbuffers::NativeTable std::vector shape_signature{}; bool has_rank = false; std::vector> variant_tensors{}; + circle::CompressionType compression_type = circle::CompressionType_NONE; TensorT() = default; TensorT(const TensorT &o); TensorT(TensorT &&) FLATBUFFERS_NOEXCEPT = default; @@ -7082,7 +7112,8 @@ struct Tensor FLATBUFFERS_FINAL_CLASS : private ::flatbuffers::Table VT_SPARSITY = 16, VT_SHAPE_SIGNATURE = 18, VT_HAS_RANK = 20, - VT_VARIANT_TENSORS = 22 + VT_VARIANT_TENSORS = 22, + VT_COMPRESSION_TYPE = 24 }; const ::flatbuffers::Vector *shape() const { @@ -7117,6 +7148,10 @@ struct Tensor FLATBUFFERS_FINAL_CLASS : private ::flatbuffers::Table return GetPointer> *>( VT_VARIANT_TENSORS); } + circle::CompressionType compression_type() const + { + return static_cast(GetField(VT_COMPRESSION_TYPE, 0)); + } bool Verify(::flatbuffers::Verifier &verifier) const { return VerifyTableStart(verifier) && VerifyOffset(verifier, VT_SHAPE) && @@ -7129,7 +7164,8 @@ struct Tensor FLATBUFFERS_FINAL_CLASS : private ::flatbuffers::Table VerifyOffset(verifier, VT_SHAPE_SIGNATURE) && verifier.VerifyVector(shape_signature()) && VerifyField(verifier, VT_HAS_RANK, 1) && VerifyOffset(verifier, VT_VARIANT_TENSORS) && verifier.VerifyVector(variant_tensors()) && - verifier.VerifyVectorOfTables(variant_tensors()) && verifier.EndTable(); + verifier.VerifyVectorOfTables(variant_tensors()) && + VerifyField(verifier, VT_COMPRESSION_TYPE, 1) && verifier.EndTable(); } TensorT *UnPack(const ::flatbuffers::resolver_function_t *_resolver = nullptr) const; void UnPackTo(TensorT *_o, const ::flatbuffers::resolver_function_t *_resolver = nullptr) const; @@ -7182,6 +7218,10 @@ struct TensorBuilder { fbb_.AddOffset(Tensor::VT_VARIANT_TENSORS, variant_tensors); } + void add_compression_type(circle::CompressionType compression_type) + { + fbb_.AddElement(Tensor::VT_COMPRESSION_TYPE, static_cast(compression_type), 0); + } explicit TensorBuilder(::flatbuffers::FlatBufferBuilder &_fbb) : fbb_(_fbb) { start_ = fbb_.StartTable(); @@ -7203,7 +7243,8 @@ inline ::flatbuffers::Offset CreateTensor( ::flatbuffers::Offset sparsity = 0, ::flatbuffers::Offset<::flatbuffers::Vector> shape_signature = 0, bool has_rank = false, ::flatbuffers::Offset<::flatbuffers::Vector<::flatbuffers::Offset>> - variant_tensors = 0) + variant_tensors = 0, + circle::CompressionType compression_type = circle::CompressionType_NONE) { TensorBuilder builder_(_fbb); builder_.add_variant_tensors(variant_tensors); @@ -7213,6 +7254,7 @@ inline ::flatbuffers::Offset CreateTensor( builder_.add_name(name); builder_.add_buffer(buffer); builder_.add_shape(shape); + builder_.add_compression_type(compression_type); builder_.add_has_rank(has_rank); builder_.add_is_variable(is_variable); builder_.add_type(type); @@ -7226,7 +7268,8 @@ inline ::flatbuffers::Offset CreateTensorDirect( ::flatbuffers::Offset quantization = 0, bool is_variable = false, ::flatbuffers::Offset sparsity = 0, const std::vector *shape_signature = nullptr, bool has_rank = false, - const std::vector<::flatbuffers::Offset> *variant_tensors = 
nullptr) + const std::vector<::flatbuffers::Offset> *variant_tensors = nullptr, + circle::CompressionType compression_type = circle::CompressionType_NONE) { auto shape__ = shape ? _fbb.CreateVector(*shape) : 0; auto name__ = name ? _fbb.CreateString(name) : 0; @@ -7236,7 +7279,8 @@ inline ::flatbuffers::Offset CreateTensorDirect( ? _fbb.CreateVector<::flatbuffers::Offset>(*variant_tensors) : 0; return circle::CreateTensor(_fbb, shape__, type, buffer, name__, quantization, is_variable, - sparsity, shape_signature__, has_rank, variant_tensors__); + sparsity, shape_signature__, has_rank, variant_tensors__, + compression_type); } ::flatbuffers::Offset @@ -21280,7 +21324,7 @@ inline TensorT::TensorT(const TensorT &o) quantization((o.quantization) ? new circle::QuantizationParametersT(*o.quantization) : nullptr), is_variable(o.is_variable), sparsity((o.sparsity) ? new circle::SparsityParametersT(*o.sparsity) : nullptr), - shape_signature(o.shape_signature), has_rank(o.has_rank) + shape_signature(o.shape_signature), has_rank(o.has_rank), compression_type(o.compression_type) { variant_tensors.reserve(o.variant_tensors.size()); for (const auto &variant_tensors_ : o.variant_tensors) @@ -21302,6 +21346,7 @@ inline TensorT &TensorT::operator=(TensorT o) FLATBUFFERS_NOEXCEPT std::swap(shape_signature, o.shape_signature); std::swap(has_rank, o.has_rank); std::swap(variant_tensors, o.variant_tensors); + std::swap(compression_type, o.compression_type); return *this; } @@ -21426,6 +21471,10 @@ inline void Tensor::UnPackTo(TensorT *_o, const ::flatbuffers::resolver_function _o->variant_tensors.resize(0); } } + { + auto _e = compression_type(); + _o->compression_type = _e; + } } inline ::flatbuffers::Offset @@ -21468,8 +21517,10 @@ CreateTensor(::flatbuffers::FlatBufferBuilder &_fbb, const TensorT *_o, }, &_va) : 0; + auto _compression_type = _o->compression_type; return circle::CreateTensor(_fbb, _shape, _type, _buffer, _name, _quantization, _is_variable, - _sparsity, _shape_signature, _has_rank, _variant_tensors); + _sparsity, _shape_signature, _has_rank, _variant_tensors, + _compression_type); } inline StablehloGatherOptionsT * diff --git a/runtime/onert/api/nnapi/CMakeLists.txt b/runtime/onert/api/nnapi/CMakeLists.txt index 324e59afe71..eb48a7beac1 100644 --- a/runtime/onert/api/nnapi/CMakeLists.txt +++ b/runtime/onert/api/nnapi/CMakeLists.txt @@ -2,18 +2,13 @@ file(GLOB_RECURSE SOURCES_FRONTEND "*.cc") file(GLOB_RECURSE TESTS_FRONTEND "*.test.cc") list(REMOVE_ITEM SOURCES_FRONTEND ${TESTS_FRONTEND}) -set(LIB_ONERT onert) +set(LIB_NNAPI onert_nnapi) -add_library(${LIB_ONERT} SHARED ${SOURCES_FRONTEND}) -target_include_directories(${LIB_ONERT} PUBLIC include) -target_link_libraries(${LIB_ONERT} PUBLIC onert_core) # TODO Link PRIVATE onert_core -target_link_libraries(${LIB_ONERT} PRIVATE nnfw_common) -target_link_libraries(${LIB_ONERT} PRIVATE nnfw_coverage) - -set_target_properties(${LIB_ONERT} PROPERTIES OUTPUT_NAME neuralnetworks) -set_target_properties(${LIB_ONERT} PROPERTIES INSTALL_RPATH "$ORIGIN:$ORIGIN/nnfw") - -install(TARGETS ${LIB_ONERT} DESTINATION lib) +add_library(${LIB_NNAPI} STATIC ${SOURCES_FRONTEND}) +target_include_directories(${LIB_NNAPI} PUBLIC include) +target_link_libraries(${LIB_NNAPI} PUBLIC onert_core) # TODO Link PRIVATE onert_core +target_link_libraries(${LIB_NNAPI} PRIVATE nnfw_common) +target_link_libraries(${LIB_NNAPI} PRIVATE nnfw_coverage) if(NOT ENABLE_TEST) return() @@ -21,11 +16,10 @@ endif(NOT ENABLE_TEST) add_executable(test_onert_frontend_nnapi ${TESTS_FRONTEND}) 
diff --git a/runtime/onert/api/nnapi/CMakeLists.txt b/runtime/onert/api/nnapi/CMakeLists.txt
index 324e59afe71..eb48a7beac1 100644
--- a/runtime/onert/api/nnapi/CMakeLists.txt
+++ b/runtime/onert/api/nnapi/CMakeLists.txt
@@ -2,18 +2,13 @@ file(GLOB_RECURSE SOURCES_FRONTEND "*.cc")
 file(GLOB_RECURSE TESTS_FRONTEND "*.test.cc")
 list(REMOVE_ITEM SOURCES_FRONTEND ${TESTS_FRONTEND})
 
-set(LIB_ONERT onert)
+set(LIB_NNAPI onert_nnapi)
 
-add_library(${LIB_ONERT} SHARED ${SOURCES_FRONTEND})
-target_include_directories(${LIB_ONERT} PUBLIC include)
-target_link_libraries(${LIB_ONERT} PUBLIC onert_core) # TODO Link PRIVATE onert_core
-target_link_libraries(${LIB_ONERT} PRIVATE nnfw_common)
-target_link_libraries(${LIB_ONERT} PRIVATE nnfw_coverage)
-
-set_target_properties(${LIB_ONERT} PROPERTIES OUTPUT_NAME neuralnetworks)
-set_target_properties(${LIB_ONERT} PROPERTIES INSTALL_RPATH "$ORIGIN:$ORIGIN/nnfw")
-
-install(TARGETS ${LIB_ONERT} DESTINATION lib)
+add_library(${LIB_NNAPI} STATIC ${SOURCES_FRONTEND})
+target_include_directories(${LIB_NNAPI} PUBLIC include)
+target_link_libraries(${LIB_NNAPI} PUBLIC onert_core) # TODO Link PRIVATE onert_core
+target_link_libraries(${LIB_NNAPI} PRIVATE nnfw_common)
+target_link_libraries(${LIB_NNAPI} PRIVATE nnfw_coverage)
 
 if(NOT ENABLE_TEST)
   return()
@@ -21,11 +16,10 @@ endif(NOT ENABLE_TEST)
 
 add_executable(test_onert_frontend_nnapi ${TESTS_FRONTEND})
-target_link_libraries(test_onert_frontend_nnapi PRIVATE ${LIB_ONERT} dl)
-target_link_libraries(test_onert_frontend_nnapi PRIVATE gtest)
-target_link_libraries(test_onert_frontend_nnapi PRIVATE gtest_main)
-# INSTALL_RPATH is for onert_core public link
-# TODO Remove INSTALL_RPATH
-set_target_properties(test_onert_frontend_nnapi PROPERTIES INSTALL_RPATH "$ORIGIN/../lib:$ORIGIN/../lib/nnfw")
+target_link_libraries(test_onert_frontend_nnapi ${LIB_NNAPI} dl)
+target_link_libraries(test_onert_frontend_nnapi gtest)
+target_link_libraries(test_onert_frontend_nnapi gtest_main)
+# Set INSTALL_RPATH to find onert_core
+set_target_properties(test_onert_frontend_nnapi PROPERTIES INSTALL_RPATH "$ORIGIN/../lib/nnfw")
 
 install(TARGETS test_onert_frontend_nnapi DESTINATION unittest)
diff --git a/runtime/onert/api/nnapi/compilation.cc b/runtime/onert/api/nnapi/compilation.cc
index 2c56f061aaa..f7d64a6cbd6 100644
--- a/runtime/onert/api/nnapi/compilation.cc
+++ b/runtime/onert/api/nnapi/compilation.cc
@@ -104,3 +104,10 @@ int ANeuralNetworksCompilation_setPreference(ANeuralNetworksCompilation *compila
   // NYI: nothing to set
   return ANEURALNETWORKS_NO_ERROR;
 }
+
+int ANeuralNetworksCompilation_setCaching(ANeuralNetworksCompilation *, const char *,
+                                          const uint8_t *)
+{
+  VERBOSE(NNAPI::Compilation) << "setCaching: NYI" << std::endl;
+  return ANEURALNETWORKS_BAD_STATE;
+}
diff --git a/runtime/onert/api/nnapi/execution.cc b/runtime/onert/api/nnapi/execution.cc
index 4e1a985f305..3d5f7fc05a8 100644
--- a/runtime/onert/api/nnapi/execution.cc
+++ b/runtime/onert/api/nnapi/execution.cc
@@ -502,3 +502,21 @@ int ANeuralNetworksExecution_getOutputOperandDimensions(ANeuralNetworksExecution
 
   return ANEURALNETWORKS_NO_ERROR;
 }
+
+int ANeuralNetworksBurst_create(ANeuralNetworksCompilation *, ANeuralNetworksBurst **)
+{
+  VERBOSE(NNAPI::Execution) << "BurstCreate: NYI" << std::endl;
+  return ANEURALNETWORKS_BAD_STATE;
+}
+
+void ANeuralNetworksBurst_free(ANeuralNetworksBurst *)
+{
+  // TODO delete burst
+  // delete burst;
+}
+
+int ANeuralNetworksExecution_burstCompute(ANeuralNetworksExecution *, ANeuralNetworksBurst *)
+{
+  VERBOSE(NNAPI::Execution) << "burstCompute: NYI" << std::endl;
+  return ANEURALNETWORKS_BAD_STATE;
+}
diff --git a/runtime/onert/api/nnapi/model.cc b/runtime/onert/api/nnapi/model.cc
index 8c7bd178929..93f091a07dd 100644
--- a/runtime/onert/api/nnapi/model.cc
+++ b/runtime/onert/api/nnapi/model.cc
@@ -174,6 +174,13 @@ int ANeuralNetworksModel_setOperandValue(ANeuralNetworksModel *model, int32_t in
   return ANEURALNETWORKS_NO_ERROR;
 }
 
+int ANeuralNetworksModel_setOperandSymmPerChannelQuantParams(
+  ANeuralNetworksModel *, int32_t, const ANeuralNetworksSymmPerChannelQuantParams *)
+{
+  VERBOSE(NNAPI::Model) << "setOperandSymmPerChannelQuantParams: NYI" << std::endl;
+  return ANEURALNETWORKS_BAD_STATE;
+}
+
 int ANeuralNetworksModel_setOperandValueFromMemory(ANeuralNetworksModel *model, int32_t index,
                                                    const ANeuralNetworksMemory *memory,
                                                    size_t offset, size_t length)
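The NNAPI entry points added above (`setCaching`, the Burst APIs, and `setOperandSymmPerChannelQuantParams`) are deliberate stubs: they log an NYI message and return `ANEURALNETWORKS_BAD_STATE`, so callers linking the new static `onert_nnapi` library resolve these symbols at link time and fail gracefully at run time. A minimal caller-side sketch (hypothetical, not in this patch) of the observable behavior:

// --- Example (not part of the patch): observing the stubbed Burst API ---
#include <NeuralNetworks.h>
#include <cassert>

void expect_burst_unsupported(ANeuralNetworksCompilation *compilation)
{
  ANeuralNetworksBurst *burst = nullptr;
  // The stub logs "BurstCreate: NYI" and reports a failure code instead of
  // leaving the symbol undefined.
  int rc = ANeuralNetworksBurst_create(compilation, &burst);
  assert(rc == ANEURALNETWORKS_BAD_STATE);
  (void)rc; // silence unused-variable warning when asserts are disabled
  ANeuralNetworksBurst_free(burst); // currently a no-op in the stub
}
// --- End example ---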
diff --git a/runtime/onert/backend/train/KernelGenerator.cc b/runtime/onert/backend/train/KernelGenerator.cc
index aaaa50f1b72..7e0d42c25e5 100644
--- a/runtime/onert/backend/train/KernelGenerator.cc
+++ b/runtime/onert/backend/train/KernelGenerator.cc
@@ -405,13 +405,15 @@ void KernelGenerator::visit(const ir::train::operation::Loss &node)
 
   auto loss_code = node.param().loss_code;
   auto loss_param = node.param().loss_param;
+  const auto reduction_type = node.param().reduction_type;
 
   switch (loss_code)
   {
     case ir::train::LossCode::MeanSquaredError:
     {
       auto fn = std::make_unique<ops::LossMeanSquaredErrorLayer>();
-      fn->configure(y_pred_tensor, y_true_tensor, output_tensor, back_prop_y_pred_tensor);
+      fn->configure(y_pred_tensor, y_true_tensor, output_tensor, back_prop_y_pred_tensor,
+                    reduction_type);
       _return_fn = std::move(fn);
       break;
     }
@@ -419,7 +421,7 @@ void KernelGenerator::visit(const ir::train::operation::Loss &node)
     {
       auto fn = std::make_unique<ops::LossCategoricalCrossentropyLayer>();
       fn->configure(y_pred_tensor, y_true_tensor, output_tensor, back_prop_y_pred_tensor,
-                    loss_param.cce.axis, loss_param.cce.label_smoothing);
+                    reduction_type, loss_param.cce.axis, loss_param.cce.label_smoothing);
       _return_fn = std::move(fn);
       break;
     }
diff --git a/runtime/onert/backend/train/MemoryManager.cc b/runtime/onert/backend/train/MemoryManager.cc
index 4902e2a7eaa..fd156fea231 100644
--- a/runtime/onert/backend/train/MemoryManager.cc
+++ b/runtime/onert/backend/train/MemoryManager.cc
@@ -61,13 +61,13 @@ DisposableMemoryManager::DisposableMemoryManager() : _mem_planner{createMemoryPl
 basic::IMemoryPlanner<DisposableTensorIndex> *DisposableMemoryManager::createMemoryPlanner()
 {
   auto planner_id = util::getConfigString(util::config::CPU_MEMORY_PLANNER);
-  return MemoryPlannerFactory::get().create(planner_id);
+  return MemoryPlannerFactory<DisposableTensorIndex>::get().create(planner_id);
 }
 
 basic::IMemoryPlanner<DisposableTensorIndex> *
 DisposableMemoryManager::createMemoryPlanner(const std::string planner_id)
 {
-  return MemoryPlannerFactory::get().create(planner_id);
+  return MemoryPlannerFactory<DisposableTensorIndex>::get().create(planner_id);
 }
 
 void DisposableMemoryManager::claimPlan(const DisposableTensorIndex &ind, uint32_t size)
diff --git a/runtime/onert/backend/train/MemoryManager.h b/runtime/onert/backend/train/MemoryManager.h
index 19a60e32deb..98e840bf7f7 100644
--- a/runtime/onert/backend/train/MemoryManager.h
+++ b/runtime/onert/backend/train/MemoryManager.h
@@ -67,6 +67,8 @@ class DisposableMemoryManager
   std::shared_ptr<basic::Allocator> _mem_alloc;
 };
 
+// TODO: Add LayerScopeMemoryManager using MemoryPlannerFactory
+
 } // namespace train
 } // namespace backend
 } // namespace onert
diff --git a/runtime/onert/backend/train/MemoryPlanner.test.cc b/runtime/onert/backend/train/MemoryPlanner.test.cc
index 05d8ade2620..19b8b537b43 100644
--- a/runtime/onert/backend/train/MemoryPlanner.test.cc
+++ b/runtime/onert/backend/train/MemoryPlanner.test.cc
@@ -17,6 +17,7 @@
 #include <gtest/gtest.h>
 
 #include "DisposableTensorIndex.h"
+#include "LayerScopeTensorIndex.h"
 #include "MemoryPlanner.h"
 #include "ir/Index.h"
 
@@ -25,9 +26,74 @@ using onert::ir::OperandIndex;
 using onert::ir::OperationIndex;
 
 // TODO: Add test testcase for {Bump, FirstFit, WIC}Planner
+namespace
+{
+
+template <typename T> T to_index(uint32_t first, uint32_t second) = delete;
+
+template <> DisposableTensorIndex to_index(uint32_t first, uint32_t second)
+{
+  return DisposableTensorIndex{OperationIndex{first}, OperandIndex{second}};
+}
 
-TEST(BumpPlanner, claim_test)
+template <> LayerScopeTensorIndex to_index(uint32_t first, uint32_t second)
 {
+  return LayerScopeTensorIndex{OperationIndex{first}, second};
+}
+
+template
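The hunk above is cut off at `+template`. For orientation only, and not as the actual continuation of the truncated test code, the sketch below shows the kind of shared test body that `to_index<T>` enables: one templated function can drive planners for both index types. The templated `MemoryPlannerFactory<T>` follows the MemoryManager.cc hunk above, while the planner id "FirstFit" and the raw-pointer lifetime are assumptions based on the `{Bump, FirstFit, WIC}Planner` TODO and the existing factory usage.

// --- Example (not part of the patch): one test body for both index types ---
#include <string>

template <typename T> void smoke_test_planner(const std::string &planner_id)
{
  auto *planner = MemoryPlannerFactory<T>::get().create(planner_id);
  planner->claim(to_index<T>(0, 0), 10); // plan 10 bytes for (op 0, tensor 0)
  planner->release(to_index<T>(0, 0));   // and give them back
  delete planner;
}

// smoke_test_planner<DisposableTensorIndex>("FirstFit");
// smoke_test_planner<LayerScopeTensorIndex>("FirstFit");
// --- End example ---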