diff --git a/compiler/fm-equalize/fm-equalize b/compiler/fm-equalize/fm-equalize index 4e4e5395e18..36b4f99a003 100644 --- a/compiler/fm-equalize/fm-equalize +++ b/compiler/fm-equalize/fm-equalize @@ -62,6 +62,18 @@ def _get_parser(): help="Allow to create duplicate operations when a feature map matches " "with multiple equalization patterns. This can increase the size of " "the model. Default is false.") + parser.add_argument("--fme_detect", + type=str, + help="Path to fme-detect driver.", + required=False) + parser.add_argument("--dalgona", + type=str, + help="Path to dalgona driver.", + required=False) + parser.add_argument("--fme_apply", + type=str, + help="Path to fme-apply driver.", + required=False) parser.add_argument('--verbose', action='store_true', help='Print logs') return parser @@ -78,12 +90,9 @@ def _run_cmd(cmd: str, verbose: bool): raise -def _run_dalgona(model: str, data: Optional[str], analysis: str, save_dir: str, - verbose: bool): - dir_path = os.getenv('ONE_BIN_PATH') - assert dir_path != None - dalgona_path = os.path.join(dir_path, 'dalgona') - cmd = [dalgona_path] +def _run_dalgona(driver_path: str, model: str, data: Optional[str], analysis: str, + save_dir: str, verbose: bool): + cmd = [driver_path] cmd += ['--input_model', model] cmd += ['--analysis', analysis] if data != None: @@ -94,11 +103,9 @@ def _run_dalgona(model: str, data: Optional[str], analysis: str, save_dir: str, _run_cmd(cmd, verbose) -def _run_fme_detect(input_model: str, fme_patterns: str, verbose: bool, +def _run_fme_detect(driver_path: str, input_model: str, fme_patterns: str, verbose: bool, allow_dup_op: bool): - dir_path = Path(__file__).parent.resolve() - fme_detect_path = os.path.join(dir_path, 'fme-detect') - cmd = [fme_detect_path] + cmd = [driver_path] cmd += ['--input', input_model] cmd += ['--output', fme_patterns] if allow_dup_op: @@ -107,10 +114,9 @@ def _run_fme_detect(input_model: str, fme_patterns: str, verbose: bool, _run_cmd(cmd, verbose) -def _run_fme_apply(input_model: str, fme_patterns: str, output_model: str, verbose: bool): - dir_path = Path(__file__).parent.resolve() - fme_apply_path = os.path.join(dir_path, 'fme-apply') - cmd = [fme_apply_path] +def _run_fme_apply(driver_path: str, input_model: str, fme_patterns: str, + output_model: str, verbose: bool): + cmd = [driver_path] cmd += ['--input', input_model] cmd += ['--fme_patterns', fme_patterns] cmd += ['--output', output_model] @@ -128,6 +134,25 @@ def main(): data = args.data verbose = args.verbose allow_dup_op = args.allow_dup_op + fme_detect_path = args.fme_detect + fme_apply_path = args.fme_apply + dalgona_path = args.dalgona + + curr_dir = Path(__file__).parent.resolve() + dump_fme_param_py = curr_dir / 'fmelib' / 'DumpFMEParams.py' + if dump_fme_param_py.exists() == False: + raise FileNotFoundError('Error: DumpFMEParams.py not found') + + if not fme_detect_path: + dir_path = Path(__file__).parent.resolve() + fme_detect_path = os.path.join(dir_path, 'fme-detect') + if not dalgona_path: + dir_path = os.getenv('ONE_BIN_PATH') + assert dir_path != None + dalgona_path = os.path.join(dir_path, 'dalgona') + if not fme_apply_path: + dir_path = Path(__file__).parent.resolve() + fme_apply_path = os.path.join(dir_path, 'fme-apply') with tempfile.TemporaryDirectory() as tmp_dir: fme_patterns = os.path.join( @@ -135,7 +160,8 @@ def main(): Path(output_model).with_suffix('.fme_patterns.json').name) # Step 1. 
Run fme-detect to find equalization patterns - _run_fme_detect(str(input_model), + _run_fme_detect(fme_detect_path, + str(input_model), str(fme_patterns), verbose=verbose, allow_dup_op=allow_dup_op) @@ -144,8 +170,13 @@ def main(): if args.fme_patterns != None: os.system(f'cp {fme_patterns} {args.fme_patterns}') - # TODO Step 2. Run dalgona - # _run_dalgona + # Step 2. Run dalgona + _run_dalgona(dalgona_path, + str(input_model), + data, + str(dump_fme_param_py), + str(fme_patterns), + verbose=verbose) # Copy fme_patterns to the given path # Why copy twice? To observe the result of fme-detect too @@ -153,7 +184,8 @@ def main(): os.system(f'cp {fme_patterns} {args.fme_patterns}') # Step 3. Run fme-apply - _run_fme_apply(str(input_model), + _run_fme_apply(fme_apply_path, + str(input_model), str(fme_patterns), str(output_model), verbose=verbose) diff --git a/compiler/fme-apply/src/FMEqualizer.cpp b/compiler/fme-apply/src/FMEqualizer.cpp index 3d8ca2b0fb8..a8be34592a5 100644 --- a/compiler/fme-apply/src/FMEqualizer.cpp +++ b/compiler/fme-apply/src/FMEqualizer.cpp @@ -17,12 +17,8 @@ #include "FMEqualizer.h" #include "InsertScaleShift.h" #include "EqualizePatternCheck.h" -#include "pass/ForwardPreScalePass.h" -#include "pass/ForwardPreShiftPass.h" #include "pass/FusePostScalePass.h" -#include "pass/FusePostShiftPass.h" #include "pass/FusePreScalePass.h" -#include "pass/FusePreShiftPass.h" #include "ProgressReporter.h" #include @@ -82,15 +78,9 @@ void FMEqualizer::equalize(loco::Graph *g, const std::vector &p phase.emplace_back(std::make_unique()); phase.emplace_back(std::make_unique()); - // Forward PreScale/PreShift - phase.emplace_back(std::make_unique()); - phase.emplace_back(std::make_unique()); - // Fuse Pre/Post Scale/Shift phase.emplace_back(std::make_unique()); phase.emplace_back(std::make_unique()); - phase.emplace_back(std::make_unique()); - phase.emplace_back(std::make_unique()); ProgressReporter prog(g, logo::PhaseStrategy::Restart); logo::PhaseRunner phase_runner{g}; diff --git a/compiler/fme-apply/src/RandomString.h b/compiler/fme-apply/src/RandomString.h index 0f3533459eb..c147b7c3e2c 100644 --- a/compiler/fme-apply/src/RandomString.h +++ b/compiler/fme-apply/src/RandomString.h @@ -17,6 +17,7 @@ #ifndef __FME_APPLY_RANDOM_STRING_H__ #define __FME_APPLY_RANDOM_STRING_H__ +#include #include namespace fme_apply diff --git a/compiler/fme-apply/src/pass/ForwardPreScalePass.cpp b/compiler/fme-apply/src/pass/ForwardPreScalePass.cpp deleted file mode 100644 index c9324bc6b7a..00000000000 --- a/compiler/fme-apply/src/pass/ForwardPreScalePass.cpp +++ /dev/null @@ -1,95 +0,0 @@ -/* - * Copyright (c) 2024 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
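For reference, the driver wiring added above boils down to: prefer the path passed on the command line, otherwise fall back to the conventional location (the script directory for fme-detect/fme-apply, ONE_BIN_PATH for dalgona). A minimal Python sketch of that resolution logic; resolve_driver is a hypothetical helper, not part of the script:

    import os
    from pathlib import Path

    def resolve_driver(cli_value, default_dir, name):
        # Prefer the path given on the command line; otherwise fall back
        # to the conventional location for this driver.
        if cli_value:
            return cli_value
        assert default_dir is not None, f'no default location for {name}'
        return str(Path(default_dir) / name)

    # fme-detect and fme-apply live next to the script; dalgona is looked
    # up under ONE_BIN_PATH.
    curr_dir = Path(__file__).parent.resolve()
    fme_detect_path = resolve_driver(None, curr_dir, 'fme-detect')
    dalgona_path = resolve_driver(None, os.getenv('ONE_BIN_PATH'), 'dalgona')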
- */ - -#include "ForwardPreScalePass.h" -#include "Support.Cast.h" -#include "Support.Misc.h" - -#include -#include - -using namespace fme_apply; - -namespace -{ - -class ForwardPreScale final : public luci::CircleNodeMutableVisitor -{ -protected: - bool visit(luci::CircleNode *node) { return false; } - - bool visit(luci::CirclePad *node) - { - auto pre_scale = to_pre_scale(node->input()); - if (not pre_scale) - return false; - - if (loco::succs(pre_scale).size() != 1) - return false; - - node->input(pre_scale->inputs(0)); - loco::replace(node).with(pre_scale); - pre_scale->inputs(0, node); - - // Shape should be copied, because - // shape inference does not work well for Custom Op (PreScale) - copy_shape(node, pre_scale); - - return true; - } - - bool visit(luci::CircleSlice *node) - { - auto pre_scale = to_pre_scale(node->input()); - if (not pre_scale) - return false; - - if (loco::succs(pre_scale).size() != 1) - return false; - - node->input(pre_scale->inputs(0)); - loco::replace(node).with(pre_scale); - pre_scale->inputs(0, node); - - // Shape should be copied, because - // shape inference does not work well for Custom Op (PreScale) - copy_shape(node, pre_scale); - - return true; - } -}; - -} // namespace - -namespace fme_apply -{ - -bool ForwardPreScalePass::run(loco::Graph *g) -{ - bool changed = false; - - ForwardPreScale fps; - for (auto node : loco::active_nodes(loco::output_nodes(g))) - { - auto cnode = loco::must_cast(node); - if (cnode->accept(&fps)) - changed = true; - } - - return changed; -} - -} // namespace fme_apply diff --git a/compiler/fme-apply/src/pass/ForwardPreScalePass.h b/compiler/fme-apply/src/pass/ForwardPreScalePass.h deleted file mode 100644 index bb3819669ba..00000000000 --- a/compiler/fme-apply/src/pass/ForwardPreScalePass.h +++ /dev/null @@ -1,53 +0,0 @@ -/* - * Copyright (c) 2024 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef __FME_APPLY_FORWARD_PRE_SCALE_PASS_H__ -#define __FME_APPLY_FORWARD_PRE_SCALE_PASS_H__ - -#include - -#include - -namespace fme_apply -{ - -/** - * @brief Pass to forward CircleCustom(PreScale) to succeeding Op - * - * BEFORE - * - * CircleCustom(PreScale) - * | - * Forwardable Op (ex: Pad, Slice) - * - * AFTER - * - * Forwardable Op (ex: Pad, Slice) - * | - * CircleCustom(PreScale) - */ -class ForwardPreScalePass : public logo::Pass -{ -public: - virtual const char *name(void) const { return "fme_apply::ForwardPreScalePass"; } - -public: - bool run(loco::Graph *graph); -}; - -} // namespace fme_apply - -#endif //__FME_APPLY_FORWARD_PRE_SCALE_PASS_H__ diff --git a/compiler/fme-apply/src/pass/ForwardPreScalePass.test.cpp b/compiler/fme-apply/src/pass/ForwardPreScalePass.test.cpp deleted file mode 100644 index e6e064b1a28..00000000000 --- a/compiler/fme-apply/src/pass/ForwardPreScalePass.test.cpp +++ /dev/null @@ -1,179 +0,0 @@ -/* - * Copyright (c) 2024 Samsung Electronics Co., Ltd. 
All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "ForwardPreScalePass.h" -#include "Support.Cast.h" - -#include - -#include - -using namespace fme_apply; - -namespace -{ - -/** - * PreScale-Pad graphlet - * - * [PreScale] - * | - * [Pad] - * - */ -class PreScalePadGraphlet -{ -public: - void init(loco::Graph *g) - { - _prescale = g->nodes()->create(2 /* arity */, 1 /* out */); - _prescale->dtype(loco::DataType::FLOAT32); - _prescale->shape({1, 4, 4, 16}); - _prescale->custom_code("PreScale"); - _prescale->name("prescale"); - - _pad = g->nodes()->create(); - _pad->input(_prescale); - _pad->dtype(loco::DataType::FLOAT32); - _pad->shape({1, 5, 5, 16}); - _pad->name("pad"); - } - -public: - luci::CircleCustom *_prescale = nullptr; - luci::CirclePad *_pad = nullptr; -}; - -class PreScalePadGraph : public luci::test::TestIOGraph, public PreScalePadGraphlet -{ -public: - void init(void) - { - luci::test::TestIOGraph::init({1, 4, 4, 16}, {1, 5, 5, 16}); - PreScalePadGraphlet::init(g()); - - _prescale->inputs(0, input()); - - output()->from(_pad); - } - - std::unique_ptr graph(void) { return std::move(_g); } -}; - -/** - * PreScale-Slice graphlet - * - * [PreScale] - * | - * [Slice] - * - */ -class PreScaleSliceGraphlet -{ -public: - void init(loco::Graph *g) - { - _prescale = g->nodes()->create(2 /* arity */, 1 /* out */); - _prescale->dtype(loco::DataType::FLOAT32); - _prescale->shape({1, 4, 4, 16}); - _prescale->custom_code("PreScale"); - _prescale->name("prescale"); - - _slice = g->nodes()->create(); - _slice->input(_prescale); - _slice->dtype(loco::DataType::FLOAT32); - _slice->shape({1, 2, 2, 16}); - _slice->name("slice"); - } - -public: - luci::CircleCustom *_prescale = nullptr; - luci::CircleSlice *_slice = nullptr; -}; - -class PreScaleSliceGraph : public luci::test::TestIOGraph, public PreScaleSliceGraphlet -{ -public: - void init(void) - { - luci::test::TestIOGraph::init({1, 4, 4, 16}, {1, 2, 2, 16}); - PreScaleSliceGraphlet::init(g()); - - _prescale->inputs(0, input()); - - output()->from(_slice); - } - - std::unique_ptr graph(void) { return std::move(_g); } -}; - -} // namespace - -TEST(ForwardPreScalePassTest, prescale_pad) -{ - PreScalePadGraph g; - g.init(); - - ForwardPreScalePass fpsp; - EXPECT_TRUE(fpsp.run(g.g())); - - auto pre = to_pre_scale(g.output()->from()); - EXPECT_NE(nullptr, pre); - - auto pad = dynamic_cast(pre->inputs(0)); - EXPECT_NE(nullptr, pad); - - EXPECT_EQ(4, pre->rank()); - EXPECT_EQ(1, pre->dim(0).value()); - EXPECT_EQ(5, pre->dim(1).value()); - EXPECT_EQ(5, pre->dim(2).value()); - EXPECT_EQ(16, pre->dim(3).value()); -} - -TEST(ForwardPreScalePassTest, prescale_slice) -{ - PreScaleSliceGraph g; - g.init(); - - ForwardPreScalePass fpsp; - EXPECT_TRUE(fpsp.run(g.g())); - - auto pre = to_pre_scale(g.output()->from()); - EXPECT_NE(nullptr, pre); - - auto slice = dynamic_cast(pre->inputs(0)); - EXPECT_NE(nullptr, slice); - - EXPECT_EQ(4, pre->rank()); - EXPECT_EQ(1, pre->dim(0).value()); - EXPECT_EQ(2, 
pre->dim(1).value()); - EXPECT_EQ(2, pre->dim(2).value()); - EXPECT_EQ(16, pre->dim(3).value()); -} - -TEST(ForwardPreScalePassTest, prescale_conv_NEG) -{ - PreScalePadGraph g; - g.init(); - - // Replace Pad with Conv2D - auto conv = g.g()->nodes()->create(); - conv->input(g._prescale); - g.output()->from(conv); - - ForwardPreScalePass fpsp; - EXPECT_FALSE(fpsp.run(g.g())); -} diff --git a/compiler/fme-apply/src/pass/ForwardPreShiftPass.cpp b/compiler/fme-apply/src/pass/ForwardPreShiftPass.cpp deleted file mode 100644 index 20357de8383..00000000000 --- a/compiler/fme-apply/src/pass/ForwardPreShiftPass.cpp +++ /dev/null @@ -1,75 +0,0 @@ -/* - * Copyright (c) 2024 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "ForwardPreShiftPass.h" -#include "Support.Cast.h" -#include "Support.Misc.h" - -#include -#include - -using namespace fme_apply; - -namespace -{ - -class ForwardPreShift final : public luci::CircleNodeMutableVisitor -{ -protected: - bool visit(luci::CircleNode *node) { return false; } - - bool visit(luci::CircleSlice *node) - { - auto pre_shift = to_pre_shift(node->input()); - if (not pre_shift) - return false; - - if (loco::succs(pre_shift).size() != 1) - return false; - - node->input(pre_shift->inputs(0)); - loco::replace(node).with(pre_shift); - pre_shift->inputs(0, node); - - // Shape should be copied, because - // shape inference does not work well for Custom Op (PreShift) - copy_shape(node, pre_shift); - - return true; - } -}; - -} // namespace - -namespace fme_apply -{ - -bool ForwardPreShiftPass::run(loco::Graph *g) -{ - bool changed = false; - - ForwardPreShift fps; - for (auto node : loco::active_nodes(loco::output_nodes(g))) - { - auto cnode = loco::must_cast(node); - if (cnode->accept(&fps)) - changed = true; - } - - return changed; -} - -} // namespace fme_apply diff --git a/compiler/fme-apply/src/pass/ForwardPreShiftPass.h b/compiler/fme-apply/src/pass/ForwardPreShiftPass.h deleted file mode 100644 index dbc70933842..00000000000 --- a/compiler/fme-apply/src/pass/ForwardPreShiftPass.h +++ /dev/null @@ -1,53 +0,0 @@ -/* - * Copyright (c) 2024 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
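The three-statement rewiring used by the deleted Forward* passes above is easy to misread, so here is a toy single-consumer version of the same swap in Python; Node and all names are hypothetical stand-ins for the loco graph API:

    class Node:
        def __init__(self, name, inp=None):
            self.name, self.inp = name, inp

    src = Node('src')
    pre = Node('prescale', src)
    pad = Node('pad', pre)
    output = pad                   # the graph output reads from `pad`

    pad.inp = pre.inp              # node->input(pre_scale->inputs(0))
    output = pre                   # loco::replace(node).with(pre_scale)
    pre.inp = pad                  # pre_scale->inputs(0, node)

    # The chain is now src -> pad -> prescale: PreScale moved forward.
    assert output.name == 'prescale' and output.inp.name == 'pad'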
- */ - -#ifndef __FME_APPLY_FORWARD_PRE_SHIFT_PASS_H__ -#define __FME_APPLY_FORWARD_PRE_SHIFT_PASS_H__ - -#include - -#include - -namespace fme_apply -{ - -/** - * @brief Pass to forward CircleCustom(PreShift) to succeeding Op - * - * BEFORE - * - * CircleCustom(PreShift) - * | - * Forwardable Op (ex: Slice) - * - * AFTER - * - * Forwardable Op (ex: Slice) - * | - * CircleCustom(PreShift) - */ -class ForwardPreShiftPass : public logo::Pass -{ -public: - virtual const char *name(void) const { return "fme_apply::ForwardPreShiftPass"; } - -public: - bool run(loco::Graph *graph); -}; - -} // namespace fme_apply - -#endif //__FME_APPLY_FORWARD_PRE_SHIFT_PASS_H__ diff --git a/compiler/fme-apply/src/pass/ForwardPreShiftPass.test.cpp b/compiler/fme-apply/src/pass/ForwardPreShiftPass.test.cpp deleted file mode 100644 index 2fdcf90ad9a..00000000000 --- a/compiler/fme-apply/src/pass/ForwardPreShiftPass.test.cpp +++ /dev/null @@ -1,111 +0,0 @@ -/* - * Copyright (c) 2024 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "ForwardPreShiftPass.h" -#include "Support.Cast.h" - -#include - -#include - -using namespace fme_apply; - -namespace -{ - -/** - * PreShift-Slice graphlet - * - * [PreShift] - * | - * [Slice] - * - */ -class PreShiftSliceGraphlet -{ -public: - void init(loco::Graph *g) - { - _preshift = g->nodes()->create(2 /* arity */, 1 /* out */); - _preshift->dtype(loco::DataType::FLOAT32); - _preshift->shape({1, 4, 4, 16}); - _preshift->custom_code("PreShift"); - _preshift->name("preshift"); - - _slice = g->nodes()->create(); - _slice->input(_preshift); - _slice->dtype(loco::DataType::FLOAT32); - _slice->shape({1, 2, 2, 16}); - _slice->name("slice"); - } - -public: - luci::CircleCustom *_preshift = nullptr; - luci::CircleSlice *_slice = nullptr; -}; - -class PreShiftSliceGraph : public luci::test::TestIOGraph, public PreShiftSliceGraphlet -{ -public: - void init(void) - { - luci::test::TestIOGraph::init({1, 4, 4, 16}, {1, 2, 2, 16}); - PreShiftSliceGraphlet::init(g()); - - _preshift->inputs(0, input()); - - output()->from(_slice); - } - - std::unique_ptr graph(void) { return std::move(_g); } -}; - -} // namespace - -TEST(ForwardPreShiftPassTest, preshift_slice) -{ - PreShiftSliceGraph g; - g.init(); - - ForwardPreShiftPass fpsp; - EXPECT_TRUE(fpsp.run(g.g())); - - auto pre = to_pre_shift(g.output()->from()); - EXPECT_NE(nullptr, pre); - - auto slice = dynamic_cast(pre->inputs(0)); - EXPECT_NE(nullptr, slice); - - EXPECT_EQ(4, pre->rank()); - EXPECT_EQ(1, pre->dim(0).value()); - EXPECT_EQ(2, pre->dim(1).value()); - EXPECT_EQ(2, pre->dim(2).value()); - EXPECT_EQ(16, pre->dim(3).value()); -} - -TEST(ForwardPreShiftPassTest, preshift_conv_NEG) -{ - PreShiftSliceGraph g; - g.init(); - - // Replace Pad with Conv2D - auto conv = g.g()->nodes()->create(); - conv->input(g._preshift); - g.output()->from(conv); - - ForwardPreShiftPass fpsp; - EXPECT_FALSE(fpsp.run(g.g())); -} diff --git 
a/compiler/fme-apply/src/pass/FusePostShiftPass.cpp b/compiler/fme-apply/src/pass/FusePostShiftPass.cpp deleted file mode 100644 index 744fede9ae2..00000000000 --- a/compiler/fme-apply/src/pass/FusePostShiftPass.cpp +++ /dev/null @@ -1,239 +0,0 @@ -/* - * Copyright (c) 2024 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "FusePostShiftPass.h" -#include "Support.Cast.h" -#include "RandomString.h" - -#include -#include -#include -#include -#include - -using namespace fme_apply; - -namespace -{ - -// Fuse Op + CircleCustom(PostShift) -struct FusePostShift final : public luci::CircleNodeMutableVisitor -{ - bool visit(luci::CircleNode *) { return false; } - - bool visit(luci::CircleConv2D *node) - { - if (node->fusedActivationFunction() != luci::FusedActFunc::NONE) - return false; - - bool changed = false; - for (auto succ : loco::succs(node)) - { - auto post_shift = to_post_shift(succ); - if (not post_shift) - continue; - - auto param = - loco::must_cast(post_shift->inputs(1)); // FIX_PostShift_UNLESS - auto bias = dynamic_cast(node->bias()); - if (not bias) - continue; - - uint32_t channel_size = bias->size(); - if (channel_size != param->size()) - { - assert(false); // FIX_PostScale_Unless - return false; - } - - auto cloned_conv = luci::clone_node(node, node->graph()); - assert(cloned_conv != nullptr); // FIX_CALLER_UNLESS - auto fused_conv = loco::must_cast(cloned_conv); - auto fused_bias = luci::clone(bias); - - fused_conv->name(node->name() + "_" + random_str()); - fused_bias->name(bias->name() + "_" + random_str()); - - add_origin(fused_conv, luci::get_origin(node)); - add_origin(fused_bias, luci::get_origin(bias)); - - // Add param to bias - for (uint32_t c = 0; c < channel_size; ++c) - { - float shift = param->at(c); - fused_bias->at(c) = - fused_bias->at(c) + shift; - } - - fused_conv->input(node->input()); - fused_conv->filter(node->filter()); - fused_conv->bias(fused_bias); - - loco::replace(post_shift).with(fused_conv); - changed = true; - } - - return changed; - } - - bool visit(luci::CircleDepthwiseConv2D *node) - { - if (node->fusedActivationFunction() != luci::FusedActFunc::NONE) - return false; - - bool changed = false; - for (auto succ : loco::succs(node)) - { - auto post_shift = to_post_shift(succ); - if (not post_shift) - continue; - - auto param = - loco::must_cast(post_shift->inputs(1)); // FIX_PostShift_UNLESS - auto bias = dynamic_cast(node->bias()); - if (not bias) - continue; - - uint32_t channel_size = bias->size(); - if (channel_size != param->size()) - { - assert(false); // FIX_PostScale_Unless - return false; - } - - auto cloned_dconv = luci::clone_node(node, node->graph()); - assert(cloned_dconv != nullptr); // FIX_CALLER_UNLESS - auto fused_dconv = loco::must_cast(cloned_dconv); - auto fused_bias = luci::clone(bias); - - fused_dconv->name(node->name() + "_" + random_str()); - fused_bias->name(bias->name() + "_" + random_str()); - - add_origin(fused_dconv, luci::get_origin(node)); - 
add_origin(fused_bias, luci::get_origin(bias)); - - // Add param to bias - for (uint32_t c = 0; c < channel_size; ++c) - { - float shift = param->at(c); - fused_bias->at(c) = - fused_bias->at(c) + shift; - } - - fused_dconv->input(node->input()); - fused_dconv->filter(node->filter()); - fused_dconv->bias(fused_bias); - - loco::replace(post_shift).with(fused_dconv); - changed = true; - } - - return changed; - } - - bool visit(luci::CircleTransposeConv *node) - { - bool changed = false; - for (auto succ : loco::succs(node)) - { - auto post_shift = to_post_shift(succ); - if (not post_shift) - continue; - - auto param = - loco::must_cast(post_shift->inputs(1)); // FIX_PostShift_UNLESS - - // TConv has bias. Update bias. - if (auto bias = dynamic_cast(node->bias())) - { - uint32_t channel_size = bias->size(); - if (channel_size != param->size()) - { - assert(false); // FIX_PostScale_Unless - return false; - } - - auto cloned_tconv = luci::clone_node(node, node->graph()); - assert(cloned_tconv != nullptr); // FIX_CALLER_UNLESS - auto fused_tconv = loco::must_cast(cloned_tconv); - auto fused_bias = luci::clone(bias); - - fused_tconv->name(node->name() + "_" + random_str()); - fused_bias->name(bias->name() + "_" + random_str()); - - add_origin(fused_tconv, luci::get_origin(node)); - add_origin(fused_bias, luci::get_origin(bias)); - - // Add param to bias - for (uint32_t c = 0; c < channel_size; ++c) - { - float shift = param->at(c); - fused_bias->at(c) = - fused_bias->at(c) + shift; - } - - fused_tconv->inputSizes(node->inputSizes()); - fused_tconv->outBackprop(node->outBackprop()); - fused_tconv->filter(node->filter()); - fused_tconv->bias(fused_bias); - - loco::replace(post_shift).with(fused_tconv); - changed = true; - continue; - } - - // TConv has no bias. Just use param - if (auto bias = dynamic_cast(node->bias())) - { - auto cloned_tconv = luci::clone_node(node, node->graph()); - assert(cloned_tconv != nullptr); // FIX_CALLER_UNLESS - auto fused_tconv = loco::must_cast(cloned_tconv); - - fused_tconv->inputSizes(node->inputSizes()); - fused_tconv->outBackprop(node->outBackprop()); - fused_tconv->filter(node->filter()); - fused_tconv->bias(param); - - loco::replace(post_shift).with(fused_tconv); - changed = true; - continue; - } - } - - return changed; - } -}; - -} // namespace - -namespace fme_apply -{ - -bool FusePostShiftPass::run(loco::Graph *g) -{ - bool changed = false; - for (auto node : loco::postorder_traversal(loco::output_nodes(g))) - { - FusePostShift fps; - auto cnode = loco::must_cast(node); - if (cnode->accept(&fps)) - changed = true; - } - - return changed; -} - -} // namespace fme_apply diff --git a/compiler/fme-apply/src/pass/FusePostShiftPass.h b/compiler/fme-apply/src/pass/FusePostShiftPass.h deleted file mode 100644 index 37c0b74a246..00000000000 --- a/compiler/fme-apply/src/pass/FusePostShiftPass.h +++ /dev/null @@ -1,59 +0,0 @@ -/* - * Copyright (c) 2024 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
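For the deleted fusion logic above, the arithmetic is an element-wise fold of the per-channel shift into the bias; the tests below rely on bias {2, 2, 2} plus shift {2, 2, 2} giving {4, 4, 4}. A short sketch of that fold:

    bias = [2.0, 2.0, 2.0]    # conv bias from the test graphs below
    shift = [2.0, 2.0, 2.0]   # PostShift parameter
    fused_bias = [b + s for b, s in zip(bias, shift)]
    assert fused_bias == [4.0, 4.0, 4.0]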
- */ - -#ifndef __FME_APPLY_FUSE_POST_SHIFT_PASS_H__ -#define __FME_APPLY_FUSE_POST_SHIFT_PASS_H__ - -#include - -#include - -namespace fme_apply -{ - -/** - * @brief Pass to fuse CircleCustom(PostShift) to preceding Ops - * - * BEFORE - * - * [Node1] - * | - * [Op] - * / \ - * [PostShift] [Node2] - * - * AFTER - * - * [Node1] - * / \ - * [Op'] [Op] - * | - * [Node2] - * - * NOTE Op' is clone of Op with updated bias. - */ -class FusePostShiftPass : public logo::Pass -{ -public: - virtual const char *name(void) const { return "fme::FusePostShiftPass"; } - -public: - bool run(loco::Graph *graph); -}; - -} // namespace fme_apply - -#endif //__FME_APPLY_FUSE_POST_SHIFT_PASS_H__ diff --git a/compiler/fme-apply/src/pass/FusePostShiftPass.test.cpp b/compiler/fme-apply/src/pass/FusePostShiftPass.test.cpp deleted file mode 100644 index f379623bb49..00000000000 --- a/compiler/fme-apply/src/pass/FusePostShiftPass.test.cpp +++ /dev/null @@ -1,346 +0,0 @@ -/* - * Copyright (c) 2024 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "FusePostShiftPass.h" -#include "Support.Cast.h" - -#include - -#include - -using namespace fme_apply; - -namespace -{ - -luci::CircleConst *create_const_node(loco::Graph *g, const loco::DataType dtype, - const std::vector &shape, - const std::vector &values) -{ - auto node = g->nodes()->create(); - node->dtype(dtype); - node->rank(shape.size()); - - uint32_t size = 1; - for (uint32_t i = 0; i < shape.size(); ++i) - { - node->dim(i) = shape[i]; - size *= shape[i]; - } - node->shape_status(luci::ShapeStatus::VALID); - - assert(values.size() == size); // FIX_CALLER_UNLESS - - node->size(size); - for (uint32_t i = 0; i < values.size(); ++i) - node->at(i) = values[i]; - - return node; -} - -/** - * PostShift-Conv graphlet - * - * [Conv] - * | - * [PostShift] - * - */ -class PostShiftConvGraphlet -{ -public: - void init(loco::Graph *g) - { - std::vector filter_val(3 * 3 * 3 * 3 /* size */, 1.0 /*value */); - - _conv = g->nodes()->create(); - _conv->filter( - create_const_node(g, loco::DataType::FLOAT32, {3, 3, 3, 3} /* shape */, filter_val)); - _conv->bias( - create_const_node(g, loco::DataType::FLOAT32, {3} /* shape */, {2, 2, 2} /* value*/)); - _conv->fusedActivationFunction(luci::FusedActFunc::NONE); - _conv->dtype(loco::DataType::FLOAT32); - _conv->shape({1, 4, 4, 3}); - _conv->padding(luci::Padding::SAME); - _conv->name("conv"); - - _postshift = g->nodes()->create(2 /* arity */, 1 /* out */); - _postshift->dtype(loco::DataType::FLOAT32); - _postshift->inputs(0, _conv); - _postshift->inputs( - 1, create_const_node(g, loco::DataType::FLOAT32, {3} /* shape */, {2, 2, 2} /* value */)); - _postshift->shape({1, 4, 4, 3}); - _postshift->custom_code("PostShift"); - _postshift->name("postshift"); - } - -public: - luci::CircleCustom *_postshift = nullptr; - luci::CircleConv2D *_conv = nullptr; -}; - -class PostShiftConvGraph : public luci::test::TestIOGraph, public PostShiftConvGraphlet -{ -public: - void 
init(void) - { - luci::test::TestIOGraph::init({1, 4, 4, 3}, {1, 4, 4, 3}); - PostShiftConvGraphlet::init(g()); - - _conv->input(input()); - - output()->from(_postshift); - } - - std::unique_ptr graph(void) { return std::move(_g); } -}; - -/** - * PostShift-DConv graphlet - * - * [DConv] - * | - * [PostShift] - * - */ -class PostShiftDConvGraphlet -{ -public: - void init(loco::Graph *g) - { - std::vector filter_val(1 * 3 * 3 * 3 /* size */, 1.0 /*value */); - - _dconv = g->nodes()->create(); - _dconv->filter( - create_const_node(g, loco::DataType::FLOAT32, {1, 3, 3, 3} /* shape */, filter_val)); - _dconv->bias( - create_const_node(g, loco::DataType::FLOAT32, {3} /* shape */, {2, 2, 2} /* value*/)); - _dconv->fusedActivationFunction(luci::FusedActFunc::NONE); - _dconv->dtype(loco::DataType::FLOAT32); - _dconv->shape({1, 4, 4, 3}); - _dconv->padding(luci::Padding::SAME); - _dconv->name("dconv"); - - _postshift = g->nodes()->create(2 /* arity */, 1 /* out */); - _postshift->dtype(loco::DataType::FLOAT32); - _postshift->inputs(0, _dconv); - _postshift->inputs( - 1, create_const_node(g, loco::DataType::FLOAT32, {3} /* shape */, {2, 2, 2} /* value */)); - _postshift->shape({1, 4, 4, 3}); - _postshift->custom_code("PostShift"); - _postshift->name("postshift"); - } - -public: - luci::CircleCustom *_postshift = nullptr; - luci::CircleDepthwiseConv2D *_dconv = nullptr; -}; - -class PostShiftDConvGraph : public luci::test::TestIOGraph, public PostShiftDConvGraphlet -{ -public: - void init(void) - { - luci::test::TestIOGraph::init({1, 4, 4, 3}, {1, 4, 4, 3}); - PostShiftDConvGraphlet::init(g()); - - _dconv->input(input()); - - output()->from(_postshift); - } - - std::unique_ptr graph(void) { return std::move(_g); } -}; - -/** - * PostShift-TConv graphlet - * - * [TConv] - * | - * [PostShift] - * - */ -class PostShiftTConvGraphlet -{ -public: - void init(loco::Graph *g) - { - std::vector filter_val(1 * 3 * 3 * 3 /* size */, 1.0 /*value */); - - _tconv = g->nodes()->create(); - _tconv->filter( - create_const_node(g, loco::DataType::FLOAT32, {1, 3, 3, 3} /* shape */, filter_val)); - _tconv->bias( - create_const_node(g, loco::DataType::FLOAT32, {3} /* shape */, {2, 2, 2} /* value*/)); - _tconv->dtype(loco::DataType::FLOAT32); - _tconv->shape({1, 4, 4, 3}); - _tconv->padding(luci::Padding::SAME); - _tconv->name("dconv"); - - _postshift = g->nodes()->create(2 /* arity */, 1 /* out */); - _postshift->dtype(loco::DataType::FLOAT32); - _postshift->inputs(0, _tconv); - _postshift->inputs( - 1, create_const_node(g, loco::DataType::FLOAT32, {3} /* shape */, {2, 2, 2} /* value */)); - _postshift->shape({1, 4, 4, 3}); - _postshift->custom_code("PostShift"); - _postshift->name("postshift"); - } - -public: - luci::CircleCustom *_postshift = nullptr; - luci::CircleTransposeConv *_tconv = nullptr; -}; - -class PostShiftTConvGraph : public luci::test::TestIOGraph, public PostShiftTConvGraphlet -{ -public: - void init(void) - { - luci::test::TestIOGraph::init({1, 4, 4, 3}, {1, 4, 4, 3}); - PostShiftTConvGraphlet::init(g()); - - _tconv->outBackprop(input()); - - output()->from(_postshift); - } - - std::unique_ptr graph(void) { return std::move(_g); } -}; - -} // namespace - -TEST(FusePostShiftPassTest, postshift_conv) -{ - PostShiftConvGraph g; - g.init(); - - FusePostShiftPass fpsp; - EXPECT_TRUE(fpsp.run(g.g())); - - auto conv = dynamic_cast(g.output()->from()); - EXPECT_NE(nullptr, conv); - - // Check bias - auto b = dynamic_cast(conv->bias()); - EXPECT_NE(nullptr, b); - EXPECT_EQ(loco::DataType::FLOAT32, b->dtype()); 
- EXPECT_EQ(3, b->size()); - for (uint32_t i = 0; i < 3; i++) - { - EXPECT_FLOAT_EQ(4.0, b->at(i)); - } -} - -TEST(FusePostShiftPassTest, postshift_conv_NEG) -{ - PostShiftConvGraph g; - g.init(); - g._conv->fusedActivationFunction(luci::FusedActFunc::RELU6); - - FusePostShiftPass fpsp; - EXPECT_FALSE(fpsp.run(g.g())); -} - -TEST(FusePostShiftPassTest, postshift_dconv) -{ - PostShiftDConvGraph g; - g.init(); - - FusePostShiftPass fpsp; - EXPECT_TRUE(fpsp.run(g.g())); - - auto dconv = dynamic_cast(g.output()->from()); - EXPECT_NE(nullptr, dconv); - - // Check bias - auto b = dynamic_cast(dconv->bias()); - EXPECT_NE(nullptr, b); - EXPECT_EQ(loco::DataType::FLOAT32, b->dtype()); - EXPECT_EQ(3, b->size()); - for (uint32_t i = 0; i < 3; i++) - { - EXPECT_FLOAT_EQ(4.0, b->at(i)); - } -} - -TEST(FusePostShiftPassTest, postshift_dconv_NEG) -{ - PostShiftDConvGraph g; - g.init(); - g._dconv->fusedActivationFunction(luci::FusedActFunc::RELU6); - - FusePostShiftPass fpsp; - EXPECT_FALSE(fpsp.run(g.g())); -} - -TEST(FusePostShiftPassTest, postshift_tconv) -{ - PostShiftTConvGraph g; - g.init(); - - FusePostShiftPass fpsp; - EXPECT_TRUE(fpsp.run(g.g())); - - auto tconv = dynamic_cast(g.output()->from()); - EXPECT_NE(nullptr, tconv); - - // Check bias - auto b = dynamic_cast(tconv->bias()); - EXPECT_NE(nullptr, b); - EXPECT_EQ(loco::DataType::FLOAT32, b->dtype()); - EXPECT_EQ(3, b->size()); - for (uint32_t i = 0; i < 3; i++) - { - EXPECT_FLOAT_EQ(4.0, b->at(i)); - } -} - -TEST(FusePostShiftPassTest, postshift_tconv_nobias) -{ - PostShiftTConvGraph g; - g.init(); - - auto no_bias = g.g()->nodes()->create(); - g._tconv->bias(no_bias); - - FusePostShiftPass fpsp; - EXPECT_TRUE(fpsp.run(g.g())); - - auto tconv = dynamic_cast(g.output()->from()); - EXPECT_NE(nullptr, tconv); - - // Check bias - auto b = dynamic_cast(tconv->bias()); - EXPECT_NE(nullptr, b); - EXPECT_EQ(loco::DataType::FLOAT32, b->dtype()); - EXPECT_EQ(3, b->size()); - for (uint32_t i = 0; i < 3; i++) - { - EXPECT_FLOAT_EQ(2.0, b->at(i)); - } -} - -TEST(FusePostShiftPassTest, postshift_tconv_NEG) -{ - PostShiftTConvGraph g; - g.init(); - g._postshift->inputs(0, g.input()); - g.output()->from(g._tconv); - - FusePostShiftPass fpsp; - EXPECT_FALSE(fpsp.run(g.g())); -} diff --git a/compiler/fme-apply/src/pass/FusePreShiftPass.cpp b/compiler/fme-apply/src/pass/FusePreShiftPass.cpp deleted file mode 100644 index 19cd9eb7e01..00000000000 --- a/compiler/fme-apply/src/pass/FusePreShiftPass.cpp +++ /dev/null @@ -1,76 +0,0 @@ -/* - * Copyright (c) 2024 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include "FusePreShiftPass.h" -#include "Support.Cast.h" - -#include -#include -#include -#include -#include - -using namespace fme_apply; - -namespace -{ - -// Fuse CircleCustom(PreShift) + Op -struct FusePreShift final : public luci::CircleNodeMutableVisitor -{ - bool visit(luci::CircleNode *) { return false; } - - bool visit(luci::CircleInstanceNorm *node) - { - auto pre_shift = to_pre_shift(node->input()); - if (not pre_shift) - return false; - - auto param = loco::must_cast(pre_shift->inputs(1)); // FIX_PreScale_UNLESS - auto channel = node->dim(node->rank() - 1).value(); - if (channel != param->size()) - { - assert(false); // FIX_PreShift_Unless - return false; - } - - // Output of InstanceNorm is not affected by PreShift - node->input(pre_shift->inputs(0)); - - return true; - } -}; - -} // namespace - -namespace fme_apply -{ - -bool FusePreShiftPass::run(loco::Graph *g) -{ - bool changed = false; - for (auto node : loco::postorder_traversal(loco::output_nodes(g))) - { - FusePreShift fps; - auto cnode = loco::must_cast(node); - if (cnode->accept(&fps)) - changed = true; - } - - return changed; -} - -} // namespace fme_apply diff --git a/compiler/fme-apply/src/pass/FusePreShiftPass.h b/compiler/fme-apply/src/pass/FusePreShiftPass.h deleted file mode 100644 index 31e3a4c0c98..00000000000 --- a/compiler/fme-apply/src/pass/FusePreShiftPass.h +++ /dev/null @@ -1,56 +0,0 @@ -/* - * Copyright (c) 2024 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef __FME_APPLY_FUSE_PRE_SHIFT_PASS_H__ -#define __FME_APPLY_FUSE_PRE_SHIFT_PASS_H__ - -#include - -#include - -namespace fme_apply -{ - -/** - * @brief Pass to fuse CircleCustom(PreShift) to succeeding Ops - * - * BEFORE - * - * [Node] - * | - * [PreShift] - * | - * [Op] - * - * AFTER - * - * [Node] - * | - * [Op'] - * - */ -class FusePreShiftPass : public logo::Pass -{ -public: - virtual const char *name(void) const { return "fme::FusePreShiftPass"; } - -public: - bool run(loco::Graph *graph); -}; - -} // namespace fme_apply - -#endif //__FME_APPLY_FUSE_PRE_SHIFT_PASS_H__ diff --git a/compiler/fme-apply/src/pass/FusePreShiftPass.test.cpp b/compiler/fme-apply/src/pass/FusePreShiftPass.test.cpp deleted file mode 100644 index f74f49c1cd8..00000000000 --- a/compiler/fme-apply/src/pass/FusePreShiftPass.test.cpp +++ /dev/null @@ -1,129 +0,0 @@ -/* - * Copyright (c) 2024 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
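The deleted FusePreShiftPass above rests on the identity that instance normalization is invariant to an additive constant: shifting the input shifts the mean by the same amount and leaves the variance unchanged. A small numeric sketch (simplified single-channel instnorm without scale/offset):

    def instnorm(xs, eps=1e-5):
        m = sum(xs) / len(xs)
        var = sum((x - m) ** 2 for x in xs) / len(xs)
        return [(x - m) / (var + eps) ** 0.5 for x in xs]

    xs = [1.0, 2.0, 3.0, 4.0]
    shifted = [x + 2.0 for x in xs]  # PreShift with a constant of 2
    assert all(abs(a - b) < 1e-9
               for a, b in zip(instnorm(xs), instnorm(shifted)))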
- */ - -#include "FusePreShiftPass.h" -#include "Support.Cast.h" - -#include - -#include - -using namespace fme_apply; - -namespace -{ - -luci::CircleConst *create_const_node(loco::Graph *g, const loco::DataType dtype, - const std::vector &shape, - const std::vector &values) -{ - auto node = g->nodes()->create(); - node->dtype(dtype); - node->rank(shape.size()); - - uint32_t size = 1; - for (uint32_t i = 0; i < shape.size(); ++i) - { - node->dim(i) = shape[i]; - size *= shape[i]; - } - node->shape_status(luci::ShapeStatus::VALID); - - assert(values.size() == size); // FIX_CALLER_UNLESS - - node->size(size); - for (uint32_t i = 0; i < values.size(); ++i) - node->at(i) = values[i]; - - return node; -} - -/** - * PreShift-Instnorm graphlet - * - * [PreShift] - * | - * [Instnorm] - * - */ -class PreShiftInstnormGraphlet -{ -public: - void init(loco::Graph *g) - { - _preshift = g->nodes()->create(2 /* arity */, 1 /* out */); - _preshift->dtype(loco::DataType::FLOAT32); - _preshift->inputs( - 1, create_const_node(g, loco::DataType::FLOAT32, {3} /* shape */, {2, 2, 2} /* value */)); - _preshift->shape({1, 4, 4, 3}); - _preshift->custom_code("PreShift"); - _preshift->name("prescale"); - - _instnorm = g->nodes()->create(); - _instnorm->input(_preshift); - _instnorm->fusedActivationFunction(luci::FusedActFunc::NONE); - _instnorm->dtype(loco::DataType::FLOAT32); - _instnorm->shape({1, 4, 4, 3}); - _instnorm->name("instnorm"); - } - -public: - luci::CircleCustom *_preshift = nullptr; - luci::CircleInstanceNorm *_instnorm = nullptr; -}; - -class PreShiftInstnormGraph : public luci::test::TestIOGraph, public PreShiftInstnormGraphlet -{ -public: - void init(void) - { - luci::test::TestIOGraph::init({1, 4, 4, 3}, {1, 4, 4, 3}); - PreShiftInstnormGraphlet::init(g()); - - _preshift->inputs(0, input()); - - output()->from(_instnorm); - } - - std::unique_ptr graph(void) { return std::move(_g); } -}; - -} // namespace - -TEST(FusePreShiftPassTest, preshift_instnorm) -{ - PreShiftInstnormGraph g; - g.init(); - - FusePreShiftPass fpsp; - EXPECT_TRUE(fpsp.run(g.g())); - - auto instnorm = dynamic_cast(g.output()->from()); - EXPECT_NE(nullptr, instnorm); - - auto pre_shift = to_pre_shift(instnorm->input()); - EXPECT_EQ(nullptr, pre_shift); // No pre_shift -} - -TEST(FusePreShiftPassTest, preshift_instnorm_NEG) -{ - PreShiftInstnormGraph g; - g.init(); - g._instnorm->input(g.input()); - - FusePreShiftPass fpsp; - EXPECT_FALSE(fpsp.run(g.g())); -} diff --git a/compiler/fme-detect/src/EqualizePatternFinder.cpp b/compiler/fme-detect/src/EqualizePatternFinder.cpp index 2d5c977ce8b..b856f03a34d 100644 --- a/compiler/fme-detect/src/EqualizePatternFinder.cpp +++ b/compiler/fme-detect/src/EqualizePatternFinder.cpp @@ -117,6 +117,27 @@ struct GetFusability final : public luci::CircleNodeMutableVisitor } return f; } + + Fusability visit(luci::CircleFullyConnected *node) + { + Fusability f; + { + f.pre_scale = true; + if (node->fusedActivationFunction() == luci::FusedActFunc::NONE) + { + f.post_scale = true; + f.post_shift = true; + } + // Negative scale is not fusable across ReLU, but fme-detect does not + // know the scale value. So, we assume that the scale is positive. 
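A quick numeric check of the positivity assumption stated above: relu(s * x) == s * relu(x) holds only for non-negative s, which is why only post_scale (and not post_shift) survives a fused RELU:

    def relu(x):
        return max(x, 0.0)

    for s in (2.0, -2.0):
        for x in (-3.0, 3.0):
            ok = relu(s * x) == s * relu(x)
            print(s, x, ok)  # True for s = 2.0, False for s = -2.0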
+ // NOTE If a pattern has negative scales, fm-equalize rejects the pattern + else if (node->fusedActivationFunction() == luci::FusedActFunc::RELU) + { + f.post_scale = true; + } + } + return f; + } }; Fusability fusability(luci::CircleNode *node) @@ -135,6 +156,7 @@ struct Forwardable }; // Return Forwardable of node +// Note that the degree of effect may vary from layer to layer. Forwardable forwardable(luci::CircleNode *node) { if (node == nullptr) @@ -145,10 +167,13 @@ Forwardable forwardable(luci::CircleNode *node) case luci::CircleOpcode::PAD: return {true, false}; case luci::CircleOpcode::MAX_POOL_2D: - // Assumption: all scale values are positive. - return {true, true}; - case luci::CircleOpcode::SLICE: return {true, true}; + case luci::CircleOpcode::RELU: + return {true, false}; + case luci::CircleOpcode::LEAKY_RELU: + return {true, false}; + case luci::CircleOpcode::GELU: + return {true, false}; default: return {false, false}; } @@ -168,13 +193,13 @@ void match(luci::CircleNode *front, std::vector &res) auto back = loco::must_cast(succ); auto back_fusability = fusability(back); - // If 'back' is not fusable with PreScale/Shift, we check if PreScale/Shift + // If 'back' is not fusable with PreScale, we check if PreScale // can forward across 'back' // TODO Generalize this code to support multiple forwardable Ops - if ((not back_fusability.pre_scale) and (not back_fusability.pre_shift)) + if (not back_fusability.pre_scale) { auto f = forwardable(back); - if (f.scale_forwardable or f.shift_forwardable) + if (f.scale_forwardable) { auto succ_succs = loco::succs(back); // Only support single successor for simplicity @@ -184,10 +209,17 @@ void match(luci::CircleNode *front, std::vector &res) auto next_back = loco::must_cast(next_succ); back_fusability = fusability(next_back); back_fusability.pre_scale &= f.scale_forwardable; - back_fusability.pre_shift &= f.shift_forwardable; + back = next_back; } } + if (front_fusability.post_scale and back_fusability.pre_scale) + { + res.emplace_back(front->name(), back->name(), EqualizePattern::Type::ScaleOnly); + } + + // TODO Let's consider "shift" when it is necessary. 
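Among the scale-forwardable ops listed above, MAX_POOL_2D is the interesting case: a positive per-channel scale commutes with max-pooling because multiplication by s > 0 preserves ordering. A toy 1-D check in Python:

    def maxpool(xs, k=2):
        return [max(xs[i:i + k]) for i in range(0, len(xs), k)]

    xs = [1.0, -2.0, 3.0, 0.5]
    s = 4.0  # positive scale (single channel for brevity)
    assert maxpool([s * x for x in xs]) == [s * y for y in maxpool(xs)]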
+#if 0 // Create EqualizePattern based on fusability // ScaleShift // front: fusable_post_shift and fusable_post_scale @@ -211,6 +243,7 @@ void match(luci::CircleNode *front, std::vector &res) { res.emplace_back(front->name(), back->name(), EqualizePattern::Type::ScaleOnly); } +#endif } } diff --git a/compiler/fme-detect/src/EqualizePatternFinder.test.cpp b/compiler/fme-detect/src/EqualizePatternFinder.test.cpp index 63a8ed01321..086225aa222 100644 --- a/compiler/fme-detect/src/EqualizePatternFinder.test.cpp +++ b/compiler/fme-detect/src/EqualizePatternFinder.test.cpp @@ -447,7 +447,7 @@ TEST(EqualizePatternFinderTest, conv_pad_conv) EXPECT_EQ(1, res.size()); EXPECT_EQ("conv1", res[0].front); - EXPECT_EQ("pad", res[0].back); + EXPECT_EQ("conv2", res[0].back); EXPECT_EQ(EqualizePattern::Type::ScaleOnly, res[0].type); } @@ -483,7 +483,7 @@ TEST(EqualizePatternFinderTest, conv_maxpool_conv) EXPECT_EQ(1, res.size()); EXPECT_EQ("conv1", res[0].front); - EXPECT_EQ("maxpool", res[0].back); + EXPECT_EQ("conv2", res[0].back); EXPECT_EQ(EqualizePattern::Type::ScaleOnly, res[0].type); } @@ -503,7 +503,7 @@ TEST(EqualizePatternFinderTest, conv_relu_pad_conv) EXPECT_EQ(1, res.size()); EXPECT_EQ("conv1", res[0].front); - EXPECT_EQ("pad", res[0].back); + EXPECT_EQ("conv2", res[0].back); EXPECT_EQ(EqualizePattern::Type::ScaleOnly, res[0].type); } @@ -541,25 +541,6 @@ TEST(EqualizePatternFinderTest, conv_tanh_pad_conv_NEG) EXPECT_EQ(0, res.size()); } -TEST(EqualizePatternFinderTest, tconv_slice_instnorm) -{ - EqualizePatternFinder::Context ctx; - { - ctx._allow_dup_op = true; - } - EqualizePatternFinder epf(ctx); - - TConvSliceInstnormGraph g; - g.init(); - - auto res = epf.find(g.g()); - - EXPECT_EQ(1, res.size()); - EXPECT_EQ("tconv", res[0].front); - EXPECT_EQ("slice", res[0].back); - EXPECT_EQ(EqualizePattern::Type::ShiftOnly, res[0].type); -} - TEST(EqualizePatternFinderTest, tconv_slice_NEG) { EqualizePatternFinder::Context ctx; @@ -577,28 +558,6 @@ TEST(EqualizePatternFinderTest, tconv_slice_NEG) EXPECT_EQ(0, res.size()); } -TEST(EqualizePatternFinderTest, dup_op) -{ - EqualizePatternFinder::Context ctx; - { - ctx._allow_dup_op = true; - } - EqualizePatternFinder epf(ctx); - - ConvConvINGraph g; - g.init(); - - auto res = epf.find(g.g()); - - EXPECT_EQ(2, res.size()); - EXPECT_EQ("conv1", res[0].front); - EXPECT_EQ("instnorm", res[0].back); - EXPECT_EQ(EqualizePattern::Type::ShiftOnly, res[0].type); - EXPECT_EQ("conv1", res[1].front); - EXPECT_EQ("conv2", res[1].back); - EXPECT_EQ(EqualizePattern::Type::ScaleOnly, res[1].type); -} - TEST(EqualizePatternFinderTest, dup_op_NEG) { EqualizePatternFinder::Context ctx; diff --git a/compiler/luci/service/include/luci/Service/CircleShapeInference.h b/compiler/luci/service/include/luci/Service/CircleShapeInference.h index 8906983576f..0f3bfdf7855 100644 --- a/compiler/luci/service/include/luci/Service/CircleShapeInference.h +++ b/compiler/luci/service/include/luci/Service/CircleShapeInference.h @@ -74,7 +74,7 @@ class Algorithm final : public luci::CircleNodeVisitor // loco::TensorShape visit(const luci::CircleFloor *node) final; // loco::TensorShape visit(const luci::CircleFloorDiv *node) final; // loco::TensorShape visit(const luci::CircleFloorMod *node) final; - // loco::TensorShape visit(const luci::CircleFullyConnected *node) final; + loco::TensorShape visit(const luci::CircleFullyConnected *node) final; // loco::TensorShape visit(const luci::CircleGather *node) final; // loco::TensorShape visit(const luci::CircleGatherNd *node) final; // 
loco::TensorShape visit(const luci::CircleGreater *node) final; @@ -112,7 +112,7 @@ class Algorithm final : public luci::CircleNodeVisitor // loco::TensorShape visit(const luci::CirclePow *node) final; // loco::TensorShape visit(const luci::CirclePRelu *node) final; loco::TensorShape visit(const luci::CircleQuantize *node) final; - // loco::TensorShape visit(const luci::CircleRange *node) final; + loco::TensorShape visit(const luci::CircleRange *node) final; // loco::TensorShape visit(const luci::CircleRank *node) final; // loco::TensorShape visit(const luci::CircleReduceAny *node) final; // loco::TensorShape visit(const luci::CircleReduceMax *node) final; @@ -146,7 +146,7 @@ class Algorithm final : public luci::CircleNodeVisitor // loco::TensorShape visit(const luci::CircleSquare *node) final; // loco::TensorShape visit(const luci::CircleSquaredDifference *node) final; // loco::TensorShape visit(const luci::CircleSqueeze *node) final; - // loco::TensorShape visit(const luci::CircleStridedSlice *node) final; + loco::TensorShape visit(const luci::CircleStridedSlice *node) final; // loco::TensorShape visit(const luci::CircleSub *node) final; // loco::TensorShape visit(const luci::CircleSum *node) final; // loco::TensorShape visit(const luci::CircleTanh *node) final; diff --git a/compiler/luci/service/src/CircleShapeInferenceHelper.cpp b/compiler/luci/service/src/CircleShapeInferenceHelper.cpp index 76867ccafc1..3d78a31a12e 100644 --- a/compiler/luci/service/src/CircleShapeInferenceHelper.cpp +++ b/compiler/luci/service/src/CircleShapeInferenceHelper.cpp @@ -161,7 +161,7 @@ loco::TensorShape broadcast_shape(const loco::TensorShape &x, const loco::Tensor return output_shape; } -loco::TensorShape pad_shape(const loco::TensorShape &input_shape, const luci::CircleConst *paddings) +loco::TensorShape pad_shape(const loco::TensorShape &input_shape, const luci::CircleNode *paddings) { const loco::DataType S32 = loco::DataType::S32; const loco::DataType S64 = loco::DataType::S64; @@ -180,6 +180,11 @@ loco::TensorShape pad_shape(const loco::TensorShape &input_shape, const luci::Ci loco::TensorShape output_shape; output_shape.rank(input_shape.rank()); + + auto const_padding = dynamic_cast(paddings); + if (const_padding == nullptr) + return output_shape; + for (int32_t ni = 0; ni < n; ++ni) { if (not input_shape.dim(ni).known()) @@ -189,15 +194,15 @@ loco::TensorShape pad_shape(const loco::TensorShape &input_shape, const luci::Ci } int32_t idx = ni * 2; int value = input_shape.dim(ni).value(); - if (paddings->dtype() == S32) + if (const_padding->dtype() == S32) { - value += paddings->at(idx + 0); // left - value += paddings->at(idx + 1); // right + value += const_padding->at(idx + 0); // left + value += const_padding->at(idx + 1); // right } else { - auto pl = paddings->at(idx + 0); - auto pr = paddings->at(idx + 1); + auto pl = const_padding->at(idx + 0); + auto pr = const_padding->at(idx + 1); auto max = static_cast(std::numeric_limits::max()); auto low = static_cast(std::numeric_limits::lowest()); LUCI_ASSERT(pl <= max, "paddings is over 32 bit limit"); diff --git a/compiler/luci/service/src/CircleShapeInferenceHelper.h b/compiler/luci/service/src/CircleShapeInferenceHelper.h index 4961e9c40de..1f99bb5b444 100644 --- a/compiler/luci/service/src/CircleShapeInferenceHelper.h +++ b/compiler/luci/service/src/CircleShapeInferenceHelper.h @@ -49,8 +49,8 @@ loco::TensorShape circle_shape(const luci::CircleNode *node); loco::TensorShape broadcast_shape(const loco::TensorShape &x, const loco::TensorShape &y); 
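Behaviorally, the relaxed pad_shape above now accepts any node as paddings: a CircleConst still adds the left/right pads per dimension, while a non-constant paddings node yields a shape whose dimensions are all unknown. A condensed Python model of the rule, with None marking an unknown dimension and plain lists standing in for the loco types:

    def pad_shape(input_shape, paddings):
        if paddings is None:  # stands in for "paddings is not a CircleConst"
            return [None] * len(input_shape)
        return [
            d + paddings[2 * i] + paddings[2 * i + 1] if d is not None else None
            for i, d in enumerate(input_shape)
        ]

    print(pad_shape([1, 4, 4, 3], [0, 0, 1, 1, 1, 1, 0, 0]))  # [1, 6, 6, 3]
    print(pad_shape([1, 4, 4, 3], None))  # [None, None, None, None]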
// Return shape of pad ops using paddings. -loco::TensorShape pad_shape(const loco::TensorShape &input_shape, - const luci::CircleConst *paddings); +// If paddings is not static, return the shape filled with unknown dimensions. +loco::TensorShape pad_shape(const loco::TensorShape &input_shape, const luci::CircleNode *paddings); /** * @brief Create a higher-rank TensorShape following NumPy broadcasting semantics diff --git a/compiler/luci/service/src/CircleShapeInferenceRule.cpp b/compiler/luci/service/src/CircleShapeInferenceRule.cpp index c6a57b5b723..b6c5d0d5bb7 100644 --- a/compiler/luci/service/src/CircleShapeInferenceRule.cpp +++ b/compiler/luci/service/src/CircleShapeInferenceRule.cpp @@ -19,7 +19,6 @@ #include "Check.h" #include "CircleShapeInferenceHelper.h" -#include "ShapeInfer_StridedSlice.h" #include #include @@ -637,47 +636,6 @@ loco::NodeShape infer_fill(const luci::CircleFill *node) return loco::NodeShape{shape}; } -loco::NodeShape infer_fully_connected(const luci::CircleFullyConnected *node) -{ - auto input_shape = luci::shape_get(node->input()).as(); - auto weights_shape = luci::shape_get(node->weights()).as(); - - loco::TensorShape out_shape; - - // NOTE Some recipes in some repositories are using rank 4 input for FullyConnected. - // Until they are all fixed, disable following assert. - // TODO Enable following assert after related fixes are applied - // https://github.com/tensorflow/tensorflow/blob/ea33c1e7a25d8025e8ee405ad8ab7be261798d76/tensorflow/lite/kernels/fully_connected.cc#L194 - // LUCI_ASSERT(input_shape.rank() == 2 || input_shape.rank() == 3, - // "Input rank of FullyConnected should be 2 or 3"); - - // https://github.com/tensorflow/tensorflow/blob/ea33c1e7a25d8025e8ee405ad8ab7be261798d76/tensorflow/lite/kernels/fully_connected.cc#L225 - LUCI_ASSERT(weights_shape.rank() == 2, "Weights of FullyConnected should be 2"); - - // https://github.com/tensorflow/tensorflow/blob/ea33c1e7a25d8025e8ee405ad8ab7be261798d76/tensorflow/lite/kernels/fully_connected.cc#L353-L367 - if (node->keep_num_dims()) - { - out_shape.rank(input_shape.rank()); - for (uint32_t i = 0; i < input_shape.rank(); ++i) - out_shape.dim(i) = input_shape.dim(i); - out_shape.dim(out_shape.rank() - 1) = weights_shape.dim(0); - } - else - { - uint32_t input_size = 1; - for (uint32_t i = 0; i < input_shape.rank(); i++) - { - input_size = input_size * input_shape.dim(i).value(); - } - const uint32_t batch_size = input_size / weights_shape.dim(1).value(); - out_shape.rank(2); - out_shape.dim(0) = batch_size; - out_shape.dim(1) = weights_shape.dim(0); - } - - return loco::NodeShape{out_shape}; -} - loco::NodeShape infer_gather(const luci::CircleGather *node) { loco::TensorShape output_shape; @@ -953,49 +911,6 @@ loco::NodeShape infer_p_relu(const luci::CirclePRelu *node) return loco::NodeShape{output_shape}; } -loco::NodeShape infer_range(const luci::CircleRange *node) -{ - loco::TensorShape output_shape; - output_shape.rank(1); - - auto start_node = dynamic_cast(node->start()); - auto limit_node = dynamic_cast(node->limit()); - auto delta_node = dynamic_cast(node->delta()); - - if (start_node == nullptr || limit_node == nullptr || delta_node == nullptr) - { - return use_own(node); - } - - double start = 0, limit = 0, delta = 0; - -#define GET_RANGE_PARAM(DT) \ - start = start_node->scalar
<DT>(); \
-  limit = limit_node->scalar<DT>(); \
-  delta = delta_node->scalar
(); - - switch (start_node->dtype()) - { - case loco::DataType::FLOAT32: - GET_RANGE_PARAM(loco::DataType::FLOAT32) - break; - case loco::DataType::S32: - GET_RANGE_PARAM(loco::DataType::S32) - break; - default: - INTERNAL_EXN("Range data type not supported"); - } - -#undef GET_RANGE_PARAM - - if (delta == 0) - INTERNAL_EXN("Delta can not be zero"); - - output_shape.dim(0) = ceil((limit - start) / delta); - - return loco::NodeShape{output_shape}; -} - template loco::NodeShape infer_resize_type(const CIRCLENODE *node) { auto input_shape = luci::shape_get(node->input()).template as(); @@ -1306,21 +1221,6 @@ loco::NodeShape infer_sparse_to_dense(const luci::CircleSparseToDense *node) return loco::NodeShape{shape}; } -loco::NodeShape infer_strided_slice(const luci::CircleStridedSlice *node) -{ - auto begin_node = dynamic_cast(node->begin()); - auto end_node = dynamic_cast(node->end()); - auto strides_node = dynamic_cast(node->strides()); - - if (begin_node == nullptr || end_node == nullptr || strides_node == nullptr) - { - return use_own(node); - } - - loco::TensorShape shape = infer_output_shape(node); - return loco::NodeShape{shape}; -} - loco::NodeShape infer_squeeze(const luci::CircleSqueeze *node) { auto input_shape = luci::shape_get(node->input()).as(); @@ -1967,11 +1867,6 @@ class ShapeInferenceAlgorithm final : public luci::CircleNodeVisitor::visit(const luci::CircleFullyConnected return cloned; } +namespace sinf +{ + +loco::TensorShape Algorithm::visit(const luci::CircleFullyConnected *node) +{ + auto input_shape = circle_shape(loco::must_cast(node->input())); + auto weights_shape = circle_shape(loco::must_cast(node->weights())); + + loco::TensorShape out_shape; + + // NOTE Some recipes in some repositories are using rank 4 input for FullyConnected. + // Until they are all fixed, disable following assert. + // TODO Enable following assert after related fixes are applied + // https://github.com/tensorflow/tensorflow/blob/ea33c1e7a25d8025e8ee405ad8ab7be261798d76/tensorflow/lite/kernels/fully_connected.cc#L194 + // LUCI_ASSERT(input_shape.rank() == 2 || input_shape.rank() == 3, + // "Input rank of FullyConnected should be 2 or 3"); + + // https://github.com/tensorflow/tensorflow/blob/ea33c1e7a25d8025e8ee405ad8ab7be261798d76/tensorflow/lite/kernels/fully_connected.cc#L225 + LUCI_ASSERT(weights_shape.rank() == 2, "Weights of FullyConnected should be 2"); + LUCI_ASSERT(weights_shape.dim(0).known() && weights_shape.dim(1).known(), + "Weights of FullyConnected should be known") + // https://github.com/tensorflow/tensorflow/blob/ea33c1e7a25d8025e8ee405ad8ab7be261798d76/tensorflow/lite/kernels/fully_connected.cc#L353-L367 + + /* + * **Pre-conditions:** + * input_shape.rank() <= 4 + * * remark: TFLite allows <=3 ranks, but there are rank 4 input recipes in ONE + * weights_shape.rank() == 2 and all dimensions are known. + * When runtime(input_shape[-1] and weights_shape[-1] are both known), it should be same value. + * + * **Shape Inference Rule:** + * **Input Shape:** + * input_shape : (A, B, C, D) + * weights_shape : (E, F) + * A, B, C, D are "positive numbers" or "unknown". + * E, F are always "positive numbers". + * + * **Output Shape:** + * If keep_dims = True : (A, B, C, E) + * If keep_dims = False : (G, E) + * * G = unknown (if any of A, B, or C is unknown.) + * * G = A * B * C (otherwise.) 
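+ *
+ * **Worked example (illustrative, derived from the rule above):**
+ * input_shape (2, 3, unknown, 20) with weights_shape (30, 20):
+ * keep_num_dims = True  -> (2, 3, unknown, 30)
+ * keep_num_dims = False -> (unknown, 30), since one of A, B, C is unknown
+ * With a fully known input (2, 3, 4, 20) and keep_num_dims = False, the
+ * output is (24, 30), where 24 = 2 * 3 * 4.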
+ */ + + if (node->keep_num_dims()) + { + out_shape.rank(input_shape.rank()); + for (uint32_t i = 0; i < input_shape.rank(); ++i) + out_shape.dim(i) = input_shape.dim(i); + out_shape.dim(out_shape.rank() - 1) = weights_shape.dim(0); + } + else + { + bool is_dynamic_shape = false; + + for (uint32_t i = 0; i < input_shape.rank() - 1; i++) + { + if (not input_shape.dim(i).known()) + { + is_dynamic_shape = true; + break; + } + } + + uint32_t batch_size = 1; + + for (uint32_t i = 0; i < input_shape.rank() - 1; i++) + { + batch_size *= input_shape.dim(i).value(); + } + + out_shape.rank(2); + if (is_dynamic_shape) + out_shape.dim(0).unset(); + else + out_shape.dim(0) = batch_size; + out_shape.dim(1) = weights_shape.dim(0); + } + + return out_shape; +} + +} // namespace sinf + } // namespace luci diff --git a/compiler/luci/service/src/Nodes/CircleFullyConnected.test.cpp b/compiler/luci/service/src/Nodes/CircleFullyConnected.test.cpp index 965b5913051..9fcab69d4ec 100644 --- a/compiler/luci/service/src/Nodes/CircleFullyConnected.test.cpp +++ b/compiler/luci/service/src/Nodes/CircleFullyConnected.test.cpp @@ -16,6 +16,8 @@ #include "luci/Service/CircleNodeClone.h" +#include "luci/Service/CircleShapeInference.h" + #include TEST(CloneNodeTest, clone_FullyConnected) @@ -59,3 +61,300 @@ TEST(CloneNodeTest, clone_FullyConnected_wf_NEG) auto cloned = luci::clone_node(node_fc, gc.get()); ASSERT_EQ(nullptr, cloned); } + +TEST(ShapeRuleTest, fully_connected_dynamic_shape_keep_dims) +{ + luci::CircleInput input; + luci::CircleConst weights; + luci::CircleConst bias; + luci::CircleFullyConnected fully_connected; + + input.shape({1, 10, 15, 20}); + input.shape_status(luci::ShapeStatus::VALID); + input.dim(1).unset(); + + weights.shape({30, 20}); + weights.shape_status(luci::ShapeStatus::VALID); + + bias.shape_status(luci::ShapeStatus::VALID); + + fully_connected.input(&input); + fully_connected.weights(&weights); + fully_connected.bias(&bias); + fully_connected.keep_num_dims(true); + + loco::TensorShape shape; + luci::sinf::Rule shape_inf_rule; + + ASSERT_TRUE(shape_inf_rule.infer(&fully_connected, shape)); + ASSERT_EQ(4, shape.rank()); + ASSERT_TRUE(shape.dim(0).known()); + ASSERT_FALSE(shape.dim(1).known()); + ASSERT_TRUE(shape.dim(2).known()); + ASSERT_TRUE(shape.dim(3).known()); + + ASSERT_EQ(1, shape.dim(0).value()); + ASSERT_EQ(0, shape.dim(1).value()); + ASSERT_EQ(15, shape.dim(2).value()); + ASSERT_EQ(30, shape.dim(3).value()); +} + +TEST(ShapeRuleTest, fully_connected_last_dim_dynamic_keep_dims) +{ + luci::CircleInput input; + luci::CircleConst weights; + luci::CircleConst bias; + luci::CircleFullyConnected fully_connected; + + input.shape({1, 10, 15, 20}); + input.shape_status(luci::ShapeStatus::VALID); + input.dim(3).unset(); + + weights.shape({30, 20}); + weights.shape_status(luci::ShapeStatus::VALID); + + bias.shape_status(luci::ShapeStatus::VALID); + + fully_connected.input(&input); + fully_connected.weights(&weights); + fully_connected.bias(&bias); + fully_connected.keep_num_dims(true); + + loco::TensorShape shape; + luci::sinf::Rule shape_inf_rule; + + ASSERT_TRUE(shape_inf_rule.infer(&fully_connected, shape)); + ASSERT_EQ(4, shape.rank()); + ASSERT_TRUE(shape.dim(0).known()); + ASSERT_TRUE(shape.dim(1).known()); + ASSERT_TRUE(shape.dim(2).known()); + ASSERT_TRUE(shape.dim(3).known()); + + ASSERT_EQ(1, shape.dim(0).value()); + ASSERT_EQ(10, shape.dim(1).value()); + ASSERT_EQ(15, shape.dim(2).value()); + ASSERT_EQ(30, shape.dim(3).value()); +} + +TEST(ShapeRuleTest, 
fully_connected_dynamic_shape_no_keep_dims) +{ + luci::CircleInput input; + luci::CircleConst weights; + luci::CircleConst bias; + luci::CircleFullyConnected fully_connected; + + input.shape({1, 10, 15, 20}); + input.shape_status(luci::ShapeStatus::VALID); + input.dim(2).unset(); + + weights.shape({30, 20}); + weights.shape_status(luci::ShapeStatus::VALID); + + bias.shape_status(luci::ShapeStatus::VALID); + + fully_connected.input(&input); + fully_connected.weights(&weights); + fully_connected.bias(&bias); + fully_connected.keep_num_dims(false); + + loco::TensorShape shape; + luci::sinf::Rule shape_inf_rule; + + ASSERT_TRUE(shape_inf_rule.infer(&fully_connected, shape)); + ASSERT_EQ(2, shape.rank()); + ASSERT_FALSE(shape.dim(0).known()); + ASSERT_TRUE(shape.dim(1).known()); + + ASSERT_EQ(0, shape.dim(0).value()); + ASSERT_EQ(30, shape.dim(1).value()); +} + +TEST(ShapeRuleTest, fully_connected_last_dim_dynamic_no_keep_dims) +{ + luci::CircleInput input; + luci::CircleConst weights; + luci::CircleConst bias; + luci::CircleFullyConnected fully_connected; + + input.shape({1, 10, 15, 20}); + input.shape_status(luci::ShapeStatus::VALID); + input.dim(3).unset(); + + weights.shape({30, 20}); + weights.shape_status(luci::ShapeStatus::VALID); + + bias.shape_status(luci::ShapeStatus::VALID); + + fully_connected.input(&input); + fully_connected.weights(&weights); + fully_connected.bias(&bias); + fully_connected.keep_num_dims(false); + + loco::TensorShape shape; + luci::sinf::Rule shape_inf_rule; + + ASSERT_TRUE(shape_inf_rule.infer(&fully_connected, shape)); + ASSERT_EQ(2, shape.rank()); + ASSERT_TRUE(shape.dim(0).known()); + ASSERT_TRUE(shape.dim(1).known()); + + ASSERT_EQ(150, shape.dim(0).value()); + ASSERT_EQ(30, shape.dim(1).value()); +} + +TEST(ShapeRuleTest, fully_connected_all_dim_dynamic_no_keep_dims) +{ + luci::CircleInput input; + luci::CircleConst weights; + luci::CircleConst bias; + luci::CircleFullyConnected fully_connected; + + input.shape({1, 10, 15, 20}); + input.shape_status(luci::ShapeStatus::VALID); + input.dim(0).unset(); + input.dim(1).unset(); + input.dim(2).unset(); + input.dim(3).unset(); + + weights.shape({30, 20}); + weights.shape_status(luci::ShapeStatus::VALID); + + bias.shape_status(luci::ShapeStatus::VALID); + + fully_connected.input(&input); + fully_connected.weights(&weights); + fully_connected.bias(&bias); + fully_connected.keep_num_dims(false); + + loco::TensorShape shape; + luci::sinf::Rule shape_inf_rule; + + ASSERT_TRUE(shape_inf_rule.infer(&fully_connected, shape)); + ASSERT_EQ(2, shape.rank()); + ASSERT_FALSE(shape.dim(0).known()); + ASSERT_TRUE(shape.dim(1).known()); + + ASSERT_EQ(0, shape.dim(0).value()); + ASSERT_EQ(30, shape.dim(1).value()); +} + +TEST(ShapeRuleTest, fully_connected_nullptr_weights_NEG) +{ + luci::CircleInput input; + luci::CircleConst weights; + luci::CircleConst bias; + luci::CircleFullyConnected fully_connected; + + input.shape({1, 10, 20}); + input.shape_status(luci::ShapeStatus::VALID); + + bias.shape_status(luci::ShapeStatus::VALID); + + fully_connected.input(&input); + fully_connected.weights(nullptr); + fully_connected.bias(&bias); + fully_connected.keep_num_dims(true); + + loco::TensorShape shape; + luci::sinf::Rule shape_inf_rule; + + ASSERT_ANY_THROW(shape_inf_rule.infer(&fully_connected, shape)); +} + +TEST(ShapeRuleTest, fully_connected_nullptr_input_NEG) +{ + luci::CircleInput input; + luci::CircleConst weights; + luci::CircleConst bias; + luci::CircleFullyConnected fully_connected; + + weights.shape({30, 20}); + 
weights.shape_status(luci::ShapeStatus::VALID); + + bias.shape_status(luci::ShapeStatus::VALID); + + fully_connected.input(nullptr); + fully_connected.weights(&weights); + fully_connected.bias(&bias); + fully_connected.keep_num_dims(true); + + loco::TensorShape shape; + luci::sinf::Rule shape_inf_rule; + + ASSERT_ANY_THROW(shape_inf_rule.infer(&fully_connected, shape)); +} + +TEST(ShapeRuleTest, fully_connected_nullptr_bias_NEG) +{ + luci::CircleInput input; + luci::CircleConst weights; + luci::CircleConst bias; + luci::CircleFullyConnected fully_connected; + + input.shape({1, 15, 20}); + input.shape_status(luci::ShapeStatus::VALID); + + weights.shape({30, 20}); + weights.shape_status(luci::ShapeStatus::VALID); + + fully_connected.input(&input); + fully_connected.weights(&weights); + fully_connected.bias(nullptr); + fully_connected.keep_num_dims(true); + + loco::TensorShape shape; + luci::sinf::Rule shape_inf_rule; + + ASSERT_ANY_THROW(shape_inf_rule.infer(&fully_connected, shape)); +} + +TEST(ShapeRuleTest, fully_connected_undefined_bias_NEG) +{ + luci::CircleInput input; + luci::CircleConst weights; + luci::CircleConst bias; + luci::CircleFullyConnected fully_connected; + + input.shape({1, 15, 20}); + input.shape_status(luci::ShapeStatus::VALID); + + weights.shape({30, 20}); + weights.shape_status(luci::ShapeStatus::VALID); + + bias.shape_status(luci::ShapeStatus::UNDEFINED); + + fully_connected.input(&input); + fully_connected.weights(&weights); + fully_connected.bias(&bias); + fully_connected.keep_num_dims(true); + + loco::TensorShape shape; + luci::sinf::Rule shape_inf_rule; + + ASSERT_FALSE(shape_inf_rule.infer(&fully_connected, shape)); +} + +TEST(ShapeRuleTest, fully_connected_undefined_input_NEG) +{ + luci::CircleInput input; + luci::CircleConst weights; + luci::CircleConst bias; + luci::CircleFullyConnected fully_connected; + + input.shape_status(luci::ShapeStatus::UNDEFINED); + + weights.shape({30, 20}); + weights.shape_status(luci::ShapeStatus::VALID); + + bias.shape_status(luci::ShapeStatus::VALID); + + fully_connected.input(&input); + fully_connected.weights(&weights); + fully_connected.bias(&bias); + fully_connected.keep_num_dims(true); + + loco::TensorShape shape; + luci::sinf::Rule shape_inf_rule; + + ASSERT_FALSE(shape_inf_rule.infer(&fully_connected, shape)); +} diff --git a/compiler/luci/service/src/Nodes/CirclePad.cpp b/compiler/luci/service/src/Nodes/CirclePad.cpp index 2f4f90140af..8dbaf399b54 100644 --- a/compiler/luci/service/src/Nodes/CirclePad.cpp +++ b/compiler/luci/service/src/Nodes/CirclePad.cpp @@ -32,8 +32,7 @@ namespace sinf loco::TensorShape Algorithm::visit(const luci::CirclePad *node) { - // TODO support non-const case - auto paddings = loco::must_cast(node->paddings()); + auto paddings = loco::must_cast(node->paddings()); auto circle_input = loco::must_cast(node->input()); auto input_shape = circle_shape(circle_input); return pad_shape(input_shape, paddings); diff --git a/compiler/luci/service/src/Nodes/CirclePad.test.cpp b/compiler/luci/service/src/Nodes/CirclePad.test.cpp index 996426164f3..070b9b31075 100644 --- a/compiler/luci/service/src/Nodes/CirclePad.test.cpp +++ b/compiler/luci/service/src/Nodes/CirclePad.test.cpp @@ -90,3 +90,37 @@ TEST(ShapeRuleTest, pad_without_padding_NEG) ASSERT_ANY_THROW(shape_inf_rule.infer(node_pad, shape)); } + +TEST(ShapeRuleTest, pad_non_const_paddings) +{ + auto g = loco::make_graph(); + auto node_pad = g->nodes()->create(); + + auto node_paddings = g->nodes()->create(); + auto node_input = g->nodes()->create(); + + 
loco::TensorShape shape; + luci::sinf::Rule shape_inf_rule; + + node_input->shape({1, 2, 3, 4}); + node_input->shape_status(luci::ShapeStatus::VALID); + + node_paddings->dtype(loco::DataType::S64); + node_paddings->shape({4, 2}); + node_paddings->shape_status(luci::ShapeStatus::VALID); + + node_pad->input(node_input); + node_pad->paddings(node_paddings); + + ASSERT_TRUE(shape_inf_rule.infer(node_pad, shape)); + ASSERT_EQ(shape.rank(), 4); + ASSERT_FALSE(shape.dim(0).known()); + ASSERT_FALSE(shape.dim(1).known()); + ASSERT_FALSE(shape.dim(2).known()); + ASSERT_FALSE(shape.dim(3).known()); + + ASSERT_EQ(0, shape.dim(0).value()); + ASSERT_EQ(0, shape.dim(1).value()); + ASSERT_EQ(0, shape.dim(2).value()); + ASSERT_EQ(0, shape.dim(3).value()); +} diff --git a/compiler/luci/service/src/Nodes/CircleRange.cpp b/compiler/luci/service/src/Nodes/CircleRange.cpp index ccb975d4ea0..b2c8be4d5f8 100644 --- a/compiler/luci/service/src/Nodes/CircleRange.cpp +++ b/compiler/luci/service/src/Nodes/CircleRange.cpp @@ -14,7 +14,12 @@ * limitations under the License. */ +#include "luci/Service/CircleShapeInference.h" + #include "CircleCloneNode.h" +#include "CircleShapeInferenceHelper.h" + +#include namespace luci { @@ -24,4 +29,66 @@ luci::CircleNode *CloneNodeLet::visit(const luci::CircleRange *) return _graph->nodes()->create(); } +namespace sinf +{ + +loco::TensorShape Algorithm::visit(const luci::CircleRange *node) +{ + loco::TensorShape output_shape; + output_shape.rank(1); + + auto start_node = dynamic_cast(node->start()); + auto limit_node = dynamic_cast(node->limit()); + auto delta_node = dynamic_cast(node->delta()); + + if (start_node == nullptr || limit_node == nullptr || delta_node == nullptr) + { + // We use shape from the node itself + loco::TensorShape shape; + shape.rank(node->rank()); + for (uint32_t r = 0; r < node->rank(); ++r) + { + // TODO remove this copy from `use_own(node);` + // Shape inference rules in this file did not consider unknown dimension. + // If some node has unknown dimension, 0 is inserted and wrong shape + // inference was done as a result. + // To fix this, new shape inference algorithm is being implemented. + // Until new inference algorithm is fully implemented, unknown dimension + // would be represented as 1 along with TFLite expression. + shape.dim(r) = node->dim(r).known() ? node->dim(r).value() : 1; + } + return shape; + } + + double start = 0, limit = 0, delta = 0; + +#define GET_RANGE_PARAM(DT) \ + start = start_node->scalar
<DT>(); \
+  limit = limit_node->scalar<DT>(); \
+  delta = delta_node->scalar
(); + + switch (start_node->dtype()) + { + case loco::DataType::FLOAT32: + GET_RANGE_PARAM(loco::DataType::FLOAT32) + break; + case loco::DataType::S32: + GET_RANGE_PARAM(loco::DataType::S32) + break; + default: + INTERNAL_EXN("Range data type not supported"); + } + +#undef GET_RANGE_PARAM + + if (delta == 0) + INTERNAL_EXN("Delta can not be zero"); + + output_shape.dim(0) = ceil((limit - start) / delta); + + return output_shape; +} + +} // namespace sinf + } // namespace luci diff --git a/compiler/luci/service/src/Nodes/CircleRange.test.cpp b/compiler/luci/service/src/Nodes/CircleRange.test.cpp index b2fb296177a..b67d287d1ab 100644 --- a/compiler/luci/service/src/Nodes/CircleRange.test.cpp +++ b/compiler/luci/service/src/Nodes/CircleRange.test.cpp @@ -15,6 +15,7 @@ */ #include "luci/Service/CircleNodeClone.h" +#include "luci/Service/CircleShapeInference.h" #include @@ -31,3 +32,66 @@ TEST(CloneNodeTest, clone_Range) auto cloned_range = dynamic_cast(cloned); ASSERT_NE(nullptr, cloned_range); } + +TEST(ShapeRuleTest, range_const_param) +{ + luci::CircleConst start, limit, delta; + luci::CircleRange range; + + start.dtype(loco::DataType::S32); + start.size(1); + start.at(0) = 0; + start.shape_status(luci::ShapeStatus::VALID); + + limit.dtype(loco::DataType::S32); + limit.size(1); + limit.at(0) = 10; + limit.shape_status(luci::ShapeStatus::VALID); + + delta.dtype(loco::DataType::S32); + delta.size(1); + delta.at(0) = 2; + delta.shape_status(luci::ShapeStatus::VALID); + + range.start(&start); + range.limit(&limit); + range.delta(&delta); + + loco::TensorShape shape; + luci::sinf::Rule shape_inf_rule; + + ASSERT_TRUE(shape_inf_rule.infer(&range, shape)); + ASSERT_EQ(1, shape.rank()); + ASSERT_TRUE(shape.dim(0).known()); + ASSERT_EQ(5, shape.dim(0).value()); +} + +TEST(ShapeRuleTest, range_zero_delta_NEG) +{ + luci::CircleConst start, limit, delta; + luci::CircleRange range; + + start.dtype(loco::DataType::S32); + start.size(1); + start.at(0) = 0; + start.shape_status(luci::ShapeStatus::VALID); + + limit.dtype(loco::DataType::S32); + limit.size(1); + limit.at(0) = 10; + limit.shape_status(luci::ShapeStatus::VALID); + + delta.dtype(loco::DataType::S32); + delta.size(1); + delta.at(0) = 0; + delta.shape_status(luci::ShapeStatus::VALID); + + range.start(&start); + range.limit(&limit); + range.delta(&delta); + + loco::TensorShape shape; + luci::sinf::Rule shape_inf_rule; + + ASSERT_ANY_THROW(shape_inf_rule.infer(&range, shape)); +} diff --git a/compiler/luci/service/src/Nodes/CircleStridedSlice.cpp b/compiler/luci/service/src/Nodes/CircleStridedSlice.cpp index 3298c92b53e..cc594f63ddc 100644 --- a/compiler/luci/service/src/Nodes/CircleStridedSlice.cpp +++ b/compiler/luci/service/src/Nodes/CircleStridedSlice.cpp @@ -1,5 +1,6 @@ /* * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright 2018 The TensorFlow Authors. All Rights Reserved. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -13,8 +14,21 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ +#include "luci/Service/CircleShapeInference.h" +#include "Check.h" #include "CircleCloneNode.h" +#include "CircleShapeInferenceHelper.h" + +#include +#include +#include +#include + +#include +#include +#include +#include namespace luci { @@ -33,4 +47,413 @@ luci::CircleNode *CloneNodeLet::visit(const luci::CircleStridedSlice * return cloned; } +// code referenced from +// https://github.com/tensorflow/tensorflow/blob/3f878cff5b698b82eea85db2b60d65a2e320850e/ +// tensorflow/lite/kernels/strided_slice.cc +// tensorflow/lite/kernels/internal/strided_slice_logic.h +namespace sinf +{ + +// This Op only supports 1-5D cases and since we use the reference 4D +// implementation, the 1-3D tensors are mapped to 4D. +const int kMaxDim = 5; + +const loco::DataType S32 = loco::DataType::S32; + +struct StridedSliceParams +{ + int8_t start_indices_count = 0; + int32_t start_indices[kMaxDim]; + int8_t stop_indices_count = 0; + int32_t stop_indices[kMaxDim]; + int8_t strides_count = 0; + int32_t strides[kMaxDim]; + + int16_t begin_mask = 0; + int16_t ellipsis_mask = 0; + int16_t end_mask = 0; + int16_t new_axis_mask = 0; + int16_t shrink_axis_mask = 0; +}; + +struct StridedSliceContext +{ + StridedSliceContext(const luci::CircleStridedSlice *node) + { + // check overflow issues + assert(static_cast(node->begin_mask()) == node->begin_mask()); + assert(static_cast(node->ellipsis_mask()) == node->ellipsis_mask()); + assert(static_cast(node->end_mask()) == node->end_mask()); + assert(static_cast(node->new_axis_mask()) == node->new_axis_mask()); + assert(static_cast(node->shrink_axis_mask()) == node->shrink_axis_mask()); + + params.begin_mask = node->begin_mask(); + params.ellipsis_mask = node->ellipsis_mask(); + params.end_mask = node->end_mask(); + params.new_axis_mask = node->new_axis_mask(); + params.shrink_axis_mask = node->shrink_axis_mask(); + + input = loco::must_cast(node->input()); + begin = loco::must_cast(node->begin()); + end = loco::must_cast(node->end()); + strides = loco::must_cast(node->strides()); + + loco::TensorShape input_shape = circle_shape(input); + input_dims = input_shape.rank(); + } + StridedSliceParams params; + luci::CircleNode *input = nullptr; + luci::CircleConst *begin = nullptr; + luci::CircleConst *end = nullptr; + luci::CircleConst *strides = nullptr; + + // Equivalent input shape after adding axis according to new_axis_mask. + loco::TensorShape effective_input_shape; + int64_t input_dims = 0; +}; + +// Use until std::clamp() is available from C++17. +inline int Clamp(const int32_t v, const int32_t lo, const int32_t hi) +{ + LUCI_ASSERT(!(hi < lo), "Clamp hi < lo"); + if (hi < v) + return hi; + if (v < lo) + return lo; + return v; +} + +// Return the index for the first element along that axis. This index will be a +// positive integer between [0, axis_size - 1] that can be used to index +// directly into the data. +inline int64_t StartForAxis(const StridedSliceParams ¶ms, const loco::TensorShape &input_shape, + int64_t axis) +{ + const auto begin_mask = params.begin_mask; + const auto *start_indices = params.start_indices; + const auto *strides = params.strides; + const int64_t axis_size = static_cast(input_shape.dim(axis).value()); + if (axis_size == 0) + { + return 0; + } + // Begin with the specified index. + int64_t start = start_indices[axis]; + + // begin_mask override + if (begin_mask & (1LL << axis)) + { + if (strides[axis] > 0) + { + // Forward iteration - use the first element. 
These values will get + // clamped below (Note: We could have set them to 0 and axis_size-1, but + // use lowest() and max() to maintain symmetry with StopForAxis()) + start = std::numeric_limits::lowest(); + } + else + { + // Backward iteration - use the last element. + start = std::numeric_limits::max(); + } + } + + // Handle negative indices + if (start < 0) + { + start += axis_size; + } + + // Clamping + if (strides[axis] > 0) + { + // Forward iteration + start = Clamp(start, 0, axis_size); + } + else + { + // Backward iteration + start = Clamp(start, -1, axis_size - 1); + } + + return start; +} + +// Return the "real" index for the end of iteration along that axis. This is an +// "end" in the traditional C sense, in that it points to one past the last +// element. ie. So if you were iterating through all elements of a 1D array of +// size 4, this function would return 4 as the stop, because it is one past the +// "real" indices of 0, 1, 2 & 3. +inline int64_t StopForAxis(const StridedSliceParams ¶ms, const loco::TensorShape &input_shape, + int64_t axis, int64_t start_for_axis) +{ + const auto end_mask = params.end_mask; + const auto shrink_axis_mask = params.shrink_axis_mask; + const auto *stop_indices = params.stop_indices; + const auto *strides = params.strides; + const int64_t axis_size = static_cast(input_shape.dim(axis).value()); + if (axis_size == 0) + { + return 0; + } + + // Begin with the specified index + const bool shrink_axis = shrink_axis_mask & (1LL << axis); + int64_t stop = stop_indices[axis]; + + // When shrinking an axis, the end position does not matter (and can be + // incorrect when negative indexing is used, see Issue #19260). Always use + // start_for_axis + 1 to generate a length 1 slice, since start_for_axis has + // already been adjusted for negative indices. + if (shrink_axis) + { + return start_for_axis + 1; + } + + // end_mask override + if (end_mask & (1LL << axis)) + { + if (strides[axis] > 0) + { + // Forward iteration - use the last element. These values will get + // clamped below + stop = std::numeric_limits::max(); + } + else + { + // Backward iteration - use the first element. + stop = std::numeric_limits::lowest(); + } + } + + // Handle negative indices + if (stop < 0) + { + stop += axis_size; + } + + // Clamping + // Because the end index points one past the last element, we need slightly + // different clamping ranges depending on the direction. + if (strides[axis] > 0) + { + // Forward iteration + stop = Clamp(stop, 0, axis_size); + } + else + { + // Backward iteration + stop = Clamp(stop, -1, axis_size - 1); + } + + return stop; +} + +StridedSliceParams BuildStridedSliceParams(StridedSliceContext *op_context) +{ + StridedSliceParams op_params; + + // The ellipsis_mask and new_axis_mask in op_params are not used. Those masks + // are processed here to update begin_mask, end_mask and the index range. + op_params.begin_mask = 0; + op_params.ellipsis_mask = 0; + op_params.end_mask = 0; + op_params.new_axis_mask = 0; + op_params.shrink_axis_mask = 0; + + // Count indexes where the new_axis_mask is set but the ellipsis_mask is not. + loco::TensorShape begin_shape = circle_shape(op_context->begin); + const int64_t begin_count = static_cast(begin_shape.dim(0).value()); + int64_t num_add_axis = 0; + for (int64_t i = 0; i < begin_count; ++i) + { + if (!((1LL << i) & op_context->params.ellipsis_mask) && + ((1LL << i) & op_context->params.new_axis_mask)) + { + num_add_axis++; + } + } + + // Calculate the dims of input after adding new axises. 
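+  // Example (sketch): slicing a rank-3 input as t[tf.newaxis, ...] sets one
+  // new_axis_mask bit outside the ellipsis, so num_add_axis = 1 and the
+  // effective rank computed below is 3 + 1 = 4.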
+ const int64_t effective_dims = op_context->input_dims + num_add_axis; + + // If begin, end and strides are not fully provided, it means Ellipsis should + // be expanded to multiple dimensions (Ex: for spec [Ellipsis, 2] on a 3D + // input, the Ellipsis should be applied for the first 2 dimensions). Besides, + // If the new_axis_mask and the ellipsis_mask are set at the same index, the + // new_axis_mask will have no effect. + int64_t effective_ellipsis_mask = 0, effective_new_axis_mask = 0; + int64_t ellipsis_start_idx = effective_dims, expanded_ellipsis = 0; + for (int64_t i = 0; i < effective_dims;) + { + if ((1LL << i) & op_context->params.ellipsis_mask) + { + ellipsis_start_idx = i; + int64_t ellipsis_end_idx = + std::max(i + 1, std::min(i + 1 + num_add_axis + op_context->input_dims - begin_count, + effective_dims)); + expanded_ellipsis = ellipsis_end_idx - ellipsis_start_idx - 1; + + // Set bit for effective_ellipsis_mask. + for (; i < ellipsis_end_idx; ++i) + { + effective_ellipsis_mask |= (1LL << i); + } + continue; + } + + if ((1LL << (i - expanded_ellipsis)) & op_context->params.new_axis_mask) + { + effective_new_axis_mask |= (1LL << i); + } + ++i; + } + + // Calculate effective_input_shape and its corresponding begin, end, strides. + loco::TensorShape input_shape = circle_shape(op_context->input); + int64_t added_ellipsis = 0, added_axises = 0; + op_context->effective_input_shape.rank(effective_dims); + + for (int64_t i = 0; i < effective_dims; ++i) + { + if ((1LL << i) & effective_ellipsis_mask) + { + // If ellipsis_mask, set the begin_mask and end_mask at that index. + added_ellipsis = std::max(int64_t(0), i - ellipsis_start_idx); + assert(i < 16); + op_params.begin_mask |= (1LL << i); + op_params.end_mask |= (1LL << i); + op_params.strides[i] = 1; + op_context->effective_input_shape.dim(i) = input_shape.dim(i - added_axises); + } + else if ((1LL << i) & effective_new_axis_mask) + { + // If new_axis_mask is set, it is equivalent to adding a new dim of 1 to + // input tensor. Store added shape to effective_input_shape. 
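+      // e.g. an input of shape (2, 3) with a new axis inserted at the front is
+      // treated as (1, 2, 3); the added dim is sliced as begin 0, stop 1,
+      // stride 1 by the parameters set below.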
+ op_params.start_indices[i] = 0; + op_params.stop_indices[i] = 1; + op_params.strides[i] = 1; + op_context->effective_input_shape.dim(i) = loco::Dimension(1); + added_axises++; + } + else if (i >= begin_count + expanded_ellipsis) + { + op_params.start_indices[i] = 0; + op_params.stop_indices[i] = 0; + op_params.strides[i] = 1; + assert(i < 16); + op_params.begin_mask |= (1LL << i); + op_params.end_mask |= (1LL << i); + op_context->effective_input_shape.dim(i) = input_shape.dim(i - added_axises); + } + else + { + const int64_t orig_idx = i - added_ellipsis; + op_params.start_indices[i] = op_context->begin->at(orig_idx); + op_params.stop_indices[i] = op_context->end->at(orig_idx); + op_params.strides[i] = op_context->strides->at(orig_idx); + if (op_context->params.begin_mask & (1LL << orig_idx)) + { + assert(i < 16); + op_params.begin_mask |= (1LL << i); + } + if (op_context->params.end_mask & (1LL << orig_idx)) + { + assert(i < 16); + op_params.end_mask |= (1LL << i); + } + if (op_context->params.shrink_axis_mask & (1LL << orig_idx)) + { + assert(i < 16); + op_params.shrink_axis_mask |= (1LL << i); + } + op_context->effective_input_shape.dim(i) = input_shape.dim(i - added_axises); + } + } + + // make sure no overflow + assert(static_cast(effective_dims) == static_cast(effective_dims)); + + op_params.start_indices_count = effective_dims; + op_params.stop_indices_count = effective_dims; + op_params.strides_count = effective_dims; + + return op_params; +} + +loco::TensorShape Algorithm::visit(const luci::CircleStridedSlice *node) +{ + loco::TensorShape output_shape; + + auto input_node = loco::must_cast(node->input()); + + auto begin_node = dynamic_cast(node->begin()); + auto end_node = dynamic_cast(node->end()); + auto strides_node = dynamic_cast(node->strides()); + // TODO support non-const case + if (begin_node == nullptr || end_node == nullptr || strides_node == nullptr) + { + INTERNAL_EXN("StridedSlice begin/end/strides nodes are not Constant"); + } + + LUCI_ASSERT(begin_node->dtype() == S32, "Only support S32 for begin_node"); + LUCI_ASSERT(end_node->dtype() == S32, "Only support S32 for end_node"); + LUCI_ASSERT(strides_node->dtype() == S32, "Only support S32 for strides_node"); + + LUCI_ASSERT(begin_node->rank() == 1, "Only support rank 1 for begin_node"); + LUCI_ASSERT(end_node->rank() == 1, "Only support rank 1 for end_node"); + LUCI_ASSERT(strides_node->rank() == 1, "Only support rank 1 for strides_node"); + + loco::TensorShape input_shape = circle_shape(input_node); + + assert(begin_node->size() <= input_shape.rank()); + assert(end_node->size() <= input_shape.rank()); + assert(strides_node->size() <= input_shape.rank()); + + StridedSliceContext op_context(node); + auto op_params = BuildStridedSliceParams(&op_context); + auto &effective_input_shape = op_context.effective_input_shape; + std::vector output_shape_vector; + + for (int32_t idx = effective_input_shape.rank() - 1; idx >= 0; --idx) + { + int32_t stride = op_params.strides[idx]; + LUCI_ASSERT(stride != 0, "stride value has to be non-zero"); + + int64_t begin = StartForAxis(op_params, effective_input_shape, idx); + int64_t end = StopForAxis(op_params, effective_input_shape, idx, begin); + + // When shrinking an axis, the end position does not matter (and can be + // incorrect when negative indexing is used, see Issue #19260). Always use + // begin + 1 to generate a length 1 slice, since begin has + // already been adjusted for negative indices by GetBeginValueAtIndex. 
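+    // e.g. for an input of shape (4, 5) with begin = {1} and
+    // shrink_axis_mask = 0b1, the first axis collapses to the single index 1
+    // and is dropped from the output, giving shape (5).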
+ const bool shrink_axis = op_params.shrink_axis_mask & (1 << idx); + if (shrink_axis) + { + end = begin + 1; + } + + // This is valid for both positive and negative strides + int64_t dim_shape = std::ceil((end - begin) / static_cast(stride)); + dim_shape = dim_shape < 0 ? 0 : dim_shape; + if (!shrink_axis) + { + output_shape_vector.push_back(dim_shape); + } + } + + auto shape_size = output_shape_vector.size(); + output_shape.rank(shape_size); + for (uint32_t idx = 0; idx < shape_size; ++idx) + { + int64_t dim = output_shape_vector.at(shape_size - 1u - idx); + LUCI_ASSERT(0 <= dim && dim < 0xfffffffL, "Dimension size exceeds limit"); + // reverse copy + output_shape.dim(idx) = static_cast(dim); + } + + return output_shape; +} + +} // namespace sinf + } // namespace luci diff --git a/compiler/luci/service/src/ShapeInfer_StridedSlice.cpp b/compiler/luci/service/src/ShapeInfer_StridedSlice.cpp deleted file mode 100644 index 5a22da3198c..00000000000 --- a/compiler/luci/service/src/ShapeInfer_StridedSlice.cpp +++ /dev/null @@ -1,444 +0,0 @@ -/* - * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved - * Copyright 2018 The TensorFlow Authors. All Rights Reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "ShapeInfer_StridedSlice.h" -#include "Check.h" -#include "CircleShapeInferenceHelper.h" - -#include -#include -#include -#include - -#include -#include -#include -#include - -// code referenced from -// https://github.com/tensorflow/tensorflow/blob/3f878cff5b698b82eea85db2b60d65a2e320850e/ -// tensorflow/lite/kernels/strided_slice.cc -// tensorflow/lite/kernels/internal/strided_slice_logic.h - -namespace -{ - -// This Op only supports 1-5D cases and since we use the reference 4D -// implementation, the 1-3D tensors are mapped to 4D. 
-const int kMaxDim = 5; - -const loco::DataType S32 = loco::DataType::S32; - -struct StridedSliceParams -{ - int8_t start_indices_count = 0; - int32_t start_indices[kMaxDim]; - int8_t stop_indices_count = 0; - int32_t stop_indices[kMaxDim]; - int8_t strides_count = 0; - int32_t strides[kMaxDim]; - - int16_t begin_mask = 0; - int16_t ellipsis_mask = 0; - int16_t end_mask = 0; - int16_t new_axis_mask = 0; - int16_t shrink_axis_mask = 0; -}; - -struct StridedSliceContext -{ - StridedSliceContext(const luci::CircleStridedSlice *node) - { - // check overflow issues - assert(static_cast(node->begin_mask()) == node->begin_mask()); - assert(static_cast(node->ellipsis_mask()) == node->ellipsis_mask()); - assert(static_cast(node->end_mask()) == node->end_mask()); - assert(static_cast(node->new_axis_mask()) == node->new_axis_mask()); - assert(static_cast(node->shrink_axis_mask()) == node->shrink_axis_mask()); - - params.begin_mask = node->begin_mask(); - params.ellipsis_mask = node->ellipsis_mask(); - params.end_mask = node->end_mask(); - params.new_axis_mask = node->new_axis_mask(); - params.shrink_axis_mask = node->shrink_axis_mask(); - - input = loco::must_cast(node->input()); - begin = loco::must_cast(node->begin()); - end = loco::must_cast(node->end()); - strides = loco::must_cast(node->strides()); - - loco::TensorShape input_shape = luci::shape_get(input).as(); - input_dims = input_shape.rank(); - } - StridedSliceParams params; - luci::CircleNode *input = nullptr; - luci::CircleConst *begin = nullptr; - luci::CircleConst *end = nullptr; - luci::CircleConst *strides = nullptr; - - // Equivalent input shape after adding axis according to new_axis_mask. - loco::TensorShape effective_input_shape; - int64_t input_dims = 0; -}; - -// Use until std::clamp() is available from C++17. -inline int Clamp(const int32_t v, const int32_t lo, const int32_t hi) -{ - LUCI_ASSERT(!(hi < lo), "Clamp hi < lo"); - if (hi < v) - return hi; - if (v < lo) - return lo; - return v; -} - -// Return the index for the first element along that axis. This index will be a -// positive integer between [0, axis_size - 1] that can be used to index -// directly into the data. -inline int64_t StartForAxis(const StridedSliceParams ¶ms, const loco::TensorShape &input_shape, - int64_t axis) -{ - const auto begin_mask = params.begin_mask; - const auto *start_indices = params.start_indices; - const auto *strides = params.strides; - const int64_t axis_size = static_cast(input_shape.dim(axis).value()); - if (axis_size == 0) - { - return 0; - } - // Begin with the specified index. - int64_t start = start_indices[axis]; - - // begin_mask override - if (begin_mask & (1LL << axis)) - { - if (strides[axis] > 0) - { - // Forward iteration - use the first element. These values will get - // clamped below (Note: We could have set them to 0 and axis_size-1, but - // use lowest() and max() to maintain symmetry with StopForAxis()) - start = std::numeric_limits::lowest(); - } - else - { - // Backward iteration - use the last element. - start = std::numeric_limits::max(); - } - } - - // Handle negative indices - if (start < 0) - { - start += axis_size; - } - - // Clamping - if (strides[axis] > 0) - { - // Forward iteration - start = Clamp(start, 0, axis_size); - } - else - { - // Backward iteration - start = Clamp(start, -1, axis_size - 1); - } - - return start; -} - -// Return the "real" index for the end of iteration along that axis. This is an -// "end" in the traditional C sense, in that it points to one past the last -// element. ie. 
So if you were iterating through all elements of a 1D array of -// size 4, this function would return 4 as the stop, because it is one past the -// "real" indices of 0, 1, 2 & 3. -inline int64_t StopForAxis(const StridedSliceParams ¶ms, const loco::TensorShape &input_shape, - int64_t axis, int64_t start_for_axis) -{ - const auto end_mask = params.end_mask; - const auto shrink_axis_mask = params.shrink_axis_mask; - const auto *stop_indices = params.stop_indices; - const auto *strides = params.strides; - const int64_t axis_size = static_cast(input_shape.dim(axis).value()); - if (axis_size == 0) - { - return 0; - } - - // Begin with the specified index - const bool shrink_axis = shrink_axis_mask & (1LL << axis); - int64_t stop = stop_indices[axis]; - - // When shrinking an axis, the end position does not matter (and can be - // incorrect when negative indexing is used, see Issue #19260). Always use - // start_for_axis + 1 to generate a length 1 slice, since start_for_axis has - // already been adjusted for negative indices. - if (shrink_axis) - { - return start_for_axis + 1; - } - - // end_mask override - if (end_mask & (1LL << axis)) - { - if (strides[axis] > 0) - { - // Forward iteration - use the last element. These values will get - // clamped below - stop = std::numeric_limits::max(); - } - else - { - // Backward iteration - use the first element. - stop = std::numeric_limits::lowest(); - } - } - - // Handle negative indices - if (stop < 0) - { - stop += axis_size; - } - - // Clamping - // Because the end index points one past the last element, we need slightly - // different clamping ranges depending on the direction. - if (strides[axis] > 0) - { - // Forward iteration - stop = Clamp(stop, 0, axis_size); - } - else - { - // Backward iteration - stop = Clamp(stop, -1, axis_size - 1); - } - - return stop; -} - -StridedSliceParams BuildStridedSliceParams(StridedSliceContext *op_context) -{ - StridedSliceParams op_params; - - // The ellipsis_mask and new_axis_mask in op_params are not used. Those masks - // are processed here to update begin_mask, end_mask and the index range. - op_params.begin_mask = 0; - op_params.ellipsis_mask = 0; - op_params.end_mask = 0; - op_params.new_axis_mask = 0; - op_params.shrink_axis_mask = 0; - - // Count indexes where the new_axis_mask is set but the ellipsis_mask is not. - loco::TensorShape begin_shape = luci::shape_get(op_context->begin).as(); - const int64_t begin_count = static_cast(begin_shape.dim(0).value()); - int64_t num_add_axis = 0; - for (int64_t i = 0; i < begin_count; ++i) - { - if (!((1LL << i) & op_context->params.ellipsis_mask) && - ((1LL << i) & op_context->params.new_axis_mask)) - { - num_add_axis++; - } - } - - // Calculate the dims of input after adding new axises. - const int64_t effective_dims = op_context->input_dims + num_add_axis; - - // If begin, end and strides are not fully provided, it means Ellipsis should - // be expanded to multiple dimensions (Ex: for spec [Ellipsis, 2] on a 3D - // input, the Ellipsis should be applied for the first 2 dimensions). Besides, - // If the new_axis_mask and the ellipsis_mask are set at the same index, the - // new_axis_mask will have no effect. 
- int64_t effective_ellipsis_mask = 0, effective_new_axis_mask = 0; - int64_t ellipsis_start_idx = effective_dims, expanded_ellipsis = 0; - for (int64_t i = 0; i < effective_dims;) - { - if ((1LL << i) & op_context->params.ellipsis_mask) - { - ellipsis_start_idx = i; - int64_t ellipsis_end_idx = - std::max(i + 1, std::min(i + 1 + num_add_axis + op_context->input_dims - begin_count, - effective_dims)); - expanded_ellipsis = ellipsis_end_idx - ellipsis_start_idx - 1; - - // Set bit for effective_ellipsis_mask. - for (; i < ellipsis_end_idx; ++i) - { - effective_ellipsis_mask |= (1LL << i); - } - continue; - } - - if ((1LL << (i - expanded_ellipsis)) & op_context->params.new_axis_mask) - { - effective_new_axis_mask |= (1LL << i); - } - ++i; - } - - // Calculate effective_input_shape and its corresponding begin, end, strides. - loco::TensorShape input_shape = luci::shape_get(op_context->input).as(); - int64_t added_ellipsis = 0, added_axises = 0; - op_context->effective_input_shape.rank(effective_dims); - - for (int64_t i = 0; i < effective_dims; ++i) - { - if ((1LL << i) & effective_ellipsis_mask) - { - // If ellipsis_mask, set the begin_mask and end_mask at that index. - added_ellipsis = std::max(int64_t(0), i - ellipsis_start_idx); - assert(i < 16); - op_params.begin_mask |= (1LL << i); - op_params.end_mask |= (1LL << i); - op_params.strides[i] = 1; - op_context->effective_input_shape.dim(i) = input_shape.dim(i - added_axises); - } - else if ((1LL << i) & effective_new_axis_mask) - { - // If new_axis_mask is set, it is equivalent to adding a new dim of 1 to - // input tensor. Store added shape to effective_input_shape. - op_params.start_indices[i] = 0; - op_params.stop_indices[i] = 1; - op_params.strides[i] = 1; - op_context->effective_input_shape.dim(i) = loco::Dimension(1); - added_axises++; - } - else if (i >= begin_count + expanded_ellipsis) - { - op_params.start_indices[i] = 0; - op_params.stop_indices[i] = 0; - op_params.strides[i] = 1; - assert(i < 16); - op_params.begin_mask |= (1LL << i); - op_params.end_mask |= (1LL << i); - op_context->effective_input_shape.dim(i) = input_shape.dim(i - added_axises); - } - else - { - const int64_t orig_idx = i - added_ellipsis; - op_params.start_indices[i] = op_context->begin->at(orig_idx); - op_params.stop_indices[i] = op_context->end->at(orig_idx); - op_params.strides[i] = op_context->strides->at(orig_idx); - if (op_context->params.begin_mask & (1LL << orig_idx)) - { - assert(i < 16); - op_params.begin_mask |= (1LL << i); - } - if (op_context->params.end_mask & (1LL << orig_idx)) - { - assert(i < 16); - op_params.end_mask |= (1LL << i); - } - if (op_context->params.shrink_axis_mask & (1LL << orig_idx)) - { - assert(i < 16); - op_params.shrink_axis_mask |= (1LL << i); - } - op_context->effective_input_shape.dim(i) = input_shape.dim(i - added_axises); - } - } - - // make sure no overflow - assert(static_cast(effective_dims) == static_cast(effective_dims)); - - op_params.start_indices_count = effective_dims; - op_params.stop_indices_count = effective_dims; - op_params.strides_count = effective_dims; - - return op_params; -} - -} // namespace - -namespace luci -{ - -loco::TensorShape infer_output_shape(const CircleStridedSlice *node) -{ - loco::TensorShape output_shape; - - auto input_node = loco::must_cast(node->input()); - - auto begin_node = dynamic_cast(node->begin()); - auto end_node = dynamic_cast(node->end()); - auto strides_node = dynamic_cast(node->strides()); - if (begin_node == nullptr || end_node == nullptr || strides_node == nullptr) - 
{ - INTERNAL_EXN("StridedSlice begin/end/strides nodes are not Constant"); - } - - LUCI_ASSERT(begin_node->dtype() == S32, "Only support S32 for begin_node"); - LUCI_ASSERT(end_node->dtype() == S32, "Only support S32 for end_node"); - LUCI_ASSERT(strides_node->dtype() == S32, "Only support S32 for strides_node"); - - LUCI_ASSERT(begin_node->rank() == 1, "Only support rank 1 for begin_node"); - LUCI_ASSERT(end_node->rank() == 1, "Only support rank 1 for end_node"); - LUCI_ASSERT(strides_node->rank() == 1, "Only support rank 1 for strides_node"); - - loco::TensorShape input_shape = luci::shape_get(input_node).as(); - - assert(begin_node->size() <= input_shape.rank()); - assert(end_node->size() <= input_shape.rank()); - assert(strides_node->size() <= input_shape.rank()); - - StridedSliceContext op_context(node); - auto op_params = BuildStridedSliceParams(&op_context); - auto &effective_input_shape = op_context.effective_input_shape; - std::vector output_shape_vector; - - for (int32_t idx = effective_input_shape.rank() - 1; idx >= 0; --idx) - { - int32_t stride = op_params.strides[idx]; - LUCI_ASSERT(stride != 0, "stride value has to be non-zero"); - - int64_t begin = StartForAxis(op_params, effective_input_shape, idx); - int64_t end = StopForAxis(op_params, effective_input_shape, idx, begin); - - // When shrinking an axis, the end position does not matter (and can be - // incorrect when negative indexing is used, see Issue #19260). Always use - // begin + 1 to generate a length 1 slice, since begin has - // already been adjusted for negative indices by GetBeginValueAtIndex. - const bool shrink_axis = op_params.shrink_axis_mask & (1 << idx); - if (shrink_axis) - { - end = begin + 1; - } - - // This is valid for both positive and negative strides - int64_t dim_shape = std::ceil((end - begin) / static_cast(stride)); - dim_shape = dim_shape < 0 ? 
0 : dim_shape; - if (!shrink_axis) - { - output_shape_vector.push_back(dim_shape); - } - } - - auto shape_size = output_shape_vector.size(); - output_shape.rank(shape_size); - for (uint32_t idx = 0; idx < shape_size; ++idx) - { - int64_t dim = output_shape_vector.at(shape_size - 1u - idx); - LUCI_ASSERT(0 <= dim && dim < 0xfffffffL, "Dimension size exceeds limit"); - // reverse copy - output_shape.dim(idx) = static_cast(dim); - } - - return output_shape; -} - -} // namespace luci diff --git a/compiler/mio-circle08/CMakeLists.txt b/compiler/mio-circle08/CMakeLists.txt index cee15c96993..345ada948bc 100644 --- a/compiler/mio-circle08/CMakeLists.txt +++ b/compiler/mio-circle08/CMakeLists.txt @@ -8,9 +8,7 @@ endif(NOT FlatBuffers_FOUND) message(STATUS "Build mio-circle08: TRUE") # TODO Find a better way -# TODO use nnpackage -# set(SCHEMA_FILE "${NNAS_PROJECT_SOURCE_DIR}/nnpackage/schema/circle_schema.fbs") -set(SCHEMA_FILE "${NNAS_PROJECT_SOURCE_DIR}/res/CircleSchema/0.8/circle_schema.fbs") +set(SCHEMA_FILE "${NNAS_PROJECT_SOURCE_DIR}/nnpackage/schema/circle_schema.fbs") # NOTE Copy circle_schema.fbs as schema.fbs to generate "schema_generated.fbs" instead of "circle_schema_generated.fbs" add_custom_command(OUTPUT "${CMAKE_CURRENT_BINARY_DIR}/schema.fbs" diff --git a/compiler/one-cmds/onelib/constant.py b/compiler/one-cmds/onelib/constant.py index 97b207488f3..b15ca653bdd 100644 --- a/compiler/one-cmds/onelib/constant.py +++ b/compiler/one-cmds/onelib/constant.py @@ -63,7 +63,7 @@ class CONSTANT: 'remove_redundant_reshape', 'remove_redundant_transpose', 'remove_unnecessary_add', - 'remove_unnecessary_cast' + 'remove_unnecessary_cast', 'remove_unnecessary_reshape', 'remove_unnecessary_slice', 'remove_unnecessary_strided_slice', diff --git a/compute/cker/include/cker/eigen/xent_op.h b/compute/cker/include/cker/eigen/xent_op.h new file mode 100644 index 00000000000..60d996cd94e --- /dev/null +++ b/compute/cker/include/cker/eigen/xent_op.h @@ -0,0 +1,151 @@ +/* + * Copyright (c) 2024 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright 2016 The TensorFlow Authors. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __NNFW_CKER_EIGEN_XENT_OPS_H__ +#define __NNFW_CKER_EIGEN_XENT_OPS_H__ + +// From tensorflow/core/kernels/xent_op.cc +#define EIGEN_USE_THREADS + +#include "unsupported/Eigen/CXX11/Tensor" +#include "cker/operation/Helper/Tensor.h" + +// From tensorflow/core/kernels/xent_op.h +namespace nnfw +{ +namespace cker +{ +namespace xent_ops +{ +namespace functor +{ + +// Functor used by XentOp to do the computations. +template struct XentFunctor +{ + // Computes Cross Entropy loss and backprop. + // + // logits: batch_size, num_classes. + // labels: batch_size, num_classes. + // scratch: temporary tensor, dims: batch_size, 1 + // loss: output tensor for the loss, dims: batch_size. + // backprop: output tensor for the backprop, dims: batch_size, num_classes. 
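+  //
+  // Example (illustrative): with batch_size = 2 and num_classes = 3, logits,
+  // labels and backprop are 2x3 matrices, scratch is 2x1 and loss has two
+  // entries; reduction_size scales the backprop (batch_size for
+  // SUM_OVER_BATCH_SIZE, 1 for SUM, as wired up by the caller in Loss.h).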
+ void operator()(const Device &d, const Eigen::DSizes &shape, + const Eigen::array &logits_bcast, + const Eigen::array &labels_bcast, + typename TTypes::ConstMatrix logits, typename TTypes::ConstMatrix labels, + typename TTypes::Matrix scratch, typename TTypes::Vec loss, + typename TTypes::Matrix backprop, T reduction_size); +}; + +} // namespace functor +} // namespace xent_ops +} // namespace cker +} // namespace nnfw + +// From tensorflow/core/kernels/xent_op.cc +namespace nnfw +{ +namespace cker +{ +namespace xent_ops +{ + +// Enable CPUDevice only for xent_ops +using CPUDevice = Eigen::ThreadPoolDevice; +using Index = Eigen::Index; + +// Partial specialization for a CPUDevice, that uses the Eigen implementation +// from XentEigenImpl. +namespace functor +{ +template struct XentFunctorBase +{ + void operator()(const Device &d, const Eigen::DSizes &shape, + const Eigen::array &logits_bcast, + const Eigen::array &labels_bcast, + typename TTypes::ConstMatrix logits, typename TTypes::ConstMatrix labels, + typename TTypes::Matrix scratch, typename TTypes::Vec loss, + typename TTypes::Matrix backprop, T reduction_size) + { + T *scratch_ptr = scratch.data(); + T *backprop_ptr = backprop.data(); + + T *loss_ptr = loss.data(); + + int row_size = shape[1]; + + if (shape[0] > 0) + { + backprop.device(d) = logits.broadcast(logits_bcast); + scratch.device(d) = labels.broadcast(labels_bcast); + auto reductionWorker = [&](int64_t begin, int64_t end) -> void { + for (int i = begin; i < end; i++) + { + T *this_backprop = backprop_ptr + (i * row_size); + T *this_logits = backprop_ptr + (i * row_size); + T *this_labels = scratch_ptr + (i * row_size); + T max_logits = this_logits[0]; + + // calculating max_logits + for (int j = 1; j < row_size; j++) + { + max_logits = std::max(max_logits, this_logits[j]); + } + + T sum = T(0); + T loss_sum = T(0); + + for (int j = 0; j < row_size; j++) + { + // Note that if input is reused than this_logits and this_backprop + // is same buffer, so after this calculation this_logits should no + // longer be trusted + this_backprop[j] = this_logits[j] - max_logits; + sum = sum + exp(this_backprop[j]); + } + + // loss calculation + T log_sum = log(sum); + for (int j = 0; j < row_size; j++) + { + loss_sum += this_labels[j] * (log_sum - this_backprop[j]); + this_backprop[j] = ((exp(this_backprop[j]) / sum) - this_labels[j]) / reduction_size; + } + loss_ptr[i] = loss_sum; + } + }; + const int64_t compute_cycles = 50 * row_size; + const int64_t input_bytes = sizeof(T) * row_size; + const int64_t output_bytes = sizeof(T) * row_size; + const Eigen::TensorOpCost cost(input_bytes, output_bytes, compute_cycles); + + d.parallelFor(shape[0], cost, reductionWorker); + } + } +}; + +template struct XentFunctor : XentFunctorBase +{ +}; + +} // namespace functor +} // namespace xent_ops +} // namespace cker +} // namespace nnfw + +#endif // __NNFW_CKER_EIGEN_XENT_OPS_H__ diff --git a/compute/cker/include/cker/operation/Helper/Tensor.h b/compute/cker/include/cker/operation/Helper/Tensor.h index ec29a15c3e2..5e670730431 100644 --- a/compute/cker/include/cker/operation/Helper/Tensor.h +++ b/compute/cker/include/cker/operation/Helper/Tensor.h @@ -157,6 +157,10 @@ struct Tensor { return typename TTypes::ConstScalar(base()); } + + template typename TTypes::Vec vec() { return shaped(); } + + template typename TTypes::Matrix matrix() { return shaped(); } }; // Tensor template Eigen::DSizes To32BitDims(const DSizes &in) diff --git a/compiler/luci/service/src/ShapeInfer_StridedSlice.h 
b/compute/cker/include/cker/train/Types.h similarity index 61% rename from compiler/luci/service/src/ShapeInfer_StridedSlice.h rename to compute/cker/include/cker/train/Types.h index fa800b72019..d873a900ab6 100644 --- a/compiler/luci/service/src/ShapeInfer_StridedSlice.h +++ b/compute/cker/include/cker/train/Types.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2024 Samsung Electronics Co., Ltd. All Rights Reserved * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -14,18 +14,24 @@ * limitations under the License. */ -#ifndef __SHAPE_INFER_STRIDED_SLICE_H__ -#define __SHAPE_INFER_STRIDED_SLICE_H__ +#ifndef __NNFW_CKER_TRAIN_TYPES_H__ +#define __NNFW_CKER_TRAIN_TYPES_H__ -#include - -#include - -namespace luci +namespace nnfw +{ +namespace cker +{ +namespace train { -loco::TensorShape infer_output_shape(const CircleStridedSlice *node); +enum class LossReductionType +{ + SUM_OVER_BATCH_SIZE, + SUM, +}; -} // namespace luci +} // namespace train +} // namespace cker +} // namespace nnfw -#endif // __SHAPE_INFER_STRIDED_SLICE_H__ +#endif // __NNFW_CKER_TYPES_H__ diff --git a/compute/cker/include/cker/train/operation/Loss.h b/compute/cker/include/cker/train/operation/Loss.h index 70f54ad59c0..fe9acb6d8a9 100644 --- a/compute/cker/include/cker/train/operation/Loss.h +++ b/compute/cker/include/cker/train/operation/Loss.h @@ -1,5 +1,6 @@ /* * Copyright (c) 2023 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright 2016 The TensorFlow Authors. All Rights Reserved. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -20,7 +21,11 @@ #include #include "cker/Shape.h" +#include "cker/eigen/EigenSupport.h" #include "cker/eigen/Utils.h" +#include "cker/eigen/xent_op.h" +#include "cker/operation/Helper/BCast.h" +#include "cker/train/Types.h" namespace nnfw { @@ -59,17 +64,38 @@ inline void MSE(const Shape &y_pred_shape, const T *y_pred_data, const Shape &y_ template inline void MSEGrad(const Shape &y_pred_shape, const T *y_pred_data, const Shape &y_true_shape, - const T *y_true_data, const Shape &grad_shape, T *grad_data) + const T *y_true_data, const Shape &grad_shape, T *grad_data, + LossReductionType reduction_type) { if (y_pred_shape != y_true_shape) throw std::runtime_error("cker::MSEGrad: y_pred_shape != y_true_shape"); if (y_pred_shape != grad_shape) throw std::runtime_error("cker::MSEGrad: y_pred_shape != grad_shape"); - const int size = grad_shape.FlatSize(); - for (int i = 0; i < size; ++i) + const int batch_size = grad_shape.Dims(0); + const auto flat_size = FlatSizeSkipDim(grad_shape, 0); + auto reduction_size = 1; + switch (reduction_type) { - grad_data[i] = static_cast(-2 * (y_true_data[i] - y_pred_data[i]) / size); + case LossReductionType::SUM_OVER_BATCH_SIZE: + reduction_size = batch_size * flat_size; + break; + case LossReductionType::SUM: + reduction_size = flat_size; + break; + default: + throw std::runtime_error("Unsupported reduction type"); + } + + for (int b = 0; b < batch_size; ++b) + { + for (int i = 0; i < flat_size; ++i) + { + const int offset = b * flat_size + i; + assert(offset >= 0); + grad_data[offset] = + static_cast(-2 * (y_true_data[offset] - y_pred_data[offset]) / reduction_size); + } } } @@ -97,7 +123,8 @@ inline void CategoricalCrossEntropy(const Shape &y_pred_shape, const T *y_pred_d template inline void 
@@ -97,7 +123,8 @@ inline void CategoricalCrossEntropy(const Shape &y_pred_shape, const T *y_pred_d

 template <typename T>
 inline void
 CategoricalCrossEntropyGrad(const Shape &y_pred_shape, const T *y_pred_data,
                             const Shape &y_true_shape, const T *y_true_data,
-                            const Shape &grad_shape, T *grad_data)
+                            const Shape &grad_shape, T *grad_data,
+                            LossReductionType reduction_type)
 {
   if (y_pred_shape != y_true_shape)
     throw std::runtime_error(
@@ -110,7 +137,103 @@ inline void CategoricalCrossEntropyGrad(const Shape &y_pred_shape, const T *y_pr
   const auto y_true = MapAsMatrixWithLastDimAsRows(y_true_data, y_true_shape);
   auto grad = MapAsMatrixWithLastDimAsRows(grad_data, grad_shape);

-  grad = -(y_true.array() / y_pred.array().cwiseMax(log_threshold<T>()));
+  const int32_t batch_size = grad_shape.Dims(0);
+  int32_t reduction_size = 1;
+  switch (reduction_type)
+  {
+    case LossReductionType::SUM_OVER_BATCH_SIZE:
+      reduction_size = batch_size;
+      break;
+    case LossReductionType::SUM:
+      reduction_size = 1;
+      break;
+    default:
+      throw std::runtime_error("Unsupported reduction type");
+  }
+  assert(reduction_size > 0);
+
+  grad = -(y_true.array() / y_pred.array().cwiseMax(log_threshold<T>())) /
+         static_cast<T>(reduction_size);
+}
+
+template <typename T>
+void CategoricalCrossEntropyWithLogits(const Shape &logits_shape, const T *logits_data,
+                                       const Shape &y_true_shape, const T *y_true_data,
+                                       const Shape &loss_out_shape, T *loss_out_data,
+                                       const Shape &grad_shape, T *grad_data,
+                                       LossReductionType reduction_type)
+{
+  // TODO Enable broadcast shapes
+  if (loss_out_shape.DimensionsCount() != 1)
+    throw std::runtime_error(
+      "cker::CategoricalCrossEntropyWithLogits: loss output dimension count should be 1");
+  if (logits_shape != y_true_shape)
+    throw std::runtime_error(
+      "cker::CategoricalCrossEntropyWithLogits: logits and y_true do not have the same shape");
+  if (loss_out_shape.Dims(0) != logits_shape.Dims(0))
+    throw std::runtime_error(
+      "cker::CategoricalCrossEntropyWithLogits: loss_out and logits do not have the same batch");
+  if (logits_shape != grad_shape)
+    throw std::runtime_error(
+      "cker::CategoricalCrossEntropyWithLogits: logits and grad do not have the same shape");
+
+  auto shape_in = logits_shape;
+
+  BCast bcast(BCast::FromShape(shape_in), BCast::FromShape(y_true_shape),
+              /*fewer_dims_optimization=*/false);
+
+  // loss is 1-D (one per example), and size is batch_size.
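+  // The five Tensor objects set up below are non-owning views: logits_in and
+  // labels_in alias the caller's input buffers, scratch is backed by the local
+  // scratch_vec, and loss_out/back_out write directly into the caller's
+  // loss/gradient output buffers.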
+
+  Tensor logits_in;
+  Tensor labels_in;
+  Tensor scratch;
+  Tensor loss_out;
+  Tensor back_out;
+
+  logits_in.shape.ReplaceWith(shape_in.DimensionsCount(), shape_in.DimsData());
+  logits_in.buffer = const_cast<T *>(logits_data);
+
+  labels_in.shape.ReplaceWith(y_true_shape.DimensionsCount(), y_true_shape.DimsData());
+  labels_in.buffer = const_cast<T *>(y_true_data);
+
+  scratch.shape.ReplaceWith(shape_in.DimensionsCount(), shape_in.DimsData());
+  std::vector<T> scratch_vec(shape_in.Dims(0) * shape_in.Dims(1), static_cast<T>(0));
+  scratch.buffer = scratch_vec.data();
+
+  Shape shape_loss_out{shape_in.Dims(0)};
+  loss_out.shape.ReplaceWith(shape_loss_out.DimensionsCount(), shape_loss_out.DimsData());
+  loss_out.buffer = loss_out_data;
+
+  back_out.shape.ReplaceWith(shape_in.DimensionsCount(), shape_in.DimsData());
+  back_out.buffer = grad_data;
+
+  if (shape_in.Dims(0) > 0)
+  {
+    const int32_t batch_size = grad_shape.Dims(0);
+    int32_t reduction_size = 1;
+    switch (reduction_type)
+    {
+      case LossReductionType::SUM_OVER_BATCH_SIZE:
+        reduction_size = batch_size;
+        break;
+      case LossReductionType::SUM:
+        reduction_size = 1;
+        break;
+      default:
+        throw std::runtime_error("Unsupported reduction type");
+    }
+    assert(reduction_size > 0);
+
+    const xent_ops::CPUDevice &device = *eigen_support::GetThreadPoolDevice();
+    xent_ops::functor::XentFunctor<xent_ops::CPUDevice, T> functor;
+    const Eigen::DSizes<Eigen::DenseIndex, 2> shape{shape_in.Dims(0), shape_in.Dims(1)};
+
+    functor(device, shape, BCast::ToIndexArray<2>(bcast.x_bcast()),
+            BCast::ToIndexArray<2>(bcast.y_bcast()),
+            logits_in.template shaped<const T, 2>(bcast.x_reshape()),
+            labels_in.template shaped<const T, 2>(bcast.y_reshape()), scratch.matrix<T>(),
+            loss_out.vec<T>(), back_out.matrix<T>(), static_cast<T>(reduction_size));
+  }
+}

 } // namespace train
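Note: a minimal call sketch for the new entry point (hypothetical data; the argument order follows the declaration above, and the tests below exercise the same path):

  // batch of 2 examples, 10 classes each; all buffers are caller-owned
  nnfw::cker::Shape in_shape{2, 10}; // logits / y_true / grad
  nnfw::cker::Shape out_shape{2};    // one loss value per example
  std::vector<float> logits(20), y_true(20), loss(2), grad(20);
  // ... fill logits and one-hot y_true ...
  nnfw::cker::train::CategoricalCrossEntropyWithLogits(
    in_shape, logits.data(), in_shape, y_true.data(), out_shape, loss.data(),
    in_shape, grad.data(), nnfw::cker::train::LossReductionType::SUM_OVER_BATCH_SIZE);

On return, loss holds the per-example cross entropy and grad holds (softmax(logits) - y_true) / reduction_size, with reduction_size equal to the batch size here.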
diff --git a/compute/cker/src/train/Loss.test.cc b/compute/cker/src/train/Loss.test.cc
index 034d07ce2af..ff894a5a74f 100644
--- a/compute/cker/src/train/Loss.test.cc
+++ b/compute/cker/src/train/Loss.test.cc
@@ -48,7 +48,7 @@ template <typename T> class LossCCEVerifier
   // because it involves calculations such as log or exp.
   for (int i = 0; i < output.size(); ++i)
   {
-    EXPECT_NEAR(output[i], expected[i], 1e-3f);
+    EXPECT_NEAR(output[i], expected[i], 1e-4f);
   }
 }

@@ -66,7 +66,8 @@ template <typename T> class LossCCEVerifier
   }

   void verifyBackward(const std::vector<T> &y_pred, const std::vector<T> &y_true,
-                      const std::vector<T> &expected)
+                      const std::vector<T> &expected,
+                      nnfw::cker::train::LossReductionType reduction)
   {
     assert(y_pred.size() == y_true.size());

@@ -74,20 +75,46 @@ template <typename T> class LossCCEVerifier
     const int N = _in_shape.Dims(0);
     const int D = _in_shape.FlatSize() / N;

-    nnfw::cker::train::CategoricalCrossEntropyGrad(_in_shape, y_pred.data(), _in_shape,
-                                                   y_true.data(), _out_shape, output.data());
+    nnfw::cker::train::CategoricalCrossEntropyGrad(
+      _in_shape, y_pred.data(), _in_shape, y_true.data(), _out_shape, output.data(), reduction);

     // Don't panic if it fails after the kernel implementation or input is changed.
     // The CrossEntropy gradient formula can be calculated slightly differently depending on the
     // environment because it involves calculations such as log or exp.
     for (int i = 0; i < output.size(); ++i)
     {
-      EXPECT_NEAR(output[i], expected[i], 1e-3f);
+      EXPECT_NEAR(output[i], expected[i], 1e-4f);
     }
   }
+
+  void verifyBackwardWithLogits(const std::vector<T> &logits, const std::vector<T> &y_true,
+                                const std::vector<T> &expected_loss_out,
+                                const std::vector<T> &expected_grad,
+                                nnfw::cker::train::LossReductionType reduction)
+  {
+    assert(logits.size() == y_true.size());
+    assert(logits.size() == expected_grad.size());
+
+    std::vector<T> loss_out(_out_shape.FlatSize());
+    std::vector<T> grad(_in_shape.FlatSize());
+
+    nnfw::cker::train::CategoricalCrossEntropyWithLogits(_in_shape, logits.data(), _in_shape,
+                                                         y_true.data(), _out_shape, loss_out.data(),
+                                                         _in_shape, grad.data(), reduction);
+
+    for (int i = 0; i < loss_out.size(); ++i)
+    {
+      EXPECT_NEAR(loss_out[i], expected_loss_out[i], 1e-4f);
+    }
+
+    for (int i = 0; i < grad.size(); ++i)
+    {
+      EXPECT_NEAR(grad[i], expected_grad[i], 1e-4f);
+    }
+  }

   void throwBackward(const std::vector<T> &y_pred, const std::vector<T> &y_true,
-                     const std::vector<T> &expected)
+                     const std::vector<T> &expected, nnfw::cker::train::LossReductionType reduction)
   {
     assert(y_pred.size() == y_true.size());

@@ -96,7 +123,23 @@ template <typename T> class LossCCEVerifier
     const int D = _in_shape.FlatSize() / N;

     EXPECT_ANY_THROW(nnfw::cker::train::CategoricalCrossEntropyGrad(
-      _in_shape, y_pred.data(), _in_shape, y_true.data(), _out_shape, output.data()));
+      _in_shape, y_pred.data(), _in_shape, y_true.data(), _out_shape, output.data(), reduction));
+  }
+
+  void throwBackwardWithLogits(const std::vector<T> &logits, const std::vector<T> &y_true,
+                               const std::vector<T> &expected_loss_out,
+                               const std::vector<T> &expected_grad,
+                               nnfw::cker::train::LossReductionType reduction)
+  {
+    assert(logits.size() == y_true.size());
+    assert(logits.size() == expected_grad.size());
+
+    std::vector<T> loss_out(_out_shape.FlatSize());
+    std::vector<T> grad(_in_shape.FlatSize());
+
+    EXPECT_ANY_THROW(nnfw::cker::train::CategoricalCrossEntropyWithLogits(
+      _in_shape, logits.data(), _in_shape, y_true.data(), _out_shape, loss_out.data(), _in_shape,
+      grad.data(), reduction));
   }

 private:
@@ -221,7 +264,8 @@ TEST(CKer_Operation, LossMSEGrad)
   std::vector<float> expected = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0};

   nnfw::cker::train::MSEGrad(nnfw::cker::Shape{1, 10}, y_pred.data(), nnfw::cker::Shape{1, 10},
-                             y_true.data(), nnfw::cker::Shape{1, 10}, deriv_y_pred.data());
+                             y_true.data(), nnfw::cker::Shape{1, 10}, deriv_y_pred.data(),
+                             nnfw::cker::train::LossReductionType::SUM_OVER_BATCH_SIZE);

   for (size_t i = 0; i < deriv_y_pred.size(); ++i)
     EXPECT_EQ(deriv_y_pred[i], expected[i]);

@@ -235,21 +279,38 @@ TEST(CKer_Operation, LossMSEGrad)
   std::vector<float> expected = {0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2};

   nnfw::cker::train::MSEGrad(nnfw::cker::Shape{1, 10}, y_pred.data(), nnfw::cker::Shape{1, 10},
-                             y_true.data(), nnfw::cker::Shape{1, 10}, deriv_y_pred.data());
+                             y_true.data(), nnfw::cker::Shape{1, 10}, deriv_y_pred.data(),
+                             nnfw::cker::train::LossReductionType::SUM_OVER_BATCH_SIZE);

   for (size_t i = 0; i < deriv_y_pred.size(); ++i)
     EXPECT_FLOAT_EQ(deriv_y_pred[i], expected[i]);
 }

 {
-  // Shape: {2, 3} -> m_rows:3, m_cols:2
+  // Shape: {2, 3} -> m_rows:3, m_cols:2, LossReductionType::SUM_OVER_BATCH_SIZE
   std::vector<float> y_pred = {27.2, 31.8, 51.9, 10.2, 34.2, 12.4};
   std::vector<float> y_true = {31.3, 40.3, 29.7, 12.9, 25.8, 11.9};
   std::vector<float> deriv_y_pred(6);
   std::vector<float> expected = {-1.3666667, -2.8333333, 7.4, -0.9, 2.8, 0.1666667};

   nnfw::cker::train::MSEGrad(nnfw::cker::Shape{2, 3}, y_pred.data(), nnfw::cker::Shape{2, 3},
-                             y_true.data(), nnfw::cker::Shape{2, 3}, deriv_y_pred.data());
+                             y_true.data(), nnfw::cker::Shape{2, 3}, deriv_y_pred.data(),
+                             nnfw::cker::train::LossReductionType::SUM_OVER_BATCH_SIZE);
+
+  for (size_t i = 0; i < deriv_y_pred.size(); ++i)
+    EXPECT_FLOAT_EQ(deriv_y_pred[i], expected[i]);
+}
+
+{
+  // Shape: {2, 3} -> m_rows:3, m_cols:2, LossReductionType::SUM
+  std::vector<float> y_pred = {27.2, 31.8, 51.9, 10.2, 34.2, 12.4};
+  std::vector<float> y_true = {31.3, 40.3, 29.7, 12.9, 25.8, 11.9};
+  std::vector<float> deriv_y_pred(6);
+  std::vector<float> expected = {-2.7333324, -5.6666665, 14.8, -1.7999998, 5.6, 0.33333334};
+
+  nnfw::cker::train::MSEGrad(nnfw::cker::Shape{2, 3}, y_pred.data(), nnfw::cker::Shape{2, 3},
+                             y_true.data(), nnfw::cker::Shape{2, 3}, deriv_y_pred.data(),
+                             nnfw::cker::train::LossReductionType::SUM);

   for (size_t i = 0; i < deriv_y_pred.size(); ++i)
     EXPECT_FLOAT_EQ(deriv_y_pred[i], expected[i]);
@@ -266,7 +327,8 @@ TEST(CKer_Operation, neg_LossMSEGrad)
   std::vector<float> expected = {1., 1., 1., 1., 1., 1.};

   nnfw::cker::train::MSEGrad(nnfw::cker::Shape{2, 3}, y_pred.data(), nnfw::cker::Shape{2, 3},
-                             y_true.data(), nnfw::cker::Shape{2, 3}, deriv_y_pred.data());
+                             y_true.data(), nnfw::cker::Shape{2, 3}, deriv_y_pred.data(),
+                             nnfw::cker::train::LossReductionType::SUM_OVER_BATCH_SIZE);

   for (size_t i = 0; i < deriv_y_pred.size(); ++i)
     EXPECT_NE(deriv_y_pred[i], expected[i]);
@@ -278,9 +340,10 @@ TEST(CKer_Operation, neg_LossMSEGrad)
   std::vector<float> y_true = {0., 1., 2., 3., 4., 5.};
   std::vector<float> deriv_y_pred(10);

-  EXPECT_ANY_THROW(nnfw::cker::train::MSEGrad(nnfw::cker::Shape{1, 10}, y_pred.data(),
-                                              nnfw::cker::Shape{2, 3}, y_true.data(),
-                                              nnfw::cker::Shape{1, 10}, deriv_y_pred.data()));
+  EXPECT_ANY_THROW(
+    nnfw::cker::train::MSEGrad(nnfw::cker::Shape{1, 10}, y_pred.data(), nnfw::cker::Shape{2, 3},
+                               y_true.data(), nnfw::cker::Shape{1, 10}, deriv_y_pred.data(),
+                               nnfw::cker::train::LossReductionType::SUM_OVER_BATCH_SIZE));
 }

 {
@@ -289,9 +352,10 @@ TEST(CKer_Operation, neg_LossMSEGrad)
   std::vector<float> y_true = {0., 1., 2., 3., 4., 5., 6., 7., 8., 9.};
   std::vector<float> deriv_y_pred(6);

-  EXPECT_ANY_THROW(nnfw::cker::train::MSEGrad(nnfw::cker::Shape{1, 10}, y_pred.data(),
-                                              nnfw::cker::Shape{1, 10}, y_true.data(),
-                                              nnfw::cker::Shape{2, 3}, deriv_y_pred.data()));
+  EXPECT_ANY_THROW(
+    nnfw::cker::train::MSEGrad(nnfw::cker::Shape{1, 10}, y_pred.data(), nnfw::cker::Shape{1, 10},
+                               y_true.data(), nnfw::cker::Shape{2, 3}, deriv_y_pred.data(),
+                               nnfw::cker::train::LossReductionType::SUM_OVER_BATCH_SIZE));
 }
}

@@ -354,7 +418,7 @@ TEST(CKer_Operation, LossCategoricalCrossEntropyGrad)
   std::vector<float> expected = {0, 0, 0, 0, 0, 0, 0, 0, 0, -16.66666667};

   LossCCEVerifier<float> verifier(in_shape, grad_shape);
-  verifier.verifyBackward(y_pred, y_true, expected);
+  verifier.verifyBackward(y_pred, y_true, expected, nnfw::cker::train::LossReductionType::SUM);
 }

 {
@@ -368,7 +432,65 @@ TEST(CKer_Operation, LossCategoricalCrossEntropyGrad)
   0, 0, 0, 0, 0, 0, 0, 0, 0};

   LossCCEVerifier<float> verifier(in_shape, grad_shape);
-  verifier.verifyBackward(y_pred, y_true, expected);
+  verifier.verifyBackward(y_pred, y_true, expected, nnfw::cker::train::LossReductionType::SUM);
+}
+
+{
+  nnfw::cker::Shape in_shape{2, 10};
+  nnfw::cker::Shape grad_shape{2, 10};
+  std::vector<float> y_pred = {0.01, 0.03, 0.05, 0.35, 0.04, 0.05, 0.28, 0.09, 0.04, 0.06,
+                               0.89, 0.03, 0.04, 0.005, 0.023, 0.001, 0.004, 0.005, 0.001, 0.001};
+  std::vector<float> y_true = {0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0};
+  std::vector<float> expected = {0, 0, 0, 0, 0, 0, 0, 0, 0, -8.333333, -0.561797738,
+                                 0, 0, 0, 0, 0, 0, 0, 0, 0};
+
+  LossCCEVerifier<float> verifier(in_shape, grad_shape);
+  verifier.verifyBackward(y_pred, y_true, expected,
+                          nnfw::cker::train::LossReductionType::SUM_OVER_BATCH_SIZE);
+}
+
+{
+  nnfw::cker::Shape in_shape{1, 10};
+  nnfw::cker::Shape out_shape{1};
+
+  std::vector<float> logits = {1, 3, 5, 35, 4, 5, 28, 9, 4, 6};
+  std::vector<float> y_true = {0, 0, 0, 0, 0, 0, 0, 0, 0, 1};
+  std::vector<float> expected_loss_out = {29.0009};
+  std::vector<float> expected_grad = {0, 0, 0, 0.9991, 0, 0, 0.0009, 0, 0, -1};
+
+  LossCCEVerifier<float> verifier(in_shape, out_shape);
+  verifier.verifyBackwardWithLogits(logits, y_true, expected_loss_out, expected_grad,
+                                    nnfw::cker::train::LossReductionType::SUM);
+}
+
+{
+  nnfw::cker::Shape in_shape{2, 10};
+  nnfw::cker::Shape out_shape{2};
+
+  std::vector<float> logits = {1, 3, 5, 35, 4, 5, 28, 9, 4, 6, 89, 3, 4, 5, 23, 1, 4, 5, 1, 101};
+  std::vector<float> y_true = {0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0};
+  std::vector<float> expected_loss_out = {29.0009, 12};
+  std::vector<float> expected_grad = {0, 0, 0, 0.9991, 0, 0, 0.0009, 0, 0, -1,
+                                      -1, 0, 0, 0, 0, 0, 0, 0, 0, 1};
+
+  LossCCEVerifier<float> verifier(in_shape, out_shape);
+  verifier.verifyBackwardWithLogits(logits, y_true, expected_loss_out, expected_grad,
+                                    nnfw::cker::train::LossReductionType::SUM);
+}
+
+{
+  nnfw::cker::Shape in_shape{2, 10};
+  nnfw::cker::Shape out_shape{2};
+
+  std::vector<float> logits = {1, 3, 5, 35, 4, 5, 28, 9, 4, 6, 89, 3, 4, 5, 23, 1, 4, 5, 1, 101};
+  std::vector<float> y_true = {0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0};
+  std::vector<float> expected_loss_out = {29.0009, 12};
+  std::vector<float> expected_grad = {0, 0, 0, 0.4995, 0, 0, 0.0005, 0, 0, -0.5,
+                                      -0.5, 0, 0, 0, 0, 0, 0, 0, 0, 0.5};
+
+  LossCCEVerifier<float> verifier(in_shape, out_shape);
+  verifier.verifyBackwardWithLogits(logits, y_true, expected_loss_out, expected_grad,
+                                    nnfw::cker::train::LossReductionType::SUM_OVER_BATCH_SIZE);
 }
}

@@ -384,6 +506,24 @@ TEST(CKer_Operation, neg_LossCategoricalCrossEntropyGrad)
   std::vector<float> expected = {0, 0, 0, 0, 0, 0, 0, 0, 0, -16.66666667};

   LossCCEVerifier<float> verifier(in_shape, grad_shape);
-  verifier.throwBackward(y_pred, y_true, expected);
+  verifier.throwBackward(y_pred, y_true, expected, nnfw::cker::train::LossReductionType::SUM);
+  }
+}
+
+TEST(CKer_Operation, neg_LossCategoricalCrossEntropyWithLogits)
+{
+  // Invalid out shape
+  {
+    nnfw::cker::Shape in_shape{1, 10};
+    nnfw::cker::Shape out_shape{1, 1};
+
+    std::vector<float> logits = {1, 3, 5, 35, 4, 5, 28, 9, 4, 6};
+    std::vector<float> y_true = {0, 0, 0, 0, 0, 0, 0, 0, 0, 1};
+    std::vector<float> expected_loss_out = {29.0009};
+    std::vector<float> expected_grad = {0, 0, 0, 0.9991, 0, 0, 0.0009, 0, 0, -1};
+
+    LossCCEVerifier<float> verifier(in_shape, out_shape);
+    verifier.throwBackwardWithLogits(logits, y_true, expected_loss_out, expected_grad,
+                                     nnfw::cker::train::LossReductionType::SUM);
  }
}
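Note: the expected values in the WithLogits cases can be checked by hand from the xent math. For the row {1, 3, 5, 35, 4, 5, 28, 9, 4, 6} with the true class at index 9: max = 35, sum = e^0 + e^-7 + (negligible terms) ≈ 1.000912, so loss = log(sum) - (6 - 35) ≈ 29.0009; the softmax is ≈ 0.9991 at index 3 and ≈ 0.0009 at index 6, so grad = softmax - y_true is -1 at index 9 under SUM. Under SUM_OVER_BATCH_SIZE with batch 2, every gradient entry is halved, which is exactly the difference between the last two cases above.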
"${FlatBuffersSource_DIR}/include") set_property(TARGET flatbuffers-23.5.26 PROPERTY POSITION_INDEPENDENT_CODE ON) target_compile_options(flatbuffers-23.5.26 PUBLIC $<$:-Wno-sign-compare>) diff --git a/nnpackage/schema/circle_schema.fbs b/nnpackage/schema/circle_schema.fbs index 515b314e280..0498318bfce 100644 --- a/nnpackage/schema/circle_schema.fbs +++ b/nnpackage/schema/circle_schema.fbs @@ -286,6 +286,7 @@ table Tensor { // set of acceptable options. // LINT.IfChange enum BuiltinOperator : int32 { + RMS_NORM = -6, GRU = -5, BCQ_GATHER = -4, BCQ_FULLY_CONNECTED = -3, @@ -635,6 +636,7 @@ union BuiltinOptions { BitcastOptions, BitwiseXorOptions, RightShiftOptions, + RmsNormOptions = 250, GRUOptions = 251, BCQGatherOptions = 252, BCQFullyConnectedOptions = 253, @@ -1519,6 +1521,10 @@ table InstanceNormOptions { fused_activation_function:ActivationFunctionType; } +table RmsNormOptions { + epsilon:float; +} + // An OperatorCode can be an enum value (BuiltinOperator) if the operator is a // builtin, or a string if the operator is custom. table OperatorCode { diff --git a/onert-micro/onert-micro/include/execute/OMUtils.h b/onert-micro/onert-micro/include/execute/OMUtils.h index b45feb08953..109dbea23fa 100644 --- a/onert-micro/onert-micro/include/execute/OMUtils.h +++ b/onert-micro/onert-micro/include/execute/OMUtils.h @@ -163,6 +163,10 @@ void calculateQuantParams(core::ArithmeticQuantParams ¶ms, const circle::Ten OMStatus SISOHeader(const OMExecuteArgs &execute_args, const circle::Tensor **input, const circle::Tensor **output, uint8_t **input_data, uint8_t **output_data); +OMStatus TISOHeader(const OMExecuteArgs &execute_args, const circle::Tensor **input1, + const circle::Tensor **input2, const circle::Tensor **output, + OMRuntimeKernel *runtime_kernel); + } // namespace execute } // namespace onert_micro diff --git a/onert-micro/onert-micro/src/execute/OMUtils.cpp b/onert-micro/onert-micro/src/execute/OMUtils.cpp index 9bda002018c..1fca5d95331 100644 --- a/onert-micro/onert-micro/src/execute/OMUtils.cpp +++ b/onert-micro/onert-micro/src/execute/OMUtils.cpp @@ -236,3 +236,32 @@ void onert_micro::execute::calculateQuantParams(core::ArithmeticQuantParams &par ¶ms.quantized_activation_min, ¶ms.quantized_activation_max); } + +OMStatus onert_micro::execute::TISOHeader(const OMExecuteArgs &execute_args, + const circle::Tensor **input1, + const circle::Tensor **input2, + const circle::Tensor **output, + OMRuntimeKernel *runtime_kernel) +{ + OMStatus status; + + core::OMRuntimeContext &runtime_context = execute_args.runtime_context; + core::OMRuntimeStorage &runtime_storage = execute_args.runtime_storage; + uint16_t op_index = execute_args.kernel_index; + + status = runtime_kernel->readKernel(op_index, runtime_context); + + *input1 = runtime_kernel->inputs[0]; + *input2 = runtime_kernel->inputs[1]; + *output = runtime_kernel->outputs[0]; + + assert(*input1 != nullptr); + assert(*input2 != nullptr); + assert(*output != nullptr); + + status = runtime_kernel->getDataFromStorage(op_index, runtime_storage, runtime_context); + if (status != Ok) + return status; + + return status; +} diff --git a/onert-micro/onert-micro/src/execute/kernels/Equal.cpp b/onert-micro/onert-micro/src/execute/kernels/Equal.cpp index 0ff8ca8443d..4c7a6177176 100644 --- a/onert-micro/onert-micro/src/execute/kernels/Equal.cpp +++ b/onert-micro/onert-micro/src/execute/kernels/Equal.cpp @@ -21,6 +21,8 @@ #include "execute/kernels/ComparisonCommon.h" #include "PALComparisons.h" +#include "execute/OMUtils.h" + using namespace 
onert_micro; using namespace onert_micro::core; using namespace onert_micro::execute; @@ -36,10 +38,6 @@ constexpr uint32_t outputTensorIdx = 0; OMStatus onert_micro::execute::execute_kernel_CircleEqual(const OMExecuteArgs &execute_args) { - core::OMRuntimeContext &runtime_context = execute_args.runtime_context; - core::OMRuntimeStorage &runtime_storage = execute_args.runtime_storage; - uint16_t op_index = execute_args.kernel_index; - OMStatus status = Ok; const circle::Tensor *input1 = nullptr; @@ -47,19 +45,8 @@ OMStatus onert_micro::execute::execute_kernel_CircleEqual(const OMExecuteArgs &e const circle::Tensor *output = nullptr; OMRuntimeKernel runtime_kernel; - runtime_kernel.readKernel(op_index, runtime_context); - - status = runtime_kernel.getDataFromStorage(op_index, runtime_storage, runtime_context); - if (status != Ok) - return status; - - input1 = runtime_kernel.inputs[input1TensorIdx]; - input2 = runtime_kernel.inputs[input2TensorIdx]; - output = runtime_kernel.outputs[outputTensorIdx]; - assert(input1 != nullptr); - assert(input2 != nullptr); - assert(output != nullptr); + TISOHeader(execute_args, &input1, &input2, &output, &runtime_kernel); switch (input1->type()) { diff --git a/onert-micro/onert-micro/src/execute/kernels/Greater.cpp b/onert-micro/onert-micro/src/execute/kernels/Greater.cpp index 5f95c0e0a79..82feefc148f 100644 --- a/onert-micro/onert-micro/src/execute/kernels/Greater.cpp +++ b/onert-micro/onert-micro/src/execute/kernels/Greater.cpp @@ -21,6 +21,8 @@ #include "execute/kernels/ComparisonCommon.h" #include "PALComparisons.h" +#include "execute/OMUtils.h" + using namespace onert_micro; using namespace onert_micro::core; using namespace onert_micro::execute; @@ -36,10 +38,6 @@ constexpr uint32_t outputTensorIdx = 0; OMStatus onert_micro::execute::execute_kernel_CircleGreater(const OMExecuteArgs &execute_args) { - core::OMRuntimeContext &runtime_context = execute_args.runtime_context; - core::OMRuntimeStorage &runtime_storage = execute_args.runtime_storage; - uint16_t op_index = execute_args.kernel_index; - OMStatus status = Ok; const circle::Tensor *input1 = nullptr; @@ -47,19 +45,8 @@ OMStatus onert_micro::execute::execute_kernel_CircleGreater(const OMExecuteArgs const circle::Tensor *output = nullptr; OMRuntimeKernel runtime_kernel; - runtime_kernel.readKernel(op_index, runtime_context); - - status = runtime_kernel.getDataFromStorage(op_index, runtime_storage, runtime_context); - if (status != Ok) - return status; - - input1 = runtime_kernel.inputs[input1TensorIdx]; - input2 = runtime_kernel.inputs[input2TensorIdx]; - output = runtime_kernel.outputs[outputTensorIdx]; - assert(input1 != nullptr); - assert(input2 != nullptr); - assert(output != nullptr); + TISOHeader(execute_args, &input1, &input2, &output, &runtime_kernel); switch (input1->type()) { diff --git a/onert-micro/onert-micro/src/execute/kernels/GreaterEqual.cpp b/onert-micro/onert-micro/src/execute/kernels/GreaterEqual.cpp index 325e332f838..70255d919f2 100644 --- a/onert-micro/onert-micro/src/execute/kernels/GreaterEqual.cpp +++ b/onert-micro/onert-micro/src/execute/kernels/GreaterEqual.cpp @@ -21,6 +21,8 @@ #include "execute/kernels/ComparisonCommon.h" #include "PALComparisons.h" +#include "execute/OMUtils.h" + using namespace onert_micro; using namespace onert_micro::core; using namespace onert_micro::execute; @@ -36,10 +38,6 @@ constexpr uint32_t outputTensorIdx = 0; OMStatus onert_micro::execute::execute_kernel_CircleGreaterEqual(const OMExecuteArgs &execute_args) { - core::OMRuntimeContext 
&runtime_context = execute_args.runtime_context; - core::OMRuntimeStorage &runtime_storage = execute_args.runtime_storage; - uint16_t op_index = execute_args.kernel_index; - OMStatus status = Ok; const circle::Tensor *input1 = nullptr; @@ -47,19 +45,8 @@ OMStatus onert_micro::execute::execute_kernel_CircleGreaterEqual(const OMExecute const circle::Tensor *output = nullptr; OMRuntimeKernel runtime_kernel; - runtime_kernel.readKernel(op_index, runtime_context); - - status = runtime_kernel.getDataFromStorage(op_index, runtime_storage, runtime_context); - if (status != Ok) - return status; - - input1 = runtime_kernel.inputs[input1TensorIdx]; - input2 = runtime_kernel.inputs[input2TensorIdx]; - output = runtime_kernel.outputs[outputTensorIdx]; - assert(input1 != nullptr); - assert(input2 != nullptr); - assert(output != nullptr); + TISOHeader(execute_args, &input1, &input2, &output, &runtime_kernel); switch (input1->type()) { diff --git a/onert-micro/onert-micro/src/execute/kernels/Less.cpp b/onert-micro/onert-micro/src/execute/kernels/Less.cpp index b815849cf6f..a7a3b2f9d9b 100644 --- a/onert-micro/onert-micro/src/execute/kernels/Less.cpp +++ b/onert-micro/onert-micro/src/execute/kernels/Less.cpp @@ -21,6 +21,8 @@ #include "execute/kernels/ComparisonCommon.h" #include "PALComparisons.h" +#include "execute/OMUtils.h" + using namespace onert_micro; using namespace onert_micro::core; using namespace onert_micro::execute; @@ -36,10 +38,6 @@ constexpr uint32_t outputTensorIdx = 0; OMStatus onert_micro::execute::execute_kernel_CircleLess(const OMExecuteArgs &execute_args) { - core::OMRuntimeContext &runtime_context = execute_args.runtime_context; - core::OMRuntimeStorage &runtime_storage = execute_args.runtime_storage; - uint16_t op_index = execute_args.kernel_index; - OMStatus status = Ok; const circle::Tensor *input1 = nullptr; @@ -47,19 +45,8 @@ OMStatus onert_micro::execute::execute_kernel_CircleLess(const OMExecuteArgs &ex const circle::Tensor *output = nullptr; OMRuntimeKernel runtime_kernel; - runtime_kernel.readKernel(op_index, runtime_context); - - status = runtime_kernel.getDataFromStorage(op_index, runtime_storage, runtime_context); - if (status != Ok) - return status; - - input1 = runtime_kernel.inputs[input1TensorIdx]; - input2 = runtime_kernel.inputs[input2TensorIdx]; - output = runtime_kernel.outputs[outputTensorIdx]; - assert(input1 != nullptr); - assert(input2 != nullptr); - assert(output != nullptr); + TISOHeader(execute_args, &input1, &input2, &output, &runtime_kernel); switch (input1->type()) { diff --git a/onert-micro/onert-micro/src/execute/kernels/NotEqual.cpp b/onert-micro/onert-micro/src/execute/kernels/NotEqual.cpp index 5591c3eb76c..b2c8bc97ad2 100644 --- a/onert-micro/onert-micro/src/execute/kernels/NotEqual.cpp +++ b/onert-micro/onert-micro/src/execute/kernels/NotEqual.cpp @@ -21,6 +21,8 @@ #include "execute/kernels/ComparisonCommon.h" #include "PALComparisons.h" +#include "execute/OMUtils.h" + using namespace onert_micro; using namespace onert_micro::core; using namespace onert_micro::execute; @@ -36,10 +38,6 @@ constexpr uint32_t outputTensorIdx = 0; OMStatus onert_micro::execute::execute_kernel_CircleNotEqual(const OMExecuteArgs &execute_args) { - core::OMRuntimeContext &runtime_context = execute_args.runtime_context; - core::OMRuntimeStorage &runtime_storage = execute_args.runtime_storage; - uint16_t op_index = execute_args.kernel_index; - OMStatus status = Ok; const circle::Tensor *input1 = nullptr; @@ -47,19 +45,8 @@ OMStatus 
onert_micro::execute::execute_kernel_CircleNotEqual(const OMExecuteArgs const circle::Tensor *output = nullptr; OMRuntimeKernel runtime_kernel; - runtime_kernel.readKernel(op_index, runtime_context); - - status = runtime_kernel.getDataFromStorage(op_index, runtime_storage, runtime_context); - if (status != Ok) - return status; - - input1 = runtime_kernel.inputs[input1TensorIdx]; - input2 = runtime_kernel.inputs[input2TensorIdx]; - output = runtime_kernel.outputs[outputTensorIdx]; - assert(input1 != nullptr); - assert(input2 != nullptr); - assert(output != nullptr); + TISOHeader(execute_args, &input1, &input2, &output, &runtime_kernel); switch (input1->type()) { diff --git a/onert-micro/training-configure-tool/src/TrainingDriverHandler.cpp b/onert-micro/training-configure-tool/src/TrainingDriverHandler.cpp index fe4a3ad8431..eea65336980 100644 --- a/onert-micro/training-configure-tool/src/TrainingDriverHandler.cpp +++ b/onert-micro/training-configure-tool/src/TrainingDriverHandler.cpp @@ -156,7 +156,7 @@ OMStatus training_configure_tool::runTrainProcessWithCurConfig( case onert_micro::CROSS_ENTROPY_METRICS: { float cross_entropy_metric = 0.f; - train_interpreter.evaluateMetric(onert_micro::CROSS_ENTROPY_METRICS, + train_interpreter.evaluateMetric(config, onert_micro::CROSS_ENTROPY_METRICS, reinterpret_cast(&cross_entropy_metric), cur_batch_size); cross_entropy_v.push_back(cross_entropy_metric); @@ -165,7 +165,7 @@ OMStatus training_configure_tool::runTrainProcessWithCurConfig( case onert_micro::ACCURACY: { float accuracy = 0.f; - train_interpreter.evaluateMetric(onert_micro::ACCURACY, + train_interpreter.evaluateMetric(config, onert_micro::ACCURACY, reinterpret_cast(&accuracy), cur_batch_size); accuracy_v.push_back(accuracy); } @@ -173,16 +173,16 @@ OMStatus training_configure_tool::runTrainProcessWithCurConfig( case onert_micro::MSE_METRICS: { float mse = 0.f; - train_interpreter.evaluateMetric(onert_micro::MSE_METRICS, reinterpret_cast(&mse), - cur_batch_size); + train_interpreter.evaluateMetric(config, onert_micro::MSE_METRICS, + reinterpret_cast(&mse), cur_batch_size); mse_v.push_back(mse); } break; case onert_micro::MAE_METRICS: { float mae = 0.f; - train_interpreter.evaluateMetric(onert_micro::MAE_METRICS, reinterpret_cast(&mae), - cur_batch_size); + train_interpreter.evaluateMetric(config, onert_micro::MAE_METRICS, + reinterpret_cast(&mae), cur_batch_size); mae_v.push_back(mae); } break; diff --git a/res/CircleSchema/0.9/circle_schema.fbs b/res/CircleSchema/0.9/circle_schema.fbs index 515b314e280..0498318bfce 100644 --- a/res/CircleSchema/0.9/circle_schema.fbs +++ b/res/CircleSchema/0.9/circle_schema.fbs @@ -286,6 +286,7 @@ table Tensor { // set of acceptable options. // LINT.IfChange enum BuiltinOperator : int32 { + RMS_NORM = -6, GRU = -5, BCQ_GATHER = -4, BCQ_FULLY_CONNECTED = -3, @@ -635,6 +636,7 @@ union BuiltinOptions { BitcastOptions, BitwiseXorOptions, RightShiftOptions, + RmsNormOptions = 250, GRUOptions = 251, BCQGatherOptions = 252, BCQFullyConnectedOptions = 253, @@ -1519,6 +1521,10 @@ table InstanceNormOptions { fused_activation_function:ActivationFunctionType; } +table RmsNormOptions { + epsilon:float; +} + // An OperatorCode can be an enum value (BuiltinOperator) if the operator is a // builtin, or a string if the operator is custom. 
table OperatorCode { diff --git a/res/TensorFlowLiteRecipes/Conv2D_007/test.recipe b/res/TensorFlowLiteRecipes/Conv2D_007/test.recipe new file mode 100644 index 00000000000..a0879c460db --- /dev/null +++ b/res/TensorFlowLiteRecipes/Conv2D_007/test.recipe @@ -0,0 +1,83 @@ +operand { + name: "ifm" + type: FLOAT32 + shape { dim: 1 dim: 8 dim: 8 dim: 1 } +} +operand { + name: "ker_1" + type: FLOAT32 + shape { dim: 1 dim: 1 dim: 1 dim: 1 } + filler { + tag: "gaussian" + arg: "0.0" + arg: "1.0" + } +} +operand { + name: "bias_1" + type: FLOAT32 + shape { dim: 1 } + filler { + tag: "gaussian" + arg: "0.0" + arg: "1.0" + } +} +operand { + name: "ofm_1" + type: FLOAT32 + shape { dim: 1 dim: 4 dim: 4 dim: 1 } +} +operation { + type: "Conv2D" + conv2d_options { + padding: SAME + stride_w: 2 + stride_h: 2 + activation: RELU + } + input: "ifm" + input: "ker_1" + input: "bias_1" + output: "ofm_1" +} +operand { + name: "ker_2" + type: FLOAT32 + shape { dim: 1 dim: 1 dim: 1 dim: 1 } + filler { + tag: "gaussian" + arg: "0.0" + arg: "1.0" + } +} +operand { + name: "bias_2" + type: FLOAT32 + shape { dim: 1 } + filler { + tag: "gaussian" + arg: "0.0" + arg: "1.0" + } +} +operand { + name: "ofm_2" + type: FLOAT32 + shape { dim: 1 dim: 2 dim: 2 dim: 1 } +} +operation { + type: "Conv2D" + conv2d_options { + padding: SAME + stride_w: 2 + stride_h: 2 + activation: RELU + } + input: "ofm_1" + input: "ker_2" + input: "bias_2" + output: "ofm_2" +} +input: "ifm" +output: "ofm_2" diff --git a/res/TensorFlowLiteRecipes/Conv2D_007/test.reverse b/res/TensorFlowLiteRecipes/Conv2D_007/test.reverse new file mode 100644 index 00000000000..e69de29bb2d diff --git a/res/TensorFlowLiteRecipes/FullyConnected_010/test.recipe b/res/TensorFlowLiteRecipes/FullyConnected_010/test.recipe new file mode 100644 index 00000000000..f7e086f95e0 --- /dev/null +++ b/res/TensorFlowLiteRecipes/FullyConnected_010/test.recipe @@ -0,0 +1,77 @@ +operand { + name: "in" + type: FLOAT32 + shape { dim: 1 dim: 16 } +} +operand { + name: "weight" + type: FLOAT32 + shape { dim: 16 dim: 16 } + filler { + tag: "gaussian" + arg: "0.0" + arg: "1.0" + } +} +operand { + name: "bias" + type: FLOAT32 + shape { dim: 16 } + filler { + tag: "gaussian" + arg: "0.0" + arg: "1.0" + } +} +operand { + name: "out" + type: FLOAT32 + shape { dim: 1 dim: 16 } +} +operation { + type: "FullyConnected" + fullyconnected_options { + activation: RELU + } + input: "in" + input: "weight" + input: "bias" + output: "out" +} +operand { + name: "weight_2" + type: FLOAT32 + shape { dim: 4 dim: 16 } + filler { + tag: "gaussian" + arg: "0.0" + arg: "1.0" + } +} +operand { + name: "bias_2" + type: FLOAT32 + shape { dim: 4 } + filler { + tag: "gaussian" + arg: "0.0" + arg: "1.0" + } +} +operand { + name: "out_2" + type: FLOAT32 + shape { dim: 1 dim: 4 } +} +operation { + type: "FullyConnected" + fullyconnected_options { + activation: RELU + } + input: "out" + input: "weight_2" + input: "bias_2" + output: "out_2" +} +input: "in" +output: "out_2" diff --git a/res/TensorFlowLiteRecipes/FullyConnected_010/test.reverse b/res/TensorFlowLiteRecipes/FullyConnected_010/test.reverse new file mode 100644 index 00000000000..e69de29bb2d diff --git a/res/TensorFlowLiteRecipes/Net_Conv_Gelu_000/test.recipe b/res/TensorFlowLiteRecipes/Net_Conv_Gelu_000/test.recipe new file mode 100644 index 00000000000..89314254f20 --- /dev/null +++ b/res/TensorFlowLiteRecipes/Net_Conv_Gelu_000/test.recipe @@ -0,0 +1,78 @@ +operand { + name: "Placeholder" + type: FLOAT32 + shape { dim: 1 dim: 16 dim: 16 dim: 3 } +} +operand { 
+ name: "Conv2D_1" + type: FLOAT32 + shape { dim: 3 dim: 3 dim: 3 dim: 3 } + filler { tag: "gaussian" arg: "0.0" arg: "0.1" } +} +operand { + name: "Conv2D_2" + type: FLOAT32 + shape { dim: 3 } + filler { tag: "gaussian" arg: "0.0" arg: "0.1" } +} +operand { + name: "Conv2D_21" + type: FLOAT32 + shape { dim: 3 dim: 3 dim: 3 dim: 3 } + filler { tag: "gaussian" arg: "0.0" arg: "0.1" } +} +operand { + name: "Conv2D_11" + type: FLOAT32 + shape { dim: 1 dim: 16 dim: 16 dim: 3 } +} +operand { + name: "Gelu" + type: FLOAT32 + shape { dim: 1 dim: 16 dim: 16 dim: 3 } +} +operand { + name: "Conv2D_22" + type: FLOAT32 + shape { dim: 1 dim: 16 dim: 16 dim: 3 } +} +operation { + type: "Conv2D" + input: "Placeholder" + input: "Conv2D_1" + input: "Conv2D_2" + output: "Conv2D_11" + conv2d_options { + padding: SAME + stride_w: 1 + stride_h: 1 + activation: NONE + dilation_w_factor: 1 + dilation_h_factor: 1 + } +} +operation { + type: "Gelu" + gelu_options { + approximate: false + } + input: "Conv2D_11" + output: "Gelu" +} +operation { + type: "Conv2D" + input: "Gelu" + input: "Conv2D_21" + input: "Conv2D_2" + output: "Conv2D_22" + conv2d_options { + padding: SAME + stride_w: 1 + stride_h: 1 + activation: NONE + dilation_w_factor: 1 + dilation_h_factor: 1 + } +} +input: "Placeholder" +output: "Conv2D_22" diff --git a/res/TensorFlowLiteRecipes/Net_Conv_Pad_000/test.recipe b/res/TensorFlowLiteRecipes/Net_Conv_Pad_000/test.recipe new file mode 100644 index 00000000000..c09793b160b --- /dev/null +++ b/res/TensorFlowLiteRecipes/Net_Conv_Pad_000/test.recipe @@ -0,0 +1,107 @@ +# Conv - Pad - Conv +operand { + name: "ifm" + type: FLOAT32 + shape { dim: 1 dim: 8 dim: 8 dim: 1 } +} +operand { + name: "ker_1" + type: FLOAT32 + shape { dim: 1 dim: 2 dim: 2 dim: 1 } + filler { + tag: "gaussian" + arg: "0.0" + arg: "1.0" + } +} +operand { + name: "bias_1" + type: FLOAT32 + shape { dim: 1 } + filler { + tag: "gaussian" + arg: "0.0" + arg: "1.0" + } +} +operand { + name: "ofm_1" + type: FLOAT32 + shape { dim: 1 dim: 4 dim: 4 dim: 1 } +} +operation { + type: "Conv2D" + conv2d_options { + padding: SAME + stride_w: 2 + stride_h: 2 + activation: RELU + } + input: "ifm" + input: "ker_1" + input: "bias_1" + output: "ofm_1" +} +operand { + name: "pad" + type: INT32 + shape { dim: 4 dim: 2 } + filler { + tag: "explicit" + arg: "0" arg: "0" + arg: "2" arg: "2" + arg: "2" arg: "2" + arg: "0" arg: "0" + } +} +operand { + name: "pad_ofm" + type: FLOAT32 + shape { dim: 1 dim: 8 dim: 8 dim: 1 } +} +operation { + type: "Pad" + input: "ofm_1" + input: "pad" + output: "pad_ofm" +} +operand { + name: "ker_2" + type: FLOAT32 + shape { dim: 1 dim: 1 dim: 1 dim: 1 } + filler { + tag: "gaussian" + arg: "0.0" + arg: "1.0" + } +} +operand { + name: "bias_2" + type: FLOAT32 + shape { dim: 1 } + filler { + tag: "gaussian" + arg: "0.0" + arg: "1.0" + } +} +operand { + name: "ofm_2" + type: FLOAT32 + shape { dim: 1 dim: 4 dim: 4 dim: 1 } +} +operation { + type: "Conv2D" + conv2d_options { + padding: SAME + stride_w: 2 + stride_h: 2 + activation: RELU + } + input: "pad_ofm" + input: "ker_2" + input: "bias_2" + output: "ofm_2" +} +input: "ifm" +output: "ofm_2" diff --git a/res/TensorFlowLiteRecipes/Net_Conv_TConv_000/test.recipe b/res/TensorFlowLiteRecipes/Net_Conv_TConv_000/test.recipe new file mode 100644 index 00000000000..d00455cfcd6 --- /dev/null +++ b/res/TensorFlowLiteRecipes/Net_Conv_TConv_000/test.recipe @@ -0,0 +1,83 @@ +operand { + name: "ifm" + type: FLOAT32 + shape { dim: 1 dim: 8 dim: 8 dim: 1 } +} +operand { + name: "ker_1" + type: 
FLOAT32 + shape { dim: 3 dim: 2 dim: 2 dim: 1 } + filler { + tag: "gaussian" + arg: "0.0" + arg: "1.0" + } +} +operand { + name: "bias_1" + type: FLOAT32 + shape { dim: 3 } + filler { + tag: "gaussian" + arg: "0.0" + arg: "1.0" + } +} +operand { + name: "ofm_1" + type: FLOAT32 + shape { dim: 1 dim: 4 dim: 4 dim: 3 } +} +operation { + type: "Conv2D" + conv2d_options { + padding: SAME + stride_w: 2 + stride_h: 2 + activation: RELU + } + input: "ifm" + input: "ker_1" + input: "bias_1" + output: "ofm_1" +} +operand { + name: "out_shape" + type: INT32 + shape { dim: 4 } + filler { + tag: "explicit" + arg: "1" arg: "4" arg: "4" arg: "3" + } +} +operand { + name: "ker_2" + type: FLOAT32 + shape { dim: 3 dim: 1 dim: 1 dim: 3 } + filler { + tag: "gaussian" + arg: "0.0" + arg: "1.0" + } +} +operand { + name: "ofm_2" + type: FLOAT32 + shape { dim: 1 dim: 4 dim: 4 dim: 3 } +} + +operation { + type: "TransposeConv" + transpose_conv_options { + padding: SAME + stride_w: 1 + stride_h: 1 + activation: NONE + } + input: "out_shape" + input: "ker_2" + input: "ofm_1" + output: "ofm_2" +} +input: "ifm" +output: "ofm_2" diff --git a/res/TensorFlowLiteRecipes/Net_Conv_TConv_000/test.reverse b/res/TensorFlowLiteRecipes/Net_Conv_TConv_000/test.reverse new file mode 100644 index 00000000000..e69de29bb2d diff --git a/res/TensorFlowLiteRecipes/Net_DConv_Conv_000/test.recipe b/res/TensorFlowLiteRecipes/Net_DConv_Conv_000/test.recipe new file mode 100644 index 00000000000..ad2e784231a --- /dev/null +++ b/res/TensorFlowLiteRecipes/Net_DConv_Conv_000/test.recipe @@ -0,0 +1,86 @@ +operand { + name: "ifm" + type: FLOAT32 + shape { dim: 1 dim: 8 dim: 8 dim: 2 } +} +operand { + name: "ker" + type: FLOAT32 + shape { dim: 1 dim: 2 dim: 2 dim: 4 } + filler { + tag: "gaussian" + arg: "0.0" + arg: "1.0" + } +} +operand { + name: "bias" + type: FLOAT32 + shape { dim: 4 } + filler { + tag: "gaussian" + arg: "0.0" + arg: "1.0" + } +} +operand { + name: "ofm" + type: FLOAT32 + shape { dim: 1 dim: 4 dim: 4 dim: 4 } +} +operation { + type: "DepthwiseConv2D" + depthwiseconv2d_options { + padding: VALID + stride_w: 2 + stride_h: 2 + dilation_w_factor: 1 + dilation_h_factor: 1 + depth_multiplier: 2 + activation : NONE + } + input: "ifm" + input: "ker" + input: "bias" + output: "ofm" +} +operand { + name: "ker_1" + type: FLOAT32 + shape { dim: 3 dim: 2 dim: 2 dim: 4 } + filler { + tag: "gaussian" + arg: "0.0" + arg: "1.0" + } +} +operand { + name: "bias_1" + type: FLOAT32 + shape { dim: 3 } + filler { + tag: "gaussian" + arg: "0.0" + arg: "1.0" + } +} +operand { + name: "ofm_1" + type: FLOAT32 + shape { dim: 1 dim: 2 dim: 2 dim: 3 } +} +operation { + type: "Conv2D" + conv2d_options { + padding: SAME + stride_w: 2 + stride_h: 2 + activation: NONE + } + input: "ofm" + input: "ker_1" + input: "bias_1" + output: "ofm_1" +} +input: "ifm" +output: "ofm_1" diff --git a/res/TensorFlowLiteRecipes/Net_DConv_Conv_000/test.reverse b/res/TensorFlowLiteRecipes/Net_DConv_Conv_000/test.reverse new file mode 100644 index 00000000000..e69de29bb2d diff --git a/res/TensorFlowLiteRecipes/Net_FullyConnected_Gelu_000/test.recipe b/res/TensorFlowLiteRecipes/Net_FullyConnected_Gelu_000/test.recipe new file mode 100644 index 00000000000..0e47be64b46 --- /dev/null +++ b/res/TensorFlowLiteRecipes/Net_FullyConnected_Gelu_000/test.recipe @@ -0,0 +1,90 @@ +operand { + name: "in" + type: FLOAT32 + shape { dim: 1 dim: 16 } +} +operand { + name: "weight" + type: FLOAT32 + shape { dim: 16 dim: 16 } + filler { + tag: "gaussian" + arg: "0.0" + arg: "1.0" + } +} +operand { + 
name: "bias" + type: FLOAT32 + shape { dim: 16 } + filler { + tag: "gaussian" + arg: "0.0" + arg: "1.0" + } +} +operand { + name: "out" + type: FLOAT32 + shape { dim: 1 dim: 16 } +} +operation { + type: "FullyConnected" + fullyconnected_options { + activation: RELU + } + input: "in" + input: "weight" + input: "bias" + output: "out" +} +operand { + name: "gelu_out" + type: FLOAT32 + shape { dim: 1 dim: 16 } +} +operation { + type: "Gelu" + gelu_options { + approximate: false + } + input: "out" + output: "gelu_out" +} +operand { + name: "weight_2" + type: FLOAT32 + shape { dim: 4 dim: 16 } + filler { + tag: "gaussian" + arg: "0.0" + arg: "1.0" + } +} +operand { + name: "bias_2" + type: FLOAT32 + shape { dim: 4 } + filler { + tag: "gaussian" + arg: "0.0" + arg: "1.0" + } +} +operand { + name: "out_2" + type: FLOAT32 + shape { dim: 1 dim: 4 } +} +operation { + type: "FullyConnected" + fullyconnected_options { + activation: RELU + } + input: "gelu_out" + input: "weight_2" + input: "bias_2" + output: "out_2" +} +input: "in" +output: "out_2" diff --git a/res/TensorFlowLiteRecipes/Net_FullyConnected_Gelu_000/test.reverse b/res/TensorFlowLiteRecipes/Net_FullyConnected_Gelu_000/test.reverse new file mode 100644 index 00000000000..e69de29bb2d diff --git a/res/TensorFlowLiteRecipes/Net_InstNorm_Conv_000/test.recipe b/res/TensorFlowLiteRecipes/Net_InstNorm_Conv_000/test.recipe new file mode 100644 index 00000000000..33864d2d815 --- /dev/null +++ b/res/TensorFlowLiteRecipes/Net_InstNorm_Conv_000/test.recipe @@ -0,0 +1,332 @@ +# +# InstanceNorm - LeakyRelu - Conv2D +# + +operand { + name: "Input" + type: FLOAT32 + shape { + dim: 1 + dim: 8 + dim: 6 + dim: 12 + } + quant { + min: 0 + max: 255 + } +} +operand { + name: "InstanceNorm/beta/read" + type: FLOAT32 + shape { + dim: 12 + } + filler { + tag: "explicit" + arg: "1.9714" + arg: "1.4517" + arg: "1.20315" + arg: "0.287979" + arg: "0.161815" + arg: "-0.281398" + arg: "2.70276" + arg: "-0.166962" + arg: "0.266389" + arg: "0.890943" + arg: "-0.279833" + arg: "1.82808" + } +} +operand { + name: "InstanceNorm/gamma/read" + type: FLOAT32 + shape { + dim: 12 + } + filler { + tag: "explicit" + arg: "0.574708" + arg: "0.387735" + arg: "0.8995" + arg: "0.484296" + arg: "2.35851" + arg: "1.06661" + arg: "0.343602" + arg: "2.27583" + arg: "1.14559" + arg: "0.690169" + arg: "1.2044" + arg: "0.350952" + } +} +operand { + name: "InstanceNorm/instancenorm/Rsqrt" + type: FLOAT32 + shape { + dim: 1 + dim: 1 + dim: 1 + dim: 12 + } +} +operand { + name: "InstanceNorm/instancenorm/add" + type: FLOAT32 + shape { + dim: 1 + dim: 1 + dim: 1 + dim: 12 + } +} +operand { + name: "InstanceNorm/instancenorm/add/y" + type: FLOAT32 + shape { + } + filler { + tag: "explicit" + arg: "0.001" + } +} +operand { + name: "InstanceNorm/instancenorm/mul" + type: FLOAT32 + shape { + dim: 1 + dim: 1 + dim: 1 + dim: 12 + } +} +operand { + name: "InstanceNorm/instancenorm/mul_1" + type: FLOAT32 + shape { + dim: 1 + dim: 8 + dim: 6 + dim: 12 + } +} +operand { + name: "InstanceNorm/instancenorm/mul_2" + type: FLOAT32 + shape { + dim: 1 + dim: 1 + dim: 1 + dim: 12 + } +} +operand { + name: "InstanceNorm/instancenorm/sub" + type: FLOAT32 + shape { + dim: 1 + dim: 1 + dim: 1 + dim: 12 + } +} +operand { + name: "InstanceNorm/moments/SquaredDifference" + type: FLOAT32 + shape { + dim: 1 + dim: 8 + dim: 6 + dim: 12 + } +} +operand { + name: "InstanceNorm/moments/mean" + type: FLOAT32 + shape { + dim: 1 + dim: 1 + dim: 1 + dim: 12 + } +} +operand { + name: "InstanceNorm/moments/mean/reduction_indices" + 
type: INT32 + shape { + dim: 2 + } + filler { + tag: "explicit" + arg: "1" + arg: "2" + } +} +operand { + name: "InstanceNorm/moments/variance" + type: FLOAT32 + shape { + dim: 1 + dim: 1 + dim: 1 + dim: 12 + } +} +operand { + name: "InstanceNorm/moments/variance/reduction_indices" + type: INT32 + shape { + dim: 2 + } + filler { + tag: "explicit" + arg: "1" + arg: "2" + } +} +operand { + name: "InstanceNorm_out" + type: FLOAT32 + shape { + dim: 1 + dim: 8 + dim: 6 + dim: 12 + } +} +operation { + type: "Mean" + input: "Input" + input: "InstanceNorm/moments/mean/reduction_indices" + output: "InstanceNorm/moments/mean" + mean_options { + keep_dims: true + } +} +operation { + type: "SquaredDifference" + input: "Input" + input: "InstanceNorm/moments/mean" + output: "InstanceNorm/moments/SquaredDifference" +} +operation { + type: "Mean" + input: "InstanceNorm/moments/SquaredDifference" + input: "InstanceNorm/moments/variance/reduction_indices" + output: "InstanceNorm/moments/variance" + mean_options { + keep_dims: true + } +} +operation { + type: "Add" + input: "InstanceNorm/moments/variance" + input: "InstanceNorm/instancenorm/add/y" + output: "InstanceNorm/instancenorm/add" + add_options { + activation: NONE + } +} +operation { + type: "Rsqrt" + input: "InstanceNorm/instancenorm/add" + output: "InstanceNorm/instancenorm/Rsqrt" +} +operation { + type: "Mul" + input: "InstanceNorm/instancenorm/Rsqrt" + input: "InstanceNorm/gamma/read" + output: "InstanceNorm/instancenorm/mul" + mul_options { + activation: NONE + } +} +operation { + type: "Mul" + input: "Input" + input: "InstanceNorm/instancenorm/mul" + output: "InstanceNorm/instancenorm/mul_1" + mul_options { + activation: NONE + } +} +operation { + type: "Mul" + input: "InstanceNorm/moments/mean" + input: "InstanceNorm/instancenorm/mul" + output: "InstanceNorm/instancenorm/mul_2" + mul_options { + activation: NONE + } +} +operation { + type: "Sub" + input: "InstanceNorm/beta/read" + input: "InstanceNorm/instancenorm/mul_2" + output: "InstanceNorm/instancenorm/sub" + sub_options { + activation: NONE + } +} +operation { + type: "Add" + input: "InstanceNorm/instancenorm/mul_1" + input: "InstanceNorm/instancenorm/sub" + output: "InstanceNorm_out" + add_options { + activation: NONE + } +} +operand { + name: "LeakyRelu" + type: FLOAT32 + shape { dim: 1 dim: 8 dim: 6 dim: 12 } +} +operation { + type: "LeakyRelu" + leaky_relu_options { + alpha: 2.0 + } + input: "InstanceNorm_out" + output: "LeakyRelu" +} +operand { + name: "Conv2D/ker" + type: FLOAT32 + shape { dim: 3 dim: 1 dim: 1 dim: 12 } + filler { + tag: "gaussian" + arg: "0.0" + arg: "1.0" + } +} +operand { + name: "Conv2D/bias" + type: FLOAT32 + shape { dim: 3 } + filler { + tag: "gaussian" + arg: "0.0" + arg: "1.0" + } +} +operand { + name: "Conv2D/ofm" + type: FLOAT32 + shape { dim: 1 dim: 8 dim: 6 dim: 3 } +} +operation { + type: "Conv2D" + conv2d_options { + padding: SAME + stride_w: 1 + stride_h: 1 + activation: NONE + } + input: "LeakyRelu" + input: "Conv2D/ker" + input: "Conv2D/bias" + output: "Conv2D/ofm" +} +input: "Input" +output: "Conv2D/ofm" diff --git a/runtime/contrib/android/api/Prebuilt.mk b/runtime/contrib/android/api/Prebuilt.mk index 9f0c0146a05..479dc34049c 100644 --- a/runtime/contrib/android/api/Prebuilt.mk +++ b/runtime/contrib/android/api/Prebuilt.mk @@ -18,7 +18,7 @@ include $(CLEAR_VARS) LOCAL_MODULE := onert_core PREBUILT_LIB += onert_core LOCAL_SRC_FILES := \ - $(ONERT_PREBUILT_LIB_DIR)/libonert_core.so + $(ONERT_PREBUILT_LIB_DIR)/nnfw/libonert_core.so include 
$(PREBUILT_SHARED_LIBRARY) # backend_cpu diff --git a/runtime/libs/circle-schema/include/circle_schema_generated.h b/runtime/libs/circle-schema/include/circle_schema_generated.h index 7a563042b81..b481875e9cd 100644 --- a/runtime/libs/circle-schema/include/circle_schema_generated.h +++ b/runtime/libs/circle-schema/include/circle_schema_generated.h @@ -701,10 +701,10 @@ struct ModelT; enum TensorType : int8_t { - TensorType_Q8_1 = -5, - TensorType_Q8_0 = -4, - TensorType_Q4_1 = -3, - TensorType_Q4_0 = -2, + TensorType_GGML_Q8_1 = -5, + TensorType_GGML_Q8_0 = -4, + TensorType_GGML_Q4_1 = -3, + TensorType_GGML_Q4_0 = -2, TensorType_UINT4 = -1, TensorType_FLOAT32 = 0, TensorType_FLOAT16 = 1, @@ -724,18 +724,18 @@ enum TensorType : int8_t TensorType_UINT32 = 15, TensorType_UINT16 = 16, TensorType_INT4 = 17, - TensorType_MIN = TensorType_Q8_1, + TensorType_MIN = TensorType_GGML_Q8_1, TensorType_MAX = TensorType_INT4 }; inline const TensorType (&EnumValuesTensorType())[23] { static const TensorType values[] = { - TensorType_Q8_1, TensorType_Q8_0, TensorType_Q4_1, TensorType_Q4_0, - TensorType_UINT4, TensorType_FLOAT32, TensorType_FLOAT16, TensorType_INT32, - TensorType_UINT8, TensorType_INT64, TensorType_STRING, TensorType_BOOL, - TensorType_INT16, TensorType_COMPLEX64, TensorType_INT8, TensorType_FLOAT64, - TensorType_COMPLEX128, TensorType_UINT64, TensorType_RESOURCE, TensorType_VARIANT, + TensorType_GGML_Q8_1, TensorType_GGML_Q8_0, TensorType_GGML_Q4_1, TensorType_GGML_Q4_0, + TensorType_UINT4, TensorType_FLOAT32, TensorType_FLOAT16, TensorType_INT32, + TensorType_UINT8, TensorType_INT64, TensorType_STRING, TensorType_BOOL, + TensorType_INT16, TensorType_COMPLEX64, TensorType_INT8, TensorType_FLOAT64, + TensorType_COMPLEX128, TensorType_UINT64, TensorType_RESOURCE, TensorType_VARIANT, TensorType_UINT32, TensorType_UINT16, TensorType_INT4}; return values; } @@ -743,17 +743,18 @@ inline const TensorType (&EnumValuesTensorType())[23] inline const char *const *EnumNamesTensorType() { static const char *const names[24] = { - "Q8_1", "Q8_0", "Q4_1", "Q4_0", "UINT4", "FLOAT32", "FLOAT16", "INT32", - "UINT8", "INT64", "STRING", "BOOL", "INT16", "COMPLEX64", "INT8", "FLOAT64", - "COMPLEX128", "UINT64", "RESOURCE", "VARIANT", "UINT32", "UINT16", "INT4", nullptr}; + "GGML_Q8_1", "GGML_Q8_0", "GGML_Q4_1", "GGML_Q4_0", "UINT4", "FLOAT32", + "FLOAT16", "INT32", "UINT8", "INT64", "STRING", "BOOL", + "INT16", "COMPLEX64", "INT8", "FLOAT64", "COMPLEX128", "UINT64", + "RESOURCE", "VARIANT", "UINT32", "UINT16", "INT4", nullptr}; return names; } inline const char *EnumNameTensorType(TensorType e) { - if (::flatbuffers::IsOutRange(e, TensorType_Q8_1, TensorType_INT4)) + if (::flatbuffers::IsOutRange(e, TensorType_GGML_Q8_1, TensorType_INT4)) return ""; - const size_t index = static_cast(e) - static_cast(TensorType_Q8_1); + const size_t index = static_cast(e) - static_cast(TensorType_GGML_Q8_1); return EnumNamesTensorType()[index]; } @@ -1066,6 +1067,34 @@ bool VerifySparseIndexVectorVector(::flatbuffers::Verifier &verifier, const ::flatbuffers::Vector<::flatbuffers::Offset> *values, const ::flatbuffers::Vector *types); +enum CompressionType : int8_t +{ + CompressionType_NONE = 0, + CompressionType_HUFFMAN = 1, + CompressionType_MIN = CompressionType_NONE, + CompressionType_MAX = CompressionType_HUFFMAN +}; + +inline const CompressionType (&EnumValuesCompressionType())[2] +{ + static const CompressionType values[] = {CompressionType_NONE, CompressionType_HUFFMAN}; + return values; +} + +inline const char *const 
*EnumNamesCompressionType() +{ + static const char *const names[3] = {"NONE", "HUFFMAN", nullptr}; + return names; +} + +inline const char *EnumNameCompressionType(CompressionType e) +{ + if (::flatbuffers::IsOutRange(e, CompressionType_NONE, CompressionType_HUFFMAN)) + return ""; + const size_t index = static_cast(e); + return EnumNamesCompressionType()[index]; +} + enum BuiltinOperator : int32_t { BuiltinOperator_GRU = -5, @@ -7061,6 +7090,7 @@ struct TensorT : public ::flatbuffers::NativeTable std::vector shape_signature{}; bool has_rank = false; std::vector> variant_tensors{}; + circle::CompressionType compression_type = circle::CompressionType_NONE; TensorT() = default; TensorT(const TensorT &o); TensorT(TensorT &&) FLATBUFFERS_NOEXCEPT = default; @@ -7082,7 +7112,8 @@ struct Tensor FLATBUFFERS_FINAL_CLASS : private ::flatbuffers::Table VT_SPARSITY = 16, VT_SHAPE_SIGNATURE = 18, VT_HAS_RANK = 20, - VT_VARIANT_TENSORS = 22 + VT_VARIANT_TENSORS = 22, + VT_COMPRESSION_TYPE = 24 }; const ::flatbuffers::Vector *shape() const { @@ -7117,6 +7148,10 @@ struct Tensor FLATBUFFERS_FINAL_CLASS : private ::flatbuffers::Table return GetPointer> *>( VT_VARIANT_TENSORS); } + circle::CompressionType compression_type() const + { + return static_cast(GetField(VT_COMPRESSION_TYPE, 0)); + } bool Verify(::flatbuffers::Verifier &verifier) const { return VerifyTableStart(verifier) && VerifyOffset(verifier, VT_SHAPE) && @@ -7129,7 +7164,8 @@ struct Tensor FLATBUFFERS_FINAL_CLASS : private ::flatbuffers::Table VerifyOffset(verifier, VT_SHAPE_SIGNATURE) && verifier.VerifyVector(shape_signature()) && VerifyField(verifier, VT_HAS_RANK, 1) && VerifyOffset(verifier, VT_VARIANT_TENSORS) && verifier.VerifyVector(variant_tensors()) && - verifier.VerifyVectorOfTables(variant_tensors()) && verifier.EndTable(); + verifier.VerifyVectorOfTables(variant_tensors()) && + VerifyField(verifier, VT_COMPRESSION_TYPE, 1) && verifier.EndTable(); } TensorT *UnPack(const ::flatbuffers::resolver_function_t *_resolver = nullptr) const; void UnPackTo(TensorT *_o, const ::flatbuffers::resolver_function_t *_resolver = nullptr) const; @@ -7182,6 +7218,10 @@ struct TensorBuilder { fbb_.AddOffset(Tensor::VT_VARIANT_TENSORS, variant_tensors); } + void add_compression_type(circle::CompressionType compression_type) + { + fbb_.AddElement(Tensor::VT_COMPRESSION_TYPE, static_cast(compression_type), 0); + } explicit TensorBuilder(::flatbuffers::FlatBufferBuilder &_fbb) : fbb_(_fbb) { start_ = fbb_.StartTable(); @@ -7203,7 +7243,8 @@ inline ::flatbuffers::Offset CreateTensor( ::flatbuffers::Offset sparsity = 0, ::flatbuffers::Offset<::flatbuffers::Vector> shape_signature = 0, bool has_rank = false, ::flatbuffers::Offset<::flatbuffers::Vector<::flatbuffers::Offset>> - variant_tensors = 0) + variant_tensors = 0, + circle::CompressionType compression_type = circle::CompressionType_NONE) { TensorBuilder builder_(_fbb); builder_.add_variant_tensors(variant_tensors); @@ -7213,6 +7254,7 @@ inline ::flatbuffers::Offset CreateTensor( builder_.add_name(name); builder_.add_buffer(buffer); builder_.add_shape(shape); + builder_.add_compression_type(compression_type); builder_.add_has_rank(has_rank); builder_.add_is_variable(is_variable); builder_.add_type(type); @@ -7226,7 +7268,8 @@ inline ::flatbuffers::Offset CreateTensorDirect( ::flatbuffers::Offset quantization = 0, bool is_variable = false, ::flatbuffers::Offset sparsity = 0, const std::vector *shape_signature = nullptr, bool has_rank = false, - const std::vector<::flatbuffers::Offset> *variant_tensors = 
nullptr) + const std::vector<::flatbuffers::Offset> *variant_tensors = nullptr, + circle::CompressionType compression_type = circle::CompressionType_NONE) { auto shape__ = shape ? _fbb.CreateVector(*shape) : 0; auto name__ = name ? _fbb.CreateString(name) : 0; @@ -7236,7 +7279,8 @@ inline ::flatbuffers::Offset CreateTensorDirect( ? _fbb.CreateVector<::flatbuffers::Offset>(*variant_tensors) : 0; return circle::CreateTensor(_fbb, shape__, type, buffer, name__, quantization, is_variable, - sparsity, shape_signature__, has_rank, variant_tensors__); + sparsity, shape_signature__, has_rank, variant_tensors__, + compression_type); } ::flatbuffers::Offset @@ -21280,7 +21324,7 @@ inline TensorT::TensorT(const TensorT &o) quantization((o.quantization) ? new circle::QuantizationParametersT(*o.quantization) : nullptr), is_variable(o.is_variable), sparsity((o.sparsity) ? new circle::SparsityParametersT(*o.sparsity) : nullptr), - shape_signature(o.shape_signature), has_rank(o.has_rank) + shape_signature(o.shape_signature), has_rank(o.has_rank), compression_type(o.compression_type) { variant_tensors.reserve(o.variant_tensors.size()); for (const auto &variant_tensors_ : o.variant_tensors) @@ -21302,6 +21346,7 @@ inline TensorT &TensorT::operator=(TensorT o) FLATBUFFERS_NOEXCEPT std::swap(shape_signature, o.shape_signature); std::swap(has_rank, o.has_rank); std::swap(variant_tensors, o.variant_tensors); + std::swap(compression_type, o.compression_type); return *this; } @@ -21426,6 +21471,10 @@ inline void Tensor::UnPackTo(TensorT *_o, const ::flatbuffers::resolver_function _o->variant_tensors.resize(0); } } + { + auto _e = compression_type(); + _o->compression_type = _e; + } } inline ::flatbuffers::Offset @@ -21468,8 +21517,10 @@ CreateTensor(::flatbuffers::FlatBufferBuilder &_fbb, const TensorT *_o, }, &_va) : 0; + auto _compression_type = _o->compression_type; return circle::CreateTensor(_fbb, _shape, _type, _buffer, _name, _quantization, _is_variable, - _sparsity, _shape_signature, _has_rank, _variant_tensors); + _sparsity, _shape_signature, _has_rank, _variant_tensors, + _compression_type); } inline StablehloGatherOptionsT * diff --git a/runtime/onert/api/nnapi/CMakeLists.txt b/runtime/onert/api/nnapi/CMakeLists.txt index 324e59afe71..eb48a7beac1 100644 --- a/runtime/onert/api/nnapi/CMakeLists.txt +++ b/runtime/onert/api/nnapi/CMakeLists.txt @@ -2,18 +2,13 @@ file(GLOB_RECURSE SOURCES_FRONTEND "*.cc") file(GLOB_RECURSE TESTS_FRONTEND "*.test.cc") list(REMOVE_ITEM SOURCES_FRONTEND ${TESTS_FRONTEND}) -set(LIB_ONERT onert) +set(LIB_NNAPI onert_nnapi) -add_library(${LIB_ONERT} SHARED ${SOURCES_FRONTEND}) -target_include_directories(${LIB_ONERT} PUBLIC include) -target_link_libraries(${LIB_ONERT} PUBLIC onert_core) # TODO Link PRIVATE onert_core -target_link_libraries(${LIB_ONERT} PRIVATE nnfw_common) -target_link_libraries(${LIB_ONERT} PRIVATE nnfw_coverage) - -set_target_properties(${LIB_ONERT} PROPERTIES OUTPUT_NAME neuralnetworks) -set_target_properties(${LIB_ONERT} PROPERTIES INSTALL_RPATH "$ORIGIN:$ORIGIN/nnfw") - -install(TARGETS ${LIB_ONERT} DESTINATION lib) +add_library(${LIB_NNAPI} STATIC ${SOURCES_FRONTEND}) +target_include_directories(${LIB_NNAPI} PUBLIC include) +target_link_libraries(${LIB_NNAPI} PUBLIC onert_core) # TODO Link PRIVATE onert_core +target_link_libraries(${LIB_NNAPI} PRIVATE nnfw_common) +target_link_libraries(${LIB_NNAPI} PRIVATE nnfw_coverage) if(NOT ENABLE_TEST) return() @@ -21,11 +16,10 @@ endif(NOT ENABLE_TEST) add_executable(test_onert_frontend_nnapi ${TESTS_FRONTEND}) 
diff --git a/runtime/onert/api/nnapi/CMakeLists.txt b/runtime/onert/api/nnapi/CMakeLists.txt
index 324e59afe71..eb48a7beac1 100644
--- a/runtime/onert/api/nnapi/CMakeLists.txt
+++ b/runtime/onert/api/nnapi/CMakeLists.txt
@@ -2,18 +2,13 @@ file(GLOB_RECURSE SOURCES_FRONTEND "*.cc")
 file(GLOB_RECURSE TESTS_FRONTEND "*.test.cc")
 list(REMOVE_ITEM SOURCES_FRONTEND ${TESTS_FRONTEND})
 
-set(LIB_ONERT onert)
+set(LIB_NNAPI onert_nnapi)
 
-add_library(${LIB_ONERT} SHARED ${SOURCES_FRONTEND})
-target_include_directories(${LIB_ONERT} PUBLIC include)
-target_link_libraries(${LIB_ONERT} PUBLIC onert_core) # TODO Link PRIVATE onert_core
-target_link_libraries(${LIB_ONERT} PRIVATE nnfw_common)
-target_link_libraries(${LIB_ONERT} PRIVATE nnfw_coverage)
-
-set_target_properties(${LIB_ONERT} PROPERTIES OUTPUT_NAME neuralnetworks)
-set_target_properties(${LIB_ONERT} PROPERTIES INSTALL_RPATH "$ORIGIN:$ORIGIN/nnfw")
-
-install(TARGETS ${LIB_ONERT} DESTINATION lib)
+add_library(${LIB_NNAPI} STATIC ${SOURCES_FRONTEND})
+target_include_directories(${LIB_NNAPI} PUBLIC include)
+target_link_libraries(${LIB_NNAPI} PUBLIC onert_core) # TODO Link PRIVATE onert_core
+target_link_libraries(${LIB_NNAPI} PRIVATE nnfw_common)
+target_link_libraries(${LIB_NNAPI} PRIVATE nnfw_coverage)
 
 if(NOT ENABLE_TEST)
   return()
@@ -21,11 +16,10 @@ endif(NOT ENABLE_TEST)
 
 add_executable(test_onert_frontend_nnapi ${TESTS_FRONTEND})
-target_link_libraries(test_onert_frontend_nnapi PRIVATE ${LIB_ONERT} dl)
-target_link_libraries(test_onert_frontend_nnapi PRIVATE gtest)
-target_link_libraries(test_onert_frontend_nnapi PRIVATE gtest_main)
-# INSTALL_RPATH is for onert_core public link
-# TODO Remove INSTALL_RPATH
-set_target_properties(test_onert_frontend_nnapi PROPERTIES INSTALL_RPATH "$ORIGIN/../lib:$ORIGIN/../lib/nnfw")
+target_link_libraries(test_onert_frontend_nnapi ${LIB_NNAPI} dl)
+target_link_libraries(test_onert_frontend_nnapi gtest)
+target_link_libraries(test_onert_frontend_nnapi gtest_main)
+# Set INSTALL_RPATH to find onert_core
+set_target_properties(test_onert_frontend_nnapi PROPERTIES INSTALL_RPATH "$ORIGIN/../lib/nnfw")
 
 install(TARGETS test_onert_frontend_nnapi DESTINATION unittest)
diff --git a/runtime/onert/api/nnapi/compilation.cc b/runtime/onert/api/nnapi/compilation.cc
index 2c56f061aaa..f7d64a6cbd6 100644
--- a/runtime/onert/api/nnapi/compilation.cc
+++ b/runtime/onert/api/nnapi/compilation.cc
@@ -104,3 +104,10 @@ int ANeuralNetworksCompilation_setPreference(ANeuralNetworksCompilation *compila
   // NYI: nothing to set
   return ANEURALNETWORKS_NO_ERROR;
 }
+
+int ANeuralNetworksCompilation_setCaching(ANeuralNetworksCompilation *, const char *,
+                                          const uint8_t *)
+{
+  VERBOSE(NNAPI::Compilation) << "setCaching: NYI" << std::endl;
+  return ANEURALNETWORKS_BAD_STATE;
+}
diff --git a/runtime/onert/api/nnapi/execution.cc b/runtime/onert/api/nnapi/execution.cc
index 4e1a985f305..3d5f7fc05a8 100644
--- a/runtime/onert/api/nnapi/execution.cc
+++ b/runtime/onert/api/nnapi/execution.cc
@@ -502,3 +502,21 @@ int ANeuralNetworksExecution_getOutputOperandDimensions(ANeuralNetworksExecution
 
   return ANEURALNETWORKS_NO_ERROR;
 }
+
+int ANeuralNetworksBurst_create(ANeuralNetworksCompilation *, ANeuralNetworksBurst **)
+{
+  VERBOSE(NNAPI::Execution) << "BurstCreate: NYI" << std::endl;
+  return ANEURALNETWORKS_BAD_STATE;
+}
+
+void ANeuralNetworksBurst_free(ANeuralNetworksBurst *)
+{
+  // TODO delete burst
+  // delete burst;
+}
+
+int ANeuralNetworksExecution_burstCompute(ANeuralNetworksExecution *, ANeuralNetworksBurst *)
+{
+  VERBOSE(NNAPI::Execution) << "burstCompute: NYI" << std::endl;
+  return ANEURALNETWORKS_BAD_STATE;
+}
diff --git a/runtime/onert/api/nnapi/model.cc b/runtime/onert/api/nnapi/model.cc
index 8c7bd178929..93f091a07dd 100644
--- a/runtime/onert/api/nnapi/model.cc
+++ b/runtime/onert/api/nnapi/model.cc
@@ -174,6 +174,13 @@ int ANeuralNetworksModel_setOperandValue(ANeuralNetworksModel *model, int32_t in
   return ANEURALNETWORKS_NO_ERROR;
 }
 
+int ANeuralNetworksModel_setOperandSymmPerChannelQuantParams(
+  ANeuralNetworksModel *, int32_t, const ANeuralNetworksSymmPerChannelQuantParams *)
+{
+  VERBOSE(NNAPI::Model) << "setOperandSymmPerChannelQuantParams: NYI" << std::endl;
+  return ANEURALNETWORKS_BAD_STATE;
+}
+
 int ANeuralNetworksModel_setOperandValueFromMemory(ANeuralNetworksModel *model, int32_t index,
                                                    const ANeuralNetworksMemory *memory,
                                                    size_t offset, size_t length)
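The NNAPI entry points added above (`setCaching`, the Burst APIs, and `setOperandSymmPerChannelQuantParams`) are deliberate stubs: they log an NYI message and return `ANEURALNETWORKS_BAD_STATE`, so callers linking the new static `onert_nnapi` library resolve these symbols at link time and fail gracefully at run time. A minimal caller-side sketch (hypothetical, not in this patch) of the observable behavior:

// --- Example (not part of the patch): observing the stubbed Burst API ---
#include <NeuralNetworks.h>
#include <cassert>

void expect_burst_unsupported(ANeuralNetworksCompilation *compilation)
{
  ANeuralNetworksBurst *burst = nullptr;
  // The stub logs "BurstCreate: NYI" and reports a failure code instead of
  // leaving the symbol undefined.
  int rc = ANeuralNetworksBurst_create(compilation, &burst);
  assert(rc == ANEURALNETWORKS_BAD_STATE);
  (void)rc; // silence unused-variable warning when asserts are disabled
  ANeuralNetworksBurst_free(burst); // currently a no-op in the stub
}
// --- End example ---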
diff --git a/runtime/onert/backend/train/KernelGenerator.cc b/runtime/onert/backend/train/KernelGenerator.cc
index aaaa50f1b72..7e0d42c25e5 100644
--- a/runtime/onert/backend/train/KernelGenerator.cc
+++ b/runtime/onert/backend/train/KernelGenerator.cc
@@ -405,13 +405,15 @@ void KernelGenerator::visit(const ir::train::operation::Loss &node)
 
   auto loss_code = node.param().loss_code;
   auto loss_param = node.param().loss_param;
+  const auto reduction_type = node.param().reduction_type;
 
   switch (loss_code)
   {
     case ir::train::LossCode::MeanSquaredError:
     {
       auto fn = std::make_unique<ops::LossMeanSquaredErrorLayer>();
-      fn->configure(y_pred_tensor, y_true_tensor, output_tensor, back_prop_y_pred_tensor);
+      fn->configure(y_pred_tensor, y_true_tensor, output_tensor, back_prop_y_pred_tensor,
+                    reduction_type);
       _return_fn = std::move(fn);
       break;
     }
@@ -419,7 +421,7 @@ void KernelGenerator::visit(const ir::train::operation::Loss &node)
     {
       auto fn = std::make_unique<ops::LossCategoricalCrossentropyLayer>();
       fn->configure(y_pred_tensor, y_true_tensor, output_tensor, back_prop_y_pred_tensor,
-                    loss_param.cce.axis, loss_param.cce.label_smoothing);
+                    reduction_type, loss_param.cce.axis, loss_param.cce.label_smoothing);
       _return_fn = std::move(fn);
       break;
     }
diff --git a/runtime/onert/backend/train/MemoryManager.cc b/runtime/onert/backend/train/MemoryManager.cc
index 4902e2a7eaa..fd156fea231 100644
--- a/runtime/onert/backend/train/MemoryManager.cc
+++ b/runtime/onert/backend/train/MemoryManager.cc
@@ -61,13 +61,13 @@ DisposableMemoryManager::DisposableMemoryManager() : _mem_planner{createMemoryPl
 basic::IMemoryPlanner<DisposableTensorIndex> *DisposableMemoryManager::createMemoryPlanner()
 {
   auto planner_id = util::getConfigString(util::config::CPU_MEMORY_PLANNER);
-  return MemoryPlannerFactory::get().create(planner_id);
+  return MemoryPlannerFactory<DisposableTensorIndex>::get().create(planner_id);
 }
 
 basic::IMemoryPlanner<DisposableTensorIndex> *
 DisposableMemoryManager::createMemoryPlanner(const std::string planner_id)
 {
-  return MemoryPlannerFactory::get().create(planner_id);
+  return MemoryPlannerFactory<DisposableTensorIndex>::get().create(planner_id);
 }
 
 void DisposableMemoryManager::claimPlan(const DisposableTensorIndex &ind, uint32_t size)
diff --git a/runtime/onert/backend/train/MemoryManager.h b/runtime/onert/backend/train/MemoryManager.h
index 19a60e32deb..98e840bf7f7 100644
--- a/runtime/onert/backend/train/MemoryManager.h
+++ b/runtime/onert/backend/train/MemoryManager.h
@@ -67,6 +67,8 @@ class DisposableMemoryManager
   std::shared_ptr<basic::Allocator> _mem_alloc;
 };
 
+// TODO: Add LayerScopeMemoryManager using MemoryPlannerFactory
+
 } // namespace train
 } // namespace backend
 } // namespace onert
diff --git a/runtime/onert/backend/train/MemoryPlanner.test.cc b/runtime/onert/backend/train/MemoryPlanner.test.cc
index 05d8ade2620..19b8b537b43 100644
--- a/runtime/onert/backend/train/MemoryPlanner.test.cc
+++ b/runtime/onert/backend/train/MemoryPlanner.test.cc
@@ -17,6 +17,7 @@
 #include <gtest/gtest.h>
 
 #include "DisposableTensorIndex.h"
+#include "LayerScopeTensorIndex.h"
 #include "MemoryPlanner.h"
 #include "ir/Index.h"
 
@@ -25,9 +26,74 @@ using onert::ir::OperandIndex;
 using onert::ir::OperationIndex;
 
 // TODO: Add test testcase for {Bump, FirstFit, WIC}Planner
+namespace
+{
+
+template <typename T> T to_index(uint32_t first, uint32_t second) = delete;
+
+template <> DisposableTensorIndex to_index(uint32_t first, uint32_t second)
+{
+  return DisposableTensorIndex{OperationIndex{first}, OperandIndex{second}};
+}
 
-TEST(BumpPlanner, claim_test)
+template <> LayerScopeTensorIndex to_index(uint32_t first, uint32_t second)
 {
+  return LayerScopeTensorIndex{OperationIndex{first}, second};
+}
+
+template
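The hunk above is cut off at `+template`. For orientation only, and not as the actual continuation of the truncated test code, the sketch below shows the kind of shared test body that `to_index<T>` enables: one templated function can drive planners for both index types. The templated `MemoryPlannerFactory<T>` follows the MemoryManager.cc hunk above, while the planner id "FirstFit" and the raw-pointer lifetime are assumptions based on the `{Bump, FirstFit, WIC}Planner` TODO and the existing factory usage.

// --- Example (not part of the patch): one test body for both index types ---
#include <string>

template <typename T> void smoke_test_planner(const std::string &planner_id)
{
  auto *planner = MemoryPlannerFactory<T>::get().create(planner_id);
  planner->claim(to_index<T>(0, 0), 10); // plan 10 bytes for (op 0, tensor 0)
  planner->release(to_index<T>(0, 0));   // and give them back
  delete planner;
}

// smoke_test_planner<DisposableTensorIndex>("FirstFit");
// smoke_test_planner<LayerScopeTensorIndex>("FirstFit");
// --- End example ---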