From e0eea90b55415090d42bc1ba44cbb18ed111a80b Mon Sep 17 00:00:00 2001 From: Alexandra Sidorova Date: Wed, 27 Dec 2023 10:56:02 +0400 Subject: [PATCH] [Snippets] Propagate work amounts from Subgraph To LinearIR (#21338) --- .../snippets/include/snippets/op/subgraph.hpp | 12 +++--------- src/common/snippets/src/op/subgraph.cpp | 18 ++++++------------ .../snippets/tests/include/lowering_utils.hpp | 1 + .../tests/src/lowered/pass/optimize_domain.cpp | 8 +++----- .../snippets/tests/src/lowering_utils.cpp | 3 ++- src/plugins/intel_cpu/src/nodes/subgraph.cpp | 10 ++++------ .../mul_add_to_fma.cpp | 1 + 7 files changed, 20 insertions(+), 33 deletions(-) diff --git a/src/common/snippets/include/snippets/op/subgraph.hpp b/src/common/snippets/include/snippets/op/subgraph.hpp index 5d5b7f85270a55..f524e1a7e51c67 100644 --- a/src/common/snippets/include/snippets/op/subgraph.hpp +++ b/src/common/snippets/include/snippets/op/subgraph.hpp @@ -106,6 +106,7 @@ class Subgraph : public ov::op::util::SubGraphOp { const std::vector& data_flow_passes = {}, const lowered::pass::PassPipeline& control_flow_passes_pre_common = {}, const lowered::pass::PassPipeline& control_flow_passes_post_common = {}, + size_t min_parallel_work_amount = 8, size_t min_kernel_work_amount = 256, const std::shared_ptr& factory = nullptr, const void* compile_params = nullptr); @@ -119,8 +120,6 @@ class Subgraph : public ov::op::util::SubGraphOp { void set_generator(std::shared_ptr generator); void set_tile_rank(size_t newRank) {tileRank = newRank;} void set_virtual_port_count(size_t count); - void set_min_jit_work_amount(size_t jit_work_amount); - void set_min_parallel_work_amount(size_t parallel_work_amount); void print() const; @@ -143,7 +142,8 @@ class Subgraph : public ov::op::util::SubGraphOp { const std::vector& output_precisions = {}, const std::vector& = {}); std::shared_ptr - convert_body_to_linear_ir(const std::shared_ptr& shape_infer_factory = std::make_shared()); + convert_body_to_linear_ir(size_t min_parallel_work_amount = 8, size_t min_kernel_work_amount = 256, + const std::shared_ptr& shape_infer_factory = std::make_shared()); std::shared_ptr clone() const; private: @@ -176,12 +176,6 @@ class Subgraph : public ov::op::util::SubGraphOp { // True if body has operations that don't support plugin-side domain optimizations // (e.g. Transpose, Softmax, MatMul in general doesn't support dimensions collapsing) bool m_has_domain_sensitive_ops = false; - // Minimal advised work amount for parallel execution. - // Set by a backend, typically equals to the number of threads available on the machine. - size_t m_min_parallel_work_amount = 8; - // Minimal advised work amount every JIT kernel should process during one execution call - // Set by a backend, should be large enough to compensate for the kernel call overheads - size_t m_min_jit_work_amount = 256; } config; std::shared_ptr m_shape_infer = nullptr; diff --git a/src/common/snippets/src/op/subgraph.cpp b/src/common/snippets/src/op/subgraph.cpp index c88e760c4b9d9f..b5e5e5c526dd5b 100644 --- a/src/common/snippets/src/op/subgraph.cpp +++ b/src/common/snippets/src/op/subgraph.cpp @@ -69,14 +69,6 @@ void Subgraph::set_virtual_port_count(const size_t count) { m_virtual_port_count = count; } -void Subgraph::set_min_jit_work_amount(const size_t jit_work_amount) { - config.m_min_jit_work_amount = jit_work_amount; -} - -void Subgraph::set_min_parallel_work_amount(const size_t parallel_work_amount) { - config.m_min_parallel_work_amount = parallel_work_amount; -} - auto Subgraph::is_domain_sensitive_op(const std::shared_ptr& op) -> bool { return ov::is_type(op) || ov::is_type(op) || @@ -347,7 +339,8 @@ VectorDims Subgraph::infer_master_shape() { } std::shared_ptr -Subgraph::convert_body_to_linear_ir(const std::shared_ptr& shape_infer_factory) { +Subgraph::convert_body_to_linear_ir(size_t min_parallel_work_amount, size_t min_kernel_work_amount, + const std::shared_ptr& shape_infer_factory) { lowered::Config lowering_config; lowering_config.m_save_expressions = config.m_has_domain_sensitive_ops; #ifdef SNIPPETS_DEBUG_CAPS @@ -356,8 +349,8 @@ Subgraph::convert_body_to_linear_ir(const std::shared_ptr(body_ptr(), shape_infer_factory, lowering_config); m_shape_infer = m_linear_ir->get_shape_infer_instance(); @@ -475,10 +468,11 @@ snippets::Schedule Subgraph::generate(const BlockedShapeVector& blocked_input_sh const std::vector& data_flow_backend_passes, const lowered::pass::PassPipeline& backend_passes_pre_common, const lowered::pass::PassPipeline& backend_passes_post_common, + size_t min_parallel_work_amount, size_t min_kernel_work_amount, const std::shared_ptr& factory, const void* compile_params) { data_flow_transformations(blocked_input_shapes, input_precisions, output_precisions, data_flow_backend_passes); - convert_body_to_linear_ir(factory); + convert_body_to_linear_ir(min_parallel_work_amount, min_kernel_work_amount, factory); return generate_from_linear_ir(backend_passes_pre_common, backend_passes_post_common, compile_params); } diff --git a/src/common/snippets/tests/include/lowering_utils.hpp b/src/common/snippets/tests/include/lowering_utils.hpp index 379a8f16cec4f0..98fde1b7626f54 100644 --- a/src/common/snippets/tests/include/lowering_utils.hpp +++ b/src/common/snippets/tests/include/lowering_utils.hpp @@ -65,6 +65,7 @@ class LoweringTests : public TransformationTestsF { const ov::snippets::lowered::pass::PassPipeline& lowered_pre_common = {}, const ov::snippets::lowered::pass::PassPipeline& lowered_post_common = {}, const std::shared_ptr& generator = nullptr, + size_t min_parallel_work_amount = 8, size_t min_kernel_work_amount = 256, const std::shared_ptr& factory = std::make_shared()); static std::shared_ptr getTokenizedSubgraph(const std::shared_ptr& f); diff --git a/src/common/snippets/tests/src/lowered/pass/optimize_domain.cpp b/src/common/snippets/tests/src/lowered/pass/optimize_domain.cpp index 025c2406ea33ab..88766a4b3ea886 100644 --- a/src/common/snippets/tests/src/lowered/pass/optimize_domain.cpp +++ b/src/common/snippets/tests/src/lowered/pass/optimize_domain.cpp @@ -44,12 +44,10 @@ void OptimizeDomainTest::SetUp() { TEST_P(OptimizeDomainTest, DomainOptimization) { auto subgraph = LoweringTests::getTokenizedSubgraph(m_model); - subgraph->set_min_jit_work_amount(m_domain_opt_params.min_jit_work_amount); - subgraph->set_min_parallel_work_amount(m_domain_opt_params.min_parallel_work_amount); - auto linear_ir = *subgraph->convert_body_to_linear_ir(); + auto linear_ir = subgraph->convert_body_to_linear_ir(m_domain_opt_params.min_parallel_work_amount, m_domain_opt_params.min_jit_work_amount); size_t loop_depth = 1; - ov::snippets::lowered::pass::OptimizeDomain(loop_depth).run(linear_ir); - const auto& master_shape = linear_ir.get_master_shape(); + ov::snippets::lowered::pass::OptimizeDomain(loop_depth).run(*linear_ir); + const auto& master_shape = linear_ir->get_master_shape(); EXPECT_EQ(loop_depth, m_domain_opt_params.exp_loop_depth) << "Inconsistent loop depth detected"; EXPECT_THAT(master_shape, testing::ContainerEq(m_domain_opt_params.exp_master_shape)) << "Inconsistent master_shape detected"; } diff --git a/src/common/snippets/tests/src/lowering_utils.cpp b/src/common/snippets/tests/src/lowering_utils.cpp index 58d37c196b2c9a..7e630c0dc905dc 100644 --- a/src/common/snippets/tests/src/lowering_utils.cpp +++ b/src/common/snippets/tests/src/lowering_utils.cpp @@ -112,12 +112,13 @@ std::shared_ptr const ov::snippets::lowered::pass::PassPipeline& lowered_pre_common, const ov::snippets::lowered::pass::PassPipeline& lowered_post_common, const std::shared_ptr& generator, + size_t min_parallel_work_amount, size_t min_kernel_work_amount, const std::shared_ptr& factory) { auto subgraph = getTokenizedSubgraph(f); subgraph->set_generator(generator == nullptr ? std::make_shared() : generator); subgraph->set_tile_rank(2); // Note: lowered_pipeline would have no effect on subgraph body, since it's applied on linear IR - subgraph->generate({}, {}, {}, backend_passes, lowered_pre_common, lowered_post_common, factory); + subgraph->generate({}, {}, {}, backend_passes, lowered_pre_common, lowered_post_common, min_parallel_work_amount, min_kernel_work_amount, factory); return subgraph; } diff --git a/src/plugins/intel_cpu/src/nodes/subgraph.cpp b/src/plugins/intel_cpu/src/nodes/subgraph.cpp index a96956952ddcf7..a0c3cd650b4a03 100644 --- a/src/plugins/intel_cpu/src/nodes/subgraph.cpp +++ b/src/plugins/intel_cpu/src/nodes/subgraph.cpp @@ -361,7 +361,10 @@ void Snippet::initOptimalPrimitiveDescriptor() { output_precisions.push_back(p); snippetAttrs.snippet->data_flow_transformations(in_blocked_shapes, input_precisions, output_precisions, backend_passes); - snippetAttrs.snippet->convert_body_to_linear_ir(std::make_shared()); + // Note: minimal JIT work amount is a predefined value that describes the number of kernel iterations (work amount) + // needed to cover kernel call overhead. It is used for balancing between parallel and JIT work amounts in domain optimization. + snippetAttrs.snippet->convert_body_to_linear_ir(static_cast(parallel_get_max_threads()), 256, + std::make_shared()); } ov::element::Type Snippet::getRuntimePrecision() const { @@ -570,11 +573,6 @@ Snippet::SnippetJitExecutor::SnippetJitExecutor(SnippetAttrs attrs, bool is_dyna if (std::any_of(canonicalShape.begin(), canonicalShape.end(), [](size_t x){return x == snippets::IShapeInferSnippets::DYNAMIC_DIMENSION;})) OPENVINO_THROW("Snippets: Canonicalization returned dynamic shape in static pipeline"); - snippetAttrs.snippet->set_min_parallel_work_amount(static_cast(parallel_get_max_threads())); - - // Note: minimal JIT work amount is a predefined value that describes the number of kernel iterations (work amount) - // needed to cover kernel call overhead. It is used for balancing between parallel and JIT work amounts in domain optimization. - snippetAttrs.snippet->set_min_jit_work_amount(256); // generate jit_snippets_compile_args jcp; diff --git a/src/plugins/intel_cpu/tests/unit/snippets_transformations/mul_add_to_fma.cpp b/src/plugins/intel_cpu/tests/unit/snippets_transformations/mul_add_to_fma.cpp index 5912cb9debfc83..836956f1f4cd66 100644 --- a/src/plugins/intel_cpu/tests/unit/snippets_transformations/mul_add_to_fma.cpp +++ b/src/plugins/intel_cpu/tests/unit/snippets_transformations/mul_add_to_fma.cpp @@ -150,6 +150,7 @@ TEST_P(MulAddToFMATests, MulAddToFMATests) { {}, {}, generator, + 8, 256, std::make_shared()); model = subgraph->body_ptr(); model_ref = snippets_model->getLowered();