From c6801aacdeb87ddf668290d3bd4e7a863f4b88af Mon Sep 17 00:00:00 2001
From: Shaojun_Yao
Date: Tue, 15 Oct 2024 23:20:17 +0800
Subject: [PATCH] [NPUW] extend DQ & PMM processing and make reduceSum not to keep axis (#26779)

### Details:
- extend DQ and PMM to support more patterns, e.g. fp16 matmuls
- Make reduceSum not keep its axis, so that the compiler converts it to
  poolings. Otherwise reduceSum is converted to a convolution, which is
  less efficient than poolings.

### Tickets:
- E-140570

---------

Co-authored-by: Dmitry Matveev
---
 .../src/plugin/npuw/partitioning/patterns/opt.cpp | 15 ++++++++++-----
 1 file changed, 10 insertions(+), 5 deletions(-)

diff --git a/src/plugins/intel_npu/src/plugin/npuw/partitioning/patterns/opt.cpp b/src/plugins/intel_npu/src/plugin/npuw/partitioning/patterns/opt.cpp
index c9a162421fe243..26b24a15509a4f 100644
--- a/src/plugins/intel_npu/src/plugin/npuw/partitioning/patterns/opt.cpp
+++ b/src/plugins/intel_npu/src/plugin/npuw/partitioning/patterns/opt.cpp
@@ -335,7 +335,7 @@ DQMatMulGQ2i::DQMatMulGQ2i(Context::Ref ctx) {
     auto qcvtw = opp::wrap_type({qweight});
     auto qmuls = opp::wrap_type({qcvtw, qcoeff});
     auto qreshp = opp::wrap_type({qmuls, opp::any_input()});
-    auto qcvtr = opp::wrap_type({qreshp});
+    auto qcvtr = opp::optional({qreshp->output(0)});
     auto qmmi = opp::any_input();
     auto qmm = opp::wrap_type({qmmi, qcvtr});
 
@@ -409,13 +409,18 @@ DQMatMulGQ2i::DQMatMulGQ2i(Context::Ref ctx) {
         auto rshp_ccat = std::make_shared(scaled, rshp_ccat_c, false);
 
         auto reduce_axis = std::make_shared(ov::element::i32, ov::Shape{}, 1);
-        auto reduce = std::make_shared(rshp_ccat, reduce_axis, true);
+        // Make reduceSum not to keep axis because then it will convert to poolings in compiler.
+        // Otherwise reduceSum will convert to the convolution which is less efficient than poolings.
+        auto reduce = std::make_shared(rshp_ccat, reduce_axis, false);
 
         auto rshp_out_c = std::make_shared(ov::element::i32, ov::Shape{3}, out_shape);
         auto rshp_out = std::make_shared(reduce, rshp_out_c, false);
 
-        // Convert the result to f32 to maintain the graph contracts. FIXME should be avoided
-        auto out = std::make_shared(rshp_out, ov::element::f32);
+        // Convert the result to f32 to maintain the graph contracts if required.
+        std::shared_ptr out = rshp_out;
+        if (matched_matmul->get_element_type() == ov::element::f32) {
+            out = std::make_shared(rshp_out, ov::element::f32);
+        }
 
         // Now.. Reconnect the matmul readers to the new output (reducesum)
         for (auto&& r : matched_matmul->output(0).get_target_inputs()) {
@@ -752,7 +757,7 @@ void mergeParallelMatMuls(const std::shared_ptr& m, Context& ctx) {
         auto new_cvt = std::make_shared(new_w, new_s->get_element_type());
 
         std::shared_ptr new_mul = std::make_shared(new_cvt, new_s);
-        if (new_s->get_element_type() == ov::element::f16) {
+        if ((new_s->get_element_type() == ov::element::f16) && (orig_multiply.get_element_type() == ov::element::f32)) {
             new_mul = std::make_shared(new_mul, ov::element::f32);
         }
         auto new_w_shape = new_w->get_shape();