From c6801aacdeb87ddf668290d3bd4e7a863f4b88af Mon Sep 17 00:00:00 2001
From: Shaojun_Yao
Date: Tue, 15 Oct 2024 23:20:17 +0800
Subject: [PATCH] [NPUW] extend DQ & PMM processing and make reduceSum not to keep axis (#26779)

### Details:
- extend DQ and PMM to support more patterns, e.g. fp16 matmuls
- Make reduceSum not keep its axis, so that the compiler converts it to
  poolings. Otherwise reduceSum is converted to a convolution, which is
  less efficient than poolings.

### Tickets:
- E-140570

---------

Co-authored-by: Dmitry Matveev
---
 .../src/plugin/npuw/partitioning/patterns/opt.cpp | 15 ++++++++++-----
 1 file changed, 10 insertions(+), 5 deletions(-)

diff --git a/src/plugins/intel_npu/src/plugin/npuw/partitioning/patterns/opt.cpp b/src/plugins/intel_npu/src/plugin/npuw/partitioning/patterns/opt.cpp
index c9a162421fe243..26b24a15509a4f 100644
--- a/src/plugins/intel_npu/src/plugin/npuw/partitioning/patterns/opt.cpp
+++ b/src/plugins/intel_npu/src/plugin/npuw/partitioning/patterns/opt.cpp
@@ -335,7 +335,7 @@ DQMatMulGQ2i::DQMatMulGQ2i(Context::Ref ctx) {
     auto qcvtw = opp::wrap_type({qweight});
     auto qmuls = opp::wrap_type({qcvtw, qcoeff});
     auto qreshp = opp::wrap_type({qmuls, opp::any_input()});
-    auto qcvtr = opp::wrap_type({qreshp});
+    auto qcvtr = opp::optional({qreshp->output(0)});
     auto qmmi = opp::any_input();
     auto qmm = opp::wrap_type({qmmi, qcvtr});
 
@@ -409,13 +409,18 @@ DQMatMulGQ2i::DQMatMulGQ2i(Context::Ref ctx) {
         auto rshp_ccat = std::make_shared(scaled, rshp_ccat_c, false);
 
         auto reduce_axis = std::make_shared(ov::element::i32, ov::Shape{}, 1);
-        auto reduce = std::make_shared(rshp_ccat, reduce_axis, true);
+        // Make reduceSum not to keep axis because then it will convert to poolings in compiler.
+        // Otherwise reduceSum will convert to the convolution which is less efficient than poolings.
+        auto reduce = std::make_shared(rshp_ccat, reduce_axis, false);
 
         auto rshp_out_c = std::make_shared(ov::element::i32, ov::Shape{3}, out_shape);
         auto rshp_out = std::make_shared(reduce, rshp_out_c, false);
 
-        // Convert the result to f32 to maintain the graph contracts. FIXME should be avoided
-        auto out = std::make_shared(rshp_out, ov::element::f32);
+        // Convert the result to f32 to maintain the graph contracts if required.
+        std::shared_ptr out = rshp_out;
+        if (matched_matmul->get_element_type() == ov::element::f32) {
+            out = std::make_shared(rshp_out, ov::element::f32);
+        }
 
         // Now.. Reconnect the matmul readers to the new output (reducesum)
         for (auto&& r : matched_matmul->output(0).get_target_inputs()) {
@@ -752,7 +757,7 @@ void mergeParallelMatMuls(const std::shared_ptr& m, Context& ctx) {
         auto new_cvt = std::make_shared(new_w, new_s->get_element_type());
 
         std::shared_ptr new_mul = std::make_shared(new_cvt, new_s);
-        if (new_s->get_element_type() == ov::element::f16) {
+        if ((new_s->get_element_type() == ov::element::f16) && (orig_multiply.get_element_type() == ov::element::f32)) {
             new_mul = std::make_shared(new_mul, ov::element::f32);
         }
         auto new_w_shape = new_w->get_shape();