Skip to content

Commit

Permalink
Merge branch 'master' into cb-by-default-int8-respect-ir
Browse files Browse the repository at this point in the history
  • Loading branch information
ilya-lavrenov authored Jan 22, 2025
2 parents f4a02cb + af41f9c commit 788de0f
Show file tree
Hide file tree
Showing 3 changed files with 56 additions and 22 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/stable_diffusion_1_5_cpp.yml
Original file line number Diff line number Diff line change
Expand Up @@ -153,7 +153,7 @@ jobs:

stable_diffusion_1_5_cpp-windows:
needs: [ openvino_download_windows ]
runs-on: windows-2019
runs-on: windows-2022
defaults:
run:
shell: pwsh
Expand Down
22 changes: 16 additions & 6 deletions .github/workflows/windows.yml
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
name: Windows (VS 2019, Python 3.11)
name: Windows (VS 2022, Python 3.11)
on:
workflow_dispatch:
pull_request:
Expand Down Expand Up @@ -59,7 +59,7 @@ jobs:
defaults:
run:
shell: pwsh
runs-on: windows-2019-16-core
runs-on: windows-2022-16-core
env:
CMAKE_BUILD_TYPE: 'Release'
CMAKE_GENERATOR: 'Ninja Multi-Config'
Expand Down Expand Up @@ -121,6 +121,8 @@ jobs:
- name: Configure Developer Command Prompt for Microsoft Visual C++
uses: ilammy/msvc-dev-cmd@0b201ec74fa43914dc39ae48a89fd1d8cb592756 # v1.13.0
with:
toolset: 14.42 # v2022

- name: CMake configure - OpenVINO
run: |
Expand Down Expand Up @@ -192,7 +194,7 @@ jobs:
defaults:
run:
shell: pwsh
runs-on: windows-2019
runs-on: windows-2022

env:
OV_INSTALL_DIR: ${{ github.workspace }}\\ov
Expand Down Expand Up @@ -225,6 +227,8 @@ jobs:
- name: Configure Developer Command Prompt for Microsoft Visual C++
uses: ilammy/msvc-dev-cmd@0b201ec74fa43914dc39ae48a89fd1d8cb592756 # v1.13.0
with:
toolset: 14.42 # v2022

- name: Build genai libs
run: |
Expand Down Expand Up @@ -257,7 +261,7 @@ jobs:
defaults:
run:
shell: pwsh
runs-on: windows-2019
runs-on: windows-2022

env:
OV_INSTALL_DIR: ${{ github.workspace }}\\ov
Expand Down Expand Up @@ -290,6 +294,8 @@ jobs:
- name: Configure Developer Command Prompt for Microsoft Visual C++
uses: ilammy/msvc-dev-cmd@0b201ec74fa43914dc39ae48a89fd1d8cb592756 # v1.13.0
with:
toolset: 14.42 # v2022

- name: Build genai libs
run: |
Expand Down Expand Up @@ -327,7 +333,7 @@ jobs:
defaults:
run:
shell: pwsh
runs-on: windows-2019-16-core
runs-on: windows-2022-16-core

env:
OV_INSTALL_DIR: ${{ github.workspace }}\\ov
Expand Down Expand Up @@ -360,6 +366,8 @@ jobs:
- name: Configure Developer Command Prompt for Microsoft Visual C++
uses: ilammy/msvc-dev-cmd@0b201ec74fa43914dc39ae48a89fd1d8cb592756 # v1.13.0
with:
toolset: 14.42 # v2022

- name: Build genai libs
run: |
Expand Down Expand Up @@ -388,7 +396,7 @@ jobs:
defaults:
run:
shell: pwsh
runs-on: windows-2019
runs-on: windows-2022

env:
OV_INSTALL_DIR: ${{ github.workspace }}\\ov
Expand Down Expand Up @@ -421,6 +429,8 @@ jobs:
- name: Configure Developer Command Prompt for Microsoft Visual C++
uses: ilammy/msvc-dev-cmd@0b201ec74fa43914dc39ae48a89fd1d8cb592756 # v1.13.0
with:
toolset: 14.42 # v2022

- name: Build genai libs
run: |
Expand Down
54 changes: 39 additions & 15 deletions src/cpp/src/utils.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -233,50 +233,74 @@ ov::genai::TokenizedInputs subtract_chat_tokenized_inputs(const ov::genai::Token
}

namespace {
std::shared_ptr<ov::Node> find_llm_matmul(const std::shared_ptr<ov::Model>& model) {

// Returns true if `function` contains at least one operation whose type name
// equals `type_name` (e.g. "PagedAttentionExtension").
bool has_op_with_type(const std::shared_ptr<const ov::Model>& function, const std::string& type_name) {
    const auto& operations = function->get_ops();
    for (const auto& node : operations) {
        if (type_name == node->get_type_name()) {
            return true;
        }
    }
    return false;
}

// Locates the LM-head MatMul of an LLM graph and returns it together with the
// dimension along which last-token logits can be sliced / gathered.
//
// Returns {nullptr, dim} when no matching MatMul pattern is found; callers are
// expected to check the node before use.
std::tuple<std::shared_ptr<ov::Node>, int64_t> find_llm_matmul(const std::shared_ptr<ov::Model>& model) {
    auto last_node = model->output(0).get_node()->input_value(0).get_node_shared_ptr();
    std::shared_ptr<ov::Node> matmul = ov::as_type_ptr<ov::op::v0::MatMul>(last_node);

    // in case of PA all tokens are moved to batch dimension and we have to slice / gather accordingly
    const bool pa_based_model = has_op_with_type(model, "PagedAttentionExtension");
    int64_t slice_gather_dim = pa_based_model ? 0 : 1;

    // There are several patterns for matmul we are looking for:
    // Matmul -> Result
    // Matmul -> Add -> Result
    // Matmul -> Transpose -> Result
    // MatMul -> Divide -> Tanh -> Multiply -> Result
    if (!matmul) {
        if (auto add = ov::as_type_ptr<ov::op::v1::Add>(last_node)) {
            matmul = ov::as_type_ptr<ov::op::v0::MatMul>(add->input_value(0).get_node_shared_ptr());
        } else if (auto transpose = ov::as_type_ptr<ov::op::v1::Transpose>(last_node)) {
            matmul = ov::as_type_ptr<ov::op::v0::MatMul>(transpose->input_value(0).get_node_shared_ptr());
            // The Transpose permutes axes, so the slice/gather axis must be remapped
            // through the permutation order. Guard against a non-constant order input:
            // the original code dereferenced the cast result unconditionally.
            if (auto order = ov::as_type_ptr<ov::op::v0::Constant>(transpose->input_value(1).get_node_shared_ptr())) {
                slice_gather_dim = order->get_axis_vector_val()[slice_gather_dim];
            }
        } else if (auto multiply = ov::as_type_ptr<ov::op::v1::Multiply>(last_node)) {
            if (auto tanh = ov::as_type_ptr<ov::op::v0::Tanh>(multiply->input_value(0).get_node_shared_ptr())) {
                if (auto divide = ov::as_type_ptr<ov::op::v1::Divide>(tanh->input_value(0).get_node_shared_ptr())) {
                    matmul = ov::as_type_ptr<ov::op::v0::MatMul>(divide->input_value(0).get_node_shared_ptr());
                }
            }
        }
    }
    return std::make_tuple(matmul, slice_gather_dim);
}

} // namespace

// Inserts a Slice in front of the LM-head MatMul so that logits are computed
// only for the last token position instead of the whole sequence.
// No-op when the MatMul is not found or its input rank is not 3
// (rank-3 input is assumed to be [batch, seq, hidden], or token-major for
// PagedAttention models — confirm against find_llm_matmul's dim selection).
void apply_slice_before_matmul_transformation(std::shared_ptr<ov::Model> model) {
    std::shared_ptr<ov::Node> matmul = nullptr;
    int64_t slice_gather_dim = -1;
    std::tie(matmul, slice_gather_dim) = find_llm_matmul(model);

    if (matmul && matmul->input(0).get_partial_shape().rank().get_length() == 3) {
        // Slice [-1:-2:-1] keeps exactly the last element along the token axis
        // reported by find_llm_matmul (PA models move tokens to the batch dim).
        auto start = std::make_shared<ov::op::v0::Constant>(ov::element::i64, ov::Shape{1}, std::vector<int64_t>{-1});
        auto stop = std::make_shared<ov::op::v0::Constant>(ov::element::i64, ov::Shape{1}, std::vector<int64_t>{-2});
        auto step = std::make_shared<ov::op::v0::Constant>(ov::element::i64, ov::Shape{1}, std::vector<int64_t>{-1});
        auto axis = std::make_shared<ov::op::v0::Constant>(ov::element::i64, ov::Shape{1}, std::vector<int64_t>{slice_gather_dim});
        auto slice = std::make_shared<ov::op::v8::Slice>(matmul->input_value(0), start, stop, step, axis);
        matmul->input(0).replace_source_output(slice);
    }
}

void apply_gather_before_matmul_transformation(std::shared_ptr<ov::Model> model) {
auto matmul = ov::genai::utils::find_llm_matmul(model);
std::shared_ptr<ov::Node> matmul = nullptr;
int64_t slice_gather_dim = -1;
std::tie(matmul, slice_gather_dim) = find_llm_matmul(model);

if (matmul && matmul->input(0).get_partial_shape().rank().get_length() == 3) {
auto indices = std::make_shared<ov::op::v0::Parameter>(ov::element::i64, ov::PartialShape{-1});
indices->set_friendly_name("sampled_tokens_indices");
indices->output(0).get_tensor().set_names({"sampled_tokens_indices"});
auto axis = std::make_shared<ov::op::v0::Constant>(ov::element::i64, ov::Shape{1}, std::vector<int64_t>{0});
auto axis = std::make_shared<ov::op::v0::Constant>(ov::element::i64, ov::Shape{1}, std::vector<int64_t>{slice_gather_dim});
auto gather = std::make_shared<ov::op::v8::Gather>(matmul->input_value(0), indices, axis);
matmul->input(0).replace_source_output(gather);
model->add_parameters({indices});
Expand Down

0 comments on commit 788de0f

Please sign in to comment.