openvinotoolkit · Wovchena · May 13, 2024 · May 7, 2024 · May 7, 2024 · May 7, 2024
diff --git a/.github/workflows/causal_lm_cpp.yml b/.github/workflows/causal_lm_cpp.yml
@@ -30,6 +30,7 @@ jobs:
           source ./ov/setupvars.sh
           python -m pip install --upgrade-strategy eager -r text_generation/causal_lm/cpp/requirements.txt
           python -m pip install ./thirdparty/openvino_tokenizers/[transformers]
+          sudo apt-get install libtbb-dev
           optimum-cli export openvino --trust-remote-code --weight-format fp16 --model openlm-research/open_llama_3b_v2 open_llama_3b_v2
           cmake -DCMAKE_BUILD_TYPE=Release -S ./text_generation/causal_lm/cpp/ -B ./build/
           cmake --build ./build/ --config Release -j
@@ -57,6 +58,7 @@ jobs:
           source ./ov/setupvars.sh
           python -m pip install --upgrade-strategy eager -r ./text_generation/causal_lm/cpp/requirements.txt
           python -m pip install ./thirdparty/openvino_tokenizers/[transformers]
+          sudo apt-get install libtbb-dev
           optimum-cli export openvino --trust-remote-code --weight-format fp16 --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 TinyLlama-1.1B-Chat-v1.0
           cmake -DCMAKE_BUILD_TYPE=Release -S ./text_generation/causal_lm/cpp/ -B ./build/
           cmake --build ./build/ --config Release -j
@@ -185,6 +187,7 @@ jobs:
           call w_openvino_toolkit_windows_2024.1.0.15008.f4afc983258_x86_64\setupvars.bat
           python -m pip install --upgrade-strategy eager -r text_generation/causal_lm/cpp/requirements.txt
           python -m pip install ./thirdparty/openvino_tokenizers/[transformers]
+          rem libtbb-dev is an apt (Debian/Ubuntu) package; sudo/apt-get do not exist on this Windows job, so TBB must come from the environment set up above or a oneTBB install.
           optimum-cli export openvino --trust-remote-code --weight-format fp16 --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 TinyLlama-1.1B-Chat-v1.0
           cmake -DCMAKE_BUILD_TYPE=Release -S ./text_generation/causal_lm/cpp/ -B ./build/
           cmake --build ./build/ --config Release -j
@@ -225,6 +228,7 @@ jobs:
           source ./ov/setupvars.sh
           python -m pip install --upgrade-strategy eager -r ./text_generation/causal_lm/cpp/requirements.txt
           python -m pip install ./thirdparty/openvino_tokenizers/[transformers]
+          sudo apt-get install libtbb-dev
           optimum-cli export openvino --trust-remote-code --weight-format fp16 --model Qwen/Qwen-7B-Chat Qwen-7B-Chat
           cmake -DCMAKE_BUILD_TYPE=Release -S ./text_generation/causal_lm/cpp/ -B ./build/
           cmake --build ./build/ --config Release -j
@@ -252,6 +256,7 @@ jobs:
           source ./ov/setupvars.sh
           python -m pip install --upgrade-strategy eager -r ./text_generation/causal_lm/cpp/requirements.txt
           python -m pip install ./thirdparty/openvino_tokenizers/[transformers]
+          sudo apt-get install libtbb-dev
           optimum-cli export openvino --trust-remote-code --weight-format fp16 --model Qwen/Qwen1.5-7B-Chat Qwen1.5-7B-Chat
           cmake -DCMAKE_BUILD_TYPE=Release -S ./text_generation/causal_lm/cpp/ -B ./build/
           cmake --build ./build/ --config Release -j
@@ -279,6 +284,7 @@ jobs:
           source ./ov/setupvars.sh
           python -m pip install --upgrade-strategy eager -r ./text_generation/causal_lm/cpp/requirements.txt
           python -m pip install ./thirdparty/openvino_tokenizers/[transformers]
+          sudo apt-get install libtbb-dev
           optimum-cli export openvino --trust-remote-code --weight-format fp16 --model microsoft/phi-2 phi-2
           cmake -DCMAKE_BUILD_TYPE=Release -S ./text_generation/causal_lm/cpp/ -B ./build/
           cmake --build ./build/ --config Release -j 15
@@ -306,6 +312,7 @@ jobs:
           source ./ov/setupvars.sh
           python -m pip install --upgrade-strategy eager -r ./text_generation/causal_lm/cpp/requirements.txt
           python -m pip install ./thirdparty/openvino_tokenizers/[transformers]
+          sudo apt-get install libtbb-dev
           optimum-cli export openvino --trust-remote-code --weight-format fp16 --model argilla/notus-7b-v1 notus-7b-v1
           cmake -DCMAKE_BUILD_TYPE=Release -S ./text_generation/causal_lm/cpp/ -B ./build/
           cmake --build ./build/ --config Release -j
@@ -333,6 +340,7 @@ jobs:
           source ./ov/setupvars.sh
           python -m pip install --upgrade-strategy eager -r ./text_generation/causal_lm/cpp/requirements.txt
           python -m pip install ./thirdparty/openvino_tokenizers/[transformers]
+          sudo apt-get install libtbb-dev
           optimum-cli export openvino --trust-remote-code --weight-format fp16 --model databricks/dolly-v2-3b dolly-v2-3b
           optimum-cli export openvino --trust-remote-code --weight-format fp16 --model databricks/dolly-v2-7b dolly-v2-7b
           cmake -DCMAKE_BUILD_TYPE=Release -S ./text_generation/causal_lm/cpp/ -B ./build/
@@ -371,6 +379,7 @@ jobs:
           source ./ov/setupvars.sh
           python -m pip install --upgrade-strategy eager -r ./text_generation/causal_lm/cpp/requirements.txt
           python -m pip install ./thirdparty/openvino_tokenizers/[transformers]
+          sudo apt-get install libtbb-dev
           optimum-cli export openvino --trust-remote-code --weight-format fp16 --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 TinyLlama-1.1B-Chat-v1.0
           cmake -DCMAKE_BUILD_TYPE=Release -S ./text_generation/causal_lm/cpp/ -B ./build/
           cmake --build ./build/ --config Release -j
@@ -415,6 +424,7 @@ jobs:
           source ./ov/setupvars.sh
           python -m pip install --upgrade-strategy eager -r ./text_generation/causal_lm/cpp/requirements.txt
           python -m pip install ./thirdparty/openvino_tokenizers/[transformers]
+          sudo apt-get install libtbb-dev
           optimum-cli export openvino --trust-remote-code --weight-format fp16 --model microsoft/phi-1_5 phi-1_5
           cmake -DCMAKE_BUILD_TYPE=Release -S ./text_generation/causal_lm/cpp/ -B ./build/
           cmake --build ./build/ --config Release -j 15
@@ -459,6 +469,7 @@ jobs:
           source ./ov/setupvars.sh
           python -m pip install --upgrade-strategy eager -r ./text_generation/causal_lm/cpp/requirements.txt
           python -m pip install ./thirdparty/openvino_tokenizers/[transformers]
+          sudo apt-get install libtbb-dev
           optimum-cli export openvino --trust-remote-code --weight-format fp16 --model ikala/redpajama-3b-chat redpajama-3b-chat
           cmake -DCMAKE_BUILD_TYPE=Release -S ./text_generation/causal_lm/cpp/ -B ./build/
           cmake --build ./build/ --config Release -j

diff --git a/text_generation/causal_lm/cpp/CMakeLists.txt b/text_generation/causal_lm/cpp/CMakeLists.txt
@@ -28,6 +28,8 @@ find_package(OpenVINO REQUIRED COMPONENTS Runtime)
 target_link_libraries(speculative_decoding_lm PRIVATE openvino::runtime)
 set_target_properties(speculative_decoding_lm PROPERTIES CXX_STANDARD 17)
 set_target_properties(speculative_decoding_lm PROPERTIES CXX_STANDARD_REQUIRED ON)
+find_package(TBB REQUIRED COMPONENTS tbb)  # TBB::tbb is linked unconditionally below, so fail here with a clear message if TBB is missing
+target_link_libraries(speculative_decoding_lm PRIVATE TBB::tbb)
 
 add_executable(prompt_lookup_decoding_lm prompt_lookup_decoding_lm.cpp)
 target_compile_definitions(prompt_lookup_decoding_lm PRIVATE OPENVINO_TOKENIZERS_PATH=\"$<TARGET_FILE:openvino_tokenizers>\")
@@ -36,3 +38,5 @@ find_package(OpenVINO REQUIRED COMPONENTS Runtime)
 target_link_libraries(prompt_lookup_decoding_lm PRIVATE openvino::runtime)
 set_target_properties(prompt_lookup_decoding_lm PROPERTIES CXX_STANDARD 17)
 set_target_properties(prompt_lookup_decoding_lm PROPERTIES CXX_STANDARD_REQUIRED ON)
+find_package(TBB REQUIRED COMPONENTS tbb)  # TBB::tbb is linked unconditionally below, so fail here with a clear message if TBB is missing
+target_link_libraries(prompt_lookup_decoding_lm PRIVATE TBB::tbb)
diff --git a/text_generation/causal_lm/cpp/README.md b/text_generation/causal_lm/cpp/README.md
@@ -55,6 +55,18 @@ This approach reduces the need for multiple infer requests to the main model, en
 
 Install [OpenVINO Archives >= 2024.1](docs.openvino.ai/install). `master` and possibly the latest `releases/*` branch correspond to not yet released OpenVINO versions. https://storage.openvinotoolkit.org/repositories/openvino/packages/nightly/ can be used for these branches early testing. `<INSTALL_DIR>` below refers to the extraction location.
 
+## Install TBB
+
+### Linux
+
+```sh
+sudo apt-get install libtbb-dev
+```
+
+### Windows/macOS
+
+Follow the [installation guide](https://github.com/oneapi-src/oneTBB/blob/master/INSTALL.md). Use the `devel` install component.
+
 ## Build `greedy_causal_lm`, `beam_search_causal_lm` and `openvino_tokenizers`
 
 ### Linux/macOS
@@ -84,6 +96,7 @@ source <INSTALL_DIR>/setupvars.sh
 python3 -m pip install --upgrade-strategy eager -r requirements.txt
 # Update openvino_tokenizers from the submodule
 python3 -m pip install ./../../../thirdparty/openvino_tokenizers/[transformers]
+sudo apt-get install libtbb-dev
 optimum-cli export openvino --trust-remote-code --weight-format fp16 --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 TinyLlama-1.1B-Chat-v1.0
 ```
 

diff --git a/text_generation/causal_lm/cpp/prompt_lookup_decoding_lm.cpp b/text_generation/causal_lm/cpp/prompt_lookup_decoding_lm.cpp
@@ -1,6 +1,7 @@
 // Copyright (C) 2023-2024 Intel Corporation
 // SPDX-License-Identifier: Apache-2.0
 
+#include <openvino/core/parallel.hpp>
 #include <openvino/openvino.hpp>
 
 namespace {
@@ -94,10 +95,11 @@ ov::Tensor trimm_tensor(ov::Tensor& tensor, uint64_t seq_len_axis, uint64_t new_
 
 void update_kv_cache(ov::InferRequest request, uint64_t seq_len_axis, uint64_t new_seq_len) {
     // trim kv_cache values up to the new_seq_len
-    for (auto& state : request.query_state()) {
-        ov::Tensor old_tensor = state.get_state();
-        state.set_state(trimm_tensor(old_tensor, seq_len_axis, new_seq_len));
-    }
+    auto states = request.query_state();
+    ov::parallel_for(states.size(), [&](size_t i) {
+        ov::Tensor old_tensor = states.at(i).get_state();
+        states.at(i).set_state(trimm_tensor(old_tensor, seq_len_axis, new_seq_len));
+    });
 }
 
 class PromptLookupCandidateGenerator {

diff --git a/text_generation/causal_lm/cpp/speculative_decoding_lm.cpp b/text_generation/causal_lm/cpp/speculative_decoding_lm.cpp
@@ -2,6 +2,7 @@
 // SPDX-License-Identifier: Apache-2.0
 
 #include <cmath>
+#include <openvino/core/parallel.hpp>
 #include <openvino/openvino.hpp>
 #include <random>
 
@@ -69,6 +70,7 @@ ov::Tensor trimm_tensor(ov::Tensor& tensor, uint64_t seq_len_axis, uint64_t new_
 
     auto old_tensor_data = tensor.data<float>();
     auto shape = tensor.get_shape();
+    size_t batch_size = shape[0];
     size_t num_kv_heads = shape[1];
     size_t old_seq_len = shape[2];
     size_t head_size = shape[3];
@@ -82,31 +84,23 @@ ov::Tensor trimm_tensor(ov::Tensor& tensor, uint64_t seq_len_axis, uint64_t new_
     if (seq_len_axis == 0) {
         shape[0] = new_seq_len;
         tensor.set_shape(shape);
+        return tensor;
     }
 
-    // if seq_len_axis == 2, then data is not contiguous, in order to trim need to repack tensor
-    auto new_tensor = ov::Tensor{ov::element::f32, {BATCH_SIZE, num_kv_heads, new_seq_len, head_size}};
-    auto new_tensor_data = new_tensor.data<float>();
-    for (size_t batch = 0; batch < BATCH_SIZE; ++batch) {
-        for (size_t i = 0; i < num_kv_heads; ++i) {
-            for (size_t j = 0; j < new_seq_len; ++j) {
-                auto dst_ptr = new_tensor_data + num_kv_heads * new_seq_len * head_size * batch +
-                               new_seq_len * head_size * i + head_size * j;
-                auto src_ptr = old_tensor_data + num_kv_heads * new_seq_len * head_size * batch +
-                               old_seq_len * head_size * i + head_size * j;
-                std::memcpy(dst_ptr, src_ptr, head_size * sizeof(float));
-            }
-        }
-    }
+    ov::Coordinate new_shape_begin{0, 0, 0, 0};
+    ov::Coordinate new_shape_end{batch_size, num_kv_heads, new_seq_len, head_size};
+    auto new_tensor = ov::Tensor(tensor, new_shape_begin, new_shape_end);
+
     return new_tensor;
 }
 
 void update_kv_cache(ov::InferRequest request, uint64_t seq_len_axis, uint64_t new_seq_len) {
     // trim kv_cache values up to the new_seq_len
-    for (auto& state : request.query_state()) {
-        ov::Tensor old_tensor = state.get_state();
-        state.set_state(trimm_tensor(old_tensor, seq_len_axis, new_seq_len));
-    }
+    auto states = request.query_state();
+    ov::parallel_for(states.size(), [&](size_t i) {
+        ov::Tensor old_tensor = states.at(i).get_state();
+        states.at(i).set_state(trimm_tensor(old_tensor, seq_len_axis, new_seq_len));
+    });
 }
 
 class AssistedCandidateGenerator {