diff --git a/.github/workflows/causal_lm_cpp.yml b/.github/workflows/causal_lm_cpp.yml index 52f8656344..df03bab7c6 100644 --- a/.github/workflows/causal_lm_cpp.yml +++ b/.github/workflows/causal_lm_cpp.yml @@ -30,6 +30,7 @@ jobs: source ./ov/setupvars.sh python -m pip install --upgrade-strategy eager -r text_generation/causal_lm/cpp/requirements.txt python -m pip install ./thirdparty/openvino_tokenizers/[transformers] + sudo apt-get install libtbb-dev optimum-cli export openvino --trust-remote-code --weight-format fp16 --model openlm-research/open_llama_3b_v2 open_llama_3b_v2 cmake -DCMAKE_BUILD_TYPE=Release -S ./text_generation/causal_lm/cpp/ -B ./build/ cmake --build ./build/ --config Release -j @@ -57,6 +58,7 @@ jobs: source ./ov/setupvars.sh python -m pip install --upgrade-strategy eager -r ./text_generation/causal_lm/cpp/requirements.txt python -m pip install ./thirdparty/openvino_tokenizers/[transformers] + sudo apt-get install libtbb-dev optimum-cli export openvino --trust-remote-code --weight-format fp16 --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 TinyLlama-1.1B-Chat-v1.0 cmake -DCMAKE_BUILD_TYPE=Release -S ./text_generation/causal_lm/cpp/ -B ./build/ cmake --build ./build/ --config Release -j @@ -225,6 +227,7 @@ jobs: source ./ov/setupvars.sh python -m pip install --upgrade-strategy eager -r ./text_generation/causal_lm/cpp/requirements.txt python -m pip install ./thirdparty/openvino_tokenizers/[transformers] + sudo apt-get install libtbb-dev optimum-cli export openvino --trust-remote-code --weight-format fp16 --model Qwen/Qwen-7B-Chat Qwen-7B-Chat cmake -DCMAKE_BUILD_TYPE=Release -S ./text_generation/causal_lm/cpp/ -B ./build/ cmake --build ./build/ --config Release -j @@ -252,6 +255,7 @@ jobs: source ./ov/setupvars.sh python -m pip install --upgrade-strategy eager -r ./text_generation/causal_lm/cpp/requirements.txt python -m pip install ./thirdparty/openvino_tokenizers/[transformers] + sudo apt-get install libtbb-dev optimum-cli export openvino 
--trust-remote-code --weight-format fp16 --model Qwen/Qwen1.5-7B-Chat Qwen1.5-7B-Chat cmake -DCMAKE_BUILD_TYPE=Release -S ./text_generation/causal_lm/cpp/ -B ./build/ cmake --build ./build/ --config Release -j @@ -279,6 +283,7 @@ jobs: source ./ov/setupvars.sh python -m pip install --upgrade-strategy eager -r ./text_generation/causal_lm/cpp/requirements.txt python -m pip install ./thirdparty/openvino_tokenizers/[transformers] + sudo apt-get install libtbb-dev optimum-cli export openvino --trust-remote-code --weight-format fp16 --model microsoft/phi-2 phi-2 cmake -DCMAKE_BUILD_TYPE=Release -S ./text_generation/causal_lm/cpp/ -B ./build/ cmake --build ./build/ --config Release -j 15 @@ -306,6 +311,7 @@ jobs: source ./ov/setupvars.sh python -m pip install --upgrade-strategy eager -r ./text_generation/causal_lm/cpp/requirements.txt python -m pip install ./thirdparty/openvino_tokenizers/[transformers] + sudo apt-get install libtbb-dev optimum-cli export openvino --trust-remote-code --weight-format fp16 --model argilla/notus-7b-v1 notus-7b-v1 cmake -DCMAKE_BUILD_TYPE=Release -S ./text_generation/causal_lm/cpp/ -B ./build/ cmake --build ./build/ --config Release -j @@ -333,6 +339,7 @@ jobs: source ./ov/setupvars.sh python -m pip install --upgrade-strategy eager -r ./text_generation/causal_lm/cpp/requirements.txt python -m pip install ./thirdparty/openvino_tokenizers/[transformers] + sudo apt-get install libtbb-dev optimum-cli export openvino --trust-remote-code --weight-format fp16 --model databricks/dolly-v2-3b dolly-v2-3b optimum-cli export openvino --trust-remote-code --weight-format fp16 --model databricks/dolly-v2-7b dolly-v2-7b cmake -DCMAKE_BUILD_TYPE=Release -S ./text_generation/causal_lm/cpp/ -B ./build/ @@ -371,6 +378,7 @@ jobs: source ./ov/setupvars.sh python -m pip install --upgrade-strategy eager -r ./text_generation/causal_lm/cpp/requirements.txt python -m pip install ./thirdparty/openvino_tokenizers/[transformers] + sudo apt-get install libtbb-dev 
optimum-cli export openvino --trust-remote-code --weight-format fp16 --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 TinyLlama-1.1B-Chat-v1.0 cmake -DCMAKE_BUILD_TYPE=Release -S ./text_generation/causal_lm/cpp/ -B ./build/ cmake --build ./build/ --config Release -j @@ -415,6 +423,7 @@ jobs: source ./ov/setupvars.sh python -m pip install --upgrade-strategy eager -r ./text_generation/causal_lm/cpp/requirements.txt python -m pip install ./thirdparty/openvino_tokenizers/[transformers] + sudo apt-get install libtbb-dev optimum-cli export openvino --trust-remote-code --weight-format fp16 --model microsoft/phi-1_5 phi-1_5 cmake -DCMAKE_BUILD_TYPE=Release -S ./text_generation/causal_lm/cpp/ -B ./build/ cmake --build ./build/ --config Release -j 15 @@ -459,6 +468,7 @@ jobs: source ./ov/setupvars.sh python -m pip install --upgrade-strategy eager -r ./text_generation/causal_lm/cpp/requirements.txt python -m pip install ./thirdparty/openvino_tokenizers/[transformers] + sudo apt-get install libtbb-dev optimum-cli export openvino --trust-remote-code --weight-format fp16 --model ikala/redpajama-3b-chat redpajama-3b-chat cmake -DCMAKE_BUILD_TYPE=Release -S ./text_generation/causal_lm/cpp/ -B ./build/ cmake --build ./build/ --config Release -j diff --git a/text_generation/causal_lm/cpp/CMakeLists.txt b/text_generation/causal_lm/cpp/CMakeLists.txt index eb4cab5048..6da39c6abe 100644 --- a/text_generation/causal_lm/cpp/CMakeLists.txt +++ b/text_generation/causal_lm/cpp/CMakeLists.txt @@ -28,6 +28,8 @@ find_package(OpenVINO REQUIRED COMPONENTS Runtime) target_link_libraries(speculative_decoding_lm PRIVATE openvino::runtime) set_target_properties(speculative_decoding_lm PROPERTIES CXX_STANDARD 17) set_target_properties(speculative_decoding_lm PROPERTIES CXX_STANDARD_REQUIRED ON) +find_package(TBB REQUIRED COMPONENTS tbb) +target_link_libraries(speculative_decoding_lm PRIVATE TBB::tbb) add_executable(prompt_lookup_decoding_lm prompt_lookup_decoding_lm.cpp) 
target_compile_definitions(prompt_lookup_decoding_lm PRIVATE OPENVINO_TOKENIZERS_PATH=\"$<TARGET_FILE:openvino_tokenizers>\") @@ -36,3 +38,5 @@ find_package(OpenVINO REQUIRED COMPONENTS Runtime) target_link_libraries(prompt_lookup_decoding_lm PRIVATE openvino::runtime) set_target_properties(prompt_lookup_decoding_lm PROPERTIES CXX_STANDARD 17) set_target_properties(prompt_lookup_decoding_lm PROPERTIES CXX_STANDARD_REQUIRED ON) +find_package(TBB REQUIRED COMPONENTS tbb) +target_link_libraries(prompt_lookup_decoding_lm PRIVATE TBB::tbb) diff --git a/text_generation/causal_lm/cpp/README.md b/text_generation/causal_lm/cpp/README.md index 21f3a066a4..08b91ab70e 100644 --- a/text_generation/causal_lm/cpp/README.md +++ b/text_generation/causal_lm/cpp/README.md @@ -55,6 +55,15 @@ This approach reduces the need for multiple infer requests to the main model, en Install [OpenVINO Archives >= 2024.1](https://docs.openvino.ai/install). `master` and possibly the latest `releases/*` branch correspond to not yet released OpenVINO versions. https://storage.openvinotoolkit.org/repositories/openvino/packages/nightly/ can be used for these branches early testing. `<INSTALL_DIR>` below refers to the extraction location. +## Install `libtbb-dev` on Linux + +> [!NOTE] +> `tbb` development files are installed with OpenVINO Archive on Windows and macOS.
+ +```sh +sudo apt-get install libtbb-dev +``` + ## Build `greedy_causal_lm`, `beam_search_causal_lm` and `openvino_tokenizers` ### Linux/macOS diff --git a/text_generation/causal_lm/cpp/prompt_lookup_decoding_lm.cpp b/text_generation/causal_lm/cpp/prompt_lookup_decoding_lm.cpp index f4a50e94bb..5060b88642 100644 --- a/text_generation/causal_lm/cpp/prompt_lookup_decoding_lm.cpp +++ b/text_generation/causal_lm/cpp/prompt_lookup_decoding_lm.cpp @@ -1,6 +1,7 @@ // Copyright (C) 2023-2024 Intel Corporation // SPDX-License-Identifier: Apache-2.0 +#include <openvino/core/parallel.hpp> #include <openvino/openvino.hpp> namespace { @@ -94,10 +95,11 @@ ov::Tensor trimm_tensor(ov::Tensor& tensor, uint64_t seq_len_axis, uint64_t new_ void update_kv_cache(ov::InferRequest request, uint64_t seq_len_axis, uint64_t new_seq_len) { // trim kv_cache values up to the new_seq_len - for (auto& state : request.query_state()) { - ov::Tensor old_tensor = state.get_state(); - state.set_state(trimm_tensor(old_tensor, seq_len_axis, new_seq_len)); - } + auto states = request.query_state(); + ov::parallel_for(states.size(), [&](size_t i) { + ov::Tensor old_tensor = states.at(i).get_state(); + states.at(i).set_state(trimm_tensor(old_tensor, seq_len_axis, new_seq_len)); + }); } class PromptLookupCandidateGenerator { diff --git a/text_generation/causal_lm/cpp/speculative_decoding_lm.cpp b/text_generation/causal_lm/cpp/speculative_decoding_lm.cpp index 4aefec14db..b0c40a7a9f 100644 --- a/text_generation/causal_lm/cpp/speculative_decoding_lm.cpp +++ b/text_generation/causal_lm/cpp/speculative_decoding_lm.cpp @@ -2,6 +2,7 @@ // SPDX-License-Identifier: Apache-2.0 #include <cmath> +#include <openvino/core/parallel.hpp> #include <openvino/openvino.hpp> #include <random> @@ -69,6 +70,7 @@ ov::Tensor trimm_tensor(ov::Tensor& tensor, uint64_t seq_len_axis, uint64_t new_ auto old_tensor_data = tensor.data<float>(); auto shape = tensor.get_shape(); + size_t batch_size = shape[0]; size_t num_kv_heads = shape[1]; size_t old_seq_len = shape[2]; size_t head_size = shape[3]; @@ -82,31 +84,23 @@ ov::Tensor trimm_tensor(ov::Tensor& tensor,
uint64_t seq_len_axis, uint64_t new_ if (seq_len_axis == 0) { shape[0] = new_seq_len; tensor.set_shape(shape); + return tensor; } - // if seq_len_axis == 2, then data is not contiguous, in order to trim need to repack tensor - auto new_tensor = ov::Tensor{ov::element::f32, {BATCH_SIZE, num_kv_heads, new_seq_len, head_size}}; - auto new_tensor_data = new_tensor.data<float>(); - for (size_t batch = 0; batch < BATCH_SIZE; ++batch) { - for (size_t i = 0; i < num_kv_heads; ++i) { - for (size_t j = 0; j < new_seq_len; ++j) { - auto dst_ptr = new_tensor_data + num_kv_heads * new_seq_len * head_size * batch + - new_seq_len * head_size * i + head_size * j; - auto src_ptr = old_tensor_data + num_kv_heads * new_seq_len * head_size * batch + - old_seq_len * head_size * i + head_size * j; - std::memcpy(dst_ptr, src_ptr, head_size * sizeof(float)); - } - } - } + ov::Coordinate new_shape_begin{0, 0, 0, 0}; + ov::Coordinate new_shape_end{batch_size, num_kv_heads, new_seq_len, head_size}; + auto new_tensor = ov::Tensor(tensor, new_shape_begin, new_shape_end); + return new_tensor; } void update_kv_cache(ov::InferRequest request, uint64_t seq_len_axis, uint64_t new_seq_len) { // trim kv_cache values up to the new_seq_len - for (auto& state : request.query_state()) { - ov::Tensor old_tensor = state.get_state(); - state.set_state(trimm_tensor(old_tensor, seq_len_axis, new_seq_len)); - } + auto states = request.query_state(); + ov::parallel_for(states.size(), [&](size_t i) { + ov::Tensor old_tensor = states.at(i).get_state(); + states.at(i).set_state(trimm_tensor(old_tensor, seq_len_axis, new_seq_len)); + }); } class AssistedCandidateGenerator {