From 6cd66d0274ddc8fde544643f74113fb6c40d2394 Mon Sep 17 00:00:00 2001 From: Anna Likholat Date: Wed, 20 Nov 2024 19:17:38 +0100 Subject: [PATCH 01/24] Text2Image Readme update: decode method usage (#1237) --- samples/cpp/text2image/README.md | 6 ++-- samples/python/text2image/README.md | 4 ++- .../src/image_generation/flux_pipeline.hpp | 36 ++++++++++--------- 3 files changed, 27 insertions(+), 19 deletions(-) diff --git a/samples/cpp/text2image/README.md b/samples/cpp/text2image/README.md index c5ffd53a84..ac736b2383 100644 --- a/samples/cpp/text2image/README.md +++ b/samples/cpp/text2image/README.md @@ -46,14 +46,16 @@ You can also add a callback to the `main.cpp` file to interrupt the image genera Please find the template of the callback usage below. ```cpp -auto callback = [](size_t step, ov::Tensor& intermediate_res) -> bool { +ov::genai::Text2ImagePipeline pipe(models_path, device); + +auto callback = [&](size_t step, ov::Tensor& intermediate_res) -> bool { std::cout << "Image generation step: " << step << std::endl; + ov::Tensor img = pipe.decode(intermediate_res); // get intermediate image tensor if (your_condition) // return true if you want to interrupt image generation return true; return false; }; -ov::genai::Text2ImagePipeline pipe(models_path, device); ov::Tensor image = pipe.generate(prompt, ... ov::genai::callback(callback) diff --git a/samples/python/text2image/README.md b/samples/python/text2image/README.md index 9421061885..2e841673d3 100644 --- a/samples/python/text2image/README.md +++ b/samples/python/text2image/README.md @@ -46,13 +46,15 @@ You can also add a callback to the `main.py` file to interrupt the image generat Please find the template of the callback usage below. ```python +pipe = openvino_genai.Text2ImagePipeline(model_dir, device) + def callback(step, intermediate_res): print("Image generation step: ", step) + image_tensor = pipe.decode(intermediate_res) # get intermediate image tensor if your_condition: # return True if you want to interrupt image generation return True return False -pipe = openvino_genai.Text2ImagePipeline(model_dir, device) image = pipe.generate( ... callback = callback diff --git a/src/cpp/src/image_generation/flux_pipeline.hpp b/src/cpp/src/image_generation/flux_pipeline.hpp index 101401d434..e684443e47 100644 --- a/src/cpp/src/image_generation/flux_pipeline.hpp +++ b/src/cpp/src/image_generation/flux_pipeline.hpp @@ -297,33 +297,33 @@ class FluxPipeline : public DiffusionPipeline { ov::Tensor generate(const std::string& positive_prompt, ov::Tensor initial_image, const ov::AnyMap& properties) override { - ImageGenerationConfig generation_config = m_generation_config; - generation_config.update_generation_config(properties); + m_custom_generation_config = m_generation_config; + m_custom_generation_config.update_generation_config(properties); if (!initial_image) { // in case of typical text to image generation, we need to ignore 'strength' - generation_config.strength = 1.0f; + m_custom_generation_config.strength = 1.0f; } const size_t vae_scale_factor = m_vae->get_vae_scale_factor(); const auto& transformer_config = m_transformer->get_config(); - if (generation_config.height < 0) - generation_config.height = transformer_config.m_default_sample_size * vae_scale_factor; - if (generation_config.width < 0) - generation_config.width = transformer_config.m_default_sample_size * vae_scale_factor; + if (m_custom_generation_config.height < 0) + m_custom_generation_config.height = transformer_config.m_default_sample_size * vae_scale_factor; + if (m_custom_generation_config.width < 0) + m_custom_generation_config.width = transformer_config.m_default_sample_size * vae_scale_factor; - check_inputs(generation_config, initial_image); + check_inputs(m_custom_generation_config, initial_image); - compute_hidden_states(positive_prompt, generation_config); + compute_hidden_states(positive_prompt, m_custom_generation_config); - ov::Tensor latents = prepare_latents(initial_image, generation_config); + ov::Tensor latents = prepare_latents(initial_image, m_custom_generation_config); size_t image_seq_len = latents.get_shape()[1]; float mu = m_scheduler->calculate_shift(image_seq_len); - float linspace_end = 1.0f / generation_config.num_inference_steps; - std::vector sigmas = numpy_utils::linspace(1.0f, linspace_end, generation_config.num_inference_steps, true); + float linspace_end = 1.0f / m_custom_generation_config.num_inference_steps; + std::vector sigmas = numpy_utils::linspace(1.0f, linspace_end, m_custom_generation_config.num_inference_steps, true); m_scheduler->set_timesteps_with_sigma(sigmas, mu); std::vector timesteps = m_scheduler->get_float_timesteps(); @@ -345,7 +345,7 @@ class FluxPipeline : public DiffusionPipeline { ov::Tensor noise_pred_tensor = m_transformer->infer(latents, timestep); - auto scheduler_step_result = m_scheduler->step(noise_pred_tensor, latents, inference_step, generation_config.generator); + auto scheduler_step_result = m_scheduler->step(noise_pred_tensor, latents, inference_step, m_custom_generation_config.generator); latents = scheduler_step_result["latent"]; if (do_callback) { @@ -355,12 +355,16 @@ class FluxPipeline : public DiffusionPipeline { } } - latents = unpack_latents(latents, generation_config.height, generation_config.width, vae_scale_factor); + latents = unpack_latents(latents, m_custom_generation_config.height, m_custom_generation_config.width, vae_scale_factor); return m_vae->decode(latents); } ov::Tensor decode(const ov::Tensor latent) override { - return m_vae->decode(latent); + ov::Tensor unpacked_latent = unpack_latents(latent, + m_custom_generation_config.height, + m_custom_generation_config.width, + m_vae->get_vae_scale_factor()); + return m_vae->decode(unpacked_latent); } private: @@ -407,7 +411,7 @@ class FluxPipeline : public DiffusionPipeline { std::shared_ptr m_clip_text_encoder; std::shared_ptr m_t5_text_encoder; std::shared_ptr m_vae; - + ImageGenerationConfig m_custom_generation_config; }; } // namespace genai From cd05c8eb9ce1eb22411c2107afcdb1b3e2344fa9 Mon Sep 17 00:00:00 2001 From: Anastasiia Pnevskaia Date: Wed, 20 Nov 2024 20:03:33 +0100 Subject: [PATCH 02/24] Fixed passing of generation config params to VLM generate. (#1180) - Fixed passing of generation config params to VLM generate(). - Updated generation config params params list in `update_config_from_kwargs()` method. Ticket: CVS-157050 --------- Co-authored-by: Ilya Lavrenov --- .../openvino/genai/generation_config.hpp | 2 +- src/cpp/src/llm_pipeline_static.cpp | 7 + src/cpp/src/utils.hpp | 22 +- .../openvino_genai/py_openvino_genai.pyi | 2 +- src/python/py_image_generation_pipelines.cpp | 108 +-------- src/python/py_tokenizer.cpp | 13 +- src/python/py_utils.cpp | 209 ++++++++++++------ src/python/py_utils.hpp | 2 +- src/python/py_vlm_pipeline.cpp | 44 +--- src/python/py_whisper_pipeline.cpp | 55 +---- tests/cpp/utils.cpp | 21 ++ tests/python_tests/test_generate_api.py | 11 +- tests/python_tests/test_sampling.py | 6 +- 13 files changed, 220 insertions(+), 282 deletions(-) create mode 100644 tests/cpp/utils.cpp diff --git a/src/cpp/include/openvino/genai/generation_config.hpp b/src/cpp/include/openvino/genai/generation_config.hpp index 22edcb98c0..8d23b298ba 100644 --- a/src/cpp/include/openvino/genai/generation_config.hpp +++ b/src/cpp/include/openvino/genai/generation_config.hpp @@ -156,7 +156,7 @@ static constexpr ov::Property ignore_eos{"ignore_eos"}; static constexpr ov::Property min_new_tokens{"min_new_tokens"}; static constexpr ov::Property> stop_strings{"stop_strings"}; static constexpr ov::Property include_stop_str_in_output{"include_stop_str_in_output"}; -static constexpr ov::Property>> stop_token_ids{"stop_token_ids"}; +static constexpr ov::Property> stop_token_ids{"stop_token_ids"}; static constexpr ov::Property num_beam_groups{"num_beam_groups"}; static constexpr ov::Property num_beams{"num_beams"}; diff --git a/src/cpp/src/llm_pipeline_static.cpp b/src/cpp/src/llm_pipeline_static.cpp index 40089384a8..2beb7d64be 100644 --- a/src/cpp/src/llm_pipeline_static.cpp +++ b/src/cpp/src/llm_pipeline_static.cpp @@ -530,6 +530,13 @@ template T pop_or_default(ov::AnyMap& config, const std::string& key, const T& default_value) { auto anyopt = pop_option(config, key); if (anyopt.has_value()) { + if (anyopt.value().empty()) { + if (ov::genai::utils::is_container) + return T{}; + else { + OPENVINO_THROW("Got empty ov::Any for key: " + key); + } + } return anyopt.value().as(); } return default_value; diff --git a/src/cpp/src/utils.hpp b/src/cpp/src/utils.hpp index 9adc46c87a..3487fccb81 100644 --- a/src/cpp/src/utils.hpp +++ b/src/cpp/src/utils.hpp @@ -2,6 +2,7 @@ // SPDX-License-Identifier: Apache-2.0 #pragma once +#include #include "openvino/genai/llm_pipeline.hpp" #include "openvino/runtime/core.hpp" @@ -12,6 +13,16 @@ namespace ov { namespace genai { namespace utils { +// Variable template that checks if a type has begin() and end() member functions +template +constexpr bool is_container = false; + +template +constexpr bool is_container().begin()), + decltype(std::declval().end())>> = true; + + Tensor init_attention_mask(const Tensor& position_ids); void print_tensor(const ov::Tensor& tensor); @@ -31,7 +42,16 @@ template void read_anymap_param(const ov::AnyMap& config_map, const std::string& name, T& param) { auto it = config_map.find(name); if (it != config_map.end()) { - param = it->second.as::value>(); + if (it->second.empty()) { + if (ov::genai::utils::is_container) + param = T{}; + else { + OPENVINO_THROW("Got empty ov::Any for parameter name: " + name); + } + } + else { + param = it->second.as::value>(); + } } } diff --git a/src/python/openvino_genai/py_openvino_genai.pyi b/src/python/openvino_genai/py_openvino_genai.pyi index a16b74b703..df290a9744 100644 --- a/src/python/openvino_genai/py_openvino_genai.pyi +++ b/src/python/openvino_genai/py_openvino_genai.pyi @@ -1296,7 +1296,7 @@ class Tokenizer: openvino_genai.Tokenizer object is used to initialize Tokenizer if it's located in a different path than the main model. """ - def __init__(self, tokenizer_path: os.PathLike, properties: dict[str, typing.Any] = {}) -> None: + def __init__(self, tokenizer_path: os.PathLike, properties: dict[str, typing.Any] = {}, **kwargs) -> None: ... def apply_chat_template(self, history: list[dict[str, str]], add_generation_prompt: bool, chat_template: str = '') -> str: """ diff --git a/src/python/py_image_generation_pipelines.cpp b/src/python/py_image_generation_pipelines.cpp index f70faaca61..dade8a170e 100644 --- a/src/python/py_image_generation_pipelines.cpp +++ b/src/python/py_image_generation_pipelines.cpp @@ -67,108 +67,6 @@ auto text2image_generate_docstring = R"( )"; -void update_image_generation_config_from_kwargs( - ov::genai::ImageGenerationConfig& config, - const py::kwargs& kwargs) { - for (const auto& item : kwargs) { - std::string key = py::cast(item.first); - py::object value = py::cast(item.second); - - if (key == "prompt_2") { - config.prompt_2 = py::cast(value); - } else if (key == "prompt_3") { - config.prompt_3 = py::cast(value); - } else if (key == "negative_prompt") { - config.negative_prompt = py::cast(value); - } else if (key == "negative_prompt_2") { - config.negative_prompt_2 = py::cast(value); - } else if (key == "negative_prompt_3") { - config.negative_prompt_3 = py::cast(value); - } else if (key == "num_images_per_prompt") { - config.num_images_per_prompt = py::cast(value); - } else if (key == "guidance_scale") { - config.guidance_scale = py::cast(value); - } else if (key == "height") { - config.height = py::cast(value); - } else if (key == "width") { - config.width = py::cast(value); - } else if (key == "num_inference_steps") { - config.num_inference_steps = py::cast(value); - } else if (key == "generator") { - auto py_generator = py::cast>(value); - config.generator = py_generator; - } else if (key == "adapters") { - config.adapters = py::cast(value); - } else if (key == "strength") { - config.strength = py::cast(value); - } else if (key == "max_sequence_length") { - config.max_sequence_length = py::cast(value); - } else { - throw(std::invalid_argument("'" + key + "' is unexpected parameter name. " - "Use help(openvino_genai.ImageGenerationConfig) to get list of acceptable parameters.")); - } - } -} - -ov::AnyMap text2image_kwargs_to_any_map(const py::kwargs& kwargs, bool allow_compile_properties=true) { - ov::AnyMap params = {}; - - for (const auto& item : kwargs) { - std::string key = py::cast(item.first); - py::object value = py::cast(item.second); - - if (key == "prompt_2") { - params.insert({ov::genai::prompt_2(std::move(py::cast(value)))}); - } else if (key == "prompt_3") { - params.insert({ov::genai::prompt_3(std::move(py::cast(value)))}); - } else if (key == "negative_prompt") { - params.insert({ov::genai::negative_prompt(std::move(py::cast(value)))}); - } else if (key == "negative_prompt_2") { - params.insert({ov::genai::negative_prompt_2(std::move(py::cast(value)))}); - } else if (key == "negative_prompt_3") { - params.insert({ov::genai::negative_prompt_3(std::move(py::cast(value)))}); - } else if (key == "num_images_per_prompt") { - params.insert({ov::genai::num_images_per_prompt(std::move(py::cast(value)))}); - } else if (key == "guidance_scale") { - params.insert({ov::genai::guidance_scale(std::move(py::cast(value)))}); - } else if (key == "height") { - params.insert({ov::genai::height(std::move(py::cast(value)))}); - } else if (key == "width") { - params.insert({ov::genai::width(std::move(py::cast(value)))}); - } else if (key == "num_inference_steps") { - params.insert({ov::genai::num_inference_steps(std::move(py::cast(value)))}); - } else if (key == "generator") { - auto py_generator =py::cast>(value); - params.insert({ov::genai::generator(std::move(py_generator))}); - } else if (key == "adapters") { - params.insert({ov::genai::adapters(std::move(py::cast(value)))}); - } else if (key == "strength") { - params.insert({ov::genai::strength(std::move(py::cast(value)))}); - } else if (key == "max_sequence_length") { - params.insert({ov::genai::max_sequence_length(std::move(py::cast(value)))}); - } else if (key == "callback") { - params.insert({ov::genai::callback(std::move(py::cast>(value)))}); - } - else { - if (allow_compile_properties) { - // convert arbitrary objects to ov::Any - // not supported properties are not checked, as these properties are passed to compile(), which will throw exception in case of unsupported property - if (pyutils::py_object_is_any_map(value)) { - auto map = pyutils::py_object_to_any_map(value); - params.insert(map.begin(), map.end()); - } else { - params[key] = pyutils::py_object_to_any(value); - } - } - else { - // generate doesn't run compile(), so only Text2ImagePipeline specific properties are allowed - throw(std::invalid_argument("'" + key + "' is unexpected parameter name. " - "Use help(openvino_genai.Text2ImagePipeline.generate) to get list of acceptable parameters.")); - } - } - } - return params; -} } // namespace @@ -230,7 +128,7 @@ void init_image_generation_pipelines(py::module_& m) { .def("update_generation_config", []( ov::genai::ImageGenerationConfig config, const py::kwargs& kwargs) { - update_image_generation_config_from_kwargs(config, kwargs); + config.update_generation_config(pyutils::kwargs_to_any_map(kwargs)); }); auto text2image_pipeline = py::class_(m, "Text2ImagePipeline", "This class is used for generation with text-to-image models.") @@ -252,7 +150,7 @@ void init_image_generation_pipelines(py::module_& m) { const py::kwargs& kwargs ) { ScopedVar env_manager(pyutils::ov_tokenizers_module_path()); - return std::make_unique(models_path, device, text2image_kwargs_to_any_map(kwargs, true)); + return std::make_unique(models_path, device, pyutils::kwargs_to_any_map(kwargs)); }), py::arg("models_path"), "folder with exported model files.", py::arg("device"), "device on which inference will be done", @@ -289,7 +187,7 @@ void init_image_generation_pipelines(py::module_& m) { const std::string& prompt, const py::kwargs& kwargs ) -> py::typing::Union { - ov::AnyMap params = text2image_kwargs_to_any_map(kwargs, false); + ov::AnyMap params = pyutils::kwargs_to_any_map(kwargs); return py::cast(pipe.generate(prompt, params)); }, py::arg("prompt"), "Input string", diff --git a/src/python/py_tokenizer.cpp b/src/python/py_tokenizer.cpp index b3c52cd28b..2ccccff4c0 100644 --- a/src/python/py_tokenizer.cpp +++ b/src/python/py_tokenizer.cpp @@ -30,9 +30,18 @@ void init_tokenizer(py::module_& m) { R"(openvino_genai.Tokenizer object is used to initialize Tokenizer if it's located in a different path than the main model.)") - .def(py::init([](const std::filesystem::path& tokenizer_path, const std::map& properties) { + .def(py::init([](const std::filesystem::path& tokenizer_path, const std::map& properties, const py::kwargs& kwargs) { ScopedVar env_manager(pyutils::ov_tokenizers_module_path()); - return std::make_unique(tokenizer_path, pyutils::properties_to_any_map(properties)); + auto kwargs_properties = pyutils::kwargs_to_any_map(kwargs); + if (properties.size()) { + PyErr_WarnEx(PyExc_DeprecationWarning, + "'properties' parameters is deprecated, please use kwargs to pass config properties instead.", + 1); + auto map_properties = pyutils::properties_to_any_map(properties); + kwargs_properties.insert(map_properties.begin(), map_properties.end()); + } + + return std::make_unique(tokenizer_path, kwargs_properties); }), py::arg("tokenizer_path"), py::arg("properties") = ov::AnyMap({})) .def("encode", [](Tokenizer& tok, std::vector& prompts, bool add_special_tokens) { diff --git a/src/python/py_utils.cpp b/src/python/py_utils.cpp index a2e8630059..579fe6b789 100644 --- a/src/python/py_utils.cpp +++ b/src/python/py_utils.cpp @@ -6,11 +6,15 @@ #include #include #include +#include #include #include "tokenizers_path.hpp" #include "openvino/genai/llm_pipeline.hpp" +#include "openvino/genai/visual_language/pipeline.hpp" +#include "openvino/genai/image_generation/generation_config.hpp" +#include "openvino/genai/whisper_generation_config.hpp" namespace py = pybind11; namespace ov::genai::pybind::utils { @@ -43,7 +47,7 @@ bool py_object_is_any_map(const py::object& py_obj) { }); } -ov::Any py_object_to_any(const py::object& py_obj); +ov::Any py_object_to_any(const py::object& py_obj, std::string property_name); ov::AnyMap py_object_to_any_map(const py::object& py_obj) { OPENVINO_ASSERT(py_object_is_any_map(py_obj), "Unsupported attribute type."); @@ -54,16 +58,34 @@ ov::AnyMap py_object_to_any_map(const py::object& py_obj) { if (py_object_is_any_map(value)) { return_value[key] = py_object_to_any_map(value); } else { - return_value[key] = py_object_to_any(value); + return_value[key] = py_object_to_any(value, key); } } return return_value; } -ov::Any py_object_to_any(const py::object& py_obj) { +ov::Any py_object_to_any(const py::object& py_obj, std::string property_name) { // Python types + // TODO: Remove this after ov::Any is fixed to allow pass types, that can be casted to target type. Ticket: 157622 + std::set size_t_properties = { + "max_new_tokens", + "max_length", + "min_new_tokens", + "logprobs", + "num_beam_groups", + "num_beams", + "num_return_sequences", + "no_repeat_ngram_size", + "top_k", + "rng_seed", + "num_assistant_tokens", + "max_initial_timestamp_index", + "num_images_per_prompt", + "num_inference_steps", + "max_sequence_length" + }; + py::object float_32_type = py::module_::import("numpy").attr("float32"); - if (py::isinstance(py_obj)) { return py_obj.cast(); } else if (py::isinstance(py_obj)) { @@ -71,16 +93,19 @@ ov::Any py_object_to_any(const py::object& py_obj) { } else if (py::isinstance(py_obj)) { return py_obj.cast(); } else if (py::isinstance(py_obj)) { - return py_obj.cast(); + return py_obj.cast(); } else if (py::isinstance(py_obj, float_32_type)) { return py_obj.cast(); } else if (py::isinstance(py_obj)) { + if (size_t_properties.find(property_name) != size_t_properties.end()) { + return py_obj.cast(); + } return py_obj.cast(); } else if (py::isinstance(py_obj)) { return {}; } else if (py::isinstance(py_obj)) { auto _list = py_obj.cast(); - enum class PY_TYPE : int { UNKNOWN = 0, STR, INT, FLOAT, BOOL, PARTIAL_SHAPE }; + enum class PY_TYPE : int { UNKNOWN = 0, STR, INT, FLOAT, BOOL, PARTIAL_SHAPE, TENSOR}; PY_TYPE detected_type = PY_TYPE::UNKNOWN; for (const auto& it : _list) { auto check_type = [&](PY_TYPE type) { @@ -88,7 +113,7 @@ ov::Any py_object_to_any(const py::object& py_obj) { detected_type = type; return; } - OPENVINO_THROW("Incorrect attribute. Mixed types in the list are not allowed."); + OPENVINO_THROW("Incorrect value in \"" + property_name + "\". Mixed types in the list are not allowed."); }; if (py::isinstance(it)) { check_type(PY_TYPE::STR); @@ -100,6 +125,8 @@ ov::Any py_object_to_any(const py::object& py_obj) { check_type(PY_TYPE::BOOL); } else if (py::isinstance(it)) { check_type(PY_TYPE::PARTIAL_SHAPE); + } else if (py::isinstance(it)) { + check_type(PY_TYPE::TENSOR); } } @@ -117,10 +144,89 @@ ov::Any py_object_to_any(const py::object& py_obj) { return _list.cast>(); case PY_TYPE::PARTIAL_SHAPE: return _list.cast>(); + case PY_TYPE::TENSOR: + return _list.cast>(); + default: + OPENVINO_THROW("Property \"" + property_name + "\" got unsupported type."); + } + + } else if (py::isinstance(py_obj)) { + auto _dict = py_obj.cast(); + enum class PY_TYPE : int { UNKNOWN = 0, STR, INT}; + PY_TYPE detected_key_type = PY_TYPE::UNKNOWN; + PY_TYPE detected_value_type = PY_TYPE::UNKNOWN; + for (const auto& it : _dict) { + auto check_type = [&](PY_TYPE type, PY_TYPE& detected_type) { + if (detected_type == PY_TYPE::UNKNOWN || detected_type == type) { + detected_type = type; + return; + } + OPENVINO_THROW("Incorrect value in \"" + property_name + "\". Mixed types in the dict are not allowed."); + }; + // check key type + if (py::isinstance(it.first)) { + check_type(PY_TYPE::STR, detected_key_type); + } + + // check value type + if (py::isinstance(it.second)) { + check_type(PY_TYPE::INT, detected_value_type); + } + } + if (_dict.empty()) { + return ov::Any(); + } + + switch (detected_key_type) { + case PY_TYPE::STR: + switch (detected_value_type) { + case PY_TYPE::INT: + return _dict.cast>(); + default: + OPENVINO_THROW("Property \"" + property_name + "\" got unsupported type."); + } + default: + OPENVINO_THROW("Property \"" + property_name + "\" got unsupported type."); + } + } else if (py::isinstance(py_obj)) { + auto _set = py_obj.cast(); + enum class PY_TYPE : int { UNKNOWN = 0, STR, INT, FLOAT, BOOL}; + PY_TYPE detected_type = PY_TYPE::UNKNOWN; + for (const auto& it : _set) { + auto check_type = [&](PY_TYPE type) { + if (detected_type == PY_TYPE::UNKNOWN || detected_type == type) { + detected_type = type; + return; + } + OPENVINO_THROW("Incorrect value in \"" + property_name + "\". Mixed types in the set are not allowed."); + }; + if (py::isinstance(it)) { + check_type(PY_TYPE::STR); + } else if (py::isinstance(it)) { + check_type(PY_TYPE::INT); + } else if (py::isinstance(it)) { + check_type(PY_TYPE::FLOAT); + } else if (py::isinstance(it)) { + check_type(PY_TYPE::BOOL); + } + } + + if (_set.empty()) + return ov::Any(); + + switch (detected_type) { + case PY_TYPE::STR: + return _set.cast>(); + case PY_TYPE::FLOAT: + return _set.cast>(); + case PY_TYPE::INT: + return _set.cast>(); + case PY_TYPE::BOOL: + return _set.cast>(); default: - OPENVINO_ASSERT(false, "Unsupported attribute type."); + OPENVINO_THROW("Property \"" + property_name + "\" got unsupported type."); } - + // OV types } else if (py_object_is_any_map(py_obj)) { return py_object_to_any_map(py_obj); @@ -156,18 +262,33 @@ ov::Any py_object_to_any(const py::object& py_obj) { return py::cast>(py_obj); } else if (py::isinstance(py_obj)) { return py::cast(py_obj); - } else if (py::isinstance(py_obj)) { + } else if (py::isinstance(py_obj)) { return py::cast(py_obj); + } else if (py::isinstance(py_obj)) { + return py::cast(py_obj); + } else if (py::isinstance(py_obj)) { + return py::cast(py_obj); + } else if (py::isinstance(py_obj)) { + return py::cast(py_obj); + } else if (py::isinstance(py_obj)) { + return py::cast(py_obj); + } else if (py::isinstance(py_obj)) { + return py::cast>(py_obj); + } else if (py::isinstance(py_obj) && property_name == "callback") { + return py::cast>(py_obj); + } else if ((py::isinstance(py_obj) || py::isinstance(py_obj) || py::isinstance(py_obj)) && property_name == "streamer") { + auto streamer = py::cast(py_obj); + return ov::genai::streamer(pystreamer_to_streamer(streamer)).second; } else if (py::isinstance(py_obj)) { return py_obj; } - OPENVINO_ASSERT(false, "Unsupported attribute type."); + OPENVINO_THROW("Property \"" + property_name + "\" got unsupported type."); } std::map properties_to_any_map(const std::map& properties) { std::map properties_to_cpp; for (const auto& property : properties) { - properties_to_cpp[property.first] = py_object_to_any(property.second); + properties_to_cpp[property.first] = py_object_to_any(property.second, property.first); } return properties_to_cpp; } @@ -179,11 +300,16 @@ ov::AnyMap kwargs_to_any_map(const py::kwargs& kwargs) { for (const auto& item : kwargs) { std::string key = py::cast(item.first); py::object value = py::cast(item.second); - if (utils::py_object_is_any_map(value)) { + // we need to unpack only dictionaries, which are passed with "config" name, + // because there are dictionary properties that should not be unpacked + if (utils::py_object_is_any_map(value) && key == "config") { auto map = utils::py_object_to_any_map(value); params.insert(map.begin(), map.end()); } else { - params[key] = utils::py_object_to_any(value); + if (py::isinstance(value)) { + OPENVINO_ASSERT(!py::isinstance(value), "Property \"", key, "\" can't be None."); + } + params[key] = utils::py_object_to_any(value, key); } } @@ -227,60 +353,9 @@ ov::genai::OptionalGenerationConfig update_config_from_kwargs(const ov::genai::O ov::genai::GenerationConfig res_config; if(config.has_value()) res_config = *config; - - for (const auto& item : kwargs) { - std::string key = py::cast(item.first); - py::object value = py::cast(item.second); - - if (item.second.is_none()) { - // Even if argument key name does not fit GenerationConfig name - // it's not an error if it's not defined. - // Some HF configs can have parameters for methods currently unsupported in ov_genai - // but if their values are not set / None, then this should not block - // us from reading such configs, e.g. {"typical_p": None, 'top_p': 1.0,...} - return res_config; - } - if (key == "max_new_tokens") { - res_config.max_new_tokens = py::cast(item.second); - } else if (key == "max_length") { - res_config.max_length = py::cast(item.second); - } else if (key == "ignore_eos") { - res_config.ignore_eos = py::cast(item.second); - } else if (key == "num_beam_groups") { - res_config.num_beam_groups = py::cast(item.second); - } else if (key == "num_beams") { - res_config.num_beams = py::cast(item.second); - } else if (key == "diversity_penalty") { - res_config.diversity_penalty = py::cast(item.second); - } else if (key == "length_penalty") { - res_config.length_penalty = py::cast(item.second); - } else if (key == "num_return_sequences") { - res_config.num_return_sequences = py::cast(item.second); - } else if (key == "no_repeat_ngram_size") { - res_config.no_repeat_ngram_size = py::cast(item.second); - } else if (key == "stop_criteria") { - res_config.stop_criteria = py::cast(item.second); - } else if (key == "temperature") { - res_config.temperature = py::cast(item.second); - } else if (key == "top_p") { - res_config.top_p = py::cast(item.second); - } else if (key == "top_k") { - res_config.top_k = py::cast(item.second); - } else if (key == "do_sample") { - res_config.do_sample = py::cast(item.second); - } else if (key == "repetition_penalty") { - res_config.repetition_penalty = py::cast(item.second); - } else if (key == "eos_token_id") { - res_config.set_eos_token_id(py::cast(item.second)); - } else if (key == "adapters") { - res_config.adapters = py::cast(item.second); - } else { - throw(std::invalid_argument("'" + key + "' is incorrect GenerationConfig parameter name. " - "Use help(openvino_genai.GenerationConfig) to get list of acceptable parameters.")); - } - } - + res_config.update_generation_config(kwargs_to_any_map(kwargs)); return res_config; } + } // namespace ov::genai::pybind::utils diff --git a/src/python/py_utils.hpp b/src/python/py_utils.hpp index 9213060660..20094196a6 100644 --- a/src/python/py_utils.hpp +++ b/src/python/py_utils.hpp @@ -28,7 +28,7 @@ py::list handle_utf8(const std::vector& decoded_res); py::str handle_utf8(const std::string& text); -ov::Any py_object_to_any(const py::object& py_obj); +ov::Any py_object_to_any(const py::object& py_obj, std::string property_name); bool py_object_is_any_map(const py::object& py_obj); diff --git a/src/python/py_vlm_pipeline.cpp b/src/python/py_vlm_pipeline.cpp index 30e2e04a14..9572652204 100644 --- a/src/python/py_vlm_pipeline.cpp +++ b/src/python/py_vlm_pipeline.cpp @@ -72,46 +72,6 @@ py::object call_vlm_generate( return py::cast(pipe.generate(prompt, images, updated_config, streamer)); } -ov::AnyMap vlm_kwargs_to_any_map(const py::kwargs& kwargs, bool allow_compile_properties=true) { - ov::AnyMap params = {}; - - for (const auto& item : kwargs) { - std::string key = py::cast(item.first); - py::object value = py::cast(item.second); - - if (key == "images") { - params.insert({ov::genai::images(std::move(py::cast>(value)))}); - } else if (key == "image") { - params.insert({ov::genai::image(std::move(py::cast(value)))}); - } else if (key == "generation_config") { - params.insert({ov::genai::generation_config(std::move(py::cast(value)))}); - } else if (key == "streamer") { - auto py_streamer = py::cast(value); - params.insert({ov::genai::streamer(std::move(pyutils::pystreamer_to_streamer(py_streamer)))}); - - } - else { - if (allow_compile_properties) { - // convert arbitrary objects to ov::Any - // not supported properties are not checked, as these properties are passed to compile(), which will throw exception in case of unsupported property - if (pyutils::py_object_is_any_map(value)) { - auto map = pyutils::py_object_to_any_map(value); - params.insert(map.begin(), map.end()); - } else { - params[key] = pyutils::py_object_to_any(value); - } - } - else { - // generate doesn't run compile(), so only VLMPipeline specific properties are allowed - throw(std::invalid_argument("'" + key + "' is unexpected parameter name. " - "Use help(openvino_genai.VLMPipeline.generate) to get list of acceptable parameters.")); - } - } - } - - return params; -} - void init_vlm_pipeline(py::module_& m) { py::class_(m, "VLMPipeline", "This class is used for generation with VLMs") .def(py::init([]( @@ -120,7 +80,7 @@ void init_vlm_pipeline(py::module_& m) { const py::kwargs& kwargs ) { ScopedVar env_manager(pyutils::ov_tokenizers_module_path()); - return std::make_unique(models_path, device, vlm_kwargs_to_any_map(kwargs, true)); + return std::make_unique(models_path, device, pyutils::kwargs_to_any_map(kwargs)); }), py::arg("models_path"), "folder with exported model files", py::arg("device"), "device on which inference will be done" @@ -177,7 +137,7 @@ void init_vlm_pipeline(py::module_& m) { const std::string& prompt, const py::kwargs& kwargs ) -> py::typing::Union { - return py::cast(pipe.generate(prompt, vlm_kwargs_to_any_map(kwargs, false))); + return py::cast(pipe.generate(prompt, pyutils::kwargs_to_any_map(kwargs))); }, py::arg("prompt"), "Input string", (vlm_generate_kwargs_docstring + std::string(" \n ")).c_str() diff --git a/src/python/py_whisper_pipeline.cpp b/src/python/py_whisper_pipeline.cpp index 3bf777f739..d34bd5f3b6 100644 --- a/src/python/py_whisper_pipeline.cpp +++ b/src/python/py_whisper_pipeline.cpp @@ -162,60 +162,7 @@ OptionalWhisperGenerationConfig update_whisper_config_from_kwargs(const Optional WhisperGenerationConfig res_config; if (config.has_value()) res_config = *config; - - for (const auto& item : kwargs) { - std::string key = py::cast(item.first); - py::object value = py::cast(item.second); - - if (item.second.is_none()) { - // Even if argument key name does not fit GenerationConfig name - // it's not an error if it's not defined. - // Some HF configs can have parameters for methods currently unsupported in ov_genai - // but if their values are not set / None, then this should not block - // us from reading such configs, e.g. {"typical_p": None, 'top_p': 1.0,...} - return res_config; - } - - if (key == "max_new_tokens") { - res_config.max_new_tokens = py::cast(item.second); - } else if (key == "max_length") { - res_config.max_length = py::cast(item.second); - } else if (key == "decoder_start_token_id") { - res_config.decoder_start_token_id = py::cast(item.second); - } else if (key == "pad_token_id") { - res_config.pad_token_id = py::cast(item.second); - } else if (key == "translate_token_id") { - res_config.translate_token_id = py::cast(item.second); - } else if (key == "transcribe_token_id") { - res_config.transcribe_token_id = py::cast(item.second); - } else if (key == "no_timestamps_token_id") { - res_config.no_timestamps_token_id = py::cast(item.second); - } else if (key == "max_initial_timestamp_index") { - res_config.max_initial_timestamp_index = py::cast(item.second); - } else if (key == "begin_suppress_tokens") { - res_config.begin_suppress_tokens = py::cast>(item.second); - } else if (key == "suppress_tokens") { - res_config.suppress_tokens = py::cast>(item.second); - } else if (key == "is_multilingual") { - res_config.is_multilingual = py::cast(item.second); - } else if (key == "language") { - res_config.language = py::cast(item.second); - } else if (key == "lang_to_id") { - res_config.lang_to_id = py::cast>(item.second); - } else if (key == "task") { - res_config.task = py::cast(item.second); - } else if (key == "return_timestamps") { - res_config.return_timestamps = py::cast(item.second); - } else if (key == "eos_token_id") { - res_config.set_eos_token_id(py::cast(item.second)); - } else { - throw(std::invalid_argument( - "'" + key + - "' is incorrect WhisperGenerationConfig parameter name. " - "Use help(openvino_genai.WhisperGenerationConfig) to get list of acceptable parameters.")); - } - } - + res_config.update_generation_config(pyutils::kwargs_to_any_map(kwargs)); return res_config; } diff --git a/tests/cpp/utils.cpp b/tests/cpp/utils.cpp new file mode 100644 index 0000000000..d00edae6fb --- /dev/null +++ b/tests/cpp/utils.cpp @@ -0,0 +1,21 @@ +// Copyright (C) 2018-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include +#include "utils.hpp" + + +using namespace ov::genai::utils; +using map_type = std::map; + +TEST(TestIsContainer, test_is_container) { + EXPECT_EQ(is_container, false); + EXPECT_EQ(is_container, false); + EXPECT_EQ(is_container, false); + EXPECT_EQ(is_container, false); + EXPECT_EQ(is_container, true); + EXPECT_EQ(is_container>, true); + EXPECT_EQ(is_container, true); + EXPECT_EQ(is_container>, true); +} \ No newline at end of file diff --git a/tests/python_tests/test_generate_api.py b/tests/python_tests/test_generate_api.py index ba934e3bda..80df79f31b 100644 --- a/tests/python_tests/test_generate_api.py +++ b/tests/python_tests/test_generate_api.py @@ -38,7 +38,7 @@ def run_hf_ov_genai_comparison_batched(model_descr, generation_config: Dict, pro # Need to set explicitly to False, but only if test arguments omitted this arg. # Do not apply 'repetition_penalty' if sampling is not used. config['do_sample'] = False - config['repetition_penalty'] = None + config['repetition_penalty'] = 1.0 # 1.0 means no penalty generation_config_hf = config.copy() if generation_config_hf.get('stop_criteria'): @@ -78,7 +78,7 @@ def run_hf_ov_genai_comparison(model_descr, generation_config: Dict, prompt: str # Need to set explicitly to False, but only if test arguments omitted this arg. # Do not apply 'repetition_penalty' if sampling is not used. config['do_sample'] = False - config['repetition_penalty'] = None + config['repetition_penalty'] = 1.0 # 1.0 means no penalty generation_config_hf = config.copy() if generation_config_hf.get('stop_criteria'): @@ -117,7 +117,7 @@ def hf_ov_genai_tensors_comparison( # Need to set explicitly to False, but only if test arguments omitted this arg. # Do not apply 'repetition_penalty' if sampling is not used. config['do_sample'] = False - config['repetition_penalty'] = None + config['repetition_penalty'] = 1.0 # 1.0 means no penalty generation_config_hf = config.copy() if generation_config_hf.get('stop_criteria'): @@ -635,7 +635,8 @@ def test_valid_configs(model_tmp_path): invalid_py_configs = [ dict(num_beam_groups=3, num_beams=15, do_sample=True), - dict(unexisting_key_name=True), # no eos_token_id no max_new_tokens, no max_len + # TODO: Currently unexpected params do not cause exceptions. Need to implement it in c++ and return this test + # dict(unexisting_key_name=True), # no eos_token_id no max_new_tokens, no max_len dict(eos_token_id=42, ignore_eos=True), # no max_new_tokens, no max_len with ignore_eos dict(repetition_penalty=-1.0, eos_token_id=42, max_new_tokens=20), # invalid penalty dict(temperature=-1.0, do_sample=True, eos_token_id=42, max_new_tokens=20), # invalid temp @@ -763,7 +764,7 @@ def run_perf_metrics_collection(model_descr, generation_config: Dict, prompt: st # Need to set explicitly to False, but only if test arguments omitted this arg. # Do not apply 'repetition_penalty' if sampling is not used. config['do_sample'] = False - config['repetition_penalty'] = None + config['repetition_penalty'] = 1.0 # 1.0 means no penalty return pipe.generate([prompt], **config).perf_metrics diff --git a/tests/python_tests/test_sampling.py b/tests/python_tests/test_sampling.py index 9973e20e1d..9aa6931d85 100644 --- a/tests/python_tests/test_sampling.py +++ b/tests/python_tests/test_sampling.py @@ -334,7 +334,7 @@ def test_echo_without_completion(tmp_path, get_generation_config, max_num_batche model_path : Path = tmp_path / model_id save_ov_model_from_optimum(model, hf_tokenizer, model_path) - pipe = ContinuousBatchingPipeline(model_path.absolute().as_posix(), Tokenizer(model_path.absolute().as_posix(), {}), scheduler_config, "CPU", {}) + pipe = ContinuousBatchingPipeline(model_path.absolute().as_posix(), Tokenizer(model_path.absolute().as_posix()), scheduler_config, "CPU", {}) outputs = pipe.generate(["What is OpenVINO?"], generation_configs) assert(len(outputs)) @@ -361,7 +361,7 @@ def test_echo_with_completion(tmp_path, get_generation_config, max_num_batched_t model_path : Path = tmp_path / model_id save_ov_model_from_optimum(model, hf_tokenizer, model_path) - pipe = ContinuousBatchingPipeline(model_path.absolute().as_posix(), Tokenizer(model_path.absolute().as_posix(), {}), scheduler_config, "CPU", {}) + pipe = ContinuousBatchingPipeline(model_path.absolute().as_posix(), Tokenizer(model_path.absolute().as_posix()), scheduler_config, "CPU", {}) outputs = pipe.generate(["What is OpenVINO?"], generation_configs) assert(len(outputs)) @@ -389,7 +389,7 @@ def test_post_oom_health(tmp_path, sampling_config): models_path : Path = tmp_path / model_id save_ov_model_from_optimum(model, hf_tokenizer, models_path) - pipe = ContinuousBatchingPipeline(models_path.absolute().as_posix(), Tokenizer(models_path.absolute().as_posix(), {}), scheduler_config, "CPU", {}) + pipe = ContinuousBatchingPipeline(models_path.absolute().as_posix(), Tokenizer(models_path.absolute().as_posix()), scheduler_config, "CPU", {}) # First run should return incomplete response output = pipe.generate(["What is OpenVINO?"], generation_configs) assert (len(output)) From 89865c3e3856abec5fe6b7896a5e42cb81f5ff75 Mon Sep 17 00:00:00 2001 From: Helena Kloosterman Date: Thu, 21 Nov 2024 07:28:53 +0100 Subject: [PATCH 03/24] Update Python VLM example in README (#1178) Existing example uses an undefined "read_image" function, and using max_new_tokens in pipe.generate() resulted in an error with latest nightly. I updated the example to work out of the box. Makes it a bit longer, but this section is hidden by default in the README, so it doesn't add to visual clutter for people just visiting the repo. Also added links to the relevant samples. --------- Co-authored-by: Vladimir Zlobin --- README.md | 25 +++++++++++++++++++++---- 1 file changed, 21 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index fe18205028..c1217a0215 100644 --- a/README.md +++ b/README.md @@ -117,17 +117,34 @@ optimum-cli export openvino --model openbmb/MiniCPM-V-2_6 --trust-remote-code -- ### Run generation using VLMPipeline API in Python +See [Visual Language Chat](https://github.com/openvinotoolkit/openvino.genai/tree/master/samples/python/visual_language_chat) for a demo application. + +Run the following command to download a sample image: + +```sh +curl -O "https://storage.openvinotoolkit.org/test_data/images/dog.jpg" +``` + ```python +import numpy as np +import openvino as ov import openvino_genai as ov_genai -#Will run model on CPU, GPU is a possible option +from PIL import Image + +# Choose GPU instead of CPU in the line below to run the model on Intel integrated or discrete GPU pipe = ov_genai.VLMPipeline("./MiniCPM-V-2_6/", "CPU") -rgb = read_image("cat.jpg") -print(pipe.generate(prompt, image=rgb, max_new_tokens=100)) + +image = Image.open("dog.jpg") +image_data = np.array(image.getdata()).reshape(1, image.size[1], image.size[0], 3).astype(np.uint8) +image_data = ov.Tensor(image_data) + +prompt = "Can you describe the image?" +print(pipe.generate(prompt, image=image_data, max_new_tokens=100)) ``` ### Run generation using VLMPipeline in C++ -Code below requires installation of C++ compatible package (see [here](https://docs.openvino.ai/2024/get-started/install-openvino/install-openvino-genai.html#archive-installation) for more details) +Code below requires installation of C++ compatible package (see [here](https://docs.openvino.ai/2024/get-started/install-openvino/install-openvino-genai.html#archive-installation) for more details). See [Visual Language Chat](https://github.com/openvinotoolkit/openvino.genai/tree/master/samples/cpp/visual_language_chat) for a demo application. ```cpp #include "load_image.hpp" From 799454f5731518e795193721a77b44c95b45fb0f Mon Sep 17 00:00:00 2001 From: Vladimir Zlobin Date: Thu, 21 Nov 2024 13:59:58 +0400 Subject: [PATCH 04/24] Install deployment and export requirements.txt (#1231) (#1241) Ticket 157649 Co-authored-by: Ilya Lavrenov --- samples/CMakeLists.txt | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/samples/CMakeLists.txt b/samples/CMakeLists.txt index 229eccb3fe..860ced072b 100644 --- a/samples/CMakeLists.txt +++ b/samples/CMakeLists.txt @@ -14,8 +14,12 @@ add_subdirectory(cpp/text2image) add_subdirectory(cpp/visual_language_chat) add_subdirectory(cpp/whisper_speech_recognition) -install(FILES requirements.txt DESTINATION samples - COMPONENT cpp_samples_genai) +install(FILES + deployment-requirements.txt + export-requirements.txt + requirements.txt + DESTINATION samples + COMPONENT cpp_samples_genai) install(DIRECTORY cpp/beam_search_causal_lm From 5d5fe7512398778681e0e2d2f5325e9c7995a7d0 Mon Sep 17 00:00:00 2001 From: Vladimir Zlobin Date: Thu, 21 Nov 2024 18:08:22 +0400 Subject: [PATCH 05/24] Allow missing OpenVINODeveloperPackage (#1243) Compiliung GenAI against ov archives prints Warning: Please, install pybind11-stubgen==2.5.1 otherwise --- src/python/CMakeLists.txt | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/python/CMakeLists.txt b/src/python/CMakeLists.txt index 898e18b895..25d81277d6 100644 --- a/src/python/CMakeLists.txt +++ b/src/python/CMakeLists.txt @@ -182,12 +182,14 @@ if(pybind11_stubgen_AVAILABLE) VERBATIM) add_custom_target(${TARGET_NAME}_stub ALL DEPENDS ${output_file}) -else() +elseif(OpenVINODeveloperPackage_FOUND) # Produce warning message at build time as well add_custom_command(OUTPUT pybind11_stub_gen_not_found.txt COMMAND ${CMAKE_COMMAND} -E cmake_echo_color --red "Warning: Please, install ${pybind11_stubgen_dep}") add_custom_target(${TARGET_NAME}_stub ALL DEPENDS pybind11_stub_gen_not_found.txt) +else() + add_custom_target(${TARGET_NAME}_stub ALL) endif() add_dependencies(${TARGET_NAME}_stub ${TARGET_NAME}) From ac7d39ffe66b04a52df69ad7950b4d7963d7f681 Mon Sep 17 00:00:00 2001 From: Pavel Esir Date: Thu, 21 Nov 2024 22:08:05 +0100 Subject: [PATCH 06/24] parametrize decode in Tokenizers --- src/cpp/include/openvino/genai/tokenizer.hpp | 45 +++++++++- .../src/make_combine_segments_stateful.cpp | 44 ++++++++++ .../src/make_combine_segments_stateful.hpp | 37 +++++++++ src/cpp/src/tokenizer.cpp | 82 +++++++++++-------- src/python/py_tokenizer.cpp | 24 ++++-- tests/python_tests/test_chat_generate_api.py | 21 +++++ 6 files changed, 208 insertions(+), 45 deletions(-) diff --git a/src/cpp/include/openvino/genai/tokenizer.hpp b/src/cpp/include/openvino/genai/tokenizer.hpp index bcb8da68a3..e90e9c80de 100644 --- a/src/cpp/include/openvino/genai/tokenizer.hpp +++ b/src/cpp/include/openvino/genai/tokenizer.hpp @@ -87,23 +87,59 @@ class OPENVINO_GENAI_EXPORTS Tokenizer { /** * @brief decode sequence of tokens * @param tokens vector storing tokens + * @param tokenization_params AnyMap with detokenization parameters, e.g. {'skip_special_tokens', false} * @return sequence string */ - std::string decode(std::vector tokens); - + std::string decode(std::vector tokens, const ov::AnyMap& detokenization_params = {}); + + /** + * @brief decode sequence of tokens + * @param tokens vector storing tokens + * @param tokenization_params detokenization parameters, e.g. ov::genai::skip_special_tokens(true) + * @return sequence string + */ + template + util::EnableIfAllStringAny decode(std::vector& tokens, Properties&&... properties) { + return decode(tokens, AnyMap{std::forward(properties)...}); + } + /** * @brief decode tokens. * @param tokens ov::Tensor with tokens with shape [batch_size, seq_len] + * @param tokenization_params AnyMap with detokenization parameters, e.g. {'skip_special_tokens', false} + * @return vector of std::string, with size = batch_size + */ + std::vector decode(ov::Tensor tokens, const ov::AnyMap& detokenization_params = {}); + + /** + * @brief decode sequence of tokens + * @param tokens ov::Tensor with tokens with shape [batch_size, seq_len] + * @param tokenization_params detokenization parameters, e.g. ov::genai::skip_special_tokens(true) * @return vector of std::string, with size = batch_size */ - std::vector decode(ov::Tensor tokens); + template + util::EnableIfAllStringAny, Properties...> decode(ov::Tensor tokens, Properties&&... properties) { + return decode(tokens, AnyMap{std::forward(properties)...}); + } /** * @brief batched decoding of tokens. * @param tokens vector of vectors with tokens, tokens.size() is equal to batch_size + * @param tokenization_params AnyMap with detokenization parameters, e.g. {'skip_special_tokens', false} * @return vector of std::string, with size equal to batch_size */ - std::vector decode(std::vector> tokens); + std::vector decode(std::vector> tokens, const ov::AnyMap& detokenization_params = {}); + + /** + * @brief decode sequence of tokens + * @param tokens ov::Tensor with tokens with shape [batch_size, seq_len] + * @param tokenization_params detokenization parameters, e.g. ov::genai::skip_special_tokens(true) + * @return vector of std::string, with size = batch_size + */ + template + util::EnableIfAllStringAny, Properties...> decode(std::vector> tokens, Properties&&... properties) { + return decode(tokens, AnyMap{std::forward(properties)...}); + } /** * @brief Embeds input prompts with special tags for a chat scenario. @@ -143,6 +179,7 @@ class OPENVINO_GENAI_EXPORTS Tokenizer { }; static constexpr ov::Property add_special_tokens{"add_special_tokens"}; +static constexpr ov::Property skip_special_tokens{"skip_special_tokens"}; } // namespace genai } // namespace ov diff --git a/src/cpp/src/make_combine_segments_stateful.cpp b/src/cpp/src/make_combine_segments_stateful.cpp index 2285c172dc..26c58b8fca 100644 --- a/src/cpp/src/make_combine_segments_stateful.cpp +++ b/src/cpp/src/make_combine_segments_stateful.cpp @@ -4,6 +4,8 @@ #include "make_combine_segments_stateful.hpp" #include "openvino/op/constant.hpp" #include "openvino/op/select.hpp" +#include "openvino/op/slice.hpp" +#include "openvino/op/multiply.hpp" #include "openvino/op/read_value.hpp" #include "openvino/op/assign.hpp" @@ -44,3 +46,45 @@ bool ov::genai::MakeCombineSegmentsSatateful::run_on_model(const std::shared_ptr model->add_variables({variable}); return true; } + +bool ov::genai::MakeVocabDecoderSatateful::run_on_model(const std::shared_ptr& model) { + + std::shared_ptr vocab_decoder_node; + for (auto node: model->get_ordered_ops()) { + if (strcmp(node->get_type_info().name, "VocabDecoder") == 0) { + vocab_decoder_node = node; + } + } + auto val = vocab_decoder_node->input_value(4); + auto val_type = vocab_decoder_node->input_value(4).get_element_type(); + + if (!vocab_decoder_node || !vocab_decoder_node->input_value(4).get_element_type().is_integral_number()) { + return false; + } + + std::shared_ptr skip_tokens_const = std::dynamic_pointer_cast(vocab_decoder_node->get_input_node_shared_ptr(4)); + if (!skip_tokens_const) { + return false; + } + + + auto start_const = std::make_shared(ov::element::i32, ov::Shape{1}, std::vector{0}); + auto int_max_const = std::make_shared(ov::element::i32, ov::Shape{1}, std::vector{std::numeric_limits::max()}); + auto one_const = std::make_shared(ov::element::i32, ov::Shape{1}, std::vector{1}); + + // By default, INT_MAX will multiply with 1 and all skip_tokens will be selected. + op::util::VariableInfo var_info{ov::Shape{1}, ov::element::i32, SKIP_SPECIAL_TOKENS_VAR_ID}; + auto variable = std::make_shared(var_info); + auto read_value = std::make_shared(one_const, variable); + // if flag is set, then slice up to the int_max which means skip all tokens. + auto stop = std::make_shared(int_max_const, read_value); + + std::shared_ptr slice_node = std::make_shared(skip_tokens_const, start_const, stop, one_const); + + vocab_decoder_node->input(4).replace_source_output(slice_node->output(0)); + + auto assign = std::make_shared(read_value, variable); + model->add_sinks({assign}); + model->add_variables({variable}); + return true; +} \ No newline at end of file diff --git a/src/cpp/src/make_combine_segments_stateful.hpp b/src/cpp/src/make_combine_segments_stateful.hpp index 6365497140..307c6199c8 100644 --- a/src/cpp/src/make_combine_segments_stateful.hpp +++ b/src/cpp/src/make_combine_segments_stateful.hpp @@ -38,7 +38,44 @@ class MakeCombineSegmentsSatateful : public ov::pass::ModelPass { bool run_on_model(const std::shared_ptr& model) override; }; +/** + * @brief This pass modifies tokenizer ov::Model so that special tokens adding will be + * enabled or disabled depending on stateful value. + * + * +--------------+ + * | DefaultMode | + * +--------------+ + * | + * v + * +------------+ +-----------+ + * | ReadValue | | INT_MAX | + * +------------+ +-----------+ + * \ / + * \ / + * v v + * +--------------------+ +---------+ +---------+ + * | Const with tokens | | start | | Mul | + * +--------------------+ +---------+ +---------+ + * \ | / + * \ | / + * v v v + * +-----------------+ + * | Slice | + * +-----------------+ + * | + * v + * +----------------------+ + * | VocabDecoder | + * +----------------------+ +**/ +class MakeVocabDecoderSatateful : public ov::pass::ModelPass { +public: + OPENVINO_RTTI("MakeVocabDecoderSatateful", "0"); + bool run_on_model(const std::shared_ptr& model) override; +}; + const std::string ADD_SPECIAL_TOKENS_VAR_ID = "add_special_tokens"; +const std::string SKIP_SPECIAL_TOKENS_VAR_ID = "skip_special_tokens"; } // namespace genai } // namespace ov diff --git a/src/cpp/src/tokenizer.cpp b/src/cpp/src/tokenizer.cpp index f52417a94e..78b94915dd 100644 --- a/src/cpp/src/tokenizer.cpp +++ b/src/cpp/src/tokenizer.cpp @@ -10,6 +10,7 @@ #include #include +#include "openvino/pass/visualize_tree.hpp" #include "openvino/pass/manager.hpp" #include "openvino/runtime/core.hpp" #include "openvino/genai/tokenizer.hpp" @@ -73,7 +74,8 @@ class Tokenizer::TokenizerImpl { std::unique_ptr> m_ireq_queue_detokenizer; // To change the adding special tokens mode we use a statefull subgraph, // this flag holds the current state value of the CompiledModel. - bool m_add_special_tokens = true; + bool m_add_special_tokens = true; + bool m_skip_special_tokens = false; bool m_older_than_24_5 = false; int64_t m_pad_token_id = -1; @@ -86,11 +88,16 @@ class Tokenizer::TokenizerImpl { std::string m_chat_template = {}; - void set_state_if_necessary(CircularBufferQueueElementGuard& infer_request_guard, bool add_special_tokens) { + void set_state_if_necessary(CircularBufferQueueElementGuard& infer_request_guard, const ov::AnyMap& params) { + bool add_special_tokens_flag = true; + bool skip_special_tokens_flag = false; + ov::genai::utils::read_anymap_param(params, add_special_tokens.name(), add_special_tokens_flag); + ov::genai::utils::read_anymap_param(params, skip_special_tokens.name(), skip_special_tokens_flag); + // If user requested add_special_tokens mode different from the current one, // need to set state variable. // If requested mode matches the stored state set, then don't touch states. - if (add_special_tokens == m_add_special_tokens) { + if (add_special_tokens_flag == m_add_special_tokens && skip_special_tokens_flag == m_skip_special_tokens) { return; } if (m_older_than_24_5) { @@ -100,19 +107,23 @@ class Tokenizer::TokenizerImpl { return; } - // auto states = m_ireq_queue_tokenizer->get(0).query_state(); + // add_special_tokens is managed by Select op with a bool input. ov::Tensor add_special_tensor = ov::Tensor(ov::element::boolean, {}); - *add_special_tensor.data() = add_special_tokens; + *add_special_tensor.data() = add_special_tokens_flag; + + // skip_special_tokens is managed by multiplication with a number, therefore i32. + ov::Tensor skip_special_tensor = ov::Tensor(ov::element::i32, {1}); + *skip_special_tensor.data() = skip_special_tokens_flag; for (auto& state: infer_request_guard.get().query_state()) { - if (state.get_name().find(ov::genai::ADD_SPECIAL_TOKENS_VAR_ID) == std::string::npos) { - // It's not add_special_tokens flag state. - continue; + if (state.get_name().find(ov::genai::ADD_SPECIAL_TOKENS_VAR_ID) != std::string::npos) { + state.set_state(add_special_tensor); + } else if (state.get_name().find(ov::genai::SKIP_SPECIAL_TOKENS_VAR_ID) != std::string::npos) { + state.set_state(skip_special_tensor); } - state.set_state(add_special_tensor); - break; } - m_add_special_tokens = add_special_tokens; + m_add_special_tokens = add_special_tokens_flag; + m_skip_special_tokens = skip_special_tokens_flag; } TokenizerImpl() = default; @@ -135,15 +146,25 @@ class Tokenizer::TokenizerImpl { auto device = "CPU"; // currently openvino_tokenizer supports only CPU auto ov_tokenizer = core.read_model(tokenizer_path / "openvino_tokenizer.xml"); + std::shared_ptr ov_detokenizer; + if (std::filesystem::exists(tokenizer_path / "openvino_detokenizer.xml")) { + ov_detokenizer = core.read_model(tokenizer_path / "openvino_detokenizer.xml"); + } m_older_than_24_5 = ov_tokenizer->get_rt_info().count("openvino_tokenizers_version") != 1; - ov::pass::Manager manager; - manager.register_pass(); - manager.run_passes(ov_tokenizer); + ov::pass::Manager manager_tok; + manager_tok.register_pass(); + manager_tok.run_passes(ov_tokenizer); + + ov::pass::Manager manager_detok; + manager_detok.register_pass("before.svg"); + manager_detok.register_pass(); + manager_detok.register_pass("after.svg"); + manager_detok.run_passes(ov_detokenizer); m_tokenizer = core.compile_model(ov_tokenizer, device, properties); if (std::filesystem::exists(tokenizer_path / "openvino_detokenizer.xml")) { - m_detokenizer = core.compile_model(tokenizer_path / "openvino_detokenizer.xml", device, properties); + m_detokenizer = core.compile_model(ov_detokenizer, device, properties); } @@ -298,11 +319,8 @@ class Tokenizer::TokenizerImpl { } TokenizedInputs encode(std::string prompt, const ov::AnyMap& tokenization_params = {}) { - bool add_special_tokens_flag = true; - ov::genai::utils::read_anymap_param(tokenization_params, add_special_tokens.name(), add_special_tokens_flag); - CircularBufferQueueElementGuard infer_request_guard(this->m_ireq_queue_tokenizer.get()); - set_state_if_necessary(infer_request_guard, add_special_tokens_flag); + set_state_if_necessary(infer_request_guard, tokenization_params); size_t batch_size = 1; infer_request_guard.get().set_input_tensor(ov::Tensor{ov::element::string, {batch_size}, &prompt}); infer_request_guard.get().start_async(); @@ -316,11 +334,8 @@ class Tokenizer::TokenizerImpl { TokenizedInputs encode(std::vector& prompts, const ov::AnyMap& tokenization_params = {}) { TokenizedInputs unpadded; { - bool add_special_tokens_flag = true; - ov::genai::utils::read_anymap_param(tokenization_params, add_special_tokens.name(), add_special_tokens_flag); - CircularBufferQueueElementGuard infer_request_guard(this->m_ireq_queue_tokenizer.get()); - set_state_if_necessary(infer_request_guard, add_special_tokens_flag); + set_state_if_necessary(infer_request_guard, tokenization_params); infer_request_guard.get().set_input_tensor(ov::Tensor{ov::element::string, {prompts.size()}, prompts.data()}); auto size_ = infer_request_guard.get().get_input_tensor().get_shape(); infer_request_guard.get().start_async(); @@ -343,10 +358,11 @@ class Tokenizer::TokenizerImpl { return {input_ids_, attention_mask_}; } - std::string decode(std::vector tokens) { + std::string decode(std::vector tokens, const ov::AnyMap& detokenization_params = {}) { OPENVINO_ASSERT(m_detokenizer, "Detokenize model has not been provided. Tokenizer::decode is not available"); CircularBufferQueueElementGuard infer_request_guard(this->m_ireq_queue_detokenizer.get()); + set_state_if_necessary(infer_request_guard, detokenization_params); size_t batch_size = 1; infer_request_guard.get().set_input_tensor(ov::Tensor{ov::element::i64, {batch_size, tokens.size()}, tokens.data()}); infer_request_guard.get().start_async(); @@ -354,12 +370,13 @@ class Tokenizer::TokenizerImpl { return infer_request_guard.get().get_output_tensor().data()[0]; } - std::vector decode(ov::Tensor tokens) { + std::vector decode(ov::Tensor tokens, const ov::AnyMap& detokenization_params = {}) { OPENVINO_ASSERT(m_detokenizer, "Detokenize model has not been provided. Tokenizer::decode is not available"); OPENVINO_ASSERT(tokens.get_element_type() == ov::element::i64, "tokens tensor element type should be an i64"); OPENVINO_ASSERT(tokens.get_shape().size() == 2, "tokens tensor should of rank 2 with shape [batch_size, seq_len]"); CircularBufferQueueElementGuard infer_request_guard(this->m_ireq_queue_detokenizer.get()); + set_state_if_necessary(infer_request_guard, detokenization_params); infer_request_guard.get().set_input_tensor(tokens); infer_request_guard.get().start_async(); infer_request_guard.get().wait(); @@ -369,7 +386,7 @@ class Tokenizer::TokenizerImpl { return std::vector(res_data, res_data + res.get_shape()[0]); } - std::vector decode(std::vector> lines) { + std::vector decode(std::vector> lines, const ov::AnyMap& detokenization_params = {}) { OPENVINO_ASSERT(m_detokenizer, "Detokenize model has not been provided. Tokenizer::decode is not available"); auto compare_lengths = [](const std::vector& a, const std::vector& b) { @@ -388,6 +405,7 @@ class Tokenizer::TokenizerImpl { } CircularBufferQueueElementGuard infer_request_guard(this->m_ireq_queue_detokenizer.get()); + set_state_if_necessary(infer_request_guard, detokenization_params); infer_request_guard.get().set_input_tensor(tokens); infer_request_guard.get().start_async(); infer_request_guard.get().wait(); @@ -517,16 +535,16 @@ TokenizedInputs Tokenizer::encode(std::initializer_list& text, cons return encode(std::vector(text.begin(), text.end()), tokenization_params); } -std::string Tokenizer::decode(std::vector tokens) { - return m_pimpl->decode(tokens); +std::string Tokenizer::decode(std::vector tokens, const ov::AnyMap& detokenization_params) { + return m_pimpl->decode(tokens, detokenization_params); } -std::vector Tokenizer::decode(ov::Tensor tokens) { - return m_pimpl->decode(tokens); +std::vector Tokenizer::decode(ov::Tensor tokens, const ov::AnyMap& detokenization_params) { + return m_pimpl->decode(tokens, detokenization_params); } -std::vector Tokenizer::decode(std::vector> lines) { - return m_pimpl->decode(lines); +std::vector Tokenizer::decode(std::vector> lines, const ov::AnyMap& detokenization_params) { + return m_pimpl->decode(lines, detokenization_params); } int64_t Tokenizer::get_bos_token_id() const { diff --git a/src/python/py_tokenizer.cpp b/src/python/py_tokenizer.cpp index 2ccccff4c0..db4643a65c 100644 --- a/src/python/py_tokenizer.cpp +++ b/src/python/py_tokenizer.cpp @@ -63,27 +63,33 @@ void init_tokenizer(py::module_& m) { .def( "decode", - [](Tokenizer& tok, std::vector& tokens) -> py::str { - return pyutils::handle_utf8(tok.decode(tokens)); + [](Tokenizer& tok, std::vector& tokens, bool skip_special_tokens) -> py::str { + ov::AnyMap detokenization_params; + detokenization_params[ov::genai::skip_special_tokens.name()] = skip_special_tokens; + return pyutils::handle_utf8(tok.decode(tokens, detokenization_params)); }, - py::arg("tokens"), + py::arg("tokens"), py::arg("skip_special_tokens") = true, R"(Decode a sequence into a string prompt.)" ) .def( "decode", - [](Tokenizer& tok, ov::Tensor& tokens) -> py::typing::List { - return pyutils::handle_utf8(tok.decode(tokens)); + [](Tokenizer& tok, ov::Tensor& tokens, bool skip_special_tokens) -> py::typing::List { + ov::AnyMap detokenization_params; + detokenization_params[ov::genai::skip_special_tokens.name()] = skip_special_tokens; + return pyutils::handle_utf8(tok.decode(tokens, detokenization_params)); }, - py::arg("tokens"), + py::arg("tokens"), py::arg("skip_special_tokens") = true, R"(Decode tensor into a list of string prompts.)") .def( "decode", - [](Tokenizer& tok, std::vector>& tokens) -> py::typing::List { - return pyutils::handle_utf8(tok.decode(tokens)); + [](Tokenizer& tok, std::vector>& tokens, bool skip_special_tokens) -> py::typing::List { + ov::AnyMap detokenization_params; + detokenization_params[ov::genai::skip_special_tokens.name()] = skip_special_tokens; + return pyutils::handle_utf8(tok.decode(tokens, detokenization_params)); }, - py::arg("tokens"), + py::arg("tokens"), py::arg("skip_special_tokens") = true, R"(Decode a batch of tokens into a list of string prompt.)") .def("apply_chat_template", [](Tokenizer& tok, diff --git a/tests/python_tests/test_chat_generate_api.py b/tests/python_tests/test_chat_generate_api.py index 25d0798994..a87a2c7555 100644 --- a/tests/python_tests/test_chat_generate_api.py +++ b/tests/python_tests/test_chat_generate_api.py @@ -217,3 +217,24 @@ def test_add_special_tokens(add_special_tokens, prompt): res_genai = genai_tokenzier.encode(prompt, add_special_tokens).input_ids.data res_hf = hf_tokenizer(prompt, return_tensors="np", add_special_tokens=add_special_tokens)["input_ids"] assert np.all(res_genai == res_hf) + +@pytest.mark.precommit +@pytest.mark.nightly +@pytest.mark.parametrize("add_special_tokens", [True, False]) +@pytest.mark.parametrize("skip_special_tokens", [True, False]) +@pytest.mark.parametrize("prompt", prompts) +def test_add_special_tokens(add_special_tokens, skip_special_tokens, prompt): + import numpy as np + model_descr = get_chat_models_list()[0] + model_id, path, hf_tokenizer, model_opt, pipe = read_model((model_descr[0], model_descr[1] / '_test_chat')) + genai_tokenizer = pipe.get_tokenizer() + + # Calling encode with add_special_tokens will set state flag. + res_genai = genai_tokenizer.encode(prompt, add_special_tokens).input_ids.data + res_hf = hf_tokenizer(prompt, return_tensors="np", add_special_tokens=add_special_tokens)["input_ids"] + assert np.all(res_genai == res_hf) + + # Decode with skip_special_tokens + decoded_genai = genai_tokenizer.decode(res_genai, skip_special_tokens=skip_special_tokens) + decoded_hf = hf_tokenizer.decode(res_hf[0], skip_special_tokens=skip_special_tokens) + assert decoded_genai == decoded_hf From e46466d94124aa73daa34a91cf94a7e0ce4e1265 Mon Sep 17 00:00:00 2001 From: Pavel Esir Date: Thu, 21 Nov 2024 22:10:47 +0100 Subject: [PATCH 07/24] rename pass --- .github/labeler.yml | 4 ++-- ...ine_segments_stateful.cpp => make_tokenizer_stateful.cpp} | 2 +- ...ine_segments_stateful.hpp => make_tokenizer_stateful.hpp} | 0 src/cpp/src/tokenizer.cpp | 5 +---- 4 files changed, 4 insertions(+), 7 deletions(-) rename src/cpp/src/{make_combine_segments_stateful.cpp => make_tokenizer_stateful.cpp} (98%) rename src/cpp/src/{make_combine_segments_stateful.hpp => make_tokenizer_stateful.hpp} (100%) diff --git a/.github/labeler.yml b/.github/labeler.yml index c5d0db312c..c162f6aff4 100644 --- a/.github/labeler.yml +++ b/.github/labeler.yml @@ -8,8 +8,8 @@ - 'src/cpp/src/tokenizers_path.hpp' - 'src/cpp/src/circular_buffer_queue.hpp' - 'src/cpp/src/synchronized_queue.hpp' -- 'src/cpp/src/make_combine_segments_stateful.cpp' -- 'src/cpp/src/make_combine_segments_stateful.hpp' +- 'src/cpp/src/make_tokenizer_stateful.cpp' +- 'src/cpp/src/make_tokenizer_stateful.hpp' - 'src/python/py_tokenizer.cpp' - 'thirdparty/openvino_tokenizers' - 'tests/python_tests/tokenizer_configs.py' diff --git a/src/cpp/src/make_combine_segments_stateful.cpp b/src/cpp/src/make_tokenizer_stateful.cpp similarity index 98% rename from src/cpp/src/make_combine_segments_stateful.cpp rename to src/cpp/src/make_tokenizer_stateful.cpp index 26c58b8fca..538a935e56 100644 --- a/src/cpp/src/make_combine_segments_stateful.cpp +++ b/src/cpp/src/make_tokenizer_stateful.cpp @@ -1,7 +1,7 @@ // Copyright (C) 2023-2024 Intel Corporation // SPDX-License-Identifier: Apache-2.0 -#include "make_combine_segments_stateful.hpp" +#include "make_tokenizer_stateful.hpp" #include "openvino/op/constant.hpp" #include "openvino/op/select.hpp" #include "openvino/op/slice.hpp" diff --git a/src/cpp/src/make_combine_segments_stateful.hpp b/src/cpp/src/make_tokenizer_stateful.hpp similarity index 100% rename from src/cpp/src/make_combine_segments_stateful.hpp rename to src/cpp/src/make_tokenizer_stateful.hpp diff --git a/src/cpp/src/tokenizer.cpp b/src/cpp/src/tokenizer.cpp index 78b94915dd..fc6ba75d90 100644 --- a/src/cpp/src/tokenizer.cpp +++ b/src/cpp/src/tokenizer.cpp @@ -10,12 +10,11 @@ #include #include -#include "openvino/pass/visualize_tree.hpp" #include "openvino/pass/manager.hpp" #include "openvino/runtime/core.hpp" #include "openvino/genai/tokenizer.hpp" -#include "make_combine_segments_stateful.hpp" +#include "make_tokenizer_stateful.hpp" #include "tokenizers_path.hpp" #include "circular_buffer_queue.hpp" #include "json_utils.hpp" @@ -157,9 +156,7 @@ class Tokenizer::TokenizerImpl { manager_tok.run_passes(ov_tokenizer); ov::pass::Manager manager_detok; - manager_detok.register_pass("before.svg"); manager_detok.register_pass(); - manager_detok.register_pass("after.svg"); manager_detok.run_passes(ov_detokenizer); m_tokenizer = core.compile_model(ov_tokenizer, device, properties); From ff8846ae599bc2a05b3173c0dd05a027a376e32c Mon Sep 17 00:00:00 2001 From: guozhong wang Date: Fri, 22 Nov 2024 09:11:36 +0800 Subject: [PATCH 08/24] Fix wrong token latency when batch size is greater than 1 (#1244) Fix the wrong 2nd token latency when batch size is greater than 1. python benchmark.py -m /mnt/llm_irs/models_6c715998_ww45.4_optimum/llama-2-7b-chat/pytorch/dldt/FP16 -n 1 --genai -ic 128 -bs 16 [ INFO ] [Average] P[0] Input token size: 128, 1st token latency: **0.36 ms/16tokens**, **2nd token latency: 1958228200.33 ms/16tokens**, 2nd tokens throughput: **0.00** 16tokenss/s Fix result: [ INFO ] [Average] P[0] Input token size: 128, 1st token latency: 91.54 ms/16tokens, 2nd token latency: 69.81 ms/16tokens, 2nd tokens throughput: 229.18 tokens/s --- tools/llm_bench/llm_bench_utils/metrics_print.py | 4 ++-- tools/llm_bench/task/speech_to_text_generation.py | 4 ++-- tools/llm_bench/task/text_generation.py | 4 ++-- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/tools/llm_bench/llm_bench_utils/metrics_print.py b/tools/llm_bench/llm_bench_utils/metrics_print.py index 905decf72b..de9d0126f8 100644 --- a/tools/llm_bench/llm_bench_utils/metrics_print.py +++ b/tools/llm_bench/llm_bench_utils/metrics_print.py @@ -149,7 +149,7 @@ def output_avg_statis_tokens(prompt_dict, prompt_idx_list, iter_data_list, batch avg_input_size = int(avg_input_size / index_num) if avg_2nd_tokens_latency > 0: avg_2nd_token_tput = (1 / avg_2nd_tokens_latency) * batch_size * 1000 - latency_unit = 'token' if is_text_gen is True else 'step' + tput_unit = latency_unit = 'token' if is_text_gen is True else 'step' if batch_size > 1: if is_text_gen is True: latency_unit = '{}tokens'.format(batch_size) @@ -157,7 +157,7 @@ def output_avg_statis_tokens(prompt_dict, prompt_idx_list, iter_data_list, batch latency_unit = '{}steps'.format(batch_size) avg_1st_token_latency = 'NA' if avg_1st_token_latency < 0 else f'{avg_1st_token_latency:.2f} ms/{latency_unit}' avg_2nd_tokens_latency = 'NA' if avg_2nd_tokens_latency < 0 else f'{avg_2nd_tokens_latency:.2f} ms/{latency_unit}' - avg_2nd_token_tput = 'NA' if avg_2nd_tokens_latency == 'NA' else f'{avg_2nd_token_tput:.2f} {latency_unit}s/s' + avg_2nd_token_tput = 'NA' if avg_2nd_tokens_latency == 'NA' else f'{avg_2nd_token_tput:.2f} {tput_unit}s/s' prefix = f'[ INFO ] [Average] P[{p_idx}]L[{loop_idx}]' if loop_idx != -1 else f'[ INFO ] [Average] P[{p_idx}]' if is_text_gen is True: output_info = '' diff --git a/tools/llm_bench/task/speech_to_text_generation.py b/tools/llm_bench/task/speech_to_text_generation.py index ad49109bab..be9c9ab295 100644 --- a/tools/llm_bench/task/speech_to_text_generation.py +++ b/tools/llm_bench/task/speech_to_text_generation.py @@ -51,10 +51,10 @@ def run_speech_2_txt_generation(input_param, args, md5_list, iter_data_list): ) end = time.perf_counter() perf_metrics = result_text.perf_metrics - first_token_time = perf_metrics.get_ttft().mean / args["batch_size"] + first_token_time = perf_metrics.get_ttft().mean second_tokens_durations = ( np.array(perf_metrics.raw_metrics.m_new_token_times[1:]) - - np.array(perf_metrics.raw_metrics.m_new_token_times[:-1]) / args["batch_size"] + - np.array(perf_metrics.raw_metrics.m_new_token_times[:-1]) ).tolist() tm_list = (np.array([first_token_time] + second_tokens_durations) / 1000).tolist() tm_infer_list = [] diff --git a/tools/llm_bench/task/text_generation.py b/tools/llm_bench/task/text_generation.py index 029bcdf16d..7718773560 100644 --- a/tools/llm_bench/task/text_generation.py +++ b/tools/llm_bench/task/text_generation.py @@ -240,10 +240,10 @@ def run_text_generation_genai(input_text, num, model, tokenizer, args, iter_data per_token_time = generation_time * 1000 / (num_tokens / args['batch_size']) else: log.warning("No generated tokens") - first_token_time = (perf_metrics.get_ttft().mean - perf_metrics.raw_metrics.tokenization_durations[-1] / 1000) / args["batch_size"] + first_token_time = (perf_metrics.get_ttft().mean - perf_metrics.raw_metrics.tokenization_durations[-1] / 1000) * args["batch_size"] second_tokens_durations = ( np.array(perf_metrics.raw_metrics.m_new_token_times[1:]) - - np.array(perf_metrics.raw_metrics.m_new_token_times[:-1]) / args["batch_size"] + - np.array(perf_metrics.raw_metrics.m_new_token_times[:-1]) ).tolist() tm_list = np.array([first_token_time] + second_tokens_durations) / 1000 From 4529dec255b603d711a479f1a90c4cbec9ae3ebf Mon Sep 17 00:00:00 2001 From: Pavel Esir Date: Fri, 22 Nov 2024 10:49:11 +0100 Subject: [PATCH 09/24] fix typos --- src/cpp/include/openvino/genai/tokenizer.hpp | 28 ++++++++++---------- src/cpp/src/make_tokenizer_stateful.cpp | 17 +++++------- src/python/py_tokenizer.cpp | 6 ++--- tests/python_tests/test_chat_generate_api.py | 3 ++- 4 files changed, 25 insertions(+), 29 deletions(-) diff --git a/src/cpp/include/openvino/genai/tokenizer.hpp b/src/cpp/include/openvino/genai/tokenizer.hpp index e90e9c80de..8d2d63ea80 100644 --- a/src/cpp/include/openvino/genai/tokenizer.hpp +++ b/src/cpp/include/openvino/genai/tokenizer.hpp @@ -47,7 +47,7 @@ class OPENVINO_GENAI_EXPORTS Tokenizer { /** * @brief encode a single prompt * @param prompt std::string with input prompt - * @param tokenization_params AnyMap with tokenization parameters, e.g. {'add_special_tokens', false} + * @param tokenization_params AnyMap with tokenization parameters, e.g. {"add_special_tokens", false} * @return pair of [input_ids, attention_mask] */ TokenizedInputs encode(const std::string prompt, const ov::AnyMap& tokenization_params = {}); @@ -55,7 +55,7 @@ class OPENVINO_GENAI_EXPORTS Tokenizer { /** * @brief encode batch of prompts. Left padding will be applied by default * @param prompts vector storing batch of prompts - * @param tokenization_params AnyMap with tokenization parameters, e.g. {'add_special_tokens', false} + * @param tokenization_params AnyMap with tokenization parameters, e.g. {"add_special_tokens", false} * @return pair of [input_ids, attention_mask] */ TokenizedInputs encode(std::vector& prompt, const ov::AnyMap& tokenization_params = {}); @@ -87,7 +87,7 @@ class OPENVINO_GENAI_EXPORTS Tokenizer { /** * @brief decode sequence of tokens * @param tokens vector storing tokens - * @param tokenization_params AnyMap with detokenization parameters, e.g. {'skip_special_tokens', false} + * @param tokenization_params AnyMap with detokenization parameters, e.g. {"skip_special_tokens", false} * @return sequence string */ std::string decode(std::vector tokens, const ov::AnyMap& detokenization_params = {}); @@ -95,18 +95,18 @@ class OPENVINO_GENAI_EXPORTS Tokenizer { /** * @brief decode sequence of tokens * @param tokens vector storing tokens - * @param tokenization_params detokenization parameters, e.g. ov::genai::skip_special_tokens(true) + * @param detokenization_params detokenization parameters, e.g. ov::genai::skip_special_tokens(true) * @return sequence string */ template - util::EnableIfAllStringAny decode(std::vector& tokens, Properties&&... properties) { - return decode(tokens, AnyMap{std::forward(properties)...}); + util::EnableIfAllStringAny decode(std::vector& tokens, Properties&&... detokenization_params) { + return decode(tokens, AnyMap{std::forward(detokenization_params)...}); } /** * @brief decode tokens. * @param tokens ov::Tensor with tokens with shape [batch_size, seq_len] - * @param tokenization_params AnyMap with detokenization parameters, e.g. {'skip_special_tokens', false} + * @param tokenization_params AnyMap with detokenization parameters, e.g. {"skip_special_tokens", false} * @return vector of std::string, with size = batch_size */ std::vector decode(ov::Tensor tokens, const ov::AnyMap& detokenization_params = {}); @@ -114,18 +114,18 @@ class OPENVINO_GENAI_EXPORTS Tokenizer { /** * @brief decode sequence of tokens * @param tokens ov::Tensor with tokens with shape [batch_size, seq_len] - * @param tokenization_params detokenization parameters, e.g. ov::genai::skip_special_tokens(true) + * @param detokenization_params detokenization parameters, e.g. ov::genai::skip_special_tokens(true) * @return vector of std::string, with size = batch_size */ template - util::EnableIfAllStringAny, Properties...> decode(ov::Tensor tokens, Properties&&... properties) { - return decode(tokens, AnyMap{std::forward(properties)...}); + util::EnableIfAllStringAny, Properties...> decode(ov::Tensor tokens, Properties&&... detokenization_params) { + return decode(tokens, AnyMap{std::forward(detokenization_params)...}); } /** * @brief batched decoding of tokens. * @param tokens vector of vectors with tokens, tokens.size() is equal to batch_size - * @param tokenization_params AnyMap with detokenization parameters, e.g. {'skip_special_tokens', false} + * @param tokenization_params AnyMap with detokenization parameters, e.g. {"skip_special_tokens", false} * @return vector of std::string, with size equal to batch_size */ std::vector decode(std::vector> tokens, const ov::AnyMap& detokenization_params = {}); @@ -133,12 +133,12 @@ class OPENVINO_GENAI_EXPORTS Tokenizer { /** * @brief decode sequence of tokens * @param tokens ov::Tensor with tokens with shape [batch_size, seq_len] - * @param tokenization_params detokenization parameters, e.g. ov::genai::skip_special_tokens(true) + * @param detokenization_params detokenization parameters, e.g. ov::genai::skip_special_tokens(true) * @return vector of std::string, with size = batch_size */ template - util::EnableIfAllStringAny, Properties...> decode(std::vector> tokens, Properties&&... properties) { - return decode(tokens, AnyMap{std::forward(properties)...}); + util::EnableIfAllStringAny, Properties...> decode(std::vector> tokens, Properties&&... detokenization_params) { + return decode(tokens, AnyMap{std::forward(detokenization_params)...}); } /** diff --git a/src/cpp/src/make_tokenizer_stateful.cpp b/src/cpp/src/make_tokenizer_stateful.cpp index 538a935e56..3551e713c9 100644 --- a/src/cpp/src/make_tokenizer_stateful.cpp +++ b/src/cpp/src/make_tokenizer_stateful.cpp @@ -48,25 +48,20 @@ bool ov::genai::MakeCombineSegmentsSatateful::run_on_model(const std::shared_ptr } bool ov::genai::MakeVocabDecoderSatateful::run_on_model(const std::shared_ptr& model) { - std::shared_ptr vocab_decoder_node; for (auto node: model->get_ordered_ops()) { - if (strcmp(node->get_type_info().name, "VocabDecoder") == 0) { + if (strcmp(node->get_type_info().name, "VocabDecoder") == 0) vocab_decoder_node = node; - } } - auto val = vocab_decoder_node->input_value(4); - auto val_type = vocab_decoder_node->input_value(4).get_element_type(); - if (!vocab_decoder_node || !vocab_decoder_node->input_value(4).get_element_type().is_integral_number()) { + if (!vocab_decoder_node || vocab_decoder_node->get_input_size() < 5) + return false; + if (!vocab_decoder_node->input_value(4).get_element_type().is_integral_number()) return false; - } std::shared_ptr skip_tokens_const = std::dynamic_pointer_cast(vocab_decoder_node->get_input_node_shared_ptr(4)); - if (!skip_tokens_const) { + if (!skip_tokens_const) return false; - } - auto start_const = std::make_shared(ov::element::i32, ov::Shape{1}, std::vector{0}); auto int_max_const = std::make_shared(ov::element::i32, ov::Shape{1}, std::vector{std::numeric_limits::max()}); @@ -87,4 +82,4 @@ bool ov::genai::MakeVocabDecoderSatateful::run_on_model(const std::shared_ptradd_sinks({assign}); model->add_variables({variable}); return true; -} \ No newline at end of file +} diff --git a/src/python/py_tokenizer.cpp b/src/python/py_tokenizer.cpp index db4643a65c..dae2ffe775 100644 --- a/src/python/py_tokenizer.cpp +++ b/src/python/py_tokenizer.cpp @@ -68,7 +68,7 @@ void init_tokenizer(py::module_& m) { detokenization_params[ov::genai::skip_special_tokens.name()] = skip_special_tokens; return pyutils::handle_utf8(tok.decode(tokens, detokenization_params)); }, - py::arg("tokens"), py::arg("skip_special_tokens") = true, + py::arg("tokens"), py::arg("skip_special_tokens") = false, R"(Decode a sequence into a string prompt.)" ) @@ -79,7 +79,7 @@ void init_tokenizer(py::module_& m) { detokenization_params[ov::genai::skip_special_tokens.name()] = skip_special_tokens; return pyutils::handle_utf8(tok.decode(tokens, detokenization_params)); }, - py::arg("tokens"), py::arg("skip_special_tokens") = true, + py::arg("tokens"), py::arg("skip_special_tokens") = false, R"(Decode tensor into a list of string prompts.)") .def( @@ -89,7 +89,7 @@ void init_tokenizer(py::module_& m) { detokenization_params[ov::genai::skip_special_tokens.name()] = skip_special_tokens; return pyutils::handle_utf8(tok.decode(tokens, detokenization_params)); }, - py::arg("tokens"), py::arg("skip_special_tokens") = true, + py::arg("tokens"), py::arg("skip_special_tokens") = false, R"(Decode a batch of tokens into a list of string prompt.)") .def("apply_chat_template", [](Tokenizer& tok, diff --git a/tests/python_tests/test_chat_generate_api.py b/tests/python_tests/test_chat_generate_api.py index a87a2c7555..efd1d87416 100644 --- a/tests/python_tests/test_chat_generate_api.py +++ b/tests/python_tests/test_chat_generate_api.py @@ -219,6 +219,7 @@ def test_add_special_tokens(add_special_tokens, prompt): assert np.all(res_genai == res_hf) @pytest.mark.precommit +@pytest.mark.xfail(reason="Need to turn them back on when openvino_tokenizers will be updated.") @pytest.mark.nightly @pytest.mark.parametrize("add_special_tokens", [True, False]) @pytest.mark.parametrize("skip_special_tokens", [True, False]) @@ -235,6 +236,6 @@ def test_add_special_tokens(add_special_tokens, skip_special_tokens, prompt): assert np.all(res_genai == res_hf) # Decode with skip_special_tokens - decoded_genai = genai_tokenizer.decode(res_genai, skip_special_tokens=skip_special_tokens) + decoded_genai = genai_tokenizer.decode(res_genai, skip_special_tokens=skip_special_tokens)[0] decoded_hf = hf_tokenizer.decode(res_hf[0], skip_special_tokens=skip_special_tokens) assert decoded_genai == decoded_hf From 18e8d5b59c9f4776a59811db4f299c2da1ea974f Mon Sep 17 00:00:00 2001 From: Alexander Kozlov Date: Fri, 22 Nov 2024 16:12:08 +0400 Subject: [PATCH 10/24] [WWB]: Updated readme with the latest information (#1248) --- tools/who_what_benchmark/README.md | 34 ++++++++++++++++++++---------- 1 file changed, 23 insertions(+), 11 deletions(-) diff --git a/tools/who_what_benchmark/README.md b/tools/who_what_benchmark/README.md index 012782bad3..0e597859d2 100644 --- a/tools/who_what_benchmark/README.md +++ b/tools/who_what_benchmark/README.md @@ -9,12 +9,12 @@ WWB provides default datasets for the supported use cases. However, it is relati * Command-line interface for Hugging Face and OpenVINO models and API to support broader inference backends. * Simple and quick accuracy test for compressed, quantized, pruned, distilled LLMs. It works with any model that supports HuggingFace Transformers text generation API including: * HuggingFace Transformers compressed models via [Bitsandbytes](https://huggingface.co/docs/transformers/main_classes/quantization#transformers.BitsAndBytesConfig) + * [OpenVINO](https://github.com/openvinotoolkit/openvino) and [NNCF](https://github.com/openvinotoolkit/nncf) via [Optimum-Intel](https://github.com/huggingface/optimum-intel) and OpenVINO [GenAI](https://github.com/openvinotoolkit/openvino.genai) * [GPTQ](https://huggingface.co/docs/transformers/main_classes/quantization#transformers.GPTQConfig) via HuggingFace API * Llama.cpp via [BigDL-LLM](https://github.com/intel-analytics/BigDL/tree/main/python/llm) - * [OpenVINO](https://github.com/openvinotoolkit/openvino) and [NNCF](https://github.com/openvinotoolkit/nncf) via [Optimum-Intel](https://github.com/huggingface/optimum-intel) * Support of custom datasets of the user choice -* Validation of text-to-image pipelines. Computes similarity score between generated images: - * Supports Diffusers library and Optimum-Intel via `Text2ImageEvaluator` class. +* Validation of text-to-image pipelines. Computes similarity score between generated images with Diffusers library, Optimum-Intel, and OpenVINO GenAI via `Text2ImageEvaluator` class. +* Validation of Visual Language pipelines. Computes similarity score between generated images with Diffusers library, Optimum-Intel, and OpenVINO GenAI via `VisualTextEvaluator` class. ### Installation Install WWB and its requirements from the source using `pip` or any other package manager. For example, @@ -41,18 +41,30 @@ wwb --target-model phi-3-openvino --gt-data gt.csv --model-type text wwb --target-model phi-3-openvino --gt-data gt.csv --model-type text --genai ``` -### Compare Text-to-image models (Diffusers) +> **NOTE**: use --verbose option for debug to see the outputs with the largest difference. + +### Compare Text-to-image models ```sh -# Export FP16 model to OpenVINO -optimum-cli export openvino -m SimianLuo/LCM_Dreamshaper_v7 --weight-format fp16 sd-lcm-fp16 # Export model with 8-bit quantized weights to OpenVINO optimum-cli export openvino -m SimianLuo/LCM_Dreamshaper_v7 --weight-format int8 sd-lcm-int8 -# Collect the references and save the mappling in the .json file. -# Reference images will be stored in the "reference" subfolder under the same path with .json. -wwb --base-model sd-lcm-fp16 --gt-data lcm_test/sd_xl.json --model-type text-to-image +# Collect the references and save the mappling in the .csv file. +# Reference images will be stored in the "reference" subfolder under the same path with .csv. +wwb --base-model SimianLuo/LCM_Dreamshaper_v7--gt-data lcm_test/gt.csv --model-type text-to-image --hf +# Compute the metric +# Target images will be stored in the "target" subfolder under the same path with .csv. +wwb --target-model sd-lcm-int8 --gt-data lcm_test/gt.csv --model-type text-to-image --genai +``` + +### Compare Visual Language Models (VLMs) +```sh +# Export FP16 model to OpenVINO +optimum-cli export openvino -m llava-hf/llava-v1.6-mistral-7b-hf --weight-format int8 llava-int8 +# Collect the references and save the mappling in the .csv file. +# Reference images will be stored in the "reference" subfolder under the same path with .csv. +wwb --base-model llava-hf/llava-v1.6-mistral-7b-hf --gt-data llava_test/gt.csv --model-type visual-text --hf # Compute the metric -# Target images will be stored in the "target" subfolder under the same path with .json. -wwb --target-model sd-lcm-int8 --gt-data lcm_test/sd_xl.json --model-type text-to-image +# Target images will be stored in the "target" subfolder under the same path with .csv. +wwb --target-model llava-int8 --gt-data llava_test/gt.csv --model-type visual-text --genai ``` ### API From d490c18aabe6c9491fab6d6601948e91f10d6fc3 Mon Sep 17 00:00:00 2001 From: Alexander Kozlov Date: Mon, 25 Nov 2024 10:38:04 +0300 Subject: [PATCH 11/24] [WWB]: Added ability to compare results for previously collected outputs w/o models provided (#1238) - Compare outputs collected from the previous runs - Kept only "similarity" metric by default as the only one that is used in CI Example: ```shell optimum-cli export openvino -m Qwen/Qwen2-0.5B-Instruct --weight-format fp16 models/Qwen2-0.5B-Instruct-fp16 mkdir qwen2_N_FP16 # References from NAT FP16 wwb --base-model Qwen/Qwen2-0.5B-Instruct --gt-data qwen2_N_FP16/gt.csv --hf --num-samples 4 # Compare N_O_FP16, save Optimum data for references wwb --target-model models/Qwen2-0.5B-Instruct-fp16 --gt-data qwen2_N_FP16/gt.csv --output qwen2_N_O_FP16 --num-samples 4 # Compare N_G_FP16, save GenAI data for references wwb --target-model models/Qwen2-0.5B-Instruct-fp16 --gt-data qwen2_N_FP16/gt.csv --genai --output qwen2_N_G_FP16 --num-samples 4 # Compare O_G_FP16, use pre-generated grout truth and target data from the previous runs wwb --target-data qwen2_N_G_FP16/target.csv --gt-data qwen2_N_O_FP16/target.csv --genai --output qwen2_O_G_FP16 --num-samples 4 # The same for INT8 optimum-cli export openvino -m Qwen/Qwen2-0.5B-Instruct --weight-format int8 models/Qwen2-0.5B-Instruct-int8 # Compare N_G_INT8, save GenAI data for references wwb --target-model models/Qwen2-0.5B-Instruct-int8 --gt-data qwen2_N_FP16/gt.csv --genai --output qwen2_N_G_INT8 --num-samples 4 ``` --- .../tests/test_cli_image.py | 166 ++++++++++-------- .../who_what_benchmark/tests/test_cli_text.py | 128 ++++++++------ .../who_what_benchmark/tests/test_cli_vlm.py | 142 ++++++++------- .../whowhatbench/registry.py | 2 +- .../whowhatbench/text2image_evaluator.py | 14 +- .../whowhatbench/text_evaluator.py | 10 +- .../whowhatbench/visualtext_evaluator.py | 8 +- tools/who_what_benchmark/whowhatbench/wwb.py | 59 ++++--- 8 files changed, 298 insertions(+), 231 deletions(-) diff --git a/tools/who_what_benchmark/tests/test_cli_image.py b/tools/who_what_benchmark/tests/test_cli_image.py index 374df2a1ec..b2c2015f80 100644 --- a/tools/who_what_benchmark/tests/test_cli_image.py +++ b/tools/who_what_benchmark/tests/test_cli_image.py @@ -14,7 +14,6 @@ def run_wwb(args): logger.info(" ".join(["TRANSFOREMRS_VERBOSITY=debug wwb"] + args)) result = subprocess.run(["wwb"] + args, capture_output=True, text=True) logger.info(result) - print(" ".join(["TRANSFOREMRS_VERBOSITY=debug wwb"] + args)) return result @@ -27,7 +26,7 @@ def run_wwb(args): ], ) def test_image_model_types(model_id, model_type, backend): - GT_FILE = "test_sd.json" + GT_FILE = "test_sd.csv" wwb_args = [ "--base-model", model_id, @@ -70,79 +69,94 @@ def test_image_model_types(model_id, model_type, backend): ], ) def test_image_model_genai(model_id, model_type): - GT_FILE = "test_sd.json" - MODEL_PATH = tempfile.TemporaryDirectory().name - - result = subprocess.run(["optimum-cli", "export", - "openvino", "-m", model_id, - MODEL_PATH], capture_output=True, text=True) - assert result.returncode == 0 - - wwb_args = [ - "--base-model", - MODEL_PATH, - "--num-samples", - "1", - "--gt-data", - GT_FILE, - "--device", - "CPU", - "--model-type", - model_type, - ] - result = run_wwb(wwb_args) - assert result.returncode == 0 - assert os.path.exists(GT_FILE) - assert os.path.exists("reference") - - wwb_args = [ - "--target-model", - MODEL_PATH, - "--num-samples", - "1", - "--gt-data", - GT_FILE, - "--device", - "CPU", - "--model-type", - model_type, - "--genai", - ] - result = run_wwb(wwb_args) - - assert result.returncode == 0 - assert "Metrics for model" in result.stderr - similarity = float(str(result.stderr).split(" ")[-1]) - assert similarity >= 0.98 - assert os.path.exists("target") - - output_dir = tempfile.TemporaryDirectory().name - wwb_args = [ - "--target-model", - MODEL_PATH, - "--num-samples", - "1", - "--gt-data", - GT_FILE, - "--device", - "CPU", - "--model-type", - model_type, - "--output", - output_dir, - ] - result = run_wwb(wwb_args) - assert os.path.exists(os.path.join(output_dir, "target")) - assert os.path.exists(os.path.join(output_dir, "target.json")) - - try: - os.remove(GT_FILE) - except OSError: - pass - shutil.rmtree("reference", ignore_errors=True) - shutil.rmtree("target", ignore_errors=True) - shutil.rmtree(MODEL_PATH, ignore_errors=True) - shutil.rmtree(output_dir, ignore_errors=True) + with tempfile.TemporaryDirectory() as temp_dir: + GT_FILE = os.path.join(temp_dir, "gt.csv") + MODEL_PATH = os.path.join(temp_dir, model_id.replace("/", "--")) + + result = subprocess.run(["optimum-cli", "export", + "openvino", "-m", model_id, + MODEL_PATH], + capture_output=True, text=True) + assert result.returncode == 0 + + wwb_args = [ + "--base-model", + MODEL_PATH, + "--num-samples", + "1", + "--gt-data", + GT_FILE, + "--device", + "CPU", + "--model-type", + model_type, + ] + result = run_wwb(wwb_args) + assert result.returncode == 0 + assert os.path.exists(GT_FILE) + assert os.path.exists(os.path.join(temp_dir, "reference")) + + wwb_args = [ + "--target-model", + MODEL_PATH, + "--num-samples", + "1", + "--gt-data", + GT_FILE, + "--device", + "CPU", + "--model-type", + model_type, + "--genai", + ] + result = run_wwb(wwb_args) + + assert result.returncode == 0 + assert "Metrics for model" in result.stderr + similarity = float(str(result.stderr).split(" ")[-1]) + assert similarity >= 0.98 + assert os.path.exists(os.path.join(temp_dir, "target")) + + output_dir = tempfile.TemporaryDirectory().name + wwb_args = [ + "--target-model", + MODEL_PATH, + "--num-samples", + "1", + "--gt-data", + GT_FILE, + "--device", + "CPU", + "--model-type", + model_type, + "--output", + output_dir, + ] + result = run_wwb(wwb_args) + assert result.returncode == 0 + assert os.path.exists(os.path.join(output_dir, "target")) + assert os.path.exists(os.path.join(output_dir, "target.csv")) + + # test w/o models + wwb_args = [ + "--target-data", + os.path.join(output_dir, "target.csv"), + "--num-samples", + "1", + "--gt-data", + GT_FILE, + "--device", + "CPU", + "--model-type", + model_type, + ] + result = run_wwb(wwb_args) + assert result.returncode == 0 + + shutil.rmtree("reference", ignore_errors=True) + shutil.rmtree("target", ignore_errors=True) + shutil.rmtree(MODEL_PATH, ignore_errors=True) + shutil.rmtree(output_dir, ignore_errors=True) @pytest.mark.parametrize( @@ -152,7 +166,7 @@ def test_image_model_genai(model_id, model_type): ], ) def test_image_custom_dataset(model_id, model_type, backend): - GT_FILE = "test_sd.json" + GT_FILE = "test_sd.csv" wwb_args = [ "--base-model", model_id, diff --git a/tools/who_what_benchmark/tests/test_cli_text.py b/tools/who_what_benchmark/tests/test_cli_text.py index cf71adc08a..0baf60a5a4 100644 --- a/tools/who_what_benchmark/tests/test_cli_text.py +++ b/tools/who_what_benchmark/tests/test_cli_text.py @@ -73,29 +73,28 @@ def test_text_target_model(): @pytest.fixture def test_text_gt_data(): - with tempfile.NamedTemporaryFile(suffix=".csv") as tmpfile: - temp_file_name = tmpfile.name + with tempfile.TemporaryDirectory() as temp_dir: + temp_file_name = os.path.join(temp_dir, "gt.csv") - result = run_wwb( - [ - "--base-model", - base_model_path, - "--gt-data", - temp_file_name, - "--dataset", - "EleutherAI/lambada_openai,en", - "--dataset-field", - "text", - "--split", - "test", - "--num-samples", - "2", - "--device", - "CPU", - ] - ) - data = pd.read_csv(temp_file_name) - os.remove(temp_file_name) + result = run_wwb( + [ + "--base-model", + base_model_path, + "--gt-data", + temp_file_name, + "--dataset", + "EleutherAI/lambada_openai,en", + "--dataset-field", + "text", + "--split", + "test", + "--num-samples", + "2", + "--device", + "CPU", + ] + ) + data = pd.read_csv(temp_file_name) assert result.returncode == 0 assert len(data["questions"].values) == 2 @@ -107,6 +106,8 @@ def test_text_output_directory(): [ "--base-model", base_model_path, + "--gt-data", + os.path.join(temp_dir, "gt.csv"), "--target-model", target_model_path, "--num-samples", @@ -121,7 +122,23 @@ def test_text_output_directory(): assert "Metrics for model" in result.stderr assert os.path.exists(os.path.join(temp_dir, "metrics_per_qustion.csv")) assert os.path.exists(os.path.join(temp_dir, "metrics.csv")) - assert os.path.exists(os.path.join(temp_dir, "target.json")) + assert os.path.exists(os.path.join(temp_dir, "target.csv")) + + # test measurtement w/o models + result = run_wwb( + [ + "--gt-data", + os.path.join(temp_dir, "gt.csv"), + "--target-data", + os.path.join(temp_dir, "target.csv"), + "--num-samples", + "2", + "--device", + "CPU", + ] + ) + assert result.returncode == 0 + assert "Metrics for model" in result.stderr def test_text_verbose(): @@ -143,46 +160,43 @@ def test_text_verbose(): def test_text_language_autodetect(): - temp_file_name = tempfile.NamedTemporaryFile(suffix=".csv").name - - result = run_wwb( - [ - "--base-model", - "Qwen/Qwen2-0.5B", - "--gt-data", - temp_file_name, - "--num-samples", - "2", - "--device", - "CPU", - ] - ) - data = pd.read_csv(temp_file_name) - os.remove(temp_file_name) + with tempfile.TemporaryDirectory() as temp_dir: + temp_file_name = os.path.join(temp_dir, "gt.csv") + result = run_wwb( + [ + "--base-model", + "Qwen/Qwen2-0.5B", + "--gt-data", + temp_file_name, + "--num-samples", + "2", + "--device", + "CPU", + ] + ) + data = pd.read_csv(temp_file_name) assert result.returncode == 0 assert "马克" in data["prompts"].values[0] def test_text_hf_model(): - with tempfile.NamedTemporaryFile(suffix=".csv") as tmpfile: - temp_file_name = tmpfile.name - - result = run_wwb( - [ - "--base-model", - model_id, - "--gt-data", - temp_file_name, - "--num-samples", - "2", - "--device", - "CPU", - "--hf", - ] - ) - data = pd.read_csv(temp_file_name) - os.remove(temp_file_name) + with tempfile.TemporaryDirectory() as temp_dir: + temp_file_name = os.path.join(temp_dir, "gt.csv") + result = run_wwb( + [ + "--base-model", + model_id, + "--gt-data", + temp_file_name, + "--num-samples", + "2", + "--device", + "CPU", + "--hf", + ] + ) + data = pd.read_csv(temp_file_name) assert result.returncode == 0 assert len(data["prompts"].values) == 2 diff --git a/tools/who_what_benchmark/tests/test_cli_vlm.py b/tools/who_what_benchmark/tests/test_cli_vlm.py index d45283493e..5b33abf33c 100644 --- a/tools/who_what_benchmark/tests/test_cli_vlm.py +++ b/tools/who_what_benchmark/tests/test_cli_vlm.py @@ -24,70 +24,88 @@ def run_wwb(args): ], ) def test_vlm_basic(model_id, model_type): - GT_FILE = tempfile.NamedTemporaryFile(suffix=".json").name - MODEL_PATH = tempfile.TemporaryDirectory().name + with tempfile.TemporaryDirectory() as temp_dir: + GT_FILE = os.path.join(temp_dir, "gt.csv") + MODEL_PATH = os.path.join(temp_dir, model_id.replace("/", "--")) - result = subprocess.run(["optimum-cli", "export", - "openvino", "-m", model_id, - MODEL_PATH, "--task", - "image-text-to-text", - "--trust-remote-code"], - capture_output=True, - text=True, - ) - assert result.returncode == 0 + result = subprocess.run(["optimum-cli", "export", + "openvino", "-m", model_id, + MODEL_PATH, "--task", + "image-text-to-text", + "--trust-remote-code"], + capture_output=True, + text=True, + ) + assert result.returncode == 0 - wwb_args = [ - "--base-model", - model_id, - "--num-samples", - "1", - "--gt-data", - GT_FILE, - "--device", - "CPU", - "--model-type", - model_type, - "--hf", - ] - result = run_wwb(wwb_args) - assert result.returncode == 0 + # Collect reference with HF model + wwb_args = [ + "--base-model", + model_id, + "--num-samples", + "1", + "--gt-data", + GT_FILE, + "--device", + "CPU", + "--model-type", + model_type, + "--hf", + ] + result = run_wwb(wwb_args) + assert result.returncode == 0 - wwb_args = [ - "--target-model", - MODEL_PATH, - "--num-samples", - "1", - "--gt-data", - GT_FILE, - "--device", - "CPU", - "--model-type", - model_type, - ] - result = run_wwb(wwb_args) - assert result.returncode == 0 + # test Optimum + wwb_args = [ + "--target-model", + MODEL_PATH, + "--num-samples", + "1", + "--gt-data", + GT_FILE, + "--device", + "CPU", + "--model-type", + model_type, + ] + result = run_wwb(wwb_args) + assert result.returncode == 0 - wwb_args = [ - "--target-model", - MODEL_PATH, - "--num-samples", - "1", - "--gt-data", - GT_FILE, - "--device", - "CPU", - "--model-type", - model_type, - "--genai", - ] - result = run_wwb(wwb_args) - assert result.returncode == 0 + # test GenAI + wwb_args = [ + "--target-model", + MODEL_PATH, + "--num-samples", + "1", + "--gt-data", + GT_FILE, + "--device", + "CPU", + "--model-type", + model_type, + "--genai", + "--output", + "target", + ] + result = run_wwb(wwb_args) + assert result.returncode == 0 - try: - os.remove(GT_FILE) - except OSError: - pass - shutil.rmtree("reference", ignore_errors=True) - shutil.rmtree("target", ignore_errors=True) - shutil.rmtree(MODEL_PATH, ignore_errors=True) + # test w/o models + wwb_args = [ + "--target-data", + "target/target.csv", + "--num-samples", + "1", + "--gt-data", + GT_FILE, + "--device", + "CPU", + "--model-type", + model_type, + "--genai", + ] + result = run_wwb(wwb_args) + assert result.returncode == 0 + shutil.rmtree("reference", ignore_errors=True) + shutil.rmtree("target", ignore_errors=True) + shutil.rmtree(MODEL_PATH, ignore_errors=True) diff --git a/tools/who_what_benchmark/whowhatbench/registry.py b/tools/who_what_benchmark/whowhatbench/registry.py index 85fabf618e..0cfbf8e440 100644 --- a/tools/who_what_benchmark/whowhatbench/registry.py +++ b/tools/who_what_benchmark/whowhatbench/registry.py @@ -29,7 +29,7 @@ def dump_predictions(self, csv_name: str): pass @abstractmethod - def score(self, model, **kwargs): + def score(self, model_or_data, **kwargs): pass @abstractmethod diff --git a/tools/who_what_benchmark/whowhatbench/text2image_evaluator.py b/tools/who_what_benchmark/whowhatbench/text2image_evaluator.py index 2663414917..1ff7ff5e21 100644 --- a/tools/who_what_benchmark/whowhatbench/text2image_evaluator.py +++ b/tools/who_what_benchmark/whowhatbench/text2image_evaluator.py @@ -84,15 +84,19 @@ def __init__( def get_generation_fn(self): return self.generation_fn - def score(self, model, gen_image_fn=None, output_dir=None, **kwargs): - model.resolution = self.resolution + def score(self, model_or_data, gen_image_fn=None, output_dir=None, **kwargs): if output_dir is None: image_folder = os.path.join(self.gt_dir, "target") else: image_folder = os.path.join(output_dir, "target") - predictions = self._generate_data( - model, gen_image_fn, image_folder - ) + + if isinstance(model_or_data, str) and os.path.exists(model_or_data): + predictions = pd.read_csv(model_or_data, keep_default_na=False) + else: + model_or_data.resolution = self.resolution + predictions = self._generate_data( + model_or_data, gen_image_fn, image_folder + ) self.predictions = predictions all_metrics_per_prompt = {} diff --git a/tools/who_what_benchmark/whowhatbench/text_evaluator.py b/tools/who_what_benchmark/whowhatbench/text_evaluator.py index eb89083496..50ce224def 100644 --- a/tools/who_what_benchmark/whowhatbench/text_evaluator.py +++ b/tools/who_what_benchmark/whowhatbench/text_evaluator.py @@ -1,5 +1,6 @@ from typing import Any, Union +import os import pandas as pd from tqdm import tqdm @@ -97,7 +98,7 @@ def __init__( tokenizer: Any = None, gt_data: str = None, test_data: Union[str, list] = None, - metrics=("similarity", "divergency"), + metrics="similarity", similarity_model_id: str = "sentence-transformers/all-mpnet-base-v2", max_new_tokens=128, crop_question=True, @@ -155,8 +156,11 @@ def __init__( def get_generation_fn(self): return self.generation_fn - def score(self, model, gen_answer_fn=None, **kwargs): - predictions = self._generate_data(model, gen_answer_fn, self.generation_config) + def score(self, model_or_data, gen_answer_fn=None, **kwargs): + if isinstance(model_or_data, str) and os.path.exists(model_or_data): + predictions = pd.read_csv(model_or_data, keep_default_na=False) + else: + predictions = self._generate_data(model_or_data, gen_answer_fn, self.generation_config) self.predictions = predictions all_metrics_per_prompt = {} diff --git a/tools/who_what_benchmark/whowhatbench/visualtext_evaluator.py b/tools/who_what_benchmark/whowhatbench/visualtext_evaluator.py index ef10bdafcf..99027971d8 100644 --- a/tools/who_what_benchmark/whowhatbench/visualtext_evaluator.py +++ b/tools/who_what_benchmark/whowhatbench/visualtext_evaluator.py @@ -1,5 +1,6 @@ from typing import Any, Union +import os import datasets import pandas as pd from diffusers.utils.loading_utils import load_image @@ -64,8 +65,11 @@ def __init__( seqs_per_request=seqs_per_request, ) - def score(self, model, gen_answer_fn=None, **kwargs): - predictions = self._generate_data(model, gen_answer_fn) + def score(self, model_or_data, gen_answer_fn=None, **kwargs): + if isinstance(model_or_data, str) and os.path.exists(model_or_data): + predictions = pd.read_csv(model_or_data, keep_default_na=False) + else: + predictions = self._generate_data(model_or_data, gen_answer_fn, self.generation_config) self.predictions = predictions all_metrics_per_prompt = {} diff --git a/tools/who_what_benchmark/whowhatbench/wwb.py b/tools/who_what_benchmark/whowhatbench/wwb.py index f3c5f8224a..0a01a8e8df 100644 --- a/tools/who_what_benchmark/whowhatbench/wwb.py +++ b/tools/who_what_benchmark/whowhatbench/wwb.py @@ -271,12 +271,17 @@ def parse_args(): default=None, help="Tokenizer for divergency metric. If not provided, it will be load from base_model or target_model.", ) - parser.add_argument( "--gt-data", default=None, - help="CSV file containing GT outputs from base_model. If defined and exists then base_model will not used." - " If the files does not exist, it will be generated by base_model evaluation.", + help="CSV file containing GT outputs from --base-model. If defined and exists then --base-model will not used." + " If the files does not exist, it will be generated by --base-model evaluation.", + ) + parser.add_argument( + "--target-data", + default=None, + help="CSV file containing outputs from target model. If defined and exists then --target-model will not used." + " If the files does not exist, it will be generated by --target-model evaluation.", ) parser.add_argument( "--model-type", @@ -385,14 +390,11 @@ def parse_args(): def check_args(args): - if args.base_model is None and args.target_model is None: - raise ValueError( - "Wether --base-model or --target-model should be provided") if args.base_model is None and args.gt_data is None: raise ValueError("Wether --base-model or --gt-data should be provided") - if args.target_model is None and args.gt_data is None: + if args.target_model is None and args.gt_data is None and args.target_data: raise ValueError( - "Wether --target-model or --gt-data should be provided") + "Wether --target-model, --target-data or --gt-data should be provided") def load_tokenizer(args): @@ -405,7 +407,7 @@ def load_tokenizer(args): tokenizer = AutoTokenizer.from_pretrained( args.base_model, trust_remote_code=True ) - else: + elif args.target_model is not None: tokenizer = AutoTokenizer.from_pretrained( args.target_model, trust_remote_code=True ) @@ -419,7 +421,7 @@ def load_processor(args): processor = AutoProcessor.from_pretrained( args.base_model, trust_remote_code=True ) - else: + elif args.target_model is not None: processor = AutoProcessor.from_pretrained( args.target_model, trust_remote_code=True ) @@ -611,20 +613,27 @@ def main(): evaluator.dump_gt(args.gt_data) del base_model - if args.target_model: - target_model = load_model( - args.model_type, - args.target_model, - args.device, - args.ov_config, - args.hf, - args.genai, - ) - all_metrics_per_question, all_metrics = evaluator.score( - target_model, - evaluator.get_generation_fn() if args.genai else None, - output_dir=args.output - ) + if args.target_data or args.target_model: + if args.target_data and os.path.exists(args.target_data): + all_metrics_per_question, all_metrics = evaluator.score( + args.target_data, + None, + output_dir=args.output + ) + else: + target_model = load_model( + args.model_type, + args.target_model, + args.device, + args.ov_config, + args.hf, + args.genai, + ) + all_metrics_per_question, all_metrics = evaluator.score( + target_model, + evaluator.get_generation_fn() if args.genai else None, + output_dir=args.output + ) logger.info("Metrics for model: %s", args.target_model) logger.info(all_metrics) @@ -635,7 +644,7 @@ def main(): df.to_csv(os.path.join(args.output, "metrics_per_qustion.csv")) df = pd.DataFrame(all_metrics) df.to_csv(os.path.join(args.output, "metrics.csv")) - evaluator.dump_predictions(os.path.join(args.output, "target.json")) + evaluator.dump_predictions(os.path.join(args.output, "target.csv")) if args.verbose and args.target_model is not None: if args.model_type == "text" or args.model_type == "visual-text": From 43caa0b1352e8508b91ac658c143231fe16ead9c Mon Sep 17 00:00:00 2001 From: Ekaterina Aidova Date: Mon, 25 Nov 2024 13:32:29 +0400 Subject: [PATCH 12/24] use genai callback in image gen and switch to genai by default (#1249) CVS-157814 --- .github/workflows/llm_bench-python.yml | 18 ++--- tools/llm_bench/README.md | 5 +- tools/llm_bench/benchmark.py | 3 +- .../llm_bench/llm_bench_utils/config_class.py | 15 ++-- .../llm_bench_utils/metrics_print.py | 17 +++-- .../llm_bench/llm_bench_utils/model_utils.py | 15 +++- tools/llm_bench/llm_bench_utils/ov_utils.py | 69 ++++++++++++++++--- tools/llm_bench/task/image_generation.py | 14 ++-- 8 files changed, 108 insertions(+), 48 deletions(-) diff --git a/.github/workflows/llm_bench-python.yml b/.github/workflows/llm_bench-python.yml index 0ac47d1aa0..77f26d33a0 100644 --- a/.github/workflows/llm_bench-python.yml +++ b/.github/workflows/llm_bench-python.yml @@ -66,28 +66,28 @@ jobs: python ./tools/llm_bench/benchmark.py -m tiny-random-qwen -d cpu -n 1 -f pt env: GIT_LFS_SKIP_SMUDGE: 0 - - name: Test tiny-random-baichuan2 on Linux + - name: Test tiny-random-baichuan2 on Linux Optimum Intel run: | optimum-cli export openvino --model katuni4ka/tiny-random-baichuan2 --trust-remote-code --weight-format fp16 ./ov_models/tiny-random-baichuan2/pytorch/dldt/FP16 - python ./tools/llm_bench/benchmark.py -m ./ov_models/tiny-random-baichuan2/pytorch/dldt/FP16/ -d cpu -n 1 - - name: Test tiny-stable-diffusion on Linux + python ./tools/llm_bench/benchmark.py -m ./ov_models/tiny-random-baichuan2/pytorch/dldt/FP16/ -d cpu -n 1 --optimum + - name: Test tiny-stable-diffusion on Linux Optimum Intel run: | optimum-cli export openvino --model segmind/tiny-sd --trust-remote-code --weight-format fp16 ./ov_models/tiny-sd/pytorch/dldt/FP16/ - python ./tools/llm_bench/benchmark.py -m ./ov_models/tiny-sd/pytorch/dldt/FP16/ -pf ./tools/llm_bench/prompts/stable-diffusion.jsonl -d cpu -n 1 + python ./tools/llm_bench/benchmark.py -m ./ov_models/tiny-sd/pytorch/dldt/FP16/ -pf ./tools/llm_bench/prompts/stable-diffusion.jsonl -d cpu -n 1 --optimum - name: Test dreamlike-anime on Linux with GenAI run: | optimum-cli export openvino --model dreamlike-art/dreamlike-anime-1.0 --task stable-diffusion --weight-format fp16 ov_models/dreamlike-art-dreamlike-anime-1.0/FP16 - python ./tools/llm_bench/benchmark.py -m ./ov_models/dreamlike-art-dreamlike-anime-1.0/FP16/ -pf ./tools/llm_bench/prompts/stable-diffusion.jsonl -d cpu -n 1 --genai + python ./tools/llm_bench/benchmark.py -m ./ov_models/dreamlike-art-dreamlike-anime-1.0/FP16/ -pf ./tools/llm_bench/prompts/stable-diffusion.jsonl -d cpu -n 1 - name: Test dreamlike-anime on Linux with GenAI and LoRA run: | wget -O ./ov_models/soulcard.safetensors https://civitai.com/api/download/models/72591 - python ./tools/llm_bench/benchmark.py -m ./ov_models/dreamlike-art-dreamlike-anime-1.0/FP16/ -pf ./tools/llm_bench/prompts/stable-diffusion.jsonl -d cpu -n 1 --genai --lora ./ov_models/soulcard.safetensors --lora_alphas 0.7 + python ./tools/llm_bench/benchmark.py -m ./ov_models/dreamlike-art-dreamlike-anime-1.0/FP16/ -pf ./tools/llm_bench/prompts/stable-diffusion.jsonl -d cpu -n 1 --lora ./ov_models/soulcard.safetensors --lora_alphas 0.7 - name: Test TinyLlama-1.1B-Chat-v1.0 in Speculative Deconding mode on Linux run: | optimum-cli export openvino --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 --trust-remote-code --weight-format fp16 ov_models/TinyLlama-1.1B-Chat-v1.0/FP16 optimum-cli export openvino --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 --trust-remote-code --weight-format int8 ov_models/TinyLlama-1.1B-Chat-v1.0/INT8 - python ./tools/llm_bench/benchmark.py -m ./ov_models/TinyLlama-1.1B-Chat-v1.0/FP16/ --draft_model ./ov_models/TinyLlama-1.1B-Chat-v1.0/INT8/ -p "Why is the Sun yellow?" -d cpu --draft_device cpu -n 1 --genai --assistant_confidence_threshold 0.4 - python ./tools/llm_bench/benchmark.py -m ./ov_models/TinyLlama-1.1B-Chat-v1.0/FP16/ --draft_model ./ov_models/TinyLlama-1.1B-Chat-v1.0/INT8/ -p "Why is the Sun yellow?" -d cpu --draft_device cpu -n 1 --genai --num_assistant_tokens 5 + python ./tools/llm_bench/benchmark.py -m ./ov_models/TinyLlama-1.1B-Chat-v1.0/FP16/ --draft_model ./ov_models/TinyLlama-1.1B-Chat-v1.0/INT8/ -p "Why is the Sun yellow?" -d cpu --draft_device cpu -n 1 --assistant_confidence_threshold 0.4 + python ./tools/llm_bench/benchmark.py -m ./ov_models/TinyLlama-1.1B-Chat-v1.0/FP16/ --draft_model ./ov_models/TinyLlama-1.1B-Chat-v1.0/INT8/ -p "Why is the Sun yellow?" -d cpu --draft_device cpu -n 1 --num_assistant_tokens 5 - name: Test whisper-tiny on Linux run: | GIT_LFS_SKIP_SMUDGE=1 git clone --depth 1 --branch main --single-branch https://huggingface.co/datasets/facebook/multilingual_librispeech @@ -97,8 +97,8 @@ jobs: tar zxvf data/mls_polish/train/audio/3283_1447_000.tar.gz -C data/mls_polish/train/audio/3283_1447_000/ cd .. optimum-cli export openvino --trust-remote-code --model openai/whisper-tiny ./ov_models/whisper-tiny + python ./tools/llm_bench/benchmark.py -m ./ov_models/whisper-tiny --media multilingual_librispeech/data/mls_polish/train/audio/3283_1447_000/3283_1447_000000.flac -d cpu -n 1 --optimum python ./tools/llm_bench/benchmark.py -m ./ov_models/whisper-tiny --media multilingual_librispeech/data/mls_polish/train/audio/3283_1447_000/3283_1447_000000.flac -d cpu -n 1 - python ./tools/llm_bench/benchmark.py -m ./ov_models/whisper-tiny --media multilingual_librispeech/data/mls_polish/train/audio/3283_1447_000/3283_1447_000000.flac -d cpu -n 1 --genai - name: WWB Tests run: | GIT_CLONE_PROTECTION_ACTIVE=false pip install -r ${{ env.WWB_PATH }}/requirements.txt diff --git a/tools/llm_bench/README.md b/tools/llm_bench/README.md index d3f643b58f..87f6e91271 100755 --- a/tools/llm_bench/README.md +++ b/tools/llm_bench/README.md @@ -161,11 +161,10 @@ For example, `--load_config config.json` as following will result in streams.num ## 6. Execution on CPU device -OpenVINO is by default built with [oneTBB](https://github.com/oneapi-src/oneTBB/) threading library, while Torch uses [OpenMP](https://www.openmp.org/). Both threading libraries have ['busy-wait spin'](https://gcc.gnu.org/onlinedocs/libgomp/GOMP_005fSPINCOUNT.html) by default. When running LLM pipeline on CPU device, there is threading overhead in the switching between inference on CPU with OpenVINO (oneTBB) and postprocessing (For example: greedy search or beam search) with Torch (OpenMP). +OpenVINO is by default built with [oneTBB](https://github.com/oneapi-src/oneTBB/) threading library, while Torch uses [OpenMP](https://www.openmp.org/). Both threading libraries have ['busy-wait spin'](https://gcc.gnu.org/onlinedocs/libgomp/GOMP_005fSPINCOUNT.html) by default. When running LLM pipeline on CPU device, there is threading overhead in the switching between inference on CPU with OpenVINO (oneTBB) and postprocessing (For example: greedy search or beam search) with Torch (OpenMP). The default benchmarking scenarion uses OpenVINO GenAI that implements own postprocessing api without additional dependencies. **Alternative solutions** -1. Use --genai option which uses OpenVINO genai API instead of optimum-intel API. In this case postprocessing is executed with OpenVINO genai API. -2. Without --genai option which uses optimum-intel API, set environment variable [OMP_WAIT_POLICY](https://gcc.gnu.org/onlinedocs/libgomp/OMP_005fWAIT_005fPOLICY.html) to PASSIVE which will disable OpenMP 'busy-wait', and benchmark.py will limit the Torch thread number by default to avoid using CPU cores which is in 'busy-wait' by OpenVINO inference. Users can also set the number with --set_torch_thread option. +1. With --optimum option which uses optimum-intel API, set environment variable [OMP_WAIT_POLICY](https://gcc.gnu.org/onlinedocs/libgomp/OMP_005fWAIT_005fPOLICY.html) to PASSIVE which will disable OpenMP 'busy-wait', and benchmark.py will limit the Torch thread number by default to avoid using CPU cores which is in 'busy-wait' by OpenVINO inference. Users can also set the number with --set_torch_thread option. ## 7. Additional Resources diff --git a/tools/llm_bench/benchmark.py b/tools/llm_bench/benchmark.py index d652c8b48f..fe5068b009 100644 --- a/tools/llm_bench/benchmark.py +++ b/tools/llm_bench/benchmark.py @@ -130,7 +130,8 @@ def get_argprser(): ) parser.add_argument('-od', '--output_dir', help='Save the input text and generated text, images to files') llm_bench_utils.model_utils.add_stateful_model_arguments(parser) - parser.add_argument("--genai", action="store_true", help="Use OpenVINO GenAI optimized pipelines for benchmarking") + parser.add_argument("--genai", action="store_true", help="[DEPRECATED] Use OpenVINO GenAI optimized pipelines for benchmarking. Enabled by default") + parser.add_argument("--optimum", action="store_true", help="Use Optimum Intel pipelines for benchmarking") parser.add_argument( "--lora", nargs='*', diff --git a/tools/llm_bench/llm_bench_utils/config_class.py b/tools/llm_bench/llm_bench_utils/config_class.py index 2f6cd95664..12385d2879 100644 --- a/tools/llm_bench/llm_bench_utils/config_class.py +++ b/tools/llm_bench/llm_bench_utils/config_class.py @@ -7,9 +7,7 @@ from optimum.intel.openvino import ( OVModelForCausalLM, OVModelForSeq2SeqLM, - OVStableDiffusionPipeline, - OVLatentConsistencyModelPipeline, - OVStableDiffusionXLPipeline, + OVDiffusionPipeline, OVModelForSpeechSeq2Seq ) from llm_bench_utils.ov_model_classes import OVMPTModel, OVLDMSuperResolutionPipeline, OVChatGLMModel @@ -22,19 +20,14 @@ 'falcon': AutoTokenizer, } +IMAGE_GEN_CLS = OVDiffusionPipeline + OV_MODEL_CLASSES_MAPPING = { 'decoder': OVModelForCausalLM, 't5': OVModelForSeq2SeqLM, 'blenderbot': OVModelForSeq2SeqLM, 'falcon': OVModelForCausalLM, 'mpt': OVMPTModel, - 'stable-diffusion-xl': OVStableDiffusionXLPipeline, - 'sdxl': OVStableDiffusionXLPipeline, - 'lcm-sdxl': OVStableDiffusionXLPipeline, - 'ssd-': OVStableDiffusionXLPipeline, - 'lcm-ssd-': OVStableDiffusionXLPipeline, - 'stable_diffusion': OVStableDiffusionPipeline, - 'lcm': OVLatentConsistencyModelPipeline, 'replit': OVMPTModel, 'codet5': OVModelForSeq2SeqLM, 'codegen2': OVModelForCausalLM, @@ -57,7 +50,7 @@ } USE_CASES = { - 'image_gen': ['stable-diffusion-', 'ssd-', 'deepfloyd-if', 'tiny-sd', 'small-sd', 'lcm-', 'sdxl', 'dreamlike'], + 'image_gen': ['stable-diffusion-', 'ssd-', 'tiny-sd', 'small-sd', 'lcm-', 'sdxl', 'dreamlike', "flux"], 'speech2text': ['whisper'], 'image_cls': ['vit'], 'code_gen': ['replit', 'codegen2', 'codegen', 'codet5', "stable-code"], diff --git a/tools/llm_bench/llm_bench_utils/metrics_print.py b/tools/llm_bench/llm_bench_utils/metrics_print.py index de9d0126f8..73e83dc672 100644 --- a/tools/llm_bench/llm_bench_utils/metrics_print.py +++ b/tools/llm_bench/llm_bench_utils/metrics_print.py @@ -97,12 +97,17 @@ def print_stable_diffusion_infer_latency(iter_str, iter_data, stable_diffusion, prefix = f'[{iter_str}][P{prompt_idx}]' log.info(f"{prefix} First step of unet latency: {iter_data['first_token_latency']:.2f} ms/step, " f"other steps of unet latency: {iter_data['other_tokens_avg_latency']:.2f} ms/step",) - log.info(f"{prefix} Text encoder latency: {stable_diffusion.get_text_encoder_latency():.2f} ms/step, " - f"unet latency: {stable_diffusion.get_unet_latency():.2f} ms/step, " - f"vae decoder latency: {stable_diffusion.get_vae_decoder_latency():.2f} ms/step, " - f"text encoder step count: {stable_diffusion.get_text_encoder_step_count()}, " - f"unet step count: {stable_diffusion.get_unet_step_count()}, " - f"vae decoder step count: {stable_diffusion.get_vae_decoder_step_count()}",) + has_text_encoder_time = stable_diffusion.get_text_encoder_step_count() != -1 + log_str = ( + f"{prefix} Text encoder latency: {stable_diffusion.get_text_encoder_latency():.2f}" if has_text_encoder_time else f"{prefix} Text encoder latency: N/A " + f"unet latency: {stable_diffusion.get_unet_latency():.2f} ms/step, " + f"vae decoder latency: {stable_diffusion.get_vae_decoder_latency():.2f} ms/step, ") + if has_text_encoder_time: + log_str += f"text encoder step count: {stable_diffusion.get_text_encoder_step_count()}, " + log_str += ( + f"unet step count: {stable_diffusion.get_unet_step_count()}, " + f"vae decoder step count: {stable_diffusion.get_vae_decoder_step_count()}") + log.info(log_str) def print_ldm_unet_vqvae_infer_latency(iter_num, iter_data, tms=None, warm_up=False, prompt_idx=-1): diff --git a/tools/llm_bench/llm_bench_utils/model_utils.py b/tools/llm_bench/llm_bench_utils/model_utils.py index 6539bef232..f72557b6c5 100644 --- a/tools/llm_bench/llm_bench_utils/model_utils.py +++ b/tools/llm_bench/llm_bench_utils/model_utils.py @@ -95,6 +95,13 @@ def analyze_args(args): model_args['torch_compile_input_module'] = args.torch_compile_input_module model_args['media'] = args.media + optimum = args.optimum + + if optimum and args.genai: + raise RuntimeError("`--genai` and `--optimum` can not be selected in the same time") + model_args["optimum"] = optimum + model_args["genai"] = not optimum + has_torch_compile_options = any([args.torch_compile_options is not None, args.torch_compile_options is not None, args.torch_compile_dynamic]) if model_args["torch_compile_backend"] is None and has_torch_compile_options: log.warning("torch.compile configuration options provided, but backend is not selected, openvino backend will be used") @@ -102,7 +109,6 @@ def analyze_args(args): model_args['convert_tokenizer'] = args.convert_tokenizer model_args['subsequent'] = args.subsequent model_args['output_dir'] = args.output_dir - model_args['genai'] = args.genai model_args['lora'] = args.lora model_args['lora_alphas'] = args.lora_alphas model_args["use_cb"] = args.use_cb @@ -135,7 +141,7 @@ def analyze_args(args): model_args['model_type'] = get_model_type(model_name, use_case, model_framework) model_args['model_name'] = model_name - if (args.use_cb or args.draft_model) and not args.genai: + if (args.use_cb or args.draft_model) and optimum: raise RuntimeError("Continuous batching mode supported only via OpenVINO GenAI") cb_config = None if args.cb_config: @@ -169,6 +175,11 @@ def get_use_case(model_name_or_path): config = json.loads(config_file.read_text()) except Exception: config = None + if (Path(model_name_or_path) / "model_index.json").exists(): + diffusers_config = json.loads((Path(model_name_or_path) / "model_index.json").read_text()) + pipe_type = diffusers_config.get("_class_name") + if pipe_type in ["StableDiffusionPipeline", "StableDiffusionXLPipeline", "StableDiffusion3Pipeline", "FluxPipeline", "LatentConsistencyModelPipeline"]: + return "image_gen", pipe_type.replace("Pipeline", "") if config is not None: for case, model_ids in USE_CASES.items(): diff --git a/tools/llm_bench/llm_bench_utils/ov_utils.py b/tools/llm_bench/llm_bench_utils/ov_utils.py index cf0d0d831c..9ebd1363e3 100644 --- a/tools/llm_bench/llm_bench_utils/ov_utils.py +++ b/tools/llm_bench/llm_bench_utils/ov_utils.py @@ -11,7 +11,7 @@ import json import types from llm_bench_utils.hook_common import get_bench_hook -from llm_bench_utils.config_class import OV_MODEL_CLASSES_MAPPING, TOKENIZE_CLASSES_MAPPING, DEFAULT_MODEL_CLASSES +from llm_bench_utils.config_class import OV_MODEL_CLASSES_MAPPING, TOKENIZE_CLASSES_MAPPING, DEFAULT_MODEL_CLASSES, IMAGE_GEN_CLS import openvino.runtime.opset13 as opset from transformers import pipeline @@ -171,11 +171,13 @@ def create_text_gen_model(model_path, device, **kwargs): if not model_path_existed: raise RuntimeError(f'==Failure ==: model path:{model_path} does not exist') else: - if kwargs.get("genai", False) and is_genai_available(log_msg=True): + if kwargs.get("genai", True) and is_genai_available(log_msg=True): if model_class not in [OV_MODEL_CLASSES_MAPPING[default_model_type], OV_MODEL_CLASSES_MAPPING["mpt"], OV_MODEL_CLASSES_MAPPING["chatglm"]]: log.warning("OpenVINO GenAI based benchmarking is not available for {model_type}. Will be switched to default benchmarking") else: + log.info("Selected OpenVINO GenAI for benchmarking") return create_genai_text_gen_model(model_path, device, ov_config, **kwargs) + log.info("Selected Optimum Intel for benchmarking") remote_code = False try: model_config = AutoConfig.from_pretrained(model_path, trust_remote_code=False) @@ -295,23 +297,23 @@ def convert_ov_tokenizer(tokenizer_path): def create_image_gen_model(model_path, device, **kwargs): - default_model_type = DEFAULT_MODEL_CLASSES[kwargs['use_case']] - model_type = kwargs.get('model_type', default_model_type) - model_class = OV_MODEL_CLASSES_MAPPING[model_type] + model_class = IMAGE_GEN_CLS model_path = Path(model_path) ov_config = kwargs['config'] if not Path(model_path).exists(): raise RuntimeError(f'==Failure ==: model path:{model_path} does not exist') else: - if kwargs.get("genai", False) and is_genai_available(log_msg=True): + if kwargs.get("genai", True) and is_genai_available(log_msg=True): + log.info("Selected OpenVINO GenAI for benchmarking") return create_genai_image_gen_model(model_path, device, ov_config, **kwargs) + log.info("Selected Optimum Intel for benchmarking") start = time.perf_counter() ov_model = model_class.from_pretrained(model_path, device=device, ov_config=ov_config) end = time.perf_counter() from_pretrained_time = end - start log.info(f'From pretrained time: {from_pretrained_time:.2f}s') - return ov_model, from_pretrained_time, False + return ov_model, from_pretrained_time, False, None def get_genai_clip_text_encoder(model_index_data, model_path, device, ov_config): @@ -350,6 +352,51 @@ def get_genai_unet_model(model_index_data, model_path, device, ov_config): def create_genai_image_gen_model(model_path, device, ov_config, **kwargs): import openvino_genai + class PerfCollector: + def __init__(self) -> types.NoneType: + self.iteration_time = [] + self.start_time = time.perf_counter() + self.duration = -1 + + def __call__(self, step, latents): + self.iteration_time.append(time.perf_counter() - self.start_time) + self.start_time = time.perf_counter() + return False + + def reset(self): + self.iteration_time = [] + self.start_time = time.perf_counter() + self.duration = -1 + + def get_1st_unet_latency(self): + return self.iteration_time[0] * 1000 if len(self.iteration_time) > 0 else 0 + + def get_2nd_unet_latency(self): + return sum(self.iteration_time[1:]) / (len(self.iteration_time) - 1) * 1000 if len(self.iteration_time) > 1 else 0 + + def get_unet_latency(self): + return (sum(self.iteration_time) / len(self.iteration_time)) * 1000 if len(self.iteration_time) > 0 else 0 + + def get_vae_decoder_latency(self): + if self.duration != -1: + vae_time = self.duration - sum(self.iteration_time) + return vae_time * 1000 + return 0 + + def get_text_encoder_latency(self): + return -1 + + def get_text_encoder_step_count(self): + return -1 + + def get_unet_step_count(self): + return len(self.iteration_time) + + def get_vae_decoder_step_count(self): + return 1 + + callback = PerfCollector() + adapter_config = get_lora_config(kwargs.get("lora", None), kwargs.get("lora_alphas", [])) if adapter_config: ov_config['adapters'] = adapter_config @@ -393,7 +440,7 @@ def create_genai_image_gen_model(model_path, device, ov_config, **kwargs): end = time.perf_counter() log.info(f'Pipeline initialization time: {end - start:.2f}s') - return t2i_pipe, end - start, True + return t2i_pipe, end - start, True, callback def create_ldm_super_resolution_model(model_path, device, **kwargs): @@ -414,7 +461,7 @@ def create_ldm_super_resolution_model(model_path, device, **kwargs): def create_genai_speech_2_txt_model(model_path, device, **kwargs): import openvino_genai as ov_genai - if kwargs.get("genai", False) is False: + if kwargs.get("genai", True) is False: raise RuntimeError('==Failure the command line does not set --genai ==') if is_genai_available(log_msg=True) is False: raise RuntimeError('==Failure genai is not enable ==') @@ -442,11 +489,13 @@ def create_speech_2txt_model(model_path, device, **kwargs): if not model_path_existed: raise RuntimeError(f'==Failure ==: model path:{model_path} does not exist') else: - if kwargs.get("genai", False) and is_genai_available(log_msg=True): + if kwargs.get("genai", True) and is_genai_available(log_msg=True): if model_class not in [OV_MODEL_CLASSES_MAPPING[default_model_type]]: log.warning("OpenVINO GenAI based benchmarking is not available for {model_type}. Will be switched to default bencmarking") else: + log.info("Selected OpenVINO GenAI for benchmarking") return create_genai_speech_2_txt_model(model_path, device, **kwargs) + log.info("Selected Optimum Intel for benchmarking") start = time.perf_counter() ov_model = model_class.from_pretrained( model_path, diff --git a/tools/llm_bench/task/image_generation.py b/tools/llm_bench/task/image_generation.py index b6260568bf..f227898ef6 100644 --- a/tools/llm_bench/task/image_generation.py +++ b/tools/llm_bench/task/image_generation.py @@ -41,7 +41,7 @@ def collects_input_args(image_param, model_type, model_name): return input_args -def run_image_generation(image_param, num, image_id, pipe, args, iter_data_list, proc_id, mem_consumption): +def run_image_generation(image_param, num, image_id, pipe, args, iter_data_list, proc_id, mem_consumption, callback=None): set_seed(args['seed']) input_text = image_param['prompt'] input_args = collects_input_args(image_param, args['model_type'], args['model_name']) @@ -104,7 +104,7 @@ def run_image_generation(image_param, num, image_id, pipe, args, iter_data_list, stable_diffusion_hook.clear_statistics() -def run_image_generation_genai(image_param, num, image_id, pipe, args, iter_data_list, proc_id, mem_consumption): +def run_image_generation_genai(image_param, num, image_id, pipe, args, iter_data_list, proc_id, mem_consumption, callback=None): set_seed(args['seed']) input_text = image_param['prompt'] input_args = collects_input_args(image_param, args['model_type'], args['model_name']) @@ -125,9 +125,11 @@ def run_image_generation_genai(image_param, num, image_id, pipe, args, iter_data if num == 0 and args["output_dir"] is not None: for bs_idx, in_text in enumerate(input_text_list): llm_bench_utils.output_file.output_image_input_text(in_text, args, image_id, bs_idx, proc_id) + callback.reset() start = time.perf_counter() - res = pipe.generate(input_text, **input_args).data + res = pipe.generate(input_text, **input_args, callback=callback).data end = time.perf_counter() + callback.duration = end - start if (args['mem_consumption'] == 1 and num == 0) or args['mem_consumption'] == 2: mem_consumption.end_collect_momory_consumption() max_rss_mem_consumption, max_shared_mem_consumption, max_uss_mem_consumption = mem_consumption.get_max_memory_consumption() @@ -155,7 +157,7 @@ def run_image_generation_genai(image_param, num, image_id, pipe, args, iter_data max_rss_mem=max_rss_mem_consumption, max_shared_mem=max_shared_mem_consumption, max_uss_mem=max_uss_mem_consumption, - stable_diffusion=None, + stable_diffusion=callback, prompt_idx=image_id ) metrics_print.print_generated(num, warm_up=(num == 0), generated=rslt_img_fn, prompt_idx=image_id) @@ -163,7 +165,7 @@ def run_image_generation_genai(image_param, num, image_id, pipe, args, iter_data def run_image_generation_benchmark(model_path, framework, device, args, num_iters, mem_consumption): - pipe, pretrain_time, use_genai = FW_UTILS[framework].create_image_gen_model(model_path, device, **args) + pipe, pretrain_time, use_genai, callback = FW_UTILS[framework].create_image_gen_model(model_path, device, **args) iter_data_list = [] input_image_list = get_image_prompt(args) if framework == "ov" and not use_genai: @@ -198,7 +200,7 @@ def run_image_generation_benchmark(model_path, framework, device, args, num_iter for image_id, image_param in enumerate(image_list): p_idx = prompt_idx_list[image_id] iter_timestamp[num][p_idx]['start'] = datetime.datetime.now().isoformat() - image_gen_fn(image_param, num, prompt_idx_list[image_id], pipe, args, iter_data_list, proc_id, mem_consumption) + image_gen_fn(image_param, num, prompt_idx_list[image_id], pipe, args, iter_data_list, proc_id, mem_consumption, callback) iter_timestamp[num][p_idx]['end'] = datetime.datetime.now().isoformat() prefix = '[warm-up]' if num == 0 else '[{}]'.format(num) log.info(f"{prefix}[P{p_idx}] start: {iter_timestamp[num][p_idx]['start']}, end: {iter_timestamp[num][p_idx]['end']}") From 21037497e6958c7df020131d77984a953a4beb08 Mon Sep 17 00:00:00 2001 From: Pavel Esir Date: Mon, 25 Nov 2024 12:09:04 +0100 Subject: [PATCH 13/24] align with the openvino_tokenizers --- src/cpp/include/openvino/genai/tokenizer.hpp | 6 +++--- src/cpp/src/make_tokenizer_stateful.cpp | 13 +++++++++---- src/cpp/src/tokenizer.cpp | 5 ++--- src/python/py_tokenizer.cpp | 6 +++--- 4 files changed, 17 insertions(+), 13 deletions(-) diff --git a/src/cpp/include/openvino/genai/tokenizer.hpp b/src/cpp/include/openvino/genai/tokenizer.hpp index 8d2d63ea80..36f63d2b5e 100644 --- a/src/cpp/include/openvino/genai/tokenizer.hpp +++ b/src/cpp/include/openvino/genai/tokenizer.hpp @@ -87,7 +87,7 @@ class OPENVINO_GENAI_EXPORTS Tokenizer { /** * @brief decode sequence of tokens * @param tokens vector storing tokens - * @param tokenization_params AnyMap with detokenization parameters, e.g. {"skip_special_tokens", false} + * @param detokenization_params AnyMap with detokenization parameters, e.g. {"skip_special_tokens", false} * @return sequence string */ std::string decode(std::vector tokens, const ov::AnyMap& detokenization_params = {}); @@ -106,7 +106,7 @@ class OPENVINO_GENAI_EXPORTS Tokenizer { /** * @brief decode tokens. * @param tokens ov::Tensor with tokens with shape [batch_size, seq_len] - * @param tokenization_params AnyMap with detokenization parameters, e.g. {"skip_special_tokens", false} + * @param detokenization_params AnyMap with detokenization parameters, e.g. {"skip_special_tokens", false} * @return vector of std::string, with size = batch_size */ std::vector decode(ov::Tensor tokens, const ov::AnyMap& detokenization_params = {}); @@ -125,7 +125,7 @@ class OPENVINO_GENAI_EXPORTS Tokenizer { /** * @brief batched decoding of tokens. * @param tokens vector of vectors with tokens, tokens.size() is equal to batch_size - * @param tokenization_params AnyMap with detokenization parameters, e.g. {"skip_special_tokens", false} + * @param detokenization_params AnyMap with detokenization parameters, e.g. {"skip_special_tokens", false} * @return vector of std::string, with size equal to batch_size */ std::vector decode(std::vector> tokens, const ov::AnyMap& detokenization_params = {}); diff --git a/src/cpp/src/make_tokenizer_stateful.cpp b/src/cpp/src/make_tokenizer_stateful.cpp index 3551e713c9..4685b0e715 100644 --- a/src/cpp/src/make_tokenizer_stateful.cpp +++ b/src/cpp/src/make_tokenizer_stateful.cpp @@ -60,7 +60,8 @@ bool ov::genai::MakeVocabDecoderSatateful::run_on_model(const std::shared_ptr skip_tokens_const = std::dynamic_pointer_cast(vocab_decoder_node->get_input_node_shared_ptr(4)); - if (!skip_tokens_const) + std::shared_ptr skip_tokens_slice = std::dynamic_pointer_cast(vocab_decoder_node->get_input_node_shared_ptr(4)); + if (!skip_tokens_const && !skip_tokens_slice) return false; auto start_const = std::make_shared(ov::element::i32, ov::Shape{1}, std::vector{0}); @@ -74,10 +75,14 @@ bool ov::genai::MakeVocabDecoderSatateful::run_on_model(const std::shared_ptr(int_max_const, read_value); - std::shared_ptr slice_node = std::make_shared(skip_tokens_const, start_const, stop, one_const); + // If already has slice just replace the stop input. + if (skip_tokens_slice) { + skip_tokens_slice->input(2).replace_source_output(stop); + } else { + std::shared_ptr slice_node = std::make_shared(skip_tokens_const, start_const, stop, one_const); + vocab_decoder_node->input(4).replace_source_output(slice_node->output(0)); + } - vocab_decoder_node->input(4).replace_source_output(slice_node->output(0)); - auto assign = std::make_shared(read_value, variable); model->add_sinks({assign}); model->add_variables({variable}); diff --git a/src/cpp/src/tokenizer.cpp b/src/cpp/src/tokenizer.cpp index fc6ba75d90..d0a472a40f 100644 --- a/src/cpp/src/tokenizer.cpp +++ b/src/cpp/src/tokenizer.cpp @@ -74,7 +74,7 @@ class Tokenizer::TokenizerImpl { // To change the adding special tokens mode we use a statefull subgraph, // this flag holds the current state value of the CompiledModel. bool m_add_special_tokens = true; - bool m_skip_special_tokens = false; + bool m_skip_special_tokens = true; bool m_older_than_24_5 = false; int64_t m_pad_token_id = -1; @@ -89,7 +89,7 @@ class Tokenizer::TokenizerImpl { void set_state_if_necessary(CircularBufferQueueElementGuard& infer_request_guard, const ov::AnyMap& params) { bool add_special_tokens_flag = true; - bool skip_special_tokens_flag = false; + bool skip_special_tokens_flag = true; ov::genai::utils::read_anymap_param(params, add_special_tokens.name(), add_special_tokens_flag); ov::genai::utils::read_anymap_param(params, skip_special_tokens.name(), skip_special_tokens_flag); @@ -164,7 +164,6 @@ class Tokenizer::TokenizerImpl { m_detokenizer = core.compile_model(ov_detokenizer, device, properties); } - const size_t INFER_REQUEST_QUEUE_SIZE = m_tokenizer.get_property(ov::optimal_number_of_infer_requests); m_ireq_queue_tokenizer = std::make_unique>( INFER_REQUEST_QUEUE_SIZE, diff --git a/src/python/py_tokenizer.cpp b/src/python/py_tokenizer.cpp index dae2ffe775..db4643a65c 100644 --- a/src/python/py_tokenizer.cpp +++ b/src/python/py_tokenizer.cpp @@ -68,7 +68,7 @@ void init_tokenizer(py::module_& m) { detokenization_params[ov::genai::skip_special_tokens.name()] = skip_special_tokens; return pyutils::handle_utf8(tok.decode(tokens, detokenization_params)); }, - py::arg("tokens"), py::arg("skip_special_tokens") = false, + py::arg("tokens"), py::arg("skip_special_tokens") = true, R"(Decode a sequence into a string prompt.)" ) @@ -79,7 +79,7 @@ void init_tokenizer(py::module_& m) { detokenization_params[ov::genai::skip_special_tokens.name()] = skip_special_tokens; return pyutils::handle_utf8(tok.decode(tokens, detokenization_params)); }, - py::arg("tokens"), py::arg("skip_special_tokens") = false, + py::arg("tokens"), py::arg("skip_special_tokens") = true, R"(Decode tensor into a list of string prompts.)") .def( @@ -89,7 +89,7 @@ void init_tokenizer(py::module_& m) { detokenization_params[ov::genai::skip_special_tokens.name()] = skip_special_tokens; return pyutils::handle_utf8(tok.decode(tokens, detokenization_params)); }, - py::arg("tokens"), py::arg("skip_special_tokens") = false, + py::arg("tokens"), py::arg("skip_special_tokens") = true, R"(Decode a batch of tokens into a list of string prompt.)") .def("apply_chat_template", [](Tokenizer& tok, From d26233b172d60063e50257058513a560e8e591b1 Mon Sep 17 00:00:00 2001 From: Pavel Esir Date: Mon, 25 Nov 2024 12:56:37 +0100 Subject: [PATCH 14/24] update signature --- src/python/openvino_genai/py_openvino_genai.pyi | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/python/openvino_genai/py_openvino_genai.pyi b/src/python/openvino_genai/py_openvino_genai.pyi index df290a9744..5e4d2dd7b2 100644 --- a/src/python/openvino_genai/py_openvino_genai.pyi +++ b/src/python/openvino_genai/py_openvino_genai.pyi @@ -1303,17 +1303,17 @@ class Tokenizer: Embeds input prompts with special tags for a chat scenario. """ @typing.overload - def decode(self, tokens: list[int]) -> str: + def decode(self, tokens: list[int], skip_special_tokens: bool = True) -> str: """ Decode a sequence into a string prompt. """ @typing.overload - def decode(self, tokens: openvino._pyopenvino.Tensor) -> list[str]: + def decode(self, tokens: openvino._pyopenvino.Tensor, skip_special_tokens: bool = True) -> list[str]: """ Decode tensor into a list of string prompts. """ @typing.overload - def decode(self, tokens: list[list[int]]) -> list[str]: + def decode(self, tokens: list[list[int]], skip_special_tokens: bool = True) -> list[str]: """ Decode a batch of tokens into a list of string prompt. """ From 111bb5bb2afe5b6cc4b01ea935ed7af38c6075de Mon Sep 17 00:00:00 2001 From: Pavel Esir Date: Tue, 26 Nov 2024 10:45:37 +0100 Subject: [PATCH 15/24] add barier for AnyMap key names, apply review comments --- src/cpp/src/tokenizer.cpp | 30 ++++++++++++++++++++++-------- 1 file changed, 22 insertions(+), 8 deletions(-) diff --git a/src/cpp/src/tokenizer.cpp b/src/cpp/src/tokenizer.cpp index d0a472a40f..41f9a6abd4 100644 --- a/src/cpp/src/tokenizer.cpp +++ b/src/cpp/src/tokenizer.cpp @@ -55,6 +55,14 @@ ov::genai::TokenizedInputs pad_left(ov::Tensor& input_ids, ov::Tensor& attention return {input_ids, attention_mask}; } +void check_arguments(const ov::AnyMap& parameters, std::set allowed_argnames) { + for (const auto& [key, value] : parameters) { + if (allowed_argnames.find(key) == allowed_argnames.end()) { + OPENVINO_THROW("unacceptable parameter key: " + key); + } + } +} + constexpr char bos_token_key_name[] = "bos_token"; constexpr char eos_token_key_name[] = "eos_token"; constexpr char pad_token_key_name[] = "pad_token"; @@ -88,8 +96,8 @@ class Tokenizer::TokenizerImpl { std::string m_chat_template = {}; void set_state_if_necessary(CircularBufferQueueElementGuard& infer_request_guard, const ov::AnyMap& params) { - bool add_special_tokens_flag = true; - bool skip_special_tokens_flag = true; + bool add_special_tokens_flag = m_add_special_tokens; + bool skip_special_tokens_flag = m_skip_special_tokens; ov::genai::utils::read_anymap_param(params, add_special_tokens.name(), add_special_tokens_flag); ov::genai::utils::read_anymap_param(params, skip_special_tokens.name(), skip_special_tokens_flag); @@ -145,7 +153,7 @@ class Tokenizer::TokenizerImpl { auto device = "CPU"; // currently openvino_tokenizer supports only CPU auto ov_tokenizer = core.read_model(tokenizer_path / "openvino_tokenizer.xml"); - std::shared_ptr ov_detokenizer; + std::shared_ptr ov_detokenizer = nullptr; if (std::filesystem::exists(tokenizer_path / "openvino_detokenizer.xml")) { ov_detokenizer = core.read_model(tokenizer_path / "openvino_detokenizer.xml"); } @@ -155,12 +163,11 @@ class Tokenizer::TokenizerImpl { manager_tok.register_pass(); manager_tok.run_passes(ov_tokenizer); - ov::pass::Manager manager_detok; - manager_detok.register_pass(); - manager_detok.run_passes(ov_detokenizer); - m_tokenizer = core.compile_model(ov_tokenizer, device, properties); - if (std::filesystem::exists(tokenizer_path / "openvino_detokenizer.xml")) { + if (ov_detokenizer) { + ov::pass::Manager manager_detok; + manager_detok.register_pass(); + manager_detok.run_passes(ov_detokenizer); m_detokenizer = core.compile_model(ov_detokenizer, device, properties); } @@ -516,30 +523,37 @@ Tokenizer::Tokenizer(const std::filesystem::path& tokenizer_path, const ov::AnyM } TokenizedInputs Tokenizer::encode(const std::string prompt, const ov::AnyMap& tokenization_params) { + check_arguments(tokenization_params, {ov::genai::add_special_tokens.name()}); return m_pimpl->encode(std::move(prompt), tokenization_params); } TokenizedInputs Tokenizer::encode(std::vector& prompts, const ov::AnyMap& tokenization_params) { + check_arguments(tokenization_params, {ov::genai::add_special_tokens.name()}); return m_pimpl->encode(prompts, tokenization_params); } TokenizedInputs Tokenizer::encode(std::vector&& prompts, const ov::AnyMap& tokenization_params) { + check_arguments(tokenization_params, {ov::genai::add_special_tokens.name()}); return m_pimpl->encode(prompts, tokenization_params); } TokenizedInputs Tokenizer::encode(std::initializer_list& text, const ov::AnyMap& tokenization_params) { + check_arguments(tokenization_params, {ov::genai::add_special_tokens.name()}); return encode(std::vector(text.begin(), text.end()), tokenization_params); } std::string Tokenizer::decode(std::vector tokens, const ov::AnyMap& detokenization_params) { + check_arguments(detokenization_params, {ov::genai::skip_special_tokens.name()}); return m_pimpl->decode(tokens, detokenization_params); } std::vector Tokenizer::decode(ov::Tensor tokens, const ov::AnyMap& detokenization_params) { + check_arguments(detokenization_params, {ov::genai::skip_special_tokens.name()}); return m_pimpl->decode(tokens, detokenization_params); } std::vector Tokenizer::decode(std::vector> lines, const ov::AnyMap& detokenization_params) { + check_arguments(detokenization_params, {ov::genai::skip_special_tokens.name()}); return m_pimpl->decode(lines, detokenization_params); } From 3da2aebb6829856d25f391ae5f0e9d069cca6cd9 Mon Sep 17 00:00:00 2001 From: Ilya Lavrenov Date: Wed, 27 Nov 2024 00:04:10 +0400 Subject: [PATCH 16/24] [Build] Use officially released py-build-cmake version (#1253) --- pyproject.toml | 21 ++++++++++++++++++--- 1 file changed, 18 insertions(+), 3 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index c9d5dce207..de3e5b5a9e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -3,16 +3,31 @@ name = "openvino-genai" version = "2025.0.0.0" description = "Library of the most popular Generative AI model pipelines, optimized execution methods, and samples" requires-python = ">=3.9" -readme = {file = "src/README.md", content-type="text/markdown"} -license = {text = "OSI Approved :: Apache Software License"} +readme = { file = "src/README.md", content-type="text/markdown" } +license = { "file" = "LICENSE" } authors = [ { name = "OpenVINO Developers", email = "openvino@intel.com" }, ] classifiers = [ + "Development Status :: 5 - Production/Stable", + "License :: OSI Approved :: Apache Software License", + "Intended Audience :: Developers", + "Intended Audience :: Science/Research", + "Topic :: Scientific/Engineering :: Artificial Intelligence", + "Topic :: Software Development :: Libraries :: Python Modules", "Programming Language :: Python :: 3.9", "Programming Language :: Python :: 3.10", "Programming Language :: Python :: 3.11", "Programming Language :: Python :: 3.12", + "Programming Language :: Python :: 3.13", + "Operating System :: Unix", + "Operating System :: POSIX :: Linux", + "Operating System :: Microsoft :: Windows", + "Operating System :: MacOS", + "Programming Language :: C++", + "Programming Language :: C", + "Programming Language :: Python :: 3 :: Only", + "Programming Language :: Python :: Implementation :: CPython" ] dependencies = [ "openvino_tokenizers~=2025.0.0.0.dev" @@ -36,7 +51,7 @@ options = {"BUILD_TOKENIZERS" = "OFF"} [build-system] requires = [ - "py-build-cmake@git+https://github.com/tttapa/py-build-cmake@7ab73da351c7140f06d727a8705bece4cf544cd9", + "py-build-cmake==0.3.0", "openvino~=2025.0.0.0.dev", "pybind11-stubgen==2.5.1", "cmake~=3.23.0" From fa1e95e965f915b0a1dab3b548967329f87925eb Mon Sep 17 00:00:00 2001 From: Dmitry Matveev Date: Wed, 27 Nov 2024 16:08:54 +0000 Subject: [PATCH 17/24] NPUW Deref: Baseline - don't hold pointers to the orig models (#1259) --- src/cpp/src/llm_pipeline_static.cpp | 32 ++++++++++++++--------------- src/cpp/src/llm_pipeline_static.hpp | 4 ---- 2 files changed, 16 insertions(+), 20 deletions(-) diff --git a/src/cpp/src/llm_pipeline_static.cpp b/src/cpp/src/llm_pipeline_static.cpp index 2beb7d64be..597b5f69ac 100644 --- a/src/cpp/src/llm_pipeline_static.cpp +++ b/src/cpp/src/llm_pipeline_static.cpp @@ -675,45 +675,45 @@ void StaticLLMPipeline::setupAndCompileModels( // NB: Get information about NPU if available auto npudesc = extract_npu_descriptor(core); // (1) Read the template model - this will be kvcache model - m_kvcache_model = core.read_model((models_path / "openvino_model.xml").string()); + auto kvcache_model = core.read_model((models_path / "openvino_model.xml").string()); // (2) Expose KV-cache input and output layers from kvcache model - ov::pass::StatefulToStateless().run_on_model(m_kvcache_model); + ov::pass::StatefulToStateless().run_on_model(kvcache_model); // (3) Align u4 ZP constants - align_u4_zp_constants(m_kvcache_model); + align_u4_zp_constants(kvcache_model); // (4) Clone the model - this will be prefill - m_prefill_model = m_kvcache_model->clone(); - m_prefill_model->set_friendly_name(m_kvcache_model->get_friendly_name() + "_prefill"); + auto prefill_model = kvcache_model->clone(); + prefill_model->set_friendly_name(kvcache_model->get_friendly_name() + "_prefill"); // (5) Reshape both models to static shape const uint32_t kMaxPromptLen = align_to(pop_int_and_cast(properties, "MAX_PROMPT_LEN").value_or(1024u), 64u); const uint32_t kMinResponseLen = align_to(pop_int_and_cast(properties, "MIN_RESPONSE_LEN").value_or(128u), 64u); ModelDesc model_desc = get_modeldesc_from_json(models_path / "config.json"); KVAxesPosition axes = get_kv_axes(model_desc.type); m_kvcache_desc = KVCacheDesc { kMaxPromptLen, kMaxPromptLen + kMinResponseLen, 0u, axes.seq_len, false}; - reshape_to_static(m_prefill_model, m_kvcache_desc.max_prompt_size, m_kvcache_desc.max_prompt_size, axes); - reshape_to_static(m_kvcache_model, 1u, m_kvcache_desc.total_size, axes); + reshape_to_static(prefill_model, m_kvcache_desc.max_prompt_size, m_kvcache_desc.max_prompt_size, axes); + reshape_to_static(kvcache_model, 1u, m_kvcache_desc.total_size, axes); // (6) Apply opt layout if applicable // NB: Try to apply opt transpose only for Llama-2-7b-chat-hf model if ( model_desc.name_or_path == "meta-llama/Llama-2-7b-chat-hf" || (model_desc.type == "llama" && model_desc.num_key_value_heads == 32)) { - if (optimize_value_tensors(m_kvcache_model)) { + if (optimize_value_tensors(kvcache_model)) { // NB: Check if TransposeValueTensors transformation was applied m_kvcache_desc.v_tensors_transposed = true; - m_prefill_model = cvt_value_tensors_layout(m_prefill_model); + prefill_model = cvt_value_tensors_layout(prefill_model); } } // (7) Replace KV-cache tensors for the entire cache to tensors only for new token (before concat) - m_kvcache_model = redirect_new_kv_to_output(m_kvcache_model); + kvcache_model = redirect_new_kv_to_output(kvcache_model); // (8) Convert kvcache tensors to fp16 precision - m_kvcache_model = cvt_kvcache_to_fp16(m_kvcache_model); - m_prefill_model = cvt_kvcache_to_fp16(m_prefill_model); + kvcache_model = cvt_kvcache_to_fp16(kvcache_model); + prefill_model = cvt_kvcache_to_fp16(prefill_model); // (9) Compile both model auto prefill_config = pop_or_default( - properties, "PREFILL_CONFIG", get_default_prefill_config(m_prefill_model, npudesc) + properties, "PREFILL_CONFIG", get_default_prefill_config(prefill_model, npudesc) ); // NB: GENERATE_HINT is only applicable for default generate config! auto generate_hint = str_to_hint(pop_or_default(properties, "GENERATE_HINT", "FAST_COMPILE")); auto generate_config = pop_or_default( - properties, "GENERATE_CONFIG", get_default_generate_config(m_kvcache_model, npudesc, generate_hint) + properties, "GENERATE_CONFIG", get_default_generate_config(kvcache_model, npudesc, generate_hint) ); merge_config_with(prefill_config, properties); merge_config_with(generate_config, properties); @@ -722,10 +722,10 @@ void StaticLLMPipeline::setupAndCompileModels( drop_cache_dir(generate_config); m_kvcache_request = core.compile_model( - m_kvcache_model, device, generate_config + kvcache_model, device, generate_config ).create_infer_request(); m_prefill_request = core.compile_model( - m_prefill_model, device, prefill_config + prefill_model, device, prefill_config ).create_infer_request(); } diff --git a/src/cpp/src/llm_pipeline_static.hpp b/src/cpp/src/llm_pipeline_static.hpp index 2f9969f5d7..d8e59d867a 100644 --- a/src/cpp/src/llm_pipeline_static.hpp +++ b/src/cpp/src/llm_pipeline_static.hpp @@ -61,10 +61,6 @@ class StaticLLMPipeline final : public LLMPipelineImplBase { bool v_tensors_transposed; }; - // FIXME: Ideally, we don't need to keep those - std::shared_ptr m_kvcache_model; - std::shared_ptr m_prefill_model; - KVCacheDesc m_kvcache_desc; ov::InferRequest m_kvcache_request; ov::InferRequest m_prefill_request; From 86068a5377466045ecda18c2181495e83ddeb19f Mon Sep 17 00:00:00 2001 From: Anna Likholat Date: Thu, 28 Nov 2024 11:34:24 +0100 Subject: [PATCH 18/24] Text2Image SDXL fix for GPU (#1266) CVS-156801 --- .../image_generation/stable_diffusion_xl_pipeline.hpp | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/src/cpp/src/image_generation/stable_diffusion_xl_pipeline.hpp b/src/cpp/src/image_generation/stable_diffusion_xl_pipeline.hpp index e7c8c35ce3..3c9130898f 100644 --- a/src/cpp/src/image_generation/stable_diffusion_xl_pipeline.hpp +++ b/src/cpp/src/image_generation/stable_diffusion_xl_pipeline.hpp @@ -111,12 +111,19 @@ class StableDiffusionXLPipeline : public DiffusionPipeline { OPENVINO_THROW("Unsupported '", unet, "' UNet type"); } + // Temporary fix for GPU + ov::AnyMap updated_roperties = properties; + if (device.find("GPU") != std::string::npos && + updated_roperties.find("INFERENCE_PRECISION_HINT") == updated_roperties.end()) { + updated_roperties["INFERENCE_PRECISION_HINT"] = ov::element::f32; + } + const std::string vae = data["vae"][1].get(); if (vae == "AutoencoderKL") { if (m_pipeline_type == PipelineType::TEXT_2_IMAGE) - m_vae = std::make_shared(root_dir / "vae_decoder", device, properties); + m_vae = std::make_shared(root_dir / "vae_decoder", device, updated_roperties); else if (m_pipeline_type == PipelineType::IMAGE_2_IMAGE) { - m_vae = std::make_shared(root_dir / "vae_encoder", root_dir / "vae_decoder", device, properties); + m_vae = std::make_shared(root_dir / "vae_encoder", root_dir / "vae_decoder", device, updated_roperties); } else { OPENVINO_ASSERT("Unsupported pipeline type"); } From 13f1b446b593843397f29fabf90f91c14791f204 Mon Sep 17 00:00:00 2001 From: Ilya Lavrenov Date: Thu, 28 Nov 2024 19:16:16 +0400 Subject: [PATCH 19/24] Try to drop --pre (#1269) - `--pre` is not required for OpenVINO wheels as `~=2025.0.0.0.dev` already ensures that pre-releases can be installed - `--pre` affects all other packages, which leads to installation of unstable versions and broken whisper CI https://github.com/openvinotoolkit/openvino.genai/actions/runs/12056078081/job/33618027551?pr=1267 --- .../actions/install_python_deps/action.yml | 4 +- .github/workflows/causal_lm_cpp.yml | 60 +++++++++---------- .github/workflows/lcm_dreamshaper_cpp.yml | 4 +- .github/workflows/linux.yml | 6 +- .github/workflows/mac.yml | 6 +- .../workflows/stable_diffusion_1_5_cpp.yml | 4 +- .github/workflows/windows.yml | 8 +-- samples/deployment-requirements.txt | 3 +- samples/export-requirements.txt | 3 +- src/README.md | 2 +- 10 files changed, 49 insertions(+), 51 deletions(-) diff --git a/.github/actions/install_python_deps/action.yml b/.github/actions/install_python_deps/action.yml index 8f269cc42e..3b42f5fd9b 100644 --- a/.github/actions/install_python_deps/action.yml +++ b/.github/actions/install_python_deps/action.yml @@ -11,5 +11,5 @@ runs: shell: bash run: | source ${{ inputs.ov_dir }}/setupvars.sh - python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly - python -m pip install --upgrade-strategy eager -r ./samples/requirements.txt + python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly + python -m pip install -r ./samples/requirements.txt diff --git a/.github/workflows/causal_lm_cpp.yml b/.github/workflows/causal_lm_cpp.yml index c75ac3214c..ce3ac5f046 100644 --- a/.github/workflows/causal_lm_cpp.yml +++ b/.github/workflows/causal_lm_cpp.yml @@ -46,8 +46,8 @@ jobs: - name: Download and convert and model run: | source ./ov/setupvars.sh - python -m pip install --upgrade-strategy eager -r ./samples/requirements.txt - python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly + python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly + python -m pip install -r ./samples/requirements.txt optimum-cli export openvino --trust-remote-code --weight-format fp16 --model openlm-research/open_llama_3b_v2 open_llama_3b_v2 optimum-cli export openvino -m TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T wget https://huggingface.co/smangrul/tinyllama_lora_sql/resolve/main/adapter_model.safetensors?download=true -O adapter_model.safetensors @@ -105,8 +105,8 @@ jobs: - name: Download and convert and model run: | source ./ov/setupvars.sh - python -m pip install --upgrade-strategy eager -r ./samples/requirements.txt - python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly + python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly + python -m pip install -r ./samples/requirements.txt optimum-cli export openvino --trust-remote-code --weight-format fp16 --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 TinyLlama-1.1B-Chat-v1.0 - name: Compare env: @@ -241,8 +241,8 @@ jobs: - name: Download and convert model run: | call .\ov\setupvars.bat - python -m pip install --upgrade-strategy eager -r ./samples/requirements.txt - python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly + python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly + python -m pip install -r ./samples/requirements.txt optimum-cli export openvino --trust-remote-code --weight-format fp16 --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 TinyLlama-1.1B-Chat-v1.0 optimum-cli export openvino -m TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T curl -o adapter_model.safetensors -s -L https://huggingface.co/smangrul/tinyllama_lora_sql/resolve/main/adapter_model.safetensors?download=true @@ -299,8 +299,8 @@ jobs: - name: Download and convert and model run: | source ./ov/setupvars.sh - python -m pip install --upgrade-strategy eager -r ./samples/requirements.txt - python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly + python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly + python -m pip install -r ./samples/requirements.txt optimum-cli export openvino --trust-remote-code --weight-format fp16 --model Qwen/Qwen-7B-Chat Qwen-7B-Chat - run: > . ./ov/setupvars.sh @@ -333,8 +333,8 @@ jobs: - name: Download and convert and model run: | source ./ov/setupvars.sh - python -m pip install --upgrade-strategy eager -r ./samples/requirements.txt - python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly + python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly + python -m pip install -r ./samples/requirements.txt optimum-cli export openvino --trust-remote-code --weight-format fp16 --model Qwen/Qwen1.5-7B-Chat Qwen1.5-7B-Chat - run: > . ./ov/setupvars.sh @@ -368,8 +368,8 @@ jobs: - name: Download and convert and model run: | source ./ov/setupvars.sh - python -m pip install --upgrade-strategy eager -r ./samples/requirements.txt - python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly + python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly + python -m pip install -r ./samples/requirements.txt optimum-cli export openvino --trust-remote-code --weight-format fp16 --model microsoft/phi-2 phi-2 - run: > . ./ov/setupvars.sh @@ -403,8 +403,8 @@ jobs: - name: Download and convert and model run: | source ./ov/setupvars.sh - python -m pip install --upgrade-strategy eager -r ./samples/requirements.txt - python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly + python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly + python -m pip install -r ./samples/requirements.txt optimum-cli export openvino --trust-remote-code --weight-format fp16 --model argilla/notus-7b-v1 notus-7b-v1 - run: > . ./ov/setupvars.sh @@ -438,8 +438,8 @@ jobs: - name: Download and convert and model run: | source ./ov/setupvars.sh - python -m pip install --upgrade-strategy eager -r ./samples/requirements.txt - python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly + python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly + python -m pip install -r ./samples/requirements.txt optimum-cli export openvino --trust-remote-code --weight-format fp16 --model databricks/dolly-v2-3b dolly-v2-3b optimum-cli export openvino --trust-remote-code --weight-format fp16 --model databricks/dolly-v2-7b dolly-v2-7b - name: run and compare @@ -488,8 +488,8 @@ jobs: - name: Download and convert and model run: | source ./ov/setupvars.sh - python -m pip install --upgrade-strategy eager -r ./samples/requirements.txt - python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly + python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly + python -m pip install -r ./samples/requirements.txt optimum-cli export openvino --trust-remote-code --weight-format fp16 --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 TinyLlama-1.1B-Chat-v1.0 optimum-cli export openvino --trust-remote-code --weight-format fp16 --model Qwen/Qwen-7B-Chat Qwen-7B-Chat --task text-generation-with-past - name: run and compare @@ -560,8 +560,8 @@ jobs: - name: Download and convert and model run: | source ./ov/setupvars.sh - python -m pip install --upgrade-strategy eager -r ./samples/requirements.txt - python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly + python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly + python -m pip install -r ./samples/requirements.txt optimum-cli export openvino --trust-remote-code --weight-format fp16 --model microsoft/phi-1_5 phi-1_5 - name: Run Generation run: | @@ -615,8 +615,8 @@ jobs: - name: Download and convert and model run: | source ./ov/setupvars.sh - python -m pip install --upgrade-strategy eager -r ./samples/requirements.txt - python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly + python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly + python -m pip install -r ./samples/requirements.txt optimum-cli export openvino --trust-remote-code --weight-format fp16 --model ikala/redpajama-3b-chat redpajama-3b-chat - name: Run Generation run: | @@ -670,8 +670,8 @@ jobs: - name: Download and convert and model run: | source ./ov/setupvars.sh - python -m pip install --upgrade-strategy eager -r ./samples/requirements.txt - python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly + python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly + python -m pip install -r ./samples/requirements.txt optimum-cli export openvino --trust-remote-code --weight-format fp16 --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 TinyLlama-1.1B-Chat-v1.0 - name: Compare env: @@ -863,8 +863,8 @@ jobs: - name: Download and convert and model run: | source ./ov/setupvars.sh - python -m pip install --upgrade-strategy eager -r ./samples/requirements.txt - python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly + python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly + python -m pip install -r ./samples/requirements.txt optimum-cli export openvino --trust-remote-code --weight-format fp16 --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 TinyLlama-1.1B-Chat-v1.0 - name: Run gtests run: | @@ -909,8 +909,8 @@ jobs: - name: Download and convert and model run: | call .\ov\setupvars.bat - python -m pip install --upgrade-strategy eager -r ./samples/requirements.txt - python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly + python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly + python -m pip install -r ./samples/requirements.txt optimum-cli export openvino --trust-remote-code --weight-format fp16 --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 TinyLlama-1.1B-Chat-v1.0 - name: Run gtests run: | @@ -954,8 +954,8 @@ jobs: - name: Download and convert and model run: | source ./ov/setupvars.sh - python -m pip install --upgrade-strategy eager -r ./samples/requirements.txt - python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly + python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly + python -m pip install -r ./samples/requirements.txt optimum-cli export openvino --trust-remote-code --weight-format fp16 --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 TinyLlama-1.1B-Chat-v1.0 - name: Run gtests run: | diff --git a/.github/workflows/lcm_dreamshaper_cpp.yml b/.github/workflows/lcm_dreamshaper_cpp.yml index 6bd25cbdfe..233be9e5c0 100644 --- a/.github/workflows/lcm_dreamshaper_cpp.yml +++ b/.github/workflows/lcm_dreamshaper_cpp.yml @@ -59,7 +59,7 @@ jobs: - name: Install python dependencies run: | source openvino_lcm_cpp/bin/activate - python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly + python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly python -m pip install -r ./samples/requirements.txt - name: Download and convert models and tokenizer @@ -119,7 +119,7 @@ jobs: - name: Install python dependencies run: | . "./openvino_lcm_cpp/Scripts/Activate.ps1" - python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly + python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly python -m pip install -r ./samples/requirements.txt - name: Download and convert models and tokenizer diff --git a/.github/workflows/linux.yml b/.github/workflows/linux.yml index 3c3e0347e7..44e115423c 100644 --- a/.github/workflows/linux.yml +++ b/.github/workflows/linux.yml @@ -258,7 +258,7 @@ jobs: - name: Test bindings run: | source ${OV_INSTALL_DIR}/setupvars.sh - python -m pip install ./thirdparty/openvino_tokenizers/[transformers] -r ./tests/python_tests/requirements.txt --find-links ${OV_INSTALL_DIR}/wheels --upgrade-strategy eager + python -m pip install ./thirdparty/openvino_tokenizers/[transformers] -r ./tests/python_tests/requirements.txt --find-links ${OV_INSTALL_DIR}/wheels python -m pytest -v ./tests/python_tests/test_chat_generate_api.py::test_set_chat_template env: PYTHONPATH: "./build/:$PYTHONPATH" @@ -349,7 +349,7 @@ jobs: - name: Test bindings run: | source ${OV_INSTALL_DIR}/setupvars.sh - python -m pip install ./thirdparty/openvino_tokenizers/[transformers] -r ./tests/python_tests/requirements.txt --find-links ${OV_INSTALL_DIR}/wheels --upgrade-strategy eager + python -m pip install ./thirdparty/openvino_tokenizers/[transformers] -r ./tests/python_tests/requirements.txt --find-links ${OV_INSTALL_DIR}/wheels python -m pytest -v ./tests/python_tests/test_whisper_generate_api.py -k test_smoke env: PYTHONPATH: "./build/:$PYTHONPATH" @@ -437,7 +437,7 @@ jobs: run: | source ${OV_INSTALL_DIR}/setupvars.sh python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --find-links ${OV_INSTALL_DIR}/wheels - python -m pip install --upgrade-strategy eager -r ./samples/requirements.txt --find-links ${OV_INSTALL_DIR}/wheels + python -m pip install -r ./samples/requirements.txt --find-links ${OV_INSTALL_DIR}/wheels optimum-cli export openvino --trust-remote-code --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 TinyLlama-1.1B-Chat-v1.0 optimum-cli export openvino --trust-remote-code --model openai/whisper-tiny whisper-tiny diff --git a/.github/workflows/mac.yml b/.github/workflows/mac.yml index 935d6556b3..5b1b7622ac 100644 --- a/.github/workflows/mac.yml +++ b/.github/workflows/mac.yml @@ -224,7 +224,7 @@ jobs: - name: Test bindings run: | source ${OV_INSTALL_DIR}/setupvars.sh - python -m pip install ./thirdparty/openvino_tokenizers/[transformers] -r ./tests/python_tests/requirements.txt --find-links ${OV_INSTALL_DIR}/wheels --upgrade-strategy eager + python -m pip install ./thirdparty/openvino_tokenizers/[transformers] -r ./tests/python_tests/requirements.txt --find-links ${OV_INSTALL_DIR}/wheels python -m pytest -v ./tests/python_tests/test_chat_generate_api.py::test_set_chat_template env: PYTHONPATH: "./build/:$PYTHONPATH" @@ -288,7 +288,7 @@ jobs: - name: Test bindings run: | source ${OV_INSTALL_DIR}/setupvars.sh - python -m pip install ./thirdparty/openvino_tokenizers/[transformers] -r ./tests/python_tests/requirements.txt --find-links ${OV_INSTALL_DIR}/wheels --upgrade-strategy eager + python -m pip install ./thirdparty/openvino_tokenizers/[transformers] -r ./tests/python_tests/requirements.txt --find-links ${OV_INSTALL_DIR}/wheels python -m pytest -v ./tests/python_tests/test_whisper_generate_api.py -k test_smoke env: PYTHONPATH: "./build/:$PYTHONPATH" @@ -354,7 +354,7 @@ jobs: run: | source ${OV_INSTALL_DIR}/setupvars.sh python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --find-links ${OV_INSTALL_DIR}/wheels - python -m pip install --upgrade-strategy eager -r ./samples/requirements.txt --find-links ${OV_INSTALL_DIR}/wheels + python -m pip install -r ./samples/requirements.txt --find-links ${OV_INSTALL_DIR}/wheels optimum-cli export openvino --trust-remote-code --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 TinyLlama-1.1B-Chat-v1.0 optimum-cli export openvino --trust-remote-code --model openai/whisper-tiny whisper-tiny diff --git a/.github/workflows/stable_diffusion_1_5_cpp.yml b/.github/workflows/stable_diffusion_1_5_cpp.yml index f36ac43839..8a262cfd97 100644 --- a/.github/workflows/stable_diffusion_1_5_cpp.yml +++ b/.github/workflows/stable_diffusion_1_5_cpp.yml @@ -59,7 +59,7 @@ jobs: - name: Install python dependencies run: | source openvino_sd_cpp/bin/activate - python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly + python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly python -m pip install -r ./samples/requirements.txt - name: Download and convert models and tokenizer @@ -133,7 +133,7 @@ jobs: - name: Install python dependencies run: | . "./openvino_sd_cpp/Scripts/Activate.ps1" - python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly + python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly python -m pip install -r ./samples/requirements.txt - name: Download and convert models and tokenizer diff --git a/.github/workflows/windows.yml b/.github/workflows/windows.yml index 1e4164aa0b..17a1abb288 100644 --- a/.github/workflows/windows.yml +++ b/.github/workflows/windows.yml @@ -235,7 +235,7 @@ jobs: - name: Test bindings run: | . "${{ env.OV_INSTALL_DIR }}/setupvars.ps1" - python -m pip install ./thirdparty/openvino_tokenizers/[transformers] -r ./tests/python_tests/requirements.txt --find-links ${env:OV_INSTALL_DIR}/wheels --upgrade-strategy eager + python -m pip install ./thirdparty/openvino_tokenizers/[transformers] -r ./tests/python_tests/requirements.txt --find-links ${env:OV_INSTALL_DIR}/wheels python -m pytest -v ./tests/python_tests/test_chat_generate_api.py::test_set_chat_template env: PYTHONPATH: "./build/" # cmd evaluates variables in a different way. Setting PYTHONPATH before setupvars.bat instead of doing that after solves that. @@ -299,7 +299,7 @@ jobs: - name: Test bindings run: | . "${{ env.OV_INSTALL_DIR }}/setupvars.ps1" - python -m pip install ./thirdparty/openvino_tokenizers/[transformers] -r ./tests/python_tests/requirements.txt --find-links ${env:OV_INSTALL_DIR}/wheels --upgrade-strategy eager + python -m pip install ./thirdparty/openvino_tokenizers/[transformers] -r ./tests/python_tests/requirements.txt --find-links ${env:OV_INSTALL_DIR}/wheels python -m pytest -v ./tests/python_tests/test_whisper_generate_api.py -k test_smoke env: PYTHONPATH: "./build/" # cmd evaluates variables in a different way. Setting PYTHONPATH before setupvars.bat instead of doing that after solves that. @@ -363,7 +363,7 @@ jobs: - name: Test bindings run: | . "${{ env.OV_INSTALL_DIR }}/setupvars.ps1" - python -m pip install ./thirdparty/openvino_tokenizers/[transformers] -r ./tests/python_tests/requirements.txt --find-links ${env:OV_INSTALL_DIR}/wheels --upgrade-strategy eager + python -m pip install ./thirdparty/openvino_tokenizers/[transformers] -r ./tests/python_tests/requirements.txt --find-links ${env:OV_INSTALL_DIR}/wheels python -m pytest -v ./tests/python_tests/test_vlm_api.py env: PYTHONPATH: "./build/" # cmd evaluates variables in a different way. Setting PYTHONPATH before setupvars.bat instead of doing that after solves that. @@ -425,7 +425,7 @@ jobs: run: | . "${{ env.OV_INSTALL_DIR }}/setupvars.ps1" python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --find-links ${env:OV_INSTALL_DIR}/wheels - python -m pip install --upgrade-strategy eager -r ./samples/requirements.txt --find-links ${env:OV_INSTALL_DIR}/wheels + python -m pip install -r ./samples/requirements.txt --find-links ${env:OV_INSTALL_DIR}/wheels optimum-cli export openvino --trust-remote-code --weight-format fp16 --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 TinyLlama-1.1B-Chat-v1.0 optimum-cli export openvino --trust-remote-code --model openai/whisper-tiny whisper-tiny diff --git a/samples/deployment-requirements.txt b/samples/deployment-requirements.txt index c29f496c84..ceac668e9c 100644 --- a/samples/deployment-requirements.txt +++ b/samples/deployment-requirements.txt @@ -1,5 +1,4 @@ --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly ---pre openvino_genai~=2025.0.0.0.dev librosa==0.10.2 # For Whisper -pillow==11.0.0 # Image processing +pillow==11.0.0 # Image processing for VLMs diff --git a/samples/export-requirements.txt b/samples/export-requirements.txt index a84926f746..aa9a0ccea9 100644 --- a/samples/export-requirements.txt +++ b/samples/export-requirements.txt @@ -1,12 +1,11 @@ --extra-index-url https://download.pytorch.org/whl/cpu --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly ---pre openvino-tokenizers~=2025.0.0.0.dev optimum-intel @ git+https://github.com/huggingface/optimum-intel.git numpy<2.0.0; sys_platform == 'darwin' einops==0.8.0 # For Qwen transformers_stream_generator==0.0.5 # For Qwen -diffusers==0.31.0 +diffusers==0.31.0 # For image generation pipelines timm==1.0.11 # For exporting InternVL2 torchvision # For visual language models transformers>=4.43 # For Whisper diff --git a/src/README.md b/src/README.md index 9a96daa9d2..c90bc8f4e4 100644 --- a/src/README.md +++ b/src/README.md @@ -37,7 +37,7 @@ If you want to try OpenVINO GenAI with different dependencies versions (**not** git clone --recursive https://github.com/openvinotoolkit/openvino.genai.git cd openvino.genai # Install python dependencies - python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly + python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly python -m pip install --upgrade-strategy eager -r ./samples/requirements.txt ``` From 079f1d521319e0d2443a185754902e47b77c5e8c Mon Sep 17 00:00:00 2001 From: Ilya Lavrenov Date: Thu, 28 Nov 2024 21:15:38 +0400 Subject: [PATCH 20/24] Fixed pyi file build when OpenVINO_DIR is externally defined (#1271) --- src/python/CMakeLists.txt | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/src/python/CMakeLists.txt b/src/python/CMakeLists.txt index 25d81277d6..75a2fd59a7 100644 --- a/src/python/CMakeLists.txt +++ b/src/python/CMakeLists.txt @@ -114,6 +114,21 @@ elseif(DEFINED PY_BUILD_CMAKE_PACKAGE_NAME AND NOT WIN32) # in case of wheel build, pybind11-stubgen is always available via pyproject.toml's build-system # except Win32 where we have issues with pybind11_stubgen executable which cannot import its own module set(pybind11_stubgen_AVAILABLE ON) + + # by default, wheel build is performed with build-isolation, which means that some variables like PYTHONPATH + # are not available. But if user called setupvars.sh, then OpenVINO dir is available, while PYTHONPATH - no. + # In this case, we will have mismatch on Linux when OpenVINO can point on build dir / install dir, while + # PYTHONPATH points out to locally installed tmp OpenVINO wheel (build against wheel). + # Ways to handle it: + # - setting PYTHONPATH to $ENV{INTEL_OPENVINO_DIR}/python if INTEL_OPENVINO_DIR is defined. It means we are building against + # OpenVINO archive or installation tree + # - if it's not defined, we cannot do any guesses and hence, disable pybind11-stubgen usage + if(DEFINED ENV{INTEL_OPENVINO_DIR}) + set(openvino_pythonpath "$ENV{INTEL_OPENVINO_DIR}/python") + elseif(LINUX AND NOT OpenVINO_DIR STREQUAL OpenVINO_DIR_PY) + # here we imply that OpenVINO_DIR_PY points to manylinux, while OpenVINO_DIR point to Ubuntu binaries + set(pybind11_stubgen_AVAILABLE OFF) + endif() endif() # but we also need to check whether OpenVINO is installed From bc5f4dbe751d603ef6e94afd133d9ee6e469fd88 Mon Sep 17 00:00:00 2001 From: Alexey Smirnov Date: Thu, 28 Nov 2024 19:39:00 +0000 Subject: [PATCH 21/24] StaticLLMPipeline: Decide when to enable NPUW_DQ_FULL property (#1258) Based on (yet to be) supported OV properties from the NPU Plugin enable NPUW_DQ_FULL. releases/2024/5 mirror: https://github.com/openvinotoolkit/openvino.genai/pull/1272 Dependencies * https://github.com/openvinotoolkit/openvino/pull/27678 needs to be merged first * https://github.com/openvinotoolkit/openvino/pull/27789 --- src/cpp/src/llm_pipeline_static.cpp | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/src/cpp/src/llm_pipeline_static.cpp b/src/cpp/src/llm_pipeline_static.cpp index 597b5f69ac..db2adbd19e 100644 --- a/src/cpp/src/llm_pipeline_static.cpp +++ b/src/cpp/src/llm_pipeline_static.cpp @@ -457,6 +457,7 @@ void merge_config_with(ov::AnyMap& lhs, const ov::AnyMap& rhs) { struct NPUDesc { std::string arch; int64_t max_tiles; + bool compiler_dq; }; std::optional extract_npu_descriptor(ov::Core& core) { @@ -466,7 +467,14 @@ std::optional extract_npu_descriptor(ov::Core& core) { } const auto arch = core.get_property("NPU", ov::device::architecture); const auto max_tiles = core.get_property("NPU", ov::intel_npu::max_tiles); - return std::make_optional(NPUDesc{arch, max_tiles}); + + bool compiler_dq = false; + const auto device_caps = core.get_property("NPU", ov::device::capabilities); + if (std::find(device_caps.begin(), device_caps.end(), + "COMPILER_DYNAMIC_QUANTIZATION") != device_caps.end()) { + compiler_dq = true; + } + return std::make_optional(NPUDesc{arch, max_tiles, compiler_dq}); } ov::AnyMap get_baseline_common_config() { @@ -508,6 +516,9 @@ ov::AnyMap get_default_prefill_config(const std::shared_ptr& model, npudesc->max_tiles != -1) { config.emplace("NPU_DPU_GROUPS", npudesc->max_tiles); } + if (npudesc.has_value() && npudesc->compiler_dq) { + config.emplace("NPUW_DQ_FULL", "NO"); + } return config; } @@ -523,6 +534,9 @@ ov::AnyMap get_default_generate_config(const std::shared_ptr& model, if (npudesc.has_value() && npudesc->arch == "4000") { config.emplace("NPU_DPU_GROUPS", 4); } + if (npudesc.has_value() && npudesc->compiler_dq) { + config.emplace("NPUW_DQ_FULL", "NO"); + } return config; } From b43d31ed0604ec7add9af42ee62bb7d5d6a0abe8 Mon Sep 17 00:00:00 2001 From: Alexander Kozlov Date: Fri, 29 Nov 2024 11:50:46 +0300 Subject: [PATCH 22/24] Enable Phi-3.5-vision in HF format. Enable use of LLMs as a text embedding models for similarity compute. (#1276) Now it is possible to use `--data-encoder Qwen/Qwen2.5-1.5B` to plug LLM as a model for embedding computation. --- .../whowhatbench/whowhat_metrics.py | 8 +++++++- tools/who_what_benchmark/whowhatbench/wwb.py | 11 ++++++++--- 2 files changed, 15 insertions(+), 4 deletions(-) diff --git a/tools/who_what_benchmark/whowhatbench/whowhat_metrics.py b/tools/who_what_benchmark/whowhatbench/whowhat_metrics.py index bbf96a3312..2d1da24168 100644 --- a/tools/who_what_benchmark/whowhatbench/whowhat_metrics.py +++ b/tools/who_what_benchmark/whowhatbench/whowhat_metrics.py @@ -3,6 +3,7 @@ """ from difflib import SequenceMatcher +from transformers import AutoTokenizer from PIL import Image import torch import torch.nn.functional as F @@ -109,7 +110,12 @@ def evaluate_divergency(tokenizer, data_gold, data_prediction): class TextSimilarity: def __init__(self, model_id) -> None: - self.model = SentenceTransformer(model_id) + tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True) + if hasattr(tokenizer, "pad_token") and tokenizer.pad_token: + pad_token = tokenizer.pad_token + else: + pad_token = tokenizer.eos_token + self.model = SentenceTransformer(model_id, tokenizer_kwargs={"pad_token": pad_token}, trust_remote_code=True) def evaluate(self, gt, prediction): return evaluate_similarity(self.model, gt, prediction) diff --git a/tools/who_what_benchmark/whowhatbench/wwb.py b/tools/who_what_benchmark/whowhatbench/wwb.py index 0a01a8e8df..f9aea15b47 100644 --- a/tools/who_what_benchmark/whowhatbench/wwb.py +++ b/tools/who_what_benchmark/whowhatbench/wwb.py @@ -178,9 +178,14 @@ def load_visual_text_model( model_id, trust_remote_code=True, device_map=device.lower() ) except ValueError: - model = AutoModel.from_pretrained( - model_id, trust_remote_code=True, device_map=device.lower() - ) + try: + model = AutoModel.from_pretrained( + model_id, trust_remote_code=True, device_map=device.lower() + ) + except ValueError: + model = AutoModelForCausalLM.from_pretrained( + model_id, trust_remote_code=True, device_map=device.lower(), _attn_implementation="eager", use_flash_attention_2=False + ) model.eval() elif use_genai: logger.info("Using OpenVINO GenAI API") From 402958b8975275fd3873c09c5095ca84abc2cea9 Mon Sep 17 00:00:00 2001 From: Ilya Lavrenov Date: Fri, 29 Nov 2024 21:18:13 +0400 Subject: [PATCH 23/24] [Python] Update docs with str => PathLike (#1278) --- .../openvino_genai/py_openvino_genai.pyi | 38 +++++++++---------- src/python/py_image_generation_models.cpp | 24 ++++++------ src/python/py_image_generation_pipelines.cpp | 4 +- src/python/py_llm_pipeline.cpp | 4 +- src/python/py_lora_adapter.cpp | 2 +- src/python/py_vlm_pipeline.cpp | 2 +- src/python/py_whisper_pipeline.cpp | 2 +- 7 files changed, 38 insertions(+), 38 deletions(-) diff --git a/src/python/openvino_genai/py_openvino_genai.pyi b/src/python/openvino_genai/py_openvino_genai.pyi index 5e4d2dd7b2..1c386dc097 100644 --- a/src/python/openvino_genai/py_openvino_genai.pyi +++ b/src/python/openvino_genai/py_openvino_genai.pyi @@ -19,7 +19,7 @@ class Adapter: def __init__(self, path: os.PathLike) -> None: """ Immutable LoRA Adapter that carries the adaptation matrices and serves as unique adapter identifier. - path (str): Path to adapter file in safetensors format. + path (os.PathLike): Path to adapter file in safetensors format. """ class AdapterConfig: """ @@ -162,20 +162,20 @@ class AutoencoderKL: def __init__(self, vae_decoder_path: os.PathLike) -> None: """ AutoencoderKL class initialized only with decoder model. - vae_decoder_path (str): VAE decoder directory. + vae_decoder_path (os.PathLike): VAE decoder directory. """ @typing.overload def __init__(self, vae_encoder_path: os.PathLike, vae_decoder_path: os.PathLike) -> None: """ AutoencoderKL class initialized with both encoder and decoder models. - vae_encoder_path (str): VAE encoder directory. - vae_decoder_path (str): VAE decoder directory. + vae_encoder_path (os.PathLike): VAE encoder directory. + vae_decoder_path (os.PathLike): VAE decoder directory. """ @typing.overload def __init__(self, vae_decoder_path: os.PathLike, device: str, **kwargs) -> None: """ AutoencoderKL class initialized only with decoder model. - vae_decoder_path (str): VAE decoder directory. + vae_decoder_path (os.PathLike): VAE decoder directory. device (str): Device on which inference will be done. kwargs: Device properties. """ @@ -183,8 +183,8 @@ class AutoencoderKL: def __init__(self, vae_encoder_path: os.PathLike, vae_decoder_path: os.PathLike, device: str, **kwargs) -> None: """ AutoencoderKL class initialized only with both encoder and decoder models. - vae_encoder_path (str): VAE encoder directory. - vae_decoder_path (str): VAE decoder directory. + vae_encoder_path (os.PathLike): VAE encoder directory. + vae_decoder_path (os.PathLike): VAE decoder directory. device (str): Device on which inference will be done. kwargs: Device properties. """ @@ -228,13 +228,13 @@ class CLIPTextModel: def __init__(self, root_dir: os.PathLike) -> None: """ CLIPTextModel class - root_dir (str): Model root directory. + root_dir (os.PathLike): Model root directory. """ @typing.overload def __init__(self, root_dir: os.PathLike, device: str, **kwargs) -> None: """ CLIPTextModel class - root_dir (str): Model root directory. + root_dir (os.PathLike): Model root directory. device (str): Device on which inference will be done. kwargs: Device properties. """ @@ -277,13 +277,13 @@ class CLIPTextModelWithProjection: def __init__(self, root_dir: os.PathLike) -> None: """ CLIPTextModelWithProjection class - root_dir (str): Model root directory. + root_dir (os.PathLike): Model root directory. """ @typing.overload def __init__(self, root_dir: os.PathLike, device: str, **kwargs) -> None: """ CLIPTextModelWithProjection class - root_dir (str): Model root directory. + root_dir (os.PathLike): Model root directory. device (str): Device on which inference will be done. kwargs: Device properties. """ @@ -790,7 +790,7 @@ class LLMPipeline: def __init__(self, models_path: os.PathLike, tokenizer: Tokenizer, device: str, config: dict[str, typing.Any] = {}, **kwargs) -> None: """ LLMPipeline class constructor for manually created openvino_genai.Tokenizer. - models_path (str): Path to the model file. + models_path (os.PathLike): Path to the model file. tokenizer (openvino_genai.Tokenizer): tokenizer object. device (str): Device to run the model on (e.g., CPU, GPU). Default is 'CPU'. Add {"scheduler_config": ov_genai.SchedulerConfig} to config properties to create continuous batching pipeline. @@ -800,7 +800,7 @@ class LLMPipeline: def __init__(self, models_path: os.PathLike, device: str, config: dict[str, typing.Any] = {}, **kwargs) -> None: """ LLMPipeline class constructor. - models_path (str): Path to the model file. + models_path (os.PathLike): Path to the model file. device (str): Device to run the model on (e.g., CPU, GPU). Default is 'CPU'. Add {"scheduler_config": ov_genai.SchedulerConfig} to config properties to create continuous batching pipeline. kwargs: Device properties. @@ -1231,13 +1231,13 @@ class Text2ImagePipeline: def __init__(self, models_path: os.PathLike) -> None: """ Text2ImagePipeline class constructor. - models_path (str): Path to the folder with exported model files. + models_path (os.PathLike): Path to the folder with exported model files. """ @typing.overload def __init__(self, models_path: os.PathLike, device: str, **kwargs) -> None: """ Text2ImagePipeline class constructor. - models_path (str): Path with exported model files. + models_path (os.PathLike): Path with exported model files. device (str): Device to run the model on (e.g., CPU, GPU). kwargs: Text2ImagePipeline properties """ @@ -1360,13 +1360,13 @@ class UNet2DConditionModel: def __init__(self, root_dir: os.PathLike) -> None: """ UNet2DConditionModel class - root_dir (str): Model root directory. + root_dir (os.PathLike): Model root directory. """ @typing.overload def __init__(self, root_dir: os.PathLike, device: str, **kwargs) -> None: """ UNet2DConditionModel class - root_dir (str): Model root directory. + root_dir (os.PathLike): Model root directory. device (str): Device on which inference will be done. kwargs: Device properties. """ @@ -1403,7 +1403,7 @@ class VLMPipeline: """ device on which inference will be done VLMPipeline class constructor. - models_path (str): Path to the folder with exported model files. + models_path (os.PathLike): Path to the folder with exported model files. device (str): Device to run the model on (e.g., CPU, GPU). Default is 'CPU'. kwargs: Device properties """ @@ -1640,7 +1640,7 @@ class WhisperPipeline: def __init__(self, models_path: os.PathLike, device: str, **kwargs) -> None: """ WhisperPipeline class constructor. - models_path (str): Path to the model file. + models_path (os.PathLike): Path to the model file. device (str): Device to run the model on (e.g., CPU, GPU). """ def generate(self, raw_speech_input: list[float], generation_config: WhisperGenerationConfig | None = None, streamer: typing.Callable[[str], bool] | ChunkStreamerBase | None = None, **kwargs) -> WhisperDecodedResults: diff --git a/src/python/py_image_generation_models.cpp b/src/python/py_image_generation_models.cpp index 221fc7363e..72a8970cb4 100644 --- a/src/python/py_image_generation_models.cpp +++ b/src/python/py_image_generation_models.cpp @@ -31,7 +31,7 @@ void init_clip_text_model(py::module_& m) { py::arg("root_dir"), "Model root directory", R"( CLIPTextModel class - root_dir (str): Model root directory. + root_dir (os.PathLike): Model root directory. )") .def(py::init([]( const std::filesystem::path& root_dir, @@ -45,7 +45,7 @@ void init_clip_text_model(py::module_& m) { py::arg("device"), "Device on which inference will be done", R"( CLIPTextModel class - root_dir (str): Model root directory. + root_dir (os.PathLike): Model root directory. device (str): Device on which inference will be done. kwargs: Device properties. )") @@ -101,7 +101,7 @@ void init_unet2d_condition_model(py::module_& m) { py::arg("root_dir"), "Model root directory", R"( UNet2DConditionModel class - root_dir (str): Model root directory. + root_dir (os.PathLike): Model root directory. )") .def(py::init([]( const std::filesystem::path& root_dir, @@ -114,7 +114,7 @@ void init_unet2d_condition_model(py::module_& m) { py::arg("device"), "Device on which inference will be done", R"( UNet2DConditionModel class - root_dir (str): Model root directory. + root_dir (os.PathLike): Model root directory. device (str): Device on which inference will be done. kwargs: Device properties. )") @@ -172,7 +172,7 @@ void init_autoencoder_kl(py::module_& m) { py::arg("vae_decoder_path"), "VAE decoder directory", R"( AutoencoderKL class initialized only with decoder model. - vae_decoder_path (str): VAE decoder directory. + vae_decoder_path (os.PathLike): VAE decoder directory. )") .def(py::init([]( const std::filesystem::path& vae_encoder_path, @@ -184,8 +184,8 @@ void init_autoencoder_kl(py::module_& m) { py::arg("vae_decoder_path"), "VAE decoder directory", R"( AutoencoderKL class initialized with both encoder and decoder models. - vae_encoder_path (str): VAE encoder directory. - vae_decoder_path (str): VAE decoder directory. + vae_encoder_path (os.PathLike): VAE encoder directory. + vae_decoder_path (os.PathLike): VAE decoder directory. )") .def(py::init([]( const std::filesystem::path& vae_decoder_path, @@ -198,7 +198,7 @@ void init_autoencoder_kl(py::module_& m) { py::arg("device"), "Device on which inference will be done", R"( AutoencoderKL class initialized only with decoder model. - vae_decoder_path (str): VAE decoder directory. + vae_decoder_path (os.PathLike): VAE decoder directory. device (str): Device on which inference will be done. kwargs: Device properties. )") @@ -215,8 +215,8 @@ void init_autoencoder_kl(py::module_& m) { py::arg("device"), "Device on which inference will be done", R"( AutoencoderKL class initialized only with both encoder and decoder models. - vae_encoder_path (str): VAE encoder directory. - vae_decoder_path (str): VAE decoder directory. + vae_encoder_path (os.PathLike): VAE encoder directory. + vae_decoder_path (os.PathLike): VAE decoder directory. device (str): Device on which inference will be done. kwargs: Device properties. )") @@ -276,7 +276,7 @@ void init_clip_text_model_with_projection(py::module_& m) { py::arg("root_dir"), "Model root directory", R"( CLIPTextModelWithProjection class - root_dir (str): Model root directory. + root_dir (os.PathLike): Model root directory. )") .def(py::init([]( const std::filesystem::path& root_dir, @@ -290,7 +290,7 @@ void init_clip_text_model_with_projection(py::module_& m) { py::arg("device"), "Device on which inference will be done", R"( CLIPTextModelWithProjection class - root_dir (str): Model root directory. + root_dir (os.PathLike): Model root directory. device (str): Device on which inference will be done. kwargs: Device properties. )") diff --git a/src/python/py_image_generation_pipelines.cpp b/src/python/py_image_generation_pipelines.cpp index dade8a170e..d0d2f18a92 100644 --- a/src/python/py_image_generation_pipelines.cpp +++ b/src/python/py_image_generation_pipelines.cpp @@ -141,7 +141,7 @@ void init_image_generation_pipelines(py::module_& m) { py::arg("models_path"), "folder with exported model files.", R"( Text2ImagePipeline class constructor. - models_path (str): Path to the folder with exported model files. + models_path (os.PathLike): Path to the folder with exported model files. )") .def(py::init([]( @@ -156,7 +156,7 @@ void init_image_generation_pipelines(py::module_& m) { py::arg("device"), "device on which inference will be done", R"( Text2ImagePipeline class constructor. - models_path (str): Path with exported model files. + models_path (os.PathLike): Path with exported model files. device (str): Device to run the model on (e.g., CPU, GPU). kwargs: Text2ImagePipeline properties )") diff --git a/src/python/py_llm_pipeline.cpp b/src/python/py_llm_pipeline.cpp index 030688d821..7255022238 100644 --- a/src/python/py_llm_pipeline.cpp +++ b/src/python/py_llm_pipeline.cpp @@ -122,7 +122,7 @@ void init_llm_pipeline(py::module_& m) { py::arg("config") = ov::AnyMap({}), "openvino.properties map", R"( LLMPipeline class constructor for manually created openvino_genai.Tokenizer. - models_path (str): Path to the model file. + models_path (os.PathLike): Path to the model file. tokenizer (openvino_genai.Tokenizer): tokenizer object. device (str): Device to run the model on (e.g., CPU, GPU). Default is 'CPU'. Add {"scheduler_config": ov_genai.SchedulerConfig} to config properties to create continuous batching pipeline. @@ -151,7 +151,7 @@ void init_llm_pipeline(py::module_& m) { py::arg("config") = ov::AnyMap({}), "openvino.properties map", R"( LLMPipeline class constructor. - models_path (str): Path to the model file. + models_path (os.PathLike): Path to the model file. device (str): Device to run the model on (e.g., CPU, GPU). Default is 'CPU'. Add {"scheduler_config": ov_genai.SchedulerConfig} to config properties to create continuous batching pipeline. kwargs: Device properties. diff --git a/src/python/py_lora_adapter.cpp b/src/python/py_lora_adapter.cpp index 3186a7ca5c..7f98b67064 100644 --- a/src/python/py_lora_adapter.cpp +++ b/src/python/py_lora_adapter.cpp @@ -23,7 +23,7 @@ void init_lora_adapter(py::module_& m) { py::arg("path"), "path", R"( Immutable LoRA Adapter that carries the adaptation matrices and serves as unique adapter identifier. - path (str): Path to adapter file in safetensors format. + path (os.PathLike): Path to adapter file in safetensors format. )") .def( "__bool__", diff --git a/src/python/py_vlm_pipeline.cpp b/src/python/py_vlm_pipeline.cpp index 9572652204..fc58ddc913 100644 --- a/src/python/py_vlm_pipeline.cpp +++ b/src/python/py_vlm_pipeline.cpp @@ -86,7 +86,7 @@ void init_vlm_pipeline(py::module_& m) { py::arg("device"), "device on which inference will be done" R"( VLMPipeline class constructor. - models_path (str): Path to the folder with exported model files. + models_path (os.PathLike): Path to the folder with exported model files. device (str): Device to run the model on (e.g., CPU, GPU). Default is 'CPU'. kwargs: Device properties )") diff --git a/src/python/py_whisper_pipeline.cpp b/src/python/py_whisper_pipeline.cpp index d34bd5f3b6..7ecf71d2f0 100644 --- a/src/python/py_whisper_pipeline.cpp +++ b/src/python/py_whisper_pipeline.cpp @@ -323,7 +323,7 @@ void init_whisper_pipeline(py::module_& m) { "openvino.properties map", R"( WhisperPipeline class constructor. - models_path (str): Path to the model file. + models_path (os.PathLike): Path to the model file. device (str): Device to run the model on (e.g., CPU, GPU). )") From 6dd8261f2e6c8b5d2920fc22f89feb4edd7bfed1 Mon Sep 17 00:00:00 2001 From: Anna Likholat Date: Fri, 29 Nov 2024 19:10:51 +0100 Subject: [PATCH 24/24] Txt2img models from buffer (#1279) --- .../genai/image_generation/autoencoder_kl.hpp | 56 ++++++++++++++++++- .../image_generation/clip_text_model.hpp | 27 +++++++++ .../clip_text_model_with_projection.hpp | 27 +++++++++ .../flux_transformer_2d_model.hpp | 26 ++++++++- .../sd3_transformer_2d_model.hpp | 22 ++++++++ .../image_generation/t5_encoder_model.hpp | 23 +++++++- .../unet2d_condition_model.hpp | 27 +++++++++ .../models/autoencoder_kl.cpp | 54 ++++++++++++++++++ .../models/clip_text_model.cpp | 19 +++++++ .../clip_text_model_with_projection.cpp | 19 +++++++ .../models/flux_transformer_2d_model.cpp | 25 ++++++++- .../models/sd3_transformer_2d_model.cpp | 19 +++++++ .../models/t5_encoder_model.cpp | 17 ++++++ .../models/unet2d_condition_model.cpp | 19 +++++++ 14 files changed, 372 insertions(+), 8 deletions(-) diff --git a/src/cpp/include/openvino/genai/image_generation/autoencoder_kl.hpp b/src/cpp/include/openvino/genai/image_generation/autoencoder_kl.hpp index b838fbfd97..347925727a 100644 --- a/src/cpp/include/openvino/genai/image_generation/autoencoder_kl.hpp +++ b/src/cpp/include/openvino/genai/image_generation/autoencoder_kl.hpp @@ -45,13 +45,37 @@ class OPENVINO_GENAI_EXPORTS AutoencoderKL { const std::string& device, const ov::AnyMap& properties = {}); + AutoencoderKL(const std::string& vae_decoder_model, + const Tensor& vae_decoder_weights, + const Config& vae_decoder_config); + + AutoencoderKL(const std::string& vae_encoder_model, + const Tensor& vae_encoder_weights, + const std::string& vae_decoder_model, + const Tensor& vae_decoder_weights, + const Config& vae_decoder_config); + + AutoencoderKL(const std::string& vae_decoder_model, + const Tensor& vae_decoder_weights, + const Config& vae_decoder_config, + const std::string& device, + const ov::AnyMap& properties = {}); + + AutoencoderKL(const std::string& vae_encoder_model, + const Tensor& vae_encoder_weights, + const std::string& vae_decoder_model, + const Tensor& vae_decoder_weights, + const Config& vae_decoder_config, + const std::string& device, + const ov::AnyMap& properties = {}); + template ::value, bool>::type = true> AutoencoderKL(const std::filesystem::path& vae_decoder_path, const std::string& device, Properties&&... properties) : AutoencoderKL(vae_decoder_path, device, ov::AnyMap{std::forward(properties)...}) { } - + template ::value, bool>::type = true> AutoencoderKL(const std::filesystem::path& vae_encoder_path, @@ -60,6 +84,36 @@ class OPENVINO_GENAI_EXPORTS AutoencoderKL { Properties&&... properties) : AutoencoderKL(vae_encoder_path, vae_decoder_path, device, ov::AnyMap{std::forward(properties)...}) { } + template ::value, bool>::type = true> + AutoencoderKL(const std::string& vae_decoder_model, + const Tensor& vae_decoder_weights, + const Config& vae_decoder_config, + const std::string& device, + Properties&&... properties) + : AutoencoderKL(vae_decoder_model, + vae_decoder_weights, + vae_decoder_config, + device, + ov::AnyMap{std::forward(properties)...}) { } + + template ::value, bool>::type = true> + AutoencoderKL(const std::string& vae_encoder_model, + const Tensor& vae_encoder_weights, + const std::string& vae_decoder_model, + const Tensor& vae_decoder_weights, + const Config& vae_decoder_config, + const std::string& device, + Properties&&... properties) + : AutoencoderKL(vae_encoder_model, + vae_encoder_weights, + vae_decoder_model, + vae_decoder_weights, + vae_decoder_config, + device, + ov::AnyMap{std::forward(properties)...}) { } + AutoencoderKL(const AutoencoderKL&); AutoencoderKL& reshape(int batch_size, int height, int width); diff --git a/src/cpp/include/openvino/genai/image_generation/clip_text_model.hpp b/src/cpp/include/openvino/genai/image_generation/clip_text_model.hpp index 26f28abac2..a3b9ebbd88 100644 --- a/src/cpp/include/openvino/genai/image_generation/clip_text_model.hpp +++ b/src/cpp/include/openvino/genai/image_generation/clip_text_model.hpp @@ -33,6 +33,18 @@ class OPENVINO_GENAI_EXPORTS CLIPTextModel { const std::string& device, const ov::AnyMap& properties = {}); + CLIPTextModel(const std::string& model, + const Tensor& weights, + const Config& config, + const Tokenizer& clip_tokenizer); + + CLIPTextModel(const std::string& model, + const Tensor& weights, + const Config& config, + const Tokenizer& clip_tokenizer, + const std::string& device, + const ov::AnyMap& properties = {}); + template ::value, bool>::type = true> CLIPTextModel(const std::filesystem::path& root_dir, @@ -40,6 +52,21 @@ class OPENVINO_GENAI_EXPORTS CLIPTextModel { Properties&&... properties) : CLIPTextModel(root_dir, device, ov::AnyMap{std::forward(properties)...}) { } + template ::value, bool>::type = true> + CLIPTextModel(const std::string& model, + const Tensor& weights, + const Config& config, + const Tokenizer& clip_tokenizer, + const std::string& device, + Properties&&... properties) + : CLIPTextModel(model, + weights, + config, + clip_tokenizer, + device, + ov::AnyMap{std::forward(properties)...}) { } + CLIPTextModel(const CLIPTextModel&); const Config& get_config() const; diff --git a/src/cpp/include/openvino/genai/image_generation/clip_text_model_with_projection.hpp b/src/cpp/include/openvino/genai/image_generation/clip_text_model_with_projection.hpp index 157e378026..563fb8711d 100644 --- a/src/cpp/include/openvino/genai/image_generation/clip_text_model_with_projection.hpp +++ b/src/cpp/include/openvino/genai/image_generation/clip_text_model_with_projection.hpp @@ -33,6 +33,18 @@ class OPENVINO_GENAI_EXPORTS CLIPTextModelWithProjection { const std::string& device, const ov::AnyMap& properties = {}); + CLIPTextModelWithProjection(const std::string& model, + const Tensor& weights, + const Config& config, + const Tokenizer& clip_tokenizer); + + CLIPTextModelWithProjection(const std::string& model, + const Tensor& weights, + const Config& config, + const Tokenizer& clip_tokenizer, + const std::string& device, + const ov::AnyMap& properties = {}); + template ::value, bool>::type = true> CLIPTextModelWithProjection(const std::filesystem::path& root_dir, @@ -40,6 +52,21 @@ class OPENVINO_GENAI_EXPORTS CLIPTextModelWithProjection { Properties&&... properties) : CLIPTextModelWithProjection(root_dir, device, ov::AnyMap{std::forward(properties)...}) { } + template ::value, bool>::type = true> + CLIPTextModelWithProjection(const std::string& model, + const Tensor& weights, + const Config& config, + const Tokenizer& clip_tokenizer, + const std::string& device, + Properties&&... properties) + : CLIPTextModelWithProjection(model, + weights, + config, + clip_tokenizer, + device, + ov::AnyMap{std::forward(properties)...}) { } + CLIPTextModelWithProjection(const CLIPTextModelWithProjection&); const Config& get_config() const; diff --git a/src/cpp/include/openvino/genai/image_generation/flux_transformer_2d_model.hpp b/src/cpp/include/openvino/genai/image_generation/flux_transformer_2d_model.hpp index 03defd5350..f0f89d03d7 100644 --- a/src/cpp/include/openvino/genai/image_generation/flux_transformer_2d_model.hpp +++ b/src/cpp/include/openvino/genai/image_generation/flux_transformer_2d_model.hpp @@ -28,14 +28,36 @@ class OPENVINO_GENAI_EXPORTS FluxTransformer2DModel { explicit FluxTransformer2DModel(const std::filesystem::path& root_dir); FluxTransformer2DModel(const std::filesystem::path& root_dir, - const std::string& device, - const ov::AnyMap& properties = {}); + const std::string& device, + const ov::AnyMap& properties = {}); + + FluxTransformer2DModel(const std::string& model, + const Tensor& weights, + const Config& config, + const size_t vae_scale_factor); + + FluxTransformer2DModel(const std::string& model, + const Tensor& weights, + const Config& config, + const size_t vae_scale_factor, + const std::string& device, + const ov::AnyMap& properties = {}); template ::value, bool>::type = true> FluxTransformer2DModel(const std::filesystem::path& root_dir, const std::string& device, Properties&&... properties) : FluxTransformer2DModel(root_dir, device, ov::AnyMap{std::forward(properties)...}) {} + template ::value, bool>::type = true> + FluxTransformer2DModel(const std::string& model, + const Tensor& weights, + const Config& config, + const size_t vae_scale_factor, + const std::string& device, + Properties&&... properties) + : FluxTransformer2DModel(model, weights, config, vae_scale_factor, device, ov::AnyMap{std::forward(properties)...}) {} + FluxTransformer2DModel(const FluxTransformer2DModel&); const Config& get_config() const; diff --git a/src/cpp/include/openvino/genai/image_generation/sd3_transformer_2d_model.hpp b/src/cpp/include/openvino/genai/image_generation/sd3_transformer_2d_model.hpp index 9f3f8ec5f5..e4641066ec 100644 --- a/src/cpp/include/openvino/genai/image_generation/sd3_transformer_2d_model.hpp +++ b/src/cpp/include/openvino/genai/image_generation/sd3_transformer_2d_model.hpp @@ -34,11 +34,33 @@ class OPENVINO_GENAI_EXPORTS SD3Transformer2DModel { const std::string& device, const ov::AnyMap& properties = {}); + SD3Transformer2DModel(const std::string& model, + const Tensor& weights, + const Config& config, + const size_t vae_scale_factor); + + SD3Transformer2DModel(const std::string& model, + const Tensor& weights, + const Config& config, + const size_t vae_scale_factor, + const std::string& device, + const ov::AnyMap& properties = {}); + template ::value, bool>::type = true> SD3Transformer2DModel(const std::filesystem::path& root_dir, const std::string& device, Properties&&... properties) : SD3Transformer2DModel(root_dir, device, ov::AnyMap{std::forward(properties)...}) {} + template ::value, bool>::type = true> + SD3Transformer2DModel(const std::string& model, + const Tensor& weights, + const Config& config, + const size_t vae_scale_factor, + const std::string& device, + Properties&&... properties) + : SD3Transformer2DModel(model, weights, config, vae_scale_factor, device, ov::AnyMap{std::forward(properties)...}) {} + SD3Transformer2DModel(const SD3Transformer2DModel&); const Config& get_config() const; diff --git a/src/cpp/include/openvino/genai/image_generation/t5_encoder_model.hpp b/src/cpp/include/openvino/genai/image_generation/t5_encoder_model.hpp index d72b7ab411..717871d1d9 100644 --- a/src/cpp/include/openvino/genai/image_generation/t5_encoder_model.hpp +++ b/src/cpp/include/openvino/genai/image_generation/t5_encoder_model.hpp @@ -26,13 +26,32 @@ class OPENVINO_GENAI_EXPORTS T5EncoderModel { const std::string& device, const ov::AnyMap& properties = {}); + T5EncoderModel(const std::string& model, + const Tensor& weights, + const Tokenizer& tokenizer); + + T5EncoderModel(const std::string&model, + const Tensor& weights, + const Tokenizer& tokenizer, + const std::string& device, + const ov::AnyMap& properties = {}); + template ::value, bool>::type = true> T5EncoderModel(const std::filesystem::path& root_dir, - const std::string& device, - Properties&&... properties) + const std::string& device, + Properties&&... properties) : T5EncoderModel(root_dir, device, ov::AnyMap{std::forward(properties)...}) { } + template ::value, bool>::type = true> + T5EncoderModel(const std::string& model, + const Tensor& weights, + const Tokenizer& tokenizer, + const std::string& device, + Properties&&... properties) + : T5EncoderModel(model, weights, tokenizer, device, ov::AnyMap{std::forward(properties)...}) { } + T5EncoderModel(const T5EncoderModel&); T5EncoderModel& reshape(int batch_size, int max_sequence_length); diff --git a/src/cpp/include/openvino/genai/image_generation/unet2d_condition_model.hpp b/src/cpp/include/openvino/genai/image_generation/unet2d_condition_model.hpp index 85a370b449..4acfd2ce9b 100644 --- a/src/cpp/include/openvino/genai/image_generation/unet2d_condition_model.hpp +++ b/src/cpp/include/openvino/genai/image_generation/unet2d_condition_model.hpp @@ -36,6 +36,18 @@ class OPENVINO_GENAI_EXPORTS UNet2DConditionModel { const std::string& device, const ov::AnyMap& properties = {}); + UNet2DConditionModel(const std::string& model, + const Tensor& weights, + const Config& config, + const size_t vae_scale_factor); + + UNet2DConditionModel(const std::string& model, + const Tensor& weights, + const Config& config, + const size_t vae_scale_factor, + const std::string& device, + const ov::AnyMap& properties = {}); + template ::value, bool>::type = true> UNet2DConditionModel(const std::filesystem::path& root_dir, @@ -43,6 +55,21 @@ class OPENVINO_GENAI_EXPORTS UNet2DConditionModel { Properties&&... properties) : UNet2DConditionModel(root_dir, device, ov::AnyMap{std::forward(properties)...}) { } + template ::value, bool>::type = true> + UNet2DConditionModel(const std::string& model, + const Tensor& weights, + const Config& config, + const size_t vae_scale_factor, + const std::string& device, + Properties&&... properties) + : UNet2DConditionModel(model, + weights, + config, + vae_scale_factor, + device, + ov::AnyMap{std::forward(properties)...}) { } + UNet2DConditionModel(const UNet2DConditionModel&); const Config& get_config() const; diff --git a/src/cpp/src/image_generation/models/autoencoder_kl.cpp b/src/cpp/src/image_generation/models/autoencoder_kl.cpp index d7eaf18bf4..7c38cd77fa 100644 --- a/src/cpp/src/image_generation/models/autoencoder_kl.cpp +++ b/src/cpp/src/image_generation/models/autoencoder_kl.cpp @@ -129,6 +129,60 @@ AutoencoderKL::AutoencoderKL(const std::filesystem::path& vae_encoder_path, } } +AutoencoderKL::AutoencoderKL(const std::string& vae_decoder_model, + const Tensor& vae_decoder_weights, + const Config& vae_decoder_config) + : m_config(vae_decoder_config) { + ov::Core core = utils::singleton_core(); + m_decoder_model = core.read_model(vae_decoder_model, vae_decoder_weights); + // apply VaeImageProcessor postprocessing steps by merging them into the VAE decoder model + merge_vae_image_post_processing(); +} + +AutoencoderKL::AutoencoderKL(const std::string& vae_encoder_model, + const Tensor& vae_encoder_weights, + const std::string& vae_decoder_model, + const Tensor& vae_decoder_weights, + const Config& vae_decoder_config) + : AutoencoderKL(vae_decoder_model, vae_decoder_weights, vae_decoder_config) { + ov::Core core = utils::singleton_core(); + m_encoder_model = core.read_model(vae_encoder_model, vae_encoder_weights); + // apply VaeImageProcessor pre-processing steps by merging them into the VAE encoder + merge_vae_image_pre_processing(); +} + +AutoencoderKL::AutoencoderKL(const std::string& vae_decoder_model, + const Tensor& vae_decoder_weights, + const Config& vae_decoder_config, + const std::string& device, + const ov::AnyMap& properties) + : AutoencoderKL(vae_decoder_model, vae_decoder_weights, vae_decoder_config) { + if (auto filtered_properties = extract_adapters_from_properties(properties)) { + compile(device, *filtered_properties); + } else { + compile(device, properties); + } +} + +AutoencoderKL::AutoencoderKL(const std::string& vae_encoder_model, + const Tensor& vae_encoder_weights, + const std::string& vae_decoder_model, + const Tensor& vae_decoder_weights, + const Config& vae_decoder_config, + const std::string& device, + const ov::AnyMap& properties) + : AutoencoderKL(vae_encoder_model, + vae_encoder_weights, + vae_decoder_model, + vae_decoder_weights, + vae_decoder_config) { + if (auto filtered_properties = extract_adapters_from_properties(properties)) { + compile(device, *filtered_properties); + } else { + compile(device, properties); + } +} + AutoencoderKL::AutoencoderKL(const AutoencoderKL&) = default; AutoencoderKL& AutoencoderKL::reshape(int batch_size, int height, int width) { diff --git a/src/cpp/src/image_generation/models/clip_text_model.cpp b/src/cpp/src/image_generation/models/clip_text_model.cpp index f5a4d0940b..d2dab30bcf 100644 --- a/src/cpp/src/image_generation/models/clip_text_model.cpp +++ b/src/cpp/src/image_generation/models/clip_text_model.cpp @@ -48,6 +48,25 @@ CLIPTextModel::CLIPTextModel(const std::filesystem::path& root_dir, compile(device, properties); } +CLIPTextModel::CLIPTextModel(const std::string& model, + const Tensor& weights, + const Config& config, + const Tokenizer& clip_tokenizer) : + m_clip_tokenizer(clip_tokenizer), m_config(config) { + ov::Core core = utils::singleton_core(); + m_model = core.read_model(model, weights); +} + +CLIPTextModel::CLIPTextModel(const std::string& model, + const Tensor& weights, + const Config& config, + const Tokenizer& clip_tokenizer, + const std::string& device, + const ov::AnyMap& properties) : + CLIPTextModel(model, weights, config, clip_tokenizer) { + compile(device, properties); +} + CLIPTextModel::CLIPTextModel(const CLIPTextModel&) = default; const CLIPTextModel::Config& CLIPTextModel::get_config() const { diff --git a/src/cpp/src/image_generation/models/clip_text_model_with_projection.cpp b/src/cpp/src/image_generation/models/clip_text_model_with_projection.cpp index 9a89fd73bc..13c7f5a442 100644 --- a/src/cpp/src/image_generation/models/clip_text_model_with_projection.cpp +++ b/src/cpp/src/image_generation/models/clip_text_model_with_projection.cpp @@ -39,6 +39,25 @@ CLIPTextModelWithProjection::CLIPTextModelWithProjection(const std::filesystem:: compile(device, properties); } +CLIPTextModelWithProjection::CLIPTextModelWithProjection(const std::string& model, + const Tensor& weights, + const Config& config, + const Tokenizer& clip_tokenizer) : + m_clip_tokenizer(clip_tokenizer), m_config(config) { + ov::Core core = utils::singleton_core(); + m_model = core.read_model(model, weights); +} + +CLIPTextModelWithProjection::CLIPTextModelWithProjection(const std::string& model, + const Tensor& weights, + const Config& config, + const Tokenizer& clip_tokenizer, + const std::string& device, + const ov::AnyMap& properties) : + CLIPTextModelWithProjection(model, weights, config, clip_tokenizer) { + compile(device, properties); +} + CLIPTextModelWithProjection::CLIPTextModelWithProjection(const CLIPTextModelWithProjection&) = default; const CLIPTextModelWithProjection::Config& CLIPTextModelWithProjection::get_config() const { diff --git a/src/cpp/src/image_generation/models/flux_transformer_2d_model.cpp b/src/cpp/src/image_generation/models/flux_transformer_2d_model.cpp index 92439be423..8bb66995b4 100644 --- a/src/cpp/src/image_generation/models/flux_transformer_2d_model.cpp +++ b/src/cpp/src/image_generation/models/flux_transformer_2d_model.cpp @@ -37,6 +37,25 @@ FluxTransformer2DModel::FluxTransformer2DModel(const std::filesystem::path& root compile(device, properties); } +FluxTransformer2DModel::FluxTransformer2DModel(const std::string& model, + const Tensor& weights, + const Config& config, + const size_t vae_scale_factor) : + m_config(config), m_vae_scale_factor(vae_scale_factor) { + ov::Core core = utils::singleton_core(); + m_model = core.read_model(model, weights); +} + +FluxTransformer2DModel::FluxTransformer2DModel(const std::string& model, + const Tensor& weights, + const Config& config, + const size_t vae_scale_factor, + const std::string& device, + const ov::AnyMap& properties) : + FluxTransformer2DModel(model, weights, config, vae_scale_factor) { + compile(device, properties); +} + FluxTransformer2DModel::FluxTransformer2DModel(const FluxTransformer2DModel&) = default; const FluxTransformer2DModel::Config& FluxTransformer2DModel::get_config() const { @@ -44,9 +63,9 @@ const FluxTransformer2DModel::Config& FluxTransformer2DModel::get_config() const } FluxTransformer2DModel& FluxTransformer2DModel::reshape(int batch_size, - int height, - int width, - int tokenizer_model_max_length) { + int height, + int width, + int tokenizer_model_max_length) { OPENVINO_ASSERT(m_model, "Model has been already compiled. Cannot reshape already compiled model"); // hidden_states=latent_model_input, diff --git a/src/cpp/src/image_generation/models/sd3_transformer_2d_model.cpp b/src/cpp/src/image_generation/models/sd3_transformer_2d_model.cpp index 38e3dad290..70dddb0476 100644 --- a/src/cpp/src/image_generation/models/sd3_transformer_2d_model.cpp +++ b/src/cpp/src/image_generation/models/sd3_transformer_2d_model.cpp @@ -39,6 +39,25 @@ SD3Transformer2DModel::SD3Transformer2DModel(const std::filesystem::path& root_d compile(device, properties); } +SD3Transformer2DModel::SD3Transformer2DModel(const std::string& model, + const Tensor& weights, + const Config& config, + const size_t vae_scale_factor) : + m_config(config), m_vae_scale_factor(vae_scale_factor) { + ov::Core core = utils::singleton_core(); + m_model = core.read_model(model, weights); +} + +SD3Transformer2DModel::SD3Transformer2DModel(const std::string& model, + const Tensor& weights, + const Config& config, + const size_t vae_scale_factor, + const std::string& device, + const ov::AnyMap& properties) : + SD3Transformer2DModel(model, weights, config, vae_scale_factor) { + compile(device, properties); +} + SD3Transformer2DModel::SD3Transformer2DModel(const SD3Transformer2DModel&) = default; const SD3Transformer2DModel::Config& SD3Transformer2DModel::get_config() const { diff --git a/src/cpp/src/image_generation/models/t5_encoder_model.cpp b/src/cpp/src/image_generation/models/t5_encoder_model.cpp index 2efe4986e8..e7629b2f26 100644 --- a/src/cpp/src/image_generation/models/t5_encoder_model.cpp +++ b/src/cpp/src/image_generation/models/t5_encoder_model.cpp @@ -27,6 +27,23 @@ T5EncoderModel::T5EncoderModel(const std::filesystem::path& root_dir, compile(device, properties); } +T5EncoderModel::T5EncoderModel(const std::string& model, + const Tensor& weights, + const Tokenizer& tokenizer) : + m_tokenizer(tokenizer) { + ov::Core core = utils::singleton_core(); + m_model = core.read_model(model, weights); +} + +T5EncoderModel::T5EncoderModel(const std::string& model, + const Tensor& weights, + const Tokenizer& tokenizer, + const std::string& device, + const ov::AnyMap& properties) : + T5EncoderModel(model, weights, tokenizer) { + compile(device, properties); +} + T5EncoderModel::T5EncoderModel(const T5EncoderModel&) = default; T5EncoderModel& T5EncoderModel::reshape(int batch_size, int max_sequence_length) { diff --git a/src/cpp/src/image_generation/models/unet2d_condition_model.cpp b/src/cpp/src/image_generation/models/unet2d_condition_model.cpp index 413acb638b..ca65c9d9d6 100644 --- a/src/cpp/src/image_generation/models/unet2d_condition_model.cpp +++ b/src/cpp/src/image_generation/models/unet2d_condition_model.cpp @@ -42,6 +42,25 @@ UNet2DConditionModel::UNet2DConditionModel(const std::filesystem::path& root_dir compile(device, properties); } +UNet2DConditionModel::UNet2DConditionModel(const std::string& model, + const Tensor& weights, + const Config& config, + const size_t vae_scale_factor) : + m_config(config), m_vae_scale_factor(vae_scale_factor) { + ov::Core core = utils::singleton_core(); + m_model = core.read_model(model, weights); +} + +UNet2DConditionModel::UNet2DConditionModel(const std::string& model, + const Tensor& weights, + const Config& config, + const size_t vae_scale_factor, + const std::string& device, + const ov::AnyMap& properties) : + UNet2DConditionModel(model, weights, config, vae_scale_factor) { + compile(device, properties); +} + UNet2DConditionModel::UNet2DConditionModel(const UNet2DConditionModel&) = default; const UNet2DConditionModel::Config& UNet2DConditionModel::get_config() const {