From 6cd66d0274ddc8fde544643f74113fb6c40d2394 Mon Sep 17 00:00:00 2001
From: Anna Likholat <anna.likholat@intel.com>
Date: Wed, 20 Nov 2024 19:17:38 +0100
Subject: [PATCH 01/24] Text2Image Readme update: decode method usage (#1237)

---
 samples/cpp/text2image/README.md              |  6 ++--
 samples/python/text2image/README.md           |  4 ++-
 .../src/image_generation/flux_pipeline.hpp    | 36 ++++++++++---------
 3 files changed, 27 insertions(+), 19 deletions(-)

diff --git a/samples/cpp/text2image/README.md b/samples/cpp/text2image/README.md
index c5ffd53a84..ac736b2383 100644
--- a/samples/cpp/text2image/README.md
+++ b/samples/cpp/text2image/README.md
@@ -46,14 +46,16 @@ You can also add a callback to the `main.cpp` file to interrupt the image genera
 Please find the template of the callback usage below.
 
 ```cpp
-auto callback = [](size_t step, ov::Tensor& intermediate_res) -> bool {
+ov::genai::Text2ImagePipeline pipe(models_path, device);
+
+auto callback = [&](size_t step, ov::Tensor& intermediate_res) -> bool {
    std::cout << "Image generation step: " << step << std::endl;
+   ov::Tensor img = pipe.decode(intermediate_res); // get intermediate image tensor
    if (your_condition) // return true if you want to interrupt image generation
       return true;
    return false;
 };
 
-ov::genai::Text2ImagePipeline pipe(models_path, device);
 ov::Tensor image = pipe.generate(prompt,
    ...
    ov::genai::callback(callback)
diff --git a/samples/python/text2image/README.md b/samples/python/text2image/README.md
index 9421061885..2e841673d3 100644
--- a/samples/python/text2image/README.md
+++ b/samples/python/text2image/README.md
@@ -46,13 +46,15 @@ You can also add a callback to the `main.py` file to interrupt the image generat
 Please find the template of the callback usage below.
 
 ```python
+pipe = openvino_genai.Text2ImagePipeline(model_dir, device)
+
 def callback(step, intermediate_res):
    print("Image generation step: ", step)
+   image_tensor = pipe.decode(intermediate_res) # get intermediate image tensor
    if your_condition: # return True if you want to interrupt image generation
       return True
    return False
 
-pipe = openvino_genai.Text2ImagePipeline(model_dir, device)
 image = pipe.generate(
    ...
    callback = callback
diff --git a/src/cpp/src/image_generation/flux_pipeline.hpp b/src/cpp/src/image_generation/flux_pipeline.hpp
index 101401d434..e684443e47 100644
--- a/src/cpp/src/image_generation/flux_pipeline.hpp
+++ b/src/cpp/src/image_generation/flux_pipeline.hpp
@@ -297,33 +297,33 @@ class FluxPipeline : public DiffusionPipeline {
     ov::Tensor generate(const std::string& positive_prompt,
                         ov::Tensor initial_image,
                         const ov::AnyMap& properties) override {
-        ImageGenerationConfig generation_config = m_generation_config;
-        generation_config.update_generation_config(properties);
+        m_custom_generation_config = m_generation_config;
+        m_custom_generation_config.update_generation_config(properties);
 
         if (!initial_image) {
             // in case of typical text to image generation, we need to ignore 'strength'
-            generation_config.strength = 1.0f;
+            m_custom_generation_config.strength = 1.0f;
         }
 
         const size_t vae_scale_factor = m_vae->get_vae_scale_factor();
         const auto& transformer_config = m_transformer->get_config();
 
-        if (generation_config.height < 0)
-            generation_config.height = transformer_config.m_default_sample_size * vae_scale_factor;
-        if (generation_config.width < 0)
-            generation_config.width = transformer_config.m_default_sample_size * vae_scale_factor;
+        if (m_custom_generation_config.height < 0)
+            m_custom_generation_config.height = transformer_config.m_default_sample_size * vae_scale_factor;
+        if (m_custom_generation_config.width < 0)
+            m_custom_generation_config.width = transformer_config.m_default_sample_size * vae_scale_factor;
 
-        check_inputs(generation_config, initial_image);
+        check_inputs(m_custom_generation_config, initial_image);
 
-        compute_hidden_states(positive_prompt, generation_config);
+        compute_hidden_states(positive_prompt, m_custom_generation_config);
 
-        ov::Tensor latents = prepare_latents(initial_image, generation_config);
+        ov::Tensor latents = prepare_latents(initial_image, m_custom_generation_config);
 
         size_t image_seq_len = latents.get_shape()[1];
         float mu = m_scheduler->calculate_shift(image_seq_len);
 
-        float linspace_end = 1.0f / generation_config.num_inference_steps;
-        std::vector<float> sigmas = numpy_utils::linspace<float>(1.0f, linspace_end, generation_config.num_inference_steps, true);
+        float linspace_end = 1.0f / m_custom_generation_config.num_inference_steps;
+        std::vector<float> sigmas = numpy_utils::linspace<float>(1.0f, linspace_end, m_custom_generation_config.num_inference_steps, true);
 
         m_scheduler->set_timesteps_with_sigma(sigmas, mu);
         std::vector<float> timesteps = m_scheduler->get_float_timesteps();
@@ -345,7 +345,7 @@ class FluxPipeline : public DiffusionPipeline {
 
             ov::Tensor noise_pred_tensor = m_transformer->infer(latents, timestep);
 
-            auto scheduler_step_result = m_scheduler->step(noise_pred_tensor, latents, inference_step, generation_config.generator);
+            auto scheduler_step_result = m_scheduler->step(noise_pred_tensor, latents, inference_step, m_custom_generation_config.generator);
             latents = scheduler_step_result["latent"];
 
             if (do_callback) {
@@ -355,12 +355,16 @@ class FluxPipeline : public DiffusionPipeline {
             }
         }
 
-        latents = unpack_latents(latents, generation_config.height, generation_config.width, vae_scale_factor);
+        latents = unpack_latents(latents, m_custom_generation_config.height, m_custom_generation_config.width, vae_scale_factor);
         return m_vae->decode(latents);
     }
 
     ov::Tensor decode(const ov::Tensor latent) override {
-        return m_vae->decode(latent);
+        ov::Tensor unpacked_latent = unpack_latents(latent,
+                                                m_custom_generation_config.height,
+                                                m_custom_generation_config.width,
+                                                m_vae->get_vae_scale_factor());
+        return m_vae->decode(unpacked_latent);
     }
 
 private:
@@ -407,7 +411,7 @@ class FluxPipeline : public DiffusionPipeline {
     std::shared_ptr<CLIPTextModel> m_clip_text_encoder;
     std::shared_ptr<T5EncoderModel> m_t5_text_encoder;
     std::shared_ptr<AutoencoderKL> m_vae;
-
+    ImageGenerationConfig m_custom_generation_config;
 };
 
 }  // namespace genai

From cd05c8eb9ce1eb22411c2107afcdb1b3e2344fa9 Mon Sep 17 00:00:00 2001
From: Anastasiia Pnevskaia <anastasiia.pnevskaia@intel.com>
Date: Wed, 20 Nov 2024 20:03:33 +0100
Subject: [PATCH 02/24] Fixed passing of generation config params to VLM
 generate. (#1180)

- Fixed passing of generation config params to VLM generate().
- Updated generation config params params list in
`update_config_from_kwargs()` method.

Ticket: CVS-157050

---------

Co-authored-by: Ilya Lavrenov <ilya.lavrenov@intel.com>
---
 .../openvino/genai/generation_config.hpp      |   2 +-
 src/cpp/src/llm_pipeline_static.cpp           |   7 +
 src/cpp/src/utils.hpp                         |  22 +-
 .../openvino_genai/py_openvino_genai.pyi      |   2 +-
 src/python/py_image_generation_pipelines.cpp  | 108 +--------
 src/python/py_tokenizer.cpp                   |  13 +-
 src/python/py_utils.cpp                       | 209 ++++++++++++------
 src/python/py_utils.hpp                       |   2 +-
 src/python/py_vlm_pipeline.cpp                |  44 +---
 src/python/py_whisper_pipeline.cpp            |  55 +----
 tests/cpp/utils.cpp                           |  21 ++
 tests/python_tests/test_generate_api.py       |  11 +-
 tests/python_tests/test_sampling.py           |   6 +-
 13 files changed, 220 insertions(+), 282 deletions(-)
 create mode 100644 tests/cpp/utils.cpp

diff --git a/src/cpp/include/openvino/genai/generation_config.hpp b/src/cpp/include/openvino/genai/generation_config.hpp
index 22edcb98c0..8d23b298ba 100644
--- a/src/cpp/include/openvino/genai/generation_config.hpp
+++ b/src/cpp/include/openvino/genai/generation_config.hpp
@@ -156,7 +156,7 @@ static constexpr ov::Property<bool> ignore_eos{"ignore_eos"};
 static constexpr ov::Property<size_t> min_new_tokens{"min_new_tokens"};
 static constexpr ov::Property<std::vector<std::string>> stop_strings{"stop_strings"};
 static constexpr ov::Property<bool> include_stop_str_in_output{"include_stop_str_in_output"};
-static constexpr ov::Property<std::vector<std::vector<int64_t>>> stop_token_ids{"stop_token_ids"};
+static constexpr ov::Property<std::set<int64_t>> stop_token_ids{"stop_token_ids"};
 
 static constexpr ov::Property<size_t> num_beam_groups{"num_beam_groups"};
 static constexpr ov::Property<size_t> num_beams{"num_beams"};
diff --git a/src/cpp/src/llm_pipeline_static.cpp b/src/cpp/src/llm_pipeline_static.cpp
index 40089384a8..2beb7d64be 100644
--- a/src/cpp/src/llm_pipeline_static.cpp
+++ b/src/cpp/src/llm_pipeline_static.cpp
@@ -530,6 +530,13 @@ template <typename T>
 T pop_or_default(ov::AnyMap& config, const std::string& key, const T& default_value) {
     auto anyopt = pop_option(config, key);
     if (anyopt.has_value()) {
+        if (anyopt.value().empty()) {
+            if (ov::genai::utils::is_container<T>)
+                return T{};
+            else {
+                OPENVINO_THROW("Got empty ov::Any for key: " + key);
+            }
+        }
         return anyopt.value().as<T>();
     }
     return default_value;
diff --git a/src/cpp/src/utils.hpp b/src/cpp/src/utils.hpp
index 9adc46c87a..3487fccb81 100644
--- a/src/cpp/src/utils.hpp
+++ b/src/cpp/src/utils.hpp
@@ -2,6 +2,7 @@
 // SPDX-License-Identifier: Apache-2.0
 
 #pragma once
+#include <type_traits>
 
 #include "openvino/genai/llm_pipeline.hpp"
 #include "openvino/runtime/core.hpp"
@@ -12,6 +13,16 @@ namespace ov {
 namespace genai {
 namespace utils {
 
+// Variable template that checks if a type has begin() and end() member functions
+template<typename, typename = void>
+constexpr bool is_container = false;
+ 
+template<typename T>
+constexpr bool is_container<T,
+    std::void_t<decltype(std::declval<T>().begin()),
+                decltype(std::declval<T>().end())>> = true;
+
+
 Tensor init_attention_mask(const Tensor& position_ids);
 
 void print_tensor(const ov::Tensor& tensor);
@@ -31,7 +42,16 @@ template <typename T>
 void read_anymap_param(const ov::AnyMap& config_map, const std::string& name, T& param) {
     auto it = config_map.find(name);
     if (it != config_map.end()) {
-        param = it->second.as<typename OmitOptional<T>::value>();
+        if (it->second.empty()) {
+            if (ov::genai::utils::is_container<T>)
+                param = T{};
+            else {
+                OPENVINO_THROW("Got empty ov::Any for parameter name: " + name);
+            }
+        }
+        else {
+            param = it->second.as<typename OmitOptional<T>::value>();
+        }
     }
 }
 
diff --git a/src/python/openvino_genai/py_openvino_genai.pyi b/src/python/openvino_genai/py_openvino_genai.pyi
index a16b74b703..df290a9744 100644
--- a/src/python/openvino_genai/py_openvino_genai.pyi
+++ b/src/python/openvino_genai/py_openvino_genai.pyi
@@ -1296,7 +1296,7 @@ class Tokenizer:
     openvino_genai.Tokenizer object is used to initialize Tokenizer
                if it's located in a different path than the main model.
     """
-    def __init__(self, tokenizer_path: os.PathLike, properties: dict[str, typing.Any] = {}) -> None:
+    def __init__(self, tokenizer_path: os.PathLike, properties: dict[str, typing.Any] = {}, **kwargs) -> None:
         ...
     def apply_chat_template(self, history: list[dict[str, str]], add_generation_prompt: bool, chat_template: str = '') -> str:
         """
diff --git a/src/python/py_image_generation_pipelines.cpp b/src/python/py_image_generation_pipelines.cpp
index f70faaca61..dade8a170e 100644
--- a/src/python/py_image_generation_pipelines.cpp
+++ b/src/python/py_image_generation_pipelines.cpp
@@ -67,108 +67,6 @@ auto text2image_generate_docstring = R"(
 )";
 
 
-void update_image_generation_config_from_kwargs(
-    ov::genai::ImageGenerationConfig& config,
-    const py::kwargs& kwargs) {
-    for (const auto& item : kwargs) {
-        std::string key = py::cast<std::string>(item.first);
-        py::object value = py::cast<py::object>(item.second);
-
-        if (key == "prompt_2") {
-            config.prompt_2 = py::cast<std::string>(value);
-        } else if (key == "prompt_3") {
-            config.prompt_3 = py::cast<std::string>(value);
-        } else if (key == "negative_prompt") {
-            config.negative_prompt = py::cast<std::string>(value);
-        } else if (key == "negative_prompt_2") {
-            config.negative_prompt_2 = py::cast<std::string>(value);
-        } else if (key == "negative_prompt_3") {
-            config.negative_prompt_3 = py::cast<std::string>(value);
-        } else if (key == "num_images_per_prompt") {
-            config.num_images_per_prompt = py::cast<size_t>(value);
-        } else if (key == "guidance_scale") {
-            config.guidance_scale = py::cast<float>(value);
-        } else if (key == "height") {
-            config.height = py::cast<int64_t>(value);
-        } else if (key == "width") {
-            config.width = py::cast<int64_t>(value);
-        } else if (key == "num_inference_steps") {
-            config.num_inference_steps = py::cast<size_t>(value);
-        } else if (key == "generator") {
-            auto py_generator = py::cast<std::shared_ptr<ov::genai::Generator>>(value);
-            config.generator = py_generator;
-        } else if (key == "adapters") {
-            config.adapters = py::cast<ov::genai::AdapterConfig>(value);
-        } else if (key == "strength") {
-            config.strength = py::cast<float>(value);
-        } else if (key == "max_sequence_length") {
-            config.max_sequence_length = py::cast<size_t>(value);
-        } else {
-            throw(std::invalid_argument("'" + key + "' is unexpected parameter name. "
-                                        "Use help(openvino_genai.ImageGenerationConfig) to get list of acceptable parameters."));
-        }
-    }
-}
-
-ov::AnyMap text2image_kwargs_to_any_map(const py::kwargs& kwargs, bool allow_compile_properties=true) {
-    ov::AnyMap params = {};
-
-    for (const auto& item : kwargs) {
-        std::string key = py::cast<std::string>(item.first);
-        py::object value = py::cast<py::object>(item.second);
-
-        if (key == "prompt_2") {
-            params.insert({ov::genai::prompt_2(std::move(py::cast<std::string>(value)))});
-        } else if (key == "prompt_3") {
-            params.insert({ov::genai::prompt_3(std::move(py::cast<std::string>(value)))});
-        } else if (key == "negative_prompt") {
-            params.insert({ov::genai::negative_prompt(std::move(py::cast<std::string>(value)))});
-        } else if (key == "negative_prompt_2") {
-            params.insert({ov::genai::negative_prompt_2(std::move(py::cast<std::string>(value)))});
-        } else if (key == "negative_prompt_3") {
-            params.insert({ov::genai::negative_prompt_3(std::move(py::cast<std::string>(value)))});
-        } else if (key == "num_images_per_prompt") {
-            params.insert({ov::genai::num_images_per_prompt(std::move(py::cast<size_t>(value)))});
-        } else if (key == "guidance_scale") {
-            params.insert({ov::genai::guidance_scale(std::move(py::cast<float>(value)))});
-        } else if (key == "height") {
-            params.insert({ov::genai::height(std::move(py::cast<int64_t>(value)))});
-        } else if (key == "width") {
-            params.insert({ov::genai::width(std::move(py::cast<int64_t>(value)))});
-        } else if (key == "num_inference_steps") {
-            params.insert({ov::genai::num_inference_steps(std::move(py::cast<size_t>(value)))});
-        } else if (key == "generator") {
-            auto py_generator =py::cast<std::shared_ptr<ov::genai::Generator>>(value);
-            params.insert({ov::genai::generator(std::move(py_generator))});
-        } else if (key == "adapters") {
-            params.insert({ov::genai::adapters(std::move(py::cast<ov::genai::AdapterConfig>(value)))});
-        } else if (key == "strength") {
-            params.insert({ov::genai::strength(std::move(py::cast<float>(value)))});
-        } else if (key == "max_sequence_length") {
-            params.insert({ov::genai::max_sequence_length(std::move(py::cast<size_t>(value)))});
-        } else if (key == "callback") {
-            params.insert({ov::genai::callback(std::move(py::cast<std::function<bool(size_t, ov::Tensor&)>>(value)))});
-        }
-        else {
-            if (allow_compile_properties) {
-                // convert arbitrary objects to ov::Any
-                // not supported properties are not checked, as these properties are passed to compile(), which will throw exception in case of unsupported property
-                if (pyutils::py_object_is_any_map(value)) {
-                    auto map = pyutils::py_object_to_any_map(value);
-                    params.insert(map.begin(), map.end());
-                } else {
-                    params[key] = pyutils::py_object_to_any(value);
-                }
-            }
-            else {
-                // generate doesn't run compile(), so only Text2ImagePipeline specific properties are allowed
-                throw(std::invalid_argument("'" + key + "' is unexpected parameter name. "
-                                            "Use help(openvino_genai.Text2ImagePipeline.generate) to get list of acceptable parameters."));
-            }
-        }
-    }
-    return params;
-}
 
 } // namespace
 
@@ -230,7 +128,7 @@ void init_image_generation_pipelines(py::module_& m) {
         .def("update_generation_config", [](
             ov::genai::ImageGenerationConfig config,
             const py::kwargs& kwargs) {
-            update_image_generation_config_from_kwargs(config, kwargs);
+            config.update_generation_config(pyutils::kwargs_to_any_map(kwargs));
         });
 
     auto text2image_pipeline = py::class_<ov::genai::Text2ImagePipeline>(m, "Text2ImagePipeline", "This class is used for generation with text-to-image models.")
@@ -252,7 +150,7 @@ void init_image_generation_pipelines(py::module_& m) {
             const py::kwargs& kwargs
         ) {
             ScopedVar env_manager(pyutils::ov_tokenizers_module_path());
-            return std::make_unique<ov::genai::Text2ImagePipeline>(models_path, device, text2image_kwargs_to_any_map(kwargs, true));
+            return std::make_unique<ov::genai::Text2ImagePipeline>(models_path, device, pyutils::kwargs_to_any_map(kwargs));
         }),
         py::arg("models_path"), "folder with exported model files.",
         py::arg("device"), "device on which inference will be done",
@@ -289,7 +187,7 @@ void init_image_generation_pipelines(py::module_& m) {
                 const std::string& prompt,
                 const py::kwargs& kwargs
             ) -> py::typing::Union<ov::Tensor> {
-                ov::AnyMap params = text2image_kwargs_to_any_map(kwargs, false);
+                ov::AnyMap params = pyutils::kwargs_to_any_map(kwargs);
                 return py::cast(pipe.generate(prompt, params));
             },
             py::arg("prompt"), "Input string",
diff --git a/src/python/py_tokenizer.cpp b/src/python/py_tokenizer.cpp
index b3c52cd28b..2ccccff4c0 100644
--- a/src/python/py_tokenizer.cpp
+++ b/src/python/py_tokenizer.cpp
@@ -30,9 +30,18 @@ void init_tokenizer(py::module_& m) {
         R"(openvino_genai.Tokenizer object is used to initialize Tokenizer
            if it's located in a different path than the main model.)")
 
-        .def(py::init([](const std::filesystem::path& tokenizer_path, const std::map<std::string, py::object>& properties) {
+        .def(py::init([](const std::filesystem::path& tokenizer_path, const std::map<std::string, py::object>& properties, const py::kwargs& kwargs) {
             ScopedVar env_manager(pyutils::ov_tokenizers_module_path());
-            return std::make_unique<ov::genai::Tokenizer>(tokenizer_path, pyutils::properties_to_any_map(properties));
+            auto kwargs_properties = pyutils::kwargs_to_any_map(kwargs);
+            if (properties.size()) {
+                PyErr_WarnEx(PyExc_DeprecationWarning, 
+                         "'properties' parameters is deprecated, please use kwargs to pass config properties instead.", 
+                         1);
+                auto map_properties = pyutils::properties_to_any_map(properties);
+                kwargs_properties.insert(map_properties.begin(), map_properties.end());
+            }
+
+            return std::make_unique<ov::genai::Tokenizer>(tokenizer_path, kwargs_properties);
         }), py::arg("tokenizer_path"), py::arg("properties") = ov::AnyMap({}))
 
         .def("encode", [](Tokenizer& tok, std::vector<std::string>& prompts, bool add_special_tokens) {
diff --git a/src/python/py_utils.cpp b/src/python/py_utils.cpp
index a2e8630059..579fe6b789 100644
--- a/src/python/py_utils.cpp
+++ b/src/python/py_utils.cpp
@@ -6,11 +6,15 @@
 #include <pybind11/pybind11.h>
 #include <pybind11/stl.h>
 #include <pybind11/stl_bind.h>
+#include <pybind11/functional.h>
 
 #include <openvino/runtime/auto/properties.hpp>
 
 #include "tokenizers_path.hpp"
 #include "openvino/genai/llm_pipeline.hpp"
+#include "openvino/genai/visual_language/pipeline.hpp"
+#include "openvino/genai/image_generation/generation_config.hpp"
+#include "openvino/genai/whisper_generation_config.hpp"
 
 namespace py = pybind11;
 namespace ov::genai::pybind::utils {
@@ -43,7 +47,7 @@ bool py_object_is_any_map(const py::object& py_obj) {
     });
 }
 
-ov::Any py_object_to_any(const py::object& py_obj);
+ov::Any py_object_to_any(const py::object& py_obj, std::string property_name);
 
 ov::AnyMap py_object_to_any_map(const py::object& py_obj) {
     OPENVINO_ASSERT(py_object_is_any_map(py_obj), "Unsupported attribute type.");
@@ -54,16 +58,34 @@ ov::AnyMap py_object_to_any_map(const py::object& py_obj) {
         if (py_object_is_any_map(value)) {
             return_value[key] = py_object_to_any_map(value);
         } else {
-            return_value[key] = py_object_to_any(value);
+            return_value[key] = py_object_to_any(value, key);
         }
     }
     return return_value;
 }
 
-ov::Any py_object_to_any(const py::object& py_obj) {
+ov::Any py_object_to_any(const py::object& py_obj, std::string property_name) {
     // Python types
+    // TODO: Remove this after ov::Any is fixed to allow pass types, that can be casted to target type. Ticket: 157622
+    std::set<std::string> size_t_properties = {
+        "max_new_tokens",
+        "max_length",
+        "min_new_tokens",
+        "logprobs",
+        "num_beam_groups",
+        "num_beams",
+        "num_return_sequences",
+        "no_repeat_ngram_size",
+        "top_k",
+        "rng_seed",
+        "num_assistant_tokens",
+        "max_initial_timestamp_index",
+        "num_images_per_prompt",
+        "num_inference_steps",
+        "max_sequence_length"
+    };
+
     py::object float_32_type = py::module_::import("numpy").attr("float32");
-    
     if (py::isinstance<py::str>(py_obj)) {
         return py_obj.cast<std::string>();
     } else if (py::isinstance<py::bool_>(py_obj)) {
@@ -71,16 +93,19 @@ ov::Any py_object_to_any(const py::object& py_obj) {
     } else if (py::isinstance<py::bytes>(py_obj)) {
         return py_obj.cast<std::string>();
     } else if (py::isinstance<py::float_>(py_obj)) {
-        return py_obj.cast<double>();
+        return py_obj.cast<float>();
     } else if (py::isinstance(py_obj, float_32_type)) {
         return py_obj.cast<float>();
     } else if (py::isinstance<py::int_>(py_obj)) {
+        if (size_t_properties.find(property_name) != size_t_properties.end()) {
+            return py_obj.cast<size_t>();
+        }
         return py_obj.cast<int64_t>();
     } else if (py::isinstance<py::none>(py_obj)) {
         return {};
     } else if (py::isinstance<py::list>(py_obj)) {
         auto _list = py_obj.cast<py::list>();
-        enum class PY_TYPE : int { UNKNOWN = 0, STR, INT, FLOAT, BOOL, PARTIAL_SHAPE };
+        enum class PY_TYPE : int { UNKNOWN = 0, STR, INT, FLOAT, BOOL, PARTIAL_SHAPE, TENSOR};
         PY_TYPE detected_type = PY_TYPE::UNKNOWN;
         for (const auto& it : _list) {
             auto check_type = [&](PY_TYPE type) {
@@ -88,7 +113,7 @@ ov::Any py_object_to_any(const py::object& py_obj) {
                     detected_type = type;
                     return;
                 }
-                OPENVINO_THROW("Incorrect attribute. Mixed types in the list are not allowed.");
+                OPENVINO_THROW("Incorrect value in \"" + property_name + "\". Mixed types in the list are not allowed.");
             };
             if (py::isinstance<py::str>(it)) {
                 check_type(PY_TYPE::STR);
@@ -100,6 +125,8 @@ ov::Any py_object_to_any(const py::object& py_obj) {
                 check_type(PY_TYPE::BOOL);
             } else if (py::isinstance<ov::PartialShape>(it)) {
                 check_type(PY_TYPE::PARTIAL_SHAPE);
+            } else if (py::isinstance<ov::Tensor>(it)) {
+                check_type(PY_TYPE::TENSOR);
             }
         }
 
@@ -117,10 +144,89 @@ ov::Any py_object_to_any(const py::object& py_obj) {
             return _list.cast<std::vector<bool>>();
         case PY_TYPE::PARTIAL_SHAPE:
             return _list.cast<std::vector<ov::PartialShape>>();
+        case PY_TYPE::TENSOR:
+            return _list.cast<std::vector<ov::Tensor>>();
+        default:
+            OPENVINO_THROW("Property \"" + property_name + "\" got unsupported type.");
+        }
+
+    } else if (py::isinstance<py::dict>(py_obj)) {
+        auto _dict = py_obj.cast<py::dict>();
+        enum class PY_TYPE : int { UNKNOWN = 0, STR, INT};
+        PY_TYPE detected_key_type = PY_TYPE::UNKNOWN;
+        PY_TYPE detected_value_type = PY_TYPE::UNKNOWN;
+        for (const auto& it : _dict) {
+            auto check_type = [&](PY_TYPE type, PY_TYPE& detected_type) {
+                if (detected_type == PY_TYPE::UNKNOWN || detected_type == type) {
+                    detected_type = type;
+                    return;
+                }
+                OPENVINO_THROW("Incorrect value in \"" + property_name + "\". Mixed types in the dict are not allowed.");
+            };
+            // check key type
+            if (py::isinstance<py::str>(it.first)) {
+                check_type(PY_TYPE::STR, detected_key_type);
+            }
+
+            // check value type
+            if (py::isinstance<py::int_>(it.second)) {
+                check_type(PY_TYPE::INT, detected_value_type);
+            }
+        }
+        if (_dict.empty()) {
+            return ov::Any();
+        }
+
+        switch (detected_key_type) {
+        case PY_TYPE::STR:
+            switch (detected_value_type) {
+            case PY_TYPE::INT:
+                return _dict.cast<std::map<std::string, int64_t>>();
+            default:
+                OPENVINO_THROW("Property \"" + property_name + "\" got unsupported type.");
+            }
+        default:
+            OPENVINO_THROW("Property \"" + property_name + "\" got unsupported type.");
+        }
+    } else if (py::isinstance<py::set>(py_obj)) {
+        auto _set = py_obj.cast<py::set>();
+        enum class PY_TYPE : int { UNKNOWN = 0, STR, INT, FLOAT, BOOL};
+        PY_TYPE detected_type = PY_TYPE::UNKNOWN;
+        for (const auto& it : _set) {
+            auto check_type = [&](PY_TYPE type) {
+                if (detected_type == PY_TYPE::UNKNOWN || detected_type == type) {
+                    detected_type = type;
+                    return;
+                }
+                OPENVINO_THROW("Incorrect value in \"" + property_name + "\". Mixed types in the set are not allowed.");
+            };
+            if (py::isinstance<py::str>(it)) {
+                check_type(PY_TYPE::STR);
+            } else if (py::isinstance<py::int_>(it)) {
+                check_type(PY_TYPE::INT);
+            } else if (py::isinstance<py::float_>(it)) {
+                check_type(PY_TYPE::FLOAT);
+            } else if (py::isinstance<py::bool_>(it)) {
+                check_type(PY_TYPE::BOOL);
+            }
+        }
+
+        if (_set.empty())
+            return ov::Any();
+
+        switch (detected_type) {
+        case PY_TYPE::STR:
+            return _set.cast<std::set<std::string>>();
+        case PY_TYPE::FLOAT:
+            return _set.cast<std::set<double>>();
+        case PY_TYPE::INT:
+            return _set.cast<std::set<int64_t>>();
+        case PY_TYPE::BOOL:
+            return _set.cast<std::set<bool>>();
         default:
-            OPENVINO_ASSERT(false, "Unsupported attribute type.");
+            OPENVINO_THROW("Property \"" + property_name + "\" got unsupported type.");
         }
-    
+
     // OV types
     } else if (py_object_is_any_map(py_obj)) {
         return py_object_to_any_map(py_obj);
@@ -156,18 +262,33 @@ ov::Any py_object_to_any(const py::object& py_obj) {
         return py::cast<ov::Output<ov::Node>>(py_obj);
     } else if (py::isinstance<ov::genai::SchedulerConfig>(py_obj)) {
         return py::cast<ov::genai::SchedulerConfig>(py_obj);
-    } else if (py::isinstance<ov::genai::AdapterConfig>(py_obj)) { 
+    } else if (py::isinstance<ov::genai::AdapterConfig>(py_obj)) {
         return py::cast<ov::genai::AdapterConfig>(py_obj);
+    } else if (py::isinstance<ov::genai::GenerationConfig>(py_obj)) {
+        return py::cast<ov::genai::GenerationConfig>(py_obj);
+    } else if (py::isinstance<ov::genai::ImageGenerationConfig>(py_obj)) {
+        return py::cast<ov::genai::ImageGenerationConfig>(py_obj);
+    } else if (py::isinstance<ov::genai::WhisperGenerationConfig>(py_obj)) {
+        return py::cast<ov::genai::WhisperGenerationConfig>(py_obj);
+    } else if (py::isinstance<ov::genai::StopCriteria>(py_obj)) {
+        return py::cast<ov::genai::StopCriteria>(py_obj);
+    } else if (py::isinstance<ov::genai::Generator>(py_obj)) {
+        return py::cast<std::shared_ptr<ov::genai::Generator>>(py_obj);
+    } else if (py::isinstance<py::function>(py_obj) && property_name == "callback") {
+        return py::cast<std::function<bool(size_t, ov::Tensor&)>>(py_obj);
+    } else if ((py::isinstance<py::function>(py_obj) || py::isinstance<ov::genai::StreamerBase>(py_obj) || py::isinstance<std::monostate>(py_obj)) && property_name == "streamer") {
+        auto streamer = py::cast<ov::genai::pybind::utils::PyBindStreamerVariant>(py_obj);
+        return ov::genai::streamer(pystreamer_to_streamer(streamer)).second;
     } else if (py::isinstance<py::object>(py_obj)) {
         return py_obj;
     }
-    OPENVINO_ASSERT(false, "Unsupported attribute type.");
+    OPENVINO_THROW("Property \"" + property_name + "\" got unsupported type.");
 }
 
 std::map<std::string, ov::Any> properties_to_any_map(const std::map<std::string, py::object>& properties) {
     std::map<std::string, ov::Any> properties_to_cpp;
     for (const auto& property : properties) {
-        properties_to_cpp[property.first] = py_object_to_any(property.second);
+        properties_to_cpp[property.first] = py_object_to_any(property.second, property.first);
     }
     return properties_to_cpp;
 }
@@ -179,11 +300,16 @@ ov::AnyMap kwargs_to_any_map(const py::kwargs& kwargs) {
     for (const auto& item : kwargs) {
         std::string key = py::cast<std::string>(item.first);
         py::object value = py::cast<py::object>(item.second);
-        if (utils::py_object_is_any_map(value)) {
+        // we need to unpack only dictionaries, which are passed with "config" name,
+        // because there are dictionary properties that should not be unpacked
+        if (utils::py_object_is_any_map(value) && key == "config") {
             auto map = utils::py_object_to_any_map(value);
             params.insert(map.begin(), map.end());
         } else {
-            params[key] = utils::py_object_to_any(value);
+            if (py::isinstance<py::none>(value)) {
+                OPENVINO_ASSERT(!py::isinstance<py::none>(value), "Property \"", key, "\" can't be None.");
+            }
+            params[key] = utils::py_object_to_any(value, key);
         }
 
     }
@@ -227,60 +353,9 @@ ov::genai::OptionalGenerationConfig update_config_from_kwargs(const ov::genai::O
     ov::genai::GenerationConfig res_config;
     if(config.has_value())
         res_config = *config;
- 
-    for (const auto& item : kwargs) {
-        std::string key = py::cast<std::string>(item.first);
-        py::object value = py::cast<py::object>(item.second);
-
-        if (item.second.is_none()) {
-            // Even if argument key name does not fit GenerationConfig name 
-            // it's not an error if it's not defined. 
-            // Some HF configs can have parameters for methods currently unsupported in ov_genai
-            // but if their values are not set / None, then this should not block 
-            // us from reading such configs, e.g. {"typical_p": None, 'top_p': 1.0,...}
-            return res_config;
-        }  
-        if (key == "max_new_tokens") {
-            res_config.max_new_tokens = py::cast<int>(item.second);
-        } else if (key == "max_length") {
-            res_config.max_length = py::cast<int>(item.second);
-        } else if (key == "ignore_eos") {
-            res_config.ignore_eos = py::cast<bool>(item.second);
-        } else if (key == "num_beam_groups") {
-            res_config.num_beam_groups = py::cast<int>(item.second);
-        } else if (key == "num_beams") {
-            res_config.num_beams = py::cast<int>(item.second);
-        } else if (key == "diversity_penalty") {
-            res_config.diversity_penalty = py::cast<float>(item.second);
-        } else if (key == "length_penalty") {
-            res_config.length_penalty = py::cast<float>(item.second);
-        } else if (key == "num_return_sequences") {
-            res_config.num_return_sequences = py::cast<int>(item.second);
-        } else if (key == "no_repeat_ngram_size") {
-            res_config.no_repeat_ngram_size = py::cast<int>(item.second);
-        } else if (key == "stop_criteria") {
-            res_config.stop_criteria = py::cast<StopCriteria>(item.second);
-        } else if (key == "temperature") {
-            res_config.temperature = py::cast<float>(item.second);
-        } else if (key == "top_p") {
-            res_config.top_p = py::cast<float>(item.second);
-        } else if (key == "top_k") {
-            res_config.top_k = py::cast<int>(item.second);
-        } else if (key == "do_sample") {
-            res_config.do_sample = py::cast<bool>(item.second);
-        } else if (key == "repetition_penalty") {
-            res_config.repetition_penalty = py::cast<float>(item.second);
-        } else if (key == "eos_token_id") {
-            res_config.set_eos_token_id(py::cast<int>(item.second));
-        } else if (key == "adapters") {
-            res_config.adapters = py::cast<ov::genai::AdapterConfig>(item.second);
-        } else {
-            throw(std::invalid_argument("'" + key + "' is incorrect GenerationConfig parameter name. "
-                                        "Use help(openvino_genai.GenerationConfig) to get list of acceptable parameters."));
-        }
-    }
-
+    res_config.update_generation_config(kwargs_to_any_map(kwargs));
     return res_config;
 }
 
+
 }  // namespace ov::genai::pybind::utils
diff --git a/src/python/py_utils.hpp b/src/python/py_utils.hpp
index 9213060660..20094196a6 100644
--- a/src/python/py_utils.hpp
+++ b/src/python/py_utils.hpp
@@ -28,7 +28,7 @@ py::list handle_utf8(const std::vector<std::string>& decoded_res);
 
 py::str handle_utf8(const std::string& text);
 
-ov::Any py_object_to_any(const py::object& py_obj);
+ov::Any py_object_to_any(const py::object& py_obj, std::string property_name);
 
 bool py_object_is_any_map(const py::object& py_obj);
 
diff --git a/src/python/py_vlm_pipeline.cpp b/src/python/py_vlm_pipeline.cpp
index 30e2e04a14..9572652204 100644
--- a/src/python/py_vlm_pipeline.cpp
+++ b/src/python/py_vlm_pipeline.cpp
@@ -72,46 +72,6 @@ py::object call_vlm_generate(
     return py::cast(pipe.generate(prompt, images, updated_config, streamer));
 }
 
-ov::AnyMap vlm_kwargs_to_any_map(const py::kwargs& kwargs, bool allow_compile_properties=true) {
-    ov::AnyMap params = {};
-
-    for (const auto& item : kwargs) {
-        std::string key = py::cast<std::string>(item.first);
-        py::object value = py::cast<py::object>(item.second);
-
-        if (key == "images") {
-            params.insert({ov::genai::images(std::move(py::cast<std::vector<ov::Tensor>>(value)))});
-        } else if (key == "image") {
-            params.insert({ov::genai::image(std::move(py::cast<ov::Tensor>(value)))});
-        } else if (key == "generation_config") {
-            params.insert({ov::genai::generation_config(std::move(py::cast<ov::genai::GenerationConfig>(value)))});
-        } else if (key == "streamer") {
-            auto py_streamer = py::cast<pyutils::PyBindStreamerVariant>(value);
-            params.insert({ov::genai::streamer(std::move(pyutils::pystreamer_to_streamer(py_streamer)))});
-
-        } 
-        else {
-            if (allow_compile_properties) {
-                // convert arbitrary objects to ov::Any
-                // not supported properties are not checked, as these properties are passed to compile(), which will throw exception in case of unsupported property
-                if (pyutils::py_object_is_any_map(value)) {
-                    auto map = pyutils::py_object_to_any_map(value);
-                    params.insert(map.begin(), map.end());
-                } else {
-                    params[key] = pyutils::py_object_to_any(value);
-                }
-            }
-            else {
-                // generate doesn't run compile(), so only VLMPipeline specific properties are allowed
-                throw(std::invalid_argument("'" + key + "' is unexpected parameter name. "
-                                        "Use help(openvino_genai.VLMPipeline.generate) to get list of acceptable parameters."));
-            }
-        }
-    }
-
-    return params;
-}
-
 void init_vlm_pipeline(py::module_& m) {
     py::class_<ov::genai::VLMPipeline>(m, "VLMPipeline", "This class is used for generation with VLMs")
         .def(py::init([](
@@ -120,7 +80,7 @@ void init_vlm_pipeline(py::module_& m) {
             const py::kwargs& kwargs
         ) {
             ScopedVar env_manager(pyutils::ov_tokenizers_module_path());
-            return std::make_unique<ov::genai::VLMPipeline>(models_path, device, vlm_kwargs_to_any_map(kwargs, true));
+            return std::make_unique<ov::genai::VLMPipeline>(models_path, device, pyutils::kwargs_to_any_map(kwargs));
         }),
         py::arg("models_path"), "folder with exported model files",
         py::arg("device"), "device on which inference will be done"
@@ -177,7 +137,7 @@ void init_vlm_pipeline(py::module_& m) {
                const std::string& prompt,
                const py::kwargs& kwargs
             )  -> py::typing::Union<ov::genai::DecodedResults> {
-                return py::cast(pipe.generate(prompt, vlm_kwargs_to_any_map(kwargs, false)));
+                return py::cast(pipe.generate(prompt, pyutils::kwargs_to_any_map(kwargs)));
             },
             py::arg("prompt"), "Input string",
             (vlm_generate_kwargs_docstring + std::string(" \n ")).c_str()
diff --git a/src/python/py_whisper_pipeline.cpp b/src/python/py_whisper_pipeline.cpp
index 3bf777f739..d34bd5f3b6 100644
--- a/src/python/py_whisper_pipeline.cpp
+++ b/src/python/py_whisper_pipeline.cpp
@@ -162,60 +162,7 @@ OptionalWhisperGenerationConfig update_whisper_config_from_kwargs(const Optional
     WhisperGenerationConfig res_config;
     if (config.has_value())
         res_config = *config;
-
-    for (const auto& item : kwargs) {
-        std::string key = py::cast<std::string>(item.first);
-        py::object value = py::cast<py::object>(item.second);
-
-        if (item.second.is_none()) {
-            // Even if argument key name does not fit GenerationConfig name
-            // it's not an error if it's not defined.
-            // Some HF configs can have parameters for methods currently unsupported in ov_genai
-            // but if their values are not set / None, then this should not block
-            // us from reading such configs, e.g. {"typical_p": None, 'top_p': 1.0,...}
-            return res_config;
-        }
-
-        if (key == "max_new_tokens") {
-            res_config.max_new_tokens = py::cast<int>(item.second);
-        } else if (key == "max_length") {
-            res_config.max_length = py::cast<int>(item.second);
-        } else if (key == "decoder_start_token_id") {
-            res_config.decoder_start_token_id = py::cast<int>(item.second);
-        } else if (key == "pad_token_id") {
-            res_config.pad_token_id = py::cast<int>(item.second);
-        } else if (key == "translate_token_id") {
-            res_config.translate_token_id = py::cast<int>(item.second);
-        } else if (key == "transcribe_token_id") {
-            res_config.transcribe_token_id = py::cast<int>(item.second);
-        } else if (key == "no_timestamps_token_id") {
-            res_config.no_timestamps_token_id = py::cast<int>(item.second);
-        } else if (key == "max_initial_timestamp_index") {
-            res_config.max_initial_timestamp_index = py::cast<size_t>(item.second);
-        } else if (key == "begin_suppress_tokens") {
-            res_config.begin_suppress_tokens = py::cast<std::vector<int64_t>>(item.second);
-        } else if (key == "suppress_tokens") {
-            res_config.suppress_tokens = py::cast<std::vector<int64_t>>(item.second);
-        } else if (key == "is_multilingual") {
-            res_config.is_multilingual = py::cast<bool>(item.second);
-        } else if (key == "language") {
-            res_config.language = py::cast<std::string>(item.second);
-        } else if (key == "lang_to_id") {
-            res_config.lang_to_id = py::cast<std::map<std::string, int64_t>>(item.second);
-        } else if (key == "task") {
-            res_config.task = py::cast<std::string>(item.second);
-        } else if (key == "return_timestamps") {
-            res_config.return_timestamps = py::cast<bool>(item.second);
-        } else if (key == "eos_token_id") {
-            res_config.set_eos_token_id(py::cast<int>(item.second));
-        } else {
-            throw(std::invalid_argument(
-                "'" + key +
-                "' is incorrect WhisperGenerationConfig parameter name. "
-                "Use help(openvino_genai.WhisperGenerationConfig) to get list of acceptable parameters."));
-        }
-    }
-
+    res_config.update_generation_config(pyutils::kwargs_to_any_map(kwargs));
     return res_config;
 }
 
diff --git a/tests/cpp/utils.cpp b/tests/cpp/utils.cpp
new file mode 100644
index 0000000000..d00edae6fb
--- /dev/null
+++ b/tests/cpp/utils.cpp
@@ -0,0 +1,21 @@
+// Copyright (C) 2018-2024 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include <gtest/gtest.h>
+#include "utils.hpp"
+
+
+using namespace ov::genai::utils;
+using map_type = std::map<std::string, int64_t>;
+
+TEST(TestIsContainer, test_is_container) {
+    EXPECT_EQ(is_container<int>, false);
+    EXPECT_EQ(is_container<int64_t>, false);
+    EXPECT_EQ(is_container<float>, false);
+    EXPECT_EQ(is_container<size_t>, false);
+    EXPECT_EQ(is_container<std::string>, true);
+    EXPECT_EQ(is_container<std::vector<float>>, true);
+    EXPECT_EQ(is_container<map_type>, true);
+    EXPECT_EQ(is_container<std::set<int64_t>>, true);
+}
\ No newline at end of file
diff --git a/tests/python_tests/test_generate_api.py b/tests/python_tests/test_generate_api.py
index ba934e3bda..80df79f31b 100644
--- a/tests/python_tests/test_generate_api.py
+++ b/tests/python_tests/test_generate_api.py
@@ -38,7 +38,7 @@ def run_hf_ov_genai_comparison_batched(model_descr, generation_config: Dict, pro
         # Need to set explicitly to False, but only if test arguments omitted this arg.
         # Do not apply 'repetition_penalty' if sampling is not used.
         config['do_sample'] = False
-        config['repetition_penalty'] = None
+        config['repetition_penalty'] = 1.0 # 1.0 means no penalty
     
     generation_config_hf = config.copy()
     if generation_config_hf.get('stop_criteria'):
@@ -78,7 +78,7 @@ def run_hf_ov_genai_comparison(model_descr, generation_config: Dict, prompt: str
         # Need to set explicitly to False, but only if test arguments omitted this arg.
         # Do not apply 'repetition_penalty' if sampling is not used.
         config['do_sample'] = False
-        config['repetition_penalty'] = None
+        config['repetition_penalty'] = 1.0 # 1.0 means no penalty
 
     generation_config_hf = config.copy()
     if generation_config_hf.get('stop_criteria'):
@@ -117,7 +117,7 @@ def hf_ov_genai_tensors_comparison(
         # Need to set explicitly to False, but only if test arguments omitted this arg.
         # Do not apply 'repetition_penalty' if sampling is not used.
         config['do_sample'] = False
-        config['repetition_penalty'] = None
+        config['repetition_penalty'] = 1.0 # 1.0 means no penalty
     
     generation_config_hf = config.copy()
     if generation_config_hf.get('stop_criteria'):
@@ -635,7 +635,8 @@ def test_valid_configs(model_tmp_path):
 
 invalid_py_configs = [
     dict(num_beam_groups=3, num_beams=15, do_sample=True),
-    dict(unexisting_key_name=True),  # no eos_token_id no max_new_tokens, no max_len
+    # TODO: Currently unexpected params do not cause exceptions. Need to implement it in c++ and return this test
+  #  dict(unexisting_key_name=True),  # no eos_token_id no max_new_tokens, no max_len
     dict(eos_token_id=42, ignore_eos=True),  # no max_new_tokens, no max_len with ignore_eos
     dict(repetition_penalty=-1.0, eos_token_id=42, max_new_tokens=20), # invalid penalty
     dict(temperature=-1.0, do_sample=True, eos_token_id=42, max_new_tokens=20), # invalid temp
@@ -763,7 +764,7 @@ def run_perf_metrics_collection(model_descr, generation_config: Dict, prompt: st
         # Need to set explicitly to False, but only if test arguments omitted this arg.
         # Do not apply 'repetition_penalty' if sampling is not used.
         config['do_sample'] = False
-        config['repetition_penalty'] = None
+        config['repetition_penalty'] = 1.0 # 1.0 means no penalty
     return pipe.generate([prompt], **config).perf_metrics
 
 
diff --git a/tests/python_tests/test_sampling.py b/tests/python_tests/test_sampling.py
index 9973e20e1d..9aa6931d85 100644
--- a/tests/python_tests/test_sampling.py
+++ b/tests/python_tests/test_sampling.py
@@ -334,7 +334,7 @@ def test_echo_without_completion(tmp_path, get_generation_config, max_num_batche
     model_path : Path = tmp_path / model_id
     save_ov_model_from_optimum(model, hf_tokenizer, model_path)
 
-    pipe = ContinuousBatchingPipeline(model_path.absolute().as_posix(), Tokenizer(model_path.absolute().as_posix(), {}), scheduler_config, "CPU", {})
+    pipe = ContinuousBatchingPipeline(model_path.absolute().as_posix(), Tokenizer(model_path.absolute().as_posix()), scheduler_config, "CPU", {})
 
     outputs = pipe.generate(["What is OpenVINO?"], generation_configs)
     assert(len(outputs))
@@ -361,7 +361,7 @@ def test_echo_with_completion(tmp_path, get_generation_config, max_num_batched_t
     model_path : Path = tmp_path / model_id
     save_ov_model_from_optimum(model, hf_tokenizer, model_path)
 
-    pipe = ContinuousBatchingPipeline(model_path.absolute().as_posix(), Tokenizer(model_path.absolute().as_posix(), {}), scheduler_config, "CPU", {})
+    pipe = ContinuousBatchingPipeline(model_path.absolute().as_posix(), Tokenizer(model_path.absolute().as_posix()), scheduler_config, "CPU", {})
 
     outputs = pipe.generate(["What is OpenVINO?"], generation_configs)
     assert(len(outputs))
@@ -389,7 +389,7 @@ def test_post_oom_health(tmp_path, sampling_config):
     models_path : Path = tmp_path / model_id
     save_ov_model_from_optimum(model, hf_tokenizer, models_path)
 
-    pipe = ContinuousBatchingPipeline(models_path.absolute().as_posix(), Tokenizer(models_path.absolute().as_posix(), {}), scheduler_config, "CPU", {})
+    pipe = ContinuousBatchingPipeline(models_path.absolute().as_posix(), Tokenizer(models_path.absolute().as_posix()), scheduler_config, "CPU", {})
     # First run should return incomplete response
     output = pipe.generate(["What is OpenVINO?"], generation_configs)
     assert (len(output))

From 89865c3e3856abec5fe6b7896a5e42cb81f5ff75 Mon Sep 17 00:00:00 2001
From: Helena Kloosterman <helena.kloosterman@intel.com>
Date: Thu, 21 Nov 2024 07:28:53 +0100
Subject: [PATCH 03/24] Update Python VLM example in README (#1178)

Existing example uses an undefined "read_image" function, and using
max_new_tokens in pipe.generate() resulted in an error with latest
nightly.

I updated the example to work out of the box. Makes it a bit longer, but
this section is hidden by default in the README, so it doesn't add to
visual clutter for people just visiting the repo. Also added links to
the relevant samples.

---------

Co-authored-by: Vladimir Zlobin <vladimir.zlobin@intel.com>
---
 README.md | 25 +++++++++++++++++++++----
 1 file changed, 21 insertions(+), 4 deletions(-)

diff --git a/README.md b/README.md
index fe18205028..c1217a0215 100644
--- a/README.md
+++ b/README.md
@@ -117,17 +117,34 @@ optimum-cli export openvino --model openbmb/MiniCPM-V-2_6 --trust-remote-code --
 
 ### Run generation using VLMPipeline API in Python
 
+See [Visual Language Chat](https://github.com/openvinotoolkit/openvino.genai/tree/master/samples/python/visual_language_chat) for a demo application.
+
+Run the following command to download a sample image:
+
+```sh
+curl -O "https://storage.openvinotoolkit.org/test_data/images/dog.jpg"
+```
+
 ```python
+import numpy as np
+import openvino as ov
 import openvino_genai as ov_genai
-#Will run model on CPU, GPU is a possible option
+from PIL import Image
+
+# Choose GPU instead of CPU in the line below to run the model on Intel integrated or discrete GPU
 pipe = ov_genai.VLMPipeline("./MiniCPM-V-2_6/", "CPU")
-rgb = read_image("cat.jpg")
-print(pipe.generate(prompt, image=rgb, max_new_tokens=100))
+
+image = Image.open("dog.jpg")
+image_data = np.array(image.getdata()).reshape(1, image.size[1], image.size[0], 3).astype(np.uint8)
+image_data = ov.Tensor(image_data)  
+
+prompt = "Can you describe the image?"
+print(pipe.generate(prompt, image=image_data, max_new_tokens=100))
 ```
 
 ### Run generation using VLMPipeline in C++
 
-Code below requires installation of C++ compatible package (see [here](https://docs.openvino.ai/2024/get-started/install-openvino/install-openvino-genai.html#archive-installation) for more details)
+Code below requires installation of C++ compatible package (see [here](https://docs.openvino.ai/2024/get-started/install-openvino/install-openvino-genai.html#archive-installation) for more details). See [Visual Language Chat](https://github.com/openvinotoolkit/openvino.genai/tree/master/samples/cpp/visual_language_chat) for a demo application.
 
 ```cpp
 #include "load_image.hpp"

From 799454f5731518e795193721a77b44c95b45fb0f Mon Sep 17 00:00:00 2001
From: Vladimir Zlobin <vladimir.zlobin@intel.com>
Date: Thu, 21 Nov 2024 13:59:58 +0400
Subject: [PATCH 04/24] Install deployment and export requirements.txt (#1231)
 (#1241)

Ticket 157649

Co-authored-by: Ilya Lavrenov <ilya.lavrenov@intel.com>
---
 samples/CMakeLists.txt | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/samples/CMakeLists.txt b/samples/CMakeLists.txt
index 229eccb3fe..860ced072b 100644
--- a/samples/CMakeLists.txt
+++ b/samples/CMakeLists.txt
@@ -14,8 +14,12 @@ add_subdirectory(cpp/text2image)
 add_subdirectory(cpp/visual_language_chat)
 add_subdirectory(cpp/whisper_speech_recognition)
 
-install(FILES requirements.txt DESTINATION samples
-        COMPONENT cpp_samples_genai)
+install(FILES
+        deployment-requirements.txt
+        export-requirements.txt
+        requirements.txt
+    DESTINATION samples
+    COMPONENT cpp_samples_genai)
 
 install(DIRECTORY
             cpp/beam_search_causal_lm

From 5d5fe7512398778681e0e2d2f5325e9c7995a7d0 Mon Sep 17 00:00:00 2001
From: Vladimir Zlobin <vladimir.zlobin@intel.com>
Date: Thu, 21 Nov 2024 18:08:22 +0400
Subject: [PATCH 05/24] Allow missing OpenVINODeveloperPackage (#1243)

Compiliung GenAI against ov archives prints Warning: Please, install
pybind11-stubgen==2.5.1 otherwise
---
 src/python/CMakeLists.txt | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/src/python/CMakeLists.txt b/src/python/CMakeLists.txt
index 898e18b895..25d81277d6 100644
--- a/src/python/CMakeLists.txt
+++ b/src/python/CMakeLists.txt
@@ -182,12 +182,14 @@ if(pybind11_stubgen_AVAILABLE)
         VERBATIM)
 
     add_custom_target(${TARGET_NAME}_stub ALL DEPENDS ${output_file})
-else()
+elseif(OpenVINODeveloperPackage_FOUND)
     # Produce warning message at build time as well
     add_custom_command(OUTPUT pybind11_stub_gen_not_found.txt
         COMMAND ${CMAKE_COMMAND}
             -E cmake_echo_color --red "Warning: Please, install ${pybind11_stubgen_dep}")
     add_custom_target(${TARGET_NAME}_stub ALL DEPENDS pybind11_stub_gen_not_found.txt)
+else()
+    add_custom_target(${TARGET_NAME}_stub ALL)
 endif()
 
 add_dependencies(${TARGET_NAME}_stub ${TARGET_NAME})

From ac7d39ffe66b04a52df69ad7950b4d7963d7f681 Mon Sep 17 00:00:00 2001
From: Pavel Esir <pavel.esir@intel.com>
Date: Thu, 21 Nov 2024 22:08:05 +0100
Subject: [PATCH 06/24] parametrize decode in Tokenizers

---
 src/cpp/include/openvino/genai/tokenizer.hpp  | 45 +++++++++-
 .../src/make_combine_segments_stateful.cpp    | 44 ++++++++++
 .../src/make_combine_segments_stateful.hpp    | 37 +++++++++
 src/cpp/src/tokenizer.cpp                     | 82 +++++++++++--------
 src/python/py_tokenizer.cpp                   | 24 ++++--
 tests/python_tests/test_chat_generate_api.py  | 21 +++++
 6 files changed, 208 insertions(+), 45 deletions(-)

diff --git a/src/cpp/include/openvino/genai/tokenizer.hpp b/src/cpp/include/openvino/genai/tokenizer.hpp
index bcb8da68a3..e90e9c80de 100644
--- a/src/cpp/include/openvino/genai/tokenizer.hpp
+++ b/src/cpp/include/openvino/genai/tokenizer.hpp
@@ -87,23 +87,59 @@ class OPENVINO_GENAI_EXPORTS Tokenizer {
     /**
     * @brief decode sequence of tokens
     * @param tokens vector storing tokens
+    * @param tokenization_params AnyMap with detokenization parameters, e.g. {'skip_special_tokens', false}
     * @return sequence string
     */
-    std::string decode(std::vector<int64_t> tokens);
-    
+    std::string decode(std::vector<int64_t> tokens, const ov::AnyMap& detokenization_params = {});
+
+    /**
+    * @brief decode sequence of tokens
+    * @param tokens vector storing tokens
+    * @param tokenization_params detokenization parameters,  e.g. ov::genai::skip_special_tokens(true)
+    * @return sequence string
+    */
+    template <typename... Properties>
+    util::EnableIfAllStringAny<std::string, Properties...> decode(std::vector<int64_t>& tokens, Properties&&... properties) {
+        return decode(tokens, AnyMap{std::forward<Properties>(properties)...});
+    }
+
     /**
     * @brief decode tokens. 
     * @param tokens ov::Tensor with tokens with shape [batch_size, seq_len]
+    * @param tokenization_params AnyMap with detokenization parameters, e.g. {'skip_special_tokens', false}
+    * @return vector of std::string, with size = batch_size
+    */
+    std::vector<std::string> decode(ov::Tensor tokens, const ov::AnyMap& detokenization_params = {});
+
+    /**
+    * @brief decode sequence of tokens
+    * @param tokens ov::Tensor with tokens with shape [batch_size, seq_len]
+    * @param tokenization_params detokenization parameters,  e.g. ov::genai::skip_special_tokens(true)
     * @return vector of std::string, with size = batch_size
     */
-    std::vector<std::string> decode(ov::Tensor tokens);
+    template <typename... Properties>
+    util::EnableIfAllStringAny<std::vector<std::string>, Properties...> decode(ov::Tensor tokens, Properties&&... properties) {
+        return decode(tokens, AnyMap{std::forward<Properties>(properties)...});
+    }
 
     /**
     * @brief batched decoding of tokens. 
     * @param tokens vector of vectors with tokens, tokens.size() is equal to batch_size
+    * @param tokenization_params AnyMap with detokenization parameters, e.g. {'skip_special_tokens', false}
     * @return vector of std::string, with size equal to batch_size
     */
-    std::vector<std::string> decode(std::vector<std::vector<int64_t>> tokens);
+    std::vector<std::string> decode(std::vector<std::vector<int64_t>> tokens, const ov::AnyMap& detokenization_params = {});
+
+    /**
+    * @brief decode sequence of tokens
+    * @param tokens ov::Tensor with tokens with shape [batch_size, seq_len]
+    * @param tokenization_params detokenization parameters,  e.g. ov::genai::skip_special_tokens(true)
+    * @return vector of std::string, with size = batch_size
+    */
+    template <typename... Properties>
+    util::EnableIfAllStringAny<std::vector<std::string>, Properties...> decode(std::vector<std::vector<int64_t>> tokens, Properties&&... properties) {
+        return decode(tokens, AnyMap{std::forward<Properties>(properties)...});
+    }
 
     /**
      * @brief Embeds input prompts with special tags for a chat scenario.
@@ -143,6 +179,7 @@ class OPENVINO_GENAI_EXPORTS Tokenizer {
 };
 
 static constexpr ov::Property<bool> add_special_tokens{"add_special_tokens"};
+static constexpr ov::Property<bool> skip_special_tokens{"skip_special_tokens"};
 
 }  // namespace genai
 }  // namespace ov
diff --git a/src/cpp/src/make_combine_segments_stateful.cpp b/src/cpp/src/make_combine_segments_stateful.cpp
index 2285c172dc..26c58b8fca 100644
--- a/src/cpp/src/make_combine_segments_stateful.cpp
+++ b/src/cpp/src/make_combine_segments_stateful.cpp
@@ -4,6 +4,8 @@
 #include "make_combine_segments_stateful.hpp"
 #include "openvino/op/constant.hpp"
 #include "openvino/op/select.hpp"
+#include "openvino/op/slice.hpp"
+#include "openvino/op/multiply.hpp"
 #include "openvino/op/read_value.hpp"
 #include "openvino/op/assign.hpp"
 
@@ -44,3 +46,45 @@ bool ov::genai::MakeCombineSegmentsSatateful::run_on_model(const std::shared_ptr
     model->add_variables({variable});
     return true;
 }
+
+bool ov::genai::MakeVocabDecoderSatateful::run_on_model(const std::shared_ptr<ov::Model>& model) {
+
+    std::shared_ptr<ov::Node> vocab_decoder_node;
+    for (auto node: model->get_ordered_ops()) {
+        if (strcmp(node->get_type_info().name, "VocabDecoder") == 0) {
+            vocab_decoder_node = node;
+        }
+    }
+    auto val = vocab_decoder_node->input_value(4);
+    auto val_type = vocab_decoder_node->input_value(4).get_element_type();
+
+    if (!vocab_decoder_node || !vocab_decoder_node->input_value(4).get_element_type().is_integral_number()) {
+        return false;
+    }
+    
+    std::shared_ptr<v0::Constant> skip_tokens_const = std::dynamic_pointer_cast<v0::Constant>(vocab_decoder_node->get_input_node_shared_ptr(4));
+    if (!skip_tokens_const) {
+        return false;
+    }
+
+
+    auto start_const = std::make_shared<v0::Constant>(ov::element::i32, ov::Shape{1}, std::vector{0});
+    auto int_max_const = std::make_shared<v0::Constant>(ov::element::i32, ov::Shape{1}, std::vector{std::numeric_limits<int>::max()});
+    auto one_const = std::make_shared<v0::Constant>(ov::element::i32, ov::Shape{1}, std::vector{1});
+    
+    // By default, INT_MAX will multiply with 1 and all skip_tokens will be selected.
+    op::util::VariableInfo var_info{ov::Shape{1}, ov::element::i32, SKIP_SPECIAL_TOKENS_VAR_ID};
+    auto variable = std::make_shared<op::util::Variable>(var_info);
+    auto read_value = std::make_shared<v6::ReadValue>(one_const, variable);
+    // if flag is set, then slice up to the int_max which means skip all tokens.
+    auto stop = std::make_shared<v1::Multiply>(int_max_const, read_value);
+
+    std::shared_ptr<v8::Slice> slice_node = std::make_shared<v8::Slice>(skip_tokens_const, start_const, stop, one_const);
+    
+    vocab_decoder_node->input(4).replace_source_output(slice_node->output(0));
+
+    auto assign = std::make_shared<v6::Assign>(read_value, variable);
+    model->add_sinks({assign});
+    model->add_variables({variable});
+    return true;
+}
\ No newline at end of file
diff --git a/src/cpp/src/make_combine_segments_stateful.hpp b/src/cpp/src/make_combine_segments_stateful.hpp
index 6365497140..307c6199c8 100644
--- a/src/cpp/src/make_combine_segments_stateful.hpp
+++ b/src/cpp/src/make_combine_segments_stateful.hpp
@@ -38,7 +38,44 @@ class MakeCombineSegmentsSatateful : public ov::pass::ModelPass {
     bool run_on_model(const std::shared_ptr<ov::Model>& model) override;
 };
 
+/** 
+ * @brief This pass modifies tokenizer ov::Model so that special tokens adding will be
+ *  enabled or disabled depending on stateful value.
+ *                                          
+ *                                  +--------------+
+ *                                  |  DefaultMode |
+ *                                  +--------------+
+ *                                         |
+ *                                         v
+ *                                  +------------+   +-----------+
+ *                                  |  ReadValue |   |  INT_MAX  |
+ *                                  +------------+   +-----------+
+ *                                          \           /
+ *                                           \         /
+ *                                            v       v
+ *   +--------------------+     +---------+  +---------+
+ *   |  Const with tokens |     |  start  |  |   Mul   |
+ *   +--------------------+     +---------+  +---------+
+ *                         \          |          /
+ *                           \        |         /
+ *                             v      v        v
+ *                            +-----------------+
+ *                            |      Slice      |
+ *                            +-----------------+
+ *                                     |
+ *                                     v
+ *                          +----------------------+
+ *                          |     VocabDecoder     |
+ *                          +----------------------+
+**/
+class MakeVocabDecoderSatateful : public ov::pass::ModelPass {
+public:
+    OPENVINO_RTTI("MakeVocabDecoderSatateful", "0");
+    bool run_on_model(const std::shared_ptr<ov::Model>& model) override;
+};
+
 const std::string ADD_SPECIAL_TOKENS_VAR_ID = "add_special_tokens";
+const std::string SKIP_SPECIAL_TOKENS_VAR_ID = "skip_special_tokens";
 
 } // namespace genai
 } // namespace ov
diff --git a/src/cpp/src/tokenizer.cpp b/src/cpp/src/tokenizer.cpp
index f52417a94e..78b94915dd 100644
--- a/src/cpp/src/tokenizer.cpp
+++ b/src/cpp/src/tokenizer.cpp
@@ -10,6 +10,7 @@
 #include <jinja2cpp/generic_list.h>
 #include <jinja2cpp/generic_list_iterator.h>
 
+#include "openvino/pass/visualize_tree.hpp"
 #include "openvino/pass/manager.hpp"
 #include "openvino/runtime/core.hpp"
 #include "openvino/genai/tokenizer.hpp"
@@ -73,7 +74,8 @@ class Tokenizer::TokenizerImpl {
     std::unique_ptr<CircularBufferQueue<ov::InferRequest>> m_ireq_queue_detokenizer;
     // To change the adding special tokens mode we use a statefull subgraph, 
     // this flag holds the current state value of the CompiledModel.
-    bool m_add_special_tokens = true;  
+    bool m_add_special_tokens = true;
+    bool m_skip_special_tokens = false;
     bool m_older_than_24_5 = false;
     
     int64_t m_pad_token_id = -1;
@@ -86,11 +88,16 @@ class Tokenizer::TokenizerImpl {
 
     std::string m_chat_template = {};
 
-    void set_state_if_necessary(CircularBufferQueueElementGuard<ov::InferRequest>& infer_request_guard, bool add_special_tokens) {
+    void set_state_if_necessary(CircularBufferQueueElementGuard<ov::InferRequest>& infer_request_guard, const ov::AnyMap& params) {
+        bool add_special_tokens_flag = true;
+        bool skip_special_tokens_flag = false;
+        ov::genai::utils::read_anymap_param(params, add_special_tokens.name(), add_special_tokens_flag);
+        ov::genai::utils::read_anymap_param(params, skip_special_tokens.name(), skip_special_tokens_flag);
+
         // If user requested add_special_tokens mode different from the current one,
         // need to set state variable.
         // If requested mode matches the stored state set, then don't touch states.
-        if (add_special_tokens == m_add_special_tokens) {
+        if (add_special_tokens_flag == m_add_special_tokens && skip_special_tokens_flag == m_skip_special_tokens) {
             return;
         }
         if (m_older_than_24_5) {
@@ -100,19 +107,23 @@ class Tokenizer::TokenizerImpl {
             return;
         }
         
-        // auto states = m_ireq_queue_tokenizer->get(0).query_state();
+        // add_special_tokens is managed by Select op with a bool input.
         ov::Tensor add_special_tensor = ov::Tensor(ov::element::boolean, {});
-        *add_special_tensor.data<bool>() = add_special_tokens;
+        *add_special_tensor.data<bool>() = add_special_tokens_flag;
+        
+        // skip_special_tokens is managed by multiplication with a number, therefore i32.
+        ov::Tensor skip_special_tensor = ov::Tensor(ov::element::i32, {1});
+        *skip_special_tensor.data<int>() = skip_special_tokens_flag;
 
         for (auto& state: infer_request_guard.get().query_state()) {
-            if (state.get_name().find(ov::genai::ADD_SPECIAL_TOKENS_VAR_ID) == std::string::npos) {
-                // It's not add_special_tokens flag state.
-                continue;
+            if (state.get_name().find(ov::genai::ADD_SPECIAL_TOKENS_VAR_ID) != std::string::npos) {
+                state.set_state(add_special_tensor);
+            } else if (state.get_name().find(ov::genai::SKIP_SPECIAL_TOKENS_VAR_ID) != std::string::npos) {
+                state.set_state(skip_special_tensor);
             }
-            state.set_state(add_special_tensor);
-            break;            
         }
-        m_add_special_tokens = add_special_tokens;
+        m_add_special_tokens = add_special_tokens_flag;
+        m_skip_special_tokens = skip_special_tokens_flag;
     }
 
     TokenizerImpl() = default;
@@ -135,15 +146,25 @@ class Tokenizer::TokenizerImpl {
 
         auto device = "CPU"; // currently openvino_tokenizer supports only CPU
         auto ov_tokenizer = core.read_model(tokenizer_path / "openvino_tokenizer.xml");
+        std::shared_ptr<ov::Model> ov_detokenizer;
+        if (std::filesystem::exists(tokenizer_path / "openvino_detokenizer.xml")) {
+            ov_detokenizer = core.read_model(tokenizer_path / "openvino_detokenizer.xml");
+        }
         m_older_than_24_5 = ov_tokenizer->get_rt_info().count("openvino_tokenizers_version") != 1;
         
-        ov::pass::Manager manager;
-        manager.register_pass<MakeCombineSegmentsSatateful>();
-        manager.run_passes(ov_tokenizer);
+        ov::pass::Manager manager_tok;
+        manager_tok.register_pass<MakeCombineSegmentsSatateful>();
+        manager_tok.run_passes(ov_tokenizer);
+        
+        ov::pass::Manager manager_detok;
+        manager_detok.register_pass<ov::pass::VisualizeTree>("before.svg");
+        manager_detok.register_pass<MakeVocabDecoderSatateful>();
+        manager_detok.register_pass<ov::pass::VisualizeTree>("after.svg");
+        manager_detok.run_passes(ov_detokenizer);
         
         m_tokenizer = core.compile_model(ov_tokenizer, device, properties);
         if (std::filesystem::exists(tokenizer_path / "openvino_detokenizer.xml")) {
-            m_detokenizer = core.compile_model(tokenizer_path / "openvino_detokenizer.xml", device, properties);
+            m_detokenizer = core.compile_model(ov_detokenizer, device, properties);
         }
 
         
@@ -298,11 +319,8 @@ class Tokenizer::TokenizerImpl {
     }
 
     TokenizedInputs encode(std::string prompt, const ov::AnyMap& tokenization_params = {}) {
-        bool add_special_tokens_flag = true;
-        ov::genai::utils::read_anymap_param(tokenization_params, add_special_tokens.name(), add_special_tokens_flag);
-
         CircularBufferQueueElementGuard<ov::InferRequest> infer_request_guard(this->m_ireq_queue_tokenizer.get());
-        set_state_if_necessary(infer_request_guard, add_special_tokens_flag);
+        set_state_if_necessary(infer_request_guard, tokenization_params);
         size_t batch_size = 1;
         infer_request_guard.get().set_input_tensor(ov::Tensor{ov::element::string, {batch_size}, &prompt});
         infer_request_guard.get().start_async();
@@ -316,11 +334,8 @@ class Tokenizer::TokenizerImpl {
     TokenizedInputs encode(std::vector<std::string>& prompts, const ov::AnyMap& tokenization_params = {}) {
         TokenizedInputs unpadded;
         {
-            bool add_special_tokens_flag = true;
-            ov::genai::utils::read_anymap_param(tokenization_params, add_special_tokens.name(), add_special_tokens_flag);
-
             CircularBufferQueueElementGuard<ov::InferRequest> infer_request_guard(this->m_ireq_queue_tokenizer.get());
-            set_state_if_necessary(infer_request_guard, add_special_tokens_flag);
+            set_state_if_necessary(infer_request_guard, tokenization_params);
             infer_request_guard.get().set_input_tensor(ov::Tensor{ov::element::string, {prompts.size()}, prompts.data()});
             auto size_ = infer_request_guard.get().get_input_tensor().get_shape();
             infer_request_guard.get().start_async();
@@ -343,10 +358,11 @@ class Tokenizer::TokenizerImpl {
         return {input_ids_, attention_mask_};
     }
 
-    std::string decode(std::vector<int64_t> tokens) {
+    std::string decode(std::vector<int64_t> tokens, const ov::AnyMap& detokenization_params = {}) {
         OPENVINO_ASSERT(m_detokenizer, "Detokenize model has not been provided. Tokenizer::decode is not available");
 
         CircularBufferQueueElementGuard<ov::InferRequest> infer_request_guard(this->m_ireq_queue_detokenizer.get());
+        set_state_if_necessary(infer_request_guard, detokenization_params);
         size_t batch_size = 1;
         infer_request_guard.get().set_input_tensor(ov::Tensor{ov::element::i64, {batch_size, tokens.size()}, tokens.data()});
         infer_request_guard.get().start_async();
@@ -354,12 +370,13 @@ class Tokenizer::TokenizerImpl {
         return infer_request_guard.get().get_output_tensor().data<std::string>()[0];
     }
 
-    std::vector<std::string> decode(ov::Tensor tokens) {
+    std::vector<std::string> decode(ov::Tensor tokens, const ov::AnyMap& detokenization_params = {}) {
         OPENVINO_ASSERT(m_detokenizer, "Detokenize model has not been provided. Tokenizer::decode is not available");
         OPENVINO_ASSERT(tokens.get_element_type() == ov::element::i64, "tokens tensor element type should be an i64");
         OPENVINO_ASSERT(tokens.get_shape().size() == 2, "tokens tensor should of rank 2 with shape [batch_size, seq_len]");
 
         CircularBufferQueueElementGuard<ov::InferRequest> infer_request_guard(this->m_ireq_queue_detokenizer.get());
+        set_state_if_necessary(infer_request_guard, detokenization_params);
         infer_request_guard.get().set_input_tensor(tokens);
         infer_request_guard.get().start_async();
         infer_request_guard.get().wait();
@@ -369,7 +386,7 @@ class Tokenizer::TokenizerImpl {
         return std::vector<std::string>(res_data, res_data + res.get_shape()[0]);
     }
 
-    std::vector<std::string> decode(std::vector<std::vector<int64_t>> lines) {
+    std::vector<std::string> decode(std::vector<std::vector<int64_t>> lines, const ov::AnyMap& detokenization_params = {}) {
         OPENVINO_ASSERT(m_detokenizer, "Detokenize model has not been provided. Tokenizer::decode is not available");
 
         auto compare_lengths = [](const std::vector<int64_t>& a, const std::vector<int64_t>& b) {
@@ -388,6 +405,7 @@ class Tokenizer::TokenizerImpl {
         }
 
         CircularBufferQueueElementGuard<ov::InferRequest> infer_request_guard(this->m_ireq_queue_detokenizer.get());
+        set_state_if_necessary(infer_request_guard, detokenization_params);
         infer_request_guard.get().set_input_tensor(tokens);
         infer_request_guard.get().start_async();
         infer_request_guard.get().wait();
@@ -517,16 +535,16 @@ TokenizedInputs Tokenizer::encode(std::initializer_list<std::string>& text, cons
     return encode(std::vector<std::string>(text.begin(), text.end()), tokenization_params);
 }
 
-std::string Tokenizer::decode(std::vector<int64_t> tokens) {
-    return m_pimpl->decode(tokens);
+std::string Tokenizer::decode(std::vector<int64_t> tokens, const ov::AnyMap& detokenization_params) {
+    return m_pimpl->decode(tokens, detokenization_params);
 }
 
-std::vector<std::string> Tokenizer::decode(ov::Tensor tokens) {
-    return m_pimpl->decode(tokens);
+std::vector<std::string> Tokenizer::decode(ov::Tensor tokens, const ov::AnyMap& detokenization_params) {
+    return m_pimpl->decode(tokens, detokenization_params);
 }
 
-std::vector<std::string> Tokenizer::decode(std::vector<std::vector<int64_t>> lines) {
-    return m_pimpl->decode(lines);
+std::vector<std::string> Tokenizer::decode(std::vector<std::vector<int64_t>> lines, const ov::AnyMap& detokenization_params) {
+    return m_pimpl->decode(lines, detokenization_params);
 }
 
 int64_t Tokenizer::get_bos_token_id() const {
diff --git a/src/python/py_tokenizer.cpp b/src/python/py_tokenizer.cpp
index 2ccccff4c0..db4643a65c 100644
--- a/src/python/py_tokenizer.cpp
+++ b/src/python/py_tokenizer.cpp
@@ -63,27 +63,33 @@ void init_tokenizer(py::module_& m) {
 
         .def(
             "decode",
-            [](Tokenizer& tok, std::vector<int64_t>& tokens) -> py::str {
-                return pyutils::handle_utf8(tok.decode(tokens));
+            [](Tokenizer& tok, std::vector<int64_t>& tokens, bool skip_special_tokens) -> py::str {
+                ov::AnyMap detokenization_params;
+                detokenization_params[ov::genai::skip_special_tokens.name()] = skip_special_tokens;
+                return pyutils::handle_utf8(tok.decode(tokens, detokenization_params));
             },
-            py::arg("tokens"),
+            py::arg("tokens"), py::arg("skip_special_tokens") = true,
             R"(Decode a sequence into a string prompt.)"
         )
 
         .def(
             "decode",
-            [](Tokenizer& tok, ov::Tensor& tokens) -> py::typing::List<py::str> {
-                return pyutils::handle_utf8(tok.decode(tokens));
+            [](Tokenizer& tok, ov::Tensor& tokens, bool skip_special_tokens) -> py::typing::List<py::str> {
+                ov::AnyMap detokenization_params;
+                detokenization_params[ov::genai::skip_special_tokens.name()] = skip_special_tokens;
+                return pyutils::handle_utf8(tok.decode(tokens, detokenization_params));
             },
-            py::arg("tokens"),
+            py::arg("tokens"), py::arg("skip_special_tokens") = true,
             R"(Decode tensor into a list of string prompts.)")
 
         .def(
             "decode",
-            [](Tokenizer& tok, std::vector<std::vector<int64_t>>& tokens) -> py::typing::List<py::str> {
-                return pyutils::handle_utf8(tok.decode(tokens));
+            [](Tokenizer& tok, std::vector<std::vector<int64_t>>& tokens, bool skip_special_tokens) -> py::typing::List<py::str> {
+                ov::AnyMap detokenization_params;
+                detokenization_params[ov::genai::skip_special_tokens.name()] = skip_special_tokens;
+                return pyutils::handle_utf8(tok.decode(tokens, detokenization_params));
             },
-            py::arg("tokens"),
+            py::arg("tokens"), py::arg("skip_special_tokens") = true,
             R"(Decode a batch of tokens into a list of string prompt.)")
 
         .def("apply_chat_template", [](Tokenizer& tok,
diff --git a/tests/python_tests/test_chat_generate_api.py b/tests/python_tests/test_chat_generate_api.py
index 25d0798994..a87a2c7555 100644
--- a/tests/python_tests/test_chat_generate_api.py
+++ b/tests/python_tests/test_chat_generate_api.py
@@ -217,3 +217,24 @@ def test_add_special_tokens(add_special_tokens, prompt):
     res_genai = genai_tokenzier.encode(prompt, add_special_tokens).input_ids.data
     res_hf = hf_tokenizer(prompt, return_tensors="np", add_special_tokens=add_special_tokens)["input_ids"]
     assert np.all(res_genai == res_hf)
+
+@pytest.mark.precommit
+@pytest.mark.nightly
+@pytest.mark.parametrize("add_special_tokens", [True, False])
+@pytest.mark.parametrize("skip_special_tokens", [True, False])
+@pytest.mark.parametrize("prompt", prompts)
+def test_add_special_tokens(add_special_tokens, skip_special_tokens, prompt):
+    import numpy as np
+    model_descr = get_chat_models_list()[0]
+    model_id, path, hf_tokenizer, model_opt, pipe = read_model((model_descr[0], model_descr[1] / '_test_chat'))
+    genai_tokenizer = pipe.get_tokenizer()
+    
+    # Calling encode with add_special_tokens will set state flag.
+    res_genai = genai_tokenizer.encode(prompt, add_special_tokens).input_ids.data
+    res_hf = hf_tokenizer(prompt, return_tensors="np", add_special_tokens=add_special_tokens)["input_ids"]
+    assert np.all(res_genai == res_hf)
+    
+    # Decode with skip_special_tokens
+    decoded_genai = genai_tokenizer.decode(res_genai, skip_special_tokens=skip_special_tokens)
+    decoded_hf = hf_tokenizer.decode(res_hf[0], skip_special_tokens=skip_special_tokens)
+    assert decoded_genai == decoded_hf

From e46466d94124aa73daa34a91cf94a7e0ce4e1265 Mon Sep 17 00:00:00 2001
From: Pavel Esir <pavel.esir@intel.com>
Date: Thu, 21 Nov 2024 22:10:47 +0100
Subject: [PATCH 07/24] rename pass

---
 .github/labeler.yml                                          | 4 ++--
 ...ine_segments_stateful.cpp => make_tokenizer_stateful.cpp} | 2 +-
 ...ine_segments_stateful.hpp => make_tokenizer_stateful.hpp} | 0
 src/cpp/src/tokenizer.cpp                                    | 5 +----
 4 files changed, 4 insertions(+), 7 deletions(-)
 rename src/cpp/src/{make_combine_segments_stateful.cpp => make_tokenizer_stateful.cpp} (98%)
 rename src/cpp/src/{make_combine_segments_stateful.hpp => make_tokenizer_stateful.hpp} (100%)

diff --git a/.github/labeler.yml b/.github/labeler.yml
index c5d0db312c..c162f6aff4 100644
--- a/.github/labeler.yml
+++ b/.github/labeler.yml
@@ -8,8 +8,8 @@
 - 'src/cpp/src/tokenizers_path.hpp'
 - 'src/cpp/src/circular_buffer_queue.hpp'
 - 'src/cpp/src/synchronized_queue.hpp'
-- 'src/cpp/src/make_combine_segments_stateful.cpp'
-- 'src/cpp/src/make_combine_segments_stateful.hpp'
+- 'src/cpp/src/make_tokenizer_stateful.cpp'
+- 'src/cpp/src/make_tokenizer_stateful.hpp'
 - 'src/python/py_tokenizer.cpp'
 - 'thirdparty/openvino_tokenizers'
 - 'tests/python_tests/tokenizer_configs.py'
diff --git a/src/cpp/src/make_combine_segments_stateful.cpp b/src/cpp/src/make_tokenizer_stateful.cpp
similarity index 98%
rename from src/cpp/src/make_combine_segments_stateful.cpp
rename to src/cpp/src/make_tokenizer_stateful.cpp
index 26c58b8fca..538a935e56 100644
--- a/src/cpp/src/make_combine_segments_stateful.cpp
+++ b/src/cpp/src/make_tokenizer_stateful.cpp
@@ -1,7 +1,7 @@
 // Copyright (C) 2023-2024 Intel Corporation
 // SPDX-License-Identifier: Apache-2.0
 
-#include "make_combine_segments_stateful.hpp"
+#include "make_tokenizer_stateful.hpp"
 #include "openvino/op/constant.hpp"
 #include "openvino/op/select.hpp"
 #include "openvino/op/slice.hpp"
diff --git a/src/cpp/src/make_combine_segments_stateful.hpp b/src/cpp/src/make_tokenizer_stateful.hpp
similarity index 100%
rename from src/cpp/src/make_combine_segments_stateful.hpp
rename to src/cpp/src/make_tokenizer_stateful.hpp
diff --git a/src/cpp/src/tokenizer.cpp b/src/cpp/src/tokenizer.cpp
index 78b94915dd..fc6ba75d90 100644
--- a/src/cpp/src/tokenizer.cpp
+++ b/src/cpp/src/tokenizer.cpp
@@ -10,12 +10,11 @@
 #include <jinja2cpp/generic_list.h>
 #include <jinja2cpp/generic_list_iterator.h>
 
-#include "openvino/pass/visualize_tree.hpp"
 #include "openvino/pass/manager.hpp"
 #include "openvino/runtime/core.hpp"
 #include "openvino/genai/tokenizer.hpp"
 
-#include "make_combine_segments_stateful.hpp"
+#include "make_tokenizer_stateful.hpp"
 #include "tokenizers_path.hpp"
 #include "circular_buffer_queue.hpp"
 #include "json_utils.hpp"
@@ -157,9 +156,7 @@ class Tokenizer::TokenizerImpl {
         manager_tok.run_passes(ov_tokenizer);
         
         ov::pass::Manager manager_detok;
-        manager_detok.register_pass<ov::pass::VisualizeTree>("before.svg");
         manager_detok.register_pass<MakeVocabDecoderSatateful>();
-        manager_detok.register_pass<ov::pass::VisualizeTree>("after.svg");
         manager_detok.run_passes(ov_detokenizer);
         
         m_tokenizer = core.compile_model(ov_tokenizer, device, properties);

From ff8846ae599bc2a05b3173c0dd05a027a376e32c Mon Sep 17 00:00:00 2001
From: guozhong wang <guozhong.wang@intel.com>
Date: Fri, 22 Nov 2024 09:11:36 +0800
Subject: [PATCH 08/24] Fix wrong token latency when batch size is greater than
 1 (#1244)

Fix the wrong 2nd token latency when batch size is greater than 1.
python benchmark.py -m
/mnt/llm_irs/models_6c715998_ww45.4_optimum/llama-2-7b-chat/pytorch/dldt/FP16
-n 1 --genai -ic 128 -bs 16
[ INFO ] [Average] P[0] Input token size: 128, 1st token latency: **0.36
ms/16tokens**, **2nd token latency: 1958228200.33 ms/16tokens**, 2nd
tokens throughput: **0.00** 16tokenss/s

Fix result:
[ INFO ] [Average] P[0] Input token size: 128, 1st token latency: 91.54
ms/16tokens, 2nd token latency: 69.81 ms/16tokens, 2nd tokens
throughput: 229.18 tokens/s
---
 tools/llm_bench/llm_bench_utils/metrics_print.py  | 4 ++--
 tools/llm_bench/task/speech_to_text_generation.py | 4 ++--
 tools/llm_bench/task/text_generation.py           | 4 ++--
 3 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/tools/llm_bench/llm_bench_utils/metrics_print.py b/tools/llm_bench/llm_bench_utils/metrics_print.py
index 905decf72b..de9d0126f8 100644
--- a/tools/llm_bench/llm_bench_utils/metrics_print.py
+++ b/tools/llm_bench/llm_bench_utils/metrics_print.py
@@ -149,7 +149,7 @@ def output_avg_statis_tokens(prompt_dict, prompt_idx_list, iter_data_list, batch
             avg_input_size = int(avg_input_size / index_num)
             if avg_2nd_tokens_latency > 0:
                 avg_2nd_token_tput = (1 / avg_2nd_tokens_latency) * batch_size * 1000
-            latency_unit = 'token' if is_text_gen is True else 'step'
+            tput_unit = latency_unit = 'token' if is_text_gen is True else 'step'
             if batch_size > 1:
                 if is_text_gen is True:
                     latency_unit = '{}tokens'.format(batch_size)
@@ -157,7 +157,7 @@ def output_avg_statis_tokens(prompt_dict, prompt_idx_list, iter_data_list, batch
                     latency_unit = '{}steps'.format(batch_size)
             avg_1st_token_latency = 'NA' if avg_1st_token_latency < 0 else f'{avg_1st_token_latency:.2f} ms/{latency_unit}'
             avg_2nd_tokens_latency = 'NA' if avg_2nd_tokens_latency < 0 else f'{avg_2nd_tokens_latency:.2f} ms/{latency_unit}'
-            avg_2nd_token_tput = 'NA' if avg_2nd_tokens_latency == 'NA' else f'{avg_2nd_token_tput:.2f} {latency_unit}s/s'
+            avg_2nd_token_tput = 'NA' if avg_2nd_tokens_latency == 'NA' else f'{avg_2nd_token_tput:.2f} {tput_unit}s/s'
             prefix = f'[ INFO ] [Average] P[{p_idx}]L[{loop_idx}]' if loop_idx != -1 else f'[ INFO ] [Average] P[{p_idx}]'
             if is_text_gen is True:
                 output_info = ''
diff --git a/tools/llm_bench/task/speech_to_text_generation.py b/tools/llm_bench/task/speech_to_text_generation.py
index ad49109bab..be9c9ab295 100644
--- a/tools/llm_bench/task/speech_to_text_generation.py
+++ b/tools/llm_bench/task/speech_to_text_generation.py
@@ -51,10 +51,10 @@ def run_speech_2_txt_generation(input_param, args, md5_list, iter_data_list):
         )
         end = time.perf_counter()
         perf_metrics = result_text.perf_metrics
-        first_token_time = perf_metrics.get_ttft().mean / args["batch_size"]
+        first_token_time = perf_metrics.get_ttft().mean
         second_tokens_durations = (
             np.array(perf_metrics.raw_metrics.m_new_token_times[1:])
-            - np.array(perf_metrics.raw_metrics.m_new_token_times[:-1]) / args["batch_size"]
+            - np.array(perf_metrics.raw_metrics.m_new_token_times[:-1])
         ).tolist()
         tm_list = (np.array([first_token_time] + second_tokens_durations) / 1000).tolist()
         tm_infer_list = []
diff --git a/tools/llm_bench/task/text_generation.py b/tools/llm_bench/task/text_generation.py
index 029bcdf16d..7718773560 100644
--- a/tools/llm_bench/task/text_generation.py
+++ b/tools/llm_bench/task/text_generation.py
@@ -240,10 +240,10 @@ def run_text_generation_genai(input_text, num, model, tokenizer, args, iter_data
         per_token_time = generation_time * 1000 / (num_tokens / args['batch_size'])
     else:
         log.warning("No generated tokens")
-    first_token_time = (perf_metrics.get_ttft().mean - perf_metrics.raw_metrics.tokenization_durations[-1] / 1000) / args["batch_size"]
+    first_token_time = (perf_metrics.get_ttft().mean - perf_metrics.raw_metrics.tokenization_durations[-1] / 1000) * args["batch_size"]
     second_tokens_durations = (
         np.array(perf_metrics.raw_metrics.m_new_token_times[1:])
-        - np.array(perf_metrics.raw_metrics.m_new_token_times[:-1]) / args["batch_size"]
+        - np.array(perf_metrics.raw_metrics.m_new_token_times[:-1])
     ).tolist()
 
     tm_list = np.array([first_token_time] + second_tokens_durations) / 1000

From 4529dec255b603d711a479f1a90c4cbec9ae3ebf Mon Sep 17 00:00:00 2001
From: Pavel Esir <pavel.esir@intel.com>
Date: Fri, 22 Nov 2024 10:49:11 +0100
Subject: [PATCH 09/24] fix typos

---
 src/cpp/include/openvino/genai/tokenizer.hpp | 28 ++++++++++----------
 src/cpp/src/make_tokenizer_stateful.cpp      | 17 +++++-------
 src/python/py_tokenizer.cpp                  |  6 ++---
 tests/python_tests/test_chat_generate_api.py |  3 ++-
 4 files changed, 25 insertions(+), 29 deletions(-)

diff --git a/src/cpp/include/openvino/genai/tokenizer.hpp b/src/cpp/include/openvino/genai/tokenizer.hpp
index e90e9c80de..8d2d63ea80 100644
--- a/src/cpp/include/openvino/genai/tokenizer.hpp
+++ b/src/cpp/include/openvino/genai/tokenizer.hpp
@@ -47,7 +47,7 @@ class OPENVINO_GENAI_EXPORTS Tokenizer {
     /**
     * @brief encode a single prompt
     * @param prompt std::string with input prompt
-    * @param tokenization_params AnyMap with tokenization parameters, e.g. {'add_special_tokens', false}
+    * @param tokenization_params AnyMap with tokenization parameters, e.g. {"add_special_tokens", false}
     * @return pair of [input_ids, attention_mask]
     */
     TokenizedInputs encode(const std::string prompt, const ov::AnyMap& tokenization_params = {});
@@ -55,7 +55,7 @@ class OPENVINO_GENAI_EXPORTS Tokenizer {
     /**
     * @brief encode batch of prompts. Left padding will be applied by default
     * @param prompts vector storing batch of prompts
-    * @param tokenization_params AnyMap with tokenization parameters, e.g. {'add_special_tokens', false}
+    * @param tokenization_params AnyMap with tokenization parameters, e.g. {"add_special_tokens", false}
     * @return pair of [input_ids, attention_mask]
     */
     TokenizedInputs encode(std::vector<std::string>& prompt, const ov::AnyMap& tokenization_params = {});
@@ -87,7 +87,7 @@ class OPENVINO_GENAI_EXPORTS Tokenizer {
     /**
     * @brief decode sequence of tokens
     * @param tokens vector storing tokens
-    * @param tokenization_params AnyMap with detokenization parameters, e.g. {'skip_special_tokens', false}
+    * @param tokenization_params AnyMap with detokenization parameters, e.g. {"skip_special_tokens", false}
     * @return sequence string
     */
     std::string decode(std::vector<int64_t> tokens, const ov::AnyMap& detokenization_params = {});
@@ -95,18 +95,18 @@ class OPENVINO_GENAI_EXPORTS Tokenizer {
     /**
     * @brief decode sequence of tokens
     * @param tokens vector storing tokens
-    * @param tokenization_params detokenization parameters,  e.g. ov::genai::skip_special_tokens(true)
+    * @param detokenization_params detokenization parameters,  e.g. ov::genai::skip_special_tokens(true)
     * @return sequence string
     */
     template <typename... Properties>
-    util::EnableIfAllStringAny<std::string, Properties...> decode(std::vector<int64_t>& tokens, Properties&&... properties) {
-        return decode(tokens, AnyMap{std::forward<Properties>(properties)...});
+    util::EnableIfAllStringAny<std::string, Properties...> decode(std::vector<int64_t>& tokens, Properties&&... detokenization_params) {
+        return decode(tokens, AnyMap{std::forward<Properties>(detokenization_params)...});
     }
 
     /**
     * @brief decode tokens. 
     * @param tokens ov::Tensor with tokens with shape [batch_size, seq_len]
-    * @param tokenization_params AnyMap with detokenization parameters, e.g. {'skip_special_tokens', false}
+    * @param tokenization_params AnyMap with detokenization parameters, e.g. {"skip_special_tokens", false}
     * @return vector of std::string, with size = batch_size
     */
     std::vector<std::string> decode(ov::Tensor tokens, const ov::AnyMap& detokenization_params = {});
@@ -114,18 +114,18 @@ class OPENVINO_GENAI_EXPORTS Tokenizer {
     /**
     * @brief decode sequence of tokens
     * @param tokens ov::Tensor with tokens with shape [batch_size, seq_len]
-    * @param tokenization_params detokenization parameters,  e.g. ov::genai::skip_special_tokens(true)
+    * @param detokenization_params detokenization parameters,  e.g. ov::genai::skip_special_tokens(true)
     * @return vector of std::string, with size = batch_size
     */
     template <typename... Properties>
-    util::EnableIfAllStringAny<std::vector<std::string>, Properties...> decode(ov::Tensor tokens, Properties&&... properties) {
-        return decode(tokens, AnyMap{std::forward<Properties>(properties)...});
+    util::EnableIfAllStringAny<std::vector<std::string>, Properties...> decode(ov::Tensor tokens, Properties&&... detokenization_params) {
+        return decode(tokens, AnyMap{std::forward<Properties>(detokenization_params)...});
     }
 
     /**
     * @brief batched decoding of tokens. 
     * @param tokens vector of vectors with tokens, tokens.size() is equal to batch_size
-    * @param tokenization_params AnyMap with detokenization parameters, e.g. {'skip_special_tokens', false}
+    * @param tokenization_params AnyMap with detokenization parameters, e.g. {"skip_special_tokens", false}
     * @return vector of std::string, with size equal to batch_size
     */
     std::vector<std::string> decode(std::vector<std::vector<int64_t>> tokens, const ov::AnyMap& detokenization_params = {});
@@ -133,12 +133,12 @@ class OPENVINO_GENAI_EXPORTS Tokenizer {
     /**
     * @brief decode sequence of tokens
     * @param tokens ov::Tensor with tokens with shape [batch_size, seq_len]
-    * @param tokenization_params detokenization parameters,  e.g. ov::genai::skip_special_tokens(true)
+    * @param detokenization_params detokenization parameters,  e.g. ov::genai::skip_special_tokens(true)
     * @return vector of std::string, with size = batch_size
     */
     template <typename... Properties>
-    util::EnableIfAllStringAny<std::vector<std::string>, Properties...> decode(std::vector<std::vector<int64_t>> tokens, Properties&&... properties) {
-        return decode(tokens, AnyMap{std::forward<Properties>(properties)...});
+    util::EnableIfAllStringAny<std::vector<std::string>, Properties...> decode(std::vector<std::vector<int64_t>> tokens, Properties&&... detokenization_params) {
+        return decode(tokens, AnyMap{std::forward<Properties>(detokenization_params)...});
     }
 
     /**
diff --git a/src/cpp/src/make_tokenizer_stateful.cpp b/src/cpp/src/make_tokenizer_stateful.cpp
index 538a935e56..3551e713c9 100644
--- a/src/cpp/src/make_tokenizer_stateful.cpp
+++ b/src/cpp/src/make_tokenizer_stateful.cpp
@@ -48,25 +48,20 @@ bool ov::genai::MakeCombineSegmentsSatateful::run_on_model(const std::shared_ptr
 }
 
 bool ov::genai::MakeVocabDecoderSatateful::run_on_model(const std::shared_ptr<ov::Model>& model) {
-
     std::shared_ptr<ov::Node> vocab_decoder_node;
     for (auto node: model->get_ordered_ops()) {
-        if (strcmp(node->get_type_info().name, "VocabDecoder") == 0) {
+        if (strcmp(node->get_type_info().name, "VocabDecoder") == 0)
             vocab_decoder_node = node;
-        }
     }
-    auto val = vocab_decoder_node->input_value(4);
-    auto val_type = vocab_decoder_node->input_value(4).get_element_type();
 
-    if (!vocab_decoder_node || !vocab_decoder_node->input_value(4).get_element_type().is_integral_number()) {
+    if (!vocab_decoder_node || vocab_decoder_node->get_input_size() < 5)
+        return false;
+    if (!vocab_decoder_node->input_value(4).get_element_type().is_integral_number())
         return false;
-    }
     
     std::shared_ptr<v0::Constant> skip_tokens_const = std::dynamic_pointer_cast<v0::Constant>(vocab_decoder_node->get_input_node_shared_ptr(4));
-    if (!skip_tokens_const) {
+    if (!skip_tokens_const)
         return false;
-    }
-
 
     auto start_const = std::make_shared<v0::Constant>(ov::element::i32, ov::Shape{1}, std::vector{0});
     auto int_max_const = std::make_shared<v0::Constant>(ov::element::i32, ov::Shape{1}, std::vector{std::numeric_limits<int>::max()});
@@ -87,4 +82,4 @@ bool ov::genai::MakeVocabDecoderSatateful::run_on_model(const std::shared_ptr<ov
     model->add_sinks({assign});
     model->add_variables({variable});
     return true;
-}
\ No newline at end of file
+}
diff --git a/src/python/py_tokenizer.cpp b/src/python/py_tokenizer.cpp
index db4643a65c..dae2ffe775 100644
--- a/src/python/py_tokenizer.cpp
+++ b/src/python/py_tokenizer.cpp
@@ -68,7 +68,7 @@ void init_tokenizer(py::module_& m) {
                 detokenization_params[ov::genai::skip_special_tokens.name()] = skip_special_tokens;
                 return pyutils::handle_utf8(tok.decode(tokens, detokenization_params));
             },
-            py::arg("tokens"), py::arg("skip_special_tokens") = true,
+            py::arg("tokens"), py::arg("skip_special_tokens") = false,
             R"(Decode a sequence into a string prompt.)"
         )
 
@@ -79,7 +79,7 @@ void init_tokenizer(py::module_& m) {
                 detokenization_params[ov::genai::skip_special_tokens.name()] = skip_special_tokens;
                 return pyutils::handle_utf8(tok.decode(tokens, detokenization_params));
             },
-            py::arg("tokens"), py::arg("skip_special_tokens") = true,
+            py::arg("tokens"), py::arg("skip_special_tokens") = false,
             R"(Decode tensor into a list of string prompts.)")
 
         .def(
@@ -89,7 +89,7 @@ void init_tokenizer(py::module_& m) {
                 detokenization_params[ov::genai::skip_special_tokens.name()] = skip_special_tokens;
                 return pyutils::handle_utf8(tok.decode(tokens, detokenization_params));
             },
-            py::arg("tokens"), py::arg("skip_special_tokens") = true,
+            py::arg("tokens"), py::arg("skip_special_tokens") = false,
             R"(Decode a batch of tokens into a list of string prompt.)")
 
         .def("apply_chat_template", [](Tokenizer& tok,
diff --git a/tests/python_tests/test_chat_generate_api.py b/tests/python_tests/test_chat_generate_api.py
index a87a2c7555..efd1d87416 100644
--- a/tests/python_tests/test_chat_generate_api.py
+++ b/tests/python_tests/test_chat_generate_api.py
@@ -219,6 +219,7 @@ def test_add_special_tokens(add_special_tokens, prompt):
     assert np.all(res_genai == res_hf)
 
 @pytest.mark.precommit
+@pytest.mark.xfail(reason="Need to turn them back on when openvino_tokenizers will be updated.")
 @pytest.mark.nightly
 @pytest.mark.parametrize("add_special_tokens", [True, False])
 @pytest.mark.parametrize("skip_special_tokens", [True, False])
@@ -235,6 +236,6 @@ def test_add_special_tokens(add_special_tokens, skip_special_tokens, prompt):
     assert np.all(res_genai == res_hf)
     
     # Decode with skip_special_tokens
-    decoded_genai = genai_tokenizer.decode(res_genai, skip_special_tokens=skip_special_tokens)
+    decoded_genai = genai_tokenizer.decode(res_genai, skip_special_tokens=skip_special_tokens)[0]
     decoded_hf = hf_tokenizer.decode(res_hf[0], skip_special_tokens=skip_special_tokens)
     assert decoded_genai == decoded_hf

From 18e8d5b59c9f4776a59811db4f299c2da1ea974f Mon Sep 17 00:00:00 2001
From: Alexander Kozlov <kozzzloff@list.ru>
Date: Fri, 22 Nov 2024 16:12:08 +0400
Subject: [PATCH 10/24] [WWB]: Updated readme with the latest information
 (#1248)

---
 tools/who_what_benchmark/README.md | 34 ++++++++++++++++++++----------
 1 file changed, 23 insertions(+), 11 deletions(-)

diff --git a/tools/who_what_benchmark/README.md b/tools/who_what_benchmark/README.md
index 012782bad3..0e597859d2 100644
--- a/tools/who_what_benchmark/README.md
+++ b/tools/who_what_benchmark/README.md
@@ -9,12 +9,12 @@ WWB provides default datasets for the supported use cases. However, it is relati
 * Command-line interface for Hugging Face and OpenVINO models and API to support broader inference backends.
 * Simple and quick accuracy test for compressed, quantized, pruned, distilled LLMs. It works with any model that supports HuggingFace Transformers text generation API including:
     * HuggingFace Transformers compressed models via [Bitsandbytes](https://huggingface.co/docs/transformers/main_classes/quantization#transformers.BitsAndBytesConfig)
+    * [OpenVINO](https://github.com/openvinotoolkit/openvino) and [NNCF](https://github.com/openvinotoolkit/nncf) via [Optimum-Intel](https://github.com/huggingface/optimum-intel) and OpenVINO [GenAI](https://github.com/openvinotoolkit/openvino.genai)
     * [GPTQ](https://huggingface.co/docs/transformers/main_classes/quantization#transformers.GPTQConfig) via HuggingFace API
     * Llama.cpp via [BigDL-LLM](https://github.com/intel-analytics/BigDL/tree/main/python/llm)
-    * [OpenVINO](https://github.com/openvinotoolkit/openvino) and [NNCF](https://github.com/openvinotoolkit/nncf) via [Optimum-Intel](https://github.com/huggingface/optimum-intel)
     * Support of custom datasets of the user choice
-* Validation of text-to-image pipelines. Computes similarity score between generated images:
-    * Supports Diffusers library and Optimum-Intel via `Text2ImageEvaluator` class.
+* Validation of text-to-image pipelines. Computes similarity score between generated images with Diffusers library, Optimum-Intel, and OpenVINO GenAI via `Text2ImageEvaluator` class.
+* Validation of Visual Language pipelines. Computes similarity score between generated images with Diffusers library, Optimum-Intel, and OpenVINO GenAI via `VisualTextEvaluator` class.
 
 ### Installation
 Install WWB and its requirements from the source using `pip` or any other package manager. For example,
@@ -41,18 +41,30 @@ wwb --target-model phi-3-openvino --gt-data gt.csv --model-type text
 wwb --target-model phi-3-openvino --gt-data gt.csv --model-type text --genai
 ```
 
-### Compare Text-to-image models (Diffusers)
+> **NOTE**: use --verbose option for debug to see the outputs with the largest difference.
+
+### Compare Text-to-image models
 ```sh
-# Export FP16 model to OpenVINO
-optimum-cli export openvino -m SimianLuo/LCM_Dreamshaper_v7 --weight-format fp16 sd-lcm-fp16
 # Export model with 8-bit quantized weights to OpenVINO
 optimum-cli export openvino -m SimianLuo/LCM_Dreamshaper_v7 --weight-format int8 sd-lcm-int8
-# Collect the references and save the mappling in the .json file. 
-# Reference images will be stored in the "reference" subfolder under the same path with .json.
-wwb --base-model sd-lcm-fp16 --gt-data lcm_test/sd_xl.json --model-type text-to-image
+# Collect the references and save the mappling in the .csv file. 
+# Reference images will be stored in the "reference" subfolder under the same path with .csv.
+wwb --base-model SimianLuo/LCM_Dreamshaper_v7--gt-data lcm_test/gt.csv --model-type text-to-image --hf
+# Compute the metric
+# Target images will be stored in the "target" subfolder under the same path with .csv.
+wwb --target-model sd-lcm-int8 --gt-data lcm_test/gt.csv --model-type text-to-image --genai
+```
+
+### Compare Visual Language Models (VLMs)
+```sh
+# Export FP16 model to OpenVINO
+optimum-cli export openvino -m llava-hf/llava-v1.6-mistral-7b-hf  --weight-format int8 llava-int8
+# Collect the references and save the mappling in the .csv file. 
+# Reference images will be stored in the "reference" subfolder under the same path with .csv.
+wwb --base-model llava-hf/llava-v1.6-mistral-7b-hf --gt-data llava_test/gt.csv --model-type visual-text --hf
 # Compute the metric
-# Target images will be stored in the "target" subfolder under the same path with .json.
-wwb --target-model sd-lcm-int8 --gt-data lcm_test/sd_xl.json --model-type text-to-image
+# Target images will be stored in the "target" subfolder under the same path with .csv.
+wwb --target-model llava-int8 --gt-data llava_test/gt.csv --model-type visual-text --genai
 ```
 
 ### API

From d490c18aabe6c9491fab6d6601948e91f10d6fc3 Mon Sep 17 00:00:00 2001
From: Alexander Kozlov <kozzzloff@list.ru>
Date: Mon, 25 Nov 2024 10:38:04 +0300
Subject: [PATCH 11/24] [WWB]: Added ability to compare results for previously
 collected outputs w/o models provided (#1238)

- Compare outputs collected from the previous runs
- Kept only "similarity" metric by default as the only one that is used
in CI

Example:
```shell
optimum-cli export openvino -m Qwen/Qwen2-0.5B-Instruct --weight-format fp16 models/Qwen2-0.5B-Instruct-fp16

mkdir qwen2_N_FP16

# References from NAT FP16
wwb --base-model Qwen/Qwen2-0.5B-Instruct --gt-data qwen2_N_FP16/gt.csv --hf --num-samples 4

# Compare N_O_FP16, save Optimum data for references
wwb --target-model models/Qwen2-0.5B-Instruct-fp16 --gt-data qwen2_N_FP16/gt.csv --output qwen2_N_O_FP16 --num-samples 4

# Compare N_G_FP16, save GenAI data for references
wwb --target-model  models/Qwen2-0.5B-Instruct-fp16 --gt-data qwen2_N_FP16/gt.csv --genai --output qwen2_N_G_FP16 --num-samples 4

# Compare O_G_FP16, use pre-generated grout truth and target data from the previous runs
wwb --target-data qwen2_N_G_FP16/target.csv --gt-data qwen2_N_O_FP16/target.csv --genai --output qwen2_O_G_FP16 --num-samples 4

# The same for INT8
optimum-cli export openvino -m Qwen/Qwen2-0.5B-Instruct --weight-format int8 models/Qwen2-0.5B-Instruct-int8

# Compare N_G_INT8, save GenAI data for references
wwb --target-model models/Qwen2-0.5B-Instruct-int8 --gt-data qwen2_N_FP16/gt.csv --genai --output qwen2_N_G_INT8 --num-samples 4
```
---
 .../tests/test_cli_image.py                   | 166 ++++++++++--------
 .../who_what_benchmark/tests/test_cli_text.py | 128 ++++++++------
 .../who_what_benchmark/tests/test_cli_vlm.py  | 142 ++++++++-------
 .../whowhatbench/registry.py                  |   2 +-
 .../whowhatbench/text2image_evaluator.py      |  14 +-
 .../whowhatbench/text_evaluator.py            |  10 +-
 .../whowhatbench/visualtext_evaluator.py      |   8 +-
 tools/who_what_benchmark/whowhatbench/wwb.py  |  59 ++++---
 8 files changed, 298 insertions(+), 231 deletions(-)

diff --git a/tools/who_what_benchmark/tests/test_cli_image.py b/tools/who_what_benchmark/tests/test_cli_image.py
index 374df2a1ec..b2c2015f80 100644
--- a/tools/who_what_benchmark/tests/test_cli_image.py
+++ b/tools/who_what_benchmark/tests/test_cli_image.py
@@ -14,7 +14,6 @@ def run_wwb(args):
     logger.info(" ".join(["TRANSFOREMRS_VERBOSITY=debug wwb"] + args))
     result = subprocess.run(["wwb"] + args, capture_output=True, text=True)
     logger.info(result)
-    print(" ".join(["TRANSFOREMRS_VERBOSITY=debug wwb"] + args))
     return result
 
 
@@ -27,7 +26,7 @@ def run_wwb(args):
     ],
 )
 def test_image_model_types(model_id, model_type, backend):
-    GT_FILE = "test_sd.json"
+    GT_FILE = "test_sd.csv"
     wwb_args = [
         "--base-model",
         model_id,
@@ -70,79 +69,94 @@ def test_image_model_types(model_id, model_type, backend):
     ],
 )
 def test_image_model_genai(model_id, model_type):
-    GT_FILE = "test_sd.json"
-    MODEL_PATH = tempfile.TemporaryDirectory().name
-
-    result = subprocess.run(["optimum-cli", "export",
-                             "openvino", "-m", model_id,
-                             MODEL_PATH], capture_output=True, text=True)
-    assert result.returncode == 0
-
-    wwb_args = [
-        "--base-model",
-        MODEL_PATH,
-        "--num-samples",
-        "1",
-        "--gt-data",
-        GT_FILE,
-        "--device",
-        "CPU",
-        "--model-type",
-        model_type,
-    ]
-    result = run_wwb(wwb_args)
-    assert result.returncode == 0
-    assert os.path.exists(GT_FILE)
-    assert os.path.exists("reference")
-
-    wwb_args = [
-        "--target-model",
-        MODEL_PATH,
-        "--num-samples",
-        "1",
-        "--gt-data",
-        GT_FILE,
-        "--device",
-        "CPU",
-        "--model-type",
-        model_type,
-        "--genai",
-    ]
-    result = run_wwb(wwb_args)
-
-    assert result.returncode == 0
-    assert "Metrics for model" in result.stderr
-    similarity = float(str(result.stderr).split(" ")[-1])
-    assert similarity >= 0.98
-    assert os.path.exists("target")
-
-    output_dir = tempfile.TemporaryDirectory().name
-    wwb_args = [
-        "--target-model",
-        MODEL_PATH,
-        "--num-samples",
-        "1",
-        "--gt-data",
-        GT_FILE,
-        "--device",
-        "CPU",
-        "--model-type",
-        model_type,
-        "--output",
-        output_dir,
-    ]
-    result = run_wwb(wwb_args)
-    assert os.path.exists(os.path.join(output_dir, "target"))
-    assert os.path.exists(os.path.join(output_dir, "target.json"))
-
-    try:
-        os.remove(GT_FILE)
-    except OSError:
-        pass
-    shutil.rmtree("reference", ignore_errors=True)
-    shutil.rmtree("target", ignore_errors=True)
-    shutil.rmtree(MODEL_PATH, ignore_errors=True)
-    shutil.rmtree(output_dir, ignore_errors=True)
+    with tempfile.TemporaryDirectory() as temp_dir:
+        GT_FILE = os.path.join(temp_dir, "gt.csv")
+        MODEL_PATH = os.path.join(temp_dir, model_id.replace("/", "--"))
+
+        result = subprocess.run(["optimum-cli", "export",
+                                 "openvino", "-m", model_id,
+                                 MODEL_PATH],
+                                capture_output=True, text=True)
+        assert result.returncode == 0
+
+        wwb_args = [
+            "--base-model",
+            MODEL_PATH,
+            "--num-samples",
+            "1",
+            "--gt-data",
+            GT_FILE,
+            "--device",
+            "CPU",
+            "--model-type",
+            model_type,
+        ]
+        result = run_wwb(wwb_args)
+        assert result.returncode == 0
+        assert os.path.exists(GT_FILE)
+        assert os.path.exists(os.path.join(temp_dir, "reference"))
+
+        wwb_args = [
+            "--target-model",
+            MODEL_PATH,
+            "--num-samples",
+            "1",
+            "--gt-data",
+            GT_FILE,
+            "--device",
+            "CPU",
+            "--model-type",
+            model_type,
+            "--genai",
+        ]
+        result = run_wwb(wwb_args)
+
+        assert result.returncode == 0
+        assert "Metrics for model" in result.stderr
+        similarity = float(str(result.stderr).split(" ")[-1])
+        assert similarity >= 0.98
+        assert os.path.exists(os.path.join(temp_dir, "target"))
+
+        output_dir = tempfile.TemporaryDirectory().name
+        wwb_args = [
+            "--target-model",
+            MODEL_PATH,
+            "--num-samples",
+            "1",
+            "--gt-data",
+            GT_FILE,
+            "--device",
+            "CPU",
+            "--model-type",
+            model_type,
+            "--output",
+            output_dir,
+        ]
+        result = run_wwb(wwb_args)
+        assert result.returncode == 0
+        assert os.path.exists(os.path.join(output_dir, "target"))
+        assert os.path.exists(os.path.join(output_dir, "target.csv"))
+
+        # test w/o models
+        wwb_args = [
+            "--target-data",
+            os.path.join(output_dir, "target.csv"),
+            "--num-samples",
+            "1",
+            "--gt-data",
+            GT_FILE,
+            "--device",
+            "CPU",
+            "--model-type",
+            model_type,
+        ]
+        result = run_wwb(wwb_args)
+        assert result.returncode == 0
+
+        shutil.rmtree("reference", ignore_errors=True)
+        shutil.rmtree("target", ignore_errors=True)
+        shutil.rmtree(MODEL_PATH, ignore_errors=True)
+        shutil.rmtree(output_dir, ignore_errors=True)
 
 
 @pytest.mark.parametrize(
@@ -152,7 +166,7 @@ def test_image_model_genai(model_id, model_type):
     ],
 )
 def test_image_custom_dataset(model_id, model_type, backend):
-    GT_FILE = "test_sd.json"
+    GT_FILE = "test_sd.csv"
     wwb_args = [
         "--base-model",
         model_id,
diff --git a/tools/who_what_benchmark/tests/test_cli_text.py b/tools/who_what_benchmark/tests/test_cli_text.py
index cf71adc08a..0baf60a5a4 100644
--- a/tools/who_what_benchmark/tests/test_cli_text.py
+++ b/tools/who_what_benchmark/tests/test_cli_text.py
@@ -73,29 +73,28 @@ def test_text_target_model():
 
 @pytest.fixture
 def test_text_gt_data():
-    with tempfile.NamedTemporaryFile(suffix=".csv") as tmpfile:
-        temp_file_name = tmpfile.name
+    with tempfile.TemporaryDirectory() as temp_dir:
+        temp_file_name = os.path.join(temp_dir, "gt.csv")
 
-    result = run_wwb(
-        [
-            "--base-model",
-            base_model_path,
-            "--gt-data",
-            temp_file_name,
-            "--dataset",
-            "EleutherAI/lambada_openai,en",
-            "--dataset-field",
-            "text",
-            "--split",
-            "test",
-            "--num-samples",
-            "2",
-            "--device",
-            "CPU",
-        ]
-    )
-    data = pd.read_csv(temp_file_name)
-    os.remove(temp_file_name)
+        result = run_wwb(
+            [
+                "--base-model",
+                base_model_path,
+                "--gt-data",
+                temp_file_name,
+                "--dataset",
+                "EleutherAI/lambada_openai,en",
+                "--dataset-field",
+                "text",
+                "--split",
+                "test",
+                "--num-samples",
+                "2",
+                "--device",
+                "CPU",
+            ]
+        )
+        data = pd.read_csv(temp_file_name)
 
     assert result.returncode == 0
     assert len(data["questions"].values) == 2
@@ -107,6 +106,8 @@ def test_text_output_directory():
             [
                 "--base-model",
                 base_model_path,
+                "--gt-data",
+                os.path.join(temp_dir, "gt.csv"),
                 "--target-model",
                 target_model_path,
                 "--num-samples",
@@ -121,7 +122,23 @@ def test_text_output_directory():
         assert "Metrics for model" in result.stderr
         assert os.path.exists(os.path.join(temp_dir, "metrics_per_qustion.csv"))
         assert os.path.exists(os.path.join(temp_dir, "metrics.csv"))
-        assert os.path.exists(os.path.join(temp_dir, "target.json"))
+        assert os.path.exists(os.path.join(temp_dir, "target.csv"))
+
+        # test measurtement w/o models
+        result = run_wwb(
+            [
+                "--gt-data",
+                os.path.join(temp_dir, "gt.csv"),
+                "--target-data",
+                os.path.join(temp_dir, "target.csv"),
+                "--num-samples",
+                "2",
+                "--device",
+                "CPU",
+            ]
+        )
+        assert result.returncode == 0
+        assert "Metrics for model" in result.stderr
 
 
 def test_text_verbose():
@@ -143,46 +160,43 @@ def test_text_verbose():
 
 
 def test_text_language_autodetect():
-    temp_file_name = tempfile.NamedTemporaryFile(suffix=".csv").name
-
-    result = run_wwb(
-        [
-            "--base-model",
-            "Qwen/Qwen2-0.5B",
-            "--gt-data",
-            temp_file_name,
-            "--num-samples",
-            "2",
-            "--device",
-            "CPU",
-        ]
-    )
-    data = pd.read_csv(temp_file_name)
-    os.remove(temp_file_name)
+    with tempfile.TemporaryDirectory() as temp_dir:
+        temp_file_name = os.path.join(temp_dir, "gt.csv")
+        result = run_wwb(
+            [
+                "--base-model",
+                "Qwen/Qwen2-0.5B",
+                "--gt-data",
+                temp_file_name,
+                "--num-samples",
+                "2",
+                "--device",
+                "CPU",
+            ]
+        )
+        data = pd.read_csv(temp_file_name)
 
     assert result.returncode == 0
     assert "马克" in data["prompts"].values[0]
 
 
 def test_text_hf_model():
-    with tempfile.NamedTemporaryFile(suffix=".csv") as tmpfile:
-        temp_file_name = tmpfile.name
-
-    result = run_wwb(
-        [
-            "--base-model",
-            model_id,
-            "--gt-data",
-            temp_file_name,
-            "--num-samples",
-            "2",
-            "--device",
-            "CPU",
-            "--hf",
-        ]
-    )
-    data = pd.read_csv(temp_file_name)
-    os.remove(temp_file_name)
+    with tempfile.TemporaryDirectory() as temp_dir:
+        temp_file_name = os.path.join(temp_dir, "gt.csv")
+        result = run_wwb(
+            [
+                "--base-model",
+                model_id,
+                "--gt-data",
+                temp_file_name,
+                "--num-samples",
+                "2",
+                "--device",
+                "CPU",
+                "--hf",
+            ]
+        )
+        data = pd.read_csv(temp_file_name)
 
     assert result.returncode == 0
     assert len(data["prompts"].values) == 2
diff --git a/tools/who_what_benchmark/tests/test_cli_vlm.py b/tools/who_what_benchmark/tests/test_cli_vlm.py
index d45283493e..5b33abf33c 100644
--- a/tools/who_what_benchmark/tests/test_cli_vlm.py
+++ b/tools/who_what_benchmark/tests/test_cli_vlm.py
@@ -24,70 +24,88 @@ def run_wwb(args):
     ],
 )
 def test_vlm_basic(model_id, model_type):
-    GT_FILE = tempfile.NamedTemporaryFile(suffix=".json").name
-    MODEL_PATH = tempfile.TemporaryDirectory().name
+    with tempfile.TemporaryDirectory() as temp_dir:
+        GT_FILE = os.path.join(temp_dir, "gt.csv")
+        MODEL_PATH = os.path.join(temp_dir, model_id.replace("/", "--"))
 
-    result = subprocess.run(["optimum-cli", "export",
-                             "openvino", "-m", model_id,
-                             MODEL_PATH, "--task",
-                             "image-text-to-text",
-                             "--trust-remote-code"],
-                            capture_output=True,
-                            text=True,
-                            )
-    assert result.returncode == 0
+        result = subprocess.run(["optimum-cli", "export",
+                                 "openvino", "-m", model_id,
+                                 MODEL_PATH, "--task",
+                                 "image-text-to-text",
+                                 "--trust-remote-code"],
+                                capture_output=True,
+                                text=True,
+                                )
+        assert result.returncode == 0
 
-    wwb_args = [
-        "--base-model",
-        model_id,
-        "--num-samples",
-        "1",
-        "--gt-data",
-        GT_FILE,
-        "--device",
-        "CPU",
-        "--model-type",
-        model_type,
-        "--hf",
-    ]
-    result = run_wwb(wwb_args)
-    assert result.returncode == 0
+        # Collect reference with HF model
+        wwb_args = [
+            "--base-model",
+            model_id,
+            "--num-samples",
+            "1",
+            "--gt-data",
+            GT_FILE,
+            "--device",
+            "CPU",
+            "--model-type",
+            model_type,
+            "--hf",
+        ]
+        result = run_wwb(wwb_args)
+        assert result.returncode == 0
 
-    wwb_args = [
-        "--target-model",
-        MODEL_PATH,
-        "--num-samples",
-        "1",
-        "--gt-data",
-        GT_FILE,
-        "--device",
-        "CPU",
-        "--model-type",
-        model_type,
-    ]
-    result = run_wwb(wwb_args)
-    assert result.returncode == 0
+        # test Optimum
+        wwb_args = [
+            "--target-model",
+            MODEL_PATH,
+            "--num-samples",
+            "1",
+            "--gt-data",
+            GT_FILE,
+            "--device",
+            "CPU",
+            "--model-type",
+            model_type,
+        ]
+        result = run_wwb(wwb_args)
+        assert result.returncode == 0
 
-    wwb_args = [
-        "--target-model",
-        MODEL_PATH,
-        "--num-samples",
-        "1",
-        "--gt-data",
-        GT_FILE,
-        "--device",
-        "CPU",
-        "--model-type",
-        model_type,
-        "--genai",
-    ]
-    result = run_wwb(wwb_args)
-    assert result.returncode == 0
+        # test GenAI
+        wwb_args = [
+            "--target-model",
+            MODEL_PATH,
+            "--num-samples",
+            "1",
+            "--gt-data",
+            GT_FILE,
+            "--device",
+            "CPU",
+            "--model-type",
+            model_type,
+            "--genai",
+            "--output",
+            "target",
+        ]
+        result = run_wwb(wwb_args)
+        assert result.returncode == 0
 
-    try:
-        os.remove(GT_FILE)
-    except OSError:
-        pass
-    shutil.rmtree("reference", ignore_errors=True)
-    shutil.rmtree("target", ignore_errors=True)
-    shutil.rmtree(MODEL_PATH, ignore_errors=True)
+        # test w/o models
+        wwb_args = [
+            "--target-data",
+            "target/target.csv",
+            "--num-samples",
+            "1",
+            "--gt-data",
+            GT_FILE,
+            "--device",
+            "CPU",
+            "--model-type",
+            model_type,
+            "--genai",
+        ]
+        result = run_wwb(wwb_args)
+        assert result.returncode == 0
+        shutil.rmtree("reference", ignore_errors=True)
+        shutil.rmtree("target", ignore_errors=True)
+        shutil.rmtree(MODEL_PATH, ignore_errors=True)
diff --git a/tools/who_what_benchmark/whowhatbench/registry.py b/tools/who_what_benchmark/whowhatbench/registry.py
index 85fabf618e..0cfbf8e440 100644
--- a/tools/who_what_benchmark/whowhatbench/registry.py
+++ b/tools/who_what_benchmark/whowhatbench/registry.py
@@ -29,7 +29,7 @@ def dump_predictions(self, csv_name: str):
         pass
 
     @abstractmethod
-    def score(self, model, **kwargs):
+    def score(self, model_or_data, **kwargs):
         pass
 
     @abstractmethod
diff --git a/tools/who_what_benchmark/whowhatbench/text2image_evaluator.py b/tools/who_what_benchmark/whowhatbench/text2image_evaluator.py
index 2663414917..1ff7ff5e21 100644
--- a/tools/who_what_benchmark/whowhatbench/text2image_evaluator.py
+++ b/tools/who_what_benchmark/whowhatbench/text2image_evaluator.py
@@ -84,15 +84,19 @@ def __init__(
     def get_generation_fn(self):
         return self.generation_fn
 
-    def score(self, model, gen_image_fn=None, output_dir=None, **kwargs):
-        model.resolution = self.resolution
+    def score(self, model_or_data, gen_image_fn=None, output_dir=None, **kwargs):
         if output_dir is None:
             image_folder = os.path.join(self.gt_dir, "target")
         else:
             image_folder = os.path.join(output_dir, "target")
-        predictions = self._generate_data(
-            model, gen_image_fn, image_folder
-        )
+
+        if isinstance(model_or_data, str) and os.path.exists(model_or_data):
+            predictions = pd.read_csv(model_or_data, keep_default_na=False)
+        else:
+            model_or_data.resolution = self.resolution
+            predictions = self._generate_data(
+                model_or_data, gen_image_fn, image_folder
+            )
         self.predictions = predictions
 
         all_metrics_per_prompt = {}
diff --git a/tools/who_what_benchmark/whowhatbench/text_evaluator.py b/tools/who_what_benchmark/whowhatbench/text_evaluator.py
index eb89083496..50ce224def 100644
--- a/tools/who_what_benchmark/whowhatbench/text_evaluator.py
+++ b/tools/who_what_benchmark/whowhatbench/text_evaluator.py
@@ -1,5 +1,6 @@
 from typing import Any, Union
 
+import os
 import pandas as pd
 from tqdm import tqdm
 
@@ -97,7 +98,7 @@ def __init__(
         tokenizer: Any = None,
         gt_data: str = None,
         test_data: Union[str, list] = None,
-        metrics=("similarity", "divergency"),
+        metrics="similarity",
         similarity_model_id: str = "sentence-transformers/all-mpnet-base-v2",
         max_new_tokens=128,
         crop_question=True,
@@ -155,8 +156,11 @@ def __init__(
     def get_generation_fn(self):
         return self.generation_fn
 
-    def score(self, model, gen_answer_fn=None, **kwargs):
-        predictions = self._generate_data(model, gen_answer_fn, self.generation_config)
+    def score(self, model_or_data, gen_answer_fn=None, **kwargs):
+        if isinstance(model_or_data, str) and os.path.exists(model_or_data):
+            predictions = pd.read_csv(model_or_data, keep_default_na=False)
+        else:
+            predictions = self._generate_data(model_or_data, gen_answer_fn, self.generation_config)
         self.predictions = predictions
 
         all_metrics_per_prompt = {}
diff --git a/tools/who_what_benchmark/whowhatbench/visualtext_evaluator.py b/tools/who_what_benchmark/whowhatbench/visualtext_evaluator.py
index ef10bdafcf..99027971d8 100644
--- a/tools/who_what_benchmark/whowhatbench/visualtext_evaluator.py
+++ b/tools/who_what_benchmark/whowhatbench/visualtext_evaluator.py
@@ -1,5 +1,6 @@
 from typing import Any, Union
 
+import os
 import datasets
 import pandas as pd
 from diffusers.utils.loading_utils import load_image
@@ -64,8 +65,11 @@ def __init__(
             seqs_per_request=seqs_per_request,
         )
 
-    def score(self, model, gen_answer_fn=None, **kwargs):
-        predictions = self._generate_data(model, gen_answer_fn)
+    def score(self, model_or_data, gen_answer_fn=None, **kwargs):
+        if isinstance(model_or_data, str) and os.path.exists(model_or_data):
+            predictions = pd.read_csv(model_or_data, keep_default_na=False)
+        else:
+            predictions = self._generate_data(model_or_data, gen_answer_fn, self.generation_config)
         self.predictions = predictions
 
         all_metrics_per_prompt = {}
diff --git a/tools/who_what_benchmark/whowhatbench/wwb.py b/tools/who_what_benchmark/whowhatbench/wwb.py
index f3c5f8224a..0a01a8e8df 100644
--- a/tools/who_what_benchmark/whowhatbench/wwb.py
+++ b/tools/who_what_benchmark/whowhatbench/wwb.py
@@ -271,12 +271,17 @@ def parse_args():
         default=None,
         help="Tokenizer for divergency metric. If not provided, it will be load from base_model or target_model.",
     )
-
     parser.add_argument(
         "--gt-data",
         default=None,
-        help="CSV file containing GT outputs from base_model. If defined and exists then base_model will not used."
-        " If the files does not exist, it will be generated by base_model evaluation.",
+        help="CSV file containing GT outputs from --base-model. If defined and exists then --base-model will not used."
+        " If the files does not exist, it will be generated by --base-model evaluation.",
+    )
+    parser.add_argument(
+        "--target-data",
+        default=None,
+        help="CSV file containing outputs from target model. If defined and exists then --target-model will not used."
+        " If the files does not exist, it will be generated by --target-model evaluation.",
     )
     parser.add_argument(
         "--model-type",
@@ -385,14 +390,11 @@ def parse_args():
 
 
 def check_args(args):
-    if args.base_model is None and args.target_model is None:
-        raise ValueError(
-            "Wether --base-model or --target-model should be provided")
     if args.base_model is None and args.gt_data is None:
         raise ValueError("Wether --base-model or --gt-data should be provided")
-    if args.target_model is None and args.gt_data is None:
+    if args.target_model is None and args.gt_data is None and args.target_data:
         raise ValueError(
-            "Wether --target-model or --gt-data should be provided")
+            "Wether --target-model, --target-data or --gt-data should be provided")
 
 
 def load_tokenizer(args):
@@ -405,7 +407,7 @@ def load_tokenizer(args):
         tokenizer = AutoTokenizer.from_pretrained(
             args.base_model, trust_remote_code=True
         )
-    else:
+    elif args.target_model is not None:
         tokenizer = AutoTokenizer.from_pretrained(
             args.target_model, trust_remote_code=True
         )
@@ -419,7 +421,7 @@ def load_processor(args):
         processor = AutoProcessor.from_pretrained(
             args.base_model, trust_remote_code=True
         )
-    else:
+    elif args.target_model is not None:
         processor = AutoProcessor.from_pretrained(
             args.target_model, trust_remote_code=True
         )
@@ -611,20 +613,27 @@ def main():
             evaluator.dump_gt(args.gt_data)
         del base_model
 
-    if args.target_model:
-        target_model = load_model(
-            args.model_type,
-            args.target_model,
-            args.device,
-            args.ov_config,
-            args.hf,
-            args.genai,
-        )
-        all_metrics_per_question, all_metrics = evaluator.score(
-            target_model,
-            evaluator.get_generation_fn() if args.genai else None,
-            output_dir=args.output
-        )
+    if args.target_data or args.target_model:
+        if args.target_data and os.path.exists(args.target_data):
+            all_metrics_per_question, all_metrics = evaluator.score(
+                args.target_data,
+                None,
+                output_dir=args.output
+            )
+        else:
+            target_model = load_model(
+                args.model_type,
+                args.target_model,
+                args.device,
+                args.ov_config,
+                args.hf,
+                args.genai,
+            )
+            all_metrics_per_question, all_metrics = evaluator.score(
+                target_model,
+                evaluator.get_generation_fn() if args.genai else None,
+                output_dir=args.output
+            )
         logger.info("Metrics for model: %s", args.target_model)
         logger.info(all_metrics)
 
@@ -635,7 +644,7 @@ def main():
             df.to_csv(os.path.join(args.output, "metrics_per_qustion.csv"))
             df = pd.DataFrame(all_metrics)
             df.to_csv(os.path.join(args.output, "metrics.csv"))
-            evaluator.dump_predictions(os.path.join(args.output, "target.json"))
+            evaluator.dump_predictions(os.path.join(args.output, "target.csv"))
 
     if args.verbose and args.target_model is not None:
         if args.model_type == "text" or args.model_type == "visual-text":

From 43caa0b1352e8508b91ac658c143231fe16ead9c Mon Sep 17 00:00:00 2001
From: Ekaterina Aidova <ekaterina.aidova@intel.com>
Date: Mon, 25 Nov 2024 13:32:29 +0400
Subject: [PATCH 12/24] use genai callback in image gen and switch to genai by
 default (#1249)

CVS-157814
---
 .github/workflows/llm_bench-python.yml        | 18 ++---
 tools/llm_bench/README.md                     |  5 +-
 tools/llm_bench/benchmark.py                  |  3 +-
 .../llm_bench/llm_bench_utils/config_class.py | 15 ++--
 .../llm_bench_utils/metrics_print.py          | 17 +++--
 .../llm_bench/llm_bench_utils/model_utils.py  | 15 +++-
 tools/llm_bench/llm_bench_utils/ov_utils.py   | 69 ++++++++++++++++---
 tools/llm_bench/task/image_generation.py      | 14 ++--
 8 files changed, 108 insertions(+), 48 deletions(-)

diff --git a/.github/workflows/llm_bench-python.yml b/.github/workflows/llm_bench-python.yml
index 0ac47d1aa0..77f26d33a0 100644
--- a/.github/workflows/llm_bench-python.yml
+++ b/.github/workflows/llm_bench-python.yml
@@ -66,28 +66,28 @@ jobs:
         python ./tools/llm_bench/benchmark.py -m tiny-random-qwen -d cpu -n 1 -f pt
       env:
         GIT_LFS_SKIP_SMUDGE: 0
-    - name: Test tiny-random-baichuan2 on Linux
+    - name: Test tiny-random-baichuan2 on Linux Optimum Intel
       run: |
         optimum-cli export openvino --model katuni4ka/tiny-random-baichuan2 --trust-remote-code --weight-format fp16 ./ov_models/tiny-random-baichuan2/pytorch/dldt/FP16
-        python ./tools/llm_bench/benchmark.py -m ./ov_models/tiny-random-baichuan2/pytorch/dldt/FP16/ -d cpu -n 1
-    - name: Test tiny-stable-diffusion on Linux
+        python ./tools/llm_bench/benchmark.py -m ./ov_models/tiny-random-baichuan2/pytorch/dldt/FP16/ -d cpu -n 1 --optimum
+    - name: Test tiny-stable-diffusion on Linux Optimum Intel
       run: |
         optimum-cli export openvino --model segmind/tiny-sd --trust-remote-code --weight-format fp16 ./ov_models/tiny-sd/pytorch/dldt/FP16/
-        python ./tools/llm_bench/benchmark.py -m ./ov_models/tiny-sd/pytorch/dldt/FP16/ -pf ./tools/llm_bench/prompts/stable-diffusion.jsonl -d cpu -n 1
+        python ./tools/llm_bench/benchmark.py -m ./ov_models/tiny-sd/pytorch/dldt/FP16/ -pf ./tools/llm_bench/prompts/stable-diffusion.jsonl -d cpu -n 1 --optimum
     - name: Test dreamlike-anime on Linux with GenAI
       run: |
         optimum-cli export openvino --model dreamlike-art/dreamlike-anime-1.0 --task stable-diffusion --weight-format fp16 ov_models/dreamlike-art-dreamlike-anime-1.0/FP16
-        python ./tools/llm_bench/benchmark.py -m ./ov_models/dreamlike-art-dreamlike-anime-1.0/FP16/ -pf ./tools/llm_bench/prompts/stable-diffusion.jsonl -d cpu -n 1 --genai
+        python ./tools/llm_bench/benchmark.py -m ./ov_models/dreamlike-art-dreamlike-anime-1.0/FP16/ -pf ./tools/llm_bench/prompts/stable-diffusion.jsonl -d cpu -n 1
     - name: Test dreamlike-anime on Linux with GenAI and LoRA
       run: |
         wget -O ./ov_models/soulcard.safetensors https://civitai.com/api/download/models/72591
-        python ./tools/llm_bench/benchmark.py -m ./ov_models/dreamlike-art-dreamlike-anime-1.0/FP16/ -pf ./tools/llm_bench/prompts/stable-diffusion.jsonl -d cpu -n 1 --genai  --lora ./ov_models/soulcard.safetensors --lora_alphas 0.7
+        python ./tools/llm_bench/benchmark.py -m ./ov_models/dreamlike-art-dreamlike-anime-1.0/FP16/ -pf ./tools/llm_bench/prompts/stable-diffusion.jsonl -d cpu -n 1 --lora ./ov_models/soulcard.safetensors --lora_alphas 0.7
     - name: Test TinyLlama-1.1B-Chat-v1.0 in Speculative Deconding mode on Linux
       run: |
         optimum-cli export openvino --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 --trust-remote-code --weight-format fp16 ov_models/TinyLlama-1.1B-Chat-v1.0/FP16
         optimum-cli export openvino --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 --trust-remote-code --weight-format int8 ov_models/TinyLlama-1.1B-Chat-v1.0/INT8
-        python ./tools/llm_bench/benchmark.py -m ./ov_models/TinyLlama-1.1B-Chat-v1.0/FP16/ --draft_model ./ov_models/TinyLlama-1.1B-Chat-v1.0/INT8/ -p "Why is the Sun yellow?" -d cpu --draft_device cpu -n 1 --genai  --assistant_confidence_threshold 0.4
-        python ./tools/llm_bench/benchmark.py -m ./ov_models/TinyLlama-1.1B-Chat-v1.0/FP16/ --draft_model ./ov_models/TinyLlama-1.1B-Chat-v1.0/INT8/ -p "Why is the Sun yellow?" -d cpu --draft_device cpu -n 1 --genai  --num_assistant_tokens 5
+        python ./tools/llm_bench/benchmark.py -m ./ov_models/TinyLlama-1.1B-Chat-v1.0/FP16/ --draft_model ./ov_models/TinyLlama-1.1B-Chat-v1.0/INT8/ -p "Why is the Sun yellow?" -d cpu --draft_device cpu -n 1  --assistant_confidence_threshold 0.4
+        python ./tools/llm_bench/benchmark.py -m ./ov_models/TinyLlama-1.1B-Chat-v1.0/FP16/ --draft_model ./ov_models/TinyLlama-1.1B-Chat-v1.0/INT8/ -p "Why is the Sun yellow?" -d cpu --draft_device cpu -n 1  --num_assistant_tokens 5
     - name: Test whisper-tiny on Linux
       run: |
         GIT_LFS_SKIP_SMUDGE=1 git clone --depth 1 --branch main --single-branch https://huggingface.co/datasets/facebook/multilingual_librispeech
@@ -97,8 +97,8 @@ jobs:
         tar zxvf data/mls_polish/train/audio/3283_1447_000.tar.gz -C data/mls_polish/train/audio/3283_1447_000/
         cd ..
         optimum-cli export openvino --trust-remote-code --model openai/whisper-tiny ./ov_models/whisper-tiny
+        python ./tools/llm_bench/benchmark.py -m ./ov_models/whisper-tiny --media multilingual_librispeech/data/mls_polish/train/audio/3283_1447_000/3283_1447_000000.flac -d cpu -n 1 --optimum
         python ./tools/llm_bench/benchmark.py -m ./ov_models/whisper-tiny --media multilingual_librispeech/data/mls_polish/train/audio/3283_1447_000/3283_1447_000000.flac -d cpu -n 1
-        python ./tools/llm_bench/benchmark.py -m ./ov_models/whisper-tiny --media multilingual_librispeech/data/mls_polish/train/audio/3283_1447_000/3283_1447_000000.flac -d cpu -n 1 --genai
     - name: WWB Tests
       run: |
         GIT_CLONE_PROTECTION_ACTIVE=false pip install -r ${{ env.WWB_PATH }}/requirements.txt
diff --git a/tools/llm_bench/README.md b/tools/llm_bench/README.md
index d3f643b58f..87f6e91271 100755
--- a/tools/llm_bench/README.md
+++ b/tools/llm_bench/README.md
@@ -161,11 +161,10 @@ For example, `--load_config config.json` as following will result in streams.num
 
 ## 6. Execution on CPU device
 
-OpenVINO is by default built with [oneTBB](https://github.com/oneapi-src/oneTBB/) threading library, while Torch uses [OpenMP](https://www.openmp.org/). Both threading libraries have ['busy-wait spin'](https://gcc.gnu.org/onlinedocs/libgomp/GOMP_005fSPINCOUNT.html) by default. When running LLM pipeline on CPU device, there is threading overhead in the switching between inference on CPU with OpenVINO (oneTBB) and postprocessing (For example: greedy search or beam search) with Torch (OpenMP).
+OpenVINO is by default built with [oneTBB](https://github.com/oneapi-src/oneTBB/) threading library, while Torch uses [OpenMP](https://www.openmp.org/). Both threading libraries have ['busy-wait spin'](https://gcc.gnu.org/onlinedocs/libgomp/GOMP_005fSPINCOUNT.html) by default. When running LLM pipeline on CPU device, there is threading overhead in the switching between inference on CPU with OpenVINO (oneTBB) and postprocessing (For example: greedy search or beam search) with Torch (OpenMP). The default benchmarking scenarion uses OpenVINO GenAI that implements own postprocessing api without additional dependencies.
 
 **Alternative solutions**
-1. Use --genai option which uses OpenVINO genai API instead of optimum-intel API. In this case postprocessing is executed with OpenVINO genai API.
-2. Without --genai option which uses optimum-intel API, set environment variable [OMP_WAIT_POLICY](https://gcc.gnu.org/onlinedocs/libgomp/OMP_005fWAIT_005fPOLICY.html) to PASSIVE which will disable OpenMP 'busy-wait', and benchmark.py will limit the Torch thread number by default to avoid using CPU cores which is in 'busy-wait' by OpenVINO inference. Users can also set the number with --set_torch_thread option.
+1. With --optimum option which uses optimum-intel API, set environment variable [OMP_WAIT_POLICY](https://gcc.gnu.org/onlinedocs/libgomp/OMP_005fWAIT_005fPOLICY.html) to PASSIVE which will disable OpenMP 'busy-wait', and benchmark.py will limit the Torch thread number by default to avoid using CPU cores which is in 'busy-wait' by OpenVINO inference. Users can also set the number with --set_torch_thread option.
 
 ## 7. Additional Resources
 
diff --git a/tools/llm_bench/benchmark.py b/tools/llm_bench/benchmark.py
index d652c8b48f..fe5068b009 100644
--- a/tools/llm_bench/benchmark.py
+++ b/tools/llm_bench/benchmark.py
@@ -130,7 +130,8 @@ def get_argprser():
     )
     parser.add_argument('-od', '--output_dir', help='Save the input text and generated text, images to files')
     llm_bench_utils.model_utils.add_stateful_model_arguments(parser)
-    parser.add_argument("--genai", action="store_true", help="Use OpenVINO GenAI optimized pipelines for benchmarking")
+    parser.add_argument("--genai", action="store_true", help="[DEPRECATED] Use OpenVINO GenAI optimized pipelines for benchmarking. Enabled by default")
+    parser.add_argument("--optimum", action="store_true", help="Use Optimum Intel pipelines for benchmarking")
     parser.add_argument(
         "--lora",
         nargs='*',
diff --git a/tools/llm_bench/llm_bench_utils/config_class.py b/tools/llm_bench/llm_bench_utils/config_class.py
index 2f6cd95664..12385d2879 100644
--- a/tools/llm_bench/llm_bench_utils/config_class.py
+++ b/tools/llm_bench/llm_bench_utils/config_class.py
@@ -7,9 +7,7 @@
 from optimum.intel.openvino import (
     OVModelForCausalLM,
     OVModelForSeq2SeqLM,
-    OVStableDiffusionPipeline,
-    OVLatentConsistencyModelPipeline,
-    OVStableDiffusionXLPipeline,
+    OVDiffusionPipeline,
     OVModelForSpeechSeq2Seq
 )
 from llm_bench_utils.ov_model_classes import OVMPTModel, OVLDMSuperResolutionPipeline, OVChatGLMModel
@@ -22,19 +20,14 @@
     'falcon': AutoTokenizer,
 }
 
+IMAGE_GEN_CLS = OVDiffusionPipeline
+
 OV_MODEL_CLASSES_MAPPING = {
     'decoder': OVModelForCausalLM,
     't5': OVModelForSeq2SeqLM,
     'blenderbot': OVModelForSeq2SeqLM,
     'falcon': OVModelForCausalLM,
     'mpt': OVMPTModel,
-    'stable-diffusion-xl': OVStableDiffusionXLPipeline,
-    'sdxl': OVStableDiffusionXLPipeline,
-    'lcm-sdxl': OVStableDiffusionXLPipeline,
-    'ssd-': OVStableDiffusionXLPipeline,
-    'lcm-ssd-': OVStableDiffusionXLPipeline,
-    'stable_diffusion': OVStableDiffusionPipeline,
-    'lcm': OVLatentConsistencyModelPipeline,
     'replit': OVMPTModel,
     'codet5': OVModelForSeq2SeqLM,
     'codegen2': OVModelForCausalLM,
@@ -57,7 +50,7 @@
 }
 
 USE_CASES = {
-    'image_gen': ['stable-diffusion-', 'ssd-', 'deepfloyd-if', 'tiny-sd', 'small-sd', 'lcm-', 'sdxl', 'dreamlike'],
+    'image_gen': ['stable-diffusion-', 'ssd-', 'tiny-sd', 'small-sd', 'lcm-', 'sdxl', 'dreamlike', "flux"],
     'speech2text': ['whisper'],
     'image_cls': ['vit'],
     'code_gen': ['replit', 'codegen2', 'codegen', 'codet5', "stable-code"],
diff --git a/tools/llm_bench/llm_bench_utils/metrics_print.py b/tools/llm_bench/llm_bench_utils/metrics_print.py
index de9d0126f8..73e83dc672 100644
--- a/tools/llm_bench/llm_bench_utils/metrics_print.py
+++ b/tools/llm_bench/llm_bench_utils/metrics_print.py
@@ -97,12 +97,17 @@ def print_stable_diffusion_infer_latency(iter_str, iter_data, stable_diffusion,
     prefix = f'[{iter_str}][P{prompt_idx}]'
     log.info(f"{prefix} First step of unet latency: {iter_data['first_token_latency']:.2f} ms/step, "
              f"other steps of unet latency: {iter_data['other_tokens_avg_latency']:.2f} ms/step",)
-    log.info(f"{prefix} Text encoder latency: {stable_diffusion.get_text_encoder_latency():.2f} ms/step, "
-             f"unet latency: {stable_diffusion.get_unet_latency():.2f} ms/step, "
-             f"vae decoder latency: {stable_diffusion.get_vae_decoder_latency():.2f} ms/step, "
-             f"text encoder step count: {stable_diffusion.get_text_encoder_step_count()}, "
-             f"unet step count: {stable_diffusion.get_unet_step_count()}, "
-             f"vae decoder step count: {stable_diffusion.get_vae_decoder_step_count()}",)
+    has_text_encoder_time = stable_diffusion.get_text_encoder_step_count() != -1
+    log_str = (
+        f"{prefix} Text encoder latency: {stable_diffusion.get_text_encoder_latency():.2f}" if has_text_encoder_time else f"{prefix} Text encoder latency: N/A "
+        f"unet latency: {stable_diffusion.get_unet_latency():.2f} ms/step, "
+        f"vae decoder latency: {stable_diffusion.get_vae_decoder_latency():.2f} ms/step, ")
+    if has_text_encoder_time:
+        log_str += f"text encoder step count: {stable_diffusion.get_text_encoder_step_count()}, "
+    log_str += (
+        f"unet step count: {stable_diffusion.get_unet_step_count()}, "
+        f"vae decoder step count: {stable_diffusion.get_vae_decoder_step_count()}")
+    log.info(log_str)
 
 
 def print_ldm_unet_vqvae_infer_latency(iter_num, iter_data, tms=None, warm_up=False, prompt_idx=-1):
diff --git a/tools/llm_bench/llm_bench_utils/model_utils.py b/tools/llm_bench/llm_bench_utils/model_utils.py
index 6539bef232..f72557b6c5 100644
--- a/tools/llm_bench/llm_bench_utils/model_utils.py
+++ b/tools/llm_bench/llm_bench_utils/model_utils.py
@@ -95,6 +95,13 @@ def analyze_args(args):
     model_args['torch_compile_input_module'] = args.torch_compile_input_module
     model_args['media'] = args.media
 
+    optimum = args.optimum
+
+    if optimum and args.genai:
+        raise RuntimeError("`--genai` and `--optimum` can not be selected in the same time")
+    model_args["optimum"] = optimum
+    model_args["genai"] = not optimum
+
     has_torch_compile_options = any([args.torch_compile_options is not None, args.torch_compile_options is not None, args.torch_compile_dynamic])
     if model_args["torch_compile_backend"] is None and has_torch_compile_options:
         log.warning("torch.compile configuration options provided, but backend is not selected, openvino backend will be used")
@@ -102,7 +109,6 @@ def analyze_args(args):
     model_args['convert_tokenizer'] = args.convert_tokenizer
     model_args['subsequent'] = args.subsequent
     model_args['output_dir'] = args.output_dir
-    model_args['genai'] = args.genai
     model_args['lora'] = args.lora
     model_args['lora_alphas'] = args.lora_alphas
     model_args["use_cb"] = args.use_cb
@@ -135,7 +141,7 @@ def analyze_args(args):
     model_args['model_type'] = get_model_type(model_name, use_case, model_framework)
     model_args['model_name'] = model_name
 
-    if (args.use_cb or args.draft_model) and not args.genai:
+    if (args.use_cb or args.draft_model) and optimum:
         raise RuntimeError("Continuous batching mode supported only via OpenVINO GenAI")
     cb_config = None
     if args.cb_config:
@@ -169,6 +175,11 @@ def get_use_case(model_name_or_path):
         config = json.loads(config_file.read_text())
     except Exception:
         config = None
+    if (Path(model_name_or_path) / "model_index.json").exists():
+        diffusers_config = json.loads((Path(model_name_or_path) / "model_index.json").read_text())
+        pipe_type = diffusers_config.get("_class_name")
+        if pipe_type in ["StableDiffusionPipeline", "StableDiffusionXLPipeline", "StableDiffusion3Pipeline", "FluxPipeline", "LatentConsistencyModelPipeline"]:
+            return "image_gen", pipe_type.replace("Pipeline", "")
 
     if config is not None:
         for case, model_ids in USE_CASES.items():
diff --git a/tools/llm_bench/llm_bench_utils/ov_utils.py b/tools/llm_bench/llm_bench_utils/ov_utils.py
index cf0d0d831c..9ebd1363e3 100644
--- a/tools/llm_bench/llm_bench_utils/ov_utils.py
+++ b/tools/llm_bench/llm_bench_utils/ov_utils.py
@@ -11,7 +11,7 @@
 import json
 import types
 from llm_bench_utils.hook_common import get_bench_hook
-from llm_bench_utils.config_class import OV_MODEL_CLASSES_MAPPING, TOKENIZE_CLASSES_MAPPING, DEFAULT_MODEL_CLASSES
+from llm_bench_utils.config_class import OV_MODEL_CLASSES_MAPPING, TOKENIZE_CLASSES_MAPPING, DEFAULT_MODEL_CLASSES, IMAGE_GEN_CLS
 import openvino.runtime.opset13 as opset
 from transformers import pipeline
 
@@ -171,11 +171,13 @@ def create_text_gen_model(model_path, device, **kwargs):
     if not model_path_existed:
         raise RuntimeError(f'==Failure ==: model path:{model_path} does not exist')
     else:
-        if kwargs.get("genai", False) and is_genai_available(log_msg=True):
+        if kwargs.get("genai", True) and is_genai_available(log_msg=True):
             if model_class not in [OV_MODEL_CLASSES_MAPPING[default_model_type], OV_MODEL_CLASSES_MAPPING["mpt"], OV_MODEL_CLASSES_MAPPING["chatglm"]]:
                 log.warning("OpenVINO GenAI based benchmarking is not available for {model_type}. Will be switched to default benchmarking")
             else:
+                log.info("Selected OpenVINO GenAI for benchmarking")
                 return create_genai_text_gen_model(model_path, device, ov_config, **kwargs)
+        log.info("Selected Optimum Intel for benchmarking")
         remote_code = False
         try:
             model_config = AutoConfig.from_pretrained(model_path, trust_remote_code=False)
@@ -295,23 +297,23 @@ def convert_ov_tokenizer(tokenizer_path):
 
 
 def create_image_gen_model(model_path, device, **kwargs):
-    default_model_type = DEFAULT_MODEL_CLASSES[kwargs['use_case']]
-    model_type = kwargs.get('model_type', default_model_type)
-    model_class = OV_MODEL_CLASSES_MAPPING[model_type]
+    model_class = IMAGE_GEN_CLS
     model_path = Path(model_path)
     ov_config = kwargs['config']
     if not Path(model_path).exists():
         raise RuntimeError(f'==Failure ==: model path:{model_path} does not exist')
     else:
-        if kwargs.get("genai", False) and is_genai_available(log_msg=True):
+        if kwargs.get("genai", True) and is_genai_available(log_msg=True):
+            log.info("Selected OpenVINO GenAI for benchmarking")
             return create_genai_image_gen_model(model_path, device, ov_config, **kwargs)
 
+        log.info("Selected Optimum Intel for benchmarking")
         start = time.perf_counter()
         ov_model = model_class.from_pretrained(model_path, device=device, ov_config=ov_config)
         end = time.perf_counter()
     from_pretrained_time = end - start
     log.info(f'From pretrained time: {from_pretrained_time:.2f}s')
-    return ov_model, from_pretrained_time, False
+    return ov_model, from_pretrained_time, False, None
 
 
 def get_genai_clip_text_encoder(model_index_data, model_path, device, ov_config):
@@ -350,6 +352,51 @@ def get_genai_unet_model(model_index_data, model_path, device, ov_config):
 def create_genai_image_gen_model(model_path, device, ov_config, **kwargs):
     import openvino_genai
 
+    class PerfCollector:
+        def __init__(self) -> types.NoneType:
+            self.iteration_time = []
+            self.start_time = time.perf_counter()
+            self.duration = -1
+
+        def __call__(self, step, latents):
+            self.iteration_time.append(time.perf_counter() - self.start_time)
+            self.start_time = time.perf_counter()
+            return False
+
+        def reset(self):
+            self.iteration_time = []
+            self.start_time = time.perf_counter()
+            self.duration = -1
+
+        def get_1st_unet_latency(self):
+            return self.iteration_time[0] * 1000 if len(self.iteration_time) > 0 else 0
+
+        def get_2nd_unet_latency(self):
+            return sum(self.iteration_time[1:]) / (len(self.iteration_time) - 1) * 1000 if len(self.iteration_time) > 1 else 0
+
+        def get_unet_latency(self):
+            return (sum(self.iteration_time) / len(self.iteration_time)) * 1000 if len(self.iteration_time) > 0 else 0
+
+        def get_vae_decoder_latency(self):
+            if self.duration != -1:
+                vae_time = self.duration - sum(self.iteration_time)
+                return vae_time * 1000
+            return 0
+
+        def get_text_encoder_latency(self):
+            return -1
+
+        def get_text_encoder_step_count(self):
+            return -1
+
+        def get_unet_step_count(self):
+            return len(self.iteration_time)
+
+        def get_vae_decoder_step_count(self):
+            return 1
+
+    callback = PerfCollector()
+
     adapter_config = get_lora_config(kwargs.get("lora", None), kwargs.get("lora_alphas", []))
     if adapter_config:
         ov_config['adapters'] = adapter_config
@@ -393,7 +440,7 @@ def create_genai_image_gen_model(model_path, device, ov_config, **kwargs):
 
     end = time.perf_counter()
     log.info(f'Pipeline initialization time: {end - start:.2f}s')
-    return t2i_pipe, end - start, True
+    return t2i_pipe, end - start, True, callback
 
 
 def create_ldm_super_resolution_model(model_path, device, **kwargs):
@@ -414,7 +461,7 @@ def create_ldm_super_resolution_model(model_path, device, **kwargs):
 
 def create_genai_speech_2_txt_model(model_path, device, **kwargs):
     import openvino_genai as ov_genai
-    if kwargs.get("genai", False) is False:
+    if kwargs.get("genai", True) is False:
         raise RuntimeError('==Failure the command line does not set --genai ==')
     if is_genai_available(log_msg=True) is False:
         raise RuntimeError('==Failure genai is not enable ==')
@@ -442,11 +489,13 @@ def create_speech_2txt_model(model_path, device, **kwargs):
     if not model_path_existed:
         raise RuntimeError(f'==Failure ==: model path:{model_path} does not exist')
     else:
-        if kwargs.get("genai", False) and is_genai_available(log_msg=True):
+        if kwargs.get("genai", True) and is_genai_available(log_msg=True):
             if model_class not in [OV_MODEL_CLASSES_MAPPING[default_model_type]]:
                 log.warning("OpenVINO GenAI based benchmarking is not available for {model_type}. Will be switched to default bencmarking")
             else:
+                log.info("Selected OpenVINO GenAI for benchmarking")
                 return create_genai_speech_2_txt_model(model_path, device, **kwargs)
+        log.info("Selected Optimum Intel for benchmarking")
         start = time.perf_counter()
         ov_model = model_class.from_pretrained(
             model_path,
diff --git a/tools/llm_bench/task/image_generation.py b/tools/llm_bench/task/image_generation.py
index b6260568bf..f227898ef6 100644
--- a/tools/llm_bench/task/image_generation.py
+++ b/tools/llm_bench/task/image_generation.py
@@ -41,7 +41,7 @@ def collects_input_args(image_param, model_type, model_name):
     return input_args
 
 
-def run_image_generation(image_param, num, image_id, pipe, args, iter_data_list, proc_id, mem_consumption):
+def run_image_generation(image_param, num, image_id, pipe, args, iter_data_list, proc_id, mem_consumption, callback=None):
     set_seed(args['seed'])
     input_text = image_param['prompt']
     input_args = collects_input_args(image_param, args['model_type'], args['model_name'])
@@ -104,7 +104,7 @@ def run_image_generation(image_param, num, image_id, pipe, args, iter_data_list,
     stable_diffusion_hook.clear_statistics()
 
 
-def run_image_generation_genai(image_param, num, image_id, pipe, args, iter_data_list, proc_id, mem_consumption):
+def run_image_generation_genai(image_param, num, image_id, pipe, args, iter_data_list, proc_id, mem_consumption, callback=None):
     set_seed(args['seed'])
     input_text = image_param['prompt']
     input_args = collects_input_args(image_param, args['model_type'], args['model_name'])
@@ -125,9 +125,11 @@ def run_image_generation_genai(image_param, num, image_id, pipe, args, iter_data
     if num == 0 and args["output_dir"] is not None:
         for bs_idx, in_text in enumerate(input_text_list):
             llm_bench_utils.output_file.output_image_input_text(in_text, args, image_id, bs_idx, proc_id)
+    callback.reset()
     start = time.perf_counter()
-    res = pipe.generate(input_text, **input_args).data
+    res = pipe.generate(input_text, **input_args, callback=callback).data
     end = time.perf_counter()
+    callback.duration = end - start
     if (args['mem_consumption'] == 1 and num == 0) or args['mem_consumption'] == 2:
         mem_consumption.end_collect_momory_consumption()
         max_rss_mem_consumption, max_shared_mem_consumption, max_uss_mem_consumption = mem_consumption.get_max_memory_consumption()
@@ -155,7 +157,7 @@ def run_image_generation_genai(image_param, num, image_id, pipe, args, iter_data
         max_rss_mem=max_rss_mem_consumption,
         max_shared_mem=max_shared_mem_consumption,
         max_uss_mem=max_uss_mem_consumption,
-        stable_diffusion=None,
+        stable_diffusion=callback,
         prompt_idx=image_id
     )
     metrics_print.print_generated(num, warm_up=(num == 0), generated=rslt_img_fn, prompt_idx=image_id)
@@ -163,7 +165,7 @@ def run_image_generation_genai(image_param, num, image_id, pipe, args, iter_data
 
 
 def run_image_generation_benchmark(model_path, framework, device, args, num_iters, mem_consumption):
-    pipe, pretrain_time, use_genai = FW_UTILS[framework].create_image_gen_model(model_path, device, **args)
+    pipe, pretrain_time, use_genai, callback = FW_UTILS[framework].create_image_gen_model(model_path, device, **args)
     iter_data_list = []
     input_image_list = get_image_prompt(args)
     if framework == "ov" and not use_genai:
@@ -198,7 +200,7 @@ def run_image_generation_benchmark(model_path, framework, device, args, num_iter
             for image_id, image_param in enumerate(image_list):
                 p_idx = prompt_idx_list[image_id]
                 iter_timestamp[num][p_idx]['start'] = datetime.datetime.now().isoformat()
-                image_gen_fn(image_param, num, prompt_idx_list[image_id], pipe, args, iter_data_list, proc_id, mem_consumption)
+                image_gen_fn(image_param, num, prompt_idx_list[image_id], pipe, args, iter_data_list, proc_id, mem_consumption, callback)
                 iter_timestamp[num][p_idx]['end'] = datetime.datetime.now().isoformat()
                 prefix = '[warm-up]' if num == 0 else '[{}]'.format(num)
                 log.info(f"{prefix}[P{p_idx}] start: {iter_timestamp[num][p_idx]['start']}, end: {iter_timestamp[num][p_idx]['end']}")

From 21037497e6958c7df020131d77984a953a4beb08 Mon Sep 17 00:00:00 2001
From: Pavel Esir <pavel.esir@intel.com>
Date: Mon, 25 Nov 2024 12:09:04 +0100
Subject: [PATCH 13/24] align with the openvino_tokenizers

---
 src/cpp/include/openvino/genai/tokenizer.hpp |  6 +++---
 src/cpp/src/make_tokenizer_stateful.cpp      | 13 +++++++++----
 src/cpp/src/tokenizer.cpp                    |  5 ++---
 src/python/py_tokenizer.cpp                  |  6 +++---
 4 files changed, 17 insertions(+), 13 deletions(-)

diff --git a/src/cpp/include/openvino/genai/tokenizer.hpp b/src/cpp/include/openvino/genai/tokenizer.hpp
index 8d2d63ea80..36f63d2b5e 100644
--- a/src/cpp/include/openvino/genai/tokenizer.hpp
+++ b/src/cpp/include/openvino/genai/tokenizer.hpp
@@ -87,7 +87,7 @@ class OPENVINO_GENAI_EXPORTS Tokenizer {
     /**
     * @brief decode sequence of tokens
     * @param tokens vector storing tokens
-    * @param tokenization_params AnyMap with detokenization parameters, e.g. {"skip_special_tokens", false}
+    * @param detokenization_params AnyMap with detokenization parameters, e.g. {"skip_special_tokens", false}
     * @return sequence string
     */
     std::string decode(std::vector<int64_t> tokens, const ov::AnyMap& detokenization_params = {});
@@ -106,7 +106,7 @@ class OPENVINO_GENAI_EXPORTS Tokenizer {
     /**
     * @brief decode tokens. 
     * @param tokens ov::Tensor with tokens with shape [batch_size, seq_len]
-    * @param tokenization_params AnyMap with detokenization parameters, e.g. {"skip_special_tokens", false}
+    * @param detokenization_params AnyMap with detokenization parameters, e.g. {"skip_special_tokens", false}
     * @return vector of std::string, with size = batch_size
     */
     std::vector<std::string> decode(ov::Tensor tokens, const ov::AnyMap& detokenization_params = {});
@@ -125,7 +125,7 @@ class OPENVINO_GENAI_EXPORTS Tokenizer {
     /**
     * @brief batched decoding of tokens. 
     * @param tokens vector of vectors with tokens, tokens.size() is equal to batch_size
-    * @param tokenization_params AnyMap with detokenization parameters, e.g. {"skip_special_tokens", false}
+    * @param detokenization_params AnyMap with detokenization parameters, e.g. {"skip_special_tokens", false}
     * @return vector of std::string, with size equal to batch_size
     */
     std::vector<std::string> decode(std::vector<std::vector<int64_t>> tokens, const ov::AnyMap& detokenization_params = {});
diff --git a/src/cpp/src/make_tokenizer_stateful.cpp b/src/cpp/src/make_tokenizer_stateful.cpp
index 3551e713c9..4685b0e715 100644
--- a/src/cpp/src/make_tokenizer_stateful.cpp
+++ b/src/cpp/src/make_tokenizer_stateful.cpp
@@ -60,7 +60,8 @@ bool ov::genai::MakeVocabDecoderSatateful::run_on_model(const std::shared_ptr<ov
         return false;
     
     std::shared_ptr<v0::Constant> skip_tokens_const = std::dynamic_pointer_cast<v0::Constant>(vocab_decoder_node->get_input_node_shared_ptr(4));
-    if (!skip_tokens_const)
+    std::shared_ptr<v8::Slice> skip_tokens_slice = std::dynamic_pointer_cast<v8::Slice>(vocab_decoder_node->get_input_node_shared_ptr(4));
+    if (!skip_tokens_const && !skip_tokens_slice)
         return false;
 
     auto start_const = std::make_shared<v0::Constant>(ov::element::i32, ov::Shape{1}, std::vector{0});
@@ -74,10 +75,14 @@ bool ov::genai::MakeVocabDecoderSatateful::run_on_model(const std::shared_ptr<ov
     // if flag is set, then slice up to the int_max which means skip all tokens.
     auto stop = std::make_shared<v1::Multiply>(int_max_const, read_value);
 
-    std::shared_ptr<v8::Slice> slice_node = std::make_shared<v8::Slice>(skip_tokens_const, start_const, stop, one_const);
+    // If already has slice just replace the stop input.
+    if (skip_tokens_slice) {
+        skip_tokens_slice->input(2).replace_source_output(stop);
+    } else {
+        std::shared_ptr<v8::Slice> slice_node = std::make_shared<v8::Slice>(skip_tokens_const, start_const, stop, one_const);
+        vocab_decoder_node->input(4).replace_source_output(slice_node->output(0));
+    }
     
-    vocab_decoder_node->input(4).replace_source_output(slice_node->output(0));
-
     auto assign = std::make_shared<v6::Assign>(read_value, variable);
     model->add_sinks({assign});
     model->add_variables({variable});
diff --git a/src/cpp/src/tokenizer.cpp b/src/cpp/src/tokenizer.cpp
index fc6ba75d90..d0a472a40f 100644
--- a/src/cpp/src/tokenizer.cpp
+++ b/src/cpp/src/tokenizer.cpp
@@ -74,7 +74,7 @@ class Tokenizer::TokenizerImpl {
     // To change the adding special tokens mode we use a statefull subgraph, 
     // this flag holds the current state value of the CompiledModel.
     bool m_add_special_tokens = true;
-    bool m_skip_special_tokens = false;
+    bool m_skip_special_tokens = true;
     bool m_older_than_24_5 = false;
     
     int64_t m_pad_token_id = -1;
@@ -89,7 +89,7 @@ class Tokenizer::TokenizerImpl {
 
     void set_state_if_necessary(CircularBufferQueueElementGuard<ov::InferRequest>& infer_request_guard, const ov::AnyMap& params) {
         bool add_special_tokens_flag = true;
-        bool skip_special_tokens_flag = false;
+        bool skip_special_tokens_flag = true;
         ov::genai::utils::read_anymap_param(params, add_special_tokens.name(), add_special_tokens_flag);
         ov::genai::utils::read_anymap_param(params, skip_special_tokens.name(), skip_special_tokens_flag);
 
@@ -164,7 +164,6 @@ class Tokenizer::TokenizerImpl {
             m_detokenizer = core.compile_model(ov_detokenizer, device, properties);
         }
 
-        
         const size_t INFER_REQUEST_QUEUE_SIZE = m_tokenizer.get_property(ov::optimal_number_of_infer_requests);
         m_ireq_queue_tokenizer = std::make_unique<CircularBufferQueue<ov::InferRequest>>(
             INFER_REQUEST_QUEUE_SIZE,
diff --git a/src/python/py_tokenizer.cpp b/src/python/py_tokenizer.cpp
index dae2ffe775..db4643a65c 100644
--- a/src/python/py_tokenizer.cpp
+++ b/src/python/py_tokenizer.cpp
@@ -68,7 +68,7 @@ void init_tokenizer(py::module_& m) {
                 detokenization_params[ov::genai::skip_special_tokens.name()] = skip_special_tokens;
                 return pyutils::handle_utf8(tok.decode(tokens, detokenization_params));
             },
-            py::arg("tokens"), py::arg("skip_special_tokens") = false,
+            py::arg("tokens"), py::arg("skip_special_tokens") = true,
             R"(Decode a sequence into a string prompt.)"
         )
 
@@ -79,7 +79,7 @@ void init_tokenizer(py::module_& m) {
                 detokenization_params[ov::genai::skip_special_tokens.name()] = skip_special_tokens;
                 return pyutils::handle_utf8(tok.decode(tokens, detokenization_params));
             },
-            py::arg("tokens"), py::arg("skip_special_tokens") = false,
+            py::arg("tokens"), py::arg("skip_special_tokens") = true,
             R"(Decode tensor into a list of string prompts.)")
 
         .def(
@@ -89,7 +89,7 @@ void init_tokenizer(py::module_& m) {
                 detokenization_params[ov::genai::skip_special_tokens.name()] = skip_special_tokens;
                 return pyutils::handle_utf8(tok.decode(tokens, detokenization_params));
             },
-            py::arg("tokens"), py::arg("skip_special_tokens") = false,
+            py::arg("tokens"), py::arg("skip_special_tokens") = true,
             R"(Decode a batch of tokens into a list of string prompt.)")
 
         .def("apply_chat_template", [](Tokenizer& tok,

From d26233b172d60063e50257058513a560e8e591b1 Mon Sep 17 00:00:00 2001
From: Pavel Esir <pavel.esir@intel.com>
Date: Mon, 25 Nov 2024 12:56:37 +0100
Subject: [PATCH 14/24] update signature

---
 src/python/openvino_genai/py_openvino_genai.pyi | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/python/openvino_genai/py_openvino_genai.pyi b/src/python/openvino_genai/py_openvino_genai.pyi
index df290a9744..5e4d2dd7b2 100644
--- a/src/python/openvino_genai/py_openvino_genai.pyi
+++ b/src/python/openvino_genai/py_openvino_genai.pyi
@@ -1303,17 +1303,17 @@ class Tokenizer:
         Embeds input prompts with special tags for a chat scenario.
         """
     @typing.overload
-    def decode(self, tokens: list[int]) -> str:
+    def decode(self, tokens: list[int], skip_special_tokens: bool = True) -> str:
         """
         Decode a sequence into a string prompt.
         """
     @typing.overload
-    def decode(self, tokens: openvino._pyopenvino.Tensor) -> list[str]:
+    def decode(self, tokens: openvino._pyopenvino.Tensor, skip_special_tokens: bool = True) -> list[str]:
         """
         Decode tensor into a list of string prompts.
         """
     @typing.overload
-    def decode(self, tokens: list[list[int]]) -> list[str]:
+    def decode(self, tokens: list[list[int]], skip_special_tokens: bool = True) -> list[str]:
         """
         Decode a batch of tokens into a list of string prompt.
         """

From 111bb5bb2afe5b6cc4b01ea935ed7af38c6075de Mon Sep 17 00:00:00 2001
From: Pavel Esir <pavel.esir@intel.com>
Date: Tue, 26 Nov 2024 10:45:37 +0100
Subject: [PATCH 15/24] add barier for AnyMap key names, apply review comments

---
 src/cpp/src/tokenizer.cpp | 30 ++++++++++++++++++++++--------
 1 file changed, 22 insertions(+), 8 deletions(-)

diff --git a/src/cpp/src/tokenizer.cpp b/src/cpp/src/tokenizer.cpp
index d0a472a40f..41f9a6abd4 100644
--- a/src/cpp/src/tokenizer.cpp
+++ b/src/cpp/src/tokenizer.cpp
@@ -55,6 +55,14 @@ ov::genai::TokenizedInputs pad_left(ov::Tensor& input_ids, ov::Tensor& attention
     return {input_ids, attention_mask};
 }
 
+void check_arguments(const ov::AnyMap& parameters, std::set<std::string> allowed_argnames) {
+    for (const auto& [key, value] : parameters) {
+        if (allowed_argnames.find(key) == allowed_argnames.end()) {
+            OPENVINO_THROW("unacceptable parameter key: " + key);
+        }
+    }
+}
+
 constexpr char bos_token_key_name[] = "bos_token";
 constexpr char eos_token_key_name[] = "eos_token";
 constexpr char pad_token_key_name[] = "pad_token";
@@ -88,8 +96,8 @@ class Tokenizer::TokenizerImpl {
     std::string m_chat_template = {};
 
     void set_state_if_necessary(CircularBufferQueueElementGuard<ov::InferRequest>& infer_request_guard, const ov::AnyMap& params) {
-        bool add_special_tokens_flag = true;
-        bool skip_special_tokens_flag = true;
+        bool add_special_tokens_flag = m_add_special_tokens;
+        bool skip_special_tokens_flag = m_skip_special_tokens;
         ov::genai::utils::read_anymap_param(params, add_special_tokens.name(), add_special_tokens_flag);
         ov::genai::utils::read_anymap_param(params, skip_special_tokens.name(), skip_special_tokens_flag);
 
@@ -145,7 +153,7 @@ class Tokenizer::TokenizerImpl {
 
         auto device = "CPU"; // currently openvino_tokenizer supports only CPU
         auto ov_tokenizer = core.read_model(tokenizer_path / "openvino_tokenizer.xml");
-        std::shared_ptr<ov::Model> ov_detokenizer;
+        std::shared_ptr<ov::Model> ov_detokenizer = nullptr;
         if (std::filesystem::exists(tokenizer_path / "openvino_detokenizer.xml")) {
             ov_detokenizer = core.read_model(tokenizer_path / "openvino_detokenizer.xml");
         }
@@ -155,12 +163,11 @@ class Tokenizer::TokenizerImpl {
         manager_tok.register_pass<MakeCombineSegmentsSatateful>();
         manager_tok.run_passes(ov_tokenizer);
         
-        ov::pass::Manager manager_detok;
-        manager_detok.register_pass<MakeVocabDecoderSatateful>();
-        manager_detok.run_passes(ov_detokenizer);
-        
         m_tokenizer = core.compile_model(ov_tokenizer, device, properties);
-        if (std::filesystem::exists(tokenizer_path / "openvino_detokenizer.xml")) {
+        if (ov_detokenizer) {
+            ov::pass::Manager manager_detok;
+            manager_detok.register_pass<MakeVocabDecoderSatateful>();
+            manager_detok.run_passes(ov_detokenizer);
             m_detokenizer = core.compile_model(ov_detokenizer, device, properties);
         }
 
@@ -516,30 +523,37 @@ Tokenizer::Tokenizer(const std::filesystem::path& tokenizer_path, const ov::AnyM
 }
 
 TokenizedInputs Tokenizer::encode(const std::string prompt, const ov::AnyMap& tokenization_params) {
+    check_arguments(tokenization_params, {ov::genai::add_special_tokens.name()});
     return m_pimpl->encode(std::move(prompt), tokenization_params);
 }
 
 TokenizedInputs Tokenizer::encode(std::vector<std::string>& prompts, const ov::AnyMap& tokenization_params) {
+    check_arguments(tokenization_params, {ov::genai::add_special_tokens.name()});
     return m_pimpl->encode(prompts, tokenization_params);
 }
 
 TokenizedInputs Tokenizer::encode(std::vector<std::string>&& prompts, const ov::AnyMap& tokenization_params) {
+    check_arguments(tokenization_params, {ov::genai::add_special_tokens.name()});
     return m_pimpl->encode(prompts, tokenization_params);
 }
 
 TokenizedInputs Tokenizer::encode(std::initializer_list<std::string>& text, const ov::AnyMap& tokenization_params) {
+    check_arguments(tokenization_params, {ov::genai::add_special_tokens.name()});
     return encode(std::vector<std::string>(text.begin(), text.end()), tokenization_params);
 }
 
 std::string Tokenizer::decode(std::vector<int64_t> tokens, const ov::AnyMap& detokenization_params) {
+    check_arguments(detokenization_params, {ov::genai::skip_special_tokens.name()});
     return m_pimpl->decode(tokens, detokenization_params);
 }
 
 std::vector<std::string> Tokenizer::decode(ov::Tensor tokens, const ov::AnyMap& detokenization_params) {
+    check_arguments(detokenization_params, {ov::genai::skip_special_tokens.name()});
     return m_pimpl->decode(tokens, detokenization_params);
 }
 
 std::vector<std::string> Tokenizer::decode(std::vector<std::vector<int64_t>> lines, const ov::AnyMap& detokenization_params) {
+    check_arguments(detokenization_params, {ov::genai::skip_special_tokens.name()});
     return m_pimpl->decode(lines, detokenization_params);
 }
 

From 3da2aebb6829856d25f391ae5f0e9d069cca6cd9 Mon Sep 17 00:00:00 2001
From: Ilya Lavrenov <ilya.lavrenov@intel.com>
Date: Wed, 27 Nov 2024 00:04:10 +0400
Subject: [PATCH 16/24] [Build] Use officially released py-build-cmake version
 (#1253)

---
 pyproject.toml | 21 ++++++++++++++++++---
 1 file changed, 18 insertions(+), 3 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index c9d5dce207..de3e5b5a9e 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -3,16 +3,31 @@ name = "openvino-genai"
 version = "2025.0.0.0"
 description = "Library of the most popular Generative AI model pipelines, optimized execution methods, and samples"
 requires-python = ">=3.9"
-readme = {file = "src/README.md", content-type="text/markdown"}
-license = {text = "OSI Approved :: Apache Software License"}
+readme = { file = "src/README.md", content-type="text/markdown" }
+license = { "file" = "LICENSE" }
 authors = [
     { name = "OpenVINO Developers", email = "openvino@intel.com" },
 ]
 classifiers = [
+    "Development Status :: 5 - Production/Stable",
+    "License :: OSI Approved :: Apache Software License",
+    "Intended Audience :: Developers",
+    "Intended Audience :: Science/Research",
+    "Topic :: Scientific/Engineering :: Artificial Intelligence",
+    "Topic :: Software Development :: Libraries :: Python Modules",
     "Programming Language :: Python :: 3.9",
     "Programming Language :: Python :: 3.10",
     "Programming Language :: Python :: 3.11",
     "Programming Language :: Python :: 3.12",
+    "Programming Language :: Python :: 3.13",
+    "Operating System :: Unix",
+    "Operating System :: POSIX :: Linux",
+    "Operating System :: Microsoft :: Windows",
+    "Operating System :: MacOS",
+    "Programming Language :: C++",
+    "Programming Language :: C",
+    "Programming Language :: Python :: 3 :: Only",
+    "Programming Language :: Python :: Implementation :: CPython"
 ]
 dependencies = [
     "openvino_tokenizers~=2025.0.0.0.dev"
@@ -36,7 +51,7 @@ options = {"BUILD_TOKENIZERS" = "OFF"}
 
 [build-system]
 requires = [
-    "py-build-cmake@git+https://github.com/tttapa/py-build-cmake@7ab73da351c7140f06d727a8705bece4cf544cd9",
+    "py-build-cmake==0.3.0",
     "openvino~=2025.0.0.0.dev",
     "pybind11-stubgen==2.5.1",
     "cmake~=3.23.0"

From fa1e95e965f915b0a1dab3b548967329f87925eb Mon Sep 17 00:00:00 2001
From: Dmitry Matveev <dmitry.matveev@intel.com>
Date: Wed, 27 Nov 2024 16:08:54 +0000
Subject: [PATCH 17/24] NPUW Deref: Baseline - don't hold pointers to the orig
 models (#1259)

---
 src/cpp/src/llm_pipeline_static.cpp | 32 ++++++++++++++---------------
 src/cpp/src/llm_pipeline_static.hpp |  4 ----
 2 files changed, 16 insertions(+), 20 deletions(-)

diff --git a/src/cpp/src/llm_pipeline_static.cpp b/src/cpp/src/llm_pipeline_static.cpp
index 2beb7d64be..597b5f69ac 100644
--- a/src/cpp/src/llm_pipeline_static.cpp
+++ b/src/cpp/src/llm_pipeline_static.cpp
@@ -675,45 +675,45 @@ void StaticLLMPipeline::setupAndCompileModels(
     // NB: Get information about NPU if available
     auto npudesc = extract_npu_descriptor(core);
     // (1) Read the template model - this will be kvcache model
-    m_kvcache_model = core.read_model((models_path / "openvino_model.xml").string());
+    auto kvcache_model = core.read_model((models_path / "openvino_model.xml").string());
     // (2) Expose KV-cache input and output layers from kvcache model
-    ov::pass::StatefulToStateless().run_on_model(m_kvcache_model);
+    ov::pass::StatefulToStateless().run_on_model(kvcache_model);
     // (3) Align u4 ZP constants
-    align_u4_zp_constants(m_kvcache_model);
+    align_u4_zp_constants(kvcache_model);
     // (4) Clone the model - this will be prefill
-    m_prefill_model = m_kvcache_model->clone();
-    m_prefill_model->set_friendly_name(m_kvcache_model->get_friendly_name() + "_prefill");
+    auto prefill_model = kvcache_model->clone();
+    prefill_model->set_friendly_name(kvcache_model->get_friendly_name() + "_prefill");
     // (5) Reshape both models to static shape
     const uint32_t kMaxPromptLen = align_to(pop_int_and_cast(properties, "MAX_PROMPT_LEN").value_or(1024u), 64u);
     const uint32_t kMinResponseLen = align_to(pop_int_and_cast(properties, "MIN_RESPONSE_LEN").value_or(128u), 64u);
     ModelDesc model_desc = get_modeldesc_from_json(models_path / "config.json");
     KVAxesPosition axes = get_kv_axes(model_desc.type);
     m_kvcache_desc = KVCacheDesc { kMaxPromptLen, kMaxPromptLen + kMinResponseLen, 0u, axes.seq_len, false};
-    reshape_to_static(m_prefill_model, m_kvcache_desc.max_prompt_size, m_kvcache_desc.max_prompt_size, axes);
-    reshape_to_static(m_kvcache_model, 1u, m_kvcache_desc.total_size, axes);
+    reshape_to_static(prefill_model, m_kvcache_desc.max_prompt_size, m_kvcache_desc.max_prompt_size, axes);
+    reshape_to_static(kvcache_model, 1u, m_kvcache_desc.total_size, axes);
     // (6) Apply opt layout if applicable
     // NB: Try to apply opt transpose only for Llama-2-7b-chat-hf model
     if ( model_desc.name_or_path == "meta-llama/Llama-2-7b-chat-hf" ||
         (model_desc.type == "llama" && model_desc.num_key_value_heads == 32)) {
-        if (optimize_value_tensors(m_kvcache_model)) {
+        if (optimize_value_tensors(kvcache_model)) {
             // NB: Check if TransposeValueTensors transformation was applied
             m_kvcache_desc.v_tensors_transposed = true;
-            m_prefill_model = cvt_value_tensors_layout(m_prefill_model);
+            prefill_model = cvt_value_tensors_layout(prefill_model);
         }
     }
     // (7) Replace KV-cache tensors for the entire cache to tensors only for new token (before concat)
-    m_kvcache_model = redirect_new_kv_to_output(m_kvcache_model);
+    kvcache_model = redirect_new_kv_to_output(kvcache_model);
     // (8) Convert kvcache tensors to fp16 precision
-    m_kvcache_model = cvt_kvcache_to_fp16(m_kvcache_model);
-    m_prefill_model = cvt_kvcache_to_fp16(m_prefill_model);
+    kvcache_model = cvt_kvcache_to_fp16(kvcache_model);
+    prefill_model = cvt_kvcache_to_fp16(prefill_model);
     // (9) Compile both model
     auto prefill_config = pop_or_default(
-        properties, "PREFILL_CONFIG", get_default_prefill_config(m_prefill_model, npudesc)
+        properties, "PREFILL_CONFIG", get_default_prefill_config(prefill_model, npudesc)
     );
     // NB: GENERATE_HINT is only applicable for default generate config!
     auto generate_hint = str_to_hint(pop_or_default<std::string>(properties, "GENERATE_HINT", "FAST_COMPILE"));
     auto generate_config = pop_or_default(
-        properties, "GENERATE_CONFIG", get_default_generate_config(m_kvcache_model, npudesc, generate_hint)
+        properties, "GENERATE_CONFIG", get_default_generate_config(kvcache_model, npudesc, generate_hint)
     );
     merge_config_with(prefill_config, properties);
     merge_config_with(generate_config, properties);
@@ -722,10 +722,10 @@ void StaticLLMPipeline::setupAndCompileModels(
     drop_cache_dir(generate_config);
 
     m_kvcache_request = core.compile_model(
-        m_kvcache_model, device, generate_config
+        kvcache_model, device, generate_config
     ).create_infer_request();
     m_prefill_request = core.compile_model(
-        m_prefill_model, device, prefill_config
+        prefill_model, device, prefill_config
     ).create_infer_request();
 }
 
diff --git a/src/cpp/src/llm_pipeline_static.hpp b/src/cpp/src/llm_pipeline_static.hpp
index 2f9969f5d7..d8e59d867a 100644
--- a/src/cpp/src/llm_pipeline_static.hpp
+++ b/src/cpp/src/llm_pipeline_static.hpp
@@ -61,10 +61,6 @@ class StaticLLMPipeline final : public LLMPipelineImplBase {
         bool v_tensors_transposed;
     };
 
-    // FIXME: Ideally, we don't need to keep those
-    std::shared_ptr<ov::Model> m_kvcache_model;
-    std::shared_ptr<ov::Model> m_prefill_model;
-
     KVCacheDesc m_kvcache_desc;
     ov::InferRequest m_kvcache_request;
     ov::InferRequest m_prefill_request;

From 86068a5377466045ecda18c2181495e83ddeb19f Mon Sep 17 00:00:00 2001
From: Anna Likholat <anna.likholat@intel.com>
Date: Thu, 28 Nov 2024 11:34:24 +0100
Subject: [PATCH 18/24] Text2Image SDXL fix for GPU (#1266)

CVS-156801
---
 .../image_generation/stable_diffusion_xl_pipeline.hpp | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/src/cpp/src/image_generation/stable_diffusion_xl_pipeline.hpp b/src/cpp/src/image_generation/stable_diffusion_xl_pipeline.hpp
index e7c8c35ce3..3c9130898f 100644
--- a/src/cpp/src/image_generation/stable_diffusion_xl_pipeline.hpp
+++ b/src/cpp/src/image_generation/stable_diffusion_xl_pipeline.hpp
@@ -111,12 +111,19 @@ class StableDiffusionXLPipeline : public DiffusionPipeline {
             OPENVINO_THROW("Unsupported '", unet, "' UNet type");
         }
 
+        // Temporary fix for GPU
+        ov::AnyMap updated_roperties = properties;
+        if (device.find("GPU") != std::string::npos &&
+            updated_roperties.find("INFERENCE_PRECISION_HINT") == updated_roperties.end()) {
+            updated_roperties["INFERENCE_PRECISION_HINT"] = ov::element::f32;
+        }
+
         const std::string vae = data["vae"][1].get<std::string>();
         if (vae == "AutoencoderKL") {
             if (m_pipeline_type == PipelineType::TEXT_2_IMAGE)
-                m_vae = std::make_shared<AutoencoderKL>(root_dir / "vae_decoder", device, properties);
+                m_vae = std::make_shared<AutoencoderKL>(root_dir / "vae_decoder", device, updated_roperties);
             else if (m_pipeline_type == PipelineType::IMAGE_2_IMAGE) {
-                m_vae = std::make_shared<AutoencoderKL>(root_dir / "vae_encoder", root_dir / "vae_decoder", device, properties);
+                m_vae = std::make_shared<AutoencoderKL>(root_dir / "vae_encoder", root_dir / "vae_decoder", device, updated_roperties);
             } else {
                 OPENVINO_ASSERT("Unsupported pipeline type");
             }

From 13f1b446b593843397f29fabf90f91c14791f204 Mon Sep 17 00:00:00 2001
From: Ilya Lavrenov <ilya.lavrenov@intel.com>
Date: Thu, 28 Nov 2024 19:16:16 +0400
Subject: [PATCH 19/24] Try to drop --pre (#1269)

- `--pre` is not required for OpenVINO wheels as `~=2025.0.0.0.dev`
already ensures that pre-releases can be installed
- `--pre` affects all other packages, which leads to installation of
unstable versions and broken whisper CI
https://github.com/openvinotoolkit/openvino.genai/actions/runs/12056078081/job/33618027551?pr=1267
---
 .../actions/install_python_deps/action.yml    |  4 +-
 .github/workflows/causal_lm_cpp.yml           | 60 +++++++++----------
 .github/workflows/lcm_dreamshaper_cpp.yml     |  4 +-
 .github/workflows/linux.yml                   |  6 +-
 .github/workflows/mac.yml                     |  6 +-
 .../workflows/stable_diffusion_1_5_cpp.yml    |  4 +-
 .github/workflows/windows.yml                 |  8 +--
 samples/deployment-requirements.txt           |  3 +-
 samples/export-requirements.txt               |  3 +-
 src/README.md                                 |  2 +-
 10 files changed, 49 insertions(+), 51 deletions(-)

diff --git a/.github/actions/install_python_deps/action.yml b/.github/actions/install_python_deps/action.yml
index 8f269cc42e..3b42f5fd9b 100644
--- a/.github/actions/install_python_deps/action.yml
+++ b/.github/actions/install_python_deps/action.yml
@@ -11,5 +11,5 @@ runs:
       shell: bash
       run: |
         source ${{ inputs.ov_dir }}/setupvars.sh
-        python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly
-        python -m pip install --upgrade-strategy eager -r ./samples/requirements.txt
+        python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly
+        python -m pip install -r ./samples/requirements.txt
diff --git a/.github/workflows/causal_lm_cpp.yml b/.github/workflows/causal_lm_cpp.yml
index c75ac3214c..ce3ac5f046 100644
--- a/.github/workflows/causal_lm_cpp.yml
+++ b/.github/workflows/causal_lm_cpp.yml
@@ -46,8 +46,8 @@ jobs:
       - name: Download and convert and model
         run: |
           source ./ov/setupvars.sh
-          python -m pip install --upgrade-strategy eager -r ./samples/requirements.txt
-          python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly
+          python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly
+          python -m pip install -r ./samples/requirements.txt
           optimum-cli export openvino --trust-remote-code --weight-format fp16 --model openlm-research/open_llama_3b_v2 open_llama_3b_v2
           optimum-cli export openvino -m TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T
           wget https://huggingface.co/smangrul/tinyllama_lora_sql/resolve/main/adapter_model.safetensors?download=true -O adapter_model.safetensors
@@ -105,8 +105,8 @@ jobs:
       - name: Download and convert and model
         run: |
           source ./ov/setupvars.sh
-          python -m pip install --upgrade-strategy eager -r ./samples/requirements.txt
-          python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly
+          python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly
+          python -m pip install -r ./samples/requirements.txt
           optimum-cli export openvino --trust-remote-code --weight-format fp16 --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 TinyLlama-1.1B-Chat-v1.0
       - name: Compare
         env:
@@ -241,8 +241,8 @@ jobs:
       - name: Download and convert model
         run: |
           call .\ov\setupvars.bat
-          python -m pip install --upgrade-strategy eager -r ./samples/requirements.txt
-          python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly
+          python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly
+          python -m pip install -r ./samples/requirements.txt
           optimum-cli export openvino --trust-remote-code --weight-format fp16 --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 TinyLlama-1.1B-Chat-v1.0
           optimum-cli export openvino -m TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T
           curl -o adapter_model.safetensors -s -L https://huggingface.co/smangrul/tinyllama_lora_sql/resolve/main/adapter_model.safetensors?download=true
@@ -299,8 +299,8 @@ jobs:
       - name: Download and convert and model
         run: |
           source ./ov/setupvars.sh
-          python -m pip install --upgrade-strategy eager -r ./samples/requirements.txt
-          python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly
+          python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly
+          python -m pip install -r ./samples/requirements.txt
           optimum-cli export openvino --trust-remote-code --weight-format fp16 --model Qwen/Qwen-7B-Chat Qwen-7B-Chat
       - run: >
           . ./ov/setupvars.sh
@@ -333,8 +333,8 @@ jobs:
       - name: Download and convert and model
         run: |
           source ./ov/setupvars.sh
-          python -m pip install --upgrade-strategy eager -r ./samples/requirements.txt
-          python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly
+          python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly
+          python -m pip install -r ./samples/requirements.txt
           optimum-cli export openvino --trust-remote-code --weight-format fp16 --model Qwen/Qwen1.5-7B-Chat Qwen1.5-7B-Chat
       - run: >
           . ./ov/setupvars.sh
@@ -368,8 +368,8 @@ jobs:
       - name: Download and convert and model
         run: |
           source ./ov/setupvars.sh
-          python -m pip install --upgrade-strategy eager -r ./samples/requirements.txt
-          python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly
+          python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly
+          python -m pip install -r ./samples/requirements.txt
           optimum-cli export openvino --trust-remote-code --weight-format fp16 --model microsoft/phi-2 phi-2
       - run: >
           . ./ov/setupvars.sh
@@ -403,8 +403,8 @@ jobs:
       - name: Download and convert and model
         run: |
           source ./ov/setupvars.sh
-          python -m pip install --upgrade-strategy eager -r ./samples/requirements.txt
-          python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly
+          python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly
+          python -m pip install -r ./samples/requirements.txt
           optimum-cli export openvino --trust-remote-code --weight-format fp16 --model argilla/notus-7b-v1 notus-7b-v1
       - run: >
           . ./ov/setupvars.sh
@@ -438,8 +438,8 @@ jobs:
       - name: Download and convert and model
         run: |
           source ./ov/setupvars.sh
-          python -m pip install --upgrade-strategy eager -r ./samples/requirements.txt
-          python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly
+          python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly
+          python -m pip install -r ./samples/requirements.txt
           optimum-cli export openvino --trust-remote-code --weight-format fp16 --model databricks/dolly-v2-3b dolly-v2-3b
           optimum-cli export openvino --trust-remote-code --weight-format fp16 --model databricks/dolly-v2-7b dolly-v2-7b
       - name: run and compare
@@ -488,8 +488,8 @@ jobs:
       - name: Download and convert and model
         run: |
           source ./ov/setupvars.sh
-          python -m pip install --upgrade-strategy eager -r ./samples/requirements.txt
-          python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly
+          python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly
+          python -m pip install -r ./samples/requirements.txt
           optimum-cli export openvino --trust-remote-code --weight-format fp16 --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 TinyLlama-1.1B-Chat-v1.0
           optimum-cli export openvino --trust-remote-code --weight-format fp16 --model Qwen/Qwen-7B-Chat Qwen-7B-Chat --task text-generation-with-past
       - name: run and compare
@@ -560,8 +560,8 @@ jobs:
       - name: Download and convert and model
         run: |
           source ./ov/setupvars.sh
-          python -m pip install --upgrade-strategy eager -r ./samples/requirements.txt
-          python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly
+          python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly
+          python -m pip install -r ./samples/requirements.txt
           optimum-cli export openvino --trust-remote-code --weight-format fp16 --model microsoft/phi-1_5 phi-1_5
       - name: Run Generation
         run: |
@@ -615,8 +615,8 @@ jobs:
       - name: Download and convert and model
         run: |
           source ./ov/setupvars.sh
-          python -m pip install --upgrade-strategy eager -r ./samples/requirements.txt
-          python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly
+          python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly
+          python -m pip install -r ./samples/requirements.txt
           optimum-cli export openvino --trust-remote-code --weight-format fp16 --model ikala/redpajama-3b-chat redpajama-3b-chat
       - name: Run Generation
         run: |
@@ -670,8 +670,8 @@ jobs:
       - name: Download and convert and model
         run: |
           source ./ov/setupvars.sh
-          python -m pip install --upgrade-strategy eager -r ./samples/requirements.txt
-          python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly
+          python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly
+          python -m pip install -r ./samples/requirements.txt
           optimum-cli export openvino --trust-remote-code --weight-format fp16 --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 TinyLlama-1.1B-Chat-v1.0
       - name: Compare
         env:
@@ -863,8 +863,8 @@ jobs:
       - name: Download and convert and model
         run: |
           source ./ov/setupvars.sh
-          python -m pip install --upgrade-strategy eager -r ./samples/requirements.txt
-          python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly
+          python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly
+          python -m pip install -r ./samples/requirements.txt
           optimum-cli export openvino --trust-remote-code --weight-format fp16 --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 TinyLlama-1.1B-Chat-v1.0
       - name: Run gtests
         run: |
@@ -909,8 +909,8 @@ jobs:
       - name: Download and convert and model
         run: |
           call .\ov\setupvars.bat
-          python -m pip install --upgrade-strategy eager -r ./samples/requirements.txt
-          python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly
+          python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly
+          python -m pip install -r ./samples/requirements.txt
           optimum-cli export openvino --trust-remote-code --weight-format fp16 --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 TinyLlama-1.1B-Chat-v1.0
       - name: Run gtests
         run: |
@@ -954,8 +954,8 @@ jobs:
       - name: Download and convert and model
         run: |
           source ./ov/setupvars.sh
-          python -m pip install --upgrade-strategy eager -r ./samples/requirements.txt
-          python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly
+          python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly
+          python -m pip install -r ./samples/requirements.txt
           optimum-cli export openvino --trust-remote-code --weight-format fp16 --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 TinyLlama-1.1B-Chat-v1.0
       - name: Run gtests
         run: |
diff --git a/.github/workflows/lcm_dreamshaper_cpp.yml b/.github/workflows/lcm_dreamshaper_cpp.yml
index 6bd25cbdfe..233be9e5c0 100644
--- a/.github/workflows/lcm_dreamshaper_cpp.yml
+++ b/.github/workflows/lcm_dreamshaper_cpp.yml
@@ -59,7 +59,7 @@ jobs:
       - name: Install python dependencies
         run: |
           source openvino_lcm_cpp/bin/activate
-          python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly
+          python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly
           python -m pip install -r ./samples/requirements.txt
 
       - name: Download and convert models and tokenizer
@@ -119,7 +119,7 @@ jobs:
       - name: Install python dependencies
         run: |
           . "./openvino_lcm_cpp/Scripts/Activate.ps1"
-          python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly
+          python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly
           python -m pip install -r ./samples/requirements.txt
 
       - name: Download and convert models and tokenizer
diff --git a/.github/workflows/linux.yml b/.github/workflows/linux.yml
index 3c3e0347e7..44e115423c 100644
--- a/.github/workflows/linux.yml
+++ b/.github/workflows/linux.yml
@@ -258,7 +258,7 @@ jobs:
       - name: Test bindings
         run: |
           source ${OV_INSTALL_DIR}/setupvars.sh
-          python -m pip install ./thirdparty/openvino_tokenizers/[transformers] -r ./tests/python_tests/requirements.txt --find-links ${OV_INSTALL_DIR}/wheels --upgrade-strategy eager
+          python -m pip install ./thirdparty/openvino_tokenizers/[transformers] -r ./tests/python_tests/requirements.txt --find-links ${OV_INSTALL_DIR}/wheels
           python -m pytest -v ./tests/python_tests/test_chat_generate_api.py::test_set_chat_template
         env:
           PYTHONPATH: "./build/:$PYTHONPATH"
@@ -349,7 +349,7 @@ jobs:
       - name: Test bindings
         run: |
           source ${OV_INSTALL_DIR}/setupvars.sh
-          python -m pip install ./thirdparty/openvino_tokenizers/[transformers] -r ./tests/python_tests/requirements.txt --find-links ${OV_INSTALL_DIR}/wheels --upgrade-strategy eager
+          python -m pip install ./thirdparty/openvino_tokenizers/[transformers] -r ./tests/python_tests/requirements.txt --find-links ${OV_INSTALL_DIR}/wheels
           python -m pytest -v ./tests/python_tests/test_whisper_generate_api.py -k test_smoke
         env:
           PYTHONPATH: "./build/:$PYTHONPATH"
@@ -437,7 +437,7 @@ jobs:
         run: |
           source ${OV_INSTALL_DIR}/setupvars.sh
           python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --find-links ${OV_INSTALL_DIR}/wheels
-          python -m pip install --upgrade-strategy eager -r ./samples/requirements.txt --find-links ${OV_INSTALL_DIR}/wheels
+          python -m pip install -r ./samples/requirements.txt --find-links ${OV_INSTALL_DIR}/wheels
           optimum-cli export openvino --trust-remote-code --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 TinyLlama-1.1B-Chat-v1.0
           optimum-cli export openvino --trust-remote-code --model openai/whisper-tiny whisper-tiny
 
diff --git a/.github/workflows/mac.yml b/.github/workflows/mac.yml
index 935d6556b3..5b1b7622ac 100644
--- a/.github/workflows/mac.yml
+++ b/.github/workflows/mac.yml
@@ -224,7 +224,7 @@ jobs:
       - name: Test bindings
         run: |
           source ${OV_INSTALL_DIR}/setupvars.sh
-          python -m pip install ./thirdparty/openvino_tokenizers/[transformers] -r ./tests/python_tests/requirements.txt --find-links ${OV_INSTALL_DIR}/wheels --upgrade-strategy eager
+          python -m pip install ./thirdparty/openvino_tokenizers/[transformers] -r ./tests/python_tests/requirements.txt --find-links ${OV_INSTALL_DIR}/wheels
           python -m pytest -v ./tests/python_tests/test_chat_generate_api.py::test_set_chat_template
         env:
           PYTHONPATH: "./build/:$PYTHONPATH"
@@ -288,7 +288,7 @@ jobs:
       - name: Test bindings
         run: |
           source ${OV_INSTALL_DIR}/setupvars.sh
-          python -m pip install ./thirdparty/openvino_tokenizers/[transformers] -r ./tests/python_tests/requirements.txt --find-links ${OV_INSTALL_DIR}/wheels --upgrade-strategy eager
+          python -m pip install ./thirdparty/openvino_tokenizers/[transformers] -r ./tests/python_tests/requirements.txt --find-links ${OV_INSTALL_DIR}/wheels
           python -m pytest -v ./tests/python_tests/test_whisper_generate_api.py -k test_smoke
         env:
           PYTHONPATH: "./build/:$PYTHONPATH"
@@ -354,7 +354,7 @@ jobs:
         run: |
           source ${OV_INSTALL_DIR}/setupvars.sh
           python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --find-links ${OV_INSTALL_DIR}/wheels
-          python -m pip install --upgrade-strategy eager -r ./samples/requirements.txt --find-links ${OV_INSTALL_DIR}/wheels
+          python -m pip install -r ./samples/requirements.txt --find-links ${OV_INSTALL_DIR}/wheels
           optimum-cli export openvino --trust-remote-code --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 TinyLlama-1.1B-Chat-v1.0
           optimum-cli export openvino --trust-remote-code --model openai/whisper-tiny whisper-tiny
 
diff --git a/.github/workflows/stable_diffusion_1_5_cpp.yml b/.github/workflows/stable_diffusion_1_5_cpp.yml
index f36ac43839..8a262cfd97 100644
--- a/.github/workflows/stable_diffusion_1_5_cpp.yml
+++ b/.github/workflows/stable_diffusion_1_5_cpp.yml
@@ -59,7 +59,7 @@ jobs:
       - name: Install python dependencies
         run: |
           source openvino_sd_cpp/bin/activate
-          python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly
+          python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly
           python -m pip install -r ./samples/requirements.txt
 
       - name: Download and convert models and tokenizer
@@ -133,7 +133,7 @@ jobs:
       - name: Install python dependencies
         run: |
           . "./openvino_sd_cpp/Scripts/Activate.ps1"
-          python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly
+          python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly
           python -m pip install -r ./samples/requirements.txt
 
       - name: Download and convert models and tokenizer
diff --git a/.github/workflows/windows.yml b/.github/workflows/windows.yml
index 1e4164aa0b..17a1abb288 100644
--- a/.github/workflows/windows.yml
+++ b/.github/workflows/windows.yml
@@ -235,7 +235,7 @@ jobs:
       - name: Test bindings
         run: |
           . "${{ env.OV_INSTALL_DIR }}/setupvars.ps1"
-          python -m pip install ./thirdparty/openvino_tokenizers/[transformers] -r ./tests/python_tests/requirements.txt --find-links ${env:OV_INSTALL_DIR}/wheels --upgrade-strategy eager
+          python -m pip install ./thirdparty/openvino_tokenizers/[transformers] -r ./tests/python_tests/requirements.txt --find-links ${env:OV_INSTALL_DIR}/wheels
           python -m pytest -v ./tests/python_tests/test_chat_generate_api.py::test_set_chat_template
         env:
           PYTHONPATH: "./build/" # cmd evaluates variables in a different way. Setting PYTHONPATH before setupvars.bat instead of doing that after solves that.
@@ -299,7 +299,7 @@ jobs:
       - name: Test bindings
         run: |
           . "${{ env.OV_INSTALL_DIR }}/setupvars.ps1"
-          python -m pip install ./thirdparty/openvino_tokenizers/[transformers] -r ./tests/python_tests/requirements.txt --find-links ${env:OV_INSTALL_DIR}/wheels --upgrade-strategy eager
+          python -m pip install ./thirdparty/openvino_tokenizers/[transformers] -r ./tests/python_tests/requirements.txt --find-links ${env:OV_INSTALL_DIR}/wheels
           python -m pytest -v ./tests/python_tests/test_whisper_generate_api.py -k test_smoke
         env:
           PYTHONPATH: "./build/" # cmd evaluates variables in a different way. Setting PYTHONPATH before setupvars.bat instead of doing that after solves that.
@@ -363,7 +363,7 @@ jobs:
       - name: Test bindings
         run: |
           . "${{ env.OV_INSTALL_DIR }}/setupvars.ps1"
-          python -m pip install ./thirdparty/openvino_tokenizers/[transformers] -r ./tests/python_tests/requirements.txt --find-links ${env:OV_INSTALL_DIR}/wheels --upgrade-strategy eager
+          python -m pip install ./thirdparty/openvino_tokenizers/[transformers] -r ./tests/python_tests/requirements.txt --find-links ${env:OV_INSTALL_DIR}/wheels
           python -m pytest -v ./tests/python_tests/test_vlm_api.py
         env:
           PYTHONPATH: "./build/" # cmd evaluates variables in a different way. Setting PYTHONPATH before setupvars.bat instead of doing that after solves that.
@@ -425,7 +425,7 @@ jobs:
         run: |
           . "${{ env.OV_INSTALL_DIR }}/setupvars.ps1"
           python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --find-links ${env:OV_INSTALL_DIR}/wheels
-          python -m pip install --upgrade-strategy eager -r ./samples/requirements.txt --find-links ${env:OV_INSTALL_DIR}/wheels
+          python -m pip install -r ./samples/requirements.txt --find-links ${env:OV_INSTALL_DIR}/wheels
           optimum-cli export openvino --trust-remote-code --weight-format fp16 --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 TinyLlama-1.1B-Chat-v1.0
           optimum-cli export openvino --trust-remote-code --model openai/whisper-tiny whisper-tiny
 
diff --git a/samples/deployment-requirements.txt b/samples/deployment-requirements.txt
index c29f496c84..ceac668e9c 100644
--- a/samples/deployment-requirements.txt
+++ b/samples/deployment-requirements.txt
@@ -1,5 +1,4 @@
 --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly
---pre
 openvino_genai~=2025.0.0.0.dev
 librosa==0.10.2  # For Whisper
-pillow==11.0.0  # Image processing
+pillow==11.0.0  # Image processing for VLMs
diff --git a/samples/export-requirements.txt b/samples/export-requirements.txt
index a84926f746..aa9a0ccea9 100644
--- a/samples/export-requirements.txt
+++ b/samples/export-requirements.txt
@@ -1,12 +1,11 @@
 --extra-index-url https://download.pytorch.org/whl/cpu
 --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly
---pre
 openvino-tokenizers~=2025.0.0.0.dev
 optimum-intel @ git+https://github.com/huggingface/optimum-intel.git
 numpy<2.0.0; sys_platform == 'darwin'
 einops==0.8.0  # For Qwen
 transformers_stream_generator==0.0.5  # For Qwen
-diffusers==0.31.0
+diffusers==0.31.0 # For image generation pipelines
 timm==1.0.11  # For exporting InternVL2
 torchvision  # For visual language models
 transformers>=4.43 # For Whisper
diff --git a/src/README.md b/src/README.md
index 9a96daa9d2..c90bc8f4e4 100644
--- a/src/README.md
+++ b/src/README.md
@@ -37,7 +37,7 @@ If you want to try OpenVINO GenAI with different dependencies versions (**not**
     git clone --recursive https://github.com/openvinotoolkit/openvino.genai.git
     cd openvino.genai
     # Install python dependencies
-    python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly
+    python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly
     python -m pip install --upgrade-strategy eager -r ./samples/requirements.txt
     ```
 

From 079f1d521319e0d2443a185754902e47b77c5e8c Mon Sep 17 00:00:00 2001
From: Ilya Lavrenov <ilya.lavrenov@intel.com>
Date: Thu, 28 Nov 2024 21:15:38 +0400
Subject: [PATCH 20/24] Fixed pyi file build when OpenVINO_DIR is externally
 defined (#1271)

---
 src/python/CMakeLists.txt | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

diff --git a/src/python/CMakeLists.txt b/src/python/CMakeLists.txt
index 25d81277d6..75a2fd59a7 100644
--- a/src/python/CMakeLists.txt
+++ b/src/python/CMakeLists.txt
@@ -114,6 +114,21 @@ elseif(DEFINED PY_BUILD_CMAKE_PACKAGE_NAME AND NOT WIN32)
     # in case of wheel build, pybind11-stubgen is always available via pyproject.toml's build-system
     # except Win32 where we have issues with pybind11_stubgen executable which cannot import its own module
     set(pybind11_stubgen_AVAILABLE ON)
+
+    # by default, wheel build is performed with build-isolation, which means that some variables like PYTHONPATH
+    # are not available. But if user called setupvars.sh, then OpenVINO dir is available, while PYTHONPATH - no.
+    # In this case, we will have mismatch on Linux when OpenVINO can point on build dir / install dir, while
+    # PYTHONPATH points out to locally installed tmp OpenVINO wheel (build against wheel).
+    # Ways to handle it:
+    # - setting PYTHONPATH to $ENV{INTEL_OPENVINO_DIR}/python if INTEL_OPENVINO_DIR is defined. It means we are building against
+    #   OpenVINO archive or installation tree
+    # - if it's not defined, we cannot do any guesses and hence, disable pybind11-stubgen usage
+    if(DEFINED ENV{INTEL_OPENVINO_DIR})
+        set(openvino_pythonpath "$ENV{INTEL_OPENVINO_DIR}/python")
+    elseif(LINUX AND NOT OpenVINO_DIR STREQUAL OpenVINO_DIR_PY)
+        # here we imply that OpenVINO_DIR_PY points to manylinux, while OpenVINO_DIR point to Ubuntu binaries
+        set(pybind11_stubgen_AVAILABLE OFF)
+    endif()
 endif()
 
 # but we also need to check whether OpenVINO is installed

From bc5f4dbe751d603ef6e94afd133d9ee6e469fd88 Mon Sep 17 00:00:00 2001
From: Alexey Smirnov <alexey.smirnov@intel.com>
Date: Thu, 28 Nov 2024 19:39:00 +0000
Subject: [PATCH 21/24] StaticLLMPipeline: Decide when to enable NPUW_DQ_FULL
 property (#1258)

Based on (yet to be) supported OV properties from the NPU Plugin enable
NPUW_DQ_FULL.
releases/2024/5 mirror:
https://github.com/openvinotoolkit/openvino.genai/pull/1272

Dependencies
* https://github.com/openvinotoolkit/openvino/pull/27678 needs to be
merged first
* https://github.com/openvinotoolkit/openvino/pull/27789
---
 src/cpp/src/llm_pipeline_static.cpp | 16 +++++++++++++++-
 1 file changed, 15 insertions(+), 1 deletion(-)

diff --git a/src/cpp/src/llm_pipeline_static.cpp b/src/cpp/src/llm_pipeline_static.cpp
index 597b5f69ac..db2adbd19e 100644
--- a/src/cpp/src/llm_pipeline_static.cpp
+++ b/src/cpp/src/llm_pipeline_static.cpp
@@ -457,6 +457,7 @@ void merge_config_with(ov::AnyMap& lhs, const ov::AnyMap& rhs) {
 struct NPUDesc {
     std::string arch;
     int64_t max_tiles;
+    bool compiler_dq;
 };
 
 std::optional<NPUDesc> extract_npu_descriptor(ov::Core& core) {
@@ -466,7 +467,14 @@ std::optional<NPUDesc> extract_npu_descriptor(ov::Core& core) {
     }
     const auto arch = core.get_property("NPU", ov::device::architecture);
     const auto max_tiles = core.get_property("NPU", ov::intel_npu::max_tiles);
-    return std::make_optional(NPUDesc{arch, max_tiles});
+
+    bool compiler_dq = false;
+    const auto device_caps = core.get_property("NPU", ov::device::capabilities);
+    if (std::find(device_caps.begin(), device_caps.end(),
+                  "COMPILER_DYNAMIC_QUANTIZATION") != device_caps.end()) {
+        compiler_dq = true;
+    }
+    return std::make_optional(NPUDesc{arch, max_tiles, compiler_dq});
 }
 
 ov::AnyMap get_baseline_common_config() {
@@ -508,6 +516,9 @@ ov::AnyMap get_default_prefill_config(const std::shared_ptr<ov::Model>& model,
         npudesc->max_tiles != -1) {
         config.emplace("NPU_DPU_GROUPS", npudesc->max_tiles);
     }
+    if (npudesc.has_value() && npudesc->compiler_dq) {
+        config.emplace("NPUW_DQ_FULL", "NO");
+    }
     return config;
 }
 
@@ -523,6 +534,9 @@ ov::AnyMap get_default_generate_config(const std::shared_ptr<ov::Model>& model,
     if (npudesc.has_value() && npudesc->arch == "4000") {
         config.emplace("NPU_DPU_GROUPS", 4);
     }
+    if (npudesc.has_value() && npudesc->compiler_dq) {
+        config.emplace("NPUW_DQ_FULL", "NO");
+    }
     return config;
 }
 

From b43d31ed0604ec7add9af42ee62bb7d5d6a0abe8 Mon Sep 17 00:00:00 2001
From: Alexander Kozlov <kozzzloff@list.ru>
Date: Fri, 29 Nov 2024 11:50:46 +0300
Subject: [PATCH 22/24] Enable Phi-3.5-vision in HF format. Enable use of LLMs
 as a text embedding models for similarity compute. (#1276)

Now it is possible to use `--data-encoder Qwen/Qwen2.5-1.5B` to plug LLM
as a model for embedding computation.
---
 .../whowhatbench/whowhat_metrics.py                   |  8 +++++++-
 tools/who_what_benchmark/whowhatbench/wwb.py          | 11 ++++++++---
 2 files changed, 15 insertions(+), 4 deletions(-)

diff --git a/tools/who_what_benchmark/whowhatbench/whowhat_metrics.py b/tools/who_what_benchmark/whowhatbench/whowhat_metrics.py
index bbf96a3312..2d1da24168 100644
--- a/tools/who_what_benchmark/whowhatbench/whowhat_metrics.py
+++ b/tools/who_what_benchmark/whowhatbench/whowhat_metrics.py
@@ -3,6 +3,7 @@
 """
 
 from difflib import SequenceMatcher
+from transformers import AutoTokenizer
 from PIL import Image
 import torch
 import torch.nn.functional as F
@@ -109,7 +110,12 @@ def evaluate_divergency(tokenizer, data_gold, data_prediction):
 
 class TextSimilarity:
     def __init__(self, model_id) -> None:
-        self.model = SentenceTransformer(model_id)
+        tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
+        if hasattr(tokenizer, "pad_token") and tokenizer.pad_token:
+            pad_token = tokenizer.pad_token
+        else:
+            pad_token = tokenizer.eos_token
+        self.model = SentenceTransformer(model_id, tokenizer_kwargs={"pad_token": pad_token}, trust_remote_code=True)
 
     def evaluate(self, gt, prediction):
         return evaluate_similarity(self.model, gt, prediction)
diff --git a/tools/who_what_benchmark/whowhatbench/wwb.py b/tools/who_what_benchmark/whowhatbench/wwb.py
index 0a01a8e8df..f9aea15b47 100644
--- a/tools/who_what_benchmark/whowhatbench/wwb.py
+++ b/tools/who_what_benchmark/whowhatbench/wwb.py
@@ -178,9 +178,14 @@ def load_visual_text_model(
                 model_id, trust_remote_code=True, device_map=device.lower()
             )
         except ValueError:
-            model = AutoModel.from_pretrained(
-                model_id, trust_remote_code=True, device_map=device.lower()
-            )
+            try:
+                model = AutoModel.from_pretrained(
+                    model_id, trust_remote_code=True, device_map=device.lower()
+                )
+            except ValueError:
+                model = AutoModelForCausalLM.from_pretrained(
+                    model_id, trust_remote_code=True, device_map=device.lower(), _attn_implementation="eager", use_flash_attention_2=False
+                )
         model.eval()
     elif use_genai:
         logger.info("Using OpenVINO GenAI API")

From 402958b8975275fd3873c09c5095ca84abc2cea9 Mon Sep 17 00:00:00 2001
From: Ilya Lavrenov <ilya.lavrenov@intel.com>
Date: Fri, 29 Nov 2024 21:18:13 +0400
Subject: [PATCH 23/24] [Python] Update docs with str => PathLike (#1278)

---
 .../openvino_genai/py_openvino_genai.pyi      | 38 +++++++++----------
 src/python/py_image_generation_models.cpp     | 24 ++++++------
 src/python/py_image_generation_pipelines.cpp  |  4 +-
 src/python/py_llm_pipeline.cpp                |  4 +-
 src/python/py_lora_adapter.cpp                |  2 +-
 src/python/py_vlm_pipeline.cpp                |  2 +-
 src/python/py_whisper_pipeline.cpp            |  2 +-
 7 files changed, 38 insertions(+), 38 deletions(-)

diff --git a/src/python/openvino_genai/py_openvino_genai.pyi b/src/python/openvino_genai/py_openvino_genai.pyi
index 5e4d2dd7b2..1c386dc097 100644
--- a/src/python/openvino_genai/py_openvino_genai.pyi
+++ b/src/python/openvino_genai/py_openvino_genai.pyi
@@ -19,7 +19,7 @@ class Adapter:
     def __init__(self, path: os.PathLike) -> None:
         """
                     Immutable LoRA Adapter that carries the adaptation matrices and serves as unique adapter identifier.
-                    path (str): Path to adapter file in safetensors format.
+                    path (os.PathLike): Path to adapter file in safetensors format.
         """
 class AdapterConfig:
     """
@@ -162,20 +162,20 @@ class AutoencoderKL:
     def __init__(self, vae_decoder_path: os.PathLike) -> None:
         """
                     AutoencoderKL class initialized only with decoder model.
-                    vae_decoder_path (str): VAE decoder directory.
+                    vae_decoder_path (os.PathLike): VAE decoder directory.
         """
     @typing.overload
     def __init__(self, vae_encoder_path: os.PathLike, vae_decoder_path: os.PathLike) -> None:
         """
                     AutoencoderKL class initialized with both encoder and decoder models.
-                    vae_encoder_path (str): VAE encoder directory.
-                    vae_decoder_path (str): VAE decoder directory.
+                    vae_encoder_path (os.PathLike): VAE encoder directory.
+                    vae_decoder_path (os.PathLike): VAE decoder directory.
         """
     @typing.overload
     def __init__(self, vae_decoder_path: os.PathLike, device: str, **kwargs) -> None:
         """
                     AutoencoderKL class initialized only with decoder model.
-                    vae_decoder_path (str): VAE decoder directory.
+                    vae_decoder_path (os.PathLike): VAE decoder directory.
                     device (str): Device on which inference will be done.
                     kwargs: Device properties.
         """
@@ -183,8 +183,8 @@ class AutoencoderKL:
     def __init__(self, vae_encoder_path: os.PathLike, vae_decoder_path: os.PathLike, device: str, **kwargs) -> None:
         """
                     AutoencoderKL class initialized only with both encoder and decoder models.
-                    vae_encoder_path (str): VAE encoder directory.
-                    vae_decoder_path (str): VAE decoder directory.
+                    vae_encoder_path (os.PathLike): VAE encoder directory.
+                    vae_decoder_path (os.PathLike): VAE decoder directory.
                     device (str): Device on which inference will be done.
                     kwargs: Device properties.
         """
@@ -228,13 +228,13 @@ class CLIPTextModel:
     def __init__(self, root_dir: os.PathLike) -> None:
         """
                     CLIPTextModel class
-                    root_dir (str): Model root directory.
+                    root_dir (os.PathLike): Model root directory.
         """
     @typing.overload
     def __init__(self, root_dir: os.PathLike, device: str, **kwargs) -> None:
         """
                     CLIPTextModel class
-                    root_dir (str): Model root directory.
+                    root_dir (os.PathLike): Model root directory.
                     device (str): Device on which inference will be done.
                     kwargs: Device properties.
         """
@@ -277,13 +277,13 @@ class CLIPTextModelWithProjection:
     def __init__(self, root_dir: os.PathLike) -> None:
         """
                     CLIPTextModelWithProjection class
-                    root_dir (str): Model root directory.
+                    root_dir (os.PathLike): Model root directory.
         """
     @typing.overload
     def __init__(self, root_dir: os.PathLike, device: str, **kwargs) -> None:
         """
                     CLIPTextModelWithProjection class
-                    root_dir (str): Model root directory.
+                    root_dir (os.PathLike): Model root directory.
                     device (str): Device on which inference will be done.
                     kwargs: Device properties.
         """
@@ -790,7 +790,7 @@ class LLMPipeline:
     def __init__(self, models_path: os.PathLike, tokenizer: Tokenizer, device: str, config: dict[str, typing.Any] = {}, **kwargs) -> None:
         """
                     LLMPipeline class constructor for manually created openvino_genai.Tokenizer.
-                    models_path (str): Path to the model file.
+                    models_path (os.PathLike): Path to the model file.
                     tokenizer (openvino_genai.Tokenizer): tokenizer object.
                     device (str): Device to run the model on (e.g., CPU, GPU). Default is 'CPU'.
                     Add {"scheduler_config": ov_genai.SchedulerConfig} to config properties to create continuous batching pipeline.
@@ -800,7 +800,7 @@ class LLMPipeline:
     def __init__(self, models_path: os.PathLike, device: str, config: dict[str, typing.Any] = {}, **kwargs) -> None:
         """
                     LLMPipeline class constructor.
-                    models_path (str): Path to the model file.
+                    models_path (os.PathLike): Path to the model file.
                     device (str): Device to run the model on (e.g., CPU, GPU). Default is 'CPU'.
                     Add {"scheduler_config": ov_genai.SchedulerConfig} to config properties to create continuous batching pipeline.
                     kwargs: Device properties.
@@ -1231,13 +1231,13 @@ class Text2ImagePipeline:
     def __init__(self, models_path: os.PathLike) -> None:
         """
                     Text2ImagePipeline class constructor.
-                    models_path (str): Path to the folder with exported model files.
+                    models_path (os.PathLike): Path to the folder with exported model files.
         """
     @typing.overload
     def __init__(self, models_path: os.PathLike, device: str, **kwargs) -> None:
         """
                     Text2ImagePipeline class constructor.
-                    models_path (str): Path with exported model files.
+                    models_path (os.PathLike): Path with exported model files.
                     device (str): Device to run the model on (e.g., CPU, GPU).
                     kwargs: Text2ImagePipeline properties
         """
@@ -1360,13 +1360,13 @@ class UNet2DConditionModel:
     def __init__(self, root_dir: os.PathLike) -> None:
         """
                     UNet2DConditionModel class
-                    root_dir (str): Model root directory.
+                    root_dir (os.PathLike): Model root directory.
         """
     @typing.overload
     def __init__(self, root_dir: os.PathLike, device: str, **kwargs) -> None:
         """
                     UNet2DConditionModel class
-                    root_dir (str): Model root directory.
+                    root_dir (os.PathLike): Model root directory.
                     device (str): Device on which inference will be done.
                     kwargs: Device properties.
         """
@@ -1403,7 +1403,7 @@ class VLMPipeline:
         """
         device on which inference will be done
                     VLMPipeline class constructor.
-                    models_path (str): Path to the folder with exported model files.
+                    models_path (os.PathLike): Path to the folder with exported model files.
                     device (str): Device to run the model on (e.g., CPU, GPU). Default is 'CPU'.
                     kwargs: Device properties
         """
@@ -1640,7 +1640,7 @@ class WhisperPipeline:
     def __init__(self, models_path: os.PathLike, device: str, **kwargs) -> None:
         """
                     WhisperPipeline class constructor.
-                    models_path (str): Path to the model file.
+                    models_path (os.PathLike): Path to the model file.
                     device (str): Device to run the model on (e.g., CPU, GPU).
         """
     def generate(self, raw_speech_input: list[float], generation_config: WhisperGenerationConfig | None = None, streamer: typing.Callable[[str], bool] | ChunkStreamerBase | None = None, **kwargs) -> WhisperDecodedResults:
diff --git a/src/python/py_image_generation_models.cpp b/src/python/py_image_generation_models.cpp
index 221fc7363e..72a8970cb4 100644
--- a/src/python/py_image_generation_models.cpp
+++ b/src/python/py_image_generation_models.cpp
@@ -31,7 +31,7 @@ void init_clip_text_model(py::module_& m) {
         py::arg("root_dir"), "Model root directory", 
         R"(
             CLIPTextModel class
-            root_dir (str): Model root directory.
+            root_dir (os.PathLike): Model root directory.
         )")
         .def(py::init([](
             const std::filesystem::path& root_dir,
@@ -45,7 +45,7 @@ void init_clip_text_model(py::module_& m) {
         py::arg("device"), "Device on which inference will be done",
         R"(
             CLIPTextModel class
-            root_dir (str): Model root directory.
+            root_dir (os.PathLike): Model root directory.
             device (str): Device on which inference will be done.
             kwargs: Device properties.
         )") 
@@ -101,7 +101,7 @@ void init_unet2d_condition_model(py::module_& m) {
         py::arg("root_dir"), "Model root directory", 
         R"(
             UNet2DConditionModel class
-            root_dir (str): Model root directory.
+            root_dir (os.PathLike): Model root directory.
         )")
         .def(py::init([](
             const std::filesystem::path& root_dir,
@@ -114,7 +114,7 @@ void init_unet2d_condition_model(py::module_& m) {
         py::arg("device"), "Device on which inference will be done",
         R"(
             UNet2DConditionModel class
-            root_dir (str): Model root directory.
+            root_dir (os.PathLike): Model root directory.
             device (str): Device on which inference will be done.
             kwargs: Device properties.
         )") 
@@ -172,7 +172,7 @@ void init_autoencoder_kl(py::module_& m) {
         py::arg("vae_decoder_path"), "VAE decoder directory", 
         R"(
             AutoencoderKL class initialized only with decoder model.
-            vae_decoder_path (str): VAE decoder directory.
+            vae_decoder_path (os.PathLike): VAE decoder directory.
         )")
         .def(py::init([](
             const std::filesystem::path& vae_encoder_path,
@@ -184,8 +184,8 @@ void init_autoencoder_kl(py::module_& m) {
         py::arg("vae_decoder_path"), "VAE decoder directory",
         R"(
             AutoencoderKL class initialized with both encoder and decoder models.
-            vae_encoder_path (str): VAE encoder directory.
-            vae_decoder_path (str): VAE decoder directory.
+            vae_encoder_path (os.PathLike): VAE encoder directory.
+            vae_decoder_path (os.PathLike): VAE decoder directory.
         )")
         .def(py::init([](
             const std::filesystem::path& vae_decoder_path,
@@ -198,7 +198,7 @@ void init_autoencoder_kl(py::module_& m) {
         py::arg("device"), "Device on which inference will be done",
         R"(
             AutoencoderKL class initialized only with decoder model.
-            vae_decoder_path (str): VAE decoder directory.
+            vae_decoder_path (os.PathLike): VAE decoder directory.
             device (str): Device on which inference will be done.
             kwargs: Device properties.
         )")
@@ -215,8 +215,8 @@ void init_autoencoder_kl(py::module_& m) {
         py::arg("device"), "Device on which inference will be done",
         R"(
             AutoencoderKL class initialized only with both encoder and decoder models.
-            vae_encoder_path (str): VAE encoder directory.
-            vae_decoder_path (str): VAE decoder directory.
+            vae_encoder_path (os.PathLike): VAE encoder directory.
+            vae_decoder_path (os.PathLike): VAE decoder directory.
             device (str): Device on which inference will be done.
             kwargs: Device properties.
         )") 
@@ -276,7 +276,7 @@ void init_clip_text_model_with_projection(py::module_& m) {
         py::arg("root_dir"), "Model root directory", 
         R"(
             CLIPTextModelWithProjection class
-            root_dir (str): Model root directory.
+            root_dir (os.PathLike): Model root directory.
         )")
         .def(py::init([](
             const std::filesystem::path& root_dir,
@@ -290,7 +290,7 @@ void init_clip_text_model_with_projection(py::module_& m) {
         py::arg("device"), "Device on which inference will be done",
         R"(
             CLIPTextModelWithProjection class
-            root_dir (str): Model root directory.
+            root_dir (os.PathLike): Model root directory.
             device (str): Device on which inference will be done.
             kwargs: Device properties.
         )") 
diff --git a/src/python/py_image_generation_pipelines.cpp b/src/python/py_image_generation_pipelines.cpp
index dade8a170e..d0d2f18a92 100644
--- a/src/python/py_image_generation_pipelines.cpp
+++ b/src/python/py_image_generation_pipelines.cpp
@@ -141,7 +141,7 @@ void init_image_generation_pipelines(py::module_& m) {
         py::arg("models_path"), "folder with exported model files.",
         R"(
             Text2ImagePipeline class constructor.
-            models_path (str): Path to the folder with exported model files.
+            models_path (os.PathLike): Path to the folder with exported model files.
         )")
 
         .def(py::init([](
@@ -156,7 +156,7 @@ void init_image_generation_pipelines(py::module_& m) {
         py::arg("device"), "device on which inference will be done",
         R"(
             Text2ImagePipeline class constructor.
-            models_path (str): Path with exported model files.
+            models_path (os.PathLike): Path with exported model files.
             device (str): Device to run the model on (e.g., CPU, GPU).
             kwargs: Text2ImagePipeline properties
         )")
diff --git a/src/python/py_llm_pipeline.cpp b/src/python/py_llm_pipeline.cpp
index 030688d821..7255022238 100644
--- a/src/python/py_llm_pipeline.cpp
+++ b/src/python/py_llm_pipeline.cpp
@@ -122,7 +122,7 @@ void init_llm_pipeline(py::module_& m) {
         py::arg("config") = ov::AnyMap({}), "openvino.properties map",
         R"(
             LLMPipeline class constructor for manually created openvino_genai.Tokenizer.
-            models_path (str): Path to the model file.
+            models_path (os.PathLike): Path to the model file.
             tokenizer (openvino_genai.Tokenizer): tokenizer object.
             device (str): Device to run the model on (e.g., CPU, GPU). Default is 'CPU'.
             Add {"scheduler_config": ov_genai.SchedulerConfig} to config properties to create continuous batching pipeline.
@@ -151,7 +151,7 @@ void init_llm_pipeline(py::module_& m) {
         py::arg("config") = ov::AnyMap({}), "openvino.properties map",
         R"(
             LLMPipeline class constructor.
-            models_path (str): Path to the model file.
+            models_path (os.PathLike): Path to the model file.
             device (str): Device to run the model on (e.g., CPU, GPU). Default is 'CPU'.
             Add {"scheduler_config": ov_genai.SchedulerConfig} to config properties to create continuous batching pipeline.
             kwargs: Device properties.
diff --git a/src/python/py_lora_adapter.cpp b/src/python/py_lora_adapter.cpp
index 3186a7ca5c..7f98b67064 100644
--- a/src/python/py_lora_adapter.cpp
+++ b/src/python/py_lora_adapter.cpp
@@ -23,7 +23,7 @@ void init_lora_adapter(py::module_& m) {
         py::arg("path"), "path",
         R"(
             Immutable LoRA Adapter that carries the adaptation matrices and serves as unique adapter identifier.
-            path (str): Path to adapter file in safetensors format.
+            path (os.PathLike): Path to adapter file in safetensors format.
         )")
         .def(
             "__bool__", 
diff --git a/src/python/py_vlm_pipeline.cpp b/src/python/py_vlm_pipeline.cpp
index 9572652204..fc58ddc913 100644
--- a/src/python/py_vlm_pipeline.cpp
+++ b/src/python/py_vlm_pipeline.cpp
@@ -86,7 +86,7 @@ void init_vlm_pipeline(py::module_& m) {
         py::arg("device"), "device on which inference will be done"
         R"(
             VLMPipeline class constructor.
-            models_path (str): Path to the folder with exported model files.
+            models_path (os.PathLike): Path to the folder with exported model files.
             device (str): Device to run the model on (e.g., CPU, GPU). Default is 'CPU'.
             kwargs: Device properties
         )")
diff --git a/src/python/py_whisper_pipeline.cpp b/src/python/py_whisper_pipeline.cpp
index d34bd5f3b6..7ecf71d2f0 100644
--- a/src/python/py_whisper_pipeline.cpp
+++ b/src/python/py_whisper_pipeline.cpp
@@ -323,7 +323,7 @@ void init_whisper_pipeline(py::module_& m) {
             "openvino.properties map",
             R"(
             WhisperPipeline class constructor.
-            models_path (str): Path to the model file.
+            models_path (os.PathLike): Path to the model file.
             device (str): Device to run the model on (e.g., CPU, GPU).
         )")
 

From 6dd8261f2e6c8b5d2920fc22f89feb4edd7bfed1 Mon Sep 17 00:00:00 2001
From: Anna Likholat <anna.likholat@intel.com>
Date: Fri, 29 Nov 2024 19:10:51 +0100
Subject: [PATCH 24/24] Txt2img models from buffer (#1279)

---
 .../genai/image_generation/autoencoder_kl.hpp | 56 ++++++++++++++++++-
 .../image_generation/clip_text_model.hpp      | 27 +++++++++
 .../clip_text_model_with_projection.hpp       | 27 +++++++++
 .../flux_transformer_2d_model.hpp             | 26 ++++++++-
 .../sd3_transformer_2d_model.hpp              | 22 ++++++++
 .../image_generation/t5_encoder_model.hpp     | 23 +++++++-
 .../unet2d_condition_model.hpp                | 27 +++++++++
 .../models/autoencoder_kl.cpp                 | 54 ++++++++++++++++++
 .../models/clip_text_model.cpp                | 19 +++++++
 .../clip_text_model_with_projection.cpp       | 19 +++++++
 .../models/flux_transformer_2d_model.cpp      | 25 ++++++++-
 .../models/sd3_transformer_2d_model.cpp       | 19 +++++++
 .../models/t5_encoder_model.cpp               | 17 ++++++
 .../models/unet2d_condition_model.cpp         | 19 +++++++
 14 files changed, 372 insertions(+), 8 deletions(-)

diff --git a/src/cpp/include/openvino/genai/image_generation/autoencoder_kl.hpp b/src/cpp/include/openvino/genai/image_generation/autoencoder_kl.hpp
index b838fbfd97..347925727a 100644
--- a/src/cpp/include/openvino/genai/image_generation/autoencoder_kl.hpp
+++ b/src/cpp/include/openvino/genai/image_generation/autoencoder_kl.hpp
@@ -45,13 +45,37 @@ class OPENVINO_GENAI_EXPORTS AutoencoderKL {
                   const std::string& device,
                   const ov::AnyMap& properties = {});
 
+    AutoencoderKL(const std::string& vae_decoder_model,
+                  const Tensor& vae_decoder_weights,
+                  const Config& vae_decoder_config);
+
+    AutoencoderKL(const std::string& vae_encoder_model,
+                  const Tensor& vae_encoder_weights,
+                  const std::string& vae_decoder_model,
+                  const Tensor& vae_decoder_weights,
+                  const Config& vae_decoder_config);
+
+    AutoencoderKL(const std::string& vae_decoder_model,
+                  const Tensor& vae_decoder_weights,
+                  const Config& vae_decoder_config,
+                  const std::string& device,
+                  const ov::AnyMap& properties = {});
+
+    AutoencoderKL(const std::string& vae_encoder_model,
+                  const Tensor& vae_encoder_weights,
+                  const std::string& vae_decoder_model,
+                  const Tensor& vae_decoder_weights,
+                  const Config& vae_decoder_config,
+                  const std::string& device,
+                  const ov::AnyMap& properties = {});
+
     template <typename... Properties,
               typename std::enable_if<ov::util::StringAny<Properties...>::value, bool>::type = true>
     AutoencoderKL(const std::filesystem::path& vae_decoder_path,
                   const std::string& device,
                   Properties&&... properties)
         : AutoencoderKL(vae_decoder_path, device, ov::AnyMap{std::forward<Properties>(properties)...}) { }
-        
+
     template <typename... Properties,
               typename std::enable_if<ov::util::StringAny<Properties...>::value, bool>::type = true>
     AutoencoderKL(const std::filesystem::path& vae_encoder_path,
@@ -60,6 +84,36 @@ class OPENVINO_GENAI_EXPORTS AutoencoderKL {
                   Properties&&... properties)
         : AutoencoderKL(vae_encoder_path, vae_decoder_path, device, ov::AnyMap{std::forward<Properties>(properties)...}) { }
 
+    template <typename... Properties,
+              typename std::enable_if<ov::util::StringAny<Properties...>::value, bool>::type = true>
+    AutoencoderKL(const std::string& vae_decoder_model,
+                  const Tensor& vae_decoder_weights,
+                  const Config& vae_decoder_config,
+                  const std::string& device,
+                  Properties&&... properties)
+        : AutoencoderKL(vae_decoder_model,
+                        vae_decoder_weights,
+                        vae_decoder_config,
+                        device,
+                        ov::AnyMap{std::forward<Properties>(properties)...}) { }
+
+    template <typename... Properties,
+              typename std::enable_if<ov::util::StringAny<Properties...>::value, bool>::type = true>
+    AutoencoderKL(const std::string& vae_encoder_model,
+                  const Tensor& vae_encoder_weights,
+                  const std::string& vae_decoder_model,
+                  const Tensor& vae_decoder_weights,
+                  const Config& vae_decoder_config,
+                  const std::string& device,
+                  Properties&&... properties)
+        : AutoencoderKL(vae_encoder_model,
+                        vae_encoder_weights,
+                        vae_decoder_model,
+                        vae_decoder_weights,
+                        vae_decoder_config,
+                        device,
+                        ov::AnyMap{std::forward<Properties>(properties)...}) { }
+
     AutoencoderKL(const AutoencoderKL&);
 
     AutoencoderKL& reshape(int batch_size, int height, int width);
diff --git a/src/cpp/include/openvino/genai/image_generation/clip_text_model.hpp b/src/cpp/include/openvino/genai/image_generation/clip_text_model.hpp
index 26f28abac2..a3b9ebbd88 100644
--- a/src/cpp/include/openvino/genai/image_generation/clip_text_model.hpp
+++ b/src/cpp/include/openvino/genai/image_generation/clip_text_model.hpp
@@ -33,6 +33,18 @@ class OPENVINO_GENAI_EXPORTS CLIPTextModel {
                   const std::string& device,
                   const ov::AnyMap& properties = {});
 
+    CLIPTextModel(const std::string& model,
+                  const Tensor& weights,
+                  const Config& config,
+                  const Tokenizer& clip_tokenizer);
+
+    CLIPTextModel(const std::string& model,
+                  const Tensor& weights,
+                  const Config& config,
+                  const Tokenizer& clip_tokenizer,
+                  const std::string& device,
+                  const ov::AnyMap& properties = {});
+
     template <typename... Properties,
               typename std::enable_if<ov::util::StringAny<Properties...>::value, bool>::type = true>
     CLIPTextModel(const std::filesystem::path& root_dir,
@@ -40,6 +52,21 @@ class OPENVINO_GENAI_EXPORTS CLIPTextModel {
                   Properties&&... properties)
         : CLIPTextModel(root_dir, device, ov::AnyMap{std::forward<Properties>(properties)...}) { }
 
+    template <typename... Properties,
+              typename std::enable_if<ov::util::StringAny<Properties...>::value, bool>::type = true>
+    CLIPTextModel(const std::string& model,
+                  const Tensor& weights,
+                  const Config& config,
+                  const Tokenizer& clip_tokenizer,
+                  const std::string& device,
+                  Properties&&... properties)
+        : CLIPTextModel(model,
+                        weights,
+                        config,
+                        clip_tokenizer,
+                        device,
+                        ov::AnyMap{std::forward<Properties>(properties)...}) { }
+
     CLIPTextModel(const CLIPTextModel&);
 
     const Config& get_config() const;
diff --git a/src/cpp/include/openvino/genai/image_generation/clip_text_model_with_projection.hpp b/src/cpp/include/openvino/genai/image_generation/clip_text_model_with_projection.hpp
index 157e378026..563fb8711d 100644
--- a/src/cpp/include/openvino/genai/image_generation/clip_text_model_with_projection.hpp
+++ b/src/cpp/include/openvino/genai/image_generation/clip_text_model_with_projection.hpp
@@ -33,6 +33,18 @@ class OPENVINO_GENAI_EXPORTS CLIPTextModelWithProjection {
                                 const std::string& device,
                                 const ov::AnyMap& properties = {});
 
+    CLIPTextModelWithProjection(const std::string& model,
+                                const Tensor& weights,
+                                const Config& config,
+                                const Tokenizer& clip_tokenizer);
+
+    CLIPTextModelWithProjection(const std::string& model,
+                                const Tensor& weights,
+                                const Config& config,
+                                const Tokenizer& clip_tokenizer,
+                                const std::string& device,
+                                const ov::AnyMap& properties = {});
+
     template <typename... Properties,
               typename std::enable_if<ov::util::StringAny<Properties...>::value, bool>::type = true>
     CLIPTextModelWithProjection(const std::filesystem::path& root_dir,
@@ -40,6 +52,21 @@ class OPENVINO_GENAI_EXPORTS CLIPTextModelWithProjection {
                                 Properties&&... properties)
         : CLIPTextModelWithProjection(root_dir, device, ov::AnyMap{std::forward<Properties>(properties)...}) { }
 
+    template <typename... Properties,
+              typename std::enable_if<ov::util::StringAny<Properties...>::value, bool>::type = true>
+    CLIPTextModelWithProjection(const std::string& model,
+                                const Tensor& weights,
+                                const Config& config,
+                                const Tokenizer& clip_tokenizer,
+                                const std::string& device,
+                                Properties&&... properties)
+        : CLIPTextModelWithProjection(model,
+                                      weights,
+                                      config,
+                                      clip_tokenizer,
+                                      device,
+                                      ov::AnyMap{std::forward<Properties>(properties)...}) { }
+
     CLIPTextModelWithProjection(const CLIPTextModelWithProjection&);
 
     const Config& get_config() const;
diff --git a/src/cpp/include/openvino/genai/image_generation/flux_transformer_2d_model.hpp b/src/cpp/include/openvino/genai/image_generation/flux_transformer_2d_model.hpp
index 03defd5350..f0f89d03d7 100644
--- a/src/cpp/include/openvino/genai/image_generation/flux_transformer_2d_model.hpp
+++ b/src/cpp/include/openvino/genai/image_generation/flux_transformer_2d_model.hpp
@@ -28,14 +28,36 @@ class OPENVINO_GENAI_EXPORTS FluxTransformer2DModel {
     explicit FluxTransformer2DModel(const std::filesystem::path& root_dir);
 
     FluxTransformer2DModel(const std::filesystem::path& root_dir,
-                          const std::string& device,
-                          const ov::AnyMap& properties = {});
+                           const std::string& device,
+                           const ov::AnyMap& properties = {});
+
+    FluxTransformer2DModel(const std::string& model,
+                           const Tensor& weights,
+                           const Config& config,
+                           const size_t vae_scale_factor);
+
+    FluxTransformer2DModel(const std::string& model,
+                           const Tensor& weights,
+                           const Config& config,
+                           const size_t vae_scale_factor,
+                           const std::string& device,
+                           const ov::AnyMap& properties = {});
 
     template <typename... Properties,
               typename std::enable_if<ov::util::StringAny<Properties...>::value, bool>::type = true>
     FluxTransformer2DModel(const std::filesystem::path& root_dir, const std::string& device, Properties&&... properties)
         : FluxTransformer2DModel(root_dir, device, ov::AnyMap{std::forward<Properties>(properties)...}) {}
 
+    template <typename... Properties,
+              typename std::enable_if<ov::util::StringAny<Properties...>::value, bool>::type = true>
+    FluxTransformer2DModel(const std::string& model,
+                           const Tensor& weights,
+                           const Config& config,
+                           const size_t vae_scale_factor,
+                           const std::string& device,
+                           Properties&&... properties)
+        : FluxTransformer2DModel(model, weights, config, vae_scale_factor, device, ov::AnyMap{std::forward<Properties>(properties)...}) {}
+
     FluxTransformer2DModel(const FluxTransformer2DModel&);
 
     const Config& get_config() const;
diff --git a/src/cpp/include/openvino/genai/image_generation/sd3_transformer_2d_model.hpp b/src/cpp/include/openvino/genai/image_generation/sd3_transformer_2d_model.hpp
index 9f3f8ec5f5..e4641066ec 100644
--- a/src/cpp/include/openvino/genai/image_generation/sd3_transformer_2d_model.hpp
+++ b/src/cpp/include/openvino/genai/image_generation/sd3_transformer_2d_model.hpp
@@ -34,11 +34,33 @@ class OPENVINO_GENAI_EXPORTS SD3Transformer2DModel {
                           const std::string& device,
                           const ov::AnyMap& properties = {});
 
+    SD3Transformer2DModel(const std::string& model,
+                          const Tensor& weights,
+                          const Config& config,
+                          const size_t vae_scale_factor);
+
+    SD3Transformer2DModel(const std::string& model,
+                          const Tensor& weights,
+                          const Config& config,
+                          const size_t vae_scale_factor,
+                          const std::string& device,
+                          const ov::AnyMap& properties = {});
+
     template <typename... Properties,
               typename std::enable_if<ov::util::StringAny<Properties...>::value, bool>::type = true>
     SD3Transformer2DModel(const std::filesystem::path& root_dir, const std::string& device, Properties&&... properties)
         : SD3Transformer2DModel(root_dir, device, ov::AnyMap{std::forward<Properties>(properties)...}) {}
 
+    template <typename... Properties,
+              typename std::enable_if<ov::util::StringAny<Properties...>::value, bool>::type = true>
+    SD3Transformer2DModel(const std::string& model,
+                          const Tensor& weights,
+                          const Config& config,
+                          const size_t vae_scale_factor,
+                          const std::string& device,
+                          Properties&&... properties)
+        : SD3Transformer2DModel(model, weights, config, vae_scale_factor, device, ov::AnyMap{std::forward<Properties>(properties)...}) {}
+
     SD3Transformer2DModel(const SD3Transformer2DModel&);
 
     const Config& get_config() const;
diff --git a/src/cpp/include/openvino/genai/image_generation/t5_encoder_model.hpp b/src/cpp/include/openvino/genai/image_generation/t5_encoder_model.hpp
index d72b7ab411..717871d1d9 100644
--- a/src/cpp/include/openvino/genai/image_generation/t5_encoder_model.hpp
+++ b/src/cpp/include/openvino/genai/image_generation/t5_encoder_model.hpp
@@ -26,13 +26,32 @@ class OPENVINO_GENAI_EXPORTS T5EncoderModel {
                   const std::string& device,
                   const ov::AnyMap& properties = {});
 
+    T5EncoderModel(const std::string& model,
+                   const Tensor& weights,
+                   const Tokenizer& tokenizer);
+
+    T5EncoderModel(const std::string&model,
+                   const Tensor& weights,
+                   const Tokenizer& tokenizer,
+                   const std::string& device,
+                   const ov::AnyMap& properties = {});
+
     template <typename... Properties,
               typename std::enable_if<ov::util::StringAny<Properties...>::value, bool>::type = true>
     T5EncoderModel(const std::filesystem::path& root_dir,
-                  const std::string& device,
-                  Properties&&... properties)
+                   const std::string& device,
+                   Properties&&... properties)
         : T5EncoderModel(root_dir, device, ov::AnyMap{std::forward<Properties>(properties)...}) { }
 
+    template <typename... Properties,
+              typename std::enable_if<ov::util::StringAny<Properties...>::value, bool>::type = true>
+    T5EncoderModel(const std::string& model,
+                   const Tensor& weights,
+                   const Tokenizer& tokenizer,
+                   const std::string& device,
+                   Properties&&... properties)
+        : T5EncoderModel(model, weights, tokenizer, device, ov::AnyMap{std::forward<Properties>(properties)...}) { }
+
     T5EncoderModel(const T5EncoderModel&);
 
     T5EncoderModel& reshape(int batch_size, int max_sequence_length);
diff --git a/src/cpp/include/openvino/genai/image_generation/unet2d_condition_model.hpp b/src/cpp/include/openvino/genai/image_generation/unet2d_condition_model.hpp
index 85a370b449..4acfd2ce9b 100644
--- a/src/cpp/include/openvino/genai/image_generation/unet2d_condition_model.hpp
+++ b/src/cpp/include/openvino/genai/image_generation/unet2d_condition_model.hpp
@@ -36,6 +36,18 @@ class OPENVINO_GENAI_EXPORTS UNet2DConditionModel {
                          const std::string& device,
                          const ov::AnyMap& properties = {});
 
+    UNet2DConditionModel(const std::string& model,
+                         const Tensor& weights,
+                         const Config& config,
+                         const size_t vae_scale_factor);
+
+    UNet2DConditionModel(const std::string& model,
+                         const Tensor& weights,
+                         const Config& config,
+                         const size_t vae_scale_factor,
+                         const std::string& device,
+                         const ov::AnyMap& properties = {});
+
     template <typename... Properties,
               typename std::enable_if<ov::util::StringAny<Properties...>::value, bool>::type = true>
     UNet2DConditionModel(const std::filesystem::path& root_dir,
@@ -43,6 +55,21 @@ class OPENVINO_GENAI_EXPORTS UNet2DConditionModel {
                          Properties&&... properties)
         : UNet2DConditionModel(root_dir, device, ov::AnyMap{std::forward<Properties>(properties)...}) { }
 
+    template <typename... Properties,
+              typename std::enable_if<ov::util::StringAny<Properties...>::value, bool>::type = true>
+    UNet2DConditionModel(const std::string& model,
+                         const Tensor& weights,
+                         const Config& config,
+                         const size_t vae_scale_factor,
+                         const std::string& device,
+                         Properties&&... properties)
+        : UNet2DConditionModel(model,
+                               weights,
+                               config,
+                               vae_scale_factor,
+                               device,
+                               ov::AnyMap{std::forward<Properties>(properties)...}) { }
+
     UNet2DConditionModel(const UNet2DConditionModel&);
 
     const Config& get_config() const;
diff --git a/src/cpp/src/image_generation/models/autoencoder_kl.cpp b/src/cpp/src/image_generation/models/autoencoder_kl.cpp
index d7eaf18bf4..7c38cd77fa 100644
--- a/src/cpp/src/image_generation/models/autoencoder_kl.cpp
+++ b/src/cpp/src/image_generation/models/autoencoder_kl.cpp
@@ -129,6 +129,60 @@ AutoencoderKL::AutoencoderKL(const std::filesystem::path& vae_encoder_path,
     }
 }
 
+AutoencoderKL::AutoencoderKL(const std::string& vae_decoder_model,
+                             const Tensor& vae_decoder_weights,
+                             const Config& vae_decoder_config)
+    : m_config(vae_decoder_config) {
+    ov::Core core = utils::singleton_core();
+    m_decoder_model = core.read_model(vae_decoder_model, vae_decoder_weights);
+    // apply VaeImageProcessor postprocessing steps by merging them into the VAE decoder model
+    merge_vae_image_post_processing();
+}
+
+AutoencoderKL::AutoencoderKL(const std::string& vae_encoder_model,
+                             const Tensor& vae_encoder_weights,
+                             const std::string& vae_decoder_model,
+                             const Tensor& vae_decoder_weights,
+                             const Config& vae_decoder_config)
+    : AutoencoderKL(vae_decoder_model, vae_decoder_weights, vae_decoder_config) {
+    ov::Core core = utils::singleton_core();
+    m_encoder_model = core.read_model(vae_encoder_model, vae_encoder_weights);
+    // apply VaeImageProcessor pre-processing steps by merging them into the VAE encoder
+    merge_vae_image_pre_processing();
+}
+
+AutoencoderKL::AutoencoderKL(const std::string& vae_decoder_model,
+                             const Tensor& vae_decoder_weights,
+                             const Config& vae_decoder_config,
+                             const std::string& device,
+                             const ov::AnyMap& properties)
+    : AutoencoderKL(vae_decoder_model, vae_decoder_weights, vae_decoder_config) {
+    if (auto filtered_properties = extract_adapters_from_properties(properties)) {
+        compile(device, *filtered_properties);
+    } else {
+        compile(device, properties);
+    }
+}
+
+AutoencoderKL::AutoencoderKL(const std::string& vae_encoder_model,
+                             const Tensor& vae_encoder_weights,
+                             const std::string& vae_decoder_model,
+                             const Tensor& vae_decoder_weights,
+                             const Config& vae_decoder_config,
+                             const std::string& device,
+                             const ov::AnyMap& properties)
+    : AutoencoderKL(vae_encoder_model,
+                    vae_encoder_weights,
+                    vae_decoder_model,
+                    vae_decoder_weights,
+                    vae_decoder_config) {
+    if (auto filtered_properties = extract_adapters_from_properties(properties)) {
+        compile(device, *filtered_properties);
+    } else {
+        compile(device, properties);
+    }
+}
+
 AutoencoderKL::AutoencoderKL(const AutoencoderKL&) = default;
 
 AutoencoderKL& AutoencoderKL::reshape(int batch_size, int height, int width) {
diff --git a/src/cpp/src/image_generation/models/clip_text_model.cpp b/src/cpp/src/image_generation/models/clip_text_model.cpp
index f5a4d0940b..d2dab30bcf 100644
--- a/src/cpp/src/image_generation/models/clip_text_model.cpp
+++ b/src/cpp/src/image_generation/models/clip_text_model.cpp
@@ -48,6 +48,25 @@ CLIPTextModel::CLIPTextModel(const std::filesystem::path& root_dir,
     compile(device, properties);
 }
 
+CLIPTextModel::CLIPTextModel(const std::string& model,
+                             const Tensor& weights,
+                             const Config& config,
+                             const Tokenizer& clip_tokenizer) :
+    m_clip_tokenizer(clip_tokenizer), m_config(config) {
+    ov::Core core = utils::singleton_core();
+    m_model = core.read_model(model, weights);
+}
+
+CLIPTextModel::CLIPTextModel(const std::string& model,
+                             const Tensor& weights,
+                             const Config& config,
+                             const Tokenizer& clip_tokenizer,
+                             const std::string& device,
+                             const ov::AnyMap& properties) :
+    CLIPTextModel(model, weights, config, clip_tokenizer) {
+    compile(device, properties);
+}
+
 CLIPTextModel::CLIPTextModel(const CLIPTextModel&) = default;
 
 const CLIPTextModel::Config& CLIPTextModel::get_config() const {
diff --git a/src/cpp/src/image_generation/models/clip_text_model_with_projection.cpp b/src/cpp/src/image_generation/models/clip_text_model_with_projection.cpp
index 9a89fd73bc..13c7f5a442 100644
--- a/src/cpp/src/image_generation/models/clip_text_model_with_projection.cpp
+++ b/src/cpp/src/image_generation/models/clip_text_model_with_projection.cpp
@@ -39,6 +39,25 @@ CLIPTextModelWithProjection::CLIPTextModelWithProjection(const std::filesystem::
     compile(device, properties);
 }
 
+CLIPTextModelWithProjection::CLIPTextModelWithProjection(const std::string& model,
+                                                         const Tensor& weights,
+                                                         const Config& config,
+                                                         const Tokenizer& clip_tokenizer) :
+    m_clip_tokenizer(clip_tokenizer), m_config(config) {
+    ov::Core core = utils::singleton_core();
+    m_model = core.read_model(model, weights);
+}
+
+CLIPTextModelWithProjection::CLIPTextModelWithProjection(const std::string& model,
+                                                         const Tensor& weights,
+                                                         const Config& config,
+                                                         const Tokenizer& clip_tokenizer,
+                                                         const std::string& device,
+                                                         const ov::AnyMap& properties) :
+    CLIPTextModelWithProjection(model, weights, config, clip_tokenizer) {
+    compile(device, properties);
+}
+
 CLIPTextModelWithProjection::CLIPTextModelWithProjection(const CLIPTextModelWithProjection&) = default;
 
 const CLIPTextModelWithProjection::Config& CLIPTextModelWithProjection::get_config() const {
diff --git a/src/cpp/src/image_generation/models/flux_transformer_2d_model.cpp b/src/cpp/src/image_generation/models/flux_transformer_2d_model.cpp
index 92439be423..8bb66995b4 100644
--- a/src/cpp/src/image_generation/models/flux_transformer_2d_model.cpp
+++ b/src/cpp/src/image_generation/models/flux_transformer_2d_model.cpp
@@ -37,6 +37,25 @@ FluxTransformer2DModel::FluxTransformer2DModel(const std::filesystem::path& root
     compile(device, properties);
 }
 
+FluxTransformer2DModel::FluxTransformer2DModel(const std::string& model,
+                                               const Tensor& weights,
+                                               const Config& config,
+                                               const size_t vae_scale_factor) :
+    m_config(config), m_vae_scale_factor(vae_scale_factor) {
+    ov::Core core = utils::singleton_core();
+    m_model = core.read_model(model, weights);
+}
+
+FluxTransformer2DModel::FluxTransformer2DModel(const std::string& model,
+                                               const Tensor& weights,
+                                               const Config& config,
+                                               const size_t vae_scale_factor,
+                                               const std::string& device,
+                                               const ov::AnyMap& properties) :
+    FluxTransformer2DModel(model, weights, config, vae_scale_factor) {
+    compile(device, properties);
+}
+
 FluxTransformer2DModel::FluxTransformer2DModel(const FluxTransformer2DModel&) = default;
 
 const FluxTransformer2DModel::Config& FluxTransformer2DModel::get_config() const {
@@ -44,9 +63,9 @@ const FluxTransformer2DModel::Config& FluxTransformer2DModel::get_config() const
 }
 
 FluxTransformer2DModel& FluxTransformer2DModel::reshape(int batch_size,
-                                                      int height,
-                                                      int width,
-                                                      int tokenizer_model_max_length) {
+                                                        int height,
+                                                        int width,
+                                                        int tokenizer_model_max_length) {
     OPENVINO_ASSERT(m_model, "Model has been already compiled. Cannot reshape already compiled model");
 
     // hidden_states=latent_model_input,
diff --git a/src/cpp/src/image_generation/models/sd3_transformer_2d_model.cpp b/src/cpp/src/image_generation/models/sd3_transformer_2d_model.cpp
index 38e3dad290..70dddb0476 100644
--- a/src/cpp/src/image_generation/models/sd3_transformer_2d_model.cpp
+++ b/src/cpp/src/image_generation/models/sd3_transformer_2d_model.cpp
@@ -39,6 +39,25 @@ SD3Transformer2DModel::SD3Transformer2DModel(const std::filesystem::path& root_d
     compile(device, properties);
 }
 
+SD3Transformer2DModel::SD3Transformer2DModel(const std::string& model,
+                                             const Tensor& weights,
+                                             const Config& config,
+                                             const size_t vae_scale_factor) :
+    m_config(config), m_vae_scale_factor(vae_scale_factor) {
+    ov::Core core = utils::singleton_core();
+    m_model = core.read_model(model, weights);
+}
+
+SD3Transformer2DModel::SD3Transformer2DModel(const std::string& model,
+                                             const Tensor& weights,
+                                             const Config& config,
+                                             const size_t vae_scale_factor,
+                                             const std::string& device,
+                                             const ov::AnyMap& properties) :
+    SD3Transformer2DModel(model, weights, config, vae_scale_factor) {
+    compile(device, properties);
+}
+
 SD3Transformer2DModel::SD3Transformer2DModel(const SD3Transformer2DModel&) = default;
 
 const SD3Transformer2DModel::Config& SD3Transformer2DModel::get_config() const {
diff --git a/src/cpp/src/image_generation/models/t5_encoder_model.cpp b/src/cpp/src/image_generation/models/t5_encoder_model.cpp
index 2efe4986e8..e7629b2f26 100644
--- a/src/cpp/src/image_generation/models/t5_encoder_model.cpp
+++ b/src/cpp/src/image_generation/models/t5_encoder_model.cpp
@@ -27,6 +27,23 @@ T5EncoderModel::T5EncoderModel(const std::filesystem::path& root_dir,
     compile(device, properties);
 }
 
+T5EncoderModel::T5EncoderModel(const std::string& model,
+                               const Tensor& weights,
+                               const Tokenizer& tokenizer) :
+    m_tokenizer(tokenizer) {
+    ov::Core core = utils::singleton_core();
+    m_model = core.read_model(model, weights);
+}
+
+T5EncoderModel::T5EncoderModel(const std::string& model,
+                               const Tensor& weights,
+                               const Tokenizer& tokenizer,
+                               const std::string& device,
+                               const ov::AnyMap& properties) :
+    T5EncoderModel(model, weights, tokenizer) {
+    compile(device, properties);
+}
+
 T5EncoderModel::T5EncoderModel(const T5EncoderModel&) = default;
 
 T5EncoderModel& T5EncoderModel::reshape(int batch_size, int max_sequence_length) {
diff --git a/src/cpp/src/image_generation/models/unet2d_condition_model.cpp b/src/cpp/src/image_generation/models/unet2d_condition_model.cpp
index 413acb638b..ca65c9d9d6 100644
--- a/src/cpp/src/image_generation/models/unet2d_condition_model.cpp
+++ b/src/cpp/src/image_generation/models/unet2d_condition_model.cpp
@@ -42,6 +42,25 @@ UNet2DConditionModel::UNet2DConditionModel(const std::filesystem::path& root_dir
     compile(device, properties);
 }
 
+UNet2DConditionModel::UNet2DConditionModel(const std::string& model,
+                                           const Tensor& weights,
+                                           const Config& config,
+                                           const size_t vae_scale_factor) :
+    m_config(config), m_vae_scale_factor(vae_scale_factor) {
+    ov::Core core = utils::singleton_core();
+    m_model = core.read_model(model, weights);
+}
+
+UNet2DConditionModel::UNet2DConditionModel(const std::string& model,
+                                           const Tensor& weights,
+                                           const Config& config,
+                                           const size_t vae_scale_factor,
+                                           const std::string& device,
+                                           const ov::AnyMap& properties) :
+    UNet2DConditionModel(model, weights, config, vae_scale_factor) {
+    compile(device, properties);
+}
+
 UNet2DConditionModel::UNet2DConditionModel(const UNet2DConditionModel&) = default;
 
 const UNet2DConditionModel::Config& UNet2DConditionModel::get_config() const {