Merge branch 'master' into feature/nodejs-bindings
vishniakov-nikolai authored Nov 25, 2024
2 parents 6853446 + d490c18 commit e433a19
Showing 31 changed files with 604 additions and 556 deletions.
25 changes: 21 additions & 4 deletions README.md
@@ -117,17 +117,34 @@ optimum-cli export openvino --model openbmb/MiniCPM-V-2_6 --trust-remote-code --
 
 ### Run generation using VLMPipeline API in Python
 
+See [Visual Language Chat](https://github.com/openvinotoolkit/openvino.genai/tree/master/samples/python/visual_language_chat) for a demo application.
+
+Run the following command to download a sample image:
+
+```sh
+curl -O "https://storage.openvinotoolkit.org/test_data/images/dog.jpg"
+```
+
 ```python
+import numpy as np
+import openvino as ov
 import openvino_genai as ov_genai
-#Will run model on CPU, GPU is a possible option
+from PIL import Image
+
+# Choose GPU instead of CPU in the line below to run the model on Intel integrated or discrete GPU
 pipe = ov_genai.VLMPipeline("./MiniCPM-V-2_6/", "CPU")
-rgb = read_image("cat.jpg")
-print(pipe.generate(prompt, image=rgb, max_new_tokens=100))
+
+image = Image.open("dog.jpg")
+image_data = np.array(image.getdata()).reshape(1, image.size[1], image.size[0], 3).astype(np.uint8)
+image_data = ov.Tensor(image_data)
+
+prompt = "Can you describe the image?"
+print(pipe.generate(prompt, image=image_data, max_new_tokens=100))
 ```
 
 ### Run generation using VLMPipeline in C++
 
-Code below requires installation of C++ compatible package (see [here](https://docs.openvino.ai/2024/get-started/install-openvino/install-openvino-genai.html#archive-installation) for more details)
+Code below requires installation of C++ compatible package (see [here](https://docs.openvino.ai/2024/get-started/install-openvino/install-openvino-genai.html#archive-installation) for more details). See [Visual Language Chat](https://github.com/openvinotoolkit/openvino.genai/tree/master/samples/cpp/visual_language_chat) for a demo application.
 
 ```cpp
 #include "load_image.hpp"
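The rest of the C++ sample is collapsed in this view. As a rough sketch of the usage it demonstrates, assuming the `utils::load_image` helper from `load_image.hpp`, the header path, and the property-style `generate` overload (all assumptions, not verified against this commit):

```cpp
#include "load_image.hpp"
#include "openvino/genai/visual_language/pipeline.hpp"

#include <iostream>

int main(int argc, char* argv[]) {
    std::string models_path = argv[1];  // sketch: no argument checking
    // Use "GPU" instead of "CPU" to run on an Intel integrated or discrete GPU.
    ov::genai::VLMPipeline pipe(models_path, "CPU");
    ov::Tensor rgb = utils::load_image(argv[2]);  // assumed helper from load_image.hpp
    std::cout << pipe.generate("Can you describe the image?",
                               ov::genai::image(rgb),
                               ov::genai::max_new_tokens(100));
}
```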
8 changes: 6 additions & 2 deletions samples/CMakeLists.txt
@@ -19,8 +19,12 @@ add_subdirectory(cpp/text2image)
 add_subdirectory(cpp/visual_language_chat)
 add_subdirectory(cpp/whisper_speech_recognition)
 
-install(FILES requirements.txt DESTINATION samples
-        COMPONENT cpp_samples_genai)
+install(FILES
+    deployment-requirements.txt
+    export-requirements.txt
+    requirements.txt
+    DESTINATION samples
+    COMPONENT cpp_samples_genai)
 
 install(DIRECTORY
     cpp/beam_search_causal_lm
6 changes: 4 additions & 2 deletions samples/cpp/text2image/README.md
@@ -46,14 +46,16 @@ You can also add a callback to the `main.cpp` file to interrupt the image generation
 Please find the template of the callback usage below.
 
 ```cpp
-auto callback = [](size_t step, ov::Tensor& intermediate_res) -> bool {
+ov::genai::Text2ImagePipeline pipe(models_path, device);
+
+auto callback = [&](size_t step, ov::Tensor& intermediate_res) -> bool {
     std::cout << "Image generation step: " << step << std::endl;
+    ov::Tensor img = pipe.decode(intermediate_res); // get intermediate image tensor
     if (your_condition) // return true if you want to interrupt image generation
         return true;
     return false;
 };
 
-ov::genai::Text2ImagePipeline pipe(models_path, device);
 ov::Tensor image = pipe.generate(prompt,
     ...
     ov::genai::callback(callback)
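As one concrete instance of `your_condition`, the fragment below interrupts generation after a fixed step budget. It is a sketch in the same style as the template: `models_path`, `device`, and `prompt` are assumed defined, and the two-argument callback signature is the one this commit documents:

```cpp
ov::genai::Text2ImagePipeline pipe(models_path, device);

const size_t step_budget = 10;  // hypothetical limit for this sketch
auto callback = [&](size_t step, ov::Tensor& intermediate_res) -> bool {
    std::cout << "Image generation step: " << step << std::endl;
    return step + 1 >= step_budget;  // returning true interrupts generation
};

ov::Tensor image = pipe.generate(prompt, ov::genai::callback(callback));
```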
4 changes: 3 additions & 1 deletion samples/python/text2image/README.md
@@ -46,13 +46,15 @@ You can also add a callback to the `main.py` file to interrupt the image generation
 Please find the template of the callback usage below.
 
 ```python
+pipe = openvino_genai.Text2ImagePipeline(model_dir, device)
+
 def callback(step, intermediate_res):
     print("Image generation step: ", step)
+    image_tensor = pipe.decode(intermediate_res) # get intermediate image tensor
     if your_condition: # return True if you want to interrupt image generation
         return True
     return False
 
-pipe = openvino_genai.Text2ImagePipeline(model_dir, device)
 image = pipe.generate(
     ...
     callback = callback
2 changes: 1 addition & 1 deletion src/cpp/include/openvino/genai/generation_config.hpp
@@ -156,7 +156,7 @@ static constexpr ov::Property<bool> ignore_eos{"ignore_eos"};
 static constexpr ov::Property<size_t> min_new_tokens{"min_new_tokens"};
 static constexpr ov::Property<std::vector<std::string>> stop_strings{"stop_strings"};
 static constexpr ov::Property<bool> include_stop_str_in_output{"include_stop_str_in_output"};
-static constexpr ov::Property<std::vector<std::vector<int64_t>>> stop_token_ids{"stop_token_ids"};
+static constexpr ov::Property<std::set<int64_t>> stop_token_ids{"stop_token_ids"};
 
 static constexpr ov::Property<size_t> num_beam_groups{"num_beam_groups"};
 static constexpr ov::Property<size_t> num_beams{"num_beams"};
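With this change the property carries a flat set of token ids rather than a list of id sequences. A hedged usage sketch against the LLMPipeline property API (the model folder and the token id `2` are made up for illustration):

```cpp
#include <cstdint>
#include <iostream>
#include <set>
#include <string>

#include "openvino/genai/llm_pipeline.hpp"

int main() {
    ov::genai::LLMPipeline pipe("./model_dir", "CPU");  // hypothetical model folder
    std::string result = pipe.generate(
        "The Sun is yellow because",
        ov::genai::max_new_tokens(100),
        ov::genai::stop_token_ids(std::set<int64_t>{2}));  // stop if token 2 is sampled
    std::cout << result << '\n';
}
```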
36 changes: 20 additions & 16 deletions src/cpp/src/image_generation/flux_pipeline.hpp
@@ -297,33 +297,33 @@ class FluxPipeline : public DiffusionPipeline {
     ov::Tensor generate(const std::string& positive_prompt,
                         ov::Tensor initial_image,
                         const ov::AnyMap& properties) override {
-        ImageGenerationConfig generation_config = m_generation_config;
-        generation_config.update_generation_config(properties);
+        m_custom_generation_config = m_generation_config;
+        m_custom_generation_config.update_generation_config(properties);
 
         if (!initial_image) {
             // in case of typical text to image generation, we need to ignore 'strength'
-            generation_config.strength = 1.0f;
+            m_custom_generation_config.strength = 1.0f;
         }
 
         const size_t vae_scale_factor = m_vae->get_vae_scale_factor();
         const auto& transformer_config = m_transformer->get_config();
 
-        if (generation_config.height < 0)
-            generation_config.height = transformer_config.m_default_sample_size * vae_scale_factor;
-        if (generation_config.width < 0)
-            generation_config.width = transformer_config.m_default_sample_size * vae_scale_factor;
+        if (m_custom_generation_config.height < 0)
+            m_custom_generation_config.height = transformer_config.m_default_sample_size * vae_scale_factor;
+        if (m_custom_generation_config.width < 0)
+            m_custom_generation_config.width = transformer_config.m_default_sample_size * vae_scale_factor;
 
-        check_inputs(generation_config, initial_image);
+        check_inputs(m_custom_generation_config, initial_image);
 
-        compute_hidden_states(positive_prompt, generation_config);
+        compute_hidden_states(positive_prompt, m_custom_generation_config);
 
-        ov::Tensor latents = prepare_latents(initial_image, generation_config);
+        ov::Tensor latents = prepare_latents(initial_image, m_custom_generation_config);
 
         size_t image_seq_len = latents.get_shape()[1];
         float mu = m_scheduler->calculate_shift(image_seq_len);
 
-        float linspace_end = 1.0f / generation_config.num_inference_steps;
-        std::vector<float> sigmas = numpy_utils::linspace<float>(1.0f, linspace_end, generation_config.num_inference_steps, true);
+        float linspace_end = 1.0f / m_custom_generation_config.num_inference_steps;
+        std::vector<float> sigmas = numpy_utils::linspace<float>(1.0f, linspace_end, m_custom_generation_config.num_inference_steps, true);
 
         m_scheduler->set_timesteps_with_sigma(sigmas, mu);
         std::vector<float> timesteps = m_scheduler->get_float_timesteps();
@@ -345,7 +345,7 @@
 
             ov::Tensor noise_pred_tensor = m_transformer->infer(latents, timestep);
 
-            auto scheduler_step_result = m_scheduler->step(noise_pred_tensor, latents, inference_step, generation_config.generator);
+            auto scheduler_step_result = m_scheduler->step(noise_pred_tensor, latents, inference_step, m_custom_generation_config.generator);
             latents = scheduler_step_result["latent"];
 
             if (do_callback) {
@@ -355,12 +355,16 @@
             }
         }
 
-        latents = unpack_latents(latents, generation_config.height, generation_config.width, vae_scale_factor);
+        latents = unpack_latents(latents, m_custom_generation_config.height, m_custom_generation_config.width, vae_scale_factor);
         return m_vae->decode(latents);
     }
 
     ov::Tensor decode(const ov::Tensor latent) override {
-        return m_vae->decode(latent);
+        ov::Tensor unpacked_latent = unpack_latents(latent,
+                                                    m_custom_generation_config.height,
+                                                    m_custom_generation_config.width,
+                                                    m_vae->get_vae_scale_factor());
+        return m_vae->decode(unpacked_latent);
    }
 
 private:
@@ -407,7 +411,7 @@ class FluxPipeline : public DiffusionPipeline {
     std::shared_ptr<CLIPTextModel> m_clip_text_encoder;
     std::shared_ptr<T5EncoderModel> m_t5_text_encoder;
     std::shared_ptr<AutoencoderKL> m_vae;
-
+    ImageGenerationConfig m_custom_generation_config;
 };
 
 } // namespace genai
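The sigma schedule built in `generate` is an endpoint-inclusive linspace from 1.0 down to 1/`num_inference_steps`. A standalone sketch of that computation, assuming `numpy_utils::linspace<float>` mirrors NumPy's endpoint-inclusive behavior:

```cpp
#include <cstddef>
#include <iostream>
#include <vector>

// Endpoint-inclusive linspace, as numpy_utils::linspace<float> is assumed to behave.
std::vector<float> linspace(float start, float end, size_t num, bool endpoint) {
    std::vector<float> out(num);
    if (num == 1) {
        out[0] = start;
        return out;
    }
    const float step = (end - start) / static_cast<float>(endpoint ? num - 1 : num);
    for (size_t i = 0; i < num; ++i)
        out[i] = start + step * static_cast<float>(i);
    return out;
}

int main() {
    const size_t num_inference_steps = 4;
    // Same schedule FluxPipeline::generate builds before set_timesteps_with_sigma().
    const float linspace_end = 1.0f / num_inference_steps;
    for (float sigma : linspace(1.0f, linspace_end, num_inference_steps, true))
        std::cout << sigma << ' ';  // prints: 1 0.75 0.5 0.25
    std::cout << '\n';
}
```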
7 changes: 7 additions & 0 deletions src/cpp/src/llm_pipeline_static.cpp
@@ -530,6 +530,13 @@ template <typename T>
 T pop_or_default(ov::AnyMap& config, const std::string& key, const T& default_value) {
     auto anyopt = pop_option(config, key);
     if (anyopt.has_value()) {
+        if (anyopt.value().empty()) {
+            if (ov::genai::utils::is_container<T>)
+                return T{};
+            else {
+                OPENVINO_THROW("Got empty ov::Any for key: " + key);
+            }
+        }
         return anyopt.value().as<T>();
     }
     return default_value;
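The new branch treats an empty `ov::Any` as "use an empty container" for container-typed options and as an error for scalar ones. A self-contained sketch of that rule, with `std::any` standing in for `ov::Any` and a plain map standing in for `ov::AnyMap`:

```cpp
#include <any>
#include <cstdint>
#include <iostream>
#include <map>
#include <stdexcept>
#include <string>
#include <type_traits>
#include <utility>
#include <vector>

// Same begin()/end() detection idiom as utils.hpp.
template <typename, typename = void>
constexpr bool is_container = false;
template <typename T>
constexpr bool is_container<T, std::void_t<decltype(std::declval<T>().begin()),
                                           decltype(std::declval<T>().end())>> = true;

template <typename T>
T pop_or_default(std::map<std::string, std::any>& config, const std::string& key, const T& default_value) {
    auto it = config.find(key);
    if (it == config.end())
        return default_value;
    std::any value = std::move(it->second);
    config.erase(it);
    if (!value.has_value()) {          // empty value, like an empty ov::Any
        if constexpr (is_container<T>)
            return T{};                // containers degrade to "empty"
        else
            throw std::runtime_error("Got empty value for key: " + key);
    }
    return std::any_cast<T>(value);
}

int main() {
    std::map<std::string, std::any> config{{"ids", std::any{}}};
    auto ids = pop_or_default<std::vector<int64_t>>(config, "ids", {1, 2});
    std::cout << ids.size() << '\n';   // 0: the empty entry yields an empty container
}
```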
22 changes: 21 additions & 1 deletion src/cpp/src/utils.hpp
@@ -2,6 +2,7 @@
 // SPDX-License-Identifier: Apache-2.0
 
 #pragma once
+#include <type_traits>
 
 #include "openvino/genai/llm_pipeline.hpp"
 #include "openvino/runtime/core.hpp"
@@ -12,6 +13,16 @@ namespace ov {
 namespace genai {
 namespace utils {
 
+// Variable template that checks if a type has begin() and end() member functions
+template<typename, typename = void>
+constexpr bool is_container = false;
+
+template<typename T>
+constexpr bool is_container<T,
+    std::void_t<decltype(std::declval<T>().begin()),
+                decltype(std::declval<T>().end())>> = true;
+
+
 Tensor init_attention_mask(const Tensor& position_ids);
 
 void print_tensor(const ov::Tensor& tensor);
@@ -31,7 +42,16 @@ template <typename T>
 void read_anymap_param(const ov::AnyMap& config_map, const std::string& name, T& param) {
     auto it = config_map.find(name);
     if (it != config_map.end()) {
-        param = it->second.as<typename OmitOptional<T>::value>();
+        if (it->second.empty()) {
+            if (ov::genai::utils::is_container<T>)
+                param = T{};
+            else {
+                OPENVINO_THROW("Got empty ov::Any for parameter name: " + name);
+            }
+        }
+        else {
+            param = it->second.as<typename OmitOptional<T>::value>();
+        }
     }
 }
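The detection idiom can be checked in isolation. Note it is true for anything exposing `begin()`/`end()`, including `std::string`; the sketch below copies the variable template outside the `ov::genai::utils` namespace:

```cpp
#include <cstdint>
#include <set>
#include <string>
#include <type_traits>
#include <utility>
#include <vector>

// Copy of the utils.hpp variable template for standalone testing.
template <typename, typename = void>
constexpr bool is_container = false;

template <typename T>
constexpr bool is_container<T, std::void_t<decltype(std::declval<T>().begin()),
                                           decltype(std::declval<T>().end())>> = true;

static_assert(is_container<std::vector<int>>, "vector has begin()/end()");
static_assert(is_container<std::set<int64_t>>, "set has begin()/end()");
static_assert(is_container<std::string>, "string counts as a container here");
static_assert(!is_container<int>, "scalars are not containers");

int main() { return 0; }
```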
4 changes: 3 additions & 1 deletion src/python/CMakeLists.txt
@@ -182,12 +182,14 @@ if(pybind11_stubgen_AVAILABLE)
             VERBATIM)
 
     add_custom_target(${TARGET_NAME}_stub ALL DEPENDS ${output_file})
-else()
+elseif(OpenVINODeveloperPackage_FOUND)
     # Produce warning message at build time as well
     add_custom_command(OUTPUT pybind11_stub_gen_not_found.txt
         COMMAND ${CMAKE_COMMAND}
             -E cmake_echo_color --red "Warning: Please, install ${pybind11_stubgen_dep}")
     add_custom_target(${TARGET_NAME}_stub ALL DEPENDS pybind11_stub_gen_not_found.txt)
+else()
+    add_custom_target(${TARGET_NAME}_stub ALL)
 endif()
 
 add_dependencies(${TARGET_NAME}_stub ${TARGET_NAME})
2 changes: 1 addition & 1 deletion src/python/openvino_genai/py_openvino_genai.pyi
@@ -1296,7 +1296,7 @@ class Tokenizer:
         openvino_genai.Tokenizer object is used to initialize Tokenizer
         if it's located in a different path than the main model.
         """
-    def __init__(self, tokenizer_path: os.PathLike, properties: dict[str, typing.Any] = {}) -> None:
+    def __init__(self, tokenizer_path: os.PathLike, properties: dict[str, typing.Any] = {}, **kwargs) -> None:
         ...
     def apply_chat_template(self, history: list[dict[str, str]], add_generation_prompt: bool, chat_template: str = '') -> str:
         """
108 changes: 3 additions & 105 deletions src/python/py_image_generation_pipelines.cpp
@@ -67,108 +67,6 @@ auto text2image_generate_docstring = R"(
 )";
 
 
-void update_image_generation_config_from_kwargs(
-    ov::genai::ImageGenerationConfig& config,
-    const py::kwargs& kwargs) {
-    for (const auto& item : kwargs) {
-        std::string key = py::cast<std::string>(item.first);
-        py::object value = py::cast<py::object>(item.second);
-
-        if (key == "prompt_2") {
-            config.prompt_2 = py::cast<std::string>(value);
-        } else if (key == "prompt_3") {
-            config.prompt_3 = py::cast<std::string>(value);
-        } else if (key == "negative_prompt") {
-            config.negative_prompt = py::cast<std::string>(value);
-        } else if (key == "negative_prompt_2") {
-            config.negative_prompt_2 = py::cast<std::string>(value);
-        } else if (key == "negative_prompt_3") {
-            config.negative_prompt_3 = py::cast<std::string>(value);
-        } else if (key == "num_images_per_prompt") {
-            config.num_images_per_prompt = py::cast<size_t>(value);
-        } else if (key == "guidance_scale") {
-            config.guidance_scale = py::cast<float>(value);
-        } else if (key == "height") {
-            config.height = py::cast<int64_t>(value);
-        } else if (key == "width") {
-            config.width = py::cast<int64_t>(value);
-        } else if (key == "num_inference_steps") {
-            config.num_inference_steps = py::cast<size_t>(value);
-        } else if (key == "generator") {
-            auto py_generator = py::cast<std::shared_ptr<ov::genai::Generator>>(value);
-            config.generator = py_generator;
-        } else if (key == "adapters") {
-            config.adapters = py::cast<ov::genai::AdapterConfig>(value);
-        } else if (key == "strength") {
-            config.strength = py::cast<float>(value);
-        } else if (key == "max_sequence_length") {
-            config.max_sequence_length = py::cast<size_t>(value);
-        } else {
-            throw(std::invalid_argument("'" + key + "' is unexpected parameter name. "
-                                        "Use help(openvino_genai.ImageGenerationConfig) to get list of acceptable parameters."));
-        }
-    }
-}
-
-ov::AnyMap text2image_kwargs_to_any_map(const py::kwargs& kwargs, bool allow_compile_properties=true) {
-    ov::AnyMap params = {};
-
-    for (const auto& item : kwargs) {
-        std::string key = py::cast<std::string>(item.first);
-        py::object value = py::cast<py::object>(item.second);
-
-        if (key == "prompt_2") {
-            params.insert({ov::genai::prompt_2(std::move(py::cast<std::string>(value)))});
-        } else if (key == "prompt_3") {
-            params.insert({ov::genai::prompt_3(std::move(py::cast<std::string>(value)))});
-        } else if (key == "negative_prompt") {
-            params.insert({ov::genai::negative_prompt(std::move(py::cast<std::string>(value)))});
-        } else if (key == "negative_prompt_2") {
-            params.insert({ov::genai::negative_prompt_2(std::move(py::cast<std::string>(value)))});
-        } else if (key == "negative_prompt_3") {
-            params.insert({ov::genai::negative_prompt_3(std::move(py::cast<std::string>(value)))});
-        } else if (key == "num_images_per_prompt") {
-            params.insert({ov::genai::num_images_per_prompt(std::move(py::cast<size_t>(value)))});
-        } else if (key == "guidance_scale") {
-            params.insert({ov::genai::guidance_scale(std::move(py::cast<float>(value)))});
-        } else if (key == "height") {
-            params.insert({ov::genai::height(std::move(py::cast<int64_t>(value)))});
-        } else if (key == "width") {
-            params.insert({ov::genai::width(std::move(py::cast<int64_t>(value)))});
-        } else if (key == "num_inference_steps") {
-            params.insert({ov::genai::num_inference_steps(std::move(py::cast<size_t>(value)))});
-        } else if (key == "generator") {
-            auto py_generator = py::cast<std::shared_ptr<ov::genai::Generator>>(value);
-            params.insert({ov::genai::generator(std::move(py_generator))});
-        } else if (key == "adapters") {
-            params.insert({ov::genai::adapters(std::move(py::cast<ov::genai::AdapterConfig>(value)))});
-        } else if (key == "strength") {
-            params.insert({ov::genai::strength(std::move(py::cast<float>(value)))});
-        } else if (key == "max_sequence_length") {
-            params.insert({ov::genai::max_sequence_length(std::move(py::cast<size_t>(value)))});
-        } else if (key == "callback") {
-            params.insert({ov::genai::callback(std::move(py::cast<std::function<bool(size_t, ov::Tensor&)>>(value)))});
-        }
-        else {
-            if (allow_compile_properties) {
-                // convert arbitrary objects to ov::Any
-                // not supported properties are not checked, as these properties are passed to compile(), which will throw exception in case of unsupported property
-                if (pyutils::py_object_is_any_map(value)) {
-                    auto map = pyutils::py_object_to_any_map(value);
-                    params.insert(map.begin(), map.end());
-                } else {
-                    params[key] = pyutils::py_object_to_any(value);
-                }
-            }
-            else {
-                // generate doesn't run compile(), so only Text2ImagePipeline specific properties are allowed
-                throw(std::invalid_argument("'" + key + "' is unexpected parameter name. "
-                                            "Use help(openvino_genai.Text2ImagePipeline.generate) to get list of acceptable parameters."));
-            }
-        }
-    }
-    return params;
-}
-
 } // namespace
 
@@ -230,7 +128,7 @@ void init_image_generation_pipelines(py::module_& m) {
         .def("update_generation_config", [](
             ov::genai::ImageGenerationConfig config,
             const py::kwargs& kwargs) {
-            update_image_generation_config_from_kwargs(config, kwargs);
+            config.update_generation_config(pyutils::kwargs_to_any_map(kwargs));
         });
 
     auto text2image_pipeline = py::class_<ov::genai::Text2ImagePipeline>(m, "Text2ImagePipeline", "This class is used for generation with text-to-image models.")
@@ -252,7 +150,7 @@
             const py::kwargs& kwargs
         ) {
             ScopedVar env_manager(pyutils::ov_tokenizers_module_path());
-            return std::make_unique<ov::genai::Text2ImagePipeline>(models_path, device, text2image_kwargs_to_any_map(kwargs, true));
+            return std::make_unique<ov::genai::Text2ImagePipeline>(models_path, device, pyutils::kwargs_to_any_map(kwargs));
         }),
         py::arg("models_path"), "folder with exported model files.",
         py::arg("device"), "device on which inference will be done",
@@ -289,7 +187,7 @@
             const std::string& prompt,
             const py::kwargs& kwargs
         ) -> py::typing::Union<ov::Tensor> {
-            ov::AnyMap params = text2image_kwargs_to_any_map(kwargs, false);
+            ov::AnyMap params = pyutils::kwargs_to_any_map(kwargs);
             return py::cast(pipe.generate(prompt, params));
         },
         py::arg("prompt"), "Input string",