
Commit

Merge branch 'generate_pipeline' into update-tests
Wovchena committed May 29, 2024
2 parents db31fe8 + 6709a67 commit 6e52dc9
Showing 21 changed files with 693 additions and 377 deletions.
33 changes: 32 additions & 1 deletion src/cpp/include/openvino/genai/generation_config.hpp
@@ -100,8 +100,39 @@ class OPENVINO_GENAI_EXPORTS GenerationConfig {
bool is_greedy_decoding() const;
bool is_beam_search() const;
bool is_multinomial() const;
static GenerationConfig anymap_to_generation_config(const ov::AnyMap& config_map = {});
void update_generation_config(const ov::AnyMap& config_map = {});
};

/*
 * utils that allow using generate() and operator() in the following way:
* pipe.generate(input_ids, ov::max_new_tokens(200), ov::temperature(1.0f),...)
* pipe(text, ov::max_new_tokens(200), ov::temperature(1.0f),...)
*/
static constexpr ov::Property<size_t> max_new_tokens{"max_new_tokens"};
static constexpr ov::Property<size_t> max_length{"max_length"};
static constexpr ov::Property<bool> ignore_eos{"ignore_eos"};

static constexpr ov::Property<size_t> num_beam_groups{"num_beam_groups"};
static constexpr ov::Property<size_t> num_beams{"num_beams"};
static constexpr ov::Property<float> diversity_penalty{"diversity_penalty"};
static constexpr ov::Property<float> length_penalty{"length_penalty"};
static constexpr ov::Property<size_t> num_return_sequences{"num_return_sequences"};
static constexpr ov::Property<size_t> no_repeat_ngram_size{"no_repeat_ngram_size"};
static constexpr ov::Property<StopCriteria> stop_criteria{"stop_criteria"};

static constexpr ov::Property<float> temperature{"temperature"};
static constexpr ov::Property<float> top_p{"top_p"};
static constexpr ov::Property<int> top_k{"top_k"};
static constexpr ov::Property<bool> do_sample{"do_sample"};
static constexpr ov::Property<float> repetition_penalty{"repetition_penalty"};


static constexpr ov::Property<int64_t> pad_token_id{"pad_token_id"};
static constexpr ov::Property<int64_t> bos_token_id{"bos_token_id"};
static constexpr ov::Property<int64_t> eos_token_id{"eos_token_id"};

static constexpr ov::Property<std::string> bos_token{"bos_token"};
static constexpr ov::Property<std::string> eos_token{"eos_token"};

} // namespace genai
} // namespace ov
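
A minimal sketch of how the property helpers above combine with the new update_generation_config() (a sketch only, not part of this commit; it assumes GenerationConfig is default-constructible):

#include "openvino/genai/generation_config.hpp"

int main() {
    ov::genai::GenerationConfig config;

    // Each property helper collapses to a {name, ov::Any} pair, so a set of them
    // can initialize an ov::AnyMap and be applied in a single call.
    ov::AnyMap overrides{
        ov::genai::max_new_tokens(200),
        ov::genai::temperature(1.0f),
        ov::genai::do_sample(true)
    };
    config.update_generation_config(overrides);
    return 0;
}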
170 changes: 85 additions & 85 deletions src/cpp/include/openvino/genai/llm_pipeline.hpp
@@ -14,9 +14,10 @@
namespace ov {
namespace genai {

using StreamerVariant = std::variant<std::function<void (std::string)>, std::shared_ptr<StreamerBase>>;
using StreamerVariant = std::variant<std::function<void(std::string)>, std::shared_ptr<StreamerBase>, std::monostate>;
using OptionalGenerationConfig = std::optional<GenerationConfig>;
using OptionalStreamerVariant = std::optional<StreamerVariant>;
using EncodedInputs = std::variant<ov::Tensor, std::pair<ov::Tensor, ov::Tensor>, TokenizedInputs>;
using StringInputs = std::variant<std::string, std::vector<std::string>>;

/**
* @brief Structure to store resulting batched tokens and scores for each batch sequence
@@ -43,6 +44,13 @@ class DecodedResults {

// @brief Convert DecodedResults to a single string.
// @return std::string containing the text from the DecodedResults object.
operator std::string() const {
OPENVINO_ASSERT(texts.size() == 1, "DecodedResults can be converted to a string only if it contains a single prompt");
return texts.at(0);
}

// @brief Convert DecodedResults to a vector of strings.
// @return A std::vector<std::string> containing the texts from the DecodedResults object.
operator std::vector<std::string>() const {
return texts;
}
@@ -71,11 +79,27 @@ class OPENVINO_GENAI_EXPORTS LLMPipeline {
* @param model_path Path to the dir model xml/bin files, tokenizers and generation_configs.json
* @param device optional device
* @param plugin_config optional plugin_config
* @param ov_tokenizers_path optional path to an extension to add. If empty, openvino_tokenizers is added from the openvino_genai library folder.
*/
LLMPipeline(const std::string& path, const std::string& device="CPU",
const ov::AnyMap& plugin_config={},
const std::string& ov_tokenizers_path="");
LLMPipeline(
const std::string& path,
const std::string& device="CPU",
const ov::AnyMap& plugin_config={}
);

/**
* @brief Constructs an LLMPipeline from an already existing InferRequest and Tokenizer
*
* @param request infer request of the model
* @param tokenizer initialized Tokenizer
* @param generation_config optional generation_config; by default it will be initialized for greedy decoding
* @param device optional device
* @param plugin_config optional plugin_config
*/
LLMPipeline(
const ov::InferRequest& request,
const ov::genai::Tokenizer& tokenizer,
OptionalGenerationConfig generation_config=std::nullopt
);

/**
* @brief Constructs an LLMPipeline when ov::Tokenizer is initialized manually using files from different directories.
@@ -95,76 +119,84 @@ class OPENVINO_GENAI_EXPORTS LLMPipeline {
~LLMPipeline();

/**
* @brief High level generate for the input with a single prompt which encodes inputs and returns decoded output
* @brief High level generate that receives prompts as a string or a vector of strings and returns decoded output.
*
* @param text input prompt
* @param inputs input prompt or a vector of prompts
* @param generation_config optional GenerationConfig
* @param streamer optional streamer
* @return std::string decoded resulting text
* @return DecodedResults decoded resulting text
*/
DecodedResults generate(
StringInputs inputs,
OptionalGenerationConfig generation_config=std::nullopt,
StreamerVariant streamer=std::monostate()
);

/**
* @brief High level generate that receives prompts as a string or a vector of strings and returns decoded output.
* Properties can be passed in any order, e.g. pipe.generate(..., ov::genai::max_new_tokens(100), ov::genai::streamer(lambda_func)).
*
* @param inputs input prompt or a vector of prompts
* @param properties generation parameters passed as ov properties
* @return DecodedResults decoded resulting text
*/
std::string generate(std::string text, OptionalGenerationConfig generation_config=std::nullopt, OptionalStreamerVariant streamer=std::nullopt);

template <typename... Properties>
util::EnableIfAllStringAny<std::string, Properties...> generate(
std::string text,
util::EnableIfAllStringAny<DecodedResults, Properties...> generate(
StringInputs inputs,
Properties&&... properties) {
return generate(text, AnyMap{std::forward<Properties>(properties)...});
return generate(inputs, AnyMap{std::forward<Properties>(properties)...});
}
DecodedResults generate(StringInputs inputs, const ov::AnyMap& config_map);


DecodedResults operator()(
StringInputs inputs,
OptionalGenerationConfig generation_config=std::nullopt,
StreamerVariant streamer=std::monostate()
) {
return generate(inputs, generation_config, streamer);
}
std::string generate(std::string text, const ov::AnyMap& config);

template <typename... Properties>
util::EnableIfAllStringAny<EncodedResults, Properties...> generate(
ov::Tensor input_ids,
Properties&&... properties) {
return generate(input_ids, AnyMap{std::forward<Properties>(properties)...});
util::EnableIfAllStringAny<DecodedResults, Properties...> operator()(
StringInputs inputs,
Properties&&... properties) {
return generate(inputs, AnyMap{std::forward<Properties>(properties)...});
}
EncodedResults generate(ov::Tensor input_ids, const ov::AnyMap& config);

/**
* @brief High level generate for batched prompts which encodes inputs and returns decoded outputs.
* @brief Low level generate to be called with already encoded input_ids tokens.
* Streamer cannot be used for multibatch inputs.
*
* @param text input prompt
* @param inputs input_ids tensor or a pair of (input_ids, attention_mask) with encoded input prompt tokens
* @param generation_config optional GenerationConfig
* @return DecodedResults a structure with resulting texts & scores
* @param streamer optional streamer
* @return EncodedResults a structure with resulting tokens and scores
* @throws Exception if the streamer is set for input_ids with multiple batches
*/
DecodedResults generate(const std::vector<std::string>& texts, OptionalGenerationConfig generation_config);
EncodedResults generate(
const EncodedInputs& inputs,
OptionalGenerationConfig generation_config=std::nullopt,
StreamerVariant streamer=std::monostate()
);

/**
* @brief Low level generate to be called with already encoded input_ids tokens.
* Streamer cannot be used for multibatch inputs.
*
* @param input_ids encoded input prompt tokens
* @param attention_mask optional attention_mask
* @param generation_config optional GenerationConfig
* @param streamer optional streamer
* @param inputs input_ids tensor or a pair of (input_ids, attention_mask) with encoded input prompt tokens
* @param properties generation config params passed as properties
* @return EncodedResults a structure with resulting tokens and scores
* @throws Exception if the streamer is set for input_ids with multiple batches
*/
EncodedResults generate(ov::Tensor input_ids,
std::optional<ov::Tensor> attention_mask,
OptionalGenerationConfig generation_config=std::nullopt,
OptionalStreamerVariant streamer=std::nullopt);

template <typename InputsType, typename... Properties>
util::EnableIfAllStringAny<std::string, Properties...> operator()(
InputsType text,
Properties&&... properties) {
return generate(text, AnyMap{std::forward<Properties>(properties)...});
}

DecodedResults operator()(const std::vector<std::string>& text, OptionalGenerationConfig generation_config=std::nullopt) {
return generate(text, generation_config);
}

std::string operator()(
std::string text,
OptionalGenerationConfig generation_config=std::nullopt,
OptionalStreamerVariant streamer=std::nullopt
) {
return generate(text, generation_config, streamer);
template <typename... Properties>
util::EnableIfAllStringAny<EncodedResults, Properties...> generate(
const EncodedInputs& inputs,
Properties&&... properties) {
return generate(inputs, AnyMap{std::forward<Properties>(properties)...});
}

EncodedResults generate(const EncodedInputs& inputs, const ov::AnyMap& config_map);

ov::genai::Tokenizer get_tokenizer();
GenerationConfig get_generation_config() const;
void set_generation_config(const GenerationConfig& generation_config);
Expand All @@ -178,40 +210,8 @@ class OPENVINO_GENAI_EXPORTS LLMPipeline {
std::unique_ptr<LLMPipelineImpl> m_pimpl;
};

/*
* utils that allow to use generate and operator() in the following way:
* pipe.generate(input_ids, ov::max_new_tokens(200), ov::temperature(1.0f),...)
* pipe(text, ov::max_new_tokens(200), ov::temperature(1.0f),...)
*/
static constexpr ov::Property<size_t> max_new_tokens{"max_new_tokens"};
static constexpr ov::Property<size_t> max_length{"max_length"};
static constexpr ov::Property<bool> ignore_eos{"ignore_eos"};

static constexpr ov::Property<size_t> num_beam_groups{"num_beam_groups"};
static constexpr ov::Property<size_t> num_beams{"num_beams"};
static constexpr ov::Property<float> diversity_penalty{"diversity_penalty"};
static constexpr ov::Property<float> length_penalty{"length_penalty"};
static constexpr ov::Property<size_t> num_return_sequences{"num_return_sequences"};
static constexpr ov::Property<size_t> no_repeat_ngram_size{"no_repeat_ngram_size"};
static constexpr ov::Property<StopCriteria> stop_criteria{"stop_criteria"};

static constexpr ov::Property<float> temperature{"temperature"};
static constexpr ov::Property<float> top_p{"top_p"};
static constexpr ov::Property<int> top_k{"top_k"};
static constexpr ov::Property<bool> do_sample{"do_sample"};
static constexpr ov::Property<float> repetition_penalty{"repetition_penalty"};


static constexpr ov::Property<int64_t> pad_token_id{"pad_token_id"};
static constexpr ov::Property<int64_t> bos_token_id{"bos_token_id"};
static constexpr ov::Property<int64_t> eos_token_id{"eos_token_id"};

static constexpr ov::Property<std::string> bos_token{"bos_token"};
static constexpr ov::Property<std::string> eos_token{"eos_token"};

// only a lambda streamer can be set via ov::streamer(),... syntactic sugar,
// because std::variant<StreamerBase, std::function<>> cannot be stored in AnyMap
static constexpr ov::Property<std::function<void (std::string)>> streamer{"streamer"};
std::pair<std::string, Any> streamer(StreamerVariant func);
std::pair<std::string, Any> generation_config(const GenerationConfig& config);

} // namespace genai
} // namespace ov
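
A usage sketch of the reworked LLMPipeline API declared above (not part of this commit; "models/TinyLlama" is a placeholder model directory):

#include "openvino/genai/llm_pipeline.hpp"
#include <iostream>

int main() {
    // Directory with the model xml/bin files, tokenizers and generation_config.json.
    ov::genai::LLMPipeline pipe("models/TinyLlama", "CPU");

    // Single prompt with the default generation config; a single-prompt
    // DecodedResults converts to std::string via the new conversion operator.
    std::string reply = pipe.generate("What is OpenVINO?");
    std::cout << reply << '\n';

    // Properties-style call: properties can be passed in any order, and the
    // streamer() helper wraps a lambda into the StreamerVariant.
    pipe("Tell me a story",
         ov::genai::max_new_tokens(100),
         ov::genai::streamer([](std::string word) { std::cout << word << std::flush; }));
    return 0;
}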
5 changes: 1 addition & 4 deletions src/cpp/include/openvino/genai/streamer_base.hpp
@@ -15,16 +15,13 @@ namespace genai {
*/
class StreamerBase {
public:
Tokenizer m_tokenizer;
explicit StreamerBase(Tokenizer tokenizer): m_tokenizer(tokenizer) {}
StreamerBase() = default;

/// @brief put is called every time a new token is decoded
virtual void put(int64_t token) = 0;

/// @brief end is called at the end of generation. It can be used to flush the cache if your streamer has one
virtual void end() = 0;
};


} // namespace genai
} // namespace ov
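
A minimal custom streamer against the trimmed-down StreamerBase interface (a sketch, not part of this commit). Since StreamerBase no longer owns a Tokenizer, decoding the token ids is up to the derived class; this one only prints them:

#include "openvino/genai/streamer_base.hpp"
#include <iostream>
#include <memory>

class IdPrinterStreamer : public ov::genai::StreamerBase {
public:
    void put(int64_t token) override {
        std::cout << token << ' ';  // called for every newly generated token
    }
    void end() override {
        std::cout << std::endl;     // called once when generation finishes
    }
};

int main() {
    // A std::shared_ptr<StreamerBase> is one of the StreamerVariant alternatives,
    // so this object could be passed directly to LLMPipeline::generate();
    // here it is exercised in isolation.
    auto streamer = std::make_shared<IdPrinterStreamer>();
    streamer->put(42);
    streamer->end();
    return 0;
}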
15 changes: 10 additions & 5 deletions src/cpp/include/openvino/genai/tokenizer.hpp
@@ -12,6 +12,11 @@
namespace ov {
namespace genai {

struct TokenizedInputs {
ov::Tensor input_ids;
ov::Tensor attention_mask;
};

/**
* @brief class is used to encode prompts and decode resulting tokens
*/
@@ -22,22 +27,22 @@ class OPENVINO_GENAI_EXPORTS Tokenizer {
* @param tokenizer_path openvino_tokenizer.xml and openvino_detokenizer.xml should be located in the tokenizer_path
* @param device device. Currently only 'CPU' is supported
*/
Tokenizer(const std::string& tokenizers_path, const std::string& device="CPU", const std::string& ov_tokenizers_path="");
Tokenizer(const std::string& tokenizers_path, const std::string& device="CPU");

/**
* @brief encode a single prompt
* @return pair of [input_ids, attention_mask]
*/
std::pair<ov::Tensor, ov::Tensor> encode(const std::string prompt);
TokenizedInputs encode(const std::string prompt);

/**
* @brief encode batch of prompts. Left padding will be applied by default
* @param prompts vector storing batch of prompts
* @return pair of [input_ids, attention_mask]
*/
std::pair<ov::Tensor, ov::Tensor> encode(std::vector<std::string>& prompts);
std::pair<ov::Tensor, ov::Tensor> encode(std::vector<std::string>&& prompts);
std::pair<ov::Tensor, ov::Tensor> encode(std::initializer_list<std::string>& prompts);
TokenizedInputs encode(std::vector<std::string>& prompts);
TokenizedInputs encode(std::vector<std::string>&& prompts);
TokenizedInputs encode(std::initializer_list<std::string>& prompts);

/**
* @brief decode sequence of tokens
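
A sketch of the reworked encode() API returning the new TokenizedInputs struct (not part of this commit; the tokenizer directory is a placeholder and decode() is not exercised because its overloads are cut off in this diff):

#include "openvino/genai/tokenizer.hpp"
#include <iostream>
#include <string>
#include <vector>

int main() {
    // Directory containing openvino_tokenizer.xml and openvino_detokenizer.xml.
    ov::genai::Tokenizer tokenizer("models/TinyLlama");

    // Single prompt: the result is a TokenizedInputs struct instead of a std::pair.
    ov::genai::TokenizedInputs single = tokenizer.encode(std::string{"Hello world"});
    std::cout << "input_ids shape: " << single.input_ids.get_shape() << '\n';

    // Batch of prompts: left padding is applied by default.
    std::vector<std::string> prompts{"Hello", "A much longer prompt"};
    ov::genai::TokenizedInputs batch = tokenizer.encode(prompts);
    std::cout << "attention_mask shape: " << batch.attention_mask.get_shape() << '\n';
    return 0;
}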
45 changes: 21 additions & 24 deletions src/cpp/src/generation_config.cpp
@@ -55,32 +55,29 @@ GenerationConfig::GenerationConfig(std::string json_path) {

}

GenerationConfig GenerationConfig::anymap_to_generation_config(const ov::AnyMap& config_map) {
void GenerationConfig::update_generation_config(const ov::AnyMap& config_map) {
using ov::genai::utils::read_anymap_param;

GenerationConfig config;
read_anymap_param(config_map, "max_new_tokens", config.max_new_tokens);
read_anymap_param(config_map, "max_length", config.max_length);
read_anymap_param(config_map, "ignore_eos", config.ignore_eos);
read_anymap_param(config_map, "num_beam_groups", config.num_beam_groups);
read_anymap_param(config_map, "num_beams", config.num_beams);
read_anymap_param(config_map, "diversity_penalty", config.diversity_penalty);
read_anymap_param(config_map, "length_penalty", config.length_penalty);
read_anymap_param(config_map, "num_return_sequences", config.num_return_sequences);
read_anymap_param(config_map, "no_repeat_ngram_size", config.no_repeat_ngram_size);
read_anymap_param(config_map, "stop_criteria", config.stop_criteria);
read_anymap_param(config_map, "temperature", config.temperature);
read_anymap_param(config_map, "top_p", config.top_p);
read_anymap_param(config_map, "top_k", config.top_k);
read_anymap_param(config_map, "do_sample", config.do_sample);
read_anymap_param(config_map, "repetition_penalty", config.repetition_penalty);
read_anymap_param(config_map, "pad_token_id", config.pad_token_id);
read_anymap_param(config_map, "bos_token_id", config.bos_token_id);
read_anymap_param(config_map, "eos_token_id", config.eos_token_id);
read_anymap_param(config_map, "bos_token", config.bos_token);
read_anymap_param(config_map, "eos_token", config.eos_token);

return config;
read_anymap_param(config_map, "max_new_tokens", max_new_tokens);
read_anymap_param(config_map, "max_length", max_length);
read_anymap_param(config_map, "ignore_eos", ignore_eos);
read_anymap_param(config_map, "num_beam_groups", num_beam_groups);
read_anymap_param(config_map, "num_beams", num_beams);
read_anymap_param(config_map, "diversity_penalty", diversity_penalty);
read_anymap_param(config_map, "length_penalty", length_penalty);
read_anymap_param(config_map, "num_return_sequences", num_return_sequences);
read_anymap_param(config_map, "no_repeat_ngram_size", no_repeat_ngram_size);
read_anymap_param(config_map, "stop_criteria", stop_criteria);
read_anymap_param(config_map, "temperature", temperature);
read_anymap_param(config_map, "top_p", top_p);
read_anymap_param(config_map, "top_k", top_k);
read_anymap_param(config_map, "do_sample", do_sample);
read_anymap_param(config_map, "repetition_penalty", repetition_penalty);
read_anymap_param(config_map, "pad_token_id", pad_token_id);
read_anymap_param(config_map, "bos_token_id", bos_token_id);
read_anymap_param(config_map, "eos_token_id", eos_token_id);
read_anymap_param(config_map, "bos_token", bos_token);
read_anymap_param(config_map, "eos_token", eos_token);
}

size_t GenerationConfig::get_max_new_tokens(size_t prompt_length) const {
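
The switch from the static anymap_to_generation_config() factory to the update_generation_config() member changes the semantics from "build a fresh config" to "patch an existing one". A short sketch of the difference (illustrative values only; assumes GenerationConfig is default-constructible and its fields are public, as the member accesses above suggest):

#include "openvino/genai/generation_config.hpp"

int main() {
    ov::genai::GenerationConfig config;   // e.g. previously loaded from generation_config.json
    config.max_new_tokens = 64;

    // Only the keys present in the map are overwritten; max_new_tokens stays 64.
    ov::AnyMap overrides{{"temperature", 0.7f}, {"do_sample", true}};
    config.update_generation_config(overrides);
    return 0;
}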
Empty file.
2 changes: 1 addition & 1 deletion src/cpp/src/greedy_decoding.cpp
@@ -92,7 +92,7 @@ EncodedResults greedy_decoding(
for (size_t i = 0; i < max_tokens - 1; ++i) {
utils::update_position_ids(m_model_runner.get_tensor("position_ids"), m_model_runner.get_tensor("attention_mask"));
m_model_runner.set_tensor("attention_mask", utils::extend_attention(m_model_runner.get_tensor("attention_mask")));

// todo: consider replacing with start_async and running the callback right after it
m_model_runner.infer();
auto logits = m_model_runner.get_tensor("logits");
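
The TODO added above hints at overlapping token streaming with inference. A sketch of what the start_async() pattern could look like using the standard ov::InferRequest API (not part of this commit; `request` stands in for m_model_runner from the loop above):

#include "openvino/runtime/infer_request.hpp"
#include "openvino/runtime/tensor.hpp"

void decode_step(ov::InferRequest& request) {
    request.start_async();   // kick off inference without blocking the host thread
    // ...a streamer callback could run here, overlapping with the device work...
    request.wait();          // block until the results are ready
    ov::Tensor logits = request.get_tensor("logits");
    (void)logits;            // next-token sampling would consume the logits here
}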