diff --git a/src/cpp/include/openvino/genai/generation_config.hpp b/src/cpp/include/openvino/genai/generation_config.hpp index 0da478a397..82a450b619 100644 --- a/src/cpp/include/openvino/genai/generation_config.hpp +++ b/src/cpp/include/openvino/genai/generation_config.hpp @@ -100,8 +100,39 @@ class OPENVINO_GENAI_EXPORTS GenerationConfig { bool is_greedy_decoding() const; bool is_beam_search() const; bool is_multinomial() const; - static GenerationConfig anymap_to_generation_config(const ov::AnyMap& config_map = {}); + void update_generation_config(const ov::AnyMap& config_map = {}); }; +/* + * utils that allow to use generate and operator() in the following way: + * pipe.generate(input_ids, ov::max_new_tokens(200), ov::temperature(1.0f),...) + * pipe(text, ov::max_new_tokens(200), ov::temperature(1.0f),...) +*/ +static constexpr ov::Property max_new_tokens{"max_new_tokens"}; +static constexpr ov::Property max_length{"max_length"}; +static constexpr ov::Property ignore_eos{"ignore_eos"}; + +static constexpr ov::Property num_beam_groups{"num_beam_groups"}; +static constexpr ov::Property num_beams{"num_beams"}; +static constexpr ov::Property diversity_penalty{"diversity_penalty"}; +static constexpr ov::Property length_penalty{"length_penalty"}; +static constexpr ov::Property num_return_sequences{"num_return_sequences"}; +static constexpr ov::Property no_repeat_ngram_size{"no_repeat_ngram_size"}; +static constexpr ov::Property stop_criteria{"stop_criteria"}; + +static constexpr ov::Property temperature{"temperature"}; +static constexpr ov::Property top_p{"top_p"}; +static constexpr ov::Property top_k{"top_k"}; +static constexpr ov::Property do_sample{"do_sample"}; +static constexpr ov::Property repetition_penalty{"repetition_penalty"}; + + +static constexpr ov::Property pad_token_id{"pad_token_id"}; +static constexpr ov::Property bos_token_id{"bos_token_id"}; +static constexpr ov::Property eos_token_id{"eos_token_id"}; + +static constexpr ov::Property bos_token{"bos_token"}; +static constexpr ov::Property eos_token{"eos_token"}; + } // namespace genai } // namespace ov diff --git a/src/cpp/include/openvino/genai/llm_pipeline.hpp b/src/cpp/include/openvino/genai/llm_pipeline.hpp index d16ec0dc8b..c8c6ea9231 100644 --- a/src/cpp/include/openvino/genai/llm_pipeline.hpp +++ b/src/cpp/include/openvino/genai/llm_pipeline.hpp @@ -14,9 +14,10 @@ namespace ov { namespace genai { -using StreamerVariant = std::variant, std::shared_ptr>; +using StreamerVariant = std::variant, std::shared_ptr, std::monostate>; using OptionalGenerationConfig = std::optional; -using OptionalStreamerVariant = std::optional; +using EncodedInputs = std::variant, TokenizedInputs>; +using StringInputs = std::variant>; /** * @brief Structure to store resulting batched tokens and scores for each batch sequence @@ -43,6 +44,13 @@ class DecodedResults { // @brief Convert DecodedResults to a vector of strings. // @return A std::vector containing the texts from the DecodedResults object. + operator std::string() const { + OPENVINO_ASSERT(texts.size() == 1, "DecodedResults can be converted to string only if contains a single prompt"); + return texts.at(0); + } + + // @brief Convert DecodedResults to a single string. + // @return std::string containing the texts from the DecodedResults object. 
operator std::vector() const { return texts; } @@ -71,11 +79,27 @@ class OPENVINO_GENAI_EXPORTS LLMPipeline { * @param model_path Path to the dir model xml/bin files, tokenizers and generation_configs.json * @param device optional device * @param plugin_config optional plugin_config - * @param ov_tokenizers_path optional path to an extension to add. Empty adds openvino_tokenizers from openvini_genai library folder. */ - LLMPipeline(const std::string& path, const std::string& device="CPU", - const ov::AnyMap& plugin_config={}, - const std::string& ov_tokenizers_path=""); + LLMPipeline( + const std::string& path, + const std::string& device="CPU", + const ov::AnyMap& plugin_config={} + ); + + /** + * @brief Constructs an LLMPipeline from already existing infer InferRequest and Tokenizer + * + * @param request infer request of the model + * @param tokenizer initialized Tokenizer + * @param generation_config optional generation_config, be default will be initialized for greedy decoding + * @param device optional device + * @param plugin_config optional plugin_config + */ + LLMPipeline( + const ov::InferRequest& request, + const ov::genai::Tokenizer& tokenizer, + OptionalGenerationConfig generation_config=std::nullopt + ); /** * @brief Constructs a LLMPipeline when ov::Tokenizer is initialized manually using file from the different dirs. @@ -95,76 +119,84 @@ class OPENVINO_GENAI_EXPORTS LLMPipeline { ~LLMPipeline(); /** - * @brief High level generate for the input with a single prompt which encodes inputs and returns decoded output + * @brief High level generate that receives prompts as a string or a vector of strings and returns decoded output. * - * @param text input prompt + * @param inputs input prompt or a vector of prompts * @param generation_config optional GenerationConfig * @param streamer optional streamer - * @return std::string decoded resulting text + * @return DecodedResults decoded resulting text + */ + DecodedResults generate( + StringInputs inputs, + OptionalGenerationConfig generation_config=std::nullopt, + StreamerVariant streamer=std::monostate() + ); + + /** + * @brief High level generate that receives prompts as a string or a vector of strings and returns decoded output. + * properties can be in any order pipe.generate(..., ov::genai::max_new_tokens(100), ov::genai::streamer(lambda_func)). + * + * @param inputs input prompt or a vector of prompts + * @param properties properties + * @return DecodedResults decoded resulting text */ - std::string generate(std::string text, OptionalGenerationConfig generation_config=std::nullopt, OptionalStreamerVariant streamer=std::nullopt); - template - util::EnableIfAllStringAny generate( - std::string text, + util::EnableIfAllStringAny generate( + StringInputs inputs, Properties&&... properties) { - return generate(text, AnyMap{std::forward(properties)...}); + return generate(inputs, AnyMap{std::forward(properties)...}); + } + DecodedResults generate(StringInputs inputs, const ov::AnyMap& config_map); + + + DecodedResults operator()( + StringInputs inputs, + OptionalGenerationConfig generation_config=std::nullopt, + StreamerVariant streamer=std::monostate() + ) { + return generate(inputs, generation_config, streamer); } - std::string generate(std::string text, const ov::AnyMap& config); template - util::EnableIfAllStringAny generate( - ov::Tensor input_ids, - Properties&&... properties) { - return generate(input_ids, AnyMap{std::forward(properties)...}); + util::EnableIfAllStringAny operator()( + StringInputs inputs, + Properties&&... 
properties) { + return generate(inputs, AnyMap{std::forward(properties)...}); } - EncodedResults generate(ov::Tensor input_ids, const ov::AnyMap& config); /** - * @brief High level generate for batched prompts which encodes inputs and returns decoded outputs. + * @brief Low level generate to be called with already encoded input_ids tokens. * Streamer cannot be used for multibatch inputs. * - * @param text input prompt + * @param input_ids or pair of (input_ids, attentino_mask) encoded input prompt tokens * @param generation_config optional GenerationConfig - * @return DecodedResults a structure with resulting texts & scores + * @param streamer optional streamer + * @return EncodedResults a structure with resulting tokens and scores + * @throws Exception if the stremaer is set for inputs_ids with multiple batches */ - DecodedResults generate(const std::vector& texts, OptionalGenerationConfig generation_config); + EncodedResults generate( + const EncodedInputs& inputs, + OptionalGenerationConfig generation_config=std::nullopt, + StreamerVariant streamer=std::monostate() + ); /** * @brief Low level generate to be called with already encoded input_ids tokens. * Streamer cannot be used for multibatch inputs. * - * @param input_ids encoded input prompt tokens - * @param attention_mask optional attention_mask - * @param generation_config optional GenerationConfig - * @param streamer optional streamer + * @param input_ids or pair of (input_ids, attentino_mask) encoded input prompt tokens + * @param generation config params * @return EncodedResults a structure with resulting tokens and scores * @throws Exception if the stremaer is set for inputs_ids with multiple batches */ - EncodedResults generate(ov::Tensor input_ids, - std::optional attention_mask, - OptionalGenerationConfig generation_config=std::nullopt, - OptionalStreamerVariant streamer=std::nullopt); - - template - util::EnableIfAllStringAny operator()( - InputsType text, - Properties&&... properties) { - return generate(text, AnyMap{std::forward(properties)...}); - } - - DecodedResults operator()(const std::vector& text, OptionalGenerationConfig generation_config=std::nullopt) { - return generate(text, generation_config); - } - - std::string operator()( - std::string text, - OptionalGenerationConfig generation_config=std::nullopt, - OptionalStreamerVariant streamer=std::nullopt - ) { - return generate(text, generation_config, streamer); + template + util::EnableIfAllStringAny generate( + const EncodedInputs& inputs, + Properties&&... properties) { + return generate(inputs, AnyMap{std::forward(properties)...}); } - + EncodedResults generate(const EncodedInputs& inputs, const ov::AnyMap& config_map); + ov::genai::Tokenizer get_tokenizer(); GenerationConfig get_generation_config() const; void set_generation_config(const GenerationConfig& generation_config); @@ -178,40 +210,8 @@ class OPENVINO_GENAI_EXPORTS LLMPipeline { std::unique_ptr m_pimpl; }; -/* - * utils that allow to use generate and operator() in the following way: - * pipe.generate(input_ids, ov::max_new_tokens(200), ov::temperature(1.0f),...) - * pipe(text, ov::max_new_tokens(200), ov::temperature(1.0f),...) 
-*/ -static constexpr ov::Property max_new_tokens{"max_new_tokens"}; -static constexpr ov::Property max_length{"max_length"}; -static constexpr ov::Property ignore_eos{"ignore_eos"}; - -static constexpr ov::Property num_beam_groups{"num_beam_groups"}; -static constexpr ov::Property num_beams{"num_beams"}; -static constexpr ov::Property diversity_penalty{"diversity_penalty"}; -static constexpr ov::Property length_penalty{"length_penalty"}; -static constexpr ov::Property num_return_sequences{"num_return_sequences"}; -static constexpr ov::Property no_repeat_ngram_size{"no_repeat_ngram_size"}; -static constexpr ov::Property stop_criteria{"stop_criteria"}; - -static constexpr ov::Property temperature{"temperature"}; -static constexpr ov::Property top_p{"top_p"}; -static constexpr ov::Property top_k{"top_k"}; -static constexpr ov::Property do_sample{"do_sample"}; -static constexpr ov::Property repetition_penalty{"repetition_penalty"}; - - -static constexpr ov::Property pad_token_id{"pad_token_id"}; -static constexpr ov::Property bos_token_id{"bos_token_id"}; -static constexpr ov::Property eos_token_id{"eos_token_id"}; - -static constexpr ov::Property bos_token{"bos_token"}; -static constexpr ov::Property eos_token{"eos_token"}; - -// only lambda streamer can be set via ov::streamer(),... syntaxic sugar, -// because std::variant> can not be stored in AnyMap -static constexpr ov::Property> streamer{"streamer"}; +std::pair streamer(StreamerVariant func); +std::pair generation_config(const GenerationConfig& config); } // namespace genai } // namespace ov diff --git a/src/cpp/include/openvino/genai/streamer_base.hpp b/src/cpp/include/openvino/genai/streamer_base.hpp index 7731b51c1c..ba6287c66a 100644 --- a/src/cpp/include/openvino/genai/streamer_base.hpp +++ b/src/cpp/include/openvino/genai/streamer_base.hpp @@ -15,10 +15,6 @@ namespace genai { */ class StreamerBase { public: - Tokenizer m_tokenizer; - explicit StreamerBase(Tokenizer tokenizer): m_tokenizer(tokenizer) {} - StreamerBase() = default; - /// @brief put is called every time new token is decoded virtual void put(int64_t token) = 0; @@ -26,5 +22,6 @@ class StreamerBase { virtual void end() = 0; }; + } // namespace genai } // namespace ov diff --git a/src/cpp/include/openvino/genai/tokenizer.hpp b/src/cpp/include/openvino/genai/tokenizer.hpp index e0214fcfbb..5dcc1a2670 100644 --- a/src/cpp/include/openvino/genai/tokenizer.hpp +++ b/src/cpp/include/openvino/genai/tokenizer.hpp @@ -12,6 +12,11 @@ namespace ov { namespace genai { +struct TokenizedInputs { + ov::Tensor input_ids; + ov::Tensor attention_mask; +}; + /** * @brief class is used to encode prompts and decode resulting tokens */ @@ -22,22 +27,22 @@ class OPENVINO_GENAI_EXPORTS Tokenizer { * @param tokenizer_path openvino_tokenizer.xml and openvino_detokenizer.xml should be located in the tokenizer_path * @param device device. Currently only 'CPU' is supported */ - Tokenizer(const std::string& tokenizers_path, const std::string& device="CPU", const std::string& ov_tokenizers_path=""); + Tokenizer(const std::string& tokenizers_path, const std::string& device="CPU"); /** * @brief encode a single prompt * @return pair of [input_ids, attention_mask] */ - std::pair encode(const std::string prompt); + TokenizedInputs encode(const std::string prompt); /** * @brief encode batch of prompts. 
Left padding will be applied by default * @param prompts vector storing batch of prompts * @return pair of [input_ids, attention_mask] */ - std::pair encode(std::vector& prompts); - std::pair encode(std::vector&& prompts); - std::pair encode(std::initializer_list& prompts); + TokenizedInputs encode(std::vector& prompts); + TokenizedInputs encode(std::vector&& prompts); + TokenizedInputs encode(std::initializer_list& prompts); /** * @brief decode sequence of tokens diff --git a/src/cpp/src/generation_config.cpp b/src/cpp/src/generation_config.cpp index e2e95262de..5569a759b0 100644 --- a/src/cpp/src/generation_config.cpp +++ b/src/cpp/src/generation_config.cpp @@ -55,32 +55,29 @@ GenerationConfig::GenerationConfig(std::string json_path) { } -GenerationConfig GenerationConfig::anymap_to_generation_config(const ov::AnyMap& config_map) { +void GenerationConfig::update_generation_config(const ov::AnyMap& config_map) { using ov::genai::utils::read_anymap_param; - GenerationConfig config; - read_anymap_param(config_map, "max_new_tokens", config.max_new_tokens); - read_anymap_param(config_map, "max_length", config.max_length); - read_anymap_param(config_map, "ignore_eos", config.ignore_eos); - read_anymap_param(config_map, "num_beam_groups", config.num_beam_groups); - read_anymap_param(config_map, "num_beams", config.num_beams); - read_anymap_param(config_map, "diversity_penalty", config.diversity_penalty); - read_anymap_param(config_map, "length_penalty", config.length_penalty); - read_anymap_param(config_map, "num_return_sequences", config.num_return_sequences); - read_anymap_param(config_map, "no_repeat_ngram_size", config.no_repeat_ngram_size); - read_anymap_param(config_map, "stop_criteria", config.stop_criteria); - read_anymap_param(config_map, "temperature", config.temperature); - read_anymap_param(config_map, "top_p", config.top_p); - read_anymap_param(config_map, "top_k", config.top_k); - read_anymap_param(config_map, "do_sample", config.do_sample); - read_anymap_param(config_map, "repetition_penalty", config.repetition_penalty); - read_anymap_param(config_map, "pad_token_id", config.pad_token_id); - read_anymap_param(config_map, "bos_token_id", config.bos_token_id); - read_anymap_param(config_map, "eos_token_id", config.eos_token_id); - read_anymap_param(config_map, "bos_token", config.bos_token); - read_anymap_param(config_map, "eos_token", config.eos_token); - - return config; + read_anymap_param(config_map, "max_new_tokens", max_new_tokens); + read_anymap_param(config_map, "max_length", max_length); + read_anymap_param(config_map, "ignore_eos", ignore_eos); + read_anymap_param(config_map, "num_beam_groups", num_beam_groups); + read_anymap_param(config_map, "num_beams", num_beams); + read_anymap_param(config_map, "diversity_penalty", diversity_penalty); + read_anymap_param(config_map, "length_penalty", length_penalty); + read_anymap_param(config_map, "num_return_sequences", num_return_sequences); + read_anymap_param(config_map, "no_repeat_ngram_size", no_repeat_ngram_size); + read_anymap_param(config_map, "stop_criteria", stop_criteria); + read_anymap_param(config_map, "temperature", temperature); + read_anymap_param(config_map, "top_p", top_p); + read_anymap_param(config_map, "top_k", top_k); + read_anymap_param(config_map, "do_sample", do_sample); + read_anymap_param(config_map, "repetition_penalty", repetition_penalty); + read_anymap_param(config_map, "pad_token_id", pad_token_id); + read_anymap_param(config_map, "bos_token_id", bos_token_id); + read_anymap_param(config_map, 
"eos_token_id", eos_token_id); + read_anymap_param(config_map, "bos_token", bos_token); + read_anymap_param(config_map, "eos_token", eos_token); } size_t GenerationConfig::get_max_new_tokens(size_t prompt_length) const { diff --git a/src/cpp/src/generation_config_helper.hpp b/src/cpp/src/generation_config_helper.hpp deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/src/cpp/src/greedy_decoding.cpp b/src/cpp/src/greedy_decoding.cpp index 51e8023b42..3cc5efd26e 100644 --- a/src/cpp/src/greedy_decoding.cpp +++ b/src/cpp/src/greedy_decoding.cpp @@ -92,7 +92,7 @@ EncodedResults greedy_decoding( for (size_t i = 0; i < max_tokens - 1; ++i) { utils::update_position_ids(m_model_runner.get_tensor("position_ids"), m_model_runner.get_tensor("attention_mask")); m_model_runner.set_tensor("attention_mask", utils::extend_attention(m_model_runner.get_tensor("attention_mask"))); - + // todo: consider replacing with start_async and run callback right after that m_model_runner.infer(); auto logits = m_model_runner.get_tensor("logits"); diff --git a/src/cpp/src/llm_pipeline.cpp b/src/cpp/src/llm_pipeline.cpp index 3f4b9f3f89..3a824a7ec5 100644 --- a/src/cpp/src/llm_pipeline.cpp +++ b/src/cpp/src/llm_pipeline.cpp @@ -44,6 +44,9 @@ std::string get_absolute_file_path(const std::string& path) { namespace { +const std::string STREAMER_ARG_NAME = "streamer"; +const std::string CONFIG_ARG_NAME = "generation_config"; + ov::genai::GenerationConfig from_config_json_if_exists(const std::string& path) { constexpr char generation_config_fname[] = "generation_config.json"; constexpr char config_fname[] = "config.json"; @@ -83,16 +86,7 @@ std::string from_tokenizer_json_if_exists(const std::string& path) { return res; } -std::filesystem::path with_openvino_tokenizers(const std::filesystem::path& path) { -#ifdef _WIN32 - constexpr char tokenizers[] = "openvino_tokenizers.dll"; -#elif __linux__ - constexpr char tokenizers[] = "libopenvino_tokenizers.so"; -#elif __APPLE__ - constexpr char tokenizers[] = "libopenvino_tokenizers.dylib"; -#endif - return path.parent_path() / tokenizers; -} + std::string get_ov_genai_library_path() { #ifdef _WIN32 @@ -116,6 +110,27 @@ std::string get_ov_genai_library_path() { #endif // _WIN32 } +ov::genai::StreamerVariant get_streamer_from_map(const ov::AnyMap& config_map) { + ov::genai::StreamerVariant streamer = std::monostate(); + + if (config_map.count(STREAMER_ARG_NAME)) { + auto any_val = config_map.at(STREAMER_ARG_NAME); + if (any_val.is>()) { + streamer = any_val.as>(); + } else if (any_val.is>()) { + streamer = any_val.as>(); + } + } + return streamer; +} + +ov::genai::OptionalGenerationConfig get_config_from_map(const ov::AnyMap& config_map) { + if (config_map.count(CONFIG_ARG_NAME)) + return config_map.at(CONFIG_ARG_NAME).as(); + else + return std::nullopt; +} + } namespace ov { @@ -149,6 +164,15 @@ class LLMPipeline::LLMPipelineImpl { std::string m_chat_template = ""; bool is_chat_conversation = false; + LLMPipelineImpl( + const ov::InferRequest& request, + const ov::genai::Tokenizer& tokenizer, + OptionalGenerationConfig generation_config=std::nullopt + ): m_model_runner(request), m_tokenizer(tokenizer) { + GenerationConfig default_config; + m_generation_config = (generation_config.has_value()) ? 
*generation_config : default_config; + } + LLMPipelineImpl( const std::string& model_path, const ov::genai::Tokenizer& tokenizer, @@ -159,24 +183,212 @@ class LLMPipeline::LLMPipelineImpl { LLMPipelineImpl( const std::string& path, const std::string& device, - const ov::AnyMap& config, - const std::string& ov_tokenizers_path="" + const ov::AnyMap& config ); - GenerationConfig generation_config() const; + DecodedResults generate( + StringInputs inputs, + OptionalGenerationConfig generation_config, + StreamerVariant streamer + ) { + GenerationConfig config = (generation_config.has_value()) ? *generation_config : m_generation_config; + + + EncodedInputs encoded_input; + if (auto input_vector = std::get_if>(&inputs)) { + encoded_input = m_tokenizer.encode(*input_vector); + } else if (auto input_str = std::get_if(&inputs)) { + + std::string text = *input_str; + // todo: make for batched inputs as well + if (is_chat_conversation) + text = apply_chat_template(text); + + + // previous prompt generation in chat dialog stops with the end of sentence token, + // need to append this token to the current prompt + auto kv_cache_len = m_model_runner.query_state()[0].get_state().get_shape()[2]; + if (is_chat_conversation && kv_cache_len > 0) + text = config.eos_token + text; + + auto res = m_tokenizer.encode(text); + auto input_ids = res.input_ids; + auto attention_mask = res.attention_mask; + + + // todo: W/A If sentence begins with a specfial tokens (, , etc.) openvino_tokenizer inserts 2 special extra tokens and "▁", + // but HF does not do that. Moreover openvino_tokenizer always inserts but in chat scenario HF does not do that because skip_special_tokens=True. + // Need to remove both of that tokens manually to get exact token by token alignment with HF + auto size = input_ids.get_shape(); + int64_t* inputs_data = input_ids.data(); + std::vector tmp_ids(inputs_data, inputs_data + input_ids.get_size()); // todo: works only for batch 1 + // tmp_ids.erase(tmp_ids.begin()); + + auto attention_mask_data = attention_mask.data(); + std::vector tmp_attn_mask(attention_mask_data, attention_mask_data + attention_mask.get_size()); + // tmp_attn_mask.erase(tmp_attn_mask.begin()); + + std::vector prefixes_to_exclude = {config.eos_token, config.bos_token}; + auto prefix_match = [&text](std::string prefix) { return text.substr(0, prefix.length()) == prefix; }; + if (std::any_of(prefixes_to_exclude.begin(), prefixes_to_exclude.end(), prefix_match)) { + tmp_ids.erase(tmp_ids.begin()); + tmp_attn_mask.erase(tmp_attn_mask.begin()); + } + + input_ids = ov::Tensor(input_ids.get_element_type(), {1, tmp_ids.size()}); + for (size_t i = 0; i < tmp_ids.size(); i++) + input_ids.data()[i] = tmp_ids.data()[i]; + attention_mask = ov::Tensor(attention_mask.get_element_type(), {1, tmp_attn_mask.size()}); + for (size_t i = 0; i < tmp_attn_mask.size(); i++) + attention_mask.data()[i] = tmp_attn_mask.data()[i]; + + encoded_input = TokenizedInputs{input_ids, attention_mask}; + } + auto encoded_results = generate(encoded_input, config, streamer); + return {m_tokenizer.decode(encoded_results.tokens), encoded_results.scores}; + } - std::string generate(std::string text, OptionalGenerationConfig generation_config, OptionalStreamerVariant streamer); - DecodedResults generate(std::vector texts, OptionalGenerationConfig generation_config); - EncodedResults generate(ov::Tensor input_ids, std::optional attention_mask, OptionalGenerationConfig generation_config, OptionalStreamerVariant streamer); + EncodedResults generate( + const EncodedInputs& 
inputs, + OptionalGenerationConfig generation_config, + StreamerVariant streamer + ) { + ov::Tensor input_ids; + ov::Tensor attention_mask; + + // input_ids + // attention_mask + if (auto data = std::get_if(&inputs)) { + input_ids = *data; + } else if (auto data = std::get_if>(&inputs)) { + input_ids = data->first; + attention_mask = data->second; + } else if (auto data = std::get_if(&inputs)) { + input_ids = data->input_ids; + attention_mask = data->attention_mask; + } + + ov::genai::EncodedResults result; + GenerationConfig config = (generation_config.has_value()) ? *generation_config : m_generation_config; + + std::shared_ptr streamer_ptr; + + if (auto streamer_obj = std::get_if(&streamer)) { + streamer_ptr = nullptr; + } else if (auto streamer_obj = std::get_if>(&streamer)) { + streamer_ptr = *streamer_obj; + } else if (auto callback = std::get_if>(&streamer)) { + streamer_ptr = std::make_shared(m_tokenizer, *callback); + } + + auto batch_size = input_ids.get_shape().at(0); + if ((batch_size != 1 || !(config.is_greedy_decoding() || config.is_multinomial())) && streamer_ptr) { + OPENVINO_THROW("Currently streaming is possible only with batch size=1 and greedy or multinomial decoding"); + } + + // auto attention_mask_data = attention_mask.has_value() ? *attention_mask : ov::genai::utils::init_attention_mask(input_ids); + + if (config.is_greedy_decoding()) { + result = ov::genai::greedy_decoding(m_model_runner, input_ids, attention_mask, config, streamer_ptr, is_chat_conversation); + } else if (config.is_beam_search()) { + result = beam_search(m_model_runner, input_ids, attention_mask, config); + } else if (config.is_multinomial()) { + result = multinominal_decoding(m_model_runner, input_ids, attention_mask, config, streamer_ptr); + } else { + OPENVINO_THROW("No decoding algorithm found for provided configuration parameters."); + } + + if (!is_chat_conversation) + m_model_runner.reset_state(); + + return result; + } - std::string apply_chat_template(std::string prompt, std::string role = "user") const; + std::string apply_chat_template(std::string prompt, std::string role = "user") const { + jinja2::TemplateEnv env; + env.GetSettings().lstripBlocks = true; + env.GetSettings().trimBlocks = true; + jinja2::Template tpl(&env); + tpl.Load(m_chat_template); + + jinja2::ValuesMap message {{"role", role}, {"content", prompt}}; + jinja2::ValuesMap params = { + {"messages", jinja2::ValuesList({message})}, + {"bos_token", m_generation_config.bos_token}, + {"eos_token", m_generation_config.eos_token}, + {"add_generation_prompt", true}, + }; + + return tpl.RenderAsString(params).value(); + } + + std::vector apply_chat_template(std::vector& prompts, std::string role = "user") const { + std::vector res; + for (const auto& prompt: prompts) { + res.emplace_back(apply_chat_template(prompt)); + } + return res; + } }; +DecodedResults LLMPipeline::generate( + StringInputs inputs, + OptionalGenerationConfig generation_config, + StreamerVariant streamer +) { + return m_pimpl->generate(inputs, generation_config, streamer); +} + +DecodedResults LLMPipeline::generate(StringInputs text, const ov::AnyMap& config_map) { + auto config_arg = get_config_from_map(config_map); + GenerationConfig config = (config_arg.has_value()) ? 
*config_arg : get_generation_config(); + config.update_generation_config(config_map); + + return m_pimpl->generate(text, config, get_streamer_from_map(config_map)); +} + +EncodedResults LLMPipeline::generate( + const EncodedInputs& inputs, + OptionalGenerationConfig generation_config, + StreamerVariant streamer +) { + return m_pimpl->generate(inputs, generation_config, streamer); +} + +EncodedResults LLMPipeline::generate(const EncodedInputs& inputs, const ov::AnyMap& config_map) { + auto config_arg = get_config_from_map(config_map); + GenerationConfig config = (config_arg.has_value()) ? *config_arg : get_generation_config(); + config.update_generation_config(config_map); + + return m_pimpl->generate(inputs, config, get_streamer_from_map(config_map)); +} + +std::pair streamer(StreamerVariant func) { + if (auto streamer_obj = std::get_if>(&func)) { + return {STREAMER_ARG_NAME, Any::make>(*streamer_obj)}; + } else { + auto callback = std::get>(func); + return {STREAMER_ARG_NAME, Any::make>(callback)}; + } +} + +std::pair generation_config(const GenerationConfig& config) { + return {CONFIG_ARG_NAME, Any::make(config)}; +} + } // namespace genai } // namespace ov using namespace std; +ov::genai::LLMPipeline::LLMPipeline( + const ov::InferRequest& request, + const ov::genai::Tokenizer& tokenizer, + OptionalGenerationConfig generation_config +) { + m_pimpl = std::make_unique(request, tokenizer, generation_config); +} + ov::genai::LLMPipeline::LLMPipeline( const std::string& model_path, @@ -208,167 +420,27 @@ ov::genai::LLMPipeline::LLMPipelineImpl::LLMPipelineImpl( ov::genai::LLMPipeline::LLMPipeline( const std::string& path, const std::string& device, - const ov::AnyMap& config, - const std::string& ov_tokenizers_path + const ov::AnyMap& config ) { - m_pimpl = make_unique(path, device, config, ov_tokenizers_path); + m_pimpl = make_unique(path, device, config); } ov::genai::LLMPipeline::LLMPipelineImpl::LLMPipelineImpl( const std::string& path, const std::string& device, - const ov::AnyMap& config, - const std::string& ov_tokenizers_path + const ov::AnyMap& config ): m_model_runner{ov::Core{}.compile_model(path + "/openvino_model.xml", device, config).create_infer_request()}, - m_tokenizer{ - ov_tokenizers_path.empty() - ? Tokenizer(path, device, with_openvino_tokenizers(get_ov_genai_library_path()).string()) - : Tokenizer(path, device, ov_tokenizers_path) - }, m_generation_config{from_config_json_if_exists(path)}, m_chat_template{from_tokenizer_json_if_exists(path)} - {} + { + ov::genai::utils::GenAIEnvManager env_manager(get_ov_genai_library_path()); + m_tokenizer = Tokenizer(path, device); + } -ov::genai::GenerationConfig ov::genai::LLMPipeline::LLMPipelineImpl::generation_config() const { - return m_generation_config; -} ov::genai::GenerationConfig ov::genai::LLMPipeline::get_generation_config() const { - return m_pimpl->generation_config(); -} - -std::string ov::genai::LLMPipeline::LLMPipelineImpl::generate( - std::string text, - OptionalGenerationConfig generation_config, - OptionalStreamerVariant streamer -) { - GenerationConfig config = (generation_config.has_value()) ? 
*generation_config : m_generation_config; - - if (is_chat_conversation) { - text = apply_chat_template(text); - } - auto kv_cache_len = m_model_runner.query_state()[0].get_state().get_shape()[2]; - - // previous prompt generation in chat dialog stops with the end of sentence token, - // need to append this token to the current prompt - if (is_chat_conversation && kv_cache_len > 0) { - text = config.eos_token + text; - } - - auto [input_ids, attention_mask] = m_tokenizer.encode(text); - - // todo: W/A If sentence begins with a specfial tokens (, , etc.) openvino_tokenizer inserts 2 special extra tokens and "▁", - // but HF does not do that. Moreover openvino_tokenizer always inserts but in chat scenario HF does not do that because skip_special_tokens=True. - // Need to remove both of that tokens manually to get exact token by token alignment with HF - auto size = input_ids.get_shape(); - int64_t* inputs_data = input_ids.data(); - std::vector tmp_ids(inputs_data, inputs_data + input_ids.get_size()); // todo: works only for batch 1 - // tmp_ids.erase(tmp_ids.begin()); - - auto attention_mask_data = attention_mask.data(); - std::vector tmp_attn_mask(attention_mask_data, attention_mask_data + attention_mask.get_size()); - // tmp_attn_mask.erase(tmp_attn_mask.begin()); - - std::vector prefixes_to_exclude = {config.eos_token, config.bos_token}; - auto prefix_match = [&text](std::string prefix) { return text.substr(0, prefix.length()) == prefix; }; - if (std::any_of(prefixes_to_exclude.begin(), prefixes_to_exclude.end(), prefix_match)) { - tmp_ids.erase(tmp_ids.begin()); - tmp_attn_mask.erase(tmp_attn_mask.begin()); - } - - input_ids = ov::Tensor(input_ids.get_element_type(), {1, tmp_ids.size()}); - for (size_t i = 0; i < tmp_ids.size(); i++) - input_ids.data()[i] = tmp_ids.data()[i]; - attention_mask = ov::Tensor(attention_mask.get_element_type(), {1, tmp_attn_mask.size()}); - for (size_t i = 0; i < tmp_attn_mask.size(); i++) - attention_mask.data()[i] = tmp_attn_mask.data()[i]; - - auto generate_results = generate(input_ids, attention_mask, config, streamer); - return m_tokenizer.decode(generate_results.tokens)[0]; -} - -ov::genai::DecodedResults ov::genai::LLMPipeline::generate(const std::vector& texts, OptionalGenerationConfig generation_config) { - return m_pimpl->generate(texts, generation_config); -} - -ov::genai::DecodedResults ov::genai::LLMPipeline::LLMPipelineImpl::generate(std::vector texts, OptionalGenerationConfig generation_config) { - auto [input_ids, attention_mask] = m_tokenizer.encode(texts); - - auto generate_results = generate(input_ids, attention_mask, generation_config, {}); - - return {m_tokenizer.decode(generate_results.tokens), generate_results.scores}; -} - -ov::genai::EncodedResults ov::genai::LLMPipeline::LLMPipeline::generate(ov::Tensor input_ids, - std::optional attention_mask, - OptionalGenerationConfig generation_config, - OptionalStreamerVariant streamer) { - return m_pimpl->generate(input_ids, attention_mask, generation_config, streamer); -} - -ov::genai::EncodedResults ov::genai::LLMPipeline::LLMPipelineImpl::generate( - ov::Tensor input_ids, - std::optional attention_mask, OptionalGenerationConfig generation_config, - OptionalStreamerVariant streamer -) { - ov::genai::EncodedResults result; - GenerationConfig config = (generation_config.has_value()) ? 
*generation_config : m_generation_config; - - std::shared_ptr streamer_ptr; - if (!streamer.has_value()){ - streamer_ptr = nullptr; - } else if (auto streamer_obj = std::get_if>(&*streamer)) { - streamer_ptr = *streamer_obj; - } else if (auto callback = std::get_if>(&*streamer)) { - streamer_ptr = std::make_shared(m_tokenizer, *callback); - } - auto batch_size = input_ids.get_shape().at(0); - if ((batch_size != 1 || !(config.is_greedy_decoding() || config.is_multinomial())) && streamer_ptr) { - OPENVINO_THROW("Currently streaming is possible only with batch size=1 and greedy or multinomial decoding"); - } - - auto attention_mask_data = attention_mask.has_value() ? *attention_mask : ov::genai::utils::init_attention_mask(input_ids); - - if (config.is_greedy_decoding()) { - result = ov::genai::greedy_decoding(m_model_runner, input_ids, attention_mask_data, config, streamer_ptr, is_chat_conversation); - } else if (config.is_beam_search()) { - result = beam_search(m_model_runner, input_ids, attention_mask_data, config); - } else if (config.is_multinomial()) { - result = multinominal_decoding(m_model_runner, input_ids, attention_mask_data, config, streamer_ptr); - } else { - OPENVINO_THROW("No decoding algorithm found for provided configuration parameters."); - } - - if (!is_chat_conversation) - m_model_runner.reset_state(); - - return result; -} - -std::string ov::genai::LLMPipeline::generate(std::string text, OptionalGenerationConfig generation_config, OptionalStreamerVariant streamer) { - return m_pimpl->generate(text, generation_config, streamer); -} - -std::string ov::genai::LLMPipeline::generate(std::string text, const ov::AnyMap& config_map) { - OptionalStreamerVariant streamer; - auto config = GenerationConfig::anymap_to_generation_config(config_map); - if (config_map.count("streamer")) { - streamer = config_map.at("streamer").as>(); - } - - return m_pimpl->generate(text, config, streamer); -} - -ov::genai::EncodedResults ov::genai::LLMPipeline::generate(ov::Tensor input_ids, const ov::AnyMap& config_map) { - OptionalStreamerVariant streamer; - auto config = GenerationConfig::anymap_to_generation_config(config_map); - if (config_map.count("streamer")) { - streamer = config_map.at("streamer").as>(); - } - - std::optional attention_mask; - return m_pimpl->generate(input_ids, attention_mask, config, streamer); + return m_pimpl->m_generation_config; } ov::genai::Tokenizer ov::genai::LLMPipeline::get_tokenizer() { @@ -379,23 +451,6 @@ std::string ov::genai::LLMPipeline::apply_chat_template(std::string prompt, std: return m_pimpl->apply_chat_template(prompt, role); } -std::string ov::genai::LLMPipeline::LLMPipelineImpl::apply_chat_template(std::string prompt, std::string role) const { - jinja2::TemplateEnv env; - env.GetSettings().lstripBlocks = true; - env.GetSettings().trimBlocks = true; - jinja2::Template tpl(&env); - tpl.Load(m_chat_template); - - jinja2::ValuesMap message {{"role", role}, {"content", prompt}}; - jinja2::ValuesMap params = { - {"messages", jinja2::ValuesList({message})}, - {"bos_token", m_generation_config.bos_token}, - {"eos_token", m_generation_config.eos_token}, - {"add_generation_prompt", true}, - }; - - return tpl.RenderAsString(params).value(); -} void ov::genai::LLMPipeline::start_chat() { m_pimpl->is_chat_conversation = true; diff --git a/src/cpp/src/multinomial_decoding.cpp b/src/cpp/src/multinomial_decoding.cpp index 3dcdcdeb72..33b7e5e378 100644 --- a/src/cpp/src/multinomial_decoding.cpp +++ b/src/cpp/src/multinomial_decoding.cpp @@ -9,7 +9,6 @@ #include 
#include -#include "generation_config_helper.hpp" #include "openvino/genai/llm_pipeline.hpp" #include "utils.hpp" diff --git a/src/cpp/src/text_callback_streamer.cpp b/src/cpp/src/text_callback_streamer.cpp index bb2bec09d9..39ef3bbcfa 100644 --- a/src/cpp/src/text_callback_streamer.cpp +++ b/src/cpp/src/text_callback_streamer.cpp @@ -1,3 +1,6 @@ +// Copyright (C) 2023-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + #include "text_callback_streamer.hpp" namespace ov { @@ -10,11 +13,6 @@ TextCallbackStreamer::TextCallbackStreamer(const Tokenizer& tokenizer, std::func m_enabled = true; } -TextCallbackStreamer::TextCallbackStreamer(const Tokenizer& tokenizer, bool print_eos_token) { - m_tokenizer = tokenizer; - m_print_eos_token = print_eos_token; -} - void TextCallbackStreamer::put(int64_t token) { std::stringstream res; // do nothing if token is met and if print_eos_token=false @@ -51,19 +49,19 @@ void TextCallbackStreamer::end() { on_finalized_text(res.str()); } -void TextCallbackStreamer::set_tokenizer(Tokenizer tokenizer) { - this->m_tokenizer = tokenizer; -} +// void TextCallbackStreamer::set_tokenizer(Tokenizer tokenizer) { +// this->m_tokenizer = tokenizer; +// } -void TextCallbackStreamer::set_callback(std::function callback) { - on_decoded_text_callback = callback; - m_enabled = true; -} +// void TextCallbackStreamer::set_callback(std::function callback) { +// on_decoded_text_callback = callback; +// m_enabled = true; +// } -void TextCallbackStreamer::set_callback() { - on_decoded_text_callback = [](std::string words){}; - m_enabled = false; -} +// void TextCallbackStreamer::set_callback() { +// on_decoded_text_callback = [](std::string words){}; +// m_enabled = false; +// } void TextCallbackStreamer::on_finalized_text(const std::string& subword) { if (m_enabled) { diff --git a/src/cpp/src/text_callback_streamer.hpp b/src/cpp/src/text_callback_streamer.hpp index 3834dd01ba..766f80cf9c 100644 --- a/src/cpp/src/text_callback_streamer.hpp +++ b/src/cpp/src/text_callback_streamer.hpp @@ -1,5 +1,6 @@ // Copyright (C) 2023-2024 Intel Corporation // SPDX-License-Identifier: Apache-2.0 + #pragma once #include "openvino/genai/streamer_base.hpp" @@ -13,14 +14,12 @@ class TextCallbackStreamer: public StreamerBase { void put(int64_t token) override; void end() override; - TextCallbackStreamer(const Tokenizer& tokenizer, std::function callback, bool print_eos_token = false); - TextCallbackStreamer(const Tokenizer& tokenizer, bool print_eos_token = false); - TextCallbackStreamer() = default; - ~TextCallbackStreamer() = default; + TextCallbackStreamer(const Tokenizer& tokenizer, std::function callback, bool print_eos_token = false); + // ~TextCallbackStreamer() = default; - void set_tokenizer(Tokenizer tokenizer); - void set_callback(std::function callback); - void set_callback(); + // void set_tokenizer(Tokenizer tokenizer); + // void set_callback(std::function callback); + // void set_callback(); std::function on_decoded_text_callback = [](std::string words){}; bool m_enabled = false; diff --git a/src/cpp/src/tokenizer.cpp b/src/cpp/src/tokenizer.cpp index 11ca3d3538..05e0c0d5db 100644 --- a/src/cpp/src/tokenizer.cpp +++ b/src/cpp/src/tokenizer.cpp @@ -4,11 +4,12 @@ #include #include "openvino/genai/tokenizer.hpp" #include "utils.hpp" +#include namespace { // todo: remove when openvino-tokenizers will support left padding -std::pair pad_left(ov::Tensor&& input_ids, ov::Tensor&& attention_mask, int64_t pad_token) { +ov::genai::TokenizedInputs pad_left(ov::Tensor&& input_ids, 
ov::Tensor&& attention_mask, int64_t pad_token) { const size_t batch_size = input_ids.get_shape()[0]; const size_t sequence_length = input_ids.get_shape()[1]; int64_t* inputs_data = input_ids.data(); @@ -39,6 +40,17 @@ std::pair pad_left(ov::Tensor&& input_ids, ov::Tensor&& return {input_ids, attention_mask}; } +std::filesystem::path with_openvino_tokenizers(const std::filesystem::path& path) { +#ifdef _WIN32 + constexpr char tokenizers[] = "openvino_tokenizers.dll"; +#elif __linux__ + constexpr char tokenizers[] = "libopenvino_tokenizers.so"; +#elif __APPLE__ + constexpr char tokenizers[] = "libopenvino_tokenizers.dylib"; +#endif + return path.parent_path() / tokenizers; +} + } namespace ov { @@ -53,13 +65,19 @@ class Tokenizer::TokenizerImpl { int64_t m_eos_token_id = 2; TokenizerImpl() = default; - TokenizerImpl(std::string tokenizers_path, const std::string device, const std::string& ov_tokenizers_path) { + TokenizerImpl(std::string tokenizers_path, const std::string device) { ov::Core core; if (ov::genai::utils::is_xml(tokenizers_path)) OPENVINO_THROW("tokenizers_path should be a path to a dir not a xml file"); - - core.add_extension(ov_tokenizers_path); + + const char* ov_tokenizers_path = getenv(ov::genai::utils::get_tokenizers_env_name()); + if (ov_tokenizers_path) { + core.add_extension(with_openvino_tokenizers(ov_tokenizers_path)); + } else { + OPENVINO_THROW("openvino_tokenizers path is not set"); + } + std::shared_ptr tokenizer_model, detokenizer_model; try { tokenizer_model = core.read_model(tokenizers_path + "/openvino_tokenizer.xml"); @@ -80,14 +98,14 @@ class Tokenizer::TokenizerImpl { m_pad_token_id = rt_info["pad_token_id"].as(); } - std::pair encode(std::string prompt) { + TokenizedInputs encode(std::string prompt) { size_t batch_size = 1; m_tokenize_request.set_input_tensor(ov::Tensor{ov::element::string, {batch_size}, &prompt}); m_tokenize_request.infer(); return {m_tokenize_request.get_tensor("input_ids"), m_tokenize_request.get_tensor("attention_mask")}; } - std::pair encode(std::vector& prompts) { + TokenizedInputs encode(std::vector& prompts) { m_tokenize_request.set_input_tensor(ov::Tensor{ov::element::string, {prompts.size()}, prompts.data()}); auto size_ = m_tokenize_request.get_input_tensor().get_shape(); m_tokenize_request.infer(); @@ -139,23 +157,23 @@ class Tokenizer::TokenizerImpl { } }; -Tokenizer::Tokenizer(const std::string& tokenizers_path, const std::string& device, const std::string& ov_tokenizers_path) { - m_pimpl = std::make_shared(tokenizers_path, device, ov_tokenizers_path); +Tokenizer::Tokenizer(const std::string& tokenizers_path, const std::string& device) { + m_pimpl = std::make_shared(tokenizers_path, device); } -std::pair Tokenizer::encode(const std::string prompt) { +TokenizedInputs Tokenizer::encode(const std::string prompt) { return m_pimpl->encode(std::move(prompt)); } -std::pair Tokenizer::encode(std::vector& prompts) { +TokenizedInputs Tokenizer::encode(std::vector& prompts) { return m_pimpl->encode(prompts); } -std::pair Tokenizer::encode(std::vector&& prompts) { +TokenizedInputs Tokenizer::encode(std::vector&& prompts) { return m_pimpl->encode(prompts); } -std::pair Tokenizer::encode(std::initializer_list& text) { +TokenizedInputs Tokenizer::encode(std::initializer_list& text) { return encode(std::vector(text.begin(), text.end())); } diff --git a/src/cpp/src/utils.cpp b/src/cpp/src/utils.cpp index 8111dc5c94..497c4ac100 100644 --- a/src/cpp/src/utils.cpp +++ b/src/cpp/src/utils.cpp @@ -146,6 +146,36 @@ ov::Tensor 
extend_attention(ov::Tensor attention_mask) { return new_atten_mask; } +GenAIEnvManager::GenAIEnvManager(const std::string& path) { + #ifdef _WIN32 + char* value = nullptr; + size_t len = 0; + _dupenv_s(&value, &len, ov::genai::utils::get_tokenizers_env_name().c_str()); + if (value == nullptr) + _putenv_s(ov::genai::utils::get_tokenizers_env_name().c_str(), path.c_str()); + #else + if (!getenv(ov::genai::utils::get_tokenizers_env_name())) + setenv(ov::genai::utils::get_tokenizers_env_name(), path.c_str(), 1); + #endif + else + was_already_set = true; +} + +GenAIEnvManager::~GenAIEnvManager() { + if (!was_already_set){ + #ifdef _WIN32 + _putenv_s(ov::genai::utils::get_tokenizers_env_name()); + #else + unsetenv(ov::genai::utils::get_tokenizers_env_name()); + #endif + } +} + +const char* get_tokenizers_env_name() { + return "OPENVINO_TOKENIZERS_PATH_GENAI"; +} + + } // namespace utils } // namespace genai } // namespace ov diff --git a/src/cpp/src/utils.hpp b/src/cpp/src/utils.hpp index 4559a8962f..292bb43505 100644 --- a/src/cpp/src/utils.hpp +++ b/src/cpp/src/utils.hpp @@ -59,6 +59,18 @@ void read_anymap_param(const ov::AnyMap& config_map, const std::string& name, T& } } +const char* get_tokenizers_env_name(); + +// const char* OV_TOKENIZERS_ENV_NAME = "OPENVINO_TOKENIZERS_PATH_GENAI"; + +class GenAIEnvManager { +public: + GenAIEnvManager(const std::string& path); + ~GenAIEnvManager(); +private: + bool was_already_set; +}; + } // namespace utils } // namespace genai } // namespace ov diff --git a/src/python/CMakeLists.txt b/src/python/CMakeLists.txt index f235d00fb4..178cc441a0 100644 --- a/src/python/CMakeLists.txt +++ b/src/python/CMakeLists.txt @@ -15,7 +15,7 @@ if(NOT pybind11_POPULATED) endif() pybind11_add_module(py_generate_pipeline py_generate_pipeline.cpp) -target_link_libraries(py_generate_pipeline PRIVATE openvino::genai) +target_link_libraries(py_generate_pipeline PRIVATE openvino::genai nlohmann_json::nlohmann_json) set_target_properties(py_generate_pipeline PROPERTIES LIBRARY_OUTPUT_DIRECTORY "$<1:${CMAKE_BINARY_DIR}/openvino_genai/>" ) @@ -49,3 +49,6 @@ install(TARGETS genai py_generate_pipeline LIBRARY DESTINATION . COMPONENT wheel_genai RUNTIME DESTINATION . 
COMPONENT wheel_genai EXCLUDE_FROM_ALL) + +# to be able to use utils.hpp in pybind +include_directories(${CMAKE_SOURCE_DIR}/src/cpp/src/) diff --git a/src/python/openvino_genai/__init__.py b/src/python/openvino_genai/__init__.py index e069157fa7..f23e447d5f 100644 --- a/src/python/openvino_genai/__init__.py +++ b/src/python/openvino_genai/__init__.py @@ -9,6 +9,6 @@ if hasattr(os, "add_dll_directory"): os.add_dll_directory(os.path.dirname(__file__)) -from .py_generate_pipeline import LLMPipeline, Tokenizer, GenerationConfig, DecodedResults, EncodedResults +from .py_generate_pipeline import LLMPipeline, Tokenizer, GenerationConfig, DecodedResults, EncodedResults, StreamerBase -__all__ = ['LLMPipeline', 'Tokenizer', 'GenerationConfig', 'DecodedResults', 'EncodedResults'] +__all__ = ['LLMPipeline', 'Tokenizer', 'GenerationConfig', 'DecodedResults', 'EncodedResults', 'StreamerBase'] diff --git a/src/python/py_generate_pipeline.cpp b/src/python/py_generate_pipeline.cpp index d1f8c5b3c2..3a93ddbea8 100644 --- a/src/python/py_generate_pipeline.cpp +++ b/src/python/py_generate_pipeline.cpp @@ -6,6 +6,7 @@ #include #include #include "openvino/genai/llm_pipeline.hpp" +#include "utils.hpp" #ifdef _WIN32 # include @@ -42,6 +43,7 @@ using ov::genai::EncodedResults; using ov::genai::DecodedResults; using ov::genai::StopCriteria; using ov::genai::StreamerBase; +using ov::genai::StreamerVariant; namespace { void str_to_stop_criteria(GenerationConfig& config, const std::string& stop_criteria_str){ @@ -84,17 +86,29 @@ void update_config_from_kwargs(GenerationConfig& config, const py::kwargs& kwarg if (kwargs.contains("bos_token")) config.bos_token = kwargs["bos_token"].cast(); } -// operator() and generate methods are identical, operator() is just an alias for generate -std::string call_with_kwargs(LLMPipeline& pipeline, const std::string& text, const py::kwargs& kwargs) { - // Create a new GenerationConfig instance and initialize from kwargs +py::object call_with_config(LLMPipeline& pipe, const std::string& text, const GenerationConfig& config, const StreamerVariant& streamer) { + if (config.num_return_sequences > 1) { + return py::cast(pipe.generate({text}, config, streamer).texts); + } else { + return py::cast(std::string(pipe.generate(text, config, streamer))); + } +} + +std::vector call_with_config(LLMPipeline& pipe, const std::vector& text, const GenerationConfig& config, const StreamerVariant& streamer) { + return pipe.generate(text, config, streamer); +} + +std::vector call_with_kwargs(LLMPipeline& pipeline, const std::vector& texts, const py::kwargs& kwargs) { GenerationConfig config = pipeline.get_generation_config(); update_config_from_kwargs(config, kwargs); - return pipeline(text, config); + return call_with_config(pipeline, texts, config, kwargs.contains("streamer") ? kwargs["streamer"].cast() : std::monostate()); } -std::string call_with_config(LLMPipeline& pipe, const std::string& text, const GenerationConfig& config) { - std::shared_ptr streamer; - return pipe(text, config); +py::object call_with_kwargs(LLMPipeline& pipeline, const std::string& text, const py::kwargs& kwargs) { + // Create a new GenerationConfig instance and initialize from kwargs + GenerationConfig config = pipeline.get_generation_config(); + update_config_from_kwargs(config, kwargs); + return call_with_config(pipeline, text, config, kwargs.contains("streamer") ? 
kwargs["streamer"].cast() : std::monostate()); } std::filesystem::path with_openvino_tokenizers(const std::filesystem::path& path) { @@ -138,6 +152,20 @@ std::string ov_tokenizers_module_path() { } return py::str(py::module_::import("openvino_tokenizers").attr("_ext_path")); } +class EmptyStreamer: public StreamerBase { + // It's impossible to create an instance of pure virtual class. Define EmptyStreamer instead. + void put(int64_t token) override { + PYBIND11_OVERRIDE_PURE( + void, // Return type + StreamerBase, // Parent class + put, // Name of function in C++ (must match Python name) + token // Argument(s) + ); + } + void end() override { + PYBIND11_OVERRIDE_PURE(void, StreamerBase, end); + } +}; } PYBIND11_MODULE(py_generate_pipeline, m) { @@ -147,21 +175,28 @@ PYBIND11_MODULE(py_generate_pipeline, m) { .def(py::init(), py::arg("model_path"), py::arg("tokenizer"), py::arg("device") = "CPU", py::arg("plugin_config") = ov::AnyMap{}) - .def(py::init(), - py::arg("path"), py::arg("device") = "CPU", py::arg("plugin_config") = ov::AnyMap{}, py::arg("ov_tokenizers_path") = ov_tokenizers_module_path()) + .def(py::init([](const std::string& model_path, + const std::string& device, + const ov::AnyMap& plugin_config) { + ov::genai::utils::GenAIEnvManager env_manager(ov_tokenizers_module_path()); + return std::make_unique(model_path, device, plugin_config);}), + py::arg("model_path"), "path to the model path", + py::arg("device") = "CPU", "device on which inference will be done", + py::arg("plugin_config") = ov::AnyMap(), + "LLMPipeline class constructor.\n" + " model_path (str): Path to the model file.\n" + " device (str): Device to run the model on (e.g., CPU, GPU). Default is 'CPU'.\n" + " plugin_config (ov::AnyMap): Plugin configuration settings. Default is an empty.") + .def("__call__", py::overload_cast(&call_with_kwargs)) - .def("__call__", py::overload_cast(&call_with_config)) + .def("__call__", py::overload_cast(&call_with_config)) + + .def("generate", py::overload_cast&, const py::kwargs&>(&call_with_kwargs)) + .def("generate", py::overload_cast&, const GenerationConfig&, const StreamerVariant&>(&call_with_config)) .def("generate", py::overload_cast(&call_with_kwargs)) - .def("generate", py::overload_cast(&call_with_config)) + .def("generate", py::overload_cast(&call_with_config)) // todo: if input_ids is a ov::Tensor/numpy tensor - // todo: implement calling generate/operator() with StreamerBase or lambda streamer - // signature to be implemented: - // EncodedResults generate(ov::Tensor input_ids, - // std::optional attention_mask, - // OptionalGenerationConfig generation_config=nullopt, - // OptionalStreamerVariant streamer=nullopt); - .def("get_tokenizer", &LLMPipeline::get_tokenizer) .def("start_chat", &LLMPipeline::start_chat) @@ -174,10 +209,9 @@ PYBIND11_MODULE(py_generate_pipeline, m) { // Binding for Tokenizer py::class_(m, "Tokenizer") .def(py::init<>()) - .def(py::init(), + .def(py::init(), py::arg("tokenizers_path"), - py::arg("device") = "CPU", - py::arg("ov_tokenizers_path") = py::str(ov_tokenizers_module_path())) + py::arg("device") = "CPU") // todo: implement encode/decode when for numpy inputs and outputs .def("encode", py::overload_cast(&Tokenizer::encode), "Encode a single prompt") @@ -222,4 +256,8 @@ PYBIND11_MODULE(py_generate_pipeline, m) { .def_readwrite("tokens", &EncodedResults::tokens) .def_readwrite("scores", &EncodedResults::scores); + py::class_>(m, "StreamerBase") // Change the holder form unique_ptr to shared_ptr + .def(py::init<>()) + .def("put", 
&StreamerBase::put)
+        .def("end", &StreamerBase::end);
 }
diff --git a/tests/python_tests/generate_api_check.py b/tests/python_tests/generate_api_check.py
new file mode 100644
index 0000000000..ad0851fea2
--- /dev/null
+++ b/tests/python_tests/generate_api_check.py
@@ -0,0 +1,25 @@
+import openvino_genai as ov_genai
+model_id = 'TinyLlama/TinyLlama-1.1B-Chat-v1.0'
+path = '/home/epavel/devel/openvino.genai/text_generation/causal_lm/TinyLlama-1.1B-Chat-v1.0'
+device = 'CPU'
+pipe = ov_genai.LLMPipeline(path, device)
+
+from transformers import AutoTokenizer, AutoModelForCausalLM
+tokenizer = AutoTokenizer.from_pretrained(model_id)
+model = AutoModelForCausalLM.from_pretrained(model_id)
+
+prompt = 'table is made of'
+generation_config = {'max_new_tokens': 10}
+
+encoded_prompt = tokenizer.encode(prompt, return_tensors='pt', add_special_tokens=True)
+hf_encoded_output = model.generate(encoded_prompt, **generation_config)
+hf_output = tokenizer.decode(hf_encoded_output[0, encoded_prompt.shape[1]:])
+
+
+
+import os
+build_dir = os.getenv('GENAI_BUILD_DIR', 'build')
+ov_tokenizers_path = f'{build_dir}/openvino_tokenizers/src/'
+# pipe = ov_genai.LLMPipeline(path, device, {}, ov_tokenizers_path)
+
+ov_output = pipe.generate(prompt, **generation_config)
diff --git a/tests/python_tests/list_test_models.py b/tests/python_tests/list_test_models.py
index 99ca47b27c..90dcc83f0f 100644
--- a/tests/python_tests/list_test_models.py
+++ b/tests/python_tests/list_test_models.py
@@ -3,17 +3,17 @@ def models_list():
     model_ids = [
         ("TinyLlama/TinyLlama-1.1B-Chat-v1.0", "TinyLlama-1.1B-Chat-v1.0"),
+        # ("databricks/dolly-v2-3b", "dolly-v2-3b"),  # not enough free disk space on CI machine
+        ("microsoft/phi-1_5", "phi-1_5/"),
         # ("google/gemma-2b-it", "gemma-2b-it"),
         # ("google/gemma-7b-it", "gemma-7b-it"),
         # ("meta-llama/Llama-2-7b-chat-hf", "Llama-2-7b-chat-hf"),
         # ("meta-llama/Llama-2-13b-chat-hf", "Llama-2-13b-chat-hf"),
         # ("openlm-research/open_llama_3b", "open_llama_3b"),
         # ("openlm-research/open_llama_7b", "open_llama_7b"),
-        # ("databricks/dolly-v2-3b", "dolly-v2-3b"),
         # ("databricks/dolly-v2-12b", "dolly-v2-12b"),
         # ("mistralai/Mistral-7B-v0.1", "Mistral-7B-v0.1"),
         # ("ikala/redpajama-3b-chat", "redpajama-3b-chat"),
-        # ("microsoft/phi-1_5", "phi-1_5/"),
         # ("Qwen/Qwen1.5-7B-Chat", "Qwen1.5-7B-Chat"),
     ]
     import os
diff --git a/tests/python_tests/test_generate_api.py b/tests/python_tests/test_generate_api.py
index 690e1ae9dd..442201b0cd 100644
--- a/tests/python_tests/test_generate_api.py
+++ b/tests/python_tests/test_generate_api.py
@@ -3,6 +3,7 @@
 import functools
 import openvino
+import openvino_genai
 import openvino_tokenizers
 import optimum.intel
 import pytest
@@ -38,11 +39,11 @@ def run_hf_ov_genai_comparison(model_fixture, generation_config, prompt):
     hf_output = tokenizer.decode(hf_encoded_output[0, encoded_prompt.shape[1]:])
 
     device = 'CPU'
-    # pipe = ov_genai.LLMPipeline(path, device)
-
-    pipe = ov_genai.LLMPipeline(str(path), device)
+    pipe = ov_genai.LLMPipeline(path, device)
     ov_output = pipe.generate(prompt, **generation_config)
+    if generation_config.get('num_return_sequences', 1) > 1:
+        ov_output = ov_output[0]
 
     if hf_output != ov_output:
         print(f'hf_output: {hf_output}')
@@ -56,12 +57,11 @@ def stop_criteria_map():
 
 test_cases = [
     (dict(max_new_tokens=20, do_sample=False), 'table is made of'),  # generation_config, prompt
-    (dict(num_beam_groups=3, num_beams=15, num_return_sequences=15, max_new_tokens=20, diversity_penalty=1.0), 'table is made of'),
-    # (dict(num_beam_groups=3, num_beams=15, num_return_sequences=15, max_new_tokens=20, diversity_penalty=1.0), 'Alan Turing was a'),
-    # (dict(num_beam_groups=3, num_beams=15, num_return_sequences=15, max_new_tokens=30, diversity_penalty=1.0), 'Alan Turing was a'),
-    # (dict(num_beam_groups=2, num_beams=8, num_return_sequences=8, max_new_tokens=20, diversity_penalty=1.0), 'table is made of'),
-    # (dict(num_beam_groups=2, num_beams=8, num_return_sequences=8, max_new_tokens=20, diversity_penalty=1.0), 'The Sun is yellow because'),
-    # (dict(num_beam_groups=2, num_beams=8, num_return_sequences=8, max_new_tokens=20, diversity_penalty=1.5), 'The Sun is yellow because'),
+    (dict(num_beam_groups=3, num_beams=15, num_return_sequences=15, max_new_tokens=20, diversity_penalty=1.0), 'Alan Turing was a'),
+    (dict(num_beam_groups=3, num_beams=15, num_return_sequences=15, max_new_tokens=30, diversity_penalty=1.0), 'Alan Turing was a'),
+    (dict(num_beam_groups=2, num_beams=8, num_return_sequences=8, max_new_tokens=20, diversity_penalty=1.0), 'table is made of'),
+    (dict(num_beam_groups=2, num_beams=8, num_return_sequences=8, max_new_tokens=20, diversity_penalty=1.0), 'The Sun is yellow because'),
+    (dict(num_beam_groups=2, num_beams=8, num_return_sequences=8, max_new_tokens=20, diversity_penalty=1.5), 'The Sun is yellow because'),
 ]
 @pytest.mark.parametrize("generation_config,prompt", test_cases)
 def test_greedy_decoding(model_fixture, generation_config, prompt):
@@ -74,7 +74,6 @@ def test_greedy_decoding(model_fixture, generation_config, prompt):
 @pytest.mark.parametrize("max_new_tokens", [20, 15])
 @pytest.mark.parametrize("diversity_penalty", [1.0, 1.5])
 @pytest.mark.parametrize("prompt", prompts)
-@pytest.mark.skip # temporarily
 def test_beam_search_decoding(model_fixture, num_beam_groups, group_size,
                               max_new_tokens, diversity_penalty, prompt):
     generation_config = dict(
@@ -90,7 +89,6 @@ def test_beam_search_decoding(model_fixture, num_beam_groups, group_size,
 @pytest.mark.parametrize("stop_criteria", ["never", "early", "heuristic"])
 @pytest.mark.parametrize("prompt", prompts)
 @pytest.mark.parametrize("max_new_tokens", [20, 40, 300])
-@pytest.mark.skip # temporarily
 def test_stop_criteria(model_fixture, stop_criteria, prompt, max_new_tokens):
     # todo: for long sentences early stop_criteria fails
     if (stop_criteria == 'early' and max_new_tokens >= 300):
@@ -123,3 +121,95 @@ def test_beam_search_long_sentences(model_fixture, num_beam_groups, group_size,
         max_new_tokens=max_new_tokens,
     )
     run_hf_ov_genai_comparison(model_fixture, generation_config, prompt)
+
+
+def user_defined_callback(subword):
+    print(subword)
+
+
+@pytest.mark.parametrize("callback", [print, user_defined_callback, lambda subword: print(subword)])
+def test_callback_one_string(model_fixture, callback):
+    pipe = openvino_genai.LLMPipeline(model_fixture[1], 'CPU')
+    pipe.generate('', openvino_genai.GenerationConfig(), callback)
+
+
+@pytest.mark.parametrize("callback", [print, user_defined_callback, lambda subword: print(subword)])
+def test_callback_batch_fail(model_fixture, callback):
+    pipe = openvino_genai.LLMPipeline(model_fixture[1], 'CPU')
+    with pytest.raises(RuntimeError):
+        pipe.generate(['1', '2'], openvino_genai.GenerationConfig(), callback)
+
+
+@pytest.mark.parametrize("callback", [print, user_defined_callback, lambda subword: print(subword)])
+def test_callback_kwargs_one_string(model_fixture, callback):
+    pipe = openvino_genai.LLMPipeline(model_fixture[1], 'CPU')
+    pipe.generate('', max_new_tokens=10, streamer=callback)
+
+
+@pytest.mark.parametrize("callback", [print, user_defined_callback, lambda subword: print(subword)])
+def test_callback_kwargs_batch_fail(model_fixture, callback):
+    pipe = openvino_genai.LLMPipeline(model_fixture[1], 'CPU')
+    with pytest.raises(RuntimeError):
+        pipe.generate(['1', '2'], max_new_tokens=10, streamer=callback)
+
+
+class Printer(openvino_genai.StreamerBase):
+    def __init__(self, tokenizer):
+        super().__init__()
+        self.tokenizer = tokenizer
+    def put(self, token_id):
+        print(self.tokenizer.decode([token_id]))  # Incorrect way to print, but easy to implement
+    def end(self):
+        print('end')
+
+
+def test_streamer_one_string(model_fixture):
+    pipe = openvino_genai.LLMPipeline(model_fixture[1], 'CPU')
+    printer = Printer(pipe.get_tokenizer())
+    pipe.generate('', openvino_genai.GenerationConfig(), printer)
+
+
+def test_streamer_batch_fail(model_fixture):
+    pipe = openvino_genai.LLMPipeline(model_fixture[1], 'CPU')
+    printer = Printer(pipe.get_tokenizer())
+    with pytest.raises(RuntimeError):
+        pipe.generate(['1', '2'], openvino_genai.GenerationConfig(), printer)
+
+
+def test_streamer_kwargs_one_string(model_fixture):
+    pipe = openvino_genai.LLMPipeline(model_fixture[1], 'CPU')
+    printer = Printer(pipe.get_tokenizer())
+    pipe.generate('', do_sample=True, streamer=printer)
+
+
+def test_streamer_kwargs_batch_fail(model_fixture):
+    pipe = openvino_genai.LLMPipeline(model_fixture[1], 'CPU')
+    printer = Printer(pipe.get_tokenizer())
+    with pytest.raises(RuntimeError):
+        pipe.generate('', num_beams=2, streamer=printer)
+
+
+@pytest.mark.parametrize("callback", [print, user_defined_callback, lambda subword: print(subword)])
+def test_operator_with_callback_one_string(model_fixture, callback):
+    pipe = openvino_genai.LLMPipeline(model_fixture[1], 'CPU')
+    pipe('', openvino_genai.GenerationConfig(), callback)
+
+
+@pytest.mark.parametrize("callback", [print, user_defined_callback, lambda subword: print(subword)])
+def test_operator_with_callback_batch_fail(model_fixture, callback):
+    pipe = openvino_genai.LLMPipeline(model_fixture[1], 'CPU')
+    with pytest.raises(RuntimeError):
+        pipe(['1', '2'], openvino_genai.GenerationConfig(), callback)
+
+
+def test_operator_with_streamer_kwargs_one_string(model_fixture):
+    pipe = openvino_genai.LLMPipeline(model_fixture[1], 'CPU')
+    printer = Printer(pipe.get_tokenizer())
+    pipe('', do_sample=True, streamer=printer)
+
+
+def test_operator_with_streamer_kwargs_batch_fail(model_fixture):
+    pipe = openvino_genai.LLMPipeline(model_fixture[1], 'CPU')
+    printer = Printer(pipe.get_tokenizer())
+    with pytest.raises(RuntimeError):
+        pipe('', num_beams=2, streamer=printer)
diff --git a/text_generation/causal_lm/cpp/chat_sample.cpp b/text_generation/causal_lm/cpp/chat_sample.cpp
index 3e215e5208..08748b3bbd 100644
--- a/text_generation/causal_lm/cpp/chat_sample.cpp
+++ b/text_generation/causal_lm/cpp/chat_sample.cpp
@@ -3,9 +3,26 @@
 #include 
 #include "openvino/genai/llm_pipeline.hpp"
+#include "openvino/genai/streamer_base.hpp"
 
 using namespace std;
 
+class CustomStreamer: public ov::genai::StreamerBase {
+public:
+    void put(int64_t token) {
+        std::cout << token << std::endl;
+        /* custom decoding/tokens processing code
+        tokens_cache.push_back(token);
+        std::string text = m_tokenizer.decode(tokens_cache);
+        ...
+        */
+    };
+
+    void end() {
+        /* custom finalization */
+    };
+};
+
 std::vector<std::string> questions = {
     "1+1=",
     "what was the previous answer?",
@@ -24,7 +41,8 @@ int main(int argc, char* argv[]) try {
     ov::genai::GenerationConfig config = pipe.get_generation_config();
     config.max_new_tokens = 10000;
 
-    auto streamer = [](std::string word) { std::cout << word << std::flush; };
+    std::function<bool(std::string)> streamer = [](std::string word) { std::cout << word << std::flush; return true; };
+    std::shared_ptr<CustomStreamer> custom_streamer = std::make_shared<CustomStreamer>();
 
     pipe.start_chat();
     for (size_t i = 0; i < questions.size(); i++) {
@@ -35,7 +53,8 @@ int main(int argc, char* argv[]) try {
         cout << prompt << endl;
 
         // auto answer_str = pipe(prompt, config, streamer);
-        auto answer_str = pipe.generate(prompt, ov::genai::max_new_tokens(10000), ov::genai::streamer(streamer));
+        auto answer_str = pipe(prompt, ov::genai::generation_config(config), ov::genai::streamer(streamer));
+        // auto answer_str = pipe.generate(prompt, ov::genai::max_new_tokens(10000), ov::genai::streamer(streamer));
         accumulated_str += answer_str;
 
         cout << "\n----------\n";
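
For reference, the streaming API exercised by the new Python tests and by chat_sample.cpp reduces to the usage sketched below. This is a minimal sketch and not part of the diff: it assumes the openvino_genai wheel is installed and that the model directory name (a placeholder here) points to a locally converted model with tokenizer files. Streaming is only supported for a single prompt; combining a streamer with a list of prompts raises RuntimeError, which is exactly what the *_batch_fail tests assert.

```python
import openvino_genai as ov_genai

model_dir = 'TinyLlama-1.1B-Chat-v1.0'  # placeholder: directory with openvino_model.xml/bin and tokenizer files
pipe = ov_genai.LLMPipeline(model_dir, 'CPU')

# Option 1: any callable that accepts a decoded subword can be passed as the streamer.
def print_subword(subword):
    print(subword, end='', flush=True)

pipe.generate('The Sun is yellow because', max_new_tokens=20, streamer=print_subword)

# Option 2: subclass StreamerBase to receive raw token ids and handle decoding yourself.
class TokenPrinter(ov_genai.StreamerBase):
    def __init__(self, tokenizer):
        super().__init__()
        self.tokenizer = tokenizer
    def put(self, token_id):
        # decoding token-by-token is a simplification, same as the Printer class in the tests above
        print(self.tokenizer.decode([token_id]), end='', flush=True)
    def end(self):
        print()

pipe.generate('table is made of', max_new_tokens=20, streamer=TokenPrinter(pipe.get_tokenizer()))
```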