
Commit

Merge branch 'generate_pipeline' into update-tests
Wovchena committed May 29, 2024
2 parents db31fe8 + 6709a67 commit 6e52dc9
Showing 21 changed files with 693 additions and 377 deletions.
33 changes: 32 additions & 1 deletion src/cpp/include/openvino/genai/generation_config.hpp
@@ -100,8 +100,39 @@ class OPENVINO_GENAI_EXPORTS GenerationConfig {
bool is_greedy_decoding() const;
bool is_beam_search() const;
bool is_multinomial() const;
static GenerationConfig anymap_to_generation_config(const ov::AnyMap& config_map = {});
void update_generation_config(const ov::AnyMap& config_map = {});
};

/*
 * utils that allow using generate() and operator() in the following way:
* pipe.generate(input_ids, ov::max_new_tokens(200), ov::temperature(1.0f),...)
* pipe(text, ov::max_new_tokens(200), ov::temperature(1.0f),...)
*/
static constexpr ov::Property<size_t> max_new_tokens{"max_new_tokens"};
static constexpr ov::Property<size_t> max_length{"max_length"};
static constexpr ov::Property<bool> ignore_eos{"ignore_eos"};

static constexpr ov::Property<size_t> num_beam_groups{"num_beam_groups"};
static constexpr ov::Property<size_t> num_beams{"num_beams"};
static constexpr ov::Property<float> diversity_penalty{"diversity_penalty"};
static constexpr ov::Property<float> length_penalty{"length_penalty"};
static constexpr ov::Property<size_t> num_return_sequences{"num_return_sequences"};
static constexpr ov::Property<size_t> no_repeat_ngram_size{"no_repeat_ngram_size"};
static constexpr ov::Property<StopCriteria> stop_criteria{"stop_criteria"};

static constexpr ov::Property<float> temperature{"temperature"};
static constexpr ov::Property<float> top_p{"top_p"};
static constexpr ov::Property<int> top_k{"top_k"};
static constexpr ov::Property<bool> do_sample{"do_sample"};
static constexpr ov::Property<float> repetition_penalty{"repetition_penalty"};


static constexpr ov::Property<int64_t> pad_token_id{"pad_token_id"};
static constexpr ov::Property<int64_t> bos_token_id{"bos_token_id"};
static constexpr ov::Property<int64_t> eos_token_id{"eos_token_id"};

static constexpr ov::Property<std::string> bos_token{"bos_token"};
static constexpr ov::Property<std::string> eos_token{"eos_token"};

} // namespace genai
} // namespace ov
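
A minimal sketch of how the property helpers above combine with the new update_generation_config() (a sketch only, not part of this commit; it assumes GenerationConfig is default-constructible):

#include "openvino/genai/generation_config.hpp"

int main() {
    ov::genai::GenerationConfig config;

    // Each property helper collapses to a {name, ov::Any} pair, so a set of them
    // can initialize an ov::AnyMap and be applied in a single call.
    ov::AnyMap overrides{
        ov::genai::max_new_tokens(200),
        ov::genai::temperature(1.0f),
        ov::genai::do_sample(true)
    };
    config.update_generation_config(overrides);
    return 0;
}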
170 changes: 85 additions & 85 deletions src/cpp/include/openvino/genai/llm_pipeline.hpp
@@ -14,9 +14,10 @@
namespace ov {
namespace genai {

using StreamerVariant = std::variant<std::function<void (std::string)>, std::shared_ptr<StreamerBase>>;
using StreamerVariant = std::variant<std::function<void(std::string)>, std::shared_ptr<StreamerBase>, std::monostate>;
using OptionalGenerationConfig = std::optional<GenerationConfig>;
using OptionalStreamerVariant = std::optional<StreamerVariant>;
using EncodedInputs = std::variant<ov::Tensor, std::pair<ov::Tensor, ov::Tensor>, TokenizedInputs>;
using StringInputs = std::variant<std::string, std::vector<std::string>>;

/**
* @brief Structure to store resulting batched tokens and scores for each batch sequence
@@ -43,6 +44,13 @@ class DecodedResults {

// @brief Convert DecodedResults to a single string.
// @return std::string containing the text from the DecodedResults object.
operator std::string() const {
OPENVINO_ASSERT(texts.size() == 1, "DecodedResults can be converted to a string only if it contains a single prompt");
return texts.at(0);
}

// @brief Convert DecodedResults to a vector of strings.
// @return A std::vector<std::string> containing the texts from the DecodedResults object.
operator std::vector<std::string>() const {
return texts;
}
@@ -71,11 +79,27 @@ class OPENVINO_GENAI_EXPORTS LLMPipeline {
* @param model_path Path to the dir model xml/bin files, tokenizers and generation_configs.json
* @param device optional device
* @param plugin_config optional plugin_config
* @param ov_tokenizers_path optional path to an extension to add. If empty, openvino_tokenizers is added from the openvino_genai library folder.
*/
LLMPipeline(const std::string& path, const std::string& device="CPU",
const ov::AnyMap& plugin_config={},
const std::string& ov_tokenizers_path="");
LLMPipeline(
const std::string& path,
const std::string& device="CPU",
const ov::AnyMap& plugin_config={}
);

/**
* @brief Constructs an LLMPipeline from an already existing InferRequest and Tokenizer
*
* @param request infer request of the model
* @param tokenizer initialized Tokenizer
* @param generation_config optional generation_config; by default it will be initialized for greedy decoding
* @param device optional device
* @param plugin_config optional plugin_config
*/
LLMPipeline(
const ov::InferRequest& request,
const ov::genai::Tokenizer& tokenizer,
OptionalGenerationConfig generation_config=std::nullopt
);

/**
* @brief Constructs an LLMPipeline when ov::Tokenizer is initialized manually using files from different directories.
@@ -95,76 +119,84 @@ class OPENVINO_GENAI_EXPORTS LLMPipeline {
~LLMPipeline();

/**
* @brief High level generate for the input with a single prompt which encodes inputs and returns decoded output
* @brief High level generate that receives prompts as a string or a vector of strings and returns decoded output.
*
* @param text input prompt
* @param inputs input prompt or a vector of prompts
* @param generation_config optional GenerationConfig
* @param streamer optional streamer
* @return std::string decoded resulting text
* @return DecodedResults decoded resulting text
*/
DecodedResults generate(
StringInputs inputs,
OptionalGenerationConfig generation_config=std::nullopt,
StreamerVariant streamer=std::monostate()
);

/**
* @brief High level generate that receives prompts as a string or a vector of strings and returns decoded output.
* Properties can be passed in any order, e.g. pipe.generate(..., ov::genai::max_new_tokens(100), ov::genai::streamer(lambda_func)).
*
* @param inputs input prompt or a vector of prompts
* @param properties generation parameters passed as ov properties
* @return DecodedResults decoded resulting text
*/
std::string generate(std::string text, OptionalGenerationConfig generation_config=std::nullopt, OptionalStreamerVariant streamer=std::nullopt);

template <typename... Properties>
util::EnableIfAllStringAny<std::string, Properties...> generate(
std::string text,
util::EnableIfAllStringAny<DecodedResults, Properties...> generate(
StringInputs inputs,
Properties&&... properties) {
return generate(text, AnyMap{std::forward<Properties>(properties)...});
return generate(inputs, AnyMap{std::forward<Properties>(properties)...});
}
DecodedResults generate(StringInputs inputs, const ov::AnyMap& config_map);


DecodedResults operator()(
StringInputs inputs,
OptionalGenerationConfig generation_config=std::nullopt,
StreamerVariant streamer=std::monostate()
) {
return generate(inputs, generation_config, streamer);
}
std::string generate(std::string text, const ov::AnyMap& config);

template <typename... Properties>
util::EnableIfAllStringAny<EncodedResults, Properties...> generate(
ov::Tensor input_ids,
Properties&&... properties) {
return generate(input_ids, AnyMap{std::forward<Properties>(properties)...});
util::EnableIfAllStringAny<DecodedResults, Properties...> operator()(
StringInputs inputs,
Properties&&... properties) {
return generate(inputs, AnyMap{std::forward<Properties>(properties)...});
}
EncodedResults generate(ov::Tensor input_ids, const ov::AnyMap& config);

/**
* @brief High level generate for batched prompts which encodes inputs and returns decoded outputs.
* @brief Low level generate to be called with already encoded input_ids tokens.
* Streamer cannot be used for multibatch inputs.
*
* @param text input prompt
* @param inputs input_ids tensor or a pair of (input_ids, attention_mask) with encoded input prompt tokens
* @param generation_config optional GenerationConfig
* @return DecodedResults a structure with resulting texts & scores
* @param streamer optional streamer
* @return EncodedResults a structure with resulting tokens and scores
* @throws Exception if the streamer is set for input_ids with multiple batches
*/
DecodedResults generate(const std::vector<std::string>& texts, OptionalGenerationConfig generation_config);
EncodedResults generate(
const EncodedInputs& inputs,
OptionalGenerationConfig generation_config=std::nullopt,
StreamerVariant streamer=std::monostate()
);

/**
* @brief Low level generate to be called with already encoded input_ids tokens.
* Streamer cannot be used for multibatch inputs.
*
* @param input_ids encoded input prompt tokens
* @param attention_mask optional attention_mask
* @param generation_config optional GenerationConfig
* @param streamer optional streamer
* @param inputs input_ids tensor or a pair of (input_ids, attention_mask) with encoded input prompt tokens
* @param properties generation config params passed as properties
* @return EncodedResults a structure with resulting tokens and scores
* @throws Exception if the streamer is set for input_ids with multiple batches
*/
EncodedResults generate(ov::Tensor input_ids,
std::optional<ov::Tensor> attention_mask,
OptionalGenerationConfig generation_config=std::nullopt,
OptionalStreamerVariant streamer=std::nullopt);

template <typename InputsType, typename... Properties>
util::EnableIfAllStringAny<std::string, Properties...> operator()(
InputsType text,
Properties&&... properties) {
return generate(text, AnyMap{std::forward<Properties>(properties)...});
}

DecodedResults operator()(const std::vector<std::string>& text, OptionalGenerationConfig generation_config=std::nullopt) {
return generate(text, generation_config);
}

std::string operator()(
std::string text,
OptionalGenerationConfig generation_config=std::nullopt,
OptionalStreamerVariant streamer=std::nullopt
) {
return generate(text, generation_config, streamer);
template <typename... Properties>
util::EnableIfAllStringAny<EncodedResults, Properties...> generate(
const EncodedInputs& inputs,
Properties&&... properties) {
return generate(inputs, AnyMap{std::forward<Properties>(properties)...});
}

EncodedResults generate(const EncodedInputs& inputs, const ov::AnyMap& config_map);

ov::genai::Tokenizer get_tokenizer();
GenerationConfig get_generation_config() const;
void set_generation_config(const GenerationConfig& generation_config);
Expand All @@ -178,40 +210,8 @@ class OPENVINO_GENAI_EXPORTS LLMPipeline {
std::unique_ptr<LLMPipelineImpl> m_pimpl;
};

/*
* utils that allow to use generate and operator() in the following way:
* pipe.generate(input_ids, ov::max_new_tokens(200), ov::temperature(1.0f),...)
* pipe(text, ov::max_new_tokens(200), ov::temperature(1.0f),...)
*/
static constexpr ov::Property<size_t> max_new_tokens{"max_new_tokens"};
static constexpr ov::Property<size_t> max_length{"max_length"};
static constexpr ov::Property<bool> ignore_eos{"ignore_eos"};

static constexpr ov::Property<size_t> num_beam_groups{"num_beam_groups"};
static constexpr ov::Property<size_t> num_beams{"num_beams"};
static constexpr ov::Property<float> diversity_penalty{"diversity_penalty"};
static constexpr ov::Property<float> length_penalty{"length_penalty"};
static constexpr ov::Property<size_t> num_return_sequences{"num_return_sequences"};
static constexpr ov::Property<size_t> no_repeat_ngram_size{"no_repeat_ngram_size"};
static constexpr ov::Property<StopCriteria> stop_criteria{"stop_criteria"};

static constexpr ov::Property<float> temperature{"temperature"};
static constexpr ov::Property<float> top_p{"top_p"};
static constexpr ov::Property<int> top_k{"top_k"};
static constexpr ov::Property<bool> do_sample{"do_sample"};
static constexpr ov::Property<float> repetition_penalty{"repetition_penalty"};


static constexpr ov::Property<int64_t> pad_token_id{"pad_token_id"};
static constexpr ov::Property<int64_t> bos_token_id{"bos_token_id"};
static constexpr ov::Property<int64_t> eos_token_id{"eos_token_id"};

static constexpr ov::Property<std::string> bos_token{"bos_token"};
static constexpr ov::Property<std::string> eos_token{"eos_token"};

// only a lambda streamer can be set via ov::streamer(),... syntactic sugar,
// because std::variant<StreamerBase, std::function<>> cannot be stored in AnyMap
static constexpr ov::Property<std::function<void (std::string)>> streamer{"streamer"};
std::pair<std::string, Any> streamer(StreamerVariant func);
std::pair<std::string, Any> generation_config(const GenerationConfig& config);

} // namespace genai
} // namespace ov
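
A usage sketch of the reworked LLMPipeline API declared above (not part of this commit; "models/TinyLlama" is a placeholder model directory):

#include "openvino/genai/llm_pipeline.hpp"
#include <iostream>

int main() {
    // Directory with the model xml/bin files, tokenizers and generation_config.json.
    ov::genai::LLMPipeline pipe("models/TinyLlama", "CPU");

    // Single prompt with the default generation config; a single-prompt
    // DecodedResults converts to std::string via the new conversion operator.
    std::string reply = pipe.generate("What is OpenVINO?");
    std::cout << reply << '\n';

    // Properties-style call: properties can be passed in any order, and the
    // streamer() helper wraps a lambda into the StreamerVariant.
    pipe("Tell me a story",
         ov::genai::max_new_tokens(100),
         ov::genai::streamer([](std::string word) { std::cout << word << std::flush; }));
    return 0;
}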
5 changes: 1 addition & 4 deletions src/cpp/include/openvino/genai/streamer_base.hpp
@@ -15,16 +15,13 @@ namespace genai {
*/
class StreamerBase {
public:
Tokenizer m_tokenizer;
explicit StreamerBase(Tokenizer tokenizer): m_tokenizer(tokenizer) {}
StreamerBase() = default;

/// @brief put is called every time a new token is decoded
virtual void put(int64_t token) = 0;

/// @brief end is called at the end of generation. It can be used to flush the cache if your streamer has one
virtual void end() = 0;
};


} // namespace genai
} // namespace ov
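
A minimal custom streamer against the trimmed-down StreamerBase interface (a sketch, not part of this commit). Since StreamerBase no longer owns a Tokenizer, decoding the token ids is up to the derived class; this one only prints them:

#include "openvino/genai/streamer_base.hpp"
#include <iostream>
#include <memory>

class IdPrinterStreamer : public ov::genai::StreamerBase {
public:
    void put(int64_t token) override {
        std::cout << token << ' ';  // called for every newly generated token
    }
    void end() override {
        std::cout << std::endl;     // called once when generation finishes
    }
};

int main() {
    // A std::shared_ptr<StreamerBase> is one of the StreamerVariant alternatives,
    // so this object could be passed directly to LLMPipeline::generate();
    // here it is exercised in isolation.
    auto streamer = std::make_shared<IdPrinterStreamer>();
    streamer->put(42);
    streamer->end();
    return 0;
}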
15 changes: 10 additions & 5 deletions src/cpp/include/openvino/genai/tokenizer.hpp
@@ -12,6 +12,11 @@
namespace ov {
namespace genai {

struct TokenizedInputs {
ov::Tensor input_ids;
ov::Tensor attention_mask;
};

/**
* @brief class is used to encode prompts and decode resulting tokens
*/
@@ -22,22 +27,22 @@ class OPENVINO_GENAI_EXPORTS Tokenizer {
* @param tokenizer_path openvino_tokenizer.xml and openvino_detokenizer.xml should be located in the tokenizer_path
* @param device device. Currently only 'CPU' is supported
*/
Tokenizer(const std::string& tokenizers_path, const std::string& device="CPU", const std::string& ov_tokenizers_path="");
Tokenizer(const std::string& tokenizers_path, const std::string& device="CPU");

/**
* @brief encode a single prompt
* @return pair of [input_ids, attention_mask]
*/
std::pair<ov::Tensor, ov::Tensor> encode(const std::string prompt);
TokenizedInputs encode(const std::string prompt);

/**
* @brief encode batch of prompts. Left padding will be applied by default
* @param prompts vector storing batch of prompts
* @return pair of [input_ids, attention_mask]
*/
std::pair<ov::Tensor, ov::Tensor> encode(std::vector<std::string>& prompts);
std::pair<ov::Tensor, ov::Tensor> encode(std::vector<std::string>&& prompts);
std::pair<ov::Tensor, ov::Tensor> encode(std::initializer_list<std::string>& prompts);
TokenizedInputs encode(std::vector<std::string>& prompts);
TokenizedInputs encode(std::vector<std::string>&& prompts);
TokenizedInputs encode(std::initializer_list<std::string>& prompts);

/**
* @brief decode sequence of tokens
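
A sketch of the reworked encode() API returning the new TokenizedInputs struct (not part of this commit; the tokenizer directory is a placeholder and decode() is not exercised because its overloads are cut off in this diff):

#include "openvino/genai/tokenizer.hpp"
#include <iostream>
#include <string>
#include <vector>

int main() {
    // Directory containing openvino_tokenizer.xml and openvino_detokenizer.xml.
    ov::genai::Tokenizer tokenizer("models/TinyLlama");

    // Single prompt: the result is a TokenizedInputs struct instead of a std::pair.
    ov::genai::TokenizedInputs single = tokenizer.encode(std::string{"Hello world"});
    std::cout << "input_ids shape: " << single.input_ids.get_shape() << '\n';

    // Batch of prompts: left padding is applied by default.
    std::vector<std::string> prompts{"Hello", "A much longer prompt"};
    ov::genai::TokenizedInputs batch = tokenizer.encode(prompts);
    std::cout << "attention_mask shape: " << batch.attention_mask.get_shape() << '\n';
    return 0;
}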
45 changes: 21 additions & 24 deletions src/cpp/src/generation_config.cpp
@@ -55,32 +55,29 @@ GenerationConfig::GenerationConfig(std::string json_path) {

}

GenerationConfig GenerationConfig::anymap_to_generation_config(const ov::AnyMap& config_map) {
void GenerationConfig::update_generation_config(const ov::AnyMap& config_map) {
using ov::genai::utils::read_anymap_param;

GenerationConfig config;
read_anymap_param(config_map, "max_new_tokens", config.max_new_tokens);
read_anymap_param(config_map, "max_length", config.max_length);
read_anymap_param(config_map, "ignore_eos", config.ignore_eos);
read_anymap_param(config_map, "num_beam_groups", config.num_beam_groups);
read_anymap_param(config_map, "num_beams", config.num_beams);
read_anymap_param(config_map, "diversity_penalty", config.diversity_penalty);
read_anymap_param(config_map, "length_penalty", config.length_penalty);
read_anymap_param(config_map, "num_return_sequences", config.num_return_sequences);
read_anymap_param(config_map, "no_repeat_ngram_size", config.no_repeat_ngram_size);
read_anymap_param(config_map, "stop_criteria", config.stop_criteria);
read_anymap_param(config_map, "temperature", config.temperature);
read_anymap_param(config_map, "top_p", config.top_p);
read_anymap_param(config_map, "top_k", config.top_k);
read_anymap_param(config_map, "do_sample", config.do_sample);
read_anymap_param(config_map, "repetition_penalty", config.repetition_penalty);
read_anymap_param(config_map, "pad_token_id", config.pad_token_id);
read_anymap_param(config_map, "bos_token_id", config.bos_token_id);
read_anymap_param(config_map, "eos_token_id", config.eos_token_id);
read_anymap_param(config_map, "bos_token", config.bos_token);
read_anymap_param(config_map, "eos_token", config.eos_token);

return config;
read_anymap_param(config_map, "max_new_tokens", max_new_tokens);
read_anymap_param(config_map, "max_length", max_length);
read_anymap_param(config_map, "ignore_eos", ignore_eos);
read_anymap_param(config_map, "num_beam_groups", num_beam_groups);
read_anymap_param(config_map, "num_beams", num_beams);
read_anymap_param(config_map, "diversity_penalty", diversity_penalty);
read_anymap_param(config_map, "length_penalty", length_penalty);
read_anymap_param(config_map, "num_return_sequences", num_return_sequences);
read_anymap_param(config_map, "no_repeat_ngram_size", no_repeat_ngram_size);
read_anymap_param(config_map, "stop_criteria", stop_criteria);
read_anymap_param(config_map, "temperature", temperature);
read_anymap_param(config_map, "top_p", top_p);
read_anymap_param(config_map, "top_k", top_k);
read_anymap_param(config_map, "do_sample", do_sample);
read_anymap_param(config_map, "repetition_penalty", repetition_penalty);
read_anymap_param(config_map, "pad_token_id", pad_token_id);
read_anymap_param(config_map, "bos_token_id", bos_token_id);
read_anymap_param(config_map, "eos_token_id", eos_token_id);
read_anymap_param(config_map, "bos_token", bos_token);
read_anymap_param(config_map, "eos_token", eos_token);
}

size_t GenerationConfig::get_max_new_tokens(size_t prompt_length) const {
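
The switch from the static anymap_to_generation_config() factory to the update_generation_config() member changes the semantics from "build a fresh config" to "patch an existing one". A short sketch of the difference (illustrative values only; assumes GenerationConfig is default-constructible and its fields are public, as the member accesses above suggest):

#include "openvino/genai/generation_config.hpp"

int main() {
    ov::genai::GenerationConfig config;   // e.g. previously loaded from generation_config.json
    config.max_new_tokens = 64;

    // Only the keys present in the map are overwritten; max_new_tokens stays 64.
    ov::AnyMap overrides{{"temperature", 0.7f}, {"do_sample", true}};
    config.update_generation_config(overrides);
    return 0;
}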
Empty file.
2 changes: 1 addition & 1 deletion src/cpp/src/greedy_decoding.cpp
@@ -92,7 +92,7 @@ EncodedResults greedy_decoding(
for (size_t i = 0; i < max_tokens - 1; ++i) {
utils::update_position_ids(m_model_runner.get_tensor("position_ids"), m_model_runner.get_tensor("attention_mask"));
m_model_runner.set_tensor("attention_mask", utils::extend_attention(m_model_runner.get_tensor("attention_mask")));

// todo: consider replacing with start_async and running the callback right after it
m_model_runner.infer();
auto logits = m_model_runner.get_tensor("logits");
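
The TODO added above hints at overlapping token streaming with inference. A sketch of what the start_async() pattern could look like using the standard ov::InferRequest API (not part of this commit; `request` stands in for m_model_runner from the loop above):

#include "openvino/runtime/infer_request.hpp"
#include "openvino/runtime/tensor.hpp"

void decode_step(ov::InferRequest& request) {
    request.start_async();   // kick off inference without blocking the host thread
    // ...a streamer callback could run here, overlapping with the device work...
    request.wait();          // block until the results are ready
    ov::Tensor logits = request.get_tensor("logits");
    (void)logits;            // next-token sampling would consume the logits here
}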