From 482fa791f926143fec05ae564363d4cd0cf93508 Mon Sep 17 00:00:00 2001 From: Ilya Lavrenov Date: Wed, 1 Jan 2025 14:12:56 +0400 Subject: [PATCH 1/5] Updated real_models list (#1459) --- tests/python_tests/models/real_models | 27 ++++++++++++++++++--------- 1 file changed, 18 insertions(+), 9 deletions(-) diff --git a/tests/python_tests/models/real_models b/tests/python_tests/models/real_models index 98fa18bd5e..420f8f53b6 100644 --- a/tests/python_tests/models/real_models +++ b/tests/python_tests/models/real_models @@ -11,7 +11,7 @@ EleutherAI/gpt-neo-2.7B EleutherAI/gpt-neox-20b EleutherAI/pythia-160m GAIR/Abel-7B-002 -# OrionStarAI/Orion-14B-Base: pip install flash_attn (https://github.com/huggingface/transformers/pull/30954) +OrionStarAI/Orion-14B-Base PygmalionAI/pygmalion-6b Qwen/Qwen-7B Qwen/Qwen-7B-Chat @@ -21,6 +21,8 @@ Qwen/Qwen1.5-7B Qwen/Qwen1.5-7B-Chat Qwen/Qwen1.5-MoE-A2.7B Qwen/Qwen1.5-MoE-A2.7B-Chat +Qwen/Qwen2-7B +Qwen/Qwen2-7B-Instruct Salesforce/codegen-350M-multi Salesforce/codegen-350M-nl Salesforce/codegen2-1b @@ -48,15 +50,16 @@ bigscience/bloomz-1b7 bigscience/bloomz-560m bigscience/bloomz-7b1 cerebras/Cerebras-GPT-13B -# core42/jais-13b: wrong output with PA -# core42/jais-13b-chat: wrong output with PA +core42/jais-13b +core42/jais-13b-chat databricks/dolly-v1-6b databricks/dolly-v2-3b # deepseek-ai/deepseek-coder-33b-instruct: OpenVINO tokenizers - Cannot convert tokenizer of this type without `.model` file # deepseek-ai/deepseek-coder-6.7b-instruct: OpenVINO tokenizers - Cannot convert tokenizer of this type without `.model` file -# deepseek-ai/deepseek-moe-16b-base: optimum - Trying to export a deepseek model, that is a custom or unsupported architecture -# facebook/blenderbot-3B: optimum - IndexError: tuple index out of range -# facebook/incoder-1B: CB - Failed to detect "eos_token_id" in openvino_tokenizer.xml runtime information +deepseek-ai/deepseek-moe-16b-base +deepseek-ai/DeepSeek-V3-Base +facebook/blenderbot-3B +facebook/incoder-1B facebook/opt-1.3b facebook/opt-125m facebook/opt-2.7b @@ -66,6 +69,7 @@ google/gemma-1.1-7b-it google/gemma-2b google/gemma-2b-it google/gemma-7b +google/gemma-2-9b google/pegasus-big_patent google/pegasus-large gpt2 @@ -86,6 +90,10 @@ microsoft/DialoGPT-medium microsoft/Orca-2-7b microsoft/Phi-3-mini-128k-instruct microsoft/Phi-3-mini-4k-instruct +microsoft/Phi-3-medium-128k-instruct +microsoft/Phi-3-small-8k-instruct +microsoft/Phi-3-small-128k-instruct +microsoft/Phi-3.5-MoE-instruct # microsoft/biogpt: OpenVINO Tokenizers - openvino.runtime.exceptions.OVTypeError: Tokenizer type is not supported: microsoft/phi-1_5 microsoft/phi-2 @@ -106,10 +114,10 @@ openbmb/MiniCPM-2B-dpo-bf16 openbmb/MiniCPM-2B-sft-bf16 openchat/openchat_3.5 openlm-research/open_llama_13b -# openlm-research/open_llama_3b: CPU - head size must be multiple of 16, current: 100 -# openlm-research/open_llama_3b_v2: CPU - head size must be multiple of 16, current: 100 +openlm-research/open_llama_3b +openlm-research/open_llama_3b_v2 # replit/replit-code-v1-3b: OpenVINO Tokenizers - AttributeError: 'ReplitLMTokenizer' object has no attribute 'sp_model' -# rinna/bilingual-gpt-neox-4b: OpenVINO Tokenizers - trash output (https://jira.devtools.intel.com/browse/CVS-142063) +rinna/bilingual-gpt-neox-4b rinna/youri-7b-chat stabilityai/stable-code-3b stabilityai/stable-zephyr-3b @@ -120,3 +128,4 @@ tiiuae/falcon-rw-7b togethercomputer/RedPajama-INCITE-Chat-3B-v1 # xverse/XVERSE-7B-Chat: Transformers - Exception: data did not match any variant of untagged enum 
PyPreTokenizerTypeWrapper at line 78 column 3 # xverse/XVERSE-MoE-A4.2B: Transformers - Exception: data did not match any variant of untagged enum PyPreTokenizerTypeWrapper at line 78 column 3 +Deci/DeciLM-7B \ No newline at end of file From 2ab8fa8578e120e5978294fa8a69efac72874a87 Mon Sep 17 00:00:00 2001 From: Gorokhov Dmitriy Date: Fri, 3 Jan 2025 11:57:57 +0400 Subject: [PATCH 2/5] Removed usage of deprecated ov::affinity property (#1467) OpenVINO PR: https://github.com/openvinotoolkit/openvino/pull/28247 --- src/python/py_utils.cpp | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/python/py_utils.cpp b/src/python/py_utils.cpp index 5c042d83d9..1fc34a36d2 100644 --- a/src/python/py_utils.cpp +++ b/src/python/py_utils.cpp @@ -262,8 +262,6 @@ ov::Any py_object_to_any(const py::object& py_obj, std::string property_name) { return py::cast(py_obj); } else if (py::isinstance(py_obj)) { return py::cast(py_obj); - } else if (py::isinstance(py_obj)) { - return py::cast(py_obj); } else if (py::isinstance(py_obj)) { return py::cast(py_obj); } else if (py::isinstance>(py_obj)) { From 42f3053afdaa61d36958324aa834c3e2c951eedd Mon Sep 17 00:00:00 2001 From: Ilya Lavrenov Date: Fri, 3 Jan 2025 17:35:51 +0400 Subject: [PATCH 3/5] Don't throw when some generation parameters are ignored (#1473) Relaxed some checks introduced in https://github.com/openvinotoolkit/openvino.genai/pull/1448 Tickets: - CVS-159996 - CVS-159998 - CVS-160041 - CVS-160009 - CVS-160035 --- src/cpp/src/generation_config.cpp | 14 +++++++------- tests/python_tests/test_generation_config.py | 18 +++++++++--------- 2 files changed, 16 insertions(+), 16 deletions(-) diff --git a/src/cpp/src/generation_config.cpp b/src/cpp/src/generation_config.cpp index 59be603fd9..25402e22e7 100644 --- a/src/cpp/src/generation_config.cpp +++ b/src/cpp/src/generation_config.cpp @@ -230,9 +230,9 @@ void GenerationConfig::validate() const { OPENVINO_ASSERT(temperature > 0, "When 'do_sample' is true, temperature must be a strictly positive float, but got ", temperature); } else { // parameters requiring multinomial - OPENVINO_ASSERT(top_k == std::numeric_limits::max(), "When 'do_sample' is false, top_k must be max of size_t, but got ", top_k); - OPENVINO_ASSERT(top_p == 1.0f, "When 'do_sample' is false, top_p must be 1.0f, but got ", top_p); - OPENVINO_ASSERT(temperature == 1.0f, "When 'do_sample' is false, temperature must be a 1.0f, but got ", temperature); + // OPENVINO_ASSERT(top_k == std::numeric_limits::max(), "When 'do_sample' is false, top_k must be max of size_t, but got ", top_k); + // OPENVINO_ASSERT(top_p == 1.0f, "When 'do_sample' is false, top_p must be 1.0f, but got ", top_p); + // OPENVINO_ASSERT(temperature == 1.0f, "When 'do_sample' is false, temperature must be a 1.0f, but got ", temperature); } if (is_beam_search()) { @@ -252,10 +252,10 @@ void GenerationConfig::validate() const { } } else { // parameters requiring beam search - OPENVINO_ASSERT(num_beam_groups == 1, "'num_beam_groups' is supported by beam search only and should be 1 otherwise, but got ", num_beam_groups); - OPENVINO_ASSERT(no_repeat_ngram_size == std::numeric_limits::max(), "'no_repeat_ngram_size' is supported only by beam search, otherwise should be set to max of size_t, but got ", no_repeat_ngram_size); - OPENVINO_ASSERT(diversity_penalty == 0.0f, "'diversity_penalty' is set to ", diversity_penalty, " (default is 0.0f), which is supported only by beam search sampling"); - OPENVINO_ASSERT(length_penalty == 1.0f, "'length_penalty' is set to ", length_penalty, " 
(default is 1.0f), which is supported only by beam search sampling"); + // OPENVINO_ASSERT(num_beam_groups == 1, "'num_beam_groups' is supported by beam search only and should be 1 otherwise, but got ", num_beam_groups); + // OPENVINO_ASSERT(no_repeat_ngram_size == std::numeric_limits::max(), "'no_repeat_ngram_size' is supported only by beam search, otherwise should be set to max of size_t, but got ", no_repeat_ngram_size); + // OPENVINO_ASSERT(diversity_penalty == 0.0f, "'diversity_penalty' is set to ", diversity_penalty, " (default is 0.0f), which is supported only by beam search sampling"); + // OPENVINO_ASSERT(length_penalty == 1.0f, "'length_penalty' is set to ", length_penalty, " (default is 1.0f), which is supported only by beam search sampling"); } // assistant generation diff --git a/tests/python_tests/test_generation_config.py b/tests/python_tests/test_generation_config.py index 110caaf0e5..0a42685b05 100644 --- a/tests/python_tests/test_generation_config.py +++ b/tests/python_tests/test_generation_config.py @@ -23,6 +23,10 @@ dict(max_new_tokens=1, do_sample=True, top_k=1), dict(max_new_tokens=1, do_sample=True, top_p=0.5), dict(max_new_tokens=1, do_sample=True, temperature=0.5), + # parameters requiring multimonial are ignored when do_sample=False + dict(max_new_tokens=1, top_k=1), # requires do_sample=True + dict(max_new_tokens=1, top_p=0.5), # requires do_sample=True + dict(max_new_tokens=1, temperature=2.0), # requires do_sample=True # beam search dict(max_new_tokens=1, num_beams=2), dict(max_new_tokens=1, num_beams=2, num_return_sequences=1), @@ -30,6 +34,11 @@ dict(max_new_tokens=1, num_beams=4, num_beam_groups=2, diversity_penalty=1.0), dict(max_new_tokens=1, num_beams=4, length_penalty=1.0), dict(max_new_tokens=1, num_beams=4, no_repeat_ngram_size=2), + # parameters requiring beam search are ignored when num_beams == 1 + dict(max_new_tokens=1, num_beam_groups=2), # requiring beam search + dict(max_new_tokens=1, no_repeat_ngram_size=2), # requiring beam search + dict(max_new_tokens=1, diversity_penalty=1.0), # requiring beam search + dict(max_new_tokens=1, length_penalty=2), # requiring beam search # assistant generation dict(max_new_tokens=1, assistant_confidence_threshold=0.5), dict(max_new_tokens=1, num_assistant_tokens=2), @@ -66,10 +75,6 @@ def test_valid_configs(generation_config_kwargs): dict(max_new_tokens=1, do_sample=True, top_p=1.1), # 'top_p' must be within (0, 1] when 'do_sample' is True dict(max_new_tokens=1, do_sample=True, top_p=0), # 'top_p' must be within (0, 1] when 'do_sample' is True dict(max_new_tokens=1, do_sample=True, temperature=-1.0), # invalid temp - # parameters requiring multimonial - dict(max_new_tokens=1, top_k=1), # requires do_sample=True - dict(max_new_tokens=1, top_p=0.5), # requires do_sample=True - dict(max_new_tokens=1, temperature=2.0), # requires do_sample=True # beam search dict(max_new_tokens=1, num_beams=2, num_return_sequences=3), # 'num_beams' must be >= 'num_return_sequences' dict(max_new_tokens=1, num_beams=3, num_beam_groups=2), # 'num_beams' must be divisible by 'num_beam_groups' @@ -80,11 +85,6 @@ def test_valid_configs(generation_config_kwargs): dict(max_new_tokens=1, num_beams=2, frequency_penalty=1.0), # 'frequency_penalty' is not supported by beam search dict(max_new_tokens=1, num_beams=2, presence_penalty=1.0), # 'presence_penalty' is not supported by beam search dict(max_new_tokens=1, num_beams=2, repetition_penalty=0.0), # 'repetition_penalty' is not supported by beam search - # parameters requiring beam search - 
dict(max_new_tokens=1, num_beam_groups=2), # requiring beam search - dict(max_new_tokens=1, no_repeat_ngram_size=2), # requiring beam search - dict(max_new_tokens=1, diversity_penalty=1.0), # requiring beam search - dict(max_new_tokens=1, length_penalty=2), # requiring beam search # assistant generation dict(max_new_tokens=1, num_assistant_tokens=2, do_sample=True, num_return_sequences=2), # 'num_return_sequences' must be 1, as we cannot use different number of tokens per sequence within a group dict(max_new_tokens=1, assistant_confidence_threshold=1.0, do_sample=True, num_return_sequences=2), # 'num_return_sequences' must be 1, as we cannot use different number of tokens per sequence within a group From 1fd1430af56ad6eb630917018267077464b0d76a Mon Sep 17 00:00:00 2001 From: Ilya Lavrenov Date: Sat, 4 Jan 2025 03:12:03 +0400 Subject: [PATCH 4/5] Move tests on sampling to test_sampling.py (#1465) - Move extensive tests on decoding / sampling from test_llm_pipeline.py tests to test_sampling.py - Partially refactored common functions in common.py to be more generic (to be continued in next PRs) - Dropped partially predefined functions with generation configs and replaced them in tests with dict of generation parameters, so you can better see tests params closer to tests itself and avoid creating numerous get_** for new generation values combinations. - Sampling tests are now implemented on top of stateful model for better comparison with optimum-intel --- .github/workflows/mac.yml | 4 +- .../openvino_genai/py_openvino_genai.pyi | 1 + .../py_continuous_batching_pipeline.cpp | 15 +- tests/python_tests/common.py | 346 +++++++++--------- tests/python_tests/ov_genai_test_utils.py | 49 ++- .../python_tests/test_continuous_batching.py | 38 +- tests/python_tests/test_kv_cache_eviction.py | 6 +- tests/python_tests/test_llm_pipeline.py | 327 +++-------------- .../python_tests/test_llm_pipeline_static.py | 40 +- tests/python_tests/test_sampling.py | 224 +++++------- tests/python_tests/test_vlm_pipeline.py | 17 +- .../tests/test_cli_image.py | 9 +- 12 files changed, 410 insertions(+), 666 deletions(-) diff --git a/.github/workflows/mac.yml b/.github/workflows/mac.yml index fb66271ff7..5402b79e70 100644 --- a/.github/workflows/mac.yml +++ b/.github/workflows/mac.yml @@ -1,4 +1,4 @@ -name: macOS (12, Python 3.9) +name: macOS (12, Python 3.10) on: workflow_dispatch: pull_request: @@ -16,7 +16,7 @@ concurrency: cancel-in-progress: true env: - PYTHON_VERSION: '3.9' + PYTHON_VERSION: '3.10' OV_BRANCH: master OV_TARBALL: '' diff --git a/src/python/openvino_genai/py_openvino_genai.pyi b/src/python/openvino_genai/py_openvino_genai.pyi index 5d82fa89a3..9ff28859b9 100644 --- a/src/python/openvino_genai/py_openvino_genai.pyi +++ b/src/python/openvino_genai/py_openvino_genai.pyi @@ -697,6 +697,7 @@ class GenerationResult: """ m_generation_ids: list[str] m_scores: list[float] + m_status: GenerationStatus def __init__(self) -> None: ... 
def __repr__(self) -> str: diff --git a/src/python/py_continuous_batching_pipeline.cpp b/src/python/py_continuous_batching_pipeline.cpp index 2b48e4d44d..48eb124255 100644 --- a/src/python/py_continuous_batching_pipeline.cpp +++ b/src/python/py_continuous_batching_pipeline.cpp @@ -119,6 +119,13 @@ std::ostream& operator << (std::ostream& stream, const GenerationResult& generat } // namespace void init_continuous_batching_pipeline(py::module_& m) { + py::enum_(m, "GenerationStatus") + .value("RUNNING", ov::genai::GenerationStatus::RUNNING) + .value("FINISHED", ov::genai::GenerationStatus::FINISHED) + .value("IGNORED", ov::genai::GenerationStatus::IGNORED) + .value("DROPPED_BY_PIPELINE", ov::genai::GenerationStatus::DROPPED_BY_PIPELINE) + .value("DROPPED_BY_HANDLE", ov::genai::GenerationStatus::DROPPED_BY_HANDLE); + py::class_(m, "GenerationResult", generation_result_docstring) .def(py::init<>()) .def_readonly("m_request_id", &GenerationResult::m_request_id) @@ -130,6 +137,7 @@ void init_continuous_batching_pipeline(py::module_& m) { r.m_generation_ids = generation_ids; }) .def_readwrite("m_scores", &GenerationResult::m_scores) + .def_readwrite("m_status", &GenerationResult::m_status) .def("__repr__", [](const GenerationResult &r) -> py::str { std::stringstream stream; @@ -148,13 +156,6 @@ void init_continuous_batching_pipeline(py::module_& m) { .def_readwrite("m_generation_ids", &EncodedGenerationResult::m_generation_ids) .def_readwrite("m_scores", &EncodedGenerationResult::m_scores); - py::enum_(m, "GenerationStatus") - .value("RUNNING", ov::genai::GenerationStatus::RUNNING) - .value("FINISHED", ov::genai::GenerationStatus::FINISHED) - .value("IGNORED", ov::genai::GenerationStatus::IGNORED) - .value("DROPPED_BY_PIPELINE", ov::genai::GenerationStatus::DROPPED_BY_PIPELINE) - .value("DROPPED_BY_HANDLE", ov::genai::GenerationStatus::DROPPED_BY_HANDLE); - py::enum_(m, "GenerationFinishReason") .value("NONE", ov::genai::GenerationFinishReason::NONE) .value("STOP", ov::genai::GenerationFinishReason::STOP) diff --git a/tests/python_tests/common.py b/tests/python_tests/common.py index 9040fa435f..dc58d1ad2f 100644 --- a/tests/python_tests/common.py +++ b/tests/python_tests/common.py @@ -7,7 +7,7 @@ from optimum.intel import OVModelForCausalLM from pathlib import Path -from openvino_genai import ContinuousBatchingPipeline, SchedulerConfig, GenerationResult, GenerationConfig +from openvino_genai import ContinuousBatchingPipeline, LLMPipeline, SchedulerConfig, GenerationResult, GenerationConfig, DecodedResults, StopCriteria from transformers import AutoTokenizer, AutoModelForCausalLM from transformers import GenerationConfig as HFGenerationConfig from typing import List, Tuple @@ -20,20 +20,6 @@ def get_greedy() -> GenerationConfig: generation_config.max_new_tokens = 30 return generation_config -def get_greedy_with_min_and_max_tokens() -> GenerationConfig: - generation_config = GenerationConfig() - generation_config.num_return_sequences = 1 - generation_config.min_new_tokens = 15 - generation_config.max_new_tokens = 30 - return generation_config - -def get_greedy_with_repetition_penalty() -> GenerationConfig: - generation_config = GenerationConfig() - generation_config.num_return_sequences = 1 - generation_config.repetition_penalty = 2.0 - generation_config.max_new_tokens = 30 - return generation_config - def get_greedy_with_penalties() -> GenerationConfig: generation_config = GenerationConfig() generation_config.num_return_sequences = 1 @@ -42,33 +28,6 @@ def get_greedy_with_penalties() -> 
GenerationConfig: generation_config.max_new_tokens = 30 return generation_config -def get_greedy_with_single_stop_string() -> GenerationConfig: - generation_config = GenerationConfig() - generation_config.num_return_sequences = 1 - generation_config.min_new_tokens = 15 - generation_config.max_new_tokens = 50 - generation_config.stop_strings = {"anag"} # expected match on "manage" - generation_config.include_stop_str_in_output = True - return generation_config - -def get_greedy_with_multiple_stop_strings() -> GenerationConfig: - generation_config = GenerationConfig() - generation_config.num_return_sequences = 1 - generation_config.min_new_tokens = 1 - generation_config.max_new_tokens = 50 - generation_config.stop_strings = {".", "software", "Intel"} - generation_config.include_stop_str_in_output = True - return generation_config - -def get_greedy_with_multiple_stop_strings_no_match() -> GenerationConfig: - generation_config = GenerationConfig() - generation_config.num_return_sequences = 1 - generation_config.min_new_tokens = 1 - generation_config.max_new_tokens = 50 - generation_config.stop_strings = {"Einstein", "sunny", "geothermal"} - generation_config.include_stop_str_in_output = True - return generation_config - def get_beam_search() -> GenerationConfig: generation_config = GenerationConfig() generation_config.num_beam_groups = 3 @@ -79,78 +38,6 @@ def get_beam_search() -> GenerationConfig: generation_config.num_return_sequences = generation_config.num_beams return generation_config -def get_beam_search_min_and_max_tokens() -> GenerationConfig: - generation_config = GenerationConfig() - generation_config.num_beam_groups = 3 - generation_config.num_beams = 6 - generation_config.diversity_penalty = 1 - generation_config.min_new_tokens = 15 - generation_config.max_new_tokens = 30 - generation_config.num_return_sequences = 3 - generation_config.num_return_sequences = generation_config.num_beams - return generation_config - -def get_beam_search_with_single_stop_string() -> GenerationConfig: - generation_config = GenerationConfig() - generation_config.num_beam_groups = 3 - generation_config.num_beams = 6 - generation_config.diversity_penalty = 1 - generation_config.max_new_tokens = 50 - generation_config.num_return_sequences = generation_config.num_beams - generation_config.stop_strings = {"open sour"} # expected match on "open source" - generation_config.include_stop_str_in_output = True - return generation_config - -def get_beam_search_with_multiple_stop_strings() -> GenerationConfig: - generation_config = GenerationConfig() - generation_config.num_beam_groups = 3 - generation_config.num_beams = 6 - generation_config.diversity_penalty = 1 - generation_config.max_new_tokens = 50 - generation_config.num_return_sequences = generation_config.num_beams - generation_config.stop_strings = {".", "software", "Intel"} - generation_config.include_stop_str_in_output = True - return generation_config - -def get_beam_search_with_multiple_stop_strings_no_match() -> GenerationConfig: - generation_config = GenerationConfig() - generation_config.num_beam_groups = 3 - generation_config.num_beams = 6 - generation_config.diversity_penalty = 1 - generation_config.max_new_tokens = 30 - generation_config.num_return_sequences = generation_config.num_beams - generation_config.stop_strings = {"Einstein", "sunny", "geothermal"} - generation_config.include_stop_str_in_output = True - return generation_config - -def get_greedy_stop_strings_exclude_from_output() -> GenerationConfig: - generation_config = 
GenerationConfig() - generation_config.max_new_tokens = 30 - generation_config.stop_strings = { "machines" } - generation_config.include_stop_str_in_output = False - return generation_config - -def get_greedy_stop_strings_include_to_output() -> GenerationConfig: - generation_config = GenerationConfig() - generation_config.max_new_tokens = 30 - generation_config.stop_strings = { "machines" } - generation_config.include_stop_str_in_output = True - return generation_config - -def get_greedy_n_stop_strings_exclude_from_output() -> GenerationConfig: - generation_config = GenerationConfig() - generation_config.max_new_tokens = 30 - generation_config.stop_strings = { "machines", "manage" } - generation_config.include_stop_str_in_output = False - return generation_config - -def get_greedy_n_stop_strings_include_to_output() -> GenerationConfig: - generation_config = GenerationConfig() - generation_config.max_new_tokens = 30 - generation_config.stop_strings = { "machines", "manage" } - generation_config.include_stop_str_in_output = True - return generation_config - def get_multinomial_temperature() -> GenerationConfig: generation_config = GenerationConfig() generation_config.do_sample = True @@ -288,8 +175,10 @@ def convert_to_hf( default_generation_config : HFGenerationConfig, generation_config : GenerationConfig ) -> HFGenerationConfig: - kwargs = {} + if generation_config is None: + return + kwargs = {} # generic parameters kwargs['max_length'] = generation_config.max_length # has higher priority than 'max_length' @@ -300,8 +189,16 @@ def convert_to_hf( # copy default parameters kwargs['bos_token_id'] = default_generation_config.bos_token_id - kwargs['eos_token_id'] = default_generation_config.eos_token_id kwargs['pad_token_id'] = default_generation_config.pad_token_id + + if len(generation_config.stop_token_ids) > 0: + kwargs['eos_token_id'] = list(generation_config.stop_token_ids) + elif generation_config.eos_token_id != -1: + kwargs['eos_token_id'] = generation_config.eos_token_id + else: + kwargs['eos_token_id'] = default_generation_config.eos_token_id + + # copy penalties kwargs['repetition_penalty'] = generation_config.repetition_penalty if generation_config.is_beam_search(): @@ -312,8 +209,20 @@ def convert_to_hf( kwargs['no_repeat_ngram_size'] = generation_config.no_repeat_ngram_size kwargs['num_return_sequences'] = generation_config.num_return_sequences kwargs['output_scores'] = True + if generation_config.num_beam_groups > 1: kwargs['diversity_penalty'] = generation_config.diversity_penalty + + # in OpenVINO GenAI this parameter is called stop_criteria, + # while in HF it's called early_stopping. 
+ # HF values True, False and "never" correspond to OV GenAI values "EARLY", "HEURISTIC" and "NEVER" + STOP_CRITERIA_MAP = { + StopCriteria.NEVER: "never", + StopCriteria.EARLY: True, + StopCriteria.HEURISTIC: False + } + + kwargs['early_stopping'] = STOP_CRITERIA_MAP[generation_config.stop_criteria] elif generation_config.is_multinomial(): # mulitinomial kwargs['temperature'] = generation_config.temperature @@ -332,23 +241,55 @@ def run_hugging_face( opt_model, hf_tokenizer, prompts: List[str], - generation_configs: List[GenerationConfig], + generation_configs: List[GenerationConfig] | GenerationConfig, ) -> List[GenerationResult]: generation_results = [] - for prompt, generation_config in zip(prompts, generation_configs): - inputs = hf_tokenizer(prompt, return_tensors="pt") - prompt_len = inputs['input_ids'].numel() - generate_outputs = opt_model.generate(input_ids=inputs['input_ids'], attention_mask=inputs['attention_mask'], - generation_config=convert_to_hf(opt_model.generation_config, generation_config), - return_dict_in_generate=True, tokenizer=hf_tokenizer) - all_text_batch = hf_tokenizer.batch_decode([generated_ids[prompt_len:] for generated_ids in generate_outputs.sequences], skip_special_tokens=True) - generation_result = GenerationResult() - generation_result.m_generation_ids = all_text_batch - # sequences_scores are available only for beam search case - if generation_config.is_beam_search(): - generation_result.m_scores = [score for score in generate_outputs.sequences_scores] - generation_results.append(generation_result) + if type(generation_configs) is list: + # process prompt by promp as we have multiple generation configs + for prompt, generation_config in zip(prompts, generation_configs): + hf_generation_config = convert_to_hf(opt_model.generation_config, generation_config) + inputs = hf_tokenizer(prompt, return_tensors="pt") + input_ids, attention_mask = inputs['input_ids'], inputs['attention_mask'] + prompt_len = 0 if generation_config.echo else input_ids.numel() + + generate_outputs = opt_model.generate(input_ids=input_ids, attention_mask=attention_mask, generation_config=hf_generation_config, + return_dict_in_generate=True, tokenizer=hf_tokenizer) + all_text_batch = hf_tokenizer.batch_decode([generated_ids[prompt_len:] for generated_ids in generate_outputs.sequences], skip_special_tokens=True) + + generation_result = GenerationResult() + generation_result.m_generation_ids = all_text_batch + # sequences_scores are available only for beam search case + if generation_config.is_beam_search(): + generation_result.m_scores = [score for score in generate_outputs.sequences_scores] + generation_results.append(generation_result) + else: + # process all prompts as a single batch as we have a single generation config for all prompts + inputs = hf_tokenizer(prompts, return_tensors='pt', padding=True, truncation=True, add_special_tokens=True, padding_side='left') + input_ids, attention_mask = inputs['input_ids'], inputs['attention_mask'] + hf_generation_config = convert_to_hf(opt_model.generation_config, generation_configs) + hf_encoded_outputs = opt_model.generate(input_ids, attention_mask=attention_mask, generation_config=hf_generation_config, + return_dict_in_generate=True, tokenizer=hf_tokenizer) + + generation_ids = [] + scores = [] + + for idx, hf_encoded_out in enumerate(hf_encoded_outputs.sequences): + prompt_idx = idx // hf_generation_config.num_return_sequences + prompt_len = 0 if generation_configs.echo else input_ids[prompt_idx].numel() + decoded_text = 
hf_tokenizer.decode(hf_encoded_out[prompt_len:], skip_special_tokens=True) + generation_ids.append(decoded_text) + if generation_configs.is_beam_search(): + scores.append(hf_encoded_outputs.sequences_scores[idx]) + + # if we need to move to next generation result + if (idx + 1) // hf_generation_config.num_return_sequences != prompt_idx: + generation_result = GenerationResult() + generation_result.m_generation_ids = generation_ids + generation_result.m_scores = scores + generation_results.append(generation_result) + generation_ids = [] + scores = [] del hf_tokenizer del opt_model @@ -360,16 +301,65 @@ def run_continuous_batching( models_path : Path, scheduler_config : SchedulerConfig, prompts: List[str], - generation_configs : List[GenerationConfig] + generation_configs : List[GenerationConfig] | GenerationConfig ) -> List[GenerationResult]: - pipe = ContinuousBatchingPipeline(models_path, scheduler_config, "CPU") - output = pipe.generate(prompts, generation_configs) - del pipe + if type(generation_configs) is not list: + generation_configs = [generation_configs] * len(prompts) + + cb_pipe = ContinuousBatchingPipeline(models_path, scheduler_config=scheduler_config, device='CPU') + output = cb_pipe.generate(prompts, generation_configs) + + del cb_pipe shutil.rmtree(models_path) + return output -def compare_results(hf_result: GenerationResult, ov_result: GenerationResult, generation_config: GenerationConfig): +def get_default_properties(): + import openvino.properties.hint as hints + import openvino as ov + + return { + hints.inference_precision : ov.Type.f32, + hints.kv_cache_precision : ov.Type.f16, + } + + +def run_llm_pipeline( + models_path : Path, + prompts: List[str], + generation_config : GenerationConfig, + use_cb : bool = False +) -> List[GenerationResult]: + properties = get_default_properties() + if use_cb: + properties['scheduler_config'] = SchedulerConfig() + + ov_pipe = LLMPipeline(models_path, device='CPU', **properties) + + generate_outputs : DecodedResults = ov_pipe.generate(inputs=prompts, generation_config=generation_config) + + index = 0 + generation_results = [] + + for _ in prompts: + generation_result = GenerationResult() + + generation_result.m_generation_ids = generate_outputs.texts[index : index + generation_config.num_return_sequences] + # sequences_scores are available only for beam search case + if generation_config.is_beam_search(): + generation_result.m_scores = generate_outputs.scores[index : index + generation_config.num_return_sequences] + generation_results.append(generation_result) + + index += generation_config.num_return_sequences + + del ov_pipe + shutil.rmtree(models_path) + + return generation_results + + +def compare_generation_result(hf_result: GenerationResult, ov_result: GenerationResult, generation_config: GenerationConfig): if generation_config.is_beam_search(): assert len(hf_result.m_scores) == len(ov_result.m_scores) for hf_score, ov_score in zip(hf_result.m_scores, ov_result.m_scores): @@ -386,46 +376,79 @@ def compare_results(hf_result: GenerationResult, ov_result: GenerationResult, ge assert hf_text == ov_text -def get_hugging_face_model_and_tokenizer(model_id: str, use_optimum = True): +def compare_generation_results(prompts: List[str], hf_results: List[GenerationResult], ov_results: List[GenerationResult], generation_configs: List[GenerationConfig] | GenerationConfig): + if type(generation_configs) is not list: + generation_configs = [generation_configs] + + assert len(prompts) == len(hf_results) + assert len(prompts) == len(ov_results) + 
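+    # compare each pair of HF reference and OpenVINO generation results, using the corresponding generation config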
+ for prompt, ref_result, ov_result, generation_config in zip(prompts, hf_results, ov_results, generation_configs): + print(f"Prompt = {prompt}\nReference result = {ref_result}\nOpenVINO result = {ov_result.m_generation_ids}") + compare_generation_result(ref_result, ov_result, generation_config) + + +def get_hugging_face_models(model_id: str): hf_tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True) - opt_model = OVModelForCausalLM.from_pretrained(model_id, export=True, trust_remote_code=True) if use_optimum else \ - AutoModelForCausalLM.from_pretrained(model_id) + opt_model = OVModelForCausalLM.from_pretrained(model_id, export=True, trust_remote_code=True, ov_config=get_default_properties()) return opt_model, hf_tokenizer -def save_ov_model_from_optimum(model, hf_tokenizer, models_path: Path): - model.save_pretrained(models_path) +def convert_models(opt_model : OVModelForCausalLM, hf_tokenizer : AutoTokenizer, models_path: Path): + opt_model.save_pretrained(models_path) + + # to store tokenizer config jsons with special tokens + hf_tokenizer.save_pretrained(models_path) + + # save generation config + opt_model.generation_config.save_pretrained(models_path) + # convert tokenizers as well from openvino_tokenizers import convert_tokenizer from openvino import serialize - tokenizer, detokenizer = convert_tokenizer(hf_tokenizer, with_detokenizer=True, skip_special_tokens=True) + + tokenizer, detokenizer = convert_tokenizer(hf_tokenizer, with_detokenizer=True) serialize(tokenizer, models_path / "openvino_tokenizer.xml") serialize(detokenizer, models_path / "openvino_detokenizer.xml") -def _generate_and_compare_with_reference_results(models_path: Path, prompts: List[str], reference_results: List[GenerationResult], generation_configs: List[GenerationConfig], scheduler_config: SchedulerConfig): - ov_results : List[GenerationResult] = run_continuous_batching(models_path, scheduler_config, prompts, generation_configs) +def run_llm_pipeline_with_ref(model_id: str, prompts: List[str], generation_config: GenerationConfig | dict, tmp_path: Path, use_cb : bool = False): + models_path : Path = tmp_path / model_id + opt_model, hf_tokenizer = get_hugging_face_models(model_id) - assert len(prompts) == len(reference_results) - assert len(prompts) == len(ov_results) + if type(generation_config) is dict: + generation_config = GenerationConfig(**generation_config) + + convert_models(opt_model, hf_tokenizer, models_path) - for prompt, ref_result, ov_result, generation_config in zip(prompts, reference_results, ov_results, generation_configs): - print(f"Prompt = {prompt}\nref result = {ref_result}\nOV result = {ov_result.m_generation_ids}") - compare_results(ref_result, ov_result, generation_config) + ov_results = run_llm_pipeline(models_path, prompts, generation_config, use_cb) + hf_results = run_hugging_face(opt_model, hf_tokenizer, prompts, generation_config) + compare_generation_results(prompts, hf_results, ov_results, generation_config) + + +def run_cb_pipeline_with_ref(tmp_path: str, model_id: str, scheduler_params: dict = {}, generation_config : GenerationConfig | dict = None): + prompts, generation_configs = get_test_dataset() + scheduler_config = get_scheduler_config(scheduler_params) + + # override dataset's generation config + if generation_config is not None: + if type(generation_config) is dict: + generation_config = GenerationConfig(**generation_config) + generation_configs = [generation_config] * len(prompts) -def generate_and_compare_with_hf(model_id: str, prompts: List[str], 
generation_configs: List[GenerationConfig], scheduler_config: SchedulerConfig, tmp_path: Path): - use_optimum = True models_path : Path = tmp_path / model_id - opt_model, hf_tokenizer = get_hugging_face_model_and_tokenizer(model_id, use_optimum) + opt_model, hf_tokenizer = get_hugging_face_models(model_id) - if use_optimum: - save_ov_model_from_optimum(opt_model, hf_tokenizer, models_path) + convert_models(opt_model, hf_tokenizer, models_path) - hf_results = run_hugging_face(opt_model=opt_model, hf_tokenizer=hf_tokenizer, prompts=prompts, generation_configs=generation_configs) - _generate_and_compare_with_reference_results(models_path, prompts, hf_results, generation_configs, scheduler_config) + hf_results = run_hugging_face(opt_model, hf_tokenizer, prompts, generation_configs) + ov_results = run_continuous_batching(models_path, scheduler_config, prompts, generation_configs) + compare_generation_results(prompts, hf_results, ov_results, generation_configs) + +# TODO: remove after Generator property is supported by LLMPipeline / VLMPipeline def generate_and_compare_with_reference_text(models_path: Path, prompts: List[str], reference_texts_per_prompt: List[List[str]], generation_configs: List[GenerationConfig], scheduler_config: SchedulerConfig): ov_results : List[GenerationResult] = run_continuous_batching(models_path, scheduler_config, prompts, generation_configs) @@ -440,19 +463,6 @@ def generate_and_compare_with_reference_text(models_path: Path, prompts: List[st assert ref_text == ov_text -def run_continuous_batching_pipeline_test(tmp_path: str, model_id: str, scheduler_params: dict = None, generation_config = None): - prompts, generation_configs = get_test_dataset() - scheduler_config = get_scheduler_config(scheduler_params) - - if generation_config is not None: - generation_config.rng_seed = 0 - generation_configs = [generation_config] * len(prompts) - - generate_and_compare_with_hf(model_id, prompts, generation_configs, scheduler_config, tmp_path) - - -DEFAULT_SCHEDULER_CONFIG = get_scheduler_config({"num_kv_blocks": 300, "dynamic_split_fuse": True, "max_num_batched_tokens": 256, "max_num_seqs": 256}) - def get_image_by_link(link): from PIL import Image import requests diff --git a/tests/python_tests/ov_genai_test_utils.py b/tests/python_tests/ov_genai_test_utils.py index 9e8e4681f9..00c74f6628 100644 --- a/tests/python_tests/ov_genai_test_utils.py +++ b/tests/python_tests/ov_genai_test_utils.py @@ -13,6 +13,8 @@ import shutil import json +import openvino_genai as ov_genai + def get_models_list(): precommit_models = [ @@ -52,6 +54,7 @@ def get_models_list(): if pytest.selected_model_ids: model_ids = [model_id for model_id in model_ids if model_id in pytest.selected_model_ids.split(' ')] + # pytest.set_trace() prefix = pathlib.Path(os.getenv('GENAI_MODELS_PATH_PREFIX', '')) return [(model_id, prefix / model_id.split('/')[1]) for model_id in model_ids] @@ -81,66 +84,57 @@ def get_chat_models_list(): @functools.lru_cache(1) def read_model(params, **tokenizer_kwargs): - model_id, path = params + model_id, models_path = params from optimum.intel.openvino import OVModelForCausalLM from transformers import AutoTokenizer hf_tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True) - if (path / "openvino_model.xml").exists(): - opt_model = OVModelForCausalLM.from_pretrained(path, trust_remote_code=True, + if (models_path / "openvino_model.xml").exists(): + opt_model = OVModelForCausalLM.from_pretrained(models_path, trust_remote_code=True, compile=False, device='CPU') else: 
ov_tokenizer, ov_detokenizer = openvino_tokenizers.convert_tokenizer(hf_tokenizer, with_detokenizer=True, **tokenizer_kwargs) - openvino.save_model(ov_tokenizer, path / "openvino_tokenizer.xml") - openvino.save_model(ov_detokenizer, path / "openvino_detokenizer.xml") + openvino.save_model(ov_tokenizer, models_path / "openvino_tokenizer.xml") + openvino.save_model(ov_detokenizer, models_path / "openvino_detokenizer.xml") # to store tokenizer config jsons with special tokens - hf_tokenizer.save_pretrained(path) + hf_tokenizer.save_pretrained(models_path) opt_model = OVModelForCausalLM.from_pretrained(model_id, export=True, trust_remote_code=True, compile=False, device='CPU', load_in_8bit=False) - opt_model.generation_config.save_pretrained(path) - opt_model.config.save_pretrained(path) - opt_model.save_pretrained(path) + opt_model.generation_config.save_pretrained(models_path) + opt_model.config.save_pretrained(models_path) + opt_model.save_pretrained(models_path) return ( model_id, - path, + models_path, hf_tokenizer, opt_model, - ov_genai.LLMPipeline(path, 'CPU', ENABLE_MMAP=False), + ov_genai.LLMPipeline(models_path, 'CPU', ENABLE_MMAP=False), ) -# in OpenVINO GenAI this parameter is called stop_criteria, -# while in HF it's called early_stopping. -# HF values True, False and "never" correspond to OV GenAI values "EARLY", "HEURISTIC" and "NEVER" -STOP_CRITERIA_MAP = { - ov_genai.StopCriteria.NEVER: "never", - ov_genai.StopCriteria.EARLY: True, - ov_genai.StopCriteria.HEURISTIC: False -} - - @pytest.fixture(scope="module") def model_tmp_path(tmpdir_factory): - model_id, path, _, _, _ = read_model(get_models_list()[0]) + model_id, models_path, _, _, _ = read_model(get_models_list()[0]) temp_path = tmpdir_factory.mktemp(model_id.replace('/', '_')) # copy openvino converted model and tokenizers for pattern in ['*.xml', '*.bin']: - for src_file in path.glob(pattern): + for src_file in models_path.glob(pattern): if src_file.is_file(): shutil.copy(src_file, temp_path / src_file.name) + yield model_id, Path(temp_path) @pytest.fixture(scope="module") def model_tokenizers_tmp_path(tmpdir_factory): - model_id, path, _, _, _ = read_model(get_models_list()[0]) + model_id, models_path, _, _, _ = read_model(get_models_list()[0]) temp_path = tmpdir_factory.mktemp(model_id.replace('/', '_')) # If tokens were not found in IR, it fallback to reading from config. @@ -148,10 +142,11 @@ def model_tokenizers_tmp_path(tmpdir_factory): # and set tokens in configs and to check if they are read and validated correctly. 
import openvino as ov + core = ov.Core() + # copy openvino converted model and tokenizers for pattern in ['*.xml', '*.bin']: - for src_file in path.glob(pattern): - core = ov.Core() + for src_file in models_path.glob(pattern): # Update files if they are openvino_tokenizer.xml or openvino_detokenizer.xml if src_file.name in ['openvino_tokenizer.xml', 'openvino_detokenizer.xml']: @@ -166,8 +161,10 @@ def model_tokenizers_tmp_path(tmpdir_factory): if src_file in ['openvino_tokenizer.bin', 'openvino_detokenizer.bin']: continue + if src_file.is_file(): shutil.copy(src_file, temp_path / src_file.name) + yield model_id, Path(temp_path) diff --git a/tests/python_tests/test_continuous_batching.py b/tests/python_tests/test_continuous_batching.py index 01762bf9e3..fabcf06b71 100644 --- a/tests/python_tests/test_continuous_batching.py +++ b/tests/python_tests/test_continuous_batching.py @@ -9,8 +9,8 @@ from pathlib import Path from openvino_genai import ContinuousBatchingPipeline, GenerationConfig, Tokenizer -from common import get_hugging_face_model_and_tokenizer, save_ov_model_from_optimum, generate_and_compare_with_reference_text, \ - get_scheduler_config, get_greedy, run_continuous_batching_pipeline_test, get_beam_search, get_greedy, \ +from common import get_hugging_face_models, convert_models, generate_and_compare_with_reference_text, \ + get_scheduler_config, get_greedy, run_cb_pipeline_with_ref, get_beam_search, get_greedy, \ get_multinomial_all_parameters, get_multinomial_temperature_and_num_return_sequence, \ get_multinomial_temperature_and_top_k, get_multinomial_temperature, get_multinomial_temperature_and_top_p from test_sampling import RandomSamplingTestStruct, get_current_platform_ref_texts @@ -39,19 +39,19 @@ def read_models_list(file_name: str): @pytest.mark.precommit @pytest.mark.parametrize("model_id", read_models_list(os.path.join(os.path.dirname(os.path.realpath(__file__)), "models", "precommit"))) def test_e2e_precommit(tmp_path, model_id): - run_continuous_batching_pipeline_test(tmp_path, model_id) + run_cb_pipeline_with_ref(tmp_path, model_id) @pytest.mark.nightly @pytest.mark.parametrize("model_id", read_models_list(os.path.join(os.path.dirname(os.path.realpath(__file__)), "models", "nightly"))) def test_e2e_nightly(tmp_path, model_id): - run_continuous_batching_pipeline_test(tmp_path, model_id) + run_cb_pipeline_with_ref(tmp_path, model_id) @pytest.mark.real_models @pytest.mark.parametrize("model_id", read_models_list(os.path.join(os.path.dirname(os.path.realpath(__file__)), "models", "real_models"))) def test_e2e_real_models(tmp_path, model_id): - run_continuous_batching_pipeline_test(tmp_path, model_id) + run_cb_pipeline_with_ref(tmp_path, model_id) # # Comparison with stateful @@ -77,8 +77,8 @@ def test_continuous_batching_vs_stateful(prompt, generation_config): "facebook/opt-125m", Path("opt-125m") )) - cb = get_continuous_batching(path) - generated = cb.generate(prompt, **generation_config) + cb_pipe = get_continuous_batching(path) + generated = cb_pipe.generate(prompt, **generation_config) reference = stateful.generate(prompt, **generation_config) assert generated.texts == reference.texts if 1 != generation_config.get("num_return_sequences", 1): @@ -117,8 +117,8 @@ def test_cb_streamer_vs_return_vs_stateful(prompt): @pytest.mark.parametrize("model_descr", get_chat_models_list()) @pytest.mark.precommit def test_chat_scenario_vs_stateful(model_descr, generation_config_kwargs: Dict): - model_id, path, hf_tokenizer, opt_model, ov_pipe = read_model((model_descr[0], 
model_descr[1] / '_test_chat')) - cb_pipe = get_continuous_batching(path) + model_id, models_path, hf_tokenizer, opt_model, ov_pipe = read_model((model_descr[0], model_descr[1] / '_test_chat')) + cb_pipe = get_continuous_batching(models_path) ov_pipe.start_chat() cb_pipe.start_chat() @@ -150,10 +150,10 @@ def test_post_oom_health(tmp_path, sampling_config): scheduler_config.num_kv_blocks = 10 # Low cache size to trigger OOM quickly model_id : str = "facebook/opt-125m" - opt_model, hf_tokenizer = get_hugging_face_model_and_tokenizer(model_id, use_optimum=True) + opt_model, hf_tokenizer = get_hugging_face_models(model_id) models_path : Path = tmp_path / model_id - save_ov_model_from_optimum(opt_model, hf_tokenizer, models_path) + convert_models(opt_model, hf_tokenizer, models_path) cb_pipe = ContinuousBatchingPipeline(models_path, Tokenizer(models_path), scheduler_config, "CPU") @@ -201,7 +201,7 @@ def get_beam_search_seq_len_300() -> GenerationConfig: @pytest.mark.parametrize("params", scheduler_params_list) @pytest.mark.precommit def test_preemption(tmp_path, params): - run_continuous_batching_pipeline_test(tmp_path, "facebook/opt-125m", scheduler_params=params[0], generation_config=params[1]) + run_cb_pipeline_with_ref(tmp_path, "facebook/opt-125m", scheduler_params=params[0], generation_config=params[1]) multinomial_params = RandomSamplingTestStruct( @@ -249,13 +249,12 @@ def test_preemption(tmp_path, params): def test_preemption_with_multinomial(tmp_path, dynamic_split_fuse): generation_configs = multinomial_params.generation_config for config in generation_configs: - config.rng_seed = 0 config.max_new_tokens = 30 model_id : str = "facebook/opt-125m" - model, hf_tokenizer = get_hugging_face_model_and_tokenizer(model_id, use_optimum=True) + model, hf_tokenizer = get_hugging_face_models(model_id) models_path : Path = tmp_path / model_id - save_ov_model_from_optimum(model, hf_tokenizer, models_path) + convert_models(model, hf_tokenizer, models_path) scheduler_config = get_scheduler_config({"num_kv_blocks": 3, "dynamic_split_fuse": dynamic_split_fuse, "max_num_batched_tokens": 256, "max_num_seqs": 256}) generate_and_compare_with_reference_text(models_path, multinomial_params.prompts, multinomial_params.ref_texts, generation_configs, scheduler_config) @@ -329,15 +328,12 @@ def test_preemption_with_multinomial(tmp_path, dynamic_split_fuse): @pytest.mark.precommit @pytest.mark.skip(reason="Random sampling results are non deterministic due to: discrete_distribution impl depends on platform, model inference results may depend on CPU. 
Test passes on CI but fails locally.") def test_preemption_with_multinomial_n_seq(tmp_path, dynamic_split_fuse): - generation_configs = multinomial_params_n_seq.generation_config - for config in generation_configs: - config.rng_seed = 0 model_id : str = "facebook/opt-125m" - model, hf_tokenizer = get_hugging_face_model_and_tokenizer(model_id, use_optimum=True) + opt_model, hf_tokenizer = get_hugging_face_models(model_id) models_path : Path = tmp_path / model_id - save_ov_model_from_optimum(model, hf_tokenizer, models_path) + convert_models(opt_model, hf_tokenizer, models_path) # needed kv_blocks - 16 (2 blocks per sequence (30 tokens to generated text + prompt (> 2 tokens)) * (1 + 3 + 4) seq ) scheduler_config = get_scheduler_config({"num_kv_blocks": 8, "dynamic_split_fuse": dynamic_split_fuse, "max_num_batched_tokens": 256, "max_num_seqs": 256}) - generate_and_compare_with_reference_text(models_path, multinomial_params_n_seq.prompts, multinomial_params_n_seq.ref_texts, generation_configs, scheduler_config) + generate_and_compare_with_reference_text(models_path, multinomial_params_n_seq.prompts, multinomial_params_n_seq.ref_texts, multinomial_params_n_seq.generation_config, scheduler_config) diff --git a/tests/python_tests/test_kv_cache_eviction.py b/tests/python_tests/test_kv_cache_eviction.py index 6228f53dd1..41281e9cab 100644 --- a/tests/python_tests/test_kv_cache_eviction.py +++ b/tests/python_tests/test_kv_cache_eviction.py @@ -15,7 +15,7 @@ from openvino import serialize from transformers import AutoTokenizer -from common import TESTS_ROOT, run_continuous_batching_pipeline_test +from common import TESTS_ROOT, run_cb_pipeline_with_ref def load_prompts_dataset(file_name : str) -> Dict[str, List[str]]: @@ -150,6 +150,7 @@ def get_greedy_seq_len_300() -> GenerationConfig: generation_config.max_new_tokens = 300 return generation_config + def get_beam_search_seq_len_300() -> GenerationConfig: generation_config = GenerationConfig() generation_config.num_beam_groups = 3 @@ -159,6 +160,7 @@ def get_beam_search_seq_len_300() -> GenerationConfig: generation_config.num_return_sequences = generation_config.num_beams return generation_config + scheduler_params_list = [ ({"num_kv_blocks": 0, "cache_size": 0, "dynamic_split_fuse": True, "enable_prefix_caching": True}, get_greedy_seq_len_300()), ({"num_kv_blocks": 0, "cache_size": 0, "dynamic_split_fuse": False, "max_num_batched_tokens": 600, "enable_prefix_caching": True}, get_beam_search_seq_len_300()), @@ -168,5 +170,5 @@ def get_beam_search_seq_len_300() -> GenerationConfig: @pytest.mark.parametrize("params", scheduler_params_list) @pytest.mark.precommit def test_dynamic_memory_allocation(tmp_path, params): - run_continuous_batching_pipeline_test(tmp_path, "facebook/opt-125m", params[0], params[1]) + run_cb_pipeline_with_ref(tmp_path, "facebook/opt-125m", scheduler_params=params[0], generation_config=params[1]) diff --git a/tests/python_tests/test_llm_pipeline.py b/tests/python_tests/test_llm_pipeline.py index 6e3cce06d0..986b342c59 100644 --- a/tests/python_tests/test_llm_pipeline.py +++ b/tests/python_tests/test_llm_pipeline.py @@ -2,7 +2,7 @@ # SPDX-License-Identifier: Apache-2.0 import openvino_genai as ov_genai -from openvino_genai import StopCriteria, GenerationConfig +from openvino_genai import GenerationConfig import pytest from typing import Union, List, Dict, Optional import numpy as np @@ -10,152 +10,30 @@ import sys from pathlib import Path import torch -import math + +from common import run_llm_pipeline_with_ref, convert_to_hf from 
ov_genai_test_utils import ( get_models_list, read_model, load_genai_pipe_with_configs, get_chat_models_list, model_tmp_path, - STOP_CRITERIA_MAP, ) - -def run_hf_ov_genai_comparison_batched(model_descr, generation_config: Dict, prompts: Union[str, List[str]]): - model_id, path, hf_tokenizer, opt_model, ov_pipe = model_descr - config = generation_config.copy() # to avoid side effects - num_beams = config['num_beams'] if 'num_beams' in config else 1 - config['num_return_sequences'] = num_beams - - if not isinstance(prompts, list): - prompts = [prompts] - - if 'do_sample' not in config: - # Some HF models have default do_sample = True, and if we set beam search generation config - # it conflicts with `diversity_penalty` and/or `num_beam_groups`. - # Need to set explicitly to False, but only if test arguments omitted this arg. - # Do not apply 'repetition_penalty' if sampling is not used. - config['do_sample'] = False - config['repetition_penalty'] = 1.0 # 1.0 means no penalty - - generation_config_hf = config.copy() - if generation_config_hf.get('stop_criteria'): - generation_config_hf['early_stopping'] = STOP_CRITERIA_MAP[generation_config_hf.pop('stop_criteria')] - generation_config_hf.pop('ignore_eos', None) - - # Encode the batch of prompts - hf_tokenizer.padding_side = "left" - encoded_prompts = hf_tokenizer(prompts, return_tensors='pt', padding=True, truncation=True, add_special_tokens=True) - prompt_ids, attention_mask = encoded_prompts['input_ids'], encoded_prompts['attention_mask'] - - hf_encoded_outputs = opt_model.generate(prompt_ids, attention_mask=attention_mask, **generation_config_hf) - - hf_outputs = [] - for idx, hf_encoded_out in enumerate(hf_encoded_outputs): - prompt_count = idx // num_beams - hf_outputs.append(hf_tokenizer.decode(hf_encoded_out[prompt_ids[prompt_count].shape[0]:], skip_special_tokens=True)) - - ov_outputs = ov_pipe.generate(prompts, **config).texts - - hf_outputs.sort() - ov_outputs.sort() - for i, (hf_output, ov_output) in enumerate(zip(hf_outputs, ov_outputs)): - if hf_output != ov_output: - print(f'hf_output: {hf_output}') - print(f'ov_output: {ov_output}') - assert hf_output == ov_output - - -def run_hf_ov_genai_comparison_text_inputs(model_descr, generation_config: Dict, prompt: str): - model_id, path, hf_tokenizer, opt_model, ov_pipe = model_descr - - config = generation_config.copy() # to avoid side effects - - if 'do_sample' not in config: - # Some HF models have default do_sample = True, and if we set beam search generation config - # it conflicts with `diversity_penalty` and/or `num_beam_groups`. - # Need to set explicitly to False, but only if test arguments omitted this arg. - # Do not apply 'repetition_penalty' if sampling is not used. 
- config['do_sample'] = False - config['repetition_penalty'] = 1.0 # 1.0 means no penalty - - generation_config_hf = config.copy() - if generation_config_hf.get('stop_criteria'): - generation_config_hf['early_stopping'] = STOP_CRITERIA_MAP[generation_config_hf.pop('stop_criteria')] - generation_config_hf.pop('ignore_eos', None) - - encoded_prompt = hf_tokenizer([prompt], return_tensors='pt', add_special_tokens=True) - prompt_ids, attention_mask = encoded_prompt['input_ids'], encoded_prompt['attention_mask'] - hf_encoded_output = opt_model.generate(prompt_ids, attention_mask=attention_mask, **generation_config_hf) - hf_output = hf_tokenizer.decode(hf_encoded_output[0, prompt_ids.shape[1]:], skip_special_tokens=True) - - ov_output = ov_pipe.generate(prompt, **config) - if config.get('num_return_sequences', 1) > 1: - assert hf_output in ov_output.texts - else: - if hf_output != ov_output: - print(f'hf_output: {hf_output}') - print(f'ov_output: {ov_output}') - - assert hf_output == ov_output - - -def run_hf_ov_genai_comparison_encoded_inputs( - model_descr, - generation_config: Dict, - input_ids: np.ndarray, - attention_mask: Optional[np.array] = None - ): - device = 'CPU' - model_id, path, hf_tokenizer, opt_model, ov_pipe = model_descr - - config = generation_config.copy() # to avoid side effects - - if 'do_sample' not in config: - # Some HF models have default do_sample = True, and if we set beam search generation config - # it conflicts with `diversity_penalty` and/or `num_beam_groups`. - # Need to set explicitly to False, but only if test arguments omitted this arg. - # Do not apply 'repetition_penalty' if sampling is not used. - config['do_sample'] = False - config['repetition_penalty'] = 1.0 # 1.0 means no penalty - - generation_config_hf = config.copy() - if generation_config_hf.get('stop_criteria'): - generation_config_hf['early_stopping'] = STOP_CRITERIA_MAP[generation_config_hf.pop('stop_criteria')] - generation_config_hf.pop('ignore_eos', None) - - if attention_mask is not None: - inputs_ov = ov_genai.TokenizedInputs(ov.Tensor(input_ids), ov.Tensor(attention_mask)) - inputs_hf = dict(inputs=torch.tensor(input_ids), attention_mask=torch.tensor(attention_mask)) - else: - inputs_hf = dict(inputs=torch.tensor(input_ids)) - inputs_ov = ov.Tensor(input_ids) - - hf_output = opt_model.generate(**inputs_hf, **generation_config_hf) - ov_output = ov_pipe.generate(inputs_ov, **config) - - hf_res = hf_output[0, input_ids.shape[1]:].numpy() - ov_res = np.array(ov_output.tokens, dtype=np.int64) - assert np.all(ov_res == hf_res) - # # e2e work # test_cases = [ - (dict(max_new_tokens=20), 'table is made of'), (dict(max_new_tokens=20), '你好! 
你好嗎?'), - (dict(num_beam_groups=3, num_beams=15, num_return_sequences=15, max_new_tokens=30, diversity_penalty=1.0), 'Alan Turing was a'), - (dict(num_beam_groups=2, num_beams=8, num_return_sequences=8, max_new_tokens=20, diversity_penalty=1.0), 'table is made of'), - (dict(num_beam_groups=2, num_beams=8, num_return_sequences=8, max_new_tokens=20, diversity_penalty=1.0), 'The Sun is yellow because'), - (dict(num_beam_groups=2, num_beams=8, num_return_sequences=8, max_new_tokens=20, diversity_penalty=1.5), 'The Sun is yellow because'), + (dict(max_new_tokens=30, num_beams=15, num_beam_groups=3, num_return_sequences=15, diversity_penalty=1.0), 'Alan Turing was a'), ] -@pytest.mark.parametrize("generation_config,prompt", test_cases) +@pytest.mark.parametrize("generation_config_dict,prompt", test_cases) @pytest.mark.parametrize("model_descr", get_models_list()) @pytest.mark.precommit @pytest.mark.nightly -def test_decoding(model_descr, generation_config, prompt): - run_hf_ov_genai_comparison_text_inputs(read_model(model_descr), generation_config, prompt) +def test_string_inputs(model_descr, generation_config_dict, prompt): + run_llm_pipeline_with_ref(model_id=model_descr[0], prompts=[prompt], generation_config=generation_config_dict, tmp_path=model_descr[1]) input_tensors_list = [ @@ -168,13 +46,32 @@ def test_decoding(model_descr, generation_config, prompt): @pytest.mark.precommit @pytest.mark.nightly def test_encoded_inputs(model_descr, inputs): - run_hf_ov_genai_comparison_encoded_inputs(read_model(model_descr), dict(max_new_tokens=20), *inputs) + device = 'CPU' + model_id, path, hf_tokenizer, opt_model, ov_pipe = read_model(model_descr) + + ov_generation_config = GenerationConfig(max_new_tokens=20) + hf_generation_config = convert_to_hf(opt_model.generation_config, ov_generation_config) + + input_ids, attention_mask = inputs + + if attention_mask is not None: + inputs_ov = ov_genai.TokenizedInputs(ov.Tensor(input_ids), ov.Tensor(attention_mask)) + inputs_hf = dict(inputs=torch.tensor(input_ids), attention_mask=torch.tensor(attention_mask)) + else: + inputs_hf = dict(inputs=torch.tensor(input_ids)) + inputs_ov = ov.Tensor(input_ids) + + hf_output = opt_model.generate(**inputs_hf, generation_config=hf_generation_config) + ov_output = ov_pipe.generate(inputs_ov, ov_generation_config) + + hf_res = hf_output[0, input_ids.shape[1]:].numpy() + ov_res = np.array(ov_output.tokens, dtype=np.int64) + assert np.all(ov_res == hf_res) test_configs = [ dict(max_new_tokens=20), - dict(max_new_tokens=200, ignore_eos=True), - dict(max_new_tokens=20, num_beam_groups=3, num_beams=15, diversity_penalty=1.0) + dict(max_new_tokens=20, num_beam_groups=2, num_beams=6, diversity_penalty=1.0) ] batched_prompts = [ ['table is made', 'They sky is blue because', 'Difference between Jupiter and Mars is that'], @@ -182,107 +79,13 @@ def test_encoded_inputs(model_descr, inputs): ['Alan Turing was a', 'return 0', '你好! 
你好嗎?'], ['table is made', 'table is made [force left pad tokens]'] ] -@pytest.mark.parametrize("generation_config", test_configs) +@pytest.mark.parametrize("generation_config_dict", test_configs) @pytest.mark.parametrize("prompts", batched_prompts) @pytest.mark.parametrize("model_descr", get_models_list()) @pytest.mark.precommit @pytest.mark.nightly -def test_batch_text_input(model_descr, generation_config, prompts): - run_hf_ov_genai_comparison_batched(read_model(model_descr), generation_config, prompts) - - -prompts = ['The Sun is yellow because', 'Difference between Jupiter and Mars is that', 'table is made of'] -@pytest.mark.parametrize("num_beam_groups", [2, 3, 8]) -@pytest.mark.parametrize("group_size", [5, 3, 10]) -@pytest.mark.parametrize("max_new_tokens", [20, 15]) -@pytest.mark.parametrize("diversity_penalty", [1.0 , 1.5]) -@pytest.mark.parametrize("prompt", prompts) -@pytest.mark.parametrize("model_descr", get_models_list()) -@pytest.mark.precommit -@pytest.mark.nightly -def test_beam_search_decoding(model_descr, num_beam_groups, group_size, max_new_tokens, diversity_penalty, prompt): - generation_config = dict( - num_beam_groups=num_beam_groups, - num_beams=num_beam_groups * group_size, - diversity_penalty=diversity_penalty, - num_return_sequences=num_beam_groups * group_size, - max_new_tokens=max_new_tokens, - ) - run_hf_ov_genai_comparison_text_inputs(read_model(model_descr), generation_config, prompt) - - -@pytest.mark.parametrize("stop_criteria", [StopCriteria.NEVER, StopCriteria.EARLY, StopCriteria.HEURISTIC]) -@pytest.mark.parametrize("prompt", prompts) -@pytest.mark.parametrize("max_new_tokens", [10, 80]) -@pytest.mark.parametrize("model_descr", get_models_list()) -@pytest.mark.precommit -@pytest.mark.nightly -def test_beam_search_stop_criteria(model_descr, stop_criteria, prompt, max_new_tokens): - # todo: with EARLY stop_criteria looks like HF return invalid out with sentence - # while genai ends sentence with - if (stop_criteria == StopCriteria.EARLY): - pytest.skip() - generation_config = dict( - num_beam_groups=2, - num_beams=2 * 3, - diversity_penalty=1.0, - num_return_sequences=2 * 3, - max_new_tokens=max_new_tokens, - stop_criteria=stop_criteria, - ) - run_hf_ov_genai_comparison_text_inputs(read_model(model_descr), generation_config, prompt) - - -# test long sequences -@pytest.mark.parametrize("num_beam_groups", [2]) -@pytest.mark.parametrize("group_size", [5]) -@pytest.mark.parametrize("max_new_tokens", [800, 2000]) -@pytest.mark.parametrize("prompt", prompts) -@pytest.mark.parametrize("model_descr", get_models_list()) -@pytest.mark.nightly -def test_beam_search_long_sentences(model_descr, num_beam_groups, group_size, - max_new_tokens, prompt): - generation_config = dict( - num_beam_groups=num_beam_groups, - num_beams=num_beam_groups * group_size, - diversity_penalty=1.0, - num_return_sequences=num_beam_groups * group_size, - max_new_tokens=max_new_tokens, - ) - run_hf_ov_genai_comparison_text_inputs(read_model(model_descr), generation_config, prompt) - - -@pytest.mark.parametrize("prompt", prompts) -@pytest.mark.parametrize("model_descr", get_models_list()) -@pytest.mark.precommit -@pytest.mark.nightly -def test_greedy_repetition_penalty(model_descr, prompt): - model_id, path, tokenizer, model, pipe = read_model(model_descr) - - generation_config = dict( - repetition_penalty=2.0, - max_new_tokens=20, - do_sample=False - ) - run_hf_ov_genai_comparison_text_inputs((model_id, path, tokenizer, model, pipe), generation_config, prompt) - - generation_config = dict( - 
repetition_penalty=1.0, - max_new_tokens=20, - do_sample=False - ) - run_hf_ov_genai_comparison_text_inputs((model_id, path, tokenizer, model, pipe), generation_config, prompt) - - ov_output = pipe.generate(prompt, **generation_config) - - generation_config = dict( - repetition_penalty=0.5, - max_new_tokens=20, - do_sample=False - ) - ov_output_half_penalty = pipe.generate(prompt, **generation_config) - - assert(len(set(ov_output.split(' '))) > len(set(ov_output_half_penalty.split(' ')))) +def test_batch_string_inputs(model_descr, generation_config_dict, prompts): + run_llm_pipeline_with_ref(model_id=model_descr[0], prompts=prompts, generation_config=generation_config_dict, tmp_path=model_descr[1]) @pytest.mark.precommit @@ -313,17 +116,14 @@ def test_batch_size_switch(): @pytest.mark.parametrize("model_descr", get_chat_models_list()) @pytest.mark.precommit @pytest.mark.nightly -def test_chat_compare_with_HF(model_descr, generation_config_kwargs: Dict): +def test_chat_scenario(model_descr, generation_config_kwargs: Dict): chat_history_hf = [] chat_history_ov = [] - chat_prompt = '' - # Will set add_special_tokens=False inside pipeline when start_chat() is called. model_id, path, tokenizer, opt_model, ov_pipe = read_model((model_descr[0], model_descr[1] / '_test_chat')) - from transformers import GenerationConfig as HFGenerationConfig - hf_generation_config = HFGenerationConfig(**generation_config_kwargs) ov_generation_config = GenerationConfig(**generation_config_kwargs) + hf_generation_config = convert_to_hf(opt_model.generation_config, ov_generation_config) ov_pipe.start_chat() for prompt in questions: @@ -559,39 +359,27 @@ def test_unicode_pybind_decoding_one_string_streamer(): # Perf metrics # -def run_perf_metrics_collection(model_descr, generation_config: Dict, prompt: str) -> ov_genai.PerfMetrics: - model_id, path, hf_tokenizer, opt_model, ov_pipe = model_descr - - config = generation_config.copy() # to avoid side effects - - if 'do_sample' not in config: - # Some HF models have default do_sample = True, and if we set beam search generation config - # it conflicts with `diversity_penalty` and/or `num_beam_groups`. - # Need to set explicitly to False, but only if test arguments omitted this arg. - # Do not apply 'repetition_penalty' if sampling is not used. 
- config['do_sample'] = False - config['repetition_penalty'] = 1.0 # 1.0 means no penalty - - return ov_pipe.generate([prompt], **config).perf_metrics +def run_perf_metrics_collection(model_descr, generation_config_dict: dict, prompt: str) -> ov_genai.PerfMetrics: + model_id, path, hf_tokenizer, opt_model, ov_pipe = read_model(model_descr) + return ov_pipe.generate([prompt], **generation_config_dict).perf_metrics test_cases = [ (dict(max_new_tokens=20), 'table is made of'), ] @pytest.mark.parametrize("generation_config,prompt", test_cases) -@pytest.mark.parametrize("model_descr", get_models_list()) @pytest.mark.precommit @pytest.mark.nightly -@pytest.mark.skip(reason="load_time + mean_gen_duration < total_time fails in https://github.com/openvinotoolkit/openvino.genai/actions/runs/12503590506/job/34884840100?pr=1440.") -def test_perf_metrics(model_descr, generation_config, prompt): +def test_perf_metrics(generation_config, prompt): import time start_time = time.perf_counter() - perf_metrics = run_perf_metrics_collection(read_model(model_descr), generation_config, prompt) + model_id, path = 'katuni4ka/tiny-random-gemma2', Path('katuni4ka-tiny-random-gemma2') + perf_metrics = run_perf_metrics_collection((model_id, path), generation_config, prompt) total_time = (time.perf_counter() - start_time) * 1000 # Check that load time is adequate. load_time = perf_metrics.get_load_time() - assert load_time > 0 and load_time < 1000.0 + assert load_time > 0 and load_time < 2000.0 # Check that num input and generated tokens are adequate. num_generated_tokens = perf_metrics.get_num_generated_tokens() @@ -657,34 +445,6 @@ def test_perf_metrics(model_descr, generation_config, prompt): # Misc # -# TODO: move to test_sampling.py -@pytest.mark.precommit -@pytest.mark.nightly -def test_stop_token_ids(): - ov_pipe = read_model(('katuni4ka/tiny-random-phi3', Path('tiny-random-phi3')))[4] - res = ov_pipe.generate( - ov.Tensor([(1,)]), - max_new_tokens=3, - stop_token_ids={9935, ov_pipe.get_tokenizer().get_eos_token_id()}, - include_stop_str_in_output=False - ) - assert 2 == len(res.tokens[0]) - assert 9935 in res.tokens[0] - - -# TODO: move to test_sampling.py -@pytest.mark.precommit -@pytest.mark.nightly -def test_stop_strings(): - ov_pipe = read_model(('katuni4ka/tiny-random-phi3', Path('tiny-random-phi3')))[4] - res = ov_pipe.generate( - "", - max_new_tokens=5, - stop_strings={"ignored", "боль"} - ) - assert "боль" not in res - - # TODO: move this test to test_tokenizer.py @pytest.mark.skip(reason="probably both models ov + hf doesn't fit to memory") @pytest.mark.precommit @@ -698,7 +458,7 @@ def test_left_pad(): ] models = read_model(("microsoft/phi-1_5", Path("phi-1_5/"))) - config = { + generation_config_dict = { "max_new_tokens": 20, "num_beam_groups": 2, "num_beams": 2, @@ -713,4 +473,5 @@ def test_left_pad(): } models[2].pad_token = models[2].eos_token - run_hf_ov_genai_comparison_batched(models, config, prompts) + + run_llm_pipeline_with_ref(model_id=models[0], prompts=prompts, generation_config=generation_config_dict, tmp_path=models[1]) diff --git a/tests/python_tests/test_llm_pipeline_static.py b/tests/python_tests/test_llm_pipeline_static.py index c3500d15ac..6ef6162043 100644 --- a/tests/python_tests/test_llm_pipeline_static.py +++ b/tests/python_tests/test_llm_pipeline_static.py @@ -2,14 +2,18 @@ # SPDX-License-Identifier: Apache-2.0 import openvino_genai as ov_genai -from openvino.runtime import Core import pytest +import platform import sys from ov_genai_test_utils import ( get_models_list, 
get_chat_models_list, + read_model ) +from common import get_default_properties +if sys.platform == 'darwin' or platform.machine() in ["aarch64", "arm64", "ARM64"]: + pytest.skip("NPU plugin is available only on Linux and Windows x86_64", allow_module_level=True) # This test suite is designed specifically to validate the functionality and robustness of the StaticLLMPipeline on NPUW:CPU. common_config = { @@ -24,19 +28,18 @@ def generate_chat_history(model_path, device, pipeline_config, questions): pipe = ov_genai.LLMPipeline(model_path, device, **pipeline_config) pipe.start_chat() - chat_history = [ pipe.generate(question, max_new_tokens=50) for question in questions ] + chat_history = [ pipe.generate(question, max_new_tokens=50, do_sample=False) for question in questions ] pipe.finish_chat() return chat_history -@pytest.mark.skipif(sys.platform in ["darwin", "linux"], reason="Not supposed to work on mac. Segfault on linux CI") @pytest.mark.precommit @pytest.mark.nightly def test_generation_compare_with_stateful(): prompt = 'The Sun is yellow because' - model_path = get_models_list()[0][1] + model_path = read_model(get_models_list()[0])[1] - stateful_pipe = ov_genai.LLMPipeline(model_path, "CPU") + stateful_pipe = ov_genai.LLMPipeline(model_path, "CPU", **get_default_properties()) ref_out = stateful_pipe.generate(prompt, max_new_tokens=100) static_pipe = ov_genai.LLMPipeline(model_path, "NPU", **common_config) @@ -48,11 +51,10 @@ def test_generation_compare_with_stateful(): assert ref_out == actual_out -@pytest.mark.skipif(sys.platform in ["darwin", "linux"], reason="Not supposed to work on mac. Segfault on linux CI") @pytest.mark.precommit @pytest.mark.nightly def test_length_properties_set_no_exception(): - model_path = get_models_list()[0][1] + model_path = read_model(get_models_list()[0])[1] # NB: Check it doesn't throw any exception pipeline_config = { "MAX_PROMPT_LEN": 128, "MIN_RESPONSE_LEN": 64 } pipeline_config |= common_config @@ -65,22 +67,20 @@ def test_length_properties_set_no_exception(): { "MIN_RESPONSE_LEN": -1 }, { "MIN_RESPONSE_LEN": "1" } ] -@pytest.mark.skipif(sys.platform in ["darwin", "linux"], reason="Not supposed to work on mac. Segfault on linux CI") @pytest.mark.parametrize("pipeline_config", pipeline_configs) @pytest.mark.precommit @pytest.mark.nightly def test_invalid_length_properties_raise_error(pipeline_config): - model_path = get_models_list()[0][1] + model_path = read_model(get_models_list()[0])[1] pipeline_config |= common_config with pytest.raises(RuntimeError): pipe = ov_genai.LLMPipeline(model_path, "NPU", **pipeline_config) -@pytest.mark.skipif(sys.platform in ["darwin", "linux"], reason="Not supposed to work on mac. Segfault on linux CI") @pytest.mark.precommit @pytest.mark.nightly def test_batch_one_no_exception(): - model_path = get_models_list()[0][1] + model_path = read_model(get_models_list()[0])[1] prompt = 'The Sun is yellow because' static_pipe = ov_genai.LLMPipeline(model_path, "NPU", **common_config) # Check it doesn't throw any exception when batch of size 1 is provided @@ -88,11 +88,10 @@ def test_batch_one_no_exception(): # TODO: For the further batch support -@pytest.mark.skipif(sys.platform in ["darwin", "linux"], reason="Not supposed to work on mac. 
Segfault on linux CI") @pytest.mark.precommit @pytest.mark.nightly def test_batch_raise_error(): - model_path = get_models_list()[0][1] + model_path = read_model(get_models_list()[0])[1] prompt = 'The Sun is yellow because' pipe = ov_genai.LLMPipeline(model_path, "NPU", **common_config) with pytest.raises(RuntimeError): @@ -101,26 +100,24 @@ def test_batch_raise_error(): # TODO: For the further sampling support generation_configs = [ - dict(num_beam_groups=3), + dict(num_beams=3), dict(do_sample=True) ] -@pytest.mark.skipif(sys.platform in ["darwin", "linux"], reason="Not supposed to work on mac. Segfault on linux CI") @pytest.mark.parametrize("generation_config", generation_configs) @pytest.mark.precommit @pytest.mark.nightly def test_unsupported_sampling_raise_error(generation_config): - model_path = get_models_list()[0][1] + model_path = read_model(get_models_list()[0])[1] prompt = 'The Sun is yellow because' pipe = ov_genai.LLMPipeline(model_path, "NPU", **common_config) with pytest.raises(RuntimeError): pipe.generate(prompt, **generation_config) -@pytest.mark.skipif(sys.platform in ["darwin", "linux"], reason="Not supposed to work on mac. Segfault on linux CI") @pytest.mark.precommit @pytest.mark.nightly def test_max_number_of_tokens(): - model_path = get_models_list()[0][1] + model_path = read_model(get_models_list()[0])[1] prompt = 'The Sun is yellow because' num_tokens = 128 @@ -133,11 +130,10 @@ def test_max_number_of_tokens(): # FIXME: Known problem, output differs from stateful pipeline starting from 3rd prompt! -@pytest.mark.skipif(sys.platform in ["darwin", "linux"], reason="Not supposed to work on mac. Segfault on linux CI") @pytest.mark.skip(reason="JIRA-144780: Output differs from stateful pipeline") @pytest.mark.precommit @pytest.mark.nightly -def test_chat_generation(model_descr): +def test_chat_generation(): questions = [ '1+1=', 'What is the previous answer?', @@ -145,9 +141,9 @@ def test_chat_generation(model_descr): 'What was my first question?' 
] - model_path = get_chat_models_list()[0][1] + model_path = read_model(get_chat_models_list()[0])[1] - chat_history_stateful = generate_chat_history(model_path, "CPU", { }, questions) + chat_history_stateful = generate_chat_history(model_path, "CPU", get_default_properties(), questions) chat_history_static = generate_chat_history(model_path, "NPU", common_config, questions) print('npu chat: \n{chat_history_static}\n') diff --git a/tests/python_tests/test_sampling.py b/tests/python_tests/test_sampling.py index 25ae9d8afa..004d4f9d9d 100644 --- a/tests/python_tests/test_sampling.py +++ b/tests/python_tests/test_sampling.py @@ -1,84 +1,96 @@ # Copyright (C) 2018-2024 Intel Corporation # SPDX-License-Identifier: Apache-2.0 -import os + import sys import pytest -import shutil import sys from dataclasses import dataclass from pathlib import Path -from openvino_genai import ContinuousBatchingPipeline, GenerationConfig, Tokenizer +from openvino_genai import GenerationConfig, StopCriteria from typing import List, TypedDict -from common import get_hugging_face_model_and_tokenizer, save_ov_model_from_optimum, \ - get_greedy, get_beam_search, get_multinomial_temperature, \ - get_greedy_with_penalties, get_multinomial_temperature, \ - get_multinomial_temperature_and_top_k, get_multinomial_temperature_and_top_p, \ - get_multinomial_temperature_top_p_and_top_k, DEFAULT_SCHEDULER_CONFIG, get_greedy_with_repetition_penalty, \ - get_multinomial_all_parameters, get_multinomial_temperature_and_num_return_sequence, \ - get_greedy, get_greedy_with_min_and_max_tokens, \ - get_greedy_with_single_stop_string, get_greedy_with_multiple_stop_strings, get_greedy_with_multiple_stop_strings_no_match, \ - get_beam_search, get_beam_search_min_and_max_tokens, get_beam_search_with_single_stop_string, \ - get_beam_search_with_multiple_stop_strings, get_beam_search_with_multiple_stop_strings_no_match, get_multinomial_max_and_min_token, \ - get_multinomial_temperature_and_frequence_penalty, get_multinomial_temperature_and_presence_penalty, \ - get_greedy_stop_strings_exclude_from_output, get_greedy_stop_strings_include_to_output, \ - get_greedy_n_stop_strings_exclude_from_output, get_greedy_n_stop_strings_include_to_output, \ - generate_and_compare_with_hf, get_multinomial_temperature_and_repetition_penalty, get_scheduler_config, \ - run_continuous_batching +from common import get_hugging_face_models, convert_models, run_llm_pipeline_with_ref, run_llm_pipeline -# TODO: currently, this test drops EOS token as both HF and OV use `skip_special_tokens=True`, which should be disabled for samlpling tests @pytest.mark.precommit -def test_beam_search_has_eos_token_at_end(tmp_path): - ''' - Current test checks that in case of beam search, some generation results - explicitly have EOS token at the end, which is aligned with HF +@pytest.mark.parametrize("generation_config,prompt", + [(dict(max_new_tokens=30), 'table is made of'), + (dict(max_new_tokens=30, min_new_tokens=30), '你好! 
你好嗎?'), + (dict(max_new_tokens=30, ignore_eos=True), 'Alan Turing was a'), + # (dict(max_length=40), 'table is made of'), + (dict(stop_token_ids={28998}), 'The Sun is yellow because'), # since a test does not hang, it means stop token is met + # (dict(max_new_tokens=1, min_new_tokens=0, echo=True), 'What is OpenVINO?') + ], + ids=["max_new_tokens", + "min_and_max_new_tokens", + "max_new_tokens_and_ignore_eos_true", + # "max_length", + "stop_token_ids", + # "echo_with_generation", + ]) +def test_basic_stop_criteria(tmp_path, generation_config, prompt): + model_id : str = "katuni4ka/tiny-random-phi3" + run_llm_pipeline_with_ref(model_id, [prompt], generation_config, tmp_path) - Example of current output: - { -1.23264, that I don't know about. - I don't know what you're talking about, but I'm pretty sure it's a Canadian thing. } - ''' - model_id = "facebook/opt-125m" - prompts = ["Tell me something about Canada"] - generation_configs = [get_beam_search()] - scheduler_config = get_scheduler_config() - generate_and_compare_with_hf(model_id, prompts, generation_configs, scheduler_config, tmp_path) - -# TODO: currently, this test drops EOS token as both HF and OV use `skip_special_tokens=True`, which should be disabled for samlpling tests @pytest.mark.precommit -def test_greedy_has_eos_token_at_end(tmp_path): - ''' - Current test checks that in case of gready, some generation results - explicitly have EOS token at the end, which is aligned with HF: +@pytest.mark.parametrize("generation_config", + [dict(max_new_tokens=50, min_new_tokens=15, stop_strings={"anag"}, include_stop_str_in_output=True), # expected match on "manage" + dict(max_new_tokens=50, min_new_tokens=1, stop_strings={".", "software", "Intel"}, include_stop_str_in_output=True), + dict(max_new_tokens=50, min_new_tokens=1, stop_strings={"Einstein", "sunny", "geothermal"}, include_stop_str_in_output=True), # expected no match + dict(max_new_tokens=30, stop_strings={ "machines" }, include_stop_str_in_output=False), + dict(max_new_tokens=30, stop_strings={ "machines" }, include_stop_str_in_output=True), + dict(max_new_tokens=30, stop_strings={ "machines", "manage" }, include_stop_str_in_output=False), + dict(max_new_tokens=30, stop_strings={ "machines", "manage" }, include_stop_str_in_output=True),], + ids=["single_stop_string", + "multiple_stop_strings_match", + "multiple_stop_strings_no_match", + "single_stop_string_exclude_from_output", + "single_stop_string_include_to_output", + "multiple_stop_strings_exclude_from_output", + "multiple_stop_strings_include_to_output"]) +def test_stop_strings(tmp_path, generation_config): + prompts = [ "What is OpenVINO?" ] + model_id : str = "facebook/opt-125m" + run_llm_pipeline_with_ref(model_id, prompts, generation_config, tmp_path) - Example of current output: - { a software program } - ''' - model_id = "bigscience/bloomz-560m" - prompts = ["What is OpenVINO?"] - generation_configs = [get_greedy()] - scheduler_config = get_scheduler_config() - generate_and_compare_with_hf(model_id, prompts, generation_configs, scheduler_config, tmp_path) + +@pytest.mark.precommit +@pytest.mark.parametrize("generation_config", + [dict(max_new_tokens=30), + dict(max_new_tokens=30, repetition_penalty=2.0),], + ids=["basic", + "repetition_penalty",]) +def test_greedy(tmp_path, generation_config): + prompts = [ "What is OpenVINO?" 
] + model_id : str = "katuni4ka/tiny-random-phi3" + run_llm_pipeline_with_ref(model_id, prompts, generation_config, tmp_path) -# TODO: consider removing all these functions with generation configs and use Dict with properties, which can be converted to generation config @pytest.mark.precommit @pytest.mark.parametrize("generation_config", - [get_greedy(), get_greedy_with_min_and_max_tokens(), get_greedy_with_repetition_penalty(), get_greedy_with_single_stop_string(), - get_greedy_with_multiple_stop_strings(), get_greedy_with_multiple_stop_strings_no_match(), - get_beam_search(), get_beam_search_min_and_max_tokens(), get_beam_search_with_multiple_stop_strings_no_match(), - get_greedy_stop_strings_exclude_from_output(), get_greedy_stop_strings_include_to_output(), - get_greedy_n_stop_strings_exclude_from_output(), get_greedy_n_stop_strings_include_to_output()], - ids=["greedy", "greedy_with_min_and_max_tokens", "greedy_with_repetition_penalty", "greedy_with_single_stop_string", - "greedy_with_multiple_stop_strings", "greedy_with_multiple_stop_strings_no_match", "beam_search", "beam_search_min_and_max_tokens", - "beam_search_with_multiple_stop_strings_no_match", "greedy_stop_strings_exclude_from_output", "greedy_stop_strings_include_to_output", - "greedy_n_stop_strings_exclude_from_output", "greedy_n_stop_strings_include_to_output"]) -def test_sampling_against_optimum(tmp_path, generation_config): + [dict(max_new_tokens=30, num_beams=2), + dict(max_new_tokens=30, num_beams=2, stop_criteria=StopCriteria.NEVER), + dict(max_new_tokens=30, num_beams=2, stop_criteria=StopCriteria.EARLY), + # dict(max_new_tokens=30, num_beams=2, echo=True), + dict(max_new_tokens=30, num_beams=2, length_penalty=1.0), + dict(max_new_tokens=30, num_beams=2, no_repeat_ngram_size=2), + dict(max_new_tokens=30, num_beams=6, num_beam_groups=3, diversity_penalty=1.2, num_return_sequences=3), + dict(max_new_tokens=30, min_new_tokens=15, num_beams=2, num_return_sequences=1), + dict(max_new_tokens=30, num_beams=2, stop_strings={"Einstein", "sunny", "geothermal"}, include_stop_str_in_output=True),], + ids=["single_group_stop_criteria_heuristic", + "single_group_stop_criteria_never", + "single_group_stop_criteria_early", + # "single_group_with_echo", + "single_group_lenght_penalty", + "single_group_no_repeat_ngram_size", + "multiple_groups", + "single_group_min_new_tokens", + "single_group_with_multiple_stop_strings_no_match",]) +def test_beam_search(tmp_path, generation_config): prompts = [ "What is OpenVINO?" ] - generation_configs = [generation_config] model_id : str = "facebook/opt-125m" - generate_and_compare_with_hf(model_id, prompts, generation_configs, DEFAULT_SCHEDULER_CONFIG, tmp_path) + run_llm_pipeline_with_ref(model_id, prompts, generation_config, tmp_path) @pytest.mark.precommit @@ -87,13 +99,28 @@ def test_sampling_against_optimum(tmp_path, generation_config): reason="Stop strings do not seem to work as expected with beam search in HF, so comparison will fail. 
If it changes, these cases shall be merged to the test above.", strict=True, ) -@pytest.mark.parametrize("generation_config", [get_beam_search_with_single_stop_string(), get_beam_search_with_multiple_stop_strings()], - ids=["beam_search_with_single_stop_string", "beam_search_with_multiple_stop_strings"]) +@pytest.mark.parametrize("generation_config", + [dict(max_new_tokens=50, num_beams=6, num_beam_groups=3, diversity_penalty=1.0, num_return_sequences=6, stop_strings={"open sour"}, include_stop_str_in_output=True), + dict(max_new_tokens=50, num_beams=6, num_beam_groups=3, diversity_penalty=1.0, num_return_sequences=6, stop_strings={".", "software", "Intel"}, include_stop_str_in_output=True),], + ids=["single_stop_string_match", "multiple_stop_strings_match"]) def test_beam_search_with_stop_string(tmp_path, generation_config): prompts = [ "What is OpenVINO?" ] - generation_configs = [generation_config] model_id : str = "facebook/opt-125m" - generate_and_compare_with_hf(model_id, prompts, generation_configs, DEFAULT_SCHEDULER_CONFIG, tmp_path) + run_llm_pipeline_with_ref(model_id, prompts, generation_config, tmp_path) + + +@pytest.mark.precommit +@pytest.mark.parametrize("generation_config", + [dict(max_new_tokens=1, min_new_tokens=0, echo=True), + dict(max_new_tokens=30, num_beams=2, echo=True),], + ids=["echo_with_generation", + "single_group_with_echo",]) +def test_echo(tmp_path, generation_config): + prompts = [ "What is OpenVINO?" ] + model_id : str = "facebook/opt-125m" + # TODO: support in stateful mode and remove 'use_cb=True' and this test at all + # as we can enable new parameters set in other tests + run_llm_pipeline_with_ref(model_id, prompts, generation_config, tmp_path, use_cb=True) # TODO: remove platform specific reference texts once CVS-159912 is done and use comparison with HF @@ -123,6 +150,12 @@ class RandomSamplingTestStruct: prompts: List[str] ref_texts: List[List[str]] +from common import get_multinomial_temperature, get_greedy_with_penalties, \ + get_multinomial_temperature_and_top_k, get_multinomial_temperature_and_top_p, \ + get_multinomial_temperature_top_p_and_top_k, get_multinomial_all_parameters, \ + get_multinomial_temperature_and_num_return_sequence, get_multinomial_max_and_min_token, \ + get_multinomial_temperature_and_frequence_penalty, get_multinomial_temperature_and_presence_penalty, \ + get_multinomial_temperature_and_repetition_penalty RANDOM_SAMPLING_TEST_CASES = [ RandomSamplingTestStruct( @@ -285,72 +318,15 @@ def test_multinomial_sampling_against_reference(tmp_path, test_struct: RandomSam prompts = test_struct.prompts generation_config.rng_seed = 0 - generation_configs = [generation_config] + generation_configs = generation_config model_id : str = "facebook/opt-125m" - model, hf_tokenizer = get_hugging_face_model_and_tokenizer(model_id, use_optimum=True) + model, hf_tokenizer = get_hugging_face_models(model_id) models_path : Path = tmp_path / model_id - save_ov_model_from_optimum(model, hf_tokenizer, models_path) + convert_models(model, hf_tokenizer, models_path) # run multinomial without comparison with reference - _ = run_continuous_batching(models_path, DEFAULT_SCHEDULER_CONFIG, prompts, generation_configs) + _ = run_llm_pipeline(models_path, prompts, generation_configs) # Reference comparison is not performed as sampling results are non-deterministic. # Discrete_distribution impl depends on platform, model inference results may depend on CPU. 
- - -@pytest.mark.precommit -@pytest.mark.parametrize("get_generation_config", [get_greedy, get_beam_search, get_multinomial_all_parameters], - ids=["greedy", "beam_search", "multinomial_all_parameters"]) -@pytest.mark.parametrize("max_num_batched_tokens", [2, 4, 256]) -def test_echo_prompt_phase_only(tmp_path, get_generation_config, max_num_batched_tokens): - generation_config = get_generation_config() - generation_config.max_new_tokens = 0 - generation_config.echo = True - - scheduler_config = get_scheduler_config() - scheduler_config.max_num_batched_tokens = max_num_batched_tokens - generation_configs = [generation_config] - model_id : str = "facebook/opt-125m" - opt_model, hf_tokenizer = get_hugging_face_model_and_tokenizer(model_id, use_optimum=True) - - model_path : Path = tmp_path / model_id - save_ov_model_from_optimum(opt_model, hf_tokenizer, model_path) - - cb_pipe = ContinuousBatchingPipeline(model_path, Tokenizer(model_path), scheduler_config, "CPU") - - outputs = cb_pipe.generate(["What is OpenVINO?"], generation_configs) - assert(len(outputs)) - for output in outputs: - assert(len(output.m_generation_ids)) - for sequence in output.m_generation_ids: - assert(sequence == "What is OpenVINO?") - - -@pytest.mark.precommit -@pytest.mark.parametrize("get_generation_config", [get_greedy, get_beam_search, get_multinomial_all_parameters], - ids=["greedy", "beam_search", "multinomial_all_parameters"]) -@pytest.mark.parametrize("max_num_batched_tokens", [2, 4, 256]) -def test_echo_with_generation_phase(tmp_path, get_generation_config, max_num_batched_tokens): - generation_config = get_generation_config() - generation_config.max_new_tokens = 10 - generation_config.echo = True - - scheduler_config = get_scheduler_config() - scheduler_config.max_num_batched_tokens = max_num_batched_tokens - generation_configs = [generation_config] - model_id : str = "facebook/opt-125m" - opt_model, hf_tokenizer = get_hugging_face_model_and_tokenizer(model_id, use_optimum=True) - - model_path : Path = tmp_path / model_id - save_ov_model_from_optimum(opt_model, hf_tokenizer, model_path) - - cb_pipe = ContinuousBatchingPipeline(model_path, Tokenizer(model_path), scheduler_config, "CPU") - outputs = cb_pipe.generate(["What is OpenVINO?"], generation_configs) - assert(len(outputs)) - - for output in outputs: - assert(len(output.m_generation_ids)) - for sequence in output.m_generation_ids: - assert(sequence.startswith("What is OpenVINO?")) - assert(len(sequence) > len("What is OpenVINO?")) diff --git a/tests/python_tests/test_vlm_pipeline.py b/tests/python_tests/test_vlm_pipeline.py index b4df6492bb..81c181bc54 100644 --- a/tests/python_tests/test_vlm_pipeline.py +++ b/tests/python_tests/test_vlm_pipeline.py @@ -6,8 +6,8 @@ import pytest import transformers from optimum.intel.openvino import OVModelForVisualCausalLM -from openvino_genai import VLMPipeline -from common import get_greedy, get_image_by_link, get_beam_search, get_greedy, get_multinomial_all_parameters +from openvino_genai import VLMPipeline, GenerationConfig +from common import get_image_by_link, get_beam_search, get_multinomial_all_parameters def get_ov_model(cache): model_dir = cache.mkdir("tiny-random-minicpmv-2_6") @@ -49,21 +49,22 @@ def streamer(word: str) -> bool: return False models_path = get_ov_model(cache) + generation_config = GenerationConfig(max_new_tokens=30) for links in image_links_for_testing: images = [] for link in links: images.append(get_image_by_link(link)) - pipe = VLMPipeline(models_path, "CPU") - pipe.start_chat() + ov_pipe 
= VLMPipeline(models_path, "CPU") + ov_pipe.start_chat() - pipe.generate(prompts[0], images=images, generation_config=get_greedy(), streamer=streamer) + ov_pipe.generate(prompts[0], images=images, generation_config=generation_config, streamer=streamer) for prompt in prompts[1:]: - pipe.generate(prompt, generation_config=get_greedy(), streamer=streamer) + ov_pipe.generate(prompt, generation_config=generation_config, streamer=streamer) - pipe.finish_chat() + ov_pipe.finish_chat() @pytest.mark.precommit @@ -95,7 +96,7 @@ def test_perf_metrics(cache): images = [get_image_by_link(image_links[0])] pipe = VLMPipeline(models_path, "CPU") - result = pipe.generate(prompts[0], images=images, generation_config=get_greedy()) + result = pipe.generate(prompts[0], images=images, generation_config=GenerationConfig(max_new_tokens=30)) perf_metrics = result.perf_metrics diff --git a/tools/who_what_benchmark/tests/test_cli_image.py b/tools/who_what_benchmark/tests/test_cli_image.py index 7b966f049e..fec9e96f4c 100644 --- a/tools/who_what_benchmark/tests/test_cli_image.py +++ b/tools/who_what_benchmark/tests/test_cli_image.py @@ -42,8 +42,8 @@ def teardown_module(): ("hf-internal-testing/tiny-stable-diffusion-torch", "text-to-image", "hf"), ("hf-internal-testing/tiny-stable-diffusion-torch", "text-to-image", "openvino"), ("hf-internal-testing/tiny-stable-diffusion-xl-pipe", "text-to-image", "hf"), - ("hf-internal-testing/tiny-stable-diffusion-torch", "image-inpainting", "hf"), - ("hf-internal-testing/tiny-stable-diffusion-xl-pipe", "image-inpainting", "hf"), + # ("hf-internal-testing/tiny-stable-diffusion-torch", "image-inpainting", "hf"), + # ("hf-internal-testing/tiny-stable-diffusion-xl-pipe", "image-inpainting", "hf"), ], ) def test_image_model_types(model_id, model_type, backend): @@ -88,7 +88,10 @@ def test_image_model_types(model_id, model_type, backend): @pytest.mark.parametrize( ("model_id", "model_type"), list(itertools.product(OV_IMAGE_MODELS, - ["image-to-image", "text-to-image", "image-inpainting"])), + ["image-to-image", + "text-to-image", + # "image-inpainting" + ])), ) def test_image_model_genai(model_id, model_type): with tempfile.TemporaryDirectory() as temp_dir: From 002f84fecf311ac453c5c298b619cafabfdadd80 Mon Sep 17 00:00:00 2001 From: Oleg Pipikin Date: Sat, 4 Jan 2025 01:15:17 +0100 Subject: [PATCH 5/5] Add slice before matmut transformation for CB scenario (#1261) CVS-154930 CVS-155533 --------- Co-authored-by: Ilya Lavrenov --- src/cpp/src/continuous_batching_impl.cpp | 14 ++--- src/cpp/src/llm_pipeline_stateful.cpp | 2 +- src/cpp/src/model_runner.hpp | 51 ++++++++++++++---- src/cpp/src/sampler.cpp | 13 +++-- src/cpp/src/sequence_group.hpp | 13 +++++ .../speculative_decoding_impl.cpp | 2 + src/cpp/src/utils.cpp | 53 ++++++++++++++----- src/cpp/src/utils.hpp | 4 +- .../utils/paged_attention_transformations.hpp | 2 + 9 files changed, 115 insertions(+), 39 deletions(-) diff --git a/src/cpp/src/continuous_batching_impl.cpp b/src/cpp/src/continuous_batching_impl.cpp index 7b076504d0..44bfaf7f21 100644 --- a/src/cpp/src/continuous_batching_impl.cpp +++ b/src/cpp/src/continuous_batching_impl.cpp @@ -28,6 +28,7 @@ ContinuousBatchingPipeline::ContinuousBatchingImpl::ContinuousBatchingImpl( bool is_need_per_layer_cache_control = scheduler_config.use_cache_eviction; utils::apply_paged_attention_transformations(model, device_config, is_need_per_layer_cache_control); + utils::apply_gather_before_matmul_transformation(model); initialize_pipeline(model, scheduler_config, properties, device_config, 
core); } @@ -444,7 +445,7 @@ void ContinuousBatchingPipeline::ContinuousBatchingImpl::_fill_prompt_log_probs( const float * logits_data = logits.data(); ov::Shape logits_shape = logits.get_shape(); OPENVINO_ASSERT(logits_shape.size() == 3); - size_t batch_seq_len = logits_shape[1], vocab_size = logits_shape[2]; + size_t vocab_size = logits_shape[2]; for (size_t sequence_group_id = 0, currently_processed_tokens = 0; sequence_group_id < sequence_groups.size(); ++sequence_group_id) { SequenceGroup::Ptr sequence_group = sequence_groups[sequence_group_id]; // requests not scheduled, in decoding phase or not echoing are not processed @@ -454,18 +455,17 @@ void ContinuousBatchingPipeline::ContinuousBatchingImpl::_fill_prompt_log_probs( size_t num_running_sequences = sequence_group->num_running_seqs(); OPENVINO_ASSERT(num_running_sequences == 1); - size_t actual_seq_len = sequence_group->get_num_scheduled_tokens(); - size_t padded_amount_of_processed_tokens = std::max(actual_seq_len, batch_seq_len); + size_t output_seq_len = sequence_group->get_output_seq_len(); const float * sequence_group_logits_data = logits_data + vocab_size * currently_processed_tokens; size_t num_prompt_tokens_processed = sequence_group->get_num_processed_tokens(); - OPENVINO_ASSERT(num_prompt_tokens_processed + actual_seq_len <= sequence_group->get_prompt_len()); + OPENVINO_ASSERT(num_prompt_tokens_processed + output_seq_len <= sequence_group->get_prompt_len()); // if we processed the whole prompt we don't include last logprob as it will be processed by the sampler (it's already completion) // otherwise we include it as it will be used in the next part of the prompt int exclude_last_logprob = 1; - if (num_prompt_tokens_processed + actual_seq_len < sequence_group->get_prompt_len()) + if (num_prompt_tokens_processed + output_seq_len < sequence_group->get_prompt_len()) exclude_last_logprob = 0; // if we start processing the prompt we add "fake" log prob for the first position (begin of sequence) @@ -473,7 +473,7 @@ void ContinuousBatchingPipeline::ContinuousBatchingImpl::_fill_prompt_log_probs( sequence_group->append_prompt_log_prob(1.0); for (int token_logits_offset = 0, token_id_offset = num_prompt_tokens_processed + 1; - token_logits_offset < actual_seq_len - exclude_last_logprob; + token_logits_offset < output_seq_len - exclude_last_logprob; token_logits_offset++, token_id_offset++) { const float* token_logits = (sequence_group_logits_data + token_logits_offset * vocab_size); @@ -498,7 +498,7 @@ void ContinuousBatchingPipeline::ContinuousBatchingImpl::_fill_prompt_log_probs( sequence_group->append_prompt_log_prob(token_logit - max_value - log_sum); } - currently_processed_tokens += padded_amount_of_processed_tokens * num_running_sequences; + currently_processed_tokens += output_seq_len * num_running_sequences; // For max_new_tokens == 0, we don't reach sampling so need to notify handle separately if(sequence_group->get_sampling_parameters().max_new_tokens == 0) { sequence_group->notify_handle_echo_only(); diff --git a/src/cpp/src/llm_pipeline_stateful.cpp b/src/cpp/src/llm_pipeline_stateful.cpp index 890afe2ab9..153fcc6fce 100644 --- a/src/cpp/src/llm_pipeline_stateful.cpp +++ b/src/cpp/src/llm_pipeline_stateful.cpp @@ -38,7 +38,7 @@ StatefulLLMPipeline::StatefulLLMPipeline( const ov::AnyMap& properties, const ov::genai::GenerationConfig& generation_config) : LLMPipelineImplBase(tokenizer, generation_config), m_sampler(m_tokenizer) { - utils::slice_matmul_stateful_model(model); + 
utils::apply_slice_before_matmul_transformation(model); m_kv_cache_seq_length_axis = ov::genai::utils::get_seq_len_axis(model); ov::CompiledModel compiled_model; diff --git a/src/cpp/src/model_runner.hpp b/src/cpp/src/model_runner.hpp index abc96ac423..27eee9e27d 100644 --- a/src/cpp/src/model_runner.hpp +++ b/src/cpp/src/model_runner.hpp @@ -114,28 +114,54 @@ class ModelRunner { subsequence_begins_data[0] = 0; block_indices_begins_data[0] = 0; + bool matmul_gathering_is_available = false; + size_t gathering_current_index = 0; + std::vector gather_indices_values; + try { + std::ignore = m_request.get_tensor("sampled_tokens_indices"); + matmul_gathering_is_available = true; + } catch (const ov::Exception&) {} + + for (size_t i = 0; i < num_sequence_groups; ++i) { size_t seq_group_id = scheduler_output.m_scheduled_sequence_groups_ids[i]; - SequenceGroup::CPtr sequence_group = sequence_groups[seq_group_id]; - std::vector running_sequences = sequence_group->get_running_sequences(); + SequenceGroup::Ptr sequence_group = sequence_groups[seq_group_id]; + std::vector running_sequences = sequence_group->get_running_sequences(); size_t num_running_sequences = running_sequences.size(); size_t num_scheduled_tokens = sequence_group->get_num_scheduled_tokens(); size_t group_position_id = sequence_group->get_num_processed_tokens(); + size_t prompt_len = sequence_group->get_prompt_len(); - // spec: In case of multiple input tokens for current sequence (prompt_len > 1), - // context_len corresponds to first token within subgroup of scheduled tokens - size_t group_context_len = group_position_id; + // Next variables are only for sliced matmul case + size_t output_seq_len = 0; + const bool echo_output = sequence_group->get_sampling_parameters().echo; + const bool sampling_is_required = sequence_group->requires_sampling(); + const size_t tokens_to_sample_per_sequence = 1 + sequence_group->get_num_tokens_to_validate(); for (size_t seq_id = 0; seq_id < num_running_sequences; ++seq_id) { + output_seq_len = 0; Sequence::CPtr sequence = running_sequences[seq_id]; - - for (size_t token_id = 0, position_id = group_position_id; token_id < num_scheduled_tokens; ++token_id, ++position_id) { + for (size_t token_id = 0, position_id = group_position_id; token_id < num_scheduled_tokens; ++token_id, ++position_id, ++gathering_current_index) { // compute token for current sequence - input_ids_data[token_id] = position_id < sequence_group->get_prompt_len() ? + input_ids_data[token_id] = position_id < prompt_len ? 
sequence_group->get_prompt_ids()[position_id] : - sequence->get_generated_ids()[position_id - sequence_group->get_prompt_len()]; + sequence->get_generated_ids()[position_id - prompt_len]; position_ids_data[token_id] = position_id; + + // Check if token gathering is required for the entire sequence group + if (matmul_gathering_is_available && (sampling_is_required || echo_output)) { + // Determine if the current token should be gathered + if (echo_output || + // Skip gathering for prompt tokens + group_position_id + token_id >= prompt_len - 1 && + // Gather only the last scheduled token or 1 + num_tokens_to_validate tokens for SD + // In SD, tokens_to_sample_per_sequence may exceed num_scheduled_tokens + token_id + tokens_to_sample_per_sequence >= num_scheduled_tokens) { + gather_indices_values.push_back(gathering_current_index); + output_seq_len++; + } + } } size_t expected_kv_cache_size = sequence_group->get_num_processed_tokens() - sequence_group->get_num_evicted_tokens(); @@ -153,6 +179,7 @@ class ModelRunner { subsequence_begins_data += 1; block_indices_begins_data += 1; } + sequence_group->set_output_seq_len(matmul_gathering_is_available ? output_seq_len : num_scheduled_tokens); } // typical LLM parameters @@ -168,6 +195,12 @@ class ModelRunner { m_request.set_tensor("block_indices_begins", block_indices_begins); m_request.set_tensor("max_context_len", max_context_len); + if (matmul_gathering_is_available) { + ov::Tensor gather_indices(ov::element::i64, {gather_indices_values.size()}); + std::memcpy(gather_indices.data(), gather_indices_values.data(), gather_indices_values.size() * sizeof(int64_t)); + m_request.set_tensor("sampled_tokens_indices", gather_indices); + } + // print_tensor("input_ids", input_ids); // print_tensor("position_ids", position_ids); diff --git a/src/cpp/src/sampler.cpp b/src/cpp/src/sampler.cpp index 9c18dc7721..b2e8add403 100644 --- a/src/cpp/src/sampler.cpp +++ b/src/cpp/src/sampler.cpp @@ -749,7 +749,7 @@ SamplerOutput Sampler::sample(std::vector & sequence_groups, const float * logits_data = logits.data(); ov::Shape logits_shape = logits.get_shape(); OPENVINO_ASSERT(logits_shape.size() == 3); - size_t batch_seq_len = logits_shape[1], vocab_size = logits_shape[2]; + size_t vocab_size = logits_shape[2]; SamplerOutput sampler_output; for (size_t sequence_group_id = 0, currently_processed_tokens = 0; sequence_group_id < sequence_groups.size(); ++sequence_group_id) { @@ -758,8 +758,7 @@ SamplerOutput Sampler::sample(std::vector & sequence_groups, continue; size_t num_running_sequences = sequence_group->num_running_seqs(); - size_t actual_seq_len = sequence_group->get_num_scheduled_tokens(); // points to a token which needs to be sampled - size_t padded_amount_of_processed_tokens = std::max(actual_seq_len, batch_seq_len); + size_t output_seq_len = sequence_group->get_output_seq_len(); const ov::genai::GenerationConfig& sampling_params = sequence_group->get_sampling_parameters(); const auto request_id = sequence_group->get_request_id(); @@ -774,13 +773,13 @@ SamplerOutput Sampler::sample(std::vector & sequence_groups, auto& stop_strings = m_stop_strings.at(request_id); auto& logit_processor = m_logit_processors.at(request_id); const void * sequence_group_logits_data = logits_data + vocab_size * currently_processed_tokens; - ov::Tensor sequence_group_logits(ov::element::f32, ov::Shape{num_running_sequences, actual_seq_len, vocab_size}, (void *)sequence_group_logits_data); + ov::Tensor sequence_group_logits(ov::element::f32, ov::Shape{num_running_sequences, 
output_seq_len, vocab_size}, (void *)sequence_group_logits_data); size_t max_removed_tokens_per_request = 0, min_generated_len = std::numeric_limits::max(), updated_validation_len = 0; if (sequence_group->requires_sampling()) { // get number of token to be validated auto num_tokens_to_process = sequence_group->get_num_tokens_to_validate(); - if (num_tokens_to_process > actual_seq_len - 1) { - auto delta = num_tokens_to_process - (actual_seq_len - 1); + if (num_tokens_to_process > output_seq_len - 1) { + auto delta = num_tokens_to_process - (output_seq_len - 1); updated_validation_len = std::max(updated_validation_len, delta); num_tokens_to_process -= delta; } @@ -914,7 +913,7 @@ SamplerOutput Sampler::sample(std::vector & sequence_groups, } // accumulate a number of processed tokens - currently_processed_tokens += padded_amount_of_processed_tokens * num_running_sequences; + currently_processed_tokens += output_seq_len * num_running_sequences; } return sampler_output; diff --git a/src/cpp/src/sequence_group.hpp b/src/cpp/src/sequence_group.hpp index 8f8d5f899e..14ce87c6f1 100644 --- a/src/cpp/src/sequence_group.hpp +++ b/src/cpp/src/sequence_group.hpp @@ -222,6 +222,8 @@ class SequenceGroup : public std::enable_shared_from_this { size_t m_num_validation_tokens = 0; // flag to enable/disable token generation, e.g. in speculative decoding scenario bool m_is_gen_paused = false; + // output seq len at current iteration + size_t m_output_seq_len = 0; size_t m_num_streamed_tokens = 0, m_stream_window_size = 0; @@ -394,6 +396,14 @@ class SequenceGroup : public std::enable_shared_from_this { return m_num_processed_tokens; } + size_t get_output_seq_len() const { + return m_output_seq_len; + } + + void set_output_seq_len(size_t len) { + m_output_seq_len = len; + } + /** * Registers within the sequence group that a given amount of tokens * has been evicted from the underlying KV cache. @@ -436,11 +446,14 @@ class SequenceGroup : public std::enable_shared_from_this { void schedule_tokens(size_t num_tokens) { m_num_scheduled_tokens = num_tokens; + // Unless otherwise specified, the sampler will process all scheduled tokens. + m_output_seq_len = num_tokens; } void clear_scheduled_tokens() { m_num_scheduled_tokens = 0; m_num_validation_tokens = 0; + m_output_seq_len = 0; } bool is_scheduled() const { diff --git a/src/cpp/src/speculative_decoding/speculative_decoding_impl.cpp b/src/cpp/src/speculative_decoding/speculative_decoding_impl.cpp index f749ac4e81..526c5df2d4 100644 --- a/src/cpp/src/speculative_decoding/speculative_decoding_impl.cpp +++ b/src/cpp/src/speculative_decoding/speculative_decoding_impl.cpp @@ -33,6 +33,8 @@ ContinuousBatchingPipeline::SpeculativeDecodingImpl::SpeculativeDecodingImpl(con utils::apply_paged_attention_transformations(main_model, main_model_desc.scheduler_config.use_cache_eviction); utils::apply_paged_attention_transformations(draft_model, main_model_desc.scheduler_config.use_cache_eviction); + utils::apply_gather_before_matmul_transformation(main_model); + utils::apply_gather_before_matmul_transformation(draft_model); std::string draft_device = draft_model_desc.device.empty() ? 
main_model_desc.device : draft_model_desc.device;
 
     bool is_draft_scheduler_undefined = draft_model_desc.scheduler_config == SchedulerConfig();
 
diff --git a/src/cpp/src/utils.cpp b/src/cpp/src/utils.cpp
index 52faae02e9..9261aa7a4a 100644
--- a/src/cpp/src/utils.cpp
+++ b/src/cpp/src/utils.cpp
@@ -4,9 +4,11 @@
 #include "utils.hpp"
 
 #include 
+#include 
 
 #include "openvino/op/add.hpp"
 #include "openvino/op/divide.hpp"
+#include "openvino/op/gather.hpp"
 #include "openvino/op/multiply.hpp"
 #include "openvino/op/matmul.hpp"
 #include "openvino/op/slice.hpp"
@@ -230,23 +232,34 @@ ov::genai::TokenizedInputs subtract_chat_tokenized_inputs(const ov::genai::Token
     return {new_input_ids, new_attention_mask};
 }
 
-void slice_matmul_stateful_model(std::shared_ptr<ov::Model> model) {
-    auto last_node = model->output(0).get_node()->input_value(0).get_node();
-    ov::Node* matmul = dynamic_cast<ov::op::v0::MatMul*>(last_node);
-    if (matmul) {
-        // we have found matmul, do nothing
-    } else if(auto add = dynamic_cast<ov::op::v1::Add*>(last_node)) {
-        matmul = dynamic_cast<ov::op::v0::MatMul*>(add->input_value(0).get_node());
-    } else if (auto transpose = dynamic_cast<ov::op::v1::Transpose*>(last_node)) {
-        matmul = dynamic_cast<ov::op::v0::MatMul*>(transpose->input_value(0).get_node());
-    } else if (auto multiply = dynamic_cast<ov::op::v1::Multiply*>(last_node)) {
-        if (auto tanh = dynamic_cast<ov::op::v0::Tanh*>(multiply->input_value(0).get_node())) {
-            if (auto divide = dynamic_cast<ov::op::v1::Divide*>(tanh->input_value(0).get_node())) {
-                matmul = dynamic_cast<ov::op::v0::MatMul*>(divide->input_value(0).get_node());
+namespace {
+std::shared_ptr<ov::op::v0::MatMul> find_llm_matmul(const std::shared_ptr<ov::Model>& model) {
+    auto last_node = model->output(0).get_node()->input_value(0).get_node_shared_ptr();
+    std::shared_ptr<ov::op::v0::MatMul> matmul = std::dynamic_pointer_cast<ov::op::v0::MatMul>(last_node);
+    // There are several patterns for matmul we are looking for:
+    // MatMul -> Result
+    // MatMul -> Add -> Result
+    // MatMul -> Transpose -> Result
+    // MatMul -> Divide -> Tanh -> Multiply -> Result
+    if (!matmul) {
+        if (auto add = std::dynamic_pointer_cast<ov::op::v1::Add>(last_node)) {
+            matmul = std::dynamic_pointer_cast<ov::op::v0::MatMul>(add->input_value(0).get_node_shared_ptr());
+        } else if (auto transpose = std::dynamic_pointer_cast<ov::op::v1::Transpose>(last_node)) {
+            matmul = std::dynamic_pointer_cast<ov::op::v0::MatMul>(transpose->input_value(0).get_node_shared_ptr());
+        } else if (auto multiply = std::dynamic_pointer_cast<ov::op::v1::Multiply>(last_node)) {
+            if (auto tanh = std::dynamic_pointer_cast<ov::op::v0::Tanh>(multiply->input_value(0).get_node_shared_ptr())) {
+                if (auto divide = std::dynamic_pointer_cast<ov::op::v1::Divide>(tanh->input_value(0).get_node_shared_ptr())) {
+                    matmul = std::dynamic_pointer_cast<ov::op::v0::MatMul>(divide->input_value(0).get_node_shared_ptr());
+                }
             }
         }
     }
+    return matmul;
+}
+} // namespace
 
+void apply_slice_before_matmul_transformation(std::shared_ptr<ov::Model> model) {
+    auto matmul = find_llm_matmul(model);
     if (matmul && matmul->input(0).get_partial_shape().rank().get_length() == 3) {
         auto start = std::make_shared<ov::op::v0::Constant>(ov::element::i64, ov::Shape{1}, std::vector<int64_t>{-1});
         auto stop = std::make_shared<ov::op::v0::Constant>(ov::element::i64, ov::Shape{1}, std::vector<int64_t>{-2});
@@ -257,6 +270,19 @@ void slice_matmul_stateful_model(std::shared_ptr<ov::Model> model) {
     }
 }
 
+void apply_gather_before_matmul_transformation(std::shared_ptr<ov::Model> model) {
+    auto matmul = ov::genai::utils::find_llm_matmul(model);
+    if (matmul && matmul->input(0).get_partial_shape().rank().get_length() == 3) {
+        auto indices = std::make_shared<ov::op::v0::Parameter>(ov::element::i64, ov::PartialShape{-1});
+        indices->set_friendly_name("sampled_tokens_indices");
+        indices->output(0).get_tensor().set_names({"sampled_tokens_indices"});
+        auto axis = std::make_shared<ov::op::v0::Constant>(ov::element::i64, ov::Shape{1}, std::vector<int64_t>{0});
+        auto gather = std::make_shared<ov::op::v8::Gather>(matmul->input_value(0), indices, axis);
+        matmul->input(0).replace_source_output(gather);
+        model->add_parameters({indices});
+    }
+}
+
 template <typename T>
 void read_rt_info(std::shared_ptr<ov::Model>& model, const char* name, T& value) {
     if (!model)
@@ -396,7 +422,6 @@ void print_compiled_model_properties(ov::CompiledModel& compiled_Model, const ch
         }
     }
 }
-
 }  // namespace utils
 }  // namespace genai
 }  // namespace ov
diff --git a/src/cpp/src/utils.hpp b/src/cpp/src/utils.hpp
index af9d889115..ad0e1a05d4 100644
--- a/src/cpp/src/utils.hpp
+++ b/src/cpp/src/utils.hpp
@@ -99,7 +99,9 @@ std::pair<ov::AnyMap, SchedulerConfig> split_scheduler_config(const ov::AnyMap&
 ov::genai::TokenizedInputs subtract_chat_tokenized_inputs(const ov::genai::TokenizedInputs& minuend, const ov::genai::TokenizedInputs& subtrahend);
 
-void slice_matmul_stateful_model(std::shared_ptr<ov::Model> model);
+void apply_slice_before_matmul_transformation(std::shared_ptr<ov::Model> model);
+
+void apply_gather_before_matmul_transformation(std::shared_ptr<ov::Model> model);
 
 ov::Core singleton_core();
 
diff --git a/src/cpp/src/utils/paged_attention_transformations.hpp b/src/cpp/src/utils/paged_attention_transformations.hpp
index 88ac0876c5..2cb32adcdc 100644
--- a/src/cpp/src/utils/paged_attention_transformations.hpp
+++ b/src/cpp/src/utils/paged_attention_transformations.hpp
@@ -27,6 +27,8 @@ size_t get_hidden_size(const std::shared_ptr<ov::Model> model);
 
 void set_kv_cache_type_and_shape(std::shared_ptr<ov::Model> model, DeviceConfig& device_config);
 
+void apply_gather_before_matmul_transformation(std::shared_ptr<ov::Model> model);
+
 } // namespace utils
 } // namespace genai
 } // namespace ov
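
For context, the runtime contract introduced by apply_gather_before_matmul_transformation() is that the transformed model gains an extra i64 input named "sampled_tokens_indices"; the ModelRunner changes in this patch fill it with the flattened positions whose logits are actually needed, so the LM-head MatMul produces logits only for the gathered positions (the get_output_seq_len() bookkeeping above accounts for this). The following is a minimal standalone sketch of feeding that input through the public OpenVINO C++ API, not part of the patch: the model path, the output tensor name "logits", and the omission of the remaining pipeline inputs are illustrative assumptions only.

// Sketch only: assumes "transformed_llm.xml" is an LLM IR that already went through
// apply_paged_attention_transformations() and apply_gather_before_matmul_transformation(),
// so it exposes the extra "sampled_tokens_indices" parameter added above.
#include <openvino/openvino.hpp>

#include <cstring>
#include <vector>

int main() {
    ov::Core core;
    ov::CompiledModel compiled = core.compile_model("transformed_llm.xml", "CPU");  // hypothetical path
    ov::InferRequest request = compiled.create_infer_request();

    // Suppose 8 prompt tokens are scheduled in this step and only the last position
    // needs to be sampled; request a single gathered row instead of all 8.
    std::vector<int64_t> indices{7};
    ov::Tensor sampled_tokens_indices(ov::element::i64, ov::Shape{indices.size()});
    std::memcpy(sampled_tokens_indices.data(), indices.data(), indices.size() * sizeof(int64_t));
    request.set_tensor("sampled_tokens_indices", sampled_tokens_indices);

    // input_ids, position_ids and the PagedAttention-specific inputs would be set here,
    // exactly as the ModelRunner changes in this patch do.
    request.infer();

    // The logits tensor now covers only the gathered positions rather than every
    // scheduled token, which is what the sampler reads via get_output_seq_len().
    ov::Tensor logits = request.get_tensor("logits");
    return 0;
}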