diff --git a/src/plugins/intel_npu/src/al/include/intel_npu/config/npuw.hpp b/src/plugins/intel_npu/src/al/include/intel_npu/config/npuw.hpp
index 927b234df8ba15..340633f5480a51 100644
--- a/src/plugins/intel_npu/src/al/include/intel_npu/config/npuw.hpp
+++ b/src/plugins/intel_npu/src/al/include/intel_npu/config/npuw.hpp
@@ -28,6 +28,34 @@ void registerNPUWLLMOptions(OptionsDesc& desc);
         static OptionMode mode() { return OptionMode::Mode; } \
     };
 
+template <typename T>
+std::vector<T> vector_from_string(const std::string& val) {
+    std::vector<T> result;
+    if (!val.empty()) {
+        if (val[0] == '[') {  // OV-serialized vector
+            std::istringstream is(val);
+            ov::util::Read<std::vector<T>>()(is, result);
+        } else {
+            result = OptionParser<std::vector<T>>::parse(val);
+        }
+    }
+    return result;
+}
+
+template <typename K, typename V>
+std::map<K, V> map_from_string(const std::string& val) {
+    std::map<K, V> result;
+    if (!val.empty()) {
+        if (val[0] == '{') {  // OV-serialized map
+            std::istringstream is(val);
+            ov::util::Read<std::map<K, V>>()(is, result);
+        } else {
+            result = OptionParser<std::map<K, V>>::parse(val);
+        }
+    }
+    return result;
+}
+
 DEFINE_OPT(NPU_USE_NPUW, bool, false, use_npuw, CompileTime);
 DEFINE_OPT(NPUW_DEVICES, std::string, "NPU,CPU", npuw::devices, CompileTime);
 DEFINE_OPT(NPUW_SUBMODEL_DEVICE, std::string, "", npuw::submodel_device, CompileTime);
@@ -70,6 +98,7 @@ DEFINE_OPT(NPUW_DUMP_IO_ITERS, bool, false, npuw::dump::io_iters, RunTime);
 DEFINE_OPT(NPUW_LLM, bool, false, npuw::llm::enabled, CompileTime);
 DEFINE_OPT(NPUW_LLM_MAX_PROMPT_LEN, uint32_t, 1024, npuw::llm::max_prompt_len, CompileTime);
 DEFINE_OPT(NPUW_LLM_MIN_RESPONSE_LEN, uint32_t, 128, npuw::llm::min_response_len, CompileTime);
+DEFINE_OPT(NPUW_LLM_PAD_TOKEN_ID, int64_t, 0, npuw::llm::pad_token_id, CompileTime);
 
 namespace npuw {
 namespace llm {
diff --git a/src/plugins/intel_npu/src/al/include/intel_npu/npuw_private_properties.hpp b/src/plugins/intel_npu/src/al/include/intel_npu/npuw_private_properties.hpp
index a416ca51233893..4ad2cc0d438347 100644
--- a/src/plugins/intel_npu/src/al/include/intel_npu/npuw_private_properties.hpp
+++ b/src/plugins/intel_npu/src/al/include/intel_npu/npuw_private_properties.hpp
@@ -412,6 +412,14 @@ static constexpr ov::Property<uint32_t> max_prompt_len{"NPUW_LLM_MAX_PROMPT_LEN"
  */
 static constexpr ov::Property<uint32_t> min_response_len{"NPUW_LLM_MIN_RESPONSE_LEN"};
 
+/**
+ * @brief
+ * Type: std::map<std::string, ov::Any>.
+ * Tell NPUW the configuration for compilation of prefill model.
+ * NOTE: !! Write-only !!
+ */
+static constexpr ov::Property<ov::AnyMap> prefill_config{"NPUW_LLM_PREFILL_CONFIG"};
+
 /**
  * @brief
  * Type: std::string.
@@ -421,6 +429,21 @@ static constexpr ov::Property<uint32_t> min_response_len{"NPUW_LLM_MIN_RESPONSE_
  */
 static constexpr ov::Property<std::string> generate_hint{"NPUW_LLM_GENERATE_HINT"};
 
+/**
+ * @brief
+ * Type: std::map<std::string, ov::Any>.
+ * Tell NPUW the configuration for compilation of generate model.
+ * NOTE: !! Write-only !!
+ */
+static constexpr ov::Property<ov::AnyMap> generate_config{"NPUW_LLM_GENERATE_CONFIG"};
+
+/**
+ * @brief
+ * Type: int64_t.
+ * Pad token ID to fill input token ids in the conversation mode.
+ * Default: 0.
+ */
+static constexpr ov::Property<int64_t> pad_token_id{"NPUW_LLM_PAD_TOKEN_ID"};
 
 }  // namespace llm
 }  // namespace npuw
diff --git a/src/plugins/intel_npu/src/al/src/config/npuw.cpp b/src/plugins/intel_npu/src/al/src/config/npuw.cpp
index 4ee9e392406452..e651d00c02552f 100644
--- a/src/plugins/intel_npu/src/al/src/config/npuw.cpp
+++ b/src/plugins/intel_npu/src/al/src/config/npuw.cpp
@@ -61,4 +61,5 @@ void intel_npu::registerNPUWLLMOptions(OptionsDesc& desc) {
     desc.add<NPUW_LLM_MAX_PROMPT_LEN>();
     desc.add<NPUW_LLM_MIN_RESPONSE_LEN>();
     desc.add<NPUW_LLM_GENERATE_HINT>();
+    desc.add<NPUW_LLM_PAD_TOKEN_ID>();
 }
diff --git a/src/plugins/intel_npu/src/plugin/npuw/llm_compiled_model.cpp b/src/plugins/intel_npu/src/plugin/npuw/llm_compiled_model.cpp
index 260a1c444284cb..8f790a2e7c7561 100644
--- a/src/plugins/intel_npu/src/plugin/npuw/llm_compiled_model.cpp
+++ b/src/plugins/intel_npu/src/plugin/npuw/llm_compiled_model.cpp
@@ -134,10 +134,9 @@ std::optional<ov::Any> get_option(ov::AnyMap& config, const std::string& option_name)
 }
 
 template <typename T>
-T pop_or_default(ov::AnyMap& config, const std::string& key, const T& default_value) {
-    auto anyopt = pop_option(config, key);
-    if (anyopt.has_value()) {
-        return anyopt.value().as<T>();
+T opt_or_default(const std::optional<ov::Any>& opt, const T& default_value) {
+    if (opt.has_value()) {
+        return opt.value().as<T>();
     }
     return default_value;
 }
@@ -206,12 +205,6 @@ void merge_config_with(ov::AnyMap& lhs, const ov::AnyMap& rhs) {
     }
 }
 
-void drop_cache_dir(ov::AnyMap& config) {
-    if (config.count("NPU_USE_NPUW") != 0u) {
-        pop_option(config, "CACHE_DIR");
-    }
-}
-
 void split_llm_properties(const ov::AnyMap& properties, ov::AnyMap& llm_properties, ov::AnyMap& other_properties) {
     for (auto it = properties.begin(); it != properties.end(); ++it) {
         if (it->first.find("NPUW_LLM") != it->first.npos) {
@@ -245,6 +238,15 @@ ov::npuw::LLMCompiledModel::LLMCompiledModel(const std::shared_ptr<ov::Model>& m
     std::map<std::string, ov::Any> npuw_llm_props;
     std::map<std::string, ov::Any> other_props;
     split_llm_properties(properties, npuw_llm_props, other_props);
+
+    auto npudesc = extract_npu_descriptor(plugin);
+
+    // Remove "NPUW_LLM_PREFILL_CONFIG" and "NPUW_LLM_GENERATE_CONFIG" from the map,
+    // so they are not passed into the ::intel_npu::Config object, as we don't need
+    // to preserve them there.
+    auto prefill_config_opt = pop_option(npuw_llm_props, std::string("NPUW_LLM_PREFILL_CONFIG"));
+    auto generate_config_opt = pop_option(npuw_llm_props, std::string("NPUW_LLM_GENERATE_CONFIG"));
+
     m_cfg.update(any_copy(npuw_llm_props));
 
     LOG_DEBUG("1. Creating kvcache model as clone of passed one.");
@@ -258,7 +260,7 @@ ov::npuw::LLMCompiledModel::LLMCompiledModel(const std::shared_ptr<ov::Model>& m
     LOG_DEBUG("4. Converting KV-cache in prefill model to FP16.");
     prefill_model = cvt_kvcache_to_fp16(prefill_model);
 
-    LOG_DEBUG("5. Optimize kvcache kvcache model to output key/values for new token.");
+    LOG_DEBUG("5. Optimize kvcache model to output key/values for new token.");
     kvcache_model = redirect_new_kv_to_output(kvcache_model);
     LOG_DEBUG("6. Converting KV-cache in kvcache model to FP16.");
     kvcache_model = cvt_kvcache_to_fp16(kvcache_model);
@@ -273,19 +275,20 @@ ov::npuw::LLMCompiledModel::LLMCompiledModel(const std::shared_ptr<ov::Model>& m
     LOG_DEBUG("8. Make kvcache model with static shapes");
     reshape_to_static(kvcache_model, 1u, m_kvcache_desc.total_size, axes);
 
-    auto npudesc = extract_npu_descriptor(plugin);
-
     ov::AnyMap properties_copy = std::move(other_props);
-    auto prefill_config = get_default_prefill_config(model, npudesc);
-    // NB: GENERATE_HINT is only applicable for default generate config!
+
+    auto prefill_config = opt_or_default(prefill_config_opt, get_default_prefill_config(prefill_model, npudesc));
+
     const ::intel_npu::npuw::llm::GenerateHint generate_hint = m_cfg.get<::intel_npu::NPUW_LLM_GENERATE_HINT>();
     LOG_DEBUG("9. Passed GENERATE_HINT: " << std::string(::intel_npu::NPUW_LLM_GENERATE_HINT::toString(generate_hint)));
-    auto generate_config = get_default_generate_config(model, npudesc, generate_hint);
+    // NB: GENERATE_HINT is only applicable for default generate config!
+    if (generate_config_opt.has_value() && npuw_llm_props.count(ov::intel_npu::npuw::llm::generate_hint.name())) {
+        OPENVINO_THROW("GENERATE_HINT is only applicable for default generate config!");
+    }
+    auto generate_config = opt_or_default(generate_config_opt, get_default_generate_config(model, npudesc, generate_hint));
+
     merge_config_with(prefill_config, properties_copy);
     merge_config_with(generate_config, properties_copy);
-    // FIXME: Drop CACHE_DIR option if NPUW is enabled
-    drop_cache_dir(prefill_config);
-    drop_cache_dir(generate_config);
 
     m_kvcache_compiled = std::make_shared<ov::npuw::CompiledModel>(kvcache_model, plugin, generate_config);
     m_prefill_compiled = std::make_shared<ov::npuw::CompiledModel>(prefill_model, plugin, prefill_config);
@@ -308,6 +311,10 @@ void ov::npuw::LLMCompiledModel::set_property(const ov::AnyMap& properties) {
 
 ov::Any ov::npuw::LLMCompiledModel::get_property(const std::string& name) const {
     OPENVINO_SUPPRESS_DEPRECATED_START
+    if (name == ov::intel_npu::npuw::llm::prefill_config.name() || name == ov::intel_npu::npuw::llm::generate_config.name()) {
+        OPENVINO_THROW(name, " is write-only option!");
+    }
+
     auto&& configIterator = m_prop_to_opt.find(name);
     if (configIterator != m_prop_to_opt.cend()) {
         return std::get<1>(configIterator->second)(m_cfg);
@@ -324,7 +331,7 @@ std::shared_ptr<ov::ISyncInferRequest> ov::npuw::LLMCompiledModel::create_sync_i
 
 std::shared_ptr<ov::ISyncInferRequest> ov::npuw::LLMCompiledModel::create_llm_infer_request() {
     auto this_sptr = std::static_pointer_cast<LLMCompiledModel>(shared_from_this());
-    return std::make_shared<LLMInferRequest>(this_sptr, m_kvcache_desc);
+    return std::make_shared<LLMInferRequest>(this_sptr);
 }
 
 void ov::npuw::LLMCompiledModel::implement_properties() {
@@ -341,6 +348,7 @@ void ov::npuw::LLMCompiledModel::implement_properties() {
                           BIND(npuw::llm::model_desc, NPUW_LLM_MODEL_DESC, getString),
                           BIND(npuw::llm::max_prompt_len, NPUW_LLM_MAX_PROMPT_LEN, get),
                           BIND(npuw::llm::min_response_len, NPUW_LLM_MIN_RESPONSE_LEN, get),
-                          BIND(npuw::llm::generate_hint, NPUW_LLM_GENERATE_HINT, getString)});
+                          BIND(npuw::llm::generate_hint, NPUW_LLM_GENERATE_HINT, getString),
+                          BIND(npuw::llm::pad_token_id, NPUW_LLM_PAD_TOKEN_ID, get)});
 #undef BIND
 }
diff --git a/src/plugins/intel_npu/src/plugin/npuw/llm_infer_request.cpp b/src/plugins/intel_npu/src/plugin/npuw/llm_infer_request.cpp
index a73478c0cab5d2..d3147669569b65 100644
--- a/src/plugins/intel_npu/src/plugin/npuw/llm_infer_request.cpp
+++ b/src/plugins/intel_npu/src/plugin/npuw/llm_infer_request.cpp
@@ -29,10 +29,8 @@ ov::SoPtr<ov::ITensor> make_tensor_slice(ov::SoPtr<ov::ITensor> tensor,
 }
 }  // anonymous namespace
 
-ov::npuw::LLMInferRequest::LLMInferRequest(const std::shared_ptr<ov::npuw::LLMCompiledModel>& compiled_model,
-                                           const ov::npuw::LLMCompiledModel::KVCacheDesc& kvcache_desc)
-    : ov::ISyncInferRequest(compiled_model),
-      m_kvcache_desc(kvcache_desc) {
+ov::npuw::LLMInferRequest::LLMInferRequest(const std::shared_ptr<ov::npuw::LLMCompiledModel>& compiled_model)
+    : ov::ISyncInferRequest(compiled_model) {
     m_kvcache_request = compiled_model->m_kvcache_compiled->create_infer_request();
     m_prefill_request = compiled_model->m_prefill_compiled->create_infer_request();
 
@@ -52,13 +50,12 @@ ov::npuw::LLMInferRequest::LLMInferRequest(const std::shared_ptr<ov::npuw::LLMCo
 }
 
 void ov::npuw::LLMInferRequest::prepare_for_new_conversation() {
-    fill_tensor<int64_t>(m_prefill_request->get_tensor(m_prefill_in_ports.at("input_ids")), 0u);
-    fill_tensor<int64_t>(m_prefill_request->get_tensor(m_prefill_in_ports.at("attention_mask")), 0u);
-    fill_tensor<int64_t>(m_prefill_request->get_tensor(m_prefill_in_ports.at("position_ids")), 0u);
-    fill_tensor<int64_t>(m_kvcache_request->get_tensor(m_kvcache_in_ports.at("attention_mask")), 0u);
-    m_kvcache_desc.num_stored_tokens = 0u;
+    fill_tensor<int64_t>(m_prefill_request->get_tensor(m_prefill_in_ports.at("input_ids")),
+                         m_npuw_llm_compiled_model->m_cfg.get<::intel_npu::NPUW_LLM_PAD_TOKEN_ID>());
+    fill_tensor<int64_t>(m_prefill_request->get_tensor(m_prefill_in_ports.at("attention_mask")), 0);
+    fill_tensor<int64_t>(m_prefill_request->get_tensor(m_prefill_in_ports.at("position_ids")), 0);
+    fill_tensor<int64_t>(m_kvcache_request->get_tensor(m_kvcache_in_ports.at("attention_mask")), 0);
+    m_npuw_llm_compiled_model->m_kvcache_desc.num_stored_tokens = 0u;
 }
 
 void ov::npuw::LLMInferRequest::infer_prefill(ov::SoPtr<ov::ITensor> input_ids,
@@ -82,7 +79,7 @@ void ov::npuw::LLMInferRequest::infer_prefill(ov::SoPtr<ov::ITensor> input_ids,
     std::copy_n(position_ids->data<int64_t>(), position_ids->get_size(), padded_position_ids->data<int64_t>() + offset);
 
     m_prefill_request->infer();
-    m_kvcache_desc.num_stored_tokens += static_cast<uint32_t>(input_ids->get_size());
+    m_npuw_llm_compiled_model->m_kvcache_desc.num_stored_tokens += static_cast<uint32_t>(input_ids->get_size());
     m_need_copy_kvcache = true;
 
     m_logits = m_prefill_request->get_tensor(m_prefill_out_ports.at("logits"));
@@ -96,8 +93,9 @@ void ov::npuw::LLMInferRequest::infer_generate(ov::SoPtr<ov::ITensor> input_ids,
     LOG_DEBUG("Calling inference for generate model...");
     LOG_BLOCK();
 
+    auto& kvcache_desc = m_npuw_llm_compiled_model->m_kvcache_desc;
     // NB: KV-cache is full, further generation is impossible
-    if (m_kvcache_desc.num_stored_tokens == m_kvcache_desc.total_size) {
+    if (kvcache_desc.num_stored_tokens == kvcache_desc.total_size) {
         OPENVINO_THROW("KV-Cache is full.");
     }
 
@@ -118,19 +116,19 @@ void ov::npuw::LLMInferRequest::infer_generate(ov::SoPtr<ov::ITensor> input_ids,
 
             auto prefill_out_slice = make_tensor_slice(prefill_out_tensor,
-                                                       m_kvcache_desc.dim,
-                                                       m_kvcache_desc.max_prompt_size - m_kvcache_desc.num_stored_tokens,
-                                                       m_kvcache_desc.max_prompt_size);
+                                                       kvcache_desc.dim,
+                                                       kvcache_desc.max_prompt_size - kvcache_desc.num_stored_tokens,
+                                                       kvcache_desc.max_prompt_size);
 
             auto kvcache_in_slice =
-                make_tensor_slice(kvcache_in_tensor, m_kvcache_desc.dim, 0u, m_kvcache_desc.num_stored_tokens);
+                make_tensor_slice(kvcache_in_tensor, kvcache_desc.dim, 0u, kvcache_desc.num_stored_tokens);
 
             prefill_out_slice->copy_to(kvcache_in_slice._ptr);
         }
 
         LOG_DEBUG("Prepare attention mask pattern.");
         auto* attention_mask_data = m_kvcache_request->get_tensor(m_kvcache_in_ports.at("attention_mask"))->data<int64_t>();
-        attention_mask_data[m_kvcache_desc.total_size - 1] = 1;
+        attention_mask_data[kvcache_desc.total_size - 1] = 1;
 
         m_need_copy_kvcache = false;
     }
@@ -147,7 +145,7 @@ void ov::npuw::LLMInferRequest::infer_generate(ov::SoPtr<ov::ITensor> input_ids,
 
     m_kvcache_request->infer();
     m_logits = m_kvcache_request->get_tensor(m_kvcache_out_ports.at("logits"));
-    m_kvcache_desc.num_stored_tokens += 1;
+    kvcache_desc.num_stored_tokens += 1;
 
     LOG_DEBUG("Write KV-cache for the new token to the correct input position for next iteration.");
     const std::size_t kStartOutputKVCacheLayers = 1u;
@@ -157,9 +155,9 @@ void ov::npuw::LLMInferRequest::infer_generate(ov::SoPtr<ov::ITensor> input_ids,
         const auto& input_name = std::regex_replace(output_name, std::regex("present"), "past_key_values");
         auto kvcache_in_tensor = m_kvcache_request->get_tensor(m_kvcache_in_ports.at(input_name));
         auto kvcache_in_slice = make_tensor_slice(kvcache_in_tensor,
-                                                  m_kvcache_desc.dim,
-                                                  m_kvcache_desc.num_stored_tokens - 1,
-                                                  m_kvcache_desc.num_stored_tokens);
+                                                  kvcache_desc.dim,
+                                                  kvcache_desc.num_stored_tokens - 1,
+                                                  kvcache_desc.num_stored_tokens);
         auto kvcache_out_tensor = m_kvcache_request->get_tensor(m_kvcache_out_ports.at(output_name));
         kvcache_out_tensor->copy_to(kvcache_in_slice._ptr);
     }
diff --git a/src/plugins/intel_npu/src/plugin/npuw/llm_infer_request.hpp b/src/plugins/intel_npu/src/plugin/npuw/llm_infer_request.hpp
index fbc6c702c4b62a..ea0b83d271b58e 100644
--- a/src/plugins/intel_npu/src/plugin/npuw/llm_infer_request.hpp
+++ b/src/plugins/intel_npu/src/plugin/npuw/llm_infer_request.hpp
@@ -15,8 +15,7 @@ namespace npuw {
 
 class LLMInferRequest final : public ov::ISyncInferRequest {
 public:
-    explicit LLMInferRequest(const std::shared_ptr<LLMCompiledModel>& compiled_model,
-                             const ov::npuw::LLMCompiledModel::KVCacheDesc& kvcache_desc);
+    explicit LLMInferRequest(const std::shared_ptr<LLMCompiledModel>& compiled_model);
 
     void infer() override;
 
@@ -44,7 +43,7 @@ class LLMInferRequest final : public ov::ISyncInferRequest {
 
     std::shared_ptr<ov::IAsyncInferRequest> m_kvcache_request;
     std::shared_ptr<ov::IAsyncInferRequest> m_prefill_request;
-    LLMCompiledModel::KVCacheDesc m_kvcache_desc;
+    std::shared_ptr<LLMCompiledModel> m_npuw_llm_compiled_model;
     ov::SoPtr<ov::ITensor> m_logits;
 
     bool m_need_copy_kvcache = false;