Skip to content

Commit

Permalink
Added possibility to pass PREFILL/GENERATE configs and pad_token_id
Browse files Browse the repository at this point in the history
  • Loading branch information
AsyaPronina committed Dec 20, 2024
1 parent c1c66ce commit e38b474
Show file tree
Hide file tree
Showing 6 changed files with 104 additions and 46 deletions.
29 changes: 29 additions & 0 deletions src/plugins/intel_npu/src/al/include/intel_npu/config/npuw.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,34 @@ void registerNPUWLLMOptions(OptionsDesc& desc);
static OptionMode mode() { return OptionMode::Mode; } \
};

/**
 * @brief Builds a std::vector<T> from its textual form.
 *
 * An empty string produces an empty vector. A string whose first character
 * is '[' is read through ov::util::Read (the OV-serialized vector form);
 * any other string is handed to OptionParser<std::vector<T>>::parse.
 *
 * @param val Textual representation of the vector.
 * @return The parsed vector.
 */
template <typename T>
std::vector<T> vector_from_string(const std::string& val) {
    if (val.empty()) {
        return {};
    }
    if (val.front() != '[') {
        // Not the OV-serialized form: defer to the regular option parser.
        return OptionParser<std::vector<T>>::parse(val);
    }
    // OV-serialized vector
    std::vector<T> parsed;
    std::istringstream stream(val);
    ov::util::Read<std::vector<T>>()(stream, parsed);
    return parsed;
}

/**
 * @brief Builds a std::map<K, V> from its textual form.
 *
 * An empty string produces an empty map. A string whose first character
 * is '{' is read through ov::util::Read (the OV-serialized map form);
 * any other string is handed to OptionParser<std::map<K, V>>::parse.
 *
 * @param val Textual representation of the map.
 * @return The parsed map.
 */
template <typename K, typename V>
std::map<K, V> map_from_string(const std::string& val) {
    if (val.empty()) {
        return {};
    }
    if (val.front() != '{') {
        // Not the OV-serialized form: defer to the regular option parser.
        return OptionParser<std::map<K, V>>::parse(val);
    }
    // OV-serialized map
    std::map<K, V> parsed;
    std::istringstream stream(val);
    ov::util::Read<std::map<K, V>>()(stream, parsed);
    return parsed;
}

DEFINE_OPT(NPU_USE_NPUW, bool, false, use_npuw, CompileTime);
DEFINE_OPT(NPUW_DEVICES, std::string, "NPU,CPU", npuw::devices, CompileTime);
DEFINE_OPT(NPUW_SUBMODEL_DEVICE, std::string, "", npuw::submodel_device, CompileTime);
Expand Down Expand Up @@ -70,6 +98,7 @@ DEFINE_OPT(NPUW_DUMP_IO_ITERS, bool, false, npuw::dump::io_iters, RunTime);
DEFINE_OPT(NPUW_LLM, bool, false, npuw::llm::enabled, CompileTime);
DEFINE_OPT(NPUW_LLM_MAX_PROMPT_LEN, uint32_t, 1024, npuw::llm::max_prompt_len, CompileTime);
DEFINE_OPT(NPUW_LLM_MIN_RESPONSE_LEN, uint32_t, 128, npuw::llm::min_response_len, CompileTime);
DEFINE_OPT(NPUW_LLM_PAD_TOKEN_ID, int64_t, 0, npuw::llm::pad_token_id, CompileTime);

namespace npuw {
namespace llm {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -412,6 +412,14 @@ static constexpr ov::Property<uint32_t> max_prompt_len{"NPUW_LLM_MAX_PROMPT_LEN"
*/
static constexpr ov::Property<uint32_t> min_response_len{"NPUW_LLM_MIN_RESPONSE_LEN"};

/**
* @brief
* Type: std::map<std::string, ov::Any>.
* Tell NPUW the configuration for compilation of prefill model.
* NOTE: !! Write-only !!
*/
static constexpr ov::Property<std::string> prefill_config{"NPUW_LLM_PREFILL_CONFIG"};

/**
* @brief
* Type: std::string.
Expand All @@ -421,6 +429,21 @@ static constexpr ov::Property<uint32_t> min_response_len{"NPUW_LLM_MIN_RESPONSE_
*/
static constexpr ov::Property<std::string> generate_hint{"NPUW_LLM_GENERATE_HINT"};

/**
* @brief
* Type: std::map<std::string, ov::Any>.
* Tell NPUW the configuration for compilation of generate model.
* NOTE: !! Write-only !!
*/
static constexpr ov::Property<std::string> generate_config{"NPUW_LLM_GENERATE_CONFIG"};

/**
* @brief
* Type: int64_t.
* Pad token ID to fill input token ids in the conversation mode.
* Default: 0.
*/
static constexpr ov::Property<int64_t> pad_token_id{"NPUW_LLM_PAD_TOKEN_ID"};
} // namespace llm

} // namespace npuw
Expand Down
1 change: 1 addition & 0 deletions src/plugins/intel_npu/src/al/src/config/npuw.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -61,4 +61,5 @@ void intel_npu::registerNPUWLLMOptions(OptionsDesc& desc) {
desc.add<NPUW_LLM_MAX_PROMPT_LEN>();
desc.add<NPUW_LLM_MIN_RESPONSE_LEN>();
desc.add<NPUW_LLM_GENERATE_HINT>();
desc.add<NPUW_LLM_PAD_TOKEN_ID>();
}
50 changes: 29 additions & 21 deletions src/plugins/intel_npu/src/plugin/npuw/llm_compiled_model.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -134,10 +134,9 @@ std::optional<T> get_option(ov::AnyMap& config, const std::string& option_name)
}

template <typename T>
T pop_or_default(ov::AnyMap& config, const std::string& key, const T& default_value) {
auto anyopt = pop_option(config, key);
if (anyopt.has_value()) {
return anyopt.value().as<T>();
T opt_or_default(const std::optional<ov::Any>& opt, const T& default_value) {
if (opt.has_value()) {
return opt.value().as<T>();
}
return default_value;
}
Expand Down Expand Up @@ -206,12 +205,6 @@ void merge_config_with(ov::AnyMap& lhs, const ov::AnyMap& rhs) {
}
}

// Removes the "CACHE_DIR" entry from `config`, but only when `config`
// contains an "NPU_USE_NPUW" key. Only the presence of that key is checked,
// not its value.
void drop_cache_dir(ov::AnyMap& config) {
    const bool has_npuw_key = config.find("NPU_USE_NPUW") != config.end();
    if (!has_npuw_key) {
        return;
    }
    pop_option(config, "CACHE_DIR");
}

void split_llm_properties(const ov::AnyMap& properties, ov::AnyMap& llm_properties, ov::AnyMap& other_properties) {
for (auto it = properties.begin(); it != properties.end(); ++it) {
if (it->first.find("NPUW_LLM") != it->first.npos) {
Expand Down Expand Up @@ -245,6 +238,15 @@ ov::npuw::LLMCompiledModel::LLMCompiledModel(const std::shared_ptr<ov::Model>& m
std::map<std::string, ov::Any> npuw_llm_props;
std::map<std::string, ov::Any> other_props;
split_llm_properties(properties, npuw_llm_props, other_props);

auto npudesc = extract_npu_descriptor(plugin);

// Pop "NPUW_LLM_PREFILL_CONFIG" and "NPUW_LLM_GENERATE_CONFIG" out of the map
// so they are not passed into the ::intel_npu::Config object, as we don't
// need to preserve them anywhere (both options are write-only).
// NB: each option must be popped under its own key. Popping the generate key
// twice would leave the prefill config in npuw_llm_props, hand the generate
// config to the prefill model and leave generate_config_opt always empty.
auto prefill_config_opt = pop_option(npuw_llm_props, std::string("NPUW_LLM_PREFILL_CONFIG"));
auto generate_config_opt = pop_option(npuw_llm_props, std::string("NPUW_LLM_GENERATE_CONFIG"));

m_cfg.update(any_copy(npuw_llm_props));

LOG_DEBUG("1. Creating kvcache model as clone of passed one.");
Expand All @@ -258,7 +260,7 @@ ov::npuw::LLMCompiledModel::LLMCompiledModel(const std::shared_ptr<ov::Model>& m
LOG_DEBUG("4. Converting KV-cache in prefill model to FP16.");
prefill_model = cvt_kvcache_to_fp16(prefill_model);

LOG_DEBUG("5. Optimize kvcache kvcache model to output key/values for new token.");
LOG_DEBUG("5. Optimize kvcache model to output key/values for new token.");
kvcache_model = redirect_new_kv_to_output(kvcache_model);
LOG_DEBUG("6. Converting KV-cache in kvcache model to FP16.");
kvcache_model = cvt_kvcache_to_fp16(kvcache_model);
Expand All @@ -273,19 +275,20 @@ ov::npuw::LLMCompiledModel::LLMCompiledModel(const std::shared_ptr<ov::Model>& m
LOG_DEBUG("8. Make kvcache model with static shapes");
reshape_to_static(kvcache_model, 1u, m_kvcache_desc.total_size, axes);

auto npudesc = extract_npu_descriptor(plugin);

ov::AnyMap properties_copy = std::move(other_props);
auto prefill_config = get_default_prefill_config(model, npudesc);
// NB: GENERATE_HINT is only applicable for default generate config!

auto prefill_config = opt_or_default(prefill_config_opt, get_default_prefill_config(prefill_model, npudesc));

const ::intel_npu::npuw::llm::GenerateHint generate_hint = m_cfg.get<::intel_npu::NPUW_LLM_GENERATE_HINT>();
LOG_DEBUG("9. Passed GENERATE_HINT: " << std::string(::intel_npu::NPUW_LLM_GENERATE_HINT::toString(generate_hint)));
auto generate_config = get_default_generate_config(model, npudesc, generate_hint);
// NB: GENERATE_HINT is only applicable for default generate config!
if (generate_config_opt.has_value() && npuw_llm_props.count(ov::intel_npu::npuw::llm::generate_hint.name())) {
OPENVINO_THROW("GENERATE_HINT is only applicable for default generate config!");
}
auto generate_config = opt_or_default(generate_config_opt, get_default_generate_config(model, npudesc, generate_hint));

merge_config_with(prefill_config, properties_copy);
merge_config_with(generate_config, properties_copy);
// FIXME: Drop CACHE_DIR option if NPUW is enabled
drop_cache_dir(prefill_config);
drop_cache_dir(generate_config);

m_kvcache_compiled = std::make_shared<ov::npuw::CompiledModel>(kvcache_model, plugin, generate_config);
m_prefill_compiled = std::make_shared<ov::npuw::CompiledModel>(prefill_model, plugin, prefill_config);
Expand All @@ -308,6 +311,10 @@ void ov::npuw::LLMCompiledModel::set_property(const ov::AnyMap& properties) {

ov::Any ov::npuw::LLMCompiledModel::get_property(const std::string& name) const {
OPENVINO_SUPPRESS_DEPRECATED_START
if (name == ov::intel_npu::npuw::llm::prefill_config.name() || name == ov::intel_npu::npuw::llm::generate_config.name()) {
OPENVINO_THROW(name, " is write-only option!");
}

auto&& configIterator = m_prop_to_opt.find(name);
if (configIterator != m_prop_to_opt.cend()) {
return std::get<1>(configIterator->second)(m_cfg);
Expand All @@ -324,7 +331,7 @@ std::shared_ptr<ov::ISyncInferRequest> ov::npuw::LLMCompiledModel::create_sync_i

std::shared_ptr<ov::ISyncInferRequest> ov::npuw::LLMCompiledModel::create_llm_infer_request() {
auto this_sptr = std::static_pointer_cast<ov::npuw::LLMCompiledModel>(shared_from_this());
return std::make_shared<ov::npuw::LLMInferRequest>(this_sptr, m_kvcache_desc);
return std::make_shared<ov::npuw::LLMInferRequest>(this_sptr);
}

void ov::npuw::LLMCompiledModel::implement_properties() {
Expand All @@ -341,6 +348,7 @@ void ov::npuw::LLMCompiledModel::implement_properties() {
BIND(npuw::llm::model_desc, NPUW_LLM_MODEL_DESC, getString),
BIND(npuw::llm::max_prompt_len, NPUW_LLM_MAX_PROMPT_LEN, get),
BIND(npuw::llm::min_response_len, NPUW_LLM_MIN_RESPONSE_LEN, get),
BIND(npuw::llm::generate_hint, NPUW_LLM_GENERATE_HINT, getString)});
BIND(npuw::llm::generate_hint, NPUW_LLM_GENERATE_HINT, getString),
BIND(npuw::llm::pad_token_id, NPUW_LLM_PAD_TOKEN_ID, get)});
#undef BIND
}
42 changes: 20 additions & 22 deletions src/plugins/intel_npu/src/plugin/npuw/llm_infer_request.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -29,10 +29,8 @@ ov::SoPtr<ov::ITensor> make_tensor_slice(ov::SoPtr<ov::ITensor> tensor,
}
} // anonymous namespace

ov::npuw::LLMInferRequest::LLMInferRequest(const std::shared_ptr<ov::npuw::LLMCompiledModel>& compiled_model,
const ov::npuw::LLMCompiledModel::KVCacheDesc& kvcache_desc)
: ov::ISyncInferRequest(compiled_model),
m_kvcache_desc(kvcache_desc) {
ov::npuw::LLMInferRequest::LLMInferRequest(const std::shared_ptr<ov::npuw::LLMCompiledModel>& compiled_model)
: ov::ISyncInferRequest(compiled_model) {
m_kvcache_request = compiled_model->m_kvcache_compiled->create_infer_request();
m_prefill_request = compiled_model->m_prefill_compiled->create_infer_request();

Expand All @@ -52,13 +50,12 @@ ov::npuw::LLMInferRequest::LLMInferRequest(const std::shared_ptr<ov::npuw::LLMCo
}

void ov::npuw::LLMInferRequest::prepare_for_new_conversation() {
// FIXME: for input_ids it must be padding from tokenizer that not available from here
// Get it from NPUW options
fill_tensor<int64_t>(m_prefill_request->get_tensor(m_prefill_in_ports.at("input_ids")), 0u);
fill_tensor<int64_t>(m_prefill_request->get_tensor(m_prefill_in_ports.at("attention_mask")), 0u);
fill_tensor<int64_t>(m_prefill_request->get_tensor(m_prefill_in_ports.at("position_ids")), 0u);
fill_tensor<int64_t>(m_kvcache_request->get_tensor(m_kvcache_in_ports.at("attention_mask")), 0u);
m_kvcache_desc.num_stored_tokens = 0u;
fill_tensor<int64_t>(m_prefill_request->get_tensor(m_prefill_in_ports.at("input_ids")),
m_npuw_llm_compiled_model->m_cfg.get<::intel_npu::NPUW_LLM_PAD_TOKEN_ID>());
fill_tensor<int64_t>(m_prefill_request->get_tensor(m_prefill_in_ports.at("attention_mask")), 0);
fill_tensor<int64_t>(m_prefill_request->get_tensor(m_prefill_in_ports.at("position_ids")), 0);
fill_tensor<int64_t>(m_kvcache_request->get_tensor(m_kvcache_in_ports.at("attention_mask")), 0);
m_npuw_llm_compiled_model->m_kvcache_desc.num_stored_tokens = 0u;
}

void ov::npuw::LLMInferRequest::infer_prefill(ov::SoPtr<ov::ITensor> input_ids,
Expand All @@ -82,7 +79,7 @@ void ov::npuw::LLMInferRequest::infer_prefill(ov::SoPtr<ov::ITensor> input_ids,
std::copy_n(position_ids->data<int64_t>(), position_ids->get_size(), padded_position_ids->data<int64_t>() + offset);

m_prefill_request->infer();
m_kvcache_desc.num_stored_tokens += static_cast<uint32_t>(input_ids->get_size());
m_npuw_llm_compiled_model->m_kvcache_desc.num_stored_tokens += static_cast<uint32_t>(input_ids->get_size());
m_need_copy_kvcache = true;

m_logits = m_prefill_request->get_tensor(m_prefill_out_ports.at("logits"));
Expand All @@ -96,8 +93,9 @@ void ov::npuw::LLMInferRequest::infer_generate(ov::SoPtr<ov::ITensor> input_ids,
LOG_DEBUG("Calling inference for generate model...");
LOG_BLOCK();

auto& kvcache_desc = m_npuw_llm_compiled_model->m_kvcache_desc;
// NB: KV-cache is full, further generation is impossible
if (m_kvcache_desc.num_stored_tokens == m_kvcache_desc.total_size) {
if (kvcache_desc.num_stored_tokens == kvcache_desc.total_size) {
OPENVINO_THROW("KV-Cache is full.");
}

Expand All @@ -118,19 +116,19 @@ void ov::npuw::LLMInferRequest::infer_generate(ov::SoPtr<ov::ITensor> input_ids,

auto prefill_out_slice =
make_tensor_slice(prefill_out_tensor,
m_kvcache_desc.dim,
m_kvcache_desc.max_prompt_size - m_kvcache_desc.num_stored_tokens,
m_kvcache_desc.max_prompt_size);
kvcache_desc.dim,
kvcache_desc.max_prompt_size - kvcache_desc.num_stored_tokens,
kvcache_desc.max_prompt_size);

auto kvcache_in_slice =
make_tensor_slice(kvcache_in_tensor, m_kvcache_desc.dim, 0u, m_kvcache_desc.num_stored_tokens);
make_tensor_slice(kvcache_in_tensor, kvcache_desc.dim, 0u, kvcache_desc.num_stored_tokens);

prefill_out_slice->copy_to(kvcache_in_slice._ptr);
}
LOG_DEBUG("Prepare attention mask pattern.");
auto* attention_mask_data =
m_kvcache_request->get_tensor(m_kvcache_in_ports.at("attention_mask"))->data<int64_t>();
attention_mask_data[m_kvcache_desc.total_size - 1] = 1;
attention_mask_data[kvcache_desc.total_size - 1] = 1;

m_need_copy_kvcache = false;
}
Expand All @@ -147,7 +145,7 @@ void ov::npuw::LLMInferRequest::infer_generate(ov::SoPtr<ov::ITensor> input_ids,

m_kvcache_request->infer();
m_logits = m_kvcache_request->get_tensor(m_kvcache_out_ports.at("logits"));
m_kvcache_desc.num_stored_tokens += 1;
kvcache_desc.num_stored_tokens += 1;

LOG_DEBUG("Write KV-cache for the new token to the correct input position for next iteration.");
const std::size_t kStartOutputKVCacheLayers = 1u;
Expand All @@ -157,9 +155,9 @@ void ov::npuw::LLMInferRequest::infer_generate(ov::SoPtr<ov::ITensor> input_ids,
const auto& input_name = std::regex_replace(output_name, std::regex("present"), "past_key_values");
auto kvcache_in_tensor = m_kvcache_request->get_tensor(m_kvcache_in_ports.at(input_name));
auto kvcache_in_slice = make_tensor_slice(kvcache_in_tensor,
m_kvcache_desc.dim,
m_kvcache_desc.num_stored_tokens - 1,
m_kvcache_desc.num_stored_tokens);
kvcache_desc.dim,
kvcache_desc.num_stored_tokens - 1,
kvcache_desc.num_stored_tokens);
auto kvcache_out_tensor = m_kvcache_request->get_tensor(m_kvcache_out_ports.at(output_name));
kvcache_out_tensor->copy_to(kvcache_in_slice._ptr);
}
Expand Down
5 changes: 2 additions & 3 deletions src/plugins/intel_npu/src/plugin/npuw/llm_infer_request.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -15,8 +15,7 @@ namespace npuw {

class LLMInferRequest final : public ov::ISyncInferRequest {
public:
explicit LLMInferRequest(const std::shared_ptr<ov::npuw::LLMCompiledModel>& compiled_model,
const ov::npuw::LLMCompiledModel::KVCacheDesc& kvcache_desc);
explicit LLMInferRequest(const std::shared_ptr<ov::npuw::LLMCompiledModel>& compiled_model);

void infer() override;

Expand Down Expand Up @@ -44,7 +43,7 @@ class LLMInferRequest final : public ov::ISyncInferRequest {

std::shared_ptr<ov::IAsyncInferRequest> m_kvcache_request;
std::shared_ptr<ov::IAsyncInferRequest> m_prefill_request;
LLMCompiledModel::KVCacheDesc m_kvcache_desc;
std::shared_ptr<LLMCompiledModel> m_npuw_llm_compiled_model;
ov::SoPtr<ov::ITensor> m_logits;
bool m_need_copy_kvcache = false;

Expand Down

0 comments on commit e38b474

Please sign in to comment.