diff --git a/src/plugins/intel_npu/src/al/include/intel_npu/config/config.hpp b/src/plugins/intel_npu/src/al/include/intel_npu/config/config.hpp
index a274c8d1c1cae6..371091b6bd1f3d 100644
--- a/src/plugins/intel_npu/src/al/include/intel_npu/config/config.hpp
+++ b/src/plugins/intel_npu/src/al/include/intel_npu/config/config.hpp
@@ -423,6 +423,8 @@ class Config final {
 
     std::string toString() const;
 
+    void fromString(const std::string& str);
+
 private:
     std::shared_ptr<const OptionsDesc> _desc;
     ImplMap _impl;
diff --git a/src/plugins/intel_npu/src/al/src/config/config.cpp b/src/plugins/intel_npu/src/al/src/config/config.cpp
index a4e2b515b8e3f6..c9c26451d6f7d5 100644
--- a/src/plugins/intel_npu/src/al/src/config/config.cpp
+++ b/src/plugins/intel_npu/src/al/src/config/config.cpp
@@ -244,6 +244,31 @@ std::string Config::toString() const {
     return resultStream.str();
 }
 
+void Config::fromString(const std::string& str) {
+    std::map<std::string, std::string> config;
+    std::string str_cfg(str);
+
+    auto parse_token = [&](const std::string& token) {
+        auto pos_eq = token.find('=');
+        auto key = token.substr(0, pos_eq);
+        auto value = token.substr(pos_eq + 2, token.size() - pos_eq - 3);
+        config[key] = value;
+    };
+
+    size_t pos = 0;
+    std::string token, key, value;
+    while ((pos = str_cfg.find(' ')) != std::string::npos) {
+        token = str_cfg.substr(0, pos);
+        parse_token(token);
+        str_cfg.erase(0, pos + 1);
+    }
+
+    // Process tail
+    parse_token(str_cfg);
+
+    update(config);
+}
+
 //
 // envVarStrToBool
 //
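A minimal usage sketch of the new round trip (illustration only, not part of the patch; it assumes the NPUW options are registered on the OptionsDesc and that values contain no spaces, since fromString() splits tokens on ' ' and strips the surrounding quotes via pos_eq + 2 / size - pos_eq - 3):

    // Sketch: Config is serialized as space-separated KEY="VALUE" pairs and parsed back.
    auto desc = std::make_shared<intel_npu::OptionsDesc>();
    intel_npu::registerNPUWOptions(*desc);              // assumed: NPUW options registered as elsewhere in the plugin
    intel_npu::Config cfg(desc);
    cfg.update({{"NPUW_DEVICES", "NPU,CPU"}});
    const std::string text = cfg.toString();            // e.g. NPUW_DEVICES="NPU,CPU"
    intel_npu::Config restored(desc);
    restored.fromString(text);                          // rebuilds the map and calls update()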
diff --git a/src/plugins/intel_npu/src/plugin/npuw/compiled_model.cpp b/src/plugins/intel_npu/src/plugin/npuw/compiled_model.cpp
index a85277b636b2e6..66e1e8e55fde2a 100644
--- a/src/plugins/intel_npu/src/plugin/npuw/compiled_model.cpp
+++ b/src/plugins/intel_npu/src/plugin/npuw/compiled_model.cpp
@@ -1,4 +1,4 @@
-// Copyright (C) 2023-2024 Intel Corporation
+// Copyright (C) 2023-2025 Intel Corporation
 // SPDX-License-Identifier: Apache-2.0
 //
 #include "compiled_model.hpp"
@@ -21,6 +21,7 @@
 #include "openvino/util/common_util.hpp"
 #include "partitioning/patterns/opt.hpp"
 #include "plugin.hpp"
+#include "serialization.hpp"
 #include "unfold_sync_infer_request.hpp"
 #include "util.hpp"
 
@@ -486,6 +487,222 @@ ov::npuw::CompiledModel::CompiledModel(const std::shared_ptr<ov::Model>& model,
     report_io();
 }
 
+ov::npuw::CompiledModel::CompiledModel(const std::shared_ptr<ov::Model>& model,
+                                       const std::shared_ptr<const ov::IPlugin>& plugin,
+                                       const bool serialized)
+    : ov::npuw::ICompiledModel(model, plugin),
+      m_options_desc(std::make_shared<::intel_npu::OptionsDesc>()),
+      m_cfg(m_options_desc),
+      m_name(model->get_friendly_name()),
+      m_loaded_from_cache(serialized) {
+    ::intel_npu::registerNPUWOptions(*m_options_desc);
+    NPUW_ASSERT(serialized && "This constructor should only be utilized during deserialization!");
+    LOG_DEBUG("CompiledModel is being deserialized, skipping the full constructor flow...");
+}
+
+void ov::npuw::CompiledModel::CompiledModelDesc::serialize(std::ostream& stream) const {
+    using namespace ov::npuw::s11n;
+
+    LOG_DEBUG("Serializing CompiledModelDesc...");
+    LOG_BLOCK();
+
+    write(stream, replaced_by);
+
+    write(stream, param_base);
+    write(stream, forced_to_fcall);
+
+    write(stream, host_gather.dst_idx);
+    write(stream, host_gather.src_idx);
+    write(stream, host_gather.idx_idx);
+
+    write(stream, spatial);
+
+    write(stream, scales);
+    write(stream, zerops);
+    write(stream, is_remote);
+
+    // NOTE: for closure only serialize uids - full flow
+    write(stream, closure_uid);
+
+    // Some tensors might be present in CPU closure already - need to serialize as is
+    // FIXME: When weightless serialization is introduced, this should be handled differently
+    write(stream, closure.size());
+    std::vector<ov::Tensor> cpu_closures;
+    std::vector<std::size_t> cpu_closure_ids;
+    for (std::size_t cidx = 0; cidx < closure.size(); ++cidx) {
+        if (closure_uid[cidx] == -1) {  // CPU closure, not in the bank
+            cpu_closure_ids.push_back(cidx);
+            cpu_closures.push_back(closure[cidx]);
+        }
+    }
+
+    write(stream, cpu_closure_ids);
+
+    for (const auto& tensor : cpu_closures) {
+        write(stream, tensor);
+    }
+
+    // FIXME: support weightless flow!
+
+    LOG_DEBUG("DONE.");
+}
+
+void ov::npuw::CompiledModel::CompiledModelDesc::deserialize(std::istream& stream) {
+    using namespace ov::npuw::s11n;
+
+    LOG_DEBUG("Deserializing CompiledModelDesc...");
+    LOG_BLOCK();
+
+    read(stream, replaced_by);
+
+    read(stream, param_base);
+    read(stream, forced_to_fcall);
+
+    read(stream, host_gather.dst_idx);
+    read(stream, host_gather.src_idx);
+    read(stream, host_gather.idx_idx);
+
+    read(stream, spatial);
+
+    read(stream, scales);
+    read(stream, zerops);
+    read(stream, is_remote);
+
+    // NOTE: for closure only deserialize uids - full flow
+    read(stream, closure_uid);
+
+    // Some tensors might be present in CPU closure already - need to deserialize as is
+    // FIXME: When weightless serialization is introduced, this should be handled differently
+    std::size_t closure_size = 0;
+    read(stream, closure_size);
+    std::vector<std::size_t> cpu_closure_ids;
+    read(stream, cpu_closure_ids);
+    closure.resize(closure_size);
+    for (const auto& cidx : cpu_closure_ids) {
+        read(stream, closure[cidx]);
+    }
+
+    // FIXME: support weightless flow!
+
+    LOG_DEBUG("DONE.");
+}
+
+void ov::npuw::CompiledModel::serialize(std::ostream& stream) const {
+    LOG_INFO("Serializing CompiledModel...");
+    LOG_BLOCK();
+
+    using namespace ov::npuw::s11n;
+
+    // Serialize name
+    write(stream, m_name);
+
+    // Serialize inputs and outputs
+    write(stream, inputs());
+    write(stream, outputs());
+
+    // Serialize meta
+    write(stream, m_inputs_to_submodels_inputs);
+    write(stream, m_outputs_to_submodels_outputs);
+    write(stream, m_param_subscribers);
+    write(stream, m_submodels_input_to_prev_output);
+
+    // Write device list
+    write(stream, m_dev_list);
+
+    // Write config
+    write(stream, m_cfg);
+
+    // Serialize compiled submodels
+    write(stream, m_compiled_submodels.size());
+    for (const auto& subm : m_compiled_submodels) {
+        // Write device idx
+        std::size_t device_idx = subm.device_it - m_dev_list.begin();
+        write(stream, device_idx);
+        // Write ICompiledModel if it's there
+        if (subm.compiled_model) {
+            write(stream, true);
+            // FIXME: workaround for import/export model since import model seems to reset the file pointer
+            std::stringstream ss;
+            subm.compiled_model->export_model(ss);
+            write(stream, ss.str());
+        } else {
+            write(stream, false);
+        }
+        // Write the rest of the submodel desc
+        subm.serialize(stream);
+    }
+
+    LOG_INFO("Done.");
+}
+
+std::shared_ptr<ov::npuw::CompiledModel> ov::npuw::CompiledModel::deserialize(
+    std::istream& stream,
+    const std::shared_ptr<const ov::IPlugin>& plugin) {
+    LOG_INFO("Deserializing CompiledModel...");
+    LOG_BLOCK();
+
+    using namespace ov::npuw::s11n;
+
+    // Deserialize model name first
+    std::string model_name;
+    read(stream, model_name);
+
+    // Create a dummy CompiledModel with an empty ov::Model - this will skip the constructor flow
+    // to continue deserialization
+    ov::ParameterVector parameters;
+    ov::NodeVector results;
+
+    read(stream, parameters);
+    read(stream, results);
+
+    auto ov_model = std::make_shared<ov::Model>(results, parameters, model_name);
+
+    auto compiled = std::make_shared<ov::npuw::CompiledModel>(ov_model, plugin, true);
+
+    // Deserialize meta
+    compiled->m_name = model_name;
+    read(stream, compiled->m_inputs_to_submodels_inputs);
+    read(stream, compiled->m_outputs_to_submodels_outputs);
+    read(stream, compiled->m_param_subscribers);
+    read(stream, compiled->m_submodels_input_to_prev_output);
+
+    // Deserialize device list
+    read(stream, compiled->m_dev_list);
+
+    // Deserialize config
+    read(stream, compiled->m_cfg);
+
+    // Deserialize compiled submodels
+    std::size_t subm_size = 0;
+    read(stream, subm_size);
+    compiled->m_compiled_submodels.resize(subm_size);
+    for (std::size_t i = 0; i < subm_size; ++i) {
+        std::size_t device_idx = 0;
+        read(stream, device_idx);
+
+        bool has_compiled_model = false;
+        read(stream, has_compiled_model);
+        if (has_compiled_model) {
+            // Import model from the plugin
+            // FIXME: workaround for import/export model since import model seems to reset the file pointer
+            std::string buf;
+            read(stream, buf);
+            std::stringstream buffer(buf);
+            compiled->m_compiled_submodels[i].compiled_model =
+                plugin->get_core()->import_model(buffer, compiled->m_dev_list[device_idx]);
+        }
+        compiled->m_compiled_submodels[i].device_it = compiled->m_dev_list.begin() + device_idx;
+        compiled->m_compiled_submodels[i].deserialize(stream);
+    }
+
+    compiled->implement_properties();
+    compiled->report_io();
+
+    LOG_INFO("Done.");
+
+    return compiled;
+}
+
 void ov::npuw::CompiledModel::finalize_weights_bank() {
     LOG_INFO("Finalizing weights bank...");
     // Register lazy tensors
@@ -541,6 +758,33 @@ void ov::npuw::CompiledModel::finalize_weights_bank() {
     LOG_INFO("Done.");
 }
 
+void ov::npuw::CompiledModel::reconstruct_closure() {
+    for (size_t idx = 0; idx < m_compiled_submodels.size(); ++idx) {
+        auto& comp_model_desc = m_compiled_submodels[idx];
+
+        // Skip optimized out and non-functions
+        if (!comp_model_desc.compiled_model && !comp_model_desc.replaced_by) {
+            continue;
+        }
+
+        const auto real_idx = comp_model_desc.replaced_by.value_or(idx);
+        auto& func_desc = m_compiled_submodels[real_idx];
+
+        // At this point closure size should have already been deserialized
+        NPUW_ASSERT(!comp_model_desc.closure.empty() && "Closure shouldn't be empty at this point!");
+        for (std::size_t cidx = 0; cidx < comp_model_desc.closure.size(); ++cidx) {
+            if (comp_model_desc.closure[cidx]) {
+                // host-side closure - already set, do nothing
+                NPUW_ASSERT(!comp_model_desc.is_remote[cidx]);
+                continue;
+            }
+            NPUW_ASSERT(comp_model_desc.closure_uid[cidx] != -1);
+            comp_model_desc.closure[cidx] =
+                m_weights_bank->get(comp_model_desc.closure_uid[cidx], *func_desc.device_it);
+        }
+    }
+}
+
 void ov::npuw::CompiledModel::detach_memory() {
     LOG_INFO("Detaching model & weight memory...");
     LOG_BLOCK();
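For reference, a short editorial sketch (not from the patch) of how the two closure paths above fit together:

    // Per closure slot, CompiledModelDesc::serialize() stores either the data or a handle:
    //   closure_uid[i] == -1  -> the ov::Tensor itself is written into the blob (host/CPU closure)
    //   closure_uid[i] != -1  -> only the uid is written; reconstruct_closure() later does
    //                            closure[i] = m_weights_bank->get(closure_uid[i], *device_it);
    // so a deserialized CompiledModel is only usable once the weights bank has been restored too.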
diff --git a/src/plugins/intel_npu/src/plugin/npuw/compiled_model.hpp b/src/plugins/intel_npu/src/plugin/npuw/compiled_model.hpp
index f1de81f51d8c6a..b4faf9d417b003 100644
--- a/src/plugins/intel_npu/src/plugin/npuw/compiled_model.hpp
+++ b/src/plugins/intel_npu/src/plugin/npuw/compiled_model.hpp
@@ -1,4 +1,4 @@
-// Copyright (C) 2023-2024 Intel Corporation
+// Copyright (C) 2023-2025 Intel Corporation
 // SPDX-License-Identifier: Apache-2.0
 //
 
@@ -40,6 +40,9 @@ class CompiledModel : public ov::npuw::ICompiledModel {
     CompiledModel(const std::shared_ptr<ov::Model>& model,
                   const std::shared_ptr<const ov::IPlugin>& plugin,
                   const ov::AnyMap& properties);
+    CompiledModel(const std::shared_ptr<ov::Model>& model,
+                  const std::shared_ptr<const ov::IPlugin>& plugin,
+                  const bool serialized);
 
     void export_model(std::ostream& model) const override;
     std::shared_ptr<const ov::Model> get_runtime_model() const override;
@@ -56,6 +59,7 @@ class CompiledModel : public ov::npuw::ICompiledModel {
     friend class UnfoldInferRequest;
     friend class MemAccessSim;
    friend class FuncMemMgr;
+    friend class LLMCompiledModel;
 
     bool compile_for_success(std::size_t id);
     bool compile_for_device(std::size_t id, const std::string& device_to_try);
@@ -66,6 +70,10 @@ class CompiledModel : public ov::npuw::ICompiledModel {
 
     void report_io() const;
 
+    void serialize(std::ostream& stream) const;
+    static std::shared_ptr<CompiledModel> deserialize(std::istream& stream,
+                                                      const std::shared_ptr<const ov::IPlugin>& plugin);
+
     // This is used for removing too long output tensor names to fix some compilation issues
     // NB: These two methods have nothing to do with this particular class and should be
     // moved elsewhere
@@ -83,6 +91,9 @@ class CompiledModel : public ov::npuw::ICompiledModel {
     void log_device_dist() const;
     void implement_properties();
 
+    // For full deserialization flow with weights
+    void reconstruct_closure();
+
     void finalize_weights_bank();
     void detach_memory();
     std::string global_mem_device() const;
@@ -141,7 +152,7 @@ class CompiledModel : public ov::npuw::ICompiledModel {
         // lazy_closure is used for weights sharing and allocating device memory.
         std::vector<ov::Tensor> closure;
         std::vector<weights::LazyTensor> lazy_closure;
-        std::vector<int64_t> closure_uid;
+        std::vector<int64_t> closure_uid;  // Note: value -1 is considered uninitialized
         std::vector<ov::Tensor> scales;
         std::vector<ov::Tensor> zerops;
         std::vector<bool> is_remote;
@@ -154,6 +165,9 @@ class CompiledModel : public ov::npuw::ICompiledModel {
 
         // Metrics
         execution_stats stat;
+
+        void serialize(std::ostream& stream) const;
+        void deserialize(std::istream& stream);
     };
     std::vector<CompiledModelDesc> m_compiled_submodels;
diff --git a/src/plugins/intel_npu/src/plugin/npuw/llm_compiled_model.cpp b/src/plugins/intel_npu/src/plugin/npuw/llm_compiled_model.cpp
index 07a4c564f55b4c..f05cf0509e7531 100644
--- a/src/plugins/intel_npu/src/plugin/npuw/llm_compiled_model.cpp
+++ b/src/plugins/intel_npu/src/plugin/npuw/llm_compiled_model.cpp
@@ -1,4 +1,4 @@
-// Copyright (C) 2023-2024 Intel Corporation
+// Copyright (C) 2023-2025 Intel Corporation
 // SPDX-License-Identifier: Apache-2.0
 //
 #include "llm_compiled_model.hpp"
@@ -14,6 +14,7 @@
 #include "openvino/pass/stateful_to_stateless.hpp"
 #include "openvino/pass/validate.hpp"
 #include "openvino/runtime/iasync_infer_request.hpp"
+#include "serialization.hpp"
 
 namespace opp = ov::pass::pattern;
 class TransposeValueTensors : public ov::pass::MatcherPass {
@@ -423,6 +424,7 @@ ov::npuw::LLMCompiledModel::LLMCompiledModel(const std::shared_ptr<ov::Model>& m
                                              const std::shared_ptr<const ov::IPlugin>& plugin,
                                              const ov::AnyMap& properties)
     : ov::npuw::ICompiledModel(model, plugin),
+      m_name(model->get_friendly_name()),
       m_options_desc(std::make_shared<::intel_npu::OptionsDesc>()),
       m_cfg(m_options_desc) {
     LOG_DEBUG("Creating LLMCompiledModel");
@@ -507,11 +509,147 @@ ov::npuw::LLMCompiledModel::LLMCompiledModel(const std::shared_ptr<ov::Model>& m
                     "model and its config, please check passed config.");
     implement_properties();
+    LOG_DEBUG("Done");
 }
 
-void ov::npuw::LLMCompiledModel::export_model(std::ostream& model) const {
-    OPENVINO_NOT_IMPLEMENTED;
+ov::npuw::LLMCompiledModel::LLMCompiledModel(const std::shared_ptr<ov::Model>& model,
+                                             const std::shared_ptr<const ov::IPlugin>& plugin,
+                                             const bool serialized)
+    : ov::npuw::ICompiledModel(model, plugin),
+      m_name(model->get_friendly_name()),
+      m_options_desc(std::make_shared<::intel_npu::OptionsDesc>()),
+      m_cfg(m_options_desc) {
+    NPUW_ASSERT(serialized && "This constructor should only be utilized during deserialization!");
+    LOG_DEBUG("LLMCompiledModel is being deserialized, skipping the full constructor flow...");
+}
+
+void ov::npuw::LLMCompiledModel::export_model(std::ostream& stream) const {
+    LOG_INFO("Serializing LLMCompiledModel...");
+    LOG_BLOCK();
+
+    using namespace ov::npuw::s11n;
+
+    // Serialize magic number first
+    write(stream, NPUW_SERIALIZATION_INDICATOR);
+
+    // Serialize general meta info
+    write(stream, OPENVINO_VERSION_MAJOR);
+    write(stream, OPENVINO_VERSION_MINOR);
+    write(stream, OPENVINO_VERSION_PATCH);
+    write(stream, std::string(NPUW_SERIALIZATION_VERSION));
+
+    // Serialize name
+    write(stream, m_name);
+
+    // Serialize inputs and outputs
+    write(stream, inputs());
+    write(stream, outputs());
+
+    // Serialize LLMCompiledModel-specific data
+    write(stream, m_kvcache_desc.max_prompt_size);
+    write(stream, m_kvcache_desc.total_size);
+    write(stream, m_kvcache_desc.num_stored_tokens);
+    write(stream, m_kvcache_desc.dim);
+
+    // Serialize CompiledModels
+    m_kvcache_compiled->serialize(stream);
+    m_prefill_compiled->serialize(stream);
+
+    // Serialize weights bank (if required)
+    const auto& kv_bank = m_kvcache_compiled->m_weights_bank;
+    const auto& p_bank = m_prefill_compiled->m_weights_bank;
+    NPUW_ASSERT(kv_bank && p_bank && kv_bank == p_bank && "Prefill and KVCache models' weight bank should be shared!");
+    // FIXME: support weightless flow
+    write(stream, kv_bank->get_name());
+    kv_bank->serialize(stream);
+
+    LOG_INFO("Done.");
+}
+
+std::shared_ptr<ov::npuw::LLMCompiledModel> ov::npuw::LLMCompiledModel::deserialize(
+    std::istream& stream,
+    const std::shared_ptr<const ov::IPlugin>& plugin) {
+    LOG_INFO("Deserializing LLMCompiledModel...");
+    LOG_BLOCK();
+
+    using namespace ov::npuw::s11n;
+
+    // Sanity check magic number
+    std::array<char, 6> serialization_indicator;
+    read(stream, serialization_indicator);
+    NPUW_ASSERT(serialization_indicator == NPUW_SERIALIZATION_INDICATOR && "This blob wasn't serialized via NPUW!");
+
+    // Deserialize general meta info
+    int vmajor, vminor, vpatch;
+    std::string s11n_version;
+    read(stream, vmajor);
+    read(stream, vminor);
+    read(stream, vpatch);
+    read(stream, s11n_version);
+
+    if (vmajor != OPENVINO_VERSION_MAJOR || vminor != OPENVINO_VERSION_MINOR || vpatch != OPENVINO_VERSION_PATCH ||
+        s11n_version != std::string(NPUW_SERIALIZATION_VERSION)) {
+        OPENVINO_THROW("This blob was serialized with a different OV version!",
+                       " Serialized by OV ",
+                       vmajor,
+                       '.',
+                       vminor,
+                       '.',
+                       vpatch,
+                       " Current OV version ",
+                       OPENVINO_VERSION_MAJOR,
+                       '.',
+                       OPENVINO_VERSION_MINOR,
+                       '.',
+                       OPENVINO_VERSION_PATCH,
+                       " NPUW serialized by version ",
+                       s11n_version,
+                       " NPUW current serialization version ",
+                       NPUW_SERIALIZATION_VERSION);
+    }
+
+    // Deserialize model name first
+    std::string model_name;
+    read(stream, model_name);
+
+    // Create a dummy CompiledModel with an empty ov::Model - this will skip the constructor flow
+    // to continue deserialization
+    ov::ParameterVector parameters;
+    ov::NodeVector results;
+
+    read(stream, parameters);
+    read(stream, results);
+
+    auto ov_model = std::make_shared<ov::Model>(results, parameters, model_name);
+
+    auto compiled = std::make_shared<ov::npuw::LLMCompiledModel>(ov_model, plugin, true);
+
+    // Deserialize LLMCompiledModel-specific data
+    read(stream, compiled->m_kvcache_desc.max_prompt_size);
+    read(stream, compiled->m_kvcache_desc.total_size);
+    read(stream, compiled->m_kvcache_desc.num_stored_tokens);
+    read(stream, compiled->m_kvcache_desc.dim);
+
+    // Deserialize CompiledModels
+    compiled->m_kvcache_compiled = ov::npuw::CompiledModel::deserialize(stream, plugin);
+    compiled->m_prefill_compiled = ov::npuw::CompiledModel::deserialize(stream, plugin);
+
+    // Deserialize weights bank (if required)
+    std::string bank_name;
+    read(stream, bank_name);
+    auto bank = ov::npuw::weights::Bank::deserialize(stream, compiled->get_plugin()->get_core(), bank_name);
+
+    // FIXME: support weightless option
+    compiled->m_kvcache_compiled->m_weights_bank = bank;
+    compiled->m_prefill_compiled->m_weights_bank = bank;
+
+    // After bank deserialization - reconstruct NPU closures from the bank
+    compiled->m_kvcache_compiled->reconstruct_closure();
+    compiled->m_prefill_compiled->reconstruct_closure();
+
+    LOG_INFO("Done.");
+    return compiled;
 }
 
 std::shared_ptr<const ov::Model> ov::npuw::LLMCompiledModel::get_runtime_model() const {
diff --git a/src/plugins/intel_npu/src/plugin/npuw/llm_compiled_model.hpp b/src/plugins/intel_npu/src/plugin/npuw/llm_compiled_model.hpp
index e37a47b2c77948..5003ccce40bb9d 100644
--- a/src/plugins/intel_npu/src/plugin/npuw/llm_compiled_model.hpp
+++ b/src/plugins/intel_npu/src/plugin/npuw/llm_compiled_model.hpp
@@ -1,4 +1,4 @@
-// Copyright (C) 2024 Intel Corporation
+// Copyright (C) 2024-2025 Intel Corporation
 // SPDX-License-Identifier: Apache-2.0
 //
 
@@ -28,8 +28,15 @@ class LLMCompiledModel : public ov::npuw::ICompiledModel {
     LLMCompiledModel(const std::shared_ptr<ov::Model>& model,
                      const std::shared_ptr<const ov::IPlugin>& plugin,
                      const ov::AnyMap& properties);
+    LLMCompiledModel(const std::shared_ptr<ov::Model>& model,
+                     const std::shared_ptr<const ov::IPlugin>& plugin,
+                     const bool serialized);
     LLMCompiledModel() = delete;
 
+    void export_model(std::ostream& model) const override;
+    static std::shared_ptr<LLMCompiledModel> deserialize(std::istream& stream,
+                                                         const std::shared_ptr<const ov::IPlugin>& plugin);
+
     std::shared_ptr<const ov::Model> get_runtime_model() const override;
 
     void set_property(const ov::AnyMap& properties) override;
@@ -42,6 +49,7 @@ class LLMCompiledModel : public ov::npuw::ICompiledModel {
     std::shared_ptr<ov::ISyncInferRequest> create_sync_infer_request() const override;
     void implement_properties();
 
+    std::string m_name;
     std::shared_ptr<::intel_npu::OptionsDesc> m_options_desc;
     ::intel_npu::Config m_cfg;
     GetPropertiesMap m_prop_to_opt;
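For orientation, an editorial sketch (not part of the patch) of the blob layout produced by LLMCompiledModel::export_model(), in stream order as written by the code above:

    // NPUW_SERIALIZATION_INDICATOR                       6-byte magic, checked again in Plugin::import_model()
    // OV major / minor / patch, NPUW s11n version        mismatches are rejected at import time
    // model name, inputs, outputs                         used to rebuild a dummy ov::Model
    // KVCacheDesc: max_prompt_size, total_size, num_stored_tokens, dim
    // kvcache CompiledModel, prefill CompiledModel        each embeds its submodels' exported blobs
    // weights bank name + bank contents                   consumed by reconstruct_closure() on import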
diff --git a/src/plugins/intel_npu/src/plugin/npuw/serialization.cpp b/src/plugins/intel_npu/src/plugin/npuw/serialization.cpp
new file mode 100644
index 00000000000000..5ff28204b4b6ca
--- /dev/null
+++ b/src/plugins/intel_npu/src/plugin/npuw/serialization.cpp
@@ -0,0 +1,158 @@
+// Copyright (C) 2024-2025 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include "serialization.hpp"
+
+#include "intel_npu/config/config.hpp"
+#include "logging.hpp"
+#include "openvino/op/constant.hpp"
+#include "spatial.hpp"
+
+void ov::npuw::s11n::write(std::ostream& stream, const std::streampos& var) {
+    stream.write(reinterpret_cast<const char*>(&var), sizeof var);
+}
+
+void ov::npuw::s11n::write(std::ostream& stream, const std::string& var) {
+    auto var_size = var.size();
+    stream.write(reinterpret_cast<const char*>(&var_size), sizeof var_size);
+    stream.write(&var[0], var.size());
+}
+
+void ov::npuw::s11n::write(std::ostream& stream, const bool& var) {
+    stream.write(reinterpret_cast<const char*>(&var), sizeof var);
+}
+
+void ov::npuw::s11n::write(std::ostream& stream, const ov::npuw::compiled::Spatial& var) {
+    using ov::npuw::s11n::write;
+
+    write(stream, var.params.size());
+    for (const auto& p : var.params) {
+        write(stream, p.idx);
+        write(stream, p.dim);
+    }
+    write(stream, var.range);
+    write(stream, var.nway);
+    write(stream, var.out_dim);
+    write(stream, var.nway_iters);
+    write(stream, var.tail_size);
+}
+
+void ov::npuw::s11n::write(std::ostream& stream, const ov::Tensor& var) {
+    using ov::npuw::s11n::write;
+
+    auto type_str = var.get_element_type().to_string();
+    write(stream, type_str);
+    write(stream, var.get_shape());
+    write(stream, var.get_byte_size());
+
+    ov::Tensor tensor;
+    if (var.is_continuous()) {
+        tensor = var;
+    } else {
+        // Just copy strided tensor to a non-strided one
+        tensor = ov::Tensor(var.get_element_type(), var.get_shape());
+        var.copy_to(tensor);
+    }
+    NPUW_ASSERT(tensor);
+    stream.write(reinterpret_cast<const char*>(tensor.data()), tensor.get_byte_size());
+}
+
+void ov::npuw::s11n::write(std::ostream& stream, const ::intel_npu::Config& var) {
+    write(stream, var.toString());
+}
+
+void ov::npuw::s11n::write(std::ostream& stream, const ov::Output<const ov::Node>& var) {
+    write(stream, var.get_element_type().to_string());
+    write(stream, var.get_partial_shape().to_string());
+    write(stream, var.get_names());
+}
+
+void ov::npuw::s11n::read(std::istream& stream, std::streampos& var) {
+    stream.read(reinterpret_cast<char*>(&var), sizeof var);
+}
+
+void ov::npuw::s11n::read(std::istream& stream, std::string& var) {
+    std::size_t var_size = 0;
+    stream.read(reinterpret_cast<char*>(&var_size), sizeof var_size);
+    var.resize(var_size);
+    stream.read(&var[0], var_size);
+}
+
+void ov::npuw::s11n::read(std::istream& stream, bool& var) {
+    stream.read(reinterpret_cast<char*>(&var), sizeof var);
+}
+
+void ov::npuw::s11n::read(std::istream& stream, ov::npuw::compiled::Spatial& var) {
+    using ov::npuw::s11n::read;
+
+    ov::npuw::compiled::Spatial spat;
+    std::size_t params_size = 0;
+    read(stream, params_size);
+    for (std::size_t i = 0; i < params_size; ++i) {
+        ov::npuw::compiled::Spatial::Param p;
+        read(stream, p.idx);
+        read(stream, p.dim);
+        spat.params.push_back(p);
+    }
+    read(stream, spat.range);
+    read(stream, spat.nway);
+    read(stream, spat.out_dim);
+    read(stream, spat.nway_iters);
+    read(stream, spat.tail_size);
+    var = spat;
+}
+
+void ov::npuw::s11n::read(std::istream& stream, ov::Tensor& var) {
+    std::string type_str;
+    read(stream, type_str);
+    ov::element::Type type(type_str);
+
+    ov::Shape shape;
+    read(stream, shape);
+
+    std::size_t byte_size = 0;
+    read(stream, byte_size);
+
+    var = ov::Tensor(type, shape);
+
+    stream.read(reinterpret_cast<char*>(var.data()), byte_size);
+}
+
+void ov::npuw::s11n::read(std::istream& stream, ::intel_npu::Config& var) {
+    std::string str;
+    read(stream, str);
+    var.fromString(str);
+}
+
+void ov::npuw::s11n::read(std::istream& stream, std::shared_ptr<ov::op::v0::Parameter>& var) {
+    std::string elem_type_str;
+    std::string part_shape_str;
+    std::unordered_set<std::string> names;
+    read(stream, elem_type_str);
+    read(stream, part_shape_str);
+    read(stream, names);
+    // NOTE: the code below is taken from NPU plugin's create_dummy_model()
+    var = std::make_shared<ov::op::v0::Parameter>(ov::element::Type(elem_type_str),
+                                                  ov::PartialShape(part_shape_str));
+    var->set_friendly_name(*names.begin());  // FIXME: any_name ?
+    var->output(0).get_tensor().set_names(names);
+}
+
+void ov::npuw::s11n::read(std::istream& stream, std::shared_ptr<ov::Node>& var) {
+    std::string elem_type_str;
+    std::string part_shape_str;
+    std::unordered_set<std::string> names;
+    read(stream, elem_type_str);
+    read(stream, part_shape_str);
+    read(stream, names);
+    // NOTE: the code below is taken from NPU plugin's create_dummy_model()
+    std::shared_ptr<ov::Node> res =
+        std::make_shared<ov::op::v0::Constant>(ov::element::Type(elem_type_str), std::vector<std::size_t>{1});
+    // FIXME: serialize names as well?
+    const std::shared_ptr<ov::descriptor::Tensor>& tensor_dummy =
+        std::make_shared<ov::descriptor::Tensor>(ov::element::Type(elem_type_str),
+                                                 ov::PartialShape(part_shape_str),
+                                                 names);
+    var = std::make_shared<ov::op::v0::Result>(res);
+    var->output(0).set_tensor_ptr(tensor_dummy);
+    var->set_friendly_name(*names.begin());  // any_name ?
+}
diff --git a/src/plugins/intel_npu/src/plugin/npuw/serialization.hpp b/src/plugins/intel_npu/src/plugin/npuw/serialization.hpp
new file mode 100644
index 00000000000000..77a6b3aa865254
--- /dev/null
+++ b/src/plugins/intel_npu/src/plugin/npuw/serialization.hpp
@@ -0,0 +1,207 @@
+// Copyright (C) 2024-2025 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include <array>
+#include <iostream>
+#include <map>
+#include <memory>
+#include <optional>
+#include <string>
+#include <type_traits>
+#include <unordered_set>
+#include <vector>
+
+const constexpr std::array<char, 6> NPUW_SERIALIZATION_INDICATOR =
+    {char{0x13}, char{0x37}, char{0x6e}, char{0x70}, char{0x75}, char{0x77}};
+
+const constexpr char* NPUW_SERIALIZATION_VERSION = "0.0";
+
+// Forward declaration
+namespace intel_npu {
+class Config;
+}  // namespace intel_npu
+
+namespace ov {
+
+// Forward declaration
+class Node;
+class Tensor;
+template <typename NodeType>
+class Output;
+
+// Forward declaration
+namespace op {
+namespace v0 {
+class Parameter;
+}  // namespace v0
+}  // namespace op
+
+namespace npuw {
+
+// Forward declaration
+namespace compiled {
+struct Spatial;
+}  // namespace compiled
+
+namespace s11n {
+
+// Specific type overloads
+void write(std::ostream& stream, const std::streampos& var);
+void write(std::ostream& stream, const std::string& var);
+void write(std::ostream& stream, const bool& var);
+void write(std::ostream& stream, const ov::npuw::compiled::Spatial& var);
+void write(std::ostream& stream, const ov::Tensor& var);
+void write(std::ostream& stream, const ::intel_npu::Config& var);
+void write(std::ostream& stream, const ov::Output<const ov::Node>& var);
+
+void read(std::istream& stream, std::streampos& var);
+void read(std::istream& stream, std::string& var);
+void read(std::istream& stream, bool& var);
+void read(std::istream& stream, ov::npuw::compiled::Spatial& var);
+void read(std::istream& stream, ov::Tensor& var);
+void read(std::istream& stream, ::intel_npu::Config& var);
+void read(std::istream& stream, std::shared_ptr<ov::op::v0::Parameter>& var);
+void read(std::istream& stream, std::shared_ptr<ov::Node>& var);
+
+// Forward declaration
+template <typename T1, typename T2>
+void write(std::ostream& stream, const std::pair<T1, T2>& var);
+template <typename T>
+void write(std::ostream& stream, const std::vector<T>& var);
+template <typename T, std::size_t N>
+void write(std::ostream& stream, const std::array<T, N>& var);
+template <typename T1, typename T2>
+void read(std::istream& stream, std::pair<T1, T2>& var);
+template <typename T>
+void read(std::istream& stream, std::vector<T>& var);
+template <typename T, std::size_t N>
+void read(std::istream& stream, std::array<T, N>& var);
+
+// Serialization
+template <typename T, std::enable_if_t<std::is_integral<T>::value, bool> = true>
+void write(std::ostream& stream, const T& var) {
+    stream.write(reinterpret_cast<const char*>(&var), sizeof var);
+}
+
+template <typename T1, typename T2>
+void write(std::ostream& stream, const std::pair<T1, T2>& var) {
+    write(stream, var.first);
+    write(stream, var.second);
+}
+
+template <typename T>
+void write(std::ostream& stream, const std::vector<T>& var) {
+    write(stream, var.size());
+    for (const auto& el : var) {
+        write(stream, el);
+    }
+}
+
+template <typename T, std::size_t N>
+void write(std::ostream& stream, const std::array<T, N>& var) {
+    for (const auto& el : var) {
+        write(stream, el);
+    }
+}
+
+template <typename T>
+void write(std::ostream& stream, const std::unordered_set<T>& var) {
+    write(stream, var.size());
+    for (const auto& el : var) {
+        write(stream, el);
+    }
+}
+
+template <typename K, typename V>
+void write(std::ostream& stream, const std::map<K, V>& var) {
+    write(stream, var.size());
+    for (const auto& el : var) {
+        write(stream, el);
+    }
+}
+
+template <typename T>
+void write(std::ostream& stream, const std::optional<T>& var) {
+    if (var) {
+        write(stream, true);
+        write(stream, var.value());
+    } else {
+        write(stream, false);
+    }
+}
+
+// Deserialization
+template <typename T, std::enable_if_t<std::is_integral<T>::value, bool> = true>
+void read(std::istream& stream, T& var) {
+    stream.read(reinterpret_cast<char*>(&var), sizeof var);
+}
+
+template <typename T1, typename T2>
+void read(std::istream& stream, std::pair<T1, T2>& var) {
+    read(stream, var.first);
+    read(stream, var.second);
+}
+
+template <typename T>
+void read(std::istream& stream, std::vector<T>& var) {
+    var.clear();
+    std::size_t var_size = 0;
+    stream.read(reinterpret_cast<char*>(&var_size), sizeof var_size);
+    var.reserve(var_size);
+    for (std::size_t i = 0; i < var_size; ++i) {
+        T elem;
+        read(stream, elem);
+        var.push_back(elem);
+    }
+}
+
+template <typename T, std::size_t N>
+void read(std::istream& stream, std::array<T, N>& var) {
+    for (std::size_t i = 0; i < N; ++i) {
+        T elem;
+        read(stream, elem);
+        var[i] = elem;
+    }
+}
+
+template <typename T>
+void read(std::istream& stream, std::unordered_set<T>& var) {
+    var.clear();
+    std::size_t var_size = 0;
+    stream.read(reinterpret_cast<char*>(&var_size), sizeof var_size);
+    for (std::size_t i = 0; i < var_size; ++i) {
+        T elem;
+        read(stream, elem);
+        var.insert(elem);
+    }
+}
+
+template <typename K, typename V>
+void read(std::istream& stream, std::map<K, V>& var) {
+    var.clear();
+    std::size_t var_size = 0;
+    stream.read(reinterpret_cast<char*>(&var_size), sizeof var_size);
+    for (std::size_t i = 0; i < var_size; ++i) {
+        std::pair<K, V> elem;
+        read(stream, elem);
+        var[elem.first] = elem.second;
+    }
+}
+
+template <typename T>
+void read(std::istream& stream, std::optional<T>& var) {
+    bool has_value = false;
+    read(stream, has_value);
+    if (has_value) {
+        T val;
+        read(stream, val);
+        var = val;
+    }
+}
+
+}  // namespace s11n
+}  // namespace npuw
+}  // namespace ov
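A quick sketch (illustration only, not part of the patch) of how the container overloads compose; only the s11n calls come from the header above, the data is made up and <sstream> plus serialization.hpp are assumed to be included:

    // The overloads recurse through nested containers, so structured metadata such as
    // std::map<std::string, std::vector<std::size_t>> round-trips with one call each way.
    std::map<std::string, std::vector<std::size_t>> meta = {{"dims", {1, 2, 3}}};
    std::stringstream ss;
    ov::npuw::s11n::write(ss, meta);   // writes the size, then each key/value pair recursively
    decltype(meta) restored;
    ov::npuw::s11n::read(ss, restored);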
diff --git a/src/plugins/intel_npu/src/plugin/npuw/spatial.hpp b/src/plugins/intel_npu/src/plugin/npuw/spatial.hpp
index fce2f63db4e807..2dc7eeaac3c538 100644
--- a/src/plugins/intel_npu/src/plugin/npuw/spatial.hpp
+++ b/src/plugins/intel_npu/src/plugin/npuw/spatial.hpp
@@ -1,4 +1,4 @@
-// Copyright (C) 2023-2024 Intel Corporation
+// Copyright (C) 2023-2025 Intel Corporation
 // SPDX-License-Identifier: Apache-2.0
 //
 
@@ -48,6 +48,7 @@ struct Spatial {
     std::size_t nway_iters = 0u;
     std::size_t tail_size = 0u;
 
+    Spatial() = default;
     Spatial(const function::Spatial& s, const std::shared_ptr<ov::Model>& m)
         : range(s._range),
          nway(s._slice),
diff --git a/src/plugins/intel_npu/src/plugin/npuw/weights_bank.cpp b/src/plugins/intel_npu/src/plugin/npuw/weights_bank.cpp
index 4cc804f7b7e399..21b575fe54a53b 100644
--- a/src/plugins/intel_npu/src/plugin/npuw/weights_bank.cpp
+++ b/src/plugins/intel_npu/src/plugin/npuw/weights_bank.cpp
@@ -1,4 +1,4 @@
-// Copyright (C) 2024 Intel Corporation
+// Copyright (C) 2024-2025 Intel Corporation
 // SPDX-License-Identifier: Apache-2.0
 //
 
@@ -6,6 +6,7 @@
 
 #include "logging.hpp"
 #include "openvino/core/parallel.hpp"
+#include "serialization.hpp"
 #include "util.hpp"
 
 using ov::npuw::weights::Bank;
@@ -84,6 +85,7 @@ void Bank::evaluate_and_allocate() {
         std::unique_lock storage_guard(device_bank.mutex);
         vec.reserve(device_bank.storage.size());
+        // FIXME: only add non-allocated tensors here
         for (const auto& el : device_bank.storage) {
            vec.push_back(el.second.lt);
         }
@@ -155,6 +157,109 @@ bool Bank::is_remote(int64_t uid) const {
     return false;
 }
 
+void Bank::serialize(std::ostream& stream) const {
+    using namespace ov::npuw::s11n;
+
+    LOG_INFO("Serializing weights bank...");
+    LOG_BLOCK();
+
+    std::lock_guard guard(m_mutex);
+
+    write(stream, m_device_banks.size());
+
+    for (const auto& elem : m_device_banks) {
+        const auto& device = elem.first;
+        const auto& device_bank = elem.second;
+        std::lock_guard dev_guard(device_bank.mutex);
+        write(stream, device);
+        write(stream, device_bank.storage.size());
+        for (const auto& t_pair : device_bank.storage) {
+            write(stream, t_pair.first);
+            write(stream, t_pair.second.tensor);
+        }
+    }
+
+    LOG_INFO("DONE.");
+}
+
+std::shared_ptr<Bank> Bank::deserialize(std::istream& stream,
+                                        const std::shared_ptr<const ov::ICore>& core,
+                                        const std::string& name) {
+    using namespace ov::npuw::s11n;
+
+    LOG_INFO("Deserializing weights bank...");
+    LOG_BLOCK();
+
+    auto bank = ov::npuw::weights::bank(name, core, "");
+
+    std::size_t bank_size = 0;
+    read(stream, bank_size);
+
+    for (std::size_t i = 0; i < bank_size; ++i) {
+        std::string device;
+        read(stream, device);
+        std::size_t storage_size = 0;
+        read(stream, storage_size);
+        for (std::size_t j = 0; j < storage_size; ++j) {
+            int64_t uid = -1;
+            read(stream, uid);
+            bank->read_and_add_tensor(stream, uid, device);
+        }
+    }
+
+    LOG_INFO("DONE.");
+
+    return bank;
+}
+
+void Bank::read_and_add_tensor(std::istream& stream, int64_t uid, const std::string& device) {
+    using namespace ov::npuw::s11n;
+
+    // This method is supposed to be used only during deserialization
+    std::lock_guard guard(m_mutex);
+
+    auto& device_bank = m_device_banks[device];
+    std::lock_guard dev_guard(device_bank.mutex);
+
+    auto iter_device = device_bank.storage.find(uid);
+
+    if (iter_device != device_bank.storage.end()) {
+        // Already allocated
+        return;
+    }
+
+    if (device == "CPU") {
+        // Just read deserialized tensor into the bank
+        read(stream, device_bank.storage[uid].tensor);
+        return;
+    }
+
+    // Need to allocate on device and copy deserialized tensor to that memory
+    ov::SoPtr<ov::IRemoteTensor> remote_tensor;
+    ov::Tensor allocated_tensor;
+
+    // FIXME: reading not via a dedicated function
+    std::string type_str;
+    read(stream, type_str);
+    ov::element::Type type(type_str);
+
+    ov::Shape shape;
+    read(stream, shape);
+
+    std::size_t byte_size = 0;
+    read(stream, byte_size);
+
+    auto remote_ctx = m_core->get_default_context(device)._ptr;
+    remote_tensor = remote_ctx->create_host_tensor(type, shape);
+    allocated_tensor = ov::make_tensor(remote_tensor);
+    device_bank.storage[uid] = {LazyTensor(), allocated_tensor};
+    stream.read(reinterpret_cast<char*>(allocated_tensor.data()), byte_size);
+}
+
+std::string Bank::get_name() const {
+    return m_bank_name;
+}
+
 std::shared_ptr<Bank> BankManager::getBank(const std::string& bank_name,
                                            const std::shared_ptr<const ov::ICore>& core,
                                            const std::string& alloc_device) {
@@ -162,7 +267,7 @@ std::shared_ptr<Bank> BankManager::getBank(const std::string& bank_name,
     auto iter = m_bank_map.find(bank_name);
     if (iter == m_bank_map.end() || iter->second.expired()) {
-        auto bank = std::make_shared<Bank>(core, alloc_device);
+        auto bank = std::make_shared<Bank>(core, alloc_device, bank_name);
         m_bank_map[bank_name] = bank;
         return bank;
     }
@@ -174,7 +279,7 @@ std::shared_ptr<Bank> ov::npuw::weights::bank(const std::string& bank_name,
                                               const std::string& alloc_device) {
     if (bank_name.empty()) {
         // Don't share this bank in manager
-        return std::make_shared<Bank>(core, alloc_device);
+        return std::make_shared<Bank>(core, alloc_device, bank_name);
     }
 
     auto& instance = BankManager::getInstance();
diff --git a/src/plugins/intel_npu/src/plugin/npuw/weights_bank.hpp b/src/plugins/intel_npu/src/plugin/npuw/weights_bank.hpp
index 0d1d84b490c5e2..fd9f0e39841b7a 100644
--- a/src/plugins/intel_npu/src/plugin/npuw/weights_bank.hpp
+++ b/src/plugins/intel_npu/src/plugin/npuw/weights_bank.hpp
@@ -1,4 +1,4 @@
-// Copyright (C) 2024 Intel Corporation
+// Copyright (C) 2024-2025 Intel Corporation
 // SPDX-License-Identifier: Apache-2.0
 //
 
@@ -18,13 +18,19 @@
 namespace ov {
 namespace npuw {
 
+// Forward declaration
+class LLMCompiledModel;
+class CompiledModel;
 namespace weights {
 
 class Bank {
 public:
-    explicit Bank(const std::shared_ptr<const ov::ICore>& core, const std::string& alloc_device)
+    explicit Bank(const std::shared_ptr<const ov::ICore>& core,
+                  const std::string& alloc_device,
+                  const std::string& bank_name)
         : m_core(core),
-          m_alloc_device(alloc_device) {}
+          m_alloc_device(alloc_device),
+          m_bank_name(bank_name) {}
 
     // Register LazyTensor in a bank if it's not there. Returns LazyTensor's unique id
     int64_t registerLT(const LazyTensor& tensor, const std::string& device);
@@ -37,7 +43,12 @@ class Bank {
 
     bool is_remote(int64_t uid) const;
 
+    std::string get_name() const;
+
 private:
+    friend class ov::npuw::LLMCompiledModel;
+    friend class ov::npuw::CompiledModel;
+
     struct StoredTensor {
         LazyTensor lt;
         ov::Tensor tensor;
@@ -52,10 +63,18 @@ class Bank {
 
     ov::Tensor eval_and_alloc(const LazyTensor& tensor, DeviceBank& dbank, const std::string& device);
 
+    void serialize(std::ostream& stream) const;
+    static std::shared_ptr<Bank> deserialize(std::istream& stream,
+                                             const std::shared_ptr<const ov::ICore>& core,
+                                             const std::string& name);
+    // Used during deserialization
+    void read_and_add_tensor(std::istream& stream, int64_t uid, const std::string& device);
+
     mutable std::mutex m_mutex;
     std::shared_ptr<const ov::ICore> m_core = nullptr;
     std::string m_alloc_device;
     int64_t uid_count = 0;
+    std::string m_bank_name;
 };
 
 std::shared_ptr<Bank> bank(const std::string& bank_name,
diff --git a/src/plugins/intel_npu/src/plugin/src/plugin.cpp b/src/plugins/intel_npu/src/plugin/src/plugin.cpp
index fa641dfdcd9641..c5f5ba436785cd 100644
--- a/src/plugins/intel_npu/src/plugin/src/plugin.cpp
+++ b/src/plugins/intel_npu/src/plugin/src/plugin.cpp
@@ -1,4 +1,4 @@
-// Copyright (C) 2018-2024 Intel Corporation
+// Copyright (C) 2018-2025 Intel Corporation
 // SPDX-License-Identifier: Apache-2.0
 //
 
@@ -7,6 +7,10 @@
 #include <fstream>
 
 #include "compiled_model.hpp"
+#include "npuw/compiled_model.hpp"
+#include "npuw/llm_compiled_model.hpp"
+#include "npuw/serialization.hpp"
+#include "driver_compiler_adapter.hpp"
 #include "compiler_adapter_factory.hpp"
 #include "intel_npu/common/device_helpers.hpp"
 #include "intel_npu/common/icompiler_adapter.hpp"
@@ -752,7 +756,25 @@ std::shared_ptr<ov::ICompiledModel> Plugin::import_model(std::istream& stream, c
     OV_ITT_SCOPED_TASK(itt::domains::NPUPlugin, "Plugin::import_model");
     OV_ITT_TASK_CHAIN(PLUGIN_IMPORT_MODEL, itt::domains::NPUPlugin, "Plugin::import_model", "merge_configs");
 
-    const std::map<std::string, std::string> propertiesMap = any_copy(properties);
+    // If was exported via NPUW
+    auto stream_start_pos = stream.tellg();
+    std::array<char, 6> serialization_indicator;
+    ov::npuw::s11n::read(stream, serialization_indicator);
+    if (serialization_indicator == NPUW_SERIALIZATION_INDICATOR) {
+        stream.seekg(stream_start_pos);
+        return ov::npuw::LLMCompiledModel::deserialize(stream, shared_from_this());
+    }
+    stream.seekg(stream_start_pos);
+
+    // Drop NPUW properties if there are any
+    ov::AnyMap npu_plugin_properties;
+    for (auto it = properties.begin(); it != properties.end(); ++it) {
+        if (it->first.find("NPUW") == it->first.npos) {
+            npu_plugin_properties.insert(*it);
+        }
+    }
+    const std::map<std::string, std::string> propertiesMap = any_copy(npu_plugin_properties);
+
     auto localConfig = merge_configs(_globalConfig, propertiesMap, OptionMode::RunTime);
     _logger.setLevel(localConfig.get<LOG_LEVEL>());
     const auto platform = _backends->getCompilationPlatform(localConfig.get<PLATFORM>(), localConfig.get<DEVICE_ID>());
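To close the loop, a sketch (illustration only, not part of the patch) of the user-visible flow this enables through the regular OpenVINO API; the model path and the exact NPUW property set are assumptions:

    #include <fstream>
    #include "openvino/openvino.hpp"

    int main() {
        ov::Core core;
        auto model = core.read_model("model.xml");
        // Assumed NPUW/LLM configuration; any set that routes compilation to LLMCompiledModel works here.
        auto compiled = core.compile_model(model, "NPU", {{"NPU_USE_NPUW", "YES"}, {"NPUW_LLM", "YES"}});
        {
            std::ofstream blob("model.blob", std::ios::binary);
            compiled.export_model(blob);   // ends up in LLMCompiledModel::export_model()
        }
        std::ifstream blob("model.blob", std::ios::binary);
        auto restored = core.import_model(blob, "NPU");  // Plugin::import_model() detects the NPUW magic
        return 0;
    }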