Changes from all commits (25 commits)
fcceeb7  Introduce lazy memory allocation for ireq's I/O  (smirnov-alexey, Oct 2, 2025)
f01b8d5  Fix no tensor being present in the storage  (smirnov-alexey, Oct 2, 2025)
f6fbf64  Merge branch 'master' of https://github.com/openvinotoolkit/openvino …  (smirnov-alexey, Oct 9, 2025)
bd27bbc  Address review comments  (smirnov-alexey, Oct 10, 2025)
b258ed0  Merge branch 'master' of https://github.com/openvinotoolkit/openvino …  (smirnov-alexey, Oct 13, 2025)
bd03f34  Merge branch 'master' of https://github.com/openvinotoolkit/openvino …  (smirnov-alexey, Oct 31, 2025)
7dab40d  Copy Xiong's changes  (smirnov-alexey, Oct 31, 2025)
20af381  Remove copy  (smirnov-alexey, Nov 3, 2025)
a9e2029  WIP  (smirnov-alexey, Nov 4, 2025)
a54d529  Fix concurrency issue with iterator invalidation  (smirnov-alexey, Nov 5, 2025)
ec23004  Merge branch 'master' of https://github.com/openvinotoolkit/openvino …  (smirnov-alexey, Nov 5, 2025)
a09921f  Refactoring  (smirnov-alexey, Nov 5, 2025)
8146143  Fix merge  (smirnov-alexey, Nov 5, 2025)
42b7c1d  Protect get_tensor by mutex  (smirnov-alexey, Nov 5, 2025)
cf01cf8  Merge branch 'as/npuw_lazy_io_alloc' of https://github.com/smirnov-al…  (smirnov-alexey, Nov 5, 2025)
430af31  Disable kv cache sharing when one of the models is transposed  (smirnov-alexey, Nov 5, 2025)
4ab490b  Fix strides  (smirnov-alexey, Nov 7, 2025)
05895d9  Handle strided tensors - copy on host  (smirnov-alexey, Nov 10, 2025)
5ececfc  Merge branch 'master' of https://github.com/openvinotoolkit/openvino …  (smirnov-alexey, Nov 10, 2025)
2183e6a  Handle strided tensors in pyramid attention  (smirnov-alexey, Nov 10, 2025)
ea2b7a0  Address review comments  (smirnov-alexey, Nov 11, 2025)
c2d6cb7  Merge branch 'as/npuw_lazy_io_alloc' of https://github.com/smirnov-al…  (smirnov-alexey, Nov 11, 2025)
afa4146  Address review comments  (smirnov-alexey, Nov 11, 2025)
db78940  Fix shape  (smirnov-alexey, Nov 11, 2025)
a76cea0  Merge branch 'master' into as/npuw_share_kvcache  (smirnov-alexey, Nov 11, 2025)
107 changes: 70 additions & 37 deletions src/plugins/intel_npu/src/plugin/npuw/base_sync_infer_request.cpp
@@ -158,15 +158,50 @@ void ov::npuw::IBaseInferRequest::ensure_subrequest_is_accurate(std::size_t idx,
}

ov::SoPtr<ov::ITensor> ov::npuw::IBaseInferRequest::get_tensor(const ov::Output<const ov::Node>& port) const {
// assert(persistent)
return m_port_to_tensor.at(port).tensor;
std::unique_lock lock(m_io_storages_mutex);

if (is_stored(port)) {
return m_port_to_tensor.at(port).tensor;
}

// I/O: allocate here on demand (to reduce memory consumption in case some I/O tensors are shared)
// Input
for (std::size_t i = 0; i < m_npuw_model->inputs().size(); ++i) {
if (m_npuw_model->inputs()[i] == port) {
ov::SoPtr<ov::ITensor> allocated = allocOut(port, global_input_mem_device(i));
m_input_allocated.insert(allocated->data());
m_port_to_tensor[port] = TensorStorage{allocated, true};
return m_port_to_tensor.at(port).tensor;
}
}

// Output
for (size_t i = 0; i < m_npuw_model->outputs().size(); i++) {
if (m_npuw_model->outputs()[i] == port) {
auto tensor = alloc_global_out(i);
m_port_to_tensor[port] = TensorStorage{tensor, true};
return m_port_to_tensor.at(port).tensor;
}
}

NPUW_ASSERT(false);
return {};
}

void ov::npuw::IBaseInferRequest::set_tensor(const ov::Output<const ov::Node>& port,
const ov::SoPtr<ov::ITensor>& tensor) {
// Assigning via .at() to ensure it is a known port
// assert(persistent)
m_port_to_tensor.at(port).tensor = tensor;
std::unique_lock lock(m_io_storages_mutex);

if (!is_stored(port)) {
// TODO: might be useful to check if the tensor is allocated on the device
m_port_to_tensor[port] = TensorStorage{tensor, false};
} else {
m_port_to_tensor.at(port).tensor = tensor;
}

if (is_io(port)) {
m_port_to_tensor.at(port).persistent = true;
}

// Check if setting input tensor
if (m_port_to_tensor.at(port).persistent) {
@@ -190,6 +225,24 @@ void ov::npuw::IBaseInferRequest::check_tensors() const {
return;
}

bool ov::npuw::IBaseInferRequest::is_stored(const ov::Output<const ov::Node>& port) const {
return m_port_to_tensor.find(port) != m_port_to_tensor.end();
}

bool ov::npuw::IBaseInferRequest::is_io(const ov::Output<const ov::Node>& port) const {
for (std::size_t i = 0; i < m_npuw_model->inputs().size(); ++i) {
if (m_npuw_model->inputs()[i] == port) {
return true;
}
}
for (std::size_t i = 0; i < m_npuw_model->outputs().size(); ++i) {
if (m_npuw_model->outputs()[i] == port) {
return true;
}
}
return false;
}

void ov::npuw::IBaseInferRequest::handle_set_remote_input(const ov::Output<const ov::Node>& port,
const ov::SoPtr<ov::ITensor>& tensor) {
for (std::size_t i = 0; i < m_npuw_model->inputs().size(); ++i) {
@@ -209,6 +262,8 @@ void ov::npuw::IBaseInferRequest::handle_set_remote_input(const ov::Output<const
static_cast<ze_context_handle_t>(zrh.as<void*>()),
tensor->data()) > 0) {
if (tensor->is_continuous()) {
// Note: no need for locking as it's an internal method that should
// only be called from set_tensor()
m_input_allocated.insert(tensor->data());
} else {
LOG_WARN("Strided remote tensor is not supported on the device! Expect worse performance due "
@@ -292,14 +347,14 @@ std::size_t ov::npuw::IBaseInferRequest::total_subrequests() const {

ov::npuw::TensorPtr ov::npuw::IBaseInferRequest::allocMem(const ov::element::Type type,
const ov::Shape& shape,
const std::string& device) {
const std::string& device) const {
auto ptr = ov::npuw::util::allocMem(type, shape, device, m_npuw_model->get_plugin());
m_footprint[device] += ptr->get_byte_size();
return ptr;
}

ov::npuw::TensorPtr ov::npuw::IBaseInferRequest::allocOut(const ov::Output<const ov::Node>& node,
const std::string& device) {
const std::string& device) const {
return allocMem(node.get_element_type(), node.get_shape(), device);
}

@@ -328,31 +383,7 @@ std::string ov::npuw::IBaseInferRequest::global_output_mem_device(std::size_t id
return *proto_comp_model_desc.device_it;
}

void ov::npuw::IBaseInferRequest::alloc_io() {
// Preallocate input tensors
LOG_INFO("Preallocating input tensors...");
for (size_t i = 0; i < m_npuw_model->inputs().size(); i++) {
const auto& port = m_npuw_model->inputs()[i];
ov::SoPtr<ov::ITensor> allocated = allocOut(port, global_input_mem_device(i));
m_input_allocated.insert(allocated->data());
m_port_to_tensor[port] = TensorStorage{allocated, true};
} // for(inputs)

// Preallocate output tensors
LOG_INFO("Preallocating output tensors...");
for (size_t i = 0; i < m_npuw_model->outputs().size(); i++) {
LOG_BLOCK();
const auto& port = m_npuw_model->outputs()[i];
LOG_INFO("Output " << i << " of " << m_npuw_model->outputs().size() << ": " << port);

// FIXME: Yes, the CompiledModel::ToSubmodel == JustInferRequest::LinkFrom
const auto& from_submodel = m_npuw_model->m_outputs_to_submodels_outputs.at(i);
LOG_INFO("Produced by Subgraph[" << from_submodel.first << "] / " << from_submodel.second);

auto tensor = alloc_global_out(i);
m_port_to_tensor[port] = TensorStorage{tensor, true};
}

void ov::npuw::IBaseInferRequest::alloc_quant_gather() {
// Try to allocate intermediate tensors to gather into, when host quant gather is enabled
for (size_t i = 0; i < m_num_submodels; i++) {
auto& comp_model_desc = m_npuw_model->m_compiled_submodels[i];
@@ -363,7 +394,7 @@ void ov::npuw::IBaseInferRequest::alloc_io() {
}
}

ov::npuw::TensorPtr ov::npuw::IBaseInferRequest::alloc_global_out(std::size_t out_idx) {
ov::npuw::TensorPtr ov::npuw::IBaseInferRequest::alloc_global_out(std::size_t out_idx) const {
const auto& port = m_npuw_model->outputs().at(out_idx);
return allocOut(port, global_output_mem_device(out_idx));
}
@@ -540,7 +571,7 @@ void ov::npuw::IBaseInferRequest::bind_global_params(std::size_t idx, RqPtr requ
LOG_DEBUG("Processing " << param_idx << " -> " << sub_in_idx << std::endl);

const auto& g_port = m_npuw_model->inputs()[param_idx];
const auto& g_tnsr = m_port_to_tensor.at(g_port).tensor;
const auto& g_tnsr = get_tensor(g_port);
const auto& s_port = request->get_inputs()[sub_in_idx];
LOG_DEBUG("Processing " << g_port << " -> " << s_port << "...");
LOG_BLOCK();
@@ -558,8 +589,10 @@ void ov::npuw::IBaseInferRequest::bind_global_params(std::size_t idx, RqPtr requ
// Register for future use
m_attention_io[idx].inputs.at(sub_in_idx) = g_tnsr;
} else {
// Lock mutex just in case. m_input_allocated might be altered in parallel in get_tensor()
std::unique_lock lock(m_io_storages_mutex);
// Input parameter is non-spatial, do normal handling
if (m_input_allocated.count(g_tnsr->data()) == 0 && do_copy) {
if ((m_input_allocated.count(g_tnsr->data()) == 0 && do_copy) || !g_tnsr->is_continuous()) {
LOG_DEBUG("Will be copied");
copy_list.emplace_back(g_tnsr, s_port);
} else {
@@ -800,7 +833,7 @@ void ov::npuw::IBaseInferRequest::bind_pyramid_attention_inputs(std::size_t idx,
LOG_BLOCK();

// Optimization for the last chunk: Direct tensor reuse when shapes match
if (static_cast<int64_t>(input_shape[param.dim]) == past_len) {
if (static_cast<int64_t>(input_shape[param.dim]) == past_len && input->is_continuous()) {
request->set_tensor(iport, input);
continue;
}
@@ -877,7 +910,7 @@ void ov::npuw::IBaseInferRequest::bind_global_results(std::size_t idx, RqPtr req
std::tie(result_idx, sub_out_idx) = it;
const auto& g_port = m_npuw_model->outputs()[result_idx];
const auto& s_port = request->get_outputs()[sub_out_idx];
request->set_tensor(s_port, m_port_to_tensor.at(g_port).tensor);
request->set_tensor(s_port, get_tensor(g_port));
}

LOG_DEBUG("Done");
@@ -110,7 +110,16 @@ class IBaseInferRequest : public ov::ISyncInferRequest {
// reset to 0 before every new execution
};
// FROM(Every subrequests' output port) TO(Its output tensor)
std::map<ov::Output<const ov::Node>, TensorStorage> m_port_to_tensor;
// mutable due to lazy I/O allocation in get_tensor()
mutable std::map<ov::Output<const ov::Node>, TensorStorage> m_port_to_tensor;

// FIXME: need to lock internal storages (e.g. accessed within get_tensor())
mutable std::mutex m_io_storages_mutex;

// Check whether m_port_to_tensor has a tensor stored for the port
bool is_stored(const ov::Output<const ov::Node>& port) const;
// Check whether the port is a model I/O port
bool is_io(const ov::Output<const ov::Node>& port) const;

struct QuantGatherTensors {
ov::Tensor w, z, s;
@@ -161,15 +170,15 @@ class IBaseInferRequest : public ov::ISyncInferRequest {
std::vector<GlobalIO> m_subrequests_gio;

// Tracks tensors we allocated on our own - to recognize and avoid copies
std::unordered_set<void*> m_input_allocated;
mutable std::unordered_set<void*> m_input_allocated; // mutable due to lazy I/O allocation in get_tensor()

// Common functionality - shared for subclasses
const std::size_t m_num_submodels;

TensorPtr allocMem(const ov::element::Type type, const ov::Shape& shape, const std::string& device);
TensorPtr allocOut(const ov::Output<const ov::Node>& node, const std::string& device);
virtual void alloc_io();
virtual TensorPtr alloc_global_out(std::size_t out_idx);
TensorPtr allocMem(const ov::element::Type type, const ov::Shape& shape, const std::string& device) const;
TensorPtr allocOut(const ov::Output<const ov::Node>& node, const std::string& device) const;
virtual void alloc_quant_gather();
virtual TensorPtr alloc_global_out(std::size_t out_idx) const;

std::string global_input_mem_device(std::size_t idx) const;
std::string global_output_mem_device(std::size_t idx) const;
@@ -193,7 +202,7 @@ class IBaseInferRequest : public ov::ISyncInferRequest {

MS m_ms_unpack;
ov::npuw::perf::Profile<MS> m_profile;
ov::npuw::perf::Profile<B> m_footprint;
mutable ov::npuw::perf::Profile<B> m_footprint; // mutable due to lazy I/O allocation in get_tensor()

std::string profile_tag(std::size_t idx) const;

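
Editor's note: the header changes above make the allocation helpers const and several storages mutable because a const get_tensor() can now allocate and record the memory footprint. Below is a small sketch of that idiom under stated assumptions: FootprintTracker and allocate_raw() are made-up names, and in the real class the same mutex that guards the tensor storage would also have to cover this counter.

```cpp
#include <cstddef>
#include <map>
#include <memory>
#include <string>
#include <vector>

// Sketch: per-device byte accounting inside a const allocation helper.
// allocate_raw() is a hypothetical stand-in for the real device allocator.
class FootprintTracker {
public:
    std::shared_ptr<std::vector<std::byte>> allocMem(std::size_t bytes, const std::string& device) const {
        auto buffer = allocate_raw(bytes, device);
        m_footprint[device] += bytes;  // mutable: bookkeeping from a const code path
        return buffer;
    }

    std::size_t footprint(const std::string& device) const {
        auto it = m_footprint.find(device);
        return it == m_footprint.end() ? 0u : it->second;
    }

private:
    static std::shared_ptr<std::vector<std::byte>> allocate_raw(std::size_t bytes, const std::string&) {
        return std::make_shared<std::vector<std::byte>>(bytes);  // host placeholder
    }
    mutable std::map<std::string, std::size_t> m_footprint;
};
```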
23 changes: 12 additions & 11 deletions src/plugins/intel_npu/src/plugin/npuw/just_sync_infer_request.cpp
@@ -169,6 +169,8 @@ void ov::npuw::FuncMemMgr::assign(const LinkFrom& from) {
oshape[proto_comp_model_desc.spatial->out_dim] = proto_comp_model_desc.spatial->range;
}
const auto& device = m_model->funcall_mem_device(real_idx);
// FIXME: handle this the lazy way (see BaseSyncInferRequest::get_tensor())
// and share between submodels to reduce memory consumption
TensorPtr new_tensor = m_alloc(oport.get_element_type(), oshape, device);
NPUW_ASSERT(new_tensor);

@@ -242,14 +244,16 @@ ov::npuw::JustInferRequest::JustInferRequest(const std::shared_ptr<ov::npuw::Com
// Note: these buffers are allocated to the entire NWAY (> tail_size)
for (auto&& p : proto_comp_model_desc.spatial->params) {
const auto& iport = proto_comp_model_desc.compiled_model->inputs()[p.idx];
m_spatial_io[real_idx].input_tails[p.idx] =
allocOut(iport, m_npuw_model->funcall_mem_device(real_idx));
m_spatial_io[real_idx].input_tails[p.idx] = allocOut(
iport,
m_npuw_model->funcall_mem_device(real_idx)); // should it be handled the lazy way as well?
}
const auto num_outs = proto_comp_model_desc.compiled_model->outputs().size();
for (std::size_t out_idx = 0u; out_idx < num_outs; out_idx++) {
const auto& oport = proto_comp_model_desc.compiled_model->outputs()[out_idx];
m_spatial_io[real_idx].output_tails[out_idx] =
allocOut(oport, m_npuw_model->funcall_mem_device(real_idx));
m_spatial_io[real_idx].output_tails[out_idx] = allocOut(
oport,
m_npuw_model->funcall_mem_device(real_idx)); // should it be handled the lazy way as well?
}
}
} // if(spatial)
@@ -349,7 +353,7 @@ ov::npuw::JustInferRequest::JustInferRequest(const std::shared_ptr<ov::npuw::Com
}
} // if(function_pipelining)

alloc_io();
alloc_quant_gather();
connect_subrequests();
init_gio();

Expand Down Expand Up @@ -430,11 +434,8 @@ ov::npuw::JustInferRequest::JustInferRequest(const std::shared_ptr<ov::npuw::Com

void ov::npuw::JustInferRequest::set_tensor(const ov::Output<const ov::Node>& port,
const ov::SoPtr<ov::ITensor>& tensor) {
// Check that it's I/O
NPUW_ASSERT(m_port_to_tensor.at(port).persistent);

// Assigning via .at() to ensure it is a known port
m_port_to_tensor.at(port).tensor = tensor;
NPUW_ASSERT(is_io(port));
m_port_to_tensor[port] = TensorStorage{tensor, true};

// Check if setting output tensor
for (std::size_t i = 0; i < m_npuw_model->outputs().size(); ++i) {
Expand All @@ -457,7 +458,7 @@ void ov::npuw::JustInferRequest::set_tensor(const ov::Output<const ov::Node>& po
handle_set_remote_input(port, tensor);
}

ov::npuw::TensorPtr ov::npuw::JustInferRequest::alloc_global_out(std::size_t out_idx) {
ov::npuw::TensorPtr ov::npuw::JustInferRequest::alloc_global_out(std::size_t out_idx) const {
const auto& from_submodel = m_npuw_model->m_outputs_to_submodels_outputs.at(out_idx);
auto funcall_result_iter = m_funcall_result.find(from_submodel);
if (funcall_result_iter != m_funcall_result.end()) {
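
Editor's note: several hunks in this PR gate direct tensor reuse on is_continuous() and otherwise fall back to a copy on the host. As a plain-C++ illustration of what densifying a strided view means (independent of the OpenVINO tensor API; densify_2d() is a made-up helper, not part of the plugin):

```cpp
#include <cstddef>
#include <cstring>
#include <vector>

// Sketch: densify a 2D strided buffer (row stride given in bytes) into a
// contiguous host buffer. Illustrative only; the plugin uses its tensor API.
std::vector<std::byte> densify_2d(const std::byte* src,
                                  std::size_t rows,
                                  std::size_t row_bytes,     // meaningful bytes per row
                                  std::size_t row_stride) {  // bytes between row starts, >= row_bytes
    std::vector<std::byte> dst(rows * row_bytes);
    for (std::size_t r = 0; r < rows; ++r) {
        std::memcpy(dst.data() + r * row_bytes, src + r * row_stride, row_bytes);
    }
    return dst;
}
```

A device path that cannot consume strided views would then receive the dense buffer instead of the original mapping, which is the trade-off behind the warning about worse performance for strided remote tensors.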
@@ -83,7 +83,7 @@ class JustInferRequest final : public IBaseInferRequest {
bool supports_async_pipeline() const override;
void update_subrequest_links(std::size_t idx) override;

TensorPtr alloc_global_out(std::size_t out_idx) override;
TensorPtr alloc_global_out(std::size_t out_idx) const override;

void set_tensor(const ov::Output<const ov::Node>& port, const ov::SoPtr<ov::ITensor>& tensor) override;
