Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[intel-npu] Support new internal cached_model_buffer config for memory mapped cached blobs #27822

Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
34 commits
Select commit Hold shift + click to select a range
2ab6464
Squash commits from rebase
alexandruenache1111 Dec 16, 2024
714c889
Few important changes:
alexandruenache1111 Dec 18, 2024
6ba434d
Added test for reading any invalid metadata version
alexandruenache1111 Dec 20, 2024
bbda03e
Updated writeAndReadInvalidMetadataVersion test body
alexandruenache1111 Jan 8, 2025
938fde9
Move version functions as static methods inside MetadataBase
alexandruenache1111 Jan 9, 2025
3bb13d9
Nitpicks
alexandruenache1111 Jan 9, 2025
9613e41
Move version field to MetadataBase
alexandruenache1111 Jan 13, 2025
f673af8
Add `ov::internal::caching_with_mmap` property logic
MirceaDan99 Nov 14, 2024
a55f767
Refactor compiler type selection
MirceaDan99 Nov 14, 2024
440a928
Fix OV cache header not being removed from blob for memory mapped cac…
MirceaDan99 Nov 14, 2024
b20d63a
Keep `shared_ptr` of blob in IGraph to fix `export_model` for import …
MirceaDan99 Nov 20, 2024
9d08a56
Refactor changes for CIP & Drop `parse` function from `ICompilerAdapt…
MirceaDan99 Nov 20, 2024
615090a
Update plugin API to import model with mmap buffer
olpipi Nov 19, 2024
cb350a8
Use new `import_model` with `model_buffer` API
MirceaDan99 Nov 21, 2024
bd736a5
New fix for adding offset to `model_buffer` relative to end position …
MirceaDan99 Nov 22, 2024
a1ef946
Fix `std::vector` being moved after accessing its `.data()` and `.size…
MirceaDan99 Nov 22, 2024
c2221ab
Refactor `getGraphHandle` to drop dependency to `ov::AlignedBuffer`
MirceaDan99 Nov 25, 2024
8de684c
Refactor `import_model` new API to accept only either `std::istream` …
MirceaDan99 Nov 26, 2024
28f0d07
Re-add `DriverGraph::release_blob` method and adapt to `ov::AlignedBu…
MirceaDan99 Nov 26, 2024
f537364
Code clean-up
MirceaDan99 Nov 27, 2024
f3e29de
Revert changes in new `import_model` API, so `NPU` plugin will have `…
MirceaDan99 Nov 27, 2024
88e80a3
Add `BlobContainer` class and derivatives for each `std::vector<uint8_t…
MirceaDan99 Nov 27, 2024
379c310
Fix clang formats
MirceaDan99 Dec 9, 2024
8060dbb
Use alternative from `PR #27981` instead for memory mapped buffers
MirceaDan99 Dec 11, 2024
09204af
Add suggested changes
MirceaDan99 Dec 12, 2024
9b0bc5b
Prepare `BlobContainerAlignedBuffer` for `OV versioning metadata`
MirceaDan99 Dec 12, 2024
fe97c12
Fix broken stream processed by NPUW
MirceaDan99 Jan 14, 2025
1e9a671
Fix offsets mismatch for HETERO plugin blob headers
MirceaDan99 Jan 14, 2025
1d38805
Optimize CIP path
MirceaDan99 Jan 17, 2025
e26d470
Add fix for new CIP optimization
MirceaDan99 Jan 17, 2025
c647071
Add unit tests
MirceaDan99 Jan 19, 2025
1801088
Revert changes for `CIP Optimization`
MirceaDan99 Jan 20, 2025
02683cf
Remove unit tests due to the need of extending `IGraph` api with
MirceaDan99 Jan 21, 2025
473b488
Add comment regarding `ov::internal::cached_model_buffer` not having …
MirceaDan99 Jan 22, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -0,0 +1,81 @@
// Copyright (C) 2018-2025 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#pragma once

#include <memory>
#include <vector>

#include "openvino/runtime/shared_buffer.hpp"

namespace intel_npu {

/**
* @brief Abstract, read-only view over the bytes of a blob.
*
* Implementations decide how the bytes are stored; callers only see a start address, a size and a way to
* ask the container to drop its storage.
*/
class BlobContainer {
public:
/**
* @brief Returns the address of the first byte of the blob.
*/
virtual const void* get_ptr() const = 0;

/**
* @brief Returns the size of the blob in bytes.
*/
virtual size_t size() const = 0;

/**
* @brief Asks the container to free the memory holding the blob.
* @return true if the storage was released, false if this container does not release its storage.
* @note NOTE(review): after a successful release the content previously returned by get_ptr() should be
* treated as gone - confirm against each implementation.
*/
virtual bool release_from_memory() = 0;

// Virtual so that a container can be destroyed through a pointer to this base class.
virtual ~BlobContainer() = default;
};

/**
 * @brief BlobContainer implementation that owns the blob bytes in a std::vector.
 */
class BlobContainerVector : public BlobContainer {
public:
    /**
     * @brief Takes ownership of the blob content.
     * @param blob Bytes of the blob; received by value and moved into the container.
     */
    BlobContainerVector(std::vector<uint8_t> blob) : _blob(std::move(blob)) {}

    /**
     * @brief Returns the address of the first byte stored in the owned vector.
     */
    const void* get_ptr() const override {
        // const uint8_t* converts implicitly to const void*, no cast is required.
        return _blob.data();
    }

    /**
     * @brief Returns the number of bytes currently stored.
     */
    size_t size() const override {
        return _blob.size();
    }

    /**
     * @brief Drops the owned bytes and gives the allocation back.
     * @return Always true.
     */
    bool release_from_memory() override {
        // Exchanging with an empty temporary leaves _blob empty; the temporary then frees the old storage.
        std::vector<uint8_t>().swap(_blob);
        return true;
    }

private:
    std::vector<uint8_t> _blob;
};

class BlobContainerAlignedBuffer : public BlobContainer {
public:
BlobContainerAlignedBuffer(const std::shared_ptr<ov::AlignedBuffer>& blobSO, size_t ovHeaderOffset, uint64_t size)
: _size(size),
_ovHeaderOffset(ovHeaderOffset),
_blobSO(blobSO) {}

const void* get_ptr() const override {
return _blobSO->get_ptr(_ovHeaderOffset);
}

size_t size() const override {
return _size;
}

bool release_from_memory() override {
return false;
}

private:
uint64_t _size;
size_t _ovHeaderOffset;
std::shared_ptr<ov::AlignedBuffer> _blobSO;
};

} // namespace intel_npu
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ class ICompilerAdapter {
public:
virtual std::shared_ptr<IGraph> compile(const std::shared_ptr<const ov::Model>& model,
const Config& config) const = 0;
virtual std::shared_ptr<IGraph> parse(std::vector<uint8_t> network, const Config& config) const = 0;
virtual std::shared_ptr<IGraph> parse(std::unique_ptr<BlobContainer> blobPtr, const Config& config) const = 0;
virtual ov::SupportedOpsMap query(const std::shared_ptr<const ov::Model>& model, const Config& config) const = 0;
virtual uint32_t get_version() const = 0;

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
#include <mutex>
#include <vector>

#include "intel_npu/common/blob_container.hpp"
#include "intel_npu/network_metadata.hpp"
#include "intel_npu/utils/zero/zero_init.hpp"
#include "intel_npu/utils/zero/zero_utils.hpp"
Expand All @@ -21,7 +22,7 @@ class IGraph : public std::enable_shared_from_this<IGraph> {
IGraph(ze_graph_handle_t handle,
NetworkMetadata metadata,
const Config& config,
std::optional<std::vector<uint8_t>> blob);
std::unique_ptr<BlobContainer> blobPtr);

virtual size_t export_blob(std::ostream& stream) const = 0;

Expand Down Expand Up @@ -89,7 +90,7 @@ class IGraph : public std::enable_shared_from_this<IGraph> {
// first inference starts running
std::mutex _mutex;

std::vector<uint8_t> _blob;
std::unique_ptr<BlobContainer> _blobPtr;

uint32_t _unique_id = 0;
uint32_t _last_submitted_id;
Expand Down
9 changes: 3 additions & 6 deletions src/plugins/intel_npu/src/common/src/igraph.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -17,14 +17,11 @@ namespace intel_npu {
IGraph::IGraph(ze_graph_handle_t handle,
NetworkMetadata metadata,
const Config& config,
std::optional<std::vector<uint8_t>> blob)
std::unique_ptr<BlobContainer> blobPtr)
: _handle(handle),
_metadata(std::move(metadata)),
_logger("IGraph", config.get<LOG_LEVEL>()) {
if (blob.has_value()) {
_blob = std::move(*blob);
}
}
_blobPtr(std::move(blobPtr)),
_logger("IGraph", config.get<LOG_LEVEL>()) {}

const NetworkMetadata& IGraph::get_metadata() const {
return _metadata;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ class DriverCompilerAdapter final : public ICompilerAdapter {

std::shared_ptr<IGraph> compile(const std::shared_ptr<const ov::Model>& model, const Config& config) const override;

std::shared_ptr<IGraph> parse(std::vector<uint8_t> network, const Config& config) const override;
std::shared_ptr<IGraph> parse(std::unique_ptr<BlobContainer> blobPtr, const Config& config) const override;

ov::SupportedOpsMap query(const std::shared_ptr<const ov::Model>& model, const Config& config) const override;

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ class DriverGraph final : public IGraph {
ze_graph_handle_t graphHandle,
NetworkMetadata metadata,
const Config& config,
std::optional<std::vector<uint8_t>> blob);
std::unique_ptr<BlobContainer> blobPtr);

size_t export_blob(std::ostream& stream) const override;

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ class PluginCompilerAdapter final : public ICompilerAdapter {

std::shared_ptr<IGraph> compile(const std::shared_ptr<const ov::Model>& model, const Config& config) const override;

std::shared_ptr<IGraph> parse(std::vector<uint8_t> network, const Config& config) const override;
std::shared_ptr<IGraph> parse(std::unique_ptr<BlobContainer> blobPtr, const Config& config) const override;

ov::SupportedOpsMap query(const std::shared_ptr<const ov::Model>& model, const Config& config) const override;

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ class PluginGraph final : public IGraph {
const std::shared_ptr<ZeroInitStructsHolder>& zeroInitStruct,
ze_graph_handle_t graphHandle,
NetworkMetadata metadata,
std::vector<uint8_t> blob,
std::unique_ptr<BlobContainer> blobPtr,
const Config& config);

size_t export_blob(std::ostream& stream) const override;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ class ZeGraphExtWrappers {
const std::string& buildFlags,
const uint32_t& flags) const;

ze_graph_handle_t getGraphHandle(const std::vector<uint8_t>& network) const;
ze_graph_handle_t getGraphHandle(const uint8_t& data, size_t size) const;

NetworkMetadata getNetworkMeta(ze_graph_handle_t graphHandle) const;

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -200,14 +200,16 @@ std::shared_ptr<IGraph> DriverCompilerAdapter::compile(const std::shared_ptr<con
graphHandle,
std::move(networkMeta),
config,
std::nullopt);
nullptr);
}

std::shared_ptr<IGraph> DriverCompilerAdapter::parse(std::vector<uint8_t> network, const Config& config) const {
std::shared_ptr<IGraph> DriverCompilerAdapter::parse(std::unique_ptr<BlobContainer> blobPtr,
const Config& config) const {
OV_ITT_TASK_CHAIN(PARSE_BLOB, itt::domains::NPUPlugin, "DriverCompilerAdapter", "parse");

_logger.debug("parse start");
ze_graph_handle_t graphHandle = _zeGraphExt->getGraphHandle(network);
ze_graph_handle_t graphHandle =
_zeGraphExt->getGraphHandle(*reinterpret_cast<const uint8_t*>(blobPtr->get_ptr()), blobPtr->size());
_logger.debug("parse end");

OV_ITT_TASK_NEXT(PARSE_BLOB, "getNetworkMeta");
Expand All @@ -218,7 +220,7 @@ std::shared_ptr<IGraph> DriverCompilerAdapter::parse(std::vector<uint8_t> networ
graphHandle,
std::move(networkMeta),
config,
std::optional<std::vector<uint8_t>>(std::move(network)));
std::move(blobPtr));
}

ov::SupportedOpsMap DriverCompilerAdapter::query(const std::shared_ptr<const ov::Model>& model,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -15,8 +15,8 @@ DriverGraph::DriverGraph(const std::shared_ptr<ZeGraphExtWrappers>& zeGraphExt,
ze_graph_handle_t graphHandle,
NetworkMetadata metadata,
const Config& config,
std::optional<std::vector<uint8_t>> blob)
: IGraph(graphHandle, std::move(metadata), config, std::move(blob)),
std::unique_ptr<BlobContainer> blobPtr)
: IGraph(graphHandle, std::move(metadata), config, std::move(blobPtr)),
_zeGraphExt(zeGraphExt),
_zeroInitStruct(zeroInitStruct),
_logger("DriverGraph", config.get<LOG_LEVEL>()) {
Expand Down Expand Up @@ -140,7 +140,7 @@ void DriverGraph::initialize(const Config& config) {
}

bool DriverGraph::release_blob(const Config& config) {
if (_blob.empty() || _zeroInitStruct->getGraphDdiTable().version() < ZE_GRAPH_EXT_VERSION_1_8 ||
if (_blobPtr == nullptr || _zeroInitStruct->getGraphDdiTable().version() < ZE_GRAPH_EXT_VERSION_1_8 ||
config.get<PERF_COUNT>()) {
return false;
}
Expand All @@ -153,8 +153,9 @@ bool DriverGraph::release_blob(const Config& config) {
return false;
}

_blob.clear();
_blob.shrink_to_fit();
if (!_blobPtr->release_from_memory()) {
return false;
}

_logger.debug("Blob is released");

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -80,14 +80,16 @@ std::shared_ptr<IGraph> PluginCompilerAdapter::compile(const std::shared_ptr<con

_logger.debug("compile start");
auto networkDesc = _compiler->compile(model, config);
auto blobPtr = std::make_unique<BlobContainerVector>(std::move(networkDesc.compiledNetwork));
_logger.debug("compile end");

ze_graph_handle_t graphHandle = nullptr;

if (_zeGraphExt) {
// Depending on the config, we may get an error when trying to get the graph handle from the compiled network
try {
graphHandle = _zeGraphExt->getGraphHandle(networkDesc.compiledNetwork);
graphHandle =
_zeGraphExt->getGraphHandle(*reinterpret_cast<const uint8_t*>(blobPtr->get_ptr()), blobPtr->size());
} catch (...) {
_logger.info("Failed to obtain the level zero graph handle. Inference requests for this model are not "
"allowed. Only exports are available");
Expand All @@ -99,29 +101,36 @@ std::shared_ptr<IGraph> PluginCompilerAdapter::compile(const std::shared_ptr<con
_zeroInitStruct,
graphHandle,
std::move(networkDesc.metadata),
std::move(networkDesc.compiledNetwork),
std::move(blobPtr),
config);
}

std::shared_ptr<IGraph> PluginCompilerAdapter::parse(std::vector<uint8_t> network, const Config& config) const {
std::shared_ptr<IGraph> PluginCompilerAdapter::parse(std::unique_ptr<BlobContainer> blobPtr,
const Config& config) const {
OV_ITT_TASK_CHAIN(PARSE_BLOB, itt::domains::NPUPlugin, "PluginCompilerAdapter", "parse");

_logger.debug("parse start");
std::vector<uint8_t> network(blobPtr->size());
network.assign(reinterpret_cast<const uint8_t*>(blobPtr->get_ptr()),
reinterpret_cast<const uint8_t*>(blobPtr->get_ptr()) + blobPtr->size());
auto networkMeta = _compiler->parse(network, config);
network.clear();
network.shrink_to_fit();
_logger.debug("parse end");

ze_graph_handle_t graphHandle = nullptr;

if (_zeGraphExt) {
graphHandle = _zeGraphExt->getGraphHandle(network);
graphHandle =
_zeGraphExt->getGraphHandle(*reinterpret_cast<const uint8_t*>(blobPtr->get_ptr()), blobPtr->size());
}

return std::make_shared<PluginGraph>(_zeGraphExt,
_compiler,
_zeroInitStruct,
graphHandle,
std::move(networkMeta),
std::move(network),
std::move(blobPtr),
config);
}

Expand Down
19 changes: 12 additions & 7 deletions src/plugins/intel_npu/src/compiler_adapter/src/plugin_graph.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -15,9 +15,9 @@ PluginGraph::PluginGraph(const std::shared_ptr<ZeGraphExtWrappers>& zeGraphExt,
const std::shared_ptr<ZeroInitStructsHolder>& zeroInitStruct,
ze_graph_handle_t graphHandle,
NetworkMetadata metadata,
std::vector<uint8_t> blob,
std::unique_ptr<BlobContainer> blobPtr,
const Config& config)
: IGraph(graphHandle, std::move(metadata), config, std::optional<std::vector<uint8_t>>(std::move(blob))),
: IGraph(graphHandle, std::move(metadata), config, std::move(blobPtr)),
_zeGraphExt(zeGraphExt),
_zeroInitStruct(zeroInitStruct),
_compiler(compiler),
Expand All @@ -31,7 +31,7 @@ PluginGraph::PluginGraph(const std::shared_ptr<ZeGraphExtWrappers>& zeGraphExt,
}

size_t PluginGraph::export_blob(std::ostream& stream) const {
stream.write(reinterpret_cast<const char*>(_blob.data()), _blob.size());
stream.write(reinterpret_cast<const char*>(_blobPtr->get_ptr()), _blobPtr->size());

if (!stream) {
_logger.error("Write blob to stream failed. Blob is broken!");
Expand All @@ -40,21 +40,26 @@ size_t PluginGraph::export_blob(std::ostream& stream) const {

if (_logger.level() >= ov::log::Level::INFO) {
std::uint32_t result = 1171117u;
for (const uint8_t* it = _blob.data(); it != _blob.data() + _blob.size(); ++it) {
for (const uint8_t* it = reinterpret_cast<const uint8_t*>(_blobPtr->get_ptr());
it != reinterpret_cast<const uint8_t*>(_blobPtr->get_ptr()) + _blobPtr->size();
++it) {
result = ((result << 7) + result) + static_cast<uint32_t>(*it);
}

std::stringstream str;
str << "Blob size: " << _blob.size() << ", hash: " << std::hex << result;
str << "Blob size: " << _blobPtr->size() << ", hash: " << std::hex << result;
_logger.info(str.str().c_str());
}
_logger.info("Write blob to stream successfully.");
return _blob.size();
return _blobPtr->size();
}

std::vector<ov::ProfilingInfo> PluginGraph::process_profiling_output(const std::vector<uint8_t>& profData,
const Config& config) const {
return _compiler->process_profiling_output(profData, _blob, config);
std::vector<uint8_t> blob(_blobPtr->size());
blob.assign(reinterpret_cast<const uint8_t*>(_blobPtr->get_ptr()),
reinterpret_cast<const uint8_t*>(_blobPtr->get_ptr()) + _blobPtr->size());
return _compiler->process_profiling_output(profData, blob, config);
}

void PluginGraph::set_argument_value(uint32_t argi, const void* argv) const {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -365,19 +365,15 @@ ze_graph_handle_t ZeGraphExtWrappers::getGraphHandle(std::pair<size_t, std::shar
return graphHandle;
}

ze_graph_handle_t ZeGraphExtWrappers::getGraphHandle(const std::vector<uint8_t>& network) const {
ze_graph_handle_t ZeGraphExtWrappers::getGraphHandle(const uint8_t& blobData, size_t blobSize) const {
ze_graph_handle_t graphHandle;

if (network.empty()) {
if (blobSize == 0) {
OPENVINO_THROW("Empty blob");
}

ze_graph_desc_t desc = {ZE_STRUCTURE_TYPE_GRAPH_DESC_PROPERTIES,
nullptr,
ZE_GRAPH_FORMAT_NATIVE,
network.size(),
network.data(),
nullptr};
ze_graph_desc_t desc =
{ZE_STRUCTURE_TYPE_GRAPH_DESC_PROPERTIES, nullptr, ZE_GRAPH_FORMAT_NATIVE, blobSize, &blobData, nullptr};

_logger.debug("getGraphHandle - perform pfnCreate");
auto result = _zeroInitStruct->getGraphDdiTable().pfnCreate(_zeroInitStruct->getContext(),
Expand Down
3 changes: 2 additions & 1 deletion src/plugins/intel_npu/src/plugin/include/metrics.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -68,7 +68,8 @@ class Metrics final {
ov::intel_npu::batch_mode.name(),
ov::hint::execution_mode.name()};

const std::vector<ov::PropertyName> _internalSupportedProperties = {ov::internal::caching_properties.name()};
const std::vector<ov::PropertyName> _internalSupportedProperties = {ov::internal::caching_properties.name(),
ov::internal::caching_with_mmap.name()};

// Metric to provide a hint for a range for number of async infer requests. (bottom bound, upper bound, step)
const std::tuple<uint32_t, uint32_t, uint32_t> _rangeForAsyncInferRequests{1u, 10u, 1u};
Expand Down
Loading
Loading