Skip to content

Commit

Permalink
Clean up code
Browse files Browse the repository at this point in the history
  • Loading branch information
EgorDuplensky committed Nov 13, 2024
1 parent 8de22e5 commit d5bda07
Show file tree
Hide file tree
Showing 26 changed files with 67 additions and 402 deletions.
2 changes: 0 additions & 2 deletions src/inference/dev_api/openvino/runtime/memory_solver.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -52,15 +52,13 @@ class MemorySolver {
struct Box {
/** Execution order index of first use. The data will be produced here. */
int start;
// intel_cpu::GlobalExecutionIndex start;

/**
* The execution order index of last use. After that data will be released.
* -1 is a reserved value for "till to end". The data will be alive to very
* end of execution.
*/
int finish;
// intel_cpu::GlobalExecutionIndex finish;

/** Size of data. In abstract unit of measure (byte, simd, cache line, ...) */
int64_t size;
Expand Down
9 changes: 3 additions & 6 deletions src/plugins/intel_cpu/src/compiled_model.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -158,17 +158,14 @@ CompiledModel::GraphGuard::Lock CompiledModel::get_graph() const {
GraphContext::Ptr ctx;
{
std::lock_guard<std::mutex> lock{*m_mutex.get()};
MemoryControl* memoryControl = m_networkMemoryControl->createMemoryControlUnit();
auto isQuantizedFlag = (m_cfg.lpTransformsMode == Config::On) &&
ov::pass::low_precision::LowPrecision::isFunctionQuantized(m_model);
ctx = std::make_shared<GraphContext>(m_cfg,
m_socketWeights[socketId],
isQuantizedFlag,
memoryControl,
m_networkMemoryControl,
m_networkMemoryControl->createMemoryControlUnit(),
streamsExecutor,
m_sub_memory_manager,
true);
m_sub_memory_manager);
}

const std::shared_ptr<const ov::Model> model = m_model;
Expand Down Expand Up @@ -356,7 +353,7 @@ void CompiledModel::release_memory() {
for (auto&& graph : m_graphs) {
GraphGuard::Lock graph_lock{graph};
auto ctx = graph_lock._graph.getGraphContext();
m_networkMemoryControl->releaseMemory();
ctx->getMemoryControl()->releaseMemory();
}
}

Expand Down
1 change: 0 additions & 1 deletion src/plugins/intel_cpu/src/compiled_model.h
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,6 @@
#include "openvino/runtime/iinfer_request.hpp"
#include "openvino/runtime/iplugin.hpp"
#include "openvino/runtime/isync_infer_request.hpp"
#include "openvino/runtime/threading/thread_local.hpp"
#include "sub_memory_manager.hpp"

namespace ov {
Expand Down
14 changes: 0 additions & 14 deletions src/plugins/intel_cpu/src/cpu_memory.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -204,24 +204,10 @@ void MemoryBlockWithReuse::setExtBuff(void *ptr, size_t size) {
m_data = decltype(m_data)(ptr, release);
}

// class MemoryUsage {
// public:
// MemoryUsage() {}

// ~MemoryUsage() {
// std::cout << "Total memory usage: " << total << "\n";
// }

// int total = 0;
// };

bool MemoryBlockWithReuse::resize(size_t size) {
// static MemoryUsage mu;

constexpr int cacheLineSize = 64;
bool sizeChanged = false;
if (size > m_memUpperBound) {
// mu.total += size;
void *ptr = dnnl::impl::malloc(size, cacheLineSize);
if (!ptr) {
OPENVINO_THROW("Failed to allocate ", size, " bytes of memory");
Expand Down
5 changes: 0 additions & 5 deletions src/plugins/intel_cpu/src/edge.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -293,11 +293,6 @@ std::string Edge::name() const {
std::stringstream result;

result << parentPtr->getName() << " port " << parent_port << " <-> " << childPtr->getName() << " port " << child_port;
// result << parentPtr->getName()<< " port " << parent_port
// << " <-> "
// << childPtr->getName() << " port " << child_port
// << " status: "
// << static_cast<int>(getStatus());

return result.str();
}
Expand Down
128 changes: 17 additions & 111 deletions src/plugins/intel_cpu/src/graph.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,6 @@
#include <algorithm>
#include <cstddef>
#include <cstdlib>
#include <iostream>
#include <iterator>
#include <limits>
#include <map>
Expand Down Expand Up @@ -71,7 +70,7 @@ template<typename NET>
void Graph::CreateGraph(NET &model, const GraphContext::CPtr context) {
OV_ITT_SCOPE(FIRST_INFERENCE, itt::domains::intel_cpu_LT, "CreateGraph");

Init(model, std::make_shared<GraphContext>(context->disableMemoryReuse()));
Init(model, context);

Activate();
}
Expand All @@ -83,7 +82,7 @@ void Graph::CreateGraph(const std::vector<NodePtr>& graphNodes,
if (IsReady())
ForgetGraphData();

m_context = std::make_shared<GraphContext>(context->disableMemoryReuse());
m_context = context;
m_stream = dnnl::stream(getEngine());

this->_name = std::move(name);
Expand Down Expand Up @@ -285,8 +284,6 @@ static std::tuple<std::vector<NodePtr>, std::vector<size_t>> ExtractExecutableNo
for (size_t i = 0; i < graphNodes.size(); i++) {
const auto& graphNode = graphNodes[i];
if ((!graphNode->isConstant() && CPU_DEBUG_CAPS_ALWAYS_TRUE(!graphNode->canBeSkipped())) || // non-constant executable or
// if ((!graphNode->isConstant()) || // non-constant executable or
// if ((!graphNode->isConstant() && !graphNode->canBeSkipped()) || // non-constant executable or
(graphNode->isDynamicNode() && !one_of(graphNode->getType(), Type::Input, Type::Output))) { // dynamic, except inputs / outputs
/* @todo
* Revise implementation.
Expand Down Expand Up @@ -360,20 +357,11 @@ static void UseExternalOutputMemory(const std::map<std::size_t, NodePtr>& output

void Graph::Activate(const std::vector<MemoryPtr>& externalInputMemory,
const std::vector<MemoryPtr>& externalOutputMemory) {
// OPENVINO_ASSERT(status == Status::Initialized, "Invalid graph status");

// const bool hasDynNodes = ProcessDynNodes();
// const auto syncNodesInds = hasDynNodes ? IdentifySyncPoints(graphNodes) : std::vector<size_t>{};
OPENVINO_ASSERT(status == Status::Initialized, "Invalid graph status");

UseExternalInputMemory(inputNodesMap, externalInputMemory);
UseExternalOutputMemory(outputNodesMap, externalOutputMemory);

// std::tie(m_executableGraphNodes, m_executableSyncNodesInds) = ExtractExecutableNodesAndSyncPoints(syncNodesInds, graphNodes);

// status = hasDynNodes ? (parallel_get_max_threads() > 1 ? Status::ReadyDynamic : Status::ReadyDynamicSeq)
// : Status::ReadyStatic;

// CPU_DEBUG_CAP_ENABLE(serialize(*this));
Allocate();

CreatePrimitivesAndExecConstants();
Expand Down Expand Up @@ -713,7 +701,7 @@ void Graph::ResolveComplexInplaceConflicts() {
}

/**
* Partition the \clusters of Edges, by moving and allocating at the same time
* Partition the \clusters of Edges, by moving to the end and allocating at the same time
* the clusters which cannot be handled as part of generic memory solver algorithm.
* Such clusters meet one of the following criteria:
* - base edge of a cluster is already Allocated
Expand Down Expand Up @@ -744,7 +732,6 @@ static size_t AllocateStringsAndConstants(EdgeClusters& clusters,
};

auto allocateConstantEdge = [context](const EdgePtr& edge) {
// std::cout << "Allocating constant edge: " << edge->name() << " wc: " << context->getWeightsCache() << "\n";
if (edge->getParent()->getType() == Type::Input) {
auto constNode = std::static_pointer_cast<node::Input>(edge->getParent());
edge->reuse(std::const_pointer_cast<IMemory>(constNode->getMemoryPtr()));
Expand All @@ -753,7 +740,7 @@ static size_t AllocateStringsAndConstants(EdgeClusters& clusters,
}
};

auto endOfNotAllocatedPartition =
auto notAllocatedPartitionEnd =
std::partition(clusters.begin(), clusters.end(),
[&allocateStringMemory, &allocateConstantEdge, &context](const EdgeCluster& cluster) {
if (cluster.empty()) return false;
Expand All @@ -764,15 +751,11 @@ static size_t AllocateStringsAndConstants(EdgeClusters& clusters,

OPENVINO_ASSERT(baseEdgeIt != cluster.end(), "Unexpected cluster state");

// const auto& baseEdge = cluster.front();
const auto& baseEdge = *baseEdgeIt;
// Skip already allocated cluster
if (baseEdge->getStatus() == Edge::Status::Allocated) {
return false;
}

// std::cout << "Processing string/const for base edge: " << baseEdge->name() << "\n";

// Skip if the baseEdge does not require allocation
if (baseEdge->getStatus() != Edge::Status::NeedAllocation) {
return true;
Expand Down Expand Up @@ -807,22 +790,17 @@ static size_t AllocateStringsAndConstants(EdgeClusters& clusters,
return true;
});

return std::distance(clusters.begin(), endOfNotAllocatedPartition);
return std::distance(clusters.begin(), notAllocatedPartitionEnd);
}

static void AllocateBaseEdges(const EdgeClusters& edgeClusters,
const MemoryControl::MemorySolution& memorySolution) {
// attach all the not yet allocated edges to the memory control
for (auto&& item : memorySolution) {
int count = 0;
// std::cout << "Processing cluster: " << item.first << "\n";
for (auto&& edge : edgeClusters[item.first]) {
// std::cout << "Processing base edge: " << edge->name() << "\n";
if (edge->getStatus() == Edge::Status::NeedAllocation) {
// std::cout << "Allocating edge: " << edge->name() << "\n";

edge->allocate(item.second);

// TODO: WA for some test (like strided_slice_test) which use tensors with
// shapes {0}. And it is implicitly converted into {1} tensor.
// Zeroing of input data allow pass tests.
Expand Down Expand Up @@ -852,7 +830,6 @@ static void AllocatedReferencingEdges(const EdgeClusters& clusters) {
}

std::for_each(edges_to_process.rbegin(), edges_to_process.rend(), [](const EdgePtr& edge) {
// std::cout << "Processing referencing edge: " << edge->name() << "\n";
if (edge->getStatus() == Edge::Status::NotAllocated) {
if (edge->inPlace(Edge::LOOK_DOWN)) {
edge->getChild()->resolveInPlaceEdges(Edge::LOOK_DOWN);
Expand All @@ -861,7 +838,6 @@ static void AllocatedReferencingEdges(const EdgeClusters& clusters) {
} else {
auto sharedEdge = edge->getSharedEdge();
auto sharedEdgeParent = sharedEdge->getParent();
// std::cout << "Allocating edge: " << edge->name() << " Using shared edge: " << sharedEdge->name() << "\n";
edge->allocate(sharedEdge->getMemoryPtr()->getMemoryBlock());
DEBUG_LOG(*edge, " sharedEdge with ", *sharedEdge);
}
Expand Down Expand Up @@ -899,23 +875,6 @@ std::vector<size_t> Graph::CreateExecutionGraph() {
return syncNodesInds;
}

static void ResolveInOutInPlaceEdgesLegacy(const std::vector<EdgePtr>& edges) {
for (const auto& edge : edges) {
// std::cout << edge->name() << "\n";
if (edge->getStatus() == Edge::Status::Uninitialized) {
if (edge->getParent()->getParentEdges().empty() &&
one_of(edge->getParent()->getType(), Type::Input, Type::MemoryInput) &&
edge->inPlace(Edge::LOOK_UP)) {
edge->getParent()->resolveInPlaceEdges(Edge::LOOK_UP);
} else if (edge->getChild()->getChildEdges().empty() &&
one_of(edge->getChild()->getType(), Type::Output, Type::MemoryOutput) &&
edge->inPlace(Edge::LOOK_DOWN)) {
edge->getChild()->resolveInPlaceEdges(Edge::LOOK_DOWN);
}
}
}
}

static void ResolveInOutInPlaceEdges(const std::vector<EdgePtr>& edges) {
for (const auto& edge : edges) {
if (edge->getStatus() == Edge::Status::Uninitialized) {
Expand All @@ -936,7 +895,6 @@ int Graph::RegisterToAllocationContext(int offset, AllocationContext& context) {
auto syncNodesInds = CreateExecutionGraph();

ResolveInOutInPlaceEdges(graphEdges);
// std::cout << "RegisterToAllocationContext: " << offset << "\n";

// nodes are expected to be topologically sorted
for (size_t execIndex = 0, j = 0; execIndex < graphNodes.size(); execIndex++) {
Expand All @@ -947,7 +905,6 @@ int Graph::RegisterToAllocationContext(int offset, AllocationContext& context) {
const auto outputExecIndex = offset;
offset++;
context.execIndex[node] = {inputExecIndex, outputExecIndex};
// std::cout << node->getName() << " - " << "[" << inputExecIndex << "," << outputExecIndex << "] offset " << offset << "\n";

if (j < syncNodesInds.size() && syncNodesInds[j] == execIndex) {
context.syncPoints.push_back(inputExecIndex);
Expand All @@ -960,28 +917,6 @@ int Graph::RegisterToAllocationContext(int offset, AllocationContext& context) {
return offset - 1;
}

AllocationContext Graph::CreateAllocationContext(bool global) {
AllocationContext allocationContext;

if (global) {
RegisterToAllocationContext(0, allocationContext);
} else { // local allocation context. Used for the nodes with inner graph which are not updated yet
ResolveInOutInPlaceEdgesLegacy(graphEdges);

auto syncNodesInds = CreateExecutionGraph();

for (size_t i = 0; i < graphNodes.size(); i++) {
const auto& node = graphNodes[i];
allocationContext.execIndex[node] = {i, i};
}

allocationContext.edges = graphEdges;
allocationContext.syncPoints = syncNodesInds;
}

return allocationContext;
}

static void InitEdgeStatus(const std::vector<EdgePtr>& edges) {
for (auto& edge : edges) edge->init();
}
Expand Down Expand Up @@ -1064,22 +999,14 @@ static MemoryRegions FormMemoryRegions(const EdgeClusters& clusters,
int64_t boxSize = 0;
bool isConst = false, isOutput = false, isInput = false;

// std::cout << "Form memory region for cluster: " << i << "\n";

for (auto &edge : clusters[i]) {
const auto& parent = edge->getParent();
const auto& child = edge->getChild();

// @todo this is can be considered as a property of the node, whether it is going to use input / output memory multiple times
// in scope of its execution routine
int e_start = parent->getType() == Type::TensorIterator ? globalExecIndex.at(parent).first : globalExecIndex.at(parent).second;
int e_finish = child->getType() == Type::TensorIterator ? globalExecIndex.at(child).second : globalExecIndex.at(child).first;

// std::cout << "[" << e_start << " - " << e_finish << "]"
// << edge->name()
// << "\n";

// int e_finish = edge->getChild()->getExecIndex();
// If node uses its input / output memory multiple times in scope of a single execution (i.e TensorIterator)
// prolong the lifetime of a memory region till execution is finished
int e_start = parent->usesInOutMemoryMultipleTimes() ? globalExecIndex.at(parent).first : globalExecIndex.at(parent).second;
int e_finish = child->usesInOutMemoryMultipleTimes() ? globalExecIndex.at(child).second : globalExecIndex.at(child).first;

auto&& desc = edge->getDesc();

Expand Down Expand Up @@ -1140,7 +1067,7 @@ static OutputMemoryBlocks FilterOutDynamicOutputEdges(MemoryRegions& memoryRegio
if (child->getType() == Type::Output && edge->getStatus() == Edge::Status::NeedAllocation) {
auto proxyMemBlock = std::make_shared<ProxyMemoryBlock>();
DEBUG_LOG("ProxyMemoryBlock ", proxyMemBlock);
// std::cout << "Allocating output edge: " << edge->name() << "\n";

edge->allocate(proxyMemBlock);

// Store the output memory blocks.
Expand Down Expand Up @@ -1172,7 +1099,7 @@ static OutputMemoryBlocks FilterOutDynamicOutputEdges(MemoryRegions& memoryRegio
* 2) OutputMemoryBlocks - to allow memory sharing between graph and infer request
*/
static std::tuple<MemoryControl::MemorySolution, EdgeClusters, OutputMemoryBlocks>
SolveMemoryReuse(MemoryControl* memoryControl,
SolveMemoryReuse(const std::shared_ptr<MemoryControl>& memoryControl,
const AllocationContext& allocationContext,
const GraphContext::CPtr graphContext,
const std::map<std::size_t, NodePtr>& outputNodesMap) {
Expand All @@ -1197,26 +1124,14 @@ SolveMemoryReuse(MemoryControl* memoryControl,
}

void Graph::Allocate() {
auto globalAllocation = m_context->memoryReuseGlobal();

if (std::getenv("LOCAL_REUSE")) {
globalAllocation = false;
}

// Set up the memory control subsystem.
auto memoryControl = globalAllocation ? m_context->getMemoryControl() : m_context->getNetworkMemoryControl()->createMemoryControlUnit();
auto memoryControl = m_context->getMemoryControl();

// memory is already allocated globally
if (memoryControl->allocated()) {
return;
return; // memory is already allocated globally
}

auto allocationContext = CreateAllocationContext(globalAllocation);

for (const auto& entry : allocationContext.execIndex) {
OPENVINO_ASSERT(entry.second.first >= 0);
OPENVINO_ASSERT(entry.second.second >= 0);
}
AllocationContext allocationContext;
RegisterToAllocationContext(0, allocationContext);

const auto& edges = allocationContext.edges;
InitEdgeStatus(edges);
Expand All @@ -1225,21 +1140,12 @@ void Graph::Allocate() {
EdgeClusters edgeClusters;
std::tie(solution, edgeClusters, m_outputNodesMemBlocks) = SolveMemoryReuse(memoryControl, allocationContext, m_context, outputNodesMap);

// std::cout << "### Global edges:" << "\n";
// for (const auto& edge : edges) {
// const auto& parent = edge->getParent();
// const auto& child = edge->getChild();
// std::cout << "[" << allocationContext.execIndex[parent].second << " - "
// << (child->getType() == Type::TensorIterator ? allocationContext.execIndex[child].second : allocationContext.execIndex[child].first) << "]"
// << edge->name()
// << "\n";
// }

AllocateBaseEdges(edgeClusters, solution);

memoryControl->allocateMemory();

AllocatedReferencingEdges(edgeClusters);

ValidateEdgeStatus(edges);
}

Expand Down
Loading

0 comments on commit d5bda07

Please sign in to comment.