Skip to content

Commit

Permalink
[Snippets][CPU] Created new Executors
Browse files Browse the repository at this point in the history
  • Loading branch information
a-sidorova committed Dec 30, 2024
1 parent 9bb9646 commit e1951cc
Show file tree
Hide file tree
Showing 8 changed files with 878 additions and 648 deletions.
82 changes: 82 additions & 0 deletions src/plugins/intel_cpu/src/nodes/executors/aarch64/subgraph.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,82 @@
// Copyright (C) 2018-2024 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#include "nodes/executors/aarch64/subgraph.hpp"

#include "snippets/op/subgraph.hpp"


namespace ov {
namespace intel_cpu {

// Constructs the aarch64 subgraph executor.
// Every argument is passed through unchanged to the SubgraphBaseExecutor constructor;
// afterwards the scratchpad buffer is obtained from `allocator`.
SubgraphExecutor::SubgraphExecutor(const std::shared_ptr<CPURuntimeConfig>& snippet_config,
const std::shared_ptr<SubgraphAttrs>& snippet_attrs,
const std::shared_ptr<SubgraphCodeGenerator>& snippet,
const std::vector<ptrdiff_t>& start_offset_in,
const std::vector<ptrdiff_t>& start_offset_out,
const BufferScratchpadAllocator& allocator,
const ov::intel_cpu::MultiCacheWeakPtr& kernel_cache)
: SubgraphBaseExecutor(snippet_config,
snippet_attrs,
snippet,
start_offset_in,
start_offset_out,
allocator,
kernel_cache) {
// The base constructor has already run here, so m_internal_buffer_size
// (threads * per-thread scratchpad size) is valid and can be used as the request size.
m_buffer_scratchpad = allocator(m_internal_buffer_size);
}

void SubgraphStaticExecutor::exec_impl(const std::vector<MemoryPtr>& inMemPtrs,
const std::vector<MemoryPtr>& outMemPtrs) {
const auto& callable = m_schedule->get_callable<kernel>();

auto initializer = [&](jit_snippets_call_args& call_args, size_t ithr) {
init_call_args(call_args, inMemPtrs, outMemPtrs, m_start_offset_in, m_start_offset_out, ithr);
update_scratchpad_ptr(call_args.buffer_scratchpad_ptr, ithr);
};
auto caller = [&](jit_snippets_call_args& call_args, const std::vector<size_t>& indexes, size_t ithr) {
callable(&call_args, indexes.data());
};

if (m_parallel_exec_domain.size() == rank6D) {
parallel_for6d(initializer, caller);
} else {
parallel_forNd(initializer, caller);
}
}

void SubgraphDynamicSpecializedExecutor::exec_impl(const std::vector<MemoryPtr>& inMemPtrs,
const std::vector<MemoryPtr>& outMemPtrs) {
const auto& callable = m_schedule->get_callable<dynamic_kernel>();

OPENVINO_ASSERT(m_data_offsets.size() == inMemPtrs.size() + outMemPtrs.size(), "Incorrect data offset count!");
OPENVINO_ASSERT(m_data_offsets.front().size() == m_parallel_exec_domain.size(),
"Data offsets with invalid ranks detected");

// Note: we need to reset KernelExecutorTable to the state that was recorded in the
// SubgraphDynamicSpecializedExecutor constructor because the table might've been used for other shapes
m_reset_exec_table_state();

std::vector<const uint8_t*> src_ptrs;
std::vector<uint8_t*> dst_ptrs;
init_original_ptrs(inMemPtrs, outMemPtrs, src_ptrs, dst_ptrs, m_start_offset_in, m_start_offset_out);

auto initializer = [&](jit_snippets_call_args& call_args, size_t ithr) {
init_call_args(call_args, ithr);
update_scratchpad_ptr(call_args.buffer_scratchpad_ptr, ithr);
};
auto caller = [&](jit_snippets_call_args& call_args, const std::vector<size_t>& indexes, size_t ithr) {
update_ptrs(call_args, src_ptrs, dst_ptrs, indexes);
callable(&call_args);
};

if (m_parallel_exec_domain.size() == rank6D) {
parallel_for6d(initializer, caller);
} else {
parallel_forNd(initializer, caller);
}
}

} // namespace intel_cpu
} // namespace ov
44 changes: 44 additions & 0 deletions src/plugins/intel_cpu/src/nodes/executors/aarch64/subgraph.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
// Copyright (C) 2018-2024 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#pragma once

#include "nodes/executors/subgraph.hpp"

namespace ov {
namespace intel_cpu {

// aarch64 flavour of the subgraph executor.
// Adds nothing to the public interface of SubgraphBaseExecutor except its own constructor.
class SubgraphExecutor : public SubgraphBaseExecutor {
public:
// Takes the runtime config, subgraph attributes, generated code holder, per-port start offsets
// for inputs and outputs, the scratchpad allocator callback and the kernel cache.
// All arguments are taken by const reference.
SubgraphExecutor(const std::shared_ptr<CPURuntimeConfig>& snippet_config,
const std::shared_ptr<SubgraphAttrs>& snippet_attrs,
const std::shared_ptr<SubgraphCodeGenerator>& snippet,
const std::vector<ptrdiff_t>& start_offset_in,
const std::vector<ptrdiff_t>& start_offset_out,
const BufferScratchpadAllocator& allocator,
const ov::intel_cpu::MultiCacheWeakPtr& kernel_cache);
};

// Executor that combines the aarch64 SubgraphExecutor with SubgraphStaticBaseExecutor
// and supplies the exec_impl override.
class SubgraphStaticExecutor : public SubgraphExecutor, public SubgraphStaticBaseExecutor {
public:
    // Passes snippet_config and the remaining arguments on to SubgraphExecutor;
    // SubgraphStaticBaseExecutor is default-constructed.
    // The pack is taken by const reference: SubgraphExecutor's constructor accepts all of its
    // parameters by const reference, so taking the pack by value would only add a copy of every
    // vector / shared_ptr / allocator argument on each construction.
    template <typename... Args>
    SubgraphStaticExecutor(const std::shared_ptr<CPURuntimeConfig>& snippet_config, const Args&... args)
        : SubgraphExecutor(snippet_config, args...),
          SubgraphStaticBaseExecutor() {}

    // Executes the compiled kernel for the given input and output memory objects.
    void exec_impl(const std::vector<MemoryPtr>& inMemPtrs, const std::vector<MemoryPtr>& outMemPtrs) override;
};

// Executor that combines the aarch64 SubgraphExecutor with SubgraphDynamicSpecializedBaseExecutor
// and supplies the exec_impl override.
class SubgraphDynamicSpecializedExecutor : public SubgraphExecutor, public SubgraphDynamicSpecializedBaseExecutor {
public:
    // Passes snippet_config and the remaining arguments on to SubgraphExecutor;
    // SubgraphDynamicSpecializedBaseExecutor is constructed from snippet_config alone.
    // The pack is taken by const reference: SubgraphExecutor's constructor accepts all of its
    // parameters by const reference, so taking the pack by value would only add a copy of every
    // vector / shared_ptr / allocator argument on each construction.
    template <typename... Args>
    SubgraphDynamicSpecializedExecutor(const std::shared_ptr<CPURuntimeConfig>& snippet_config, const Args&... args)
        : SubgraphExecutor(snippet_config, args...),
          SubgraphDynamicSpecializedBaseExecutor(snippet_config) {}

    // Executes the compiled kernel for the given input and output memory objects.
    void exec_impl(const std::vector<MemoryPtr>& inMemPtrs, const std::vector<MemoryPtr>& outMemPtrs) override;
};

} // namespace intel_cpu
} // namespace ov
142 changes: 142 additions & 0 deletions src/plugins/intel_cpu/src/nodes/executors/subgraph.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,142 @@
// Copyright (C) 2018-2024 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#include "nodes/executors/subgraph.hpp"
#if defined(OPENVINO_ARCH_ARM64)
# include "emitters/snippets/aarch64/cpu_generator.hpp"
#else
# include "emitters/snippets/x64/cpu_generator.hpp"
#endif
#include "openvino/core/parallel.hpp"

namespace ov {
namespace intel_cpu {

// Generates the snippet code for the given subgraph attributes and stores the resulting schedule.
// Both arguments must be non-null; this is asserted before use.
SubgraphCodeGenerator::SubgraphCodeGenerator(const std::shared_ptr<SubgraphAttrs>& snippet_attrs,
                                             const std::shared_ptr<CPURuntimeConfig>& config) {
    OPENVINO_ASSERT(snippet_attrs, "Subgraph attributes are empty!");
    OPENVINO_ASSERT(config, "Runtime Config is empty!");

    // Compile-time arguments: the I/O data offsets from the config plus the parallel domain
    // derived from the same config.
    jit_snippets_compile_args compile_args;
    compile_args.data_offsets = config->io_data_offsets;
    SubgraphBaseExecutor::init_parallel_domain(config, compile_args.exec_domain);

    // generate() receives the compile arguments as an untyped pointer.
    const void* const opaque_args = static_cast<const void*>(&compile_args);
    schedule = std::make_shared<ov::snippets::Schedule>(snippet_attrs->snippet->generate(opaque_args));
}

// Common executor setup: stores the schedule and start offsets, derives the parallel domain
// from the runtime config, and computes the work amount, thread count and total scratchpad size.
//
// snippet_config   runtime config; must be non-null (asserted).
// snippet_attrs    not used by this constructor.
// snippet          holder of the generated schedule; dereferenced without a null check.
// start_offset_in  start offsets of the inputs (copied into the executor).
// start_offset_out start offsets of the outputs (copied into the executor).
// allocator        not used by this constructor.
// kernel_cache     not used by this constructor.
SubgraphBaseExecutor::SubgraphBaseExecutor(const std::shared_ptr<CPURuntimeConfig>& snippet_config,
                                           const std::shared_ptr<SubgraphAttrs>& snippet_attrs,
                                           const std::shared_ptr<SubgraphCodeGenerator>& snippet,
                                           const std::vector<ptrdiff_t>& start_offset_in,
                                           const std::vector<ptrdiff_t>& start_offset_out,
                                           const BufferScratchpadAllocator& allocator,
                                           const ov::intel_cpu::MultiCacheWeakPtr& kernel_cache)
    : m_schedule(snippet->get()),
      m_start_offset_in(start_offset_in),
      m_start_offset_out(start_offset_out) {
    OPENVINO_ASSERT(m_schedule, "Schedule is empty!");
    OPENVINO_ASSERT(snippet_config, "Runtime Config is empty!");
    init_parallel_domain(snippet_config, m_parallel_exec_domain);

    m_tensor_rank = snippet_config->tensor_rank;
    // Total number of work items = product of all parallel domain dimensions.
    m_harness_work_amount = std::accumulate(m_parallel_exec_domain.cbegin(),
                                            m_parallel_exec_domain.cend(),
                                            size_t(1),
                                            std::multiplies<size_t>());
    // Never use more threads than there are work items. The comparison is done in size_t and only
    // the (already clamped) result is narrowed: casting the work amount to int first would
    // truncate values above INT_MAX and could yield a zero or negative thread count.
    const auto max_threads = static_cast<size_t>(parallel_get_max_threads());
    m_nthreads = static_cast<int>(std::min<size_t>(max_threads, m_harness_work_amount));

    m_buffer_scratchpad_size = snippet_config->buffer_scratchpad_size;
    OPENVINO_ASSERT(!ov::snippets::utils::is_dynamic_value(m_buffer_scratchpad_size),
                    "Undefined buffer scratchpad size!");
    // Each thread gets its own scratchpad region of m_buffer_scratchpad_size.
    m_internal_buffer_size = static_cast<size_t>(m_nthreads) * m_buffer_scratchpad_size;
}

// Builds the parallel domain from the master shape.
// `domain` ends up with `tensor_rank` entries: all ones, except that the first
// (master_shape.size() - tile_rank) dimensions of `master_shape` are written starting at
// position (tensor_rank - master_shape.size()).
// Note: the sizes are unsigned and are not validated here, so the arithmetic relies on
// tile_rank <= master_shape.size() <= tensor_rank.
void SubgraphBaseExecutor::init_parallel_domain(const std::vector<size_t>& master_shape,
                                                size_t tensor_rank,
                                                size_t tile_rank,
                                                std::vector<size_t>& domain) {
    // Reset to exactly tensor_rank ones, whatever the vector held before.
    domain.assign(tensor_rank, 1);

    const auto copied_dims = master_shape.size() - tile_rank;
    const auto leading_ones = tensor_rank - master_shape.size();

    const auto src_first = master_shape.cbegin();
    const auto src_last = src_first + copied_dims;
    std::copy(src_first, src_last, domain.begin() + leading_ones);
}

void SubgraphBaseExecutor::init_parallel_domain(const std::shared_ptr<CPURuntimeConfig>& snippet_config,
std::vector<size_t>& domain) {
init_parallel_domain(snippet_config->master_shape, snippet_config->tensor_rank, snippet_config->tile_rank, domain);
}
// Parallel loop used for a 6D execution domain.
// Each thread builds its own call arguments via `initializer`, takes its share of the total work
// amount, and invokes `caller` once per work item with the five leading domain indexes.
void SubgraphBaseExecutor::parallel_for6d(
    const std::function<void(jit_snippets_call_args&, size_t)>& initializer,
    const std::function<void(jit_snippets_call_args&, const std::vector<size_t>&, size_t)>& caller) {
    const auto& domain = m_parallel_exec_domain;

    parallel_nt_static(m_nthreads, [&](const int ithr, const int nthr) {
        jit_snippets_call_args thread_args;
        initializer(thread_args, ithr);

        // Range of work items assigned to this thread.
        size_t work_begin = 0;
        size_t work_end = 0;
        splitter(m_harness_work_amount, nthr, ithr, work_begin, work_end);

        // Only the first five dimensions are iterated here.
        std::vector<size_t> idx(5, size_t{0});
        // Position the multi-dimensional index at this thread's first work item.
        parallel_it_init(work_begin,
                         idx[0], domain[0],
                         idx[1], domain[1],
                         idx[2], domain[2],
                         idx[3], domain[3],
                         idx[4], domain[4]);

        for (size_t work_id = work_begin; work_id < work_end; ++work_id) {
            caller(thread_args, idx, ithr);
            // Advance the multi-dimensional index to the next work item.
            parallel_it_step(idx[0], domain[0],
                             idx[1], domain[1],
                             idx[2], domain[2],
                             idx[3], domain[3],
                             idx[4], domain[4]);
        }
    });
}

// Generic parallel loop for an execution domain of any rank.
// Each thread builds its own call arguments via `initializer`, takes its share of the total work
// amount, and invokes `caller` once per work item. The index vector has one entry fewer than the
// domain and is recomputed from the flat work index on every iteration.
void SubgraphBaseExecutor::parallel_forNd(
    const std::function<void(jit_snippets_call_args&, size_t)>& initializer,
    const std::function<void(jit_snippets_call_args&, const std::vector<size_t>&, size_t)>& caller) {
    const auto& domain = m_parallel_exec_domain;

    parallel_nt_static(m_nthreads, [&](const int ithr, const int nthr) {
        jit_snippets_call_args thread_args;
        initializer(thread_args, ithr);

        // Range of work items assigned to this thread.
        size_t work_begin = 0;
        size_t work_end = 0;
        splitter(m_harness_work_amount, nthr, ithr, work_begin, work_end);

        std::vector<size_t> idx(domain.size() - 1, 0);
        for (size_t work_id = work_begin; work_id < work_end; ++work_id) {
            // Decompose the flat work index into per-dimension indexes, last iterated dimension first.
            size_t remainder = work_id;
            for (size_t dim = idx.size(); dim-- > 0;) {
                idx[dim] = remainder % domain[dim];
                remainder /= domain[dim];
            }

            caller(thread_args, idx, ithr);
        }
    });
}

// Public entry point: forwards the input and output memory objects to exec_impl.
// `strm` is accepted but not used by this function.
void SubgraphBaseExecutor::execute(const dnnl::stream& strm,
const std::vector<MemoryPtr>& inMemPtrs,
const std::vector<MemoryPtr>& outMemPtrs) {
exec_impl(inMemPtrs, outMemPtrs);
}

} // namespace intel_cpu
} // namespace ov
Loading

0 comments on commit e1951cc

Please sign in to comment.