Skip to content

Commit

Permalink
added profiling codes
Browse files Browse the repository at this point in the history
  • Loading branch information
yeonbok committed Apr 29, 2024
1 parent 1f5c819 commit 7f7c656
Show file tree
Hide file tree
Showing 3 changed files with 142 additions and 10 deletions.
13 changes: 13 additions & 0 deletions src/plugins/intel_gpu/include/intel_gpu/graph/network.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,19 @@ class ICompilationContext;
struct network {
public:
using ptr = std::shared_ptr<network>;
// Host-side profiling accumulators, in milliseconds. They are reset at the
// start of each network::execute_impl() iteration and summed across all
// primitives of that iteration; they are only written when the profiling
// switches (PRINT in network.cpp / PROFILE in primitive_inst.cpp) are
// enabled. Default-initialized to 0.0 so they never hold garbage when the
// switches are off.
double total_host_time = 0.0;                // end-to-end host time of one iteration
double total_update_shape = 0.0;             // primitive_inst::update_shape()
double total_update_impl = 0.0;              // primitive_inst::update_impl()
double total_realloc = 0.0;                  // primitive_inst::realloc_if_needed()
double total_exec_gpu = 0.0;                 // impl->execute() for GPU impls
double total_exec_cpu = 0.0;                 // impl->execute() for CPU impls
double total_runtime_skip_kv_cache = 0.0;    // do_runtime_in_place_kv_cache()
double total_runtime_skip_gather = 0.0;      // do_runtime_skip_gather()
double total_runtime_skip_strided_slice = 0.0;  // do_runtime_skip_strided_slice()
double total_runtime_skip_broadcast = 0.0;   // do_runtime_skip_broadcast()
double total_runtime_skip_reorder = 0.0;     // do_runtime_skip_reorder()
double total_runtime_skip_permute = 0.0;     // do_runtime_skip_permute()
double total_update_padding = 0.0;           // update_paddings()

explicit network(program::ptr program, const ExecutionConfig& config, stream::ptr stream, bool is_internal = false, bool is_primary_stream = true);
network(engine& engine,
Expand Down
55 changes: 50 additions & 5 deletions src/plugins/intel_gpu/src/graph/network.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,8 @@
namespace cldnn {

namespace {

// Debug-only profiling switches (set to 1 to enable):
//  - PRINT: dump the per-iteration host-time totals accumulated in the
//    network's total_* members at the end of execute_impl().
//  - PRINT_ITER: log the start of each network execution iteration.
#define PRINT 0
#define PRINT_ITER 0
#ifdef GPU_DEBUG_CONFIG
void dump_perf_data_raw(std::string dump_path, const std::list<std::shared_ptr<primitive_inst>>& exec_order) {
auto layouts_to_str = [](const std::vector<layout>& layouts) -> std::string {
Expand Down Expand Up @@ -919,14 +920,29 @@ std::map<primitive_id, network_output> network::execute(const std::vector<event:
return result;
}

using Time = std::chrono::high_resolution_clock;
void network::execute_impl(const std::vector<event::ptr>& events) {
OV_ITT_SCOPED_TASK(ov::intel_gpu::itt::domains::intel_gpu_plugin, "NetworkImpl::Execute");
int64_t curr_iter = -1;
GPU_DEBUG_GET_INSTANCE(debug_config);
#ifdef GPU_DEBUG_CONFIG
curr_iter = iteration;
#endif

#if PRINT
auto total_host_start = Time::now();
total_update_shape = 0;
total_update_impl = 0;
total_realloc = 0;
total_exec_gpu = 0;
total_exec_cpu = 0;
total_runtime_skip_kv_cache = 0;
total_runtime_skip_gather = 0;
total_runtime_skip_strided_slice = 0;
total_runtime_skip_broadcast = 0;
total_runtime_skip_reorder = 0;
total_runtime_skip_permute = 0;
total_update_padding = 0;
#endif
// Wait for previous execution completion
reset_execution(false);
GPU_DEBUG_IF(debug_config->dump_runtime_memory_pool > 0) {
Expand All @@ -936,7 +952,9 @@ void network::execute_impl(const std::vector<event::ptr>& events) {
GPU_DEBUG_TRACE << "============================================================================" << std::endl;
GPU_DEBUG_TRACE << "Start network execution (net_id : " << get_id() << ", iter :" << curr_iter << ")" << std::endl;
}

#if PRINT_ITER
std::cout << "Start network execution (net_id : " << get_id() << ", iter :" << curr_iter << ")" << std::endl;
#endif
std::vector<memory::ptr> in_out_mem;
auto is_surface_lock_check_needed = [&](const shared_mem_type& shared_mem_type) {
return shared_mem_type == shared_mem_type::shared_mem_vasurface ||
Expand Down Expand Up @@ -1236,10 +1254,37 @@ void network::execute_impl(const std::vector<event::ptr>& events) {
// provide proper event to execution. Flushing pipeline should prevent this kind of issues.
// In scenarios with a big number of very small networks it can provide performance drop.
get_stream().flush();

#if PRINT
using ms = std::chrono::duration<double, std::ratio<1, 1000>>;
std::chrono::duration<float> dur = Time::now() - total_host_start;
auto total_host_time = std::chrono::duration_cast<ms>(dur).count();
#endif
// Deallocate events from the previous iteration
_old_events.clear();

#if PRINT
std::cout << "====================================================================" << std::endl;
std::cout << " iter " << curr_iter << std::endl;
std::cout << "====================================================================" << std::endl;
std::cout << "iter " << curr_iter << " total_update_shape : " << total_update_shape << " ms" << std::endl;
std::cout << "iter " << curr_iter << " total_update_impl : " << total_update_impl << " ms" << std::endl;
std::cout << "iter " << curr_iter << " total_realloc : " << total_realloc << " ms" << std::endl;
std::cout << "iter " << curr_iter << " total_rt_skip_kv_cache : " << total_runtime_skip_kv_cache << " ms"
<< std::endl;
std::cout << "iter " << curr_iter << " total_rt_skip_gather : " << total_runtime_skip_gather << " ms"
<< std::endl;
std::cout << "iter " << curr_iter << " total_rt_skip_strided_slice : " << total_runtime_skip_strided_slice
<< " ms" << std::endl;
std::cout << "iter " << curr_iter << " total_rt_skip_broadcast : " << total_runtime_skip_broadcast << " ms"
<< std::endl;
std::cout << "iter " << curr_iter << " total_rt_skip_reorder : " << total_runtime_skip_reorder << " ms"
<< std::endl;
std::cout << "iter " << curr_iter << " total_rt_skip_permute : " << total_runtime_skip_permute << " ms"
<< std::endl;
std::cout << "iter " << curr_iter << " total_update_padding : " << total_update_padding << " ms" << std::endl;
std::cout << "iter " << curr_iter << " total execute time (GPU) : " << total_exec_gpu << " ms" << std::endl;
std::cout << "iter " << curr_iter << " total execute time (CPU) : " << total_exec_cpu << " ms" << std::endl;
std::cout << "iter " << curr_iter << " total_host_ime " << total_host_time << " ms" << std::endl;
#endif
GPU_DEBUG_IF(debug_config->dump_runtime_memory_pool > 0) {
get_memory_pool().dump(get_id());
}
Expand Down
84 changes: 79 additions & 5 deletions src/plugins/intel_gpu/src/graph/primitive_inst.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,44 @@
namespace cldnn {
namespace {

using Time = std::chrono::high_resolution_clock;
using ms = std::chrono::duration<double, std::ratio<1, 1000>>;

// Human-readable stage labels for the profiling printouts below.
static const std::string update_shape_stage_name = "update_shape";
static const std::string update_impl_stage_name = "update_impl";
static const std::string realloc_stage_name = "realloc";
static const std::string skip_reorder_stage_name = "skip_reorder";
static const std::string skip_gather_stage_name = "skip_gather";
static const std::string update_padding_stage_name = "update_padding";
static const std::string kv_cache_opt_stage_name = "kv_cache_opt";
static const std::string skip_strided_slice_stage_name = "skip_strided_slice";
static const std::string skip_broadcast_stage_name = "skip_broadcast";
static const std::string skip_permute_stage_name = "skip_permute";
static const std::string execute_cpu_stage_name = "execute_cpu";
static const std::string execute_gpu_stage_name = "execute_gpu";
// Set PROFILE to 1 to enable lightweight host-side stage timing.
#define PROFILE 0
#if PROFILE
// Starts a timer for `stage`. `auto` keeps the time_point type in sync with
// Time (high_resolution_clock); declaring std::chrono::steady_clock::time_point
// here would fail to compile wherever high_resolution_clock is not an alias
// of steady_clock.
#define PROFILE_START(stage) auto start_##stage = Time::now();
// Stops the timer for `stage` and accumulates the elapsed milliseconds into
// the given network member. `total_time` is an ordinary macro parameter, so
// plain substitution is used — `.##total_time` would be ill-formed token
// pasting (`.` followed by an identifier is not a valid pasted token).
#define PROFILE_END(stage, total_time) \
    std::chrono::duration<float> dur_##stage = Time::now() - start_##stage; \
    auto time_##stage = std::chrono::duration_cast<ms>(dur_##stage).count(); \
    get_network().total_time += time_##stage;
// std::cout << id() << ", " << stage##_stage_name << ", " << time_##stage << std::endl;

#define PROFILE_START_POINT(stage) auto start_##stage = Time::now();

// Like PROFILE_END, but prints the elapsed time immediately instead of
// accumulating it into a network member.
#define PROFILE_END_POINT(stage) \
    std::chrono::duration<float> dur_##stage = Time::now() - start_##stage; \
    auto time_##stage = std::chrono::duration_cast<ms>(dur_##stage).count(); \
    std::cout << id() << ", " << stage##_stage_name << ", " << time_##stage << std::endl;
#else
#define PROFILE_START(stage)
#define PROFILE_END(stage, total_time)
#define PROFILE_START_POINT(stage)
#define PROFILE_END_POINT(stage)
#endif
template <typename T>
bool is_optimized_output_user(const T user) {
if (user->can_be_optimized()) {
Expand Down Expand Up @@ -251,6 +289,7 @@ event::ptr primitive_inst::set_output_memory(memory::ptr mem_new, bool check, si
}

void primitive_inst::update_shape() {
PROFILE_START(update_shape)
OV_ITT_SCOPED_TASK(ov::intel_gpu::itt::domains::intel_gpu_plugin, openvino::itt::handle("update_shape: " + id()));
GPU_DEBUG_PROFILED_STAGE(instrumentation::pipeline_stage::shape_inference);
if (update_shape_done_by_other) {
Expand Down Expand Up @@ -308,13 +347,15 @@ void primitive_inst::update_shape() {
// We assume that tensor ranks are static, thus shape_of doesn't need to update anything even if input shape is dynamic
if (_node->is_type<shape_of>() && !input_shape_changed) {
reset_shape_change();
PROFILE_END(update_shape,total_update_shape);
return;
}

// if input shape is not changed, loop doesn't need to update anything.
// because actual output layout will be calculated after the end of body network execution.
if (_node->is_type<loop>() && !input_shape_changed) {
reset_shape_change();
PROFILE_END(update_shape,total_update_shape);
return;
}

Expand All @@ -330,6 +371,7 @@ void primitive_inst::update_shape() {
if (!subgraph_input_changed) {
GPU_DEBUG_TRACE_DETAIL << id() << ": skip shape_update, because it is in shape_of_subgraph and input shape is not changed\n";
reset_shape_change();
PROFILE_END(update_shape,total_update_shape);
return;
}
}
Expand All @@ -356,8 +398,10 @@ void primitive_inst::update_shape() {
input_shape_changed = true;
}

if (!_node->is_type<kv_cache>() && !input_shape_changed && !_node->generates_dynamic_output() && _impl_params->get_output_layout().is_static())
if (!_node->is_type<kv_cache>() && !input_shape_changed && !_node->generates_dynamic_output() && _impl_params->get_output_layout().is_static()) {
PROFILE_END(update_shape,total_update_shape);
return;
}

std::vector<event::ptr> dependencies_events;
auto queue_type = get_network().get_stream().get_queue_type();
Expand Down Expand Up @@ -449,6 +493,7 @@ void primitive_inst::update_shape() {
if (var_mem_size < _impl_params->get_output_layout(0).get_buffer_size().count())
set_shape_change();
}
PROFILE_END(update_shape,total_update_shape);
}

event::ptr primitive_inst::realloc_if_needed() {
Expand Down Expand Up @@ -1301,16 +1346,31 @@ event::ptr primitive_inst::execute(const std::vector<event::ptr>& events) {
// Check successor reorder if layouts are same
// Need to set can_be_optimized for user reorder at predecessor because
// if the user is can_be_optimized and output node then current nodes' output should be allocated to host.
PROFILE_START(skip_reorder)
do_runtime_skip_reorder();
PROFILE_END(skip_reorder, total_runtime_skip_reorder);

PROFILE_START(skip_gather)
do_runtime_skip_gather();
PROFILE_END(skip_gather, total_runtime_skip_gather);
PROFILE_START(update_padding)
update_paddings();
PROFILE_END(update_padding, total_update_padding);
PROFILE_START(kv_cache_opt)
do_runtime_in_place_kv_cache();
PROFILE_END(kv_cache_opt, total_runtime_skip_kv_cache);
PROFILE_START(skip_permute)
do_runtime_skip_permute();
PROFILE_END(skip_permute, total_runtime_skip_permute);
PROFILE_START(skip_strided_slice)
do_runtime_skip_strided_slice();
PROFILE_END(skip_strided_slice, total_runtime_skip_strided_slice);
PROFILE_START(skip_broadcast)
do_runtime_skip_broadcast();

PROFILE_END(skip_broadcast, total_runtime_skip_broadcast);
if (!is_valid_fusion()) {
OV_ITT_SCOPED_TASK(ov::intel_gpu::itt::domains::intel_gpu_plugin, openvino::itt::handle("unfused_subgraph_exec: " + id()));
OV_ITT_SCOPED_TASK(ov::intel_gpu::itt::domains::intel_gpu_plugin,
openvino::itt::handle("unfused_subgraph_exec: " + id()));
auto subgraph = get_unfused_subgraph();

for (auto& d : _deps) {
Expand Down Expand Up @@ -1341,12 +1401,17 @@ event::ptr primitive_inst::execute(const std::vector<event::ptr>& events) {
// Try update impl if current impl is dynamic because opt kernel may be added to impl cache through async compilation.
// Only try update weight and realloc when impl is updated.
if (shape_changed() || !_impl || (!shape_changed() && _impl->is_dynamic())) {
if (update_impl()) {
PROFILE_START(update_impl)
bool update_impl_res = update_impl();
PROFILE_END(update_impl,total_update_impl);
if (update_impl_res) {
need_args_update = true;
auto ev = update_weights();
if (ev)
dependencies.push_back(ev);
PROFILE_START(realloc)
auto ev_reset = realloc_if_needed();
PROFILE_END(realloc,total_realloc);
if (ev_reset)
dependencies.push_back(ev_reset);
}
Expand Down Expand Up @@ -1417,7 +1482,16 @@ event::ptr primitive_inst::execute(const std::vector<event::ptr>& events) {

{
GPU_DEBUG_PROFILED_STAGE(instrumentation::pipeline_stage::inference);
auto ev = _impl->execute(dependencies, *this);
cldnn::event::ptr ev = nullptr;
if (_impl->is_cpu()) {
PROFILE_START(execute_cpu)
ev = _impl->execute(dependencies, *this);
PROFILE_END(execute_cpu,total_exec_cpu);
} else {
PROFILE_START(execute_gpu)
ev = _impl->execute(dependencies, *this);
PROFILE_END(execute_gpu,total_exec_gpu);
}

GPU_DEBUG_IF(!debug_config->dump_profiling_data.empty()) {
get_network().get_stream().wait_for_events({ev});
Expand Down

0 comments on commit 7f7c656

Please sign in to comment.