Skip to content

Commit

Permalink
added profiling codes
Browse files Browse the repository at this point in the history
  • Loading branch information
yeonbok committed Apr 29, 2024
1 parent 1f5c819 commit 7f7c656
Show file tree
Hide file tree
Showing 3 changed files with 142 additions and 10 deletions.
13 changes: 13 additions & 0 deletions src/plugins/intel_gpu/include/intel_gpu/graph/network.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,19 @@ class ICompilationContext;
struct network {
public:
using ptr = std::shared_ptr<network>;
// Host-side profiling accumulators, in milliseconds. They are reset at the
// start of each network::execute_impl() iteration and summed across all
// primitives of that iteration; they are only written when the profiling
// switches (PRINT in network.cpp / PROFILE in primitive_inst.cpp) are
// enabled. Default-initialized to 0.0 so they never hold garbage when the
// switches are off.
double total_host_time = 0.0;                // end-to-end host time of one iteration
double total_update_shape = 0.0;             // primitive_inst::update_shape()
double total_update_impl = 0.0;              // primitive_inst::update_impl()
double total_realloc = 0.0;                  // primitive_inst::realloc_if_needed()
double total_exec_gpu = 0.0;                 // impl->execute() for GPU impls
double total_exec_cpu = 0.0;                 // impl->execute() for CPU impls
double total_runtime_skip_kv_cache = 0.0;    // do_runtime_in_place_kv_cache()
double total_runtime_skip_gather = 0.0;      // do_runtime_skip_gather()
double total_runtime_skip_strided_slice = 0.0;  // do_runtime_skip_strided_slice()
double total_runtime_skip_broadcast = 0.0;   // do_runtime_skip_broadcast()
double total_runtime_skip_reorder = 0.0;     // do_runtime_skip_reorder()
double total_runtime_skip_permute = 0.0;     // do_runtime_skip_permute()
double total_update_padding = 0.0;           // update_paddings()

explicit network(program::ptr program, const ExecutionConfig& config, stream::ptr stream, bool is_internal = false, bool is_primary_stream = true);
network(engine& engine,
Expand Down
55 changes: 50 additions & 5 deletions src/plugins/intel_gpu/src/graph/network.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,8 @@
namespace cldnn {

namespace {

// Debug-only profiling switches (set to 1 to enable):
//  - PRINT: dump the per-iteration host-time totals accumulated in the
//    network's total_* members at the end of execute_impl().
//  - PRINT_ITER: log the start of each network execution iteration.
#define PRINT 0
#define PRINT_ITER 0
#ifdef GPU_DEBUG_CONFIG
void dump_perf_data_raw(std::string dump_path, const std::list<std::shared_ptr<primitive_inst>>& exec_order) {
auto layouts_to_str = [](const std::vector<layout>& layouts) -> std::string {
Expand Down Expand Up @@ -919,14 +920,29 @@ std::map<primitive_id, network_output> network::execute(const std::vector<event:
return result;
}

using Time = std::chrono::high_resolution_clock;
void network::execute_impl(const std::vector<event::ptr>& events) {
OV_ITT_SCOPED_TASK(ov::intel_gpu::itt::domains::intel_gpu_plugin, "NetworkImpl::Execute");
int64_t curr_iter = -1;
GPU_DEBUG_GET_INSTANCE(debug_config);
#ifdef GPU_DEBUG_CONFIG
curr_iter = iteration;
#endif

#if PRINT
auto total_host_start = Time::now();
total_update_shape = 0;
total_update_impl = 0;
total_realloc = 0;
total_exec_gpu = 0;
total_exec_cpu = 0;
total_runtime_skip_kv_cache = 0;
total_runtime_skip_gather = 0;
total_runtime_skip_strided_slice = 0;
total_runtime_skip_broadcast = 0;
total_runtime_skip_reorder = 0;
total_runtime_skip_permute = 0;
total_update_padding = 0;
#endif
// Wait for previous execution completion
reset_execution(false);
GPU_DEBUG_IF(debug_config->dump_runtime_memory_pool > 0) {
Expand All @@ -936,7 +952,9 @@ void network::execute_impl(const std::vector<event::ptr>& events) {
GPU_DEBUG_TRACE << "============================================================================" << std::endl;
GPU_DEBUG_TRACE << "Start network execution (net_id : " << get_id() << ", iter :" << curr_iter << ")" << std::endl;
}

#if PRINT_ITER
std::cout << "Start network execution (net_id : " << get_id() << ", iter :" << curr_iter << ")" << std::endl;
#endif
std::vector<memory::ptr> in_out_mem;
auto is_surface_lock_check_needed = [&](const shared_mem_type& shared_mem_type) {
return shared_mem_type == shared_mem_type::shared_mem_vasurface ||
Expand Down Expand Up @@ -1236,10 +1254,37 @@ void network::execute_impl(const std::vector<event::ptr>& events) {
// provide proper event to execution. Flushing pipeline should prevent this kind of issues.
// In scenarios with a big number of very small networks it can provide performance drop.
get_stream().flush();

#if PRINT
using ms = std::chrono::duration<double, std::ratio<1, 1000>>;
std::chrono::duration<float> dur = Time::now() - total_host_start;
auto total_host_time = std::chrono::duration_cast<ms>(dur).count();
#endif
// Deallocate events from the previous iteration
_old_events.clear();

#if PRINT
std::cout << "====================================================================" << std::endl;
std::cout << " iter " << curr_iter << std::endl;
std::cout << "====================================================================" << std::endl;
std::cout << "iter " << curr_iter << " total_update_shape : " << total_update_shape << " ms" << std::endl;
std::cout << "iter " << curr_iter << " total_update_impl : " << total_update_impl << " ms" << std::endl;
std::cout << "iter " << curr_iter << " total_realloc : " << total_realloc << " ms" << std::endl;
std::cout << "iter " << curr_iter << " total_rt_skip_kv_cache : " << total_runtime_skip_kv_cache << " ms"
<< std::endl;
std::cout << "iter " << curr_iter << " total_rt_skip_gather : " << total_runtime_skip_gather << " ms"
<< std::endl;
std::cout << "iter " << curr_iter << " total_rt_skip_strided_slice : " << total_runtime_skip_strided_slice
<< " ms" << std::endl;
std::cout << "iter " << curr_iter << " total_rt_skip_broadcast : " << total_runtime_skip_broadcast << " ms"
<< std::endl;
std::cout << "iter " << curr_iter << " total_rt_skip_reorder : " << total_runtime_skip_reorder << " ms"
<< std::endl;
std::cout << "iter " << curr_iter << " total_rt_skip_permute : " << total_runtime_skip_permute << " ms"
<< std::endl;
std::cout << "iter " << curr_iter << " total_update_padding : " << total_update_padding << " ms" << std::endl;
std::cout << "iter " << curr_iter << " total execute time (GPU) : " << total_exec_gpu << " ms" << std::endl;
std::cout << "iter " << curr_iter << " total execute time (CPU) : " << total_exec_cpu << " ms" << std::endl;
std::cout << "iter " << curr_iter << " total_host_ime " << total_host_time << " ms" << std::endl;
#endif
GPU_DEBUG_IF(debug_config->dump_runtime_memory_pool > 0) {
get_memory_pool().dump(get_id());
}
Expand Down
84 changes: 79 additions & 5 deletions src/plugins/intel_gpu/src/graph/primitive_inst.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,44 @@
namespace cldnn {
namespace {

using Time = std::chrono::high_resolution_clock;
using ms = std::chrono::duration<double, std::ratio<1, 1000>>;

// Human-readable stage labels for the profiling printouts below.
static const std::string update_shape_stage_name = "update_shape";
static const std::string update_impl_stage_name = "update_impl";
static const std::string realloc_stage_name = "realloc";
static const std::string skip_reorder_stage_name = "skip_reorder";
static const std::string skip_gather_stage_name = "skip_gather";
static const std::string update_padding_stage_name = "update_padding";
static const std::string kv_cache_opt_stage_name = "kv_cache_opt";
static const std::string skip_strided_slice_stage_name = "skip_strided_slice";
static const std::string skip_broadcast_stage_name = "skip_broadcast";
static const std::string skip_permute_stage_name = "skip_permute";
static const std::string execute_cpu_stage_name = "execute_cpu";
static const std::string execute_gpu_stage_name = "execute_gpu";
// Set PROFILE to 1 to enable lightweight host-side stage timing.
#define PROFILE 0
#if PROFILE
// Starts a timer for `stage`. `auto` keeps the time_point type in sync with
// Time (high_resolution_clock); declaring std::chrono::steady_clock::time_point
// here would fail to compile wherever high_resolution_clock is not an alias
// of steady_clock.
#define PROFILE_START(stage) auto start_##stage = Time::now();
// Stops the timer for `stage` and accumulates the elapsed milliseconds into
// the given network member. `total_time` is an ordinary macro parameter, so
// plain substitution is used — `.##total_time` would be ill-formed token
// pasting (`.` followed by an identifier is not a valid pasted token).
#define PROFILE_END(stage, total_time) \
    std::chrono::duration<float> dur_##stage = Time::now() - start_##stage; \
    auto time_##stage = std::chrono::duration_cast<ms>(dur_##stage).count(); \
    get_network().total_time += time_##stage;
// std::cout << id() << ", " << stage##_stage_name << ", " << time_##stage << std::endl;

#define PROFILE_START_POINT(stage) auto start_##stage = Time::now();

// Like PROFILE_END, but prints the elapsed time immediately instead of
// accumulating it into a network member.
#define PROFILE_END_POINT(stage) \
    std::chrono::duration<float> dur_##stage = Time::now() - start_##stage; \
    auto time_##stage = std::chrono::duration_cast<ms>(dur_##stage).count(); \
    std::cout << id() << ", " << stage##_stage_name << ", " << time_##stage << std::endl;
#else
#define PROFILE_START(stage)
#define PROFILE_END(stage, total_time)
#define PROFILE_START_POINT(stage)
#define PROFILE_END_POINT(stage)
#endif
template <typename T>
bool is_optimized_output_user(const T user) {
if (user->can_be_optimized()) {
Expand Down Expand Up @@ -251,6 +289,7 @@ event::ptr primitive_inst::set_output_memory(memory::ptr mem_new, bool check, si
}

void primitive_inst::update_shape() {
PROFILE_START(update_shape)
OV_ITT_SCOPED_TASK(ov::intel_gpu::itt::domains::intel_gpu_plugin, openvino::itt::handle("update_shape: " + id()));
GPU_DEBUG_PROFILED_STAGE(instrumentation::pipeline_stage::shape_inference);
if (update_shape_done_by_other) {
Expand Down Expand Up @@ -308,13 +347,15 @@ void primitive_inst::update_shape() {
// We assume that tensor ranks are static, thus shape_of doesn't need to update anything even if input shape is dynamic
if (_node->is_type<shape_of>() && !input_shape_changed) {
reset_shape_change();
PROFILE_END(update_shape,total_update_shape);
return;
}

// if input shape is not changed, loop doesn't need to update anything.
// because actual output layout will be calculated after the end of body network execution.
if (_node->is_type<loop>() && !input_shape_changed) {
reset_shape_change();
PROFILE_END(update_shape,total_update_shape);
return;
}

Expand All @@ -330,6 +371,7 @@ void primitive_inst::update_shape() {
if (!subgraph_input_changed) {
GPU_DEBUG_TRACE_DETAIL << id() << ": skip shape_update, because it is in shape_of_subgraph and input shape is not changed\n";
reset_shape_change();
PROFILE_END(update_shape,total_update_shape);
return;
}
}
Expand All @@ -356,8 +398,10 @@ void primitive_inst::update_shape() {
input_shape_changed = true;
}

if (!_node->is_type<kv_cache>() && !input_shape_changed && !_node->generates_dynamic_output() && _impl_params->get_output_layout().is_static())
if (!_node->is_type<kv_cache>() && !input_shape_changed && !_node->generates_dynamic_output() && _impl_params->get_output_layout().is_static()) {
PROFILE_END(update_shape,total_update_shape);
return;
}

std::vector<event::ptr> dependencies_events;
auto queue_type = get_network().get_stream().get_queue_type();
Expand Down Expand Up @@ -449,6 +493,7 @@ void primitive_inst::update_shape() {
if (var_mem_size < _impl_params->get_output_layout(0).get_buffer_size().count())
set_shape_change();
}
PROFILE_END(update_shape,total_update_shape);
}

event::ptr primitive_inst::realloc_if_needed() {
Expand Down Expand Up @@ -1301,16 +1346,31 @@ event::ptr primitive_inst::execute(const std::vector<event::ptr>& events) {
// Check successor reorder if layouts are same
// Need to set can_be_optimized for user reorder at predecessor because
// if the user is can_be_optimized and output node then current nodes' output should be allocated to host.
PROFILE_START(skip_reorder)
do_runtime_skip_reorder();
PROFILE_END(skip_reorder, total_runtime_skip_reorder);

PROFILE_START(skip_gather)
do_runtime_skip_gather();
PROFILE_END(skip_gather, total_runtime_skip_gather);
PROFILE_START(update_padding)
update_paddings();
PROFILE_END(update_padding, total_update_padding);
PROFILE_START(kv_cache_opt)
do_runtime_in_place_kv_cache();
PROFILE_END(kv_cache_opt, total_runtime_skip_kv_cache);
PROFILE_START(skip_permute)
do_runtime_skip_permute();
PROFILE_END(skip_permute, total_runtime_skip_permute);
PROFILE_START(skip_strided_slice)
do_runtime_skip_strided_slice();
PROFILE_END(skip_strided_slice, total_runtime_skip_strided_slice);
PROFILE_START(skip_broadcast)
do_runtime_skip_broadcast();

PROFILE_END(skip_broadcast, total_runtime_skip_broadcast);
if (!is_valid_fusion()) {
OV_ITT_SCOPED_TASK(ov::intel_gpu::itt::domains::intel_gpu_plugin, openvino::itt::handle("unfused_subgraph_exec: " + id()));
OV_ITT_SCOPED_TASK(ov::intel_gpu::itt::domains::intel_gpu_plugin,
openvino::itt::handle("unfused_subgraph_exec: " + id()));
auto subgraph = get_unfused_subgraph();

for (auto& d : _deps) {
Expand Down Expand Up @@ -1341,12 +1401,17 @@ event::ptr primitive_inst::execute(const std::vector<event::ptr>& events) {
// Try update impl if current impl is dynamic because opt kernel may be added to impl cache through async compilation.
// Only try update weight and realloc when impl is updated.
if (shape_changed() || !_impl || (!shape_changed() && _impl->is_dynamic())) {
if (update_impl()) {
PROFILE_START(update_impl)
bool update_impl_res = update_impl();
PROFILE_END(update_impl,total_update_impl);
if (update_impl_res) {
need_args_update = true;
auto ev = update_weights();
if (ev)
dependencies.push_back(ev);
PROFILE_START(realloc)
auto ev_reset = realloc_if_needed();
PROFILE_END(realloc,total_realloc);
if (ev_reset)
dependencies.push_back(ev_reset);
}
Expand Down Expand Up @@ -1417,7 +1482,16 @@ event::ptr primitive_inst::execute(const std::vector<event::ptr>& events) {

{
GPU_DEBUG_PROFILED_STAGE(instrumentation::pipeline_stage::inference);
auto ev = _impl->execute(dependencies, *this);
cldnn::event::ptr ev = nullptr;
if (_impl->is_cpu()) {
PROFILE_START(execute_cpu)
ev = _impl->execute(dependencies, *this);
PROFILE_END(execute_cpu,total_exec_cpu);
} else {
PROFILE_START(execute_gpu)
ev = _impl->execute(dependencies, *this);
PROFILE_END(execute_gpu,total_exec_gpu);
}

GPU_DEBUG_IF(!debug_config->dump_profiling_data.empty()) {
get_network().get_stream().wait_for_events({ev});
Expand Down

0 comments on commit 7f7c656

Please sign in to comment.