From 120c7c11590171e0547e82d811bf2cf6d3cc1067 Mon Sep 17 00:00:00 2001 From: Anuya Welling Date: Wed, 9 Oct 2024 11:48:13 -0500 Subject: [PATCH 1/2] Adding profiling to sycl backend --- .../dynamic_selection_impl/sycl_backend.h | 59 ++++--------------- .../sycl/test_auto_tune_policy_sycl.pass.cpp | 39 +----------- .../test_dynamic_load_policy_sycl.pass.cpp | 59 +++++++++++++------ 3 files changed, 56 insertions(+), 101 deletions(-) diff --git a/include/oneapi/dpl/internal/dynamic_selection_impl/sycl_backend.h b/include/oneapi/dpl/internal/dynamic_selection_impl/sycl_backend.h index 52df9966946..af1a835d1ed 100644 --- a/include/oneapi/dpl/internal/dynamic_selection_impl/sycl_backend.h +++ b/include/oneapi/dpl/internal/dynamic_selection_impl/sycl_backend.h @@ -38,7 +38,6 @@ class sycl_backend using resource_container_t = std::vector; private: - static inline bool is_profiling_enabled = false; using report_clock_type = std::chrono::steady_clock; using report_duration = std::chrono::milliseconds; @@ -86,6 +85,9 @@ class sycl_backend std::chrono::nanoseconds(time_end - time_start))); } } + if constexpr (report_info_v){ + s->report(execution_info::task_completion); + } } bool @@ -159,17 +161,18 @@ class sycl_backend template sycl_backend(const NativeUniverseVector& v) { - bool profiling = true; global_rank_.reserve(v.size()); for (auto e : v) { - global_rank_.push_back(e); - if (!e.template has_property()) - { - profiling = false; + if(!e.get_device().has(sycl::aspect::ext_oneapi_queue_profiling_tag)){ + if (!e.template has_property()){ + auto prop_list = sycl::property_list{sycl::property::queue::enable_profiling()}; + auto e_tmp = sycl::queue{e.get_device(), prop_list}; + e = e_tmp; + } } + global_rank_.push_back(e); } - is_profiling_enabled = profiling; sgroup_ptr_ = std::make_unique(global_rank_); } @@ -188,34 +191,12 @@ class sycl_backend if constexpr (report_task_completion || report_task_time) { - const auto t0 = report_clock_type::now(); auto e1 = f(q, std::forward(args)...); async_waiter waiter{e1, std::make_shared(s)}; - if constexpr (report_task_time) - { - if (is_profiling_enabled) - async_waiter_list.add_waiter(new async_waiter(waiter)); - } + async_waiter_list.add_waiter(new async_waiter(waiter)); - if (report_task_time && !is_profiling_enabled || report_task_completion) - { - auto e2 = q.submit([=](sycl::handler& h) { - h.depends_on(e1); - h.host_task([=]() { - if constexpr (report_task_time) - { - if (!is_profiling_enabled) - s.report(execution_info::task_time, - std::chrono::duration_cast(report_clock_type::now() - t0)); - } - if constexpr (report_task_completion) - s.report(execution_info::task_completion); - }); - }); - waiter = async_waiter{e2, std::make_shared(s)}; - } return waiter; } @@ -237,10 +218,7 @@ class sycl_backend void lazy_report() { - if (is_profiling_enabled) - { async_waiter_list.lazy_report(); - } } private: @@ -250,21 +228,8 @@ class sycl_backend void initialize_default_resources() { - bool profiling = true; - auto prop_list = sycl::property_list{}; auto devices = sycl::device::get_devices(); - for (auto& x : devices) - { - if (!x.has(sycl::aspect::queue_profiling)) - { - profiling = false; - } - } - is_profiling_enabled = profiling; - if (is_profiling_enabled) - { - prop_list = sycl::property_list{sycl::property::queue::enable_profiling()}; - } + auto prop_list = sycl::property_list{sycl::property::queue::enable_profiling()}; for (auto& x : devices) { global_rank_.push_back(sycl::queue{x, prop_list}); diff --git a/test/parallel_api/dynamic_selection/sycl/test_auto_tune_policy_sycl.pass.cpp b/test/parallel_api/dynamic_selection/sycl/test_auto_tune_policy_sycl.pass.cpp index e710741b161..94c96fb1560 100644 --- a/test/parallel_api/dynamic_selection/sycl/test_auto_tune_policy_sycl.pass.cpp +++ b/test/parallel_api/dynamic_selection/sycl/test_auto_tune_policy_sycl.pass.cpp @@ -90,6 +90,7 @@ test_auto_submit_wait_on_event(UniverseContainer u, int best_resource) auto f = [&](typename oneapi::dpl::experimental::policy_traits::resource_type q) { if (i <= 2 * n_samples) { + // we should be round-robining through the resources if (q != u[(i - 1) % n_samples]) { @@ -470,14 +471,10 @@ test_auto_submit_and_wait(UniverseContainer u, int best_resource) } -template static inline void build_auto_tune_universe(std::vector& u) { - auto prop_list = sycl::property_list{}; - if(use_event_profiling){ - prop_list = sycl::property_list{sycl::property::queue::enable_profiling()}; - } + auto prop_list = sycl::property_list{sycl::property::queue::enable_profiling()}; try { @@ -532,12 +529,9 @@ main() #if !ONEDPL_FPGA_DEVICE || !ONEDPL_FPGA_EMULATOR using policy_t = oneapi::dpl::experimental::auto_tune_policy; std::vector u1; - std::vector u2; - constexpr bool use_event_profiling = true; build_auto_tune_universe(u1); - build_auto_tune_universe(u2); - if (u1.size() != 0 || u2.size() !=0 ) + if (u1.size() != 0) { auto f = [u1](int i) { if (i <= 8) @@ -576,33 +570,6 @@ main() actual = test_auto_submit_and_wait(u1, 1); actual = test_auto_submit_and_wait(u1, 2); actual = test_auto_submit_and_wait(u1, 3); - // Use event profiling - actual = test_auto_submit_wait_on_event(u2, 0); - actual = test_auto_submit_wait_on_event(u2, 1); - actual = test_auto_submit_wait_on_event(u2, 2); - actual = test_auto_submit_wait_on_event(u2, 3); - actual = test_auto_submit_wait_on_group(u2, 0); - actual = test_auto_submit_wait_on_group(u2, 1); - actual = test_auto_submit_wait_on_group(u2, 2); - actual = test_auto_submit_wait_on_group(u2, 3); - actual = test_auto_submit_and_wait(u2, 0); - actual = test_auto_submit_and_wait(u2, 1); - actual = test_auto_submit_and_wait(u2, 2); - actual = test_auto_submit_and_wait(u2, 3); - // now select then submits - actual = test_auto_submit_wait_on_event(u2, 0); - actual = test_auto_submit_wait_on_event(u2, 1); - actual = test_auto_submit_wait_on_event(u2, 2); - actual = test_auto_submit_wait_on_event(u2, 3); - actual = test_auto_submit_wait_on_group(u2, 0); - actual = test_auto_submit_wait_on_group(u2, 1); - actual = test_auto_submit_wait_on_group(u2, 2); - actual = test_auto_submit_wait_on_group(u2, 3); - actual = test_auto_submit_and_wait(u2, 0); - actual = test_auto_submit_and_wait(u2, 1); - actual = test_auto_submit_and_wait(u2, 2); - actual = test_auto_submit_and_wait(u2, 3); - bProcessed = true; } #endif // Devices available are CPU and GPU diff --git a/test/parallel_api/dynamic_selection/sycl/test_dynamic_load_policy_sycl.pass.cpp b/test/parallel_api/dynamic_selection/sycl/test_dynamic_load_policy_sycl.pass.cpp index b473892af19..40257fa4d97 100644 --- a/test/parallel_api/dynamic_selection/sycl/test_dynamic_load_policy_sycl.pass.cpp +++ b/test/parallel_api/dynamic_selection/sycl/test_dynamic_load_policy_sycl.pass.cpp @@ -13,15 +13,18 @@ #include #include "support/test_dynamic_load_utils.h" #include "support/utils.h" +#include +#include #if TEST_DYNAMIC_SELECTION_AVAILABLE static inline void build_dl_universe(std::vector& u) { + auto prop_list = sycl::property_list{sycl::property::queue::enable_profiling()}; try { auto device_cpu1 = sycl::device(sycl::cpu_selector_v); - sycl::queue cpu1_queue(device_cpu1); + sycl::queue cpu1_queue{device_cpu1, prop_list}; u.push_back(cpu1_queue); } catch (const sycl::exception&) @@ -31,8 +34,8 @@ build_dl_universe(std::vector& u) try { auto device_cpu2 = sycl::device(sycl::cpu_selector_v); - sycl::queue cpu2_queue(device_cpu2); - u.push_back(cpu2_queue); + sycl::queue cpu2_queue{device_cpu2, prop_list}; + u.push_back(cpu2_queue); } catch (const sycl::exception&) { @@ -41,6 +44,7 @@ build_dl_universe(std::vector& u) } #endif +static constexpr size_t N = 1024; int main() { @@ -49,36 +53,55 @@ main() #if TEST_DYNAMIC_SELECTION_AVAILABLE #if !ONEDPL_FPGA_DEVICE || !ONEDPL_FPGA_EMULATOR using policy_t = oneapi::dpl::experimental::dynamic_load_policy; - std::vector u; - build_dl_universe(u); - - auto n = u.size(); + std::vector u1; + build_dl_universe(u1); + auto n = u1.size(); //If building the universe is not a success, return if (n != 0) { // should be similar to round_robin when waiting on policy - auto f = [u, n](int i) { return u[i % u.size()]; }; + auto f = [u1, n](int i) { return u1[i % u1.size()]; }; - auto f2 = [u, n](int i) { return u[0]; }; + auto f2 = [u1, n](int i) { return u1[0]; }; // should always pick first when waiting on sync in each iteration constexpr bool just_call_submit = false; constexpr bool call_select_before_submit = true; - auto actual = test_dl_initialization(u); - actual = test_select(u, f2); - actual = test_submit_and_wait_on_event(u, f2); - actual = test_submit_and_wait_on_event(u, f2); - actual = test_submit_and_wait(u, f2); - actual = test_submit_and_wait(u, f2); - actual = test_submit_and_wait_on_group(u, f); - actual = test_submit_and_wait_on_group(u, f); + auto actual = test_dl_initialization(u1); + actual = test_select(u1, f2); + actual = test_submit_and_wait_on_event(u1, f2); + actual = test_submit_and_wait_on_event(u1, f2); + actual = test_submit_and_wait(u1, f2); + actual = test_submit_and_wait(u1, f2); + actual = test_submit_and_wait_on_group(u1, f); + actual = test_submit_and_wait_on_group(u1, f); bProcessed = true; } #endif // Devices available are CPU and GPU #endif // TEST_DYNAMIC_SELECTION_AVAILABLE - return TestUtils::done(bProcessed); + return TestUtils::done(bProcessed);/* + sycl::queue q {sycl::device(sycl::cpu_selector_v){sycl::device(sycl::cpu_selector_v)}; + + if (!q.get_device().has(sycl::aspect::ext_oneapi_queue_profiling_tag)) { + std::cout << "Cannot time kernels without enabling profiling on queue\n"; + // return; + } + + // commands submitted here are not timed + + sycl::event start = syclex::submit_profiling_tag(q); + //sycl::parallel_for(q, {N}, [=](auto i) {}); + //sycl::parallel_for(q, {N}, [=](auto i) {}); + sycl::event end = syclex::submit_profiling_tag(q); + + q.wait(); + + uint64_t elapsed = + end.get_profiling_info() - + start.get_profiling_info(); + std::cout << "Execution time: " << elapsed << " (nanoseconds)\n";*/ } From f523b4987f51a3d4a411abb4743f9b7bb421e2a1 Mon Sep 17 00:00:00 2001 From: Anuya Welling Date: Wed, 16 Oct 2024 16:06:35 -0500 Subject: [PATCH 2/2] Removed comments --- .../test_dynamic_load_policy_sycl.pass.cpp | 22 +------------------ 1 file changed, 1 insertion(+), 21 deletions(-) diff --git a/test/parallel_api/dynamic_selection/sycl/test_dynamic_load_policy_sycl.pass.cpp b/test/parallel_api/dynamic_selection/sycl/test_dynamic_load_policy_sycl.pass.cpp index 40257fa4d97..38d8a347698 100644 --- a/test/parallel_api/dynamic_selection/sycl/test_dynamic_load_policy_sycl.pass.cpp +++ b/test/parallel_api/dynamic_selection/sycl/test_dynamic_load_policy_sycl.pass.cpp @@ -83,25 +83,5 @@ main() #endif // Devices available are CPU and GPU #endif // TEST_DYNAMIC_SELECTION_AVAILABLE - return TestUtils::done(bProcessed);/* - sycl::queue q {sycl::device(sycl::cpu_selector_v){sycl::device(sycl::cpu_selector_v)}; - - if (!q.get_device().has(sycl::aspect::ext_oneapi_queue_profiling_tag)) { - std::cout << "Cannot time kernels without enabling profiling on queue\n"; - // return; - } - - // commands submitted here are not timed - - sycl::event start = syclex::submit_profiling_tag(q); - //sycl::parallel_for(q, {N}, [=](auto i) {}); - //sycl::parallel_for(q, {N}, [=](auto i) {}); - sycl::event end = syclex::submit_profiling_tag(q); - - q.wait(); - - uint64_t elapsed = - end.get_profiling_info() - - start.get_profiling_info(); - std::cout << "Execution time: " << elapsed << " (nanoseconds)\n";*/ + return TestUtils::done(bProcessed); }