Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[Dynamic Selection] Adding profiling to sycl backend #1893

Open
wants to merge 2 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
59 changes: 12 additions & 47 deletions include/oneapi/dpl/internal/dynamic_selection_impl/sycl_backend.h
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,6 @@ class sycl_backend
using resource_container_t = std::vector<execution_resource_t>;

private:
static inline bool is_profiling_enabled = false;
using report_clock_type = std::chrono::steady_clock;
using report_duration = std::chrono::milliseconds;

Expand Down Expand Up @@ -86,6 +85,9 @@ class sycl_backend
std::chrono::nanoseconds(time_end - time_start)));
}
}
if constexpr (report_info_v<Selection, execution_info::task_completion_t>){
s->report(execution_info::task_completion);
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why is there no check for `s != nullptr` in this `if`?

}
}

bool
Expand Down Expand Up @@ -159,17 +161,18 @@ class sycl_backend
template <typename NativeUniverseVector>
sycl_backend(const NativeUniverseVector& v)
{
bool profiling = true;
global_rank_.reserve(v.size());
for (auto e : v)
{
global_rank_.push_back(e);
if (!e.template has_property<sycl::property::queue::enable_profiling>())
{
profiling = false;
if(!e.get_device().has(sycl::aspect::ext_oneapi_queue_profiling_tag)){
if (!e.template has_property<sycl::property::queue::enable_profiling>()){
auto prop_list = sycl::property_list{sycl::property::queue::enable_profiling()};
auto e_tmp = sycl::queue{e.get_device(), prop_list};
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should other properties also be copied from the original queue and then `enable_profiling` added on top of those?

e = e_tmp;
}
}
global_rank_.push_back(e);
}
is_profiling_enabled = profiling;
sgroup_ptr_ = std::make_unique<submission_group>(global_rank_);
}

Expand All @@ -188,34 +191,12 @@ class sycl_backend

if constexpr (report_task_completion || report_task_time)
{
const auto t0 = report_clock_type::now();

auto e1 = f(q, std::forward<Args>(args)...);
async_waiter<SelectionHandle> waiter{e1, std::make_shared<SelectionHandle>(s)};

if constexpr (report_task_time)
{
if (is_profiling_enabled)
async_waiter_list.add_waiter(new async_waiter(waiter));
}
async_waiter_list.add_waiter(new async_waiter(waiter));

if (report_task_time && !is_profiling_enabled || report_task_completion)
{
auto e2 = q.submit([=](sycl::handler& h) {
h.depends_on(e1);
h.host_task([=]() {
if constexpr (report_task_time)
{
if (!is_profiling_enabled)
s.report(execution_info::task_time,
std::chrono::duration_cast<report_duration>(report_clock_type::now() - t0));
}
if constexpr (report_task_completion)
s.report(execution_info::task_completion);
});
});
waiter = async_waiter{e2, std::make_shared<SelectionHandle>(s)};
}
return waiter;
}

Expand All @@ -237,10 +218,7 @@ class sycl_backend
void
lazy_report()
{
if (is_profiling_enabled)
{
async_waiter_list.lazy_report();
}
}

private:
Expand All @@ -250,21 +228,8 @@ class sycl_backend
void
initialize_default_resources()
{
bool profiling = true;
auto prop_list = sycl::property_list{};
auto devices = sycl::device::get_devices();
for (auto& x : devices)
{
if (!x.has(sycl::aspect::queue_profiling))
{
profiling = false;
}
}
is_profiling_enabled = profiling;
if (is_profiling_enabled)
{
prop_list = sycl::property_list{sycl::property::queue::enable_profiling()};
}
auto prop_list = sycl::property_list{sycl::property::queue::enable_profiling()};
for (auto& x : devices)
{
global_rank_.push_back(sycl::queue{x, prop_list});
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -90,6 +90,7 @@ test_auto_submit_wait_on_event(UniverseContainer u, int best_resource)
auto f = [&](typename oneapi::dpl::experimental::policy_traits<Policy>::resource_type q) {
if (i <= 2 * n_samples)
{

// we should be round-robining through the resources
if (q != u[(i - 1) % n_samples])
{
Expand Down Expand Up @@ -470,14 +471,10 @@ test_auto_submit_and_wait(UniverseContainer u, int best_resource)
}


template<bool use_event_profiling=false>
static inline void
build_auto_tune_universe(std::vector<sycl::queue>& u)
{
auto prop_list = sycl::property_list{};
if(use_event_profiling){
prop_list = sycl::property_list{sycl::property::queue::enable_profiling()};
}
auto prop_list = sycl::property_list{sycl::property::queue::enable_profiling()};

try
{
Expand Down Expand Up @@ -532,12 +529,9 @@ main()
#if !ONEDPL_FPGA_DEVICE || !ONEDPL_FPGA_EMULATOR
using policy_t = oneapi::dpl::experimental::auto_tune_policy<oneapi::dpl::experimental::sycl_backend>;
std::vector<sycl::queue> u1;
std::vector<sycl::queue> u2;
constexpr bool use_event_profiling = true;
build_auto_tune_universe(u1);
build_auto_tune_universe<use_event_profiling>(u2);

if (u1.size() != 0 || u2.size() !=0 )
if (u1.size() != 0)
{
auto f = [u1](int i) {
if (i <= 8)
Expand Down Expand Up @@ -576,33 +570,6 @@ main()
actual = test_auto_submit_and_wait<call_select_before_submit, policy_t>(u1, 1);
actual = test_auto_submit_and_wait<call_select_before_submit, policy_t>(u1, 2);
actual = test_auto_submit_and_wait<call_select_before_submit, policy_t>(u1, 3);
// Use event profiling
actual = test_auto_submit_wait_on_event<just_call_submit, policy_t>(u2, 0);
actual = test_auto_submit_wait_on_event<just_call_submit, policy_t>(u2, 1);
actual = test_auto_submit_wait_on_event<just_call_submit, policy_t>(u2, 2);
actual = test_auto_submit_wait_on_event<just_call_submit, policy_t>(u2, 3);
actual = test_auto_submit_wait_on_group<just_call_submit, policy_t>(u2, 0);
actual = test_auto_submit_wait_on_group<just_call_submit, policy_t>(u2, 1);
actual = test_auto_submit_wait_on_group<just_call_submit, policy_t>(u2, 2);
actual = test_auto_submit_wait_on_group<just_call_submit, policy_t>(u2, 3);
actual = test_auto_submit_and_wait<just_call_submit, policy_t>(u2, 0);
actual = test_auto_submit_and_wait<just_call_submit, policy_t>(u2, 1);
actual = test_auto_submit_and_wait<just_call_submit, policy_t>(u2, 2);
actual = test_auto_submit_and_wait<just_call_submit, policy_t>(u2, 3);
// now select then submits
actual = test_auto_submit_wait_on_event<call_select_before_submit, policy_t>(u2, 0);
actual = test_auto_submit_wait_on_event<call_select_before_submit, policy_t>(u2, 1);
actual = test_auto_submit_wait_on_event<call_select_before_submit, policy_t>(u2, 2);
actual = test_auto_submit_wait_on_event<call_select_before_submit, policy_t>(u2, 3);
actual = test_auto_submit_wait_on_group<call_select_before_submit, policy_t>(u2, 0);
actual = test_auto_submit_wait_on_group<call_select_before_submit, policy_t>(u2, 1);
actual = test_auto_submit_wait_on_group<call_select_before_submit, policy_t>(u2, 2);
actual = test_auto_submit_wait_on_group<call_select_before_submit, policy_t>(u2, 3);
actual = test_auto_submit_and_wait<call_select_before_submit, policy_t>(u2, 0);
actual = test_auto_submit_and_wait<call_select_before_submit, policy_t>(u2, 1);
actual = test_auto_submit_and_wait<call_select_before_submit, policy_t>(u2, 2);
actual = test_auto_submit_and_wait<call_select_before_submit, policy_t>(u2, 3);

bProcessed = true;
}
#endif // Devices available are CPU and GPU
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -13,15 +13,18 @@
#include <iostream>
#include "support/test_dynamic_load_utils.h"
#include "support/utils.h"
#include <sycl/sycl.hpp>
#include <sycl/aspects.hpp>
#if TEST_DYNAMIC_SELECTION_AVAILABLE

static inline void
build_dl_universe(std::vector<sycl::queue>& u)
{
auto prop_list = sycl::property_list{sycl::property::queue::enable_profiling()};
try
{
auto device_cpu1 = sycl::device(sycl::cpu_selector_v);
sycl::queue cpu1_queue(device_cpu1);
sycl::queue cpu1_queue{device_cpu1, prop_list};
u.push_back(cpu1_queue);
}
catch (const sycl::exception&)
Expand All @@ -31,8 +34,8 @@ build_dl_universe(std::vector<sycl::queue>& u)
try
{
auto device_cpu2 = sycl::device(sycl::cpu_selector_v);
sycl::queue cpu2_queue(device_cpu2);
u.push_back(cpu2_queue);
sycl::queue cpu2_queue{device_cpu2, prop_list};
u.push_back(cpu2_queue);
}
catch (const sycl::exception&)
{
Expand All @@ -41,6 +44,7 @@ build_dl_universe(std::vector<sycl::queue>& u)
}
#endif

static constexpr size_t N = 1024;
int
main()
{
Expand All @@ -49,31 +53,30 @@ main()
#if TEST_DYNAMIC_SELECTION_AVAILABLE
#if !ONEDPL_FPGA_DEVICE || !ONEDPL_FPGA_EMULATOR
using policy_t = oneapi::dpl::experimental::dynamic_load_policy<oneapi::dpl::experimental::sycl_backend>;
std::vector<sycl::queue> u;
build_dl_universe(u);

auto n = u.size();
std::vector<sycl::queue> u1;
build_dl_universe(u1);

auto n = u1.size();
//If building the universe is not a success, return
if (n != 0)
{
// should be similar to round_robin when waiting on policy
auto f = [u, n](int i) { return u[i % u.size()]; };
auto f = [u1, n](int i) { return u1[i % u1.size()]; };

auto f2 = [u, n](int i) { return u[0]; };
auto f2 = [u1, n](int i) { return u1[0]; };
// should always pick first when waiting on sync in each iteration

constexpr bool just_call_submit = false;
constexpr bool call_select_before_submit = true;

auto actual = test_dl_initialization(u);
actual = test_select<policy_t, decltype(u), decltype(f2)&, false>(u, f2);
actual = test_submit_and_wait_on_event<just_call_submit, policy_t>(u, f2);
actual = test_submit_and_wait_on_event<call_select_before_submit, policy_t>(u, f2);
actual = test_submit_and_wait<just_call_submit, policy_t>(u, f2);
actual = test_submit_and_wait<call_select_before_submit, policy_t>(u, f2);
actual = test_submit_and_wait_on_group<just_call_submit, policy_t>(u, f);
actual = test_submit_and_wait_on_group<call_select_before_submit, policy_t>(u, f);
auto actual = test_dl_initialization(u1);
actual = test_select<policy_t, decltype(u1), decltype(f2)&, false>(u1, f2);
actual = test_submit_and_wait_on_event<just_call_submit, policy_t>(u1, f2);
actual = test_submit_and_wait_on_event<call_select_before_submit, policy_t>(u1, f2);
actual = test_submit_and_wait<just_call_submit, policy_t>(u1, f2);
actual = test_submit_and_wait<call_select_before_submit, policy_t>(u1, f2);
actual = test_submit_and_wait_on_group<just_call_submit, policy_t>(u1, f);
actual = test_submit_and_wait_on_group<call_select_before_submit, policy_t>(u1, f);

bProcessed = true;
}
Expand Down
Loading