Skip to content

Commit f5092ca

Browse files
authored
[SYCL] Handler-less kernel submit path (single_task) (#20349)
Extend the handler-less kernel submission path to support the single_task functions.
1 parent f444525 commit f5092ca

File tree

7 files changed

+182
-39
lines changed

7 files changed

+182
-39
lines changed

sycl/include/sycl/ext/oneapi/experimental/enqueue_functions.hpp

Lines changed: 17 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -152,9 +152,21 @@ template <typename KernelName = sycl::detail::auto_name, typename KernelType>
152152
void single_task(queue Q, const KernelType &KernelObj,
153153
const sycl::detail::code_location &CodeLoc =
154154
sycl::detail::code_location::current()) {
155-
submit(
156-
std::move(Q),
157-
[&](handler &CGH) { single_task<KernelName>(CGH, KernelObj); }, CodeLoc);
155+
// TODO The handler-less path does not support kernel function properties
156+
// and kernel functions with the kernel_handler type argument yet.
157+
if constexpr (!(ext::oneapi::experimental::detail::
158+
HasKernelPropertiesGetMethod<
159+
const KernelType &>::value) &&
160+
!(detail::KernelLambdaHasKernelHandlerArgT<KernelType,
161+
void>::value)) {
162+
detail::submit_kernel_direct_single_task<KernelName>(
163+
std::move(Q), empty_properties_t{}, KernelObj, CodeLoc);
164+
} else {
165+
submit(
166+
std::move(Q),
167+
[&](handler &CGH) { single_task<KernelName>(CGH, KernelObj); },
168+
CodeLoc);
169+
}
158170
}
159171

160172
template <typename... ArgsT>
@@ -268,8 +280,8 @@ void nd_launch(queue Q, nd_range<Dimensions> Range, const KernelType &KernelObj,
268280
const KernelType &>::value) &&
269281
!(detail::KernelLambdaHasKernelHandlerArgT<
270282
KernelType, sycl::nd_item<Dimensions>>::value)) {
271-
detail::submit_kernel_direct<KernelName>(std::move(Q), empty_properties_t{},
272-
Range, KernelObj);
283+
detail::submit_kernel_direct_parallel_for<KernelName>(
284+
std::move(Q), empty_properties_t{}, Range, KernelObj);
273285
} else {
274286
submit(std::move(Q), [&](handler &CGH) {
275287
nd_launch<KernelName>(CGH, Range, KernelObj,

sycl/include/sycl/khr/free_function_commands.hpp

Lines changed: 19 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -164,7 +164,7 @@ void launch_grouped(const queue &q, range<1> r, range<1> size, KernelType &&k,
164164
const KernelType &>::value) &&
165165
!(detail::KernelLambdaHasKernelHandlerArgT<
166166
KernelType, sycl::nd_item<1>>::value)) {
167-
detail::submit_kernel_direct(
167+
detail::submit_kernel_direct_parallel_for(
168168
q, ext::oneapi::experimental::empty_properties_t{},
169169
nd_range<1>(r, size), std::forward<KernelType>(k));
170170
} else {
@@ -185,7 +185,7 @@ void launch_grouped(const queue &q, range<2> r, range<2> size, KernelType &&k,
185185
const KernelType &>::value) &&
186186
!(detail::KernelLambdaHasKernelHandlerArgT<
187187
KernelType, sycl::nd_item<2>>::value)) {
188-
detail::submit_kernel_direct(
188+
detail::submit_kernel_direct_parallel_for(
189189
q, ext::oneapi::experimental::empty_properties_t{},
190190
nd_range<2>(r, size), std::forward<KernelType>(k));
191191
} else {
@@ -206,7 +206,7 @@ void launch_grouped(const queue &q, range<3> r, range<3> size, KernelType &&k,
206206
const KernelType &>::value) &&
207207
!(detail::KernelLambdaHasKernelHandlerArgT<
208208
KernelType, sycl::nd_item<3>>::value)) {
209-
detail::submit_kernel_direct(
209+
detail::submit_kernel_direct_parallel_for(
210210
q, ext::oneapi::experimental::empty_properties_t{},
211211
nd_range<3>(r, size), std::forward<KernelType>(k));
212212
} else {
@@ -319,11 +319,24 @@ void launch_task(handler &h, const KernelType &k) {
319319
h.single_task(k);
320320
}
321321

322-
template <typename KernelType>
323-
void launch_task(const sycl::queue &q, const KernelType &k,
322+
template <typename KernelType, typename = typename std::enable_if_t<
323+
enable_kernel_function_overload<KernelType>>>
324+
void launch_task(const sycl::queue &q, KernelType &&k,
324325
const sycl::detail::code_location &codeLoc =
325326
sycl::detail::code_location::current()) {
326-
submit(q, [&](handler &h) { launch_task<KernelType>(h, k); }, codeLoc);
327+
// TODO The handler-less path does not support kernel function properties
328+
// and kernel functions with the kernel_handler type argument yet.
329+
if constexpr (!(ext::oneapi::experimental::detail::
330+
HasKernelPropertiesGetMethod<
331+
const KernelType &>::value) &&
332+
!(detail::KernelLambdaHasKernelHandlerArgT<KernelType,
333+
void>::value)) {
334+
detail::submit_kernel_direct_single_task(
335+
q, ext::oneapi::experimental::empty_properties_t{},
336+
std::forward<KernelType>(k), codeLoc);
337+
} else {
338+
submit(q, [&](handler &h) { launch_task<KernelType>(h, k); }, codeLoc);
339+
}
327340
}
328341

329342
template <typename... Args>

sycl/include/sycl/queue.hpp

Lines changed: 88 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -157,14 +157,14 @@ class __SYCL_EXPORT SubmissionInfo {
157157

158158
} // namespace v1
159159

160-
template <typename KernelName = detail::auto_name, bool EventNeeded = false,
160+
template <detail::WrapAs WrapAs, typename LambdaArgType,
161+
typename KernelName = detail::auto_name, bool EventNeeded = false,
161162
typename PropertiesT, typename KernelTypeUniversalRef, int Dims>
162163
auto submit_kernel_direct(
163-
const queue &Queue, PropertiesT Props, const nd_range<Dims> &Range,
164-
KernelTypeUniversalRef &&KernelFunc,
164+
const queue &Queue, [[maybe_unused]] PropertiesT Props,
165+
const nd_range<Dims> &Range, KernelTypeUniversalRef &&KernelFunc,
165166
const detail::code_location &CodeLoc = detail::code_location::current()) {
166167
// TODO Properties not supported yet
167-
(void)Props;
168168
static_assert(
169169
std::is_same_v<PropertiesT,
170170
ext::oneapi::experimental::empty_properties_t>,
@@ -176,34 +176,39 @@ auto submit_kernel_direct(
176176

177177
using NameT =
178178
typename detail::get_kernel_name_t<KernelName, KernelType>::name;
179-
using LambdaArgType =
180-
sycl::detail::lambda_arg_type<KernelType, nd_item<Dims>>;
181-
static_assert(
182-
std::is_convertible_v<sycl::nd_item<Dims>, LambdaArgType>,
183-
"Kernel argument of a sycl::parallel_for with sycl::nd_range "
184-
"must be either sycl::nd_item or be convertible from sycl::nd_item");
185-
using TransformedArgType = sycl::nd_item<Dims>;
186-
187-
#ifndef __SYCL_DEVICE_ONLY__
188-
detail::checkValueRange<Dims>(Range);
189-
#endif
190179

191-
detail::KernelWrapper<detail::WrapAs::parallel_for, NameT, KernelType,
192-
TransformedArgType, PropertiesT>::wrap(KernelFunc);
180+
detail::KernelWrapper<WrapAs, NameT, KernelType, LambdaArgType,
181+
PropertiesT>::wrap(KernelFunc);
193182

194-
HostKernelRef<KernelType, KernelTypeUniversalRef, TransformedArgType, Dims>
183+
HostKernelRef<KernelType, KernelTypeUniversalRef, LambdaArgType, Dims>
195184
HostKernel(std::forward<KernelTypeUniversalRef>(KernelFunc));
196185

197186
// Instantiating the kernel on the host improves debugging.
198187
// Passing this pointer to another translation unit prevents optimization.
199188
#ifndef NDEBUG
200-
// TODO: call library to prevent dropping call due to optimization
189+
// TODO: call library to prevent dropping call due to optimization.
201190
(void)
202191
detail::GetInstantiateKernelOnHostPtr<KernelType, LambdaArgType, Dims>();
203192
#endif
204193

205194
detail::DeviceKernelInfo *DeviceKernelInfoPtr =
206195
&detail::getDeviceKernelInfo<NameT>();
196+
constexpr auto Info = detail::CompileTimeKernelInfo<NameT>;
197+
198+
assert(Info.Name != std::string_view{} && "Kernel must have a name!");
199+
200+
static_assert(
201+
Info.Name == std::string_view{} || sizeof(KernelType) == Info.KernelSize,
202+
"Unexpected kernel lambda size. This can be caused by an "
203+
"external host compiler producing a lambda with an "
204+
"unexpected layout. This is a limitation of the compiler."
205+
"In many cases the difference is related to capturing constexpr "
206+
"variables. In such cases removing constexpr specifier aligns the "
207+
"captures between the host compiler and the device compiler."
208+
"\n"
209+
"In case of MSVC, passing "
210+
"-fsycl-host-compiler-options='/std:c++latest' "
211+
"might also help.");
207212

208213
if constexpr (EventNeeded) {
209214
return submit_kernel_direct_with_event_impl(
@@ -216,6 +221,48 @@ auto submit_kernel_direct(
216221
}
217222
}
218223

224+
template <typename KernelName = detail::auto_name, bool EventNeeded = false,
225+
typename PropertiesT, typename KernelTypeUniversalRef, int Dims>
226+
auto submit_kernel_direct_parallel_for(
227+
const queue &Queue, PropertiesT Props, const nd_range<Dims> &Range,
228+
KernelTypeUniversalRef &&KernelFunc,
229+
const detail::code_location &CodeLoc = detail::code_location::current()) {
230+
231+
using KernelType =
232+
std::remove_const_t<std::remove_reference_t<KernelTypeUniversalRef>>;
233+
234+
using LambdaArgType =
235+
sycl::detail::lambda_arg_type<KernelType, nd_item<Dims>>;
236+
static_assert(
237+
std::is_convertible_v<sycl::nd_item<Dims>, LambdaArgType>,
238+
"Kernel argument of a sycl::parallel_for with sycl::nd_range "
239+
"must be either sycl::nd_item or be convertible from sycl::nd_item");
240+
using TransformedArgType = sycl::nd_item<Dims>;
241+
242+
#ifndef __SYCL_DEVICE_ONLY__
243+
detail::checkValueRange<Dims>(Range);
244+
#endif
245+
246+
return submit_kernel_direct<detail::WrapAs::parallel_for, TransformedArgType,
247+
KernelName, EventNeeded, PropertiesT,
248+
KernelTypeUniversalRef, Dims>(
249+
Queue, Props, Range, std::forward<KernelTypeUniversalRef>(KernelFunc),
250+
CodeLoc);
251+
}
252+
253+
template <typename KernelName = detail::auto_name, bool EventNeeded = false,
254+
typename PropertiesT, typename KernelTypeUniversalRef>
255+
auto submit_kernel_direct_single_task(
256+
const queue &Queue, PropertiesT Props, KernelTypeUniversalRef &&KernelFunc,
257+
const detail::code_location &CodeLoc = detail::code_location::current()) {
258+
259+
return submit_kernel_direct<detail::WrapAs::single_task, void, KernelName,
260+
EventNeeded, PropertiesT, KernelTypeUniversalRef,
261+
1>(
262+
Queue, Props, nd_range<1>{1, 1},
263+
std::forward<KernelTypeUniversalRef>(KernelFunc), CodeLoc);
264+
}
265+
219266
} // namespace detail
220267

221268
namespace ext ::oneapi ::experimental {
@@ -2727,12 +2774,27 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase<queue> {
27272774
"Use queue.submit() instead");
27282775

27292776
detail::tls_code_loc_t TlsCodeLocCapture(CodeLoc);
2730-
return submit(
2731-
[&](handler &CGH) {
2732-
CGH.template single_task<KernelName, KernelType, PropertiesT>(
2733-
Properties, KernelFunc);
2734-
},
2735-
TlsCodeLocCapture.query());
2777+
2778+
// TODO The handler-less path does not support kernel
2779+
// function properties and kernel functions with the kernel_handler
2780+
// type argument yet.
2781+
if constexpr (
2782+
std::is_same_v<PropertiesT,
2783+
ext::oneapi::experimental::empty_properties_t> &&
2784+
!(ext::oneapi::experimental::detail::HasKernelPropertiesGetMethod<
2785+
const KernelType &>::value) &&
2786+
!(detail::KernelLambdaHasKernelHandlerArgT<KernelType, void>::value)) {
2787+
return detail::submit_kernel_direct_single_task<KernelName, true>(
2788+
*this, ext::oneapi::experimental::empty_properties_t{}, KernelFunc,
2789+
TlsCodeLocCapture.query());
2790+
} else {
2791+
return submit(
2792+
[&](handler &CGH) {
2793+
CGH.template single_task<KernelName, KernelType, PropertiesT>(
2794+
Properties, KernelFunc);
2795+
},
2796+
TlsCodeLocCapture.query());
2797+
}
27362798
}
27372799

27382800
/// single_task version with a kernel represented as a lambda.
@@ -3291,7 +3353,7 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase<queue> {
32913353
const KernelType &>::value) &&
32923354
!(detail::KernelLambdaHasKernelHandlerArgT<
32933355
KernelType, sycl::nd_item<Dims>>::value)) {
3294-
return detail::submit_kernel_direct<KernelName, true>(
3356+
return detail::submit_kernel_direct_parallel_for<KernelName, true>(
32953357
*this, ext::oneapi::experimental::empty_properties_t{}, Range,
32963358
Rest..., TlsCodeLocCapture.query());
32973359
} else {

sycl/test-e2e/Basic/test_num_kernel_copies.cpp

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,8 @@ int main(int argc, char **argv) {
3636

3737
kernel<2> krn2;
3838
q.single_task(krn2);
39-
assert(copy_count == 1);
39+
// The kernel is copied on the scheduler-based path only.
40+
assert(copy_count == 0);
4041
assert(move_count == 0);
4142
copy_count = 0;
4243

sycl/test/basic_tests/kernel_size_mismatch.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@ int main() {
1313
(void)A;
1414
// expected-no-diagnostics
1515
#else
16-
// expected-error-re@sycl/handler.hpp:* {{static assertion failed due to requirement '{{.*}}': Unexpected kernel lambda size. This can be caused by an external host compiler producing a lambda with an unexpected layout. This is a limitation of the compiler.}}
16+
// expected-error-re@sycl/queue.hpp:* {{static assertion failed due to requirement '{{.*}}': Unexpected kernel lambda size. This can be caused by an external host compiler producing a lambda with an unexpected layout. This is a limitation of the compiler.}}
1717
#endif
1818
}).wait();
1919
}

sycl/test/basic_tests/single_task_error_message.cpp

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,9 @@ int main() {
1212
.single_task([&](sycl::handler &cgh) {
1313
// expected-error-re@sycl/queue.hpp:* {{static assertion failed due to requirement '{{.*}}': sycl::queue.single_task() requires a kernel instead of command group.{{.*}} Use queue.submit() instead}}
1414
// expected-error-re@sycl/detail/cg_types.hpp:* {{no matching function for call to object of type '(lambda at {{.*}}single_task_error_message.cpp:{{.*}})'}}
15+
// TODO Investigate why this function template is not instantiated
16+
// (if this is expected).
17+
// expected-error@sycl/detail/cg_types.hpp:* {{no matching function for call to 'runKernelWithoutArg'}}
1518
})
1619
.wait();
1720
}

sycl/unittests/Extensions/FreeFunctionCommands/FreeFunctionCommandsEvents.cpp

Lines changed: 52 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -78,6 +78,7 @@ class FreeFunctionCommandsEventsTests : public ::testing::Test {
7878
protected:
7979
void SetUp() override {
8080
counter_urEnqueueKernelLaunch = 0;
81+
counter_urEnqueueKernelLaunchWithEvent = 0;
8182
counter_urUSMEnqueueMemcpy = 0;
8283
counter_urUSMEnqueueFill = 0;
8384
counter_urUSMEnqueuePrefetch = 0;
@@ -281,6 +282,57 @@ TEST_F(FreeFunctionCommandsEventsTests,
281282
ASSERT_EQ(counter_urEnqueueKernelLaunchWithEvent, size_t{1});
282283
}
283284

285+
TEST_F(FreeFunctionCommandsEventsTests, LaunchTaskShortcutMoveKernel) {
286+
mock::getCallbacks().set_replace_callback("urEnqueueKernelLaunch",
287+
&redefined_urEnqueueKernelLaunch);
288+
289+
TestMoveFunctor::MoveCtorCalls = 0;
290+
TestMoveFunctor MoveOnly;
291+
std::mutex CvMutex;
292+
std::condition_variable Cv;
293+
bool ready = false;
294+
295+
// This kernel submission uses scheduler-bypass path, so the HostKernel
296+
// shouldn't be constructed.
297+
298+
sycl::khr::launch_task(Queue, std::move(MoveOnly));
299+
300+
ASSERT_EQ(TestMoveFunctor::MoveCtorCalls, 0);
301+
ASSERT_EQ(counter_urEnqueueKernelLaunch, size_t{1});
302+
303+
// Another kernel submission is queued behind a host task,
304+
// to force the scheduler-based submission. In this case, the HostKernel
305+
// should be constructed.
306+
307+
// Replace the callback with an event based one, since the scheduler
308+
// needs to create an event internally
309+
mock::getCallbacks().set_replace_callback(
310+
"urEnqueueKernelLaunch", &redefined_urEnqueueKernelLaunchWithEvent);
311+
312+
Queue.submit([&](sycl::handler &CGH) {
313+
CGH.host_task([&] {
314+
std::unique_lock<std::mutex> lk(CvMutex);
315+
Cv.wait(lk, [&ready] { return ready; });
316+
});
317+
});
318+
319+
sycl::khr::launch_task(Queue, std::move(MoveOnly));
320+
321+
{
322+
std::unique_lock<std::mutex> lk(CvMutex);
323+
ready = true;
324+
}
325+
Cv.notify_one();
326+
327+
Queue.wait();
328+
329+
// Move ctor for TestMoveFunctor is called during move construction of
330+
// HostKernel. Copy ctor is called by InstantiateKernelOnHost, can't delete
331+
// it.
332+
ASSERT_EQ(TestMoveFunctor::MoveCtorCalls, 1);
333+
ASSERT_EQ(counter_urEnqueueKernelLaunchWithEvent, size_t{1});
334+
}
335+
284336
TEST_F(FreeFunctionCommandsEventsTests, SubmitLaunchGroupedKernelNoEvent) {
285337
mock::getCallbacks().set_replace_callback("urEnqueueKernelLaunch",
286338
&redefined_urEnqueueKernelLaunch);

0 commit comments

Comments
 (0)