Skip to content

Commit

Permalink
Merge pull request #1880 from hdelan/l0-native-enqueue
Browse files Browse the repository at this point in the history
[L0] L0 impl for enqueue native command
  • Loading branch information
omarahmed1111 authored Jul 31, 2024
2 parents 3e762e0 + 3f13f69 commit 99489ad
Show file tree
Hide file tree
Showing 8 changed files with 250 additions and 5 deletions.
2 changes: 1 addition & 1 deletion source/adapters/level_zero/device.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -877,7 +877,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(
}
case UR_DEVICE_INFO_ENQUEUE_NATIVE_COMMAND_SUPPORT_EXP: {
// L0 supports enqueueing native work through urEnqueueNativeCommandExp
return ReturnValue(static_cast<ur_bool_t>(false));
return ReturnValue(static_cast<ur_bool_t>(true));
}

case UR_DEVICE_INFO_ESIMD_SUPPORT: {
Expand Down
69 changes: 65 additions & 4 deletions source/adapters/level_zero/enqueue_native.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -10,11 +10,72 @@

#include <ur_api.h>

#include "logger/ur_logger.hpp"
#include "queue.hpp"
#include "ur_level_zero.hpp"

// Level Zero (legacy queue) implementation of urEnqueueNativeCommandExp.
//
// Appends a wait on the caller's events, invokes the user callback so it can
// append native Level Zero commands, and finally appends a signal of the
// event associated with this enqueue.
//
// pfnNativeEnqueue    - user callback, invoked synchronously as
//                       pfnNativeEnqueue(Queue, data).
// data                - opaque pointer forwarded unchanged to the callback.
// (unnamed parameters) - memory-object count/list and properties; not read by
//                       this implementation.
// NumEventsInWaitList / phEventList - events the native work must wait on.
// phEvent             - optional; when null an internal event is created.
//
// Returns UR_RESULT_SUCCESS on the success path. Failures of the wrapped calls
// are presumably propagated by UR_CALL / ZE2UR_CALL -- confirm against the
// macro definitions.
ur_result_t ur_queue_handle_legacy_t_::enqueueNativeCommandExp(
ur_exp_enqueue_native_command_function_t, void *, uint32_t,
const ur_mem_handle_t *, const ur_exp_enqueue_native_command_properties_t *,
uint32_t, const ur_event_handle_t *, ur_event_handle_t *) {
return UR_RESULT_ERROR_UNSUPPORTED_FEATURE;
ur_exp_enqueue_native_command_function_t pfnNativeEnqueue, void *data,
uint32_t, const ur_mem_handle_t *,
const ur_exp_enqueue_native_command_properties_t *,
uint32_t NumEventsInWaitList, const ur_event_handle_t *phEventList,
ur_event_handle_t *phEvent) {
auto Queue = this;
// The queue mutex is held exclusively until this function returns, which
// includes the time spent inside the user callback below.
std::scoped_lock<ur_shared_mutex> lock(Queue->Mutex);

// Every command list requested below is requested with UseCopyEngine ==
// false, i.e. never as a copy-engine command list.
bool UseCopyEngine = false;

// Please note that the following code should be run before the
// subsequent getAvailableCommandList() call so that there is no
// dead-lock from waiting unsubmitted events in an open batch.
// The createAndRetainUrZeEventList() has the proper side-effect
// of submitting batches with dependent events.
//
_ur_ze_event_list_t TmpWaitList;
UR_CALL(TmpWaitList.createAndRetainUrZeEventList(
NumEventsInWaitList, phEventList, Queue, UseCopyEngine));

// Get a new command list to be used on this call
ur_command_list_ptr_t CommandList{};
// TODO: Change UseCopyEngine argument to 'true' once L0 backend
// support is added
UR_CALL(Queue->Context->getAvailableCommandList(
Queue, CommandList, UseCopyEngine, NumEventsInWaitList, phEventList));

// TODO: do we need to create a unique command type for this?
ze_event_handle_t ZeEvent = nullptr;
// When the caller passes no phEvent, an internal event is used instead so
// that the wait list and the final signal still have an event to attach to.
ur_event_handle_t InternalEvent;
bool IsInternal = phEvent == nullptr;
ur_event_handle_t *Event = phEvent ? phEvent : &InternalEvent;
UR_CALL(createEventAndAssociateQueue(Queue, Event,
UR_COMMAND_ENQUEUE_NATIVE_EXP,
CommandList, IsInternal, false));
ZeEvent = (*Event)->ZeEvent;
(*Event)->WaitList = TmpWaitList;

// Step 1: make the command list wait on the caller's dependencies (skipped
// when the wait list is empty).
const auto &WaitList = (*Event)->WaitList;
if (WaitList.Length) {
ZE2UR_CALL(zeCommandListAppendWaitOnEvents,
(CommandList->first, WaitList.Length, WaitList.ZeEventList));
}

// Submit the dependency waits, then obtain the command list that the user
// callback will append to.
UR_CALL(Queue->executeCommandList(CommandList, false, false));
UR_CALL(Queue->Context->getAvailableCommandList(Queue, CommandList,
UseCopyEngine, 0, nullptr));

// Step 2: run the user callback. While `Active` is alive the command list is
// published through the thread-local slot, so queueGetNativeHandle() called
// from inside the callback on this thread returns CommandList->first and does
// so before it would try to take the queue mutex that is already held here.
{
ScopedCommandList Active{Queue, CommandList->first};

// Call interop func which enqueues native async work
pfnNativeEnqueue(Queue, data);
}

// Submit whatever the callback appended, then obtain a command list for the
// completion signal.
UR_CALL(Queue->executeCommandList(CommandList, false, false));
UR_CALL(Queue->Context->getAvailableCommandList(Queue, CommandList,
UseCopyEngine, 0, nullptr));

// Step 3: signal the event associated with this enqueue.
ZE2UR_CALL(zeCommandListAppendSignalEvent, (CommandList->first, ZeEvent));

UR_CALL(Queue->executeCommandList(CommandList, false));
return UR_RESULT_SUCCESS;
}
9 changes: 9 additions & 0 deletions source/adapters/level_zero/queue.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -705,6 +705,15 @@ ur_result_t ur_queue_handle_legacy_t_::queueGetNativeHandle(
) {
auto Queue = this;

// Needed for EnqueueNativeCommandExp, so that the native queue 'got' in the
// interop func is the same as the native queue used to manage dependencies
// before the interop func invocation
if (Queue->getThreadLocalCommandList() != ze_command_list_handle_t{0}) {
auto ZeCmdList = ur_cast<ze_command_list_handle_t *>(NativeQueue);
*ZeCmdList = Queue->getThreadLocalCommandList();
return UR_RESULT_SUCCESS;
}

// Lock automatically releases when this goes out of scope.
std::shared_lock<ur_shared_mutex> lock(Queue->Mutex);

Expand Down
26 changes: 26 additions & 0 deletions source/adapters/level_zero/queue.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -423,6 +423,12 @@ struct ur_queue_handle_legacy_t_ : _ur_object, public ur_queue_handle_t_ {
uint32_t, const ur_event_handle_t *,
ur_event_handle_t *) override;

// Accessor for the calling thread's pinned command list. The slot starts out
// as a null handle; in the code shown here it is only written by
// ScopedCommandList, which stores its command list for the duration of its
// lifetime. queueGetNativeHandle() returns the slot's value when it is
// non-null.
static ze_command_list_handle_t &getThreadLocalCommandList() {
  static thread_local ze_command_list_handle_t ActiveCommandList =
      ze_command_list_handle_t{0};
  return ActiveCommandList;
}

using queue_type = ur_device_handle_t_::queue_group_info_t::type;
// PI queue is in general a one to many mapping to L0 native queues.
struct ur_queue_group_t {
Expand Down Expand Up @@ -941,3 +947,23 @@ ur_result_t setSignalEvent(ur_queue_handle_legacy_t Queue, bool UseCopyEngine,
ur_result_t CleanupEventListFromResetCmdList(
std::vector<ur_event_handle_t> &EventListToCleanup,
bool QueueLocked = false);

// RAII object to make hQueue command list getter methods all return the same
// command list within the lifetime of this object.
//
// This is useful for urEnqueueNativeCommandExp where we want guarantees that
// the user submitted native calls will be dispatched to a known command list,
// which must be "got" within the user submitted fuction.
class ScopedCommandList {
ur_queue_handle_legacy_t hQueue;

public:
ScopedCommandList(ur_queue_handle_legacy_t hQueue,
ze_command_list_handle_t CommandList)
: hQueue{hQueue} {
hQueue->getThreadLocalCommandList() = CommandList;
}
~ScopedCommandList() {
hQueue->getThreadLocalCommandList() = ze_command_list_handle_t{0};
}
};
17 changes: 17 additions & 0 deletions test/conformance/exp_enqueue_native/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -15,4 +15,21 @@ if (UR_BUILD_ADAPTER_CUDA)
target_link_libraries(test-exp_enqueue_native PRIVATE cudadrv)
endif()

# Level Zero variant of the exp_enqueue_native conformance test; only
# configured when the Level Zero adapter is part of the build.
# NOTE(review): the CUDA branch above links against the same
# test-exp_enqueue_native target -- confirm the target is not defined twice
# when both adapters are enabled.
if (UR_BUILD_ADAPTER_L0)
add_conformance_test_with_kernels_environment(
exp_enqueue_native
enqueue_native_level_zero.cpp
)
# The test source includes ze_api.h and calls Level Zero entry points
# directly, hence the loader and its headers.
target_link_libraries(test-exp_enqueue_native PRIVATE
LevelZeroLoader
LevelZeroLoader-Headers
)

# NOTE(review): LevelZeroLoader-Headers is listed below as an include
# directory although it is used as a link target above -- presumably the
# linked target already provides the include path; confirm and drop the entry
# if so.
target_include_directories(test-exp_enqueue_native PRIVATE
${PROJECT_SOURCE_DIR}/source
${PROJECT_SOURCE_DIR}/source/adapters/level_zero
LevelZeroLoader-Headers
)
endif()

# TODO: Add more tests for different triples
128 changes: 128 additions & 0 deletions test/conformance/exp_enqueue_native/enqueue_native_level_zero.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,128 @@
// Copyright (C) 2024 Intel Corporation
// Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM Exceptions.
// See LICENSE.TXT
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

#include "ze_api.h"

#include <uur/fixtures.h>
#include <vector>

// Element type of the fill pattern and of the host result buffer.
using T = uint32_t;

struct urLevelZeroEnqueueNativeCommandTest : uur::urQueueTest {
void SetUp() {
UUR_RETURN_ON_FATAL_FAILURE(uur::urQueueTest::SetUp());

host_vec = std::vector<T>(global_size, 0);
ASSERT_EQ(host_vec.size(), global_size);
ASSERT_SUCCESS(urUSMDeviceAlloc(context, device, nullptr, nullptr,
allocation_size, &device_ptr));
ASSERT_NE(device_ptr, nullptr);
}
static constexpr T val = 42;
static constexpr uint32_t global_size = 1e7;
std::vector<T> host_vec;
void *device_ptr = nullptr;
static constexpr size_t allocation_size = sizeof(val) * global_size;
};

// Registers the fixture's TEST_P cases through the uur helper macro
// (presumably one instantiation per available device -- confirm in
// uur/fixtures.h).
UUR_INSTANTIATE_DEVICE_TEST_SUITE_P(urLevelZeroEnqueueNativeCommandTest);

// Payload handed to interop_func_1 through its opaque `data` pointer.
struct InteropData1 {
// Destination of the fill; the tests pass the fixture's device allocation.
void *fill_ptr;
};

// Fill a device ptr with the pattern val.
//
// Native-command callback: `data` must point to an InteropData1 whose
// fill_ptr is the destination. The fill covers the fixture's allocation_size
// bytes and is appended to the command list obtained from hQueue via
// urQueueGetNativeHandle. Failures are reported as fatal test failures; the
// function then returns without appending further work.
void interop_func_1(ur_queue_handle_t hQueue, void *data) {
    ze_command_list_handle_t CommandList = nullptr;
    ASSERT_SUCCESS(urQueueGetNativeHandle(
        hQueue, nullptr,
        reinterpret_cast<ur_native_handle_t *>(&CommandList)));
    auto *func_data = static_cast<InteropData1 *>(data);

    // If L0 interop becomes a real use case we should make a new UR entry point
    // to propagate events into and out of the interop func.
    //
    // The Level Zero result is checked so that a rejected append shows up as a
    // failure here rather than as a confusing data mismatch later on.
    ASSERT_EQ(zeCommandListAppendMemoryFill(
                  CommandList, func_data->fill_ptr,
                  &urLevelZeroEnqueueNativeCommandTest::val,
                  sizeof(urLevelZeroEnqueueNativeCommandTest::val),
                  urLevelZeroEnqueueNativeCommandTest::allocation_size,
                  nullptr, 0, nullptr),
              ZE_RESULT_SUCCESS);
}

// Payload handed to interop_func_2 through its opaque `data` pointer.
struct InteropData2 {
    // Memory the copy reads from.
    void *from;
    // Memory the copy writes to.
    void *to;
};

// Read from device ptr to host ptr.
//
// Native-command callback: `data` must point to an InteropData2. A copy of
// the fixture's allocation_size bytes from `from` to `to` is appended to the
// command list obtained from hQueue via urQueueGetNativeHandle. Failures are
// reported as fatal test failures; the function then returns without
// appending further work.
void interop_func_2(ur_queue_handle_t hQueue, void *data) {
    ze_command_list_handle_t CommandList = nullptr;
    ASSERT_SUCCESS(urQueueGetNativeHandle(
        hQueue, nullptr,
        reinterpret_cast<ur_native_handle_t *>(&CommandList)));
    auto *func_data = static_cast<InteropData2 *>(data);

    // If L0 interop becomes a real use case we should make a new UR entry point
    // to propagate events into and out of the interop func.
    //
    // The Level Zero result is checked so that a rejected append shows up as a
    // failure here rather than as a confusing data mismatch later on.
    ASSERT_EQ(zeCommandListAppendMemoryCopy(
                  CommandList, func_data->to, func_data->from,
                  urLevelZeroEnqueueNativeCommandTest::allocation_size,
                  nullptr, 0, nullptr),
              ZE_RESULT_SUCCESS);
}

// A single native fill with no dependencies must be accepted and complete.
TEST_P(urLevelZeroEnqueueNativeCommandTest, Success) {
    InteropData1 data_1{device_ptr};
    ur_event_handle_t event_1 = nullptr;
    ASSERT_SUCCESS(urEnqueueNativeCommandExp(
        queue, &interop_func_1, &data_1, 0, nullptr /*phMemList=*/,
        nullptr /*pProperties=*/, 0, nullptr /*phEventWaitList=*/, &event_1));

    // Do not leave the fill in flight when the test body ends, and give back
    // the event reference that was returned to us.
    ASSERT_SUCCESS(urQueueFinish(queue));
    ASSERT_SUCCESS(urEventRelease(event_1));
}

// Native fill followed by a native read-back that waits on the fill's event:
// the host vector must end up holding the pattern.
TEST_P(urLevelZeroEnqueueNativeCommandTest, Dependencies) {
    ur_event_handle_t event_1 = nullptr;
    ur_event_handle_t event_2 = nullptr;

    InteropData1 data_1{device_ptr};
    ASSERT_SUCCESS(urEnqueueNativeCommandExp(
        queue, &interop_func_1, &data_1, 0, nullptr /*phMemList=*/,
        nullptr /*pProperties=*/, 0, nullptr /*phEventWaitList=*/, &event_1));

    InteropData2 data_2{device_ptr, host_vec.data()};
    ASSERT_SUCCESS(urEnqueueNativeCommandExp(
        queue, &interop_func_2, &data_2, 0, nullptr /*phMemList=*/,
        nullptr /*pProperties=*/, 1, &event_1, &event_2));

    // A failed finish would make the comparison below meaningless, so check
    // it; then give back the event references that were returned to us.
    ASSERT_SUCCESS(urQueueFinish(queue));
    ASSERT_SUCCESS(urEventRelease(event_1));
    ASSERT_SUCCESS(urEventRelease(event_2));

    for (const auto &element : host_vec) {
        ASSERT_EQ(element, val);
    }
}

// Regular UR fill followed by a native read-back that waits on the fill's
// event: the host vector must end up holding the pattern.
TEST_P(urLevelZeroEnqueueNativeCommandTest, DependenciesURBefore) {
    ur_event_handle_t event_1 = nullptr;
    ur_event_handle_t event_2 = nullptr;

    ASSERT_SUCCESS(urEnqueueUSMFill(queue, device_ptr, sizeof(val), &val,
                                    allocation_size, 0,
                                    nullptr /*phEventWaitList=*/, &event_1));

    InteropData2 data_2{device_ptr, host_vec.data()};
    ASSERT_SUCCESS(urEnqueueNativeCommandExp(
        queue, &interop_func_2, &data_2, 0, nullptr /*phMemList=*/,
        nullptr /*pProperties=*/, 1, &event_1, &event_2));

    // A failed finish would make the comparison below meaningless, so check
    // it; then give back the event references that were returned to us.
    ASSERT_SUCCESS(urQueueFinish(queue));
    ASSERT_SUCCESS(urEventRelease(event_1));
    ASSERT_SUCCESS(urEventRelease(event_2));

    for (const auto &element : host_vec) {
        ASSERT_EQ(element, val);
    }
}

// Native fill followed by a regular, blocking UR memcpy that waits on the
// fill's event: the host vector must end up holding the pattern.
TEST_P(urLevelZeroEnqueueNativeCommandTest, DependenciesURAfter) {
    ur_event_handle_t event_1 = nullptr;

    InteropData1 data_1{device_ptr};
    ASSERT_SUCCESS(urEnqueueNativeCommandExp(
        queue, &interop_func_1, &data_1, 0, nullptr /*phMemList=*/,
        nullptr /*pProperties=*/, 0, nullptr /*phEventWaitList=*/, &event_1));

    // The copy is blocking, so host_vec may be inspected as soon as it
    // returns successfully; an unchecked failure would otherwise surface only
    // as a data mismatch below.
    ASSERT_SUCCESS(urEnqueueUSMMemcpy(queue, /*blocking*/ true,
                                      host_vec.data(), device_ptr,
                                      allocation_size, 1, &event_1, nullptr));
    ASSERT_SUCCESS(urEventRelease(event_1));

    for (const auto &element : host_vec) {
        ASSERT_EQ(element, val);
    }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
urLevelZeroEnqueueNativeCommandTest.Success{{.*}}
urLevelZeroEnqueueNativeCommandTest.Dependencies{{.*}}
urLevelZeroEnqueueNativeCommandTest.DependenciesURBefore{{.*}}
urLevelZeroEnqueueNativeCommandTest.DependenciesURAfter{{.*}}
Empty file.

0 comments on commit 99489ad

Please sign in to comment.