From 03310785be97d17d8c14d4b58c5d6f12181e7ce9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Markus=20B=C3=B6ck?= Date: Tue, 21 May 2024 15:10:48 +0100 Subject: [PATCH] [runtime] Implement most basic offloading support (#10) With these changes we now have the DM core acting as our "host" that runs IREE and the compute cores parked until a kernel should be executed. The DM core then wakes the kernels using a cluster interrupt to start executing a workgroup. This should additionally enable multicore processing once the Codegen backend is capable of tiling and distributing tiles to workgroups. --- runtime/runtime/src/Quidditch/CMakeLists.txt | 26 +- .../Quidditch/command_buffer/CMakeLists.txt | 18 + .../Quidditch/command_buffer/command_buffer.c | 614 ++++++++++++++++++ .../Quidditch/command_buffer/command_buffer.h | 62 ++ .../src/Quidditch/device/CMakeLists.txt | 24 + .../src/Quidditch/{ => device}/device.c | 12 +- .../src/Quidditch/{ => device}/device.h | 0 .../src/Quidditch/{ => device}/event.c | 0 .../src/Quidditch/{ => device}/event.h | 0 .../{ => device}/registration/CMakeLists.txt | 0 .../{ => device}/registration/registration.c | 0 .../{ => device}/registration/registration.h | 0 .../src/Quidditch/{ => device}/semaphore.c | 0 .../src/Quidditch/{ => device}/semaphore.h | 0 .../src/Quidditch/dispatch/CMakeLists.txt | 11 + .../runtime/src/Quidditch/dispatch/dispatch.c | 90 +++ .../runtime/src/Quidditch/dispatch/dispatch.h | 36 + .../src/Quidditch/executable/CMakeLists.txt | 15 + .../src/Quidditch/executable/executable.c | 215 ++++++ .../src/Quidditch/executable/executable.h | 85 +++ .../src/Quidditch/loader/CMakeLists.txt | 12 + runtime/runtime/src/Quidditch/loader/loader.c | 143 ++++ runtime/runtime/src/Quidditch/loader/loader.h | 12 + runtime/samples/CMakeLists.txt | 4 +- runtime/samples/main.c | 22 +- runtime/snitch_cluster/CMakeLists.txt | 8 +- .../rtl/src/snitch_cluster_start.S | 1 - .../rtl/src/snitch_cluster_start.c | 2 - runtime/snitch_cluster/rtl/src/snrt.c | 9 - runtime/snitch_cluster/rtl/src/snrt.h | 8 - runtime/snitch_cluster/rtl/src/syscalls.c | 9 +- runtime/tests/CMakeLists.txt | 8 +- 32 files changed, 1380 insertions(+), 66 deletions(-) create mode 100644 runtime/runtime/src/Quidditch/command_buffer/CMakeLists.txt create mode 100644 runtime/runtime/src/Quidditch/command_buffer/command_buffer.c create mode 100644 runtime/runtime/src/Quidditch/command_buffer/command_buffer.h create mode 100644 runtime/runtime/src/Quidditch/device/CMakeLists.txt rename runtime/runtime/src/Quidditch/{ => device}/device.c (98%) rename runtime/runtime/src/Quidditch/{ => device}/device.h (100%) rename runtime/runtime/src/Quidditch/{ => device}/event.c (100%) rename runtime/runtime/src/Quidditch/{ => device}/event.h (100%) rename runtime/runtime/src/Quidditch/{ => device}/registration/CMakeLists.txt (100%) rename runtime/runtime/src/Quidditch/{ => device}/registration/registration.c (100%) rename runtime/runtime/src/Quidditch/{ => device}/registration/registration.h (100%) rename runtime/runtime/src/Quidditch/{ => device}/semaphore.c (100%) rename runtime/runtime/src/Quidditch/{ => device}/semaphore.h (100%) create mode 100644 runtime/runtime/src/Quidditch/dispatch/CMakeLists.txt create mode 100644 runtime/runtime/src/Quidditch/dispatch/dispatch.c create mode 100644 runtime/runtime/src/Quidditch/dispatch/dispatch.h create mode 100644 runtime/runtime/src/Quidditch/executable/CMakeLists.txt create mode 100644 runtime/runtime/src/Quidditch/executable/executable.c create mode 100644 runtime/runtime/src/Quidditch/executable/executable.h create mode 100644 runtime/runtime/src/Quidditch/loader/CMakeLists.txt create mode 100644 runtime/runtime/src/Quidditch/loader/loader.c create mode 100644 runtime/runtime/src/Quidditch/loader/loader.h diff --git a/runtime/runtime/src/Quidditch/CMakeLists.txt b/runtime/runtime/src/Quidditch/CMakeLists.txt index 993bdfd..f7bdb8c 100644 --- a/runtime/runtime/src/Quidditch/CMakeLists.txt +++ b/runtime/runtime/src/Quidditch/CMakeLists.txt @@ -1,21 +1,5 @@ -add_subdirectory(registration) - -iree_cc_library( - NAME - device - SRCS - device.c - event.c - semaphore.c - DEPS - snRuntime - iree::base - iree::base::internal - iree::base::internal::arena - iree::base::internal::synchronization - iree::hal::utils::deferred_command_buffer - iree::hal::utils::file_transfer - iree::hal::utils::memory_file - iree::hal::utils::semaphore_base - PUBLIC -) +add_subdirectory(command_buffer) +add_subdirectory(device) +add_subdirectory(dispatch) +add_subdirectory(executable) +add_subdirectory(loader) diff --git a/runtime/runtime/src/Quidditch/command_buffer/CMakeLists.txt b/runtime/runtime/src/Quidditch/command_buffer/CMakeLists.txt new file mode 100644 index 0000000..7e3474c --- /dev/null +++ b/runtime/runtime/src/Quidditch/command_buffer/CMakeLists.txt @@ -0,0 +1,18 @@ + +iree_cc_library( + NAME + command_buffer + HDRS + "command_buffer.h" + SRCS + "command_buffer.c" + DEPS + snRuntime + iree::base + iree::base::internal + iree::base::internal::cpu + iree::base::internal::fpu_state + iree::hal + Quidditch::executable::executable + PUBLIC +) diff --git a/runtime/runtime/src/Quidditch/command_buffer/command_buffer.c b/runtime/runtime/src/Quidditch/command_buffer/command_buffer.c new file mode 100644 index 0000000..c402c9b --- /dev/null +++ b/runtime/runtime/src/Quidditch/command_buffer/command_buffer.c @@ -0,0 +1,614 @@ +// Copyright 2021 The IREE Authors +// +// Licensed under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#include "command_buffer.h" + +#include +#include +#include + +#include "Quidditch/executable/executable.h" +#include "iree/base/api.h" +#include "iree/base/internal/cpu.h" +#include "iree/base/internal/fpu_state.h" +#include "iree/base/internal/math.h" +#include "iree/hal/local/executable_library.h" +#include "iree/hal/local/local_pipeline_layout.h" + +//===----------------------------------------------------------------------===// +// quidditch_command_buffer_t, fork of iree_hal_inline_command_buffer_t. +//===----------------------------------------------------------------------===// + +// Inline synchronous one-shot command "buffer". +typedef struct quidditch_command_buffer_t { + iree_hal_command_buffer_t base; + iree_allocator_t host_allocator; + + struct { + // A flattened list of all available descriptor set bindings. + // As descriptor sets are pushed/bound the bindings will be updated to + // represent the fully-translated binding data pointer. + // + // TODO(benvanik): support proper mapping semantics and track the + // iree_hal_buffer_mapping_t and map/unmap where appropriate. + void* full_bindings[IREE_HAL_LOCAL_MAX_DESCRIPTOR_SET_COUNT * + IREE_HAL_LOCAL_MAX_DESCRIPTOR_BINDING_COUNT]; + size_t full_binding_lengths[IREE_HAL_LOCAL_MAX_DESCRIPTOR_SET_COUNT * + IREE_HAL_LOCAL_MAX_DESCRIPTOR_BINDING_COUNT]; + + // Packed bindings scratch space used during dispatch. Executable bindings + // are packed into a dense list with unused bindings removed. + void* packed_bindings[IREE_HAL_LOCAL_MAX_DESCRIPTOR_SET_COUNT * + IREE_HAL_LOCAL_MAX_DESCRIPTOR_BINDING_COUNT]; + size_t packed_binding_lengths[IREE_HAL_LOCAL_MAX_DESCRIPTOR_SET_COUNT * + IREE_HAL_LOCAL_MAX_DESCRIPTOR_BINDING_COUNT]; + + // All available push constants updated each time push_constants is called. + // Reset only with the command buffer and otherwise will maintain its values + // during recording to allow for partial push_constants updates. + uint32_t push_constants[IREE_HAL_LOCAL_MAX_PUSH_CONSTANT_COUNT]; + + // Cached and initialized dispatch state reused for all dispatches. + // Individual dispatches must populate the dynamically changing fields like + // push_constant_count and binding_count. + iree_alignas(64) iree_hal_executable_dispatch_state_v0_t dispatch_state; + + // An opaque tag used to reduce the cost of processor ID queries. + iree_cpu_processor_tag_t processor_tag; + // Guess at the current processor ID. + iree_cpu_processor_id_t processor_id; + } state; +} quidditch_command_buffer_t; + +static const iree_hal_command_buffer_vtable_t quidditch_command_buffer_vtable; + +static quidditch_command_buffer_t* quidditch_command_buffer_cast( + iree_hal_command_buffer_t* base_value) { + IREE_HAL_ASSERT_TYPE(base_value, &quidditch_command_buffer_vtable); + return (quidditch_command_buffer_t*)base_value; +} + +static void quidditch_command_buffer_reset( + quidditch_command_buffer_t* command_buffer) { + memset(&command_buffer->state, 0, sizeof(command_buffer->state)); + + // Setup the cached dispatch state pointers that don't change. + iree_hal_executable_dispatch_state_v0_t* dispatch_state = + &command_buffer->state.dispatch_state; + dispatch_state->push_constants = command_buffer->state.push_constants; + dispatch_state->binding_ptrs = command_buffer->state.packed_bindings; + dispatch_state->binding_lengths = + command_buffer->state.packed_binding_lengths; +} + +iree_host_size_t quidditch_command_buffer_size(void) { + return sizeof(quidditch_command_buffer_t); +} + +iree_status_t quidditch_command_buffer_initialize( + iree_hal_device_t* device, iree_hal_command_buffer_mode_t mode, + iree_hal_command_category_t command_categories, + iree_hal_queue_affinity_t queue_affinity, iree_host_size_t binding_capacity, + iree_allocator_t host_allocator, iree_byte_span_t storage, + iree_hal_command_buffer_t** out_command_buffer) { + IREE_ASSERT_ARGUMENT(out_command_buffer); + *out_command_buffer = NULL; + + if (!iree_all_bits_set( + mode, IREE_HAL_COMMAND_BUFFER_MODE_ONE_SHOT | + IREE_HAL_COMMAND_BUFFER_MODE_ALLOW_INLINE_EXECUTION)) { + // This implementation only supports command buffers that are allowed to + // execute inline. This mode is a contract with the caller that it is ok if + // we begin executing prior to submission. + return iree_make_status( + IREE_STATUS_INVALID_ARGUMENT, + "inline command buffers must have a mode with ALLOW_INLINE_EXECUTION"); + } + if (binding_capacity > 0) { + // We execute as we record and can't use binding tables to do that. + return iree_make_status( + IREE_STATUS_INVALID_ARGUMENT, + "indirect command buffers do not support binding tables"); + } + if (storage.data_length < quidditch_command_buffer_size()) { + return iree_make_status(IREE_STATUS_INVALID_ARGUMENT, + "storage must have at least the capacity as " + "defined by quidditch_command_buffer_size"); + } + + IREE_TRACE_ZONE_BEGIN(z0); + + quidditch_command_buffer_t* command_buffer = + (quidditch_command_buffer_t*)storage.data; + memset(command_buffer, 0, sizeof(*command_buffer)); + + iree_hal_command_buffer_initialize( + device, mode, command_categories, queue_affinity, binding_capacity, + &quidditch_command_buffer_vtable, &command_buffer->base); + command_buffer->host_allocator = host_allocator; + quidditch_command_buffer_reset(command_buffer); + + *out_command_buffer = &command_buffer->base; + + IREE_TRACE_ZONE_END(z0); + return iree_ok_status(); +} + +void quidditch_command_buffer_deinitialize( + iree_hal_command_buffer_t* base_command_buffer) { + quidditch_command_buffer_t* command_buffer = + quidditch_command_buffer_cast(base_command_buffer); + quidditch_command_buffer_reset(command_buffer); +} + +iree_status_t quidditch_command_buffer_create( + iree_hal_device_t* device, iree_hal_command_buffer_mode_t mode, + iree_hal_command_category_t command_categories, + iree_hal_queue_affinity_t queue_affinity, iree_host_size_t binding_capacity, + iree_allocator_t host_allocator, + iree_hal_command_buffer_t** out_command_buffer) { + IREE_ASSERT_ARGUMENT(out_command_buffer); + *out_command_buffer = NULL; + IREE_TRACE_ZONE_BEGIN(z0); + + uint8_t* storage = NULL; + iree_status_t status = iree_allocator_malloc( + host_allocator, quidditch_command_buffer_size(), (void**)&storage); + iree_hal_command_buffer_t* command_buffer = NULL; + if (iree_status_is_ok(status)) { + status = quidditch_command_buffer_initialize( + device, mode, command_categories, queue_affinity, binding_capacity, + host_allocator, + iree_make_byte_span(storage, quidditch_command_buffer_size()), + &command_buffer); + } + + if (iree_status_is_ok(status)) { + *out_command_buffer = command_buffer; + } else { + iree_allocator_free(host_allocator, storage); + } + IREE_TRACE_ZONE_END(z0); + return status; +} + +static void quidditch_command_buffer_destroy( + iree_hal_command_buffer_t* base_command_buffer) { + quidditch_command_buffer_t* command_buffer = + quidditch_command_buffer_cast(base_command_buffer); + iree_allocator_t host_allocator = command_buffer->host_allocator; + IREE_TRACE_ZONE_BEGIN(z0); + + quidditch_command_buffer_deinitialize(base_command_buffer); + iree_allocator_free(host_allocator, command_buffer); + + IREE_TRACE_ZONE_END(z0); +} + +bool quidditch_command_buffer_isa(iree_hal_command_buffer_t* command_buffer) { + return iree_hal_resource_is(&command_buffer->resource, + &quidditch_command_buffer_vtable); +} + +//===----------------------------------------------------------------------===// +// quidditch_command_buffer_t recording +//===----------------------------------------------------------------------===// + +// Updates the cached processor ID field in the command buffer. +static void quidditch_command_buffer_update_processor_id( + quidditch_command_buffer_t* command_buffer) { + iree_cpu_requery_processor_id(&command_buffer->state.processor_tag, + &command_buffer->state.processor_id); +} + +static iree_status_t quidditch_command_buffer_begin( + iree_hal_command_buffer_t* base_command_buffer) { + quidditch_command_buffer_t* command_buffer = + quidditch_command_buffer_cast(base_command_buffer); + quidditch_command_buffer_reset(command_buffer); + + // Query the processor ID we start out on. We may update it during execution. + quidditch_command_buffer_update_processor_id(command_buffer); + + return iree_ok_status(); +} + +static iree_status_t quidditch_command_buffer_end( + iree_hal_command_buffer_t* base_command_buffer) { + quidditch_command_buffer_t* command_buffer = + quidditch_command_buffer_cast(base_command_buffer); + quidditch_command_buffer_reset(command_buffer); + return iree_ok_status(); +} + +//===----------------------------------------------------------------------===// +// quidditch_command_buffer_t debug utilities +//===----------------------------------------------------------------------===// + +static void quidditch_command_buffer_begin_debug_group( + iree_hal_command_buffer_t* base_command_buffer, iree_string_view_t label, + iree_hal_label_color_t label_color, + const iree_hal_label_location_t* location) { + // TODO(benvanik): tracy event stack. +} + +static void quidditch_command_buffer_end_debug_group( + iree_hal_command_buffer_t* base_command_buffer) { + // TODO(benvanik): tracy event stack. +} + +//===----------------------------------------------------------------------===// +// iree_hal_command_buffer_execution_barrier +//===----------------------------------------------------------------------===// + +static iree_status_t quidditch_command_buffer_execution_barrier( + iree_hal_command_buffer_t* base_command_buffer, + iree_hal_execution_stage_t source_stage_mask, + iree_hal_execution_stage_t target_stage_mask, + iree_hal_execution_barrier_flags_t flags, + iree_host_size_t memory_barrier_count, + const iree_hal_memory_barrier_t* memory_barriers, + iree_host_size_t buffer_barrier_count, + const iree_hal_buffer_barrier_t* buffer_barriers) { + // No-op; we execute synchronously. + return iree_ok_status(); +} + +//===----------------------------------------------------------------------===// +// iree_hal_command_buffer_signal_event +//===----------------------------------------------------------------------===// + +static iree_status_t quidditch_command_buffer_signal_event( + iree_hal_command_buffer_t* base_command_buffer, iree_hal_event_t* event, + iree_hal_execution_stage_t source_stage_mask) { + // No-op; we execute synchronously. + return iree_ok_status(); +} + +//===----------------------------------------------------------------------===// +// iree_hal_command_buffer_reset_event +//===----------------------------------------------------------------------===// + +static iree_status_t quidditch_command_buffer_reset_event( + iree_hal_command_buffer_t* base_command_buffer, iree_hal_event_t* event, + iree_hal_execution_stage_t source_stage_mask) { + // No-op; we execute synchronously. + return iree_ok_status(); +} + +//===----------------------------------------------------------------------===// +// iree_hal_command_buffer_wait_events +//===----------------------------------------------------------------------===// + +static iree_status_t quidditch_command_buffer_wait_events( + iree_hal_command_buffer_t* base_command_buffer, + iree_host_size_t event_count, const iree_hal_event_t** events, + iree_hal_execution_stage_t source_stage_mask, + iree_hal_execution_stage_t target_stage_mask, + iree_host_size_t memory_barrier_count, + const iree_hal_memory_barrier_t* memory_barriers, + iree_host_size_t buffer_barrier_count, + const iree_hal_buffer_barrier_t* buffer_barriers) { + // No-op; we execute synchronously. + return iree_ok_status(); +} + +//===----------------------------------------------------------------------===// +// iree_hal_command_buffer_discard_buffer +//===----------------------------------------------------------------------===// + +static iree_status_t quidditch_command_buffer_discard_buffer( + iree_hal_command_buffer_t* base_command_buffer, iree_hal_buffer_t* buffer) { + // Could be treated as a cache invalidation as it indicates we won't be using + // the existing buffer contents again. + return iree_ok_status(); +} + +//===----------------------------------------------------------------------===// +// iree_hal_command_buffer_fill_buffer +//===----------------------------------------------------------------------===// + +static iree_status_t quidditch_command_buffer_fill_buffer( + iree_hal_command_buffer_t* base_command_buffer, + iree_hal_buffer_t* target_buffer, iree_device_size_t target_offset, + iree_device_size_t length, const void* pattern, + iree_host_size_t pattern_length) { + return iree_hal_buffer_map_fill(target_buffer, target_offset, length, pattern, + pattern_length); +} + +//===----------------------------------------------------------------------===// +// iree_hal_command_buffer_update_buffer +//===----------------------------------------------------------------------===// + +static iree_status_t quidditch_command_buffer_update_buffer( + iree_hal_command_buffer_t* base_command_buffer, const void* source_buffer, + iree_host_size_t source_offset, iree_hal_buffer_t* target_buffer, + iree_device_size_t target_offset, iree_device_size_t length) { + return iree_hal_buffer_map_write( + target_buffer, target_offset, + (const uint8_t*)source_buffer + source_offset, length); +} + +//===----------------------------------------------------------------------===// +// iree_hal_command_buffer_copy_buffer +//===----------------------------------------------------------------------===// + +static iree_status_t quidditch_command_buffer_copy_buffer( + iree_hal_command_buffer_t* base_command_buffer, + iree_hal_buffer_t* source_buffer, iree_device_size_t source_offset, + iree_hal_buffer_t* target_buffer, iree_device_size_t target_offset, + iree_device_size_t length) { + return iree_hal_buffer_map_copy(source_buffer, source_offset, target_buffer, + target_offset, length); +} + +//===----------------------------------------------------------------------===// +// iree_hal_command_buffer_collective +//===----------------------------------------------------------------------===// + +static iree_status_t quidditch_command_buffer_collective( + iree_hal_command_buffer_t* base_command_buffer, iree_hal_channel_t* channel, + iree_hal_collective_op_t op, uint32_t param, + iree_hal_buffer_binding_t send_binding, + iree_hal_buffer_binding_t recv_binding, iree_device_size_t element_count) { + return iree_make_status(IREE_STATUS_UNIMPLEMENTED, + "collectives not yet implemented on CPU"); +} + +//===----------------------------------------------------------------------===// +// iree_hal_command_buffer_push_constants +//===----------------------------------------------------------------------===// +// NOTE: command buffer state change only; enqueues no tasks. + +static iree_status_t quidditch_command_buffer_push_constants( + iree_hal_command_buffer_t* base_command_buffer, + iree_hal_pipeline_layout_t* pipeline_layout, iree_host_size_t offset, + const void* values, iree_host_size_t values_length) { + quidditch_command_buffer_t* command_buffer = + quidditch_command_buffer_cast(base_command_buffer); + + if (IREE_UNLIKELY(offset + values_length >= + sizeof(command_buffer->state.push_constants))) { + return iree_make_status(IREE_STATUS_INVALID_ARGUMENT, + "push constant range %" PRIhsz " (length=%" PRIhsz + ") out of range", + offset, values_length); + } + + memcpy((uint8_t*)&command_buffer->state.push_constants + offset, values, + values_length); + + return iree_ok_status(); +} + +//===----------------------------------------------------------------------===// +// iree_hal_command_buffer_push_descriptor_set +//===----------------------------------------------------------------------===// +// NOTE: command buffer state change only; enqueues no tasks. + +static iree_status_t quidditch_command_buffer_push_descriptor_set( + iree_hal_command_buffer_t* base_command_buffer, + iree_hal_pipeline_layout_t* pipeline_layout, uint32_t set, + iree_host_size_t binding_count, + const iree_hal_descriptor_set_binding_t* bindings) { + quidditch_command_buffer_t* command_buffer = + quidditch_command_buffer_cast(base_command_buffer); + + if (IREE_UNLIKELY(set >= IREE_HAL_LOCAL_MAX_DESCRIPTOR_SET_COUNT)) { + return iree_make_status(IREE_STATUS_INVALID_ARGUMENT, + "set %u out of bounds", set); + } + + iree_host_size_t binding_base = + set * IREE_HAL_LOCAL_MAX_DESCRIPTOR_BINDING_COUNT; + for (iree_host_size_t i = 0; i < binding_count; ++i) { + if (IREE_UNLIKELY(bindings[i].binding >= + IREE_HAL_LOCAL_MAX_DESCRIPTOR_BINDING_COUNT)) { + return iree_make_status(IREE_STATUS_INVALID_ARGUMENT, + "buffer binding index out of bounds"); + } + iree_host_size_t binding_ordinal = binding_base + bindings[i].binding; + + // TODO(benvanik): track mapping so we can properly map/unmap/flush/etc. + iree_hal_buffer_mapping_t buffer_mapping = {{0}}; + if (bindings[i].buffer) { + IREE_RETURN_IF_ERROR(iree_hal_buffer_map_range( + bindings[i].buffer, IREE_HAL_MAPPING_MODE_PERSISTENT, + IREE_HAL_MEMORY_ACCESS_ANY, bindings[i].offset, bindings[i].length, + &buffer_mapping)); + } + command_buffer->state.full_bindings[binding_ordinal] = + buffer_mapping.contents.data; + command_buffer->state.full_binding_lengths[binding_ordinal] = + buffer_mapping.contents.data_length; + } + + return iree_ok_status(); +} + +//===----------------------------------------------------------------------===// +// iree_hal_command_buffer_dispatch +//===----------------------------------------------------------------------===// + +#include + +static iree_status_t quidditch_command_buffer_dispatch( + iree_hal_command_buffer_t* base_command_buffer, + iree_hal_executable_t* executable, int32_t entry_point, + uint32_t workgroup_x, uint32_t workgroup_y, uint32_t workgroup_z) { + quidditch_command_buffer_t* command_buffer = + quidditch_command_buffer_cast(base_command_buffer); + + quidditch_executable_t* local_executable = + quidditch_executable_cast(executable); + if (IREE_UNLIKELY(!local_executable->pipeline_layouts)) { + return iree_make_status( + IREE_STATUS_FAILED_PRECONDITION, + "layouts not provided during executable creation; cannot dispatch"); + } + + iree_hal_local_pipeline_layout_t* local_layout = + (iree_hal_local_pipeline_layout_t*) + local_executable->pipeline_layouts[entry_point]; + iree_host_size_t local_memory_size = + local_executable->dispatch_attrs + ? local_executable->dispatch_attrs[entry_point].local_memory_pages * + IREE_HAL_WORKGROUP_LOCAL_MEMORY_PAGE_SIZE + : 0; + + // Update the ID of the processor we are running on. + // We don't know how much time has passed since we last updated as we are + // running inline with the user program; if we knew we were going to be + // handling a batch of dispatches we could reduce the amount of times we call + // this - but that's what the task system is for. + quidditch_command_buffer_update_processor_id(command_buffer); + + iree_hal_executable_dispatch_state_v0_t* dispatch_state = + &command_buffer->state.dispatch_state; + + // TODO(benvanik): expose on API or keep fixed on executable. + dispatch_state->workgroup_size_x = 1; + dispatch_state->workgroup_size_y = 1; + dispatch_state->workgroup_size_z = 1; + dispatch_state->workgroup_count_x = workgroup_x; + dispatch_state->workgroup_count_y = workgroup_y; + dispatch_state->workgroup_count_z = workgroup_z; + + dispatch_state->max_concurrency = snrt_cluster_compute_core_num(); + + // Push constants are pulled directly from the command buffer state, but we + // only allow the dispatch to read what we know is initialized based on the + // layout. + dispatch_state->push_constant_count = local_layout->push_constants; + + // Produce the dense binding list based on the declared bindings used. + // This allows us to change the descriptor sets and bindings counts supported + // in the HAL independent of any executable as each executable just gets the + // flat dense list and doesn't care about our descriptor set stuff. + // + // Note that we are just directly setting the binding data pointers here with + // no ownership/retaining/etc - it's part of the HAL contract that buffers are + // kept valid for the duration they may be in use. + iree_hal_local_binding_mask_t used_binding_mask = local_layout->used_bindings; + iree_host_size_t used_binding_count = + iree_math_count_ones_u64(used_binding_mask); + dispatch_state->binding_count = used_binding_count; + void** binding_ptrs = (void**)dispatch_state->binding_ptrs; + size_t* binding_lengths = (size_t*)dispatch_state->binding_lengths; + iree_host_size_t binding_base = 0; + for (iree_host_size_t i = 0; i < used_binding_count; ++i) { + int mask_offset = iree_math_count_trailing_zeros_u64(used_binding_mask); + int binding_ordinal = binding_base + mask_offset; + binding_base += mask_offset + 1; + used_binding_mask = iree_shr(used_binding_mask, mask_offset + 1); + binding_ptrs[i] = command_buffer->state.full_bindings[binding_ordinal]; + if (!binding_ptrs[i]) { + return iree_make_status(IREE_STATUS_FAILED_PRECONDITION, + "(flat) binding %d is NULL", binding_ordinal); + } + binding_lengths[i] = + command_buffer->state.full_binding_lengths[binding_ordinal]; + } + + // TODO(benvanik): plumb through an arena or fixed-size reservation to use. + // For now when deploying to devices where you want something like the + // inline command buffer you probably don't want 256KB of transient memory + // getting allocated and retained implicitly - this should be a compiler + // option. For now we just malloc here to make things work and strongly + // encourage the kind of user who wants synchronous inline execution to not + // also want tons of scratch memory. + iree_byte_span_t local_memory = iree_make_byte_span(NULL, local_memory_size); + if (local_memory_size > 0) { + IREE_RETURN_IF_ERROR(iree_allocator_malloc(command_buffer->host_allocator, + local_memory_size, + (void**)&local_memory.data)); + } + + // Since we are running on a borrowed thread, we know nothing about the + // floating point state. Reset it. + iree_fpu_state_t fpu_state = + iree_fpu_state_push(IREE_FPU_STATE_FLAG_FLUSH_DENORMALS_TO_ZERO); + iree_status_t status = quidditch_executable_issue_dispatch_inline( + local_executable, entry_point, dispatch_state, + command_buffer->state.processor_id, local_memory); + iree_fpu_state_pop(fpu_state); + + if (local_memory.data) { + iree_allocator_free(command_buffer->host_allocator, local_memory.data); + } + return status; +} + +typedef union iree_hal_vec3_t { + struct { + uint32_t x; + uint32_t y; + uint32_t z; + }; + uint32_t value[3]; +} iree_hal_vec3_t; + +static iree_status_t quidditch_command_buffer_dispatch_indirect( + iree_hal_command_buffer_t* base_command_buffer, + iree_hal_executable_t* executable, int32_t entry_point, + iree_hal_buffer_t* workgroups_buffer, + iree_device_size_t workgroups_offset) { + // TODO(benvanik): track mapping so we can properly map/unmap/flush/etc. + iree_hal_buffer_mapping_t buffer_mapping = {{0}}; + IREE_RETURN_IF_ERROR(iree_hal_buffer_map_range( + workgroups_buffer, IREE_HAL_MAPPING_MODE_PERSISTENT, + IREE_HAL_MEMORY_ACCESS_READ, workgroups_offset, 3 * sizeof(uint32_t), + &buffer_mapping)); + iree_hal_vec3_t workgroup_count = + *(const iree_hal_vec3_t*)buffer_mapping.contents.data; + return quidditch_command_buffer_dispatch( + base_command_buffer, executable, entry_point, workgroup_count.x, + workgroup_count.y, workgroup_count.z); +} + +//===----------------------------------------------------------------------===// +// iree_hal_command_buffer_execute_commands +//===----------------------------------------------------------------------===// + +static iree_status_t quidditch_command_buffer_execute_commands( + iree_hal_command_buffer_t* base_command_buffer, + iree_hal_command_buffer_t* base_commands, + iree_hal_buffer_binding_table_t binding_table) { + // TODO(#10144): decide how to execute the inline command buffer; it is + // definitely a deferred command buffer but we don't want to force that + // dependency here. We could allow injection of a function to call to execute + // command buffers so that the device can decide how it wants to handle them. + return iree_make_status(IREE_STATUS_UNIMPLEMENTED, + "indirect command buffers not yet implemented"); +} + +//===----------------------------------------------------------------------===// +// iree_hal_command_buffer_vtable_t +//===----------------------------------------------------------------------===// + +static const iree_hal_command_buffer_vtable_t quidditch_command_buffer_vtable = + { + .destroy = quidditch_command_buffer_destroy, + .begin = quidditch_command_buffer_begin, + .end = quidditch_command_buffer_end, + .begin_debug_group = quidditch_command_buffer_begin_debug_group, + .end_debug_group = quidditch_command_buffer_end_debug_group, + .execution_barrier = quidditch_command_buffer_execution_barrier, + .signal_event = quidditch_command_buffer_signal_event, + .reset_event = quidditch_command_buffer_reset_event, + .wait_events = quidditch_command_buffer_wait_events, + .discard_buffer = quidditch_command_buffer_discard_buffer, + .fill_buffer = quidditch_command_buffer_fill_buffer, + .update_buffer = quidditch_command_buffer_update_buffer, + .copy_buffer = quidditch_command_buffer_copy_buffer, + .collective = quidditch_command_buffer_collective, + .push_constants = quidditch_command_buffer_push_constants, + .push_descriptor_set = quidditch_command_buffer_push_descriptor_set, + .dispatch = quidditch_command_buffer_dispatch, + .dispatch_indirect = quidditch_command_buffer_dispatch_indirect, + .execute_commands = quidditch_command_buffer_execute_commands, +}; diff --git a/runtime/runtime/src/Quidditch/command_buffer/command_buffer.h b/runtime/runtime/src/Quidditch/command_buffer/command_buffer.h new file mode 100644 index 0000000..1de1948 --- /dev/null +++ b/runtime/runtime/src/Quidditch/command_buffer/command_buffer.h @@ -0,0 +1,62 @@ +// Copyright 2021 The IREE Authors +// +// Licensed under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#pragma once + +#include "iree/base/api.h" +#include "iree/hal/api.h" + +#ifdef __cplusplus +extern "C" { +#endif // __cplusplus + +// Returns the size, in bytes, of an inline command buffer. +// This can be used for arena/stack allocations along with +// quidditch_command_buffer_initialize/quidditch_command_buffer_deinitialize. +iree_host_size_t quidditch_command_buffer_size(void); + +// Initializes an inline synchronous one-shot single-threaded command "buffer". +// This is equivalent to quidditch_command_buffer_create but uses +// caller-allocated |storage| (must be at least the capacity specified by +// quidditch_command_buffer_size). +// +// NOTE: this must only be used when the command buffer handle cannot escape +// the caller: attempting to use the resulting command buffer as a ref object +// is invalid. +iree_status_t quidditch_command_buffer_initialize( + iree_hal_device_t* device, iree_hal_command_buffer_mode_t mode, + iree_hal_command_category_t command_categories, + iree_hal_queue_affinity_t queue_affinity, iree_host_size_t binding_capacity, + iree_allocator_t host_allocator, iree_byte_span_t storage, + iree_hal_command_buffer_t** out_command_buffer); + +// Deinitializes an inline command buffer previously initialized with +// quidditch_command_buffer_initialize. +void quidditch_command_buffer_deinitialize( + iree_hal_command_buffer_t* command_buffer); + +// Creates an inline synchronous one-shot single-threaded command "buffer". +// This is designed for ultra-low latency situations where we know the command +// buffer is going to be submitted with no wait semaphores indicating that it +// can begin execution immediately. No inter-command-buffer scheduling will be +// performed and all barriers and events are ignored. +// +// Executes all work on the calling thread synchronously (today). +// +// Must have IREE_HAL_COMMAND_BUFFER_MODE_ALLOW_INLINE_EXECUTION set. +iree_status_t quidditch_command_buffer_create( + iree_hal_device_t* device, iree_hal_command_buffer_mode_t mode, + iree_hal_command_category_t command_categories, + iree_hal_queue_affinity_t queue_affinity, iree_host_size_t binding_capacity, + iree_allocator_t host_allocator, + iree_hal_command_buffer_t** out_command_buffer); + +// Returns true if |command_buffer| is an inline command buffer. +bool quidditch_command_buffer_isa(iree_hal_command_buffer_t* command_buffer); + +#ifdef __cplusplus +} // extern "C" +#endif // __cplusplus diff --git a/runtime/runtime/src/Quidditch/device/CMakeLists.txt b/runtime/runtime/src/Quidditch/device/CMakeLists.txt new file mode 100644 index 0000000..b242fac --- /dev/null +++ b/runtime/runtime/src/Quidditch/device/CMakeLists.txt @@ -0,0 +1,24 @@ +add_subdirectory(registration) + +iree_cc_library( + NAME + device + SRCS + device.c + event.c + semaphore.c + DEPS + snRuntime + iree::base + iree::base::internal + iree::base::internal::arena + iree::base::internal::synchronization + iree::hal::local::executable_library_util + iree::hal::local::executable_loader + iree::hal::utils::deferred_command_buffer + iree::hal::utils::file_transfer + iree::hal::utils::memory_file + iree::hal::utils::semaphore_base + Quidditch::command_buffer::command_buffer + PUBLIC +) diff --git a/runtime/runtime/src/Quidditch/device.c b/runtime/runtime/src/Quidditch/device/device.c similarity index 98% rename from runtime/runtime/src/Quidditch/device.c rename to runtime/runtime/src/Quidditch/device/device.c index 5a288bb..6c07dce 100644 --- a/runtime/runtime/src/Quidditch/device.c +++ b/runtime/runtime/src/Quidditch/device/device.c @@ -10,11 +10,11 @@ #include #include +#include "Quidditch/command_buffer/command_buffer.h" #include "event.h" #include "iree/base/internal/arena.h" #include "iree/base/internal/cpu.h" #include "iree/hal/local/executable_environment.h" -#include "iree/hal/local/inline_command_buffer.h" #include "iree/hal/local/local_executable_cache.h" #include "iree/hal/local/local_pipeline_layout.h" #include "iree/hal/utils/deferred_command_buffer.h" @@ -233,7 +233,7 @@ static iree_status_t quidditch_device_create_command_buffer( iree_hal_command_buffer_t** out_command_buffer) { if (iree_all_bits_set(mode, IREE_HAL_COMMAND_BUFFER_MODE_ALLOW_INLINE_EXECUTION)) { - return iree_hal_inline_command_buffer_create( + return quidditch_command_buffer_create( base_device, mode, command_categories, queue_affinity, binding_capacity, iree_hal_device_host_allocator(base_device), out_command_buffer); } else { @@ -353,8 +353,8 @@ static iree_status_t quidditch_device_apply_deferred_command_buffers( // the deferred command buffers. We want to reset it between each apply so // that we don't get state carrying across. iree_byte_span_t storage = - iree_make_byte_span(iree_alloca(iree_hal_inline_command_buffer_size()), - iree_hal_inline_command_buffer_size()); + iree_make_byte_span(iree_alloca(quidditch_command_buffer_size()), + quidditch_command_buffer_size()); // NOTE: we ignore any inline command buffers that may be passed in as they've // already executed during recording. The caller is probably in for a bad time @@ -363,7 +363,7 @@ static iree_status_t quidditch_device_apply_deferred_command_buffers( iree_hal_command_buffer_t* command_buffer = command_buffers[i]; if (iree_hal_deferred_command_buffer_isa(command_buffer)) { iree_hal_command_buffer_t* inline_command_buffer = NULL; - IREE_RETURN_IF_ERROR(iree_hal_inline_command_buffer_initialize( + IREE_RETURN_IF_ERROR(quidditch_command_buffer_initialize( (iree_hal_device_t*)device, iree_hal_command_buffer_mode(command_buffer) | IREE_HAL_COMMAND_BUFFER_MODE_ALLOW_INLINE_EXECUTION, @@ -373,7 +373,7 @@ static iree_status_t quidditch_device_apply_deferred_command_buffers( iree_status_t status = iree_hal_deferred_command_buffer_apply( command_buffer, inline_command_buffer, iree_hal_buffer_binding_table_empty()); - iree_hal_inline_command_buffer_deinitialize(inline_command_buffer); + quidditch_command_buffer_deinitialize(inline_command_buffer); IREE_RETURN_IF_ERROR(status); } } diff --git a/runtime/runtime/src/Quidditch/device.h b/runtime/runtime/src/Quidditch/device/device.h similarity index 100% rename from runtime/runtime/src/Quidditch/device.h rename to runtime/runtime/src/Quidditch/device/device.h diff --git a/runtime/runtime/src/Quidditch/event.c b/runtime/runtime/src/Quidditch/device/event.c similarity index 100% rename from runtime/runtime/src/Quidditch/event.c rename to runtime/runtime/src/Quidditch/device/event.c diff --git a/runtime/runtime/src/Quidditch/event.h b/runtime/runtime/src/Quidditch/device/event.h similarity index 100% rename from runtime/runtime/src/Quidditch/event.h rename to runtime/runtime/src/Quidditch/device/event.h diff --git a/runtime/runtime/src/Quidditch/registration/CMakeLists.txt b/runtime/runtime/src/Quidditch/device/registration/CMakeLists.txt similarity index 100% rename from runtime/runtime/src/Quidditch/registration/CMakeLists.txt rename to runtime/runtime/src/Quidditch/device/registration/CMakeLists.txt diff --git a/runtime/runtime/src/Quidditch/registration/registration.c b/runtime/runtime/src/Quidditch/device/registration/registration.c similarity index 100% rename from runtime/runtime/src/Quidditch/registration/registration.c rename to runtime/runtime/src/Quidditch/device/registration/registration.c diff --git a/runtime/runtime/src/Quidditch/registration/registration.h b/runtime/runtime/src/Quidditch/device/registration/registration.h similarity index 100% rename from runtime/runtime/src/Quidditch/registration/registration.h rename to runtime/runtime/src/Quidditch/device/registration/registration.h diff --git a/runtime/runtime/src/Quidditch/semaphore.c b/runtime/runtime/src/Quidditch/device/semaphore.c similarity index 100% rename from runtime/runtime/src/Quidditch/semaphore.c rename to runtime/runtime/src/Quidditch/device/semaphore.c diff --git a/runtime/runtime/src/Quidditch/semaphore.h b/runtime/runtime/src/Quidditch/device/semaphore.h similarity index 100% rename from runtime/runtime/src/Quidditch/semaphore.h rename to runtime/runtime/src/Quidditch/device/semaphore.h diff --git a/runtime/runtime/src/Quidditch/dispatch/CMakeLists.txt b/runtime/runtime/src/Quidditch/dispatch/CMakeLists.txt new file mode 100644 index 0000000..5175394 --- /dev/null +++ b/runtime/runtime/src/Quidditch/dispatch/CMakeLists.txt @@ -0,0 +1,11 @@ + +iree_cc_library( + NAME + dispatch + SRCS + dispatch.c + DEPS + snRuntime + iree::base + PUBLIC +) diff --git a/runtime/runtime/src/Quidditch/dispatch/dispatch.c b/runtime/runtime/src/Quidditch/dispatch/dispatch.c new file mode 100644 index 0000000..1abb28e --- /dev/null +++ b/runtime/runtime/src/Quidditch/dispatch/dispatch.c @@ -0,0 +1,90 @@ + +#include "dispatch.h" + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "iree/base/alignment.h" + +// TODO: This should be cluster local. +static struct worker_metadata_t { + atomic_uint workers_waiting; + atomic_bool exit; +} worker_metadata = {0, false}; + +// TODO: All of this synchronization in this file could use hardware barriers +// which might be more efficient. +static void park_worker() { + worker_metadata.workers_waiting++; + asm volatile("wfi"); + snrt_int_cluster_clr(1 << snrt_cluster_core_idx()); + worker_metadata.workers_waiting--; +} + +static void wake_all_workers() { + assert(snrt_is_dm_core() && "DM core is currently our host"); + uint32_t compute_cores = snrt_cluster_compute_core_num(); + // Compute cores are indices 0 to compute_cores. + snrt_int_cluster_set((1 << compute_cores) - 1); +} + +void quidditch_dispatch_wait_for_workers() { + assert(snrt_is_dm_core() && "DM core is currently our host"); + // Spin until all compute corkers are parked. + while (worker_metadata.workers_waiting != snrt_cluster_compute_core_num()) + ; +} + +// TODO: This only works for a single cluster by using globals. Should be +// cluster local. +static iree_hal_executable_dispatch_v0_t configuredKernel; +static const iree_hal_executable_environment_v0_t* configuredEnvironment; +static const iree_hal_executable_dispatch_state_v0_t* configuredDispatchState; +static iree_alignas(64) iree_hal_executable_workgroup_state_v0_t + configuredWorkgroupState[SNRT_CLUSTER_CORE_NUM - 1]; +static atomic_bool error = false; + +bool quidditch_dispatch_errors_occurred() { return error; } + +void quidditch_dispatch_set_kernel( + iree_hal_executable_dispatch_v0_t kernel, + const iree_hal_executable_environment_v0_t* environment, + const iree_hal_executable_dispatch_state_v0_t* dispatch_state) { + configuredKernel = kernel; + configuredEnvironment = environment; + configuredDispatchState = dispatch_state; +} + +int quidditch_dispatch_enter_worker_loop() { + snrt_interrupt_enable(IRQ_M_CLUSTER); + + while (!worker_metadata.exit) { + park_worker(); + if (worker_metadata.exit) break; + + if (configuredKernel(configuredEnvironment, configuredDispatchState, + &configuredWorkgroupState[snrt_cluster_core_idx()])) + error = true; + } + + snrt_interrupt_disable(IRQ_M_CLUSTER); + return 0; +} + +void quidditch_dispatch_quit() { + quidditch_dispatch_wait_for_workers(); + worker_metadata.exit = true; + wake_all_workers(); +} + +void quidditch_dispatch_submit_workgroup( + const iree_hal_executable_workgroup_state_v0_t* workgroup_state) { + configuredWorkgroupState[workgroup_state->processor_id] = *workgroup_state; + snrt_int_cluster_set(1 << workgroup_state->processor_id); +} diff --git a/runtime/runtime/src/Quidditch/dispatch/dispatch.h b/runtime/runtime/src/Quidditch/dispatch/dispatch.h new file mode 100644 index 0000000..d7e7f4c --- /dev/null +++ b/runtime/runtime/src/Quidditch/dispatch/dispatch.h @@ -0,0 +1,36 @@ + +#pragma once + +#include +#include + +#include "iree/hal/local/executable_library.h" + +/// Entry point for compute cores to be parked and called upon for kernel +/// execution. Cores are halted within the function until +/// 'quidditch_dispatch_quit' is called. +int quidditch_dispatch_enter_worker_loop(void); + +/// Called by the host core before exiting to release all computes cores from +/// the work loop. +void quidditch_dispatch_quit(void); + +/// Causes the host core to wait for all workers to enter a parked state again. +void quidditch_dispatch_wait_for_workers(void); + +/// Returns true if any kernel execution of any compute core ever caused an +/// error. +bool quidditch_dispatch_errors_occurred(); + +/// Configures the kernel, environment and dispatch state to use for subsequent +/// 'quidditch_dispatch_submit_workgroup' calls. It is impossible for a cluster +/// to execute more than one kernel at a time. +void quidditch_dispatch_set_kernel( + iree_hal_executable_dispatch_v0_t kernel, + const iree_hal_executable_environment_v0_t* environment, + const iree_hal_executable_dispatch_state_v0_t* dispatch_state); + +/// Dispatches the compute core with the id 'workgroup_state->processorId' to +/// execute the last configured kernel with the given workgroup state. +void quidditch_dispatch_submit_workgroup( + const iree_hal_executable_workgroup_state_v0_t* workgroup_state); diff --git a/runtime/runtime/src/Quidditch/executable/CMakeLists.txt b/runtime/runtime/src/Quidditch/executable/CMakeLists.txt new file mode 100644 index 0000000..a988742 --- /dev/null +++ b/runtime/runtime/src/Quidditch/executable/CMakeLists.txt @@ -0,0 +1,15 @@ + +iree_cc_library( + NAME + executable + HDRS + "executable.h" + SRCS + "executable.c" + DEPS + snRuntime + Quidditch::dispatch::dispatch + iree::base + iree::hal::local::executable_library + PUBLIC +) diff --git a/runtime/runtime/src/Quidditch/executable/executable.c b/runtime/runtime/src/Quidditch/executable/executable.c new file mode 100644 index 0000000..a5f79d5 --- /dev/null +++ b/runtime/runtime/src/Quidditch/executable/executable.c @@ -0,0 +1,215 @@ +// Copyright 2020 The IREE Authors +// +// Licensed under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#include "executable.h" + +#include + +#include "Quidditch/dispatch/dispatch.h" +#include "iree/hal/local/executable_environment.h" +#include "iree/hal/local/executable_library_util.h" + +static void quidditch_executable_destroy( + iree_hal_executable_t* base_executable) { + quidditch_executable_t* executable = (quidditch_executable_t*)base_executable; + iree_allocator_t host_allocator = executable->host_allocator; + IREE_TRACE_ZONE_BEGIN(z0); + + iree_hal_executable_library_deinitialize_imports(&executable->environment, + host_allocator); + + for (iree_host_size_t i = 0; i < executable->pipeline_layout_count; ++i) { + iree_hal_pipeline_layout_release(executable->pipeline_layouts[i]); + } + iree_allocator_free(host_allocator, executable); + + IREE_TRACE_ZONE_END(z0); +} + +static const iree_hal_executable_vtable_t quidditch_executable_vtable = { + .destroy = quidditch_executable_destroy, +}; + +void quidditch_executable_initialize( + iree_host_size_t pipeline_layout_count, + iree_hal_pipeline_layout_t* const* source_pipeline_layouts, + iree_hal_pipeline_layout_t** target_pipeline_layouts, + iree_allocator_t host_allocator, + quidditch_executable_t* out_base_executable) { + iree_hal_resource_initialize(&quidditch_executable_vtable, + &out_base_executable->resource); + out_base_executable->host_allocator = host_allocator; + + out_base_executable->pipeline_layout_count = pipeline_layout_count; + out_base_executable->pipeline_layouts = target_pipeline_layouts; + for (iree_host_size_t i = 0; i < pipeline_layout_count; ++i) { + target_pipeline_layouts[i] = source_pipeline_layouts[i]; + iree_hal_pipeline_layout_retain(source_pipeline_layouts[i]); + } + + // Function attributes are optional and populated by the parent type. + out_base_executable->dispatch_attrs = NULL; + + // Default environment with no imports assigned. + iree_hal_executable_environment_initialize(host_allocator, + &out_base_executable->environment); +} + +static int quidditch_executable_import_thunk_v0( + iree_hal_executable_import_v0_t fn_ptr, void* params, void* context, + void* reserved) { + return fn_ptr(params, context, reserved); +} + +iree_status_t quidditch_executable_create( + const iree_hal_executable_params_t* executable_params, + const iree_hal_executable_library_header_t** library_header, + const iree_hal_executable_import_provider_t import_provider, + iree_allocator_t host_allocator, iree_hal_executable_t** out_executable) { + IREE_ASSERT_ARGUMENT(executable_params); + IREE_ASSERT_ARGUMENT(!executable_params->pipeline_layout_count || + executable_params->pipeline_layouts); + IREE_ASSERT_ARGUMENT(!executable_params->constant_count || + executable_params->constants); + IREE_ASSERT_ARGUMENT(library_header); + IREE_ASSERT_ARGUMENT(out_executable); + *out_executable = NULL; + IREE_TRACE_ZONE_BEGIN(z0); + + quidditch_executable_t* executable = NULL; + iree_host_size_t total_size = + sizeof(*executable) + + executable_params->pipeline_layout_count * sizeof(*executable->layouts) + + executable_params->constant_count * sizeof(*executable_params->constants); + iree_status_t status = + iree_allocator_malloc(host_allocator, total_size, (void**)&executable); + if (iree_status_is_ok(status)) { + quidditch_executable_initialize(executable_params->pipeline_layout_count, + executable_params->pipeline_layouts, + &executable->layouts[0], host_allocator, + executable); + executable->library.header = library_header; + executable->identifier = iree_make_cstring_view((*library_header)->name); + executable->dispatch_attrs = executable->library.v0->exports.attrs; + } + + // Copy executable constants so we own them. + if (iree_status_is_ok(status) && executable_params->constant_count > 0) { + uint32_t* target_constants = + (uint32_t*)((uint8_t*)executable + sizeof(*executable) + + executable_params->pipeline_layout_count * + sizeof(*executable->layouts)); + memcpy(target_constants, executable_params->constants, + executable_params->constant_count * + sizeof(*executable_params->constants)); + executable->environment.constants = target_constants; + } + + // Resolve imports, if any. + if (iree_status_is_ok(status)) { + status = iree_hal_executable_library_initialize_imports( + &executable->environment, import_provider, + &executable->library.v0->imports, quidditch_executable_import_thunk_v0, + host_allocator); + } + + // Verify that the library matches the executable params. + if (iree_status_is_ok(status)) { + status = iree_hal_executable_library_verify(executable_params, + executable->library.v0); + } + + // Publish the executable sources with the tracing infrastructure. + if (iree_status_is_ok(status)) { + iree_hal_executable_library_publish_source_files(executable->library.v0); + } + + if (iree_status_is_ok(status)) { + *out_executable = (iree_hal_executable_t*)executable; + } else { + *out_executable = NULL; + } + IREE_TRACE_ZONE_END(z0); + return status; +} + +quidditch_executable_t* quidditch_executable_cast( + iree_hal_executable_t* base_value) { + return (quidditch_executable_t*)base_value; +} + +iree_status_t quidditch_executable_issue_dispatch_inline( + quidditch_executable_t* executable, iree_host_size_t ordinal, + const iree_hal_executable_dispatch_state_v0_t* dispatch_state, + uint32_t processor_id, iree_byte_span_t local_memory) { + IREE_TRACE_ZONE_BEGIN(z0); + // TODO(benvanik): annotate with executable name to calculate total time. + + const uint32_t workgroup_count_x = dispatch_state->workgroup_count_x; + const uint32_t workgroup_count_y = dispatch_state->workgroup_count_y; + const uint32_t workgroup_count_z = dispatch_state->workgroup_count_z; + +#if IREE_HAL_VERBOSE_TRACING_ENABLE + // TODO(benvanik): tracing.h helper that speeds this up; too slow. + IREE_TRACE({ + char xyz_string[32]; + int xyz_string_length = + snprintf(xyz_string, IREE_ARRAYSIZE(xyz_string), "%ux%ux%u", + workgroup_count_x, workgroup_count_y, workgroup_count_z); + IREE_TRACE_ZONE_APPEND_TEXT_STRING_VIEW(z0, xyz_string, xyz_string_length); + }); +#endif // IREE_HAL_VERBOSE_TRACING_ENABLE + + iree_hal_executable_workgroup_state_v0_t workgroup_state; + + workgroup_state.local_memory = local_memory.data; + workgroup_state.local_memory_size = (size_t)local_memory.data_length; + + const iree_hal_executable_library_v0_t* library = executable->library.v0; + + if (IREE_UNLIKELY(ordinal >= library->exports.count)) { + return iree_make_status(IREE_STATUS_INVALID_ARGUMENT, + "entry point ordinal out of bounds"); + } + iree_hal_executable_dispatch_v0_t kernel = library->exports.ptrs[ordinal]; + + // Note that this is technically not required as kernel execution has as + // post-condition that all compute cores are parked. + quidditch_dispatch_wait_for_workers(); + + quidditch_dispatch_set_kernel(kernel, &executable->environment, + dispatch_state); + + uint32_t worker_to_pop = 0; + for (uint32_t z = 0; z < workgroup_count_z; ++z) { + workgroup_state.workgroup_id_z = z; + for (uint32_t y = 0; y < workgroup_count_y; ++y) { + workgroup_state.workgroup_id_y = y; + for (uint32_t x = 0; x < workgroup_count_x; ++x) { + workgroup_state.workgroup_id_x = x; + workgroup_state.processor_id = worker_to_pop; + + quidditch_dispatch_submit_workgroup(&workgroup_state); + + worker_to_pop++; + if (worker_to_pop == snrt_cluster_compute_core_num()) { + // Wait for all workers to be done before scheduling more of them. + // This is easier than waiting for the next compute core to be done. + quidditch_dispatch_wait_for_workers(); + worker_to_pop = 0; + } + } + } + } + + quidditch_dispatch_wait_for_workers(); + + if (quidditch_dispatch_errors_occurred()) + return iree_make_status(IREE_STATUS_INTERNAL); + + IREE_TRACE_ZONE_END(z0); + return iree_ok_status(); +} diff --git a/runtime/runtime/src/Quidditch/executable/executable.h b/runtime/runtime/src/Quidditch/executable/executable.h new file mode 100644 index 0000000..ab3c1e1 --- /dev/null +++ b/runtime/runtime/src/Quidditch/executable/executable.h @@ -0,0 +1,85 @@ +// Copyright 2020 The IREE Authors +// +// Licensed under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#pragma once + +#include + +#include "iree/base/api.h" +#include "iree/hal/api.h" +#include "iree/hal/local/executable_library.h" + +#ifdef __cplusplus +extern "C" { +#endif // __cplusplus + +typedef struct quidditch_executable_t { + iree_hal_resource_t resource; + iree_allocator_t host_allocator; + + // Optional pipeline layout + // Not all users require the layouts (such as when directly calling executable + // functions) and in those cases they can be omitted. Users routing through + // the HAL command buffer APIs will usually require them. + // + // TODO(benvanik): make this a flag we set and can query instead - poking into + // this from dispatch code is a layering violation. + iree_host_size_t pipeline_layout_count; + iree_hal_pipeline_layout_t** pipeline_layouts; + + // Defines per-entry point how much workgroup local memory is required. + // Contains entries with 0 to indicate no local memory is required or >0 in + // units of IREE_HAL_WORKGROUP_LOCAL_MEMORY_PAGE_SIZE for the minimum amount + // of memory required by the function. + const iree_hal_executable_dispatch_attrs_v0_t* dispatch_attrs; + + // Execution environment. + iree_hal_executable_environment_v0_t environment; + + // Name used for the file field in tracy and debuggers. + iree_string_view_t identifier; + + union { + const iree_hal_executable_library_header_t** header; + const iree_hal_executable_library_v0_t* v0; + } library; + + iree_hal_pipeline_layout_t* layouts[]; +} quidditch_executable_t; + +iree_status_t quidditch_executable_create( + const iree_hal_executable_params_t* executable_params, + const iree_hal_executable_library_header_t** library_header, + iree_hal_executable_import_provider_t import_provider, + iree_allocator_t host_allocator, iree_hal_executable_t** out_executable); + +// Initializes the local executable base type. +// +// Callers must allocate memory for |target_pipeline_layouts| with at least +// `pipeline_layout_count * sizeof(*target_pipeline_layouts)` bytes. +void quidditch_executable_initialize( + iree_host_size_t pipeline_layout_count, + iree_hal_pipeline_layout_t* const* source_pipeline_layouts, + iree_hal_pipeline_layout_t** target_pipeline_layouts, + iree_allocator_t host_allocator, + quidditch_executable_t* out_base_executable); + +quidditch_executable_t* quidditch_executable_cast( + iree_hal_executable_t* base_value); + +iree_status_t quidditch_executable_issue_call( + quidditch_executable_t* executable, iree_host_size_t ordinal, + const iree_hal_executable_dispatch_state_v0_t* dispatch_state, + const iree_hal_executable_workgroup_state_v0_t* workgroup_state); + +iree_status_t quidditch_executable_issue_dispatch_inline( + quidditch_executable_t* executable, iree_host_size_t ordinal, + const iree_hal_executable_dispatch_state_v0_t* dispatch_state, + uint32_t processor_id, iree_byte_span_t local_memory); + +#ifdef __cplusplus +} // extern "C" +#endif // __cplusplus diff --git a/runtime/runtime/src/Quidditch/loader/CMakeLists.txt b/runtime/runtime/src/Quidditch/loader/CMakeLists.txt new file mode 100644 index 0000000..831537a --- /dev/null +++ b/runtime/runtime/src/Quidditch/loader/CMakeLists.txt @@ -0,0 +1,12 @@ + +iree_cc_library( + NAME + loader + SRCS + loader.c + DEPS + snRuntime + iree::base + Quidditch::executable::executable + PUBLIC +) diff --git a/runtime/runtime/src/Quidditch/loader/loader.c b/runtime/runtime/src/Quidditch/loader/loader.c new file mode 100644 index 0000000..6de920f --- /dev/null +++ b/runtime/runtime/src/Quidditch/loader/loader.c @@ -0,0 +1,143 @@ +#include "loader.h" + +#include "Quidditch/executable/executable.h" +#include "iree/hal/local/executable_environment.h" + +//===----------------------------------------------------------------------===// +// quidditch_loader_t, fork of iree_hal_static_library_loader_t. +//===----------------------------------------------------------------------===// + +typedef struct quidditch_loader_t { + iree_hal_executable_loader_t base; + iree_allocator_t host_allocator; + iree_host_size_t library_count; + const iree_hal_executable_library_header_t** const libraries[]; +} quidditch_loader_t; + +static const iree_hal_executable_loader_vtable_t quidditch_loader_vtable; + +iree_status_t quidditch_loader_create( + iree_host_size_t library_count, + const iree_hal_executable_library_query_fn_t* library_query_fns, + iree_hal_executable_import_provider_t import_provider, + iree_allocator_t host_allocator, + iree_hal_executable_loader_t** out_executable_loader) { + IREE_ASSERT_ARGUMENT(!library_count || library_query_fns); + IREE_ASSERT_ARGUMENT(out_executable_loader); + *out_executable_loader = NULL; + IREE_TRACE_ZONE_BEGIN(z0); + + quidditch_loader_t* executable_loader = NULL; + iree_host_size_t total_size = + sizeof(*executable_loader) + + sizeof(executable_loader->libraries[0]) * library_count; + iree_status_t status = iree_allocator_malloc(host_allocator, total_size, + (void**)&executable_loader); + if (iree_status_is_ok(status)) { + iree_hal_executable_loader_initialize( + &quidditch_loader_vtable, import_provider, &executable_loader->base); + executable_loader->host_allocator = host_allocator; + executable_loader->library_count = library_count; + + // Default environment to enable initialization. + iree_hal_executable_environment_v0_t environment; + iree_hal_executable_environment_initialize(host_allocator, &environment); + + // Query and verify the libraries provided all match our expected version. + // It's rare they won't, however static libraries generated with a newer + // version of the IREE compiler that are then linked with an older version + // of the runtime are difficult to spot otherwise. + for (iree_host_size_t i = 0; i < library_count; ++i) { + const iree_hal_executable_library_header_t* const* header_ptr = + library_query_fns[i](IREE_HAL_EXECUTABLE_LIBRARY_VERSION_LATEST, + &environment); + if (!header_ptr) { + status = iree_make_status( + IREE_STATUS_UNAVAILABLE, + "failed to query library header for runtime version %d", + IREE_HAL_EXECUTABLE_LIBRARY_VERSION_LATEST); + break; + } + const iree_hal_executable_library_header_t* header = *header_ptr; + IREE_TRACE_ZONE_APPEND_TEXT(z0, header->name); + if (header->version > IREE_HAL_EXECUTABLE_LIBRARY_VERSION_LATEST) { + status = iree_make_status( + IREE_STATUS_FAILED_PRECONDITION, + "executable does not support this version of the " + "runtime (executable: %d, runtime: %d)", + header->version, IREE_HAL_EXECUTABLE_LIBRARY_VERSION_LATEST); + break; + } + memcpy((void*)&executable_loader->libraries[i], &header_ptr, + sizeof(header_ptr)); + } + } + + if (iree_status_is_ok(status)) { + *out_executable_loader = (iree_hal_executable_loader_t*)executable_loader; + } else { + iree_allocator_free(host_allocator, executable_loader); + } + + IREE_TRACE_ZONE_END(z0); + return status; +} + +static void quidditch_loader_destroy( + iree_hal_executable_loader_t* base_executable_loader) { + quidditch_loader_t* executable_loader = + (quidditch_loader_t*)base_executable_loader; + iree_allocator_t host_allocator = executable_loader->host_allocator; + IREE_TRACE_ZONE_BEGIN(z0); + + iree_allocator_free(host_allocator, executable_loader); + + IREE_TRACE_ZONE_END(z0); +} + +static bool quidditch_loader_query_support( + iree_hal_executable_loader_t* base_executable_loader, + iree_hal_executable_caching_mode_t caching_mode, + iree_string_view_t executable_format) { + return iree_string_view_equal(executable_format, + iree_make_cstring_view("static")); +} + +static iree_status_t quidditch_loader_try_load( + iree_hal_executable_loader_t* base_executable_loader, + const iree_hal_executable_params_t* executable_params, + iree_host_size_t worker_capacity, iree_hal_executable_t** out_executable) { + quidditch_loader_t* executable_loader = + (quidditch_loader_t*)base_executable_loader; + + // The executable data is just the name of the library. + iree_string_view_t library_name = iree_make_string_view( + (const char*)executable_params->executable_data.data, + executable_params->executable_data.data_length); + + // Linear scan of the registered libraries; there's usually only one per + // module (aka source model) and as such it's a small list and probably not + // worth optimizing. We could sort the libraries list by name on loader + // creation to perform a binary-search fairly easily, though, at the cost of + // the additional code size. + for (iree_host_size_t i = 0; i < executable_loader->library_count; ++i) { + const iree_hal_executable_library_header_t* header = + *executable_loader->libraries[i]; + if (iree_string_view_equal(library_name, + iree_make_cstring_view(header->name))) { + return quidditch_executable_create( + executable_params, executable_loader->libraries[i], + base_executable_loader->import_provider, + executable_loader->host_allocator, out_executable); + } + } + return iree_make_status(IREE_STATUS_NOT_FOUND, + "no static library with the name '%.*s' registered", + (int)library_name.size, library_name.data); +} + +static const iree_hal_executable_loader_vtable_t quidditch_loader_vtable = { + .destroy = quidditch_loader_destroy, + .query_support = quidditch_loader_query_support, + .try_load = quidditch_loader_try_load, +}; diff --git a/runtime/runtime/src/Quidditch/loader/loader.h b/runtime/runtime/src/Quidditch/loader/loader.h new file mode 100644 index 0000000..c7340a1 --- /dev/null +++ b/runtime/runtime/src/Quidditch/loader/loader.h @@ -0,0 +1,12 @@ + +#pragma once + +#include "iree/hal/local/executable_library.h" +#include "iree/hal/local/executable_loader.h" + +iree_status_t quidditch_loader_create( + iree_host_size_t library_count, + const iree_hal_executable_library_query_fn_t* library_query_fns, + iree_hal_executable_import_provider_t import_provider, + iree_allocator_t host_allocator, + iree_hal_executable_loader_t** out_executable_loader); diff --git a/runtime/samples/CMakeLists.txt b/runtime/samples/CMakeLists.txt index fa2a448..efa1013 100644 --- a/runtime/samples/CMakeLists.txt +++ b/runtime/samples/CMakeLists.txt @@ -12,8 +12,8 @@ target_link_libraries( iree::modules::hal iree::modules::hal::types iree::hal::local::local - iree::hal::local::loaders::static_library_loader - Quidditch::device + Quidditch::device::device + Quidditch::loader::loader simple_add_module ) diff --git a/runtime/samples/main.c b/runtime/samples/main.c index 170a116..49f6e4b 100644 --- a/runtime/samples/main.c +++ b/runtime/samples/main.c @@ -1,14 +1,16 @@ -#include +#include +#include +#include #include #include -#include #include #include #include #include #include +#include #include uint32_t snrt_l1_start_addr(); @@ -32,15 +34,17 @@ static iree_status_t setup_instance_and_device( add_dispatch_0_library_query}; iree_hal_executable_loader_t* loader; - result = iree_hal_static_library_loader_create( - IREE_ARRAYSIZE(libraries), libraries, - iree_hal_executable_import_provider_null(), host_allocator, &loader); + result = quidditch_loader_create(IREE_ARRAYSIZE(libraries), libraries, + iree_hal_executable_import_provider_null(), + host_allocator, &loader); if (!iree_status_is_ok(result)) goto error_release_vm; l1_arena.buffer = (uint8_t*)snrt_l1_start_addr(); l1_arena.length = 0; - // TODO: This is a lie and it WILL crash into our stack and CLS memory. - l1_arena.capacity = snrt_l1_end_addr() - snrt_l1_start_addr(); + unsigned stack_size_per_core = 1 << SNRT_LOG2_STACK_SIZE; + l1_arena.capacity = + (snrt_l1_end_addr() - snrt_cluster_core_num() * stack_size_per_core) - + snrt_l1_start_addr(); iree_hal_allocator_t* device_allocator; result = @@ -67,8 +71,7 @@ static iree_status_t setup_instance_and_device( } int main() { - // TODO: Remove/redirect compute cores once implemented. - if (snrt_cluster_core_idx() != 0) return 0; + if (!snrt_is_dm_core()) return quidditch_dispatch_enter_worker_loop(); double data[4]; @@ -194,5 +197,6 @@ int main() { return -1; } + quidditch_dispatch_quit(); return 0; } diff --git a/runtime/snitch_cluster/CMakeLists.txt b/runtime/snitch_cluster/CMakeLists.txt index 3b055be..de119a6 100644 --- a/runtime/snitch_cluster/CMakeLists.txt +++ b/runtime/snitch_cluster/CMakeLists.txt @@ -55,14 +55,12 @@ add_library(snRuntime ${runtime_dir}/src/snrt.c ${runtime_dir}/src/syscalls.c ) -target_include_directories(snRuntime PRIVATE - ${snRuntimeSrc}/target/snitch_cluster/sw/runtime/common/ +target_include_directories(snRuntime + PUBLIC ${snRuntimeSrc}/sw/snRuntime/src - ${snRuntimeSrc}/sw/snRuntime/src/omp ${header_dir} - PUBLIC + ${snRuntimeSrc}/target/snitch_cluster/sw/runtime/common/ ${snRuntimeSrc}/sw/snRuntime/api - ${snRuntimeSrc}/sw/snRuntime/api/omp ) add_dependencies(snRuntime cluster_gen snitch_cluster_peripheral.h) target_link_options(snRuntime INTERFACE -Tbase.ld) diff --git a/runtime/snitch_cluster/rtl/src/snitch_cluster_start.S b/runtime/snitch_cluster/rtl/src/snitch_cluster_start.S index 93259b8..5bf3389 100644 --- a/runtime/snitch_cluster/rtl/src/snitch_cluster_start.S +++ b/runtime/snitch_cluster/rtl/src/snitch_cluster_start.S @@ -6,7 +6,6 @@ #define SNRT_INIT_FP_REGS #define SNRT_INIT_GP #define SNRT_INIT_CORE_INFO -#define SNRT_INIT_CLS #define SNRT_INIT_STACK #define SNRT_INIT_TLS #define SNRT_CRT0_PARK diff --git a/runtime/snitch_cluster/rtl/src/snitch_cluster_start.c b/runtime/snitch_cluster/rtl/src/snitch_cluster_start.c index d2dcffc..4593bc5 100644 --- a/runtime/snitch_cluster/rtl/src/snitch_cluster_start.c +++ b/runtime/snitch_cluster/rtl/src/snitch_cluster_start.c @@ -4,8 +4,6 @@ #define SNRT_INIT_TLS #define SNRT_INIT_BSS -#define SNRT_INIT_CLS -#define SNRT_INIT_LIBS #define SNRT_CRT0_PRE_BARRIER #define SNRT_INVOKE_MAIN #define SNRT_CRT0_POST_BARRIER diff --git a/runtime/snitch_cluster/rtl/src/snrt.c b/runtime/snitch_cluster/rtl/src/snrt.c index d97be42..9709e3c 100644 --- a/runtime/snitch_cluster/rtl/src/snrt.c +++ b/runtime/snitch_cluster/rtl/src/snrt.c @@ -4,12 +4,8 @@ #include "snrt.h" -#include "alloc.c" -#include "cls.c" #include "cluster_interrupts.c" #include "dma.c" -#include "eu.c" -#include "kmp.c" #include "printf.c" #include "riscv.c" #include "snitch_cluster_start.c" @@ -24,11 +20,6 @@ extern uint32_t snrt_l1_end_addr(); extern volatile uint32_t *snrt_zero_memory_ptr(); -extern cls_t* cls(); - -extern snrt_allocator_t *snrt_l1_allocator(); -extern snrt_allocator_t *snrt_l3_allocator(); - extern uint32_t snrt_global_all_to_all_reduction(uint32_t value); extern uint32_t snrt_global_compute_core_num(); diff --git a/runtime/snitch_cluster/rtl/src/snrt.h b/runtime/snitch_cluster/rtl/src/snrt.h index 7b75cf3..c354c24 100644 --- a/runtime/snitch_cluster/rtl/src/snrt.h +++ b/runtime/snitch_cluster/rtl/src/snrt.h @@ -12,24 +12,16 @@ #include "snitch_cluster_memory.h" // Forward declarations -#include "alloc_decls.h" -#include "cls_decls.h" #include "riscv_decls.h" #include "start_decls.h" #include "sync_decls.h" #include "team_decls.h" // Implementation -#include "alloc.h" -#include "cls.h" #include "cluster_interrupts.h" #include "dma.h" #include "dump.h" -#include "eu.h" -#include "kmp.h" -#include "omp.h" #include "perf_cnt.h" -#include "printf.h" #include "riscv.h" #include "snitch_cluster_global_interrupts.h" #include "ssr.h" diff --git a/runtime/snitch_cluster/rtl/src/syscalls.c b/runtime/snitch_cluster/rtl/src/syscalls.c index b192e9f..2edc46c 100644 --- a/runtime/snitch_cluster/rtl/src/syscalls.c +++ b/runtime/snitch_cluster/rtl/src/syscalls.c @@ -48,7 +48,14 @@ ssize_t write(int file, const void *ptr, size_t len) { return old_len; } -void *sbrk(ptrdiff_t incr) { return snrt_l3alloc(incr); } +extern uint8_t _edram; +static uint8_t *heap = &_edram; + +void *sbrk(ptrdiff_t incr) { + uint8_t *result = heap; + heap += incr; + return result; +} void _exit(int exitCode) { asm volatile("wfi"); diff --git a/runtime/tests/CMakeLists.txt b/runtime/tests/CMakeLists.txt index 0442c9d..57eac73 100644 --- a/runtime/tests/CMakeLists.txt +++ b/runtime/tests/CMakeLists.txt @@ -4,6 +4,10 @@ include(CTest) add_executable(HelloWorld main.c) target_link_libraries(HelloWorld snRuntime) +macro(test_executable target_name) + file(MAKE_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/${target_name}.test) + add_test(NAME ${target_name} COMMAND ${target_name} WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/${target_name}.test) +endmacro() -add_test(NAME HelloWorld COMMAND HelloWorld) -add_test(NAME IREE_HelloWorld COMMAND IREE_HelloWorld) +test_executable(HelloWorld) +test_executable(IREE_HelloWorld)