diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml index 28fcb41..b19a894 100644 --- a/.github/workflows/build.yaml +++ b/.github/workflows/build.yaml @@ -90,5 +90,4 @@ jobs: - name: Test Runtime working-directory: ${{github.workspace}}/quidditch-runtime-build - # TODO: This should run a proper test suite once we are no longer using verilator. - run: ctest --extra-verbose -j$(nproc) -R HelloWorld + run: ctest --extra-verbose -j$(nproc) diff --git a/codegen/compiler/src/Quidditch/HoistHALOpsToFunc.cpp b/codegen/compiler/src/Quidditch/HoistHALOpsToFunc.cpp index e1a3dcf..12ac0bf 100644 --- a/codegen/compiler/src/Quidditch/HoistHALOpsToFunc.cpp +++ b/codegen/compiler/src/Quidditch/HoistHALOpsToFunc.cpp @@ -41,6 +41,8 @@ void HoistHALOpsToFunc::runOnOperation() { continue; func->setAttr("xdsl_generated", builder.getUnitAttr()); + // xDSL only supports barepointer lowering right now. + func->setAttr("llvm.bareptr", builder.getUnitAttr()); // Find all HAL operations that need to be hoisted and any other operations // they depend on. 
diff --git a/runtime/cmake/quidditch_module.cmake b/runtime/cmake/quidditch_module.cmake index b210e07..0fa8f46 100644 --- a/runtime/cmake/quidditch_module.cmake +++ b/runtime/cmake/quidditch_module.cmake @@ -14,57 +14,78 @@ find_program(IREE_COMPILE_PATH iree-compile ) message(STATUS "Using iree-compile at ${IREE_COMPILE_PATH}") +find_package(Python3 REQUIRED) +cmake_path(GET Python3_EXECUTABLE PARENT_PATH python_bin_dir) +cmake_path(GET python_bin_dir PARENT_PATH python_bin_dir) +find_program(XDSL_OPT_PATH xdsl-opt + PATHS ${python_bin_dir} + PATH_SUFFIXES "bin" + NO_DEFAULT_PATH + DOC "Path of the xdsl-opt file" + REQUIRED +) + function(quidditch_module) - cmake_parse_arguments(_RULE "" "SRC" "FLAGS;DEPENDS" ${ARGN}) + cmake_parse_arguments(_RULE "LLVM" "SRC" "FLAGS;DEPENDS" ${ARGN}) + + set(_MLIR_SRC "${_RULE_SRC}") - set(_MLIR_SRC "${_RULE_SRC}") + cmake_path(GET _MLIR_SRC STEM filename) - cmake_path(GET _MLIR_SRC STEM filename) + get_filename_component(_MLIR_SRC "${_MLIR_SRC}" REALPATH) + set(_O_FILE_NAME "${CMAKE_CURRENT_BINARY_DIR}/${filename}/${filename}.o") + set(_H_FILE_NAME "${CMAKE_CURRENT_BINARY_DIR}/${filename}/${filename}_module.h") + set(_MODULE_NAME "${filename}_module") - set(_COMPILER_ARGS ${_RULE_FLAGS}) + set(_COMPILER_ARGS ${_RULE_FLAGS}) + list(APPEND _COMPILER_ARGS "--iree-vm-bytecode-module-strip-source-map=true") + list(APPEND _COMPILER_ARGS "--iree-vm-emit-polyglot-zip=false") + list(APPEND _COMPILER_ARGS "--iree-input-type=auto") + # TODO: xDSL cannot deal with anything but f64 right now. 
+ list(APPEND _COMPILER_ARGS "--iree-opt-demote-f64-to-f32=0") + + if (_RULE_LLVM) list(APPEND _COMPILER_ARGS "--iree-hal-target-backends=llvm-cpu") list(APPEND _COMPILER_ARGS "--iree-llvmcpu-debug-symbols=false") - list(APPEND _COMPILER_ARGS "--iree-vm-bytecode-module-strip-source-map=true") - list(APPEND _COMPILER_ARGS "--iree-vm-emit-polyglot-zip=false") list(APPEND _COMPILER_ARGS "--iree-llvmcpu-target-triple=riscv32-unknown-elf") list(APPEND _COMPILER_ARGS "--iree-llvmcpu-target-cpu=generic-rv32") list(APPEND _COMPILER_ARGS "--iree-llvmcpu-target-cpu-features=+m,+f,+d,+zfh") list(APPEND _COMPILER_ARGS "--iree-llvmcpu-target-abi=ilp32d") list(APPEND _COMPILER_ARGS "--iree-llvmcpu-link-embedded=false") - list(APPEND _COMPILER_ARGS "--iree-input-type=auto") - - set(_O_FILE_NAME "${CMAKE_CURRENT_BINARY_DIR}/${filename}/${filename}.o") - set(_H_FILE_NAME "${CMAKE_CURRENT_BINARY_DIR}/${filename}/${filename}_module.h") - set(_MODULE_NAME "${filename}_module") - - get_filename_component(_MLIR_SRC "${_MLIR_SRC}" REALPATH) - list(APPEND _COMPILER_ARGS "--output-format=vm-c") - list(APPEND _COMPILER_ARGS "--iree-vm-target-index-bits=32") list(APPEND _COMPILER_ARGS "--iree-llvmcpu-link-static") list(APPEND _COMPILER_ARGS "--iree-llvmcpu-static-library-output-path=${_O_FILE_NAME}") - list(APPEND _COMPILER_ARGS "${_MLIR_SRC}") - list(APPEND _COMPILER_ARGS "-o") - list(APPEND _COMPILER_ARGS "${_H_FILE_NAME}") + else () + list(APPEND _COMPILER_ARGS "--iree-hal-target-backends=quidditch") + list(APPEND _COMPILER_ARGS "--iree-quidditch-static-library-output-path=${_O_FILE_NAME}") + list(APPEND _COMPILER_ARGS "--iree-quidditch-xdsl-opt-path=${XDSL_OPT_PATH}") + list(APPEND _COMPILER_ARGS "--iree-quidditch-pulp-clang-path=${PULP_CLANG_PATH}") + endif () + + list(APPEND _COMPILER_ARGS "--output-format=vm-c") + list(APPEND _COMPILER_ARGS "--iree-vm-target-index-bits=32") + list(APPEND _COMPILER_ARGS "${_MLIR_SRC}") + list(APPEND _COMPILER_ARGS "-o") + list(APPEND _COMPILER_ARGS 
"${_H_FILE_NAME}") - set(_OUTPUT_FILES "${_H_FILE_NAME}") - string(REPLACE ".o" ".h" _STATIC_HDR_PATH "${_O_FILE_NAME}") - list(APPEND _OUTPUT_FILES "${_O_FILE_NAME}" "${_STATIC_HDR_PATH}") + set(_OUTPUT_FILES "${_H_FILE_NAME}") + string(REPLACE ".o" ".h" _STATIC_HDR_PATH "${_O_FILE_NAME}") + list(APPEND _OUTPUT_FILES "${_O_FILE_NAME}" "${_STATIC_HDR_PATH}") - add_custom_command( - OUTPUT ${_OUTPUT_FILES} - COMMAND ${IREE_COMPILE_PATH} ${_COMPILER_ARGS} - DEPENDS ${IREE_COMPILE_PATH} ${_MLIR_SRC} - ) + add_custom_command( + OUTPUT ${_OUTPUT_FILES} + COMMAND ${IREE_COMPILE_PATH} ${_COMPILER_ARGS} + DEPENDS ${IREE_COMPILE_PATH} ${_MLIR_SRC} + ) - add_library(${_MODULE_NAME} - STATIC - ${_H_FILE_NAME} ${_O_FILE_NAME} - ) - target_include_directories(${_MODULE_NAME} INTERFACE ${CMAKE_CURRENT_BINARY_DIR}/${filename}) - target_compile_definitions(${_MODULE_NAME} PUBLIC EMITC_IMPLEMENTATION=\"${_H_FILE_NAME}\") - set_target_properties( - ${_MODULE_NAME} - PROPERTIES - LINKER_LANGUAGE C - ) + add_library(${_MODULE_NAME} + STATIC + ${_H_FILE_NAME} ${_O_FILE_NAME} + ) + target_include_directories(${_MODULE_NAME} INTERFACE ${CMAKE_CURRENT_BINARY_DIR}/${filename}) + target_compile_definitions(${_MODULE_NAME} PUBLIC EMITC_IMPLEMENTATION=\"${_H_FILE_NAME}\") + set_target_properties( + ${_MODULE_NAME} + PROPERTIES + LINKER_LANGUAGE C + ) endfunction() diff --git a/runtime/runtime/src/Quidditch/CMakeLists.txt b/runtime/runtime/src/Quidditch/CMakeLists.txt index bf3cc54..993bdfd 100644 --- a/runtime/runtime/src/Quidditch/CMakeLists.txt +++ b/runtime/runtime/src/Quidditch/CMakeLists.txt @@ -1,11 +1,21 @@ add_subdirectory(registration) iree_cc_library( - NAME - device - SRCS - device.c - DEPS - iree::base - PUBLIC + NAME + device + SRCS + device.c + event.c + semaphore.c + DEPS + snRuntime + iree::base + iree::base::internal + iree::base::internal::arena + iree::base::internal::synchronization + iree::hal::utils::deferred_command_buffer + iree::hal::utils::file_transfer + 
iree::hal::utils::memory_file + iree::hal::utils::semaphore_base + PUBLIC ) diff --git a/runtime/runtime/src/Quidditch/device.c b/runtime/runtime/src/Quidditch/device.c index 40ab7e1..5a288bb 100644 --- a/runtime/runtime/src/Quidditch/device.c +++ b/runtime/runtime/src/Quidditch/device.c @@ -1,54 +1,194 @@ +// Copyright 2021 The IREE Authors +// +// Licensed under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + #include "device.h" -#include -#include +#include +#include +#include + +#include "event.h" +#include "iree/base/internal/arena.h" +#include "iree/base/internal/cpu.h" +#include "iree/hal/local/executable_environment.h" +#include "iree/hal/local/inline_command_buffer.h" +#include "iree/hal/local/local_executable_cache.h" +#include "iree/hal/local/local_pipeline_layout.h" +#include "iree/hal/utils/deferred_command_buffer.h" +#include "iree/hal/utils/file_transfer.h" +#include "iree/hal/utils/memory_file.h" +#include "semaphore.h" typedef struct quidditch_device_t { iree_hal_resource_t resource; + iree_string_view_t identifier; iree_allocator_t host_allocator; - iree_hal_allocator_t *device_allocator; + iree_hal_allocator_t* device_allocator; + + // Optional provider used for creating/configuring collective channels. + iree_hal_channel_provider_t* channel_provider; + + // Block pool used for command buffers with a larger block size (as command + // buffers can contain inlined data uploads). + iree_arena_block_pool_t large_block_pool; + + // Shared semaphore state used to emulate OS-level primitives. This backend + // is intended to run on bare-metal systems where we need to perform all + // synchronization ourselves. 
+ quidditch_semaphore_state_t semaphore_state; iree_host_size_t loader_count; - iree_hal_executable_loader_t *loaders[]; + iree_hal_executable_loader_t* loaders[]; } quidditch_device_t; static const iree_hal_device_vtable_t quidditch_device_vtable; -static quidditch_device_t *cast_device(iree_hal_device_t *device) { - IREE_HAL_ASSERT_TYPE(device, &quidditch_device_vtable); - return (quidditch_device_t *)device; +static quidditch_device_t* quidditch_device_cast( + iree_hal_device_t* base_value) { + IREE_HAL_ASSERT_TYPE(base_value, &quidditch_device_vtable); + return (quidditch_device_t*)base_value; } -static void destroy(iree_hal_device_t *base_device) { - quidditch_device_t *device = cast_device(base_device); - iree_hal_allocator_release(device->device_allocator); - for (iree_host_size_t i = 0; i < device->loader_count; i++) { +void quidditch_device_params_initialize(quidditch_device_params_t* out_params) { + memset(out_params, 0, sizeof(*out_params)); + out_params->arena_block_size = 32 * 1024; +} + +static iree_status_t quidditch_device_check_params( + const quidditch_device_params_t* params) { + if (params->arena_block_size < 4096) { + return iree_make_status(IREE_STATUS_INVALID_ARGUMENT, + "arena block size too small (< 4096 bytes)"); + } + return iree_ok_status(); +} + +iree_status_t quidditch_device_create(iree_string_view_t identifier, + const quidditch_device_params_t* params, + iree_host_size_t loader_count, + iree_hal_executable_loader_t** loaders, + iree_hal_allocator_t* device_allocator, + iree_allocator_t host_allocator, + iree_hal_device_t** out_device) { + IREE_ASSERT_ARGUMENT(params); + IREE_ASSERT_ARGUMENT(!loader_count || loaders); + IREE_ASSERT_ARGUMENT(device_allocator); + IREE_ASSERT_ARGUMENT(out_device); + *out_device = NULL; + IREE_TRACE_ZONE_BEGIN(z0); + + IREE_RETURN_AND_END_ZONE_IF_ERROR(z0, quidditch_device_check_params(params)); + + quidditch_device_t* device = NULL; + iree_host_size_t struct_size = + sizeof(*device) + loader_count * 
sizeof(*device->loaders); + iree_host_size_t total_size = struct_size + identifier.size; + iree_status_t status = + iree_allocator_malloc(host_allocator, total_size, (void**)&device); + if (iree_status_is_ok(status)) { + memset(device, 0, total_size); + iree_hal_resource_initialize(&quidditch_device_vtable, &device->resource); + iree_string_view_append_to_buffer(identifier, &device->identifier, + (char*)device + struct_size); + device->host_allocator = host_allocator; + device->device_allocator = device_allocator; + iree_hal_allocator_retain(device_allocator); + iree_arena_block_pool_initialize(params->arena_block_size, host_allocator, + &device->large_block_pool); + + device->loader_count = loader_count; + for (iree_host_size_t i = 0; i < device->loader_count; ++i) { + device->loaders[i] = loaders[i]; + iree_hal_executable_loader_retain(device->loaders[i]); + } + + quidditch_semaphore_state_initialize(&device->semaphore_state); + } + + if (iree_status_is_ok(status)) { + *out_device = (iree_hal_device_t*)device; + } else { + iree_hal_device_release((iree_hal_device_t*)device); + } + IREE_TRACE_ZONE_END(z0); + return status; +} + +static void quidditch_device_destroy(iree_hal_device_t* base_device) { + quidditch_device_t* device = quidditch_device_cast(base_device); + iree_allocator_t host_allocator = iree_hal_device_host_allocator(base_device); + IREE_TRACE_ZONE_BEGIN(z0); + + quidditch_semaphore_state_deinitialize(&device->semaphore_state); + + for (iree_host_size_t i = 0; i < device->loader_count; ++i) { iree_hal_executable_loader_release(device->loaders[i]); } + + iree_hal_allocator_release(device->device_allocator); + iree_hal_channel_provider_release(device->channel_provider); + + iree_arena_block_pool_deinitialize(&device->large_block_pool); + + iree_allocator_free(host_allocator, device); + + IREE_TRACE_ZONE_END(z0); } -static iree_allocator_t host_allocator(iree_hal_device_t *base_device) { - return cast_device(base_device)->host_allocator; +static 
iree_string_view_t quidditch_device_id(iree_hal_device_t* base_device) { + quidditch_device_t* device = quidditch_device_cast(base_device); + return device->identifier; } -static iree_hal_allocator_t *device_allocator(iree_hal_device_t *base_device) { - return cast_device(base_device)->device_allocator; +static iree_allocator_t quidditch_device_host_allocator( + iree_hal_device_t* base_device) { + quidditch_device_t* device = quidditch_device_cast(base_device); + return device->host_allocator; } -static iree_status_t create_executable_cache( - iree_hal_device_t *base_device, iree_string_view_t identifier, - iree_loop_t loop, iree_hal_executable_cache_t **out_executable_cache) { - quidditch_device_t *device = cast_device(base_device); - return iree_hal_local_executable_cache_create( - identifier, /*worker_capacity=*/1, device->loader_count, device->loaders, - iree_hal_device_host_allocator(base_device), out_executable_cache); +static iree_hal_allocator_t* quidditch_device_allocator( + iree_hal_device_t* base_device) { + quidditch_device_t* device = quidditch_device_cast(base_device); + return device->device_allocator; +} + +static void quidditch_replace_device_allocator( + iree_hal_device_t* base_device, iree_hal_allocator_t* new_allocator) { + quidditch_device_t* device = quidditch_device_cast(base_device); + iree_hal_allocator_retain(new_allocator); + iree_hal_allocator_release(device->device_allocator); + device->device_allocator = new_allocator; } -static iree_status_t query_i64(iree_hal_device_t *base_device, - iree_string_view_t category, - iree_string_view_t key, int64_t *out_value) { - quidditch_device_t *device = cast_device(base_device); +static void quidditch_replace_channel_provider( + iree_hal_device_t* base_device, iree_hal_channel_provider_t* new_provider) { + quidditch_device_t* device = quidditch_device_cast(base_device); + iree_hal_channel_provider_retain(new_provider); + iree_hal_channel_provider_release(device->channel_provider); + 
device->channel_provider = new_provider; +} + +static iree_status_t quidditch_device_trim(iree_hal_device_t* base_device) { + quidditch_device_t* device = quidditch_device_cast(base_device); + return iree_hal_allocator_trim(device->device_allocator); +} + +static iree_status_t quidditch_device_query_i64(iree_hal_device_t* base_device, + iree_string_view_t category, + iree_string_view_t key, + int64_t* out_value) { + quidditch_device_t* device = quidditch_device_cast(base_device); + *out_value = 0; + + if (iree_string_view_equal(category, IREE_SV("hal.device.id"))) { + *out_value = + iree_string_view_match_pattern(device->identifier, key) ? 1 : 0; + return iree_ok_status(); + } if (iree_string_view_equal(category, IREE_SV("hal.executable.format"))) { *out_value = @@ -56,87 +196,331 @@ static iree_status_t query_i64(iree_hal_device_t *base_device, device->loader_count, device->loaders, /*caching_mode=*/0, key) ? 1 : 0; - return iree_ok_status(); } + if (iree_string_view_equal(category, IREE_SV("hal.device"))) { + if (iree_string_view_equal(key, IREE_SV("concurrency"))) { + *out_value = 1; + return iree_ok_status(); + } + } else if (iree_string_view_equal(category, IREE_SV("hal.dispatch"))) { + if (iree_string_view_equal(key, IREE_SV("concurrency"))) { + *out_value = 1; + return iree_ok_status(); + } + } else if (iree_string_view_equal(category, IREE_SV("hal.cpu"))) { + return iree_cpu_lookup_data_by_key(key, out_value); + } + return iree_make_status( IREE_STATUS_NOT_FOUND, "unknown device configuration key value '%.*s :: %.*s'", (int)category.size, category.data, (int)key.size, key.data); } -static iree_status_t create_descriptor_set_layout( - iree_hal_device_t *base_device, +static iree_status_t quidditch_device_create_channel( + iree_hal_device_t* base_device, iree_hal_queue_affinity_t queue_affinity, + iree_hal_channel_params_t params, iree_hal_channel_t** out_channel) { + return iree_make_status(IREE_STATUS_UNIMPLEMENTED, + "collectives not implemented"); +} + 
+static iree_status_t quidditch_device_create_command_buffer( + iree_hal_device_t* base_device, iree_hal_command_buffer_mode_t mode, + iree_hal_command_category_t command_categories, + iree_hal_queue_affinity_t queue_affinity, iree_host_size_t binding_capacity, + iree_hal_command_buffer_t** out_command_buffer) { + if (iree_all_bits_set(mode, + IREE_HAL_COMMAND_BUFFER_MODE_ALLOW_INLINE_EXECUTION)) { + return iree_hal_inline_command_buffer_create( + base_device, mode, command_categories, queue_affinity, binding_capacity, + iree_hal_device_host_allocator(base_device), out_command_buffer); + } else { + quidditch_device_t* device = quidditch_device_cast(base_device); + return iree_hal_deferred_command_buffer_create( + base_device, mode, command_categories, binding_capacity, + &device->large_block_pool, device->host_allocator, out_command_buffer); + } +} + +static iree_status_t quidditch_device_create_descriptor_set_layout( + iree_hal_device_t* base_device, iree_hal_descriptor_set_layout_flags_t flags, iree_host_size_t binding_count, - const iree_hal_descriptor_set_layout_binding_t *bindings, - iree_hal_descriptor_set_layout_t **out_descriptor_set_layout) { + const iree_hal_descriptor_set_layout_binding_t* bindings, + iree_hal_descriptor_set_layout_t** out_descriptor_set_layout) { return iree_hal_local_descriptor_set_layout_create( flags, binding_count, bindings, iree_hal_device_host_allocator(base_device), out_descriptor_set_layout); } -static iree_status_t create_pipeline_layout( - iree_hal_device_t *base_device, iree_host_size_t push_constants, +static iree_status_t quidditch_device_create_event( + iree_hal_device_t* base_device, iree_hal_event_t** out_event) { + return quidditch_event_create(iree_hal_device_host_allocator(base_device), + out_event); +} + +static iree_status_t quidditch_device_create_executable_cache( + iree_hal_device_t* base_device, iree_string_view_t identifier, + iree_loop_t loop, iree_hal_executable_cache_t** out_executable_cache) { + 
quidditch_device_t* device = quidditch_device_cast(base_device); + return iree_hal_local_executable_cache_create( + identifier, /*worker_capacity=*/1, device->loader_count, device->loaders, + iree_hal_device_host_allocator(base_device), out_executable_cache); +} + +static iree_status_t quidditch_device_import_file( + iree_hal_device_t* base_device, iree_hal_queue_affinity_t queue_affinity, + iree_hal_memory_access_t access, iree_io_file_handle_t* handle, + iree_hal_external_file_flags_t flags, iree_hal_file_t** out_file) { + if (iree_io_file_handle_type(handle) != + IREE_IO_FILE_HANDLE_TYPE_HOST_ALLOCATION) { + return iree_make_status( + IREE_STATUS_UNAVAILABLE, + "implementation does not support the external file type"); + } + return iree_hal_memory_file_wrap( + queue_affinity, access, handle, iree_hal_device_allocator(base_device), + iree_hal_device_host_allocator(base_device), out_file); +} + +static iree_status_t quidditch_device_create_pipeline_layout( + iree_hal_device_t* base_device, iree_host_size_t push_constants, iree_host_size_t set_layout_count, - iree_hal_descriptor_set_layout_t *const *set_layouts, - iree_hal_pipeline_layout_t **out_pipeline_layout) { + iree_hal_descriptor_set_layout_t* const* set_layouts, + iree_hal_pipeline_layout_t** out_pipeline_layout) { return iree_hal_local_pipeline_layout_create( push_constants, set_layout_count, set_layouts, iree_hal_device_host_allocator(base_device), out_pipeline_layout); } -static iree_status_t create_semaphore(iree_hal_device_t *base_device, - uint64_t initial_value, - iree_hal_semaphore_t **out_semaphore) { - IREE_ATTRIBUTE_UNUSED quidditch_device_t *device = cast_device(base_device); +static iree_status_t quidditch_device_create_semaphore( + iree_hal_device_t* base_device, uint64_t initial_value, + iree_hal_semaphore_t** out_semaphore) { + quidditch_device_t* device = quidditch_device_cast(base_device); + return quidditch_semaphore_create(&device->semaphore_state, initial_value, + 
device->host_allocator, out_semaphore); +} - return iree_make_status(IREE_STATUS_UNIMPLEMENTED); +static iree_hal_semaphore_compatibility_t +quidditch_device_query_semaphore_compatibility( + iree_hal_device_t* base_device, iree_hal_semaphore_t* semaphore) { + // The synchronous submission queue handles all semaphores as if host-side. + return IREE_HAL_SEMAPHORE_COMPATIBILITY_HOST_ONLY; } -static const iree_hal_device_vtable_t quidditch_device_vtable = { - .destroy = destroy, - .host_allocator = host_allocator, - .device_allocator = device_allocator, - .create_executable_cache = create_executable_cache, - .query_i64 = query_i64, - .create_descriptor_set_layout = create_descriptor_set_layout, - .create_pipeline_layout = create_pipeline_layout, - .create_semaphore = create_semaphore, -}; +static iree_status_t quidditch_device_queue_alloca( + iree_hal_device_t* base_device, iree_hal_queue_affinity_t queue_affinity, + const iree_hal_semaphore_list_t wait_semaphore_list, + const iree_hal_semaphore_list_t signal_semaphore_list, + iree_hal_allocator_pool_t pool, iree_hal_buffer_params_t params, + iree_device_size_t allocation_size, + iree_hal_buffer_t** IREE_RESTRICT out_buffer) { + // TODO(benvanik): queue-ordered allocations. 
+ IREE_RETURN_IF_ERROR(iree_hal_semaphore_list_wait(wait_semaphore_list, + iree_infinite_timeout())); + IREE_RETURN_IF_ERROR( + iree_hal_allocator_allocate_buffer(iree_hal_device_allocator(base_device), + params, allocation_size, out_buffer)); + IREE_RETURN_IF_ERROR(iree_hal_semaphore_list_signal(signal_semaphore_list)); + return iree_ok_status(); +} -iree_status_t quidditch_device_create(iree_host_size_t loader_count, - iree_hal_executable_loader_t **loaders, - iree_hal_allocator_t *device_allocator, - iree_allocator_t host_allocator, - iree_hal_device_t **out_device) { - IREE_ASSERT_ARGUMENT(loaders || loader_count == 0); - IREE_ASSERT_ARGUMENT(device_allocator); - IREE_ASSERT_ARGUMENT(out_device); +static iree_status_t quidditch_device_queue_dealloca( + iree_hal_device_t* base_device, iree_hal_queue_affinity_t queue_affinity, + const iree_hal_semaphore_list_t wait_semaphore_list, + const iree_hal_semaphore_list_t signal_semaphore_list, + iree_hal_buffer_t* buffer) { + // TODO(benvanik): queue-ordered allocations. + IREE_RETURN_IF_ERROR(iree_hal_device_queue_barrier( + base_device, queue_affinity, wait_semaphore_list, signal_semaphore_list)); + return iree_ok_status(); +} - // Set the out device to null in case any steps fail. - *out_device = NULL; +static iree_status_t quidditch_device_apply_deferred_command_buffers( + quidditch_device_t* device, iree_host_size_t command_buffer_count, + iree_hal_command_buffer_t* const* command_buffers) { + // See if there are any deferred command buffers; this saves us work in cases + // of pure inline execution. 
+ bool any_deferred = false; + for (iree_host_size_t i = 0; i < command_buffer_count && !any_deferred; ++i) { + any_deferred = iree_hal_deferred_command_buffer_isa(command_buffers[i]); + } + if (!any_deferred) return iree_ok_status(); - quidditch_device_t *device = NULL; - iree_host_size_t allocation_size = - sizeof(quidditch_device_t) + - loader_count * sizeof(*device->loaders); // NOLINT(*-sizeof-expression) - IREE_RETURN_IF_ERROR( - iree_allocator_malloc(host_allocator, allocation_size, (void **)&device)); - memset(device, 0, allocation_size); - iree_hal_resource_initialize(&quidditch_device_vtable, &device->resource); - device->host_allocator = host_allocator; - device->device_allocator = device_allocator; - - // Make sure to increase the ref counts of any entities we reference. - iree_hal_allocator_retain(device_allocator); - device->loader_count = loader_count; - for (iree_host_size_t i = 0; i < device->loader_count; ++i) { - device->loaders[i] = loaders[i]; - iree_hal_executable_loader_retain(device->loaders[i]); + // Stack allocate storage for an inline command buffer we'll use to replay + // the deferred command buffers. We want to reset it between each apply so + // that we don't get state carrying across. + iree_byte_span_t storage = + iree_make_byte_span(iree_alloca(iree_hal_inline_command_buffer_size()), + iree_hal_inline_command_buffer_size()); + + // NOTE: we ignore any inline command buffers that may be passed in as they've + // already executed during recording. The caller is probably in for a bad time + // if they mixed the two modes together! 
+ for (iree_host_size_t i = 0; i < command_buffer_count; ++i) { + iree_hal_command_buffer_t* command_buffer = command_buffers[i]; + if (iree_hal_deferred_command_buffer_isa(command_buffer)) { + iree_hal_command_buffer_t* inline_command_buffer = NULL; + IREE_RETURN_IF_ERROR(iree_hal_inline_command_buffer_initialize( + (iree_hal_device_t*)device, + iree_hal_command_buffer_mode(command_buffer) | + IREE_HAL_COMMAND_BUFFER_MODE_ALLOW_INLINE_EXECUTION, + IREE_HAL_COMMAND_CATEGORY_ANY, IREE_HAL_QUEUE_AFFINITY_ANY, + /*binding_capacity=*/0, device->host_allocator, storage, + &inline_command_buffer)); + iree_status_t status = iree_hal_deferred_command_buffer_apply( + command_buffer, inline_command_buffer, + iree_hal_buffer_binding_table_empty()); + iree_hal_inline_command_buffer_deinitialize(inline_command_buffer); + IREE_RETURN_IF_ERROR(status); + } } - *out_device = (iree_hal_device_t *)device; return iree_ok_status(); } + +static iree_status_t quidditch_device_queue_read( + iree_hal_device_t* base_device, iree_hal_queue_affinity_t queue_affinity, + const iree_hal_semaphore_list_t wait_semaphore_list, + const iree_hal_semaphore_list_t signal_semaphore_list, + iree_hal_file_t* source_file, uint64_t source_offset, + iree_hal_buffer_t* target_buffer, iree_device_size_t target_offset, + iree_device_size_t length, uint32_t flags) { + // TODO: expose streaming chunk count/size options. 
+ iree_status_t loop_status = iree_ok_status(); + iree_hal_file_transfer_options_t options = { + .loop = iree_loop_inline(&loop_status), + .chunk_count = IREE_HAL_FILE_TRANSFER_CHUNK_COUNT_DEFAULT, + .chunk_size = IREE_HAL_FILE_TRANSFER_CHUNK_SIZE_DEFAULT, + }; + IREE_RETURN_IF_ERROR(iree_hal_device_queue_read_streaming( + base_device, queue_affinity, wait_semaphore_list, signal_semaphore_list, + source_file, source_offset, target_buffer, target_offset, length, flags, + options)); + return loop_status; +} + +static iree_status_t quidditch_device_queue_write( + iree_hal_device_t* base_device, iree_hal_queue_affinity_t queue_affinity, + const iree_hal_semaphore_list_t wait_semaphore_list, + const iree_hal_semaphore_list_t signal_semaphore_list, + iree_hal_buffer_t* source_buffer, iree_device_size_t source_offset, + iree_hal_file_t* target_file, uint64_t target_offset, + iree_device_size_t length, uint32_t flags) { + // TODO: expose streaming chunk count/size options. + iree_status_t loop_status = iree_ok_status(); + iree_hal_file_transfer_options_t options = { + .loop = iree_loop_inline(&loop_status), + .chunk_count = IREE_HAL_FILE_TRANSFER_CHUNK_COUNT_DEFAULT, + .chunk_size = IREE_HAL_FILE_TRANSFER_CHUNK_SIZE_DEFAULT, + }; + IREE_RETURN_IF_ERROR(iree_hal_device_queue_write_streaming( + base_device, queue_affinity, wait_semaphore_list, signal_semaphore_list, + source_buffer, source_offset, target_file, target_offset, length, flags, + options)); + return loop_status; +} + +static iree_status_t quidditch_device_queue_execute( + iree_hal_device_t* base_device, iree_hal_queue_affinity_t queue_affinity, + const iree_hal_semaphore_list_t wait_semaphore_list, + const iree_hal_semaphore_list_t signal_semaphore_list, + iree_host_size_t command_buffer_count, + iree_hal_command_buffer_t* const* command_buffers) { + quidditch_device_t* device = quidditch_device_cast(base_device); + + // TODO(#4680): there is some better error handling here needed; we should + // propagate 
failures to all signal semaphores. Today we aren't as there + // shouldn't be any failures or if there are there's not much we'd be able to + // do - chances are we already executed everything inline! + + // Wait for semaphores to be signaled before performing any work. + IREE_RETURN_IF_ERROR(quidditch_semaphore_multi_wait( + &device->semaphore_state, IREE_HAL_WAIT_MODE_ALL, wait_semaphore_list, + iree_infinite_timeout())); + + // Run all deferred command buffers - any we could have run inline we already + // did during recording. + IREE_RETURN_IF_ERROR(quidditch_device_apply_deferred_command_buffers( + device, command_buffer_count, command_buffers)); + + // Signal all semaphores now that batch work has completed. + IREE_RETURN_IF_ERROR(quidditch_semaphore_multi_signal( + &device->semaphore_state, signal_semaphore_list)); + + return iree_ok_status(); +} + +static iree_status_t quidditch_device_queue_flush( + iree_hal_device_t* base_device, iree_hal_queue_affinity_t queue_affinity) { + // Currently unused; we flush as submissions are made. + return iree_ok_status(); +} + +static iree_status_t quidditch_device_wait_semaphores( + iree_hal_device_t* base_device, iree_hal_wait_mode_t wait_mode, + const iree_hal_semaphore_list_t semaphore_list, iree_timeout_t timeout) { + quidditch_device_t* device = quidditch_device_cast(base_device); + return quidditch_semaphore_multi_wait(&device->semaphore_state, wait_mode, + semaphore_list, timeout); +} + +static iree_status_t quidditch_device_profiling_begin( + iree_hal_device_t* base_device, + const iree_hal_device_profiling_options_t* options) { + // Unimplemented (and that's ok). 
+ // We could hook in to vendor APIs (Intel/ARM/etc) or generic perf infra: + // https://man7.org/linux/man-pages/man2/perf_event_open.2.html + // Capturing things like: + // PERF_COUNT_HW_CPU_CYCLES / PERF_COUNT_HW_INSTRUCTIONS + // PERF_COUNT_HW_CACHE_REFERENCES / PERF_COUNT_HW_CACHE_MISSES + // etc + // TODO(benvanik): shared iree/hal/local/profiling implementation of this. + return iree_ok_status(); +} + +static iree_status_t quidditch_device_profiling_flush( + iree_hal_device_t* base_device) { + // Unimplemented (and that's ok). + return iree_ok_status(); +} + +static iree_status_t quidditch_device_profiling_end( + iree_hal_device_t* base_device) { + // Unimplemented (and that's ok). + return iree_ok_status(); +} + +static const iree_hal_device_vtable_t quidditch_device_vtable = { + .destroy = quidditch_device_destroy, + .id = quidditch_device_id, + .host_allocator = quidditch_device_host_allocator, + .device_allocator = quidditch_device_allocator, + .replace_device_allocator = quidditch_replace_device_allocator, + .replace_channel_provider = quidditch_replace_channel_provider, + .trim = quidditch_device_trim, + .query_i64 = quidditch_device_query_i64, + .create_channel = quidditch_device_create_channel, + .create_command_buffer = quidditch_device_create_command_buffer, + .create_descriptor_set_layout = + quidditch_device_create_descriptor_set_layout, + .create_event = quidditch_device_create_event, + .create_executable_cache = quidditch_device_create_executable_cache, + .import_file = quidditch_device_import_file, + .create_pipeline_layout = quidditch_device_create_pipeline_layout, + .create_semaphore = quidditch_device_create_semaphore, + .query_semaphore_compatibility = + quidditch_device_query_semaphore_compatibility, + .queue_alloca = quidditch_device_queue_alloca, + .queue_dealloca = quidditch_device_queue_dealloca, + .queue_read = quidditch_device_queue_read, + .queue_write = quidditch_device_queue_write, + .queue_execute = 
quidditch_device_queue_execute, + .queue_flush = quidditch_device_queue_flush, + .wait_semaphores = quidditch_device_wait_semaphores, + .profiling_begin = quidditch_device_profiling_begin, + .profiling_flush = quidditch_device_profiling_flush, + .profiling_end = quidditch_device_profiling_end, +}; diff --git a/runtime/runtime/src/Quidditch/device.h b/runtime/runtime/src/Quidditch/device.h index 002cf43..2978096 100644 --- a/runtime/runtime/src/Quidditch/device.h +++ b/runtime/runtime/src/Quidditch/device.h @@ -5,7 +5,22 @@ #include #include -iree_status_t quidditch_device_create( - iree_host_size_t loader_count, iree_hal_executable_loader_t **loaders, - iree_hal_allocator_t *device_allocator, iree_allocator_t host_allocator, - iree_hal_device_t **out_device); +// Parameters configuring a device created by quidditch_device_create. +// Must be initialized with quidditch_device_params_initialize prior to use. +typedef struct quidditch_device_params_t { + // Total size of each block in the device shared block pool. + // Larger sizes will lower overhead and ensure the heap isn't hit for + // transient allocations while also increasing memory consumption. + iree_host_size_t arena_block_size; +} quidditch_device_params_t; + +// Initializes |out_params| to default values. +void quidditch_device_params_initialize(quidditch_device_params_t *out_params); + +iree_status_t quidditch_device_create(iree_string_view_t identifier, + const quidditch_device_params_t *params, + iree_host_size_t loader_count, + iree_hal_executable_loader_t **loaders, + iree_hal_allocator_t *device_allocator, + iree_allocator_t host_allocator, + iree_hal_device_t **out_device); diff --git a/runtime/runtime/src/Quidditch/event.c b/runtime/runtime/src/Quidditch/event.c new file mode 100644 index 0000000..6d07211 --- /dev/null +++ b/runtime/runtime/src/Quidditch/event.c @@ -0,0 +1,54 @@ +// Copyright 2021 The IREE Authors +// +// Licensed under the Apache License v2.0 with LLVM Exceptions. 
+// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#include "event.h" + +#include + +typedef struct quidditch_event_t { + iree_hal_resource_t resource; + iree_allocator_t host_allocator; +} quidditch_event_t; + +static const iree_hal_event_vtable_t quidditch_event_vtable; + +static quidditch_event_t* quidditch_event_cast(iree_hal_event_t* base_value) { + IREE_HAL_ASSERT_TYPE(base_value, &quidditch_event_vtable); + return (quidditch_event_t*)base_value; +} + +iree_status_t quidditch_event_create(iree_allocator_t host_allocator, + iree_hal_event_t** out_event) { + IREE_ASSERT_ARGUMENT(out_event); + *out_event = NULL; + IREE_TRACE_ZONE_BEGIN(z0); + + quidditch_event_t* event = NULL; + iree_status_t status = + iree_allocator_malloc(host_allocator, sizeof(*event), (void**)&event); + if (iree_status_is_ok(status)) { + iree_hal_resource_initialize(&quidditch_event_vtable, &event->resource); + event->host_allocator = host_allocator; + *out_event = (iree_hal_event_t*)event; + } + + IREE_TRACE_ZONE_END(z0); + return status; +} + +static void quidditch_event_destroy(iree_hal_event_t* base_event) { + quidditch_event_t* event = quidditch_event_cast(base_event); + iree_allocator_t host_allocator = event->host_allocator; + IREE_TRACE_ZONE_BEGIN(z0); + + iree_allocator_free(host_allocator, event); + + IREE_TRACE_ZONE_END(z0); +} + +static const iree_hal_event_vtable_t quidditch_event_vtable = { + .destroy = quidditch_event_destroy, +}; diff --git a/runtime/runtime/src/Quidditch/event.h b/runtime/runtime/src/Quidditch/event.h new file mode 100644 index 0000000..6e13851 --- /dev/null +++ b/runtime/runtime/src/Quidditch/event.h @@ -0,0 +1,24 @@ +// Copyright 2021 The IREE Authors +// +// Licensed under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#ifndef QUIDDITCH_EVENT_H_ +#define QUIDDITCH_EVENT_H_ + +#include "iree/base/api.h" +#include "iree/hal/api.h" + +#ifdef __cplusplus +extern "C" { +#endif // __cplusplus + +iree_status_t quidditch_event_create(iree_allocator_t host_allocator, + iree_hal_event_t** out_event); + +#ifdef __cplusplus +} // extern "C" +#endif // __cplusplus + +#endif // QUIDDITCH_EVENT_H_ diff --git a/runtime/runtime/src/Quidditch/semaphore.c b/runtime/runtime/src/Quidditch/semaphore.c new file mode 100644 index 0000000..64872c5 --- /dev/null +++ b/runtime/runtime/src/Quidditch/semaphore.c @@ -0,0 +1,431 @@ +// Copyright 2021 The IREE Authors +// +// Licensed under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#include "semaphore.h" + +#include +#include +#include +#include + +#include "iree/hal/utils/semaphore_base.h" + +//===----------------------------------------------------------------------===// +// quidditch_semaphore_state_t +//===----------------------------------------------------------------------===// + +void quidditch_semaphore_state_initialize( + quidditch_semaphore_state_t* out_shared_state) { + memset(out_shared_state, 0, sizeof(*out_shared_state)); + iree_notification_initialize(&out_shared_state->notification); +} + +void quidditch_semaphore_state_deinitialize( + quidditch_semaphore_state_t* shared_state) { + iree_notification_deinitialize(&shared_state->notification); + memset(shared_state, 0, sizeof(*shared_state)); +} + +//===----------------------------------------------------------------------===// +// quidditch_semaphore_t +//===----------------------------------------------------------------------===// + +typedef struct quidditch_semaphore_t { + iree_hal_semaphore_t base; + iree_allocator_t host_allocator; + 
+ // Shared across all semaphores. + quidditch_semaphore_state_t* shared_state; + + // Guards all mutable fields. We expect low contention on semaphores and since + // iree_slim_mutex_t is (effectively) just a CAS this keeps things simpler + // than trying to make the entire structure lock-free. + iree_slim_mutex_t mutex; + + // Current signaled value. May be IREE_HAL_SEMAPHORE_FAILURE_VALUE to + // indicate that the semaphore has been signaled for failure and + // |failure_status| contains the error. + uint64_t current_value; + + // OK or the status passed to iree_hal_semaphore_fail. Owned by the semaphore. + iree_status_t failure_status; +} quidditch_semaphore_t; + +static const iree_hal_semaphore_vtable_t quidditch_semaphore_vtable; + +static quidditch_semaphore_t* quidditch_semaphore_cast( + iree_hal_semaphore_t* base_value) { + IREE_HAL_ASSERT_TYPE(base_value, &quidditch_semaphore_vtable); + return (quidditch_semaphore_t*)base_value; +} + +iree_status_t quidditch_semaphore_create( + quidditch_semaphore_state_t* shared_state, uint64_t initial_value, + iree_allocator_t host_allocator, iree_hal_semaphore_t** out_semaphore) { + IREE_ASSERT_ARGUMENT(shared_state); + IREE_ASSERT_ARGUMENT(out_semaphore); + *out_semaphore = NULL; + IREE_TRACE_ZONE_BEGIN(z0); + + quidditch_semaphore_t* semaphore = NULL; + iree_status_t status = iree_allocator_malloc( + host_allocator, sizeof(*semaphore), (void**)&semaphore); + if (iree_status_is_ok(status)) { + iree_hal_semaphore_initialize(&quidditch_semaphore_vtable, + &semaphore->base); + semaphore->host_allocator = host_allocator; + semaphore->shared_state = shared_state; + + iree_slim_mutex_initialize(&semaphore->mutex); + semaphore->current_value = initial_value; + semaphore->failure_status = iree_ok_status(); + + *out_semaphore = &semaphore->base; + } + + IREE_TRACE_ZONE_END(z0); + return status; +} + +static void quidditch_semaphore_destroy(iree_hal_semaphore_t* base_semaphore) { + quidditch_semaphore_t* semaphore = 
quidditch_semaphore_cast(base_semaphore); + iree_allocator_t host_allocator = semaphore->host_allocator; + IREE_TRACE_ZONE_BEGIN(z0); + + iree_slim_mutex_deinitialize(&semaphore->mutex); + iree_status_ignore(semaphore->failure_status); + + iree_hal_semaphore_deinitialize(&semaphore->base); + iree_allocator_free(host_allocator, semaphore); + + IREE_TRACE_ZONE_END(z0); +} + +static iree_status_t quidditch_semaphore_query( + iree_hal_semaphore_t* base_semaphore, uint64_t* out_value) { + quidditch_semaphore_t* semaphore = quidditch_semaphore_cast(base_semaphore); + + iree_slim_mutex_lock(&semaphore->mutex); + + *out_value = semaphore->current_value; + + iree_status_t status = iree_ok_status(); + if (*out_value >= IREE_HAL_SEMAPHORE_FAILURE_VALUE) { + status = iree_status_clone(semaphore->failure_status); + } + + iree_slim_mutex_unlock(&semaphore->mutex); + + return status; +} + +// Signals |semaphore| to |new_value| or returns an error if doing so would be +// invalid. The semaphore mutex must be held. +static iree_status_t quidditch_semaphore_signal_unsafe( + quidditch_semaphore_t* semaphore, uint64_t new_value) { + if (new_value <= semaphore->current_value) { + uint64_t current_value IREE_ATTRIBUTE_UNUSED = semaphore->current_value; + return iree_make_status(IREE_STATUS_OUT_OF_RANGE, + "semaphore values must be monotonically " + "increasing; current_value=%" PRIu64 + ", new_value=%" PRIu64, + current_value, new_value); + } + + // Update to the new value. 
+ semaphore->current_value = new_value; + + return iree_ok_status(); +} + +static iree_status_t quidditch_semaphore_signal( + iree_hal_semaphore_t* base_semaphore, uint64_t new_value) { + quidditch_semaphore_t* semaphore = quidditch_semaphore_cast(base_semaphore); + + iree_slim_mutex_lock(&semaphore->mutex); + + iree_status_t status = + quidditch_semaphore_signal_unsafe(semaphore, new_value); + if (!iree_status_is_ok(status)) { + iree_slim_mutex_unlock(&semaphore->mutex); + return status; + } + + iree_slim_mutex_unlock(&semaphore->mutex); + + // Notify timepoints of the new value. + iree_hal_semaphore_notify(&semaphore->base, new_value, IREE_STATUS_OK); + + // Post a global notification so that any waiter will wake. + // TODO(#4680): make notifications per-semaphore; would make multi-wait + // impossible with iree_notification_t and we'd have to use wait handles. + iree_notification_post(&semaphore->shared_state->notification, + IREE_ALL_WAITERS); + + return iree_ok_status(); +} + +static void quidditch_semaphore_fail(iree_hal_semaphore_t* base_semaphore, + iree_status_t status) { + quidditch_semaphore_t* semaphore = quidditch_semaphore_cast(base_semaphore); + const iree_status_code_t status_code = iree_status_code(status); + + iree_slim_mutex_lock(&semaphore->mutex); + + // Try to set our local status - we only preserve the first failure so only + // do this if we are going from a valid semaphore to a failed one. + if (!iree_status_is_ok(semaphore->failure_status)) { + // Previous status was not OK; drop our new status. + IREE_IGNORE_ERROR(status); + iree_slim_mutex_unlock(&semaphore->mutex); + return; + } + + // Signal to our failure sentinel value. + semaphore->current_value = IREE_HAL_SEMAPHORE_FAILURE_VALUE; + semaphore->failure_status = status; + + iree_slim_mutex_unlock(&semaphore->mutex); + + // Notify timepoints of the failure. 
+ iree_hal_semaphore_notify(&semaphore->base, IREE_HAL_SEMAPHORE_FAILURE_VALUE, + status_code); + + iree_notification_post(&semaphore->shared_state->notification, + IREE_ALL_WAITERS); +} + +iree_status_t quidditch_semaphore_multi_signal( + quidditch_semaphore_state_t* shared_state, + const iree_hal_semaphore_list_t semaphore_list) { + IREE_ASSERT_ARGUMENT(shared_state); + if (semaphore_list.count == 0) { + return iree_ok_status(); + } else if (semaphore_list.count == 1) { + // Fast-path for a single semaphore. + return iree_hal_semaphore_signal(semaphore_list.semaphores[0], + semaphore_list.payload_values[0]); + } + + // Try to signal all semaphores, stopping if we encounter any issues. + iree_status_t status = iree_ok_status(); + for (iree_host_size_t i = 0; i < semaphore_list.count; ++i) { + quidditch_semaphore_t* semaphore = + quidditch_semaphore_cast(semaphore_list.semaphores[i]); + + iree_slim_mutex_lock(&semaphore->mutex); + status = quidditch_semaphore_signal_unsafe( + semaphore, semaphore_list.payload_values[i]); + if (!iree_status_is_ok(status)) { + iree_slim_mutex_unlock(&semaphore->mutex); + break; + } + + iree_slim_mutex_unlock(&semaphore->mutex); + + // Notify timepoints that the new value has been reached. + iree_hal_semaphore_notify(semaphore_list.semaphores[i], + semaphore_list.payload_values[i], IREE_STATUS_OK); + } + + // Notify all waiters that we've updated semaphores. They'll wake and check + // to see if they are satisfied. + // NOTE: we do this even if there was a failure as we may have signaled some + // of the list. 
+ iree_notification_post(&shared_state->notification, IREE_ALL_WAITERS); + + return status; +} + +typedef struct quidditch_semaphore_notify_state_t { + quidditch_semaphore_t* semaphore; + uint64_t value; +} quidditch_semaphore_notify_state_t; + +static bool quidditch_semaphore_is_signaled( + quidditch_semaphore_notify_state_t* state) { + quidditch_semaphore_t* semaphore = state->semaphore; + iree_slim_mutex_lock(&semaphore->mutex); + bool is_signaled = semaphore->current_value >= state->value || + !iree_status_is_ok(semaphore->failure_status); + iree_slim_mutex_unlock(&semaphore->mutex); + return is_signaled; +} + +static iree_status_t quidditch_semaphore_wait( + iree_hal_semaphore_t* base_semaphore, uint64_t value, + iree_timeout_t timeout) { + quidditch_semaphore_t* semaphore = quidditch_semaphore_cast(base_semaphore); + + // Try to see if we can return immediately. + iree_slim_mutex_lock(&semaphore->mutex); + if (!iree_status_is_ok(semaphore->failure_status)) { + // Fastest path: failed; return an error to tell callers to query for it. + iree_slim_mutex_unlock(&semaphore->mutex); + return iree_status_from_code(IREE_STATUS_ABORTED); + } else if (semaphore->current_value >= value) { + // Fast path: already satisfied. + iree_slim_mutex_unlock(&semaphore->mutex); + return iree_ok_status(); + } else if (iree_timeout_is_immediate(timeout)) { + // Not satisfied but a poll, so can avoid the expensive wait handle work. + iree_slim_mutex_unlock(&semaphore->mutex); + return iree_status_from_code(IREE_STATUS_DEADLINE_EXCEEDED); + } + iree_slim_mutex_unlock(&semaphore->mutex); + + // TODO(#4680): we should be checking for DEADLINE_EXCEEDED here. This is + // easy when it's iree_timeout_is_infinite (we can just use the notification + // as below) but if it's an actual deadline we'll need to probably switch to + // iree_wait_handle_t. + + // Perform wait on the global notification, bounded by |timeout|. 
+ quidditch_semaphore_state_t* shared_state = semaphore->shared_state; + quidditch_semaphore_notify_state_t notify_state = { + .semaphore = semaphore, + .value = value, + }; + iree_notification_await(&shared_state->notification, + (iree_condition_fn_t)quidditch_semaphore_is_signaled, + (void*)&notify_state, timeout); + + iree_status_t status = iree_ok_status(); + iree_slim_mutex_lock(&semaphore->mutex); + if (!iree_status_is_ok(semaphore->failure_status)) { + // Semaphore has failed. + status = iree_status_from_code(IREE_STATUS_ABORTED); + } else if (semaphore->current_value < value) { + // Deadline expired before the semaphore was signaled. + status = iree_status_from_code(IREE_STATUS_DEADLINE_EXCEEDED); + } + iree_slim_mutex_unlock(&semaphore->mutex); + return status; +} + +// Returns true if any semaphore in the list has signaled (or failed). +// Used with iree_condition_fn_t and must match that signature. +static bool quidditch_semaphore_any_signaled( + const iree_hal_semaphore_list_t* semaphore_list) { + for (iree_host_size_t i = 0; i < semaphore_list->count; ++i) { + quidditch_semaphore_t* semaphore = + quidditch_semaphore_cast(semaphore_list->semaphores[i]); + iree_slim_mutex_lock(&semaphore->mutex); + bool is_signaled = + semaphore->current_value >= semaphore_list->payload_values[i] || + !iree_status_is_ok(semaphore->failure_status); + iree_slim_mutex_unlock(&semaphore->mutex); + if (is_signaled) return true; + } + return false; +} + +// Returns true if all semaphores in the list have signaled (or any failed). +// Used with iree_condition_fn_t and must match that signature. 
+static bool quidditch_semaphore_all_signaled( + const iree_hal_semaphore_list_t* semaphore_list) { + for (iree_host_size_t i = 0; i < semaphore_list->count; ++i) { + quidditch_semaphore_t* semaphore = + quidditch_semaphore_cast(semaphore_list->semaphores[i]); + iree_slim_mutex_lock(&semaphore->mutex); + bool is_signaled = + semaphore->current_value >= semaphore_list->payload_values[i] || + !iree_status_is_ok(semaphore->failure_status); + iree_slim_mutex_unlock(&semaphore->mutex); + if (!is_signaled) return false; + } + return true; +} + +// Returns a status derived from the |semaphore_list| at the current time: +// - IREE_STATUS_OK: any or all semaphores signaled (based on |wait_mode|). +// - IREE_STATUS_ABORTED: one or more semaphores failed. +// - IREE_STATUS_DEADLINE_EXCEEDED: any or all semaphores unsignaled. +static iree_status_t quidditch_semaphore_result_from_state( + iree_hal_wait_mode_t wait_mode, + const iree_hal_semaphore_list_t semaphore_list) { + bool any_signaled = false; + bool all_signaled = true; + bool any_failed = false; + for (iree_host_size_t i = 0; i < semaphore_list.count; ++i) { + quidditch_semaphore_t* semaphore = + quidditch_semaphore_cast(semaphore_list.semaphores[i]); + iree_slim_mutex_lock(&semaphore->mutex); + const uint64_t current_value = semaphore->current_value; + const iree_status_code_t current_status_code = + iree_status_code(semaphore->failure_status); + if (current_status_code != IREE_STATUS_OK) { + // Semaphore has failed. + any_failed = true; + } else if (current_value < semaphore_list.payload_values[i]) { + // Not (yet) signaled to the requested payload value. + all_signaled = false; + } else { + // Signaled! + any_signaled = true; + } + iree_slim_mutex_unlock(&semaphore->mutex); + } + if (any_failed) { + // Always prioritize failure state. + return iree_status_from_code(IREE_STATUS_ABORTED); + } + switch (wait_mode) { + default: + case IREE_HAL_WAIT_MODE_ALL: + return all_signaled + ? 
iree_ok_status() + : iree_status_from_code(IREE_STATUS_DEADLINE_EXCEEDED); + case IREE_HAL_WAIT_MODE_ANY: + return any_signaled + ? iree_ok_status() + : iree_status_from_code(IREE_STATUS_DEADLINE_EXCEEDED); + } +} + +iree_status_t quidditch_semaphore_multi_wait( + quidditch_semaphore_state_t* shared_state, iree_hal_wait_mode_t wait_mode, + const iree_hal_semaphore_list_t semaphore_list, iree_timeout_t timeout) { + if (semaphore_list.count == 0) { + return iree_ok_status(); + } else if (semaphore_list.count == 1) { + // Fast-path for a single semaphore. + return iree_hal_semaphore_wait(semaphore_list.semaphores[0], + semaphore_list.payload_values[0], timeout); + } + + IREE_TRACE_ZONE_BEGIN(z0); + + // Fast-path for polling; we'll never wait and can just do a quick query. + if (iree_timeout_is_immediate(timeout)) { + iree_status_t status = + quidditch_semaphore_result_from_state(wait_mode, semaphore_list); + IREE_TRACE_ZONE_END(z0); + return status; + } + + // Perform wait on the global notification, bounded by |timeout|. + iree_notification_await( + &shared_state->notification, + wait_mode == IREE_HAL_WAIT_MODE_ALL + ? (iree_condition_fn_t)quidditch_semaphore_all_signaled + : (iree_condition_fn_t)quidditch_semaphore_any_signaled, + (void*)&semaphore_list, timeout); + + // We may have been successful - or may have a partial failure. 
+ iree_status_t status = + quidditch_semaphore_result_from_state(wait_mode, semaphore_list); + + IREE_TRACE_ZONE_END(z0); + return status; +} + +static const iree_hal_semaphore_vtable_t quidditch_semaphore_vtable = { + .destroy = quidditch_semaphore_destroy, + .query = quidditch_semaphore_query, + .signal = quidditch_semaphore_signal, + .fail = quidditch_semaphore_fail, + .wait = quidditch_semaphore_wait, +}; diff --git a/runtime/runtime/src/Quidditch/semaphore.h b/runtime/runtime/src/Quidditch/semaphore.h new file mode 100644 index 0000000..bc1d38a --- /dev/null +++ b/runtime/runtime/src/Quidditch/semaphore.h @@ -0,0 +1,73 @@ +// Copyright 2021 The IREE Authors +// +// Licensed under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#ifndef QUIDDITCH_SEMAPHORE_H_ +#define QUIDDITCH_SEMAPHORE_H_ + +#include + +#include "iree/base/api.h" +#include "iree/base/internal/synchronization.h" +#include "iree/hal/api.h" + +#ifdef __cplusplus +extern "C" { +#endif // __cplusplus + +//===----------------------------------------------------------------------===// +// quidditch_semaphore_state_t +//===----------------------------------------------------------------------===// + +// State shared between all sync semaphores. +// Owned by the device and guaranteed to remain valid for the lifetime of any +// semaphore created from it. +typedef struct quidditch_semaphore_state_t { + // In-process notification signaled when any semaphore value changes. + iree_notification_t notification; +} quidditch_semaphore_state_t; + +// Initializes state used to perform semaphore synchronization. +void quidditch_semaphore_state_initialize( + quidditch_semaphore_state_t* out_shared_state); + +// Deinitializes state used to perform semaphore synchronization; no semaphores +// must be live with references. 
+void quidditch_semaphore_state_deinitialize( + quidditch_semaphore_state_t* shared_state); + +//===----------------------------------------------------------------------===// +// quidditch_semaphore_t +//===----------------------------------------------------------------------===// + +// Creates a semaphore that allows for ordering of operations on the local host. +// Backed by a shared iree_notification_t in |shared_state|. Not efficient under +// high contention or many simultaneous users but that's not what the +// synchronous backend is intended for - if you want something efficient in the +// face of hundreds or thousands of active asynchronous operations then use the +// task system. +iree_status_t quidditch_semaphore_create( + quidditch_semaphore_state_t* shared_state, uint64_t initial_value, + iree_allocator_t host_allocator, iree_hal_semaphore_t** out_semaphore); + +// Performs a signal of a list of semaphores. +// The semaphores will transition to their new values (nearly) atomically and +// batching up signals will reduce synchronization overhead. +iree_status_t quidditch_semaphore_multi_signal( + quidditch_semaphore_state_t* shared_state, + const iree_hal_semaphore_list_t semaphore_list); + +// Performs a multi-wait on one or more semaphores. +// Returns IREE_STATUS_DEADLINE_EXCEEDED if the wait does not complete before +// |timeout| elapses. 
+iree_status_t quidditch_semaphore_multi_wait( + quidditch_semaphore_state_t* shared_state, iree_hal_wait_mode_t wait_mode, + const iree_hal_semaphore_list_t semaphore_list, iree_timeout_t timeout); + +#ifdef __cplusplus +} // extern "C" +#endif // __cplusplus + +#endif // QUIDDITCH_SEMAPHORE_H_ diff --git a/runtime/samples/CMakeLists.txt b/runtime/samples/CMakeLists.txt index aa18830..fa2a448 100644 --- a/runtime/samples/CMakeLists.txt +++ b/runtime/samples/CMakeLists.txt @@ -4,16 +4,16 @@ quidditch_module(SRC simple_add.mlir) add_executable(IREE_HelloWorld main.c) target_link_libraries( - IREE_HelloWorld - PRIVATE - snRuntime - iree::base - iree::vm - iree::modules::hal - iree::modules::hal::types - iree::hal::local::local - iree::hal::local::loaders::static_library_loader - Quidditch::device - simple_add_module + IREE_HelloWorld + PRIVATE + snRuntime + iree::base + iree::vm + iree::modules::hal + iree::modules::hal::types + iree::hal::local::local + iree::hal::local::loaders::static_library_loader + Quidditch::device + simple_add_module ) diff --git a/runtime/samples/main.c b/runtime/samples/main.c index 86703ce..170a116 100644 --- a/runtime/samples/main.c +++ b/runtime/samples/main.c @@ -11,6 +11,11 @@ #include #include +uint32_t snrt_l1_start_addr(); +uint32_t snrt_l1_end_addr(); + +static iree_allocator_inline_storage_t l1_arena; + static iree_status_t setup_instance_and_device( iree_allocator_t host_allocator, iree_vm_instance_t** out_instance, iree_hal_device_t** out_device) { @@ -32,17 +37,24 @@ static iree_status_t setup_instance_and_device( iree_hal_executable_import_provider_null(), host_allocator, &loader); if (!iree_status_is_ok(result)) goto error_release_vm; - // TODO: Replace with more sophisticated allocator representing cluster - // memory. + l1_arena.buffer = (uint8_t*)snrt_l1_start_addr(); + l1_arena.length = 0; + // TODO: This is a lie and it WILL crash into our stack and CLS memory. 
+ l1_arena.capacity = snrt_l1_end_addr() - snrt_l1_start_addr(); + iree_hal_allocator_t* device_allocator; - result = iree_hal_allocator_create_heap(iree_make_cstring_view("quidditch"), - host_allocator, host_allocator, - &device_allocator); + result = + iree_hal_allocator_create_heap(iree_make_cstring_view("quidditch"), + iree_allocator_inline_arena(&l1_arena), + host_allocator, &device_allocator); if (!iree_status_is_ok(result)) goto error_release_library_loader; - result = quidditch_device_create( - /*loader_count=*/1, &loader, device_allocator, host_allocator, - out_device); + quidditch_device_params_t params; + quidditch_device_params_initialize(&params); + result = + quidditch_device_create(IREE_SV("snitch"), &params, + /*loader_count=*/1, &loader, device_allocator, + host_allocator, out_device); iree_hal_executable_loader_release(loader); iree_hal_allocator_release(device_allocator); return result; @@ -54,14 +66,14 @@ static iree_status_t setup_instance_and_device( return result; } -static float data[128]; - int main() { // TODO: Remove/redirect compute cores once implemented. - if (!snrt_is_dm_core()) return 0; + if (snrt_cluster_core_idx() != 0) return 0; - for (int i = 0; i < 128; i++) { - data[i] = i; + double data[4]; + + for (int i = 0; i < IREE_ARRAYSIZE(data); i++) { + data[i] = (i + 1); } iree_allocator_t host_allocator = iree_allocator_system(); @@ -76,8 +88,6 @@ int main() { return -1; } - // TODO: Create EmitC module here. 
- iree_vm_module_t* hal_module = NULL; result = iree_hal_module_create(vmInstance, /*device_count=*/1, @@ -108,9 +118,9 @@ int main() { iree_hal_buffer_view_t* buffer = NULL; result = iree_hal_buffer_view_allocate_buffer_copy( - device, iree_hal_device_allocator(device), 1, (iree_hal_dim_t[]){128}, - IREE_HAL_ELEMENT_TYPE_FLOAT_32, IREE_HAL_ENCODING_TYPE_DENSE_ROW_MAJOR, - params, span, &buffer); + device, iree_hal_device_allocator(device), 1, + (iree_hal_dim_t[]){IREE_ARRAYSIZE(data)}, IREE_HAL_ELEMENT_TYPE_FLOAT_64, + IREE_HAL_ENCODING_TYPE_DENSE_ROW_MAJOR, params, span, &buffer); if (!iree_status_is_ok(result)) goto error_release_context; iree_vm_list_t* inputs = NULL; @@ -139,6 +149,33 @@ int main() { context, main_function, IREE_VM_CONTEXT_FLAG_NONE, /*policy=*/NULL, inputs, outputs, iree_allocator_system())); + if (!iree_status_is_ok(result)) goto error_release_output; + + iree_hal_buffer_view_t* ret_buffer_view = + iree_vm_list_get_ref_deref(outputs, /*i=*/0, iree_hal_buffer_view_type()); + if (ret_buffer_view == NULL) goto error_release_output; + + iree_hal_buffer_mapping_t mapping; + result = iree_hal_buffer_map_range( + iree_hal_buffer_view_buffer(ret_buffer_view), + IREE_HAL_MAPPING_MODE_SCOPED, IREE_HAL_MEMORY_ACCESS_READ, 0, + IREE_WHOLE_BUFFER, &mapping); + if (!iree_status_is_ok(result)) goto error_release_output; + + for (int i = 0; i < IREE_ARRAYSIZE(data); i++) { + double value = ((double*)mapping.contents.data)[i]; + printf("%f\n", value); + if (value == (i + 1) * 2) continue; + + result = iree_make_status(IREE_STATUS_UNKNOWN, "output incorrect"); + break; + } + + iree_hal_buffer_unmap_range(&mapping); + +error_release_output: + iree_vm_list_release(outputs); + iree_vm_list_release(inputs); error_release_context: iree_vm_context_release(context); error_release_mlir_module: diff --git a/runtime/samples/simple_add.mlir b/runtime/samples/simple_add.mlir index 263ec4d..caf046b 100644 --- a/runtime/samples/simple_add.mlir +++ 
b/runtime/samples/simple_add.mlir @@ -1,17 +1,17 @@ builtin.module @test_simple_add { - func.func @add(%arg0: tensor<128xf32>, %arg1: tensor<128xf32>) -> tensor<128xf32> { - %init = tensor.empty() : tensor<128xf32> + func.func @add(%arg0: tensor<4xf64>, %arg1: tensor<4xf64>) -> tensor<4xf64> { + %init = tensor.empty() : tensor<4xf64> %out = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} - ins(%arg0, %arg1 : tensor<128xf32>, tensor<128xf32>) - outs(%init : tensor<128xf32>) { - ^bb0(%in: f32 , %in_1: f32, %out: f32): - %o = arith.addf %in, %in_1 : f32 - linalg.yield %o : f32 - } -> tensor<128xf32> - func.return %out : tensor<128xf32> + ins(%arg0, %arg1 : tensor<4xf64>, tensor<4xf64>) + outs(%init : tensor<4xf64>) { + ^bb0(%in: f64 , %in_1: f64, %out: f64): + %o = arith.addf %in, %in_1 : f64 + linalg.yield %o : f64 + } -> tensor<4xf64> + func.return %out : tensor<4xf64> } } diff --git a/runtime/tests/CMakeLists.txt b/runtime/tests/CMakeLists.txt index 90b594d..0442c9d 100644 --- a/runtime/tests/CMakeLists.txt +++ b/runtime/tests/CMakeLists.txt @@ -6,3 +6,4 @@ target_link_libraries(HelloWorld snRuntime) add_test(NAME HelloWorld COMMAND HelloWorld) +add_test(NAME IREE_HelloWorld COMMAND IREE_HelloWorld)