From 35cd344b023c3ff889169d2a483af2922e005640 Mon Sep 17 00:00:00 2001 From: Stella Laurenzo Date: Wed, 28 Aug 2024 14:53:09 -0700 Subject: [PATCH] [libshortfin] Implement invocation. --- libshortfin/bindings/python/array_binding.cc | 81 ++++++-- libshortfin/bindings/python/lib_ext.cc | 11 +- .../mobilenet_server/inference_system.py | 13 +- libshortfin/src/shortfin/array/array.cc | 22 ++- libshortfin/src/shortfin/array/array.h | 21 +- libshortfin/src/shortfin/array/storage.cc | 32 ++-- libshortfin/src/shortfin/array/storage.h | 32 ++-- libshortfin/src/shortfin/local/messaging.h | 2 +- libshortfin/src/shortfin/local/program.cc | 162 +++++++++++++++- libshortfin/src/shortfin/local/program.h | 108 ++++++++++- libshortfin/src/shortfin/local/scheduler.cc | 46 ++++- libshortfin/src/shortfin/local/scheduler.h | 3 + libshortfin/src/shortfin/local/scope.h | 2 + .../src/shortfin/support/iree_concurrency.h | 16 +- .../src/shortfin/support/iree_helpers.h | 180 ++++-------------- libshortfin/src/shortfin/support/logging.h | 4 + 16 files changed, 501 insertions(+), 234 deletions(-) diff --git a/libshortfin/bindings/python/array_binding.cc b/libshortfin/bindings/python/array_binding.cc index 9858c2350..6ad1a717f 100644 --- a/libshortfin/bindings/python/array_binding.cc +++ b/libshortfin/bindings/python/array_binding.cc @@ -13,6 +13,23 @@ using namespace shortfin::array; namespace shortfin::python { namespace { +static const char DOCSTRING_ARRAY_COPY_FROM[] = + R"(Copy contents from a source array to this array. + +Equivalent to `dest_array.storage.copy_from(source_array.storage)`. +)"; + +static const char DOCSTRING_ARRAY_COPY_TO[] = + R"(Copy contents this array to a destination array. + +Equivalent to `dest_array.storage.copy_from(source_array.storage)`. +)"; + +static const char DOCSTRING_ARRAY_FILL[] = R"(Fill an array with a value. + +Equivalent to `array.storage.fill(pattern)`. +)"; + static const char DOCSTRING_STORAGE_DATA[] = R"(Access raw binary contents. Accessing `foo = storage.data` is equivalent to `storage.data.map(read=True)`. @@ -28,6 +45,23 @@ As with `map`, this will only work on buffers that are host visible, which includes all host buffers and device buffers created with the necessary access. )"; +static const char DOCSTRING_STORAGE_COPY_FROM[] = + R"(Copy contents from a source storage to this array. + +This operation executes asynchronously and the effect will only be visible +once the execution scope has been synced to the point of mutation. +)"; + +static const char DOCSTRING_STORAGE_FILL[] = R"(Fill a storage with a value. + +Takes as argument any value that can be interpreted as a buffer with the Python +buffer protocol of size 1, 2, or 4 bytes. The storage will be filled uniformly +with the pattern. + +This operation executes asynchronously and the effect will only be visible +once the execution scope has been synced to the point of mutation. +)"; + static const char DOCSTRING_STORAGE_MAP[] = R"(Create a mapping of the buffer contents in host memory. @@ -104,26 +138,30 @@ void BindArray(py::module_ &m) { .def_static( "allocate_host", [](local::ScopedDevice &device, iree_device_size_t allocation_size) { - return storage::AllocateHost(device, allocation_size); + return storage::allocate_host(device, allocation_size); }, py::arg("device"), py::arg("allocation_size"), py::keep_alive<0, 1>()) .def_static( "allocate_device", [](local::ScopedDevice &device, iree_device_size_t allocation_size) { - return storage::AllocateDevice(device, allocation_size); + return storage::allocate_device(device, allocation_size); }, py::arg("device"), py::arg("allocation_size"), py::keep_alive<0, 1>()) - .def("fill", - [](storage &self, py::handle buffer) { - Py_buffer py_view; - int flags = PyBUF_FORMAT | PyBUF_ND; // C-Contiguous ND. - if (PyObject_GetBuffer(buffer.ptr(), &py_view, flags) != 0) { - throw py::python_error(); - } - PyBufferReleaser py_view_releaser(py_view); - self.Fill(py_view.buf, py_view.len); - }) - .def("copy_from", [](storage &self, storage &src) { self.CopyFrom(src); }) + .def( + "fill", + [](storage &self, py::handle buffer) { + Py_buffer py_view; + int flags = PyBUF_FORMAT | PyBUF_ND; // C-Contiguous ND. + if (PyObject_GetBuffer(buffer.ptr(), &py_view, flags) != 0) { + throw py::python_error(); + } + PyBufferReleaser py_view_releaser(py_view); + self.fill(py_view.buf, py_view.len); + }, + py::arg("pattern"), DOCSTRING_STORAGE_FILL) + .def( + "copy_from", [](storage &self, storage &src) { self.copy_from(src); }, + py::arg("source_storage"), DOCSTRING_STORAGE_COPY_FROM) .def( "map", [](storage &self, bool read, bool write, bool discard) { @@ -137,7 +175,7 @@ void BindArray(py::module_ &m) { } mapping *cpp_mapping = nullptr; py::object py_mapping = CreateMappingObject(&cpp_mapping); - self.MapExplicit( + self.map_explicit( *cpp_mapping, static_cast(access)); return py_mapping; @@ -154,12 +192,12 @@ void BindArray(py::module_ &m) { [](storage &self) { mapping *cpp_mapping = nullptr; py::object py_mapping = CreateMappingObject(&cpp_mapping); - *cpp_mapping = self.MapRead(); + *cpp_mapping = self.map_read(); return py_mapping; }, [](storage &self, py::handle buffer_obj) { PyBufferRequest src_info(buffer_obj, PyBUF_SIMPLE); - auto dest_data = self.MapWriteDiscard(); + auto dest_data = self.map_write_discard(); if (src_info.view().len > dest_data.size()) { throw std::invalid_argument( fmt::format("Cannot write {} bytes into buffer of {} bytes", @@ -243,6 +281,17 @@ void BindArray(py::module_ &m) { py::rv_policy::reference_internal) .def_prop_ro("storage", &device_array::storage, py::rv_policy::reference_internal) + + .def( + "fill", + [](py::handle self, py::handle buffer) { + self.attr("storage").attr("fill")(buffer); + }, + py::arg("pattern"), DOCSTRING_ARRAY_FILL) + .def("copy_from", &device_array::copy_from, py::arg("source_array"), + DOCSTRING_ARRAY_COPY_FROM) + .def("copy_to", &device_array::copy_to, py::arg("dest_array"), + DOCSTRING_ARRAY_COPY_TO) .def("__repr__", &device_array::to_s); } diff --git a/libshortfin/bindings/python/lib_ext.cc b/libshortfin/bindings/python/lib_ext.cc index 6072caa04..d3e1ed597 100644 --- a/libshortfin/bindings/python/lib_ext.cc +++ b/libshortfin/bindings/python/lib_ext.cc @@ -379,8 +379,17 @@ void BindLocal(py::module_ &m) { .def("__add__", &local::DeviceAffinity::AddDevice) .def("__repr__", &local::DeviceAffinity::to_s); - py::class_(m, "Program"); + py::class_(m, "Program") + .def_prop_ro("exports", &local::Program::exports) + .def("lookup_function", &local::Program::LookupRequiredFunction) + .def("__getitem__", &local::Program::LookupRequiredFunction); + py::class_(m, "ProgramFunction") + .def_prop_ro("name", &local::ProgramFunction::name) + .def_prop_ro("calling_convention", + &local::ProgramFunction::calling_convention) + .def("__repr__", &local::ProgramFunction::to_s); py::class_(m, "ProgramModule") + .def_prop_ro("exports", &local::ProgramModule::exports) .def("__repr__", &local::ProgramModule::to_s) .def_static("load", &local::ProgramModule::Load, py::arg("system"), py::arg("path"), py::arg("mmap") = true); diff --git a/libshortfin/examples/python/mobilenet_server/inference_system.py b/libshortfin/examples/python/mobilenet_server/inference_system.py index 8ae7773db..c50dc02e9 100644 --- a/libshortfin/examples/python/mobilenet_server/inference_system.py +++ b/libshortfin/examples/python/mobilenet_server/inference_system.py @@ -24,7 +24,7 @@ def __init__(self, raw_image_data): class InferenceProcess(sf.Process): def __init__(self, program, request_queue, **kwargs): super().__init__(**kwargs) - self.program = program + self.main_function = program["module.torch-jit-export$async"] self.request_reader = request_queue.reader() self.device = self.scope.device(0) self.device_input = sfnp.device_array( @@ -41,9 +41,14 @@ async def run(self): # support for. Generally, APIs on storage should be mirrored onto # the array. self.host_staging.storage.data = request.raw_image_data - print(self.host_staging) - self.device_input.storage.copy_from(self.host_staging.storage) - print(self.device_input) + print("host_staging =", self.host_staging) + self.device_input.copy_from(self.host_staging) + output = await self.scope.invoke(self.main_function, self.device_input) + print("OUTPUT:", output) + # read_back = self.device_input.for_transfer() + # read_back.copy_from(self.device_input) + # await self.device + # print("read back =", read_back) class Main: diff --git a/libshortfin/src/shortfin/array/array.cc b/libshortfin/src/shortfin/array/array.cc index 1d6d7cc5a..c94ea2503 100644 --- a/libshortfin/src/shortfin/array/array.cc +++ b/libshortfin/src/shortfin/array/array.cc @@ -20,19 +20,19 @@ template class InlinedDims; // device_array // -------------------------------------------------------------------------- // -const mapping device_array::data() const { return storage_.MapRead(); } +const mapping device_array::data() const { return storage_.map_read(); } -mapping device_array::data() { return storage_.MapRead(); } +mapping device_array::data() { return storage_.map_read(); } -mapping device_array::data_rw() { return storage_.MapReadWrite(); } +mapping device_array::data_rw() { return storage_.map_read_write(); } -mapping device_array::data_w() { return storage_.MapWriteDiscard(); } +mapping device_array::data_w() { return storage_.map_write_discard(); } std::optional device_array::map_memory_for_xtensor() { if (storage_.is_mappable_for_read_write()) { - return storage_.MapReadWrite(); + return storage_.map_read_write(); } else if (storage_.is_mappable_for_read()) { - return storage_.MapRead(); + return storage_.map_read(); } return {}; } @@ -52,10 +52,12 @@ std::string device_array::to_s() const { } } - return fmt::format("device_array([{}], dtype='{}', device={}({})) ={}{}", - fmt::join(shape(), ", "), dtype().name(), - storage_.device().to_s(), storage_.formatted_memory_type(), - contents_prefix, contents); + return fmt::format( + "device_array([{}], dtype='{}', device={}(type={}, usage={}, access={})) " + "={}{}", + fmt::join(shape(), ", "), dtype().name(), storage_.device().to_s(), + storage_.formatted_memory_type(), storage_.formatted_buffer_usage(), + storage_.formatted_memory_access(), contents_prefix, contents); } } // namespace shortfin::array diff --git a/libshortfin/src/shortfin/array/array.h b/libshortfin/src/shortfin/array/array.h index c3ab6e302..23aac2567 100644 --- a/libshortfin/src/shortfin/array/array.h +++ b/libshortfin/src/shortfin/array/array.h @@ -67,7 +67,7 @@ class SHORTFIN_API device_array static device_array for_device(local::ScopedDevice &device, std::span shape, DType dtype) { return device_array( - storage::AllocateDevice(device, dtype.compute_dense_nd_size(shape)), + storage::allocate_device(device, dtype.compute_dense_nd_size(shape)), shape, dtype); } @@ -76,7 +76,7 @@ class SHORTFIN_API device_array static device_array for_host(local::ScopedDevice &device, std::span shape, DType dtype) { return device_array( - storage::AllocateHost(device, dtype.compute_dense_nd_size(shape)), + storage::allocate_host(device, dtype.compute_dense_nd_size(shape)), shape, dtype); } @@ -85,6 +85,23 @@ class SHORTFIN_API device_array return for_host(storage().device(), shape(), dtype()); } + // Enqueues a fill of the storage with an arbitrary pattern of the given + // size. The pattern size must be 1, 2, or 4. Equivalent to calling the same + // on the backing storage. + void fill(const void *pattern, iree_host_size_t pattern_length) { + storage_.fill(pattern, pattern_length); + } + + // Performs either a d2h, h2d or d2d transfer from a source storage to this + // storage. Equivalent to calling the same on the backing storage. + void copy_from(device_array &source_array) { + storage_.copy_from(source_array.storage_); + } + // Inverse of copy_from. + void copy_to(device_array &dest_array) { + dest_array.storage_.copy_from(storage_); + } + // Untyped access to the backing data. The array must be mappable. Specific // access modes: // * data(): Read-only access to the data. diff --git a/libshortfin/src/shortfin/array/storage.cc b/libshortfin/src/shortfin/array/storage.cc index fa9e0f4b8..19f4c2d6d 100644 --- a/libshortfin/src/shortfin/array/storage.cc +++ b/libshortfin/src/shortfin/array/storage.cc @@ -35,8 +35,8 @@ storage::storage(local::ScopedDevice device, iree::hal_buffer_ptr buffer, } storage::~storage() { logging::destruct("array::storage", this); } -storage storage::AllocateDevice(ScopedDevice &device, - iree_device_size_t allocation_size) { +storage storage::allocate_device(ScopedDevice &device, + iree_device_size_t allocation_size) { if (!device.raw_device()) { throw std::invalid_argument("Cannot allocate with a null device affinity"); } @@ -54,8 +54,8 @@ storage storage::AllocateDevice(ScopedDevice &device, device.scope().NewTimelineResource()); } -storage storage::AllocateHost(ScopedDevice &device, - iree_device_size_t allocation_size) { +storage storage::allocate_host(ScopedDevice &device, + iree_device_size_t allocation_size) { if (!device.raw_device()) { throw std::invalid_argument("Cannot allocate with a null device affinity"); } @@ -64,7 +64,8 @@ storage storage::AllocateHost(ScopedDevice &device, iree_hal_buffer_params_t params = { .usage = IREE_HAL_BUFFER_USAGE_MAPPING, .access = IREE_HAL_MEMORY_ACCESS_ALL, - .type = IREE_HAL_MEMORY_TYPE_OPTIMAL_FOR_HOST, + .type = IREE_HAL_MEMORY_TYPE_OPTIMAL_FOR_HOST | + IREE_HAL_MEMORY_TYPE_DEVICE_VISIBLE, .queue_affinity = device.affinity().queue_affinity(), }; if (device.affinity().queue_affinity() != 0) { @@ -76,7 +77,7 @@ storage storage::AllocateHost(ScopedDevice &device, device.scope().NewTimelineResource()); } -storage storage::Subspan(iree_device_size_t byte_offset, +storage storage::subspan(iree_device_size_t byte_offset, iree_device_size_t byte_length) { storage new_storage(device_, {}, timeline_resource_); SHORTFIN_THROW_IF_ERROR(iree_hal_buffer_subspan( @@ -84,7 +85,7 @@ storage storage::Subspan(iree_device_size_t byte_offset, return new_storage; } -void storage::Fill(const void *pattern, iree_host_size_t pattern_length) { +void storage::fill(const void *pattern, iree_host_size_t pattern_length) { device_.scope().scheduler().AppendCommandBuffer( device_, TransactionType::TRANSFER, [&](Account &account) { // Must depend on all of this buffer's use dependencies to avoid @@ -94,9 +95,8 @@ void storage::Fill(const void *pattern, iree_host_size_t pattern_length) { // write-after-write hazard. account.active_deps_extend(timeline_resource_->mutation_barrier()); - // TODO: I need to join the submission dependencies on the account - // with the timeline resource idle fence to ensure that - // write-after-access is properly sequenced. + SHORTFIN_SCHED_LOG(" : FillBuffer({})", + static_cast(buffer_.get())); SHORTFIN_THROW_IF_ERROR(iree_hal_command_buffer_fill_buffer( account.active_command_buffer(), iree_hal_make_buffer_ref( @@ -111,7 +111,7 @@ void storage::Fill(const void *pattern, iree_host_size_t pattern_length) { }); } -void storage::CopyFrom(storage &source_storage) { +void storage::copy_from(storage &source_storage) { device_.scope().scheduler().AppendCommandBuffer( device_, TransactionType::TRANSFER, [&](Account &account) { // Must depend on the source's mutation dependencies to avoid @@ -122,6 +122,9 @@ void storage::CopyFrom(storage &source_storage) { account.active_deps_extend(timeline_resource_->use_barrier()); account.active_deps_extend(timeline_resource_->mutation_barrier()); + SHORTFIN_SCHED_LOG(" : CopyBuffer({} -> {})", + static_cast(source_storage.buffer_.get()), + static_cast(buffer_.get())); SHORTFIN_THROW_IF_ERROR(iree_hal_command_buffer_copy_buffer( account.active_command_buffer(), /*source_ref=*/ @@ -129,10 +132,13 @@ void storage::CopyFrom(storage &source_storage) { /*target_ref=*/ iree_hal_make_buffer_ref(buffer_, 0, byte_length()))); - // And move our own mutation barrier to the current pending timeline + // Move our own mutation barrier to the current pending timeline // value. timeline_resource_->set_mutation_barrier( account.timeline_sem(), account.timeline_idle_timepoint()); + // And extend the source use barrier. + source_storage.timeline_resource_->use_barrier_insert( + account.timeline_sem(), account.timeline_idle_timepoint()); }); } @@ -150,7 +156,7 @@ bool storage::is_mappable_for_read_write() const { (IREE_HAL_MEMORY_ACCESS_READ | IREE_HAL_MEMORY_ACCESS_WRITE)); } -void storage::MapExplicit(mapping &mapping, iree_hal_memory_access_t access) { +void storage::map_explicit(mapping &mapping, iree_hal_memory_access_t access) { assert(access != IREE_HAL_MEMORY_ACCESS_NONE); mapping.reset(); SHORTFIN_THROW_IF_ERROR(iree_hal_buffer_map_range( diff --git a/libshortfin/src/shortfin/array/storage.h b/libshortfin/src/shortfin/array/storage.h index 0db73d28f..ab3363a93 100644 --- a/libshortfin/src/shortfin/array/storage.h +++ b/libshortfin/src/shortfin/array/storage.h @@ -80,30 +80,30 @@ class SHORTFIN_API storage { // Allocates device storage, compatible with the given device affinity. // By default, this will be IREE_HAL_MEMORY_TYPE_OPTIMAL_FOR_DEVICE. - static storage AllocateDevice(local::ScopedDevice &device, - iree_device_size_t allocation_size); + static storage allocate_device(local::ScopedDevice &device, + iree_device_size_t allocation_size); // Allocates host storage, compatible with the given device affinity. // By default, if there are any affinity bits set in the device, then // the storage will be device visible and have permitted usage for // transfers. This default policy can be overriden based on device defaults // or explicit options. - static storage AllocateHost(local::ScopedDevice &device, - iree_device_size_t allocation_size); + static storage allocate_host(local::ScopedDevice &device, + iree_device_size_t allocation_size); // Creates a subspan view of the current storage given a byte offset and // length. The returned storage shares the underlying allocation and // scheduling control block. - storage Subspan(iree_device_size_t byte_offset, + storage subspan(iree_device_size_t byte_offset, iree_device_size_t byte_length); // Enqueues a fill of the storage with an arbitrary pattern of the given // size. The pattern size must be 1, 2, or 4. - void Fill(const void *pattern, iree_host_size_t pattern_length); + void fill(const void *pattern, iree_host_size_t pattern_length); // Performs either a d2h, h2d or d2d transfer from a source storage to this // storage. - void CopyFrom(storage &source_storage); + void copy_from(storage &source_storage); iree_device_size_t byte_length() const { return iree_hal_buffer_byte_length(buffer_.get()); @@ -124,33 +124,33 @@ class SHORTFIN_API storage { bool is_mappable_for_read_write() const; // Maps the memory for access from a host pointer using a scoped mapping. - void MapExplicit(mapping &mapping, iree_hal_memory_access_t access); + void map_explicit(mapping &mapping, iree_hal_memory_access_t access); // Maps the memory for read/write access, preserving any contents. - mapping MapReadWrite() { + mapping map_read_write() { mapping m; - MapExplicit(m, IREE_HAL_MEMORY_ACCESS_READ | IREE_HAL_MEMORY_ACCESS_WRITE); + map_explicit(m, IREE_HAL_MEMORY_ACCESS_READ | IREE_HAL_MEMORY_ACCESS_WRITE); return m; } // Maps the memory for discard write. This is used if populating an initial // buffer. - mapping MapWriteDiscard() { + mapping map_write_discard() { mapping m; - MapExplicit(m, IREE_HAL_MEMORY_ACCESS_DISCARD_WRITE); + map_explicit(m, IREE_HAL_MEMORY_ACCESS_DISCARD_WRITE); return m; } // Maps the memory for read-only access. - mapping MapRead() { + mapping map_read() { mapping m; - MapExplicit(m, IREE_HAL_MEMORY_ACCESS_READ); + map_explicit(m, IREE_HAL_MEMORY_ACCESS_READ); return m; } - const mapping MapRead() const { + const mapping map_read() const { mapping m; - const_cast(this)->MapExplicit(m, IREE_HAL_MEMORY_ACCESS_READ); + const_cast(this)->map_explicit(m, IREE_HAL_MEMORY_ACCESS_READ); return m; } diff --git a/libshortfin/src/shortfin/local/messaging.h b/libshortfin/src/shortfin/local/messaging.h index f006775fe..fc1f3173b 100644 --- a/libshortfin/src/shortfin/local/messaging.h +++ b/libshortfin/src/shortfin/local/messaging.h @@ -121,9 +121,9 @@ class SHORTFIN_API Message { // sized field that the allocator can use at it sees fit. Both fields // are managed within a lock_ scope and are optimized for single threaded // access and cross-thread transfers with coarse references. + mutable iree::slim_mutex lock_; mutable intptr_t ref_data_ = 1; mutable detail::MessageRefOwner owner_; - mutable iree::slim_mutex lock_; friend struct detail::MessageRefOwner; }; diff --git a/libshortfin/src/shortfin/local/program.cc b/libshortfin/src/shortfin/local/program.cc index cba725096..527c97d0c 100644 --- a/libshortfin/src/shortfin/local/program.cc +++ b/libshortfin/src/shortfin/local/program.cc @@ -10,11 +10,49 @@ #include "fmt/std.h" #include "iree/vm/bytecode/module.h" #include "shortfin/local/system.h" +#include "shortfin/support/logging.h" namespace shortfin::local { -ProgramModule ProgramModule::Load(System& system, - const std::filesystem::path& path, +namespace { +void GetVmModuleExports(iree_vm_module_t *vm_module, + std::vector &exports) { + auto sig = iree_vm_module_signature(vm_module); + for (iree_host_size_t i = 0; i < sig.export_function_count; ++i) { + iree_vm_function_t f; + SHORTFIN_THROW_IF_ERROR(iree_vm_module_lookup_function_by_ordinal( + vm_module, IREE_VM_FUNCTION_LINKAGE_EXPORT, i, &f)); + exports.emplace_back(to_string_view(iree_vm_function_name(&f))); + } +} +} // namespace + +// -------------------------------------------------------------------------- // +// ProgramFunction +// -------------------------------------------------------------------------- // + +std::string_view ProgramFunction::name() const { + if (!*this) return {}; + return to_string_view(iree_vm_function_name(&vm_function_)); +} + +std::string_view ProgramFunction::calling_convention() const { + if (!*this) return {}; + return to_string_view( + iree_vm_function_signature(&vm_function_).calling_convention); +} + +std::string ProgramFunction::to_s() const { + if (!*this) return std::string("ProgramFunction(NULL)"); + return fmt::format("ProgramFunction({}: {})", name(), calling_convention()); +} + +// -------------------------------------------------------------------------- // +// ProgramModule +// -------------------------------------------------------------------------- // + +ProgramModule ProgramModule::Load(System &system, + const std::filesystem::path &path, bool mmap) { iree::file_contents_ptr contents; iree_file_read_flags_t flags = @@ -53,4 +91,124 @@ std::string ProgramModule::to_s() const { sig.version, fmt::join(exports, ", ")); } +std::vector ProgramModule::exports() const { + std::vector exports; + GetVmModuleExports(vm_module_, exports); + return exports; +} + +// -------------------------------------------------------------------------- // +// Program +// -------------------------------------------------------------------------- // + +std::optional Program::LookupFunction(std::string_view name) { + iree_vm_function_t f; + iree_status_t status = iree_vm_context_resolve_function( + vm_context_, to_iree_string_view(name), &f); + if (iree_status_is_not_found(status)) return {}; + SHORTFIN_THROW_IF_ERROR(status); + return ProgramFunction(vm_context_, f); +} + +ProgramFunction Program::LookupRequiredFunction(std::string_view name) { + auto f = LookupFunction(name); + if (!f) { + throw std::invalid_argument( + fmt::format("Function '{}' not found in program. Available exports: {}", + name, fmt::join(exports(), ", "))); + } + return std::move(*f); +} + +std::vector Program::exports() const { + std::vector results; + + // Iterate in reverse since "user modules" are typically last. + int module_count = iree_vm_context_module_count(vm_context_); + for (int i = module_count - 1; i >= 0; --i) { + auto vm_module = iree_vm_context_module_at(vm_context_, i); + std::string_view module_name = + to_string_view(iree_vm_module_name(vm_module)); + std::vector names; + GetVmModuleExports(vm_module, names); + for (auto &name : names) { + results.push_back(fmt::format("{}.{}", module_name, name)); + } + } + return results; +} + +// -------------------------------------------------------------------------- // +// Invocation +// -------------------------------------------------------------------------- // + +void Invocation::Deleter::operator()(Invocation *inst) { + uint8_t *memory = static_cast(static_cast(inst)); + + // Trailing arg list and result list. The arg list pointer is only available + // at construction, so we use the knowledge that it is stored right after + // the object. The result_list_ is available for the life of the invocation. + iree_vm_list_deinitialize(static_cast( + static_cast(memory + sizeof(Invocation)))); + iree_vm_list_deinitialize(inst->result_list_); + + // Was allocated in New as a uint8_t[] so delete it by whence it came. + delete[] memory; +} + +Invocation::Invocation() = default; +Invocation::~Invocation() { + if (!scheduled()) { + // This instance was dropped on the floor before scheduling. + // Clean up the initialization parameters. + iree::vm_context_ptr drop = + iree::vm_context_ptr::steal_reference(state.params.context); + } +} + +Invocation::Ptr Invocation::New(iree::vm_context_ptr vm_context, + iree_vm_function_t &vm_function) { + auto sig = iree_vm_function_signature(&vm_function); + iree_host_size_t arg_count; + iree_host_size_t result_count; + SHORTFIN_THROW_IF_ERROR(iree_vm_function_call_count_arguments_and_results( + &sig, &arg_count, &result_count)); + + // Compute size of trailing arg/result storage. + auto variant_type_def = iree_vm_make_undefined_type_def(); + iree_host_size_t arg_storage_size = + iree_vm_list_storage_size(&variant_type_def, arg_count); + iree_host_size_t result_storage_size = + iree_vm_list_storage_size(&variant_type_def, result_count); + + // Allocate storage for the Invocation, arg, result list and placement new + // the Invocation into the storage area. + std::unique_ptr inst_storage( + new uint8_t[sizeof(Invocation) + arg_storage_size + result_storage_size]); + new (inst_storage.get()) Invocation(); + + // Initialize trailing lists. Abort on failure since this is a bug and we + // would otherwise leak. + iree_vm_list_t *arg_list; + iree_vm_list_t *result_list; + IREE_CHECK_OK( + iree_vm_list_initialize({.data = inst_storage.get() + sizeof(Invocation), + .data_length = arg_storage_size}, + &variant_type_def, arg_count, &arg_list)); + IREE_CHECK_OK(iree_vm_list_initialize( + {.data = inst_storage.get() + sizeof(Invocation) + arg_storage_size, + .data_length = result_storage_size}, + &variant_type_def, arg_count, &result_list)); + + Ptr inst( + static_cast(static_cast(inst_storage.release())), + Deleter()); + inst->state.params.context = + vm_context.release(); // Ref transfer to Invocation. + inst->state.params.function = vm_function; + inst->state.params.arg_list = arg_list; + inst->result_list_ = result_list; + return inst; +} + } // namespace shortfin::local diff --git a/libshortfin/src/shortfin/local/program.h b/libshortfin/src/shortfin/local/program.h index 40637a768..6ee4cd5f7 100644 --- a/libshortfin/src/shortfin/local/program.h +++ b/libshortfin/src/shortfin/local/program.h @@ -8,8 +8,12 @@ #define SHORTFIN_LOCAL_PROGRAM_H #include +#include #include +#include +#include "shortfin/local/async.h" +#include "shortfin/local/worker.h" #include "shortfin/support/api.h" #include "shortfin/support/iree_helpers.h" @@ -17,6 +21,31 @@ namespace shortfin::local { class SHORTFIN_API System; +// References a function in a Program. +class SHORTFIN_API ProgramFunction { + public: + ProgramFunction(iree::vm_context_ptr vm_context, + iree_vm_function_t vm_function) + : vm_context_(std::move(vm_context)), vm_function_(vm_function) {} + + operator bool() const { return vm_context_; } + + std::string_view name() const; + std::string_view calling_convention() const; + + std::string to_s() const; + + operator iree_vm_context_t *() { return vm_context_.get(); } + operator iree_vm_function_t &() { return vm_function_; } + + private: + // The context that this function was resolved against. + iree::vm_context_ptr vm_context_; + iree_vm_function_t vm_function_; + + friend class Program; +}; + // High level API for working with program modules. Think of a module as // a shared library in a traditional Unix system: // @@ -36,13 +65,16 @@ class SHORTFIN_API System; class SHORTFIN_API ProgramModule { public: std::string to_s() const; - iree_vm_module_t* vm_module() const { return vm_module_; } + iree_vm_module_t *vm_module() const { return vm_module_; } std::string_view name() const; // Loads a dynamic bytecode module (VMFB) from a path on the file system. - static ProgramModule Load(System& system, const std::filesystem::path& path, + static ProgramModule Load(System &system, const std::filesystem::path &path, bool mmap = true); + // Gets the name of all exported functions. + std::vector exports() const; + protected: explicit ProgramModule(iree::vm_module_ptr vm_module) : vm_module_(std::move(vm_module)) {} @@ -70,13 +102,79 @@ class SHORTFIN_API Program { bool trace_execution = false; }; + // Looks up a public function by fully qualified name (i.e. module.function). + // Returns nothing if not found. + std::optional LookupFunction(std::string_view name); + + // Looks up a public function by fully qualified name, throwing an + // invalid_argument exception on failure to find. + ProgramFunction LookupRequiredFunction(std::string_view name); + + // Gets the name of all exported functions. + std::vector exports() const; + private: - explicit Program(iree::vm_context_ptr context) - : context_(std::move(context)) {} - iree::vm_context_ptr context_; + explicit Program(iree::vm_context_ptr vm_context) + : vm_context_(std::move(vm_context)) {} + iree::vm_context_ptr vm_context_; friend class Scope; }; +// State related to making an invocation of a function on a program. +// +// Since ownership of this object is transferred to the loop/callback and +// internal pointers into it must remain stable, it is only valid to heap +// allocate it. +class SHORTFIN_API Invocation { + struct Deleter { + void operator()(Invocation *); + }; + + public: + using Ptr = std::unique_ptr; + static_assert(sizeof(Ptr) == sizeof(void *)); + using Future = TypedFuture; + + Ptr New(iree::vm_context_ptr vm_context, iree_vm_function_t &vm_function); + Invocation(const Invocation &) = delete; + Invocation &operator=(const Invocation &) = delete; + Invocation &operator=(Invocation &&) = delete; + Invocation(Invocation &&inv) = delete; + ~Invocation(); + + // Whether the Invocation has entered the scheduled state. Once scheduled, + // arguments and initialization parameters can no longer be accessed. + bool scheduled() const { return static_cast(future_); } + + // Transfers ownership of an invocation and schedules it on worker, returning + // a future that will resolve to the owned invocation upon completion. + static Future Invoke(Ptr invocation, Worker &worker); + + private: + Invocation(); + + // Parameters needed to make the async call are stored at construction time + // up until the point the call is made in the params union. When invoking, + // these will be copied to the stack and passed to the async invocation, + // which initializes the async_invoke_state. Phasing it like this saves + // hundreds of bytes of redundant storage. + struct Params { + // Context is retained upon construction and released when scheduled. + iree_vm_context_t *context; + iree_vm_function_t function; + iree_vm_list_t *arg_list = nullptr; + }; + union State { + State() {} + ~State() {} + Params params; + iree_vm_async_invoke_state_t async_invoke_state; + } state; + + iree_vm_list_t *result_list_ = nullptr; + std::optional future_; +}; + } // namespace shortfin::local #endif // SHORTFIN_LOCAL_PROGRAM_H diff --git a/libshortfin/src/shortfin/local/scheduler.cc b/libshortfin/src/shortfin/local/scheduler.cc index c5a9fc062..4c624fb36 100644 --- a/libshortfin/src/shortfin/local/scheduler.cc +++ b/libshortfin/src/shortfin/local/scheduler.cc @@ -12,6 +12,26 @@ namespace shortfin::local::detail { +namespace { + +std::string SummarizeFence(iree_hal_fence_t *fence) { + if (!SHORTFIN_SCHED_LOG_ENABLED) { + return std::string(); + } + std::string result("fence("); + iree_hal_semaphore_list_t list = iree_hal_fence_semaphore_list(fence); + for (iree_host_size_t i = 0; i < list.count; ++i) { + if (i > 0) result.append(", "); + result.append(fmt::format("[{}@{}]", + static_cast(list.semaphores[i]), + list.payload_values[i])); + } + result.append(")"); + return result; +} + +} // namespace + // -------------------------------------------------------------------------- // // Account // -------------------------------------------------------------------------- // @@ -30,9 +50,6 @@ void Account::Initialize() { void Account::Reset() { active_tx_type_ = TransactionType::NONE; - // if (active_command_buffer_) { - // iree_hal_command_buffer_end(active_command_buffer_); - // } active_command_buffer_.reset(); } @@ -55,8 +72,12 @@ CompletionEvent Account::OnSync() { iree::shared_event::ref satisfied(false); iree::hal_semaphore_ptr sem = sem_; auto idle_timepoint = idle_timepoint_; + SHORTFIN_SCHED_LOG("OnSync::Wait({}@{})", static_cast(sem.get()), + idle_timepoint); scheduler_.system().blocking_executor().Schedule( [sem = std::move(sem), idle_timepoint, satisfied]() { + SHORTFIN_SCHED_LOG("OnSync::Complete({}@{})", + static_cast(sem.get()), idle_timepoint); iree_status_t status = iree_hal_semaphore_wait( sem, idle_timepoint, iree_infinite_timeout()); IREE_CHECK_OK(status); @@ -140,8 +161,10 @@ void Scheduler::AppendCommandBuffer(ScopedDevice &device, TransactionType tx_type, std::function callback) { Account &account = GetDefaultAccount(device); - auto needed_affinity_bits = device.affinity().queue_affinity(); + SHORTFIN_SCHED_LOG( + "AppendCommandBuffer(account=0x{:x}, tx_type={}, queue_affinity={}):", + account.id(), static_cast(tx_type), needed_affinity_bits); // Initialize a fresh command buffer if needed. if (!account.active_command_buffer_) { @@ -181,6 +204,11 @@ void Scheduler::AppendCommandBuffer(ScopedDevice &device, account.active_deps_ = std::move(new_active_deps); account.active_command_buffer_ = std::move(new_cb); account.idle_timepoint_ += 1; + SHORTFIN_SCHED_LOG( + " : New command buffer (category={}, idle_timepoint={})", category, + account.idle_timepoint_); + } else { + SHORTFIN_SCHED_LOG(" : Continue active command buffer"); } // Perform the mutation. @@ -199,13 +227,21 @@ void Scheduler::Flush() { // from idle to active. for (Account &account : accounts_) { if (!account.active_command_buffer_) continue; - iree_hal_semaphore_t *signal_sem = account.sem_; uint64_t signal_timepoint = account.idle_timepoint_; iree_hal_command_buffer_t *active_command_buffer = account.active_command_buffer_; iree_hal_buffer_binding_table_t binding_tables = iree_hal_buffer_binding_table_empty(); + + SHORTFIN_SCHED_LOG( + "Flush command buffer (account=0x{:x}, queue_affinity={}, " + "signal_timepoint={}, deps={})", + account.id(), account.active_queue_affinity_bits_, signal_timepoint, + SummarizeFence(account.active_deps_)); + + // End recording and submit. + SHORTFIN_THROW_IF_ERROR(iree_hal_command_buffer_end(active_command_buffer)); SHORTFIN_THROW_IF_ERROR(iree_hal_device_queue_execute( account.hal_device(), /*queue_affinity=*/account.active_queue_affinity_bits_, diff --git a/libshortfin/src/shortfin/local/scheduler.h b/libshortfin/src/shortfin/local/scheduler.h index 2f606ced3..fa9bec237 100644 --- a/libshortfin/src/shortfin/local/scheduler.h +++ b/libshortfin/src/shortfin/local/scheduler.h @@ -175,6 +175,9 @@ class SHORTFIN_API Account { Device *device() const { return device_; } iree_hal_device_t *hal_device() { return hal_device_; } size_t semaphore_count() const { return 1; } + // Gets a unique integer id for this account. Currently just the address of + // the sem, but can be derived from any owned entity. + uintptr_t id() const { return reinterpret_cast(sem_.get()); } // Accesses the active command buffer. This will only be non-null if a // pending transaction has been set up (i.e. via AppendCommandBuffer). diff --git a/libshortfin/src/shortfin/local/scope.h b/libshortfin/src/shortfin/local/scope.h index 0cb566b89..bd3423885 100644 --- a/libshortfin/src/shortfin/local/scope.h +++ b/libshortfin/src/shortfin/local/scope.h @@ -140,6 +140,8 @@ class SHORTFIN_API Scope : public std::enable_shared_from_this { Program LoadUnboundProgram(std::span modules, Program::Options options = {}); + void Invoke(ProgramFunction &function); + private: void AddDevice(std::string_view device_class, Device *device); void Initialize(); // Called after all devices are added. diff --git a/libshortfin/src/shortfin/support/iree_concurrency.h b/libshortfin/src/shortfin/support/iree_concurrency.h index 28ef1e99b..be3e42742 100644 --- a/libshortfin/src/shortfin/support/iree_concurrency.h +++ b/libshortfin/src/shortfin/support/iree_concurrency.h @@ -16,21 +16,7 @@ namespace shortfin::iree { -namespace detail { -struct thread_ptr_helper { - static void steal(iree_thread_t *obj) { LogIREESteal("iree_thread_t", obj); } - static void retain(iree_thread_t *obj) { - LogIREERetain("iree_thread_t", obj); - iree_thread_retain(obj); - } - static void release(iree_thread_t *obj) { - LogIREERelease("iree_thread_t", obj); - iree_thread_release(obj); - } -}; -}; // namespace detail - -using thread_ptr = object_ptr; +SHORTFIN_IREE_DEF_PTR(thread); // Wraps an iree::slim_mutex as an RAII object. class slim_mutex { diff --git a/libshortfin/src/shortfin/support/iree_helpers.h b/libshortfin/src/shortfin/support/iree_helpers.h index c77ddbaa8..de2c6435a 100644 --- a/libshortfin/src/shortfin/support/iree_helpers.h +++ b/libshortfin/src/shortfin/support/iree_helpers.h @@ -32,6 +32,10 @@ inline std::string_view to_string_view(iree_string_view_t isv) { return std::string_view(isv.data, isv.size); } +inline iree_string_view_t to_iree_string_view(std::string_view sv) { + return iree_make_string_view(sv.data(), sv.size()); +} + namespace iree { // -------------------------------------------------------------------------- // @@ -52,132 +56,6 @@ inline void LogIREESteal(const char *type_name, void *ptr) {} inline void LogLiveRefs() {} #endif -struct hal_buffer_ptr_helper { - static void steal(iree_hal_buffer_t *obj) { - LogIREESteal("iree_hal_buffer_t", obj); - } - static void retain(iree_hal_buffer_t *obj) { - LogIREERetain("iree_hal_buffer_t", obj); - iree_hal_buffer_retain(obj); - } - static void release(iree_hal_buffer_t *obj) { - LogIREERelease("iree_hal_buffer_t", obj); - iree_hal_buffer_release(obj); - } -}; - -struct hal_command_buffer_helper { - static void steal(iree_hal_command_buffer_t *obj) { - LogIREESteal("iree_hal_command_buffer_t", obj); - } - static void retain(iree_hal_command_buffer_t *obj) { - LogIREERetain("iree_hal_command_buffer_t", obj); - iree_hal_command_buffer_retain(obj); - } - static void release(iree_hal_command_buffer_t *obj) { - LogIREERelease("iree_hal_command_buffer_t", obj); - iree_hal_command_buffer_release(obj); - } -}; - -struct hal_device_ptr_helper { - static void steal(iree_hal_device_t *obj) { - LogIREESteal("iree_hal_device_t", obj); - } - static void retain(iree_hal_device_t *obj) { - LogIREERetain("iree_hal_device_t", obj); - iree_hal_device_retain(obj); - } - static void release(iree_hal_device_t *obj) { - LogIREERelease("iree_hal_device_t", obj); - iree_hal_device_release(obj); - } -}; - -struct hal_driver_ptr_helper { - static void steal(iree_hal_driver_t *obj) { - LogIREESteal("iree_hal_driver_t", obj); - } - static void retain(iree_hal_driver_t *obj) { - LogIREERetain("iree_hal_driver_t", obj); - iree_hal_driver_retain(obj); - } - static void release(iree_hal_driver_t *obj) { - LogIREERelease("iree_hal_driver_t", obj); - iree_hal_driver_release(obj); - } -}; - -struct hal_fence_ptr_helper { - static void steal(iree_hal_fence_t *obj) { - LogIREESteal("iree_hal_fence_t", obj); - } - static void retain(iree_hal_fence_t *obj) { - LogIREERetain("iree_hal_fence_t", obj); - iree_hal_fence_retain(obj); - } - static void release(iree_hal_fence_t *obj) { - LogIREERelease("iree_hal_fence_t", obj); - iree_hal_fence_release(obj); - } -}; - -struct hal_semaphore_ptr_helper { - static void steal(iree_hal_semaphore_t *obj) { - LogIREESteal("iree_hal_semaphore_t", obj); - } - static void retain(iree_hal_semaphore_t *obj) { - LogIREERetain("iree_hal_semaphore_t", obj); - iree_hal_semaphore_retain(obj); - } - static void release(iree_hal_semaphore_t *obj) { - LogIREERelease("iree_hal_semaphore_t", obj); - iree_hal_semaphore_release(obj); - } -}; - -struct vm_context_ptr_helper { - static void steal(iree_vm_context_t *obj) { - LogIREESteal("iree_vm_context_t", obj); - } - static void retain(iree_vm_context_t *obj) { - LogIREERetain("iree_vm_context_t", obj); - iree_vm_context_retain(obj); - } - static void release(iree_vm_context_t *obj) { - LogIREERelease("iree_vm_context_t", obj); - iree_vm_context_release(obj); - } -}; - -struct vm_instance_ptr_helper { - static void steal(iree_vm_instance_t *obj) { - LogIREESteal("iree_vm_instance_t", obj); - } - static void retain(iree_vm_instance_t *obj) { - LogIREERetain("iree_vm_instance_t", obj); - iree_vm_instance_retain(obj); - } - static void release(iree_vm_instance_t *obj) { - LogIREERelease("iree_vm_instance_t", obj); - iree_vm_instance_release(obj); - } -}; - -struct vm_module_ptr_helper { - static void steal(iree_vm_module_t *obj) { - LogIREESteal("iree_vm_module_t", obj); - } - static void retain(iree_vm_module_t *obj) { - LogIREERetain("iree_vm_module_t", obj); - iree_vm_module_retain(obj); - } - static void release(iree_vm_module_t *obj) { - LogIREERelease("iree_vm_module_t", obj); - iree_vm_module_release(obj); - } -}; - }; // namespace detail // Wraps an IREE retain/release style object pointer in a smart-pointer @@ -261,24 +139,38 @@ class object_ptr { friend class Assignment; }; -using hal_buffer_ptr = - object_ptr; -using hal_command_buffer_ptr = - object_ptr; -using hal_driver_ptr = - object_ptr; -using hal_device_ptr = - object_ptr; -using hal_fence_ptr = - object_ptr; -using hal_semaphore_ptr = - object_ptr; -using vm_context_ptr = - object_ptr; -using vm_instance_ptr = - object_ptr; -using vm_module_ptr = - object_ptr; +// Defines a reference counting helper struct named like +// iree_hal_buffer_ptr_helper (for type_stem == hal_buffer). +// These must be defined in the shortfin::iree::detail namespace. +#define SHORTFIN_IREE_DEF_PTR(type_stem) \ + namespace detail { \ + struct type_stem##_ptr_helper { \ + static void steal(iree_##type_stem##_t *obj) { \ + LogIREESteal(#type_stem "_t", obj); \ + } \ + static void retain(iree_##type_stem##_t *obj) { \ + LogIREERetain(#type_stem "_t", obj); \ + iree_##type_stem##_retain(obj); \ + } \ + static void release(iree_##type_stem##_t *obj) { \ + LogIREERelease(#type_stem "_t", obj); \ + iree_##type_stem##_release(obj); \ + } \ + }; \ + } \ + using type_stem##_ptr = \ + object_ptr + +SHORTFIN_IREE_DEF_PTR(hal_command_buffer); +SHORTFIN_IREE_DEF_PTR(hal_buffer); +SHORTFIN_IREE_DEF_PTR(hal_device); +SHORTFIN_IREE_DEF_PTR(hal_driver); +SHORTFIN_IREE_DEF_PTR(hal_fence); +SHORTFIN_IREE_DEF_PTR(hal_semaphore); +SHORTFIN_IREE_DEF_PTR(vm_context); +SHORTFIN_IREE_DEF_PTR(vm_instance); +SHORTFIN_IREE_DEF_PTR(vm_list); +SHORTFIN_IREE_DEF_PTR(vm_module); // Holds a pointer allocated by some allocator, deleting it if still owned // at destruction time. diff --git a/libshortfin/src/shortfin/support/logging.h b/libshortfin/src/shortfin/support/logging.h index 337ebacae..298518f9b 100644 --- a/libshortfin/src/shortfin/support/logging.h +++ b/libshortfin/src/shortfin/support/logging.h @@ -13,6 +13,10 @@ #define SHORTFIN_LOG_LIFETIMES 0 #endif +// Scheduler logging. +#define SHORTFIN_SCHED_LOG_ENABLED 1 +#define SHORTFIN_SCHED_LOG(...) shortfin::logging::info("SCHED: " __VA_ARGS__) + namespace shortfin::logging { // TODO: Re-export doesn't really work like this. Need to define API