Skip to content

[SYCL] Support device_global in SYCLBIN kernel_bundle #19164

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 6 commits into
base: sycl
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 9 additions & 3 deletions sycl/source/detail/context_impl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -340,9 +340,9 @@ void context_impl::addDeviceGlobalInitializer(
}
}

std::vector<ur_event_handle_t>
context_impl::initializeDeviceGlobals(ur_program_handle_t NativePrg,
queue_impl &QueueImpl) {
std::vector<ur_event_handle_t> context_impl::initializeDeviceGlobals(
ur_program_handle_t NativePrg, queue_impl &QueueImpl,
detail::kernel_bundle_impl *KernelBundleImplPtr) {
if (!MDeviceGlobalNotInitializedCnt.load(std::memory_order_acquire))
return {};

Expand Down Expand Up @@ -396,6 +396,12 @@ context_impl::initializeDeviceGlobals(ur_program_handle_t NativePrg,
detail::ProgramManager::getInstance().getDeviceGlobalEntries(
DeviceGlobalIds,
/*ExcludeDeviceImageScopeDecorated=*/true);
// Kernel bundles may have isolated device globals. They need to be
// initialized too.
if (KernelBundleImplPtr && KernelBundleImplPtr->getDeviceGlobalMap().size())
KernelBundleImplPtr->getDeviceGlobalMap().getEntries(
DeviceGlobalIds, /*ExcludeDeviceImageScopeDecorated=*/true,
DeviceGlobalEntries);

// If there were no device globals without device_image_scope the device
// globals are trivially fully initialized and we can end early.
Expand Down
3 changes: 2 additions & 1 deletion sycl/source/detail/context_impl.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -223,7 +223,8 @@ class context_impl : public std::enable_shared_from_this<context_impl> {

/// Initializes device globals for a program on the associated queue.
std::vector<ur_event_handle_t>
initializeDeviceGlobals(ur_program_handle_t NativePrg, queue_impl &QueueImpl);
initializeDeviceGlobals(ur_program_handle_t NativePrg, queue_impl &QueueImpl,
detail::kernel_bundle_impl *KernelBundleImplPtr);

void memcpyToHostOnlyDeviceGlobal(device_impl &DeviceImpl,
const void *DeviceGlobalPtr,
Expand Down
52 changes: 37 additions & 15 deletions sycl/source/detail/device_global_map.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -16,16 +16,30 @@
#include <detail/device_global_map_entry.hpp>
#include <sycl/detail/defines_elementary.hpp>
#include <sycl/detail/kernel_name_str_t.hpp>
#include <sycl/kernel_bundle.hpp>

namespace sycl {
inline namespace _V1 {
namespace detail {

class DeviceGlobalMap {
public:
DeviceGlobalMap(bool OwnerControlledCleanup)
: MOwnerControlledCleanup{OwnerControlledCleanup} {}

~DeviceGlobalMap() {
if (!MOwnerControlledCleanup)
for (auto &DeviceGlobalIt : MDeviceGlobals)
DeviceGlobalIt.second->cleanup();
}

void initializeEntries(const RTDeviceBinaryImage *Img) {
const auto &DeviceGlobals = Img->getDeviceGlobals();
std::lock_guard<std::mutex> DeviceGlobalsGuard(MDeviceGlobalsMutex);
initializeEntriesLockless(Img);
}

void initializeEntriesLockless(const RTDeviceBinaryImage *Img) {
const auto &DeviceGlobals = Img->getDeviceGlobals();
for (const sycl_device_binary_property &DeviceGlobal : DeviceGlobals) {
ByteArray DeviceGlobalInfo =
DeviceBinaryProperty(DeviceGlobal).asByteArray();
Expand Down Expand Up @@ -102,9 +116,16 @@ class DeviceGlobalMap {
return Entry->second;
}

DeviceGlobalMapEntry *tryGetEntry(const std::string &UniqueId,
bool ExcludeDeviceImageScopeDecorated) {
DeviceGlobalMapEntry *
tryGetEntry(const std::string &UniqueId,
bool ExcludeDeviceImageScopeDecorated = false) {
std::lock_guard<std::mutex> DeviceGlobalsGuard(MDeviceGlobalsMutex);
return tryGetEntryLockless(UniqueId, ExcludeDeviceImageScopeDecorated);
}

DeviceGlobalMapEntry *
tryGetEntryLockless(const std::string &UniqueId,
bool ExcludeDeviceImageScopeDecorated = false) const {
auto DeviceGlobalEntry = MDeviceGlobals.find(UniqueId);
if (DeviceGlobalEntry != MDeviceGlobals.end() &&
(!ExcludeDeviceImageScopeDecorated ||
Expand All @@ -113,22 +134,17 @@ class DeviceGlobalMap {
return nullptr;
}

std::vector<DeviceGlobalMapEntry *>
getEntries(const std::vector<std::string> &UniqueIds,
bool ExcludeDeviceImageScopeDecorated) {
std::vector<DeviceGlobalMapEntry *> FoundEntries;
FoundEntries.reserve(UniqueIds.size());

void getEntries(const std::vector<std::string> &UniqueIds,
bool ExcludeDeviceImageScopeDecorated,
std::vector<DeviceGlobalMapEntry *> &OutVec) {
std::lock_guard<std::mutex> DeviceGlobalsGuard(MDeviceGlobalsMutex);
for (const std::string &UniqueId : UniqueIds) {
auto DeviceGlobalEntry = MDeviceGlobals.find(UniqueId);
assert(DeviceGlobalEntry != MDeviceGlobals.end() &&
"Device global not found in map.");
if (!ExcludeDeviceImageScopeDecorated ||
!DeviceGlobalEntry->second->MIsDeviceImageScopeDecorated)
FoundEntries.push_back(DeviceGlobalEntry->second.get());
if (DeviceGlobalEntry != MDeviceGlobals.end() &&
(!ExcludeDeviceImageScopeDecorated ||
!DeviceGlobalEntry->second->MIsDeviceImageScopeDecorated))
OutVec.push_back(DeviceGlobalEntry->second.get());
}
return FoundEntries;
}

const std::unordered_map<const void *, DeviceGlobalMapEntry *>
Expand All @@ -143,6 +159,12 @@ class DeviceGlobalMap {
}

private:
// Indicates whether the owner will explicitly cleanup the entries. If false
// the dtor of DeviceGlobalMap will cleanup the enties.
// Note: This lets the global device global map avoid overhead at shutdown and
// instead let the contexts own the associated entries.
bool MOwnerControlledCleanup = true;

// Maps between device_global identifiers and associated information.
std::unordered_map<KernelNameStrT, std::unique_ptr<DeviceGlobalMapEntry>>
MDeviceGlobals;
Expand Down
100 changes: 74 additions & 26 deletions sycl/source/detail/device_global_map_entry.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -68,22 +68,33 @@ DeviceGlobalMapEntry::getOrAllocateDeviceGlobalUSM(queue_impl &QueueImpl) {
{
std::lock_guard<std::mutex> Lock(NewAlloc.MInitEventMutex);
ur_event_handle_t InitEvent;
// C++ guarantees members appear in memory in the order they are declared,
// so since the member variable that contains the initial contents of the
// device_global is right after the usm_ptr member variable we can do
// some pointer arithmetic to memcopy over this value to the usm_ptr. This
// value inside of the device_global will be zero-initialized if it was not
// given a value on construction.

MemoryManager::copy_usm(reinterpret_cast<const void *>(
reinterpret_cast<uintptr_t>(MDeviceGlobalPtr) +
sizeof(MDeviceGlobalPtr)),
QueueImpl, MDeviceGlobalTSize, NewAlloc.MPtr,
std::vector<ur_event_handle_t>{}, &InitEvent);
if (MDeviceGlobalPtr) {
// C++ guarantees members appear in memory in the order they are declared,
// so since the member variable that contains the initial contents of the
// device_global is right after the usm_ptr member variable we can do
// some pointer arithmetic to memcopy over this value to the usm_ptr. This
// value inside of the device_global will be zero-initialized if it was
// not given a value on construction.
MemoryManager::copy_usm(
reinterpret_cast<const void *>(
reinterpret_cast<uintptr_t>(MDeviceGlobalPtr) +
sizeof(MDeviceGlobalPtr)),
QueueImpl, MDeviceGlobalTSize, NewAlloc.MPtr,
std::vector<ur_event_handle_t>{}, &InitEvent);
} else {
// For SYCLBIN device globals we do not have a host pointer to copy from,
// so instead we fill the USM memory with 0's.
MemoryManager::fill_usm(NewAlloc.MPtr, QueueImpl, MDeviceGlobalTSize,
{static_cast<unsigned char>(0)}, {}, &InitEvent);
}
NewAlloc.MInitEvent = InitEvent;
}

CtxImpl.addAssociatedDeviceGlobal(MDeviceGlobalPtr);
// Only device globals with host variables need to be registered with the
// context. The rest will be managed by their kernel bundles and cleaned up
// accordingly.
if (MDeviceGlobalPtr)
CtxImpl.addAssociatedDeviceGlobal(MDeviceGlobalPtr);
return NewAlloc;
}

Expand Down Expand Up @@ -111,19 +122,32 @@ DeviceGlobalMapEntry::getOrAllocateDeviceGlobalUSM(const context &Context) {
"USM allocation for device and context already happened.");
DeviceGlobalUSMMem &NewAlloc = NewAllocIt.first->second;

// C++ guarantees members appear in memory in the order they are declared,
// so since the member variable that contains the initial contents of the
// device_global is right after the usm_ptr member variable we can do
// some pointer arithmetic to memcopy over this value to the usm_ptr. This
// value inside of the device_global will be zero-initialized if it was not
// given a value on construction.
MemoryManager::context_copy_usm(
reinterpret_cast<const void *>(
reinterpret_cast<uintptr_t>(MDeviceGlobalPtr) +
sizeof(MDeviceGlobalPtr)),
&CtxImpl, MDeviceGlobalTSize, NewAlloc.MPtr);

CtxImpl.addAssociatedDeviceGlobal(MDeviceGlobalPtr);
if (MDeviceGlobalPtr) {
// C++ guarantees members appear in memory in the order they are declared,
// so since the member variable that contains the initial contents of the
// device_global is right after the usm_ptr member variable we can do
// some pointer arithmetic to memcopy over this value to the usm_ptr. This
// value inside of the device_global will be zero-initialized if it was not
// given a value on construction.
MemoryManager::context_copy_usm(
reinterpret_cast<const void *>(
reinterpret_cast<uintptr_t>(MDeviceGlobalPtr) +
sizeof(MDeviceGlobalPtr)),
&CtxImpl, MDeviceGlobalTSize, NewAlloc.MPtr);
} else {
// For SYCLBIN device globals we do not have a host pointer to copy from,
// so instead we fill the USM memory with 0's.
std::vector<unsigned char> ImmBuff(MDeviceGlobalTSize,
static_cast<unsigned char>(0));
MemoryManager::context_copy_usm(ImmBuff.data(), &CtxImpl,
MDeviceGlobalTSize, NewAlloc.MPtr);
}

// Only device globals with host variables need to be registered with the
// context. The rest will be managed by their kernel bundles and cleaned up
// accordingly.
if (MDeviceGlobalPtr)
CtxImpl.addAssociatedDeviceGlobal(MDeviceGlobalPtr);
return NewAlloc;
}

Expand All @@ -150,6 +174,30 @@ void DeviceGlobalMapEntry::removeAssociatedResources(
}
}

void DeviceGlobalMapEntry::cleanup() {
std::lock_guard<std::mutex> Lock{MDeviceToUSMPtrMapMutex};
assert(MDeviceGlobalPtr == nullptr &&
"Entry has host variable, so it should be associated with a context "
"and should be cleaned up by its dtor.");
for (auto &USMPtrIt : MDeviceToUSMPtrMap) {
// The context should be alive through the kernel_bundle owning these
// device_global entries.
const context_impl *CtxImpl = USMPtrIt.first.second;
DeviceGlobalUSMMem &USMMem = USMPtrIt.second;
detail::usm::freeInternal(USMMem.MPtr, CtxImpl);
if (USMMem.MInitEvent.has_value())
CtxImpl->getAdapter()->call<UrApiKind::urEventRelease>(
*USMMem.MInitEvent);
#ifndef NDEBUG
// For debugging we set the event and memory to some recognizable values
// to allow us to check that this cleanup happens before erasure.
USMMem.MPtr = nullptr;
USMMem.MInitEvent = {};
#endif
}
MDeviceToUSMPtrMap.clear();
}

} // namespace detail
} // namespace _V1
} // namespace sycl
7 changes: 7 additions & 0 deletions sycl/source/detail/device_global_map_entry.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -120,6 +120,13 @@ struct DeviceGlobalMapEntry {
// Removes resources for device_globals associated with the context.
void removeAssociatedResources(const context_impl *CtxImpl);

// Cleans up the USM memory and intialization events associated with this
// entry. This should only be called when the device global entry is not
// owned by the program manager, as otherwise it will be bound to the lifetime
// of the owner context and will be cleaned up through
// removeAssociatedResources.
void cleanup();

private:
// Map from a device and a context to the associated USM allocation for the
// device_global. This should always be empty if MIsDeviceImageScopeDecorated
Expand Down
9 changes: 2 additions & 7 deletions sycl/source/detail/device_image_impl.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -84,20 +84,15 @@ class ManagedDeviceGlobalsRegistry {
bool hasDeviceGlobalName(const std::string &Name) const noexcept {
return !MDeviceGlobalNames.empty() &&
std::find(MDeviceGlobalNames.begin(), MDeviceGlobalNames.end(),
mangleDeviceGlobalName(Name)) != MDeviceGlobalNames.end();
Name) != MDeviceGlobalNames.end();
}

DeviceGlobalMapEntry *tryGetDeviceGlobalEntry(const std::string &Name) const {
auto &PM = detail::ProgramManager::getInstance();
return PM.tryGetDeviceGlobalEntry(MPrefix + mangleDeviceGlobalName(Name));
return PM.tryGetDeviceGlobalEntry(MPrefix + Name);
}

private:
static std::string mangleDeviceGlobalName(const std::string &Name) {
// TODO: Support device globals declared in namespaces.
return "_Z" + std::to_string(Name.length()) + Name;
}

void unregisterDeviceGlobalsFromContext() {
if (MDeviceGlobalNames.empty())
return;
Expand Down
Loading
Loading