Skip to content

Commit

Permalink
Merge improved memory ordering of sync-free kernels
Browse files Browse the repository at this point in the history
This uses proper relaxed/acquire-release operations for sync-free kernels in CUDA instead of `volatile`.

Related PR: #1344
  • Loading branch information
upsj authored Oct 10, 2023
2 parents 8368485 + 5acabea commit 6f65404
Show file tree
Hide file tree
Showing 9 changed files with 1,150 additions and 99 deletions.
File renamed without changes.
33 changes: 14 additions & 19 deletions common/cuda_hip/components/syncfree.hpp.inc
Original file line number Diff line number Diff line change
Expand Up @@ -93,18 +93,18 @@ public:
const auto dep_block = dependency / (block_size / subwarp_size);
const auto dep_local = dependency % (block_size / subwarp_size);
// assert(dependency < work_id);
if (dep_block == block_id) {
// wait for a local dependency
while (!load(local.status, dep_local)) {
__threadfence();
}
} else {
// wait for a global dependency
while (!load(global.status, dependency)) {
__threadfence();
if (get_lane() == 0) {
if (dep_block == block_id) {
// wait for a local dependency
while (!load_acquire_shared(local.status + dep_local)) {
}
} else {
// wait for a global dependency
while (!load_acquire(global.status + dependency)) {
}
}
}
__threadfence();
group::tiled_partition<subwarp_size>(group::this_thread_block()).sync();
}

__device__ __forceinline__ bool peek(IndexType dependency)
Expand All @@ -114,27 +114,22 @@ public:
// assert(dependency < work_id);
if (dep_block == block_id) {
// peek at a local dependency
auto finished = load(local.status, dep_local) != 0;
__threadfence();
return finished;
return load_acquire_shared(local.status + dep_local);
} else {
// peek at a global dependency
auto finished = load(global.status, dependency);
__threadfence();
return finished;
return load_acquire(global.status + dependency);
}
}

__device__ __forceinline__ void mark_ready()
{
group::tiled_partition<subwarp_size>(group::this_thread_block()).sync();
__threadfence();
if (get_lane() == 0) {
const auto sh_id = get_work_id() % (block_size / subwarp_size);
// notify local warps
store(local.status, sh_id, 1);
store_release_shared(local.status + sh_id, 1);
// notify other blocks
store(global.status, get_work_id(), 1);
store_release(global.status + get_work_id(), 1);
}
}

Expand Down
Loading

0 comments on commit 6f65404

Please sign in to comment.