This repository was archived by the owner on Mar 21, 2024. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 186
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
* Add barrier/pipeline functions for Hopper * Fix pipeline stage overwrite when threads are diverged
- Loading branch information
Showing
18 changed files
with
1,347 additions
and
360 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
154 changes: 154 additions & 0 deletions
154
.upstream-tests/test/cuda/pipeline_divergent_threads.pass.cpp
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,154 @@ | ||
//===----------------------------------------------------------------------===// | ||
// | ||
// Part of libcu++, the C++ Standard Library for your entire system, | ||
// under the Apache License v2.0 with LLVM Exceptions. | ||
// See https://llvm.org/LICENSE.txt for license information. | ||
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception | ||
// | ||
//===----------------------------------------------------------------------===// | ||
|
||
// UNSUPPORTED: pre-sm-70 | ||
// UNSUPPORTED: nvrtc | ||
|
||
#include <cooperative_groups/memcpy_async.h> | ||
#include <cuda/pipeline> | ||
#include <cuda_pipeline.h> | ||
#include <math.h> | ||
#include <stdio.h> | ||
#include <stdlib.h> | ||
|
||
constexpr int nthreads = 256; | ||
constexpr size_t stages_count = 2; // Pipeline with two stages | ||
|
||
// Simply copy shared memory to global out | ||
__device__ __forceinline__ void compute(int* global_out, int const* shared_in){ | ||
auto block = cooperative_groups::this_thread_block(); | ||
for (int i = 0; i < block.size(); ++i) { | ||
global_out[i] = shared_in[i]; | ||
} | ||
} | ||
|
||
__global__ void with_staging(int* global_out, int const* global_in, size_t size, size_t batch_sz) { | ||
auto grid = cooperative_groups::this_grid(); | ||
auto block = cooperative_groups::this_thread_block(); | ||
assert(size == batch_sz * grid.size()); // Assume input size fits batch_sz * grid_size | ||
|
||
// Two batches must fit in shared memory: | ||
constexpr int smem_size = stages_count * nthreads; | ||
__shared__ int shared[smem_size]; | ||
size_t shared_offset[stages_count] = { 0, block.size() }; // Offsets to each batch | ||
|
||
// Allocate shared storage for a two-stage cuda::pipeline: | ||
#pragma nv_diag_suppress static_var_with_dynamic_init | ||
__shared__ cuda::pipeline_shared_state<cuda::thread_scope::thread_scope_block, stages_count> shared_state; | ||
auto pipeline = cuda::make_pipeline(block, &shared_state); | ||
|
||
// Each thread processes `batch_sz` elements. | ||
// Compute offset of the batch `batch` of this thread block in global memory: | ||
auto block_batch = [&](size_t batch) -> int { | ||
return block.group_index().x * block.size() + grid.size() * batch; | ||
}; | ||
|
||
// Initialize first pipeline stage by submitting a `memcpy_async` to fetch a whole batch for the block: | ||
if (batch_sz == 0) return; | ||
pipeline.producer_acquire(); | ||
cuda::memcpy_async(block, shared + shared_offset[0], global_in + block_batch(0), sizeof(int) * block.size(), pipeline); | ||
pipeline.producer_commit(); | ||
// Pipelined copy/compute: | ||
for (size_t batch = 1; batch < batch_sz; ++batch) { | ||
// Stage indices for the compute and copy stages: | ||
size_t compute_stage_idx = (batch - 1) % 2; | ||
size_t copy_stage_idx = batch % 2; | ||
|
||
// This change fixes an unrelated bug. The global_idx that was passed to | ||
// the compute stage was wrong. | ||
size_t global_copy_idx = block_batch(batch); | ||
size_t global_compute_idx = block_batch(batch - 1); | ||
|
||
// Collectively acquire the pipeline head stage from all producer threads: | ||
pipeline.producer_acquire(); | ||
|
||
// Submit async copies to the pipeline's head stage to be | ||
// computed in the next loop iteration | ||
cuda::memcpy_async(block, shared + shared_offset[copy_stage_idx], global_in + global_copy_idx, sizeof(int) * block.size(), pipeline); | ||
|
||
// Collectively commit (advance) the pipeline's head stage | ||
pipeline.producer_commit(); | ||
|
||
// Collectively wait for the operations commited to the | ||
// previous `compute` stage to complete: | ||
pipeline.consumer_wait(); | ||
|
||
// Computation overlapped with the memcpy_async of the "copy" stage: | ||
compute(global_out + global_compute_idx, shared + shared_offset[compute_stage_idx]); | ||
// Diverge threads in block | ||
__nanosleep(block.thread_rank() * 10); | ||
// Collectively release the stage resources | ||
pipeline.consumer_release(); | ||
} | ||
|
||
// Compute the data fetch by the last iteration | ||
pipeline.consumer_wait(); | ||
compute(global_out + block_batch(batch_sz-1), | ||
shared + shared_offset[(batch_sz - 1) % 2]); | ||
pipeline.consumer_release(); | ||
} | ||
|
||
|
||
int main(int argc, char **argv) { | ||
NV_IF_ELSE_TARGET( | ||
NV_IS_HOST, ( | ||
constexpr int batch_size = 10; | ||
constexpr size_t size = batch_size * nthreads; | ||
int *in, *out; | ||
|
||
if (cudaMallocManaged((void **)&in, sizeof(int) * size) != cudaSuccess || | ||
cudaMallocManaged((void **)&out, sizeof(int) * size) != cudaSuccess) { | ||
printf("Setup failed\n"); | ||
return -1; | ||
} | ||
|
||
for (size_t i = 0; i < size; ++i) { | ||
in[i] = (int)i; | ||
} | ||
|
||
with_staging<<<1, nthreads>>>(out, in, size, batch_size); | ||
{ | ||
auto result = cudaGetLastError(); | ||
if (result != cudaSuccess) { | ||
printf("Kernel launch failed\n"); | ||
printf("Error: %s\n", cudaGetErrorString(result)); | ||
return -1; | ||
} | ||
} | ||
|
||
if (cudaDeviceSynchronize() != cudaSuccess) { | ||
printf("Kernel failed while running\n"); | ||
printf("Error: %s\n", cudaGetErrorString(cudaGetLastError())); | ||
return -1; | ||
} | ||
|
||
const int max_errors = 10; | ||
int num_errors = 0; | ||
for (size_t i = 0; i < size; ++i) { | ||
if (out[i] != (int)i) { | ||
printf("out[%d] did not match expected value %d. Got %d\r\n", (int)i, (int)i, out[i]); | ||
num_errors++; | ||
} | ||
if (num_errors == max_errors) { | ||
break; | ||
} | ||
} | ||
|
||
if (num_errors == 0) { | ||
printf("No errors.\r\n"); | ||
} | ||
else { | ||
return -1; | ||
} | ||
|
||
return 0; | ||
), | ||
return 0; | ||
) | ||
} |
109 changes: 109 additions & 0 deletions
109
.upstream-tests/test/heterogeneous/barrier_parity.pass.cpp
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,109 @@ | ||
//===----------------------------------------------------------------------===// | ||
// | ||
// Part of the libcu++ Project, under the Apache License v2.0 with LLVM Exceptions. | ||
// See https://llvm.org/LICENSE.txt for license information. | ||
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception | ||
// | ||
//===----------------------------------------------------------------------===// | ||
|
||
// UNSUPPORTED: nvrtc, pre-sm-70 | ||
|
||
// uncomment for a really verbose output detailing what test steps are being launched | ||
// #define DEBUG_TESTERS | ||
|
||
#include "helpers.h" | ||
|
||
#include <atomic> | ||
#include <cuda/barrier> | ||
|
||
template<typename Barrier> | ||
struct barrier_and_token | ||
{ | ||
using barrier_t = Barrier; | ||
using token_t = typename barrier_t::arrival_token; | ||
|
||
barrier_t barrier; | ||
cuda::std::atomic<bool> parity_waiting{false}; | ||
|
||
template<typename ...Args> | ||
__host__ __device__ | ||
barrier_and_token(Args && ...args) : barrier{ cuda::std::forward<Args>(args)... } | ||
{ | ||
} | ||
}; | ||
|
||
struct barrier_arrive_and_wait | ||
{ | ||
using async = cuda::std::true_type; | ||
|
||
template<typename Data> | ||
__host__ __device__ | ||
static void perform(Data & data) | ||
{ | ||
while (data.parity_waiting.load(cuda::std::memory_order_acquire) == false) | ||
{ | ||
data.parity_waiting.wait(false); | ||
} | ||
data.barrier.arrive_and_wait(); | ||
} | ||
}; | ||
|
||
template <bool Phase> | ||
struct barrier_parity_wait | ||
{ | ||
using async = cuda::std::true_type; | ||
|
||
template<typename Data> | ||
__host__ __device__ | ||
static void perform(Data & data) | ||
{ | ||
data.parity_waiting.store(true, cuda::std::memory_order_release); | ||
data.parity_waiting.notify_all(); | ||
data.barrier.wait_parity(Phase); | ||
} | ||
}; | ||
|
||
struct clear_token | ||
{ | ||
template<typename Data> | ||
__host__ __device__ | ||
static void perform(Data & data) | ||
{ | ||
data.parity_waiting.store(false, cuda::std::memory_order_release); | ||
} | ||
}; | ||
|
||
using aw_aw_pw = performer_list< | ||
barrier_parity_wait<false>, | ||
barrier_arrive_and_wait, | ||
barrier_arrive_and_wait, | ||
async_tester_fence, | ||
clear_token, | ||
barrier_parity_wait<true>, | ||
barrier_arrive_and_wait, | ||
barrier_arrive_and_wait, | ||
async_tester_fence, | ||
clear_token | ||
>; | ||
|
||
void kernel_invoker() | ||
{ | ||
validate_not_movable< | ||
barrier_and_token<cuda::std::barrier<>>, | ||
aw_aw_pw | ||
>(2); | ||
validate_not_movable< | ||
barrier_and_token<cuda::barrier<cuda::thread_scope_system>>, | ||
aw_aw_pw | ||
>(2); | ||
} | ||
|
||
int main(int arg, char ** argv) | ||
{ | ||
#ifndef __CUDA_ARCH__ | ||
kernel_invoker(); | ||
#endif | ||
|
||
return 0; | ||
} | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
75 changes: 75 additions & 0 deletions
75
.upstream-tests/test/std/thread/thread.barrier/try_wait_for.pass.cpp
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,75 @@ | ||
//===----------------------------------------------------------------------===// | ||
// | ||
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. | ||
// See https://llvm.org/LICENSE.txt for license information. | ||
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception | ||
// | ||
//===----------------------------------------------------------------------===// | ||
// | ||
// UNSUPPORTED: libcpp-has-no-threads | ||
// UNSUPPORTED: pre-sm-80 | ||
|
||
// <cuda/std/barrier> | ||
|
||
#include <cuda/std/barrier> | ||
|
||
#include "test_macros.h" | ||
#include "concurrent_agents.h" | ||
|
||
#include "cuda_space_selector.h" | ||
|
||
template<typename Barrier, | ||
template<typename, typename> typename Selector, | ||
typename Initializer = constructor_initializer> | ||
__host__ __device__ | ||
void test(bool add_delay = false) | ||
{ | ||
Selector<Barrier, Initializer> sel; | ||
SHARED Barrier * b; | ||
b = sel.construct(2); | ||
auto delay = cuda::std::chrono::duration<int>(0); | ||
|
||
if (add_delay) | ||
delay = cuda::std::chrono::duration<int>(1); | ||
|
||
#ifdef __CUDA_ARCH__ | ||
auto * tok = threadIdx.x == 0 ? new auto(b->arrive()) : nullptr; | ||
#else | ||
auto * tok = new auto(b->arrive()); | ||
#endif | ||
|
||
auto awaiter = LAMBDA (){ | ||
while(b->try_wait_for(cuda::std::move(*tok), delay) == false) {} | ||
}; | ||
auto arriver = LAMBDA (){ | ||
(void)b->arrive(); | ||
}; | ||
concurrent_agents_launch(awaiter, arriver); | ||
|
||
#ifdef __CUDA_ARCH__ | ||
if (threadIdx.x == 0) { | ||
#endif | ||
auto tok2 = b->arrive(2); | ||
while(b->try_wait_for(cuda::std::move(tok2), delay) == false) {} | ||
#ifdef __CUDA_ARCH__ | ||
} | ||
__syncthreads(); | ||
#endif | ||
} | ||
|
||
int main(int, char**) | ||
{ | ||
#ifndef __CUDA_ARCH__ | ||
//Required by concurrent_agents_launch to know how many we're launching | ||
cuda_thread_count = 2; | ||
|
||
test<cuda::barrier<cuda::thread_scope_block>, local_memory_selector>(); | ||
test<cuda::barrier<cuda::thread_scope_block>, local_memory_selector>(true); | ||
#else | ||
test<cuda::barrier<cuda::thread_scope_block>, shared_memory_selector>(); | ||
test<cuda::barrier<cuda::thread_scope_block>, global_memory_selector>(); | ||
test<cuda::barrier<cuda::thread_scope_block>, shared_memory_selector>(true); | ||
test<cuda::barrier<cuda::thread_scope_block>, global_memory_selector>(true); | ||
#endif | ||
return 0; | ||
} |
Oops, something went wrong.