This repository was archived by the owner on Mar 21, 2024. It is now read-only.

Merge internal changes from 1.9.0
* Add barrier/pipeline functions for Hopper
* Fix pipeline stage overwrite when threads are diverged
wmaxey committed Dec 15, 2022
1 parent 37cc516 commit 538a6a5
Showing 18 changed files with 1,347 additions and 360 deletions.
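The divergence fix is exercised by the new pipeline_divergent_threads test added below. Condensed to its core, the pattern it guards against looks roughly like the following sketch (illustration only, not part of the diff; it mirrors the test's cuda::pipeline usage and assumes an sm_70+ target for __nanosleep and a block size of at most 256 threads):

#include <cooperative_groups.h>
#include <cuda/pipeline>

// Sketch: a two-stage, block-scoped pipeline in which __nanosleep() deliberately
// diverges the threads between consumer_wait() and consumer_release(). Without the
// fix, a fast thread's next producer_acquire() could overwrite a stage that slower
// threads in the same block were still reading.
__global__ void divergent_pipeline(int* out, const int* in, size_t batches) {
    auto block = cooperative_groups::this_thread_block();
    __shared__ int smem[2][256];
#pragma nv_diag_suppress static_var_with_dynamic_init
    __shared__ cuda::pipeline_shared_state<cuda::thread_scope_block, 2> state;
    auto pipe = cuda::make_pipeline(block, &state);

    for (size_t b = 0; b < batches; ++b) {
        pipe.producer_acquire();
        cuda::memcpy_async(block, smem[b % 2], in + b * block.size(),
                           sizeof(int) * block.size(), pipe);
        pipe.producer_commit();

        pipe.consumer_wait();
        out[b * block.size() + block.thread_rank()] = smem[b % 2][block.thread_rank()];
        __nanosleep(block.thread_rank() * 10); // diverge the block before releasing
        pipe.consumer_release();
    }
}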
1 change: 1 addition & 0 deletions .upstream-tests/test/cuda/memcpy_async.h
@@ -75,6 +75,7 @@ void test_select_scope()
// API entrypoints accept barriers with arbitrary completion function, and that the synchronization mechanism detects it correctly.
test_fully_specialized<T, SourceSelector, DestSelector, BarrierSelector, cuda::thread_scope_block, completion>();
test_fully_specialized<T, SourceSelector, DestSelector, BarrierSelector, cuda::thread_scope_thread>();

}

template <class T,
154 changes: 154 additions & 0 deletions .upstream-tests/test/cuda/pipeline_divergent_threads.pass.cpp
@@ -0,0 +1,154 @@
//===----------------------------------------------------------------------===//
//
// Part of libcu++, the C++ Standard Library for your entire system,
// under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

// UNSUPPORTED: pre-sm-70
// UNSUPPORTED: nvrtc

#include <cooperative_groups/memcpy_async.h>
#include <cuda/pipeline>
#include <cuda_pipeline.h>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>

constexpr int nthreads = 256;
constexpr size_t stages_count = 2; // Pipeline with two stages

// Simply copy shared memory to global out
__device__ __forceinline__ void compute(int* global_out, int const* shared_in) {
    auto block = cooperative_groups::this_thread_block();
    for (int i = 0; i < block.size(); ++i) {
        global_out[i] = shared_in[i];
    }
}

__global__ void with_staging(int* global_out, int const* global_in, size_t size, size_t batch_sz) {
    auto grid  = cooperative_groups::this_grid();
    auto block = cooperative_groups::this_thread_block();
    assert(size == batch_sz * grid.size()); // Assume input size fits batch_sz * grid_size

    // Two batches must fit in shared memory:
    constexpr int smem_size = stages_count * nthreads;
    __shared__ int shared[smem_size];
    size_t shared_offset[stages_count] = { 0, block.size() }; // Offsets to each batch

    // Allocate shared storage for a two-stage cuda::pipeline:
#pragma nv_diag_suppress static_var_with_dynamic_init
    __shared__ cuda::pipeline_shared_state<cuda::thread_scope::thread_scope_block, stages_count> shared_state;
    auto pipeline = cuda::make_pipeline(block, &shared_state);

    // Each thread processes `batch_sz` elements.
    // Compute the offset of batch `batch` of this thread block in global memory:
    auto block_batch = [&](size_t batch) -> int {
        return block.group_index().x * block.size() + grid.size() * batch;
    };

    // Initialize the first pipeline stage by submitting a `memcpy_async` to fetch a whole batch for the block:
    if (batch_sz == 0) return;
    pipeline.producer_acquire();
    cuda::memcpy_async(block, shared + shared_offset[0], global_in + block_batch(0), sizeof(int) * block.size(), pipeline);
    pipeline.producer_commit();

    // Pipelined copy/compute:
    for (size_t batch = 1; batch < batch_sz; ++batch) {
        // Stage indices for the compute and copy stages:
        size_t compute_stage_idx = (batch - 1) % 2;
        size_t copy_stage_idx    = batch % 2;

        // This change fixes an unrelated bug. The global_idx that was passed to
        // the compute stage was wrong.
        size_t global_copy_idx    = block_batch(batch);
        size_t global_compute_idx = block_batch(batch - 1);

        // Collectively acquire the pipeline head stage from all producer threads:
        pipeline.producer_acquire();

        // Submit async copies to the pipeline's head stage to be
        // computed in the next loop iteration:
        cuda::memcpy_async(block, shared + shared_offset[copy_stage_idx], global_in + global_copy_idx, sizeof(int) * block.size(), pipeline);

        // Collectively commit (advance) the pipeline's head stage:
        pipeline.producer_commit();

        // Collectively wait for the operations committed to the
        // previous `compute` stage to complete:
        pipeline.consumer_wait();

        // Computation overlapped with the memcpy_async of the "copy" stage:
        compute(global_out + global_compute_idx, shared + shared_offset[compute_stage_idx]);
        // Diverge threads in the block:
        __nanosleep(block.thread_rank() * 10);
        // Collectively release the stage resources:
        pipeline.consumer_release();
    }

    // Compute the data fetched by the last iteration:
    pipeline.consumer_wait();
    compute(global_out + block_batch(batch_sz - 1),
            shared + shared_offset[(batch_sz - 1) % 2]);
    pipeline.consumer_release();
}


int main(int argc, char **argv) {
  NV_IF_ELSE_TARGET(
    NV_IS_HOST, (
      constexpr int batch_size = 10;
      constexpr size_t size = batch_size * nthreads;
      int *in, *out;

      if (cudaMallocManaged((void **)&in, sizeof(int) * size) != cudaSuccess ||
          cudaMallocManaged((void **)&out, sizeof(int) * size) != cudaSuccess) {
        printf("Setup failed\n");
        return -1;
      }

      for (size_t i = 0; i < size; ++i) {
        in[i] = (int)i;
      }

      with_staging<<<1, nthreads>>>(out, in, size, batch_size);
      {
        auto result = cudaGetLastError();
        if (result != cudaSuccess) {
          printf("Kernel launch failed\n");
          printf("Error: %s\n", cudaGetErrorString(result));
          return -1;
        }
      }

      if (cudaDeviceSynchronize() != cudaSuccess) {
        printf("Kernel failed while running\n");
        printf("Error: %s\n", cudaGetErrorString(cudaGetLastError()));
        return -1;
      }

      const int max_errors = 10;
      int num_errors = 0;
      for (size_t i = 0; i < size; ++i) {
        if (out[i] != (int)i) {
          printf("out[%d] did not match expected value %d. Got %d\r\n", (int)i, (int)i, out[i]);
          num_errors++;
        }
        if (num_errors == max_errors) {
          break;
        }
      }

      if (num_errors == 0) {
        printf("No errors.\r\n");
      }
      else {
        return -1;
      }

      return 0;
    ),
    return 0;
  )
}
109 changes: 109 additions & 0 deletions .upstream-tests/test/heterogeneous/barrier_parity.pass.cpp
@@ -0,0 +1,109 @@
//===----------------------------------------------------------------------===//
//
// Part of the libcu++ Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

// UNSUPPORTED: nvrtc, pre-sm-70

// uncomment for a really verbose output detailing what test steps are being launched
// #define DEBUG_TESTERS

#include "helpers.h"

#include <atomic>
#include <cuda/barrier>

template<typename Barrier>
struct barrier_and_token
{
    using barrier_t = Barrier;
    using token_t   = typename barrier_t::arrival_token;

    barrier_t barrier;
    cuda::std::atomic<bool> parity_waiting{false};

    template<typename ...Args>
    __host__ __device__
    barrier_and_token(Args && ...args) : barrier{ cuda::std::forward<Args>(args)... }
    {
    }
};

struct barrier_arrive_and_wait
{
    using async = cuda::std::true_type;

    template<typename Data>
    __host__ __device__
    static void perform(Data & data)
    {
        while (data.parity_waiting.load(cuda::std::memory_order_acquire) == false)
        {
            data.parity_waiting.wait(false);
        }
        data.barrier.arrive_and_wait();
    }
};

template <bool Phase>
struct barrier_parity_wait
{
    using async = cuda::std::true_type;

    template<typename Data>
    __host__ __device__
    static void perform(Data & data)
    {
        data.parity_waiting.store(true, cuda::std::memory_order_release);
        data.parity_waiting.notify_all();
        data.barrier.wait_parity(Phase);
    }
};

struct clear_token
{
    template<typename Data>
    __host__ __device__
    static void perform(Data & data)
    {
        data.parity_waiting.store(false, cuda::std::memory_order_release);
    }
};

using aw_aw_pw = performer_list<
    barrier_parity_wait<false>,
    barrier_arrive_and_wait,
    barrier_arrive_and_wait,
    async_tester_fence,
    clear_token,
    barrier_parity_wait<true>,
    barrier_arrive_and_wait,
    barrier_arrive_and_wait,
    async_tester_fence,
    clear_token
>;

void kernel_invoker()
{
    validate_not_movable<
        barrier_and_token<cuda::std::barrier<>>,
        aw_aw_pw
    >(2);
    validate_not_movable<
        barrier_and_token<cuda::barrier<cuda::thread_scope_system>>,
        aw_aw_pw
    >(2);
}

int main(int arg, char ** argv)
{
#ifndef __CUDA_ARCH__
    kernel_invoker();
#endif

    return 0;
}
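In the parity test just added, each phase pairs one parity waiter against two arriving agents on a barrier with an expected count of two. A reduced, host-only sketch of that handshake (illustration only; it assumes the wait_parity(bool) member that barrier_parity.pass.cpp calls and a standard C++ thread runtime):

#include <cuda/std/barrier>
#include <thread>
#include <cstdio>

int main() {
    cuda::std::barrier<> b(2); // expected count 2; the first phase has parity false

    // Waiter: returns once the phase whose parity is `false` completes.
    std::thread waiter([&] {
        b.wait_parity(false);  // member exercised by the new test above
        std::printf("first phase completed\n");
    });

    // Two arrivers complete the first phase together, flipping the parity to `true`.
    std::thread a1([&] { b.arrive_and_wait(); });
    std::thread a2([&] { b.arrive_and_wait(); });

    a1.join(); a2.join(); waiter.join();
    return 0;
}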

@@ -9,6 +9,9 @@
// UNSUPPORTED: libcpp-has-no-threads
// UNSUPPORTED: pre-sm-70

// Compiler bug: unable to convert __nv_hdl lambdas
// XFAIL: msvc-19.33

// <cuda/std/barrier>

#include <cuda/std/barrier>
@@ -0,0 +1,75 @@
//===----------------------------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// UNSUPPORTED: libcpp-has-no-threads
// UNSUPPORTED: pre-sm-80

// <cuda/std/barrier>

#include <cuda/std/barrier>

#include "test_macros.h"
#include "concurrent_agents.h"

#include "cuda_space_selector.h"

template<typename Barrier,
    template<typename, typename> typename Selector,
    typename Initializer = constructor_initializer>
__host__ __device__
void test(bool add_delay = false)
{
    Selector<Barrier, Initializer> sel;
    SHARED Barrier * b;
    b = sel.construct(2);
    auto delay = cuda::std::chrono::duration<int>(0);

    if (add_delay)
        delay = cuda::std::chrono::duration<int>(1);

#ifdef __CUDA_ARCH__
    auto * tok = threadIdx.x == 0 ? new auto(b->arrive()) : nullptr;
#else
    auto * tok = new auto(b->arrive());
#endif

    auto awaiter = LAMBDA (){
        while (b->try_wait_for(cuda::std::move(*tok), delay) == false) {}
    };
    auto arriver = LAMBDA (){
        (void)b->arrive();
    };
    concurrent_agents_launch(awaiter, arriver);

#ifdef __CUDA_ARCH__
    if (threadIdx.x == 0) {
#endif
        auto tok2 = b->arrive(2);
        while (b->try_wait_for(cuda::std::move(tok2), delay) == false) {}
#ifdef __CUDA_ARCH__
    }
    __syncthreads();
#endif
}

int main(int, char**)
{
#ifndef __CUDA_ARCH__
    // Required by concurrent_agents_launch to know how many we're launching
    cuda_thread_count = 2;

    test<cuda::barrier<cuda::thread_scope_block>, local_memory_selector>();
    test<cuda::barrier<cuda::thread_scope_block>, local_memory_selector>(true);
#else
    test<cuda::barrier<cuda::thread_scope_block>, shared_memory_selector>();
    test<cuda::barrier<cuda::thread_scope_block>, global_memory_selector>();
    test<cuda::barrier<cuda::thread_scope_block>, shared_memory_selector>(true);
    test<cuda::barrier<cuda::thread_scope_block>, global_memory_selector>(true);
#endif
    return 0;
}
