This repository was archived by the owner on Mar 21, 2024. It is now read-only.

Merge internal changes from 1.9.0
* Add barrier/pipeline functions for Hopper
* Fix pipeline stage overwrite when threads are diverged
wmaxey committed Dec 15, 2022
1 parent 37cc516 commit 538a6a5
Showing 18 changed files with 1,347 additions and 360 deletions.
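The divergence fix is exercised by the new pipeline_divergent_threads test added below. Condensed to its core, the pattern it guards against looks roughly like the following sketch (illustration only, not part of the diff; it mirrors the test's cuda::pipeline usage and assumes an sm_70+ target for __nanosleep and a block size of at most 256 threads):

#include <cooperative_groups.h>
#include <cuda/pipeline>

// Sketch: a two-stage, block-scoped pipeline in which __nanosleep() deliberately
// diverges the threads between consumer_wait() and consumer_release(). Without the
// fix, a fast thread's next producer_acquire() could overwrite a stage that slower
// threads in the same block were still reading.
__global__ void divergent_pipeline(int* out, const int* in, size_t batches) {
    auto block = cooperative_groups::this_thread_block();
    __shared__ int smem[2][256];
#pragma nv_diag_suppress static_var_with_dynamic_init
    __shared__ cuda::pipeline_shared_state<cuda::thread_scope_block, 2> state;
    auto pipe = cuda::make_pipeline(block, &state);

    for (size_t b = 0; b < batches; ++b) {
        pipe.producer_acquire();
        cuda::memcpy_async(block, smem[b % 2], in + b * block.size(),
                           sizeof(int) * block.size(), pipe);
        pipe.producer_commit();

        pipe.consumer_wait();
        out[b * block.size() + block.thread_rank()] = smem[b % 2][block.thread_rank()];
        __nanosleep(block.thread_rank() * 10); // diverge the block before releasing
        pipe.consumer_release();
    }
}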
1 change: 1 addition & 0 deletions .upstream-tests/test/cuda/memcpy_async.h
@@ -75,6 +75,7 @@ void test_select_scope()
// API entrypoints accept barriers with arbitrary completion function, and that the synchronization mechanism detects it correctly.
test_fully_specialized<T, SourceSelector, DestSelector, BarrierSelector, cuda::thread_scope_block, completion>();
test_fully_specialized<T, SourceSelector, DestSelector, BarrierSelector, cuda::thread_scope_thread>();

}

template <class T,
154 changes: 154 additions & 0 deletions .upstream-tests/test/cuda/pipeline_divergent_threads.pass.cpp
@@ -0,0 +1,154 @@
//===----------------------------------------------------------------------===//
//
// Part of libcu++, the C++ Standard Library for your entire system,
// under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

// UNSUPPORTED: pre-sm-70
// UNSUPPORTED: nvrtc

#include <cooperative_groups/memcpy_async.h>
#include <cuda/pipeline>
#include <cuda_pipeline.h>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>

constexpr int nthreads = 256;
constexpr size_t stages_count = 2; // Pipeline with two stages

// Simply copy shared memory to global out
__device__ __forceinline__ void compute(int* global_out, int const* shared_in) {
    auto block = cooperative_groups::this_thread_block();
    for (int i = 0; i < block.size(); ++i) {
        global_out[i] = shared_in[i];
    }
}

__global__ void with_staging(int* global_out, int const* global_in, size_t size, size_t batch_sz) {
    auto grid  = cooperative_groups::this_grid();
    auto block = cooperative_groups::this_thread_block();
    assert(size == batch_sz * grid.size()); // Assume input size fits batch_sz * grid_size

    // Two batches must fit in shared memory:
    constexpr int smem_size = stages_count * nthreads;
    __shared__ int shared[smem_size];
    size_t shared_offset[stages_count] = { 0, block.size() }; // Offsets to each batch

    // Allocate shared storage for a two-stage cuda::pipeline:
#pragma nv_diag_suppress static_var_with_dynamic_init
    __shared__ cuda::pipeline_shared_state<cuda::thread_scope::thread_scope_block, stages_count> shared_state;
    auto pipeline = cuda::make_pipeline(block, &shared_state);

    // Each thread processes `batch_sz` elements.
    // Compute the offset of batch `batch` of this thread block in global memory:
    auto block_batch = [&](size_t batch) -> int {
        return block.group_index().x * block.size() + grid.size() * batch;
    };

    // Initialize the first pipeline stage by submitting a `memcpy_async` to fetch a whole batch for the block:
    if (batch_sz == 0) return;
    pipeline.producer_acquire();
    cuda::memcpy_async(block, shared + shared_offset[0], global_in + block_batch(0), sizeof(int) * block.size(), pipeline);
    pipeline.producer_commit();

    // Pipelined copy/compute:
    for (size_t batch = 1; batch < batch_sz; ++batch) {
        // Stage indices for the compute and copy stages:
        size_t compute_stage_idx = (batch - 1) % 2;
        size_t copy_stage_idx    = batch % 2;

        // This change fixes an unrelated bug. The global_idx that was passed to
        // the compute stage was wrong.
        size_t global_copy_idx    = block_batch(batch);
        size_t global_compute_idx = block_batch(batch - 1);

        // Collectively acquire the pipeline head stage from all producer threads:
        pipeline.producer_acquire();

        // Submit async copies to the pipeline's head stage to be
        // computed in the next loop iteration:
        cuda::memcpy_async(block, shared + shared_offset[copy_stage_idx], global_in + global_copy_idx, sizeof(int) * block.size(), pipeline);

        // Collectively commit (advance) the pipeline's head stage:
        pipeline.producer_commit();

        // Collectively wait for the operations committed to the
        // previous `compute` stage to complete:
        pipeline.consumer_wait();

        // Computation overlapped with the memcpy_async of the "copy" stage:
        compute(global_out + global_compute_idx, shared + shared_offset[compute_stage_idx]);
        // Diverge threads in the block:
        __nanosleep(block.thread_rank() * 10);
        // Collectively release the stage resources:
        pipeline.consumer_release();
    }

    // Compute the data fetched by the last iteration:
    pipeline.consumer_wait();
    compute(global_out + block_batch(batch_sz - 1),
            shared + shared_offset[(batch_sz - 1) % 2]);
    pipeline.consumer_release();
}


int main(int argc, char **argv) {
  NV_IF_ELSE_TARGET(
    NV_IS_HOST, (
      constexpr int batch_size = 10;
      constexpr size_t size = batch_size * nthreads;
      int *in, *out;

      if (cudaMallocManaged((void **)&in, sizeof(int) * size) != cudaSuccess ||
          cudaMallocManaged((void **)&out, sizeof(int) * size) != cudaSuccess) {
        printf("Setup failed\n");
        return -1;
      }

      for (size_t i = 0; i < size; ++i) {
        in[i] = (int)i;
      }

      with_staging<<<1, nthreads>>>(out, in, size, batch_size);
      {
        auto result = cudaGetLastError();
        if (result != cudaSuccess) {
          printf("Kernel launch failed\n");
          printf("Error: %s\n", cudaGetErrorString(result));
          return -1;
        }
      }

      if (cudaDeviceSynchronize() != cudaSuccess) {
        printf("Kernel failed while running\n");
        printf("Error: %s\n", cudaGetErrorString(cudaGetLastError()));
        return -1;
      }

      const int max_errors = 10;
      int num_errors = 0;
      for (size_t i = 0; i < size; ++i) {
        if (out[i] != (int)i) {
          printf("out[%d] did not match expected value %d. Got %d\r\n", (int)i, (int)i, out[i]);
          num_errors++;
        }
        if (num_errors == max_errors) {
          break;
        }
      }

      if (num_errors == 0) {
        printf("No errors.\r\n");
      }
      else {
        return -1;
      }

      return 0;
    ),
    return 0;
  )
}
109 changes: 109 additions & 0 deletions .upstream-tests/test/heterogeneous/barrier_parity.pass.cpp
@@ -0,0 +1,109 @@
//===----------------------------------------------------------------------===//
//
// Part of the libcu++ Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

// UNSUPPORTED: nvrtc, pre-sm-70

// uncomment for a really verbose output detailing what test steps are being launched
// #define DEBUG_TESTERS

#include "helpers.h"

#include <atomic>
#include <cuda/barrier>

template<typename Barrier>
struct barrier_and_token
{
    using barrier_t = Barrier;
    using token_t   = typename barrier_t::arrival_token;

    barrier_t barrier;
    cuda::std::atomic<bool> parity_waiting{false};

    template<typename ...Args>
    __host__ __device__
    barrier_and_token(Args && ...args) : barrier{ cuda::std::forward<Args>(args)... }
    {
    }
};

struct barrier_arrive_and_wait
{
    using async = cuda::std::true_type;

    template<typename Data>
    __host__ __device__
    static void perform(Data & data)
    {
        while (data.parity_waiting.load(cuda::std::memory_order_acquire) == false)
        {
            data.parity_waiting.wait(false);
        }
        data.barrier.arrive_and_wait();
    }
};

template <bool Phase>
struct barrier_parity_wait
{
    using async = cuda::std::true_type;

    template<typename Data>
    __host__ __device__
    static void perform(Data & data)
    {
        data.parity_waiting.store(true, cuda::std::memory_order_release);
        data.parity_waiting.notify_all();
        data.barrier.wait_parity(Phase);
    }
};

struct clear_token
{
    template<typename Data>
    __host__ __device__
    static void perform(Data & data)
    {
        data.parity_waiting.store(false, cuda::std::memory_order_release);
    }
};

using aw_aw_pw = performer_list<
    barrier_parity_wait<false>,
    barrier_arrive_and_wait,
    barrier_arrive_and_wait,
    async_tester_fence,
    clear_token,
    barrier_parity_wait<true>,
    barrier_arrive_and_wait,
    barrier_arrive_and_wait,
    async_tester_fence,
    clear_token
>;

void kernel_invoker()
{
    validate_not_movable<
        barrier_and_token<cuda::std::barrier<>>,
        aw_aw_pw
    >(2);
    validate_not_movable<
        barrier_and_token<cuda::barrier<cuda::thread_scope_system>>,
        aw_aw_pw
    >(2);
}

int main(int arg, char ** argv)
{
#ifndef __CUDA_ARCH__
    kernel_invoker();
#endif

    return 0;
}
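In the parity test just added, each phase pairs one parity waiter against two arriving agents on a barrier with an expected count of two. A reduced, host-only sketch of that handshake (illustration only; it assumes the wait_parity(bool) member that barrier_parity.pass.cpp calls and a standard C++ thread runtime):

#include <cuda/std/barrier>
#include <thread>
#include <cstdio>

int main() {
    cuda::std::barrier<> b(2); // expected count 2; the first phase has parity false

    // Waiter: returns once the phase whose parity is `false` completes.
    std::thread waiter([&] {
        b.wait_parity(false);  // member exercised by the new test above
        std::printf("first phase completed\n");
    });

    // Two arrivers complete the first phase together, flipping the parity to `true`.
    std::thread a1([&] { b.arrive_and_wait(); });
    std::thread a2([&] { b.arrive_and_wait(); });

    a1.join(); a2.join(); waiter.join();
    return 0;
}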

@@ -9,6 +9,9 @@
// UNSUPPORTED: libcpp-has-no-threads
// UNSUPPORTED: pre-sm-70

// Compiler bug: unable to convert __nv_hdl lambdas
// XFAIL: msvc-19.33

// <cuda/std/barrier>

#include <cuda/std/barrier>
@@ -0,0 +1,75 @@
//===----------------------------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// UNSUPPORTED: libcpp-has-no-threads
// UNSUPPORTED: pre-sm-80

// <cuda/std/barrier>

#include <cuda/std/barrier>

#include "test_macros.h"
#include "concurrent_agents.h"

#include "cuda_space_selector.h"

template<typename Barrier,
    template<typename, typename> typename Selector,
    typename Initializer = constructor_initializer>
__host__ __device__
void test(bool add_delay = false)
{
    Selector<Barrier, Initializer> sel;
    SHARED Barrier * b;
    b = sel.construct(2);
    auto delay = cuda::std::chrono::duration<int>(0);

    if (add_delay)
        delay = cuda::std::chrono::duration<int>(1);

#ifdef __CUDA_ARCH__
    auto * tok = threadIdx.x == 0 ? new auto(b->arrive()) : nullptr;
#else
    auto * tok = new auto(b->arrive());
#endif

    auto awaiter = LAMBDA (){
        while (b->try_wait_for(cuda::std::move(*tok), delay) == false) {}
    };
    auto arriver = LAMBDA (){
        (void)b->arrive();
    };
    concurrent_agents_launch(awaiter, arriver);

#ifdef __CUDA_ARCH__
    if (threadIdx.x == 0) {
#endif
        auto tok2 = b->arrive(2);
        while (b->try_wait_for(cuda::std::move(tok2), delay) == false) {}
#ifdef __CUDA_ARCH__
    }
    __syncthreads();
#endif
}

int main(int, char**)
{
#ifndef __CUDA_ARCH__
    // Required by concurrent_agents_launch to know how many we're launching
    cuda_thread_count = 2;

    test<cuda::barrier<cuda::thread_scope_block>, local_memory_selector>();
    test<cuda::barrier<cuda::thread_scope_block>, local_memory_selector>(true);
#else
    test<cuda::barrier<cuda::thread_scope_block>, shared_memory_selector>();
    test<cuda::barrier<cuda::thread_scope_block>, global_memory_selector>();
    test<cuda::barrier<cuda::thread_scope_block>, shared_memory_selector>(true);
    test<cuda::barrier<cuda::thread_scope_block>, global_memory_selector>(true);
#endif
    return 0;
}
