build: check-in breeze test fixtures for cuda

facebookincubator · Nov 25, 2024 · 827f116 · 827f116
1 parent 011d3d5
commit 827f116
Show file tree

Hide file tree

Showing 5 changed files with 654 additions and 0 deletions.
diff --git a/velox/experimental/breeze/test/generate.sh b/velox/experimental/breeze/test/generate.sh
@@ -30,3 +30,5 @@ function generate {
 
 generate openmp "algorithm" h
 generate openmp "function" h
+generate cuda "algorithm" cuh
+generate cuda "function" cuh
diff --git a/velox/experimental/breeze/test/generated/algorithms/algorithm_test-cuda.cuh b/velox/experimental/breeze/test/generated/algorithms/algorithm_test-cuda.cuh
@@ -0,0 +1,88 @@
+/*
+ * Copyright (c) Facebook, Inc. and its affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * Copyright (c) 2024 by Rivos Inc.
+ * Licensed under the Apache License, Version 2.0, see LICENSE for details.
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+/*
+ * This file is auto-generated from test_fixture_generator.py
+ * DO NOT EDIT!
+ */
+
+#include <gtest/gtest.h>
+
+#include <vector>
+
+#include "breeze/platforms/cuda.cuh"
+#include "test/generated/algorithms/kernels-cuda.cuh"
+#include "test/platforms/cuda_test.cuh"
+
+template <typename T>
+class AlgorithmTest : public ::testing::Test {
+ protected:
+  template <typename ReduceOp, int BLOCK_THREADS, int ITEMS_PER_THREAD,
+            typename U>
+  void Reduce(const std::vector<T>& in, U* out, int num_blocks) {
+    std::vector<U> vec_out(1, *out);
+    CudaTestLaunch<BLOCK_THREADS>(
+        num_blocks,
+        &kernels::Reduce<ReduceOp, BLOCK_THREADS, ITEMS_PER_THREAD, T, U>, in,
+        vec_out, in.size());
+    *out = vec_out[0];
+  }
+
+  template <typename ScanOp, int BLOCK_THREADS, int ITEMS_PER_THREAD,
+            int LOOKBACK_DISTANCE, typename U, typename V>
+  void Scan(const std::vector<T>& in, std::vector<U>& out, int* next_blocks_idx,
+            std::vector<V>& blocks, int num_blocks) {
+    std::vector<int> vec_next_blocks_idx(1, *next_blocks_idx);
+    CudaTestLaunch<BLOCK_THREADS>(
+        num_blocks,
+        &kernels::Scan<ScanOp, BLOCK_THREADS, ITEMS_PER_THREAD,
+                       LOOKBACK_DISTANCE, T, U, V>,
+        in, out, vec_next_blocks_idx, blocks, in.size());
+    *next_blocks_idx = vec_next_blocks_idx[0];
+  }
+
+  template <int BLOCK_THREADS, int ITEMS_PER_THREAD, int TILE_SIZE,
+            int RADIX_BITS>
+  void RadixSortHistogram(const std::vector<T>& in, std::vector<unsigned>& out,
+                          int num_blocks) {
+    CudaTestLaunch<BLOCK_THREADS>(
+        num_blocks,
+        &kernels::RadixSortHistogram<BLOCK_THREADS, ITEMS_PER_THREAD, TILE_SIZE,
+                                     RADIX_BITS, T>,
+        in, out, in.size());
+  }
+
+  template <int BLOCK_THREADS, int ITEMS_PER_THREAD, int RADIX_BITS>
+  void RadixSort(const std::vector<T>& in,
+                 const std::vector<unsigned>& in_offsets, int start_bit,
+                 int num_pass_bits, std::vector<T>& out,
+                 std::vector<int>& next_block_idx,
+                 std::vector<unsigned>& blocks, int num_blocks) {
+    const std::vector<int> vec_start_bit(1, start_bit);
+    const std::vector<int> vec_num_pass_bits(1, num_pass_bits);
+    CudaTestLaunch<BLOCK_THREADS>(
+        num_blocks,
+        &kernels::RadixSort<BLOCK_THREADS, ITEMS_PER_THREAD, RADIX_BITS, T>, in,
+        in_offsets, vec_start_bit, vec_num_pass_bits, out, next_block_idx,
+        blocks, in.size());
+  }
+};
diff --git a/velox/experimental/breeze/test/generated/algorithms/kernels-cuda.cuh b/velox/experimental/breeze/test/generated/algorithms/kernels-cuda.cuh
@@ -0,0 +1,120 @@
+/*
+ * Copyright (c) Facebook, Inc. and its affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * Copyright (c) 2024 by Rivos Inc.
+ * Licensed under the Apache License, Version 2.0, see LICENSE for details.
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+/*
+ * This file is auto-generated from kernel_generator.py
+ * DO NOT EDIT!
+ */
+
+#include "breeze/algorithms/reduce.h"
+#include "breeze/algorithms/scan.h"
+#include "breeze/algorithms/sort.h"
+#include "breeze/platforms/cuda.cuh"
+#include "breeze/platforms/platform.h"
+#include "breeze/utils/types.h"
+
+namespace kernels {
+
+enum { WARP_THREADS = 32 };
+template <typename Op, int BLOCK_THREADS, int ITEMS_PER_THREAD, typename T,
+          typename U>
+__global__ void Reduce(const T* in, U* out, int num_items) {
+  using PlatformT = CudaPlatform<BLOCK_THREADS, WARP_THREADS>;
+  PlatformT p;
+  __shared__
+      typename breeze::algorithms::DeviceReduce<PlatformT, U>::Scratch scratch_;
+  auto scratch =
+      (typename breeze::algorithms::DeviceReduce<PlatformT,
+                                                 U>::Scratch*)&scratch_;
+
+  breeze::algorithms::DeviceReduce<PlatformT, U>::template Reduce<
+      Op, ITEMS_PER_THREAD>(
+      p, breeze::utils::make_slice<breeze::utils::GLOBAL>(in),
+      breeze::utils::make_slice<breeze::utils::GLOBAL>(out),
+      breeze::utils::make_slice<breeze::utils::SHARED>(scratch), num_items);
+}
+
+template <typename Op, int BLOCK_THREADS, int ITEMS_PER_THREAD,
+          int LOOKBACK_DISTANCE, typename T, typename U, typename V>
+__global__ void Scan(const T* in, U* out, int* next_block_idx, V* blocks,
+                     int num_items) {
+  using PlatformT = CudaPlatform<BLOCK_THREADS, WARP_THREADS>;
+  PlatformT p;
+  __shared__ typename breeze::algorithms::DeviceScan<
+      PlatformT, U, ITEMS_PER_THREAD, LOOKBACK_DISTANCE>::Scratch scratch_;
+  auto scratch = (typename breeze::algorithms::DeviceScan<
+                  PlatformT, U, ITEMS_PER_THREAD,
+                  LOOKBACK_DISTANCE>::Scratch*)&scratch_;
+
+  breeze::algorithms::DeviceScan<PlatformT, U, ITEMS_PER_THREAD,
+                                 LOOKBACK_DISTANCE>::
+      template Scan<Op>(
+          p, breeze::utils::make_slice<breeze::utils::GLOBAL>(in),
+          breeze::utils::make_slice<breeze::utils::GLOBAL>(out),
+          breeze::utils::make_slice<breeze::utils::GLOBAL>(next_block_idx),
+          breeze::utils::make_slice<breeze::utils::GLOBAL>(blocks),
+          breeze::utils::make_slice<breeze::utils::SHARED>(scratch), num_items);
+}
+
+template <int BLOCK_THREADS, int ITEMS_PER_THREAD, int TILE_SIZE,
+          int RADIX_BITS, typename T>
+__global__ void RadixSortHistogram(const T* in, unsigned* out, int num_items) {
+  using PlatformT = CudaPlatform<BLOCK_THREADS, WARP_THREADS>;
+  PlatformT p;
+  __shared__ typename breeze::algorithms::DeviceRadixSortHistogram<
+      RADIX_BITS, T>::Scratch scratch_;
+  auto scratch = (typename breeze::algorithms::DeviceRadixSortHistogram<
+                  RADIX_BITS, T>::Scratch*)&scratch_;
+
+  breeze::algorithms::DeviceRadixSortHistogram<RADIX_BITS, T>::template Build<
+      ITEMS_PER_THREAD, TILE_SIZE>(
+      p, breeze::utils::make_slice<breeze::utils::GLOBAL>(in),
+      breeze::utils::make_slice<breeze::utils::GLOBAL>(out),
+      breeze::utils::make_slice<breeze::utils::SHARED>(scratch), num_items);
+}
+
+template <int BLOCK_THREADS, int ITEMS_PER_THREAD, int RADIX_BITS, typename T>
+__global__ void RadixSort(const T* in, const unsigned* in_offsets,
+                          const int* start_bit, const int* num_pass_bits,
+                          T* out, int* next_block_idx, unsigned* blocks,
+                          int num_items) {
+  using PlatformT = CudaPlatform<BLOCK_THREADS, WARP_THREADS>;
+  PlatformT p;
+  __shared__ typename breeze::algorithms::DeviceRadixSort<
+      PlatformT, ITEMS_PER_THREAD, RADIX_BITS, T>::Scratch scratch_;
+  auto scratch =
+      (typename breeze::algorithms::DeviceRadixSort<
+          PlatformT, ITEMS_PER_THREAD, RADIX_BITS, T>::Scratch*)&scratch_;
+
+  breeze::algorithms::DeviceRadixSort<PlatformT, ITEMS_PER_THREAD, RADIX_BITS,
+                                      T>::
+      template Sort<unsigned>(
+          p, breeze::utils::make_slice<breeze::utils::GLOBAL>(in),
+          breeze::utils::make_slice<breeze::utils::GLOBAL>(in_offsets),
+          *start_bit, *num_pass_bits,
+          breeze::utils::make_slice<breeze::utils::GLOBAL>(out),
+          breeze::utils::make_slice<breeze::utils::GLOBAL>(next_block_idx),
+          breeze::utils::make_slice<breeze::utils::GLOBAL>(blocks),
+          breeze::utils::make_slice<breeze::utils::SHARED>(scratch), num_items);
+}
+
+}  // namespace kernels
diff --git a/velox/experimental/breeze/test/generated/functions/function_test-cuda.cuh b/velox/experimental/breeze/test/generated/functions/function_test-cuda.cuh
@@ -0,0 +1,160 @@
+/*
+ * Copyright (c) Facebook, Inc. and its affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * Copyright (c) 2024 by Rivos Inc.
+ * Licensed under the Apache License, Version 2.0, see LICENSE for details.
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+/*
+ * This file is auto-generated from test_fixture_generator.py
+ * DO NOT EDIT!
+ */
+
+#include <gtest/gtest.h>
+
+#include <vector>
+
+#include "breeze/platforms/cuda.cuh"
+#include "test/generated/functions/kernels-cuda.cuh"
+#include "test/platforms/cuda_test.cuh"
+
+template <typename T>
+class FunctionTest : public ::testing::Test {
+ protected:
+  template <int BLOCK_THREADS, int ITEMS_PER_THREAD>
+  void BlockLoad(const std::vector<T>& in, std::vector<T>& out) {
+    CudaTestLaunch<BLOCK_THREADS>(
+        /*num_blocks=*/1,
+        &kernels::BlockLoad<BLOCK_THREADS, ITEMS_PER_THREAD, T>, in, out,
+        in.size());
+  }
+
+  template <int BLOCK_THREADS, int ITEMS_PER_THREAD>
+  void BlockLoadIf(const std::vector<T>& in,
+                   const std::vector<int>& selection_flags,
+                   std::vector<T>& out) {
+    CudaTestLaunch<BLOCK_THREADS>(
+        /*num_blocks=*/1,
+        &kernels::BlockLoadIf<BLOCK_THREADS, ITEMS_PER_THREAD, T>, in,
+        selection_flags, out, in.size());
+  }
+
+  template <int BLOCK_THREADS, int ITEMS_PER_THREAD>
+  void BlockLoadFrom(const std::vector<T>& in, const std::vector<int>& offsets,
+                     std::vector<T>& out) {
+    CudaTestLaunch<BLOCK_THREADS>(
+        /*num_blocks=*/1,
+        &kernels::BlockLoadFrom<BLOCK_THREADS, ITEMS_PER_THREAD, T>, in,
+        offsets, out, in.size());
+  }
+
+  template <int BLOCK_THREADS, int ITEMS_PER_THREAD>
+  void BlockStore(const std::vector<T>& in, std::vector<T>& out) {
+    CudaTestLaunch<BLOCK_THREADS>(
+        /*num_blocks=*/1,
+        &kernels::BlockStore<BLOCK_THREADS, ITEMS_PER_THREAD, T>, in, out,
+        out.size());
+  }
+
+  template <int BLOCK_THREADS, int ITEMS_PER_THREAD>
+  void BlockStoreIf(const std::vector<T>& in,
+                    const std::vector<int>& selection_flags,
+                    std::vector<T>& out) {
+    CudaTestLaunch<BLOCK_THREADS>(
+        /*num_blocks=*/1,
+        &kernels::BlockStoreIf<BLOCK_THREADS, ITEMS_PER_THREAD, T>, in,
+        selection_flags, out, out.size());
+  }
+
+  template <int BLOCK_THREADS, int ITEMS_PER_THREAD>
+  void BlockStoreAt(const std::vector<T>& in, const std::vector<int>& offsets,
+                    std::vector<T>& out) {
+    CudaTestLaunch<BLOCK_THREADS>(
+        /*num_blocks=*/1,
+        &kernels::BlockStoreAt<BLOCK_THREADS, ITEMS_PER_THREAD, T>, in, offsets,
+        out, out.size());
+  }
+
+  template <int BLOCK_THREADS, int ITEMS_PER_THREAD>
+  void BlockStoreAtIf(const std::vector<T>& in, const std::vector<int>& offsets,
+                      const std::vector<int>& selection_flags,
+                      std::vector<T>& out) {
+    CudaTestLaunch<BLOCK_THREADS>(
+        /*num_blocks=*/1,
+        &kernels::BlockStoreAtIf<BLOCK_THREADS, ITEMS_PER_THREAD, T>, in,
+        offsets, selection_flags, out, out.size());
+  }
+
+  template <int BLOCK_THREADS, int ITEMS_PER_THREAD>
+  void BlockFill(T value, std::vector<T>& out) {
+    const std::vector<T> vec_value(1, value);
+    CudaTestLaunch<BLOCK_THREADS>(
+        /*num_blocks=*/1,
+        &kernels::BlockFill<BLOCK_THREADS, ITEMS_PER_THREAD, T>, vec_value, out,
+        out.size());
+  }
+
+  template <int BLOCK_THREADS, int ITEMS_PER_THREAD>
+  void BlockFillAtIf(T value, const std::vector<int>& offsets,
+                     const std::vector<int>& selection_flags,
+                     std::vector<T>& out) {
+    const std::vector<T> vec_value(1, value);
+    CudaTestLaunch<BLOCK_THREADS>(
+        /*num_blocks=*/1,
+        &kernels::BlockFillAtIf<BLOCK_THREADS, ITEMS_PER_THREAD, T>, vec_value,
+        offsets, selection_flags, out, out.size());
+  }
+
+  template <typename ReduceOp, int BLOCK_THREADS, int ITEMS_PER_THREAD,
+            typename U>
+  void BlockReduce(const std::vector<T>& in, U* out) {
+    std::vector<U> vec_out(1, *out);
+    CudaTestLaunch<BLOCK_THREADS>(
+        /*num_blocks=*/1,
+        &kernels::BlockReduce<ReduceOp, BLOCK_THREADS, ITEMS_PER_THREAD, T, U>,
+        in, vec_out, in.size());
+    *out = vec_out[0];
+  }
+
+  template <typename ScanOp, int BLOCK_THREADS, int ITEMS_PER_THREAD,
+            typename U>
+  void BlockScan(const std::vector<T>& in, std::vector<U>& out) {
+    CudaTestLaunch<BLOCK_THREADS>(
+        /*num_blocks=*/1,
+        &kernels::BlockScan<ScanOp, BLOCK_THREADS, ITEMS_PER_THREAD, T, U>, in,
+        out, in.size());
+  }
+
+  template <int BLOCK_THREADS, int ITEMS_PER_THREAD, int RADIX_BITS>
+  void BlockRadixRank(const std::vector<T>& in, std::vector<int>& out) {
+    CudaTestLaunch<BLOCK_THREADS>(
+        /*num_blocks=*/1,
+        &kernels::BlockRadixRank<BLOCK_THREADS, ITEMS_PER_THREAD, RADIX_BITS,
+                                 T>,
+        in, out, in.size());
+  }
+
+  template <int BLOCK_THREADS, int ITEMS_PER_THREAD, int RADIX_BITS>
+  void BlockRadixSort(const std::vector<T>& in, std::vector<T>& out) {
+    CudaTestLaunch<BLOCK_THREADS>(
+        /*num_blocks=*/1,
+        &kernels::BlockRadixSort<BLOCK_THREADS, ITEMS_PER_THREAD, RADIX_BITS,
+                                 T>,
+        in, out, in.size());
+  }
+};