Skip to content

Commit

Permalink
build: check-in breeze test fixtures for cuda
Browse files Browse the repository at this point in the history
  • Loading branch information
David Reveman committed Nov 25, 2024
1 parent 011d3d5 commit 827f116
Show file tree
Hide file tree
Showing 5 changed files with 654 additions and 0 deletions.
2 changes: 2 additions & 0 deletions velox/experimental/breeze/test/generate.sh
Original file line number Diff line number Diff line change
Expand Up @@ -30,3 +30,5 @@ function generate {

generate openmp "algorithm" h
generate openmp "function" h
generate cuda "algorithm" cuh
generate cuda "function" cuh
Original file line number Diff line number Diff line change
@@ -0,0 +1,88 @@
/*
* Copyright (c) Facebook, Inc. and its affiliates.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

/*
* Copyright (c) 2024 by Rivos Inc.
* Licensed under the Apache License, Version 2.0, see LICENSE for details.
* SPDX-License-Identifier: Apache-2.0
*/

/*
* This file is auto-generated from test_fixture_generator.py
* DO NOT EDIT!
*/

#include <gtest/gtest.h>

#include <vector>

#include "breeze/platforms/cuda.cuh"
#include "test/generated/algorithms/kernels-cuda.cuh"
#include "test/platforms/cuda_test.cuh"

template <typename T>
class AlgorithmTest : public ::testing::Test {
protected:
template <typename ReduceOp, int BLOCK_THREADS, int ITEMS_PER_THREAD,
typename U>
void Reduce(const std::vector<T>& in, U* out, int num_blocks) {
std::vector<U> vec_out(1, *out);
CudaTestLaunch<BLOCK_THREADS>(
num_blocks,
&kernels::Reduce<ReduceOp, BLOCK_THREADS, ITEMS_PER_THREAD, T, U>, in,
vec_out, in.size());
*out = vec_out[0];
}

template <typename ScanOp, int BLOCK_THREADS, int ITEMS_PER_THREAD,
int LOOKBACK_DISTANCE, typename U, typename V>
void Scan(const std::vector<T>& in, std::vector<U>& out, int* next_blocks_idx,
std::vector<V>& blocks, int num_blocks) {
std::vector<int> vec_next_blocks_idx(1, *next_blocks_idx);
CudaTestLaunch<BLOCK_THREADS>(
num_blocks,
&kernels::Scan<ScanOp, BLOCK_THREADS, ITEMS_PER_THREAD,
LOOKBACK_DISTANCE, T, U, V>,
in, out, vec_next_blocks_idx, blocks, in.size());
*next_blocks_idx = vec_next_blocks_idx[0];
}

template <int BLOCK_THREADS, int ITEMS_PER_THREAD, int TILE_SIZE,
int RADIX_BITS>
void RadixSortHistogram(const std::vector<T>& in, std::vector<unsigned>& out,
int num_blocks) {
CudaTestLaunch<BLOCK_THREADS>(
num_blocks,
&kernels::RadixSortHistogram<BLOCK_THREADS, ITEMS_PER_THREAD, TILE_SIZE,
RADIX_BITS, T>,
in, out, in.size());
}

template <int BLOCK_THREADS, int ITEMS_PER_THREAD, int RADIX_BITS>
void RadixSort(const std::vector<T>& in,
const std::vector<unsigned>& in_offsets, int start_bit,
int num_pass_bits, std::vector<T>& out,
std::vector<int>& next_block_idx,
std::vector<unsigned>& blocks, int num_blocks) {
const std::vector<int> vec_start_bit(1, start_bit);
const std::vector<int> vec_num_pass_bits(1, num_pass_bits);
CudaTestLaunch<BLOCK_THREADS>(
num_blocks,
&kernels::RadixSort<BLOCK_THREADS, ITEMS_PER_THREAD, RADIX_BITS, T>, in,
in_offsets, vec_start_bit, vec_num_pass_bits, out, next_block_idx,
blocks, in.size());
}
};
120 changes: 120 additions & 0 deletions velox/experimental/breeze/test/generated/algorithms/kernels-cuda.cuh
Original file line number Diff line number Diff line change
@@ -0,0 +1,120 @@
/*
* Copyright (c) Facebook, Inc. and its affiliates.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

/*
* Copyright (c) 2024 by Rivos Inc.
* Licensed under the Apache License, Version 2.0, see LICENSE for details.
* SPDX-License-Identifier: Apache-2.0
*/

/*
* This file is auto-generated from kernel_generator.py
* DO NOT EDIT!
*/

#include "breeze/algorithms/reduce.h"
#include "breeze/algorithms/scan.h"
#include "breeze/algorithms/sort.h"
#include "breeze/platforms/cuda.cuh"
#include "breeze/platforms/platform.h"
#include "breeze/utils/types.h"

namespace kernels {

enum { WARP_THREADS = 32 };
template <typename Op, int BLOCK_THREADS, int ITEMS_PER_THREAD, typename T,
typename U>
__global__ void Reduce(const T* in, U* out, int num_items) {
using PlatformT = CudaPlatform<BLOCK_THREADS, WARP_THREADS>;
PlatformT p;
__shared__
typename breeze::algorithms::DeviceReduce<PlatformT, U>::Scratch scratch_;
auto scratch =
(typename breeze::algorithms::DeviceReduce<PlatformT,
U>::Scratch*)&scratch_;

breeze::algorithms::DeviceReduce<PlatformT, U>::template Reduce<
Op, ITEMS_PER_THREAD>(
p, breeze::utils::make_slice<breeze::utils::GLOBAL>(in),
breeze::utils::make_slice<breeze::utils::GLOBAL>(out),
breeze::utils::make_slice<breeze::utils::SHARED>(scratch), num_items);
}

template <typename Op, int BLOCK_THREADS, int ITEMS_PER_THREAD,
int LOOKBACK_DISTANCE, typename T, typename U, typename V>
__global__ void Scan(const T* in, U* out, int* next_block_idx, V* blocks,
int num_items) {
using PlatformT = CudaPlatform<BLOCK_THREADS, WARP_THREADS>;
PlatformT p;
__shared__ typename breeze::algorithms::DeviceScan<
PlatformT, U, ITEMS_PER_THREAD, LOOKBACK_DISTANCE>::Scratch scratch_;
auto scratch = (typename breeze::algorithms::DeviceScan<
PlatformT, U, ITEMS_PER_THREAD,
LOOKBACK_DISTANCE>::Scratch*)&scratch_;

breeze::algorithms::DeviceScan<PlatformT, U, ITEMS_PER_THREAD,
LOOKBACK_DISTANCE>::
template Scan<Op>(
p, breeze::utils::make_slice<breeze::utils::GLOBAL>(in),
breeze::utils::make_slice<breeze::utils::GLOBAL>(out),
breeze::utils::make_slice<breeze::utils::GLOBAL>(next_block_idx),
breeze::utils::make_slice<breeze::utils::GLOBAL>(blocks),
breeze::utils::make_slice<breeze::utils::SHARED>(scratch), num_items);
}

template <int BLOCK_THREADS, int ITEMS_PER_THREAD, int TILE_SIZE,
int RADIX_BITS, typename T>
__global__ void RadixSortHistogram(const T* in, unsigned* out, int num_items) {
using PlatformT = CudaPlatform<BLOCK_THREADS, WARP_THREADS>;
PlatformT p;
__shared__ typename breeze::algorithms::DeviceRadixSortHistogram<
RADIX_BITS, T>::Scratch scratch_;
auto scratch = (typename breeze::algorithms::DeviceRadixSortHistogram<
RADIX_BITS, T>::Scratch*)&scratch_;

breeze::algorithms::DeviceRadixSortHistogram<RADIX_BITS, T>::template Build<
ITEMS_PER_THREAD, TILE_SIZE>(
p, breeze::utils::make_slice<breeze::utils::GLOBAL>(in),
breeze::utils::make_slice<breeze::utils::GLOBAL>(out),
breeze::utils::make_slice<breeze::utils::SHARED>(scratch), num_items);
}

template <int BLOCK_THREADS, int ITEMS_PER_THREAD, int RADIX_BITS, typename T>
__global__ void RadixSort(const T* in, const unsigned* in_offsets,
const int* start_bit, const int* num_pass_bits,
T* out, int* next_block_idx, unsigned* blocks,
int num_items) {
using PlatformT = CudaPlatform<BLOCK_THREADS, WARP_THREADS>;
PlatformT p;
__shared__ typename breeze::algorithms::DeviceRadixSort<
PlatformT, ITEMS_PER_THREAD, RADIX_BITS, T>::Scratch scratch_;
auto scratch =
(typename breeze::algorithms::DeviceRadixSort<
PlatformT, ITEMS_PER_THREAD, RADIX_BITS, T>::Scratch*)&scratch_;

breeze::algorithms::DeviceRadixSort<PlatformT, ITEMS_PER_THREAD, RADIX_BITS,
T>::
template Sort<unsigned>(
p, breeze::utils::make_slice<breeze::utils::GLOBAL>(in),
breeze::utils::make_slice<breeze::utils::GLOBAL>(in_offsets),
*start_bit, *num_pass_bits,
breeze::utils::make_slice<breeze::utils::GLOBAL>(out),
breeze::utils::make_slice<breeze::utils::GLOBAL>(next_block_idx),
breeze::utils::make_slice<breeze::utils::GLOBAL>(blocks),
breeze::utils::make_slice<breeze::utils::SHARED>(scratch), num_items);
}

} // namespace kernels
Original file line number Diff line number Diff line change
@@ -0,0 +1,160 @@
/*
* Copyright (c) Facebook, Inc. and its affiliates.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

/*
* Copyright (c) 2024 by Rivos Inc.
* Licensed under the Apache License, Version 2.0, see LICENSE for details.
* SPDX-License-Identifier: Apache-2.0
*/

/*
* This file is auto-generated from test_fixture_generator.py
* DO NOT EDIT!
*/

#include <gtest/gtest.h>

#include <vector>

#include "breeze/platforms/cuda.cuh"
#include "test/generated/functions/kernels-cuda.cuh"
#include "test/platforms/cuda_test.cuh"

template <typename T>
class FunctionTest : public ::testing::Test {
protected:
template <int BLOCK_THREADS, int ITEMS_PER_THREAD>
void BlockLoad(const std::vector<T>& in, std::vector<T>& out) {
CudaTestLaunch<BLOCK_THREADS>(
/*num_blocks=*/1,
&kernels::BlockLoad<BLOCK_THREADS, ITEMS_PER_THREAD, T>, in, out,
in.size());
}

template <int BLOCK_THREADS, int ITEMS_PER_THREAD>
void BlockLoadIf(const std::vector<T>& in,
const std::vector<int>& selection_flags,
std::vector<T>& out) {
CudaTestLaunch<BLOCK_THREADS>(
/*num_blocks=*/1,
&kernels::BlockLoadIf<BLOCK_THREADS, ITEMS_PER_THREAD, T>, in,
selection_flags, out, in.size());
}

template <int BLOCK_THREADS, int ITEMS_PER_THREAD>
void BlockLoadFrom(const std::vector<T>& in, const std::vector<int>& offsets,
std::vector<T>& out) {
CudaTestLaunch<BLOCK_THREADS>(
/*num_blocks=*/1,
&kernels::BlockLoadFrom<BLOCK_THREADS, ITEMS_PER_THREAD, T>, in,
offsets, out, in.size());
}

template <int BLOCK_THREADS, int ITEMS_PER_THREAD>
void BlockStore(const std::vector<T>& in, std::vector<T>& out) {
CudaTestLaunch<BLOCK_THREADS>(
/*num_blocks=*/1,
&kernels::BlockStore<BLOCK_THREADS, ITEMS_PER_THREAD, T>, in, out,
out.size());
}

template <int BLOCK_THREADS, int ITEMS_PER_THREAD>
void BlockStoreIf(const std::vector<T>& in,
const std::vector<int>& selection_flags,
std::vector<T>& out) {
CudaTestLaunch<BLOCK_THREADS>(
/*num_blocks=*/1,
&kernels::BlockStoreIf<BLOCK_THREADS, ITEMS_PER_THREAD, T>, in,
selection_flags, out, out.size());
}

template <int BLOCK_THREADS, int ITEMS_PER_THREAD>
void BlockStoreAt(const std::vector<T>& in, const std::vector<int>& offsets,
std::vector<T>& out) {
CudaTestLaunch<BLOCK_THREADS>(
/*num_blocks=*/1,
&kernels::BlockStoreAt<BLOCK_THREADS, ITEMS_PER_THREAD, T>, in, offsets,
out, out.size());
}

template <int BLOCK_THREADS, int ITEMS_PER_THREAD>
void BlockStoreAtIf(const std::vector<T>& in, const std::vector<int>& offsets,
const std::vector<int>& selection_flags,
std::vector<T>& out) {
CudaTestLaunch<BLOCK_THREADS>(
/*num_blocks=*/1,
&kernels::BlockStoreAtIf<BLOCK_THREADS, ITEMS_PER_THREAD, T>, in,
offsets, selection_flags, out, out.size());
}

template <int BLOCK_THREADS, int ITEMS_PER_THREAD>
void BlockFill(T value, std::vector<T>& out) {
const std::vector<T> vec_value(1, value);
CudaTestLaunch<BLOCK_THREADS>(
/*num_blocks=*/1,
&kernels::BlockFill<BLOCK_THREADS, ITEMS_PER_THREAD, T>, vec_value, out,
out.size());
}

template <int BLOCK_THREADS, int ITEMS_PER_THREAD>
void BlockFillAtIf(T value, const std::vector<int>& offsets,
const std::vector<int>& selection_flags,
std::vector<T>& out) {
const std::vector<T> vec_value(1, value);
CudaTestLaunch<BLOCK_THREADS>(
/*num_blocks=*/1,
&kernels::BlockFillAtIf<BLOCK_THREADS, ITEMS_PER_THREAD, T>, vec_value,
offsets, selection_flags, out, out.size());
}

template <typename ReduceOp, int BLOCK_THREADS, int ITEMS_PER_THREAD,
typename U>
void BlockReduce(const std::vector<T>& in, U* out) {
std::vector<U> vec_out(1, *out);
CudaTestLaunch<BLOCK_THREADS>(
/*num_blocks=*/1,
&kernels::BlockReduce<ReduceOp, BLOCK_THREADS, ITEMS_PER_THREAD, T, U>,
in, vec_out, in.size());
*out = vec_out[0];
}

template <typename ScanOp, int BLOCK_THREADS, int ITEMS_PER_THREAD,
typename U>
void BlockScan(const std::vector<T>& in, std::vector<U>& out) {
CudaTestLaunch<BLOCK_THREADS>(
/*num_blocks=*/1,
&kernels::BlockScan<ScanOp, BLOCK_THREADS, ITEMS_PER_THREAD, T, U>, in,
out, in.size());
}

template <int BLOCK_THREADS, int ITEMS_PER_THREAD, int RADIX_BITS>
void BlockRadixRank(const std::vector<T>& in, std::vector<int>& out) {
CudaTestLaunch<BLOCK_THREADS>(
/*num_blocks=*/1,
&kernels::BlockRadixRank<BLOCK_THREADS, ITEMS_PER_THREAD, RADIX_BITS,
T>,
in, out, in.size());
}

template <int BLOCK_THREADS, int ITEMS_PER_THREAD, int RADIX_BITS>
void BlockRadixSort(const std::vector<T>& in, std::vector<T>& out) {
CudaTestLaunch<BLOCK_THREADS>(
/*num_blocks=*/1,
&kernels::BlockRadixSort<BLOCK_THREADS, ITEMS_PER_THREAD, RADIX_BITS,
T>,
in, out, in.size());
}
};
Loading

0 comments on commit 827f116

Please sign in to comment.