From b1d8f9c7b2b3cad8c4b6a359a4cf3d4b36112423 Mon Sep 17 00:00:00 2001 From: stijn Date: Wed, 11 Oct 2023 15:46:12 +0200 Subject: [PATCH] Update single_include --- single_include/kernel_float.h | 844 ++++++++++++++++++++++++++++++++-- 1 file changed, 817 insertions(+), 27 deletions(-) diff --git a/single_include/kernel_float.h b/single_include/kernel_float.h index aea324e..313cc85 100644 --- a/single_include/kernel_float.h +++ b/single_include/kernel_float.h @@ -16,8 +16,13 @@ //================================================================================ // this file has been auto-generated, do not modify its contents! +<<<<<<< HEAD // date: 2023-10-13 14:55:52.284209 // git hash: 3da5ba08788e4d89a1b20b6a12bb4ba0f8de6b40 +======= +// date: 2023-10-11 15:46:04.149164 +// git hash: b1f6c1b73c2212223b10142054a28806f56b5ee6 +>>>>>>> 9bf416c (Update single_include) //================================================================================ #ifndef KERNEL_FLOAT_MACROS_H @@ -66,6 +71,11 @@ } while (0) #define KERNEL_FLOAT_UNREACHABLE __builtin_unreachable() +// Somet utility macros +#define KERNEL_FLOAT_CONCAT_IMPL(A, B) A##B +#define KERNEL_FLOAT_CONCAT(A, B) KERNEL_FLOAT_CONCAT_IMPL(A, B) +#define KERNEL_FLOAT_CALL(F, ...) F(__VA_ARGS__) + #endif //KERNEL_FLOAT_MACROS_H #ifndef KERNEL_FLOAT_CORE_H #define KERNEL_FLOAT_CORE_H @@ -146,7 +156,7 @@ template using decay_t = typename detail::decay_impl::type; template -struct promote_type; +struct promote_type {}; template struct promote_type { @@ -1303,13 +1313,54 @@ KERNEL_FLOAT_INLINE vector> convert(const V& input, extent new_s return convert_storage(input); } +template +struct AssignConversionProxy { + KERNEL_FLOAT_INLINE + explicit AssignConversionProxy(T* ptr) : ptr_(ptr) {} + + template + KERNEL_FLOAT_INLINE AssignConversionProxy& operator=(U&& values) { + *ptr_ = detail::convert_impl< + vector_value_type, + vector_extent_type, + vector_value_type, + vector_extent_type, + M>::call(into_vector_storage(values)); + + return *this; + } + + private: + T* ptr_; +}; + +/** + * Takes a vector reference and gives back a helper object. This object allows you to assign + * a vector of a different type to another vector while perofrming implicit type converion. + * + * For example, if `x = expression;` does not compile because `x` and `expression` are + * different vector types, you can use `cast_to(x) = expression;` to make it work. + * + * Example + * ======= + * ``` + * vec x; + * vec y = {1.0, 2.0}; + * cast_to(x) = y; // Normally, `x = y;` would give an error, but `cast_to` fixes that. + * ``` + */ +template +KERNEL_FLOAT_INLINE AssignConversionProxy cast_to(T& input) { + return AssignConversionProxy(&input); +} + /** * Returns a vector containing `N` copies of `value`. * * Example * ======= * ``` - * vec a = fill<3>(42); // return [42, 42, 42] + * vec a = fill<3>(42); // returns [42, 42, 42] * ``` */ template @@ -1324,7 +1375,7 @@ KERNEL_FLOAT_INLINE vector> fill(T value = {}, extent = {}) { * Example * ======= * ``` - * vec a = zeros(); // return [0, 0, 0] + * vec a = zeros(); // returns [0, 0, 0] * ``` */ template @@ -1339,7 +1390,7 @@ KERNEL_FLOAT_INLINE vector> zeros(extent = {}) { * Example * ======= * ``` - * vec a = ones(); // return [1, 1, 1] + * vec a = ones(); // returns [1, 1, 1] * ``` */ template @@ -1355,7 +1406,7 @@ KERNEL_FLOAT_INLINE vector> ones(extent = {}) { * ======= * ``` * vec a = {1, 2, 3}; - * vec b = fill_like(a, 42); // return [42, 42, 42] + * vec b = fill_like(a, 42); // returns [42, 42, 42] * ``` */ template, typename E = vector_extent_type> @@ -1370,7 +1421,7 @@ KERNEL_FLOAT_INLINE vector fill_like(const V&, T value) { * ======= * ``` * vec a = {1, 2, 3}; - * vec b = zeros_like(a); // return [0, 0, 0] + * vec b = zeros_like(a); // returns [0, 0, 0] * ``` */ template, typename E = vector_extent_type> @@ -1385,7 +1436,7 @@ KERNEL_FLOAT_INLINE vector zeros_like(const V& = {}) { * ======= * ``` * vec a = {1, 2, 3}; - * vec b = ones_like(a); // return [1, 1, 1] + * vec b = ones_like(a); // returns [1, 1, 1] * ``` */ template, typename E = vector_extent_type> @@ -1765,17 +1816,44 @@ KERNEL_FLOAT_INLINE vector> cross(const L& left, const R& right) { namespace kernel_float { +/** + * `constant` represents a constant value of type `T`. + * + * The object has the property that for any binary operation involving + * a `constant` and a value of type `U`, the constant is automatically + * cast to also be of type `U`. + * + * For example: + * ``` + * float a = 5; + * constant b = 3; + * + * auto c = a + b; // The result will be of type `float` + * ``` + */ template struct constant { + /** + * Create a new constant from the given value. + */ + KERNEL_FLOAT_INLINE + constexpr constant(T value = {}) : value_(value) {} + + KERNEL_FLOAT_INLINE + constexpr constant(const constant& that) : value_(that.value) {} + + /** + * Create a new constant from another constant of type `R`. + */ template KERNEL_FLOAT_INLINE explicit constexpr constant(const constant& that) { auto f = ops::cast(); value_ = f(that.get()); } - KERNEL_FLOAT_INLINE - constexpr constant(T value = {}) : value_(value) {} - + /** + * Return the value of the constant + */ KERNEL_FLOAT_INLINE constexpr T get() const { return value_; @@ -1893,7 +1971,7 @@ namespace kernel_float { * ``` */ template -void for_each(V&& input, F fun) { +KERNEL_FLOAT_INLINE void for_each(V&& input, F fun) { auto storage = into_vector_storage(input); #pragma unroll @@ -1905,13 +1983,13 @@ void for_each(V&& input, F fun) { namespace detail { template struct range_impl { - KERNEL_FLOAT_INLINE - static vector_storage call() { + template + KERNEL_FLOAT_INLINE static vector_storage call(F fun) { vector_storage result; #pragma unroll for (size_t i = 0; i < N; i++) { - result.data()[i] = T(i); + result.data()[i] = fun(i); } return result; @@ -1920,7 +1998,22 @@ struct range_impl { } // namespace detail /** - * Generate vector consisting of the numbers `0...N-1` of type `T` + * Generate vector consisting of the result `fun(0), ..., fun(N-1)` + * + * Example + * ======= + * ``` + * // Returns [0.0f, 2.0f, 4.0f] + * vec vec = range<3>([](auto i){ return float(i * 2.0f); }); + * ``` + */ +template> +KERNEL_FLOAT_INLINE vector> range(F fun) { + return detail::range_impl::call(fun); +} + +/** + * Generate vector consisting of the numbers `0, ..., N-1` of type `T` * * Example * ======= @@ -1931,11 +2024,11 @@ struct range_impl { */ template KERNEL_FLOAT_INLINE vector> range() { - return detail::range_impl::call(); + return detail::range_impl::call(ops::cast()); } /** - * Takes a vector `vec` and returns a new vector consisting of the numbers ``0...N-1`` of type ``T`` + * Takes a vector `vec` and returns a new vector consisting of the numbers ``0, ..., N-1`` of type ``T`` * * Example * ======= @@ -1946,11 +2039,11 @@ KERNEL_FLOAT_INLINE vector> range() { */ template KERNEL_FLOAT_INLINE into_vector_type range_like(const V& = {}) { - return detail::range_impl, vector_extent>::call(); + return range, vector_extent>(); } /** - * Takes a vector of size ``N`` and returns a new vector consisting of the numbers ``0...N-1``. The data type used + * Takes a vector of size ``N`` and returns a new vector consisting of the numbers ``0, ..., N-1``. The data type used * for the indices is given by the first template argument, which is `size_t` by default. This function is useful when * needing to iterate over the indices of a vector. * @@ -1971,7 +2064,7 @@ KERNEL_FLOAT_INLINE into_vector_type range_like(const V& = {}) { */ template KERNEL_FLOAT_INLINE vector> each_index(const V& = {}) { - return detail::range_impl>::call(); + return range>(); } namespace detail { @@ -2118,14 +2211,14 @@ using concat_type = vector, extent>> * ======= * ``` * double vec1 = 1.0; - * double3 vec2 = {3.0, 4.0, 5.0); - * double4 vec3 = {6.0, 7.0, 8.0, 9.0}; - * vec concatenated = concat(vec1, vec2, vec3); // contains [1, 2, 3, 4, 5, 6, 7, 8, 9] + * double3 vec2 = {2.0, 3.0, 4.0); + * double4 vec3 = {5.0, 6.0, 7.0, 8.0}; + * vec concatenated = concat(vec1, vec2, vec3); // contains [1, 2, 3, 4, 5, 6, 7, 8] * * int num1 = 42; * float num2 = 3.14159; * int2 num3 = {-10, 10}; - * vec concatenated = concat(num1, num2, num3); // contains [42, 3.14159, -10, 10] + * vec concatenated = concat(num1, num2, num3); // contains [42, 3.14159, -10, 10] * ``` */ template @@ -2393,6 +2486,7 @@ KERNEL_FLOAT_INLINE void storen(const V& values, T* ptr, size_t offset, size_t m return store(values, ptr, indices, indices < max_length); } +<<<<<<< HEAD // TOOD: check if this way is support across all compilers #if defined(__has_builtin) && __has_builtin(__builtin_assume_aligned) #define KERNEL_FLOAT_ASSUME_ALIGNED(ptr, alignment) (__builtin_assume_aligned(ptr, alignment)) @@ -2537,13 +2631,135 @@ struct aligned_ptr { KERNEL_FLOAT_INLINE explicit aligned_ptr(const T* ptr) : ptr_(ptr) {} +======= +/** + * Returns the original pointer ``ptr`` and hints to the compiler that this pointer is aligned to ``alignment`` bytes. + * If this is not actually the case, compiler optimizations will break things and generate invalid code. Be careful! + */ +template +KERNEL_FLOAT_INLINE T* unsafe_assume_aligned(T* ptr, size_t alignment) { +// TOOD: check if this way is support across all compilers +#if defined(__has_builtin) && __has_builtin(__builtin_assume_aligned) + return static_cast(__builtin_assume_aligned(ptr, alignment)); +#else + return ptr; +#endif +} + +/** + * Represents a pointer of type ``T*`` that is guaranteed to be aligned to ``alignment`` bytes. + */ +template +struct aligned_ptr { + static_assert(alignment >= alignof(T), "invalid alignment"); + + KERNEL_FLOAT_INLINE + aligned_ptr(nullptr_t = nullptr) {} + + KERNEL_FLOAT_INLINE + explicit aligned_ptr(T* ptr) : ptr_(ptr) {} + + /** + * Return the pointer value. + */ + KERNEL_FLOAT_INLINE + T* get() const { + return unsafe_assume_aligned(ptr_, alignment); + } + + KERNEL_FLOAT_INLINE + operator T*() const { + return get(); + } + + template + KERNEL_FLOAT_INLINE T& operator[](I&& index) const { + return get()[std::forward(index)]; + } + + /** + * See ``kernel_float::load`` + */ + template> + KERNEL_FLOAT_INLINE vector load(const I& indices, const M& mask = true) const { + return ::kernel_float::load(get(), indices, mask); + } + + /** + * See ``kernel_float::loadn`` + */ + template + KERNEL_FLOAT_INLINE vector> loadn(size_t offset = 0) const { + return ::kernel_float::loadn(get(), offset); + } + + /** + * See ``kernel_float::loadn`` + */ + template + KERNEL_FLOAT_INLINE vector> loadn(size_t offset, size_t max_length) const { + return ::kernel_float::loadn(get(), offset, max_length); + } + + /** + * See ``kernel_float::store`` + */ + template> + KERNEL_FLOAT_INLINE void store(const V& values, const I& indices, const M& mask = true) const { + ::kernel_float::store(values, get(), indices, mask); + } + /** + * See ``kernel_float::storen`` + */ + template> + KERNEL_FLOAT_INLINE void storen(const V& values, size_t offset = 0) const { + ::kernel_float::storen(values, get(), offset); + } + /** + * See ``kernel_float::storen`` + */ + template> + KERNEL_FLOAT_INLINE void storen(const V& values, size_t offset, size_t max_length) const { + ::kernel_float::storen(values, get(), offset, max_length); + } + + private: + T* ptr_ = nullptr; +}; + +/** + * Represents a pointer of type ``const T*`` that is guaranteed to be aligned to ``alignment`` bytes. + */ +template +struct aligned_ptr { + static_assert(alignment >= alignof(T), "invalid alignment"); + + KERNEL_FLOAT_INLINE + aligned_ptr(nullptr_t = nullptr) {} + + KERNEL_FLOAT_INLINE + explicit aligned_ptr(T* ptr) : ptr_(ptr) {} + + KERNEL_FLOAT_INLINE + explicit aligned_ptr(const T* ptr) : ptr_(ptr) {} + + KERNEL_FLOAT_INLINE + aligned_ptr(const aligned_ptr& ptr) : ptr_(ptr.get()) {} + + KERNEL_FLOAT_INLINE + aligned_ptr(const aligned_ptr& ptr) : ptr_(ptr.get()) {} +>>>>>>> 9bf416c (Update single_include) /** * Return the pointer value. */ KERNEL_FLOAT_INLINE const T* get() const { +<<<<<<< HEAD return KERNEL_FLOAT_ASSUME_ALIGNED(ptr_, alignment); +======= + return unsafe_assume_aligned(ptr_, alignment); +>>>>>>> 9bf416c (Update single_include) } KERNEL_FLOAT_INLINE @@ -2584,6 +2800,12 @@ struct aligned_ptr { const T* ptr_ = nullptr; }; +<<<<<<< HEAD +======= +template +aligned_ptr(T*) -> aligned_ptr; + +>>>>>>> 9bf416c (Update single_include) } // namespace kernel_float #endif //KERNEL_FLOAT_MEMORY_H @@ -2686,7 +2908,7 @@ KERNEL_FLOAT_INLINE T sum(const V& input) { * ======= * ``` * vec x = {5, 0, 2, 1, 0}; - * int y = sum(x); // Returns 5*0*2*1*0 = 0 + * int y = product(x); // Returns 5*0*2*1*0 = 0 * ``` */ template> @@ -2994,9 +3216,9 @@ KERNEL_FLOAT_INLINE vector fma(const A& a, const B& b, const C& c) { namespace kernel_float { /** - * Container that stores ``N`` elements of type ``T``. + * Container that store fixed number of elements of type ``T``. * - * It is not recommended to use this class directly, but instead, use the type `vec` which is an alias for + * It is not recommended to use this class directly, instead, use the type `vec` which is an alias for * `vector, vector_storage>`. * * @tparam T The type of the values stored within the vector. @@ -3047,11 +3269,17 @@ struct vector: public S { return E::size; } + /** + * Returns a reference to the underlying storage type. + */ KERNEL_FLOAT_INLINE storage_type& storage() { return *this; } + /** + * Returns a reference to the underlying storage type. + */ KERNEL_FLOAT_INLINE const storage_type& storage() const { return *this; @@ -4147,3 +4375,565 @@ kconstant(T&&) -> kconstant>; } // namespace kernel_float #endif +#ifndef KERNEL_FLOAT_TILING_H +#define KERNEL_FLOAT_TILING_H + + + + +namespace kernel_float { + +template +struct block_size { + static constexpr size_t rank = sizeof...(Ns); + + KERNEL_FLOAT_INLINE + block_size(dim3 thread_index) { + if (rank > 0 && size(0) > 1) { + thread_index_[0] = thread_index.x; + } + + if (rank > 1 && size(1) > 1) { + thread_index_[1] = thread_index.y; + } + + if (rank > 2 && size(2) > 1) { + thread_index_[2] = thread_index.z; + } + } + + KERNEL_FLOAT_INLINE + size_t thread_index(size_t axis) const { + return axis < rank ? thread_index_[axis] : 0; + } + + KERNEL_FLOAT_INLINE + static constexpr size_t size(size_t axis) { + size_t sizes[rank] = {Ns...}; + return axis < rank ? sizes[axis] : 1; + } + + private: + unsigned int thread_index_[rank] = {0}; +}; + +template +struct virtual_block_size { + static constexpr size_t rank = sizeof...(Ns); + + KERNEL_FLOAT_INLINE + virtual_block_size(dim3 thread_index) { + thread_index_ = thread_index.x; + } + + KERNEL_FLOAT_INLINE + size_t thread_index(size_t axis) const { + size_t product_up_to_axis = 1; +#pragma unroll + for (size_t i = 0; i < axis; i++) { + product_up_to_axis *= size(i); + } + + return (thread_index_ / product_up_to_axis) % size(axis); + } + + KERNEL_FLOAT_INLINE + static constexpr size_t size(size_t axis) { + size_t sizes[rank] = {Ns...}; + return axis < rank ? sizes[axis] : 1; + } + + private: + unsigned int thread_index_ = 0; +}; + +template +struct tile_size { + static constexpr size_t rank = sizeof...(Ns); + + KERNEL_FLOAT_INLINE + static constexpr size_t size(size_t axis, size_t block_size = 0) { + size_t sizes[rank] = {Ns...}; + return axis < rank ? sizes[axis] : 1; + } +}; + +template +struct tile_factor { + static constexpr size_t rank = sizeof...(Ns); + + KERNEL_FLOAT_INLINE + static constexpr size_t size(size_t axis, size_t block_size) { + size_t factors[rank] = {Ns...}; + return block_size * (axis < rank ? factors[axis] : 1); + } +}; + +namespace dist { +template +struct blocked_impl { + static constexpr bool is_exhaustive = N % K == 0; + static constexpr size_t items_per_thread = (N / K) + (is_exhaustive ? 0 : 1); + + KERNEL_FLOAT_INLINE + static constexpr bool local_is_present(size_t thread_index, size_t local_index) { + return is_exhaustive || (local_to_global(thread_index, local_index) < N); + } + + KERNEL_FLOAT_INLINE + static constexpr size_t local_to_global(size_t thread_index, size_t local_index) { + return thread_index * items_per_thread + local_index; + } + + KERNEL_FLOAT_INLINE + static constexpr size_t global_to_local(size_t global_index) { + return global_index % items_per_thread; + } + + KERNEL_FLOAT_INLINE + static constexpr size_t global_to_owner(size_t global_index) { + return global_index / items_per_thread; + } +}; + +struct blocked { + template + using type = blocked_impl; +}; + +template +struct cyclic_impl { + static constexpr bool is_exhaustive = N % (K * M) == 0; + static constexpr size_t items_per_thread = ((N / (K * M)) + (is_exhaustive ? 0 : 1)) * M; + + KERNEL_FLOAT_INLINE + static constexpr bool local_is_present(size_t thread_index, size_t local_index) { + return is_exhaustive || (local_to_global(thread_index, local_index) < N); + } + + KERNEL_FLOAT_INLINE + static constexpr size_t local_to_global(size_t thread_index, size_t local_index) { + return (local_index / M) * M * K + thread_index * M + (local_index % M); + } + + KERNEL_FLOAT_INLINE + static constexpr size_t global_to_local(size_t global_index) { + return (global_index / (M * K)) * M + (global_index % M); + } + + KERNEL_FLOAT_INLINE + static constexpr size_t global_to_owner(size_t global_index) { + return (global_index / M) % K; + } +}; + +struct cyclic { + template + using type = cyclic_impl<1, N, K>; +}; + +template +struct block_cyclic { + template + using type = cyclic_impl; +}; +} // namespace dist + +template +struct distributions {}; + +namespace detail { +template +struct instantiate_distribution_impl { + template + using type = dist::cyclic::type; +}; + +template +struct instantiate_distribution_impl<0, distributions> { + template + using type = typename First::type; +}; + +template +struct instantiate_distribution_impl>: + instantiate_distribution_impl> {}; + +template< + typename TileDim, + typename BlockDim, + typename Distributions, + typename = make_index_sequence> +struct tiling_impl; + +template +struct tiling_impl> { + template + using dist_type = typename instantiate_distribution_impl:: + type; + + static constexpr size_t rank = TileDim::rank; + static constexpr size_t items_per_thread = (dist_type::items_per_thread * ... * 1); + static constexpr bool is_exhaustive = (dist_type::is_exhaustive && ...); + + template + KERNEL_FLOAT_INLINE static vector_storage + local_to_global(const BlockDim& block, size_t item) { + vector_storage result; + ((result.data()[Is] = dist_type::local_to_global( + block.thread_index(Is), + item % dist_type::items_per_thread), + item /= dist_type::items_per_thread), + ...); + return result; + } + + KERNEL_FLOAT_INLINE + static bool local_is_present(const BlockDim& block, size_t item) { + bool is_present = true; + ((is_present &= dist_type::local_is_present( + block.thread_index(Is), + item % dist_type::items_per_thread), + item /= dist_type::items_per_thread), + ...); + return is_present; + } +}; +}; // namespace detail + +template +struct tiling_iterator; + +/** + * Represents a tiling where the elements given by `TileDim` are distributed over the + * threads given by `BlockDim` according to the distributions given by `Distributions`. + * + * The template parameters should be the following: + * + * * ``TileDim``: Should be an instance of ``tile_size<...>``. For example, + * ``tile_size<16, 16>`` represents a 2-dimensional 16x16 tile. + * * ``BlockDim``: Should be an instance of ``block_dim<...>``. For example, + * ``block_dim<16, 4>`` represents a thread block having X dimension 16 + * and Y-dimension 4 for a total of 64 threads per block. + * * ``Distributions``: Should be an instance of ``distributions<...>``. For example, + * ``distributions`` will distribute elements in + * cyclic fashion along the X-axis and blocked fashion along the Y-axis. + * * ``IndexType``: The type used for index values (``int`` by default) + */ +template< + typename TileDim, + typename BlockDim, + typename Distributions = distributions<>, + typename IndexType = int> +struct tiling { + using self_type = tiling; + using impl_type = detail::tiling_impl; + using block_type = BlockDim; + using tile_type = TileDim; + + static constexpr size_t rank = tile_type::rank; + static constexpr size_t num_locals = impl_type::items_per_thread; + + using index_type = IndexType; + using point_type = vector>; + +#if KERNEL_FLOAT_IS_DEVICE + __forceinline__ __device__ tiling() : block_(threadIdx) {} +#endif + + KERNEL_FLOAT_INLINE + tiling(BlockDim block, vec offset = {}) : block_(block), offset_(offset) {} + + /** + * Returns the number of items per thread in the tiling. + * + * Note that this method is ``constexpr`` and can be called at compile-time. + */ + KERNEL_FLOAT_INLINE + static constexpr size_t size() { + return impl_type::items_per_thread; + } + + /** + * Checks if the tiling is exhaustive, meaning all items are always present for all threads. If this returns + * `true`, then ``is_present`` will always true for any given index. + * + * Note that this method is ``constexpr`` and can thus be called at compile-time. + */ + KERNEL_FLOAT_INLINE + static constexpr bool all_present() { + return impl_type::is_exhaustive; + } + + /** + * Checks if a specific item is present for the current thread based on the distribution strategy. Not always + * is the number of items stored per thread equal to the number of items _owned_ by each thread (for example, + * if the tile size is not divisible by the block size). In this case, ``is_present`` will return `false` for + * certain items. + */ + KERNEL_FLOAT_INLINE + bool is_present(size_t item) const { + return all_present() || impl_type::local_is_present(block_, item); + } + + /** + * Returns the global coordinates of a specific item for the current thread. + */ + KERNEL_FLOAT_INLINE + vector> at(size_t item) const { + return impl_type::template local_to_global(block_, item) + offset_; + } + + /** + * Returns the global coordinates of a specific item along a specified axis for the current thread. + */ + KERNEL_FLOAT_INLINE + index_type at(size_t item, size_t axis) const { + return axis < rank ? at(item)[axis] : index_type {}; + } + + /** + * Returns the global coordinates of a specific item for the current thread (alias of ``at``). + */ + KERNEL_FLOAT_INLINE + vector> operator[](size_t item) const { + return at(item); + } + + /** + * Returns a vector of global coordinates of all items present for the current thread. + */ + KERNEL_FLOAT_INLINE + vector>, extent> local_points() const { + return range([&](size_t i) { return at(i); }); + } + + /** + * Returns a vector of coordinate values along a specified axis for all items present for the current thread. + */ + KERNEL_FLOAT_INLINE + vector> local_points(size_t axis) const { + return range([&](size_t i) { return at(i, axis); }); + } + + /** + * Returns a vector of boolean values representing the result of ``is_present`` of the items for the current thread. + */ + KERNEL_FLOAT_INLINE + vector> local_mask() const { + return range([&](size_t i) { return is_present(i); }); + } + + /** + * Returns the thread index (position) along a specified axis for the current thread. + */ + KERNEL_FLOAT_INLINE + index_type thread_index(size_t axis) const { + return index_type(block_.thread_index(axis)); + } + + /** + * Returns the size of the block (number of threads) along a specified axis. + * + * Note that this method is ``constexpr`` and can thus be called at compile-time. + */ + KERNEL_FLOAT_INLINE + static constexpr index_type block_size(size_t axis) { + return index_type(block_type::size(axis)); + } + + /** + * Returns the size of the tile along a specified axis. + * + * Note that this method is ``constexpr`` and can thus be called at compile-time. + */ + KERNEL_FLOAT_INLINE + static constexpr index_type tile_size(size_t axis) { + return index_type(tile_type::size(axis, block_size(axis))); + } + + /** + * Returns the offset of the tile along a specified axis. + */ + KERNEL_FLOAT_INLINE + index_type tile_offset(size_t axis) const { + return index_type(offset_[axis]); + } + + /** + * Returns a vector of thread indices for all axes. + */ + KERNEL_FLOAT_INLINE + vector> thread_index() const { + return range([&](size_t i) { return thread_index(i); }); + } + + /** + * Returns a vector of block sizes for all axes. + */ + KERNEL_FLOAT_INLINE + static vector> block_size() { + return range([&](size_t i) { return block_size(i); }); + } + + /** + * Returns a vector of tile sizes for all axes. + */ + KERNEL_FLOAT_INLINE + static vector> tile_size() { + return range([&](size_t i) { return tile_size(i); }); + } + + /** + * Returns the offset of the tile for all axes. + */ + KERNEL_FLOAT_INLINE + vector> tile_offset() const { + return range([&](size_t i) { return tile_offset(i); }); + } + + /** + * Returns an iterator pointing to the beginning of the tiling. + */ + KERNEL_FLOAT_INLINE + tiling_iterator begin() const { + return {*this, 0}; + } + + /** + * Returns an iterator pointing to the end of the tiling. + */ + KERNEL_FLOAT_INLINE + tiling_iterator end() const { + return {*this, num_locals}; + } + + /** + * Applies a provided function to each item present in the tiling for the current thread. + * The function should take an index and a ``vector`` of global coordinates as arguments. + */ + template + KERNEL_FLOAT_INLINE void for_each(F fun) const { +#pragma unroll + for (size_t i = 0; i < num_locals; i++) { + if (is_present(i)) { + fun(i, at(i)); + } + } + } + + /** + * Adds ``offset`` to all points of this tiling and returns a new tiling. + */ + KERNEL_FLOAT_INLINE friend tiling + operator+(const tiling& self, const vector>& offset) { + return tiling {self.block_, self.offset_ + offset}; + } + + /** + * Adds ``offset`` to all points of this tiling and returns a new tiling. + */ + KERNEL_FLOAT_INLINE friend tiling + operator+(const vector>& offset, const tiling& self) { + return self + offset; + } + + /** + * Adds ``offset`` to all points of this tiling. + */ + KERNEL_FLOAT_INLINE friend tiling& + operator+=(tiling& self, const vector>& offset) { + return self = self + offset; + } + + private: + BlockDim block_; + vector> offset_; +}; + +template +struct tiling_iterator { + using value_type = vector>; + + KERNEL_FLOAT_INLINE + tiling_iterator(const T& inner, size_t position = 0) : inner_(&inner), position_(position) { + while (position_ < T::num_locals && !inner_->is_present(position_)) { + position_++; + } + } + + KERNEL_FLOAT_INLINE + value_type operator*() const { + return inner_->at(position_); + } + + KERNEL_FLOAT_INLINE + tiling_iterator& operator++() { + return *this = tiling_iterator(*inner_, position_ + 1); + } + + KERNEL_FLOAT_INLINE + tiling_iterator operator++(int) { + tiling_iterator old = *this; + this ++; + return old; + } + + KERNEL_FLOAT_INLINE + friend bool operator==(const tiling_iterator& a, const tiling_iterator& b) { + return a.position_ == b.position_; + } + + KERNEL_FLOAT_INLINE + friend bool operator!=(const tiling_iterator& a, const tiling_iterator& b) { + return !operator==(a, b); + } + + size_t position_ = 0; + const T* inner_; +}; + +template +using tiling_1d = tiling, block_size, distributions, IndexType>; + +// clang-format off +#define KERNEL_FLOAT_TILING_FOR_IMPL1(ITER_VAR, TILING, POINT_VAR, _) \ + _Pragma("unroll") \ + for (size_t ITER_VAR = 0; ITER_VAR < (TILING).size(); ITER_VAR++) \ + if (POINT_VAR = (TILING).at(ITER_VAR); (TILING).is_present(ITER_VAR)) \ + +#define KERNEL_FLOAT_TILING_FOR_IMPL2(ITER_VAR, TILING, INDEX_VAR, POINT_VAR) \ + KERNEL_FLOAT_TILING_FOR_IMPL1(ITER_VAR, TILING, POINT_VAR, _) \ + if (INDEX_VAR = ITER_VAR; true) + +#define KERNEL_FLOAT_TILING_FOR_IMPL(ITER_VAR, TILING, A, B, N, ...) \ + KERNEL_FLOAT_CALL(KERNEL_FLOAT_CONCAT(KERNEL_FLOAT_TILING_FOR_IMPL, N), ITER_VAR, TILING, A, B) + +/** + * Iterate over the points in a ``tiling<...>`` using a for loop. + * + * There are two ways to use this macro. Using the 1 variable form: + * ``` + * auto t = tiling, block_size<4, 4>>; + * + * KERNEL_FLOAT_TILING_FOR(t, auto point) { + * printf("%d,%d\n", point[0], point[1]); + * } + * ``` + * + * Or using the 2 variables form: + * ``` + * auto t = tiling, block_size<4, 4>>; + * + * KERNEL_FLOAT_TILING_FOR(t, auto index, auto point) { + * printf("%d] %d,%d\n", index, point[0], point[1]); + * } + * ``` + */ +#define KERNEL_FLOAT_TILING_FOR(...) \ + KERNEL_FLOAT_TILING_FOR_IMPL(KERNEL_FLOAT_CONCAT(__tiling_index_variable__, __LINE__), __VA_ARGS__, 2, 1) +// clang-format on + +} // namespace kernel_float + +#endif // KERNEL_FLOAT_TILING_H