Harmonize constant batch type signature with non-constant batch
batch<T, A> <> batch_constant<T, A, Csts...>
batch_bool<T, A> <> batch_bool_constant<T, A, Csts...>

This is a strong API (and ABI) change, but it makes the type system more
harmonious.
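
For illustration, user-visible spellings change like this (a sketch; the avx2 tag and the constant lists are placeholders, not taken from the patch):

    // before: the first template parameter was a full batch type
    using old_mask = xsimd::batch_constant<xsimd::batch<uint64_t, xsimd::avx2>, 3, 2, 1, 0>;
    using old_cond = xsimd::batch_bool_constant<xsimd::batch<double, xsimd::avx2>, true, false, true, false>;
    // after: element type and architecture are spelled directly,
    // mirroring batch<T, A> and batch_bool<T, A>
    using new_mask = xsimd::batch_constant<uint64_t, xsimd::avx2, 3, 2, 1, 0>;
    using new_cond = xsimd::batch_bool_constant<double, xsimd::avx2, true, false, true, false>;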
serge-sans-paille committed Mar 25, 2024
1 parent 6b372a2 commit 76cc907
Showing 22 changed files with 273 additions and 269 deletions.
36 changes: 19 additions & 17 deletions docs/source/api/batch_manip.rst
@@ -1,25 +1,27 @@
.. Copyright (c) 2021, Serge Guelton

   Distributed under the terms of the BSD 3-Clause License.

   The full license is in the file LICENSE, distributed with this software.

Conditional expression
======================

+------------------------------+-------------------------------------------+
| :cpp:func:`select`           | conditional selection with mask           |
+------------------------------+-------------------------------------------+

----

.. doxygenfunction:: select(batch_bool<T, A> const &cond, batch<T, A> const &true_br, batch<T, A> const &false_br) noexcept
   :project: xsimd

- .. doxygenfunction:: select(batch_bool_constant<batch<T, A>, Values...> const &cond, batch<T, A> const &true_br, batch<T, A> const &false_br) noexcept
+ .. doxygenfunction:: select(batch_bool_constant<T, A, Values...> const &cond, batch<T, A> const &true_br, batch<T, A> const &false_br) noexcept
   :project: xsimd

In the specific case when one needs to conditionally increment or decrement a
batch based on a mask, :cpp:func:`incr_if` and :cpp:func:`decr_if` provide
specialized versions.
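
For example, a minimal sketch of the masked-increment idiom (`count_matches` is a hypothetical helper, not from the docs):

    #include "xsimd/xsimd.hpp"

    // Adds 1 only in the lanes where m is true: acc + (m ? 1 : 0).
    template <class A>
    xsimd::batch<int32_t, A> count_matches(xsimd::batch<int32_t, A> const& acc,
                                           xsimd::batch_bool<int32_t, A> const& m)
    {
        return xsimd::incr_if(acc, m);
    }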
2 changes: 1 addition & 1 deletion include/xsimd/arch/generic/xsimd_generic_math.hpp
@@ -2064,7 +2064,7 @@ namespace xsimd
inline T reduce(Op op, batch<T, A> const& self, std::integral_constant<unsigned, Lvl>) noexcept
{
using index_type = as_unsigned_integer_t<T>;
- batch<T, A> split = swizzle(self, make_batch_constant<batch<index_type, A>, split_high<index_type, Lvl / 2>>());
+ batch<T, A> split = swizzle(self, make_batch_constant<index_type, A, split_high<index_type, Lvl / 2>>());
return reduce(op, op(split, self), std::integral_constant<unsigned, Lvl / 2>());
}
}
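
For context, the generator idiom used by split_high above now names the element type and architecture directly. A minimal sketch with a hypothetical reverse_index generator (generators expose static constexpr T get(size_t index, size_t size)):

    #include <cstddef>
    #include <cstdint>
    #include "xsimd/xsimd.hpp"

    // Hypothetical generator: index i maps to size - 1 - i, reversing lanes.
    struct reverse_index
    {
        static constexpr std::uint32_t get(std::size_t index, std::size_t size)
        {
            return static_cast<std::uint32_t>(size - 1 - index);
        }
    };

    template <class A>
    xsimd::batch<float, A> reverse(xsimd::batch<float, A> const& x)
    {
        // New spelling: make_batch_constant<T, A, Generator>()
        return xsimd::swizzle(x, xsimd::make_batch_constant<std::uint32_t, A, reverse_index>());
    }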
26 changes: 13 additions & 13 deletions include/xsimd/arch/generic/xsimd_generic_memory.hpp
@@ -21,10 +21,10 @@

namespace xsimd
{
- template <class batch_type, typename batch_type::value_type... Values>
+ template <typename T, class A, T... Values>
struct batch_constant;

- template <class batch_type, bool... Values>
+ template <typename T, class A, bool... Values>
struct batch_bool_constant;

namespace kernel
@@ -180,7 +180,7 @@ namespace xsimd
}
};
batch<T, A> tmp(val);
- return select(make_batch_bool_constant<batch<T, A>, index_mask>(), self, tmp);
+ return select(make_batch_bool_constant<T, A, index_mask>(), self, tmp);
}

// get
Expand Down Expand Up @@ -295,7 +295,7 @@ namespace xsimd
}
};

- return swizzle(self, make_batch_constant<batch<as_unsigned_integer_t<T>, A>, rotate_generator>(), A {});
+ return swizzle(self, make_batch_constant<as_unsigned_integer_t<T>, A, rotate_generator>(), A {});
}

template <size_t N, class A, class T>
@@ -316,7 +316,7 @@
}
};

- return swizzle(self, make_batch_constant<batch<as_unsigned_integer_t<T>, A>, rotate_generator>(), A {});
+ return swizzle(self, make_batch_constant<as_unsigned_integer_t<T>, A, rotate_generator>(), A {});
}

template <size_t N, class A, class T>
@@ -455,19 +455,19 @@
}

template <class A, typename T, typename ITy, ITy... Indices>
- inline batch<T, A> shuffle(batch<T, A> const& x, batch<T, A> const& y, batch_constant<batch<ITy, A>, Indices...>, requires_arch<generic>) noexcept
+ inline batch<T, A> shuffle(batch<T, A> const& x, batch<T, A> const& y, batch_constant<ITy, A, Indices...>, requires_arch<generic>) noexcept
{
constexpr size_t bsize = sizeof...(Indices);

// Detect common patterns
XSIMD_IF_CONSTEXPR(detail::is_swizzle_fst(bsize, Indices...))
{
- return swizzle(x, batch_constant<batch<ITy, A>, ((Indices >= bsize) ? 0 /* never happens */ : Indices)...>());
+ return swizzle(x, batch_constant<ITy, A, ((Indices >= bsize) ? 0 /* never happens */ : Indices)...>());
}

XSIMD_IF_CONSTEXPR(detail::is_swizzle_snd(bsize, Indices...))
{
- return swizzle(y, batch_constant<batch<ITy, A>, ((Indices >= bsize) ? (Indices - bsize) : 0 /* never happens */)...>());
+ return swizzle(y, batch_constant<ITy, A, ((Indices >= bsize) ? (Indices - bsize) : 0 /* never happens */)...>());
}

XSIMD_IF_CONSTEXPR(detail::is_zip_lo(bsize, Indices...))
@@ -482,7 +482,7 @@

XSIMD_IF_CONSTEXPR(detail::is_select(bsize, Indices...))
{
- return select(batch_bool_constant<batch<T, A>, (Indices < bsize)...>(), x, y);
+ return select(batch_bool_constant<T, A, (Indices < bsize)...>(), x, y);
}

#if defined(__has_builtin)
@@ -503,9 +503,9 @@
#else
// Use a generic_pattern. It is suboptimal but clang optimizes this
// pretty well.
- batch<T, A> x_lane = swizzle(x, batch_constant<batch<ITy, A>, ((Indices >= bsize) ? (Indices - bsize) : Indices)...>());
- batch<T, A> y_lane = swizzle(y, batch_constant<batch<ITy, A>, ((Indices >= bsize) ? (Indices - bsize) : Indices)...>());
- batch_bool_constant<batch<T, A>, (Indices < bsize)...> select_x_lane;
+ batch<T, A> x_lane = swizzle(x, batch_constant<ITy, A, ((Indices >= bsize) ? (Indices - bsize) : Indices)...>());
+ batch<T, A> y_lane = swizzle(y, batch_constant<ITy, A, ((Indices >= bsize) ? (Indices - bsize) : Indices)...>());
+ batch_bool_constant<T, A, (Indices < bsize)...> select_x_lane;
return select(select_x_lane, x_lane, y_lane);
#endif
}
@@ -542,7 +542,7 @@

// swizzle
template <class A, class T, class ITy, ITy... Vs>
- inline batch<std::complex<T>, A> swizzle(batch<std::complex<T>, A> const& self, batch_constant<batch<ITy, A>, Vs...> mask, requires_arch<generic>) noexcept
+ inline batch<std::complex<T>, A> swizzle(batch<std::complex<T>, A> const& self, batch_constant<ITy, A, Vs...> mask, requires_arch<generic>) noexcept
{
return { swizzle(self.real(), mask), swizzle(self.imag(), mask) };
}
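
As a usage sketch of the harmonized shuffle (zip_low is a hypothetical helper; assumes an 8-lane float batch, e.g. AVX, so indices 0-7 select from x and 8-15 from y):

    #include <cstdint>
    #include "xsimd/xsimd.hpp"

    // Interleaves the low halves of x and y; the pattern 0,8,1,9,...
    // is one of the common patterns (zip_lo) the generic kernel detects.
    template <class A>
    xsimd::batch<float, A> zip_low(xsimd::batch<float, A> const& x,
                                   xsimd::batch<float, A> const& y)
    {
        return xsimd::shuffle(x, y,
                              xsimd::batch_constant<std::uint32_t, A, 0, 8, 1, 9, 2, 10, 3, 11>());
    }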
30 changes: 15 additions & 15 deletions include/xsimd/arch/xsimd_avx.hpp
@@ -1161,22 +1161,22 @@ namespace xsimd
return detail::merge_sse(res_low, res_hi);
}
template <class A, class T, bool... Values, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
- inline batch<T, A> select(batch_bool_constant<batch<T, A>, Values...> const&, batch<T, A> const& true_br, batch<T, A> const& false_br, requires_arch<avx>) noexcept
+ inline batch<T, A> select(batch_bool_constant<T, A, Values...> const&, batch<T, A> const& true_br, batch<T, A> const& false_br, requires_arch<avx>) noexcept
{
return select(batch_bool<T, A> { Values... }, true_br, false_br, avx2 {});
}

template <class A, bool... Values>
- inline batch<float, A> select(batch_bool_constant<batch<float, A>, Values...> const&, batch<float, A> const& true_br, batch<float, A> const& false_br, requires_arch<avx>) noexcept
+ inline batch<float, A> select(batch_bool_constant<float, A, Values...> const&, batch<float, A> const& true_br, batch<float, A> const& false_br, requires_arch<avx>) noexcept
{
- constexpr auto mask = batch_bool_constant<batch<float, A>, Values...>::mask();
+ constexpr auto mask = batch_bool_constant<float, A, Values...>::mask();
return _mm256_blend_ps(false_br, true_br, mask);
}

template <class A, bool... Values>
- inline batch<double, A> select(batch_bool_constant<batch<double, A>, Values...> const&, batch<double, A> const& true_br, batch<double, A> const& false_br, requires_arch<avx>) noexcept
+ inline batch<double, A> select(batch_bool_constant<double, A, Values...> const&, batch<double, A> const& true_br, batch<double, A> const& false_br, requires_arch<avx>) noexcept
{
- constexpr auto mask = batch_bool_constant<batch<double, A>, Values...>::mask();
+ constexpr auto mask = batch_bool_constant<double, A, Values...>::mask();
return _mm256_blend_pd(false_br, true_br, mask);
}

@@ -1238,7 +1238,7 @@

// shuffle
template <class A, class ITy, ITy I0, ITy I1, ITy I2, ITy I3, ITy I4, ITy I5, ITy I6, ITy I7>
- inline batch<float, A> shuffle(batch<float, A> const& x, batch<float, A> const& y, batch_constant<batch<ITy, A>, I0, I1, I2, I3, I4, I5, I6, I7> mask, requires_arch<avx>) noexcept
+ inline batch<float, A> shuffle(batch<float, A> const& x, batch<float, A> const& y, batch_constant<ITy, A, I0, I1, I2, I3, I4, I5, I6, I7> mask, requires_arch<avx>) noexcept
{
constexpr uint32_t smask = detail::mod_shuffle(I0, I1, I2, I3);
// shuffle within lane
@@ -1253,7 +1253,7 @@
}

template <class A, class ITy, ITy I0, ITy I1, ITy I2, ITy I3>
- inline batch<double, A> shuffle(batch<double, A> const& x, batch<double, A> const& y, batch_constant<batch<ITy, A>, I0, I1, I2, I3> mask, requires_arch<avx>) noexcept
+ inline batch<double, A> shuffle(batch<double, A> const& x, batch<double, A> const& y, batch_constant<ITy, A, I0, I1, I2, I3> mask, requires_arch<avx>) noexcept
{
constexpr uint32_t smask = (I0 & 0x1) | ((I1 & 0x1) << 1) | ((I2 & 0x1) << 2) | ((I3 & 0x1) << 3);
// shuffle within lane
@@ -1504,7 +1504,7 @@

// swizzle (constant mask)
template <class A, uint32_t V0, uint32_t V1, uint32_t V2, uint32_t V3, uint32_t V4, uint32_t V5, uint32_t V6, uint32_t V7>
- inline batch<float, A> swizzle(batch<float, A> const& self, batch_constant<batch<uint32_t, A>, V0, V1, V2, V3, V4, V5, V6, V7>, requires_arch<avx>) noexcept
+ inline batch<float, A> swizzle(batch<float, A> const& self, batch_constant<uint32_t, A, V0, V1, V2, V3, V4, V5, V6, V7>, requires_arch<avx>) noexcept
{
// duplicate low and high part of input
__m256 hi = _mm256_castps128_ps256(_mm256_extractf128_ps(self, 1));
@@ -1514,22 +1514,22 @@
__m256 low_low = _mm256_insertf128_ps(self, _mm256_castps256_ps128(low), 1);

// normalize mask
- batch_constant<batch<uint32_t, A>, (V0 % 4), (V1 % 4), (V2 % 4), (V3 % 4), (V4 % 4), (V5 % 4), (V6 % 4), (V7 % 4)> half_mask;
+ batch_constant<uint32_t, A, (V0 % 4), (V1 % 4), (V2 % 4), (V3 % 4), (V4 % 4), (V5 % 4), (V6 % 4), (V7 % 4)> half_mask;

// permute within each lane
__m256 r0 = _mm256_permutevar_ps(low_low, (batch<uint32_t, A>)half_mask);
__m256 r1 = _mm256_permutevar_ps(hi_hi, (batch<uint32_t, A>)half_mask);

// mask to choose the right lane
- batch_bool_constant<batch<uint32_t, A>, (V0 >= 4), (V1 >= 4), (V2 >= 4), (V3 >= 4), (V4 >= 4), (V5 >= 4), (V6 >= 4), (V7 >= 4)> blend_mask;
+ batch_bool_constant<uint32_t, A, (V0 >= 4), (V1 >= 4), (V2 >= 4), (V3 >= 4), (V4 >= 4), (V5 >= 4), (V6 >= 4), (V7 >= 4)> blend_mask;

// blend the two permutes
constexpr auto mask = blend_mask.mask();
return _mm256_blend_ps(r0, r1, mask);
}

template <class A, uint64_t V0, uint64_t V1, uint64_t V2, uint64_t V3>
- inline batch<double, A> swizzle(batch<double, A> const& self, batch_constant<batch<uint64_t, A>, V0, V1, V2, V3>, requires_arch<avx>) noexcept
+ inline batch<double, A> swizzle(batch<double, A> const& self, batch_constant<uint64_t, A, V0, V1, V2, V3>, requires_arch<avx>) noexcept
{
// duplicate low and high part of input
__m256d hi = _mm256_castpd128_pd256(_mm256_extractf128_pd(self, 1));
@@ -1539,14 +1539,14 @@
__m256d low_low = _mm256_insertf128_pd(self, _mm256_castpd256_pd128(low), 1);

// normalize mask
- batch_constant<batch<uint64_t, A>, (V0 % 2) * -1, (V1 % 2) * -1, (V2 % 2) * -1, (V3 % 2) * -1> half_mask;
+ batch_constant<uint64_t, A, (V0 % 2) * -1, (V1 % 2) * -1, (V2 % 2) * -1, (V3 % 2) * -1> half_mask;

// permute within each lane
__m256d r0 = _mm256_permutevar_pd(low_low, (batch<uint64_t, A>)half_mask);
__m256d r1 = _mm256_permutevar_pd(hi_hi, (batch<uint64_t, A>)half_mask);

// mask to choose the right lane
- batch_bool_constant<batch<uint64_t, A>, (V0 >= 2), (V1 >= 2), (V2 >= 2), (V3 >= 2)> blend_mask;
+ batch_bool_constant<uint64_t, A, (V0 >= 2), (V1 >= 2), (V2 >= 2), (V3 >= 2)> blend_mask;

// blend the two permutes
constexpr auto mask = blend_mask.mask();
@@ -1564,7 +1564,7 @@
uint32_t V7,
detail::enable_sized_integral_t<T, 4> = 0>
inline batch<T, A> swizzle(batch<T, A> const& self,
- batch_constant<batch<uint32_t, A>,
+ batch_constant<uint32_t, A,
V0,
V1,
V2,
@@ -1588,7 +1588,7 @@
detail::enable_sized_integral_t<T, 8> = 0>
inline batch<T, A>
swizzle(batch<T, A> const& self,
- batch_constant<batch<uint64_t, A>, V0, V1, V2, V3> const& mask,
+ batch_constant<uint64_t, A, V0, V1, V2, V3> const& mask,
requires_arch<avx>) noexcept
{
return bitwise_cast<T>(
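
As a usage sketch of the constant-mask select (blend_even_odd is a hypothetical helper; assumes an 8-lane float batch so that, on AVX, the call lowers to a single _mm256_blend_ps):

    #include "xsimd/xsimd.hpp"

    // Compile-time blend: true lanes come from a, false lanes from b.
    template <class A>
    xsimd::batch<float, A> blend_even_odd(xsimd::batch<float, A> const& a,
                                          xsimd::batch<float, A> const& b)
    {
        return xsimd::select(
            xsimd::batch_bool_constant<float, A, true, false, true, false,
                                                 true, false, true, false>(),
            a, b);
    }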
16 changes: 8 additions & 8 deletions include/xsimd/arch/xsimd_avx2.hpp
@@ -729,9 +729,9 @@ namespace xsimd
}
}
template <class A, class T, bool... Values, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
- inline batch<T, A> select(batch_bool_constant<batch<T, A>, Values...> const&, batch<T, A> const& true_br, batch<T, A> const& false_br, requires_arch<avx2>) noexcept
+ inline batch<T, A> select(batch_bool_constant<T, A, Values...> const&, batch<T, A> const& true_br, batch<T, A> const& false_br, requires_arch<avx2>) noexcept
{
- constexpr int mask = batch_bool_constant<batch<T, A>, Values...>::mask();
+ constexpr int mask = batch_bool_constant<T, A, Values...>::mask();
// FIXME: for some reason mask here is not considered as an immediate,
// but it's okay for _mm256_blend_epi32
// case 2: return _mm256_blend_epi16(false_br, true_br, mask);
@@ -912,36 +912,36 @@

// swizzle (constant mask)
template <class A, uint32_t V0, uint32_t V1, uint32_t V2, uint32_t V3, uint32_t V4, uint32_t V5, uint32_t V6, uint32_t V7>
- inline batch<float, A> swizzle(batch<float, A> const& self, batch_constant<batch<uint32_t, A>, V0, V1, V2, V3, V4, V5, V6, V7> mask, requires_arch<avx2>) noexcept
+ inline batch<float, A> swizzle(batch<float, A> const& self, batch_constant<uint32_t, A, V0, V1, V2, V3, V4, V5, V6, V7> mask, requires_arch<avx2>) noexcept
{
return _mm256_permutevar8x32_ps(self, (batch<uint32_t, A>)mask);
}

template <class A, uint64_t V0, uint64_t V1, uint64_t V2, uint64_t V3>
- inline batch<double, A> swizzle(batch<double, A> const& self, batch_constant<batch<uint64_t, A>, V0, V1, V2, V3>, requires_arch<avx2>) noexcept
+ inline batch<double, A> swizzle(batch<double, A> const& self, batch_constant<uint64_t, A, V0, V1, V2, V3>, requires_arch<avx2>) noexcept
{
constexpr auto mask = detail::shuffle(V0, V1, V2, V3);
return _mm256_permute4x64_pd(self, mask);
}

template <class A, uint64_t V0, uint64_t V1, uint64_t V2, uint64_t V3>
- inline batch<uint64_t, A> swizzle(batch<uint64_t, A> const& self, batch_constant<batch<uint64_t, A>, V0, V1, V2, V3>, requires_arch<avx2>) noexcept
+ inline batch<uint64_t, A> swizzle(batch<uint64_t, A> const& self, batch_constant<uint64_t, A, V0, V1, V2, V3>, requires_arch<avx2>) noexcept
{
constexpr auto mask = detail::shuffle(V0, V1, V2, V3);
return _mm256_permute4x64_epi64(self, mask);
}
template <class A, uint64_t V0, uint64_t V1, uint64_t V2, uint64_t V3>
- inline batch<int64_t, A> swizzle(batch<int64_t, A> const& self, batch_constant<batch<uint64_t, A>, V0, V1, V2, V3> mask, requires_arch<avx2>) noexcept
+ inline batch<int64_t, A> swizzle(batch<int64_t, A> const& self, batch_constant<uint64_t, A, V0, V1, V2, V3> mask, requires_arch<avx2>) noexcept
{
return bitwise_cast<int64_t>(swizzle(bitwise_cast<uint64_t>(self), mask, avx2 {}));
}
template <class A, uint32_t V0, uint32_t V1, uint32_t V2, uint32_t V3, uint32_t V4, uint32_t V5, uint32_t V6, uint32_t V7>
- inline batch<uint32_t, A> swizzle(batch<uint32_t, A> const& self, batch_constant<batch<uint32_t, A>, V0, V1, V2, V3, V4, V5, V6, V7> mask, requires_arch<avx2>) noexcept
+ inline batch<uint32_t, A> swizzle(batch<uint32_t, A> const& self, batch_constant<uint32_t, A, V0, V1, V2, V3, V4, V5, V6, V7> mask, requires_arch<avx2>) noexcept
{
return _mm256_permutevar8x32_epi32(self, (batch<uint32_t, A>)mask);
}
template <class A, uint32_t V0, uint32_t V1, uint32_t V2, uint32_t V3, uint32_t V4, uint32_t V5, uint32_t V6, uint32_t V7>
- inline batch<int32_t, A> swizzle(batch<int32_t, A> const& self, batch_constant<batch<uint32_t, A>, V0, V1, V2, V3, V4, V5, V6, V7> mask, requires_arch<avx2>) noexcept
+ inline batch<int32_t, A> swizzle(batch<int32_t, A> const& self, batch_constant<uint32_t, A, V0, V1, V2, V3, V4, V5, V6, V7> mask, requires_arch<avx2>) noexcept
{
return bitwise_cast<int32_t>(swizzle(bitwise_cast<uint32_t>(self), mask, avx2 {}));
}
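
And a constant swizzle under the new signature (rotate_lanes_left is a hypothetical helper; assumes a 4-lane double batch, which on AVX2 lowers to _mm256_permute4x64_pd):

    #include <cstdint>
    #include "xsimd/xsimd.hpp"

    // Rotates lanes left by one: {a, b, c, d} -> {b, c, d, a}.
    template <class A>
    xsimd::batch<double, A> rotate_lanes_left(xsimd::batch<double, A> const& x)
    {
        return xsimd::swizzle(x, xsimd::batch_constant<std::uint64_t, A, 1, 2, 3, 0>());
    }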