diff --git a/src/Reduce.cpp b/src/Reduce.cpp
index 7bd2e3f9..3786642d 100644
--- a/src/Reduce.cpp
+++ b/src/Reduce.cpp
@@ -5,8 +5,10 @@
 #include "Convert.h"
 #include "missing_values.h"
 #include "platform_detect.h"
+#include "simd/avx2.h"
 #include
+#include
 #if defined(__clang__)
     #pragma clang diagnostic ignored "-Wmissing-braces"
@@ -76,13 +78,41 @@ static inline __m256i MIN_OP(uint32_t z, __m256i x, __m256i y)
 {
     return _mm256_min_epu32(x, y);
 }
-static inline __m256 MIN_OP(float z, __m256 x, __m256 y)
+
+static inline __m256 MIN_OP(float z, __m256 x, __m256 y)
 {
-    return _mm256_min_ps(x, y);
+    // Create a mask indicating which components of 'x', if any, are NaNs.
+    const auto unord_cmp_mask = _mm256_cmp_ps(x, x, _CMP_UNORD_Q);
+
+    // Use the blendv operation with our "x is nan" mask to create a new vector
+    // containing the components of 'y' where 'x' was _not_ a NaN, and the components
+    // of 'x' which were NaNs.
+    const auto blended_nans = _mm256_blendv_ps(y, x, unord_cmp_mask);
+
+    // Perform the min operation on 'x' and the blended vector.
+    // minps / vminps will take the component from the 2nd argument whenever one of
+    // the components (from the 1st and 2nd arguments) is NaN. Putting the blended
+    // vector we created on the r.h.s. means we'll get a NaN component in the output
+    // vector for any index (within the vector) where _either_ the left or right side was NaN.
+    return _mm256_min_ps(x, blended_nans);
 }
+
 static inline __m256d MIN_OP(double z, __m256d x, __m256d y)
 {
-    return _mm256_min_pd(x, y);
+    // Create a mask indicating which components of 'x', if any, are NaNs.
+    const auto unord_cmp_mask = _mm256_cmp_pd(x, x, _CMP_UNORD_Q);
+
+    // Use the blendv operation with our "x is nan" mask to create a new vector
+    // containing the components of 'y' where 'x' was _not_ a NaN, and the components
+    // of 'x' which were NaNs.
+    const auto blended_nans = _mm256_blendv_pd(y, x, unord_cmp_mask);
+
+    // Perform the min operation on 'x' and the blended vector.
+    // minpd / vminpd will take the component from the 2nd argument whenever one of
+    // the components (from the 1st and 2nd arguments) is NaN. Putting the blended
+    // vector we created on the r.h.s. means we'll get a NaN component in the output
+    // vector for any index (within the vector) where _either_ the left or right side was NaN.
+    return _mm256_min_pd(x, blended_nans);
 }
 
 static inline __m256i MAX_OP(int8_t z, __m256i x, __m256i y)
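For reference, the per-lane behavior these blendv-based overloads are aiming for can be written in scalar form roughly as follows (an illustrative sketch only, not part of the patch; nan_propagating_min is a hypothetical name, and the same shape applies to the max case and to the min_with_nan_passthru / max_with_nan_passthru helpers in src/simd/avx2.h):

#include <cmath>

// Plain _mm256_min_ps(x, y) returns its *second* operand whenever either lane is NaN,
// so by itself it only propagates NaNs that happen to sit in 'y'. Blending x's NaNs
// into the second operand first makes the result NaN whenever either input is NaN.
static float nan_propagating_min(float x, float y)
{
    const float blended = std::isnan(x) ? x : y; // mirrors _mm256_blendv_ps(y, x, unord_cmp_mask)
    return (x < blended) ? x : blended;          // mirrors minps: NaN compares false, so 'blended' wins
}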
@@ -109,18 +139,51 @@ static inline __m256i MAX_OP(uint32_t z, __m256i x, __m256i y)
 {
     return _mm256_max_epu32(x, y);
 }
-static inline __m256 MAX_OP(float z, __m256 x, __m256 y)
+
+static inline __m256 MAX_OP(float z, __m256 x, __m256 y)
 {
-    return _mm256_max_ps(x, y);
+    // Create a mask indicating which components of 'x', if any, are NaNs.
+    const auto unord_cmp_mask = _mm256_cmp_ps(x, x, _CMP_UNORD_Q);
+
+    // Use the blendv operation with our "x is nan" mask to create a new vector
+    // containing the components of 'y' where 'x' was _not_ a NaN, and the components
+    // of 'x' which were NaNs.
+    const auto blended_nans = _mm256_blendv_ps(y, x, unord_cmp_mask);
+
+    // Perform the max operation on 'x' and the blended vector.
+    // maxps / vmaxps will take the component from the 2nd argument whenever one of
+    // the components (from the 1st and 2nd arguments) is NaN. Putting the blended
+    // vector we created on the r.h.s. means we'll get a NaN component in the output
+    // vector for any index (within the vector) where _either_ the left or right side was NaN.
+    return _mm256_max_ps(x, blended_nans);
 }
+
+// This function implements a 'max' operation which returns a NaN when either argument (per component)
+// contains a NaN. This fixes the behavior of the _mm256_max_pd() intrinsic, which does not always propagate
+// NaNs as expected.
 static inline __m256d MAX_OP(double z, __m256d x, __m256d y)
 {
-    return _mm256_max_pd(x, y);
+    // Create a mask indicating which components of 'x', if any, are NaNs.
+    const auto unord_cmp_mask = _mm256_cmp_pd(x, x, _CMP_UNORD_Q);
+
+    // Use the blendv operation with our "x is nan" mask to create a new vector
+    // containing the components of 'y' where 'x' was _not_ a NaN, and the components
+    // of 'x' which were NaNs.
+    const auto blended_nans = _mm256_blendv_pd(y, x, unord_cmp_mask);
+
+    // Perform the max operation on 'x' and the blended vector.
+    // maxpd / vmaxpd will take the component from the 2nd argument whenever one of
+    // the components (from the 1st and 2nd arguments) is NaN. Putting the blended
+    // vector we created on the r.h.s. means we'll get a NaN component in the output
+    // vector for any index (within the vector) where _either_ the left or right side was NaN.
+    return _mm256_max_pd(x, blended_nans);
 }
+
+
 /**
  * @brief AVX psuedo-intrinsic implementing the C 'fmax' function for a vector
- * of packed flats.
+ * of packed floats.
  *
  * @param x
  * @param y
@@ -150,7 +213,7 @@
 /**
  * @brief AVX psuedo-intrinsic implementing the C 'fmin' function for a vector
- * of packed flats.
+ * of packed floats.
  *
  * @param x
  * @param y
@@ -336,6 +399,77 @@ static inline __m128i CAST_256_TO_128_HI(__m256i x)
 {
     return _mm256_extractf128_si256(x, 1);
 }
 
+// /**
+//  * @brief AVX pseudo-intrinsic implementing the C 'fmax' function for a vector of packed doubles.
+//  *
+//  * @param x
+//  * @param y
+//  * @return __m256d
+//  */
+// static inline __m256d _mm256_fmax_pd(__m256d x, __m256d y)
+// {
+//     const auto max_result = _mm256_max_pd(x, y);
+//     const auto unord_cmp_mask = _mm256_cmp_pd(max_result, max_result, _CMP_UNORD_Q);
+//     return _mm256_blendv_pd(x, y, unord_cmp_mask);
+// }
+
+
+// /**
+//  * @brief Function like @c std::min but which always propagates NaN values (for floating-point types).
+//  *
+//  * @tparam T The element type.
+//  * @param x The left element.
+//  * @param y The right element.
+//  * @return T const& The result of the operation.
+//  */
+// template <typename T>
+// static T const& min_with_nan_passthru(T const& x, T const& y)
+// {
+//     return (std::min)(x, y);
+// }
+
+// template<>
+// static float const& min_with_nan_passthru(float const& x, float const& y)
+// {
+//     const auto blended = (x != x) ? x : y;
+//     return x < blended ? x : blended;
+// }
+
+// template<>
+// static double const& min_with_nan_passthru(double const& x, double const& y)
+// {
+//     const auto blended = (x != x) ? x : y;
+//     return x < blended ? x : blended;
+// }
+
+// /**
+//  * @brief Function like @c std::max but which always propagates NaN values (for floating-point types).
+//  *
+//  * @tparam T The element type.
+//  * @param x The left element.
+//  * @param y The right element.
+//  * @return T const& The result of the operation.
+//  */
+// template <typename T>
+// static T const& max_with_nan_passthru(T const& x, T const& y)
+// {
+//     return (std::max)(x, y);
+// }
+
+// template<>
+// static float const& max_with_nan_passthru(float const& x, float const& y)
+// {
+//     const auto blended = (x != x) ? x : y;
+//     return x > blended ? x : blended;
+// }
+
+// template<>
+// static double const& max_with_nan_passthru(double const& x, double const& y)
+// {
+//     const auto blended = (x != x) ? x : y;
+//     return x > blended ? x : blended;
+// }
+
 struct stArgScatterGatherFunc
 {
     // numpy input type
@@ -781,24 +915,38 @@ class ReduceNanargmin final
 class ReduceMax final
 {
     //--------------------------------------------------------------------------------------------
-    template <typename T>
+    template <typename T, bool recognize_integer_invalids>
     static double non_vector(void * pDataIn, int64_t len, stScatterGatherFunc * pstScatterGatherFunc)
     {
-        T * pIn = (T *)pDataIn;
-        const T * const pEnd = pIn + len;
+        using namespace riptide::math;
 
-        // Always set first item
-        T result = *pIn++;
+        // TODO: Handle case where len <= 0 to avoid dereferencing an empty array below.
 
-        while (pIn < pEnd)
-        {
-            // get the maximum
-            T temp = *pIn;
-            if (temp > result)
-            {
-                result = temp;
+        const T * const pIn = (T *)pDataIn;
+
+        // Initialize the result with the first (0th) element.
+        T result = pIn[0];
+
+        // Indicates when T is an integral type AND we're *NOT* recognizing / doing anything special
+        // for integer invalid values.
+        static constexpr bool skip_invalid_check = std::is_integral<T>::value && !recognize_integer_invalids;
+
+        // get the maximum, making sure NaNs are propagated correctly.
+        if (skip_invalid_check || invalid_for_type<T>::is_valid(result)) {
+            for (int64_t i = 1; i < len; i++) {
+                const auto temp = pIn[i];
+
+                if (!skip_invalid_check && !invalid_for_type<T>::is_valid(temp)) {
+                    // We've found a NaN value. NaNs taint the result, so we set
+                    // the result value here then short-circuit since there's no
+                    // point in continuing any further.
+                    result = temp;
+                    break;
+                }
+                else if (temp > result) {
+                    result = temp;
+                }
             }
-            pIn++;
         }
 
         // Check for previous scattering. If we are the first one
@@ -812,31 +960,41 @@
             // in case of nan when calling max (instead of nanmax), preserve nans
             if (pstScatterGatherFunc->resultOut == pstScatterGatherFunc->resultOut)
            {
-                pstScatterGatherFunc->resultOut = MAXF(pstScatterGatherFunc->resultOut, (double)result);
+                pstScatterGatherFunc->resultOut = max_with_nan_passthru(pstScatterGatherFunc->resultOut, (double)result);
            }
            T previous = (T)(pstScatterGatherFunc->resultOutInt64);
-            pstScatterGatherFunc->resultOutInt64 = (int64_t)(MAXF(previous, result));
+            pstScatterGatherFunc->resultOutInt64 = (int64_t)(max_with_nan_passthru(previous, result));
        }
        pstScatterGatherFunc->lenOut += len;
        return (double)pstScatterGatherFunc->resultOutInt64;
    }
 
    //--------------------------------------------------------------------------------------------
-    template <typename T, typename U256>
+    template <typename T, bool recognize_integer_invalids>
    static double avx2(void * pDataIn, int64_t len, stScatterGatherFunc * pstScatterGatherFunc)
    {
+        using namespace riptide::math;
+
+        static_assert(
+            !std::is_integral<T>::value || !recognize_integer_invalids,
+            "Recognition of integer invalids not yet supported in vectorized (AVX2) mode."
+        );
+
+        typedef typename riptide::simd::avx2::vec256<T>::reg_type U256;
+
        T * pIn = (T *)pDataIn;
        T * pEnd;
 
        // Always set first item
        T result = *pIn;
 
+        static constexpr uint8_t unroll_factor = 8;
        // For float32 this is 64 /8  since sizeof(float) = 4
        // For float64 this is 32/ 4
        // for int16 this is 128 / 16
        // for int8 this is 256 / 32
-        static constexpr int64_t chunkSize = (sizeof(U256) * 8) / sizeof(T);
+        static constexpr int64_t chunkSize = (sizeof(U256) * unroll_factor) / sizeof(T);
        static constexpr int64_t perReg = sizeof(U256) / sizeof(T);
 
        if (len >= chunkSize)
@@ -858,6 +1016,13 @@
            while (pIn256 < pEnd256)
            {
+                // TODO: Consider integrating a check into this loop to detect if we've seen a NaN,
+                //       and if so, short-circuiting the loop -- there's no point in continuing the
+                //       loop if e.g. the first element of the array is a NaN.
+                //       We could also short-circuit for integer types if we've seen std::numeric_limits<T>::max()
+                //       or std::numeric_limits<T>::infinity() for floating-point types.
+                //       This can be done quickly with some comparisons against those values, then horizontally
+                //       reducing that mask using e.g. _mm256_movemask_pd/ps() and checking if the result == 15 or 255 (respectively).
                m0 = MAX_OP(result, m0, LOADU(pIn256));
                m1 = MAX_OP(result, m1, LOADU(pIn256 + 1));
                m2 = MAX_OP(result, m2, LOADU(pIn256 + 2));
@@ -875,20 +1040,21 @@
            m2 = MAX_OP(result, m2, m3);
            m4 = MAX_OP(result, m4, m5);
            m6 = MAX_OP(result, m6, m7);
+            m0 = MAX_OP(result, m0, m2);
            m4 = MAX_OP(result, m4, m6);
+            m0 = MAX_OP(result, m0, m4);
 
            // Write 256 bits into memory
+            // PERF: Replace this memory spill with a vector reduction (e.g. using unpackhi/unpacklo)
            __m256i temp;
            _mm256_storeu_si256(&temp, *(__m256i *)&m0);
            T * tempT = (T *)&temp;
-            result = tempT[0];
-
-            for (int i = 1; i < perReg; i++)
+            for (int i = 0; i < perReg; i++)
            {
-                result = MAXF(result, tempT[i]);
+                result = max_with_nan_passthru(result, tempT[i]);
            }
            // update pIn to last location we read
@@ -898,7 +1064,7 @@
        while (pIn < pEnd)
        {
            // get the minimum
-            result = MAXF(result, *pIn);
+            result = max_with_nan_passthru(result, *pIn);
            pIn++;
        }
@@ -910,9 +1076,9 @@
        }
        else
        {
-            pstScatterGatherFunc->resultOut = MAXF(pstScatterGatherFunc->resultOut, (double)result);
+            pstScatterGatherFunc->resultOut = max_with_nan_passthru(pstScatterGatherFunc->resultOut, (double)result);
            T previous = (T)(pstScatterGatherFunc->resultOutInt64);
-            pstScatterGatherFunc->resultOutInt64 = (int64_t)(MAXF(previous, result));
+            pstScatterGatherFunc->resultOutInt64 = (int64_t)(max_with_nan_passthru(previous, result));
        }
        pstScatterGatherFunc->lenOut += len;
        return pstScatterGatherFunc->resultOut;
    }
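The PERF note above asks for an in-register reduction in place of the 32-byte memory spill; one possible shape for the float case is sketched below (illustrative only, not part of the patch; horizontal_max_ps is a hypothetical helper, and it handles NaNs with a single up-front check so its result matches the "NaNs taint the result" convention used by MAX_OP):

#include <immintrin.h>
#include <limits>

// Reduce a __m256 of 8 floats to a scalar max without spilling to memory.
static inline float horizontal_max_ps(__m256 v)
{
    // A lane compares unordered with itself only when it holds a NaN.
    const __m256 nan_mask = _mm256_cmp_ps(v, v, _CMP_UNORD_Q);
    if (_mm256_movemask_ps(nan_mask) != 0)
    {
        return std::numeric_limits<float>::quiet_NaN();
    }

    // No NaNs present, so the plain max intrinsics are safe to use.
    __m128 m = _mm_max_ps(_mm256_castps256_ps128(v), _mm256_extractf128_ps(v, 1)); // 8 -> 4
    m = _mm_max_ps(m, _mm_movehl_ps(m, m));                                        // 4 -> 2
    m = _mm_max_ss(m, _mm_shuffle_ps(m, m, 0x55));                                 // 2 -> 1
    return _mm_cvtss_f32(m);
}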
@@ -921,35 +1087,36 @@
 public:
    // Given the numpy dtype number of the input array, returns a pointer to the
    // reduction function specialized for that array type.
-    static ANY_SCATTER_GATHER_FUNC GetScatterGatherFuncPtr(const NPY_TYPES inputType)
+    static ANY_SCATTER_GATHER_FUNC GetScatterGatherFuncPtr(const NPY_TYPES inputType, const bool recognize_integer_invalids)
    {
        switch (inputType)
        {
        case NPY_FLOAT:
-            return avx2<float, __m256>;
+            return avx2<float, false>;
        case NPY_DOUBLE:
-            return avx2<double, __m256d>;
+            return avx2<double, false>;
        case NPY_LONGDOUBLE:
-            return non_vector<long double>;
+            return non_vector<long double, false>;
        case NPY_BOOL:
+            // TODO: The special case of bool (or bool as a view of int8/uint8) should use
+            //       a vectorized any() kernel instead, because it doesn't have an invalid value
+            //       and it can short-circuit.
        case NPY_INT8:
-            return avx2<int8_t, __m256i>;
+            return recognize_integer_invalids ? non_vector<int8_t, true> : avx2<int8_t, false>;
        case NPY_INT16:
-            return avx2<int16_t, __m256i>;
+            return recognize_integer_invalids ? non_vector<int16_t, true> : avx2<int16_t, false>;
        CASE_NPY_INT32:
-            return avx2<int32_t, __m256i>;
+            return recognize_integer_invalids ? non_vector<int32_t, true> : avx2<int32_t, false>;
        CASE_NPY_INT64:
-
-            return non_vector<int64_t>;
+            return recognize_integer_invalids ? non_vector<int64_t, true> : non_vector<int64_t, false>;
        case NPY_UINT8:
-            return non_vector<uint8_t>;
+            return recognize_integer_invalids ? non_vector<uint8_t, true> : non_vector<uint8_t, false>;
        case NPY_UINT16:
-            return non_vector<uint16_t>;
+            return recognize_integer_invalids ? non_vector<uint16_t, true> : non_vector<uint16_t, false>;
        CASE_NPY_UINT32:
-            return avx2<uint32_t, __m256i>;
+            return recognize_integer_invalids ? non_vector<uint32_t, true> : avx2<uint32_t, false>;
        CASE_NPY_UINT64:
-
-            return non_vector<uint64_t>;
+            return recognize_integer_invalids ? non_vector<uint64_t, true> : non_vector<uint64_t, false>;
        default:
            return nullptr;
        }
@@ -963,24 +1130,38 @@ class ReduceMin final
 class ReduceMin final
 {
    //--------------------------------------------------------------------------------------------
-    template <typename T>
+    template <typename T, bool recognize_integer_invalids>
    static double non_vector(void * pDataIn, int64_t len, stScatterGatherFunc * pstScatterGatherFunc)
    {
-        T * pIn = (T *)pDataIn;
-        const T * const pEnd = pIn + len;
+        using namespace riptide::math;
 
-        // Always set first item
-        T result = *pIn++;
+        // TODO: Handle case where len <= 0 to avoid dereferencing an empty array below.
 
-        while (pIn < pEnd)
-        {
-            // get the minimum
-            T temp = *pIn;
-            if (temp < result)
-            {
-                result = temp;
+        const T * const pIn = (T *)pDataIn;
+
+        // Initialize the result with the first (0th) element.
+        T result = pIn[0];
+
+        // Indicates when T is an integral type AND we're *NOT* recognizing / doing anything special
+        // for integer invalid values.
+        static constexpr bool skip_invalid_check = std::is_integral<T>::value && !recognize_integer_invalids;
+
+        // get the minimum, making sure NaNs are propagated correctly.
+        if (skip_invalid_check || invalid_for_type<T>::is_valid(result)) {
+            for (int64_t i = 1; i < len; i++) {
+                const auto temp = pIn[i];
+
+                if (!skip_invalid_check && !invalid_for_type<T>::is_valid(temp)) {
+                    // We've found a NaN value. NaNs taint the result, so we set
+                    // the result value here then short-circuit since there's no
+                    // point in continuing any further.
+                    result = temp;
+                    break;
+                }
+                else if (temp < result) {
+                    result = temp;
+                }
            }
-            pIn++;
        }
 
        // Check for previous scattering. If we are the first one
@@ -994,19 +1175,28 @@
            // in case of nan when calling min (instead of nanmin), preserve nans
            if (pstScatterGatherFunc->resultOut == pstScatterGatherFunc->resultOut)
            {
-                pstScatterGatherFunc->resultOut = MINF(pstScatterGatherFunc->resultOut, (double)result);
+                pstScatterGatherFunc->resultOut = min_with_nan_passthru(pstScatterGatherFunc->resultOut, (double)result);
            }
            T previous = (T)(pstScatterGatherFunc->resultOutInt64);
-            pstScatterGatherFunc->resultOutInt64 = (int64_t)(MINF(previous, result));
+            pstScatterGatherFunc->resultOutInt64 = (int64_t)(min_with_nan_passthru(previous, result));
        }
        pstScatterGatherFunc->lenOut += len;
        return (double)pstScatterGatherFunc->resultOutInt64;
    }
 
    //--------------------------------------------------------------------------------------------
-    template <typename T, typename U256, typename U128>
+    template <typename T, bool recognize_integer_invalids>
    static double avx2(void * pDataIn, int64_t len, stScatterGatherFunc * pstScatterGatherFunc)
    {
+        using namespace riptide::math;
+
+        static_assert(
+            !std::is_integral<T>::value || !recognize_integer_invalids,
+            "Recognition of integer invalids not yet supported in vectorized (AVX2) mode."
+        );
+
+        typedef typename riptide::simd::avx2::vec256<T>::reg_type U256;
+
        T * pIn = (T *)pDataIn;
        T * pEnd;
@@ -1017,8 +1207,8 @@
        // For float64 this is 32/ 4
        // for int16 this is 128 / 16
        // for int8 this is 256 / 32
-        const int64_t chunkSize = (sizeof(U256) * 8) / sizeof(T);
-        const int64_t perReg = sizeof(U256) / sizeof(T);
+        static constexpr int64_t chunkSize = (sizeof(U256) * 8) / sizeof(T);
+        static constexpr int64_t perReg = sizeof(U256) / sizeof(T);
 
        // printf("hit this %lld %lld\n", len, chunkSize);
        if (len >= chunkSize)
@@ -1040,6 +1230,13 @@
            while (pIn256 < pEnd256)
            {
+                // TODO: Consider integrating a check into this loop to detect if we've seen a NaN,
+                //       and if so, short-circuiting the loop -- there's no point in continuing the
+                //       loop if e.g. the first element of the array is a NaN.
+                //       We could also short-circuit for integer types if we've seen std::numeric_limits<T>::min()
+                //       or -std::numeric_limits<T>::infinity() for floating-point types.
+                //       This can be done quickly with some comparisons against those values, then horizontally
+                //       reducing that mask using e.g. _mm256_movemask_pd/ps() and checking if the result == 15 or 255 (respectively).
                m0 = MIN_OP(result, m0, LOADU(pIn256));
                m1 = MIN_OP(result, m1, LOADU(pIn256 + 1));
                m2 = MIN_OP(result, m2, LOADU(pIn256 + 2));
@@ -1057,54 +1254,22 @@
            m2 = MIN_OP(result, m2, m3);
            m4 = MIN_OP(result, m4, m5);
            m6 = MIN_OP(result, m6, m7);
+            m0 = MIN_OP(result, m0, m2);
            m4 = MIN_OP(result, m4, m6);
+            m0 = MIN_OP(result, m0, m4);
-            if (false)
-            {
-                // Older path
-                // Write 256 bits into memory
-                __m256i temp;
-                _mm256_storeu_si256(&temp, *(__m256i *)&m0);
-                T * tempT = (T *)&temp;
-                result = tempT[0];
-
-                // printf("sofar minop 0 chunk: %lld %lf %lf %lld\n", chunkSize,
-                // pstScatterGatherFunc->resultOut, (double)result,
-                // pstScatterGatherFunc->resultOutInt64);
-                for (int i = 1; i < perReg; i++)
-                {
-                    result = MINF(result, tempT[i]);
-                }
-            }
-            else
+            // Write 256 bits into memory
+            // PERF: Replace this memory spill with a vector reduction (e.g. using unpackhi/unpacklo)
+            __m256i temp;
+            _mm256_storeu_si256(&temp, *(__m256i *)&m0);
+            T * tempT = (T *)&temp;
+
+            // printf("sofar minop 0 chunk: %lld %lf %lf %lld\n", chunkSize,
+            for (int i = 0; i < perReg; i++)
            {
-                // go from 256bit to 128bit to 64bit
-                // split the single 256bit register into two 128bit registers
-                U128 ym0 = CAST_256_TO_128_LO(m0);
-                U128 ym1 = CAST_256_TO_128_HI(m0);
-                __m128i temp;
-
-                // move the min to ym0
-                ym0 = MIN_OP128(result, ym0, ym1);
-
-                // move the min to lower half (64 bits)
-                ym1 = CAST_TO128ME(ym1, _mm_shuffle_pd(CAST_TO128d(ym0), CAST_TO128d(ym1), 1));
-                ym0 = MIN_OP128(result, ym0, ym1);
-
-                // Write 128 bits into memory (although only need 64bits)
-                _mm_storeu_si128(&temp, CAST_TO128i(ym0));
-                T * tempT = (T *)&temp;
-                result = tempT[0];
-
-                // printf("sofar minop 0 chunk: %lld %lf %lf %lld\n", chunkSize,
-                // pstScatterGatherFunc->resultOut, (double)result,
-                // pstScatterGatherFunc->resultOutInt64);
-                for (int i = 0; i < (perReg / 4); i++)
-                {
-                    result = MINF(result, tempT[i]);
+                result = min_with_nan_passthru(result, tempT[i]);
            }
 
            // update pIn to last location we read
@@ -1115,7 +1280,7 @@
        while (pIn < pEnd)
        {
            // get the minimum
-            result = MINF(result, *pIn);
+            result = min_with_nan_passthru(result, *pIn);
            pIn++;
        }
 
@@ -1132,203 +1297,9 @@
        {
            // printf("minop !0 %lf %lf\n", pstScatterGatherFunc->resultOut,
            // (double)result);
-            pstScatterGatherFunc->resultOut = MINF(pstScatterGatherFunc->resultOut, (double)result);
+            pstScatterGatherFunc->resultOut = min_with_nan_passthru(pstScatterGatherFunc->resultOut, (double)result);
            T previous = (T)(pstScatterGatherFunc->resultOutInt64);
-            pstScatterGatherFunc->resultOutInt64 = (int64_t)(MINF(previous, result));
-        }
-        pstScatterGatherFunc->lenOut += len;
-        return pstScatterGatherFunc->resultOut;
-    }
-
-    //--------------------------------------------------------------------------------------------
-    // This routine only support floats
-    template <typename T, typename U256, typename U128>
-    static double avx2_nan_aware(void * pDataIn, int64_t len, stScatterGatherFunc * pstScatterGatherFunc)
-    {
-        T * pIn = (T *)pDataIn;
-        T * pEnd;
-
-        // Always set first item
-        T result = *pIn;
-
-        // For float32 this is 64 /8  since sizeof(float) = 4
-        // For float64 this is 32/ 4
-        const int64_t chunkSize = (sizeof(U256) * 8) / sizeof(T);
-        const int64_t perReg = sizeof(U256) / sizeof(T);
-
-        // printf("hit this %lld %lld\n", len, chunkSize);
-        if (len >= chunkSize)
-        {
-            pEnd = &pIn[chunkSize * (len / chunkSize)];
-
-            U256 * pIn256 = (U256 *)pIn;
-            U256 * pEnd256 = (U256 *)pEnd;
-
-            // Use 256 bit registers which hold 8 Ts
-            U256 m0 = LOADU(pIn256++);
-            U256 m1 = LOADU(pIn256++);
-
-            // Stagger the loads, while m1 loads, should be able to calculate mnan
-            U256 mnan = _mm256_cmp_ps(m0, m0, _CMP_EQ_OQ);
-            U256 m2 = LOADU(pIn256++);
-            mnan = _mm256_and_ps(mnan, _mm256_cmp_ps(m1, m1, _CMP_EQ_OQ));
-            U256 m3 = LOADU(pIn256++);
-            mnan = _mm256_and_ps(mnan, _mm256_cmp_ps(m2, m2, _CMP_EQ_OQ));
-            U256 m4 = LOADU(pIn256++);
-            mnan = _mm256_and_ps(mnan, _mm256_cmp_ps(m3, m3, _CMP_EQ_OQ));
-            U256 m5 = LOADU(pIn256++);
-            mnan = _mm256_and_ps(mnan, _mm256_cmp_ps(m4, m4, _CMP_EQ_OQ));
-            U256 m6 = LOADU(pIn256++);
-            mnan = _mm256_and_ps(mnan, _mm256_cmp_ps(m5, m5, _CMP_EQ_OQ));
-            U256 m7 = LOADU(pIn256++);
-            mnan = _mm256_and_ps(mnan, _mm256_cmp_ps(m6, m6, _CMP_EQ_OQ));
-            mnan = _mm256_and_ps(mnan, _mm256_cmp_ps(m7, m7, _CMP_EQ_OQ));
-
-            while (pIn256 < pEnd256)
-            {
-                U256 m10 = LOADU(pIn256++);
-                m0 = MIN_OP(result, m0, m10);
-                mnan = _mm256_and_ps(mnan, _mm256_cmp_ps(m10, m10, _CMP_EQ_OQ));
-                U256 m11 = LOADU(pIn256++);
-                m1 = MIN_OP(result, m1, m11);
-                mnan = _mm256_and_ps(mnan, _mm256_cmp_ps(m11, m11, _CMP_EQ_OQ));
-
-                m10 = LOADU(pIn256++);
-                m2 = MIN_OP(result, m2, m10);
-                mnan = _mm256_and_ps(mnan, _mm256_cmp_ps(m10, m10, _CMP_EQ_OQ));
-                m11 = LOADU(pIn256++);
-                m3 = MIN_OP(result, m3, m11);
-                mnan = _mm256_and_ps(mnan, _mm256_cmp_ps(m11, m11, _CMP_EQ_OQ));
-
-                m10 = LOADU(pIn256++);
-                m4 = MIN_OP(result, m4, m10);
-                mnan = _mm256_and_ps(mnan, _mm256_cmp_ps(m10, m10, _CMP_EQ_OQ));
-                m11 = LOADU(pIn256++);
-                m5 = MIN_OP(result, m5, m11);
-                mnan = _mm256_and_ps(mnan, _mm256_cmp_ps(m11, m11, _CMP_EQ_OQ));
-
-                m10 = LOADU(pIn256++);
-                m6 = MIN_OP(result, m6, m10);
-                mnan = _mm256_and_ps(mnan, _mm256_cmp_ps(m10, m10, _CMP_EQ_OQ));
-                m11 = LOADU(pIn256++);
-                m7 = MIN_OP(result, m7, m11);
-                mnan = _mm256_and_ps(mnan, _mm256_cmp_ps(m11, m11, _CMP_EQ_OQ));
-            }
-
-            // TODO: Check mnan for nans
-            // NOTE: a nan check can go here for floating point and short circuit
-            if (_mm256_movemask_ps(mnan) == 255)
-            {
-                // Wind this calculation down from 8 256 bit registers to the data T
-                // MIN_OP
-                m0 = MIN_OP(result, m0, m1);
-                m2 = MIN_OP(result, m2, m3);
-                m4 = MIN_OP(result, m4, m5);
-                m6 = MIN_OP(result, m6, m7);
-                m0 = MIN_OP(result, m0, m2);
-                m4 = MIN_OP(result, m4, m6);
-                m0 = MIN_OP(result, m0, m4);
-
-                if (false)
-                {
-                    // Older path
-                    // Write 256 bits into memory
-                    __m256i temp;
-                    //_mm256_movedup_pd
-                    //_mm256_shuffle_pd(y0, y0, 0);
-                    _mm256_storeu_si256(&temp, *(__m256i *)&m0);
-                    T * tempT = (T *)&temp;
-                    result = tempT[0];
-
-                    // printf("sofar minop 0 chunk: %lld %lf %lf %lld\n", chunkSize,
-                    // pstScatterGatherFunc->resultOut, (double)result,
-                    // pstScatterGatherFunc->resultOutInt64);
-                    for (int i = 1; i < perReg; i++)
-                    {
-                        result = MINF(result, tempT[i]);
-                    }
-                }
-                else
-                {
-                    // go from 256bit to 128bit to 64bit
-                    // split the single 256bit register into two 128bit registers
-                    U128 ym0 = CAST_256_TO_128_LO(m0);
-                    U128 ym1 = CAST_256_TO_128_HI(m0);
-                    __m128i temp;
-
-                    // move the min to ym0
-                    ym0 = MIN_OP128(result, ym0, ym1);
-
-                    // move the min to lower half (64 bits)
-                    ym1 = CAST_TO128ME(ym1, _mm_shuffle_pd(CAST_TO128d(ym0), CAST_TO128d(ym1), 1));
-                    ym0 = MIN_OP128(result, ym0, ym1);
-
-                    // Write 128 bits into memory (although only need 64bits)
-                    _mm_storeu_si128(&temp, CAST_TO128i(ym0));
-                    T * tempT = (T *)&temp;
-                    result = tempT[0];
-
-                    // printf("sofar minop 0 chunk: %lld %lf %lf %lld\n", chunkSize,
-                    // pstScatterGatherFunc->resultOut, (double)result,
-                    // pstScatterGatherFunc->resultOutInt64);
-                    for (int i = 1; i < (perReg / 4); i++)
-                    {
-                        result = MINF(result, tempT[i]);
-                    }
-                }
-
-                // update pIn to last location we read
-                pIn = (T *)pIn256;
-            }
-            else
-            {
-                // this should be cached somewhere
-                result = std::numeric_limits<T>::quiet_NaN();
-            }
-        }
-
-        pEnd = &pIn[len & (chunkSize - 1)];
-        while (pIn < pEnd)
-        {
-            // get the minimum
-            // TODO: Check for nans also
-            T value = *pIn++;
-            if (value == value)
-            {
-                result = MINF(result, value);
-            }
-            else
-            {
-                // this should be cached somewhere
-                result = std::numeric_limits<T>::quiet_NaN();
-                break;
-            }
-        }
-
-        // Check for previous scattering. If we are the first one
-        if (pstScatterGatherFunc->lenOut == 0)
-        {
-            pstScatterGatherFunc->resultOut = (double)result;
-            pstScatterGatherFunc->resultOutInt64 = (int64_t)result;
-            // printf("minop 0 chunk: %lld %lf %lf %lld\n", chunkSize,
-            // pstScatterGatherFunc->resultOut, (double)result,
-            // pstScatterGatherFunc->resultOutInt64);
-        }
-        else
-        {
-            // printf("minop !0 %lf %lf\n", pstScatterGatherFunc->resultOut,
-            // (double)result);
-            if (result == result)
-            {
-                pstScatterGatherFunc->resultOut = MINF(pstScatterGatherFunc->resultOut, (double)result);
-                T previous = (T)(pstScatterGatherFunc->resultOutInt64);
-                pstScatterGatherFunc->resultOutInt64 = (int64_t)(MINF(previous, result));
-            }
-            else
-            {
-                // we know this if a float
-                pstScatterGatherFunc->resultOut = result;
-            }
+            pstScatterGatherFunc->resultOutInt64 = (int64_t)(min_with_nan_passthru(previous, result));
        }
        pstScatterGatherFunc->lenOut += len;
        return pstScatterGatherFunc->resultOut;
    }
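The TODO comments in both unrolled loops suggest detecting NaNs inside the loop and short-circuiting via a movemask; a minimal sketch of such a check for the float case is below (an illustrative sketch only, not part of the patch; any_lane_nan is a hypothetical name):

#include <immintrin.h>

// Returns true when any of the 8 float lanes holds a NaN. A lane compares
// unordered with itself only when it is NaN, so the movemask is non-zero
// exactly when at least one NaN is present.
static inline bool any_lane_nan(__m256 v)
{
    const __m256 unordered = _mm256_cmp_ps(v, v, _CMP_UNORD_Q);
    return _mm256_movemask_ps(unordered) != 0;
}

// Inside the unrolled loop, a periodic check such as
//     if (any_lane_nan(m0)) { result = std::numeric_limits<float>::quiet_NaN(); break; }
// would let the reduction stop early once the result is known to be NaN.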
@@ -1337,47 +1308,36 @@
 public:
    // Given the numpy dtype number of the input array, returns a pointer to the
    // reduction function specialized for that array type.
-    template <bool is_nan_aware>
-    static ANY_SCATTER_GATHER_FUNC GetScatterGatherFuncPtr(const NPY_TYPES inputType)
+    static ANY_SCATTER_GATHER_FUNC GetScatterGatherFuncPtr(const NPY_TYPES inputType, const bool recognize_integer_invalids)
    {
        switch (inputType)
        {
        case NPY_FLOAT:
-            // For testing/development purposes, we have a "nan-aware" (i.e.
-            // nan-propagating) vectorized implementation for floats. Use the template
-            // parameter to select that version if specified.
-            if /*constexpr*/ (is_nan_aware)
-            {
-                return avx2_nan_aware<float, __m256, __m128>;
-            }
-            else
-            {
-                return avx2<float, __m256, __m128>;
-            }
-
+            return avx2<float, false>;
        case NPY_DOUBLE:
-            return avx2<double, __m256d, __m128d>;
+            return avx2<double, false>;
        case NPY_LONGDOUBLE:
-            return non_vector<long double>;
+            return non_vector<long double, false>;
        case NPY_BOOL:
+            // TODO: The special case of bool (or bool as a view of int8/uint8) should use
+            //       a vectorized all() kernel instead, because it doesn't have an invalid value
+            //       and it can short-circuit.
        case NPY_INT8:
-            return avx2<int8_t, __m256i, __m128i>;
+            return recognize_integer_invalids ? non_vector<int8_t, true> : avx2<int8_t, false>;
        case NPY_INT16:
-            return avx2<int16_t, __m256i, __m128i>;
+            return recognize_integer_invalids ? non_vector<int16_t, true> : avx2<int16_t, false>;
        CASE_NPY_INT32:
-            return avx2<int32_t, __m256i, __m128i>;
+            return recognize_integer_invalids ? non_vector<int32_t, true> : avx2<int32_t, false>;
        CASE_NPY_INT64:
-
-            return non_vector<int64_t>;
+            return recognize_integer_invalids ? non_vector<int64_t, true> : non_vector<int64_t, false>;
        case NPY_UINT8:
-            return non_vector<uint8_t>;
+            return recognize_integer_invalids ? non_vector<uint8_t, true> : non_vector<uint8_t, false>;
        case NPY_UINT16:
-            return non_vector<uint16_t>;
+            return recognize_integer_invalids ? non_vector<uint16_t, true> : non_vector<uint16_t, false>;
        CASE_NPY_UINT32:
-            return avx2<uint32_t, __m256i, __m128i>;
+            return recognize_integer_invalids ? non_vector<uint32_t, true> : avx2<uint32_t, false>;
        CASE_NPY_UINT64:
-
-            return non_vector<uint64_t>;
+            return recognize_integer_invalids ? non_vector<uint64_t, true> : non_vector<uint64_t, false>;
        default:
            return nullptr;
@@ -1392,7 +1352,7 @@ class ReduceNanMin final
 {
    // Simple, non-vectorized implementation of the nanmin (fmin) reduction
    // operation.
-    template <typename T>
+    template <typename T, bool recognize_integer_invalids>
    static double non_vector(void * pDataIn, int64_t len, stScatterGatherFunc * pstScatterGatherFunc)
    {
        T * pIn = (T *)pDataIn;
@@ -1402,15 +1362,22 @@
        // so it'll be overwritten by any non-NaN value encountered below.
        T result = invalid_for_type<T>::value;
 
+        // Indicates when T is an integral type AND we're *NOT* recognizing / doing anything special
+        // for integer invalid values.
+        static constexpr bool skip_invalid_check = std::is_integral<T>::value && !recognize_integer_invalids;
+
        // Search for first non Nan
-        while (pIn < pEnd)
+        if constexpr (!skip_invalid_check)
        {
-            // break on first non nan
-            if (invalid_for_type<T>::is_valid(*pIn))
+            while (pIn < pEnd)
            {
-                break;
+                // break on first non nan
+                if (invalid_for_type<T>::is_valid(*pIn))
+                {
+                    break;
+                }
+                pIn++;
            }
-            pIn++;
        }
 
        if (pIn < pEnd)
@@ -1421,7 +1388,7 @@
            {
                // get the minimum
                // include inf because of range function in cut/qcut
-                if (invalid_for_type<T>::is_valid(*pIn))
+                if (skip_invalid_check || invalid_for_type<T>::is_valid(*pIn))
                {
                    result = (std::min)(result, *pIn);
                }
@@ -1435,7 +1402,7 @@
        // stScatterGatherFunc value used to accumulate any intermediate results
        // will retain whatever values it was initialized with, which makes it
        // easier to detect and handle the case of an all-NaN array.
-        if (invalid_for_type<T>::is_valid(result))
+        if (skip_invalid_check || invalid_for_type<T>::is_valid(result))
        {
            // Is there a previous intermediate result we need to combine with?
            // If not, this chunk of the array was the first result so we can just
@@ -1465,23 +1432,21 @@
        else
        {
            // This chunk of the array contained all NaNs.
-            return std::numeric_limits<T>::quiet_NaN();
+            return invalid_for_type<T>::value;
        }
    }
 
    // AVX2-based implementation of the nanmin (fmin) reduction operation.
-    template <typename T>
+    template <typename T, bool recognize_integer_invalids>
    static double avx2(void * pDataIn, int64_t len, stScatterGatherFunc * pstScatterGatherFunc)
    {
-        // TEMP: Call the non-vectorized version of this operation until a
-        // vectorized version can be implemented.
-        return non_vector<T>(pDataIn, len, pstScatterGatherFunc);
-    }
+        static_assert(
+            !std::is_integral<T>::value,
+            "Integers not yet supported in vectorized (AVX2) mode."
+        );
+
+        typedef typename riptide::simd::avx2::vec256<T>::reg_type U256;
 
-    //--------------------------------------------------------------------------------------------
-    template <typename T, typename U256>
-    static double avx2(void * pDataIn, int64_t len, stScatterGatherFunc * pstScatterGatherFunc)
-    {
        T * pIn = (T *)pDataIn;
        T * pEnd;
@@ -1533,8 +1498,10 @@
            m2 = FMIN_OP(result, m2, m3);
            m4 = FMIN_OP(result, m4, m5);
            m6 = FMIN_OP(result, m6, m7);
+            m0 = FMIN_OP(result, m0, m2);
            m4 = FMIN_OP(result, m4, m6);
+            m0 = FMIN_OP(result, m0, m4);
 
            // Write 256 bits into memory
@@ -1544,13 +1511,12 @@
            _mm256_storeu_si256(&temp, *(__m256i *)&m0);
            T * tempT = (T *)&temp;
 
-            result = tempT[0];
 
            // printf("sofar minop 0 chunk: %lld %lf %lf %lld\n", chunkSize,
            // pstScatterGatherFunc->resultOut, (double)result,
            // pstScatterGatherFunc->resultOutInt64);
-            for (int i = 1; i < perReg; i++)
+            for (int i = 0; i < perReg; i++)
            {
                const T current = tempT[i];
                if (invalid_for_type<T>::is_valid(current))
@@ -1610,47 +1576,50 @@
        else
        {
            // This chunk of the array contained all NaNs.
-            return std::numeric_limits<T>::quiet_NaN();
+            return invalid_for_type<T>::value;
        }
    }
 
 public:
    // Given the numpy dtype number of the input array, returns a pointer to the
    // reduction function specialized for that array type.
-    static ANY_SCATTER_GATHER_FUNC GetScatterGatherFuncPtr(const NPY_TYPES inputType)
+    static ANY_SCATTER_GATHER_FUNC GetScatterGatherFuncPtr(const NPY_TYPES inputType, const bool recognize_integer_invalids)
    {
+        // TODO: Enable avx2 implementation once it's been tested.
+        //       Initially, do this for `float` and `double`; the FMIN_OP function
+        //       needs to have overloads implemented for integer types before the
+        //       AVX2 implementation can be enabled for integral types.
+
        switch (inputType)
        {
-        // TODO: Enable avx2 implementation once it's been tested.
        case NPY_FLOAT:
-            return non_vector<float>;
+            return non_vector<float, false>;
        case NPY_DOUBLE:
-            return non_vector<double>;
+            return non_vector<double, false>;
        case NPY_LONGDOUBLE:
-            return non_vector<long double>;
+            return non_vector<long double, false>;
+
        case NPY_INT8:
-            return non_vector<int8_t>;
+            return recognize_integer_invalids ? non_vector<int8_t, true> : non_vector<int8_t, false>;
        case NPY_INT16:
-            return non_vector<int16_t>;
+            return recognize_integer_invalids ? non_vector<int16_t, true> : non_vector<int16_t, false>;
        CASE_NPY_INT32:
-            return non_vector<int32_t>;
+            return recognize_integer_invalids ? non_vector<int32_t, true> : non_vector<int32_t, false>;
        CASE_NPY_INT64:
-
-            return non_vector<int64_t>;
+            return recognize_integer_invalids ? non_vector<int64_t, true> : non_vector<int64_t, false>;
        case NPY_UINT8:
-            return non_vector<uint8_t>;
+            return recognize_integer_invalids ? non_vector<uint8_t, true> : non_vector<uint8_t, false>;
        case NPY_UINT16:
-            return non_vector<uint16_t>;
+            return recognize_integer_invalids ? non_vector<uint16_t, true> : non_vector<uint16_t, false>;
        CASE_NPY_UINT32:
-            return non_vector<uint32_t>;
+            return recognize_integer_invalids ? non_vector<uint32_t, true> : non_vector<uint32_t, false>;
        CASE_NPY_UINT64:
-
-            return non_vector<uint64_t>;
+            return recognize_integer_invalids ? non_vector<uint64_t, true> : non_vector<uint64_t, false>;
        // bools are currently handled specially; we don't consider bool to have a
        // nan/invalid value so we utilize the normal reduction operation for it.
        case NPY_BOOL:
-            return ReduceMin::GetScatterGatherFuncPtr<false>(inputType);
+            return ReduceMin::GetScatterGatherFuncPtr(inputType, recognize_integer_invalids);
 
        default:
            return nullptr;
@@ -1666,7 +1635,7 @@ class ReduceNanMax final
 {
    // Simple, non-vectorized implementation of the nanmax (fmax) reduction
    // operation.
-    template <typename T>
+    template <typename T, bool recognize_integer_invalids>
    static double non_vector(void * pDataIn, int64_t len, stScatterGatherFunc * pstScatterGatherFunc)
    {
        T * pIn = (T *)pDataIn;
@@ -1676,15 +1645,22 @@
        // so it'll be overwritten by any non-NaN value encountered below.
        T result = invalid_for_type<T>::value;
 
+        // Indicates when T is an integral type AND we're *NOT* recognizing / doing anything special
+        // for integer invalid values.
+        static constexpr bool skip_invalid_check = std::is_integral<T>::value && !recognize_integer_invalids;
+
        // Search for first non Nan
-        while (pIn < pEnd)
+        if constexpr (!skip_invalid_check)
        {
-            // break on first non nan
-            if (invalid_for_type<T>::is_valid(*pIn))
+            while (pIn < pEnd)
            {
-                break;
+                // break on first non nan
+                if (invalid_for_type<T>::is_valid(*pIn))
+                {
+                    break;
+                }
+                pIn++;
            }
-            pIn++;
        }
 
        if (pIn < pEnd)
@@ -1695,7 +1671,7 @@
            {
                // get the maximum
                // include inf because of range function in cut/qcut
-                if (invalid_for_type<T>::is_valid(*pIn))
+                if (skip_invalid_check || invalid_for_type<T>::is_valid(*pIn))
                {
                    result = (std::max)(result, *pIn);
                }
@@ -1709,7 +1685,7 @@
        // stScatterGatherFunc value used to accumulate any intermediate results
        // will retain whatever values it was initialized with, which makes it
        // easier to detect and handle the case of an all-NaN array.
-        if (invalid_for_type<T>::is_valid(result))
+        if (skip_invalid_check || invalid_for_type<T>::is_valid(result))
        {
            // Is there a previous intermediate result we need to combine with?
            // If not, this chunk of the array was the first result so we can just
@@ -1739,23 +1715,21 @@
        else
        {
            // This chunk of the array contained all NaNs.
-            return std::numeric_limits<T>::quiet_NaN();
+            return invalid_for_type<T>::value;
        }
    }
 
    // AVX2-based implementation of the nanmax (fmax) reduction operation.
-    template <typename T>
+    template <typename T, bool recognize_integer_invalids>
    static double avx2(void * pDataIn, int64_t len, stScatterGatherFunc * pstScatterGatherFunc)
    {
-        // TEMP: Call the non-vectorized version of this operation until a
-        // vectorized version can be implemented.
-        return non_vector<T>(pDataIn, len, pstScatterGatherFunc);
-    }
+        static_assert(
+            !std::is_integral<T>::value,
+            "Integers not yet supported in vectorized (AVX2) mode."
+        );
+
+        typedef typename riptide::simd::avx2::vec256<T>::reg_type U256;
 
-    //--------------------------------------------------------------------------------------------
-    template <typename T, typename U256>
-    static double avx2(void * pDataIn, int64_t len, stScatterGatherFunc * pstScatterGatherFunc)
-    {
        T * pIn = (T *)pDataIn;
        T * pEnd;
@@ -1884,47 +1858,50 @@
        else
        {
            // This chunk of the array contained all NaNs.
-            return std::numeric_limits<T>::quiet_NaN();
+            return invalid_for_type<T>::value;
        }
    }
 
 public:
    // Given the numpy dtype number of the input array, returns a pointer to the
    // reduction function specialized for that array type.
-    static ANY_SCATTER_GATHER_FUNC GetScatterGatherFuncPtr(const NPY_TYPES inputType)
+    static ANY_SCATTER_GATHER_FUNC GetScatterGatherFuncPtr(const NPY_TYPES inputType, const bool recognize_integer_invalids)
    {
+        // TODO: Enable avx2 implementation once it's been tested.
+        //       Initially, do this for `float` and `double`; the FMAX_OP function
+        //       needs to have overloads implemented for integer types before the
+        //       AVX2 implementation can be enabled for integral types.
+
        switch (inputType)
        {
-        // TODO: Enable avx2 implementation once it's been tested.
        case NPY_FLOAT:
-            return non_vector<float>;
+            return non_vector<float, false>;
        case NPY_DOUBLE:
-            return non_vector<double>;
+            return non_vector<double, false>;
        case NPY_LONGDOUBLE:
-            return non_vector<long double>;
+            return non_vector<long double, false>;
+
        case NPY_INT8:
-            return non_vector<int8_t>;
+            return recognize_integer_invalids ? non_vector<int8_t, true> : non_vector<int8_t, false>;
        case NPY_INT16:
-            return non_vector<int16_t>;
+            return recognize_integer_invalids ? non_vector<int16_t, true> : non_vector<int16_t, false>;
        CASE_NPY_INT32:
-            return non_vector<int32_t>;
+            return recognize_integer_invalids ? non_vector<int32_t, true> : non_vector<int32_t, false>;
        CASE_NPY_INT64:
-
-            return non_vector<int64_t>;
+            return recognize_integer_invalids ? non_vector<int64_t, true> : non_vector<int64_t, false>;
        case NPY_UINT8:
-            return non_vector<uint8_t>;
+            return recognize_integer_invalids ? non_vector<uint8_t, true> : non_vector<uint8_t, false>;
        case NPY_UINT16:
-            return non_vector<uint16_t>;
+            return recognize_integer_invalids ? non_vector<uint16_t, true> : non_vector<uint16_t, false>;
        CASE_NPY_UINT32:
-            return non_vector<uint32_t>;
+            return recognize_integer_invalids ? non_vector<uint32_t, true> : non_vector<uint32_t, false>;
        CASE_NPY_UINT64:
-
-            return non_vector<uint64_t>;
+            return recognize_integer_invalids ? non_vector<uint64_t, true> : non_vector<uint64_t, false>;
        // bools are currently handled specially; we don't consider bool to have a
        // nan/invalid value so we utilize the normal reduction operation for it.
        case NPY_BOOL:
-            return ReduceMax::GetScatterGatherFuncPtr(inputType);
+            return ReduceMax::GetScatterGatherFuncPtr(inputType, recognize_integer_invalids);
 
        default:
            return nullptr;
@@ -2845,16 +2822,16 @@ static ANY_SCATTER_GATHER_FUNC GetReduceFuncPtr(const NPY_TYPES inputType, const
        return ReduceNanSum::GetScatterGatherFuncPtr(inputType);
 
    case REDUCE_FUNCTIONS::REDUCE_MIN:
-        return ReduceMin::GetScatterGatherFuncPtr<false>(inputType);
+        return ReduceMin::GetScatterGatherFuncPtr(inputType, false);
    case REDUCE_FUNCTIONS::REDUCE_MIN_NANAWARE:
-        return ReduceMin::GetScatterGatherFuncPtr<true>(inputType);
+        return ReduceMin::GetScatterGatherFuncPtr(inputType, true);
    case REDUCE_FUNCTIONS::REDUCE_NANMIN:
-        return ReduceNanMin::GetScatterGatherFuncPtr(inputType);
+        return ReduceNanMin::GetScatterGatherFuncPtr(inputType, false);
 
    case REDUCE_FUNCTIONS::REDUCE_MAX:
-        return ReduceMax::GetScatterGatherFuncPtr(inputType);
+        return ReduceMax::GetScatterGatherFuncPtr(inputType, false);
    case REDUCE_FUNCTIONS::REDUCE_NANMAX:
-        return ReduceNanMax::GetScatterGatherFuncPtr(inputType);
+        return ReduceNanMax::GetScatterGatherFuncPtr(inputType, false);
 
    case REDUCE_FUNCTIONS::REDUCE_VAR:
        return ReduceVariance::GetScatterGatherFuncPtr(inputType);
diff --git a/src/simd/avx2.h b/src/simd/avx2.h
index 6281fcdf..b6215c72 100644
--- a/src/simd/avx2.h
+++ b/src/simd/avx2.h
@@ -1,4 +1,5 @@
-#pragma once
+#ifndef RIPTABLE_CPP_SIMD_AVX2_H
+#define RIPTABLE_CPP_SIMD_AVX2_H
 #include
 #include
 #include
@@ -64,20 +65,20 @@ namespace riptide
     * @return T const& The result of the operation.
     */
    template <typename T>
-    T const & min_with_nan_passthru(T const & x, T const & y)
+    static RT_FORCEINLINE T const & min_with_nan_passthru(T const & x, T const & y)
    {
        return (std::min)(x, y);
    }
 
    template <>
-    float const & min_with_nan_passthru(float const & x, float const & y)
+    RT_FORCEINLINE float const & min_with_nan_passthru(float const & x, float const & y)
    {
        const auto & blended = (x != x) ? x : y;
        return x < blended ? x : blended;
    }
 
    template <>
-    double const & min_with_nan_passthru(double const & x, double const & y)
+    RT_FORCEINLINE double const & min_with_nan_passthru(double const & x, double const & y)
    {
        const auto & blended = (x != x) ? x : y;
        return x < blended ? x : blended;
@@ -93,20 +94,20 @@ namespace riptide
     * @return T const& The result of the operation.
     */
    template <typename T>
-    T const & max_with_nan_passthru(T const & x, T const & y)
+    static RT_FORCEINLINE T const & max_with_nan_passthru(T const & x, T const & y)
    {
        return (std::max)(x, y);
    }
 
    template <>
-    float const & max_with_nan_passthru(float const & x, float const & y)
+    RT_FORCEINLINE float const & max_with_nan_passthru(float const & x, float const & y)
    {
        const auto & blended = (x != x) ? x : y;
        return x > blended ? x : blended;
    }
 
    template <>
-    double const & max_with_nan_passthru(double const & x, double const & y)
+    RT_FORCEINLINE double const & max_with_nan_passthru(double const & x, double const & y)
    {
        const auto & blended = (x != x) ? x : y;
        return x > blended ? x : blended;
@@ -5473,3 +5474,5 @@
 } // namespace avx2
 } // namespace simd
 } // namespace riptide
+
+#endif // RIPTABLE_CPP_SIMD_AVX2_H
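A quick way to see how these helpers are meant to differ from std::min is the standalone check below (illustrative only; min_passthru is a local copy of the float overload's logic rather than the header's actual function):

#include <cassert>
#include <cmath>
#include <limits>

// A NaN on either side wins, unlike std::min, which returns the other
// operand when the NaN is the first argument.
static float min_passthru(float x, float y)
{
    const float blended = (x != x) ? x : y;
    return x < blended ? x : blended;
}

int main()
{
    const float nan = std::numeric_limits<float>::quiet_NaN();
    assert(std::isnan(min_passthru(nan, 1.0f)));
    assert(std::isnan(min_passthru(1.0f, nan)));
    assert(min_passthru(1.0f, 2.0f) == 1.0f);
    return 0;
}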