diff --git a/src/Reduce.cpp b/src/Reduce.cpp
index 7bd2e3f9..3786642d 100644
--- a/src/Reduce.cpp
+++ b/src/Reduce.cpp
@@ -5,8 +5,10 @@
 #include "Convert.h"
 #include "missing_values.h"
 #include "platform_detect.h"
+#include "simd/avx2.h"
 #include
+#include
 #if defined(__clang__)
     #pragma clang diagnostic ignored "-Wmissing-braces"
@@ -76,13 +78,41 @@ static inline __m256i MIN_OP(uint32_t z, __m256i x, __m256i y)
 {
     return _mm256_min_epu32(x, y);
 }
-static inline __m256 MIN_OP(float z, __m256 x, __m256 y)
+
+static inline __m256 MIN_OP(float z, __m256 x, __m256 y)
 {
-    return _mm256_min_ps(x, y);
+    // Create a mask indicating which components of 'x', if any, are NaNs.
+    const auto unord_cmp_mask = _mm256_cmp_ps(x, x, _CMP_UNORD_Q);
+
+    // Use the blendv operation with our "x is nan" mask to create a new vector
+    // containing the components of 'y' where 'x' was _not_ a NaN, and the components
+    // of 'x' which were NaNs.
+    const auto blended_nans = _mm256_blendv_ps(y, x, unord_cmp_mask);
+
+    // Perform the min operation on 'x' and the blended vector.
+    // minps / vminps will take the component from the 2nd argument whenever one of
+    // the components (from the 1st and 2nd arguments) is NaN. Putting the blended
+    // vector we created on the r.h.s. means we'll get a NaN component in the output
+    // vector for any index (within the vector) where _either_ the left or right side was NaN.
+    return _mm256_min_ps(x, blended_nans);
 }
+
 static inline __m256d MIN_OP(double z, __m256d x, __m256d y)
 {
-    return _mm256_min_pd(x, y);
+    // Create a mask indicating which components of 'x', if any, are NaNs.
+    const auto unord_cmp_mask = _mm256_cmp_pd(x, x, _CMP_UNORD_Q);
+
+    // Use the blendv operation with our "x is nan" mask to create a new vector
+    // containing the components of 'y' where 'x' was _not_ a NaN, and the components
+    // of 'x' which were NaNs.
+    const auto blended_nans = _mm256_blendv_pd(y, x, unord_cmp_mask);
+
+    // Perform the min operation on 'x' and the blended vector.
+    // minpd / vminpd will take the component from the 2nd argument whenever one of
+    // the components (from the 1st and 2nd arguments) is NaN. Putting the blended
+    // vector we created on the r.h.s. means we'll get a NaN component in the output
+    // vector for any index (within the vector) where _either_ the left or right side was NaN.
+    return _mm256_min_pd(x, blended_nans);
 }
 
 static inline __m256i MAX_OP(int8_t z, __m256i x, __m256i y)
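For reference, the per-lane behavior these blendv-based overloads are aiming for can be written in scalar form roughly as follows (an illustrative sketch only, not part of the patch; nan_propagating_min is a hypothetical name, and the same shape applies to the max case and to the min_with_nan_passthru / max_with_nan_passthru helpers in src/simd/avx2.h):

#include <cmath>

// Plain _mm256_min_ps(x, y) returns its *second* operand whenever either lane is NaN,
// so by itself it only propagates NaNs that happen to sit in 'y'. Blending x's NaNs
// into the second operand first makes the result NaN whenever either input is NaN.
static float nan_propagating_min(float x, float y)
{
    const float blended = std::isnan(x) ? x : y; // mirrors _mm256_blendv_ps(y, x, unord_cmp_mask)
    return (x < blended) ? x : blended;          // mirrors minps: NaN compares false, so 'blended' wins
}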
@@ -109,18 +139,51 @@ static inline __m256i MAX_OP(uint32_t z, __m256i x, __m256i y)
 {
     return _mm256_max_epu32(x, y);
 }
-static inline __m256 MAX_OP(float z, __m256 x, __m256 y)
+
+static inline __m256 MAX_OP(float z, __m256 x, __m256 y)
 {
-    return _mm256_max_ps(x, y);
+    // Create a mask indicating which components of 'x', if any, are NaNs.
+    const auto unord_cmp_mask = _mm256_cmp_ps(x, x, _CMP_UNORD_Q);
+
+    // Use the blendv operation with our "x is nan" mask to create a new vector
+    // containing the components of 'y' where 'x' was _not_ a NaN, and the components
+    // of 'x' which were NaNs.
+    const auto blended_nans = _mm256_blendv_ps(y, x, unord_cmp_mask);
+
+    // Perform the max operation on 'x' and the blended vector.
+    // maxps / vmaxps will take the component from the 2nd argument whenever one of
+    // the components (from the 1st and 2nd arguments) is NaN. Putting the blended
+    // vector we created on the r.h.s. means we'll get a NaN component in the output
+    // vector for any index (within the vector) where _either_ the left or right side was NaN.
+    return _mm256_max_ps(x, blended_nans);
 }
+
+// This function implements a 'max' operation which returns a NaN when either argument (per component)
+// contains a NaN. This fixes the behavior of the _mm256_max_pd() intrinsic, which does not always propagate
+// NaNs as expected.
 static inline __m256d MAX_OP(double z, __m256d x, __m256d y)
 {
-    return _mm256_max_pd(x, y);
+    // Create a mask indicating which components of 'x', if any, are NaNs.
+    const auto unord_cmp_mask = _mm256_cmp_pd(x, x, _CMP_UNORD_Q);
+
+    // Use the blendv operation with our "x is nan" mask to create a new vector
+    // containing the components of 'y' where 'x' was _not_ a NaN, and the components
+    // of 'x' which were NaNs.
+    const auto blended_nans = _mm256_blendv_pd(y, x, unord_cmp_mask);
+
+    // Perform the max operation on 'x' and the blended vector.
+    // maxpd / vmaxpd will take the component from the 2nd argument whenever one of
+    // the components (from the 1st and 2nd arguments) is NaN. Putting the blended
+    // vector we created on the r.h.s. means we'll get a NaN component in the output
+    // vector for any index (within the vector) where _either_ the left or right side was NaN.
+    return _mm256_max_pd(x, blended_nans);
 }
+
+
 /**
  * @brief AVX psuedo-intrinsic implementing the C 'fmax' function for a vector
- * of packed flats.
+ * of packed floats.
  *
  * @param x
  * @param y
@@ -150,7 +213,7 @@
 /**
  * @brief AVX psuedo-intrinsic implementing the C 'fmin' function for a vector
- * of packed flats.
+ * of packed floats.
  *
  * @param x
  * @param y
@@ -336,6 +399,77 @@ static inline __m128i CAST_256_TO_128_HI(__m256i x)
 {
     return _mm256_extractf128_si256(x, 1);
 }
 
+// /**
+//  * @brief AVX pseudo-intrinsic implementing the C 'fmax' function for a vector of packed doubles.
+//  *
+//  * @param x
+//  * @param y
+//  * @return __m256d
+//  */
+// static inline __m256d _mm256_fmax_pd(__m256d x, __m256d y)
+// {
+//     const auto max_result = _mm256_max_pd(x, y);
+//     const auto unord_cmp_mask = _mm256_cmp_pd(max_result, max_result, _CMP_UNORD_Q);
+//     return _mm256_blendv_pd(x, y, unord_cmp_mask);
+// }
+
+
+// /**
+//  * @brief Function like @c std::min but which always propagates NaN values (for floating-point types).
+//  *
+//  * @tparam T The element type.
+//  * @param x The left element.
+//  * @param y The right element.
+//  * @return T const& The result of the operation.
+//  */
+// template <typename T>
+// static T const& min_with_nan_passthru(T const& x, T const& y)
+// {
+//     return (std::min)(x, y);
+// }
+
+// template<>
+// static float const& min_with_nan_passthru(float const& x, float const& y)
+// {
+//     const auto blended = (x != x) ? x : y;
+//     return x < blended ? x : blended;
+// }
+
+// template<>
+// static double const& min_with_nan_passthru(double const& x, double const& y)
+// {
+//     const auto blended = (x != x) ? x : y;
+//     return x < blended ? x : blended;
+// }
+
+// /**
+//  * @brief Function like @c std::max but which always propagates NaN values (for floating-point types).
+//  *
+//  * @tparam T The element type.
+//  * @param x The left element.
+//  * @param y The right element.
+//  * @return T const& The result of the operation.
+//  */
+// template <typename T>
+// static T const& max_with_nan_passthru(T const& x, T const& y)
+// {
+//     return (std::max)(x, y);
+// }
+
+// template<>
+// static float const& max_with_nan_passthru(float const& x, float const& y)
+// {
+//     const auto blended = (x != x) ? x : y;
+//     return x > blended ? x : blended;
+// }
+
+// template<>
+// static double const& max_with_nan_passthru(double const& x, double const& y)
+// {
+//     const auto blended = (x != x) ? x : y;
+//     return x > blended ? x : blended;
+// }
+
 struct stArgScatterGatherFunc
 {
     // numpy input type
@@ -781,24 +915,38 @@ class ReduceNanargmin final
 class ReduceMax final
 {
     //--------------------------------------------------------------------------------------------
-    template <typename T>
+    template <typename T, bool recognize_integer_invalids>
     static double non_vector(void * pDataIn, int64_t len, stScatterGatherFunc * pstScatterGatherFunc)
     {
-        T * pIn = (T *)pDataIn;
-        const T * const pEnd = pIn + len;
+        using namespace riptide::math;
 
-        // Always set first item
-        T result = *pIn++;
+        // TODO: Handle case where len <= 0 to avoid dereferencing an empty array below.
 
-        while (pIn < pEnd)
-        {
-            // get the maximum
-            T temp = *pIn;
-            if (temp > result)
-            {
-                result = temp;
+        const T * const pIn = (T *)pDataIn;
+
+        // Initialize the result with the first (0th) element.
+        T result = pIn[0];
+
+        // Indicates when T is an integral type AND we're *NOT* recognizing / doing anything special
+        // for integer invalid values.
+        static constexpr bool skip_invalid_check = std::is_integral<T>::value && !recognize_integer_invalids;
+
+        // get the maximum, making sure NaNs are propagated correctly.
+        if (skip_invalid_check || invalid_for_type<T>::is_valid(result)) {
+            for (int64_t i = 1; i < len; i++) {
+                const auto temp = pIn[i];
+
+                if (!skip_invalid_check && !invalid_for_type<T>::is_valid(temp)) {
+                    // We've found a NaN value. NaNs taint the result, so we set
+                    // the result value here then short-circuit since there's no
+                    // point in continuing any further.
+                    result = temp;
+                    break;
+                }
+                else if (temp > result) {
+                    result = temp;
+                }
             }
-            pIn++;
         }
 
         // Check for previous scattering. If we are the first one
@@ -812,31 +960,41 @@
             // in case of nan when calling max (instead of nanmax), preserve nans
             if (pstScatterGatherFunc->resultOut == pstScatterGatherFunc->resultOut)
            {
-                pstScatterGatherFunc->resultOut = MAXF(pstScatterGatherFunc->resultOut, (double)result);
+                pstScatterGatherFunc->resultOut = max_with_nan_passthru(pstScatterGatherFunc->resultOut, (double)result);
            }
            T previous = (T)(pstScatterGatherFunc->resultOutInt64);
-            pstScatterGatherFunc->resultOutInt64 = (int64_t)(MAXF(previous, result));
+            pstScatterGatherFunc->resultOutInt64 = (int64_t)(max_with_nan_passthru(previous, result));
        }
        pstScatterGatherFunc->lenOut += len;
        return (double)pstScatterGatherFunc->resultOutInt64;
    }
 
    //--------------------------------------------------------------------------------------------
-    template <typename T, typename U256>
+    template <typename T, bool recognize_integer_invalids>
    static double avx2(void * pDataIn, int64_t len, stScatterGatherFunc * pstScatterGatherFunc)
    {
+        using namespace riptide::math;
+
+        static_assert(
+            !std::is_integral<T>::value || !recognize_integer_invalids,
+            "Recognition of integer invalids not yet supported in vectorized (AVX2) mode."
+        );
+
+        typedef typename riptide::simd::avx2::vec256<T>::reg_type U256;
+
        T * pIn = (T *)pDataIn;
        T * pEnd;
 
        // Always set first item
        T result = *pIn;
 
+        static constexpr uint8_t unroll_factor = 8;
        // For float32 this is 64 /8  since sizeof(float) = 4
        // For float64 this is 32/ 4
        // for int16 this is 128 / 16
        // for int8 this is 256 / 32
-        static constexpr int64_t chunkSize = (sizeof(U256) * 8) / sizeof(T);
+        static constexpr int64_t chunkSize = (sizeof(U256) * unroll_factor) / sizeof(T);
        static constexpr int64_t perReg = sizeof(U256) / sizeof(T);
 
        if (len >= chunkSize)
@@ -858,6 +1016,13 @@
            while (pIn256 < pEnd256)
            {
+                // TODO: Consider integrating a check into this loop to detect if we've seen a NaN,
+                //       and if so, short-circuiting the loop -- there's no point in continuing the
+                //       loop if e.g. the first element of the array is a NaN.
+                //       We could also short-circuit for integer types if we've seen std::numeric_limits<T>::max()
+                //       or std::numeric_limits<T>::infinity() for floating-point types.
+                //       This can be done quickly with some comparisons against those values, then horizontally
+                //       reducing that mask using e.g. _mm256_movemask_pd/ps() and checking if the result == 15 or 255 (respectively).
                m0 = MAX_OP(result, m0, LOADU(pIn256));
                m1 = MAX_OP(result, m1, LOADU(pIn256 + 1));
                m2 = MAX_OP(result, m2, LOADU(pIn256 + 2));
@@ -875,20 +1040,21 @@
            m2 = MAX_OP(result, m2, m3);
            m4 = MAX_OP(result, m4, m5);
            m6 = MAX_OP(result, m6, m7);
+            m0 = MAX_OP(result, m0, m2);
            m4 = MAX_OP(result, m4, m6);
+            m0 = MAX_OP(result, m0, m4);
 
            // Write 256 bits into memory
+            // PERF: Replace this memory spill with a vector reduction (e.g. using unpackhi/unpacklo)
            __m256i temp;
            _mm256_storeu_si256(&temp, *(__m256i *)&m0);
            T * tempT = (T *)&temp;
-            result = tempT[0];
-
-            for (int i = 1; i < perReg; i++)
+            for (int i = 0; i < perReg; i++)
            {
-                result = MAXF(result, tempT[i]);
+                result = max_with_nan_passthru(result, tempT[i]);
            }
            // update pIn to last location we read
@@ -898,7 +1064,7 @@
        while (pIn < pEnd)
        {
            // get the minimum
-            result = MAXF(result, *pIn);
+            result = max_with_nan_passthru(result, *pIn);
            pIn++;
        }
@@ -910,9 +1076,9 @@
        }
        else
        {
-            pstScatterGatherFunc->resultOut = MAXF(pstScatterGatherFunc->resultOut, (double)result);
+            pstScatterGatherFunc->resultOut = max_with_nan_passthru(pstScatterGatherFunc->resultOut, (double)result);
            T previous = (T)(pstScatterGatherFunc->resultOutInt64);
-            pstScatterGatherFunc->resultOutInt64 = (int64_t)(MAXF(previous, result));
+            pstScatterGatherFunc->resultOutInt64 = (int64_t)(max_with_nan_passthru(previous, result));
        }
        pstScatterGatherFunc->lenOut += len;
        return pstScatterGatherFunc->resultOut;
    }
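The PERF note above asks for an in-register reduction in place of the 32-byte memory spill; one possible shape for the float case is sketched below (illustrative only, not part of the patch; horizontal_max_ps is a hypothetical helper, and it handles NaNs with a single up-front check so its result matches the "NaNs taint the result" convention used by MAX_OP):

#include <immintrin.h>
#include <limits>

// Reduce a __m256 of 8 floats to a scalar max without spilling to memory.
static inline float horizontal_max_ps(__m256 v)
{
    // A lane compares unordered with itself only when it holds a NaN.
    const __m256 nan_mask = _mm256_cmp_ps(v, v, _CMP_UNORD_Q);
    if (_mm256_movemask_ps(nan_mask) != 0)
    {
        return std::numeric_limits<float>::quiet_NaN();
    }

    // No NaNs present, so the plain max intrinsics are safe to use.
    __m128 m = _mm_max_ps(_mm256_castps256_ps128(v), _mm256_extractf128_ps(v, 1)); // 8 -> 4
    m = _mm_max_ps(m, _mm_movehl_ps(m, m));                                        // 4 -> 2
    m = _mm_max_ss(m, _mm_shuffle_ps(m, m, 0x55));                                 // 2 -> 1
    return _mm_cvtss_f32(m);
}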
@@ -921,35 +1087,36 @@
 public:
    // Given the numpy dtype number of the input array, returns a pointer to the
    // reduction function specialized for that array type.
-    static ANY_SCATTER_GATHER_FUNC GetScatterGatherFuncPtr(const NPY_TYPES inputType)
+    static ANY_SCATTER_GATHER_FUNC GetScatterGatherFuncPtr(const NPY_TYPES inputType, const bool recognize_integer_invalids)
    {
        switch (inputType)
        {
        case NPY_FLOAT:
-            return avx2<float, __m256>;
+            return avx2<float, false>;
        case NPY_DOUBLE:
-            return avx2<double, __m256d>;
+            return avx2<double, false>;
        case NPY_LONGDOUBLE:
-            return non_vector<long double>;
+            return non_vector<long double, false>;
        case NPY_BOOL:
+            // TODO: The special case of bool (or bool as a view of int8/uint8) should use
+            //       a vectorized any() kernel instead, because it doesn't have an invalid value
+            //       and it can short-circuit.
        case NPY_INT8:
-            return avx2<int8_t, __m256i>;
+            return recognize_integer_invalids ? non_vector<int8_t, true> : avx2<int8_t, false>;
        case NPY_INT16:
-            return avx2<int16_t, __m256i>;
+            return recognize_integer_invalids ? non_vector<int16_t, true> : avx2<int16_t, false>;
        CASE_NPY_INT32:
-            return avx2<int32_t, __m256i>;
+            return recognize_integer_invalids ? non_vector<int32_t, true> : avx2<int32_t, false>;
        CASE_NPY_INT64:
-
-            return non_vector<int64_t>;
+            return recognize_integer_invalids ? non_vector<int64_t, true> : non_vector<int64_t, false>;
        case NPY_UINT8:
-            return non_vector<uint8_t>;
+            return recognize_integer_invalids ? non_vector<uint8_t, true> : non_vector<uint8_t, false>;
        case NPY_UINT16:
-            return non_vector<uint16_t>;
+            return recognize_integer_invalids ? non_vector<uint16_t, true> : non_vector<uint16_t, false>;
        CASE_NPY_UINT32:
-            return avx2<uint32_t, __m256i>;
+            return recognize_integer_invalids ? non_vector<uint32_t, true> : avx2<uint32_t, false>;
        CASE_NPY_UINT64:
-
-            return non_vector<uint64_t>;
+            return recognize_integer_invalids ? non_vector<uint64_t, true> : non_vector<uint64_t, false>;
        default:
            return nullptr;
        }
@@ -963,24 +1130,38 @@ class ReduceMin final
 class ReduceMin final
 {
    //--------------------------------------------------------------------------------------------
-    template <typename T>
+    template <typename T, bool recognize_integer_invalids>
    static double non_vector(void * pDataIn, int64_t len, stScatterGatherFunc * pstScatterGatherFunc)
    {
-        T * pIn = (T *)pDataIn;
-        const T * const pEnd = pIn + len;
+        using namespace riptide::math;
 
-        // Always set first item
-        T result = *pIn++;
+        // TODO: Handle case where len <= 0 to avoid dereferencing an empty array below.
 
-        while (pIn < pEnd)
-        {
-            // get the minimum
-            T temp = *pIn;
-            if (temp < result)
-            {
-                result = temp;
+        const T * const pIn = (T *)pDataIn;
+
+        // Initialize the result with the first (0th) element.
+        T result = pIn[0];
+
+        // Indicates when T is an integral type AND we're *NOT* recognizing / doing anything special
+        // for integer invalid values.
+        static constexpr bool skip_invalid_check = std::is_integral<T>::value && !recognize_integer_invalids;
+
+        // get the minimum, making sure NaNs are propagated correctly.
+        if (skip_invalid_check || invalid_for_type<T>::is_valid(result)) {
+            for (int64_t i = 1; i < len; i++) {
+                const auto temp = pIn[i];
+
+                if (!skip_invalid_check && !invalid_for_type<T>::is_valid(temp)) {
+                    // We've found a NaN value. NaNs taint the result, so we set
+                    // the result value here then short-circuit since there's no
+                    // point in continuing any further.
+                    result = temp;
+                    break;
+                }
+                else if (temp < result) {
+                    result = temp;
+                }
            }
-            pIn++;
        }
 
        // Check for previous scattering. If we are the first one
@@ -994,19 +1175,28 @@
            // in case of nan when calling min (instead of nanmin), preserve nans
            if (pstScatterGatherFunc->resultOut == pstScatterGatherFunc->resultOut)
            {
-                pstScatterGatherFunc->resultOut = MINF(pstScatterGatherFunc->resultOut, (double)result);
+                pstScatterGatherFunc->resultOut = min_with_nan_passthru(pstScatterGatherFunc->resultOut, (double)result);
            }
            T previous = (T)(pstScatterGatherFunc->resultOutInt64);
-            pstScatterGatherFunc->resultOutInt64 = (int64_t)(MINF(previous, result));
+            pstScatterGatherFunc->resultOutInt64 = (int64_t)(min_with_nan_passthru(previous, result));
        }
        pstScatterGatherFunc->lenOut += len;
        return (double)pstScatterGatherFunc->resultOutInt64;
    }
 
    //--------------------------------------------------------------------------------------------
-    template <typename T, typename U256, typename U128>
+    template <typename T, bool recognize_integer_invalids>
    static double avx2(void * pDataIn, int64_t len, stScatterGatherFunc * pstScatterGatherFunc)
    {
+        using namespace riptide::math;
+
+        static_assert(
+            !std::is_integral<T>::value || !recognize_integer_invalids,
+            "Recognition of integer invalids not yet supported in vectorized (AVX2) mode."
+        );
+
+        typedef typename riptide::simd::avx2::vec256<T>::reg_type U256;
+
        T * pIn = (T *)pDataIn;
        T * pEnd;
@@ -1017,8 +1207,8 @@
        // For float64 this is 32/ 4
        // for int16 this is 128 / 16
        // for int8 this is 256 / 32
-        const int64_t chunkSize = (sizeof(U256) * 8) / sizeof(T);
-        const int64_t perReg = sizeof(U256) / sizeof(T);
+        static constexpr int64_t chunkSize = (sizeof(U256) * 8) / sizeof(T);
+        static constexpr int64_t perReg = sizeof(U256) / sizeof(T);
 
        // printf("hit this %lld %lld\n", len, chunkSize);
        if (len >= chunkSize)
@@ -1040,6 +1230,13 @@
            while (pIn256 < pEnd256)
            {
+                // TODO: Consider integrating a check into this loop to detect if we've seen a NaN,
+                //       and if so, short-circuiting the loop -- there's no point in continuing the
+                //       loop if e.g. the first element of the array is a NaN.
+                //       We could also short-circuit for integer types if we've seen std::numeric_limits<T>::min()
+                //       or -std::numeric_limits<T>::infinity() for floating-point types.
+                //       This can be done quickly with some comparisons against those values, then horizontally
+                //       reducing that mask using e.g. _mm256_movemask_pd/ps() and checking if the result == 15 or 255 (respectively).
                m0 = MIN_OP(result, m0, LOADU(pIn256));
                m1 = MIN_OP(result, m1, LOADU(pIn256 + 1));
                m2 = MIN_OP(result, m2, LOADU(pIn256 + 2));
@@ -1057,54 +1254,22 @@
            m2 = MIN_OP(result, m2, m3);
            m4 = MIN_OP(result, m4, m5);
            m6 = MIN_OP(result, m6, m7);
+            m0 = MIN_OP(result, m0, m2);
            m4 = MIN_OP(result, m4, m6);
+            m0 = MIN_OP(result, m0, m4);
-            if (false)
-            {
-                // Older path
-                // Write 256 bits into memory
-                __m256i temp;
-                _mm256_storeu_si256(&temp, *(__m256i *)&m0);
-                T * tempT = (T *)&temp;
-                result = tempT[0];
-
-                // printf("sofar minop 0 chunk: %lld %lf %lf %lld\n", chunkSize,
-                // pstScatterGatherFunc->resultOut, (double)result,
-                // pstScatterGatherFunc->resultOutInt64);
-                for (int i = 1; i < perReg; i++)
-                {
-                    result = MINF(result, tempT[i]);
-                }
-            }
-            else
+            // Write 256 bits into memory
+            // PERF: Replace this memory spill with a vector reduction (e.g. using unpackhi/unpacklo)
+            __m256i temp;
+            _mm256_storeu_si256(&temp, *(__m256i *)&m0);
+            T * tempT = (T *)&temp;
+
+            // printf("sofar minop 0 chunk: %lld %lf %lf %lld\n", chunkSize,
+            for (int i = 0; i < perReg; i++)
            {
-                // go from 256bit to 128bit to 64bit
-                // split the single 256bit register into two 128bit registers
-                U128 ym0 = CAST_256_TO_128_LO(m0);
-                U128 ym1 = CAST_256_TO_128_HI(m0);
-                __m128i temp;
-
-                // move the min to ym0
-                ym0 = MIN_OP128(result, ym0, ym1);
-
-                // move the min to lower half (64 bits)
-                ym1 = CAST_TO128ME(ym1, _mm_shuffle_pd(CAST_TO128d(ym0), CAST_TO128d(ym1), 1));
-                ym0 = MIN_OP128(result, ym0, ym1);
-
-                // Write 128 bits into memory (although only need 64bits)
-                _mm_storeu_si128(&temp, CAST_TO128i(ym0));
-                T * tempT = (T *)&temp;
-                result = tempT[0];
-
-                // printf("sofar minop 0 chunk: %lld %lf %lf %lld\n", chunkSize,
-                // pstScatterGatherFunc->resultOut, (double)result,
-                // pstScatterGatherFunc->resultOutInt64);
-                for (int i = 0; i < (perReg / 4); i++)
-                {
-                    result = MINF(result, tempT[i]);
+                result = min_with_nan_passthru(result, tempT[i]);
            }
 
            // update pIn to last location we read
@@ -1115,7 +1280,7 @@
        while (pIn < pEnd)
        {
            // get the minimum
-            result = MINF(result, *pIn);
+            result = min_with_nan_passthru(result, *pIn);
            pIn++;
        }
 
@@ -1132,203 +1297,9 @@
        {
            // printf("minop !0 %lf %lf\n", pstScatterGatherFunc->resultOut,
            // (double)result);
-            pstScatterGatherFunc->resultOut = MINF(pstScatterGatherFunc->resultOut, (double)result);
+            pstScatterGatherFunc->resultOut = min_with_nan_passthru(pstScatterGatherFunc->resultOut, (double)result);
            T previous = (T)(pstScatterGatherFunc->resultOutInt64);
-            pstScatterGatherFunc->resultOutInt64 = (int64_t)(MINF(previous, result));
-        }
-        pstScatterGatherFunc->lenOut += len;
-        return pstScatterGatherFunc->resultOut;
-    }
-
-    //--------------------------------------------------------------------------------------------
-    // This routine only support floats
-    template <typename T, typename U256, typename U128>
-    static double avx2_nan_aware(void * pDataIn, int64_t len, stScatterGatherFunc * pstScatterGatherFunc)
-    {
-        T * pIn = (T *)pDataIn;
-        T * pEnd;
-
-        // Always set first item
-        T result = *pIn;
-
-        // For float32 this is 64 /8  since sizeof(float) = 4
-        // For float64 this is 32/ 4
-        const int64_t chunkSize = (sizeof(U256) * 8) / sizeof(T);
-        const int64_t perReg = sizeof(U256) / sizeof(T);
-
-        // printf("hit this %lld %lld\n", len, chunkSize);
-        if (len >= chunkSize)
-        {
-            pEnd = &pIn[chunkSize * (len / chunkSize)];
-
-            U256 * pIn256 = (U256 *)pIn;
-            U256 * pEnd256 = (U256 *)pEnd;
-
-            // Use 256 bit registers which hold 8 Ts
-            U256 m0 = LOADU(pIn256++);
-            U256 m1 = LOADU(pIn256++);
-
-            // Stagger the loads, while m1 loads, should be able to calculate mnan
-            U256 mnan = _mm256_cmp_ps(m0, m0, _CMP_EQ_OQ);
-            U256 m2 = LOADU(pIn256++);
-            mnan = _mm256_and_ps(mnan, _mm256_cmp_ps(m1, m1, _CMP_EQ_OQ));
-            U256 m3 = LOADU(pIn256++);
-            mnan = _mm256_and_ps(mnan, _mm256_cmp_ps(m2, m2, _CMP_EQ_OQ));
-            U256 m4 = LOADU(pIn256++);
-            mnan = _mm256_and_ps(mnan, _mm256_cmp_ps(m3, m3, _CMP_EQ_OQ));
-            U256 m5 = LOADU(pIn256++);
-            mnan = _mm256_and_ps(mnan, _mm256_cmp_ps(m4, m4, _CMP_EQ_OQ));
-            U256 m6 = LOADU(pIn256++);
-            mnan = _mm256_and_ps(mnan, _mm256_cmp_ps(m5, m5, _CMP_EQ_OQ));
-            U256 m7 = LOADU(pIn256++);
-            mnan = _mm256_and_ps(mnan, _mm256_cmp_ps(m6, m6, _CMP_EQ_OQ));
-            mnan = _mm256_and_ps(mnan, _mm256_cmp_ps(m7, m7, _CMP_EQ_OQ));
-
-            while (pIn256 < pEnd256)
-            {
-                U256 m10 = LOADU(pIn256++);
-                m0 = MIN_OP(result, m0, m10);
-                mnan = _mm256_and_ps(mnan, _mm256_cmp_ps(m10, m10, _CMP_EQ_OQ));
-                U256 m11 = LOADU(pIn256++);
-                m1 = MIN_OP(result, m1, m11);
-                mnan = _mm256_and_ps(mnan, _mm256_cmp_ps(m11, m11, _CMP_EQ_OQ));
-
-                m10 = LOADU(pIn256++);
-                m2 = MIN_OP(result, m2, m10);
-                mnan = _mm256_and_ps(mnan, _mm256_cmp_ps(m10, m10, _CMP_EQ_OQ));
-                m11 = LOADU(pIn256++);
-                m3 = MIN_OP(result, m3, m11);
-                mnan = _mm256_and_ps(mnan, _mm256_cmp_ps(m11, m11, _CMP_EQ_OQ));
-
-                m10 = LOADU(pIn256++);
-                m4 = MIN_OP(result, m4, m10);
-                mnan = _mm256_and_ps(mnan, _mm256_cmp_ps(m10, m10, _CMP_EQ_OQ));
-                m11 = LOADU(pIn256++);
-                m5 = MIN_OP(result, m5, m11);
-                mnan = _mm256_and_ps(mnan, _mm256_cmp_ps(m11, m11, _CMP_EQ_OQ));
-
-                m10 = LOADU(pIn256++);
-                m6 = MIN_OP(result, m6, m10);
-                mnan = _mm256_and_ps(mnan, _mm256_cmp_ps(m10, m10, _CMP_EQ_OQ));
-                m11 = LOADU(pIn256++);
-                m7 = MIN_OP(result, m7, m11);
-                mnan = _mm256_and_ps(mnan, _mm256_cmp_ps(m11, m11, _CMP_EQ_OQ));
-            }
-
-            // TODO: Check mnan for nans
-            // NOTE: a nan check can go here for floating point and short circuit
-            if (_mm256_movemask_ps(mnan) == 255)
-            {
-                // Wind this calculation down from 8 256 bit registers to the data T
-                // MIN_OP
-                m0 = MIN_OP(result, m0, m1);
-                m2 = MIN_OP(result, m2, m3);
-                m4 = MIN_OP(result, m4, m5);
-                m6 = MIN_OP(result, m6, m7);
-                m0 = MIN_OP(result, m0, m2);
-                m4 = MIN_OP(result, m4, m6);
-                m0 = MIN_OP(result, m0, m4);
-
-                if (false)
-                {
-                    // Older path
-                    // Write 256 bits into memory
-                    __m256i temp;
-                    //_mm256_movedup_pd
-                    //_mm256_shuffle_pd(y0, y0, 0);
-                    _mm256_storeu_si256(&temp, *(__m256i *)&m0);
-                    T * tempT = (T *)&temp;
-                    result = tempT[0];
-
-                    // printf("sofar minop 0 chunk: %lld %lf %lf %lld\n", chunkSize,
-                    // pstScatterGatherFunc->resultOut, (double)result,
-                    // pstScatterGatherFunc->resultOutInt64);
-                    for (int i = 1; i < perReg; i++)
-                    {
-                        result = MINF(result, tempT[i]);
-                    }
-                }
-                else
-                {
-                    // go from 256bit to 128bit to 64bit
-                    // split the single 256bit register into two 128bit registers
-                    U128 ym0 = CAST_256_TO_128_LO(m0);
-                    U128 ym1 = CAST_256_TO_128_HI(m0);
-                    __m128i temp;
-
-                    // move the min to ym0
-                    ym0 = MIN_OP128(result, ym0, ym1);
-
-                    // move the min to lower half (64 bits)
-                    ym1 = CAST_TO128ME(ym1, _mm_shuffle_pd(CAST_TO128d(ym0), CAST_TO128d(ym1), 1));
-                    ym0 = MIN_OP128(result, ym0, ym1);
-
-                    // Write 128 bits into memory (although only need 64bits)
-                    _mm_storeu_si128(&temp, CAST_TO128i(ym0));
-                    T * tempT = (T *)&temp;
-                    result = tempT[0];
-
-                    // printf("sofar minop 0 chunk: %lld %lf %lf %lld\n", chunkSize,
-                    // pstScatterGatherFunc->resultOut, (double)result,
-                    // pstScatterGatherFunc->resultOutInt64);
-                    for (int i = 1; i < (perReg / 4); i++)
-                    {
-                        result = MINF(result, tempT[i]);
-                    }
-                }
-
-                // update pIn to last location we read
-                pIn = (T *)pIn256;
-            }
-            else
-            {
-                // this should be cached somewhere
-                result = std::numeric_limits<T>::quiet_NaN();
-            }
-        }
-
-        pEnd = &pIn[len & (chunkSize - 1)];
-        while (pIn < pEnd)
-        {
-            // get the minimum
-            // TODO: Check for nans also
-            T value = *pIn++;
-            if (value == value)
-            {
-                result = MINF(result, value);
-            }
-            else
-            {
-                // this should be cached somewhere
-                result = std::numeric_limits<T>::quiet_NaN();
-                break;
-            }
-        }
-
-        // Check for previous scattering. If we are the first one
-        if (pstScatterGatherFunc->lenOut == 0)
-        {
-            pstScatterGatherFunc->resultOut = (double)result;
-            pstScatterGatherFunc->resultOutInt64 = (int64_t)result;
-            // printf("minop 0 chunk: %lld %lf %lf %lld\n", chunkSize,
-            // pstScatterGatherFunc->resultOut, (double)result,
-            // pstScatterGatherFunc->resultOutInt64);
-        }
-        else
-        {
-            // printf("minop !0 %lf %lf\n", pstScatterGatherFunc->resultOut,
-            // (double)result);
-            if (result == result)
-            {
-                pstScatterGatherFunc->resultOut = MINF(pstScatterGatherFunc->resultOut, (double)result);
-                T previous = (T)(pstScatterGatherFunc->resultOutInt64);
-                pstScatterGatherFunc->resultOutInt64 = (int64_t)(MINF(previous, result));
-            }
-            else
-            {
-                // we know this if a float
-                pstScatterGatherFunc->resultOut = result;
-            }
+            pstScatterGatherFunc->resultOutInt64 = (int64_t)(min_with_nan_passthru(previous, result));
        }
        pstScatterGatherFunc->lenOut += len;
        return pstScatterGatherFunc->resultOut;
    }
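The TODO comments in both unrolled loops suggest detecting NaNs inside the loop and short-circuiting via a movemask; a minimal sketch of such a check for the float case is below (an illustrative sketch only, not part of the patch; any_lane_nan is a hypothetical name):

#include <immintrin.h>

// Returns true when any of the 8 float lanes holds a NaN. A lane compares
// unordered with itself only when it is NaN, so the movemask is non-zero
// exactly when at least one NaN is present.
static inline bool any_lane_nan(__m256 v)
{
    const __m256 unordered = _mm256_cmp_ps(v, v, _CMP_UNORD_Q);
    return _mm256_movemask_ps(unordered) != 0;
}

// Inside the unrolled loop, a periodic check such as
//     if (any_lane_nan(m0)) { result = std::numeric_limits<float>::quiet_NaN(); break; }
// would let the reduction stop early once the result is known to be NaN.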
@@ -1337,47 +1308,36 @@
 public:
    // Given the numpy dtype number of the input array, returns a pointer to the
    // reduction function specialized for that array type.
-    template <bool is_nan_aware>
-    static ANY_SCATTER_GATHER_FUNC GetScatterGatherFuncPtr(const NPY_TYPES inputType)
+    static ANY_SCATTER_GATHER_FUNC GetScatterGatherFuncPtr(const NPY_TYPES inputType, const bool recognize_integer_invalids)
    {
        switch (inputType)
        {
        case NPY_FLOAT:
-            // For testing/development purposes, we have a "nan-aware" (i.e.
-            // nan-propagating) vectorized implementation for floats. Use the template
-            // parameter to select that version if specified.
-            if /*constexpr*/ (is_nan_aware)
-            {
-                return avx2_nan_aware<float, __m256, __m128>;
-            }
-            else
-            {
-                return avx2<float, __m256, __m128>;
-            }
-
+            return avx2<float, false>;
        case NPY_DOUBLE:
-            return avx2<double, __m256d, __m128d>;
+            return avx2<double, false>;
        case NPY_LONGDOUBLE:
-            return non_vector<long double>;
+            return non_vector<long double, false>;
        case NPY_BOOL:
+            // TODO: The special case of bool (or bool as a view of int8/uint8) should use
+            //       a vectorized all() kernel instead, because it doesn't have an invalid value
+            //       and it can short-circuit.
        case NPY_INT8:
-            return avx2<int8_t, __m256i, __m128i>;
+            return recognize_integer_invalids ? non_vector<int8_t, true> : avx2<int8_t, false>;
        case NPY_INT16:
-            return avx2<int16_t, __m256i, __m128i>;
+            return recognize_integer_invalids ? non_vector<int16_t, true> : avx2<int16_t, false>;
        CASE_NPY_INT32:
-            return avx2<int32_t, __m256i, __m128i>;
+            return recognize_integer_invalids ? non_vector<int32_t, true> : avx2<int32_t, false>;
        CASE_NPY_INT64:
-
-            return non_vector<int64_t>;
+            return recognize_integer_invalids ? non_vector<int64_t, true> : non_vector<int64_t, false>;
        case NPY_UINT8:
-            return non_vector<uint8_t>;
+            return recognize_integer_invalids ? non_vector<uint8_t, true> : non_vector<uint8_t, false>;
        case NPY_UINT16:
-            return non_vector<uint16_t>;
+            return recognize_integer_invalids ? non_vector<uint16_t, true> : non_vector<uint16_t, false>;
        CASE_NPY_UINT32:
-            return avx2<uint32_t, __m256i, __m128i>;
+            return recognize_integer_invalids ? non_vector<uint32_t, true> : avx2<uint32_t, false>;
        CASE_NPY_UINT64:
-
-            return non_vector<uint64_t>;
+            return recognize_integer_invalids ? non_vector<uint64_t, true> : non_vector<uint64_t, false>;
        default:
            return nullptr;
@@ -1392,7 +1352,7 @@ class ReduceNanMin final
 {
    // Simple, non-vectorized implementation of the nanmin (fmin) reduction
    // operation.
-    template <typename T>
+    template <typename T, bool recognize_integer_invalids>
    static double non_vector(void * pDataIn, int64_t len, stScatterGatherFunc * pstScatterGatherFunc)
    {
        T * pIn = (T *)pDataIn;
@@ -1402,15 +1362,22 @@
        // so it'll be overwritten by any non-NaN value encountered below.
        T result = invalid_for_type<T>::value;
 
+        // Indicates when T is an integral type AND we're *NOT* recognizing / doing anything special
+        // for integer invalid values.
+        static constexpr bool skip_invalid_check = std::is_integral<T>::value && !recognize_integer_invalids;
+
        // Search for first non Nan
-        while (pIn < pEnd)
+        if constexpr (!skip_invalid_check)
        {
-            // break on first non nan
-            if (invalid_for_type<T>::is_valid(*pIn))
+            while (pIn < pEnd)
            {
-                break;
+                // break on first non nan
+                if (invalid_for_type<T>::is_valid(*pIn))
+                {
+                    break;
+                }
+                pIn++;
            }
-            pIn++;
        }
 
        if (pIn < pEnd)
@@ -1421,7 +1388,7 @@
            {
                // get the minimum
                // include inf because of range function in cut/qcut
-                if (invalid_for_type<T>::is_valid(*pIn))
+                if (skip_invalid_check || invalid_for_type<T>::is_valid(*pIn))
                {
                    result = (std::min)(result, *pIn);
                }
@@ -1435,7 +1402,7 @@
        // stScatterGatherFunc value used to accumulate any intermediate results
        // will retain whatever values it was initialized with, which makes it
        // easier to detect and handle the case of an all-NaN array.
-        if (invalid_for_type<T>::is_valid(result))
+        if (skip_invalid_check || invalid_for_type<T>::is_valid(result))
        {
            // Is there a previous intermediate result we need to combine with?
            // If not, this chunk of the array was the first result so we can just
@@ -1465,23 +1432,21 @@
        else
        {
            // This chunk of the array contained all NaNs.
-            return std::numeric_limits<T>::quiet_NaN();
+            return invalid_for_type<T>::value;
        }
    }
 
    // AVX2-based implementation of the nanmin (fmin) reduction operation.
-    template <typename T>
+    template <typename T, bool recognize_integer_invalids>
    static double avx2(void * pDataIn, int64_t len, stScatterGatherFunc * pstScatterGatherFunc)
    {
-        // TEMP: Call the non-vectorized version of this operation until a
-        // vectorized version can be implemented.
-        return non_vector<T>(pDataIn, len, pstScatterGatherFunc);
-    }
+        static_assert(
+            !std::is_integral<T>::value,
+            "Integers not yet supported in vectorized (AVX2) mode."
+        );
+
+        typedef typename riptide::simd::avx2::vec256<T>::reg_type U256;
 
-    //--------------------------------------------------------------------------------------------
-    template <typename T, typename U256>
-    static double avx2(void * pDataIn, int64_t len, stScatterGatherFunc * pstScatterGatherFunc)
-    {
        T * pIn = (T *)pDataIn;
        T * pEnd;
@@ -1533,8 +1498,10 @@
            m2 = FMIN_OP(result, m2, m3);
            m4 = FMIN_OP(result, m4, m5);
            m6 = FMIN_OP(result, m6, m7);
+            m0 = FMIN_OP(result, m0, m2);
            m4 = FMIN_OP(result, m4, m6);
+            m0 = FMIN_OP(result, m0, m4);
 
            // Write 256 bits into memory
@@ -1544,13 +1511,12 @@
            _mm256_storeu_si256(&temp, *(__m256i *)&m0);
            T * tempT = (T *)&temp;
 
-            result = tempT[0];
 
            // printf("sofar minop 0 chunk: %lld %lf %lf %lld\n", chunkSize,
            // pstScatterGatherFunc->resultOut, (double)result,
            // pstScatterGatherFunc->resultOutInt64);
-            for (int i = 1; i < perReg; i++)
+            for (int i = 0; i < perReg; i++)
            {
                const T current = tempT[i];
                if (invalid_for_type<T>::is_valid(current))
@@ -1610,47 +1576,50 @@
        else
        {
            // This chunk of the array contained all NaNs.
-            return std::numeric_limits<T>::quiet_NaN();
+            return invalid_for_type<T>::value;
        }
    }
 
 public:
    // Given the numpy dtype number of the input array, returns a pointer to the
    // reduction function specialized for that array type.
-    static ANY_SCATTER_GATHER_FUNC GetScatterGatherFuncPtr(const NPY_TYPES inputType)
+    static ANY_SCATTER_GATHER_FUNC GetScatterGatherFuncPtr(const NPY_TYPES inputType, const bool recognize_integer_invalids)
    {
+        // TODO: Enable avx2 implementation once it's been tested.
+        //       Initially, do this for `float` and `double`; the FMIN_OP function
+        //       needs to have overloads implemented for integer types before the
+        //       AVX2 implementation can be enabled for integral types.
+
        switch (inputType)
        {
-        // TODO: Enable avx2 implementation once it's been tested.
        case NPY_FLOAT:
-            return non_vector<float>;
+            return non_vector<float, false>;
        case NPY_DOUBLE:
-            return non_vector<double>;
+            return non_vector<double, false>;
        case NPY_LONGDOUBLE:
-            return non_vector<long double>;
+            return non_vector<long double, false>;
+
        case NPY_INT8:
-            return non_vector<int8_t>;
+            return recognize_integer_invalids ? non_vector<int8_t, true> : non_vector<int8_t, false>;
        case NPY_INT16:
-            return non_vector<int16_t>;
+            return recognize_integer_invalids ? non_vector<int16_t, true> : non_vector<int16_t, false>;
        CASE_NPY_INT32:
-            return non_vector<int32_t>;
+            return recognize_integer_invalids ? non_vector<int32_t, true> : non_vector<int32_t, false>;
        CASE_NPY_INT64:
-
-            return non_vector<int64_t>;
+            return recognize_integer_invalids ? non_vector<int64_t, true> : non_vector<int64_t, false>;
        case NPY_UINT8:
-            return non_vector<uint8_t>;
+            return recognize_integer_invalids ? non_vector<uint8_t, true> : non_vector<uint8_t, false>;
        case NPY_UINT16:
-            return non_vector<uint16_t>;
+            return recognize_integer_invalids ? non_vector<uint16_t, true> : non_vector<uint16_t, false>;
        CASE_NPY_UINT32:
-            return non_vector<uint32_t>;
+            return recognize_integer_invalids ? non_vector<uint32_t, true> : non_vector<uint32_t, false>;
        CASE_NPY_UINT64:
-
-            return non_vector<uint64_t>;
+            return recognize_integer_invalids ? non_vector<uint64_t, true> : non_vector<uint64_t, false>;
        // bools are currently handled specially; we don't consider bool to have a
        // nan/invalid value so we utilize the normal reduction operation for it.
        case NPY_BOOL:
-            return ReduceMin::GetScatterGatherFuncPtr<false>(inputType);
+            return ReduceMin::GetScatterGatherFuncPtr(inputType, recognize_integer_invalids);
 
        default:
            return nullptr;
@@ -1666,7 +1635,7 @@ class ReduceNanMax final
 {
    // Simple, non-vectorized implementation of the nanmax (fmax) reduction
    // operation.
-    template <typename T>
+    template <typename T, bool recognize_integer_invalids>
    static double non_vector(void * pDataIn, int64_t len, stScatterGatherFunc * pstScatterGatherFunc)
    {
        T * pIn = (T *)pDataIn;
@@ -1676,15 +1645,22 @@
        // so it'll be overwritten by any non-NaN value encountered below.
        T result = invalid_for_type<T>::value;
 
+        // Indicates when T is an integral type AND we're *NOT* recognizing / doing anything special
+        // for integer invalid values.
+        static constexpr bool skip_invalid_check = std::is_integral<T>::value && !recognize_integer_invalids;
+
        // Search for first non Nan
-        while (pIn < pEnd)
+        if constexpr (!skip_invalid_check)
        {
-            // break on first non nan
-            if (invalid_for_type<T>::is_valid(*pIn))
+            while (pIn < pEnd)
            {
-                break;
+                // break on first non nan
+                if (invalid_for_type<T>::is_valid(*pIn))
+                {
+                    break;
+                }
+                pIn++;
            }
-            pIn++;
        }
 
        if (pIn < pEnd)
@@ -1695,7 +1671,7 @@
            {
                // get the maximum
                // include inf because of range function in cut/qcut
-                if (invalid_for_type<T>::is_valid(*pIn))
+                if (skip_invalid_check || invalid_for_type<T>::is_valid(*pIn))
                {
                    result = (std::max)(result, *pIn);
                }
@@ -1709,7 +1685,7 @@
        // stScatterGatherFunc value used to accumulate any intermediate results
        // will retain whatever values it was initialized with, which makes it
        // easier to detect and handle the case of an all-NaN array.
-        if (invalid_for_type<T>::is_valid(result))
+        if (skip_invalid_check || invalid_for_type<T>::is_valid(result))
        {
            // Is there a previous intermediate result we need to combine with?
            // If not, this chunk of the array was the first result so we can just
@@ -1739,23 +1715,21 @@
        else
        {
            // This chunk of the array contained all NaNs.
-            return std::numeric_limits<T>::quiet_NaN();
+            return invalid_for_type<T>::value;
        }
    }
 
    // AVX2-based implementation of the nanmax (fmax) reduction operation.
-    template <typename T>
+    template <typename T, bool recognize_integer_invalids>
    static double avx2(void * pDataIn, int64_t len, stScatterGatherFunc * pstScatterGatherFunc)
    {
-        // TEMP: Call the non-vectorized version of this operation until a
-        // vectorized version can be implemented.
-        return non_vector<T>(pDataIn, len, pstScatterGatherFunc);
-    }
+        static_assert(
+            !std::is_integral<T>::value,
+            "Integers not yet supported in vectorized (AVX2) mode."
+        );
+
+        typedef typename riptide::simd::avx2::vec256<T>::reg_type U256;
 
-    //--------------------------------------------------------------------------------------------
-    template <typename T, typename U256>
-    static double avx2(void * pDataIn, int64_t len, stScatterGatherFunc * pstScatterGatherFunc)
-    {
        T * pIn = (T *)pDataIn;
        T * pEnd;
@@ -1884,47 +1858,50 @@
        else
        {
            // This chunk of the array contained all NaNs.
-            return std::numeric_limits<T>::quiet_NaN();
+            return invalid_for_type<T>::value;
        }
    }
 
 public:
    // Given the numpy dtype number of the input array, returns a pointer to the
    // reduction function specialized for that array type.
-    static ANY_SCATTER_GATHER_FUNC GetScatterGatherFuncPtr(const NPY_TYPES inputType)
+    static ANY_SCATTER_GATHER_FUNC GetScatterGatherFuncPtr(const NPY_TYPES inputType, const bool recognize_integer_invalids)
    {
+        // TODO: Enable avx2 implementation once it's been tested.
+        //       Initially, do this for `float` and `double`; the FMAX_OP function
+        //       needs to have overloads implemented for integer types before the
+        //       AVX2 implementation can be enabled for integral types.
+
        switch (inputType)
        {
-        // TODO: Enable avx2 implementation once it's been tested.
        case NPY_FLOAT:
-            return non_vector<float>;
+            return non_vector<float, false>;
        case NPY_DOUBLE:
-            return non_vector<double>;
+            return non_vector<double, false>;
        case NPY_LONGDOUBLE:
-            return non_vector<long double>;
+            return non_vector<long double, false>;
+
        case NPY_INT8:
-            return non_vector<int8_t>;
+            return recognize_integer_invalids ? non_vector<int8_t, true> : non_vector<int8_t, false>;
        case NPY_INT16:
-            return non_vector<int16_t>;
+            return recognize_integer_invalids ? non_vector<int16_t, true> : non_vector<int16_t, false>;
        CASE_NPY_INT32:
-            return non_vector<int32_t>;
+            return recognize_integer_invalids ? non_vector<int32_t, true> : non_vector<int32_t, false>;
        CASE_NPY_INT64:
-
-            return non_vector<int64_t>;
+            return recognize_integer_invalids ? non_vector<int64_t, true> : non_vector<int64_t, false>;
        case NPY_UINT8:
-            return non_vector<uint8_t>;
+            return recognize_integer_invalids ? non_vector<uint8_t, true> : non_vector<uint8_t, false>;
        case NPY_UINT16:
-            return non_vector<uint16_t>;
+            return recognize_integer_invalids ? non_vector<uint16_t, true> : non_vector<uint16_t, false>;
        CASE_NPY_UINT32:
-            return non_vector<uint32_t>;
+            return recognize_integer_invalids ? non_vector<uint32_t, true> : non_vector<uint32_t, false>;
        CASE_NPY_UINT64:
-
-            return non_vector<uint64_t>;
+            return recognize_integer_invalids ? non_vector<uint64_t, true> : non_vector<uint64_t, false>;
        // bools are currently handled specially; we don't consider bool to have a
        // nan/invalid value so we utilize the normal reduction operation for it.
        case NPY_BOOL:
-            return ReduceMax::GetScatterGatherFuncPtr(inputType);
+            return ReduceMax::GetScatterGatherFuncPtr(inputType, recognize_integer_invalids);
 
        default:
            return nullptr;
@@ -2845,16 +2822,16 @@ static ANY_SCATTER_GATHER_FUNC GetReduceFuncPtr(const NPY_TYPES inputType, const
        return ReduceNanSum::GetScatterGatherFuncPtr(inputType);
 
    case REDUCE_FUNCTIONS::REDUCE_MIN:
-        return ReduceMin::GetScatterGatherFuncPtr<false>(inputType);
+        return ReduceMin::GetScatterGatherFuncPtr(inputType, false);
    case REDUCE_FUNCTIONS::REDUCE_MIN_NANAWARE:
-        return ReduceMin::GetScatterGatherFuncPtr<true>(inputType);
+        return ReduceMin::GetScatterGatherFuncPtr(inputType, true);
    case REDUCE_FUNCTIONS::REDUCE_NANMIN:
-        return ReduceNanMin::GetScatterGatherFuncPtr(inputType);
+        return ReduceNanMin::GetScatterGatherFuncPtr(inputType, false);
 
    case REDUCE_FUNCTIONS::REDUCE_MAX:
-        return ReduceMax::GetScatterGatherFuncPtr(inputType);
+        return ReduceMax::GetScatterGatherFuncPtr(inputType, false);
    case REDUCE_FUNCTIONS::REDUCE_NANMAX:
-        return ReduceNanMax::GetScatterGatherFuncPtr(inputType);
+        return ReduceNanMax::GetScatterGatherFuncPtr(inputType, false);
 
    case REDUCE_FUNCTIONS::REDUCE_VAR:
        return ReduceVariance::GetScatterGatherFuncPtr(inputType);
diff --git a/src/simd/avx2.h b/src/simd/avx2.h
index 6281fcdf..b6215c72 100644
--- a/src/simd/avx2.h
+++ b/src/simd/avx2.h
@@ -1,4 +1,5 @@
-#pragma once
+#ifndef RIPTABLE_CPP_SIMD_AVX2_H
+#define RIPTABLE_CPP_SIMD_AVX2_H
 #include
 #include
 #include
@@ -64,20 +65,20 @@ namespace riptide
     * @return T const& The result of the operation.
     */
    template <typename T>
-    T const & min_with_nan_passthru(T const & x, T const & y)
+    static RT_FORCEINLINE T const & min_with_nan_passthru(T const & x, T const & y)
    {
        return (std::min)(x, y);
    }
 
    template <>
-    float const & min_with_nan_passthru(float const & x, float const & y)
+    RT_FORCEINLINE float const & min_with_nan_passthru(float const & x, float const & y)
    {
        const auto & blended = (x != x) ? x : y;
        return x < blended ? x : blended;
    }
 
    template <>
-    double const & min_with_nan_passthru(double const & x, double const & y)
+    RT_FORCEINLINE double const & min_with_nan_passthru(double const & x, double const & y)
    {
        const auto & blended = (x != x) ? x : y;
        return x < blended ? x : blended;
@@ -93,20 +94,20 @@ namespace riptide
     * @return T const& The result of the operation.
     */
    template <typename T>
-    T const & max_with_nan_passthru(T const & x, T const & y)
+    static RT_FORCEINLINE T const & max_with_nan_passthru(T const & x, T const & y)
    {
        return (std::max)(x, y);
    }
 
    template <>
-    float const & max_with_nan_passthru(float const & x, float const & y)
+    RT_FORCEINLINE float const & max_with_nan_passthru(float const & x, float const & y)
    {
        const auto & blended = (x != x) ? x : y;
        return x > blended ? x : blended;
    }
 
    template <>
-    double const & max_with_nan_passthru(double const & x, double const & y)
+    RT_FORCEINLINE double const & max_with_nan_passthru(double const & x, double const & y)
    {
        const auto & blended = (x != x) ? x : y;
        return x > blended ? x : blended;
@@ -5473,3 +5474,5 @@
 } // namespace avx2
 } // namespace simd
 } // namespace riptide
+
+#endif // RIPTABLE_CPP_SIMD_AVX2_H
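A quick way to see how these helpers are meant to differ from std::min is the standalone check below (illustrative only; min_passthru is a local copy of the float overload's logic rather than the header's actual function):

#include <cassert>
#include <cmath>
#include <limits>

// A NaN on either side wins, unlike std::min, which returns the other
// operand when the NaN is the first argument.
static float min_passthru(float x, float y)
{
    const float blended = (x != x) ? x : y;
    return x < blended ? x : blended;
}

int main()
{
    const float nan = std::numeric_limits<float>::quiet_NaN();
    assert(std::isnan(min_passthru(nan, 1.0f)));
    assert(std::isnan(min_passthru(1.0f, nan)));
    assert(min_passthru(1.0f, 2.0f) == 1.0f);
    return 0;
}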