diff --git a/sse2neon.h b/sse2neon.h
index c2228667..88d0bdce 100644
--- a/sse2neon.h
+++ b/sse2neon.h
@@ -114,7 +114,6 @@
 #warning "Optimization may cause potential errors in sse2neon. see #648"
 #endif
 
-
 /* C language does not allow initializing a variable with a function call. */
 #ifdef __cplusplus
 #define _sse2neon_const static const
@@ -122,6 +121,7 @@
 #define _sse2neon_const const
 #endif
 
+#include <fenv.h>
 #include
 #include
 #include
@@ -1840,25 +1840,20 @@ FORCE_INLINE unsigned int _sse2neon_mm_get_flush_zero_mode(void)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_MM_GET_ROUNDING_MODE
 FORCE_INLINE unsigned int _MM_GET_ROUNDING_MODE(void)
 {
-    union {
-        fpcr_bitfield field;
-#if defined(__aarch64__) || defined(_M_ARM64)
-        uint64_t value;
-#else
-        uint32_t value;
-#endif
-    } r;
-
-#if defined(__aarch64__) || defined(_M_ARM64)
-    r.value = _sse2neon_get_fpcr();
-#else
-    __asm__ __volatile__("vmrs %0, FPSCR" : "=r"(r.value)); /* read */
-#endif
-
-    if (r.field.bit22) {
-        return r.field.bit23 ? _MM_ROUND_TOWARD_ZERO : _MM_ROUND_UP;
-    } else {
-        return r.field.bit23 ? _MM_ROUND_DOWN : _MM_ROUND_NEAREST;
+    switch (fegetround()) {
+    case FE_TONEAREST:
+        return _MM_ROUND_NEAREST;
+    case FE_DOWNWARD:
+        return _MM_ROUND_DOWN;
+    case FE_UPWARD:
+        return _MM_ROUND_UP;
+    case FE_TOWARDZERO:
+        return _MM_ROUND_TOWARD_ZERO;
+    default:
+        // fegetround() returns FE_TONEAREST, FE_DOWNWARD, FE_UPWARD, or
+        // FE_TOWARDZERO on success; treat any other (error) return value as
+        // round-toward-zero (truncate).
+        return _MM_ROUND_TOWARD_ZERO;
     }
 }
 
@@ -2454,44 +2449,26 @@ FORCE_INLINE __m128 _mm_set_ps1(float _w)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_MM_SET_ROUNDING_MODE
 FORCE_INLINE void _MM_SET_ROUNDING_MODE(int rounding)
 {
-    union {
-        fpcr_bitfield field;
-#if defined(__aarch64__) || defined(_M_ARM64)
-        uint64_t value;
-#else
-        uint32_t value;
-#endif
-    } r;
-
-#if defined(__aarch64__) || defined(_M_ARM64)
-    r.value = _sse2neon_get_fpcr();
-#else
-    __asm__ __volatile__("vmrs %0, FPSCR" : "=r"(r.value)); /* read */
-#endif
-
     switch (rounding) {
-    case _MM_ROUND_TOWARD_ZERO:
-        r.field.bit22 = 1;
-        r.field.bit23 = 1;
+    case _MM_ROUND_NEAREST:
+        rounding = FE_TONEAREST;
         break;
     case _MM_ROUND_DOWN:
-        r.field.bit22 = 0;
-        r.field.bit23 = 1;
+        rounding = FE_DOWNWARD;
         break;
     case _MM_ROUND_UP:
-        r.field.bit22 = 1;
-        r.field.bit23 = 0;
+        rounding = FE_UPWARD;
+        break;
+    case _MM_ROUND_TOWARD_ZERO:
+        rounding = FE_TOWARDZERO;
         break;
-    default: //_MM_ROUND_NEAREST
-        r.field.bit22 = 0;
-        r.field.bit23 = 0;
+    default:
+        // rounding must be one of _MM_ROUND_NEAREST, _MM_ROUND_DOWN,
+        // _MM_ROUND_UP, or _MM_ROUND_TOWARD_ZERO; treat any other value as
+        // FE_TOWARDZERO (truncate).
+        rounding = FE_TOWARDZERO;
     }
-
-#if defined(__aarch64__) || defined(_M_ARM64)
-    _sse2neon_set_fpcr(r.value);
-#else
-    __asm__ __volatile__("vmsr FPSCR, %0" ::"r"(r)); /* write */
-#endif
+    fesetround(rounding);
 }
 
 // Copy single-precision (32-bit) floating-point element a to the lower element
@@ -9340,8 +9317,7 @@ FORCE_INLINE int64_t _mm_popcnt_u64(uint64_t a)
 #endif
 }
 
-FORCE_INLINE void _sse2neon_mm_set_denormals_zero_mode(
-    unsigned int flag)
+FORCE_INLINE void _sse2neon_mm_set_denormals_zero_mode(unsigned int flag)
 {
     // AArch32 Advanced SIMD arithmetic always uses the Flush-to-zero setting,
     // regardless of the value of the FZ bit.
diff --git a/tests/impl.cpp b/tests/impl.cpp
index f29a4ae0..a3887606 100644
--- a/tests/impl.cpp
+++ b/tests/impl.cpp
@@ -4793,7 +4793,8 @@ result_t test_mm_cvttpd_epi32(const SSE2NEONTestImpl &impl, uint32_t iter)
     return validateInt32(ret, d0, d1, 0, 0);
 }
 
-OPTNONE result_t test_mm_cvttpd_pi32(const SSE2NEONTestImpl &impl, uint32_t iter)
+OPTNONE result_t test_mm_cvttpd_pi32(const SSE2NEONTestImpl &impl,
+                                     uint32_t iter)
 {
     const double *_a = (const double *) impl.mTestFloatPointer1;
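
Note (reviewer aside, not part of the patch): below is a minimal standalone sketch of the <fenv.h> round trip that the rewritten _MM_GET_ROUNDING_MODE / _MM_SET_ROUNDING_MODE now rely on. It uses only standard C99 facilities (fesetround, fegetround, nearbyint); the mode/name arrays and printed checks are illustrative, and it assumes the target libc supports all four FE_* modes. On Arm targets, fesetround() programs the same RMode bits in FPCR (AArch64) / FPSCR (AArch32) that the removed inline assembly flipped, so the observable behavior should be unchanged.

/* sketch.c -- illustrative only; build with e.g. cc sketch.c -lm */
#include <fenv.h>
#include <math.h>
#include <stdio.h>

/* Strict ISO C also asks for "#pragma STDC FENV_ACCESS ON" before touching
 * the floating-point environment; it is omitted here because common
 * compilers (e.g. GCC) do not implement that pragma. */

int main(void)
{
    /* Same four modes, in the same order, as the _MM_ROUND_* cases above. */
    const int modes[] = {FE_TONEAREST, FE_DOWNWARD, FE_UPWARD, FE_TOWARDZERO};
    const char *names[] = {"nearest", "down", "up", "toward-zero"};

    for (int i = 0; i < 4; i++) {
        if (fesetround(modes[i]) != 0) {
            /* fesetround() returns non-zero if the mode is unsupported. */
            printf("%s: not supported by this libc/FPU\n", names[i]);
            continue;
        }
        /* fegetround() should hand back the mode just set, and nearbyint()
         * honors the current mode: 2.5 -> 2 (nearest-even), 2 (down),
         * 3 (up), 2 (toward zero). */
        printf("%s: round-trip=%d nearbyint(2.5)=%g nearbyint(-2.5)=%g\n",
               names[i], fegetround() == modes[i], nearbyint(2.5),
               nearbyint(-2.5));
    }
    /* Restore the default, as _MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST) would. */
    fesetround(FE_TONEAREST);
    return 0;
}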