Skip to content

Commit

Permalink
feat: Use fesetround() and fegetround()
Browse files Browse the repository at this point in the history
Setting and getting the rounding mode by accessing the FPCR register directly
through volatile inline assembly can be unstable.
Therefore, use the C99 fesetround()/fegetround() functions instead to ensure
well-defined behavior.
  • Loading branch information
howjmay committed Oct 6, 2024
1 parent 1fa51c1 commit 7698ecf
Show file tree
Hide file tree
Showing 2 changed files with 31 additions and 54 deletions.
82 changes: 29 additions & 53 deletions sse2neon.h
Original file line number Diff line number Diff line change
Expand Up @@ -114,14 +114,14 @@
#warning "Optimization may cause potential errors in sse2neon. see #648"
#endif


/* In C, an object with static storage duration must be initialized with a
 * constant expression, so it cannot be initialized with the result of a
 * function call. `static` is therefore only part of _sse2neon_const when the
 * header is compiled as C++; in C the macro expands to plain `const`.
 */
#ifdef __cplusplus
#define _sse2neon_const static const
#else
#define _sse2neon_const const
#endif

#include <fenv.h>
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
Expand Down Expand Up @@ -1840,25 +1840,20 @@ FORCE_INLINE unsigned int _sse2neon_mm_get_flush_zero_mode(void)
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_MM_GET_ROUNDING_MODE
// Return the current floating-point rounding mode, expressed as one of the
// SSE constants _MM_ROUND_NEAREST, _MM_ROUND_DOWN, _MM_ROUND_UP or
// _MM_ROUND_TOWARD_ZERO.
//
// The mode is queried through the C99 fegetround() interface instead of
// reading the FPCR/FPSCR register directly.
FORCE_INLINE unsigned int _MM_GET_ROUNDING_MODE(void)
{
    switch (fegetround()) {
    case FE_TONEAREST:
        return _MM_ROUND_NEAREST;
    case FE_DOWNWARD:
        return _MM_ROUND_DOWN;
    case FE_UPWARD:
        return _MM_ROUND_UP;
    case FE_TOWARDZERO:
        return _MM_ROUND_TOWARD_ZERO;
    default:
        // fegetround() returns FE_TONEAREST, FE_DOWNWARD, FE_UPWARD or
        // FE_TOWARDZERO on success. Any other value (for example a negative
        // value when the mode cannot be determined) is reported as
        // _MM_ROUND_TOWARD_ZERO (truncate).
        return _MM_ROUND_TOWARD_ZERO;
    }
}

Expand Down Expand Up @@ -2454,44 +2449,26 @@ FORCE_INLINE __m128 _mm_set_ps1(float _w)
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_MM_SET_ROUNDING_MODE
// Set the floating-point rounding mode.
//
// rounding: one of _MM_ROUND_NEAREST, _MM_ROUND_DOWN, _MM_ROUND_UP or
//           _MM_ROUND_TOWARD_ZERO. Any other value is treated as
//           _MM_ROUND_TOWARD_ZERO (truncate).
//
// The SSE constant is translated to the matching <fenv.h> FE_* constant and
// applied with the C99 fesetround() interface instead of writing the
// FPCR/FPSCR register directly.
FORCE_INLINE void _MM_SET_ROUNDING_MODE(int rounding)
{
    switch (rounding) {
    case _MM_ROUND_NEAREST:
        rounding = FE_TONEAREST;
        break;
    case _MM_ROUND_DOWN:
        rounding = FE_DOWNWARD;
        break;
    case _MM_ROUND_UP:
        rounding = FE_UPWARD;
        break;
    case _MM_ROUND_TOWARD_ZERO:
        rounding = FE_TOWARDZERO;
        break;
    default:
        // rounding must be _MM_ROUND_NEAREST, _MM_ROUND_DOWN, _MM_ROUND_UP or
        // _MM_ROUND_TOWARD_ZERO. All other (invalid) values are treated as
        // FE_TOWARDZERO (truncate).
        rounding = FE_TOWARDZERO;
        break;
    }

    // This function returns void, so a failure reported by fesetround()
    // cannot be propagated to the caller.
    fesetround(rounding);
}

// Copy single-precision (32-bit) floating-point element a to the lower element
Expand Down Expand Up @@ -9340,8 +9317,7 @@ FORCE_INLINE int64_t _mm_popcnt_u64(uint64_t a)
#endif
}

FORCE_INLINE void _sse2neon_mm_set_denormals_zero_mode(
unsigned int flag)
FORCE_INLINE void _sse2neon_mm_set_denormals_zero_mode(unsigned int flag)
{
// AArch32 Advanced SIMD arithmetic always uses the Flush-to-zero setting,
// regardless of the value of the FZ bit.
Expand Down
3 changes: 2 additions & 1 deletion tests/impl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4793,7 +4793,8 @@ result_t test_mm_cvttpd_epi32(const SSE2NEONTestImpl &impl, uint32_t iter)
return validateInt32(ret, d0, d1, 0, 0);
}

OPTNONE result_t test_mm_cvttpd_pi32(const SSE2NEONTestImpl &impl, uint32_t iter)
OPTNONE result_t test_mm_cvttpd_pi32(const SSE2NEONTestImpl &impl,
uint32_t iter)
{
const double *_a = (const double *) impl.mTestFloatPointer1;

Expand Down

0 comments on commit 7698ecf

Please sign in to comment.