refactor: Use fesetround() and fegetround()
howjmay committed Sep 22, 2024
1 parent d1fe9f2 commit 6e3fd9a
Showing 2 changed files with 55 additions and 86 deletions.
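
For context: fegetround() and fesetround() are the standard C99 <fenv.h>
accessors for the floating-point rounding mode, which this commit adopts in
place of direct FPCR/FPSCR register manipulation. The following is a minimal
standalone sketch (not part of the commit) of the round-trip the new code
relies on; FE_TONEAREST, FE_DOWNWARD, FE_UPWARD, and FE_TOWARDZERO are the
standard C99 rounding-mode macros.

#include <fenv.h>
#include <stdio.h>

int main(void)
{
    /* fesetround() returns 0 on success. */
    if (fesetround(FE_TOWARDZERO) != 0)
        return 1;

    /* fegetround() reports the mode that was just installed. */
    switch (fegetround()) {
    case FE_TONEAREST:
        puts("round to nearest");
        break;
    case FE_DOWNWARD:
        puts("round toward -infinity");
        break;
    case FE_UPWARD:
        puts("round toward +infinity");
        break;
    case FE_TOWARDZERO:
        puts("round toward zero");
        break;
    }
    return 0;
}

This is the same FE_* to _MM_ROUND_* correspondence that the rewritten
_MM_GET_ROUNDING_MODE and _MM_SET_ROUNDING_MODE below implement, one
direction each.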
135 changes: 51 additions & 84 deletions sse2neon.h
@@ -114,14 +114,14 @@
 #warning "Optimization may cause potential errors in sse2neon. see #648"
 #endif
 
-
 /* C language does not allow initializing a variable with a function call. */
 #ifdef __cplusplus
 #define _sse2neon_const static const
 #else
 #define _sse2neon_const const
 #endif
 
+#include <fenv.h>
 #include <stdint.h>
 #include <stdlib.h>
 #include <string.h>
@@ -193,10 +193,7 @@ FORCE_INLINE int64_t sse2neon_recast_f64_s64(double f64)
 #define _sse2neon_return(ret) return ret
 #endif
 
-#define _sse2neon_init(...) \
-    {                       \
-        __VA_ARGS__         \
-    }
+#define _sse2neon_init(...) {__VA_ARGS__}
 
 /* Compiler barrier */
 #if defined(_MSC_VER) && !defined(__clang__)
@@ -1806,7 +1803,7 @@ FORCE_INLINE void _sse2neon_set_fpcr(uint64_t value)
 #if defined(_MSC_VER) && !defined(__clang__)
     _WriteStatusReg(ARM64_FPCR, value);
 #else
-    __asm__ __volatile__("msr FPCR, %0" ::"r"(value)); /* write */
+    __asm__ __volatile__("msr FPCR, %0" ::"r"(value)); /* write */
 #endif
 }

@@ -1840,25 +1837,17 @@ FORCE_INLINE unsigned int _sse2neon_mm_get_flush_zero_mode(void)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_MM_GET_ROUNDING_MODE
 FORCE_INLINE unsigned int _MM_GET_ROUNDING_MODE(void)
 {
-    union {
-        fpcr_bitfield field;
-#if defined(__aarch64__) || defined(_M_ARM64)
-        uint64_t value;
-#else
-        uint32_t value;
-#endif
-    } r;
-
-#if defined(__aarch64__) || defined(_M_ARM64)
-    r.value = _sse2neon_get_fpcr();
-#else
-    __asm__ __volatile__("vmrs %0, FPSCR" : "=r"(r.value)); /* read */
-#endif
-
-    if (r.field.bit22) {
-        return r.field.bit23 ? _MM_ROUND_TOWARD_ZERO : _MM_ROUND_UP;
-    } else {
-        return r.field.bit23 ? _MM_ROUND_DOWN : _MM_ROUND_NEAREST;
+    switch (fegetround()) {
+    case FE_TONEAREST:
+        return _MM_ROUND_NEAREST;
+    case FE_DOWNWARD:
+        return _MM_ROUND_DOWN;
+    case FE_UPWARD:
+        return _MM_ROUND_UP;
+    case FE_TOWARDZERO:
+        return _MM_ROUND_TOWARD_ZERO;
+    default: // FIXME
+        return _MM_ROUND_TOWARD_ZERO;
     }
 }

@@ -2426,7 +2415,7 @@ FORCE_INLINE void _sse2neon_mm_set_flush_zero_mode(unsigned int flag)
 #if defined(__aarch64__) || defined(_M_ARM64)
     _sse2neon_set_fpcr(r.value);
 #else
-    __asm__ __volatile__("vmsr FPSCR, %0" ::"r"(r)); /* write */
+    __asm__ __volatile__("vmsr FPSCR, %0" ::"r"(r)); /* write */
 #endif
 }

@@ -2454,44 +2443,23 @@ FORCE_INLINE __m128 _mm_set_ps1(float _w)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_MM_SET_ROUNDING_MODE
 FORCE_INLINE void _MM_SET_ROUNDING_MODE(int rounding)
 {
-    union {
-        fpcr_bitfield field;
-#if defined(__aarch64__) || defined(_M_ARM64)
-        uint64_t value;
-#else
-        uint32_t value;
-#endif
-    } r;
-
-#if defined(__aarch64__) || defined(_M_ARM64)
-    r.value = _sse2neon_get_fpcr();
-#else
-    __asm__ __volatile__("vmrs %0, FPSCR" : "=r"(r.value)); /* read */
-#endif
-
     switch (rounding) {
-    case _MM_ROUND_TOWARD_ZERO:
-        r.field.bit22 = 1;
-        r.field.bit23 = 1;
+    case _MM_ROUND_NEAREST:
+        rounding = FE_TONEAREST;
         break;
     case _MM_ROUND_DOWN:
-        r.field.bit22 = 0;
-        r.field.bit23 = 1;
+        rounding = FE_DOWNWARD;
         break;
     case _MM_ROUND_UP:
-        r.field.bit22 = 1;
-        r.field.bit23 = 0;
+        rounding = FE_UPWARD;
         break;
+    case _MM_ROUND_TOWARD_ZERO:
+        rounding = FE_TOWARDZERO;
+        break;
-    default: //_MM_ROUND_NEAREST
-        r.field.bit22 = 0;
-        r.field.bit23 = 0;
+    default: // FIXME
+        rounding = FE_TOWARDZERO;
     }
-
-#if defined(__aarch64__) || defined(_M_ARM64)
-    _sse2neon_set_fpcr(r.value);
-#else
-    __asm__ __volatile__("vmsr FPSCR, %0" ::"r"(r)); /* write */
-#endif
+    fesetround(rounding);
 }
 
 // Copy single-precision (32-bit) floating-point element a to the lower element
@@ -4990,11 +4958,11 @@ FORCE_INLINE __m128i _mm_set_epi8(signed char b15,
                                   signed char b1,
                                   signed char b0)
 {
-    int8_t ALIGN_STRUCT(16)
-        data[16] = {(int8_t) b0, (int8_t) b1, (int8_t) b2, (int8_t) b3,
-                    (int8_t) b4, (int8_t) b5, (int8_t) b6, (int8_t) b7,
-                    (int8_t) b8, (int8_t) b9, (int8_t) b10, (int8_t) b11,
-                    (int8_t) b12, (int8_t) b13, (int8_t) b14, (int8_t) b15};
+    int8_t ALIGN_STRUCT(16) data[16] = {
+        (int8_t) b0, (int8_t) b1, (int8_t) b2, (int8_t) b3,
+        (int8_t) b4, (int8_t) b5, (int8_t) b6, (int8_t) b7,
+        (int8_t) b8, (int8_t) b9, (int8_t) b10, (int8_t) b11,
+        (int8_t) b12, (int8_t) b13, (int8_t) b14, (int8_t) b15};
     return (__m128i) vld1q_s8(data);
 }

@@ -5125,11 +5093,11 @@ FORCE_INLINE __m128i _mm_setr_epi8(signed char b0,
                                    signed char b14,
                                    signed char b15)
 {
-    int8_t ALIGN_STRUCT(16)
-        data[16] = {(int8_t) b0, (int8_t) b1, (int8_t) b2, (int8_t) b3,
-                    (int8_t) b4, (int8_t) b5, (int8_t) b6, (int8_t) b7,
-                    (int8_t) b8, (int8_t) b9, (int8_t) b10, (int8_t) b11,
-                    (int8_t) b12, (int8_t) b13, (int8_t) b14, (int8_t) b15};
+    int8_t ALIGN_STRUCT(16) data[16] = {
+        (int8_t) b0, (int8_t) b1, (int8_t) b2, (int8_t) b3,
+        (int8_t) b4, (int8_t) b5, (int8_t) b6, (int8_t) b7,
+        (int8_t) b8, (int8_t) b9, (int8_t) b10, (int8_t) b11,
+        (int8_t) b12, (int8_t) b13, (int8_t) b14, (int8_t) b15};
     return (__m128i) vld1q_s8(data);
 }

@@ -6282,7 +6250,7 @@ FORCE_INLINE __m64 _mm_abs_pi8(__m64 a)
     uint8x8_t tmp_low; \
     uint8x8_t tmp_high; \
     if ((imm) >= 8) { \
-        const int idx = (imm) -8; \
+        const int idx = (imm) - 8; \
         tmp_low = vreinterpret_u8_m64(_a); \
         tmp_high = vdup_n_u8(0); \
         ret = vreinterpret_m64_u8(vext_u8(tmp_low, tmp_high, idx)); \
@@ -6803,14 +6771,14 @@ FORCE_INLINE __m64 _mm_sign_pi8(__m64 _a, __m64 _b)
     _sse2neon_define2( \
         __m128i, a, b, \
         const uint16_t _mask[8] = \
-            _sse2neon_init(((imm) & (1 << 0)) ? (uint16_t) -1 : 0x0, \
-                           ((imm) & (1 << 1)) ? (uint16_t) -1 : 0x0, \
-                           ((imm) & (1 << 2)) ? (uint16_t) -1 : 0x0, \
-                           ((imm) & (1 << 3)) ? (uint16_t) -1 : 0x0, \
-                           ((imm) & (1 << 4)) ? (uint16_t) -1 : 0x0, \
-                           ((imm) & (1 << 5)) ? (uint16_t) -1 : 0x0, \
-                           ((imm) & (1 << 6)) ? (uint16_t) -1 : 0x0, \
-                           ((imm) & (1 << 7)) ? (uint16_t) -1 : 0x0); \
+            _sse2neon_init(((imm) & (1 << 0)) ? (uint16_t) - 1 : 0x0, \
+                           ((imm) & (1 << 1)) ? (uint16_t) - 1 : 0x0, \
+                           ((imm) & (1 << 2)) ? (uint16_t) - 1 : 0x0, \
+                           ((imm) & (1 << 3)) ? (uint16_t) - 1 : 0x0, \
+                           ((imm) & (1 << 4)) ? (uint16_t) - 1 : 0x0, \
+                           ((imm) & (1 << 5)) ? (uint16_t) - 1 : 0x0, \
+                           ((imm) & (1 << 6)) ? (uint16_t) - 1 : 0x0, \
+                           ((imm) & (1 << 7)) ? (uint16_t) - 1 : 0x0); \
     uint16x8_t _mask_vec = vld1q_u16(_mask); \
     uint16x8_t __a = vreinterpretq_u16_m128i(_a); \
     uint16x8_t __b = vreinterpretq_u16_m128i(_b); _sse2neon_return( \
@@ -6835,11 +6803,11 @@ FORCE_INLINE __m64 _mm_sign_pi8(__m64 _a, __m64 _b)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_blend_ps
 FORCE_INLINE __m128 _mm_blend_ps(__m128 _a, __m128 _b, const char imm8)
 {
-    const uint32_t ALIGN_STRUCT(16)
-        data[4] = {((imm8) & (1 << 0)) ? UINT32_MAX : 0,
-                   ((imm8) & (1 << 1)) ? UINT32_MAX : 0,
-                   ((imm8) & (1 << 2)) ? UINT32_MAX : 0,
-                   ((imm8) & (1 << 3)) ? UINT32_MAX : 0};
+    const uint32_t
+        ALIGN_STRUCT(16) data[4] = {((imm8) & (1 << 0)) ? UINT32_MAX : 0,
+                                    ((imm8) & (1 << 1)) ? UINT32_MAX : 0,
+                                    ((imm8) & (1 << 2)) ? UINT32_MAX : 0,
+                                    ((imm8) & (1 << 3)) ? UINT32_MAX : 0};
     uint32x4_t mask = vld1q_u32(data);
     float32x4_t a = vreinterpretq_f32_m128(_a);
     float32x4_t b = vreinterpretq_f32_m128(_b);
@@ -9340,8 +9308,7 @@ FORCE_INLINE int64_t _mm_popcnt_u64(uint64_t a)
 #endif
 }
 
-FORCE_INLINE void _sse2neon_mm_set_denormals_zero_mode(
-    unsigned int flag)
+FORCE_INLINE void _sse2neon_mm_set_denormals_zero_mode(unsigned int flag)
 {
     // AArch32 Advanced SIMD arithmetic always uses the Flush-to-zero setting,
     // regardless of the value of the FZ bit.
@@ -9365,7 +9332,7 @@ FORCE_INLINE void _sse2neon_mm_set_denormals_zero_mode(
 #if defined(__aarch64__) || defined(_M_ARM64)
     _sse2neon_set_fpcr(r.value);
 #else
-    __asm__ __volatile__("vmsr FPSCR, %0" ::"r"(r)); /* write */
+    __asm__ __volatile__("vmsr FPSCR, %0" ::"r"(r)); /* write */
 #endif
 }

6 changes: 4 additions & 2 deletions tests/impl.cpp
@@ -4793,7 +4793,8 @@ result_t test_mm_cvttpd_epi32(const SSE2NEONTestImpl &impl, uint32_t iter)
     return validateInt32(ret, d0, d1, 0, 0);
 }
 
-OPTNONE result_t test_mm_cvttpd_pi32(const SSE2NEONTestImpl &impl, uint32_t iter)
+OPTNONE result_t test_mm_cvttpd_pi32(const SSE2NEONTestImpl &impl,
+                                     uint32_t iter)
 {
     const double *_a = (const double *) impl.mTestFloatPointer1;
 
@@ -5877,7 +5878,7 @@ result_t test_mm_shuffle_epi32(const SSE2NEONTestImpl &impl, uint32_t iter)
     int32_t _d[4];
 
 #define TEST_IMPL(IDX) \
-    _d[0] = _a[((IDX) &0x3)]; \
+    _d[0] = _a[((IDX) & 0x3)]; \
     _d[1] = _a[((IDX >> 2) & 0x3)]; \
     _d[2] = _a[((IDX >> 4) & 0x3)]; \
     _d[3] = _a[((IDX >> 6) & 0x3)]; \
@@ -8957,6 +8958,7 @@ OPTNONE result_t test_mm_round_pd(const SSE2NEONTestImpl &impl, uint32_t iter)
     __m128d ret;
 
     __m128d a = load_m128d(_a);
+
     switch (iter & 0x7) {
     case 0:
         d[0] = bankersRounding(_a[0]);
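
As a closing usage note, the public SSE-style API is unchanged by this
refactor. Below is a hypothetical caller-side sketch (not part of this
commit; assumes sse2neon.h is on the include path of an Arm build) of the
accessors it rewires:

#include <assert.h>
#include "sse2neon.h"

int main(void)
{
    /* Select round-up and read it back through the rewritten accessors. */
    _MM_SET_ROUNDING_MODE(_MM_ROUND_UP);
    assert(_MM_GET_ROUNDING_MODE() == _MM_ROUND_UP);

    /* Per SSE semantics, _mm_cvtps_epi32() honors the current rounding
     * mode, so 2.5f converts to 3 here rather than the
     * round-to-nearest-even result of 2. */
    __m128i r = _mm_cvtps_epi32(_mm_set1_ps(2.5f));
    assert(_mm_cvtsi128_si32(r) == 3);

    /* Restore the default mode. */
    _MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST);
    return 0;
}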
