Use unaligned data types for unaligned intrinsics #632

Merged 1 commit on May 20, 2024
sse2neon.h: 13 changes (9 additions, 4 deletions)

@@ -382,6 +382,11 @@ typedef float32x4_t __m128d;
 #endif
 typedef int64x2_t __m128i; /* 128-bit vector containing integers */
 
+// Some intrinsics operate on unaligned data types.
+typedef int16_t ALIGN_STRUCT(1) unaligned_int16_t;
+typedef int32_t ALIGN_STRUCT(1) unaligned_int32_t;
+typedef int64_t ALIGN_STRUCT(1) unaligned_int64_t;
+
 // __int64 is defined in the Intrinsics Guide which maps to different datatype
 // in different data model
 #if !(defined(_WIN32) || defined(_WIN64) || defined(__int64))
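
The key point is that GCC and Clang allow the aligned attribute on a typedef to lower a type's alignment, not only raise it. A minimal standalone sketch of the idea (not part of the patch; the names are illustrative):

#include <stdint.h>

/* On GCC/Clang, ALIGN_STRUCT(1) expands to __attribute__((aligned(1))).
 * A typedef with 1-byte alignment tells the compiler the pointee may
 * sit at any address, so dereferencing it is well-defined and gets
 * lowered to an alignment-safe load where the target requires one. */
typedef int32_t __attribute__((aligned(1))) unaligned_i32;

int32_t load32_any_address(const void *p)
{
    return *(const unaligned_i32 *) p; /* fine even if p is odd */
}

int32_t load32_assumes_alignment(const void *p)
{
    /* Undefined behavior when p is misaligned: the plain int32_t type
     * licenses the compiler to assume 4-byte alignment. */
    return *(const int32_t *) p;
}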
@@ -1927,15 +1932,15 @@ FORCE_INLINE __m128 _mm_loadu_ps(const float *p)
 FORCE_INLINE __m128i _mm_loadu_si16(const void *p)
 {
     return vreinterpretq_m128i_s16(
-        vsetq_lane_s16(*(const int16_t *) p, vdupq_n_s16(0), 0));
+        vsetq_lane_s16(*(const unaligned_int16_t *) p, vdupq_n_s16(0), 0));
 }
 
 // Load unaligned 64-bit integer from memory into the first element of dst.
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadu_si64
 FORCE_INLINE __m128i _mm_loadu_si64(const void *p)
 {
     return vreinterpretq_m128i_s64(
-        vcombine_s64(vld1_s64((const int64_t *) p), vdup_n_s64(0)));
+        vsetq_lane_s64(*(const unaligned_int64_t *) p, vdupq_n_s64(0), 0));
 }
 
 // Allocate size bytes of memory, aligned to the alignment specified in align,
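
Note that _mm_loadu_si64 drops vld1_s64 through a plain int64_t pointer in favor of dereferencing the aligned(1) typedef, so the compiler can no longer assume the source is 8-byte aligned. A hypothetical caller that depends on exactly this guarantee (the buffer and offset are made up for illustration):

#include <stdint.h>
#include "sse2neon.h"

/* Pull a 64-bit value out of a byte stream at an odd offset.
 * _mm_loadu_si64 must tolerate the misaligned address. */
int64_t read_s64_at_odd_offset(const uint8_t *stream)
{
    __m128i v = _mm_loadu_si64((const void *) (stream + 1));
    return _mm_cvtsi128_si64(v); /* extract lane 0 */
}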
@@ -4360,15 +4365,15 @@ FORCE_INLINE __m128d _mm_loadu_pd(const double *p)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadu_si128
 FORCE_INLINE __m128i _mm_loadu_si128(const __m128i *p)
 {
-    return vreinterpretq_m128i_s32(vld1q_s32((const int32_t *) p));
+    return vreinterpretq_m128i_s32(vld1q_s32((const unaligned_int32_t *) p));
 }
 
 // Load unaligned 32-bit integer from memory into the first element of dst.
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadu_si32
 FORCE_INLINE __m128i _mm_loadu_si32(const void *p)
 {
     return vreinterpretq_m128i_s32(
-        vsetq_lane_s32(*(const int32_t *) p, vdupq_n_s32(0), 0));
+        vsetq_lane_s32(*(const unaligned_int32_t *) p, vdupq_n_s32(0), 0));
 }
 
 // Multiply packed signed 16-bit integers in a and b, producing intermediate
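
At the instruction level, NEON loads such as vld1q_s32 already tolerate unaligned addresses; the casts matter at the C level, where loading through a plain int32_t pointer is undefined for misaligned input and optimizers may act on that assumption. A strictly portable alternative, which this patch does not use but which compilers lower to the same single load, is a fixed-size memcpy:

#include <stdint.h>
#include <string.h>

/* Portable unaligned 32-bit load (shown for comparison only, not the
 * approach taken by the patch): modern compilers turn this memcpy
 * into one plain load instruction. */
static inline int32_t load_s32(const void *p)
{
    int32_t v;
    memcpy(&v, p, sizeof v);
    return v;
}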
tests/impl.cpp: 29 changes (25 additions, 4 deletions)

@@ -28,6 +28,19 @@
 /* run the 1st parameter */
 #define IIF_1(t, ...) t
 
+// Some intrinsics operate on unaligned data types.
+#if defined(__GNUC__) || defined(__clang__)
+#define ALIGN_STRUCT(x) __attribute__((aligned(x)))
+#elif defined(_MSC_VER)
+#ifndef ALIGN_STRUCT
+#define ALIGN_STRUCT(x) __declspec(align(x))
+#endif
+#endif
+
+typedef int16_t ALIGN_STRUCT(1) unaligned_int16_t;
+typedef int32_t ALIGN_STRUCT(1) unaligned_int32_t;
+typedef int64_t ALIGN_STRUCT(1) unaligned_int64_t;
+
 // This program a set of unit tests to ensure that each SSE call provide the
 // output we expect. If this fires an assert, then something didn't match up.
 //
@@ -49,6 +62,10 @@ class SSE2NEONTestImpl : public SSE2NEONTest
     int32_t *mTestIntPointer2;
     float mTestFloats[MAX_TEST_VALUE];
    int32_t mTestInts[MAX_TEST_VALUE];
+    int8_t mTestUnalignedInts[32] = {
+        0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15,
+        16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
+    };
 
     virtual ~SSE2NEONTestImpl(void)
     {
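
mTestUnalignedInts gives the fixture a byte-typed buffer, and the tests below offset it by one to obtain a pointer that is not suitably aligned for the wider types. A standalone sketch of the pattern, with an explicit base alignment added here as an assumption so the misalignment is actually guaranteed:

#include <stdint.h>

/* If buf starts on an 8-byte boundary (an assumption forced here for
 * illustration; the test fixture itself does not force one), then
 * buf + 1 is misaligned for every type wider than a byte. */
static const int8_t __attribute__((aligned(8))) buf[32] = {
    0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15,
    16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
};
static const void *misaligned = buf + 1; /* odd address */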
@@ -2141,7 +2158,8 @@ result_t test_mm_loadu_si16(const SSE2NEONTestImpl &impl, uint32_t iter)
 #if (defined(__GNUC__) && !defined(__clang__)) && (__GNUC__ <= 10)
     return TEST_UNIMPL;
 #else
-    const int16_t *addr = (const int16_t *) impl.mTestIntPointer1;
+    const unaligned_int16_t *addr =
+        (const unaligned_int16_t *) (impl.mTestUnalignedInts + 1);
 
     __m128i ret = _mm_loadu_si16((const void *) addr);
 
@@ -2157,7 +2175,8 @@ result_t test_mm_loadu_si64(const SSE2NEONTestImpl &impl, uint32_t iter)
 #if (defined(__GNUC__) && !defined(__clang__)) && (__GNUC__ < 9)
     return TEST_UNIMPL;
 #else
-    const int64_t *addr = (const int64_t *) impl.mTestIntPointer1;
+    const unaligned_int64_t *addr =
+        (const unaligned_int64_t *) (impl.mTestUnalignedInts + 1);
 
     __m128i ret = _mm_loadu_si64((const void *) addr);
 
@@ -5024,7 +5043,8 @@ result_t test_mm_loadu_pd(const SSE2NEONTestImpl &impl, uint32_t iter)
 
 result_t test_mm_loadu_si128(const SSE2NEONTestImpl &impl, uint32_t iter)
 {
-    const int32_t *_a = (const int32_t *) impl.mTestIntPointer1;
+    const unaligned_int32_t *_a =
+        (const unaligned_int32_t *) (impl.mTestUnalignedInts + 1);
     __m128i c = _mm_loadu_si128((const __m128i *) _a);
     return VALIDATE_INT32_M128(c, _a);
 }
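
VALIDATE_INT32_M128 then checks the loaded vector lane by lane against the source. Conceptually it amounts to something like this (a hypothetical paraphrase; the project's macro differs in detail):

#include <stdint.h>
#include <string.h>
#include "sse2neon.h"

/* Every 32-bit lane of c must equal the corresponding value at the
 * (possibly unaligned) source address. */
static int lanes_match(__m128i c, const void *src)
{
    int32_t expect[4], got[4];
    memcpy(expect, src, sizeof expect); /* unaligned-safe read */
    _mm_storeu_si128((__m128i *) got, c);
    for (int i = 0; i < 4; ++i)
        if (got[i] != expect[i])
            return 0;
    return 1;
}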
@@ -5037,7 +5057,8 @@ result_t test_mm_loadu_si32(const SSE2NEONTestImpl &impl, uint32_t iter)
 #if (defined(__GNUC__) && !defined(__clang__)) && (__GNUC__ <= 10)
     return TEST_UNIMPL;
 #else
-    const int32_t *addr = (const int32_t *) impl.mTestIntPointer1;
+    const unaligned_int32_t *addr =
+        (const unaligned_int32_t *) (impl.mTestUnalignedInts + 1);
 
     __m128i ret = _mm_loadu_si32((const void *) addr);
