Skip to content

Commit

Permalink
Use unaligned data types for unaligned intrinsics (#632)
Browse files Browse the repository at this point in the history
Unaligned memory accesses are primarily useful in cases where space is a
concern, and data would benefit from being packed. While unaligned
accesses are known for causing non-negligible performance degradation,
in situations where performance is also a primary concern, having access
to unaligned SIMD instructions is monumentally beneficial.

This has become particularly relevant with the growing popularity of
edge and mobile machine learning.
  • Loading branch information
Logikable authored May 20, 2024
1 parent de0538f commit 42c7047
Show file tree
Hide file tree
Showing 2 changed files with 34 additions and 8 deletions.
13 changes: 9 additions & 4 deletions sse2neon.h
Original file line number Diff line number Diff line change
Expand Up @@ -382,6 +382,11 @@ typedef float32x4_t __m128d;
#endif
typedef int64x2_t __m128i; /* 128-bit vector containing integers */

// Some intrinsics operate on unaligned data types.
typedef int16_t ALIGN_STRUCT(1) unaligned_int16_t;
typedef int32_t ALIGN_STRUCT(1) unaligned_int32_t;
typedef int64_t ALIGN_STRUCT(1) unaligned_int64_t;

// __int64 is defined in the Intrinsics Guide which maps to different datatype
// in different data model
#if !(defined(_WIN32) || defined(_WIN64) || defined(__int64))
Expand Down Expand Up @@ -1927,15 +1932,15 @@ FORCE_INLINE __m128 _mm_loadu_ps(const float *p)
FORCE_INLINE __m128i _mm_loadu_si16(const void *p)
{
return vreinterpretq_m128i_s16(
vsetq_lane_s16(*(const int16_t *) p, vdupq_n_s16(0), 0));
vsetq_lane_s16(*(const unaligned_int16_t *) p, vdupq_n_s16(0), 0));
}

// Load unaligned 64-bit integer from memory into the first element of dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadu_si64
FORCE_INLINE __m128i _mm_loadu_si64(const void *p)
{
return vreinterpretq_m128i_s64(
vcombine_s64(vld1_s64((const int64_t *) p), vdup_n_s64(0)));
vsetq_lane_s64(*(const unaligned_int64_t *) p, vdupq_n_s64(0), 0));
}

// Allocate size bytes of memory, aligned to the alignment specified in align,
Expand Down Expand Up @@ -4360,15 +4365,15 @@ FORCE_INLINE __m128d _mm_loadu_pd(const double *p)
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadu_si128
FORCE_INLINE __m128i _mm_loadu_si128(const __m128i *p)
{
return vreinterpretq_m128i_s32(vld1q_s32((const int32_t *) p));
return vreinterpretq_m128i_s32(vld1q_s32((const unaligned_int32_t *) p));
}

// Load unaligned 32-bit integer from memory into the first element of dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadu_si32
FORCE_INLINE __m128i _mm_loadu_si32(const void *p)
{
return vreinterpretq_m128i_s32(
vsetq_lane_s32(*(const int32_t *) p, vdupq_n_s32(0), 0));
vsetq_lane_s32(*(const unaligned_int32_t *) p, vdupq_n_s32(0), 0));
}

// Multiply packed signed 16-bit integers in a and b, producing intermediate
Expand Down
29 changes: 25 additions & 4 deletions tests/impl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,19 @@
/* run the 1st parameter */
#define IIF_1(t, ...) t

// Some intrinsics operate on unaligned data types.
#if defined(__GNUC__) || defined(__clang__)
#define ALIGN_STRUCT(x) __attribute__((aligned(x)))
#elif defined(_MSC_VER)
#ifndef ALIGN_STRUCT
#define ALIGN_STRUCT(x) __declspec(align(x))
#endif
#endif

typedef int16_t ALIGN_STRUCT(1) unaligned_int16_t;
typedef int32_t ALIGN_STRUCT(1) unaligned_int32_t;
typedef int64_t ALIGN_STRUCT(1) unaligned_int64_t;

// This program a set of unit tests to ensure that each SSE call provide the
// output we expect. If this fires an assert, then something didn't match up.
//
Expand All @@ -49,6 +62,10 @@ class SSE2NEONTestImpl : public SSE2NEONTest
int32_t *mTestIntPointer2;
float mTestFloats[MAX_TEST_VALUE];
int32_t mTestInts[MAX_TEST_VALUE];
int8_t mTestUnalignedInts[32] = {
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
};

virtual ~SSE2NEONTestImpl(void)
{
Expand Down Expand Up @@ -2141,7 +2158,8 @@ result_t test_mm_loadu_si16(const SSE2NEONTestImpl &impl, uint32_t iter)
#if (defined(__GNUC__) && !defined(__clang__)) && (__GNUC__ <= 10)
return TEST_UNIMPL;
#else
const int16_t *addr = (const int16_t *) impl.mTestIntPointer1;
const unaligned_int16_t *addr =
(const unaligned_int16_t *) (impl.mTestUnalignedInts + 1);

__m128i ret = _mm_loadu_si16((const void *) addr);

Expand All @@ -2157,7 +2175,8 @@ result_t test_mm_loadu_si64(const SSE2NEONTestImpl &impl, uint32_t iter)
#if (defined(__GNUC__) && !defined(__clang__)) && (__GNUC__ < 9)
return TEST_UNIMPL;
#else
const int64_t *addr = (const int64_t *) impl.mTestIntPointer1;
const unaligned_int64_t *addr =
(const unaligned_int64_t *) (impl.mTestUnalignedInts + 1);

__m128i ret = _mm_loadu_si64((const void *) addr);

Expand Down Expand Up @@ -5024,7 +5043,8 @@ result_t test_mm_loadu_pd(const SSE2NEONTestImpl &impl, uint32_t iter)

result_t test_mm_loadu_si128(const SSE2NEONTestImpl &impl, uint32_t iter)
{
const int32_t *_a = (const int32_t *) impl.mTestIntPointer1;
const unaligned_int32_t *_a =
(const unaligned_int32_t *) (impl.mTestUnalignedInts + 1);
__m128i c = _mm_loadu_si128((const __m128i *) _a);
return VALIDATE_INT32_M128(c, _a);
}
Expand All @@ -5037,7 +5057,8 @@ result_t test_mm_loadu_si32(const SSE2NEONTestImpl &impl, uint32_t iter)
#if (defined(__GNUC__) && !defined(__clang__)) && (__GNUC__ <= 10)
return TEST_UNIMPL;
#else
const int32_t *addr = (const int32_t *) impl.mTestIntPointer1;
const unaligned_int32_t *addr =
(const unaligned_int32_t *) (impl.mTestUnalignedInts + 1);

__m128i ret = _mm_loadu_si32((const void *) addr);

Expand Down

0 comments on commit 42c7047

Please sign in to comment.