Use unaligned data types for unaligned intrinsics #632

Merged 1 commit on May 20, 2024
sse2neon.h: 13 changes (9 additions, 4 deletions)

@@ -382,6 +382,11 @@ typedef float32x4_t __m128d;
 #endif
 typedef int64x2_t __m128i; /* 128-bit vector containing integers */
 
+// Some intrinsics operate on unaligned data types.
+typedef int16_t ALIGN_STRUCT(1) unaligned_int16_t;
+typedef int32_t ALIGN_STRUCT(1) unaligned_int32_t;
+typedef int64_t ALIGN_STRUCT(1) unaligned_int64_t;
+
 // __int64 is defined in the Intrinsics Guide which maps to different datatype
 // in different data model
 #if !(defined(_WIN32) || defined(_WIN64) || defined(__int64))
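
The key point is that GCC and Clang allow the aligned attribute on a typedef to lower a type's alignment, not only raise it. A minimal standalone sketch of the idea (not part of the patch; the names are illustrative):

#include <stdint.h>

/* On GCC/Clang, ALIGN_STRUCT(1) expands to __attribute__((aligned(1))).
 * A typedef with 1-byte alignment tells the compiler the pointee may
 * sit at any address, so dereferencing it is well-defined and gets
 * lowered to an alignment-safe load where the target requires one. */
typedef int32_t __attribute__((aligned(1))) unaligned_i32;

int32_t load32_any_address(const void *p)
{
    return *(const unaligned_i32 *) p; /* fine even if p is odd */
}

int32_t load32_assumes_alignment(const void *p)
{
    /* Undefined behavior when p is misaligned: the plain int32_t type
     * licenses the compiler to assume 4-byte alignment. */
    return *(const int32_t *) p;
}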
@@ -1927,15 +1932,15 @@ FORCE_INLINE __m128 _mm_loadu_ps(const float *p)
 FORCE_INLINE __m128i _mm_loadu_si16(const void *p)
 {
     return vreinterpretq_m128i_s16(
-        vsetq_lane_s16(*(const int16_t *) p, vdupq_n_s16(0), 0));
+        vsetq_lane_s16(*(const unaligned_int16_t *) p, vdupq_n_s16(0), 0));
 }
 
 // Load unaligned 64-bit integer from memory into the first element of dst.
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadu_si64
 FORCE_INLINE __m128i _mm_loadu_si64(const void *p)
 {
     return vreinterpretq_m128i_s64(
-        vcombine_s64(vld1_s64((const int64_t *) p), vdup_n_s64(0)));
+        vsetq_lane_s64(*(const unaligned_int64_t *) p, vdupq_n_s64(0), 0));
 }
 
 // Allocate size bytes of memory, aligned to the alignment specified in align,
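
Note that _mm_loadu_si64 drops vld1_s64 through a plain int64_t pointer in favor of dereferencing the aligned(1) typedef, so the compiler can no longer assume the source is 8-byte aligned. A hypothetical caller that depends on exactly this guarantee (the buffer and offset are made up for illustration):

#include <stdint.h>
#include "sse2neon.h"

/* Pull a 64-bit value out of a byte stream at an odd offset.
 * _mm_loadu_si64 must tolerate the misaligned address. */
int64_t read_s64_at_odd_offset(const uint8_t *stream)
{
    __m128i v = _mm_loadu_si64((const void *) (stream + 1));
    return _mm_cvtsi128_si64(v); /* extract lane 0 */
}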
@@ -4360,15 +4365,15 @@ FORCE_INLINE __m128d _mm_loadu_pd(const double *p)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadu_si128
 FORCE_INLINE __m128i _mm_loadu_si128(const __m128i *p)
 {
-    return vreinterpretq_m128i_s32(vld1q_s32((const int32_t *) p));
+    return vreinterpretq_m128i_s32(vld1q_s32((const unaligned_int32_t *) p));
 }
 
 // Load unaligned 32-bit integer from memory into the first element of dst.
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadu_si32
 FORCE_INLINE __m128i _mm_loadu_si32(const void *p)
 {
     return vreinterpretq_m128i_s32(
-        vsetq_lane_s32(*(const int32_t *) p, vdupq_n_s32(0), 0));
+        vsetq_lane_s32(*(const unaligned_int32_t *) p, vdupq_n_s32(0), 0));
 }
 
 // Multiply packed signed 16-bit integers in a and b, producing intermediate
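
At the instruction level, NEON loads such as vld1q_s32 already tolerate unaligned addresses; the casts matter at the C level, where loading through a plain int32_t pointer is undefined for misaligned input and optimizers may act on that assumption. A strictly portable alternative, which this patch does not use but which compilers lower to the same single load, is a fixed-size memcpy:

#include <stdint.h>
#include <string.h>

/* Portable unaligned 32-bit load (shown for comparison only, not the
 * approach taken by the patch): modern compilers turn this memcpy
 * into one plain load instruction. */
static inline int32_t load_s32(const void *p)
{
    int32_t v;
    memcpy(&v, p, sizeof v);
    return v;
}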
tests/impl.cpp: 29 changes (25 additions, 4 deletions)

@@ -28,6 +28,19 @@
 /* run the 1st parameter */
 #define IIF_1(t, ...) t
 
+// Some intrinsics operate on unaligned data types.
+#if defined(__GNUC__) || defined(__clang__)
+#define ALIGN_STRUCT(x) __attribute__((aligned(x)))
+#elif defined(_MSC_VER)
+#ifndef ALIGN_STRUCT
+#define ALIGN_STRUCT(x) __declspec(align(x))
+#endif
+#endif
+
+typedef int16_t ALIGN_STRUCT(1) unaligned_int16_t;
+typedef int32_t ALIGN_STRUCT(1) unaligned_int32_t;
+typedef int64_t ALIGN_STRUCT(1) unaligned_int64_t;
+
 // This program a set of unit tests to ensure that each SSE call provide the
 // output we expect. If this fires an assert, then something didn't match up.
 //
@@ -49,6 +62,10 @@ class SSE2NEONTestImpl : public SSE2NEONTest
     int32_t *mTestIntPointer2;
     float mTestFloats[MAX_TEST_VALUE];
    int32_t mTestInts[MAX_TEST_VALUE];
+    int8_t mTestUnalignedInts[32] = {
+        0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15,
+        16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
+    };
 
     virtual ~SSE2NEONTestImpl(void)
     {
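
mTestUnalignedInts gives the fixture a byte-typed buffer, and the tests below offset it by one to obtain a pointer that is not suitably aligned for the wider types. A standalone sketch of the pattern, with an explicit base alignment added here as an assumption so the misalignment is actually guaranteed:

#include <stdint.h>

/* If buf starts on an 8-byte boundary (an assumption forced here for
 * illustration; the test fixture itself does not force one), then
 * buf + 1 is misaligned for every type wider than a byte. */
static const int8_t __attribute__((aligned(8))) buf[32] = {
    0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15,
    16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
};
static const void *misaligned = buf + 1; /* odd address */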
@@ -2141,7 +2158,8 @@ result_t test_mm_loadu_si16(const SSE2NEONTestImpl &impl, uint32_t iter)
 #if (defined(__GNUC__) && !defined(__clang__)) && (__GNUC__ <= 10)
     return TEST_UNIMPL;
 #else
-    const int16_t *addr = (const int16_t *) impl.mTestIntPointer1;
+    const unaligned_int16_t *addr =
+        (const unaligned_int16_t *) (impl.mTestUnalignedInts + 1);
 
     __m128i ret = _mm_loadu_si16((const void *) addr);
 
@@ -2157,7 +2175,8 @@ result_t test_mm_loadu_si64(const SSE2NEONTestImpl &impl, uint32_t iter)
 #if (defined(__GNUC__) && !defined(__clang__)) && (__GNUC__ < 9)
     return TEST_UNIMPL;
 #else
-    const int64_t *addr = (const int64_t *) impl.mTestIntPointer1;
+    const unaligned_int64_t *addr =
+        (const unaligned_int64_t *) (impl.mTestUnalignedInts + 1);
 
     __m128i ret = _mm_loadu_si64((const void *) addr);
 
@@ -5024,7 +5043,8 @@ result_t test_mm_loadu_pd(const SSE2NEONTestImpl &impl, uint32_t iter)
 
 result_t test_mm_loadu_si128(const SSE2NEONTestImpl &impl, uint32_t iter)
 {
-    const int32_t *_a = (const int32_t *) impl.mTestIntPointer1;
+    const unaligned_int32_t *_a =
+        (const unaligned_int32_t *) (impl.mTestUnalignedInts + 1);
     __m128i c = _mm_loadu_si128((const __m128i *) _a);
     return VALIDATE_INT32_M128(c, _a);
 }
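
VALIDATE_INT32_M128 then checks the loaded vector lane by lane against the source. Conceptually it amounts to something like this (a hypothetical paraphrase; the project's macro differs in detail):

#include <stdint.h>
#include <string.h>
#include "sse2neon.h"

/* Every 32-bit lane of c must equal the corresponding value at the
 * (possibly unaligned) source address. */
static int lanes_match(__m128i c, const void *src)
{
    int32_t expect[4], got[4];
    memcpy(expect, src, sizeof expect); /* unaligned-safe read */
    _mm_storeu_si128((__m128i *) got, c);
    for (int i = 0; i < 4; ++i)
        if (got[i] != expect[i])
            return 0;
    return 1;
}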
@@ -5037,7 +5057,8 @@ result_t test_mm_loadu_si32(const SSE2NEONTestImpl &impl, uint32_t iter)
 #if (defined(__GNUC__) && !defined(__clang__)) && (__GNUC__ <= 10)
     return TEST_UNIMPL;
 #else
-    const int32_t *addr = (const int32_t *) impl.mTestIntPointer1;
+    const unaligned_int32_t *addr =
+        (const unaligned_int32_t *) (impl.mTestUnalignedInts + 1);
 
     __m128i ret = _mm_loadu_si32((const void *) addr);
