From bdb51039c27321e2bebd115e7bb7e6dec01e28b4 Mon Sep 17 00:00:00 2001 From: Yang Hau Date: Sun, 25 Dec 2022 22:23:15 +0800 Subject: [PATCH] Optimize aeskeygenassist_si128 for Arm64 (#569) --- sse2neon.h | 26 ++++++++++++++++++++++---- tests/impl.cpp | 10 +++++----- 2 files changed, 27 insertions(+), 9 deletions(-) diff --git a/sse2neon.h b/sse2neon.h index c5c0adff..5dab7cca 100644 --- a/sse2neon.h +++ b/sse2neon.h @@ -9811,17 +9811,35 @@ FORCE_INLINE __m128i _mm_aesimc_si128(__m128i a) // https://kazakov.life/2017/11/01/cryptocurrency-mining-on-ios-devices/ // for details. // -// https://msdn.microsoft.com/en-us/library/cc714138(v=vs.120).aspx -FORCE_INLINE __m128i _mm_aeskeygenassist_si128(__m128i key, const int rcon) +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aeskeygenassist_si128 +FORCE_INLINE __m128i _mm_aeskeygenassist_si128(__m128i a, const int rcon) { - uint32_t X1 = _mm_cvtsi128_si32(_mm_shuffle_epi32(key, 0x55)); - uint32_t X3 = _mm_cvtsi128_si32(_mm_shuffle_epi32(key, 0xFF)); +#if defined(__aarch64__) + uint8x16_t _a = vreinterpretq_u8_m128i(a); + uint8x16_t v = vqtbl4q_u8(_sse2neon_vld1q_u8_x4(_sse2neon_sbox), _a); + v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_sbox + 0x40), _a - 0x40); + v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_sbox + 0x80), _a - 0x80); + v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_sbox + 0xc0), _a - 0xc0); + + uint32x4_t select_mask = {0xffffffff, 0x0, 0xffffffff, 0x0}; + uint64x2_t v_mask = vshrq_n_u64(vreinterpretq_u64_u8(v), 32); + uint32x4_t x = vbslq_u32(select_mask, vreinterpretq_u32_u64(v_mask), + vreinterpretq_u32_u8(v)); + uint32x4_t ror_x = vorrq_u32(vshrq_n_u32(x, 8), vshlq_n_u32(x, 24)); + uint32x4_t ror_xor_x = veorq_u32(ror_x, vdupq_n_u32(rcon)); + + return vreinterpretq_m128i_u32(vbslq_u32(select_mask, x, ror_xor_x)); + +#else /* ARMv7-A NEON implementation */ + uint32_t X1 = _mm_cvtsi128_si32(_mm_shuffle_epi32(a, 0x55)); + uint32_t X3 = _mm_cvtsi128_si32(_mm_shuffle_epi32(a, 0xFF)); for (int i = 0; i < 4; ++i) { ((uint8_t *) &X1)[i] = _sse2neon_sbox[((uint8_t *) &X1)[i]]; ((uint8_t *) &X3)[i] = _sse2neon_sbox[((uint8_t *) &X3)[i]]; } return _mm_set_epi32(((X3 >> 8) | (X3 << 24)) ^ rcon, X3, ((X1 >> 8) | (X1 << 24)) ^ rcon, X1); +#endif } #undef SSE2NEON_AES_SBOX #undef SSE2NEON_AES_RSBOX diff --git a/tests/impl.cpp b/tests/impl.cpp index aa2da9ed..7b720804 100644 --- a/tests/impl.cpp +++ b/tests/impl.cpp @@ -11628,12 +11628,12 @@ result_t test_mm_aesimc_si128(const SSE2NEONTestImpl &impl, uint32_t iter) return validate128(result_reference, result_intrinsic); } -static inline uint32_t sub_word(uint32_t key) +static inline uint32_t sub_word(uint32_t in) { - return (crypto_aes_sbox[(key >> 24) & 0xff] << 24) | - (crypto_aes_sbox[(key >> 16) & 0xff] << 16) | - (crypto_aes_sbox[(key >> 8) & 0xff] << 8) | - (crypto_aes_sbox[key & 0xff]); + return (crypto_aes_sbox[(in >> 24) & 0xff] << 24) | + (crypto_aes_sbox[(in >> 16) & 0xff] << 16) | + (crypto_aes_sbox[(in >> 8) & 0xff] << 8) | + (crypto_aes_sbox[in & 0xff]); } // FIXME: improve the test case for AES-256 key expansion.