Skip to content

Commit

Permalink
Optimize aeskeygenassist_si128 for Arm64 (#569)
Browse files Browse the repository at this point in the history
  • Loading branch information
howjmay authored Dec 25, 2022
1 parent 18c6f20 commit bdb5103
Show file tree
Hide file tree
Showing 2 changed files with 27 additions and 9 deletions.
26 changes: 22 additions & 4 deletions sse2neon.h
Original file line number Diff line number Diff line change
Expand Up @@ -9811,17 +9811,35 @@ FORCE_INLINE __m128i _mm_aesimc_si128(__m128i a)
// https://kazakov.life/2017/11/01/cryptocurrency-mining-on-ios-devices/
// for details.
//
// https://msdn.microsoft.com/en-us/library/cc714138(v=vs.120).aspx
FORCE_INLINE __m128i _mm_aeskeygenassist_si128(__m128i key, const int rcon)
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aeskeygenassist_si128
FORCE_INLINE __m128i _mm_aeskeygenassist_si128(__m128i a, const int rcon)
{
uint32_t X1 = _mm_cvtsi128_si32(_mm_shuffle_epi32(key, 0x55));
uint32_t X3 = _mm_cvtsi128_si32(_mm_shuffle_epi32(key, 0xFF));
#if defined(__aarch64__)
uint8x16_t _a = vreinterpretq_u8_m128i(a);
uint8x16_t v = vqtbl4q_u8(_sse2neon_vld1q_u8_x4(_sse2neon_sbox), _a);
v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_sbox + 0x40), _a - 0x40);
v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_sbox + 0x80), _a - 0x80);
v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_sbox + 0xc0), _a - 0xc0);

uint32x4_t select_mask = {0xffffffff, 0x0, 0xffffffff, 0x0};
uint64x2_t v_mask = vshrq_n_u64(vreinterpretq_u64_u8(v), 32);
uint32x4_t x = vbslq_u32(select_mask, vreinterpretq_u32_u64(v_mask),
vreinterpretq_u32_u8(v));
uint32x4_t ror_x = vorrq_u32(vshrq_n_u32(x, 8), vshlq_n_u32(x, 24));
uint32x4_t ror_xor_x = veorq_u32(ror_x, vdupq_n_u32(rcon));

return vreinterpretq_m128i_u32(vbslq_u32(select_mask, x, ror_xor_x));

#else /* ARMv7-A NEON implementation */
uint32_t X1 = _mm_cvtsi128_si32(_mm_shuffle_epi32(a, 0x55));
uint32_t X3 = _mm_cvtsi128_si32(_mm_shuffle_epi32(a, 0xFF));
for (int i = 0; i < 4; ++i) {
((uint8_t *) &X1)[i] = _sse2neon_sbox[((uint8_t *) &X1)[i]];
((uint8_t *) &X3)[i] = _sse2neon_sbox[((uint8_t *) &X3)[i]];
}
return _mm_set_epi32(((X3 >> 8) | (X3 << 24)) ^ rcon, X3,
((X1 >> 8) | (X1 << 24)) ^ rcon, X1);
#endif
}
#undef SSE2NEON_AES_SBOX
#undef SSE2NEON_AES_RSBOX
Expand Down
10 changes: 5 additions & 5 deletions tests/impl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -11628,12 +11628,12 @@ result_t test_mm_aesimc_si128(const SSE2NEONTestImpl &impl, uint32_t iter)
return validate128(result_reference, result_intrinsic);
}

static inline uint32_t sub_word(uint32_t key)
static inline uint32_t sub_word(uint32_t in)
{
return (crypto_aes_sbox[(key >> 24) & 0xff] << 24) |
(crypto_aes_sbox[(key >> 16) & 0xff] << 16) |
(crypto_aes_sbox[(key >> 8) & 0xff] << 8) |
(crypto_aes_sbox[key & 0xff]);
return (crypto_aes_sbox[(in >> 24) & 0xff] << 24) |
(crypto_aes_sbox[(in >> 16) & 0xff] << 16) |
(crypto_aes_sbox[(in >> 8) & 0xff] << 8) |
(crypto_aes_sbox[in & 0xff]);
}

// FIXME: improve the test case for AES-256 key expansion.
Expand Down

0 comments on commit bdb5103

Please sign in to comment.