Skip to content

Commit

Permalink
Merge pull request #567 from howjmay/aesimc_si128
Browse files Browse the repository at this point in the history
feat: Add _mm_aesimc_si128
  • Loading branch information
jserv authored Dec 25, 2022
2 parents 6cd92d9 + fd43122 commit 18c6f20
Show file tree
Hide file tree
Showing 3 changed files with 119 additions and 20 deletions.
107 changes: 87 additions & 20 deletions sse2neon.h
Original file line number Diff line number Diff line change
Expand Up @@ -9490,6 +9490,16 @@ static const uint8_t _sse2neon_sbox[256] = SSE2NEON_AES_SBOX(SSE2NEON_AES_H0);
static const uint8_t _sse2neon_rsbox[256] = SSE2NEON_AES_RSBOX(SSE2NEON_AES_H0);
#undef SSE2NEON_AES_H0

/* x_time function and matrix multiply function */
#if !defined(__aarch64__)
#define SSE2NEON_XT(x) (((x) << 1) ^ ((((x) >> 7) & 1) * 0x1b))
#define SSE2NEON_MULTIPLY(x, y) \
(((y & 1) * x) ^ ((y >> 1 & 1) * SSE2NEON_XT(x)) ^ \
((y >> 2 & 1) * SSE2NEON_XT(SSE2NEON_XT(x))) ^ \
((y >> 3 & 1) * SSE2NEON_XT(SSE2NEON_XT(SSE2NEON_XT(x)))) ^ \
((y >> 4 & 1) * SSE2NEON_XT(SSE2NEON_XT(SSE2NEON_XT(SSE2NEON_XT(x))))))
#endif

// In the absence of crypto extensions, implement aesenc using regular neon
// intrinsics instead. See:
// https://www.workofard.com/2017/01/accelerated-aes-for-the-arm64-linux-kernel/
Expand All @@ -9503,8 +9513,10 @@ FORCE_INLINE __m128i _mm_aesenc_si128(__m128i a, __m128i RoundKey)
0x0, 0x5, 0xa, 0xf, 0x4, 0x9, 0xe, 0x3,
0x8, 0xd, 0x2, 0x7, 0xc, 0x1, 0x6, 0xb,
};
static const uint8_t ror32by8[] = {0x1, 0x2, 0x3, 0x0, 0x5, 0x6, 0x7, 0x4,
0x9, 0xa, 0xb, 0x8, 0xd, 0xe, 0xf, 0xc};
static const uint8_t ror32by8[] = {
0x1, 0x2, 0x3, 0x0, 0x5, 0x6, 0x7, 0x4,
0x9, 0xa, 0xb, 0x8, 0xd, 0xe, 0xf, 0xc,
};

uint8x16_t v;
uint8x16_t w = vreinterpretq_u8_m128i(a);
Expand Down Expand Up @@ -9598,8 +9610,10 @@ FORCE_INLINE __m128i _mm_aesdec_si128(__m128i a, __m128i RoundKey)
0x0, 0xd, 0xa, 0x7, 0x4, 0x1, 0xe, 0xb,
0x8, 0x5, 0x2, 0xf, 0xc, 0x9, 0x6, 0x3,
};
static const uint8_t ror32by8[] = {0x1, 0x2, 0x3, 0x0, 0x5, 0x6, 0x7, 0x4,
0x9, 0xa, 0xb, 0x8, 0xd, 0xe, 0xf, 0xc};
static const uint8_t ror32by8[] = {
0x1, 0x2, 0x3, 0x0, 0x5, 0x6, 0x7, 0x4,
0x9, 0xa, 0xb, 0x8, 0xd, 0xe, 0xf, 0xc,
};

uint8x16_t v;
uint8x16_t w = vreinterpretq_u8_m128i(a);
Expand Down Expand Up @@ -9629,35 +9643,29 @@ FORCE_INLINE __m128i _mm_aesdec_si128(__m128i a, __m128i RoundKey)
return vreinterpretq_m128i_u8(w) ^ RoundKey;

#else /* ARMv7-A NEON implementation */
/* FIXME: optimized for NEON */
#define XT(x) (((x) << 1) ^ ((((x) >> 7) & 1) * 0x1b))
#define MULTIPLY(x, y) \
(((y & 1) * x) ^ ((y >> 1 & 1) * XT(x)) ^ ((y >> 2 & 1) * XT(XT(x))) ^ \
((y >> 3 & 1) * XT(XT(XT(x)))) ^ ((y >> 4 & 1) * XT(XT(XT(XT(x))))))

/* FIXME: optimized for NEON */
uint8_t i, e, f, g, h, v[4][4];
uint8_t *_a = (uint8_t *) &a;
for (i = 0; i < 16; ++i) {
v[((i / 4) + (i % 4)) % 4][i % 4] = _sse2neon_rsbox[_a[i]];
}

// inverse mix columns
for (i = 0; i < 4; ++i) {
e = v[i][0];
f = v[i][1];
g = v[i][2];
h = v[i][3];

v[i][0] = MULTIPLY(e, 0x0e) ^ MULTIPLY(f, 0x0b) ^ MULTIPLY(g, 0x0d) ^
MULTIPLY(h, 0x09);
v[i][1] = MULTIPLY(e, 0x09) ^ MULTIPLY(f, 0x0e) ^ MULTIPLY(g, 0x0b) ^
MULTIPLY(h, 0x0d);
v[i][2] = MULTIPLY(e, 0x0d) ^ MULTIPLY(f, 0x09) ^ MULTIPLY(g, 0x0e) ^
MULTIPLY(h, 0x0b);
v[i][3] = MULTIPLY(e, 0x0b) ^ MULTIPLY(f, 0x0d) ^ MULTIPLY(g, 0x09) ^
MULTIPLY(h, 0x0e);
v[i][0] = SSE2NEON_MULTIPLY(e, 0x0e) ^ SSE2NEON_MULTIPLY(f, 0x0b) ^
SSE2NEON_MULTIPLY(g, 0x0d) ^ SSE2NEON_MULTIPLY(h, 0x09);
v[i][1] = SSE2NEON_MULTIPLY(e, 0x09) ^ SSE2NEON_MULTIPLY(f, 0x0e) ^
SSE2NEON_MULTIPLY(g, 0x0b) ^ SSE2NEON_MULTIPLY(h, 0x0d);
v[i][2] = SSE2NEON_MULTIPLY(e, 0x0d) ^ SSE2NEON_MULTIPLY(f, 0x09) ^
SSE2NEON_MULTIPLY(g, 0x0e) ^ SSE2NEON_MULTIPLY(h, 0x0b);
v[i][3] = SSE2NEON_MULTIPLY(e, 0x0b) ^ SSE2NEON_MULTIPLY(f, 0x0d) ^
SSE2NEON_MULTIPLY(g, 0x09) ^ SSE2NEON_MULTIPLY(h, 0x0e);
}
#undef XT
#undef MULTIPLY

return vreinterpretq_m128i_u8(vld1q_u8((uint8_t *) v)) ^ RoundKey;
#endif
Expand Down Expand Up @@ -9751,6 +9759,53 @@ FORCE_INLINE __m128i _mm_aesdeclast_si128(__m128i a, __m128i RoundKey)
#endif
}

// Perform the InvMixColumns transformation on a and store the result in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aesimc_si128
FORCE_INLINE __m128i _mm_aesimc_si128(__m128i a)
{
#if defined(__aarch64__)
static const uint8_t ror32by8[] = {
0x1, 0x2, 0x3, 0x0, 0x5, 0x6, 0x7, 0x4,
0x9, 0xa, 0xb, 0x8, 0xd, 0xe, 0xf, 0xc,
};
uint8x16_t v = vreinterpretq_u8_m128i(a);
uint8x16_t w;

// multiplying 'v' by 4 in GF(2^8)
w = (v << 1) ^ (uint8x16_t) (((int8x16_t) v >> 7) & 0x1b);
w = (w << 1) ^ (uint8x16_t) (((int8x16_t) w >> 7) & 0x1b);
v ^= w;
v ^= (uint8x16_t) vrev32q_u16((uint16x8_t) w);

// multiplying 'v' by 2 in GF(2^8)
w = (v << 1) ^ (uint8x16_t) (((int8x16_t) v >> 7) & 0x1b);
w ^= (uint8x16_t) vrev32q_u16((uint16x8_t) v);
w ^= vqtbl1q_u8(v ^ w, vld1q_u8(ror32by8));
return vreinterpretq_m128i_u8(w);

#else /* ARMv7-A NEON implementation */
uint8_t i, e, f, g, h, v[4][4];
vst1q_u8((uint8_t *) v, vreinterpretq_u8_m128i(a));
for (i = 0; i < 4; ++i) {
e = v[i][0];
f = v[i][1];
g = v[i][2];
h = v[i][3];

v[i][0] = SSE2NEON_MULTIPLY(e, 0x0e) ^ SSE2NEON_MULTIPLY(f, 0x0b) ^
SSE2NEON_MULTIPLY(g, 0x0d) ^ SSE2NEON_MULTIPLY(h, 0x09);
v[i][1] = SSE2NEON_MULTIPLY(e, 0x09) ^ SSE2NEON_MULTIPLY(f, 0x0e) ^
SSE2NEON_MULTIPLY(g, 0x0b) ^ SSE2NEON_MULTIPLY(h, 0x0d);
v[i][2] = SSE2NEON_MULTIPLY(e, 0x0d) ^ SSE2NEON_MULTIPLY(f, 0x09) ^
SSE2NEON_MULTIPLY(g, 0x0e) ^ SSE2NEON_MULTIPLY(h, 0x0b);
v[i][3] = SSE2NEON_MULTIPLY(e, 0x0b) ^ SSE2NEON_MULTIPLY(f, 0x0d) ^
SSE2NEON_MULTIPLY(g, 0x09) ^ SSE2NEON_MULTIPLY(h, 0x0e);
}

return vreinterpretq_m128i_u8(vld1q_u8((uint8_t *) v));
#endif
}

// Emits the Advanced Encryption Standard (AES) instruction aeskeygenassist.
// This instruction generates a round key for AES encryption. See
// https://kazakov.life/2017/11/01/cryptocurrency-mining-on-ios-devices/
Expand All @@ -9771,6 +9826,11 @@ FORCE_INLINE __m128i _mm_aeskeygenassist_si128(__m128i key, const int rcon)
#undef SSE2NEON_AES_SBOX
#undef SSE2NEON_AES_RSBOX

#if defined(__aarch64__)
#undef SSE2NEON_XT
#undef SSE2NEON_MULTIPLY
#endif

#else /* __ARM_FEATURE_CRYPTO */
// Implements equivalent of 'aesenc' by combining AESE (with an empty key) and
// AESMC and then manually applying the real key as an xor operation. This
Expand Down Expand Up @@ -9815,6 +9875,13 @@ FORCE_INLINE __m128i _mm_aesdeclast_si128(__m128i a, __m128i RoundKey)
vreinterpretq_u8_m128i(RoundKey);
}

// Perform the InvMixColumns transformation on a and store the result in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aesimc_si128
FORCE_INLINE __m128i _mm_aesimc_si128(__m128i a)
{
return vreinterpretq_m128i_u8(vaesimcq_u8(a));
}

// Assist in expanding the AES cipher key by computing steps towards generating
// a round key for encryption cipher using data from a and an 8-bit round
// constant specified in imm8, and store the result in dst."
Expand Down
31 changes: 31 additions & 0 deletions tests/impl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -11597,6 +11597,37 @@ result_t test_mm_aesdeclast_si128(const SSE2NEONTestImpl &impl, uint32_t iter)
return validate128(result_reference, result_intrinsic);
}

result_t test_mm_aesimc_si128(const SSE2NEONTestImpl &impl, uint32_t iter)
{
const uint8_t *a = (uint8_t *) impl.mTestIntPointer1;
__m128i _a = _mm_loadu_si128((const __m128i *) a);

uint8_t e, f, g, h, v[4][4];
for (int i = 0; i < 16; ++i) {
((uint8_t *) v)[i] = a[i];
}
for (int i = 0; i < 4; ++i) {
e = v[i][0];
f = v[i][1];
g = v[i][2];
h = v[i][3];

v[i][0] = MULTIPLY(e, 0x0e) ^ MULTIPLY(f, 0x0b) ^ MULTIPLY(g, 0x0d) ^
MULTIPLY(h, 0x09);
v[i][1] = MULTIPLY(e, 0x09) ^ MULTIPLY(f, 0x0e) ^ MULTIPLY(g, 0x0b) ^
MULTIPLY(h, 0x0d);
v[i][2] = MULTIPLY(e, 0x0d) ^ MULTIPLY(f, 0x09) ^ MULTIPLY(g, 0x0e) ^
MULTIPLY(h, 0x0b);
v[i][3] = MULTIPLY(e, 0x0b) ^ MULTIPLY(f, 0x0d) ^ MULTIPLY(g, 0x09) ^
MULTIPLY(h, 0x0e);
}

__m128i result_reference = _mm_loadu_si128((const __m128i *) v);
__m128i result_intrinsic = _mm_aesimc_si128(_a);

return validate128(result_reference, result_intrinsic);
}

static inline uint32_t sub_word(uint32_t key)
{
return (crypto_aes_sbox[(key >> 24) & 0xff] << 24) |
Expand Down
1 change: 1 addition & 0 deletions tests/impl.h
Original file line number Diff line number Diff line change
Expand Up @@ -527,6 +527,7 @@
_(mm_aesdec_si128) \
_(mm_aesenclast_si128) \
_(mm_aesdeclast_si128) \
_(mm_aesimc_si128) \
_(mm_aeskeygenassist_si128) \
/* Others */ \
_(mm_clmulepi64_si128) \
Expand Down

0 comments on commit 18c6f20

Please sign in to comment.