Skip to content

Commit

Permalink
Merge pull request #1660 from Gorachya/master
Browse files Browse the repository at this point in the history
Add SM2_mont_mul_2_way for NEON
  • Loading branch information
guanzhi authored Apr 20, 2024
2 parents f3fb2ee + bab1316 commit afd9682
Show file tree
Hide file tree
Showing 2 changed files with 124 additions and 1 deletion.
6 changes: 6 additions & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -267,6 +267,12 @@ if (ENABLE_SM2_Z256_ARMV8)
list(APPEND src src/sm2_z256_armv8.S)
endif()

option(ENABLE_SM2_NEON "Enable SM2 NEON intrinsics" OFF)
if (ENABLE_SM2_NEON)
message(STATUS "ENABLE_SM2_NEON is ON")
add_definitions(-DENABLE_SM2_NEON)
endif()

option(ENABLE_SM9_Z256_ARMV8 "Enable SM9_Z256 ARMv8 assembly" OFF)
if (ENABLE_SM9_Z256_ARMV8)
message(STATUS "ENABLE_SM9_Z256_ARMV8 is ON")
Expand Down
119 changes: 118 additions & 1 deletion src/sm2_z256.c
Original file line number Diff line number Diff line change
Expand Up @@ -481,7 +481,124 @@ const uint64_t SM2_Z256_P_PRIME[4] = {
// mont(1) (mod p) = 2^256 mod p = 2^256 - p
const uint64_t *SM2_Z256_MODP_MONT_ONE = SM2_Z256_NEG_P;

#ifndef ENABLE_SM2_Z256_ARMV8
#if defined(ENABLE_SM2_Z256_ARMV8)
// src/sm2_z256_armv8.S
#elif defined(ENABLE_SM2_Z256_NEON)
#include <arm_neon.h>

// precompute <<= 32
// How to use special values of SM2_Z256_P?
const uint64_t SM2_Z256_P_LEFT_32[8] = {
0xffffffff00000000, 0xffffffff00000000, 0x0000000000000000, 0xffffffff00000000,
0xffffffff00000000, 0xffffffff00000000, 0xffffffff00000000, 0xfffffffe00000000
};

//const uint32_t SM2_Z256_MU_32 = 0xffffffff; // -1

void sm2_z256_modp_mont_mul(uint64_t r[4], const uint64_t a[4], const uint64_t b[4])
{
int i;
uint32_t a_[8], b_[8];

for (i = 0; i < 4; ++i) {
a_[2 * i] = a[i] & 0xffffffff;
a_[2 * i + 1] = a[i] >> 32;
b_[2 * i] = b[i] & 0xffffffff;
b_[2 * i + 1] = b[i] >> 32;
}

uint64x2_t d0, d1, d2, d3, d4, d5, d6, d7;
uint64x2_t t, low32 = vmovq_n_u64(0xffffffff);
uint32x2_t w0, w1;
uint64_t q, d[16] = {};
//uint32_t pre = SM2_Z256_MU_32 * b_[0]; // pre = -b_[0]

d0 = vmovq_n_u64(0);
d1 = vmovq_n_u64(0);
d2 = vmovq_n_u64(0);
d3 = vmovq_n_u64(0);
d4 = vmovq_n_u64(0);
d5 = vmovq_n_u64(0);
d6 = vmovq_n_u64(0);
d7 = vmovq_n_u64(0);

for (i = 0; i < 8; i++) {
q = -b_[0] * a_[i] + d[1] - d[0];
q <<= 32;

w0 = vcreate_u32(a_[i] | q);
w1 = vcreate_u32(b_[0] | SM2_Z256_P_LEFT_32[0]);
t = vmlal_u32(d0, w0, w1);
t = vshrq_n_u64(t, 32);

w1 = vcreate_u32(b_[1] | SM2_Z256_P_LEFT_32[1]);
t = vmlal_u32(t, w0, w1);
t = vaddq_u64(t, d1);
d0 = vandq_u64(t, low32);
t = vshrq_n_u64(t, 32);

w1 = vcreate_u32(b_[2] | SM2_Z256_P_LEFT_32[2]);
t = vmlal_u32(t, w0, w1);
t = vaddq_u64(t, d2);
d1 = vandq_u64(t, low32);
t = vshrq_n_u64(t, 32);

w1 = vcreate_u32(b_[3] | SM2_Z256_P_LEFT_32[3]);
t = vmlal_u32(t, w0, w1);
t = vaddq_u64(t, d3);
d2 = vandq_u64(t, low32);
t = vshrq_n_u64(t, 32);

w1 = vcreate_u32(b_[4] | SM2_Z256_P_LEFT_32[4]);
t = vmlal_u32(t, w0, w1);
t = vaddq_u64(t, d4);
d3 = vandq_u64(t, low32);
t = vshrq_n_u64(t, 32);

w1 = vcreate_u32(b_[5] | SM2_Z256_P_LEFT_32[5]);
t = vmlal_u32(t, w0, w1);
t = vaddq_u64(t, d5);
d4 = vandq_u64(t, low32);
t = vshrq_n_u64(t, 32);

w1 = vcreate_u32(b_[6] | SM2_Z256_P_LEFT_32[6]);
t = vmlal_u32(t, w0, w1);
t = vaddq_u64(t, d6);
d5 = vandq_u64(t, low32);
t = vshrq_n_u64(t, 32);

w1 = vcreate_u32(b_[7] | SM2_Z256_P_LEFT_32[7]);
t = vmlal_u32(t, w0, w1);
t = vaddq_u64(t, d7);
d6 = vandq_u64(t, low32);

d7 = vshrq_n_u64(t, 32);

vst1q_u64(d, d0);
}

vst1q_u64(d, d0);
vst1q_u64(d + 2, d1);
vst1q_u64(d + 4, d2);
vst1q_u64(d + 6, d3);
vst1q_u64(d + 8, d4);
vst1q_u64(d + 10, d5);
vst1q_u64(d + 12, d6);
vst1q_u64(d + 14, d7);

uint64_t e[4], f[4];
for (i = 0; i < 4; ++i) {
e[i] = d[4 * i] | d[4 * i + 2] << 32;
f[i] = d[4 * i + 1] | d[4 * i + 3] << 32;
}

if (sm2_z256_sub(r, e, f)) {
sm2_z256_add(r, r, SM2_Z256_P);
}
}

#else // ENABLE_SM2_Z256_NEON

// z = a*b
// c = (z + (z * p' mod 2^256) * p)/2^256
void sm2_z256_modp_mont_mul(uint64_t r[4], const uint64_t a[4], const uint64_t b[4])
Expand Down

0 comments on commit afd9682

Please sign in to comment.