diff --git a/include/mcl/bn.hpp b/include/mcl/bn.hpp index b6b3bb65..80a5fbde 100644 --- a/include/mcl/bn.hpp +++ b/include/mcl/bn.hpp @@ -726,34 +726,13 @@ struct GLV1 : mcl::GLV1T { } static inline void optimizedSplitForBLS12_381(mpz_class u[2], const mpz_class& x) { - assert(sizeof(Unit) == 8); - /* - z = -0xd201000000010000 - L = z^2-1 = 0xac45a4010001a40200000000ffffffff - r = L^2+L+1 = 0x73eda753299d7d483339d80809a1d80553bda402fffe5bfeffffffff00000001 - s=255 - v = 0xbe35f678f00fd56eb1fb72917b67f718 - */ - mpz_class& a = u[0]; - mpz_class& b = u[1]; - static const uint64_t Lv[] = { 0x00000000ffffffff, 0xac45a4010001a402 }; - static const uint64_t vv[] = { 0xb1fb72917b67f718, 0xbe35f678f00fd56e }; static const size_t n = 128 / mcl::UnitBitSize; - Unit t[n*3]; - // n = 128 bit - // t[n*3] = x[n*2] * vv[n] - mcl::bint::mulNM(t, gmp::getUnit(x), n*2, (const Unit*)vv, n); - // t[n] <- t[n*3] - mcl::bint::shrT(t, t+n*2-1, mcl::UnitBitSize-1); // >>255 + Unit xa[n*2], a[2], b[2]; + mcl::gmp::getArray(xa, n*2, x); + ec::local::optimizedSplitRawForBLS12_381(a, b, xa); bool dummy; - gmp::setArray(&dummy, b, t, n); - Unit t2[n*2]; - // t2[n*2] = t[n] * Lv[n] - // Do not overlap I/O buffers on pre-Broadwell CPUs. - mcl::bint::mulT(t2, t, (const Unit*)Lv); - // t[n] = x[n*2] - t2[n*2] - mcl::bint::subT(t, gmp::getUnit(x), t2); - gmp::setArray(&dummy, a, t, n); + gmp::setArray(&dummy, u[0], a, n); + gmp::setArray(&dummy, u[1], b, n); (void)dummy; } }; diff --git a/include/mcl/ec.hpp b/include/mcl/ec.hpp index 4230295a..847802c0 100644 --- a/include/mcl/ec.hpp +++ b/include/mcl/ec.hpp @@ -242,6 +242,35 @@ void normalizeVecT(Eout& Q, Ein& P, size_t n, size_t N = 256) } } +inline void optimizedSplitRawForBLS12_381(Unit a[2], Unit b[2], const Unit x[4]) +{ + assert(sizeof(Unit) == 8); + /* + z = -0xd201000000010000 + L = z^2-1 = 0xac45a4010001a40200000000ffffffff + r = L^2+L+1 = 0x73eda753299d7d483339d80809a1d80553bda402fffe5bfeffffffff00000001 + s=255 + v = 0xbe35f678f00fd56eb1fb72917b67f718 + */ + static const uint64_t Lv[] = { 0x00000000ffffffff, 0xac45a4010001a402 }; + static const uint64_t vv[] = { 0xb1fb72917b67f718, 0xbe35f678f00fd56e }; + static const size_t n = 128 / mcl::UnitBitSize; + Unit t[n*3]; + // n = 128 bit + // t[n*3] = x[n*2] * vv[n] + mcl::bint::mulNM(t, x, n*2, vv, n); + // b[n] = t[n*3]>>255 + mcl::bint::shrT(t, t+n*2-1, mcl::UnitBitSize-1); // >>255 + b[0] = t[0]; + b[1] = t[1]; + Unit t2[n*2]; + // t2[n*2] = t[n] * Lv[n] + // Do not overlap I/O buffers on pre-Broadwell CPUs. + mcl::bint::mulT(t2, t, Lv); + // a[n] = x[n*2] - t2[n*2] + mcl::bint::subT(a, x, t2); +} + } // mcl::ec::local // [X:Y:Z] as Proj = (X/Z, Y/Z) as Affine = [XZ:YZ^2:Z] as Jacobi diff --git a/src/msm_avx.cpp b/src/msm_avx.cpp index f14b596e..ae03f2c9 100644 --- a/src/msm_avx.cpp +++ b/src/msm_avx.cpp @@ -471,27 +471,6 @@ inline Vec getUnitAt(const Vec *x, size_t xN, size_t bitPos) return vor(vpsrlq(x[q], r), vpsllq(x[q+1], bitSize - r)); } -inline void split(Unit a[2], Unit b[2], const Unit x[4]) -{ - /* - z = -0xd201000000010000 - L = z^2-1 = 0xac45a4010001a40200000000ffffffff - r = L^2+L+1 = 0x73eda753299d7d483339d80809a1d80553bda402fffe5bfeffffffff00000001 - s=255 - v = 0xbe35f678f00fd56eb1fb72917b67f718 - */ - static const uint64_t Lv[] = { 0x00000000ffffffff, 0xac45a4010001a402 }; - static const uint64_t vv[] = { 0xb1fb72917b67f718, 0xbe35f678f00fd56e }; - static const size_t n = 128 / mcl::UnitBitSize; - Unit t[n*3]; - mcl::bint::mulNM(t, x, n*2, vv, n); - mcl::bint::shrT(t, t+n*2-1, mcl::UnitBitSize-1); // >>255 - b[0] = t[0]; - b[1] = t[1]; - mcl::bint::mulT(t, t, Lv); - mcl::bint::subT(a, x, t); -} - class Montgomery { Unit v_[N]; public: @@ -1120,7 +1099,7 @@ struct EcM { static void mulGLV(EcM& Q, const EcM& _P, const Vec y[4]) { EcM P = _P; - if (!isProj) mcl::ec::ProjToJacobi(P, _P); +// if (!isProj) mcl::ec::ProjToJacobi(P, _P); Vec a[2], b[2]; EcM tbl1[tblN], tbl2[tblN]; makeTable(tbl1, P); @@ -1134,7 +1113,7 @@ struct EcM { for (size_t i = 0; i < M; i++) { Unit buf[4] = { src[i+M*0], src[i+M*1], src[i+M*2], src[i+M*3] }; Unit aa[2], bb[2]; - split(aa, bb, buf); + mcl::ec::local::optimizedSplitRawForBLS12_381(aa, bb, buf); pa[i+M*0] = aa[0]; pa[i+M*1] = aa[1]; pb[i+M*0] = bb[0]; pb[i+M*1] = bb[1]; } @@ -1167,7 +1146,7 @@ struct EcM { mul(T, T, b, 2); add(Q, Q, T); #endif - if (!isProj) mcl::ec::JacobiToProj(Q, Q); +// if (!isProj) mcl::ec::JacobiToProj(Q, Q); } static void mulGLVbn(mcl::msm::G1A _Q[8], mcl::msm::G1A _P[8], const Vec y[4]) { @@ -1356,7 +1335,7 @@ void mulVecAVX512(Unit *_P, Unit *_x, const Unit *_y, size_t n) Unit ya[4]; fr->fromMont(ya, y[i*8+j].v); Unit a[2], b[2]; - split(a, b, ya); + mcl::ec::local::optimizedSplitRawForBLS12_381(a, b, ya); py[j+0] = a[0]; py[j+8] = a[1]; py[j+16] = b[0]; @@ -1393,7 +1372,7 @@ void mulEachAVX512(Unit *_x, const Unit *_y, size_t n) const bool mixed = true; mcl::msm::G1A *x = (mcl::msm::G1A*)_x; const mcl::msm::FrA *y = (const mcl::msm::FrA*)_y; - g_param.normalizeVecG1(x, x, n); + if (!isProj) g_param.normalizeVecG1(x, x, n); for (size_t i = 0; i < n; i += 8) { EcM P; Vec yv[4];