Skip to content

Commit

Permalink
improve performance of mul/wasm a little by reducing conversion
Browse files Browse the repository at this point in the history
  • Loading branch information
herumi committed Sep 27, 2024
1 parent 595ee0b commit 71508ea
Show file tree
Hide file tree
Showing 4 changed files with 39 additions and 44 deletions.
2 changes: 1 addition & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -443,7 +443,7 @@ endif

# test
bin/emu:
$(CXX) -g -o $@ src/fp.cpp src/bn_c384_256.cpp test/bn_c384_256_test.cpp -DMCL_DONT_USE_XBYAK -DMCL_SIZEOF_UNIT=$(MCL_SIZEOF_UNIT) -DMCL_MAX_BIT_SIZE=384 -I./include -DMCL_BINT_ASM=0 -DMCL_MSM=0
$(CXX) -g -o $@ src/fp.cpp src/bn_c384_256.cpp test/bn_c384_256_test.cpp -DMCL_DONT_USE_XBYAK -DMCL_SIZEOF_UNIT=$(MCL_SIZEOF_UNIT) -DMCL_MAX_BIT_SIZE=384 -I./include -DMCL_BINT_ASM=0 -DMCL_MSM=0 $(CFLAGS_USER)
bin/pairing_c_min.exe: sample/pairing_c.c include/mcl/vint.hpp src/fp.cpp include/mcl/bn.hpp
$(CXX) -std=c++03 -O3 -g -fno-threadsafe-statics -fno-exceptions -fno-rtti -o $@ sample/pairing_c.c src/fp.cpp src/bn_c384_256.cpp -I./include -DXBYAK_NO_EXCEPTION -DMCL_SIZEOF_UNIT=$(MCL_SIZEOF_UNIT) -DMCL_MAX_BIT_SIZE=384 -DCYBOZU_DONT_USE_STRING -DCYBOZU_DONT_USE_EXCEPTION -DNDEBUG -DMCL_BINT_ASM=0 -DMCL_MSM=0 # -DMCL_DONT_USE_CSPRNG
bin/ecdsa-emu:
Expand Down
18 changes: 9 additions & 9 deletions include/mcl/bint.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -143,15 +143,15 @@ inline uint64_t divUnit1(uint64_t *pr, uint64_t H, uint64_t L, uint64_t y)
// z[N] = x[N] + y[N] and return CF(0 or 1)
template<size_t N>Unit addT(Unit *z, const Unit *x, const Unit *y);
// z[N] = x[N] - y[N] and return CF(0 or 1)
template<size_t N>Unit subT(Unit *z, const Unit *x, const Unit *y);
template<size_t N, typename T>Unit subT(Unit *z, const T *x, const Unit *y);
// z[N] = x[N] + y[N]. assume x, y are Not Full bit
template<size_t N>void addNFT(Unit *z, const Unit *x, const Unit *y);
// z[N] = x[N] - y[N] and return CF(0 or 1). assume x, y are Not Full bit
template<size_t N>Unit subNFT(Unit *z, const Unit *x, const Unit *y);
// [ret:z[N]] = x[N] * y
template<size_t N>Unit mulUnitT(Unit *z, const Unit *x, Unit y);
template<size_t N, typename T>Unit mulUnitT(T *z, const Unit *x, Unit y);
// [ret:z[N]] = z[N] + x[N] * y
template<size_t N>Unit mulUnitAddT(Unit *z, const Unit *x, Unit y);
template<size_t N, typename T>Unit mulUnitAddT(T *z, const Unit *x, Unit y);
// z[2N] = x[N] * y[N]
template<size_t N>void mulT(Unit *pz, const Unit *px, const Unit *py);
// y[2N] = x[N] * x[N]
Expand All @@ -173,17 +173,17 @@ MCL_DLL_API void mulNM(Unit *z, const Unit *x, size_t xn, const Unit *y, size_t
// explicit specialization of template functions and external asm functions
#include "bint_proto.hpp"

template<size_t N, typename T>
void copyT(T *y, const T *x)
template<size_t N, typename T, typename U>
void copyT(T *y, const U *x)
{
for (size_t i = 0; i < N; i++) y[i] = x[i];
for (size_t i = 0; i < N; i++) y[i] = T(x[i]);
}

// y[n] = x[n]
template<typename T>
void copyN(T *y, const T *x, size_t n)
template<typename T, typename U>
void copyN(T *y, const U *x, size_t n)
{
for (size_t i = 0; i < n; i++) y[i] = x[i];
for (size_t i = 0; i < n; i++) y[i] = T(x[i]);
}

template<size_t N, typename T>
Expand Down
48 changes: 17 additions & 31 deletions src/bint_impl.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -104,8 +104,8 @@ Unit addT(Unit *z, const Unit *x, const Unit *y)
#endif
}

template<size_t N>
Unit subT(Unit *z, const Unit *x, const Unit *y)
template<size_t N, typename T>
Unit subT(Unit *z, const T *x, const Unit *y)
{
#if defined(MCL_WASM32) && MCL_SIZEOF_UNIT == 4
// wasm32 supports 64-bit sub
Expand Down Expand Up @@ -164,30 +164,19 @@ Unit subNFT(Unit *z, const Unit *x, const Unit *y)
}


template<size_t N>
Unit mulUnitT(Unit *z, const Unit *x, Unit y)
template<size_t N, typename T>
Unit mulUnitT(T *z, const Unit *x, Unit y)
{
#if MCL_SIZEOF_UNIT == 4
#if 1
uint64_t H = 0;
// use T as uint64_t to reduce conversion
uint64_t y_ = y;
for (size_t i = 0; i < N; i++) {
uint64_t v = x[i] * y_;
v += H;
z[i] = uint32_t(v);
H = v >> 32;
}
return uint32_t(H);
#else
uint64_t H = 0;
for (size_t i = 0; i < N; i++) {
uint64_t v = x[i] * uint64_t(y);
v += H;
uint64_t v = x[0] * y_;
z[0] = uint32_t(v);
for (size_t i = 1; i < N; i++) {
v = x[i] * y_ + (v >> 32);
z[i] = uint32_t(v);
H = v >> 32;
}
return uint32_t(H);
#endif
return uint32_t(v >> 32);
#elif defined(MCL_DEFINED_UINT128_T)
uint64_t H = 0;
for (size_t i = 0; i < N; i++) {
Expand All @@ -211,21 +200,18 @@ Unit mulUnitT(Unit *z, const Unit *x, Unit y)
#endif
}

template<size_t N>
Unit mulUnitAddT(Unit *z, const Unit *x, Unit y)
template<size_t N, typename T>
Unit mulUnitAddT(T *z, const Unit *x, Unit y)
{
#if defined(MCL_WASM32) && MCL_SIZEOF_UNIT == 4
// reduce cast operation
uint64_t H = 0;
uint64_t y_ = y;
for (size_t i = 0; i < N; i++) {
uint64_t v = x[i] * y_;
v += H;
v += z[i];
uint64_t v = z[0] + x[0] * y_;
z[0] = uint32_t(v);
for (size_t i = 1; i < N; i++) {
v = z[i] + x[i] * y_ + (v >> 32);
z[i] = uint32_t(v);
H = v >> 32;
}
return H;
return uint32_t(v >> 32);
#else
Unit xy[N], ret;
ret = mulUnitT<N>(xy, x, y);
Expand Down
15 changes: 12 additions & 3 deletions src/low_func.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -126,11 +126,11 @@ static void fpDblSubModT(Unit *z, const Unit *x, const Unit *y, const Unit *p)
}

// [return:z[N+1]] = z[N+1] + x[N] * y + (CF << (N * UnitBitSize))
template<size_t N>
Unit mulUnitAddFullWithCF(Unit z[N + 1], const Unit x[N], Unit y, Unit CF)
template<size_t N, typename T>
Unit mulUnitAddFullWithCF(T z[N + 1], const Unit x[N], Unit y, Unit CF)
{
Unit H = bint::mulUnitAddT<N>(z, x, y);
Unit v = z[N];
T v = z[N];
v += H;
Unit CF2 = v < H;
v += CF;
Expand All @@ -147,7 +147,11 @@ template<size_t N>
static void modRedT(Unit *z, const Unit *xy, const Unit *p)
{
const Unit rp = p[-1];
#if defined(MCL_WASM32) && MCL_SIZEOF_UNIT == 4
uint64_t buf[N * 2];
#else
Unit buf[N * 2];
#endif
bint::copyT<N * 2>(buf, xy);
Unit CF = 0;
for (size_t i = 0; i < N; i++) {
Expand Down Expand Up @@ -243,7 +247,12 @@ static void mulMontNFT(Unit *z, const Unit *x, const Unit *y, const Unit *p)
t >> 64 <= (F - 2)(R - 1)/R = (F - 2) - (F - 2)/R
t + (t >> 64) = (F - 2)R - (F - 2)/R < FR
*/
#if defined(MCL_WASM32) && MCL_SIZEOF_UNIT == 4
// use uint64_t if Unit = uint32_t to reduce conversion
uint64_t buf[N * 2];
#else
Unit buf[N * 2];
#endif
buf[N] = bint::mulUnitT<N>(buf, x, y[0]);
Unit q = buf[0] * rp;
buf[N] += bint::mulUnitAddT<N>(buf, p, q);
Expand Down

0 comments on commit 71508ea

Please sign in to comment.