From 71508ea249552a93bf44040a564c7fb034f90b0d Mon Sep 17 00:00:00 2001
From: MITSUNARI Shigeo <herumi@nifty.com>
Date: Fri, 27 Sep 2024 17:37:41 +0900
Subject: [PATCH] improve performance of mul/wasm a little by reducing
 conversion

---
 Makefile             |  2 +-
 include/mcl/bint.hpp | 18 ++++++++---------
 src/bint_impl.hpp    | 48 ++++++++++++++++----------------------------
 src/low_func.hpp     | 15 +++++++++++---
 4 files changed, 39 insertions(+), 44 deletions(-)
diff --git a/Makefile b/Makefile
index 4053d466..feaeb0fe 100644
--- a/Makefile
+++ b/Makefile
@@ -443,7 +443,7 @@ endif
 
 # test
 bin/emu:
-	$(CXX) -g -o $@ src/fp.cpp src/bn_c384_256.cpp test/bn_c384_256_test.cpp -DMCL_DONT_USE_XBYAK -DMCL_SIZEOF_UNIT=$(MCL_SIZEOF_UNIT) -DMCL_MAX_BIT_SIZE=384 -I./include -DMCL_BINT_ASM=0 -DMCL_MSM=0
+	$(CXX) -g -o $@ src/fp.cpp src/bn_c384_256.cpp test/bn_c384_256_test.cpp -DMCL_DONT_USE_XBYAK -DMCL_SIZEOF_UNIT=$(MCL_SIZEOF_UNIT) -DMCL_MAX_BIT_SIZE=384 -I./include -DMCL_BINT_ASM=0 -DMCL_MSM=0 $(CFLAGS_USER)
 bin/pairing_c_min.exe: sample/pairing_c.c include/mcl/vint.hpp src/fp.cpp include/mcl/bn.hpp
 	$(CXX) -std=c++03 -O3 -g -fno-threadsafe-statics -fno-exceptions -fno-rtti -o $@ sample/pairing_c.c src/fp.cpp src/bn_c384_256.cpp -I./include -DXBYAK_NO_EXCEPTION -DMCL_SIZEOF_UNIT=$(MCL_SIZEOF_UNIT) -DMCL_MAX_BIT_SIZE=384 -DCYBOZU_DONT_USE_STRING -DCYBOZU_DONT_USE_EXCEPTION -DNDEBUG -DMCL_BINT_ASM=0 -DMCL_MSM=0 # -DMCL_DONT_USE_CSPRNG
 bin/ecdsa-emu:
diff --git a/include/mcl/bint.hpp b/include/mcl/bint.hpp
index 7a601923..a3ba627d 100644
--- a/include/mcl/bint.hpp
+++ b/include/mcl/bint.hpp
@@ -143,15 +143,15 @@ inline uint64_t divUnit1(uint64_t *pr, uint64_t H, uint64_t L, uint64_t y)
 // z[N] = x[N] + y[N] and return CF(0 or 1)
 template<size_t N>Unit addT(Unit *z, const Unit *x, const Unit *y);
 // z[N] = x[N] - y[N] and return CF(0 or 1)
-template<size_t N>Unit subT(Unit *z, const Unit *x, const Unit *y);
+template<size_t N, typename T>Unit subT(Unit *z, const T *x, const Unit *y);
 // z[N] = x[N] + y[N]. assume x, y are Not Full bit
 template<size_t N>void addNFT(Unit *z, const Unit *x, const Unit *y);
 // z[N] = x[N] - y[N] and return CF(0 or 1). assume x, y are Not Full bit
 template<size_t N>Unit subNFT(Unit *z, const Unit *x, const Unit *y);
 // [ret:z[N]] = x[N] * y
-template<size_t N>Unit mulUnitT(Unit *z, const Unit *x, Unit y);
+template<size_t N, typename T>Unit mulUnitT(T *z, const Unit *x, Unit y);
 // [ret:z[N]] = z[N] + x[N] * y
-template<size_t N>Unit mulUnitAddT(Unit *z, const Unit *x, Unit y);
+template<size_t N, typename T>Unit mulUnitAddT(T *z, const Unit *x, Unit y);
 // z[2N] = x[N] * y[N]
 template<size_t N>void mulT(Unit *pz, const Unit *px, const Unit *py);
 // y[2N] = x[N] * x[N]
@@ -173,17 +173,17 @@ MCL_DLL_API void mulNM(Unit *z, const Unit *x, size_t xn, const Unit *y, size_t
 // explicit specialization of template functions and external asm functions
 #include "bint_proto.hpp"
 
-template<size_t N, typename T>
-void copyT(T *y, const T *x)
+template<size_t N, typename T, typename U>
+void copyT(T *y, const U *x)
 {
-	for (size_t i = 0; i < N; i++) y[i] = x[i];
+	for (size_t i = 0; i < N; i++) y[i] = T(x[i]);
 }
 
 // y[n] = x[n]
-template<typename T>
-void copyN(T *y, const T *x, size_t n)
+template<typename T, typename U>
+void copyN(T *y, const U *x, size_t n)
 {
-	for (size_t i = 0; i < n; i++) y[i] = x[i];
+	for (size_t i = 0; i < n; i++) y[i] = T(x[i]);
 }
 
 template<size_t N, typename T>
diff --git a/src/bint_impl.hpp b/src/bint_impl.hpp
index 1145893e..10c07100 100644
--- a/src/bint_impl.hpp
+++ b/src/bint_impl.hpp
@@ -104,8 +104,8 @@ Unit addT(Unit *z, const Unit *x, const Unit *y)
 #endif
 }
 
-template<size_t N>
-Unit subT(Unit *z, const Unit *x, const Unit *y)
+template<size_t N, typename T>
+Unit subT(Unit *z, const T *x, const Unit *y)
 {
 #if defined(MCL_WASM32) && MCL_SIZEOF_UNIT == 4
 	// wasm32 supports 64-bit sub
@@ -164,30 +164,19 @@ Unit subNFT(Unit *z, const Unit *x, const Unit *y)
 }
 
 
-template<size_t N>
-Unit mulUnitT(Unit *z, const Unit *x, Unit y)
+template<size_t N, typename T>
+Unit mulUnitT(T *z, const Unit *x, Unit y)
 {
 #if MCL_SIZEOF_UNIT == 4
-#if 1
-	uint64_t H = 0;
+	// T may be uint64_t instead of Unit: each z[i] still holds a 32-bit value, which reduces 32/64-bit conversions
 	uint64_t y_ = y;
-	for (size_t i = 0; i < N; i++) {
-		uint64_t v = x[i] * y_;
-		v += H;
-		z[i] = uint32_t(v);
-		H = v >> 32;
-	}
-	return uint32_t(H);
-#else
-	uint64_t H = 0;
-	for (size_t i = 0; i < N; i++) {
-		uint64_t v = x[i] * uint64_t(y);
-		v += H;
+	uint64_t v = x[0] * y_;
+	z[0] = uint32_t(v);
+	for (size_t i = 1; i < N; i++) {
+		v = x[i] * y_ + (v >> 32);
 		z[i] = uint32_t(v);
-		H = v >> 32;
 	}
-	return uint32_t(H);
-#endif
+	return uint32_t(v >> 32);
 #elif defined(MCL_DEFINED_UINT128_T)
 	uint64_t H = 0;
 	for (size_t i = 0; i < N; i++) {
@@ -211,21 +200,18 @@ Unit mulUnitT(Unit *z, const Unit *x, Unit y)
 #endif
 }
 
-template<size_t N>
-Unit mulUnitAddT(Unit *z, const Unit *x, Unit y)
+template<size_t N, typename T>
+Unit mulUnitAddT(T *z, const Unit *x, Unit y)
 {
 #if defined(MCL_WASM32) && MCL_SIZEOF_UNIT == 4
-	// reduce cast operation
-	uint64_t H = 0;
 	uint64_t y_ = y;
-	for (size_t i = 0; i < N; i++) {
-		uint64_t v = x[i] * y_;
-		v += H;
-		v += z[i];
+	uint64_t v = z[0] + x[0] * y_;
+	z[0] = uint32_t(v);
+	for (size_t i = 1; i < N; i++) {
+		v = z[i] + x[i] * y_ + (v >> 32);
 		z[i] = uint32_t(v);
-		H = v >> 32;
 	}
-	return H;
+	return uint32_t(v >> 32);
 #else
 	Unit xy[N], ret;
 	ret = mulUnitT<N>(xy, x, y);
diff --git a/src/low_func.hpp b/src/low_func.hpp
index 6f4ffda9..6cd44081 100644
--- a/src/low_func.hpp
+++ b/src/low_func.hpp
@@ -126,11 +126,11 @@ static void fpDblSubModT(Unit *z, const Unit *x, const Unit *y, const Unit *p)
 }
 
 // [return:z[N+1]] = z[N+1] + x[N] * y + (CF << (N * UnitBitSize))
-template<size_t N>
-Unit mulUnitAddFullWithCF(Unit z[N + 1], const Unit x[N], Unit y, Unit CF)
+template<size_t N, typename T>
+Unit mulUnitAddFullWithCF(T z[N + 1], const Unit x[N], Unit y, Unit CF)
 {
 	Unit H = bint::mulUnitAddT<N>(z, x, y);
-	Unit v = z[N];
+	T v = z[N];
 	v += H;
 	Unit CF2 = v < H;
 	v += CF;
@@ -147,7 +147,11 @@ template<size_t N>
 static void modRedT(Unit *z, const Unit *xy, const Unit *p)
 {
 	const Unit rp = p[-1];
+#if defined(MCL_WASM32) && MCL_SIZEOF_UNIT == 4
+	uint64_t buf[N * 2];
+#else
 	Unit buf[N * 2];
+#endif
 	bint::copyT<N * 2>(buf, xy);
 	Unit CF = 0;
 	for (size_t i = 0; i < N; i++) {
@@ -243,7 +247,12 @@ static void mulMontNFT(Unit *z, const Unit *x, const Unit *y, const Unit *p)
 		t >> 64 <= (F - 2)(R - 1)/R = (F - 2) - (F - 2)/R
 			t + (t >> 64) = (F - 2)R - (F - 2)/R < FR
 	*/
+#if defined(MCL_WASM32) && MCL_SIZEOF_UNIT == 4
+	// Unit is uint32_t here; keep buf as uint64_t to reduce 32/64-bit conversions
+	uint64_t buf[N * 2];
+#else
 	Unit buf[N * 2];
+#endif
 	buf[N] = bint::mulUnitT<N>(buf, x, y[0]);
 	Unit q = buf[0] * rp;
 	buf[N] += bint::mulUnitAddT<N>(buf, p, q);