grandinetech · owanikin · Mar 20, 2024 · Apr 7, 2024 · Apr 7, 2024 · Apr 7, 2024
diff --git a/Cargo.lock b/Cargo.lock
diff --git a/arkworks/Cargo.toml b/arkworks/Cargo.toml
@@ -15,7 +15,7 @@ ark-serialize = { version = "^0.4.2", default-features = false }
 hex = "0.4.3"
 rand = { version = "0.8.5", optional = true }
 libc = { version = "0.2.148", default-features = false }
-rayon = { version = "1.8.0", optional = true }
+rayon = { version = "1.9.0", optional = true }
 
 [dev-dependencies]
 criterion = "0.5.1"
@@ -47,6 +47,9 @@ bgmw = [
 arkmsm = [
     "kzg/arkmsm"
 ]
+cuda = [
+    "kzg/cuda"
+]
 
 [[bench]]
 name = "fft"

diff --git a/arkworks/src/kzg_types.rs b/arkworks/src/kzg_types.rs
@@ -17,6 +17,7 @@ use crate::utils::{
 use ark_bls12_381::{g1, g2, Fr, G1Affine, G2Affine};
 use ark_ec::{models::short_weierstrass::Projective, AffineRepr, Group};
 use ark_ec::{CurveConfig, CurveGroup};
+use ark_ff::BigInt;
 use ark_ff::{biginteger::BigInteger256, BigInteger, Field};
 use ark_serialize::{CanonicalDeserialize, CanonicalSerialize};
 use ark_std::{One, Zero};
@@ -840,6 +841,16 @@ impl G1Fp for ArkFp {
         Self(default)
     }
 
+    fn to_limbs(&self) -> [u64; 6] {
+        self.0.0.0
+    }
+
+    fn from_bytes_le(bytes: &[u8; 48]) -> Self {
+        // Limb `i` is the little-endian u64 held in bytes `8*i..8*i+8`; built in place, no temporary Vec.
+        let limbs: [u64; 6] = core::array::from_fn(|i| u64::from_le_bytes(bytes[i * 8..(i + 1) * 8].try_into().unwrap()));
+        Self(ArkFpInt::from(BigInt::new(limbs)))
+    }
+
     fn neg_assign(&mut self) {
         self.0 = -self.0;
     }

diff --git a/kzg/Cargo.toml b/kzg/Cargo.toml
@@ -10,6 +10,9 @@ num_cpus = { version = "1.16.0", optional = true }
 rayon = { version = "1.8.0", optional = true } 
 threadpool = { version = "^1.8.1", optional = true }
 siphasher = { version = "1.0.0", default-features = false }
+icicle-bls12-381 = { git = "https://github.com/ArtiomTr/icicle.git", rev = "2942ad9f9894119f0204325e08ddb55b8a8de227", version = "1.9.1", optional = true }
+icicle-core = { git = "https://github.com/ArtiomTr/icicle.git", rev = "2942ad9f9894119f0204325e08ddb55b8a8de227", version = "1.9.1", optional = true }
+icicle-cuda-runtime = { git = "https://github.com/ArtiomTr/icicle.git", rev = "2942ad9f9894119f0204325e08ddb55b8a8de227", version = "1.9.1", optional = true }
 
 [features]
 default = [
@@ -29,3 +32,9 @@ std = [
 rand = []
 arkmsm = []
 bgmw = []
+cuda = [
+    "parallel",
+    "dep:icicle-bls12-381",
+    "dep:icicle-core",
+    "dep:icicle-cuda-runtime"
+]
diff --git a/kzg/src/lib.rs b/kzg/src/lib.rs
@@ -201,6 +201,10 @@ pub trait G1Fp: Clone + Default + Sync + Copy + PartialEq + Debug + Send {
     fn set_one(&mut self) {
         *self = Self::ONE;
     }
+
+    fn to_limbs(&self) -> [u64; 6];
+
+    fn from_bytes_le(bytes: &[u8; 48]) -> Self;
 }
 
 pub trait G1Affine<TG1: G1, TG1Fp: G1Fp>:

diff --git a/kzg/src/msm/cuda.rs b/kzg/src/msm/cuda.rs
@@ -0,0 +1,122 @@
+use core::marker::PhantomData;
+
+use icicle_bls12_381::curve::CurveCfg;
+use icicle_core::{curve::Affine, msm::{precompute_bases, MSMConfig}, traits::FieldImpl};
+use icicle_cuda_runtime::{memory::HostOrDeviceSlice, device_context::{DeviceContext, DEFAULT_DEVICE_ID}};
+use core::fmt::Debug;
+use crate::{Fr, G1Affine, G1Fp, G1GetFp, G1Mul, Scalar256, G1};
+
+use super::msm_impls::batch_convert;
+
+pub struct IcicleConfig<TFr, TG1, TG1Fp, TG1Affine>
+where
+    TFr: Fr,
+    TG1: G1 + G1Mul<TFr> + G1GetFp<TG1Fp>,
+    TG1Fp: G1Fp,
+    TG1Affine: G1Affine<TG1, TG1Fp>,
+{
+    affines: HostOrDeviceSlice<'static, Affine<CurveCfg>>,
+
+    g1_marker: PhantomData<TG1>,
+    g1_fp_marker: PhantomData<TG1Fp>,
+    fr_marker: PhantomData<TFr>,
+    g1_affine_marker: PhantomData<TG1Affine>
+}
+
+impl<
+TFr: Fr,
+TG1Fp: G1Fp,
+TG1: G1 + G1Mul<TFr> + G1GetFp<TG1Fp>,
+TG1Affine: G1Affine<TG1, TG1Fp>,
+> Debug for IcicleConfig<TFr, TG1, TG1Fp, TG1Affine> {
+    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
+        // TODO: add formatting for affines
+        f.debug_struct("IcicleConfig").finish()
+    }
+}
+
+impl<
+TFr: Fr,
+TG1Fp: G1Fp,
+TG1: G1 + G1Mul<TFr> + G1GetFp<TG1Fp>,
+TG1Affine: G1Affine<TG1, TG1Fp>,
+> Clone for IcicleConfig<TFr, TG1, TG1Fp, TG1Affine> {
+    fn clone(&self) -> Self {
+        // FIXME: affines should be cloned actually
+        Self { affines: HostOrDeviceSlice::Host(vec![]), g1_marker: PhantomData, g1_fp_marker: PhantomData, fr_marker: PhantomData, g1_affine_marker: PhantomData }
+    }
+}
+
+const PRECOMPUTE_FACTOR: usize = 8;
+
+impl<
+        TFr: Fr,
+        TG1Fp: G1Fp,
+        TG1: G1 + G1Mul<TFr> + G1GetFp<TG1Fp>,
+        TG1Affine: G1Affine<TG1, TG1Fp>,
+    > IcicleConfig<TFr, TG1, TG1Fp, TG1Affine>
+{
+    /// Builds the precomputation table: converts `points` to icicle affine points and expands them by `PRECOMPUTE_FACTOR`.
+    /// Returns `Ok(None)` when `cuda_malloc` or `precompute_bases` reports an error.
+    pub fn new(points: &[TG1]) -> Result<Option<Self>, String> {
+        let host_affines = HostOrDeviceSlice::on_host(
+            batch_convert::<TG1, TG1Fp, TG1Affine>(points)
+                .iter()
+                .map(|it| icicle_bls12_381::curve::G1Affine::from_limbs(it.x().to_limbs(), it.y().to_limbs()))
+                .collect::<Vec<_>>(),
+        );
+
+        let Ok(mut affines) = HostOrDeviceSlice::<'static, Affine<CurveCfg>>::cuda_malloc(points.len() * PRECOMPUTE_FACTOR) else {
+            return Ok(None);
+        };
+
+        let ctx = DeviceContext::default_for_device(DEFAULT_DEVICE_ID);
+        if precompute_bases(&host_affines, PRECOMPUTE_FACTOR as i32, 0, &ctx, &mut affines).is_err() {
+            return Ok(None);
+        }
+
+        Ok(Some(Self {
+            affines,
+            g1_marker: PhantomData,
+            g1_fp_marker: PhantomData,
+            fr_marker: PhantomData,
+            g1_affine_marker: PhantomData,
+        }))
+    }
+
+    pub fn multiply_sequential(&self, _scalars: &[Scalar256]) -> TG1 {
+        panic!("No sequential implementation for CUDA MSM");
+    }
+
+    /// Runs the MSM of `scalars` against the precomputed bases via icicle; panics if any CUDA call reports an error.
+    #[cfg(feature = "parallel")]
+    pub fn multiply_parallel(&self, scalars: &[Scalar256]) -> TG1 {
+        use icicle_bls12_381::curve::ScalarField;
+        use icicle_core::curve::Projective;
+        use icicle_cuda_runtime::stream::CudaStream;
+
+        // Bound to a local so the host buffer outlives the asynchronous upload queued below.
+        let scalars_h = scalars.iter().map(|it| ScalarField::from_bytes_le(it.as_u8())).collect::<Vec<_>>();
+        let mut results = HostOrDeviceSlice::cuda_malloc(1).unwrap();
+        let mut scalars_d = HostOrDeviceSlice::cuda_malloc(scalars.len()).unwrap();
+        let stream = CudaStream::create().unwrap();
+        scalars_d.copy_from_host_async(&scalars_h, &stream).unwrap();
+        let mut config = MSMConfig::default_for_device(DEFAULT_DEVICE_ID);
+        config.precompute_factor = PRECOMPUTE_FACTOR as i32;
+        config.ctx.stream = &stream;
+        config.is_async = true;
+        icicle_core::msm::msm(&scalars_d, &self.affines, &config, &mut results).unwrap();
+
+        let mut results_h = vec![Projective::<CurveCfg>::zero(); 1];
+        // A failed download must not be ignored: the zero placeholder would be returned as the result.
+        results.copy_to_host_async(&mut results_h, &stream).unwrap();
+        stream.synchronize().unwrap();
+        stream.destroy().unwrap();
+
+        let mut output = TG1::default();
+        *output.x_mut() = TG1Fp::from_bytes_le(&results_h.as_slice()[0].x.to_bytes_le().try_into().unwrap());
+        *output.y_mut() = TG1Fp::from_bytes_le(&results_h.as_slice()[0].y.to_bytes_le().try_into().unwrap());
+        *output.z_mut() = TG1Fp::from_bytes_le(&results_h.as_slice()[0].z.to_bytes_le().try_into().unwrap());
+        output
+    }
+}
diff --git a/kzg/src/msm/mod.rs b/kzg/src/msm/mod.rs
@@ -15,3 +15,11 @@ mod pippenger_utils;
 
 #[cfg(all(feature = "bgmw", any(not(feature = "arkmsm"), feature = "parallel")))]
 mod bgmw;
+
+#[cfg(feature = "cuda")]
+mod cuda;
+
+#[cfg(all(feature = "cuda", feature = "bgmw"))]
+compile_error!{"features `cuda` and `bgmw` are mutually exclusive"}
+#[cfg(all(feature = "cuda", not(feature = "parallel")))]
+compile_error!{"feature `cuda` requires feature `parallel`"}
diff --git a/kzg/src/msm/msm_impls.rs b/kzg/src/msm/msm_impls.rs
@@ -59,7 +59,7 @@ fn msm_sequential<
     }
 }
 
-fn batch_convert<TG1: G1, TFp: G1Fp, TG1Affine: G1Affine<TG1, TFp> + Sized>(
+pub fn batch_convert<TG1: G1, TFp: G1Fp, TG1Affine: G1Affine<TG1, TFp> + Sized>(
     points: &[TG1],
 ) -> Vec<TG1Affine> {
     #[cfg(feature = "parallel")]

diff --git a/kzg/src/msm/precompute.rs b/kzg/src/msm/precompute.rs
@@ -9,7 +9,7 @@ pub type PrecomputationTable<TFr, TG1, TG1Fp, TG1Affine> =
     super::bgmw::BgmwTable<TFr, TG1, TG1Fp, TG1Affine>;
 
 #[cfg(any(
-    not(feature = "bgmw"),
+    all(not(feature = "bgmw"), not(feature = "cuda")),
     all(feature = "arkmsm", not(feature = "parallel"))
 ))]
 #[derive(Debug, Clone)]
@@ -27,7 +27,7 @@ where
 }
 
 #[cfg(any(
-    not(feature = "bgmw"),
+    all(not(feature = "bgmw"), not(feature = "cuda")),
     all(feature = "arkmsm", not(feature = "parallel"))
 ))]
 impl<TFr, TG1, TG1Fp, TG1Affine> EmptyTable<TFr, TG1, TG1Fp, TG1Affine>
@@ -52,11 +52,14 @@ where
 }
 
 #[cfg(any(
-    not(feature = "bgmw"),
+    all(not(feature = "bgmw"), not(feature = "cuda")),
     all(feature = "arkmsm", not(feature = "parallel"))
 ))]
 pub type PrecomputationTable<TFr, TG1, TG1Fp, TG1Affine> = EmptyTable<TFr, TG1, TG1Fp, TG1Affine>;
 
+#[cfg(feature = "cuda")]
+pub type PrecomputationTable<TFr, TG1, TG1Fp, TG1Affine> = super::cuda::IcicleConfig<TFr, TG1, TG1Fp, TG1Affine>;
+
 pub fn precompute<TFr, TG1, TG1Fp, TG1Affine>(
     points: &[TG1],
 ) -> Result<Option<PrecomputationTable<TFr, TG1, TG1Fp, TG1Affine>>, String>