perf(shivini): implement GPU PoW

matter-labs · Nov 19, 2024 · 71babf9 · 71babf9
1 parent 0128a63
commit 71babf9
Show file tree

Hide file tree

Showing 14 changed files with 207 additions and 117 deletions.
diff --git a/Cargo.toml b/Cargo.toml
@@ -28,13 +28,17 @@ wrapper-prover = { version = "=0.152.6", path = "crates/wrapper-prover", package
 fflonk = { version = "=0.152.6", path = "crates/fflonk", package = "fflonk-cuda" }
 
 # These dependencies should be shared by all the crates.
-circuit_definitions = { version = "=0.150.14" }
-zkevm_test_harness = { version = "=0.150.14" }
+# zksync-crypto repository
 boojum = "=0.30.8"
+fflonk-cpu = {package = "fflonk", version = "=0.30.8"}
 franklin-crypto = "=0.30.8"
 rescue_poseidon = "=0.30.8"
 snark_wrapper = "=0.30.8"
-fflonk-cpu = {package = "fflonk", version = "=0.30.8"}
+# zksync-protocol repository
+#circuit_definitions = { version = "=0.150.14" }
+#zkevm_test_harness = { version = "=0.150.14" }
+circuit_definitions = { git = "https://github.com/matter-labs/zksync-protocol.git", branch = "rr/compression-proof-configs", package = "circuit_definitions" }
+zkevm_test_harness = { git = "https://github.com/matter-labs/zksync-protocol.git", branch = "rr/compression-proof-configs", package = "zkevm_test_harness" }
 
 [profile.release]
 debug = "line-tables-only"
diff --git a/crates/boojum-cuda/benches/poseidon2.rs b/crates/boojum-cuda/benches/poseidon2.rs
@@ -2,7 +2,7 @@
 #![test_runner(criterion::runner)]
 
 use boojum::field::goldilocks::GoldilocksField;
-use boojum_cuda::poseidon2::{BNHasher, GLHasher, GpuTreeHasher};
+use boojum_cuda::poseidon2::{poseidon2_bn_pow, BNHasher, GLHasher, GpuTreeHasher};
 use criterion::{
     criterion_group, criterion_main, BenchmarkId, Criterion, SamplingMode, Throughput,
 };
@@ -161,10 +161,29 @@ fn bn_merkle_tree(c: &mut Criterion<CudaMeasurement>) {
     bench_merkle_tree::<BNHasher, 47>(c, String::from("bn_merkle_tree")).unwrap();
 }
 
+fn bn_pow(c: &mut Criterion<CudaMeasurement>) {
+    const MIN_BITS_COUNT: u32 = 15;
+    const MAX_BITS_COUNT: u32 = 26;
+    let d_seed = DeviceAllocation::alloc(4).unwrap();
+    let mut d_result = DeviceAllocation::alloc(1).unwrap();
+    let stream = CudaStream::default();
+    let mut group = c.benchmark_group("bn_pow");
+    for bits_count in MIN_BITS_COUNT..=MAX_BITS_COUNT {
+        let max_nonce = 1 << bits_count;
+        group.throughput(Throughput::Elements(max_nonce));
+        group.bench_function(BenchmarkId::from_parameter(bits_count), |b| {
+            b.iter(|| {
+                poseidon2_bn_pow(&d_seed, u32::MAX, max_nonce, &mut d_result[0], &stream).unwrap();
+            })
+        });
+    }
+    group.finish();
+}
+
 criterion_group!(
     name = bench_poseidon2;
     config = Criterion::default().with_measurement::<CudaMeasurement>(CudaMeasurement{});
-    targets = gl_leafs, bn_leafs, gl_nodes, bn_nodes, gl_merkle_tree, bn_merkle_tree
+    targets = gl_leafs, bn_leafs, gl_nodes, bn_nodes, gl_merkle_tree, bn_merkle_tree, bn_pow
 );
 
 criterion_main!(bench_poseidon2);
diff --git a/crates/boojum-cuda/native/blake2s.cu b/crates/boojum-cuda/native/blake2s.cu
@@ -54,6 +54,7 @@ DEVICE_FORCEINLINE uint64_t get_digest(const uint32_t input[10]) {
 }
 
 EXTERN __global__ void blake2s_pow_kernel(const uint64_t *seed, const uint32_t bits_count, const uint64_t max_nonce, volatile uint64_t *result) {
+  const uint32_t digest_mask = (1 << bits_count) - 1;
   __align__(8) uint32_t input_u32[10];
   auto input_u64 = reinterpret_cast<uint64_t *>(input_u32);
 #pragma unroll
@@ -62,7 +63,7 @@ EXTERN __global__ void blake2s_pow_kernel(const uint64_t *seed, const uint32_t b
   for (uint64_t nonce = threadIdx.x + blockIdx.x * blockDim.x; nonce < max_nonce && *result == UINT64_MAX; nonce += blockDim.x * gridDim.x) {
     input_u64[4] = nonce;
     uint64_t digest = get_digest(input_u32);
-    if (__clzll((long long)__brevll(digest)) >= bits_count)
+    if (!(digest & digest_mask))
       atomicCAS(reinterpret_cast<unsigned long long *>(const_cast<uint64_t *>(result)), UINT64_MAX, nonce);
   }
 }
diff --git a/crates/boojum-cuda/native/poseidon2/bn/poseidon2_bn_st.cu b/crates/boojum-cuda/native/poseidon2/bn/poseidon2_bn_st.cu
@@ -120,4 +120,35 @@ EXTERN __global__ void poseidon2_bn_st_nodes_kernel(const bn *values, bn *result
     store_cs(results, state[i]);
 }
 
+EXTERN __global__ void poseidon2_bn_pow_kernel(const gl *seed, const uint32_t bits_count, const uint64_t max_nonce, volatile uint64_t *result) {
+  const unsigned SEED_SIZE = 4;
+  static_assert(RATE == 2);
+  static_assert(CAPACITY == 1);
+  static_assert(CHUNK_BY == 3);
+  static_assert(SEED_SIZE + 2 == RATE * CHUNK_BY);
+  const uint32_t digest_mask = (1 << bits_count) - 1;
+  __align__(8) poseidon_state base_state{};
+#pragma unroll
+  for (unsigned i = 0; i < CHUNK_BY; i++)
+    reinterpret_cast<gl *>(&base_state[0])[i] = load_ca(seed + i);
+  base_state[0] = fr::to_montgomery(base_state[0]);
+  reinterpret_cast<gl *>(&base_state[1])[0] = load_ca(seed + CHUNK_BY);
+  for (uint64_t nonce = threadIdx.x + blockIdx.x * blockDim.x; nonce < max_nonce && *result == UINT64_MAX; nonce += blockDim.x * gridDim.x) {
+    poseidon_state state{};
+#pragma unroll
+    for (unsigned i = 0; i < STATE_WIDTH; i++)
+      state[i] = base_state[i];
+    state[1].limbs[2] = nonce;
+    state[1].limbs[4] = nonce >> 32;
+    state[1] = fr::to_montgomery(state[1]);
+    permutation(state);
+    state[0] = fr::get_one();
+    state[1] = {};
+    permutation(state);
+    const uint32_t digest = fr::from_montgomery(state[0]).limbs[0];
+    if (!(digest & digest_mask))
+      atomicCAS(reinterpret_cast<unsigned long long *>(const_cast<uint64_t *>(result)), UINT64_MAX, nonce);
+  }
+}
+
 } // namespace poseidon2::bn254
diff --git a/crates/boojum-cuda/src/blake2s.rs b/crates/boojum-cuda/src/blake2s.rs
@@ -25,7 +25,7 @@ pub fn blake2s_pow(
     }
     const BLOCK_SIZE: u32 = WARP_SIZE * 4;
     let device_id = get_device()?;
-    let mpc = device_get_attribute(CudaDeviceAttr::MultiProcessorCount, device_id).unwrap();
+    let mpc = device_get_attribute(CudaDeviceAttr::MultiProcessorCount, device_id)?;
     let kernel_function = Blake2SPowFunction::default();
     let max_blocks = max_active_blocks_per_multiprocessor(&kernel_function, BLOCK_SIZE as i32, 0)?;
     let num_blocks = (mpc * max_blocks) as u32;
@@ -43,28 +43,24 @@ pub fn blake2s_pow(
 
 #[cfg(test)]
 mod tests {
-    use blake2::{Blake2s256, Digest};
+    use blake2::Blake2s256;
+    use boojum::cs::implementations::pow::PoWRunner;
     use era_cudart::memory::{memory_copy_async, DeviceAllocation};
     use era_cudart::stream::CudaStream;
 
     #[test]
     fn blake2s_pow() {
         const BITS_COUNT: u32 = 24;
-        let h_seed = [42u8; 32];
+        let seed = vec![42u8; 32];
         let mut h_result = [0u64; 1];
         let mut d_seed = DeviceAllocation::alloc(32).unwrap();
         let mut d_result = DeviceAllocation::alloc(1).unwrap();
         let stream = CudaStream::default();
-        memory_copy_async(&mut d_seed, &h_seed, &stream).unwrap();
+        memory_copy_async(&mut d_seed, &seed, &stream).unwrap();
         super::blake2s_pow(&d_seed, BITS_COUNT, u64::MAX, &mut d_result[0], &stream).unwrap();
         memory_copy_async(&mut h_result, &d_result, &stream).unwrap();
         stream.synchronize().unwrap();
-        let mut digest = Blake2s256::new();
-        digest.update(h_seed);
-        digest.update(h_result[0].to_le_bytes());
-        let output = digest.finalize();
-        let mut le_bytes = [0u8; 8];
-        le_bytes.copy_from_slice(&output[..8]);
-        assert!(u64::from_le_bytes(le_bytes).trailing_zeros() >= BITS_COUNT);
+        let challenge = h_result[0];
+        assert!(Blake2s256::verify_from_bytes(seed, BITS_COUNT, challenge));
     }
 }
diff --git a/crates/boojum-cuda/src/poseidon2.rs b/crates/boojum-cuda/src/poseidon2.rs
@@ -6,12 +6,18 @@ use boojum::algebraic_props::round_function::AbsorptionModeOverwrite;
 use boojum::algebraic_props::sponge::GoldilocksPoseidon2Sponge;
 use boojum::cs::oracle::TreeHasher;
 use boojum::field::goldilocks::GoldilocksField;
+use era_cudart::device::{device_get_attribute, get_device};
 use era_cudart::execution::{CudaLaunchConfig, Dim3, KernelFunction};
+use era_cudart::memory::memory_set_async;
+use era_cudart::occupancy::max_active_blocks_per_multiprocessor;
 use era_cudart::paste::paste;
 use era_cudart::result::CudaResult;
-use era_cudart::slice::DeviceSlice;
+use era_cudart::slice::{DeviceSlice, DeviceVariable};
 use era_cudart::stream::CudaStream;
-use era_cudart::{cuda_kernel_declaration, cuda_kernel_signature_arguments_and_function};
+use era_cudart::{
+    cuda_kernel, cuda_kernel_declaration, cuda_kernel_signature_arguments_and_function,
+};
+use era_cudart_sys::CudaDeviceAttr;
 use snark_wrapper::franklin_crypto::bellman::bn256::{Bn256, Fr};
 use snark_wrapper::implementations::poseidon2::tree_hasher::AbsorptionModeReplacement;
 use snark_wrapper::rescue_poseidon::poseidon2::Poseidon2Sponge;
@@ -494,12 +500,44 @@ impl GpuTreeHasher for BNHasher {
         GatherMerklePathsKernelFunction(poseidon2_bn_gather_merkle_paths_kernel);
 }
 
+cuda_kernel!(Poseidon2Pow, poseidon2_bn_pow_kernel(seed: *const GL, bits_count: u32, max_nonce: u64, result: *mut u64));
+
+pub fn poseidon2_bn_pow(
+    seed: &DeviceSlice<GL>,
+    bits_count: u32,
+    max_nonce: u64,
+    result: &mut DeviceVariable<u64>,
+    stream: &CudaStream,
+) -> CudaResult<()> {
+    assert_eq!(seed.len(), 4);
+    unsafe {
+        memory_set_async(result.transmute_mut(), 0xff, stream)?;
+    }
+    const BLOCK_SIZE: u32 = WARP_SIZE * 4;
+    let device_id = get_device()?;
+    let mpc = device_get_attribute(CudaDeviceAttr::MultiProcessorCount, device_id)?;
+    let kernel_function = Poseidon2PowFunction::default();
+    let max_blocks = max_active_blocks_per_multiprocessor(&kernel_function, BLOCK_SIZE as i32, 0)?;
+    let num_blocks = (mpc * max_blocks) as u32;
+    let config = CudaLaunchConfig::basic(num_blocks, BLOCK_SIZE, stream);
+    let seed = seed.as_ptr();
+    let result = result.as_mut_ptr();
+    let args = Poseidon2PowArguments {
+        seed,
+        bits_count,
+        max_nonce,
+        result,
+    };
+    kernel_function.launch(&config, &args)
+}
+
 #[cfg(test)]
 mod tests {
     use super::*;
     use crate::device_structures::{DeviceMatrix, DeviceMatrixMut};
     use crate::ops_simple::set_to_zero;
     use crate::tests_helpers::RandomIterator;
+    use boojum::cs::implementations::pow::PoWRunner;
     use era_cudart::memory::{memory_copy_async, DeviceAllocation};
     use itertools::Itertools;
     use rand::{thread_rng, Rng};
@@ -898,4 +936,22 @@ mod tests {
     fn bn_gather_merkle_paths() -> CudaResult<()> {
         BNHasher::test_gather_merkle_paths()
     }
+
+    #[test]
+    fn poseidon2_bn_pow() {
+        const BITS_COUNT: u32 = 26;
+        let seed = GL::get_random_iterator().take(4).collect_vec();
+        let mut h_result = [0u64; 1];
+        let mut d_seed = DeviceAllocation::alloc(4).unwrap();
+        let mut d_result = DeviceAllocation::alloc(1).unwrap();
+        let stream = CudaStream::default();
+        memory_copy_async(&mut d_seed, &seed, &stream).unwrap();
+        super::poseidon2_bn_pow(&d_seed, BITS_COUNT, u64::MAX, &mut d_result[0], &stream).unwrap();
+        memory_copy_async(&mut h_result, &d_result, &stream).unwrap();
+        stream.synchronize().unwrap();
+        let challenge = h_result[0];
+        assert!(BNHasher::verify_from_field_elements(
+            seed, BITS_COUNT, challenge
+        ));
+    }
 }
diff --git a/crates/proof-compression/src/gpu.rs b/crates/proof-compression/src/gpu.rs
@@ -20,7 +20,7 @@ use shivini::cs::GpuSetup;
 use shivini::gpu_proof_config::GpuProofConfig;
 use shivini::{
     gpu_prove_from_external_witness_data_with_cache_strategy, CacheStrategy,
-    CommitmentCacheStrategy, GpuTreeHasher, PolynomialsCacheStrategy, ProverContext,
+    CommitmentCacheStrategy, GPUPoWRunner, GpuTreeHasher, PolynomialsCacheStrategy, ProverContext,
     ProverContextConfig,
 };
 use std::alloc::Global;
@@ -369,7 +369,9 @@ pub fn prove_compression_wrapper_circuit_with_precomputations(
     (proof, vk)
 }
 
-pub fn inner_prove_compression_layer_circuit<CF: ProofCompressionFunction>(
+pub fn inner_prove_compression_layer_circuit<
+    CF: ProofCompressionFunction<ThisLayerPoW: GPUPoWRunner>,
+>(
     circuit: CompressionLayerCircuit<CF>,
     device_setup: &GpuSetup<CompressionProofsTreeHasher>,
     finalization_hint: FinalizationHintsForProver,
@@ -411,7 +413,9 @@ pub fn inner_prove_compression_layer_circuit<CF: ProofCompressionFunction>(
     (proof, vk)
 }
 
-pub fn inner_prove_compression_wrapper_circuit<CF: ProofCompressionFunction>(
+pub fn inner_prove_compression_wrapper_circuit<
+    CF: ProofCompressionFunction<ThisLayerPoW: GPUPoWRunner>,
+>(
     circuit: CompressionLayerCircuit<CF>,
     device_setup: &GpuSetup<CompressionTreeHasherForWrapper>,
     finalization_hint: FinalizationHintsForProver,

diff --git a/crates/shivini/Cargo.toml b/crates/shivini/Cargo.toml
@@ -24,8 +24,6 @@ smallvec = { version = "1.13", features = [
     "const_new",
     "serde",
 ] }
-sha2 = "0.10"
-blake2 = "0.10"
 hex = "0.4"
 derivative = "2.2"
 bincode = "1.3"

diff --git a/crates/shivini/src/data_structures/cache.rs b/crates/shivini/src/data_structures/cache.rs
@@ -7,7 +7,6 @@ use crate::poly::{CosetEvaluations, LagrangeBasis, MonomialBasis};
 use crate::prover::{
     compute_quotient_degree, gpu_prove_from_external_witness_data_with_cache_strategy,
 };
-use boojum::cs::implementations::pow::PoWRunner;
 use boojum::cs::implementations::prover::ProofConfig;
 use boojum::cs::implementations::transcript::Transcript;
 use boojum::cs::implementations::verifier::{VerificationKey, VerificationKeyCircuitGeometry};
@@ -736,7 +735,7 @@ impl CacheStrategy {
     pub(crate) fn get<
         TR: Transcript<F, CompatibleCap: Hash>,
         H: GpuTreeHasher<Output = TR::CompatibleCap>,
-        POW: PoWRunner,
+        POW: GPUPoWRunner,
         A: GoodAllocator,
     >(
         config: &GpuProofConfig,

diff --git a/crates/shivini/src/lib.rs b/crates/shivini/src/lib.rs
@@ -68,6 +68,7 @@ pub use context::ProverContextConfig;
 pub use data_structures::CacheStrategy;
 pub use data_structures::CommitmentCacheStrategy;
 pub use data_structures::PolynomialsCacheStrategy;
+pub use pow::GPUPoWRunner;
 pub use primitives::tree::GpuTreeHasher;
 pub use prover::gpu_prove_from_external_witness_data;
 pub use prover::gpu_prove_from_external_witness_data_with_cache_strategy;