diff --git a/src/groth16/params.rs b/src/groth16/params.rs index e909c640..0de903de 100644 --- a/src/groth16/params.rs +++ b/src/groth16/params.rs @@ -405,7 +405,7 @@ where } } -pub trait ParameterSource: Send + Sync +pub trait ParameterSource: Clone + Send + Sync where E: MultiMillerLoop, { diff --git a/src/groth16/prover/mod.rs b/src/groth16/prover/mod.rs index 39a08b19..7b8f80ab 100644 --- a/src/groth16/prover/mod.rs +++ b/src/groth16/prover/mod.rs @@ -3,213 +3,224 @@ mod native; #[cfg(feature = "cuda-supraseal")] mod supraseal; -use std::fmt; +use std::time::Instant; -use bellpepper_core::{ - Circuit, ConstraintSystem, Index, LinearCombination, SynthesisError, Variable, -}; +use bellpepper_core::{Circuit, ConstraintSystem, Index, SynthesisError, Variable}; use ec_gpu_gen::multiexp_cpu::DensityTracker; use ff::{Field, PrimeField}; +use log::info; use pairing::MultiMillerLoop; use rand_core::RngCore; +use rayon::iter::{IntoParallelIterator, ParallelIterator}; #[cfg(not(feature = "cuda-supraseal"))] use self::native as prover; #[cfg(feature = "cuda-supraseal")] use self::supraseal as prover; use super::{ParameterSource, Proof}; -use crate::{gpu::GpuName, lc}; - -struct ProvingAssignment { - // Density of queries - a_aux_density: DensityTracker, - b_input_density: DensityTracker, - b_aux_density: DensityTracker, +use crate::gpu::GpuName; + +/// Implement various traits for the proving assignment. +/// +/// It's a macro so that it can be used for different types. SupraSeal is using some special memory +/// optimized data structures internally. Using a macro makes sure that the implementation will not +/// diverge over time. +macro_rules! 
proving_assignment_impls { + ($type:ty) => { + use bellpepper_core as bc; + impl std::fmt::Debug for $type { + fn fmt(&self, fmt: &mut std::fmt::Formatter) -> std::fmt::Result { + fmt.debug_struct(stringify!($type)) + .field("a_aux_density", &self.a_aux_density) + .field("b_input_density", &self.b_input_density) + .field("b_aux_density", &self.b_aux_density) + .field( + "a", + &self + .a + .iter() + .map(|v| format!("Fr({:?})", v)) + .collect::>(), + ) + .field( + "b", + &self + .b + .iter() + .map(|v| format!("Fr({:?})", v)) + .collect::>(), + ) + .field( + "c", + &self + .c + .iter() + .map(|v| format!("Fr({:?})", v)) + .collect::>(), + ) + .field("input_assignment", &self.input_assignment) + .field("aux_assignment", &self.aux_assignment) + .finish() + } + } - // Evaluations of A, B, C polynomials - a: Vec, - b: Vec, - c: Vec, + impl PartialEq for $type { + fn eq(&self, other: &$type) -> bool { + self.a_aux_density == other.a_aux_density + && self.b_input_density == other.b_input_density + && self.b_aux_density == other.b_aux_density + && self.a == other.a + && self.b == other.b + && self.c == other.c + && self.input_assignment == other.input_assignment + && self.aux_assignment == other.aux_assignment + } + } - // Assignments of variables - input_assignment: Vec, - aux_assignment: Vec, -} + impl bc::ConstraintSystem for $type { + type Root = Self; -impl fmt::Debug for ProvingAssignment { - fn fmt(&self, fmt: &mut fmt::Formatter) -> fmt::Result { - fmt.debug_struct("ProvingAssignment") - .field("a_aux_density", &self.a_aux_density) - .field("b_input_density", &self.b_input_density) - .field("b_aux_density", &self.b_aux_density) - .field( - "a", - &self - .a - .iter() - .map(|v| format!("Fr({:?})", v)) - .collect::>(), - ) - .field( - "b", - &self - .b - .iter() - .map(|v| format!("Fr({:?})", v)) - .collect::>(), - ) - .field( - "c", - &self - .c - .iter() - .map(|v| format!("Fr({:?})", v)) - .collect::>(), - ) - .field("input_assignment", &self.input_assignment) - 
.field("aux_assignment", &self.aux_assignment) - .finish() - } -} + fn new() -> Self { + Self::default() + } -impl PartialEq for ProvingAssignment { - fn eq(&self, other: &ProvingAssignment) -> bool { - self.a_aux_density == other.a_aux_density - && self.b_input_density == other.b_input_density - && self.b_aux_density == other.b_aux_density - && self.a == other.a - && self.b == other.b - && self.c == other.c - && self.input_assignment == other.input_assignment - && self.aux_assignment == other.aux_assignment - } -} + fn alloc(&mut self, _: A, f: F) -> Result + where + F: FnOnce() -> Result, + A: FnOnce() -> AR, + AR: Into, + { + self.aux_assignment.push(f()?); + self.a_aux_density.add_element(); + self.b_aux_density.add_element(); + + Ok(bc::Variable(bc::Index::Aux(self.aux_assignment.len() - 1))) + } -impl ConstraintSystem for ProvingAssignment { - type Root = Self; - - fn new() -> Self { - Self { - a_aux_density: DensityTracker::new(), - b_input_density: DensityTracker::new(), - b_aux_density: DensityTracker::new(), - a: vec![], - b: vec![], - c: vec![], - input_assignment: vec![], - aux_assignment: vec![], - } - } + fn alloc_input( + &mut self, + _: A, + f: F, + ) -> Result + where + F: FnOnce() -> Result, + A: FnOnce() -> AR, + AR: Into, + { + self.input_assignment.push(f()?); + self.b_input_density.add_element(); + + Ok(bc::Variable(bc::Index::Input( + self.input_assignment.len() - 1, + ))) + } - fn alloc(&mut self, _: A, f: F) -> Result - where - F: FnOnce() -> Result, - A: FnOnce() -> AR, - AR: Into, - { - self.aux_assignment.push(f()?); - self.a_aux_density.add_element(); - self.b_aux_density.add_element(); - - Ok(Variable(Index::Aux(self.aux_assignment.len() - 1))) - } + fn enforce(&mut self, _: A, a: LA, b: LB, c: LC) + where + A: FnOnce() -> AR, + AR: Into, + LA: FnOnce(bc::LinearCombination) -> bc::LinearCombination, + LB: FnOnce(bc::LinearCombination) -> bc::LinearCombination, + LC: FnOnce(bc::LinearCombination) -> bc::LinearCombination, + { + let a = 
a(bc::LinearCombination::zero()); + let b = b(bc::LinearCombination::zero()); + let c = c(bc::LinearCombination::zero()); + + let input_assignment = &self.input_assignment; + let aux_assignment = &self.aux_assignment; + let a_aux_density = &mut self.a_aux_density; + let b_input_density = &mut self.b_input_density; + let b_aux_density = &mut self.b_aux_density; + + let a_res = crate::lc::eval_with_trackers( + &a, + // Inputs have full density in the A query + // because there are constraints of the + // form x * 0 = 0 for each input. + None, + Some(a_aux_density), + input_assignment, + aux_assignment, + ); + + let b_res = crate::lc::eval_with_trackers( + &b, + Some(b_input_density), + Some(b_aux_density), + input_assignment, + aux_assignment, + ); + + // There is no C polynomial query, + // though there is an (beta)A + (alpha)B + C + // query for all aux variables. + // However, that query has full density. + let c_res = crate::lc::eval(&c, input_assignment, aux_assignment); + + self.a.push(a_res); + self.b.push(b_res); + self.c.push(c_res); + } - fn alloc_input(&mut self, _: A, f: F) -> Result - where - F: FnOnce() -> Result, - A: FnOnce() -> AR, - AR: Into, - { - self.input_assignment.push(f()?); - self.b_input_density.add_element(); + fn push_namespace(&mut self, _: N) + where + NR: Into, + N: FnOnce() -> NR, + { + // Do nothing; we don't care about namespaces in this context. + } - Ok(Variable(Index::Input(self.input_assignment.len() - 1))) - } + fn pop_namespace(&mut self) { + // Do nothing; we don't care about namespaces in this context. 
+ } - fn enforce(&mut self, _: A, a: LA, b: LB, c: LC) - where - A: FnOnce() -> AR, - AR: Into, - LA: FnOnce(LinearCombination) -> LinearCombination, - LB: FnOnce(LinearCombination) -> LinearCombination, - LC: FnOnce(LinearCombination) -> LinearCombination, - { - let a = a(LinearCombination::zero()); - let b = b(LinearCombination::zero()); - let c = c(LinearCombination::zero()); - - let input_assignment = &self.input_assignment; - let aux_assignment = &self.aux_assignment; - let a_aux_density = &mut self.a_aux_density; - let b_input_density = &mut self.b_input_density; - let b_aux_density = &mut self.b_aux_density; - - let a_res = lc::eval_with_trackers( - &a, - // Inputs have full density in the A query - // because there are constraints of the - // form x * 0 = 0 for each input. - None, - Some(a_aux_density), - input_assignment, - aux_assignment, - ); - - let b_res = lc::eval_with_trackers( - &b, - Some(b_input_density), - Some(b_aux_density), - input_assignment, - aux_assignment, - ); - - // There is no C polynomial query, - // though there is an (beta)A + (alpha)B + C - // query for all aux variables. - // However, that query has full density. - let c_res = c.eval(input_assignment, aux_assignment); - - self.a.push(a_res); - self.b.push(b_res); - self.c.push(c_res); - } + fn get_root(&mut self) -> &mut Self::Root { + self + } - fn push_namespace(&mut self, _: N) - where - NR: Into, - N: FnOnce() -> NR, - { - // Do nothing; we don't care about namespaces in this context. - } + fn is_extensible() -> bool { + true + } - fn pop_namespace(&mut self) { - // Do nothing; we don't care about namespaces in this context. 
- } + fn extend(&mut self, other: &Self) { + self.a_aux_density.extend(&other.a_aux_density, false); + self.b_input_density.extend(&other.b_input_density, true); + self.b_aux_density.extend(&other.b_aux_density, false); - fn get_root(&mut self) -> &mut Self::Root { - self - } + self.a.extend(&other.a); + self.b.extend(&other.b); + self.c.extend(&other.c); - fn is_extensible() -> bool { - true - } + self.input_assignment + // Skip first input, which must have been a temporarily allocated one variable. + .extend(&other.input_assignment[1..]); + self.aux_assignment.extend(&other.aux_assignment); + } + } + }; +} +pub(crate) use proving_assignment_impls; - fn extend(&mut self, other: &Self) { - self.a_aux_density.extend(&other.a_aux_density, false); - self.b_input_density.extend(&other.b_input_density, true); - self.b_aux_density.extend(&other.b_aux_density, false); +#[derive(Default)] +struct ProvingAssignment { + // Density of queries + a_aux_density: DensityTracker, + b_input_density: DensityTracker, + b_aux_density: DensityTracker, - self.a.extend(&other.a); - self.b.extend(&other.b); - self.c.extend(&other.c); + // Evaluations of A, B, C polynomials + a: Vec, + b: Vec, + c: Vec, - self.input_assignment - // Skip first input, which must have been a temporarily allocated one variable. 
- .extend(&other.input_assignment[1..]); - self.aux_assignment.extend(&other.aux_assignment); - } + // Assignments of variables + input_assignment: Vec, + aux_assignment: Vec, } +proving_assignment_impls!(ProvingAssignment); + pub(super) fn create_random_proof_batch_priority>( circuits: Vec, params: P, @@ -253,6 +264,36 @@ where prover::create_proof_batch_priority_inner(circuits, params, Some((r_s, s_s)), priority) } +fn synthesize_circuits_batch( + circuits: Vec, +) -> Result>, SynthesisError> +where + Scalar: PrimeField, + C: Circuit + Send, +{ + let start = Instant::now(); + let provers = circuits + .into_par_iter() + .map(|circuit| -> Result<_, SynthesisError> { + let mut prover = ProvingAssignment::new(); + + prover.alloc_input(|| "", || Ok(Scalar::ONE))?; + + circuit.synthesize(&mut prover)?; + + for i in 0..prover.input_assignment.len() { + prover.enforce(|| "", |lc| lc + Variable(Index::Input(i)), |lc| lc, |lc| lc); + } + + Ok(prover) + }) + .collect::, _>>()?; + + info!("synthesis time: {:?}", start.elapsed()); + + Ok(provers) +} + #[cfg(test)] mod tests { use super::*; diff --git a/src/groth16/prover/native.rs b/src/groth16/prover/native.rs index 69d4fa8b..39bcaa94 100644 --- a/src/groth16/prover/native.rs +++ b/src/groth16/prover/native.rs @@ -6,7 +6,7 @@ use std::{ time::Instant, }; -use bellpepper_core::{Circuit, ConstraintSystem, Index, SynthesisError, Variable}; +use bellpepper_core::{Circuit, SynthesisError}; use ec_gpu_gen::{ multiexp_cpu::FullDensity, threadpool::{Worker, THREAD_POOL}, @@ -47,7 +47,7 @@ where { info!("Bellperson {} is being used!", BELLMAN_VERSION); - let provers = synthesize_circuits_batch(circuits)?; + let provers = super::synthesize_circuits_batch(circuits)?; proof_circuits_batch(provers, params, randomization, priority) } @@ -445,33 +445,3 @@ where .collect::>(); Ok(Arc::new(a)) } - -fn synthesize_circuits_batch( - circuits: Vec, -) -> Result>, SynthesisError> -where - Scalar: PrimeField, - C: Circuit + Send, -{ - let start = 
Instant::now(); - let provers = circuits - .into_par_iter() - .map(|circuit| -> Result<_, SynthesisError> { - let mut prover = ProvingAssignment::new(); - - prover.alloc_input(|| "", || Ok(Scalar::ONE))?; - - circuit.synthesize(&mut prover)?; - - for i in 0..prover.input_assignment.len() { - prover.enforce(|| "", |lc| lc + Variable(Index::Input(i)), |lc| lc, |lc| lc); - } - - Ok(prover) - }) - .collect::, _>>()?; - - info!("synthesis time: {:?}", start.elapsed()); - - Ok(provers) -} diff --git a/src/groth16/prover/supraseal.rs b/src/groth16/prover/supraseal.rs index 6411ff9a..2748cf27 100644 --- a/src/groth16/prover/supraseal.rs +++ b/src/groth16/prover/supraseal.rs @@ -1,8 +1,9 @@ //! Prover implementation implemented using SupraSeal (C++). -use std::time::Instant; +use std::{cmp, collections::BTreeMap, io, ops, thread, time::Instant}; use bellpepper_core::{Circuit, ConstraintSystem, Index, SynthesisError, Variable}; +use ec_gpu_gen::multiexp_cpu::DensityTracker; use ff::{Field, PrimeField}; use log::info; use pairing::MultiMillerLoop; @@ -11,6 +12,268 @@ use rayon::iter::{IntoParallelIterator, ParallelIterator}; use super::{ParameterSource, Proof, ProvingAssignment}; use crate::{gpu::GpuName, BELLMAN_VERSION}; +/// The number of circuits that will be synthesized in parallel. +/// +/// Due to a memory optimized representation it's possible to synthesize circuits in bigger batches +/// than proving them. That optimized representation will then be transformed into the one the +/// prover expects in a separate step. +const SYNTHESIZE_BATCH_SIZE: usize = 20; + +/// The number of synthesized circuits that are passed on to the prover. Those need a lot of memory +/// and the proving is mostly sequential anyway, which means that bigger sizes won't result in +/// much faster proving times. Lower memory usage is usually worth the trade-off. +const PROVER_BATCH_SIZE: usize = 5; + +/// The number of scalars we pack into a single byte.
+const SCALARS_PER_BYTE: usize = 4; + +/// An enum to distinguish between common and other scalar values. +#[derive(Clone, Copy, Debug, Eq, PartialEq)] +enum ScalarValue { + Zero = 0, + One = 1, + Two = 2, + Other = 3, +} + +impl Default for ScalarValue { + fn default() -> Self { + Self::Zero + } +} + +/// Use a custom representation in order to use less memory. In Filecoin the synthesized exponents +/// are mostly zeros, ones or twos. Those can be represented with 2 bits instead of their full field +/// representation of 256 bits. Other values have a slight overhead, but as there are so few, it +/// doesn't matter much. +#[derive(Debug, Eq, PartialEq)] +pub struct ScalarVec { + /// The scalar representing zero. It's owned here so that it can be referenced later. + zero: Scalar, + /// The scalar representing one. It's owned here so that it can be referenced later. + one: Scalar, + /// The scalar representing two. It's owned here so that it can be referenced later. + two: Scalar, + /// This is the vector of all values. 4 values are packed into a single byte. If the value is + /// [`ScalarValue::Other`], then the actual value is stored in the `other` field, + /// keyed by the current position in the list of values (where the position is the one as if it + /// wouldn't be packed). + values: Vec, + /// In case the value is [`ScalarValue::Other`], then the actual scalar is stored in this map, + /// where the key is the position within the list of values. + other: BTreeMap, + /// Temporary buffer before the values are packed into a single byte. + buffer: [ScalarValue; SCALARS_PER_BYTE], + /// The offset where the next value within the buffer will be written to.
+ buffer_pos: usize, +} + +impl ScalarVec { + pub fn new() -> Self { + Self { + zero: Scalar::ZERO, + one: Scalar::ONE, + two: Scalar::ONE.double(), + values: Vec::new(), + other: BTreeMap::new(), + buffer: [ScalarValue::Zero; SCALARS_PER_BYTE], + buffer_pos: 0, + } + } + + /// The number of scalars stored. + pub fn len(&self) -> usize { + // The scalar values are 2 bit, we store 4 of them in a single byte. + (self.values.len() * SCALARS_PER_BYTE) + self.buffer_pos + } + + pub fn push(&mut self, scalar: Scalar) { + let value = if scalar == Scalar::ZERO { + ScalarValue::Zero + } else if scalar == Scalar::ONE { + ScalarValue::One + } else if scalar == self.two { + ScalarValue::Two + } else { + self.other.insert(self.len(), scalar); + ScalarValue::Other + }; + + if self.buffer_pos < SCALARS_PER_BYTE { + self.buffer[self.buffer_pos] = value; + self.buffer_pos += 1; + } + + // The buffer is full, flush the values into the actual data vector. + if self.buffer_pos == SCALARS_PER_BYTE { + self.buffer_pos = 0; + self.flush_buffer(); + } + } + + pub fn iter(&self) -> ScalarVecIterator { + ScalarVecIterator { + scalar_vec: self, + pos: 0, + } + } + + /// Transform into a representation where all elements are arranged in contiguous memory. + pub fn into_vec(self) -> Vec { + // NOTE vmx 2023-12-13: A simple collect of the iterator is slower when micro-benchmarking. + let mut output = Vec::with_capacity(self.len()); + for scalar in self.iter() { + output.push(*scalar) + } + output + } + + /// Flush the buffer into the actual vector of data.
+ fn flush_buffer(&mut self) { + let mut data_byte = 0; + data_byte |= self.buffer[0] as u8; + data_byte |= (self.buffer[1] as u8) << 2; + data_byte |= (self.buffer[2] as u8) << 4; + data_byte |= (self.buffer[3] as u8) << 6; + self.values.push(data_byte); + } + + fn get(&self, pos: usize) -> Option<&Scalar> { + if pos < self.len() { + // The position is within the stored values (not the buffer) + if pos < self.values.len() * SCALARS_PER_BYTE { + let value_byte = &self.values[pos / SCALARS_PER_BYTE]; + let within_buffer_pos = pos % SCALARS_PER_BYTE; + // Determine where the bits we want to read. Each value is 2 bits => `* 2`. + let bitmask = 0b11 << (within_buffer_pos * 2); + // Read those bits and shift them back, so that it matches the enum values. + let value = (value_byte & bitmask) >> (within_buffer_pos * 2); + + if value == ScalarValue::Zero as u8 { + Some(&self.zero) + } else if value == ScalarValue::One as u8 { + Some(&self.one) + } else if value == ScalarValue::Two as u8 { + Some(&self.two) + } else if value == ScalarValue::Other as u8 { + self.other.get(&pos) + } else { + unreachable!() + } + } else { + let within_buffer_pos = pos - (self.values.len() * SCALARS_PER_BYTE); + match self.buffer[within_buffer_pos] { + ScalarValue::Zero => Some(&self.zero), + ScalarValue::One => Some(&self.one), + ScalarValue::Two => Some(&self.two), + ScalarValue::Other => self.other.get(&pos), + } + } + } else { + None + } + } +} + +impl Default for ScalarVec { + fn default() -> Self { + Self::new() + } +} + +impl<'a, Scalar: PrimeField> Extend<&'a Scalar> for ScalarVec { + fn extend>(&mut self, iter: T) { + for scalar in iter { + self.push(*scalar); + } + } +} + +impl ops::Index for ScalarVec { + type Output = Scalar; + + fn index(&self, index: usize) -> &Self::Output { + self.get(index).expect("index out of range") + } +} + +pub struct ScalarVecIterator<'a, Scalar> { + scalar_vec: &'a ScalarVec, + pos: usize, +} + +impl<'a, Scalar: PrimeField> Iterator for 
ScalarVecIterator<'a, Scalar> { + type Item = &'a Scalar; + + fn next(&mut self) -> Option { + // Early return in case index is out of range. + let value = self.scalar_vec.get(self.pos)?; + self.pos += 1; + Some(value) + } +} + +impl<'a, Scalar: PrimeField> IntoIterator for &'a ScalarVec { + type Item = &'a Scalar; + type IntoIter = ScalarVecIterator<'a, Scalar>; + + fn into_iter(self) -> Self::IntoIter { + ScalarVecIterator { + scalar_vec: self, + pos: 0, + } + } +} + +/// A copy of `[prover::ProvingAssignment` which has a lower memory footprint. +/// +/// At the cost of the need to convert into the usual representation when it's passed into the +/// prover. +#[derive(Default)] +struct ProvingAssignmentCompact { + // Density of queries + a_aux_density: DensityTracker, + b_input_density: DensityTracker, + b_aux_density: DensityTracker, + + // Evaluations of A, B, C polynomials + a: ScalarVec, + b: ScalarVec, + c: ScalarVec, + + // Assignments of variables + input_assignment: Vec, + aux_assignment: ScalarVec, +} + +super::proving_assignment_impls!(ProvingAssignmentCompact); + +impl From> for ProvingAssignment { + fn from(assignment: ProvingAssignmentCompact) -> Self { + let mut a = Vec::new(); + let mut b = Vec::new(); + let mut c = Vec::new(); + let mut aux_assignment = Vec::new(); + rayon::scope(|s| { + s.spawn(|_| a = assignment.a.into_vec()); + s.spawn(|_| b = assignment.b.into_vec()); + s.spawn(|_| c = assignment.c.into_vec()); + s.spawn(|_| aux_assignment = assignment.aux_assignment.into_vec()); + }); + + Self { + a_aux_density: assignment.a_aux_density, + b_input_density: assignment.b_input_density, + b_aux_density: assignment.b_aux_density, + a, + b, + c, + input_assignment: assignment.input_assignment, + aux_assignment, + } + } +} + impl From<&ProvingAssignment> for supraseal_c2::Assignment where Scalar: PrimeField, @@ -65,15 +328,164 @@ where BELLMAN_VERSION ); - let provers = synthesize_circuits_batch(circuits)?; - proof_circuits_batch(provers, params, 
randomization) + let (r_s, s_s) = randomization.unwrap_or(( + vec![E::Fr::ZERO; circuits.len()], + vec![E::Fr::ZERO; circuits.len()], + )); + + // The memory-optimized version, which is more CPU intensive, only makes sense for larger batch + // sizes. Hence use the normal synthesis for smaller batches. + if circuits.len() <= 10 { + let provers = super::synthesize_circuits_batch(circuits)?; + proof_circuits_batch(provers, params, (r_s, s_s)) + } else { + create_proof_batch_pipelined(circuits, params, (r_s, s_s)) + } +} + +/// Create a custom [`SynthesisError`]. +/// +/// The closest to a custom error is the IO Error, hence use that. +fn custom_error(error: &str) -> SynthesisError { + SynthesisError::IoError(io::Error::new(io::ErrorKind::Other, error)) +} + +/// The circuit synthesis is CPU intensive. It isn't parallelized itself, hence we parallelize by +/// running several syntheses at the same time. The proving isn't that CPU intensive. +/// Therefore we interleave the synthesis with the proving. +/// We create a large batch of synthesized circuits, and then prove them in smaller batches as the +/// proving takes way more memory. Whenever the proving of a synthesized batch starts, we kick off a +/// new batch for synthesis, while the proving is going on. We achieve that by having a bounded +/// message queue which blocks after a certain number of batches. +/// +/// The flow looks like this: +/// +/// - Each uppercase letter corresponds to one proof. +/// - The total number of proofs is 18. +/// - The batch size for synthesis is 6. +/// - The batch size for proving is 2. +/// - The message queue size is the batch size of the synthesis divided by the batch size of +/// the proving minus one, so that the queue blocks before the next synthesis starts. +/// => (6 / 2) - 1 = 2. +/// +/// ```text +/// The downwards axis is time. The Synthesize and Prover thread run in parallel.
If things +/// appear on the same line it means that they start at the same time, but they might take +/// different amounts of time. +/// +/// Description Synthesize thread Message queue Prover thread +/// +/// The full set of proofs is: +/// A B C D E F G H I J K L M N O P Q R +/// +/// Start with synthesizing a batch of A B C D E F +/// circuits. +/// +/// Once finished, put them into the (C D) (A B) +/// message queue. One item in the queue +/// is one batch for the prover. +/// +/// Once the prover starts, the last item G H I J K L (E F) (C D) A B +/// of the synthesis batch is pushed into (E F) C D +/// queue, hence a new synthesis starts. E F +/// +/// The synthesis keeps pushing into the (I J) (G H) +/// queue whenever there's a free spot. +/// +/// Keep repeating the previous two steps. M N O P Q R (K L) (I J) G H +/// (K L) I J +/// +/// (O P) (M N) +/// +/// All circuits were synthesized, hence (Q R) (O P) M N +/// only the proving is to be done. (Q R) O P +/// Q R +/// ``` +fn create_proof_batch_pipelined( + circuits: Vec, + params: P, + randomization: (Vec, Vec), +) -> Result>, SynthesisError> +where + E: MultiMillerLoop, + C: Circuit + Send, + E::Fr: GpuName, + E::G1Affine: GpuName, + E::G2Affine: GpuName, + P: ParameterSource, +{ + let (r_s, s_s) = randomization; + assert_eq!(circuits.len(), r_s.len()); + assert_eq!(circuits.len(), s_s.len()); + + // This channel size makes sure that the next synthesizing batch starts as soon as the first + // proving batch starts. + let (sender, receiver) = + crossbeam_channel::bounded((SYNTHESIZE_BATCH_SIZE / PROVER_BATCH_SIZE) - 1); + + let num_circuits = circuits.len(); + + thread::scope(|s| { + let synthesis = s.spawn(move || -> Result<(), SynthesisError> { + let mut circuits_mut = circuits; + // `move` gives this thread ownership of `sender`, so the channel closes when it ends + // (even on error). The circuits are drained batch-wise, so no extra copy is kept around.
+ while !circuits_mut.is_empty() { + let size = cmp::min(SYNTHESIZE_BATCH_SIZE, circuits_mut.len()); + let batch = circuits_mut.drain(0..size).collect(); + let mut provers = synthesize_circuits_batch(batch)?; + // Send in small batches, as the memory footprint increases in the proving stage. A + // failed send means the prover has stopped; its error is reported when it's joined. + while !provers.is_empty() { + let provers_size = cmp::min(PROVER_BATCH_SIZE, provers.len()); + let provers_batch: Vec<_> = provers.drain(0..provers_size).collect(); + if sender.send(provers_batch).is_err() { + return Ok(()); + } + } + } + Ok(()) + }); + + let prover = s.spawn(move || { + let mut groth_proofs = Vec::with_capacity(num_circuits); + // There is one randomness element per circuit, hence we can use that as termination + // condition for the loop. `move` closes `receiver` when this thread ends or fails. + let mut r_s_mut = r_s; + let mut s_s_mut = s_s; + while !r_s_mut.is_empty() { + let provers_compact = receiver + .recv() + .map_err(|_| custom_error("cannot receive circuits"))?; + let r_s_batch = r_s_mut.drain(0..provers_compact.len()).collect(); + let s_s_batch = s_s_mut.drain(0..provers_compact.len()).collect(); + + // Transform the provers from the memory efficient representation into one suitable + // to be used with SupraSeal. + log::trace!("converting representation of provers"); + let provers: Vec> = + provers_compact.into_par_iter().map(Into::into).collect(); + + let proofs = proof_circuits_batch(provers, params.clone(), (r_s_batch, s_s_batch))?; + groth_proofs.extend(proofs); + } + Ok(groth_proofs) + }); + + synthesis + .join() + .map_err(|_| custom_error("cannot synthesize circuits"))??; + // The prover result is what we actually return. + prover + .join() + .map_err(|_| custom_error("cannot prove circuits"))?
+ }) } -#[allow(clippy::type_complexity)] fn proof_circuits_batch( - provers: std::vec::Vec>, + provers: Vec>, params: P, - randomization: Option<(Vec, Vec)>, + randomization: (Vec, Vec), ) -> Result>, SynthesisError> where E: MultiMillerLoop, @@ -87,10 +499,7 @@ where info!("starting proof timer"); let num_circuits = provers.len(); - let (r_s, s_s) = randomization.unwrap_or(( - vec![E::Fr::ZERO; num_circuits], - vec![E::Fr::ZERO; num_circuits], - )); + let (r_s, s_s) = randomization; // Make sure all circuits have the same input len. for prover in &provers { @@ -129,10 +538,11 @@ where Ok(proofs) } -#[allow(clippy::type_complexity)] +// The only difference to [`groth16::prover::synthesize_circuits_batch`] is that it uses the +// memory optimized representation for the proving assignment. fn synthesize_circuits_batch( circuits: Vec, -) -> Result>, SynthesisError> +) -> Result>, SynthesisError> where Scalar: PrimeField, C: Circuit + Send, @@ -142,7 +552,7 @@ where let provers = circuits .into_par_iter() .map(|circuit| -> Result<_, SynthesisError> { - let mut prover = ProvingAssignment::new(); + let mut prover = ProvingAssignmentCompact::new(); prover.alloc_input(|| "", || Ok(Scalar::ONE))?; diff --git a/src/lc.rs b/src/lc.rs index 12facbc9..f4af8e20 100644 --- a/src/lc.rs +++ b/src/lc.rs @@ -1,14 +1,59 @@ -use crate::LinearCombination; +use std::ops; + use ec_gpu_gen::multiexp_cpu::DensityTracker; use ff::PrimeField; -pub fn eval_with_trackers( - lc: &LinearCombination, - mut input_density: Option<&mut DensityTracker>, - mut aux_density: Option<&mut DensityTracker>, +use crate::LinearCombination; + +/// Copy of `eval` from bellpepper that also works with a +/// [`groth16::prover::supraseal::ScalarVec`]. +// `T` is a slice of `Scalar`s.
This way it works with `&[Scalar]` as well as `&ScalarVec` +pub(crate) fn eval<'a, Scalar, T>( + lc: &'a LinearCombination, + input_assignment: &[Scalar], + aux_assignment: &'a T, +) -> Scalar +where + Scalar: PrimeField + ops::AddAssign, + T: ops::Index, + T::Output: PrimeField + std::ops::MulAssign<&'a Scalar>, +{ + let mut acc = Scalar::ZERO; + + let one = Scalar::ONE; + + for (index, coeff) in lc.iter_inputs() { + let mut tmp = input_assignment[*index]; + if coeff != &one { + tmp *= coeff; + } + acc += tmp; + } + + for (index, coeff) in lc.iter_aux() { + let mut tmp = aux_assignment[*index]; + if coeff != &one { + tmp *= coeff; + } + acc += tmp; + } + + acc +} + +// `T` is a slice of `Scalar`s. This way it works with `&[Scalar]` as well as `&ScalarVec` +pub(crate) fn eval_with_trackers<'a, Scalar, T>( + lc: &'a LinearCombination, + mut input_density: Option<&'a mut DensityTracker>, + mut aux_density: Option<&'a mut DensityTracker>, input_assignment: &[Scalar], - aux_assignment: &[Scalar], -) -> Scalar { + aux_assignment: &'a T, +) -> Scalar +where + Scalar: PrimeField + ops::AddAssign, + T: ops::Index, + T::Output: PrimeField + std::ops::MulAssign<&'a Scalar>, +{ let mut acc = Scalar::ZERO; let one = Scalar::ONE;