From 011c3ca368dd715141de460d51ab169894a83ca6 Mon Sep 17 00:00:00 2001 From: Volker Mische Date: Wed, 29 Nov 2023 15:12:05 +0100 Subject: [PATCH 1/3] refactor: refactor the proving into a separate function call This is the first step of make more code share between the native and the SupraSeal implementation possible. --- src/groth16/prover/supraseal.rs | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/src/groth16/prover/supraseal.rs b/src/groth16/prover/supraseal.rs index e367b041..6411ff9a 100644 --- a/src/groth16/prover/supraseal.rs +++ b/src/groth16/prover/supraseal.rs @@ -66,7 +66,22 @@ where ); let provers = synthesize_circuits_batch(circuits)?; + proof_circuits_batch(provers, params, randomization) +} +#[allow(clippy::type_complexity)] +fn proof_circuits_batch( + provers: std::vec::Vec>, + params: P, + randomization: Option<(Vec, Vec)>, +) -> Result>, SynthesisError> +where + E: MultiMillerLoop, + E::Fr: GpuName, + E::G1Affine: GpuName, + E::G2Affine: GpuName, + P: ParameterSource, +{ // Start fft/multiexp prover timer let start = Instant::now(); info!("starting proof timer"); From d4bdbe438244803868e91a08438c9541fac90380 Mon Sep 17 00:00:00 2001 From: Volker Mische Date: Wed, 29 Nov 2023 15:17:34 +0100 Subject: [PATCH 2/3] refactor: put the proving into its own function Refactor things similar to the SupraSeal code. --- src/groth16/prover/native.rs | 94 +++++++++++++++++++----------------- 1 file changed, 50 insertions(+), 44 deletions(-) diff --git a/src/groth16/prover/native.rs b/src/groth16/prover/native.rs index 7bd9e1d6..69d4fa8b 100644 --- a/src/groth16/prover/native.rs +++ b/src/groth16/prover/native.rs @@ -47,8 +47,53 @@ where { info!("Bellperson {} is being used!", BELLMAN_VERSION); - let (start, mut provers, input_assignments, aux_assignments) = - synthesize_circuits_batch(circuits)?; + let provers = synthesize_circuits_batch(circuits)?; + proof_circuits_batch(provers, params, randomization, priority) +} + +#[allow(clippy::type_complexity)] +fn proof_circuits_batch( + mut provers: std::vec::Vec>, + params: P, + randomization: Option<(Vec, Vec)>, + priority: bool, +) -> Result>, SynthesisError> +where + E: MultiMillerLoop, + E::Fr: GpuName, + E::G1Affine: GpuName, + E::G2Affine: GpuName, + P: ParameterSource, +{ + // Start fft/multiexp prover timer + let start = Instant::now(); + info!("starting proof timer"); + + let input_assignments = provers + .par_iter_mut() + .map(|prover| { + let input_assignment = std::mem::take(&mut prover.input_assignment); + Arc::new( + input_assignment + .into_iter() + .map(|s| s.to_repr()) + .collect::>(), + ) + }) + .collect::>(); + + let aux_assignments = provers + .par_iter_mut() + .map(|prover| { + let aux_assignment = std::mem::take(&mut prover.aux_assignment); + Arc::new( + aux_assignment + .into_iter() + .map(|s| s.to_repr()) + .collect::>(), + ) + }) + .collect::>(); let worker = Worker::new(); let input_len = input_assignments[0].len(); @@ -401,24 +446,15 @@ where Ok(Arc::new(a)) } -#[allow(clippy::type_complexity)] fn synthesize_circuits_batch( circuits: Vec, -) -> Result< - ( - Instant, - std::vec::Vec>, - std::vec::Vec::Repr>>>, - std::vec::Vec::Repr>>>, - ), - SynthesisError, -> +) -> Result>, SynthesisError> where Scalar: PrimeField, C: Circuit + Send, { let start = Instant::now(); - let mut provers = circuits + let provers = circuits .into_par_iter() .map(|circuit| -> Result<_, SynthesisError> { let mut prover = ProvingAssignment::new(); @@ -437,35 +473,5 @@ where info!("synthesis time: {:?}", start.elapsed()); 
- // Start fft/multiexp prover timer - let start = Instant::now(); - info!("starting proof timer"); - - let input_assignments = provers - .par_iter_mut() - .map(|prover| { - let input_assignment = std::mem::take(&mut prover.input_assignment); - Arc::new( - input_assignment - .into_iter() - .map(|s| s.to_repr()) - .collect::>(), - ) - }) - .collect::>(); - - let aux_assignments = provers - .par_iter_mut() - .map(|prover| { - let aux_assignment = std::mem::take(&mut prover.aux_assignment); - Arc::new( - aux_assignment - .into_iter() - .map(|s| s.to_repr()) - .collect::>(), - ) - }) - .collect::>(); - - Ok((start, provers, input_assignments, aux_assignments)) + Ok(provers) } From 69b83f69562dca922db075bdb7db9f3cf79fd1ef Mon Sep 17 00:00:00 2001 From: Volker Mische Date: Mon, 11 Dec 2023 13:54:03 +0100 Subject: [PATCH 3/3] feat: make SupraSeal take less memory and work with large batches SupraSeal is so efficient as it leverages the fact that the Filecoin circuits have a specific shape. This can also be used for a more memory efficient representation during synthesis. This makes the synthesis less memory intensive and thus makes it possible to synthesize more circuits in parallel. The prover still needs the normal representation, hence is still memory intensive. We run a batch size of 5 instead of 10, which even reduces that memory footprint a lot which justified the additional computation costs. Those changes enables proving a large number of proofs pretty efficiently, which makes it suitable for Filecoins non-interactive PoRep. All those optimization are only for the SupraSeal code path as they are kind of Filecoin specific. --- src/groth16/params.rs | 2 +- src/groth16/prover/mod.rs | 397 ++++++++++++++++------------- src/groth16/prover/native.rs | 34 +-- src/groth16/prover/supraseal.rs | 436 +++++++++++++++++++++++++++++++- src/lc.rs | 59 ++++- 5 files changed, 697 insertions(+), 231 deletions(-) diff --git a/src/groth16/params.rs b/src/groth16/params.rs index e909c640..0de903de 100644 --- a/src/groth16/params.rs +++ b/src/groth16/params.rs @@ -405,7 +405,7 @@ where } } -pub trait ParameterSource: Send + Sync +pub trait ParameterSource: Clone + Send + Sync where E: MultiMillerLoop, { diff --git a/src/groth16/prover/mod.rs b/src/groth16/prover/mod.rs index 39a08b19..7b8f80ab 100644 --- a/src/groth16/prover/mod.rs +++ b/src/groth16/prover/mod.rs @@ -3,213 +3,224 @@ mod native; #[cfg(feature = "cuda-supraseal")] mod supraseal; -use std::fmt; +use std::time::Instant; -use bellpepper_core::{ - Circuit, ConstraintSystem, Index, LinearCombination, SynthesisError, Variable, -}; +use bellpepper_core::{Circuit, ConstraintSystem, Index, SynthesisError, Variable}; use ec_gpu_gen::multiexp_cpu::DensityTracker; use ff::{Field, PrimeField}; +use log::info; use pairing::MultiMillerLoop; use rand_core::RngCore; +use rayon::iter::{IntoParallelIterator, ParallelIterator}; #[cfg(not(feature = "cuda-supraseal"))] use self::native as prover; #[cfg(feature = "cuda-supraseal")] use self::supraseal as prover; use super::{ParameterSource, Proof}; -use crate::{gpu::GpuName, lc}; - -struct ProvingAssignment { - // Density of queries - a_aux_density: DensityTracker, - b_input_density: DensityTracker, - b_aux_density: DensityTracker, +use crate::gpu::GpuName; + +/// Implement various traits for the proving assignment. +/// +/// It's a macro so that it can be used for different types. SupraSeal is using some special memory +/// optimized data structures internally. 
Using a macro makes sure that the implementation will not +/// diverge over time. +macro_rules! proving_assignment_impls { + ($type:ty) => { + use bellpepper_core as bc; + impl std::fmt::Debug for $type { + fn fmt(&self, fmt: &mut std::fmt::Formatter) -> std::fmt::Result { + fmt.debug_struct(stringify!($type)) + .field("a_aux_density", &self.a_aux_density) + .field("b_input_density", &self.b_input_density) + .field("b_aux_density", &self.b_aux_density) + .field( + "a", + &self + .a + .iter() + .map(|v| format!("Fr({:?})", v)) + .collect::>(), + ) + .field( + "b", + &self + .b + .iter() + .map(|v| format!("Fr({:?})", v)) + .collect::>(), + ) + .field( + "c", + &self + .c + .iter() + .map(|v| format!("Fr({:?})", v)) + .collect::>(), + ) + .field("input_assignment", &self.input_assignment) + .field("aux_assignment", &self.aux_assignment) + .finish() + } + } - // Evaluations of A, B, C polynomials - a: Vec, - b: Vec, - c: Vec, + impl PartialEq for $type { + fn eq(&self, other: &$type) -> bool { + self.a_aux_density == other.a_aux_density + && self.b_input_density == other.b_input_density + && self.b_aux_density == other.b_aux_density + && self.a == other.a + && self.b == other.b + && self.c == other.c + && self.input_assignment == other.input_assignment + && self.aux_assignment == other.aux_assignment + } + } - // Assignments of variables - input_assignment: Vec, - aux_assignment: Vec, -} + impl bc::ConstraintSystem for $type { + type Root = Self; -impl fmt::Debug for ProvingAssignment { - fn fmt(&self, fmt: &mut fmt::Formatter) -> fmt::Result { - fmt.debug_struct("ProvingAssignment") - .field("a_aux_density", &self.a_aux_density) - .field("b_input_density", &self.b_input_density) - .field("b_aux_density", &self.b_aux_density) - .field( - "a", - &self - .a - .iter() - .map(|v| format!("Fr({:?})", v)) - .collect::>(), - ) - .field( - "b", - &self - .b - .iter() - .map(|v| format!("Fr({:?})", v)) - .collect::>(), - ) - .field( - "c", - &self - .c - .iter() - .map(|v| format!("Fr({:?})", v)) - .collect::>(), - ) - .field("input_assignment", &self.input_assignment) - .field("aux_assignment", &self.aux_assignment) - .finish() - } -} + fn new() -> Self { + Self::default() + } -impl PartialEq for ProvingAssignment { - fn eq(&self, other: &ProvingAssignment) -> bool { - self.a_aux_density == other.a_aux_density - && self.b_input_density == other.b_input_density - && self.b_aux_density == other.b_aux_density - && self.a == other.a - && self.b == other.b - && self.c == other.c - && self.input_assignment == other.input_assignment - && self.aux_assignment == other.aux_assignment - } -} + fn alloc(&mut self, _: A, f: F) -> Result + where + F: FnOnce() -> Result, + A: FnOnce() -> AR, + AR: Into, + { + self.aux_assignment.push(f()?); + self.a_aux_density.add_element(); + self.b_aux_density.add_element(); + + Ok(bc::Variable(bc::Index::Aux(self.aux_assignment.len() - 1))) + } -impl ConstraintSystem for ProvingAssignment { - type Root = Self; - - fn new() -> Self { - Self { - a_aux_density: DensityTracker::new(), - b_input_density: DensityTracker::new(), - b_aux_density: DensityTracker::new(), - a: vec![], - b: vec![], - c: vec![], - input_assignment: vec![], - aux_assignment: vec![], - } - } + fn alloc_input( + &mut self, + _: A, + f: F, + ) -> Result + where + F: FnOnce() -> Result, + A: FnOnce() -> AR, + AR: Into, + { + self.input_assignment.push(f()?); + self.b_input_density.add_element(); + + Ok(bc::Variable(bc::Index::Input( + self.input_assignment.len() - 1, + ))) + } - fn alloc(&mut self, _: A, f: F) 
-> Result - where - F: FnOnce() -> Result, - A: FnOnce() -> AR, - AR: Into, - { - self.aux_assignment.push(f()?); - self.a_aux_density.add_element(); - self.b_aux_density.add_element(); - - Ok(Variable(Index::Aux(self.aux_assignment.len() - 1))) - } + fn enforce(&mut self, _: A, a: LA, b: LB, c: LC) + where + A: FnOnce() -> AR, + AR: Into, + LA: FnOnce(bc::LinearCombination) -> bc::LinearCombination, + LB: FnOnce(bc::LinearCombination) -> bc::LinearCombination, + LC: FnOnce(bc::LinearCombination) -> bc::LinearCombination, + { + let a = a(bc::LinearCombination::zero()); + let b = b(bc::LinearCombination::zero()); + let c = c(bc::LinearCombination::zero()); + + let input_assignment = &self.input_assignment; + let aux_assignment = &self.aux_assignment; + let a_aux_density = &mut self.a_aux_density; + let b_input_density = &mut self.b_input_density; + let b_aux_density = &mut self.b_aux_density; + + let a_res = crate::lc::eval_with_trackers( + &a, + // Inputs have full density in the A query + // because there are constraints of the + // form x * 0 = 0 for each input. + None, + Some(a_aux_density), + input_assignment, + aux_assignment, + ); + + let b_res = crate::lc::eval_with_trackers( + &b, + Some(b_input_density), + Some(b_aux_density), + input_assignment, + aux_assignment, + ); + + // There is no C polynomial query, + // though there is an (beta)A + (alpha)B + C + // query for all aux variables. + // However, that query has full density. + let c_res = crate::lc::eval(&c, input_assignment, aux_assignment); + + self.a.push(a_res); + self.b.push(b_res); + self.c.push(c_res); + } - fn alloc_input(&mut self, _: A, f: F) -> Result - where - F: FnOnce() -> Result, - A: FnOnce() -> AR, - AR: Into, - { - self.input_assignment.push(f()?); - self.b_input_density.add_element(); + fn push_namespace(&mut self, _: N) + where + NR: Into, + N: FnOnce() -> NR, + { + // Do nothing; we don't care about namespaces in this context. + } - Ok(Variable(Index::Input(self.input_assignment.len() - 1))) - } + fn pop_namespace(&mut self) { + // Do nothing; we don't care about namespaces in this context. + } - fn enforce(&mut self, _: A, a: LA, b: LB, c: LC) - where - A: FnOnce() -> AR, - AR: Into, - LA: FnOnce(LinearCombination) -> LinearCombination, - LB: FnOnce(LinearCombination) -> LinearCombination, - LC: FnOnce(LinearCombination) -> LinearCombination, - { - let a = a(LinearCombination::zero()); - let b = b(LinearCombination::zero()); - let c = c(LinearCombination::zero()); - - let input_assignment = &self.input_assignment; - let aux_assignment = &self.aux_assignment; - let a_aux_density = &mut self.a_aux_density; - let b_input_density = &mut self.b_input_density; - let b_aux_density = &mut self.b_aux_density; - - let a_res = lc::eval_with_trackers( - &a, - // Inputs have full density in the A query - // because there are constraints of the - // form x * 0 = 0 for each input. - None, - Some(a_aux_density), - input_assignment, - aux_assignment, - ); - - let b_res = lc::eval_with_trackers( - &b, - Some(b_input_density), - Some(b_aux_density), - input_assignment, - aux_assignment, - ); - - // There is no C polynomial query, - // though there is an (beta)A + (alpha)B + C - // query for all aux variables. - // However, that query has full density. 
- let c_res = c.eval(input_assignment, aux_assignment); - - self.a.push(a_res); - self.b.push(b_res); - self.c.push(c_res); - } + fn get_root(&mut self) -> &mut Self::Root { + self + } - fn push_namespace(&mut self, _: N) - where - NR: Into, - N: FnOnce() -> NR, - { - // Do nothing; we don't care about namespaces in this context. - } + fn is_extensible() -> bool { + true + } - fn pop_namespace(&mut self) { - // Do nothing; we don't care about namespaces in this context. - } + fn extend(&mut self, other: &Self) { + self.a_aux_density.extend(&other.a_aux_density, false); + self.b_input_density.extend(&other.b_input_density, true); + self.b_aux_density.extend(&other.b_aux_density, false); - fn get_root(&mut self) -> &mut Self::Root { - self - } + self.a.extend(&other.a); + self.b.extend(&other.b); + self.c.extend(&other.c); - fn is_extensible() -> bool { - true - } + self.input_assignment + // Skip first input, which must have been a temporarily allocated one variable. + .extend(&other.input_assignment[1..]); + self.aux_assignment.extend(&other.aux_assignment); + } + } + }; +} +pub(crate) use proving_assignment_impls; - fn extend(&mut self, other: &Self) { - self.a_aux_density.extend(&other.a_aux_density, false); - self.b_input_density.extend(&other.b_input_density, true); - self.b_aux_density.extend(&other.b_aux_density, false); +#[derive(Default)] +struct ProvingAssignment { + // Density of queries + a_aux_density: DensityTracker, + b_input_density: DensityTracker, + b_aux_density: DensityTracker, - self.a.extend(&other.a); - self.b.extend(&other.b); - self.c.extend(&other.c); + // Evaluations of A, B, C polynomials + a: Vec, + b: Vec, + c: Vec, - self.input_assignment - // Skip first input, which must have been a temporarily allocated one variable. 
- .extend(&other.input_assignment[1..]); - self.aux_assignment.extend(&other.aux_assignment); - } + // Assignments of variables + input_assignment: Vec, + aux_assignment: Vec, } +proving_assignment_impls!(ProvingAssignment); + pub(super) fn create_random_proof_batch_priority>( circuits: Vec, params: P, @@ -253,6 +264,36 @@ where prover::create_proof_batch_priority_inner(circuits, params, Some((r_s, s_s)), priority) } +fn synthesize_circuits_batch( + circuits: Vec, +) -> Result>, SynthesisError> +where + Scalar: PrimeField, + C: Circuit + Send, +{ + let start = Instant::now(); + let provers = circuits + .into_par_iter() + .map(|circuit| -> Result<_, SynthesisError> { + let mut prover = ProvingAssignment::new(); + + prover.alloc_input(|| "", || Ok(Scalar::ONE))?; + + circuit.synthesize(&mut prover)?; + + for i in 0..prover.input_assignment.len() { + prover.enforce(|| "", |lc| lc + Variable(Index::Input(i)), |lc| lc, |lc| lc); + } + + Ok(prover) + }) + .collect::, _>>()?; + + info!("synthesis time: {:?}", start.elapsed()); + + Ok(provers) +} + #[cfg(test)] mod tests { use super::*; diff --git a/src/groth16/prover/native.rs b/src/groth16/prover/native.rs index 69d4fa8b..39bcaa94 100644 --- a/src/groth16/prover/native.rs +++ b/src/groth16/prover/native.rs @@ -6,7 +6,7 @@ use std::{ time::Instant, }; -use bellpepper_core::{Circuit, ConstraintSystem, Index, SynthesisError, Variable}; +use bellpepper_core::{Circuit, SynthesisError}; use ec_gpu_gen::{ multiexp_cpu::FullDensity, threadpool::{Worker, THREAD_POOL}, @@ -47,7 +47,7 @@ where { info!("Bellperson {} is being used!", BELLMAN_VERSION); - let provers = synthesize_circuits_batch(circuits)?; + let provers = super::synthesize_circuits_batch(circuits)?; proof_circuits_batch(provers, params, randomization, priority) } @@ -445,33 +445,3 @@ where .collect::>(); Ok(Arc::new(a)) } - -fn synthesize_circuits_batch( - circuits: Vec, -) -> Result>, SynthesisError> -where - Scalar: PrimeField, - C: Circuit + Send, -{ - let start = Instant::now(); - let provers = circuits - .into_par_iter() - .map(|circuit| -> Result<_, SynthesisError> { - let mut prover = ProvingAssignment::new(); - - prover.alloc_input(|| "", || Ok(Scalar::ONE))?; - - circuit.synthesize(&mut prover)?; - - for i in 0..prover.input_assignment.len() { - prover.enforce(|| "", |lc| lc + Variable(Index::Input(i)), |lc| lc, |lc| lc); - } - - Ok(prover) - }) - .collect::, _>>()?; - - info!("synthesis time: {:?}", start.elapsed()); - - Ok(provers) -} diff --git a/src/groth16/prover/supraseal.rs b/src/groth16/prover/supraseal.rs index 6411ff9a..2748cf27 100644 --- a/src/groth16/prover/supraseal.rs +++ b/src/groth16/prover/supraseal.rs @@ -1,8 +1,9 @@ //! Prover implementation implemented using SupraSeal (C++). -use std::time::Instant; +use std::{cmp, collections::BTreeMap, io, ops, thread, time::Instant}; use bellpepper_core::{Circuit, ConstraintSystem, Index, SynthesisError, Variable}; +use ec_gpu_gen::multiexp_cpu::DensityTracker; use ff::{Field, PrimeField}; use log::info; use pairing::MultiMillerLoop; @@ -11,6 +12,268 @@ use rayon::iter::{IntoParallelIterator, ParallelIterator}; use super::{ParameterSource, Proof, ProvingAssignment}; use crate::{gpu::GpuName, BELLMAN_VERSION}; +/// The number of circuits that will synthesized in parallel. +/// +/// Due to a memory optimized representation it's possible to synthesize circuits in bigger batches +/// than proving them. That optimized representation will then be transformed into the one the +/// prover expects in a separate step. 
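/// Note on how the constants below interact: 20 circuits are synthesized per batch while 5 are
/// proven per batch, so the bounded channel used by `create_proof_batch_pipelined` holds at most
/// (20 / 5) - 1 = 3 prover-sized batches at any time.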
+const SYNTHESIZE_BATCH_SIZE: usize = 20; + +/// The number of synthesized circuits that are passed on to the prover. Those need a lot of memory +/// and the proving is mostly sequentially anyway, which means that bigger sized won't result in +/// much faster proving times. Lower memory usage is usally worth the trade-off. +const PROVER_BATCH_SIZE: usize = 5; + +/// The number of scalars we pack into a single byte. +const SCALARS_PER_BYTE: usize = 4; + +/// An enum to distinguish between common and other scalar values. +#[derive(Clone, Copy, Debug, Eq, PartialEq)] +enum ScalarValue { + Zero = 0, + One = 1, + Two = 2, + Other = 3, +} + +impl Default for ScalarValue { + fn default() -> Self { + Self::Zero + } +} + +/// Use a custom representation in order to use less memory. In Filecoin the synthesized exponents +/// are mostly zero, ones or twos. Those can be represented with 2 bits instead of their full field +/// representation of 256 bits. Other values have a slight overhead, but as there are so few, it +/// doesn't matter much. +#[derive(Debug, Eq, PartialEq)] +pub struct ScalarVec { + /// The scalar representing zero. It's owned here so that it can be referenced later. + zero: Scalar, + /// The scalar representing one. It's owned here so that it can be referenced later. + one: Scalar, + /// The scalar representing two. It's owned here so that it can be referenced later. + two: Scalar, + /// This is the vector of all values. 4 values are packed into a single byte. If the value is + /// [`ScalarValue::Other`], then there will be the actual value stored in the `other` field, + /// keyed by the current position in the list of values (where the position is the one as if it + /// wouldn't be packed). + values: Vec, + /// In case the value is [`ScalarValue::Other`], then the actual scalar is stored in this map, + /// where the key the position within the list of values. + other: BTreeMap, + /// Temporary buffer before the values are packed into a single byte. + buffer: [ScalarValue; SCALARS_PER_BYTE], + /// The offset where the next value within the buffer will be written to. + buffer_pos: usize, +} + +impl ScalarVec { + pub fn new() -> Self { + Self { + zero: Scalar::ZERO, + one: Scalar::ONE, + two: Scalar::ONE.double(), + values: Vec::new(), + other: BTreeMap::new(), + buffer: [ScalarValue::Zero; SCALARS_PER_BYTE], + buffer_pos: 0, + } + } + + /// Tthe number of scalars stored. + pub fn len(&self) -> usize { + // The scalar values are 2 bit, we store 4 of them in a single byte. + (self.values.len() * SCALARS_PER_BYTE) + self.buffer_pos + } + + pub fn push(&mut self, scalar: Scalar) { + let value = if scalar == Scalar::ZERO { + ScalarValue::Zero + } else if scalar == Scalar::ONE { + ScalarValue::One + } else if scalar == self.two { + ScalarValue::Two + } else { + self.other.insert(self.len(), scalar); + ScalarValue::Other + }; + + if self.buffer_pos < SCALARS_PER_BYTE { + self.buffer[self.buffer_pos] = value; + self.buffer_pos += 1; + } + + // The buffer is full, flush the values into the actual data vector. + if self.buffer_pos == SCALARS_PER_BYTE { + self.buffer_pos = 0; + self.flush_buffer(); + } + } + + pub fn iter(&self) -> ScalarVecIterator { + ScalarVecIterator { + scalar_vec: self, + pos: 0, + } + } + + /// Transform into arepresentation where all elements arranged in continuous memory. + pub fn into_vec(self) -> Vec { + // NOTE vmx 2023-12-13: A simple collect of the iterator is slower when micro-benchmarking. 
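        // For reference (a sketch of the rejected alternative, not code from the patch), the
        // collect-based variant would be roughly `self.iter().copied().collect::<Vec<_>>()`;
        // `copied()` is available because `PrimeField` implies `Copy`.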
+ let mut output = Vec::with_capacity(self.len()); + for scalar in self.iter() { + output.push(*scalar) + } + output + } + + /// Flush the buffer into the actual vector of data. + fn flush_buffer(&mut self) { + let mut data_byte = 0; + data_byte |= self.buffer[0] as u8; + data_byte |= (self.buffer[1] as u8) << 2; + data_byte |= (self.buffer[2] as u8) << 4; + data_byte |= (self.buffer[3] as u8) << 6; + self.values.push(data_byte); + } + + fn get(&self, pos: usize) -> Option<&Scalar> { + if pos < self.len() { + // The position is within the stored values (not the buffer) + if pos < self.values.len() * SCALARS_PER_BYTE { + let value_byte = &self.values[pos / SCALARS_PER_BYTE]; + let within_buffer_pos = pos % SCALARS_PER_BYTE; + // Determine where the bits we want to read. Each value is 2 bits => `* 2`. + let bitmask = 0b11 << (within_buffer_pos * 2); + // Read those bits and shift them back, so that it matches the enum values. + let value = (value_byte & bitmask) >> (within_buffer_pos * 2); + + if value == ScalarValue::Zero as u8 { + Some(&self.zero) + } else if value == ScalarValue::One as u8 { + Some(&self.one) + } else if value == ScalarValue::Two as u8 { + Some(&self.two) + } else if value == ScalarValue::Other as u8 { + self.other.get(&pos) + } else { + unreachable!() + } + } else { + let within_buffer_pos = pos - (self.values.len() * SCALARS_PER_BYTE); + match self.buffer[within_buffer_pos] { + ScalarValue::Zero => Some(&self.zero), + ScalarValue::One => Some(&self.one), + ScalarValue::Two => Some(&self.two), + ScalarValue::Other => self.other.get(&pos), + } + } + } else { + None + } + } +} + +impl Default for ScalarVec { + fn default() -> Self { + Self::new() + } +} + +impl<'a, Scalar: PrimeField> Extend<&'a Scalar> for ScalarVec { + fn extend>(&mut self, iter: T) { + for scalar in iter { + self.push(*scalar); + } + } +} + +impl ops::Index for ScalarVec { + type Output = Scalar; + + fn index(&self, index: usize) -> &Self::Output { + self.get(index).expect("index out of range") + } +} + +pub struct ScalarVecIterator<'a, Scalar> { + scalar_vec: &'a ScalarVec, + pos: usize, +} + +impl<'a, Scalar: PrimeField> Iterator for ScalarVecIterator<'a, Scalar> { + type Item = &'a Scalar; + + fn next(&mut self) -> Option { + // Early return in case index is out of range. + let value = self.scalar_vec.get(self.pos)?; + self.pos += 1; + Some(value) + } +} + +impl<'a, Scalar: PrimeField> IntoIterator for &'a ScalarVec { + type Item = &'a Scalar; + type IntoIter = ScalarVecIterator<'a, Scalar>; + + fn into_iter(self) -> Self::IntoIter { + ScalarVecIterator { + scalar_vec: self, + pos: 0, + } + } +} + +/// A copy of `[prover::ProvingAssignment` which has a lower memory footprint. +/// +/// At the cost of the need to convert into the usual representation when it's passed into the +/// prover. 
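///
/// As a rough, unmeasured estimate: the `a`, `b`, `c` and `aux_assignment` fields store 2 bits
/// per entry instead of a 256-bit field representation, so they shrink by up to a factor of 128
/// (minus the `BTreeMap` overhead for the rare `Other` values), while `input_assignment` stays a
/// plain `Vec<Scalar>`.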
+#[derive(Default)] +struct ProvingAssignmentCompact { + // Density of queries + a_aux_density: DensityTracker, + b_input_density: DensityTracker, + b_aux_density: DensityTracker, + + // Evaluations of A, B, C polynomials + a: ScalarVec, + b: ScalarVec, + c: ScalarVec, + + // Assignments of variables + input_assignment: Vec, + aux_assignment: ScalarVec, +} + +super::proving_assignment_impls!(ProvingAssignmentCompact); + +impl From> for ProvingAssignment { + fn from(assignment: ProvingAssignmentCompact) -> Self { + let mut a = Vec::new(); + let mut b = Vec::new(); + let mut c = Vec::new(); + let mut aux_assignment = Vec::new(); + rayon::scope(|s| { + s.spawn(|_| a = assignment.a.into_vec()); + s.spawn(|_| b = assignment.b.into_vec()); + s.spawn(|_| c = assignment.c.into_vec()); + s.spawn(|_| aux_assignment = assignment.aux_assignment.into_vec()); + }); + + Self { + a_aux_density: assignment.a_aux_density, + b_input_density: assignment.b_input_density, + b_aux_density: assignment.b_aux_density, + a, + b, + c, + input_assignment: assignment.input_assignment, + aux_assignment, + } + } +} + impl From<&ProvingAssignment> for supraseal_c2::Assignment where Scalar: PrimeField, @@ -65,15 +328,164 @@ where BELLMAN_VERSION ); - let provers = synthesize_circuits_batch(circuits)?; - proof_circuits_batch(provers, params, randomization) + let (r_s, s_s) = randomization.unwrap_or(( + vec![E::Fr::ZERO; circuits.len()], + vec![E::Fr::ZERO; circuits.len()], + )); + + // The memory-optimized version, which is more CPU intensive only makes sense for larger batch + // sizes. Hence use the normal synthesis for smaller batches. + if circuits.len() <= 10 { + let provers = super::synthesize_circuits_batch(circuits)?; + proof_circuits_batch(provers, params, (r_s, s_s)) + } else { + create_proof_batch_pipelined(circuits, params, (r_s, s_s)) + } +} + +/// Create a custom [`SynthesisError`]. +/// +/// The closest to a custom error is the IO Error, hence use that. +fn custom_error(error: &str) -> SynthesisError { + SynthesisError::IoError(io::Error::new(io::ErrorKind::Other, error)) +} + +/// The circuit synthesis is CPU intensive. Itself isn't parallelized, hence we parallelize with +/// running several synthesis at the same time. The proving isn't that CPU intensive. +/// Therefore we interleave the synthesis with the proving. +/// We create a large batch of synthesized circuits, and then proof in smaller batches as the +/// proving takes way more memory. Whenever the proving of synthesized batch starts, we kick of a +/// new batch for synthesis, while the proving is going on. We achieve that with having a bounded +/// message queue which blocks after a certain amount of batches. +/// +/// The flow looks like that: +/// +/// - Each uppercase letter corresponds to one proof. +/// - The total number of proofs is 18. +/// - The batch size for synthesis is 6. +/// - The batch size for proving is 2. +/// - The message queue size is the batch size of the synthesis divided bt the batch size of +/// the proving minus one, so that the queue blocks before the next synthesis starts. +/// => (6 / 2) - 1 = 2. +/// +/// ```text +/// The downwards axis is time. The Synthesize and Prover thread run in parallel. If things +/// appeach on the same line it means that they start at the same time, but they might take +/// different amounts of time. 
+/// +/// Description Synthesize thread Message queue Prover thread +/// +/// The full set of proofs is: +/// A B C D E F G H I J K L M N O P Q R +/// +/// Start with synthesizing a batch of A B C D E F +/// circuits. +/// +/// Once finished, put them into the (C D) (A B) +/// message queue. One item in the queue +/// consists is one batch for the prover. +/// +/// Once the prover starts, the last item G H I J K L (E F) (C D) A B +/// of the synthesis batch is pushed into (E F) C D +/// queue, hence a new synthesis starts. E F +/// +/// The synthesis keeps pushing into the (I J) (G H) +/// queue whenever there's a free spot. +/// +/// Keep repeating the previous two steps. M N O P Q R (K L) (I J) G H +/// (K L) I J +/// +/// (O P) (M N) +/// +/// All sircuits were synthesized, hence (Q R) (O P) M N +/// only the proving is to be done. (Q R) O P +/// Q R +/// ``` +fn create_proof_batch_pipelined( + circuits: Vec, + params: P, + randomization: (Vec, Vec), +) -> Result>, SynthesisError> +where + E: MultiMillerLoop, + C: Circuit + Send, + E::Fr: GpuName, + E::G1Affine: GpuName, + E::G2Affine: GpuName, + P: ParameterSource, +{ + let (r_s, s_s) = randomization; + assert_eq!(circuits.len(), r_s.len()); + assert_eq!(circuits.len(), s_s.len()); + + // This channel size makes sure that the next synthesizing batch starts as soon as the first + // proving batch starts. + let (sender, receiver) = + crossbeam_channel::bounded((SYNTHESIZE_BATCH_SIZE / PROVER_BATCH_SIZE) - 1); + + let num_circuits = circuits.len(); + + thread::scope(|s| { + let synthesis = s.spawn(|| -> Result<(), SynthesisError> { + let mut circuits_mut = circuits; + // A vector of proofs is expected, hence drain it from the list of proofs, so that we + // don't need to keep an extra copy around. + while !circuits_mut.is_empty() { + let size = cmp::min(SYNTHESIZE_BATCH_SIZE, circuits_mut.len()); + let batch = circuits_mut.drain(0..size).collect(); + let mut provers = synthesize_circuits_batch(batch)?; + // Do not send all synthesized circuits at once, but only a subset as the memory + // footprint will increase in the proving stage. + while !provers.is_empty() { + let provers_size = cmp::min(PROVER_BATCH_SIZE, provers.len()); + let provers_batch: Vec<_> = provers.drain(0..provers_size).collect(); + sender + .send(provers_batch) + .map_err(|_| custom_error("cannot send circuits"))?; + } + } + Ok(()) + }); + + let prover = s.spawn(|| { + let mut groth_proofs = Vec::with_capacity(num_circuits); + // There is one randomnes element per circuit, hence we can use that as termination + // condition for the loop. + let mut r_s_mut = r_s; + let mut s_s_mut = s_s; + while !r_s_mut.is_empty() { + let provers_compact = receiver + .recv() + .map_err(|_| custom_error("cannot receive circuits"))?; + let r_s_batch = r_s_mut.drain(0..provers_compact.len()).collect(); + let s_s_batch = s_s_mut.drain(0..provers_compact.len()).collect(); + + // Transform the provers from the memory efficient representation into one suitable + // to be used with SupraSeal. + log::trace!("converting representation of provers"); + let provers: Vec> = + provers_compact.into_par_iter().map(Into::into).collect(); + + let proofs = proof_circuits_batch(provers, params.clone(), (r_s_batch, s_s_batch))?; + groth_proofs.extend_from_slice(&proofs); + } + Ok(groth_proofs) + }); + + synthesis + .join() + .map_err(|_| custom_error("cannot prove circuits"))??; + // The prover result is what we actually return. + prover + .join() + .map_err(|_| custom_error("cannot prove circuits"))? 
+ }) } -#[allow(clippy::type_complexity)] fn proof_circuits_batch( - provers: std::vec::Vec>, + provers: Vec>, params: P, - randomization: Option<(Vec, Vec)>, + randomization: (Vec, Vec), ) -> Result>, SynthesisError> where E: MultiMillerLoop, @@ -87,10 +499,7 @@ where info!("starting proof timer"); let num_circuits = provers.len(); - let (r_s, s_s) = randomization.unwrap_or(( - vec![E::Fr::ZERO; num_circuits], - vec![E::Fr::ZERO; num_circuits], - )); + let (r_s, s_s) = randomization; // Make sure all circuits have the same input len. for prover in &provers { @@ -129,10 +538,11 @@ where Ok(proofs) } -#[allow(clippy::type_complexity)] +// The only difference to [`groth16::prover::synthesize_circuits-batch`] is, that it's using the +// memory optimized representation for the proving assignment. fn synthesize_circuits_batch( circuits: Vec, -) -> Result>, SynthesisError> +) -> Result>, SynthesisError> where Scalar: PrimeField, C: Circuit + Send, @@ -142,7 +552,7 @@ where let provers = circuits .into_par_iter() .map(|circuit| -> Result<_, SynthesisError> { - let mut prover = ProvingAssignment::new(); + let mut prover = ProvingAssignmentCompact::new(); prover.alloc_input(|| "", || Ok(Scalar::ONE))?; diff --git a/src/lc.rs b/src/lc.rs index 12facbc9..f4af8e20 100644 --- a/src/lc.rs +++ b/src/lc.rs @@ -1,14 +1,59 @@ -use crate::LinearCombination; +use std::ops; + use ec_gpu_gen::multiexp_cpu::DensityTracker; use ff::PrimeField; -pub fn eval_with_trackers( - lc: &LinearCombination, - mut input_density: Option<&mut DensityTracker>, - mut aux_density: Option<&mut DensityTracker>, +use crate::LinearCombination; + +/// Copy of `eval` from bellpepper that also works with a +/// [`groth16::prover::superaseal::ScalarVec`]. +// `T` is a slice of `Scalar`s. This way it works with `&[Scalar]` as well as `&ScalarVec` +pub(crate) fn eval<'a, Scalar, T>( + lc: &'a LinearCombination, + input_assignment: &[Scalar], + aux_assignment: &'a T, +) -> Scalar +where + Scalar: PrimeField + ops::AddAssign, + T: ops::Index, + T::Output: PrimeField + std::ops::MulAssign<&'a Scalar>, +{ + let mut acc = Scalar::ZERO; + + let one = Scalar::ONE; + + for (index, coeff) in lc.iter_inputs() { + let mut tmp = input_assignment[*index]; + if coeff != &one { + tmp *= coeff; + } + acc += tmp; + } + + for (index, coeff) in lc.iter_aux() { + let mut tmp = aux_assignment[*index]; + if coeff != &one { + tmp *= coeff; + } + acc += tmp; + } + + acc +} + +// `T` is a slice of `Scalar`s. This way it works with `&[Scalar]` as well as `&ScalarVec` +pub(crate) fn eval_with_trackers<'a, Scalar, T>( + lc: &'a LinearCombination, + mut input_density: Option<&'a mut DensityTracker>, + mut aux_density: Option<&'a mut DensityTracker>, input_assignment: &[Scalar], - aux_assignment: &[Scalar], -) -> Scalar { + aux_assignment: &'a T, +) -> Scalar +where + Scalar: PrimeField + ops::AddAssign, + T: ops::Index, + T::Output: PrimeField + std::ops::MulAssign<&'a Scalar>, +{ let mut acc = Scalar::ZERO; let one = Scalar::ONE;
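To make the 2-bit packing behavior of `ScalarVec` concrete, here is a minimal test-style sketch. It is not part of the patch: it assumes it lives in a `#[cfg(test)]` module inside src/groth16/prover/supraseal.rs and uses `blstrs::Scalar` purely as an example field. Five pushed values exercise one fully packed byte plus the in-flight buffer, the `Other` fallback, indexing, and `into_vec`.

#[cfg(test)]
mod scalar_vec_packing {
    use super::ScalarVec;
    use blstrs::Scalar;
    use ff::Field;

    #[test]
    fn packs_common_values_into_two_bits() {
        let mut packed = ScalarVec::<Scalar>::new();
        // Five entries: the first four fill one packed byte, the fifth stays in the buffer.
        let values = [
            Scalar::ZERO,
            Scalar::ONE,
            Scalar::ONE.double(), // the "two" fast path
            Scalar::from(42u64),  // an `Other` value, kept in the BTreeMap
            Scalar::ZERO,
        ];
        for value in &values {
            packed.push(*value);
        }
        assert_eq!(packed.len(), values.len());

        // Indexing and `into_vec` must round-trip to the original scalars.
        for (i, value) in values.iter().enumerate() {
            assert_eq!(&packed[i], value);
        }
        assert_eq!(packed.into_vec(), values.to_vec());
    }
}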