diff --git a/src/groth16/params.rs b/src/groth16/params.rs
index e909c640..0de903de 100644
--- a/src/groth16/params.rs
+++ b/src/groth16/params.rs
@@ -405,7 +405,7 @@ where
     }
 }
 
-pub trait ParameterSource<E>: Send + Sync
+pub trait ParameterSource<E>: Clone + Send + Sync
 where
     E: MultiMillerLoop,
 {
diff --git a/src/groth16/prover/mod.rs b/src/groth16/prover/mod.rs
index 39a08b19..7b8f80ab 100644
--- a/src/groth16/prover/mod.rs
+++ b/src/groth16/prover/mod.rs
@@ -3,213 +3,224 @@ mod native;
 #[cfg(feature = "cuda-supraseal")]
 mod supraseal;
 
-use std::fmt;
+use std::time::Instant;
 
-use bellpepper_core::{
-    Circuit, ConstraintSystem, Index, LinearCombination, SynthesisError, Variable,
-};
+use bellpepper_core::{Circuit, ConstraintSystem, Index, SynthesisError, Variable};
 use ec_gpu_gen::multiexp_cpu::DensityTracker;
 use ff::{Field, PrimeField};
+use log::info;
 use pairing::MultiMillerLoop;
 use rand_core::RngCore;
+use rayon::iter::{IntoParallelIterator, ParallelIterator};
 
 #[cfg(not(feature = "cuda-supraseal"))]
 use self::native as prover;
 #[cfg(feature = "cuda-supraseal")]
 use self::supraseal as prover;
 use super::{ParameterSource, Proof};
-use crate::{gpu::GpuName, lc};
-
-struct ProvingAssignment<Scalar: PrimeField> {
-    // Density of queries
-    a_aux_density: DensityTracker,
-    b_input_density: DensityTracker,
-    b_aux_density: DensityTracker,
+use crate::gpu::GpuName;
+
+/// Implement various traits for the proving assignment.
+///
+/// It's a macro so that it can be used for different types. SupraSeal is using some special memory
+/// optimized data structures internally. Using a macro makes sure that the implementation will not
+/// diverge over time.
+macro_rules! proving_assignment_impls {
+    ($type:ty) => {
+        use bellpepper_core as bc;
+        impl<Scalar: PrimeField> std::fmt::Debug for $type {
+            fn fmt(&self, fmt: &mut std::fmt::Formatter) -> std::fmt::Result {
+                fmt.debug_struct(stringify!($type))
+                    .field("a_aux_density", &self.a_aux_density)
+                    .field("b_input_density", &self.b_input_density)
+                    .field("b_aux_density", &self.b_aux_density)
+                    .field(
+                        "a",
+                        &self
+                            .a
+                            .iter()
+                            .map(|v| format!("Fr({:?})", v))
+                            .collect::<Vec<_>>(),
+                    )
+                    .field(
+                        "b",
+                        &self
+                            .b
+                            .iter()
+                            .map(|v| format!("Fr({:?})", v))
+                            .collect::<Vec<_>>(),
+                    )
+                    .field(
+                        "c",
+                        &self
+                            .c
+                            .iter()
+                            .map(|v| format!("Fr({:?})", v))
+                            .collect::<Vec<_>>(),
+                    )
+                    .field("input_assignment", &self.input_assignment)
+                    .field("aux_assignment", &self.aux_assignment)
+                    .finish()
+            }
+        }
 
-    // Evaluations of A, B, C polynomials
-    a: Vec<Scalar>,
-    b: Vec<Scalar>,
-    c: Vec<Scalar>,
+        impl<Scalar: PrimeField> PartialEq for $type {
+            fn eq(&self, other: &$type) -> bool {
+                self.a_aux_density == other.a_aux_density
+                    && self.b_input_density == other.b_input_density
+                    && self.b_aux_density == other.b_aux_density
+                    && self.a == other.a
+                    && self.b == other.b
+                    && self.c == other.c
+                    && self.input_assignment == other.input_assignment
+                    && self.aux_assignment == other.aux_assignment
+            }
+        }
 
-    // Assignments of variables
-    input_assignment: Vec<Scalar>,
-    aux_assignment: Vec<Scalar>,
-}
+        impl<Scalar: PrimeField> bc::ConstraintSystem<Scalar> for $type {
+            type Root = Self;
 
-impl<Scalar: PrimeField> fmt::Debug for ProvingAssignment<Scalar> {
-    fn fmt(&self, fmt: &mut fmt::Formatter) -> fmt::Result {
-        fmt.debug_struct("ProvingAssignment")
-            .field("a_aux_density", &self.a_aux_density)
-            .field("b_input_density", &self.b_input_density)
-            .field("b_aux_density", &self.b_aux_density)
-            .field(
-                "a",
-                &self
-                    .a
-                    .iter()
-                    .map(|v| format!("Fr({:?})", v))
-                    .collect::<Vec<_>>(),
-            )
-            .field(
-                "b",
-                &self
-                    .b
-                    .iter()
-                    .map(|v| format!("Fr({:?})", v))
-                    .collect::<Vec<_>>(),
-            )
-            .field(
-                "c",
-                &self
-                    .c
-                    .iter()
-                    .map(|v| format!("Fr({:?})", v))
-                    .collect::<Vec<_>>(),
-            )
-            .field("input_assignment", &self.input_assignment)
-            .field("aux_assignment", &self.aux_assignment)
-            .finish()
-    }
-}
+            fn new() -> Self {
+                Self::default()
+            }
 
-impl<Scalar: PrimeField> PartialEq for ProvingAssignment<Scalar> {
-    fn eq(&self, other: &ProvingAssignment<Scalar>) -> bool {
-        self.a_aux_density == other.a_aux_density
-            && self.b_input_density == other.b_input_density
-            && self.b_aux_density == other.b_aux_density
-            && self.a == other.a
-            && self.b == other.b
-            && self.c == other.c
-            && self.input_assignment == other.input_assignment
-            && self.aux_assignment == other.aux_assignment
-    }
-}
+            fn alloc<F, A, AR>(&mut self, _: A, f: F) -> Result<bc::Variable, bc::SynthesisError>
+            where
+                F: FnOnce() -> Result<Scalar, bc::SynthesisError>,
+                A: FnOnce() -> AR,
+                AR: Into<String>,
+            {
+                self.aux_assignment.push(f()?);
+                self.a_aux_density.add_element();
+                self.b_aux_density.add_element();
+
+                Ok(bc::Variable(bc::Index::Aux(self.aux_assignment.len() - 1)))
+            }
 
-impl<Scalar: PrimeField> ConstraintSystem<Scalar> for ProvingAssignment<Scalar> {
-    type Root = Self;
-
-    fn new() -> Self {
-        Self {
-            a_aux_density: DensityTracker::new(),
-            b_input_density: DensityTracker::new(),
-            b_aux_density: DensityTracker::new(),
-            a: vec![],
-            b: vec![],
-            c: vec![],
-            input_assignment: vec![],
-            aux_assignment: vec![],
-        }
-    }
+            fn alloc_input<F, A, AR>(
+                &mut self,
+                _: A,
+                f: F,
+            ) -> Result<bc::Variable, bc::SynthesisError>
+            where
+                F: FnOnce() -> Result<Scalar, bc::SynthesisError>,
+                A: FnOnce() -> AR,
+                AR: Into<String>,
+            {
+                self.input_assignment.push(f()?);
+                self.b_input_density.add_element();
+
+                Ok(bc::Variable(bc::Index::Input(
+                    self.input_assignment.len() - 1,
+                )))
+            }
 
-    fn alloc<F, A, AR>(&mut self, _: A, f: F) -> Result<Variable, SynthesisError>
-    where
-        F: FnOnce() -> Result<Scalar, SynthesisError>,
-        A: FnOnce() -> AR,
-        AR: Into<String>,
-    {
-        self.aux_assignment.push(f()?);
-        self.a_aux_density.add_element();
-        self.b_aux_density.add_element();
-
-        Ok(Variable(Index::Aux(self.aux_assignment.len() - 1)))
-    }
+            fn enforce<A, AR, LA, LB, LC>(&mut self, _: A, a: LA, b: LB, c: LC)
+            where
+                A: FnOnce() -> AR,
+                AR: Into<String>,
+                LA: FnOnce(bc::LinearCombination<Scalar>) -> bc::LinearCombination<Scalar>,
+                LB: FnOnce(bc::LinearCombination<Scalar>) -> bc::LinearCombination<Scalar>,
+                LC: FnOnce(bc::LinearCombination<Scalar>) -> bc::LinearCombination<Scalar>,
+            {
+                let a = a(bc::LinearCombination::zero());
+                let b = b(bc::LinearCombination::zero());
+                let c = c(bc::LinearCombination::zero());
+
+                let input_assignment = &self.input_assignment;
+                let aux_assignment = &self.aux_assignment;
+                let a_aux_density = &mut self.a_aux_density;
+                let b_input_density = &mut self.b_input_density;
+                let b_aux_density = &mut self.b_aux_density;
+
+                let a_res = crate::lc::eval_with_trackers(
+                    &a,
+                    // Inputs have full density in the A query
+                    // because there are constraints of the
+                    // form x * 0 = 0 for each input.
+                    None,
+                    Some(a_aux_density),
+                    input_assignment,
+                    aux_assignment,
+                );
+
+                let b_res = crate::lc::eval_with_trackers(
+                    &b,
+                    Some(b_input_density),
+                    Some(b_aux_density),
+                    input_assignment,
+                    aux_assignment,
+                );
+
+                // There is no C polynomial query,
+                // though there is an (beta)A + (alpha)B + C
+                // query for all aux variables.
+                // However, that query has full density.
+                let c_res = crate::lc::eval(&c, input_assignment, aux_assignment);
+
+                self.a.push(a_res);
+                self.b.push(b_res);
+                self.c.push(c_res);
+            }
 
-    fn alloc_input<F, A, AR>(&mut self, _: A, f: F) -> Result<Variable, SynthesisError>
-    where
-        F: FnOnce() -> Result<Scalar, SynthesisError>,
-        A: FnOnce() -> AR,
-        AR: Into<String>,
-    {
-        self.input_assignment.push(f()?);
-        self.b_input_density.add_element();
+            fn push_namespace<NR, N>(&mut self, _: N)
+            where
+                NR: Into<String>,
+                N: FnOnce() -> NR,
+            {
+                // Do nothing; we don't care about namespaces in this context.
+            }
 
-        Ok(Variable(Index::Input(self.input_assignment.len() - 1)))
-    }
+            fn pop_namespace(&mut self) {
+                // Do nothing; we don't care about namespaces in this context.
+            }
 
-    fn enforce<A, AR, LA, LB, LC>(&mut self, _: A, a: LA, b: LB, c: LC)
-    where
-        A: FnOnce() -> AR,
-        AR: Into<String>,
-        LA: FnOnce(LinearCombination<Scalar>) -> LinearCombination<Scalar>,
-        LB: FnOnce(LinearCombination<Scalar>) -> LinearCombination<Scalar>,
-        LC: FnOnce(LinearCombination<Scalar>) -> LinearCombination<Scalar>,
-    {
-        let a = a(LinearCombination::zero());
-        let b = b(LinearCombination::zero());
-        let c = c(LinearCombination::zero());
-
-        let input_assignment = &self.input_assignment;
-        let aux_assignment = &self.aux_assignment;
-        let a_aux_density = &mut self.a_aux_density;
-        let b_input_density = &mut self.b_input_density;
-        let b_aux_density = &mut self.b_aux_density;
-
-        let a_res = lc::eval_with_trackers(
-            &a,
-            // Inputs have full density in the A query
-            // because there are constraints of the
-            // form x * 0 = 0 for each input.
-            None,
-            Some(a_aux_density),
-            input_assignment,
-            aux_assignment,
-        );
-
-        let b_res = lc::eval_with_trackers(
-            &b,
-            Some(b_input_density),
-            Some(b_aux_density),
-            input_assignment,
-            aux_assignment,
-        );
-
-        // There is no C polynomial query,
-        // though there is an (beta)A + (alpha)B + C
-        // query for all aux variables.
-        // However, that query has full density.
-        let c_res = c.eval(input_assignment, aux_assignment);
-
-        self.a.push(a_res);
-        self.b.push(b_res);
-        self.c.push(c_res);
-    }
+            fn get_root(&mut self) -> &mut Self::Root {
+                self
+            }
 
-    fn push_namespace<NR, N>(&mut self, _: N)
-    where
-        NR: Into<String>,
-        N: FnOnce() -> NR,
-    {
-        // Do nothing; we don't care about namespaces in this context.
-    }
+            fn is_extensible() -> bool {
+                true
+            }
 
-    fn pop_namespace(&mut self) {
-        // Do nothing; we don't care about namespaces in this context.
-    }
+            fn extend(&mut self, other: &Self) {
+                self.a_aux_density.extend(&other.a_aux_density, false);
+                self.b_input_density.extend(&other.b_input_density, true);
+                self.b_aux_density.extend(&other.b_aux_density, false);
 
-    fn get_root(&mut self) -> &mut Self::Root {
-        self
-    }
+                self.a.extend(&other.a);
+                self.b.extend(&other.b);
+                self.c.extend(&other.c);
 
-    fn is_extensible() -> bool {
-        true
-    }
+                self.input_assignment
+                    // Skip first input, which must have been a temporarily allocated one variable.
+                    .extend(&other.input_assignment[1..]);
+                self.aux_assignment.extend(&other.aux_assignment);
+            }
+        }
+    };
+}
+pub(crate) use proving_assignment_impls;
 
-    fn extend(&mut self, other: &Self) {
-        self.a_aux_density.extend(&other.a_aux_density, false);
-        self.b_input_density.extend(&other.b_input_density, true);
-        self.b_aux_density.extend(&other.b_aux_density, false);
+#[derive(Default)]
+struct ProvingAssignment<Scalar: PrimeField> {
+    // Density of queries
+    a_aux_density: DensityTracker,
+    b_input_density: DensityTracker,
+    b_aux_density: DensityTracker,
 
-        self.a.extend(&other.a);
-        self.b.extend(&other.b);
-        self.c.extend(&other.c);
+    // Evaluations of A, B, C polynomials
+    a: Vec<Scalar>,
+    b: Vec<Scalar>,
+    c: Vec<Scalar>,
 
-        self.input_assignment
-            // Skip first input, which must have been a temporarily allocated one variable.
-            .extend(&other.input_assignment[1..]);
-        self.aux_assignment.extend(&other.aux_assignment);
-    }
+    // Assignments of variables
+    input_assignment: Vec<Scalar>,
+    aux_assignment: Vec<Scalar>,
 }
 
+proving_assignment_impls!(ProvingAssignment<Scalar>);
+
 pub(super) fn create_random_proof_batch_priority<E, C, R, P: ParameterSource<E>>(
     circuits: Vec<C>,
     params: P,
@@ -253,6 +264,36 @@ where
     prover::create_proof_batch_priority_inner(circuits, params, Some((r_s, s_s)), priority)
 }
 
+fn synthesize_circuits_batch<Scalar, C>(
+    circuits: Vec<C>,
+) -> Result<std::vec::Vec<ProvingAssignment<Scalar>>, SynthesisError>
+where
+    Scalar: PrimeField,
+    C: Circuit<Scalar> + Send,
+{
+    let start = Instant::now();
+    let provers = circuits
+        .into_par_iter()
+        .map(|circuit| -> Result<_, SynthesisError> {
+            let mut prover = ProvingAssignment::new();
+
+            prover.alloc_input(|| "", || Ok(Scalar::ONE))?;
+
+            circuit.synthesize(&mut prover)?;
+
+            for i in 0..prover.input_assignment.len() {
+                prover.enforce(|| "", |lc| lc + Variable(Index::Input(i)), |lc| lc, |lc| lc);
+            }
+
+            Ok(prover)
+        })
+        .collect::<Result<Vec<_>, _>>()?;
+
+    info!("synthesis time: {:?}", start.elapsed());
+
+    Ok(provers)
+}
+
 #[cfg(test)]
 mod tests {
     use super::*;
diff --git a/src/groth16/prover/native.rs b/src/groth16/prover/native.rs
index 69d4fa8b..39bcaa94 100644
--- a/src/groth16/prover/native.rs
+++ b/src/groth16/prover/native.rs
@@ -6,7 +6,7 @@ use std::{
     time::Instant,
 };
 
-use bellpepper_core::{Circuit, ConstraintSystem, Index, SynthesisError, Variable};
+use bellpepper_core::{Circuit, SynthesisError};
 use ec_gpu_gen::{
     multiexp_cpu::FullDensity,
     threadpool::{Worker, THREAD_POOL},
@@ -47,7 +47,7 @@ where
 {
     info!("Bellperson {} is being used!", BELLMAN_VERSION);
 
-    let provers = synthesize_circuits_batch(circuits)?;
+    let provers = super::synthesize_circuits_batch(circuits)?;
     proof_circuits_batch(provers, params, randomization, priority)
 }
 
@@ -445,33 +445,3 @@ where
         .collect::<Vec<_>>();
     Ok(Arc::new(a))
 }
-
-fn synthesize_circuits_batch<Scalar, C>(
-    circuits: Vec<C>,
-) -> Result<std::vec::Vec<ProvingAssignment<Scalar>>, SynthesisError>
-where
-    Scalar: PrimeField,
-    C: Circuit<Scalar> + Send,
-{
-    let start = Instant::now();
-    let provers = circuits
-        .into_par_iter()
-        .map(|circuit| -> Result<_, SynthesisError> {
-            let mut prover = ProvingAssignment::new();
-
-            prover.alloc_input(|| "", || Ok(Scalar::ONE))?;
-
-            circuit.synthesize(&mut prover)?;
-
-            for i in 0..prover.input_assignment.len() {
-                prover.enforce(|| "", |lc| lc + Variable(Index::Input(i)), |lc| lc, |lc| lc);
-            }
-
-            Ok(prover)
-        })
-        .collect::<Result<Vec<_>, _>>()?;
-
-    info!("synthesis time: {:?}", start.elapsed());
-
-    Ok(provers)
-}
diff --git a/src/groth16/prover/supraseal.rs b/src/groth16/prover/supraseal.rs
index 6411ff9a..2748cf27 100644
--- a/src/groth16/prover/supraseal.rs
+++ b/src/groth16/prover/supraseal.rs
@@ -1,8 +1,9 @@
 //! Prover implementation implemented using SupraSeal (C++).
 
-use std::time::Instant;
+use std::{cmp, collections::BTreeMap, io, ops, thread, time::Instant};
 
 use bellpepper_core::{Circuit, ConstraintSystem, Index, SynthesisError, Variable};
+use ec_gpu_gen::multiexp_cpu::DensityTracker;
 use ff::{Field, PrimeField};
 use log::info;
 use pairing::MultiMillerLoop;
@@ -11,6 +12,268 @@ use rayon::iter::{IntoParallelIterator, ParallelIterator};
 use super::{ParameterSource, Proof, ProvingAssignment};
 use crate::{gpu::GpuName, BELLMAN_VERSION};
 
+/// The number of circuits that will be synthesized in parallel.
+///
+/// Due to a memory optimized representation it's possible to synthesize circuits in bigger batches
+/// than proving them. That optimized representation will then be transformed into the one the
+/// prover expects in a separate step.
+const SYNTHESIZE_BATCH_SIZE: usize = 20;
+
+/// The number of synthesized circuits that are passed on to the prover. Those need a lot of memory
+/// and the proving is mostly sequential anyway, which means that bigger sizes won't result in
+/// much faster proving times. Lower memory usage is usually worth the trade-off.
+const PROVER_BATCH_SIZE: usize = 5;
+
+/// The number of scalars we pack into a single byte.
+const SCALARS_PER_BYTE: usize = 4;
+
+/// An enum to distinguish between common and other scalar values.
+#[derive(Clone, Copy, Debug, Eq, PartialEq)]
+enum ScalarValue {
+    Zero = 0,
+    One = 1,
+    Two = 2,
+    Other = 3,
+}
+
+impl Default for ScalarValue {
+    fn default() -> Self {
+        Self::Zero
+    }
+}
+
+/// Use a custom representation in order to use less memory. In Filecoin the synthesized exponents
+/// are mostly zeros, ones or twos. Those can be represented with 2 bits instead of their full field
+/// representation of 256 bits. Other values have a slight overhead, but as there are so few, it
+/// doesn't matter much.
+#[derive(Debug, Eq, PartialEq)]
+pub struct ScalarVec<Scalar> {
+    /// The scalar representing zero. It's owned here so that it can be referenced later.
+    zero: Scalar,
+    /// The scalar representing one. It's owned here so that it can be referenced later.
+    one: Scalar,
+    /// The scalar representing two. It's owned here so that it can be referenced later.
+    two: Scalar,
+    /// This is the vector of all values. 4 values are packed into a single byte. If the value is
+    /// [`ScalarValue::Other`], then there will be the actual value stored in the `other` field,
+    /// keyed by the current position in the list of values (where the position is the one as if it
+    /// wouldn't be packed).
+    values: Vec<u8>,
+    /// In case the value is [`ScalarValue::Other`], then the actual scalar is stored in this map,
+    /// where the key is the position within the list of values.
+    other: BTreeMap<usize, Scalar>,
+    /// Temporary buffer before the values are packed into a single byte.
+    buffer: [ScalarValue; SCALARS_PER_BYTE],
+    /// The offset where the next value within the buffer will be written to.
+    buffer_pos: usize,
+}
+
+impl<Scalar: PrimeField> ScalarVec<Scalar> {
+    pub fn new() -> Self {
+        Self {
+            zero: Scalar::ZERO,
+            one: Scalar::ONE,
+            two: Scalar::ONE.double(),
+            values: Vec::new(),
+            other: BTreeMap::new(),
+            buffer: [ScalarValue::Zero; SCALARS_PER_BYTE],
+            buffer_pos: 0,
+        }
+    }
+
+    /// The number of scalars stored.
+    pub fn len(&self) -> usize {
+        // Every flushed byte holds 4 scalars (2 bits each); `buffer_pos` counts the unflushed ones.
+        (self.values.len() * SCALARS_PER_BYTE) + self.buffer_pos
+    }
+
+    pub fn push(&mut self, scalar: Scalar) {
+        let value = if scalar == Scalar::ZERO {
+            ScalarValue::Zero
+        } else if scalar == Scalar::ONE {
+            ScalarValue::One
+        } else if scalar == self.two {
+            ScalarValue::Two
+        } else {
+            self.other.insert(self.len(), scalar);
+            ScalarValue::Other
+        };
+
+        if self.buffer_pos < SCALARS_PER_BYTE {
+            self.buffer[self.buffer_pos] = value;
+            self.buffer_pos += 1;
+        }
+
+        // The buffer is full, flush the values into the actual data vector.
+        if self.buffer_pos == SCALARS_PER_BYTE {
+            self.buffer_pos = 0;
+            self.flush_buffer();
+        }
+    }
+
+    pub fn iter(&self) -> ScalarVecIterator<Scalar> {
+        ScalarVecIterator {
+            scalar_vec: self,
+            pos: 0,
+        }
+    }
+
+    /// Transform into a representation where all elements are arranged in contiguous memory.
+    pub fn into_vec(self) -> Vec<Scalar> {
+        // NOTE vmx 2023-12-13: A simple collect of the iterator is slower when micro-benchmarking.
+        let mut output = Vec::with_capacity(self.len());
+        for scalar in self.iter() {
+            output.push(*scalar)
+        }
+        output
+    }
+
+    /// Flush the buffer into the actual vector of data.
+    fn flush_buffer(&mut self) {
+        let mut data_byte = 0;
+        data_byte |= self.buffer[0] as u8;
+        data_byte |= (self.buffer[1] as u8) << 2;
+        data_byte |= (self.buffer[2] as u8) << 4;
+        data_byte |= (self.buffer[3] as u8) << 6;
+        self.values.push(data_byte);
+    }
+
+    fn get(&self, pos: usize) -> Option<&Scalar> {
+        if pos < self.len() {
+            // The position is within the stored values (not the buffer)
+            if pos < self.values.len() * SCALARS_PER_BYTE {
+                let value_byte = &self.values[pos / SCALARS_PER_BYTE];
+                let within_buffer_pos = pos % SCALARS_PER_BYTE;
+                // Determine where the bits we want to read. Each value is 2 bits => `* 2`.
+                let bitmask = 0b11 << (within_buffer_pos * 2);
+                // Read those bits and shift them back, so that it matches the enum values.
+                let value = (value_byte & bitmask) >> (within_buffer_pos * 2);
+
+                if value == ScalarValue::Zero as u8 {
+                    Some(&self.zero)
+                } else if value == ScalarValue::One as u8 {
+                    Some(&self.one)
+                } else if value == ScalarValue::Two as u8 {
+                    Some(&self.two)
+                } else if value == ScalarValue::Other as u8 {
+                    self.other.get(&pos)
+                } else {
+                    unreachable!()
+                }
+            } else {
+                let within_buffer_pos = pos - (self.values.len() * SCALARS_PER_BYTE);
+                match self.buffer[within_buffer_pos] {
+                    ScalarValue::Zero => Some(&self.zero),
+                    ScalarValue::One => Some(&self.one),
+                    ScalarValue::Two => Some(&self.two),
+                    ScalarValue::Other => self.other.get(&pos),
+                }
+            }
+        } else {
+            None
+        }
+    }
+}
+
+impl<Scalar: PrimeField> Default for ScalarVec<Scalar> {
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
+impl<'a, Scalar: PrimeField> Extend<&'a Scalar> for ScalarVec<Scalar> {
+    fn extend<T: IntoIterator<Item = &'a Scalar>>(&mut self, iter: T) {
+        for scalar in iter {
+            self.push(*scalar);
+        }
+    }
+}
+
+impl<Scalar: PrimeField> ops::Index<usize> for ScalarVec<Scalar> {
+    type Output = Scalar;
+
+    fn index(&self, index: usize) -> &Self::Output {
+        self.get(index).expect("index out of range")
+    }
+}
+
+pub struct ScalarVecIterator<'a, Scalar> {
+    scalar_vec: &'a ScalarVec<Scalar>,
+    pos: usize,
+}
+
+impl<'a, Scalar: PrimeField> Iterator for ScalarVecIterator<'a, Scalar> {
+    type Item = &'a Scalar;
+
+    fn next(&mut self) -> Option<Self::Item> {
+        // Early return in case index is out of range.
+        let value = self.scalar_vec.get(self.pos)?;
+        self.pos += 1;
+        Some(value)
+    }
+}
+
+impl<'a, Scalar: PrimeField> IntoIterator for &'a ScalarVec<Scalar> {
+    type Item = &'a Scalar;
+    type IntoIter = ScalarVecIterator<'a, Scalar>;
+
+    fn into_iter(self) -> Self::IntoIter {
+        ScalarVecIterator {
+            scalar_vec: self,
+            pos: 0,
+        }
+    }
+}
+
+/// A copy of [`ProvingAssignment`] which has a lower memory footprint.
+///
+/// At the cost of the need to convert into the usual representation when it's passed into the
+/// prover.
+#[derive(Default)]
+struct ProvingAssignmentCompact<Scalar: PrimeField> {
+    // Density of queries
+    a_aux_density: DensityTracker,
+    b_input_density: DensityTracker,
+    b_aux_density: DensityTracker,
+
+    // Evaluations of A, B, C polynomials
+    a: ScalarVec<Scalar>,
+    b: ScalarVec<Scalar>,
+    c: ScalarVec<Scalar>,
+
+    // Assignments of variables
+    input_assignment: Vec<Scalar>,
+    aux_assignment: ScalarVec<Scalar>,
+}
+
+super::proving_assignment_impls!(ProvingAssignmentCompact<Scalar>);
+
+impl<Scalar: PrimeField> From<ProvingAssignmentCompact<Scalar>> for ProvingAssignment<Scalar> {
+    fn from(assignment: ProvingAssignmentCompact<Scalar>) -> Self {
+        let mut a = Vec::new();
+        let mut b = Vec::new();
+        let mut c = Vec::new();
+        let mut aux_assignment = Vec::new();
+        rayon::scope(|s| {
+            s.spawn(|_| a = assignment.a.into_vec());
+            s.spawn(|_| b = assignment.b.into_vec());
+            s.spawn(|_| c = assignment.c.into_vec());
+            s.spawn(|_| aux_assignment = assignment.aux_assignment.into_vec());
+        });
+
+        Self {
+            a_aux_density: assignment.a_aux_density,
+            b_input_density: assignment.b_input_density,
+            b_aux_density: assignment.b_aux_density,
+            a,
+            b,
+            c,
+            input_assignment: assignment.input_assignment,
+            aux_assignment,
+        }
+    }
+}
+
 impl<Scalar> From<&ProvingAssignment<Scalar>> for supraseal_c2::Assignment<Scalar>
 where
     Scalar: PrimeField,
@@ -65,15 +328,164 @@ where
         BELLMAN_VERSION
     );
 
-    let provers = synthesize_circuits_batch(circuits)?;
-    proof_circuits_batch(provers, params, randomization)
+    let (r_s, s_s) = randomization.unwrap_or((
+        vec![E::Fr::ZERO; circuits.len()],
+        vec![E::Fr::ZERO; circuits.len()],
+    ));
+
+    // The memory-optimized version, which is more CPU intensive, only makes sense for larger batch
+    // sizes. Hence use the normal synthesis for smaller batches.
+    if circuits.len() <= 10 {
+        let provers = super::synthesize_circuits_batch(circuits)?;
+        proof_circuits_batch(provers, params, (r_s, s_s))
+    } else {
+        create_proof_batch_pipelined(circuits, params, (r_s, s_s))
+    }
+}
+
+/// Wrap a free-form message into a [`SynthesisError`]. The I/O variant is the closest fit for a
+/// custom error, hence the message is carried by an [`io::Error`] of kind `Other`.
+fn custom_error(error: &str) -> SynthesisError {
+    let wrapped = io::Error::new(io::ErrorKind::Other, error);
+    SynthesisError::IoError(wrapped)
+}
+
+/// The circuit synthesis is CPU intensive. Itself isn't parallelized, hence we parallelize with
+/// running several synthesis at the same time. The proving isn't that CPU intensive.
+/// Therefore we interleave the synthesis with the proving.
+/// We create a large batch of synthesized circuits, and then prove in smaller batches as the
+/// proving takes way more memory. Whenever the proving of a synthesized batch starts, we kick off
+/// a new batch for synthesis, while the proving is going on. We achieve that with having a bounded
+/// message queue which blocks after a certain amount of batches.
+///
+/// The flow looks like that:
+///
+///   - Each uppercase letter corresponds to one proof.
+///   - The total number of proofs is 18.
+///   - The batch size for synthesis is 6.
+///   - The batch size for proving is 2.
+///   - The message queue size is the batch size of the synthesis divided by the batch size of
+///     the proving minus one, so that the queue blocks before the next synthesis starts.
+///     => (6 / 2) - 1 = 2.
+///
+/// ```text
+///  The downwards axis is time. The Synthesize and Prover thread run in parallel. If things
+///  appear on the same line it means that they start at the same time, but they might take
+///  different amounts of time.
+///
+///  Description                             Synthesize thread    Message queue    Prover thread
+///
+///  The full set of proofs is:
+///  A B C D E F G H I J K L M N O P Q R
+///
+///  Start with synthesizing a batch of         A B C D E F
+///  circuits.
+///
+///  Once finished, put them into the                              (C D) (A B)
+///  message queue. One item in the queue
+///  consists of one batch for the prover.
+///
+///  Once the prover starts, the last item      G H I J K L        (E F) (C D)         A B
+///  of the synthesis batch is pushed into                               (E F)         C D
+///  queue, hence a new synthesis starts.                                              E F
+///
+///  The synthesis keeps pushing into the                          (I J) (G H)
+///  queue whenever there's a free spot.
+///
+///  Keep repeating the previous two steps.     M N O P Q R        (K L) (I J)         G H
+///                                                                      (K L)         I J
+///
+///                                                                (O P) (M N)
+///
+///  All circuits were synthesized, hence                          (Q R) (O P)         M N
+///  only the proving is to be done.                                     (Q R)         O P
+///                                                                                    Q R
+///  ```
+fn create_proof_batch_pipelined<E, C, P>(
+    circuits: Vec<C>,
+    params: P,
+    randomization: (Vec<E::Fr>, Vec<E::Fr>),
+) -> Result<Vec<Proof<E>>, SynthesisError>
+where
+    E: MultiMillerLoop,
+    C: Circuit<E::Fr> + Send,
+    E::Fr: GpuName,
+    E::G1Affine: GpuName,
+    E::G2Affine: GpuName,
+    P: ParameterSource<E>,
+{
+    let (r_s, s_s) = randomization;
+    assert_eq!(circuits.len(), r_s.len());
+    assert_eq!(circuits.len(), s_s.len());
+
+    // This channel size makes sure that the next synthesizing batch starts as soon as the first
+    // proving batch starts.
+    let (sender, receiver) =
+        crossbeam_channel::bounded((SYNTHESIZE_BATCH_SIZE / PROVER_BATCH_SIZE) - 1);
+
+    let num_circuits = circuits.len();
+
+    // Both closures are `move`, so each thread owns its end of the channel. If one of them stops
+    // early, its end is dropped and the other one sees a disconnect instead of blocking forever.
+    thread::scope(|s| {
+        let synthesis = s.spawn(move || -> Result<(), SynthesisError> {
+            let mut circuits_mut = circuits;
+            // Drain the circuits batch by batch, so that no extra copy is kept around.
+            while !circuits_mut.is_empty() {
+                let size = cmp::min(SYNTHESIZE_BATCH_SIZE, circuits_mut.len());
+                let batch = circuits_mut.drain(0..size).collect();
+                let mut provers = synthesize_circuits_batch(batch)?;
+                // Send only a subset at once, as the memory footprint grows in the proving stage.
+                while !provers.is_empty() {
+                    let provers_size = cmp::min(PROVER_BATCH_SIZE, provers.len());
+                    let provers_batch: Vec<_> = provers.drain(0..provers_size).collect();
+                    // A failed send means the prover stopped early; its result carries the cause.
+                    if sender.send(provers_batch).is_err() {
+                        return Ok(());
+                    }
+                }
+            }
+            Ok(())
+        });
+
+        let prover = s.spawn(move || {
+            let mut groth_proofs = Vec::with_capacity(num_circuits);
+            // There is one randomness element per circuit, hence we can use that as termination
+            // condition for the loop.
+            let mut r_s_mut = r_s;
+            let mut s_s_mut = s_s;
+            while !r_s_mut.is_empty() {
+                let provers_compact = receiver
+                    .recv()
+                    .map_err(|_| custom_error("cannot receive circuits"))?;
+                let r_s_batch = r_s_mut.drain(0..provers_compact.len()).collect();
+                let s_s_batch = s_s_mut.drain(0..provers_compact.len()).collect();
+
+                // Convert the memory efficient representation into the one SupraSeal uses.
+                log::trace!("converting representation of provers");
+                let provers: Vec<ProvingAssignment<E::Fr>> =
+                    provers_compact.into_par_iter().map(Into::into).collect();
+
+                let proofs = proof_circuits_batch(provers, params.clone(), (r_s_batch, s_s_batch))?;
+                groth_proofs.extend(proofs);
+            }
+            Ok(groth_proofs)
+        });
+
+        synthesis
+            .join()
+            .map_err(|_| custom_error("cannot prove circuits"))??;
+        // The prover result is what we actually return.
+        prover
+            .join()
+            .map_err(|_| custom_error("cannot prove circuits"))?
+    })
 }
 
-#[allow(clippy::type_complexity)]
 fn proof_circuits_batch<E, P>(
-    provers: std::vec::Vec<ProvingAssignment<E::Fr>>,
+    provers: Vec<ProvingAssignment<E::Fr>>,
     params: P,
-    randomization: Option<(Vec<E::Fr>, Vec<E::Fr>)>,
+    randomization: (Vec<E::Fr>, Vec<E::Fr>),
 ) -> Result<Vec<Proof<E>>, SynthesisError>
 where
     E: MultiMillerLoop,
@@ -87,10 +499,7 @@ where
     info!("starting proof timer");
 
     let num_circuits = provers.len();
-    let (r_s, s_s) = randomization.unwrap_or((
-        vec![E::Fr::ZERO; num_circuits],
-        vec![E::Fr::ZERO; num_circuits],
-    ));
+    let (r_s, s_s) = randomization;
 
     // Make sure all circuits have the same input len.
     for prover in &provers {
@@ -129,10 +538,11 @@ where
     Ok(proofs)
 }
 
-#[allow(clippy::type_complexity)]
+// The only difference to [`groth16::prover::synthesize_circuits_batch`] is, that it's using the
+// memory optimized representation for the proving assignment.
 fn synthesize_circuits_batch<Scalar, C>(
     circuits: Vec<C>,
-) -> Result<std::vec::Vec<ProvingAssignment<Scalar>>, SynthesisError>
+) -> Result<Vec<ProvingAssignmentCompact<Scalar>>, SynthesisError>
 where
     Scalar: PrimeField,
     C: Circuit<Scalar> + Send,
@@ -142,7 +552,7 @@ where
     let provers = circuits
         .into_par_iter()
         .map(|circuit| -> Result<_, SynthesisError> {
-            let mut prover = ProvingAssignment::new();
+            let mut prover = ProvingAssignmentCompact::new();
 
             prover.alloc_input(|| "", || Ok(Scalar::ONE))?;
 
diff --git a/src/lc.rs b/src/lc.rs
index 12facbc9..f4af8e20 100644
--- a/src/lc.rs
+++ b/src/lc.rs
@@ -1,14 +1,59 @@
-use crate::LinearCombination;
+use std::ops;
+
 use ec_gpu_gen::multiexp_cpu::DensityTracker;
 use ff::PrimeField;
 
-pub fn eval_with_trackers<Scalar: PrimeField>(
-    lc: &LinearCombination<Scalar>,
-    mut input_density: Option<&mut DensityTracker>,
-    mut aux_density: Option<&mut DensityTracker>,
+use crate::LinearCombination;
+
+/// Copy of bellpepper's `eval`: sums `assignment[index] * coeff` over the input and aux terms of
+/// `lc`. It also works with a [`groth16::prover::supraseal::ScalarVec`] as aux assignment.
+// `T` is a slice of `Scalar`s. This way it works with `&[Scalar]` as well as `&ScalarVec<Scalar>`
+pub(crate) fn eval<'a, Scalar, T>(
+    lc: &'a LinearCombination<Scalar>,
+    input_assignment: &[Scalar],
+    aux_assignment: &'a T,
+) -> Scalar
+where
+    Scalar: PrimeField + ops::AddAssign<T::Output>,
+    T: ops::Index<usize>,
+    T::Output: PrimeField + std::ops::MulAssign<&'a Scalar>,
+{
+    let mut acc = Scalar::ZERO;
+
+    let one = Scalar::ONE;
+
+    for (index, coeff) in lc.iter_inputs() {
+        let mut tmp = input_assignment[*index];
+        if coeff != &one {
+            tmp *= coeff;
+        }
+        acc += tmp;
+    }
+
+    for (index, coeff) in lc.iter_aux() {
+        let mut tmp = aux_assignment[*index];
+        if coeff != &one {
+            tmp *= coeff;
+        }
+        acc += tmp;
+    }
+
+    acc
+}
+
+// `T` is a slice of `Scalar`s. This way it works with `&[Scalar]` as well as `&ScalarVec<Scalar>`
+pub(crate) fn eval_with_trackers<'a, Scalar, T>(
+    lc: &'a LinearCombination<Scalar>,
+    mut input_density: Option<&'a mut DensityTracker>,
+    mut aux_density: Option<&'a mut DensityTracker>,
     input_assignment: &[Scalar],
-    aux_assignment: &[Scalar],
-) -> Scalar {
+    aux_assignment: &'a T,
+) -> Scalar
+where
+    Scalar: PrimeField + ops::AddAssign<T::Output>,
+    T: ops::Index<usize>,
+    T::Output: PrimeField + std::ops::MulAssign<&'a Scalar>,
+{
     let mut acc = Scalar::ZERO;
 
     let one = Scalar::ONE;