From a979221c6e1d255e53850ea0268701655af17ad4 Mon Sep 17 00:00:00 2001
From: Mikhail Volkhov <volhovm.cs@gmail.com>
Date: Thu, 6 Feb 2025 14:58:16 +0000
Subject: [PATCH] Turn off constraint parallelisation for small vector size?

---
 kimchi/src/circuits/constraints.rs | 19 ++++++++++++++++++-
 1 file changed, 18 insertions(+), 1 deletion(-)
diff --git a/kimchi/src/circuits/constraints.rs b/kimchi/src/circuits/constraints.rs
index b5935b142b..b138a6c7fd 100644
--- a/kimchi/src/circuits/constraints.rs
+++ b/kimchi/src/circuits/constraints.rs
@@ -374,11 +374,25 @@ impl<F: PrimeField, G: KimchiCurve<ScalarField = F>, OpeningProof: OpenProof<G>>
 impl<F: PrimeField> ConstraintSystem<F> {
     /// evaluate witness polynomials over domains
     pub fn evaluate(&self, w: &[DP<F>; COLUMNS], z: &DP<F>) -> WitnessOverDomains<F> {
-        // this optimisation saves 100ms
+        // This optimisation saves 100ms for the prover,
+        // but it adds 3% = 2.5ms to the verifier on small (1k rows) circuits.
+
+        // The idea is to keep threading to a minimum below a certain size threshold.
+        let min_len = {
+            let threads_to_use = if w[0].len() <= 2048 {
+                1
+            } else {
+                rayon::current_num_threads()
+            };
+            // One thread => min_len == COLUMNS, so the COLUMNS items form a single
+            // sequential batch; otherwise share them across the pool (at least 1 each).
+            std::cmp::max(1, COLUMNS / threads_to_use)
+        };
 
         // compute shifted witness polynomials
         let w8: [E<F, D<F>>; COLUMNS] = (0..COLUMNS)
             .into_par_iter()
+            .with_min_len(min_len)
             .map(|i| w[i].evaluate_over_domain_by_ref(self.domain.d8))
             .collect::<Vec<_>>()
             .try_into()
@@ -386,6 +400,7 @@ impl<F: PrimeField> ConstraintSystem<F> {
 
         let w4: [E<F, D<F>>; COLUMNS] = (0..COLUMNS)
             .into_par_iter()
+            .with_min_len(min_len)
             .map(|i| {
                 E::<F, D<F>>::from_vec_and_domain(
                     (0..self.domain.d4.size)
@@ -405,6 +420,7 @@ impl<F: PrimeField> ConstraintSystem<F> {
 
         let d4_next_w: [_; COLUMNS] = (0..COLUMNS)
             .into_par_iter()
+            .with_min_len(min_len)
             .map(|i| w4[i].shift(4))
             .collect::<Vec<_>>()
             .try_into()
@@ -412,6 +428,7 @@ impl<F: PrimeField> ConstraintSystem<F> {
 
         let d8_next_w: [_; COLUMNS] = (0..COLUMNS)
             .into_par_iter()
+            .with_min_len(min_len)
             .map(|i| w8[i].shift(8))
             .collect::<Vec<_>>()
             .try_into()