From 1841ca122c09b252d8b1d8180307d0d4fa7ae66f Mon Sep 17 00:00:00 2001 From: Bobbin Threadbare Date: Fri, 27 Oct 2023 15:54:39 -0700 Subject: [PATCH] fix: gpu trace lde --- air/src/lib.rs | 2 +- prover/Cargo.toml | 2 +- prover/src/gpu.rs | 414 ++++++++++++++++++++++------------------------ prover/src/lib.rs | 2 +- 4 files changed, 200 insertions(+), 220 deletions(-) diff --git a/air/src/lib.rs b/air/src/lib.rs index f347914e13..b8c323aa62 100644 --- a/air/src/lib.rs +++ b/air/src/lib.rs @@ -37,7 +37,7 @@ pub use vm_core::{ utils::{DeserializationError, ToElements}, Felt, FieldElement, StarkField, }; -pub use winter_air::{FieldExtension, ProofOptions as WinterProofOptions}; +pub use winter_air::{proof::Queries, FieldExtension, ProofOptions as WinterProofOptions}; // PROCESSOR AIR // ================================================================================================ diff --git a/prover/Cargo.toml b/prover/Cargo.toml index 55c23c357e..3cffc65b80 100644 --- a/prover/Cargo.toml +++ b/prover/Cargo.toml @@ -27,5 +27,5 @@ winter-prover = { package = "winter-prover", version = "0.7", default-features = [target.'cfg(all(target_arch = "aarch64", target_os = "macos"))'.dependencies] elsa = { version = "1.9", optional = true } -ministark-gpu = { version = "0.1", features = [ "winterfell" ], optional = true } +ministark-gpu = { version = "0.2", features = [ "winterfell" ], optional = true } pollster = { version = "0.3", optional = true } diff --git a/prover/src/gpu.rs b/prover/src/gpu.rs index 93d868bce6..942a491917 100644 --- a/prover/src/gpu.rs +++ b/prover/src/gpu.rs @@ -1,7 +1,12 @@ //! This module contains GPU acceleration logic for Apple Silicon devices. For now the //! logic is limited to GPU accelerating RPO 256 trace commitments. -use crate::{ExecutionProver, WinterProofOptions}; -use air::{FieldElement, PublicInputs}; + +use super::{ + crypto::{RandomCoin, Rpo256, RpoDigest}, + math::{fft, Felt}, + ExecutionProver, ExecutionTrace, ProcessorAir, WinterProofOptions, +}; +use air::{FieldElement, PublicInputs, Queries}; use elsa::FrozenVec; use log::debug; use ministark_gpu::{ @@ -9,39 +14,41 @@ use ministark_gpu::{ utils::page_aligned_uninit_vector, }; use pollster::block_on; -use processor::{ - crypto::{RandomCoin, Rpo256, RpoDigest}, - math::{fft, Felt}, - ExecutionTrace, ONE, -}; +use processor::ONE; use std::time::Instant; use winter_prover::{ - crypto::{ElementHasher, Hasher, MerkleTree}, + crypto::MerkleTree, matrix::{build_segments, get_evaluation_offsets, Segment}, - ColMatrix, CompositionPoly, CompositionPolyTrace, ConstraintCommitment, EvaluationFrame, - Prover, RowMatrix, StarkDomain, TraceInfo, TraceLayout, TraceLde, TracePolyTable, + AuxTraceRandElements, ColMatrix, CompositionPoly, CompositionPolyTrace, ConstraintCommitment, + ConstraintCompositionCoefficients, DefaultConstraintEvaluator, EvaluationFrame, Prover, + RowMatrix, StarkDomain, TraceInfo, TraceLayout, TraceLde, TracePolyTable, }; +// CONSTANTS +// ================================================================================================ + const RPO_RATE: usize = Rpo256::RATE_RANGE.end - Rpo256::RATE_RANGE.start; +// METAL RPO PROVER +// ================================================================================================ + /// Wraps an [ExecutionProver] and provides GPU acceleration for building Rpo256 trace commitments. -pub(crate) struct GpuRpoExecutionProver(pub ExecutionProver) +pub(crate) struct MetalRpoExecutionProver(pub ExecutionProver) where R: RandomCoin; -impl Prover for GpuRpoExecutionProver +impl Prover for MetalRpoExecutionProver where R: RandomCoin, { - type Air = as Prover>::Air; + type Air = ProcessorAir; type BaseField = Felt; - type Trace = as Prover>::Trace; + type Trace = ExecutionTrace; type HashFn = Rpo256; type RandomCoin = R; - type TraceLde> = - DefaultGpuTraceLde; - type ConstraintEvaluator<'a, E: air::FieldElement> = - as Prover>::ConstraintEvaluator<'a, E>; + type TraceLde> = MetalRpoTraceLde; + type ConstraintEvaluator<'a, E: air::FieldElement> = + DefaultConstraintEvaluator<'a, ProcessorAir, E>; fn options(&self) -> &WinterProofOptions { self.0.options() @@ -51,6 +58,24 @@ where self.0.get_pub_inputs(trace) } + fn new_trace_lde>( + &self, + trace_info: &TraceInfo, + main_trace: &ColMatrix, + domain: &StarkDomain, + ) -> (Self::TraceLde, TracePolyTable) { + MetalRpoTraceLde::new(trace_info, main_trace, domain) + } + + fn new_evaluator<'a, E: FieldElement>( + &self, + air: &'a ProcessorAir, + aux_rand_elements: AuxTraceRandElements, + composition_coefficients: ConstraintCompositionCoefficients, + ) -> Self::ConstraintEvaluator<'a, E> { + self.0.new_evaluator(air, aux_rand_elements, composition_coefficients) + } + /// Evaluates constraint composition polynomial over the LDE domain and builds a commitment /// to these evaluations. /// @@ -75,28 +100,23 @@ where /// ────┼────────┼────────┼────────┼────────┼────────┼─── /// t=n t=n+1 t=n+2 t=n+3 t=n+4 t=n+5 /// ``` - // TODO: consider merging build_constraint_commitment and build_trace_commitment in Winterfell - // * https://github.com/facebook/winterfell/pull/192 - // * https://github.com/0xPolygonMiden/miden-vm/issues/877 - fn build_constraint_commitment( + fn build_constraint_commitment>( &self, - composition_poly: CompositionPolyTrace, + composition_poly_trace: CompositionPolyTrace, num_trace_poly_columns: usize, - domain: &StarkDomain, - ) -> (ConstraintCommitment, CompositionPoly) - where - E: FieldElement, - { + domain: &StarkDomain, + ) -> (ConstraintCommitment, CompositionPoly) { // evaluate composition polynomial columns over the LDE domain let now = Instant::now(); - let polys = CompositionPoly::new(composition_poly, domain, num_trace_poly_columns); + let composition_poly = + CompositionPoly::new(composition_poly_trace, domain, num_trace_poly_columns); let blowup = domain.trace_to_lde_blowup(); let offsets = - get_evaluation_offsets::(composition_poly.num_rows(), blowup, domain.offset()); - let segments = build_segments(polys.data(), domain.trace_twiddles(), &offsets); + get_evaluation_offsets::(composition_poly.column_len(), blowup, domain.offset()); + let segments = build_segments(composition_poly.data(), domain.trace_twiddles(), &offsets); debug!( "Evaluated {} composition polynomial columns over LDE domain (2^{} elements) in {} ms", - polys.num_columns(), + composition_poly.num_columns(), offsets.len().ilog2(), now.elapsed().as_millis() ); @@ -104,7 +124,8 @@ where // build constraint evaluation commitment let now = Instant::now(); let lde_domain_size = domain.lde_domain_size(); - let num_base_columns = polys.num_columns() * ::EXTENSION_DEGREE; + let num_base_columns = + composition_poly.num_columns() * ::EXTENSION_DEGREE; let rpo_requires_padding = num_base_columns % RPO_RATE != 0; let rpo_padded_segment_idx = rpo_requires_padding.then_some(num_base_columns / RPO_RATE); let mut row_hasher = GpuRpo256RowMajor::::new(lde_domain_size, rpo_requires_padding); @@ -138,113 +159,13 @@ where constraint_commitment.tree_depth(), now.elapsed().as_millis() ); - (constraint_commitment, polys) - } - - fn new_trace_lde( - &self, - trace_info: &TraceInfo, - main_trace: &ColMatrix, - domain: &StarkDomain, - ) -> (Self::TraceLde, TracePolyTable) - where - E: FieldElement, - { - DefaultGpuTraceLde::new(trace_info, main_trace, domain) - } - - fn new_evaluator<'a, E>( - &self, - air: &'a Self::Air, - aux_rand_elements: winter_prover::AuxTraceRandElements, - composition_coefficients: winter_prover::ConstraintCompositionCoefficients, - ) -> Self::ConstraintEvaluator<'a, E> - where - E: FieldElement, - { - self.0.new_evaluator(air, aux_rand_elements, composition_coefficients) - } -} - -struct SegmentGenerator<'a, E, I, const N: usize> -where - E: FieldElement, - I: IntoIterator>, -{ - poly_iter: I::IntoIter, - polys: Option>, - poly_offset: usize, - offsets: Vec, - domain: &'a StarkDomain, -} - -impl<'a, E, I, const N: usize> SegmentGenerator<'a, E, I, N> -where - E: FieldElement, - I: IntoIterator>, -{ - fn new(polys: I, domain: &'a StarkDomain) -> Self { - assert!(N > 0, "batch size N must be greater than zero"); - let poly_size = domain.trace_length(); - let lde_blowup = domain.trace_to_lde_blowup(); - let offsets = get_evaluation_offsets::(poly_size, lde_blowup, domain.offset()); - Self { - poly_iter: polys.into_iter(), - polys: None, - poly_offset: 0, - offsets, - domain, - } - } - - /// Returns the matrix of polynomials used to generate segments. - fn into_polys(self) -> Option> { - self.polys - } - - /// Returns a segment generating iterator. - fn gen_segment_iter(&mut self) -> SegmentIterator<'a, '_, E, I, N> { - SegmentIterator(self) - } - - /// Generates the next segment if it exists otherwise returns None. - fn gen_next_segment(&mut self) -> Option> { - // initialize our col matrix - if self.polys.is_none() { - self.polys = Some(ColMatrix::new(vec![self.poly_iter.next()?])); - } - - let offset = self.poly_offset; - let polys = self.polys.as_mut().unwrap(); - while polys.num_base_cols() < offset + N { - if let Some(poly) = self.poly_iter.next() { - polys.merge_column(poly) - } else { - break; - } - } - - // terminate if there are no more segments to create - if polys.num_base_cols() <= offset { - return None; - } - - let domain_size = self.domain.lde_domain_size(); - let mut data = unsafe { page_aligned_uninit_vector(domain_size) }; - if polys.num_base_cols() < offset + N { - // the segment will remain unfilled so we pad it with zeros - data.fill([E::BaseField::ZERO; N]); - } - - let twiddles = self.domain.trace_twiddles(); - let segment = Segment::new_with_buffer(data, &*polys, offset, &self.offsets, twiddles); - self.poly_offset += N; - Some(segment) + (constraint_commitment, composition_poly) } } -// TRACE LOW DEGREE EXTENSION +// TRACE LOW DEGREE EXTENSION (METAL) // ================================================================================================ + /// Contains all segments of the extended execution trace, the commitments to these segments, the /// LDE blowup factor, and the [TraceInfo]. /// @@ -253,25 +174,20 @@ where /// will always be elements in the base field (even when an extension field is used). /// - Auxiliary segments: a list of 0 or more segments for traces generated after the prover /// commits to the first trace segment. Currently, at most 1 auxiliary segment is possible. -pub struct DefaultGpuTraceLde< - E: FieldElement, - H: ElementHasher, -> { +pub struct MetalRpoTraceLde> { // low-degree extension of the main segment of the trace - main_segment_lde: RowMatrix, + main_segment_lde: RowMatrix, // commitment to the main segment of the trace - main_segment_tree: MerkleTree, + main_segment_tree: MerkleTree, // low-degree extensions of the auxiliary segments of the trace aux_segment_ldes: Vec>, // commitment to the auxiliary segments of the trace - aux_segment_trees: Vec>, + aux_segment_trees: Vec>, blowup: usize, trace_info: TraceInfo, } -impl, H: ElementHasher> - DefaultGpuTraceLde -{ +impl> MetalRpoTraceLde { /// Takes the main trace segment columns as input, interpolates them into polynomials in /// coefficient form, evaluates the polynomials over the LDE domain, commits to the /// polynomial evaluations, and creates a new [DefaultTraceLde] with the LDE of the main trace @@ -282,14 +198,14 @@ impl, H: ElementHasher, - domain: &StarkDomain, + domain: &StarkDomain, ) -> (Self, TracePolyTable) { // extend the main execution trace and build a Merkle tree from the extended trace let (main_segment_lde, main_segment_tree, main_segment_polys) = - build_trace_commitment::(main_trace, domain); + build_trace_commitment(main_trace, domain); let trace_poly_table = TracePolyTable::new(main_segment_polys); - let trace_lde = DefaultGpuTraceLde { + let trace_lde = MetalRpoTraceLde { main_segment_lde, main_segment_tree, aux_segment_ldes: Vec::new(), @@ -312,28 +228,24 @@ impl, H: ElementHasher &RowMatrix { + pub fn get_main_segment(&self) -> &RowMatrix { &self.main_segment_lde } /// Returns the entire trace for the column at the specified index. #[cfg(test)] - pub fn get_main_segment_column(&self, col_idx: usize) -> Vec { + pub fn get_main_segment_column(&self, col_idx: usize) -> Vec { (0..self.main_segment_lde.num_rows()) .map(|row_idx| self.main_segment_lde.get(col_idx, row_idx)) .collect() } } -impl TraceLde for DefaultGpuTraceLde -where - E: FieldElement, - H: ElementHasher, -{ - type HashFn = H; +impl> TraceLde for MetalRpoTraceLde { + type HashFn = Rpo256; /// Returns the commitment to the low-degree extension of the main trace segment. - fn get_main_trace_commitment(&self) -> ::Digest { + fn get_main_trace_commitment(&self) -> RpoDigest { let root_hash = self.main_segment_tree.root(); *root_hash } @@ -353,11 +265,11 @@ where fn add_aux_segment( &mut self, aux_trace: &ColMatrix, - domain: &StarkDomain, - ) -> (ColMatrix, ::Digest) { + domain: &StarkDomain, + ) -> (ColMatrix, RpoDigest) { // extend the auxiliary trace segment and build a Merkle tree from the extended trace let (aux_segment_lde, aux_segment_tree, aux_segment_polys) = - build_trace_commitment::(aux_trace, domain); + build_trace_commitment::(aux_trace, domain); // check errors assert!( @@ -379,11 +291,7 @@ where } /// Reads current and next rows from the main trace segment into the specified frame. - fn read_main_trace_frame_into( - &self, - lde_step: usize, - frame: &mut EvaluationFrame, - ) { + fn read_main_trace_frame_into(&self, lde_step: usize, frame: &mut EvaluationFrame) { // at the end of the trace, next state wraps around and we read the first step again let next_lde_step = (lde_step + self.blowup()) % self.trace_len(); @@ -442,45 +350,6 @@ where } } -fn build_segment_queries( - segment_lde: &RowMatrix, - segment_tree: &MerkleTree, - positions: &[usize], -) -> Queries -where - E: FieldElement, - H: ElementHasher, -{ - // for each position, get the corresponding row from the trace segment LDE and put all these - // rows into a single vector - let trace_states = - positions.iter().map(|&pos| segment_lde.row(pos).to_vec()).collect::>(); - - // build Merkle authentication paths to the leaves specified by positions - let trace_proof = segment_tree - .prove_batch(positions) - .expect("failed to generate a Merkle proof for trace queries"); - - Queries::new(trace_proof, trace_states) -} - -struct SegmentIterator<'a, 'b, E, I, const N: usize>(&'b mut SegmentGenerator<'a, E, I, N>) -where - E: FieldElement, - I: IntoIterator>; - -impl<'a, 'b, E, I, const N: usize> Iterator for SegmentIterator<'a, 'b, E, I, N> -where - E: FieldElement, - I: IntoIterator>, -{ - type Item = Segment; - - fn next(&mut self) -> Option { - self.0.gen_next_segment() - } -} - /// Computes a low-degree extension (LDE) of the provided execution trace over the specified /// domain and builds a commitment to the extended trace. /// @@ -506,17 +375,13 @@ where /// ────┼────────┼────────┼────────┼────────┼────────┼──── /// t=n t=n+1 t=n+2 t=n+3 t=n+4 t=n+5 /// ``` -fn build_trace_commitment( +fn build_trace_commitment>( trace: &ColMatrix, - domain: &StarkDomain, -) -> (RowMatrix, MerkleTree, ColMatrix) -where - E: FieldElement, - H: ElementHasher, -{ + domain: &StarkDomain, +) -> (RowMatrix, MerkleTree, ColMatrix) { // interpolate the execution trace let now = Instant::now(); - let inv_twiddles = fft::get_inv_twiddles::(trace.num_rows()); + let inv_twiddles = fft::get_inv_twiddles::(trace.num_rows()); let trace_polys = trace.columns().map(|col| { let mut poly = col.to_vec(); fft::interpolate_poly(&mut poly, &inv_twiddles); @@ -570,6 +435,121 @@ where (trace_lde, trace_tree, trace_polys) } +// SEGMENT GENERATOR +// ================================================================================================ + +struct SegmentGenerator<'a, E, I, const N: usize> +where + E: FieldElement, + I: IntoIterator>, +{ + poly_iter: I::IntoIter, + polys: Option>, + poly_offset: usize, + offsets: Vec, + domain: &'a StarkDomain, +} + +impl<'a, E, I, const N: usize> SegmentGenerator<'a, E, I, N> +where + E: FieldElement, + I: IntoIterator>, +{ + fn new(polys: I, domain: &'a StarkDomain) -> Self { + assert!(N > 0, "batch size N must be greater than zero"); + let poly_size = domain.trace_length(); + let lde_blowup = domain.trace_to_lde_blowup(); + let offsets = get_evaluation_offsets::(poly_size, lde_blowup, domain.offset()); + Self { + poly_iter: polys.into_iter(), + polys: None, + poly_offset: 0, + offsets, + domain, + } + } + + /// Returns the matrix of polynomials used to generate segments. + fn into_polys(self) -> Option> { + self.polys + } + + /// Returns a segment generating iterator. + fn gen_segment_iter(&mut self) -> SegmentIterator<'a, '_, E, I, N> { + SegmentIterator(self) + } + + /// Generates the next segment if it exists otherwise returns None. + fn gen_next_segment(&mut self) -> Option> { + // initialize our col matrix + if self.polys.is_none() { + self.polys = Some(ColMatrix::new(vec![self.poly_iter.next()?])); + } + + let offset = self.poly_offset; + let polys = self.polys.as_mut().unwrap(); + while polys.num_base_cols() < offset + N { + if let Some(poly) = self.poly_iter.next() { + polys.merge_column(poly) + } else { + break; + } + } + + // terminate if there are no more segments to create + if polys.num_base_cols() <= offset { + return None; + } + + let domain_size = self.domain.lde_domain_size(); + let mut data = unsafe { page_aligned_uninit_vector(domain_size) }; + if polys.num_base_cols() < offset + N { + // the segment will remain unfilled so we pad it with zeros + data.fill([Felt::ZERO; N]); + } + + let twiddles = self.domain.trace_twiddles(); + let segment = Segment::new_with_buffer(data, &*polys, offset, &self.offsets, twiddles); + self.poly_offset += N; + Some(segment) + } +} + +fn build_segment_queries>( + segment_lde: &RowMatrix, + segment_tree: &MerkleTree, + positions: &[usize], +) -> Queries { + // for each position, get the corresponding row from the trace segment LDE and put all these + // rows into a single vector + let trace_states = + positions.iter().map(|&pos| segment_lde.row(pos).to_vec()).collect::>(); + + // build Merkle authentication paths to the leaves specified by positions + let trace_proof = segment_tree + .prove_batch(positions) + .expect("failed to generate a Merkle proof for trace queries"); + + Queries::new(trace_proof, trace_states) +} + +struct SegmentIterator<'a, 'b, E, I, const N: usize>(&'b mut SegmentGenerator<'a, E, I, N>) +where + E: FieldElement, + I: IntoIterator>; + +impl<'a, 'b, E, I, const N: usize> Iterator for SegmentIterator<'a, 'b, E, I, N> +where + E: FieldElement, + I: IntoIterator>, +{ + type Item = Segment; + + fn next(&mut self) -> Option { + self.0.gen_next_segment() + } +} + // TESTS // ================================================================================================ @@ -583,7 +563,7 @@ mod tests { #[test] fn build_trace_commitment_on_gpu_with_padding_matches_cpu() { let cpu_prover = create_test_prover(); - let gpu_prover = GpuRpoExecutionProver(create_test_prover()); + let gpu_prover = MetalRpoExecutionProver(create_test_prover()); let num_rows = 1 << 8; let trace = gen_random_trace(num_rows, RPO_RATE + 1); let domain = StarkDomain::from_twiddles(fft::get_twiddles(num_rows), 8, Felt::GENERATOR); @@ -599,7 +579,7 @@ mod tests { #[test] fn build_trace_commitment_on_gpu_without_padding_matches_cpu() { let cpu_prover = create_test_prover(); - let gpu_prover = GpuRpoExecutionProver(create_test_prover()); + let gpu_prover = MetalRpoExecutionProver(create_test_prover()); let num_rows = 1 << 8; let trace = gen_random_trace(num_rows, RPO_RATE); let domain = StarkDomain::from_twiddles(fft::get_twiddles(num_rows), 8, Felt::GENERATOR); @@ -615,7 +595,7 @@ mod tests { #[test] fn build_constraint_commitment_on_gpu_with_padding_matches_cpu() { let cpu_prover = create_test_prover(); - let gpu_prover = GpuRpoExecutionProver(create_test_prover()); + let gpu_prover = MetalRpoExecutionProver(create_test_prover()); let num_rows = 1 << 8; let ce_blowup_factor = 2; let coeffs = gen_random_coeffs::>(num_rows * ce_blowup_factor); @@ -632,7 +612,7 @@ mod tests { #[test] fn build_constraint_commitment_on_gpu_without_padding_matches_cpu() { let cpu_prover = create_test_prover(); - let gpu_prover = GpuRpoExecutionProver(create_test_prover()); + let gpu_prover = MetalRpoExecutionProver(create_test_prover()); let num_rows = 1 << 8; let ce_blowup_factor = 8; let coeffs = gen_random_coeffs::(num_rows * ce_blowup_factor); diff --git a/prover/src/lib.rs b/prover/src/lib.rs index e941c04b96..4fb859deab 100644 --- a/prover/src/lib.rs +++ b/prover/src/lib.rs @@ -97,7 +97,7 @@ where stack_outputs.clone(), ); #[cfg(all(feature = "metal", target_arch = "aarch64", target_os = "macos"))] - let prover = gpu::GpuRpoExecutionProver(prover); + let prover = gpu::MetalRpoExecutionProver(prover); prover.prove(trace) } }