Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions crypto/stark/benches/profile_prover.rs
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ fn main() {
fri_number_of_queries: 100,
coset_offset: 3,
grinding_factor: 0,
fri_final_poly_log_degree: 7,
};

let num_columns = 16;
Expand Down
1 change: 1 addition & 0 deletions crypto/stark/benches/prover_benchmark.rs
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,7 @@ fn benchmark_proof_options() -> ProofOptions {
fri_number_of_queries: 30,
coset_offset: 3,
grinding_factor: 0,
fri_final_poly_log_degree: 7,
}
}

Expand Down
91 changes: 63 additions & 28 deletions crypto/stark/src/fri/mod.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
pub mod fri_commitment;
pub mod fri_decommit;
pub(crate) mod fri_functions;
pub(crate) mod terminal;

use crypto::fiat_shamir::is_transcript::IsStarkTranscript;
use math::field::element::FieldElement;
Expand All @@ -16,25 +17,30 @@ use self::fri_functions::{
};

/// FRI commit phase from pre-computed bit-reversed evaluations, skipping the
/// initial FFT. Use this when the caller already has the evaluation vector
/// (e.g. from a fused LDE pipeline).
/// initial FFT. Stops folding when the remaining codeword encodes a polynomial
/// of degree < 2^`final_poly_log_degree` with blowup 2^`blowup_log`, and
/// returns the coefficient vector of that terminal polynomial.
///
/// The `T: Clone` and `F/E: 'static` bounds are required by the cuda GPU
/// fast path (`try_fri_commit_gpu` snapshots the transcript and TypeId-
/// checks the field types). They are present unconditionally (including
/// in builds without the `cuda` feature) to keep one stable signature.
#[allow(clippy::type_complexity)]
pub fn commit_phase_from_evaluations<
F: IsFFTField + IsSubFieldOf<E> + 'static,
E: IsField + 'static,
E: IsField + 'static + Send + Sync,
T: IsStarkTranscript<E, F> + Clone,
>(
number_layers: usize,
// `_number_layers`: retained for signature stability with the cuda fast path; termination is now driven by blowup_log + final_poly_log_degree.
_number_layers: usize,
mut evals: Vec<FieldElement<E>>,
transcript: &mut T,
coset_offset: &FieldElement<F>,
domain_size: usize,
blowup_log: u32,
final_poly_log_degree: u32,
) -> (
FieldElement<E>,
Vec<FieldElement<E>>,
Vec<FriLayer<E, FriLayerMerkleTreeBackend<E>>>,
)
where
Expand All @@ -50,27 +56,39 @@ where
// had never been tried.
#[cfg(feature = "cuda")]
{
// GPU FRI commit is disabled unconditionally (see `try_fri_commit_gpu`
// in gpu_lde.rs for the full explanation). The CPU fallback below
// handles all cases correctly, including early termination.
if let Some(result) = crate::gpu_lde::try_fri_commit_gpu::<F, E, T>(
number_layers,
_number_layers,
&evals,
transcript,
coset_offset,
domain_size,
blowup_log,
final_poly_log_degree,
) {
return result;
}
}

// Determine how many total folds are needed to reach the terminal codeword.
// terminal_len = 2^(blowup_log + k), clamped to initial_len for tiny inputs.
let initial_len = evals.len();
let k = final_poly_log_degree as usize;
let terminal_len = ((1usize << blowup_log) << k).min(initial_len);
let total_folds = (initial_len / terminal_len).trailing_zeros() as usize;
let num_committed = total_folds.saturating_sub(1);

// Inverse twiddle factors for evaluation-form folding.
let mut inv_twiddles = compute_coset_twiddles_inv(coset_offset, domain_size);
let mut fri_layer_list = Vec::with_capacity(num_committed);
// Track the coset offset as it squares with each fold (needed for iFFT in terminal).
let mut terminal_offset = coset_offset.clone();

// The loop commits `number_layers - 1` folded layers; the final fold below
// produces the (uncommitted) last value.
let num_committed_layers = number_layers.saturating_sub(1);
let mut fri_layer_list = Vec::with_capacity(num_committed_layers);

for _ in 0..num_committed_layers {
// <<<< Receive challenge 𝜁ₖ₋₁
// Commit `num_committed` folded layers to the transcript.
for _ in 0..num_committed {
// <<<< Receive challenge 𝜁ₖ
let zeta = transcript.sample_field_element();

// Fold evaluations in-place (no FFT needed).
Expand All @@ -89,25 +107,42 @@ where
// >>>> Send commitment: [pₖ]
transcript.append_bytes(&root);

// Update twiddles for the next level.
// Update twiddles and offset for the next level.
update_twiddles_in_place(&mut inv_twiddles);
terminal_offset = terminal_offset.square();
}

// <<<< Receive challenge: 𝜁ₙ₋₁
let zeta = transcript.sample_field_element();

// Final fold.
fold_evaluations_in_place(&mut evals, &zeta, &inv_twiddles);

let last_value = evals
.first()
.expect("FRI evals are non-empty after folding")
.clone();

// >>>> Send value: pₙ
transcript.append_field_element(&last_value);
// One final fold to reach the terminal codeword (size terminal_len), unless
// already there (total_folds == 0 means initial_len == terminal_len).
if total_folds > 0 {
// <<<< Receive challenge: 𝜁_final
let zeta = transcript.sample_field_element();
fold_evaluations_in_place(&mut evals, &zeta, &inv_twiddles);
terminal_offset = terminal_offset.square();
}
debug_assert_eq!(evals.len(), terminal_len, "terminal codeword size mismatch");

// Recover the low-degree polynomial coefficients from the terminal codeword
// and send them to the verifier.
//
// The number of coefficients is determined by the *actual* terminal codeword,
// not the requested `final_poly_log_degree`: for tiny inputs `terminal_len`
// is clamped to `initial_len`, so the terminal polynomial has degree
// < terminal_len / 2^blowup_log = 2^(log2(terminal_len) - blowup_log). Using
// this clamped exponent keeps the coefficient count in lockstep with what the
// verifier reconstructs (`expected_k = min(k, trace_bits)`); passing the raw
// `final_poly_log_degree` would over-pad with zeros and break the round-trip.
let effective_log_degree = terminal_len.trailing_zeros() - blowup_log;
let final_poly_coeffs = crate::fri::terminal::coeffs_from_terminal_codeword::<F, E>(
&evals,
&terminal_offset,
effective_log_degree,
);
for c in &final_poly_coeffs {
transcript.append_field_element(c);
}

(last_value, fri_layer_list)
(final_poly_coeffs, fri_layer_list)
}

pub fn query_phase<F: IsField>(
Expand Down
109 changes: 109 additions & 0 deletions crypto/stark/src/fri/terminal.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,109 @@
//! Conversion helpers between a FRI terminal codeword and the coefficients of
//! the low-degree polynomial it encodes.
//!
//! These are pure, self-contained helpers — no transcript, no FRI logic.
//! They are used by the prover (`commit_phase_from_evaluations`) and verifier FRI step.

use math::fft::bit_reversing::in_place_bit_reverse_permute;
use math::field::element::FieldElement;
use math::field::traits::{IsFFTField, IsField, IsSubFieldOf};
use math::polynomial::Polynomial;

/// Prover side: given a FRI terminal codeword in **bit-reversed** order,
/// recover the `2^final_poly_log_degree` coefficients of the underlying
/// low-degree polynomial.
///
/// The codeword is expected to be a coset evaluation of a polynomial of degree
/// less than `2^final_poly_log_degree` on the coset `terminal_offset·⟨ω⟩` of
/// size `blowup·2^k`, where `k = final_poly_log_degree` and `blowup` is derived
/// here as `codeword_bitrev.len() / 2^k`.
///
/// Algorithm:
/// 1. Bit-reverse permute to convert from FRI order to natural (DFT) order.
/// 2. Decimate: extract the size-`2^k` sub-coset
///    `terminal_offset·⟨ω^blowup⟩` = every `blowup`-th natural-order point.
/// 3. Coset iFFT on the small (`2^k`-point) sub-domain — a `blowup×`-smaller
///    transform that recovers the `2^k` coefficients directly (no oversized
///    transform and no wasteful truncation).
///
/// The returned vector always has exactly `2^k` entries; it is zero-padded if
/// the interpolated polynomial reports fewer coefficients.
///
/// # Panics
///
/// Panics if any of the following preconditions are violated:
/// - `2^final_poly_log_degree` fits in a `usize`,
/// - `codeword_bitrev.len() >= 2^final_poly_log_degree`, and
/// - `codeword_bitrev.len()` is a multiple of `2^final_poly_log_degree`.
///
/// These checks are active in release builds as well, matching the hard
/// precondition assert in `terminal_codeword_from_coeffs`. Also panics if the
/// coset interpolation itself reports an error.
pub(crate) fn coeffs_from_terminal_codeword<F, E>(
    codeword_bitrev: &[FieldElement<E>],
    terminal_offset: &FieldElement<F>,
    final_poly_log_degree: u32,
) -> Vec<FieldElement<E>>
where
    F: IsFFTField + IsSubFieldOf<E>,
    E: IsField + Send + Sync,
{
    // Number of coefficients to recover. `checked_shl` rejects exponents that
    // a plain `<<` would only catch in debug builds.
    let keep = 1usize
        .checked_shl(final_poly_log_degree)
        .expect("coeffs_from_terminal_codeword: 2^final_poly_log_degree must fit in usize");

    // Validate the codeword shape up front. Without this, a too-short codeword
    // gives `blowup == 0` (an opaque `step_by(0)` panic) and a non-multiple
    // length yields a wrong-sized sub-coset that release builds would not catch.
    let codeword_len = codeword_bitrev.len();
    assert!(
        keep <= codeword_len && codeword_len.is_multiple_of(keep),
        "coeffs_from_terminal_codeword: codeword length ({}) must be a non-zero multiple of 2^final_poly_log_degree ({})",
        codeword_len,
        keep,
    );
    let blowup = codeword_len / keep;

    // Bit-reversed -> natural order.
    let mut natural = codeword_bitrev.to_vec();
    in_place_bit_reverse_permute(&mut natural);

    // A degree-<2^k poly is determined by 2^k points: take the size-2^k sub-coset
    // terminal_offset*<w^blowup> = every `blowup`-th natural-order evaluation.
    // The assert above guarantees this yields exactly `keep` points.
    let sub_coset: Vec<FieldElement<E>> = natural.into_iter().step_by(blowup).collect();

    // Coset iFFT on the small domain -> the 2^k coefficients directly (no oversized trim).
    let poly = Polynomial::interpolate_offset_fft::<F>(&sub_coset, terminal_offset)
        .expect("terminal sub-coset must have power-of-two length and non-zero offset");

    // Pad with zeros only if interpolation dropped trailing-zero coeffs, so the
    // proof always carries exactly 2^k coefficients (the verifier length-checks).
    let mut coeffs = poly.coefficients().to_vec();
    coeffs.resize(keep, FieldElement::<E>::zero());
    coeffs
}

/// Verifier side: rebuild the full FRI terminal codeword, in **bit-reversed**
/// order, from the coefficients of the low-degree polynomial it encodes.
///
/// The polynomial is evaluated with a coset FFT using shift `terminal_offset`,
/// a base domain of `coeffs.len()` points and a blowup factor of
/// `codeword_len / coeffs.len()`. The resulting natural-order evaluations are
/// then bit-reverse permuted into FRI order.
///
/// # Panics
///
/// Panics unless all of the following hold:
/// - `coeffs` is non-empty,
/// - `coeffs.len()` is a power of two,
/// - `codeword_len` is a power of two,
/// - `coeffs.len() <= codeword_len`, and
/// - `codeword_len` is divisible by `coeffs.len()`.
///
/// The verifier is expected to length-check the final polynomial before
/// calling this helper, so in the normal flow the assert does not fire. Also
/// panics if the coset FFT itself reports an error.
pub(crate) fn terminal_codeword_from_coeffs<F, E>(
    coeffs: &[FieldElement<E>],
    terminal_offset: &FieldElement<F>,
    codeword_len: usize,
) -> Vec<FieldElement<E>>
where
    F: IsFFTField + IsSubFieldOf<E>,
    E: IsField + Send + Sync,
{
    let num_coeffs = coeffs.len();

    // Shape preconditions, evaluated in the same order as documented above.
    let shape_is_valid = num_coeffs != 0
        && num_coeffs.is_power_of_two()
        && codeword_len.is_power_of_two()
        && num_coeffs <= codeword_len
        && codeword_len.is_multiple_of(num_coeffs);
    assert!(
        shape_is_valid,
        "terminal_codeword_from_coeffs: coeffs.len() ({}) must be a non-zero power of two dividing codeword_len ({}); the verifier must length-check coeffs before calling",
        num_coeffs,
        codeword_len,
    );

    // Ratio between the codeword size and the number of coefficients.
    let expansion = codeword_len / num_coeffs;
    let terminal_poly = Polynomial::new(coeffs);

    // Coset FFT: natural-order evaluations over the full terminal coset.
    let mut codeword = Polynomial::evaluate_offset_fft::<F>(
        &terminal_poly,
        expansion,
        Some(num_coeffs),
        terminal_offset,
    )
    .expect("terminal coset size must be a power of two within the field's two-adicity");

    // Natural order -> bit-reversed (FRI) order.
    in_place_bit_reverse_permute(&mut codeword);
    codeword
}
62 changes: 44 additions & 18 deletions crypto/stark/src/gpu_lde.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1519,22 +1519,28 @@ where
/// concrete transcript type to support snapshot semantics via `Clone`.
#[allow(clippy::type_complexity)]
pub(crate) fn try_fri_commit_gpu<F, E, T>(
number_layers: usize,
_number_layers: usize,
evals: &[FieldElement<E>],
transcript: &mut T,
coset_offset: &FieldElement<F>,
domain_size: usize,
blowup_log: u32,
final_poly_log_degree: u32,
) -> Option<(
FieldElement<E>,
Vec<FieldElement<E>>,
Vec<FriLayer<E, FriLayerMerkleTreeBackend<E>>>,
)>
where
F: IsFFTField + IsField + IsSubFieldOf<E> + 'static,
E: IsField + 'static,
E: IsField + 'static + Send + Sync,
FieldElement<F>: AsBytes,
FieldElement<E>: AsBytes,
T: IsStarkTranscript<E, F> + Clone,
{
// GPU drives the early-termination FRI commit phase, mirroring
// `commit_phase_from_evaluations`: for each committed layer (sample zeta,
// fold, append root); then one final fold to the terminal codeword whose
// coefficients are emitted (not a single value).
if TypeId::of::<F>() != TypeId::of::<GoldilocksField>() {
return None;
}
Expand Down Expand Up @@ -1576,11 +1582,21 @@ where
// produced had this dispatch never been called.
let transcript_snapshot = transcript.clone();

let num_committed_layers = number_layers.saturating_sub(1);
// Early-termination schedule (mirrors commit_phase_from_evaluations):
// terminal_len = 2^(blowup_log + final_poly_log_degree), clamped to n0.
let k = final_poly_log_degree as usize;
let terminal_len = ((1usize << blowup_log) << k).min(n0);
let total_folds = (n0 / terminal_len).trailing_zeros() as usize;
// The GPU path only runs above gpu_lde_threshold(); tiny clamped traces
// (total_folds == 0) are handled by the CPU fallback.
if total_folds == 0 {
return None;
}
let num_committed = total_folds - 1;
let mut fri_layer_list: Vec<FriLayer<E, FriLayerMerkleTreeBackend<E>>> =
Vec::with_capacity(num_committed_layers);
Vec::with_capacity(num_committed);

for _ in 0..num_committed_layers {
for _ in 0..num_committed {
// <<<< Receive challenge zeta_k
let zeta: FieldElement<E> = transcript.sample_field_element();
// SAFETY: E == Ext3.
Expand Down Expand Up @@ -1614,27 +1630,37 @@ where
transcript.append_bytes(&root_arr);
}

// <<<< Receive challenge zeta_{n-1}
let zeta_last: FieldElement<E> = transcript.sample_field_element();
let zeta_ptr = &zeta_last as *const FieldElement<E> as *const u64;
// Final (uncommitted) fold to the terminal codeword. n_out == terminal_len
// >= 2, so reuse fold_and_commit_layer and keep only its evaluations; the
// Merkle root/nodes are discarded (the terminal layer is sent as coeffs).
let zeta_final: FieldElement<E> = transcript.sample_field_element();
let zeta_ptr = &zeta_final as *const FieldElement<E> as *const u64;
let zeta_raw: [u64; 3] = unsafe { [*zeta_ptr, *zeta_ptr.add(1), *zeta_ptr.add(2)] };

let last_raw = match state.fold_final(zeta_raw) {
let (_root, terminal_evals_u64, _nodes) = match state.fold_and_commit_layer(zeta_raw) {
Ok(v) => v,
Err(_) => {
*transcript = transcript_snapshot;
return None;
}
};
let last_vec = u64_to_ext3_vec::<E>(&last_raw);
let last_value = last_vec
.into_iter()
.next()
.expect("fold_final returns 1 elt");
debug_assert_eq!(terminal_evals_u64.len(), terminal_len * 3);
let terminal_codeword = u64_to_ext3_vec::<E>(&terminal_evals_u64);

// CPU-side coefficient extraction, identical to commit_phase_from_evaluations.
let terminal_offset = coset_offset.pow(1u64 << total_folds);
let effective_log_degree = terminal_len.trailing_zeros() - blowup_log;
let final_poly_coeffs = crate::fri::terminal::coeffs_from_terminal_codeword::<F, E>(
&terminal_codeword,
&terminal_offset,
effective_log_degree,
);

// >>>> Send value: p_n
transcript.append_field_element(&last_value);
// >>>> Send the final polynomial coefficients.
for c in &final_poly_coeffs {
transcript.append_field_element(c);
}

GPU_FRI_CALLS.fetch_add(1, Ordering::Relaxed);
Some((last_value, fri_layer_list))
Some((final_poly_coeffs, fri_layer_list))
}
Loading
Loading