From 1a37e4565d0253721ee59cc4f7269570a59017e3 Mon Sep 17 00:00:00 2001 From: Frank McSherry Date: Tue, 26 May 2026 17:22:48 -0400 Subject: [PATCH 1/9] Chunk abstraction for all roles --- differential-dataflow/examples/chunks.rs | 119 ++ differential-dataflow/src/trace/chunk.rs | 2390 ++++++++++++++++++++++ differential-dataflow/src/trace/mod.rs | 1 + 3 files changed, 2510 insertions(+) create mode 100644 differential-dataflow/examples/chunks.rs create mode 100644 differential-dataflow/src/trace/chunk.rs diff --git a/differential-dataflow/examples/chunks.rs b/differential-dataflow/examples/chunks.rs new file mode 100644 index 000000000..66d8f7496 --- /dev/null +++ b/differential-dataflow/examples/chunks.rs @@ -0,0 +1,119 @@ +//! Minimal dataflow over the `Vec`-backed `Chunk` container. +//! +//! Mirrors the `val` arm of `spines.rs`, but arranges through `ChunkBatcher` / +//! `ChunkRcBuilder` / `ChunkSpine` — i.e. the merge batcher, builder, and spine +//! built atop the `Chunk` trait and its `ChunkBatch`. Run as: +//! +//! ```text +//! cargo run --release --example chunks -- <keys> <size> [mode] +//! 
``` + +use differential_dataflow::Hashable; +use differential_dataflow::input::Input; +use differential_dataflow::operators::arrange::Arrange; +use differential_dataflow::operators::arrange::arrangement::arrange_core; +use differential_dataflow::trace::chunk::vec_chunk::{ChunkBatcher, ChunkRcBuilder, ChunkSpine, VecChunk}; +use differential_dataflow::trace::chunk::col_chunk::{ColChunkBatcher, ColChunkRcBuilder, ColChunkSpine, ColChunker}; +use differential_dataflow::trace::implementations::Vector; +use differential_dataflow::trace::implementations::chunker::ContainerChunker; +use differential_dataflow::trace::implementations::ord_neu::{OrdValBatcher, RcOrdValBuilder, OrdValSpine}; + +use timely::dataflow::channels::pact::Exchange; +use timely::dataflow::operators::probe::Handle; + +fn main() { + let keys: usize = std::env::args().nth(1).unwrap().parse().unwrap(); + let size: usize = std::env::args().nth(2).unwrap().parse().unwrap(); + // "chunk" (default): our `Chunk`-backed trace. "ord": the standard `ord_neu` trace. + let mode: String = std::env::args().nth(3).unwrap_or_else(|| "chunk".to_string()); + println!("Running [{mode}] arrangement"); + + let timer = std::time::Instant::now(); + + // Skip the three positional args we consume (keys, size, mode); the rest are + // timely's worker flags. + timely::execute_from_args(std::env::args().skip(4), move |worker| { + let mut probe = Handle::new(); + let (mut data_input, mut keys_input) = worker.dataflow(|scope| { + let (data_input, data) = scope.new_collection::(); + let (keys_input, keys) = scope.new_collection::(); + let data = data.map(|x| (x, ())); + let keys = keys.map(|x| (x, ())); + + match mode.as_str() { + "chunk" => { + // The chunk batcher's output (`VecChunk`) differs from the stream + // container (`Vec`), so this is a cross-container chunker case: + // drop to `arrange_core` with an explicit `ContainerChunker`. 
+ type Ba = ChunkBatcher; + type Bu = ChunkRcBuilder; + type Sp = ChunkSpine; + type Chu = ContainerChunker>; + let data = arrange_core::<_, _, Chu, Ba, Bu, Sp>( + data.inner, Exchange::new(|u: &((u64, ()), u64, isize)| (u.0).0.hashed().into()), "Data"); + let keys = arrange_core::<_, _, Chu, Ba, Bu, Sp>( + keys.inner, Exchange::new(|u: &((u64, ()), u64, isize)| (u.0).0.hashed().into()), "Keys"); + keys.join_core(data, |_k, &(), &()| Option::<()>::None).probe_with(&mut probe); + } + "colchunk" => { + type L = Vector<((u64, ()), u64, isize)>; + type Ba = ColChunkBatcher; + type Bu = ColChunkRcBuilder; + type Sp = ColChunkSpine; + type Chu = ColChunker; + let data = arrange_core::<_, _, Chu, Ba, Bu, Sp>( + data.inner, Exchange::new(|u: &((u64, ()), u64, isize)| (u.0).0.hashed().into()), "Data"); + let keys = arrange_core::<_, _, Chu, Ba, Bu, Sp>( + keys.inner, Exchange::new(|u: &((u64, ()), u64, isize)| (u.0).0.hashed().into()), "Keys"); + keys.join_core(data, |_k, &(), &()| Option::<()>::None).probe_with(&mut probe); + } + "ord" => { + type Ba = OrdValBatcher; + type Bu = RcOrdValBuilder; + type Sp = OrdValSpine; + let data = data.arrange::(); + let keys = keys.arrange::(); + keys.join_core(data, |_k, &(), &()| Option::<()>::None).probe_with(&mut probe); + } + other => panic!("unrecognized mode: {other:?} (expected `chunk`, `colchunk`, or `ord`)"), + } + + (data_input, keys_input) + }); + + // Load `data`, advancing round by round. + let mut counter = 0; + let mut t: u64 = 1; + while counter < 10 * keys { + let mut i = worker.index(); + while i < size { + data_input.insert(((counter + i) % keys) as u64); + i += worker.peers(); + } + counter += size; + data_input.advance_to(t); data_input.flush(); + keys_input.advance_to(t); keys_input.flush(); + while probe.less_than(data_input.time()) { worker.step(); } + t += 1; + } + println!("{:?}\tloading complete", timer.elapsed()); + + // Issue `keys` queries against the arranged `data`. 
+ let mut queries = 0; + while queries < 10 * keys { + let mut i = worker.index(); + while i < size { + keys_input.insert(((queries + i) % keys) as u64); + i += worker.peers(); + } + queries += size; + data_input.advance_to(t); data_input.flush(); + keys_input.advance_to(t); keys_input.flush(); + while probe.less_than(keys_input.time()) { worker.step(); } + t += 1; + } + println!("{:?}\tqueries complete", timer.elapsed()); + }).unwrap(); + + println!("{:?}\tshut down", timer.elapsed()); +} diff --git a/differential-dataflow/src/trace/chunk.rs b/differential-dataflow/src/trace/chunk.rs new file mode 100644 index 000000000..7d51f38a4 --- /dev/null +++ b/differential-dataflow/src/trace/chunk.rs @@ -0,0 +1,2390 @@ +//! Sorted, consolidated runs of updates, and operators over sequences of them. +//! +//! A [`Chunk`] is a consolidated, sorted run of `(data, time, diff)` updates. +//! Chunks live in sequences (`Vec`) with no constraint on where the +//! breakpoints between them fall; each chunk holds at most [`Chunk::TARGET`] +//! updates. The trait deliberately exposes only batch-level operations — merge, +//! extract, advance — leaving the layout-aware work to the implementor. The +//! orchestration in this module (the binary merger) is generic over the layout +//! and concerns itself only with feeding chunks across calls. +//! +//! # Why chunks, and why one size +//! +//! A batch could be a single monolithic sorted run. We cut it into chunks because +//! the chunk is simultaneously the unit of four things, each of which wants a size +//! bound: +//! +//! * **Suspendable work.** The fueled merger does a chunk's-worth of work per step +//! and checks fuel at the boundary, so chunk size bounds a step's latency. +//! * **Immutable sharing.** Chunks are `Rc`-shared; the merger reads its sources by +//! *cloning* chunks (a refcount bump). The chunk is the finest granularity of sharing. +//! * **Allocation recycling.** Emptied input buffers are reused as output buffers; +//! 
that only composes if buffers are roughly one size. +//! * **Indexing.** [`ChunkBatch`] indexes chunks by their first/last key, and the +//! cursor binary-searches *over* chunks then gallops *within* one. The chunk +//! count (≈ `len / TARGET`) sets the outer index size and search depth. +//! +//! So the size bound pulls two ways: an upper bound (latency, memory) says "not too +//! big," and a lower bound (per-chunk overhead, index bloat) says "not too +//! fragmented." Keeping chunks one size is what lets a single knob satisfy both. +//! The grading invariant ([`is_graded`]) encodes exactly this: every chunk is at +//! most `TARGET`, and every *adjacent pair* exceeds `TARGET` — i.e. no two +//! neighbours could be combined into one legal chunk. That makes `TARGET` both the +//! maximum size and the coalescing threshold (the invariant is self-similar), and +//! a graded sequence a *maximal packing*: as few chunks as the maximum allows. +//! +//! The intent is for a `Chunk` implementation to be each of +//! 1. the containers a `Collection` can transit. +//! 2. the containers a `MergeBatcher` can work with. +//! 3. the containers a `Batch` can be backed by. +//! It does this by exposing a small set of chunk-oriented primitives, which are +//! sufficient for harnesses for each of these tasks. + +use timely::progress::Antichain; +use timely::progress::frontier::AntichainRef; +use crate::lattice::Lattice; +use crate::trace::{Batch, BatchReader, Description}; +use crate::trace::cursor::Cursor; +use crate::trace::implementations::{BatchContainer, Layout, LayoutExt, WithLayout}; + +/// The key container of chunk `C`'s layout. Named via the `Layout` projection so +/// it unifies with the cursor's `Self::Key`, which also projects through `Layout`. +type KeyCon = <::Layout as Layout>::KeyContainer; +/// The val container of chunk `C`'s layout. +type ValCon = <::Layout as Layout>::ValContainer; + +/// A partially consumed head and optional tail of chunks. 
+pub type ChunkFeed = ((usize, C), Vec); + +/// Whether `chunks` satisfy the [`Chunk::TARGET`] grading invariant: every chunk +/// at most `TARGET`, and every adjacent pair summing to more than `TARGET` (so no +/// two neighbours could be combined into one legal chunk — a *maximal packing*). +/// +/// This is the post-[`regrade`](Chunk::regrade) shape; useful as a test/debug check. +pub fn is_graded(chunks: &[C]) -> bool { + chunks.iter().all(|c| c.len() <= C::TARGET) + && chunks.windows(2).all(|w| w[0].len() + w[1].len() > C::TARGET) +} + +/// A list of chunks that maintains the `C::regrade` structural invariant. +/// +/// Producers `push` chunks in; each push runs `C::regrade`, which moves graded +/// runs into `data` and leaves anything not yet safe to emit in `todo`. `done` +/// flushes the remainder and yields the graded sequence. +pub struct ChunkList { + todo: Vec, + data: Vec, +} + +impl Default for ChunkList { + fn default() -> Self { Self { todo: Vec::new(), data: Vec::new() } } +} + +impl ChunkList { + /// Add a new chunk to the list, regrading as far as is safe. + pub fn push(&mut self, chunk: C) { + self.todo.push(chunk); + C::regrade(&mut self.todo, false, &mut self.data); + } + /// Add several chunks. + pub fn extend>(&mut self, chunks: I) { + for chunk in chunks { self.push(chunk); } + } + /// Finalize the list, flushing the remainder, and extract the graded sequence. + pub fn done(mut self) -> Vec { + C::regrade(&mut self.todo, true, &mut self.data); + assert!(self.todo.is_empty()); + self.data + } +} + +/// A consolidated, sorted sequence of `(data, time, diff)`. +/// +/// Chunks exist in sequences, with no constraints on the breakpoints between +/// them. Each holds at most [`TARGET`](Chunk::TARGET) updates; a graded sequence +/// is a maximal packing at that size (see [`is_graded`] and the module docs). +/// +/// `Clone` is expected to be cheap — a refcount bump on shared backing storage, +/// not a deep copy. 
The trace merger relies on this to read its (shared, +/// immutable) source batches by cloning chunks rather than consuming them, and +/// `prune` is likewise expected to be a range adjustment over shared storage. +/// +/// A chunk *has* a [`Cursor`] over its own `(key, val, time, diff)` contents — +/// the chunk is its own cursor `Storage`, mirroring [`BatchReader`]. This is what +/// lets a batch cursor delegate downward: the batch indexes which chunk holds a +/// key (reusing the chunk's `KeyContainer` / `ValContainer` for boundaries) and +/// then reads through that chunk's cursor. As with `merge`, we do not +/// provide this; the opaque chunk implementor does. +/// +/// # Implementor contract +/// +/// The chunk-producing operations (`merge`, `extract`, `advance`, `regrade`) emit +/// into a [`ChunkList`], and implementors are expected to: +/// +/// * **Respect the chain structure.** Emit *graded* chunks — sized to the +/// `regrade` invariant — rather than collapsing a run into one monolithic chunk +/// and leaning on `regrade` to re-split it. Building the right shape directly +/// avoids a redundant copy. +/// * **Bound output by input consumed.** Produce output chunks in proportion to +/// the input chunks consumed, never buffering an unbounded amount before +/// emitting. The fueled merger debits progress by the work it feeds across +/// suspensions; output that lags input arbitrarily breaks that accounting. +/// * **Recycle where possible.** Reuse the storage of chunks drained from the +/// input as the buffers for output, so allocations balance input against output +/// rather than allocating afresh per emitted chunk. `vec_chunk::extract` is the +/// worked example: it fills `TARGET`-sized buffers reclaimed from a stash of +/// emptied input `Vec`s. +/// +/// [`BatchReader`]: crate::trace::BatchReader +pub trait Chunk: Sized + Clone + LayoutExt { + + /// The chunk size: both the maximum updates per chunk and the coalescing + /// threshold. 
+ /// + /// A *graded* sequence (the post-[`regrade`](Chunk::regrade) shape) has every + /// chunk of length at most `TARGET`, and every adjacent pair summing to more + /// than `TARGET` — so no two neighbours could be combined into one legal chunk. + /// Equivalently, a maximal packing at size `TARGET`. [`is_graded`] checks + /// exactly this. The value is the implementor's tuning knob: larger means fewer + /// chunks (smaller index, less per-chunk overhead) but coarser merge-suspension + /// granularity and a larger within-chunk seek. + const TARGET: usize = 1024; + + /// A cursor navigating this chunk's contents; the chunk is its storage. + /// + /// The layout aliases are spelled out (mirroring [`BatchReader`]) so the + /// cursor's `Key`/`Val`/`Time`/`Diff` and their containers are *definitionally* + /// equal to the chunk's — without this the compiler won't connect the cursor's + /// layout to the chunk's when reading through it. + type Cursor: + Cursor + + WithLayout + + for<'a> LayoutExt< + Key<'a> = Self::Key<'a>, + Val<'a> = Self::Val<'a>, + ValOwn = Self::ValOwn, + Time = Self::Time, + TimeGat<'a> = Self::TimeGat<'a>, + Diff = Self::Diff, + DiffGat<'a> = Self::DiffGat<'a>, + KeyContainer = Self::KeyContainer, + ValContainer = Self::ValContainer, + TimeContainer = Self::TimeContainer, + DiffContainer = Self::DiffContainer, + >; + + /// Acquire a cursor over this chunk. + fn cursor(&self) -> Self::Cursor; + + /// The first and last `(key, val, time)` triples in the chunk. + /// + /// The chunk must be non-empty (batch chunks always are). Expected to be + /// cheap — the chunk's endpoints, e.g. columnar indices `0` and `len - 1`, + /// not a cursor walk. Indexing a batch's chunks rests on this: the last + /// triples drive a binary search to a key or `(key, val)`, and comparing one + /// chunk's last triple against the next chunk's first detects keys or + /// `(key, val)` pairs that straddle the boundary — all without touching chunk + /// contents. 
Returned by reference (no owned key type exists in the layout); + /// the index materializes them into its own containers. + fn bounds(&self) -> ( + (Self::Key<'_>, Self::Val<'_>, Self::TimeGat<'_>), + (Self::Key<'_>, Self::Val<'_>, Self::TimeGat<'_>), + ); + + /// The number of updates in the chunk. + /// + /// Chunks are always non-empty (`len() > 0`): producers drop empties before + /// they reach a chunk sequence, and [`ChunkBatch::new`] asserts the invariant. + fn len(&self) -> usize; + + /// Remove the first `prefix` updates, returning the remainder. + /// + /// Implemented via a singleton `merge`: with one input there is no horizon to + /// hold back, so the whole suffix `[prefix..]` is emitted. The remainder of a + /// graded chunk is at most one graded chunk. + fn prune(self, prefix: usize) -> Self { + let mut buffer = ChunkList::default(); + Self::merge(&mut [(prefix, self)], &mut buffer); + let mut data = buffer.done(); + assert_eq!(data.len(), 1); + data.pop().unwrap() + } + + /// Merges as much as possible from each of the input chunks. + /// + /// Input chunks come with a number of consumed prefix updates, which are not + /// intended for merging. The chunks are only able to merge through updates + /// that would be present in all inputs, generally up to the least last + /// `(key, val, time)` triple across the inputs. On return, the consumed + /// prefix of at least one input has advanced to that input's length, marking + /// it drained and signalling the caller to refill that slot. + fn merge(chunks: &mut [(usize, Self)], out: &mut ChunkList); + + /// Partition chunks into updates greater than or equal to `frontier` (`keep`) or not (`ship`). + /// + /// The lower envelope of the times routed to `keep` is folded into + /// `residual`, so the caller learns the frontier of data it still holds + /// without a second pass over the chunks. 
+ fn extract( + chunks: &mut Vec, + frontier: &Antichain, + residual: &mut Antichain, + keep: &mut ChunkList, + ship: &mut ChunkList, + ); + + /// Advance times in input chunks by `frontier` and push consolidated result out. + /// + /// To be certainly consolidated, all `(key, val)` updates must be present in + /// the input, or `done` must be set. A run of chunks may fail to be emitted if + /// they all share the same `(key, val)` and the implementor cannot be sure no + /// future times for the pair are yet to arrive. + fn advance( + feed: &mut ChunkFeed, + frontier: &Antichain, + done: bool, + out: &mut ChunkList, + ); + + /// Reshapes a sequence of consolidated chunks into a maximal packing: each at + /// most [`TARGET`](Chunk::TARGET), and any two adjacent chunks summing past + /// `TARGET` (so no neighbours could be combined). See [`is_graded`]. + /// + /// The implementor should guard against emitting sequences of chunks that violate + /// the invariant until `done` is set, indicating that the queue is complete. + /// The implementor is allowed to push chunks back onto `queue` if it needs to, but + /// should not corrupt the order of chunks and updates. + fn regrade( + queue: &mut Vec, + done: bool, + out: &mut Vec, + ); + +} + +/// Merge two sorted chains of chunks into one sorted chain. +/// +/// Presents the heads of `chain1` and `chain2` to [`Chunk::merge`], each +/// tagged with the prefix already consumed. After each call at least one head has +/// been drained to its length; that slot is refilled from its chain. When either +/// chain is exhausted, the partially-consumed remainder of the other is pruned of +/// its consumed prefix and the rest of that chain is appended verbatim. +pub fn merge_chains( + chain1: Vec, + chain2: Vec, + out: &mut ChunkList, +) { + let mut iter1 = chain1.into_iter(); + let mut iter2 = chain2.into_iter(); + + // Current head of each chain, tagged with its consumed prefix; `None` once + // that chain's iterator is exhausted. 
+ let mut head1 = iter1.next().map(|c| (0, c)); + let mut head2 = iter2.next().map(|c| (0, c)); + + while head1.is_some() && head2.is_some() { + let mut window = [head1.take().unwrap(), head2.take().unwrap()]; + C::merge(&mut window, out); + let [(p1, c1), (p2, c2)] = window; + // Refill whichever side(s) drained to length; keep partially-consumed ones. + head1 = if p1 >= c1.len() { iter1.next().map(|c| (0, c)) } else { Some((p1, c1)) }; + head2 = if p2 >= c2.len() { iter2.next().map(|c| (0, c)) } else { Some((p2, c2)) }; + } + + // One chain is exhausted; flush the partially-consumed remainder of the other, + // then its untouched tail. + for head in [head1, head2] { + if let Some((consumed, chunk)) = head { + // A retained head always has `consumed < len` (a fully-consumed one + // would have been refilled), so the pruned remainder is non-empty. + let chunk = if consumed > 0 { chunk.prune(consumed) } else { chunk }; + out.push(chunk); + } + } + out.extend(iter1); + out.extend(iter2); +} + +/// Drives [`Chunk::advance`] over a growing queue of chunks. +/// +/// Compaction may need to see several chunks before it can emit a consolidated +/// output chunk, because a `(key, val)` run can span chunk boundaries. The +/// implementor owns the `(next, tail)` representation and rotates it itself: it +/// can consume across chunks by amounts the driver cannot see, so the driver +/// never promotes from `tail` into `next`. The driver only appends incoming +/// chunks to `tail` and calls `advance`; a final [`Self::finish`] sets `done` to +/// flush whatever was being withheld. +pub struct AdvanceQueue { + /// The chunks awaiting advancement, as a head (with consumed prefix) and tail; + /// the implementor owns rotation between them. + feed: ChunkFeed, + /// Frontier to advance times by during compaction. + frontier: Antichain, +} + +impl AdvanceQueue { + /// A compactor that advances times by `frontier`. 
+ pub fn new(frontier: Antichain) -> Self { + Self { feed: ((0, C::default()), Vec::new()), frontier } + } + /// Append a completed merge's chunks and advance as far as is certain. + pub fn push>(&mut self, chunks: I, out: &mut ChunkList) { + self.feed.1.extend(chunks); + C::advance(&mut self.feed, &self.frontier, false, out); + } + /// Flush all remaining updates; no further chunks will be pushed. + pub fn finish(mut self, out: &mut ChunkList) { + C::advance(&mut self.feed, &self.frontier, true, out); + } +} + +/// A merge-batcher [`Merger`](crate::trace::implementations::merge_batcher::Merger) +/// over chains of [`Chunk`]s. +/// +/// `merge` runs the binary merger; `extract` splits by the seal frontier using +/// [`Chunk::extract`]. The batcher consolidates equal `(data, time)` updates +/// but does *not* advance times — time advancement is advance's job, handled +/// later in the trace. +pub struct ChunkMerger { + _marker: std::marker::PhantomData, +} + +impl Default for ChunkMerger { + fn default() -> Self { Self { _marker: std::marker::PhantomData } } +} + +impl crate::trace::implementations::merge_batcher::Merger for ChunkMerger +where + C: Chunk + Default + 'static, + C::Time: Clone + timely::PartialOrder + 'static, +{ + type Chunk = C; + type Time = C::Time; + + fn merge( + &mut self, + list1: Vec, + list2: Vec, + output: &mut Vec, + _stash: &mut Vec, + ) { + // The merge-batcher's chains are plain `Vec`s; grade through a `ChunkList`. + let mut graded = ChunkList::default(); + merge_chains(list1, list2, &mut graded); + output.extend(graded.done()); + } + + fn extract( + &mut self, + mut merged: Vec, + upper: AntichainRef, + frontier: &mut Antichain, + ship: &mut Vec, + kept: &mut Vec, + _stash: &mut Vec, + ) { + // `extract` keeps updates greater-or-equal `upper` and ships the rest, + // folding the lower envelope of kept times into `frontier`. 
+ let upper = upper.to_owned(); + let (mut keep, mut shipped) = (ChunkList::default(), ChunkList::default()); + C::extract(&mut merged, &upper, frontier, &mut keep, &mut shipped); + kept.extend(keep.done()); + ship.extend(shipped.done()); + } + + fn account(chunk: &C) -> (usize, usize, usize, usize) { (chunk.len(), 0, 0, 0) } +} + +/// The merge batcher for chunks of type `C`, merging pre-chunked `C` runs. +/// +/// The batcher accepts already-formed `C` chunks via `PushInto` and merges them +/// through [`ChunkMerger`]; it holds no chunker. The `Input → C` bridge lives at the +/// `arrange_core` callsite, which supplies the chunker (e.g. [`ContainerChunker`] +/// for same-shape input, where `C` satisfies the batcher-side container traits +/// `SizableContainer`, `Consolidate`, `Container`, `PushInto`). +/// +/// [`ContainerChunker`]: crate::trace::implementations::chunker::ContainerChunker +pub type ChunkBatcher = crate::trace::implementations::merge_batcher::MergeBatcher>; + +/// A spine of `Rc`-shared [`ChunkBatch`]es of type `C`: the trace type for `arrange`. +pub type ChunkSpine = crate::trace::implementations::spine_fueled::Spine>>; + +/// A reference-counted [`ChunkBatch`] builder over chunks of type `C`. +pub type ChunkRcBuilder = crate::trace::rc_blanket_impls::RcBuilder>; + +/// A batch is just an ordered sequence of [`Chunk`]s plus its time description. +/// +/// The chunks are sorted and consolidated, with chunk boundaries arbitrary; the +/// concatenation of their contents is the batch. +/// +/// This is a full [`Batch`](crate::trace::Batch): [`ChunkBatchCursor`] reads +/// across the chunks (delegating to each chunk's own cursor and continuing past +/// boundaries), [`ChunkBatchMerger`] performs the resumable merge-and-advance, +/// and [`ChunkBuilder`] collects pre-sorted chunks. All of those are below. +pub struct ChunkBatch { + /// Ordered, consolidated chunks; their concatenation is the batch. 
+ pub chunks: Vec, + /// The lower, upper, and since frontiers of the batch. + pub description: Description, + /// Per-chunk first and last key, and first and last val, parallel to `chunks`. + first_keys: KeyCon, + last_keys: KeyCon, + first_vals: ValCon, + last_vals: ValCon, +} + +impl ChunkBatch { + /// Assemble a batch from ordered chunks, building the per-chunk index. + pub fn new(chunks: Vec, description: Description) -> Self { + let n = chunks.len(); + let mut first_keys = >::with_capacity(n); + let mut last_keys = >::with_capacity(n); + let mut first_vals = >::with_capacity(n); + let mut last_vals = >::with_capacity(n); + for chunk in &chunks { + assert!(chunk.len() > 0, "ChunkBatch chunks must be non-empty"); + let ((fk, fv, _), (lk, lv, _)) = chunk.bounds(); + first_keys.push_ref(fk); + last_keys.push_ref(lk); + first_vals.push_ref(fv); + last_vals.push_ref(lv); + } + ChunkBatch { chunks, description, first_keys, last_keys, first_vals, last_vals } + } +} + +impl WithLayout for ChunkBatch { + type Layout = C::Layout; +} + +/// A cursor over a [`ChunkBatch`], concatenating the per-chunk cursors. +/// +/// Chunk breakpoints are unconstrained, so a single key — or `(key, val)` — may +/// straddle consecutive chunks. But the chunks are one globally-sorted sequence +/// merely cut at arbitrary points, so the operation is *concatenation*, never a +/// merge: across a boundary a key's vals concatenate and a `(key, val)`'s times +/// concatenate. The cursor exploits this. It holds the chunk currently being read +/// and a cursor into it; it seeks by binary-searching the per-chunk index on +/// `ChunkBatch`, and at boundaries it *continues* into the next chunk rather than +/// merging — using the index to detect when a key or `(key, val)` spills forward, +/// without touching chunk contents. +pub struct ChunkBatchCursor { + /// First chunk of the current key's run; where `rewind_vals` returns to. 
+ key_chunk: usize, + /// Chunk currently being read; `>= key_chunk`, within the current key's span. + chunk: usize, + /// Cursor into `chunk`; `None` once `chunk` is past the last chunk. + inner: Option, +} + +impl WithLayout for ChunkBatchCursor { + type Layout = C::Layout; +} + +impl ChunkBatchCursor { + /// Move the active chunk to `c`, opening a fresh inner cursor at its start. + fn goto(&mut self, c: usize, storage: &ChunkBatch) { + self.chunk = c; + self.inner = storage.chunks.get(c).map(C::cursor); + } +} + +impl Cursor for ChunkBatchCursor { + type Storage = ChunkBatch; + + fn key_valid(&self, s: &Self::Storage) -> bool { self.chunk < s.chunks.len() && self.inner.as_ref().is_some_and(|i| i.key_valid(&s.chunks[self.chunk])) } + fn val_valid(&self, s: &Self::Storage) -> bool { self.chunk < s.chunks.len() && self.inner.as_ref().is_some_and(|i| i.val_valid(&s.chunks[self.chunk])) } + fn key<'a>(&self, s: &'a Self::Storage) -> Self::Key<'a> { self.inner.as_ref().unwrap().key(&s.chunks[self.chunk]) } + fn val<'a>(&self, s: &'a Self::Storage) -> Self::Val<'a> { self.inner.as_ref().unwrap().val(&s.chunks[self.chunk]) } + fn get_key<'a>(&self, s: &'a Self::Storage) -> Option> { if self.key_valid(s) { Some(self.key(s)) } else { None } } + fn get_val<'a>(&self, s: &'a Self::Storage) -> Option> { if self.val_valid(s) { Some(self.val(s)) } else { None } } + + fn map_times, Self::DiffGat<'_>)>(&mut self, s: &Self::Storage, mut logic: L) { + if !self.val_valid(s) { return; } + let (k, v) = (self.key(s), self.val(s)); + self.inner.as_mut().unwrap().map_times(&s.chunks[self.chunk], &mut logic); + // Follow the (key, val) forward across boundaries while it spills. 
+ let mut c = self.chunk; + while c + 1 < s.chunks.len() + && s.last_keys.index(c) == k && s.first_keys.index(c + 1) == k + && s.last_vals.index(c) == v && s.first_vals.index(c + 1) == v + { + c += 1; + s.chunks[c].cursor().map_times(&s.chunks[c], &mut logic); + } + } + + fn step_key(&mut self, s: &Self::Storage) { + if !self.key_valid(s) { return; } + let n = s.chunks.len(); + let k = self.key(s); + // Advance to the last chunk the key spans. + while self.chunk + 1 < n && s.last_keys.index(self.chunk) == k && s.first_keys.index(self.chunk + 1) == k { + self.goto(self.chunk + 1, s); + } + // Step past the key within its last chunk. + { + let inner = self.inner.as_mut().unwrap(); + inner.seek_key(&s.chunks[self.chunk], k); + inner.step_key(&s.chunks[self.chunk]); + } + // If that exhausted the chunk, the next key (if any) starts the next chunk. + if !self.inner.as_ref().unwrap().key_valid(&s.chunks[self.chunk]) && self.chunk + 1 < n { + self.goto(self.chunk + 1, s); + } + self.key_chunk = self.chunk; + } + + fn seek_key(&mut self, s: &Self::Storage, key: Self::Key<'_>) { + let n = s.chunks.len(); + // First chunk whose last key is `>= key`: where `key`'s run begins. + let c = s.last_keys.advance(0, n, |x| { + as BatchContainer>::reborrow(x).lt(& as BatchContainer>::reborrow(key)) + }); + self.goto(c, s); + self.key_chunk = c; + if c < n { self.inner.as_mut().unwrap().seek_key(&s.chunks[c], key); } + } + + fn step_val(&mut self, s: &Self::Storage) { + if !self.val_valid(s) { return; } + let n = s.chunks.len(); + let (k, v) = (self.key(s), self.val(s)); + // Advance to the last chunk the (key, val) spans. + while self.chunk + 1 < n + && s.last_keys.index(self.chunk) == k && s.first_keys.index(self.chunk + 1) == k + && s.last_vals.index(self.chunk) == v && s.first_vals.index(self.chunk + 1) == v + { + self.goto(self.chunk + 1, s); + } + // Step past the (key, val) within that chunk. 
+ self.inner.as_mut().unwrap().step_val(&s.chunks[self.chunk]); + // If the key's vals are exhausted here but the key spills, roll forward. + if !self.inner.as_ref().unwrap().val_valid(&s.chunks[self.chunk]) + && self.chunk + 1 < n && s.last_keys.index(self.chunk) == k && s.first_keys.index(self.chunk + 1) == k + { + self.goto(self.chunk + 1, s); + self.inner.as_mut().unwrap().seek_key(&s.chunks[self.chunk], k); + } + } + + fn seek_val(&mut self, s: &Self::Storage, val: Self::Val<'_>) { + if !self.key_valid(s) { return; } + let n = s.chunks.len(); + let k = self.key(s); + loop { + self.inner.as_mut().unwrap().seek_val(&s.chunks[self.chunk], val); + if self.inner.as_ref().unwrap().val_valid(&s.chunks[self.chunk]) { return; } + // Key's vals exhausted in this chunk; if the key spills, retry in the next. + if self.chunk + 1 < n && s.last_keys.index(self.chunk) == k && s.first_keys.index(self.chunk + 1) == k { + self.goto(self.chunk + 1, s); + self.inner.as_mut().unwrap().seek_key(&s.chunks[self.chunk], k); + } else { + return; + } + } + } + + fn rewind_keys(&mut self, s: &Self::Storage) { + self.key_chunk = 0; + self.goto(0, s); + } + + fn rewind_vals(&mut self, s: &Self::Storage) { + if !self.key_valid(s) { return; } + let k = self.key(s); + let kc = self.key_chunk; + self.goto(kc, s); + self.inner.as_mut().unwrap().seek_key(&s.chunks[kc], k); + } +} + +impl BatchReader for ChunkBatch { + type Cursor = ChunkBatchCursor; + fn cursor(&self) -> Self::Cursor { + ChunkBatchCursor { key_chunk: 0, chunk: 0, inner: self.chunks.first().map(C::cursor) } + } + fn len(&self) -> usize { self.chunks.iter().map(C::len).sum() } + fn description(&self) -> &Description { &self.description } +} + +impl Batch for ChunkBatch +where + C::Time: timely::progress::Timestamp + Lattice + Ord, +{ + type Merger = ChunkBatchMerger; + + fn empty(lower: Antichain, upper: Antichain) -> Self { + use timely::progress::Timestamp; + let since = Antichain::from_elem(Self::Time::minimum()); + 
ChunkBatch::new(Vec::new(), Description::new(lower, upper, since)) + } +} + +/// Live state of the binary merge: an index into each (shared, immutable) source +/// chain marking the next chunk to clone, and the current head of each (a cloned +/// chunk tagged with its consumed prefix). A head is `None` once its chain is +/// exhausted; the merge proper runs while both are `Some`. The indices are the +/// "cursor positions": the same sources arrive on each `work` call, so they are +/// stable across suspensions. +struct MergeState { + idx1: usize, + idx2: usize, + head1: Option<(usize, C)>, + head2: Option<(usize, C)>, +} + +/// Clone the chunk at `*idx` (if any), advancing `*idx`, tagged with prefix `0`. +fn clone_chunk(chunks: &[C], idx: &mut usize) -> Option<(usize, C)> { + let chunk = chunks.get(*idx)?.clone(); + *idx += 1; + Some((0, chunk)) +} + +/// A merge of two [`ChunkBatch`]es in progress. +/// +/// This is the [`ChunkBatch`] merger, wired in as its +/// [`Batch::Merger`](crate::trace::Batch::Merger), and has that trait's +/// `new` / `work` / `done` shape. +/// +/// The merge is *resumable*: `work` drains one [`Chunk::merge`]'s-worth of +/// updates per step, feeding the output into a live [`AdvanceQueue`], and stops once +/// `fuel` is exhausted, retaining the iterators, heads, and advancer for the +/// next call. Fuel is debited by the (consolidated) updates fed into the advancer; +/// summed over all steps this is the total *output*, not the input scanned — +/// matching how the trace's other mergers account (cf. `ord_neu`, which debits the +/// consolidated updates it stages). Compaction's final flush (`done = true`) rides +/// along uncounted, bounded by the data withheld during streaming. +pub struct ChunkBatchMerger { + /// Compaction frontier supplied at construction. + frontier: Antichain, + /// Result frontiers, retained for the output description. + lower: Antichain, + upper: Antichain, + /// Merged-and-advanced chunks, grown by `work`. 
+ result: ChunkList, + /// Live merge state; `None` before the first `work` and after merging completes. + state: Option>, + /// Live advancer; `Some` until its final flush, then `None`. + advancer: Option>, + /// Whether the inputs have been moved into `state` yet. + initialized: bool, +} + +impl crate::trace::Merger> for ChunkBatchMerger +where + C: Chunk + Default + 'static, + C::Time: timely::progress::Timestamp + Lattice + Ord + 'static, +{ + /// Begin merging `source1` and `source2`, advancing to `frontier`. + fn new(source1: &ChunkBatch, source2: &ChunkBatch, frontier: AntichainRef) -> Self { + let lower = source1.description.lower().meet(source2.description.lower()); + let upper = source1.description.upper().join(source2.description.upper()); + Self { + frontier: frontier.to_owned(), + lower, + upper, + result: ChunkList::default(), + state: None, + advancer: None, + initialized: false, + } + } + + /// Advance the merge by up to `fuel` updates, suspending when it runs out. + /// + /// The sources are read by *cloning* chunks (a cheap refcount bump, per the + /// [`Chunk`] contract), never consumed or mutated, so they remain shared and + /// immutable. The same `source1`/`source2` must be supplied on every call. + fn work(&mut self, source1: &ChunkBatch, source2: &ChunkBatch, fuel: &mut isize) { + if !self.initialized { + let mut idx1 = 0; + let mut idx2 = 0; + let head1 = clone_chunk(&source1.chunks, &mut idx1); + let head2 = clone_chunk(&source2.chunks, &mut idx2); + self.state = Some(MergeState { idx1, idx2, head1, head2 }); + self.advancer = Some(AdvanceQueue::new(self.frontier.clone())); + self.initialized = true; + } + + while *fuel > 0 { + let state = match &mut self.state { Some(s) => s, None => break }; + let advancer = self.advancer.as_mut().unwrap(); + + if state.head1.is_some() && state.head2.is_some() { + // One merge step: present both heads, refill whichever drains. 
+ let mut window = [state.head1.take().unwrap(), state.head2.take().unwrap()]; + let mut merged = ChunkList::default(); + C::merge(&mut window, &mut merged); + let [(p1, c1), (p2, c2)] = window; + state.head1 = if p1 >= c1.len() { clone_chunk(&source1.chunks, &mut state.idx1) } else { Some((p1, c1)) }; + state.head2 = if p2 >= c2.len() { clone_chunk(&source2.chunks, &mut state.idx2) } else { Some((p2, c2)) }; + let chunks = merged.done(); + let work: usize = chunks.iter().map(C::len).sum(); + advancer.push(chunks, &mut self.result); + *fuel -= work as isize; + } else if let Some((consumed, chunk)) = state.head1.take().or_else(|| state.head2.take()) { + // One chain exhausted; flush the partially-consumed head of the + // other. It was retained with `consumed < len`, so the pruned + // remainder is non-empty. + let chunk = if consumed > 0 { chunk.prune(consumed) } else { chunk }; + let work = chunk.len(); + advancer.push(std::iter::once(chunk), &mut self.result); + *fuel -= work as isize; + } else if let Some((_, chunk)) = clone_chunk(&source1.chunks, &mut state.idx1).or_else(|| clone_chunk(&source2.chunks, &mut state.idx2)) { + // Flush the untouched tail of the surviving chain, one chunk per step. + let work = chunk.len(); + advancer.push(std::iter::once(chunk), &mut self.result); + *fuel -= work as isize; + } else { + // Both chains fully fed; flush withheld advancement and retire. + self.state = None; + if let Some(advancer) = self.advancer.take() { + advancer.finish(&mut self.result); + } + break; + } + } + } + + /// Extract the merged batch over `[lower, upper)` advanced to the frontier. + /// + /// Only valid once `work` has driven the merge to completion (left `fuel` + /// positive), as the [`trace::Merger`](crate::trace::Merger) contract requires. 
+ fn done(self) -> ChunkBatch { + let description = Description::new(self.lower, self.upper, self.frontier); + ChunkBatch::new(self.result.done(), description) + } +} + +/// A [`Builder`](crate::trace::Builder) that collects pre-sorted chunks into a +/// [`ChunkBatch`]. +/// +/// The builder assumes its inputs arrive already sorted and consolidated (as the +/// `Builder` contract requires), so it does no merging: each pushed chunk is an +/// ordered run, appended in order. They accumulate in a [`ChunkList`], which +/// regrades them to the size invariant as they arrive — so a batch built here is +/// graded like one produced by the merger, rather than inheriting whatever chunk +/// sizes the caller happened to push. +pub struct ChunkBuilder { + chunks: ChunkList, +} + +impl crate::trace::Builder for ChunkBuilder +where + C: Chunk + Default + 'static, + C::Time: timely::progress::Timestamp, +{ + type Input = C; + type Time = C::Time; + type Output = ChunkBatch; + + fn with_capacity(_keys: usize, _vals: usize, _upds: usize) -> Self { + Self { chunks: ChunkList::default() } + } + + fn push(&mut self, chunk: &mut C) { + let chunk = std::mem::take(chunk); + if chunk.len() > 0 { self.chunks.push(chunk); } + } + + fn done(self, description: Description) -> ChunkBatch { + ChunkBatch::new(self.chunks.done(), description) + } + + fn seal(chain: &mut Vec, description: Description) -> ChunkBatch { + // The chain is sorted and consolidated but not necessarily graded; regrade + // it. Already-sized chunks pass through as cheap `Rc` moves, so a chain that + // arrives graded (as the batcher's does) pays only an O(#chunks) walk. + let mut chunks = ChunkList::default(); + chunks.extend(std::mem::take(chain)); + ChunkBatch::new(chunks.done(), description) + } +} + +pub mod vec_chunk { + //! A worked [`Chunk`] implementation: `Vec<((K, V), T, R)>` behind an `Rc`. + //! + //! This is the reference example — a next implementor (e.g. columnar) follows + //! its *shape*, not its layout. 
It shows the two integration points any chunk + //! type satisfies, and how leaning on the parent module's generic harnesses + //! keeps the code terse: + //! + //! * **Batcher side.** The merge batcher's `ContainerChunker` builds chunks, so + //! the type implements timely's container traits (`Accountable`, + //! `SizableContainer`, `Consolidate`, `PushInto`). Here they delegate to the + //! inner `Vec` via `Rc::make_mut` — free while a chunk is being built + //! (refcount 1), and it never copies a *shared* chunk because batches are + //! immutable once built. + //! * **Trace side.** [`Chunk`] (merge / extract / advance / prune / bounds) + //! plus a cursor. Key lookups are logarithmic by galloping search (`seek_*`), + //! independent of chunk size; stepping stays linear (short hops). + //! + //! `Clone` is a refcount bump, so the trace merger shares source chunks instead + //! of copying them. + + use std::marker::PhantomData; + use std::rc::Rc; + + use timely::Accountable; + use timely::container::{PushInto, SizableContainer}; + use timely::progress::{Antichain, Timestamp}; + + use crate::consolidation::Consolidate; + use crate::difference::Semigroup; + use crate::lattice::Lattice; + use crate::trace::cursor::Cursor; + use crate::trace::implementations::{Vector, WithLayout}; + + use super::{Chunk, ChunkFeed, ChunkList}; + + /// The chunk size: both the maximum updates per chunk and the coalescing + /// threshold (see [`Chunk::TARGET`]). Chosen for the reference impl; exposed as + /// the associated const below, and used internally for buffer sizing. + const TARGET: usize = 1024; + + /// A sorted, consolidated run of `((key, val), time, diff)`, shared via `Rc`. + pub struct VecChunk(Rc>); + + impl Clone for VecChunk { + fn clone(&self) -> Self { VecChunk(Rc::clone(&self.0)) } + } + impl Default for VecChunk { + fn default() -> Self { VecChunk(Rc::new(Vec::new())) } + } + + /// The trace type for `arrange`: a spine of `Rc`-shared chunk batches. 
+ pub type ChunkSpine = super::ChunkSpine>; + /// Merge batcher over `VecChunk`s. Unordered `Vec<((K, V), T, R)>` input is + /// consolidated into sorted `VecChunk`s by a `ContainerChunker` supplied + /// at the `arrange_core` callsite (it drives the container-trait impls below); the + /// batcher itself only merges the resulting chunks. + pub type ChunkBatcher = super::ChunkBatcher>; + /// Reference-counted batch builder. + pub type ChunkRcBuilder = super::ChunkRcBuilder>; + + // --- batcher side: timely container traits, delegating to the inner `Vec` --- + + impl Accountable for VecChunk { + fn record_count(&self) -> i64 { self.0.len() as i64 } + } + + impl SizableContainer for VecChunk + where K: Clone+'static, V: Clone+'static, T: Clone+'static, R: Clone+'static { + fn at_capacity(&self) -> bool { self.0.at_capacity() } + fn ensure_capacity(&mut self, _stash: &mut Option) { + Rc::make_mut(&mut self.0).ensure_capacity(&mut None); + } + } + + impl Consolidate for VecChunk + where K: Ord+Clone+'static, V: Ord+Clone+'static, T: Ord+Clone+'static, R: Semigroup+'static { + fn len(&self) -> usize { self.0.len() } + fn clear(&mut self) { Rc::make_mut(&mut self.0).clear() } + fn consolidate_into(&mut self, target: &mut Self) { + Rc::make_mut(&mut self.0).consolidate_into(Rc::make_mut(&mut target.0)); + } + } + + impl PushInto<((K, V), T, R)> for VecChunk + where K: Clone+'static, V: Clone+'static, T: Clone+'static, R: Clone+'static { + fn push_into(&mut self, item: ((K, V), T, R)) { Rc::make_mut(&mut self.0).push(item); } + } + + // --- trace side: a logarithmic cursor and the `Chunk` operations --- + + /// First index `>= start` at which `pred` turns false, by galloping (exponential) + /// search. `pred` must hold for a prefix then not — i.e. `|u| u < target`. + /// O(log distance), so O(1) for short hops and logarithmic for long ones. 
+ fn gallop(s: &[U], start: usize, pred: impl Fn(&U) -> bool) -> usize { + let mut pos = start; + if pos < s.len() && pred(&s[pos]) { + let mut step = 1; + while pos + step < s.len() && pred(&s[pos + step]) { pos += step; step <<= 1; } + step >>= 1; + while step > 0 { + if pos + step < s.len() && pred(&s[pos + step]) { pos += step; } + step >>= 1; + } + pos += 1; + } + pos + } + + /// A cursor over a [`VecChunk`], tracking the current key and `(key, val)` + /// group starts as indices into the flat vector. + pub struct VecChunkCursor { + key_pos: usize, + val_pos: usize, + phantom: PhantomData<(K, V, T, R)>, + } + + impl WithLayout for VecChunk + where K: Ord+Clone+'static, V: Ord+Clone+'static, T: Lattice+Timestamp, R: Ord+Semigroup+'static { + type Layout = Vector<((K, V), T, R)>; + } + + impl WithLayout for VecChunkCursor + where K: Ord+Clone+'static, V: Ord+Clone+'static, T: Lattice+Timestamp, R: Ord+Semigroup+'static { + type Layout = Vector<((K, V), T, R)>; + } + + impl Cursor for VecChunkCursor + where K: Ord+Clone+'static, V: Ord+Clone+'static, T: Lattice+Timestamp, R: Ord+Semigroup+'static { + type Storage = VecChunk; + + fn key_valid(&self, s: &Self::Storage) -> bool { self.key_pos < s.0.len() } + fn val_valid(&self, s: &Self::Storage) -> bool { + self.key_pos < s.0.len() && self.val_pos < s.0.len() && s.0[self.val_pos].0.0 == s.0[self.key_pos].0.0 + } + fn key<'a>(&self, s: &'a Self::Storage) -> &'a K { &s.0[self.key_pos].0.0 } + fn val<'a>(&self, s: &'a Self::Storage) -> &'a V { &s.0[self.val_pos].0.1 } + fn get_key<'a>(&self, s: &'a Self::Storage) -> Option<&'a K> { + if self.key_valid(s) { Some(self.key(s)) } else { None } + } + fn get_val<'a>(&self, s: &'a Self::Storage) -> Option<&'a V> { + if self.val_valid(s) { Some(self.val(s)) } else { None } + } + fn map_times(&mut self, s: &Self::Storage, mut logic: L) { + if !self.val_valid(s) { return; } + let kv = &s.0[self.val_pos].0; + let mut i = self.val_pos; + while i < s.0.len() && &s.0[i].0 == kv { + 
logic(&s.0[i].1, &s.0[i].2); + i += 1; + } + } + fn step_key(&mut self, s: &Self::Storage) { + // Linear: stepping is a short hop to the next group; an inlined scan + // beats a gallop call for the common small-group case. + if self.key_pos >= s.0.len() { return; } + let key = s.0[self.key_pos].0.0.clone(); + let mut i = self.key_pos; + while i < s.0.len() && s.0[i].0.0 == key { i += 1; } + self.key_pos = i; + self.val_pos = i; + } + fn seek_key(&mut self, s: &Self::Storage, key: &K) { + // Logarithmic: O(log distance), independent of chunk size. + self.key_pos = gallop(&s.0, self.key_pos, |u| &u.0.0 < key); + self.val_pos = self.key_pos; + } + fn step_val(&mut self, s: &Self::Storage) { + if !self.val_valid(s) { return; } + let kv = s.0[self.val_pos].0.clone(); + let mut i = self.val_pos; + while i < s.0.len() && s.0[i].0 == kv { i += 1; } + self.val_pos = i; + } + fn seek_val(&mut self, s: &Self::Storage, val: &V) { + if !self.key_valid(s) { return; } + let key = s.0[self.key_pos].0.0.clone(); + self.val_pos = gallop(&s.0, self.val_pos, |u| (&u.0.0, &u.0.1) < (&key, val)); + } + fn rewind_keys(&mut self, _s: &Self::Storage) { self.key_pos = 0; self.val_pos = 0; } + fn rewind_vals(&mut self, _s: &Self::Storage) { self.val_pos = self.key_pos; } + } + + /// Take the `Vec` out of a chunk, copying only if the `Rc` is shared. 
+ fn take(chunk: VecChunk) -> Vec<((K, V), T, R)> { + Rc::try_unwrap(chunk.0).unwrap_or_else(|rc| (*rc).clone()) + } + + impl Chunk for VecChunk + where K: Ord+Clone+'static, V: Ord+Clone+'static, T: Lattice+Timestamp, R: Ord+Semigroup+'static { + type Cursor = VecChunkCursor; + + const TARGET: usize = TARGET; + + fn cursor(&self) -> Self::Cursor { + VecChunkCursor { key_pos: 0, val_pos: 0, phantom: PhantomData } + } + + fn bounds(&self) -> ((&K, &V, &T), (&K, &V, &T)) { + let s = &self.0[..]; + let (f, l) = (&s[0], &s[s.len() - 1]); + ((&f.0.0, &f.0.1, &f.1), (&l.0.0, &l.0.1, &l.1)) + } + + fn len(&self) -> usize { self.0.len() } + + fn prune(self, prefix: usize) -> Self { + let mut v = take(self); + v.drain(..prefix); + VecChunk(Rc::new(v)) + } + + fn merge(chunks: &mut [(usize, Self)], out: &mut ChunkList) { + let mut consumed: Vec = chunks.iter().map(|(c, _)| *c).collect(); + { + let inputs: Vec<&[_]> = chunks.iter().map(|(_, ch)| &ch.0[..]).collect(); + merge_buf(&inputs, &mut consumed, out); + } + for (i, (c, _)) in chunks.iter_mut().enumerate() { *c = consumed[i]; } + } + + fn extract( + chunks: &mut Vec, + frontier: &Antichain, + residual: &mut Antichain, + keep: &mut ChunkList, + ship: &mut ChunkList, + ) { + // Fill `TARGET`-sized buffers directly, so the chunks pushed are already + // graded and `regrade` passes them through as `Rc` moves rather than + // re-splitting (and re-copying) a monolithic chunk. Emptied input `Vec`s + // are recycled as the next buffers, so allocations balance input against + // output instead of one fresh buffer per emitted chunk. + let mut stash: Vec> = Vec::new(); + let take_buf = |stash: &mut Vec<_>| stash.pop().unwrap_or_default(); + let (mut k, mut s) = (take_buf(&mut stash), take_buf(&mut stash)); + for chunk in chunks.drain(..) { + let mut v = take(chunk); + for u in v.drain(..) 
{ + if frontier.borrow().less_equal(&u.1) { + residual.insert_ref(&u.1); + k.push(u); + if k.len() >= TARGET { keep.push(VecChunk(Rc::new(std::mem::replace(&mut k, take_buf(&mut stash))))); } + } else { + s.push(u); + if s.len() >= TARGET { ship.push(VecChunk(Rc::new(std::mem::replace(&mut s, take_buf(&mut stash))))); } + } + } + stash.push(v); + } + if !k.is_empty() { keep.push(VecChunk(Rc::new(k))); } + if !s.is_empty() { ship.push(VecChunk(Rc::new(s))); } + } + + fn advance( + feed: &mut ChunkFeed, + frontier: &Antichain, + done: bool, + out: &mut ChunkList, + ) { + // Advance and consolidate every *complete* `(key, val)` group eagerly, + // so its updates can be released as soon as the input proves no later + // time for the pair can arrive. A group is contiguous in the sorted + // chain, so the only one that might continue in a future push is the + // last; unless `done`, we process up to its start and withhold the rest + // as the head for the next call. + let mut stash: Vec> = Vec::new(); + let (consumed, ch) = &mut feed.0; + // Build the working buffer by *reusing the head's storage* and appending + // the tail (recycling each emptied tail `Vec`). Reusing the head is what + // keeps a withheld group from being recopied across calls: it just + // accumulates in place, so a `(key, val)` larger than the working set + // costs O(total) over the run rather than O(total²). + let mut buf = take(std::mem::take(ch)); + if *consumed > 0 { buf.drain(..*consumed); *consumed = 0; } + for chunk in feed.1.drain(..) { + let mut v = take(chunk); + buf.append(&mut v); + stash.push(v); + } + if buf.is_empty() { return; } + + // If every available update shares one `(key, val)`, no group is provably + // complete — the next push may extend it — so make no progress unless + // `done`: retain the accumulated buffer as the head and return. 
This is + // the giant-key case; comparing only the first and last pair detects it + // without scanning, and reusing the head above makes the retention free. + if !done && buf[0].0 == buf[buf.len() - 1].0 { + *ch = VecChunk(Rc::new(buf)); + return; + } + + // Otherwise at least the first group is complete. Withhold the last group + // (a single `(key, val)`) as the next head unless the input is complete. + let end = if done { buf.len() } else { + let last_kv = buf[buf.len() - 1].0.clone(); + let mut start = buf.len(); + while start > 0 && buf[start - 1].0 == last_kv { start -= 1; } + start + }; + if end < buf.len() { + let tail = buf.split_off(end); + *ch = VecChunk(Rc::new(tail)); + } + // Advance + consolidate each group into `TARGET`-sized output chunks, + // filling buffers reclaimed from the recycled tail `Vec`s. + let mut result = stash.pop().unwrap_or_default(); + let mut i = 0; + while i < buf.len() { + let mut j = i; + while j < buf.len() && buf[j].0 == buf[i].0 { j += 1; } + for u in &mut buf[i..j] { u.1.advance_by(frontier.borrow()); } + // Advancing is monotone w.r.t. the lattice but not the + // representation's total order, so re-sort the group by time. + buf[i..j].sort_by(|a, b| a.1.cmp(&b.1)); + let mut k = i; + while k < j { + let kv = buf[k].0.clone(); + let t = buf[k].1.clone(); + let mut diff = buf[k].2.clone(); + k += 1; + while k < j && buf[k].1 == t { diff.plus_equals(&buf[k].2); k += 1; } + if !diff.is_zero() { + result.push((kv, t, diff)); + if result.len() >= TARGET { out.push(VecChunk(Rc::new(std::mem::replace(&mut result, stash.pop().unwrap_or_default())))); } + } + } + i = j; + } + if !result.is_empty() { out.push(VecChunk(Rc::new(result))); } + } + + fn regrade(queue: &mut Vec, done: bool, out: &mut Vec) { + // Maximal packing: emit chunks as large as possible up to `TARGET`, + // never splitting a pair that could combine into one legal (`<= TARGET`) + // chunk. 
A chunk of exactly `TARGET` is maximal — it cannot grow — so it + // passes straight through as an `Rc` move; only sub-`TARGET` chunks are + // copied, and only to coalesce with a neighbour. Producers fill to + // `TARGET`, so in steady state every chunk passes through and only the + // occasional trailing partial is coalesced. + // + // `carry` is the (sub-`TARGET`) chunk under construction. It is flushed + // once it reaches `TARGET`, carried back onto `queue` between calls, or + // emitted on `done`. Whenever `carry` is non-empty its left neighbour in + // `out` is a `TARGET` chunk (or `carry` is `out`'s first chunk), so + // emitting `carry` against a neighbour it cannot merge with — their sum + // exceeds `TARGET` — keeps the packing maximal on both sides. + let mut carry: Vec<((K, V), T, R)> = Vec::new(); + for chunk in queue.drain(..) { + if carry.is_empty() { + absorb(chunk, &mut carry, out); + } else if carry.len() + chunk.0.len() <= TARGET { + // Combines into one legal chunk; coalesce in place. + carry.extend(take(chunk)); + if carry.len() == TARGET { + out.push(VecChunk(Rc::new(std::mem::take(&mut carry)))); + } + } else { + // Cannot combine without exceeding `TARGET`; `carry` is maximal + // against this neighbour, so emit it and absorb the chunk afresh. + out.push(VecChunk(Rc::new(std::mem::take(&mut carry)))); + absorb(chunk, &mut carry, out); + } + } + if !carry.is_empty() { + let chunk = VecChunk(Rc::new(carry)); + if done { out.push(chunk); } else { queue.push(chunk); } + } + } + } + + /// Emit maximal `TARGET`-sized chunks off the front of `carry`, leaving the + /// sub-`TARGET` tail behind. 
+ fn peel( + carry: &mut Vec<((K, V), T, R)>, + out: &mut Vec>, + ) { + let mut start = 0; + while carry.len() - start >= TARGET { + out.push(VecChunk(Rc::new(carry[start..start + TARGET].to_vec()))); + start += TARGET; + } + carry.drain(..start); + } + + /// Absorb a chunk when nothing is carried: pass a `TARGET` chunk through as an + /// `Rc` move, hold a smaller one in `carry`, or split a larger one (peeling off + /// `TARGET` pieces and carrying the remainder). `carry` must be empty on entry. + fn absorb( + chunk: VecChunk, + carry: &mut Vec<((K, V), T, R)>, + out: &mut Vec>, + ) { + use std::cmp::Ordering::{Equal, Greater, Less}; + match chunk.0.len().cmp(&TARGET) { + Equal => out.push(chunk), + Less => *carry = take(chunk), + Greater => { *carry = take(chunk); peel(carry, out); } + } + } + + /// K-way merge of in-range prefixes of sorted, consolidated inputs, emitting + /// graded chunks directly into `out`. + /// + /// `inputs[i][consumed[i]..]` is the unconsumed, sorted suffix of input `i`. + /// Merges through the least last `((key, val), time)` across inputs (nothing + /// interleaves below it), consolidating triples shared across inputs, and + /// advances each `consumed[i]` past what it merged. Output is filled into + /// `TARGET`-sized buffers and pushed as it fills, so the run arrives *graded* + /// rather than as one monolithic chunk that `regrade` would re-split (and + /// re-copy) — mirroring `extract`. Sizing buffers to `TARGET` also avoids the + /// over-reservation a single up-front `with_capacity(total)` would incur. 
fn merge_buf<K, V, T, R>(
    inputs: &[&[((K, V), T, R)]],
    consumed: &mut [usize],
    out: &mut ChunkList<VecChunk<K, V, T, R>>,
)
where K: Ord+Clone+'static, V: Ord+Clone+'static, T: Lattice+Timestamp, R: Ord+Semigroup+'static {
    // The horizon is the least *last* `((key, val), time)` among inputs that
    // still have unconsumed updates. It is borrowed from `inputs`, so no key,
    // value or time is cloned just to take the minimum.
    let Some(horizon) = inputs.iter().enumerate()
        .filter(|(i, s)| consumed[*i] < s.len())
        .map(|(_, s)| { let u = &s[s.len() - 1]; (&u.0, &u.1) })
        .min()
    else { return; };

    // Whether position `p` of input `i` exists and lies at or below the horizon.
    let in_range = |i: usize, p: usize| {
        p < inputs[i].len() && (&inputs[i][p].0, &inputs[i][p].1) <= horizon
    };

    let mut result: Vec<((K, V), T, R)> = Vec::with_capacity(TARGET);
    loop {
        // Pick the input whose next in-range update is least; on ties the
        // lowest-indexed input wins because the comparison is strict.
        let mut best: Option<usize> = None;
        for i in 0..inputs.len() {
            if in_range(i, consumed[i]) && best.is_none_or(|b| {
                let (bi, bb) = (&inputs[i][consumed[i]], &inputs[b][consumed[b]]);
                (&bi.0, &bi.1) < (&bb.0, &bb.1)
            }) {
                best = Some(i);
            }
        }
        let Some(b) = best else { break; };
        let kv = inputs[b][consumed[b]].0.clone();
        let t = inputs[b][consumed[b]].1.clone();
        // Sum the diffs of every input whose next update is this same
        // `((key, val), time)`, stepping each of them past it.
        let mut diff: Option<R> = None;
        for i in 0..inputs.len() {
            if in_range(i, consumed[i]) && inputs[i][consumed[i]].0 == kv && inputs[i][consumed[i]].1 == t {
                match &mut diff {
                    None => diff = Some(inputs[i][consumed[i]].2.clone()),
                    Some(d) => d.plus_equals(&inputs[i][consumed[i]].2),
                }
                consumed[i] += 1;
            }
        }
        if let Some(diff) = diff {
            // Updates that cancel to zero are dropped.
            if !diff.is_zero() {
                result.push((kv, t, diff));
                if result.len() >= TARGET {
                    out.push(VecChunk(Rc::new(std::mem::replace(&mut result, Vec::with_capacity(TARGET)))));
                }
            }
        }
    }
    if !result.is_empty() { out.push(VecChunk(Rc::new(result))); }
}

#[cfg(test)]
mod test {
    use super::VecChunk;
    use crate::trace::chunk::merge_chains;
    use std::rc::Rc;

    fn chunk(updates: Vec<((u64, u64), u64, i64)>) -> VecChunk<u64, u64, u64, i64> {
        VecChunk(Rc::new(updates))
    }

    // `extract` must partition by frontier, fold the kept frontier into
    // `residual`, and emit graded chunks directly — without leaning on a regrade
    // re-split.
+ #[test] + fn extract_partitions_and_grades() { + use super::{Chunk, TARGET}; + use crate::trace::chunk::{is_graded, ChunkList}; + use timely::progress::Antichain; + + // 4·TARGET updates spread over many input chunks; even times ship + // (< frontier), odd times keep (>= frontier), so both sides straddle. + let n = 4 * TARGET as u64; + let input: Vec<_> = (0..n) + .map(|i| chunk(vec![((i, 0), i % 2, 1)])) + .collect(); + let mut chunks = input; + let frontier = Antichain::from_elem(1u64); + let mut residual = Antichain::new(); + let (mut keep, mut ship) = (ChunkList::default(), ChunkList::default()); + VecChunk::extract(&mut chunks, &frontier, &mut residual, &mut keep, &mut ship); + let (keep, ship) = (keep.done(), ship.done()); + + // Kept times are exactly {1}; that is the residual frontier. + assert_eq!(residual, Antichain::from_elem(1u64)); + // Both sides emerge graded directly from `extract`. + assert!(is_graded(&keep), "ungraded keep: {:?}", keep.iter().map(Chunk::len).collect::>()); + assert!(is_graded(&ship), "ungraded ship: {:?}", ship.iter().map(Chunk::len).collect::>()); + // Nothing lost: half the updates each way. + assert_eq!(keep.iter().map(Chunk::len).sum::(), n as usize / 2); + assert_eq!(ship.iter().map(Chunk::len).sum::(), n as usize / 2); + } + + // `advance` advances and consolidates complete `(key, val)` groups eagerly, + // withholding only the (possibly-growing) last group when not `done`. + #[test] + fn advance_emits_complete_groups_eagerly() { + use super::Chunk; + use crate::trace::chunk::ChunkList; + use timely::progress::Antichain; + + let frontier = Antichain::from_elem(5u64); + // Group (0,0) is complete within this chunk; group (1,0) might still grow. 
+ let c0 = chunk(vec![((0, 0), 0, 1), ((0, 0), 1, 1), ((1, 0), 0, 1)]); + let mut feed = ((0usize, VecChunk::default()), vec![c0]); + let mut out = ChunkList::default(); + VecChunk::advance(&mut feed, &frontier, false, &mut out); + + // The trailing group (1,0) is withheld as the head for the next call. + assert_eq!(Chunk::len(&feed.0.1), 1); + assert!(feed.1.is_empty()); + // Group (0,0)'s times {0,1} advanced to 5 and consolidated, emitted now. + let emitted: Vec<_> = out.done().into_iter().flat_map(|c| (*c.0).clone()).collect(); + assert_eq!(emitted, vec![((0, 0), 5, 2)]); + } + + // Streaming the input one chunk at a time must yield exactly what a single + // all-at-once flush does — the resumable path is just the one-shot path cut + // at group boundaries. + #[test] + fn advance_resumable_matches_oneshot() { + use crate::trace::chunk::{AdvanceQueue, ChunkList}; + use timely::progress::Antichain; + + let frontier = Antichain::from_elem(3u64); + // Groups span chunk boundaries and carry several times each. + let input = || vec![ + chunk(vec![((0, 0), 0, 1), ((0, 0), 1, 1), ((1, 0), 0, 1)]), + chunk(vec![((1, 0), 5, 1), ((1, 1), 0, 1), ((2, 0), 0, 1)]), + chunk(vec![((2, 0), 2, 1), ((2, 0), 9, 1)]), + ]; + let flat = |v: Vec>| + v.into_iter().flat_map(|c| (*c.0).clone()).collect::>(); + + let oneshot = { + let mut q = AdvanceQueue::new(frontier.clone()); + let mut out = ChunkList::default(); + q.push(input(), &mut out); + q.finish(&mut out); + flat(out.done()) + }; + let incremental = { + let mut q = AdvanceQueue::new(frontier.clone()); + let mut out = ChunkList::default(); + for c in input() { q.push(std::iter::once(c), &mut out); } + q.finish(&mut out); + flat(out.done()) + }; + assert_eq!(oneshot, incremental); + // Times are advanced: nothing below the frontier survives. 
+ for u in &oneshot { assert!(u.1 >= 3); } + } + + // A single `(key, val)` whose updates span every pushed chunk: `advance` + // can make no progress until `done`, accumulating in the head in place. + // It must still produce the right advanced+consolidated result at the end. + #[test] + fn advance_single_key_spanning_pushes() { + use crate::trace::chunk::{AdvanceQueue, ChunkList}; + use timely::progress::Antichain; + + let frontier = Antichain::from_elem(100u64); + let n = 50u64; + let make = || (0..n).map(|t| chunk(vec![((7u64, 0u64), t, 1i64)])).collect::>(); + let flat = |v: Vec>| + v.into_iter().flat_map(|c| (*c.0).clone()).collect::>(); + + let mut q = AdvanceQueue::new(frontier); + let mut out = ChunkList::default(); + for c in make() { q.push(std::iter::once(c), &mut out); } + q.finish(&mut out); + // All times advance to 100 and consolidate to one update of diff `n`. + assert_eq!(flat(out.done()), vec![((7u64, 0u64), 100u64, n as i64)]); + } + + #[test] + fn merge_chains_consolidates() { + let a = chunk(vec![((0, 0), 0, 1), ((1, 0), 0, 1)]); + let b = chunk(vec![((0, 0), 0, 1), ((2, 0), 0, 1)]); + let mut out = crate::trace::chunk::ChunkList::default(); + merge_chains(vec![a], vec![b], &mut out); + let merged: Vec<_> = out.done().into_iter().flat_map(|c| (*c.0).clone()).collect(); + assert_eq!(merged, vec![((0, 0), 0, 2), ((1, 0), 0, 1), ((2, 0), 0, 1)]); + } + + // Merging runs larger than `TARGET` must emit a *graded* sequence directly + // (each chunk `<= TARGET`, adjacent pairs summing past `TARGET`), not one + // monolithic chunk, while reproducing the consolidated sorted contents. + #[test] + fn merge_emits_graded_chunks() { + use super::{Chunk, TARGET}; + use crate::trace::chunk::{ChunkList, is_graded, merge_chains}; + + // Two interleaving single-chunk chains: evens and odds over `0..4·TARGET`. 
+ let n = 4 * TARGET as u64; + let evens = chunk((0..n).step_by(2).map(|k| ((k, 0), 0, 1)).collect()); + let odds = chunk((0..n).step_by(2).map(|k| ((k + 1, 0), 0, 1)).collect()); + + let mut out = ChunkList::default(); + merge_chains(vec![evens], vec![odds], &mut out); + let chunks = out.done(); + + assert!(is_graded(&chunks), "merge output not graded: {:?}", + chunks.iter().map(Chunk::len).collect::>()); + // Contents are exactly the sorted keys `0..4·TARGET`, each once. + let merged: Vec<_> = chunks.into_iter().flat_map(|c| (*c.0).clone()).collect(); + let want: Vec<_> = (0..n).map(|k| ((k, 0u64), 0u64, 1i64)).collect(); + assert_eq!(merged, want); + } + + // `regrade` must produce a *maximal packing*: adjacent sub-`TARGET` chunks + // that could combine into one legal chunk are coalesced (the prior rule left + // any pair summing past `TARGET/2` alone), full chunks pass through, and + // contents are preserved exactly. + #[test] + fn regrade_maximal_packing() { + use super::{Chunk, TARGET}; + use crate::trace::chunk::{is_graded, ChunkList}; + + // A mix of small and full chunks with distinct, increasing keys (so the + // concatenation is sorted and nothing consolidates away). + let t = TARGET; + let sizes = [t / 3, t / 3, t / 3, t, t / 2, t / 2, t, 1, t - 1]; + let total: usize = sizes.iter().sum(); + let mut key = 0u64; + let mut list = ChunkList::default(); + for &s in &sizes { + let updates: Vec<_> = (0..s).map(|_| { let k = key; key += 1; ((k, 0u64), 0u64, 1i64) }).collect(); + list.push(chunk(updates)); + } + let chunks = list.done(); + + assert!(is_graded(&chunks), "not graded: {:?}", + chunks.iter().map(Chunk::len).collect::>()); + // Nothing lost, and the keys stay strictly sorted across the new breaks. 
+ let got: Vec<_> = chunks.into_iter().flat_map(|c| (*c.0).clone()).collect(); + assert_eq!(got.len(), total); + assert!(got.windows(2).all(|w| w[0].0.0 < w[1].0.0)); + } + + // The indexed cursor must reconstruct the same grouped updates as a flat + // reference, even when a key — and a `(key, val)`'s times — straddle a + // chunk boundary. + #[test] + fn cursor_handles_straddle() { + use crate::trace::cursor::Cursor; + use crate::trace::{BatchReader, Description}; + use crate::trace::chunk::ChunkBatch; + use timely::progress::Antichain; + + let chunks = vec![ + chunk(vec![((0, 0), 0, 1), ((1, 0), 0, 1), ((1, 1), 0, 1)]), + chunk(vec![((1, 1), 1, 1), ((1, 2), 0, 1)]), + chunk(vec![((2, 0), 0, 1)]), + ]; + let desc = Description::new( + Antichain::from_elem(0u64), + Antichain::from_elem(2u64), + Antichain::from_elem(0u64), + ); + let batch = ChunkBatch::new(chunks, desc); + + let mut cursor = batch.cursor(); + let got = cursor.to_vec(&batch, |k| *k, |v| *v); + let want = vec![ + ((0u64, 0u64), vec![(0u64, 1i64)]), + ((1, 0), vec![(0, 1)]), + ((1, 1), vec![(0, 1), (1, 1)]), + ((1, 2), vec![(0, 1)]), + ((2, 0), vec![(0, 1)]), + ]; + assert_eq!(got, want); + } + + // Isolated: gallop vs linear forward-seek over one big chunk, for sparse to + // dense probe sets. 
Run: cargo test seek_microbench -- --ignored --nocapture + #[test] + #[ignore] + fn seek_microbench() { + use std::time::Instant; + use std::hint::black_box; + use super::gallop; + let n = 1_000_000u64; + let data: Vec<((u64, ()), u64, isize)> = (0..n).map(|k| ((3 * k, ()), 0u64, 1isize)).collect(); + for probes in [100u64, 10_000, 1_000_000] { + let targets: Vec = (0..probes).map(|i| 3 * (i * n / probes)).collect(); + let best = |f: &dyn Fn() -> u64| { + let mut b = std::time::Duration::MAX; + for _ in 0..5 { let t = Instant::now(); black_box(f()); b = b.min(t.elapsed()); } + b + }; + let data = black_box(&data[..]); + let g = best(&|| { + let (mut pos, mut acc) = (0usize, 0u64); + for &tgt in &targets { pos = gallop(data, pos, |u| u.0.0 < tgt); acc += pos as u64; } + acc + }); + let l = best(&|| { + let (mut pos, mut acc) = (0usize, 0u64); + for &tgt in &targets { while pos < data.len() && data[pos].0.0 < tgt { pos += 1; } acc += pos as u64; } + acc + }); + eprintln!("probes={probes:>7}: gallop={g:>12?} linear={l:>12?}"); + } + } + } +} + +pub mod col_chunk { + //! A columnar [`Chunk`] mirroring the `ord_neu` layout: separate key / val / + //! time / diff columns linked by offset lists, with the singleton update + //! optimization (an empty `[offs[i], offs[i+1])` range reuses the prior single + //! `(time, diff)`). + //! + //! Where [`vec_chunk`](super::vec_chunk) stores a flat `Vec<((K,V),T,R)>` — + //! repeating each key on every update — this stores each distinct key once, each + //! `(key, val)` once, and shares a single `(time, diff)` across vals when they + //! coincide. It reuses the very containers `ord_neu` is built from + //! ([`Vals`], [`Upds`], [`UpdsBuilder`], [`OffsetList`](crate::trace::implementations::OffsetList)), + //! so the layout and the singleton encoding are shared code, not a re-derivation. + //! + //! The chunk-producing operations still emit *graded* runs (cut to + //! 
[`Chunk::TARGET`]) rather than one monolithic batch, preserving proportional + //! merging. + //! + //! # Status (v1) + //! + //! The full [`Chunk`] trait is implemented and tested (merge / extract / advance / + //! regrade / prune / cursor), so `ColChunk` works as a [`ChunkBatch`] backing — a + //! spine and reference-counted builder are aliased below. Known limitations, all + //! noted at their sites and worth review: + //! + //! * **Decompress/recompress.** `merge`/`extract`/`advance` read inputs into an + //! owned `(key, val, time, diff)` stream and rebuild via [`Builder`]. This is + //! proportional and graded, but does not yet exploit the columnar layout to copy + //! runs of distinct keys by range (as `ord_neu`'s merger does); it also + //! materializes owned keys/vals during a merge. + //! * **`consumed` ↔ singleton.** The merge `consumed` prefix is a *logical* update + //! count, which under the singleton encoding does not map `O(1)` to a column + //! position, so a retained head re-walks its prefix each call. + //! * **Cut granularity.** Chunks are cut only at val boundaries, so a single + //! `(key, val)` exceeding `TARGET` yields one over-sized chunk; mid-val splitting + //! is unimplemented. + //! * **Merge-batcher integration.** Wired via [`ColChunkBatcher`] / [`ColChunker`]: + //! rather than make the consolidated trie double as the merge batcher's unsorted + //! accumulation buffer (which `vec_chunk` can, being flat, but a trie can't), the + //! chunker keeps a `Vec` scratch and *builds* graded chunks from each sorted run. + //! The `Chunk` trait stays minimal — the accumulation traits live on the scratch, + //! not the chunk. `ColChunk` arranges end-to-end (`Collection → MergeBatcher → + //! Batch → Merger`); see the `chunks` example's `colchunk` mode. 
+ + use std::marker::PhantomData; + use std::rc::Rc; + + use std::collections::VecDeque; + + use timely::Accountable; + use timely::container::{ContainerBuilder, PushInto, SizableContainer}; + use timely::progress::Antichain; + + use crate::consolidation::Consolidate; + use crate::difference::{IsZero, Semigroup}; + use crate::lattice::Lattice; + use crate::trace::cursor::Cursor; + use crate::trace::implementations::{BatchContainer, Layout, WithLayout}; + use crate::trace::implementations::layout::Time; + use crate::trace::implementations::chunker::ContainerChunker; + use crate::trace::implementations::merge_batcher::MergeBatcher; + use crate::trace::implementations::ord_neu::layers::{UpdsBuilder, Upds, Vals}; + use crate::trace::implementations::ord_neu::val_batch::OrdValStorage; + + use super::{Chunk, ChunkFeed, ChunkList, ChunkMerger}; + + /// The chunk size: maximum updates per chunk and the coalescing threshold. + const TARGET: usize = 1024; + + /// Columnar storage plus the *logical* update count, which exceeds the number + /// of stored `(time, diff)` pairs by the number of singleton reuses. + struct Inner { + storage: OrdValStorage, + updates: usize, + } + + /// A sorted, consolidated columnar run of `((key, val), time, diff)`, shared via `Rc`. + pub struct ColChunk(Rc>); + + impl Clone for ColChunk { + fn clone(&self) -> Self { ColChunk(Rc::clone(&self.0)) } + } + + impl Default for ColChunk { + fn default() -> Self { + ColChunk(Rc::new(Inner { + storage: OrdValStorage { + keys: ::with_capacity(0), + vals: Vals::default(), + upds: Upds::default(), + }, + updates: 0, + })) + } + } + + impl WithLayout for ColChunk { + type Layout = L; + } + + // --- cursor: mirrors `ord_neu`'s `OrdValCursor`, over `ColChunk` storage --- + + /// A cursor over a [`ColChunk`], tracking absolute key and val positions. 
+ pub struct ColChunkCursor { + key_cursor: usize, + val_cursor: usize, + phantom: PhantomData, + } + + impl WithLayout for ColChunkCursor { + type Layout = L; + } + + impl Cursor for ColChunkCursor { + type Storage = ColChunk; + + fn get_key<'a>(&self, s: &'a Self::Storage) -> Option> { s.0.storage.keys.get(self.key_cursor) } + fn get_val<'a>(&self, s: &'a Self::Storage) -> Option> { if self.val_valid(s) { Some(self.val(s)) } else { None } } + + fn key<'a>(&self, s: &'a Self::Storage) -> Self::Key<'a> { s.0.storage.keys.index(self.key_cursor) } + fn val<'a>(&self, s: &'a Self::Storage) -> Self::Val<'a> { s.0.storage.vals.get_abs(self.val_cursor) } + + fn map_times, Self::DiffGat<'_>)>(&mut self, s: &Self::Storage, mut logic: L2) { + let (lower, upper) = s.0.storage.upds.bounds(self.val_cursor); + for index in lower .. upper { + let (time, diff) = s.0.storage.upds.get_abs(index); + logic(time, diff); + } + } + + fn key_valid(&self, s: &Self::Storage) -> bool { self.key_cursor < s.0.storage.keys.len() } + fn val_valid(&self, s: &Self::Storage) -> bool { self.val_cursor < s.0.storage.vals.bounds(self.key_cursor).1 } + + fn step_key(&mut self, s: &Self::Storage) { + self.key_cursor += 1; + if self.key_valid(s) { self.rewind_vals(s); } + else { self.key_cursor = s.0.storage.keys.len(); } + } + fn seek_key(&mut self, s: &Self::Storage, key: Self::Key<'_>) { + self.key_cursor += s.0.storage.keys.advance(self.key_cursor, s.0.storage.keys.len(), |x| { + ::reborrow(x).lt(&::reborrow(key)) + }); + if self.key_valid(s) { self.rewind_vals(s); } + } + fn step_val(&mut self, s: &Self::Storage) { + self.val_cursor += 1; + if !self.val_valid(s) { self.val_cursor = s.0.storage.vals.bounds(self.key_cursor).1; } + } + fn seek_val(&mut self, s: &Self::Storage, val: Self::Val<'_>) { + self.val_cursor += s.0.storage.vals.vals.advance(self.val_cursor, s.0.storage.vals.bounds(self.key_cursor).1, |x| { + ::reborrow(x).lt(&::reborrow(val)) + }); + } + fn rewind_keys(&mut self, s: 
&Self::Storage) { + self.key_cursor = 0; + if self.key_valid(s) { self.rewind_vals(s); } + } + fn rewind_vals(&mut self, s: &Self::Storage) { + self.val_cursor = s.0.storage.vals.bounds(self.key_cursor).0; + } + } + + // --- trace side: the `Chunk` operations --- + + impl Chunk for ColChunk + where + // Implied by `Layout` but not elaborated at direct call sites; state them so + // `advance` can advance times and consolidate diffs. + ::Owned: Lattice, + ::Owned: Semigroup, + { + type Cursor = ColChunkCursor; + + const TARGET: usize = TARGET; + + fn cursor(&self) -> Self::Cursor { + ColChunkCursor { key_cursor: 0, val_cursor: 0, phantom: PhantomData } + } + + fn bounds(&self) -> ( + (Self::Key<'_>, Self::Val<'_>, Self::TimeGat<'_>), + (Self::Key<'_>, Self::Val<'_>, Self::TimeGat<'_>), + ) { + let s = &self.0.storage; + let nk = s.keys.len(); + let nv = s.vals.len(); + // First (key, val, time): first key, its first val, that val's first time. + // The first val is never a singleton (the encoding needs a prior update). + let fk = s.keys.index(0); + let fv = s.vals.get_abs(0); + let ft = s.upds.times.index(s.upds.bounds(0).0); + // Last (key, val, time): last key, last val, that val's last time. + let lk = s.keys.index(nk - 1); + let lv = s.vals.get_abs(nv - 1); + let lt = s.upds.times.index(s.upds.bounds(nv - 1).1 - 1); + ((fk, fv, ft), (lk, lv, lt)) + } + + fn len(&self) -> usize { self.0.updates } + + fn prune(self, prefix: usize) -> Self { + // The suffix `[prefix..]` is already sorted + consolidated; rebuild it as + // a single chunk (never cutting), mirroring `vec_chunk::prune`. + let mut updates: Vec> = Vec::new(); + collect_in_range(&self.0.storage, prefix, None, &mut updates); + build_single(updates) + } + + fn merge(chunks: &mut [(usize, Self)], out: &mut ChunkList) { + // Horizon: least last `(key, val, time)` across in-range inputs; nothing + // strictly below it can interleave with as-yet-unmerged updates. 
+ let horizon = chunks.iter() + .filter(|(consumed, ch)| *consumed < ch.len()) + .map(|(_, ch)| { + let (_, (lk, lv, lt)) = ch.bounds(); + ( + ::into_owned(lk), + ::into_owned(lv), + ::into_owned(lt), + ) + }) + .min(); + let Some(horizon) = horizon else { return; }; + + // Decompress each input's in-range prefix, advancing its consumed count. + // The input achieving the horizon drains fully (all its updates are + // `<= horizon`), satisfying the merge contract. + let mut collected: Vec> = Vec::new(); + for (consumed, ch) in chunks.iter_mut() { + let added = collect_in_range(&ch.0.storage, *consumed, Some(&horizon), &mut collected); + *consumed += added; + } + // Sort + sum equal `(key, val, time)` across inputs, dropping cancellations. + crate::consolidation::consolidate(&mut collected); + // Re-pack into graded chunks (the builder re-applies the singleton opt). + let mut builder = Builder::new(); + for ((k, v, t), d) in collected { builder.push(k, v, t, d); } + for c in builder.finish() { out.push(c); } + } + + fn extract( + chunks: &mut Vec, + frontier: &Antichain>, + residual: &mut Antichain>, + keep: &mut ChunkList, + ship: &mut ChunkList, + ) { + // Route each update to `keep` (time `>=` frontier) or `ship` (otherwise), + // folding kept times into `residual`. Iterating sorted input and appending + // in order keeps each side sorted, so the builders emit graded chunks. + let mut keep_b = Builder::new(); + let mut ship_b = Builder::new(); + for chunk in chunks.drain(..) 
{ + let mut updates: Vec> = Vec::new(); + collect_in_range(&chunk.0.storage, 0, None, &mut updates); + for ((k, v, t), d) in updates { + if frontier.borrow().less_equal(&t) { + residual.insert_ref(&t); + keep_b.push(k, v, t, d); + } else { + ship_b.push(k, v, t, d); + } + } + } + for c in keep_b.finish() { keep.push(c); } + for c in ship_b.finish() { ship.push(c); } + } + + fn advance( + feed: &mut ChunkFeed, + frontier: &Antichain>, + done: bool, + out: &mut ChunkList, + ) { + // Decompress the withheld head (past its consumed prefix) and all freshly + // pushed tail chunks into one sorted owned buffer. The feed is a single + // sorted run, so the head precedes the tail. + let (consumed, head) = &mut feed.0; + let mut buf: Vec> = Vec::new(); + collect_in_range(&head.0.storage, *consumed, None, &mut buf); + *consumed = 0; + *head = ColChunk::default(); + for chunk in feed.1.drain(..) { + collect_in_range(&chunk.0.storage, 0, None, &mut buf); + } + if buf.is_empty() { return; } + + // The `(key, val)` of an update. + let kv = |u: &OwnedUpdate| (u.0.0.clone(), u.0.1.clone()); + + // If every update shares one `(key, val)` and we are not done, no group is + // provably complete (a later push may extend it); withhold everything as + // the head for the next call. (v1: re-packs the head each call, so a key + // spanning many pushes is quadratic — noted for review.) + if !done && kv(&buf[0]) == kv(&buf[buf.len() - 1]) { + *head = build_single(buf); + return; + } + + // Otherwise withhold the trailing (single `(key, val)`) group unless done. + let end = if done { buf.len() } else { + let last = kv(&buf[buf.len() - 1]); + let mut start = buf.len(); + while start > 0 && kv(&buf[start - 1]) == last { start -= 1; } + start + }; + if end < buf.len() { + let tail = buf.split_off(end); + *head = build_single(tail); + } + + // Advance + consolidate each complete group into graded output chunks. 
+ let mut builder = Builder::new(); + let mut i = 0; + while i < buf.len() { + let mut j = i; + let group = kv(&buf[i]); + while j < buf.len() && kv(&buf[j]) == group { j += 1; } + for u in &mut buf[i..j] { u.0.2.advance_by(frontier.borrow()); } + // Advancing is monotone w.r.t. the lattice but not the total order; + // re-sort the group by time, then consolidate equal times. + buf[i..j].sort_by(|a, b| a.0.2.cmp(&b.0.2)); + let mut k = i; + while k < j { + let (kk, vv, t) = (buf[k].0.0.clone(), buf[k].0.1.clone(), buf[k].0.2.clone()); + let mut diff = buf[k].1.clone(); + k += 1; + while k < j && buf[k].0.2 == t { diff.plus_equals(&buf[k].1); k += 1; } + if !diff.is_zero() { + builder.push(kk, vv, t, diff); + } + } + i = j; + } + for c in builder.finish() { out.push(c); } + } + + fn regrade(queue: &mut Vec, done: bool, out: &mut Vec) { + // Maximal packing over columnar chunks. A chunk already at least `TARGET` + // is passed through by `Rc` move; smaller chunks are decompressed into a + // `Builder` that re-emits `TARGET`-sized runs (and re-consolidates / + // re-applies the singleton optimization). The builder's trailing partial + // is carried back onto `queue` between calls, or emitted on `done`. + // + // (v1 note: a chunk *larger* than `TARGET` — only producible by a single + // `(key, val)` whose updates exceed `TARGET` — is passed through rather + // than split, since columnar mid-val splitting is not yet implemented.) + let mut builder = Builder::new(); + for chunk in queue.drain(..) { + if !builder.has_pending() && chunk.len() >= TARGET { + out.push(chunk); + } else if chunk.len() >= TARGET { + // Flush the carried partial before the larger chunk (order!). 
+ if let Some(c) = builder.finish_pending() { out.push(c); } + out.push(chunk); + } else { + feed_chunk(&mut builder, &chunk); + out.append(&mut builder.drain_done()); + } + } + out.append(&mut builder.drain_done()); + if let Some(c) = builder.finish_pending() { + if done { out.push(c); } else { queue.push(c); } + } + } + } + + /// Owned key / val / time / diff for layout `L`. + type KOwned = <::KeyContainer as BatchContainer>::Owned; + type VOwned = <::ValContainer as BatchContainer>::Owned; + type TOwned = <::TimeContainer as BatchContainer>::Owned; + type DOwned = <::DiffContainer as BatchContainer>::Owned; + /// An owned update grouped for consolidation: `((key, val, time), diff)`. + type OwnedUpdate = ((KOwned, VOwned, TOwned), DOwned); + + /// Push every update of `chunk` into `builder`, in order. Decompresses the + /// columnar layout to an owned `(key, val, time, diff)` stream; used by `regrade` + /// to re-pack small chunks. + fn feed_chunk(builder: &mut Builder, chunk: &ColChunk) { + let mut updates: Vec> = Vec::new(); + collect_in_range(&chunk.0.storage, 0, None, &mut updates); + for ((k, v, t), d) in updates { builder.push(k, v, t, d); } + } + + /// Append owned updates of `s`, in sorted `(key, val, time)` order, starting at + /// logical index `skip`, while `(key, val, time) <= horizon` (when `Some`). + /// Returns the number appended. Used by the producing ops to decompress the + /// in-range portion of a chunk for re-merging. + /// + /// `skip` is walked linearly; under the singleton encoding a logical update index + /// does not map O(1) to a column position, so a retained head re-walks its + /// consumed prefix each call (a perf wrinkle worth revisiting — see notes). 
+ fn collect_in_range( + s: &OrdValStorage, + skip: usize, + horizon: Option<&(KOwned, VOwned, TOwned)>, + out: &mut Vec>, + ) -> usize { + let mut seen = 0; + let mut count = 0; + for ki in 0..s.keys.len() { + let key = s.keys.index(ki); + let (vlo, vhi) = s.vals.bounds(ki); + for vi in vlo..vhi { + let val = s.vals.get_abs(vi); + let (ulo, uhi) = s.upds.bounds(vi); + for ui in ulo..uhi { + if seen < skip { seen += 1; continue; } + let (t, d) = s.upds.get_abs(ui); + let triple = ( + ::into_owned(key), + ::into_owned(val), + ::into_owned(t), + ); + if let Some(h) = horizon { + if &triple > h { return count; } + } + out.push((triple, ::into_owned(d))); + count += 1; + seen += 1; + } + } + } + count + } + + /// An empty columnar storage (offset lists seeded with their leading `0`). + fn empty_storage() -> OrdValStorage { + OrdValStorage { + keys: ::with_capacity(0), + vals: Vals::default(), + upds: Upds::default(), + } + } + + /// Build a single (un-cut) chunk from a sorted, consolidated owned update stream. + /// Used to repackage withheld/pruned suffixes; returns an empty chunk if there + /// are no updates. + fn build_single(updates: Vec>) -> ColChunk { + let mut builder = Builder::with_target(usize::MAX); + for ((k, v, t), d) in updates { builder.push(k, v, t, d); } + builder.finish().pop().unwrap_or_default() + } + + /// Builds graded [`ColChunk`]s from a sorted, consolidated `(key, val, time, diff)` + /// stream, reusing `ord_neu`'s columnar machinery (and its singleton optimization + /// via [`UpdsBuilder::seal`]). Cuts a chunk once its logical update count reaches + /// `TARGET`, at a val boundary, so output arrives graded rather than monolithic. + /// + /// Completed chunks accumulate in `done`; the in-progress chunk lives in + /// `result`/`staging`. (v1: cuts only at val boundaries, so a single `(key, val)` + /// with more than `target` updates yields one over-sized chunk.) 
+ struct Builder { + result: OrdValStorage, + staging: UpdsBuilder, + done: Vec>, + /// Cut the in-progress chunk once its update count reaches this. + target: usize, + /// Last-pushed key / val, for boundary detection without re-reading columns. + cur_key: Option>, + cur_val: Option>, + } + + impl Builder { + /// A builder cutting at `TARGET`. + fn new() -> Self { Self::with_target(TARGET) } + /// A builder cutting once the in-progress chunk reaches `target` updates; + /// `usize::MAX` never cuts (one chunk). + fn with_target(target: usize) -> Self { + Self { + result: empty_storage(), + staging: UpdsBuilder::default(), + done: Vec::new(), + target, + cur_key: None, + cur_val: None, + } + } + + /// Whether the in-progress chunk holds any updates. + fn has_pending(&self) -> bool { !self.result.keys.is_empty() } + + /// Take the completed chunks accumulated so far. + fn drain_done(&mut self) -> Vec> { std::mem::take(&mut self.done) } + + /// Push one update; assumes pushes arrive in sorted `(key, val, time)` order. + fn push(&mut self, key: KOwned, val: VOwned, time: TOwned, diff: DOwned) { + let same_key = self.cur_key.as_ref() == Some(&key); + let same_val = same_key && self.cur_val.as_ref() == Some(&val); + if same_val { + self.staging.push(time, diff); + return; + } + // Crossing a val boundary: seal the prior val, and maybe cut a chunk. + if self.has_pending() { + self.staging.seal(&mut self.result.upds); + if self.staging.total() >= self.target { + self.cut(); + self.open(key, val, time, diff); + return; + } + } + if same_key { + // New val under the same key. + self.staging.push(time, diff); + self.result.vals.vals.push_own(&val); + self.cur_val = Some(val); + } else { + // New key (or the very first update). 
+ if self.has_pending() { + self.result.vals.offs.push_ref(self.result.vals.vals.len()); + } + self.staging.push(time, diff); + self.result.vals.vals.push_own(&val); + self.result.keys.push_own(&key); + self.cur_key = Some(key); + self.cur_val = Some(val); + } + } + + /// Open a fresh chunk's first `(key, val)`; `result` must be empty. + fn open(&mut self, key: KOwned, val: VOwned, time: TOwned, diff: DOwned) { + self.result.vals.vals.push_own(&val); + self.result.keys.push_own(&key); + self.staging.push(time, diff); + self.cur_key = Some(key); + self.cur_val = Some(val); + } + + /// Finalize the in-progress chunk into `done`. The prior val must already be + /// sealed (as it is at the `push` cut point). + fn cut(&mut self) { + self.result.vals.offs.push_ref(self.result.vals.vals.len()); + let updates = self.staging.total(); + let storage = std::mem::replace(&mut self.result, empty_storage()); + self.staging = UpdsBuilder::default(); + self.cur_key = None; + self.cur_val = None; + self.done.push(ColChunk(Rc::new(Inner { storage, updates }))); + } + + /// Seal and finalize the in-progress chunk (if any), returning it. + fn finish_pending(&mut self) -> Option> { + if !self.has_pending() { return None; } + self.staging.seal(&mut self.result.upds); + self.result.vals.offs.push_ref(self.result.vals.vals.len()); + let updates = self.staging.total(); + let storage = std::mem::replace(&mut self.result, empty_storage()); + self.staging = UpdsBuilder::default(); + self.cur_key = None; + self.cur_val = None; + Some(ColChunk(Rc::new(Inner { storage, updates }))) + } + + /// Finish the build, returning all chunks (completed plus the final partial). + fn finish(mut self) -> Vec> { + if let Some(c) = self.finish_pending() { self.done.push(c); } + self.done + } + } + + /// A spine of `Rc`-shared columnar [`ChunkBatch`]es: the `arrange` trace type. + pub type ColChunkSpine = super::ChunkSpine>; + /// A reference-counted builder of columnar [`ChunkBatch`]es. 
+ pub type ColChunkRcBuilder = super::ChunkRcBuilder>; + + // --- batcher side: a `Vec`-scratch chunker that *builds* columnar chunks --- + // + // A consolidated columnar trie can't serve as the merge batcher's unsorted + // accumulation buffer, and the `Chunk` trait deliberately does not require it to. + // Instead this "build-from-input" chunker keeps a flat `Vec` scratch — which the + // stock `ContainerChunker` accumulates and consolidates — and transcodes each + // sorted run into graded `ColChunk`s via [`Builder`]. The `Vec` scratch carries + // the `SizableContainer`/`Consolidate`/`PushInto` burden; the chunk stays a + // minimal, consolidated `Chunk`. (This is the seam: `Input → Chunk` happens here, + // and only here.) + + /// Updates as the dataflow delivers them, before consolidation. + type ColInput = Vec<((KOwned, VOwned), TOwned, DOwned)>; + + /// A [`ContainerBuilder`] that consolidates input in a `Vec` scratch and emits + /// graded [`ColChunk`]s built via [`Builder`]. + pub struct ColChunker { + scratch: ContainerChunker>, + ready: VecDeque>, + output: ColChunk, + } + + impl Default for ColChunker { + fn default() -> Self { + Self { scratch: ContainerChunker::default(), ready: VecDeque::new(), output: ColChunk::default() } + } + } + + impl ColChunker { + /// Transcode one consolidated, sorted `Vec` run into graded `ColChunk`s. + fn transcode(&mut self, mut run: ColInput) { + let mut builder = Builder::new(); + for ((k, v), t, d) in run.drain(..) 
{ builder.push(k, v, t, d); } + self.ready.extend(builder.finish()); + } + fn pop(&mut self) -> Option<&mut ColChunk> { + if let Some(c) = self.ready.pop_front() { + self.output = c; + Some(&mut self.output) + } else { + None + } + } + } + + impl ContainerBuilder for ColChunker + where ColInput: SizableContainer + Consolidate { + type Container = ColChunk; + fn extract(&mut self) -> Option<&mut ColChunk> { + if self.ready.is_empty() { + if let Some(run) = self.scratch.extract() { + let run = std::mem::take(run); + self.transcode(run); + } + } + self.pop() + } + fn finish(&mut self) -> Option<&mut ColChunk> { + if self.ready.is_empty() { + if let Some(run) = self.scratch.finish() { + let run = std::mem::take(run); + self.transcode(run); + } + } + self.pop() + } + } + + impl<'a, L: Layout, Input> PushInto<&'a mut Input> for ColChunker + where ContainerChunker>: PushInto<&'a mut Input> { + fn push_into(&mut self, input: &'a mut Input) { self.scratch.push_into(input); } + } + + impl Accountable for ColChunk { + fn record_count(&self) -> i64 { self.0.updates as i64 } + } + + /// Merge batcher for columnar chunks: merges `ColChunk` chains via [`ChunkMerger`]. + /// The `Input → ColChunk` bridge is [`ColChunker`], now supplied at the `arrange_core` + /// callsite (it consolidates `Input` in a `Vec` scratch and transcodes to `ColChunk`). + pub type ColChunkBatcher = MergeBatcher>>; + + #[cfg(test)] + mod test { + use super::{Builder, ColChunk}; + use crate::trace::cursor::Cursor; + use crate::trace::{BatchReader, Description}; + use crate::trace::chunk::{is_graded, ChunkBatch, ChunkList}; + use crate::trace::implementations::Vector; + use timely::progress::Antichain; + + type CC = ColChunk>; + + /// Build graded columnar chunks from a sorted, consolidated update stream. 
+ fn build(updates: Vec<((u64, u64), u64, i64)>) -> Vec { + let mut b = Builder::new(); + for ((k, v), t, d) in &updates { b.push(*k, *v, *t, *d); } + b.finish() + } + + /// Flatten chunks back to a `(key, val, time, diff)` stream. + fn dump(chunks: &[CC]) -> Vec<((u64, u64), u64, i64)> { + let mut out = Vec::new(); + for c in chunks { + let mut v = Vec::new(); + super::collect_in_range(&c.0.storage, 0, None, &mut v); + for ((k, vv, t), d) in v { out.push(((k, vv), t, d)); } + } + out + } + + // A columnar chunk built from a sorted stream must read back, through the + // batch cursor, exactly the grouped updates — including a key and a + // `(key, val)` that straddle chunk boundaries. + #[test] + fn cursor_round_trips() { + let updates = vec![ + ((0u64, 0u64), 0u64, 1i64), + ((1, 0), 0, 1), + ((1, 1), 0, 1), + ((1, 1), 1, 1), + ((1, 2), 0, 1), + ((2, 0), 0, 1), + ]; + let chunks = build(updates.clone()); + assert!(!chunks.is_empty()); + let desc = Description::new( + Antichain::from_elem(0u64), + Antichain::from_elem(2u64), + Antichain::from_elem(0u64), + ); + let batch = ChunkBatch::new(chunks, desc); + let mut cursor = batch.cursor(); + let got = cursor.to_vec(&batch, |k| *k, |v| *v); + let want = vec![ + ((0u64, 0u64), vec![(0u64, 1i64)]), + ((1, 0), vec![(0, 1)]), + ((1, 1), vec![(0, 1), (1, 1)]), + ((1, 2), vec![(0, 1)]), + ((2, 0), vec![(0, 1)]), + ]; + assert_eq!(got, want); + } + + // Building a run larger than `TARGET` yields multiple graded chunks (cut at + // val boundaries), and the singleton optimization keeps storage compact. + #[test] + fn build_cuts_and_grades() { + use super::TARGET; + // Distinct keys, each a single val with one update at the same (time,diff) + // — the singleton case. 3·TARGET keys ⇒ several chunks. 
+ let n = 3 * TARGET as u64; + let updates: Vec<_> = (0..n).map(|k| ((k, 0u64), 0u64, 1i64)).collect(); + let chunks = build(updates); + assert!(chunks.len() >= 3, "expected several chunks, got {}", chunks.len()); + let total: usize = chunks.iter().map(|c| c.0.updates).sum(); + assert_eq!(total, n as usize); + // Every chunk but the last is at least TARGET (cut on reaching it). + for c in &chunks[..chunks.len() - 1] { + assert!(c.0.updates >= TARGET); + } + } + + // `merge_chains` over columnar chunks must consolidate equal `(k,v,t)` across + // chains and reproduce the sorted, summed contents. + #[test] + fn merge_chains_consolidates() { + use crate::trace::chunk::merge_chains; + let a = build(vec![((0, 0), 0, 1), ((1, 0), 0, 1)]); + let b = build(vec![((0, 0), 0, 1), ((2, 0), 0, 1)]); + let mut out = ChunkList::default(); + merge_chains(a, b, &mut out); + assert_eq!(dump(&out.done()), vec![((0, 0), 0, 2), ((1, 0), 0, 1), ((2, 0), 0, 1)]); + } + + // Merging two large interleaving chains yields a graded sequence with exactly + // the merged contents. + #[test] + fn merge_chains_grades() { + use super::TARGET; + use crate::trace::chunk::{is_graded, merge_chains}; + let n = 4 * TARGET as u64; + let evens = build((0..n).step_by(2).map(|k| ((k, 0), 0, 1)).collect()); + let odds = build((0..n).step_by(2).map(|k| ((k + 1, 0), 0, 1)).collect()); + let mut out = ChunkList::default(); + merge_chains(evens, odds, &mut out); + let chunks = out.done(); + assert!(is_graded(&chunks), "ungraded: {:?}", + chunks.iter().map(super::super::Chunk::len).collect::>()); + let want: Vec<_> = (0..n).map(|k| ((k, 0u64), 0u64, 1i64)).collect(); + assert_eq!(dump(&chunks), want); + } + + // `regrade` (via `ChunkList`) coalesces small adjacent chunks while + // preserving sorted contents. + #[test] + fn regrade_coalesces() { + // Many tiny single-update chunks with distinct increasing keys. 
+ let n = 5 * super::TARGET as u64; + let mut list = ChunkList::default(); + for k in 0..n { + let mut c = build(vec![((k, 0u64), 0u64, 1i64)]); + assert_eq!(c.len(), 1); + list.push(c.pop().unwrap()); + } + let chunks = list.done(); + // Coalesced to roughly n/TARGET chunks, all sorted and accounted for. + let total: usize = chunks.iter().map(super::super::Chunk::len).sum(); + assert_eq!(total, n as usize); + assert!(is_graded(&chunks), "ungraded: {:?}", + chunks.iter().map(super::super::Chunk::len).collect::>()); + } + + // `extract` partitions by frontier (kept `>=`, shipped `<`), folds the kept + // frontier into `residual`, and emits graded chunks on both sides. + #[test] + fn extract_partitions() { + use super::super::Chunk; + let n = 4 * super::TARGET as u64; + // Distinct keys; even keys at time 0 (ship), odd keys at time 1 (keep). + let mut chunks = build((0..n).map(|i| ((i, 0), i % 2, 1)).collect()); + let frontier = Antichain::from_elem(1u64); + let mut residual = Antichain::new(); + let (mut keep, mut ship) = (ChunkList::default(), ChunkList::default()); + CC::extract(&mut chunks, &frontier, &mut residual, &mut keep, &mut ship); + let (keep, ship) = (keep.done(), ship.done()); + assert_eq!(residual, Antichain::from_elem(1u64)); + assert!(is_graded(&keep) && is_graded(&ship)); + let (kd, sd) = (dump(&keep), dump(&ship)); + assert_eq!(kd.len() + sd.len(), n as usize); + assert!(kd.iter().all(|u| u.1 == 1)); + assert!(sd.iter().all(|u| u.1 == 0)); + } + + // Resumable `advance` (chunk-at-a-time) must match a single all-at-once flush, + // with every time advanced to the frontier. + #[test] + fn advance_resumable_matches_oneshot() { + use crate::trace::chunk::AdvanceQueue; + let frontier = Antichain::from_elem(3u64); + // One sorted run, split across three pushes; groups straddle the splits. 
+ let input = || vec![ + vec![((0u64, 0u64), 0u64, 1i64), ((0, 0), 1, 1), ((1, 0), 0, 1)], + vec![((1, 0), 5, 1), ((1, 1), 0, 1), ((2, 0), 0, 1)], + vec![((2, 0), 2, 1), ((2, 0), 9, 1)], + ]; + let oneshot = { + let mut q = AdvanceQueue::new(frontier.clone()); + let mut out = ChunkList::default(); + for g in input() { q.push(build(g), &mut out); } + q.finish(&mut out); + dump(&out.done()) + }; + let incremental = { + let mut q = AdvanceQueue::new(frontier.clone()); + let mut out = ChunkList::default(); + for g in input() { + for c in build(g) { q.push(std::iter::once(c), &mut out); } + } + q.finish(&mut out); + dump(&out.done()) + }; + assert_eq!(oneshot, incremental); + for u in &oneshot { assert!(u.1 >= 3); } + } + + // End-to-end: merge two columnar batches through the real `ChunkBatchMerger` + // (which drives merge + advance), then read the result through the batch + // cursor. Validates the whole columnar pipeline. + #[test] + fn batch_merge_end_to_end() { + use crate::trace::cursor::Cursor; + use crate::trace::{BatchReader, Description, Merger}; + use crate::trace::chunk::{ChunkBatch, ChunkBatchMerger}; + + let desc = |lo: u64, up: u64| Description::new( + Antichain::from_elem(lo), + Antichain::from_elem(up), + Antichain::from_elem(0u64), + ); + let b1 = ChunkBatch::new(build(vec![((0, 0), 0, 1), ((1, 0), 1, 1)]), desc(0, 1)); + let b2 = ChunkBatch::new(build(vec![((0, 0), 1, 1), ((2, 0), 0, 1)]), desc(1, 2)); + + let frontier = Antichain::from_elem(0u64); // since == minimum: no advance + let mut m = ChunkBatchMerger::new(&b1, &b2, frontier.borrow()); + let mut fuel = isize::MAX; + m.work(&b1, &b2, &mut fuel); + let merged = m.done(); + + let mut cursor = merged.cursor(); + let got = cursor.to_vec(&merged, |k| *k, |v| *v); + let want = vec![ + ((0u64, 0u64), vec![(0u64, 1i64), (1, 1)]), + ((1, 0), vec![(1, 1)]), + ((2, 0), vec![(0, 1)]), + ]; + assert_eq!(got, want); + } + } +} diff --git a/differential-dataflow/src/trace/mod.rs 
b/differential-dataflow/src/trace/mod.rs index 6fd1da106..61ee30d65 100644 --- a/differential-dataflow/src/trace/mod.rs +++ b/differential-dataflow/src/trace/mod.rs @@ -7,6 +7,7 @@ //! collection trace. This trait allows operator implementations to be generic with respect to the type of trace, //! and allows various data structures to be interpretable as multiple different types of trace. +pub mod chunk; pub mod cursor; pub mod description; pub mod implementations; From f34a6e8293d8ec86115f7db9fb016deb87befcf6 Mon Sep 17 00:00:00 2001 From: Frank McSherry Date: Thu, 11 Jun 2026 12:31:13 -0400 Subject: [PATCH 2/9] chunk: remove col_chunk backend col_chunk was an early columnar Chunk mirroring ord_neu; the phase-2 TrieChunk supersedes it and fixes its recorded limitations (decompress/recompress merge, the singleton-vs-logical-count prefix re-walk, val-boundary-only cuts). Carrying it taxed every chunk_basis modification with a re-application to a full Chunk impl slated for deletion, so it goes early. The chunks example drops its `colchunk` mode. 
Co-Authored-By: Claude Opus 4.8 (1M context) --- differential-dataflow/examples/chunks.rs | 16 +- differential-dataflow/src/trace/chunk.rs | 866 ----------------------- 2 files changed, 1 insertion(+), 881 deletions(-) diff --git a/differential-dataflow/examples/chunks.rs b/differential-dataflow/examples/chunks.rs index 66d8f7496..94c3ac61c 100644 --- a/differential-dataflow/examples/chunks.rs +++ b/differential-dataflow/examples/chunks.rs @@ -13,8 +13,6 @@ use differential_dataflow::input::Input; use differential_dataflow::operators::arrange::Arrange; use differential_dataflow::operators::arrange::arrangement::arrange_core; use differential_dataflow::trace::chunk::vec_chunk::{ChunkBatcher, ChunkRcBuilder, ChunkSpine, VecChunk}; -use differential_dataflow::trace::chunk::col_chunk::{ColChunkBatcher, ColChunkRcBuilder, ColChunkSpine, ColChunker}; -use differential_dataflow::trace::implementations::Vector; use differential_dataflow::trace::implementations::chunker::ContainerChunker; use differential_dataflow::trace::implementations::ord_neu::{OrdValBatcher, RcOrdValBuilder, OrdValSpine}; @@ -55,18 +53,6 @@ fn main() { keys.inner, Exchange::new(|u: &((u64, ()), u64, isize)| (u.0).0.hashed().into()), "Keys"); keys.join_core(data, |_k, &(), &()| Option::<()>::None).probe_with(&mut probe); } - "colchunk" => { - type L = Vector<((u64, ()), u64, isize)>; - type Ba = ColChunkBatcher; - type Bu = ColChunkRcBuilder; - type Sp = ColChunkSpine; - type Chu = ColChunker; - let data = arrange_core::<_, _, Chu, Ba, Bu, Sp>( - data.inner, Exchange::new(|u: &((u64, ()), u64, isize)| (u.0).0.hashed().into()), "Data"); - let keys = arrange_core::<_, _, Chu, Ba, Bu, Sp>( - keys.inner, Exchange::new(|u: &((u64, ()), u64, isize)| (u.0).0.hashed().into()), "Keys"); - keys.join_core(data, |_k, &(), &()| Option::<()>::None).probe_with(&mut probe); - } "ord" => { type Ba = OrdValBatcher; type Bu = RcOrdValBuilder; @@ -75,7 +61,7 @@ fn main() { let keys = keys.arrange::(); keys.join_core(data, 
|_k, &(), &()| Option::<()>::None).probe_with(&mut probe); } - other => panic!("unrecognized mode: {other:?} (expected `chunk`, `colchunk`, or `ord`)"), + other => panic!("unrecognized mode: {other:?} (expected `chunk` or `ord`)"), } (data_input, keys_input) diff --git a/differential-dataflow/src/trace/chunk.rs b/differential-dataflow/src/trace/chunk.rs index 7d51f38a4..b3eb6589b 100644 --- a/differential-dataflow/src/trace/chunk.rs +++ b/differential-dataflow/src/trace/chunk.rs @@ -1522,869 +1522,3 @@ pub mod vec_chunk { } } } - -pub mod col_chunk { - //! A columnar [`Chunk`] mirroring the `ord_neu` layout: separate key / val / - //! time / diff columns linked by offset lists, with the singleton update - //! optimization (an empty `[offs[i], offs[i+1])` range reuses the prior single - //! `(time, diff)`). - //! - //! Where [`vec_chunk`](super::vec_chunk) stores a flat `Vec<((K,V),T,R)>` — - //! repeating each key on every update — this stores each distinct key once, each - //! `(key, val)` once, and shares a single `(time, diff)` across vals when they - //! coincide. It reuses the very containers `ord_neu` is built from - //! ([`Vals`], [`Upds`], [`UpdsBuilder`], [`OffsetList`](crate::trace::implementations::OffsetList)), - //! so the layout and the singleton encoding are shared code, not a re-derivation. - //! - //! The chunk-producing operations still emit *graded* runs (cut to - //! [`Chunk::TARGET`]) rather than one monolithic batch, preserving proportional - //! merging. - //! - //! # Status (v1) - //! - //! The full [`Chunk`] trait is implemented and tested (merge / extract / advance / - //! regrade / prune / cursor), so `ColChunk` works as a [`ChunkBatch`] backing — a - //! spine and reference-counted builder are aliased below. Known limitations, all - //! noted at their sites and worth review: - //! - //! * **Decompress/recompress.** `merge`/`extract`/`advance` read inputs into an - //! owned `(key, val, time, diff)` stream and rebuild via [`Builder`]. 
This is - //! proportional and graded, but does not yet exploit the columnar layout to copy - //! runs of distinct keys by range (as `ord_neu`'s merger does); it also - //! materializes owned keys/vals during a merge. - //! * **`consumed` ↔ singleton.** The merge `consumed` prefix is a *logical* update - //! count, which under the singleton encoding does not map `O(1)` to a column - //! position, so a retained head re-walks its prefix each call. - //! * **Cut granularity.** Chunks are cut only at val boundaries, so a single - //! `(key, val)` exceeding `TARGET` yields one over-sized chunk; mid-val splitting - //! is unimplemented. - //! * **Merge-batcher integration.** Wired via [`ColChunkBatcher`] / [`ColChunker`]: - //! rather than make the consolidated trie double as the merge batcher's unsorted - //! accumulation buffer (which `vec_chunk` can, being flat, but a trie can't), the - //! chunker keeps a `Vec` scratch and *builds* graded chunks from each sorted run. - //! The `Chunk` trait stays minimal — the accumulation traits live on the scratch, - //! not the chunk. `ColChunk` arranges end-to-end (`Collection → MergeBatcher → - //! Batch → Merger`); see the `chunks` example's `colchunk` mode. 
- - use std::marker::PhantomData; - use std::rc::Rc; - - use std::collections::VecDeque; - - use timely::Accountable; - use timely::container::{ContainerBuilder, PushInto, SizableContainer}; - use timely::progress::Antichain; - - use crate::consolidation::Consolidate; - use crate::difference::{IsZero, Semigroup}; - use crate::lattice::Lattice; - use crate::trace::cursor::Cursor; - use crate::trace::implementations::{BatchContainer, Layout, WithLayout}; - use crate::trace::implementations::layout::Time; - use crate::trace::implementations::chunker::ContainerChunker; - use crate::trace::implementations::merge_batcher::MergeBatcher; - use crate::trace::implementations::ord_neu::layers::{UpdsBuilder, Upds, Vals}; - use crate::trace::implementations::ord_neu::val_batch::OrdValStorage; - - use super::{Chunk, ChunkFeed, ChunkList, ChunkMerger}; - - /// The chunk size: maximum updates per chunk and the coalescing threshold. - const TARGET: usize = 1024; - - /// Columnar storage plus the *logical* update count, which exceeds the number - /// of stored `(time, diff)` pairs by the number of singleton reuses. - struct Inner { - storage: OrdValStorage, - updates: usize, - } - - /// A sorted, consolidated columnar run of `((key, val), time, diff)`, shared via `Rc`. - pub struct ColChunk(Rc>); - - impl Clone for ColChunk { - fn clone(&self) -> Self { ColChunk(Rc::clone(&self.0)) } - } - - impl Default for ColChunk { - fn default() -> Self { - ColChunk(Rc::new(Inner { - storage: OrdValStorage { - keys: ::with_capacity(0), - vals: Vals::default(), - upds: Upds::default(), - }, - updates: 0, - })) - } - } - - impl WithLayout for ColChunk { - type Layout = L; - } - - // --- cursor: mirrors `ord_neu`'s `OrdValCursor`, over `ColChunk` storage --- - - /// A cursor over a [`ColChunk`], tracking absolute key and val positions. 
- pub struct ColChunkCursor { - key_cursor: usize, - val_cursor: usize, - phantom: PhantomData, - } - - impl WithLayout for ColChunkCursor { - type Layout = L; - } - - impl Cursor for ColChunkCursor { - type Storage = ColChunk; - - fn get_key<'a>(&self, s: &'a Self::Storage) -> Option> { s.0.storage.keys.get(self.key_cursor) } - fn get_val<'a>(&self, s: &'a Self::Storage) -> Option> { if self.val_valid(s) { Some(self.val(s)) } else { None } } - - fn key<'a>(&self, s: &'a Self::Storage) -> Self::Key<'a> { s.0.storage.keys.index(self.key_cursor) } - fn val<'a>(&self, s: &'a Self::Storage) -> Self::Val<'a> { s.0.storage.vals.get_abs(self.val_cursor) } - - fn map_times, Self::DiffGat<'_>)>(&mut self, s: &Self::Storage, mut logic: L2) { - let (lower, upper) = s.0.storage.upds.bounds(self.val_cursor); - for index in lower .. upper { - let (time, diff) = s.0.storage.upds.get_abs(index); - logic(time, diff); - } - } - - fn key_valid(&self, s: &Self::Storage) -> bool { self.key_cursor < s.0.storage.keys.len() } - fn val_valid(&self, s: &Self::Storage) -> bool { self.val_cursor < s.0.storage.vals.bounds(self.key_cursor).1 } - - fn step_key(&mut self, s: &Self::Storage) { - self.key_cursor += 1; - if self.key_valid(s) { self.rewind_vals(s); } - else { self.key_cursor = s.0.storage.keys.len(); } - } - fn seek_key(&mut self, s: &Self::Storage, key: Self::Key<'_>) { - self.key_cursor += s.0.storage.keys.advance(self.key_cursor, s.0.storage.keys.len(), |x| { - ::reborrow(x).lt(&::reborrow(key)) - }); - if self.key_valid(s) { self.rewind_vals(s); } - } - fn step_val(&mut self, s: &Self::Storage) { - self.val_cursor += 1; - if !self.val_valid(s) { self.val_cursor = s.0.storage.vals.bounds(self.key_cursor).1; } - } - fn seek_val(&mut self, s: &Self::Storage, val: Self::Val<'_>) { - self.val_cursor += s.0.storage.vals.vals.advance(self.val_cursor, s.0.storage.vals.bounds(self.key_cursor).1, |x| { - ::reborrow(x).lt(&::reborrow(val)) - }); - } - fn rewind_keys(&mut self, s: 
&Self::Storage) { - self.key_cursor = 0; - if self.key_valid(s) { self.rewind_vals(s); } - } - fn rewind_vals(&mut self, s: &Self::Storage) { - self.val_cursor = s.0.storage.vals.bounds(self.key_cursor).0; - } - } - - // --- trace side: the `Chunk` operations --- - - impl Chunk for ColChunk - where - // Implied by `Layout` but not elaborated at direct call sites; state them so - // `advance` can advance times and consolidate diffs. - ::Owned: Lattice, - ::Owned: Semigroup, - { - type Cursor = ColChunkCursor; - - const TARGET: usize = TARGET; - - fn cursor(&self) -> Self::Cursor { - ColChunkCursor { key_cursor: 0, val_cursor: 0, phantom: PhantomData } - } - - fn bounds(&self) -> ( - (Self::Key<'_>, Self::Val<'_>, Self::TimeGat<'_>), - (Self::Key<'_>, Self::Val<'_>, Self::TimeGat<'_>), - ) { - let s = &self.0.storage; - let nk = s.keys.len(); - let nv = s.vals.len(); - // First (key, val, time): first key, its first val, that val's first time. - // The first val is never a singleton (the encoding needs a prior update). - let fk = s.keys.index(0); - let fv = s.vals.get_abs(0); - let ft = s.upds.times.index(s.upds.bounds(0).0); - // Last (key, val, time): last key, last val, that val's last time. - let lk = s.keys.index(nk - 1); - let lv = s.vals.get_abs(nv - 1); - let lt = s.upds.times.index(s.upds.bounds(nv - 1).1 - 1); - ((fk, fv, ft), (lk, lv, lt)) - } - - fn len(&self) -> usize { self.0.updates } - - fn prune(self, prefix: usize) -> Self { - // The suffix `[prefix..]` is already sorted + consolidated; rebuild it as - // a single chunk (never cutting), mirroring `vec_chunk::prune`. - let mut updates: Vec> = Vec::new(); - collect_in_range(&self.0.storage, prefix, None, &mut updates); - build_single(updates) - } - - fn merge(chunks: &mut [(usize, Self)], out: &mut ChunkList) { - // Horizon: least last `(key, val, time)` across in-range inputs; nothing - // strictly below it can interleave with as-yet-unmerged updates. 
- let horizon = chunks.iter() - .filter(|(consumed, ch)| *consumed < ch.len()) - .map(|(_, ch)| { - let (_, (lk, lv, lt)) = ch.bounds(); - ( - ::into_owned(lk), - ::into_owned(lv), - ::into_owned(lt), - ) - }) - .min(); - let Some(horizon) = horizon else { return; }; - - // Decompress each input's in-range prefix, advancing its consumed count. - // The input achieving the horizon drains fully (all its updates are - // `<= horizon`), satisfying the merge contract. - let mut collected: Vec> = Vec::new(); - for (consumed, ch) in chunks.iter_mut() { - let added = collect_in_range(&ch.0.storage, *consumed, Some(&horizon), &mut collected); - *consumed += added; - } - // Sort + sum equal `(key, val, time)` across inputs, dropping cancellations. - crate::consolidation::consolidate(&mut collected); - // Re-pack into graded chunks (the builder re-applies the singleton opt). - let mut builder = Builder::new(); - for ((k, v, t), d) in collected { builder.push(k, v, t, d); } - for c in builder.finish() { out.push(c); } - } - - fn extract( - chunks: &mut Vec, - frontier: &Antichain>, - residual: &mut Antichain>, - keep: &mut ChunkList, - ship: &mut ChunkList, - ) { - // Route each update to `keep` (time `>=` frontier) or `ship` (otherwise), - // folding kept times into `residual`. Iterating sorted input and appending - // in order keeps each side sorted, so the builders emit graded chunks. - let mut keep_b = Builder::new(); - let mut ship_b = Builder::new(); - for chunk in chunks.drain(..) 
{ - let mut updates: Vec> = Vec::new(); - collect_in_range(&chunk.0.storage, 0, None, &mut updates); - for ((k, v, t), d) in updates { - if frontier.borrow().less_equal(&t) { - residual.insert_ref(&t); - keep_b.push(k, v, t, d); - } else { - ship_b.push(k, v, t, d); - } - } - } - for c in keep_b.finish() { keep.push(c); } - for c in ship_b.finish() { ship.push(c); } - } - - fn advance( - feed: &mut ChunkFeed, - frontier: &Antichain>, - done: bool, - out: &mut ChunkList, - ) { - // Decompress the withheld head (past its consumed prefix) and all freshly - // pushed tail chunks into one sorted owned buffer. The feed is a single - // sorted run, so the head precedes the tail. - let (consumed, head) = &mut feed.0; - let mut buf: Vec> = Vec::new(); - collect_in_range(&head.0.storage, *consumed, None, &mut buf); - *consumed = 0; - *head = ColChunk::default(); - for chunk in feed.1.drain(..) { - collect_in_range(&chunk.0.storage, 0, None, &mut buf); - } - if buf.is_empty() { return; } - - // The `(key, val)` of an update. - let kv = |u: &OwnedUpdate| (u.0.0.clone(), u.0.1.clone()); - - // If every update shares one `(key, val)` and we are not done, no group is - // provably complete (a later push may extend it); withhold everything as - // the head for the next call. (v1: re-packs the head each call, so a key - // spanning many pushes is quadratic — noted for review.) - if !done && kv(&buf[0]) == kv(&buf[buf.len() - 1]) { - *head = build_single(buf); - return; - } - - // Otherwise withhold the trailing (single `(key, val)`) group unless done. - let end = if done { buf.len() } else { - let last = kv(&buf[buf.len() - 1]); - let mut start = buf.len(); - while start > 0 && kv(&buf[start - 1]) == last { start -= 1; } - start - }; - if end < buf.len() { - let tail = buf.split_off(end); - *head = build_single(tail); - } - - // Advance + consolidate each complete group into graded output chunks. 
- let mut builder = Builder::new(); - let mut i = 0; - while i < buf.len() { - let mut j = i; - let group = kv(&buf[i]); - while j < buf.len() && kv(&buf[j]) == group { j += 1; } - for u in &mut buf[i..j] { u.0.2.advance_by(frontier.borrow()); } - // Advancing is monotone w.r.t. the lattice but not the total order; - // re-sort the group by time, then consolidate equal times. - buf[i..j].sort_by(|a, b| a.0.2.cmp(&b.0.2)); - let mut k = i; - while k < j { - let (kk, vv, t) = (buf[k].0.0.clone(), buf[k].0.1.clone(), buf[k].0.2.clone()); - let mut diff = buf[k].1.clone(); - k += 1; - while k < j && buf[k].0.2 == t { diff.plus_equals(&buf[k].1); k += 1; } - if !diff.is_zero() { - builder.push(kk, vv, t, diff); - } - } - i = j; - } - for c in builder.finish() { out.push(c); } - } - - fn regrade(queue: &mut Vec, done: bool, out: &mut Vec) { - // Maximal packing over columnar chunks. A chunk already at least `TARGET` - // is passed through by `Rc` move; smaller chunks are decompressed into a - // `Builder` that re-emits `TARGET`-sized runs (and re-consolidates / - // re-applies the singleton optimization). The builder's trailing partial - // is carried back onto `queue` between calls, or emitted on `done`. - // - // (v1 note: a chunk *larger* than `TARGET` — only producible by a single - // `(key, val)` whose updates exceed `TARGET` — is passed through rather - // than split, since columnar mid-val splitting is not yet implemented.) - let mut builder = Builder::new(); - for chunk in queue.drain(..) { - if !builder.has_pending() && chunk.len() >= TARGET { - out.push(chunk); - } else if chunk.len() >= TARGET { - // Flush the carried partial before the larger chunk (order!). 
- if let Some(c) = builder.finish_pending() { out.push(c); } - out.push(chunk); - } else { - feed_chunk(&mut builder, &chunk); - out.append(&mut builder.drain_done()); - } - } - out.append(&mut builder.drain_done()); - if let Some(c) = builder.finish_pending() { - if done { out.push(c); } else { queue.push(c); } - } - } - } - - /// Owned key / val / time / diff for layout `L`. - type KOwned = <::KeyContainer as BatchContainer>::Owned; - type VOwned = <::ValContainer as BatchContainer>::Owned; - type TOwned = <::TimeContainer as BatchContainer>::Owned; - type DOwned = <::DiffContainer as BatchContainer>::Owned; - /// An owned update grouped for consolidation: `((key, val, time), diff)`. - type OwnedUpdate = ((KOwned, VOwned, TOwned), DOwned); - - /// Push every update of `chunk` into `builder`, in order. Decompresses the - /// columnar layout to an owned `(key, val, time, diff)` stream; used by `regrade` - /// to re-pack small chunks. - fn feed_chunk(builder: &mut Builder, chunk: &ColChunk) { - let mut updates: Vec> = Vec::new(); - collect_in_range(&chunk.0.storage, 0, None, &mut updates); - for ((k, v, t), d) in updates { builder.push(k, v, t, d); } - } - - /// Append owned updates of `s`, in sorted `(key, val, time)` order, starting at - /// logical index `skip`, while `(key, val, time) <= horizon` (when `Some`). - /// Returns the number appended. Used by the producing ops to decompress the - /// in-range portion of a chunk for re-merging. - /// - /// `skip` is walked linearly; under the singleton encoding a logical update index - /// does not map O(1) to a column position, so a retained head re-walks its - /// consumed prefix each call (a perf wrinkle worth revisiting — see notes). 
- fn collect_in_range( - s: &OrdValStorage, - skip: usize, - horizon: Option<&(KOwned, VOwned, TOwned)>, - out: &mut Vec>, - ) -> usize { - let mut seen = 0; - let mut count = 0; - for ki in 0..s.keys.len() { - let key = s.keys.index(ki); - let (vlo, vhi) = s.vals.bounds(ki); - for vi in vlo..vhi { - let val = s.vals.get_abs(vi); - let (ulo, uhi) = s.upds.bounds(vi); - for ui in ulo..uhi { - if seen < skip { seen += 1; continue; } - let (t, d) = s.upds.get_abs(ui); - let triple = ( - ::into_owned(key), - ::into_owned(val), - ::into_owned(t), - ); - if let Some(h) = horizon { - if &triple > h { return count; } - } - out.push((triple, ::into_owned(d))); - count += 1; - seen += 1; - } - } - } - count - } - - /// An empty columnar storage (offset lists seeded with their leading `0`). - fn empty_storage() -> OrdValStorage { - OrdValStorage { - keys: ::with_capacity(0), - vals: Vals::default(), - upds: Upds::default(), - } - } - - /// Build a single (un-cut) chunk from a sorted, consolidated owned update stream. - /// Used to repackage withheld/pruned suffixes; returns an empty chunk if there - /// are no updates. - fn build_single(updates: Vec>) -> ColChunk { - let mut builder = Builder::with_target(usize::MAX); - for ((k, v, t), d) in updates { builder.push(k, v, t, d); } - builder.finish().pop().unwrap_or_default() - } - - /// Builds graded [`ColChunk`]s from a sorted, consolidated `(key, val, time, diff)` - /// stream, reusing `ord_neu`'s columnar machinery (and its singleton optimization - /// via [`UpdsBuilder::seal`]). Cuts a chunk once its logical update count reaches - /// `TARGET`, at a val boundary, so output arrives graded rather than monolithic. - /// - /// Completed chunks accumulate in `done`; the in-progress chunk lives in - /// `result`/`staging`. (v1: cuts only at val boundaries, so a single `(key, val)` - /// with more than `target` updates yields one over-sized chunk.) 
- struct Builder { - result: OrdValStorage, - staging: UpdsBuilder, - done: Vec>, - /// Cut the in-progress chunk once its update count reaches this. - target: usize, - /// Last-pushed key / val, for boundary detection without re-reading columns. - cur_key: Option>, - cur_val: Option>, - } - - impl Builder { - /// A builder cutting at `TARGET`. - fn new() -> Self { Self::with_target(TARGET) } - /// A builder cutting once the in-progress chunk reaches `target` updates; - /// `usize::MAX` never cuts (one chunk). - fn with_target(target: usize) -> Self { - Self { - result: empty_storage(), - staging: UpdsBuilder::default(), - done: Vec::new(), - target, - cur_key: None, - cur_val: None, - } - } - - /// Whether the in-progress chunk holds any updates. - fn has_pending(&self) -> bool { !self.result.keys.is_empty() } - - /// Take the completed chunks accumulated so far. - fn drain_done(&mut self) -> Vec> { std::mem::take(&mut self.done) } - - /// Push one update; assumes pushes arrive in sorted `(key, val, time)` order. - fn push(&mut self, key: KOwned, val: VOwned, time: TOwned, diff: DOwned) { - let same_key = self.cur_key.as_ref() == Some(&key); - let same_val = same_key && self.cur_val.as_ref() == Some(&val); - if same_val { - self.staging.push(time, diff); - return; - } - // Crossing a val boundary: seal the prior val, and maybe cut a chunk. - if self.has_pending() { - self.staging.seal(&mut self.result.upds); - if self.staging.total() >= self.target { - self.cut(); - self.open(key, val, time, diff); - return; - } - } - if same_key { - // New val under the same key. - self.staging.push(time, diff); - self.result.vals.vals.push_own(&val); - self.cur_val = Some(val); - } else { - // New key (or the very first update). 
- if self.has_pending() { - self.result.vals.offs.push_ref(self.result.vals.vals.len()); - } - self.staging.push(time, diff); - self.result.vals.vals.push_own(&val); - self.result.keys.push_own(&key); - self.cur_key = Some(key); - self.cur_val = Some(val); - } - } - - /// Open a fresh chunk's first `(key, val)`; `result` must be empty. - fn open(&mut self, key: KOwned, val: VOwned, time: TOwned, diff: DOwned) { - self.result.vals.vals.push_own(&val); - self.result.keys.push_own(&key); - self.staging.push(time, diff); - self.cur_key = Some(key); - self.cur_val = Some(val); - } - - /// Finalize the in-progress chunk into `done`. The prior val must already be - /// sealed (as it is at the `push` cut point). - fn cut(&mut self) { - self.result.vals.offs.push_ref(self.result.vals.vals.len()); - let updates = self.staging.total(); - let storage = std::mem::replace(&mut self.result, empty_storage()); - self.staging = UpdsBuilder::default(); - self.cur_key = None; - self.cur_val = None; - self.done.push(ColChunk(Rc::new(Inner { storage, updates }))); - } - - /// Seal and finalize the in-progress chunk (if any), returning it. - fn finish_pending(&mut self) -> Option> { - if !self.has_pending() { return None; } - self.staging.seal(&mut self.result.upds); - self.result.vals.offs.push_ref(self.result.vals.vals.len()); - let updates = self.staging.total(); - let storage = std::mem::replace(&mut self.result, empty_storage()); - self.staging = UpdsBuilder::default(); - self.cur_key = None; - self.cur_val = None; - Some(ColChunk(Rc::new(Inner { storage, updates }))) - } - - /// Finish the build, returning all chunks (completed plus the final partial). - fn finish(mut self) -> Vec> { - if let Some(c) = self.finish_pending() { self.done.push(c); } - self.done - } - } - - /// A spine of `Rc`-shared columnar [`ChunkBatch`]es: the `arrange` trace type. - pub type ColChunkSpine = super::ChunkSpine>; - /// A reference-counted builder of columnar [`ChunkBatch`]es. 
- pub type ColChunkRcBuilder = super::ChunkRcBuilder>; - - // --- batcher side: a `Vec`-scratch chunker that *builds* columnar chunks --- - // - // A consolidated columnar trie can't serve as the merge batcher's unsorted - // accumulation buffer, and the `Chunk` trait deliberately does not require it to. - // Instead this "build-from-input" chunker keeps a flat `Vec` scratch — which the - // stock `ContainerChunker` accumulates and consolidates — and transcodes each - // sorted run into graded `ColChunk`s via [`Builder`]. The `Vec` scratch carries - // the `SizableContainer`/`Consolidate`/`PushInto` burden; the chunk stays a - // minimal, consolidated `Chunk`. (This is the seam: `Input → Chunk` happens here, - // and only here.) - - /// Updates as the dataflow delivers them, before consolidation. - type ColInput = Vec<((KOwned, VOwned), TOwned, DOwned)>; - - /// A [`ContainerBuilder`] that consolidates input in a `Vec` scratch and emits - /// graded [`ColChunk`]s built via [`Builder`]. - pub struct ColChunker { - scratch: ContainerChunker>, - ready: VecDeque>, - output: ColChunk, - } - - impl Default for ColChunker { - fn default() -> Self { - Self { scratch: ContainerChunker::default(), ready: VecDeque::new(), output: ColChunk::default() } - } - } - - impl ColChunker { - /// Transcode one consolidated, sorted `Vec` run into graded `ColChunk`s. - fn transcode(&mut self, mut run: ColInput) { - let mut builder = Builder::new(); - for ((k, v), t, d) in run.drain(..) 
{ builder.push(k, v, t, d); } - self.ready.extend(builder.finish()); - } - fn pop(&mut self) -> Option<&mut ColChunk> { - if let Some(c) = self.ready.pop_front() { - self.output = c; - Some(&mut self.output) - } else { - None - } - } - } - - impl ContainerBuilder for ColChunker - where ColInput: SizableContainer + Consolidate { - type Container = ColChunk; - fn extract(&mut self) -> Option<&mut ColChunk> { - if self.ready.is_empty() { - if let Some(run) = self.scratch.extract() { - let run = std::mem::take(run); - self.transcode(run); - } - } - self.pop() - } - fn finish(&mut self) -> Option<&mut ColChunk> { - if self.ready.is_empty() { - if let Some(run) = self.scratch.finish() { - let run = std::mem::take(run); - self.transcode(run); - } - } - self.pop() - } - } - - impl<'a, L: Layout, Input> PushInto<&'a mut Input> for ColChunker - where ContainerChunker>: PushInto<&'a mut Input> { - fn push_into(&mut self, input: &'a mut Input) { self.scratch.push_into(input); } - } - - impl Accountable for ColChunk { - fn record_count(&self) -> i64 { self.0.updates as i64 } - } - - /// Merge batcher for columnar chunks: merges `ColChunk` chains via [`ChunkMerger`]. - /// The `Input → ColChunk` bridge is [`ColChunker`], now supplied at the `arrange_core` - /// callsite (it consolidates `Input` in a `Vec` scratch and transcodes to `ColChunk`). - pub type ColChunkBatcher = MergeBatcher>>; - - #[cfg(test)] - mod test { - use super::{Builder, ColChunk}; - use crate::trace::cursor::Cursor; - use crate::trace::{BatchReader, Description}; - use crate::trace::chunk::{is_graded, ChunkBatch, ChunkList}; - use crate::trace::implementations::Vector; - use timely::progress::Antichain; - - type CC = ColChunk>; - - /// Build graded columnar chunks from a sorted, consolidated update stream. 
- fn build(updates: Vec<((u64, u64), u64, i64)>) -> Vec { - let mut b = Builder::new(); - for ((k, v), t, d) in &updates { b.push(*k, *v, *t, *d); } - b.finish() - } - - /// Flatten chunks back to a `(key, val, time, diff)` stream. - fn dump(chunks: &[CC]) -> Vec<((u64, u64), u64, i64)> { - let mut out = Vec::new(); - for c in chunks { - let mut v = Vec::new(); - super::collect_in_range(&c.0.storage, 0, None, &mut v); - for ((k, vv, t), d) in v { out.push(((k, vv), t, d)); } - } - out - } - - // A columnar chunk built from a sorted stream must read back, through the - // batch cursor, exactly the grouped updates — including a key and a - // `(key, val)` that straddle chunk boundaries. - #[test] - fn cursor_round_trips() { - let updates = vec![ - ((0u64, 0u64), 0u64, 1i64), - ((1, 0), 0, 1), - ((1, 1), 0, 1), - ((1, 1), 1, 1), - ((1, 2), 0, 1), - ((2, 0), 0, 1), - ]; - let chunks = build(updates.clone()); - assert!(!chunks.is_empty()); - let desc = Description::new( - Antichain::from_elem(0u64), - Antichain::from_elem(2u64), - Antichain::from_elem(0u64), - ); - let batch = ChunkBatch::new(chunks, desc); - let mut cursor = batch.cursor(); - let got = cursor.to_vec(&batch, |k| *k, |v| *v); - let want = vec![ - ((0u64, 0u64), vec![(0u64, 1i64)]), - ((1, 0), vec![(0, 1)]), - ((1, 1), vec![(0, 1), (1, 1)]), - ((1, 2), vec![(0, 1)]), - ((2, 0), vec![(0, 1)]), - ]; - assert_eq!(got, want); - } - - // Building a run larger than `TARGET` yields multiple graded chunks (cut at - // val boundaries), and the singleton optimization keeps storage compact. - #[test] - fn build_cuts_and_grades() { - use super::TARGET; - // Distinct keys, each a single val with one update at the same (time,diff) - // — the singleton case. 3·TARGET keys ⇒ several chunks. 
- let n = 3 * TARGET as u64; - let updates: Vec<_> = (0..n).map(|k| ((k, 0u64), 0u64, 1i64)).collect(); - let chunks = build(updates); - assert!(chunks.len() >= 3, "expected several chunks, got {}", chunks.len()); - let total: usize = chunks.iter().map(|c| c.0.updates).sum(); - assert_eq!(total, n as usize); - // Every chunk but the last is at least TARGET (cut on reaching it). - for c in &chunks[..chunks.len() - 1] { - assert!(c.0.updates >= TARGET); - } - } - - // `merge_chains` over columnar chunks must consolidate equal `(k,v,t)` across - // chains and reproduce the sorted, summed contents. - #[test] - fn merge_chains_consolidates() { - use crate::trace::chunk::merge_chains; - let a = build(vec![((0, 0), 0, 1), ((1, 0), 0, 1)]); - let b = build(vec![((0, 0), 0, 1), ((2, 0), 0, 1)]); - let mut out = ChunkList::default(); - merge_chains(a, b, &mut out); - assert_eq!(dump(&out.done()), vec![((0, 0), 0, 2), ((1, 0), 0, 1), ((2, 0), 0, 1)]); - } - - // Merging two large interleaving chains yields a graded sequence with exactly - // the merged contents. - #[test] - fn merge_chains_grades() { - use super::TARGET; - use crate::trace::chunk::{is_graded, merge_chains}; - let n = 4 * TARGET as u64; - let evens = build((0..n).step_by(2).map(|k| ((k, 0), 0, 1)).collect()); - let odds = build((0..n).step_by(2).map(|k| ((k + 1, 0), 0, 1)).collect()); - let mut out = ChunkList::default(); - merge_chains(evens, odds, &mut out); - let chunks = out.done(); - assert!(is_graded(&chunks), "ungraded: {:?}", - chunks.iter().map(super::super::Chunk::len).collect::>()); - let want: Vec<_> = (0..n).map(|k| ((k, 0u64), 0u64, 1i64)).collect(); - assert_eq!(dump(&chunks), want); - } - - // `regrade` (via `ChunkList`) coalesces small adjacent chunks while - // preserving sorted contents. - #[test] - fn regrade_coalesces() { - // Many tiny single-update chunks with distinct increasing keys. 
- let n = 5 * super::TARGET as u64; - let mut list = ChunkList::default(); - for k in 0..n { - let mut c = build(vec![((k, 0u64), 0u64, 1i64)]); - assert_eq!(c.len(), 1); - list.push(c.pop().unwrap()); - } - let chunks = list.done(); - // Coalesced to roughly n/TARGET chunks, all sorted and accounted for. - let total: usize = chunks.iter().map(super::super::Chunk::len).sum(); - assert_eq!(total, n as usize); - assert!(is_graded(&chunks), "ungraded: {:?}", - chunks.iter().map(super::super::Chunk::len).collect::>()); - } - - // `extract` partitions by frontier (kept `>=`, shipped `<`), folds the kept - // frontier into `residual`, and emits graded chunks on both sides. - #[test] - fn extract_partitions() { - use super::super::Chunk; - let n = 4 * super::TARGET as u64; - // Distinct keys; even keys at time 0 (ship), odd keys at time 1 (keep). - let mut chunks = build((0..n).map(|i| ((i, 0), i % 2, 1)).collect()); - let frontier = Antichain::from_elem(1u64); - let mut residual = Antichain::new(); - let (mut keep, mut ship) = (ChunkList::default(), ChunkList::default()); - CC::extract(&mut chunks, &frontier, &mut residual, &mut keep, &mut ship); - let (keep, ship) = (keep.done(), ship.done()); - assert_eq!(residual, Antichain::from_elem(1u64)); - assert!(is_graded(&keep) && is_graded(&ship)); - let (kd, sd) = (dump(&keep), dump(&ship)); - assert_eq!(kd.len() + sd.len(), n as usize); - assert!(kd.iter().all(|u| u.1 == 1)); - assert!(sd.iter().all(|u| u.1 == 0)); - } - - // Resumable `advance` (chunk-at-a-time) must match a single all-at-once flush, - // with every time advanced to the frontier. - #[test] - fn advance_resumable_matches_oneshot() { - use crate::trace::chunk::AdvanceQueue; - let frontier = Antichain::from_elem(3u64); - // One sorted run, split across three pushes; groups straddle the splits. 
- let input = || vec![ - vec![((0u64, 0u64), 0u64, 1i64), ((0, 0), 1, 1), ((1, 0), 0, 1)], - vec![((1, 0), 5, 1), ((1, 1), 0, 1), ((2, 0), 0, 1)], - vec![((2, 0), 2, 1), ((2, 0), 9, 1)], - ]; - let oneshot = { - let mut q = AdvanceQueue::new(frontier.clone()); - let mut out = ChunkList::default(); - for g in input() { q.push(build(g), &mut out); } - q.finish(&mut out); - dump(&out.done()) - }; - let incremental = { - let mut q = AdvanceQueue::new(frontier.clone()); - let mut out = ChunkList::default(); - for g in input() { - for c in build(g) { q.push(std::iter::once(c), &mut out); } - } - q.finish(&mut out); - dump(&out.done()) - }; - assert_eq!(oneshot, incremental); - for u in &oneshot { assert!(u.1 >= 3); } - } - - // End-to-end: merge two columnar batches through the real `ChunkBatchMerger` - // (which drives merge + advance), then read the result through the batch - // cursor. Validates the whole columnar pipeline. - #[test] - fn batch_merge_end_to_end() { - use crate::trace::cursor::Cursor; - use crate::trace::{BatchReader, Description, Merger}; - use crate::trace::chunk::{ChunkBatch, ChunkBatchMerger}; - - let desc = |lo: u64, up: u64| Description::new( - Antichain::from_elem(lo), - Antichain::from_elem(up), - Antichain::from_elem(0u64), - ); - let b1 = ChunkBatch::new(build(vec![((0, 0), 0, 1), ((1, 0), 1, 1)]), desc(0, 1)); - let b2 = ChunkBatch::new(build(vec![((0, 0), 1, 1), ((2, 0), 0, 1)]), desc(1, 2)); - - let frontier = Antichain::from_elem(0u64); // since == minimum: no advance - let mut m = ChunkBatchMerger::new(&b1, &b2, frontier.borrow()); - let mut fuel = isize::MAX; - m.work(&b1, &b2, &mut fuel); - let merged = m.done(); - - let mut cursor = merged.cursor(); - let got = cursor.to_vec(&merged, |k| *k, |v| *v); - let want = vec![ - ((0u64, 0u64), vec![(0u64, 1i64), (1, 1)]), - ((1, 0), vec![(1, 1)]), - ((2, 0), vec![(0, 1)]), - ]; - assert_eq!(got, want); - } - } -} From e03512127c32a1eadac84507540890bfa839cb93 Mon Sep 17 00:00:00 2001 From: 
Frank McSherry Date: Thu, 11 Jun 2026 13:39:08 -0400 Subject: [PATCH 3/9] chunk: align chunker absorb point with TARGET; weigh batcher ladder by updates MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two merge-batcher / chunker fixes for the bfs/probe regression (plan 1.5, 1.4): 1.5 — VecChunk's SizableContainer absorbs to TARGET updates (len >= TARGET, ensure_capacity reserves TARGET) instead of timely's byte-derived buffer size, so chunks arrive pre-graded rather than re-melded downstream. 1.4 — MergeBatcher's geometric ladder weighs chains by summed updates, not chunk counts: regrading decouples the two, so a trickle of single-update chunks re-merged the head chain on every insert. A chain is immutable until merged, so the weight is cached alongside it (chains: Vec<(usize, Vec)>). Also refocuses the Merger trait: the bundled account() -> (records, size, capacity, allocations) splits into len() -> usize (update count, drives the ladder and the logger's records field) and a defaulted allocation() -> (size, capacity, allocations) for memory telemetry. BatcherEvent's shape is unchanged. NOTE: breaking change for out-of-tree Merger implementors (e.g. Materialize) — rename account -> len, optionally override allocation. chunks 100k/200k (u64 probes): ~127ms -> ~100ms queries-complete. Co-Authored-By: Claude Opus 4.8 (1M context) --- differential-dataflow/src/trace/chunk.rs | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/differential-dataflow/src/trace/chunk.rs b/differential-dataflow/src/trace/chunk.rs index b3eb6589b..326f28e9b 100644 --- a/differential-dataflow/src/trace/chunk.rs +++ b/differential-dataflow/src/trace/chunk.rs @@ -392,7 +392,7 @@ where ship.extend(shipped.done()); } - fn account(chunk: &C) -> (usize, usize, usize, usize) { (chunk.len(), 0, 0, 0) } + fn len(chunk: &C) -> usize { chunk.len() } } /// The merge batcher for chunks of type `C`, merging pre-chunked `C` runs. 
@@ -871,9 +871,13 @@ pub mod vec_chunk { impl SizableContainer for VecChunk where K: Clone+'static, V: Clone+'static, T: Clone+'static, R: Clone+'static { - fn at_capacity(&self) -> bool { self.0.at_capacity() } + // The absorb point is the grading target: the chunker fills a scratch chunk + // to `TARGET` updates before emitting, so chunks arrive pre-graded rather than + // at timely's byte-derived buffer size (which downstream regrading re-melds). + fn at_capacity(&self) -> bool { self.0.len() >= TARGET } fn ensure_capacity(&mut self, _stash: &mut Option) { - Rc::make_mut(&mut self.0).ensure_capacity(&mut None); + let inner = Rc::make_mut(&mut self.0); + inner.reserve(TARGET.saturating_sub(inner.len())); } } From 9cd111cda185a1e11b2edc5c6b6254c326160246 Mon Sep 17 00:00:00 2001 From: Frank McSherry Date: Thu, 11 Jun 2026 16:47:03 -0400 Subject: [PATCH 4/9] chunk: binary merge_pair primitive + run-copy merge (plan 1.2, 1.3) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 1.2 — invert the merge surface so binary merge_pair is the required primitive: merge_pair(&mut (usize,Self), &mut (usize,Self), out) is what the harnesses (merge_chains, the batch merger) drive directly; the k-way `merge` becomes a provided dispatcher (arity 0 / 2 / unimplemented!); `prune` loses its default (it was the only arity-1 merge caller). Backends with a genuine k-way merge (vec_chunk's merge_buf) override `merge`. 1.3 — vec_chunk::merge_pair is a dedicated two-pointer binary merge: one gallop pins the horizon, disjoint runs bulk-copy via extend_from_slice, only collisions consolidate element-wise. merge_buf stays as the k-way override and correctness reference. Chunks example ~100ms -> ~83ms. 
Fixes a latent bug in the spike's run-copy: it pinned the horizon at the lesser last (key,val), which is wrong for multi-chunk chains — a (key,val) group that straddles a chunk boundary on one side and overlaps times with the other gets the other side's whole group merged before its straddled continuation arrives, emitting duplicate, out-of-order (key,val,time) entries. The horizon must be (key,val,time), as the reference merge_buf already used. The spike's merge tests are single-chunk and never reached it. New property test merge_pair_matches_reference: 300 random multi-chunk merges (tiny chunks force straddling) vs a union-consolidate oracle. Co-Authored-By: Claude Opus 4.8 (1M context) --- differential-dataflow/src/trace/chunk.rs | 190 ++++++++++++++++++++--- 1 file changed, 171 insertions(+), 19 deletions(-) diff --git a/differential-dataflow/src/trace/chunk.rs b/differential-dataflow/src/trace/chunk.rs index 326f28e9b..bfa7ffb55 100644 --- a/differential-dataflow/src/trace/chunk.rs +++ b/differential-dataflow/src/trace/chunk.rs @@ -199,26 +199,39 @@ pub trait Chunk: Sized + Clone + LayoutExt { /// Remove some first few updates, returning the remainder. /// - /// Implemented via a singleton `merge`: with one input there is no horizon to - /// hold back, so the whole suffix `[prefix..]` is emitted. The remainder of a - /// graded chunk is at most one graded chunk. - fn prune(self, prefix: usize) -> Self { - let mut buffer = ChunkList::default(); - Self::merge(&mut [(prefix, self)], &mut buffer); - let mut data = buffer.done(); - assert_eq!(data.len(), 1); - data.pop().unwrap() - } + /// The whole suffix `[prefix..]` is emitted as a single chunk; the remainder of + /// a graded chunk is at most one graded chunk. Used to flush a partially-consumed + /// merge head (its `prefix` is the consumed position). + fn prune(self, prefix: usize) -> Self; + + /// Merges as much as possible from each of two input chunks. 
+ /// + /// Each input is a chunk with its consumed-prefix position, which is not + /// intended for merging. The chunks are only able to merge through updates that + /// would be present in both inputs, generally up to the least last + /// `(key, val, time)` triple across the two. On return, the consumed prefix of + /// at least one input has advanced to that input's length, marking it drained + /// and signalling the caller to refill that slot. + /// + /// This is the required merge primitive; the harnesses in this module drive it + /// pairwise ([`merge_chains`], the batch merger). The k-way [`merge`](Chunk::merge) + /// is a provided convenience over it. + fn merge_pair(slot1: &mut (usize, Self), slot2: &mut (usize, Self), out: &mut ChunkList); /// Merges as much as possible from each of the input chunks. /// - /// Input chunks come with a number of consumed prefix updates, which are not - /// intended for merging. The chunks are only able to merge through updates - /// that would be present in all inputs, generally up to the least last - /// `(key, val, time)` triple across the inputs. On return, the consumed - /// prefix of at least one input has advanced to that input's length, marking - /// it drained and signalling the caller to refill that slot. - fn merge(chunks: &mut [(usize, Self)], out: &mut ChunkList); + /// The same contract as [`merge_pair`](Chunk::merge_pair), generalized to a + /// slice of inputs. The provided implementation handles exactly the arities the + /// harnesses produce: zero (nothing) and two (via `merge_pair`). A backend whose + /// merge is genuinely k-way (e.g. `vec_chunk`'s `merge_buf`) can override this; + /// callers wanting other arities require such an override. 
+ fn merge(chunks: &mut [(usize, Self)], out: &mut ChunkList) { + match chunks { + [] => {} + [slot1, slot2] => Self::merge_pair(slot1, slot2, out), + _ => unimplemented!("merge of arity {} requires a backend override (binary `merge_pair` is the required primitive; `prune` handles suffixes)", chunks.len()), + } + } /// Partition chunks into updates greater or equal `frontier` (`keep`) or not (`ship`). /// @@ -284,7 +297,8 @@ pub fn merge_chains( while head1.is_some() && head2.is_some() { let mut window = [head1.take().unwrap(), head2.take().unwrap()]; - C::merge(&mut window, out); + let [slot1, slot2] = &mut window; + C::merge_pair(slot1, slot2, out); let [(p1, c1), (p2, c2)] = window; // Refill whichever side(s) drained to length; keep partially-consumed ones. head1 = if p1 >= c1.len() { iter1.next().map(|c| (0, c)) } else { Some((p1, c1)) }; @@ -715,7 +729,8 @@ where // One merge step: present both heads, refill whichever drains. let mut window = [state.head1.take().unwrap(), state.head2.take().unwrap()]; let mut merged = ChunkList::default(); - C::merge(&mut window, &mut merged); + let [slot1, slot2] = &mut window; + C::merge_pair(slot1, slot2, &mut merged); let [(p1, c1), (p2, c2)] = window; state.head1 = if p1 >= c1.len() { clone_chunk(&source1.chunks, &mut state.idx1) } else { Some((p1, c1)) }; state.head2 = if p2 >= c2.len() { clone_chunk(&source2.chunks, &mut state.idx2) } else { Some((p2, c2)) }; @@ -1018,6 +1033,92 @@ pub mod vec_chunk { VecChunk(Rc::new(v)) } + /// A dedicated two-pointer binary merge: one gallop pins how far each side + /// may merge (through the lesser of the two last `(key, val)`s), then a + /// single pass consolidates equal `(key, val, time)` triples and bulk-copies + /// the disjoint runs as slices. The k-way [`merge`](Chunk::merge) override + /// below serves slice callers and is the correctness reference (property + /// test `merge_pair_matches_merge_buf`). 
+ fn merge_pair(slot1: &mut (usize, Self), slot2: &mut (usize, Self), out: &mut ChunkList) { + let (p1, c1) = slot1; + let (p2, c2) = slot2; + let s1 = &c1.0[..]; + let s2 = &c2.0[..]; + + // The merge horizon: the lesser of the two last `(key, val, time)`s. The + // side owning it drains fully; the other merges through it. The time must + // be part of the horizon: a `(key, val)` group can straddle a chunk + // boundary on the owning side, and a coarser `(key, val)` horizon would + // let the other side's whole group merge before that continuation arrives, + // emitting it unconsolidated (caught by `merge_pair_matches_reference`). + fn kv(u: &((K, V), T, R)) -> (&K, &V) { (&u.0.0, &u.0.1) } + fn kvt(u: &((K, V), T, R)) -> ((&K, &V), &T) { (kv(u), &u.1) } + let (end1, end2); + if kvt(&s1[s1.len() - 1]) <= kvt(&s2[s2.len() - 1]) { + let horizon = kvt(&s1[s1.len() - 1]); + end1 = s1.len(); + end2 = gallop(s2, *p2, |u| kvt(u) <= horizon); + } else { + let horizon = kvt(&s2[s2.len() - 1]); + end2 = s2.len(); + end1 = gallop(s1, *p1, |u| kvt(u) <= horizon); + } + + let mut result: Vec<((K, V), T, R)> = Vec::with_capacity(TARGET); + let mut flush = |result: &mut Vec<((K, V), T, R)>, force: bool| { + if result.len() >= TARGET || (force && !result.is_empty()) { + out.push(VecChunk(Rc::new(std::mem::replace(result, Vec::with_capacity(TARGET))))); + } + }; + + let (mut i, mut j) = (*p1, *p2); + while i < end1 && j < end2 { + let a = &s1[i]; + let b = &s2[j]; + match (kv(a), &a.1).cmp(&(kv(b), &b.1)) { + // Copy the whole run of one side strictly below the other's head: + // collisions are impossible within it, so it moves as slices (cut + // at the grading target) rather than element by element. 
+ std::cmp::Ordering::Less => { + let run = gallop(s1, i + 1, |u| (kv(u), &u.1) < (kv(b), &b.1)).min(end1); + for piece in s1[i..run].chunks(TARGET) { + result.extend_from_slice(piece); + flush(&mut result, false); + } + i = run; + } + std::cmp::Ordering::Greater => { + let run = gallop(s2, j + 1, |u| (kv(u), &u.1) < (kv(a), &a.1)).min(end2); + for piece in s2[j..run].chunks(TARGET) { + result.extend_from_slice(piece); + flush(&mut result, false); + } + j = run; + } + std::cmp::Ordering::Equal => { + let mut diff = a.2.clone(); + diff.plus_equals(&b.2); + if !diff.is_zero() { + result.push((a.0.clone(), a.1.clone(), diff)); + } + i += 1; + j += 1; + flush(&mut result, false); + } + } + } + // Bulk-copy the in-horizon tails, cutting at the grading target. + for tail in [&s1[i..end1], &s2[j..end2]] { + for piece in tail.chunks(TARGET) { + result.extend_from_slice(piece); + flush(&mut result, false); + } + } + flush(&mut result, true); + *p1 = end1; + *p2 = end2; + } + fn merge(chunks: &mut [(usize, Self)], out: &mut ChunkList) { let mut consumed: Vec = chunks.iter().map(|(c, _)| *c).collect(); { @@ -1429,6 +1530,57 @@ pub mod vec_chunk { assert_eq!(merged, want); } + // Property test: merging two *multi-chunk* chains (driven through `merge_pair` + // by `merge_chains`) reproduces the union of all updates, consolidated. Tiny + // chunks force `(key, val)` groups — which can span several times — to + // straddle chunk boundaries on both sides, exercising the refill path the + // single-chunk merge tests never reach. The independent oracle is + // `consolidate_updates` over the concatenation. + #[test] + fn merge_pair_matches_reference() { + use crate::trace::chunk::{ChunkList, merge_chains}; + use crate::consolidation::consolidate_updates; + + // Deterministic xorshift PRNG — no dev-dependency on `rand`. 
+ let mut seed = 0x2545F4914F6CDD1Du64; + let mut rng = move || { seed ^= seed << 13; seed ^= seed >> 7; seed ^= seed << 17; seed }; + + // A sorted, consolidated update set over a small (key, val, time) space, + // so the two chains collide and a `(key, val)` carries several times. + fn gen(rng: &mut impl FnMut() -> u64, n: usize) -> Vec<((u64, u64), u64, i64)> { + let mut v: Vec<((u64, u64), u64, i64)> = (0..n).map(|_| { + let k = rng() % 20; let val = rng() % 3; let t = rng() % 8; + let d = if rng() % 4 == 0 { -1 } else { 1 }; + ((k, val), t, d) + }).collect(); + consolidate_updates(&mut v); + v + } + // Split a consolidated set into a chain of small chunks (each sorted and + // consolidated; together globally sorted), so groups straddle boundaries. + fn chain(updates: &[((u64, u64), u64, i64)], sz: usize) -> Vec> { + updates.chunks(sz).map(|c| VecChunk(Rc::new(c.to_vec()))).collect() + } + + for _ in 0..300 { + let n1 = (rng() as usize % 60) + 1; + let u1 = gen(&mut rng, n1); + let n2 = (rng() as usize % 60) + 1; + let u2 = gen(&mut rng, n2); + if u1.is_empty() || u2.is_empty() { continue; } + let sz = (rng() as usize % 5) + 1; // tiny chunks → heavy straddling + + let mut out = ChunkList::default(); + merge_chains(chain(&u1, sz), chain(&u2, sz), &mut out); + let merged: Vec<_> = out.done().into_iter().flat_map(|c| (*c.0).clone()).collect(); + + let mut reference: Vec<_> = u1.iter().chain(u2.iter()).cloned().collect(); + consolidate_updates(&mut reference); + + assert_eq!(merged, reference, "chunk size {sz}\n u1={u1:?}\n u2={u2:?}"); + } + } + // `regrade` must produce a *maximal packing*: adjacent sub-`TARGET` chunks // that could combine into one legal chunk are coalesced (the prior rule left // any pair summing past `TARGET/2` alone), full chunks pass through, and From 05817599e44563417569e3abb90136a5da72489f Mon Sep 17 00:00:00 2001 From: Frank McSherry Date: Sun, 21 Jun 2026 18:08:08 -0400 Subject: [PATCH 5/9] chunk: unify the four transducers on one 
VecDeque protocol MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit merge / extract / advance / regrade had four different calling conventions and three support types (ChunkList, ChunkFeed, plus bare Vec); the variation was mostly incidental. Collapse them onto one stream-transducer protocol: consume from the front of a VecDeque input; withhold anything not yet committable by reforming it into one owned chunk and push_front-ing it back; append committed chunks to a VecDeque output; `done` forces the flush. Cross-call state is now only the deques — no index escapes a call. Concretely: * merge is binary, `(in1, in2, out)`, no `done`: it merges the two front chunks through their shared horizon, pops the drained side, and prunes+push_front's the partial one. The harness (merge_chains / ChunkBatchMerger) handles a drained input by flushing the other side's verbatim tail, so merge never reasons about end-of-input. The persisted `(usize, Self)` slot and the k-way `merge`/`merge_buf` are gone; `prune` is now an inline drain, off the trait. * advance / regrade keep `done` and withhold via push_front (advance's last group; regrade's sub-TARGET carry). advance reuses the front chunk's storage in place, so a giant key stays linear. * extract stays the one-shot splitter: drains its whole input, two outputs plus the residual frontier, no `done`. Grading is now a seal-time property, not a between-stage invariant: the producers emit near-graded output and a terminal regrade coalesces the seams. ChunkList (sink + regrade-on-push) splits into a raw VecDeque plus an explicit regrade stage; ChunkFeed (whose usize was dead for advance) and AdvanceQueue are deleted. ChunkBatchMerger becomes a fixed merge -> advance deque pipeline (regrade at done()); ChunkBuilder drives regrade directly. Net -176 lines. All chunk tests pass; the chunks example is unchanged at ~75ms vs ord's ~98ms (100k/200k). 
Co-Authored-By: Claude Opus 4.8 (1M context) --- differential-dataflow/src/trace/chunk.rs | 940 +++++++++-------------- 1 file changed, 382 insertions(+), 558 deletions(-) diff --git a/differential-dataflow/src/trace/chunk.rs b/differential-dataflow/src/trace/chunk.rs index bfa7ffb55..3e48c33c3 100644 --- a/differential-dataflow/src/trace/chunk.rs +++ b/differential-dataflow/src/trace/chunk.rs @@ -40,6 +40,8 @@ //! It does this by exposing a small set of chunk-oriented primitives, which are //! sufficient for harnesses for each of these tasks. +use std::collections::VecDeque; + use timely::progress::Antichain; use timely::progress::frontier::AntichainRef; use crate::lattice::Lattice; @@ -53,9 +55,6 @@ type KeyCon = <::Layout as Layout>::KeyContainer; /// The val container of chunk `C`'s layout. type ValCon = <::Layout as Layout>::ValContainer; -/// A partially consumed head and optional tail of chunks. -pub type ChunkFeed = ((usize, C), Vec); - /// Whether `chunks` satisfy the [`Chunk::TARGET`] grading invariant: every chunk /// at most `TARGET`, and every adjacent pair summing to more than `TARGET` (so no /// two neighbours could be combined into one legal chunk — a *maximal packing*). @@ -66,36 +65,17 @@ pub fn is_graded(chunks: &[C]) -> bool { && chunks.windows(2).all(|w| w[0].len() + w[1].len() > C::TARGET) } -/// A list of chunks that maintains the `C::regrade` structural invariant. +/// Regrade `input` to completion into a fresh graded `Vec` (see [`Chunk::regrade`]). /// -/// Producers `push` chunks in; each push runs `C::regrade`, which moves graded -/// runs into `data` and leaves anything not yet safe to emit in `todo`. `done` -/// flushes the remainder and yields the graded sequence. -pub struct ChunkList { - todo: Vec, - data: Vec, -} - -impl Default for ChunkList { - fn default() -> Self { Self { todo: Vec::new(), data: Vec::new() } } -} - -impl ChunkList { - /// Add a new chunk to the list, regrading as far as is safe. 
- pub fn push(&mut self, chunk: C) { - self.todo.push(chunk); - C::regrade(&mut self.todo, false, &mut self.data); - } - /// Add several chunks. - pub fn extend>(&mut self, chunks: I) { - for chunk in chunks { self.push(chunk); } - } - /// Finalize the list, flushing the remainder, and extract the graded sequence. - pub fn done(mut self) -> Vec { - C::regrade(&mut self.todo, true, &mut self.data); - assert!(self.todo.is_empty()); - self.data - } +/// A convenience for the one-shot callers (batch sealing, the batcher's merge and +/// extract) that have a whole sequence in hand and want it graded; the streaming +/// callers drive [`Chunk::regrade`] directly across ticks. +pub fn regrade_all(input: impl IntoIterator) -> Vec { + let mut input: VecDeque = input.into_iter().collect(); + let mut out = VecDeque::new(); + C::regrade(&mut input, true, &mut out); + debug_assert!(input.is_empty()); + out.into() } /// A consolidated, sorted sequence of `(data, time, diff)`. @@ -106,34 +86,53 @@ impl ChunkList { /// /// `Clone` is expected to be cheap — a refcount bump on shared backing storage, /// not a deep copy. The trace merger relies on this to read its (shared, -/// immutable) source batches by cloning chunks rather than consuming them, and -/// `prune` is likewise expected to be a range adjustment over shared storage. +/// immutable) source batches by cloning chunks rather than consuming them. /// /// A chunk *has* a [`Cursor`] over its own `(key, val, time, diff)` contents — /// the chunk is its own cursor `Storage`, mirroring [`BatchReader`]. This is what /// lets a batch cursor delegate downward: the batch indexes which chunk holds a /// key (reusing the chunk's `KeyContainer` / `ValContainer` for boundaries) and -/// then reads through that chunk's cursor. As with `merge`, we do not -/// provide this; the opaque chunk implementor does. +/// then reads through that chunk's cursor. We do not provide this; the opaque +/// chunk implementor does. 
+/// +/// # The transducer protocol +/// +/// The four chunk-producing operations ([`merge`](Chunk::merge), +/// [`extract`](Chunk::extract), [`advance`](Chunk::advance), +/// [`regrade`](Chunk::regrade)) are all *stream transducers* over `VecDeque`, +/// sharing one calling convention so an implementor learns it once: /// -/// # Implementor contract +/// * **Consume from the front.** Read chunks off the front of the input deque(s). +/// * **Withhold by pushing back.** Anything consumed but not yet safe to commit +/// (advance's still-growing last group; regrade's sub-`TARGET` carry; merge's +/// partially-consumed front) is reformed into a single owned chunk and +/// `push_front`ed back onto its input. The only cross-call state is therefore the +/// deques themselves — clean owned runs, no indices escape a call. +/// * **Commit by appending.** Append committed chunks to the output deque; once +/// appended they are written and a downstream stage may take them immediately. +/// * **`done` forces the flush.** The unary stages take `done: bool`; while it is +/// false they may withhold, and a call that appends nothing has yielded — the +/// harness will not call again until more input arrives or `done` flips true. On +/// `done` the stage must drain its withheld state (the harness keeps calling +/// until the output stops growing). /// -/// The chunk-producing operations (`merge`, `extract`, `advance`, `regrade`) emit -/// into a [`ChunkList`], and implementors are expected to: +/// Two operations vary only where their job demands it: [`merge`](Chunk::merge) is +/// binary (and the harness, not `merge`, handles a drained input by flushing the +/// other side's verbatim tail, so `merge` needs no `done`); [`extract`](Chunk::extract) +/// is the one-shot splitter (it drains its whole input, so it needs no `done` and +/// has two outputs plus a residual frontier). 
/// -/// * **Respect the chain structure.** Emit *graded* chunks — sized to the -/// `regrade` invariant — rather than collapsing a run into one monolithic chunk -/// and leaning on `regrade` to re-split it. Building the right shape directly -/// avoids a redundant copy. -/// * **Bound output by input consumed.** Produce output chunks in proportion to -/// the input chunks consumed, never buffering an unbounded amount before -/// emitting. The fueled merger debits progress by the work it feeds across -/// suspensions; output that lags input arbitrarily breaks that accounting. -/// * **Recycle where possible.** Reuse the storage of chunks drained from the -/// input as the buffers for output, so allocations balance input against output -/// rather than allocating afresh per emitted chunk. `vec_chunk::extract` is the -/// worked example: it fills `TARGET`-sized buffers reclaimed from a stash of -/// emptied input `Vec`s. +/// Implementors are further expected to: +/// +/// * **Emit near-graded output.** Fill `TARGET`-sized output chunks directly rather +/// than emitting one monolithic chunk; the terminal [`regrade`](Chunk::regrade) +/// only has to coalesce the trailing partials at the seams. Grading is a +/// *seal-time* property, not an invariant maintained between stages. +/// * **Recycle where possible.** Reuse the storage of chunks drained from the input +/// as the buffers for output, so allocations balance input against output rather +/// than allocating afresh per emitted chunk. `vec_chunk` is the worked example: it +/// fills buffers reclaimed from a stash of emptied input `Vec`s, and advance reuses +/// its withheld carry's storage in place so a giant key stays linear, not quadratic. /// /// [`BatchReader`]: crate::trace::BatchReader pub trait Chunk: Sized + Clone + LayoutExt { @@ -197,168 +196,94 @@ pub trait Chunk: Sized + Clone + LayoutExt { /// they reach a chunk sequence, and [`ChunkBatch::new`] asserts the invariant. 
fn len(&self) -> usize; - /// Remove some first few updates, returning the remainder. - /// - /// The whole suffix `[prefix..]` is emitted as a single chunk; the remainder of - /// a graded chunk is at most one graded chunk. Used to flush a partially-consumed - /// merge head (its `prefix` is the consumed position). - fn prune(self, prefix: usize) -> Self; - - /// Merges as much as possible from each of two input chunks. + /// Merge the fronts of two input deques through their shared horizon. /// - /// Each input is a chunk with its consumed-prefix position, which is not - /// intended for merging. The chunks are only able to merge through updates that - /// would be present in both inputs, generally up to the least last - /// `(key, val, time)` triple across the two. On return, the consumed prefix of - /// at least one input has advanced to that input's length, marking it drained - /// and signalling the caller to refill that slot. + /// Both deques are non-empty (the caller guarantees it). The two front chunks + /// merge through updates present in both — up to the least last `(key, val, time)` + /// triple across them — consolidating collisions and emitting committed chunks to + /// `out`. The side owning the horizon is fully consumed and `pop_front`ed; the + /// other's partially-consumed front is reformed (its consumed prefix dropped) and + /// `push_front`ed back. So on return at least one deque has had its front retired. /// - /// This is the required merge primitive; the harnesses in this module drive it - /// pairwise ([`merge_chains`], the batch merger). The k-way [`merge`](Chunk::merge) - /// is a provided convenience over it. - fn merge_pair(slot1: &mut (usize, Self), slot2: &mut (usize, Self), out: &mut ChunkList); - - /// Merges as much as possible from each of the input chunks. 
+ /// `merge` makes one front-pair's worth of progress and returns; the harness + /// re-ticks it, refilling a drained deque from its source, and itself handles an + /// exhausted source by flushing the other deque's verbatim tail — so `merge` needs + /// no `done` and never has to reason about end-of-input. + fn merge(in1: &mut VecDeque, in2: &mut VecDeque, out: &mut VecDeque); + + /// Partition the input by `frontier` into updates greater-or-equal it (`keep`) or + /// not (`ship`). One-shot: the whole of `input` is consumed. /// - /// The same contract as [`merge_pair`](Chunk::merge_pair), generalized to a - /// slice of inputs. The provided implementation handles exactly the arities the - /// harnesses produce: zero (nothing) and two (via `merge_pair`). A backend whose - /// merge is genuinely k-way (e.g. `vec_chunk`'s `merge_buf`) can override this; - /// callers wanting other arities require such an override. - fn merge(chunks: &mut [(usize, Self)], out: &mut ChunkList) { - match chunks { - [] => {} - [slot1, slot2] => Self::merge_pair(slot1, slot2, out), - _ => unimplemented!("merge of arity {} requires a backend override (binary `merge_pair` is the required primitive; `prune` handles suffixes)", chunks.len()), - } - } - - /// Partition chunks into updates greater or equal `frontier` (`keep`) or not (`ship`). - /// - /// The lower envelope of the times routed to `keep` is folded into - /// `residual`, so the caller learns the frontier of data it still holds - /// without a second pass over the chunks. + /// The lower envelope of the times routed to `keep` is folded into `residual`, so + /// the caller learns the frontier of data it still holds without a second pass. + /// Outputs are near-graded but not regraded; a terminal [`regrade`](Chunk::regrade) + /// zips up the seams. 
fn extract( - chunks: &mut Vec, + input: &mut VecDeque, frontier: &Antichain, residual: &mut Antichain, - keep: &mut ChunkList, - ship: &mut ChunkList, + keep: &mut VecDeque, + ship: &mut VecDeque, ); - /// Advance times in input chunks by `frontier` and push consolidated result out. + /// Advance times by `frontier`, consolidating each complete `(key, val)` group from + /// the front of `input` into `out`. /// - /// To be certainly consolidated, all `(key, val)` updates must be present in - /// the input, or `done` must be set. A run of chunks may fail to be emitted if - /// they all share the same `(key, val)` and the implementor cannot be sure no - /// future times for the pair are yet to arrive. + /// A group is complete once a later `(key, val)` is seen, so every group but the + /// last is emitted; the last (which a future call might extend) is reformed and + /// `push_front`ed back as the withheld carry — unless `done`, which flushes it too. + /// The degenerate case is a single `(key, val)` spanning all available input: no + /// group is provably complete, so nothing is committed (the whole buffer is + /// withheld) until `done`. fn advance( - feed: &mut ChunkFeed, + input: &mut VecDeque, frontier: &Antichain, done: bool, - out: &mut ChunkList, + out: &mut VecDeque, ); - /// Reshapes a sequence of consolidated chunks into a maximal packing: each at - /// most [`TARGET`](Chunk::TARGET), and any two adjacent chunks summing past - /// `TARGET` (so no neighbours could be combined). See [`is_graded`]. + /// Reshape the front of `input` into a maximal packing in `out`: each chunk at most + /// [`TARGET`](Chunk::TARGET), and any two adjacent summing past `TARGET` (so no + /// neighbours could be combined). See [`is_graded`]. /// - /// The implementor should guard against emitting sequences of chunks that violate - /// the invariant, until the set `done` indicates that the queues is complete. 
- /// The implementor is allowed to push back at `queue` if it needs, but should - /// not corrupt the order of chunks and updates. + /// The terminal stage of every pipeline. A sub-`TARGET` carry that might still grow + /// is `push_front`ed back as the withheld remainder until `done`, which flushes it. fn regrade( - queue: &mut Vec, + input: &mut VecDeque, done: bool, - out: &mut Vec, + out: &mut VecDeque, ); } -/// Merge two sorted chains of chunks into one sorted chain. +/// Merge two full chains of chunks into one, to completion, appending to `out`. /// -/// Presents the heads of `chain1` and `chain2` to [`Chunk::merge`], each -/// tagged with the prefix already consumed. After each call at least one head has -/// been drained to its length; that slot is refilled from its chain. When either -/// chain is exhausted, the partially-consumed remainder of the other is pruned of -/// its consumed prefix and the rest of that chain is appended verbatim. +/// The whole-chain (non-fueled) driver used by the batcher's +/// [`Merger`](crate::trace::implementations::merge_batcher::Merger): both chains are in +/// hand, so it ticks [`Chunk::merge`] until one deque empties, then appends the other's +/// remainder (the verbatim tail). Output is near-graded; callers regrade as needed. pub fn merge_chains( chain1: Vec, chain2: Vec, - out: &mut ChunkList, + out: &mut VecDeque, ) { - let mut iter1 = chain1.into_iter(); - let mut iter2 = chain2.into_iter(); - - // Current head of each chain, tagged with its consumed prefix; `None` once - // that chain's iterator is exhausted. - let mut head1 = iter1.next().map(|c| (0, c)); - let mut head2 = iter2.next().map(|c| (0, c)); - - while head1.is_some() && head2.is_some() { - let mut window = [head1.take().unwrap(), head2.take().unwrap()]; - let [slot1, slot2] = &mut window; - C::merge_pair(slot1, slot2, out); - let [(p1, c1), (p2, c2)] = window; - // Refill whichever side(s) drained to length; keep partially-consumed ones. 
- head1 = if p1 >= c1.len() { iter1.next().map(|c| (0, c)) } else { Some((p1, c1)) }; - head2 = if p2 >= c2.len() { iter2.next().map(|c| (0, c)) } else { Some((p2, c2)) }; - } - - // One chain is exhausted; flush the partially-consumed remainder of the other, - // then its untouched tail. - for head in [head1, head2] { - if let Some((consumed, chunk)) = head { - // A retained head always has `consumed < len` (a fully-consumed one - // would have been refilled), so the pruned remainder is non-empty. - let chunk = if consumed > 0 { chunk.prune(consumed) } else { chunk }; - out.push(chunk); - } - } - out.extend(iter1); - out.extend(iter2); -} - -/// Drives [`Chunk::advance`] over a growing queue of chunks. -/// -/// Compaction may need to see several chunks before it can emit a consolidated -/// output chunk, because a `(key, val)` run can span chunk boundaries. The -/// implementor owns the `(next, tail)` representation and rotates it itself: it -/// can consume across chunks by amounts the driver cannot see, so the driver -/// never promotes from `tail` into `next`. The driver only appends incoming -/// chunks to `tail` and calls `advance`; a final [`Self::finish`] sets `done` to -/// flush whatever was being withheld. -pub struct AdvanceQueue { - /// The chunks awaiting advancement, as a head (with consumed prefix) and tail; - /// the implementor owns rotation between them. - feed: ChunkFeed, - /// Frontier to advance times by during compaction. - frontier: Antichain, -} - -impl AdvanceQueue { - /// A compactor that advances times by `frontier`. - pub fn new(frontier: Antichain) -> Self { - Self { feed: ((0, C::default()), Vec::new()), frontier } - } - /// Append a completed merge's chunks and advance as far as is certain. - pub fn push>(&mut self, chunks: I, out: &mut ChunkList) { - self.feed.1.extend(chunks); - C::advance(&mut self.feed, &self.frontier, false, out); - } - /// Flush all remaining updates; no further chunks will be pushed. 
- pub fn finish(mut self, out: &mut ChunkList) { - C::advance(&mut self.feed, &self.frontier, true, out); + let mut in1: VecDeque = chain1.into(); + let mut in2: VecDeque = chain2.into(); + while !in1.is_empty() && !in2.is_empty() { + C::merge(&mut in1, &mut in2, out); } + // One deque is empty; the other's remainder is all greater than everything merged. + out.extend(in1.drain(..)); + out.extend(in2.drain(..)); } /// A merge-batcher [`Merger`](crate::trace::implementations::merge_batcher::Merger) /// over chains of [`Chunk`]s. /// -/// `merge` runs the binary merger; `extract` splits by the seal frontier using -/// [`Chunk::extract`]. The batcher consolidates equal `(data, time)` updates -/// but does *not* advance times — time advancement is advance's job, handled -/// later in the trace. +/// `merge` runs the whole-chain binary merger; `extract` splits by the seal frontier +/// using [`Chunk::extract`]. The batcher consolidates equal `(data, time)` updates +/// but does *not* advance times — time advancement is advance's job, handled later in +/// the trace. Both regrade their output, since the batcher's chains want to be graded. pub struct ChunkMerger { _marker: std::marker::PhantomData, } @@ -382,15 +307,14 @@ where output: &mut Vec, _stash: &mut Vec, ) { - // The merge-batcher's chains are plain `Vec`s; grade through a `ChunkList`. - let mut graded = ChunkList::default(); - merge_chains(list1, list2, &mut graded); - output.extend(graded.done()); + let mut merged = VecDeque::new(); + merge_chains(list1, list2, &mut merged); + output.extend(regrade_all(merged)); } fn extract( &mut self, - mut merged: Vec, + merged: Vec, upper: AntichainRef, frontier: &mut Antichain, ship: &mut Vec, @@ -400,10 +324,11 @@ where // `extract` keeps updates greater-or-equal `upper` and ships the rest, // folding the lower envelope of kept times into `frontier`. 
let upper = upper.to_owned(); - let (mut keep, mut shipped) = (ChunkList::default(), ChunkList::default()); - C::extract(&mut merged, &upper, frontier, &mut keep, &mut shipped); - kept.extend(keep.done()); - ship.extend(shipped.done()); + let mut input: VecDeque = merged.into(); + let (mut keep, mut shipped) = (VecDeque::new(), VecDeque::new()); + C::extract(&mut input, &upper, frontier, &mut keep, &mut shipped); + kept.extend(regrade_all(keep)); + ship.extend(regrade_all(shipped)); } fn len(chunk: &C) -> usize { chunk.len() } @@ -635,54 +560,41 @@ where } } -/// Live state of the binary merge: an index into each (shared, immutable) source -/// chain marking the next chunk to clone, and the current head of each (a cloned -/// chunk tagged with its consumed prefix). A head is `None` once its chain is -/// exhausted; the merge proper runs while both are `Some`. The indices are the -/// "cursor positions": the same sources arrive on each `work` call, so they are -/// stable across suspensions. -struct MergeState { - idx1: usize, - idx2: usize, - head1: Option<(usize, C)>, - head2: Option<(usize, C)>, -} - -/// Clone the chunk at `*idx` (if any), advancing `*idx`, tagged with prefix `0`. -fn clone_chunk(chunks: &[C], idx: &mut usize) -> Option<(usize, C)> { - let chunk = chunks.get(*idx)?.clone(); - *idx += 1; - Some((0, chunk)) -} - /// A merge of two [`ChunkBatch`]es in progress. /// /// This is the [`ChunkBatch`] merger, wired in as its /// [`Batch::Merger`](crate::trace::Batch::Merger), and has that trait's /// `new` / `work` / `done` shape. /// -/// The merge is *resumable*: `work` drains one [`Chunk::merge`]'s-worth of -/// updates per step, feeding the output into a live [`AdvanceQueue`], and stops once -/// `fuel` is exhausted, retaining the iterators, heads, and advancer for the -/// next call. 
Fuel is debited by the (consolidated) updates fed into the advancer; -/// summed over all steps this is the total *output*, not the input scanned — -/// matching how the trace's other mergers account (cf. `ord_neu`, which debits the -/// consolidated updates it stages). Compaction's final flush (`done = true`) rides -/// along uncounted, bounded by the data withheld during streaming. +/// The merge is *resumable* and runs a two-stage deque pipeline: +/// [`merge`](Chunk::merge) feeds `merged`, [`advance`](Chunk::advance) consumes it +/// into `advanced`; the terminal [`regrade`](Chunk::regrade) runs once at `done`. Each +/// `work` step clones a head from each source (the burst is head-of-each-list), ticks +/// `merge` once, then advances the fresh output, debiting `fuel` by the *merged* +/// records that entered the pipe — the total output across the merge, matching how the +/// trace's other mergers account (cf. `ord_neu`). The sources are read by *cloning* +/// chunks (a cheap refcount bump per the [`Chunk`] contract), never consumed or +/// mutated; the same `source1`/`source2` must be supplied on every call. When a source +/// exhausts, the harness flushes the other's verbatim tail one chunk per step. Once +/// both are drained, a final `advance(done)` flushes advance's withheld carry. pub struct ChunkBatchMerger { /// Compaction frontier supplied at construction. frontier: Antichain, /// Result frontiers, retained for the output description. lower: Antichain, upper: Antichain, - /// Merged-and-advanced chunks, grown by `work`. - result: ChunkList, - /// Live merge state; `None` before the first `work` and after merging completes. - state: Option>, - /// Live advancer; `Some` until its final flush, then `None`. - advancer: Option>, - /// Whether the inputs have been moved into `state` yet. - initialized: bool, + /// Input deques, refilled from the sources (clones) head-of-list at a time. 
+ in1: VecDeque, + in2: VecDeque, + /// Next source chunk to clone into `in1` / `in2`. + idx1: usize, + idx2: usize, + /// `advance`'s input: the merge output plus advance's withheld carry at the front. + merged: VecDeque, + /// `advance`'s output: the merged-and-advanced chunks, grown by `work`. + advanced: VecDeque, + /// Set once both sources are drained and advance's final flush has run. + complete: bool, } impl crate::trace::Merger> for ChunkBatchMerger @@ -698,67 +610,52 @@ where frontier: frontier.to_owned(), lower, upper, - result: ChunkList::default(), - state: None, - advancer: None, - initialized: false, + in1: VecDeque::new(), + in2: VecDeque::new(), + idx1: 0, + idx2: 0, + merged: VecDeque::new(), + advanced: VecDeque::new(), + complete: false, } } /// Advance the merge by up to `fuel` updates, suspending when it runs out. - /// - /// The sources are read by *cloning* chunks (a cheap refcount bump, per the - /// [`Chunk`] contract), never consumed or mutated, so they remain shared and - /// immutable. The same `source1`/`source2` must be supplied on every call. fn work(&mut self, source1: &ChunkBatch, source2: &ChunkBatch, fuel: &mut isize) { - if !self.initialized { - let mut idx1 = 0; - let mut idx2 = 0; - let head1 = clone_chunk(&source1.chunks, &mut idx1); - let head2 = clone_chunk(&source2.chunks, &mut idx2); - self.state = Some(MergeState { idx1, idx2, head1, head2 }); - self.advancer = Some(AdvanceQueue::new(self.frontier.clone())); - self.initialized = true; - } + if self.complete { return; } while *fuel > 0 { - let state = match &mut self.state { Some(s) => s, None => break }; - let advancer = self.advancer.as_mut().unwrap(); - - if state.head1.is_some() && state.head2.is_some() { - // One merge step: present both heads, refill whichever drains. 
- let mut window = [state.head1.take().unwrap(), state.head2.take().unwrap()]; - let mut merged = ChunkList::default(); - let [slot1, slot2] = &mut window; - C::merge_pair(slot1, slot2, &mut merged); - let [(p1, c1), (p2, c2)] = window; - state.head1 = if p1 >= c1.len() { clone_chunk(&source1.chunks, &mut state.idx1) } else { Some((p1, c1)) }; - state.head2 = if p2 >= c2.len() { clone_chunk(&source2.chunks, &mut state.idx2) } else { Some((p2, c2)) }; - let chunks = merged.done(); - let work: usize = chunks.iter().map(C::len).sum(); - advancer.push(chunks, &mut self.result); - *fuel -= work as isize; - } else if let Some((consumed, chunk)) = state.head1.take().or_else(|| state.head2.take()) { - // One chain exhausted; flush the partially-consumed head of the - // other. It was retained with `consumed < len`, so the pruned - // remainder is non-empty. - let chunk = if consumed > 0 { chunk.prune(consumed) } else { chunk }; - let work = chunk.len(); - advancer.push(std::iter::once(chunk), &mut self.result); - *fuel -= work as isize; - } else if let Some((_, chunk)) = clone_chunk(&source1.chunks, &mut state.idx1).or_else(|| clone_chunk(&source2.chunks, &mut state.idx2)) { - // Flush the untouched tail of the surviving chain, one chunk per step. - let work = chunk.len(); - advancer.push(std::iter::once(chunk), &mut self.result); - *fuel -= work as isize; + // Refill each empty input deque with the next source chunk (head-of-list + // burst). After this, a deque is non-empty iff its source still has data. + if self.in1.is_empty() && self.idx1 < source1.chunks.len() { + self.in1.push_back(source1.chunks[self.idx1].clone()); + self.idx1 += 1; + } + if self.in2.is_empty() && self.idx2 < source2.chunks.len() { + self.in2.push_back(source2.chunks[self.idx2].clone()); + self.idx2 += 1; + } + + // Merge's per-tick output (small: one front-pair, or one tail chunk), + // measured for fuel before it joins the carry already in `merged`. 
+ let mut produced = VecDeque::new(); + if !self.in1.is_empty() && !self.in2.is_empty() { + // Both sides have data: one front-pair merge. + C::merge(&mut self.in1, &mut self.in2, &mut produced); + } else if let Some(chunk) = self.in1.pop_front().or_else(|| self.in2.pop_front()) { + // Exactly one side has data: flush its verbatim tail, one chunk a step. + produced.push_back(chunk); } else { - // Both chains fully fed; flush withheld advancement and retire. - self.state = None; - if let Some(advancer) = self.advancer.take() { - advancer.finish(&mut self.result); - } + // Both sources drained: final flush of advance's withheld carry. + C::advance(&mut self.merged, &self.frontier, true, &mut self.advanced); + self.complete = true; break; } + + let work: usize = produced.iter().map(C::len).sum(); + self.merged.extend(produced); + C::advance(&mut self.merged, &self.frontier, false, &mut self.advanced); + *fuel -= work as isize; } } @@ -768,7 +665,7 @@ where /// positive), as the [`trace::Merger`](crate::trace::Merger) contract requires. fn done(self) -> ChunkBatch { let description = Description::new(self.lower, self.upper, self.frontier); - ChunkBatch::new(self.result.done(), description) + ChunkBatch::new(regrade_all(self.advanced), description) } } @@ -777,12 +674,14 @@ where /// /// The builder assumes its inputs arrive already sorted and consolidated (as the /// `Builder` contract requires), so it does no merging: each pushed chunk is an -/// ordered run, appended in order. They accumulate in a [`ChunkList`], which -/// regrades them to the size invariant as they arrive — so a batch built here is -/// graded like one produced by the merger, rather than inheriting whatever chunk -/// sizes the caller happened to push. +/// ordered run, fed straight to [`regrade`](Chunk::regrade) as it arrives — so a batch +/// built here is graded like one produced by the merger, rather than inheriting +/// whatever chunk sizes the caller happened to push. 
pub struct ChunkBuilder { - chunks: ChunkList, + /// Pushed chunks awaiting regrading; holds regrade's sub-`TARGET` carry at the front. + input: VecDeque, + /// The graded chunks emitted so far. + output: VecDeque, } impl crate::trace::Builder for ChunkBuilder @@ -795,25 +694,28 @@ where type Output = ChunkBatch; fn with_capacity(_keys: usize, _vals: usize, _upds: usize) -> Self { - Self { chunks: ChunkList::default() } + Self { input: VecDeque::new(), output: VecDeque::new() } } fn push(&mut self, chunk: &mut C) { let chunk = std::mem::take(chunk); - if chunk.len() > 0 { self.chunks.push(chunk); } + if chunk.len() > 0 { + self.input.push_back(chunk); + C::regrade(&mut self.input, false, &mut self.output); + } } fn done(self, description: Description) -> ChunkBatch { - ChunkBatch::new(self.chunks.done(), description) + let ChunkBuilder { mut input, mut output } = self; + C::regrade(&mut input, true, &mut output); + ChunkBatch::new(output.into(), description) } fn seal(chain: &mut Vec, description: Description) -> ChunkBatch { - // The chain is sorted and consolidated but not necessarily graded; regrade - // it. Already-sized chunks pass through as cheap `Rc` moves, so a chain that + // The chain is sorted and consolidated but not necessarily graded; regrade it. + // Already-`TARGET` chunks pass through as cheap `Rc` moves, so a chain that // arrives graded (as the batcher's does) pays only an O(#chunks) walk. - let mut chunks = ChunkList::default(); - chunks.extend(std::mem::take(chain)); - ChunkBatch::new(chunks.done(), description) + ChunkBatch::new(regrade_all(std::mem::take(chain)), description) } } @@ -831,13 +733,14 @@ pub mod vec_chunk { //! inner `Vec` via `Rc::make_mut` — free while a chunk is being built //! (refcount 1), and it never copies a *shared* chunk because batches are //! immutable once built. - //! * **Trace side.** [`Chunk`] (merge / extract / advance / prune / bounds) + //! 
* **Trace side.** [`Chunk`] (merge / extract / advance / regrade / bounds) //! plus a cursor. Key lookups are logarithmic by galloping search (`seek_*`), //! independent of chunk size; stepping stays linear (short hops). //! //! `Clone` is a refcount bump, so the trace merger shares source chunks instead //! of copying them. + use std::collections::VecDeque; use std::marker::PhantomData; use std::rc::Rc; @@ -851,7 +754,7 @@ pub mod vec_chunk { use crate::trace::cursor::Cursor; use crate::trace::implementations::{Vector, WithLayout}; - use super::{Chunk, ChunkFeed, ChunkList}; + use super::Chunk; /// The chunk size: both the maximum updates per chunk and the coalescing /// threshold (see [`Chunk::TARGET`]). Chosen for the reference impl; exposed as @@ -1027,113 +930,107 @@ pub mod vec_chunk { fn len(&self) -> usize { self.0.len() } - fn prune(self, prefix: usize) -> Self { - let mut v = take(self); - v.drain(..prefix); - VecChunk(Rc::new(v)) - } - - /// A dedicated two-pointer binary merge: one gallop pins how far each side - /// may merge (through the lesser of the two last `(key, val)`s), then a - /// single pass consolidates equal `(key, val, time)` triples and bulk-copies - /// the disjoint runs as slices. The k-way [`merge`](Chunk::merge) override - /// below serves slice callers and is the correctness reference (property - /// test `merge_pair_matches_merge_buf`). - fn merge_pair(slot1: &mut (usize, Self), slot2: &mut (usize, Self), out: &mut ChunkList) { - let (p1, c1) = slot1; - let (p2, c2) = slot2; - let s1 = &c1.0[..]; - let s2 = &c2.0[..]; - - // The merge horizon: the lesser of the two last `(key, val, time)`s. The - // side owning it drains fully; the other merges through it. 
The time must - // be part of the horizon: a `(key, val)` group can straddle a chunk - // boundary on the owning side, and a coarser `(key, val)` horizon would - // let the other side's whole group merge before that continuation arrives, - // emitting it unconsolidated (caught by `merge_pair_matches_reference`). + /// A dedicated two-pointer binary merge of the two front chunks: one gallop + /// pins how far each side may merge (through the lesser of the two last + /// `(key, val, time)`s), then a single pass consolidates equal triples and + /// bulk-copies disjoint runs as slices. The drained side's front is popped; the + /// other's partially-consumed front is pruned (its prefix dropped) and + /// `push_front`ed back — so the only persisted state is the deques themselves. + fn merge(in1: &mut VecDeque, in2: &mut VecDeque, out: &mut VecDeque) { fn kv(u: &((K, V), T, R)) -> (&K, &V) { (&u.0.0, &u.0.1) } fn kvt(u: &((K, V), T, R)) -> ((&K, &V), &T) { (kv(u), &u.1) } - let (end1, end2); - if kvt(&s1[s1.len() - 1]) <= kvt(&s2[s2.len() - 1]) { - let horizon = kvt(&s1[s1.len() - 1]); - end1 = s1.len(); - end2 = gallop(s2, *p2, |u| kvt(u) <= horizon); - } else { - let horizon = kvt(&s2[s2.len() - 1]); - end2 = s2.len(); - end1 = gallop(s1, *p1, |u| kvt(u) <= horizon); - } - let mut result: Vec<((K, V), T, R)> = Vec::with_capacity(TARGET); - let mut flush = |result: &mut Vec<((K, V), T, R)>, force: bool| { - if result.len() >= TARGET || (force && !result.is_empty()) { - out.push(VecChunk(Rc::new(std::mem::replace(result, Vec::with_capacity(TARGET))))); + // How far each front may merge, computed while the fronts are borrowed; the + // deques are mutated only after these borrows end. + let (end1, end2, len1, len2); + { + let s1 = &in1.front().unwrap().0[..]; + let s2 = &in2.front().unwrap().0[..]; + len1 = s1.len(); + len2 = s2.len(); + + // The merge horizon: the lesser of the two last `(key, val, time)`s. The + // side owning it drains fully; the other merges through it. 
The time must + // be part of the horizon: a `(key, val)` group can straddle a chunk + // boundary on the owning side, and a coarser `(key, val)` horizon would + // let the other side's whole group merge before that continuation arrives, + // emitting it unconsolidated (caught by `merge_matches_reference`). + if kvt(&s1[len1 - 1]) <= kvt(&s2[len2 - 1]) { + let horizon = kvt(&s1[len1 - 1]); + end1 = len1; + end2 = gallop(s2, 0, |u| kvt(u) <= horizon); + } else { + let horizon = kvt(&s2[len2 - 1]); + end2 = len2; + end1 = gallop(s1, 0, |u| kvt(u) <= horizon); } - }; - let (mut i, mut j) = (*p1, *p2); - while i < end1 && j < end2 { - let a = &s1[i]; - let b = &s2[j]; - match (kv(a), &a.1).cmp(&(kv(b), &b.1)) { - // Copy the whole run of one side strictly below the other's head: - // collisions are impossible within it, so it moves as slices (cut - // at the grading target) rather than element by element. - std::cmp::Ordering::Less => { - let run = gallop(s1, i + 1, |u| (kv(u), &u.1) < (kv(b), &b.1)).min(end1); - for piece in s1[i..run].chunks(TARGET) { - result.extend_from_slice(piece); - flush(&mut result, false); - } - i = run; + let mut result: Vec<((K, V), T, R)> = Vec::with_capacity(TARGET); + let mut flush = |result: &mut Vec<((K, V), T, R)>, force: bool| { + if result.len() >= TARGET || (force && !result.is_empty()) { + out.push_back(VecChunk(Rc::new(std::mem::replace(result, Vec::with_capacity(TARGET))))); } - std::cmp::Ordering::Greater => { - let run = gallop(s2, j + 1, |u| (kv(u), &u.1) < (kv(a), &a.1)).min(end2); - for piece in s2[j..run].chunks(TARGET) { - result.extend_from_slice(piece); + }; + + let (mut i, mut j) = (0, 0); + while i < end1 && j < end2 { + let a = &s1[i]; + let b = &s2[j]; + match (kv(a), &a.1).cmp(&(kv(b), &b.1)) { + // Copy the whole run of one side strictly below the other's head: + // collisions are impossible within it, so it moves as slices (cut + // at the grading target) rather than element by element. 
+ std::cmp::Ordering::Less => { + let run = gallop(s1, i + 1, |u| (kv(u), &u.1) < (kv(b), &b.1)).min(end1); + for piece in s1[i..run].chunks(TARGET) { + result.extend_from_slice(piece); + flush(&mut result, false); + } + i = run; + } + std::cmp::Ordering::Greater => { + let run = gallop(s2, j + 1, |u| (kv(u), &u.1) < (kv(a), &a.1)).min(end2); + for piece in s2[j..run].chunks(TARGET) { + result.extend_from_slice(piece); + flush(&mut result, false); + } + j = run; + } + std::cmp::Ordering::Equal => { + let mut diff = a.2.clone(); + diff.plus_equals(&b.2); + if !diff.is_zero() { + result.push((a.0.clone(), a.1.clone(), diff)); + } + i += 1; + j += 1; flush(&mut result, false); } - j = run; } - std::cmp::Ordering::Equal => { - let mut diff = a.2.clone(); - diff.plus_equals(&b.2); - if !diff.is_zero() { - result.push((a.0.clone(), a.1.clone(), diff)); - } - i += 1; - j += 1; + } + // Bulk-copy the in-horizon tails, cutting at the grading target. + for tail in [&s1[i..end1], &s2[j..end2]] { + for piece in tail.chunks(TARGET) { + result.extend_from_slice(piece); flush(&mut result, false); } } + flush(&mut result, true); } - // Bulk-copy the in-horizon tails, cutting at the grading target. - for tail in [&s1[i..end1], &s2[j..end2]] { - for piece in tail.chunks(TARGET) { - result.extend_from_slice(piece); - flush(&mut result, false); - } - } - flush(&mut result, true); - *p1 = end1; - *p2 = end2; - } - fn merge(chunks: &mut [(usize, Self)], out: &mut ChunkList) { - let mut consumed: Vec = chunks.iter().map(|(c, _)| *c).collect(); - { - let inputs: Vec<&[_]> = chunks.iter().map(|(_, ch)| &ch.0[..]).collect(); - merge_buf(&inputs, &mut consumed, out); - } - for (i, (c, _)) in chunks.iter_mut().enumerate() { *c = consumed[i]; } + // Retire the fronts: pop the drained side, prune+push_front the partial one. 
+ if end1 >= len1 { in1.pop_front(); } + else { let mut v = take(in1.pop_front().unwrap()); v.drain(..end1); in1.push_front(VecChunk(Rc::new(v))); } + if end2 >= len2 { in2.pop_front(); } + else { let mut v = take(in2.pop_front().unwrap()); v.drain(..end2); in2.push_front(VecChunk(Rc::new(v))); } } fn extract( - chunks: &mut Vec, + input: &mut VecDeque, frontier: &Antichain, residual: &mut Antichain, - keep: &mut ChunkList, - ship: &mut ChunkList, + keep: &mut VecDeque, + ship: &mut VecDeque, ) { // Fill `TARGET`-sized buffers directly, so the chunks pushed are already // graded and `regrade` passes them through as `Rc` moves rather than @@ -1143,46 +1040,44 @@ pub mod vec_chunk { let mut stash: Vec> = Vec::new(); let take_buf = |stash: &mut Vec<_>| stash.pop().unwrap_or_default(); let (mut k, mut s) = (take_buf(&mut stash), take_buf(&mut stash)); - for chunk in chunks.drain(..) { + for chunk in input.drain(..) { let mut v = take(chunk); for u in v.drain(..) { if frontier.borrow().less_equal(&u.1) { residual.insert_ref(&u.1); k.push(u); - if k.len() >= TARGET { keep.push(VecChunk(Rc::new(std::mem::replace(&mut k, take_buf(&mut stash))))); } + if k.len() >= TARGET { keep.push_back(VecChunk(Rc::new(std::mem::replace(&mut k, take_buf(&mut stash))))); } } else { s.push(u); - if s.len() >= TARGET { ship.push(VecChunk(Rc::new(std::mem::replace(&mut s, take_buf(&mut stash))))); } + if s.len() >= TARGET { ship.push_back(VecChunk(Rc::new(std::mem::replace(&mut s, take_buf(&mut stash))))); } } } stash.push(v); } - if !k.is_empty() { keep.push(VecChunk(Rc::new(k))); } - if !s.is_empty() { ship.push(VecChunk(Rc::new(s))); } + if !k.is_empty() { keep.push_back(VecChunk(Rc::new(k))); } + if !s.is_empty() { ship.push_back(VecChunk(Rc::new(s))); } } fn advance( - feed: &mut ChunkFeed, + input: &mut VecDeque, frontier: &Antichain, done: bool, - out: &mut ChunkList, + out: &mut VecDeque, ) { // Advance and consolidate every *complete* `(key, val)` group eagerly, // so its updates 
can be released as soon as the input proves no later // time for the pair can arrive. A group is contiguous in the sorted - // chain, so the only one that might continue in a future push is the - // last; unless `done`, we process up to its start and withhold the rest - // as the head for the next call. + // chain, so the only one that might continue in a future call is the last; + // unless `done`, we process up to its start and `push_front` the rest as + // the withheld carry for the next call. let mut stash: Vec> = Vec::new(); - let (consumed, ch) = &mut feed.0; - // Build the working buffer by *reusing the head's storage* and appending - // the tail (recycling each emptied tail `Vec`). Reusing the head is what - // keeps a withheld group from being recopied across calls: it just - // accumulates in place, so a `(key, val)` larger than the working set - // costs O(total) over the run rather than O(total²). - let mut buf = take(std::mem::take(ch)); - if *consumed > 0 { buf.drain(..*consumed); *consumed = 0; } - for chunk in feed.1.drain(..) { + // Build the working buffer by *reusing the front chunk's storage* (the + // carry from last time) and appending the rest (recycling each emptied + // `Vec`). Reusing the front is what keeps a withheld group from being + // recopied across calls: it just accumulates in place, so a `(key, val)` + // larger than the working set costs O(total) over the run, not O(total²). + let mut buf = match input.pop_front() { Some(chunk) => take(chunk), None => return }; + while let Some(chunk) = input.pop_front() { let mut v = take(chunk); buf.append(&mut v); stash.push(v); @@ -1190,17 +1085,17 @@ pub mod vec_chunk { if buf.is_empty() { return; } // If every available update shares one `(key, val)`, no group is provably - // complete — the next push may extend it — so make no progress unless - // `done`: retain the accumulated buffer as the head and return. 
This is + // complete — a later call may extend it — so make no progress unless + // `done`: push the accumulated buffer back as the carry and return. This is // the giant-key case; comparing only the first and last pair detects it - // without scanning, and reusing the head above makes the retention free. + // without scanning, and reusing the front above makes the retention free. if !done && buf[0].0 == buf[buf.len() - 1].0 { - *ch = VecChunk(Rc::new(buf)); + input.push_front(VecChunk(Rc::new(buf))); return; } // Otherwise at least the first group is complete. Withhold the last group - // (a single `(key, val)`) as the next head unless the input is complete. + // (a single `(key, val)`) as the next carry unless the input is complete. let end = if done { buf.len() } else { let last_kv = buf[buf.len() - 1].0.clone(); let mut start = buf.len(); @@ -1208,11 +1103,10 @@ pub mod vec_chunk { start }; if end < buf.len() { - let tail = buf.split_off(end); - *ch = VecChunk(Rc::new(tail)); + input.push_front(VecChunk(Rc::new(buf.split_off(end)))); } // Advance + consolidate each group into `TARGET`-sized output chunks, - // filling buffers reclaimed from the recycled tail `Vec`s. + // filling buffers reclaimed from the recycled `Vec`s. 
let mut result = stash.pop().unwrap_or_default(); let mut i = 0; while i < buf.len() { @@ -1231,15 +1125,15 @@ pub mod vec_chunk { while k < j && buf[k].1 == t { diff.plus_equals(&buf[k].2); k += 1; } if !diff.is_zero() { result.push((kv, t, diff)); - if result.len() >= TARGET { out.push(VecChunk(Rc::new(std::mem::replace(&mut result, stash.pop().unwrap_or_default())))); } + if result.len() >= TARGET { out.push_back(VecChunk(Rc::new(std::mem::replace(&mut result, stash.pop().unwrap_or_default())))); } } } i = j; } - if !result.is_empty() { out.push(VecChunk(Rc::new(result))); } + if !result.is_empty() { out.push_back(VecChunk(Rc::new(result))); } } - fn regrade(queue: &mut Vec, done: bool, out: &mut Vec) { + fn regrade(input: &mut VecDeque, done: bool, out: &mut VecDeque) { // Maximal packing: emit chunks as large as possible up to `TARGET`, // never splitting a pair that could combine into one legal (`<= TARGET`) // chunk. A chunk of exactly `TARGET` is maximal — it cannot grow — so it @@ -1249,31 +1143,31 @@ pub mod vec_chunk { // occasional trailing partial is coalesced. // // `carry` is the (sub-`TARGET`) chunk under construction. It is flushed - // once it reaches `TARGET`, carried back onto `queue` between calls, or - // emitted on `done`. Whenever `carry` is non-empty its left neighbour in + // once it reaches `TARGET`, `push_front`ed back onto `input` between calls, + // or emitted on `done`. Whenever `carry` is non-empty its left neighbour in // `out` is a `TARGET` chunk (or `carry` is `out`'s first chunk), so // emitting `carry` against a neighbour it cannot merge with — their sum // exceeds `TARGET` — keeps the packing maximal on both sides. let mut carry: Vec<((K, V), T, R)> = Vec::new(); - for chunk in queue.drain(..) { + while let Some(chunk) = input.pop_front() { if carry.is_empty() { absorb(chunk, &mut carry, out); } else if carry.len() + chunk.0.len() <= TARGET { // Combines into one legal chunk; coalesce in place. 
carry.extend(take(chunk)); if carry.len() == TARGET { - out.push(VecChunk(Rc::new(std::mem::take(&mut carry)))); + out.push_back(VecChunk(Rc::new(std::mem::take(&mut carry)))); } } else { // Cannot combine without exceeding `TARGET`; `carry` is maximal // against this neighbour, so emit it and absorb the chunk afresh. - out.push(VecChunk(Rc::new(std::mem::take(&mut carry)))); + out.push_back(VecChunk(Rc::new(std::mem::take(&mut carry)))); absorb(chunk, &mut carry, out); } } if !carry.is_empty() { let chunk = VecChunk(Rc::new(carry)); - if done { out.push(chunk); } else { queue.push(chunk); } + if done { out.push_back(chunk); } else { input.push_front(chunk); } } } } @@ -1282,11 +1176,11 @@ pub mod vec_chunk { /// sub-`TARGET` tail behind. fn peel( carry: &mut Vec<((K, V), T, R)>, - out: &mut Vec>, + out: &mut VecDeque>, ) { let mut start = 0; while carry.len() - start >= TARGET { - out.push(VecChunk(Rc::new(carry[start..start + TARGET].to_vec()))); + out.push_back(VecChunk(Rc::new(carry[start..start + TARGET].to_vec()))); start += TARGET; } carry.drain(..start); @@ -1298,82 +1192,20 @@ pub mod vec_chunk { fn absorb( chunk: VecChunk, carry: &mut Vec<((K, V), T, R)>, - out: &mut Vec>, + out: &mut VecDeque>, ) { use std::cmp::Ordering::{Equal, Greater, Less}; match chunk.0.len().cmp(&TARGET) { - Equal => out.push(chunk), + Equal => out.push_back(chunk), Less => *carry = take(chunk), Greater => { *carry = take(chunk); peel(carry, out); } } } - /// K-way merge of in-range prefixes of sorted, consolidated inputs, emitting - /// graded chunks directly into `out`. - /// - /// `inputs[i][consumed[i]..]` is the unconsumed, sorted suffix of input `i`. - /// Merges through the least last `((key, val), time)` across inputs (nothing - /// interleaves below it), consolidating triples shared across inputs, and - /// advances each `consumed[i]` past what it merged. 
Output is filled into - /// `TARGET`-sized buffers and pushed as it fills, so the run arrives *graded* - /// rather than as one monolithic chunk that `regrade` would re-split (and - /// re-copy) — mirroring `extract`. Sizing buffers to `TARGET` also avoids the - /// over-reservation a single up-front `with_capacity(total)` would incur. - fn merge_buf( - inputs: &[&[((K, V), T, R)]], - consumed: &mut [usize], - out: &mut ChunkList>, - ) - where K: Ord+Clone+'static, V: Ord+Clone+'static, T: Lattice+Timestamp, R: Ord+Semigroup+'static { - let Some(horizon) = inputs.iter().enumerate() - .filter(|(i, s)| consumed[*i] < s.len()) - .map(|(_, s)| { let u = &s[s.len() - 1]; (u.0.clone(), u.1.clone()) }) - .min() - else { return; }; - - let in_range = |i: usize, p: usize| { - p < inputs[i].len() && (&inputs[i][p].0, &inputs[i][p].1) <= (&horizon.0, &horizon.1) - }; - - let mut result: Vec<((K, V), T, R)> = Vec::with_capacity(TARGET); - loop { - let mut best: Option = None; - for i in 0..inputs.len() { - if in_range(i, consumed[i]) && best.is_none_or(|b| { - let (bi, bb) = (&inputs[i][consumed[i]], &inputs[b][consumed[b]]); - (&bi.0, &bi.1) < (&bb.0, &bb.1) - }) { - best = Some(i); - } - } - let Some(b) = best else { break; }; - let kv = inputs[b][consumed[b]].0.clone(); - let t = inputs[b][consumed[b]].1.clone(); - let mut diff: Option = None; - for i in 0..inputs.len() { - if in_range(i, consumed[i]) && inputs[i][consumed[i]].0 == kv && inputs[i][consumed[i]].1 == t { - match &mut diff { - None => diff = Some(inputs[i][consumed[i]].2.clone()), - Some(d) => d.plus_equals(&inputs[i][consumed[i]].2), - } - consumed[i] += 1; - } - } - if let Some(diff) = diff { - if !diff.is_zero() { - result.push((kv, t, diff)); - if result.len() >= TARGET { - out.push(VecChunk(Rc::new(std::mem::replace(&mut result, Vec::with_capacity(TARGET))))); - } - } - } - } - if !result.is_empty() { out.push(VecChunk(Rc::new(result))); } - } - #[cfg(test)] mod test { - use super::VecChunk; + use 
std::collections::VecDeque; + use super::{Chunk, VecChunk}; use crate::trace::chunk::merge_chains; use std::rc::Rc; @@ -1381,31 +1213,32 @@ pub mod vec_chunk { VecChunk(Rc::new(updates)) } - // `extract` must partition by frontier, fold the kept frontier into - // `residual`, and emit graded chunks directly — without leaning on a regrade - // re-split. + // Flatten a chunk sequence back to its update stream. + fn flat>>(chunks: I) -> Vec<((u64, u64), u64, i64)> { + chunks.into_iter().flat_map(|c| (*c.0).clone()).collect() + } + + // `extract` partitions by frontier and folds the kept frontier into `residual`; + // a terminal `regrade` then grades each side (the seams of near-graded output). #[test] fn extract_partitions_and_grades() { - use super::{Chunk, TARGET}; - use crate::trace::chunk::{is_graded, ChunkList}; + use super::TARGET; + use crate::trace::chunk::{is_graded, regrade_all}; use timely::progress::Antichain; // 4·TARGET updates spread over many input chunks; even times ship // (< frontier), odd times keep (>= frontier), so both sides straddle. let n = 4 * TARGET as u64; - let input: Vec<_> = (0..n) - .map(|i| chunk(vec![((i, 0), i % 2, 1)])) - .collect(); - let mut chunks = input; + let mut input: VecDeque<_> = (0..n).map(|i| chunk(vec![((i, 0), i % 2, 1)])).collect(); let frontier = Antichain::from_elem(1u64); let mut residual = Antichain::new(); - let (mut keep, mut ship) = (ChunkList::default(), ChunkList::default()); - VecChunk::extract(&mut chunks, &frontier, &mut residual, &mut keep, &mut ship); - let (keep, ship) = (keep.done(), ship.done()); + let (mut keep, mut ship) = (VecDeque::new(), VecDeque::new()); + VecChunk::extract(&mut input, &frontier, &mut residual, &mut keep, &mut ship); + let (keep, ship) = (regrade_all(keep), regrade_all(ship)); // Kept times are exactly {1}; that is the residual frontier. assert_eq!(residual, Antichain::from_elem(1u64)); - // Both sides emerge graded directly from `extract`. 
+ // Both sides are graded after the regrade. assert!(is_graded(&keep), "ungraded keep: {:?}", keep.iter().map(Chunk::len).collect::>()); assert!(is_graded(&ship), "ungraded ship: {:?}", ship.iter().map(Chunk::len).collect::>()); // Nothing lost: half the updates each way. @@ -1414,26 +1247,23 @@ pub mod vec_chunk { } // `advance` advances and consolidates complete `(key, val)` groups eagerly, - // withholding only the (possibly-growing) last group when not `done`. + // pushing the (possibly-growing) last group back as the carry when not `done`. #[test] fn advance_emits_complete_groups_eagerly() { - use super::Chunk; - use crate::trace::chunk::ChunkList; use timely::progress::Antichain; let frontier = Antichain::from_elem(5u64); // Group (0,0) is complete within this chunk; group (1,0) might still grow. let c0 = chunk(vec![((0, 0), 0, 1), ((0, 0), 1, 1), ((1, 0), 0, 1)]); - let mut feed = ((0usize, VecChunk::default()), vec![c0]); - let mut out = ChunkList::default(); - VecChunk::advance(&mut feed, &frontier, false, &mut out); + let mut input: VecDeque<_> = VecDeque::from([c0]); + let mut out = VecDeque::new(); + VecChunk::advance(&mut input, &frontier, false, &mut out); - // The trailing group (1,0) is withheld as the head for the next call. - assert_eq!(Chunk::len(&feed.0.1), 1); - assert!(feed.1.is_empty()); + // The trailing group (1,0) is withheld as the carry at the front of `input`. + assert_eq!(input.len(), 1); + assert_eq!(Chunk::len(&input[0]), 1); // Group (0,0)'s times {0,1} advanced to 5 and consolidated, emitted now. - let emitted: Vec<_> = out.done().into_iter().flat_map(|c| (*c.0).clone()).collect(); - assert_eq!(emitted, vec![((0, 0), 5, 2)]); + assert_eq!(flat(out), vec![((0, 0), 5, 2)]); } // Streaming the input one chunk at a time must yield exactly what a single @@ -1441,7 +1271,6 @@ pub mod vec_chunk { // at group boundaries. 
#[test] fn advance_resumable_matches_oneshot() { - use crate::trace::chunk::{AdvanceQueue, ChunkList}; use timely::progress::Antichain; let frontier = Antichain::from_elem(3u64); @@ -1451,22 +1280,20 @@ pub mod vec_chunk { chunk(vec![((1, 0), 5, 1), ((1, 1), 0, 1), ((2, 0), 0, 1)]), chunk(vec![((2, 0), 2, 1), ((2, 0), 9, 1)]), ]; - let flat = |v: Vec>| - v.into_iter().flat_map(|c| (*c.0).clone()).collect::>(); let oneshot = { - let mut q = AdvanceQueue::new(frontier.clone()); - let mut out = ChunkList::default(); - q.push(input(), &mut out); - q.finish(&mut out); - flat(out.done()) + let mut q: VecDeque<_> = input().into(); + let mut out = VecDeque::new(); + VecChunk::advance(&mut q, &frontier, false, &mut out); + VecChunk::advance(&mut q, &frontier, true, &mut out); + flat(out) }; let incremental = { - let mut q = AdvanceQueue::new(frontier.clone()); - let mut out = ChunkList::default(); - for c in input() { q.push(std::iter::once(c), &mut out); } - q.finish(&mut out); - flat(out.done()) + let mut q = VecDeque::new(); + let mut out = VecDeque::new(); + for c in input() { q.push_back(c); VecChunk::advance(&mut q, &frontier, false, &mut out); } + VecChunk::advance(&mut q, &frontier, true, &mut out); + flat(out) }; assert_eq!(oneshot, incremental); // Times are advanced: nothing below the frontier survives. @@ -1474,71 +1301,66 @@ pub mod vec_chunk { } // A single `(key, val)` whose updates span every pushed chunk: `advance` - // can make no progress until `done`, accumulating in the head in place. + // can make no progress until `done`, accumulating in the carry in place. // It must still produce the right advanced+consolidated result at the end. 
#[test] fn advance_single_key_spanning_pushes() { - use crate::trace::chunk::{AdvanceQueue, ChunkList}; use timely::progress::Antichain; let frontier = Antichain::from_elem(100u64); let n = 50u64; let make = || (0..n).map(|t| chunk(vec![((7u64, 0u64), t, 1i64)])).collect::>(); - let flat = |v: Vec>| - v.into_iter().flat_map(|c| (*c.0).clone()).collect::>(); - let mut q = AdvanceQueue::new(frontier); - let mut out = ChunkList::default(); - for c in make() { q.push(std::iter::once(c), &mut out); } - q.finish(&mut out); + let mut q = VecDeque::new(); + let mut out = VecDeque::new(); + for c in make() { q.push_back(c); VecChunk::advance(&mut q, &frontier, false, &mut out); } + VecChunk::advance(&mut q, &frontier, true, &mut out); // All times advance to 100 and consolidate to one update of diff `n`. - assert_eq!(flat(out.done()), vec![((7u64, 0u64), 100u64, n as i64)]); + assert_eq!(flat(out), vec![((7u64, 0u64), 100u64, n as i64)]); } #[test] fn merge_chains_consolidates() { let a = chunk(vec![((0, 0), 0, 1), ((1, 0), 0, 1)]); let b = chunk(vec![((0, 0), 0, 1), ((2, 0), 0, 1)]); - let mut out = crate::trace::chunk::ChunkList::default(); + let mut out = VecDeque::new(); merge_chains(vec![a], vec![b], &mut out); - let merged: Vec<_> = out.done().into_iter().flat_map(|c| (*c.0).clone()).collect(); - assert_eq!(merged, vec![((0, 0), 0, 2), ((1, 0), 0, 1), ((2, 0), 0, 1)]); + assert_eq!(flat(out), vec![((0, 0), 0, 2), ((1, 0), 0, 1), ((2, 0), 0, 1)]); } - // Merging runs larger than `TARGET` must emit a *graded* sequence directly - // (each chunk `<= TARGET`, adjacent pairs summing past `TARGET`), not one - // monolithic chunk, while reproducing the consolidated sorted contents. + // Merging runs larger than `TARGET`, then regrading, yields a *graded* sequence + // (each chunk `<= TARGET`, adjacent pairs summing past `TARGET`) reproducing the + // consolidated sorted contents. 
#[test] fn merge_emits_graded_chunks() { - use super::{Chunk, TARGET}; - use crate::trace::chunk::{ChunkList, is_graded, merge_chains}; + use super::TARGET; + use crate::trace::chunk::{is_graded, merge_chains, regrade_all}; // Two interleaving single-chunk chains: evens and odds over `0..4·TARGET`. let n = 4 * TARGET as u64; let evens = chunk((0..n).step_by(2).map(|k| ((k, 0), 0, 1)).collect()); let odds = chunk((0..n).step_by(2).map(|k| ((k + 1, 0), 0, 1)).collect()); - let mut out = ChunkList::default(); + let mut out = VecDeque::new(); merge_chains(vec![evens], vec![odds], &mut out); - let chunks = out.done(); + let chunks = regrade_all(out); assert!(is_graded(&chunks), "merge output not graded: {:?}", chunks.iter().map(Chunk::len).collect::>()); // Contents are exactly the sorted keys `0..4·TARGET`, each once. - let merged: Vec<_> = chunks.into_iter().flat_map(|c| (*c.0).clone()).collect(); let want: Vec<_> = (0..n).map(|k| ((k, 0u64), 0u64, 1i64)).collect(); - assert_eq!(merged, want); + assert_eq!(flat(chunks), want); } - // Property test: merging two *multi-chunk* chains (driven through `merge_pair` - // by `merge_chains`) reproduces the union of all updates, consolidated. Tiny + // Property test: merging two *multi-chunk* chains (driven through `merge` by + // `merge_chains`) reproduces the union of all updates, consolidated. Tiny // chunks force `(key, val)` groups — which can span several times — to // straddle chunk boundaries on both sides, exercising the refill path the // single-chunk merge tests never reach. The independent oracle is // `consolidate_updates` over the concatenation. #[test] - fn merge_pair_matches_reference() { - use crate::trace::chunk::{ChunkList, merge_chains}; + fn merge_matches_reference() { + use crate::trace::chunk::merge_chains; use crate::consolidation::consolidate_updates; // Deterministic xorshift PRNG — no dev-dependency on `rand`. 
@@ -1570,9 +1392,9 @@ pub mod vec_chunk { if u1.is_empty() || u2.is_empty() { continue; } let sz = (rng() as usize % 5) + 1; // tiny chunks → heavy straddling - let mut out = ChunkList::default(); + let mut out = VecDeque::new(); merge_chains(chain(&u1, sz), chain(&u2, sz), &mut out); - let merged: Vec<_> = out.done().into_iter().flat_map(|c| (*c.0).clone()).collect(); + let merged = flat(out); let mut reference: Vec<_> = u1.iter().chain(u2.iter()).cloned().collect(); consolidate_updates(&mut reference); @@ -1582,13 +1404,12 @@ pub mod vec_chunk { } // `regrade` must produce a *maximal packing*: adjacent sub-`TARGET` chunks - // that could combine into one legal chunk are coalesced (the prior rule left - // any pair summing past `TARGET/2` alone), full chunks pass through, and - // contents are preserved exactly. + // that could combine into one legal chunk are coalesced, full chunks pass + // through as `Rc` moves, and contents are preserved exactly. #[test] fn regrade_maximal_packing() { - use super::{Chunk, TARGET}; - use crate::trace::chunk::{is_graded, ChunkList}; + use super::TARGET; + use crate::trace::chunk::is_graded; // A mix of small and full chunks with distinct, increasing keys (so the // concatenation is sorted and nothing consolidates away). 
@@ -1596,12 +1417,15 @@ pub mod vec_chunk { let sizes = [t / 3, t / 3, t / 3, t, t / 2, t / 2, t, 1, t - 1]; let total: usize = sizes.iter().sum(); let mut key = 0u64; - let mut list = ChunkList::default(); + let mut input = VecDeque::new(); + let mut output = VecDeque::new(); for &s in &sizes { let updates: Vec<_> = (0..s).map(|_| { let k = key; key += 1; ((k, 0u64), 0u64, 1i64) }).collect(); - list.push(chunk(updates)); + input.push_back(chunk(updates)); + VecChunk::regrade(&mut input, false, &mut output); } - let chunks = list.done(); + VecChunk::regrade(&mut input, true, &mut output); + let chunks: Vec<_> = output.into(); assert!(is_graded(&chunks), "not graded: {:?}", chunks.iter().map(Chunk::len).collect::>()); From 812bb432c6f3708374a9d89824a1eded46f73a09 Mon Sep 17 00:00:00 2001 From: Frank McSherry Date: Sun, 21 Jun 2026 20:15:02 -0400 Subject: [PATCH 6/9] chunk: drain the loaded burst in merge, recovering the deque regression MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The one-front-pair merge pruned and pushed back the partial side on every call, so a chunk straddling many chunks on the other side was re-pruned (copied) once per straddled chunk — a measurable ~4% queries / ~10% loading regression at 1M versus the pre-deque baseline. Have merge instead drain both deques' loaded content in one call: walk across chunk boundaries with local indices (p1/p2) reset as each working chunk is retired, reading through the shared Rc (never `take`n, so a source clone is not deep-copied), and prune the partial side's suffix exactly once at the yield boundary. The merge harness (ChunkBatchMerger) now loads a burst of chunks per side (BURST=8) so that single prune amortizes over the burst; merge_chains (the batcher) already feeds whole chains, so it drains in one pass for free. 
The drain is a `while p1 < c1.len() && p2 < c2.len()` guarding the cursor indexing, refilling exhausted working chunks at the foot of the loop — no bare `loop`, and the cursor state stays in locals (an earlier `refill(&mut …)` helper spilled it to memory and cost ~3%). Recovers to pre-deque parity: 1M/2M ~205ms loading / ~643ms queries, ~1.6x faster than ord_neu. All chunk tests (incl. the 300-case merge property test) and the full suite pass. Co-Authored-By: Claude Opus 4.8 (1M context) --- differential-dataflow/src/trace/chunk.rs | 179 +++++++++++------------ 1 file changed, 88 insertions(+), 91 deletions(-) diff --git a/differential-dataflow/src/trace/chunk.rs b/differential-dataflow/src/trace/chunk.rs index 3e48c33c3..8f3d3ff93 100644 --- a/differential-dataflow/src/trace/chunk.rs +++ b/differential-dataflow/src/trace/chunk.rs @@ -309,7 +309,10 @@ where ) { let mut merged = VecDeque::new(); merge_chains(list1, list2, &mut merged); - output.extend(regrade_all(merged)); + // No regrade: the batcher's ladder weighs chains by updates (not chunk count) + // since #767, so intermediate grading buys nothing; the final batch is graded + // at seal. merge's output is already near-`TARGET`. + output.extend(merged); } fn extract( @@ -327,8 +330,10 @@ where let mut input: VecDeque = merged.into(); let (mut keep, mut shipped) = (VecDeque::new(), VecDeque::new()); C::extract(&mut input, &upper, frontier, &mut keep, &mut shipped); - kept.extend(regrade_all(keep)); - ship.extend(regrade_all(shipped)); + // No regrade: `kept` is re-merged later and `shipped` is regraded at seal by + // the builder, so neither needs grading here. + kept.extend(keep); + ship.extend(shipped); } fn len(chunk: &C) -> usize { chunk.len() } @@ -625,22 +630,25 @@ where if self.complete { return; } while *fuel > 0 { - // Refill each empty input deque with the next source chunk (head-of-list - // burst). After this, a deque is non-empty iff its source still has data. 
- if self.in1.is_empty() && self.idx1 < source1.chunks.len() { + // Refill each input deque up to a burst of source chunks (clones). `merge` + // drains the loaded burst per call, so a larger burst amortizes the single + // partial-chunk prune it does at the yield boundary. After this, a deque is + // non-empty iff its source still has data. + const BURST: usize = 8; + while self.in1.len() < BURST && self.idx1 < source1.chunks.len() { self.in1.push_back(source1.chunks[self.idx1].clone()); self.idx1 += 1; } - if self.in2.is_empty() && self.idx2 < source2.chunks.len() { + while self.in2.len() < BURST && self.idx2 < source2.chunks.len() { self.in2.push_back(source2.chunks[self.idx2].clone()); self.idx2 += 1; } - // Merge's per-tick output (small: one front-pair, or one tail chunk), - // measured for fuel before it joins the carry already in `merged`. + // Merge's per-tick output (a burst's worth, or one tail chunk), measured + // for fuel before it joins the carry already in `merged`. let mut produced = VecDeque::new(); if !self.in1.is_empty() && !self.in2.is_empty() { - // Both sides have data: one front-pair merge. + // Both sides have data: drain the loaded burst. C::merge(&mut self.in1, &mut self.in2, &mut produced); } else if let Some(chunk) = self.in1.pop_front().or_else(|| self.in2.pop_front()) { // Exactly one side has data: flush its verbatim tail, one chunk a step. @@ -930,99 +938,88 @@ pub mod vec_chunk { fn len(&self) -> usize { self.0.len() } - /// A dedicated two-pointer binary merge of the two front chunks: one gallop - /// pins how far each side may merge (through the lesser of the two last - /// `(key, val, time)`s), then a single pass consolidates equal triples and - /// bulk-copies disjoint runs as slices. The drained side's front is popped; the - /// other's partially-consumed front is pruned (its prefix dropped) and - /// `push_front`ed back — so the only persisted state is the deques themselves. 
+ /// A two-pointer binary merge that drains the two deques' *loaded* content + /// through their shared horizon — the lesser of the two deques' last loaded + /// `(key, val, time)`s — rather than one front-pair at a time. Consolidates + /// equal triples and bulk-copies disjoint runs as slices, walking across chunk + /// boundaries with local indices (`p1`/`p2`) that reset as each working chunk + /// is retired. The side owning the horizon drains fully; the other's partial + /// working chunk is pruned (its prefix dropped) and `push_front`ed back exactly + /// once at the yield boundary — so the per-call prune cost amortizes over the + /// whole burst the harness loaded, not over each chunk. fn merge(in1: &mut VecDeque, in2: &mut VecDeque, out: &mut VecDeque) { fn kv(u: &((K, V), T, R)) -> (&K, &V) { (&u.0.0, &u.0.1) } - fn kvt(u: &((K, V), T, R)) -> ((&K, &V), &T) { (kv(u), &u.1) } - - // How far each front may merge, computed while the fronts are borrowed; the - // deques are mutated only after these borrows end. - let (end1, end2, len1, len2); - { - let s1 = &in1.front().unwrap().0[..]; - let s2 = &in2.front().unwrap().0[..]; - len1 = s1.len(); - len2 = s2.len(); - - // The merge horizon: the lesser of the two last `(key, val, time)`s. The - // side owning it drains fully; the other merges through it. The time must - // be part of the horizon: a `(key, val)` group can straddle a chunk - // boundary on the owning side, and a coarser `(key, val)` horizon would - // let the other side's whole group merge before that continuation arrives, - // emitting it unconsolidated (caught by `merge_matches_reference`). 
- if kvt(&s1[len1 - 1]) <= kvt(&s2[len2 - 1]) { - let horizon = kvt(&s1[len1 - 1]); - end1 = len1; - end2 = gallop(s2, 0, |u| kvt(u) <= horizon); - } else { - let horizon = kvt(&s2[len2 - 1]); - end2 = len2; - end1 = gallop(s1, 0, |u| kvt(u) <= horizon); - } - let mut result: Vec<((K, V), T, R)> = Vec::with_capacity(TARGET); - let mut flush = |result: &mut Vec<((K, V), T, R)>, force: bool| { - if result.len() >= TARGET || (force && !result.is_empty()) { - out.push_back(VecChunk(Rc::new(std::mem::replace(result, Vec::with_capacity(TARGET))))); - } - }; + let mut result: Vec<((K, V), T, R)> = Vec::with_capacity(TARGET); + let mut flush = |result: &mut Vec<((K, V), T, R)>, force: bool| { + if result.len() >= TARGET || (force && !result.is_empty()) { + out.push_back(VecChunk(Rc::new(std::mem::replace(result, Vec::with_capacity(TARGET))))); + } + }; - let (mut i, mut j) = (0, 0); - while i < end1 && j < end2 { - let a = &s1[i]; - let b = &s2[j]; - match (kv(a), &a.1).cmp(&(kv(b), &b.1)) { - // Copy the whole run of one side strictly below the other's head: - // collisions are impossible within it, so it moves as slices (cut - // at the grading target) rather than element by element. - std::cmp::Ordering::Less => { - let run = gallop(s1, i + 1, |u| (kv(u), &u.1) < (kv(b), &b.1)).min(end1); - for piece in s1[i..run].chunks(TARGET) { - result.extend_from_slice(piece); - flush(&mut result, false); - } - i = run; - } - std::cmp::Ordering::Greater => { - let run = gallop(s2, j + 1, |u| (kv(u), &u.1) < (kv(a), &a.1)).min(end2); - for piece in s2[j..run].chunks(TARGET) { - result.extend_from_slice(piece); - flush(&mut result, false); - } - j = run; + // Working chunks (the shared `Rc`, read by index — never `take`n, so a + // source clone is not deep-copied) and positions; both deques are non-empty + // on entry. 
When a working chunk is consumed we refill from its deque; when a + // deque is empty that side has presented all its loaded data — its last + // triple is the horizon, so we stop and leave the other side's remainder. + // Working chunks (the shared `Rc`, read by index — never `take`n, so a + // source clone is not deep-copied) and their positions; both deques are + // non-empty on entry. The guard keeps both cursors valid for indexing; a + // working chunk consumed mid-merge is refilled at the foot of the loop, and + // when a deque runs dry we stop — that side has presented all its loaded + // data, so its last triple is the horizon and the rest is left for next time. + let mut c1 = in1.pop_front().unwrap(); + let mut c2 = in2.pop_front().unwrap(); + let (mut p1, mut p2) = (0usize, 0usize); + while p1 < c1.0.len() && p2 < c2.0.len() { + let a = &c1.0[p1]; + let b = &c2.0[p2]; + match (kv(a), &a.1).cmp(&(kv(b), &b.1)) { + // Copy the run of one side strictly below the other's head (within + // the current working chunk): collisions are impossible within it, + // so it moves as slices cut at the grading target. + std::cmp::Ordering::Less => { + let run = gallop(&c1.0[..], p1 + 1, |u| (kv(u), &u.1) < (kv(b), &b.1)); + for piece in c1.0[p1..run].chunks(TARGET) { + result.extend_from_slice(piece); + flush(&mut result, false); } - std::cmp::Ordering::Equal => { - let mut diff = a.2.clone(); - diff.plus_equals(&b.2); - if !diff.is_zero() { - result.push((a.0.clone(), a.1.clone(), diff)); - } - i += 1; - j += 1; + p1 = run; + } + std::cmp::Ordering::Greater => { + let run = gallop(&c2.0[..], p2 + 1, |u| (kv(u), &u.1) < (kv(a), &a.1)); + for piece in c2.0[p2..run].chunks(TARGET) { + result.extend_from_slice(piece); flush(&mut result, false); } + p2 = run; } - } - // Bulk-copy the in-horizon tails, cutting at the grading target. 
- for tail in [&s1[i..end1], &s2[j..end2]] { - for piece in tail.chunks(TARGET) { - result.extend_from_slice(piece); + std::cmp::Ordering::Equal => { + let mut diff = a.2.clone(); + diff.plus_equals(&b.2); + if !diff.is_zero() { + result.push((a.0.clone(), a.1.clone(), diff)); + } + p1 += 1; + p2 += 1; flush(&mut result, false); } } - flush(&mut result, true); + // Refill either working chunk consumed by the step above; stop the drain + // once a deque is exhausted (the `&&` guard then never re-enters). + if p1 == c1.0.len() { + match in1.pop_front() { Some(c) => { c1 = c; p1 = 0; } None => break } + } + if p2 == c2.0.len() { + match in2.pop_front() { Some(c) => { c2 = c; p2 = 0; } None => break } + } } - - // Retire the fronts: pop the drained side, prune+push_front the partial one. - if end1 >= len1 { in1.pop_front(); } - else { let mut v = take(in1.pop_front().unwrap()); v.drain(..end1); in1.push_front(VecChunk(Rc::new(v))); } - if end2 >= len2 { in2.pop_front(); } - else { let mut v = take(in2.pop_front().unwrap()); v.drain(..end2); in2.push_front(VecChunk(Rc::new(v))); } + flush(&mut result, true); + // One side's deque emptied with its working chunk exhausted; the other's + // working chunk is partial — push back just its unconsumed suffix (one copy + // per call), ahead of whatever loaded chunks remain in that deque. + if p1 < c1.0.len() { in1.push_front(VecChunk(Rc::new(c1.0[p1..].to_vec()))); } + if p2 < c2.0.len() { in2.push_front(VecChunk(Rc::new(c2.0[p2..].to_vec()))); } } fn extract( From a501de71fa76493f7647d9c6c3f0551a4ee8287b Mon Sep 17 00:00:00 2001 From: Frank McSherry Date: Sun, 21 Jun 2026 21:10:32 -0400 Subject: [PATCH 7/9] =?UTF-8?q?chunk:=20review=20fixups=20=E2=80=94=20dedu?= =?UTF-8?q?p=20comment,=20document=20BURST,=20latency,=20columnar=20risk?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Addressing review feedback on the phase-1 foundation; all doc/comment, no logic. 
* merge: delete the duplicated "working chunks" paragraph (copy-paste residue from the loop→while cleanup). * BURST: a sweep shows it is insensitive on this workload (1..32 flat to ~noise at 1M), so the recovery was the drain-merge reading through the Rc + copying only the partial suffix, not the burst. Rewrite the comment to say what BURST actually trades (fuel granularity vs. re-pruning a pushed-back partial under straddle-heavy inputs) and that 8 is a conservative default, not a tuned value. * ChunkBatchMerger: state the latency bound explicitly — fuel bounds each step to ~one burst-merge, but the terminal advance(done) + regrade ride outside fuel; worst case (one (key,val) spanning the whole merge) is one unfueled step, kept linear in the group by vec_chunk's in-place carry reuse. A chunk impl must keep that flush linear. * vec_chunk docs: spell out that the protocol is layout-agnostic but the merge *body* is not — it bulk-copies a contiguous slice; a columnar chunk must range-copy the key/val/time/diff columns with offset bookkeeping (the operation that beats flat on repetitive keys, and where the removed col_chunk struggled). Nothing here exercises a columnar merge — that body is the open phase-2 risk. Co-Authored-By: Claude Opus 4.8 (1M context) --- differential-dataflow/src/trace/chunk.rs | 57 +++++++++++++++++------- 1 file changed, 40 insertions(+), 17 deletions(-) diff --git a/differential-dataflow/src/trace/chunk.rs b/differential-dataflow/src/trace/chunk.rs index 8f3d3ff93..f85e3d340 100644 --- a/differential-dataflow/src/trace/chunk.rs +++ b/differential-dataflow/src/trace/chunk.rs @@ -574,14 +574,24 @@ where /// The merge is *resumable* and runs a two-stage deque pipeline: /// [`merge`](Chunk::merge) feeds `merged`, [`advance`](Chunk::advance) consumes it /// into `advanced`; the terminal [`regrade`](Chunk::regrade) runs once at `done`. 
Each -/// `work` step clones a head from each source (the burst is head-of-each-list), ticks -/// `merge` once, then advances the fresh output, debiting `fuel` by the *merged* -/// records that entered the pipe — the total output across the merge, matching how the -/// trace's other mergers account (cf. `ord_neu`). The sources are read by *cloning* -/// chunks (a cheap refcount bump per the [`Chunk`] contract), never consumed or -/// mutated; the same `source1`/`source2` must be supplied on every call. When a source -/// exhausts, the harness flushes the other's verbatim tail one chunk per step. Once -/// both are drained, a final `advance(done)` flushes advance's withheld carry. +/// `work` step clones a burst from each source, ticks `merge` once, then advances the +/// fresh output, debiting `fuel` by the *merged* records that entered the pipe — the +/// total output across the merge, matching how the trace's other mergers account (cf. +/// `ord_neu`). The sources are read by *cloning* chunks (a cheap refcount bump per the +/// [`Chunk`] contract), never consumed or mutated; the same `source1`/`source2` must be +/// supplied on every call. When a source exhausts, the harness flushes the other's +/// verbatim tail one chunk per step. Once both are drained, a final `advance(done)` +/// flushes advance's withheld carry. +/// +/// **Latency bound.** `fuel` bounds each step to roughly one burst-merge's output. Two +/// things ride *outside* fuel: the terminal `advance(done)` and `done`'s `regrade`. In +/// the worst case — a single `(key, val)` spanning the whole merge — `advance` withholds +/// the entire group until `done`, then sorts and consolidates it in one unfueled step. +/// `vec_chunk` keeps that step *linear* in the group (it accumulates the carry in place, +/// reusing its storage), so it is not the quadratic blow-up of an earlier design, but it +/// is one unbounded-latency step bounded by the largest single `(key, val)` group. 
A +/// chunk impl must keep this flush linear; the latency claimed is "per step ≈ a burst, +/// plus a final flush ≤ the largest group." pub struct ChunkBatchMerger { /// Compaction frontier supplied at construction. frontier: Antichain, @@ -630,10 +640,15 @@ where if self.complete { return; } while *fuel > 0 { - // Refill each input deque up to a burst of source chunks (clones). `merge` - // drains the loaded burst per call, so a larger burst amortizes the single - // partial-chunk prune it does at the yield boundary. After this, a deque is - // non-empty iff its source still has data. + // Refill each input deque up to a burst of source chunks (clones); `merge` + // drains the loaded burst per call. The burst trades fuel granularity (a + // call does up to a burst's work before checking fuel) against re-pruning: + // a chunk that straddles many chunks on the other side is walked by index + // within one call but, once its tail spills past the loaded burst, its + // unconsumed suffix is pushed back and re-copied next call — a bigger burst + // absorbs more straddle per call. This workload is insensitive (1..32 flat + // to ~noise at 1M), so 8 is a conservative default, not a tuned optimum. + // After this, a deque is non-empty iff its source still has data. const BURST: usize = 8; while self.in1.len() < BURST && self.idx1 < source1.chunks.len() { self.in1.push_back(source1.chunks[self.idx1].clone()); @@ -747,6 +762,19 @@ pub mod vec_chunk { //! //! `Clone` is a refcount bump, so the trace merger shares source chunks instead //! of copying them. + //! + //! **What a columnar impl can and can't reuse.** The protocol (the `VecDeque` + //! in/out, withhold-by-`push_front`, grade-at-seal) is layout-agnostic and carries + //! over unchanged. The *merge body* does not: this one merges a single contiguous + //! `&[((K,V),T,R)]` and bulk-copies disjoint runs with `extend_from_slice` + + //! `chunks(TARGET)`. 
A columnar chunk (ranging over `ord_neu`'s deduped layout) has + //! no such slice — it must range-copy the key / val / time / diff columns with + //! offset bookkeeping, emitting one key + its val/time run rather than repeated rows. + //! That is the operation that beats the flat layout on repetitive keys (see the + //! module-level note on the row-major vs. columnar crossover), and it is also where + //! the earlier `col_chunk` got into trouble (decompress-and-recompress instead of a + //! true range-copy). So a columnar `Chunk` is the open bet: nothing here exercises a + //! columnar merge, and that body — not the protocol — is the phase-2 risk. use std::collections::VecDeque; use std::marker::PhantomData; @@ -957,11 +985,6 @@ pub mod vec_chunk { } }; - // Working chunks (the shared `Rc`, read by index — never `take`n, so a - // source clone is not deep-copied) and positions; both deques are non-empty - // on entry. When a working chunk is consumed we refill from its deque; when a - // deque is empty that side has presented all its loaded data — its last - // triple is the horizon, so we stop and leave the other side's remainder. // Working chunks (the shared `Rc`, read by index — never `take`n, so a // source clone is not deep-copied) and their positions; both deques are // non-empty on entry. The guard keeps both cursors valid for indexing; a From 16e01104fe89e487d630d1971ba0b23dbc117ca0 Mon Sep 17 00:00:00 2001 From: Frank McSherry Date: Mon, 22 Jun 2026 08:22:24 -0400 Subject: [PATCH 8/9] chunk: split vec_chunk into its own file; require Chunk::TARGET Move the worked `vec_chunk` reference impl (and its tests) out of the 1.5k-line `chunk.rs` into `chunk/vec_chunk.rs`, leaving `chunk/mod.rs` holding just the `Chunk` trait and the generic harness. This makes room for sibling impls (e.g. a columnar `col_chunk`) without a single massive file; the module's own docs already frame `vec_chunk` as the reference shape a next implementor follows. 
Also drop the `= 1024` default on `Chunk::TARGET`, making it a required associated const. The right value is layout-dependent (it trades against the row-major vs. columnar crossover), and the sole impl already set it explicitly, so the default had no users and only risked a silent, mistuned inheritance for the next implementor. Co-Authored-By: Claude Opus 4.8 (1M context) --- differential-dataflow/src/trace/chunk.rs | 1524 ----------------- differential-dataflow/src/trace/chunk/mod.rs | 746 ++++++++ .../src/trace/chunk/vec_chunk.rs | 778 +++++++++ 3 files changed, 1524 insertions(+), 1524 deletions(-) delete mode 100644 differential-dataflow/src/trace/chunk.rs create mode 100644 differential-dataflow/src/trace/chunk/mod.rs create mode 100644 differential-dataflow/src/trace/chunk/vec_chunk.rs diff --git a/differential-dataflow/src/trace/chunk.rs b/differential-dataflow/src/trace/chunk.rs deleted file mode 100644 index f85e3d340..000000000 --- a/differential-dataflow/src/trace/chunk.rs +++ /dev/null @@ -1,1524 +0,0 @@ -//! Sorted, consolidated runs of updates, and operators over sequences of them. -//! -//! A [`Chunk`] is a consolidated, sorted run of `(data, time, diff)` updates. -//! Chunks live in sequences (`Vec`) with no constraint on where the -//! breakpoints between them fall; each chunk holds at most [`Chunk::TARGET`] -//! updates. The trait deliberately exposes only batch-level operations — merge, -//! extract, advance — leaving the layout-aware work to the implementor. The -//! orchestration in this module (the binary merger) is generic over the layout -//! and concerns itself only with feeding chunks across calls. -//! -//! # Why chunks, and why one size -//! -//! A batch could be a single monolithic sorted run. We cut it into chunks because -//! the chunk is simultaneously the unit of four things, each of which wants a size -//! bound: -//! -//! * **Suspendable work.** The fueled merger does a chunk's-worth of work per step -//! 
and checks fuel at the boundary, so chunk size bounds a step's latency. -//! * **Immutable sharing.** Chunks are `Rc`-shared; the merger reads its sources by -//! *cloning* chunks (a refcount bump). The chunk is the finest granularity of sharing. -//! * **Allocation recycling.** Emptied input buffers are reused as output buffers; -//! that only composes if buffers are roughly one size. -//! * **Indexing.** [`ChunkBatch`] indexes chunks by their first/last key, and the -//! cursor binary-searches *over* chunks then gallops *within* one. The chunk -//! count (≈ `len / TARGET`) sets the outer index size and search depth. -//! -//! So the size bound pulls two ways: an upper bound (latency, memory) says "not too -//! big," and a lower bound (per-chunk overhead, index bloat) says "not too -//! fragmented." Keeping chunks one size is what lets a single knob satisfy both. -//! The grading invariant ([`is_graded`]) encodes exactly this: every chunk is at -//! most `TARGET`, and every *adjacent pair* exceeds `TARGET` — i.e. no two -//! neighbours could be combined into one legal chunk. That makes `TARGET` both the -//! maximum size and the coalescing threshold (the invariant is self-similar), and -//! a graded sequence a *maximal packing*: as few chunks as the maximum allows. -//! -//! The intent is for a `Chunk` implementation to be each of -//! 1. the containers a `Collection` can transit. -//! 2. the containers a `MergeBatcher` can work with. -//! 3. the containers a `Batch` can be backed by. -//! It does this by exposing a small set of chunk-oriented primitives, which are -//! sufficient for harnesses for each of these tasks. 
- -use std::collections::VecDeque; - -use timely::progress::Antichain; -use timely::progress::frontier::AntichainRef; -use crate::lattice::Lattice; -use crate::trace::{Batch, BatchReader, Description}; -use crate::trace::cursor::Cursor; -use crate::trace::implementations::{BatchContainer, Layout, LayoutExt, WithLayout}; - -/// The key container of chunk `C`'s layout. Named via the `Layout` projection so -/// it unifies with the cursor's `Self::Key`, which also projects through `Layout`. -type KeyCon = <::Layout as Layout>::KeyContainer; -/// The val container of chunk `C`'s layout. -type ValCon = <::Layout as Layout>::ValContainer; - -/// Whether `chunks` satisfy the [`Chunk::TARGET`] grading invariant: every chunk -/// at most `TARGET`, and every adjacent pair summing to more than `TARGET` (so no -/// two neighbours could be combined into one legal chunk — a *maximal packing*). -/// -/// This is the post-[`regrade`](Chunk::regrade) shape; useful as a test/debug check. -pub fn is_graded(chunks: &[C]) -> bool { - chunks.iter().all(|c| c.len() <= C::TARGET) - && chunks.windows(2).all(|w| w[0].len() + w[1].len() > C::TARGET) -} - -/// Regrade `input` to completion into a fresh graded `Vec` (see [`Chunk::regrade`]). -/// -/// A convenience for the one-shot callers (batch sealing, the batcher's merge and -/// extract) that have a whole sequence in hand and want it graded; the streaming -/// callers drive [`Chunk::regrade`] directly across ticks. -pub fn regrade_all(input: impl IntoIterator) -> Vec { - let mut input: VecDeque = input.into_iter().collect(); - let mut out = VecDeque::new(); - C::regrade(&mut input, true, &mut out); - debug_assert!(input.is_empty()); - out.into() -} - -/// A consolidated, sorted sequence of `(data, time, diff)`. -/// -/// Chunks exist in sequences, with no constraints on the breakpoints between -/// them. 
Each holds at most [`TARGET`](Chunk::TARGET) updates; a graded sequence -/// is a maximal packing at that size (see [`is_graded`] and the module docs). -/// -/// `Clone` is expected to be cheap — a refcount bump on shared backing storage, -/// not a deep copy. The trace merger relies on this to read its (shared, -/// immutable) source batches by cloning chunks rather than consuming them. -/// -/// A chunk *has* a [`Cursor`] over its own `(key, val, time, diff)` contents — -/// the chunk is its own cursor `Storage`, mirroring [`BatchReader`]. This is what -/// lets a batch cursor delegate downward: the batch indexes which chunk holds a -/// key (reusing the chunk's `KeyContainer` / `ValContainer` for boundaries) and -/// then reads through that chunk's cursor. We do not provide this; the opaque -/// chunk implementor does. -/// -/// # The transducer protocol -/// -/// The four chunk-producing operations ([`merge`](Chunk::merge), -/// [`extract`](Chunk::extract), [`advance`](Chunk::advance), -/// [`regrade`](Chunk::regrade)) are all *stream transducers* over `VecDeque`, -/// sharing one calling convention so an implementor learns it once: -/// -/// * **Consume from the front.** Read chunks off the front of the input deque(s). -/// * **Withhold by pushing back.** Anything consumed but not yet safe to commit -/// (advance's still-growing last group; regrade's sub-`TARGET` carry; merge's -/// partially-consumed front) is reformed into a single owned chunk and -/// `push_front`ed back onto its input. The only cross-call state is therefore the -/// deques themselves — clean owned runs, no indices escape a call. -/// * **Commit by appending.** Append committed chunks to the output deque; once -/// appended they are written and a downstream stage may take them immediately. 
-/// * **`done` forces the flush.** The unary stages take `done: bool`; while it is -/// false they may withhold, and a call that appends nothing has yielded — the -/// harness will not call again until more input arrives or `done` flips true. On -/// `done` the stage must drain its withheld state (the harness keeps calling -/// until the output stops growing). -/// -/// Two operations vary only where their job demands it: [`merge`](Chunk::merge) is -/// binary (and the harness, not `merge`, handles a drained input by flushing the -/// other side's verbatim tail, so `merge` needs no `done`); [`extract`](Chunk::extract) -/// is the one-shot splitter (it drains its whole input, so it needs no `done` and -/// has two outputs plus a residual frontier). -/// -/// Implementors are further expected to: -/// -/// * **Emit near-graded output.** Fill `TARGET`-sized output chunks directly rather -/// than emitting one monolithic chunk; the terminal [`regrade`](Chunk::regrade) -/// only has to coalesce the trailing partials at the seams. Grading is a -/// *seal-time* property, not an invariant maintained between stages. -/// * **Recycle where possible.** Reuse the storage of chunks drained from the input -/// as the buffers for output, so allocations balance input against output rather -/// than allocating afresh per emitted chunk. `vec_chunk` is the worked example: it -/// fills buffers reclaimed from a stash of emptied input `Vec`s, and advance reuses -/// its withheld carry's storage in place so a giant key stays linear, not quadratic. -/// -/// [`BatchReader`]: crate::trace::BatchReader -pub trait Chunk: Sized + Clone + LayoutExt { - - /// The chunk size: both the maximum updates per chunk and the coalescing - /// threshold. - /// - /// A *graded* sequence (the post-[`regrade`](Chunk::regrade) shape) has every - /// chunk of length at most `TARGET`, and every adjacent pair summing to more - /// than `TARGET` — so no two neighbours could be combined into one legal chunk. 
- /// Equivalently, a maximal packing at size `TARGET`. [`is_graded`] checks - /// exactly this. The value is the implementor's tuning knob: larger means fewer - /// chunks (smaller index, less per-chunk overhead) but coarser merge-suspension - /// granularity and a larger within-chunk seek. - const TARGET: usize = 1024; - - /// A cursor navigating this chunk's contents; the chunk is its storage. - /// - /// The layout aliases are spelled out (mirroring [`BatchReader`]) so the - /// cursor's `Key`/`Val`/`Time`/`Diff` and their containers are *definitionally* - /// equal to the chunk's — without this the compiler won't connect the cursor's - /// layout to the chunk's when reading through it. - type Cursor: - Cursor + - WithLayout + - for<'a> LayoutExt< - Key<'a> = Self::Key<'a>, - Val<'a> = Self::Val<'a>, - ValOwn = Self::ValOwn, - Time = Self::Time, - TimeGat<'a> = Self::TimeGat<'a>, - Diff = Self::Diff, - DiffGat<'a> = Self::DiffGat<'a>, - KeyContainer = Self::KeyContainer, - ValContainer = Self::ValContainer, - TimeContainer = Self::TimeContainer, - DiffContainer = Self::DiffContainer, - >; - - /// Acquire a cursor over this chunk. - fn cursor(&self) -> Self::Cursor; - - /// The first and last `(key, val, time)` triples in the chunk. - /// - /// The chunk must be non-empty (batch chunks always are). Expected to be - /// cheap — the chunk's endpoints, e.g. columnar indices `0` and `len - 1`, - /// not a cursor walk. Indexing a batch's chunks rests on this: the last - /// triples drive a binary search to a key or `(key, val)`, and comparing one - /// chunk's last triple against the next chunk's first detects keys or - /// `(key, val)` pairs that straddle the boundary — all without touching chunk - /// contents. Returned by reference (no owned key type exists in the layout); - /// the index materializes them into its own containers. 
- fn bounds(&self) -> ( - (Self::Key<'_>, Self::Val<'_>, Self::TimeGat<'_>), - (Self::Key<'_>, Self::Val<'_>, Self::TimeGat<'_>), - ); - - /// The number of updates in the chunk. - /// - /// Chunks are always non-empty (`len() > 0`): producers drop empties before - /// they reach a chunk sequence, and [`ChunkBatch::new`] asserts the invariant. - fn len(&self) -> usize; - - /// Merge the fronts of two input deques through their shared horizon. - /// - /// Both deques are non-empty (the caller guarantees it). The two front chunks - /// merge through updates present in both — up to the least last `(key, val, time)` - /// triple across them — consolidating collisions and emitting committed chunks to - /// `out`. The side owning the horizon is fully consumed and `pop_front`ed; the - /// other's partially-consumed front is reformed (its consumed prefix dropped) and - /// `push_front`ed back. So on return at least one deque has had its front retired. - /// - /// `merge` makes one front-pair's worth of progress and returns; the harness - /// re-ticks it, refilling a drained deque from its source, and itself handles an - /// exhausted source by flushing the other deque's verbatim tail — so `merge` needs - /// no `done` and never has to reason about end-of-input. - fn merge(in1: &mut VecDeque, in2: &mut VecDeque, out: &mut VecDeque); - - /// Partition the input by `frontier` into updates greater-or-equal it (`keep`) or - /// not (`ship`). One-shot: the whole of `input` is consumed. - /// - /// The lower envelope of the times routed to `keep` is folded into `residual`, so - /// the caller learns the frontier of data it still holds without a second pass. - /// Outputs are near-graded but not regraded; a terminal [`regrade`](Chunk::regrade) - /// zips up the seams. 
- fn extract( - input: &mut VecDeque, - frontier: &Antichain, - residual: &mut Antichain, - keep: &mut VecDeque, - ship: &mut VecDeque, - ); - - /// Advance times by `frontier`, consolidating each complete `(key, val)` group from - /// the front of `input` into `out`. - /// - /// A group is complete once a later `(key, val)` is seen, so every group but the - /// last is emitted; the last (which a future call might extend) is reformed and - /// `push_front`ed back as the withheld carry — unless `done`, which flushes it too. - /// The degenerate case is a single `(key, val)` spanning all available input: no - /// group is provably complete, so nothing is committed (the whole buffer is - /// withheld) until `done`. - fn advance( - input: &mut VecDeque, - frontier: &Antichain, - done: bool, - out: &mut VecDeque, - ); - - /// Reshape the front of `input` into a maximal packing in `out`: each chunk at most - /// [`TARGET`](Chunk::TARGET), and any two adjacent summing past `TARGET` (so no - /// neighbours could be combined). See [`is_graded`]. - /// - /// The terminal stage of every pipeline. A sub-`TARGET` carry that might still grow - /// is `push_front`ed back as the withheld remainder until `done`, which flushes it. - fn regrade( - input: &mut VecDeque, - done: bool, - out: &mut VecDeque, - ); - -} - -/// Merge two full chains of chunks into one, to completion, appending to `out`. -/// -/// The whole-chain (non-fueled) driver used by the batcher's -/// [`Merger`](crate::trace::implementations::merge_batcher::Merger): both chains are in -/// hand, so it ticks [`Chunk::merge`] until one deque empties, then appends the other's -/// remainder (the verbatim tail). Output is near-graded; callers regrade as needed. 
-pub fn merge_chains( - chain1: Vec, - chain2: Vec, - out: &mut VecDeque, -) { - let mut in1: VecDeque = chain1.into(); - let mut in2: VecDeque = chain2.into(); - while !in1.is_empty() && !in2.is_empty() { - C::merge(&mut in1, &mut in2, out); - } - // One deque is empty; the other's remainder is all greater than everything merged. - out.extend(in1.drain(..)); - out.extend(in2.drain(..)); -} - -/// A merge-batcher [`Merger`](crate::trace::implementations::merge_batcher::Merger) -/// over chains of [`Chunk`]s. -/// -/// `merge` runs the whole-chain binary merger; `extract` splits by the seal frontier -/// using [`Chunk::extract`]. The batcher consolidates equal `(data, time)` updates -/// but does *not* advance times — time advancement is advance's job, handled later in -/// the trace. Both regrade their output, since the batcher's chains want to be graded. -pub struct ChunkMerger { - _marker: std::marker::PhantomData, -} - -impl Default for ChunkMerger { - fn default() -> Self { Self { _marker: std::marker::PhantomData } } -} - -impl crate::trace::implementations::merge_batcher::Merger for ChunkMerger -where - C: Chunk + Default + 'static, - C::Time: Clone + timely::PartialOrder + 'static, -{ - type Chunk = C; - type Time = C::Time; - - fn merge( - &mut self, - list1: Vec, - list2: Vec, - output: &mut Vec, - _stash: &mut Vec, - ) { - let mut merged = VecDeque::new(); - merge_chains(list1, list2, &mut merged); - // No regrade: the batcher's ladder weighs chains by updates (not chunk count) - // since #767, so intermediate grading buys nothing; the final batch is graded - // at seal. merge's output is already near-`TARGET`. - output.extend(merged); - } - - fn extract( - &mut self, - merged: Vec, - upper: AntichainRef, - frontier: &mut Antichain, - ship: &mut Vec, - kept: &mut Vec, - _stash: &mut Vec, - ) { - // `extract` keeps updates greater-or-equal `upper` and ships the rest, - // folding the lower envelope of kept times into `frontier`. 
- let upper = upper.to_owned(); - let mut input: VecDeque = merged.into(); - let (mut keep, mut shipped) = (VecDeque::new(), VecDeque::new()); - C::extract(&mut input, &upper, frontier, &mut keep, &mut shipped); - // No regrade: `kept` is re-merged later and `shipped` is regraded at seal by - // the builder, so neither needs grading here. - kept.extend(keep); - ship.extend(shipped); - } - - fn len(chunk: &C) -> usize { chunk.len() } -} - -/// The merge batcher for chunks of type `C`, merging pre-chunked `C` runs. -/// -/// The batcher accepts already-formed `C` chunks via `PushInto` and merges them -/// through [`ChunkMerger`]; it holds no chunker. The `Input → C` bridge lives at the -/// `arrange_core` callsite, which supplies the chunker (e.g. [`ContainerChunker`] -/// for same-shape input, where `C` satisfies the batcher-side container traits -/// `SizableContainer`, `Consolidate`, `Container`, `PushInto`). -/// -/// [`ContainerChunker`]: crate::trace::implementations::chunker::ContainerChunker -pub type ChunkBatcher = crate::trace::implementations::merge_batcher::MergeBatcher>; - -/// A spine of `Rc`-shared [`ChunkBatch`]es of type `C`: the trace type for `arrange`. -pub type ChunkSpine = crate::trace::implementations::spine_fueled::Spine>>; - -/// A reference-counted [`ChunkBatch`] builder over chunks of type `C`. -pub type ChunkRcBuilder = crate::trace::rc_blanket_impls::RcBuilder>; - -/// A batch is just an ordered sequence of [`Chunk`]s plus its time description. -/// -/// The chunks are sorted and consolidated, with chunk boundaries arbitrary; the -/// concatenation of their contents is the batch. -/// -/// This is a full [`Batch`](crate::trace::Batch): [`ChunkBatchCursor`] reads -/// across the chunks (delegating to each chunk's own cursor and continuing past -/// boundaries), [`ChunkBatchMerger`] performs the resumable merge-and-advance, -/// and [`ChunkBuilder`] collects pre-sorted chunks. All of those are below. 
-pub struct ChunkBatch { - /// Ordered, consolidated chunks; their concatenation is the batch. - pub chunks: Vec, - /// The lower, upper, and since frontiers of the batch. - pub description: Description, - /// Per-chunk first and last key, and first and last val, parallel to `chunks`. - first_keys: KeyCon, - last_keys: KeyCon, - first_vals: ValCon, - last_vals: ValCon, -} - -impl ChunkBatch { - /// Assemble a batch from ordered chunks, building the per-chunk index. - pub fn new(chunks: Vec, description: Description) -> Self { - let n = chunks.len(); - let mut first_keys = >::with_capacity(n); - let mut last_keys = >::with_capacity(n); - let mut first_vals = >::with_capacity(n); - let mut last_vals = >::with_capacity(n); - for chunk in &chunks { - assert!(chunk.len() > 0, "ChunkBatch chunks must be non-empty"); - let ((fk, fv, _), (lk, lv, _)) = chunk.bounds(); - first_keys.push_ref(fk); - last_keys.push_ref(lk); - first_vals.push_ref(fv); - last_vals.push_ref(lv); - } - ChunkBatch { chunks, description, first_keys, last_keys, first_vals, last_vals } - } -} - -impl WithLayout for ChunkBatch { - type Layout = C::Layout; -} - -/// A cursor over a [`ChunkBatch`], merging the per-chunk cursors. -/// -/// Chunk breakpoints are unconstrained, so a single key — or `(key, val)` — may -/// straddle consecutive chunks. But the chunks are one globally-sorted sequence -/// merely cut at arbitrary points, so the operation is *concatenation*, never a -/// merge: across a boundary a key's vals concatenate and a `(key, val)`'s times -/// concatenate. The cursor exploits this. It holds the chunk currently being read -/// and a cursor into it; it seeks by binary-searching the per-chunk index on -/// `ChunkBatch`, and at boundaries it *continues* into the next chunk rather than -/// merging — using the index to detect when a key or `(key, val)` spills forward, -/// without touching chunk contents. 
-pub struct ChunkBatchCursor { - /// First chunk of the current key's run; where `rewind_vals` returns to. - key_chunk: usize, - /// Chunk currently being read; `>= key_chunk`, within the current key's span. - chunk: usize, - /// Cursor into `chunk`; `None` once `chunk` is past the last chunk. - inner: Option, -} - -impl WithLayout for ChunkBatchCursor { - type Layout = C::Layout; -} - -impl ChunkBatchCursor { - /// Move the active chunk to `c`, opening a fresh inner cursor at its start. - fn goto(&mut self, c: usize, storage: &ChunkBatch) { - self.chunk = c; - self.inner = storage.chunks.get(c).map(C::cursor); - } -} - -impl Cursor for ChunkBatchCursor { - type Storage = ChunkBatch; - - fn key_valid(&self, s: &Self::Storage) -> bool { self.chunk < s.chunks.len() && self.inner.as_ref().is_some_and(|i| i.key_valid(&s.chunks[self.chunk])) } - fn val_valid(&self, s: &Self::Storage) -> bool { self.chunk < s.chunks.len() && self.inner.as_ref().is_some_and(|i| i.val_valid(&s.chunks[self.chunk])) } - fn key<'a>(&self, s: &'a Self::Storage) -> Self::Key<'a> { self.inner.as_ref().unwrap().key(&s.chunks[self.chunk]) } - fn val<'a>(&self, s: &'a Self::Storage) -> Self::Val<'a> { self.inner.as_ref().unwrap().val(&s.chunks[self.chunk]) } - fn get_key<'a>(&self, s: &'a Self::Storage) -> Option> { if self.key_valid(s) { Some(self.key(s)) } else { None } } - fn get_val<'a>(&self, s: &'a Self::Storage) -> Option> { if self.val_valid(s) { Some(self.val(s)) } else { None } } - - fn map_times, Self::DiffGat<'_>)>(&mut self, s: &Self::Storage, mut logic: L) { - if !self.val_valid(s) { return; } - let (k, v) = (self.key(s), self.val(s)); - self.inner.as_mut().unwrap().map_times(&s.chunks[self.chunk], &mut logic); - // Follow the (key, val) forward across boundaries while it spills. 
- let mut c = self.chunk; - while c + 1 < s.chunks.len() - && s.last_keys.index(c) == k && s.first_keys.index(c + 1) == k - && s.last_vals.index(c) == v && s.first_vals.index(c + 1) == v - { - c += 1; - s.chunks[c].cursor().map_times(&s.chunks[c], &mut logic); - } - } - - fn step_key(&mut self, s: &Self::Storage) { - if !self.key_valid(s) { return; } - let n = s.chunks.len(); - let k = self.key(s); - // Advance to the last chunk the key spans. - while self.chunk + 1 < n && s.last_keys.index(self.chunk) == k && s.first_keys.index(self.chunk + 1) == k { - self.goto(self.chunk + 1, s); - } - // Step past the key within its last chunk. - { - let inner = self.inner.as_mut().unwrap(); - inner.seek_key(&s.chunks[self.chunk], k); - inner.step_key(&s.chunks[self.chunk]); - } - // If that exhausted the chunk, the next key (if any) starts the next chunk. - if !self.inner.as_ref().unwrap().key_valid(&s.chunks[self.chunk]) && self.chunk + 1 < n { - self.goto(self.chunk + 1, s); - } - self.key_chunk = self.chunk; - } - - fn seek_key(&mut self, s: &Self::Storage, key: Self::Key<'_>) { - let n = s.chunks.len(); - // First chunk whose last key is `>= key`: where `key`'s run begins. - let c = s.last_keys.advance(0, n, |x| { - as BatchContainer>::reborrow(x).lt(& as BatchContainer>::reborrow(key)) - }); - self.goto(c, s); - self.key_chunk = c; - if c < n { self.inner.as_mut().unwrap().seek_key(&s.chunks[c], key); } - } - - fn step_val(&mut self, s: &Self::Storage) { - if !self.val_valid(s) { return; } - let n = s.chunks.len(); - let (k, v) = (self.key(s), self.val(s)); - // Advance to the last chunk the (key, val) spans. - while self.chunk + 1 < n - && s.last_keys.index(self.chunk) == k && s.first_keys.index(self.chunk + 1) == k - && s.last_vals.index(self.chunk) == v && s.first_vals.index(self.chunk + 1) == v - { - self.goto(self.chunk + 1, s); - } - // Step past the (key, val) within that chunk. 
- self.inner.as_mut().unwrap().step_val(&s.chunks[self.chunk]); - // If the key's vals are exhausted here but the key spills, roll forward. - if !self.inner.as_ref().unwrap().val_valid(&s.chunks[self.chunk]) - && self.chunk + 1 < n && s.last_keys.index(self.chunk) == k && s.first_keys.index(self.chunk + 1) == k - { - self.goto(self.chunk + 1, s); - self.inner.as_mut().unwrap().seek_key(&s.chunks[self.chunk], k); - } - } - - fn seek_val(&mut self, s: &Self::Storage, val: Self::Val<'_>) { - if !self.key_valid(s) { return; } - let n = s.chunks.len(); - let k = self.key(s); - loop { - self.inner.as_mut().unwrap().seek_val(&s.chunks[self.chunk], val); - if self.inner.as_ref().unwrap().val_valid(&s.chunks[self.chunk]) { return; } - // Key's vals exhausted in this chunk; if the key spills, retry in the next. - if self.chunk + 1 < n && s.last_keys.index(self.chunk) == k && s.first_keys.index(self.chunk + 1) == k { - self.goto(self.chunk + 1, s); - self.inner.as_mut().unwrap().seek_key(&s.chunks[self.chunk], k); - } else { - return; - } - } - } - - fn rewind_keys(&mut self, s: &Self::Storage) { - self.key_chunk = 0; - self.goto(0, s); - } - - fn rewind_vals(&mut self, s: &Self::Storage) { - if !self.key_valid(s) { return; } - let k = self.key(s); - let kc = self.key_chunk; - self.goto(kc, s); - self.inner.as_mut().unwrap().seek_key(&s.chunks[kc], k); - } -} - -impl BatchReader for ChunkBatch { - type Cursor = ChunkBatchCursor; - fn cursor(&self) -> Self::Cursor { - ChunkBatchCursor { key_chunk: 0, chunk: 0, inner: self.chunks.first().map(C::cursor) } - } - fn len(&self) -> usize { self.chunks.iter().map(C::len).sum() } - fn description(&self) -> &Description { &self.description } -} - -impl Batch for ChunkBatch -where - C::Time: timely::progress::Timestamp + Lattice + Ord, -{ - type Merger = ChunkBatchMerger; - - fn empty(lower: Antichain, upper: Antichain) -> Self { - use timely::progress::Timestamp; - let since = Antichain::from_elem(Self::Time::minimum()); - 
ChunkBatch::new(Vec::new(), Description::new(lower, upper, since)) - } -} - -/// A merge of two [`ChunkBatch`]es in progress. -/// -/// This is the [`ChunkBatch`] merger, wired in as its -/// [`Batch::Merger`](crate::trace::Batch::Merger), and has that trait's -/// `new` / `work` / `done` shape. -/// -/// The merge is *resumable* and runs a two-stage deque pipeline: -/// [`merge`](Chunk::merge) feeds `merged`, [`advance`](Chunk::advance) consumes it -/// into `advanced`; the terminal [`regrade`](Chunk::regrade) runs once at `done`. Each -/// `work` step clones a burst from each source, ticks `merge` once, then advances the -/// fresh output, debiting `fuel` by the *merged* records that entered the pipe — the -/// total output across the merge, matching how the trace's other mergers account (cf. -/// `ord_neu`). The sources are read by *cloning* chunks (a cheap refcount bump per the -/// [`Chunk`] contract), never consumed or mutated; the same `source1`/`source2` must be -/// supplied on every call. When a source exhausts, the harness flushes the other's -/// verbatim tail one chunk per step. Once both are drained, a final `advance(done)` -/// flushes advance's withheld carry. -/// -/// **Latency bound.** `fuel` bounds each step to roughly one burst-merge's output. Two -/// things ride *outside* fuel: the terminal `advance(done)` and `done`'s `regrade`. In -/// the worst case — a single `(key, val)` spanning the whole merge — `advance` withholds -/// the entire group until `done`, then sorts and consolidates it in one unfueled step. -/// `vec_chunk` keeps that step *linear* in the group (it accumulates the carry in place, -/// reusing its storage), so it is not the quadratic blow-up of an earlier design, but it -/// is one unbounded-latency step bounded by the largest single `(key, val)` group. A -/// chunk impl must keep this flush linear; the latency claimed is "per step ≈ a burst, -/// plus a final flush ≤ the largest group." 
-pub struct ChunkBatchMerger { - /// Compaction frontier supplied at construction. - frontier: Antichain, - /// Result frontiers, retained for the output description. - lower: Antichain, - upper: Antichain, - /// Input deques, refilled from the sources (clones) head-of-list at a time. - in1: VecDeque, - in2: VecDeque, - /// Next source chunk to clone into `in1` / `in2`. - idx1: usize, - idx2: usize, - /// `advance`'s input: the merge output plus advance's withheld carry at the front. - merged: VecDeque, - /// `advance`'s output: the merged-and-advanced chunks, grown by `work`. - advanced: VecDeque, - /// Set once both sources are drained and advance's final flush has run. - complete: bool, -} - -impl crate::trace::Merger> for ChunkBatchMerger -where - C: Chunk + Default + 'static, - C::Time: timely::progress::Timestamp + Lattice + Ord + 'static, -{ - /// Begin merging `source1` and `source2`, advancing to `frontier`. - fn new(source1: &ChunkBatch, source2: &ChunkBatch, frontier: AntichainRef) -> Self { - let lower = source1.description.lower().meet(source2.description.lower()); - let upper = source1.description.upper().join(source2.description.upper()); - Self { - frontier: frontier.to_owned(), - lower, - upper, - in1: VecDeque::new(), - in2: VecDeque::new(), - idx1: 0, - idx2: 0, - merged: VecDeque::new(), - advanced: VecDeque::new(), - complete: false, - } - } - - /// Advance the merge by up to `fuel` updates, suspending when it runs out. - fn work(&mut self, source1: &ChunkBatch, source2: &ChunkBatch, fuel: &mut isize) { - if self.complete { return; } - - while *fuel > 0 { - // Refill each input deque up to a burst of source chunks (clones); `merge` - // drains the loaded burst per call. 
The burst trades fuel granularity (a - // call does up to a burst's work before checking fuel) against re-pruning: - // a chunk that straddles many chunks on the other side is walked by index - // within one call but, once its tail spills past the loaded burst, its - // unconsumed suffix is pushed back and re-copied next call — a bigger burst - // absorbs more straddle per call. This workload is insensitive (1..32 flat - // to ~noise at 1M), so 8 is a conservative default, not a tuned optimum. - // After this, a deque is non-empty iff its source still has data. - const BURST: usize = 8; - while self.in1.len() < BURST && self.idx1 < source1.chunks.len() { - self.in1.push_back(source1.chunks[self.idx1].clone()); - self.idx1 += 1; - } - while self.in2.len() < BURST && self.idx2 < source2.chunks.len() { - self.in2.push_back(source2.chunks[self.idx2].clone()); - self.idx2 += 1; - } - - // Merge's per-tick output (a burst's worth, or one tail chunk), measured - // for fuel before it joins the carry already in `merged`. - let mut produced = VecDeque::new(); - if !self.in1.is_empty() && !self.in2.is_empty() { - // Both sides have data: drain the loaded burst. - C::merge(&mut self.in1, &mut self.in2, &mut produced); - } else if let Some(chunk) = self.in1.pop_front().or_else(|| self.in2.pop_front()) { - // Exactly one side has data: flush its verbatim tail, one chunk a step. - produced.push_back(chunk); - } else { - // Both sources drained: final flush of advance's withheld carry. - C::advance(&mut self.merged, &self.frontier, true, &mut self.advanced); - self.complete = true; - break; - } - - let work: usize = produced.iter().map(C::len).sum(); - self.merged.extend(produced); - C::advance(&mut self.merged, &self.frontier, false, &mut self.advanced); - *fuel -= work as isize; - } - } - - /// Extract the merged batch over `[lower, upper)` advanced to the frontier. 
- /// - /// Only valid once `work` has driven the merge to completion (left `fuel` - /// positive), as the [`trace::Merger`](crate::trace::Merger) contract requires. - fn done(self) -> ChunkBatch { - let description = Description::new(self.lower, self.upper, self.frontier); - ChunkBatch::new(regrade_all(self.advanced), description) - } -} - -/// A [`Builder`](crate::trace::Builder) that collects pre-sorted chunks into a -/// [`ChunkBatch`]. -/// -/// The builder assumes its inputs arrive already sorted and consolidated (as the -/// `Builder` contract requires), so it does no merging: each pushed chunk is an -/// ordered run, fed straight to [`regrade`](Chunk::regrade) as it arrives — so a batch -/// built here is graded like one produced by the merger, rather than inheriting -/// whatever chunk sizes the caller happened to push. -pub struct ChunkBuilder { - /// Pushed chunks awaiting regrading; holds regrade's sub-`TARGET` carry at the front. - input: VecDeque, - /// The graded chunks emitted so far. - output: VecDeque, -} - -impl crate::trace::Builder for ChunkBuilder -where - C: Chunk + Default + 'static, - C::Time: timely::progress::Timestamp, -{ - type Input = C; - type Time = C::Time; - type Output = ChunkBatch; - - fn with_capacity(_keys: usize, _vals: usize, _upds: usize) -> Self { - Self { input: VecDeque::new(), output: VecDeque::new() } - } - - fn push(&mut self, chunk: &mut C) { - let chunk = std::mem::take(chunk); - if chunk.len() > 0 { - self.input.push_back(chunk); - C::regrade(&mut self.input, false, &mut self.output); - } - } - - fn done(self, description: Description) -> ChunkBatch { - let ChunkBuilder { mut input, mut output } = self; - C::regrade(&mut input, true, &mut output); - ChunkBatch::new(output.into(), description) - } - - fn seal(chain: &mut Vec, description: Description) -> ChunkBatch { - // The chain is sorted and consolidated but not necessarily graded; regrade it. 
- // Already-`TARGET` chunks pass through as cheap `Rc` moves, so a chain that - // arrives graded (as the batcher's does) pays only an O(#chunks) walk. - ChunkBatch::new(regrade_all(std::mem::take(chain)), description) - } -} - -pub mod vec_chunk { - //! A worked [`Chunk`] implementation: `Vec<((K, V), T, R)>` behind an `Rc`. - //! - //! This is the reference example — a next implementor (e.g. columnar) follows - //! its *shape*, not its layout. It shows the two integration points any chunk - //! type satisfies, and how leaning on the parent module's generic harnesses - //! keeps the code terse: - //! - //! * **Batcher side.** The merge batcher's `ContainerChunker` builds chunks, so - //! the type implements timely's container traits (`Accountable`, - //! `SizableContainer`, `Consolidate`, `PushInto`). Here they delegate to the - //! inner `Vec` via `Rc::make_mut` — free while a chunk is being built - //! (refcount 1), and it never copies a *shared* chunk because batches are - //! immutable once built. - //! * **Trace side.** [`Chunk`] (merge / extract / advance / regrade / bounds) - //! plus a cursor. Key lookups are logarithmic by galloping search (`seek_*`), - //! independent of chunk size; stepping stays linear (short hops). - //! - //! `Clone` is a refcount bump, so the trace merger shares source chunks instead - //! of copying them. - //! - //! **What a columnar impl can and can't reuse.** The protocol (the `VecDeque` - //! in/out, withhold-by-`push_front`, grade-at-seal) is layout-agnostic and carries - //! over unchanged. The *merge body* does not: this one merges a single contiguous - //! `&[((K,V),T,R)]` and bulk-copies disjoint runs with `extend_from_slice` + - //! `chunks(TARGET)`. A columnar chunk (ranging over `ord_neu`'s deduped layout) has - //! no such slice — it must range-copy the key / val / time / diff columns with - //! offset bookkeeping, emitting one key + its val/time run rather than repeated rows. - //! 
That is the operation that beats the flat layout on repetitive keys (see the - //! module-level note on the row-major vs. columnar crossover), and it is also where - //! the earlier `col_chunk` got into trouble (decompress-and-recompress instead of a - //! true range-copy). So a columnar `Chunk` is the open bet: nothing here exercises a - //! columnar merge, and that body — not the protocol — is the phase-2 risk. - - use std::collections::VecDeque; - use std::marker::PhantomData; - use std::rc::Rc; - - use timely::Accountable; - use timely::container::{PushInto, SizableContainer}; - use timely::progress::{Antichain, Timestamp}; - - use crate::consolidation::Consolidate; - use crate::difference::Semigroup; - use crate::lattice::Lattice; - use crate::trace::cursor::Cursor; - use crate::trace::implementations::{Vector, WithLayout}; - - use super::Chunk; - - /// The chunk size: both the maximum updates per chunk and the coalescing - /// threshold (see [`Chunk::TARGET`]). Chosen for the reference impl; exposed as - /// the associated const below, and used internally for buffer sizing. - const TARGET: usize = 1024; - - /// A sorted, consolidated run of `((key, val), time, diff)`, shared via `Rc`. - pub struct VecChunk(Rc>); - - impl Clone for VecChunk { - fn clone(&self) -> Self { VecChunk(Rc::clone(&self.0)) } - } - impl Default for VecChunk { - fn default() -> Self { VecChunk(Rc::new(Vec::new())) } - } - - /// The trace type for `arrange`: a spine of `Rc`-shared chunk batches. - pub type ChunkSpine = super::ChunkSpine>; - /// Merge batcher over `VecChunk`s. Unordered `Vec<((K, V), T, R)>` input is - /// consolidated into sorted `VecChunk`s by a `ContainerChunker` supplied - /// at the `arrange_core` callsite (it drives the container-trait impls below); the - /// batcher itself only merges the resulting chunks. - pub type ChunkBatcher = super::ChunkBatcher>; - /// Reference-counted batch builder. 
- pub type ChunkRcBuilder = super::ChunkRcBuilder>; - - // --- batcher side: timely container traits, delegating to the inner `Vec` --- - - impl Accountable for VecChunk { - fn record_count(&self) -> i64 { self.0.len() as i64 } - } - - impl SizableContainer for VecChunk - where K: Clone+'static, V: Clone+'static, T: Clone+'static, R: Clone+'static { - // The absorb point is the grading target: the chunker fills a scratch chunk - // to `TARGET` updates before emitting, so chunks arrive pre-graded rather than - // at timely's byte-derived buffer size (which downstream regrading re-melds). - fn at_capacity(&self) -> bool { self.0.len() >= TARGET } - fn ensure_capacity(&mut self, _stash: &mut Option) { - let inner = Rc::make_mut(&mut self.0); - inner.reserve(TARGET.saturating_sub(inner.len())); - } - } - - impl Consolidate for VecChunk - where K: Ord+Clone+'static, V: Ord+Clone+'static, T: Ord+Clone+'static, R: Semigroup+'static { - fn len(&self) -> usize { self.0.len() } - fn clear(&mut self) { Rc::make_mut(&mut self.0).clear() } - fn consolidate_into(&mut self, target: &mut Self) { - Rc::make_mut(&mut self.0).consolidate_into(Rc::make_mut(&mut target.0)); - } - } - - impl PushInto<((K, V), T, R)> for VecChunk - where K: Clone+'static, V: Clone+'static, T: Clone+'static, R: Clone+'static { - fn push_into(&mut self, item: ((K, V), T, R)) { Rc::make_mut(&mut self.0).push(item); } - } - - // --- trace side: a logarithmic cursor and the `Chunk` operations --- - - /// First index `>= start` at which `pred` turns false, by galloping (exponential) - /// search. `pred` must hold for a prefix then not — i.e. `|u| u < target`. - /// O(log distance), so O(1) for short hops and logarithmic for long ones. 
- fn gallop(s: &[U], start: usize, pred: impl Fn(&U) -> bool) -> usize { - let mut pos = start; - if pos < s.len() && pred(&s[pos]) { - let mut step = 1; - while pos + step < s.len() && pred(&s[pos + step]) { pos += step; step <<= 1; } - step >>= 1; - while step > 0 { - if pos + step < s.len() && pred(&s[pos + step]) { pos += step; } - step >>= 1; - } - pos += 1; - } - pos - } - - /// A cursor over a [`VecChunk`], tracking the current key and `(key, val)` - /// group starts as indices into the flat vector. - pub struct VecChunkCursor { - key_pos: usize, - val_pos: usize, - phantom: PhantomData<(K, V, T, R)>, - } - - impl WithLayout for VecChunk - where K: Ord+Clone+'static, V: Ord+Clone+'static, T: Lattice+Timestamp, R: Ord+Semigroup+'static { - type Layout = Vector<((K, V), T, R)>; - } - - impl WithLayout for VecChunkCursor - where K: Ord+Clone+'static, V: Ord+Clone+'static, T: Lattice+Timestamp, R: Ord+Semigroup+'static { - type Layout = Vector<((K, V), T, R)>; - } - - impl Cursor for VecChunkCursor - where K: Ord+Clone+'static, V: Ord+Clone+'static, T: Lattice+Timestamp, R: Ord+Semigroup+'static { - type Storage = VecChunk; - - fn key_valid(&self, s: &Self::Storage) -> bool { self.key_pos < s.0.len() } - fn val_valid(&self, s: &Self::Storage) -> bool { - self.key_pos < s.0.len() && self.val_pos < s.0.len() && s.0[self.val_pos].0.0 == s.0[self.key_pos].0.0 - } - fn key<'a>(&self, s: &'a Self::Storage) -> &'a K { &s.0[self.key_pos].0.0 } - fn val<'a>(&self, s: &'a Self::Storage) -> &'a V { &s.0[self.val_pos].0.1 } - fn get_key<'a>(&self, s: &'a Self::Storage) -> Option<&'a K> { - if self.key_valid(s) { Some(self.key(s)) } else { None } - } - fn get_val<'a>(&self, s: &'a Self::Storage) -> Option<&'a V> { - if self.val_valid(s) { Some(self.val(s)) } else { None } - } - fn map_times(&mut self, s: &Self::Storage, mut logic: L) { - if !self.val_valid(s) { return; } - let kv = &s.0[self.val_pos].0; - let mut i = self.val_pos; - while i < s.0.len() && &s.0[i].0 == kv { - 
logic(&s.0[i].1, &s.0[i].2); - i += 1; - } - } - fn step_key(&mut self, s: &Self::Storage) { - // Linear: stepping is a short hop to the next group; an inlined scan - // beats a gallop call for the common small-group case. - if self.key_pos >= s.0.len() { return; } - let key = s.0[self.key_pos].0.0.clone(); - let mut i = self.key_pos; - while i < s.0.len() && s.0[i].0.0 == key { i += 1; } - self.key_pos = i; - self.val_pos = i; - } - fn seek_key(&mut self, s: &Self::Storage, key: &K) { - // Logarithmic: O(log distance), independent of chunk size. - self.key_pos = gallop(&s.0, self.key_pos, |u| &u.0.0 < key); - self.val_pos = self.key_pos; - } - fn step_val(&mut self, s: &Self::Storage) { - if !self.val_valid(s) { return; } - let kv = s.0[self.val_pos].0.clone(); - let mut i = self.val_pos; - while i < s.0.len() && s.0[i].0 == kv { i += 1; } - self.val_pos = i; - } - fn seek_val(&mut self, s: &Self::Storage, val: &V) { - if !self.key_valid(s) { return; } - let key = s.0[self.key_pos].0.0.clone(); - self.val_pos = gallop(&s.0, self.val_pos, |u| (&u.0.0, &u.0.1) < (&key, val)); - } - fn rewind_keys(&mut self, _s: &Self::Storage) { self.key_pos = 0; self.val_pos = 0; } - fn rewind_vals(&mut self, _s: &Self::Storage) { self.val_pos = self.key_pos; } - } - - /// Take the `Vec` out of a chunk, copying only if the `Rc` is shared. 
- fn take(chunk: VecChunk) -> Vec<((K, V), T, R)> { - Rc::try_unwrap(chunk.0).unwrap_or_else(|rc| (*rc).clone()) - } - - impl Chunk for VecChunk - where K: Ord+Clone+'static, V: Ord+Clone+'static, T: Lattice+Timestamp, R: Ord+Semigroup+'static { - type Cursor = VecChunkCursor; - - const TARGET: usize = TARGET; - - fn cursor(&self) -> Self::Cursor { - VecChunkCursor { key_pos: 0, val_pos: 0, phantom: PhantomData } - } - - fn bounds(&self) -> ((&K, &V, &T), (&K, &V, &T)) { - let s = &self.0[..]; - let (f, l) = (&s[0], &s[s.len() - 1]); - ((&f.0.0, &f.0.1, &f.1), (&l.0.0, &l.0.1, &l.1)) - } - - fn len(&self) -> usize { self.0.len() } - - /// A two-pointer binary merge that drains the two deques' *loaded* content - /// through their shared horizon — the lesser of the two deques' last loaded - /// `(key, val, time)`s — rather than one front-pair at a time. Consolidates - /// equal triples and bulk-copies disjoint runs as slices, walking across chunk - /// boundaries with local indices (`p1`/`p2`) that reset as each working chunk - /// is retired. The side owning the horizon drains fully; the other's partial - /// working chunk is pruned (its prefix dropped) and `push_front`ed back exactly - /// once at the yield boundary — so the per-call prune cost amortizes over the - /// whole burst the harness loaded, not over each chunk. - fn merge(in1: &mut VecDeque, in2: &mut VecDeque, out: &mut VecDeque) { - fn kv(u: &((K, V), T, R)) -> (&K, &V) { (&u.0.0, &u.0.1) } - - let mut result: Vec<((K, V), T, R)> = Vec::with_capacity(TARGET); - let mut flush = |result: &mut Vec<((K, V), T, R)>, force: bool| { - if result.len() >= TARGET || (force && !result.is_empty()) { - out.push_back(VecChunk(Rc::new(std::mem::replace(result, Vec::with_capacity(TARGET))))); - } - }; - - // Working chunks (the shared `Rc`, read by index — never `take`n, so a - // source clone is not deep-copied) and their positions; both deques are - // non-empty on entry. 
The guard keeps both cursors valid for indexing; a - // working chunk consumed mid-merge is refilled at the foot of the loop, and - // when a deque runs dry we stop — that side has presented all its loaded - // data, so its last triple is the horizon and the rest is left for next time. - let mut c1 = in1.pop_front().unwrap(); - let mut c2 = in2.pop_front().unwrap(); - let (mut p1, mut p2) = (0usize, 0usize); - while p1 < c1.0.len() && p2 < c2.0.len() { - let a = &c1.0[p1]; - let b = &c2.0[p2]; - match (kv(a), &a.1).cmp(&(kv(b), &b.1)) { - // Copy the run of one side strictly below the other's head (within - // the current working chunk): collisions are impossible within it, - // so it moves as slices cut at the grading target. - std::cmp::Ordering::Less => { - let run = gallop(&c1.0[..], p1 + 1, |u| (kv(u), &u.1) < (kv(b), &b.1)); - for piece in c1.0[p1..run].chunks(TARGET) { - result.extend_from_slice(piece); - flush(&mut result, false); - } - p1 = run; - } - std::cmp::Ordering::Greater => { - let run = gallop(&c2.0[..], p2 + 1, |u| (kv(u), &u.1) < (kv(a), &a.1)); - for piece in c2.0[p2..run].chunks(TARGET) { - result.extend_from_slice(piece); - flush(&mut result, false); - } - p2 = run; - } - std::cmp::Ordering::Equal => { - let mut diff = a.2.clone(); - diff.plus_equals(&b.2); - if !diff.is_zero() { - result.push((a.0.clone(), a.1.clone(), diff)); - } - p1 += 1; - p2 += 1; - flush(&mut result, false); - } - } - // Refill either working chunk consumed by the step above; stop the drain - // once a deque is exhausted (the `&&` guard then never re-enters). 
- if p1 == c1.0.len() { - match in1.pop_front() { Some(c) => { c1 = c; p1 = 0; } None => break } - } - if p2 == c2.0.len() { - match in2.pop_front() { Some(c) => { c2 = c; p2 = 0; } None => break } - } - } - flush(&mut result, true); - // One side's deque emptied with its working chunk exhausted; the other's - // working chunk is partial — push back just its unconsumed suffix (one copy - // per call), ahead of whatever loaded chunks remain in that deque. - if p1 < c1.0.len() { in1.push_front(VecChunk(Rc::new(c1.0[p1..].to_vec()))); } - if p2 < c2.0.len() { in2.push_front(VecChunk(Rc::new(c2.0[p2..].to_vec()))); } - } - - fn extract( - input: &mut VecDeque, - frontier: &Antichain, - residual: &mut Antichain, - keep: &mut VecDeque, - ship: &mut VecDeque, - ) { - // Fill `TARGET`-sized buffers directly, so the chunks pushed are already - // graded and `regrade` passes them through as `Rc` moves rather than - // re-splitting (and re-copying) a monolithic chunk. Emptied input `Vec`s - // are recycled as the next buffers, so allocations balance input against - // output instead of one fresh buffer per emitted chunk. - let mut stash: Vec> = Vec::new(); - let take_buf = |stash: &mut Vec<_>| stash.pop().unwrap_or_default(); - let (mut k, mut s) = (take_buf(&mut stash), take_buf(&mut stash)); - for chunk in input.drain(..) { - let mut v = take(chunk); - for u in v.drain(..) 
{ - if frontier.borrow().less_equal(&u.1) { - residual.insert_ref(&u.1); - k.push(u); - if k.len() >= TARGET { keep.push_back(VecChunk(Rc::new(std::mem::replace(&mut k, take_buf(&mut stash))))); } - } else { - s.push(u); - if s.len() >= TARGET { ship.push_back(VecChunk(Rc::new(std::mem::replace(&mut s, take_buf(&mut stash))))); } - } - } - stash.push(v); - } - if !k.is_empty() { keep.push_back(VecChunk(Rc::new(k))); } - if !s.is_empty() { ship.push_back(VecChunk(Rc::new(s))); } - } - - fn advance( - input: &mut VecDeque, - frontier: &Antichain, - done: bool, - out: &mut VecDeque, - ) { - // Advance and consolidate every *complete* `(key, val)` group eagerly, - // so its updates can be released as soon as the input proves no later - // time for the pair can arrive. A group is contiguous in the sorted - // chain, so the only one that might continue in a future call is the last; - // unless `done`, we process up to its start and `push_front` the rest as - // the withheld carry for the next call. - let mut stash: Vec> = Vec::new(); - // Build the working buffer by *reusing the front chunk's storage* (the - // carry from last time) and appending the rest (recycling each emptied - // `Vec`). Reusing the front is what keeps a withheld group from being - // recopied across calls: it just accumulates in place, so a `(key, val)` - // larger than the working set costs O(total) over the run, not O(total²). - let mut buf = match input.pop_front() { Some(chunk) => take(chunk), None => return }; - while let Some(chunk) = input.pop_front() { - let mut v = take(chunk); - buf.append(&mut v); - stash.push(v); - } - if buf.is_empty() { return; } - - // If every available update shares one `(key, val)`, no group is provably - // complete — a later call may extend it — so make no progress unless - // `done`: push the accumulated buffer back as the carry and return. 
This is - // the giant-key case; comparing only the first and last pair detects it - // without scanning, and reusing the front above makes the retention free. - if !done && buf[0].0 == buf[buf.len() - 1].0 { - input.push_front(VecChunk(Rc::new(buf))); - return; - } - - // Otherwise at least the first group is complete. Withhold the last group - // (a single `(key, val)`) as the next carry unless the input is complete. - let end = if done { buf.len() } else { - let last_kv = buf[buf.len() - 1].0.clone(); - let mut start = buf.len(); - while start > 0 && buf[start - 1].0 == last_kv { start -= 1; } - start - }; - if end < buf.len() { - input.push_front(VecChunk(Rc::new(buf.split_off(end)))); - } - // Advance + consolidate each group into `TARGET`-sized output chunks, - // filling buffers reclaimed from the recycled `Vec`s. - let mut result = stash.pop().unwrap_or_default(); - let mut i = 0; - while i < buf.len() { - let mut j = i; - while j < buf.len() && buf[j].0 == buf[i].0 { j += 1; } - for u in &mut buf[i..j] { u.1.advance_by(frontier.borrow()); } - // Advancing is monotone w.r.t. the lattice but not the - // representation's total order, so re-sort the group by time. - buf[i..j].sort_by(|a, b| a.1.cmp(&b.1)); - let mut k = i; - while k < j { - let kv = buf[k].0.clone(); - let t = buf[k].1.clone(); - let mut diff = buf[k].2.clone(); - k += 1; - while k < j && buf[k].1 == t { diff.plus_equals(&buf[k].2); k += 1; } - if !diff.is_zero() { - result.push((kv, t, diff)); - if result.len() >= TARGET { out.push_back(VecChunk(Rc::new(std::mem::replace(&mut result, stash.pop().unwrap_or_default())))); } - } - } - i = j; - } - if !result.is_empty() { out.push_back(VecChunk(Rc::new(result))); } - } - - fn regrade(input: &mut VecDeque, done: bool, out: &mut VecDeque) { - // Maximal packing: emit chunks as large as possible up to `TARGET`, - // never splitting a pair that could combine into one legal (`<= TARGET`) - // chunk. 
A chunk of exactly `TARGET` is maximal — it cannot grow — so it - // passes straight through as an `Rc` move; only sub-`TARGET` chunks are - // copied, and only to coalesce with a neighbour. Producers fill to - // `TARGET`, so in steady state every chunk passes through and only the - // occasional trailing partial is coalesced. - // - // `carry` is the (sub-`TARGET`) chunk under construction. It is flushed - // once it reaches `TARGET`, `push_front`ed back onto `input` between calls, - // or emitted on `done`. Whenever `carry` is non-empty its left neighbour in - // `out` is a `TARGET` chunk (or `carry` is `out`'s first chunk), so - // emitting `carry` against a neighbour it cannot merge with — their sum - // exceeds `TARGET` — keeps the packing maximal on both sides. - let mut carry: Vec<((K, V), T, R)> = Vec::new(); - while let Some(chunk) = input.pop_front() { - if carry.is_empty() { - absorb(chunk, &mut carry, out); - } else if carry.len() + chunk.0.len() <= TARGET { - // Combines into one legal chunk; coalesce in place. - carry.extend(take(chunk)); - if carry.len() == TARGET { - out.push_back(VecChunk(Rc::new(std::mem::take(&mut carry)))); - } - } else { - // Cannot combine without exceeding `TARGET`; `carry` is maximal - // against this neighbour, so emit it and absorb the chunk afresh. - out.push_back(VecChunk(Rc::new(std::mem::take(&mut carry)))); - absorb(chunk, &mut carry, out); - } - } - if !carry.is_empty() { - let chunk = VecChunk(Rc::new(carry)); - if done { out.push_back(chunk); } else { input.push_front(chunk); } - } - } - } - - /// Emit maximal `TARGET`-sized chunks off the front of `carry`, leaving the - /// sub-`TARGET` tail behind. 
- fn peel( - carry: &mut Vec<((K, V), T, R)>, - out: &mut VecDeque>, - ) { - let mut start = 0; - while carry.len() - start >= TARGET { - out.push_back(VecChunk(Rc::new(carry[start..start + TARGET].to_vec()))); - start += TARGET; - } - carry.drain(..start); - } - - /// Absorb a chunk when nothing is carried: pass a `TARGET` chunk through as an - /// `Rc` move, hold a smaller one in `carry`, or split a larger one (peeling off - /// `TARGET` pieces and carrying the remainder). `carry` must be empty on entry. - fn absorb( - chunk: VecChunk, - carry: &mut Vec<((K, V), T, R)>, - out: &mut VecDeque>, - ) { - use std::cmp::Ordering::{Equal, Greater, Less}; - match chunk.0.len().cmp(&TARGET) { - Equal => out.push_back(chunk), - Less => *carry = take(chunk), - Greater => { *carry = take(chunk); peel(carry, out); } - } - } - - #[cfg(test)] - mod test { - use std::collections::VecDeque; - use super::{Chunk, VecChunk}; - use crate::trace::chunk::merge_chains; - use std::rc::Rc; - - fn chunk(updates: Vec<((u64, u64), u64, i64)>) -> VecChunk { - VecChunk(Rc::new(updates)) - } - - // Flatten a chunk sequence back to its update stream. - fn flat>>(chunks: I) -> Vec<((u64, u64), u64, i64)> { - chunks.into_iter().flat_map(|c| (*c.0).clone()).collect() - } - - // `extract` partitions by frontier and folds the kept frontier into `residual`; - // a terminal `regrade` then grades each side (the seams of near-graded output). - #[test] - fn extract_partitions_and_grades() { - use super::TARGET; - use crate::trace::chunk::{is_graded, regrade_all}; - use timely::progress::Antichain; - - // 4·TARGET updates spread over many input chunks; even times ship - // (< frontier), odd times keep (>= frontier), so both sides straddle. 
- let n = 4 * TARGET as u64; - let mut input: VecDeque<_> = (0..n).map(|i| chunk(vec![((i, 0), i % 2, 1)])).collect(); - let frontier = Antichain::from_elem(1u64); - let mut residual = Antichain::new(); - let (mut keep, mut ship) = (VecDeque::new(), VecDeque::new()); - VecChunk::extract(&mut input, &frontier, &mut residual, &mut keep, &mut ship); - let (keep, ship) = (regrade_all(keep), regrade_all(ship)); - - // Kept times are exactly {1}; that is the residual frontier. - assert_eq!(residual, Antichain::from_elem(1u64)); - // Both sides are graded after the regrade. - assert!(is_graded(&keep), "ungraded keep: {:?}", keep.iter().map(Chunk::len).collect::>()); - assert!(is_graded(&ship), "ungraded ship: {:?}", ship.iter().map(Chunk::len).collect::>()); - // Nothing lost: half the updates each way. - assert_eq!(keep.iter().map(Chunk::len).sum::(), n as usize / 2); - assert_eq!(ship.iter().map(Chunk::len).sum::(), n as usize / 2); - } - - // `advance` advances and consolidates complete `(key, val)` groups eagerly, - // pushing the (possibly-growing) last group back as the carry when not `done`. - #[test] - fn advance_emits_complete_groups_eagerly() { - use timely::progress::Antichain; - - let frontier = Antichain::from_elem(5u64); - // Group (0,0) is complete within this chunk; group (1,0) might still grow. - let c0 = chunk(vec![((0, 0), 0, 1), ((0, 0), 1, 1), ((1, 0), 0, 1)]); - let mut input: VecDeque<_> = VecDeque::from([c0]); - let mut out = VecDeque::new(); - VecChunk::advance(&mut input, &frontier, false, &mut out); - - // The trailing group (1,0) is withheld as the carry at the front of `input`. - assert_eq!(input.len(), 1); - assert_eq!(Chunk::len(&input[0]), 1); - // Group (0,0)'s times {0,1} advanced to 5 and consolidated, emitted now. 
- assert_eq!(flat(out), vec![((0, 0), 5, 2)]); - } - - // Streaming the input one chunk at a time must yield exactly what a single - // all-at-once flush does — the resumable path is just the one-shot path cut - // at group boundaries. - #[test] - fn advance_resumable_matches_oneshot() { - use timely::progress::Antichain; - - let frontier = Antichain::from_elem(3u64); - // Groups span chunk boundaries and carry several times each. - let input = || vec![ - chunk(vec![((0, 0), 0, 1), ((0, 0), 1, 1), ((1, 0), 0, 1)]), - chunk(vec![((1, 0), 5, 1), ((1, 1), 0, 1), ((2, 0), 0, 1)]), - chunk(vec![((2, 0), 2, 1), ((2, 0), 9, 1)]), - ]; - - let oneshot = { - let mut q: VecDeque<_> = input().into(); - let mut out = VecDeque::new(); - VecChunk::advance(&mut q, &frontier, false, &mut out); - VecChunk::advance(&mut q, &frontier, true, &mut out); - flat(out) - }; - let incremental = { - let mut q = VecDeque::new(); - let mut out = VecDeque::new(); - for c in input() { q.push_back(c); VecChunk::advance(&mut q, &frontier, false, &mut out); } - VecChunk::advance(&mut q, &frontier, true, &mut out); - flat(out) - }; - assert_eq!(oneshot, incremental); - // Times are advanced: nothing below the frontier survives. - for u in &oneshot { assert!(u.1 >= 3); } - } - - // A single `(key, val)` whose updates span every pushed chunk: `advance` - // can make no progress until `done`, accumulating in the carry in place. - // It must still produce the right advanced+consolidated result at the end. 
- #[test] - fn advance_single_key_spanning_pushes() { - use timely::progress::Antichain; - - let frontier = Antichain::from_elem(100u64); - let n = 50u64; - let make = || (0..n).map(|t| chunk(vec![((7u64, 0u64), t, 1i64)])).collect::>(); - - let mut q = VecDeque::new(); - let mut out = VecDeque::new(); - for c in make() { q.push_back(c); VecChunk::advance(&mut q, &frontier, false, &mut out); } - VecChunk::advance(&mut q, &frontier, true, &mut out); - // All times advance to 100 and consolidate to one update of diff `n`. - assert_eq!(flat(out), vec![((7u64, 0u64), 100u64, n as i64)]); - } - - #[test] - fn merge_chains_consolidates() { - let a = chunk(vec![((0, 0), 0, 1), ((1, 0), 0, 1)]); - let b = chunk(vec![((0, 0), 0, 1), ((2, 0), 0, 1)]); - let mut out = VecDeque::new(); - merge_chains(vec![a], vec![b], &mut out); - assert_eq!(flat(out), vec![((0, 0), 0, 2), ((1, 0), 0, 1), ((2, 0), 0, 1)]); - } - - // Merging runs larger than `TARGET`, then regrading, yields a *graded* sequence - // (each chunk `<= TARGET`, adjacent pairs summing past `TARGET`) reproducing the - // consolidated sorted contents. - #[test] - fn merge_emits_graded_chunks() { - use super::TARGET; - use crate::trace::chunk::{is_graded, merge_chains, regrade_all}; - - // Two interleaving single-chunk chains: evens and odds over `0..4·TARGET`. - let n = 4 * TARGET as u64; - let evens = chunk((0..n).step_by(2).map(|k| ((k, 0), 0, 1)).collect()); - let odds = chunk((0..n).step_by(2).map(|k| ((k + 1, 0), 0, 1)).collect()); - - let mut out = VecDeque::new(); - merge_chains(vec![evens], vec![odds], &mut out); - let chunks = regrade_all(out); - - assert!(is_graded(&chunks), "merge output not graded: {:?}", - chunks.iter().map(Chunk::len).collect::>()); - // Contents are exactly the sorted keys `0..4·TARGET`, each once. 
- let want: Vec<_> = (0..n).map(|k| ((k, 0u64), 0u64, 1i64)).collect(); - assert_eq!(flat(chunks), want); - } - - // Property test: merging two *multi-chunk* chains (driven through `merge` by - // `merge_chains`) reproduces the union of all updates, consolidated. Tiny - // chunks force `(key, val)` groups — which can span several times — to - // straddle chunk boundaries on both sides, exercising the refill path the - // single-chunk merge tests never reach. The independent oracle is - // `consolidate_updates` over the concatenation. - #[test] - fn merge_matches_reference() { - use crate::trace::chunk::merge_chains; - use crate::consolidation::consolidate_updates; - - // Deterministic xorshift PRNG — no dev-dependency on `rand`. - let mut seed = 0x2545F4914F6CDD1Du64; - let mut rng = move || { seed ^= seed << 13; seed ^= seed >> 7; seed ^= seed << 17; seed }; - - // A sorted, consolidated update set over a small (key, val, time) space, - // so the two chains collide and a `(key, val)` carries several times. - fn gen(rng: &mut impl FnMut() -> u64, n: usize) -> Vec<((u64, u64), u64, i64)> { - let mut v: Vec<((u64, u64), u64, i64)> = (0..n).map(|_| { - let k = rng() % 20; let val = rng() % 3; let t = rng() % 8; - let d = if rng() % 4 == 0 { -1 } else { 1 }; - ((k, val), t, d) - }).collect(); - consolidate_updates(&mut v); - v - } - // Split a consolidated set into a chain of small chunks (each sorted and - // consolidated; together globally sorted), so groups straddle boundaries. 
- fn chain(updates: &[((u64, u64), u64, i64)], sz: usize) -> Vec> { - updates.chunks(sz).map(|c| VecChunk(Rc::new(c.to_vec()))).collect() - } - - for _ in 0..300 { - let n1 = (rng() as usize % 60) + 1; - let u1 = gen(&mut rng, n1); - let n2 = (rng() as usize % 60) + 1; - let u2 = gen(&mut rng, n2); - if u1.is_empty() || u2.is_empty() { continue; } - let sz = (rng() as usize % 5) + 1; // tiny chunks → heavy straddling - - let mut out = VecDeque::new(); - merge_chains(chain(&u1, sz), chain(&u2, sz), &mut out); - let merged = flat(out); - - let mut reference: Vec<_> = u1.iter().chain(u2.iter()).cloned().collect(); - consolidate_updates(&mut reference); - - assert_eq!(merged, reference, "chunk size {sz}\n u1={u1:?}\n u2={u2:?}"); - } - } - - // `regrade` must produce a *maximal packing*: adjacent sub-`TARGET` chunks - // that could combine into one legal chunk are coalesced, full chunks pass - // through as `Rc` moves, and contents are preserved exactly. - #[test] - fn regrade_maximal_packing() { - use super::TARGET; - use crate::trace::chunk::is_graded; - - // A mix of small and full chunks with distinct, increasing keys (so the - // concatenation is sorted and nothing consolidates away). - let t = TARGET; - let sizes = [t / 3, t / 3, t / 3, t, t / 2, t / 2, t, 1, t - 1]; - let total: usize = sizes.iter().sum(); - let mut key = 0u64; - let mut input = VecDeque::new(); - let mut output = VecDeque::new(); - for &s in &sizes { - let updates: Vec<_> = (0..s).map(|_| { let k = key; key += 1; ((k, 0u64), 0u64, 1i64) }).collect(); - input.push_back(chunk(updates)); - VecChunk::regrade(&mut input, false, &mut output); - } - VecChunk::regrade(&mut input, true, &mut output); - let chunks: Vec<_> = output.into(); - - assert!(is_graded(&chunks), "not graded: {:?}", - chunks.iter().map(Chunk::len).collect::>()); - // Nothing lost, and the keys stay strictly sorted across the new breaks. 
- let got: Vec<_> = chunks.into_iter().flat_map(|c| (*c.0).clone()).collect(); - assert_eq!(got.len(), total); - assert!(got.windows(2).all(|w| w[0].0.0 < w[1].0.0)); - } - - // The indexed cursor must reconstruct the same grouped updates as a flat - // reference, even when a key — and a `(key, val)`'s times — straddle a - // chunk boundary. - #[test] - fn cursor_handles_straddle() { - use crate::trace::cursor::Cursor; - use crate::trace::{BatchReader, Description}; - use crate::trace::chunk::ChunkBatch; - use timely::progress::Antichain; - - let chunks = vec![ - chunk(vec![((0, 0), 0, 1), ((1, 0), 0, 1), ((1, 1), 0, 1)]), - chunk(vec![((1, 1), 1, 1), ((1, 2), 0, 1)]), - chunk(vec![((2, 0), 0, 1)]), - ]; - let desc = Description::new( - Antichain::from_elem(0u64), - Antichain::from_elem(2u64), - Antichain::from_elem(0u64), - ); - let batch = ChunkBatch::new(chunks, desc); - - let mut cursor = batch.cursor(); - let got = cursor.to_vec(&batch, |k| *k, |v| *v); - let want = vec![ - ((0u64, 0u64), vec![(0u64, 1i64)]), - ((1, 0), vec![(0, 1)]), - ((1, 1), vec![(0, 1), (1, 1)]), - ((1, 2), vec![(0, 1)]), - ((2, 0), vec![(0, 1)]), - ]; - assert_eq!(got, want); - } - - // Isolated: gallop vs linear forward-seek over one big chunk, for sparse to - // dense probe sets. 
Run: cargo test seek_microbench -- --ignored --nocapture - #[test] - #[ignore] - fn seek_microbench() { - use std::time::Instant; - use std::hint::black_box; - use super::gallop; - let n = 1_000_000u64; - let data: Vec<((u64, ()), u64, isize)> = (0..n).map(|k| ((3 * k, ()), 0u64, 1isize)).collect(); - for probes in [100u64, 10_000, 1_000_000] { - let targets: Vec = (0..probes).map(|i| 3 * (i * n / probes)).collect(); - let best = |f: &dyn Fn() -> u64| { - let mut b = std::time::Duration::MAX; - for _ in 0..5 { let t = Instant::now(); black_box(f()); b = b.min(t.elapsed()); } - b - }; - let data = black_box(&data[..]); - let g = best(&|| { - let (mut pos, mut acc) = (0usize, 0u64); - for &tgt in &targets { pos = gallop(data, pos, |u| u.0.0 < tgt); acc += pos as u64; } - acc - }); - let l = best(&|| { - let (mut pos, mut acc) = (0usize, 0u64); - for &tgt in &targets { while pos < data.len() && data[pos].0.0 < tgt { pos += 1; } acc += pos as u64; } - acc - }); - eprintln!("probes={probes:>7}: gallop={g:>12?} linear={l:>12?}"); - } - } - } -} diff --git a/differential-dataflow/src/trace/chunk/mod.rs b/differential-dataflow/src/trace/chunk/mod.rs new file mode 100644 index 000000000..b1473114a --- /dev/null +++ b/differential-dataflow/src/trace/chunk/mod.rs @@ -0,0 +1,746 @@ +//! Sorted, consolidated runs of updates, and operators over sequences of them. +//! +//! A [`Chunk`] is a consolidated, sorted run of `(data, time, diff)` updates. +//! Chunks live in sequences (`Vec`) with no constraint on where the +//! breakpoints between them fall; each chunk holds at most [`Chunk::TARGET`] +//! updates. The trait deliberately exposes only batch-level operations — merge, +//! extract, advance — leaving the layout-aware work to the implementor. The +//! orchestration in this module (the binary merger) is generic over the layout +//! and concerns itself only with feeding chunks across calls. +//! +//! # Why chunks, and why one size +//! +//! 
A batch could be a single monolithic sorted run. We cut it into chunks because +//! the chunk is simultaneously the unit of four things, each of which wants a size +//! bound: +//! +//! * **Suspendable work.** The fueled merger does a chunk's-worth of work per step +//! and checks fuel at the boundary, so chunk size bounds a step's latency. +//! * **Immutable sharing.** Chunks are `Rc`-shared; the merger reads its sources by +//! *cloning* chunks (a refcount bump). The chunk is the finest granularity of sharing. +//! * **Allocation recycling.** Emptied input buffers are reused as output buffers; +//! that only composes if buffers are roughly one size. +//! * **Indexing.** [`ChunkBatch`] indexes chunks by their first/last key, and the +//! cursor binary-searches *over* chunks then gallops *within* one. The chunk +//! count (≈ `len / TARGET`) sets the outer index size and search depth. +//! +//! So the size bound pulls two ways: an upper bound (latency, memory) says "not too +//! big," and a lower bound (per-chunk overhead, index bloat) says "not too +//! fragmented." Keeping chunks one size is what lets a single knob satisfy both. +//! The grading invariant ([`is_graded`]) encodes exactly this: every chunk is at +//! most `TARGET`, and every *adjacent pair* sums to more than `TARGET` — i.e. no two +//! neighbours could be combined into one legal chunk. That makes `TARGET` both the +//! maximum size and the coalescing threshold (the invariant is self-similar), and +//! a graded sequence a *maximal packing*: as few chunks as the maximum allows. +//! +//! The intent is for a `Chunk` implementation to be each of: +//! +//! 1. the containers a `Collection` can transit. +//! 2. the containers a `MergeBatcher` can work with. +//! 3. the containers a `Batch` can be backed by. +//! +//! It does this by exposing a small set of chunk-oriented primitives, which are +//! sufficient for harnesses for each of these tasks. 
+ +use std::collections::VecDeque; + +use timely::progress::Antichain; +use timely::progress::frontier::AntichainRef; +use crate::lattice::Lattice; +use crate::trace::{Batch, BatchReader, Description}; +use crate::trace::cursor::Cursor; +use crate::trace::implementations::{BatchContainer, Layout, LayoutExt, WithLayout}; + +/// The key container of chunk `C`'s layout. Named via the `Layout` projection so +/// it unifies with the cursor's `Self::Key`, which also projects through `Layout`. +type KeyCon = <::Layout as Layout>::KeyContainer; +/// The val container of chunk `C`'s layout. +type ValCon = <::Layout as Layout>::ValContainer; + +/// Whether `chunks` satisfy the [`Chunk::TARGET`] grading invariant: every chunk +/// at most `TARGET`, and every adjacent pair summing to more than `TARGET` (so no +/// two neighbours could be combined into one legal chunk — a *maximal packing*). +/// +/// This is the post-[`regrade`](Chunk::regrade) shape; useful as a test/debug check. +pub fn is_graded(chunks: &[C]) -> bool { + chunks.iter().all(|c| c.len() <= C::TARGET) + && chunks.windows(2).all(|w| w[0].len() + w[1].len() > C::TARGET) +} + +/// Regrade `input` to completion into a fresh graded `Vec` (see [`Chunk::regrade`]). +/// +/// A convenience for the one-shot callers (batch sealing, the batcher's merge and +/// extract) that have a whole sequence in hand and want it graded; the streaming +/// callers drive [`Chunk::regrade`] directly across ticks. +pub fn regrade_all(input: impl IntoIterator) -> Vec { + let mut input: VecDeque = input.into_iter().collect(); + let mut out = VecDeque::new(); + C::regrade(&mut input, true, &mut out); + debug_assert!(input.is_empty()); + out.into() +} + +/// A consolidated, sorted sequence of `(data, time, diff)`. +/// +/// Chunks exist in sequences, with no constraints on the breakpoints between +/// them. 
Each holds at most [`TARGET`](Chunk::TARGET) updates; a graded sequence +/// is a maximal packing at that size (see [`is_graded`] and the module docs). +/// +/// `Clone` is expected to be cheap — a refcount bump on shared backing storage, +/// not a deep copy. The trace merger relies on this to read its (shared, +/// immutable) source batches by cloning chunks rather than consuming them. +/// +/// A chunk *has* a [`Cursor`] over its own `(key, val, time, diff)` contents — +/// the chunk is its own cursor `Storage`, mirroring [`BatchReader`]. This is what +/// lets a batch cursor delegate downward: the batch indexes which chunk holds a +/// key (reusing the chunk's `KeyContainer` / `ValContainer` for boundaries) and +/// then reads through that chunk's cursor. We do not provide this; the opaque +/// chunk implementor does. +/// +/// # The transducer protocol +/// +/// The four chunk-producing operations ([`merge`](Chunk::merge), +/// [`extract`](Chunk::extract), [`advance`](Chunk::advance), +/// [`regrade`](Chunk::regrade)) are all *stream transducers* over `VecDeque`, +/// sharing one calling convention so an implementor learns it once: +/// +/// * **Consume from the front.** Read chunks off the front of the input deque(s). +/// * **Withhold by pushing back.** Anything consumed but not yet safe to commit +/// (advance's still-growing last group; regrade's sub-`TARGET` carry; merge's +/// partially-consumed front) is reformed into a single owned chunk and +/// `push_front`ed back onto its input. The only cross-call state is therefore the +/// deques themselves — clean owned runs, no indices escape a call. +/// * **Commit by appending.** Append committed chunks to the output deque; once +/// appended they are written and a downstream stage may take them immediately. 
+/// * **`done` forces the flush.** The unary stages take `done: bool`; while it is +/// false they may withhold, and a call that appends nothing has yielded — the +/// harness will not call again until more input arrives or `done` flips true. On +/// `done` the stage must drain its withheld state (the harness keeps calling +/// until the output stops growing). +/// +/// Two operations vary only where their job demands it: [`merge`](Chunk::merge) is +/// binary (and the harness, not `merge`, handles a drained input by flushing the +/// other side's verbatim tail, so `merge` needs no `done`); [`extract`](Chunk::extract) +/// is the one-shot splitter (it drains its whole input, so it needs no `done` and +/// has two outputs plus a residual frontier). +/// +/// Implementors are further expected to: +/// +/// * **Emit near-graded output.** Fill `TARGET`-sized output chunks directly rather +/// than emitting one monolithic chunk; the terminal [`regrade`](Chunk::regrade) +/// only has to coalesce the trailing partials at the seams. Grading is a +/// *seal-time* property, not an invariant maintained between stages. +/// * **Recycle where possible.** Reuse the storage of chunks drained from the input +/// as the buffers for output, so allocations balance input against output rather +/// than allocating afresh per emitted chunk. `vec_chunk` is the worked example: it +/// fills buffers reclaimed from a stash of emptied input `Vec`s, and advance reuses +/// its withheld carry's storage in place so a giant key stays linear, not quadratic. +/// +/// [`BatchReader`]: crate::trace::BatchReader +pub trait Chunk: Sized + Clone + LayoutExt { + + /// The chunk size: both the maximum updates per chunk and the coalescing + /// threshold. + /// + /// A *graded* sequence (the post-[`regrade`](Chunk::regrade) shape) has every + /// chunk of length at most `TARGET`, and every adjacent pair summing to more + /// than `TARGET` — so no two neighbours could be combined into one legal chunk. 
+ /// Equivalently, a maximal packing at size `TARGET`. [`is_graded`] checks + /// exactly this. The value is the implementor's tuning knob: larger means fewer + /// chunks (smaller index, less per-chunk overhead) but coarser merge-suspension + /// granularity and a larger within-chunk seek. Required, not defaulted: the + /// right value is layout-dependent, so every implementor chooses it deliberately. + const TARGET: usize; + + /// A cursor navigating this chunk's contents; the chunk is its storage. + /// + /// The layout aliases are spelled out (mirroring [`BatchReader`]) so the + /// cursor's `Key`/`Val`/`Time`/`Diff` and their containers are *definitionally* + /// equal to the chunk's — without this the compiler won't connect the cursor's + /// layout to the chunk's when reading through it. + type Cursor: + Cursor + + WithLayout + + for<'a> LayoutExt< + Key<'a> = Self::Key<'a>, + Val<'a> = Self::Val<'a>, + ValOwn = Self::ValOwn, + Time = Self::Time, + TimeGat<'a> = Self::TimeGat<'a>, + Diff = Self::Diff, + DiffGat<'a> = Self::DiffGat<'a>, + KeyContainer = Self::KeyContainer, + ValContainer = Self::ValContainer, + TimeContainer = Self::TimeContainer, + DiffContainer = Self::DiffContainer, + >; + + /// Acquire a cursor over this chunk. + fn cursor(&self) -> Self::Cursor; + + /// The first and last `(key, val, time)` triples in the chunk. + /// + /// The chunk must be non-empty (batch chunks always are). Expected to be + /// cheap — the chunk's endpoints, e.g. columnar indices `0` and `len - 1`, + /// not a cursor walk. Indexing a batch's chunks rests on this: the last + /// triples drive a binary search to a key or `(key, val)`, and comparing one + /// chunk's last triple against the next chunk's first detects keys or + /// `(key, val)` pairs that straddle the boundary — all without touching chunk + /// contents. Returned by reference (no owned key type exists in the layout); + /// the index materializes them into its own containers. 
+ fn bounds(&self) -> ( + (Self::Key<'_>, Self::Val<'_>, Self::TimeGat<'_>), + (Self::Key<'_>, Self::Val<'_>, Self::TimeGat<'_>), + ); + + /// The number of updates in the chunk. + /// + /// Chunks are always non-empty (`len() > 0`): producers drop empties before + /// they reach a chunk sequence, and [`ChunkBatch::new`] asserts the invariant. + fn len(&self) -> usize; + + /// Merge the fronts of two input deques through their shared horizon. + /// + /// Both deques are non-empty (the caller guarantees it). The two front chunks + /// merge through updates present in both — up to the least last `(key, val, time)` + /// triple across them — consolidating collisions and emitting committed chunks to + /// `out`. The side owning the horizon is fully consumed and `pop_front`ed; the + /// other's partially-consumed front is reformed (its consumed prefix dropped) and + /// `push_front`ed back. So on return at least one deque has had its front retired. + /// + /// `merge` makes one front-pair's worth of progress and returns; the harness + /// re-ticks it, refilling a drained deque from its source, and itself handles an + /// exhausted source by flushing the other deque's verbatim tail — so `merge` needs + /// no `done` and never has to reason about end-of-input. + fn merge(in1: &mut VecDeque, in2: &mut VecDeque, out: &mut VecDeque); + + /// Partition the input by `frontier` into updates greater-or-equal it (`keep`) or + /// not (`ship`). One-shot: the whole of `input` is consumed. + /// + /// The lower envelope of the times routed to `keep` is folded into `residual`, so + /// the caller learns the frontier of data it still holds without a second pass. + /// Outputs are near-graded but not regraded; a terminal [`regrade`](Chunk::regrade) + /// zips up the seams. 
+ fn extract( + input: &mut VecDeque, + frontier: &Antichain, + residual: &mut Antichain, + keep: &mut VecDeque, + ship: &mut VecDeque, + ); + + /// Advance times by `frontier`, consolidating each complete `(key, val)` group from + /// the front of `input` into `out`. + /// + /// A group is complete once a later `(key, val)` is seen, so every group but the + /// last is emitted; the last (which a future call might extend) is reformed and + /// `push_front`ed back as the withheld carry — unless `done`, which flushes it too. + /// The degenerate case is a single `(key, val)` spanning all available input: no + /// group is provably complete, so nothing is committed (the whole buffer is + /// withheld) until `done`. + fn advance( + input: &mut VecDeque, + frontier: &Antichain, + done: bool, + out: &mut VecDeque, + ); + + /// Reshape the front of `input` into a maximal packing in `out`: each chunk at most + /// [`TARGET`](Chunk::TARGET), and any two adjacent summing past `TARGET` (so no + /// neighbours could be combined). See [`is_graded`]. + /// + /// The terminal stage of every pipeline. A sub-`TARGET` carry that might still grow + /// is `push_front`ed back as the withheld remainder until `done`, which flushes it. + fn regrade( + input: &mut VecDeque, + done: bool, + out: &mut VecDeque, + ); + +} + +/// Merge two full chains of chunks into one, to completion, appending to `out`. +/// +/// The whole-chain (non-fueled) driver used by the batcher's +/// [`Merger`](crate::trace::implementations::merge_batcher::Merger): both chains are in +/// hand, so it ticks [`Chunk::merge`] until one deque empties, then appends the other's +/// remainder (the verbatim tail). Output is near-graded; callers regrade as needed. 
+pub fn merge_chains( + chain1: Vec, + chain2: Vec, + out: &mut VecDeque, +) { + let mut in1: VecDeque = chain1.into(); + let mut in2: VecDeque = chain2.into(); + while !in1.is_empty() && !in2.is_empty() { + C::merge(&mut in1, &mut in2, out); + } + // One deque is empty; the other's remainder is all greater than everything merged. + out.extend(in1.drain(..)); + out.extend(in2.drain(..)); +} + +/// A merge-batcher [`Merger`](crate::trace::implementations::merge_batcher::Merger) +/// over chains of [`Chunk`]s. +/// +/// `merge` runs the whole-chain binary merger; `extract` splits by the seal frontier +/// using [`Chunk::extract`]. The batcher consolidates equal `(data, time)` updates +/// but does *not* advance times — time advancement is advance's job, handled later in +/// the trace. Both regrade their output, since the batcher's chains want to be graded. +pub struct ChunkMerger { + _marker: std::marker::PhantomData, +} + +impl Default for ChunkMerger { + fn default() -> Self { Self { _marker: std::marker::PhantomData } } +} + +impl crate::trace::implementations::merge_batcher::Merger for ChunkMerger +where + C: Chunk + Default + 'static, + C::Time: Clone + timely::PartialOrder + 'static, +{ + type Chunk = C; + type Time = C::Time; + + fn merge( + &mut self, + list1: Vec, + list2: Vec, + output: &mut Vec, + _stash: &mut Vec, + ) { + let mut merged = VecDeque::new(); + merge_chains(list1, list2, &mut merged); + // No regrade: the batcher's ladder weighs chains by updates (not chunk count) + // since #767, so intermediate grading buys nothing; the final batch is graded + // at seal. merge's output is already near-`TARGET`. + output.extend(merged); + } + + fn extract( + &mut self, + merged: Vec, + upper: AntichainRef, + frontier: &mut Antichain, + ship: &mut Vec, + kept: &mut Vec, + _stash: &mut Vec, + ) { + // `extract` keeps updates greater-or-equal `upper` and ships the rest, + // folding the lower envelope of kept times into `frontier`. 
+ let upper = upper.to_owned(); + let mut input: VecDeque = merged.into(); + let (mut keep, mut shipped) = (VecDeque::new(), VecDeque::new()); + C::extract(&mut input, &upper, frontier, &mut keep, &mut shipped); + // No regrade: `kept` is re-merged later and `shipped` is regraded at seal by + // the builder, so neither needs grading here. + kept.extend(keep); + ship.extend(shipped); + } + + fn len(chunk: &C) -> usize { chunk.len() } +} + +/// The merge batcher for chunks of type `C`, merging pre-chunked `C` runs. +/// +/// The batcher accepts already-formed `C` chunks via `PushInto` and merges them +/// through [`ChunkMerger`]; it holds no chunker. The `Input → C` bridge lives at the +/// `arrange_core` callsite, which supplies the chunker (e.g. [`ContainerChunker`] +/// for same-shape input, where `C` satisfies the batcher-side container traits +/// `SizableContainer`, `Consolidate`, `Container`, `PushInto`). +/// +/// [`ContainerChunker`]: crate::trace::implementations::chunker::ContainerChunker +pub type ChunkBatcher = crate::trace::implementations::merge_batcher::MergeBatcher>; + +/// A spine of `Rc`-shared [`ChunkBatch`]es of type `C`: the trace type for `arrange`. +pub type ChunkSpine = crate::trace::implementations::spine_fueled::Spine>>; + +/// A reference-counted [`ChunkBatch`] builder over chunks of type `C`. +pub type ChunkRcBuilder = crate::trace::rc_blanket_impls::RcBuilder>; + +/// A batch is just an ordered sequence of [`Chunk`]s plus its time description. +/// +/// The chunks are sorted and consolidated, with chunk boundaries arbitrary; the +/// concatenation of their contents is the batch. +/// +/// This is a full [`Batch`](crate::trace::Batch): [`ChunkBatchCursor`] reads +/// across the chunks (delegating to each chunk's own cursor and continuing past +/// boundaries), [`ChunkBatchMerger`] performs the resumable merge-and-advance, +/// and [`ChunkBuilder`] collects pre-sorted chunks. All of those are below. 
+pub struct ChunkBatch { + /// Ordered, consolidated chunks; their concatenation is the batch. + pub chunks: Vec, + /// The lower, upper, and since frontiers of the batch. + pub description: Description, + /// Per-chunk first and last key, and first and last val, parallel to `chunks`. + first_keys: KeyCon, + last_keys: KeyCon, + first_vals: ValCon, + last_vals: ValCon, +} + +impl ChunkBatch { + /// Assemble a batch from ordered chunks, building the per-chunk index. + pub fn new(chunks: Vec, description: Description) -> Self { + let n = chunks.len(); + let mut first_keys = >::with_capacity(n); + let mut last_keys = >::with_capacity(n); + let mut first_vals = >::with_capacity(n); + let mut last_vals = >::with_capacity(n); + for chunk in &chunks { + assert!(chunk.len() > 0, "ChunkBatch chunks must be non-empty"); + let ((fk, fv, _), (lk, lv, _)) = chunk.bounds(); + first_keys.push_ref(fk); + last_keys.push_ref(lk); + first_vals.push_ref(fv); + last_vals.push_ref(lv); + } + ChunkBatch { chunks, description, first_keys, last_keys, first_vals, last_vals } + } +} + +impl WithLayout for ChunkBatch { + type Layout = C::Layout; +} + +/// A cursor over a [`ChunkBatch`], merging the per-chunk cursors. +/// +/// Chunk breakpoints are unconstrained, so a single key — or `(key, val)` — may +/// straddle consecutive chunks. But the chunks are one globally-sorted sequence +/// merely cut at arbitrary points, so the operation is *concatenation*, never a +/// merge: across a boundary a key's vals concatenate and a `(key, val)`'s times +/// concatenate. The cursor exploits this. It holds the chunk currently being read +/// and a cursor into it; it seeks by binary-searching the per-chunk index on +/// `ChunkBatch`, and at boundaries it *continues* into the next chunk rather than +/// merging — using the index to detect when a key or `(key, val)` spills forward, +/// without touching chunk contents. 
+pub struct ChunkBatchCursor { + /// First chunk of the current key's run; where `rewind_vals` returns to. + key_chunk: usize, + /// Chunk currently being read; `>= key_chunk`, within the current key's span. + chunk: usize, + /// Cursor into `chunk`; `None` once `chunk` is past the last chunk. + inner: Option, +} + +impl WithLayout for ChunkBatchCursor { + type Layout = C::Layout; +} + +impl ChunkBatchCursor { + /// Move the active chunk to `c`, opening a fresh inner cursor at its start. + fn goto(&mut self, c: usize, storage: &ChunkBatch) { + self.chunk = c; + self.inner = storage.chunks.get(c).map(C::cursor); + } +} + +impl Cursor for ChunkBatchCursor { + type Storage = ChunkBatch; + + fn key_valid(&self, s: &Self::Storage) -> bool { self.chunk < s.chunks.len() && self.inner.as_ref().is_some_and(|i| i.key_valid(&s.chunks[self.chunk])) } + fn val_valid(&self, s: &Self::Storage) -> bool { self.chunk < s.chunks.len() && self.inner.as_ref().is_some_and(|i| i.val_valid(&s.chunks[self.chunk])) } + fn key<'a>(&self, s: &'a Self::Storage) -> Self::Key<'a> { self.inner.as_ref().unwrap().key(&s.chunks[self.chunk]) } + fn val<'a>(&self, s: &'a Self::Storage) -> Self::Val<'a> { self.inner.as_ref().unwrap().val(&s.chunks[self.chunk]) } + fn get_key<'a>(&self, s: &'a Self::Storage) -> Option> { if self.key_valid(s) { Some(self.key(s)) } else { None } } + fn get_val<'a>(&self, s: &'a Self::Storage) -> Option> { if self.val_valid(s) { Some(self.val(s)) } else { None } } + + fn map_times, Self::DiffGat<'_>)>(&mut self, s: &Self::Storage, mut logic: L) { + if !self.val_valid(s) { return; } + let (k, v) = (self.key(s), self.val(s)); + self.inner.as_mut().unwrap().map_times(&s.chunks[self.chunk], &mut logic); + // Follow the (key, val) forward across boundaries while it spills. 
+ let mut c = self.chunk; + while c + 1 < s.chunks.len() + && s.last_keys.index(c) == k && s.first_keys.index(c + 1) == k + && s.last_vals.index(c) == v && s.first_vals.index(c + 1) == v + { + c += 1; + s.chunks[c].cursor().map_times(&s.chunks[c], &mut logic); + } + } + + fn step_key(&mut self, s: &Self::Storage) { + if !self.key_valid(s) { return; } + let n = s.chunks.len(); + let k = self.key(s); + // Advance to the last chunk the key spans. + while self.chunk + 1 < n && s.last_keys.index(self.chunk) == k && s.first_keys.index(self.chunk + 1) == k { + self.goto(self.chunk + 1, s); + } + // Step past the key within its last chunk. + { + let inner = self.inner.as_mut().unwrap(); + inner.seek_key(&s.chunks[self.chunk], k); + inner.step_key(&s.chunks[self.chunk]); + } + // If that exhausted the chunk, the next key (if any) starts the next chunk. + if !self.inner.as_ref().unwrap().key_valid(&s.chunks[self.chunk]) && self.chunk + 1 < n { + self.goto(self.chunk + 1, s); + } + self.key_chunk = self.chunk; + } + + fn seek_key(&mut self, s: &Self::Storage, key: Self::Key<'_>) { + let n = s.chunks.len(); + // First chunk whose last key is `>= key`: where `key`'s run begins. + let c = s.last_keys.advance(0, n, |x| { + as BatchContainer>::reborrow(x).lt(& as BatchContainer>::reborrow(key)) + }); + self.goto(c, s); + self.key_chunk = c; + if c < n { self.inner.as_mut().unwrap().seek_key(&s.chunks[c], key); } + } + + fn step_val(&mut self, s: &Self::Storage) { + if !self.val_valid(s) { return; } + let n = s.chunks.len(); + let (k, v) = (self.key(s), self.val(s)); + // Advance to the last chunk the (key, val) spans. + while self.chunk + 1 < n + && s.last_keys.index(self.chunk) == k && s.first_keys.index(self.chunk + 1) == k + && s.last_vals.index(self.chunk) == v && s.first_vals.index(self.chunk + 1) == v + { + self.goto(self.chunk + 1, s); + } + // Step past the (key, val) within that chunk. 
+ self.inner.as_mut().unwrap().step_val(&s.chunks[self.chunk]); + // If the key's vals are exhausted here but the key spills, roll forward. + if !self.inner.as_ref().unwrap().val_valid(&s.chunks[self.chunk]) + && self.chunk + 1 < n && s.last_keys.index(self.chunk) == k && s.first_keys.index(self.chunk + 1) == k + { + self.goto(self.chunk + 1, s); + self.inner.as_mut().unwrap().seek_key(&s.chunks[self.chunk], k); + } + } + + fn seek_val(&mut self, s: &Self::Storage, val: Self::Val<'_>) { + if !self.key_valid(s) { return; } + let n = s.chunks.len(); + let k = self.key(s); + loop { + self.inner.as_mut().unwrap().seek_val(&s.chunks[self.chunk], val); + if self.inner.as_ref().unwrap().val_valid(&s.chunks[self.chunk]) { return; } + // Key's vals exhausted in this chunk; if the key spills, retry in the next. + if self.chunk + 1 < n && s.last_keys.index(self.chunk) == k && s.first_keys.index(self.chunk + 1) == k { + self.goto(self.chunk + 1, s); + self.inner.as_mut().unwrap().seek_key(&s.chunks[self.chunk], k); + } else { + return; + } + } + } + + fn rewind_keys(&mut self, s: &Self::Storage) { + self.key_chunk = 0; + self.goto(0, s); + } + + fn rewind_vals(&mut self, s: &Self::Storage) { + if !self.key_valid(s) { return; } + let k = self.key(s); + let kc = self.key_chunk; + self.goto(kc, s); + self.inner.as_mut().unwrap().seek_key(&s.chunks[kc], k); + } +} + +impl BatchReader for ChunkBatch { + type Cursor = ChunkBatchCursor; + fn cursor(&self) -> Self::Cursor { + ChunkBatchCursor { key_chunk: 0, chunk: 0, inner: self.chunks.first().map(C::cursor) } + } + fn len(&self) -> usize { self.chunks.iter().map(C::len).sum() } + fn description(&self) -> &Description { &self.description } +} + +impl Batch for ChunkBatch +where + C::Time: timely::progress::Timestamp + Lattice + Ord, +{ + type Merger = ChunkBatchMerger; + + fn empty(lower: Antichain, upper: Antichain) -> Self { + use timely::progress::Timestamp; + let since = Antichain::from_elem(Self::Time::minimum()); + 
ChunkBatch::new(Vec::new(), Description::new(lower, upper, since)) + } +} + +/// A merge of two [`ChunkBatch`]es in progress. +/// +/// This is the [`ChunkBatch`] merger, wired in as its +/// [`Batch::Merger`](crate::trace::Batch::Merger), and has that trait's +/// `new` / `work` / `done` shape. +/// +/// The merge is *resumable* and runs a two-stage deque pipeline: +/// [`merge`](Chunk::merge) feeds `merged`, [`advance`](Chunk::advance) consumes it +/// into `advanced`; the terminal [`regrade`](Chunk::regrade) runs once at `done`. Each +/// `work` step clones a burst from each source, ticks `merge` once, then advances the +/// fresh output, debiting `fuel` by the *merged* records that entered the pipe — the +/// total output across the merge, matching how the trace's other mergers account (cf. +/// `ord_neu`). The sources are read by *cloning* chunks (a cheap refcount bump per the +/// [`Chunk`] contract), never consumed or mutated; the same `source1`/`source2` must be +/// supplied on every call. When a source exhausts, the harness flushes the other's +/// verbatim tail one chunk per step. Once both are drained, a final `advance(done)` +/// flushes advance's withheld carry. +/// +/// **Latency bound.** `fuel` bounds each step to roughly one burst-merge's output. Two +/// things ride *outside* fuel: the terminal `advance(done)` and `done`'s `regrade`. In +/// the worst case — a single `(key, val)` spanning the whole merge — `advance` withholds +/// the entire group until `done`, then sorts and consolidates it in one unfueled step. +/// `vec_chunk` keeps that step *linear* in the group (it accumulates the carry in place, +/// reusing its storage), so it is not the quadratic blow-up of an earlier design, but it +/// is one unbounded-latency step bounded by the largest single `(key, val)` group. A +/// chunk impl must keep this flush linear; the latency claimed is "per step ≈ a burst, +/// plus a final flush ≤ the largest group." 
pub struct ChunkBatchMerger<C: Chunk> {
    /// Compaction frontier supplied at construction.
    frontier: Antichain<C::Time>,
    /// Result frontiers, retained for the output description.
    lower: Antichain<C::Time>,
    upper: Antichain<C::Time>,
    /// Input deques, refilled from the sources (clones) head-of-list at a time.
    in1: VecDeque<C>,
    in2: VecDeque<C>,
    /// Next source chunk to clone into `in1` / `in2`.
    idx1: usize,
    idx2: usize,
    /// `advance`'s input: the merge output plus advance's withheld carry at the front.
    merged: VecDeque<C>,
    /// `advance`'s output: the merged-and-advanced chunks, grown by `work`.
    advanced: VecDeque<C>,
    /// Set once both sources are drained and advance's final flush has run.
    complete: bool,
}

impl<C> crate::trace::Merger<ChunkBatch<C>> for ChunkBatchMerger<C>
where
    C: Chunk + Default + 'static,
    C::Time: timely::progress::Timestamp + Lattice + Ord + 'static,
{
    /// Begin merging `source1` and `source2`, advancing to `frontier`.
    ///
    /// The result's lower bound is the meet of the sources' lowers and its upper
    /// bound the join of their uppers. No chunks are read until `work` is called.
    fn new(source1: &ChunkBatch<C>, source2: &ChunkBatch<C>, frontier: AntichainRef<C::Time>) -> Self {
        let lower = source1.description.lower().meet(source2.description.lower());
        let upper = source1.description.upper().join(source2.description.upper());
        Self {
            frontier: frontier.to_owned(),
            lower,
            upper,
            in1: VecDeque::new(),
            in2: VecDeque::new(),
            idx1: 0,
            idx2: 0,
            merged: VecDeque::new(),
            advanced: VecDeque::new(),
            complete: false,
        }
    }

    /// Advance the merge by up to `fuel` updates, suspending when it runs out.
    ///
    /// Returns with `fuel` still positive only once the merge is complete; a call on
    /// an already-complete merger returns immediately and leaves `fuel` untouched.
    fn work(&mut self, source1: &ChunkBatch<C>, source2: &ChunkBatch<C>, fuel: &mut isize) {
        if self.complete { return; }

        while *fuel > 0 {
            // Refill each input deque up to a burst of source chunks (clones); `merge`
            // drains the loaded burst per call. The burst trades fuel granularity (a
            // call does up to a burst's work before checking fuel) against re-pruning:
            // a chunk that straddles many chunks on the other side is walked by index
            // within one call but, once its tail spills past the loaded burst, its
            // unconsumed suffix is pushed back and re-copied next call — a bigger burst
            // absorbs more straddle per call. This workload is insensitive (1..32 flat
            // to ~noise at 1M), so 8 is a conservative default, not a tuned optimum.
            // After this, a deque is non-empty iff its source still has data.
            const BURST: usize = 8;
            while self.in1.len() < BURST && self.idx1 < source1.chunks.len() {
                self.in1.push_back(source1.chunks[self.idx1].clone());
                self.idx1 += 1;
            }
            while self.in2.len() < BURST && self.idx2 < source2.chunks.len() {
                self.in2.push_back(source2.chunks[self.idx2].clone());
                self.idx2 += 1;
            }

            // Merge's per-tick output (a burst's worth, or one tail chunk), measured
            // for fuel before it joins the carry already in `merged`.
            let mut produced = VecDeque::new();
            if !self.in1.is_empty() && !self.in2.is_empty() {
                // Both sides have data: drain the loaded burst.
                C::merge(&mut self.in1, &mut self.in2, &mut produced);
            } else if let Some(chunk) = self.in1.pop_front().or_else(|| self.in2.pop_front()) {
                // Exactly one side has data: flush its verbatim tail, one chunk a step.
                produced.push_back(chunk);
            } else {
                // Both sources drained: final flush of advance's withheld carry.
                C::advance(&mut self.merged, &self.frontier, true, &mut self.advanced);
                self.complete = true;
                break;
            }

            let work: usize = produced.iter().map(C::len).sum();
            self.merged.extend(produced);
            C::advance(&mut self.merged, &self.frontier, false, &mut self.advanced);
            // Debit by the records that entered the pipe. Saturating, so that an
            // oversized count can neither wrap in the cast nor overflow the subtraction.
            *fuel = (*fuel).saturating_sub_unsigned(work);
        }
    }

    /// Extract the merged batch over `[lower, upper)` advanced to the frontier.
    ///
    /// Only valid once `work` has driven the merge to completion (left `fuel`
    /// positive), as the [`trace::Merger`](crate::trace::Merger) contract requires.
    /// Debug builds assert this; a premature call would otherwise silently drop
    /// whatever is still in flight.
    fn done(self) -> ChunkBatch<C> {
        debug_assert!(self.complete, "ChunkBatchMerger::done called before the merge completed");
        let description = Description::new(self.lower, self.upper, self.frontier);
        ChunkBatch::new(regrade_all(self.advanced), description)
    }
}

/// A [`Builder`](crate::trace::Builder) that collects pre-sorted chunks into a
/// [`ChunkBatch`].
///
/// The builder assumes its inputs arrive already sorted and consolidated (as the
/// `Builder` contract requires), so it does no merging: each pushed chunk is an
/// ordered run, fed straight to [`regrade`](Chunk::regrade) as it arrives — so a batch
/// built here is graded like one produced by the merger, rather than inheriting
/// whatever chunk sizes the caller happened to push.
pub struct ChunkBuilder<C> {
    /// Pushed chunks awaiting regrading; holds regrade's sub-`TARGET` carry at the front.
    input: VecDeque<C>,
    /// The graded chunks emitted so far.
    output: VecDeque<C>,
}

impl<C> crate::trace::Builder for ChunkBuilder<C>
where
    C: Chunk + Default + 'static,
    C::Time: timely::progress::Timestamp,
{
    type Input = C;
    type Time = C::Time;
    type Output = ChunkBatch<C>;

    /// Create an empty builder; the capacity hints are ignored.
    fn with_capacity(_keys: usize, _vals: usize, _upds: usize) -> Self {
        Self { input: VecDeque::new(), output: VecDeque::new() }
    }

    /// Take `chunk` (leaving `C::default()` in its place) and regrade it into the output.
    ///
    /// Empty chunks are dropped, preserving the non-empty invariant of chunk sequences.
    fn push(&mut self, chunk: &mut C) {
        let chunk = std::mem::take(chunk);
        if chunk.len() > 0 {
            self.input.push_back(chunk);
            C::regrade(&mut self.input, false, &mut self.output);
        }
    }

    /// Flush regrade's withheld carry and assemble the batch with `description`.
    fn done(self, description: Description<C::Time>) -> ChunkBatch<C> {
        let ChunkBuilder { mut input, mut output } = self;
        C::regrade(&mut input, true, &mut output);
        ChunkBatch::new(output.into(), description)
    }

    fn seal(chain: &mut Vec<C>, description: Description<C::Time>) -> ChunkBatch<C> {
        // The chain is sorted and consolidated but not necessarily graded; regrade it.
+ // Already-`TARGET` chunks pass through as cheap `Rc` moves, so a chain that + // arrives graded (as the batcher's does) pays only an O(#chunks) walk. + ChunkBatch::new(regrade_all(std::mem::take(chain)), description) + } +} + +pub mod vec_chunk; diff --git a/differential-dataflow/src/trace/chunk/vec_chunk.rs b/differential-dataflow/src/trace/chunk/vec_chunk.rs new file mode 100644 index 000000000..42d6d1b01 --- /dev/null +++ b/differential-dataflow/src/trace/chunk/vec_chunk.rs @@ -0,0 +1,778 @@ +//! A worked [`Chunk`] implementation: `Vec<((K, V), T, R)>` behind an `Rc`. +//! +//! This is the reference example — a next implementor (e.g. columnar) follows +//! its *shape*, not its layout. It shows the two integration points any chunk +//! type satisfies, and how leaning on the parent module's generic harnesses +//! keeps the code terse: +//! +//! * **Batcher side.** The merge batcher's `ContainerChunker` builds chunks, so +//! the type implements timely's container traits (`Accountable`, +//! `SizableContainer`, `Consolidate`, `PushInto`). Here they delegate to the +//! inner `Vec` via `Rc::make_mut` — free while a chunk is being built +//! (refcount 1), and it never copies a *shared* chunk because batches are +//! immutable once built. +//! * **Trace side.** [`Chunk`] (merge / extract / advance / regrade / bounds) +//! plus a cursor. Key lookups are logarithmic by galloping search (`seek_*`), +//! independent of chunk size; stepping stays linear (short hops). +//! +//! `Clone` is a refcount bump, so the trace merger shares source chunks instead +//! of copying them. +//! +//! **What a columnar impl can and can't reuse.** The protocol (the `VecDeque` +//! in/out, withhold-by-`push_front`, grade-at-seal) is layout-agnostic and carries +//! over unchanged. The *merge body* does not: this one merges a single contiguous +//! `&[((K,V),T,R)]` and bulk-copies disjoint runs with `extend_from_slice` + +//! `chunks(TARGET)`. 
A columnar chunk (ranging over `ord_neu`'s deduped layout) has +//! no such slice — it must range-copy the key / val / time / diff columns with +//! offset bookkeeping, emitting one key + its val/time run rather than repeated rows. +//! That is the operation that beats the flat layout on repetitive keys (see the +//! module-level note on the row-major vs. columnar crossover), and it is also where +//! the earlier `col_chunk` got into trouble (decompress-and-recompress instead of a +//! true range-copy). So a columnar `Chunk` is the open bet: nothing here exercises a +//! columnar merge, and that body — not the protocol — is the phase-2 risk. + +use std::collections::VecDeque; +use std::marker::PhantomData; +use std::rc::Rc; + +use timely::Accountable; +use timely::container::{PushInto, SizableContainer}; +use timely::progress::{Antichain, Timestamp}; + +use crate::consolidation::Consolidate; +use crate::difference::Semigroup; +use crate::lattice::Lattice; +use crate::trace::cursor::Cursor; +use crate::trace::implementations::{Vector, WithLayout}; + +use super::Chunk; + +/// The chunk size: both the maximum updates per chunk and the coalescing +/// threshold (see [`Chunk::TARGET`]). Chosen for the reference impl; exposed as +/// the associated const below, and used internally for buffer sizing. +const TARGET: usize = 1024; + +/// A sorted, consolidated run of `((key, val), time, diff)`, shared via `Rc`. +pub struct VecChunk(Rc>); + +impl Clone for VecChunk { + fn clone(&self) -> Self { VecChunk(Rc::clone(&self.0)) } +} +impl Default for VecChunk { + fn default() -> Self { VecChunk(Rc::new(Vec::new())) } +} + +/// The trace type for `arrange`: a spine of `Rc`-shared chunk batches. +pub type ChunkSpine = super::ChunkSpine>; +/// Merge batcher over `VecChunk`s. 
Unordered `Vec<((K, V), T, R)>` input is +/// consolidated into sorted `VecChunk`s by a `ContainerChunker` supplied +/// at the `arrange_core` callsite (it drives the container-trait impls below); the +/// batcher itself only merges the resulting chunks. +pub type ChunkBatcher = super::ChunkBatcher>; +/// Reference-counted batch builder. +pub type ChunkRcBuilder = super::ChunkRcBuilder>; + +// --- batcher side: timely container traits, delegating to the inner `Vec` --- + +impl Accountable for VecChunk { + fn record_count(&self) -> i64 { self.0.len() as i64 } +} + +impl SizableContainer for VecChunk +where K: Clone+'static, V: Clone+'static, T: Clone+'static, R: Clone+'static { + // The absorb point is the grading target: the chunker fills a scratch chunk + // to `TARGET` updates before emitting, so chunks arrive pre-graded rather than + // at timely's byte-derived buffer size (which downstream regrading re-melds). + fn at_capacity(&self) -> bool { self.0.len() >= TARGET } + fn ensure_capacity(&mut self, _stash: &mut Option) { + let inner = Rc::make_mut(&mut self.0); + inner.reserve(TARGET.saturating_sub(inner.len())); + } +} + +impl Consolidate for VecChunk +where K: Ord+Clone+'static, V: Ord+Clone+'static, T: Ord+Clone+'static, R: Semigroup+'static { + fn len(&self) -> usize { self.0.len() } + fn clear(&mut self) { Rc::make_mut(&mut self.0).clear() } + fn consolidate_into(&mut self, target: &mut Self) { + Rc::make_mut(&mut self.0).consolidate_into(Rc::make_mut(&mut target.0)); + } +} + +impl PushInto<((K, V), T, R)> for VecChunk +where K: Clone+'static, V: Clone+'static, T: Clone+'static, R: Clone+'static { + fn push_into(&mut self, item: ((K, V), T, R)) { Rc::make_mut(&mut self.0).push(item); } +} + +// --- trace side: a logarithmic cursor and the `Chunk` operations --- + +/// First index `>= start` at which `pred` turns false, by galloping (exponential) +/// search. `pred` must hold for a prefix then not — i.e. `|u| u < target`. 
+/// O(log distance), so O(1) for short hops and logarithmic for long ones. +fn gallop(s: &[U], start: usize, pred: impl Fn(&U) -> bool) -> usize { + let mut pos = start; + if pos < s.len() && pred(&s[pos]) { + let mut step = 1; + while pos + step < s.len() && pred(&s[pos + step]) { pos += step; step <<= 1; } + step >>= 1; + while step > 0 { + if pos + step < s.len() && pred(&s[pos + step]) { pos += step; } + step >>= 1; + } + pos += 1; + } + pos +} + +/// A cursor over a [`VecChunk`], tracking the current key and `(key, val)` +/// group starts as indices into the flat vector. +pub struct VecChunkCursor { + key_pos: usize, + val_pos: usize, + phantom: PhantomData<(K, V, T, R)>, +} + +impl WithLayout for VecChunk +where K: Ord+Clone+'static, V: Ord+Clone+'static, T: Lattice+Timestamp, R: Ord+Semigroup+'static { + type Layout = Vector<((K, V), T, R)>; +} + +impl WithLayout for VecChunkCursor +where K: Ord+Clone+'static, V: Ord+Clone+'static, T: Lattice+Timestamp, R: Ord+Semigroup+'static { + type Layout = Vector<((K, V), T, R)>; +} + +impl Cursor for VecChunkCursor +where K: Ord+Clone+'static, V: Ord+Clone+'static, T: Lattice+Timestamp, R: Ord+Semigroup+'static { + type Storage = VecChunk; + + fn key_valid(&self, s: &Self::Storage) -> bool { self.key_pos < s.0.len() } + fn val_valid(&self, s: &Self::Storage) -> bool { + self.key_pos < s.0.len() && self.val_pos < s.0.len() && s.0[self.val_pos].0.0 == s.0[self.key_pos].0.0 + } + fn key<'a>(&self, s: &'a Self::Storage) -> &'a K { &s.0[self.key_pos].0.0 } + fn val<'a>(&self, s: &'a Self::Storage) -> &'a V { &s.0[self.val_pos].0.1 } + fn get_key<'a>(&self, s: &'a Self::Storage) -> Option<&'a K> { + if self.key_valid(s) { Some(self.key(s)) } else { None } + } + fn get_val<'a>(&self, s: &'a Self::Storage) -> Option<&'a V> { + if self.val_valid(s) { Some(self.val(s)) } else { None } + } + fn map_times(&mut self, s: &Self::Storage, mut logic: L) { + if !self.val_valid(s) { return; } + let kv = &s.0[self.val_pos].0; + let mut 
i = self.val_pos; + while i < s.0.len() && &s.0[i].0 == kv { + logic(&s.0[i].1, &s.0[i].2); + i += 1; + } + } + fn step_key(&mut self, s: &Self::Storage) { + // Linear: stepping is a short hop to the next group; an inlined scan + // beats a gallop call for the common small-group case. + if self.key_pos >= s.0.len() { return; } + let key = s.0[self.key_pos].0.0.clone(); + let mut i = self.key_pos; + while i < s.0.len() && s.0[i].0.0 == key { i += 1; } + self.key_pos = i; + self.val_pos = i; + } + fn seek_key(&mut self, s: &Self::Storage, key: &K) { + // Logarithmic: O(log distance), independent of chunk size. + self.key_pos = gallop(&s.0, self.key_pos, |u| &u.0.0 < key); + self.val_pos = self.key_pos; + } + fn step_val(&mut self, s: &Self::Storage) { + if !self.val_valid(s) { return; } + let kv = s.0[self.val_pos].0.clone(); + let mut i = self.val_pos; + while i < s.0.len() && s.0[i].0 == kv { i += 1; } + self.val_pos = i; + } + fn seek_val(&mut self, s: &Self::Storage, val: &V) { + if !self.key_valid(s) { return; } + let key = s.0[self.key_pos].0.0.clone(); + self.val_pos = gallop(&s.0, self.val_pos, |u| (&u.0.0, &u.0.1) < (&key, val)); + } + fn rewind_keys(&mut self, _s: &Self::Storage) { self.key_pos = 0; self.val_pos = 0; } + fn rewind_vals(&mut self, _s: &Self::Storage) { self.val_pos = self.key_pos; } +} + +/// Take the `Vec` out of a chunk, copying only if the `Rc` is shared. 
+fn take(chunk: VecChunk) -> Vec<((K, V), T, R)> { + Rc::try_unwrap(chunk.0).unwrap_or_else(|rc| (*rc).clone()) +} + +impl Chunk for VecChunk +where K: Ord+Clone+'static, V: Ord+Clone+'static, T: Lattice+Timestamp, R: Ord+Semigroup+'static { + type Cursor = VecChunkCursor; + + const TARGET: usize = TARGET; + + fn cursor(&self) -> Self::Cursor { + VecChunkCursor { key_pos: 0, val_pos: 0, phantom: PhantomData } + } + + fn bounds(&self) -> ((&K, &V, &T), (&K, &V, &T)) { + let s = &self.0[..]; + let (f, l) = (&s[0], &s[s.len() - 1]); + ((&f.0.0, &f.0.1, &f.1), (&l.0.0, &l.0.1, &l.1)) + } + + fn len(&self) -> usize { self.0.len() } + + /// A two-pointer binary merge that drains the two deques' *loaded* content + /// through their shared horizon — the lesser of the two deques' last loaded + /// `(key, val, time)`s — rather than one front-pair at a time. Consolidates + /// equal triples and bulk-copies disjoint runs as slices, walking across chunk + /// boundaries with local indices (`p1`/`p2`) that reset as each working chunk + /// is retired. The side owning the horizon drains fully; the other's partial + /// working chunk is pruned (its prefix dropped) and `push_front`ed back exactly + /// once at the yield boundary — so the per-call prune cost amortizes over the + /// whole burst the harness loaded, not over each chunk. + fn merge(in1: &mut VecDeque, in2: &mut VecDeque, out: &mut VecDeque) { + fn kv(u: &((K, V), T, R)) -> (&K, &V) { (&u.0.0, &u.0.1) } + + let mut result: Vec<((K, V), T, R)> = Vec::with_capacity(TARGET); + let mut flush = |result: &mut Vec<((K, V), T, R)>, force: bool| { + if result.len() >= TARGET || (force && !result.is_empty()) { + out.push_back(VecChunk(Rc::new(std::mem::replace(result, Vec::with_capacity(TARGET))))); + } + }; + + // Working chunks (the shared `Rc`, read by index — never `take`n, so a + // source clone is not deep-copied) and their positions; both deques are + // non-empty on entry. 
The guard keeps both cursors valid for indexing; a + // working chunk consumed mid-merge is refilled at the foot of the loop, and + // when a deque runs dry we stop — that side has presented all its loaded + // data, so its last triple is the horizon and the rest is left for next time. + let mut c1 = in1.pop_front().unwrap(); + let mut c2 = in2.pop_front().unwrap(); + let (mut p1, mut p2) = (0usize, 0usize); + while p1 < c1.0.len() && p2 < c2.0.len() { + let a = &c1.0[p1]; + let b = &c2.0[p2]; + match (kv(a), &a.1).cmp(&(kv(b), &b.1)) { + // Copy the run of one side strictly below the other's head (within + // the current working chunk): collisions are impossible within it, + // so it moves as slices cut at the grading target. + std::cmp::Ordering::Less => { + let run = gallop(&c1.0[..], p1 + 1, |u| (kv(u), &u.1) < (kv(b), &b.1)); + for piece in c1.0[p1..run].chunks(TARGET) { + result.extend_from_slice(piece); + flush(&mut result, false); + } + p1 = run; + } + std::cmp::Ordering::Greater => { + let run = gallop(&c2.0[..], p2 + 1, |u| (kv(u), &u.1) < (kv(a), &a.1)); + for piece in c2.0[p2..run].chunks(TARGET) { + result.extend_from_slice(piece); + flush(&mut result, false); + } + p2 = run; + } + std::cmp::Ordering::Equal => { + let mut diff = a.2.clone(); + diff.plus_equals(&b.2); + if !diff.is_zero() { + result.push((a.0.clone(), a.1.clone(), diff)); + } + p1 += 1; + p2 += 1; + flush(&mut result, false); + } + } + // Refill either working chunk consumed by the step above; stop the drain + // once a deque is exhausted (the `&&` guard then never re-enters). 
+ if p1 == c1.0.len() { + match in1.pop_front() { Some(c) => { c1 = c; p1 = 0; } None => break } + } + if p2 == c2.0.len() { + match in2.pop_front() { Some(c) => { c2 = c; p2 = 0; } None => break } + } + } + flush(&mut result, true); + // One side's deque emptied with its working chunk exhausted; the other's + // working chunk is partial — push back just its unconsumed suffix (one copy + // per call), ahead of whatever loaded chunks remain in that deque. + if p1 < c1.0.len() { in1.push_front(VecChunk(Rc::new(c1.0[p1..].to_vec()))); } + if p2 < c2.0.len() { in2.push_front(VecChunk(Rc::new(c2.0[p2..].to_vec()))); } + } + + fn extract( + input: &mut VecDeque, + frontier: &Antichain, + residual: &mut Antichain, + keep: &mut VecDeque, + ship: &mut VecDeque, + ) { + // Fill `TARGET`-sized buffers directly, so the chunks pushed are already + // graded and `regrade` passes them through as `Rc` moves rather than + // re-splitting (and re-copying) a monolithic chunk. Emptied input `Vec`s + // are recycled as the next buffers, so allocations balance input against + // output instead of one fresh buffer per emitted chunk. + let mut stash: Vec> = Vec::new(); + let take_buf = |stash: &mut Vec<_>| stash.pop().unwrap_or_default(); + let (mut k, mut s) = (take_buf(&mut stash), take_buf(&mut stash)); + for chunk in input.drain(..) { + let mut v = take(chunk); + for u in v.drain(..) 
{ + if frontier.borrow().less_equal(&u.1) { + residual.insert_ref(&u.1); + k.push(u); + if k.len() >= TARGET { keep.push_back(VecChunk(Rc::new(std::mem::replace(&mut k, take_buf(&mut stash))))); } + } else { + s.push(u); + if s.len() >= TARGET { ship.push_back(VecChunk(Rc::new(std::mem::replace(&mut s, take_buf(&mut stash))))); } + } + } + stash.push(v); + } + if !k.is_empty() { keep.push_back(VecChunk(Rc::new(k))); } + if !s.is_empty() { ship.push_back(VecChunk(Rc::new(s))); } + } + + fn advance( + input: &mut VecDeque, + frontier: &Antichain, + done: bool, + out: &mut VecDeque, + ) { + // Advance and consolidate every *complete* `(key, val)` group eagerly, + // so its updates can be released as soon as the input proves no later + // time for the pair can arrive. A group is contiguous in the sorted + // chain, so the only one that might continue in a future call is the last; + // unless `done`, we process up to its start and `push_front` the rest as + // the withheld carry for the next call. + let mut stash: Vec> = Vec::new(); + // Build the working buffer by *reusing the front chunk's storage* (the + // carry from last time) and appending the rest (recycling each emptied + // `Vec`). Reusing the front is what keeps a withheld group from being + // recopied across calls: it just accumulates in place, so a `(key, val)` + // larger than the working set costs O(total) over the run, not O(total²). + let mut buf = match input.pop_front() { Some(chunk) => take(chunk), None => return }; + while let Some(chunk) = input.pop_front() { + let mut v = take(chunk); + buf.append(&mut v); + stash.push(v); + } + if buf.is_empty() { return; } + + // If every available update shares one `(key, val)`, no group is provably + // complete — a later call may extend it — so make no progress unless + // `done`: push the accumulated buffer back as the carry and return. 
This is + // the giant-key case; comparing only the first and last pair detects it + // without scanning, and reusing the front above makes the retention free. + if !done && buf[0].0 == buf[buf.len() - 1].0 { + input.push_front(VecChunk(Rc::new(buf))); + return; + } + + // Otherwise at least the first group is complete. Withhold the last group + // (a single `(key, val)`) as the next carry unless the input is complete. + let end = if done { buf.len() } else { + let last_kv = buf[buf.len() - 1].0.clone(); + let mut start = buf.len(); + while start > 0 && buf[start - 1].0 == last_kv { start -= 1; } + start + }; + if end < buf.len() { + input.push_front(VecChunk(Rc::new(buf.split_off(end)))); + } + // Advance + consolidate each group into `TARGET`-sized output chunks, + // filling buffers reclaimed from the recycled `Vec`s. + let mut result = stash.pop().unwrap_or_default(); + let mut i = 0; + while i < buf.len() { + let mut j = i; + while j < buf.len() && buf[j].0 == buf[i].0 { j += 1; } + for u in &mut buf[i..j] { u.1.advance_by(frontier.borrow()); } + // Advancing is monotone w.r.t. the lattice but not the + // representation's total order, so re-sort the group by time. + buf[i..j].sort_by(|a, b| a.1.cmp(&b.1)); + let mut k = i; + while k < j { + let kv = buf[k].0.clone(); + let t = buf[k].1.clone(); + let mut diff = buf[k].2.clone(); + k += 1; + while k < j && buf[k].1 == t { diff.plus_equals(&buf[k].2); k += 1; } + if !diff.is_zero() { + result.push((kv, t, diff)); + if result.len() >= TARGET { out.push_back(VecChunk(Rc::new(std::mem::replace(&mut result, stash.pop().unwrap_or_default())))); } + } + } + i = j; + } + if !result.is_empty() { out.push_back(VecChunk(Rc::new(result))); } + } + + fn regrade(input: &mut VecDeque, done: bool, out: &mut VecDeque) { + // Maximal packing: emit chunks as large as possible up to `TARGET`, + // never splitting a pair that could combine into one legal (`<= TARGET`) + // chunk. 
A chunk of exactly `TARGET` is maximal — it cannot grow — so it + // passes straight through as an `Rc` move; only sub-`TARGET` chunks are + // copied, and only to coalesce with a neighbour. Producers fill to + // `TARGET`, so in steady state every chunk passes through and only the + // occasional trailing partial is coalesced. + // + // `carry` is the (sub-`TARGET`) chunk under construction. It is flushed + // once it reaches `TARGET`, `push_front`ed back onto `input` between calls, + // or emitted on `done`. Whenever `carry` is non-empty its left neighbour in + // `out` is a `TARGET` chunk (or `carry` is `out`'s first chunk), so + // emitting `carry` against a neighbour it cannot merge with — their sum + // exceeds `TARGET` — keeps the packing maximal on both sides. + let mut carry: Vec<((K, V), T, R)> = Vec::new(); + while let Some(chunk) = input.pop_front() { + if carry.is_empty() { + absorb(chunk, &mut carry, out); + } else if carry.len() + chunk.0.len() <= TARGET { + // Combines into one legal chunk; coalesce in place. + carry.extend(take(chunk)); + if carry.len() == TARGET { + out.push_back(VecChunk(Rc::new(std::mem::take(&mut carry)))); + } + } else { + // Cannot combine without exceeding `TARGET`; `carry` is maximal + // against this neighbour, so emit it and absorb the chunk afresh. + out.push_back(VecChunk(Rc::new(std::mem::take(&mut carry)))); + absorb(chunk, &mut carry, out); + } + } + if !carry.is_empty() { + let chunk = VecChunk(Rc::new(carry)); + if done { out.push_back(chunk); } else { input.push_front(chunk); } + } + } +} + +/// Emit maximal `TARGET`-sized chunks off the front of `carry`, leaving the +/// sub-`TARGET` tail behind. 
+fn peel( + carry: &mut Vec<((K, V), T, R)>, + out: &mut VecDeque>, +) { + let mut start = 0; + while carry.len() - start >= TARGET { + out.push_back(VecChunk(Rc::new(carry[start..start + TARGET].to_vec()))); + start += TARGET; + } + carry.drain(..start); +} + +/// Absorb a chunk when nothing is carried: pass a `TARGET` chunk through as an +/// `Rc` move, hold a smaller one in `carry`, or split a larger one (peeling off +/// `TARGET` pieces and carrying the remainder). `carry` must be empty on entry. +fn absorb( + chunk: VecChunk, + carry: &mut Vec<((K, V), T, R)>, + out: &mut VecDeque>, +) { + use std::cmp::Ordering::{Equal, Greater, Less}; + match chunk.0.len().cmp(&TARGET) { + Equal => out.push_back(chunk), + Less => *carry = take(chunk), + Greater => { *carry = take(chunk); peel(carry, out); } + } +} + +#[cfg(test)] +mod test { + use std::collections::VecDeque; + use super::{Chunk, VecChunk}; + use crate::trace::chunk::merge_chains; + use std::rc::Rc; + + fn chunk(updates: Vec<((u64, u64), u64, i64)>) -> VecChunk { + VecChunk(Rc::new(updates)) + } + + // Flatten a chunk sequence back to its update stream. + fn flat>>(chunks: I) -> Vec<((u64, u64), u64, i64)> { + chunks.into_iter().flat_map(|c| (*c.0).clone()).collect() + } + + // `extract` partitions by frontier and folds the kept frontier into `residual`; + // a terminal `regrade` then grades each side (the seams of near-graded output). + #[test] + fn extract_partitions_and_grades() { + use super::TARGET; + use crate::trace::chunk::{is_graded, regrade_all}; + use timely::progress::Antichain; + + // 4·TARGET updates spread over many input chunks; even times ship + // (< frontier), odd times keep (>= frontier), so both sides straddle. 
+ let n = 4 * TARGET as u64; + let mut input: VecDeque<_> = (0..n).map(|i| chunk(vec![((i, 0), i % 2, 1)])).collect(); + let frontier = Antichain::from_elem(1u64); + let mut residual = Antichain::new(); + let (mut keep, mut ship) = (VecDeque::new(), VecDeque::new()); + VecChunk::extract(&mut input, &frontier, &mut residual, &mut keep, &mut ship); + let (keep, ship) = (regrade_all(keep), regrade_all(ship)); + + // Kept times are exactly {1}; that is the residual frontier. + assert_eq!(residual, Antichain::from_elem(1u64)); + // Both sides are graded after the regrade. + assert!(is_graded(&keep), "ungraded keep: {:?}", keep.iter().map(Chunk::len).collect::>()); + assert!(is_graded(&ship), "ungraded ship: {:?}", ship.iter().map(Chunk::len).collect::>()); + // Nothing lost: half the updates each way. + assert_eq!(keep.iter().map(Chunk::len).sum::(), n as usize / 2); + assert_eq!(ship.iter().map(Chunk::len).sum::(), n as usize / 2); + } + + // `advance` advances and consolidates complete `(key, val)` groups eagerly, + // pushing the (possibly-growing) last group back as the carry when not `done`. + #[test] + fn advance_emits_complete_groups_eagerly() { + use timely::progress::Antichain; + + let frontier = Antichain::from_elem(5u64); + // Group (0,0) is complete within this chunk; group (1,0) might still grow. + let c0 = chunk(vec![((0, 0), 0, 1), ((0, 0), 1, 1), ((1, 0), 0, 1)]); + let mut input: VecDeque<_> = VecDeque::from([c0]); + let mut out = VecDeque::new(); + VecChunk::advance(&mut input, &frontier, false, &mut out); + + // The trailing group (1,0) is withheld as the carry at the front of `input`. + assert_eq!(input.len(), 1); + assert_eq!(Chunk::len(&input[0]), 1); + // Group (0,0)'s times {0,1} advanced to 5 and consolidated, emitted now. 
+ assert_eq!(flat(out), vec![((0, 0), 5, 2)]); + } + + // Streaming the input one chunk at a time must yield exactly what a single + // all-at-once flush does — the resumable path is just the one-shot path cut + // at group boundaries. + #[test] + fn advance_resumable_matches_oneshot() { + use timely::progress::Antichain; + + let frontier = Antichain::from_elem(3u64); + // Groups span chunk boundaries and carry several times each. + let input = || vec![ + chunk(vec![((0, 0), 0, 1), ((0, 0), 1, 1), ((1, 0), 0, 1)]), + chunk(vec![((1, 0), 5, 1), ((1, 1), 0, 1), ((2, 0), 0, 1)]), + chunk(vec![((2, 0), 2, 1), ((2, 0), 9, 1)]), + ]; + + let oneshot = { + let mut q: VecDeque<_> = input().into(); + let mut out = VecDeque::new(); + VecChunk::advance(&mut q, &frontier, false, &mut out); + VecChunk::advance(&mut q, &frontier, true, &mut out); + flat(out) + }; + let incremental = { + let mut q = VecDeque::new(); + let mut out = VecDeque::new(); + for c in input() { q.push_back(c); VecChunk::advance(&mut q, &frontier, false, &mut out); } + VecChunk::advance(&mut q, &frontier, true, &mut out); + flat(out) + }; + assert_eq!(oneshot, incremental); + // Times are advanced: nothing below the frontier survives. + for u in &oneshot { assert!(u.1 >= 3); } + } + + // A single `(key, val)` whose updates span every pushed chunk: `advance` + // can make no progress until `done`, accumulating in the carry in place. + // It must still produce the right advanced+consolidated result at the end. 
+ #[test] + fn advance_single_key_spanning_pushes() { + use timely::progress::Antichain; + + let frontier = Antichain::from_elem(100u64); + let n = 50u64; + let make = || (0..n).map(|t| chunk(vec![((7u64, 0u64), t, 1i64)])).collect::>(); + + let mut q = VecDeque::new(); + let mut out = VecDeque::new(); + for c in make() { q.push_back(c); VecChunk::advance(&mut q, &frontier, false, &mut out); } + VecChunk::advance(&mut q, &frontier, true, &mut out); + // All times advance to 100 and consolidate to one update of diff `n`. + assert_eq!(flat(out), vec![((7u64, 0u64), 100u64, n as i64)]); + } + + #[test] + fn merge_chains_consolidates() { + let a = chunk(vec![((0, 0), 0, 1), ((1, 0), 0, 1)]); + let b = chunk(vec![((0, 0), 0, 1), ((2, 0), 0, 1)]); + let mut out = VecDeque::new(); + merge_chains(vec![a], vec![b], &mut out); + assert_eq!(flat(out), vec![((0, 0), 0, 2), ((1, 0), 0, 1), ((2, 0), 0, 1)]); + } + + // Merging runs larger than `TARGET`, then regrading, yields a *graded* sequence + // (each chunk `<= TARGET`, adjacent pairs summing past `TARGET`) reproducing the + // consolidated sorted contents. + #[test] + fn merge_emits_graded_chunks() { + use super::TARGET; + use crate::trace::chunk::{is_graded, merge_chains, regrade_all}; + + // Two interleaving single-chunk chains: evens and odds over `0..4·TARGET`. + let n = 4 * TARGET as u64; + let evens = chunk((0..n).step_by(2).map(|k| ((k, 0), 0, 1)).collect()); + let odds = chunk((0..n).step_by(2).map(|k| ((k + 1, 0), 0, 1)).collect()); + + let mut out = VecDeque::new(); + merge_chains(vec![evens], vec![odds], &mut out); + let chunks = regrade_all(out); + + assert!(is_graded(&chunks), "merge output not graded: {:?}", + chunks.iter().map(Chunk::len).collect::>()); + // Contents are exactly the sorted keys `0..4·TARGET`, each once. 
+ let want: Vec<_> = (0..n).map(|k| ((k, 0u64), 0u64, 1i64)).collect(); + assert_eq!(flat(chunks), want); + } + + // Property test: merging two *multi-chunk* chains (driven through `merge` by + // `merge_chains`) reproduces the union of all updates, consolidated. Tiny + // chunks force `(key, val)` groups — which can span several times — to + // straddle chunk boundaries on both sides, exercising the refill path the + // single-chunk merge tests never reach. The independent oracle is + // `consolidate_updates` over the concatenation. + #[test] + fn merge_matches_reference() { + use crate::trace::chunk::merge_chains; + use crate::consolidation::consolidate_updates; + + // Deterministic xorshift PRNG — no dev-dependency on `rand`. + let mut seed = 0x2545F4914F6CDD1Du64; + let mut rng = move || { seed ^= seed << 13; seed ^= seed >> 7; seed ^= seed << 17; seed }; + + // A sorted, consolidated update set over a small (key, val, time) space, + // so the two chains collide and a `(key, val)` carries several times. + fn gen(rng: &mut impl FnMut() -> u64, n: usize) -> Vec<((u64, u64), u64, i64)> { + let mut v: Vec<((u64, u64), u64, i64)> = (0..n).map(|_| { + let k = rng() % 20; let val = rng() % 3; let t = rng() % 8; + let d = if rng() % 4 == 0 { -1 } else { 1 }; + ((k, val), t, d) + }).collect(); + consolidate_updates(&mut v); + v + } + // Split a consolidated set into a chain of small chunks (each sorted and + // consolidated; together globally sorted), so groups straddle boundaries. 
+ fn chain(updates: &[((u64, u64), u64, i64)], sz: usize) -> Vec> { + updates.chunks(sz).map(|c| VecChunk(Rc::new(c.to_vec()))).collect() + } + + for _ in 0..300 { + let n1 = (rng() as usize % 60) + 1; + let u1 = gen(&mut rng, n1); + let n2 = (rng() as usize % 60) + 1; + let u2 = gen(&mut rng, n2); + if u1.is_empty() || u2.is_empty() { continue; } + let sz = (rng() as usize % 5) + 1; // tiny chunks → heavy straddling + + let mut out = VecDeque::new(); + merge_chains(chain(&u1, sz), chain(&u2, sz), &mut out); + let merged = flat(out); + + let mut reference: Vec<_> = u1.iter().chain(u2.iter()).cloned().collect(); + consolidate_updates(&mut reference); + + assert_eq!(merged, reference, "chunk size {sz}\n u1={u1:?}\n u2={u2:?}"); + } + } + + // `regrade` must produce a *maximal packing*: adjacent sub-`TARGET` chunks + // that could combine into one legal chunk are coalesced, full chunks pass + // through as `Rc` moves, and contents are preserved exactly. + #[test] + fn regrade_maximal_packing() { + use super::TARGET; + use crate::trace::chunk::is_graded; + + // A mix of small and full chunks with distinct, increasing keys (so the + // concatenation is sorted and nothing consolidates away). + let t = TARGET; + let sizes = [t / 3, t / 3, t / 3, t, t / 2, t / 2, t, 1, t - 1]; + let total: usize = sizes.iter().sum(); + let mut key = 0u64; + let mut input = VecDeque::new(); + let mut output = VecDeque::new(); + for &s in &sizes { + let updates: Vec<_> = (0..s).map(|_| { let k = key; key += 1; ((k, 0u64), 0u64, 1i64) }).collect(); + input.push_back(chunk(updates)); + VecChunk::regrade(&mut input, false, &mut output); + } + VecChunk::regrade(&mut input, true, &mut output); + let chunks: Vec<_> = output.into(); + + assert!(is_graded(&chunks), "not graded: {:?}", + chunks.iter().map(Chunk::len).collect::>()); + // Nothing lost, and the keys stay strictly sorted across the new breaks. 
+ let got: Vec<_> = chunks.into_iter().flat_map(|c| (*c.0).clone()).collect(); + assert_eq!(got.len(), total); + assert!(got.windows(2).all(|w| w[0].0.0 < w[1].0.0)); + } + + // The indexed cursor must reconstruct the same grouped updates as a flat + // reference, even when a key — and a `(key, val)`'s times — straddle a + // chunk boundary. + #[test] + fn cursor_handles_straddle() { + use crate::trace::cursor::Cursor; + use crate::trace::{BatchReader, Description}; + use crate::trace::chunk::ChunkBatch; + use timely::progress::Antichain; + + let chunks = vec![ + chunk(vec![((0, 0), 0, 1), ((1, 0), 0, 1), ((1, 1), 0, 1)]), + chunk(vec![((1, 1), 1, 1), ((1, 2), 0, 1)]), + chunk(vec![((2, 0), 0, 1)]), + ]; + let desc = Description::new( + Antichain::from_elem(0u64), + Antichain::from_elem(2u64), + Antichain::from_elem(0u64), + ); + let batch = ChunkBatch::new(chunks, desc); + + let mut cursor = batch.cursor(); + let got = cursor.to_vec(&batch, |k| *k, |v| *v); + let want = vec![ + ((0u64, 0u64), vec![(0u64, 1i64)]), + ((1, 0), vec![(0, 1)]), + ((1, 1), vec![(0, 1), (1, 1)]), + ((1, 2), vec![(0, 1)]), + ((2, 0), vec![(0, 1)]), + ]; + assert_eq!(got, want); + } + + // Isolated: gallop vs linear forward-seek over one big chunk, for sparse to + // dense probe sets. 
Run: cargo test seek_microbench -- --ignored --nocapture + #[test] + #[ignore] + fn seek_microbench() { + use std::time::Instant; + use std::hint::black_box; + use super::gallop; + let n = 1_000_000u64; + let data: Vec<((u64, ()), u64, isize)> = (0..n).map(|k| ((3 * k, ()), 0u64, 1isize)).collect(); + for probes in [100u64, 10_000, 1_000_000] { + let targets: Vec = (0..probes).map(|i| 3 * (i * n / probes)).collect(); + let best = |f: &dyn Fn() -> u64| { + let mut b = std::time::Duration::MAX; + for _ in 0..5 { let t = Instant::now(); black_box(f()); b = b.min(t.elapsed()); } + b + }; + let data = black_box(&data[..]); + let g = best(&|| { + let (mut pos, mut acc) = (0usize, 0u64); + for &tgt in &targets { pos = gallop(data, pos, |u| u.0.0 < tgt); acc += pos as u64; } + acc + }); + let l = best(&|| { + let (mut pos, mut acc) = (0usize, 0u64); + for &tgt in &targets { while pos < data.len() && data[pos].0.0 < tgt { pos += 1; } acc += pos as u64; } + acc + }); + eprintln!("probes={probes:>7}: gallop={g:>12?} linear={l:>12?}"); + } + } +} From 57fc7a06cd9ce9daabb3e4a1f0c9dd68dad47f53 Mon Sep 17 00:00:00 2001 From: Frank McSherry Date: Mon, 22 Jun 2026 15:27:59 -0400 Subject: [PATCH 9/9] chunk: settle as the bounded-footprint primitive MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Rename `regrade` to `settle`: the point a chunk leaves the active set, where it grades and may later compress or spill. Settle as output is produced — in merge, extract, the batch merger's `work`, and the builder — so the un-settled in-core set stays bounded rather than settling everything at seal. Redefine `extract` to do bounded work per call, with the harness settling each side between calls. Take `AntichainRef` in `extract`/`advance` to match the `Merger` contract and drop a per-seal allocation. Rename the public builder alias to `ChunkBuilder` (inner impl `ChunkBatchBuilder`), keeping the `Rc` wrapper out of the name. 
Add module docs (wiring, bounded footprint) and a resumable-merger reference test; tighten comments. Co-Authored-By: Claude Opus 4.8 (1M context) --- differential-dataflow/examples/chunks.rs | 6 +- differential-dataflow/src/trace/chunk/mod.rs | 642 ++++++++---------- .../src/trace/chunk/{vec_chunk.rs => vec.rs} | 314 +++++---- 3 files changed, 446 insertions(+), 516 deletions(-) rename differential-dataflow/src/trace/chunk/{vec_chunk.rs => vec.rs} (71%) diff --git a/differential-dataflow/examples/chunks.rs b/differential-dataflow/examples/chunks.rs index 94c3ac61c..8d946b199 100644 --- a/differential-dataflow/examples/chunks.rs +++ b/differential-dataflow/examples/chunks.rs @@ -1,7 +1,7 @@ //! Minimal dataflow over the `Vec`-backed `Chunk` container. //! //! Mirrors the `val` arm of `spines.rs`, but arranges through `ChunkBatcher` / -//! `ChunkRcBuilder` / `ChunkSpine` — i.e. the merge batcher, builder, and spine +//! `ChunkBuilder` / `ChunkSpine` — i.e. the merge batcher, builder, and spine //! built atop the `Chunk` trait and its `ChunkBatch`. Run as: //! //! ```text @@ -12,7 +12,7 @@ use differential_dataflow::Hashable; use differential_dataflow::input::Input; use differential_dataflow::operators::arrange::Arrange; use differential_dataflow::operators::arrange::arrangement::arrange_core; -use differential_dataflow::trace::chunk::vec_chunk::{ChunkBatcher, ChunkRcBuilder, ChunkSpine, VecChunk}; +use differential_dataflow::trace::chunk::vec::{ChunkBatcher, ChunkBuilder, ChunkSpine, VecChunk}; use differential_dataflow::trace::implementations::chunker::ContainerChunker; use differential_dataflow::trace::implementations::ord_neu::{OrdValBatcher, RcOrdValBuilder, OrdValSpine}; @@ -44,7 +44,7 @@ fn main() { // container (`Vec`), so this is a cross-container chunker case: // drop to `arrange_core` with an explicit `ContainerChunker`. 
type Ba = ChunkBatcher; - type Bu = ChunkRcBuilder; + type Bu = ChunkBuilder; type Sp = ChunkSpine; type Chu = ContainerChunker>; let data = arrange_core::<_, _, Chu, Ba, Bu, Sp>( diff --git a/differential-dataflow/src/trace/chunk/mod.rs b/differential-dataflow/src/trace/chunk/mod.rs index b1473114a..660d70613 100644 --- a/differential-dataflow/src/trace/chunk/mod.rs +++ b/differential-dataflow/src/trace/chunk/mod.rs @@ -1,44 +1,54 @@ //! Sorted, consolidated runs of updates, and operators over sequences of them. //! //! A [`Chunk`] is a consolidated, sorted run of `(data, time, diff)` updates. -//! Chunks live in sequences (`Vec`) with no constraint on where the -//! breakpoints between them fall; each chunk holds at most [`Chunk::TARGET`] -//! updates. The trait deliberately exposes only batch-level operations — merge, -//! extract, advance — leaving the layout-aware work to the implementor. The -//! orchestration in this module (the binary merger) is generic over the layout -//! and concerns itself only with feeding chunks across calls. +//! A sequence of chunks is also expected to be consolidated and sorted. //! -//! # Why chunks, and why one size +//! The [`Chunk`] trait exposes whole-chunk operations, so that the implementor +//! can internally divert to their best implementations, with amortized overhead. +//! Each operation is invoked as if "streaming", providing input and output queues. +//! An implementor is expected to drain as much as possible of the inputs, and any +//! chunk written to the output is "committed" and likely to be shipped onward. //! -//! A batch could be a single monolithic sorted run. We cut it into chunks because -//! the chunk is simultaneously the unit of four things, each of which wants a size -//! bound: +//! # Wiring a `Chunk` into an arrangement //! -//! * **Suspendable work.** The fueled merger does a chunk's-worth of work per step -//! and checks fuel at the boundary, so chunk size bounds a step's latency. -//! 
* **Immutable sharing.** Chunks are `Rc`-shared; the merger reads its sources by -//! *cloning* chunks (a refcount bump). The chunk is the finest granularity of sharing. -//! * **Allocation recycling.** Emptied input buffers are reused as output buffers; -//! that only composes if buffers are roughly one size. -//! * **Indexing.** [`ChunkBatch`] indexes chunks by their first/last key, and the -//! cursor binary-searches *over* chunks then gallops *within* one. The chunk -//! count (≈ `len / TARGET`) sets the outer index size and search depth. +//! Implementing [`Chunk`] for a type `C` is the only bespoke code needed; three +//! aliases then expand into a full trace: //! -//! So the size bound pulls two ways: an upper bound (latency, memory) says "not too -//! big," and a lower bound (per-chunk overhead, index bloat) says "not too -//! fragmented." Keeping chunks one size is what lets a single knob satisfy both. -//! The grading invariant ([`is_graded`]) encodes exactly this: every chunk is at -//! most `TARGET`, and every *adjacent pair* exceeds `TARGET` — i.e. no two -//! neighbours could be combined into one legal chunk. That makes `TARGET` both the -//! maximum size and the coalescing threshold (the invariant is self-similar), and -//! a graded sequence a *maximal packing*: as few chunks as the maximum allows. +//! * [`ChunkBatcher`](ChunkBatcher) — the merge batcher. +//! * [`ChunkBuilder`](ChunkBuilder) — the batch builder. +//! * [`ChunkSpine`](ChunkSpine) — the trace, a spine of `Rc`-shared batches. //! -//! The intent is for a `Chunk` implementation to be each of -//! 1. the containers a `Collection` can transit. -//! 2. the containers a `MergeBatcher` can work with. -//! 3. the containers a `Batch` can be backed by. -//! It does this by exposing a small set of chunk-oriented primitives, which are -//! sufficient for harnesses for each of these tasks. +//! These are the `Batcher` / `Builder` / `Spine` to hand to +//! 
[`arrange_core`](crate::operators::arrange::arrangement::arrange_core), along with a +//! chunker that forms `C` from the input stream — typically +//! [`ContainerChunker`](crate::trace::implementations::chunker::ContainerChunker). +//! Everything else here ([`ChunkBatch`], [`ChunkMerger`], [`ChunkBatchMerger`], +//! [`ChunkBatchCursor`], [`ChunkBatchBuilder`]) is machinery those aliases expand to and is +//! not named directly. The [`vec`](mod@vec) module is a worked `Chunk` +//! that re-exports the three aliases specialized to its layout, and the `chunks` example +//! stands one up. +//! +//! # Bounded footprint +//! +//! There is a `TARGET` associated constant that signals the intended chunk size. +//! The constant should be chosen large enough to amortize overheads, but small +//! enough that per-chunk work does not "stall" the system when invoked. +//! The implementor is trusted to make a reasonable choice here. +//! +//! The [`Chunk::settle`] method "settles" sequences of chunks, and is called as +//! chunks are no longer expected to be needed in the near future. The implementor +//! should ensure the chunks are "graded": every chunk holds at most `TARGET` updates, and +//! any two adjacent chunks together hold strictly more than `TARGET` updates. +//! This is also an opportunity to compress data, or spill to disk or cloud storage. +//! +//! The active (un-settled) chunk set is kept small from both sides. Every producer +//! settles its committed output as it goes (see [`Chunk::settle`]), rather than +//! building a whole sequence and settling at the end. And every walk over a whole +//! chunk sequence reads only resident metadata — [`len`](Chunk::len) and +//! [`bounds`](Chunk::bounds) — never a chunk body: a batch indexes its chunks' +//! bounds once at construction, so cursors binary-search that resident index and +//! open only the chunk(s) a query touches. Implementors must therefore keep `len` +//! and `bounds` cheap even when a chunk's body is paged out.
use std::collections::VecDeque; @@ -49,113 +59,23 @@ use crate::trace::{Batch, BatchReader, Description}; use crate::trace::cursor::Cursor; use crate::trace::implementations::{BatchContainer, Layout, LayoutExt, WithLayout}; -/// The key container of chunk `C`'s layout. Named via the `Layout` projection so -/// it unifies with the cursor's `Self::Key`, which also projects through `Layout`. -type KeyCon = <::Layout as Layout>::KeyContainer; -/// The val container of chunk `C`'s layout. -type ValCon = <::Layout as Layout>::ValContainer; - -/// Whether `chunks` satisfy the [`Chunk::TARGET`] grading invariant: every chunk -/// at most `TARGET`, and every adjacent pair summing to more than `TARGET` (so no -/// two neighbours could be combined into one legal chunk — a *maximal packing*). -/// -/// This is the post-[`regrade`](Chunk::regrade) shape; useful as a test/debug check. -pub fn is_graded(chunks: &[C]) -> bool { - chunks.iter().all(|c| c.len() <= C::TARGET) - && chunks.windows(2).all(|w| w[0].len() + w[1].len() > C::TARGET) -} +pub mod vec; -/// Regrade `input` to completion into a fresh graded `Vec` (see [`Chunk::regrade`]). -/// -/// A convenience for the one-shot callers (batch sealing, the batcher's merge and -/// extract) that have a whole sequence in hand and want it graded; the streaming -/// callers drive [`Chunk::regrade`] directly across ticks. -pub fn regrade_all(input: impl IntoIterator) -> Vec { - let mut input: VecDeque = input.into_iter().collect(); - let mut out = VecDeque::new(); - C::regrade(&mut input, true, &mut out); - debug_assert!(input.is_empty()); - out.into() -} - -/// A consolidated, sorted sequence of `(data, time, diff)`. -/// -/// Chunks exist in sequences, with no constraints on the breakpoints between -/// them. Each holds at most [`TARGET`](Chunk::TARGET) updates; a graded sequence -/// is a maximal packing at that size (see [`is_graded`] and the module docs). 
-/// -/// `Clone` is expected to be cheap — a refcount bump on shared backing storage, -/// not a deep copy. The trace merger relies on this to read its (shared, -/// immutable) source batches by cloning chunks rather than consuming them. -/// -/// A chunk *has* a [`Cursor`] over its own `(key, val, time, diff)` contents — -/// the chunk is its own cursor `Storage`, mirroring [`BatchReader`]. This is what -/// lets a batch cursor delegate downward: the batch indexes which chunk holds a -/// key (reusing the chunk's `KeyContainer` / `ValContainer` for boundaries) and -/// then reads through that chunk's cursor. We do not provide this; the opaque -/// chunk implementor does. -/// -/// # The transducer protocol -/// -/// The four chunk-producing operations ([`merge`](Chunk::merge), -/// [`extract`](Chunk::extract), [`advance`](Chunk::advance), -/// [`regrade`](Chunk::regrade)) are all *stream transducers* over `VecDeque`, -/// sharing one calling convention so an implementor learns it once: -/// -/// * **Consume from the front.** Read chunks off the front of the input deque(s). -/// * **Withhold by pushing back.** Anything consumed but not yet safe to commit -/// (advance's still-growing last group; regrade's sub-`TARGET` carry; merge's -/// partially-consumed front) is reformed into a single owned chunk and -/// `push_front`ed back onto its input. The only cross-call state is therefore the -/// deques themselves — clean owned runs, no indices escape a call. -/// * **Commit by appending.** Append committed chunks to the output deque; once -/// appended they are written and a downstream stage may take them immediately. -/// * **`done` forces the flush.** The unary stages take `done: bool`; while it is -/// false they may withhold, and a call that appends nothing has yielded — the -/// harness will not call again until more input arrives or `done` flips true. 
On -/// `done` the stage must drain its withheld state (the harness keeps calling -/// until the output stops growing). -/// -/// Two operations vary only where their job demands it: [`merge`](Chunk::merge) is -/// binary (and the harness, not `merge`, handles a drained input by flushing the -/// other side's verbatim tail, so `merge` needs no `done`); [`extract`](Chunk::extract) -/// is the one-shot splitter (it drains its whole input, so it needs no `done` and -/// has two outputs plus a residual frontier). +/// A non-empty, bounded, consolidated, sorted sequence of `(data, time, diff)`. /// -/// Implementors are further expected to: +/// An implementor gains access to types and trait implementations that provide +/// batch formation and trace maintenance with no additional effort. /// -/// * **Emit near-graded output.** Fill `TARGET`-sized output chunks directly rather -/// than emitting one monolithic chunk; the terminal [`regrade`](Chunk::regrade) -/// only has to coalesce the trailing partials at the seams. Grading is a -/// *seal-time* property, not an invariant maintained between stages. -/// * **Recycle where possible.** Reuse the storage of chunks drained from the input -/// as the buffers for output, so allocations balance input against output rather -/// than allocating afresh per emitted chunk. `vec_chunk` is the worked example: it -/// fills buffers reclaimed from a stash of emptied input `Vec`s, and advance reuses -/// its withheld carry's storage in place so a giant key stays linear, not quadratic. -/// -/// [`BatchReader`]: crate::trace::BatchReader +/// The necessary implementations are either "data" or "metadata" operations. +/// The "data" operations transform lists of chunks, and are expected to do roughly +/// "one chunk's worth" of work at a time; they can afford to compress and page. +/// The "metadata" operations provide chunk information, and should be lightweight.
pub trait Chunk: Sized + Clone + LayoutExt { - /// The chunk size: both the maximum updates per chunk and the coalescing - /// threshold. - /// - /// A *graded* sequence (the post-[`regrade`](Chunk::regrade) shape) has every - /// chunk of length at most `TARGET`, and every adjacent pair summing to more - /// than `TARGET` — so no two neighbours could be combined into one legal chunk. - /// Equivalently, a maximal packing at size `TARGET`. [`is_graded`] checks - /// exactly this. The value is the implementor's tuning knob: larger means fewer - /// chunks (smaller index, less per-chunk overhead) but coarser merge-suspension - /// granularity and a larger within-chunk seek. Required, not defaulted: the - /// right value is layout-dependent, so every implementor chooses it deliberately. + /// The intended maximum chunk size. const TARGET: usize; - /// A cursor navigating this chunk's contents; the chunk is its storage. - /// - /// The layout aliases are spelled out (mirroring [`BatchReader`]) so the - /// cursor's `Key`/`Val`/`Time`/`Diff` and their containers are *definitionally* - /// equal to the chunk's — without this the compiler won't connect the cursor's - /// layout to the chunk's when reading through it. + /// A cursor navigating this chunk's contents. type Cursor: Cursor + WithLayout + @@ -177,79 +97,67 @@ pub trait Chunk: Sized + Clone + LayoutExt { fn cursor(&self) -> Self::Cursor; /// The first and last `(key, val, time)` triples in the chunk. - /// - /// The chunk must be non-empty (batch chunks always are). Expected to be - /// cheap — the chunk's endpoints, e.g. columnar indices `0` and `len - 1`, - /// not a cursor walk. Indexing a batch's chunks rests on this: the last - /// triples drive a binary search to a key or `(key, val)`, and comparing one - /// chunk's last triple against the next chunk's first detects keys or - /// `(key, val)` pairs that straddle the boundary — all without touching chunk - /// contents. 
Returned by reference (no owned key type exists in the layout); - /// the index materializes them into its own containers. fn bounds(&self) -> ( (Self::Key<'_>, Self::Val<'_>, Self::TimeGat<'_>), (Self::Key<'_>, Self::Val<'_>, Self::TimeGat<'_>), ); /// The number of updates in the chunk. - /// - /// Chunks are always non-empty (`len() > 0`): producers drop empties before - /// they reach a chunk sequence, and [`ChunkBatch::new`] asserts the invariant. fn len(&self) -> usize; /// Merge the fronts of two input deques through their shared horizon. /// - /// Both deques are non-empty (the caller guarantees it). The two front chunks - /// merge through updates present in both — up to the least last `(key, val, time)` - /// triple across them — consolidating collisions and emitting committed chunks to - /// `out`. The side owning the horizon is fully consumed and `pop_front`ed; the - /// other's partially-consumed front is reformed (its consumed prefix dropped) and - /// `push_front`ed back. So on return at least one deque has had its front retired. + /// Both deques are non-empty (the caller guarantees it). The two queues are both + /// the heads of lists of chunks, and the implementor should only merge through the + /// least last `(key, val, time)` update, or risk emitting an unconsolidated + /// output chunk. + /// + /// When a chunk cannot be completely retired (e.g. because it has the larger last update), + /// its remainder should be rewritten as a new chunk and pushed back to the front of its queue. + /// The invocation is expected to consume at least one of its inputs, and the harness + /// may continually re-invoke if this doesn't happen. /// - /// `merge` makes one front-pair's worth of progress and returns; the harness - /// re-ticks it, refilling a drained deque from its source, and itself handles an - /// exhausted source by flushing the other deque's verbatim tail — so `merge` needs - /// no `done` and never has to reason about end-of-input.
+ /// A merge concludes when the harness sees that either input is now empty, at which + /// point it appends the queue to the output without the method's assistance. fn merge(in1: &mut VecDeque, in2: &mut VecDeque, out: &mut VecDeque); - /// Partition the input by `frontier` into updates greater-or-equal it (`keep`) or - /// not (`ship`). One-shot: the whole of `input` is consumed. + /// Partition `input` updates into `keep` (greater or equal `frontier`) or not (`ship`). /// - /// The lower envelope of the times routed to `keep` is folded into `residual`, so - /// the caller learns the frontier of data it still holds without a second pass. - /// Outputs are near-graded but not regraded; a terminal [`regrade`](Chunk::regrade) - /// zips up the seams. + /// An implementation should yield with some frequency to allow the output to "settle". + /// The harness may guard against this, but it prefers to provide as much context as it + /// can in order to allow broader chunk fusion where needed. fn extract( input: &mut VecDeque, - frontier: &Antichain, + frontier: AntichainRef, residual: &mut Antichain, keep: &mut VecDeque, ship: &mut VecDeque, ); - /// Advance times by `frontier`, consolidating each complete `(key, val)` group from - /// the front of `input` into `out`. + /// Advance times by `frontier` producing consolidated chunks. /// - /// A group is complete once a later `(key, val)` is seen, so every group but the - /// last is emitted; the last (which a future call might extend) is reformed and - /// `push_front`ed back as the withheld carry — unless `done`, which flushes it too. - /// The degenerate case is a single `(key, val)` spanning all available input: no - /// group is provably complete, so nothing is committed (the whole buffer is - /// withheld) until `done`. + /// An output for `(key, val)` should generally not be produced until a later pair + /// is observed, or `done` is set, to ensure the output chunks are consolidated. 
+ /// Incomplete work can be pushed back to the front of `input`. + /// + /// On `done` a single `(key, val)` group may span the whole input; advancing and + /// consolidating it should cost time linear in its size, not quadratic. fn advance( input: &mut VecDeque, - frontier: &Antichain, + frontier: AntichainRef, done: bool, out: &mut VecDeque, ); - /// Reshape the front of `input` into a maximal packing in `out`: each chunk at most - /// [`TARGET`](Chunk::TARGET), and any two adjacent summing past `TARGET` (so no - /// neighbours could be combined). See [`is_graded`]. + /// Reshape `input` to a sequence that maintains the "grading" structural invariant. + /// + /// Specifically, the chunks in `output` should have a maximum size of `TARGET` and + /// each adjacent pair should have lengths that sum to more than `TARGET`. + /// This is also a good moment to consider compression or paging out the contents. + /// When `done` is set the input must be moved to the output. /// - /// The terminal stage of every pipeline. A sub-`TARGET` carry that might still grow - /// is `push_front`ed back as the withheld remainder until `done`, which flushes it. - fn regrade( + /// This method may be called on already settled data, and should be efficient then. + fn settle( input: &mut VecDeque, done: bool, out: &mut VecDeque, @@ -257,115 +165,12 @@ pub trait Chunk: Sized + Clone + LayoutExt { } -/// Merge two full chains of chunks into one, to completion, appending to `out`. -/// -/// The whole-chain (non-fueled) driver used by the batcher's -/// [`Merger`](crate::trace::implementations::merge_batcher::Merger): both chains are in -/// hand, so it ticks [`Chunk::merge`] until one deque empties, then appends the other's -/// remainder (the verbatim tail). Output is near-graded; callers regrade as needed. 
-pub fn merge_chains( - chain1: Vec, - chain2: Vec, - out: &mut VecDeque, -) { - let mut in1: VecDeque = chain1.into(); - let mut in2: VecDeque = chain2.into(); - while !in1.is_empty() && !in2.is_empty() { - C::merge(&mut in1, &mut in2, out); - } - // One deque is empty; the other's remainder is all greater than everything merged. - out.extend(in1.drain(..)); - out.extend(in2.drain(..)); -} - -/// A merge-batcher [`Merger`](crate::trace::implementations::merge_batcher::Merger) -/// over chains of [`Chunk`]s. -/// -/// `merge` runs the whole-chain binary merger; `extract` splits by the seal frontier -/// using [`Chunk::extract`]. The batcher consolidates equal `(data, time)` updates -/// but does *not* advance times — time advancement is advance's job, handled later in -/// the trace. Both regrade their output, since the batcher's chains want to be graded. -pub struct ChunkMerger { - _marker: std::marker::PhantomData, -} - -impl Default for ChunkMerger { - fn default() -> Self { Self { _marker: std::marker::PhantomData } } -} - -impl crate::trace::implementations::merge_batcher::Merger for ChunkMerger -where - C: Chunk + Default + 'static, - C::Time: Clone + timely::PartialOrder + 'static, -{ - type Chunk = C; - type Time = C::Time; - - fn merge( - &mut self, - list1: Vec, - list2: Vec, - output: &mut Vec, - _stash: &mut Vec, - ) { - let mut merged = VecDeque::new(); - merge_chains(list1, list2, &mut merged); - // No regrade: the batcher's ladder weighs chains by updates (not chunk count) - // since #767, so intermediate grading buys nothing; the final batch is graded - // at seal. merge's output is already near-`TARGET`. - output.extend(merged); - } - - fn extract( - &mut self, - merged: Vec, - upper: AntichainRef, - frontier: &mut Antichain, - ship: &mut Vec, - kept: &mut Vec, - _stash: &mut Vec, - ) { - // `extract` keeps updates greater-or-equal `upper` and ships the rest, - // folding the lower envelope of kept times into `frontier`. 
- let upper = upper.to_owned(); - let mut input: VecDeque = merged.into(); - let (mut keep, mut shipped) = (VecDeque::new(), VecDeque::new()); - C::extract(&mut input, &upper, frontier, &mut keep, &mut shipped); - // No regrade: `kept` is re-merged later and `shipped` is regraded at seal by - // the builder, so neither needs grading here. - kept.extend(keep); - ship.extend(shipped); - } - - fn len(chunk: &C) -> usize { chunk.len() } -} - -/// The merge batcher for chunks of type `C`, merging pre-chunked `C` runs. -/// -/// The batcher accepts already-formed `C` chunks via `PushInto` and merges them -/// through [`ChunkMerger`]; it holds no chunker. The `Input → C` bridge lives at the -/// `arrange_core` callsite, which supplies the chunker (e.g. [`ContainerChunker`] -/// for same-shape input, where `C` satisfies the batcher-side container traits -/// `SizableContainer`, `Consolidate`, `Container`, `PushInto`). -/// -/// [`ContainerChunker`]: crate::trace::implementations::chunker::ContainerChunker -pub type ChunkBatcher = crate::trace::implementations::merge_batcher::MergeBatcher>; - -/// A spine of `Rc`-shared [`ChunkBatch`]es of type `C`: the trace type for `arrange`. -pub type ChunkSpine = crate::trace::implementations::spine_fueled::Spine>>; - -/// A reference-counted [`ChunkBatch`] builder over chunks of type `C`. -pub type ChunkRcBuilder = crate::trace::rc_blanket_impls::RcBuilder>; +type KeyCon = <::Layout as Layout>::KeyContainer; +type ValCon = <::Layout as Layout>::ValContainer; -/// A batch is just an ordered sequence of [`Chunk`]s plus its time description. -/// -/// The chunks are sorted and consolidated, with chunk boundaries arbitrary; the -/// concatenation of their contents is the batch. +/// A batch is a [`Chunk`] sequence plus a [`Description`]. 
/// -/// This is a full [`Batch`](crate::trace::Batch): [`ChunkBatchCursor`] reads -/// across the chunks (delegating to each chunk's own cursor and continuing past -/// boundaries), [`ChunkBatchMerger`] performs the resumable merge-and-advance, -/// and [`ChunkBuilder`] collects pre-sorted chunks. All of those are below. +/// Metadata about the batches is cached to make subselection efficient. pub struct ChunkBatch { /// Ordered, consolidated chunks; their concatenation is the batch. pub chunks: Vec, @@ -402,6 +207,43 @@ impl WithLayout for ChunkBatch { type Layout = C::Layout; } +impl BatchReader for ChunkBatch { + type Cursor = ChunkBatchCursor; + fn cursor(&self) -> Self::Cursor { + ChunkBatchCursor { key_chunk: 0, chunk: 0, inner: self.chunks.first().map(C::cursor) } + } + fn len(&self) -> usize { self.chunks.iter().map(C::len).sum() } + fn description(&self) -> &Description { &self.description } +} + +impl Batch for ChunkBatch +where + C::Time: timely::progress::Timestamp + Lattice + Ord, +{ + type Merger = ChunkBatchMerger; + + fn empty(lower: Antichain, upper: Antichain) -> Self { + use timely::progress::Timestamp; + let since = Antichain::from_elem(Self::Time::minimum()); + ChunkBatch::new(Vec::new(), Description::new(lower, upper, since)) + } +} + +/// The merge batcher for chunks of type `C`, merging pre-chunked `C` runs. +/// +/// The batcher accepts already-formed `C` chunks and merges them through +/// [`ChunkMerger`]; it holds no chunker, so the `Input → C` bridge is supplied at the +/// `arrange_core` callsite. The merger consolidates equal `(data, time)` updates but +/// does *not* advance times — that happens later in the trace — and settles the +/// output of both `merge` and `extract`. +pub type ChunkBatcher = crate::trace::implementations::merge_batcher::MergeBatcher>; + +/// A spine of `Rc`-shared [`ChunkBatch`]es of type `C`: the trace type for `arrange`.
+pub type ChunkSpine = crate::trace::implementations::spine_fueled::Spine>>; + +/// A reference-counted [`ChunkBatch`] builder over chunks of type `C`. +pub type ChunkBuilder = crate::trace::rc_blanket_impls::RcBuilder>; + /// A cursor over a [`ChunkBatch`], merging the per-chunk cursors. /// /// Chunk breakpoints are unconstrained, so a single key — or `(key, val)` — may @@ -544,55 +386,92 @@ impl Cursor for ChunkBatchCursor { } } -impl BatchReader for ChunkBatch { - type Cursor = ChunkBatchCursor; - fn cursor(&self) -> Self::Cursor { - ChunkBatchCursor { key_chunk: 0, chunk: 0, inner: self.chunks.first().map(C::cursor) } - } - fn len(&self) -> usize { self.chunks.iter().map(C::len).sum() } - fn description(&self) -> &Description { &self.description } +/// A merge-batcher [`Merger`](crate::trace::implementations::merge_batcher::Merger) +/// over chains of [`Chunk`]s. +/// +/// `merge` runs the whole-chain binary merger; `extract` splits by the seal frontier +/// using [`Chunk::extract`]. The batcher consolidates equal `(data, time)` updates +/// but does *not* advance times — time advancement is advance's job, handled later in +/// the trace. Both settle their output, since the batcher's chains want to be graded. 
+pub struct ChunkMerger { + _marker: std::marker::PhantomData, } -impl Batch for ChunkBatch +impl Default for ChunkMerger { + fn default() -> Self { Self { _marker: std::marker::PhantomData } } +} + +impl crate::trace::implementations::merge_batcher::Merger for ChunkMerger where - C::Time: timely::progress::Timestamp + Lattice + Ord, + C: Chunk + Default + 'static, + C::Time: Clone + timely::PartialOrder + 'static, { - type Merger = ChunkBatchMerger; + type Chunk = C; + type Time = C::Time; - fn empty(lower: Antichain, upper: Antichain) -> Self { - use timely::progress::Timestamp; - let since = Antichain::from_elem(Self::Time::minimum()); - ChunkBatch::new(Vec::new(), Description::new(lower, upper, since)) + fn merge( + &mut self, + list1: Vec, + list2: Vec, + output: &mut Vec, + _stash: &mut Vec, + ) { + // Settle the output after each merge, to maintain bounded active chunks. + let mut in1: VecDeque = list1.into(); + let mut in2: VecDeque = list2.into(); + let (mut staged, mut settled) = (VecDeque::new(), VecDeque::new()); + while !in1.is_empty() && !in2.is_empty() { + C::merge(&mut in1, &mut in2, &mut staged); + C::settle(&mut staged, false, &mut settled); + } + // Append the non-empty tail from either input, settle as we go. + for tail in in1.drain(..).chain(in2.drain(..)) { + staged.push_back(tail); + C::settle(&mut staged, false, &mut settled); + } + C::settle(&mut staged, true, &mut settled); + output.extend(settled); } + + fn extract( + &mut self, + merged: Vec, + upper: AntichainRef, + frontier: &mut Antichain, + ship: &mut Vec, + kept: &mut Vec, + _stash: &mut Vec, + ) { + // `extract` keeps updates greater-or-equal `upper` and ships the rest, folding + // the lower envelope of kept times into `frontier`. Drive it a bounded amount + // per call (≈ one input chunk) and `settle` each side as it accumulates, so + // neither `keep` (retained across yields) nor `ship` (handed to the builder) + // builds up unsettled in core. 
`settle` may withhold a sub-`TARGET` carry + // between calls; the final `settle(done)` flushes it. + let mut input: VecDeque = merged.into(); + let (mut keep, mut shipped) = (VecDeque::new(), VecDeque::new()); + let (mut kept_q, mut shipped_q) = (VecDeque::new(), VecDeque::new()); + while !input.is_empty() { + C::extract(&mut input, upper, frontier, &mut keep, &mut shipped); + C::settle(&mut keep, false, &mut kept_q); + C::settle(&mut shipped, false, &mut shipped_q); + } + C::settle(&mut keep, true, &mut kept_q); + C::settle(&mut shipped, true, &mut shipped_q); + kept.extend(kept_q); + ship.extend(shipped_q); + } + + fn len(chunk: &C) -> usize { chunk.len() } } -/// A merge of two [`ChunkBatch`]es in progress. -/// -/// This is the [`ChunkBatch`] merger, wired in as its -/// [`Batch::Merger`](crate::trace::Batch::Merger), and has that trait's -/// `new` / `work` / `done` shape. +/// The resumable [`Batch::Merger`] for [`ChunkBatch`]: merges two batches and advances +/// their times to the compaction frontier, a fuel-bounded step at a time. /// -/// The merge is *resumable* and runs a two-stage deque pipeline: -/// [`merge`](Chunk::merge) feeds `merged`, [`advance`](Chunk::advance) consumes it -/// into `advanced`; the terminal [`regrade`](Chunk::regrade) runs once at `done`. Each -/// `work` step clones a burst from each source, ticks `merge` once, then advances the -/// fresh output, debiting `fuel` by the *merged* records that entered the pipe — the -/// total output across the merge, matching how the trace's other mergers account (cf. -/// `ord_neu`). The sources are read by *cloning* chunks (a cheap refcount bump per the -/// [`Chunk`] contract), never consumed or mutated; the same `source1`/`source2` must be -/// supplied on every call. When a source exhausts, the harness flushes the other's -/// verbatim tail one chunk per step. Once both are drained, a final `advance(done)` -/// flushes advance's withheld carry. 
-/// -/// **Latency bound.** `fuel` bounds each step to roughly one burst-merge's output. Two -/// things ride *outside* fuel: the terminal `advance(done)` and `done`'s `regrade`. In -/// the worst case — a single `(key, val)` spanning the whole merge — `advance` withholds -/// the entire group until `done`, then sorts and consolidates it in one unfueled step. -/// `vec_chunk` keeps that step *linear* in the group (it accumulates the carry in place, -/// reusing its storage), so it is not the quadratic blow-up of an earlier design, but it -/// is one unbounded-latency step bounded by the largest single `(key, val)` group. A -/// chunk impl must keep this flush linear; the latency claimed is "per step ≈ a burst, -/// plus a final flush ≤ the largest group." +/// Each step pipelines [`merge`](Chunk::merge) → [`advance`](Chunk::advance) → +/// [`settle`](Chunk::settle) and settles its output, so a suspended merge holds only +/// graded chunks. The sources are read by cloning (a cheap refcount bump) and must be +/// supplied unchanged on every call. pub struct ChunkBatchMerger { /// Compaction frontier supplied at construction. frontier: Antichain, @@ -607,9 +486,13 @@ pub struct ChunkBatchMerger { idx2: usize, /// `advance`'s input: the merge output plus advance's withheld carry at the front. merged: VecDeque, - /// `advance`'s output: the merged-and-advanced chunks, grown by `work`. + /// `advance`'s output and `settle`'s input: merged-and-advanced chunks, with + /// settle's withheld sub-`TARGET` carry at the front. advanced: VecDeque, - /// Set once both sources are drained and advance's final flush has run. + /// `settle`'s output: the committed, graded result, grown by `work`. Graded at + /// every yield, so a suspended merge holds well-formed (spillable) chunk state. + settled: VecDeque, + /// Set once both sources are drained and advance's and settle's final flushes ran. 
complete: bool, } @@ -618,7 +501,6 @@ where C: Chunk + Default + 'static, C::Time: timely::progress::Timestamp + Lattice + Ord + 'static, { - /// Begin merging `source1` and `source2`, advancing to `frontier`. fn new(source1: &ChunkBatch, source2: &ChunkBatch, frontier: AntichainRef) -> Self { let lower = source1.description.lower().meet(source2.description.lower()); let upper = source1.description.upper().join(source2.description.upper()); @@ -632,24 +514,20 @@ where idx2: 0, merged: VecDeque::new(), advanced: VecDeque::new(), + settled: VecDeque::new(), complete: false, } } - /// Advance the merge by up to `fuel` updates, suspending when it runs out. fn work(&mut self, source1: &ChunkBatch, source2: &ChunkBatch, fuel: &mut isize) { + + // TODO: The logic is a bit tortured here, and should be improved. + if self.complete { return; } while *fuel > 0 { - // Refill each input deque up to a burst of source chunks (clones); `merge` - // drains the loaded burst per call. The burst trades fuel granularity (a - // call does up to a burst's work before checking fuel) against re-pruning: - // a chunk that straddles many chunks on the other side is walked by index - // within one call but, once its tail spills past the loaded burst, its - // unconsumed suffix is pushed back and re-copied next call — a bigger burst - // absorbs more straddle per call. This workload is insensitive (1..32 flat - // to ~noise at 1M), so 8 is a conservative default, not a tuned optimum. - // After this, a deque is non-empty iff its source still has data. + // Refill each input deque up to a burst of source chunks (clones). + // The constant trades away fuel precision for overhead amortization. const BURST: usize = 8; while self.in1.len() < BURST && self.idx1 < source1.chunks.len() { self.in1.push_back(source1.chunks[self.idx1].clone()); @@ -670,45 +548,39 @@ where // Exactly one side has data: flush its verbatim tail, one chunk a step. 
produced.push_back(chunk); } else { - // Both sources drained: final flush of advance's withheld carry. - C::advance(&mut self.merged, &self.frontier, true, &mut self.advanced); + // Both sources drained: final flush of advance's and settle's carries. + C::advance(&mut self.merged, self.frontier.borrow(), true, &mut self.advanced); + C::settle(&mut self.advanced, true, &mut self.settled); self.complete = true; break; } let work: usize = produced.iter().map(C::len).sum(); self.merged.extend(produced); - C::advance(&mut self.merged, &self.frontier, false, &mut self.advanced); + C::advance(&mut self.merged, self.frontier.borrow(), false, &mut self.advanced); + // Maintain grading at the yield boundary: this step may exhaust `fuel` and + // suspend with `advanced` held, and held chunk state must be graded. + C::settle(&mut self.advanced, false, &mut self.settled); *fuel -= work as isize; } } - /// Extract the merged batch over `[lower, upper)` advanced to the frontier. - /// - /// Only valid once `work` has driven the merge to completion (left `fuel` - /// positive), as the [`trace::Merger`](crate::trace::Merger) contract requires. fn done(self) -> ChunkBatch { + debug_assert!(self.merged.is_empty() && self.advanced.is_empty()); let description = Description::new(self.lower, self.upper, self.frontier); - ChunkBatch::new(regrade_all(self.advanced), description) + ChunkBatch::new(self.settled.into(), description) } } -/// A [`Builder`](crate::trace::Builder) that collects pre-sorted chunks into a -/// [`ChunkBatch`]. -/// -/// The builder assumes its inputs arrive already sorted and consolidated (as the -/// `Builder` contract requires), so it does no merging: each pushed chunk is an -/// ordered run, fed straight to [`regrade`](Chunk::regrade) as it arrives — so a batch -/// built here is graded like one produced by the merger, rather than inheriting -/// whatever chunk sizes the caller happened to push. 
-pub struct ChunkBuilder { - /// Pushed chunks awaiting regrading; holds regrade's sub-`TARGET` carry at the front. +/// A [`Builder`](crate::trace::Builder) that collects a chunk sequence into a [`ChunkBatch`]. +pub struct ChunkBatchBuilder { + /// Pushed chunks awaiting settling; holds settle's sub-`TARGET` carry at the front. input: VecDeque, /// The graded chunks emitted so far. output: VecDeque, } -impl crate::trace::Builder for ChunkBuilder +impl crate::trace::Builder for ChunkBatchBuilder where C: Chunk + Default + 'static, C::Time: timely::progress::Timestamp, @@ -725,22 +597,64 @@ where let chunk = std::mem::take(chunk); if chunk.len() > 0 { self.input.push_back(chunk); - C::regrade(&mut self.input, false, &mut self.output); + C::settle(&mut self.input, false, &mut self.output); } } fn done(self, description: Description) -> ChunkBatch { - let ChunkBuilder { mut input, mut output } = self; - C::regrade(&mut input, true, &mut output); + let ChunkBatchBuilder { mut input, mut output } = self; + C::settle(&mut input, true, &mut output); ChunkBatch::new(output.into(), description) } fn seal(chain: &mut Vec, description: Description) -> ChunkBatch { - // The chain is sorted and consolidated but not necessarily graded; regrade it. - // Already-`TARGET` chunks pass through as cheap `Rc` moves, so a chain that - // arrives graded (as the batcher's does) pays only an O(#chunks) walk. - ChunkBatch::new(regrade_all(std::mem::take(chain)), description) + // We settle the chain because we are not guaranteed to receive pre-settled data. + // This should be efficient on pre-settled data. + ChunkBatch::new(settle_all(std::mem::take(chain)), description) } } -pub mod vec_chunk; +/// Whether `chunks` satisfy the [`Chunk::TARGET`] grading invariant: every chunk +/// at most `TARGET`, and every adjacent pair summing to more than `TARGET` (so no +/// two neighbours could be combined into one legal chunk — a *maximal packing*).
+/// +/// This is the post-[`settle`](Chunk::settle) shape; useful as a test/debug check. +pub fn is_graded(chunks: &[C]) -> bool { + chunks.iter().all(|c| c.len() <= C::TARGET) + && chunks.windows(2).all(|w| w[0].len() + w[1].len() > C::TARGET) +} + +/// Settle `input` to completion into a fresh graded `Vec` (see [`Chunk::settle`]). +/// +/// A convenience for one-shot callers (such as batch sealing) that have a whole +/// sequence in hand and want it graded; streaming callers, including the batcher's +/// merge and extract, drive [`Chunk::settle`] directly across ticks. +pub fn settle_all(input: impl IntoIterator) -> Vec { + let mut input: VecDeque = input.into_iter().collect(); + let mut out = VecDeque::new(); + C::settle(&mut input, true, &mut out); + debug_assert!(input.is_empty()); + out.into() +} + +/// Merge two full chains of chunks into one, to completion, appending to `out`. +/// +/// The plain whole-chain driver: ticks [`Chunk::merge`] until one deque empties, then +/// appends the other's remainder (the verbatim tail). Output is near-graded, not +/// settled. The batcher's `merge` runs the same loop but settles after each push (the +/// bounded-footprint discipline) and so does not use this; it stays as the simplest way +/// to drive [`Chunk::merge`] to completion. +pub fn merge_chains( + chain1: Vec, + chain2: Vec, + out: &mut VecDeque, +) { + let mut in1: VecDeque = chain1.into(); + let mut in2: VecDeque = chain2.into(); + while !in1.is_empty() && !in2.is_empty() { + C::merge(&mut in1, &mut in2, out); + } + // One deque is empty; the other's remainder is all greater than everything merged.
+ out.extend(in1.drain(..)); + out.extend(in2.drain(..)); +} diff --git a/differential-dataflow/src/trace/chunk/vec_chunk.rs b/differential-dataflow/src/trace/chunk/vec.rs similarity index 71% rename from differential-dataflow/src/trace/chunk/vec_chunk.rs rename to differential-dataflow/src/trace/chunk/vec.rs index 42d6d1b01..d20681d9f 100644 --- a/differential-dataflow/src/trace/chunk/vec_chunk.rs +++ b/differential-dataflow/src/trace/chunk/vec.rs @@ -1,35 +1,17 @@ -//! A worked [`Chunk`] implementation: `Vec<((K, V), T, R)>` behind an `Rc`. +//! A worked [`Chunk`]: `Vec<((K, V), T, R)>` behind an `Rc`. //! -//! This is the reference example — a next implementor (e.g. columnar) follows -//! its *shape*, not its layout. It shows the two integration points any chunk -//! type satisfies, and how leaning on the parent module's generic harnesses -//! keeps the code terse: +//! The reference implementation. It shows the two integration points any `Chunk` +//! satisfies; another layout copies this *shape*, not the `Vec`: //! -//! * **Batcher side.** The merge batcher's `ContainerChunker` builds chunks, so -//! the type implements timely's container traits (`Accountable`, -//! `SizableContainer`, `Consolidate`, `PushInto`). Here they delegate to the -//! inner `Vec` via `Rc::make_mut` — free while a chunk is being built -//! (refcount 1), and it never copies a *shared* chunk because batches are -//! immutable once built. -//! * **Trace side.** [`Chunk`] (merge / extract / advance / regrade / bounds) -//! plus a cursor. Key lookups are logarithmic by galloping search (`seek_*`), -//! independent of chunk size; stepping stays linear (short hops). +//! * **Batcher side.** The chunker builds chunks through timely's container traits +//! (`Accountable`, `SizableContainer`, `Consolidate`, `PushInto`), which here +//! delegate to the inner `Vec` via `Rc::make_mut` (free while building, never +//! copying a shared batch). +//! 
* **Trace side.** [`Chunk`] plus a cursor: key lookups gallop (logarithmic in +//! chunk size), stepping is linear. //! -//! `Clone` is a refcount bump, so the trace merger shares source chunks instead -//! of copying them. -//! -//! **What a columnar impl can and can't reuse.** The protocol (the `VecDeque` -//! in/out, withhold-by-`push_front`, grade-at-seal) is layout-agnostic and carries -//! over unchanged. The *merge body* does not: this one merges a single contiguous -//! `&[((K,V),T,R)]` and bulk-copies disjoint runs with `extend_from_slice` + -//! `chunks(TARGET)`. A columnar chunk (ranging over `ord_neu`'s deduped layout) has -//! no such slice — it must range-copy the key / val / time / diff columns with -//! offset bookkeeping, emitting one key + its val/time run rather than repeated rows. -//! That is the operation that beats the flat layout on repetitive keys (see the -//! module-level note on the row-major vs. columnar crossover), and it is also where -//! the earlier `col_chunk` got into trouble (decompress-and-recompress instead of a -//! true range-copy). So a columnar `Chunk` is the open bet: nothing here exercises a -//! columnar merge, and that body — not the protocol — is the phase-2 risk. +//! `Clone` is a refcount bump, so the trace merger shares source chunks rather than +//! copying them. use std::collections::VecDeque; use std::marker::PhantomData; @@ -38,6 +20,7 @@ use std::rc::Rc; use timely::Accountable; use timely::container::{PushInto, SizableContainer}; use timely::progress::{Antichain, Timestamp}; +use timely::progress::frontier::AntichainRef; use crate::consolidation::Consolidate; use crate::difference::Semigroup; @@ -47,9 +30,7 @@ use crate::trace::implementations::{Vector, WithLayout}; use super::Chunk; -/// The chunk size: both the maximum updates per chunk and the coalescing -/// threshold (see [`Chunk::TARGET`]). Chosen for the reference impl; exposed as -/// the associated const below, and used internally for buffer sizing. 
+/// The chunk size: the [`Chunk::TARGET`] value, also used for buffer sizing. const TARGET: usize = 1024; /// A sorted, consolidated run of `((key, val), time, diff)`, shared via `Rc`. @@ -64,15 +45,11 @@ impl Default for VecChunk { /// The trace type for `arrange`: a spine of `Rc`-shared chunk batches. pub type ChunkSpine = super::ChunkSpine>; -/// Merge batcher over `VecChunk`s. Unordered `Vec<((K, V), T, R)>` input is -/// consolidated into sorted `VecChunk`s by a `ContainerChunker` supplied -/// at the `arrange_core` callsite (it drives the container-trait impls below); the -/// batcher itself only merges the resulting chunks. +/// Merge batcher over `VecChunk`s; a `ContainerChunker` at the +/// `arrange_core` callsite forms the chunks it merges (via the container traits below). pub type ChunkBatcher = super::ChunkBatcher>; -/// Reference-counted batch builder. -pub type ChunkRcBuilder = super::ChunkRcBuilder>; - -// --- batcher side: timely container traits, delegating to the inner `Vec` --- +/// Batch builder. +pub type ChunkBuilder = super::ChunkBuilder>; impl Accountable for VecChunk { fn record_count(&self) -> i64 { self.0.len() as i64 } @@ -80,9 +57,8 @@ impl Accountable for VecChunk SizableContainer for VecChunk where K: Clone+'static, V: Clone+'static, T: Clone+'static, R: Clone+'static { - // The absorb point is the grading target: the chunker fills a scratch chunk - // to `TARGET` updates before emitting, so chunks arrive pre-graded rather than - // at timely's byte-derived buffer size (which downstream regrading re-melds). + // Absorb at `TARGET`, the grading size, so the chunker emits pre-graded chunks + // rather than timely's byte-derived ones. 
fn at_capacity(&self) -> bool { self.0.len() >= TARGET } fn ensure_capacity(&mut self, _stash: &mut Option) { let inner = Rc::make_mut(&mut self.0); @@ -104,8 +80,6 @@ where K: Clone+'static, V: Clone+'static, T: Clone+'static, R: Clone+'static { fn push_into(&mut self, item: ((K, V), T, R)) { Rc::make_mut(&mut self.0).push(item); } } -// --- trace side: a logarithmic cursor and the `Chunk` operations --- - /// First index `>= start` at which `pred` turns false, by galloping (exponential) /// search. `pred` must hold for a prefix then not — i.e. `|u| u < target`. /// O(log distance), so O(1) for short hops and logarithmic for long ones. @@ -221,15 +195,11 @@ where K: Ord+Clone+'static, V: Ord+Clone+'static, T: Lattice+Timestamp, R: Ord+S fn len(&self) -> usize { self.0.len() } - /// A two-pointer binary merge that drains the two deques' *loaded* content - /// through their shared horizon — the lesser of the two deques' last loaded - /// `(key, val, time)`s — rather than one front-pair at a time. Consolidates - /// equal triples and bulk-copies disjoint runs as slices, walking across chunk - /// boundaries with local indices (`p1`/`p2`) that reset as each working chunk - /// is retired. The side owning the horizon drains fully; the other's partial - /// working chunk is pruned (its prefix dropped) and `push_front`ed back exactly - /// once at the yield boundary — so the per-call prune cost amortizes over the - /// whole burst the harness loaded, not over each chunk. + /// A two-pointer binary merge of the two deques' loaded content, up to their shared + /// horizon — the lesser of the two last `(key, val, time)`s. Consolidates equal + /// triples and bulk-copies disjoint runs as slices, walking chunk boundaries with + /// local indices. The horizon's owner drains fully; the other's partial front is + /// pruned and pushed back once, at the yield. 
fn merge(in1: &mut VecDeque, in2: &mut VecDeque, out: &mut VecDeque) { fn kv(u: &((K, V), T, R)) -> (&K, &V) { (&u.0.0, &u.0.1) } @@ -240,12 +210,9 @@ where K: Ord+Clone+'static, V: Ord+Clone+'static, T: Lattice+Timestamp, R: Ord+S } }; - // Working chunks (the shared `Rc`, read by index — never `take`n, so a - // source clone is not deep-copied) and their positions; both deques are - // non-empty on entry. The guard keeps both cursors valid for indexing; a - // working chunk consumed mid-merge is refilled at the foot of the loop, and - // when a deque runs dry we stop — that side has presented all its loaded - // data, so its last triple is the horizon and the rest is left for next time. + // Read working chunks by index (never `take`n, so a source clone stays shared). + // Both deques are non-empty on entry; the loop stops when one runs dry — its + // last triple is the horizon, and the rest waits for the next call. let mut c1 = in1.pop_front().unwrap(); let mut c2 = in2.pop_front().unwrap(); let (mut p1, mut p2) = (0usize, 0usize); @@ -253,9 +220,8 @@ where K: Ord+Clone+'static, V: Ord+Clone+'static, T: Lattice+Timestamp, R: Ord+S let a = &c1.0[p1]; let b = &c2.0[p2]; match (kv(a), &a.1).cmp(&(kv(b), &b.1)) { - // Copy the run of one side strictly below the other's head (within - // the current working chunk): collisions are impossible within it, - // so it moves as slices cut at the grading target. + // Copy the run strictly below the other's head — no collisions there — + // as `TARGET`-sized slices. std::cmp::Ordering::Less => { let run = gallop(&c1.0[..], p1 + 1, |u| (kv(u), &u.1) < (kv(b), &b.1)); for piece in c1.0[p1..run].chunks(TARGET) { @@ -283,8 +249,7 @@ where K: Ord+Clone+'static, V: Ord+Clone+'static, T: Lattice+Timestamp, R: Ord+S flush(&mut result, false); } } - // Refill either working chunk consumed by the step above; stop the drain - // once a deque is exhausted (the `&&` guard then never re-enters). 
+ // Refill a working chunk consumed above; if its deque is empty, stop. if p1 == c1.0.len() { match in1.pop_front() { Some(c) => { c1 = c; p1 = 0; } None => break } } @@ -293,41 +258,27 @@ where K: Ord+Clone+'static, V: Ord+Clone+'static, T: Lattice+Timestamp, R: Ord+S } } flush(&mut result, true); - // One side's deque emptied with its working chunk exhausted; the other's - // working chunk is partial — push back just its unconsumed suffix (one copy - // per call), ahead of whatever loaded chunks remain in that deque. + // Push back the survivor's unconsumed suffix (one copy), ahead of its + // remaining loaded chunks. if p1 < c1.0.len() { in1.push_front(VecChunk(Rc::new(c1.0[p1..].to_vec()))); } if p2 < c2.0.len() { in2.push_front(VecChunk(Rc::new(c2.0[p2..].to_vec()))); } } fn extract( input: &mut VecDeque, - frontier: &Antichain, + frontier: AntichainRef, residual: &mut Antichain, keep: &mut VecDeque, ship: &mut VecDeque, ) { - // Fill `TARGET`-sized buffers directly, so the chunks pushed are already - // graded and `regrade` passes them through as `Rc` moves rather than - // re-splitting (and re-copying) a monolithic chunk. Emptied input `Vec`s - // are recycled as the next buffers, so allocations balance input against - // output instead of one fresh buffer per emitted chunk. - let mut stash: Vec> = Vec::new(); - let take_buf = |stash: &mut Vec<_>| stash.pop().unwrap_or_default(); - let (mut k, mut s) = (take_buf(&mut stash), take_buf(&mut stash)); - for chunk in input.drain(..) { - let mut v = take(chunk); - for u in v.drain(..) 
{ - if frontier.borrow().less_equal(&u.1) { - residual.insert_ref(&u.1); - k.push(u); - if k.len() >= TARGET { keep.push_back(VecChunk(Rc::new(std::mem::replace(&mut k, take_buf(&mut stash))))); } - } else { - s.push(u); - if s.len() >= TARGET { ship.push_back(VecChunk(Rc::new(std::mem::replace(&mut s, take_buf(&mut stash))))); } - } - } - stash.push(v); + // One input chunk per call: partition it into a keep piece and a ship piece and + // return, so the harness settles each side before the next chunk is read. The + // pieces may be small; `settle` grades them. + let Some(chunk) = input.pop_front() else { return }; + let (mut k, mut s) = (Vec::new(), Vec::new()); + for u in take(chunk) { + if frontier.less_equal(&u.1) { residual.insert_ref(&u.1); k.push(u); } + else { s.push(u); } } if !k.is_empty() { keep.push_back(VecChunk(Rc::new(k))); } if !s.is_empty() { ship.push_back(VecChunk(Rc::new(s))); } @@ -335,22 +286,17 @@ where K: Ord+Clone+'static, V: Ord+Clone+'static, T: Lattice+Timestamp, R: Ord+S fn advance( input: &mut VecDeque, - frontier: &Antichain, + frontier: AntichainRef, done: bool, out: &mut VecDeque, ) { - // Advance and consolidate every *complete* `(key, val)` group eagerly, - // so its updates can be released as soon as the input proves no later - // time for the pair can arrive. A group is contiguous in the sorted - // chain, so the only one that might continue in a future call is the last; - // unless `done`, we process up to its start and `push_front` the rest as - // the withheld carry for the next call. + // Advance and consolidate each *complete* `(key, val)` group eagerly. Only the + // last group might still grow; unless `done`, withhold it (push it back as the + // carry) and emit the rest. let mut stash: Vec> = Vec::new(); - // Build the working buffer by *reusing the front chunk's storage* (the - // carry from last time) and appending the rest (recycling each emptied - // `Vec`). 
Reusing the front is what keeps a withheld group from being - // recopied across calls: it just accumulates in place, so a `(key, val)` - // larger than the working set costs O(total) over the run, not O(total²). + // Reuse the front chunk's storage (last call's carry) as the working buffer and + // append the rest, so a withheld group accumulates in place: O(total) over the + // run, not O(total²). let mut buf = match input.pop_front() { Some(chunk) => take(chunk), None => return }; while let Some(chunk) = input.pop_front() { let mut v = take(chunk); @@ -359,18 +305,15 @@ where K: Ord+Clone+'static, V: Ord+Clone+'static, T: Lattice+Timestamp, R: Ord+S } if buf.is_empty() { return; } - // If every available update shares one `(key, val)`, no group is provably - // complete — a later call may extend it — so make no progress unless - // `done`: push the accumulated buffer back as the carry and return. This is - // the giant-key case; comparing only the first and last pair detects it - // without scanning, and reusing the front above makes the retention free. + // Giant-key case: if the whole buffer is one `(key, val)`, no group is provably + // complete, so unless `done` withhold it all and return. First-vs-last detects + // this without a scan. if !done && buf[0].0 == buf[buf.len() - 1].0 { input.push_front(VecChunk(Rc::new(buf))); return; } - // Otherwise at least the first group is complete. Withhold the last group - // (a single `(key, val)`) as the next carry unless the input is complete. + // At least the first group is complete; withhold the last as the carry unless `done`. 
let end = if done { buf.len() } else { let last_kv = buf[buf.len() - 1].0.clone(); let mut start = buf.len(); @@ -380,16 +323,15 @@ where K: Ord+Clone+'static, V: Ord+Clone+'static, T: Lattice+Timestamp, R: Ord+S if end < buf.len() { input.push_front(VecChunk(Rc::new(buf.split_off(end)))); } - // Advance + consolidate each group into `TARGET`-sized output chunks, - // filling buffers reclaimed from the recycled `Vec`s. + // Advance and consolidate each group into `TARGET`-sized output chunks, filling + // buffers reclaimed from the recycled `Vec`s. let mut result = stash.pop().unwrap_or_default(); let mut i = 0; while i < buf.len() { let mut j = i; while j < buf.len() && buf[j].0 == buf[i].0 { j += 1; } - for u in &mut buf[i..j] { u.1.advance_by(frontier.borrow()); } - // Advancing is monotone w.r.t. the lattice but not the - // representation's total order, so re-sort the group by time. + for u in &mut buf[i..j] { u.1.advance_by(frontier); } + // Advancing is lattice-monotone but not total-order-monotone; re-sort by time. buf[i..j].sort_by(|a, b| a.1.cmp(&b.1)); let mut k = i; while k < j { @@ -408,21 +350,12 @@ where K: Ord+Clone+'static, V: Ord+Clone+'static, T: Lattice+Timestamp, R: Ord+S if !result.is_empty() { out.push_back(VecChunk(Rc::new(result))); } } - fn regrade(input: &mut VecDeque, done: bool, out: &mut VecDeque) { - // Maximal packing: emit chunks as large as possible up to `TARGET`, - // never splitting a pair that could combine into one legal (`<= TARGET`) - // chunk. A chunk of exactly `TARGET` is maximal — it cannot grow — so it - // passes straight through as an `Rc` move; only sub-`TARGET` chunks are - // copied, and only to coalesce with a neighbour. Producers fill to - // `TARGET`, so in steady state every chunk passes through and only the - // occasional trailing partial is coalesced. - // - // `carry` is the (sub-`TARGET`) chunk under construction. 
It is flushed - // once it reaches `TARGET`, `push_front`ed back onto `input` between calls, - // or emitted on `done`. Whenever `carry` is non-empty its left neighbour in - // `out` is a `TARGET` chunk (or `carry` is `out`'s first chunk), so - // emitting `carry` against a neighbour it cannot merge with — their sum - // exceeds `TARGET` — keeps the packing maximal on both sides. + fn settle(input: &mut VecDeque, done: bool, out: &mut VecDeque) { + // Maximal packing: a `TARGET` chunk is maximal, so it passes through as an `Rc` + // move; only sub-`TARGET` chunks are copied, and only to coalesce a neighbour. + // `carry` is the chunk under construction — flushed at `TARGET`, pushed back onto + // `input` between calls, or emitted on `done`. Its left neighbour in `out` is + // always a `TARGET` chunk, so emitting it keeps the packing maximal on both sides. let mut carry: Vec<((K, V), T, R)> = Vec::new(); while let Some(chunk) = input.pop_front() { if carry.is_empty() { @@ -493,12 +426,12 @@ mod test { chunks.into_iter().flat_map(|c| (*c.0).clone()).collect() } - // `extract` partitions by frontier and folds the kept frontier into `residual`; - // a terminal `regrade` then grades each side (the seams of near-graded output). + // `extract` partitions by frontier, a bounded amount per call, folding the kept + // frontier into `residual`; `settle` then fuses each side's pieces into a graded run. 
#[test] fn extract_partitions_and_grades() { use super::TARGET; - use crate::trace::chunk::{is_graded, regrade_all}; + use crate::trace::chunk::{is_graded, settle_all}; use timely::progress::Antichain; // 4·TARGET updates spread over many input chunks; even times ship @@ -508,12 +441,15 @@ mod test { let frontier = Antichain::from_elem(1u64); let mut residual = Antichain::new(); let (mut keep, mut ship) = (VecDeque::new(), VecDeque::new()); - VecChunk::extract(&mut input, &frontier, &mut residual, &mut keep, &mut ship); - let (keep, ship) = (regrade_all(keep), regrade_all(ship)); + // Drive to completion, as the harness does (one input chunk per call). + while !input.is_empty() { + VecChunk::extract(&mut input, frontier.borrow(), &mut residual, &mut keep, &mut ship); + } + let (keep, ship) = (settle_all(keep), settle_all(ship)); // Kept times are exactly {1}; that is the residual frontier. assert_eq!(residual, Antichain::from_elem(1u64)); - // Both sides are graded after the regrade. + // Both sides are graded after the settle. assert!(is_graded(&keep), "ungraded keep: {:?}", keep.iter().map(Chunk::len).collect::>()); assert!(is_graded(&ship), "ungraded ship: {:?}", ship.iter().map(Chunk::len).collect::>()); // Nothing lost: half the updates each way. @@ -532,7 +468,7 @@ mod test { let c0 = chunk(vec![((0, 0), 0, 1), ((0, 0), 1, 1), ((1, 0), 0, 1)]); let mut input: VecDeque<_> = VecDeque::from([c0]); let mut out = VecDeque::new(); - VecChunk::advance(&mut input, &frontier, false, &mut out); + VecChunk::advance(&mut input, frontier.borrow(), false, &mut out); // The trailing group (1,0) is withheld as the carry at the front of `input`. 
assert_eq!(input.len(), 1); @@ -559,15 +495,15 @@ mod test { let oneshot = { let mut q: VecDeque<_> = input().into(); let mut out = VecDeque::new(); - VecChunk::advance(&mut q, &frontier, false, &mut out); - VecChunk::advance(&mut q, &frontier, true, &mut out); + VecChunk::advance(&mut q, frontier.borrow(), false, &mut out); + VecChunk::advance(&mut q, frontier.borrow(), true, &mut out); flat(out) }; let incremental = { let mut q = VecDeque::new(); let mut out = VecDeque::new(); - for c in input() { q.push_back(c); VecChunk::advance(&mut q, &frontier, false, &mut out); } - VecChunk::advance(&mut q, &frontier, true, &mut out); + for c in input() { q.push_back(c); VecChunk::advance(&mut q, frontier.borrow(), false, &mut out); } + VecChunk::advance(&mut q, frontier.borrow(), true, &mut out); flat(out) }; assert_eq!(oneshot, incremental); @@ -588,8 +524,8 @@ mod test { let mut q = VecDeque::new(); let mut out = VecDeque::new(); - for c in make() { q.push_back(c); VecChunk::advance(&mut q, &frontier, false, &mut out); } - VecChunk::advance(&mut q, &frontier, true, &mut out); + for c in make() { q.push_back(c); VecChunk::advance(&mut q, frontier.borrow(), false, &mut out); } + VecChunk::advance(&mut q, frontier.borrow(), true, &mut out); // All times advance to 100 and consolidate to one update of diff `n`. assert_eq!(flat(out), vec![((7u64, 0u64), 100u64, n as i64)]); } @@ -603,13 +539,13 @@ mod test { assert_eq!(flat(out), vec![((0, 0), 0, 2), ((1, 0), 0, 1), ((2, 0), 0, 1)]); } - // Merging runs larger than `TARGET`, then regrading, yields a *graded* sequence + // Merging runs larger than `TARGET`, then settling, yields a *graded* sequence // (each chunk `<= TARGET`, adjacent pairs summing past `TARGET`) reproducing the // consolidated sorted contents. 
#[test] fn merge_emits_graded_chunks() { use super::TARGET; - use crate::trace::chunk::{is_graded, merge_chains, regrade_all}; + use crate::trace::chunk::{is_graded, merge_chains, settle_all}; // Two interleaving single-chunk chains: evens and odds over `0..4·TARGET`. let n = 4 * TARGET as u64; @@ -618,7 +554,7 @@ mod test { let mut out = VecDeque::new(); merge_chains(vec![evens], vec![odds], &mut out); - let chunks = regrade_all(out); + let chunks = settle_all(out); assert!(is_graded(&chunks), "merge output not graded: {:?}", chunks.iter().map(Chunk::len).collect::>()); @@ -678,11 +614,91 @@ mod test { } } - // `regrade` must produce a *maximal packing*: adjacent sub-`TARGET` chunks + // Driving `ChunkBatchMerger` to completion with tiny `fuel` — so it suspends and + // settles on nearly every tick — must yield the same advanced-and-consolidated + // batch as a one-shot reference, and that batch must be graded. Exercises the + // resumable merge→advance→settle pipeline and the grade-at-yield invariant. + #[test] + fn batch_merger_resumable_matches_reference() { + use crate::trace::{BatchReader, Description, Merger}; + use crate::trace::chunk::{ChunkBatch, ChunkBatchMerger, is_graded}; + use crate::trace::cursor::Cursor; + use crate::consolidation::consolidate_updates; + use timely::progress::Antichain; + + let mut seed = 0x9E3779B97F4A7C15u64; + let mut rng = move || { seed ^= seed << 13; seed ^= seed >> 7; seed ^= seed << 17; seed }; + + // A sorted, consolidated set over a small space, so the two sources collide + // and a `(key, val)` carries several times. 
+ fn gen(rng: &mut impl FnMut() -> u64) -> Vec<((u64, u64), u64, i64)> { + let n = rng() as usize % 40 + 1; + let mut v: Vec<((u64, u64), u64, i64)> = (0..n).map(|_| { + let k = rng() % 10; let val = rng() % 3; let t = rng() % 6; + let d = if rng() % 4 == 0 { -1 } else { 1 }; + ((k, val), t, d) + }).collect(); + consolidate_updates(&mut v); + v + } + // Cut a consolidated set into a batch of small chunks, so groups straddle. + fn batch(updates: &[((u64, u64), u64, i64)], sz: usize) -> ChunkBatch> { + let chunks: Vec<_> = updates.chunks(sz).map(|c| VecChunk(Rc::new(c.to_vec()))).collect(); + let desc = Description::new( + Antichain::from_elem(0u64), Antichain::from_elem(10u64), Antichain::from_elem(0u64)); + ChunkBatch::new(chunks, desc) + } + // Flatten a batch through its straddle-aware cursor, then consolidate. + fn read(b: &ChunkBatch>) -> Vec<((u64, u64), u64, i64)> { + let mut out = Vec::new(); + let mut c = b.cursor(); + while c.key_valid(b) { + let k = *c.key(b); + while c.val_valid(b) { + let v = *c.val(b); + c.map_times(b, |t, d| out.push(((k, v), *t, *d))); + c.step_val(b); + } + c.step_key(b); + } + consolidate_updates(&mut out); + out + } + + for _ in 0..200 { + let u1 = gen(&mut rng); + let u2 = gen(&mut rng); + if u1.is_empty() || u2.is_empty() { continue; } + let sz = (rng() as usize % 4) + 1; + let f = rng() % 6; + let (s1, s2) = (batch(&u1, sz), batch(&u2, sz)); + let frontier = Antichain::from_elem(f); + + let mut merger = ChunkBatchMerger::new(&s1, &s2, frontier.borrow()); + loop { + let mut fuel = 1isize; // tiny → many yields, each settling + merger.work(&s1, &s2, &mut fuel); + if fuel > 0 { break; } + } + let result = merger.done(); + + // The produced batch is graded (grade-at-yield, so also at done). + assert!(is_graded(&result.chunks), "ungraded result: {:?}", + result.chunks.iter().map(Chunk::len).collect::>()); + // ...and its contents are the merged sources, advanced to `f`, consolidated. 
+ let got = read(&result); + let mut want: Vec<_> = u1.iter().chain(u2.iter()).cloned().collect(); + for u in want.iter_mut() { u.1 = u.1.max(f); } + consolidate_updates(&mut want); + assert_eq!(got, want, "fuel-driven merge mismatch\n u1={u1:?}\n u2={u2:?}\n f={f}"); + } + } + + // `settle` must produce a *maximal packing*: adjacent sub-`TARGET` chunks // that could combine into one legal chunk are coalesced, full chunks pass // through as `Rc` moves, and contents are preserved exactly. #[test] - fn regrade_maximal_packing() { + fn settle_maximal_packing() { use super::TARGET; use crate::trace::chunk::is_graded; @@ -697,9 +713,9 @@ mod test { for &s in &sizes { let updates: Vec<_> = (0..s).map(|_| { let k = key; key += 1; ((k, 0u64), 0u64, 1i64) }).collect(); input.push_back(chunk(updates)); - VecChunk::regrade(&mut input, false, &mut output); + VecChunk::settle(&mut input, false, &mut output); } - VecChunk::regrade(&mut input, true, &mut output); + VecChunk::settle(&mut input, true, &mut output); let chunks: Vec<_> = output.into(); assert!(is_graded(&chunks), "not graded: {:?}",