diff --git a/differential-dataflow/examples/chunks.rs b/differential-dataflow/examples/chunks.rs new file mode 100644 index 000000000..8d946b199 --- /dev/null +++ b/differential-dataflow/examples/chunks.rs @@ -0,0 +1,105 @@ +//! Minimal dataflow over the `Vec`-backed `Chunk` container. +//! +//! Mirrors the `val` arm of `spines.rs`, but arranges through `ChunkBatcher` / +//! `ChunkBuilder` / `ChunkSpine` — i.e. the merge batcher, builder, and spine +//! built atop the `Chunk` trait and its `ChunkBatch`. Run as: +//! +//! ```text +//! cargo run --release --example chunks -- +//! ``` + +use differential_dataflow::Hashable; +use differential_dataflow::input::Input; +use differential_dataflow::operators::arrange::Arrange; +use differential_dataflow::operators::arrange::arrangement::arrange_core; +use differential_dataflow::trace::chunk::vec::{ChunkBatcher, ChunkBuilder, ChunkSpine, VecChunk}; +use differential_dataflow::trace::implementations::chunker::ContainerChunker; +use differential_dataflow::trace::implementations::ord_neu::{OrdValBatcher, RcOrdValBuilder, OrdValSpine}; + +use timely::dataflow::channels::pact::Exchange; +use timely::dataflow::operators::probe::Handle; + +fn main() { + let keys: usize = std::env::args().nth(1).unwrap().parse().unwrap(); + let size: usize = std::env::args().nth(2).unwrap().parse().unwrap(); + // "chunk" (default): our `Chunk`-backed trace. "ord": the standard `ord_neu` trace. + let mode: String = std::env::args().nth(3).unwrap_or_else(|| "chunk".to_string()); + println!("Running [{mode}] arrangement"); + + let timer = std::time::Instant::now(); + + // Skip the three positional args we consume (keys, size, mode); the rest are + // timely's worker flags. + timely::execute_from_args(std::env::args().skip(4), move |worker| { + let mut probe = Handle::new(); + let (mut data_input, mut keys_input) = worker.dataflow(|scope| { + let (data_input, data) = scope.new_collection::(); + let (keys_input, keys) = scope.new_collection::(); + let data = data.map(|x| (x, ())); + let keys = keys.map(|x| (x, ())); + + match mode.as_str() { + "chunk" => { + // The chunk batcher's output (`VecChunk`) differs from the stream + // container (`Vec`), so this is a cross-container chunker case: + // drop to `arrange_core` with an explicit `ContainerChunker`. + type Ba = ChunkBatcher; + type Bu = ChunkBuilder; + type Sp = ChunkSpine; + type Chu = ContainerChunker>; + let data = arrange_core::<_, _, Chu, Ba, Bu, Sp>( + data.inner, Exchange::new(|u: &((u64, ()), u64, isize)| (u.0).0.hashed().into()), "Data"); + let keys = arrange_core::<_, _, Chu, Ba, Bu, Sp>( + keys.inner, Exchange::new(|u: &((u64, ()), u64, isize)| (u.0).0.hashed().into()), "Keys"); + keys.join_core(data, |_k, &(), &()| Option::<()>::None).probe_with(&mut probe); + } + "ord" => { + type Ba = OrdValBatcher; + type Bu = RcOrdValBuilder; + type Sp = OrdValSpine; + let data = data.arrange::(); + let keys = keys.arrange::(); + keys.join_core(data, |_k, &(), &()| Option::<()>::None).probe_with(&mut probe); + } + other => panic!("unrecognized mode: {other:?} (expected `chunk` or `ord`)"), + } + + (data_input, keys_input) + }); + + // Load `data`, advancing round by round. + let mut counter = 0; + let mut t: u64 = 1; + while counter < 10 * keys { + let mut i = worker.index(); + while i < size { + data_input.insert(((counter + i) % keys) as u64); + i += worker.peers(); + } + counter += size; + data_input.advance_to(t); data_input.flush(); + keys_input.advance_to(t); keys_input.flush(); + while probe.less_than(data_input.time()) { worker.step(); } + t += 1; + } + println!("{:?}\tloading complete", timer.elapsed()); + + // Issue `keys` queries against the arranged `data`. + let mut queries = 0; + while queries < 10 * keys { + let mut i = worker.index(); + while i < size { + keys_input.insert(((queries + i) % keys) as u64); + i += worker.peers(); + } + queries += size; + data_input.advance_to(t); data_input.flush(); + keys_input.advance_to(t); keys_input.flush(); + while probe.less_than(keys_input.time()) { worker.step(); } + t += 1; + } + println!("{:?}\tqueries complete", timer.elapsed()); + }).unwrap(); + + println!("{:?}\tshut down", timer.elapsed()); +} diff --git a/differential-dataflow/src/trace/chunk/mod.rs b/differential-dataflow/src/trace/chunk/mod.rs new file mode 100644 index 000000000..660d70613 --- /dev/null +++ b/differential-dataflow/src/trace/chunk/mod.rs @@ -0,0 +1,660 @@ +//! Sorted, consolidated runs of updates, and operators over sequences of them. +//! +//! A [`Chunk`] is a consolidated, sorted run of `(data, time, diff)` updates. +//! A sequence of chunks is also expected to be consolidated and sorted. +//! +//! The [`Chunk`] trait exposes whole-chunk operations, so that the implementor +//! can internally divert to their best implementations, with amortized overhead. +//! Each operation is invoked as if "streaming", providing input and output queues. +//! An implementor is expected to drain as much as possible of the inputs, and any +//! chunk written to the output is "committed" and likely to be shipped onward. +//! +//! # Wiring a `Chunk` into an arrangement +//! +//! Implementing [`Chunk`] for a type `C` is the only bespoke code needed; three +//! aliases then expand into a full trace: +//! +//! * [`ChunkBatcher`](ChunkBatcher) — the merge batcher. +//! * [`ChunkBuilder`](ChunkBuilder) — the batch builder. +//! * [`ChunkSpine`](ChunkSpine) — the trace, a spine of `Rc`-shared batches. +//! +//! These are the `Batcher` / `Builder` / `Spine` to hand to +//! [`arrange_core`](crate::operators::arrange::arrangement::arrange_core), along with a +//! chunker that forms `C` from the input stream — typically +//! [`ContainerChunker`](crate::trace::implementations::chunker::ContainerChunker). +//! Everything else here ([`ChunkBatch`], [`ChunkMerger`], [`ChunkBatchMerger`], +//! [`ChunkBatchCursor`], [`ChunkBatchBuilder`]) is machinery those aliases expand to and is +//! not named directly. The [`vec`](mod@vec) module is a worked `Chunk` +//! that re-exports the three aliases specialized to its layout, and the `chunks` example +//! stands one up. +//! +//! # Bounded footprint +//! +//! There is a `TARGET` associated constant that signals the intended chunk size. +//! The constant should be chosen large enough to amortize overheads, but small +//! enough that per-chunk work does not "stall" the system when invoked. +//! The implementor is trusted to make a reasonable choice here. +//! +//! The [`Chunk::settle`] method "settles" sequences of chunks, and is called as +//! chunks are no longer expected to be needed in the near future. The implementor +//! should ensure the chunks are "graded", in that the sequence of chunks are all +//! at most `TARGET` in size, any two in order sum to strictly more than `TARGET`. +//! This is also an opportunity to compress data, or spill to disk or cloud storage. +//! +//! The active (un-settled) chunk set is kept small from both sides. Every producer +//! settles its committed output as it goes (see [`Chunk::settle`]), rather than +//! building a whole sequence and settling at the end. And every walk over a whole +//! chunk sequence reads only resident metadata — [`len`](Chunk::len) and +//! [`bounds`](Chunk::bounds) — never a chunk body: a batch indexes its chunks' +//! bounds once at construction, so cursors binary-search that resident index and +//! open only the chunk(s) a query touches. Implementors must therefore keep `len` +//! and `bounds` cheap even when a chunk's body is paged out. + +use std::collections::VecDeque; + +use timely::progress::Antichain; +use timely::progress::frontier::AntichainRef; +use crate::lattice::Lattice; +use crate::trace::{Batch, BatchReader, Description}; +use crate::trace::cursor::Cursor; +use crate::trace::implementations::{BatchContainer, Layout, LayoutExt, WithLayout}; + +pub mod vec; + +/// A non-empty, bounded, consolidated, sorted sequence of `(data, time, diff)`. +/// +/// An implementor gains access to types and trait implementations that provide +/// batch formation and trace maintenance with no additional effort. +/// +/// The necessary implementations are either "data" or "metadata" operations. +/// The "data" operations transform lists of chunks, are expected to do roughly +/// "one chunk's worth" of work at a time; they can afford to compress and page. +/// The "metadata" operations provide chunk information, and should be lightweight. +pub trait Chunk: Sized + Clone + LayoutExt { + + /// The intended maximum chunk size. + const TARGET: usize; + + /// A cursor navigating this chunk's contents. + type Cursor: + Cursor + + WithLayout + + for<'a> LayoutExt< + Key<'a> = Self::Key<'a>, + Val<'a> = Self::Val<'a>, + ValOwn = Self::ValOwn, + Time = Self::Time, + TimeGat<'a> = Self::TimeGat<'a>, + Diff = Self::Diff, + DiffGat<'a> = Self::DiffGat<'a>, + KeyContainer = Self::KeyContainer, + ValContainer = Self::ValContainer, + TimeContainer = Self::TimeContainer, + DiffContainer = Self::DiffContainer, + >; + + /// Acquire a cursor over this chunk. + fn cursor(&self) -> Self::Cursor; + + /// The first and last `(key, val, time)` triples in the chunk. + fn bounds(&self) -> ( + (Self::Key<'_>, Self::Val<'_>, Self::TimeGat<'_>), + (Self::Key<'_>, Self::Val<'_>, Self::TimeGat<'_>), + ); + + /// The number of updates in the chunk. + fn len(&self) -> usize; + + /// Merge the fronts of two input deques through their shared horizon. + /// + /// Both deques are non-empty (the caller guarantees it). The two queues are both + /// the heads of lists of chunks, and the implementor should only merge through the + /// least last `(key, val, time)` update, or risk emitting an unconsolidated + /// output chunk. + /// + /// When a chunk cannot be completely retired, perhaps it had the larger last update, + /// it should be rewritten as a new chunk and pushed back to the front of the queue. + /// The invocation is expected to consume at least one of its inputs, and the harness + /// may continually re-invoke if this doesn't happen. + /// + /// A merge concludes when the harness sees that either input is now empty, at which + /// point it appends the queue to the output without the method's assistance. + fn merge(in1: &mut VecDeque, in2: &mut VecDeque, out: &mut VecDeque); + + /// Partition `input` updates into `keep` (greater or equal `frontier`) or not (`ship`). + /// + /// An implementation should yield with some frequency to allow the output to "settle". + /// The harness may guard against this, but it prefers to provide as much context as it + /// can in order to allow broader chunk fusion where needed. + fn extract( + input: &mut VecDeque, + frontier: AntichainRef, + residual: &mut Antichain, + keep: &mut VecDeque, + ship: &mut VecDeque, + ); + + /// Advance times by `frontier` producing consolidated chunks. + /// + /// An output for `(key, val)` should generally not be produced until a later pair + /// is observed, or `done` is set, to ensure the output chunks are consolidated. + /// Incomplete work can be pushed back to the front of `input`. + /// + /// On `done` a single `(key, val)` group may span the whole input; advancing and + /// consolidating it should cost time linear in its size, not quadratic. + fn advance( + input: &mut VecDeque, + frontier: AntichainRef, + done: bool, + out: &mut VecDeque, + ); + + /// Reshape `input` to a sequence that maintains the "grading" structural invariant. + /// + /// Specifically, the chunks in `output` should have a maximum size of `TARGET` and + /// each adjacent pair should have lengths that sum to more than `TARGET`. + /// This is also a good moment to consider compression or paging out the contents. + /// When `done` is set the input must be moved to the output. + /// + /// This method may be called on already settled data, and should be efficient then. + fn settle( + input: &mut VecDeque, + done: bool, + out: &mut VecDeque, + ); + +} + +type KeyCon = <::Layout as Layout>::KeyContainer; +type ValCon = <::Layout as Layout>::ValContainer; + +/// A batch is a [`Chunk`] sequence plus a [`Description`]. +/// +/// Metadata about the batches is cached to make subselection efficient. +pub struct ChunkBatch { + /// Ordered, consolidated chunks; their concatenation is the batch. + pub chunks: Vec, + /// The lower, upper, and since frontiers of the batch. + pub description: Description, + /// Per-chunk first and last key, and first and last val, parallel to `chunks`. + first_keys: KeyCon, + last_keys: KeyCon, + first_vals: ValCon, + last_vals: ValCon, +} + +impl ChunkBatch { + /// Assemble a batch from ordered chunks, building the per-chunk index. + pub fn new(chunks: Vec, description: Description) -> Self { + let n = chunks.len(); + let mut first_keys = >::with_capacity(n); + let mut last_keys = >::with_capacity(n); + let mut first_vals = >::with_capacity(n); + let mut last_vals = >::with_capacity(n); + for chunk in &chunks { + assert!(chunk.len() > 0, "ChunkBatch chunks must be non-empty"); + let ((fk, fv, _), (lk, lv, _)) = chunk.bounds(); + first_keys.push_ref(fk); + last_keys.push_ref(lk); + first_vals.push_ref(fv); + last_vals.push_ref(lv); + } + ChunkBatch { chunks, description, first_keys, last_keys, first_vals, last_vals } + } +} + +impl WithLayout for ChunkBatch { + type Layout = C::Layout; +} + +impl BatchReader for ChunkBatch { + type Cursor = ChunkBatchCursor; + fn cursor(&self) -> Self::Cursor { + ChunkBatchCursor { key_chunk: 0, chunk: 0, inner: self.chunks.first().map(C::cursor) } + } + fn len(&self) -> usize { self.chunks.iter().map(C::len).sum() } + fn description(&self) -> &Description { &self.description } +} + +impl Batch for ChunkBatch +where + C::Time: timely::progress::Timestamp + Lattice + Ord, +{ + type Merger = ChunkBatchMerger; + + fn empty(lower: Antichain, upper: Antichain) -> Self { + use timely::progress::Timestamp; + let since = Antichain::from_elem(Self::Time::minimum()); + ChunkBatch::new(Vec::new(), Description::new(lower, upper, since)) + } +} + +/// A merge-batcher [`Merger`](crate::trace::implementations::merge_batcher::Merger) +/// over chains of [`Chunk`]s. +/// +/// `merge` runs the whole-chain binary merger; `extract` splits by the seal frontier +/// using [`Chunk::extract`]. The batcher consolidates equal `(data, time)` updates +/// but does *not* advance times — time advancement is advance's job, handled later in +/// the trace. Both settle their output, since the batcher's chains want to be graded. +pub type ChunkBatcher = crate::trace::implementations::merge_batcher::MergeBatcher>; + +/// A spine of `Rc`-shared [`ChunkBatch`]es of type `C`: the trace type for `arrange`. +pub type ChunkSpine = crate::trace::implementations::spine_fueled::Spine>>; + +/// A reference-counted [`ChunkBatch`] builder over chunks of type `C`. +pub type ChunkBuilder = crate::trace::rc_blanket_impls::RcBuilder>; + +/// A cursor over a [`ChunkBatch`], merging the per-chunk cursors. +/// +/// Chunk breakpoints are unconstrained, so a single key — or `(key, val)` — may +/// straddle consecutive chunks. But the chunks are one globally-sorted sequence +/// merely cut at arbitrary points, so the operation is *concatenation*, never a +/// merge: across a boundary a key's vals concatenate and a `(key, val)`'s times +/// concatenate. The cursor exploits this. It holds the chunk currently being read +/// and a cursor into it; it seeks by binary-searching the per-chunk index on +/// `ChunkBatch`, and at boundaries it *continues* into the next chunk rather than +/// merging — using the index to detect when a key or `(key, val)` spills forward, +/// without touching chunk contents. +pub struct ChunkBatchCursor { + /// First chunk of the current key's run; where `rewind_vals` returns to. + key_chunk: usize, + /// Chunk currently being read; `>= key_chunk`, within the current key's span. + chunk: usize, + /// Cursor into `chunk`; `None` once `chunk` is past the last chunk. + inner: Option, +} + +impl WithLayout for ChunkBatchCursor { + type Layout = C::Layout; +} + +impl ChunkBatchCursor { + /// Move the active chunk to `c`, opening a fresh inner cursor at its start. + fn goto(&mut self, c: usize, storage: &ChunkBatch) { + self.chunk = c; + self.inner = storage.chunks.get(c).map(C::cursor); + } +} + +impl Cursor for ChunkBatchCursor { + type Storage = ChunkBatch; + + fn key_valid(&self, s: &Self::Storage) -> bool { self.chunk < s.chunks.len() && self.inner.as_ref().is_some_and(|i| i.key_valid(&s.chunks[self.chunk])) } + fn val_valid(&self, s: &Self::Storage) -> bool { self.chunk < s.chunks.len() && self.inner.as_ref().is_some_and(|i| i.val_valid(&s.chunks[self.chunk])) } + fn key<'a>(&self, s: &'a Self::Storage) -> Self::Key<'a> { self.inner.as_ref().unwrap().key(&s.chunks[self.chunk]) } + fn val<'a>(&self, s: &'a Self::Storage) -> Self::Val<'a> { self.inner.as_ref().unwrap().val(&s.chunks[self.chunk]) } + fn get_key<'a>(&self, s: &'a Self::Storage) -> Option> { if self.key_valid(s) { Some(self.key(s)) } else { None } } + fn get_val<'a>(&self, s: &'a Self::Storage) -> Option> { if self.val_valid(s) { Some(self.val(s)) } else { None } } + + fn map_times, Self::DiffGat<'_>)>(&mut self, s: &Self::Storage, mut logic: L) { + if !self.val_valid(s) { return; } + let (k, v) = (self.key(s), self.val(s)); + self.inner.as_mut().unwrap().map_times(&s.chunks[self.chunk], &mut logic); + // Follow the (key, val) forward across boundaries while it spills. + let mut c = self.chunk; + while c + 1 < s.chunks.len() + && s.last_keys.index(c) == k && s.first_keys.index(c + 1) == k + && s.last_vals.index(c) == v && s.first_vals.index(c + 1) == v + { + c += 1; + s.chunks[c].cursor().map_times(&s.chunks[c], &mut logic); + } + } + + fn step_key(&mut self, s: &Self::Storage) { + if !self.key_valid(s) { return; } + let n = s.chunks.len(); + let k = self.key(s); + // Advance to the last chunk the key spans. + while self.chunk + 1 < n && s.last_keys.index(self.chunk) == k && s.first_keys.index(self.chunk + 1) == k { + self.goto(self.chunk + 1, s); + } + // Step past the key within its last chunk. + { + let inner = self.inner.as_mut().unwrap(); + inner.seek_key(&s.chunks[self.chunk], k); + inner.step_key(&s.chunks[self.chunk]); + } + // If that exhausted the chunk, the next key (if any) starts the next chunk. + if !self.inner.as_ref().unwrap().key_valid(&s.chunks[self.chunk]) && self.chunk + 1 < n { + self.goto(self.chunk + 1, s); + } + self.key_chunk = self.chunk; + } + + fn seek_key(&mut self, s: &Self::Storage, key: Self::Key<'_>) { + let n = s.chunks.len(); + // First chunk whose last key is `>= key`: where `key`'s run begins. + let c = s.last_keys.advance(0, n, |x| { + as BatchContainer>::reborrow(x).lt(& as BatchContainer>::reborrow(key)) + }); + self.goto(c, s); + self.key_chunk = c; + if c < n { self.inner.as_mut().unwrap().seek_key(&s.chunks[c], key); } + } + + fn step_val(&mut self, s: &Self::Storage) { + if !self.val_valid(s) { return; } + let n = s.chunks.len(); + let (k, v) = (self.key(s), self.val(s)); + // Advance to the last chunk the (key, val) spans. + while self.chunk + 1 < n + && s.last_keys.index(self.chunk) == k && s.first_keys.index(self.chunk + 1) == k + && s.last_vals.index(self.chunk) == v && s.first_vals.index(self.chunk + 1) == v + { + self.goto(self.chunk + 1, s); + } + // Step past the (key, val) within that chunk. + self.inner.as_mut().unwrap().step_val(&s.chunks[self.chunk]); + // If the key's vals are exhausted here but the key spills, roll forward. + if !self.inner.as_ref().unwrap().val_valid(&s.chunks[self.chunk]) + && self.chunk + 1 < n && s.last_keys.index(self.chunk) == k && s.first_keys.index(self.chunk + 1) == k + { + self.goto(self.chunk + 1, s); + self.inner.as_mut().unwrap().seek_key(&s.chunks[self.chunk], k); + } + } + + fn seek_val(&mut self, s: &Self::Storage, val: Self::Val<'_>) { + if !self.key_valid(s) { return; } + let n = s.chunks.len(); + let k = self.key(s); + loop { + self.inner.as_mut().unwrap().seek_val(&s.chunks[self.chunk], val); + if self.inner.as_ref().unwrap().val_valid(&s.chunks[self.chunk]) { return; } + // Key's vals exhausted in this chunk; if the key spills, retry in the next. + if self.chunk + 1 < n && s.last_keys.index(self.chunk) == k && s.first_keys.index(self.chunk + 1) == k { + self.goto(self.chunk + 1, s); + self.inner.as_mut().unwrap().seek_key(&s.chunks[self.chunk], k); + } else { + return; + } + } + } + + fn rewind_keys(&mut self, s: &Self::Storage) { + self.key_chunk = 0; + self.goto(0, s); + } + + fn rewind_vals(&mut self, s: &Self::Storage) { + if !self.key_valid(s) { return; } + let k = self.key(s); + let kc = self.key_chunk; + self.goto(kc, s); + self.inner.as_mut().unwrap().seek_key(&s.chunks[kc], k); + } +} + +/// A merge-batcher [`Merger`](crate::trace::implementations::merge_batcher::Merger) +/// over chains of [`Chunk`]s. +/// +/// `merge` runs the whole-chain binary merger; `extract` splits by the seal frontier +/// using [`Chunk::extract`]. The batcher consolidates equal `(data, time)` updates +/// but does *not* advance times — time advancement is advance's job, handled later in +/// the trace. Both settle their output, since the batcher's chains want to be graded. +pub struct ChunkMerger { + _marker: std::marker::PhantomData, +} + +impl Default for ChunkMerger { + fn default() -> Self { Self { _marker: std::marker::PhantomData } } +} + +impl crate::trace::implementations::merge_batcher::Merger for ChunkMerger +where + C: Chunk + Default + 'static, + C::Time: Clone + timely::PartialOrder + 'static, +{ + type Chunk = C; + type Time = C::Time; + + fn merge( + &mut self, + list1: Vec, + list2: Vec, + output: &mut Vec, + _stash: &mut Vec, + ) { + // Settle the output after each merge, to maintain bounded active chunks. + let mut in1: VecDeque = list1.into(); + let mut in2: VecDeque = list2.into(); + let (mut staged, mut settled) = (VecDeque::new(), VecDeque::new()); + while !in1.is_empty() && !in2.is_empty() { + C::merge(&mut in1, &mut in2, &mut staged); + C::settle(&mut staged, false, &mut settled); + } + // Append the non-empty tail from either input, settle as we go. + for tail in in1.drain(..).chain(in2.drain(..)) { + staged.push_back(tail); + C::settle(&mut staged, false, &mut settled); + } + C::settle(&mut staged, true, &mut settled); + output.extend(settled); + } + + fn extract( + &mut self, + merged: Vec, + upper: AntichainRef, + frontier: &mut Antichain, + ship: &mut Vec, + kept: &mut Vec, + _stash: &mut Vec, + ) { + // `extract` keeps updates greater-or-equal `upper` and ships the rest, folding + // the lower envelope of kept times into `frontier`. Drive it a bounded amount + // per call (≈ one input chunk) and `settle` each side as it accumulates, so + // neither `keep` (retained across yields) nor `ship` (handed to the builder) + // builds up unsettled in core. `settle` may withhold a sub-`TARGET` carry + // between calls; the final `settle(done)` flushes it. + let mut input: VecDeque = merged.into(); + let (mut keep, mut shipped) = (VecDeque::new(), VecDeque::new()); + let (mut kept_q, mut shipped_q) = (VecDeque::new(), VecDeque::new()); + while !input.is_empty() { + C::extract(&mut input, upper, frontier, &mut keep, &mut shipped); + C::settle(&mut keep, false, &mut kept_q); + C::settle(&mut shipped, false, &mut shipped_q); + } + C::settle(&mut keep, true, &mut kept_q); + C::settle(&mut shipped, true, &mut shipped_q); + kept.extend(kept_q); + ship.extend(shipped_q); + } + + fn len(chunk: &C) -> usize { chunk.len() } +} + +/// The resumable [`Batch::Merger`] for [`ChunkBatch`]: merges two batches and advances +/// their times to the compaction frontier, a fuel-bounded step at a time. +/// +/// Each step pipelines [`merge`](Chunk::merge) → [`advance`](Chunk::advance) → +/// [`settle`](Chunk::settle) and settles its output, so a suspended merge holds only +/// graded chunks. The sources are read by cloning (a cheap refcount bump) and must be +/// supplied unchanged on every call. +pub struct ChunkBatchMerger { + /// Compaction frontier supplied at construction. + frontier: Antichain, + /// Result frontiers, retained for the output description. + lower: Antichain, + upper: Antichain, + /// Input deques, refilled from the sources (clones) head-of-list at a time. + in1: VecDeque, + in2: VecDeque, + /// Next source chunk to clone into `in1` / `in2`. + idx1: usize, + idx2: usize, + /// `advance`'s input: the merge output plus advance's withheld carry at the front. + merged: VecDeque, + /// `advance`'s output and `settle`'s input: merged-and-advanced chunks, with + /// settle's withheld sub-`TARGET` carry at the front. + advanced: VecDeque, + /// `settle`'s output: the committed, graded result, grown by `work`. Graded at + /// every yield, so a suspended merge holds well-formed (spillable) chunk state. + settled: VecDeque, + /// Set once both sources are drained and advance's and settle's final flushes ran. + complete: bool, +} + +impl crate::trace::Merger> for ChunkBatchMerger +where + C: Chunk + Default + 'static, + C::Time: timely::progress::Timestamp + Lattice + Ord + 'static, +{ + fn new(source1: &ChunkBatch, source2: &ChunkBatch, frontier: AntichainRef) -> Self { + let lower = source1.description.lower().meet(source2.description.lower()); + let upper = source1.description.upper().join(source2.description.upper()); + Self { + frontier: frontier.to_owned(), + lower, + upper, + in1: VecDeque::new(), + in2: VecDeque::new(), + idx1: 0, + idx2: 0, + merged: VecDeque::new(), + advanced: VecDeque::new(), + settled: VecDeque::new(), + complete: false, + } + } + + fn work(&mut self, source1: &ChunkBatch, source2: &ChunkBatch, fuel: &mut isize) { + + // TODO: The logic is a bit tortured here, and should be improved. + + if self.complete { return; } + + while *fuel > 0 { + // Refill each input deque up to a burst of source chunks (clones). + // The constant trades away fuel precision for overhead amortization. + const BURST: usize = 8; + while self.in1.len() < BURST && self.idx1 < source1.chunks.len() { + self.in1.push_back(source1.chunks[self.idx1].clone()); + self.idx1 += 1; + } + while self.in2.len() < BURST && self.idx2 < source2.chunks.len() { + self.in2.push_back(source2.chunks[self.idx2].clone()); + self.idx2 += 1; + } + + // Merge's per-tick output (a burst's worth, or one tail chunk), measured + // for fuel before it joins the carry already in `merged`. + let mut produced = VecDeque::new(); + if !self.in1.is_empty() && !self.in2.is_empty() { + // Both sides have data: drain the loaded burst. + C::merge(&mut self.in1, &mut self.in2, &mut produced); + } else if let Some(chunk) = self.in1.pop_front().or_else(|| self.in2.pop_front()) { + // Exactly one side has data: flush its verbatim tail, one chunk a step. + produced.push_back(chunk); + } else { + // Both sources drained: final flush of advance's and settle's carries. + C::advance(&mut self.merged, self.frontier.borrow(), true, &mut self.advanced); + C::settle(&mut self.advanced, true, &mut self.settled); + self.complete = true; + break; + } + + let work: usize = produced.iter().map(C::len).sum(); + self.merged.extend(produced); + C::advance(&mut self.merged, self.frontier.borrow(), false, &mut self.advanced); + // Maintain grading at the yield boundary: this step may exhaust `fuel` and + // suspend with `advanced` held, and held chunk state must be graded. + C::settle(&mut self.advanced, false, &mut self.settled); + *fuel -= work as isize; + } + } + + fn done(self) -> ChunkBatch { + debug_assert!(self.merged.is_empty() && self.advanced.is_empty()); + let description = Description::new(self.lower, self.upper, self.frontier); + ChunkBatch::new(self.settled.into(), description) + } +} + +/// A [`Builder`](crate::trace::Builder) that collects a chunk sequence into a [`ChunkBatch`]. +pub struct ChunkBatchBuilder { + /// Pushed chunks awaiting settling; holds settle's sub-`TARGET` carry at the front. + input: VecDeque, + /// The graded chunks emitted so far. + output: VecDeque, +} + +impl crate::trace::Builder for ChunkBatchBuilder +where + C: Chunk + Default + 'static, + C::Time: timely::progress::Timestamp, +{ + type Input = C; + type Time = C::Time; + type Output = ChunkBatch; + + fn with_capacity(_keys: usize, _vals: usize, _upds: usize) -> Self { + Self { input: VecDeque::new(), output: VecDeque::new() } + } + + fn push(&mut self, chunk: &mut C) { + let chunk = std::mem::take(chunk); + if chunk.len() > 0 { + self.input.push_back(chunk); + C::settle(&mut self.input, false, &mut self.output); + } + } + + fn done(self, description: Description) -> ChunkBatch { + let ChunkBatchBuilder { mut input, mut output } = self; + C::settle(&mut input, true, &mut output); + ChunkBatch::new(output.into(), description) + } + + fn seal(chain: &mut Vec, description: Description) -> ChunkBatch { + // We settle the chain because we are not guaranteed to received pre-settled data. + // This should be efficient on pre-settled data. + ChunkBatch::new(settle_all(std::mem::take(chain)), description) + } +} + +/// Whether `chunks` satisfy the [`Chunk::TARGET`] grading invariant: every chunk +/// at most `TARGET`, and every adjacent pair summing to more than `TARGET` (so no +/// two neighbours could be combined into one legal chunk — a *maximal packing*). +/// +/// This is the post-[`settle`](Chunk::settle) shape; useful as a test/debug check. +pub fn is_graded(chunks: &[C]) -> bool { + chunks.iter().all(|c| c.len() <= C::TARGET) + && chunks.windows(2).all(|w| w[0].len() + w[1].len() > C::TARGET) +} + +/// Settle `input` to completion into a fresh graded `Vec` (see [`Chunk::settle`]). +/// +/// A convenience for the one-shot callers (batch sealing, the batcher's merge and +/// extract) that have a whole sequence in hand and want it graded; the streaming +/// callers drive [`Chunk::settle`] directly across ticks. +pub fn settle_all(input: impl IntoIterator) -> Vec { + let mut input: VecDeque = input.into_iter().collect(); + let mut out = VecDeque::new(); + C::settle(&mut input, true, &mut out); + debug_assert!(input.is_empty()); + out.into() +} + +/// Merge two full chains of chunks into one, to completion, appending to `out`. +/// +/// The plain whole-chain driver: ticks [`Chunk::merge`] until one deque empties, then +/// appends the other's remainder (the verbatim tail). Output is near-graded, not +/// settled. The batcher's `merge` runs the same loop but settles after each push (the +/// bounded-footprint discipline) and so does not use this; it stays as the simplest way +/// to drive [`Chunk::merge`] to completion. +pub fn merge_chains( + chain1: Vec, + chain2: Vec, + out: &mut VecDeque, +) { + let mut in1: VecDeque = chain1.into(); + let mut in2: VecDeque = chain2.into(); + while !in1.is_empty() && !in2.is_empty() { + C::merge(&mut in1, &mut in2, out); + } + // One deque is empty; the other's remainder is all greater than everything merged. + out.extend(in1.drain(..)); + out.extend(in2.drain(..)); +} diff --git a/differential-dataflow/src/trace/chunk/vec.rs b/differential-dataflow/src/trace/chunk/vec.rs new file mode 100644 index 000000000..d20681d9f --- /dev/null +++ b/differential-dataflow/src/trace/chunk/vec.rs @@ -0,0 +1,794 @@ +//! A worked [`Chunk`]: `Vec<((K, V), T, R)>` behind an `Rc`. +//! +//! The reference implementation. It shows the two integration points any `Chunk` +//! satisfies; another layout copies this *shape*, not the `Vec`: +//! +//! * **Batcher side.** The chunker builds chunks through timely's container traits +//! (`Accountable`, `SizableContainer`, `Consolidate`, `PushInto`), which here +//! delegate to the inner `Vec` via `Rc::make_mut` (free while building, never +//! copying a shared batch). +//! * **Trace side.** [`Chunk`] plus a cursor: key lookups gallop (logarithmic in +//! chunk size), stepping is linear. +//! +//! `Clone` is a refcount bump, so the trace merger shares source chunks rather than +//! copying them. + +use std::collections::VecDeque; +use std::marker::PhantomData; +use std::rc::Rc; + +use timely::Accountable; +use timely::container::{PushInto, SizableContainer}; +use timely::progress::{Antichain, Timestamp}; +use timely::progress::frontier::AntichainRef; + +use crate::consolidation::Consolidate; +use crate::difference::Semigroup; +use crate::lattice::Lattice; +use crate::trace::cursor::Cursor; +use crate::trace::implementations::{Vector, WithLayout}; + +use super::Chunk; + +/// The chunk size: the [`Chunk::TARGET`] value, also used for buffer sizing. +const TARGET: usize = 1024; + +/// A sorted, consolidated run of `((key, val), time, diff)`, shared via `Rc`. +pub struct VecChunk(Rc>); + +impl Clone for VecChunk { + fn clone(&self) -> Self { VecChunk(Rc::clone(&self.0)) } +} +impl Default for VecChunk { + fn default() -> Self { VecChunk(Rc::new(Vec::new())) } +} + +/// The trace type for `arrange`: a spine of `Rc`-shared chunk batches. +pub type ChunkSpine = super::ChunkSpine>; +/// Merge batcher over `VecChunk`s; a `ContainerChunker` at the +/// `arrange_core` callsite forms the chunks it merges (via the container traits below). +pub type ChunkBatcher = super::ChunkBatcher>; +/// Batch builder. +pub type ChunkBuilder = super::ChunkBuilder>; + +impl Accountable for VecChunk { + fn record_count(&self) -> i64 { self.0.len() as i64 } +} + +impl SizableContainer for VecChunk +where K: Clone+'static, V: Clone+'static, T: Clone+'static, R: Clone+'static { + // Absorb at `TARGET`, the grading size, so the chunker emits pre-graded chunks + // rather than timely's byte-derived ones. + fn at_capacity(&self) -> bool { self.0.len() >= TARGET } + fn ensure_capacity(&mut self, _stash: &mut Option) { + let inner = Rc::make_mut(&mut self.0); + inner.reserve(TARGET.saturating_sub(inner.len())); + } +} + +impl Consolidate for VecChunk +where K: Ord+Clone+'static, V: Ord+Clone+'static, T: Ord+Clone+'static, R: Semigroup+'static { + fn len(&self) -> usize { self.0.len() } + fn clear(&mut self) { Rc::make_mut(&mut self.0).clear() } + fn consolidate_into(&mut self, target: &mut Self) { + Rc::make_mut(&mut self.0).consolidate_into(Rc::make_mut(&mut target.0)); + } +} + +impl PushInto<((K, V), T, R)> for VecChunk +where K: Clone+'static, V: Clone+'static, T: Clone+'static, R: Clone+'static { + fn push_into(&mut self, item: ((K, V), T, R)) { Rc::make_mut(&mut self.0).push(item); } +} + +/// First index `>= start` at which `pred` turns false, by galloping (exponential) +/// search. `pred` must hold for a prefix then not — i.e. `|u| u < target`. +/// O(log distance), so O(1) for short hops and logarithmic for long ones. +fn gallop(s: &[U], start: usize, pred: impl Fn(&U) -> bool) -> usize { + let mut pos = start; + if pos < s.len() && pred(&s[pos]) { + let mut step = 1; + while pos + step < s.len() && pred(&s[pos + step]) { pos += step; step <<= 1; } + step >>= 1; + while step > 0 { + if pos + step < s.len() && pred(&s[pos + step]) { pos += step; } + step >>= 1; + } + pos += 1; + } + pos +} + +/// A cursor over a [`VecChunk`], tracking the current key and `(key, val)` +/// group starts as indices into the flat vector. +pub struct VecChunkCursor { + key_pos: usize, + val_pos: usize, + phantom: PhantomData<(K, V, T, R)>, +} + +impl WithLayout for VecChunk +where K: Ord+Clone+'static, V: Ord+Clone+'static, T: Lattice+Timestamp, R: Ord+Semigroup+'static { + type Layout = Vector<((K, V), T, R)>; +} + +impl WithLayout for VecChunkCursor +where K: Ord+Clone+'static, V: Ord+Clone+'static, T: Lattice+Timestamp, R: Ord+Semigroup+'static { + type Layout = Vector<((K, V), T, R)>; +} + +impl Cursor for VecChunkCursor +where K: Ord+Clone+'static, V: Ord+Clone+'static, T: Lattice+Timestamp, R: Ord+Semigroup+'static { + type Storage = VecChunk; + + fn key_valid(&self, s: &Self::Storage) -> bool { self.key_pos < s.0.len() } + fn val_valid(&self, s: &Self::Storage) -> bool { + self.key_pos < s.0.len() && self.val_pos < s.0.len() && s.0[self.val_pos].0.0 == s.0[self.key_pos].0.0 + } + fn key<'a>(&self, s: &'a Self::Storage) -> &'a K { &s.0[self.key_pos].0.0 } + fn val<'a>(&self, s: &'a Self::Storage) -> &'a V { &s.0[self.val_pos].0.1 } + fn get_key<'a>(&self, s: &'a Self::Storage) -> Option<&'a K> { + if self.key_valid(s) { Some(self.key(s)) } else { None } + } + fn get_val<'a>(&self, s: &'a Self::Storage) -> Option<&'a V> { + if self.val_valid(s) { Some(self.val(s)) } else { None } + } + fn map_times(&mut self, s: &Self::Storage, mut logic: L) { + if !self.val_valid(s) { return; } + let kv = &s.0[self.val_pos].0; + let mut i = self.val_pos; + while i < s.0.len() && &s.0[i].0 == kv { + logic(&s.0[i].1, &s.0[i].2); + i += 1; + } + } + fn step_key(&mut self, s: &Self::Storage) { + // Linear: stepping is a short hop to the next group; an inlined scan + // beats a gallop call for the common small-group case. + if self.key_pos >= s.0.len() { return; } + let key = s.0[self.key_pos].0.0.clone(); + let mut i = self.key_pos; + while i < s.0.len() && s.0[i].0.0 == key { i += 1; } + self.key_pos = i; + self.val_pos = i; + } + fn seek_key(&mut self, s: &Self::Storage, key: &K) { + // Logarithmic: O(log distance), independent of chunk size. + self.key_pos = gallop(&s.0, self.key_pos, |u| &u.0.0 < key); + self.val_pos = self.key_pos; + } + fn step_val(&mut self, s: &Self::Storage) { + if !self.val_valid(s) { return; } + let kv = s.0[self.val_pos].0.clone(); + let mut i = self.val_pos; + while i < s.0.len() && s.0[i].0 == kv { i += 1; } + self.val_pos = i; + } + fn seek_val(&mut self, s: &Self::Storage, val: &V) { + if !self.key_valid(s) { return; } + let key = s.0[self.key_pos].0.0.clone(); + self.val_pos = gallop(&s.0, self.val_pos, |u| (&u.0.0, &u.0.1) < (&key, val)); + } + fn rewind_keys(&mut self, _s: &Self::Storage) { self.key_pos = 0; self.val_pos = 0; } + fn rewind_vals(&mut self, _s: &Self::Storage) { self.val_pos = self.key_pos; } +} + +/// Take the `Vec` out of a chunk, copying only if the `Rc` is shared. +fn take(chunk: VecChunk) -> Vec<((K, V), T, R)> { + Rc::try_unwrap(chunk.0).unwrap_or_else(|rc| (*rc).clone()) +} + +impl Chunk for VecChunk +where K: Ord+Clone+'static, V: Ord+Clone+'static, T: Lattice+Timestamp, R: Ord+Semigroup+'static { + type Cursor = VecChunkCursor; + + const TARGET: usize = TARGET; + + fn cursor(&self) -> Self::Cursor { + VecChunkCursor { key_pos: 0, val_pos: 0, phantom: PhantomData } + } + + fn bounds(&self) -> ((&K, &V, &T), (&K, &V, &T)) { + let s = &self.0[..]; + let (f, l) = (&s[0], &s[s.len() - 1]); + ((&f.0.0, &f.0.1, &f.1), (&l.0.0, &l.0.1, &l.1)) + } + + fn len(&self) -> usize { self.0.len() } + + /// A two-pointer binary merge of the two deques' loaded content, up to their shared + /// horizon — the lesser of the two last `(key, val, time)`s. Consolidates equal + /// triples and bulk-copies disjoint runs as slices, walking chunk boundaries with + /// local indices. The horizon's owner drains fully; the other's partial front is + /// pruned and pushed back once, at the yield. + fn merge(in1: &mut VecDeque, in2: &mut VecDeque, out: &mut VecDeque) { + fn kv(u: &((K, V), T, R)) -> (&K, &V) { (&u.0.0, &u.0.1) } + + let mut result: Vec<((K, V), T, R)> = Vec::with_capacity(TARGET); + let mut flush = |result: &mut Vec<((K, V), T, R)>, force: bool| { + if result.len() >= TARGET || (force && !result.is_empty()) { + out.push_back(VecChunk(Rc::new(std::mem::replace(result, Vec::with_capacity(TARGET))))); + } + }; + + // Read working chunks by index (never `take`n, so a source clone stays shared). + // Both deques are non-empty on entry; the loop stops when one runs dry — its + // last triple is the horizon, and the rest waits for the next call. + let mut c1 = in1.pop_front().unwrap(); + let mut c2 = in2.pop_front().unwrap(); + let (mut p1, mut p2) = (0usize, 0usize); + while p1 < c1.0.len() && p2 < c2.0.len() { + let a = &c1.0[p1]; + let b = &c2.0[p2]; + match (kv(a), &a.1).cmp(&(kv(b), &b.1)) { + // Copy the run strictly below the other's head — no collisions there — + // as `TARGET`-sized slices. + std::cmp::Ordering::Less => { + let run = gallop(&c1.0[..], p1 + 1, |u| (kv(u), &u.1) < (kv(b), &b.1)); + for piece in c1.0[p1..run].chunks(TARGET) { + result.extend_from_slice(piece); + flush(&mut result, false); + } + p1 = run; + } + std::cmp::Ordering::Greater => { + let run = gallop(&c2.0[..], p2 + 1, |u| (kv(u), &u.1) < (kv(a), &a.1)); + for piece in c2.0[p2..run].chunks(TARGET) { + result.extend_from_slice(piece); + flush(&mut result, false); + } + p2 = run; + } + std::cmp::Ordering::Equal => { + let mut diff = a.2.clone(); + diff.plus_equals(&b.2); + if !diff.is_zero() { + result.push((a.0.clone(), a.1.clone(), diff)); + } + p1 += 1; + p2 += 1; + flush(&mut result, false); + } + } + // Refill a working chunk consumed above; if its deque is empty, stop. + if p1 == c1.0.len() { + match in1.pop_front() { Some(c) => { c1 = c; p1 = 0; } None => break } + } + if p2 == c2.0.len() { + match in2.pop_front() { Some(c) => { c2 = c; p2 = 0; } None => break } + } + } + flush(&mut result, true); + // Push back the survivor's unconsumed suffix (one copy), ahead of its + // remaining loaded chunks. + if p1 < c1.0.len() { in1.push_front(VecChunk(Rc::new(c1.0[p1..].to_vec()))); } + if p2 < c2.0.len() { in2.push_front(VecChunk(Rc::new(c2.0[p2..].to_vec()))); } + } + + fn extract( + input: &mut VecDeque, + frontier: AntichainRef, + residual: &mut Antichain, + keep: &mut VecDeque, + ship: &mut VecDeque, + ) { + // One input chunk per call: partition it into a keep piece and a ship piece and + // return, so the harness settles each side before the next chunk is read. The + // pieces may be small; `settle` grades them. + let Some(chunk) = input.pop_front() else { return }; + let (mut k, mut s) = (Vec::new(), Vec::new()); + for u in take(chunk) { + if frontier.less_equal(&u.1) { residual.insert_ref(&u.1); k.push(u); } + else { s.push(u); } + } + if !k.is_empty() { keep.push_back(VecChunk(Rc::new(k))); } + if !s.is_empty() { ship.push_back(VecChunk(Rc::new(s))); } + } + + fn advance( + input: &mut VecDeque, + frontier: AntichainRef, + done: bool, + out: &mut VecDeque, + ) { + // Advance and consolidate each *complete* `(key, val)` group eagerly. Only the + // last group might still grow; unless `done`, withhold it (push it back as the + // carry) and emit the rest. + let mut stash: Vec> = Vec::new(); + // Reuse the front chunk's storage (last call's carry) as the working buffer and + // append the rest, so a withheld group accumulates in place: O(total) over the + // run, not O(total²). + let mut buf = match input.pop_front() { Some(chunk) => take(chunk), None => return }; + while let Some(chunk) = input.pop_front() { + let mut v = take(chunk); + buf.append(&mut v); + stash.push(v); + } + if buf.is_empty() { return; } + + // Giant-key case: if the whole buffer is one `(key, val)`, no group is provably + // complete, so unless `done` withhold it all and return. First-vs-last detects + // this without a scan. + if !done && buf[0].0 == buf[buf.len() - 1].0 { + input.push_front(VecChunk(Rc::new(buf))); + return; + } + + // At least the first group is complete; withhold the last as the carry unless `done`. + let end = if done { buf.len() } else { + let last_kv = buf[buf.len() - 1].0.clone(); + let mut start = buf.len(); + while start > 0 && buf[start - 1].0 == last_kv { start -= 1; } + start + }; + if end < buf.len() { + input.push_front(VecChunk(Rc::new(buf.split_off(end)))); + } + // Advance and consolidate each group into `TARGET`-sized output chunks, filling + // buffers reclaimed from the recycled `Vec`s. + let mut result = stash.pop().unwrap_or_default(); + let mut i = 0; + while i < buf.len() { + let mut j = i; + while j < buf.len() && buf[j].0 == buf[i].0 { j += 1; } + for u in &mut buf[i..j] { u.1.advance_by(frontier); } + // Advancing is lattice-monotone but not total-order-monotone; re-sort by time. + buf[i..j].sort_by(|a, b| a.1.cmp(&b.1)); + let mut k = i; + while k < j { + let kv = buf[k].0.clone(); + let t = buf[k].1.clone(); + let mut diff = buf[k].2.clone(); + k += 1; + while k < j && buf[k].1 == t { diff.plus_equals(&buf[k].2); k += 1; } + if !diff.is_zero() { + result.push((kv, t, diff)); + if result.len() >= TARGET { out.push_back(VecChunk(Rc::new(std::mem::replace(&mut result, stash.pop().unwrap_or_default())))); } + } + } + i = j; + } + if !result.is_empty() { out.push_back(VecChunk(Rc::new(result))); } + } + + fn settle(input: &mut VecDeque, done: bool, out: &mut VecDeque) { + // Maximal packing: a `TARGET` chunk is maximal, so it passes through as an `Rc` + // move; only sub-`TARGET` chunks are copied, and only to coalesce a neighbour. + // `carry` is the chunk under construction — flushed at `TARGET`, pushed back onto + // `input` between calls, or emitted on `done`. Its left neighbour in `out` is + // always a `TARGET` chunk, so emitting it keeps the packing maximal on both sides. + let mut carry: Vec<((K, V), T, R)> = Vec::new(); + while let Some(chunk) = input.pop_front() { + if carry.is_empty() { + absorb(chunk, &mut carry, out); + } else if carry.len() + chunk.0.len() <= TARGET { + // Combines into one legal chunk; coalesce in place. + carry.extend(take(chunk)); + if carry.len() == TARGET { + out.push_back(VecChunk(Rc::new(std::mem::take(&mut carry)))); + } + } else { + // Cannot combine without exceeding `TARGET`; `carry` is maximal + // against this neighbour, so emit it and absorb the chunk afresh. + out.push_back(VecChunk(Rc::new(std::mem::take(&mut carry)))); + absorb(chunk, &mut carry, out); + } + } + if !carry.is_empty() { + let chunk = VecChunk(Rc::new(carry)); + if done { out.push_back(chunk); } else { input.push_front(chunk); } + } + } +} + +/// Emit maximal `TARGET`-sized chunks off the front of `carry`, leaving the +/// sub-`TARGET` tail behind. +fn peel( + carry: &mut Vec<((K, V), T, R)>, + out: &mut VecDeque>, +) { + let mut start = 0; + while carry.len() - start >= TARGET { + out.push_back(VecChunk(Rc::new(carry[start..start + TARGET].to_vec()))); + start += TARGET; + } + carry.drain(..start); +} + +/// Absorb a chunk when nothing is carried: pass a `TARGET` chunk through as an +/// `Rc` move, hold a smaller one in `carry`, or split a larger one (peeling off +/// `TARGET` pieces and carrying the remainder). `carry` must be empty on entry. +fn absorb( + chunk: VecChunk, + carry: &mut Vec<((K, V), T, R)>, + out: &mut VecDeque>, +) { + use std::cmp::Ordering::{Equal, Greater, Less}; + match chunk.0.len().cmp(&TARGET) { + Equal => out.push_back(chunk), + Less => *carry = take(chunk), + Greater => { *carry = take(chunk); peel(carry, out); } + } +} + +#[cfg(test)] +mod test { + use std::collections::VecDeque; + use super::{Chunk, VecChunk}; + use crate::trace::chunk::merge_chains; + use std::rc::Rc; + + fn chunk(updates: Vec<((u64, u64), u64, i64)>) -> VecChunk { + VecChunk(Rc::new(updates)) + } + + // Flatten a chunk sequence back to its update stream. + fn flat>>(chunks: I) -> Vec<((u64, u64), u64, i64)> { + chunks.into_iter().flat_map(|c| (*c.0).clone()).collect() + } + + // `extract` partitions by frontier, a bounded amount per call, folding the kept + // frontier into `residual`; `settle` then fuses each side's pieces into a graded run. + #[test] + fn extract_partitions_and_grades() { + use super::TARGET; + use crate::trace::chunk::{is_graded, settle_all}; + use timely::progress::Antichain; + + // 4·TARGET updates spread over many input chunks; even times ship + // (< frontier), odd times keep (>= frontier), so both sides straddle. + let n = 4 * TARGET as u64; + let mut input: VecDeque<_> = (0..n).map(|i| chunk(vec![((i, 0), i % 2, 1)])).collect(); + let frontier = Antichain::from_elem(1u64); + let mut residual = Antichain::new(); + let (mut keep, mut ship) = (VecDeque::new(), VecDeque::new()); + // Drive to completion, as the harness does (one input chunk per call). + while !input.is_empty() { + VecChunk::extract(&mut input, frontier.borrow(), &mut residual, &mut keep, &mut ship); + } + let (keep, ship) = (settle_all(keep), settle_all(ship)); + + // Kept times are exactly {1}; that is the residual frontier. + assert_eq!(residual, Antichain::from_elem(1u64)); + // Both sides are graded after the settle. + assert!(is_graded(&keep), "ungraded keep: {:?}", keep.iter().map(Chunk::len).collect::>()); + assert!(is_graded(&ship), "ungraded ship: {:?}", ship.iter().map(Chunk::len).collect::>()); + // Nothing lost: half the updates each way. + assert_eq!(keep.iter().map(Chunk::len).sum::(), n as usize / 2); + assert_eq!(ship.iter().map(Chunk::len).sum::(), n as usize / 2); + } + + // `advance` advances and consolidates complete `(key, val)` groups eagerly, + // pushing the (possibly-growing) last group back as the carry when not `done`. + #[test] + fn advance_emits_complete_groups_eagerly() { + use timely::progress::Antichain; + + let frontier = Antichain::from_elem(5u64); + // Group (0,0) is complete within this chunk; group (1,0) might still grow. + let c0 = chunk(vec![((0, 0), 0, 1), ((0, 0), 1, 1), ((1, 0), 0, 1)]); + let mut input: VecDeque<_> = VecDeque::from([c0]); + let mut out = VecDeque::new(); + VecChunk::advance(&mut input, frontier.borrow(), false, &mut out); + + // The trailing group (1,0) is withheld as the carry at the front of `input`. + assert_eq!(input.len(), 1); + assert_eq!(Chunk::len(&input[0]), 1); + // Group (0,0)'s times {0,1} advanced to 5 and consolidated, emitted now. + assert_eq!(flat(out), vec![((0, 0), 5, 2)]); + } + + // Streaming the input one chunk at a time must yield exactly what a single + // all-at-once flush does — the resumable path is just the one-shot path cut + // at group boundaries. + #[test] + fn advance_resumable_matches_oneshot() { + use timely::progress::Antichain; + + let frontier = Antichain::from_elem(3u64); + // Groups span chunk boundaries and carry several times each. + let input = || vec![ + chunk(vec![((0, 0), 0, 1), ((0, 0), 1, 1), ((1, 0), 0, 1)]), + chunk(vec![((1, 0), 5, 1), ((1, 1), 0, 1), ((2, 0), 0, 1)]), + chunk(vec![((2, 0), 2, 1), ((2, 0), 9, 1)]), + ]; + + let oneshot = { + let mut q: VecDeque<_> = input().into(); + let mut out = VecDeque::new(); + VecChunk::advance(&mut q, frontier.borrow(), false, &mut out); + VecChunk::advance(&mut q, frontier.borrow(), true, &mut out); + flat(out) + }; + let incremental = { + let mut q = VecDeque::new(); + let mut out = VecDeque::new(); + for c in input() { q.push_back(c); VecChunk::advance(&mut q, frontier.borrow(), false, &mut out); } + VecChunk::advance(&mut q, frontier.borrow(), true, &mut out); + flat(out) + }; + assert_eq!(oneshot, incremental); + // Times are advanced: nothing below the frontier survives. + for u in &oneshot { assert!(u.1 >= 3); } + } + + // A single `(key, val)` whose updates span every pushed chunk: `advance` + // can make no progress until `done`, accumulating in the carry in place. + // It must still produce the right advanced+consolidated result at the end. + #[test] + fn advance_single_key_spanning_pushes() { + use timely::progress::Antichain; + + let frontier = Antichain::from_elem(100u64); + let n = 50u64; + let make = || (0..n).map(|t| chunk(vec![((7u64, 0u64), t, 1i64)])).collect::>(); + + let mut q = VecDeque::new(); + let mut out = VecDeque::new(); + for c in make() { q.push_back(c); VecChunk::advance(&mut q, frontier.borrow(), false, &mut out); } + VecChunk::advance(&mut q, frontier.borrow(), true, &mut out); + // All times advance to 100 and consolidate to one update of diff `n`. + assert_eq!(flat(out), vec![((7u64, 0u64), 100u64, n as i64)]); + } + + #[test] + fn merge_chains_consolidates() { + let a = chunk(vec![((0, 0), 0, 1), ((1, 0), 0, 1)]); + let b = chunk(vec![((0, 0), 0, 1), ((2, 0), 0, 1)]); + let mut out = VecDeque::new(); + merge_chains(vec![a], vec![b], &mut out); + assert_eq!(flat(out), vec![((0, 0), 0, 2), ((1, 0), 0, 1), ((2, 0), 0, 1)]); + } + + // Merging runs larger than `TARGET`, then settling, yields a *graded* sequence + // (each chunk `<= TARGET`, adjacent pairs summing past `TARGET`) reproducing the + // consolidated sorted contents. + #[test] + fn merge_emits_graded_chunks() { + use super::TARGET; + use crate::trace::chunk::{is_graded, merge_chains, settle_all}; + + // Two interleaving single-chunk chains: evens and odds over `0..4·TARGET`. + let n = 4 * TARGET as u64; + let evens = chunk((0..n).step_by(2).map(|k| ((k, 0), 0, 1)).collect()); + let odds = chunk((0..n).step_by(2).map(|k| ((k + 1, 0), 0, 1)).collect()); + + let mut out = VecDeque::new(); + merge_chains(vec![evens], vec![odds], &mut out); + let chunks = settle_all(out); + + assert!(is_graded(&chunks), "merge output not graded: {:?}", + chunks.iter().map(Chunk::len).collect::>()); + // Contents are exactly the sorted keys `0..4·TARGET`, each once. + let want: Vec<_> = (0..n).map(|k| ((k, 0u64), 0u64, 1i64)).collect(); + assert_eq!(flat(chunks), want); + } + + // Property test: merging two *multi-chunk* chains (driven through `merge` by + // `merge_chains`) reproduces the union of all updates, consolidated. Tiny + // chunks force `(key, val)` groups — which can span several times — to + // straddle chunk boundaries on both sides, exercising the refill path the + // single-chunk merge tests never reach. The independent oracle is + // `consolidate_updates` over the concatenation. + #[test] + fn merge_matches_reference() { + use crate::trace::chunk::merge_chains; + use crate::consolidation::consolidate_updates; + + // Deterministic xorshift PRNG — no dev-dependency on `rand`. + let mut seed = 0x2545F4914F6CDD1Du64; + let mut rng = move || { seed ^= seed << 13; seed ^= seed >> 7; seed ^= seed << 17; seed }; + + // A sorted, consolidated update set over a small (key, val, time) space, + // so the two chains collide and a `(key, val)` carries several times. + fn gen(rng: &mut impl FnMut() -> u64, n: usize) -> Vec<((u64, u64), u64, i64)> { + let mut v: Vec<((u64, u64), u64, i64)> = (0..n).map(|_| { + let k = rng() % 20; let val = rng() % 3; let t = rng() % 8; + let d = if rng() % 4 == 0 { -1 } else { 1 }; + ((k, val), t, d) + }).collect(); + consolidate_updates(&mut v); + v + } + // Split a consolidated set into a chain of small chunks (each sorted and + // consolidated; together globally sorted), so groups straddle boundaries. + fn chain(updates: &[((u64, u64), u64, i64)], sz: usize) -> Vec> { + updates.chunks(sz).map(|c| VecChunk(Rc::new(c.to_vec()))).collect() + } + + for _ in 0..300 { + let n1 = (rng() as usize % 60) + 1; + let u1 = gen(&mut rng, n1); + let n2 = (rng() as usize % 60) + 1; + let u2 = gen(&mut rng, n2); + if u1.is_empty() || u2.is_empty() { continue; } + let sz = (rng() as usize % 5) + 1; // tiny chunks → heavy straddling + + let mut out = VecDeque::new(); + merge_chains(chain(&u1, sz), chain(&u2, sz), &mut out); + let merged = flat(out); + + let mut reference: Vec<_> = u1.iter().chain(u2.iter()).cloned().collect(); + consolidate_updates(&mut reference); + + assert_eq!(merged, reference, "chunk size {sz}\n u1={u1:?}\n u2={u2:?}"); + } + } + + // Driving `ChunkBatchMerger` to completion with tiny `fuel` — so it suspends and + // settles on nearly every tick — must yield the same advanced-and-consolidated + // batch as a one-shot reference, and that batch must be graded. Exercises the + // resumable merge→advance→settle pipeline and the grade-at-yield invariant. + #[test] + fn batch_merger_resumable_matches_reference() { + use crate::trace::{BatchReader, Description, Merger}; + use crate::trace::chunk::{ChunkBatch, ChunkBatchMerger, is_graded}; + use crate::trace::cursor::Cursor; + use crate::consolidation::consolidate_updates; + use timely::progress::Antichain; + + let mut seed = 0x9E3779B97F4A7C15u64; + let mut rng = move || { seed ^= seed << 13; seed ^= seed >> 7; seed ^= seed << 17; seed }; + + // A sorted, consolidated set over a small space, so the two sources collide + // and a `(key, val)` carries several times. + fn gen(rng: &mut impl FnMut() -> u64) -> Vec<((u64, u64), u64, i64)> { + let n = rng() as usize % 40 + 1; + let mut v: Vec<((u64, u64), u64, i64)> = (0..n).map(|_| { + let k = rng() % 10; let val = rng() % 3; let t = rng() % 6; + let d = if rng() % 4 == 0 { -1 } else { 1 }; + ((k, val), t, d) + }).collect(); + consolidate_updates(&mut v); + v + } + // Cut a consolidated set into a batch of small chunks, so groups straddle. + fn batch(updates: &[((u64, u64), u64, i64)], sz: usize) -> ChunkBatch> { + let chunks: Vec<_> = updates.chunks(sz).map(|c| VecChunk(Rc::new(c.to_vec()))).collect(); + let desc = Description::new( + Antichain::from_elem(0u64), Antichain::from_elem(10u64), Antichain::from_elem(0u64)); + ChunkBatch::new(chunks, desc) + } + // Flatten a batch through its straddle-aware cursor, then consolidate. + fn read(b: &ChunkBatch>) -> Vec<((u64, u64), u64, i64)> { + let mut out = Vec::new(); + let mut c = b.cursor(); + while c.key_valid(b) { + let k = *c.key(b); + while c.val_valid(b) { + let v = *c.val(b); + c.map_times(b, |t, d| out.push(((k, v), *t, *d))); + c.step_val(b); + } + c.step_key(b); + } + consolidate_updates(&mut out); + out + } + + for _ in 0..200 { + let u1 = gen(&mut rng); + let u2 = gen(&mut rng); + if u1.is_empty() || u2.is_empty() { continue; } + let sz = (rng() as usize % 4) + 1; + let f = rng() % 6; + let (s1, s2) = (batch(&u1, sz), batch(&u2, sz)); + let frontier = Antichain::from_elem(f); + + let mut merger = ChunkBatchMerger::new(&s1, &s2, frontier.borrow()); + loop { + let mut fuel = 1isize; // tiny → many yields, each settling + merger.work(&s1, &s2, &mut fuel); + if fuel > 0 { break; } + } + let result = merger.done(); + + // The produced batch is graded (grade-at-yield, so also at done). + assert!(is_graded(&result.chunks), "ungraded result: {:?}", + result.chunks.iter().map(Chunk::len).collect::>()); + // ...and its contents are the merged sources, advanced to `f`, consolidated. + let got = read(&result); + let mut want: Vec<_> = u1.iter().chain(u2.iter()).cloned().collect(); + for u in want.iter_mut() { u.1 = u.1.max(f); } + consolidate_updates(&mut want); + assert_eq!(got, want, "fuel-driven merge mismatch\n u1={u1:?}\n u2={u2:?}\n f={f}"); + } + } + + // `settle` must produce a *maximal packing*: adjacent sub-`TARGET` chunks + // that could combine into one legal chunk are coalesced, full chunks pass + // through as `Rc` moves, and contents are preserved exactly. + #[test] + fn settle_maximal_packing() { + use super::TARGET; + use crate::trace::chunk::is_graded; + + // A mix of small and full chunks with distinct, increasing keys (so the + // concatenation is sorted and nothing consolidates away). + let t = TARGET; + let sizes = [t / 3, t / 3, t / 3, t, t / 2, t / 2, t, 1, t - 1]; + let total: usize = sizes.iter().sum(); + let mut key = 0u64; + let mut input = VecDeque::new(); + let mut output = VecDeque::new(); + for &s in &sizes { + let updates: Vec<_> = (0..s).map(|_| { let k = key; key += 1; ((k, 0u64), 0u64, 1i64) }).collect(); + input.push_back(chunk(updates)); + VecChunk::settle(&mut input, false, &mut output); + } + VecChunk::settle(&mut input, true, &mut output); + let chunks: Vec<_> = output.into(); + + assert!(is_graded(&chunks), "not graded: {:?}", + chunks.iter().map(Chunk::len).collect::>()); + // Nothing lost, and the keys stay strictly sorted across the new breaks. + let got: Vec<_> = chunks.into_iter().flat_map(|c| (*c.0).clone()).collect(); + assert_eq!(got.len(), total); + assert!(got.windows(2).all(|w| w[0].0.0 < w[1].0.0)); + } + + // The indexed cursor must reconstruct the same grouped updates as a flat + // reference, even when a key — and a `(key, val)`'s times — straddle a + // chunk boundary. + #[test] + fn cursor_handles_straddle() { + use crate::trace::cursor::Cursor; + use crate::trace::{BatchReader, Description}; + use crate::trace::chunk::ChunkBatch; + use timely::progress::Antichain; + + let chunks = vec![ + chunk(vec![((0, 0), 0, 1), ((1, 0), 0, 1), ((1, 1), 0, 1)]), + chunk(vec![((1, 1), 1, 1), ((1, 2), 0, 1)]), + chunk(vec![((2, 0), 0, 1)]), + ]; + let desc = Description::new( + Antichain::from_elem(0u64), + Antichain::from_elem(2u64), + Antichain::from_elem(0u64), + ); + let batch = ChunkBatch::new(chunks, desc); + + let mut cursor = batch.cursor(); + let got = cursor.to_vec(&batch, |k| *k, |v| *v); + let want = vec![ + ((0u64, 0u64), vec![(0u64, 1i64)]), + ((1, 0), vec![(0, 1)]), + ((1, 1), vec![(0, 1), (1, 1)]), + ((1, 2), vec![(0, 1)]), + ((2, 0), vec![(0, 1)]), + ]; + assert_eq!(got, want); + } + + // Isolated: gallop vs linear forward-seek over one big chunk, for sparse to + // dense probe sets. Run: cargo test seek_microbench -- --ignored --nocapture + #[test] + #[ignore] + fn seek_microbench() { + use std::time::Instant; + use std::hint::black_box; + use super::gallop; + let n = 1_000_000u64; + let data: Vec<((u64, ()), u64, isize)> = (0..n).map(|k| ((3 * k, ()), 0u64, 1isize)).collect(); + for probes in [100u64, 10_000, 1_000_000] { + let targets: Vec = (0..probes).map(|i| 3 * (i * n / probes)).collect(); + let best = |f: &dyn Fn() -> u64| { + let mut b = std::time::Duration::MAX; + for _ in 0..5 { let t = Instant::now(); black_box(f()); b = b.min(t.elapsed()); } + b + }; + let data = black_box(&data[..]); + let g = best(&|| { + let (mut pos, mut acc) = (0usize, 0u64); + for &tgt in &targets { pos = gallop(data, pos, |u| u.0.0 < tgt); acc += pos as u64; } + acc + }); + let l = best(&|| { + let (mut pos, mut acc) = (0usize, 0u64); + for &tgt in &targets { while pos < data.len() && data[pos].0.0 < tgt { pos += 1; } acc += pos as u64; } + acc + }); + eprintln!("probes={probes:>7}: gallop={g:>12?} linear={l:>12?}"); + } + } +} diff --git a/differential-dataflow/src/trace/mod.rs b/differential-dataflow/src/trace/mod.rs index 6fd1da106..61ee30d65 100644 --- a/differential-dataflow/src/trace/mod.rs +++ b/differential-dataflow/src/trace/mod.rs @@ -7,6 +7,7 @@ //! collection trace. This trait allows operator implementations to be generic with respect to the type of trace, //! and allows various data structures to be interpretable as multiple different types of trace. +pub mod chunk; pub mod cursor; pub mod description; pub mod implementations;